path: root/mm/migrate.c
author    Linus Torvalds <torvalds@linux-foundation.org>    2022-05-26 12:32:41 -0700
committer Linus Torvalds <torvalds@linux-foundation.org>    2022-05-26 12:32:41 -0700
commit    98931dd95fd489fcbfa97da563505a6f071d7c77 (patch)
tree      44683fc4a92efa614acdca2742a7ff19d26da1e3    /mm/migrate.c
parent    df202b452fe6c6d6f1351bad485e2367ef1e644e (diff)
parent    f403f22f8ccb12860b2b62fec3173c6ccd45938b (diff)
Merge tag 'mm-stable-2022-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton:
 "Almost all of MM here. A few things are still getting finished off,
  reviewed, etc.

   - Yang Shi has improved the behaviour of khugepaged collapsing of
     readonly file-backed transparent hugepages.

   - Johannes Weiner has arranged for zswap memory use to be tracked
     and managed on a per-cgroup basis.

   - Muchun Song adds a /proc knob ("hugetlb_optimize_vmemmap") for
     runtime enablement of the recent huge page vmemmap optimization
     feature.

   - Baolin Wang contributes a series to fix some issues around hugetlb
     pagetable invalidation.

   - Zhenwei Pi has fixed some interactions between hwpoisoned pages
     and virtualization.

   - Tong Tiangen has enabled the use of the presently x86-only
     page_table_check debugging feature on arm64 and riscv.

   - David Vernet has done some fixup work on the memcg selftests.

   - Peter Xu has taught userfaultfd to handle write protection faults
     against shmem- and hugetlbfs-backed files.

   - More DAMON development from SeongJae Park - adding online tuning
     of the feature and support for monitoring of fixed virtual address
     ranges. Also easier discovery of which monitoring operations are
     available.

   - Nadav Amit has done some optimization of TLB flushing during
     mprotect().

   - Neil Brown continues to labor away at improving our swap-over-NFS
     support.

   - David Hildenbrand has some fixes to anon page COWing versus
     get_user_pages().

   - Peng Liu fixed some errors in the core hugetlb code.

   - Joao Martins has reduced the amount of memory consumed by
     device-dax's compound devmaps.

   - Some cleanups of the arch-specific pagemap code from Anshuman
     Khandual.

   - Muchun Song has found and fixed some errors in the TLB flushing of
     transparent hugepages.

   - Roman Gushchin has done more work on the memcg selftests.

  ... and, of course, many smaller fixes and cleanups. Notably, the
  customary million cleanup serieses from Miaohe Lin"

* tag 'mm-stable-2022-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (381 commits)
  mm: kfence: use PAGE_ALIGNED helper
  selftests: vm: add the "settings" file with timeout variable
  selftests: vm: add "test_hmm.sh" to TEST_FILES
  selftests: vm: check numa_available() before operating "merge_across_nodes" in ksm_tests
  selftests: vm: add migration to the .gitignore
  selftests/vm/pkeys: fix typo in comment
  ksm: fix typo in comment
  selftests: vm: add process_mrelease tests
  Revert "mm/vmscan: never demote for memcg reclaim"
  mm/kfence: print disabling or re-enabling message
  include/trace/events/percpu.h: cleanup for "percpu: improve percpu_alloc_percpu event trace"
  include/trace/events/mmflags.h: cleanup for "tracing: incorrect gfp_t conversion"
  mm: fix a potential infinite loop in start_isolate_page_range()
  MAINTAINERS: add Muchun as co-maintainer for HugeTLB
  zram: fix Kconfig dependency warning
  mm/shmem: fix shmem folio swapoff hang
  cgroup: fix an error handling path in alloc_pagecache_max_30M()
  mm: damon: use HPAGE_PMD_SIZE
  tracing: incorrect isolate_mote_t cast in mm_vmscan_lru_isolate
  nodemask.h: fix compilation error with GCC12
  ...
Diffstat (limited to 'mm/migrate.c')
-rw-r--r--    mm/migrate.c    194
1 file changed, 97 insertions(+), 97 deletions(-)
diff --git a/mm/migrate.c b/mm/migrate.c
index 21d82636c291..e51588e95f57 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -177,6 +177,7 @@ static bool remove_migration_pte(struct folio *folio,
DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
while (page_vma_mapped_walk(&pvmw)) {
+ rmap_t rmap_flags = RMAP_NONE;
pte_t pte;
swp_entry_t entry;
struct page *new;
@@ -211,6 +212,9 @@ static bool remove_migration_pte(struct folio *folio,
else if (pte_swp_uffd_wp(*pvmw.pte))
pte = pte_mkuffd_wp(pte);
+ if (folio_test_anon(folio) && !is_readable_migration_entry(entry))
+ rmap_flags |= RMAP_EXCLUSIVE;
+
if (unlikely(is_device_private_page(new))) {
if (pte_write(pte))
entry = make_writable_device_private_entry(
@@ -232,15 +236,17 @@ static bool remove_migration_pte(struct folio *folio,
pte = pte_mkhuge(pte);
pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
if (folio_test_anon(folio))
- hugepage_add_anon_rmap(new, vma, pvmw.address);
+ hugepage_add_anon_rmap(new, vma, pvmw.address,
+ rmap_flags);
else
- page_dup_rmap(new, true);
+ page_dup_file_rmap(new, true);
set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
} else
#endif
{
if (folio_test_anon(folio))
- page_add_anon_rmap(new, vma, pvmw.address, false);
+ page_add_anon_rmap(new, vma, pvmw.address,
+ rmap_flags);
else
page_add_file_rmap(new, vma, false);
set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
@@ -471,11 +477,6 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
xas_lock_irq(&xas);
expected_count = 2 + page_has_private(page);
- if (page_count(page) != expected_count || xas_load(&xas) != page) {
- xas_unlock_irq(&xas);
- return -EAGAIN;
- }
-
if (!page_ref_freeze(page, expected_count)) {
xas_unlock_irq(&xas);
return -EAGAIN;
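The dropped page_count()/xas_load() test added nothing: the freeze itself verifies the expected count atomically and fails if it does not match. A simplified sketch of the helper it relies on (paraphrased from include/linux/page_ref.h, tracepoint hook omitted; not part of this patch):

        static inline int page_ref_freeze(struct page *page, int count)
        {
                /* Replace the refcount with 0 only if it currently equals
                 * count; a concurrent reference makes the cmpxchg fail and
                 * the caller returns -EAGAIN, exactly as before. */
                return likely(atomic_cmpxchg(&page->_refcount, count, 0) == count);
        }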
@@ -517,6 +518,12 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
folio_set_workingset(newfolio);
if (folio_test_checked(folio))
folio_set_checked(newfolio);
+ /*
+ * PG_anon_exclusive (-> PG_mappedtodisk) is always migrated via
+ * migration entries. We can still have PG_anon_exclusive set on an
+ * effectively unmapped and unreferenced first sub-pages of an
+ * anonymous THP: we can simply copy it here via PG_mappedtodisk.
+ */
if (folio_test_mappedtodisk(folio))
folio_set_mappedtodisk(newfolio);
@@ -836,21 +843,21 @@ static int fallback_migrate_page(struct address_space *mapping,
* < 0 - error code
* MIGRATEPAGE_SUCCESS - success
*/
-static int move_to_new_page(struct page *newpage, struct page *page,
+static int move_to_new_folio(struct folio *dst, struct folio *src,
enum migrate_mode mode)
{
struct address_space *mapping;
int rc = -EAGAIN;
- bool is_lru = !__PageMovable(page);
+ bool is_lru = !__PageMovable(&src->page);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
+ VM_BUG_ON_FOLIO(!folio_test_locked(src), src);
+ VM_BUG_ON_FOLIO(!folio_test_locked(dst), dst);
- mapping = page_mapping(page);
+ mapping = folio_mapping(src);
if (likely(is_lru)) {
if (!mapping)
- rc = migrate_page(mapping, newpage, page, mode);
+ rc = migrate_page(mapping, &dst->page, &src->page, mode);
else if (mapping->a_ops->migratepage)
/*
* Most pages have a mapping and most filesystems
@@ -859,54 +866,54 @@ static int move_to_new_page(struct page *newpage, struct page *page,
* migratepage callback. This is the most common path
* for page migration.
*/
- rc = mapping->a_ops->migratepage(mapping, newpage,
- page, mode);
+ rc = mapping->a_ops->migratepage(mapping, &dst->page,
+ &src->page, mode);
else
- rc = fallback_migrate_page(mapping, newpage,
- page, mode);
+ rc = fallback_migrate_page(mapping, &dst->page,
+ &src->page, mode);
} else {
/*
* In case of non-lru page, it could be released after
* isolation step. In that case, we shouldn't try migration.
*/
- VM_BUG_ON_PAGE(!PageIsolated(page), page);
- if (!PageMovable(page)) {
+ VM_BUG_ON_FOLIO(!folio_test_isolated(src), src);
+ if (!folio_test_movable(src)) {
rc = MIGRATEPAGE_SUCCESS;
- ClearPageIsolated(page);
+ folio_clear_isolated(src);
goto out;
}
- rc = mapping->a_ops->migratepage(mapping, newpage,
- page, mode);
+ rc = mapping->a_ops->migratepage(mapping, &dst->page,
+ &src->page, mode);
WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
- !PageIsolated(page));
+ !folio_test_isolated(src));
}
/*
- * When successful, old pagecache page->mapping must be cleared before
- * page is freed; but stats require that PageAnon be left as PageAnon.
+ * When successful, old pagecache src->mapping must be cleared before
+ * src is freed; but stats require that PageAnon be left as PageAnon.
*/
if (rc == MIGRATEPAGE_SUCCESS) {
- if (__PageMovable(page)) {
- VM_BUG_ON_PAGE(!PageIsolated(page), page);
+ if (__PageMovable(&src->page)) {
+ VM_BUG_ON_FOLIO(!folio_test_isolated(src), src);
/*
* We clear PG_movable under page_lock so any compactor
* cannot try to migrate this page.
*/
- ClearPageIsolated(page);
+ folio_clear_isolated(src);
}
/*
- * Anonymous and movable page->mapping will be cleared by
+ * Anonymous and movable src->mapping will be cleared by
* free_pages_prepare so don't reset it here for keeping
* the type to work PageAnon, for example.
*/
- if (!PageMappingFlags(page))
- page->mapping = NULL;
+ if (!folio_mapping_flags(src))
+ src->mapping = NULL;
- if (likely(!is_zone_device_page(newpage)))
- flush_dcache_folio(page_folio(newpage));
+ if (likely(!folio_is_zone_device(dst)))
+ flush_dcache_folio(dst);
}
out:
return rc;
@@ -994,7 +1001,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
goto out_unlock;
if (unlikely(!is_lru)) {
- rc = move_to_new_page(newpage, page, mode);
+ rc = move_to_new_folio(dst, folio, mode);
goto out_unlock_both;
}
@@ -1025,7 +1032,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
}
if (!page_mapped(page))
- rc = move_to_new_page(newpage, page, mode);
+ rc = move_to_new_folio(dst, folio, mode);
/*
* When successful, push newpage to LRU immediately: so that if it
@@ -1230,7 +1237,6 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
goto put_anon;
if (page_mapped(hpage)) {
- bool mapping_locked = false;
enum ttu_flags ttu = 0;
if (!PageAnon(hpage)) {
@@ -1244,19 +1250,18 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
if (unlikely(!mapping))
goto unlock_put_anon;
- mapping_locked = true;
- ttu |= TTU_RMAP_LOCKED;
+ ttu = TTU_RMAP_LOCKED;
}
try_to_migrate(src, ttu);
page_was_mapped = 1;
- if (mapping_locked)
+ if (ttu & TTU_RMAP_LOCKED)
i_mmap_unlock_write(mapping);
}
if (!page_mapped(hpage))
- rc = move_to_new_page(new_hpage, hpage, mode);
+ rc = move_to_new_folio(dst, src, mode);
if (page_was_mapped)
remove_migration_ptes(src,
@@ -1412,14 +1417,11 @@ retry:
nr_thp_split++;
goto retry;
}
-
- nr_failed_pages += nr_subpages;
- break;
- }
-
/* Hugetlb migration is unsupported */
- if (!no_subpage_counting)
+ } else if (!no_subpage_counting) {
nr_failed++;
+ }
+
nr_failed_pages += nr_subpages;
break;
case -ENOMEM:
@@ -1434,28 +1436,30 @@ retry:
nr_thp_split++;
goto retry;
}
-
- nr_failed_pages += nr_subpages;
- goto out;
+ } else if (!no_subpage_counting) {
+ nr_failed++;
}
- if (!no_subpage_counting)
- nr_failed++;
nr_failed_pages += nr_subpages;
+ /*
+ * There might be some subpages of fail-to-migrate THPs
+ * left in thp_split_pages list. Move them back to migration
+ * list so that they could be put back to the right list by
+ * the caller otherwise the page refcnt will be leaked.
+ */
+ list_splice_init(&thp_split_pages, from);
+ nr_thp_failed += thp_retry;
goto out;
case -EAGAIN:
- if (is_thp) {
+ if (is_thp)
thp_retry++;
- break;
- }
- retry++;
+ else
+ retry++;
break;
case MIGRATEPAGE_SUCCESS:
nr_succeeded += nr_subpages;
- if (is_thp) {
+ if (is_thp)
nr_thp_succeeded++;
- break;
- }
break;
default:
/*
@@ -1464,14 +1468,11 @@ retry:
* removed from migration page list and not
* retried in the next outer loop.
*/
- if (is_thp) {
+ if (is_thp)
nr_thp_failed++;
- nr_failed_pages += nr_subpages;
- break;
- }
-
- if (!no_subpage_counting)
+ else if (!no_subpage_counting)
nr_failed++;
+
nr_failed_pages += nr_subpages;
break;
}
@@ -1606,8 +1607,8 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
mmap_read_lock(mm);
err = -EFAULT;
- vma = find_vma(mm, addr);
- if (!vma || addr < vma->vm_start || !vma_migratable(vma))
+ vma = vma_lookup(mm, addr);
+ if (!vma || !vma_migratable(vma))
goto out;
/* FOLL_DUMP to ignore special (like zero) pages */
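vma_lookup() only returns a VMA that actually contains the address, unlike find_vma(), which returns the first VMA ending above it; that is why the explicit "addr < vma->vm_start" check can be dropped. A sketch of the relationship, paraphrased from include/linux/mm.h (not part of this patch):

        static inline struct vm_area_struct *vma_lookup(struct mm_struct *mm,
                                                        unsigned long addr)
        {
                struct vm_area_struct *vma = find_vma(mm, addr);

                /* find_vma() may return a VMA that starts above addr;
                 * vma_lookup() rejects that case. */
                if (vma && addr < vma->vm_start)
                        vma = NULL;

                return vma;
        }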
@@ -1802,13 +1803,18 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
goto set_status;
/* FOLL_DUMP to ignore special (like zero) pages */
- page = follow_page(vma, addr, FOLL_DUMP);
+ page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
err = PTR_ERR(page);
if (IS_ERR(page))
goto set_status;
- err = page ? page_to_nid(page) : -ENOENT;
+ if (page) {
+ err = page_to_nid(page);
+ put_page(page);
+ } else {
+ err = -ENOENT;
+ }
set_status:
*status = err;
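With FOLL_GET, follow_page() returns the page with a reference held, so it cannot be freed or reused between the lookup and page_to_nid(); the added put_page() drops that reference afterwards. A condensed sketch of the pattern, with the error path elided (not the patch's exact control flow):

        page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
        if (!IS_ERR_OR_NULL(page)) {
                err = page_to_nid(page);        /* safe while the reference is held */
                put_page(page);                 /* balance FOLL_GET */
        }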
@@ -1844,16 +1850,12 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
const void __user * __user *pages,
int __user *status)
{
-#define DO_PAGES_STAT_CHUNK_NR 16
+#define DO_PAGES_STAT_CHUNK_NR 16UL
const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
int chunk_status[DO_PAGES_STAT_CHUNK_NR];
while (nr_pages) {
- unsigned long chunk_nr;
-
- chunk_nr = nr_pages;
- if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
- chunk_nr = DO_PAGES_STAT_CHUNK_NR;
+ unsigned long chunk_nr = min(nr_pages, DO_PAGES_STAT_CHUNK_NR);
if (in_compat_syscall()) {
if (get_compat_pages_array(chunk_pages, pages,
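Redefining the chunk size as 16UL is what allows min() to replace the open-coded clamp: the kernel's min() macro rejects arguments of different types at build time, and nr_pages is unsigned long. A small illustration of the intent (not part of the patch):

        #define DO_PAGES_STAT_CHUNK_NR  16UL    /* unsigned long, same type as nr_pages */

        /* min(nr_pages, 16)   would trip min()'s compile-time type check;
         * min(nr_pages, 16UL) compiles cleanly and reads better than the
         * explicit "if (chunk_nr > ...)" clamp it replaces. */
        unsigned long chunk_nr = min(nr_pages, DO_PAGES_STAT_CHUNK_NR);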
@@ -1969,7 +1971,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
#ifdef CONFIG_NUMA_BALANCING
/*
* Returns true if this is a safe migration target node for misplaced NUMA
- * pages. Currently it only checks the watermarks which crude
+ * pages. Currently it only checks the watermarks which is crude.
*/
static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
unsigned long nr_migrate_pages)
@@ -1979,7 +1981,7 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
for (z = pgdat->nr_zones - 1; z >= 0; z--) {
struct zone *zone = pgdat->node_zones + z;
- if (!populated_zone(zone))
+ if (!managed_zone(zone))
continue;
/* Avoid waking kswapd by allocating pages_to_migrate pages. */
@@ -2015,7 +2017,6 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
{
- int page_lru;
int nr_pages = thp_nr_pages(page);
int order = compound_order(page);
@@ -2032,7 +2033,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING))
return 0;
for (z = pgdat->nr_zones - 1; z >= 0; z--) {
- if (populated_zone(pgdat->node_zones + z))
+ if (managed_zone(pgdat->node_zones + z))
break;
}
wakeup_kswapd(pgdat->node_zones + z, 0, order, ZONE_MOVABLE);
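The populated_zone() -> managed_zone() switch narrows the test from "zone has present pages" to "zone has pages under buddy-allocator control", so zones containing only reserved memory are no longer considered when sizing the target node or picking a zone to wake kswapd on. Simplified sketch of the two predicates, paraphrased from include/linux/mmzone.h (not part of this patch):

        static inline bool populated_zone(struct zone *zone)
        {
                return zone->present_pages;             /* any present memory at all */
        }

        static inline bool managed_zone(struct zone *zone)
        {
                return zone_managed_pages(zone);        /* memory the buddy allocator manages */
        }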
@@ -2042,8 +2043,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
if (isolate_lru_page(page))
return 0;
- page_lru = page_is_file_lru(page);
- mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
+ mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_is_file_lru(page),
nr_pages);
/*
@@ -2116,7 +2116,6 @@ out:
return 0;
}
#endif /* CONFIG_NUMA_BALANCING */
-#endif /* CONFIG_NUMA */
/*
* node_demotion[] example:
@@ -2250,7 +2249,6 @@ out:
return target;
}
-#if defined(CONFIG_HOTPLUG_CPU)
/* Disable reclaim-based migration. */
static void __disable_all_migrate_targets(void)
{
@@ -2353,8 +2351,8 @@ out_clear:
*/
static void __set_migration_target_nodes(void)
{
- nodemask_t next_pass = NODE_MASK_NONE;
- nodemask_t this_pass = NODE_MASK_NONE;
+ nodemask_t next_pass;
+ nodemask_t this_pass;
nodemask_t used_targets = NODE_MASK_NONE;
int node, best_distance;
@@ -2443,6 +2441,7 @@ void set_migration_target_nodes(void)
* __set_migration_target_nodes() can be used as opposed to
* set_migration_target_nodes().
*/
+#ifdef CONFIG_MEMORY_HOTPLUG
static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
unsigned long action, void *_arg)
{
@@ -2488,15 +2487,17 @@ static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
return notifier_from_errno(0);
}
+#endif
void __init migrate_on_reclaim_init(void)
{
- node_demotion = kmalloc_array(nr_node_ids,
- sizeof(struct demotion_nodes),
- GFP_KERNEL);
+ node_demotion = kcalloc(nr_node_ids,
+ sizeof(struct demotion_nodes),
+ GFP_KERNEL);
WARN_ON(!node_demotion);
-
+#ifdef CONFIG_MEMORY_HOTPLUG
hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
+#endif
/*
* At this point, all numa nodes with memory/CPus have their state
* properly set, so we can build the demotion order now.
@@ -2507,7 +2508,6 @@ void __init migrate_on_reclaim_init(void)
set_migration_target_nodes();
cpus_read_unlock();
}
-#endif /* CONFIG_HOTPLUG_CPU */
bool numa_demotion_enabled = false;
@@ -2523,12 +2523,11 @@ static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
- if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
- numa_demotion_enabled = true;
- else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
- numa_demotion_enabled = false;
- else
- return -EINVAL;
+ ssize_t ret;
+
+ ret = kstrtobool(buf, &numa_demotion_enabled);
+ if (ret)
+ return ret;
return count;
}
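kstrtobool() centralizes the boolean parsing that the strncmp() chain open-coded; it understands the usual spellings ("1"/"0", "y"/"n", "on"/"off") and returns 0 or a negative errno. A minimal usage sketch under that assumption (the patch itself parses straight into numa_demotion_enabled rather than through a local variable):

        bool enabled;
        int ret = kstrtobool(buf, &enabled);    /* 0 on success, -EINVAL on unrecognized input */

        if (ret)
                return ret;
        numa_demotion_enabled = enabled;
        return count;                           /* sysfs store convention: report the consumed length */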
@@ -2568,4 +2567,5 @@ delete_obj:
return err;
}
subsys_initcall(numa_init_sysfs);
-#endif
+#endif /* CONFIG_SYSFS */
+#endif /* CONFIG_NUMA */