From 3abef4e6c23feef4aa9ab161ae138d6d39ae69f3 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 22 Feb 2013 16:34:27 -0800 Subject: mm: numa: take THP into account when migrating pages for NUMA balancing Wanpeng Li pointed out that numamigrate_isolate_page() assumes that only one base page is being migrated when in fact it can also be checking THP. The consequences are that a migration will be attempted when a target node is nearly full and fail later. It's unlikely to be user-visible but it should be fixed. While we are there, migrate_balanced_pgdat() should treat nr_migrate_pages as an unsigned long as it is treated as a watermark. Signed-off-by: Mel Gorman Suggested-by: Wanpeng Li Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Ingo Molnar Cc: Simon Jeons Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm/migrate.c') diff --git a/mm/migrate.c b/mm/migrate.c index 2fd8b4af4744..77f4e70df24d 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1461,7 +1461,7 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to, * pages. Currently it only checks the watermarks which crude */ static bool migrate_balanced_pgdat(struct pglist_data *pgdat, - int nr_migrate_pages) + unsigned long nr_migrate_pages) { int z; for (z = pgdat->nr_zones - 1; z >= 0; z--) { @@ -1559,8 +1559,10 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) { int ret = 0; + VM_BUG_ON(compound_order(page) && !PageTransHuge(page)); + /* Avoid migrating to a node that is nearly full */ - if (migrate_balanced_pgdat(pgdat, 1)) { + if (migrate_balanced_pgdat(pgdat, 1UL << compound_order(page))) { int page_lru; if (isolate_lru_page(page)) { -- cgit v1.2.3 From 340ef3902cf20cec43cdcd1e72ae5cb518be7328 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 22 Feb 2013 16:34:33 -0800 Subject: mm: numa: cleanup flow of transhuge page migration When correcting commit 04fa5d6a6547 ("mm: migrate: check page_count of THP before migrating") Hugh Dickins noted that the control flow for transhuge migration was difficult to follow. Unconditionally calling put_page() in numamigrate_isolate_page() made the failure paths of both migrate_misplaced_transhuge_page() and migrate_misplaced_page() more complex that they should be. Further, he was extremely wary that an unlock_page() should ever happen after a put_page() even if the put_page() should never be the final put_page. Hugh implemented the following cleanup to simplify the path by calling putback_lru_page() inside numamigrate_isolate_page() if it failed to isolate and always calling unlock_page() within migrate_misplaced_transhuge_page(). There is no functional change after this patch is applied but the code is easier to follow and unlock_page() always happens before put_page(). [mgorman@suse.de: changelog only] Signed-off-by: Mel Gorman Signed-off-by: Hugh Dickins Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Ingo Molnar Cc: Simon Jeons Cc: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 28 ++++++----------- mm/migrate.c | 95 +++++++++++++++++++++++++------------------------------- 2 files changed, 52 insertions(+), 71 deletions(-) (limited to 'mm/migrate.c') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9db521fa8e8a..f40b2ce23d60 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1296,7 +1296,6 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, int target_nid; int current_nid = -1; bool migrated; - bool page_locked = false; spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(pmd, *pmdp))) @@ -1318,7 +1317,6 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Acquire the page lock to serialise THP migrations */ spin_unlock(&mm->page_table_lock); lock_page(page); - page_locked = true; /* Confirm the PTE did not while locked */ spin_lock(&mm->page_table_lock); @@ -1331,34 +1329,26 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Migrate the THP to the requested node */ migrated = migrate_misplaced_transhuge_page(mm, vma, - pmdp, pmd, addr, - page, target_nid); - if (migrated) - current_nid = target_nid; - else { - spin_lock(&mm->page_table_lock); - if (unlikely(!pmd_same(pmd, *pmdp))) { - unlock_page(page); - goto out_unlock; - } - goto clear_pmdnuma; - } + pmdp, pmd, addr, page, target_nid); + if (!migrated) + goto check_same; - task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); + task_numa_fault(target_nid, HPAGE_PMD_NR, true); return 0; +check_same: + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(pmd, *pmdp))) + goto out_unlock; clear_pmdnuma: pmd = pmd_mknonnuma(pmd); set_pmd_at(mm, haddr, pmdp, pmd); VM_BUG_ON(pmd_numa(*pmdp)); update_mmu_cache_pmd(vma, addr, pmdp); - if (page_locked) - unlock_page(page); - out_unlock: spin_unlock(&mm->page_table_lock); if (current_nid != -1) - task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); + task_numa_fault(current_nid, HPAGE_PMD_NR, false); return 0; } diff --git a/mm/migrate.c b/mm/migrate.c index 77f4e70df24d..f560071e89c5 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1557,41 +1557,40 @@ bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) { - int ret = 0; + int page_lru; VM_BUG_ON(compound_order(page) && !PageTransHuge(page)); /* Avoid migrating to a node that is nearly full */ - if (migrate_balanced_pgdat(pgdat, 1UL << compound_order(page))) { - int page_lru; + if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page))) + return 0; - if (isolate_lru_page(page)) { - put_page(page); - return 0; - } + if (isolate_lru_page(page)) + return 0; - /* Page is isolated */ - ret = 1; - page_lru = page_is_file_cache(page); - if (!PageTransHuge(page)) - inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru); - else - mod_zone_page_state(page_zone(page), - NR_ISOLATED_ANON + page_lru, - HPAGE_PMD_NR); + /* + * migrate_misplaced_transhuge_page() skips page migration's usual + * check on page_count(), so we must do it here, now that the page + * has been isolated: a GUP pin, or any other pin, prevents migration. + * The expected page count is 3: 1 for page's mapcount and 1 for the + * caller's pin and 1 for the reference taken by isolate_lru_page(). + */ + if (PageTransHuge(page) && page_count(page) != 3) { + putback_lru_page(page); + return 0; } + page_lru = page_is_file_cache(page); + mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru, + hpage_nr_pages(page)); + /* - * Page is either isolated or there is not enough space on the target - * node. If isolated, then it has taken a reference count and the - * callers reference can be safely dropped without the page - * disappearing underneath us during migration. Otherwise the page is - * not to be migrated but the callers reference should still be - * dropped so it does not leak. + * Isolating the page has taken another reference, so the + * caller's reference can be safely dropped without the page + * disappearing underneath us during migration. */ put_page(page); - - return ret; + return 1; } /* @@ -1602,7 +1601,7 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) int migrate_misplaced_page(struct page *page, int node) { pg_data_t *pgdat = NODE_DATA(node); - int isolated = 0; + int isolated; int nr_remaining; LIST_HEAD(migratepages); @@ -1610,20 +1609,16 @@ int migrate_misplaced_page(struct page *page, int node) * Don't migrate pages that are mapped in multiple processes. * TODO: Handle false sharing detection instead of this hammer */ - if (page_mapcount(page) != 1) { - put_page(page); + if (page_mapcount(page) != 1) goto out; - } /* * Rate-limit the amount of data that is being migrated to a node. * Optimal placement is no good if the memory bus is saturated and * all the time is being spent migrating! */ - if (numamigrate_update_ratelimit(pgdat, 1)) { - put_page(page); + if (numamigrate_update_ratelimit(pgdat, 1)) goto out; - } isolated = numamigrate_isolate_page(pgdat, page); if (!isolated) @@ -1640,12 +1635,19 @@ int migrate_misplaced_page(struct page *page, int node) } else count_vm_numa_event(NUMA_PAGE_MIGRATE); BUG_ON(!list_empty(&migratepages)); -out: return isolated; + +out: + put_page(page); + return 0; } #endif /* CONFIG_NUMA_BALANCING */ #if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) +/* + * Migrates a THP to a given target node. page must be locked and is unlocked + * before returning. + */ int migrate_misplaced_transhuge_page(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t *pmd, pmd_t entry, @@ -1676,29 +1678,15 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, new_page = alloc_pages_node(node, (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER); - if (!new_page) { - count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); - goto out_dropref; - } + if (!new_page) + goto out_fail; + page_xchg_last_nid(new_page, page_last_nid(page)); isolated = numamigrate_isolate_page(pgdat, page); - - /* - * Failing to isolate or a GUP pin prevents migration. The expected - * page count is 2. 1 for anonymous pages without a mapping and 1 - * for the callers pin. If the page was isolated, the page will - * need to be put back on the LRU. - */ - if (!isolated || page_count(page) != 2) { - count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); + if (!isolated) { put_page(new_page); - if (isolated) { - putback_lru_page(page); - isolated = 0; - goto out; - } - goto out_keep_locked; + goto out_fail; } /* Prepare a page as a migration target */ @@ -1730,6 +1718,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, putback_lru_page(page); count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); + isolated = 0; goto out; } @@ -1774,9 +1763,11 @@ out: -HPAGE_PMD_NR); return isolated; +out_fail: + count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); out_dropref: + unlock_page(page); put_page(page); -out_keep_locked: return 0; } #endif /* CONFIG_NUMA_BALANCING */ -- cgit v1.2.3 From 22b751c3d0376e86a377e3a0aa2ddbbe9d2eefc1 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 22 Feb 2013 16:34:59 -0800 Subject: mm: rename page struct field helpers The function names page_xchg_last_nid(), page_last_nid() and reset_page_last_nid() were judged to be inconsistent so rename them to a struct_field_op style pattern. As it looked jarring to have reset_page_mapcount() and page_nid_reset_last() beside each other in memmap_init_zone(), this patch also renames reset_page_mapcount() to page_mapcount_reset(). There are others like init_page_count() but as it is used throughout the arch code a rename would likely cause more conflicts than it is worth. [akpm@linux-foundation.org: fix zcache] Signed-off-by: Mel Gorman Suggested-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/staging/zcache/zbud.c | 2 +- drivers/staging/zsmalloc/zsmalloc-main.c | 2 +- include/linux/mm.h | 20 ++++++++++---------- mm/huge_memory.c | 2 +- mm/mempolicy.c | 2 +- mm/migrate.c | 4 ++-- mm/mmzone.c | 4 ++-- mm/page_alloc.c | 10 +++++----- mm/slob.c | 2 +- mm/slub.c | 2 +- 10 files changed, 25 insertions(+), 25 deletions(-) (limited to 'mm/migrate.c') diff --git a/drivers/staging/zcache/zbud.c b/drivers/staging/zcache/zbud.c index 328c397ea5dc..fdff5c6a0239 100644 --- a/drivers/staging/zcache/zbud.c +++ b/drivers/staging/zcache/zbud.c @@ -404,7 +404,7 @@ static inline struct page *zbud_unuse_zbudpage(struct zbudpage *zbudpage, else zbud_pers_pageframes--; zbudpage_spin_unlock(zbudpage); - reset_page_mapcount(page); + page_mapcount_reset(page); init_page_count(page); page->index = 0; return page; diff --git a/drivers/staging/zsmalloc/zsmalloc-main.c b/drivers/staging/zsmalloc/zsmalloc-main.c index 06f73a93a44d..e78d262c5249 100644 --- a/drivers/staging/zsmalloc/zsmalloc-main.c +++ b/drivers/staging/zsmalloc/zsmalloc-main.c @@ -472,7 +472,7 @@ static void reset_page(struct page *page) set_page_private(page, 0); page->mapping = NULL; page->freelist = NULL; - reset_page_mapcount(page); + page_mapcount_reset(page); } static void free_zspage(struct page *first_page) diff --git a/include/linux/mm.h b/include/linux/mm.h index 8a5bbe3b9e56..fe039bdba4ed 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -367,7 +367,7 @@ static inline struct page *compound_head(struct page *page) * both from it and to it can be tracked, using atomic_inc_and_test * and atomic_add_negative(-1). */ -static inline void reset_page_mapcount(struct page *page) +static inline void page_mapcount_reset(struct page *page) { atomic_set(&(page)->_mapcount, -1); } @@ -658,28 +658,28 @@ static inline int page_to_nid(const struct page *page) #ifdef CONFIG_NUMA_BALANCING #ifdef LAST_NID_NOT_IN_PAGE_FLAGS -static inline int page_xchg_last_nid(struct page *page, int nid) +static inline int page_nid_xchg_last(struct page *page, int nid) { return xchg(&page->_last_nid, nid); } -static inline int page_last_nid(struct page *page) +static inline int page_nid_last(struct page *page) { return page->_last_nid; } -static inline void reset_page_last_nid(struct page *page) +static inline void page_nid_reset_last(struct page *page) { page->_last_nid = -1; } #else -static inline int page_last_nid(struct page *page) +static inline int page_nid_last(struct page *page) { return (page->flags >> LAST_NID_PGSHIFT) & LAST_NID_MASK; } -extern int page_xchg_last_nid(struct page *page, int nid); +extern int page_nid_xchg_last(struct page *page, int nid); -static inline void reset_page_last_nid(struct page *page) +static inline void page_nid_reset_last(struct page *page) { int nid = (1 << LAST_NID_SHIFT) - 1; @@ -688,17 +688,17 @@ static inline void reset_page_last_nid(struct page *page) } #endif /* LAST_NID_NOT_IN_PAGE_FLAGS */ #else -static inline int page_xchg_last_nid(struct page *page, int nid) +static inline int page_nid_xchg_last(struct page *page, int nid) { return page_to_nid(page); } -static inline int page_last_nid(struct page *page) +static inline int page_nid_last(struct page *page) { return page_to_nid(page); } -static inline void reset_page_last_nid(struct page *page) +static inline void page_nid_reset_last(struct page *page) { } #endif diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c63a21d0e991..6049376c7226 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1639,7 +1639,7 @@ static void __split_huge_page_refcount(struct page *page) page_tail->mapping = page->mapping; page_tail->index = page->index + i; - page_xchg_last_nid(page_tail, page_last_nid(page)); + page_nid_xchg_last(page_tail, page_nid_last(page)); BUG_ON(!PageAnon(page_tail)); BUG_ON(!PageUptodate(page_tail)); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 6f7979c566d9..2ae78e255e08 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2316,7 +2316,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long * it less likely we act on an unlikely task<->page * relation. */ - last_nid = page_xchg_last_nid(page, polnid); + last_nid = page_nid_xchg_last(page, polnid); if (last_nid != polnid) goto out; } diff --git a/mm/migrate.c b/mm/migrate.c index f560071e89c5..de5c371a7969 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1497,7 +1497,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page, __GFP_NOWARN) & ~GFP_IOFS, 0); if (newpage) - page_xchg_last_nid(newpage, page_last_nid(page)); + page_nid_xchg_last(newpage, page_nid_last(page)); return newpage; } @@ -1681,7 +1681,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, if (!new_page) goto out_fail; - page_xchg_last_nid(new_page, page_last_nid(page)); + page_nid_xchg_last(new_page, page_nid_last(page)); isolated = numamigrate_isolate_page(pgdat, page); if (!isolated) { diff --git a/mm/mmzone.c b/mm/mmzone.c index bce796e8487f..2ac0afbd68f3 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -98,14 +98,14 @@ void lruvec_init(struct lruvec *lruvec) } #if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NID_NOT_IN_PAGE_FLAGS) -int page_xchg_last_nid(struct page *page, int nid) +int page_nid_xchg_last(struct page *page, int nid) { unsigned long old_flags, flags; int last_nid; do { old_flags = flags = page->flags; - last_nid = page_last_nid(page); + last_nid = page_nid_last(page); flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT); flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3ede25e6686e..445718b328b6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -295,7 +295,7 @@ static void bad_page(struct page *page) /* Don't complain about poisoned pages */ if (PageHWPoison(page)) { - reset_page_mapcount(page); /* remove PageBuddy */ + page_mapcount_reset(page); /* remove PageBuddy */ return; } @@ -327,7 +327,7 @@ static void bad_page(struct page *page) dump_stack(); out: /* Leave bad fields for debug, except PageBuddy could make trouble */ - reset_page_mapcount(page); /* remove PageBuddy */ + page_mapcount_reset(page); /* remove PageBuddy */ add_taint(TAINT_BAD_PAGE); } @@ -613,7 +613,7 @@ static inline int free_pages_check(struct page *page) bad_page(page); return 1; } - reset_page_last_nid(page); + page_nid_reset_last(page); if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; return 0; @@ -3894,8 +3894,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, set_page_links(page, zone, nid, pfn); mminit_verify_page_links(page, zone, nid, pfn); init_page_count(page); - reset_page_mapcount(page); - reset_page_last_nid(page); + page_mapcount_reset(page); + page_nid_reset_last(page); SetPageReserved(page); /* * Mark the block movable so that blocks are reserved for diff --git a/mm/slob.c b/mm/slob.c index a99fdf7a0907..eeed4a05a2ef 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -360,7 +360,7 @@ static void slob_free(void *block, int size) clear_slob_page_free(sp); spin_unlock_irqrestore(&slob_lock, flags); __ClearPageSlab(sp); - reset_page_mapcount(sp); + page_mapcount_reset(sp); slob_free_pages(b, 0); return; } diff --git a/mm/slub.c b/mm/slub.c index ba2ca53f6c3a..ebcc44eb43b9 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1408,7 +1408,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlab(page); memcg_release_pages(s, order); - reset_page_mapcount(page); + page_mapcount_reset(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; __free_memcg_kmem_pages(page, order); -- cgit v1.2.3 From c8d6553b9580188a1324486173d79c0f8642e870 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 22 Feb 2013 16:35:10 -0800 Subject: ksm: make KSM page migration possible KSM page migration is already supported in the case of memory hotremove, which takes the ksm_thread_mutex across all its migrations to keep life simple. But the new KSM NUMA merge_across_nodes knob introduces a problem, when it's set to non-default 0: if a KSM page is migrated to a different NUMA node, how do we migrate its stable node to the right tree? And what if that collides with an existing stable node? So far there's no provision for that, and this patch does not attempt to deal with it either. But how will I test a solution, when I don't know how to hotremove memory? The best answer is to enable KSM page migration in all cases now, and test more common cases. With THP and compaction added since KSM came in, page migration is now mainstream, and it's a shame that a KSM page can frustrate freeing a page block. Without worrying about merge_across_nodes 0 for now, this patch gets KSM page migration working reliably for default merge_across_nodes 1 (but leave the patch enabling it until near the end of the series). It's much simpler than I'd originally imagined, and does not require an additional tier of locking: page migration relies on the page lock, KSM page reclaim relies on the page lock, the page lock is enough for KSM page migration too. Almost all the care has to be in get_ksm_page(): that's the function which worries about when a stable node is stale and should be freed, now it also has to worry about the KSM page being migrated. The only new overhead is an additional put/get/lock/unlock_page when stable_tree_search() arrives at a matching node: to make sure migration respects the raised page count, and so does not migrate the page while we're busy with it here. That's probably avoidable, either by changing internal interfaces from using kpage to stable_node, or by moving the ksm_migrate_page() callsite into a page_freeze_refs() section (even if not swapcache); but this works well, I've no urge to pull it apart now. (Descents of the stable tree may pass through nodes whose KSM pages are under migration: being unlocked, the raised page count does not prevent that, nor need it: it's safe to memcmp against either old or new page.) You might worry about mremap, and whether page migration's rmap_walk to remove migration entries will find all the KSM locations where it inserted earlier: that should already be handled, by the satisfyingly heavy hammer of move_vma()'s call to ksm_madvise(,,,MADV_UNMERGEABLE,). Signed-off-by: Hugh Dickins Cc: Rik van Riel Cc: Petr Holasek Cc: Andrea Arcangeli Cc: Izik Eidus Cc: Gerald Schaefer Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++-------------- mm/migrate.c | 5 +++- 2 files changed, 77 insertions(+), 22 deletions(-) (limited to 'mm/migrate.c') diff --git a/mm/ksm.c b/mm/ksm.c index 4c22cdff02ad..df0529926703 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -499,6 +499,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) * In which case we can trust the content of the page, and it * returns the gotten page; but if the page has now been zapped, * remove the stale node from the stable tree and return NULL. + * But beware, the stable node's page might be being migrated. * * You would expect the stable_node to hold a reference to the ksm page. * But if it increments the page's count, swapping out has to wait for @@ -509,44 +510,77 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) * pointing back to this stable node. This relies on freeing a PageAnon * page to reset its page->mapping to NULL, and relies on no other use of * a page to put something that might look like our key in page->mapping. - * - * include/linux/pagemap.h page_cache_get_speculative() is a good reference, - * but this is different - made simpler by ksm_thread_mutex being held, but - * interesting for assuming that no other use of the struct page could ever - * put our expected_mapping into page->mapping (or a field of the union which - * coincides with page->mapping). - * - * Note: it is possible that get_ksm_page() will return NULL one moment, - * then page the next, if the page is in between page_freeze_refs() and - * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page * is on its way to being freed; but it is an anomaly to bear in mind. */ static struct page *get_ksm_page(struct stable_node *stable_node, bool locked) { struct page *page; void *expected_mapping; + unsigned long kpfn; - page = pfn_to_page(stable_node->kpfn); expected_mapping = (void *)stable_node + (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); - if (page->mapping != expected_mapping) - goto stale; - if (!get_page_unless_zero(page)) +again: + kpfn = ACCESS_ONCE(stable_node->kpfn); + page = pfn_to_page(kpfn); + + /* + * page is computed from kpfn, so on most architectures reading + * page->mapping is naturally ordered after reading node->kpfn, + * but on Alpha we need to be more careful. + */ + smp_read_barrier_depends(); + if (ACCESS_ONCE(page->mapping) != expected_mapping) goto stale; - if (page->mapping != expected_mapping) { + + /* + * We cannot do anything with the page while its refcount is 0. + * Usually 0 means free, or tail of a higher-order page: in which + * case this node is no longer referenced, and should be freed; + * however, it might mean that the page is under page_freeze_refs(). + * The __remove_mapping() case is easy, again the node is now stale; + * but if page is swapcache in migrate_page_move_mapping(), it might + * still be our page, in which case it's essential to keep the node. + */ + while (!get_page_unless_zero(page)) { + /* + * Another check for page->mapping != expected_mapping would + * work here too. We have chosen the !PageSwapCache test to + * optimize the common case, when the page is or is about to + * be freed: PageSwapCache is cleared (under spin_lock_irq) + * in the freeze_refs section of __remove_mapping(); but Anon + * page->mapping reset to NULL later, in free_pages_prepare(). + */ + if (!PageSwapCache(page)) + goto stale; + cpu_relax(); + } + + if (ACCESS_ONCE(page->mapping) != expected_mapping) { put_page(page); goto stale; } + if (locked) { lock_page(page); - if (page->mapping != expected_mapping) { + if (ACCESS_ONCE(page->mapping) != expected_mapping) { unlock_page(page); put_page(page); goto stale; } } return page; + stale: + /* + * We come here from above when page->mapping or !PageSwapCache + * suggests that the node is stale; but it might be under migration. + * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(), + * before checking whether node->kpfn has been changed. + */ + smp_rmb(); + if (ACCESS_ONCE(stable_node->kpfn) != kpfn) + goto again; remove_node_from_stable_tree(stable_node); return NULL; } @@ -1103,15 +1137,25 @@ static struct page *stable_tree_search(struct page *page) return NULL; ret = memcmp_pages(page, tree_page); + put_page(tree_page); - if (ret < 0) { - put_page(tree_page); + if (ret < 0) node = node->rb_left; - } else if (ret > 0) { - put_page(tree_page); + else if (ret > 0) node = node->rb_right; - } else + else { + /* + * Lock and unlock the stable_node's page (which + * might already have been migrated) so that page + * migration is sure to notice its raised count. + * It would be more elegant to return stable_node + * than kpage, but that involves more changes. + */ + tree_page = get_ksm_page(stable_node, true); + if (tree_page) + unlock_page(tree_page); return tree_page; + } } return NULL; @@ -1903,6 +1947,14 @@ void ksm_migrate_page(struct page *newpage, struct page *oldpage) if (stable_node) { VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); stable_node->kpfn = page_to_pfn(newpage); + /* + * newpage->mapping was set in advance; now we need smp_wmb() + * to make sure that the new stable_node->kpfn is visible + * to get_ksm_page() before it can see that oldpage->mapping + * has gone stale (or that PageSwapCache has been cleared). + */ + smp_wmb(); + set_page_stable_node(oldpage, NULL); } } #endif /* CONFIG_MIGRATION */ diff --git a/mm/migrate.c b/mm/migrate.c index de5c371a7969..e545ce7ddc17 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -464,7 +464,10 @@ void migrate_page_copy(struct page *newpage, struct page *page) mlock_migrate_page(newpage, page); ksm_migrate_page(newpage, page); - + /* + * Please do not reorder this without considering how mm/ksm.c's + * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache(). + */ ClearPageSwapCache(page); ClearPagePrivate(page); set_page_private(page, 0); -- cgit v1.2.3 From b79bc0a0c79e06cc87e17530e9c1c56c6f297e17 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 22 Feb 2013 16:35:13 -0800 Subject: ksm: enable KSM page migration Migration of KSM pages is now safe: remove the PageKsm restrictions from mempolicy.c and migrate.c. But keep PageKsm out of __unmap_and_move()'s anon_vma contortions, which are irrelevant to KSM: it looks as if that code was preventing hotremove migration of KSM pages, unless they happened to be in swapcache. There is some question as to whether enforcing a NUMA mempolicy migration ought to migrate KSM pages, mapped into entirely unrelated processes; but moving page_mapcount > 1 is only permitted with MPOL_MF_MOVE_ALL anyway, and it seems reasonable to assume that you wouldn't set MADV_MERGEABLE on any area where this is a worry. Signed-off-by: Hugh Dickins Cc: Rik van Riel Cc: Petr Holasek Cc: Andrea Arcangeli Cc: Izik Eidus Cc: Gerald Schaefer Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 3 +-- mm/migrate.c | 21 +++------------------ 2 files changed, 4 insertions(+), 20 deletions(-) (limited to 'mm/migrate.c') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 2ae78e255e08..d344c36db63f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -496,9 +496,8 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, /* * vm_normal_page() filters out zero pages, but there might * still be PageReserved pages to skip, perhaps in a VDSO. - * And we cannot move PageKsm pages sensibly or safely yet. */ - if (PageReserved(page) || PageKsm(page)) + if (PageReserved(page)) continue; nid = page_to_nid(page); if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) diff --git a/mm/migrate.c b/mm/migrate.c index e545ce7ddc17..20a03eb0667f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -731,20 +731,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage, lock_page(page); } - /* - * Only memory hotplug's offline_pages() caller has locked out KSM, - * and can safely migrate a KSM page. The other cases have skipped - * PageKsm along with PageReserved - but it is only now when we have - * the page lock that we can be certain it will not go KSM beneath us - * (KSM will not upgrade a page from PageAnon to PageKsm when it sees - * its pagecount raised, but only here do we take the page lock which - * serializes that). - */ - if (PageKsm(page) && !offlining) { - rc = -EBUSY; - goto unlock; - } - /* charge against new page */ mem_cgroup_prepare_migration(page, newpage, &mem); @@ -771,7 +757,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * File Caches may use write_page() or lock_page() in migration, then, * just care Anon page here. */ - if (PageAnon(page)) { + if (PageAnon(page) && !PageKsm(page)) { /* * Only page_lock_anon_vma_read() understands the subtleties of * getting a hold on an anon_vma from outside one of its mms. @@ -851,7 +837,6 @@ uncharge: mem_cgroup_end_migration(mem, page, newpage, (rc == MIGRATEPAGE_SUCCESS || rc == MIGRATEPAGE_BALLOON_SUCCESS)); -unlock: unlock_page(page); out: return rc; @@ -1155,7 +1140,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm, goto set_status; /* Use PageReserved to check for zero page */ - if (PageReserved(page) || PageKsm(page)) + if (PageReserved(page)) goto put_and_set; pp->page = page; @@ -1317,7 +1302,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, err = -ENOENT; /* Use PageReserved to check for zero page */ - if (!page || PageReserved(page) || PageKsm(page)) + if (!page || PageReserved(page)) goto set_status; err = page_to_nid(page); -- cgit v1.2.3 From 9c620e2bc5aa4256c102ada34e6c76204ed5898b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 22 Feb 2013 16:35:14 -0800 Subject: mm: remove offlining arg to migrate_pages No functional change, but the only purpose of the offlining argument to migrate_pages() etc, was to ensure that __unmap_and_move() could migrate a KSM page for memory hotremove (which took ksm_thread_mutex) but not for other callers. Now all cases are safe, remove the arg. Signed-off-by: Hugh Dickins Cc: Rik van Riel Cc: Petr Holasek Cc: Andrea Arcangeli Cc: Izik Eidus Cc: Gerald Schaefer Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 14 ++++++-------- mm/compaction.c | 2 +- mm/memory-failure.c | 7 +++---- mm/memory_hotplug.c | 3 +-- mm/mempolicy.c | 8 +++----- mm/migrate.c | 35 +++++++++++++---------------------- mm/page_alloc.c | 6 ++---- 7 files changed, 29 insertions(+), 46 deletions(-) (limited to 'mm/migrate.c') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 1e9f627967a3..a405d3dc0f61 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -40,11 +40,9 @@ extern void putback_movable_pages(struct list_head *l); extern int migrate_page(struct address_space *, struct page *, struct page *, enum migrate_mode); extern int migrate_pages(struct list_head *l, new_page_t x, - unsigned long private, bool offlining, - enum migrate_mode mode, int reason); + unsigned long private, enum migrate_mode mode, int reason); extern int migrate_huge_page(struct page *, new_page_t x, - unsigned long private, bool offlining, - enum migrate_mode mode); + unsigned long private, enum migrate_mode mode); extern int fail_migrate_page(struct address_space *, struct page *, struct page *); @@ -62,11 +60,11 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, static inline void putback_lru_pages(struct list_head *l) {} static inline void putback_movable_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t x, - unsigned long private, bool offlining, - enum migrate_mode mode, int reason) { return -ENOSYS; } + unsigned long private, enum migrate_mode mode, int reason) + { return -ENOSYS; } static inline int migrate_huge_page(struct page *page, new_page_t x, - unsigned long private, bool offlining, - enum migrate_mode mode) { return -ENOSYS; } + unsigned long private, enum migrate_mode mode) + { return -ENOSYS; } static inline int migrate_prep(void) { return -ENOSYS; } static inline int migrate_prep_local(void) { return -ENOSYS; } diff --git a/mm/compaction.c b/mm/compaction.c index ef53f352b606..25e75e3e2ac6 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -980,7 +980,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) nr_migrate = cc->nr_migratepages; err = migrate_pages(&cc->migratepages, compaction_alloc, - (unsigned long)cc, false, + (unsigned long)cc, cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, MR_COMPACTION); update_nr_listpages(cc); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 1a56d63adf9c..e4683459ab26 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1465,7 +1465,7 @@ static int soft_offline_huge_page(struct page *page, int flags) unlock_page(hpage); /* Keep page count to indicate a given hugepage is isolated. */ - ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, false, + ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, MIGRATE_SYNC); put_page(hpage); if (ret) { @@ -1597,11 +1597,10 @@ static int __soft_offline_page(struct page *page, int flags) if (!ret) { LIST_HEAD(pagelist); inc_zone_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); + page_is_file_cache(page)); list_add(&page->lru, &pagelist); ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, - false, MIGRATE_SYNC, - MR_MEMORY_FAILURE); + MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { putback_lru_pages(&pagelist); pr_info("soft offline: %#lx: migration failed %d, type %lx\n", diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 77a6abfe3291..dda1ca695a08 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1286,8 +1286,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) * migrate_pages returns # of failed pages. */ ret = migrate_pages(&source, alloc_migrate_target, 0, - true, MIGRATE_SYNC, - MR_MEMORY_HOTPLUG); + MIGRATE_SYNC, MR_MEMORY_HOTPLUG); if (ret) putback_lru_pages(&source); } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d344c36db63f..e2661d1c5c33 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1014,8 +1014,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_node_page, dest, - false, MIGRATE_SYNC, - MR_SYSCALL); + MIGRATE_SYNC, MR_SYSCALL); if (err) putback_lru_pages(&pagelist); } @@ -1259,9 +1258,8 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { WARN_ON_ONCE(flags & MPOL_MF_LAZY); nr_failed = migrate_pages(&pagelist, new_vma_page, - (unsigned long)vma, - false, MIGRATE_SYNC, - MR_MEMPOLICY_MBIND); + (unsigned long)vma, + MIGRATE_SYNC, MR_MEMPOLICY_MBIND); if (nr_failed) putback_lru_pages(&pagelist); } diff --git a/mm/migrate.c b/mm/migrate.c index 20a03eb0667f..3bbaf5d230b0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -701,7 +701,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, } static int __unmap_and_move(struct page *page, struct page *newpage, - int force, bool offlining, enum migrate_mode mode) + int force, enum migrate_mode mode) { int rc = -EAGAIN; int remap_swapcache = 1; @@ -847,8 +847,7 @@ out: * to the newly allocated page in newpage. */ static int unmap_and_move(new_page_t get_new_page, unsigned long private, - struct page *page, int force, bool offlining, - enum migrate_mode mode) + struct page *page, int force, enum migrate_mode mode) { int rc = 0; int *result = NULL; @@ -866,7 +865,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, if (unlikely(split_huge_page(page))) goto out; - rc = __unmap_and_move(page, newpage, force, offlining, mode); + rc = __unmap_and_move(page, newpage, force, mode); if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) { /* @@ -926,8 +925,7 @@ out: */ static int unmap_and_move_huge_page(new_page_t get_new_page, unsigned long private, struct page *hpage, - int force, bool offlining, - enum migrate_mode mode) + int force, enum migrate_mode mode) { int rc = 0; int *result = NULL; @@ -989,9 +987,8 @@ out: * * Return: Number of pages not migrated or error code. */ -int migrate_pages(struct list_head *from, - new_page_t get_new_page, unsigned long private, bool offlining, - enum migrate_mode mode, int reason) +int migrate_pages(struct list_head *from, new_page_t get_new_page, + unsigned long private, enum migrate_mode mode, int reason) { int retry = 1; int nr_failed = 0; @@ -1012,8 +1009,7 @@ int migrate_pages(struct list_head *from, cond_resched(); rc = unmap_and_move(get_new_page, private, - page, pass > 2, offlining, - mode); + page, pass > 2, mode); switch(rc) { case -ENOMEM: @@ -1046,15 +1042,13 @@ out: } int migrate_huge_page(struct page *hpage, new_page_t get_new_page, - unsigned long private, bool offlining, - enum migrate_mode mode) + unsigned long private, enum migrate_mode mode) { int pass, rc; for (pass = 0; pass < 10; pass++) { - rc = unmap_and_move_huge_page(get_new_page, - private, hpage, pass > 2, offlining, - mode); + rc = unmap_and_move_huge_page(get_new_page, private, + hpage, pass > 2, mode); switch (rc) { case -ENOMEM: goto out; @@ -1177,8 +1171,7 @@ set_status: err = 0; if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_page_node, - (unsigned long)pm, 0, MIGRATE_SYNC, - MR_SYSCALL); + (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); if (err) putback_lru_pages(&pagelist); } @@ -1613,10 +1606,8 @@ int migrate_misplaced_page(struct page *page, int node) goto out; list_add(&page->lru, &migratepages); - nr_remaining = migrate_pages(&migratepages, - alloc_misplaced_dst_page, - node, false, MIGRATE_ASYNC, - MR_NUMA_MISPLACED); + nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, + node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); if (nr_remaining) { putback_lru_pages(&migratepages); isolated = 0; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 445718b328b6..64c83a8c3220 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6084,10 +6084,8 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, &cc->migratepages); cc->nr_migratepages -= nr_reclaimed; - ret = migrate_pages(&cc->migratepages, - alloc_migrate_target, - 0, false, MIGRATE_SYNC, - MR_CMA); + ret = migrate_pages(&cc->migratepages, alloc_migrate_target, + 0, MIGRATE_SYNC, MR_CMA); } if (ret < 0) { putback_movable_pages(&cc->migratepages); -- cgit v1.2.3