From 4a8c7bb59ac85b038c29adf6d32ff56e11fbb267 Mon Sep 17 00:00:00 2001 From: Nathan Zimmer Date: Thu, 14 Jan 2016 15:18:36 -0800 Subject: mm/mempolicy.c: convert the shared_policy lock to a rwlock When running the SPECint_rate gcc on some very large boxes it was noticed that the system was spending lots of time in mpol_shared_policy_lookup(). The gamess benchmark can also show it and is what I mostly used to chase down the issue since the setup for that I found to be easier. To be clear the binaries were on tmpfs because of disk I/O requirements. We then used text replication to avoid icache misses and having all the copies banging on the memory where the instruction code resides. This results in us hitting a bottleneck in mpol_shared_policy_lookup() since lookup is serialised by the shared_policy lock. I have only reproduced this on very large (3k+ cores) boxes. The problem starts showing up at just a few hundred ranks getting worse until it threatens to livelock once it gets large enough. For example on the gamess benchmark at 128 ranks this area consumes only ~1% of time, at 512 ranks it consumes nearly 13%, and at 2k ranks it is over 90%. To alleviate the contention in this area I converted the spinlock to an rwlock. This allows a large number of lookups to happen simultaneously. The results were quite good reducing this consumtion at max ranks to around 2%. [akpm@linux-foundation.org: tidy up code comments] Signed-off-by: Nathan Zimmer Acked-by: David Rientjes Acked-by: Vlastimil Babka Cc: Nadia Yvette Chambers Cc: Naoya Horiguchi Cc: Mel Gorman Cc: "Aneesh Kumar K.V" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 87a177917cb2..d8caff071a30 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2142,12 +2142,14 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) * * Remember policies even when nobody has shared memory mapped. * The policies are kept in Red-Black tree linked from the inode. - * They are protected by the sp->lock spinlock, which should be held + * They are protected by the sp->lock rwlock, which should be held * for any accesses to the tree. */ -/* lookup first element intersecting start-end */ -/* Caller holds sp->lock */ +/* + * lookup first element intersecting start-end. Caller holds sp->lock for + * reading or for writing + */ static struct sp_node * sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) { @@ -2178,8 +2180,10 @@ sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) return rb_entry(n, struct sp_node, nd); } -/* Insert a new shared policy into the list. */ -/* Caller holds sp->lock */ +/* + * Insert a new shared policy into the list. Caller holds sp->lock for + * writing. + */ static void sp_insert(struct shared_policy *sp, struct sp_node *new) { struct rb_node **p = &sp->root.rb_node; @@ -2211,13 +2215,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) if (!sp->root.rb_node) return NULL; - spin_lock(&sp->lock); + read_lock(&sp->lock); sn = sp_lookup(sp, idx, idx+1); if (sn) { mpol_get(sn->policy); pol = sn->policy; } - spin_unlock(&sp->lock); + read_unlock(&sp->lock); return pol; } @@ -2360,7 +2364,7 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start, int ret = 0; restart: - spin_lock(&sp->lock); + write_lock(&sp->lock); n = sp_lookup(sp, start, end); /* Take care of old policies in the same range. */ while (n && n->start < end) { @@ -2393,7 +2397,7 @@ restart: } if (new) sp_insert(sp, new); - spin_unlock(&sp->lock); + write_unlock(&sp->lock); ret = 0; err_out: @@ -2405,7 +2409,7 @@ err_out: return ret; alloc_new: - spin_unlock(&sp->lock); + write_unlock(&sp->lock); ret = -ENOMEM; n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); if (!n_new) @@ -2431,7 +2435,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) int ret; sp->root = RB_ROOT; /* empty tree == default mempolicy */ - spin_lock_init(&sp->lock); + rwlock_init(&sp->lock); if (mpol) { struct vm_area_struct pvma; @@ -2497,14 +2501,14 @@ void mpol_free_shared_policy(struct shared_policy *p) if (!p->root.rb_node) return; - spin_lock(&p->lock); + write_lock(&p->lock); next = rb_first(&p->root); while (next) { n = rb_entry(next, struct sp_node, nd); next = rb_next(&n->nd); sp_delete(p, n); } - spin_unlock(&p->lock); + write_unlock(&p->lock); } #ifdef CONFIG_NUMA_BALANCING -- cgit v1.2.3 From 78ddc53473419073ffb2e91178001e87bc513524 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:52:42 -0800 Subject: thp: rename split_huge_page_pmd() to split_huge_pmd() We are going to decouple splitting THP PMD from splitting underlying compound page. This patch renames split_huge_page_pmd*() functions to split_huge_pmd*() to reflect the fact that it doesn't imply page splitting, only PMD. Signed-off-by: Kirill A. Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Jerome Marchand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/subpage-prot.c | 2 +- arch/x86/kernel/vm86_32.c | 6 +++++- include/linux/huge_mm.h | 8 ++------ mm/gup.c | 2 +- mm/huge_memory.c | 32 +++++++++++--------------------- mm/memory.c | 2 +- mm/mempolicy.c | 2 +- mm/mprotect.c | 2 +- mm/mremap.c | 2 +- mm/pagewalk.c | 2 +- 10 files changed, 25 insertions(+), 35 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c index fa9fb5b4c66c..d5543514c1df 100644 --- a/arch/powerpc/mm/subpage-prot.c +++ b/arch/powerpc/mm/subpage-prot.c @@ -135,7 +135,7 @@ static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; - split_huge_page_pmd(vma, addr, pmd); + split_huge_pmd(vma, pmd, addr); return 0; } diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 483231ebbb0b..e574b8546518 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -175,7 +175,11 @@ static void mark_screen_rdonly(struct mm_struct *mm) if (pud_none_or_clear_bad(pud)) goto out; pmd = pmd_offset(pud, 0xA0000); - split_huge_page_pmd_mm(mm, 0xA0000, pmd); + + if (pmd_trans_huge(*pmd)) { + struct vm_area_struct *vma = find_vma(mm, 0xA0000); + split_huge_pmd(vma, pmd, 0xA0000); + } if (pmd_none_or_clear_bad(pmd)) goto out; pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index ecb080d6ff42..805c7ae42280 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -102,7 +102,7 @@ static inline int split_huge_page(struct page *page) } extern void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd); -#define split_huge_page_pmd(__vma, __address, __pmd) \ +#define split_huge_pmd(__vma, __pmd, __address) \ do { \ pmd_t *____pmd = (__pmd); \ if (unlikely(pmd_trans_huge(*____pmd))) \ @@ -117,8 +117,6 @@ extern void __split_huge_page_pmd(struct vm_area_struct *vma, BUG_ON(pmd_trans_splitting(*____pmd) || \ pmd_trans_huge(*____pmd)); \ } while (0) -extern void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, - pmd_t *pmd); #if HPAGE_PMD_ORDER >= MAX_ORDER #error "hugepages can't be allocated by the buddy allocator" #endif @@ -183,11 +181,9 @@ static inline int split_huge_page(struct page *page) { return 0; } -#define split_huge_page_pmd(__vma, __address, __pmd) \ - do { } while (0) #define wait_split_huge_page(__anon_vma, __pmd) \ do { } while (0) -#define split_huge_page_pmd_mm(__mm, __address, __pmd) \ +#define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) static inline int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) diff --git a/mm/gup.c b/mm/gup.c index 20fa606b4e76..1c50b506888d 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -254,7 +254,7 @@ struct page *follow_page_mask(struct vm_area_struct *vma, if (is_huge_zero_page(page)) { spin_unlock(ptl); ret = 0; - split_huge_page_pmd(vma, address, pmd); + split_huge_pmd(vma, pmd, address); } else { get_page(page); spin_unlock(ptl); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f4da89cef2cd..0d70ec056ecc 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1233,13 +1233,13 @@ alloc: if (unlikely(!new_page)) { if (!page) { - split_huge_page_pmd(vma, address, pmd); + split_huge_pmd(vma, pmd, address); ret |= VM_FAULT_FALLBACK; } else { ret = do_huge_pmd_wp_page_fallback(mm, vma, address, pmd, orig_pmd, page, haddr); if (ret & VM_FAULT_OOM) { - split_huge_page(page); + split_huge_pmd(vma, pmd, address); ret |= VM_FAULT_FALLBACK; } put_user_huge_page(page); @@ -1252,10 +1252,10 @@ alloc: true))) { put_page(new_page); if (page) { - split_huge_page(page); + split_huge_pmd(vma, pmd, address); put_user_huge_page(page); } else - split_huge_page_pmd(vma, address, pmd); + split_huge_pmd(vma, pmd, address); ret |= VM_FAULT_FALLBACK; count_vm_event(THP_FAULT_FALLBACK); goto out; @@ -3131,17 +3131,7 @@ again: goto again; } -void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, - pmd_t *pmd) -{ - struct vm_area_struct *vma; - - vma = find_vma(mm, address); - BUG_ON(vma == NULL); - split_huge_page_pmd(vma, address, pmd); -} - -static void split_huge_page_address(struct mm_struct *mm, +static void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address) { pgd_t *pgd; @@ -3150,7 +3140,7 @@ static void split_huge_page_address(struct mm_struct *mm, VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); - pgd = pgd_offset(mm, address); + pgd = pgd_offset(vma->vm_mm, address); if (!pgd_present(*pgd)) return; @@ -3159,13 +3149,13 @@ static void split_huge_page_address(struct mm_struct *mm, return; pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd)) + if (!pmd_present(*pmd) || !pmd_trans_huge(*pmd)) return; /* * Caller holds the mmap_sem write mode, so a huge pmd cannot * materialize from under us. */ - split_huge_page_pmd_mm(mm, address, pmd); + __split_huge_page_pmd(vma, address, pmd); } void vma_adjust_trans_huge(struct vm_area_struct *vma, @@ -3181,7 +3171,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, if (start & ~HPAGE_PMD_MASK && (start & HPAGE_PMD_MASK) >= vma->vm_start && (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) - split_huge_page_address(vma->vm_mm, start); + split_huge_pmd_address(vma, start); /* * If the new end address isn't hpage aligned and it could @@ -3191,7 +3181,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, if (end & ~HPAGE_PMD_MASK && (end & HPAGE_PMD_MASK) >= vma->vm_start && (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) - split_huge_page_address(vma->vm_mm, end); + split_huge_pmd_address(vma, end); /* * If we're also updating the vma->vm_next->vm_start, if the new @@ -3205,6 +3195,6 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, if (nstart & ~HPAGE_PMD_MASK && (nstart & HPAGE_PMD_MASK) >= next->vm_start && (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) - split_huge_page_address(next->vm_mm, nstart); + split_huge_pmd_address(next, nstart); } } diff --git a/mm/memory.c b/mm/memory.c index eecdd05e9923..561b7ad7f27a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1193,7 +1193,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, BUG(); } #endif - split_huge_page_pmd(vma, addr, pmd); + split_huge_pmd(vma, pmd, addr); } else if (zap_huge_pmd(tlb, vma, pmd, addr)) goto next; /* fall through */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d8caff071a30..5f7f9dace354 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -493,7 +493,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, pte_t *pte; spinlock_t *ptl; - split_huge_page_pmd(vma, addr, pmd); + split_huge_pmd(vma, pmd, addr); if (pmd_trans_unstable(pmd)) return 0; diff --git a/mm/mprotect.c b/mm/mprotect.c index c764402c464f..6047707085c1 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -160,7 +160,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, if (pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) - split_huge_page_pmd(vma, addr, pmd); + split_huge_pmd(vma, pmd, addr); else { int nr_ptes = change_huge_pmd(vma, pmd, addr, newprot, prot_numa); diff --git a/mm/mremap.c b/mm/mremap.c index e55b157865d5..5969b5093850 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -209,7 +209,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, need_flush = true; continue; } else if (!err) { - split_huge_page_pmd(vma, old_addr, old_pmd); + split_huge_pmd(vma, old_pmd, old_addr); } VM_BUG_ON(pmd_trans_huge(*old_pmd)); } diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 29f2f8b853ae..207244489a68 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -58,7 +58,7 @@ again: if (!walk->pte_entry) continue; - split_huge_page_pmd_mm(walk->mm, addr, pmd); + split_huge_pmd(walk->vma, pmd, addr); if (pmd_trans_unstable(pmd)) goto again; err = walk_pte_range(pmd, addr, next, walk); -- cgit v1.2.3 From 248db92da13f25073e7ebbd5fb95615aafd771d1 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:54:14 -0800 Subject: migrate_pages: try to split pages on queuing We are not able to migrate THPs. It means it's not enough to split only PMD on migration -- we need to split compound page under it too. Signed-off-by: Kirill A. Shutemov Tested-by: Aneesh Kumar K.V Acked-by: Jerome Marchand Cc: Sasha Levin Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 42 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 4 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 5f7f9dace354..973434eff9dc 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -489,14 +489,33 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, struct page *page; struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; - int nid; + int nid, ret; pte_t *pte; spinlock_t *ptl; - split_huge_pmd(vma, pmd, addr); - if (pmd_trans_unstable(pmd)) - return 0; + if (pmd_trans_huge(*pmd)) { + ptl = pmd_lock(walk->mm, pmd); + if (pmd_trans_huge(*pmd)) { + page = pmd_page(*pmd); + if (is_huge_zero_page(page)) { + spin_unlock(ptl); + split_huge_pmd(vma, pmd, addr); + } else { + get_page(page); + spin_unlock(ptl); + lock_page(page); + ret = split_huge_page(page); + unlock_page(page); + put_page(page); + if (ret) + return 0; + } + } else { + spin_unlock(ptl); + } + } +retry: pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) { if (!pte_present(*pte)) @@ -513,6 +532,21 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, nid = page_to_nid(page); if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) continue; + if (PageTail(page) && PageAnon(page)) { + get_page(page); + pte_unmap_unlock(pte, ptl); + lock_page(page); + ret = split_huge_page(page); + unlock_page(page); + put_page(page); + /* Failed to split -- skip. */ + if (ret) { + pte = pte_offset_map_lock(walk->mm, pmd, + addr, &ptl); + continue; + } + goto retry; + } if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) migrate_page_add(page, qp->pagelist, flags); -- cgit v1.2.3 From d645fc0eabbe783d34a14fbb31768ad8571f0ad4 Mon Sep 17 00:00:00 2001 From: Liang Chen Date: Fri, 15 Jan 2016 16:57:28 -0800 Subject: mm: mempolicy: skip non-migratable VMAs when setting MPOL_MF_LAZY MPOL_MF_LAZY is not visible from userspace since a720094ded8c ("mm: mempolicy: Hide MPOL_NOOP and MPOL_MF_LAZY from userspace for now"), but it should still skip non-migratable VMAs such as VM_IO, VM_PFNMAP, and VM_HUGETLB VMAs, and avoid useless overhead of minor faults. Signed-off-by: Liang Chen Signed-off-by: Gavin Guo Acked-by: Rik van Riel Cc: Mel Gorman Cc: Andi Kleen Cc: Vlastimil Babka Cc: David Rientjes Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm/mempolicy.c') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 973434eff9dc..27d135408a22 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -644,7 +644,8 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, if (flags & MPOL_MF_LAZY) { /* Similar to task_numa_work, skip inaccessible VMAs */ - if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) + if (vma_migratable(vma) && + vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) change_prot_numa(vma, start, endvma); return 1; } -- cgit v1.2.3