From de09d31dd38a50fdce106c15abd68432eebbd014 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:51:42 -0800 Subject: page-flags: define PG_reserved behavior on compound pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As far as I can see there's no users of PG_reserved on compound pages. Let's use PF_NO_COMPOUND here. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Jérôme Glisse Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index be934df69b85..cdf38252f82e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1267,8 +1267,8 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order) /* we rely on prep_new_huge_page to set the destructor */ set_compound_order(page, order); - __SetPageHead(page); __ClearPageReserved(page); + __SetPageHead(page); for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { /* * For gigantic hugepages allocated through bootmem at -- cgit v1.2.3 From d281ee6145183594788ab6d5b55f8d144e69eace Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:52:16 -0800 Subject: rmap: add argument to charge compound page We're going to allow mapping of individual 4k pages of THP compound page. It means we cannot rely on PageTransHuge() check to decide if map/unmap small page or THP. The patch adds new argument to rmap functions to indicate whether we want to operate on whole compound page or only the small page. [n-horiguchi@ah.jp.nec.com: fix mapcount mismatch in hugepage migration] Signed-off-by: Kirill A. 
Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Vlastimil Babka Acked-by: Jerome Marchand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 12 +++++++++--- kernel/events/uprobes.c | 4 ++-- mm/huge_memory.c | 16 ++++++++-------- mm/hugetlb.c | 4 ++-- mm/ksm.c | 4 ++-- mm/memory.c | 14 +++++++------- mm/migrate.c | 8 ++++---- mm/rmap.c | 48 +++++++++++++++++++++++++++++++----------------- mm/swapfile.c | 4 ++-- mm/userfaultfd.c | 2 +- 10 files changed, 68 insertions(+), 48 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 29446aeef36e..038b6e704d9b 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -161,16 +161,22 @@ static inline void anon_vma_merge(struct vm_area_struct *vma, struct anon_vma *page_get_anon_vma(struct page *page); +/* bitflags for do_page_add_anon_rmap() */ +#define RMAP_EXCLUSIVE 0x01 +#define RMAP_COMPOUND 0x02 + /* * rmap interfaces called when adding or removing pte of page */ void page_move_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); -void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); +void page_add_anon_rmap(struct page *, struct vm_area_struct *, + unsigned long, bool); void do_page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long, int); -void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); +void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, + unsigned long, bool); void page_add_file_rmap(struct page *); -void page_remove_rmap(struct page *); +void page_remove_rmap(struct page *, bool); void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index bb0669169716..060c7a0edfdf 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -175,7 +175,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, goto unlock; get_page(kpage); - page_add_new_anon_rmap(kpage, vma, addr); + page_add_new_anon_rmap(kpage, vma, addr, false); mem_cgroup_commit_charge(kpage, memcg, false); lru_cache_add_active_or_unevictable(kpage, vma); @@ -188,7 +188,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, ptep_clear_flush_notify(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); - page_remove_rmap(page); + page_remove_rmap(page, false); if (!page_mapped(page)) try_to_free_swap(page); pte_unmap_unlock(ptep, ptl); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 370d44a5e25b..b7669cfe9dc9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -797,7 +797,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - page_add_new_anon_rmap(page, vma, haddr); + page_add_new_anon_rmap(page, vma, haddr, true); mem_cgroup_commit_charge(page, memcg, false); lru_cache_add_active_or_unevictable(page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); @@ -1139,7 +1139,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, entry = maybe_mkwrite(pte_mkdirty(entry), vma); memcg = (void *)page_private(pages[i]); set_page_private(pages[i], 0); - 
page_add_new_anon_rmap(pages[i], vma, haddr); + page_add_new_anon_rmap(pages[i], vma, haddr, false); mem_cgroup_commit_charge(pages[i], memcg, false); lru_cache_add_active_or_unevictable(pages[i], vma); pte = pte_offset_map(&_pmd, haddr); @@ -1151,7 +1151,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); - page_remove_rmap(page); + page_remove_rmap(page, true); spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); @@ -1271,7 +1271,7 @@ alloc: entry = mk_huge_pmd(new_page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); pmdp_huge_clear_flush_notify(vma, haddr, pmd); - page_add_new_anon_rmap(new_page, vma, haddr); + page_add_new_anon_rmap(new_page, vma, haddr, true); mem_cgroup_commit_charge(new_page, memcg, false); lru_cache_add_active_or_unevictable(new_page, vma); set_pmd_at(mm, haddr, pmd, entry); @@ -1281,7 +1281,7 @@ alloc: put_huge_zero_page(); } else { VM_BUG_ON_PAGE(!PageHead(page), page); - page_remove_rmap(page); + page_remove_rmap(page, true); put_page(page); } ret |= VM_FAULT_WRITE; @@ -1508,7 +1508,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, put_huge_zero_page(); } else { struct page *page = pmd_page(orig_pmd); - page_remove_rmap(page); + page_remove_rmap(page, true); VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); VM_BUG_ON_PAGE(!PageHead(page), page); @@ -2371,7 +2371,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, * superfluous. */ pte_clear(vma->vm_mm, address, _pte); - page_remove_rmap(src_page); + page_remove_rmap(src_page, false); spin_unlock(ptl); free_page_and_swap_cache(src_page); } @@ -2682,7 +2682,7 @@ static void collapse_huge_page(struct mm_struct *mm, spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); - page_add_new_anon_rmap(new_page, vma, address); + page_add_new_anon_rmap(new_page, vma, address, true); mem_cgroup_commit_charge(new_page, memcg, false); lru_cache_add_active_or_unevictable(new_page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cdf38252f82e..e924529f7b38 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3186,7 +3186,7 @@ again: set_page_dirty(page); hugetlb_count_sub(pages_per_huge_page(h), mm); - page_remove_rmap(page); + page_remove_rmap(page, true); force_flush = !__tlb_remove_page(tlb, page); if (force_flush) { address += sz; @@ -3415,7 +3415,7 @@ retry_avoidcopy: mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, new_page, 1)); - page_remove_rmap(old_page); + page_remove_rmap(old_page, true); hugepage_add_new_anon_rmap(new_page, vma, address); /* Make the old page be freed below */ new_page = old_page; diff --git a/mm/ksm.c b/mm/ksm.c index 643abe7a75de..b4f7b69efad0 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -956,13 +956,13 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, } get_page(kpage); - page_add_anon_rmap(kpage, vma, addr); + page_add_anon_rmap(kpage, vma, addr, false); flush_cache_page(vma, addr, pte_pfn(*ptep)); ptep_clear_flush_notify(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); - page_remove_rmap(page); + page_remove_rmap(page, false); if (!page_mapped(page)) try_to_free_swap(page); put_page(page); diff --git a/mm/memory.c b/mm/memory.c index f9360dde6967..f964d190ce83 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ 
-1118,7 +1118,7 @@ again: mark_page_accessed(page); } rss[mm_counter(page)]--; - page_remove_rmap(page); + page_remove_rmap(page, false); if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); if (unlikely(!__tlb_remove_page(tlb, page))) { @@ -2118,7 +2118,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, * thread doing COW. */ ptep_clear_flush_notify(vma, address, page_table); - page_add_new_anon_rmap(new_page, vma, address); + page_add_new_anon_rmap(new_page, vma, address, false); mem_cgroup_commit_charge(new_page, memcg, false); lru_cache_add_active_or_unevictable(new_page, vma); /* @@ -2151,7 +2151,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, * mapcount is visible. So transitively, TLBs to * old page will be flushed before it can be reused. */ - page_remove_rmap(old_page); + page_remove_rmap(old_page, false); } /* Free the old page.. */ @@ -2567,7 +2567,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, pte = maybe_mkwrite(pte_mkdirty(pte), vma); flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; - exclusive = 1; + exclusive = RMAP_EXCLUSIVE; } flush_icache_page(vma, page); if (pte_swp_soft_dirty(orig_pte)) @@ -2577,7 +2577,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, do_page_add_anon_rmap(page, vma, address, exclusive); mem_cgroup_commit_charge(page, memcg, true); } else { /* ksm created a completely new copy */ - page_add_new_anon_rmap(page, vma, address); + page_add_new_anon_rmap(page, vma, address, false); mem_cgroup_commit_charge(page, memcg, false); lru_cache_add_active_or_unevictable(page, vma); } @@ -2735,7 +2735,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, } inc_mm_counter_fast(mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, address); + page_add_new_anon_rmap(page, vma, address, false); mem_cgroup_commit_charge(page, memcg, false); lru_cache_add_active_or_unevictable(page, vma); setpte: @@ -2824,7 +2824,7 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (anon) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, address); + page_add_new_anon_rmap(page, vma, address, false); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page); diff --git a/mm/migrate.c b/mm/migrate.c index f7f345ddc9ae..3921f20f8de4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -167,7 +167,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, else page_dup_rmap(new); } else if (PageAnon(new)) - page_add_anon_rmap(new, vma, addr); + page_add_anon_rmap(new, vma, addr, false); else page_add_file_rmap(new); @@ -1815,7 +1815,7 @@ fail_putback: * guarantee the copy is visible before the pagetable update. 
*/ flush_cache_range(vma, mmun_start, mmun_end); - page_add_anon_rmap(new_page, vma, mmun_start); + page_add_anon_rmap(new_page, vma, mmun_start, true); pmdp_huge_clear_flush_notify(vma, mmun_start, pmd); set_pmd_at(mm, mmun_start, pmd, entry); flush_tlb_range(vma, mmun_start, mmun_end); @@ -1826,14 +1826,14 @@ fail_putback: flush_tlb_range(vma, mmun_start, mmun_end); mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); update_mmu_cache_pmd(vma, address, &entry); - page_remove_rmap(new_page); + page_remove_rmap(new_page, true); goto fail_putback; } mlock_migrate_page(new_page, page); set_page_memcg(new_page, page_memcg(page)); set_page_memcg(page, NULL); - page_remove_rmap(page); + page_remove_rmap(page, true); spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); diff --git a/mm/rmap.c b/mm/rmap.c index 622756c16ac8..c330f9aba63a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1133,6 +1133,7 @@ static void __page_check_anon_rmap(struct page *page, * @page: the page to add the mapping to * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped + * @compound: charge the page as compound or small page * * The caller needs to hold the pte lock, and the page must be locked in * the anon_vma case: to serialize mapping,index checking after setting, @@ -1140,9 +1141,9 @@ static void __page_check_anon_rmap(struct page *page, * (but PageKsm is never downgraded to PageAnon). */ void page_add_anon_rmap(struct page *page, - struct vm_area_struct *vma, unsigned long address) + struct vm_area_struct *vma, unsigned long address, bool compound) { - do_page_add_anon_rmap(page, vma, address, 0); + do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0); } /* @@ -1151,21 +1152,24 @@ void page_add_anon_rmap(struct page *page, * Everybody else should continue to use page_add_anon_rmap above. */ void do_page_add_anon_rmap(struct page *page, - struct vm_area_struct *vma, unsigned long address, int exclusive) + struct vm_area_struct *vma, unsigned long address, int flags) { int first = atomic_inc_and_test(&page->_mapcount); if (first) { + bool compound = flags & RMAP_COMPOUND; + int nr = compound ? hpage_nr_pages(page) : 1; /* * We use the irq-unsafe __{inc|mod}_zone_page_stat because * these counters are not modified in interrupt context, and * pte lock(a spinlock) is held, which implies preemption * disabled. */ - if (PageTransHuge(page)) + if (compound) { + VM_BUG_ON_PAGE(!PageTransHuge(page), page); __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); - __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, - hpage_nr_pages(page)); + } + __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr); } if (unlikely(PageKsm(page))) return; @@ -1173,7 +1177,8 @@ void do_page_add_anon_rmap(struct page *page, VM_BUG_ON_PAGE(!PageLocked(page), page); /* address might be in next vma when migration races vma_adjust */ if (first) - __page_set_anon_rmap(page, vma, address, exclusive); + __page_set_anon_rmap(page, vma, address, + flags & RMAP_EXCLUSIVE); else __page_check_anon_rmap(page, vma, address); } @@ -1183,21 +1188,25 @@ void do_page_add_anon_rmap(struct page *page, * @page: the page to add the mapping to * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped + * @compound: charge the page as compound or small page * * Same as page_add_anon_rmap but must only be called on *new* pages. * This means the inc-and-test can be bypassed. * Page does not have to be locked. 
*/ void page_add_new_anon_rmap(struct page *page, - struct vm_area_struct *vma, unsigned long address) + struct vm_area_struct *vma, unsigned long address, bool compound) { + int nr = compound ? hpage_nr_pages(page) : 1; + VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); SetPageSwapBacked(page); atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ - if (PageTransHuge(page)) + if (compound) { + VM_BUG_ON_PAGE(!PageTransHuge(page), page); __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); - __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, - hpage_nr_pages(page)); + } + __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr); __page_set_anon_rmap(page, vma, address, 1); } @@ -1249,13 +1258,17 @@ out: /** * page_remove_rmap - take down pte mapping from a page - * @page: page to remove mapping from + * @page: page to remove mapping from + * @compound: uncharge the page as compound or small page * * The caller needs to hold the pte lock. */ -void page_remove_rmap(struct page *page) +void page_remove_rmap(struct page *page, bool compound) { + int nr = compound ? hpage_nr_pages(page) : 1; + if (!PageAnon(page)) { + VM_BUG_ON_PAGE(compound && !PageHuge(page), page); page_remove_file_rmap(page); return; } @@ -1273,11 +1286,12 @@ void page_remove_rmap(struct page *page) * these counters are not modified in interrupt context, and * pte lock(a spinlock) is held, which implies preemption disabled. */ - if (PageTransHuge(page)) + if (compound) { + VM_BUG_ON_PAGE(!PageTransHuge(page), page); __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + } - __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, - -hpage_nr_pages(page)); + __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, -nr); if (unlikely(PageMlocked(page))) clear_page_mlock(page); @@ -1416,7 +1430,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, } else dec_mm_counter(mm, mm_counter_file(page)); - page_remove_rmap(page); + page_remove_rmap(page, PageHuge(page)); page_cache_release(page); out_unmap: diff --git a/mm/swapfile.c b/mm/swapfile.c index e6b8591a3ed2..058e6f0162eb 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1160,10 +1160,10 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, set_pte_at(vma->vm_mm, addr, pte, pte_mkold(mk_pte(page, vma->vm_page_prot))); if (page == swapcache) { - page_add_anon_rmap(page, vma, addr); + page_add_anon_rmap(page, vma, addr, false); mem_cgroup_commit_charge(page, memcg, true); } else { /* ksm created a completely new copy */ - page_add_new_anon_rmap(page, vma, addr); + page_add_new_anon_rmap(page, vma, addr, false); mem_cgroup_commit_charge(page, memcg, false); lru_cache_add_active_or_unevictable(page, vma); } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 77fee9325a57..ae21a1f309c2 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -76,7 +76,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, goto out_release_uncharge_unlock; inc_mm_counter(dst_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, dst_vma, dst_addr); + page_add_new_anon_rmap(page, dst_vma, dst_addr, false); mem_cgroup_commit_charge(page, memcg, false); lru_cache_add_active_or_unevictable(page, dst_vma); -- cgit v1.2.3 From ddc58f27f9eee9117219936f77e90ad5b2e00e96 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:52:56 -0800 Subject: mm: drop tail page refcounting Tail page refcounting is utterly complicated and painful to support. 
It uses ->_mapcount on tail pages to store how many times this page is pinned. get_page() bumps ->_mapcount on tail page in addition to ->_count on head. This information is required by split_huge_page() to be able to distribute pins from head of compound page to tails during the split. We will need ->_mapcount to account PTE mappings of subpages of the compound page. We eliminate need in current meaning of ->_mapcount in tail pages by forbidding split entirely if the page is pinned. The only user of tail page refcounting is THP which is marked BROKEN for now. Let's drop all this mess. It makes get_page() and put_page() much simpler. Signed-off-by: Kirill A. Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Vlastimil Babka Acked-by: Jerome Marchand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/mm/gup.c | 4 - arch/powerpc/mm/hugetlbpage.c | 13 +- arch/s390/mm/gup.c | 13 +- arch/sparc/mm/gup.c | 14 +-- arch/x86/mm/gup.c | 4 - include/linux/mm.h | 47 ++------ include/linux/mm_types.h | 17 +-- mm/gup.c | 34 +----- mm/huge_memory.c | 41 +------ mm/hugetlb.c | 2 +- mm/internal.h | 44 ------- mm/swap.c | 273 +++--------------------------------------- 12 files changed, 40 insertions(+), 466 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c index 349995d19c7f..36a35115dc2e 100644 --- a/arch/mips/mm/gup.c +++ b/arch/mips/mm/gup.c @@ -87,8 +87,6 @@ static int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end, do { VM_BUG_ON(compound_head(page) != head); pages[*nr] = page; - if (PageTail(page)) - get_huge_page_tail(page); (*nr)++; page++; refs++; @@ -153,8 +151,6 @@ static int gup_huge_pud(pud_t pud, unsigned long addr, unsigned long end, do { VM_BUG_ON(compound_head(page) != head); pages[*nr] = page; - if (PageTail(page)) - get_huge_page_tail(page); (*nr)++; page++; refs++; diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 61b8b7ccea4f..6bd3afa775d3 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -999,7 +999,7 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, { unsigned long mask; unsigned long pte_end; - struct page *head, *page, *tail; + struct page *head, *page; pte_t pte; int refs; @@ -1022,7 +1022,6 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, head = pte_page(pte); page = head + ((addr & (sz-1)) >> PAGE_SHIFT); - tail = page; do { VM_BUG_ON(compound_head(page) != head); pages[*nr] = page; @@ -1044,15 +1043,5 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, return 0; } - /* - * Any tail page need their mapcount reference taken before we - * return. - */ - while (refs--) { - if (PageTail(tail)) - get_huge_page_tail(tail); - tail++; - } - return 1; } diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c index 21c74a71e2ab..1a2cad52babf 100644 --- a/arch/s390/mm/gup.c +++ b/arch/s390/mm/gup.c @@ -55,7 +55,7 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { unsigned long mask, result; - struct page *head, *page, *tail; + struct page *head, *page; int refs; result = write ? 
0 : _SEGMENT_ENTRY_PROTECT; @@ -67,7 +67,6 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, refs = 0; head = pmd_page(pmd); page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - tail = page; do { VM_BUG_ON(compound_head(page) != head); pages[*nr] = page; @@ -88,16 +87,6 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, return 0; } - /* - * Any tail page need their mapcount reference taken before we - * return. - */ - while (refs--) { - if (PageTail(tail)) - get_huge_page_tail(tail); - tail++; - } - return 1; } diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c index 2e5c4fc2daa9..9091c5daa2e1 100644 --- a/arch/sparc/mm/gup.c +++ b/arch/sparc/mm/gup.c @@ -56,8 +56,6 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, put_page(head); return 0; } - if (head != page) - get_huge_page_tail(page); pages[*nr] = page; (*nr)++; @@ -70,7 +68,7 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { - struct page *head, *page, *tail; + struct page *head, *page; int refs; if (!(pmd_val(pmd) & _PAGE_VALID)) @@ -82,7 +80,6 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, refs = 0; head = pmd_page(pmd); page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - tail = page; do { VM_BUG_ON(compound_head(page) != head); pages[*nr] = page; @@ -103,15 +100,6 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, return 0; } - /* Any tail page need their mapcount reference taken before we - * return. - */ - while (refs--) { - if (PageTail(tail)) - get_huge_page_tail(tail); - tail++; - } - return 1; } diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index ae9a37bf1371..65b0261d4bf3 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -136,8 +136,6 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, do { VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; - if (PageTail(page)) - get_huge_page_tail(page); (*nr)++; page++; refs++; @@ -212,8 +210,6 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr, do { VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; - if (PageTail(page)) - get_huge_page_tail(page); (*nr)++; page++; refs++; diff --git a/include/linux/mm.h b/include/linux/mm.h index 839d9e9a1c38..34387351930c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -466,44 +466,9 @@ static inline int page_count(struct page *page) return atomic_read(&compound_head(page)->_count); } -static inline bool __compound_tail_refcounted(struct page *page) -{ - return PageAnon(page) && !PageSlab(page) && !PageHeadHuge(page); -} - -/* - * This takes a head page as parameter and tells if the - * tail page reference counting can be skipped. - * - * For this to be safe, PageSlab and PageHeadHuge must remain true on - * any given page where they return true here, until all tail pins - * have been released. - */ -static inline bool compound_tail_refcounted(struct page *page) -{ - VM_BUG_ON_PAGE(!PageHead(page), page); - return __compound_tail_refcounted(page); -} - -static inline void get_huge_page_tail(struct page *page) -{ - /* - * __split_huge_page_refcount() cannot run from under us. 
- */ - VM_BUG_ON_PAGE(!PageTail(page), page); - VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); - VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page); - if (compound_tail_refcounted(compound_head(page))) - atomic_inc(&page->_mapcount); -} - -extern bool __get_page_tail(struct page *page); - static inline void get_page(struct page *page) { - if (unlikely(PageTail(page))) - if (likely(__get_page_tail(page))) - return; + page = compound_head(page); /* * Getting a normal page or the head of a compound page * requires to already have an elevated page->_count. @@ -528,7 +493,15 @@ static inline void init_page_count(struct page *page) atomic_set(&page->_count, 1); } -void put_page(struct page *page); +void __put_page(struct page *page); + +static inline void put_page(struct page *page) +{ + page = compound_head(page); + if (put_page_testzero(page)) + __put_page(page); +} + void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6bc9a0ce2253..faf6fe88d6b3 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -81,20 +81,9 @@ struct page { union { /* - * Count of ptes mapped in - * mms, to show when page is - * mapped & limit reverse map - * searches. - * - * Used also for tail pages - * refcounting instead of - * _count. Tail pages cannot - * be mapped and keeping the - * tail page _count zero at - * all times guarantees - * get_page_unless_zero() will - * never succeed on tail - * pages. + * Count of ptes mapped in mms, to show + * when page is mapped & limit reverse + * map searches. */ atomic_t _mapcount; diff --git a/mm/gup.c b/mm/gup.c index 1c50b506888d..7017abea9fd6 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -130,7 +130,7 @@ retry: } if (flags & FOLL_GET) - get_page_foll(page); + get_page(page); if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) @@ -1153,7 +1153,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { - struct page *head, *page, *tail; + struct page *head, *page; int refs; if (write && !pmd_write(orig)) @@ -1162,7 +1162,6 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, refs = 0; head = pmd_page(orig); page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - tail = page; do { VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; @@ -1183,24 +1182,13 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, return 0; } - /* - * Any tail pages need their mapcount reference taken before we - * return. (This allows the THP code to bump their ref count when - * they are split into base pages). 
- */ - while (refs--) { - if (PageTail(tail)) - get_huge_page_tail(tail); - tail++; - } - return 1; } static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { - struct page *head, *page, *tail; + struct page *head, *page; int refs; if (write && !pud_write(orig)) @@ -1209,7 +1197,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, refs = 0; head = pud_page(orig); page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - tail = page; do { VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; @@ -1230,12 +1217,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, return 0; } - while (refs--) { - if (PageTail(tail)) - get_huge_page_tail(tail); - tail++; - } - return 1; } @@ -1244,7 +1225,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, struct page **pages, int *nr) { int refs; - struct page *head, *page, *tail; + struct page *head, *page; if (write && !pgd_write(orig)) return 0; @@ -1252,7 +1233,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, refs = 0; head = pgd_page(orig); page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); - tail = page; do { VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; @@ -1273,12 +1253,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, return 0; } - while (refs--) { - if (PageTail(tail)) - get_huge_page_tail(tail); - tail++; - } - return 1; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 016b70ab5ed4..72fd53fe2b61 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1038,37 +1038,6 @@ unlock: spin_unlock(ptl); } -/* - * Save CONFIG_DEBUG_PAGEALLOC from faulting falsely on tail pages - * during copy_user_huge_page()'s copy_page_rep(): in the case when - * the source page gets split and a tail freed before copy completes. - * Called under pmd_lock of checked pmd, so safe from splitting itself. 
- */ -static void get_user_huge_page(struct page *page) -{ - if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) { - struct page *endpage = page + HPAGE_PMD_NR; - - atomic_add(HPAGE_PMD_NR, &page->_count); - while (++page < endpage) - get_huge_page_tail(page); - } else { - get_page(page); - } -} - -static void put_user_huge_page(struct page *page) -{ - if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) { - struct page *endpage = page + HPAGE_PMD_NR; - - while (page < endpage) - put_page(page++); - } else { - put_page(page); - } -} - static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, @@ -1221,7 +1190,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, ret |= VM_FAULT_WRITE; goto out_unlock; } - get_user_huge_page(page); + get_page(page); spin_unlock(ptl); alloc: if (transparent_hugepage_enabled(vma) && @@ -1242,7 +1211,7 @@ alloc: split_huge_pmd(vma, pmd, address); ret |= VM_FAULT_FALLBACK; } - put_user_huge_page(page); + put_page(page); } count_vm_event(THP_FAULT_FALLBACK); goto out; @@ -1253,7 +1222,7 @@ alloc: put_page(new_page); if (page) { split_huge_pmd(vma, pmd, address); - put_user_huge_page(page); + put_page(page); } else split_huge_pmd(vma, pmd, address); ret |= VM_FAULT_FALLBACK; @@ -1275,7 +1244,7 @@ alloc: spin_lock(ptl); if (page) - put_user_huge_page(page); + put_page(page); if (unlikely(!pmd_same(*pmd, orig_pmd))) { spin_unlock(ptl); mem_cgroup_cancel_charge(new_page, memcg, true); @@ -1360,7 +1329,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; VM_BUG_ON_PAGE(!PageCompound(page), page); if (flags & FOLL_GET) - get_page_foll(page); + get_page(page); out: return page; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e924529f7b38..84af842e828d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3865,7 +3865,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, same_page: if (pages) { pages[i] = mem_map_offset(page, pfn_offset); - get_page_foll(pages[i]); + get_page(pages[i]); } if (vmas) diff --git a/mm/internal.h b/mm/internal.h index 38e24b89e4c4..569facd1f6da 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -66,50 +66,6 @@ static inline void set_page_refcounted(struct page *page) set_page_count(page, 1); } -static inline void __get_page_tail_foll(struct page *page, - bool get_page_head) -{ - /* - * If we're getting a tail page, the elevated page->_count is - * required only in the head page and we will elevate the head - * page->_count and tail page->_mapcount. - * - * We elevate page_tail->_mapcount for tail pages to force - * page_tail->_count to be zero at all times to avoid getting - * false positives from get_page_unless_zero() with - * speculative page access (like in - * page_cache_get_speculative()) on tail pages. - */ - VM_BUG_ON_PAGE(atomic_read(&compound_head(page)->_count) <= 0, page); - if (get_page_head) - atomic_inc(&compound_head(page)->_count); - get_huge_page_tail(page); -} - -/* - * This is meant to be called as the FOLL_GET operation of - * follow_page() and it must be called while holding the proper PT - * lock while the pte (or pmd_trans_huge) is still mapping the page. - */ -static inline void get_page_foll(struct page *page) -{ - if (unlikely(PageTail(page))) - /* - * This is safe only because - * __split_huge_page_refcount() can't run under - * get_page_foll() because we hold the proper PT lock. 
- */ - __get_page_tail_foll(page, true); - else { - /* - * Getting a normal page or the head of a compound page - * requires to already have an elevated page->_count. - */ - VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page); - atomic_inc(&page->_count); - } -} - extern unsigned long highest_memmap_pfn; /* diff --git a/mm/swap.c b/mm/swap.c index 39395fb549c0..3d65480422e8 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -89,260 +89,14 @@ static void __put_compound_page(struct page *page) (*dtor)(page); } -/** - * Two special cases here: we could avoid taking compound_lock_irqsave - * and could skip the tail refcounting(in _mapcount). - * - * 1. Hugetlbfs page: - * - * PageHeadHuge will remain true until the compound page - * is released and enters the buddy allocator, and it could - * not be split by __split_huge_page_refcount(). - * - * So if we see PageHeadHuge set, and we have the tail page pin, - * then we could safely put head page. - * - * 2. Slab THP page: - * - * PG_slab is cleared before the slab frees the head page, and - * tail pin cannot be the last reference left on the head page, - * because the slab code is free to reuse the compound page - * after a kfree/kmem_cache_free without having to check if - * there's any tail pin left. In turn all tail pinsmust be always - * released while the head is still pinned by the slab code - * and so we know PG_slab will be still set too. - * - * So if we see PageSlab set, and we have the tail page pin, - * then we could safely put head page. - */ -static __always_inline -void put_unrefcounted_compound_page(struct page *page_head, struct page *page) -{ - /* - * If @page is a THP tail, we must read the tail page - * flags after the head page flags. The - * __split_huge_page_refcount side enforces write memory barriers - * between clearing PageTail and before the head page - * can be freed and reallocated. - */ - smp_rmb(); - if (likely(PageTail(page))) { - /* - * __split_huge_page_refcount cannot race - * here, see the comment above this function. - */ - VM_BUG_ON_PAGE(!PageHead(page_head), page_head); - if (put_page_testzero(page_head)) { - /* - * If this is the tail of a slab THP page, - * the tail pin must not be the last reference - * held on the page, because the PG_slab cannot - * be cleared before all tail pins (which skips - * the _mapcount tail refcounting) have been - * released. - * - * If this is the tail of a hugetlbfs page, - * the tail pin may be the last reference on - * the page instead, because PageHeadHuge will - * not go away until the compound page enters - * the buddy allocator. - */ - VM_BUG_ON_PAGE(PageSlab(page_head), page_head); - __put_compound_page(page_head); - } - } else - /* - * __split_huge_page_refcount run before us, - * @page was a THP tail. The split @page_head - * has been freed and reallocated as slab or - * hugetlbfs page of smaller order (only - * possible if reallocated as slab on x86). - */ - if (put_page_testzero(page)) - __put_single_page(page); -} - -static __always_inline -void put_refcounted_compound_page(struct page *page_head, struct page *page) -{ - if (likely(page != page_head && get_page_unless_zero(page_head))) { - unsigned long flags; - - /* - * @page_head wasn't a dangling pointer but it may not - * be a head page anymore by the time we obtain the - * lock. That is ok as long as it can't be freed from - * under us. 
- */ - flags = compound_lock_irqsave(page_head); - if (unlikely(!PageTail(page))) { - /* __split_huge_page_refcount run before us */ - compound_unlock_irqrestore(page_head, flags); - if (put_page_testzero(page_head)) { - /* - * The @page_head may have been freed - * and reallocated as a compound page - * of smaller order and then freed - * again. All we know is that it - * cannot have become: a THP page, a - * compound page of higher order, a - * tail page. That is because we - * still hold the refcount of the - * split THP tail and page_head was - * the THP head before the split. - */ - if (PageHead(page_head)) - __put_compound_page(page_head); - else - __put_single_page(page_head); - } -out_put_single: - if (put_page_testzero(page)) - __put_single_page(page); - return; - } - VM_BUG_ON_PAGE(page_head != compound_head(page), page); - /* - * We can release the refcount taken by - * get_page_unless_zero() now that - * __split_huge_page_refcount() is blocked on the - * compound_lock. - */ - if (put_page_testzero(page_head)) - VM_BUG_ON_PAGE(1, page_head); - /* __split_huge_page_refcount will wait now */ - VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page); - atomic_dec(&page->_mapcount); - VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head); - VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page); - compound_unlock_irqrestore(page_head, flags); - - if (put_page_testzero(page_head)) { - if (PageHead(page_head)) - __put_compound_page(page_head); - else - __put_single_page(page_head); - } - } else { - /* @page_head is a dangling pointer */ - VM_BUG_ON_PAGE(PageTail(page), page); - goto out_put_single; - } -} - -static void put_compound_page(struct page *page) -{ - struct page *page_head; - - /* - * We see the PageCompound set and PageTail not set, so @page maybe: - * 1. hugetlbfs head page, or - * 2. THP head page. - */ - if (likely(!PageTail(page))) { - if (put_page_testzero(page)) { - /* - * By the time all refcounts have been released - * split_huge_page cannot run anymore from under us. - */ - if (PageHead(page)) - __put_compound_page(page); - else - __put_single_page(page); - } - return; - } - - /* - * We see the PageCompound set and PageTail set, so @page maybe: - * 1. a tail hugetlbfs page, or - * 2. a tail THP page, or - * 3. a split THP page. - * - * Case 3 is possible, as we may race with - * __split_huge_page_refcount tearing down a THP page. - */ - page_head = compound_head(page); - if (!__compound_tail_refcounted(page_head)) - put_unrefcounted_compound_page(page_head, page); - else - put_refcounted_compound_page(page_head, page); -} - -void put_page(struct page *page) +void __put_page(struct page *page) { if (unlikely(PageCompound(page))) - put_compound_page(page); - else if (put_page_testzero(page)) + __put_compound_page(page); + else __put_single_page(page); } -EXPORT_SYMBOL(put_page); - -/* - * This function is exported but must not be called by anything other - * than get_page(). It implements the slow path of get_page(). - */ -bool __get_page_tail(struct page *page) -{ - /* - * This takes care of get_page() if run on a tail page - * returned by one of the get_user_pages/follow_page variants. - * get_user_pages/follow_page itself doesn't need the compound - * lock because it runs __get_page_tail_foll() under the - * proper PT lock that already serializes against - * split_huge_page(). - */ - unsigned long flags; - bool got; - struct page *page_head = compound_head(page); - - /* Ref to put_compound_page() comment. 
*/ - if (!__compound_tail_refcounted(page_head)) { - smp_rmb(); - if (likely(PageTail(page))) { - /* - * This is a hugetlbfs page or a slab - * page. __split_huge_page_refcount - * cannot race here. - */ - VM_BUG_ON_PAGE(!PageHead(page_head), page_head); - __get_page_tail_foll(page, true); - return true; - } else { - /* - * __split_huge_page_refcount run - * before us, "page" was a THP - * tail. The split page_head has been - * freed and reallocated as slab or - * hugetlbfs page of smaller order - * (only possible if reallocated as - * slab on x86). - */ - return false; - } - } - - got = false; - if (likely(page != page_head && get_page_unless_zero(page_head))) { - /* - * page_head wasn't a dangling pointer but it - * may not be a head page anymore by the time - * we obtain the lock. That is ok as long as it - * can't be freed from under us. - */ - flags = compound_lock_irqsave(page_head); - /* here __split_huge_page_refcount won't run anymore */ - if (likely(PageTail(page))) { - __get_page_tail_foll(page, false); - got = true; - } - compound_unlock_irqrestore(page_head, flags); - if (unlikely(!got)) - put_page(page_head); - } - return got; -} -EXPORT_SYMBOL(__get_page_tail); +EXPORT_SYMBOL(__put_page); /** * put_pages_list() - release a list of pages @@ -918,15 +672,6 @@ void release_pages(struct page **pages, int nr, bool cold) for (i = 0; i < nr; i++) { struct page *page = pages[i]; - if (unlikely(PageCompound(page))) { - if (zone) { - spin_unlock_irqrestore(&zone->lru_lock, flags); - zone = NULL; - } - put_compound_page(page); - continue; - } - /* * Make sure the IRQ-safe lock-holding time does not get * excessive with a continuous string of pages from the @@ -937,9 +682,19 @@ void release_pages(struct page **pages, int nr, bool cold) zone = NULL; } + page = compound_head(page); if (!put_page_testzero(page)) continue; + if (PageCompound(page)) { + if (zone) { + spin_unlock_irqrestore(&zone->lru_lock, flags); + zone = NULL; + } + __put_compound_page(page); + continue; + } + if (PageLRU(page)) { struct zone *pagezone = page_zone(page); -- cgit v1.2.3 From 53f9263baba69fc1630e3c780c4d11b72643f962 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:53:42 -0800 Subject: mm: rework mapcount accounting to enable 4k mapping of THPs We're going to allow mapping of individual 4k pages of THP compound. It means we need to track mapcount on per small page basis. Straight-forward approach is to use ->_mapcount in all subpages to track how many time this subpage is mapped with PMDs or PTEs combined. But this is rather expensive: mapping or unmapping of a THP page with PMD would require HPAGE_PMD_NR atomic operations instead of single we have now. The idea is to store separately how many times the page was mapped as whole -- compound_mapcount. This frees up ->_mapcount in subpages to track PTE mapcount. We use the same approach as with compound page destructor and compound order to store compound_mapcount: use space in first tail page, ->mapping this time. Any time we map/unmap whole compound page (THP or hugetlb) -- we increment/decrement compound_mapcount. When we map part of compound page with PTE we operate on ->_mapcount of the subpage. page_mapcount() counts both: PTE and PMD mappings of the page. Basically, we have mapcount for a subpage spread over two counters. It makes tricky to detect when last mapcount for a page goes away. We introduced PageDoubleMap() for this. 
When we split THP PMD for the first time and there's other PMD mapping left we offset up ->_mapcount in all subpages by one and set PG_double_map on the compound page. These additional references go away with last compound_mapcount. This approach provides a way to detect when last mapcount goes away on per small page basis without introducing new overhead for most common cases. [akpm@linux-foundation.org: fix typo in comment] [mhocko@suse.com: ignore partial THP when moving task] Signed-off-by: Kirill A. Shutemov Tested-by: Aneesh Kumar K.V Acked-by: Jerome Marchand Cc: Sasha Levin Cc: Aneesh Kumar K.V Cc: Jerome Marchand Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 26 +++++++++++- include/linux/mm_types.h | 1 + include/linux/page-flags.h | 36 +++++++++++++++++ include/linux/rmap.h | 4 +- mm/debug.c | 5 ++- mm/huge_memory.c | 2 +- mm/hugetlb.c | 4 +- mm/memcontrol.c | 8 ++++ mm/memory.c | 2 +- mm/migrate.c | 2 +- mm/page_alloc.c | 13 ++++-- mm/rmap.c | 98 +++++++++++++++++++++++++++++++++++----------- 12 files changed, 166 insertions(+), 35 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index 70f59de2e288..67e0fab225e8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -410,6 +410,19 @@ static inline int is_vmalloc_or_module_addr(const void *x) extern void kvfree(const void *addr); +static inline atomic_t *compound_mapcount_ptr(struct page *page) +{ + return &page[1].compound_mapcount; +} + +static inline int compound_mapcount(struct page *page) +{ + if (!PageCompound(page)) + return 0; + page = compound_head(page); + return atomic_read(compound_mapcount_ptr(page)) + 1; +} + /* * The atomic page->_mapcount, starts from -1: so that transitions * both from it and to it can be tracked, using atomic_inc_and_test @@ -422,8 +435,17 @@ static inline void page_mapcount_reset(struct page *page) static inline int page_mapcount(struct page *page) { + int ret; VM_BUG_ON_PAGE(PageSlab(page), page); - return atomic_read(&page->_mapcount) + 1; + + ret = atomic_read(&page->_mapcount) + 1; + if (PageCompound(page)) { + page = compound_head(page); + ret += atomic_read(compound_mapcount_ptr(page)) + 1; + if (PageDoubleMap(page)) + ret--; + } + return ret; } static inline int page_count(struct page *page) @@ -934,7 +956,7 @@ static inline pgoff_t page_file_index(struct page *page) */ static inline int page_mapped(struct page *page) { - return atomic_read(&(page)->_mapcount) >= 0; + return atomic_read(&(page)->_mapcount) + compound_mapcount(page) >= 0; } /* diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index faf6fe88d6b3..809defe0597d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -54,6 +54,7 @@ struct page { * see PAGE_MAPPING_ANON below. */ void *s_mem; /* slab first object */ + atomic_t compound_mapcount; /* first tail page */ }; /* Second double word */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 0c42acca0338..19724e6ebd26 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -126,6 +126,9 @@ enum pageflags { /* SLOB */ PG_slob_free = PG_private, + + /* Compound pages. 
Stored in first tail page's flags */ + PG_double_map = PG_private_2, }; #ifndef __GENERATING_BOUNDS_H @@ -523,10 +526,43 @@ static inline int PageTransTail(struct page *page) return PageTail(page); } +/* + * PageDoubleMap indicates that the compound page is mapped with PTEs as well + * as PMDs. + * + * This is required for optimization of rmap operations for THP: we can postpone + * per small page mapcount accounting (and its overhead from atomic operations) + * until the first PMD split. + * + * For the page PageDoubleMap means ->_mapcount in all sub-pages is offset up + * by one. This reference will go away with last compound_mapcount. + * + * See also __split_huge_pmd_locked() and page_remove_anon_compound_rmap(). + */ +static inline int PageDoubleMap(struct page *page) +{ + return PageHead(page) && test_bit(PG_double_map, &page[1].flags); +} + +static inline int TestSetPageDoubleMap(struct page *page) +{ + VM_BUG_ON_PAGE(!PageHead(page), page); + return test_and_set_bit(PG_double_map, &page[1].flags); +} + +static inline int TestClearPageDoubleMap(struct page *page) +{ + VM_BUG_ON_PAGE(!PageHead(page), page); + return test_and_clear_bit(PG_double_map, &page[1].flags); +} + #else TESTPAGEFLAG_FALSE(TransHuge) TESTPAGEFLAG_FALSE(TransCompound) TESTPAGEFLAG_FALSE(TransTail) +TESTPAGEFLAG_FALSE(DoubleMap) + TESTSETFLAG_FALSE(DoubleMap) + TESTCLEARFLAG_FALSE(DoubleMap) #endif /* diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 038b6e704d9b..ebf3750e42b2 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -183,9 +183,9 @@ void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); -static inline void page_dup_rmap(struct page *page) +static inline void page_dup_rmap(struct page *page, bool compound) { - atomic_inc(&page->_mapcount); + atomic_inc(compound ? 
compound_mapcount_ptr(page) : &page->_mapcount); } /* diff --git a/mm/debug.c b/mm/debug.c index 03c9a13f9f6a..f05b2d5d6481 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -79,9 +79,12 @@ static void dump_flags(unsigned long flags, void dump_page_badflags(struct page *page, const char *reason, unsigned long badflags) { - pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", + pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx", page, atomic_read(&page->_count), page_mapcount(page), page->mapping, page->index); + if (PageCompound(page)) + pr_cont(" compound_mapcount: %d", compound_mapcount(page)); + pr_cont("\n"); BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names)); if (reason) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1de7ab5d1004..1588f688b75d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -989,7 +989,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, src_page = pmd_page(pmd); VM_BUG_ON_PAGE(!PageHead(src_page), src_page); get_page(src_page); - page_dup_rmap(src_page); + page_dup_rmap(src_page, true); add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); pmdp_set_wrprotect(src_mm, addr, src_pmd); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 84af842e828d..12908dcf5831 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3102,7 +3102,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, entry = huge_ptep_get(src_pte); ptepage = pte_page(entry); get_page(ptepage); - page_dup_rmap(ptepage); + page_dup_rmap(ptepage, true); set_huge_pte_at(dst, addr, dst_pte, entry); hugetlb_count_add(pages_per_huge_page(h), dst); } @@ -3585,7 +3585,7 @@ retry: ClearPagePrivate(page); hugepage_add_new_anon_rmap(page, vma, address); } else - page_dup_rmap(page); + page_dup_rmap(page, true); new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); set_huge_pte_at(mm, address, ptep, new_pte); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3b8c845e0c2e..bee6b1c9fdce 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4899,6 +4899,14 @@ retry: switch (get_mctgt_type(vma, addr, ptent, &target)) { case MC_TARGET_PAGE: page = target.page; + /* + * We can have a part of the split pmd here. Moving it + * can be done but it would be too convoluted so simply + * ignore such a partial THP and keep it in original + * memcg. There should be somebody mapping the head. 
+ */ + if (PageTransCompound(page)) + goto put; if (isolate_lru_page(page)) goto put; if (!mem_cgroup_move_account(page, false, diff --git a/mm/memory.c b/mm/memory.c index 3b656d1a8e07..9b0dbc2f0b9a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -864,7 +864,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, page = vm_normal_page(vma, addr, pte); if (page) { get_page(page); - page_dup_rmap(page); + page_dup_rmap(page, false); rss[mm_counter(page)]++; } diff --git a/mm/migrate.c b/mm/migrate.c index 3921f20f8de4..91545da23fd1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -165,7 +165,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, if (PageAnon(new)) hugepage_add_anon_rmap(new, vma, addr); else - page_dup_rmap(new); + page_dup_rmap(new, true); } else if (PageAnon(new)) page_add_anon_rmap(new, vma, addr, false); else diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d02d6436add0..3221091da513 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -469,6 +469,7 @@ void prep_compound_page(struct page *page, unsigned int order) p->mapping = TAIL_MAPPING; set_compound_head(p, page); } + atomic_set(compound_mapcount_ptr(page), -1); } #ifdef CONFIG_DEBUG_PAGEALLOC @@ -733,7 +734,7 @@ static inline int free_pages_check(struct page *page) const char *bad_reason = NULL; unsigned long bad_flags = 0; - if (unlikely(page_mapcount(page))) + if (unlikely(atomic_read(&page->_mapcount) != -1)) bad_reason = "nonzero mapcount"; if (unlikely(page->mapping != NULL)) bad_reason = "non-NULL mapping"; @@ -857,7 +858,13 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) ret = 0; goto out; } - if (page->mapping != TAIL_MAPPING) { + /* mapping in first tail page is used for compound_mapcount() */ + if (page - head_page == 1) { + if (unlikely(compound_mapcount(page))) { + bad_page(page, "nonzero compound_mapcount", 0); + goto out; + } + } else if (page->mapping != TAIL_MAPPING) { bad_page(page, "corrupted mapping in tail page", 0); goto out; } @@ -1335,7 +1342,7 @@ static inline int check_new_page(struct page *page) const char *bad_reason = NULL; unsigned long bad_flags = 0; - if (unlikely(page_mapcount(page))) + if (unlikely(atomic_read(&page->_mapcount) != -1)) bad_reason = "nonzero mapcount"; if (unlikely(page->mapping != NULL)) bad_reason = "non-NULL mapping"; diff --git a/mm/rmap.c b/mm/rmap.c index aa68a4089a53..2e6257165527 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1122,7 +1122,7 @@ static void __page_check_anon_rmap(struct page *page, * over the call to page_add_new_anon_rmap. 
*/ BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root); - BUG_ON(page->index != linear_page_index(vma, address)); + BUG_ON(page_to_pgoff(page) != linear_page_index(vma, address)); #endif } @@ -1152,9 +1152,28 @@ void page_add_anon_rmap(struct page *page, void do_page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address, int flags) { - int first = atomic_inc_and_test(&page->_mapcount); + bool compound = flags & RMAP_COMPOUND; + bool first; + + if (PageTransCompound(page)) { + VM_BUG_ON_PAGE(!PageLocked(page), page); + if (compound) { + atomic_t *mapcount; + + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + mapcount = compound_mapcount_ptr(page); + first = atomic_inc_and_test(mapcount); + } else { + /* Anon THP always mapped first with PMD */ + first = 0; + VM_BUG_ON_PAGE(!page_mapcount(page), page); + atomic_inc(&page->_mapcount); + } + } else { + first = atomic_inc_and_test(&page->_mapcount); + } + if (first) { - bool compound = flags & RMAP_COMPOUND; int nr = compound ? hpage_nr_pages(page) : 1; /* * We use the irq-unsafe __{inc|mod}_zone_page_stat because @@ -1173,6 +1192,7 @@ void do_page_add_anon_rmap(struct page *page, return; VM_BUG_ON_PAGE(!PageLocked(page), page); + /* address might be in next vma when migration races vma_adjust */ if (first) __page_set_anon_rmap(page, vma, address, @@ -1199,10 +1219,16 @@ void page_add_new_anon_rmap(struct page *page, VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); SetPageSwapBacked(page); - atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ if (compound) { VM_BUG_ON_PAGE(!PageTransHuge(page), page); + /* increment count (starts at -1) */ + atomic_set(compound_mapcount_ptr(page), 0); __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + } else { + /* Anon THP always mapped first with PMD */ + VM_BUG_ON_PAGE(PageTransCompound(page), page); + /* increment count (starts at -1) */ + atomic_set(&page->_mapcount, 0); } __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr); __page_set_anon_rmap(page, vma, address, 1); @@ -1232,12 +1258,15 @@ static void page_remove_file_rmap(struct page *page) memcg = mem_cgroup_begin_page_stat(page); - /* page still mapped by someone else? */ - if (!atomic_add_negative(-1, &page->_mapcount)) + /* Hugepages are not counted in NR_FILE_MAPPED for now. */ + if (unlikely(PageHuge(page))) { + /* hugetlb pages are always mapped with pmds */ + atomic_dec(compound_mapcount_ptr(page)); goto out; + } - /* Hugepages are not counted in NR_FILE_MAPPED for now. */ - if (unlikely(PageHuge(page))) + /* page still mapped by someone else? */ + if (!atomic_add_negative(-1, &page->_mapcount)) goto out; /* @@ -1254,6 +1283,39 @@ out: mem_cgroup_end_page_stat(memcg); } +static void page_remove_anon_compound_rmap(struct page *page) +{ + int i, nr; + + if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) + return; + + /* Hugepages are not counted in NR_ANON_PAGES for now. */ + if (unlikely(PageHuge(page))) + return; + + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) + return; + + __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + + if (TestClearPageDoubleMap(page)) { + /* + * Subpages can be mapped with PTEs too. Check how many of + * themi are still mapped. 
+ */ + for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) { + if (atomic_add_negative(-1, &page[i]._mapcount)) + nr++; + } + } else { + nr = HPAGE_PMD_NR; + } + + if (nr) + __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, -nr); +} + /** * page_remove_rmap - take down pte mapping from a page * @page: page to remove mapping from @@ -1263,33 +1325,25 @@ out: */ void page_remove_rmap(struct page *page, bool compound) { - int nr = compound ? hpage_nr_pages(page) : 1; - if (!PageAnon(page)) { VM_BUG_ON_PAGE(compound && !PageHuge(page), page); page_remove_file_rmap(page); return; } + if (compound) + return page_remove_anon_compound_rmap(page); + /* page still mapped by someone else? */ if (!atomic_add_negative(-1, &page->_mapcount)) return; - /* Hugepages are not counted in NR_ANON_PAGES for now. */ - if (unlikely(PageHuge(page))) - return; - /* * We use the irq-unsafe __{inc|mod}_zone_page_stat because * these counters are not modified in interrupt context, and * pte lock(a spinlock) is held, which implies preemption disabled. */ - if (compound) { - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); - } - - __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, -nr); + __dec_zone_page_state(page, NR_ANON_PAGES); if (unlikely(PageMlocked(page))) clear_page_mlock(page); @@ -1710,7 +1764,7 @@ void hugepage_add_anon_rmap(struct page *page, BUG_ON(!PageLocked(page)); BUG_ON(!anon_vma); /* address might be in next vma when migration races vma_adjust */ - first = atomic_inc_and_test(&page->_mapcount); + first = atomic_inc_and_test(compound_mapcount_ptr(page)); if (first) __hugepage_set_anon_rmap(page, vma, address, 0); } @@ -1719,7 +1773,7 @@ void hugepage_add_new_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { BUG_ON(address < vma->vm_start || address >= vma->vm_end); - atomic_set(&page->_mapcount, 0); + atomic_set(compound_mapcount_ptr(page), 0); __hugepage_set_anon_rmap(page, vma, address, 1); } #endif /* CONFIG_HUGETLB_PAGE */ -- cgit v1.2.3