diff options
Diffstat (limited to 'mm/khugepaged.c')
-rw-r--r-- | mm/khugepaged.c | 86 |
1 files changed, 58 insertions, 28 deletions
diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 5cb401aa2b9d..92e6f56a932d 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -490,32 +490,43 @@ void __khugepaged_exit(struct mm_struct *mm) } } +static void release_pte_folio(struct folio *folio) +{ + node_stat_mod_folio(folio, + NR_ISOLATED_ANON + folio_is_file_lru(folio), + -folio_nr_pages(folio)); + folio_unlock(folio); + folio_putback_lru(folio); +} + static void release_pte_page(struct page *page) { - mod_node_page_state(page_pgdat(page), - NR_ISOLATED_ANON + page_is_file_lru(page), - -compound_nr(page)); - unlock_page(page); - putback_lru_page(page); + release_pte_folio(page_folio(page)); } static void release_pte_pages(pte_t *pte, pte_t *_pte, struct list_head *compound_pagelist) { - struct page *page, *tmp; + struct folio *folio, *tmp; while (--_pte >= pte) { pte_t pteval = *_pte; + unsigned long pfn; - page = pte_page(pteval); - if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)) && - !PageCompound(page)) - release_pte_page(page); + if (pte_none(pteval)) + continue; + pfn = pte_pfn(pteval); + if (is_zero_pfn(pfn)) + continue; + folio = pfn_folio(pfn); + if (folio_test_large(folio)) + continue; + release_pte_folio(folio); } - list_for_each_entry_safe(page, tmp, compound_pagelist, lru) { - list_del(&page->lru); - release_pte_page(page); + list_for_each_entry_safe(folio, tmp, compound_pagelist, lru) { + list_del(&folio->lru); + release_pte_folio(folio); } } @@ -625,7 +636,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, * Isolate the page to avoid collapsing an hugepage * currently in use by the VM. */ - if (isolate_lru_page(page)) { + if (!isolate_lru_page(page)) { unlock_page(page); result = SCAN_DEL_PAGE_LRU; goto out; @@ -847,6 +858,10 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, return SCAN_SUCCEED; } +/* + * See pmd_trans_unstable() for how the result may change out from + * underneath us, even if we hold mmap_lock in read. + */ static int find_pmd_or_thp_or_none(struct mm_struct *mm, unsigned long address, pmd_t **pmd) @@ -865,8 +880,12 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm, #endif if (pmd_none(pmde)) return SCAN_PMD_NONE; + if (!pmd_present(pmde)) + return SCAN_PMD_NULL; if (pmd_trans_huge(pmde)) return SCAN_PMD_MAPPED; + if (pmd_devmap(pmde)) + return SCAN_PMD_NULL; if (pmd_bad(pmde)) return SCAN_PMD_NULL; return SCAN_SUCCEED; @@ -1032,8 +1051,8 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, anon_vma_lock_write(vma->anon_vma); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, - address, address + HPAGE_PMD_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address, + address + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); pte = pte_offset_map(pmd, address); @@ -1404,7 +1423,7 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v if (vma->anon_vma) lockdep_assert_held_write(&vma->anon_vma->root->rwsem); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, addr, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, addr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); pmd = pmdp_collapse_flush(vma, addr, pmdp); @@ -1460,14 +1479,6 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false)) return SCAN_VMA_CHECK; - /* - * Symmetry with retract_page_tables(): Exclude MAP_PRIVATE mappings - * that got written to. Without this, we'd have to also lock the - * anon_vma if one exists. - */ - if (vma->anon_vma) - return SCAN_VMA_CHECK; - /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ if (userfaultfd_wp(vma)) return SCAN_PTE_UFFD_WP; @@ -1567,8 +1578,14 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, } /* step 4: remove pte entries */ + /* we make no change to anon, but protect concurrent anon page lookup */ + if (vma->anon_vma) + anon_vma_lock_write(vma->anon_vma); + collapse_and_free_pmd(mm, vma, haddr, pmd); + if (vma->anon_vma) + anon_vma_unlock_write(vma->anon_vma); i_mmap_unlock_write(vma->vm_file->f_mapping); maybe_install_pmd: @@ -1644,7 +1661,7 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff, * has higher cost too. It would also probably require locking * the anon_vma. */ - if (vma->anon_vma) { + if (READ_ONCE(vma->anon_vma)) { result = SCAN_PAGE_ANON; goto next; } @@ -1673,6 +1690,18 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff, if ((cc->is_khugepaged || is_target) && mmap_write_trylock(mm)) { /* + * Re-check whether we have an ->anon_vma, because + * collapse_and_free_pmd() requires that either no + * ->anon_vma exists or the anon_vma is locked. + * We already checked ->anon_vma above, but that check + * is racy because ->anon_vma can be populated under the + * mmap lock in read mode. + */ + if (vma->anon_vma) { + result = SCAN_PAGE_ANON; + goto unlock_next; + } + /* * When a vma is registered with uffd-wp, we can't * recycle the pmd pgtable because there can be pte * markers installed. Skip it only, so the rest mm/vma @@ -1921,7 +1950,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, goto out_unlock; } - if (folio_isolate_lru(folio)) { + if (!folio_isolate_lru(folio)) { result = SCAN_DEL_PAGE_LRU; goto out_unlock; } @@ -2593,6 +2622,7 @@ static int madvise_collapse_errno(enum scan_result r) case SCAN_CGROUP_CHARGE_FAIL: return -EBUSY; /* Resource temporary unavailable - trying again might succeed */ + case SCAN_PAGE_COUNT: case SCAN_PAGE_LOCK: case SCAN_PAGE_LRU: case SCAN_DEL_PAGE_LRU: @@ -2649,7 +2679,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, goto out_nolock; } - hend = vma->vm_end & HPAGE_PMD_MASK; + hend = min(hend, vma->vm_end & HPAGE_PMD_MASK); } mmap_assert_locked(mm); memset(cc->node_load, 0, sizeof(cc->node_load)); |