summaryrefslogtreecommitdiff
path: root/mm/khugepaged.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/khugepaged.c')
-rw-r--r--mm/khugepaged.c86
1 files changed, 58 insertions, 28 deletions
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 5cb401aa2b9d..92e6f56a932d 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -490,32 +490,43 @@ void __khugepaged_exit(struct mm_struct *mm)
}
}
+static void release_pte_folio(struct folio *folio)
+{
+ node_stat_mod_folio(folio,
+ NR_ISOLATED_ANON + folio_is_file_lru(folio),
+ -folio_nr_pages(folio));
+ folio_unlock(folio);
+ folio_putback_lru(folio);
+}
+
static void release_pte_page(struct page *page)
{
- mod_node_page_state(page_pgdat(page),
- NR_ISOLATED_ANON + page_is_file_lru(page),
- -compound_nr(page));
- unlock_page(page);
- putback_lru_page(page);
+ release_pte_folio(page_folio(page));
}
static void release_pte_pages(pte_t *pte, pte_t *_pte,
struct list_head *compound_pagelist)
{
- struct page *page, *tmp;
+ struct folio *folio, *tmp;
while (--_pte >= pte) {
pte_t pteval = *_pte;
+ unsigned long pfn;
- page = pte_page(pteval);
- if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)) &&
- !PageCompound(page))
- release_pte_page(page);
+ if (pte_none(pteval))
+ continue;
+ pfn = pte_pfn(pteval);
+ if (is_zero_pfn(pfn))
+ continue;
+ folio = pfn_folio(pfn);
+ if (folio_test_large(folio))
+ continue;
+ release_pte_folio(folio);
}
- list_for_each_entry_safe(page, tmp, compound_pagelist, lru) {
- list_del(&page->lru);
- release_pte_page(page);
+ list_for_each_entry_safe(folio, tmp, compound_pagelist, lru) {
+ list_del(&folio->lru);
+ release_pte_folio(folio);
}
}
@@ -625,7 +636,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
* Isolate the page to avoid collapsing an hugepage
* currently in use by the VM.
*/
- if (isolate_lru_page(page)) {
+ if (!isolate_lru_page(page)) {
unlock_page(page);
result = SCAN_DEL_PAGE_LRU;
goto out;
@@ -847,6 +858,10 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
return SCAN_SUCCEED;
}
+/*
+ * See pmd_trans_unstable() for how the result may change out from
+ * underneath us, even if we hold mmap_lock in read.
+ */
static int find_pmd_or_thp_or_none(struct mm_struct *mm,
unsigned long address,
pmd_t **pmd)
@@ -865,8 +880,12 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm,
#endif
if (pmd_none(pmde))
return SCAN_PMD_NONE;
+ if (!pmd_present(pmde))
+ return SCAN_PMD_NULL;
if (pmd_trans_huge(pmde))
return SCAN_PMD_MAPPED;
+ if (pmd_devmap(pmde))
+ return SCAN_PMD_NULL;
if (pmd_bad(pmde))
return SCAN_PMD_NULL;
return SCAN_SUCCEED;
@@ -1032,8 +1051,8 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
anon_vma_lock_write(vma->anon_vma);
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
- address, address + HPAGE_PMD_SIZE);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
+ address + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
pte = pte_offset_map(pmd, address);
@@ -1404,7 +1423,7 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v
if (vma->anon_vma)
lockdep_assert_held_write(&vma->anon_vma->root->rwsem);
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, addr,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
addr + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
pmd = pmdp_collapse_flush(vma, addr, pmdp);
@@ -1460,14 +1479,6 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
return SCAN_VMA_CHECK;
- /*
- * Symmetry with retract_page_tables(): Exclude MAP_PRIVATE mappings
- * that got written to. Without this, we'd have to also lock the
- * anon_vma if one exists.
- */
- if (vma->anon_vma)
- return SCAN_VMA_CHECK;
-
/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
if (userfaultfd_wp(vma))
return SCAN_PTE_UFFD_WP;
@@ -1567,8 +1578,14 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
}
/* step 4: remove pte entries */
+ /* we make no change to anon, but protect concurrent anon page lookup */
+ if (vma->anon_vma)
+ anon_vma_lock_write(vma->anon_vma);
+
collapse_and_free_pmd(mm, vma, haddr, pmd);
+ if (vma->anon_vma)
+ anon_vma_unlock_write(vma->anon_vma);
i_mmap_unlock_write(vma->vm_file->f_mapping);
maybe_install_pmd:
@@ -1644,7 +1661,7 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
* has higher cost too. It would also probably require locking
* the anon_vma.
*/
- if (vma->anon_vma) {
+ if (READ_ONCE(vma->anon_vma)) {
result = SCAN_PAGE_ANON;
goto next;
}
@@ -1673,6 +1690,18 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
if ((cc->is_khugepaged || is_target) &&
mmap_write_trylock(mm)) {
/*
+ * Re-check whether we have an ->anon_vma, because
+ * collapse_and_free_pmd() requires that either no
+ * ->anon_vma exists or the anon_vma is locked.
+ * We already checked ->anon_vma above, but that check
+ * is racy because ->anon_vma can be populated under the
+ * mmap lock in read mode.
+ */
+ if (vma->anon_vma) {
+ result = SCAN_PAGE_ANON;
+ goto unlock_next;
+ }
+ /*
* When a vma is registered with uffd-wp, we can't
* recycle the pmd pgtable because there can be pte
* markers installed. Skip it only, so the rest mm/vma
@@ -1921,7 +1950,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
goto out_unlock;
}
- if (folio_isolate_lru(folio)) {
+ if (!folio_isolate_lru(folio)) {
result = SCAN_DEL_PAGE_LRU;
goto out_unlock;
}
@@ -2593,6 +2622,7 @@ static int madvise_collapse_errno(enum scan_result r)
case SCAN_CGROUP_CHARGE_FAIL:
return -EBUSY;
/* Resource temporary unavailable - trying again might succeed */
+ case SCAN_PAGE_COUNT:
case SCAN_PAGE_LOCK:
case SCAN_PAGE_LRU:
case SCAN_DEL_PAGE_LRU:
@@ -2649,7 +2679,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
goto out_nolock;
}
- hend = vma->vm_end & HPAGE_PMD_MASK;
+ hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
}
mmap_assert_locked(mm);
memset(cc->node_load, 0, sizeof(cc->node_load));