summaryrefslogtreecommitdiff
path: root/mm/huge_memory.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-05-26 12:32:41 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2022-05-26 12:32:41 -0700
commit98931dd95fd489fcbfa97da563505a6f071d7c77 (patch)
tree44683fc4a92efa614acdca2742a7ff19d26da1e3 /mm/huge_memory.c
parentdf202b452fe6c6d6f1351bad485e2367ef1e644e (diff)
parentf403f22f8ccb12860b2b62fec3173c6ccd45938b (diff)
Merge tag 'mm-stable-2022-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton: "Almost all of MM here. A few things are still getting finished off, reviewed, etc. - Yang Shi has improved the behaviour of khugepaged collapsing of readonly file-backed transparent hugepages. - Johannes Weiner has arranged for zswap memory use to be tracked and managed on a per-cgroup basis. - Munchun Song adds a /proc knob ("hugetlb_optimize_vmemmap") for runtime enablement of the recent huge page vmemmap optimization feature. - Baolin Wang contributes a series to fix some issues around hugetlb pagetable invalidation. - Zhenwei Pi has fixed some interactions between hwpoisoned pages and virtualization. - Tong Tiangen has enabled the use of the presently x86-only page_table_check debugging feature on arm64 and riscv. - David Vernet has done some fixup work on the memcg selftests. - Peter Xu has taught userfaultfd to handle write protection faults against shmem- and hugetlbfs-backed files. - More DAMON development from SeongJae Park - adding online tuning of the feature and support for monitoring of fixed virtual address ranges. Also easier discovery of which monitoring operations are available. - Nadav Amit has done some optimization of TLB flushing during mprotect(). - Neil Brown continues to labor away at improving our swap-over-NFS support. - David Hildenbrand has some fixes to anon page COWing versus get_user_pages(). - Peng Liu fixed some errors in the core hugetlb code. - Joao Martins has reduced the amount of memory consumed by device-dax's compound devmaps. - Some cleanups of the arch-specific pagemap code from Anshuman Khandual. - Muchun Song has found and fixed some errors in the TLB flushing of transparent hugepages. - Roman Gushchin has done more work on the memcg selftests. ... and, of course, many smaller fixes and cleanups. Notably, the customary million cleanup serieses from Miaohe Lin" * tag 'mm-stable-2022-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (381 commits) mm: kfence: use PAGE_ALIGNED helper selftests: vm: add the "settings" file with timeout variable selftests: vm: add "test_hmm.sh" to TEST_FILES selftests: vm: check numa_available() before operating "merge_across_nodes" in ksm_tests selftests: vm: add migration to the .gitignore selftests/vm/pkeys: fix typo in comment ksm: fix typo in comment selftests: vm: add process_mrelease tests Revert "mm/vmscan: never demote for memcg reclaim" mm/kfence: print disabling or re-enabling message include/trace/events/percpu.h: cleanup for "percpu: improve percpu_alloc_percpu event trace" include/trace/events/mmflags.h: cleanup for "tracing: incorrect gfp_t conversion" mm: fix a potential infinite loop in start_isolate_page_range() MAINTAINERS: add Muchun as co-maintainer for HugeTLB zram: fix Kconfig dependency warning mm/shmem: fix shmem folio swapoff hang cgroup: fix an error handling path in alloc_pagecache_max_30M() mm: damon: use HPAGE_PMD_SIZE tracing: incorrect isolate_mote_t cast in mm_vmscan_lru_isolate nodemask.h: fix compilation error with GCC12 ...
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r--mm/huge_memory.c177
1 files changed, 123 insertions, 54 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 910a138e9859..a77c78a2b6b5 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -39,6 +39,7 @@
#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
+#include "swap.h"
#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>
@@ -68,13 +69,6 @@ static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
unsigned long huge_zero_pfn __read_mostly = ~0UL;
-static inline bool file_thp_enabled(struct vm_area_struct *vma)
-{
- return transhuge_vma_enabled(vma, vma->vm_flags) && vma->vm_file &&
- !inode_is_open_for_write(vma->vm_file->f_inode) &&
- (vma->vm_flags & VM_EXEC);
-}
-
bool transparent_hugepage_active(struct vm_area_struct *vma)
{
/* The addr is used to check if the vma size fits */
@@ -86,8 +80,8 @@ bool transparent_hugepage_active(struct vm_area_struct *vma)
return __transparent_hugepage_enabled(vma);
if (vma_is_shmem(vma))
return shmem_huge_enabled(vma);
- if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS))
- return file_thp_enabled(vma);
+ if (transhuge_vma_enabled(vma, vma->vm_flags) && file_thp_enabled(vma))
+ return true;
return false;
}
@@ -647,7 +641,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
entry = mk_huge_pmd(page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- page_add_new_anon_rmap(page, vma, haddr, true);
+ page_add_new_anon_rmap(page, vma, haddr);
lru_cache_add_inactive_or_unevictable(page, vma);
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
@@ -725,15 +719,15 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
gfp_t gfp;
- struct page *page;
+ struct folio *folio;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
if (!transhuge_vma_suitable(vma, haddr))
return VM_FAULT_FALLBACK;
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
- if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
- return VM_FAULT_OOM;
+ khugepaged_enter(vma, vma->vm_flags);
+
if (!(vmf->flags & FAULT_FLAG_WRITE) &&
!mm_forbids_zeropage(vma->vm_mm) &&
transparent_hugepage_use_zero_page()) {
@@ -774,13 +768,12 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
return ret;
}
gfp = vma_thp_gfp_mask(vma);
- page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
- if (unlikely(!page)) {
+ folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
+ if (unlikely(!folio)) {
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
- prep_transhuge_page(page);
- return __do_huge_pmd_anonymous_page(vmf, page, gfp);
+ return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
}
static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -1054,7 +1047,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
swp_entry_t entry = pmd_to_swp_entry(pmd);
VM_BUG_ON(!is_pmd_migration_entry(pmd));
- if (is_writable_migration_entry(entry)) {
+ if (!is_readable_migration_entry(entry)) {
entry = make_readable_migration_entry(
swp_offset(entry));
pmd = swp_entry_to_pmd(entry);
@@ -1097,23 +1090,16 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
src_page = pmd_page(pmd);
VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
- /*
- * If this page is a potentially pinned page, split and retry the fault
- * with smaller page size. Normally this should not happen because the
- * userspace should use MADV_DONTFORK upon pinned regions. This is a
- * best effort that the pinned pages won't be replaced by another
- * random page during the coming copy-on-write.
- */
- if (unlikely(page_needs_cow_for_dma(src_vma, src_page))) {
+ get_page(src_page);
+ if (unlikely(page_try_dup_anon_rmap(src_page, true, src_vma))) {
+ /* Page maybe pinned: split and retry the fault on PTEs. */
+ put_page(src_page);
pte_free(dst_mm, pgtable);
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
__split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
return -EAGAIN;
}
-
- get_page(src_page);
- page_dup_rmap(src_page, true);
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
out_zero_page:
mm_inc_nr_ptes(dst_mm);
@@ -1217,14 +1203,10 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
/* No huge zero pud yet */
}
- /* Please refer to comments in copy_huge_pmd() */
- if (unlikely(page_needs_cow_for_dma(vma, pud_page(pud)))) {
- spin_unlock(src_ptl);
- spin_unlock(dst_ptl);
- __split_huge_pud(vma, src_pud, addr);
- return -EAGAIN;
- }
-
+ /*
+ * TODO: once we support anonymous pages, use page_try_dup_anon_rmap()
+ * and split if duplicating fails.
+ */
pudp_set_wrprotect(src_mm, addr, src_pud);
pud = pud_mkold(pud_wrprotect(pud));
set_pud_at(dst_mm, addr, dst_pud, pud);
@@ -1282,6 +1264,7 @@ unlock:
vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
{
+ const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
struct vm_area_struct *vma = vmf->vma;
struct page *page;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
@@ -1290,6 +1273,9 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
VM_BUG_ON_VMA(!vma->anon_vma, vma);
+ VM_BUG_ON(unshare && (vmf->flags & FAULT_FLAG_WRITE));
+ VM_BUG_ON(!unshare && !(vmf->flags & FAULT_FLAG_WRITE));
+
if (is_huge_zero_pmd(orig_pmd))
goto fallback;
@@ -1303,6 +1289,10 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
page = pmd_page(orig_pmd);
VM_BUG_ON_PAGE(!PageHead(page), page);
+ /* Early check when only holding the PT lock. */
+ if (PageAnonExclusive(page))
+ goto reuse;
+
if (!trylock_page(page)) {
get_page(page);
spin_unlock(vmf->ptl);
@@ -1317,8 +1307,14 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
put_page(page);
}
+ /* Recheck after temporarily dropping the PT lock. */
+ if (PageAnonExclusive(page)) {
+ unlock_page(page);
+ goto reuse;
+ }
+
/*
- * See do_wp_page(): we can only map the page writable if there are
+ * See do_wp_page(): we can only reuse the page exclusively if there are
* no additional references. Note that we always drain the LRU
* pagevecs immediately after adding a THP.
*/
@@ -1328,11 +1324,18 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
try_to_free_swap(page);
if (page_count(page) == 1) {
pmd_t entry;
+
+ page_move_anon_rmap(page, vma);
+ unlock_page(page);
+reuse:
+ if (unlikely(unshare)) {
+ spin_unlock(vmf->ptl);
+ return 0;
+ }
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
- unlock_page(page);
spin_unlock(vmf->ptl);
return VM_FAULT_WRITE;
}
@@ -1379,6 +1382,12 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
page = pmd_page(*pmd);
VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
+ if (!pmd_write(*pmd) && gup_must_unshare(flags, page))
+ return ERR_PTR(-EMLINK);
+
+ VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
+ !PageAnonExclusive(page), page);
+
if (!try_grab_page(page, flags))
return ERR_PTR(-ENOMEM);
@@ -1692,18 +1701,21 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
* or if prot_numa but THP migration is not supported
* - HPAGE_PMD_NR if protections changed and TLB flush necessary
*/
-int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, pgprot_t newprot, unsigned long cp_flags)
+int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ pmd_t *pmd, unsigned long addr, pgprot_t newprot,
+ unsigned long cp_flags)
{
struct mm_struct *mm = vma->vm_mm;
spinlock_t *ptl;
- pmd_t entry;
+ pmd_t oldpmd, entry;
bool preserve_write;
int ret;
bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
+ tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
+
if (prot_numa && !thp_migration_supported())
return 1;
@@ -1717,6 +1729,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
if (is_swap_pmd(*pmd)) {
swp_entry_t entry = pmd_to_swp_entry(*pmd);
+ struct page *page = pfn_swap_entry_to_page(entry);
VM_BUG_ON(!is_pmd_migration_entry(*pmd));
if (is_writable_migration_entry(entry)) {
@@ -1725,8 +1738,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
* A protection check is difficult so
* just be safe and disable write
*/
- entry = make_readable_migration_entry(
- swp_offset(entry));
+ if (PageAnon(page))
+ entry = make_readable_exclusive_migration_entry(swp_offset(entry));
+ else
+ entry = make_readable_migration_entry(swp_offset(entry));
newpmd = swp_entry_to_pmd(entry);
if (pmd_swp_soft_dirty(*pmd))
newpmd = pmd_swp_mksoft_dirty(newpmd);
@@ -1778,12 +1793,12 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
* The race makes MADV_DONTNEED miss the huge pmd and don't clear it
* which may break userspace.
*
- * pmdp_invalidate() is required to make sure we don't miss
+ * pmdp_invalidate_ad() is required to make sure we don't miss
* dirty/young flags set by hardware.
*/
- entry = pmdp_invalidate(vma, addr, pmd);
+ oldpmd = pmdp_invalidate_ad(vma, addr, pmd);
- entry = pmd_modify(entry, newprot);
+ entry = pmd_modify(oldpmd, newprot);
if (preserve_write)
entry = pmd_mk_savedwrite(entry);
if (uffd_wp) {
@@ -1799,6 +1814,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
}
ret = HPAGE_PMD_NR;
set_pmd_at(mm, addr, pmd, entry);
+
+ if (huge_pmd_needs_flush(oldpmd, entry))
+ tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);
+
BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
unlock:
spin_unlock(ptl);
@@ -1946,6 +1965,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
pgtable_t pgtable;
pmd_t old_pmd, _pmd;
bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
+ bool anon_exclusive = false;
unsigned long addr;
int i;
@@ -2027,6 +2047,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
entry = pmd_to_swp_entry(old_pmd);
page = pfn_swap_entry_to_page(entry);
write = is_writable_migration_entry(entry);
+ if (PageAnon(page))
+ anon_exclusive = is_readable_exclusive_migration_entry(entry);
young = false;
soft_dirty = pmd_swp_soft_dirty(old_pmd);
uffd_wp = pmd_swp_uffd_wp(old_pmd);
@@ -2038,8 +2060,26 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
young = pmd_young(old_pmd);
soft_dirty = pmd_soft_dirty(old_pmd);
uffd_wp = pmd_uffd_wp(old_pmd);
+
VM_BUG_ON_PAGE(!page_count(page), page);
page_ref_add(page, HPAGE_PMD_NR - 1);
+
+ /*
+ * Without "freeze", we'll simply split the PMD, propagating the
+ * PageAnonExclusive() flag for each PTE by setting it for
+ * each subpage -- no need to (temporarily) clear.
+ *
+ * With "freeze" we want to replace mapped pages by
+ * migration entries right away. This is only possible if we
+ * managed to clear PageAnonExclusive() -- see
+ * set_pmd_migration_entry().
+ *
+ * In case we cannot clear PageAnonExclusive(), split the PMD
+ * only and let try_to_migrate_one() fail later.
+ */
+ anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
+ if (freeze && anon_exclusive && page_try_share_anon_rmap(page))
+ freeze = false;
}
/*
@@ -2061,6 +2101,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
if (write)
swp_entry = make_writable_migration_entry(
page_to_pfn(page + i));
+ else if (anon_exclusive)
+ swp_entry = make_readable_exclusive_migration_entry(
+ page_to_pfn(page + i));
else
swp_entry = make_readable_migration_entry(
page_to_pfn(page + i));
@@ -2072,6 +2115,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
} else {
entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
entry = maybe_mkwrite(entry, vma);
+ if (anon_exclusive)
+ SetPageAnonExclusive(page + i);
if (!write)
entry = pte_wrprotect(entry);
if (!young)
@@ -2249,8 +2294,6 @@ static void unmap_page(struct page *page)
try_to_migrate(folio, ttu_flags);
else
try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);
-
- VM_WARN_ON_ONCE_PAGE(page_mapped(page), page);
}
static void remap_page(struct folio *folio, unsigned long nr)
@@ -2305,6 +2348,13 @@ static void __split_huge_page_tail(struct page *head, int tail,
*
* After successful get_page_unless_zero() might follow flags change,
* for example lock_page() which set PG_waiters.
+ *
+ * Note that for mapped sub-pages of an anonymous THP,
+ * PG_anon_exclusive has been cleared in unmap_page() and is stored in
+ * the migration entry instead from where remap_page() will restore it.
+ * We can still have PG_anon_exclusive set on effectively unmapped and
+ * unreferenced sub-pages of an anonymous THP: we can simply drop
+ * PG_anon_exclusive (-> PG_mappedtodisk) for these here.
*/
page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
page_tail->flags |= (head->flags &
@@ -3035,25 +3085,35 @@ late_initcall(split_huge_pages_debugfs);
#endif
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
-void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
+int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
struct page *page)
{
struct vm_area_struct *vma = pvmw->vma;
struct mm_struct *mm = vma->vm_mm;
unsigned long address = pvmw->address;
+ bool anon_exclusive;
pmd_t pmdval;
swp_entry_t entry;
pmd_t pmdswp;
if (!(pvmw->pmd && !pvmw->pte))
- return;
+ return 0;
flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
+
+ anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
+ if (anon_exclusive && page_try_share_anon_rmap(page)) {
+ set_pmd_at(mm, address, pvmw->pmd, pmdval);
+ return -EBUSY;
+ }
+
if (pmd_dirty(pmdval))
set_page_dirty(page);
if (pmd_write(pmdval))
entry = make_writable_migration_entry(page_to_pfn(page));
+ else if (anon_exclusive)
+ entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
else
entry = make_readable_migration_entry(page_to_pfn(page));
pmdswp = swp_entry_to_pmd(entry);
@@ -3063,6 +3123,8 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
page_remove_rmap(page, vma, true);
put_page(page);
trace_set_migration_pmd(address, pmd_val(pmdswp));
+
+ return 0;
}
void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
@@ -3087,10 +3149,17 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
if (pmd_swp_uffd_wp(*pvmw->pmd))
pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde));
- if (PageAnon(new))
- page_add_anon_rmap(new, vma, mmun_start, true);
- else
+ if (PageAnon(new)) {
+ rmap_t rmap_flags = RMAP_COMPOUND;
+
+ if (!is_readable_migration_entry(entry))
+ rmap_flags |= RMAP_EXCLUSIVE;
+
+ page_add_anon_rmap(new, vma, mmun_start, rmap_flags);
+ } else {
page_add_file_rmap(new, vma, true);
+ }
+ VM_BUG_ON(pmd_write(pmde) && PageAnon(new) && !PageAnonExclusive(new));
set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
/* No need to invalidate - it was non-present before */