summary | refs | log | tree | commit | diff
path: root/mm
diff options
context:
space:
mode:
author: Stephen Rothwell <sfr@canb.auug.org.au> 2020-10-06 21:39:45 +1100
committer: Stephen Rothwell <sfr@canb.auug.org.au> 2020-10-06 21:39:46 +1100
commit: f6c24cb83df0bfb1b2dc8af44d94abfa564f0362 (patch)
tree: 3def6e52a2bd916d9988fdcb789e48613cea919f /mm
parent: 05be379ca95b0b5bfce9b4d55ff4b32b83acb1df (diff)
parent: 5e64912209ae1a12be8f8ee66ee4ccca8fa3f9cd (diff)
Merge branch 'akpm-current/current' into master
Diffstat (limited to 'mm')
-rw-r--r-- mm/Kconfig 6
-rw-r--r-- mm/Makefile 1
-rw-r--r-- mm/compaction.c 11
-rw-r--r-- mm/debug.c 18
-rw-r--r-- mm/debug_vm_pgtable.c 219
-rw-r--r-- mm/dmapool.c 46
-rw-r--r-- mm/fadvise.c 9
-rw-r--r-- mm/filemap.c 132
-rw-r--r-- mm/frame_vector.c 4
-rw-r--r-- mm/gup.c 105
-rw-r--r-- mm/gup_benchmark.c 23
-rw-r--r-- mm/highmem.c 4
-rw-r--r-- mm/huge_memory.c 38
-rw-r--r-- mm/hugetlb.c 100
-rw-r--r-- mm/hwpoison-inject.c 18
-rw-r--r-- mm/internal.h 30
-rw-r--r-- mm/kasan/report.c 34
-rw-r--r-- mm/khugepaged.c 15
-rw-r--r-- mm/kmemleak-test.c 99
-rw-r--r-- mm/kmemleak.c 8
-rw-r--r-- mm/madvise.c 73
-rw-r--r-- mm/memblock.c 98
-rw-r--r-- mm/memcontrol.c 255
-rw-r--r-- mm/memory-failure.c 316
-rw-r--r-- mm/memory.c 152
-rw-r--r-- mm/memory_hotplug.c 216
-rw-r--r-- mm/mempolicy.c 8
-rw-r--r-- mm/mempool.c 18
-rw-r--r-- mm/memremap.c 329
-rw-r--r-- mm/migrate.c 14
-rw-r--r-- mm/mincore.c 28
-rw-r--r-- mm/mmap.c 95
-rw-r--r-- mm/mmu_notifier.c 2
-rw-r--r-- mm/oom_kill.c 2
-rw-r--r-- mm/page-writeback.c 1
-rw-r--r-- mm/page_alloc.c 240
-rw-r--r-- mm/page_counter.c 2
-rw-r--r-- mm/page_io.c 14
-rw-r--r-- mm/page_isolation.c 50
-rw-r--r-- mm/page_owner.c 10
-rw-r--r-- mm/page_poison.c 20
-rw-r--r-- mm/page_reporting.c 4
-rw-r--r-- mm/readahead.c 130
-rw-r--r-- mm/rmap.c 10
-rw-r--r-- mm/shmem.c 21
-rw-r--r-- mm/shuffle.c 2
-rw-r--r-- mm/slab.c 4
-rw-r--r-- mm/slab.h 1
-rw-r--r-- mm/slub.c 32
-rw-r--r-- mm/sparse.c 12
-rw-r--r-- mm/swap.c 18
-rw-r--r-- mm/swap_slots.c 3
-rw-r--r-- mm/swap_state.c 38
-rw-r--r-- mm/swapfile.c 17
-rw-r--r-- mm/truncate.c 64
-rw-r--r-- mm/util.c 3
-rw-r--r-- mm/vmalloc.c 6
-rw-r--r-- mm/vmscan.c 23
-rw-r--r-- mm/vmstat.c 40
-rw-r--r-- mm/workingset.c 2
-rw-r--r-- mm/z3fold.c 3
-rw-r--r-- mm/zbud.c 1
62 files changed, 1635 insertions, 1662 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index d4fda3e3c692..c7f30f8b282b 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -152,6 +152,7 @@ config HAVE_BOOTMEM_INFO_NODE
# eventually, we can have this option just 'select SPARSEMEM'
config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
+ select MEMORY_ISOLATION
depends on SPARSEMEM || X86_64_ACPI_NUMA
depends on ARCH_ENABLE_MEMORY_HOTPLUG
depends on 64BIT || BROKEN
@@ -178,7 +179,6 @@ config MEMORY_HOTPLUG_DEFAULT_ONLINE
config MEMORY_HOTREMOVE
bool "Allow for memory hot remove"
- select MEMORY_ISOLATION
select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64)
depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
depends on MIGRATION
@@ -832,10 +832,10 @@ config PERCPU_STATS
be used to help understand percpu memory usage.
config GUP_BENCHMARK
- bool "Enable infrastructure for get_user_pages_fast() benchmarking"
+ bool "Enable infrastructure for get_user_pages() and related calls benchmarking"
help
Provides /sys/kernel/debug/gup_benchmark that helps with testing
- performance of get_user_pages_fast().
+ performance of get_user_pages() and related calls.
See tools/testing/selftests/vm/gup_benchmark.c
diff --git a/mm/Makefile b/mm/Makefile
index d5649f1c12c0..d73aed0fc99c 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -94,7 +94,6 @@ obj-$(CONFIG_GUP_BENCHMARK) += gup_benchmark.o
obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
-obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o
obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o
obj-$(CONFIG_PAGE_OWNER) += page_owner.o
diff --git a/mm/compaction.c b/mm/compaction.c
index 176dcded298e..6e0ee5641788 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -180,11 +180,10 @@ bool compaction_deferred(struct zone *zone, int order)
return false;
/* Avoid possible overflow */
- if (++zone->compact_considered > defer_limit)
+ if (++zone->compact_considered >= defer_limit) {
zone->compact_considered = defer_limit;
-
- if (zone->compact_considered >= defer_limit)
return false;
+ }
trace_mm_compaction_deferred(zone, order);
@@ -626,7 +625,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
}
/* Found a free page, will break it into order-0 pages */
- order = page_order(page);
+ order = buddy_order(page);
isolated = __isolate_free_page(page, order);
if (!isolated)
break;
@@ -899,7 +898,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* potential isolation targets.
*/
if (PageBuddy(page)) {
- unsigned long freepage_order = page_order_unsafe(page);
+ unsigned long freepage_order = buddy_order_unsafe(page);
/*
* Without lock, we cannot be sure that what we got is
@@ -1173,7 +1172,7 @@ static bool suitable_migration_target(struct compact_control *cc,
* the only small danger is that we skip a potentially suitable
* pageblock, so it's not worth to check order for valid range.
*/
- if (page_order_unsafe(page) >= pageblock_order)
+ if (buddy_order_unsafe(page) >= pageblock_order)
return false;
}
diff --git a/mm/debug.c b/mm/debug.c
index ca8d1cacdecc..ccca576b2899 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -102,12 +102,12 @@ void __dump_page(struct page *page, const char *reason)
if (hpage_pincount_available(page)) {
pr_warn("head:%p order:%u compound_mapcount:%d compound_pincount:%d\n",
head, compound_order(head),
- head_mapcount(head),
- head_pincount(head));
+ head_compound_mapcount(head),
+ head_compound_pincount(head));
} else {
pr_warn("head:%p order:%u compound_mapcount:%d\n",
head, compound_order(head),
- head_mapcount(head));
+ head_compound_mapcount(head));
}
}
if (PageKsm(page))
@@ -120,6 +120,7 @@ void __dump_page(struct page *page, const char *reason)
struct hlist_node *dentry_first;
struct dentry *dentry_ptr;
struct dentry dentry;
+ unsigned long ino;
/*
* mapping can be invalid pointer and we don't want to crash
@@ -136,21 +137,22 @@ void __dump_page(struct page *page, const char *reason)
goto out_mapping;
}
- if (get_kernel_nofault(dentry_first, &host->i_dentry.first)) {
+ if (get_kernel_nofault(dentry_first, &host->i_dentry.first) ||
+ get_kernel_nofault(ino, &host->i_ino)) {
pr_warn("aops:%ps with invalid host inode %px\n",
a_ops, host);
goto out_mapping;
}
if (!dentry_first) {
- pr_warn("aops:%ps ino:%lx\n", a_ops, host->i_ino);
+ pr_warn("aops:%ps ino:%lx\n", a_ops, ino);
goto out_mapping;
}
dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias);
if (get_kernel_nofault(dentry, dentry_ptr)) {
- pr_warn("aops:%ps with invalid dentry %px\n", a_ops,
- dentry_ptr);
+ pr_warn("aops:%ps ino:%lx with invalid dentry %px\n",
+ a_ops, ino, dentry_ptr);
} else {
/*
* if dentry is corrupted, the %pd handler may still
@@ -158,7 +160,7 @@ void __dump_page(struct page *page, const char *reason)
* corrupted struct page
*/
pr_warn("aops:%ps ino:%lx dentry name:\"%pd\"\n",
- a_ops, host->i_ino, &dentry);
+ a_ops, ino, &dentry);
}
}
out_mapping:
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 086309fb9b6f..c5ae822cc6bc 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -28,6 +28,7 @@
#include <linux/swapops.h>
#include <linux/start_kernel.h>
#include <linux/sched/mm.h>
+#include <linux/io.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
@@ -44,10 +45,17 @@
* entry type. But these bits might affect the ability to clear entries with
* pxx_clear() because of how dynamic page table folding works on s390. So
* while loading up the entries do not change the lower 4 bits. It does not
- * have affect any other platform.
+ * have affect any other platform. Also avoid the 62nd bit on ppc64 that is
+ * used to mark a pte entry.
*/
-#define S390_MASK_BITS 4
-#define RANDOM_ORVALUE GENMASK(BITS_PER_LONG - 1, S390_MASK_BITS)
+#define S390_SKIP_MASK GENMASK(3, 0)
+#if __BITS_PER_LONG == 64
+#define PPC64_SKIP_MASK GENMASK(62, 62)
+#else
+#define PPC64_SKIP_MASK 0x0
+#endif
+#define ARCH_SKIP_MASK (S390_SKIP_MASK | PPC64_SKIP_MASK)
+#define RANDOM_ORVALUE (GENMASK(BITS_PER_LONG - 1, 0) & ~ARCH_SKIP_MASK)
#define RANDOM_NZVALUE GENMASK(7, 0)
static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot)
@@ -71,15 +79,18 @@ static void __init pte_advanced_tests(struct mm_struct *mm,
{
pte_t pte = pfn_pte(pfn, prot);
+ /*
+ * Architectures optimize set_pte_at by avoiding TLB flush.
+ * This requires set_pte_at to be not used to update an
+ * existing pte entry. Clear pte before we do set_pte_at
+ */
+
pr_debug("Validating PTE advanced\n");
pte = pfn_pte(pfn, prot);
set_pte_at(mm, vaddr, ptep, pte);
ptep_set_wrprotect(mm, vaddr, ptep);
pte = ptep_get(ptep);
WARN_ON(pte_write(pte));
-
- pte = pfn_pte(pfn, prot);
- set_pte_at(mm, vaddr, ptep, pte);
ptep_get_and_clear(mm, vaddr, ptep);
pte = ptep_get(ptep);
WARN_ON(!pte_none(pte));
@@ -93,13 +104,11 @@ static void __init pte_advanced_tests(struct mm_struct *mm,
ptep_set_access_flags(vma, vaddr, ptep, pte, 1);
pte = ptep_get(ptep);
WARN_ON(!(pte_write(pte) && pte_dirty(pte)));
-
- pte = pfn_pte(pfn, prot);
- set_pte_at(mm, vaddr, ptep, pte);
ptep_get_and_clear_full(mm, vaddr, ptep, 1);
pte = ptep_get(ptep);
WARN_ON(!pte_none(pte));
+ pte = pfn_pte(pfn, prot);
pte = pte_mkyoung(pte);
set_pte_at(mm, vaddr, ptep, pte);
ptep_test_and_clear_young(vma, vaddr, ptep);
@@ -111,10 +120,14 @@ static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot)
{
pte_t pte = pfn_pte(pfn, prot);
+ if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
+ return;
+
pr_debug("Validating PTE saved write\n");
WARN_ON(!pte_savedwrite(pte_mk_savedwrite(pte_clear_savedwrite(pte))));
WARN_ON(pte_savedwrite(pte_clear_savedwrite(pte_mk_savedwrite(pte))));
}
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
{
@@ -141,9 +154,9 @@ static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
static void __init pmd_advanced_tests(struct mm_struct *mm,
struct vm_area_struct *vma, pmd_t *pmdp,
unsigned long pfn, unsigned long vaddr,
- pgprot_t prot)
+ pgprot_t prot, pgtable_t pgtable)
{
- pmd_t pmd = pfn_pmd(pfn, prot);
+ pmd_t pmd;
if (!has_transparent_hugepage())
return;
@@ -152,19 +165,18 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
/* Align the address wrt HPAGE_PMD_SIZE */
vaddr = (vaddr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE;
- pmd = pfn_pmd(pfn, prot);
+ pgtable_trans_huge_deposit(mm, pmdp, pgtable);
+
+ pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
set_pmd_at(mm, vaddr, pmdp, pmd);
pmdp_set_wrprotect(mm, vaddr, pmdp);
pmd = READ_ONCE(*pmdp);
WARN_ON(pmd_write(pmd));
-
- pmd = pfn_pmd(pfn, prot);
- set_pmd_at(mm, vaddr, pmdp, pmd);
pmdp_huge_get_and_clear(mm, vaddr, pmdp);
pmd = READ_ONCE(*pmdp);
WARN_ON(!pmd_none(pmd));
- pmd = pfn_pmd(pfn, prot);
+ pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
pmd = pmd_wrprotect(pmd);
pmd = pmd_mkclean(pmd);
set_pmd_at(mm, vaddr, pmdp, pmd);
@@ -173,18 +185,20 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
pmdp_set_access_flags(vma, vaddr, pmdp, pmd, 1);
pmd = READ_ONCE(*pmdp);
WARN_ON(!(pmd_write(pmd) && pmd_dirty(pmd)));
-
- pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
- set_pmd_at(mm, vaddr, pmdp, pmd);
pmdp_huge_get_and_clear_full(vma, vaddr, pmdp, 1);
pmd = READ_ONCE(*pmdp);
WARN_ON(!pmd_none(pmd));
+ pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
pmd = pmd_mkyoung(pmd);
set_pmd_at(mm, vaddr, pmdp, pmd);
pmdp_test_and_clear_young(vma, vaddr, pmdp);
pmd = READ_ONCE(*pmdp);
WARN_ON(pmd_young(pmd));
+
+ /* Clear the pte entries */
+ pmdp_huge_get_and_clear(mm, vaddr, pmdp);
+ pgtable = pgtable_trans_huge_withdraw(mm, pmdp);
}
static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
@@ -199,11 +213,12 @@ static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(!pmd_leaf(pmd));
}
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
{
pmd_t pmd;
- if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+ if (!arch_ioremap_pmd_supported())
return;
pr_debug("Validating PMD huge\n");
@@ -217,10 +232,16 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
pmd = READ_ONCE(*pmdp);
WARN_ON(!pmd_none(pmd));
}
+#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
+static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
{
- pmd_t pmd = pfn_pmd(pfn, prot);
+ pmd_t pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
+
+ if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
+ return;
pr_debug("Validating PMD saved write\n");
WARN_ON(!pmd_savedwrite(pmd_mk_savedwrite(pmd_clear_savedwrite(pmd))));
@@ -257,7 +278,7 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
unsigned long pfn, unsigned long vaddr,
pgprot_t prot)
{
- pud_t pud = pfn_pud(pfn, prot);
+ pud_t pud;
if (!has_transparent_hugepage())
return;
@@ -266,25 +287,19 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
/* Align the address wrt HPAGE_PUD_SIZE */
vaddr = (vaddr & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE;
+ pud = pud_mkhuge(pfn_pud(pfn, prot));
set_pud_at(mm, vaddr, pudp, pud);
pudp_set_wrprotect(mm, vaddr, pudp);
pud = READ_ONCE(*pudp);
WARN_ON(pud_write(pud));
#ifndef __PAGETABLE_PMD_FOLDED
- pud = pfn_pud(pfn, prot);
- set_pud_at(mm, vaddr, pudp, pud);
pudp_huge_get_and_clear(mm, vaddr, pudp);
pud = READ_ONCE(*pudp);
WARN_ON(!pud_none(pud));
-
- pud = pfn_pud(pfn, prot);
- set_pud_at(mm, vaddr, pudp, pud);
- pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1);
- pud = READ_ONCE(*pudp);
- WARN_ON(!pud_none(pud));
#endif /* __PAGETABLE_PMD_FOLDED */
- pud = pfn_pud(pfn, prot);
+
+ pud = pud_mkhuge(pfn_pud(pfn, prot));
pud = pud_wrprotect(pud);
pud = pud_mkclean(pud);
set_pud_at(mm, vaddr, pudp, pud);
@@ -294,11 +309,20 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
pud = READ_ONCE(*pudp);
WARN_ON(!(pud_write(pud) && pud_dirty(pud)));
+#ifndef __PAGETABLE_PMD_FOLDED
+ pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1);
+ pud = READ_ONCE(*pudp);
+ WARN_ON(!pud_none(pud));
+#endif /* __PAGETABLE_PMD_FOLDED */
+
+ pud = pud_mkhuge(pfn_pud(pfn, prot));
pud = pud_mkyoung(pud);
set_pud_at(mm, vaddr, pudp, pud);
pudp_test_and_clear_young(vma, vaddr, pudp);
pud = READ_ONCE(*pudp);
WARN_ON(pud_young(pud));
+
+ pudp_huge_get_and_clear(mm, vaddr, pudp);
}
static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
@@ -313,11 +337,12 @@ static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(!pud_leaf(pud));
}
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
{
pud_t pud;
- if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+ if (!arch_ioremap_pud_supported())
return;
pr_debug("Validating PUD huge\n");
@@ -331,6 +356,10 @@ static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
pud = READ_ONCE(*pudp);
WARN_ON(!pud_none(pud));
}
+#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
+static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) { }
+#endif /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
+
#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { }
static void __init pud_advanced_tests(struct mm_struct *mm,
@@ -350,7 +379,7 @@ static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { }
static void __init pmd_advanced_tests(struct mm_struct *mm,
struct vm_area_struct *vma, pmd_t *pmdp,
unsigned long pfn, unsigned long vaddr,
- pgprot_t prot)
+ pgprot_t prot, pgtable_t pgtable)
{
}
static void __init pud_advanced_tests(struct mm_struct *mm,
@@ -417,8 +446,6 @@ static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp,
* This entry points to next level page table page.
* Hence this must not qualify as pud_bad().
*/
- pmd_clear(pmdp);
- pud_clear(pudp);
pud_populate(mm, pudp, pmdp);
pud = READ_ONCE(*pudp);
WARN_ON(pud_bad(pud));
@@ -515,9 +542,10 @@ static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp,
#endif /* PAGETABLE_P4D_FOLDED */
static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep,
- unsigned long vaddr)
+ unsigned long pfn, unsigned long vaddr,
+ pgprot_t prot)
{
- pte_t pte = ptep_get(ptep);
+ pte_t pte = pfn_pte(pfn, prot);
pr_debug("Validating PTE clear\n");
pte = __pte(pte_val(pte) | RANDOM_ORVALUE);
@@ -550,7 +578,6 @@ static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp,
* This entry points to next level page table page.
* Hence this must not qualify as pmd_bad().
*/
- pmd_clear(pmdp);
pmd_populate(mm, pmdp, pgtable);
pmd = READ_ONCE(*pmdp);
WARN_ON(pmd_bad(pmd));
@@ -784,57 +811,8 @@ static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(!pte_huge(pte_mkhuge(pte)));
#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
}
-
-static void __init hugetlb_advanced_tests(struct mm_struct *mm,
- struct vm_area_struct *vma,
- pte_t *ptep, unsigned long pfn,
- unsigned long vaddr, pgprot_t prot)
-{
- struct page *page = pfn_to_page(pfn);
- pte_t pte = ptep_get(ptep);
- unsigned long paddr = __pfn_to_phys(pfn) & PMD_MASK;
-
- pr_debug("Validating HugeTLB advanced\n");
- pte = pte_mkhuge(mk_pte(pfn_to_page(PHYS_PFN(paddr)), prot));
- set_huge_pte_at(mm, vaddr, ptep, pte);
- barrier();
- WARN_ON(!pte_same(pte, huge_ptep_get(ptep)));
- huge_pte_clear(mm, vaddr, ptep, PMD_SIZE);
- pte = huge_ptep_get(ptep);
- WARN_ON(!huge_pte_none(pte));
-
- pte = mk_huge_pte(page, prot);
- set_huge_pte_at(mm, vaddr, ptep, pte);
- barrier();
- huge_ptep_set_wrprotect(mm, vaddr, ptep);
- pte = huge_ptep_get(ptep);
- WARN_ON(huge_pte_write(pte));
-
- pte = mk_huge_pte(page, prot);
- set_huge_pte_at(mm, vaddr, ptep, pte);
- barrier();
- huge_ptep_get_and_clear(mm, vaddr, ptep);
- pte = huge_ptep_get(ptep);
- WARN_ON(!huge_pte_none(pte));
-
- pte = mk_huge_pte(page, prot);
- pte = huge_pte_wrprotect(pte);
- set_huge_pte_at(mm, vaddr, ptep, pte);
- barrier();
- pte = huge_pte_mkwrite(pte);
- pte = huge_pte_mkdirty(pte);
- huge_ptep_set_access_flags(vma, vaddr, ptep, pte, 1);
- pte = huge_ptep_get(ptep);
- WARN_ON(!(huge_pte_write(pte) && huge_pte_dirty(pte)));
-}
#else /* !CONFIG_HUGETLB_PAGE */
static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init hugetlb_advanced_tests(struct mm_struct *mm,
- struct vm_area_struct *vma,
- pte_t *ptep, unsigned long pfn,
- unsigned long vaddr, pgprot_t prot)
-{
-}
#endif /* CONFIG_HUGETLB_PAGE */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -965,7 +943,13 @@ static int __init debug_vm_pgtable(void)
p4dp = p4d_alloc(mm, pgdp, vaddr);
pudp = pud_alloc(mm, p4dp, vaddr);
pmdp = pmd_alloc(mm, pudp, vaddr);
- ptep = pte_alloc_map_lock(mm, pmdp, vaddr, &ptl);
+ /*
+ * Allocate pgtable_t
+ */
+ if (pte_alloc(mm, pmdp)) {
+ pr_err("pgtable allocation failed\n");
+ return 1;
+ }
/*
* Save all the page table page addresses as the page table
@@ -985,32 +969,11 @@ static int __init debug_vm_pgtable(void)
p4d_basic_tests(p4d_aligned, prot);
pgd_basic_tests(pgd_aligned, prot);
- pte_clear_tests(mm, ptep, vaddr);
- pmd_clear_tests(mm, pmdp);
- pud_clear_tests(mm, pudp);
- p4d_clear_tests(mm, p4dp);
- pgd_clear_tests(mm, pgdp);
-
- pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
- pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot);
- pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot);
- hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
-
pmd_leaf_tests(pmd_aligned, prot);
pud_leaf_tests(pud_aligned, prot);
- pmd_huge_tests(pmdp, pmd_aligned, prot);
- pud_huge_tests(pudp, pud_aligned, prot);
-
- pte_savedwrite_tests(pte_aligned, prot);
- pmd_savedwrite_tests(pmd_aligned, prot);
-
- pte_unmap_unlock(ptep, ptl);
-
- pmd_populate_tests(mm, pmdp, saved_ptep);
- pud_populate_tests(mm, pudp, saved_pmdp);
- p4d_populate_tests(mm, p4dp, saved_pudp);
- pgd_populate_tests(mm, pgdp, saved_p4dp);
+ pte_savedwrite_tests(pte_aligned, protnone);
+ pmd_savedwrite_tests(pmd_aligned, protnone);
pte_special_tests(pte_aligned, prot);
pte_protnone_tests(pte_aligned, protnone);
@@ -1029,11 +992,43 @@ static int __init debug_vm_pgtable(void)
pmd_swap_tests(pmd_aligned, prot);
swap_migration_tests();
- hugetlb_basic_tests(pte_aligned, prot);
pmd_thp_tests(pmd_aligned, prot);
pud_thp_tests(pud_aligned, prot);
+ hugetlb_basic_tests(pte_aligned, prot);
+
+ /*
+ * Page table modifying tests. They need to hold
+ * proper page table lock.
+ */
+
+ ptep = pte_offset_map_lock(mm, pmdp, vaddr, &ptl);
+ pte_clear_tests(mm, ptep, pte_aligned, vaddr, prot);
+ pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
+ pte_unmap_unlock(ptep, ptl);
+
+ ptl = pmd_lock(mm, pmdp);
+ pmd_clear_tests(mm, pmdp);
+ pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot, saved_ptep);
+ pmd_huge_tests(pmdp, pmd_aligned, prot);
+ pmd_populate_tests(mm, pmdp, saved_ptep);
+ spin_unlock(ptl);
+
+ ptl = pud_lock(mm, pudp);
+ pud_clear_tests(mm, pudp);
+ pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot);
+ pud_huge_tests(pudp, pud_aligned, prot);
+ pud_populate_tests(mm, pudp, saved_pmdp);
+ spin_unlock(ptl);
+
+ spin_lock(&mm->page_table_lock);
+ p4d_clear_tests(mm, p4dp);
+ pgd_clear_tests(mm, pgdp);
+ p4d_populate_tests(mm, p4dp, saved_pudp);
+ pgd_populate_tests(mm, pgdp, saved_p4dp);
+ spin_unlock(&mm->page_table_lock);
+
p4d_free(mm, saved_p4dp);
pud_free(mm, saved_pudp);
pmd_free(mm, saved_pmdp);
diff --git a/mm/dmapool.c b/mm/dmapool.c
index f9fb9bbd733e..a97c97232337 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -266,6 +266,7 @@ static void pool_free_page(struct dma_pool *pool, struct dma_page *page)
*/
void dma_pool_destroy(struct dma_pool *pool)
{
+ struct dma_page *page, *tmp;
bool empty = false;
if (unlikely(!pool))
@@ -281,17 +282,13 @@ void dma_pool_destroy(struct dma_pool *pool)
device_remove_file(pool->dev, &dev_attr_pools);
mutex_unlock(&pools_reg_lock);
- while (!list_empty(&pool->page_list)) {
- struct dma_page *page;
- page = list_entry(pool->page_list.next,
- struct dma_page, page_list);
+ list_for_each_entry_safe(page, tmp, &pool->page_list, page_list) {
if (is_page_busy(page)) {
if (pool->dev)
- dev_err(pool->dev,
- "dma_pool_destroy %s, %p busy\n",
+ dev_err(pool->dev, "%s %s, %p busy\n", __func__,
pool->name, page->vaddr);
else
- pr_err("dma_pool_destroy %s, %p busy\n",
+ pr_err("%s %s, %p busy\n", __func__,
pool->name, page->vaddr);
/* leak the still-in-use consistent memory */
list_del(&page->page_list);
@@ -355,12 +352,11 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
if (data[i] == POOL_POISON_FREED)
continue;
if (pool->dev)
- dev_err(pool->dev,
- "dma_pool_alloc %s, %p (corrupted)\n",
- pool->name, retval);
+ dev_err(pool->dev, "%s %s, %p (corrupted)\n",
+ __func__, pool->name, retval);
else
- pr_err("dma_pool_alloc %s, %p (corrupted)\n",
- pool->name, retval);
+ pr_err("%s %s, %p (corrupted)\n",
+ __func__, pool->name, retval);
/*
* Dump the first 4 bytes even if they are not
@@ -416,12 +412,11 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
if (!page) {
spin_unlock_irqrestore(&pool->lock, flags);
if (pool->dev)
- dev_err(pool->dev,
- "dma_pool_free %s, %p/%lx (bad dma)\n",
- pool->name, vaddr, (unsigned long)dma);
+ dev_err(pool->dev, "%s %s, %p/%pad (bad dma)\n",
+ __func__, pool->name, vaddr, &dma);
else
- pr_err("dma_pool_free %s, %p/%lx (bad dma)\n",
- pool->name, vaddr, (unsigned long)dma);
+ pr_err("%s %s, %p/%pad (bad dma)\n",
+ __func__, pool->name, vaddr, &dma);
return;
}
@@ -432,12 +427,11 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
if ((dma - page->dma) != offset) {
spin_unlock_irqrestore(&pool->lock, flags);
if (pool->dev)
- dev_err(pool->dev,
- "dma_pool_free %s, %p (bad vaddr)/%pad\n",
- pool->name, vaddr, &dma);
+ dev_err(pool->dev, "%s %s, %p (bad vaddr)/%pad\n",
+ __func__, pool->name, vaddr, &dma);
else
- pr_err("dma_pool_free %s, %p (bad vaddr)/%pad\n",
- pool->name, vaddr, &dma);
+ pr_err("%s %s, %p (bad vaddr)/%pad\n",
+ __func__, pool->name, vaddr, &dma);
return;
}
{
@@ -449,11 +443,11 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
}
spin_unlock_irqrestore(&pool->lock, flags);
if (pool->dev)
- dev_err(pool->dev, "dma_pool_free %s, dma %pad already free\n",
- pool->name, &dma);
+ dev_err(pool->dev, "%s %s, dma %pad already free\n",
+ __func__, pool->name, &dma);
else
- pr_err("dma_pool_free %s, dma %pad already free\n",
- pool->name, &dma);
+ pr_err("%s %s, dma %pad already free\n",
+ __func__, pool->name, &dma);
return;
}
}
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 0e66f2aaeea3..d6baa4f451c5 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -141,7 +141,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
}
if (end_index >= start_index) {
- unsigned long count;
+ unsigned long nr_pagevec = 0;
/*
* It's common to FADV_DONTNEED right after
@@ -154,8 +154,9 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
*/
lru_add_drain();
- count = invalidate_mapping_pages(mapping,
- start_index, end_index);
+ invalidate_mapping_pagevec(mapping,
+ start_index, end_index,
+ &nr_pagevec);
/*
* If fewer pages were invalidated than expected then
@@ -163,7 +164,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
* a per-cpu pagevec for a remote CPU. Drain all
* pagevecs and try again.
*/
- if (count < (end_index - start_index + 1)) {
+ if (nr_pagevec) {
lru_add_drain_all();
invalidate_mapping_pages(mapping, start_index,
end_index);
diff --git a/mm/filemap.c b/mm/filemap.c
index 200947ed2990..1a6beaf69f49 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -249,7 +249,7 @@ static void page_cache_free_page(struct address_space *mapping,
freepage(page);
if (PageTransHuge(page) && !PageHuge(page)) {
- page_ref_sub(page, HPAGE_PMD_NR);
+ page_ref_sub(page, thp_nr_pages(page));
VM_BUG_ON_PAGE(page_count(page) <= 0, page);
} else {
put_page(page);
@@ -829,13 +829,12 @@ EXPORT_SYMBOL_GPL(replace_page_cache_page);
noinline int __add_to_page_cache_locked(struct page *page,
struct address_space *mapping,
- pgoff_t offset, gfp_t gfp_mask,
+ pgoff_t offset, gfp_t gfp,
void **shadowp)
{
XA_STATE(xas, &mapping->i_pages, offset);
int huge = PageHuge(page);
int error;
- void *old;
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageSwapBacked(page), page);
@@ -846,25 +845,46 @@ noinline int __add_to_page_cache_locked(struct page *page,
page->index = offset;
if (!huge) {
- error = mem_cgroup_charge(page, current->mm, gfp_mask);
+ error = mem_cgroup_charge(page, current->mm, gfp);
if (error)
goto error;
}
+ gfp &= GFP_RECLAIM_MASK;
+
do {
+ unsigned int order = xa_get_order(xas.xa, xas.xa_index);
+ void *entry, *old = NULL;
+
+ if (order > thp_order(page))
+ xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
+ order, gfp);
xas_lock_irq(&xas);
- old = xas_load(&xas);
- if (old && !xa_is_value(old))
- xas_set_err(&xas, -EEXIST);
+ xas_for_each_conflict(&xas, entry) {
+ old = entry;
+ if (!xa_is_value(entry)) {
+ xas_set_err(&xas, -EEXIST);
+ goto unlock;
+ }
+ }
+
+ if (old) {
+ if (shadowp)
+ *shadowp = old;
+ /* entry may have been split before we acquired lock */
+ order = xa_get_order(xas.xa, xas.xa_index);
+ if (order > thp_order(page)) {
+ xas_split(&xas, old, order);
+ xas_reset(&xas);
+ }
+ }
+
xas_store(&xas, page);
if (xas_error(&xas))
goto unlock;
- if (xa_is_value(old)) {
+ if (old)
mapping->nrexceptional--;
- if (shadowp)
- *shadowp = old;
- }
mapping->nrpages++;
/* hugetlb pages do not participate in page cache accounting */
@@ -872,7 +892,7 @@ noinline int __add_to_page_cache_locked(struct page *page,
__inc_lruvec_page_state(page, NR_FILE_PAGES);
unlock:
xas_unlock_irq(&xas);
- } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK));
+ } while (xas_nomem(&xas, gfp));
if (xas_error(&xas)) {
error = xas_error(&xas);
@@ -1425,7 +1445,7 @@ static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem
* unlock_page - unlock a locked page
* @page: the page
*
- * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
+ * Unlocks the page and wakes up sleepers in wait_on_page_locked().
* Also wakes sleepers in wait_on_page_writeback() because the wakeup
* mechanism between PageLocked pages and PageWriteback pages is shared.
* But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
@@ -1645,19 +1665,19 @@ EXPORT_SYMBOL(page_cache_prev_miss);
/**
* find_get_entry - find and get a page cache entry
* @mapping: the address_space to search
- * @offset: the page cache index
+ * @index: The page cache index.
*
* Looks up the page cache slot at @mapping & @offset. If there is a
- * page cache page, it is returned with an increased refcount.
+ * page cache page, the head page is returned with an increased refcount.
*
* If the slot holds a shadow entry of a previously evicted page, or a
* swap entry from shmem/tmpfs, it is returned.
*
- * Return: the found page or shadow entry, %NULL if nothing is found.
+ * Return: The head page or shadow entry, %NULL if nothing is found.
*/
-struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
+struct page *find_get_entry(struct address_space *mapping, pgoff_t index)
{
- XA_STATE(xas, &mapping->i_pages, offset);
+ XA_STATE(xas, &mapping->i_pages, index);
struct page *page;
rcu_read_lock();
@@ -1685,7 +1705,6 @@ repeat:
put_page(page);
goto repeat;
}
- page = find_subpage(page, offset);
out:
rcu_read_unlock();
@@ -1693,40 +1712,37 @@ out:
}
/**
- * find_lock_entry - locate, pin and lock a page cache entry
- * @mapping: the address_space to search
- * @offset: the page cache index
+ * find_lock_entry - Locate and lock a page cache entry.
+ * @mapping: The address_space to search.
+ * @index: The page cache index.
*
- * Looks up the page cache slot at @mapping & @offset. If there is a
- * page cache page, it is returned locked and with an increased
- * refcount.
+ * Looks up the page at @mapping & @index. If there is a page in the
+ * cache, the head page is returned locked and with an increased refcount.
*
* If the slot holds a shadow entry of a previously evicted page, or a
* swap entry from shmem/tmpfs, it is returned.
*
- * find_lock_entry() may sleep.
- *
- * Return: the found page or shadow entry, %NULL if nothing is found.
+ * Context: May sleep.
+ * Return: The head page or shadow entry, %NULL if nothing is found.
*/
-struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
+struct page *find_lock_entry(struct address_space *mapping, pgoff_t index)
{
struct page *page;
repeat:
- page = find_get_entry(mapping, offset);
+ page = find_get_entry(mapping, index);
if (page && !xa_is_value(page)) {
lock_page(page);
/* Has the page been truncated? */
- if (unlikely(page_mapping(page) != mapping)) {
+ if (unlikely(page->mapping != mapping)) {
unlock_page(page);
put_page(page);
goto repeat;
}
- VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
+ VM_BUG_ON_PAGE(!thp_contains(page, index), page);
}
return page;
}
-EXPORT_SYMBOL(find_lock_entry);
/**
* pagecache_get_page - Find and get a reference to a page.
@@ -1741,6 +1757,8 @@ EXPORT_SYMBOL(find_lock_entry);
*
* * %FGP_ACCESSED - The page will be marked accessed.
* * %FGP_LOCK - The page is returned locked.
+ * * %FGP_HEAD - If the page is present and a THP, return the head page
+ * rather than the exact page specified by the index.
* * %FGP_CREAT - If no page is present then a new page is allocated using
* @gfp_mask and added to the page cache and the VM's LRU list.
* The page is returned locked and with an increased refcount.
@@ -1781,12 +1799,12 @@ repeat:
}
/* Has the page been truncated? */
- if (unlikely(compound_head(page)->mapping != mapping)) {
+ if (unlikely(page->mapping != mapping)) {
unlock_page(page);
put_page(page);
goto repeat;
}
- VM_BUG_ON_PAGE(page->index != index, page);
+ VM_BUG_ON_PAGE(!thp_contains(page, index), page);
}
if (fgp_flags & FGP_ACCESSED)
@@ -1796,6 +1814,8 @@ repeat:
if (page_is_idle(page))
clear_page_idle(page);
}
+ if (!(fgp_flags & FGP_HEAD))
+ page = find_subpage(page, index);
no_page:
if (!page && (fgp_flags & FGP_CREAT)) {
@@ -2568,8 +2588,8 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
struct file *file = vmf->vma->vm_file;
struct file_ra_state *ra = &file->f_ra;
struct address_space *mapping = file->f_mapping;
+ DEFINE_READAHEAD(ractl, file, mapping, vmf->pgoff);
struct file *fpin = NULL;
- pgoff_t offset = vmf->pgoff;
unsigned int mmap_miss;
/* If we don't want any read-ahead, don't bother */
@@ -2580,8 +2600,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
if (vmf->vma->vm_flags & VM_SEQ_READ) {
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- page_cache_sync_readahead(mapping, ra, file, offset,
- ra->ra_pages);
+ page_cache_sync_ra(&ractl, ra, ra->ra_pages);
return fpin;
}
@@ -2601,10 +2620,11 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
* mmap read-around
*/
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
+ ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
ra->size = ra->ra_pages;
ra->async_size = ra->ra_pages / 4;
- ra_submit(ra, mapping, file);
+ ractl._index = ra->start;
+ do_page_cache_ra(&ractl, ra->size, ra->async_size);
return fpin;
}
@@ -2793,42 +2813,42 @@ void filemap_map_pages(struct vm_fault *vmf,
pgoff_t last_pgoff = start_pgoff;
unsigned long max_idx;
XA_STATE(xas, &mapping->i_pages, start_pgoff);
- struct page *page;
+ struct page *head, *page;
unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
rcu_read_lock();
- xas_for_each(&xas, page, end_pgoff) {
- if (xas_retry(&xas, page))
+ xas_for_each(&xas, head, end_pgoff) {
+ if (xas_retry(&xas, head))
continue;
- if (xa_is_value(page))
+ if (xa_is_value(head))
goto next;
/*
* Check for a locked page first, as a speculative
* reference may adversely influence page migration.
*/
- if (PageLocked(page))
+ if (PageLocked(head))
goto next;
- if (!page_cache_get_speculative(page))
+ if (!page_cache_get_speculative(head))
goto next;
/* Has the page moved or been split? */
- if (unlikely(page != xas_reload(&xas)))
+ if (unlikely(head != xas_reload(&xas)))
goto skip;
- page = find_subpage(page, xas.xa_index);
+ page = find_subpage(head, xas.xa_index);
- if (!PageUptodate(page) ||
+ if (!PageUptodate(head) ||
PageReadahead(page) ||
PageHWPoison(page))
goto skip;
- if (!trylock_page(page))
+ if (!trylock_page(head))
goto skip;
- if (page->mapping != mapping || !PageUptodate(page))
+ if (head->mapping != mapping || !PageUptodate(head))
goto unlock;
max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
- if (page->index >= max_idx)
+ if (xas.xa_index >= max_idx)
goto unlock;
if (mmap_miss > 0)
@@ -2840,12 +2860,12 @@ void filemap_map_pages(struct vm_fault *vmf,
last_pgoff = xas.xa_index;
if (alloc_set_pte(vmf, page))
goto unlock;
- unlock_page(page);
+ unlock_page(head);
goto next;
unlock:
- unlock_page(page);
+ unlock_page(head);
skip:
- put_page(page);
+ put_page(head);
next:
/* Huge page is mapped? No need to proceed. */
if (pmd_trans_huge(*vmf->pmd))
@@ -2984,7 +3004,7 @@ filler:
goto out;
/*
- * Page is not up to date and may be locked due one of the following
+ * Page is not up to date and may be locked due to one of the following
* case a: Page is being filled and the page lock is held
* case b: Read/write error clearing the page uptodate status
* case c: Truncation in progress (page locked)
diff --git a/mm/frame_vector.c b/mm/frame_vector.c
index 10f82d5643b6..3507e09cb3ff 100644
--- a/mm/frame_vector.c
+++ b/mm/frame_vector.c
@@ -12,7 +12,6 @@
* get_vaddr_frames() - map virtual addresses to pfns
* @start: starting user address
* @nr_frames: number of pages / pfns from start to map
- * @gup_flags: flags modifying lookup behaviour
* @vec: structure which receives pages / pfns of the addresses mapped.
* It should have space for at least nr_frames entries.
*
@@ -32,10 +31,11 @@
* This function takes care of grabbing mmap_lock as necessary.
*/
int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
- unsigned int gup_flags, struct frame_vector *vec)
+ struct frame_vector *vec)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
+ unsigned int gup_flags = FOLL_WRITE | FOLL_FORCE | FOLL_LONGTERM;
int ret = 0;
int err;
int locked;
diff --git a/mm/gup.c b/mm/gup.c
index e869c634cc9a..102877ed77a4 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -329,6 +329,13 @@ void unpin_user_pages(struct page **pages, unsigned long npages)
unsigned long index;
/*
+ * If this WARN_ON() fires, then the system *might* be leaking pages (by
+ * leaving them pinned), but probably not. More likely, gup/pup returned
+ * a hard -ERRNO error to the caller, who erroneously passed it here.
+ */
+ if (WARN_ON(IS_ERR_VALUE(npages)))
+ return;
+ /*
* TODO: this can be optimized for huge pages: if a series of pages is
* physically contiguous and part of the same compound page, then a
* single operation to the head page should suffice.
@@ -1483,35 +1490,6 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
mmap_read_unlock(mm);
return ret; /* 0 or negative error code */
}
-
-/**
- * get_dump_page() - pin user page in memory while writing it to core dump
- * @addr: user address
- *
- * Returns struct page pointer of user page pinned for dump,
- * to be freed afterwards by put_page().
- *
- * Returns NULL on any kind of failure - a hole must then be inserted into
- * the corefile, to preserve alignment with its headers; and also returns
- * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
- * allowing a hole to be left in the corefile to save diskspace.
- *
- * Called without mmap_lock, but after all other threads have been killed.
- */
-#ifdef CONFIG_ELF_CORE
-struct page *get_dump_page(unsigned long addr)
-{
- struct vm_area_struct *vma;
- struct page *page;
-
- if (__get_user_pages(current->mm, addr, 1,
- FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
- NULL) < 1)
- return NULL;
- flush_cache_page(vma, addr, page_to_pfn(page));
- return page;
-}
-#endif /* CONFIG_ELF_CORE */
#else /* CONFIG_MMU */
static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
unsigned long nr_pages, struct page **pages,
@@ -1557,6 +1535,38 @@ finish_or_fault:
}
#endif /* !CONFIG_MMU */
+/**
+ * get_dump_page() - pin user page in memory while writing it to core dump
+ * @addr: user address
+ *
+ * Returns struct page pointer of user page pinned for dump,
+ * to be freed afterwards by put_page().
+ *
+ * Returns NULL on any kind of failure - a hole must then be inserted into
+ * the corefile, to preserve alignment with its headers; and also returns
+ * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
+ * allowing a hole to be left in the corefile to save diskspace.
+ *
+ * Called without mmap_lock (takes and releases the mmap_lock by itself).
+ */
+#ifdef CONFIG_ELF_CORE
+struct page *get_dump_page(unsigned long addr)
+{
+ struct mm_struct *mm = current->mm;
+ struct page *page;
+ int locked = 1;
+ int ret;
+
+ if (mmap_read_lock_killable(mm))
+ return NULL;
+ ret = __get_user_pages_locked(mm, addr, 1, &page, NULL, &locked,
+ FOLL_FORCE | FOLL_DUMP | FOLL_GET);
+ if (locked)
+ mmap_read_unlock(mm);
+ return (ret == 1) ? page : NULL;
+}
+#endif /* CONFIG_ELF_CORE */
+
#if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA)
static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
{
@@ -1747,6 +1757,25 @@ static __always_inline long __gup_longterm_locked(struct mm_struct *mm,
}
#endif /* CONFIG_FS_DAX || CONFIG_CMA */
+static bool is_valid_gup_flags(unsigned int gup_flags)
+{
+ /*
+ * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
+ * never directly by the caller, so enforce that with an assertion:
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
+ return false;
+ /*
+ * FOLL_PIN is a prerequisite to FOLL_LONGTERM. Another way of saying
+ * that is, FOLL_LONGTERM is a specific, more restrictive case of
+ * FOLL_PIN.
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
+ return false;
+
+ return true;
+}
+
#ifdef CONFIG_MMU
static long __get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
@@ -1842,11 +1871,7 @@ long get_user_pages_remote(struct mm_struct *mm,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
{
- /*
- * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
- * never directly by the caller, so enforce that with an assertion:
- */
- if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
+ if (!is_valid_gup_flags(gup_flags))
return -EINVAL;
return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
@@ -1892,11 +1917,7 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas)
{
- /*
- * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
- * never directly by the caller, so enforce that with an assertion:
- */
- if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
+ if (!is_valid_gup_flags(gup_flags))
return -EINVAL;
return __gup_longterm_locked(current->mm, start, nr_pages,
@@ -2786,11 +2807,7 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast_only);
int get_user_pages_fast(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages)
{
- /*
- * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
- * never directly by the caller, so enforce that:
- */
- if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
+ if (!is_valid_gup_flags(gup_flags))
return -EINVAL;
/*
diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c
index be690fa66a46..464cae1fa3ea 100644
--- a/mm/gup_benchmark.c
+++ b/mm/gup_benchmark.c
@@ -6,10 +6,10 @@
#include <linux/debugfs.h>
#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark)
-#define GUP_LONGTERM_BENCHMARK _IOWR('g', 2, struct gup_benchmark)
-#define GUP_BENCHMARK _IOWR('g', 3, struct gup_benchmark)
-#define PIN_FAST_BENCHMARK _IOWR('g', 4, struct gup_benchmark)
-#define PIN_BENCHMARK _IOWR('g', 5, struct gup_benchmark)
+#define GUP_BENCHMARK _IOWR('g', 2, struct gup_benchmark)
+#define PIN_FAST_BENCHMARK _IOWR('g', 3, struct gup_benchmark)
+#define PIN_BENCHMARK _IOWR('g', 4, struct gup_benchmark)
+#define PIN_LONGTERM_BENCHMARK _IOWR('g', 5, struct gup_benchmark)
struct gup_benchmark {
__u64 get_delta_usec;
@@ -28,7 +28,6 @@ static void put_back_pages(unsigned int cmd, struct page **pages,
switch (cmd) {
case GUP_FAST_BENCHMARK:
- case GUP_LONGTERM_BENCHMARK:
case GUP_BENCHMARK:
for (i = 0; i < nr_pages; i++)
put_page(pages[i]);
@@ -36,6 +35,7 @@ static void put_back_pages(unsigned int cmd, struct page **pages,
case PIN_FAST_BENCHMARK:
case PIN_BENCHMARK:
+ case PIN_LONGTERM_BENCHMARK:
unpin_user_pages(pages, nr_pages);
break;
}
@@ -50,6 +50,7 @@ static void verify_dma_pinned(unsigned int cmd, struct page **pages,
switch (cmd) {
case PIN_FAST_BENCHMARK:
case PIN_BENCHMARK:
+ case PIN_LONGTERM_BENCHMARK:
for (i = 0; i < nr_pages; i++) {
page = pages[i];
if (WARN(!page_maybe_dma_pinned(page),
@@ -101,11 +102,6 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
nr = get_user_pages_fast(addr, nr, gup->flags,
pages + i);
break;
- case GUP_LONGTERM_BENCHMARK:
- nr = get_user_pages(addr, nr,
- gup->flags | FOLL_LONGTERM,
- pages + i, NULL);
- break;
case GUP_BENCHMARK:
nr = get_user_pages(addr, nr, gup->flags, pages + i,
NULL);
@@ -118,6 +114,11 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
nr = pin_user_pages(addr, nr, gup->flags, pages + i,
NULL);
break;
+ case PIN_LONGTERM_BENCHMARK:
+ nr = pin_user_pages(addr, nr,
+ gup->flags | FOLL_LONGTERM,
+ pages + i, NULL);
+ break;
default:
kvfree(pages);
ret = -EINVAL;
@@ -162,10 +163,10 @@ static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd,
switch (cmd) {
case GUP_FAST_BENCHMARK:
- case GUP_LONGTERM_BENCHMARK:
case GUP_BENCHMARK:
case PIN_FAST_BENCHMARK:
case PIN_BENCHMARK:
+ case PIN_LONGTERM_BENCHMARK:
break;
default:
return -EINVAL;
diff --git a/mm/highmem.c b/mm/highmem.c
index 64d8dea47dd1..1352a27951e3 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -369,7 +369,7 @@ void kunmap_high(struct page *page)
}
EXPORT_SYMBOL(kunmap_high);
-#endif
+#endif /* CONFIG_HIGHMEM */
#if defined(HASHED_PAGE_VIRTUAL)
@@ -481,4 +481,4 @@ void __init page_address_init(void)
}
}
-#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
+#endif /* defined(HASHED_PAGE_VIRTUAL) */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index ec0f0cc49545..cba3812a5c3e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -622,6 +622,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
lru_cache_add_inactive_or_unevictable(page, vma);
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
+ update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
mm_inc_nr_ptes(vma->vm_mm);
spin_unlock(vmf->ptl);
@@ -735,6 +736,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
} else {
set_huge_zero_page(pgtable, vma->vm_mm, vma,
haddr, vmf->pmd, zero_page);
+ update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
spin_unlock(vmf->ptl);
set = true;
}
@@ -2306,13 +2308,13 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
/*
* If we're also updating the vma->vm_next->vm_start, if the new
- * vm_next->vm_start isn't page aligned and it could previously
+ * vm_next->vm_start isn't hpage aligned and it could previously
* contain an hugepage: check if we need to split an huge pmd.
*/
if (adjust_next > 0) {
struct vm_area_struct *next = vma->vm_next;
unsigned long nstart = next->vm_start;
- nstart += adjust_next << PAGE_SHIFT;
+ nstart += adjust_next;
if (nstart & ~HPAGE_PMD_MASK &&
(nstart & HPAGE_PMD_MASK) >= next->vm_start &&
(nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
@@ -2335,13 +2337,13 @@ static void unmap_page(struct page *page)
VM_BUG_ON_PAGE(!unmap_success, page);
}
-static void remap_page(struct page *page)
+static void remap_page(struct page *page, unsigned int nr)
{
int i;
if (PageTransHuge(page)) {
remove_migration_ptes(page, page, true);
} else {
- for (i = 0; i < HPAGE_PMD_NR; i++)
+ for (i = 0; i < nr; i++)
remove_migration_ptes(page + i, page + i, true);
}
}
@@ -2419,6 +2421,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
struct lruvec *lruvec;
struct address_space *swap_cache = NULL;
unsigned long offset = 0;
+ unsigned int nr = thp_nr_pages(head);
int i;
lruvec = mem_cgroup_page_lruvec(head, pgdat);
@@ -2434,7 +2437,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
xa_lock(&swap_cache->i_pages);
}
- for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
+ for (i = nr - 1; i >= 1; i--) {
__split_huge_page_tail(head, i, lruvec, list);
/* Some pages can be beyond i_size: drop them from page cache */
if (head[i].index >= end) {
@@ -2454,7 +2457,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
ClearPageCompound(head);
- split_page_owner(head, HPAGE_PMD_ORDER);
+ split_page_owner(head, nr);
/* See comment in __split_huge_page_tail() */
if (PageAnon(head)) {
@@ -2473,9 +2476,9 @@ static void __split_huge_page(struct page *page, struct list_head *list,
spin_unlock_irqrestore(&pgdat->lru_lock, flags);
- remap_page(head);
+ remap_page(head, nr);
- for (i = 0; i < HPAGE_PMD_NR; i++) {
+ for (i = 0; i < nr; i++) {
struct page *subpage = head + i;
if (subpage == page)
continue;
@@ -2494,7 +2497,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
int total_mapcount(struct page *page)
{
- int i, compound, ret;
+ int i, compound, nr, ret;
VM_BUG_ON_PAGE(PageTail(page), page);
@@ -2502,16 +2505,17 @@ int total_mapcount(struct page *page)
return atomic_read(&page->_mapcount) + 1;
compound = compound_mapcount(page);
+ nr = compound_nr(page);
if (PageHuge(page))
return compound;
ret = compound;
- for (i = 0; i < HPAGE_PMD_NR; i++)
+ for (i = 0; i < nr; i++)
ret += atomic_read(&page[i]._mapcount) + 1;
/* File pages has compound_mapcount included in _mapcount */
if (!PageAnon(page))
- return ret - compound * HPAGE_PMD_NR;
+ return ret - compound * nr;
if (PageDoubleMap(page))
- ret -= HPAGE_PMD_NR;
+ ret -= nr;
return ret;
}
@@ -2556,14 +2560,14 @@ int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
page = compound_head(page);
_total_mapcount = ret = 0;
- for (i = 0; i < HPAGE_PMD_NR; i++) {
+ for (i = 0; i < thp_nr_pages(page); i++) {
mapcount = atomic_read(&page[i]._mapcount) + 1;
ret = max(ret, mapcount);
_total_mapcount += mapcount;
}
if (PageDoubleMap(page)) {
ret -= 1;
- _total_mapcount -= HPAGE_PMD_NR;
+ _total_mapcount -= thp_nr_pages(page);
}
mapcount = compound_mapcount(page);
ret += mapcount;
@@ -2580,9 +2584,9 @@ bool can_split_huge_page(struct page *page, int *pextra_pins)
/* Additional pins from page cache */
if (PageAnon(page))
- extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0;
+ extra_pins = PageSwapCache(page) ? thp_nr_pages(page) : 0;
else
- extra_pins = HPAGE_PMD_NR;
+ extra_pins = thp_nr_pages(page);
if (pextra_pins)
*pextra_pins = extra_pins;
return total_mapcount(page) == page_count(page) - extra_pins - 1;
@@ -2728,7 +2732,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
fail: if (mapping)
xa_unlock(&mapping->i_pages);
spin_unlock_irqrestore(&pgdata->lru_lock, flags);
- remap_page(head);
+ remap_page(head, thp_nr_pages(head));
ret = -EBUSY;
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 71dc657b50aa..fe76f8fd5a73 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -240,7 +240,6 @@ get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
resv->region_cache_count--;
nrg = list_first_entry(&resv->region_cache, struct file_region, link);
- VM_BUG_ON(!nrg);
list_del(&nrg->link);
nrg->from = from;
@@ -309,8 +308,7 @@ static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
list_del(&rg->link);
kfree(rg);
- coalesce_file_region(resv, prg);
- return;
+ rg = prg;
}
nrg = list_next_entry(rg, link);
@@ -320,22 +318,20 @@ static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
list_del(&rg->link);
kfree(rg);
-
- coalesce_file_region(resv, nrg);
- return;
}
}
-/* Must be called with resv->lock held. Calling this with count_only == true
- * will count the number of pages to be added but will not modify the linked
- * list. If regions_needed != NULL and count_only == true, then regions_needed
- * will indicate the number of file_regions needed in the cache to carry out to
- * add the regions for this range.
+/*
+ * Must be called with resv->lock held.
+ *
+ * Calling this with regions_needed != NULL will count the number of pages
+ * to be added but will not modify the linked list. In that case,
+ * regions_needed will indicate the number of file_regions needed in the
+ * cache to carry out the addition of the regions for this range.
*/
static long add_reservation_in_range(struct resv_map *resv, long f, long t,
struct hugetlb_cgroup *h_cg,
- struct hstate *h, long *regions_needed,
- bool count_only)
+ struct hstate *h, long *regions_needed)
{
long add = 0;
struct list_head *head = &resv->regions;
@@ -371,14 +367,14 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t,
*/
if (rg->from > last_accounted_offset) {
add += rg->from - last_accounted_offset;
- if (!count_only) {
+ if (!regions_needed) {
nrg = get_file_region_entry_from_cache(
resv, last_accounted_offset, rg->from);
record_hugetlb_cgroup_uncharge_info(h_cg, h,
resv, nrg);
list_add(&nrg->link, rg->link.prev);
coalesce_file_region(resv, nrg);
- } else if (regions_needed)
+ } else
*regions_needed += 1;
}
@@ -390,13 +386,13 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t,
*/
if (last_accounted_offset < t) {
add += t - last_accounted_offset;
- if (!count_only) {
+ if (!regions_needed) {
nrg = get_file_region_entry_from_cache(
resv, last_accounted_offset, t);
record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg);
list_add(&nrg->link, rg->link.prev);
coalesce_file_region(resv, nrg);
- } else if (regions_needed)
+ } else
*regions_needed += 1;
}
@@ -448,11 +444,8 @@ static int allocate_file_region_entries(struct resv_map *resv,
spin_lock(&resv->lock);
- list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
- list_del(&rg->link);
- list_add(&rg->link, &resv->region_cache);
- resv->region_cache_count++;
- }
+ list_splice(&allocated_regions, &resv->region_cache);
+ resv->region_cache_count += to_allocate;
}
return 0;
@@ -492,8 +485,8 @@ static long region_add(struct resv_map *resv, long f, long t,
retry:
/* Count how many regions are actually needed to execute this add. */
- add_reservation_in_range(resv, f, t, NULL, NULL, &actual_regions_needed,
- true);
+ add_reservation_in_range(resv, f, t, NULL, NULL,
+ &actual_regions_needed);
/*
* Check for sufficient descriptors in the cache to accommodate
@@ -521,7 +514,7 @@ retry:
goto retry;
}
- add = add_reservation_in_range(resv, f, t, h_cg, h, NULL, false);
+ add = add_reservation_in_range(resv, f, t, h_cg, h, NULL);
resv->adds_in_progress -= in_regions_needed;
@@ -557,9 +550,9 @@ static long region_chg(struct resv_map *resv, long f, long t,
spin_lock(&resv->lock);
- /* Count how many hugepages in this range are NOT respresented. */
+ /* Count how many hugepages in this range are NOT represented. */
chg = add_reservation_in_range(resv, f, t, NULL, NULL,
- out_regions_needed, true);
+ out_regions_needed);
if (*out_regions_needed == 0)
*out_regions_needed = 1;
@@ -1047,21 +1040,17 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
if (nocma && is_migrate_cma_page(page))
continue;
- if (!PageHWPoison(page))
- break;
+ if (PageHWPoison(page))
+ continue;
+
+ list_move(&page->lru, &h->hugepage_activelist);
+ set_page_refcounted(page);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[nid]--;
+ return page;
}
- /*
- * if 'non-isolated free hugepage' not found on the list,
- * the allocation fails.
- */
- if (&h->hugepage_freelists[nid] == &page->lru)
- return NULL;
- list_move(&page->lru, &h->hugepage_activelist);
- set_page_refcounted(page);
- h->free_huge_pages--;
- h->free_huge_pages_node[nid]--;
- return page;
+ return NULL;
}
static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid,
@@ -1511,9 +1500,9 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
{
INIT_LIST_HEAD(&page->lru);
set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
- spin_lock(&hugetlb_lock);
set_hugetlb_cgroup(page, NULL);
set_hugetlb_cgroup_rsvd(page, NULL);
+ spin_lock(&hugetlb_lock);
h->nr_huge_pages++;
h->nr_huge_pages_node[nid]++;
spin_unlock(&hugetlb_lock);
@@ -2423,7 +2412,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
h->resv_huge_pages--;
}
spin_lock(&hugetlb_lock);
- list_move(&page->lru, &h->hugepage_activelist);
+ list_add(&page->lru, &h->hugepage_activelist);
/* Fall through */
}
hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
@@ -3801,23 +3790,23 @@ bool is_hugetlb_entry_migration(pte_t pte)
if (huge_pte_none(pte) || pte_present(pte))
return false;
swp = pte_to_swp_entry(pte);
- if (non_swap_entry(swp) && is_migration_entry(swp))
+ if (is_migration_entry(swp))
return true;
else
return false;
}
-static int is_hugetlb_entry_hwpoisoned(pte_t pte)
+static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
{
swp_entry_t swp;
if (huge_pte_none(pte) || pte_present(pte))
- return 0;
+ return false;
swp = pte_to_swp_entry(pte);
- if (non_swap_entry(swp) && is_hwpoison_entry(swp))
- return 1;
+ if (is_hwpoison_entry(swp))
+ return true;
else
- return 0;
+ return false;
}
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
@@ -5350,10 +5339,16 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
* !shared pmd case because we can allocate the pmd later as well, it makes the
* code much cleaner.
*
- * This routine must be called with i_mmap_rwsem held in at least read mode.
- * For hugetlbfs, this prevents removal of any page table entries associated
- * with the address space. This is important as we are setting up sharing
- * based on existing page table entries (mappings).
+ * This routine must be called with i_mmap_rwsem held in at least read mode if
+ * sharing is possible. For hugetlbfs, this prevents removal of any page
+ * table entries associated with the address space. This is important as we
+ * are setting up sharing based on existing page table entries (mappings).
+ *
+ * NOTE: This routine is only called from huge_pte_alloc. Some callers of
+ * huge_pte_alloc know that sharing is not possible and do not take
+ * i_mmap_rwsem as a performance optimization. This is handled by the
+ * if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is
+ * only required for subsequent processing.
*/
pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
{
@@ -5370,6 +5365,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
if (!vma_shareable(vma, addr))
return (pte_t *)pmd_alloc(mm, pud, addr);
+ i_mmap_assert_locked(mapping);
vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
if (svma == vma)
continue;
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index e488876b168a..1ae1ebc2b9b1 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -26,11 +26,6 @@ static int hwpoison_inject(void *data, u64 val)
p = pfn_to_page(pfn);
hpage = compound_head(p);
- /*
- * This implies unable to support free buddy pages.
- */
- if (!get_hwpoison_page(p))
- return 0;
if (!hwpoison_filter_enable)
goto inject;
@@ -40,23 +35,20 @@ static int hwpoison_inject(void *data, u64 val)
* This implies unable to support non-LRU pages.
*/
if (!PageLRU(hpage) && !PageHuge(p))
- goto put_out;
+ return 0;
/*
- * do a racy check with elevated page count, to make sure PG_hwpoison
- * will only be set for the targeted owner (or on a free page).
+ * do a racy check to make sure PG_hwpoison will only be set for
+ * the targeted owner (or on a free page).
* memory_failure() will redo the check reliably inside page lock.
*/
err = hwpoison_filter(hpage);
if (err)
- goto put_out;
+ return 0;
inject:
pr_info("Injecting memory failure at pfn %#lx\n", pfn);
- return memory_failure(pfn, MF_COUNT_INCREASED);
-put_out:
- put_hwpoison_page(p);
- return 0;
+ return memory_failure(pfn, 0);
}
static int hwpoison_unpoison(void *data, u64 val)
diff --git a/mm/internal.h b/mm/internal.h
index 10c677655912..c43ccdddb0f6 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -49,22 +49,20 @@ void unmap_page_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
struct zap_details *details);
-void force_page_cache_readahead(struct address_space *, struct file *,
- pgoff_t index, unsigned long nr_to_read);
-void __do_page_cache_readahead(struct address_space *, struct file *,
- pgoff_t index, unsigned long nr_to_read,
+void do_page_cache_ra(struct readahead_control *, unsigned long nr_to_read,
unsigned long lookahead_size);
-
-/*
- * Submit IO for the read-ahead request in file_ra_state.
- */
-static inline void ra_submit(struct file_ra_state *ra,
- struct address_space *mapping, struct file *filp)
+void force_page_cache_ra(struct readahead_control *, struct file_ra_state *,
+ unsigned long nr);
+static inline void force_page_cache_readahead(struct address_space *mapping,
+ struct file *file, pgoff_t index, unsigned long nr_to_read)
{
- __do_page_cache_readahead(mapping, filp,
- ra->start, ra->size, ra->async_size);
+ DEFINE_READAHEAD(ractl, file, mapping, index);
+ force_page_cache_ra(&ractl, &file->f_ra, nr_to_read);
}
+struct page *find_get_entry(struct address_space *mapping, pgoff_t index);
+struct page *find_lock_entry(struct address_space *mapping, pgoff_t index);
+
/**
* page_evictable - test whether a page is evictable
* @page: the page to test
@@ -272,16 +270,16 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
* page from being allocated in parallel and returning garbage as the order.
* If a caller does not hold page_zone(page)->lock, it must guarantee that the
* page cannot be allocated or merged in parallel. Alternatively, it must
- * handle invalid values gracefully, and use page_order_unsafe() below.
+ * handle invalid values gracefully, and use buddy_order_unsafe() below.
*/
-static inline unsigned int page_order(struct page *page)
+static inline unsigned int buddy_order(struct page *page)
{
/* PageBuddy() must be checked by the caller */
return page_private(page);
}
/*
- * Like page_order(), but for callers who cannot afford to hold the zone lock.
+ * Like buddy_order(), but for callers who cannot afford to hold the zone lock.
* PageBuddy() should be checked first by the caller to minimize race window,
* and invalid values must be handled gracefully.
*
@@ -291,7 +289,7 @@ static inline unsigned int page_order(struct page *page)
* times, potentially observing different values in the tests and the actual
* use of the result.
*/
-#define page_order_unsafe(page) READ_ONCE(page_private(page))
+#define buddy_order_unsafe(page) READ_ONCE(page_private(page))
static inline bool is_cow_mapping(vm_flags_t flags)
{
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 4f49fa6cd1aa..00a53f1355ae 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -33,6 +33,8 @@
#include <asm/sections.h>
+#include <kunit/test.h>
+
#include "kasan.h"
#include "../slab.h"
@@ -93,7 +95,7 @@ static void end_report(unsigned long *flags)
pr_err("==================================================================\n");
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
spin_unlock_irqrestore(&report_lock, *flags);
- if (panic_on_warn) {
+ if (panic_on_warn && !test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) {
/*
* This thread may hit another WARN() in the panic path.
* Resetting this prevents additional WARN() from panicking the
@@ -464,12 +466,37 @@ static bool report_enabled(void)
return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags);
}
+#if IS_ENABLED(CONFIG_KUNIT)
+static void kasan_update_kunit_status(struct kunit *cur_test)
+{
+ struct kunit_resource *resource;
+ struct kunit_kasan_expectation *kasan_data;
+
+ resource = kunit_find_named_resource(cur_test, "kasan_data");
+
+ if (!resource) {
+ kunit_set_failure(cur_test);
+ return;
+ }
+
+ kasan_data = (struct kunit_kasan_expectation *)resource->data;
+ kasan_data->report_found = true;
+ kunit_put_resource(resource);
+}
+#endif /* IS_ENABLED(CONFIG_KUNIT) */
+
void kasan_report_invalid_free(void *object, unsigned long ip)
{
unsigned long flags;
u8 tag = get_tag(object);
object = reset_tag(object);
+
+#if IS_ENABLED(CONFIG_KUNIT)
+ if (current->kunit_test)
+ kasan_update_kunit_status(current->kunit_test);
+#endif /* IS_ENABLED(CONFIG_KUNIT) */
+
start_report(&flags);
pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip);
print_tags(tag, object);
@@ -488,6 +515,11 @@ static void __kasan_report(unsigned long addr, size_t size, bool is_write,
void *untagged_addr;
unsigned long flags;
+#if IS_ENABLED(CONFIG_KUNIT)
+ if (current->kunit_test)
+ kasan_update_kunit_status(current->kunit_test);
+#endif /* IS_ENABLED(CONFIG_KUNIT) */
+
disable_trace_on_warning();
tagged_addr = (void *)addr;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index cfa0dba5fd3b..f1d5f6dde47c 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -56,6 +56,9 @@ enum scan_result {
#define CREATE_TRACE_POINTS
#include <trace/events/huge_memory.h>
+static struct task_struct *khugepaged_thread __read_mostly;
+static DEFINE_MUTEX(khugepaged_mutex);
+
/* default scan 8*512 pte (or vmas) every 30 second */
static unsigned int khugepaged_pages_to_scan __read_mostly;
static unsigned int khugepaged_pages_collapsed;
@@ -431,7 +434,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
static inline int khugepaged_test_exit(struct mm_struct *mm)
{
- return atomic_read(&mm->mm_users) == 0 || !mmget_still_valid(mm);
+ return atomic_read(&mm->mm_users) == 0;
}
static bool hugepage_vma_check(struct vm_area_struct *vma,
@@ -2292,8 +2295,6 @@ static void set_recommended_min_free_kbytes(void)
int start_stop_khugepaged(void)
{
- static struct task_struct *khugepaged_thread __read_mostly;
- static DEFINE_MUTEX(khugepaged_mutex);
int err = 0;
mutex_lock(&khugepaged_mutex);
@@ -2320,3 +2321,11 @@ fail:
mutex_unlock(&khugepaged_mutex);
return err;
}
+
+void khugepaged_min_free_kbytes_update(void)
+{
+ mutex_lock(&khugepaged_mutex);
+ if (khugepaged_enabled() && khugepaged_thread)
+ set_recommended_min_free_kbytes();
+ mutex_unlock(&khugepaged_mutex);
+}
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
deleted file mode 100644
index e19279ff6aa3..000000000000
--- a/mm/kmemleak-test.c
+++ /dev/null
@@ -1,99 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * mm/kmemleak-test.c
- *
- * Copyright (C) 2008 ARM Limited
- * Written by Catalin Marinas <catalin.marinas@arm.com>
- */
-
-#define pr_fmt(fmt) "kmemleak: " fmt
-
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/list.h>
-#include <linux/percpu.h>
-#include <linux/fdtable.h>
-
-#include <linux/kmemleak.h>
-
-struct test_node {
- long header[25];
- struct list_head list;
- long footer[25];
-};
-
-static LIST_HEAD(test_list);
-static DEFINE_PER_CPU(void *, kmemleak_test_pointer);
-
-/*
- * Some very simple testing. This function needs to be extended for
- * proper testing.
- */
-static int __init kmemleak_test_init(void)
-{
- struct test_node *elem;
- int i;
-
- pr_info("Kmemleak testing\n");
-
- /* make some orphan objects */
- pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL));
- pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL));
- pr_info("kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL));
- pr_info("kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL));
- pr_info("kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL));
- pr_info("kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL));
- pr_info("kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL));
- pr_info("kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL));
-#ifndef CONFIG_MODULES
- pr_info("kmem_cache_alloc(files_cachep) = %p\n",
- kmem_cache_alloc(files_cachep, GFP_KERNEL));
- pr_info("kmem_cache_alloc(files_cachep) = %p\n",
- kmem_cache_alloc(files_cachep, GFP_KERNEL));
-#endif
- pr_info("vmalloc(64) = %p\n", vmalloc(64));
- pr_info("vmalloc(64) = %p\n", vmalloc(64));
- pr_info("vmalloc(64) = %p\n", vmalloc(64));
- pr_info("vmalloc(64) = %p\n", vmalloc(64));
- pr_info("vmalloc(64) = %p\n", vmalloc(64));
-
- /*
- * Add elements to a list. They should only appear as orphan
- * after the module is removed.
- */
- for (i = 0; i < 10; i++) {
- elem = kzalloc(sizeof(*elem), GFP_KERNEL);
- pr_info("kzalloc(sizeof(*elem)) = %p\n", elem);
- if (!elem)
- return -ENOMEM;
- INIT_LIST_HEAD(&elem->list);
- list_add_tail(&elem->list, &test_list);
- }
-
- for_each_possible_cpu(i) {
- per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL);
- pr_info("kmalloc(129) = %p\n",
- per_cpu(kmemleak_test_pointer, i));
- }
-
- return 0;
-}
-module_init(kmemleak_test_init);
-
-static void __exit kmemleak_test_exit(void)
-{
- struct test_node *elem, *tmp;
-
- /*
- * Remove the list elements without actually freeing the
- * memory.
- */
- list_for_each_entry_safe(elem, tmp, &test_list, list)
- list_del(&elem->list);
-}
-module_exit(kmemleak_test_exit);
-
-MODULE_LICENSE("GPL");
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 5e252d91eb14..c0014d3b91c1 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1471,15 +1471,15 @@ static void kmemleak_scan(void)
if (kmemleak_stack_scan) {
struct task_struct *p, *g;
- read_lock(&tasklist_lock);
- do_each_thread(g, p) {
+ rcu_read_lock();
+ for_each_process_thread(g, p) {
void *stack = try_get_task_stack(p);
if (stack) {
scan_block(stack, stack + THREAD_SIZE, NULL);
put_task_stack(p);
}
- } while_each_thread(g, p);
- read_unlock(&tasklist_lock);
+ }
+ rcu_read_unlock();
}
/*
diff --git a/mm/madvise.c b/mm/madvise.c
index 0e0d61003fc6..fd1f448b4e1d 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -224,25 +224,28 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
struct address_space *mapping)
{
- pgoff_t index;
+ XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
+ pgoff_t end_index = end / PAGE_SIZE;
struct page *page;
- swp_entry_t swap;
- for (; start < end; start += PAGE_SIZE) {
- index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+ rcu_read_lock();
+ xas_for_each(&xas, page, end_index) {
+ swp_entry_t swap;
- page = find_get_entry(mapping, index);
- if (!xa_is_value(page)) {
- if (page)
- put_page(page);
+ if (!xa_is_value(page))
continue;
- }
+ xas_pause(&xas);
+ rcu_read_unlock();
+
swap = radix_to_swp_entry(page);
page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
NULL, 0, false);
if (page)
put_page(page);
+
+ rcu_read_lock();
}
+ rcu_read_unlock();
lru_add_drain(); /* Push any new pages onto the LRU now */
}
@@ -869,7 +872,6 @@ static long madvise_remove(struct vm_area_struct *vma,
static int madvise_inject_error(int behavior,
unsigned long start, unsigned long end)
{
- struct page *page;
struct zone *zone;
unsigned long size;
@@ -879,6 +881,7 @@ static int madvise_inject_error(int behavior,
for (; start < end; start += size) {
unsigned long pfn;
+ struct page *page;
int ret;
ret = get_user_pages_fast(start, 1, 0, &page);
@@ -893,32 +896,23 @@ static int madvise_inject_error(int behavior,
*/
size = page_size(compound_head(page));
- if (PageHWPoison(page)) {
- put_page(page);
- continue;
- }
-
if (behavior == MADV_SOFT_OFFLINE) {
pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
- pfn, start);
-
+ pfn, start);
ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
- if (ret)
- return ret;
- continue;
+ } else {
+ pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
+ pfn, start);
+ /*
+ * Drop the page reference taken by get_user_pages_fast(). In
+ * the absence of MF_COUNT_INCREASED the memory_failure()
+ * routine is responsible for pinning the page to prevent it
+ * from being released back to the page allocator.
+ */
+ put_page(page);
+ ret = memory_failure(pfn, 0);
}
- pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
- pfn, start);
-
- /*
- * Drop the page reference taken by get_user_pages_fast(). In
- * the absence of MF_COUNT_INCREASED the memory_failure()
- * routine is responsible for pinning the page to prevent it
- * from being released back to the page allocator.
- */
- put_page(page);
- ret = memory_failure(pfn, 0);
if (ret)
return ret;
}
@@ -1091,23 +1085,6 @@ int do_madvise(unsigned long start, size_t len_in, int behavior)
if (write) {
if (mmap_write_lock_killable(current->mm))
return -EINTR;
-
- /*
- * We may have stolen the mm from another process
- * that is undergoing core dumping.
- *
- * Right now that's io_ring, in the future it may
- * be remote process management and not "current"
- * at all.
- *
- * We need to fix core dumping to not do this,
- * but for now we have the mmget_still_valid()
- * model.
- */
- if (!mmget_still_valid(current->mm)) {
- mmap_write_unlock(current->mm);
- return -EINTR;
- }
} else {
mmap_read_lock(current->mm);
}
diff --git a/mm/memblock.c b/mm/memblock.c
index 45f198750be9..165f40a8a254 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -132,7 +132,26 @@ struct memblock_type physmem = {
};
#endif
-int memblock_debug __initdata_memblock;
+/*
+ * keep a pointer to &memblock.memory in the text section to use it in
+ * __next_mem_range() and its helpers.
+ * For architectures that do not keep memblock data after init, this
+ * pointer will be reset to NULL at memblock_discard()
+ */
+static __refdata struct memblock_type *memblock_memory = &memblock.memory;
+
+#define for_each_memblock_type(i, memblock_type, rgn) \
+ for (i = 0, rgn = &memblock_type->regions[0]; \
+ i < memblock_type->cnt; \
+ i++, rgn = &memblock_type->regions[i])
+
+#define memblock_dbg(fmt, ...) \
+ do { \
+ if (memblock_debug) \
+ pr_info(fmt, ##__VA_ARGS__); \
+ } while (0)
+
+static int memblock_debug __initdata_memblock;
static bool system_has_some_mirror __initdata_memblock = false;
static int memblock_can_resize __initdata_memblock;
static int memblock_memory_in_slab __initdata_memblock = 0;
@@ -391,6 +410,8 @@ void __init memblock_discard(void)
memblock.memory.max);
__memblock_free_late(addr, size);
}
+
+ memblock_memory = NULL;
}
#endif
@@ -941,42 +962,16 @@ int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size)
return memblock_setclr_flag(base, size, 0, MEMBLOCK_NOMAP);
}
-/**
- * __next_reserved_mem_region - next function for for_each_reserved_region()
- * @idx: pointer to u64 loop variable
- * @out_start: ptr to phys_addr_t for start address of the region, can be %NULL
- * @out_end: ptr to phys_addr_t for end address of the region, can be %NULL
- *
- * Iterate over all reserved memory regions.
- */
-void __init_memblock __next_reserved_mem_region(u64 *idx,
- phys_addr_t *out_start,
- phys_addr_t *out_end)
-{
- struct memblock_type *type = &memblock.reserved;
-
- if (*idx < type->cnt) {
- struct memblock_region *r = &type->regions[*idx];
- phys_addr_t base = r->base;
- phys_addr_t size = r->size;
-
- if (out_start)
- *out_start = base;
- if (out_end)
- *out_end = base + size - 1;
-
- *idx += 1;
- return;
- }
-
- /* signal end of iteration */
- *idx = ULLONG_MAX;
-}
-
-static bool should_skip_region(struct memblock_region *m, int nid, int flags)
+static bool should_skip_region(struct memblock_type *type,
+ struct memblock_region *m,
+ int nid, int flags)
{
int m_nid = memblock_get_region_node(m);
+ /* we never skip regions when iterating memblock.reserved or physmem */
+ if (type != memblock_memory)
+ return false;
+
/* only memory regions are associated with nodes, check it */
if (nid != NUMA_NO_NODE && nid != m_nid)
return true;
@@ -1041,7 +1036,7 @@ void __next_mem_range(u64 *idx, int nid, enum memblock_flags flags,
phys_addr_t m_end = m->base + m->size;
int m_nid = memblock_get_region_node(m);
- if (should_skip_region(m, nid, flags))
+ if (should_skip_region(type_a, m, nid, flags))
continue;
if (!type_b) {
@@ -1145,7 +1140,7 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
phys_addr_t m_end = m->base + m->size;
int m_nid = memblock_get_region_node(m);
- if (should_skip_region(m, nid, flags))
+ if (should_skip_region(type_a, m, nid, flags))
continue;
if (!type_b) {
@@ -1649,23 +1644,6 @@ phys_addr_t __init_memblock memblock_reserved_size(void)
return memblock.reserved.total_size;
}
-phys_addr_t __init memblock_mem_size(unsigned long limit_pfn)
-{
- unsigned long pages = 0;
- struct memblock_region *r;
- unsigned long start_pfn, end_pfn;
-
- for_each_memblock(memory, r) {
- start_pfn = memblock_region_memory_base_pfn(r);
- end_pfn = memblock_region_memory_end_pfn(r);
- start_pfn = min_t(unsigned long, start_pfn, limit_pfn);
- end_pfn = min_t(unsigned long, end_pfn, limit_pfn);
- pages += end_pfn - start_pfn;
- }
-
- return PFN_PHYS(pages);
-}
-
/* lowest address */
phys_addr_t __init_memblock memblock_start_of_DRAM(void)
{
@@ -1689,7 +1667,7 @@ static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit)
* the memory memblock regions, if the @limit exceeds the total size
* of those regions, max_addr will keep original value PHYS_ADDR_MAX
*/
- for_each_memblock(memory, r) {
+ for_each_mem_region(r) {
if (limit <= r->size) {
max_addr = r->base + limit;
break;
@@ -1859,7 +1837,7 @@ void __init_memblock memblock_trim_memory(phys_addr_t align)
phys_addr_t start, end, orig_start, orig_end;
struct memblock_region *r;
- for_each_memblock(memory, r) {
+ for_each_mem_region(r) {
orig_start = r->base;
orig_end = r->base + r->size;
start = round_up(orig_start, align);
@@ -1915,7 +1893,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type)
}
}
-void __init_memblock __memblock_dump_all(void)
+static void __init_memblock __memblock_dump_all(void)
{
pr_info("MEMBLOCK configuration:\n");
pr_info(" memory size = %pa reserved size = %pa\n",
@@ -1929,6 +1907,12 @@ void __init_memblock __memblock_dump_all(void)
#endif
}
+void __init_memblock memblock_dump_all(void)
+{
+ if (memblock_debug)
+ __memblock_dump_all();
+}
+
void __init memblock_allow_resize(void)
{
memblock_can_resize = 1;
@@ -1981,7 +1965,7 @@ static unsigned long __init free_low_memory_core_early(void)
memblock_clear_hotplug(0, -1);
- for_each_reserved_mem_region(i, &start, &end)
+ for_each_reserved_mem_range(i, &start, &end)
reserve_bootmem_region(start, end);
/*
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5c1983c84395..c04b57ccefe9 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -197,14 +197,6 @@ static struct move_charge_struct {
#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
-enum charge_type {
- MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
- MEM_CGROUP_CHARGE_TYPE_ANON,
- MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
- MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
- NR_CHARGE_TYPE,
-};
-
/* for encoding cft->private value on file */
enum res_type {
_MEM,
@@ -1102,9 +1094,9 @@ static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
* invocations for reference counting, or use mem_cgroup_iter_break()
* to cancel a hierarchy walk before the round-trip is complete.
*
- * Reclaimers can specify a node and a priority level in @reclaim to
- * divide up the memcgs in the hierarchy among all concurrent
- * reclaimers operating on the same node and priority.
+ * Reclaimers can specify a node in @reclaim to divide up the memcgs
+ * in the hierarchy among all concurrent reclaimers operating on the
+ * same node.
*/
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
@@ -1456,6 +1448,70 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
return false;
}
+struct memory_stat {
+ const char *name;
+ unsigned int ratio;
+ unsigned int idx;
+};
+
+static struct memory_stat memory_stats[] = {
+ { "anon", PAGE_SIZE, NR_ANON_MAPPED },
+ { "file", PAGE_SIZE, NR_FILE_PAGES },
+ { "kernel_stack", 1024, NR_KERNEL_STACK_KB },
+ { "percpu", 1, MEMCG_PERCPU_B },
+ { "sock", PAGE_SIZE, MEMCG_SOCK },
+ { "shmem", PAGE_SIZE, NR_SHMEM },
+ { "file_mapped", PAGE_SIZE, NR_FILE_MAPPED },
+ { "file_dirty", PAGE_SIZE, NR_FILE_DIRTY },
+ { "file_writeback", PAGE_SIZE, NR_WRITEBACK },
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ /*
+ * The ratio is initialized in memory_stats_init() because, on
+ * some architectures (e.g. powerpc), the HPAGE_PMD_SIZE macro is
+ * not a constant.
+ */
+ { "anon_thp", 0, NR_ANON_THPS },
+#endif
+ { "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON },
+ { "active_anon", PAGE_SIZE, NR_ACTIVE_ANON },
+ { "inactive_file", PAGE_SIZE, NR_INACTIVE_FILE },
+ { "active_file", PAGE_SIZE, NR_ACTIVE_FILE },
+ { "unevictable", PAGE_SIZE, NR_UNEVICTABLE },
+
+ /*
+ * Note: The slab_reclaimable and slab_unreclaimable must be
+ * together and slab_reclaimable must be in front.
+ */
+ { "slab_reclaimable", 1, NR_SLAB_RECLAIMABLE_B },
+ { "slab_unreclaimable", 1, NR_SLAB_UNRECLAIMABLE_B },
+
+ /* The memory events */
+ { "workingset_refault_anon", 1, WORKINGSET_REFAULT_ANON },
+ { "workingset_refault_file", 1, WORKINGSET_REFAULT_FILE },
+ { "workingset_activate_anon", 1, WORKINGSET_ACTIVATE_ANON },
+ { "workingset_activate_file", 1, WORKINGSET_ACTIVATE_FILE },
+ { "workingset_restore_anon", 1, WORKINGSET_RESTORE_ANON },
+ { "workingset_restore_file", 1, WORKINGSET_RESTORE_FILE },
+ { "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM },
+};
+
+static int __init memory_stats_init(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (memory_stats[i].idx == NR_ANON_THPS)
+ memory_stats[i].ratio = HPAGE_PMD_SIZE;
+#endif
+ VM_BUG_ON(!memory_stats[i].ratio);
+ VM_BUG_ON(memory_stats[i].idx >= MEMCG_NR_STAT);
+ }
+
+ return 0;
+}
+pure_initcall(memory_stats_init);
+
static char *memory_stat_format(struct mem_cgroup *memcg)
{
struct seq_buf s;
@@ -1476,52 +1532,19 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
* Current memory state:
*/
- seq_buf_printf(&s, "anon %llu\n",
- (u64)memcg_page_state(memcg, NR_ANON_MAPPED) *
- PAGE_SIZE);
- seq_buf_printf(&s, "file %llu\n",
- (u64)memcg_page_state(memcg, NR_FILE_PAGES) *
- PAGE_SIZE);
- seq_buf_printf(&s, "kernel_stack %llu\n",
- (u64)memcg_page_state(memcg, NR_KERNEL_STACK_KB) *
- 1024);
- seq_buf_printf(&s, "slab %llu\n",
- (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
- memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B)));
- seq_buf_printf(&s, "percpu %llu\n",
- (u64)memcg_page_state(memcg, MEMCG_PERCPU_B));
- seq_buf_printf(&s, "sock %llu\n",
- (u64)memcg_page_state(memcg, MEMCG_SOCK) *
- PAGE_SIZE);
-
- seq_buf_printf(&s, "shmem %llu\n",
- (u64)memcg_page_state(memcg, NR_SHMEM) *
- PAGE_SIZE);
- seq_buf_printf(&s, "file_mapped %llu\n",
- (u64)memcg_page_state(memcg, NR_FILE_MAPPED) *
- PAGE_SIZE);
- seq_buf_printf(&s, "file_dirty %llu\n",
- (u64)memcg_page_state(memcg, NR_FILE_DIRTY) *
- PAGE_SIZE);
- seq_buf_printf(&s, "file_writeback %llu\n",
- (u64)memcg_page_state(memcg, NR_WRITEBACK) *
- PAGE_SIZE);
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- seq_buf_printf(&s, "anon_thp %llu\n",
- (u64)memcg_page_state(memcg, NR_ANON_THPS) *
- HPAGE_PMD_SIZE);
-#endif
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
+ u64 size;
- for (i = 0; i < NR_LRU_LISTS; i++)
- seq_buf_printf(&s, "%s %llu\n", lru_list_name(i),
- (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
- PAGE_SIZE);
+ size = memcg_page_state(memcg, memory_stats[i].idx);
+ size *= memory_stats[i].ratio;
+ seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);
- seq_buf_printf(&s, "slab_reclaimable %llu\n",
- (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B));
- seq_buf_printf(&s, "slab_unreclaimable %llu\n",
- (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B));
+ if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
+ size = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
+ memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B);
+ seq_buf_printf(&s, "slab %llu\n", size);
+ }
+ }
/* Accumulated memory events */
@@ -1529,22 +1552,6 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
memcg_events(memcg, PGFAULT));
seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
memcg_events(memcg, PGMAJFAULT));
-
- seq_buf_printf(&s, "workingset_refault_anon %lu\n",
- memcg_page_state(memcg, WORKINGSET_REFAULT_ANON));
- seq_buf_printf(&s, "workingset_refault_file %lu\n",
- memcg_page_state(memcg, WORKINGSET_REFAULT_FILE));
- seq_buf_printf(&s, "workingset_activate_anon %lu\n",
- memcg_page_state(memcg, WORKINGSET_ACTIVATE_ANON));
- seq_buf_printf(&s, "workingset_activate_file %lu\n",
- memcg_page_state(memcg, WORKINGSET_ACTIVATE_FILE));
- seq_buf_printf(&s, "workingset_restore_anon %lu\n",
- memcg_page_state(memcg, WORKINGSET_RESTORE_ANON));
- seq_buf_printf(&s, "workingset_restore_file %lu\n",
- memcg_page_state(memcg, WORKINGSET_RESTORE_FILE));
- seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
- memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
-
seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL),
memcg_events(memcg, PGREFILL));
seq_buf_printf(&s, "pgscan %lu\n",
@@ -1641,17 +1648,19 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
*/
unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
{
- unsigned long max;
+ unsigned long max = READ_ONCE(memcg->memory.max);
- max = READ_ONCE(memcg->memory.max);
- if (mem_cgroup_swappiness(memcg)) {
- unsigned long memsw_max;
- unsigned long swap_max;
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
+ if (mem_cgroup_swappiness(memcg))
+ max += min(READ_ONCE(memcg->swap.max),
+ (unsigned long)total_swap_pages);
+ } else { /* v1 */
+ if (mem_cgroup_swappiness(memcg)) {
+ /* Calculate swap excess capacity from memsw limit */
+ unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
- memsw_max = memcg->memsw.max;
- swap_max = READ_ONCE(memcg->swap.max);
- swap_max = min(swap_max, (unsigned long)total_swap_pages);
- max = min(max + swap_max, memsw_max);
+ max += min(swap, (unsigned long)total_swap_pages);
+ }
}
return max;
}
@@ -1817,8 +1826,8 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
struct mem_cgroup *iter;
/*
- * When a new child is created while the hierarchy is under oom,
- * mem_cgroup_oom_lock() may not be called. Watch for underflow.
+ * Be careful about under_oom underflows because a child memcg
+ * could have been added after mem_cgroup_mark_under_oom.
*/
spin_lock(&memcg_oom_lock);
for_each_mem_cgroup_tree(iter, memcg)
@@ -2888,6 +2897,17 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p)
page = virt_to_head_page(p);
/*
+ * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer
+ * or a pointer to obj_cgroup vector. In the latter case the lowest
+ * bit of the pointer is set.
+ * The page->mem_cgroup pointer can be asynchronously changed
+ * from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed
+ * from a valid memcg pointer to objcg vector or back.
+ */
+ if (!page->mem_cgroup)
+ return NULL;
+
+ /*
* Slab objects are accounted individually, not per-page.
* Memcg membership data for each individual object is saved in
* the page->obj_cgroups.
@@ -4255,17 +4275,16 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
new->size = size;
/* Copy thresholds (if any) to new array */
- if (thresholds->primary) {
- memcpy(new->entries, thresholds->primary->entries, (size - 1) *
- sizeof(struct mem_cgroup_threshold));
- }
+ if (thresholds->primary)
+ memcpy(new->entries, thresholds->primary->entries,
+ flex_array_size(new, entries, size - 1));
/* Add new threshold */
new->entries[size - 1].eventfd = eventfd;
new->entries[size - 1].threshold = threshold;
/* Sort thresholds. Registering of new threshold isn't time-critical */
- sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
+ sort(new->entries, size, sizeof(*new->entries),
compare_thresholds, NULL);
/* Find current threshold */
@@ -5291,13 +5310,11 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
memcg->use_hierarchy = true;
page_counter_init(&memcg->memory, &parent->memory);
page_counter_init(&memcg->swap, &parent->swap);
- page_counter_init(&memcg->memsw, &parent->memsw);
page_counter_init(&memcg->kmem, &parent->kmem);
page_counter_init(&memcg->tcpmem, &parent->tcpmem);
} else {
page_counter_init(&memcg->memory, NULL);
page_counter_init(&memcg->swap, NULL);
- page_counter_init(&memcg->memsw, NULL);
page_counter_init(&memcg->kmem, NULL);
page_counter_init(&memcg->tcpmem, NULL);
/*
@@ -5426,7 +5443,6 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
- page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
page_counter_set_min(&memcg->memory, 0);
@@ -5539,35 +5555,15 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, swp_entry_t *entry)
{
- struct page *page = NULL;
- struct address_space *mapping;
- pgoff_t pgoff;
-
if (!vma->vm_file) /* anonymous vma */
return NULL;
if (!(mc.flags & MOVE_FILE))
return NULL;
- mapping = vma->vm_file->f_mapping;
- pgoff = linear_page_index(vma, addr);
-
/* page is moved even if it's not RSS of this task(page-faulted). */
-#ifdef CONFIG_SWAP
/* shmem/tmpfs may report page out on swap: account for that too. */
- if (shmem_mapping(mapping)) {
- page = find_get_entry(mapping, pgoff);
- if (xa_is_value(page)) {
- swp_entry_t swp = radix_to_swp_entry(page);
- *entry = swp;
- page = find_get_page(swap_address_space(swp),
- swp_offset(swp));
- }
- } else
- page = find_get_page(mapping, pgoff);
-#else
- page = find_get_page(mapping, pgoff);
-#endif
- return page;
+ return find_get_incore_page(vma->vm_file->f_mapping,
+ linear_page_index(vma, addr));
}
/**
@@ -6393,6 +6389,35 @@ static int memory_stat_show(struct seq_file *m, void *v)
return 0;
}
+#ifdef CONFIG_NUMA
+static int memory_numa_stat_show(struct seq_file *m, void *v)
+{
+ int i;
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
+ int nid;
+
+ if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS)
+ continue;
+
+ seq_printf(m, "%s", memory_stats[i].name);
+ for_each_node_state(nid, N_MEMORY) {
+ u64 size;
+ struct lruvec *lruvec;
+
+ lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+ size = lruvec_page_state(lruvec, memory_stats[i].idx);
+ size *= memory_stats[i].ratio;
+ seq_printf(m, " N%d=%llu", nid, size);
+ }
+ seq_putc(m, '\n');
+ }
+
+ return 0;
+}
+#endif
+
static int memory_oom_group_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
@@ -6470,6 +6495,12 @@ static struct cftype memory_files[] = {
.name = "stat",
.seq_show = memory_stat_show,
},
+#ifdef CONFIG_NUMA
+ {
+ .name = "numa_stat",
+ .seq_show = memory_numa_stat_show,
+ },
+#endif
{
.name = "oom.group",
.flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index a1e73943445e..a2184b721fbf 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -65,6 +65,33 @@ int sysctl_memory_failure_recovery __read_mostly = 1;
atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
+static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release)
+{
+ if (hugepage_or_freepage) {
+ /*
+ * Doing this check for free pages is also fine since dissolve_free_huge_page
+ * returns 0 for non-hugetlb pages as well.
+ */
+ if (dissolve_free_huge_page(page) || !take_page_off_buddy(page))
+ /*
+ * We could fail to take off the target page from buddy
+ * for example due to racy page allocation, but that's
+ * acceptable because soft-offlined page is not broken
+ * and if someone really wants to use it, they should
+ * take it.
+ */
+ return false;
+ }
+
+ SetPageHWPoison(page);
+ if (release)
+ put_page(page);
+ page_ref_inc(page);
+ num_poisoned_pages_inc();
+
+ return true;
+}
+
#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
u32 hwpoison_filter_enable = 0;
@@ -484,11 +511,12 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
struct vm_area_struct *vma;
struct task_struct *tsk;
struct address_space *mapping = page->mapping;
+ pgoff_t pgoff;
i_mmap_lock_read(mapping);
read_lock(&tasklist_lock);
+ pgoff = page_to_pgoff(page);
for_each_process(tsk) {
- pgoff_t pgoff = page_to_pgoff(page);
struct task_struct *t = task_early_kill(tsk, force_early);
if (!t)
@@ -554,6 +582,7 @@ static const char * const action_page_types[] = {
[MF_MSG_BUDDY] = "free buddy page",
[MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)",
[MF_MSG_DAX] = "dax page",
+ [MF_MSG_UNSPLIT_THP] = "unsplit thp",
[MF_MSG_UNKNOWN] = "unknown page",
};
@@ -824,7 +853,6 @@ static int me_huge_page(struct page *p, unsigned long pfn)
#define sc ((1UL << PG_swapcache) | (1UL << PG_swapbacked))
#define unevict (1UL << PG_unevictable)
#define mlock (1UL << PG_mlocked)
-#define writeback (1UL << PG_writeback)
#define lru (1UL << PG_lru)
#define head (1UL << PG_head)
#define slab (1UL << PG_slab)
@@ -873,7 +901,6 @@ static struct page_state {
#undef sc
#undef unevict
#undef mlock
-#undef writeback
#undef lru
#undef head
#undef slab
@@ -925,7 +952,7 @@ static int page_action(struct page_state *ps, struct page *p,
* Return: return 0 if failed to grab the refcount, otherwise true (some
* non-zero value.)
*/
-int get_hwpoison_page(struct page *page)
+static int get_hwpoison_page(struct page *page)
{
struct page *head = compound_head(page);
@@ -954,7 +981,6 @@ int get_hwpoison_page(struct page *page)
return 0;
}
-EXPORT_SYMBOL_GPL(get_hwpoison_page);
/*
* Do all that is necessary to remove user space mappings. Unmap
@@ -1104,6 +1130,25 @@ static int identify_page_state(unsigned long pfn, struct page *p,
return page_action(ps, p, pfn);
}
+static int try_to_split_thp_page(struct page *page, const char *msg)
+{
+ lock_page(page);
+ if (!PageAnon(page) || unlikely(split_huge_page(page))) {
+ unsigned long pfn = page_to_pfn(page);
+
+ unlock_page(page);
+ if (!PageAnon(page))
+ pr_info("%s: %#lx: non anonymous thp\n", msg, pfn);
+ else
+ pr_info("%s: %#lx: thp split failed\n", msg, pfn);
+ put_page(page);
+ return -EBUSY;
+ }
+ unlock_page(page);
+
+ return 0;
+}
+
static int memory_failure_hugetlb(unsigned long pfn, int flags)
{
struct page *p = pfn_to_page(pfn);
@@ -1145,7 +1190,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
num_poisoned_pages_dec();
unlock_page(head);
- put_hwpoison_page(head);
+ put_page(head);
return 0;
}
@@ -1326,23 +1371,11 @@ int memory_failure(unsigned long pfn, int flags)
}
if (PageTransHuge(hpage)) {
- lock_page(p);
- if (!PageAnon(p) || unlikely(split_huge_page(p))) {
- unlock_page(p);
- if (!PageAnon(p))
- pr_err("Memory failure: %#lx: non anonymous thp\n",
- pfn);
- else
- pr_err("Memory failure: %#lx: thp split failed\n",
- pfn);
- if (TestClearPageHWPoison(p))
- num_poisoned_pages_dec();
- put_hwpoison_page(p);
+ if (try_to_split_thp_page(p, "Memory Failure") < 0) {
+ action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
return -EBUSY;
}
- unlock_page(p);
VM_BUG_ON_PAGE(!page_count(p), p);
- hpage = compound_head(p);
}
/*
@@ -1382,10 +1415,7 @@ int memory_failure(unsigned long pfn, int flags)
* page_remove_rmap() in try_to_unmap_one(). So to determine page status
* correctly, we save a copy of the page flags at this time.
*/
- if (PageHuge(p))
- page_flags = hpage->flags;
- else
- page_flags = p->flags;
+ page_flags = p->flags;
/*
* unpoison always clear PG_hwpoison inside page lock
@@ -1394,14 +1424,14 @@ int memory_failure(unsigned long pfn, int flags)
pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
num_poisoned_pages_dec();
unlock_page(p);
- put_hwpoison_page(p);
+ put_page(p);
return 0;
}
if (hwpoison_filter(p)) {
if (TestClearPageHWPoison(p))
num_poisoned_pages_dec();
unlock_page(p);
- put_hwpoison_page(p);
+ put_page(p);
return 0;
}
@@ -1417,11 +1447,8 @@ int memory_failure(unsigned long pfn, int flags)
/*
* Now take care of user space mappings.
* Abort on fail: __delete_from_page_cache() assumes unmapped page.
- *
- * When the raw error page is thp tail page, hpage points to the raw
- * page after thp split.
*/
- if (!hwpoison_user_mappings(p, pfn, flags, &hpage)) {
+ if (!hwpoison_user_mappings(p, pfn, flags, &p)) {
action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
res = -EBUSY;
goto out;
@@ -1638,9 +1665,9 @@ int unpoison_memory(unsigned long pfn)
}
unlock_page(page);
- put_hwpoison_page(page);
+ put_page(page);
if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
- put_hwpoison_page(page);
+ put_page(page);
return 0;
}
@@ -1680,6 +1707,9 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
} else if (is_free_buddy_page(p)) {
pr_info("%s: %#lx free buddy page\n", __func__, pfn);
ret = 0;
+ } else if (page_count(p)) {
+ /* raced with allocation */
+ ret = -EBUSY;
} else {
pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
__func__, pfn, p->flags);
@@ -1696,12 +1726,15 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
{
int ret = __get_any_page(page, pfn, flags);
+ if (ret == -EBUSY)
+ ret = __get_any_page(page, pfn, flags);
+
if (ret == 1 && !PageHuge(page) &&
!PageLRU(page) && !__PageMovable(page)) {
/*
* Try to free it.
*/
- put_hwpoison_page(page);
+ put_page(page);
shake_page(page, 1);
/*
@@ -1710,7 +1743,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
ret = __get_any_page(page, pfn, 0);
if (ret == 1 && !PageLRU(page)) {
/* Drop page reference which is from __get_any_page() */
- put_hwpoison_page(page);
+ put_page(page);
pr_info("soft_offline: %#lx: unknown non LRU page type %lx (%pGp)\n",
pfn, page->flags, &page->flags);
return -EIO;
@@ -1719,69 +1752,51 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
return ret;
}
-static int soft_offline_huge_page(struct page *page, int flags)
+static bool isolate_page(struct page *page, struct list_head *pagelist)
{
- int ret;
- unsigned long pfn = page_to_pfn(page);
- struct page *hpage = compound_head(page);
- LIST_HEAD(pagelist);
+ bool isolated = false;
+ bool lru = PageLRU(page);
- /*
- * This double-check of PageHWPoison is to avoid the race with
- * memory_failure(). See also comment in __soft_offline_page().
- */
- lock_page(hpage);
- if (PageHWPoison(hpage)) {
- unlock_page(hpage);
- put_hwpoison_page(hpage);
- pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
- return -EBUSY;
+ if (PageHuge(page)) {
+ isolated = isolate_huge_page(page, pagelist);
+ } else {
+ if (lru)
+ isolated = !isolate_lru_page(page);
+ else
+ isolated = !isolate_movable_page(page, ISOLATE_UNEVICTABLE);
+
+ if (isolated)
+ list_add(&page->lru, pagelist);
}
- unlock_page(hpage);
- ret = isolate_huge_page(hpage, &pagelist);
+ if (isolated && lru)
+ inc_node_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_lru(page));
+
/*
- * get_any_page() and isolate_huge_page() takes a refcount each,
- * so need to drop one here.
+ * If we succeed in isolating the page, we grabbed another refcount on
+ * the page, so we can safely drop the one we got from get_any_page().
+ * If we failed to isolate the page, it means that we cannot go further
+ * and we will return an error, so drop the reference we got from
+ * get_any_page() as well.
*/
- put_hwpoison_page(hpage);
- if (!ret) {
- pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
- return -EBUSY;
- }
-
- ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
- MIGRATE_SYNC, MR_MEMORY_FAILURE);
- if (ret) {
- pr_info("soft offline: %#lx: hugepage migration failed %d, type %lx (%pGp)\n",
- pfn, ret, page->flags, &page->flags);
- if (!list_empty(&pagelist))
- putback_movable_pages(&pagelist);
- if (ret > 0)
- ret = -EIO;
- } else {
- /*
- * We set PG_hwpoison only when the migration source hugepage
- * was successfully dissolved, because otherwise hwpoisoned
- * hugepage remains on free hugepage list, then userspace will
- * find it as SIGBUS by allocation failure. That's not expected
- * in soft-offlining.
- */
- ret = dissolve_free_huge_page(page);
- if (!ret) {
- if (set_hwpoison_free_buddy_page(page))
- num_poisoned_pages_inc();
- else
- ret = -EBUSY;
- }
- }
- return ret;
+ put_page(page);
+ return isolated;
}
-static int __soft_offline_page(struct page *page, int flags)
+/*
+ * __soft_offline_page handles hugetlb-pages and non-hugetlb pages.
+ * If the page is a non-dirty unmapped page-cache page, it simply invalidates.
+ * If the page is mapped, it migrates the contents over.
+ */
+static int __soft_offline_page(struct page *page)
{
- int ret;
+ int ret = 0;
unsigned long pfn = page_to_pfn(page);
+ struct page *hpage = compound_head(page);
+ char const *msg_page[] = {"page", "hugepage"};
+ bool huge = PageHuge(page);
+ LIST_HEAD(pagelist);
/*
* Check PageHWPoison again inside page lock because PageHWPoison
@@ -1790,121 +1805,75 @@ static int __soft_offline_page(struct page *page, int flags)
* so there's no race between soft_offline_page() and memory_failure().
*/
lock_page(page);
- wait_on_page_writeback(page);
+ if (!PageHuge(page))
+ wait_on_page_writeback(page);
if (PageHWPoison(page)) {
unlock_page(page);
- put_hwpoison_page(page);
+ put_page(page);
pr_info("soft offline: %#lx page already poisoned\n", pfn);
- return -EBUSY;
+ return 0;
}
- /*
- * Try to invalidate first. This should work for
- * non dirty unmapped page cache pages.
- */
- ret = invalidate_inode_page(page);
+
+ if (!PageHuge(page))
+ /*
+ * Try to invalidate first. This should work for
+ * non dirty unmapped page cache pages.
+ */
+ ret = invalidate_inode_page(page);
unlock_page(page);
+
/*
* RED-PEN would be better to keep it isolated here, but we
* would need to fix isolation locking first.
*/
- if (ret == 1) {
- put_hwpoison_page(page);
+ if (ret) {
pr_info("soft_offline: %#lx: invalidated\n", pfn);
- SetPageHWPoison(page);
- num_poisoned_pages_inc();
+ page_handle_poison(page, false, true);
return 0;
}
- /*
- * Simple invalidation didn't work.
- * Try to migrate to a new page instead. migrate.c
- * handles a large number of cases for us.
- */
- if (PageLRU(page))
- ret = isolate_lru_page(page);
- else
- ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
- /*
- * Drop page reference which is came from get_any_page()
- * successful isolate_lru_page() already took another one.
- */
- put_hwpoison_page(page);
- if (!ret) {
- LIST_HEAD(pagelist);
- /*
- * After isolated lru page, the PageLRU will be cleared,
- * so use !__PageMovable instead for LRU page's mapping
- * cannot have PAGE_MAPPING_MOVABLE.
- */
- if (!__PageMovable(page))
- inc_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_lru(page));
- list_add(&page->lru, &pagelist);
+ if (isolate_page(hpage, &pagelist)) {
ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
MIGRATE_SYNC, MR_MEMORY_FAILURE);
- if (ret) {
+ if (!ret) {
+ bool release = !huge;
+
+ if (!page_handle_poison(page, huge, release))
+ ret = -EBUSY;
+ } else {
if (!list_empty(&pagelist))
putback_movable_pages(&pagelist);
- pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n",
- pfn, ret, page->flags, &page->flags);
+ pr_info("soft offline: %#lx: %s migration failed %d, type %lx (%pGp)\n",
+ pfn, msg_page[huge], ret, page->flags, &page->flags);
if (ret > 0)
ret = -EIO;
}
} else {
- pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx (%pGp)\n",
- pfn, ret, page_count(page), page->flags, &page->flags);
+ pr_info("soft offline: %#lx: %s isolation failed: %d, page count %d, type %lx (%pGp)\n",
+ pfn, msg_page[huge], ret, page_count(page), page->flags, &page->flags);
+ ret = -EBUSY;
}
return ret;
}
-static int soft_offline_in_use_page(struct page *page, int flags)
+static int soft_offline_in_use_page(struct page *page)
{
- int ret;
- int mt;
struct page *hpage = compound_head(page);
- if (!PageHuge(page) && PageTransHuge(hpage)) {
- lock_page(page);
- if (!PageAnon(page) || unlikely(split_huge_page(page))) {
- unlock_page(page);
- if (!PageAnon(page))
- pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page));
- else
- pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page));
- put_hwpoison_page(page);
+ if (!PageHuge(page) && PageTransHuge(hpage))
+ if (try_to_split_thp_page(page, "soft offline") < 0)
return -EBUSY;
- }
- unlock_page(page);
- }
-
- /*
- * Setting MIGRATE_ISOLATE here ensures that the page will be linked
- * to free list immediately (not via pcplist) when released after
- * successful page migration. Otherwise we can't guarantee that the
- * page is really free after put_page() returns, so
- * set_hwpoison_free_buddy_page() highly likely fails.
- */
- mt = get_pageblock_migratetype(page);
- set_pageblock_migratetype(page, MIGRATE_ISOLATE);
- if (PageHuge(page))
- ret = soft_offline_huge_page(page, flags);
- else
- ret = __soft_offline_page(page, flags);
- set_pageblock_migratetype(page, mt);
- return ret;
+ return __soft_offline_page(page);
}
static int soft_offline_free_page(struct page *page)
{
- int rc = dissolve_free_huge_page(page);
+ int rc = 0;
+
+ if (!page_handle_poison(page, true, false))
+ rc = -EBUSY;
- if (!rc) {
- if (set_hwpoison_free_buddy_page(page))
- num_poisoned_pages_inc();
- else
- rc = -EBUSY;
- }
return rc;
}
@@ -1934,6 +1903,7 @@ int soft_offline_page(unsigned long pfn, int flags)
{
int ret;
struct page *page;
+ bool try_again = true;
if (!pfn_valid(pfn))
return -ENXIO;
@@ -1945,18 +1915,22 @@ int soft_offline_page(unsigned long pfn, int flags)
if (PageHWPoison(page)) {
pr_info("soft offline: %#lx page already poisoned\n", pfn);
if (flags & MF_COUNT_INCREASED)
- put_hwpoison_page(page);
- return -EBUSY;
+ put_page(page);
+ return 0;
}
+retry:
get_online_mems();
ret = get_any_page(page, pfn, flags);
put_online_mems();
if (ret > 0)
- ret = soft_offline_in_use_page(page, flags);
+ ret = soft_offline_in_use_page(page);
else if (ret == 0)
- ret = soft_offline_free_page(page);
+ if (soft_offline_free_page(page) && try_again) {
+ try_again = false;
+ goto retry;
+ }
return ret;
}
diff --git a/mm/memory.c b/mm/memory.c
index e3ce0ed49a7d..3f6c62b4569b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -793,15 +793,15 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* lock.
*/
static inline int
-copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- pte_t *dst_pte, pte_t *src_pte,
- struct vm_area_struct *vma, struct vm_area_struct *new,
- unsigned long addr, int *rss, struct page **prealloc,
- pte_t pte, struct page *page)
+copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+ pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
+ struct page **prealloc, pte_t pte, struct page *page)
{
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
+ struct mm_struct *src_mm = src_vma->vm_mm;
struct page *new_page;
- if (!is_cow_mapping(vma->vm_flags))
+ if (!is_cow_mapping(src_vma->vm_flags))
return 1;
/*
@@ -864,15 +864,15 @@ copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* over and copy the page & arm it.
*/
*prealloc = NULL;
- copy_user_highpage(new_page, page, addr, vma);
+ copy_user_highpage(new_page, page, addr, src_vma);
__SetPageUptodate(new_page);
- page_add_new_anon_rmap(new_page, new, addr, false);
- lru_cache_add_inactive_or_unevictable(new_page, new);
+ page_add_new_anon_rmap(new_page, dst_vma, addr, false);
+ lru_cache_add_inactive_or_unevictable(new_page, dst_vma);
rss[mm_counter(new_page)]++;
/* All done, just insert the new page copy in the child */
- pte = mk_pte(new_page, new->vm_page_prot);
- pte = maybe_mkwrite(pte_mkdirty(pte), new);
+ pte = mk_pte(new_page, dst_vma->vm_page_prot);
+ pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
set_pte_at(dst_mm, addr, dst_pte, pte);
return 0;
}
@@ -882,24 +882,22 @@ copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* is required to copy this pte.
*/
static inline int
-copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
- struct vm_area_struct *new,
- unsigned long addr, int *rss, struct page **prealloc)
+copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+ pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
+ struct page **prealloc)
{
- unsigned long vm_flags = vma->vm_flags;
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
+ struct mm_struct *src_mm = src_vma->vm_mm;
+ unsigned long vm_flags = src_vma->vm_flags;
pte_t pte = *src_pte;
struct page *page;
- page = vm_normal_page(vma, addr, pte);
+ page = vm_normal_page(src_vma, addr, pte);
if (page) {
int retval;
- retval = copy_present_page(dst_mm, src_mm,
- dst_pte, src_pte,
- vma, new,
- addr, rss, prealloc,
- pte, page);
+ retval = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
+ addr, rss, prealloc, pte, page);
if (retval <= 0)
return retval;
@@ -956,11 +954,13 @@ page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
return new_page;
}
-static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
- struct vm_area_struct *new,
- unsigned long addr, unsigned long end)
+static int
+copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+ pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
+ unsigned long end)
{
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
+ struct mm_struct *src_mm = src_vma->vm_mm;
pte_t *orig_src_pte, *orig_dst_pte;
pte_t *src_pte, *dst_pte;
spinlock_t *src_ptl, *dst_ptl;
@@ -1003,15 +1003,15 @@ again:
if (unlikely(!pte_present(*src_pte))) {
entry.val = copy_nonpresent_pte(dst_mm, src_mm,
dst_pte, src_pte,
- vma, addr, rss);
+ src_vma, addr, rss);
if (entry.val)
break;
progress += 8;
continue;
}
/* copy_present_pte() will clear `*prealloc' if consumed */
- ret = copy_present_pte(dst_mm, src_mm, dst_pte, src_pte,
- vma, new, addr, rss, &prealloc);
+ ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
+ addr, rss, &prealloc);
/*
* If we need a pre-allocated page for this pte, drop the
* locks, allocate, and try again.
@@ -1046,7 +1046,7 @@ again:
entry.val = 0;
} else if (ret) {
WARN_ON_ONCE(ret != -EAGAIN);
- prealloc = page_copy_prealloc(src_mm, vma, addr);
+ prealloc = page_copy_prealloc(src_mm, src_vma, addr);
if (!prealloc)
return -ENOMEM;
/* We've captured and resolved the error. Reset, try again. */
@@ -1060,11 +1060,13 @@ out:
return ret;
}
-static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
- struct vm_area_struct *new,
- unsigned long addr, unsigned long end)
+static inline int
+copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+ pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
+ unsigned long end)
{
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
+ struct mm_struct *src_mm = src_vma->vm_mm;
pmd_t *src_pmd, *dst_pmd;
unsigned long next;
@@ -1077,9 +1079,9 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
|| pmd_devmap(*src_pmd)) {
int err;
- VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma);
+ VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
err = copy_huge_pmd(dst_mm, src_mm,
- dst_pmd, src_pmd, addr, vma);
+ dst_pmd, src_pmd, addr, src_vma);
if (err == -ENOMEM)
return -ENOMEM;
if (!err)
@@ -1088,18 +1090,20 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
}
if (pmd_none_or_clear_bad(src_pmd))
continue;
- if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
- vma, new, addr, next))
+ if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd,
+ addr, next))
return -ENOMEM;
} while (dst_pmd++, src_pmd++, addr = next, addr != end);
return 0;
}
-static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- p4d_t *dst_p4d, p4d_t *src_p4d, struct vm_area_struct *vma,
- struct vm_area_struct *new,
- unsigned long addr, unsigned long end)
+static inline int
+copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+ p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr,
+ unsigned long end)
{
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
+ struct mm_struct *src_mm = src_vma->vm_mm;
pud_t *src_pud, *dst_pud;
unsigned long next;
@@ -1112,9 +1116,9 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src
if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
int err;
- VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma);
+ VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma);
err = copy_huge_pud(dst_mm, src_mm,
- dst_pud, src_pud, addr, vma);
+ dst_pud, src_pud, addr, src_vma);
if (err == -ENOMEM)
return -ENOMEM;
if (!err)
@@ -1123,18 +1127,19 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src
}
if (pud_none_or_clear_bad(src_pud))
continue;
- if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
- vma, new, addr, next))
+ if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud,
+ addr, next))
return -ENOMEM;
} while (dst_pud++, src_pud++, addr = next, addr != end);
return 0;
}
-static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
- struct vm_area_struct *new,
- unsigned long addr, unsigned long end)
+static inline int
+copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+ pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr,
+ unsigned long end)
{
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
p4d_t *src_p4d, *dst_p4d;
unsigned long next;
@@ -1146,20 +1151,22 @@ static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src
next = p4d_addr_end(addr, end);
if (p4d_none_or_clear_bad(src_p4d))
continue;
- if (copy_pud_range(dst_mm, src_mm, dst_p4d, src_p4d,
- vma, new, addr, next))
+ if (copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d,
+ addr, next))
return -ENOMEM;
} while (dst_p4d++, src_p4d++, addr = next, addr != end);
return 0;
}
-int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- struct vm_area_struct *vma, struct vm_area_struct *new)
+int
+copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
pgd_t *src_pgd, *dst_pgd;
unsigned long next;
- unsigned long addr = vma->vm_start;
- unsigned long end = vma->vm_end;
+ unsigned long addr = src_vma->vm_start;
+ unsigned long end = src_vma->vm_end;
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
+ struct mm_struct *src_mm = src_vma->vm_mm;
struct mmu_notifier_range range;
bool is_cow;
int ret;
@@ -1170,19 +1177,19 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* readonly mappings. The tradeoff is that copy_page_range is more
* efficient than faulting.
*/
- if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
- !vma->anon_vma)
+ if (!(src_vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
+ !src_vma->anon_vma)
return 0;
- if (is_vm_hugetlb_page(vma))
- return copy_hugetlb_page_range(dst_mm, src_mm, vma);
+ if (is_vm_hugetlb_page(src_vma))
+ return copy_hugetlb_page_range(dst_mm, src_mm, src_vma);
- if (unlikely(vma->vm_flags & VM_PFNMAP)) {
+ if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
/*
* We do not free on error cases below as remove_vma
* gets called on error from higher level routine
*/
- ret = track_pfn_copy(vma);
+ ret = track_pfn_copy(src_vma);
if (ret)
return ret;
}
@@ -1193,11 +1200,11 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* parent mm. And a permission downgrade will only happen if
* is_cow_mapping() returns true.
*/
- is_cow = is_cow_mapping(vma->vm_flags);
+ is_cow = is_cow_mapping(src_vma->vm_flags);
if (is_cow) {
mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
- 0, vma, src_mm, addr, end);
+ 0, src_vma, src_mm, addr, end);
mmu_notifier_invalidate_range_start(&range);
}
@@ -1208,8 +1215,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(src_pgd))
continue;
- if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd,
- vma, new, addr, next))) {
+ if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
+ addr, next))) {
ret = -ENOMEM;
break;
}
@@ -3621,7 +3628,7 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
* unlock_page(A)
* lock_page(B)
* lock_page(B)
- * pte_alloc_pne
+ * pte_alloc_one
* shrink_page_list
* wait_on_page_writeback(A)
* SetPageWriteback(B)
@@ -3629,7 +3636,7 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
* # flush A, B to clear the writeback
*/
if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
- vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
+ vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
if (!vmf->prealloc_pte)
return VM_FAULT_OOM;
smp_wmb(); /* See comment in __pte_alloc() */
@@ -3737,13 +3744,14 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
pmd_t entry;
int i;
- vm_fault_t ret;
+ vm_fault_t ret = VM_FAULT_FALLBACK;
if (!transhuge_vma_suitable(vma, haddr))
- return VM_FAULT_FALLBACK;
+ return ret;
- ret = VM_FAULT_FALLBACK;
page = compound_head(page);
+ if (compound_order(page) != HPAGE_PMD_ORDER)
+ return ret;
/*
* Archs like ppc64 need additonal space to store information
@@ -3796,7 +3804,7 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
/**
* alloc_set_pte - setup new PTE entry for given page and add reverse page
- * mapping. If needed, the fucntion allocates page table or use pre-allocated.
+ * mapping. If needed, the function allocates a page table or uses a pre-allocated one.
*
* @vmf: fault environment
* @page: page to map
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ce3e73e3a5c1..d397af38f9ce 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -105,7 +105,7 @@ static struct resource *register_memory_resource(u64 start, u64 size,
unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
if (strcmp(resource_name, "System RAM"))
- flags |= IORESOURCE_MEM_DRIVER_MANAGED;
+ flags |= IORESOURCE_SYSRAM_DRIVER_MANAGED;
/*
* Make sure value parsed from 'mem=' only restricts memory adding
@@ -353,11 +353,19 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
#ifdef CONFIG_NUMA
int __weak memory_add_physaddr_to_nid(u64 start)
{
- pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n",
+ pr_info_once("Unknown online node for memory at 0x%llx, assuming node 0\n",
start);
return 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
+
+int __weak phys_to_target_node(u64 start)
+{
+ pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n",
+ start);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(phys_to_target_node);
#endif
/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
@@ -617,31 +625,22 @@ void generic_online_page(struct page *page, unsigned int order)
}
EXPORT_SYMBOL_GPL(generic_online_page);
-static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
- void *arg)
+static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
{
const unsigned long end_pfn = start_pfn + nr_pages;
unsigned long pfn;
- int order;
/*
- * Online the pages. The callback might decide to keep some pages
- * PG_reserved (to add them to the buddy later), but we still account
- * them as being online/belonging to this zone ("present").
+ * Online the pages in MAX_ORDER - 1 aligned chunks. The callback might
+ * decide to not expose all pages to the buddy (e.g., expose them
+ * later). We account all pages as being online and belonging to this
+ * zone ("present").
*/
- for (pfn = start_pfn; pfn < end_pfn; pfn += 1ul << order) {
- order = min(MAX_ORDER - 1, get_order(PFN_PHYS(end_pfn - pfn)));
- /* __free_pages_core() wants pfns to be aligned to the order */
- if (WARN_ON_ONCE(!IS_ALIGNED(pfn, 1ul << order)))
- order = 0;
- (*online_page_callback)(pfn_to_page(pfn), order);
- }
+ for (pfn = start_pfn; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES)
+ (*online_page_callback)(pfn_to_page(pfn), MAX_ORDER - 1);
/* mark all involved sections as online */
online_mem_sections(start_pfn, end_pfn);
-
- *(unsigned long *)arg += nr_pages;
- return 0;
}
/* check which state of node_states will be changed when online memory */
@@ -702,9 +701,14 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon
* Associate the pfn range with the given zone, initializing the memmaps
* and resizing the pgdat/zone data to span the added pages. After this
* call, all affected pages are PG_reserved.
+ *
+ * All aligned pageblocks are initialized to the specified migratetype
+ * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
+ * zone stats (e.g., nr_isolate_pageblock) are touched.
*/
void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
- unsigned long nr_pages, struct vmem_altmap *altmap)
+ unsigned long nr_pages,
+ struct vmem_altmap *altmap, int migratetype)
{
struct pglist_data *pgdat = zone->zone_pgdat;
int nid = pgdat->node_id;
@@ -729,7 +733,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
* are reserved so nobody should be touching them so we should be safe
*/
memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn,
- MEMINIT_HOTPLUG, altmap);
+ MEMINIT_HOTPLUG, altmap, migratetype);
set_zone_contiguous(zone);
}
@@ -795,17 +799,21 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
int online_type, int nid)
{
unsigned long flags;
- unsigned long onlined_pages = 0;
struct zone *zone;
int need_zonelists_rebuild = 0;
int ret;
struct memory_notify arg;
+ /* We can only online full sections (e.g., SECTION_IS_ONLINE) */
+ if (WARN_ON_ONCE(!nr_pages ||
+ !IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION)))
+ return -EINVAL;
+
mem_hotplug_begin();
/* associate pfn range with the zone */
zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages);
- move_pfn_range_to_zone(zone, pfn, nr_pages, NULL);
+ move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);
arg.start_pfn = pfn;
arg.nr_pages = nr_pages;
@@ -817,6 +825,14 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
goto failed_addition;
/*
+ * Fixup the number of isolated pageblocks before marking the sections
+ * online, such that undo_isolate_page_range() works correctly.
+ */
+ spin_lock_irqsave(&zone->lock, flags);
+ zone->nr_isolate_pageblock += nr_pages / pageblock_nr_pages;
+ spin_unlock_irqrestore(&zone->lock, flags);
+
+ /*
* If this zone is not populated, then it is not in zonelist.
* This means the page allocator ignores this zone.
* So, zonelist must be updated after online.
@@ -826,36 +842,32 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
setup_zone_pageset(zone);
}
- ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
- online_pages_range);
- if (ret) {
- /* not a single memory resource was applicable */
- if (need_zonelists_rebuild)
- zone_pcp_reset(zone);
- goto failed_addition;
- }
-
- zone->present_pages += onlined_pages;
+ online_pages_range(pfn, nr_pages);
+ zone->present_pages += nr_pages;
pgdat_resize_lock(zone->zone_pgdat, &flags);
- zone->zone_pgdat->node_present_pages += onlined_pages;
+ zone->zone_pgdat->node_present_pages += nr_pages;
pgdat_resize_unlock(zone->zone_pgdat, &flags);
+ node_states_set_node(nid, &arg);
+ if (need_zonelists_rebuild)
+ build_all_zonelists(NULL);
+ zone_pcp_update(zone);
+
+ /* Basic onlining is complete, allow allocation of onlined pages. */
+ undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE);
+
/*
* When exposing larger, physically contiguous memory areas to the
* buddy, shuffling in the buddy (when freeing onlined pages, putting
* them either to the head or the tail of the freelist) is only helpful
* for maintaining the shuffle, but not for creating the initial
* shuffle. Shuffle the whole zone to make sure the just onlined pages
- * are properly distributed across the whole freelist.
+ * are properly distributed across the whole freelist. Make sure to
+ * shuffle once pageblocks are no longer isolated.
*/
shuffle_zone(zone);
- node_states_set_node(nid, &arg);
- if (need_zonelists_rebuild)
- build_all_zonelists(NULL);
- zone_pcp_update(zone);
-
init_per_zone_wmark_min();
kswapd_run(nid);
@@ -1027,7 +1039,7 @@ static int online_memory_block(struct memory_block *mem, void *arg)
*
* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
*/
-int __ref add_memory_resource(int nid, struct resource *res)
+int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
{
struct mhp_params params = { .pgprot = PAGE_KERNEL };
u64 start, size;
@@ -1080,9 +1092,8 @@ int __ref add_memory_resource(int nid, struct resource *res)
}
/* link memory sections under this node.*/
- ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1),
- MEMINIT_HOTPLUG);
- BUG_ON(ret);
+ link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1),
+ MEMINIT_HOTPLUG);
/* create new memmap entry */
if (!strcmp(res->name, "System RAM"))
@@ -1091,6 +1102,13 @@ int __ref add_memory_resource(int nid, struct resource *res)
/* device_online() will take the lock when calling online_pages() */
mem_hotplug_done();
+ /*
+ * In case we're allowed to merge the resource, flag it and trigger
+ * merging now that adding succeeded.
+ */
+ if (mhp_flags & MEMHP_MERGE_RESOURCE)
+ merge_system_ram_resource(res);
+
/* online pages if requested */
if (memhp_default_online_type != MMOP_OFFLINE)
walk_memory_blocks(start, size, NULL, online_memory_block);
@@ -1107,7 +1125,7 @@ error:
}
/* requires device_hotplug_lock, see add_memory_resource() */
-int __ref __add_memory(int nid, u64 start, u64 size)
+int __ref __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
{
struct resource *res;
int ret;
@@ -1116,18 +1134,18 @@ int __ref __add_memory(int nid, u64 start, u64 size)
if (IS_ERR(res))
return PTR_ERR(res);
- ret = add_memory_resource(nid, res);
+ ret = add_memory_resource(nid, res, mhp_flags);
if (ret < 0)
release_memory_resource(res);
return ret;
}
-int add_memory(int nid, u64 start, u64 size)
+int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
{
int rc;
lock_device_hotplug();
- rc = __add_memory(nid, start, size);
+ rc = __add_memory(nid, start, size, mhp_flags);
unlock_device_hotplug();
return rc;
@@ -1149,14 +1167,14 @@ EXPORT_SYMBOL_GPL(add_memory);
*
* For this memory, no entries in /sys/firmware/memmap ("raw firmware-provided
* memory map") are created. Also, the created memory resource is flagged
- * with IORESOURCE_MEM_DRIVER_MANAGED, so in-kernel users can special-case
+ * with IORESOURCE_SYSRAM_DRIVER_MANAGED, so in-kernel users can special-case
* this memory as well (esp., not place kexec images onto it).
*
* The resource_name (visible via /proc/iomem) has to have the format
* "System RAM ($DRIVER)".
*/
int add_memory_driver_managed(int nid, u64 start, u64 size,
- const char *resource_name)
+ const char *resource_name, mhp_t mhp_flags)
{
struct resource *res;
int rc;
@@ -1174,7 +1192,7 @@ int add_memory_driver_managed(int nid, u64 start, u64 size,
goto out_unlock;
}
- rc = add_memory_resource(nid, res);
+ rc = add_memory_resource(nid, res, mhp_flags);
if (rc < 0)
release_memory_resource(res);
@@ -1371,28 +1389,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
return ret;
}
-/* Mark all sections offline and remove all free pages from the buddy. */
-static int
-offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
- void *data)
-{
- unsigned long *offlined_pages = (unsigned long *)data;
-
- *offlined_pages += __offline_isolated_pages(start, start + nr_pages);
- return 0;
-}
-
-/*
- * Check all pages in range, recorded as memory resource, are isolated.
- */
-static int
-check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
- void *data)
-{
- return test_pages_isolated(start_pfn, start_pfn + nr_pages,
- MEMORY_OFFLINE);
-}
-
static int __init cmdline_parse_movable_node(char *p)
{
movable_node_enabled = true;
@@ -1476,17 +1472,21 @@ static int count_system_ram_pages_cb(unsigned long start_pfn,
return 0;
}
-static int __ref __offline_pages(unsigned long start_pfn,
- unsigned long end_pfn)
+int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
- unsigned long pfn, nr_pages = 0;
- unsigned long offlined_pages = 0;
- int ret, node, nr_isolate_pageblock;
+ const unsigned long end_pfn = start_pfn + nr_pages;
+ unsigned long pfn, system_ram_pages = 0;
unsigned long flags;
struct zone *zone;
struct memory_notify arg;
+ int ret, node;
char *reason;
+ /* We can only offline full sections (e.g., SECTION_IS_ONLINE) */
+ if (WARN_ON_ONCE(!nr_pages ||
+ !IS_ALIGNED(start_pfn | nr_pages, PAGES_PER_SECTION)))
+ return -EINVAL;
+
mem_hotplug_begin();
/*
@@ -1497,9 +1497,9 @@ static int __ref __offline_pages(unsigned long start_pfn,
* memory holes PG_reserved, don't need pfn_valid() checks, and can
* avoid using walk_system_ram_range() later.
*/
- walk_system_ram_range(start_pfn, end_pfn - start_pfn, &nr_pages,
+ walk_system_ram_range(start_pfn, nr_pages, &system_ram_pages,
count_system_ram_pages_cb);
- if (nr_pages != end_pfn - start_pfn) {
+ if (system_ram_pages != nr_pages) {
ret = -EINVAL;
reason = "memory holes";
goto failed_removal;
@@ -1519,11 +1519,10 @@ static int __ref __offline_pages(unsigned long start_pfn,
ret = start_isolate_page_range(start_pfn, end_pfn,
MIGRATE_MOVABLE,
MEMORY_OFFLINE | REPORT_FAILURE);
- if (ret < 0) {
+ if (ret) {
reason = "failure to isolate range";
goto failed_removal;
}
- nr_isolate_pageblock = ret;
arg.start_pfn = start_pfn;
arg.nr_pages = nr_pages;
@@ -1573,9 +1572,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
reason = "failure to dissolve huge pages";
goto failed_removal_isolated;
}
- /* check again */
- ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
- NULL, check_pages_isolated_cb);
+
/*
* per-cpu pages are drained in start_isolate_page_range, but if
* there are still pages that are not free, make sure that we
@@ -1588,30 +1585,30 @@ static int __ref __offline_pages(unsigned long start_pfn,
* because has_unmovable_pages explicitly checks for
* PageBuddy on freed pages on other zones.
*/
+ ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE);
if (ret)
drain_all_pages(zone);
} while (ret);
- /* Ok, all of our target is isolated.
- We cannot do rollback at this point. */
- walk_system_ram_range(start_pfn, end_pfn - start_pfn,
- &offlined_pages, offline_isolated_pages_cb);
- pr_info("Offlined Pages %ld\n", offlined_pages);
+ /* Mark all sections offline and remove free pages from the buddy. */
+ __offline_isolated_pages(start_pfn, end_pfn);
+ pr_info("Offlined Pages %ld\n", nr_pages);
+
/*
- * Onlining will reset pagetype flags and makes migrate type
- * MOVABLE, so just need to decrease the number of isolated
- * pageblocks zone counter here.
+ * The memory sections are marked offline, and the pageblock flags
+ * effectively stale; nobody should be touching them. Fixup the number
+ * of isolated pageblocks, memory onlining will properly revert this.
*/
spin_lock_irqsave(&zone->lock, flags);
- zone->nr_isolate_pageblock -= nr_isolate_pageblock;
+ zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages;
spin_unlock_irqrestore(&zone->lock, flags);
/* removal success */
- adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages);
- zone->present_pages -= offlined_pages;
+ adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
+ zone->present_pages -= nr_pages;
pgdat_resize_lock(zone->zone_pgdat, &flags);
- zone->zone_pgdat->node_present_pages -= offlined_pages;
+ zone->zone_pgdat->node_present_pages -= nr_pages;
pgdat_resize_unlock(zone->zone_pgdat, &flags);
init_per_zone_wmark_min();
@@ -1648,11 +1645,6 @@ failed_removal:
return ret;
}
-int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
-{
- return __offline_pages(start_pfn, start_pfn + nr_pages);
-}
-
static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
{
int ret = !is_memblock_offlined(mem);
@@ -1741,26 +1733,6 @@ void try_offline_node(int nid)
}
EXPORT_SYMBOL(try_offline_node);
-static void __release_memory_resource(resource_size_t start,
- resource_size_t size)
-{
- int ret;
-
- /*
- * When removing memory in the same granularity as it was added,
- * this function never fails. It might only fail if resources
- * have to be adjusted or split. We'll ignore the error, as
- * removing of memory cannot fail.
- */
- ret = release_mem_region_adjustable(&iomem_resource, start, size);
- if (ret) {
- resource_size_t endres = start + size - 1;
-
- pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
- &start, &endres, ret);
- }
-}
-
static int __ref try_remove_memory(int nid, u64 start, u64 size)
{
int rc = 0;
@@ -1794,7 +1766,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
memblock_remove(start, size);
}
- __release_memory_resource(start, size);
+ release_mem_region_adjustable(start, size);
try_offline_node(nid);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index eddbe4e56c73..3fde772ef5ef 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -875,13 +875,12 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
goto out;
}
- task_lock(current);
ret = mpol_set_nodemask(new, nodes, scratch);
if (ret) {
- task_unlock(current);
mpol_put(new);
goto out;
}
+ task_lock(current);
old = current->mempolicy;
current->mempolicy = new;
if (new && new->mode == MPOL_INTERLEAVE)
@@ -1324,9 +1323,7 @@ static long do_mbind(unsigned long start, unsigned long len,
NODEMASK_SCRATCH(scratch);
if (scratch) {
mmap_write_lock(mm);
- task_lock(current);
err = mpol_set_nodemask(new, nmask, scratch);
- task_unlock(current);
if (err)
mmap_write_unlock(mm);
} else
@@ -1885,8 +1882,7 @@ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
}
/* Return the node id preferred by the given mempolicy, or the given id */
-static int policy_node(gfp_t gfp, struct mempolicy *policy,
- int nd)
+static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
{
if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
nd = policy->v.preferred_node;
diff --git a/mm/mempool.c b/mm/mempool.c
index 79bff63ecf27..f473cdddaff0 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -58,11 +58,10 @@ static void __check_element(mempool_t *pool, void *element, size_t size)
static void check_element(mempool_t *pool, void *element)
{
/* Mempools backed by slab allocator */
- if (pool->free == mempool_free_slab || pool->free == mempool_kfree)
+ if (pool->free == mempool_free_slab || pool->free == mempool_kfree) {
__check_element(pool, element, ksize(element));
-
- /* Mempools backed by page allocator */
- if (pool->free == mempool_free_pages) {
+ } else if (pool->free == mempool_free_pages) {
+ /* Mempools backed by page allocator */
int order = (int)(long)pool->pool_data;
void *addr = kmap_atomic((struct page *)element);
@@ -82,11 +81,10 @@ static void __poison_element(void *element, size_t size)
static void poison_element(mempool_t *pool, void *element)
{
/* Mempools backed by slab allocator */
- if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
+ if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) {
__poison_element(element, ksize(element));
-
- /* Mempools backed by page allocator */
- if (pool->alloc == mempool_alloc_pages) {
+ } else if (pool->alloc == mempool_alloc_pages) {
+ /* Mempools backed by page allocator */
int order = (int)(long)pool->pool_data;
void *addr = kmap_atomic((struct page *)element);
@@ -107,7 +105,7 @@ static __always_inline void kasan_poison_element(mempool_t *pool, void *element)
{
if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
kasan_poison_kfree(element, _RET_IP_);
- if (pool->alloc == mempool_alloc_pages)
+ else if (pool->alloc == mempool_alloc_pages)
kasan_free_pages(element, (unsigned long)pool->pool_data);
}
@@ -115,7 +113,7 @@ static void kasan_unpoison_element(mempool_t *pool, void *element)
{
if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
kasan_unpoison_slab(element);
- if (pool->alloc == mempool_alloc_pages)
+ else if (pool->alloc == mempool_alloc_pages)
kasan_alloc_pages(element, (unsigned long)pool->pool_data);
}
diff --git a/mm/memremap.c b/mm/memremap.c
index 006dace60b1a..2bb276680837 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -40,12 +40,10 @@ EXPORT_SYMBOL_GPL(memremap_compat_align);
#ifdef CONFIG_DEV_PAGEMAP_OPS
DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
EXPORT_SYMBOL(devmap_managed_key);
-static atomic_t devmap_managed_enable;
static void devmap_managed_enable_put(void)
{
- if (atomic_dec_and_test(&devmap_managed_enable))
- static_branch_disable(&devmap_managed_key);
+ static_branch_dec(&devmap_managed_key);
}
static int devmap_managed_enable_get(struct dev_pagemap *pgmap)
@@ -56,8 +54,7 @@ static int devmap_managed_enable_get(struct dev_pagemap *pgmap)
return -EINVAL;
}
- if (atomic_inc_return(&devmap_managed_enable) == 1)
- static_branch_enable(&devmap_managed_key);
+ static_branch_inc(&devmap_managed_key);
return 0;
}
#else
@@ -70,24 +67,28 @@ static void devmap_managed_enable_put(void)
}
#endif /* CONFIG_DEV_PAGEMAP_OPS */
-static void pgmap_array_delete(struct resource *res)
+static void pgmap_array_delete(struct range *range)
{
- xa_store_range(&pgmap_array, PHYS_PFN(res->start), PHYS_PFN(res->end),
+ xa_store_range(&pgmap_array, PHYS_PFN(range->start), PHYS_PFN(range->end),
NULL, GFP_KERNEL);
synchronize_rcu();
}
-static unsigned long pfn_first(struct dev_pagemap *pgmap)
+static unsigned long pfn_first(struct dev_pagemap *pgmap, int range_id)
{
- return PHYS_PFN(pgmap->res.start) +
- vmem_altmap_offset(pgmap_altmap(pgmap));
+ struct range *range = &pgmap->ranges[range_id];
+ unsigned long pfn = PHYS_PFN(range->start);
+
+ if (range_id)
+ return pfn;
+ return pfn + vmem_altmap_offset(pgmap_altmap(pgmap));
}
-static unsigned long pfn_end(struct dev_pagemap *pgmap)
+static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id)
{
- const struct resource *res = &pgmap->res;
+ const struct range *range = &pgmap->ranges[range_id];
- return (res->start + resource_size(res)) >> PAGE_SHIFT;
+ return (range->start + range_len(range)) >> PAGE_SHIFT;
}
static unsigned long pfn_next(unsigned long pfn)
@@ -97,8 +98,28 @@ static unsigned long pfn_next(unsigned long pfn)
return pfn + 1;
}
-#define for_each_device_pfn(pfn, map) \
- for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn))
+/*
+ * This returns true if the page is reserved by ZONE_DEVICE driver.
+ */
+bool pfn_zone_device_reserved(unsigned long pfn)
+{
+ struct dev_pagemap *pgmap;
+ struct vmem_altmap *altmap;
+ bool ret = false;
+
+ pgmap = get_dev_pagemap(pfn, NULL);
+ if (!pgmap)
+ return ret;
+ altmap = pgmap_altmap(pgmap);
+ if (altmap && pfn < (altmap->base_pfn + altmap->reserve))
+ ret = true;
+ put_dev_pagemap(pgmap);
+
+ return ret;
+}
+
+#define for_each_device_pfn(pfn, map, i) \
+ for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(pfn))
static void dev_pagemap_kill(struct dev_pagemap *pgmap)
{
@@ -124,39 +145,49 @@ static void dev_pagemap_cleanup(struct dev_pagemap *pgmap)
pgmap->ref = NULL;
}
-void memunmap_pages(struct dev_pagemap *pgmap)
+static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)
{
- struct resource *res = &pgmap->res;
+ struct range *range = &pgmap->ranges[range_id];
struct page *first_page;
- unsigned long pfn;
int nid;
- dev_pagemap_kill(pgmap);
- for_each_device_pfn(pfn, pgmap)
- put_page(pfn_to_page(pfn));
- dev_pagemap_cleanup(pgmap);
-
/* make sure to access a memmap that was actually initialized */
- first_page = pfn_to_page(pfn_first(pgmap));
+ first_page = pfn_to_page(pfn_first(pgmap, range_id));
/* pages are dead and unused, undo the arch mapping */
nid = page_to_nid(first_page);
mem_hotplug_begin();
- remove_pfn_range_from_zone(page_zone(first_page), PHYS_PFN(res->start),
- PHYS_PFN(resource_size(res)));
+ remove_pfn_range_from_zone(page_zone(first_page), PHYS_PFN(range->start),
+ PHYS_PFN(range_len(range)));
if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
- __remove_pages(PHYS_PFN(res->start),
- PHYS_PFN(resource_size(res)), NULL);
+ __remove_pages(PHYS_PFN(range->start),
+ PHYS_PFN(range_len(range)), NULL);
} else {
- arch_remove_memory(nid, res->start, resource_size(res),
+ arch_remove_memory(nid, range->start, range_len(range),
pgmap_altmap(pgmap));
- kasan_remove_zero_shadow(__va(res->start), resource_size(res));
+ kasan_remove_zero_shadow(__va(range->start), range_len(range));
}
mem_hotplug_done();
- untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res));
- pgmap_array_delete(res);
+ untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range));
+ pgmap_array_delete(range);
+}
+
+void memunmap_pages(struct dev_pagemap *pgmap)
+{
+ unsigned long pfn;
+ int i;
+
+ dev_pagemap_kill(pgmap);
+ for (i = 0; i < pgmap->nr_range; i++)
+ for_each_device_pfn(pfn, pgmap, i)
+ put_page(pfn_to_page(pfn));
+ dev_pagemap_cleanup(pgmap);
+
+ for (i = 0; i < pgmap->nr_range; i++)
+ pageunmap_range(pgmap, i);
+
WARN_ONCE(pgmap->altmap.alloc, "failed to free all reserved pages\n");
devmap_managed_enable_put();
}
@@ -175,6 +206,115 @@ static void dev_pagemap_percpu_release(struct percpu_ref *ref)
complete(&pgmap->done);
}
+static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
+ int range_id, int nid)
+{
+ struct range *range = &pgmap->ranges[range_id];
+ struct dev_pagemap *conflict_pgmap;
+ int error, is_ram;
+
+ if (WARN_ONCE(pgmap_altmap(pgmap) && range_id > 0,
+ "altmap not supported for multiple ranges\n"))
+ return -EINVAL;
+
+ conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->start), NULL);
+ if (conflict_pgmap) {
+ WARN(1, "Conflicting mapping in same section\n");
+ put_dev_pagemap(conflict_pgmap);
+ return -ENOMEM;
+ }
+
+ conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->end), NULL);
+ if (conflict_pgmap) {
+ WARN(1, "Conflicting mapping in same section\n");
+ put_dev_pagemap(conflict_pgmap);
+ return -ENOMEM;
+ }
+
+ is_ram = region_intersects(range->start, range_len(range),
+ IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
+
+ if (is_ram != REGION_DISJOINT) {
+ WARN_ONCE(1, "attempted on %s region %#llx-%#llx\n",
+ is_ram == REGION_MIXED ? "mixed" : "ram",
+ range->start, range->end);
+ return -ENXIO;
+ }
+
+ error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(range->start),
+ PHYS_PFN(range->end), pgmap, GFP_KERNEL));
+ if (error)
+ return error;
+
+ if (nid < 0)
+ nid = numa_mem_id();
+
+ error = track_pfn_remap(NULL, &params->pgprot, PHYS_PFN(range->start), 0,
+ range_len(range));
+ if (error)
+ goto err_pfn_remap;
+
+ mem_hotplug_begin();
+
+ /*
+ * For device private memory we call add_pages() as we only need to
+ * allocate and initialize struct page for the device memory. More-
+ * over the device memory is un-accessible thus we do not want to
+ * create a linear mapping for the memory like arch_add_memory()
+ * would do.
+ *
+ * For all other device memory types, which are accessible by
+ * the CPU, we do want the linear mapping and thus use
+ * arch_add_memory().
+ */
+ if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+ error = add_pages(nid, PHYS_PFN(range->start),
+ PHYS_PFN(range_len(range)), params);
+ } else {
+ error = kasan_add_zero_shadow(__va(range->start), range_len(range));
+ if (error) {
+ mem_hotplug_done();
+ goto err_kasan;
+ }
+
+ error = arch_add_memory(nid, range->start, range_len(range),
+ params);
+ }
+
+ if (!error) {
+ struct zone *zone;
+
+ zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
+ move_pfn_range_to_zone(zone, PHYS_PFN(range->start),
+ PHYS_PFN(range_len(range)), params->altmap,
+ MIGRATE_MOVABLE);
+ }
+
+ mem_hotplug_done();
+ if (error)
+ goto err_add_memory;
+
+ /*
+ * Initialization of the pages has been deferred until now in order
+ * to allow us to do the work while not holding the hotplug lock.
+ */
+ memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
+ PHYS_PFN(range->start),
+ PHYS_PFN(range_len(range)), pgmap);
+ percpu_ref_get_many(pgmap->ref, pfn_end(pgmap, range_id)
+ - pfn_first(pgmap, range_id));
+ return 0;
+
+err_add_memory:
+ kasan_remove_zero_shadow(__va(range->start), range_len(range));
+err_kasan:
+ untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range));
+err_pfn_remap:
+ pgmap_array_delete(range);
+ return error;
+}
+
+
/*
* Not device managed version of dev_memremap_pages, undone by
* memunmap_pages(). Please use dev_memremap_pages if you have a struct
@@ -182,17 +322,16 @@ static void dev_pagemap_percpu_release(struct percpu_ref *ref)
*/
void *memremap_pages(struct dev_pagemap *pgmap, int nid)
{
- struct resource *res = &pgmap->res;
- struct dev_pagemap *conflict_pgmap;
struct mhp_params params = {
- /*
- * We do not want any optional features only our own memmap
- */
.altmap = pgmap_altmap(pgmap),
.pgprot = PAGE_KERNEL,
};
- int error, is_ram;
+ const int nr_range = pgmap->nr_range;
bool need_devmap_managed = true;
+ int error, i;
+
+ if (WARN_ONCE(!nr_range, "nr_range must be specified\n"))
+ return ERR_PTR(-EINVAL);
switch (pgmap->type) {
case MEMORY_DEVICE_PRIVATE:
@@ -251,105 +390,27 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
return ERR_PTR(error);
}
- conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->start), NULL);
- if (conflict_pgmap) {
- WARN(1, "Conflicting mapping in same section\n");
- put_dev_pagemap(conflict_pgmap);
- error = -ENOMEM;
- goto err_array;
- }
-
- conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->end), NULL);
- if (conflict_pgmap) {
- WARN(1, "Conflicting mapping in same section\n");
- put_dev_pagemap(conflict_pgmap);
- error = -ENOMEM;
- goto err_array;
- }
-
- is_ram = region_intersects(res->start, resource_size(res),
- IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
-
- if (is_ram != REGION_DISJOINT) {
- WARN_ONCE(1, "%s attempted on %s region %pr\n", __func__,
- is_ram == REGION_MIXED ? "mixed" : "ram", res);
- error = -ENXIO;
- goto err_array;
- }
-
- error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(res->start),
- PHYS_PFN(res->end), pgmap, GFP_KERNEL));
- if (error)
- goto err_array;
-
- if (nid < 0)
- nid = numa_mem_id();
-
- error = track_pfn_remap(NULL, &params.pgprot, PHYS_PFN(res->start),
- 0, resource_size(res));
- if (error)
- goto err_pfn_remap;
-
- mem_hotplug_begin();
-
/*
- * For device private memory we call add_pages() as we only need to
- * allocate and initialize struct page for the device memory. More-
- * over the device memory is un-accessible thus we do not want to
- * create a linear mapping for the memory like arch_add_memory()
- * would do.
- *
- * For all other device memory types, which are accessible by
- * the CPU, we do want the linear mapping and thus use
- * arch_add_memory().
+ * Clear the pgmap nr_range as it will be incremented for each
+ * successfully processed range. This communicates how many
+ * regions to unwind in the abort case.
*/
- if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
- error = add_pages(nid, PHYS_PFN(res->start),
- PHYS_PFN(resource_size(res)), &params);
- } else {
- error = kasan_add_zero_shadow(__va(res->start), resource_size(res));
- if (error) {
- mem_hotplug_done();
- goto err_kasan;
- }
-
- error = arch_add_memory(nid, res->start, resource_size(res),
- &params);
+ pgmap->nr_range = 0;
+ error = 0;
+ for (i = 0; i < nr_range; i++) {
+ error = pagemap_range(pgmap, &params, i, nid);
+ if (error)
+ break;
+ pgmap->nr_range++;
}
- if (!error) {
- struct zone *zone;
-
- zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
- move_pfn_range_to_zone(zone, PHYS_PFN(res->start),
- PHYS_PFN(resource_size(res)), params.altmap);
+ if (i < nr_range) {
+ memunmap_pages(pgmap);
+ pgmap->nr_range = nr_range;
+ return ERR_PTR(error);
}
- mem_hotplug_done();
- if (error)
- goto err_add_memory;
-
- /*
- * Initialization of the pages has been deferred until now in order
- * to allow us to do the work while not holding the hotplug lock.
- */
- memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
- PHYS_PFN(res->start),
- PHYS_PFN(resource_size(res)), pgmap);
- percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap));
- return __va(res->start);
-
- err_add_memory:
- kasan_remove_zero_shadow(__va(res->start), resource_size(res));
- err_kasan:
- untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res));
- err_pfn_remap:
- pgmap_array_delete(res);
- err_array:
- dev_pagemap_kill(pgmap);
- dev_pagemap_cleanup(pgmap);
- devmap_managed_enable_put();
- return ERR_PTR(error);
+ return __va(pgmap->ranges[0].start);
}
EXPORT_SYMBOL_GPL(memremap_pages);
@@ -369,7 +430,7 @@ EXPORT_SYMBOL_GPL(memremap_pages);
* 'live' on entry and will be killed and reaped at
* devm_memremap_pages_release() time, or if this routine fails.
*
- * 4/ res is expected to be a host memory range that could feasibly be
+ * 4/ range is expected to be a host memory range that could feasibly be
* treated as a "System RAM" range, i.e. not a device mmio range, but
* this is not enforced.
*/
@@ -426,7 +487,7 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
* In the cached case we're already holding a live reference.
*/
if (pgmap) {
- if (phys >= pgmap->res.start && phys <= pgmap->res.end)
+ if (phys >= pgmap->range.start && phys <= pgmap->range.end)
return pgmap;
put_dev_pagemap(pgmap);
}
@@ -451,8 +512,6 @@ void free_devmap_managed_page(struct page *page)
return;
}
- /* Clear Active bit in case of parallel mark_page_accessed */
- __ClearPageActive(page);
__ClearPageWaiters(page);
mem_cgroup_uncharge(page);
diff --git a/mm/migrate.c b/mm/migrate.c
index 4de11dfd730b..4cf1af88c1dd 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -381,7 +381,7 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
int expected_count = 1;
/*
- * Device public or private pages have an extra refcount as they are
+ * Device private pages have an extra refcount as they are
* ZONE_DEVICE pages.
*/
expected_count += is_device_private_page(page);
@@ -1223,16 +1223,11 @@ out:
* we want to retry.
*/
if (rc == MIGRATEPAGE_SUCCESS) {
- put_page(page);
- if (reason == MR_MEMORY_FAILURE) {
+ if (reason != MR_MEMORY_FAILURE)
/*
- * Set PG_HWPoison on just freed page
- * intentionally. Although it's rather weird,
- * it's how HWPoison flag works at the moment.
+ * We release the page in page_handle_poison.
*/
- if (set_hwpoison_free_buddy_page(page))
- num_poisoned_pages_inc();
- }
+ put_page(page);
} else {
if (rc != -EAGAIN) {
if (likely(!__PageMovable(page))) {
@@ -3077,7 +3072,6 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
remove_migration_ptes(page, newpage, false);
unlock_page(page);
- migrate->cpages--;
if (is_zone_device_page(page))
put_page(page);
diff --git a/mm/mincore.c b/mm/mincore.c
index 453ff112470f..02db1a834021 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -48,7 +48,7 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
* and is up to date; i.e. that no page-in operation would be required
* at this time if an application were to map and access this page.
*/
-static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
+static unsigned char mincore_page(struct address_space *mapping, pgoff_t index)
{
unsigned char present = 0;
struct page *page;
@@ -59,31 +59,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
* any other file mapping (ie. marked !present and faulted in with
* tmpfs's .fault). So swapped out tmpfs mappings are tested here.
*/
-#ifdef CONFIG_SWAP
- if (shmem_mapping(mapping)) {
- page = find_get_entry(mapping, pgoff);
- /*
- * shmem/tmpfs may return swap: account for swapcache
- * page too.
- */
- if (xa_is_value(page)) {
- swp_entry_t swp = radix_to_swp_entry(page);
- struct swap_info_struct *si;
-
- /* Prevent swap device to being swapoff under us */
- si = get_swap_device(swp);
- if (si) {
- page = find_get_page(swap_address_space(swp),
- swp_offset(swp));
- put_swap_device(si);
- } else
- page = NULL;
- }
- } else
- page = find_get_page(mapping, pgoff);
-#else
- page = find_get_page(mapping, pgoff);
-#endif
+ page = find_get_incore_page(mapping, index);
if (page) {
present = PageUptodate(page);
put_page(page);
diff --git a/mm/mmap.c b/mm/mmap.c
index a217f37ffc83..3beb9bdae61a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -143,7 +143,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
struct file *file, struct address_space *mapping)
{
if (vma->vm_flags & VM_DENYWRITE)
- atomic_inc(&file_inode(file)->i_writecount);
+ allow_write_access(file);
if (vma->vm_flags & VM_SHARED)
mapping_unmap_writable(mapping);
@@ -474,8 +474,12 @@ static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
{
/*
* All rb_subtree_gap values must be consistent prior to erase,
- * with the possible exception of the "next" vma being erased if
- * next->vm_start was reduced.
+ * with the possible exception of
+ *
+ * a. the "next" vma being erased if next->vm_start was reduced in
+ * __vma_adjust() -> __vma_unlink()
+ * b. the vma being erased in detach_vmas_to_be_unmapped() ->
+ * vma_rb_erase()
*/
validate_mm_rb(root, ignore);
@@ -485,13 +489,7 @@ static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
static __always_inline void vma_rb_erase(struct vm_area_struct *vma,
struct rb_root *root)
{
- /*
- * All rb_subtree_gap values must be consistent prior to erase,
- * with the possible exception of the vma being erased.
- */
- validate_mm_rb(root, vma);
-
- __vma_rb_erase(vma, root);
+ vma_rb_erase_ignore(vma, root, vma);
}
/*
@@ -621,9 +619,9 @@ static void __vma_link_file(struct vm_area_struct *vma)
struct address_space *mapping = file->f_mapping;
if (vma->vm_flags & VM_DENYWRITE)
- atomic_dec(&file_inode(file)->i_writecount);
+ put_write_access(file_inode(file));
if (vma->vm_flags & VM_SHARED)
- atomic_inc(&mapping->i_mmap_writable);
+ mapping_allow_writable(mapping);
flush_dcache_mmap_lock(mapping);
vma_interval_tree_insert(vma, &mapping->i_mmap);
@@ -677,7 +675,7 @@ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
mm->map_count++;
}
-static __always_inline void __vma_unlink_common(struct mm_struct *mm,
+static __always_inline void __vma_unlink(struct mm_struct *mm,
struct vm_area_struct *vma,
struct vm_area_struct *ignore)
{
@@ -760,7 +758,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
* vma expands, overlapping part of the next:
* mprotect case 5 shifting the boundary up.
*/
- adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
+ adjust_next = (end - next->vm_start);
exporter = next;
importer = vma;
VM_WARN_ON(expand != importer);
@@ -770,7 +768,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
* split_vma inserting another: so it must be
* mprotect case 4 shifting the boundary down.
*/
- adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT);
+ adjust_next = -(vma->vm_end - end);
exporter = vma;
importer = next;
VM_WARN_ON(expand != importer);
@@ -825,7 +823,7 @@ again:
anon_vma_interval_tree_pre_update_vma(next);
}
- if (root) {
+ if (file) {
flush_dcache_mmap_lock(mapping);
vma_interval_tree_remove(vma, root);
if (adjust_next)
@@ -842,11 +840,11 @@ again:
}
vma->vm_pgoff = pgoff;
if (adjust_next) {
- next->vm_start += adjust_next << PAGE_SHIFT;
- next->vm_pgoff += adjust_next;
+ next->vm_start += adjust_next;
+ next->vm_pgoff += adjust_next >> PAGE_SHIFT;
}
- if (root) {
+ if (file) {
if (adjust_next)
vma_interval_tree_insert(next, root);
vma_interval_tree_insert(vma, root);
@@ -859,7 +857,7 @@ again:
* us to remove next before dropping the locks.
*/
if (remove_next != 3)
- __vma_unlink_common(mm, next, next);
+ __vma_unlink(mm, next, next);
else
/*
* vma is not before next if they've been
@@ -870,7 +868,7 @@ again:
* "next" (which is stored in post-swap()
* "vma").
*/
- __vma_unlink_common(mm, next, vma);
+ __vma_unlink(mm, next, vma);
if (file)
__remove_shared_vm_struct(next, file, mapping);
} else if (insert) {
@@ -897,10 +895,9 @@ again:
anon_vma_interval_tree_post_update_vma(next);
anon_vma_unlock_write(anon_vma);
}
- if (mapping)
- i_mmap_unlock_write(mapping);
- if (root) {
+ if (file) {
+ i_mmap_unlock_write(mapping);
uprobe_mmap(vma);
if (adjust_next)
@@ -1880,6 +1877,22 @@ unacct_error:
return error;
}
+static inline unsigned long gap_start_offset(struct vm_unmapped_area_info *info,
+ unsigned long addr)
+{
+ /* get gap_start offset to adjust gap address to the
+ * desired alignment
+ */
+ return (info->align_offset - addr) & info->align_mask;
+}
+
+static inline unsigned long gap_end_offset(struct vm_unmapped_area_info *info,
+ unsigned long addr)
+{
+ /* get gap_end offset to adjust gap address to the desired alignment */
+ return (addr - info->align_offset) & info->align_mask;
+}
+
static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
{
/*
@@ -1894,10 +1907,7 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
struct vm_area_struct *vma;
unsigned long length, low_limit, high_limit, gap_start, gap_end;
- /* Adjust search length to account for worst case alignment overhead */
- length = info->length + info->align_mask;
- if (length < info->length)
- return -ENOMEM;
+ length = info->length;
/* Adjust search limits by the desired length */
if (info->high_limit < length)
@@ -1929,6 +1939,7 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
}
gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
+ gap_start += gap_start_offset(info, gap_start);
check_current:
/* Check if current node has a suitable gap */
if (gap_start > high_limit)
@@ -1957,6 +1968,7 @@ check_current:
struct vm_area_struct, vm_rb);
if (prev == vma->vm_rb.rb_left) {
gap_start = vm_end_gap(vma->vm_prev);
+ gap_start += gap_start_offset(info, gap_start);
gap_end = vm_start_gap(vma);
goto check_current;
}
@@ -1966,17 +1978,17 @@ check_current:
check_highest:
/* Check highest gap, which does not precede any rbtree node */
gap_start = mm->highest_vm_end;
+ gap_start += gap_start_offset(info, gap_start);
gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */
if (gap_start > high_limit)
return -ENOMEM;
found:
/* We found a suitable gap. Clip it with the original low_limit. */
- if (gap_start < info->low_limit)
+ if (gap_start < info->low_limit) {
gap_start = info->low_limit;
-
- /* Adjust gap address to the desired alignment */
- gap_start += (info->align_offset - gap_start) & info->align_mask;
+ gap_start += gap_start_offset(info, gap_start);
+ }
VM_BUG_ON(gap_start + info->length > info->high_limit);
VM_BUG_ON(gap_start + info->length > gap_end);
@@ -1989,16 +2001,14 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
struct vm_area_struct *vma;
unsigned long length, low_limit, high_limit, gap_start, gap_end;
- /* Adjust search length to account for worst case alignment overhead */
- length = info->length + info->align_mask;
- if (length < info->length)
- return -ENOMEM;
+ length = info->length;
/*
* Adjust search limits by the desired length.
* See implementation comment at top of unmapped_area().
*/
gap_end = info->high_limit;
+ gap_end -= gap_end_offset(info, gap_end);
if (gap_end < length)
return -ENOMEM;
high_limit = gap_end - length;
@@ -2035,6 +2045,7 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
check_current:
/* Check if current node has a suitable gap */
gap_end = vm_start_gap(vma);
+ gap_end -= gap_end_offset(info, gap_end);
if (gap_end < low_limit)
return -ENOMEM;
if (gap_start <= high_limit &&
@@ -2069,13 +2080,14 @@ check_current:
found:
/* We found a suitable gap. Clip it with the original high_limit. */
- if (gap_end > info->high_limit)
+ if (gap_end > info->high_limit) {
gap_end = info->high_limit;
+ gap_end -= gap_end_offset(info, gap_end);
+ }
found_highest:
/* Compute highest gap address at the desired alignment */
gap_end -= info->length;
- gap_end -= (gap_end - info->align_offset) & info->align_mask;
VM_BUG_ON(gap_end < info->low_limit);
VM_BUG_ON(gap_end < gap_start);
@@ -2561,7 +2573,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
if (vma && (vma->vm_start <= addr))
return vma;
/* don't alter vm_end if the coredump is running */
- if (!prev || !mmget_still_valid(mm) || expand_stack(prev, addr))
+ if (!prev || expand_stack(prev, addr))
return NULL;
if (prev->vm_flags & VM_LOCKED)
populate_vma_page_range(prev, addr, prev->vm_end, NULL);
@@ -2587,9 +2599,6 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
return vma;
if (!(vma->vm_flags & VM_GROWSDOWN))
return NULL;
- /* don't alter vm_start if the coredump is running */
- if (!mmget_still_valid(mm))
- return NULL;
start = vma->vm_start;
if (expand_stack(vma, addr))
return NULL;
@@ -3232,7 +3241,7 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
* By setting it to reflect the virtual start address of the
* vma, merges and splits can happen in a seamless way, just
* using the existing file pgoff checks and manipulations.
- * Similarly in do_mmap and in do_brk.
+ * Similarly in do_mmap and in do_brk_flags.
*/
if (vma_is_anonymous(vma)) {
BUG_ON(vma->anon_vma);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 4fc918163dd3..5654dd19addc 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -913,7 +913,7 @@ static int __mmu_interval_notifier_insert(
return -EOVERFLOW;
/* Must call with a mmget() held */
- if (WARN_ON(atomic_read(&mm->mm_count) <= 0))
+ if (WARN_ON(atomic_read(&mm->mm_users) <= 0))
return -EINVAL;
/* pairs with mmdrop in mmu_interval_notifier_remove() */
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index e90f25d6385d..8b84661a6410 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -64,6 +64,8 @@ int sysctl_oom_dump_tasks = 1;
* and mark_oom_victim
*/
DEFINE_MUTEX(oom_lock);
+/* Serializes oom_score_adj and oom_score_adj_min updates */
+DEFINE_MUTEX(oom_adj_mutex);
static inline bool is_memcg_oom(struct oom_control *oc)
{
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 358d6f28c627..7709f0e223f5 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2849,6 +2849,7 @@ EXPORT_SYMBOL_GPL(wait_on_page_writeback);
*/
void wait_for_stable_page(struct page *page)
{
+ page = thp_head(page);
if (page->mapping->host->i_sb->s_iflags & SB_I_STABLE_WRITES)
wait_on_page_writeback(page);
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9fba8859ecd7..7c996edb3abf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -69,6 +69,7 @@
#include <linux/nmi.h>
#include <linux/psi.h>
#include <linux/padata.h>
+#include <linux/khugepaged.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -155,16 +156,16 @@ static int __init early_init_on_alloc(char *buf)
int ret;
bool bool_result;
- if (!buf)
- return -EINVAL;
ret = kstrtobool(buf, &bool_result);
+ if (ret)
+ return ret;
if (bool_result && page_poisoning_enabled())
pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_alloc\n");
if (bool_result)
static_branch_enable(&init_on_alloc);
else
static_branch_disable(&init_on_alloc);
- return ret;
+ return 0;
}
early_param("init_on_alloc", early_init_on_alloc);
@@ -173,16 +174,16 @@ static int __init early_init_on_free(char *buf)
int ret;
bool bool_result;
- if (!buf)
- return -EINVAL;
ret = kstrtobool(buf, &bool_result);
+ if (ret)
+ return ret;
if (bool_result && page_poisoning_enabled())
pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_free\n");
if (bool_result)
static_branch_enable(&init_on_free);
else
static_branch_disable(&init_on_free);
- return ret;
+ return 0;
}
early_param("init_on_free", early_init_on_free);
@@ -762,7 +763,7 @@ static inline void clear_page_guard(struct zone *zone, struct page *page,
unsigned int order, int migratetype) {}
#endif
-static inline void set_page_order(struct page *page, unsigned int order)
+static inline void set_buddy_order(struct page *page, unsigned int order)
{
set_page_private(page, order);
__SetPageBuddy(page);
@@ -787,7 +788,7 @@ static inline bool page_is_buddy(struct page *page, struct page *buddy,
if (!page_is_guard(buddy) && !PageBuddy(buddy))
return false;
- if (page_order(buddy) != order)
+ if (buddy_order(buddy) != order)
return false;
/*
@@ -1025,7 +1026,7 @@ continue_merging:
}
done_merging:
- set_page_order(page, order);
+ set_buddy_order(page, order);
if (is_shuffle_order(order))
to_tail = shuffle_pick_tail();
@@ -1173,6 +1174,17 @@ static __always_inline bool free_pages_prepare(struct page *page,
trace_mm_page_free(page, order);
+ if (unlikely(PageHWPoison(page)) && !order) {
+ /*
+ * Do not let hwpoison pages hit pcplists/buddy
+ * Untie memcg state and reset page's owner
+ */
+ if (memcg_kmem_enabled() && PageKmemcg(page))
+ __memcg_kmem_uncharge_page(page, order);
+ reset_page_owner(page, order);
+ return false;
+ }
+
/*
* Check tail pages before head page information is cleared to
* avoid checking PageCompound for order-0 pages.
@@ -2120,7 +2132,7 @@ static inline void expand(struct zone *zone, struct page *page,
continue;
add_to_free_list(&page[size], zone, high, migratetype);
- set_page_order(&page[size], high);
+ set_buddy_order(&page[size], high);
}
}
@@ -2334,7 +2346,7 @@ static int move_freepages(struct zone *zone,
VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
VM_BUG_ON_PAGE(page_zone(page) != zone, page);
- order = page_order(page);
+ order = buddy_order(page);
move_to_free_list(page, zone, order, migratetype);
page += 1 << order;
pages_moved += 1 << order;
@@ -2458,7 +2470,7 @@ static inline void boost_watermark(struct zone *zone)
static void steal_suitable_fallback(struct zone *zone, struct page *page,
unsigned int alloc_flags, int start_type, bool whole_block)
{
- unsigned int current_order = page_order(page);
+ unsigned int current_order = buddy_order(page);
int free_pages, movable_pages, alike_pages;
int old_block_type;
@@ -3740,8 +3752,8 @@ retry:
*/
no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
z = ac->preferred_zoneref;
- for_next_zone_zonelist_nodemask(zone, z, ac->zonelist,
- ac->highest_zoneidx, ac->nodemask) {
+ for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx,
+ ac->nodemask) {
struct page *page;
unsigned long mark;
@@ -3985,8 +3997,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
* success so it is time to admit defeat. We will skip the OOM killer
* because it is very likely that the caller has a more reasonable
* fallback than shooting a random task.
+ *
+ * The OOM killer may not free memory on a specific node.
*/
- if (gfp_mask & __GFP_RETRY_MAYFAIL)
+ if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE))
goto out;
/* The OOM killer does not needlessly kill tasks for lowmem */
if (ac->highest_zoneidx < ZONE_NORMAL)
@@ -4003,10 +4017,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
* failures more gracefully we should just bail out here.
*/
- /* The OOM killer may not free memory on a specific node */
- if (gfp_mask & __GFP_THISNODE)
- goto out;
-
/* Exhausted what can be done so it's blame time */
if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
*did_some_progress = 1;
@@ -4254,13 +4264,12 @@ EXPORT_SYMBOL_GPL(fs_reclaim_release);
#endif
/* Perform direct synchronous page reclaim */
-static int
+static unsigned long
__perform_reclaim(gfp_t gfp_mask, unsigned int order,
const struct alloc_context *ac)
{
- int progress;
unsigned int noreclaim_flag;
- unsigned long pflags;
+ unsigned long pflags, progress;
cond_resched();
@@ -4839,12 +4848,6 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
*alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags);
- return true;
-}
-
-/* Determine whether to spread dirty pages and what the first usable zone */
-static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac)
-{
/* Dirty zone balancing only done in the fast path */
ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
@@ -4855,6 +4858,8 @@ static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac)
*/
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->highest_zoneidx, ac->nodemask);
+
+ return true;
}
/*
@@ -4883,8 +4888,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
return NULL;
- finalise_ac(gfp_mask, &ac);
-
/*
* Forbid the first pass from falling back to types that fragment
* memory until all local zones are considered.
@@ -4960,6 +4963,9 @@ void __free_pages(struct page *page, unsigned int order)
{
if (put_page_testzero(page))
free_the_page(page, order);
+ else if (!PageHead(page))
+ while (order-- > 0)
+ free_the_page(page + (1 << order), order);
}
EXPORT_SYMBOL(__free_pages);
@@ -5650,7 +5656,6 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
int n, val;
int min_val = INT_MAX;
int best_node = NUMA_NO_NODE;
- const struct cpumask *tmp = cpumask_of_node(0);
/* Use the local node if we haven't already */
if (!node_isset(node, *used_node_mask)) {
@@ -5671,8 +5676,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
val += (n < node);
/* Give preference to headless and unused nodes */
- tmp = cpumask_of_node(n);
- if (!cpumask_empty(tmp))
+ if (!cpumask_empty(cpumask_of_node(n)))
val += PENALTY_FOR_NODE_WITH_CPUS;
/* Slight preference for less loaded node */
@@ -5968,7 +5972,7 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
- for_each_memblock(memory, r) {
+ for_each_mem_region(r) {
if (*pfn < memblock_region_memory_end_pfn(r))
break;
}
@@ -5986,10 +5990,15 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
* Initially all pages are reserved - free ones are freed
* up by memblock_free_all() once the early boot process is
* done. Non-atomic initialization, single-pass.
+ *
+ * All aligned pageblocks are initialized to the specified migratetype
+ * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
+ * zone stats (e.g., nr_isolate_pageblock) are touched.
*/
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
- unsigned long start_pfn, enum meminit_context context,
- struct vmem_altmap *altmap)
+ unsigned long start_pfn,
+ enum meminit_context context,
+ struct vmem_altmap *altmap, int migratetype)
{
unsigned long pfn, end_pfn = start_pfn + size;
struct page *page;
@@ -6033,19 +6042,12 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
__SetPageReserved(page);
/*
- * Mark the block movable so that blocks are reserved for
- * movable at startup. This will force kernel allocations
- * to reserve their blocks rather than leaking throughout
- * the address space during boot when many long-lived
- * kernel allocations are made.
- *
- * bitmap is created for zone's valid pfn range. but memmap
- * can be created for invalid pages (for alignment)
- * check here not to call set_pageblock_migratetype() against
- * pfn out of zone.
+ * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
+ * such that unmovable allocations won't be scattered all
+ * over the place during system boot.
*/
- if (!(pfn & (pageblock_nr_pages - 1))) {
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
+ set_pageblock_migratetype(page, migratetype);
cond_resched();
}
pfn++;
@@ -6107,15 +6109,10 @@ void __ref memmap_init_zone_device(struct zone *zone,
* the address space during boot when many long-lived
* kernel allocations are made.
*
- * bitmap is created for zone's valid pfn range. but memmap
- * can be created for invalid pages (for alignment)
- * check here not to call set_pageblock_migratetype() against
- * pfn out of zone.
- *
* Please note that MEMINIT_HOTPLUG path doesn't clear memmap
* because this is done early in section_activate()
*/
- if (!(pfn & (pageblock_nr_pages - 1))) {
+ if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
cond_resched();
}
@@ -6150,7 +6147,7 @@ void __meminit __weak memmap_init(unsigned long size, int nid,
if (end_pfn > start_pfn) {
size = end_pfn - start_pfn;
memmap_init_zone(size, nid, zone, start_pfn,
- MEMINIT_EARLY, NULL);
+ MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
}
}
}
@@ -6553,7 +6550,7 @@ static unsigned long __init zone_absent_pages_in_node(int nid,
unsigned long start_pfn, end_pfn;
struct memblock_region *r;
- for_each_memblock(memory, r) {
+ for_each_mem_region(r) {
start_pfn = clamp(memblock_region_memory_base_pfn(r),
zone_start_pfn, zone_end_pfn);
end_pfn = clamp(memblock_region_memory_end_pfn(r),
@@ -6997,8 +6994,7 @@ static void __init init_unavailable_mem(void)
* Loop through unavailable ranges not covered by memblock.memory.
*/
pgcnt = 0;
- for_each_mem_range(i, &memblock.memory, NULL,
- NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) {
+ for_each_mem_range(i, &start, &end) {
if (next < start)
pgcnt += init_unavailable_range(PFN_DOWN(next),
PFN_UP(start));
@@ -7148,7 +7144,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
* options.
*/
if (movable_node_is_enabled()) {
- for_each_memblock(memory, r) {
+ for_each_mem_region(r) {
if (!memblock_is_hotpluggable(r))
continue;
@@ -7169,7 +7165,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
if (mirrored_kernelcore) {
bool mem_below_4gb_not_mirrored = false;
- for_each_memblock(memory, r) {
+ for_each_mem_region(r) {
if (memblock_is_mirror(r))
continue;
@@ -7904,6 +7900,8 @@ int __meminit init_per_zone_wmark_min(void)
setup_min_slab_ratio();
#endif
+ khugepaged_min_free_kbytes_update();
+
return 0;
}
postcore_initcall(init_per_zone_wmark_min)
@@ -8231,14 +8229,7 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page,
{
unsigned long iter = 0;
unsigned long pfn = page_to_pfn(page);
-
- /*
- * TODO we could make this much more efficient by not checking every
- * page in the range if we know all of them are in MOVABLE_ZONE and
- * that the movable zone guarantees that pages are migratable but
- * the later is not the case right now unfortunatelly. E.g. movablecore
- * can still lead to having bootmem allocations in zone_movable.
- */
+ unsigned long offset = pfn % pageblock_nr_pages;
if (is_migrate_cma_page(page)) {
/*
@@ -8252,12 +8243,18 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page,
return page;
}
- for (; iter < pageblock_nr_pages; iter++) {
+ for (; iter < pageblock_nr_pages - offset; iter++) {
if (!pfn_valid_within(pfn + iter))
continue;
page = pfn_to_page(pfn + iter);
+ /*
+ * Both, bootmem allocations and memory holes are marked
+ * PG_reserved and are unmovable. We can even have unmovable
+ * allocations inside ZONE_MOVABLE, for example when
+ * specifying "movablecore".
+ */
if (PageReserved(page))
return page;
@@ -8299,7 +8296,7 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page,
*/
if (!page_ref_count(page)) {
if (PageBuddy(page))
- iter += (1 << page_order(page)) - 1;
+ iter += (1 << buddy_order(page)) - 1;
continue;
}
@@ -8331,14 +8328,6 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page,
* it. But now, memory offline itself doesn't call
* shrink_node_slabs() and it still to be fixed.
*/
- /*
- * If the page is not RAM, page_count()should be 0.
- * we don't need more check. This is an _used_ not-movable page.
- *
- * The problematic thing here is PG_reserved pages. PG_reserved
- * is set to both of a memory hole page and a _used_ kernel
- * page at boot.
- */
return page;
}
return NULL;
@@ -8472,7 +8461,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
ret = start_isolate_page_range(pfn_max_align_down(start),
pfn_max_align_up(end), migratetype, 0);
- if (ret < 0)
+ if (ret)
return ret;
/*
@@ -8520,7 +8509,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
}
if (outer_start != start) {
- order = page_order(pfn_to_page(outer_start));
+ order = buddy_order(pfn_to_page(outer_start));
/*
* outer_start page could be small order buddy page and
@@ -8708,35 +8697,21 @@ void zone_pcp_reset(struct zone *zone)
#ifdef CONFIG_MEMORY_HOTREMOVE
/*
- * All pages in the range must be in a single zone and isolated
- * before calling this.
+ * All pages in the range must be in a single zone, must not contain holes,
+ * must span full sections, and must be isolated before calling this function.
*/
-unsigned long
-__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
+void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
+ unsigned long pfn = start_pfn;
struct page *page;
struct zone *zone;
unsigned int order;
- unsigned long pfn;
unsigned long flags;
- unsigned long offlined_pages = 0;
-
- /* find the first valid pfn */
- for (pfn = start_pfn; pfn < end_pfn; pfn++)
- if (pfn_valid(pfn))
- break;
- if (pfn == end_pfn)
- return offlined_pages;
offline_mem_sections(pfn, end_pfn);
zone = page_zone(pfn_to_page(pfn));
spin_lock_irqsave(&zone->lock, flags);
- pfn = start_pfn;
while (pfn < end_pfn) {
- if (!pfn_valid(pfn)) {
- pfn++;
- continue;
- }
page = pfn_to_page(pfn);
/*
* The HWPoisoned page may be not in buddy system, and
@@ -8744,7 +8719,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
*/
if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
pfn++;
- offlined_pages++;
continue;
}
/*
@@ -8755,20 +8729,16 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
BUG_ON(page_count(page));
BUG_ON(PageBuddy(page));
pfn++;
- offlined_pages++;
continue;
}
BUG_ON(page_count(page));
BUG_ON(!PageBuddy(page));
- order = page_order(page);
- offlined_pages += 1 << order;
+ order = buddy_order(page);
del_page_from_free_list(page, zone, order);
pfn += (1 << order);
}
spin_unlock_irqrestore(&zone->lock, flags);
-
- return offlined_pages;
}
#endif
@@ -8783,7 +8753,7 @@ bool is_free_buddy_page(struct page *page)
for (order = 0; order < MAX_ORDER; order++) {
struct page *page_head = page - (pfn & ((1 << order) - 1));
- if (PageBuddy(page_head) && page_order(page_head) >= order)
+ if (PageBuddy(page_head) && buddy_order(page_head) >= order)
break;
}
spin_unlock_irqrestore(&zone->lock, flags);
@@ -8793,30 +8763,70 @@ bool is_free_buddy_page(struct page *page)
#ifdef CONFIG_MEMORY_FAILURE
/*
- * Set PG_hwpoison flag if a given page is confirmed to be a free page. This
- * test is performed under the zone lock to prevent a race against page
- * allocation.
+ * Break down a higher-order page in sub-pages, and keep our target out of
+ * buddy allocator.
+ */
+static void break_down_buddy_pages(struct zone *zone, struct page *page,
+ struct page *target, int low, int high,
+ int migratetype)
+{
+ unsigned long size = 1 << high;
+ struct page *current_buddy, *next_page;
+
+ while (high > low) {
+ high--;
+ size >>= 1;
+
+ if (target >= &page[size]) {
+ next_page = page + size;
+ current_buddy = page;
+ } else {
+ next_page = page;
+ current_buddy = page + size;
+ }
+
+ if (set_page_guard(zone, current_buddy, high, migratetype))
+ continue;
+
+ if (current_buddy != target) {
+ add_to_free_list(current_buddy, zone, high, migratetype);
+ set_buddy_order(current_buddy, high);
+ page = next_page;
+ }
+ }
+}
+
+/*
+ * Take a page that will be marked as poisoned off the buddy allocator.
*/
-bool set_hwpoison_free_buddy_page(struct page *page)
+bool take_page_off_buddy(struct page *page)
{
struct zone *zone = page_zone(page);
unsigned long pfn = page_to_pfn(page);
unsigned long flags;
unsigned int order;
- bool hwpoisoned = false;
+ bool ret = false;
spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
struct page *page_head = page - (pfn & ((1 << order) - 1));
+ int page_order = buddy_order(page_head);
+
+ if (PageBuddy(page_head) && page_order >= order) {
+ unsigned long pfn_head = page_to_pfn(page_head);
+ int migratetype = get_pfnblock_migratetype(page_head,
+ pfn_head);
- if (PageBuddy(page_head) && page_order(page_head) >= order) {
- if (!TestSetPageHWPoison(page))
- hwpoisoned = true;
+ del_page_from_free_list(page_head, zone, page_order);
+ break_down_buddy_pages(zone, page_head, page, 0,
+ page_order, migratetype);
+ ret = true;
break;
}
+ if (page_count(page_head) > 0)
+ break;
}
spin_unlock_irqrestore(&zone->lock, flags);
-
- return hwpoisoned;
+ return ret;
}
#endif
diff --git a/mm/page_counter.c b/mm/page_counter.c
index afe22ad335cc..b24a60b28bb0 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -109,7 +109,7 @@ bool page_counter_try_charge(struct page_counter *counter,
*
* The atomic_long_add_return() implies a full memory
* barrier between incrementing the count and reading
- * the limit. When racing with page_counter_limit(),
+ * the limit. When racing with page_counter_set_max(),
* we either see the new limit or the setter sees the
* counter has changed and retries.
*/
diff --git a/mm/page_io.c b/mm/page_io.c
index f9e9267f296f..433df1263349 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -312,7 +312,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
struct swap_info_struct *sis = page_swap_info(page);
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
- if (data_race(sis->flags & SWP_FS)) {
+ if (data_race(sis->flags & SWP_FS_OPS)) {
struct kiocb kiocb;
struct file *swap_file = sis->swap_file;
struct address_space *mapping = swap_file->f_mapping;
@@ -359,13 +359,11 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
return 0;
}
- ret = 0;
bio = get_swap_bio(GFP_NOIO, page, end_write_func);
if (bio == NULL) {
set_page_dirty(page);
unlock_page(page);
- ret = -ENOMEM;
- goto out;
+ return -ENOMEM;
}
bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc);
bio_associate_blkg_from_page(bio, page);
@@ -373,8 +371,8 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
set_page_writeback(page);
unlock_page(page);
submit_bio(bio);
-out:
- return ret;
+
+ return 0;
}
int swap_readpage(struct page *page, bool synchronous)
@@ -403,7 +401,7 @@ int swap_readpage(struct page *page, bool synchronous)
goto out;
}
- if (data_race(sis->flags & SWP_FS)) {
+ if (data_race(sis->flags & SWP_FS_OPS)) {
struct file *swap_file = sis->swap_file;
struct address_space *mapping = swap_file->f_mapping;
@@ -467,7 +465,7 @@ int swap_set_page_dirty(struct page *page)
{
struct swap_info_struct *sis = page_swap_info(page);
- if (data_race(sis->flags & SWP_FS)) {
+ if (data_race(sis->flags & SWP_FS_OPS)) {
struct address_space *mapping = sis->swap_file->f_mapping;
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 63a3db10a8c0..ca0a71be0e7d 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -17,22 +17,21 @@
static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags)
{
- struct page *unmovable = NULL;
- struct zone *zone;
+ struct zone *zone = page_zone(page);
+ struct page *unmovable;
unsigned long flags;
- int ret = -EBUSY;
-
- zone = page_zone(page);
spin_lock_irqsave(&zone->lock, flags);
/*
* We assume the caller intended to SET migrate type to isolate.
* If it is already set, then someone else must have raced and
- * set it before us. Return -EBUSY
+ * set it before us.
*/
- if (is_migrate_isolate_page(page))
- goto out;
+ if (is_migrate_isolate_page(page)) {
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return -EBUSY;
+ }
/*
* FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
@@ -49,25 +48,21 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
NULL);
__mod_zone_freepage_state(zone, -nr_pages, mt);
- ret = 0;
+ spin_unlock_irqrestore(&zone->lock, flags);
+ drain_all_pages(zone);
+ return 0;
}
-out:
spin_unlock_irqrestore(&zone->lock, flags);
- if (!ret) {
- drain_all_pages(zone);
- } else {
- WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE);
-
- if ((isol_flags & REPORT_FAILURE) && unmovable)
- /*
- * printk() with zone->lock held will likely trigger a
- * lockdep splat, so defer it here.
- */
- dump_page(unmovable, "unmovable page");
+ if (isol_flags & REPORT_FAILURE) {
+ /*
+ * printk() with zone->lock held will likely trigger a
+ * lockdep splat, so defer it here.
+ */
+ dump_page(unmovable, "unmovable page");
}
- return ret;
+ return -EBUSY;
}
static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
@@ -93,7 +88,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
* these pages to be merged.
*/
if (PageBuddy(page)) {
- order = page_order(page);
+ order = buddy_order(page);
if (order >= pageblock_order) {
pfn = page_to_pfn(page);
buddy_pfn = __find_buddy_pfn(pfn, order);
@@ -178,8 +173,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* (e.g. __offline_pages will need to call it after check for isolated range for
* a next retry).
*
- * Return: the number of isolated pageblocks on success and -EBUSY if any part
- * of range cannot be isolated.
+ * Return: 0 on success and -EBUSY if any part of range cannot be isolated.
*/
int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
unsigned migratetype, int flags)
@@ -187,7 +181,6 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
unsigned long pfn;
unsigned long undo_pfn;
struct page *page;
- int nr_isolate_pageblock = 0;
BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
@@ -201,10 +194,9 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
undo_pfn = pfn;
goto undo;
}
- nr_isolate_pageblock++;
}
}
- return nr_isolate_pageblock;
+ return 0;
undo:
for (pfn = start_pfn;
pfn < undo_pfn;
@@ -264,7 +256,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
* the correct MIGRATE_ISOLATE freelist. There is no
* simple way to verify that as VM_BUG_ON(), though.
*/
- pfn += 1 << page_order(page);
+ pfn += 1 << buddy_order(page);
else if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
/* A HWPoisoned page cannot be also PageBuddy */
pfn++;
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 360461509423..b735a8eafcdb 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -204,7 +204,7 @@ void __set_page_owner_migrate_reason(struct page *page, int reason)
page_owner->last_migrate_reason = reason;
}
-void __split_page_owner(struct page *page, unsigned int order)
+void __split_page_owner(struct page *page, unsigned int nr)
{
int i;
struct page_ext *page_ext = lookup_page_ext(page);
@@ -213,7 +213,7 @@ void __split_page_owner(struct page *page, unsigned int order)
if (unlikely(!page_ext))
return;
- for (i = 0; i < (1 << order); i++) {
+ for (i = 0; i < nr; i++) {
page_owner = get_page_owner(page_ext);
page_owner->order = 0;
page_ext = page_ext_next(page_ext);
@@ -295,7 +295,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
if (PageBuddy(page)) {
unsigned long freepage_order;
- freepage_order = page_order_unsafe(page);
+ freepage_order = buddy_order_unsafe(page);
if (freepage_order < MAX_ORDER)
pfn += (1UL << freepage_order) - 1;
continue;
@@ -490,7 +490,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
page = pfn_to_page(pfn);
if (PageBuddy(page)) {
- unsigned long freepage_order = page_order_unsafe(page);
+ unsigned long freepage_order = buddy_order_unsafe(page);
if (freepage_order < MAX_ORDER)
pfn += (1UL << freepage_order) - 1;
@@ -584,7 +584,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
* heavy lock contention.
*/
if (PageBuddy(page)) {
- unsigned long order = page_order_unsafe(page);
+ unsigned long order = buddy_order_unsafe(page);
if (order > 0 && order < MAX_ORDER)
pfn += (1UL << order) - 1;
diff --git a/mm/page_poison.c b/mm/page_poison.c
index 34b9181ee5d1..ae0482cded87 100644
--- a/mm/page_poison.c
+++ b/mm/page_poison.c
@@ -8,13 +8,23 @@
#include <linux/ratelimit.h>
#include <linux/kasan.h>
-static bool want_page_poisoning __read_mostly;
+static DEFINE_STATIC_KEY_FALSE_RO(want_page_poisoning);
static int __init early_page_poison_param(char *buf)
{
- if (!buf)
- return -EINVAL;
- return strtobool(buf, &want_page_poisoning);
+ int ret;
+ bool tmp;
+
+ ret = strtobool(buf, &tmp);
+ if (ret)
+ return ret;
+
+ if (tmp)
+ static_branch_enable(&want_page_poisoning);
+ else
+ static_branch_disable(&want_page_poisoning);
+
+ return 0;
}
early_param("page_poison", early_page_poison_param);
@@ -31,7 +41,7 @@ bool page_poisoning_enabled(void)
* Page poisoning is debug page alloc for some arches. If
* either of those options are enabled, enable poisoning.
*/
- return (want_page_poisoning ||
+ return (static_branch_unlikely(&want_page_poisoning) ||
(!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
debug_pagealloc_enabled()));
}
diff --git a/mm/page_reporting.c b/mm/page_reporting.c
index 3bbd471cfc81..cd8e13d41df4 100644
--- a/mm/page_reporting.c
+++ b/mm/page_reporting.c
@@ -92,7 +92,7 @@ page_reporting_drain(struct page_reporting_dev_info *prdev,
* report on the new larger page when we make our way
* up to that higher order.
*/
- if (PageBuddy(page) && page_order(page) == order)
+ if (PageBuddy(page) && buddy_order(page) == order)
__SetPageReported(page);
} while ((sg = sg_next(sg)));
@@ -178,7 +178,7 @@ page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone,
* the new head of the free list before we release the
* zone lock.
*/
- if (&page->lru != list && !list_is_first(&page->lru, list))
+ if (!list_is_first(&page->lru, list))
list_rotate_to_front(&page->lru, list);
/* release lock before waiting on report processing */
diff --git a/mm/readahead.c b/mm/readahead.c
index 3c9a8dd7c56c..c6ffb76827da 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -158,10 +158,8 @@ out:
}
/**
- * page_cache_readahead_unbounded - Start unchecked readahead.
- * @mapping: File address space.
- * @file: This instance of the open file; used for authentication.
- * @index: First page index to read.
+ * page_cache_ra_unbounded - Start unchecked readahead.
+ * @ractl: Readahead control.
* @nr_to_read: The number of pages to read.
* @lookahead_size: Where to start the next readahead.
*
@@ -173,17 +171,13 @@ out:
* Context: File is referenced by caller. Mutexes may be held by caller.
* May sleep, but will not reenter filesystem to reclaim memory.
*/
-void page_cache_readahead_unbounded(struct address_space *mapping,
- struct file *file, pgoff_t index, unsigned long nr_to_read,
- unsigned long lookahead_size)
+void page_cache_ra_unbounded(struct readahead_control *ractl,
+ unsigned long nr_to_read, unsigned long lookahead_size)
{
+ struct address_space *mapping = ractl->mapping;
+ unsigned long index = readahead_index(ractl);
LIST_HEAD(page_pool);
gfp_t gfp_mask = readahead_gfp_mask(mapping);
- struct readahead_control rac = {
- .mapping = mapping,
- .file = file,
- ._index = index,
- };
unsigned long i;
/*
@@ -204,7 +198,7 @@ void page_cache_readahead_unbounded(struct address_space *mapping,
for (i = 0; i < nr_to_read; i++) {
struct page *page = xa_load(&mapping->i_pages, index + i);
- BUG_ON(index + i != rac._index + rac._nr_pages);
+ BUG_ON(index + i != ractl->_index + ractl->_nr_pages);
if (page && !xa_is_value(page)) {
/*
@@ -215,7 +209,7 @@ void page_cache_readahead_unbounded(struct address_space *mapping,
* have a stable reference to this page, and it's
* not worth getting one just for that.
*/
- read_pages(&rac, &page_pool, true);
+ read_pages(ractl, &page_pool, true);
continue;
}
@@ -228,12 +222,12 @@ void page_cache_readahead_unbounded(struct address_space *mapping,
} else if (add_to_page_cache_lru(page, mapping, index + i,
gfp_mask) < 0) {
put_page(page);
- read_pages(&rac, &page_pool, true);
+ read_pages(ractl, &page_pool, true);
continue;
}
if (i == nr_to_read - lookahead_size)
SetPageReadahead(page);
- rac._nr_pages++;
+ ractl->_nr_pages++;
}
/*
@@ -241,22 +235,22 @@ void page_cache_readahead_unbounded(struct address_space *mapping,
* uptodate then the caller will launch readpage again, and
* will then handle the error.
*/
- read_pages(&rac, &page_pool, false);
+ read_pages(ractl, &page_pool, false);
memalloc_nofs_restore(nofs);
}
-EXPORT_SYMBOL_GPL(page_cache_readahead_unbounded);
+EXPORT_SYMBOL_GPL(page_cache_ra_unbounded);
/*
- * __do_page_cache_readahead() actually reads a chunk of disk. It allocates
+ * do_page_cache_ra() actually reads a chunk of disk. It allocates
* the pages first, then submits them for I/O. This avoids the very bad
* behaviour which would occur if page allocations are causing VM writeback.
* We really don't want to intermingle reads and writes like that.
*/
-void __do_page_cache_readahead(struct address_space *mapping,
- struct file *file, pgoff_t index, unsigned long nr_to_read,
- unsigned long lookahead_size)
+void do_page_cache_ra(struct readahead_control *ractl,
+ unsigned long nr_to_read, unsigned long lookahead_size)
{
- struct inode *inode = mapping->host;
+ struct inode *inode = ractl->mapping->host;
+ unsigned long index = readahead_index(ractl);
loff_t isize = i_size_read(inode);
pgoff_t end_index; /* The last page we want to read */
@@ -270,20 +264,19 @@ void __do_page_cache_readahead(struct address_space *mapping,
if (nr_to_read > end_index - index)
nr_to_read = end_index - index + 1;
- page_cache_readahead_unbounded(mapping, file, index, nr_to_read,
- lookahead_size);
+ page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size);
}
/*
* Chunk the readahead into 2 megabyte units, so that we don't pin too much
* memory at once.
*/
-void force_page_cache_readahead(struct address_space *mapping,
- struct file *filp, pgoff_t index, unsigned long nr_to_read)
+void force_page_cache_ra(struct readahead_control *ractl,
+ struct file_ra_state *ra, unsigned long nr_to_read)
{
+ struct address_space *mapping = ractl->mapping;
struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
- struct file_ra_state *ra = &filp->f_ra;
- unsigned long max_pages;
+ unsigned long max_pages, index;
if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages &&
!mapping->a_ops->readahead))
@@ -293,14 +286,16 @@ void force_page_cache_readahead(struct address_space *mapping,
* If the request exceeds the readahead window, allow the read to
* be up to the optimal hardware IO size
*/
+ index = readahead_index(ractl);
max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
- nr_to_read = min(nr_to_read, max_pages);
+ nr_to_read = min_t(unsigned long, nr_to_read, max_pages);
while (nr_to_read) {
unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE;
if (this_chunk > nr_to_read)
this_chunk = nr_to_read;
- __do_page_cache_readahead(mapping, filp, index, this_chunk, 0);
+ ractl->_index = index;
+ do_page_cache_ra(ractl, this_chunk, 0);
index += this_chunk;
nr_to_read -= this_chunk;
@@ -437,14 +432,14 @@ static int try_context_readahead(struct address_space *mapping,
/*
* A minimal readahead algorithm for trivial sequential/random reads.
*/
-static void ondemand_readahead(struct address_space *mapping,
- struct file_ra_state *ra, struct file *filp,
- bool hit_readahead_marker, pgoff_t index,
+static void ondemand_readahead(struct readahead_control *ractl,
+ struct file_ra_state *ra, bool hit_readahead_marker,
unsigned long req_size)
{
- struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+ struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host);
unsigned long max_pages = ra->ra_pages;
unsigned long add_pages;
+ unsigned long index = readahead_index(ractl);
pgoff_t prev_index;
/*
@@ -482,7 +477,8 @@ static void ondemand_readahead(struct address_space *mapping,
pgoff_t start;
rcu_read_lock();
- start = page_cache_next_miss(mapping, index + 1, max_pages);
+ start = page_cache_next_miss(ractl->mapping, index + 1,
+ max_pages);
rcu_read_unlock();
if (!start || start - index > max_pages)
@@ -515,14 +511,15 @@ static void ondemand_readahead(struct address_space *mapping,
* Query the page cache and look for the traces(cached history pages)
* that a sequential stream would leave behind.
*/
- if (try_context_readahead(mapping, ra, index, req_size, max_pages))
+ if (try_context_readahead(ractl->mapping, ra, index, req_size,
+ max_pages))
goto readit;
/*
* standalone, small random read
* Read as is, and do not pollute the readahead state.
*/
- __do_page_cache_readahead(mapping, filp, index, req_size, 0);
+ do_page_cache_ra(ractl, req_size, 0);
return;
initial_readahead:
@@ -548,25 +545,12 @@ readit:
}
}
- ra_submit(ra, mapping, filp);
+ ractl->_index = ra->start;
+ do_page_cache_ra(ractl, ra->size, ra->async_size);
}
-/**
- * page_cache_sync_readahead - generic file readahead
- * @mapping: address_space which holds the pagecache and I/O vectors
- * @ra: file_ra_state which holds the readahead state
- * @filp: passed on to ->readpage() and ->readpages()
- * @index: Index of first page to be read.
- * @req_count: Total number of pages being read by the caller.
- *
- * page_cache_sync_readahead() should be called when a cache miss happened:
- * it will submit the read. The readahead logic may decide to piggyback more
- * pages onto the read request if access patterns suggest it will improve
- * performance.
- */
-void page_cache_sync_readahead(struct address_space *mapping,
- struct file_ra_state *ra, struct file *filp,
- pgoff_t index, unsigned long req_count)
+void page_cache_sync_ra(struct readahead_control *ractl,
+ struct file_ra_state *ra, unsigned long req_count)
{
/* no read-ahead */
if (!ra->ra_pages)
@@ -576,35 +560,19 @@ void page_cache_sync_readahead(struct address_space *mapping,
return;
/* be dumb */
- if (filp && (filp->f_mode & FMODE_RANDOM)) {
- force_page_cache_readahead(mapping, filp, index, req_count);
+ if (ractl->file && (ractl->file->f_mode & FMODE_RANDOM)) {
+ force_page_cache_ra(ractl, ra, req_count);
return;
}
/* do read-ahead */
- ondemand_readahead(mapping, ra, filp, false, index, req_count);
+ ondemand_readahead(ractl, ra, false, req_count);
}
-EXPORT_SYMBOL_GPL(page_cache_sync_readahead);
+EXPORT_SYMBOL_GPL(page_cache_sync_ra);
-/**
- * page_cache_async_readahead - file readahead for marked pages
- * @mapping: address_space which holds the pagecache and I/O vectors
- * @ra: file_ra_state which holds the readahead state
- * @filp: passed on to ->readpage() and ->readpages()
- * @page: The page at @index which triggered the readahead call.
- * @index: Index of first page to be read.
- * @req_count: Total number of pages being read by the caller.
- *
- * page_cache_async_readahead() should be called when a page is used which
- * is marked as PageReadahead; this is a marker to suggest that the application
- * has used up enough of the readahead window that we should start pulling in
- * more pages.
- */
-void
-page_cache_async_readahead(struct address_space *mapping,
- struct file_ra_state *ra, struct file *filp,
- struct page *page, pgoff_t index,
- unsigned long req_count)
+void page_cache_async_ra(struct readahead_control *ractl,
+ struct file_ra_state *ra, struct page *page,
+ unsigned long req_count)
{
/* no read-ahead */
if (!ra->ra_pages)
@@ -621,16 +589,16 @@ page_cache_async_readahead(struct address_space *mapping,
/*
* Defer asynchronous read-ahead on IO congestion.
*/
- if (inode_read_congested(mapping->host))
+ if (inode_read_congested(ractl->mapping->host))
return;
if (blk_cgroup_congested())
return;
/* do read-ahead */
- ondemand_readahead(mapping, ra, filp, true, index, req_count);
+ ondemand_readahead(ractl, ra, true, req_count);
}
-EXPORT_SYMBOL_GPL(page_cache_async_readahead);
+EXPORT_SYMBOL_GPL(page_cache_async_ra);
ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
{
diff --git a/mm/rmap.c b/mm/rmap.c
index 9425260774a1..1b84945d655c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1205,7 +1205,7 @@ void page_add_file_rmap(struct page *page, bool compound)
VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
lock_page_memcg(page);
if (compound && PageTransHuge(page)) {
- for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
+ for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
if (atomic_inc_and_test(&page[i]._mapcount))
nr++;
}
@@ -1246,7 +1246,7 @@ static void page_remove_file_rmap(struct page *page, bool compound)
/* page still mapped by someone else? */
if (compound && PageTransHuge(page)) {
- for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
+ for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
if (atomic_add_negative(-1, &page[i]._mapcount))
nr++;
}
@@ -1293,7 +1293,7 @@ static void page_remove_anon_compound_rmap(struct page *page)
* Subpages can be mapped with PTEs too. Check how many of
* them are still mapped.
*/
- for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
+ for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
if (atomic_add_negative(-1, &page[i]._mapcount))
nr++;
}
@@ -1303,10 +1303,10 @@ static void page_remove_anon_compound_rmap(struct page *page)
* page of the compound page is unmapped, but at least one
* small page is still mapped.
*/
- if (nr && nr < HPAGE_PMD_NR)
+ if (nr && nr < thp_nr_pages(page))
deferred_split_huge_page(page);
} else {
- nr = HPAGE_PMD_NR;
+ nr = thp_nr_pages(page);
}
if (unlikely(PageMlocked(page)))
diff --git a/mm/shmem.c b/mm/shmem.c
index d42c27e4769f..537c137698f8 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1830,6 +1830,8 @@ repeat:
return error;
}
+ if (page)
+ hindex = page->index;
if (page && sgp == SGP_WRITE)
mark_page_accessed(page);
@@ -1840,11 +1842,10 @@ repeat:
unlock_page(page);
put_page(page);
page = NULL;
+ hindex = index;
}
- if (page || sgp == SGP_READ) {
- *pagep = page;
- return 0;
- }
+ if (page || sgp == SGP_READ)
+ goto out;
/*
* Fast cache lookup did not find it:
@@ -1969,14 +1970,13 @@ clear:
* it now, lest undo on failure cancel our earlier guarantee.
*/
if (sgp != SGP_WRITE && !PageUptodate(page)) {
- struct page *head = compound_head(page);
int i;
- for (i = 0; i < compound_nr(head); i++) {
- clear_highpage(head + i);
- flush_dcache_page(head + i);
+ for (i = 0; i < compound_nr(page); i++) {
+ clear_highpage(page + i);
+ flush_dcache_page(page + i);
}
- SetPageUptodate(head);
+ SetPageUptodate(page);
}
/* Perhaps the file has been truncated since we checked */
@@ -1992,6 +1992,7 @@ clear:
error = -EINVAL;
goto unlock;
}
+out:
*pagep = page + index - hindex;
return 0;
@@ -3983,7 +3984,7 @@ static struct file_system_type shmem_fs_type = {
.parameters = shmem_fs_parameters,
#endif
.kill_sb = kill_litter_super,
- .fs_flags = FS_USERNS_MOUNT,
+ .fs_flags = FS_USERNS_MOUNT | FS_THP_SUPPORT,
};
int __init shmem_init(void)
diff --git a/mm/shuffle.c b/mm/shuffle.c
index 9b5cd4b004b0..9c2e145a747a 100644
--- a/mm/shuffle.c
+++ b/mm/shuffle.c
@@ -60,7 +60,7 @@ static struct page * __meminit shuffle_valid_page(struct zone *zone,
* ...is the page on the same list as the page we will
* shuffle it with?
*/
- if (page_order(page) != order)
+ if (buddy_order(page) != order)
return NULL;
return page;
diff --git a/mm/slab.c b/mm/slab.c
index f658e86ec8ce..c4a385d49362 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1062,7 +1062,7 @@ int slab_prepare_cpu(unsigned int cpu)
* Even if all the cpus of a node are down, we don't free the
* kmem_cache_node of any cache. This to avoid a race between cpu_down, and
* a kmalloc allocation from another cpu for memory from the node of
- * the cpu going down. The list3 structure is usually allocated from
+ * the cpu going down. The kmem_cache_node structure is usually allocated from
* kmem_cache_create() and gets destroyed at kmem_cache_destroy().
*/
int slab_dead_cpu(unsigned int cpu)
@@ -2305,8 +2305,6 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep,
/* Slab management obj is off-slab. */
freelist = kmem_cache_alloc_node(cachep->freelist_cache,
local_flags, nodeid);
- if (!freelist)
- return NULL;
} else {
/* We will use last bytes at the slab for freelist */
freelist = addr + (PAGE_SIZE << cachep->gfporder) -
diff --git a/mm/slab.h b/mm/slab.h
index 6cc323f1313a..95e5cc1bb2a3 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -46,7 +46,6 @@ struct kmem_cache {
#include <linux/kmemleak.h>
#include <linux/random.h>
#include <linux/sched/mm.h>
-#include <linux/kmemleak.h>
/*
* State of the slab allocator.
diff --git a/mm/slub.c b/mm/slub.c
index 6d3574013b2f..8f66de8a5ab3 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1956,7 +1956,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
/*
* Racy check. If we mistakenly see no partial slabs then we
* just allocate an empty slab. If we mistakenly try to get a
- * partial slab and there is none available then get_partials()
+ * partial slab and there is none available then get_partial()
* will return NULL.
*/
if (!n || !n->nr_partial)
@@ -2245,7 +2245,8 @@ redo:
}
} else {
m = M_FULL;
- if (kmem_cache_debug(s) && !lock) {
+#ifdef CONFIG_SLUB_DEBUG
+ if ((s->flags & SLAB_STORE_USER) && !lock) {
lock = 1;
/*
* This also ensures that the scanning of full
@@ -2254,6 +2255,7 @@ redo:
*/
spin_lock(&n->list_lock);
}
+#endif
}
if (l != m) {
@@ -2661,6 +2663,8 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
void *freelist;
struct page *page;
+ stat(s, ALLOC_SLOWPATH);
+
page = c->page;
if (!page) {
/*
@@ -2850,7 +2854,6 @@ redo:
page = c->page;
if (unlikely(!object || !node_match(page, node))) {
object = __slab_alloc(s, gfpflags, node, addr, c);
- stat(s, ALLOC_SLOWPATH);
} else {
void *next_object = get_freepointer_safe(s, object);
@@ -3019,20 +3022,21 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
if (likely(!n)) {
- /*
- * If we just froze the page then put it onto the
- * per cpu partial list.
- */
- if (new.frozen && !was_frozen) {
+ if (likely(was_frozen)) {
+ /*
+ * The list lock was not taken therefore no list
+ * activity can be necessary.
+ */
+ stat(s, FREE_FROZEN);
+ } else if (new.frozen) {
+ /*
+ * If we just froze the page then put it onto the
+ * per cpu partial list.
+ */
put_cpu_partial(s, page, 1);
stat(s, CPU_PARTIAL_FREE);
}
- /*
- * The list lock was not taken therefore no list
- * activity can be necessary.
- */
- if (was_frozen)
- stat(s, FREE_FROZEN);
+
return;
}
diff --git a/mm/sparse.c b/mm/sparse.c
index fcc3d176f1ea..7bd23f9d6cef 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -291,13 +291,11 @@ static void __init memory_present(int nid, unsigned long start, unsigned long en
*/
static void __init memblocks_present(void)
{
- struct memblock_region *reg;
+ unsigned long start, end;
+ int i, nid;
- for_each_memblock(memory, reg) {
- memory_present(memblock_get_region_node(reg),
- memblock_region_memory_base_pfn(reg),
- memblock_region_memory_end_pfn(reg));
- }
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid)
+ memory_present(nid, start, end);
}
/*
@@ -314,6 +312,7 @@ static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long p
return coded_mem_map;
}
+#ifdef CONFIG_MEMORY_HOTPLUG
/*
* Decode mem_map from the coded memmap
*/
@@ -323,6 +322,7 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn
coded_mem_map &= SECTION_MAP_MASK;
return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
}
+#endif /* CONFIG_MEMORY_HOTPLUG */
static void __meminit sparse_init_one_section(struct mem_section *ms,
unsigned long pnum, struct page *mem_map,
diff --git a/mm/swap.c b/mm/swap.c
index 65ef7e3525bf..0eb057141a04 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -348,7 +348,7 @@ static bool need_activate_page_drain(int cpu)
return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0;
}
-void activate_page(struct page *page)
+static void activate_page(struct page *page)
{
page = compound_head(page);
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
@@ -368,7 +368,7 @@ static inline void activate_page_drain(int cpu)
{
}
-void activate_page(struct page *page)
+static void activate_page(struct page *page)
{
pg_data_t *pgdat = page_pgdat(page);
@@ -481,9 +481,7 @@ EXPORT_SYMBOL(lru_cache_add);
* @vma: vma in which page is mapped for determining reclaimability
*
* Place @page on the inactive or unevictable LRU list, depending on its
- * evictability. Note that if the page is not evictable, it goes
- * directly back onto it's zone's unevictable list, it does NOT use a
- * per cpu pagevec.
+ * evictability.
*/
void lru_cache_add_inactive_or_unevictable(struct page *page,
struct vm_area_struct *vma)
@@ -598,11 +596,9 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
{
if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
!PageSwapCache(page) && !PageUnevictable(page)) {
- bool active = PageActive(page);
int nr_pages = thp_nr_pages(page);
- del_page_from_lru_list(page, lruvec,
- LRU_INACTIVE_ANON + active);
+ del_page_from_lru_list(page, lruvec, page_lru(page));
ClearPageActive(page);
ClearPageReferenced(page);
/*
@@ -891,6 +887,7 @@ void release_pages(struct page **pages, int nr)
locked_pgdat = NULL;
}
+ page = compound_head(page);
if (is_huge_zero_page(page))
continue;
@@ -902,7 +899,7 @@ void release_pages(struct page **pages, int nr)
}
/*
* ZONE_DEVICE pages that return 'false' from
- * put_devmap_managed_page() do not require special
+ * page_is_devmap_managed() do not require special
* processing, and instead, expect a call to
* put_page_testzero().
*/
@@ -912,7 +909,6 @@ void release_pages(struct page **pages, int nr)
}
}
- page = compound_head(page);
if (!put_page_testzero(page))
continue;
@@ -943,8 +939,6 @@ void release_pages(struct page **pages, int nr)
del_page_from_lru_list(page, lruvec, page_off_lru(page));
}
- /* Clear Active bit in case of parallel mark_page_accessed */
- __ClearPageActive(page);
__ClearPageWaiters(page);
list_add(&page->lru, &pages_to_free);
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 3e6453573a89..0357fbe70645 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -237,7 +237,7 @@ static int free_slot_cache(unsigned int cpu)
return 0;
}
-int enable_swap_slots_cache(void)
+void enable_swap_slots_cache(void)
{
mutex_lock(&swap_slots_cache_enable_mutex);
if (!swap_slot_cache_initialized) {
@@ -255,7 +255,6 @@ int enable_swap_slots_cache(void)
__reenable_swap_slots_cache();
out_unlock:
mutex_unlock(&swap_slots_cache_enable_mutex);
- return 0;
}
/* called with swap slot cache's alloc lock held */
diff --git a/mm/swap_state.c b/mm/swap_state.c
index c16eebb81d8b..ee465827420e 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -21,6 +21,7 @@
#include <linux/vmalloc.h>
#include <linux/swap_slots.h>
#include <linux/huge_mm.h>
+#include <linux/shmem_fs.h>
#include "internal.h"
/*
@@ -245,7 +246,7 @@ int add_to_swap(struct page *page)
goto fail;
/*
* Normally the page will be dirtied in unmap because its pte should be
- * dirty. A special case is MADV_FREE page. The page'e pte could have
+ * dirty. A special case is MADV_FREE page. The page's pte could have
* dirty bit cleared but the page's SwapBacked bit is still set because
* clearing the dirty bit and SwapBacked bit has no lock protected. For
* such page, unmap will not set dirty bit for it, so page reclaim will
@@ -414,6 +415,39 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
return page;
}
+/**
+ * find_get_incore_page - Find and get a page from the page or swap caches.
+ * @mapping: The address_space to search.
+ * @index: The page cache index.
+ *
+ * This differs from find_get_page() in that it will also look for the
+ * page in the swap cache.
+ *
+ * Return: The found page or %NULL.
+ */
+struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index)
+{
+ swp_entry_t swp;
+ struct swap_info_struct *si;
+ struct page *page = find_get_entry(mapping, index);
+
+ if (!page)
+ return page;
+ if (!xa_is_value(page))
+ return find_subpage(page, index);
+ if (!shmem_mapping(mapping))
+ return NULL;
+
+ swp = radix_to_swp_entry(page);
+ /* Prevent swapoff from happening to us */
+ si = get_swap_device(swp);
+ if (!si)
+ return NULL;
+ page = find_get_page(swap_address_space(swp), swp_offset(swp));
+ put_swap_device(si);
+ return page;
+}
+
struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma, unsigned long addr,
bool *new_page_allocated)
@@ -631,7 +665,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
goto skip;
/* Test swap type to make sure the dereference is safe */
- if (likely(si->flags & (SWP_BLKDEV | SWP_FS))) {
+ if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) {
struct inode *inode = si->swap_file->f_mapping->host;
if (inode_read_congested(inode))
goto skip;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index ced4635d924c..1c556c2158b6 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -324,14 +324,15 @@ static inline void cluster_set_null(struct swap_cluster_info *info)
static inline bool cluster_is_huge(struct swap_cluster_info *info)
{
- if (IS_ENABLED(CONFIG_THP_SWAP))
+ if (IS_ENABLED(CONFIG_THP_SWAP) && info)
return info->flags & CLUSTER_FLAG_HUGE;
return false;
}
static inline void cluster_clear_huge(struct swap_cluster_info *info)
{
- info->flags &= ~CLUSTER_FLAG_HUGE;
+ if (IS_ENABLED(CONFIG_THP_SWAP) && info)
+ info->flags &= ~CLUSTER_FLAG_HUGE;
}
static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
@@ -1184,7 +1185,6 @@ static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
bad_free:
pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val);
- goto out;
out:
return NULL;
}
@@ -1929,11 +1929,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
lru_cache_add_inactive_or_unevictable(page, vma);
}
swap_free(entry);
- /*
- * Move the page to the active list so it is not
- * immediately swapped out again after swapon.
- */
- activate_page(page);
out:
pte_unmap_unlock(pte, ptl);
if (page != swapcache) {
@@ -2437,7 +2432,7 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
if (ret >= 0)
sis->flags |= SWP_ACTIVATED;
if (!ret) {
- sis->flags |= SWP_FS;
+ sis->flags |= SWP_FS_OPS;
ret = add_swap_extent(sis, 0, sis->max, 0);
*span = sis->pages;
}
@@ -3348,7 +3343,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
error = inode_drain_writes(inode);
if (error) {
inode->i_flags &= ~S_SWAPFILE;
- goto bad_swap_unlock_inode;
+ goto free_swap_address_space;
}
mutex_lock(&swapon_mutex);
@@ -3373,6 +3368,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
error = 0;
goto out;
+free_swap_address_space:
+ exit_swap_address_space(p->type);
bad_swap_unlock_inode:
inode_unlock(inode);
bad_swap:
diff --git a/mm/truncate.c b/mm/truncate.c
index dd9ebc1da356..18cec39a9f53 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -168,7 +168,7 @@ void do_invalidatepage(struct page *page, unsigned int offset,
* becomes orphaned. It will be left on the LRU and may even be mapped into
* user pagetables if we're racing with filemap_fault().
*
- * We need to bale out if page->mapping is no longer equal to the original
+ * We need to bail out if page->mapping is no longer equal to the original
* mapping. This happens a) when the VM reclaimed the page while we waited on
* its lock, b) when a concurrent invalidate_mapping_pages got there first and
* c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
@@ -177,12 +177,12 @@ static void
truncate_cleanup_page(struct address_space *mapping, struct page *page)
{
if (page_mapped(page)) {
- pgoff_t nr = PageTransHuge(page) ? HPAGE_PMD_NR : 1;
+ unsigned int nr = thp_nr_pages(page);
unmap_mapping_pages(mapping, page->index, nr, false);
}
if (page_has_private(page))
- do_invalidatepage(page, 0, PAGE_SIZE);
+ do_invalidatepage(page, 0, thp_size(page));
/*
* Some filesystems seem to re-dirty the page even after
@@ -528,23 +528,8 @@ void truncate_inode_pages_final(struct address_space *mapping)
}
EXPORT_SYMBOL(truncate_inode_pages_final);
-/**
- * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
- * @mapping: the address_space which holds the pages to invalidate
- * @start: the offset 'from' which to invalidate
- * @end: the offset 'to' which to invalidate (inclusive)
- *
- * This function only removes the unlocked pages, if you want to
- * remove all the pages of one inode, you must call truncate_inode_pages.
- *
- * invalidate_mapping_pages() will not block on IO activity. It will not
- * invalidate pages which are dirty, locked, under writeback or mapped into
- * pagetables.
- *
- * Return: the number of the pages that were invalidated
- */
-unsigned long invalidate_mapping_pages(struct address_space *mapping,
- pgoff_t start, pgoff_t end)
+unsigned long __invalidate_mapping_pages(struct address_space *mapping,
+ pgoff_t start, pgoff_t end, unsigned long *nr_pagevec)
{
pgoff_t indices[PAGEVEC_SIZE];
struct pagevec pvec;
@@ -610,8 +595,13 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
* Invalidation is a hint that the page is no longer
* of interest and try to speed up its reclaim.
*/
- if (!ret)
+ if (!ret) {
deactivate_file_page(page);
+ /* It is likely on the pagevec of a remote CPU */
+ if (nr_pagevec)
+ (*nr_pagevec)++;
+ }
+
if (PageTransHuge(page))
put_page(page);
count += ret;
@@ -623,8 +613,40 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
}
return count;
}
+
+/**
+ * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
+ * @mapping: the address_space which holds the pages to invalidate
+ * @start: the offset 'from' which to invalidate
+ * @end: the offset 'to' which to invalidate (inclusive)
+ *
+ * This function only removes the unlocked pages, if you want to
+ * remove all the pages of one inode, you must call truncate_inode_pages.
+ *
+ * invalidate_mapping_pages() will not block on IO activity. It will not
+ * invalidate pages which are dirty, locked, under writeback or mapped into
+ * pagetables.
+ *
+ * Return: the number of the pages that were invalidated
+ */
+unsigned long invalidate_mapping_pages(struct address_space *mapping,
+ pgoff_t start, pgoff_t end)
+{
+ return __invalidate_mapping_pages(mapping, start, end, NULL);
+}
EXPORT_SYMBOL(invalidate_mapping_pages);
+/**
+ * This helper is similar to the above one, except that it accounts for pages
+ * that are likely on a pagevec and counts them in @nr_pagevec, which will be
+ * used by the caller.
+ */
+void invalidate_mapping_pagevec(struct address_space *mapping,
+ pgoff_t start, pgoff_t end, unsigned long *nr_pagevec)
+{
+ __invalidate_mapping_pages(mapping, start, end, nr_pagevec);
+}
+
/*
* This is like invalidate_complete_page(), except it ignores the page's
* refcount. We do this because invalidate_inode_pages2() needs stronger
diff --git a/mm/util.c b/mm/util.c
index 4e21fe7eae27..4ddb6e186dd5 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -69,7 +69,8 @@ EXPORT_SYMBOL(kstrdup);
* @s: the string to duplicate
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
*
- * Note: Strings allocated by kstrdup_const should be freed by kfree_const.
+ * Note: Strings allocated by kstrdup_const should be freed by kfree_const and
+ * must not be passed to krealloc().
*
* Return: source string if it is in .rodata section otherwise
* fallback to kstrdup.
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index be4724b916b3..04ac98bf5045 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2133,7 +2133,7 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
* It is up to the caller to do all required locking to keep the returned
* pointer valid.
*
- * Return: pointer to the found area or %NULL on faulure
+ * Return: the area descriptor on success or %NULL on failure.
*/
struct vm_struct *find_vm_area(const void *addr)
{
@@ -2154,7 +2154,7 @@ struct vm_struct *find_vm_area(const void *addr)
* This function returns the found VM area, but using it is NOT safe
* on SMP machines, except for its size or flags.
*
- * Return: pointer to the found area or %NULL on faulure
+ * Return: the area descriptor on success or %NULL on failure.
*/
struct vm_struct *remove_vm_area(const void *addr)
{
@@ -2447,7 +2447,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
if (unlikely(!page)) {
- /* Successfully allocated i pages, free them in __vunmap() */
+ /* Successfully allocated i pages, free them in __vfree() */
area->nr_pages = i;
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
goto fail;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 466fc3144fff..d848c76e035a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -699,6 +699,9 @@ void drop_slab_node(int nid)
do {
struct mem_cgroup *memcg = NULL;
+ if (fatal_signal_pending(current))
+ return;
+
freed = 0;
memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
@@ -722,8 +725,7 @@ static inline int is_page_cache_freeable(struct page *page)
* that isolated the page, the page cache and optional buffer
* heads at page->private.
*/
- int page_cache_pins = PageTransHuge(page) && PageSwapCache(page) ?
- HPAGE_PMD_NR : 1;
+ int page_cache_pins = thp_nr_pages(page);
return page_count(page) - page_has_private(page) == 1 + page_cache_pins;
}
@@ -1751,7 +1753,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
* Restrictions:
*
* (1) Must be called with an elevated refcount on the page. This is a
- * fundamentnal difference from isolate_lru_pages (which is called
+ * fundamental difference from isolate_lru_pages (which is called
* without a stable reference).
* (2) the lru_lock must not be held.
* (3) interrupts must be enabled.
@@ -1845,13 +1847,12 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
int nr_pages, nr_moved = 0;
LIST_HEAD(pages_to_free);
struct page *page;
- enum lru_list lru;
while (!list_empty(list)) {
page = lru_to_page(list);
VM_BUG_ON_PAGE(PageLRU(page), page);
+ list_del(&page->lru);
if (unlikely(!page_evictable(page))) {
- list_del(&page->lru);
spin_unlock_irq(&pgdat->lru_lock);
putback_lru_page(page);
spin_lock_irq(&pgdat->lru_lock);
@@ -1860,16 +1861,11 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
lruvec = mem_cgroup_page_lruvec(page, pgdat);
SetPageLRU(page);
- lru = page_lru(page);
-
- nr_pages = thp_nr_pages(page);
- update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
- list_move(&page->lru, &lruvec->lists[lru]);
+ add_page_to_lru_list(page, lruvec, page_lru(page));
if (put_page_testzero(page)) {
__ClearPageLRU(page);
- __ClearPageActive(page);
- del_page_from_lru_list(page, lruvec, lru);
+ del_page_from_lru_list(page, lruvec, page_off_lru(page));
if (unlikely(PageCompound(page))) {
spin_unlock_irq(&pgdat->lru_lock);
@@ -1878,6 +1874,7 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
} else
list_add(&page->lru, &pages_to_free);
} else {
+ nr_pages = thp_nr_pages(page);
nr_moved += nr_pages;
if (PageActive(page))
workingset_age_nonresident(lruvec, nr_pages);
@@ -2237,7 +2234,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
unsigned long anon_cost, file_cost, total_cost;
int swappiness = mem_cgroup_swappiness(memcg);
- u64 fraction[2];
+ u64 fraction[ANON_AND_FILE];
u64 denominator = 0; /* gcc */
enum scan_balance scan_balance;
unsigned long ap, fp;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4f7b4ee6aa12..79e5cd0abd0e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -167,8 +167,14 @@ EXPORT_SYMBOL(vm_zone_stat);
EXPORT_SYMBOL(vm_numa_stat);
EXPORT_SYMBOL(vm_node_stat);
+/* Maximum sync threshold for per-cpu vmstat counters */
#ifdef CONFIG_SMP
+#define MAX_THRESHOLD 125
+#else
+#define MAX_THRESHOLD 0
+#endif
+#ifdef CONFIG_SMP
int calculate_pressure_threshold(struct zone *zone)
{
int threshold;
@@ -186,11 +192,9 @@ int calculate_pressure_threshold(struct zone *zone)
threshold = max(1, (int)(watermark_distance / num_online_cpus()));
/*
- * Maximum threshold is 125
+ * Threshold is capped by MAX_THRESHOLD
*/
- threshold = min(125, threshold);
-
- return threshold;
+ return min(MAX_THRESHOLD, threshold);
}
int calculate_normal_threshold(struct zone *zone)
@@ -325,7 +329,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
t = __this_cpu_read(pcp->stat_threshold);
- if (unlikely(x > t || x < -t)) {
+ if (unlikely(abs(x) > t)) {
zone_page_state_add(x, zone, item);
x = 0;
}
@@ -350,7 +354,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
t = __this_cpu_read(pcp->stat_threshold);
- if (unlikely(x > t || x < -t)) {
+ if (unlikely(abs(x) > t)) {
node_page_state_add(x, pgdat, item);
x = 0;
}
@@ -511,7 +515,7 @@ static inline void mod_zone_state(struct zone *zone,
o = this_cpu_read(*p);
n = delta + o;
- if (n > t || n < -t) {
+ if (abs(n) > t) {
int os = overstep_mode * (t >> 1) ;
/* Overflow must be added to zone counters */
@@ -573,7 +577,7 @@ static inline void mod_node_state(struct pglist_data *pgdat,
o = this_cpu_read(*p);
n = delta + o;
- if (n > t || n < -t) {
+ if (abs(n) > t) {
int os = overstep_mode * (t >> 1) ;
/* Overflow must be added to node counters */
@@ -610,6 +614,7 @@ void dec_node_page_state(struct page *page, enum node_stat_item item)
}
EXPORT_SYMBOL(dec_node_page_state);
#else
+
/*
* Use interrupt disable to serialize counter updates
*/
@@ -1814,7 +1819,7 @@ static void refresh_vm_stats(struct work_struct *work)
int vmstat_refresh(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
- long val;
+ long val, max_drift;
int err;
int i;
@@ -1825,17 +1830,22 @@ int vmstat_refresh(struct ctl_table *table, int write,
* pages, immediately after running a test. /proc/sys/vm/stat_refresh,
* which can equally be echo'ed to or cat'ted from (by root),
* can be used to update the stats just before reading them.
- *
- * Oh, and since global_zone_page_state() etc. are so careful to hide
- * transiently negative values, report an error here if any of
- * the stats is negative, so we know to go looking for imbalance.
*/
err = schedule_on_each_cpu(refresh_vm_stats);
if (err)
return err;
+
+ /*
+ * Since global_zone_page_state() etc. are so careful to hide
+ * transiently negative values, report an error here if any of
+ * the stats is below the negated maximum drift value (-max_drift),
+ * so we know to go looking for imbalance.
+ */
+ max_drift = num_online_cpus() * MAX_THRESHOLD;
+
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
val = atomic_long_read(&vm_zone_stat[i]);
- if (val < 0) {
+ if (val < -max_drift) {
pr_warn("%s: %s %ld\n",
__func__, zone_stat_name(i), val);
err = -EINVAL;
@@ -1844,7 +1854,7 @@ int vmstat_refresh(struct ctl_table *table, int write,
#ifdef CONFIG_NUMA
for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
val = atomic_long_read(&vm_numa_stat[i]);
- if (val < 0) {
+ if (val < -max_drift) {
pr_warn("%s: %s %ld\n",
__func__, numa_stat_name(i), val);
err = -EINVAL;
diff --git a/mm/workingset.c b/mm/workingset.c
index 92e66113a577..8ed8e6296d8c 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -216,7 +216,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
/**
* workingset_age_nonresident - age non-resident entries as LRU ages
- * @memcg: the lruvec that was aged
+ * @lruvec: the lruvec that was aged
* @nr_pages: the number of pages to count
*
* As in-memory pages are aged, non-resident pages need to be aged as
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 460b0feced26..18feaa0bc537 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -212,13 +212,12 @@ static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
{
struct z3fold_buddy_slots *slots;
- slots = kmem_cache_alloc(pool->c_handle,
+ slots = kmem_cache_zalloc(pool->c_handle,
(gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE)));
if (slots) {
/* It will be freed separately in free_handle(). */
kmemleak_not_leak(slots);
- memset(slots->slot, 0, sizeof(slots->slot));
slots->pool = (unsigned long)pool;
rwlock_init(&slots->lock);
}
diff --git a/mm/zbud.c b/mm/zbud.c
index bc93aa4e46fc..c49966ece674 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -367,7 +367,6 @@ int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp,
spin_lock(&pool->lock);
/* First, try to find an unbuddied zbud page. */
- zhdr = NULL;
for_each_unbuddied_list(i, chunks) {
if (!list_empty(&pool->unbuddied[i])) {
zhdr = list_first_entry(&pool->unbuddied[i],