summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2017-03-10 13:14:11 +1100
committerStephen Rothwell <sfr@canb.auug.org.au>2017-03-10 13:14:13 +1100
commitbab31490ee78b5c5531bae6d54d411a49d2c2f30 (patch)
treef38d61b2da425a855b0b39d9733c879518f3b6cf
parente766a98118fe1ec0697d0cb6ad5e44b988a192d4 (diff)
parent328f8e70990e9bb7f7cb8f9e59299d5fa6ca699d (diff)
Merge branch 'akpm-current/current'
-rw-r--r--Documentation/cgroup-v2.txt5
-rw-r--r--Documentation/dev-tools/kcov.rst2
-rw-r--r--Documentation/devicetree/bindings/regulator/ti-abb-regulator.txt2
-rw-r--r--Documentation/filesystems/proc.txt6
-rw-r--r--Documentation/vm/userfaultfd.txt4
-rw-r--r--arch/arm/include/asm/page.h2
-rw-r--r--arch/cris/arch-v32/drivers/cryptocop.c2
-rw-r--r--arch/mips/kernel/perf_event_mipsxx.c2
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgtable.h85
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_hv.c2
-rw-r--r--arch/powerpc/kvm/book3s_hv_rm_mmu.c2
-rw-r--r--arch/s390/kernel/perf_cpum_sf.c2
-rw-r--r--arch/sh/boards/mach-cayman/setup.c2
-rw-r--r--arch/sparc/lib/NG4memset.S26
-rw-r--r--arch/x86/kernel/ftrace.c2
-rw-r--r--arch/x86/kernel/machine_kexec_64.c1
-rw-r--r--arch/x86/mm/gup.c37
-rw-r--r--block/genhd.c2
-rw-r--r--drivers/block/paride/pcd.c2
-rw-r--r--drivers/block/paride/pd.c2
-rw-r--r--drivers/block/paride/pf.c2
-rw-r--r--drivers/block/paride/pg.c2
-rw-r--r--drivers/block/paride/pt.c2
-rw-r--r--drivers/block/zram/zram_drv.c8
-rw-r--r--drivers/crypto/ux500/cryp/cryp.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c2
-rw-r--r--drivers/hv/channel.c2
-rw-r--r--drivers/isdn/hisax/st5481_b.c2
-rw-r--r--drivers/md/bcache/util.h1
-rw-r--r--drivers/media/dvb-frontends/drx39xyj/drx_driver.h8
-rw-r--r--drivers/mtd/spi-nor/spi-nor.c2
-rw-r--r--drivers/net/ethernet/qlogic/qlge/qlge.h4
-rw-r--r--drivers/scsi/aic7xxx/aic79xx_core.c2
-rw-r--r--drivers/usb/gadget/legacy/inode.c3
-rw-r--r--drivers/usb/host/xhci.c4
-rw-r--r--drivers/virt/fsl_hypervisor.c7
-rw-r--r--firmware/Makefile3
-rw-r--r--fs/buffer.c4
-rw-r--r--fs/dax.c48
-rw-r--r--fs/dcache.c18
-rw-r--r--fs/fat/inode.c13
-rw-r--r--fs/inode.c14
-rw-r--r--fs/internal.h4
-rw-r--r--fs/iomap.c20
-rw-r--r--fs/jbd2/journal.c9
-rw-r--r--fs/jbd2/transaction.c12
-rw-r--r--fs/namespace.c10
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c66
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c40
-rw-r--r--fs/proc/inode.c2
-rw-r--r--fs/proc/task_mmu.c17
-rw-r--r--fs/userfaultfd.c69
-rw-r--r--fs/xfs/kmem.c12
-rw-r--r--fs/xfs/kmem.h2
-rw-r--r--fs/xfs/libxfs/xfs_btree.c2
-rw-r--r--fs/xfs/xfs_aops.c6
-rw-r--r--fs/xfs/xfs_buf.c8
-rw-r--r--fs/xfs/xfs_trans.c12
-rw-r--r--include/dt-bindings/sound/cs42l42.h2
-rw-r--r--include/linux/bootmem.h2
-rw-r--r--include/linux/cpumask.h4
-rw-r--r--include/linux/fs.h2
-rw-r--r--include/linux/gfp.h18
-rw-r--r--include/linux/jbd2.h2
-rw-r--r--include/linux/kexec.h2
-rw-r--r--include/linux/memcontrol.h1
-rw-r--r--include/linux/mmu_notifier.h13
-rw-r--r--include/linux/mmzone.h10
-rw-r--r--include/linux/page-isolation.h5
-rw-r--r--include/linux/regulator/machine.h2
-rw-r--r--include/linux/rmap.h24
-rw-r--r--include/linux/rodata_test.h1
-rw-r--r--include/linux/sched.h6
-rw-r--r--include/linux/sched/mm.h26
-rw-r--r--include/linux/swap.h2
-rw-r--r--include/linux/userfaultfd_k.h13
-rw-r--r--include/linux/vm_event_item.h5
-rw-r--r--include/net/irda/timer.h2
-rw-r--r--include/trace/events/fs_dax.h130
-rw-r--r--include/uapi/linux/sysctl.h4
-rw-r--r--include/uapi/linux/userfaultfd.h5
-rw-r--r--init/initramfs.c2
-rw-r--r--kernel/cgroup/cgroup.c2
-rw-r--r--kernel/events/core.c2
-rw-r--r--kernel/exit.c1
-rw-r--r--kernel/locking/lockdep.c11
-rw-r--r--kernel/locking/qspinlock_paravirt.h3
-rw-r--r--kernel/pid.c7
-rw-r--r--kernel/taskstats.c14
-rw-r--r--kernel/trace/trace_stack.c2
-rw-r--r--lib/Kconfig.debug14
-rw-r--r--lib/Makefile1
-rw-r--r--lib/list_sort.c149
-rw-r--r--lib/radix-tree.c2
-rw-r--r--lib/scatterlist.c22
-rw-r--r--lib/test_list_sort.c150
-rw-r--r--lib/test_sort.c11
-rw-r--r--lib/vsprintf.c3
-rw-r--r--mm/compaction.c83
-rw-r--r--mm/filemap.c16
-rw-r--r--mm/huge_memory.c95
-rw-r--r--mm/internal.h36
-rw-r--r--mm/kasan/kasan.c3
-rw-r--r--mm/kasan/kasan.h2
-rw-r--r--mm/kasan/quarantine.c51
-rw-r--r--mm/kasan/report.c187
-rw-r--r--mm/khugepaged.c12
-rw-r--r--mm/madvise.c55
-rw-r--r--mm/memblock.c5
-rw-r--r--mm/memcontrol.c46
-rw-r--r--mm/memory-failure.c2
-rw-r--r--mm/memory_hotplug.c2
-rw-r--r--mm/migrate.c6
-rw-r--r--mm/mlock.c9
-rw-r--r--mm/page-writeback.c5
-rw-r--r--mm/page_alloc.c256
-rw-r--r--mm/page_isolation.c11
-rw-r--r--mm/page_owner.c4
-rw-r--r--mm/rmap.c66
-rw-r--r--mm/rodata_test.c17
-rw-r--r--mm/swap.c76
-rw-r--r--mm/swapfile.c25
-rw-r--r--mm/vmscan.c376
-rw-r--r--mm/vmstat.c89
-rwxr-xr-xscripts/checkpatch.pl50
-rw-r--r--scripts/gdb/linux/constants.py.in7
-rw-r--r--scripts/gdb/linux/proc.py73
-rw-r--r--scripts/spelling.txt3
-rw-r--r--sound/soc/amd/acp-pcm-dma.c2
-rw-r--r--tools/lguest/lguest.c2
-rw-r--r--tools/lib/bpf/Makefile2
-rw-r--r--tools/lib/traceevent/Makefile2
-rw-r--r--tools/lib/traceevent/event-parse.h2
-rw-r--r--tools/testing/selftests/vm/Makefile4
-rw-r--r--usr/Kconfig10
135 files changed, 1852 insertions, 1151 deletions
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index 49d7c997fa1e..e50b95c25868 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -871,6 +871,11 @@ PAGE_SIZE multiple when read back.
Amount of memory used in network transmission buffers
+ shmem
+
+ Amount of cached filesystem data that is swap-backed,
+ such as tmpfs, shm segments, shared anonymous mmap()s
+
file_mapped
Amount of cached filesystem data mapped with mmap()
diff --git a/Documentation/dev-tools/kcov.rst b/Documentation/dev-tools/kcov.rst
index 2c41b713841f..44886c91e112 100644
--- a/Documentation/dev-tools/kcov.rst
+++ b/Documentation/dev-tools/kcov.rst
@@ -10,7 +10,7 @@ Note that kcov does not aim to collect as much coverage as possible. It aims
to collect more or less stable coverage that is function of syscall inputs.
To achieve this goal it does not collect coverage in soft/hard interrupts
and instrumentation of some inherently non-deterministic parts of kernel is
-disbled (e.g. scheduler, locking).
+disabled (e.g. scheduler, locking).
Usage
-----
diff --git a/Documentation/devicetree/bindings/regulator/ti-abb-regulator.txt b/Documentation/devicetree/bindings/regulator/ti-abb-regulator.txt
index c3f6546ebac7..6a23ad9ac53a 100644
--- a/Documentation/devicetree/bindings/regulator/ti-abb-regulator.txt
+++ b/Documentation/devicetree/bindings/regulator/ti-abb-regulator.txt
@@ -45,7 +45,7 @@ Required Properties:
Optional Properties:
- reg-names: In addition to the required properties, the following are optional
- "efuse-address" - Contains efuse base address used to pick up ABB info.
- - "ldo-address" - Contains address of ABB LDO overide register address.
+ - "ldo-address" - Contains address of ABB LDO override register.
"efuse-address" is required for this.
- ti,ldovbb-vset-mask - Required if ldo-address is set, mask for LDO override
register to provide override vset value.
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 9036dbf16156..4cddbce85ac9 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -413,6 +413,7 @@ Private_Clean: 0 kB
Private_Dirty: 0 kB
Referenced: 892 kB
Anonymous: 0 kB
+LazyFree: 0 kB
AnonHugePages: 0 kB
ShmemPmdMapped: 0 kB
Shared_Hugetlb: 0 kB
@@ -442,6 +443,11 @@ accessed.
"Anonymous" shows the amount of memory that does not belong to any file. Even
a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE
and a page is modified, the file page is replaced by a private anonymous copy.
+"LazyFree" shows the amount of memory which is marked by madvise(MADV_FREE).
+The memory isn't freed immediately by madvise(). It's freed under memory
+pressure if the memory is clean. Please note that the printed value might
+be lower than the real value due to optimizations used in the current
+implementation. If this is not desirable, please file a bug report.
"AnonHugePages" shows the ammount of memory backed by transparent hugepage.
"ShmemPmdMapped" shows the ammount of shared (shmem/tmpfs) memory backed by
huge pages.
diff --git a/Documentation/vm/userfaultfd.txt b/Documentation/vm/userfaultfd.txt
index 0e5543a920e5..bb2f945f87ab 100644
--- a/Documentation/vm/userfaultfd.txt
+++ b/Documentation/vm/userfaultfd.txt
@@ -172,10 +172,6 @@ the same read(2) protocol as for the page fault notifications. The
manager has to explicitly enable these events by setting appropriate
bits in uffdio_api.features passed to UFFDIO_API ioctl:
-UFFD_FEATURE_EVENT_EXIT - enable notification about exit() of the
-non-cooperative process. When the monitored process exits, the uffd
-manager will get UFFD_EVENT_EXIT.
-
UFFD_FEATURE_EVENT_FORK - enable userfaultfd hooks for fork(). When
this feature is enabled, the userfaultfd context of the parent process
is duplicated into the newly created process. The manager receives
diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h
index 4355f0ec44d6..f98baaec0a15 100644
--- a/arch/arm/include/asm/page.h
+++ b/arch/arm/include/asm/page.h
@@ -17,6 +17,8 @@
#ifndef __ASSEMBLY__
+#include <linux/personality.h> /* For READ_IMPLIES_EXEC */
+
#ifndef CONFIG_MMU
#include <asm/page-nommu.h>
diff --git a/arch/cris/arch-v32/drivers/cryptocop.c b/arch/cris/arch-v32/drivers/cryptocop.c
index ae6903d7fdbe..14970f11bbf2 100644
--- a/arch/cris/arch-v32/drivers/cryptocop.c
+++ b/arch/cris/arch-v32/drivers/cryptocop.c
@@ -2086,7 +2086,7 @@ static void cryptocop_job_queue_close(void)
dma_in_cfg.en = regk_dma_no;
REG_WR(dma, IN_DMA_INST, rw_cfg, dma_in_cfg);
- /* Disble the cryptocop. */
+ /* Disable the cryptocop. */
rw_cfg = REG_RD(strcop, regi_strcop, rw_cfg);
rw_cfg.en = 0;
REG_WR(strcop, regi_strcop, rw_cfg, rw_cfg);
diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c
index 8c35b3152e1e..44b50646582d 100644
--- a/arch/mips/kernel/perf_event_mipsxx.c
+++ b/arch/mips/kernel/perf_event_mipsxx.c
@@ -618,7 +618,7 @@ static int mipspmu_event_init(struct perf_event *event)
return -ENOENT;
}
- if (event->cpu >= nr_cpumask_bits ||
+ if ((unsigned int)event->cpu >= nr_cpumask_bits ||
(event->cpu >= 0 && !cpu_online(event->cpu)))
return -ENODEV;
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 1eeeb72c7015..ec1e731e6a2d 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -347,23 +347,58 @@ static inline int __ptep_test_and_clear_young(struct mm_struct *mm,
__r; \
})
+static inline int __pte_write(pte_t pte)
+{
+ return !!(pte_raw(pte) & cpu_to_be64(_PAGE_WRITE));
+}
+
+#ifdef CONFIG_NUMA_BALANCING
+#define pte_savedwrite pte_savedwrite
+static inline bool pte_savedwrite(pte_t pte)
+{
+ /*
+ * Saved write ptes are prot none ptes that don't have the
+ * privileged bit set. We mark prot none as one which has the
+ * present and privileged bits set and RWX cleared. To mark a
+ * protnone pte which used to have _PAGE_WRITE set we clear
+ * the privileged bit.
+ */
+ return !(pte_raw(pte) & cpu_to_be64(_PAGE_RWX | _PAGE_PRIVILEGED));
+}
+#else
+#define pte_savedwrite pte_savedwrite
+static inline bool pte_savedwrite(pte_t pte)
+{
+ return false;
+}
+#endif
+
+static inline int pte_write(pte_t pte)
+{
+ return __pte_write(pte) || pte_savedwrite(pte);
+}
+
#define __HAVE_ARCH_PTEP_SET_WRPROTECT
static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
- if ((pte_raw(*ptep) & cpu_to_be64(_PAGE_WRITE)) == 0)
- return;
-
- pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 0);
+ if (__pte_write(*ptep))
+ pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 0);
+ else if (unlikely(pte_savedwrite(*ptep)))
+ pte_update(mm, addr, ptep, 0, _PAGE_PRIVILEGED, 0);
}
static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
- if ((pte_raw(*ptep) & cpu_to_be64(_PAGE_WRITE)) == 0)
- return;
-
- pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 1);
+ /*
+ * We should not find protnone for hugetlb, but this completes the
+ * interface.
+ */
+ if (__pte_write(*ptep))
+ pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 1);
+ else if (unlikely(pte_savedwrite(*ptep)))
+ pte_update(mm, addr, ptep, 0, _PAGE_PRIVILEGED, 1);
}
#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
@@ -397,11 +432,6 @@ static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
pte_update(mm, addr, ptep, ~0UL, 0, 0);
}
-static inline int pte_write(pte_t pte)
-{
- return !!(pte_raw(pte) & cpu_to_be64(_PAGE_WRITE));
-}
-
static inline int pte_dirty(pte_t pte)
{
return !!(pte_raw(pte) & cpu_to_be64(_PAGE_DIRTY));
@@ -465,19 +495,12 @@ static inline pte_t pte_clear_savedwrite(pte_t pte)
VM_BUG_ON(!pte_protnone(pte));
return __pte(pte_val(pte) | _PAGE_PRIVILEGED);
}
-
-#define pte_savedwrite pte_savedwrite
-static inline bool pte_savedwrite(pte_t pte)
+#else
+#define pte_clear_savedwrite pte_clear_savedwrite
+static inline pte_t pte_clear_savedwrite(pte_t pte)
{
- /*
- * Saved write ptes are prot none ptes that doesn't have
- * privileged bit sit. We mark prot none as one which has
- * present and pviliged bit set and RWX cleared. To mark
- * protnone which used to have _PAGE_WRITE set we clear
- * the privileged bit.
- */
- VM_BUG_ON(!pte_protnone(pte));
- return !(pte_raw(pte) & cpu_to_be64(_PAGE_RWX | _PAGE_PRIVILEGED));
+ VM_WARN_ON(1);
+ return __pte(pte_val(pte) & ~_PAGE_WRITE);
}
#endif /* CONFIG_NUMA_BALANCING */
@@ -506,6 +529,8 @@ static inline unsigned long pte_pfn(pte_t pte)
/* Generic modifiers for PTE bits */
static inline pte_t pte_wrprotect(pte_t pte)
{
+ if (unlikely(pte_savedwrite(pte)))
+ return pte_clear_savedwrite(pte);
return __pte(pte_val(pte) & ~_PAGE_WRITE);
}
@@ -926,6 +951,7 @@ static inline int pmd_protnone(pmd_t pmd)
#define __HAVE_ARCH_PMD_WRITE
#define pmd_write(pmd) pte_write(pmd_pte(pmd))
+#define __pmd_write(pmd) __pte_write(pmd_pte(pmd))
#define pmd_savedwrite(pmd) pte_savedwrite(pmd_pte(pmd))
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -982,11 +1008,10 @@ static inline int __pmdp_test_and_clear_young(struct mm_struct *mm,
static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp)
{
-
- if ((pmd_raw(*pmdp) & cpu_to_be64(_PAGE_WRITE)) == 0)
- return;
-
- pmd_hugepage_update(mm, addr, pmdp, _PAGE_WRITE, 0);
+ if (__pmd_write((*pmdp)))
+ pmd_hugepage_update(mm, addr, pmdp, _PAGE_WRITE, 0);
+ else if (unlikely(pmd_savedwrite(*pmdp)))
+ pmd_hugepage_update(mm, addr, pmdp, 0, _PAGE_PRIVILEGED);
}
static inline int pmd_trans_huge(pmd_t pmd)
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index f3158fb16de3..8c68145ba1bd 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -601,7 +601,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
hva, NULL, NULL);
if (ptep) {
pte = kvmppc_read_update_linux_pte(ptep, 1);
- if (pte_write(pte))
+ if (__pte_write(pte))
write_ok = 1;
}
local_irq_restore(flags);
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 6fca970373ee..ce6f2121fffe 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -256,7 +256,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
}
pte = kvmppc_read_update_linux_pte(ptep, writing);
if (pte_present(pte) && !pte_protnone(pte)) {
- if (writing && !pte_write(pte))
+ if (writing && !__pte_write(pte))
/* make the actual HPTE be read-only */
ptel = hpte_make_readonly(ptel);
is_ci = pte_ci(pte);
diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c
index 1c0b58545c04..534a10c00c54 100644
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -823,7 +823,7 @@ static int cpumsf_pmu_event_init(struct perf_event *event)
}
/* Check online status of the CPU to which the event is pinned */
- if (event->cpu >= nr_cpumask_bits ||
+ if ((unsigned int)event->cpu >= nr_cpumask_bits ||
(event->cpu >= 0 && !cpu_online(event->cpu)))
return -ENODEV;
diff --git a/arch/sh/boards/mach-cayman/setup.c b/arch/sh/boards/mach-cayman/setup.c
index 340fd40b381d..9c292c27e0d7 100644
--- a/arch/sh/boards/mach-cayman/setup.c
+++ b/arch/sh/boards/mach-cayman/setup.c
@@ -128,7 +128,6 @@ static int __init smsc_superio_setup(void)
SMSC_SUPERIO_WRITE_INDEXED(1, SMSC_PRIMARY_INT_INDEX);
SMSC_SUPERIO_WRITE_INDEXED(12, SMSC_SECONDARY_INT_INDEX);
-#ifdef CONFIG_IDE
/*
* Only IDE1 exists on the Cayman
*/
@@ -158,7 +157,6 @@ static int __init smsc_superio_setup(void)
SMSC_SUPERIO_WRITE_INDEXED(0x01, 0xc5); /* GP45 = IDE1_IRQ */
SMSC_SUPERIO_WRITE_INDEXED(0x00, 0xc6); /* GP46 = nIOROP */
SMSC_SUPERIO_WRITE_INDEXED(0x00, 0xc7); /* GP47 = nIOWOP */
-#endif
/* Exit the configuration state */
outb(SMSC_EXIT_CONFIG_KEY, SMSC_CONFIG_PORT_ADDR);
diff --git a/arch/sparc/lib/NG4memset.S b/arch/sparc/lib/NG4memset.S
index 41da4bdd95cb..e7c2e70df263 100644
--- a/arch/sparc/lib/NG4memset.S
+++ b/arch/sparc/lib/NG4memset.S
@@ -13,14 +13,14 @@
.globl NG4memset
NG4memset:
andcc %o1, 0xff, %o4
- be,pt %icc, 1f
+ be,pt %xcc, 1f
mov %o2, %o1
sllx %o4, 8, %g1
or %g1, %o4, %o2
sllx %o2, 16, %g1
or %g1, %o2, %o2
sllx %o2, 32, %g1
- ba,pt %icc, 1f
+ ba,pt %xcc, 1f
or %g1, %o2, %o4
.size NG4memset,.-NG4memset
@@ -29,7 +29,7 @@ NG4memset:
NG4bzero:
clr %o4
1: cmp %o1, 16
- ble %icc, .Ltiny
+ ble %xcc, .Ltiny
mov %o0, %o3
sub %g0, %o0, %g1
and %g1, 0x7, %g1
@@ -37,7 +37,7 @@ NG4bzero:
sub %o1, %g1, %o1
1: stb %o4, [%o0 + 0x00]
subcc %g1, 1, %g1
- bne,pt %icc, 1b
+ bne,pt %xcc, 1b
add %o0, 1, %o0
.Laligned8:
cmp %o1, 64 + (64 - 8)
@@ -48,7 +48,7 @@ NG4bzero:
sub %o1, %g1, %o1
1: stx %o4, [%o0 + 0x00]
subcc %g1, 8, %g1
- bne,pt %icc, 1b
+ bne,pt %xcc, 1b
add %o0, 0x8, %o0
.Laligned64:
andn %o1, 64 - 1, %g1
@@ -58,30 +58,30 @@ NG4bzero:
1: stxa %o4, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P
subcc %g1, 0x40, %g1
stxa %o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P
- bne,pt %icc, 1b
+ bne,pt %xcc, 1b
add %o0, 0x40, %o0
.Lpostloop:
cmp %o1, 8
- bl,pn %icc, .Ltiny
+ bl,pn %xcc, .Ltiny
membar #StoreStore|#StoreLoad
.Lmedium:
andn %o1, 0x7, %g1
sub %o1, %g1, %o1
1: stx %o4, [%o0 + 0x00]
subcc %g1, 0x8, %g1
- bne,pt %icc, 1b
+ bne,pt %xcc, 1b
add %o0, 0x08, %o0
andcc %o1, 0x4, %g1
- be,pt %icc, .Ltiny
+ be,pt %xcc, .Ltiny
sub %o1, %g1, %o1
stw %o4, [%o0 + 0x00]
add %o0, 0x4, %o0
.Ltiny:
cmp %o1, 0
- be,pn %icc, .Lexit
+ be,pn %xcc, .Lexit
1: subcc %o1, 1, %o1
stb %o4, [%o0 + 0x00]
- bne,pt %icc, 1b
+ bne,pt %xcc, 1b
add %o0, 1, %o0
.Lexit:
retl
@@ -99,7 +99,7 @@ NG4bzero:
stxa %o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P
stxa %o4, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P
stxa %o4, [%o0 + %o5] ASI_BLK_INIT_QUAD_LDD_P
- bne,pt %icc, 1b
+ bne,pt %xcc, 1b
add %o0, 0x30, %o0
- ba,a,pt %icc, .Lpostloop
+ ba,a,pt %xcc, .Lpostloop
.size NG4bzero,.-NG4bzero
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 8639bb2ae058..8f3d9cf26ff9 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -535,7 +535,7 @@ static void run_sync(void)
{
int enable_irqs = irqs_disabled();
- /* We may be called with interrupts disbled (on bootup). */
+ /* We may be called with interrupts disabled (on bootup). */
if (enable_irqs)
local_irq_enable();
on_each_cpu(do_sync_core, NULL, 1);
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 307b1f4543de..2e3c34b1df37 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -338,6 +338,7 @@ void arch_crash_save_vmcoreinfo(void)
vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
kaslr_offset());
VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
+ VMCOREINFO_PHYS_BASE(phys_base);
}
/* arch-dependent functionality related to kexec file-based syscall */
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 99c7805a9693..1f3b6ef105cd 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -106,32 +106,35 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
struct dev_pagemap *pgmap = NULL;
- int nr_start = *nr;
- pte_t *ptep;
+ int nr_start = *nr, ret = 0;
+ pte_t *ptep, *ptem;
- ptep = pte_offset_map(&pmd, addr);
+ /*
+ * Keep the original mapped PTE value (ptem) around since we
+ * might increment ptep off the end of the page when finishing
+ * our loop iteration.
+ */
+ ptem = ptep = pte_offset_map(&pmd, addr);
do {
pte_t pte = gup_get_pte(ptep);
struct page *page;
/* Similar to the PMD case, NUMA hinting must take slow path */
- if (pte_protnone(pte)) {
- pte_unmap(ptep);
- return 0;
- }
+ if (pte_protnone(pte))
+ break;
+
+ if (!pte_allows_gup(pte_val(pte), write))
+ break;
if (pte_devmap(pte)) {
pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
if (unlikely(!pgmap)) {
undo_dev_pagemap(nr, nr_start, pages);
- pte_unmap(ptep);
- return 0;
+ break;
}
- } else if (!pte_allows_gup(pte_val(pte), write) ||
- pte_special(pte)) {
- pte_unmap(ptep);
- return 0;
- }
+ } else if (pte_special(pte))
+ break;
+
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
get_page(page);
@@ -141,9 +144,11 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
(*nr)++;
} while (ptep++, addr += PAGE_SIZE, addr != end);
- pte_unmap(ptep - 1);
+ if (addr == end)
+ ret = 1;
+ pte_unmap(ptem);
- return 1;
+ return ret;
}
static inline void get_head_page_multiple(struct page *page, int nr)
diff --git a/block/genhd.c b/block/genhd.c
index b26a5ea115d0..7df3f76607a4 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -906,7 +906,7 @@ static int show_partition(struct seq_file *seqf, void *v)
char buf[BDEVNAME_SIZE];
/* Don't show non-partitionable removeable devices or empty devices */
- if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
+ if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) &&
(sgp->flags & GENHD_FL_REMOVABLE)))
return 0;
if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index 10aed84244f5..939641d6e262 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -50,7 +50,7 @@
the slower the port i/o. In some cases, setting
this to zero will speed up the device. (default -1)
- major You may use this parameter to overide the
+ major You may use this parameter to override the
default major number (46) that this driver
will use. Be sure to change the device
name as well.
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index 644ba0888bd4..9cfd2e06a649 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -61,7 +61,7 @@
first drive found.
- major You may use this parameter to overide the
+ major You may use this parameter to override the
default major number (45) that this driver
will use. Be sure to change the device
name as well.
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index ed93e8badf56..14c5d32f5d8b 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -59,7 +59,7 @@
the slower the port i/o. In some cases, setting
this to zero will speed up the device. (default -1)
- major You may use this parameter to overide the
+ major You may use this parameter to override the
default major number (47) that this driver
will use. Be sure to change the device
name as well.
diff --git a/drivers/block/paride/pg.c b/drivers/block/paride/pg.c
index 5db955fe3a94..3b5882bfb736 100644
--- a/drivers/block/paride/pg.c
+++ b/drivers/block/paride/pg.c
@@ -84,7 +84,7 @@
the slower the port i/o. In some cases, setting
this to zero will speed up the device. (default -1)
- major You may use this parameter to overide the
+ major You may use this parameter to override the
default major number (97) that this driver
will use. Be sure to change the device
name as well.
diff --git a/drivers/block/paride/pt.c b/drivers/block/paride/pt.c
index 61fc6824299a..e815312a00ad 100644
--- a/drivers/block/paride/pt.c
+++ b/drivers/block/paride/pt.c
@@ -61,7 +61,7 @@
the slower the port i/o. In some cases, setting
this to zero will speed up the device. (default -1)
- major You may use this parameter to overide the
+ major You may use this parameter to override the
default major number (96) that this driver
will use. Be sure to change the device
name as well.
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index dceb5edd1e54..01944419b1f3 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -177,15 +177,17 @@ static bool page_same_filled(void *ptr, unsigned long *element)
{
unsigned int pos;
unsigned long *page;
+ unsigned long val;
page = (unsigned long *)ptr;
+ val = page[0];
- for (pos = 0; pos < PAGE_SIZE / sizeof(*page) - 1; pos++) {
- if (page[pos] != page[pos + 1])
+ for (pos = 1; pos < PAGE_SIZE / sizeof(*page); pos++) {
+ if (val != page[pos])
return false;
}
- *element = page[pos];
+ *element = val;
return true;
}
diff --git a/drivers/crypto/ux500/cryp/cryp.c b/drivers/crypto/ux500/cryp/cryp.c
index 43a0c8a26ab0..00a16ab601cb 100644
--- a/drivers/crypto/ux500/cryp/cryp.c
+++ b/drivers/crypto/ux500/cryp/cryp.c
@@ -82,7 +82,7 @@ void cryp_activity(struct cryp_device_data *device_data,
void cryp_flush_inoutfifo(struct cryp_device_data *device_data)
{
/*
- * We always need to disble the hardware before trying to flush the
+ * We always need to disable the hardware before trying to flush the
* FIFO. This is something that isn't written in the design
* specification, but we have been informed by the hardware designers
* that this must be done.
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
index 31375bdde6f1..011800f621c6 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
@@ -788,7 +788,7 @@ static int sdma_v3_0_start(struct amdgpu_device *adev)
}
}
- /* disble sdma engine before programing it */
+ /* disable sdma engine before programing it */
sdma_v3_0_ctx_switch_enable(adev, false);
sdma_v3_0_enable(adev, false);
diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index 81a80c82f1bd..bd0d1988feb2 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -543,7 +543,7 @@ static int vmbus_close_internal(struct vmbus_channel *channel)
/*
* In case a device driver's probe() fails (e.g.,
* util_probe() -> vmbus_open() returns -ENOMEM) and the device is
- * rescinded later (e.g., we dynamically disble an Integrated Service
+ * rescinded later (e.g., we dynamically disable an Integrated Service
* in Hyper-V Manager), the driver's remove() invokes vmbus_close():
* here we should skip most of the below cleanup work.
*/
diff --git a/drivers/isdn/hisax/st5481_b.c b/drivers/isdn/hisax/st5481_b.c
index 409849165838..f64a36007800 100644
--- a/drivers/isdn/hisax/st5481_b.c
+++ b/drivers/isdn/hisax/st5481_b.c
@@ -239,7 +239,7 @@ static void st5481B_mode(struct st5481_bcs *bcs, int mode)
}
}
} else {
- // Disble B channel interrupts
+ // Disable B channel interrupts
st5481_usb_device_ctrl_msg(adapter, FFMSK_B1+(bcs->channel * 2), 0, NULL, NULL);
// Disable B channel FIFOs
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index a126919ed102..5d13930f0f22 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -4,7 +4,6 @@
#include <linux/blkdev.h>
#include <linux/errno.h>
-#include <linux/blkdev.h>
#include <linux/kernel.h>
#include <linux/sched/clock.h>
#include <linux/llist.h>
diff --git a/drivers/media/dvb-frontends/drx39xyj/drx_driver.h b/drivers/media/dvb-frontends/drx39xyj/drx_driver.h
index 7a681d8202c7..4442e478db72 100644
--- a/drivers/media/dvb-frontends/drx39xyj/drx_driver.h
+++ b/drivers/media/dvb-frontends/drx39xyj/drx_driver.h
@@ -256,8 +256,7 @@ int drxbsp_tuner_default_i2c_write_read(struct tuner_instance *tuner,
*
* The actual DAP implementation may be restricted to only one of the modes.
* A compiler warning or error will be generated if the DAP implementation
-* overides or cannot handle the mode defined below.
-*
+* overrides or cannot handle the mode defined below.
*/
#ifndef DRXDAP_SINGLE_MASTER
#define DRXDAP_SINGLE_MASTER 1
@@ -272,7 +271,7 @@ int drxbsp_tuner_default_i2c_write_read(struct tuner_instance *tuner,
*
* This maximum size may be restricted by the actual DAP implementation.
* A compiler warning or error will be generated if the DAP implementation
-* overides or cannot handle the chunksize defined below.
+* overrides or cannot handle the chunksize defined below.
*
* Beware that the DAP uses DRXDAP_MAX_WCHUNKSIZE to create a temporary data
* buffer. Do not undefine or choose too large, unless your system is able to
@@ -292,8 +291,7 @@ int drxbsp_tuner_default_i2c_write_read(struct tuner_instance *tuner,
*
* This maximum size may be restricted by the actual DAP implementation.
* A compiler warning or error will be generated if the DAP implementation
-* overides or cannot handle the chunksize defined below.
-*
+* overrides or cannot handle the chunksize defined below.
*/
#ifndef DRXDAP_MAX_RCHUNKSIZE
#define DRXDAP_MAX_RCHUNKSIZE 60
diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c
index 561e46de8faa..03b1ffbb6a27 100644
--- a/drivers/mtd/spi-nor/spi-nor.c
+++ b/drivers/mtd/spi-nor/spi-nor.c
@@ -186,7 +186,7 @@ static inline int write_enable(struct spi_nor *nor)
}
/*
- * Send write disble instruction to the chip.
+ * Send write disable instruction to the chip.
*/
static inline int write_disable(struct spi_nor *nor)
{
diff --git a/drivers/net/ethernet/qlogic/qlge/qlge.h b/drivers/net/ethernet/qlogic/qlge/qlge.h
index 6d31f92ef2b6..84ac50f92c9c 100644
--- a/drivers/net/ethernet/qlogic/qlge/qlge.h
+++ b/drivers/net/ethernet/qlogic/qlge/qlge.h
@@ -1162,8 +1162,8 @@ struct ob_mac_tso_iocb_rsp {
struct ib_mac_iocb_rsp {
u8 opcode; /* 0x20 */
u8 flags1;
-#define IB_MAC_IOCB_RSP_OI 0x01 /* Overide intr delay */
-#define IB_MAC_IOCB_RSP_I 0x02 /* Disble Intr Generation */
+#define IB_MAC_IOCB_RSP_OI 0x01 /* Override intr delay */
+#define IB_MAC_IOCB_RSP_I 0x02 /* Disable Intr Generation */
#define IB_MAC_CSUM_ERR_MASK 0x1c /* A mask to use for csum errs */
#define IB_MAC_IOCB_RSP_TE 0x04 /* Checksum error */
#define IB_MAC_IOCB_RSP_NU 0x08 /* No checksum rcvd */
diff --git a/drivers/scsi/aic7xxx/aic79xx_core.c b/drivers/scsi/aic7xxx/aic79xx_core.c
index 109e2c99e6c1..95d8f25cbcca 100644
--- a/drivers/scsi/aic7xxx/aic79xx_core.c
+++ b/drivers/scsi/aic7xxx/aic79xx_core.c
@@ -6278,7 +6278,7 @@ ahd_reset(struct ahd_softc *ahd, int reinit)
* does not disable its parity logic prior to
* the start of the reset. This may cause a
* parity error to be detected and thus a
- * spurious SERR or PERR assertion. Disble
+ * spurious SERR or PERR assertion. Disable
* PERR and SERR responses during the CHIPRST.
*/
mod_cmd = cmd & ~(PCIM_CMD_PERRESPEN|PCIM_CMD_SERRESPEN);
diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c
index 0513dfa008e6..a2c916869293 100644
--- a/drivers/usb/gadget/legacy/inode.c
+++ b/drivers/usb/gadget/legacy/inode.c
@@ -84,8 +84,7 @@ static int ep_open(struct inode *, struct file *);
/* /dev/gadget/$CHIP represents ep0 and the whole device */
enum ep0_state {
- /* DISBLED is the initial state.
- */
+ /* DISABLED is the initial state. */
STATE_DEV_DISABLED = 0,
/* Only one open() of /dev/gadget/$CHIP; only one file tracks
diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
index 6d6c46000e56..50aee8b7718b 100644
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c
@@ -868,7 +868,7 @@ static void xhci_disable_port_wake_on_bits(struct xhci_hcd *xhci)
spin_lock_irqsave(&xhci->lock, flags);
- /* disble usb3 ports Wake bits*/
+ /* disable usb3 ports Wake bits */
port_index = xhci->num_usb3_ports;
port_array = xhci->usb3_ports;
while (port_index--) {
@@ -879,7 +879,7 @@ static void xhci_disable_port_wake_on_bits(struct xhci_hcd *xhci)
writel(t2, port_array[port_index]);
}
- /* disble usb2 ports Wake bits*/
+ /* disable usb2 ports Wake bits */
port_index = xhci->num_usb2_ports;
port_array = xhci->usb2_ports;
while (port_index--) {
diff --git a/drivers/virt/fsl_hypervisor.c b/drivers/virt/fsl_hypervisor.c
index 150ce2abf6c8..d3eca879a0a8 100644
--- a/drivers/virt/fsl_hypervisor.c
+++ b/drivers/virt/fsl_hypervisor.c
@@ -243,11 +243,8 @@ static long ioctl_memcpy(struct fsl_hv_ioctl_memcpy __user *p)
sg_list = PTR_ALIGN(sg_list_unaligned, sizeof(struct fh_sg_list));
/* Get the physical addresses of the source buffer */
- down_read(&current->mm->mmap_sem);
- num_pinned = get_user_pages(param.local_vaddr - lb_offset,
- num_pages, (param.source == -1) ? 0 : FOLL_WRITE,
- pages, NULL);
- up_read(&current->mm->mmap_sem);
+ num_pinned = get_user_pages_unlocked(param.local_vaddr - lb_offset,
+ num_pages, pages, (param.source == -1) ? 0 : FOLL_WRITE);
if (num_pinned != num_pages) {
/* get_user_pages() failed */
diff --git a/firmware/Makefile b/firmware/Makefile
index e297e1b52636..fa3e81c2a97b 100644
--- a/firmware/Makefile
+++ b/firmware/Makefile
@@ -176,7 +176,8 @@ quiet_cmd_fwbin = MK_FW $@
wordsize_deps := $(wildcard include/config/64bit.h include/config/32bit.h \
include/config/ppc32.h include/config/ppc64.h \
include/config/superh32.h include/config/superh64.h \
- include/config/x86_32.h include/config/x86_64.h)
+ include/config/x86_32.h include/config/x86_64.h \
+ firmware/Makefile)
$(patsubst %,$(obj)/%.gen.S, $(fw-shipped-y)): %: $(wordsize_deps)
$(call cmd,fwbin,$(patsubst %.gen.S,%,$@))
diff --git a/fs/buffer.c b/fs/buffer.c
index 9196f2a270da..2ba905716d34 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1938,7 +1938,7 @@ EXPORT_SYMBOL(page_zero_new_buffers);
static void
iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
- struct iomap *iomap)
+ const struct iomap *iomap)
{
loff_t offset = block << inode->i_blkbits;
@@ -1991,7 +1991,7 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
}
int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
- get_block_t *get_block, struct iomap *iomap)
+ get_block_t *get_block, const struct iomap *iomap)
{
unsigned from = pos & (PAGE_SIZE - 1);
unsigned to = from + len;
diff --git a/fs/dax.c b/fs/dax.c
index de622d4282a6..1861ef05d07b 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -546,21 +546,25 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
static int dax_load_hole(struct address_space *mapping, void **entry,
struct vm_fault *vmf)
{
+ struct inode *inode = mapping->host;
struct page *page;
int ret;
/* Hole page already exists? Return it... */
if (!radix_tree_exceptional_entry(*entry)) {
page = *entry;
- goto out;
+ goto finish_fault;
}
/* This will replace locked radix tree entry with a hole page */
page = find_or_create_page(mapping, vmf->pgoff,
vmf->gfp_mask | __GFP_ZERO);
- if (!page)
- return VM_FAULT_OOM;
- out:
+ if (!page) {
+ ret = VM_FAULT_OOM;
+ goto out;
+ }
+
+finish_fault:
vmf->page = page;
ret = finish_fault(vmf);
vmf->page = NULL;
@@ -568,8 +572,10 @@ static int dax_load_hole(struct address_space *mapping, void **entry,
if (!ret) {
/* Grab reference for PTE that is now referencing the page */
get_page(page);
- return VM_FAULT_NOPAGE;
+ ret = VM_FAULT_NOPAGE;
}
+out:
+ trace_dax_load_hole(inode, vmf, ret);
return ret;
}
@@ -838,6 +844,7 @@ static int dax_writeback_one(struct block_device *bdev,
spin_lock_irq(&mapping->tree_lock);
radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
spin_unlock_irq(&mapping->tree_lock);
+ trace_dax_writeback_one(mapping->host, index, dax.size >> PAGE_SHIFT);
unmap:
dax_unmap_atomic(bdev, &dax);
put_locked_mapping_entry(mapping, index, entry);
@@ -873,6 +880,8 @@ int dax_writeback_mapping_range(struct address_space *mapping,
start_index = wbc->range_start >> PAGE_SHIFT;
end_index = wbc->range_end >> PAGE_SHIFT;
+ trace_dax_writeback_range(inode, start_index, end_index);
+
tag_pages_for_writeback(mapping, start_index, end_index);
pagevec_init(&pvec, 0);
@@ -893,10 +902,12 @@ int dax_writeback_mapping_range(struct address_space *mapping,
ret = dax_writeback_one(bdev, mapping, indices[i],
pvec.pages[i]);
if (ret < 0)
- return ret;
+ goto out;
}
}
- return 0;
+out:
+ trace_dax_writeback_range_done(inode, start_index, end_index);
+ return ret;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
@@ -921,6 +932,7 @@ static int dax_insert_mapping(struct address_space *mapping,
return PTR_ERR(ret);
*entryp = ret;
+ trace_dax_insert_mapping(mapping->host, vmf, ret);
return vm_insert_mixed(vma, vaddr, dax.pfn);
}
@@ -932,6 +944,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf)
{
struct file *file = vmf->vma->vm_file;
struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
void *entry, **slot;
pgoff_t index = vmf->pgoff;
@@ -941,6 +954,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf)
if (entry)
put_unlocked_mapping_entry(mapping, index, entry);
spin_unlock_irq(&mapping->tree_lock);
+ trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE);
return VM_FAULT_NOPAGE;
}
radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
@@ -953,6 +967,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf)
*/
finish_mkwrite_fault(vmf);
put_locked_mapping_entry(mapping, index, entry);
+ trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE);
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
@@ -993,14 +1008,14 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
}
EXPORT_SYMBOL_GPL(__dax_zero_page_range);
-static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
+static sector_t dax_iomap_sector(const struct iomap *iomap, loff_t pos)
{
return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
}
static loff_t
dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
- struct iomap *iomap)
+ const struct iomap *iomap)
{
struct iov_iter *iter = data;
loff_t end = pos + length, done = 0;
@@ -1133,13 +1148,16 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
int vmf_ret = 0;
void *entry;
+ trace_dax_pte_fault(inode, vmf, vmf_ret);
/*
* Check whether offset isn't beyond end of file now. Caller is supposed
* to hold locks serializing us with truncate / punch hole so this is
* a reliable test.
*/
- if (pos >= i_size_read(inode))
- return VM_FAULT_SIGBUS;
+ if (pos >= i_size_read(inode)) {
+ vmf_ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
flags |= IOMAP_WRITE;
@@ -1150,8 +1168,10 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
* that we never have to deal with more than a single extent here.
*/
error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
- if (error)
- return dax_fault_return(error);
+ if (error) {
+ vmf_ret = dax_fault_return(error);
+ goto out;
+ }
if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
vmf_ret = dax_fault_return(-EIO); /* fs corruption? */
goto finish_iomap;
@@ -1235,6 +1255,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
*/
ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
}
+out:
+ trace_dax_pte_fault_done(inode, vmf, vmf_ret);
return vmf_ret;
}
diff --git a/fs/dcache.c b/fs/dcache.c
index 95d71eda8142..808ea99062c2 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -3548,8 +3548,6 @@ __setup("dhash_entries=", set_dhash_entries);
static void __init dcache_init_early(void)
{
- unsigned int loop;
-
/* If hashes are distributed across NUMA nodes, defer
* hash allocation until vmalloc space is available.
*/
@@ -3561,24 +3559,19 @@ static void __init dcache_init_early(void)
sizeof(struct hlist_bl_head),
dhash_entries,
13,
- HASH_EARLY,
+ HASH_EARLY | HASH_ZERO,
&d_hash_shift,
&d_hash_mask,
0,
0);
-
- for (loop = 0; loop < (1U << d_hash_shift); loop++)
- INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
}
static void __init dcache_init(void)
{
- unsigned int loop;
-
- /*
+ /*
* A constructor could be added for stable state like the lists,
* but it is probably not worth it because of the cache nature
- * of the dcache.
+ * of the dcache.
*/
dentry_cache = KMEM_CACHE(dentry,
SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT);
@@ -3592,14 +3585,11 @@ static void __init dcache_init(void)
sizeof(struct hlist_bl_head),
dhash_entries,
13,
- 0,
+ HASH_ZERO | HASH_ADAPT,
&d_hash_shift,
&d_hash_mask,
0,
0);
-
- for (loop = 0; loop < (1U << d_hash_shift); loop++)
- INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
}
/* SLAB cache for __getname() consumers */
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 338d2f73eb29..a2c05f2ada6d 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1359,6 +1359,16 @@ out:
return 0;
}
+static void fat_dummy_inode_init(struct inode *inode)
+{
+ /* Initialize this dummy inode to work as no-op. */
+ MSDOS_I(inode)->mmu_private = 0;
+ MSDOS_I(inode)->i_start = 0;
+ MSDOS_I(inode)->i_logstart = 0;
+ MSDOS_I(inode)->i_attrs = 0;
+ MSDOS_I(inode)->i_pos = 0;
+}
+
static int fat_read_root(struct inode *inode)
{
struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
@@ -1803,12 +1813,13 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
fat_inode = new_inode(sb);
if (!fat_inode)
goto out_fail;
- MSDOS_I(fat_inode)->i_pos = 0;
+ fat_dummy_inode_init(fat_inode);
sbi->fat_inode = fat_inode;
fsinfo_inode = new_inode(sb);
if (!fsinfo_inode)
goto out_fail;
+ fat_dummy_inode_init(fsinfo_inode);
fsinfo_inode->i_ino = MSDOS_FSINFO_INO;
sbi->fsinfo_inode = fsinfo_inode;
insert_inode_hash(fsinfo_inode);
diff --git a/fs/inode.c b/fs/inode.c
index 88110fd0b282..32c8ee454ef0 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1916,8 +1916,6 @@ __setup("ihash_entries=", set_ihash_entries);
*/
void __init inode_init_early(void)
{
- unsigned int loop;
-
/* If hashes are distributed across NUMA nodes, defer
* hash allocation until vmalloc space is available.
*/
@@ -1929,20 +1927,15 @@ void __init inode_init_early(void)
sizeof(struct hlist_head),
ihash_entries,
14,
- HASH_EARLY,
+ HASH_EARLY | HASH_ZERO,
&i_hash_shift,
&i_hash_mask,
0,
0);
-
- for (loop = 0; loop < (1U << i_hash_shift); loop++)
- INIT_HLIST_HEAD(&inode_hashtable[loop]);
}
void __init inode_init(void)
{
- unsigned int loop;
-
/* inode slab cache */
inode_cachep = kmem_cache_create("inode_cache",
sizeof(struct inode),
@@ -1960,14 +1953,11 @@ void __init inode_init(void)
sizeof(struct hlist_head),
ihash_entries,
14,
- 0,
+ HASH_ZERO | HASH_ADAPT,
&i_hash_shift,
&i_hash_mask,
0,
0);
-
- for (loop = 0; loop < (1U << i_hash_shift); loop++)
- INIT_HLIST_HEAD(&inode_hashtable[loop]);
}
void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
diff --git a/fs/internal.h b/fs/internal.h
index 11c6d89dce9c..cef253ac176c 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -42,7 +42,7 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait)
*/
extern void guard_bio_eod(int rw, struct bio *bio);
extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
- get_block_t *get_block, struct iomap *iomap);
+ get_block_t *get_block, const struct iomap *iomap);
/*
* char_dev.c
@@ -179,7 +179,7 @@ extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
* iomap support:
*/
typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
- void *data, struct iomap *iomap);
+ void *data, const struct iomap *iomap);
loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
unsigned flags, const struct iomap_ops *ops, void *data,
diff --git a/fs/iomap.c b/fs/iomap.c
index 3ca1a8e44135..0de6182e18ee 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -108,7 +108,7 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
static int
iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, struct iomap *iomap)
+ struct page **pagep, const struct iomap *iomap)
{
pgoff_t index = pos >> PAGE_SHIFT;
struct page *page;
@@ -151,7 +151,7 @@ iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
static loff_t
iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
- struct iomap *iomap)
+ const struct iomap *iomap)
{
struct iov_iter *i = data;
long status = 0;
@@ -273,7 +273,7 @@ __iomap_read_page(struct inode *inode, loff_t offset)
static loff_t
iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
- struct iomap *iomap)
+ const struct iomap *iomap)
{
long status = 0;
ssize_t written = 0;
@@ -338,7 +338,7 @@ iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
EXPORT_SYMBOL_GPL(iomap_file_dirty);
static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
- unsigned bytes, struct iomap *iomap)
+ unsigned bytes, const struct iomap *iomap)
{
struct page *page;
int status;
@@ -355,7 +355,7 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
}
static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
- struct iomap *iomap)
+ const struct iomap *iomap)
{
sector_t sector = iomap->blkno +
(((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9);
@@ -365,7 +365,7 @@ static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
static loff_t
iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
- void *data, struct iomap *iomap)
+ void *data, const struct iomap *iomap)
{
bool *did_zero = data;
loff_t written = 0;
@@ -434,7 +434,7 @@ EXPORT_SYMBOL_GPL(iomap_truncate_page);
static loff_t
iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
- void *data, struct iomap *iomap)
+ void *data, const struct iomap *iomap)
{
struct page *page = data;
int ret;
@@ -525,7 +525,7 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
static loff_t
iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
- struct iomap *iomap)
+ const struct iomap *iomap)
{
struct fiemap_ctx *ctx = data;
loff_t ret = length;
@@ -710,7 +710,7 @@ static void iomap_dio_bio_end_io(struct bio *bio)
}
static blk_qc_t
-iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
+iomap_dio_zero(struct iomap_dio *dio, const struct iomap *iomap, loff_t pos,
unsigned len)
{
struct page *page = ZERO_PAGE(0);
@@ -734,7 +734,7 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
static loff_t
iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
- void *data, struct iomap *iomap)
+ void *data, const struct iomap *iomap)
{
struct iomap_dio *dio = data;
unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 7f8f962454e5..aaaed2ba1b65 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -43,6 +43,7 @@
#include <linux/backing-dev.h>
#include <linux/bitops.h>
#include <linux/ratelimit.h>
+#include <linux/sched/mm.h>
#define CREATE_TRACE_POINTS
#include <trace/events/jbd2.h>
@@ -206,6 +207,14 @@ static int kjournald2(void *arg)
wake_up(&journal->j_wait_done_commit);
/*
+ * Make sure that no allocations from this kernel thread will ever
+ * recurse to the fs layer because we are responsible for the
+ * transaction commit and any fs involvement might get stuck waiting for
+ * the transaction commit.
+ */
+ memalloc_nofs_save();
+
+ /*
* And now, wait forever for commit wakeup events.
*/
write_lock(&journal->j_state_lock);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 5e659ee08d6a..9ee4832b6f8b 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -29,6 +29,7 @@
#include <linux/backing-dev.h>
#include <linux/bug.h>
#include <linux/module.h>
+#include <linux/sched/mm.h>
#include <trace/events/jbd2.h>
@@ -388,6 +389,11 @@ repeat:
rwsem_acquire_read(&journal->j_trans_commit_map, 0, 0, _THIS_IP_);
jbd2_journal_free_transaction(new_transaction);
+ /*
+ * Ensure that no allocations done while the transaction is open are
+ * going to recurse back to the fs layer.
+ */
+ handle->saved_alloc_context = memalloc_nofs_save();
return 0;
}
@@ -466,6 +472,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
trace_jbd2_handle_start(journal->j_fs_dev->bd_dev,
handle->h_transaction->t_tid, type,
line_no, nblocks);
+
return handle;
}
EXPORT_SYMBOL(jbd2__journal_start);
@@ -1760,6 +1767,11 @@ int jbd2_journal_stop(handle_t *handle)
if (handle->h_rsv_handle)
jbd2_journal_free_reserved(handle->h_rsv_handle);
free_and_exit:
+ /*
+ * Scope of the GFP_NOFS context is over here and so we can restore the
+ * original alloc context.
+ */
+ memalloc_nofs_restore(handle->saved_alloc_context);
jbd2_free_handle(handle);
return err;
}
diff --git a/fs/namespace.c b/fs/namespace.c
index cc1375eff88c..c8d4324857bd 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3241,7 +3241,6 @@ static void __init init_mount_tree(void)
void __init mnt_init(void)
{
- unsigned u;
int err;
mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
@@ -3250,22 +3249,17 @@ void __init mnt_init(void)
mount_hashtable = alloc_large_system_hash("Mount-cache",
sizeof(struct hlist_head),
mhash_entries, 19,
- 0,
+ HASH_ZERO,
&m_hash_shift, &m_hash_mask, 0, 0);
mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
sizeof(struct hlist_head),
mphash_entries, 19,
- 0,
+ HASH_ZERO,
&mp_hash_shift, &mp_hash_mask, 0, 0);
if (!mount_hashtable || !mountpoint_hashtable)
panic("Failed to allocate mount hash table\n");
- for (u = 0; u <= m_hash_mask; u++)
- INIT_HLIST_HEAD(&mount_hashtable[u]);
- for (u = 0; u <= mp_hash_mask; u++)
- INIT_HLIST_HEAD(&mountpoint_hashtable[u]);
-
kernfs_init();
err = sysfs_init();
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 3e04279446e8..f0072145eead 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2612,20 +2612,48 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
spin_lock(&dlm->master_lock);
ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
namelen, target, dlm->node_num);
+ if (ret == -EEXIST) {
+ if (oldmle)
+ __dlm_put_mle(oldmle);
+
+ spin_unlock(&dlm->master_lock);
+ spin_unlock(&dlm->spinlock);
+ mlog(0, "another process is already migrating it\n");
+ goto fail;
+ }
+
+ /*
+ * If an old mle is found, it should be put. If its type is BLOCK,
+ * it should be put again because it has been unhashed from the map
+ * in the function dlm_add_migration_mle.
+ * Otherwise the memory will be leaked. It will not be found again from
+ * the hash map.
+ */
+ if (oldmle) {
+ /* master is known, detach if not already detached */
+ __dlm_mle_detach_hb_events(dlm, oldmle);
+ __dlm_put_mle(oldmle);
+
+ /*
+ * If the type of the mle is BLOCK, it should be put once for
+ * release. Otherwise a memory leak may be caused because
+ * oldmle has been unhashed from the hash map and it will not
+ * be found any more.
+ */
+ if (oldmle->type == DLM_MLE_BLOCK)
+ __dlm_put_mle(oldmle);
+ }
+
/* get an extra reference on the mle.
* otherwise the assert_master from the new
* master will destroy this.
*/
dlm_get_mle_inuse(mle);
+ mle_added = 1;
+
spin_unlock(&dlm->master_lock);
spin_unlock(&dlm->spinlock);
- if (ret == -EEXIST) {
- mlog(0, "another process is already migrating it\n");
- goto fail;
- }
- mle_added = 1;
-
/*
* set the MIGRATING flag and flush asts
* if we fail after this we need to re-dirty the lockres
@@ -2642,12 +2670,6 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
}
fail:
- if (ret != -EEXIST && oldmle) {
- /* master is known, detach if not already detached */
- dlm_mle_detach_hb_events(dlm, oldmle);
- dlm_put_mle(oldmle);
- }
-
if (ret < 0) {
if (mle_added) {
dlm_mle_detach_hb_events(dlm, mle);
@@ -3182,16 +3204,24 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
if (ret < 0)
kmem_cache_free(dlm_mle_cache, mle);
+ /*
+ * If an old mle is found, it should be put. If its type is BLOCK,
+ * it should be put again because it has been unhashed from the map
+ * in the dlm_add_migration_mle().
+ * Otherwise the memory will be leaked. It will not be found again from
+ * the hash map.
+ */
+ if (oldmle) {
+ __dlm_mle_detach_hb_events(dlm, oldmle);
+ __dlm_put_mle(oldmle);
+ if (ret >= 0 && oldmle->type == DLM_MLE_BLOCK)
+ __dlm_put_mle(oldmle);
+ }
+
spin_unlock(&dlm->master_lock);
unlock:
spin_unlock(&dlm->spinlock);
- if (oldmle) {
- /* master is known, detach if not already detached */
- dlm_mle_detach_hb_events(dlm, oldmle);
- dlm_put_mle(oldmle);
- }
-
if (res)
dlm_lockres_put(res);
leave:
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 74407c6dd592..908b05942282 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2268,6 +2268,8 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
{
struct dlm_lock *lock, *next;
unsigned int freed = 0;
+ struct list_head *queue = NULL;
+ int i;
/* this node is the lockres master:
* 1) remove any stale locks for the dead node
@@ -2280,31 +2282,19 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
* to force the DLM_UNLOCK_FREE_LOCK action so as to free the locks */
/* TODO: check pending_asts, pending_basts here */
- list_for_each_entry_safe(lock, next, &res->granted, list) {
- if (lock->ml.node == dead_node) {
- list_del_init(&lock->list);
- dlm_lock_put(lock);
- /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */
- dlm_lock_put(lock);
- freed++;
- }
- }
- list_for_each_entry_safe(lock, next, &res->converting, list) {
- if (lock->ml.node == dead_node) {
- list_del_init(&lock->list);
- dlm_lock_put(lock);
- /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */
- dlm_lock_put(lock);
- freed++;
- }
- }
- list_for_each_entry_safe(lock, next, &res->blocked, list) {
- if (lock->ml.node == dead_node) {
- list_del_init(&lock->list);
- dlm_lock_put(lock);
- /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */
- dlm_lock_put(lock);
- freed++;
+ for (i = DLM_GRANTED_LIST; i <= DLM_BLOCKED_LIST; i++) {
+ queue = dlm_list_idx_to_ptr(res, i);
+ list_for_each_entry_safe(lock, next, queue, list) {
+ if (lock->ml.node == dead_node) {
+ list_del_init(&lock->list);
+ dlm_lock_put(lock);
+ /*
+ * Can't schedule DLM_UNLOCK_FREE_LOCK: do
+ * manually
+ */
+ dlm_lock_put(lock);
+ freed++;
+ }
}
}
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 2cc7a8030275..e250910cffc8 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -58,7 +58,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
struct proc_inode *ei;
struct inode *inode;
- ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL);
+ ei = kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
ei->pid = NULL;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f08bd31c1081..f0c8b33d99b1 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -441,6 +441,7 @@ struct mem_size_stats {
unsigned long private_dirty;
unsigned long referenced;
unsigned long anonymous;
+ unsigned long lazyfree;
unsigned long anonymous_thp;
unsigned long shmem_thp;
unsigned long swap;
@@ -457,8 +458,11 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
int i, nr = compound ? 1 << compound_order(page) : 1;
unsigned long size = nr * PAGE_SIZE;
- if (PageAnon(page))
+ if (PageAnon(page)) {
mss->anonymous += size;
+ if (!PageSwapBacked(page) && !dirty && !PageDirty(page))
+ mss->lazyfree += size;
+ }
mss->resident += size;
/* Accumulate the size in pages that have been accessed. */
@@ -771,6 +775,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
"Private_Dirty: %8lu kB\n"
"Referenced: %8lu kB\n"
"Anonymous: %8lu kB\n"
+ "LazyFree: %8lu kB\n"
"AnonHugePages: %8lu kB\n"
"ShmemPmdMapped: %8lu kB\n"
"Shared_Hugetlb: %8lu kB\n"
@@ -789,6 +794,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
mss.private_dirty >> 10,
mss.referenced >> 10,
mss.anonymous >> 10,
+ mss.lazyfree >> 10,
mss.anonymous_thp >> 10,
mss.shmem_thp >> 10,
mss.shared_hugetlb >> 10,
@@ -900,7 +906,14 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
- pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp);
+ pmd_t pmd = *pmdp;
+
+ /* See comment in change_huge_pmd() */
+ pmdp_invalidate(vma, addr, pmdp);
+ if (pmd_dirty(*pmdp))
+ pmd = pmd_mkdirty(pmd);
+ if (pmd_young(*pmdp))
+ pmd = pmd_mkyoung(pmd);
pmd = pmd_wrprotect(pmd);
pmd = pmd_clear_soft_dirty(pmd);
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 973607df579d..2bb1c72380f2 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -138,8 +138,6 @@ out:
* userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
* context.
* @ctx: [in] Pointer to the userfaultfd context.
- *
- * Returns: In case of success, returns not zero.
*/
static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
{
@@ -490,7 +488,7 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
* in such case.
*/
down_read(&mm->mmap_sem);
- ret = 0;
+ ret = VM_FAULT_NOPAGE;
}
}
@@ -527,10 +525,11 @@ out:
return ret;
}
-static int userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
- struct userfaultfd_wait_queue *ewq)
+static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
+ struct userfaultfd_wait_queue *ewq)
{
- int ret = 0;
+ if (WARN_ON_ONCE(current->flags & PF_EXITING))
+ goto out;
ewq->ctx = ctx;
init_waitqueue_entry(&ewq->wq, current);
@@ -547,8 +546,16 @@ static int userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
break;
if (ACCESS_ONCE(ctx->released) ||
fatal_signal_pending(current)) {
- ret = -1;
__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
+ if (ewq->msg.event == UFFD_EVENT_FORK) {
+ struct userfaultfd_ctx *new;
+
+ new = (struct userfaultfd_ctx *)
+ (unsigned long)
+ ewq->msg.arg.reserved.reserved1;
+
+ userfaultfd_ctx_put(new);
+ }
break;
}
@@ -566,9 +573,8 @@ static int userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
* ctx may go away after this if the userfault pseudo fd is
* already released.
*/
-
+out:
userfaultfd_ctx_put(ctx);
- return ret;
}
static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
@@ -626,7 +632,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
return 0;
}
-static int dup_fctx(struct userfaultfd_fork_ctx *fctx)
+static void dup_fctx(struct userfaultfd_fork_ctx *fctx)
{
struct userfaultfd_ctx *ctx = fctx->orig;
struct userfaultfd_wait_queue ewq;
@@ -636,17 +642,15 @@ static int dup_fctx(struct userfaultfd_fork_ctx *fctx)
ewq.msg.event = UFFD_EVENT_FORK;
ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;
- return userfaultfd_event_wait_completion(ctx, &ewq);
+ userfaultfd_event_wait_completion(ctx, &ewq);
}
void dup_userfaultfd_complete(struct list_head *fcs)
{
- int ret = 0;
struct userfaultfd_fork_ctx *fctx, *n;
list_for_each_entry_safe(fctx, n, fcs, list) {
- if (!ret)
- ret = dup_fctx(fctx);
+ dup_fctx(fctx);
list_del(&fctx->list);
kfree(fctx);
}
@@ -689,8 +693,7 @@ void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
userfaultfd_event_wait_completion(ctx, &ewq);
}
-void userfaultfd_remove(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
+bool userfaultfd_remove(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
struct mm_struct *mm = vma->vm_mm;
@@ -699,13 +702,11 @@ void userfaultfd_remove(struct vm_area_struct *vma,
ctx = vma->vm_userfaultfd_ctx.ctx;
if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
- return;
+ return true;
userfaultfd_ctx_get(ctx);
up_read(&mm->mmap_sem);
- *prev = NULL; /* We wait for ACK w/o the mmap semaphore */
-
msg_init(&ewq.msg);
ewq.msg.event = UFFD_EVENT_REMOVE;
@@ -714,7 +715,7 @@ void userfaultfd_remove(struct vm_area_struct *vma,
userfaultfd_event_wait_completion(ctx, &ewq);
- down_read(&mm->mmap_sem);
+ return false;
}
static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps,
@@ -775,34 +776,6 @@ void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf)
}
}
-void userfaultfd_exit(struct mm_struct *mm)
-{
- struct vm_area_struct *vma = mm->mmap;
-
- /*
- * We can do the vma walk without locking because the caller
- * (exit_mm) knows it now has exclusive access
- */
- while (vma) {
- struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
-
- if (ctx && (ctx->features & UFFD_FEATURE_EVENT_EXIT)) {
- struct userfaultfd_wait_queue ewq;
-
- userfaultfd_ctx_get(ctx);
-
- msg_init(&ewq.msg);
- ewq.msg.event = UFFD_EVENT_EXIT;
-
- userfaultfd_event_wait_completion(ctx, &ewq);
-
- ctx->features &= ~UFFD_FEATURE_EVENT_EXIT;
- }
-
- vma = vma->vm_next;
- }
-}
-
static int userfaultfd_release(struct inode *inode, struct file *file)
{
struct userfaultfd_ctx *ctx = file->private_data;
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 2dfdc62f795e..6b7b04468aa8 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -66,7 +66,7 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
void *
kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
{
- unsigned noio_flag = 0;
+ unsigned nofs_flag = 0;
void *ptr;
gfp_t lflags;
@@ -78,17 +78,17 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
* __vmalloc() will allocate data pages and auxillary structures (e.g.
* pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context
* here. Hence we need to tell memory reclaim that we are in such a
- * context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering
+ * context via PF_MEMALLOC_NOFS to prevent memory reclaim re-entering
* the filesystem here and potentially deadlocking.
*/
- if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
- noio_flag = memalloc_noio_save();
+ if (flags & KM_NOFS)
+ nofs_flag = memalloc_nofs_save();
lflags = kmem_flags_convert(flags);
ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
- if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
- memalloc_noio_restore(noio_flag);
+ if (flags & KM_NOFS)
+ memalloc_nofs_restore(nofs_flag);
return ptr;
}
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 689f746224e7..ae08cfd9552a 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -50,7 +50,7 @@ kmem_flags_convert(xfs_km_flags_t flags)
lflags = GFP_ATOMIC | __GFP_NOWARN;
} else {
lflags = GFP_KERNEL | __GFP_NOWARN;
- if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
+ if (flags & KM_NOFS)
lflags &= ~__GFP_FS;
}
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index c3decedc9455..3059a3ec7ecb 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -2886,7 +2886,7 @@ xfs_btree_split_worker(
struct xfs_btree_split_args *args = container_of(work,
struct xfs_btree_split_args, work);
unsigned long pflags;
- unsigned long new_pflags = PF_FSTRANS;
+ unsigned long new_pflags = PF_MEMALLOC_NOFS;
/*
* we are in a transaction context here, but may also be doing work
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index bf65a9ea8642..330c6019120e 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -189,7 +189,7 @@ xfs_setfilesize_trans_alloc(
* We hand off the transaction to the completion thread now, so
* clear the flag here.
*/
- current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
return 0;
}
@@ -252,7 +252,7 @@ xfs_setfilesize_ioend(
* thus we need to mark ourselves as being in a transaction manually.
* Similarly for freeze protection.
*/
- current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
/* we abort the update if there was an IO error */
@@ -1021,7 +1021,7 @@ xfs_do_writepage(
* Given that we do not allow direct reclaim to call us, we should
* never be called while in a filesystem transaction.
*/
- if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
+ if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
goto redirty;
/*
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index b6208728ba39..ca09061369cb 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -443,17 +443,17 @@ _xfs_buf_map_pages(
bp->b_addr = NULL;
} else {
int retried = 0;
- unsigned noio_flag;
+ unsigned nofs_flag;
/*
* vm_map_ram() will allocate auxillary structures (e.g.
* pagetables) with GFP_KERNEL, yet we are likely to be under
* GFP_NOFS context here. Hence we need to tell memory reclaim
- * that we are in such a context via PF_MEMALLOC_NOIO to prevent
+ * that we are in such a context via PF_MEMALLOC_NOFS to prevent
* memory reclaim re-entering the filesystem here and
* potentially deadlocking.
*/
- noio_flag = memalloc_noio_save();
+ nofs_flag = memalloc_nofs_save();
do {
bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
-1, PAGE_KERNEL);
@@ -461,7 +461,7 @@ _xfs_buf_map_pages(
break;
vm_unmap_aliases();
} while (retried++ <= 1);
- memalloc_noio_restore(noio_flag);
+ memalloc_nofs_restore(nofs_flag);
if (!bp->b_addr)
return -ENOMEM;
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 70f42ea86dfb..f5969c8274fc 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -134,7 +134,7 @@ xfs_trans_reserve(
bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
/* Mark this thread as being in a transaction */
- current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
/*
* Attempt to reserve the needed disk blocks by decrementing
@@ -144,7 +144,7 @@ xfs_trans_reserve(
if (blocks > 0) {
error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd);
if (error != 0) {
- current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
return -ENOSPC;
}
tp->t_blk_res += blocks;
@@ -221,7 +221,7 @@ undo_blocks:
tp->t_blk_res = 0;
}
- current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
return error;
}
@@ -914,7 +914,7 @@ __xfs_trans_commit(
xfs_log_commit_cil(mp, tp, &commit_lsn, regrant);
- current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
xfs_trans_free(tp);
/*
@@ -944,7 +944,7 @@ out_unreserve:
if (commit_lsn == -1 && !error)
error = -EIO;
}
- current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
xfs_trans_free_items(tp, NULLCOMMITLSN, !!error);
xfs_trans_free(tp);
@@ -998,7 +998,7 @@ xfs_trans_cancel(
xfs_log_done(mp, tp->t_ticket, NULL, false);
/* mark this thread as no longer being in a transaction */
- current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
xfs_trans_free_items(tp, NULLCOMMITLSN, dirty);
xfs_trans_free(tp);
diff --git a/include/dt-bindings/sound/cs42l42.h b/include/dt-bindings/sound/cs42l42.h
index 399a123aed58..db69d84ed7d1 100644
--- a/include/dt-bindings/sound/cs42l42.h
+++ b/include/dt-bindings/sound/cs42l42.h
@@ -20,7 +20,7 @@
#define CS42L42_HPOUT_LOAD_1NF 0
#define CS42L42_HPOUT_LOAD_10NF 1
-/* HPOUT Clamp to GND Overide */
+/* HPOUT Clamp to GND Override */
#define CS42L42_HPOUT_CLAMP_EN 0
#define CS42L42_HPOUT_CLAMP_DIS 1
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 962164d36506..dbaf312b3317 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -358,6 +358,8 @@ extern void *alloc_large_system_hash(const char *tablename,
#define HASH_EARLY 0x00000001 /* Allocating during early boot? */
#define HASH_SMALL 0x00000002 /* sub-page allocation allowed, min
* shift passed via *_hash_shift */
+#define HASH_ZERO 0x00000004 /* Zero allocated hash table */
+#define HASH_ADAPT 0x00000008 /* Adaptive scale for large memory */
/* Only NUMA needs hash distribution. 64bit NUMA architectures have
* sufficient vmalloc space.
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 96f1e88b767c..a3ba193f042e 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -40,9 +40,9 @@ extern int nr_cpu_ids;
#ifdef CONFIG_CPUMASK_OFFSTACK
/* Assuming NR_CPUS is huge, a runtime limit is more efficient. Also,
* not all bits may be allocated. */
-#define nr_cpumask_bits nr_cpu_ids
+#define nr_cpumask_bits ((unsigned int)nr_cpu_ids)
#else
-#define nr_cpumask_bits NR_CPUS
+#define nr_cpumask_bits ((unsigned int)NR_CPUS)
#endif
/*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 79fc59b2db0d..f1d7347ef49f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2681,7 +2681,7 @@ static const char * const kernel_read_file_str[] = {
static inline const char *kernel_read_file_id_str(enum kernel_read_file_id id)
{
- if (id < 0 || id >= READING_MAX_ID)
+ if ((unsigned)id >= READING_MAX_ID)
return kernel_read_file_str[READING_UNKNOWN];
return kernel_read_file_str[id];
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index db373b9d3223..2bfcfd33e476 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -40,6 +40,11 @@ struct vm_area_struct;
#define ___GFP_DIRECT_RECLAIM 0x400000u
#define ___GFP_WRITE 0x800000u
#define ___GFP_KSWAPD_RECLAIM 0x1000000u
+#ifdef CONFIG_LOCKDEP
+#define ___GFP_NOLOCKDEP 0x2000000u /* bit 25: first bit above ___GFP_KSWAPD_RECLAIM, inside __GFP_BITS_MASK */
+#else
+#define ___GFP_NOLOCKDEP 0
+#endif
/* If the above are modified, __GFP_BITS_SHIFT may need updating */
/*
@@ -179,8 +184,11 @@ struct vm_area_struct;
#define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK)
#define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
+/* Disable lockdep for GFP context tracking */
+#define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
+
/* Room for N __GFP_FOO bits */
-#define __GFP_BITS_SHIFT 25
+#define __GFP_BITS_SHIFT (25 + IS_ENABLED(CONFIG_LOCKDEP))
#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
/*
@@ -202,8 +210,16 @@ struct vm_area_struct;
*
* GFP_NOIO will use direct reclaim to discard clean pages or slab pages
* that do not require the starting of any physical IO.
+ * Please try to avoid using this flag directly and instead use
+ * memalloc_noio_{save,restore} to mark the whole scope which cannot
+ * perform any IO with a short explanation why. All allocation requests
+ * will inherit GFP_NOIO implicitly.
*
* GFP_NOFS will use direct reclaim but will not use any filesystem interfaces.
+ * Please try to avoid using this flag directly and instead use
+ * memalloc_nofs_{save,restore} to mark the whole scope which cannot/shouldn't
+ * recurse into the FS layer with a short explanation why. All allocation
+ * requests will inherit GFP_NOFS implicitly.
*
* GFP_USER is for userspace allocations that also need to be directly
* accessibly by the kernel or hardware. It is typically used by hardware
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index dfaa1f4dcb0c..606b6bce3a5b 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -491,6 +491,8 @@ struct jbd2_journal_handle
unsigned long h_start_jiffies;
unsigned int h_requested_credits;
+
+ unsigned int saved_alloc_context;
};
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index d419d0e51fe5..e98e546b543c 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -283,6 +283,8 @@ phys_addr_t paddr_vmcoreinfo_note(void);
vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name)
#define VMCOREINFO_CONFIG(name) \
vmcoreinfo_append_str("CONFIG_%s=y\n", #name)
+#define VMCOREINFO_PHYS_BASE(value) \
+ vmcoreinfo_append_str("PHYS_BASE=%lx\n", (unsigned long)(value)) /* cast covers the whole argument expression */
extern struct kimage *kexec_image;
extern struct kimage *kexec_crash_image;
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 5af377303880..baa274150210 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -46,6 +46,7 @@ enum mem_cgroup_stat_index {
MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */
+ MEM_CGROUP_STAT_SHMEM, /* # of pages charged as shmem */
MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
MEM_CGROUP_STAT_DIRTY, /* # of dirty pages in page cache */
MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 51891fb0d3ce..c91b3bcd158f 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -394,18 +394,6 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
___pud; \
})
-#define pmdp_huge_get_and_clear_notify(__mm, __haddr, __pmd) \
-({ \
- unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \
- pmd_t ___pmd; \
- \
- ___pmd = pmdp_huge_get_and_clear(__mm, __haddr, __pmd); \
- mmu_notifier_invalidate_range(__mm, ___haddr, \
- ___haddr + HPAGE_PMD_SIZE); \
- \
- ___pmd; \
-})
-
/*
* set_pte_at_notify() sets the pte _after_ running the notifier.
* This is safe to start by updating the secondary MMUs, because the primary MMU
@@ -489,7 +477,6 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
#define ptep_clear_flush_notify ptep_clear_flush
#define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush
#define pudp_huge_clear_flush_notify pudp_huge_clear_flush
-#define pmdp_huge_get_and_clear_notify pmdp_huge_get_and_clear
#define set_pte_at_notify set_pte_at
#endif /* CONFIG_MMU_NOTIFIER */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 8e02b3750fe0..618499159a7c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -35,7 +35,7 @@
*/
#define PAGE_ALLOC_COSTLY_ORDER 3
-enum {
+enum migratetype {
MIGRATE_UNMOVABLE,
MIGRATE_MOVABLE,
MIGRATE_RECLAIMABLE,
@@ -74,6 +74,11 @@ extern char * const migratetype_names[MIGRATE_TYPES];
# define is_migrate_cma_page(_page) false
#endif
+static inline bool is_migrate_movable(int mt)
+{
+ return is_migrate_cma(mt) || mt == MIGRATE_MOVABLE;
+}
+
#define for_each_migratetype_order(order, type) \
for (order = 0; order < MAX_ORDER; order++) \
for (type = 0; type < MIGRATE_TYPES; type++)
@@ -149,7 +154,6 @@ enum node_stat_item {
NR_UNEVICTABLE, /* " " " " " */
NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */
NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */
- NR_PAGES_SCANNED, /* pages scanned since last reclaim */
WORKINGSET_REFAULT,
WORKINGSET_ACTIVATE,
WORKINGSET_NODERECLAIM,
@@ -630,6 +634,8 @@ typedef struct pglist_data {
int kswapd_order;
enum zone_type kswapd_classzone_idx;
+ int kswapd_failures; /* Number of 'reclaimed == 0' runs */
+
#ifdef CONFIG_COMPACTION
int kcompactd_max_order;
enum zone_type kcompactd_classzone_idx;
diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 047d64706f2a..d4cd2014fa6f 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -33,10 +33,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
bool skip_hwpoisoned_pages);
void set_pageblock_migratetype(struct page *page, int migratetype);
int move_freepages_block(struct zone *zone, struct page *page,
- int migratetype);
-int move_freepages(struct zone *zone,
- struct page *start_page, struct page *end_page,
- int migratetype);
+ int migratetype, int *num_movable);
/*
* Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE.
diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h
index ad3e5158e586..c9f795e9a2ee 100644
--- a/include/linux/regulator/machine.h
+++ b/include/linux/regulator/machine.h
@@ -65,7 +65,7 @@ struct regulator_state {
int uV; /* suspend voltage */
unsigned int mode; /* suspend regulator operating mode */
int enabled; /* is regulator enabled in this suspend state */
- int disabled; /* is the regulator disbled in this suspend state */
+ int disabled; /* is the regulator disabled in this suspend state */
};
/**
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 8c89e902df3e..fee10d744ebd 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -83,19 +83,17 @@ struct anon_vma_chain {
};
enum ttu_flags {
- TTU_UNMAP = 1, /* unmap mode */
- TTU_MIGRATION = 2, /* migration mode */
- TTU_MUNLOCK = 4, /* munlock mode */
- TTU_LZFREE = 8, /* lazy free mode */
- TTU_SPLIT_HUGE_PMD = 16, /* split huge PMD if any */
-
- TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */
- TTU_IGNORE_ACCESS = (1 << 9), /* don't age */
- TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */
- TTU_BATCH_FLUSH = (1 << 11), /* Batch TLB flushes where possible
+ TTU_MIGRATION = 0x1, /* migration mode */
+ TTU_MUNLOCK = 0x2, /* munlock mode */
+
+ TTU_SPLIT_HUGE_PMD = 0x4, /* split huge PMD if any */
+ TTU_IGNORE_MLOCK = 0x8, /* ignore mlock */
+ TTU_IGNORE_ACCESS = 0x10, /* don't age */
+ TTU_IGNORE_HWPOISON = 0x20, /* corrupted page is recoverable */
+ TTU_BATCH_FLUSH = 0x40, /* Batch TLB flushes where possible
* and caller guarantees they will
* do a final flush if necessary */
- TTU_RMAP_LOCKED = (1 << 12) /* do not grab rmap lock:
+ TTU_RMAP_LOCKED = 0x80 /* do not grab rmap lock:
* caller holds it */
};
@@ -193,8 +191,6 @@ static inline void page_dup_rmap(struct page *page, bool compound)
int page_referenced(struct page *, int is_locked,
struct mem_cgroup *memcg, unsigned long *vm_flags);
-#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)
-
int try_to_unmap(struct page *, enum ttu_flags flags);
/* Avoid racy checks */
@@ -302,6 +298,6 @@ static inline int page_mkclean(struct page *page)
#define SWAP_AGAIN 1
#define SWAP_FAIL 2
#define SWAP_MLOCK 3
-#define SWAP_LZFREE 4
+#define SWAP_DIRTY 4
#endif /* _LINUX_RMAP_H */
diff --git a/include/linux/rodata_test.h b/include/linux/rodata_test.h
index ea05f6c51413..84766bcdd01f 100644
--- a/include/linux/rodata_test.h
+++ b/include/linux/rodata_test.h
@@ -14,7 +14,6 @@
#define _RODATA_TEST_H
#ifdef CONFIG_DEBUG_RODATA_TEST
-extern const int rodata_test_data;
void rodata_test(void);
#else
static inline void rodata_test(void) {}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 19b74f7c137d..6850e47145b8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1214,9 +1214,9 @@ extern struct pid *cad_pid;
#define PF_USED_ASYNC 0x00004000 /* Used async_schedule*(), used by module init */
#define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */
#define PF_FROZEN 0x00010000 /* Frozen for system suspend */
-#define PF_FSTRANS 0x00020000 /* Inside a filesystem transaction */
-#define PF_KSWAPD 0x00040000 /* I am kswapd */
-#define PF_MEMALLOC_NOIO 0x00080000 /* Allocating memory without IO involved */
+#define PF_KSWAPD 0x00020000 /* I am kswapd */
+#define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */
+#define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */
#define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
#define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 830953ebb391..9daabe138c99 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -149,13 +149,21 @@ static inline bool in_vfork(struct task_struct *tsk)
return ret;
}
-/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags
- * __GFP_FS is also cleared as it implies __GFP_IO.
+/*
+ * Applies per-task gfp context to the given allocation flags.
+ * PF_MEMALLOC_NOIO implies GFP_NOIO
+ * PF_MEMALLOC_NOFS implies GFP_NOFS
*/
-static inline gfp_t memalloc_noio_flags(gfp_t flags)
+static inline gfp_t current_gfp_context(gfp_t flags)
{
+ /*
+ * NOIO implies both NOIO and NOFS and it is a weaker context
+ * so always make sure it takes precedence
+ */
if (unlikely(current->flags & PF_MEMALLOC_NOIO))
flags &= ~(__GFP_IO | __GFP_FS);
+ else if (unlikely(current->flags & PF_MEMALLOC_NOFS))
+ flags &= ~__GFP_FS;
return flags;
}
@@ -171,4 +179,16 @@ static inline void memalloc_noio_restore(unsigned int flags)
current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags;
}
+static inline unsigned int memalloc_nofs_save(void)
+{
+ unsigned int flags = current->flags & PF_MEMALLOC_NOFS;
+ current->flags |= PF_MEMALLOC_NOFS;
+ return flags;
+}
+
+static inline void memalloc_nofs_restore(unsigned int flags)
+{
+ current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags;
+}
+
#endif /* _LINUX_SCHED_MM_H */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 45e91dd6716d..486494e6b2fc 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -279,7 +279,7 @@ extern void lru_add_drain_cpu(int cpu);
extern void lru_add_drain_all(void);
extern void rotate_reclaimable_page(struct page *page);
extern void deactivate_file_page(struct page *page);
-extern void deactivate_page(struct page *page);
+extern void mark_page_lazyfree(struct page *page);
extern void swap_setup(void);
extern void add_page_to_unevictable_list(struct page *page);
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 0468548acebf..48a3483dccb1 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -61,8 +61,7 @@ extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *,
unsigned long from, unsigned long to,
unsigned long len);
-extern void userfaultfd_remove(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
+extern bool userfaultfd_remove(struct vm_area_struct *vma,
unsigned long start,
unsigned long end);
@@ -72,8 +71,6 @@ extern int userfaultfd_unmap_prep(struct vm_area_struct *vma,
extern void userfaultfd_unmap_complete(struct mm_struct *mm,
struct list_head *uf);
-extern void userfaultfd_exit(struct mm_struct *mm);
-
#else /* CONFIG_USERFAULTFD */
/* mm helpers */
@@ -120,11 +117,11 @@ static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *ctx,
{
}
-static inline void userfaultfd_remove(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
+static inline bool userfaultfd_remove(struct vm_area_struct *vma,
unsigned long start,
unsigned long end)
{
+ return true;
}
static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma,
@@ -139,10 +136,6 @@ static inline void userfaultfd_unmap_complete(struct mm_struct *mm,
{
}
-static inline void userfaultfd_exit(struct mm_struct *mm)
-{
-}
-
#endif /* CONFIG_USERFAULTFD */
#endif /* _LINUX_USERFAULTFD_K_H */
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 6aa1b6cb5828..d84ae90ccd5c 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -25,7 +25,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
FOR_ALL_ZONES(PGALLOC),
FOR_ALL_ZONES(ALLOCSTALL),
FOR_ALL_ZONES(PGSCAN_SKIP),
- PGFREE, PGACTIVATE, PGDEACTIVATE,
+ PGFREE, PGACTIVATE, PGDEACTIVATE, PGLAZYFREE,
PGFAULT, PGMAJFAULT,
PGLAZYFREED,
PGREFILL,
@@ -79,6 +79,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
THP_SPLIT_PAGE_FAILED,
THP_DEFERRED_SPLIT_PAGE,
THP_SPLIT_PMD,
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+ THP_SPLIT_PUD,
+#endif
THP_ZERO_PAGE_ALLOC,
THP_ZERO_PAGE_ALLOC_FAILED,
#endif
diff --git a/include/net/irda/timer.h b/include/net/irda/timer.h
index cb2615ccf761..d784f242cf7b 100644
--- a/include/net/irda/timer.h
+++ b/include/net/irda/timer.h
@@ -59,7 +59,7 @@ struct lap_cb;
* Slot timer must never exceed 85 ms, and must always be at least 25 ms,
* suggested to 75-85 msec by IrDA lite. This doesn't work with a lot of
* devices, and other stackes uses a lot more, so it's best we do it as well
- * (Note : this is the default value and sysctl overides it - Jean II)
+ * (Note : this is the default value and sysctl overrides it - Jean II)
*/
#define SLOT_TIMEOUT (90*HZ/1000)
diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h
index c566ddc87f73..08bb3ed18dcc 100644
--- a/include/trace/events/fs_dax.h
+++ b/include/trace/events/fs_dax.h
@@ -150,6 +150,136 @@ DEFINE_EVENT(dax_pmd_insert_mapping_class, name, \
DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping);
DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping_fallback);
+DECLARE_EVENT_CLASS(dax_pte_fault_class,
+ TP_PROTO(struct inode *inode, struct vm_fault *vmf, int result),
+ TP_ARGS(inode, vmf, result),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(unsigned long, vm_flags)
+ __field(unsigned long, address)
+ __field(pgoff_t, pgoff)
+ __field(dev_t, dev)
+ __field(unsigned int, flags)
+ __field(int, result)
+ ),
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->vm_flags = vmf->vma->vm_flags;
+ __entry->address = vmf->address;
+ __entry->flags = vmf->flags;
+ __entry->pgoff = vmf->pgoff;
+ __entry->result = result;
+ ),
+ TP_printk("dev %d:%d ino %#lx %s %s address %#lx pgoff %#lx %s",
+ MAJOR(__entry->dev),
+ MINOR(__entry->dev),
+ __entry->ino,
+ __entry->vm_flags & VM_SHARED ? "shared" : "private",
+ __print_flags(__entry->flags, "|", FAULT_FLAG_TRACE),
+ __entry->address,
+ __entry->pgoff,
+ __print_flags(__entry->result, "|", VM_FAULT_RESULT_TRACE)
+ )
+)
+
+#define DEFINE_PTE_FAULT_EVENT(name) \
+DEFINE_EVENT(dax_pte_fault_class, name, \
+ TP_PROTO(struct inode *inode, struct vm_fault *vmf, int result), \
+ TP_ARGS(inode, vmf, result))
+
+DEFINE_PTE_FAULT_EVENT(dax_pte_fault);
+DEFINE_PTE_FAULT_EVENT(dax_pte_fault_done);
+DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite_no_entry);
+DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite);
+DEFINE_PTE_FAULT_EVENT(dax_load_hole);
+
+TRACE_EVENT(dax_insert_mapping,
+ TP_PROTO(struct inode *inode, struct vm_fault *vmf, void *radix_entry),
+ TP_ARGS(inode, vmf, radix_entry),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(unsigned long, vm_flags)
+ __field(unsigned long, address)
+ __field(void *, radix_entry)
+ __field(dev_t, dev)
+ __field(int, write)
+ ),
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->vm_flags = vmf->vma->vm_flags;
+ __entry->address = vmf->address;
+ __entry->write = vmf->flags & FAULT_FLAG_WRITE;
+ __entry->radix_entry = radix_entry;
+ ),
+ TP_printk("dev %d:%d ino %#lx %s %s address %#lx radix_entry %#lx",
+ MAJOR(__entry->dev),
+ MINOR(__entry->dev),
+ __entry->ino,
+ __entry->vm_flags & VM_SHARED ? "shared" : "private",
+ __entry->write ? "write" : "read",
+ __entry->address,
+ (unsigned long)__entry->radix_entry
+ )
+)
+
+DECLARE_EVENT_CLASS(dax_writeback_range_class,
+ TP_PROTO(struct inode *inode, pgoff_t start_index, pgoff_t end_index),
+ TP_ARGS(inode, start_index, end_index),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(pgoff_t, start_index)
+ __field(pgoff_t, end_index)
+ __field(dev_t, dev)
+ ),
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->start_index = start_index;
+ __entry->end_index = end_index;
+ ),
+ TP_printk("dev %d:%d ino %#lx pgoff %#lx-%#lx",
+ MAJOR(__entry->dev),
+ MINOR(__entry->dev),
+ __entry->ino,
+ __entry->start_index,
+ __entry->end_index
+ )
+)
+
+#define DEFINE_WRITEBACK_RANGE_EVENT(name) \
+DEFINE_EVENT(dax_writeback_range_class, name, \
+ TP_PROTO(struct inode *inode, pgoff_t start_index, pgoff_t end_index),\
+ TP_ARGS(inode, start_index, end_index))
+
+DEFINE_WRITEBACK_RANGE_EVENT(dax_writeback_range);
+DEFINE_WRITEBACK_RANGE_EVENT(dax_writeback_range_done);
+
+TRACE_EVENT(dax_writeback_one,
+ TP_PROTO(struct inode *inode, pgoff_t pgoff, pgoff_t pglen),
+ TP_ARGS(inode, pgoff, pglen),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(pgoff_t, pgoff)
+ __field(pgoff_t, pglen)
+ __field(dev_t, dev)
+ ),
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->pgoff = pgoff;
+ __entry->pglen = pglen;
+ ),
+ TP_printk("dev %d:%d ino %#lx pgoff %#lx pglen %#lx",
+ MAJOR(__entry->dev),
+ MINOR(__entry->dev),
+ __entry->ino,
+ __entry->pgoff,
+ __entry->pglen
+ )
+)
+
#endif /* _TRACE_FS_DAX_H */
/* This part must be outside protection */
diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h
index d2b12152e358..c6d18aaeb3a4 100644
--- a/include/uapi/linux/sysctl.h
+++ b/include/uapi/linux/sysctl.h
@@ -26,6 +26,10 @@
#include <linux/types.h>
#include <linux/compiler.h>
+#ifndef __KERNEL__
+#include <stddef.h> /* For size_t. */
+#endif
+
#define CTL_MAXNAME 10 /* how many path components do we allow in a
call to sysctl? In other words, what is
the largest acceptable value for the nlen
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index c055947c5c98..3b059530dac9 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -18,8 +18,7 @@
* means the userland is reading).
*/
#define UFFD_API ((__u64)0xAA)
-#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_EXIT | \
- UFFD_FEATURE_EVENT_FORK | \
+#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \
UFFD_FEATURE_EVENT_REMAP | \
UFFD_FEATURE_EVENT_REMOVE | \
UFFD_FEATURE_EVENT_UNMAP | \
@@ -113,7 +112,6 @@ struct uffd_msg {
#define UFFD_EVENT_REMAP 0x14
#define UFFD_EVENT_REMOVE 0x15
#define UFFD_EVENT_UNMAP 0x16
-#define UFFD_EVENT_EXIT 0x17
/* flags for UFFD_EVENT_PAGEFAULT */
#define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */
@@ -163,7 +161,6 @@ struct uffdio_api {
#define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4)
#define UFFD_FEATURE_MISSING_SHMEM (1<<5)
#define UFFD_FEATURE_EVENT_UNMAP (1<<6)
-#define UFFD_FEATURE_EVENT_EXIT (1<<7)
__u64 features;
__u64 ioctls;
diff --git a/init/initramfs.c b/init/initramfs.c
index 981f286c1d16..bf7707b44988 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -611,7 +611,7 @@ static int __init populate_rootfs(void)
char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size);
if (err)
panic("%s", err); /* Failed to decompress INTERNAL initramfs */
- if (initrd_start) {
+ if (initrd_start && !IS_ENABLED(CONFIG_INITRAMFS_FORCE)) {
#ifdef CONFIG_BLK_DEV_RAM
int fd;
printk(KERN_INFO "Trying to unpack rootfs image as initramfs...\n");
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index b1cc1c306668..a1abf033564b 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2669,7 +2669,7 @@ static bool css_visible(struct cgroup_subsys_state *css)
*
* Returns 0 on success, -errno on failure. On failure, csses which have
* been processed already aren't cleaned up. The caller is responsible for
- * cleaning up with cgroup_apply_control_disble().
+ * cleaning up with cgroup_apply_control_disable().
*/
static int cgroup_apply_control_enable(struct cgroup *cgrp)
{
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6f41548f2e32..a17ed56c8ce1 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -998,7 +998,7 @@ list_update_cgroup_event(struct perf_event *event,
*/
#define PERF_CPU_HRTIMER (1000 / HZ)
/*
- * function must be called with interrupts disbled
+ * function must be called with interrupts disabled
*/
static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
{
diff --git a/kernel/exit.c b/kernel/exit.c
index e126ebf2400c..516acdb0e0ec 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -554,7 +554,6 @@ static void exit_mm(void)
enter_lazy_tlb(mm, current);
task_unlock(current);
mm_update_next_owner(mm);
- userfaultfd_exit(mm);
mmput(mm);
if (test_thread_flag(TIF_MEMDIE))
exit_oom_victim();
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index e9d4f85b290c..db361f0b8c73 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -30,6 +30,7 @@
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/sched/task.h>
+#include <linux/sched/mm.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
@@ -2863,6 +2864,8 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
if (unlikely(!debug_locks))
return;
+ gfp_mask = current_gfp_context(gfp_mask);
+
/* no reclaim without waiting on it */
if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
return;
@@ -2872,7 +2875,7 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
return;
/* We're only interested __GFP_FS allocations for now */
- if (!(gfp_mask & __GFP_FS))
+ if (!(gfp_mask & __GFP_FS) || (curr->flags & PF_MEMALLOC_NOFS))
return;
/*
@@ -2881,6 +2884,10 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
return;
+ /* Disable lockdep if explicitly requested */
+ if (gfp_mask & __GFP_NOLOCKDEP)
+ return;
+
mark_held_locks(curr, RECLAIM_FS);
}
@@ -3861,7 +3868,7 @@ EXPORT_SYMBOL_GPL(lock_unpin_lock);
void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
{
- current->lockdep_reclaim_gfp = gfp_mask;
+ current->lockdep_reclaim_gfp = current_gfp_context(gfp_mask);
}
void lockdep_clear_current_reclaim_state(void)
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index e6b2f7ad3e51..4ccfcaae5b89 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -193,7 +193,8 @@ void __init __pv_init_lock_hash(void)
*/
pv_lock_hash = alloc_large_system_hash("PV qspinlock",
sizeof(struct pv_hash_entry),
- pv_hash_size, 0, HASH_EARLY,
+ pv_hash_size, 0,
+ HASH_EARLY | HASH_ZERO,
&pv_lock_hash_bits, NULL,
pv_hash_size, pv_hash_size);
}
diff --git a/kernel/pid.c b/kernel/pid.c
index 0143ac0ddceb..a1f8459c70ac 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -573,16 +573,13 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
*/
void __init pidhash_init(void)
{
- unsigned int i, pidhash_size;
+ unsigned int pidhash_size;
pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
- HASH_EARLY | HASH_SMALL,
+ HASH_EARLY | HASH_SMALL | HASH_ZERO,
&pidhash_shift, NULL,
0, 4096);
pidhash_size = 1U << pidhash_shift;
-
- for (i = 0; i < pidhash_size; i++)
- INIT_HLIST_HEAD(&pid_hash[i]);
}
void __init pidmap_init(void)
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 8a5e44236f78..4559e914452b 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -30,6 +30,7 @@
#include <linux/pid_namespace.h>
#include <net/genetlink.h>
#include <linux/atomic.h>
+#include <linux/sched/cputime.h>
/*
* Maximum length of a cpumask that can be specified in
@@ -210,6 +211,8 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
struct task_struct *tsk, *first;
unsigned long flags;
int rc = -ESRCH;
+ u64 delta, utime, stime;
+ u64 start_time;
/*
* Add additional stats from live tasks except zombie thread group
@@ -227,6 +230,7 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
memset(stats, 0, sizeof(*stats));
tsk = first;
+ start_time = ktime_get_ns();
do {
if (tsk->exit_state)
continue;
@@ -238,6 +242,16 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
*/
delayacct_add_tsk(stats, tsk);
+ /* calculate task elapsed time in nsec */
+ delta = start_time - tsk->start_time;
+ /* Convert to micro seconds */
+ do_div(delta, NSEC_PER_USEC);
+ stats->ac_etime += delta;
+
+ task_cputime(tsk, &utime, &stime);
+ stats->ac_utime += div_u64(utime, NSEC_PER_USEC);
+ stats->ac_stime += div_u64(stime, NSEC_PER_USEC);
+
stats->nvcsw += tsk->nvcsw;
stats->nivcsw += tsk->nivcsw;
} while_each_thread(first, tsk);
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 1d68b5b7ad41..5fb1f2c87e6b 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -65,7 +65,7 @@ void stack_trace_print(void)
}
/*
- * When arch-specific code overides this function, the following
+ * When arch-specific code overrides this function, the following
* data should be filled up, assuming stack_trace_max_lock is held to
* prevent concurrent updates.
* stack_trace_index[]
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 97d62c2da6c2..924f210db65f 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1718,19 +1718,21 @@ config LKDTM
Documentation/fault-injection/provoke-crashes.txt
config TEST_LIST_SORT
- bool "Linked list sorting test"
- depends on DEBUG_KERNEL
+ tristate "Linked list sorting test"
+ depends on DEBUG_KERNEL || m
help
Enable this to turn on 'list_sort()' function test. This test is
- executed only once during system boot, so affects only boot time.
+ executed only once during system boot (so affects only boot time),
+ or at module load time.
If unsure, say N.
config TEST_SORT
- bool "Array-based sort test"
- depends on DEBUG_KERNEL
+ tristate "Array-based sort test"
+ depends on DEBUG_KERNEL || m
help
- This option enables the self-test function of 'sort()' at boot.
+ This option enables the self-test function of 'sort()' at boot,
+ or at module load time.
If unsure, say N.
diff --git a/lib/Makefile b/lib/Makefile
index 320ac46a8725..786c4538a91f 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -52,6 +52,7 @@ obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o
obj-$(CONFIG_TEST_HASH) += test_hash.o test_siphash.o
obj-$(CONFIG_TEST_KASAN) += test_kasan.o
obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o
+obj-$(CONFIG_TEST_LIST_SORT) += test_list_sort.o
obj-$(CONFIG_TEST_LKM) += test_module.o
obj-$(CONFIG_TEST_RHASHTABLE) += test_rhashtable.o
obj-$(CONFIG_TEST_SORT) += test_sort.o
diff --git a/lib/list_sort.c b/lib/list_sort.c
index 3fe401067e20..9e9acc37652f 100644
--- a/lib/list_sort.c
+++ b/lib/list_sort.c
@@ -1,6 +1,3 @@
-
-#define pr_fmt(fmt) "list_sort_test: " fmt
-
#include <linux/kernel.h>
#include <linux/bug.h>
#include <linux/compiler.h>
@@ -145,149 +142,3 @@ void list_sort(void *priv, struct list_head *head,
merge_and_restore_back_links(priv, cmp, head, part[max_lev], list);
}
EXPORT_SYMBOL(list_sort);
-
-#ifdef CONFIG_TEST_LIST_SORT
-
-#include <linux/slab.h>
-#include <linux/random.h>
-
-/*
- * The pattern of set bits in the list length determines which cases
- * are hit in list_sort().
- */
-#define TEST_LIST_LEN (512+128+2) /* not including head */
-
-#define TEST_POISON1 0xDEADBEEF
-#define TEST_POISON2 0xA324354C
-
-struct debug_el {
- unsigned int poison1;
- struct list_head list;
- unsigned int poison2;
- int value;
- unsigned serial;
-};
-
-/* Array, containing pointers to all elements in the test list */
-static struct debug_el **elts __initdata;
-
-static int __init check(struct debug_el *ela, struct debug_el *elb)
-{
- if (ela->serial >= TEST_LIST_LEN) {
- pr_err("error: incorrect serial %d\n", ela->serial);
- return -EINVAL;
- }
- if (elb->serial >= TEST_LIST_LEN) {
- pr_err("error: incorrect serial %d\n", elb->serial);
- return -EINVAL;
- }
- if (elts[ela->serial] != ela || elts[elb->serial] != elb) {
- pr_err("error: phantom element\n");
- return -EINVAL;
- }
- if (ela->poison1 != TEST_POISON1 || ela->poison2 != TEST_POISON2) {
- pr_err("error: bad poison: %#x/%#x\n",
- ela->poison1, ela->poison2);
- return -EINVAL;
- }
- if (elb->poison1 != TEST_POISON1 || elb->poison2 != TEST_POISON2) {
- pr_err("error: bad poison: %#x/%#x\n",
- elb->poison1, elb->poison2);
- return -EINVAL;
- }
- return 0;
-}
-
-static int __init cmp(void *priv, struct list_head *a, struct list_head *b)
-{
- struct debug_el *ela, *elb;
-
- ela = container_of(a, struct debug_el, list);
- elb = container_of(b, struct debug_el, list);
-
- check(ela, elb);
- return ela->value - elb->value;
-}
-
-static int __init list_sort_test(void)
-{
- int i, count = 1, err = -ENOMEM;
- struct debug_el *el;
- struct list_head *cur;
- LIST_HEAD(head);
-
- pr_debug("start testing list_sort()\n");
-
- elts = kcalloc(TEST_LIST_LEN, sizeof(*elts), GFP_KERNEL);
- if (!elts) {
- pr_err("error: cannot allocate memory\n");
- return err;
- }
-
- for (i = 0; i < TEST_LIST_LEN; i++) {
- el = kmalloc(sizeof(*el), GFP_KERNEL);
- if (!el) {
- pr_err("error: cannot allocate memory\n");
- goto exit;
- }
- /* force some equivalencies */
- el->value = prandom_u32() % (TEST_LIST_LEN / 3);
- el->serial = i;
- el->poison1 = TEST_POISON1;
- el->poison2 = TEST_POISON2;
- elts[i] = el;
- list_add_tail(&el->list, &head);
- }
-
- list_sort(NULL, &head, cmp);
-
- err = -EINVAL;
- for (cur = head.next; cur->next != &head; cur = cur->next) {
- struct debug_el *el1;
- int cmp_result;
-
- if (cur->next->prev != cur) {
- pr_err("error: list is corrupted\n");
- goto exit;
- }
-
- cmp_result = cmp(NULL, cur, cur->next);
- if (cmp_result > 0) {
- pr_err("error: list is not sorted\n");
- goto exit;
- }
-
- el = container_of(cur, struct debug_el, list);
- el1 = container_of(cur->next, struct debug_el, list);
- if (cmp_result == 0 && el->serial >= el1->serial) {
- pr_err("error: order of equivalent elements not "
- "preserved\n");
- goto exit;
- }
-
- if (check(el, el1)) {
- pr_err("error: element check failed\n");
- goto exit;
- }
- count++;
- }
- if (head.prev != cur) {
- pr_err("error: list is corrupted\n");
- goto exit;
- }
-
-
- if (count != TEST_LIST_LEN) {
- pr_err("error: bad list length %d", count);
- goto exit;
- }
-
- err = 0;
-exit:
- for (i = 0; i < TEST_LIST_LEN; i++)
- kfree(elts[i]);
- kfree(elts);
- return err;
-}
-late_initcall(list_sort_test);
-#endif /* CONFIG_TEST_LIST_SORT */
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 691a9ad48497..898e87998417 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -2284,6 +2284,8 @@ static int radix_tree_cpu_dead(unsigned int cpu)
void __init radix_tree_init(void)
{
int ret;
+
+ BUILD_BUG_ON(RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT > 32);
radix_tree_node_cachep = kmem_cache_create("radix_tree_node",
sizeof(struct radix_tree_node), 0,
SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index c6cf82242d65..77df28b21597 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -394,17 +394,26 @@ int sg_alloc_table_from_pages(struct sg_table *sgt,
unsigned long offset, unsigned long size,
gfp_t gfp_mask)
{
+ unsigned int chunk_pages;
unsigned int chunks;
unsigned int i;
unsigned int cur_page;
int ret;
struct scatterlist *s;
+ BUILD_BUG_ON(!typecheck(typeof(s->length), unsigned int));
+
/* compute number of contiguous chunks */
chunks = 1;
- for (i = 1; i < n_pages; ++i)
- if (page_to_pfn(pages[i]) != page_to_pfn(pages[i - 1]) + 1)
+ chunk_pages = 1;
+ for (i = 1; i < n_pages; ++i) {
+ if (page_to_pfn(pages[i]) != page_to_pfn(pages[i - 1]) + 1 ||
+ chunk_pages >= UINT_MAX >> PAGE_SHIFT) {
++chunks;
+ chunk_pages = 0;
+ }
+ ++chunk_pages;
+ }
ret = sg_alloc_table(sgt, chunks, gfp_mask);
if (unlikely(ret))
@@ -417,10 +426,15 @@ int sg_alloc_table_from_pages(struct sg_table *sgt,
unsigned int j;
/* look for the end of the current chunk */
- for (j = cur_page + 1; j < n_pages; ++j)
+ chunk_pages = 1;
+ for (j = cur_page + 1; j < n_pages; ++j) {
if (page_to_pfn(pages[j]) !=
- page_to_pfn(pages[j - 1]) + 1)
+ page_to_pfn(pages[j - 1]) + 1 ||
+ chunk_pages >= UINT_MAX >> PAGE_SHIFT) {
break;
+ }
+ ++chunk_pages;
+ }
chunk_size = ((j - cur_page) << PAGE_SHIFT) - offset;
sg_set_page(s, pages[cur_page], min(size, chunk_size), offset);
diff --git a/lib/test_list_sort.c b/lib/test_list_sort.c
new file mode 100644
index 000000000000..28e817387b04
--- /dev/null
+++ b/lib/test_list_sort.c
@@ -0,0 +1,150 @@
+#define pr_fmt(fmt) "list_sort_test: " fmt
+
+#include <linux/kernel.h>
+#include <linux/list_sort.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+
+/*
+ * The pattern of set bits in the list length determines which cases
+ * are hit in list_sort().
+ */
+#define TEST_LIST_LEN (512+128+2) /* not including head */
+
+#define TEST_POISON1 0xDEADBEEF
+#define TEST_POISON2 0xA324354C
+
+struct debug_el {
+ unsigned int poison1; /* guard word in front of the list node; set to TEST_POISON1 */
+ struct list_head list;
+ unsigned int poison2; /* guard word behind the list node; set to TEST_POISON2 */
+ int value; /* sort key compared by cmp() */
+ unsigned serial; /* creation index; also this element's slot in elts[] */
+};
+
+/* Array, containing pointers to all elements in the test list */
+static struct debug_el **elts __initdata;
+
+/* Sanity-check two elements; returns 0 if both are intact, -EINVAL otherwise. */
+static int __init check(struct debug_el *ela, struct debug_el *elb)
+{
+	if (ela->serial >= TEST_LIST_LEN) {
+		pr_err("error: incorrect serial %u\n", ela->serial);
+		return -EINVAL;
+	}
+	if (elb->serial >= TEST_LIST_LEN) {
+		pr_err("error: incorrect serial %u\n", elb->serial);
+		return -EINVAL;
+	}
+	/* serial was range-checked above, so indexing elts[] with it is safe */
+	if (elts[ela->serial] != ela || elts[elb->serial] != elb) {
+		pr_err("error: phantom element\n");
+		return -EINVAL;
+	}
+	if (ela->poison1 != TEST_POISON1 || ela->poison2 != TEST_POISON2) {
+		pr_err("error: bad poison: %#x/%#x\n", ela->poison1, ela->poison2);
+		return -EINVAL;
+	}
+	if (elb->poison1 != TEST_POISON1 || elb->poison2 != TEST_POISON2) {
+		pr_err("error: bad poison: %#x/%#x\n", elb->poison1, elb->poison2);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/* list_sort() comparator: orders elements by ->value. */
+static int __init cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	struct debug_el *lhs = container_of(a, struct debug_el, list);
+	struct debug_el *rhs = container_of(b, struct debug_el, list);
+
+	/* Validate both nodes on every comparison; failures are only logged. */
+	check(lhs, rhs);
+	return lhs->value - rhs->value;
+}
+
+/* Sort a random list and verify ordering, stability and link integrity. */
+static int __init list_sort_test(void)
+{
+	int i, count = 1, err = -ENOMEM;
+	struct debug_el *el;
+	struct list_head *cur;
+	LIST_HEAD(head);
+
+	pr_debug("start testing list_sort()\n");
+
+	elts = kcalloc(TEST_LIST_LEN, sizeof(*elts), GFP_KERNEL);
+	if (!elts) {
+		pr_err("error: cannot allocate memory\n");
+		return err;
+	}
+
+	for (i = 0; i < TEST_LIST_LEN; i++) {
+		el = kmalloc(sizeof(*el), GFP_KERNEL);
+		if (!el) {
+			pr_err("error: cannot allocate memory\n");
+			goto exit;
+		}
+		/* force some equivalencies */
+		el->value = prandom_u32() % (TEST_LIST_LEN / 3);
+		el->serial = i;
+		el->poison1 = TEST_POISON1;
+		el->poison2 = TEST_POISON2;
+		elts[i] = el;
+		list_add_tail(&el->list, &head);
+	}
+
+	list_sort(NULL, &head, cmp);
+
+	err = -EINVAL;
+	/* Walk adjacent pairs; count starts at 1 to cover the final element. */
+	for (cur = head.next; cur->next != &head; cur = cur->next) {
+		struct debug_el *el1;
+		int cmp_result;
+
+		if (cur->next->prev != cur) {
+			pr_err("error: list is corrupted\n");
+			goto exit;
+		}
+
+		cmp_result = cmp(NULL, cur, cur->next);
+		if (cmp_result > 0) {
+			pr_err("error: list is not sorted\n");
+			goto exit;
+		}
+
+		el = container_of(cur, struct debug_el, list);
+		el1 = container_of(cur->next, struct debug_el, list);
+		if (cmp_result == 0 && el->serial >= el1->serial) {
+			pr_err("error: order of equivalent elements not preserved\n");
+			goto exit;
+		}
+
+		if (check(el, el1)) {
+			pr_err("error: element check failed\n");
+			goto exit;
+		}
+		count++;
+	}
+	if (head.prev != cur) {
+		pr_err("error: list is corrupted\n");
+		goto exit;
+	}
+
+	if (count != TEST_LIST_LEN) {
+		pr_err("error: bad list length %d\n", count);
+		goto exit;
+	}
+
+	err = 0;
+exit:
+	for (i = 0; i < TEST_LIST_LEN; i++)
+		kfree(elts[i]);
+	kfree(elts);
+	return err;
+}
+module_init(list_sort_test);
+MODULE_LICENSE("GPL");
diff --git a/lib/test_sort.c b/lib/test_sort.c
index 4db3911db50a..d389c1cc2f6c 100644
--- a/lib/test_sort.c
+++ b/lib/test_sort.c
@@ -1,11 +1,8 @@
#include <linux/sort.h>
#include <linux/slab.h>
-#include <linux/init.h>
+#include <linux/module.h>
-/*
- * A simple boot-time regression test
- * License: GPL
- */
+/* a simple regression test, run at boot or at module load time */
#define TEST_LEN 1000
@@ -41,4 +38,6 @@ exit:
kfree(a);
return err;
}
-subsys_initcall(test_sort_init);
+
+module_init(test_sort_init);
+MODULE_LICENSE("GPL");
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 40d3cb42b0e7..21dbdf05dfdf 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -1477,6 +1477,9 @@ int kptr_restrict __read_mostly;
* by an extra set of alphanumeric characters that are extended format
* specifiers.
*
+ * Please update scripts/checkpatch.pl when adding/removing conversion
+ * characters. (Search for "check for vsprintf extension").
+ *
* Right now we handle:
*
* - 'F' For symbolic function descriptor pointers with offset
diff --git a/mm/compaction.c b/mm/compaction.c
index 81e1eaa2a2cf..bc7903130501 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -89,11 +89,6 @@ static void map_pages(struct list_head *list)
list_splice(&tmp_list, list);
}
-static inline bool migrate_async_suitable(int migratetype)
-{
- return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
-}
-
#ifdef CONFIG_COMPACTION
int PageMovable(struct page *page)
@@ -988,6 +983,22 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
#endif /* CONFIG_COMPACTION || CONFIG_CMA */
#ifdef CONFIG_COMPACTION
+/* Returns true if the page is within a block suitable for migration from */
+static bool suitable_migration_source(struct compact_control *cc,
+							struct page *page)
+{
+	int block_mt;
+
+	/* Only direct async compaction is restricted to certain pageblocks */
+	if (!cc->direct_compaction || cc->mode != MIGRATE_ASYNC)
+		return true;
+
+	block_mt = get_pageblock_migratetype(page);
+	if (cc->migratetype != MIGRATE_MOVABLE)
+		return block_mt == cc->migratetype;
+	return is_migrate_movable(block_mt);
+}
+
/* Returns true if the page is within a block suitable for migration to */
static bool suitable_migration_target(struct compact_control *cc,
struct page *page)
@@ -1007,7 +1018,7 @@ static bool suitable_migration_target(struct compact_control *cc,
}
/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
- if (migrate_async_suitable(get_pageblock_migratetype(page)))
+ if (is_migrate_movable(get_pageblock_migratetype(page)))
return true;
/* Otherwise skip the block */
@@ -1242,8 +1253,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
* Async compaction is optimistic to see if the minimum amount
* of work satisfies the allocation.
*/
- if (cc->mode == MIGRATE_ASYNC &&
- !migrate_async_suitable(get_pageblock_migratetype(page)))
+ if (!suitable_migration_source(cc, page))
continue;
/* Perform the isolation */
@@ -1276,11 +1286,11 @@ static inline bool is_via_compact_memory(int order)
return order == -1;
}
-static enum compact_result __compact_finished(struct zone *zone, struct compact_control *cc,
- const int migratetype)
+static enum compact_result __compact_finished(struct zone *zone,
+ struct compact_control *cc)
{
unsigned int order;
- unsigned long watermark;
+ const int migratetype = cc->migratetype;
if (cc->contended || fatal_signal_pending(current))
return COMPACT_CONTENDED;
@@ -1308,12 +1318,16 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_
if (is_via_compact_memory(cc->order))
return COMPACT_CONTINUE;
- /* Compaction run is not finished if the watermark is not met */
- watermark = zone->watermark[cc->alloc_flags & ALLOC_WMARK_MASK];
-
- if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx,
- cc->alloc_flags))
- return COMPACT_CONTINUE;
+ if (cc->finishing_block) {
+ /*
+ * We have finished the pageblock, but better check again that
+ * we really succeeded.
+ */
+ if (IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages))
+ cc->finishing_block = false;
+ else
+ return COMPACT_CONTINUE;
+ }
/* Direct compactor: Is a suitable page free? */
for (order = cc->order; order < MAX_ORDER; order++) {
@@ -1335,20 +1349,40 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_
* other migratetype buddy lists.
*/
if (find_suitable_fallback(area, order, migratetype,
- true, &can_steal) != -1)
- return COMPACT_SUCCESS;
+ true, &can_steal) != -1) {
+
+ /* movable pages are OK in any pageblock */
+ if (migratetype == MIGRATE_MOVABLE)
+ return COMPACT_SUCCESS;
+
+ /*
+ * We are stealing for a non-movable allocation. Make
+ * sure we finish compacting the current pageblock
+ * first so it is as free as possible and we won't
+ * have to steal another one soon. This only applies
+ * to sync compaction, as async compaction operates
+ * on pageblocks of the same migratetype.
+ */
+ if (cc->mode == MIGRATE_ASYNC ||
+ IS_ALIGNED(cc->migrate_pfn,
+ pageblock_nr_pages)) {
+ return COMPACT_SUCCESS;
+ }
+
+ cc->finishing_block = true;
+ return COMPACT_CONTINUE;
+ }
}
return COMPACT_NO_SUITABLE_PAGE;
}
static enum compact_result compact_finished(struct zone *zone,
- struct compact_control *cc,
- const int migratetype)
+ struct compact_control *cc)
{
int ret;
- ret = __compact_finished(zone, cc, migratetype);
+ ret = __compact_finished(zone, cc);
trace_mm_compaction_finished(zone, cc->order, ret);
if (ret == COMPACT_NO_SUITABLE_PAGE)
ret = COMPACT_CONTINUE;
@@ -1481,9 +1515,9 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
enum compact_result ret;
unsigned long start_pfn = zone->zone_start_pfn;
unsigned long end_pfn = zone_end_pfn(zone);
- const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
const bool sync = cc->mode != MIGRATE_ASYNC;
+ cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask);
ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
cc->classzone_idx);
/* Compaction is likely to fail */
@@ -1533,8 +1567,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
migrate_prep_local();
- while ((ret = compact_finished(zone, cc, migratetype)) ==
- COMPACT_CONTINUE) {
+ while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
int err;
switch (isolate_migratepages(zone, cc)) {
diff --git a/mm/filemap.c b/mm/filemap.c
index 1694623a6289..68b166a9eda0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2202,12 +2202,12 @@ int filemap_fault(struct vm_fault *vmf)
struct file_ra_state *ra = &file->f_ra;
struct inode *inode = mapping->host;
pgoff_t offset = vmf->pgoff;
+ pgoff_t max_off;
struct page *page;
- loff_t size;
int ret = 0;
- size = round_up(i_size_read(inode), PAGE_SIZE);
- if (offset >= size >> PAGE_SHIFT)
+ max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ if (unlikely(offset >= max_off))
return VM_FAULT_SIGBUS;
/*
@@ -2256,8 +2256,8 @@ retry_find:
* Found the page and have a reference on it.
* We must recheck i_size under page lock.
*/
- size = round_up(i_size_read(inode), PAGE_SIZE);
- if (unlikely(offset >= size >> PAGE_SHIFT)) {
+ max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ if (unlikely(offset >= max_off)) {
unlock_page(page);
put_page(page);
return VM_FAULT_SIGBUS;
@@ -2323,7 +2323,7 @@ void filemap_map_pages(struct vm_fault *vmf,
struct file *file = vmf->vma->vm_file;
struct address_space *mapping = file->f_mapping;
pgoff_t last_pgoff = start_pgoff;
- loff_t size;
+ unsigned long max_idx;
struct page *head, *page;
rcu_read_lock();
@@ -2369,8 +2369,8 @@ repeat:
if (page->mapping != mapping || !PageUptodate(page))
goto unlock;
- size = round_up(i_size_read(mapping->host), PAGE_SIZE);
- if (page->index >= size >> PAGE_SHIFT)
+ max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
+ if (page->index >= max_idx)
goto unlock;
if (file->f_ra.mmap_miss > 0)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d36b2af4d1bf..2b4120f6930c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1564,18 +1564,16 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
ClearPageDirty(page);
unlock_page(page);
- if (PageActive(page))
- deactivate_page(page);
-
if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
- orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
- tlb->fullmm);
+ pmdp_invalidate(vma, addr, pmd);
orig_pmd = pmd_mkold(orig_pmd);
orig_pmd = pmd_mkclean(orig_pmd);
set_pmd_at(mm, addr, pmd, orig_pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
}
+
+ mark_page_lazyfree(page);
ret = true;
out:
spin_unlock(ptl);
@@ -1724,37 +1722,69 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
{
struct mm_struct *mm = vma->vm_mm;
spinlock_t *ptl;
- int ret = 0;
+ pmd_t entry;
+ bool preserve_write;
+ int ret;
ptl = __pmd_trans_huge_lock(pmd, vma);
- if (ptl) {
- pmd_t entry;
- bool preserve_write = prot_numa && pmd_write(*pmd);
- ret = 1;
+ if (!ptl)
+ return 0;
- /*
- * Avoid trapping faults against the zero page. The read-only
- * data is likely to be read-cached on the local CPU and
- * local/remote hits to the zero page are not interesting.
- */
- if (prot_numa && is_huge_zero_pmd(*pmd)) {
- spin_unlock(ptl);
- return ret;
- }
+ preserve_write = prot_numa && pmd_write(*pmd);
+ ret = 1;
- if (!prot_numa || !pmd_protnone(*pmd)) {
- entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd);
- entry = pmd_modify(entry, newprot);
- if (preserve_write)
- entry = pmd_mk_savedwrite(entry);
- ret = HPAGE_PMD_NR;
- set_pmd_at(mm, addr, pmd, entry);
- BUG_ON(vma_is_anonymous(vma) && !preserve_write &&
- pmd_write(entry));
- }
- spin_unlock(ptl);
- }
+ /*
+ * Avoid trapping faults against the zero page. The read-only
+ * data is likely to be read-cached on the local CPU and
+ * local/remote hits to the zero page are not interesting.
+ */
+ if (prot_numa && is_huge_zero_pmd(*pmd))
+ goto unlock;
+
+ if (prot_numa && pmd_protnone(*pmd))
+ goto unlock;
+
+ /*
+ * In the prot_numa case, we are under down_read(mmap_sem). It's critical
+ * not to clear the pmd intermittently, to avoid a race with MADV_DONTNEED
+ * which is also under down_read(mmap_sem):
+ *
+ * CPU0: CPU1:
+ * change_huge_pmd(prot_numa=1)
+ * pmdp_huge_get_and_clear_notify()
+ * madvise_dontneed()
+ * zap_pmd_range()
+ * pmd_trans_huge(*pmd) == 0 (without ptl)
+ * // skip the pmd
+ * set_pmd_at();
+ * // pmd is re-established
+ *
+ * The race makes MADV_DONTNEED miss the huge pmd and fail to clear it,
+ * which may break userspace.
+ *
+ * pmdp_invalidate() is required to make sure we don't miss
+ * dirty/young flags set by hardware.
+ */
+ entry = *pmd;
+ pmdp_invalidate(vma, addr, pmd);
+
+ /*
+ * Recover dirty/young flags. It relies on pmdp_invalidate to not
+ * corrupt them.
+ */
+ if (pmd_dirty(*pmd))
+ entry = pmd_mkdirty(entry);
+ if (pmd_young(*pmd))
+ entry = pmd_mkyoung(entry);
+ entry = pmd_modify(entry, newprot);
+ if (preserve_write)
+ entry = pmd_mk_savedwrite(entry);
+ ret = HPAGE_PMD_NR;
+ set_pmd_at(mm, addr, pmd, entry);
+ BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
+unlock:
+ spin_unlock(ptl);
return ret;
}
@@ -1828,7 +1858,7 @@ static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));
- count_vm_event(THP_SPLIT_PMD);
+ count_vm_event(THP_SPLIT_PUD);
pudp_huge_clear_flush_notify(vma, haddr, pud);
}
@@ -2363,7 +2393,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
VM_BUG_ON_PAGE(!PageCompound(page), page);
if (PageAnon(head)) {
diff --git a/mm/internal.h b/mm/internal.h
index ccfc2a2969f4..0e4f558412fb 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -81,11 +81,16 @@ static inline void set_page_refcounted(struct page *page)
extern unsigned long highest_memmap_pfn;
/*
+ * Maximum number of reclaim retries without progress before the OOM
+ * killer is considered the only way forward.
+ */
+#define MAX_RECLAIM_RETRIES 16
+
+/*
* in mm/vmscan.c:
*/
extern int isolate_lru_page(struct page *page);
extern void putback_lru_page(struct page *page);
-extern bool pgdat_reclaimable(struct pglist_data *pgdat);
/*
* in mm/rmap.c:
@@ -178,6 +183,7 @@ extern int user_min_free_kbytes;
struct compact_control {
struct list_head freepages; /* List of free pages to migrate to */
struct list_head migratepages; /* List of pages being migrated */
+ struct zone *zone;
unsigned long nr_freepages; /* Number of isolated free pages */
unsigned long nr_migratepages; /* Number of pages to migrate */
unsigned long total_migrate_scanned;
@@ -185,17 +191,18 @@ struct compact_control {
unsigned long free_pfn; /* isolate_freepages search base */
unsigned long migrate_pfn; /* isolate_migratepages search base */
unsigned long last_migrated_pfn;/* Not yet flushed page being freed */
+ const gfp_t gfp_mask; /* gfp mask of a direct compactor */
+ int order; /* order a direct compactor needs */
+ int migratetype; /* migratetype of direct compactor */
+ const unsigned int alloc_flags; /* alloc flags of a direct compactor */
+ const int classzone_idx; /* zone index of a direct compactor */
enum migrate_mode mode; /* Async or sync migration mode */
bool ignore_skip_hint; /* Scan blocks even if marked skip */
bool ignore_block_suitable; /* Scan blocks considered unsuitable */
bool direct_compaction; /* False from kcompactd or /proc/... */
bool whole_zone; /* Whole zone should/has been scanned */
- int order; /* order a direct compactor needs */
- const gfp_t gfp_mask; /* gfp mask of a direct compactor */
- const unsigned int alloc_flags; /* alloc flags of a direct compactor */
- const int classzone_idx; /* zone index of a direct compactor */
- struct zone *zone;
bool contended; /* Signal lock or sched contention */
+ bool finishing_block; /* Finishing current pageblock */
};
unsigned long
@@ -481,6 +488,13 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
enum ttu_flags;
struct tlbflush_unmap_batch;
+
+/*
+ * only for MM internal work items which do not depend on
+ * any allocations or locks which might depend on allocations
+ */
+extern struct workqueue_struct *mm_percpu_wq;
+
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
void try_to_unmap_flush(void);
void try_to_unmap_flush_dirty(void);
@@ -498,4 +512,14 @@ extern const struct trace_print_flags pageflag_names[];
extern const struct trace_print_flags vmaflag_names[];
extern const struct trace_print_flags gfpflag_names[];
+/* Value-based check: is this migratetype MIGRATE_HIGHATOMIC? */
+static inline bool is_migrate_highatomic(enum migratetype migratetype)
+{
+	return migratetype == MIGRATE_HIGHATOMIC;
+}
+
+/* Page-based check: same test on the migratetype of @page's pageblock. */
+static inline bool is_migrate_highatomic_page(struct page *page)
+{
+	return is_migrate_highatomic(get_pageblock_migratetype(page));
+}
+
#endif /* __MM_INTERNAL_H */
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 9249649baa69..d29c9ebb7f99 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -579,7 +579,8 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object)
shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object));
if (shadow_byte < 0 || shadow_byte >= KASAN_SHADOW_SCALE_SIZE) {
- kasan_report_double_free(cache, object, shadow_byte);
+ kasan_report_double_free(cache, object,
+ __builtin_return_address(1));
return true;
}
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 1c260e6b3b3c..75729173ade9 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -104,7 +104,7 @@ static inline bool kasan_report_enabled(void)
void kasan_report(unsigned long addr, size_t size,
bool is_write, unsigned long ip);
void kasan_report_double_free(struct kmem_cache *cache, void *object,
- s8 shadow);
+ void *ip);
#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB)
void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache);
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index 6f1ed1630873..3a8ddf8baf7d 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -25,6 +25,7 @@
#include <linux/printk.h>
#include <linux/shrinker.h>
#include <linux/slab.h>
+#include <linux/srcu.h>
#include <linux/string.h>
#include <linux/types.h>
@@ -103,6 +104,7 @@ static int quarantine_tail;
/* Total size of all objects in global_quarantine across all batches. */
static unsigned long quarantine_size;
static DEFINE_SPINLOCK(quarantine_lock);
+DEFINE_STATIC_SRCU(remove_cache_srcu);
/* Maximum size of the global queue. */
static unsigned long quarantine_max_size;
@@ -173,17 +175,22 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache)
struct qlist_head *q;
struct qlist_head temp = QLIST_INIT;
+ /*
+ * Note: irq must be disabled until after we move the batch to the
+ * global quarantine. Otherwise quarantine_remove_cache() can miss
+ * some objects belonging to the cache if they are in our local temp
+ * list. quarantine_remove_cache() executes on_each_cpu() at the
+ * beginning which ensures that it either sees the objects in per-cpu
+ * lists or in the global quarantine.
+ */
local_irq_save(flags);
q = this_cpu_ptr(&cpu_quarantine);
qlist_put(q, &info->quarantine_link, cache->size);
- if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE))
+ if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE)) {
qlist_move_all(q, &temp);
- local_irq_restore(flags);
-
- if (unlikely(!qlist_empty(&temp))) {
- spin_lock_irqsave(&quarantine_lock, flags);
+ spin_lock(&quarantine_lock);
WRITE_ONCE(quarantine_size, quarantine_size + temp.bytes);
qlist_move_all(&temp, &global_quarantine[quarantine_tail]);
if (global_quarantine[quarantine_tail].bytes >=
@@ -196,20 +203,33 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache)
if (new_tail != quarantine_head)
quarantine_tail = new_tail;
}
- spin_unlock_irqrestore(&quarantine_lock, flags);
+ spin_unlock(&quarantine_lock);
}
+
+ local_irq_restore(flags);
}
void quarantine_reduce(void)
{
size_t total_size, new_quarantine_size, percpu_quarantines;
unsigned long flags;
+ int srcu_idx;
struct qlist_head to_free = QLIST_INIT;
if (likely(READ_ONCE(quarantine_size) <=
READ_ONCE(quarantine_max_size)))
return;
+ /*
+ * srcu critical section ensures that quarantine_remove_cache()
+ * will not miss objects belonging to the cache while they are in our
+ * local to_free list. srcu is chosen because (1) it gives us a private
+ * grace period domain that does not interfere with anything else,
+ * and (2) it allows synchronize_srcu() to return without waiting
+ * if there are no pending read critical sections (which is the
+ * expected case).
+ */
+ srcu_idx = srcu_read_lock(&remove_cache_srcu);
spin_lock_irqsave(&quarantine_lock, flags);
/*
@@ -237,6 +257,7 @@ void quarantine_reduce(void)
spin_unlock_irqrestore(&quarantine_lock, flags);
qlist_free_all(&to_free, NULL);
+ srcu_read_unlock(&remove_cache_srcu, srcu_idx);
}
static void qlist_move_cache(struct qlist_head *from,
@@ -280,12 +301,28 @@ void quarantine_remove_cache(struct kmem_cache *cache)
unsigned long flags, i;
struct qlist_head to_free = QLIST_INIT;
+ /*
+ * Must be careful to not miss any objects that are being moved from
+ * per-cpu list to the global quarantine in quarantine_put(),
+ * nor objects being freed in quarantine_reduce(). on_each_cpu()
+ * achieves the first goal, while synchronize_srcu() achieves the
+ * second.
+ */
on_each_cpu(per_cpu_remove_cache, cache, 1);
spin_lock_irqsave(&quarantine_lock, flags);
- for (i = 0; i < QUARANTINE_BATCHES; i++)
+ for (i = 0; i < QUARANTINE_BATCHES; i++) {
+ if (qlist_empty(&global_quarantine[i]))
+ continue;
qlist_move_cache(&global_quarantine[i], &to_free, cache);
+ /* Scanning whole quarantine can take a while. */
+ spin_unlock_irqrestore(&quarantine_lock, flags);
+ cond_resched();
+ spin_lock_irqsave(&quarantine_lock, flags);
+ }
spin_unlock_irqrestore(&quarantine_lock, flags);
qlist_free_all(&to_free, cache);
+
+ synchronize_srcu(&remove_cache_srcu);
}
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index f479365530b6..718a10a48a19 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -49,7 +49,13 @@ static const void *find_first_bad_addr(const void *addr, size_t size)
return first_bad_addr;
}
-static void print_error_description(struct kasan_access_info *info)
+/* True when the access is not below the address the shadow start maps to. */
+static bool addr_has_shadow(struct kasan_access_info *info)
+{
+	const void *first = kasan_shadow_to_mem((void *)KASAN_SHADOW_START);
+
+	return info->access_addr >= first;
+}
+
+static const char *get_shadow_bug_type(struct kasan_access_info *info)
{
const char *bug_type = "unknown-crash";
u8 *shadow_addr;
@@ -96,12 +102,39 @@ static void print_error_description(struct kasan_access_info *info)
break;
}
- pr_err("BUG: KASAN: %s in %pS at addr %p\n",
- bug_type, (void *)info->ip,
- info->access_addr);
- pr_err("%s of size %zu by task %s/%d\n",
- info->is_write ? "Write" : "Read",
- info->access_size, current->comm, task_pid_nr(current));
+ return bug_type;
+}
+
+/*
+ * Address has no shadow (see get_bug_type()): classify it by range alone.
+ */
+static const char *get_wild_bug_type(struct kasan_access_info *info)
+{
+	unsigned long addr = (unsigned long)info->access_addr;
+
+	if (addr < PAGE_SIZE)
+		return "null-ptr-deref";
+	if (addr < TASK_SIZE)
+		return "user-memory-access";
+	return "wild-memory-access";
+}
+
+/* Shadow-based classification when shadow exists, range-based otherwise. */
+static const char *get_bug_type(struct kasan_access_info *info)
+{
+	return addr_has_shadow(info) ? get_shadow_bug_type(info) :
+				       get_wild_bug_type(info);
+}
+
+/* Emit the two-line report header: bug class + location, then access details. */
+static void print_error_description(struct kasan_access_info *info)
+{
+	const char *access = info->is_write ? "Write" : "Read";
+
+	pr_err("BUG: KASAN: %s in %pS\n", get_bug_type(info), (void *)info->ip);
+	pr_err("%s of size %zu at addr %p by task %s/%d\n",
+		access, info->access_size, info->access_addr,
+		current->comm, task_pid_nr(current));
}
static inline bool kernel_or_module_addr(const void *addr)
@@ -142,9 +175,9 @@ static void kasan_end_report(unsigned long *flags)
kasan_enable_current();
}
-static void print_track(struct kasan_track *track)
+static void print_track(struct kasan_track *track, const char *prefix)
{
- pr_err("PID = %u\n", track->pid);
+ pr_err("%s by task %u:\n", prefix, track->pid);
if (track->stack) {
struct stack_trace trace;
@@ -155,59 +188,84 @@ static void print_track(struct kasan_track *track)
}
}
-static void kasan_object_err(struct kmem_cache *cache, void *object)
+static struct page *addr_to_page(const void *addr)
{
- struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
+ if ((addr >= (void *)PAGE_OFFSET) &&
+ (addr < high_memory))
+ return virt_to_head_page(addr);
+ return NULL;
+}
- dump_stack();
- pr_err("Object at %p, in cache %s size: %d\n", object, cache->name,
- cache->object_size);
+static void describe_object_addr(struct kmem_cache *cache, void *object,
+ const void *addr)
+{
+ unsigned long access_addr = (unsigned long)addr;
+ unsigned long object_addr = (unsigned long)object;
+ const char *rel_type;
+ int rel_bytes;
- if (!(cache->flags & SLAB_KASAN))
+ pr_err("The buggy address belongs to the object at %p\n"
+ " which belongs to the cache %s of size %d\n",
+ object, cache->name, cache->object_size);
+
+ if (!addr)
return;
- pr_err("Allocated:\n");
- print_track(&alloc_info->alloc_track);
- pr_err("Freed:\n");
- print_track(&alloc_info->free_track);
+ if (access_addr < object_addr) {
+ rel_type = "to the left";
+ rel_bytes = object_addr - access_addr;
+ } else if (access_addr >= object_addr + cache->object_size) {
+ rel_type = "to the right";
+ rel_bytes = access_addr - (object_addr + cache->object_size);
+ } else {
+ rel_type = "inside";
+ rel_bytes = access_addr - object_addr;
+ }
+
+ pr_err("The buggy address is located %d bytes %s of\n"
+ " %d-byte region [%p, %p)\n",
+ rel_bytes, rel_type, cache->object_size, (void *)object_addr,
+ (void *)(object_addr + cache->object_size));
}
-void kasan_report_double_free(struct kmem_cache *cache, void *object,
- s8 shadow)
+static void describe_object(struct kmem_cache *cache, void *object,
+ const void *addr)
{
- unsigned long flags;
+ struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
- kasan_start_report(&flags);
- pr_err("BUG: Double free or freeing an invalid pointer\n");
- pr_err("Unexpected shadow byte: 0x%hhX\n", shadow);
- kasan_object_err(cache, object);
- kasan_end_report(&flags);
+ if (cache->flags & SLAB_KASAN) {
+ print_track(&alloc_info->alloc_track, "Allocated");
+ pr_err("\n");
+ print_track(&alloc_info->free_track, "Freed");
+ pr_err("\n");
+ }
+
+ describe_object_addr(cache, object, addr);
}
-static void print_address_description(struct kasan_access_info *info)
+static void print_address_description(void *addr)
{
- const void *addr = info->access_addr;
+ struct page *page = addr_to_page(addr);
- if ((addr >= (void *)PAGE_OFFSET) &&
- (addr < high_memory)) {
- struct page *page = virt_to_head_page(addr);
-
- if (PageSlab(page)) {
- void *object;
- struct kmem_cache *cache = page->slab_cache;
- object = nearest_obj(cache, page,
- (void *)info->access_addr);
- kasan_object_err(cache, object);
- return;
- }
- dump_page(page, "kasan: bad access detected");
+ dump_stack();
+ pr_err("\n");
+
+ if (page && PageSlab(page)) {
+ struct kmem_cache *cache = page->slab_cache;
+ void *object = nearest_obj(cache, page, addr);
+
+ describe_object(cache, object, addr);
}
- if (kernel_or_module_addr(addr)) {
- if (!init_task_stack_addr(addr))
- pr_err("Address belongs to variable %pS\n", addr);
+ if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) {
+ pr_err("The buggy address belongs to the variable:\n");
+ pr_err(" %pS\n", addr);
+ }
+
+ if (page) {
+ pr_err("The buggy address belongs to the page:\n");
+ dump_page(page, "kasan: bad access detected");
}
- dump_stack();
}
static bool row_is_guilty(const void *row, const void *guilty)
@@ -262,31 +320,34 @@ static void print_shadow_for_address(const void *addr)
}
}
+void kasan_report_double_free(struct kmem_cache *cache, void *object,
+ void *ip)
+{
+ unsigned long flags;
+
+ kasan_start_report(&flags);
+ pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", ip);
+ pr_err("\n");
+ print_address_description(object);
+ pr_err("\n");
+ print_shadow_for_address(object);
+ kasan_end_report(&flags);
+}
+
static void kasan_report_error(struct kasan_access_info *info)
{
unsigned long flags;
- const char *bug_type;
kasan_start_report(&flags);
- if (info->access_addr <
- kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) {
- if ((unsigned long)info->access_addr < PAGE_SIZE)
- bug_type = "null-ptr-deref";
- else if ((unsigned long)info->access_addr < TASK_SIZE)
- bug_type = "user-memory-access";
- else
- bug_type = "wild-memory-access";
- pr_err("BUG: KASAN: %s on address %p\n",
- bug_type, info->access_addr);
- pr_err("%s of size %zu by task %s/%d\n",
- info->is_write ? "Write" : "Read",
- info->access_size, current->comm,
- task_pid_nr(current));
+ print_error_description(info);
+ pr_err("\n");
+
+ if (!addr_has_shadow(info)) {
dump_stack();
} else {
- print_error_description(info);
- print_address_description(info);
+ print_address_description((void *)info->access_addr);
+ pr_err("\n");
print_shadow_for_address(info->first_bad_addr);
}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index ba40b7f673f4..7cb9c88bb4a3 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -483,8 +483,7 @@ void __khugepaged_exit(struct mm_struct *mm)
static void release_pte_page(struct page *page)
{
- /* 0 stands for page_is_file_cache(page) == false */
- dec_node_page_state(page, NR_ISOLATED_ANON + 0);
+ dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
unlock_page(page);
putback_lru_page(page);
}
@@ -532,7 +531,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
VM_BUG_ON_PAGE(PageCompound(page), page);
VM_BUG_ON_PAGE(!PageAnon(page), page);
- VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
/*
* We can do it before isolate_lru_page because the
@@ -550,7 +548,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
* The page must only be referenced by the scanned process
* and page swap cache.
*/
- if (page_count(page) != 1 + !!PageSwapCache(page)) {
+ if (page_count(page) != 1 + PageSwapCache(page)) {
unlock_page(page);
result = SCAN_PAGE_COUNT;
goto out;
@@ -579,8 +577,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
result = SCAN_DEL_PAGE_LRU;
goto out;
}
- /* 0 stands for page_is_file_cache(page) == false */
- inc_node_page_state(page, NR_ISOLATED_ANON + 0);
+ inc_node_page_state(page,
+ NR_ISOLATED_ANON + page_is_file_cache(page));
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -1183,7 +1181,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
* The page must only be referenced by the scanned process
* and page swap cache.
*/
- if (page_count(page) != 1 + !!PageSwapCache(page)) {
+ if (page_count(page) != 1 + PageSwapCache(page)) {
result = SCAN_PAGE_COUNT;
goto out_unmap;
}
diff --git a/mm/madvise.c b/mm/madvise.c
index dc5927c812d3..a09d2d3dfae9 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -411,10 +411,9 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
ptent = pte_mkold(ptent);
ptent = pte_mkclean(ptent);
set_pte_at(mm, addr, pte, ptent);
- if (PageActive(page))
- deactivate_page(page);
tlb_remove_tlb_entry(tlb, pte, addr);
}
+ mark_page_lazyfree(page);
}
out:
if (nr_swap) {
@@ -513,7 +512,43 @@ static long madvise_dontneed(struct vm_area_struct *vma,
if (!can_madv_dontneed_vma(vma))
return -EINVAL;
- userfaultfd_remove(vma, prev, start, end);
+ if (!userfaultfd_remove(vma, start, end)) {
+ *prev = NULL; /* mmap_sem has been dropped, prev is stale */
+
+ down_read(&current->mm->mmap_sem);
+ vma = find_vma(current->mm, start);
+ if (!vma)
+ return -ENOMEM;
+ if (start < vma->vm_start) {
+ /*
+ * This "vma" under revalidation is the one
+ * with the lowest vma->vm_start where start
+ * is also < vma->vm_end. If start <
+ * vma->vm_start it means a hole materialized
+ * in the user address space within the
+ * virtual range passed to MADV_DONTNEED.
+ */
+ return -ENOMEM;
+ }
+ if (!can_madv_dontneed_vma(vma))
+ return -EINVAL;
+ if (end > vma->vm_end) {
+ /*
+ * Don't fail if end > vma->vm_end. If the old
+ * vma was split while the mmap_sem was
+ * released the effect of the concurrent
+ * operation may not cause MADV_DONTNEED to
+ * have an undefined result. There may be an
+ * adjacent next vma that we'll walk
+ * next. userfaultfd_remove() will generate an
+ * UFFD_EVENT_REMOVE repetition on the
+ * end-vma->vm_end range, but the manager can
+ * handle a repetition fine.
+ */
+ end = vma->vm_end;
+ }
+ VM_WARN_ON(start >= end);
+ }
zap_page_range(vma, start, end - start);
return 0;
}
@@ -554,8 +589,10 @@ static long madvise_remove(struct vm_area_struct *vma,
* mmap_sem.
*/
get_file(f);
- userfaultfd_remove(vma, prev, start, end);
- up_read(&current->mm->mmap_sem);
+ if (userfaultfd_remove(vma, start, end)) {
+ /* mmap_sem was not released by userfaultfd_remove() */
+ up_read(&current->mm->mmap_sem);
+ }
error = vfs_fallocate(f,
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
offset, end - start);
@@ -613,13 +650,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
case MADV_WILLNEED:
return madvise_willneed(vma, prev, start, end);
case MADV_FREE:
- /*
- * XXX: In this implementation, MADV_FREE works like
- * MADV_DONTNEED on swapless system or full swap.
- */
- if (get_nr_swap_pages() > 0)
- return madvise_free(vma, prev, start, end);
- /* passthrough */
+ return madvise_free(vma, prev, start, end);
case MADV_DONTNEED:
return madvise_dontneed(vma, prev, start, end);
default:
diff --git a/mm/memblock.c b/mm/memblock.c
index b64b47803e52..696f06d17c4e 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1118,7 +1118,10 @@ unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn,
}
} while (left < right);
- return min(PHYS_PFN(type->regions[right].base), max_pfn);
+ if (right == type->cnt)
+ return max_pfn;
+ else
+ return min(PHYS_PFN(type->regions[right].base), max_pfn);
}
/**
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c52ec893e241..490d5b4676c1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -104,6 +104,7 @@ static const char * const mem_cgroup_stat_names[] = {
"cache",
"rss",
"rss_huge",
+ "shmem",
"mapped_file",
"dirty",
"writeback",
@@ -466,6 +467,8 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
struct mem_cgroup_tree_per_node *mctz;
mctz = soft_limit_tree_from_page(page);
+ if (!mctz)
+ return;
/*
* Necessary to update all ancestors when hierarchy is used.
* because their event counter is not touched.
@@ -503,7 +506,8 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
for_each_node(nid) {
mz = mem_cgroup_nodeinfo(memcg, nid);
mctz = soft_limit_tree_node(nid);
- mem_cgroup_remove_exceeded(mz, mctz);
+ if (mctz)
+ mem_cgroup_remove_exceeded(mz, mctz);
}
}
@@ -605,9 +609,13 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
if (PageAnon(page))
__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
nr_pages);
- else
+ else {
__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
nr_pages);
+ if (PageSwapBacked(page))
+ __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SHMEM],
+ nr_pages);
+ }
if (compound) {
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
@@ -2558,7 +2566,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
* is empty. Do it lockless to prevent lock bouncing. Races
* are acceptable as soft limit is best effort anyway.
*/
- if (RB_EMPTY_ROOT(&mctz->rb_root))
+ if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
return 0;
/*
@@ -4135,17 +4143,22 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
kfree(memcg->nodeinfo[node]);
}
-static void mem_cgroup_free(struct mem_cgroup *memcg)
+static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
int node;
- memcg_wb_domain_exit(memcg);
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
free_percpu(memcg->stat);
kfree(memcg);
}
+static void mem_cgroup_free(struct mem_cgroup *memcg)
+{
+ memcg_wb_domain_exit(memcg);
+ __mem_cgroup_free(memcg);
+}
+
static struct mem_cgroup *mem_cgroup_alloc(void)
{
struct mem_cgroup *memcg;
@@ -4196,7 +4209,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
fail:
if (memcg->id.id > 0)
idr_remove(&mem_cgroup_idr, memcg->id.id);
- mem_cgroup_free(memcg);
+ __mem_cgroup_free(memcg);
return NULL;
}
@@ -5200,6 +5213,8 @@ static int memory_stat_show(struct seq_file *m, void *v)
seq_printf(m, "sock %llu\n",
(u64)stat[MEMCG_SOCK] * PAGE_SIZE);
+ seq_printf(m, "shmem %llu\n",
+ (u64)stat[MEM_CGROUP_STAT_SHMEM] * PAGE_SIZE);
seq_printf(m, "file_mapped %llu\n",
(u64)stat[MEM_CGROUP_STAT_FILE_MAPPED] * PAGE_SIZE);
seq_printf(m, "file_dirty %llu\n",
@@ -5468,8 +5483,8 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
unsigned long nr_anon, unsigned long nr_file,
- unsigned long nr_huge, unsigned long nr_kmem,
- struct page *dummy_page)
+ unsigned long nr_kmem, unsigned long nr_huge,
+ unsigned long nr_shmem, struct page *dummy_page)
{
unsigned long nr_pages = nr_anon + nr_file + nr_kmem;
unsigned long flags;
@@ -5487,6 +5502,7 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
+ __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_SHMEM], nr_shmem);
__this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
memcg_check_events(memcg, dummy_page);
@@ -5499,6 +5515,7 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
static void uncharge_list(struct list_head *page_list)
{
struct mem_cgroup *memcg = NULL;
+ unsigned long nr_shmem = 0;
unsigned long nr_anon = 0;
unsigned long nr_file = 0;
unsigned long nr_huge = 0;
@@ -5531,9 +5548,9 @@ static void uncharge_list(struct list_head *page_list)
if (memcg != page->mem_cgroup) {
if (memcg) {
uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
- nr_huge, nr_kmem, page);
- pgpgout = nr_anon = nr_file =
- nr_huge = nr_kmem = 0;
+ nr_kmem, nr_huge, nr_shmem, page);
+ pgpgout = nr_anon = nr_file = nr_kmem = 0;
+ nr_huge = nr_shmem = 0;
}
memcg = page->mem_cgroup;
}
@@ -5547,8 +5564,11 @@ static void uncharge_list(struct list_head *page_list)
}
if (PageAnon(page))
nr_anon += nr_pages;
- else
+ else {
nr_file += nr_pages;
+ if (PageSwapBacked(page))
+ nr_shmem += nr_pages;
+ }
pgpgout++;
} else {
nr_kmem += 1 << compound_order(page);
@@ -5560,7 +5580,7 @@ static void uncharge_list(struct list_head *page_list)
if (memcg)
uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
- nr_huge, nr_kmem, page);
+ nr_kmem, nr_huge, nr_shmem, page);
}
/**
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 27f7210e7fab..f85adfe57484 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -907,7 +907,7 @@ EXPORT_SYMBOL_GPL(get_hwpoison_page);
static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
int trapno, int flags, struct page **hpagep)
{
- enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
+ enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
struct address_space *mapping;
LIST_HEAD(tokill);
int ret;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 295479b792ec..edff09061e32 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1207,7 +1207,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
/* Reset the nr_zones, order and classzone_idx before reuse */
pgdat->nr_zones = 0;
pgdat->kswapd_order = 0;
- pgdat->kswapd_classzone_idx = 0;
+ pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
}
/* we can use NODE_DATA(nid) from here */
diff --git a/mm/migrate.c b/mm/migrate.c
index 9a0897a14d37..e0cb4b7e7506 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1719,9 +1719,6 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
{
int z;
- if (!pgdat_reclaimable(pgdat))
- return false;
-
for (z = pgdat->nr_zones - 1; z >= 0; z--) {
struct zone *zone = pgdat->node_zones + z;
@@ -1944,7 +1941,8 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
/* Prepare a page as a migration target */
__SetPageLocked(new_page);
- __SetPageSwapBacked(new_page);
+ if (PageSwapBacked(page))
+ __SetPageSwapBacked(new_page);
/* anon mapping, we can simply copy page->mapping to the new page: */
new_page->mapping = page->mapping;
diff --git a/mm/mlock.c b/mm/mlock.c
index 1050511f8b2b..02f138244bf5 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -442,7 +442,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
while (start < end) {
struct page *page;
- unsigned int page_mask;
+ unsigned int page_mask = 0;
unsigned long page_increm;
struct pagevec pvec;
struct zone *zone;
@@ -456,8 +456,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
* suits munlock very well (and if somehow an abnormal page
* has sneaked into the range, we won't oops here: great).
*/
- page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP,
- &page_mask);
+ page = follow_page(vma, start, FOLL_GET | FOLL_DUMP);
if (page && !IS_ERR(page)) {
if (PageTransTail(page)) {
@@ -468,8 +467,8 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
/*
* Any THP page found by follow_page_mask() may
* have gotten split before reaching
- * munlock_vma_page(), so we need to recompute
- * the page_mask here.
+ * munlock_vma_page(), so we need to compute
+ * the page_mask here instead.
*/
page_mask = munlock_vma_page(page);
unlock_page(page);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d8ac2a7fb9e7..33df0583edb9 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -650,9 +650,8 @@ int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
spin_lock_init(&dom->lock);
- init_timer_deferrable(&dom->period_timer);
- dom->period_timer.function = writeout_period;
- dom->period_timer.data = (unsigned long)dom;
+ setup_deferrable_timer(&dom->period_timer, writeout_period,
+ (unsigned long)dom);
dom->dirty_limit_tstamp = jiffies;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6cbde310abed..f749b7ff7c50 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1090,15 +1090,11 @@ static void free_pcppages_bulk(struct zone *zone, int count,
{
int migratetype = 0;
int batch_free = 0;
- unsigned long nr_scanned, flags;
+ unsigned long flags;
bool isolated_pageblocks;
spin_lock_irqsave(&zone->lock, flags);
isolated_pageblocks = has_isolate_pageblock(zone);
- nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
- if (nr_scanned)
- __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
-
while (count) {
struct page *page;
struct list_head *list;
@@ -1150,13 +1146,10 @@ static void free_one_page(struct zone *zone,
unsigned int order,
int migratetype)
{
- unsigned long nr_scanned, flags;
+ unsigned long flags;
+
spin_lock_irqsave(&zone->lock, flags);
__count_vm_events(PGFREE, 1 << order);
- nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
- if (nr_scanned)
- __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
-
if (unlikely(has_isolate_pageblock(zone) ||
is_migrate_isolate(migratetype))) {
migratetype = get_pfnblock_migratetype(page, pfn);
@@ -1844,9 +1837,9 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
* Note that start_page and end_pages are not aligned on a pageblock
* boundary. If alignment is required, use move_freepages_block()
*/
-int move_freepages(struct zone *zone,
+static int move_freepages(struct zone *zone,
struct page *start_page, struct page *end_page,
- int migratetype)
+ int migratetype, int *num_movable)
{
struct page *page;
unsigned int order;
@@ -1863,6 +1856,9 @@ int move_freepages(struct zone *zone,
VM_BUG_ON(page_zone(start_page) != page_zone(end_page));
#endif
+ if (num_movable)
+ *num_movable = 0;
+
for (page = start_page; page <= end_page;) {
if (!pfn_valid_within(page_to_pfn(page))) {
page++;
@@ -1873,6 +1869,15 @@ int move_freepages(struct zone *zone,
VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
if (!PageBuddy(page)) {
+ /*
+ * We assume that pages that could be isolated for
+ * migration are movable. But we don't actually try
+ * isolating, as that would be expensive.
+ */
+ if (num_movable &&
+ (PageLRU(page) || __PageMovable(page)))
+ (*num_movable)++;
+
page++;
continue;
}
@@ -1888,7 +1893,7 @@ int move_freepages(struct zone *zone,
}
int move_freepages_block(struct zone *zone, struct page *page,
- int migratetype)
+ int migratetype, int *num_movable)
{
unsigned long start_pfn, end_pfn;
struct page *start_page, *end_page;
@@ -1905,7 +1910,8 @@ int move_freepages_block(struct zone *zone, struct page *page,
if (!zone_spans_pfn(zone, end_pfn))
return 0;
- return move_freepages(zone, start_page, end_page, migratetype);
+ return move_freepages(zone, start_page, end_page, migratetype,
+ num_movable);
}
static void change_pageblock_range(struct page *pageblock_page,
@@ -1955,28 +1961,75 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
/*
* This function implements actual steal behaviour. If order is large enough,
* we can steal whole pageblock. If not, we first move freepages in this
- * pageblock and check whether half of pages are moved or not. If half of
- * pages are moved, we can change migratetype of pageblock and permanently
- * use it's pages as requested migratetype in the future.
+ * pageblock to our migratetype and determine how many already-allocated pages
+ * are there in the pageblock with a compatible migratetype. If at least half
+ * of pages are free or compatible, we can change migratetype of the pageblock
+ * itself, so pages freed in the future will be put on the correct free list.
*/
static void steal_suitable_fallback(struct zone *zone, struct page *page,
- int start_type)
+ int start_type, bool whole_block)
{
unsigned int current_order = page_order(page);
- int pages;
+ struct free_area *area;
+ int free_pages, movable_pages, alike_pages;
+ int old_block_type;
+
+ old_block_type = get_pageblock_migratetype(page);
+
+ /*
+ * This can happen due to races and we want to prevent broken
+ * highatomic accounting.
+ */
+ if (is_migrate_highatomic(old_block_type))
+ goto single_page;
/* Take ownership for orders >= pageblock_order */
if (current_order >= pageblock_order) {
change_pageblock_range(page, current_order, start_type);
- return;
+ goto single_page;
}
- pages = move_freepages_block(zone, page, start_type);
+ /* We are not allowed to try stealing from the whole block */
+ if (!whole_block)
+ goto single_page;
- /* Claim the whole block if over half of it is free */
- if (pages >= (1 << (pageblock_order-1)) ||
+ free_pages = move_freepages_block(zone, page, start_type,
+ &movable_pages);
+ /*
+ * Determine how many pages are compatible with our allocation.
+ * For movable allocation, it's the number of movable pages which
+ * we just obtained. For other types it's a bit more tricky.
+ */
+ if (start_type == MIGRATE_MOVABLE) {
+ alike_pages = movable_pages;
+ } else {
+ /*
+ * If we are falling back a RECLAIMABLE or UNMOVABLE allocation
+ * to MOVABLE pageblock, consider all non-movable pages as
+ * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
+ * vice versa, be conservative since we can't distinguish the
+ * exact migratetype of non-movable pages.
+ */
+ if (old_block_type == MIGRATE_MOVABLE)
+ alike_pages = pageblock_nr_pages
+ - (free_pages + movable_pages);
+ else
+ alike_pages = 0;
+ }
+
+ /*
+ * If a sufficient number of pages in the block are either free or of
+ * comparable migratability as our allocation, claim the whole block.
+ */
+ if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
page_group_by_mobility_disabled)
set_pageblock_migratetype(page, start_type);
+
+ return;
+
+single_page:
+ area = &zone->free_area[current_order];
+ list_move(&page->lru, &area->free_list[start_type]);
}
/*
@@ -2042,11 +2095,11 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
/* Yoink! */
mt = get_pageblock_migratetype(page);
- if (mt != MIGRATE_HIGHATOMIC &&
- !is_migrate_isolate(mt) && !is_migrate_cma(mt)) {
+ if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt)
+ && !is_migrate_cma(mt)) {
zone->nr_reserved_highatomic += pageblock_nr_pages;
set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
- move_freepages_block(zone, page, MIGRATE_HIGHATOMIC);
+ move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
}
out_unlock:
@@ -2100,8 +2153,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
* from highatomic to ac->migratetype. So we should
* adjust the count once.
*/
- if (get_pageblock_migratetype(page) ==
- MIGRATE_HIGHATOMIC) {
+ if (is_migrate_highatomic_page(page)) {
/*
* It should never happen but changes to
* locking could inadvertently allow a per-cpu
@@ -2124,7 +2176,8 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
* may increase.
*/
set_pageblock_migratetype(page, ac->migratetype);
- ret = move_freepages_block(zone, page, ac->migratetype);
+ ret = move_freepages_block(zone, page, ac->migratetype,
+ NULL);
if (ret) {
spin_unlock_irqrestore(&zone->lock, flags);
return ret;
@@ -2136,8 +2189,13 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
return false;
}
-/* Remove an element from the buddy allocator from the fallback list */
-static inline struct page *
+/*
+ * Try finding a free buddy page on the fallback list and put it on the free
+ * list of requested migratetype, possibly along with other pages from the same
+ * block, depending on fragmentation avoidance heuristics. Returns true if
+ * fallback was found so that __rmqueue_smallest() can grab it.
+ */
+static inline bool
__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
{
struct free_area *area;
@@ -2158,33 +2216,17 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
page = list_first_entry(&area->free_list[fallback_mt],
struct page, lru);
- if (can_steal &&
- get_pageblock_migratetype(page) != MIGRATE_HIGHATOMIC)
- steal_suitable_fallback(zone, page, start_migratetype);
- /* Remove the page from the freelists */
- area->nr_free--;
- list_del(&page->lru);
- rmv_page_order(page);
-
- expand(zone, page, order, current_order, area,
- start_migratetype);
- /*
- * The pcppage_migratetype may differ from pageblock's
- * migratetype depending on the decisions in
- * find_suitable_fallback(). This is OK as long as it does not
- * differ for MIGRATE_CMA pageblocks. Those can be used as
- * fallback only via special __rmqueue_cma_fallback() function
- */
- set_pcppage_migratetype(page, start_migratetype);
+ steal_suitable_fallback(zone, page, start_migratetype,
+ can_steal);
trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, fallback_mt);
- return page;
+ return true;
}
- return NULL;
+ return false;
}
/*
@@ -2196,13 +2238,14 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order,
{
struct page *page;
+retry:
page = __rmqueue_smallest(zone, order, migratetype);
if (unlikely(!page)) {
if (migratetype == MIGRATE_MOVABLE)
page = __rmqueue_cma_fallback(zone, order);
- if (!page)
- page = __rmqueue_fallback(zone, order, migratetype);
+ if (!page && __rmqueue_fallback(zone, order, migratetype))
+ goto retry;
}
trace_mm_page_alloc_zone_locked(page, order, migratetype);
@@ -2373,6 +2416,13 @@ void drain_all_pages(struct zone *zone)
*/
static cpumask_t cpus_with_pcps;
+ /*
+ * Make sure nobody triggers this path before mm_percpu_wq is fully
+ * initialized.
+ */
+ if (WARN_ON_ONCE(!mm_percpu_wq))
+ return;
+
/* Workqueues cannot recurse */
if (current->flags & PF_WQ_WORKER)
return;
@@ -2422,7 +2472,7 @@ void drain_all_pages(struct zone *zone)
for_each_cpu(cpu, &cpus_with_pcps) {
struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
INIT_WORK(work, drain_local_pages_wq);
- schedule_work_on(cpu, work);
+ queue_work_on(cpu, mm_percpu_wq, work);
}
for_each_cpu(cpu, &cpus_with_pcps)
flush_work(per_cpu_ptr(&pcpu_drain, cpu));
@@ -2496,7 +2546,7 @@ void free_hot_cold_page(struct page *page, bool cold)
/*
* We only track unmovable, reclaimable and movable on pcp lists.
* Free ISOLATE pages back to the allocator because they are being
- * offlined but treat RESERVE as movable pages so we can get those
+ * offlined but treat HIGHATOMIC as movable pages so we can get those
* areas back if necessary. Otherwise, we may have to free
* excessively into the page allocator
*/
@@ -2607,7 +2657,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
for (; page < endpage; page += pageblock_nr_pages) {
int mt = get_pageblock_migratetype(page);
if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)
- && mt != MIGRATE_HIGHATOMIC)
+ && !is_migrate_highatomic(mt))
set_pageblock_migratetype(page,
MIGRATE_MOVABLE);
}
@@ -3518,19 +3568,12 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
}
/*
- * Maximum number of reclaim retries without any progress before OOM killer
- * is consider as the only way to move forward.
- */
-#define MAX_RECLAIM_RETRIES 16
-
-/*
* Checks whether it makes sense to retry the reclaim to make a forward progress
* for the given allocation request.
- * The reclaim feedback represented by did_some_progress (any progress during
- * the last reclaim round) and no_progress_loops (number of reclaim rounds without
- * any progress in a row) is considered as well as the reclaimable pages on the
- * applicable zone list (with a backoff mechanism which is a function of
- * no_progress_loops).
+ *
+ * We give up when we either have tried MAX_RECLAIM_RETRIES in a row
+ * without success, or when we couldn't even meet the watermark if we
+ * reclaimed all remaining pages on the LRU lists.
*
* Returns true if a retry is viable or false to enter the oom path.
*/
@@ -3575,13 +3618,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
bool wmark;
available = reclaimable = zone_reclaimable_pages(zone);
- available -= DIV_ROUND_UP((*no_progress_loops) * available,
- MAX_RECLAIM_RETRIES);
available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
/*
- * Would the allocation succeed if we reclaimed the whole
- * available?
+ * Would the allocation succeed if we reclaimed all
+ * reclaimable pages?
*/
wmark = __zone_watermark_ok(zone, order, min_wmark,
ac_classzone_idx(ac), alloc_flags, available);
@@ -3632,6 +3673,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac)
{
bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
+ const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
struct page *page = NULL;
unsigned int alloc_flags;
unsigned long did_some_progress;
@@ -3699,12 +3741,17 @@ retry_cpuset:
/*
* For costly allocations, try direct compaction first, as it's likely
- * that we have enough base pages and don't need to reclaim. Don't try
- * that for allocations that are allowed to ignore watermarks, as the
- * ALLOC_NO_WATERMARKS attempt didn't yet happen.
+ * that we have enough base pages and don't need to reclaim. For non-
+ * movable high-order allocations, do that as well, as compaction will
+ * try to prevent permanent fragmentation by migrating from blocks of
+ * the same migratetype.
+ * Don't try this for allocations that are allowed to ignore
+ * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
*/
- if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER &&
- !gfp_pfmemalloc_allowed(gfp_mask)) {
+ if (can_direct_reclaim &&
+ (costly_order ||
+ (order > 0 && ac->migratetype != MIGRATE_MOVABLE))
+ && !gfp_pfmemalloc_allowed(gfp_mask)) {
page = __alloc_pages_direct_compact(gfp_mask, order,
alloc_flags, ac,
INIT_COMPACT_PRIORITY,
@@ -3716,7 +3763,7 @@ retry_cpuset:
* Checks for costly allocations with __GFP_NORETRY, which
* includes THP page fault allocations
*/
- if (gfp_mask & __GFP_NORETRY) {
+ if (costly_order && (gfp_mask & __GFP_NORETRY)) {
/*
* If compaction is deferred for high-order allocations,
* it is because sync compaction recently failed. If
@@ -3767,7 +3814,7 @@ retry:
/* Make sure we know about allocations which stall for too long */
if (time_after(jiffies, alloc_start + stall_timeout)) {
- warn_alloc(gfp_mask, ac->nodemask,
+ warn_alloc(gfp_mask & ~__GFP_NOWARN, ac->nodemask,
"page allocation stalls for %ums, order:%u",
jiffies_to_msecs(jiffies-alloc_start), order);
stall_timeout += 10 * HZ;
@@ -3797,7 +3844,7 @@ retry:
* Do not retry costly high order allocations unless they are
* __GFP_REPEAT
*/
- if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
+ if (costly_order && !(gfp_mask & __GFP_REPEAT))
goto nopage;
if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
@@ -3967,10 +4014,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
goto out;
/*
- * Runtime PM, block IO and its error handling path can deadlock
- * because I/O on the device might not complete.
+ * Apply scoped allocation constraints. This is mainly about GFP_NOFS
+ * and GFP_NOIO, which have to be inherited by all allocation requests
+ * from a particular context which has been marked by
+ * memalloc_no{fs,io}_{save,restore}.
*/
- alloc_mask = memalloc_noio_flags(gfp_mask);
+ alloc_mask = current_gfp_context(gfp_mask);
ac.spread_dirty_pages = false;
/*
@@ -4505,7 +4554,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
#endif
" writeback_tmp:%lukB"
" unstable:%lukB"
- " pages_scanned:%lu"
" all_unreclaimable? %s"
"\n",
pgdat->node_id,
@@ -4528,8 +4576,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
K(node_page_state(pgdat, NR_SHMEM)),
K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
- node_page_state(pgdat, NR_PAGES_SCANNED),
- !pgdat_reclaimable(pgdat) ? "yes" : "no");
+ pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
+ "yes" : "no");
}
for_each_populated_zone(zone) {
@@ -5738,6 +5786,11 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
adjust_zone_range_for_zone_movable(nid, zone_type,
node_start_pfn, node_end_pfn,
&zone_start_pfn, &zone_end_pfn);
+
+ /* If this node has no page within this zone, return 0. */
+ if (zone_start_pfn == zone_end_pfn)
+ return 0;
+
nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
/*
@@ -7126,6 +7179,17 @@ static unsigned long __init arch_reserved_kernel_pages(void)
#endif
/*
+ * Adaptive scale is meant to reduce sizes of hash tables on large memory
+ * machines. As memory size is increased the scale is also increased but at
+ * slower pace. Starting from ADAPT_SCALE_BASE (64G), every time memory
+ * quadruples the scale is increased by one, which means the size of hash table
+ * only doubles, instead of quadrupling as well.
+ */
+#define ADAPT_SCALE_BASE (64ul << 30)
+#define ADAPT_SCALE_SHIFT 2
+#define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT)
+
+/*
* allocate a large system hash table from bootmem
* - it is assumed that the hash table must contain an exact power-of-2
* quantity of entries
@@ -7144,6 +7208,7 @@ void *__init alloc_large_system_hash(const char *tablename,
unsigned long long max = high_limit;
unsigned long log2qty, size;
void *table = NULL;
+ gfp_t gfp_flags;
/* allow the kernel cmdline to have a say */
if (!numentries) {
@@ -7155,6 +7220,14 @@ void *__init alloc_large_system_hash(const char *tablename,
if (PAGE_SHIFT < 20)
numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
+ if (flags & HASH_ADAPT) {
+ unsigned long adapt;
+
+ for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
+ adapt <<= ADAPT_SCALE_SHIFT)
+ scale++;
+ }
+
/* limit to 1 bucket per 2^scale bytes of low memory */
if (scale > PAGE_SHIFT)
numentries >>= (scale - PAGE_SHIFT);
@@ -7188,12 +7261,17 @@ void *__init alloc_large_system_hash(const char *tablename,
log2qty = ilog2(numentries);
+ /*
+ * memblock allocator returns zeroed memory already, so HASH_ZERO is
+ * currently not used when HASH_EARLY is specified.
+ */
+ gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
do {
size = bucketsize << log2qty;
if (flags & HASH_EARLY)
table = memblock_virt_alloc_nopanic(size, 0);
else if (hashdist)
- table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
+ table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
else {
/*
* If bucketsize is not a power-of-two, we may free
@@ -7201,8 +7279,8 @@ void *__init alloc_large_system_hash(const char *tablename,
* alloc_pages_exact() automatically does
*/
if (get_order(size) < MAX_ORDER) {
- table = alloc_pages_exact(size, GFP_ATOMIC);
- kmemleak_alloc(table, size, 1, GFP_ATOMIC);
+ table = alloc_pages_exact(size, gfp_flags);
+ kmemleak_alloc(table, size, 1, gfp_flags);
}
}
} while (!table && size > PAGE_SIZE && --log2qty);
@@ -7424,7 +7502,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
.zone = page_zone(pfn_to_page(start)),
.mode = MIGRATE_SYNC,
.ignore_skip_hint = true,
- .gfp_mask = memalloc_noio_flags(gfp_mask),
+ .gfp_mask = current_gfp_context(gfp_mask),
};
INIT_LIST_HEAD(&cc.migratepages);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index f4e17a57926a..5092e4ef00c8 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -66,7 +66,8 @@ out:
set_pageblock_migratetype(page, MIGRATE_ISOLATE);
zone->nr_isolate_pageblock++;
- nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE);
+ nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE,
+ NULL);
__mod_zone_freepage_state(zone, -nr_pages, migratetype);
}
@@ -88,7 +89,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
zone = page_zone(page);
spin_lock_irqsave(&zone->lock, flags);
- if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
+ if (!is_migrate_isolate_page(page))
goto out;
/*
@@ -120,7 +121,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
* pageblock scanning for freepage moving.
*/
if (!isolated_page) {
- nr_pages = move_freepages_block(zone, page, migratetype);
+ nr_pages = move_freepages_block(zone, page, migratetype, NULL);
__mod_zone_freepage_state(zone, nr_pages, migratetype);
}
set_pageblock_migratetype(page, migratetype);
@@ -205,7 +206,7 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
pfn < end_pfn;
pfn += pageblock_nr_pages) {
page = __first_valid_page(pfn, pageblock_nr_pages);
- if (!page || get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
+ if (!page || !is_migrate_isolate_page(page))
continue;
unset_migratetype_isolate(page, migratetype);
}
@@ -262,7 +263,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
*/
for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
page = __first_valid_page(pfn, pageblock_nr_pages);
- if (page && get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
+ if (page && !is_migrate_isolate_page(page))
break;
}
page = __first_valid_page(start_pfn, end_pfn - start_pfn);
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 60634dc53a88..c3cee247f2e6 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -261,7 +261,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
*/
for (; pfn < end_pfn; ) {
if (!pfn_valid(pfn)) {
- pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
+ pfn = ALIGN(pfn + 1, pageblock_nr_pages);
continue;
}
@@ -527,7 +527,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
*/
for (; pfn < end_pfn; ) {
if (!pfn_valid(pfn)) {
- pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
+ pfn = ALIGN(pfn + 1, pageblock_nr_pages);
continue;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index bc2d1d522fa4..7d24bb93445b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1283,11 +1283,6 @@ void page_remove_rmap(struct page *page, bool compound)
*/
}
-struct rmap_private {
- enum ttu_flags flags;
- int lazyfreed;
-};
-
/*
* @arg: enum ttu_flags will be passed to this argument
*/
@@ -1303,8 +1298,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
pte_t pteval;
struct page *subpage;
int ret = SWAP_AGAIN;
- struct rmap_private *rp = arg;
- enum ttu_flags flags = rp->flags;
+ enum ttu_flags flags = (enum ttu_flags)arg;
/* munlock has nothing to gain from examining un-locked vmas */
if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
@@ -1316,12 +1310,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
}
while (page_vma_mapped_walk(&pvmw)) {
- subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
- address = pvmw.address;
-
- /* Unexpected PMD-mapped THP? */
- VM_BUG_ON_PAGE(!pvmw.pte, page);
-
/*
* If the page is mlock()d, we cannot swap it out.
* If it's recently referenced (perhaps page_referenced
@@ -1345,6 +1333,13 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
continue;
}
+ /* Unexpected PMD-mapped THP? */
+ VM_BUG_ON_PAGE(!pvmw.pte, page);
+
+ subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
+ address = pvmw.address;
+
+
if (!(flags & TTU_IGNORE_ACCESS)) {
if (ptep_clear_flush_young_notify(vma, address,
pvmw.pte)) {
@@ -1418,13 +1413,28 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* Store the swap location in the pte.
* See handle_pte_fault() ...
*/
- VM_BUG_ON_PAGE(!PageSwapCache(page), page);
+ if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) {
+ WARN_ON_ONCE(1);
+ ret = SWAP_FAIL;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
- if (!PageDirty(page) && (flags & TTU_LZFREE)) {
- /* It's a freeable page by MADV_FREE */
- dec_mm_counter(mm, MM_ANONPAGES);
- rp->lazyfreed++;
- goto discard;
+ /* MADV_FREE page check */
+ if (!PageSwapBacked(page)) {
+ if (!PageDirty(page)) {
+ dec_mm_counter(mm, MM_ANONPAGES);
+ goto discard;
+ }
+
+ /*
+ * If the page was redirtied, it cannot be
+ * discarded. Remap the page to page table.
+ */
+ set_pte_at(mm, address, pvmw.pte, pteval);
+ ret = SWAP_DIRTY;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
}
if (swap_duplicate(entry) < 0) {
@@ -1492,18 +1502,15 @@ static int page_mapcount_is_zero(struct page *page)
* SWAP_AGAIN - we missed a mapping, try again later
* SWAP_FAIL - the page is unswappable
* SWAP_MLOCK - page is mlocked.
+ * SWAP_DIRTY - page is a dirty MADV_FREE page
*/
int try_to_unmap(struct page *page, enum ttu_flags flags)
{
int ret;
- struct rmap_private rp = {
- .flags = flags,
- .lazyfreed = 0,
- };
struct rmap_walk_control rwc = {
.rmap_one = try_to_unmap_one,
- .arg = &rp,
+ .arg = (void *)flags,
.done = page_mapcount_is_zero,
.anon_lock = page_lock_anon_vma_read,
};
@@ -1524,11 +1531,8 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
else
ret = rmap_walk(page, &rwc);
- if (ret != SWAP_MLOCK && !page_mapcount(page)) {
+ if (ret != SWAP_MLOCK && !page_mapcount(page))
ret = SWAP_SUCCESS;
- if (rp.lazyfreed && !PageDirty(page))
- ret = SWAP_LZFREE;
- }
return ret;
}
@@ -1555,14 +1559,10 @@ static int page_not_mapped(struct page *page)
int try_to_munlock(struct page *page)
{
int ret;
- struct rmap_private rp = {
- .flags = TTU_MUNLOCK,
- .lazyfreed = 0,
- };
struct rmap_walk_control rwc = {
.rmap_one = try_to_unmap_one,
- .arg = &rp,
+ .arg = (void *)TTU_MUNLOCK,
.done = page_not_mapped,
.anon_lock = page_lock_anon_vma_read,
diff --git a/mm/rodata_test.c b/mm/rodata_test.c
index 0fd21670b513..6bb4deb12e78 100644
--- a/mm/rodata_test.c
+++ b/mm/rodata_test.c
@@ -9,11 +9,12 @@
* as published by the Free Software Foundation; version 2
* of the License.
*/
+#define pr_fmt(fmt) "rodata_test: " fmt
+
#include <linux/uaccess.h>
#include <asm/sections.h>
const int rodata_test_data = 0xC3;
-EXPORT_SYMBOL_GPL(rodata_test_data);
void rodata_test(void)
{
@@ -23,20 +24,20 @@ void rodata_test(void)
/* test 1: read the value */
/* If this test fails, some previous testrun has clobbered the state */
if (!rodata_test_data) {
- pr_err("rodata_test: test 1 fails (start data)\n");
+ pr_err("test 1 fails (start data)\n");
return;
}
/* test 2: write to the variable; this should fault */
if (!probe_kernel_write((void *)&rodata_test_data,
- (void *)&zero, sizeof(zero))) {
- pr_err("rodata_test: test data was not read only\n");
+ (void *)&zero, sizeof(zero))) {
+ pr_err("test data was not read only\n");
return;
}
/* test 3: check the value hasn't changed */
if (rodata_test_data == zero) {
- pr_err("rodata_test: test data was changed\n");
+ pr_err("test data was changed\n");
return;
}
@@ -44,13 +45,13 @@ void rodata_test(void)
start = (unsigned long)__start_rodata;
end = (unsigned long)__end_rodata;
if (start & (PAGE_SIZE - 1)) {
- pr_err("rodata_test: start of .rodata is not page size aligned\n");
+ pr_err("start of .rodata is not page size aligned\n");
return;
}
if (end & (PAGE_SIZE - 1)) {
- pr_err("rodata_test: end of .rodata is not page size aligned\n");
+ pr_err("end of .rodata is not page size aligned\n");
return;
}
- pr_info("rodata_test: all tests were successful\n");
+ pr_info("all tests were successful\n");
}
diff --git a/mm/swap.c b/mm/swap.c
index c4910f14f957..361bdb1575ab 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -46,7 +46,7 @@ int page_cluster;
static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
-static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs);
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
#endif
@@ -561,20 +561,27 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
}
-static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
+static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
void *arg)
{
- if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
- int file = page_is_file_cache(page);
- int lru = page_lru_base_type(page);
+ if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
+ !PageUnevictable(page)) {
+ bool active = PageActive(page);
- del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
+ del_page_from_lru_list(page, lruvec,
+ LRU_INACTIVE_ANON + active);
ClearPageActive(page);
ClearPageReferenced(page);
- add_page_to_lru_list(page, lruvec, lru);
+ /*
+ * lazyfree pages are clean anonymous pages. They have the
+ * SwapBacked flag cleared to distinguish them from normal
+ * anonymous pages.
+ */
+ ClearPageSwapBacked(page);
+ add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE);
- __count_vm_event(PGDEACTIVATE);
- update_page_reclaim_stat(lruvec, file, 0);
+ __count_vm_events(PGLAZYFREE, hpage_nr_pages(page));
+ update_page_reclaim_stat(lruvec, 1, 0);
}
}
@@ -604,9 +611,9 @@ void lru_add_drain_cpu(int cpu)
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
- pvec = &per_cpu(lru_deactivate_pvecs, cpu);
+ pvec = &per_cpu(lru_lazyfree_pvecs, cpu);
if (pagevec_count(pvec))
- pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+ pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
activate_page_drain(cpu);
}
@@ -638,22 +645,22 @@ void deactivate_file_page(struct page *page)
}
/**
- * deactivate_page - deactivate a page
+ * mark_page_lazyfree - make an anon page lazyfree
* @page: page to deactivate
*
- * deactivate_page() moves @page to the inactive list if @page was on the active
- * list and was not an unevictable page. This is done to accelerate the reclaim
- * of @page.
+ * mark_page_lazyfree() moves @page to the inactive file list.
+ * This is done to accelerate the reclaim of @page.
*/
-void deactivate_page(struct page *page)
+void mark_page_lazyfree(struct page *page)
{
- if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
- struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
+ if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
+ !PageUnevictable(page)) {
+ struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs);
get_page(page);
if (!pagevec_add(pvec, page) || PageCompound(page))
- pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
- put_cpu_var(lru_deactivate_pvecs);
+ pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
+ put_cpu_var(lru_lazyfree_pvecs);
}
}
@@ -670,30 +677,19 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
-/*
- * lru_add_drain_wq is used to do lru_add_drain_all() from a WQ_MEM_RECLAIM
- * workqueue, aiding in getting memory freed.
- */
-static struct workqueue_struct *lru_add_drain_wq;
-
-static int __init lru_init(void)
-{
- lru_add_drain_wq = alloc_workqueue("lru-add-drain", WQ_MEM_RECLAIM, 0);
-
- if (WARN(!lru_add_drain_wq,
- "Failed to create workqueue lru_add_drain_wq"))
- return -ENOMEM;
-
- return 0;
-}
-early_initcall(lru_init);
-
void lru_add_drain_all(void)
{
static DEFINE_MUTEX(lock);
static struct cpumask has_work;
int cpu;
+ /*
+ * Make sure nobody triggers this path before mm_percpu_wq is fully
+ * initialized.
+ */
+ if (WARN_ON(!mm_percpu_wq))
+ return;
+
mutex_lock(&lock);
get_online_cpus();
cpumask_clear(&has_work);
@@ -704,10 +700,10 @@ void lru_add_drain_all(void)
if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
- pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
+ pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) ||
need_activate_page_drain(cpu)) {
INIT_WORK(work, lru_add_drain_per_cpu);
- queue_work_on(cpu, lru_add_drain_wq, work);
+ queue_work_on(cpu, mm_percpu_wq, work);
cpumask_set_cpu(cpu, &has_work);
}
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 521ef9b6064f..5e1e9bffa3e2 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1111,6 +1111,18 @@ int page_swapcount(struct page *page)
return count;
}
+static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
+{
+ int count = 0;
+ pgoff_t offset = swp_offset(entry);
+ struct swap_cluster_info *ci;
+
+ ci = lock_cluster_or_swap_info(si, offset);
+ count = swap_count(si->swap_map[offset]);
+ unlock_cluster_or_swap_info(si, ci);
+ return count;
+}
+
/*
* How many references to @entry are currently swapped out?
* This does not give an exact answer when swap count is continued,
@@ -1119,17 +1131,11 @@ int page_swapcount(struct page *page)
int __swp_swapcount(swp_entry_t entry)
{
int count = 0;
- pgoff_t offset;
struct swap_info_struct *si;
- struct swap_cluster_info *ci;
si = __swap_info_get(entry);
- if (si) {
- offset = swp_offset(entry);
- ci = lock_cluster_or_swap_info(si, offset);
- count = swap_count(si->swap_map[offset]);
- unlock_cluster_or_swap_info(si, ci);
- }
+ if (si)
+ count = swap_swapcount(si, entry);
return count;
}
@@ -1291,7 +1297,8 @@ int free_swap_and_cache(swp_entry_t entry)
* Also recheck PageSwapCache now page is locked (above).
*/
if (PageSwapCache(page) && !PageWriteback(page) &&
- (!page_mapped(page) || mem_cgroup_swap_full(page))) {
+ (!page_mapped(page) || mem_cgroup_swap_full(page)) &&
+ !swap_swapcount(p, entry)) {
delete_from_swap_cache(page);
SetPageDirty(page);
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bc8031ef994d..a3656f9f1803 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -230,12 +230,6 @@ unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat)
return nr;
}
-bool pgdat_reclaimable(struct pglist_data *pgdat)
-{
- return node_page_state_snapshot(pgdat, NR_PAGES_SCANNED) <
- pgdat_reclaimable_pages(pgdat) * 6;
-}
-
/**
* lruvec_lru_size - Returns the number of pages on the given LRU list.
* @lruvec: lru vector
@@ -912,7 +906,8 @@ static void page_check_dirty_writeback(struct page *page,
* Anonymous pages are not handled by flushers and must be written
* from reclaim context. Do not stall reclaim based on them
*/
- if (!page_is_file_cache(page)) {
+ if (!page_is_file_cache(page) ||
+ (PageAnon(page) && !PageSwapBacked(page))) {
*dirty = false;
*writeback = false;
return;
@@ -972,7 +967,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
int may_enter_fs;
enum page_references references = PAGEREF_RECLAIM_CLEAN;
bool dirty, writeback;
- bool lazyfree = false;
int ret = SWAP_SUCCESS;
cond_resched();
@@ -994,7 +988,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
goto keep_locked;
/* Double the slab pressure for mapped and swapcache pages */
- if (page_mapped(page) || PageSwapCache(page))
+ if ((page_mapped(page) || PageSwapCache(page)) &&
+ !(PageAnon(page) && !PageSwapBacked(page)))
sc->nr_scanned++;
may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
@@ -1120,13 +1115,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
/*
* Anonymous process memory has backing store?
* Try to allocate it some swap space here.
+ * Lazyfree page could be freed directly
*/
- if (PageAnon(page) && !PageSwapCache(page)) {
+ if (PageAnon(page) && PageSwapBacked(page) &&
+ !PageSwapCache(page)) {
if (!(sc->gfp_mask & __GFP_IO))
goto keep_locked;
if (!add_to_swap(page, page_list))
goto activate_locked;
- lazyfree = true;
may_enter_fs = 1;
/* Adding to swap updated mapping */
@@ -1143,10 +1139,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* The page is mapped into the page tables of one or more
* processes. Try to unmap it here.
*/
- if (page_mapped(page) && mapping) {
- switch (ret = try_to_unmap(page, lazyfree ?
- (ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) :
- (ttu_flags | TTU_BATCH_FLUSH))) {
+ if (page_mapped(page)) {
+ switch (ret = try_to_unmap(page,
+ ttu_flags | TTU_BATCH_FLUSH)) {
+ case SWAP_DIRTY:
+ SetPageSwapBacked(page);
+ /* fall through */
case SWAP_FAIL:
nr_unmap_fail++;
goto activate_locked;
@@ -1154,8 +1152,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
goto keep_locked;
case SWAP_MLOCK:
goto cull_mlocked;
- case SWAP_LZFREE:
- goto lazyfree;
case SWAP_SUCCESS:
; /* try to free the page below */
}
@@ -1267,10 +1263,18 @@ static unsigned long shrink_page_list(struct list_head *page_list,
}
}
-lazyfree:
- if (!mapping || !__remove_mapping(mapping, page, true))
- goto keep_locked;
+ if (PageAnon(page) && !PageSwapBacked(page)) {
+ /* follow __remove_mapping for reference */
+ if (!page_ref_freeze(page, 1))
+ goto keep_locked;
+ if (PageDirty(page)) {
+ page_ref_unfreeze(page, 1);
+ goto keep_locked;
+ }
+ count_vm_event(PGLAZYFREED);
+ } else if (!mapping || !__remove_mapping(mapping, page, true))
+ goto keep_locked;
/*
* At this point, we have no other references and there is
* no way to pick any more up (removed from LRU, removed
@@ -1280,9 +1284,6 @@ lazyfree:
*/
__ClearPageLocked(page);
free_it:
- if (ret == SWAP_LZFREE)
- count_vm_event(PGLAZYFREED);
-
nr_reclaimed++;
/*
@@ -1354,7 +1355,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
}
ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
- TTU_UNMAP|TTU_IGNORE_ACCESS, NULL, true);
+ TTU_IGNORE_ACCESS, NULL, true);
list_splice(&clean_pages, page_list);
mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
return ret;
@@ -1478,12 +1479,12 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
unsigned long nr_taken = 0;
unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
- unsigned long skipped = 0, total_skipped = 0;
+ unsigned long skipped = 0;
unsigned long scan, nr_pages;
LIST_HEAD(pages_skipped);
for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
- !list_empty(src);) {
+ !list_empty(src); scan++) {
struct page *page;
page = lru_to_page(src);
@@ -1497,12 +1498,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
continue;
}
- /*
- * Account for scanned and skipped separetly to avoid the pgdat
- * being prematurely marked unreclaimable by pgdat_reclaimable.
- */
- scan++;
-
switch (__isolate_lru_page(page, mode)) {
case 0:
nr_pages = hpage_nr_pages(page);
@@ -1531,6 +1526,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
if (!list_empty(&pages_skipped)) {
int zid;
+ list_splice(&pages_skipped, src);
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
if (!nr_skipped[zid])
continue;
@@ -1538,17 +1534,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
__count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
skipped += nr_skipped[zid];
}
-
- /*
- * Account skipped pages as a partial scan as the pgdat may be
- * close to unreclaimable. If the LRU list is empty, account
- * skipped pages as a full scan.
- */
- total_skipped = list_empty(src) ? skipped : skipped >> 2;
-
- list_splice(&pages_skipped, src);
}
- *nr_scanned = scan + total_skipped;
+ *nr_scanned = scan;
trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
scan, skipped, nr_taken, mode, lru);
update_lru_sizes(lruvec, lru, nr_zone_taken);
@@ -1750,7 +1737,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
reclaim_stat->recent_scanned[file] += nr_taken;
if (global_reclaim(sc)) {
- __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned);
if (current_is_kswapd())
__count_vm_events(PGSCAN_KSWAPD, nr_scanned);
else
@@ -1761,7 +1747,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
if (nr_taken == 0)
return 0;
- nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, TTU_UNMAP,
+ nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0,
&stat, false);
spin_lock_irq(&pgdat->lru_lock);
@@ -1953,8 +1939,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
reclaim_stat->recent_scanned[file] += nr_taken;
- if (global_reclaim(sc))
- __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned);
__count_vm_events(PGREFILL, nr_scanned);
spin_unlock_irq(&pgdat->lru_lock);
@@ -2123,30 +2107,8 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
unsigned long anon_prio, file_prio;
enum scan_balance scan_balance;
unsigned long anon, file;
- bool force_scan = false;
unsigned long ap, fp;
enum lru_list lru;
- bool some_scanned;
- int pass;
-
- /*
- * If the zone or memcg is small, nr[l] can be 0. This
- * results in no scanning on this priority and a potential
- * priority drop. Global direct reclaim can go to the next
- * zone and tends to have no problems. Global kswapd is for
- * zone balancing and it needs to scan a minimum amount. When
- * reclaiming for a memcg, a priority drop can cause high
- * latencies, so it's better to scan a minimum amount there as
- * well.
- */
- if (current_is_kswapd()) {
- if (!pgdat_reclaimable(pgdat))
- force_scan = true;
- if (!mem_cgroup_online(memcg))
- force_scan = true;
- }
- if (!global_reclaim(sc))
- force_scan = true;
/* If we have no swap space, do not bother scanning anon pages. */
if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
@@ -2277,55 +2239,48 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
fraction[1] = fp;
denominator = ap + fp + 1;
out:
- some_scanned = false;
- /* Only use force_scan on second pass. */
- for (pass = 0; !some_scanned && pass < 2; pass++) {
- *lru_pages = 0;
- for_each_evictable_lru(lru) {
- int file = is_file_lru(lru);
- unsigned long size;
- unsigned long scan;
-
- size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
- scan = size >> sc->priority;
-
- if (!scan && pass && force_scan)
- scan = min(size, SWAP_CLUSTER_MAX);
-
- switch (scan_balance) {
- case SCAN_EQUAL:
- /* Scan lists relative to size */
- break;
- case SCAN_FRACT:
- /*
- * Scan types proportional to swappiness and
- * their relative recent reclaim efficiency.
- */
- scan = div64_u64(scan * fraction[file],
- denominator);
- break;
- case SCAN_FILE:
- case SCAN_ANON:
- /* Scan one type exclusively */
- if ((scan_balance == SCAN_FILE) != file) {
- size = 0;
- scan = 0;
- }
- break;
- default:
- /* Look ma, no brain */
- BUG();
- }
+ *lru_pages = 0;
+ for_each_evictable_lru(lru) {
+ int file = is_file_lru(lru);
+ unsigned long size;
+ unsigned long scan;
- *lru_pages += size;
- nr[lru] = scan;
+ size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
+ scan = size >> sc->priority;
+ /*
+ * If the cgroup's already been deleted, make sure to
+ * scrape out the remaining cache.
+ */
+ if (!scan && !mem_cgroup_online(memcg))
+ scan = min(size, SWAP_CLUSTER_MAX);
+ switch (scan_balance) {
+ case SCAN_EQUAL:
+ /* Scan lists relative to size */
+ break;
+ case SCAN_FRACT:
/*
- * Skip the second pass and don't force_scan,
- * if we found something to scan.
+ * Scan types proportional to swappiness and
+ * their relative recent reclaim efficiency.
*/
- some_scanned |= !!scan;
+ scan = div64_u64(scan * fraction[file],
+ denominator);
+ break;
+ case SCAN_FILE:
+ case SCAN_ANON:
+ /* Scan one type exclusively */
+ if ((scan_balance == SCAN_FILE) != file) {
+ size = 0;
+ scan = 0;
+ }
+ break;
+ default:
+ /* Look ma, no brain */
+ BUG();
}
+
+ *lru_pages += size;
+ nr[lru] = scan;
}
}
@@ -2604,22 +2559,38 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
sc->nr_scanned - nr_scanned,
node_lru_pages);
+ /*
+ * Record the subtree's reclaim efficiency. The reclaimed
+ * pages from slab are excluded here because the corresponding
+ * scanned pages are not accounted. Moreover, freeing a page
+ * by slab shrinking depends on each slab's object population,
+ * making the cost model (i.e. scan:free) different from that
+ * of LRU.
+ */
+ vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
+ sc->nr_scanned - nr_scanned,
+ sc->nr_reclaimed - nr_reclaimed);
+
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;
}
- /* Record the subtree's reclaim efficiency */
- vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
- sc->nr_scanned - nr_scanned,
- sc->nr_reclaimed - nr_reclaimed);
-
if (sc->nr_reclaimed - nr_reclaimed)
reclaimable = true;
} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
sc->nr_scanned - nr_scanned, sc));
+ /*
+ * Kswapd gives up on balancing particular nodes after too
+ * many failures to reclaim anything from them and goes to
+ * sleep. On reclaim progress, reset the failure counter. A
+ * successful direct reclaim run will revive a dormant kswapd.
+ */
+ if (reclaimable)
+ pgdat->kswapd_failures = 0;
+
return reclaimable;
}
@@ -2694,10 +2665,6 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
GFP_KERNEL | __GFP_HARDWALL))
continue;
- if (sc->priority != DEF_PRIORITY &&
- !pgdat_reclaimable(zone->zone_pgdat))
- continue; /* Let kswapd poll it */
-
/*
* If we already have plenty of memory free for
* compaction in this zone, don't free any more.
@@ -2827,8 +2794,10 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
for (i = 0; i <= ZONE_NORMAL; i++) {
zone = &pgdat->node_zones[i];
- if (!managed_zone(zone) ||
- pgdat_reclaimable_pages(pgdat) == 0)
+ if (!managed_zone(zone))
+ continue;
+
+ if (!zone_reclaimable_pages(zone))
continue;
pfmemalloc_reserve += min_wmark_pages(zone);
@@ -2950,7 +2919,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
unsigned long nr_reclaimed;
struct scan_control sc = {
.nr_to_reclaim = SWAP_CLUSTER_MAX,
- .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
+ .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)),
.reclaim_idx = gfp_zone(gfp_mask),
.order = order,
.nodemask = nodemask,
@@ -3030,7 +2999,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
int nid;
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
- .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
+ .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
.reclaim_idx = MAX_NR_ZONES - 1,
.target_mem_cgroup = memcg,
@@ -3084,22 +3053,44 @@ static void age_active_anon(struct pglist_data *pgdat,
} while (memcg);
}
-static bool zone_balanced(struct zone *zone, int order, int classzone_idx)
+/*
+ * Returns true if a managed zone at or below classzone_idx passes the high
+ * watermark check for the request order, or if no such zone is managed.
+ */
+static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
{
- unsigned long mark = high_wmark_pages(zone);
+ int i;
+ unsigned long mark = -1;
+ struct zone *zone;
- if (!zone_watermark_ok_safe(zone, order, mark, classzone_idx))
- return false;
+ for (i = 0; i <= classzone_idx; i++) {
+ zone = pgdat->node_zones + i;
+
+ if (!managed_zone(zone))
+ continue;
+
+ mark = high_wmark_pages(zone);
+ if (zone_watermark_ok_safe(zone, order, mark, classzone_idx))
+ return true;
+ }
/*
- * If any eligible zone is balanced then the node is not considered
- * to be congested or dirty
+ * If a node has no populated zone within classzone_idx, it does not
+ * need balancing by definition. This can happen if a zone-restricted
+ * allocation tries to wake a remote kswapd.
*/
- clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags);
- clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags);
- clear_bit(PGDAT_WRITEBACK, &zone->zone_pgdat->flags);
+ if (mark == -1)
+ return true;
- return true;
+ return false;
+}
+
+/* Clear PGDAT_CONGESTED, PGDAT_DIRTY and PGDAT_WRITEBACK in pgdat->flags. */
+static void clear_pgdat_congested(pg_data_t *pgdat)
+{
+ clear_bit(PGDAT_CONGESTED, &pgdat->flags);
+ clear_bit(PGDAT_DIRTY, &pgdat->flags);
+ clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
}
/*
@@ -3110,8 +3101,6 @@ static bool zone_balanced(struct zone *zone, int order, int classzone_idx)
*/
static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
{
- int i;
-
/*
* The throttled processes are normally woken up in balance_pgdat() as
* soon as pfmemalloc_watermark_ok() is true. But there is a potential
@@ -3128,17 +3117,16 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
if (waitqueue_active(&pgdat->pfmemalloc_wait))
wake_up_all(&pgdat->pfmemalloc_wait);
- for (i = 0; i <= classzone_idx; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
- if (!managed_zone(zone))
- continue;
+ /* Hopeless node, leave it to direct reclaim */
+ if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+ return true;
- if (!zone_balanced(zone, order, classzone_idx))
- return false;
+ if (pgdat_balanced(pgdat, order, classzone_idx)) {
+ clear_pgdat_congested(pgdat);
+ return true;
}
- return true;
+ return false;
}
/*
@@ -3214,9 +3202,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
count_vm_event(PAGEOUTRUN);
do {
+ unsigned long nr_reclaimed = sc.nr_reclaimed;
bool raise_priority = true;
- sc.nr_reclaimed = 0;
sc.reclaim_idx = classzone_idx;
/*
@@ -3241,23 +3229,12 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
}
/*
- * Only reclaim if there are no eligible zones. Check from
- * high to low zone as allocations prefer higher zones.
- * Scanning from low to high zone would allow congestion to be
- * cleared during a very small window when a small low
- * zone was balanced even under extreme pressure when the
- * overall node may be congested. Note that sc.reclaim_idx
- * is not used as buffer_heads_over_limit may have adjusted
- * it.
+ * Only reclaim if there are no eligible zones. Note that
+ * sc.reclaim_idx is not used as buffer_heads_over_limit may
+ * have adjusted it.
*/
- for (i = classzone_idx; i >= 0; i--) {
- zone = pgdat->node_zones + i;
- if (!managed_zone(zone))
- continue;
-
- if (zone_balanced(zone, sc.order, classzone_idx))
- goto out;
- }
+ if (pgdat_balanced(pgdat, sc.order, classzone_idx))
+ goto out;
/*
* Do some background aging of the anon list, to give
@@ -3271,7 +3248,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
* If we're getting trouble reclaiming, start doing writepage
* even in laptop mode.
*/
- if (sc.priority < DEF_PRIORITY - 2 || !pgdat_reclaimable(pgdat))
+ if (sc.priority < DEF_PRIORITY - 2)
sc.may_writepage = 1;
/* Call soft limit reclaim before calling shrink_node. */
@@ -3306,10 +3283,14 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
* Raise priority if scanning rate is too low or there was no
* progress in reclaiming pages
*/
- if (raise_priority || !sc.nr_reclaimed)
+ nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
+ if (raise_priority || !nr_reclaimed)
sc.priority--;
} while (sc.priority >= 1);
+ if (!sc.nr_reclaimed)
+ pgdat->kswapd_failures++;
+
out:
/*
* Return the order kswapd stopped reclaiming at as
@@ -3320,6 +3301,22 @@ out:
return sc.order;
}
+/*
+ * pgdat->kswapd_classzone_idx is the highest zone index that a recent
+ * allocation request woke kswapd for. When kswapd has not been woken
+ * recently, the value is MAX_NR_ZONES, which is not a valid index. This
+ * returns the given classzone in that case, and otherwise the higher of
+ * the given classzone and the one kswapd was recently woken for.
+ */
+static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat,
+ enum zone_type classzone_idx)
+{
+ if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
+ return classzone_idx;
+
+ return max(pgdat->kswapd_classzone_idx, classzone_idx);
+}
+
static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
unsigned int classzone_idx)
{
@@ -3331,7 +3328,13 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
- /* Try to sleep for a short interval */
+ /*
+ * Try to sleep for a short interval. Note that kcompactd will only be
+ * woken if it is possible to sleep for a short interval. This is
+ * deliberate on the assumption that if reclaim cannot keep an
+ * eligible zone balanced then it's also unlikely that compaction will
+ * succeed.
+ */
if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
/*
* Compaction records what page blocks it recently failed to
@@ -3355,7 +3358,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
* the previous request that slept prematurely.
*/
if (remaining) {
- pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx);
+ pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order);
}
@@ -3409,7 +3412,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
*/
static int kswapd(void *p)
{
- unsigned int alloc_order, reclaim_order, classzone_idx;
+ unsigned int alloc_order, reclaim_order;
+ unsigned int classzone_idx = MAX_NR_ZONES - 1;
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
@@ -3439,20 +3443,23 @@ static int kswapd(void *p)
tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
set_freezable();
- pgdat->kswapd_order = alloc_order = reclaim_order = 0;
- pgdat->kswapd_classzone_idx = classzone_idx = 0;
+ pgdat->kswapd_order = 0;
+ pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
for ( ; ; ) {
bool ret;
+ alloc_order = reclaim_order = pgdat->kswapd_order;
+ classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
+
kswapd_try_sleep:
kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
classzone_idx);
/* Read the new order and classzone_idx */
alloc_order = reclaim_order = pgdat->kswapd_order;
- classzone_idx = pgdat->kswapd_classzone_idx;
+ classzone_idx = kswapd_classzone_idx(pgdat, 0);
pgdat->kswapd_order = 0;
- pgdat->kswapd_classzone_idx = 0;
+ pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
ret = try_to_freeze();
if (kthread_should_stop())
@@ -3478,9 +3485,6 @@ kswapd_try_sleep:
reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
if (reclaim_order < alloc_order)
goto kswapd_try_sleep;
-
- alloc_order = reclaim_order = pgdat->kswapd_order;
- classzone_idx = pgdat->kswapd_classzone_idx;
}
tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
@@ -3496,7 +3500,6 @@ kswapd_try_sleep:
void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
{
pg_data_t *pgdat;
- int z;
if (!managed_zone(zone))
return;
@@ -3504,22 +3507,20 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
return;
pgdat = zone->zone_pgdat;
- pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx);
+ pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat,
+ classzone_idx);
pgdat->kswapd_order = max(pgdat->kswapd_order, order);
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
- /* Only wake kswapd if all zones are unbalanced */
- for (z = 0; z <= classzone_idx; z++) {
- zone = pgdat->node_zones + z;
- if (!managed_zone(zone))
- continue;
+ /* Hopeless node, leave it to direct reclaim */
+ if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+ return;
- if (zone_balanced(zone, order, classzone_idx))
- return;
- }
+ if (pgdat_balanced(pgdat, order, classzone_idx))
+ return;
- trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
+ trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order);
wake_up_interruptible(&pgdat->kswapd_wait);
}
@@ -3725,7 +3726,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
int classzone_idx = gfp_zone(gfp_mask);
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
- .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
+ .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)),
.order = order,
.priority = NODE_RECLAIM_PRIORITY,
.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
@@ -3779,9 +3780,6 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
return NODE_RECLAIM_FULL;
- if (!pgdat_reclaimable(pgdat))
- return NODE_RECLAIM_FULL;
-
/*
* Do not scan if the allocation should not be delayed.
*/
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 69f9aff39a2e..4bbc775f9d08 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -954,7 +954,6 @@ const char * const vmstat_text[] = {
"nr_unevictable",
"nr_isolated_anon",
"nr_isolated_file",
- "nr_pages_scanned",
"workingset_refault",
"workingset_activate",
"workingset_nodereclaim",
@@ -992,6 +991,7 @@ const char * const vmstat_text[] = {
"pgfree",
"pgactivate",
"pgdeactivate",
+ "pglazyfree",
"pgfault",
"pgmajfault",
@@ -1065,6 +1065,9 @@ const char * const vmstat_text[] = {
"thp_split_page_failed",
"thp_deferred_split_page",
"thp_split_pmd",
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+ "thp_split_pud",
+#endif
"thp_zero_page_alloc",
"thp_zero_page_alloc_failed",
#endif
@@ -1121,8 +1124,12 @@ static void frag_stop(struct seq_file *m, void *arg)
{
}
-/* Walk all the zones in a node and print using a callback */
+/*
+ * Walk zones in a node and print using a callback.
+ * If @assert_populated is true, only use callback for zones that are populated.
+ */
static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
+ bool assert_populated,
void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
{
struct zone *zone;
@@ -1130,7 +1137,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
unsigned long flags;
for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
- if (!populated_zone(zone))
+ if (assert_populated && !populated_zone(zone))
continue;
spin_lock_irqsave(&zone->lock, flags);
@@ -1158,7 +1165,7 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
static int frag_show(struct seq_file *m, void *arg)
{
pg_data_t *pgdat = (pg_data_t *)arg;
- walk_zones_in_node(m, pgdat, frag_show_print);
+ walk_zones_in_node(m, pgdat, true, frag_show_print);
return 0;
}
@@ -1199,7 +1206,7 @@ static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
seq_printf(m, "%6d ", order);
seq_putc(m, '\n');
- walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print);
+ walk_zones_in_node(m, pgdat, true, pagetypeinfo_showfree_print);
return 0;
}
@@ -1251,7 +1258,7 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
seq_printf(m, "%12s ", migratetype_names[mtype]);
seq_putc(m, '\n');
- walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print);
+ walk_zones_in_node(m, pgdat, true, pagetypeinfo_showblockcount_print);
return 0;
}
@@ -1277,7 +1284,7 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
seq_printf(m, "%12s ", migratetype_names[mtype]);
seq_putc(m, '\n');
- walk_zones_in_node(m, pgdat, pagetypeinfo_showmixedcount_print);
+ walk_zones_in_node(m, pgdat, true, pagetypeinfo_showmixedcount_print);
#endif /* CONFIG_PAGE_OWNER */
}
@@ -1375,7 +1382,6 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
"\n min %lu"
"\n low %lu"
"\n high %lu"
- "\n node_scanned %lu"
"\n spanned %lu"
"\n present %lu"
"\n managed %lu",
@@ -1383,23 +1389,28 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
min_wmark_pages(zone),
low_wmark_pages(zone),
high_wmark_pages(zone),
- node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED),
zone->spanned_pages,
zone->present_pages,
zone->managed_pages);
- for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
- seq_printf(m, "\n %-12s %lu", vmstat_text[i],
- zone_page_state(zone, i));
-
seq_printf(m,
"\n protection: (%ld",
zone->lowmem_reserve[0]);
for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
- seq_printf(m,
- ")"
- "\n pagesets");
+ seq_putc(m, ')');
+
+ /* If unpopulated, no other information is useful */
+ if (!populated_zone(zone)) {
+ seq_putc(m, '\n');
+ return;
+ }
+
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+ seq_printf(m, "\n %-12s %lu", vmstat_text[i],
+ zone_page_state(zone, i));
+
+ seq_printf(m, "\n pagesets");
for_each_online_cpu(i) {
struct per_cpu_pageset *pageset;
@@ -1422,19 +1433,22 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
"\n node_unreclaimable: %u"
"\n start_pfn: %lu"
"\n node_inactive_ratio: %u",
- !pgdat_reclaimable(zone->zone_pgdat),
+ pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
zone->zone_start_pfn,
zone->zone_pgdat->inactive_ratio);
seq_putc(m, '\n');
}
/*
- * Output information about zones in @pgdat.
+ * Output information about zones in @pgdat. All zones are printed regardless
+ * of whether they are populated or not: lowmem_reserve_ratio operates on the
+ * set of all zones and userspace would not be aware of such zones if they are
+ * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio).
*/
static int zoneinfo_show(struct seq_file *m, void *arg)
{
pg_data_t *pgdat = (pg_data_t *)arg;
- walk_zones_in_node(m, pgdat, zoneinfo_show_print);
+ walk_zones_in_node(m, pgdat, false, zoneinfo_show_print);
return 0;
}
@@ -1549,7 +1563,6 @@ static const struct file_operations proc_vmstat_file_operations = {
#endif /* CONFIG_PROC_FS */
#ifdef CONFIG_SMP
-static struct workqueue_struct *vmstat_wq;
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
int sysctl_stat_interval __read_mostly = HZ;
@@ -1584,22 +1597,9 @@ int vmstat_refresh(struct ctl_table *table, int write,
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
val = atomic_long_read(&vm_zone_stat[i]);
if (val < 0) {
- switch (i) {
- case NR_PAGES_SCANNED:
- /*
- * This is often seen to go negative in
- * recent kernels, but not to go permanently
- * negative. Whilst it would be nicer not to
- * have exceptions, rooting them out would be
- * another task, of rather low priority.
- */
- break;
- default:
- pr_warn("%s: %s %ld\n",
- __func__, vmstat_text[i], val);
- err = -EINVAL;
- break;
- }
+ pr_warn("%s: %s %ld\n",
+ __func__, vmstat_text[i], val);
+ err = -EINVAL;
}
}
if (err)
@@ -1620,7 +1620,7 @@ static void vmstat_update(struct work_struct *w)
* to occur in the future. Keep on running the
* update worker thread.
*/
- queue_delayed_work_on(smp_processor_id(), vmstat_wq,
+ queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
this_cpu_ptr(&vmstat_work),
round_jiffies_relative(sysctl_stat_interval));
}
@@ -1699,7 +1699,7 @@ static void vmstat_shepherd(struct work_struct *w)
struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
if (!delayed_work_pending(dw) && need_update(cpu))
- queue_delayed_work_on(cpu, vmstat_wq, dw, 0);
+ queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
}
put_online_cpus();
@@ -1715,7 +1715,6 @@ static void __init start_shepherd_timer(void)
INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
vmstat_update);
- vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
schedule_delayed_work(&shepherd,
round_jiffies_relative(sysctl_stat_interval));
}
@@ -1761,11 +1760,15 @@ static int vmstat_cpu_dead(unsigned int cpu)
#endif
+struct workqueue_struct *mm_percpu_wq;
+
static int __init setup_vmstat(void)
{
-#ifdef CONFIG_SMP
- int ret;
+ int ret __maybe_unused;
+ mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+
+#ifdef CONFIG_SMP
ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead",
NULL, vmstat_cpu_dead);
if (ret < 0)
@@ -1853,7 +1856,7 @@ static int unusable_show(struct seq_file *m, void *arg)
if (!node_state(pgdat->node_id, N_MEMORY))
return 0;
- walk_zones_in_node(m, pgdat, unusable_show_print);
+ walk_zones_in_node(m, pgdat, true, unusable_show_print);
return 0;
}
@@ -1905,7 +1908,7 @@ static int extfrag_show(struct seq_file *m, void *arg)
{
pg_data_t *pgdat = (pg_data_t *)arg;
- walk_zones_in_node(m, pgdat, extfrag_show_print);
+ walk_zones_in_node(m, pgdat, true, extfrag_show_print);
return 0;
}
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index baa3c7be04ad..832e8150dba3 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -2757,13 +2757,6 @@ sub process {
#print "is_start<$is_start> is_end<$is_end> length<$length>\n";
}
-# discourage the addition of CONFIG_EXPERIMENTAL in Kconfig.
- if ($realfile =~ /Kconfig/ &&
- $line =~ /.\s*depends on\s+.*\bEXPERIMENTAL\b/) {
- WARN("CONFIG_EXPERIMENTAL",
- "Use of CONFIG_EXPERIMENTAL is deprecated. For alternatives, see https://lkml.org/lkml/2012/10/23/580\n");
- }
-
# discourage the use of boolean for type definition attributes of Kconfig options
if ($realfile =~ /Kconfig/ &&
$line =~ /^\+\s*\bboolean\b/) {
@@ -3133,6 +3126,17 @@ sub process {
# check we are in a valid C source file if not then ignore this hunk
next if ($realfile !~ /\.(h|c)$/);
+# check if this appears to be the start of a function declaration, save the name
+ if ($sline =~ /^\+\{\s*$/ &&
+ $prevline =~ /^\+(?:(?:(?:$Storage|$Inline)\s*)*\s*$Type\s*)?($Ident)\(/) {
+ $context_function = $1;
+ }
+
+# check if this appears to be the end of function declaration
+ if ($sline =~ /^\+\}\s*$/) {
+ undef $context_function;
+ }
+
# check indentation of any line with a bare else
# (but not if it is a multiple line "if (foo) return bar; else return baz;")
# if the previous line is a break or return and is indented 1 tab more...
@@ -3157,12 +3161,6 @@ sub process {
}
}
-# discourage the addition of CONFIG_EXPERIMENTAL in #if(def).
- if ($line =~ /^\+\s*\#\s*if.*\bCONFIG_EXPERIMENTAL\b/) {
- WARN("CONFIG_EXPERIMENTAL",
- "Use of CONFIG_EXPERIMENTAL is deprecated. For alternatives, see https://lkml.org/lkml/2012/10/23/580\n");
- }
-
# check for RCS/CVS revision markers
if ($rawline =~ /^\+.*\$(Revision|Log|Id)(?:\$|)/) {
WARN("CVS_KEYWORD",
@@ -5676,6 +5674,32 @@ sub process {
}
}
+ # check for vsprintf extension %p<foo> misuses
+ if ($^V && $^V ge 5.10.0 &&
+ defined $stat &&
+ $stat =~ /^\+(?![^\{]*\{\s*).*\b(\w+)\s*\(.*$String\s*,/s &&
+ $1 !~ /^_*volatile_*$/) {
+ my $bad_extension = "";
+ my $lc = $stat =~ tr@\n@@;
+ $lc = $lc + $linenr;
+ for (my $count = $linenr; $count <= $lc; $count++) {
+ my $fmt = get_quoted_string($lines[$count - 1], raw_line($count, 0));
+ $fmt =~ s/%%//g;
+ if ($fmt =~ /(\%[\*\d\.]*p(?![\WFfSsBKRraEhMmIiUDdgVCbGN]).)/) {
+ $bad_extension = $1;
+ last;
+ }
+ }
+ if ($bad_extension ne "") {
+ my $stat_real = raw_line($linenr, 0);
+ for (my $count = $linenr + 1; $count <= $lc; $count++) {
+ $stat_real = $stat_real . "\n" . raw_line($count, 0);
+ }
+ WARN("VSPRINTF_POINTER_EXTENSION",
+ "Invalid vsprintf pointer extension '$bad_extension'\n" . "$here\n$stat_real\n");
+ }
+ }
+
# Check for misused memsets
if ($^V && $^V ge 5.10.0 &&
defined $stat &&
diff --git a/scripts/gdb/linux/constants.py.in b/scripts/gdb/linux/constants.py.in
index 7986f4e0da12..7aad82406422 100644
--- a/scripts/gdb/linux/constants.py.in
+++ b/scripts/gdb/linux/constants.py.in
@@ -14,6 +14,7 @@
#include <linux/fs.h>
#include <linux/mount.h>
+#include <linux/of_fdt.h>
/* We need to stringify expanded macros so that they can be parsed */
@@ -50,3 +51,9 @@ LX_VALUE(MNT_NOEXEC)
LX_VALUE(MNT_NOATIME)
LX_VALUE(MNT_NODIRATIME)
LX_VALUE(MNT_RELATIME)
+
+/* linux/of_fdt.h */
+LX_VALUE(OF_DT_HEADER)
+
+/* Kernel Configs */
+LX_CONFIG(CONFIG_OF)
diff --git a/scripts/gdb/linux/proc.py b/scripts/gdb/linux/proc.py
index 38b1f09d1cd9..086d27223c0c 100644
--- a/scripts/gdb/linux/proc.py
+++ b/scripts/gdb/linux/proc.py
@@ -16,6 +16,7 @@ from linux import constants
from linux import utils
from linux import tasks
from linux import lists
+from struct import *
class LxCmdLine(gdb.Command):
@@ -195,3 +196,75 @@ values of that process namespace"""
info_opts(MNT_INFO, m_flags)))
LxMounts()
+
+
+class LxFdtDump(gdb.Command):
+ """Output Flattened Device Tree header and dump FDT blob to the filename
+ specified as the command argument. Equivalent to
+ 'cat /proc/fdt > fdtdump.dtb' on a running target"""
+
+ def __init__(self):
+ super(LxFdtDump, self).__init__("lx-fdtdump", gdb.COMMAND_DATA,
+ gdb.COMPLETE_FILENAME)
+
+ def fdthdr_to_cpu(self, fdt_header):
+
+ fdt_header_be = ">IIIIIII"
+ fdt_header_le = "<IIIIIII"
+
+ if utils.get_target_endianness() == 1:
+ output_fmt = fdt_header_le
+ else:
+ output_fmt = fdt_header_be
+
+ return unpack(output_fmt, pack(fdt_header_be,
+ fdt_header['magic'],
+ fdt_header['totalsize'],
+ fdt_header['off_dt_struct'],
+ fdt_header['off_dt_strings'],
+ fdt_header['off_mem_rsvmap'],
+ fdt_header['version'],
+ fdt_header['last_comp_version']))
+
+ def invoke(self, arg, from_tty):
+
+ if not constants.LX_CONFIG_OF:
+ raise gdb.GdbError("Kernel not compiled with CONFIG_OF\n")
+
+ if len(arg) == 0:
+ filename = "fdtdump.dtb"
+ else:
+ filename = arg
+
+ py_fdt_header_ptr = gdb.parse_and_eval(
+ "(const struct fdt_header *) initial_boot_params")
+ py_fdt_header = py_fdt_header_ptr.dereference()
+
+ fdt_header = self.fdthdr_to_cpu(py_fdt_header)
+
+ if fdt_header[0] != constants.LX_OF_DT_HEADER:
+ raise gdb.GdbError("No flattened device tree magic found\n")
+
+ gdb.write("fdt_magic: 0x{:02X}\n".format(fdt_header[0]))
+ gdb.write("fdt_totalsize: 0x{:02X}\n".format(fdt_header[1]))
+ gdb.write("off_dt_struct: 0x{:02X}\n".format(fdt_header[2]))
+ gdb.write("off_dt_strings: 0x{:02X}\n".format(fdt_header[3]))
+ gdb.write("off_mem_rsvmap: 0x{:02X}\n".format(fdt_header[4]))
+ gdb.write("version: {}\n".format(fdt_header[5]))
+ gdb.write("last_comp_version: {}\n".format(fdt_header[6]))
+
+ inf = gdb.inferiors()[0]
+ fdt_buf = utils.read_memoryview(inf, py_fdt_header_ptr,
+ fdt_header[1]).tobytes()
+
+ try:
+ f = open(filename, 'wb')
+ except:
+ raise gdb.GdbError("Could not open file to dump fdt")
+
+ f.write(fdt_buf)
+ f.close()
+
+ gdb.write("Dumped fdt blob to " + filename + "\n")
+
+LxFdtDump()
diff --git a/scripts/spelling.txt b/scripts/spelling.txt
index 0458b037c8a1..0545f5a8cabe 100644
--- a/scripts/spelling.txt
+++ b/scripts/spelling.txt
@@ -372,6 +372,8 @@ disassocation||disassociation
disapear||disappear
disapeared||disappeared
disappared||disappeared
+disble||disable
+disbled||disabled
disconnet||disconnect
discontinous||discontinuous
dispertion||dispersion
@@ -732,6 +734,7 @@ oustanding||outstanding
overaall||overall
overhread||overhead
overlaping||overlapping
+overide||override
overrided||overridden
overriden||overridden
overun||overrun
diff --git a/sound/soc/amd/acp-pcm-dma.c b/sound/soc/amd/acp-pcm-dma.c
index ec1067a679da..08b1399d1da2 100644
--- a/sound/soc/amd/acp-pcm-dma.c
+++ b/sound/soc/amd/acp-pcm-dma.c
@@ -89,7 +89,7 @@ static void acp_reg_write(u32 val, void __iomem *acp_mmio, u32 reg)
writel(val, acp_mmio + (reg * 4));
}
-/* Configure a given dma channel parameters - enable/disble,
+/* Configure a given dma channel parameters - enable/disable,
* number of descriptors, priority
*/
static void config_acp_dma_channel(void __iomem *acp_mmio, u8 ch_num,
diff --git a/tools/lguest/lguest.c b/tools/lguest/lguest.c
index 11c8d9bc762e..5d19fdf80292 100644
--- a/tools/lguest/lguest.c
+++ b/tools/lguest/lguest.c
@@ -1387,7 +1387,7 @@ static bool pci_data_iowrite(u16 port, u32 mask, u32 val)
/* Allow writing to any other BAR, or expansion ROM */
iowrite(portoff, val, mask, &d->config_words[reg]);
return true;
- /* We let them overide latency timer and cacheline size */
+ /* We let them override latency timer and cacheline size */
} else if (&d->config_words[reg] == (void *)&d->config.cacheline_size) {
/* Only let them change the first two fields. */
if (mask == 0xFFFFFFFF)
diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile
index e2efddf10231..1f5300e56b44 100644
--- a/tools/lib/bpf/Makefile
+++ b/tools/lib/bpf/Makefile
@@ -132,7 +132,7 @@ else
Q = @
endif
-# Disable command line variables (CFLAGS) overide from top
+# Disable command line variables (CFLAGS) override from top
# level Makefile (perf), otherwise build Makefile will get
# the same command line setup.
MAKEOVERRIDES=
diff --git a/tools/lib/traceevent/Makefile b/tools/lib/traceevent/Makefile
index 47076b15eebe..9b8555ea3459 100644
--- a/tools/lib/traceevent/Makefile
+++ b/tools/lib/traceevent/Makefile
@@ -135,7 +135,7 @@ else
Q = @
endif
-# Disable command line variables (CFLAGS) overide from top
+# Disable command line variables (CFLAGS) override from top
# level Makefile (perf), otherwise build Makefile will get
# the same command line setup.
MAKEOVERRIDES=
diff --git a/tools/lib/traceevent/event-parse.h b/tools/lib/traceevent/event-parse.h
index 66342804161c..0c03538df74c 100644
--- a/tools/lib/traceevent/event-parse.h
+++ b/tools/lib/traceevent/event-parse.h
@@ -140,7 +140,7 @@ struct pevent_plugin_option {
* struct pevent_plugin_option PEVENT_PLUGIN_OPTIONS[] = {
* {
* .name = "option-name",
- * .plugin_alias = "overide-file-name", (optional)
+ * .plugin_alias = "override-file-name", (optional)
* .description = "description of option to show users",
* },
* {
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index 4cff7e7ddcc4..41642ba5e318 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -1,5 +1,9 @@
# Makefile for vm selftests
+ifndef OUTPUT
+ OUTPUT := $(shell pwd)
+endif
+
CFLAGS = -Wall -I ../../../../usr/include $(EXTRA_CFLAGS)
LDLIBS = -lrt
TEST_GEN_FILES = compaction_test
diff --git a/usr/Kconfig b/usr/Kconfig
index 6278f135256d..c0c48507e44e 100644
--- a/usr/Kconfig
+++ b/usr/Kconfig
@@ -21,6 +21,16 @@ config INITRAMFS_SOURCE
If you are not sure, leave it blank.
+config INITRAMFS_FORCE
+ bool "Ignore the initramfs passed by the bootloader"
+ depends on CMDLINE_EXTEND || CMDLINE_FORCE
+ help
+ This option causes the kernel to ignore the initramfs image
+ (or initrd image) passed to it by the bootloader. This is
+ analogous to CMDLINE_FORCE, which is found on some architectures,
+ and is useful if you cannot or don't want to change the image
+ your bootloader passes to the kernel.
+
config INITRAMFS_ROOT_UID
int "User ID to map to 0 (user root)"
depends on INITRAMFS_SOURCE!=""