-rw-r--r--Documentation/ABI/testing/sysfs-devices-memory25
-rw-r--r--Documentation/accounting/psi.txt107
-rw-r--r--Documentation/admin-guide/cgroup-v2.rst36
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt13
-rw-r--r--Documentation/admin-guide/mm/pagemap.rst9
-rw-r--r--Documentation/core-api/genalloc.rst2
-rw-r--r--Documentation/dontdiff1
-rw-r--r--Documentation/lzo.txt39
-rw-r--r--Documentation/process/4.Coding.rst2
-rw-r--r--Documentation/translations/it_IT/process/4.Coding.rst2
-rw-r--r--Documentation/vm/slub.rst3
-rw-r--r--MAINTAINERS8
-rw-r--r--arch/alpha/include/asm/topology.h3
-rw-r--r--arch/arc/configs/axs101_defconfig1
-rw-r--r--arch/arc/configs/axs103_defconfig1
-rw-r--r--arch/arc/configs/axs103_smp_defconfig1
-rw-r--r--arch/arc/configs/haps_hs_defconfig1
-rw-r--r--arch/arc/configs/haps_hs_smp_defconfig1
-rw-r--r--arch/arc/configs/hsdk_defconfig1
-rw-r--r--arch/arc/configs/nps_defconfig1
-rw-r--r--arch/arc/configs/nsim_700_defconfig1
-rw-r--r--arch/arc/configs/nsim_hs_defconfig1
-rw-r--r--arch/arc/configs/nsim_hs_smp_defconfig1
-rw-r--r--arch/arc/configs/nsimosci_defconfig1
-rw-r--r--arch/arc/configs/nsimosci_hs_defconfig1
-rw-r--r--arch/arc/configs/nsimosci_hs_smp_defconfig1
-rw-r--r--arch/arc/configs/tb10x_defconfig1
-rw-r--r--arch/arc/configs/vdk_hs38_defconfig1
-rw-r--r--arch/arc/configs/vdk_hs38_smp_defconfig1
-rw-r--r--arch/arm/configs/bcm2835_defconfig1
-rw-r--r--arch/arm/configs/cns3420vb_defconfig1
-rw-r--r--arch/arm/configs/efm32_defconfig1
-rw-r--r--arch/arm/configs/eseries_pxa_defconfig1
-rw-r--r--arch/arm/configs/gemini_defconfig1
-rw-r--r--arch/arm/configs/mini2440_defconfig1
-rw-r--r--arch/arm/configs/moxart_defconfig1
-rw-r--r--arch/arm/configs/mps2_defconfig1
-rw-r--r--arch/arm/configs/nuc910_defconfig1
-rw-r--r--arch/arm/configs/nuc950_defconfig1
-rw-r--r--arch/arm/configs/nuc960_defconfig1
-rw-r--r--arch/arm/configs/stm32_defconfig1
-rw-r--r--arch/arm/mm/dma-mapping.c2
-rw-r--r--arch/arm/mm/mmu.c13
-rw-r--r--arch/arm64/Kconfig6
-rw-r--r--arch/arm64/include/asm/hugetlb.h12
-rw-r--r--arch/arm64/include/asm/memory.h4
-rw-r--r--arch/arm64/kernel/machine_kexec.c3
-rw-r--r--arch/arm64/mm/hugetlbpage.c20
-rw-r--r--arch/arm64/mm/init.c27
-rw-r--r--arch/arm64/mm/numa.c2
-rw-r--r--arch/c6x/mm/dma-coherent.c9
-rw-r--r--arch/h8300/configs/edosk2674_defconfig1
-rw-r--r--arch/h8300/configs/h8300h-sim_defconfig1
-rw-r--r--arch/h8300/configs/h8s-sim_defconfig1
-rw-r--r--arch/ia64/kernel/numa.c2
-rw-r--r--arch/ia64/kernel/perfmon.c59
-rw-r--r--arch/ia64/mm/discontig.c6
-rw-r--r--arch/m68k/configs/amcore_defconfig1
-rw-r--r--arch/m68k/configs/stmark2_defconfig1
-rw-r--r--arch/m68k/mm/memory.c2
-rw-r--r--arch/microblaze/mm/init.c5
-rw-r--r--arch/nds32/mm/init.c12
-rw-r--r--arch/nios2/configs/10m50_defconfig1
-rw-r--r--arch/nios2/configs/3c120_defconfig1
-rw-r--r--arch/openrisc/configs/or1ksim_defconfig1
-rw-r--r--arch/openrisc/configs/simple_smp_defconfig1
-rw-r--r--arch/openrisc/mm/ioremap.c11
-rw-r--r--arch/powerpc/configs/mpc512x_defconfig1
-rw-r--r--arch/powerpc/configs/ppc6xx_defconfig1
-rw-r--r--arch/powerpc/include/asm/book3s/64/hugetlb.h16
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgtable.h18
-rw-r--r--arch/powerpc/include/asm/book3s/64/radix.h4
-rw-r--r--arch/powerpc/include/asm/pci-bridge.h3
-rw-r--r--arch/powerpc/kernel/paca.c19
-rw-r--r--arch/powerpc/kernel/pci-common.c3
-rw-r--r--arch/powerpc/kernel/process.c12
-rw-r--r--arch/powerpc/kernel/setup-common.c4
-rw-r--r--arch/powerpc/kernel/setup_64.c24
-rw-r--r--arch/powerpc/kernel/vdso.c2
-rw-r--r--arch/powerpc/mm/fault.c6
-rw-r--r--arch/powerpc/mm/hash_utils_64.c6
-rw-r--r--arch/powerpc/mm/hugetlbpage-hash64.c25
-rw-r--r--arch/powerpc/mm/hugetlbpage-radix.c17
-rw-r--r--arch/powerpc/mm/mmu_context_iommu.c146
-rw-r--r--arch/powerpc/mm/numa.c16
-rw-r--r--arch/powerpc/mm/pgtable-book3e.c8
-rw-r--r--arch/powerpc/mm/pgtable-book3s64.c30
-rw-r--r--arch/powerpc/mm/pgtable-radix.c43
-rw-r--r--arch/powerpc/mm/ppc_mmu_32.c3
-rw-r--r--arch/powerpc/perf/callchain.c20
-rw-r--r--arch/powerpc/perf/core-book3s.c8
-rw-r--r--arch/powerpc/platforms/Kconfig.cputype2
-rw-r--r--arch/powerpc/platforms/pasemi/iommu.c5
-rw-r--r--arch/powerpc/platforms/powernv/memtrace.c5
-rw-r--r--arch/powerpc/platforms/powernv/opal.c3
-rw-r--r--arch/powerpc/platforms/pseries/setup.c18
-rw-r--r--arch/powerpc/sysdev/dart_iommu.c7
-rw-r--r--arch/powerpc/sysdev/fsl_pci.c10
-rw-r--r--arch/riscv/kernel/vdso.c1
-rw-r--r--arch/s390/Kconfig2
-rw-r--r--arch/s390/include/asm/hugetlb.h7
-rw-r--r--arch/s390/include/asm/pgtable.h5
-rw-r--r--arch/s390/kernel/vdso.c2
-rw-r--r--arch/s390/mm/pgtable.c8
-rw-r--r--arch/s390/numa/numa.c14
-rw-r--r--arch/sh/configs/apsh4a3a_defconfig1
-rw-r--r--arch/sh/configs/edosk7705_defconfig1
-rw-r--r--arch/sh/configs/espt_defconfig1
-rw-r--r--arch/sh/configs/sdk7786_defconfig1
-rw-r--r--arch/sh/configs/sh2007_defconfig1
-rw-r--r--arch/sh/configs/sh7724_generic_defconfig1
-rw-r--r--arch/sh/configs/sh7763rdp_defconfig1
-rw-r--r--arch/sh/configs/sh7770_generic_defconfig1
-rw-r--r--arch/sh/configs/sh7785lcr_defconfig1
-rw-r--r--arch/sh/configs/ul2_defconfig1
-rw-r--r--arch/sh/configs/urquell_defconfig1
-rw-r--r--arch/sh/kernel/syscalls/syscalltbl.sh4
-rw-r--r--arch/sh/kernel/syscalls_32.S2
-rw-r--r--arch/sh/mm/init.c18
-rw-r--r--arch/sh/mm/numa.c5
-rw-r--r--arch/sparc/configs/sparc32_defconfig1
-rw-r--r--arch/sparc/configs/sparc64_defconfig1
-rw-r--r--arch/sparc/include/asm/pgtable_64.h30
-rw-r--r--arch/sparc/kernel/pci_fire.c3
-rw-r--r--arch/sparc/kernel/pci_schizo.c3
-rw-r--r--arch/sparc/kernel/prom_64.c7
-rw-r--r--arch/sparc/kernel/psycho_common.c3
-rw-r--r--arch/sparc/kernel/sbus.c3
-rw-r--r--arch/sparc/mm/init_64.c15
-rw-r--r--arch/unicore32/mm/mmu.c14
-rw-r--r--arch/x86/Kconfig2
-rw-r--r--arch/x86/configs/i386_defconfig1
-rw-r--r--arch/x86/configs/x86_64_defconfig1
-rw-r--r--arch/x86/include/asm/hugetlb.h7
-rw-r--r--arch/x86/include/asm/paravirt.h13
-rw-r--r--arch/x86/include/asm/paravirt_types.h5
-rw-r--r--arch/x86/include/asm/pci.h3
-rw-r--r--arch/x86/include/asm/uaccess.h24
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c7
-rw-r--r--arch/x86/kernel/setup_percpu.c2
-rw-r--r--arch/x86/kernel/smpboot.c3
-rw-r--r--arch/x86/lib/usercopy_32.c8
-rw-r--r--arch/x86/mm/fault.c2
-rw-r--r--arch/x86/mm/numa.c4
-rw-r--r--arch/x86/xen/mmu.h4
-rw-r--r--arch/x86/xen/mmu_pv.c8
-rw-r--r--crypto/Makefile2
-rw-r--r--crypto/lzo-rle.c175
-rw-r--r--crypto/tcrypt.c4
-rw-r--r--drivers/base/memory.c42
-rw-r--r--drivers/block/mtip32xx/mtip32xx.c5
-rw-r--r--drivers/block/zram/zcomp.c1
-rw-r--r--drivers/block/zram/zram_drv.c2
-rw-r--r--drivers/char/agp/efficeon-agp.c2
-rw-r--r--drivers/dma/dmaengine.c4
-rw-r--r--drivers/gpu/drm/i915/i915_utils.h6
-rw-r--r--drivers/hv/hv_balloon.c21
-rw-r--r--drivers/infiniband/hw/hfi1/affinity.c3
-rw-r--r--drivers/infiniband/hw/hfi1/init.c3
-rw-r--r--drivers/iommu/dmar.c5
-rw-r--r--drivers/iommu/intel-iommu.c3
-rw-r--r--drivers/misc/sgi-xp/xpc_uv.c3
-rw-r--r--drivers/misc/sram-exec.c2
-rw-r--r--drivers/misc/vmw_balloon.c32
-rw-r--r--drivers/net/ethernet/intel/ixgbe/ixgbe_main.c5
-rw-r--r--drivers/rapidio/rio_cm.c4
-rw-r--r--drivers/xen/balloon.c18
-rw-r--r--fs/Kconfig2
-rw-r--r--fs/autofs/autofs_i.h3
-rw-r--r--fs/autofs/inode.c19
-rw-r--r--fs/binfmt_elf.c32
-rw-r--r--fs/binfmt_script.c41
-rw-r--r--fs/btrfs/ctree.h34
-rw-r--r--fs/buffer.c56
-rw-r--r--fs/eventpoll.c173
-rw-r--r--fs/exec.c9
-rw-r--r--fs/fat/file.c1
-rw-r--r--fs/file.c1
-rw-r--r--fs/hugetlbfs/inode.c14
-rw-r--r--fs/inode.c8
-rw-r--r--fs/kernfs/file.c31
-rw-r--r--fs/namei.c2
-rw-r--r--fs/ocfs2/alloc.c159
-rw-r--r--fs/ocfs2/aops.c22
-rw-r--r--fs/ocfs2/cluster/nodemanager.c14
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c1
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c29
-rw-r--r--fs/ocfs2/dlmglue.c5
-rw-r--r--fs/ocfs2/ocfs2.h1
-rw-r--r--fs/ocfs2/ocfs2_trace.h2
-rw-r--r--fs/ocfs2/slot_map.c8
-rw-r--r--fs/ocfs2/super.c2
-rw-r--r--fs/pipe.c3
-rw-r--r--fs/proc/array.c16
-rw-r--r--fs/proc/base.c8
-rw-r--r--fs/proc/internal.h2
-rw-r--r--fs/proc/page.c4
-rw-r--r--fs/proc/root.c2
-rw-r--r--fs/proc/self.c16
-rw-r--r--fs/proc/stat.c60
-rw-r--r--fs/proc/task_mmu.c8
-rw-r--r--fs/proc/task_nommu.c2
-rw-r--r--fs/proc/thread_self.c16
-rw-r--r--fs/ramfs/inode.c12
-rw-r--r--include/asm-generic/pgtable.h18
-rw-r--r--include/linux/acpi.h11
-rw-r--r--include/linux/balloon_compaction.h34
-rw-r--r--include/linux/bitops.h2
-rw-r--r--include/linux/build_bug.h19
-rw-r--r--include/linux/cgroup-defs.h4
-rw-r--r--include/linux/compaction.h7
-rw-r--r--include/linux/delay.h1
-rw-r--r--include/linux/device.h4
-rw-r--r--include/linux/dynamic_debug.h102
-rw-r--r--include/linux/frontswap.h7
-rw-r--r--include/linux/fs.h5
-rw-r--r--include/linux/genalloc.h2
-rw-r--r--include/linux/gfp.h32
-rw-r--r--include/linux/hmm.h294
-rw-r--r--include/linux/hugetlb.h70
-rw-r--r--include/linux/ipc_namespace.h1
-rw-r--r--include/linux/kasan-checks.h2
-rw-r--r--include/linux/kernel.h31
-rw-r--r--include/linux/kernfs.h8
-rw-r--r--include/linux/ksm.h7
-rw-r--r--include/linux/limits.h36
-rw-r--r--include/linux/list.h28
-rw-r--r--include/linux/lzo.h6
-rw-r--r--include/linux/memblock.h41
-rw-r--r--include/linux/memcontrol.h74
-rw-r--r--include/linux/memory_hotplug.h2
-rw-r--r--include/linux/mm.h102
-rw-r--r--include/linux/mm_types.h76
-rw-r--r--include/linux/mmzone.h63
-rw-r--r--include/linux/net.h2
-rw-r--r--include/linux/nodemask.h15
-rw-r--r--include/linux/page-flags.h44
-rw-r--r--include/linux/pagemap.h32
-rw-r--r--include/linux/pid.h1
-rw-r--r--include/linux/poison.h2
-rw-r--r--include/linux/printk.h2
-rw-r--r--include/linux/psi.h8
-rw-r--r--include/linux/psi_types.h83
-rw-r--r--include/linux/sched.h15
-rw-r--r--include/linux/sched/mm.h48
-rw-r--r--include/linux/sched/signal.h18
-rw-r--r--include/linux/shmem_fs.h3
-rw-r--r--include/linux/slab.h3
-rw-r--r--include/linux/slub_def.h12
-rw-r--r--include/linux/swap.h2
-rw-r--r--include/linux/uaccess.h34
-rw-r--r--include/uapi/linux/auto_fs.h2
-rw-r--r--include/uapi/linux/binfmts.h2
-rw-r--r--include/uapi/linux/fcntl.h1
-rw-r--r--include/uapi/linux/kernel-page-flags.h2
-rw-r--r--include/uapi/linux/limits.h4
-rw-r--r--init/Kconfig23
-rw-r--r--init/init_task.c3
-rw-r--r--init/initramfs.c12
-rw-r--r--ipc/ipc_sysctl.c14
-rw-r--r--ipc/sem.c7
-rw-r--r--ipc/util.c29
-rw-r--r--ipc/util.h46
-rw-r--r--kernel/.gitignore2
-rw-r--r--kernel/Makefile11
-rw-r--r--kernel/cgroup/cgroup.c107
-rw-r--r--kernel/configs.c42
-rw-r--r--kernel/crash_core.c2
-rw-r--r--kernel/dma/remap.c2
-rw-r--r--kernel/fork.c1
-rw-r--r--kernel/gcov/gcc_3_4.c6
-rw-r--r--kernel/hung_task.c1
-rw-r--r--kernel/kcov.c15
-rw-r--r--kernel/kthread.c3
-rw-r--r--kernel/module.c6
-rw-r--r--kernel/panic.c10
-rw-r--r--kernel/power/snapshot.c17
-rw-r--r--kernel/ptrace.c15
-rw-r--r--kernel/sched/core.c3
-rw-r--r--kernel/sched/fair.c15
-rw-r--r--kernel/sched/psi.c609
-rw-r--r--kernel/signal.c2
-rw-r--r--kernel/sys.c1
-rw-r--r--kernel/sysctl.c101
-rw-r--r--kernel/workqueue.c10
-rw-r--r--lib/Kconfig.debug36
-rw-r--r--lib/Kconfig.kasan10
-rw-r--r--lib/Kconfig.ubsan14
-rw-r--r--lib/Makefile1
-rw-r--r--lib/assoc_array.c1
-rw-r--r--lib/cpumask.c3
-rw-r--r--lib/debugobjects.c66
-rw-r--r--lib/div64.c4
-rw-r--r--lib/dynamic_debug.c22
-rw-r--r--lib/genalloc.c5
-rw-r--r--lib/lzo/lzo1x_compress.c130
-rw-r--r--lib/lzo/lzo1x_decompress_safe.c75
-rw-r--r--lib/lzo/lzodefs.h21
-rw-r--r--lib/test_kasan.c24
-rw-r--r--lib/test_ubsan.c11
-rw-r--r--lib/test_vmalloc.c551
-rw-r--r--lib/vsprintf.c5
-rw-r--r--mm/Kconfig1
-rw-r--r--mm/Kconfig.debug17
-rw-r--r--mm/Makefile7
-rw-r--r--mm/cma_debug.c2
-rw-r--r--mm/compaction.c1013
-rw-r--r--mm/debug.c4
-rw-r--r--mm/dmapool.c13
-rw-r--r--mm/failslab.c14
-rw-r--r--mm/filemap.c300
-rw-r--r--mm/gup.c200
-rw-r--r--mm/gup_benchmark.c8
-rw-r--r--mm/hmm.c1063
-rw-r--r--mm/huge_memory.c23
-rw-r--r--mm/hugetlb.c69
-rw-r--r--mm/internal.h24
-rw-r--r--mm/kasan/common.c31
-rw-r--r--mm/kasan/generic.c19
-rw-r--r--mm/kasan/generic_report.c3
-rw-r--r--mm/kasan/kasan.h3
-rw-r--r--mm/khugepaged.c2
-rw-r--r--mm/kmemleak.c10
-rw-r--r--mm/ksm.c78
-rw-r--r--mm/list_lru.c3
-rw-r--r--mm/memblock.c67
-rw-r--r--mm/memcontrol.c161
-rw-r--r--mm/memfd.c3
-rw-r--r--mm/memory.c69
-rw-r--r--mm/memory_hotplug.c57
-rw-r--r--mm/mempolicy.c10
-rw-r--r--mm/mempool.c8
-rw-r--r--mm/migrate.c14
-rw-r--r--mm/mmap.c15
-rw-r--r--mm/mprotect.c6
-rw-r--r--mm/oom_kill.c81
-rw-r--r--mm/page-writeback.c24
-rw-r--r--mm/page_alloc.c881
-rw-r--r--mm/page_ext.c2
-rw-r--r--mm/page_owner.c8
-rw-r--r--mm/page_poison.c4
-rw-r--r--mm/readahead.c2
-rw-r--r--mm/shmem.c741
-rw-r--r--mm/shuffle.c207
-rw-r--r--mm/shuffle.h64
-rw-r--r--mm/slab.c34
-rw-r--r--mm/slab.h11
-rw-r--r--mm/slab_common.c15
-rw-r--r--mm/slub.c107
-rw-r--r--mm/swap.c17
-rw-r--r--mm/swap_state.c23
-rw-r--r--mm/swapfile.c490
-rw-r--r--mm/truncate.c6
-rw-r--r--mm/util.c37
-rw-r--r--mm/vmalloc.c491
-rw-r--r--mm/vmscan.c163
-rw-r--r--mm/vmstat.c15
-rw-r--r--net/core/pktgen.c3
-rw-r--r--net/qrtr/qrtr.c3
-rw-r--r--scripts/Makefile.kasan5
-rwxr-xr-xscripts/checkpatch.pl60
-rwxr-xr-xscripts/decode_stacktrace.sh9
-rw-r--r--scripts/gcc-plugins/Kconfig4
-rw-r--r--scripts/spelling.txt69
-rw-r--r--tools/include/linux/numa.h16
-rw-r--r--tools/perf/bench/numa.c7
-rw-r--r--tools/testing/selftests/memfd/memfd_test.c74
-rw-r--r--tools/testing/selftests/proc/.gitignore1
-rw-r--r--tools/testing/selftests/proc/Makefile1
-rw-r--r--tools/testing/selftests/proc/proc-loadavg-001.c2
-rw-r--r--tools/testing/selftests/proc/proc-pid-vm.c406
-rw-r--r--tools/testing/selftests/proc/proc-self-map-files-002.c2
-rw-r--r--tools/testing/selftests/proc/proc-self-syscall.c2
-rw-r--r--tools/testing/selftests/proc/proc-self-wchan.c2
-rw-r--r--tools/testing/selftests/proc/read.c14
-rwxr-xr-xtools/testing/selftests/sysctl/sysctl.sh55
-rwxr-xr-xtools/testing/selftests/vm/run_vmtests16
-rw-r--r--tools/testing/selftests/vm/test_vmalloc.sh176
-rw-r--r--tools/vm/page-types.c2
-rw-r--r--tools/vm/slabinfo.c35
380 files changed, 9642 insertions, 3960 deletions
diff --git a/Documentation/ABI/testing/sysfs-devices-memory b/Documentation/ABI/testing/sysfs-devices-memory
index deef3b5723cf..02a4250964e0 100644
--- a/Documentation/ABI/testing/sysfs-devices-memory
+++ b/Documentation/ABI/testing/sysfs-devices-memory
@@ -91,3 +91,28 @@ Description:
memory section directory. For example, the following symbolic
link is created for memory section 9 on node0.
/sys/devices/system/node/node0/memory9 -> ../../memory/memory9
+
+What: /sys/devices/system/memory/probe
+Date: October 2005
+Contact: Linux Memory Management list <linux-mm@kvack.org>
+Description:
+ The file /sys/devices/system/memory/probe is write-only, and
+ when written will simulate a physical hot-add of a memory
+ section at the given address. For example, assuming a section
+ of unused memory exists at physical address 0x80000000, it can
+ be introduced to the kernel with the following command:
+ # echo 0x80000000 > /sys/devices/system/memory/probe
+Users: Memory hotplug testing and development
+
+What: /sys/devices/system/memory/memoryX/remove
+Date: February 2019
+Contact: Linux Memory Management list <linux-mm@kvack.org>
+Description:
+ The file /sys/devices/system/memory/memoryX/remove is
+ write-only, and when written with a boolean 'true' value will
+ simulate a physical hot-remove of that memory section. For
+ example, assuming a 1GB section size, the section added by the
+ above "probe" example could be removed again with the following
+ command:
+ # echo 1 > /sys/devices/system/memory/memory2/remove
+Users: Memory hotplug testing and development
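As a quick illustration of the probe/remove interface described above, here is a minimal userspace sketch that writes the same values as the echo examples; the physical address 0x80000000 and the section name memory2 are the hypothetical values from those examples and are not guaranteed to exist on any particular system.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Write a string to a sysfs attribute and report failures. */
static int write_sysfs(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);
	ssize_t n;

	if (fd < 0) {
		perror(path);
		return -1;
	}
	n = write(fd, val, strlen(val));
	close(fd);
	return n < 0 ? -1 : 0;
}

int main(void)
{
	/* Hot-add a section at a (hypothetical) unused physical address. */
	write_sysfs("/sys/devices/system/memory/probe", "0x80000000");
	/* Hot-remove the (hypothetical) section it became part of. */
	write_sysfs("/sys/devices/system/memory/memory2/remove", "1");
	return 0;
}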
diff --git a/Documentation/accounting/psi.txt b/Documentation/accounting/psi.txt
index b8ca28b60215..4fb40fe94828 100644
--- a/Documentation/accounting/psi.txt
+++ b/Documentation/accounting/psi.txt
@@ -63,6 +63,110 @@ tracked and exported as well, to allow detection of latency spikes
which wouldn't necessarily make a dent in the time averages, or to
average trends over custom time frames.
+Monitoring for pressure thresholds
+==================================
+
+Users can register triggers and use poll() to be woken up when resource
+pressure exceeds certain thresholds.
+
+A trigger describes the maximum cumulative stall time over a specific
+time window, e.g. 100ms of total stall time within any 500ms window to
+generate a wakeup event.
+
+To register a trigger, the user has to open the psi interface file under
+/proc/pressure/ representing the resource to be monitored and write the
+desired threshold and time window. The open file descriptor should be
+used to wait for trigger events using select(), poll() or epoll().
+The following format is used:
+
+<some|full> <stall amount in us> <time window in us>
+
+For example, writing "some 150000 1000000" into /proc/pressure/memory
+would add a 150ms threshold for partial memory stall measured within a
+1sec time window. Writing "full 50000 1000000" into /proc/pressure/io
+would add a 50ms threshold for full io stall measured within a 1sec time
+window.
+
+Triggers can be set on more than one psi metric, and more than one trigger
+for the same psi metric can be specified. However, each trigger requires a
+separate file descriptor so that it can be polled independently of the
+others; therefore a separate open() syscall should be made for each
+trigger, even when opening the same psi interface file.
+
+Monitors activate only when the system enters a stall state for the
+monitored psi metric and deactivate upon exit from the stall state. While
+the system is in the stall state, psi signal growth is monitored at a rate
+of 10 times per tracking window.
+
+The kernel accepts window sizes ranging from 500ms to 10s; therefore the
+minimum monitoring update interval is 50ms and the maximum is 1s. The lower
+limit prevents overly frequent polling. The upper limit is chosen as a high
+enough value beyond which monitors are most likely not needed and psi
+averages can be used instead.
+
+When activated, a psi monitor stays active for at least the duration of one
+tracking window to avoid repeated activations/deactivations while the system
+is bouncing in and out of the stall state.
+
+Notifications to userspace are rate-limited to one per tracking window.
+
+The trigger will de-register when the file descriptor used to define the
+trigger is closed.
+
+Userspace monitor usage example
+===============================
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <poll.h>
+#include <string.h>
+#include <unistd.h>
+
+/*
+ * Monitor memory partial stall with 1s tracking window size
+ * and 150ms threshold.
+ */
+int main() {
+ const char trig[] = "some 150000 1000000";
+ struct pollfd fds;
+ int n;
+
+ fds.fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
+ if (fds.fd < 0) {
+ printf("/proc/pressure/memory open error: %s\n",
+ strerror(errno));
+ return 1;
+ }
+ fds.events = POLLPRI;
+
+ if (write(fds.fd, trig, strlen(trig) + 1) < 0) {
+ printf("/proc/pressure/memory write error: %s\n",
+ strerror(errno));
+ return 1;
+ }
+
+ printf("waiting for events...\n");
+ while (1) {
+ n = poll(&fds, 1, -1);
+ if (n < 0) {
+ printf("poll error: %s\n", strerror(errno));
+ return 1;
+ }
+ if (fds.revents & POLLERR) {
+ printf("got POLLERR, event source is gone\n");
+ return 0;
+ }
+ if (fds.revents & POLLPRI) {
+ printf("event triggered!\n");
+ } else {
+ printf("unknown event received: 0x%x\n", fds.revents);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
Cgroup2 interface
=================
@@ -71,3 +175,6 @@ mounted, pressure stall information is also tracked for tasks grouped
into cgroups. Each subdirectory in the cgroupfs mountpoint contains
cpu.pressure, memory.pressure, and io.pressure files; the format is
the same as the /proc/pressure/ files.
+
+Per-cgroup psi monitors can be specified and used the same way as
+system-wide ones.
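To illustrate the per-cgroup case, the sketch below follows the same pattern as the system-wide example above, only the path changes; it assumes cgroup2 is mounted at /sys/fs/cgroup and that a cgroup named "workload" exists, both of which are illustrative assumptions.

#include <fcntl.h>
#include <poll.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char trig[] = "some 150000 1000000";
	struct pollfd fds;

	/* Register the trigger on the cgroup's memory.pressure file. */
	fds.fd = open("/sys/fs/cgroup/workload/memory.pressure",
		      O_RDWR | O_NONBLOCK);
	if (fds.fd < 0)
		return 1;
	fds.events = POLLPRI;

	if (write(fds.fd, trig, strlen(trig) + 1) < 0)
		return 1;

	/* Wait for one event; POLLERR means the event source went away. */
	if (poll(&fds, 1, -1) > 0 && (fds.revents & POLLPRI))
		return 0;	/* pressure threshold was crossed */
	return 1;
}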
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 61f8bbb0a1b2..a51bc1e9ef40 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -606,8 +606,8 @@ on an IO device and is an example of this type.
Protections
-----------
-A cgroup is protected to be allocated upto the configured amount of
-the resource if the usages of all its ancestors are under their
+A cgroup is protected up to the configured amount of the resource
+as long as the usages of all its ancestors are under their
protected levels. Protections can be hard guarantees or best effort
soft boundaries. Protections can also be over-committed in which case
only upto the amount available to the parent is protected among
@@ -1020,7 +1020,10 @@ PAGE_SIZE multiple when read back.
is within its effective min boundary, the cgroup's memory
won't be reclaimed under any conditions. If there is no
unprotected reclaimable memory available, OOM killer
- is invoked.
+ is invoked. Above the effective min boundary (or
+ effective low boundary if it is higher), pages are reclaimed
+ proportionally to the overage, reducing reclaim pressure for
+ smaller overages.
Effective min boundary is limited by memory.min values of
all ancestor cgroups. If there is memory.min overcommitment
@@ -1042,7 +1045,10 @@ PAGE_SIZE multiple when read back.
Best-effort memory protection. If the memory usage of a
cgroup is within its effective low boundary, the cgroup's
memory won't be reclaimed unless memory can be reclaimed
- from unprotected cgroups.
+ from unprotected cgroups. Above the effective low boundary (or
+ effective min boundary if it is higher), pages are reclaimed
+ proportionally to the overage, reducing reclaim pressure for
+ smaller overages.
Effective low boundary is limited by memory.low values of
all ancestor cgroups. If there is memory.low overcommitment
@@ -1189,6 +1195,10 @@ PAGE_SIZE multiple when read back.
Amount of cached filesystem data that was modified and
is currently being written back to disk
+ anon_thp
+ Amount of memory used in anonymous mappings backed by
+ transparent hugepages
+
inactive_anon, active_anon, inactive_file, active_file, unevictable
Amount of memory, swap-backed and filesystem-backed,
on the internal memory management lists used by the
@@ -1248,6 +1258,18 @@ PAGE_SIZE multiple when read back.
Amount of reclaimed lazyfree pages
+ thp_fault_alloc
+
+ Number of transparent hugepages which were allocated to satisfy
+ a page fault, including COW faults. This counter is not present
+ when CONFIG_TRANSPARENT_HUGEPAGE is not set.
+
+ thp_collapse_alloc
+
+ Number of transparent hugepages which were allocated to allow
+ collapsing an existing range of pages. This counter is not
+ present when CONFIG_TRANSPARENT_HUGEPAGE is not set.
+
memory.swap.current
A read-only single value file which exists on non-root
cgroups.
@@ -2283,8 +2305,10 @@ system performance due to overreclaim, to the point where the feature
becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
-reserve. A cgroup enjoys reclaim protection when it's within its low,
-which makes delegation of subtrees possible.
+reserve. A cgroup enjoys reclaim protection when it's within its
+effective low, which makes delegation of subtrees possible. It also
+enjoys having reclaim pressure proportional to its overage when
+above its effective low.
The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 518a8b47cb56..7eb347fe95ec 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1828,6 +1828,9 @@
ip= [IP_PNP]
See Documentation/filesystems/nfs/nfsroot.txt.
+ ipcmni_extend [KNL] Extend the maximum number of unique System V
+ IPC identifiers from 32,768 to 8,388,608.
+
irqaffinity= [SMP] Set the default irq affinity mask
The argument is a cpu list, as described above.
@@ -3110,6 +3113,16 @@
This will also cause panics on machine check exceptions.
Useful together with panic=30 to trigger a reboot.
+ page_alloc.shuffle=
+ [KNL] Boolean flag to control whether the page allocator
+ should randomize its free lists. The randomization may
+ be automatically enabled if the kernel detects it is
+ running on a platform with a direct-mapped memory-side
+ cache, and this parameter can be used to
+ override/disable that behavior. The state of the flag
+ can be read from sysfs at:
+ /sys/module/page_alloc/parameters/shuffle.
+
page_owner= [KNL] Boot-time page_owner enabling option.
Storage of the information about who allocated
each page is disabled in default. With this switch,
diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst
index 3f7bade2c231..340a5aee9b80 100644
--- a/Documentation/admin-guide/mm/pagemap.rst
+++ b/Documentation/admin-guide/mm/pagemap.rst
@@ -75,9 +75,10 @@ number of times a page is mapped.
20. NOPAGE
21. KSM
22. THP
- 23. BALLOON
+ 23. OFFLINE
24. ZERO_PAGE
25. IDLE
+ 26. PGTABLE
* ``/proc/kpagecgroup``. This file contains a 64-bit inode number of the
memory cgroup each page is charged to, indexed by PFN. Only available when
@@ -118,8 +119,8 @@ Short descriptions to the page flags
identical memory pages dynamically shared between one or more processes
22 - THP
contiguous pages which construct transparent hugepages
-23 - BALLOON
- balloon compaction page
+23 - OFFLINE
+ page is logically offline
24 - ZERO_PAGE
zero page for pfn_zero or huge_zero page
25 - IDLE
@@ -128,6 +129,8 @@ Short descriptions to the page flags
Note that this flag may be stale in case the page was accessed via
a PTE. To make sure the flag is up-to-date one has to read
``/sys/kernel/mm/page_idle/bitmap`` first.
+26 - PGTABLE
+ page is in use as a page table
IO related page flags
---------------------
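Regarding the OFFLINE and PGTABLE bits documented above, the following minimal sketch reads /proc/kpageflags (64 bits of flags per PFN, indexed by PFN) and tests the PGTABLE bit; the PFN value is purely illustrative and reading the file requires root.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	uint64_t flags, pfn = 0x1234;	/* hypothetical PFN */
	int fd = open("/proc/kpageflags", O_RDONLY);

	if (fd < 0 ||
	    pread(fd, &flags, sizeof(flags), pfn * sizeof(flags)) != sizeof(flags)) {
		perror("/proc/kpageflags");
		return 1;
	}
	/* Bit 26 is PGTABLE: the page is in use as a page table. */
	printf("pfn %#llx: %s\n", (unsigned long long)pfn,
	       (flags & (1ULL << 26)) ? "page table" : "not a page table");
	return 0;
}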
diff --git a/Documentation/core-api/genalloc.rst b/Documentation/core-api/genalloc.rst
index 6b38a39fab24..a534cc7ebd05 100644
--- a/Documentation/core-api/genalloc.rst
+++ b/Documentation/core-api/genalloc.rst
@@ -129,7 +129,7 @@ writing of special-purpose memory allocators in the future.
:functions: gen_pool_for_each_chunk
.. kernel-doc:: lib/genalloc.c
- :functions: addr_in_gen_pool
+ :functions: gen_pool_has_addr
.. kernel-doc:: lib/genalloc.c
:functions: gen_pool_avail
diff --git a/Documentation/dontdiff b/Documentation/dontdiff
index 2228fcc8e29f..ef25a066d952 100644
--- a/Documentation/dontdiff
+++ b/Documentation/dontdiff
@@ -106,7 +106,6 @@ compile.h*
conf
config
config-*
-config_data.h*
config.mak
config.mak.autogen
conmakehash
diff --git a/Documentation/lzo.txt b/Documentation/lzo.txt
index 6fa6a93d0949..f79934225d8d 100644
--- a/Documentation/lzo.txt
+++ b/Documentation/lzo.txt
@@ -78,16 +78,34 @@ Description
is an implementation design choice independent on the algorithm or
encoding.
+Versions
+
+0: Original version
+1: LZO-RLE
+
+Version 1 of LZO implements an extension to encode runs of zeros using run
+length encoding. This improves speed for data with many zeros, which is a
+common case for zram. This modifies the bitstream in a backwards compatible way
+(v1 can correctly decompress v0 compressed data, but v0 cannot read v1 data).
+
+For maximum compatibility, both versions are available under different names
+(lzo and lzo-rle). Differences in the encoding are noted in this document with
+e.g.: version 1 only.
+
Byte sequences
==============
First byte encoding::
- 0..17 : follow regular instruction encoding, see below. It is worth
- noting that codes 16 and 17 will represent a block copy from
- the dictionary which is empty, and that they will always be
+ 0..16 : follow regular instruction encoding, see below. It is worth
+ noting that code 16 will represent a block copy from the
+ dictionary which is empty, and that it will always be
invalid at this place.
+ 17 : bitstream version. If the first byte is 17, the next byte
+ gives the bitstream version (version 1 only). If the first byte
+ is not 17, the bitstream version is 0.
+
18..21 : copy 0..3 literals
state = (byte - 17) = 0..3 [ copy <state> literals ]
skip byte
@@ -140,6 +158,11 @@ Byte sequences
state = S (copy S literals after this block)
End of stream is reached if distance == 16384
+ In version 1 only, this instruction is also used to encode a run of
+ zeros if distance = 0xbfff, i.e. H = 1 and the D bits are all 1.
+ In this case, it is followed by a fourth byte, X.
+ run length = ((X << 3) | (0 0 0 0 0 L L L)) + 4.
+
0 0 1 L L L L L (32..63)
Copy of small block within 16kB distance (preferably less than 34B)
length = 2 + (L ?: 31 + (zero_bytes * 255) + non_zero_byte)
@@ -165,7 +188,9 @@ Authors
=======
This document was written by Willy Tarreau <w@1wt.eu> on 2014/07/19 during an
- analysis of the decompression code available in Linux 3.16-rc5. The code is
- tricky, it is possible that this document contains mistakes or that a few
- corner cases were overlooked. In any case, please report any doubt, fix, or
- proposed updates to the author(s) so that the document can be updated.
+ analysis of the decompression code available in Linux 3.16-rc5, and updated
+ by Dave Rodgman <dave.rodgman@arm.com> on 2018/10/30 to introduce run-length
+ encoding. The code is tricky, it is possible that this document contains
+ mistakes or that a few corner cases were overlooked. In any case, please
+ report any doubt, fix, or proposed updates to the author(s) so that the
+ document can be updated.
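As a worked example of the version 1 run-length encoding described above (numbers chosen purely for illustration): if the three low L bits of the instruction byte are 3 and the following byte X is 5, the run length is ((5 << 3) | 3) + 4 = 47 zero bytes.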
diff --git a/Documentation/process/4.Coding.rst b/Documentation/process/4.Coding.rst
index cfe264889447..4b7a5ab3cec1 100644
--- a/Documentation/process/4.Coding.rst
+++ b/Documentation/process/4.Coding.rst
@@ -249,7 +249,7 @@ features; most of these are found in the "kernel hacking" submenu. Several
of these options should be turned on for any kernel used for development or
testing purposes. In particular, you should turn on:
- - ENABLE_WARN_DEPRECATED, ENABLE_MUST_CHECK, and FRAME_WARN to get an
+ - ENABLE_MUST_CHECK and FRAME_WARN to get an
extra set of warnings for problems like the use of deprecated interfaces
or ignoring an important return value from a function. The output
generated by these warnings can be verbose, but one need not worry about
diff --git a/Documentation/translations/it_IT/process/4.Coding.rst b/Documentation/translations/it_IT/process/4.Coding.rst
index c61059015e52..c05b89e616dd 100644
--- a/Documentation/translations/it_IT/process/4.Coding.rst
+++ b/Documentation/translations/it_IT/process/4.Coding.rst
@@ -264,7 +264,7 @@ La maggior parte di queste opzioni possono essere attivate per qualsiasi
kernel utilizzato per lo sviluppo o a scopo di test. In particolare dovreste
attivare:
- - ENABLE_WARN_DEPRECATED, ENABLE_MUST_CHECK, e FRAME_WARN per ottenere degli
+ - ENABLE_MUST_CHECK e FRAME_WARN per ottenere degli
avvertimenti dedicati a problemi come l'uso di interfacce deprecate o
l'ignorare un importante valore di ritorno di una funzione. Il risultato
generato da questi avvertimenti può risultare verboso, ma non bisogna
diff --git a/Documentation/vm/slub.rst b/Documentation/vm/slub.rst
index 933ada4368ff..4cfe1f1e75b5 100644
--- a/Documentation/vm/slub.rst
+++ b/Documentation/vm/slub.rst
@@ -49,9 +49,10 @@ Possible debug options are::
P Poisoning (object and padding)
U User tracking (free and alloc)
T Trace (please only use on single slabs)
- A Toggle failslab filter mark for the cache
+ A Enable/disable failslab filter mark for the cache
O Switch debugging off for caches that would have
caused higher minimum slab orders
+ W Enable/disable WARN_ON() on slab errors
- Switch all debugging off (useful if the kernel is
configured with CONFIG_SLUB_DEBUG_ON)
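As an illustration of how these flags combine (assuming the usual slub_debug=<options>[,<cache>] boot syntax documented elsewhere in this file), booting with slub_debug=PUW,kmalloc-64 would enable poisoning, user tracking, and the WARN_ON() behaviour described above for just the kmalloc-64 cache, while slub_debug=- switches all debugging off.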
diff --git a/MAINTAINERS b/MAINTAINERS
index 10ca61e563d6..b5701497e7f2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9934,6 +9934,14 @@ F: kernel/sched/membarrier.c
F: include/uapi/linux/membarrier.h
F: arch/powerpc/include/asm/membarrier.h
+MEMBLOCK
+M: Mike Rapoport <rppt@linux.ibm.com>
+L: linux-mm@kvack.org
+S: Maintained
+F: include/linux/memblock.h
+F: mm/memblock.c
+F: Documentation/core-api/boot-time-mm.rst
+
MEMORY MANAGEMENT
L: linux-mm@kvack.org
W: http://www.linux-mm.org
diff --git a/arch/alpha/include/asm/topology.h b/arch/alpha/include/asm/topology.h
index e6e13a85796a..5a77a40567fa 100644
--- a/arch/alpha/include/asm/topology.h
+++ b/arch/alpha/include/asm/topology.h
@@ -4,6 +4,7 @@
#include <linux/smp.h>
#include <linux/threads.h>
+#include <linux/numa.h>
#include <asm/machvec.h>
#ifdef CONFIG_NUMA
@@ -29,7 +30,7 @@ static const struct cpumask *cpumask_of_node(int node)
{
int cpu;
- if (node == -1)
+ if (node == NUMA_NO_NODE)
return cpu_all_mask;
cpumask_clear(&node_to_cpumask_map[node]);
diff --git a/arch/arc/configs/axs101_defconfig b/arch/arc/configs/axs101_defconfig
index 020d4493edfd..e31a8ebc3ecc 100644
--- a/arch/arc/configs/axs101_defconfig
+++ b/arch/arc/configs/axs101_defconfig
@@ -99,7 +99,6 @@ CONFIG_NFS_FS=y
CONFIG_NFS_V3_ACL=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_STRIP_ASM_SYMS=y
CONFIG_SOFTLOCKUP_DETECTOR=y
diff --git a/arch/arc/configs/axs103_defconfig b/arch/arc/configs/axs103_defconfig
index 666314fffc60..e0e8567f0d75 100644
--- a/arch/arc/configs/axs103_defconfig
+++ b/arch/arc/configs/axs103_defconfig
@@ -97,7 +97,6 @@ CONFIG_NFS_FS=y
CONFIG_NFS_V3_ACL=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_STRIP_ASM_SYMS=y
CONFIG_SOFTLOCKUP_DETECTOR=y
diff --git a/arch/arc/configs/axs103_smp_defconfig b/arch/arc/configs/axs103_smp_defconfig
index 429832b8560b..fcbc952bc75b 100644
--- a/arch/arc/configs/axs103_smp_defconfig
+++ b/arch/arc/configs/axs103_smp_defconfig
@@ -100,7 +100,6 @@ CONFIG_NFS_FS=y
CONFIG_NFS_V3_ACL=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_STRIP_ASM_SYMS=y
CONFIG_SOFTLOCKUP_DETECTOR=y
diff --git a/arch/arc/configs/haps_hs_defconfig b/arch/arc/configs/haps_hs_defconfig
index 240dd2cd5148..f56cc2070c11 100644
--- a/arch/arc/configs/haps_hs_defconfig
+++ b/arch/arc/configs/haps_hs_defconfig
@@ -75,7 +75,6 @@ CONFIG_EXT2_FS_XATTR=y
CONFIG_TMPFS=y
# CONFIG_MISC_FILESYSTEMS is not set
CONFIG_NFS_FS=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_DEBUG_MEMORY_INIT=y
# CONFIG_DEBUG_PREEMPT is not set
diff --git a/arch/arc/configs/haps_hs_smp_defconfig b/arch/arc/configs/haps_hs_smp_defconfig
index 14ae7e5acc7c..b6f2482c7e74 100644
--- a/arch/arc/configs/haps_hs_smp_defconfig
+++ b/arch/arc/configs/haps_hs_smp_defconfig
@@ -78,7 +78,6 @@ CONFIG_EXT2_FS_XATTR=y
CONFIG_TMPFS=y
# CONFIG_MISC_FILESYSTEMS is not set
CONFIG_NFS_FS=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_SOFTLOCKUP_DETECTOR=y
# CONFIG_DEBUG_PREEMPT is not set
diff --git a/arch/arc/configs/hsdk_defconfig b/arch/arc/configs/hsdk_defconfig
index 87b23b7fb781..6fd3d29546af 100644
--- a/arch/arc/configs/hsdk_defconfig
+++ b/arch/arc/configs/hsdk_defconfig
@@ -71,7 +71,6 @@ CONFIG_NFS_FS=y
CONFIG_NFS_V3_ACL=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_STRIP_ASM_SYMS=y
CONFIG_SOFTLOCKUP_DETECTOR=y
diff --git a/arch/arc/configs/nps_defconfig b/arch/arc/configs/nps_defconfig
index 621f59407d76..f0a077c00efa 100644
--- a/arch/arc/configs/nps_defconfig
+++ b/arch/arc/configs/nps_defconfig
@@ -76,7 +76,6 @@ CONFIG_NFS_FS=y
CONFIG_NFS_V3_ACL=y
CONFIG_ROOT_NFS=y
CONFIG_DEBUG_INFO=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_MEMORY_INIT=y
diff --git a/arch/arc/configs/nsim_700_defconfig b/arch/arc/configs/nsim_700_defconfig
index 219c2a65294b..318e4cd29629 100644
--- a/arch/arc/configs/nsim_700_defconfig
+++ b/arch/arc/configs/nsim_700_defconfig
@@ -56,6 +56,5 @@ CONFIG_EXT2_FS_XATTR=y
CONFIG_TMPFS=y
# CONFIG_MISC_FILESYSTEMS is not set
CONFIG_NFS_FS=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
# CONFIG_DEBUG_PREEMPT is not set
diff --git a/arch/arc/configs/nsim_hs_defconfig b/arch/arc/configs/nsim_hs_defconfig
index 739b90e5e893..c15807b0e0c1 100644
--- a/arch/arc/configs/nsim_hs_defconfig
+++ b/arch/arc/configs/nsim_hs_defconfig
@@ -56,6 +56,5 @@ CONFIG_EXT2_FS_XATTR=y
CONFIG_TMPFS=y
# CONFIG_MISC_FILESYSTEMS is not set
CONFIG_NFS_FS=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
# CONFIG_DEBUG_PREEMPT is not set
diff --git a/arch/arc/configs/nsim_hs_smp_defconfig b/arch/arc/configs/nsim_hs_smp_defconfig
index b5895bdf3a93..65e983fd942b 100644
--- a/arch/arc/configs/nsim_hs_smp_defconfig
+++ b/arch/arc/configs/nsim_hs_smp_defconfig
@@ -55,5 +55,4 @@ CONFIG_EXT2_FS_XATTR=y
CONFIG_TMPFS=y
# CONFIG_MISC_FILESYSTEMS is not set
CONFIG_NFS_FS=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
diff --git a/arch/arc/configs/nsimosci_defconfig b/arch/arc/configs/nsimosci_defconfig
index 35dfc6491a09..08c5b99ac341 100644
--- a/arch/arc/configs/nsimosci_defconfig
+++ b/arch/arc/configs/nsimosci_defconfig
@@ -68,5 +68,4 @@ CONFIG_TMPFS=y
# CONFIG_MISC_FILESYSTEMS is not set
CONFIG_NFS_FS=y
CONFIG_NFS_V3_ACL=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
diff --git a/arch/arc/configs/nsimosci_hs_defconfig b/arch/arc/configs/nsimosci_hs_defconfig
index 1638e5bc9672..5b5e26d67955 100644
--- a/arch/arc/configs/nsimosci_hs_defconfig
+++ b/arch/arc/configs/nsimosci_hs_defconfig
@@ -66,5 +66,4 @@ CONFIG_TMPFS=y
# CONFIG_MISC_FILESYSTEMS is not set
CONFIG_NFS_FS=y
CONFIG_NFS_V3_ACL=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
diff --git a/arch/arc/configs/nsimosci_hs_smp_defconfig b/arch/arc/configs/nsimosci_hs_smp_defconfig
index 11cfbdb0f441..26af9b2f7fcb 100644
--- a/arch/arc/configs/nsimosci_hs_smp_defconfig
+++ b/arch/arc/configs/nsimosci_hs_smp_defconfig
@@ -77,6 +77,5 @@ CONFIG_TMPFS=y
# CONFIG_MISC_FILESYSTEMS is not set
CONFIG_NFS_FS=y
CONFIG_NFS_V3_ACL=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_FTRACE=y
diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig
index e71ade3cf9c8..5b5119d2b5d5 100644
--- a/arch/arc/configs/tb10x_defconfig
+++ b/arch/arc/configs/tb10x_defconfig
@@ -92,7 +92,6 @@ CONFIG_CONFIGFS_FS=y
# CONFIG_MISC_FILESYSTEMS is not set
# CONFIG_NETWORK_FILESYSTEMS is not set
CONFIG_DEBUG_INFO=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
CONFIG_STRIP_ASM_SYMS=y
CONFIG_DEBUG_FS=y
CONFIG_HEADERS_CHECK=y
diff --git a/arch/arc/configs/vdk_hs38_defconfig b/arch/arc/configs/vdk_hs38_defconfig
index e447ace6fa1c..0c3b21416819 100644
--- a/arch/arc/configs/vdk_hs38_defconfig
+++ b/arch/arc/configs/vdk_hs38_defconfig
@@ -87,7 +87,6 @@ CONFIG_NFS_FS=y
CONFIG_NFS_V3_ACL=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_STRIP_ASM_SYMS=y
CONFIG_DEBUG_SHIRQ=y
diff --git a/arch/arc/configs/vdk_hs38_smp_defconfig b/arch/arc/configs/vdk_hs38_smp_defconfig
index c82cdb10aaf4..f9ad9d3ee702 100644
--- a/arch/arc/configs/vdk_hs38_smp_defconfig
+++ b/arch/arc/configs/vdk_hs38_smp_defconfig
@@ -91,7 +91,6 @@ CONFIG_NFS_FS=y
CONFIG_NFS_V3_ACL=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_STRIP_ASM_SYMS=y
CONFIG_DEBUG_SHIRQ=y
diff --git a/arch/arm/configs/bcm2835_defconfig b/arch/arm/configs/bcm2835_defconfig
index bd55a2be51fc..dcf7610cfe55 100644
--- a/arch/arm/configs/bcm2835_defconfig
+++ b/arch/arm/configs/bcm2835_defconfig
@@ -161,7 +161,6 @@ CONFIG_PRINTK_TIME=y
CONFIG_BOOT_PRINTK_DELAY=y
CONFIG_DYNAMIC_DEBUG=y
CONFIG_DEBUG_INFO=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_UNUSED_SYMBOLS=y
CONFIG_DEBUG_MEMORY_INIT=y
diff --git a/arch/arm/configs/cns3420vb_defconfig b/arch/arm/configs/cns3420vb_defconfig
index c6dcd6e4f4e6..419b73564f29 100644
--- a/arch/arm/configs/cns3420vb_defconfig
+++ b/arch/arm/configs/cns3420vb_defconfig
@@ -60,7 +60,6 @@ CONFIG_EXT2_FS_XATTR=y
CONFIG_AUTOFS4_FS=y
CONFIG_FSCACHE=y
CONFIG_TMPFS=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_DEBUG_FS=y
# CONFIG_ARM_UNWIND is not set
diff --git a/arch/arm/configs/efm32_defconfig b/arch/arm/configs/efm32_defconfig
index 860d27138e6f..ee42158f41ec 100644
--- a/arch/arm/configs/efm32_defconfig
+++ b/arch/arm/configs/efm32_defconfig
@@ -94,7 +94,6 @@ CONFIG_ROMFS_BACKED_BY_MTD=y
# CONFIG_NETWORK_FILESYSTEMS is not set
CONFIG_PRINTK_TIME=y
CONFIG_DEBUG_INFO=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_MAGIC_SYSRQ=y
# CONFIG_SCHED_DEBUG is not set
diff --git a/arch/arm/configs/eseries_pxa_defconfig b/arch/arm/configs/eseries_pxa_defconfig
index cd27d651463c..eabb784cf7da 100644
--- a/arch/arm/configs/eseries_pxa_defconfig
+++ b/arch/arm/configs/eseries_pxa_defconfig
@@ -103,7 +103,6 @@ CONFIG_NFS_V3=y
CONFIG_PARTITION_ADVANCED=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_CRYPTO_CBC=m
CONFIG_CRYPTO_PCBC=m
diff --git a/arch/arm/configs/gemini_defconfig b/arch/arm/configs/gemini_defconfig
index 553777ac2814..ef9aae89907d 100644
--- a/arch/arm/configs/gemini_defconfig
+++ b/arch/arm/configs/gemini_defconfig
@@ -87,6 +87,5 @@ CONFIG_TMPFS_POSIX_ACL=y
CONFIG_ROMFS_FS=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_DEBUG_FS=y
diff --git a/arch/arm/configs/mini2440_defconfig b/arch/arm/configs/mini2440_defconfig
index 88ea02e7ba19..d95a8059d30b 100644
--- a/arch/arm/configs/mini2440_defconfig
+++ b/arch/arm/configs/mini2440_defconfig
@@ -298,7 +298,6 @@ CONFIG_NLS_KOI8_R=m
CONFIG_NLS_KOI8_U=m
CONFIG_NLS_UTF8=m
CONFIG_DEBUG_INFO=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_STRIP_ASM_SYMS=y
CONFIG_DEBUG_FS=y
diff --git a/arch/arm/configs/moxart_defconfig b/arch/arm/configs/moxart_defconfig
index 2da0d9ee2107..078228a19339 100644
--- a/arch/arm/configs/moxart_defconfig
+++ b/arch/arm/configs/moxart_defconfig
@@ -125,7 +125,6 @@ CONFIG_CONFIGFS_FS=y
CONFIG_JFFS2_FS=y
CONFIG_PRINTK_TIME=y
CONFIG_DEBUG_INFO=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_DEBUG_PAGEALLOC=y
CONFIG_DEBUG_OBJECTS=y
diff --git a/arch/arm/configs/mps2_defconfig b/arch/arm/configs/mps2_defconfig
index 0bcdec7cc169..1d923dbb9928 100644
--- a/arch/arm/configs/mps2_defconfig
+++ b/arch/arm/configs/mps2_defconfig
@@ -100,7 +100,6 @@ CONFIG_ROOT_NFS=y
CONFIG_NLS=y
CONFIG_PRINTK_TIME=y
CONFIG_DEBUG_INFO=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_DEBUG_FS=y
# CONFIG_SCHED_DEBUG is not set
diff --git a/arch/arm/configs/nuc910_defconfig b/arch/arm/configs/nuc910_defconfig
index a72653645f9d..c0d152c02fba 100644
--- a/arch/arm/configs/nuc910_defconfig
+++ b/arch/arm/configs/nuc910_defconfig
@@ -47,7 +47,6 @@ CONFIG_ROMFS_FS=y
CONFIG_PARTITION_ADVANCED=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_DEBUG_FS=y
# CONFIG_CRC32 is not set
diff --git a/arch/arm/configs/nuc950_defconfig b/arch/arm/configs/nuc950_defconfig
index 614a0a28d0b4..8dde1186c2ef 100644
--- a/arch/arm/configs/nuc950_defconfig
+++ b/arch/arm/configs/nuc950_defconfig
@@ -64,6 +64,5 @@ CONFIG_ROMFS_FS=y
CONFIG_PARTITION_ADVANCED=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_DEBUG_FS=y
diff --git a/arch/arm/configs/nuc960_defconfig b/arch/arm/configs/nuc960_defconfig
index b84bbd216153..6bb784f8eb5b 100644
--- a/arch/arm/configs/nuc960_defconfig
+++ b/arch/arm/configs/nuc960_defconfig
@@ -53,7 +53,6 @@ CONFIG_ROMFS_FS=y
CONFIG_PARTITION_ADVANCED=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_DEBUG_FS=y
# CONFIG_CRC32 is not set
diff --git a/arch/arm/configs/stm32_defconfig b/arch/arm/configs/stm32_defconfig
index ba805b757a8d..0258ba891376 100644
--- a/arch/arm/configs/stm32_defconfig
+++ b/arch/arm/configs/stm32_defconfig
@@ -80,7 +80,6 @@ CONFIG_EXT3_FS=y
CONFIG_NLS=y
CONFIG_PRINTK_TIME=y
CONFIG_DEBUG_INFO=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_MAGIC_SYSRQ=y
# CONFIG_SCHED_DEBUG is not set
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 43f46aa7ef33..8a90f298af96 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -565,7 +565,7 @@ static void *__alloc_from_pool(size_t size, struct page **ret_page)
static bool __in_atomic_pool(void *start, size_t size)
{
- return addr_in_gen_pool(atomic_pool, (unsigned long)start, size);
+ return gen_pool_has_addr(atomic_pool, (unsigned long)start, size);
}
static int __free_from_pool(void *start, size_t size)
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index f5cc1ccfea3d..57de0dde3ae0 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -719,16 +719,9 @@ EXPORT_SYMBOL(phys_mem_access_prot);
#define vectors_base() (vectors_high() ? 0xffff0000 : 0)
-static void __init *early_alloc_aligned(unsigned long sz, unsigned long align)
-{
- void *ptr = __va(memblock_phys_alloc(sz, align));
- memset(ptr, 0, sz);
- return ptr;
-}
-
static void __init *early_alloc(unsigned long sz)
{
- return early_alloc_aligned(sz, sz);
+ return memblock_alloc(sz, sz);
}
static void *__init late_alloc(unsigned long sz)
@@ -1000,7 +993,7 @@ void __init iotable_init(struct map_desc *io_desc, int nr)
if (!nr)
return;
- svm = early_alloc_aligned(sizeof(*svm) * nr, __alignof__(*svm));
+ svm = memblock_alloc(sizeof(*svm) * nr, __alignof__(*svm));
for (md = io_desc; nr; md++, nr--) {
create_mapping(md);
@@ -1022,7 +1015,7 @@ void __init vm_reserve_area_early(unsigned long addr, unsigned long size,
struct vm_struct *vm;
struct static_vm *svm;
- svm = early_alloc_aligned(sizeof(*svm), __alignof__(*svm));
+ svm = memblock_alloc(sizeof(*svm), __alignof__(*svm));
vm = &svm->vm;
vm->addr = (void *)addr;
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 9d60b39795e7..ac16a11f4f02 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -18,7 +18,7 @@ config ARM64
select ARCH_HAS_FAST_MULTIPLIER
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
- select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA
+ select ARCH_HAS_GIGANTIC_PAGE_RUNTIME_ALLOCATION if (MEMORY_ISOLATION && COMPACTION) || CMA
select ARCH_HAS_KCOV
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_PTE_SPECIAL
@@ -1482,6 +1482,10 @@ config SYSVIPC_COMPAT
def_bool y
depends on COMPAT && SYSVIPC
+config ARCH_ENABLE_HUGEPAGE_MIGRATION
+ def_bool y
+ depends on HUGETLB_PAGE && MIGRATION
+
menu "Power management options"
source "kernel/power/Kconfig"
diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index fb6609875455..939a0a484252 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -20,6 +20,11 @@
#include <asm/page.h>
+#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
+#define arch_hugetlb_migration_supported arch_hugetlb_migration_supported
+extern bool arch_hugetlb_migration_supported(struct hstate *h);
+#endif
+
#define __HAVE_ARCH_HUGE_PTEP_GET
static inline pte_t huge_ptep_get(pte_t *ptep)
{
@@ -65,8 +70,11 @@ extern void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
#include <asm-generic/hugetlb.h>
-#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
-static inline bool gigantic_page_supported(void) { return true; }
+#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE_RUNTIME_ALLOCATION
+static inline bool gigantic_page_runtime_allocation_supported(void)
+{
+ return true;
+}
#endif
#endif /* __ASM_HUGETLB_H */
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index 0c656850eeea..b01ef0180a03 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -80,11 +80,7 @@
*/
#ifdef CONFIG_KASAN
#define KASAN_SHADOW_SIZE (UL(1) << (VA_BITS - KASAN_SHADOW_SCALE_SHIFT))
-#ifdef CONFIG_KASAN_EXTRA
-#define KASAN_THREAD_SHIFT 2
-#else
#define KASAN_THREAD_SHIFT 1
-#endif /* CONFIG_KASAN_EXTRA */
#else
#define KASAN_SHADOW_SIZE (0)
#define KASAN_THREAD_SHIFT 0
diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
index aa9c94113700..66b5d697d943 100644
--- a/arch/arm64/kernel/machine_kexec.c
+++ b/arch/arm64/kernel/machine_kexec.c
@@ -321,7 +321,7 @@ void crash_post_resume(void)
* but does not hold any data of loaded kernel image.
*
* Note that all the pages in crash dump kernel memory have been initially
- * marked as Reserved in kexec_reserve_crashkres_pages().
+ * marked as Reserved as memory was allocated via memblock_reserve().
*
* In hibernation, the pages which are Reserved and yet "nosave" are excluded
 * from the hibernation image. crash_is_nosave() does this check for crash
@@ -361,7 +361,6 @@ void crash_free_reserved_phys_range(unsigned long begin, unsigned long end)
for (addr = begin; addr < end; addr += PAGE_SIZE) {
page = phys_to_page(addr);
- ClearPageReserved(page);
free_reserved_page(page);
}
}
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 28cbc22d7e30..6b4a47b3adf4 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -27,6 +27,26 @@
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>
+#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
+bool arch_hugetlb_migration_supported(struct hstate *h)
+{
+ size_t pagesize = huge_page_size(h);
+
+ switch (pagesize) {
+#ifdef CONFIG_ARM64_4K_PAGES
+ case PUD_SIZE:
+#endif
+ case PMD_SIZE:
+ case CONT_PMD_SIZE:
+ case CONT_PTE_SIZE:
+ return true;
+ }
+ pr_warn("%s: unrecognized huge page size 0x%lx\n",
+ __func__, pagesize);
+ return false;
+}
+#endif
+
int pmd_huge(pmd_t pmd)
{
return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT);
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 2302b4093a63..6bc135042f5e 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -118,35 +118,10 @@ static void __init reserve_crashkernel(void)
crashk_res.start = crash_base;
crashk_res.end = crash_base + crash_size - 1;
}
-
-static void __init kexec_reserve_crashkres_pages(void)
-{
-#ifdef CONFIG_HIBERNATION
- phys_addr_t addr;
- struct page *page;
-
- if (!crashk_res.end)
- return;
-
- /*
- * To reduce the size of hibernation image, all the pages are
- * marked as Reserved initially.
- */
- for (addr = crashk_res.start; addr < (crashk_res.end + 1);
- addr += PAGE_SIZE) {
- page = phys_to_page(addr);
- SetPageReserved(page);
- }
-#endif
-}
#else
static void __init reserve_crashkernel(void)
{
}
-
-static void __init kexec_reserve_crashkres_pages(void)
-{
-}
#endif /* CONFIG_KEXEC_CORE */
#ifdef CONFIG_CRASH_DUMP
@@ -568,8 +543,6 @@ void __init mem_init(void)
/* this will put all unused low memory onto the freelists */
memblock_free_all();
- kexec_reserve_crashkres_pages();
-
mem_init_print_info(NULL);
/*
diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c
index ae34e3a1cef1..7a0a555b366a 100644
--- a/arch/arm64/mm/numa.c
+++ b/arch/arm64/mm/numa.c
@@ -120,7 +120,7 @@ static void __init setup_node_to_cpumask_map(void)
}
/* cpumask_of_node() will now work */
- pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
+ pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
}
/*
diff --git a/arch/c6x/mm/dma-coherent.c b/arch/c6x/mm/dma-coherent.c
index 75b79571732c..0be289839ce0 100644
--- a/arch/c6x/mm/dma-coherent.c
+++ b/arch/c6x/mm/dma-coherent.c
@@ -121,8 +121,6 @@ void arch_dma_free(struct device *dev, size_t size, void *vaddr,
*/
void __init coherent_mem_init(phys_addr_t start, u32 size)
{
- phys_addr_t bitmap_phys;
-
if (!size)
return;
@@ -138,11 +136,8 @@ void __init coherent_mem_init(phys_addr_t start, u32 size)
if (dma_size & (PAGE_SIZE - 1))
++dma_pages;
- bitmap_phys = memblock_phys_alloc(BITS_TO_LONGS(dma_pages) * sizeof(long),
- sizeof(long));
-
- dma_bitmap = phys_to_virt(bitmap_phys);
- memset(dma_bitmap, 0, dma_pages * PAGE_SIZE);
+ dma_bitmap = memblock_alloc(BITS_TO_LONGS(dma_pages) * sizeof(long),
+ sizeof(long));
}
static void c6x_dma_sync(struct device *dev, phys_addr_t paddr, size_t size,
diff --git a/arch/h8300/configs/edosk2674_defconfig b/arch/h8300/configs/edosk2674_defconfig
index 29fda12d5da9..23791dcf6c25 100644
--- a/arch/h8300/configs/edosk2674_defconfig
+++ b/arch/h8300/configs/edosk2674_defconfig
@@ -45,5 +45,4 @@ CONFIG_SERIAL_SH_SCI_CONSOLE=y
# CONFIG_SYSFS is not set
# CONFIG_MISC_FILESYSTEMS is not set
CONFIG_DEBUG_INFO=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
diff --git a/arch/h8300/configs/h8300h-sim_defconfig b/arch/h8300/configs/h8300h-sim_defconfig
index 80624f46b0ed..7fc9c2f0acc0 100644
--- a/arch/h8300/configs/h8300h-sim_defconfig
+++ b/arch/h8300/configs/h8300h-sim_defconfig
@@ -45,5 +45,4 @@ CONFIG_SERIAL_SH_SCI_EARLYCON=y
# CONFIG_SYSFS is not set
# CONFIG_MISC_FILESYSTEMS is not set
CONFIG_DEBUG_INFO=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
diff --git a/arch/h8300/configs/h8s-sim_defconfig b/arch/h8300/configs/h8s-sim_defconfig
index 29fda12d5da9..23791dcf6c25 100644
--- a/arch/h8300/configs/h8s-sim_defconfig
+++ b/arch/h8300/configs/h8s-sim_defconfig
@@ -45,5 +45,4 @@ CONFIG_SERIAL_SH_SCI_CONSOLE=y
# CONFIG_SYSFS is not set
# CONFIG_MISC_FILESYSTEMS is not set
CONFIG_DEBUG_INFO=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
diff --git a/arch/ia64/kernel/numa.c b/arch/ia64/kernel/numa.c
index 92c376279c6d..1315da6c7aeb 100644
--- a/arch/ia64/kernel/numa.c
+++ b/arch/ia64/kernel/numa.c
@@ -74,7 +74,7 @@ void __init build_cpu_to_node_map(void)
cpumask_clear(&node_to_cpu_mask[node]);
for_each_possible_early_cpu(cpu) {
- node = -1;
+ node = NUMA_NO_NODE;
for (i = 0; i < NR_CPUS; ++i)
if (cpu_physical_id(cpu) == node_cpuid[i].phys_id) {
node = node_cpuid[i].nid;
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 46bff1661836..7a969f4c3534 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -583,17 +583,6 @@ pfm_put_task(struct task_struct *task)
if (task != current) put_task_struct(task);
}
-static inline void
-pfm_reserve_page(unsigned long a)
-{
- SetPageReserved(vmalloc_to_page((void *)a));
-}
-static inline void
-pfm_unreserve_page(unsigned long a)
-{
- ClearPageReserved(vmalloc_to_page((void*)a));
-}
-
static inline unsigned long
pfm_protect_ctx_ctxsw(pfm_context_t *x)
{
@@ -816,44 +805,6 @@ pfm_reset_msgq(pfm_context_t *ctx)
DPRINT(("ctx=%p msgq reset\n", ctx));
}
-static void *
-pfm_rvmalloc(unsigned long size)
-{
- void *mem;
- unsigned long addr;
-
- size = PAGE_ALIGN(size);
- mem = vzalloc(size);
- if (mem) {
- //printk("perfmon: CPU%d pfm_rvmalloc(%ld)=%p\n", smp_processor_id(), size, mem);
- addr = (unsigned long)mem;
- while (size > 0) {
- pfm_reserve_page(addr);
- addr+=PAGE_SIZE;
- size-=PAGE_SIZE;
- }
- }
- return mem;
-}
-
-static void
-pfm_rvfree(void *mem, unsigned long size)
-{
- unsigned long addr;
-
- if (mem) {
- DPRINT(("freeing physical buffer @%p size=%lu\n", mem, size));
- addr = (unsigned long) mem;
- while ((long) size > 0) {
- pfm_unreserve_page(addr);
- addr+=PAGE_SIZE;
- size-=PAGE_SIZE;
- }
- vfree(mem);
- }
- return;
-}
-
static pfm_context_t *
pfm_context_alloc(int ctx_flags)
{
@@ -1498,7 +1449,7 @@ pfm_free_smpl_buffer(pfm_context_t *ctx)
/*
* free the buffer
*/
- pfm_rvfree(ctx->ctx_smpl_hdr, ctx->ctx_smpl_size);
+ vfree(ctx->ctx_smpl_hdr);
ctx->ctx_smpl_hdr = NULL;
ctx->ctx_smpl_size = 0UL;
@@ -2137,7 +2088,7 @@ doit:
* All memory free operations (especially for vmalloc'ed memory)
* MUST be done with interrupts ENABLED.
*/
- if (smpl_buf_addr) pfm_rvfree(smpl_buf_addr, smpl_buf_size);
+ vfree(smpl_buf_addr);
/*
* return the memory used by the context
@@ -2266,10 +2217,8 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t
/*
* We do the easy to undo allocations first.
- *
- * pfm_rvmalloc(), clears the buffer, so there is no leak
*/
- smpl_buf = pfm_rvmalloc(size);
+ smpl_buf = vzalloc(size);
if (smpl_buf == NULL) {
DPRINT(("Can't allocate sampling buffer\n"));
return -ENOMEM;
@@ -2346,7 +2295,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t
error:
vm_area_free(vma);
error_kmem:
- pfm_rvfree(smpl_buf, size);
+ vfree(smpl_buf);
return -ENOMEM;
}
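
With the page-reserving helpers gone, the perfmon sampling buffer is plain vmalloc memory: vzalloc() already returns zeroed, page-aligned memory, so the per-page SetPageReserved()/ClearPageReserved() walks are unnecessary. A minimal sketch of the resulting buffer life cycle (error handling abbreviated):

    void *smpl_buf = vzalloc(size);   /* zeroed vmalloc allocation */
    if (!smpl_buf)
        return -ENOMEM;

    /* ... map and fill the buffer ... */

    vfree(smpl_buf);                  /* one call, no per-page unreserve loop */
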
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index 8a965784340c..f9c36750c6a4 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -227,7 +227,7 @@ void __init setup_per_cpu_areas(void)
* CPUs are put into groups according to node. Walk cpu_map
* and create new groups at node boundaries.
*/
- prev_node = -1;
+ prev_node = NUMA_NO_NODE;
ai->nr_groups = 0;
for (unit = 0; unit < nr_units; unit++) {
cpu = cpu_map[unit];
@@ -435,7 +435,7 @@ static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize)
{
void *ptr = NULL;
u8 best = 0xff;
- int bestnode = -1, node, anynode = 0;
+ int bestnode = NUMA_NO_NODE, node, anynode = 0;
for_each_online_node(node) {
if (node_isset(node, memory_less_mask))
@@ -447,7 +447,7 @@ static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize)
anynode = node;
}
- if (bestnode == -1)
+ if (bestnode == NUMA_NO_NODE)
bestnode = anynode;
ptr = memblock_alloc_try_nid(pernodesize, PERCPU_PAGE_SIZE,
diff --git a/arch/m68k/configs/amcore_defconfig b/arch/m68k/configs/amcore_defconfig
index 1ba10d57ddb1..0857cdbfde0c 100644
--- a/arch/m68k/configs/amcore_defconfig
+++ b/arch/m68k/configs/amcore_defconfig
@@ -88,7 +88,6 @@ CONFIG_ROMFS_FS=y
CONFIG_ROMFS_BACKED_BY_BOTH=y
# CONFIG_NETWORK_FILESYSTEMS is not set
CONFIG_PRINTK_TIME=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
CONFIG_PANIC_ON_OOPS=y
diff --git a/arch/m68k/configs/stmark2_defconfig b/arch/m68k/configs/stmark2_defconfig
index 3d07b1de7eb0..69f23c7b0497 100644
--- a/arch/m68k/configs/stmark2_defconfig
+++ b/arch/m68k/configs/stmark2_defconfig
@@ -76,7 +76,6 @@ CONFIG_GPIO_GENERIC_PLATFORM=y
CONFIG_FSCACHE=y
# CONFIG_PROC_SYSCTL is not set
CONFIG_PRINTK_TIME=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
CONFIG_SLUB_DEBUG_ON=y
CONFIG_PANIC_ON_OOPS=y
diff --git a/arch/m68k/mm/memory.c b/arch/m68k/mm/memory.c
index b86a2e21693b..227c04fe60d2 100644
--- a/arch/m68k/mm/memory.c
+++ b/arch/m68k/mm/memory.c
@@ -51,7 +51,7 @@ void __init init_pointer_table(unsigned long ptable)
pr_debug("init_pointer_table: %lx, %x\n", ptable, PD_MARKBITS(dp));
/* unreserve the page so it's possible to free that page */
- PD_PAGE(dp)->flags &= ~(1 << PG_reserved);
+ __ClearPageReserved(PD_PAGE(dp));
init_page_count(PD_PAGE(dp));
return;
diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c
index b17fd8aafd64..44f4b8910c21 100644
--- a/arch/microblaze/mm/init.c
+++ b/arch/microblaze/mm/init.c
@@ -363,8 +363,9 @@ void __init *early_get_page(void)
* Mem start + kernel_tlb -> here is limit
* because of mem mapping from head.S
*/
- return __va(memblock_alloc_base(PAGE_SIZE, PAGE_SIZE,
- memory_start + kernel_tlb));
+ return memblock_alloc_try_nid_raw(PAGE_SIZE, PAGE_SIZE,
+ MEMBLOCK_LOW_LIMIT, memory_start + kernel_tlb,
+ NUMA_NO_NODE);
}
#endif /* CONFIG_MMU */
diff --git a/arch/nds32/mm/init.c b/arch/nds32/mm/init.c
index 253f79fc7196..d1e521cce317 100644
--- a/arch/nds32/mm/init.c
+++ b/arch/nds32/mm/init.c
@@ -78,8 +78,7 @@ static void __init map_ram(void)
}
/* Alloc one page for holding PTE's... */
- pte = (pte_t *) __va(memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE));
- memset(pte, 0, PAGE_SIZE);
+ pte = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
set_pmd(pme, __pmd(__pa(pte) + _PAGE_KERNEL_TABLE));
/* Fill the newly allocated page with PTE'S */
@@ -111,8 +110,7 @@ static void __init fixedrange_init(void)
pgd = swapper_pg_dir + pgd_index(vaddr);
pud = pud_offset(pgd, vaddr);
pmd = pmd_offset(pud, vaddr);
- fixmap_pmd_p = (pmd_t *) __va(memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE));
- memset(fixmap_pmd_p, 0, PAGE_SIZE);
+ fixmap_pmd_p = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
set_pmd(pmd, __pmd(__pa(fixmap_pmd_p) + _PAGE_KERNEL_TABLE));
#ifdef CONFIG_HIGHMEM
@@ -124,8 +122,7 @@ static void __init fixedrange_init(void)
pgd = swapper_pg_dir + pgd_index(vaddr);
pud = pud_offset(pgd, vaddr);
pmd = pmd_offset(pud, vaddr);
- pte = (pte_t *) __va(memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE));
- memset(pte, 0, PAGE_SIZE);
+ pte = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
set_pmd(pmd, __pmd(__pa(pte) + _PAGE_KERNEL_TABLE));
pkmap_page_table = pte;
#endif /* CONFIG_HIGHMEM */
@@ -150,8 +147,7 @@ void __init paging_init(void)
fixedrange_init();
/* allocate space for empty_zero_page */
- zero_page = __va(memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE));
- memset(zero_page, 0, PAGE_SIZE);
+ zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
zone_sizes_init();
empty_zero_page = virt_to_page(zero_page);
diff --git a/arch/nios2/configs/10m50_defconfig b/arch/nios2/configs/10m50_defconfig
index c601c8ff1ae6..7977ab7e2ca6 100644
--- a/arch/nios2/configs/10m50_defconfig
+++ b/arch/nios2/configs/10m50_defconfig
@@ -77,4 +77,3 @@ CONFIG_NFS_V3_ACL=y
CONFIG_ROOT_NFS=y
CONFIG_SUNRPC_DEBUG=y
CONFIG_DEBUG_INFO=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
diff --git a/arch/nios2/configs/3c120_defconfig b/arch/nios2/configs/3c120_defconfig
index fce33588d55c..ceb97cd85ac1 100644
--- a/arch/nios2/configs/3c120_defconfig
+++ b/arch/nios2/configs/3c120_defconfig
@@ -74,4 +74,3 @@ CONFIG_NFS_V3_ACL=y
CONFIG_ROOT_NFS=y
CONFIG_SUNRPC_DEBUG=y
CONFIG_DEBUG_INFO=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
diff --git a/arch/openrisc/configs/or1ksim_defconfig b/arch/openrisc/configs/or1ksim_defconfig
index a73aa90501be..d8ff4f8ffb88 100644
--- a/arch/openrisc/configs/or1ksim_defconfig
+++ b/arch/openrisc/configs/or1ksim_defconfig
@@ -54,5 +54,4 @@ CONFIG_SERIAL_OF_PLATFORM=y
# CONFIG_DNOTIFY is not set
CONFIG_TMPFS=y
CONFIG_NFS_FS=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
diff --git a/arch/openrisc/configs/simple_smp_defconfig b/arch/openrisc/configs/simple_smp_defconfig
index b6e3c7e158e7..64278992df9c 100644
--- a/arch/openrisc/configs/simple_smp_defconfig
+++ b/arch/openrisc/configs/simple_smp_defconfig
@@ -61,6 +61,5 @@ CONFIG_SERIAL_OF_PLATFORM=y
CONFIG_TMPFS=y
CONFIG_NFS_FS=y
CONFIG_XZ_DEC=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
# CONFIG_RCU_TRACE is not set
diff --git a/arch/openrisc/mm/ioremap.c b/arch/openrisc/mm/ioremap.c
index 270d1c9bc0d6..051bcb4fefd3 100644
--- a/arch/openrisc/mm/ioremap.c
+++ b/arch/openrisc/mm/ioremap.c
@@ -122,13 +122,10 @@ pte_t __ref *pte_alloc_one_kernel(struct mm_struct *mm)
{
pte_t *pte;
- if (likely(mem_init_done)) {
- pte = (pte_t *) __get_free_page(GFP_KERNEL);
- } else {
- pte = (pte_t *) __va(memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE));
- }
+ if (likely(mem_init_done))
+ pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
+ else
+ pte = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
- if (pte)
- clear_page(pte);
return pte;
}
diff --git a/arch/powerpc/configs/mpc512x_defconfig b/arch/powerpc/configs/mpc512x_defconfig
index c2b1c4404683..e4bfb1101c0e 100644
--- a/arch/powerpc/configs/mpc512x_defconfig
+++ b/arch/powerpc/configs/mpc512x_defconfig
@@ -120,6 +120,5 @@ CONFIG_NFS_FS=y
CONFIG_ROOT_NFS=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
# CONFIG_CRYPTO_HW is not set
diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig
index 53687c3a70c4..7c6baf6df139 100644
--- a/arch/powerpc/configs/ppc6xx_defconfig
+++ b/arch/powerpc/configs/ppc6xx_defconfig
@@ -1123,7 +1123,6 @@ CONFIG_NLS_ISO8859_15=m
CONFIG_NLS_KOI8_R=m
CONFIG_NLS_KOI8_U=m
CONFIG_DEBUG_INFO=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
CONFIG_UNUSED_SYMBOLS=y
CONFIG_HEADERS_CHECK=y
CONFIG_MAGIC_SYSRQ=y
diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h
index 5b0177733994..9d85dc2c5b86 100644
--- a/arch/powerpc/include/asm/book3s/64/hugetlb.h
+++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h
@@ -13,6 +13,10 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
unsigned long len, unsigned long pgoff,
unsigned long flags);
+extern void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t old_pte, pte_t pte);
+
static inline int hstate_get_psize(struct hstate *hstate)
{
unsigned long shift;
@@ -32,8 +36,8 @@ static inline int hstate_get_psize(struct hstate *hstate)
}
}
-#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
-static inline bool gigantic_page_supported(void)
+#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE_RUNTIME_ALLOCATION
+static inline bool gigantic_page_runtime_allocation_supported(void)
{
return true;
}
@@ -42,4 +46,12 @@ static inline bool gigantic_page_supported(void)
/* hugepd entry valid bit */
#define HUGEPD_VAL_BITS (0x8000000000000000UL)
+#define huge_ptep_modify_prot_start huge_ptep_modify_prot_start
+extern pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep);
+
+#define huge_ptep_modify_prot_commit huge_ptep_modify_prot_commit
+extern void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t old_pte, pte_t new_pte);
#endif
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 404e0f48f3f3..5043f846def1 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1306,6 +1306,24 @@ static inline int pud_pfn(pud_t pud)
BUILD_BUG();
return 0;
}
+#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
+pte_t ptep_modify_prot_start(struct vm_area_struct *, unsigned long, pte_t *);
+void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long,
+ pte_t *, pte_t, pte_t);
+
+/*
+ * Returns true for an R -> RW upgrade of the pte
+ */
+static inline bool is_pte_rw_upgrade(unsigned long old_val, unsigned long new_val)
+{
+ if (!(old_val & _PAGE_READ))
+ return false;
+
+ if ((!(old_val & _PAGE_WRITE)) && (new_val & _PAGE_WRITE))
+ return true;
+
+ return false;
+}
#endif /* __ASSEMBLY__ */
#endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */
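
__HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION lets the architecture supply the start/commit pair used for protection updates; passing the old pte to commit is what allows radix to spot an R -> RW upgrade. Roughly, generic code (e.g. change_pte_range() in mm/mprotect.c) drives it as in this sketch:

    pte_t old_pte, new_pte;

    old_pte = ptep_modify_prot_start(vma, addr, ptep);  /* pte invalidated against HW updates */
    new_pte = pte_modify(old_pte, newprot);             /* compute the new protection bits */
    ptep_modify_prot_commit(vma, addr, ptep, old_pte, new_pte);
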
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index 7d1a3d1543fc..5ab134eeed20 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -127,6 +127,10 @@ extern void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep
pte_t entry, unsigned long address,
int psize);
+extern void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t old_pte, pte_t pte);
+
static inline unsigned long __radix_pte_update(pte_t *ptep, unsigned long clr,
unsigned long set)
{
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index aee4fcc24990..77fc21278fa2 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -10,6 +10,7 @@
#include <linux/pci.h>
#include <linux/list.h>
#include <linux/ioport.h>
+#include <linux/numa.h>
struct device_node;
@@ -265,7 +266,7 @@ extern int pcibios_map_io_space(struct pci_bus *bus);
#ifdef CONFIG_NUMA
#define PHB_SET_NODE(PHB, NODE) ((PHB)->node = (NODE))
#else
-#define PHB_SET_NODE(PHB, NODE) ((PHB)->node = -1)
+#define PHB_SET_NODE(PHB, NODE) ((PHB)->node = NUMA_NO_NODE)
#endif
#endif /* CONFIG_PPC64 */
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index 913bfca09c4f..8c890c6557ed 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -11,6 +11,7 @@
#include <linux/export.h>
#include <linux/memblock.h>
#include <linux/sched/task.h>
+#include <linux/numa.h>
#include <asm/lppaca.h>
#include <asm/paca.h>
@@ -27,7 +28,7 @@
static void *__init alloc_paca_data(unsigned long size, unsigned long align,
unsigned long limit, int cpu)
{
- unsigned long pa;
+ void *ptr;
int nid;
/*
@@ -36,23 +37,21 @@ static void *__init alloc_paca_data(unsigned long size, unsigned long align,
* which will put its paca in the right place.
*/
if (cpu == boot_cpuid) {
- nid = -1;
+ nid = NUMA_NO_NODE;
memblock_set_bottom_up(true);
} else {
nid = early_cpu_to_node(cpu);
}
- pa = memblock_alloc_base_nid(size, align, limit, nid, MEMBLOCK_NONE);
- if (!pa) {
- pa = memblock_alloc_base(size, align, limit);
- if (!pa)
- panic("cannot allocate paca data");
- }
+ ptr = memblock_alloc_try_nid(size, align, MEMBLOCK_LOW_LIMIT,
+ limit, nid);
+ if (!ptr)
+ panic("cannot allocate paca data");
if (cpu == boot_cpuid)
memblock_set_bottom_up(false);
- return __va(pa);
+ return ptr;
}
#ifdef CONFIG_PPC_PSERIES
@@ -118,7 +117,6 @@ static struct slb_shadow * __init new_slb_shadow(int cpu, unsigned long limit)
}
s = alloc_paca_data(sizeof(*s), L1_CACHE_BYTES, limit, cpu);
- memset(s, 0, sizeof(*s));
s->persistent = cpu_to_be32(SLB_NUM_BOLTED);
s->buffer_length = cpu_to_be32(sizeof(*s));
@@ -222,7 +220,6 @@ void __init allocate_paca(int cpu)
paca = alloc_paca_data(sizeof(struct paca_struct), L1_CACHE_BYTES,
limit, cpu);
paca_ptrs[cpu] = paca;
- memset(paca, 0, sizeof(struct paca_struct));
initialise_paca(paca, cpu);
#ifdef CONFIG_PPC_PSERIES
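
memblock_alloc_try_nid() returns a zeroed virtual address and falls back from the requested node to any node on its own, which is why both the manual retry and the later memset() of the paca and slb_shadow disappear. Roughly, the shorthand used elsewhere in the series expands to the same call:

    /* sketch: memblock_alloc() is the node-agnostic, "anywhere accessible" form */
    ptr = memblock_alloc(size, align);
    /* ~= memblock_alloc_try_nid(size, align, MEMBLOCK_LOW_LIMIT,
     *                           MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE) */
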
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 88e4f69a09e5..4538e8ddde80 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -32,6 +32,7 @@
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/vgaarb.h>
+#include <linux/numa.h>
#include <asm/processor.h>
#include <asm/io.h>
@@ -132,7 +133,7 @@ struct pci_controller *pcibios_alloc_controller(struct device_node *dev)
int nid = of_node_to_nid(dev);
if (nid < 0 || !node_online(nid))
- nid = -1;
+ nid = NUMA_NO_NODE;
PHB_SET_NODE(phb, nid);
}
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index ce393df243aa..6a4b59d574c2 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1298,16 +1298,6 @@ void show_user_instructions(struct pt_regs *regs)
pc = regs->nip - (NR_INSN_TO_PRINT * 3 / 4 * sizeof(int));
- /*
- * Make sure the NIP points at userspace, not kernel text/data or
- * elsewhere.
- */
- if (!__access_ok(pc, NR_INSN_TO_PRINT * sizeof(int), USER_DS)) {
- pr_info("%s[%d]: Bad NIP, not dumping instructions.\n",
- current->comm, current->pid);
- return;
- }
-
seq_buf_init(&s, buf, sizeof(buf));
while (n) {
@@ -1318,7 +1308,7 @@ void show_user_instructions(struct pt_regs *regs)
for (i = 0; i < 8 && n; i++, n--, pc += sizeof(int)) {
int instr;
- if (probe_kernel_address((const void *)pc, instr)) {
+ if (probe_user_read(&instr, (void __user *)pc, sizeof(instr))) {
seq_buf_printf(&s, "XXXXXXXX ");
continue;
}
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index ca00fbb97cf8..82be48c123cf 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -459,8 +459,8 @@ void __init smp_setup_cpu_maps(void)
DBG("smp_setup_cpu_maps()\n");
- cpu_to_phys_id = __va(memblock_phys_alloc(nr_cpu_ids * sizeof(u32), __alignof__(u32)));
- memset(cpu_to_phys_id, 0, nr_cpu_ids * sizeof(u32));
+ cpu_to_phys_id = memblock_alloc(nr_cpu_ids * sizeof(u32),
+ __alignof__(u32));
for_each_node_by_type(dn, "cpu") {
const __be32 *intserv;
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 236c1151a3a7..3dcd779aef44 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -634,19 +634,17 @@ __init u64 ppc64_bolted_size(void)
static void *__init alloc_stack(unsigned long limit, int cpu)
{
- unsigned long pa;
+ void *ptr;
BUILD_BUG_ON(STACK_INT_FRAME_SIZE % 16);
- pa = memblock_alloc_base_nid(THREAD_SIZE, THREAD_SIZE, limit,
- early_cpu_to_node(cpu), MEMBLOCK_NONE);
- if (!pa) {
- pa = memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit);
- if (!pa)
- panic("cannot allocate stacks");
- }
+ ptr = memblock_alloc_try_nid(THREAD_SIZE, THREAD_SIZE,
+ MEMBLOCK_LOW_LIMIT, limit,
+ early_cpu_to_node(cpu));
+ if (!ptr)
+ panic("cannot allocate stacks");
- return __va(pa);
+ return ptr;
}
void __init irqstack_early_init(void)
@@ -739,20 +737,17 @@ void __init emergency_stack_init(void)
struct thread_info *ti;
ti = alloc_stack(limit, i);
- memset(ti, 0, THREAD_SIZE);
emerg_stack_init_thread_info(ti, i);
paca_ptrs[i]->emergency_sp = (void *)ti + THREAD_SIZE;
#ifdef CONFIG_PPC_BOOK3S_64
/* emergency stack for NMI exception handling. */
ti = alloc_stack(limit, i);
- memset(ti, 0, THREAD_SIZE);
emerg_stack_init_thread_info(ti, i);
paca_ptrs[i]->nmi_emergency_sp = (void *)ti + THREAD_SIZE;
/* emergency stack for machine check exception handling. */
ti = alloc_stack(limit, i);
- memset(ti, 0, THREAD_SIZE);
emerg_stack_init_thread_info(ti, i);
paca_ptrs[i]->mc_emergency_sp = (void *)ti + THREAD_SIZE;
#endif
@@ -933,8 +928,9 @@ static void __ref init_fallback_flush(void)
* hardware prefetch runoff. We don't have a recipe for load patterns to
* reliably avoid the prefetcher.
*/
- l1d_flush_fallback_area = __va(memblock_alloc_base(l1d_size * 2, l1d_size, limit));
- memset(l1d_flush_fallback_area, 0, l1d_size * 2);
+ l1d_flush_fallback_area = memblock_alloc_try_nid(l1d_size * 2,
+ l1d_size, MEMBLOCK_LOW_LIMIT,
+ limit, NUMA_NO_NODE);
for_each_possible_cpu(cpu) {
struct paca_struct *paca = paca_ptrs[cpu];
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index 7725a9714736..a31b6234fcd7 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -798,7 +798,6 @@ static int __init vdso_init(void)
BUG_ON(vdso32_pagelist == NULL);
for (i = 0; i < vdso32_pages; i++) {
struct page *pg = virt_to_page(vdso32_kbase + i*PAGE_SIZE);
- ClearPageReserved(pg);
get_page(pg);
vdso32_pagelist[i] = pg;
}
@@ -812,7 +811,6 @@ static int __init vdso_init(void)
BUG_ON(vdso64_pagelist == NULL);
for (i = 0; i < vdso64_pages; i++) {
struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE);
- ClearPageReserved(pg);
get_page(pg);
vdso64_pagelist[i] = pg;
}
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 887f11bcf330..ec74305fa330 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -276,12 +276,8 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) &&
access_ok(nip, sizeof(*nip))) {
unsigned int inst;
- int res;
- pagefault_disable();
- res = __get_user_inatomic(inst, nip);
- pagefault_enable();
- if (!res)
+ if (!probe_user_read(&inst, nip, sizeof(inst)))
return !store_updates_sp(inst);
*must_retry = true;
}
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 4aa0797000f7..3d4b2399192f 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -908,9 +908,9 @@ static void __init htab_initialize(void)
#ifdef CONFIG_DEBUG_PAGEALLOC
if (debug_pagealloc_enabled()) {
linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT;
- linear_map_hash_slots = __va(memblock_alloc_base(
- linear_map_hash_count, 1, ppc64_rma_size));
- memset(linear_map_hash_slots, 0, linear_map_hash_count);
+ linear_map_hash_slots = memblock_alloc_try_nid(
+ linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT,
+ ppc64_rma_size, NUMA_NO_NODE);
}
#endif /* CONFIG_DEBUG_PAGEALLOC */
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
index 2e6a8f9345d3..367ce3a4a503 100644
--- a/arch/powerpc/mm/hugetlbpage-hash64.c
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -121,3 +121,28 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
*ptep = __pte(new_pte & ~H_PAGE_BUSY);
return 0;
}
+
+pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ unsigned long pte_val;
+ /*
+ * Clear the _PAGE_PRESENT so that no hardware parallel update is
+ * possible. Also keep pte_present() true so that we don't take a
+ * spurious fault.
+ */
+ pte_val = pte_update(vma->vm_mm, addr, ptep,
+ _PAGE_PRESENT, _PAGE_INVALID, 1);
+
+ return __pte(pte_val);
+}
+
+void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, pte_t old_pte, pte_t pte)
+{
+
+ if (radix_enabled())
+ return radix__huge_ptep_modify_prot_commit(vma, addr, ptep,
+ old_pte, pte);
+ set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
+}
diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c
index 2486bee0f93e..11d9ea28a816 100644
--- a/arch/powerpc/mm/hugetlbpage-radix.c
+++ b/arch/powerpc/mm/hugetlbpage-radix.c
@@ -90,3 +90,20 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
return vm_unmapped_area(&info);
}
+
+void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t old_pte, pte_t pte)
+{
+ struct mm_struct *mm = vma->vm_mm;
+
+ /*
+ * To avoid an NMMU hang while relaxing access, we need to flush the TLB before
+ * we set the new value.
+ */
+ if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
+ (atomic_read(&mm->context.copros) > 0))
+ radix__flush_hugetlb_page(vma, addr);
+
+ set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
+}
diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
index a712a650a8b6..74d43f522dc2 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -21,6 +21,7 @@
#include <linux/sizes.h>
#include <asm/mmu_context.h>
#include <asm/pte-walk.h>
+#include <linux/mm_inline.h>
static DEFINE_MUTEX(mem_list_mutex);
@@ -34,8 +35,18 @@ struct mm_iommu_table_group_mem_t {
atomic64_t mapped;
unsigned int pageshift;
u64 ua; /* userspace address */
- u64 entries; /* number of entries in hpas[] */
- u64 *hpas; /* vmalloc'ed */
+ u64 entries; /* number of entries in hpas/hpages[] */
+ /*
+ * In mm_iommu_get() we temporarily use this to store
+ * the struct page addresses.
+ *
+ * We need to convert ua to hpa in real mode. Make it
+ * simpler by storing the physical address.
+ */
+ union {
+ struct page **hpages; /* vmalloc'ed */
+ phys_addr_t *hpas;
+ };
#define MM_IOMMU_TABLE_INVALID_HPA ((uint64_t)-1)
u64 dev_hpa; /* Device memory base address */
};
@@ -80,64 +91,13 @@ bool mm_iommu_preregistered(struct mm_struct *mm)
}
EXPORT_SYMBOL_GPL(mm_iommu_preregistered);
-/*
- * Taken from alloc_migrate_target with changes to remove CMA allocations
- */
-struct page *new_iommu_non_cma_page(struct page *page, unsigned long private)
-{
- gfp_t gfp_mask = GFP_USER;
- struct page *new_page;
-
- if (PageCompound(page))
- return NULL;
-
- if (PageHighMem(page))
- gfp_mask |= __GFP_HIGHMEM;
-
- /*
- * We don't want the allocation to force an OOM if possibe
- */
- new_page = alloc_page(gfp_mask | __GFP_NORETRY | __GFP_NOWARN);
- return new_page;
-}
-
-static int mm_iommu_move_page_from_cma(struct page *page)
-{
- int ret = 0;
- LIST_HEAD(cma_migrate_pages);
-
- /* Ignore huge pages for now */
- if (PageCompound(page))
- return -EBUSY;
-
- lru_add_drain();
- ret = isolate_lru_page(page);
- if (ret)
- return ret;
-
- list_add(&page->lru, &cma_migrate_pages);
- put_page(page); /* Drop the gup reference */
-
- ret = migrate_pages(&cma_migrate_pages, new_iommu_non_cma_page,
- NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE);
- if (ret) {
- if (!list_empty(&cma_migrate_pages))
- putback_movable_pages(&cma_migrate_pages);
- }
-
- return 0;
-}
-
static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
- unsigned long entries, unsigned long dev_hpa,
- struct mm_iommu_table_group_mem_t **pmem)
+ unsigned long entries, unsigned long dev_hpa,
+ struct mm_iommu_table_group_mem_t **pmem)
{
struct mm_iommu_table_group_mem_t *mem;
- long i, j, ret = 0, locked_entries = 0;
+ long i, ret = 0, locked_entries = 0;
unsigned int pageshift;
- unsigned long flags;
- unsigned long cur_ua;
- struct page *page = NULL;
mutex_lock(&mem_list_mutex);
@@ -187,58 +147,40 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
goto unlock_exit;
}
+ down_read(&mm->mmap_sem);
+ ret = get_user_pages_longterm(ua, entries, FOLL_WRITE, mem->hpages, NULL);
+ up_read(&mm->mmap_sem);
+ if (ret != entries) {
+ /* free the reference taken */
+ for (i = 0; i < ret; i++)
+ put_page(mem->hpages[i]);
+
+ vfree(mem->hpas);
+ kfree(mem);
+ ret = -EFAULT;
+ goto unlock_exit;
+ } else {
+ ret = 0;
+ }
+
+ pageshift = PAGE_SHIFT;
for (i = 0; i < entries; ++i) {
- cur_ua = ua + (i << PAGE_SHIFT);
- if (1 != get_user_pages_fast(cur_ua,
- 1/* pages */, 1/* iswrite */, &page)) {
- ret = -EFAULT;
- for (j = 0; j < i; ++j)
- put_page(pfn_to_page(mem->hpas[j] >>
- PAGE_SHIFT));
- vfree(mem->hpas);
- kfree(mem);
- goto unlock_exit;
- }
+ struct page *page = mem->hpages[i];
+
/*
- * If we get a page from the CMA zone, since we are going to
- * be pinning these entries, we might as well move them out
- * of the CMA zone if possible. NOTE: faulting in + migration
- * can be expensive. Batching can be considered later
+ * Allow using IOMMU pages larger than 64k. Only do that
+ * if we are backed by hugetlb.
*/
- if (is_migrate_cma_page(page)) {
- if (mm_iommu_move_page_from_cma(page))
- goto populate;
- if (1 != get_user_pages_fast(cur_ua,
- 1/* pages */, 1/* iswrite */,
- &page)) {
- ret = -EFAULT;
- for (j = 0; j < i; ++j)
- put_page(pfn_to_page(mem->hpas[j] >>
- PAGE_SHIFT));
- vfree(mem->hpas);
- kfree(mem);
- goto unlock_exit;
- }
- }
-populate:
- pageshift = PAGE_SHIFT;
- if (mem->pageshift > PAGE_SHIFT && PageCompound(page)) {
- pte_t *pte;
+ if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) {
struct page *head = compound_head(page);
- unsigned int compshift = compound_order(head);
- unsigned int pteshift;
-
- local_irq_save(flags); /* disables as well */
- pte = find_linux_pte(mm->pgd, cur_ua, NULL, &pteshift);
-
- /* Double check it is still the same pinned page */
- if (pte && pte_page(*pte) == head &&
- pteshift == compshift + PAGE_SHIFT)
- pageshift = max_t(unsigned int, pteshift,
- PAGE_SHIFT);
- local_irq_restore(flags);
+
+ pageshift = compound_order(head) + PAGE_SHIFT;
}
mem->pageshift = min(mem->pageshift, pageshift);
+ /*
+ * We don't need the struct page reference any more; switch
+ * to the physical address.
+ */
mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
}
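
The per-page get_user_pages_fast() loop, together with its CMA migration fallback, is replaced by a single long-term pin of the whole range; a short pin is unwound by dropping the references already taken. A minimal sketch of that pattern, using the names from the hunk above:

    down_read(&mm->mmap_sem);
    pinned = get_user_pages_longterm(ua, entries, FOLL_WRITE, mem->hpages, NULL);
    up_read(&mm->mmap_sem);

    if (pinned != entries) {
        for (i = 0; i < pinned; i++)      /* undo a partial pin */
            put_page(mem->hpages[i]);
        /* ... free mem and fail with -EFAULT ... */
    }
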
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index b5d1c45c1475..ac49e4158e50 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -84,7 +84,7 @@ static void __init setup_node_to_cpumask_map(void)
alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
/* cpumask_of_node() will now work */
- dbg("Node to cpumask map for %d nodes\n", nr_node_ids);
+ dbg("Node to cpumask map for %u nodes\n", nr_node_ids);
}
static int __init fake_numa_create_new_node(unsigned long end_pfn,
@@ -215,7 +215,7 @@ static void initialize_distance_lookup_table(int nid,
*/
static int associativity_to_nid(const __be32 *associativity)
{
- int nid = -1;
+ int nid = NUMA_NO_NODE;
if (min_common_depth == -1)
goto out;
@@ -225,7 +225,7 @@ static int associativity_to_nid(const __be32 *associativity)
/* POWER4 LPAR uses 0xffff as invalid node */
if (nid == 0xffff || nid >= MAX_NUMNODES)
- nid = -1;
+ nid = NUMA_NO_NODE;
if (nid > 0 &&
of_read_number(associativity, 1) >= distance_ref_points_depth) {
@@ -244,7 +244,7 @@ out:
*/
static int of_node_to_nid_single(struct device_node *device)
{
- int nid = -1;
+ int nid = NUMA_NO_NODE;
const __be32 *tmp;
tmp = of_get_associativity(device);
@@ -256,7 +256,7 @@ static int of_node_to_nid_single(struct device_node *device)
/* Walk the device tree upwards, looking for an associativity id */
int of_node_to_nid(struct device_node *device)
{
- int nid = -1;
+ int nid = NUMA_NO_NODE;
of_node_get(device);
while (device) {
@@ -454,7 +454,7 @@ static int of_drconf_to_nid_single(struct drmem_lmb *lmb)
*/
static int numa_setup_cpu(unsigned long lcpu)
{
- int nid = -1;
+ int nid = NUMA_NO_NODE;
struct device_node *cpu;
/*
@@ -930,7 +930,7 @@ static int hot_add_drconf_scn_to_nid(unsigned long scn_addr)
{
struct drmem_lmb *lmb;
unsigned long lmb_size;
- int nid = -1;
+ int nid = NUMA_NO_NODE;
lmb_size = drmem_lmb_size();
@@ -960,7 +960,7 @@ static int hot_add_drconf_scn_to_nid(unsigned long scn_addr)
static int hot_add_node_scn_to_nid(unsigned long scn_addr)
{
struct device_node *memory;
- int nid = -1;
+ int nid = NUMA_NO_NODE;
for_each_node_by_type(memory, "memory") {
unsigned long start, size;
diff --git a/arch/powerpc/mm/pgtable-book3e.c b/arch/powerpc/mm/pgtable-book3e.c
index e0ccf36714b2..53cbc7dc2df2 100644
--- a/arch/powerpc/mm/pgtable-book3e.c
+++ b/arch/powerpc/mm/pgtable-book3e.c
@@ -57,12 +57,8 @@ void vmemmap_remove_mapping(unsigned long start,
static __ref void *early_alloc_pgtable(unsigned long size)
{
- void *pt;
-
- pt = __va(memblock_alloc_base(size, size, __pa(MAX_DMA_ADDRESS)));
- memset(pt, 0, size);
-
- return pt;
+ return memblock_alloc_try_nid(size, size, MEMBLOCK_LOW_LIMIT,
+ __pa(MAX_DMA_ADDRESS), NUMA_NO_NODE);
}
/*
diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c
index ecd31569a120..92a3e4c39540 100644
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -195,11 +195,8 @@ void __init mmu_partition_table_init(void)
unsigned long ptcr;
BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large.");
- partition_tb = __va(memblock_alloc_base(patb_size, patb_size,
- MEMBLOCK_ALLOC_ANYWHERE));
-
/* Initialize the Partition Table with no entries */
- memset((void *)partition_tb, 0, patb_size);
+ partition_tb = memblock_alloc(patb_size, patb_size);
/*
* update partition table control register,
@@ -401,6 +398,31 @@ void arch_report_meminfo(struct seq_file *m)
}
#endif /* CONFIG_PROC_FS */
+pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep)
+{
+ unsigned long pte_val;
+
+ /*
+ * Clear the _PAGE_PRESENT so that no hardware parallel update is
+ * possible. Also keep pte_present() true so that we don't take a
+ * spurious fault.
+ */
+ pte_val = pte_update(vma->vm_mm, addr, ptep, _PAGE_PRESENT, _PAGE_INVALID, 0);
+
+ return __pte(pte_val);
+
+}
+
+void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, pte_t old_pte, pte_t pte)
+{
+ if (radix_enabled())
+ return radix__ptep_modify_prot_commit(vma, addr,
+ ptep, old_pte, pte);
+ set_pte_at(vma->vm_mm, addr, ptep, pte);
+}
+
/*
* For hash translation mode, we use the deposited table to store hash slot
* information and they are stored at PTRS_PER_PMD offset from related pmd
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 931156069a81..e377684ac6ad 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -51,26 +51,15 @@ static int native_register_process_table(unsigned long base, unsigned long pg_sz
static __ref void *early_alloc_pgtable(unsigned long size, int nid,
unsigned long region_start, unsigned long region_end)
{
- unsigned long pa = 0;
- void *pt;
+ phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
+ phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE;
- if (region_start || region_end) /* has region hint */
- pa = memblock_alloc_range(size, size, region_start, region_end,
- MEMBLOCK_NONE);
- else if (nid != -1) /* has node hint */
- pa = memblock_alloc_base_nid(size, size,
- MEMBLOCK_ALLOC_ANYWHERE,
- nid, MEMBLOCK_NONE);
+ if (region_start)
+ min_addr = region_start;
+ if (region_end)
+ max_addr = region_end;
- if (!pa)
- pa = memblock_alloc_base(size, size, MEMBLOCK_ALLOC_ANYWHERE);
-
- BUG_ON(!pa);
-
- pt = __va(pa);
- memset(pt, 0, size);
-
- return pt;
+ return memblock_alloc_try_nid(size, size, min_addr, max_addr, nid);
}
static int early_map_kernel_page(unsigned long ea, unsigned long pa,
@@ -1063,3 +1052,21 @@ void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
}
/* See ptesync comment in radix__set_pte_at */
}
+
+void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t old_pte, pte_t pte)
+{
+ struct mm_struct *mm = vma->vm_mm;
+
+ /*
+ * To avoid an NMMU hang while relaxing access, we need to flush the TLB before
+ * we set the new value. We need to do this only for radix, because hash
+ * translation already flushes when updating the Linux pte.
+ */
+ if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
+ (atomic_read(&mm->context.copros) > 0))
+ radix__flush_tlb_page(vma, addr);
+
+ set_pte_at(mm, addr, ptep, pte);
+}
diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c
index 3f4193201ee7..36a664f06c65 100644
--- a/arch/powerpc/mm/ppc_mmu_32.c
+++ b/arch/powerpc/mm/ppc_mmu_32.c
@@ -211,8 +211,7 @@ void __init MMU_init_hw(void)
* Find some memory for the hash table.
*/
if ( ppc_md.progress ) ppc_md.progress("hash:find piece", 0x322);
- Hash = __va(memblock_phys_alloc(Hash_size, Hash_size));
- memset(Hash, 0, Hash_size);
+ Hash = memblock_alloc(Hash_size, Hash_size);
_SDR1 = __pa(Hash) | SDR1_LOW_BITS;
Hash_end = (struct hash_pte *) ((unsigned long)Hash + Hash_size);
diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index 0af051a1974e..0680efb2237b 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -159,12 +159,8 @@ static int read_user_stack_64(unsigned long __user *ptr, unsigned long *ret)
((unsigned long)ptr & 7))
return -EFAULT;
- pagefault_disable();
- if (!__get_user_inatomic(*ret, ptr)) {
- pagefault_enable();
+ if (!probe_user_read(ret, ptr, sizeof(*ret)))
return 0;
- }
- pagefault_enable();
return read_user_stack_slow(ptr, ret, 8);
}
@@ -175,12 +171,8 @@ static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret)
((unsigned long)ptr & 3))
return -EFAULT;
- pagefault_disable();
- if (!__get_user_inatomic(*ret, ptr)) {
- pagefault_enable();
+ if (!probe_user_read(ret, ptr, sizeof(*ret)))
return 0;
- }
- pagefault_enable();
return read_user_stack_slow(ptr, ret, 4);
}
@@ -307,17 +299,11 @@ static inline int current_is_64bit(void)
*/
static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret)
{
- int rc;
-
if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) ||
((unsigned long)ptr & 3))
return -EFAULT;
- pagefault_disable();
- rc = __get_user_inatomic(*ret, ptr);
- pagefault_enable();
-
- return rc;
+ return probe_user_read(ret, ptr, sizeof(*ret));
}
static inline void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry,
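
These call sites stop open-coding the pagefault_disable()/__get_user_inatomic()/pagefault_enable() sequence and use probe_user_read() instead, which, as used in these hunks, returns 0 on success and a non-zero error if the user access would fault. A minimal sketch:

    unsigned long val;

    /* 0 on success; non-zero means the address could not be read without faulting */
    if (probe_user_read(&val, (const void __user *)ptr, sizeof(val)))
        return -EFAULT;   /* or fall back to a slow path, as read_user_stack_64() does */
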
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index b0723002a396..4b64ddf0db68 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -416,7 +416,6 @@ static void power_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
static __u64 power_pmu_bhrb_to(u64 addr)
{
unsigned int instr;
- int ret;
__u64 target;
if (is_kernel_addr(addr)) {
@@ -427,13 +426,8 @@ static __u64 power_pmu_bhrb_to(u64 addr)
}
/* Userspace: need copy instruction here then translate it */
- pagefault_disable();
- ret = __get_user_inatomic(instr, (unsigned int __user *)addr);
- if (ret) {
- pagefault_enable();
+ if (probe_user_read(&instr, (unsigned int __user *)addr, sizeof(instr)))
return 0;
- }
- pagefault_enable();
target = branch_target(&instr);
if ((!target) || (instr & BRANCH_ABSOLUTE))
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 7c544607ba32..d07b753d6b20 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -325,7 +325,7 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK
config PPC_RADIX_MMU
bool "Radix MMU Support"
depends on PPC_BOOK3S_64
- select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA
+ select ARCH_HAS_GIGANTIC_PAGE_RUNTIME_ALLOCATION if (MEMORY_ISOLATION && COMPACTION) || CMA
default y
help
Enable support for the Power ISA 3.0 Radix style MMU. Currently this
diff --git a/arch/powerpc/platforms/pasemi/iommu.c b/arch/powerpc/platforms/pasemi/iommu.c
index f2971522fb4a..f62930f839ca 100644
--- a/arch/powerpc/platforms/pasemi/iommu.c
+++ b/arch/powerpc/platforms/pasemi/iommu.c
@@ -208,7 +208,9 @@ static int __init iob_init(struct device_node *dn)
pr_debug(" -> %s\n", __func__);
/* For 2G space, 8x64 pages (2^21 bytes) is max total l2 size */
- iob_l2_base = (u32 *)__va(memblock_alloc_base(1UL<<21, 1UL<<21, 0x80000000));
+ iob_l2_base = memblock_alloc_try_nid_raw(1UL << 21, 1UL << 21,
+ MEMBLOCK_LOW_LIMIT, 0x80000000,
+ NUMA_NO_NODE);
pr_info("IOBMAP L2 allocated at: %p\n", iob_l2_base);
@@ -269,4 +271,3 @@ void __init iommu_init_early_pasemi(void)
pasemi_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pasemi;
set_pci_dma_ops(&dma_iommu_ops);
}
-
diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c
index 84d038ed3882..248a38ad25c7 100644
--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -20,6 +20,7 @@
#include <linux/slab.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
+#include <linux/numa.h>
#include <asm/machdep.h>
#include <asm/debugfs.h>
@@ -223,7 +224,7 @@ static int memtrace_online(void)
ent = &memtrace_array[i];
/* We have onlined this chunk previously */
- if (ent->nid == -1)
+ if (ent->nid == NUMA_NO_NODE)
continue;
/* Remove from io mappings */
@@ -257,7 +258,7 @@ static int memtrace_online(void)
*/
debugfs_remove_recursive(ent->dir);
pr_info("Added trace memory back to node %d\n", ent->nid);
- ent->size = ent->start = ent->nid = -1;
+ ent->size = ent->start = ent->nid = NUMA_NO_NODE;
}
if (ret)
return ret;
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index d1072d464e99..0711e2455f72 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -170,8 +170,7 @@ int __init early_init_dt_scan_recoverable_ranges(unsigned long node,
/*
* Allocate a buffer to hold the MC recoverable ranges.
*/
- mc_recoverable_range =__va(memblock_phys_alloc(size, __alignof__(u64)));
- memset(mc_recoverable_range, 0, size);
+ mc_recoverable_range = memblock_alloc(size, __alignof__(u64));
for (i = 0; i < mc_recoverable_range_len; i++) {
mc_recoverable_range[i].start_addr =
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 41f62ca27c63..e4f0dfd4ae33 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -130,8 +130,13 @@ static void __init fwnmi_init(void)
* It will be used in real mode mce handler, hence it needs to be
* below RMA.
*/
- mce_data_buf = __va(memblock_alloc_base(RTAS_ERROR_LOG_MAX * nr_cpus,
- RTAS_ERROR_LOG_MAX, ppc64_rma_size));
+ mce_data_buf = memblock_alloc_try_nid_raw(RTAS_ERROR_LOG_MAX * nr_cpus,
+ RTAS_ERROR_LOG_MAX, MEMBLOCK_LOW_LIMIT,
+ ppc64_rma_size, NUMA_NO_NODE);
+ if (!mce_data_buf)
+ panic("Failed to allocate %d bytes below %pa for MCE buffer\n",
+ RTAS_ERROR_LOG_MAX * nr_cpus, &ppc64_rma_size);
+
for_each_possible_cpu(i) {
paca_ptrs[i]->mce_data_buf = mce_data_buf +
(RTAS_ERROR_LOG_MAX * i);
@@ -140,8 +145,13 @@ static void __init fwnmi_init(void)
#ifdef CONFIG_PPC_BOOK3S_64
/* Allocate per cpu slb area to save old slb contents during MCE */
size = sizeof(struct slb_entry) * mmu_slb_size * nr_cpus;
- slb_ptr = __va(memblock_alloc_base(size, sizeof(struct slb_entry),
- ppc64_rma_size));
+ slb_ptr = memblock_alloc_try_nid_raw(size, sizeof(struct slb_entry),
+ MEMBLOCK_LOW_LIMIT, ppc64_rma_size,
+ NUMA_NO_NODE);
+ if (!slb_ptr)
+ panic("Failed to allocate %zu bytes below %pa for slb area\n",
+ size, &ppc64_rma_size);
+
for_each_possible_cpu(i)
paca_ptrs[i]->mce_faulty_slbs = slb_ptr + (mmu_slb_size * i);
#endif
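
The _raw variant does not zero the allocated memory and returns NULL on failure (the old memblock_alloc_base() call panicked internally), so fwnmi_init() now checks the result and panics with a descriptive message itself. Sketch of the pattern, with the sizes and limits kept symbolic:

    buf = memblock_alloc_try_nid_raw(size, align, MEMBLOCK_LOW_LIMIT,
                                     ppc64_rma_size, NUMA_NO_NODE);
    if (!buf)
        panic("Failed to allocate %zu bytes below %pa\n", size, &ppc64_rma_size);
    /* _raw: contents are not zeroed; memset() here if zeroed memory is needed */
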
diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c
index a5b40d1460f1..25bc25fe0d93 100644
--- a/arch/powerpc/sysdev/dart_iommu.c
+++ b/arch/powerpc/sysdev/dart_iommu.c
@@ -251,8 +251,11 @@ static void allocate_dart(void)
* 16MB (1 << 24) alignment. We allocate a full 16Mb chuck since we
* will blow up an entire large page anyway in the kernel mapping.
*/
- dart_tablebase = __va(memblock_alloc_base(1UL<<24,
- 1UL<<24, 0x80000000L));
+ dart_tablebase = memblock_alloc_try_nid_raw(SZ_16M, SZ_16M,
+ MEMBLOCK_LOW_LIMIT, SZ_2G,
+ NUMA_NO_NODE);
+ if (!dart_tablebase)
+ panic("Failed to allocate 16MB below 2GB for DART table\n");
/* There is no point scanning the DART space for leaks*/
kmemleak_no_scan((void *)dart_tablebase);
diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c
index 918be816b097..c8a1b26489f5 100644
--- a/arch/powerpc/sysdev/fsl_pci.c
+++ b/arch/powerpc/sysdev/fsl_pci.c
@@ -1068,13 +1068,11 @@ int fsl_pci_mcheck_exception(struct pt_regs *regs)
addr += mfspr(SPRN_MCAR);
if (is_in_pci_mem_space(addr)) {
- if (user_mode(regs)) {
- pagefault_disable();
- ret = get_user(inst, (__u32 __user *)regs->nip);
- pagefault_enable();
- } else {
+ if (user_mode(regs))
+ ret = probe_user_read(&inst, (void __user *)regs->nip,
+ sizeof(inst));
+ else
ret = probe_kernel_address((void *)regs->nip, inst);
- }
if (!ret && mcheck_handle_load(regs, inst)) {
regs->nip += 4;
diff --git a/arch/riscv/kernel/vdso.c b/arch/riscv/kernel/vdso.c
index 582cb153eb24..0cd044122234 100644
--- a/arch/riscv/kernel/vdso.c
+++ b/arch/riscv/kernel/vdso.c
@@ -54,7 +54,6 @@ static int __init vdso_init(void)
struct page *pg;
pg = virt_to_page(vdso_start + (i << PAGE_SHIFT));
- ClearPageReserved(pg);
vdso_pagelist[i] = pg;
}
vdso_pagelist[i] = virt_to_page(vdso_data);
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index b6e3d0653002..d6af92470a9a 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -69,7 +69,7 @@ config S390
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
- select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA
+ select ARCH_HAS_GIGANTIC_PAGE_RUNTIME_ALLOCATION if (MEMORY_ISOLATION && COMPACTION) || CMA
select ARCH_HAS_KCOV
select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_SET_MEMORY
diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h
index 2d1afa58a4b6..57c952f5388e 100644
--- a/arch/s390/include/asm/hugetlb.h
+++ b/arch/s390/include/asm/hugetlb.h
@@ -116,7 +116,10 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot)
return pte_modify(pte, newprot);
}
-#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
-static inline bool gigantic_page_supported(void) { return true; }
+#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE_RUNTIME_ALLOCATION
+static inline bool gigantic_page_runtime_allocation_supported(void)
+{
+ return true;
+}
#endif
#endif /* _ASM_S390_HUGETLB_H */
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 063732414dfb..76dc344edb8c 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1069,8 +1069,9 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
}
#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
-pte_t ptep_modify_prot_start(struct mm_struct *, unsigned long, pte_t *);
-void ptep_modify_prot_commit(struct mm_struct *, unsigned long, pte_t *, pte_t);
+pte_t ptep_modify_prot_start(struct vm_area_struct *, unsigned long, pte_t *);
+void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long,
+ pte_t *, pte_t, pte_t);
#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
static inline pte_t ptep_clear_flush(struct vm_area_struct *vma,
diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c
index 4ff354887db4..e7920a68a12e 100644
--- a/arch/s390/kernel/vdso.c
+++ b/arch/s390/kernel/vdso.c
@@ -291,7 +291,6 @@ static int __init vdso_init(void)
BUG_ON(vdso32_pagelist == NULL);
for (i = 0; i < vdso32_pages - 1; i++) {
struct page *pg = virt_to_page(vdso32_kbase + i*PAGE_SIZE);
- ClearPageReserved(pg);
get_page(pg);
vdso32_pagelist[i] = pg;
}
@@ -309,7 +308,6 @@ static int __init vdso_init(void)
BUG_ON(vdso64_pagelist == NULL);
for (i = 0; i < vdso64_pages - 1; i++) {
struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE);
- ClearPageReserved(pg);
get_page(pg);
vdso64_pagelist[i] = pg;
}
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 689b66f29fc6..8485d6dc2754 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -301,12 +301,13 @@ pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr,
}
EXPORT_SYMBOL(ptep_xchg_lazy);
-pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
+pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
pte_t *ptep)
{
pgste_t pgste;
pte_t old;
int nodat;
+ struct mm_struct *mm = vma->vm_mm;
preempt_disable();
pgste = ptep_xchg_start(mm, addr, ptep);
@@ -319,10 +320,11 @@ pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
return old;
}
-void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte)
+void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, pte_t old_pte, pte_t pte)
{
pgste_t pgste;
+ struct mm_struct *mm = vma->vm_mm;
if (!MACHINE_HAS_NX)
pte_val(pte) &= ~_PAGE_NOEXEC;
diff --git a/arch/s390/numa/numa.c b/arch/s390/numa/numa.c
index d31bde0870d8..2d1271e2a70d 100644
--- a/arch/s390/numa/numa.c
+++ b/arch/s390/numa/numa.c
@@ -58,18 +58,6 @@ EXPORT_SYMBOL(__node_distance);
int numa_debug_enabled;
/*
- * alloc_node_data() - Allocate node data
- */
-static __init pg_data_t *alloc_node_data(void)
-{
- pg_data_t *res;
-
- res = (pg_data_t *) memblock_phys_alloc(sizeof(pg_data_t), 8);
- memset(res, 0, sizeof(pg_data_t));
- return res;
-}
-
-/*
* numa_setup_memory() - Assign bootmem to nodes
*
* The memory is first added to memblock without any respect to nodes.
@@ -105,7 +93,7 @@ static void __init numa_setup_memory(void)
/* Allocate and fill out node_data */
for (nid = 0; nid < MAX_NUMNODES; nid++)
- NODE_DATA(nid) = alloc_node_data();
+ NODE_DATA(nid) = memblock_alloc(sizeof(pg_data_t), 8);
for_each_online_node(nid) {
unsigned long start_pfn, end_pfn;
diff --git a/arch/sh/configs/apsh4a3a_defconfig b/arch/sh/configs/apsh4a3a_defconfig
index 4710df43a5b5..6c7cdc3beb28 100644
--- a/arch/sh/configs/apsh4a3a_defconfig
+++ b/arch/sh/configs/apsh4a3a_defconfig
@@ -81,7 +81,6 @@ CONFIG_NLS_CODEPAGE_932=y
CONFIG_NLS_ASCII=y
CONFIG_NLS_ISO8859_1=y
CONFIG_NLS_UTF8=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_DEBUG_FS=y
CONFIG_DEBUG_KERNEL=y
diff --git a/arch/sh/configs/edosk7705_defconfig b/arch/sh/configs/edosk7705_defconfig
index db756e099052..ef7cc31997b1 100644
--- a/arch/sh/configs/edosk7705_defconfig
+++ b/arch/sh/configs/edosk7705_defconfig
@@ -32,6 +32,5 @@ CONFIG_SH_PCLK_FREQ=31250000
# CONFIG_DNOTIFY is not set
# CONFIG_PROC_FS is not set
# CONFIG_SYSFS is not set
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
# CONFIG_CRC32 is not set
diff --git a/arch/sh/configs/espt_defconfig b/arch/sh/configs/espt_defconfig
index 2985fe7c6d50..444d75947e70 100644
--- a/arch/sh/configs/espt_defconfig
+++ b/arch/sh/configs/espt_defconfig
@@ -111,7 +111,6 @@ CONFIG_NLS_ISO8859_15=y
CONFIG_NLS_KOI8_R=y
CONFIG_NLS_KOI8_U=y
CONFIG_NLS_UTF8=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_DEBUG_FS=y
# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/sdk7786_defconfig b/arch/sh/configs/sdk7786_defconfig
index e9ee0c878ead..d16e9334cd98 100644
--- a/arch/sh/configs/sdk7786_defconfig
+++ b/arch/sh/configs/sdk7786_defconfig
@@ -209,7 +209,6 @@ CONFIG_NLS_ISO8859_1=y
CONFIG_NLS_ISO8859_15=m
CONFIG_NLS_UTF8=m
CONFIG_PRINTK_TIME=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_KERNEL=y
diff --git a/arch/sh/configs/sh2007_defconfig b/arch/sh/configs/sh2007_defconfig
index 34094e05e892..a1cf6447dbb1 100644
--- a/arch/sh/configs/sh2007_defconfig
+++ b/arch/sh/configs/sh2007_defconfig
@@ -157,7 +157,6 @@ CONFIG_NLS_ISO8859_15=y
CONFIG_NLS_KOI8_R=y
CONFIG_NLS_KOI8_U=y
CONFIG_NLS_UTF8=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_DEBUG_FS=y
CONFIG_DEBUG_KERNEL=y
diff --git a/arch/sh/configs/sh7724_generic_defconfig b/arch/sh/configs/sh7724_generic_defconfig
index d15e53647983..d04bc27aa816 100644
--- a/arch/sh/configs/sh7724_generic_defconfig
+++ b/arch/sh/configs/sh7724_generic_defconfig
@@ -40,6 +40,5 @@ CONFIG_UIO_PDRV_GENIRQ=y
# CONFIG_PROC_FS is not set
# CONFIG_SYSFS is not set
# CONFIG_MISC_FILESYSTEMS is not set
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
# CONFIG_CRC32 is not set
diff --git a/arch/sh/configs/sh7763rdp_defconfig b/arch/sh/configs/sh7763rdp_defconfig
index 2ef780fb9813..405bf62d22d0 100644
--- a/arch/sh/configs/sh7763rdp_defconfig
+++ b/arch/sh/configs/sh7763rdp_defconfig
@@ -113,7 +113,6 @@ CONFIG_NLS_ISO8859_15=y
CONFIG_NLS_KOI8_R=y
CONFIG_NLS_KOI8_U=y
CONFIG_NLS_UTF8=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_DEBUG_FS=y
# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/sh7770_generic_defconfig b/arch/sh/configs/sh7770_generic_defconfig
index 742634b37c0a..e5b733c2d988 100644
--- a/arch/sh/configs/sh7770_generic_defconfig
+++ b/arch/sh/configs/sh7770_generic_defconfig
@@ -42,6 +42,5 @@ CONFIG_UIO_PDRV_GENIRQ=y
# CONFIG_PROC_FS is not set
# CONFIG_SYSFS is not set
# CONFIG_MISC_FILESYSTEMS is not set
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
# CONFIG_CRC32 is not set
diff --git a/arch/sh/configs/sh7785lcr_defconfig b/arch/sh/configs/sh7785lcr_defconfig
index 7098828d392e..5201bb78c6f9 100644
--- a/arch/sh/configs/sh7785lcr_defconfig
+++ b/arch/sh/configs/sh7785lcr_defconfig
@@ -109,7 +109,6 @@ CONFIG_NFSD_V4=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_CODEPAGE_932=y
CONFIG_NLS_ISO8859_1=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_DEBUG_KERNEL=y
CONFIG_DETECT_HUNG_TASK=y
diff --git a/arch/sh/configs/ul2_defconfig b/arch/sh/configs/ul2_defconfig
index 5f2921a85192..1b7412df12e0 100644
--- a/arch/sh/configs/ul2_defconfig
+++ b/arch/sh/configs/ul2_defconfig
@@ -82,7 +82,6 @@ CONFIG_NFSD=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_CODEPAGE_932=y
CONFIG_NLS_ISO8859_1=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_CRYPTO_MICHAEL_MIC=y
# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/sh/configs/urquell_defconfig b/arch/sh/configs/urquell_defconfig
index 7d5591b7c088..f891045e633a 100644
--- a/arch/sh/configs/urquell_defconfig
+++ b/arch/sh/configs/urquell_defconfig
@@ -136,7 +136,6 @@ CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_CODEPAGE_932=y
CONFIG_NLS_ISO8859_1=y
CONFIG_PRINTK_TIME=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_DEBUG_FS=y
CONFIG_DEBUG_KERNEL=y
diff --git a/arch/sh/kernel/syscalls/syscalltbl.sh b/arch/sh/kernel/syscalls/syscalltbl.sh
index 85d78d9309ad..904b8e6e625d 100644
--- a/arch/sh/kernel/syscalls/syscalltbl.sh
+++ b/arch/sh/kernel/syscalls/syscalltbl.sh
@@ -13,10 +13,10 @@ emit() {
t_entry="$3"
while [ $t_nxt -lt $t_nr ]; do
- printf "__SYSCALL(%s, sys_ni_syscall, )\n" "${t_nxt}"
+ printf "__SYSCALL(%s,sys_ni_syscall)\n" "${t_nxt}"
t_nxt=$((t_nxt+1))
done
- printf "__SYSCALL(%s, %s, )\n" "${t_nxt}" "${t_entry}"
+ printf "__SYSCALL(%s,%s)\n" "${t_nxt}" "${t_entry}"
}
grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
diff --git a/arch/sh/kernel/syscalls_32.S b/arch/sh/kernel/syscalls_32.S
index 96e9c54a07f5..bd1a9c544767 100644
--- a/arch/sh/kernel/syscalls_32.S
+++ b/arch/sh/kernel/syscalls_32.S
@@ -10,7 +10,7 @@
#include <linux/sys.h>
#include <linux/linkage.h>
-#define __SYSCALL(nr, entry, nargs) .long entry
+#define __SYSCALL(nr, entry) .long entry
.data
ENTRY(sys_call_table)
#include <asm/syscall_table.h>
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index a8e5c0e00fca..a0fa4de03dd5 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -192,24 +192,16 @@ void __init page_table_range_init(unsigned long start, unsigned long end,
void __init allocate_pgdat(unsigned int nid)
{
unsigned long start_pfn, end_pfn;
-#ifdef CONFIG_NEED_MULTIPLE_NODES
- unsigned long phys;
-#endif
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
#ifdef CONFIG_NEED_MULTIPLE_NODES
- phys = __memblock_alloc_base(sizeof(struct pglist_data),
- SMP_CACHE_BYTES, end_pfn << PAGE_SHIFT);
- /* Retry with all of system memory */
- if (!phys)
- phys = __memblock_alloc_base(sizeof(struct pglist_data),
- SMP_CACHE_BYTES, memblock_end_of_DRAM());
- if (!phys)
+ NODE_DATA(nid) = memblock_alloc_try_nid_nopanic(
+ sizeof(struct pglist_data),
+ SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT,
+ MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ if (!NODE_DATA(nid))
panic("Can't allocate pgdat for node %d\n", nid);
-
- NODE_DATA(nid) = __va(phys);
- memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
#endif
NODE_DATA(nid)->node_start_pfn = start_pfn;
diff --git a/arch/sh/mm/numa.c b/arch/sh/mm/numa.c
index 830e8b3684e4..c4bde6148810 100644
--- a/arch/sh/mm/numa.c
+++ b/arch/sh/mm/numa.c
@@ -41,9 +41,8 @@ void __init setup_bootmem_node(int nid, unsigned long start, unsigned long end)
__add_active_range(nid, start_pfn, end_pfn);
/* Node-local pgdat */
- NODE_DATA(nid) = __va(memblock_alloc_base(sizeof(struct pglist_data),
- SMP_CACHE_BYTES, end));
- memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
+ NODE_DATA(nid) = memblock_alloc_node(sizeof(struct pglist_data),
+ SMP_CACHE_BYTES, nid);
NODE_DATA(nid)->node_start_pfn = start_pfn;
NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
diff --git a/arch/sparc/configs/sparc32_defconfig b/arch/sparc/configs/sparc32_defconfig
index 207a43a2d8b3..2d4f34c52c67 100644
--- a/arch/sparc/configs/sparc32_defconfig
+++ b/arch/sparc/configs/sparc32_defconfig
@@ -75,7 +75,6 @@ CONFIG_NFS_FS=y
CONFIG_ROOT_NFS=y
CONFIG_RPCSEC_GSS_KRB5=m
CONFIG_NLS=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
CONFIG_DEBUG_KERNEL=y
CONFIG_DETECT_HUNG_TASK=y
# CONFIG_SCHED_DEBUG is not set
diff --git a/arch/sparc/configs/sparc64_defconfig b/arch/sparc/configs/sparc64_defconfig
index 4d4e1cc6402f..ea547d596fcf 100644
--- a/arch/sparc/configs/sparc64_defconfig
+++ b/arch/sparc/configs/sparc64_defconfig
@@ -201,7 +201,6 @@ CONFIG_PROC_KCORE=y
CONFIG_TMPFS=y
CONFIG_HUGETLBFS=y
CONFIG_PRINTK_TIME=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_KERNEL=y
CONFIG_LOCKUP_DETECTOR=y
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 1393a8ac596b..22500c3be7a9 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -231,36 +231,6 @@ extern unsigned long _PAGE_ALL_SZ_BITS;
extern struct page *mem_map_zero;
#define ZERO_PAGE(vaddr) (mem_map_zero)
-/* This macro must be updated when the size of struct page grows above 80
- * or reduces below 64.
- * The idea that compiler optimizes out switch() statement, and only
- * leaves clrx instructions
- */
-#define mm_zero_struct_page(pp) do { \
- unsigned long *_pp = (void *)(pp); \
- \
- /* Check that struct page is either 64, 72, or 80 bytes */ \
- BUILD_BUG_ON(sizeof(struct page) & 7); \
- BUILD_BUG_ON(sizeof(struct page) < 64); \
- BUILD_BUG_ON(sizeof(struct page) > 80); \
- \
- switch (sizeof(struct page)) { \
- case 80: \
- _pp[9] = 0; /* fallthrough */ \
- case 72: \
- _pp[8] = 0; /* fallthrough */ \
- default: \
- _pp[7] = 0; \
- _pp[6] = 0; \
- _pp[5] = 0; \
- _pp[4] = 0; \
- _pp[3] = 0; \
- _pp[2] = 0; \
- _pp[1] = 0; \
- _pp[0] = 0; \
- } \
-} while (0)
-
/* PFNs are real physical page numbers. However, mem_map only begins to record
* per-page information starting at pfn_base. This is to handle systems where
* the first physical page in the machine is at some huge physical address,
diff --git a/arch/sparc/kernel/pci_fire.c b/arch/sparc/kernel/pci_fire.c
index be71ae086622..0ca08d455e80 100644
--- a/arch/sparc/kernel/pci_fire.c
+++ b/arch/sparc/kernel/pci_fire.c
@@ -11,6 +11,7 @@
#include <linux/export.h>
#include <linux/irq.h>
#include <linux/of_device.h>
+#include <linux/numa.h>
#include <asm/prom.h>
#include <asm/irq.h>
@@ -416,7 +417,7 @@ static int pci_fire_pbm_init(struct pci_pbm_info *pbm,
struct device_node *dp = op->dev.of_node;
int err;
- pbm->numa_node = -1;
+ pbm->numa_node = NUMA_NO_NODE;
pbm->pci_ops = &sun4u_pci_ops;
pbm->config_space_reg_bits = 12;
diff --git a/arch/sparc/kernel/pci_schizo.c b/arch/sparc/kernel/pci_schizo.c
index 934b97c72f7c..421aba00e6b0 100644
--- a/arch/sparc/kernel/pci_schizo.c
+++ b/arch/sparc/kernel/pci_schizo.c
@@ -12,6 +12,7 @@
#include <linux/export.h>
#include <linux/interrupt.h>
#include <linux/of_device.h>
+#include <linux/numa.h>
#include <asm/iommu.h>
#include <asm/irq.h>
@@ -1347,7 +1348,7 @@ static int schizo_pbm_init(struct pci_pbm_info *pbm,
pbm->next = pci_pbm_root;
pci_pbm_root = pbm;
- pbm->numa_node = -1;
+ pbm->numa_node = NUMA_NO_NODE;
pbm->pci_ops = &sun4u_pci_ops;
pbm->config_space_reg_bits = 8;
diff --git a/arch/sparc/kernel/prom_64.c b/arch/sparc/kernel/prom_64.c
index e897a4ded3a1..c50ff1fee0e6 100644
--- a/arch/sparc/kernel/prom_64.c
+++ b/arch/sparc/kernel/prom_64.c
@@ -34,16 +34,13 @@
void * __init prom_early_alloc(unsigned long size)
{
- unsigned long paddr = memblock_phys_alloc(size, SMP_CACHE_BYTES);
- void *ret;
+ void *ret = memblock_alloc(size, SMP_CACHE_BYTES);
- if (!paddr) {
+ if (!ret) {
prom_printf("prom_early_alloc(%lu) failed\n", size);
prom_halt();
}
- ret = __va(paddr);
- memset(ret, 0, size);
prom_early_allocated += size;
return ret;
diff --git a/arch/sparc/kernel/psycho_common.c b/arch/sparc/kernel/psycho_common.c
index 81aa91e5c0e6..e90bcb6bad7f 100644
--- a/arch/sparc/kernel/psycho_common.c
+++ b/arch/sparc/kernel/psycho_common.c
@@ -5,6 +5,7 @@
*/
#include <linux/kernel.h>
#include <linux/interrupt.h>
+#include <linux/numa.h>
#include <asm/upa.h>
@@ -454,7 +455,7 @@ void psycho_pbm_init_common(struct pci_pbm_info *pbm, struct platform_device *op
struct device_node *dp = op->dev.of_node;
pbm->name = dp->full_name;
- pbm->numa_node = -1;
+ pbm->numa_node = NUMA_NO_NODE;
pbm->chip_type = chip_type;
pbm->chip_version = of_getintprop_default(dp, "version#", 0);
pbm->chip_revision = of_getintprop_default(dp, "module-revision#", 0);
diff --git a/arch/sparc/kernel/sbus.c b/arch/sparc/kernel/sbus.c
index 41c5deb581b8..32141e1006c4 100644
--- a/arch/sparc/kernel/sbus.c
+++ b/arch/sparc/kernel/sbus.c
@@ -15,6 +15,7 @@
#include <linux/interrupt.h>
#include <linux/of.h>
#include <linux/of_device.h>
+#include <linux/numa.h>
#include <asm/page.h>
#include <asm/io.h>
@@ -561,7 +562,7 @@ static void __init sbus_iommu_init(struct platform_device *op)
op->dev.archdata.iommu = iommu;
op->dev.archdata.stc = strbuf;
- op->dev.archdata.numa_node = -1;
+ op->dev.archdata.numa_node = NUMA_NO_NODE;
reg_base = regs + SYSIO_IOMMUREG_BASE;
iommu->iommu_control = reg_base + IOMMU_CONTROL;
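The sparc hunks above all make the same substitution: the open-coded -1 sentinel becomes NUMA_NO_NODE from <linux/numa.h>. A minimal sketch of the pattern, not taken from the patch (set_dev_node() and first_online_node are used here purely for illustration):

#include <linux/device.h>
#include <linux/nodemask.h>
#include <linux/numa.h>

static void example_assign_node(struct device *dev, int nid)
{
	/* NUMA_NO_NODE (-1) means "no node affinity is known yet". */
	if (nid == NUMA_NO_NODE)
		nid = first_online_node;
	set_dev_node(dev, nid);
}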
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index b4221d3727d0..ef340e8f209f 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -976,13 +976,13 @@ static u64 __init memblock_nid_range_sun4u(u64 start, u64 end, int *nid)
{
int prev_nid, new_nid;
- prev_nid = -1;
+ prev_nid = NUMA_NO_NODE;
for ( ; start < end; start += PAGE_SIZE) {
for (new_nid = 0; new_nid < num_node_masks; new_nid++) {
struct node_mem_mask *p = &node_masks[new_nid];
if ((start & p->mask) == p->match) {
- if (prev_nid == -1)
+ if (prev_nid == NUMA_NO_NODE)
prev_nid = new_nid;
break;
}
@@ -1089,16 +1089,13 @@ static void __init allocate_node_data(int nid)
struct pglist_data *p;
unsigned long start_pfn, end_pfn;
#ifdef CONFIG_NEED_MULTIPLE_NODES
- unsigned long paddr;
- paddr = memblock_phys_alloc_try_nid(sizeof(struct pglist_data),
- SMP_CACHE_BYTES, nid);
- if (!paddr) {
+ NODE_DATA(nid) = memblock_alloc_node(sizeof(struct pglist_data),
+ SMP_CACHE_BYTES, nid);
+ if (!NODE_DATA(nid)) {
prom_printf("Cannot allocate pglist_data for nid[%d]\n", nid);
prom_halt();
}
- NODE_DATA(nid) = __va(paddr);
- memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
NODE_DATA(nid)->node_id = nid;
#endif
@@ -1208,7 +1205,7 @@ int of_node_to_nid(struct device_node *dp)
md = mdesc_grab();
count = 0;
- nid = -1;
+ nid = NUMA_NO_NODE;
mdesc_for_each_node_by_name(md, grp, "group") {
if (!scan_arcs_for_cfg_handle(md, grp, cfg_handle)) {
nid = count;
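The allocate_node_data() hunk above follows the same conversion as the earlier prom_early_alloc() change: memblock_alloc_node() (like memblock_alloc()) returns zeroed, already-mapped memory, so the memblock_phys_alloc()/__va()/memset() sequence collapses into a single call. A hedged sketch of the resulting pattern, for illustration only (the function name and panic message are made up):

#include <linux/cache.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/memblock.h>
#include <linux/mmzone.h>

static void __init example_alloc_node_data(int nid)
{
	struct pglist_data *p;

	p = memblock_alloc_node(sizeof(*p), SMP_CACHE_BYTES, nid);
	if (!p)
		panic("Cannot allocate pglist_data for node %d\n", nid);

	/* The memory comes back zero-filled; only set what we need. */
	p->node_id = nid;
}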
diff --git a/arch/unicore32/mm/mmu.c b/arch/unicore32/mm/mmu.c
index 040a8c279761..a40219291965 100644
--- a/arch/unicore32/mm/mmu.c
+++ b/arch/unicore32/mm/mmu.c
@@ -141,18 +141,12 @@ static void __init build_mem_type_table(void)
#define vectors_base() (vectors_high() ? 0xffff0000 : 0)
-static void __init *early_alloc(unsigned long sz)
-{
- void *ptr = __va(memblock_phys_alloc(sz, sz));
- memset(ptr, 0, sz);
- return ptr;
-}
-
static pte_t * __init early_pte_alloc(pmd_t *pmd, unsigned long addr,
unsigned long prot)
{
if (pmd_none(*pmd)) {
- pte_t *pte = early_alloc(PTRS_PER_PTE * sizeof(pte_t));
+ pte_t *pte = memblock_alloc(PTRS_PER_PTE * sizeof(pte_t),
+ PTRS_PER_PTE * sizeof(pte_t));
__pmd_populate(pmd, __pa(pte) | prot);
}
BUG_ON(pmd_bad(*pmd));
@@ -354,7 +348,7 @@ static void __init devicemaps_init(void)
/*
* Allocate the vector page early.
*/
- vectors = early_alloc(PAGE_SIZE);
+ vectors = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
for (addr = VMALLOC_END; addr; addr += PGDIR_SIZE)
pmd_clear(pmd_off_k(addr));
@@ -431,7 +425,7 @@ void __init paging_init(void)
top_pmd = pmd_off_k(0xffff0000);
/* allocate the zero page. */
- zero_page = early_alloc(PAGE_SIZE);
+ zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
bootmem_init();
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 56107e27ec6a..bfce02cbf261 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -23,7 +23,7 @@ config X86_64
def_bool y
depends on 64BIT
# Options that are inherently 64-bit kernel only:
- select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA
+ select ARCH_HAS_GIGANTIC_PAGE_RUNTIME_ALLOCATION if (MEMORY_ISOLATION && COMPACTION) || CMA
select ARCH_SUPPORTS_INT128
select ARCH_USE_CMPXCHG_LOCKREF
select HAVE_ARCH_SOFT_DIRTY
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 5d946f4a881f..9f908112bbb9 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -287,7 +287,6 @@ CONFIG_NLS_ASCII=y
CONFIG_NLS_ISO8859_1=y
CONFIG_NLS_UTF8=y
CONFIG_PRINTK_TIME=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
CONFIG_FRAME_WARN=1024
CONFIG_MAGIC_SYSRQ=y
# CONFIG_UNUSED_SYMBOLS is not set
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 300ad569d6a7..1d3badfda09e 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -286,7 +286,6 @@ CONFIG_NLS_ASCII=y
CONFIG_NLS_ISO8859_1=y
CONFIG_NLS_UTF8=y
CONFIG_PRINTK_TIME=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
CONFIG_MAGIC_SYSRQ=y
# CONFIG_UNUSED_SYMBOLS is not set
CONFIG_DEBUG_KERNEL=y
diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
index 7469d321f072..5a5e7119ced4 100644
--- a/arch/x86/include/asm/hugetlb.h
+++ b/arch/x86/include/asm/hugetlb.h
@@ -17,8 +17,11 @@ static inline void arch_clear_hugepage_flags(struct page *page)
{
}
-#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
-static inline bool gigantic_page_supported(void) { return true; }
+#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE_RUNTIME_ALLOCATION
+static inline bool gigantic_page_runtime_allocation_supported(void)
+{
+ return true;
+}
#endif
#endif /* _ASM_X86_HUGETLB_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index a97f28d914d5..c25c38a05c1c 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -422,25 +422,26 @@ static inline pgdval_t pgd_val(pgd_t pgd)
}
#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
-static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
+static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
pte_t *ptep)
{
pteval_t ret;
- ret = PVOP_CALL3(pteval_t, mmu.ptep_modify_prot_start, mm, addr, ptep);
+ ret = PVOP_CALL3(pteval_t, mmu.ptep_modify_prot_start, vma, addr, ptep);
return (pte_t) { .pte = ret };
}
-static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte)
+static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, pte_t old_pte, pte_t pte)
{
+
if (sizeof(pteval_t) > sizeof(long))
/* 5 arg words */
- pv_ops.mmu.ptep_modify_prot_commit(mm, addr, ptep, pte);
+ pv_ops.mmu.ptep_modify_prot_commit(vma, addr, ptep, pte);
else
PVOP_VCALL4(mmu.ptep_modify_prot_commit,
- mm, addr, ptep, pte.pte);
+ vma, addr, ptep, pte.pte);
}
static inline void set_pte(pte_t *ptep, pte_t pte)
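With the prototypes above now taking a struct vm_area_struct, a caller drives the transaction as before but passes the VMA and, on commit, the pte returned by the start call. A hedged sketch of that calling pattern; the helper name is invented and the surrounding page-table locking is assumed, not shown:

#include <linux/mm.h>

static void example_mark_pte_dirty(struct vm_area_struct *vma,
				   unsigned long addr, pte_t *ptep)
{
	pte_t old_pte, pte;

	old_pte = ptep_modify_prot_start(vma, addr, ptep);
	pte = pte_mkdirty(old_pte);
	ptep_modify_prot_commit(vma, addr, ptep, old_pte, pte);
}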
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 488c59686a73..2474e434a6f7 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -55,6 +55,7 @@ struct task_struct;
struct cpumask;
struct flush_tlb_info;
struct mmu_gather;
+struct vm_area_struct;
/*
* Wrapper type for pointers to code which uses the non-standard
@@ -254,9 +255,9 @@ struct pv_mmu_ops {
pte_t *ptep, pte_t pteval);
void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
- pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr,
+ pte_t (*ptep_modify_prot_start)(struct vm_area_struct *vma, unsigned long addr,
pte_t *ptep);
- void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr,
+ void (*ptep_modify_prot_commit)(struct vm_area_struct *vma, unsigned long addr,
pte_t *ptep, pte_t pte);
struct paravirt_callee_save pte_val;
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 662963681ea6..e662f987dfa2 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -7,6 +7,7 @@
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/scatterlist.h>
+#include <linux/numa.h>
#include <asm/io.h>
#include <asm/pat.h>
#include <asm/x86_init.h>
@@ -141,7 +142,7 @@ cpumask_of_pcibus(const struct pci_bus *bus)
int node;
node = __pcibus_to_node(bus);
- return (node == -1) ? cpu_online_mask :
+ return (node == NUMA_NO_NODE) ? cpu_online_mask :
cpumask_of_node(node);
}
#endif
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 780f2b42c8ef..7a1778de0bbf 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -76,7 +76,7 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
#endif
/**
- * access_ok: - Checks if a user space pointer is valid
+ * access_ok - Checks if a user space pointer is valid
* @addr: User space pointer to start of block to check
* @size: Size of block to check
*
@@ -85,12 +85,12 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
*
* Checks if a pointer to a block of memory in user space is valid.
*
- * Returns true (nonzero) if the memory block may be valid, false (zero)
- * if it is definitely invalid.
- *
* Note that, depending on architecture, this function probably just
* checks that the pointer is in the user space range - after calling
* this function, memory access functions may still return -EFAULT.
+ *
+ * Return: true (nonzero) if the memory block may be valid, false (zero)
+ * if it is definitely invalid.
*/
#define access_ok(addr, size) \
({ \
@@ -135,7 +135,7 @@ extern int __get_user_bad(void);
__typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
/**
- * get_user: - Get a simple variable from user space.
+ * get_user - Get a simple variable from user space.
* @x: Variable to store result.
* @ptr: Source address, in user space.
*
@@ -149,7 +149,7 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
* @ptr must have pointer-to-simple-variable type, and the result of
* dereferencing @ptr must be assignable to @x without a cast.
*
- * Returns zero on success, or -EFAULT on error.
+ * Return: zero on success, or -EFAULT on error.
* On error, the variable @x is set to zero.
*/
/*
@@ -227,7 +227,7 @@ extern void __put_user_4(void);
extern void __put_user_8(void);
/**
- * put_user: - Write a simple value into user space.
+ * put_user - Write a simple value into user space.
* @x: Value to copy to user space.
* @ptr: Destination address, in user space.
*
@@ -241,7 +241,7 @@ extern void __put_user_8(void);
* @ptr must have pointer-to-simple-variable type, and @x must be assignable
* to the result of dereferencing @ptr.
*
- * Returns zero on success, or -EFAULT on error.
+ * Return: zero on success, or -EFAULT on error.
*/
#define put_user(x, ptr) \
({ \
@@ -501,7 +501,7 @@ struct __large_struct { unsigned long buf[100]; };
} while (0)
/**
- * __get_user: - Get a simple variable from user space, with less checking.
+ * __get_user - Get a simple variable from user space, with less checking.
* @x: Variable to store result.
* @ptr: Source address, in user space.
*
@@ -518,7 +518,7 @@ struct __large_struct { unsigned long buf[100]; };
* Caller must check the pointer with access_ok() before calling this
* function.
*
- * Returns zero on success, or -EFAULT on error.
+ * Return: zero on success, or -EFAULT on error.
* On error, the variable @x is set to zero.
*/
@@ -526,7 +526,7 @@ struct __large_struct { unsigned long buf[100]; };
__get_user_nocheck((x), (ptr), sizeof(*(ptr)))
/**
- * __put_user: - Write a simple value into user space, with less checking.
+ * __put_user - Write a simple value into user space, with less checking.
* @x: Value to copy to user space.
* @ptr: Destination address, in user space.
*
@@ -543,7 +543,7 @@ struct __large_struct { unsigned long buf[100]; };
* Caller must check the pointer with access_ok() before calling this
* function.
*
- * Returns zero on success, or -EFAULT on error.
+ * Return: zero on success, or -EFAULT on error.
*/
#define __put_user(x, ptr) \
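For reference, a minimal and purely illustrative caller of the two accessors documented above; uptr is assumed to be a __user pointer the caller is entitled to touch:

#include <linux/uaccess.h>

static int example_bump_counter(int __user *uptr)
{
	int val;

	if (get_user(val, uptr))
		return -EFAULT;		/* read from user space failed */
	return put_user(val + 1, uptr);	/* 0 on success, -EFAULT on error */
}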
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index a555da094157..1e225528f0d7 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -27,6 +27,7 @@
#include <linux/crash_dump.h>
#include <linux/reboot.h>
#include <linux/memory.h>
+#include <linux/numa.h>
#include <asm/uv/uv_mmrs.h>
#include <asm/uv/uv_hub.h>
@@ -1390,7 +1391,7 @@ static void __init build_socket_tables(void)
}
/* Set socket -> node values: */
- lnid = -1;
+ lnid = NUMA_NO_NODE;
for_each_present_cpu(cpu) {
int nid = cpu_to_node(cpu);
int apicid, sockid;
@@ -1521,7 +1522,7 @@ static void __init uv_system_init_hub(void)
new_hub->pnode = 0xffff;
new_hub->numa_blade_id = uv_node_to_blade_id(nodeid);
- new_hub->memory_nid = -1;
+ new_hub->memory_nid = NUMA_NO_NODE;
new_hub->nr_possible_cpus = 0;
new_hub->nr_online_cpus = 0;
}
@@ -1538,7 +1539,7 @@ static void __init uv_system_init_hub(void)
uv_cpu_info_per(cpu)->p_uv_hub_info = uv_hub_info_list(nodeid);
uv_cpu_info_per(cpu)->blade_cpu_id = uv_cpu_hub_info(cpu)->nr_possible_cpus++;
- if (uv_cpu_hub_info(cpu)->memory_nid == -1)
+ if (uv_cpu_hub_info(cpu)->memory_nid == NUMA_NO_NODE)
uv_cpu_hub_info(cpu)->memory_nid = cpu_to_node(cpu);
/* Init memoryless node: */
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index e8796fcd7e5a..13af08827eef 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -171,7 +171,7 @@ void __init setup_per_cpu_areas(void)
unsigned long delta;
int rc;
- pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%u nr_node_ids:%d\n",
+ pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%u nr_node_ids:%u\n",
NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
/*
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ccd1f2a8e557..c91ff9f9fe8a 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -56,6 +56,7 @@
#include <linux/stackprotector.h>
#include <linux/gfp.h>
#include <linux/cpuidle.h>
+#include <linux/numa.h>
#include <asm/acpi.h>
#include <asm/desc.h>
@@ -841,7 +842,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
/* reduce the number of lines printed when booting a large cpu count system */
static void announce_cpu(int cpu, int apicid)
{
- static int current_node = -1;
+ static int current_node = NUMA_NO_NODE;
int node = early_cpu_to_node(cpu);
static int width, node_width;
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index bfd94e7812fc..7d290777246d 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -54,13 +54,13 @@ do { \
} while (0)
/**
- * clear_user: - Zero a block of memory in user space.
+ * clear_user - Zero a block of memory in user space.
* @to: Destination address, in user space.
* @n: Number of bytes to zero.
*
* Zero a block of memory in user space.
*
- * Returns number of bytes that could not be cleared.
+ * Return: number of bytes that could not be cleared.
* On success, this will be zero.
*/
unsigned long
@@ -74,14 +74,14 @@ clear_user(void __user *to, unsigned long n)
EXPORT_SYMBOL(clear_user);
/**
- * __clear_user: - Zero a block of memory in user space, with less checking.
+ * __clear_user - Zero a block of memory in user space, with less checking.
* @to: Destination address, in user space.
* @n: Number of bytes to zero.
*
* Zero a block of memory in user space. Caller must check
* the specified block with access_ok() before calling this function.
*
- * Returns number of bytes that could not be cleared.
+ * Return: number of bytes that could not be cleared.
* On success, this will be zero.
*/
unsigned long
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 9d5c75f02295..667f1da36208 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1031,7 +1031,7 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
static void
do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
- unsigned int fault)
+ vm_fault_t fault)
{
struct task_struct *tsk = current;
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 1308f5408bf7..12c1b7a83ed7 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -123,7 +123,7 @@ void __init setup_node_to_cpumask_map(void)
alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
/* cpumask_of_node() will now work */
- pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
+ pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
}
static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
@@ -866,7 +866,7 @@ const struct cpumask *cpumask_of_node(int node)
{
if (node >= nr_node_ids) {
printk(KERN_WARNING
- "cpumask_of_node(%d): node > nr_node_ids(%d)\n",
+ "cpumask_of_node(%d): node > nr_node_ids(%u)\n",
node, nr_node_ids);
dump_stack();
return cpu_none_mask;
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index a7e47cf7ec6c..6e4c6bd62203 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -17,8 +17,8 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
-pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
-void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
+pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep);
+void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
pte_t *ptep, pte_t pte);
unsigned long xen_read_cr2_direct(void);
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 0f4fe206dcc2..856a85814f00 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -306,20 +306,20 @@ static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
__xen_set_pte(ptep, pteval);
}
-pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
+pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
/* Just return the pte as-is. We preserve the bits on commit */
- trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep);
+ trace_xen_mmu_ptep_modify_prot_start(vma->vm_mm, addr, ptep, *ptep);
return *ptep;
}
-void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
+void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
pte_t *ptep, pte_t pte)
{
struct mmu_update u;
- trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte);
+ trace_xen_mmu_ptep_modify_prot_commit(vma->vm_mm, addr, ptep, pte);
xen_mc_batch();
u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
diff --git a/crypto/Makefile b/crypto/Makefile
index 799ed5e94606..fb5bf2a3a666 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -128,7 +128,7 @@ obj-$(CONFIG_CRYPTO_CRC32C) += crc32c_generic.o
obj-$(CONFIG_CRYPTO_CRC32) += crc32_generic.o
obj-$(CONFIG_CRYPTO_CRCT10DIF) += crct10dif_common.o crct10dif_generic.o
obj-$(CONFIG_CRYPTO_AUTHENC) += authenc.o authencesn.o
-obj-$(CONFIG_CRYPTO_LZO) += lzo.o
+obj-$(CONFIG_CRYPTO_LZO) += lzo.o lzo-rle.o
obj-$(CONFIG_CRYPTO_LZ4) += lz4.o
obj-$(CONFIG_CRYPTO_LZ4HC) += lz4hc.o
obj-$(CONFIG_CRYPTO_842) += 842.o
diff --git a/crypto/lzo-rle.c b/crypto/lzo-rle.c
new file mode 100644
index 000000000000..ea9c75b1db49
--- /dev/null
+++ b/crypto/lzo-rle.c
@@ -0,0 +1,175 @@
+/*
+ * Cryptographic API.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/crypto.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/lzo.h>
+#include <crypto/internal/scompress.h>
+
+struct lzorle_ctx {
+ void *lzorle_comp_mem;
+};
+
+static void *lzorle_alloc_ctx(struct crypto_scomp *tfm)
+{
+ void *ctx;
+
+ ctx = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
+ if (!ctx)
+ return ERR_PTR(-ENOMEM);
+
+ return ctx;
+}
+
+static int lzorle_init(struct crypto_tfm *tfm)
+{
+ struct lzorle_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ ctx->lzorle_comp_mem = lzorle_alloc_ctx(NULL);
+ if (IS_ERR(ctx->lzorle_comp_mem))
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void lzorle_free_ctx(struct crypto_scomp *tfm, void *ctx)
+{
+ kvfree(ctx);
+}
+
+static void lzorle_exit(struct crypto_tfm *tfm)
+{
+ struct lzorle_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ lzorle_free_ctx(NULL, ctx->lzorle_comp_mem);
+}
+
+static int __lzorle_compress(const u8 *src, unsigned int slen,
+ u8 *dst, unsigned int *dlen, void *ctx)
+{
+ size_t tmp_len = *dlen; /* size_t(ulong) <-> uint on 64 bit */
+ int err;
+
+ err = lzorle1x_1_compress(src, slen, dst, &tmp_len, ctx);
+
+ if (err != LZO_E_OK)
+ return -EINVAL;
+
+ *dlen = tmp_len;
+ return 0;
+}
+
+static int lzorle_compress(struct crypto_tfm *tfm, const u8 *src,
+ unsigned int slen, u8 *dst, unsigned int *dlen)
+{
+ struct lzorle_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ return __lzorle_compress(src, slen, dst, dlen, ctx->lzorle_comp_mem);
+}
+
+static int lzorle_scompress(struct crypto_scomp *tfm, const u8 *src,
+ unsigned int slen, u8 *dst, unsigned int *dlen,
+ void *ctx)
+{
+ return __lzorle_compress(src, slen, dst, dlen, ctx);
+}
+
+static int __lzorle_decompress(const u8 *src, unsigned int slen,
+ u8 *dst, unsigned int *dlen)
+{
+ int err;
+ size_t tmp_len = *dlen; /* size_t(ulong) <-> uint on 64 bit */
+
+ err = lzo1x_decompress_safe(src, slen, dst, &tmp_len);
+
+ if (err != LZO_E_OK)
+ return -EINVAL;
+
+ *dlen = tmp_len;
+ return 0;
+}
+
+static int lzorle_decompress(struct crypto_tfm *tfm, const u8 *src,
+ unsigned int slen, u8 *dst, unsigned int *dlen)
+{
+ return __lzorle_decompress(src, slen, dst, dlen);
+}
+
+static int lzorle_sdecompress(struct crypto_scomp *tfm, const u8 *src,
+ unsigned int slen, u8 *dst, unsigned int *dlen,
+ void *ctx)
+{
+ return __lzorle_decompress(src, slen, dst, dlen);
+}
+
+static struct crypto_alg alg = {
+ .cra_name = "lzo-rle",
+ .cra_flags = CRYPTO_ALG_TYPE_COMPRESS,
+ .cra_ctxsize = sizeof(struct lzorle_ctx),
+ .cra_module = THIS_MODULE,
+ .cra_init = lzorle_init,
+ .cra_exit = lzorle_exit,
+ .cra_u = { .compress = {
+ .coa_compress = lzorle_compress,
+ .coa_decompress = lzorle_decompress } }
+};
+
+static struct scomp_alg scomp = {
+ .alloc_ctx = lzorle_alloc_ctx,
+ .free_ctx = lzorle_free_ctx,
+ .compress = lzorle_scompress,
+ .decompress = lzorle_sdecompress,
+ .base = {
+ .cra_name = "lzo-rle",
+ .cra_driver_name = "lzo-rle-scomp",
+ .cra_module = THIS_MODULE,
+ }
+};
+
+static int __init lzorle_mod_init(void)
+{
+ int ret;
+
+ ret = crypto_register_alg(&alg);
+ if (ret)
+ return ret;
+
+ ret = crypto_register_scomp(&scomp);
+ if (ret) {
+ crypto_unregister_alg(&alg);
+ return ret;
+ }
+
+ return ret;
+}
+
+static void __exit lzorle_mod_fini(void)
+{
+ crypto_unregister_alg(&alg);
+ crypto_unregister_scomp(&scomp);
+}
+
+module_init(lzorle_mod_init);
+module_exit(lzorle_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("LZO-RLE Compression Algorithm");
+MODULE_ALIAS_CRYPTO("lzo-rle");
diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index e7fb87e114a5..1ea2d5007ff5 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -76,8 +76,8 @@ static char *check[] = {
"cast6", "arc4", "michael_mic", "deflate", "crc32c", "tea", "xtea",
"khazad", "wp512", "wp384", "wp256", "tnepres", "xeta", "fcrypt",
"camellia", "seed", "salsa20", "rmd128", "rmd160", "rmd256", "rmd320",
- "lzo", "cts", "sha3-224", "sha3-256", "sha3-384", "sha3-512",
- "streebog256", "streebog512",
+ "lzo", "lzo-rle", "cts", "sha3-224", "sha3-256", "sha3-384",
+ "sha3-512", "streebog256", "streebog512",
NULL
};
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 048cbf7d5233..1ba9d1a6ba5e 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -521,7 +521,44 @@ out:
}
static DEVICE_ATTR_WO(probe);
-#endif
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+static ssize_t remove_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct memory_block *mem = to_memory_block(dev);
+ unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
+ bool remove;
+ int ret;
+
+ ret = kstrtobool(buf, &remove);
+ if (ret)
+ return ret;
+ if (!remove)
+ return count;
+
+ if (!is_memblock_offlined(mem))
+ return -EBUSY;
+
+ ret = lock_device_hotplug_sysfs();
+ if (ret)
+ return ret;
+
+ if (device_remove_file_self(dev, attr)) {
+ __remove_memory(pfn_to_nid(start_pfn), PFN_PHYS(start_pfn),
+ MIN_MEMORY_BLOCK_SIZE * sections_per_block);
+ ret = count;
+ } else {
+ ret = -EBUSY;
+ }
+
+ unlock_device_hotplug();
+ return ret;
+}
+
+static DEVICE_ATTR_WO(remove);
+#endif /* CONFIG_MEMORY_HOTREMOVE */
+#endif /* CONFIG_ARCH_MEMORY_PROBE */
#ifdef CONFIG_MEMORY_FAILURE
/*
@@ -615,6 +652,9 @@ static struct attribute *memory_memblk_attrs[] = {
&dev_attr_removable.attr,
#ifdef CONFIG_MEMORY_HOTREMOVE
&dev_attr_valid_zones.attr,
+#ifdef CONFIG_ARCH_MEMORY_PROBE
+ &dev_attr_remove.attr,
+#endif
#endif
NULL
};
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 9a6f40cd8df6..83302ecdc8db 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -40,6 +40,7 @@
#include <linux/export.h>
#include <linux/debugfs.h>
#include <linux/prefetch.h>
+#include <linux/numa.h>
#include "mtip32xx.h"
#define HW_CMD_SLOT_SZ (MTIP_MAX_COMMAND_SLOTS * 32)
@@ -4013,9 +4014,9 @@ static int get_least_used_cpu_on_node(int node)
/* Helper for selecting a node in round robin mode */
static inline int mtip_get_next_rr_node(void)
{
- static int next_node = -1;
+ static int next_node = NUMA_NO_NODE;
- if (next_node == -1) {
+ if (next_node == NUMA_NO_NODE) {
next_node = first_online_node;
return next_node;
}
diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
index 4ed0a78fdc09..4d9a38890965 100644
--- a/drivers/block/zram/zcomp.c
+++ b/drivers/block/zram/zcomp.c
@@ -20,6 +20,7 @@
static const char * const backends[] = {
"lzo",
+ "lzo-rle",
#if IS_ENABLED(CONFIG_CRYPTO_LZ4)
"lz4",
#endif
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 04ca65912638..e7a5f1d1c314 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -41,7 +41,7 @@ static DEFINE_IDR(zram_index_idr);
static DEFINE_MUTEX(zram_index_mutex);
static int zram_major;
-static const char *default_compressor = "lzo";
+static const char *default_compressor = "lzo-rle";
/* Module params (documentation at end) */
static unsigned int num_devices = 1;
diff --git a/drivers/char/agp/efficeon-agp.c b/drivers/char/agp/efficeon-agp.c
index 7f88490b5479..c53f0f9ef5b0 100644
--- a/drivers/char/agp/efficeon-agp.c
+++ b/drivers/char/agp/efficeon-agp.c
@@ -163,7 +163,6 @@ static int efficeon_free_gatt_table(struct agp_bridge_data *bridge)
unsigned long page = efficeon_private.l1_table[index];
if (page) {
efficeon_private.l1_table[index] = 0;
- ClearPageReserved(virt_to_page((char *)page));
free_page(page);
freed++;
}
@@ -219,7 +218,6 @@ static int efficeon_create_gatt_table(struct agp_bridge_data *bridge)
efficeon_free_gatt_table(agp_bridge);
return -ENOMEM;
}
- SetPageReserved(virt_to_page((char *)page));
for (offset = 0; offset < PAGE_SIZE; offset += clflush_chunk)
clflush((char *)page+offset);
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index f1a441ab395d..3a11b1092e80 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -63,6 +63,7 @@
#include <linux/acpi_dma.h>
#include <linux/of_dma.h>
#include <linux/mempool.h>
+#include <linux/numa.h>
static DEFINE_MUTEX(dma_list_mutex);
static DEFINE_IDA(dma_ida);
@@ -386,7 +387,8 @@ EXPORT_SYMBOL(dma_issue_pending_all);
static bool dma_chan_is_local(struct dma_chan *chan, int cpu)
{
int node = dev_to_node(chan->device->dev);
- return node == -1 || cpumask_test_cpu(cpu, cpumask_of_node(node));
+ return node == NUMA_NO_NODE ||
+ cpumask_test_cpu(cpu, cpumask_of_node(node));
}
/**
diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h
index 9726df37c4c4..540e20eb032c 100644
--- a/drivers/gpu/drm/i915/i915_utils.h
+++ b/drivers/gpu/drm/i915/i915_utils.h
@@ -123,12 +123,6 @@ static inline u64 ptr_to_u64(const void *ptr)
#include <linux/list.h>
-static inline int list_is_first(const struct list_head *list,
- const struct list_head *head)
-{
- return head->next == list;
-}
-
static inline void __list_del_many(struct list_head *head,
struct list_head *first)
{
diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index 7c6349a50ef1..dd475f3bcc8a 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -681,8 +681,13 @@ static struct notifier_block hv_memory_nb = {
/* Check if the particular page is backed and can be onlined and online it. */
static void hv_page_online_one(struct hv_hotadd_state *has, struct page *pg)
{
- if (!has_pfn_is_backed(has, page_to_pfn(pg)))
+ if (!has_pfn_is_backed(has, page_to_pfn(pg))) {
+ if (!PageOffline(pg))
+ __SetPageOffline(pg);
return;
+ }
+ if (PageOffline(pg))
+ __ClearPageOffline(pg);
/* This frame is currently backed; online the page. */
__online_page_set_limits(pg);
@@ -771,7 +776,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size,
}
}
-static void hv_online_page(struct page *pg)
+static void hv_online_page(struct page *pg, unsigned int order)
{
struct hv_hotadd_state *has;
unsigned long flags;
@@ -780,10 +785,11 @@ static void hv_online_page(struct page *pg)
spin_lock_irqsave(&dm_device.ha_lock, flags);
list_for_each_entry(has, &dm_device.ha_region_list, list) {
/* The page belongs to a different HAS. */
- if ((pfn < has->start_pfn) || (pfn >= has->end_pfn))
+ if ((pfn < has->start_pfn) ||
+ (pfn + (1UL << order) > has->end_pfn))
continue;
- hv_page_online_one(has, pg);
+ hv_bring_pgs_online(has, pfn, 1UL << order);
break;
}
spin_unlock_irqrestore(&dm_device.ha_lock, flags);
@@ -1201,6 +1207,7 @@ static void free_balloon_pages(struct hv_dynmem_device *dm,
for (i = 0; i < num_pages; i++) {
pg = pfn_to_page(i + start_frame);
+ __ClearPageOffline(pg);
__free_page(pg);
dm->num_pages_ballooned--;
}
@@ -1213,7 +1220,7 @@ static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm,
struct dm_balloon_response *bl_resp,
int alloc_unit)
{
- unsigned int i = 0;
+ unsigned int i, j;
struct page *pg;
if (num_pages < alloc_unit)
@@ -1245,6 +1252,10 @@ static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm,
if (alloc_unit != 1)
split_page(pg, get_order(alloc_unit << PAGE_SHIFT));
+ /* mark all pages offline */
+ for (j = 0; j < (1 << get_order(alloc_unit << PAGE_SHIFT)); j++)
+ __SetPageOffline(pg + j);
+
bl_resp->range_count++;
bl_resp->range_array[i].finfo.start_page =
page_to_pfn(pg);
diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c
index 2baf38cc1e23..4fe662c3bbc1 100644
--- a/drivers/infiniband/hw/hfi1/affinity.c
+++ b/drivers/infiniband/hw/hfi1/affinity.c
@@ -48,6 +48,7 @@
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/interrupt.h>
+#include <linux/numa.h>
#include "hfi.h"
#include "affinity.h"
@@ -777,7 +778,7 @@ void hfi1_dev_affinity_clean_up(struct hfi1_devdata *dd)
_dev_comp_vect_cpu_mask_clean_up(dd, entry);
unlock:
mutex_unlock(&node_affinity.lock);
- dd->node = -1;
+ dd->node = NUMA_NO_NODE;
}
/*
diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c
index 7841a0ad7cb6..8d294228c0f4 100644
--- a/drivers/infiniband/hw/hfi1/init.c
+++ b/drivers/infiniband/hw/hfi1/init.c
@@ -54,6 +54,7 @@
#include <linux/printk.h>
#include <linux/hrtimer.h>
#include <linux/bitmap.h>
+#include <linux/numa.h>
#include <rdma/rdma_vt.h>
#include "hfi.h"
@@ -1312,7 +1313,7 @@ static struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev,
dd->unit = ret;
list_add(&dd->list, &hfi1_dev_list);
}
- dd->node = -1;
+ dd->node = NUMA_NO_NODE;
spin_unlock_irqrestore(&hfi1_devs_lock, flags);
idr_preload_end();
diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
index dc9f14811e0f..6b7df25e1488 100644
--- a/drivers/iommu/dmar.c
+++ b/drivers/iommu/dmar.c
@@ -39,6 +39,7 @@
#include <linux/dmi.h>
#include <linux/slab.h>
#include <linux/iommu.h>
+#include <linux/numa.h>
#include <asm/irq_remapping.h>
#include <asm/iommu_table.h>
@@ -477,7 +478,7 @@ static int dmar_parse_one_rhsa(struct acpi_dmar_header *header, void *arg)
int node = acpi_map_pxm_to_node(rhsa->proximity_domain);
if (!node_online(node))
- node = -1;
+ node = NUMA_NO_NODE;
drhd->iommu->node = node;
return 0;
}
@@ -1062,7 +1063,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd)
iommu->msagaw = msagaw;
iommu->segment = drhd->segment;
- iommu->node = -1;
+ iommu->node = NUMA_NO_NODE;
ver = readl(iommu->reg + DMAR_VER_REG);
pr_info("%s: reg_base_addr %llx ver %d:%d cap %llx ecap %llx\n",
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index af23cfc2a05e..b776e0b4c29e 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -48,6 +48,7 @@
#include <linux/dma-contiguous.h>
#include <linux/dma-direct.h>
#include <linux/crash_dump.h>
+#include <linux/numa.h>
#include <asm/irq_remapping.h>
#include <asm/cacheflush.h>
#include <asm/iommu.h>
@@ -1716,7 +1717,7 @@ static struct dmar_domain *alloc_domain(int flags)
return NULL;
memset(domain, 0, sizeof(*domain));
- domain->nid = -1;
+ domain->nid = NUMA_NO_NODE;
domain->flags = flags;
domain->has_iotlb_device = false;
INIT_LIST_HEAD(&domain->devices);
diff --git a/drivers/misc/sgi-xp/xpc_uv.c b/drivers/misc/sgi-xp/xpc_uv.c
index 0441abe87880..9e443df44b3b 100644
--- a/drivers/misc/sgi-xp/xpc_uv.c
+++ b/drivers/misc/sgi-xp/xpc_uv.c
@@ -22,6 +22,7 @@
#include <linux/module.h>
#include <linux/err.h>
#include <linux/slab.h>
+#include <linux/numa.h>
#include <asm/uv/uv_hub.h>
#if defined CONFIG_X86_64
#include <asm/uv/bios.h>
@@ -61,7 +62,7 @@ static struct xpc_heartbeat_uv *xpc_heartbeat_uv;
XPC_NOTIFY_MSG_SIZE_UV)
#define XPC_NOTIFY_IRQ_NAME "xpc_notify"
-static int xpc_mq_node = -1;
+static int xpc_mq_node = NUMA_NO_NODE;
static struct xpc_gru_mq_uv *xpc_activate_mq_uv;
static struct xpc_gru_mq_uv *xpc_notify_mq_uv;
diff --git a/drivers/misc/sram-exec.c b/drivers/misc/sram-exec.c
index 426ad912b441..d054e2842a5f 100644
--- a/drivers/misc/sram-exec.c
+++ b/drivers/misc/sram-exec.c
@@ -96,7 +96,7 @@ void *sram_exec_copy(struct gen_pool *pool, void *dst, void *src,
if (!part)
return NULL;
- if (!addr_in_gen_pool(pool, (unsigned long)dst, size))
+ if (!gen_pool_has_addr(pool, (unsigned long)dst, size))
return NULL;
base = (unsigned long)part->base;
diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c
index 7842999a0d24..ad807d5a3141 100644
--- a/drivers/misc/vmw_balloon.c
+++ b/drivers/misc/vmw_balloon.c
@@ -567,6 +567,36 @@ vmballoon_page_in_frames(enum vmballoon_page_size_type page_size)
}
/**
+ * vmballoon_mark_page_offline() - mark a page as offline
+ * @page: pointer for the page.
+ * @page_size: the size of the page.
+ */
+static void
+vmballoon_mark_page_offline(struct page *page,
+ enum vmballoon_page_size_type page_size)
+{
+ int i;
+
+ for (i = 0; i < vmballoon_page_in_frames(page_size); i++)
+ __SetPageOffline(page + i);
+}
+
+/**
+ * vmballoon_mark_page_online() - mark a page as online
+ * @page: pointer for the page.
+ * @page_size: the size of the page.
+ */
+static void
+vmballoon_mark_page_online(struct page *page,
+ enum vmballoon_page_size_type page_size)
+{
+ int i;
+
+ for (i = 0; i < vmballoon_page_in_frames(page_size); i++)
+ __ClearPageOffline(page + i);
+}
+
+/**
* vmballoon_send_get_target() - Retrieve desired balloon size from the host.
*
* @b: pointer to the balloon.
@@ -623,6 +653,7 @@ static int vmballoon_alloc_page_list(struct vmballoon *b,
ctl->page_size);
if (page) {
+ vmballoon_mark_page_offline(page, ctl->page_size);
/* Success. Add the page to the list and continue. */
list_add(&page->lru, &ctl->pages);
continue;
@@ -861,6 +892,7 @@ static void vmballoon_release_page_list(struct list_head *page_list,
list_for_each_entry_safe(page, tmp, page_list, lru) {
list_del(&page->lru);
+ vmballoon_mark_page_online(page, page_size);
__free_pages(page, vmballoon_page_order(page_size));
}
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 38c430b94ae3..cf2bd7c982a2 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -27,6 +27,7 @@
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <linux/atomic.h>
+#include <linux/numa.h>
#include <scsi/fc/fc_fcoe.h>
#include <net/udp_tunnel.h>
#include <net/pkt_cls.h>
@@ -6415,7 +6416,7 @@ int ixgbe_setup_tx_resources(struct ixgbe_ring *tx_ring)
{
struct device *dev = tx_ring->dev;
int orig_node = dev_to_node(dev);
- int ring_node = -1;
+ int ring_node = NUMA_NO_NODE;
int size;
size = sizeof(struct ixgbe_tx_buffer) * tx_ring->count;
@@ -6509,7 +6510,7 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter,
{
struct device *dev = rx_ring->dev;
int orig_node = dev_to_node(dev);
- int ring_node = -1;
+ int ring_node = NUMA_NO_NODE;
int size;
size = sizeof(struct ixgbe_rx_buffer) * rx_ring->count;
diff --git a/drivers/rapidio/rio_cm.c b/drivers/rapidio/rio_cm.c
index bad0e0ea4f30..cf45829585cb 100644
--- a/drivers/rapidio/rio_cm.c
+++ b/drivers/rapidio/rio_cm.c
@@ -1215,7 +1215,9 @@ static int riocm_ch_listen(u16 ch_id)
riocm_debug(CHOP, "(ch_%d)", ch_id);
ch = riocm_get_channel(ch_id);
- if (!ch || !riocm_cmp_exch(ch, RIO_CM_CHAN_BOUND, RIO_CM_LISTEN))
+ if (!ch)
+ return -EINVAL;
+ if (!riocm_cmp_exch(ch, RIO_CM_CHAN_BOUND, RIO_CM_LISTEN))
ret = -EINVAL;
riocm_put_channel(ch);
return ret;
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index ceb5048de9a7..39b229f9e256 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -369,14 +369,20 @@ static enum bp_state reserve_additional_memory(void)
return BP_ECANCELED;
}
-static void xen_online_page(struct page *page)
+static void xen_online_page(struct page *page, unsigned int order)
{
- __online_page_set_limits(page);
+ unsigned long i, size = (1 << order);
+ unsigned long start_pfn = page_to_pfn(page);
+ struct page *p;
+ pr_debug("Online %lu pages starting at pfn 0x%lx\n", size, start_pfn);
mutex_lock(&balloon_mutex);
-
- __balloon_append(page);
-
+ for (i = 0; i < size; i++) {
+ p = pfn_to_page(start_pfn + i);
+ __online_page_set_limits(p);
+ __SetPageOffline(p);
+ __balloon_append(p);
+ }
mutex_unlock(&balloon_mutex);
}
@@ -441,6 +447,7 @@ static enum bp_state increase_reservation(unsigned long nr_pages)
xenmem_reservation_va_mapping_update(1, &page, &frame_list[i]);
/* Relinquish the page back to the allocator. */
+ __ClearPageOffline(page);
free_reserved_page(page);
}
@@ -467,6 +474,7 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp)
state = BP_EAGAIN;
break;
}
+ __SetPageOffline(page);
adjust_managed_page_count(page, -1);
xenmem_reservation_scrub_page(page);
list_add(&page->lru, &pages);
diff --git a/fs/Kconfig b/fs/Kconfig
index 3e6d3101f3ff..55c87a726a23 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -214,7 +214,7 @@ config HUGETLB_PAGE
config MEMFD_CREATE
def_bool TMPFS || HUGETLBFS
-config ARCH_HAS_GIGANTIC_PAGE
+config ARCH_HAS_GIGANTIC_PAGE_RUNTIME_ALLOCATION
bool
source "fs/configfs/Kconfig"
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 3e59f0ed777b..70c132acdab1 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -105,6 +105,7 @@ struct autofs_wait_queue {
#define AUTOFS_SBI_CATATONIC 0x0001
#define AUTOFS_SBI_STRICTEXPIRE 0x0002
+#define AUTOFS_SBI_IGNORE 0x0004
struct autofs_sb_info {
u32 magic;
@@ -215,6 +216,8 @@ static inline int autofs_prepare_pipe(struct file *pipe)
return -EINVAL;
/* We want a packet pipe */
pipe->f_flags |= O_DIRECT;
+ /* We don't expect -EAGAIN */
+ pipe->f_flags &= ~O_NONBLOCK;
return 0;
}
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index 078992eee299..80597b88718b 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -82,18 +82,20 @@ static int autofs_show_options(struct seq_file *m, struct dentry *root)
seq_printf(m, ",maxproto=%d", sbi->max_proto);
if (autofs_type_offset(sbi->type))
- seq_printf(m, ",offset");
+ seq_puts(m, ",offset");
else if (autofs_type_direct(sbi->type))
- seq_printf(m, ",direct");
+ seq_puts(m, ",direct");
else
- seq_printf(m, ",indirect");
+ seq_puts(m, ",indirect");
if (sbi->flags & AUTOFS_SBI_STRICTEXPIRE)
- seq_printf(m, ",strictexpire");
+ seq_puts(m, ",strictexpire");
+ if (sbi->flags & AUTOFS_SBI_IGNORE)
+ seq_puts(m, ",ignore");
#ifdef CONFIG_CHECKPOINT_RESTORE
if (sbi->pipe)
seq_printf(m, ",pipe_ino=%ld", file_inode(sbi->pipe)->i_ino);
else
- seq_printf(m, ",pipe_ino=-1");
+ seq_puts(m, ",pipe_ino=-1");
#endif
return 0;
}
@@ -111,7 +113,8 @@ static const struct super_operations autofs_sops = {
};
enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto,
- Opt_indirect, Opt_direct, Opt_offset, Opt_strictexpire};
+ Opt_indirect, Opt_direct, Opt_offset, Opt_strictexpire,
+ Opt_ignore};
static const match_table_t tokens = {
{Opt_fd, "fd=%u"},
@@ -124,6 +127,7 @@ static const match_table_t tokens = {
{Opt_direct, "direct"},
{Opt_offset, "offset"},
{Opt_strictexpire, "strictexpire"},
+ {Opt_ignore, "ignore"},
{Opt_err, NULL}
};
@@ -206,6 +210,9 @@ static int parse_options(char *options,
case Opt_strictexpire:
sbi->flags |= AUTOFS_SBI_STRICTEXPIRE;
break;
+ case Opt_ignore:
+ sbi->flags |= AUTOFS_SBI_IGNORE;
+ break;
default:
return 1;
}
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 54207327f98f..7d09d125f148 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -57,8 +57,6 @@
#endif
static int load_elf_binary(struct linux_binprm *bprm);
-static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
- int, int, unsigned long);
#ifdef CONFIG_USELIB
static int load_elf_library(struct file *);
@@ -347,7 +345,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
#ifndef elf_map
static unsigned long elf_map(struct file *filep, unsigned long addr,
- struct elf_phdr *eppnt, int prot, int type,
+ const struct elf_phdr *eppnt, int prot, int type,
unsigned long total_size)
{
unsigned long map_addr;
@@ -387,7 +385,7 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
#endif /* !elf_map */
-static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
+static unsigned long total_mapping_size(const struct elf_phdr *cmds, int nr)
{
int i, first_idx = -1, last_idx = -1;
@@ -414,12 +412,13 @@ static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
* header pointed to by elf_ex, into a newly allocated array. The caller is
* responsible for freeing the allocated data. Returns an ERR_PTR upon failure.
*/
-static struct elf_phdr *load_elf_phdrs(struct elfhdr *elf_ex,
+static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex,
struct file *elf_file)
{
struct elf_phdr *elf_phdata = NULL;
- int retval, size, err = -1;
+ int retval, err = -1;
loff_t pos = elf_ex->e_phoff;
+ unsigned int size;
/*
* If the size of this structure has changed, then punt, since
@@ -429,13 +428,9 @@ static struct elf_phdr *load_elf_phdrs(struct elfhdr *elf_ex,
goto out;
/* Sanity check the number of program headers... */
- if (elf_ex->e_phnum < 1 ||
- elf_ex->e_phnum > 65536U / sizeof(struct elf_phdr))
- goto out;
-
/* ...and their total size. */
size = sizeof(struct elf_phdr) * elf_ex->e_phnum;
- if (size > ELF_MIN_ALIGN)
+ if (size == 0 || size > 65536 || size > ELF_MIN_ALIGN)
goto out;
elf_phdata = kmalloc(size, GFP_KERNEL);
@@ -2033,7 +2028,6 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_note_info *info,
const kernel_siginfo_t *siginfo, struct pt_regs *regs)
{
- struct list_head *t;
struct core_thread *ct;
struct elf_thread_status *ets;
@@ -2050,10 +2044,9 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
list_add(&ets->list, &info->thread_list);
}
- list_for_each(t, &info->thread_list) {
+ list_for_each_entry(ets, &info->thread_list, list) {
int sz;
- ets = list_entry(t, struct elf_thread_status, list);
sz = elf_dump_thread_status(siginfo->si_signo, ets);
info->thread_status_size += sz;
}
@@ -2117,20 +2110,17 @@ static size_t get_note_info_size(struct elf_note_info *info)
static int write_note_info(struct elf_note_info *info,
struct coredump_params *cprm)
{
+ struct elf_thread_status *ets;
int i;
- struct list_head *t;
for (i = 0; i < info->numnote; i++)
if (!writenote(info->notes + i, cprm))
return 0;
/* write out the thread status notes section */
- list_for_each(t, &info->thread_list) {
- struct elf_thread_status *tmp =
- list_entry(t, struct elf_thread_status, list);
-
- for (i = 0; i < tmp->num_notes; i++)
- if (!writenote(&tmp->notes[i], cprm))
+ list_for_each_entry(ets, &info->thread_list, list) {
+ for (i = 0; i < ets->num_notes; i++)
+ if (!writenote(&ets->notes[i], cprm))
return 0;
}
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 7cde3f46ad26..fedcbf3e1f1c 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -20,6 +20,7 @@ static int load_script(struct linux_binprm *bprm)
char *cp;
struct file *file;
int retval;
+ bool truncated = false, end_of_interp = false;
if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!'))
return -ENOEXEC;
@@ -42,28 +43,56 @@ static int load_script(struct linux_binprm *bprm)
fput(bprm->file);
bprm->file = NULL;
- bprm->buf[BINPRM_BUF_SIZE - 1] = '\0';
- if ((cp = strchr(bprm->buf, '\n')) == NULL)
- cp = bprm->buf+BINPRM_BUF_SIZE-1;
+ /*
+ * Truncating interpreter arguments is okay: the interpreter
+ * can re-read the script to parse them on its own. Truncating
+ * the interpreter path itself, though, is bad. Note truncation
+ * here, and check for either newline or start of arguments
+ * below.
+ */
+ for (cp = bprm->buf+2;; cp++) {
+ if (!*cp || (*cp == '\n')) {
+ end_of_interp = true;
+ break;
+ }
+ if (cp == bprm->buf + BINPRM_BUF_SIZE - 1) {
+ truncated = true;
+ break;
+ }
+ }
*cp = '\0';
+
+ /* Truncate trailing whitespace */
while (cp > bprm->buf) {
cp--;
- if ((*cp == ' ') || (*cp == '\t'))
+ if ((*cp == ' ') || (*cp == '\t')) {
+ end_of_interp = true;
*cp = '\0';
- else
+ } else
break;
}
+ /* Skip leading whitespace */
for (cp = bprm->buf+2; (*cp == ' ') || (*cp == '\t'); cp++);
if (*cp == '\0')
return -ENOEXEC; /* No interpreter name found */
i_name = cp;
i_arg = NULL;
+ /*
+ * Skip until the end of the string, or until whitespace that
+ * signals the start of the interpreter arguments.
+ */
for ( ; *cp && (*cp != ' ') && (*cp != '\t'); cp++)
/* nothing */ ;
- while ((*cp == ' ') || (*cp == '\t'))
+ /* Truncate and skip any whitespace in front of arguments */
+ while ((*cp == ' ') || (*cp == '\t')) {
+ end_of_interp = true;
*cp++ = '\0';
+ }
if (*cp)
i_arg = cp;
+ /* Fail exec if the name of the interpreter was cut off. */
+ if (truncated && !end_of_interp)
+ return -ENOEXEC;
/*
* OK, we've parsed out the interpreter name and
* (optional) argument.
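A concrete illustration of the new behaviour (assuming the usual BINPRM_BUF_SIZE of 128 bytes): a '#!' line with a short interpreter path followed by overly long arguments still executes, since only the arguments are cut off at the buffer boundary and the interpreter can re-read them from the script itself; a '#!' line whose interpreter path alone runs past the buffer now makes load_script() return -ENOEXEC instead of exec'ing a silently truncated path.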
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 85140913c0f5..129d26226e70 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3449,31 +3449,17 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
#if defined(CONFIG_DYNAMIC_DEBUG)
#define btrfs_debug(fs_info, fmt, args...) \
-do { \
- DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
- if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \
- btrfs_printk(fs_info, KERN_DEBUG fmt, ##args); \
-} while (0)
-#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
-do { \
- DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
- if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \
- btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args); \
-} while (0)
+ _dynamic_func_call_no_desc(fmt, btrfs_printk, \
+ fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
+ _dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \
+ fs_info, KERN_DEBUG fmt, ##args)
#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
-do { \
- DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
- if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, \
- ##args);\
-} while (0)
-#define btrfs_debug_rl(fs_info, fmt, args...) \
-do { \
- DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
- if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \
- btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, \
- ##args); \
-} while (0)
+ _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \
+ fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl(fs_info, fmt, args...) \
+ _dynamic_func_call_no_desc(fmt, btrfs_printk_ratelimited, \
+ fs_info, KERN_DEBUG fmt, ##args)
#elif defined(DEBUG)
#define btrfs_debug(fs_info, fmt, args...) \
btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
diff --git a/fs/buffer.c b/fs/buffer.c
index 89a4e42b9aad..cec17408bb1f 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -955,10 +955,20 @@ grow_dev_page(struct block_device *bdev, sector_t block,
end_block = init_page_buffers(page, bdev,
(sector_t)index << sizebits,
size);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x01;
+#endif
goto done;
}
- if (!try_to_free_buffers(page))
+ if (!try_to_free_buffers(page)) {
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x02;
+#endif
goto failed;
+ }
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x04;
+#endif
}
/*
@@ -978,6 +988,9 @@ grow_dev_page(struct block_device *bdev, sector_t block,
spin_unlock(&inode->i_mapping->private_lock);
done:
ret = (block < end_block) ? 1 : -ENXIO;
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x08;
+#endif
failed:
unlock_page(page);
put_page(page);
@@ -1033,6 +1046,12 @@ __getblk_slow(struct block_device *bdev, sector_t block,
return NULL;
}
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_stamp = jiffies;
+ current->getblk_executed = 0;
+ current->getblk_bh_count = 0;
+ current->getblk_bh_state = 0;
+#endif
for (;;) {
struct buffer_head *bh;
int ret;
@@ -1044,6 +1063,24 @@ __getblk_slow(struct block_device *bdev, sector_t block,
ret = grow_buffers(bdev, block, size, gfp);
if (ret < 0)
return NULL;
+
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ if (!time_after(jiffies, current->getblk_stamp + 3 * HZ))
+ continue;
+ printk(KERN_ERR "%s(%u): getblk(): executed=%x bh_count=%d bh_state=%lx bdev_super_blocksize=%ld size=%u bdev_super_blocksize_bits=%d bdev_inode_blkbits=%d\n",
+ current->comm, current->pid, current->getblk_executed,
+ current->getblk_bh_count, current->getblk_bh_state,
+ IS_ERR_OR_NULL(bdev->bd_super) ? -1L :
+ bdev->bd_super->s_blocksize, size,
+ IS_ERR_OR_NULL(bdev->bd_super) ? -1 :
+ bdev->bd_super->s_blocksize_bits,
+ IS_ERR_OR_NULL(bdev->bd_inode) ? -1 :
+ bdev->bd_inode->i_blkbits);
+ current->getblk_executed = 0;
+ current->getblk_bh_count = 0;
+ current->getblk_bh_state = 0;
+ current->getblk_stamp = jiffies;
+#endif
}
}
@@ -3219,6 +3256,11 @@ EXPORT_SYMBOL(sync_dirty_buffer);
*/
static inline int buffer_busy(struct buffer_head *bh)
{
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x80;
+ current->getblk_bh_count = atomic_read(&bh->b_count);
+ current->getblk_bh_state = bh->b_state;
+#endif
return atomic_read(&bh->b_count) |
(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
}
@@ -3257,11 +3299,18 @@ int try_to_free_buffers(struct page *page)
int ret = 0;
BUG_ON(!PageLocked(page));
- if (PageWriteback(page))
+ if (PageWriteback(page)) {
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x10;
+#endif
return 0;
+ }
if (mapping == NULL) { /* can this still happen? */
ret = drop_buffers(page, &buffers_to_free);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x20;
+#endif
goto out;
}
@@ -3285,6 +3334,9 @@ int try_to_free_buffers(struct page *page)
if (ret)
cancel_dirty_page(page);
spin_unlock(&mapping->private_lock);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x40;
+#endif
out:
if (buffers_to_free) {
struct buffer_head *bh = buffers_to_free;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index a5d219d920e7..4a0e98d87fcc 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -50,10 +50,10 @@
*
* 1) epmutex (mutex)
* 2) ep->mtx (mutex)
- * 3) ep->wq.lock (spinlock)
+ * 3) ep->lock (rwlock)
*
* The acquire order is the one listed above, from 1 to 3.
- * We need a spinlock (ep->wq.lock) because we manipulate objects
+ * We need a rwlock (ep->lock) because we manipulate objects
* from inside the poll callback, that might be triggered from
* a wake_up() that in turn might be called from IRQ context.
* So we can't sleep inside the poll callback and hence we need
@@ -85,7 +85,7 @@
* of epoll file descriptors, we use the current recursion depth as
* the lockdep subkey.
* It is possible to drop the "ep->mtx" and to use the global
- * mutex "epmutex" (together with "ep->wq.lock") to have it working,
+ * mutex "epmutex" (together with "ep->lock") to have it working,
* but having "ep->mtx" will make the interface more scalable.
* Events that require holding "epmutex" are very rare, while for
* normal operations the epoll private "ep->mtx" will guarantee
@@ -182,8 +182,6 @@ struct epitem {
* This structure is stored inside the "private_data" member of the file
* structure and represents the main data structure for the eventpoll
* interface.
- *
- * Access to it is protected by the lock inside wq.
*/
struct eventpoll {
/*
@@ -203,13 +201,16 @@ struct eventpoll {
/* List of ready file descriptors */
struct list_head rdllist;
+ /* Lock which protects rdllist and ovflist */
+ rwlock_t lock;
+
/* RB tree root used to store monitored fd structs */
struct rb_root_cached rbr;
/*
* This is a single linked list that chains all the "struct epitem" that
* happened while transferring ready events to userspace w/out
- * holding ->wq.lock.
+ * holding ->lock.
*/
struct epitem *ovflist;
@@ -697,17 +698,17 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
* because we want the "sproc" callback to be able to do it
* in a lockless way.
*/
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
list_splice_init(&ep->rdllist, &txlist);
WRITE_ONCE(ep->ovflist, NULL);
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
/*
* Now call the callback function.
*/
res = (*sproc)(ep, &txlist, priv);
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
/*
* During the time we spent inside the "sproc" callback, some
* other events might have been queued by the poll callback.
@@ -722,7 +723,11 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
* contain them, and the list_splice() below takes care of them.
*/
if (!ep_is_linked(epi)) {
- list_add_tail(&epi->rdllink, &ep->rdllist);
+ /*
+ * ->ovflist is LIFO, so we have to reverse it in order
+ * to keep the ready list in FIFO order.
+ */
+ list_add(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
}
}
@@ -745,11 +750,11 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
* the ->poll() wait list (delayed after we release the lock).
*/
if (waitqueue_active(&ep->wq))
- wake_up_locked(&ep->wq);
+ wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
if (!ep_locked)
mutex_unlock(&ep->mtx);
@@ -789,10 +794,10 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
rb_erase_cached(&epi->rbn, &ep->rbr);
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
if (ep_is_linked(epi))
list_del_init(&epi->rdllink);
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
wakeup_source_unregister(ep_wakeup_source(epi));
/*
@@ -842,7 +847,7 @@ static void ep_free(struct eventpoll *ep)
* Walks through the whole tree by freeing each "struct epitem". At this
* point we are sure no poll callbacks will be lingering around, and also by
* holding "epmutex" we can be sure that no file cleanup code will hit
- * us during this operation. So we can avoid the lock on "ep->wq.lock".
+ * us during this operation. So we can avoid the lock on "ep->lock".
* We do not need to lock ep->mtx, either, we only do it to prevent
* a lockdep warning.
*/
@@ -1023,6 +1028,7 @@ static int ep_alloc(struct eventpoll **pep)
goto free_uid;
mutex_init(&ep->mtx);
+ rwlock_init(&ep->lock);
init_waitqueue_head(&ep->wq);
init_waitqueue_head(&ep->poll_wait);
INIT_LIST_HEAD(&ep->rdllist);
@@ -1112,21 +1118,107 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
}
#endif /* CONFIG_CHECKPOINT_RESTORE */
+/**
+ * Adds a new entry to the tail of the list in a lockless way, i.e.
+ * multiple CPUs are allowed to call this function concurrently.
+ *
+ * Beware: it is necessary to prevent any other modifications of the
+ * existing list until all changes are completed; in other words,
+ * concurrent list_add_tail_lockless() calls should be protected
+ * with a read lock, where the write lock acts as a barrier which
+ * makes sure all list_add_tail_lockless() calls are fully
+ * completed.
+ *
+ * Also, an element can be locklessly added to the list only in one
+ * direction, i.e. either to the tail or to the head, otherwise
+ * concurrent access will corrupt the list.
+ *
+ * Returns %false if the element has already been added to the list, %true
+ * otherwise.
+ */
+static inline bool list_add_tail_lockless(struct list_head *new,
+ struct list_head *head)
+{
+ struct list_head *prev;
+
+ /*
+ * This is simple 'new->next = head' operation, but cmpxchg()
+	 * This is a simple 'new->next = head' operation, but cmpxchg()
+	 * is used in order to detect that the same element has just been
+	 * added to the list from another CPU: the winner observes
+ */
+ if (cmpxchg(&new->next, new, head) != new)
+ return false;
+
+ /*
+ * Initially ->next of a new element must be updated with the head
+ * (we are inserting to the tail) and only then pointers are atomically
+ * exchanged. XCHG guarantees memory ordering, thus ->next should be
+ * updated before pointers are actually swapped and pointers are
+ * swapped before prev->next is updated.
+ */
+
+ prev = xchg(&head->prev, new);
+
+ /*
+ * It is safe to modify prev->next and new->prev, because a new element
+ * is added only to the tail and new->next is updated before XCHG.
+ */
+
+ prev->next = new;
+ new->prev = prev;
+
+ return true;
+}
+
+/**
+ * Chains a new epi entry to the tail of the ep->ovflist in a lockless way,
+ * i.e. multiple CPUs are allowed to call this function concurrently.
+ *
+ * Returns %false if the epi element has already been chained, %true otherwise.
+ */
+static inline bool chain_epi_lockless(struct epitem *epi)
+{
+ struct eventpoll *ep = epi->ep;
+
+	/* Check that the same epi has not just been chained from another CPU */
+ if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
+ return false;
+
+ /* Atomically exchange tail */
+ epi->next = xchg(&ep->ovflist, epi);
+
+ return true;
+}
+
/*
* This is the callback that is passed to the wait queue wakeup
* mechanism. It is called by the stored file descriptors when they
* have events to report.
+ *
+ * This callback takes a read lock in order not to contend with concurrent
+ * events from other file descriptors, thus all modifications to ->rdllist
+ * or ->ovflist are lockless.  The read lock is paired with the write lock from
+ * ep_scan_ready_list(), which stops all list modifications and guarantees
+ * that the state of the lists is seen correctly.
+ *
+ * Another thing worth mentioning is that ep_poll_callback() can be called
+ * concurrently for the same @epi from different CPUs if the poll table was
+ * initialized with several wait queue entries.  Concurrent wakeups from
+ * different CPUs of a single wait queue are serialized by wq.lock, but the
+ * case when multiple wait queues are used has to be detected as well.  This
+ * is done using a cmpxchg() operation.
*/
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
int pwake = 0;
- unsigned long flags;
struct epitem *epi = ep_item_from_wait(wait);
struct eventpoll *ep = epi->ep;
__poll_t pollflags = key_to_poll(key);
+ unsigned long flags;
int ewake = 0;
- spin_lock_irqsave(&ep->wq.lock, flags);
+ read_lock_irqsave(&ep->lock, flags);
ep_set_busy_poll_napi_id(epi);
@@ -1155,24 +1247,15 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
* chained in ep->ovflist and requeued later on.
*/
if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
- if (epi->next == EP_UNACTIVE_PTR) {
- epi->next = READ_ONCE(ep->ovflist);
- WRITE_ONCE(ep->ovflist, epi);
- if (epi->ws) {
- /*
- * Activate ep->ws since epi->ws may get
- * deactivated at any time.
- */
- __pm_stay_awake(ep->ws);
- }
-
- }
+ if (epi->next == EP_UNACTIVE_PTR &&
+ chain_epi_lockless(epi))
+ ep_pm_stay_awake_rcu(epi);
goto out_unlock;
}
/* If this file is already in the ready list we exit soon */
- if (!ep_is_linked(epi)) {
- list_add_tail(&epi->rdllink, &ep->rdllist);
+ if (!ep_is_linked(epi) &&
+ list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) {
ep_pm_stay_awake_rcu(epi);
}
@@ -1197,13 +1280,13 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
break;
}
}
- wake_up_locked(&ep->wq);
+ wake_up(&ep->wq);
}
if (waitqueue_active(&ep->poll_wait))
pwake++;
out_unlock:
- spin_unlock_irqrestore(&ep->wq.lock, flags);
+ read_unlock_irqrestore(&ep->lock, flags);
/* We have to call this outside the lock */
if (pwake)
@@ -1488,7 +1571,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
goto error_remove_epi;
/* We have to drop the new item inside our item list to keep track of it */
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
/* record NAPI ID of new item if present */
ep_set_busy_poll_napi_id(epi);
@@ -1500,12 +1583,12 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
- wake_up_locked(&ep->wq);
+ wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
atomic_long_inc(&ep->user->epoll_watches);
@@ -1531,10 +1614,10 @@ error_unregister:
* list, since that is used/cleaned only inside a section bound by "mtx".
* And ep_insert() is called with "mtx" held.
*/
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
if (ep_is_linked(epi))
list_del_init(&epi->rdllink);
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
wakeup_source_unregister(ep_wakeup_source(epi));
@@ -1578,9 +1661,9 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
* 1) Flush epi changes above to other CPUs. This ensures
* we do not miss events from ep_poll_callback if an
* event occurs immediately after we call f_op->poll().
- * We need this because we did not take ep->wq.lock while
+ * We need this because we did not take ep->lock while
* changing epi above (but ep_poll_callback does take
- * ep->wq.lock).
+ * ep->lock).
*
* 2) We also need to ensure we do not miss _past_ events
* when calling f_op->poll(). This barrier also
@@ -1599,18 +1682,18 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
* list, push it inside.
*/
if (ep_item_poll(epi, &pt, 1)) {
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
if (!ep_is_linked(epi)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
- wake_up_locked(&ep->wq);
+ wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
}
/* We have to call this outside the lock */
@@ -1771,9 +1854,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
*/
timed_out = 1;
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
eavail = ep_events_available(ep);
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
goto send_events;
}
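As a userspace illustration of the lockless tail append that list_add_tail_lockless() implements above, the sketch below swaps the kernel's cmpxchg()/xchg() for the GCC/Clang __atomic builtins; the struct node and add_tail_lockless() names are illustrative only, not part of the patch. As in the kernel code, concurrent adders still have to be serialized against list consumers, which is exactly what the read/write pairing on ep->lock provides.

#include <stdbool.h>

struct node {
        struct node *next;
        struct node *prev;
};

/* head is a circular list sentinel; new->next must initially point to new. */
static bool add_tail_lockless(struct node *new, struct node *head)
{
        struct node *expected = new;
        struct node *prev;

        /* Claim the node: only one caller wins the new->next transition new -> head. */
        if (!__atomic_compare_exchange_n(&new->next, &expected, head, false,
                                         __ATOMIC_ACQ_REL, __ATOMIC_RELAXED))
                return false;

        /* Atomically swing the tail pointer; the old tail is now ours to link up. */
        prev = __atomic_exchange_n(&head->prev, new, __ATOMIC_ACQ_REL);

        prev->next = new;
        new->prev = prev;
        return true;
}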
diff --git a/fs/exec.c b/fs/exec.c
index 966cd98a2ce2..7a4b5efeab3c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1563,7 +1563,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm)
/*
* Fill the binprm structure from the inode.
- * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
+ * Check permissions, then read the first BINPRM_BUF_SIZE bytes
*
* This may be called multiple times for binary chains (scripts for example).
*/
@@ -1944,15 +1944,10 @@ EXPORT_SYMBOL(set_binfmt);
*/
void set_dumpable(struct mm_struct *mm, int value)
{
- unsigned long old, new;
-
if (WARN_ON((unsigned)value > SUID_DUMP_ROOT))
return;
- do {
- old = READ_ONCE(mm->flags);
- new = (old & ~MMF_DUMPABLE_MASK) | value;
- } while (cmpxchg(&mm->flags, old, new) != old);
+ set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value);
}
SYSCALL_DEFINE3(execve,
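The set_dumpable() hunk above (and the inode_set_flags() one further down) replaces an open-coded cmpxchg() loop with set_mask_bits(). A minimal userspace sketch of that helper, assuming the GCC/Clang __atomic builtins and an illustrative set_mask_bits_demo() name, is shown below; like the <linux/bitops.h> fix later in this series, it hands back the old value.

#include <stdio.h>

static unsigned long set_mask_bits_demo(unsigned long *ptr,
                                        unsigned long mask, unsigned long bits)
{
        unsigned long old, new;

        old = __atomic_load_n(ptr, __ATOMIC_RELAXED);
        do {
                /* Replace only the bits selected by mask. */
                new = (old & ~mask) | bits;
        } while (!__atomic_compare_exchange_n(ptr, &old, new, false,
                                              __ATOMIC_SEQ_CST, __ATOMIC_RELAXED));
        return old;     /* return the value seen before the update */
}

int main(void)
{
        unsigned long flags = 0xf0;

        set_mask_bits_demo(&flags, 0x0f, 0x03); /* flags becomes 0xf3 */
        printf("flags = %#lx\n", flags);
        return 0;
}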
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 13935ee99e1e..b3bed32946b1 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -214,6 +214,7 @@ const struct file_operations fat_file_operations = {
#endif
.fsync = fat_file_fsync,
.splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
.fallocate = fat_fallocate,
};
diff --git a/fs/file.c b/fs/file.c
index 97df385d6ab0..3da91a112bab 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -457,6 +457,7 @@ struct files_struct init_files = {
.full_fds_bits = init_files.full_fds_bits_init,
},
.file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
+ .resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait),
};
static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 239c7ca09b74..ec32fece5e1e 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -544,7 +544,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
inode_lock(inode);
/* protected by i_mutex */
- if (info->seals & F_SEAL_WRITE) {
+ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
inode_unlock(inode);
return -EPERM;
}
@@ -873,6 +873,18 @@ static int hugetlbfs_migrate_page(struct address_space *mapping,
rc = migrate_huge_page_move_mapping(mapping, newpage, page);
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
+
+ /*
+	 * page_private holds the subpool pointer for hugetlb pages.  Transfer it
+	 * to the new page.  PagePrivate is not associated with page_private for
+	 * hugetlb pages and cannot be set here, as only page_huge_active
+	 * pages can be migrated.
+ */
+ if (page_private(page)) {
+ set_page_private(newpage, page_private(page));
+ set_page_private(page, 0);
+ }
+
if (mode != MIGRATE_SYNC_NO_COPY)
migrate_page_copy(newpage, page);
else
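The hole-punch hunk above makes hugetlbfs honour the new F_SEAL_FUTURE_WRITE seal alongside F_SEAL_WRITE. A small userspace sketch of the seal's intended behaviour on a plain memfd follows; the fallback #define value is an assumption taken from the uapi headers of this series, and the exact errno may differ on other kernels.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef F_SEAL_FUTURE_WRITE
#define F_SEAL_FUTURE_WRITE 0x0010      /* assumed value from this series' uapi */
#endif

int main(void)
{
        int fd = memfd_create("demo", MFD_ALLOW_SEALING);

        if (fd < 0 || ftruncate(fd, 4096) < 0)
                return 1;

        /* Existing mappings keep working; only *future* writes are refused. */
        if (fcntl(fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE) < 0)
                perror("F_ADD_SEALS");

        /* A new shared writable mapping should now fail (typically EPERM). */
        if (mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0) == MAP_FAILED)
                perror("mmap after F_SEAL_FUTURE_WRITE");

        close(fd);
        return 0;
}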
diff --git a/fs/inode.c b/fs/inode.c
index 73432e64f874..e9d97add2b36 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2093,14 +2093,8 @@ EXPORT_SYMBOL(inode_dio_wait);
void inode_set_flags(struct inode *inode, unsigned int flags,
unsigned int mask)
{
- unsigned int old_flags, new_flags;
-
WARN_ON_ONCE(flags & ~mask);
- do {
- old_flags = READ_ONCE(inode->i_flags);
- new_flags = (old_flags & ~mask) | flags;
- } while (unlikely(cmpxchg(&inode->i_flags, old_flags,
- new_flags) != old_flags));
+ set_mask_bits(&inode->i_flags, mask, flags);
}
EXPORT_SYMBOL(inode_set_flags);
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index f8d5021a652e..ae948aaa4c53 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -832,26 +832,35 @@ void kernfs_drain_open_files(struct kernfs_node *kn)
* to see if it supports poll (Neither 'poll' nor 'select' return
* an appropriate error code). When in doubt, set a suitable timeout value.
*/
+__poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait)
+{
+ struct kernfs_node *kn = kernfs_dentry_node(of->file->f_path.dentry);
+ struct kernfs_open_node *on = kn->attr.open;
+
+ poll_wait(of->file, &on->poll, wait);
+
+ if (of->event != atomic_read(&on->event))
+ return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;
+
+ return DEFAULT_POLLMASK;
+}
+
static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait)
{
struct kernfs_open_file *of = kernfs_of(filp);
struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry);
- struct kernfs_open_node *on = kn->attr.open;
+ __poll_t ret;
if (!kernfs_get_active(kn))
- goto trigger;
+ return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;
- poll_wait(filp, &on->poll, wait);
+ if (kn->attr.ops->poll)
+ ret = kn->attr.ops->poll(of, wait);
+ else
+ ret = kernfs_generic_poll(of, wait);
kernfs_put_active(kn);
-
- if (of->event != atomic_read(&on->event))
- goto trigger;
-
- return DEFAULT_POLLMASK;
-
- trigger:
- return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;
+ return ret;
}
static void kernfs_notify_workfn(struct work_struct *work)
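With the refactor above, a kernfs attribute can provide its own ->poll() or fall back to kernfs_generic_poll(). From userspace such files are consumed with the usual sysfs polling pattern, sketched below; the attribute path is a placeholder, not a file created by this patch.

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[128];
        struct pollfd pfd;

        pfd.fd = open("/sys/class/example/attr", O_RDONLY);     /* placeholder path */
        if (pfd.fd < 0)
                return 1;
        pfd.events = POLLPRI;

        for (;;) {
                /* Consume the current value first, then wait for a change. */
                lseek(pfd.fd, 0, SEEK_SET);
                if (read(pfd.fd, buf, sizeof(buf)) < 0)
                        break;

                if (poll(&pfd, 1, -1) < 0)
                        break;
                if (pfd.revents & (POLLPRI | POLLERR))
                        printf("attribute changed\n");
        }
        close(pfd.fd);
        return 0;
}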
diff --git a/fs/namei.c b/fs/namei.c
index c59454263327..dede0147b3f6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -39,7 +39,6 @@
#include <linux/bitops.h>
#include <linux/init_task.h>
#include <linux/uaccess.h>
-#include <linux/build_bug.h>
#include "internal.h"
#include "mount.h"
@@ -131,7 +130,6 @@ getname_flags(const char __user *filename, int flags, int *empty)
struct filename *result;
char *kname;
int len;
- BUILD_BUG_ON(offsetof(struct filename, iname) % sizeof(long) != 0);
result = audit_reusename(filename);
if (result)
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index d1cbb27808e2..6f0999015a44 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -7532,10 +7532,11 @@ static int ocfs2_trim_group(struct super_block *sb,
return count;
}
-int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
+static
+int ocfs2_trim_mainbm(struct super_block *sb, struct fstrim_range *range)
{
struct ocfs2_super *osb = OCFS2_SB(sb);
- u64 start, len, trimmed, first_group, last_group, group;
+ u64 start, len, trimmed = 0, first_group, last_group = 0, group = 0;
int ret, cnt;
u32 first_bit, last_bit, minlen;
struct buffer_head *main_bm_bh = NULL;
@@ -7543,7 +7544,6 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
struct buffer_head *gd_bh = NULL;
struct ocfs2_dinode *main_bm;
struct ocfs2_group_desc *gd = NULL;
- struct ocfs2_trim_fs_info info, *pinfo = NULL;
start = range->start >> osb->s_clustersize_bits;
len = range->len >> osb->s_clustersize_bits;
@@ -7552,6 +7552,9 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
return -EINVAL;
+ trace_ocfs2_trim_mainbm(start, len, minlen);
+
+next_group:
main_bm_inode = ocfs2_get_system_file_inode(osb,
GLOBAL_BITMAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
@@ -7570,64 +7573,34 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
}
main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
- if (start >= le32_to_cpu(main_bm->i_clusters)) {
- ret = -EINVAL;
- goto out_unlock;
- }
-
- len = range->len >> osb->s_clustersize_bits;
- if (start + len > le32_to_cpu(main_bm->i_clusters))
- len = le32_to_cpu(main_bm->i_clusters) - start;
-
- trace_ocfs2_trim_fs(start, len, minlen);
-
- ocfs2_trim_fs_lock_res_init(osb);
- ret = ocfs2_trim_fs_lock(osb, NULL, 1);
- if (ret < 0) {
- if (ret != -EAGAIN) {
- mlog_errno(ret);
- ocfs2_trim_fs_lock_res_uninit(osb);
+ /*
+	 * Do some checks before trimming the first group.
+ */
+ if (!group) {
+ if (start >= le32_to_cpu(main_bm->i_clusters)) {
+ ret = -EINVAL;
goto out_unlock;
}
- mlog(ML_NOTICE, "Wait for trim on device (%s) to "
- "finish, which is running from another node.\n",
- osb->dev_str);
- ret = ocfs2_trim_fs_lock(osb, &info, 0);
- if (ret < 0) {
- mlog_errno(ret);
- ocfs2_trim_fs_lock_res_uninit(osb);
- goto out_unlock;
- }
+ if (start + len > le32_to_cpu(main_bm->i_clusters))
+ len = le32_to_cpu(main_bm->i_clusters) - start;
- if (info.tf_valid && info.tf_success &&
- info.tf_start == start && info.tf_len == len &&
- info.tf_minlen == minlen) {
- /* Avoid sending duplicated trim to a shared device */
- mlog(ML_NOTICE, "The same trim on device (%s) was "
- "just done from node (%u), return.\n",
- osb->dev_str, info.tf_nodenum);
- range->len = info.tf_trimlen;
- goto out_trimunlock;
- }
+ /*
+ * Determine first and last group to examine based on
+ * start and len
+ */
+ first_group = ocfs2_which_cluster_group(main_bm_inode, start);
+ if (first_group == osb->first_cluster_group_blkno)
+ first_bit = start;
+ else
+ first_bit = start - ocfs2_blocks_to_clusters(sb,
+ first_group);
+ last_group = ocfs2_which_cluster_group(main_bm_inode,
+ start + len - 1);
+ group = first_group;
}
- info.tf_nodenum = osb->node_num;
- info.tf_start = start;
- info.tf_len = len;
- info.tf_minlen = minlen;
-
- /* Determine first and last group to examine based on start and len */
- first_group = ocfs2_which_cluster_group(main_bm_inode, start);
- if (first_group == osb->first_cluster_group_blkno)
- first_bit = start;
- else
- first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
- last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
- last_bit = osb->bitmap_cpg;
-
- trimmed = 0;
- for (group = first_group; group <= last_group;) {
+ do {
if (first_bit + len >= osb->bitmap_cpg)
last_bit = osb->bitmap_cpg;
else
@@ -7659,21 +7632,81 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
else
group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
- }
- range->len = trimmed * sb->s_blocksize;
+ } while (0);
- info.tf_trimlen = range->len;
- info.tf_success = (ret ? 0 : 1);
- pinfo = &info;
-out_trimunlock:
- ocfs2_trim_fs_unlock(osb, pinfo);
- ocfs2_trim_fs_lock_res_uninit(osb);
out_unlock:
ocfs2_inode_unlock(main_bm_inode, 0);
brelse(main_bm_bh);
+ main_bm_bh = NULL;
out_mutex:
inode_unlock(main_bm_inode);
iput(main_bm_inode);
+
+ /*
+	 * If not all groups have been trimmed yet (and no error occurred), the
+	 * main_bm related locks were released above to avoid starving other
+	 * I/O, so go back and trim the next group.
+ */
+ if (ret >= 0 && group <= last_group)
+ goto next_group;
out:
+ range->len = trimmed * sb->s_blocksize;
+ return ret;
+}
+
+int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+ int ret;
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+ struct ocfs2_trim_fs_info info, *pinfo = NULL;
+
+ ocfs2_trim_fs_lock_res_init(osb);
+
+ trace_ocfs2_trim_fs(range->start, range->len, range->minlen);
+
+ ret = ocfs2_trim_fs_lock(osb, NULL, 1);
+ if (ret < 0) {
+ if (ret != -EAGAIN) {
+ mlog_errno(ret);
+ ocfs2_trim_fs_lock_res_uninit(osb);
+ return ret;
+ }
+
+ mlog(ML_NOTICE, "Wait for trim on device (%s) to "
+ "finish, which is running from another node.\n",
+ osb->dev_str);
+ ret = ocfs2_trim_fs_lock(osb, &info, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ ocfs2_trim_fs_lock_res_uninit(osb);
+ return ret;
+ }
+
+ if (info.tf_valid && info.tf_success &&
+ info.tf_start == range->start &&
+ info.tf_len == range->len &&
+ info.tf_minlen == range->minlen) {
+ /* Avoid sending duplicated trim to a shared device */
+ mlog(ML_NOTICE, "The same trim on device (%s) was "
+ "just done from node (%u), return.\n",
+ osb->dev_str, info.tf_nodenum);
+ range->len = info.tf_trimlen;
+ goto out;
+ }
+ }
+
+ info.tf_nodenum = osb->node_num;
+ info.tf_start = range->start;
+ info.tf_len = range->len;
+ info.tf_minlen = range->minlen;
+
+ ret = ocfs2_trim_mainbm(sb, range);
+
+ info.tf_trimlen = range->len;
+ info.tf_success = (ret < 0 ? 0 : 1);
+ pinfo = &info;
+out:
+ ocfs2_trim_fs_unlock(osb, pinfo);
+ ocfs2_trim_fs_lock_res_uninit(osb);
return ret;
}
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 832c1759a09a..e8ea6f783f19 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2153,13 +2153,30 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
struct ocfs2_dio_write_ctxt *dwc = NULL;
struct buffer_head *di_bh = NULL;
u64 p_blkno;
- loff_t pos = iblock << inode->i_sb->s_blocksize_bits;
+ unsigned int i_blkbits = inode->i_sb->s_blocksize_bits;
+ loff_t pos = iblock << i_blkbits;
+ sector_t endblk = (i_size_read(inode) - 1) >> i_blkbits;
unsigned len, total_len = bh_result->b_size;
int ret = 0, first_get_block = 0;
len = osb->s_clustersize - (pos & (osb->s_clustersize - 1));
len = min(total_len, len);
+ /*
+	 * bh_result->b_size is counted in get_more_blocks() according to the
+	 * write "pos" and "end", so we may need to map the range twice to
+	 * return the correct buffer state:
+	 * 1. the area within i_size: do not set NEW;
+	 * 2. the area beyond i_size: set NEW.
+	 *
+	 *    iblock                       endblk
+	 * |--------|---------|---------|---------
+	 * |<-------area in file------->|
+ */
+
+ if ((iblock <= endblk) &&
+ ((iblock + ((len - 1) >> i_blkbits)) > endblk))
+ len = (endblk - iblock + 1) << i_blkbits;
+
mlog(0, "get block of %lu at %llu:%u req %u\n",
inode->i_ino, pos, len, total_len);
@@ -2243,6 +2260,9 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
if (desc->c_needs_zero)
set_buffer_new(bh_result);
+ if (iblock > endblk)
+ set_buffer_new(bh_result);
+
/* May sleep in end_io. It should not happen in a irq context. So defer
* it to dio work queue. */
set_buffer_defer_completion(bh_result);
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 0e4166cc23a0..4ac775e32240 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -621,13 +621,15 @@ static void o2nm_node_group_drop_item(struct config_group *group,
struct o2nm_node *node = to_o2nm_node(item);
struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent);
- o2net_disconnect_node(node);
+ if (cluster->cl_nodes[node->nd_num] == node) {
+ o2net_disconnect_node(node);
- if (cluster->cl_has_local &&
- (cluster->cl_local_node == node->nd_num)) {
- cluster->cl_has_local = 0;
- cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
- o2net_stop_listening(node);
+ if (cluster->cl_has_local &&
+ (cluster->cl_local_node == node->nd_num)) {
+ cluster->cl_has_local = 0;
+ cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
+ o2net_stop_listening(node);
+ }
}
/* XXX call into net to stop this node from trading messages */
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 802636d50365..74896525dc1a 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2134,7 +2134,6 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
* if this had completed successfully
* before sending this lock state to the
* new master */
- BUG_ON(i != DLM_CONVERTING_LIST);
mlog(0, "node died with cancel pending "
"on %.*s. move back to granted list.\n",
res->lockname.len, res->lockname.name);
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 63d701cd1e2e..58536f9f6a28 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -105,7 +105,8 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
enum dlm_status status;
int actions = 0;
int in_use;
- u8 owner;
+ u8 owner;
+ int recovery_wait = 0;
mlog(0, "master_node = %d, valblk = %d\n", master_node,
flags & LKM_VALBLK);
@@ -183,6 +184,11 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
flags, owner);
spin_lock(&res->spinlock);
spin_lock(&lock->spinlock);
+
+ if ((flags & LKM_CANCEL) &&
+ dlm_lock_on_list(&res->granted, lock))
+ status = DLM_CANCELGRANT;
+
/* if the master told us the lock was already granted,
* let the ast handle all of these actions */
if (status == DLM_CANCELGRANT) {
@@ -208,9 +214,12 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
}
if (flags & LKM_CANCEL)
lock->cancel_pending = 0;
- else
- lock->unlock_pending = 0;
-
+ else {
+ if (!lock->unlock_pending)
+ recovery_wait = 1;
+ else
+ lock->unlock_pending = 0;
+ }
}
/* get an extra ref on lock. if we are just switching
@@ -229,6 +238,7 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
mlog(0, "clearing convert_type at %smaster node\n",
master_node ? "" : "non-");
lock->ml.convert_type = LKM_IVMODE;
+ lock->lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB);
}
/* remove the extra ref on lock */
@@ -244,6 +254,17 @@ leave:
spin_unlock(&res->spinlock);
wake_up(&res->wq);
+ if (recovery_wait) {
+ spin_lock(&res->spinlock);
+ /* Unlock request will directly succeed after owner dies,
+		/* An unlock request succeeds immediately after the owner dies,
+		 * and the lock has already been removed from the granted list.
+		 * We have to wait until RECOVERING is done, or we miss the
+		 * chance to purge it, since the removal is much faster than
+		 * the recovery process.
+		 */
+ __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_RECOVERING);
+ spin_unlock(&res->spinlock);
+ }
+
/* let the caller's final dlm_lock_put handle the actual kfree */
if (actions & DLM_UNLOCK_FREE_LOCK) {
/* this should always be coupled with list removal */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 7c835824247e..af405586c5b1 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -686,6 +686,9 @@ void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb)
{
struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
+	/* Only one trimfs thread is allowed to work at the same time. */
+ mutex_lock(&osb->obs_trim_fs_mutex);
+
ocfs2_lock_res_init_once(lockres);
ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name);
ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS,
@@ -698,6 +701,8 @@ void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb)
ocfs2_simple_drop_lockres(osb, lockres);
ocfs2_lock_res_free(lockres);
+
+ mutex_unlock(&osb->obs_trim_fs_mutex);
}
static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 4f86ac0027b5..1f029fbe8b8d 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -407,6 +407,7 @@ struct ocfs2_super
struct ocfs2_lock_res osb_rename_lockres;
struct ocfs2_lock_res osb_nfs_sync_lockres;
struct ocfs2_lock_res osb_trim_fs_lockres;
+ struct mutex obs_trim_fs_mutex;
struct ocfs2_dlm_debug *osb_dlm_debug;
struct dentry *osb_debug_root;
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 2ee76a90ba8f..dc4bce1649c1 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -712,6 +712,8 @@ TRACE_EVENT(ocfs2_trim_extent,
DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_mainbm);
+
DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
/* End of trace events for fs/ocfs2/alloc.c. */
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index d7407994f308..ea0756d83250 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -55,7 +55,7 @@ struct ocfs2_slot_info {
unsigned int si_blocks;
struct buffer_head **si_bh;
unsigned int si_num_slots;
- struct ocfs2_slot *si_slots;
+ struct ocfs2_slot si_slots[];
};
@@ -420,9 +420,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
struct inode *inode = NULL;
struct ocfs2_slot_info *si;
- si = kzalloc(sizeof(struct ocfs2_slot_info) +
- (sizeof(struct ocfs2_slot) * osb->max_slots),
- GFP_KERNEL);
+ si = kzalloc(struct_size(si, si_slots, osb->max_slots), GFP_KERNEL);
if (!si) {
status = -ENOMEM;
mlog_errno(status);
@@ -431,8 +429,6 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
si->si_extended = ocfs2_uses_extended_slot_map(osb);
si->si_num_slots = osb->max_slots;
- si->si_slots = (struct ocfs2_slot *)((char *)si +
- sizeof(struct ocfs2_slot_info));
inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
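The slot_map hunk above turns si_slots into a flexible array member and sizes the allocation with struct_size(). The userspace sketch below, with illustrative slot/slot_info types, shows the same layout plus the overflow check that the kernel macro from <linux/overflow.h> provides.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct slot {
        int assigned;
        unsigned int node;
};

struct slot_info {
        unsigned int num_slots;
        struct slot slots[];            /* flexible array member, like si_slots[] */
};

static struct slot_info *slot_info_alloc(unsigned int n)
{
        size_t bytes;
        struct slot_info *si;

        /* Refuse sizes that would overflow, as struct_size() does in the kernel. */
        if (n > (SIZE_MAX - sizeof(struct slot_info)) / sizeof(struct slot))
                return NULL;
        bytes = sizeof(struct slot_info) + n * sizeof(struct slot);

        si = calloc(1, bytes);
        if (si)
                si->num_slots = n;
        return si;
}

int main(void)
{
        struct slot_info *si = slot_info_alloc(8);

        if (!si)
                return 1;
        printf("allocated %u trailing slots in one block\n", si->num_slots);
        free(si);
        return 0;
}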
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 3415e0b09398..96ae7cedd487 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1847,6 +1847,8 @@ static int ocfs2_mount_volume(struct super_block *sb)
if (ocfs2_is_hard_readonly(osb))
goto leave;
+ mutex_init(&osb->obs_trim_fs_mutex);
+
status = ocfs2_dlm_init(osb);
if (status < 0) {
mlog_errno(status);
diff --git a/fs/pipe.c b/fs/pipe.c
index 0ff09b490ddf..070aad543382 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -140,8 +140,7 @@ static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
struct page *page = buf->page;
if (page_count(page) == 1) {
- if (memcg_kmem_enabled())
- memcg_kmem_uncharge(page, 0);
+ memcg_kmem_uncharge(page, 0);
__SetPageLocked(page);
return 0;
}
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 9d428d5a0ac8..2edbb657f859 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -343,28 +343,28 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
#ifdef CONFIG_SECCOMP
seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode);
#endif
- seq_printf(m, "\nSpeculation_Store_Bypass:\t");
+ seq_puts(m, "\nSpeculation_Store_Bypass:\t");
switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) {
case -EINVAL:
- seq_printf(m, "unknown");
+ seq_puts(m, "unknown");
break;
case PR_SPEC_NOT_AFFECTED:
- seq_printf(m, "not vulnerable");
+ seq_puts(m, "not vulnerable");
break;
case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE:
- seq_printf(m, "thread force mitigated");
+ seq_puts(m, "thread force mitigated");
break;
case PR_SPEC_PRCTL | PR_SPEC_DISABLE:
- seq_printf(m, "thread mitigated");
+ seq_puts(m, "thread mitigated");
break;
case PR_SPEC_PRCTL | PR_SPEC_ENABLE:
- seq_printf(m, "thread vulnerable");
+ seq_puts(m, "thread vulnerable");
break;
case PR_SPEC_DISABLE:
- seq_printf(m, "globally mitigated");
+ seq_puts(m, "globally mitigated");
break;
default:
- seq_printf(m, "vulnerable");
+ seq_puts(m, "vulnerable");
break;
}
seq_putc(m, '\n');
diff --git a/fs/proc/base.c b/fs/proc/base.c
index dd2d6301d875..e89bdc2b3821 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -460,7 +460,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
if (unlikely(!sched_info_on()))
- seq_printf(m, "0 0 0\n");
+ seq_puts(m, "0 0 0\n");
else
seq_printf(m, "%llu %llu %lu\n",
(unsigned long long)task->se.sum_exec_runtime,
@@ -1090,10 +1090,6 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
task_lock(p);
if (!p->vfork_done && process_shares_mm(p, mm)) {
- pr_info("updating oom_score_adj for %d (%s) from %d to %d because it shares mm with %d (%s). Report if this is unexpected.\n",
- task_pid_nr(p), p->comm,
- p->signal->oom_score_adj, oom_adj,
- task_pid_nr(task), task->comm);
p->signal->oom_score_adj = oom_adj;
if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
p->signal->oom_score_adj_min = (short)oom_adj;
@@ -3220,7 +3216,7 @@ static struct dentry *proc_pid_instantiate(struct dentry * dentry,
return d_splice_alias(inode, dentry);
}
-struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
+struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags)
{
struct task_struct *task;
unsigned tgid;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 141995470367..d1671e97f7fe 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -163,7 +163,7 @@ extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struc
extern void pid_update_inode(struct task_struct *, struct inode *);
extern int pid_delete_dentry(const struct dentry *);
extern int proc_pid_readdir(struct file *, struct dir_context *);
-extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int);
+struct dentry *proc_pid_lookup(struct dentry *, unsigned int);
extern loff_t mem_lseek(struct file *, loff_t, int);
/* Lookups */
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 40b05e0d4274..544d1ee15aee 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -152,8 +152,8 @@ u64 stable_page_flags(struct page *page)
else if (page_count(page) == 0 && is_free_buddy_page(page))
u |= 1 << KPF_BUDDY;
- if (PageBalloon(page))
- u |= 1 << KPF_BALLOON;
+ if (PageOffline(page))
+ u |= 1 << KPF_OFFLINE;
if (PageTable(page))
u |= 1 << KPF_PGTABLE;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 6927b29ece76..8b145e7b9661 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -247,7 +247,7 @@ static int proc_root_getattr(const struct path *path, struct kstat *stat,
static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags)
{
- if (!proc_pid_lookup(dir, dentry, flags))
+ if (!proc_pid_lookup(dentry, flags))
return NULL;
return proc_lookup(dir, dentry, flags);
diff --git a/fs/proc/self.c b/fs/proc/self.c
index 127265e5c55f..57c0a1047250 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -38,6 +38,7 @@ int proc_setup_self(struct super_block *s)
struct inode *root_inode = d_inode(s->s_root);
struct pid_namespace *ns = proc_pid_ns(root_inode);
struct dentry *self;
+ int ret = -ENOMEM;
inode_lock(root_inode);
self = d_alloc_name(s->s_root, "self");
@@ -51,20 +52,19 @@ int proc_setup_self(struct super_block *s)
inode->i_gid = GLOBAL_ROOT_GID;
inode->i_op = &proc_self_inode_operations;
d_add(self, inode);
+ ret = 0;
} else {
dput(self);
- self = ERR_PTR(-ENOMEM);
}
- } else {
- self = ERR_PTR(-ENOMEM);
}
inode_unlock(root_inode);
- if (IS_ERR(self)) {
+
+ if (ret)
pr_err("proc_fill_super: can't allocate /proc/self\n");
- return PTR_ERR(self);
- }
- ns->proc_self = self;
- return 0;
+ else
+ ns->proc_self = self;
+
+ return ret;
}
void __init proc_self_init(void)
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 76175211b304..80c305f206bb 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -23,21 +23,21 @@
#ifdef arch_idle_time
-static u64 get_idle_time(int cpu)
+static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
{
u64 idle;
- idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
+ idle = kcs->cpustat[CPUTIME_IDLE];
if (cpu_online(cpu) && !nr_iowait_cpu(cpu))
idle += arch_idle_time(cpu);
return idle;
}
-static u64 get_iowait_time(int cpu)
+static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
{
u64 iowait;
- iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
+ iowait = kcs->cpustat[CPUTIME_IOWAIT];
if (cpu_online(cpu) && nr_iowait_cpu(cpu))
iowait += arch_idle_time(cpu);
return iowait;
@@ -45,7 +45,7 @@ static u64 get_iowait_time(int cpu)
#else
-static u64 get_idle_time(int cpu)
+static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
{
u64 idle, idle_usecs = -1ULL;
@@ -54,14 +54,14 @@ static u64 get_idle_time(int cpu)
if (idle_usecs == -1ULL)
/* !NO_HZ or cpu offline so we can rely on cpustat.idle */
- idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
+ idle = kcs->cpustat[CPUTIME_IDLE];
else
idle = idle_usecs * NSEC_PER_USEC;
return idle;
}
-static u64 get_iowait_time(int cpu)
+static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
{
u64 iowait, iowait_usecs = -1ULL;
@@ -70,7 +70,7 @@ static u64 get_iowait_time(int cpu)
if (iowait_usecs == -1ULL)
/* !NO_HZ or cpu offline so we can rely on cpustat.iowait */
- iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
+ iowait = kcs->cpustat[CPUTIME_IOWAIT];
else
iowait = iowait_usecs * NSEC_PER_USEC;
@@ -120,16 +120,18 @@ static int show_stat(struct seq_file *p, void *v)
getboottime64(&boottime);
for_each_possible_cpu(i) {
- user += kcpustat_cpu(i).cpustat[CPUTIME_USER];
- nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE];
- system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
- idle += get_idle_time(i);
- iowait += get_iowait_time(i);
- irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
- softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
- steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
- guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
- guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
+ struct kernel_cpustat *kcs = &kcpustat_cpu(i);
+
+ user += kcs->cpustat[CPUTIME_USER];
+ nice += kcs->cpustat[CPUTIME_NICE];
+ system += kcs->cpustat[CPUTIME_SYSTEM];
+ idle += get_idle_time(kcs, i);
+ iowait += get_iowait_time(kcs, i);
+ irq += kcs->cpustat[CPUTIME_IRQ];
+ softirq += kcs->cpustat[CPUTIME_SOFTIRQ];
+ steal += kcs->cpustat[CPUTIME_STEAL];
+ guest += kcs->cpustat[CPUTIME_GUEST];
+ guest_nice += kcs->cpustat[CPUTIME_GUEST_NICE];
sum += kstat_cpu_irqs_sum(i);
sum += arch_irq_stat_cpu(i);
@@ -155,17 +157,19 @@ static int show_stat(struct seq_file *p, void *v)
seq_putc(p, '\n');
for_each_online_cpu(i) {
+ struct kernel_cpustat *kcs = &kcpustat_cpu(i);
+
/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
- user = kcpustat_cpu(i).cpustat[CPUTIME_USER];
- nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE];
- system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
- idle = get_idle_time(i);
- iowait = get_iowait_time(i);
- irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
- softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
- steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
- guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
- guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
+ user = kcs->cpustat[CPUTIME_USER];
+ nice = kcs->cpustat[CPUTIME_NICE];
+ system = kcs->cpustat[CPUTIME_SYSTEM];
+ idle = get_idle_time(kcs, i);
+ iowait = get_iowait_time(kcs, i);
+ irq = kcs->cpustat[CPUTIME_IRQ];
+ softirq = kcs->cpustat[CPUTIME_SOFTIRQ];
+ steal = kcs->cpustat[CPUTIME_STEAL];
+ guest = kcs->cpustat[CPUTIME_GUEST];
+ guest_nice = kcs->cpustat[CPUTIME_GUEST_NICE];
seq_printf(p, "cpu%d", i);
seq_put_decimal_ull(p, " ", nsec_to_clock_t(user));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice));
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 0ea720e59ac2..92a91e7816d8 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -948,10 +948,12 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
pte_t ptent = *pte;
if (pte_present(ptent)) {
- ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte);
- ptent = pte_wrprotect(ptent);
+ pte_t old_pte;
+
+ old_pte = ptep_modify_prot_start(vma, addr, pte);
+ ptent = pte_wrprotect(old_pte);
ptent = pte_clear_soft_dirty(ptent);
- ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent);
+ ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
} else if (is_swap_pte(ptent)) {
ptent = pte_swp_clear_soft_dirty(ptent);
set_pte_at(vma->vm_mm, addr, pte, ptent);
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index f912872fbf91..36bf0f2e102e 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -178,7 +178,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
seq_file_path(m, file, "");
} else if (mm && is_stack(vma)) {
seq_pad(m, ' ');
- seq_printf(m, "[stack]");
+ seq_puts(m, "[stack]");
}
seq_putc(m, '\n');
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index b905010ca9eb..f61ae53533f5 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -38,6 +38,7 @@ int proc_setup_thread_self(struct super_block *s)
struct inode *root_inode = d_inode(s->s_root);
struct pid_namespace *ns = proc_pid_ns(root_inode);
struct dentry *thread_self;
+ int ret = -ENOMEM;
inode_lock(root_inode);
thread_self = d_alloc_name(s->s_root, "thread-self");
@@ -51,20 +52,19 @@ int proc_setup_thread_self(struct super_block *s)
inode->i_gid = GLOBAL_ROOT_GID;
inode->i_op = &proc_thread_self_inode_operations;
d_add(thread_self, inode);
+ ret = 0;
} else {
dput(thread_self);
- thread_self = ERR_PTR(-ENOMEM);
}
- } else {
- thread_self = ERR_PTR(-ENOMEM);
}
inode_unlock(root_inode);
- if (IS_ERR(thread_self)) {
+
+ if (ret)
pr_err("proc_fill_super: can't allocate /proc/thread_self\n");
- return PTR_ERR(thread_self);
- }
- ns->proc_thread_self = thread_self;
- return 0;
+ else
+ ns->proc_thread_self = thread_self;
+
+ return ret;
}
void __init proc_thread_self_init(void)
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 11201b2d06b9..a26ae0b83f8f 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -145,6 +145,17 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char *
return error;
}
+static int ramfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ struct inode *inode;
+
+ inode = ramfs_get_inode(dir->i_sb, dir, mode, 0);
+ if (!inode)
+ return -ENOSPC;
+ d_tmpfile(dentry, inode);
+ return 0;
+}
+
static const struct inode_operations ramfs_dir_inode_operations = {
.create = ramfs_create,
.lookup = simple_lookup,
@@ -155,6 +166,7 @@ static const struct inode_operations ramfs_dir_inode_operations = {
.rmdir = simple_rmdir,
.mknod = ramfs_mknod,
.rename = simple_rename,
+ .tmpfile = ramfs_tmpfile,
};
/*
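The new ramfs_tmpfile() hook above lets O_TMPFILE work on ramfs. A minimal userspace sketch of what that enables follows; the mount point is a placeholder, and the errno comment reflects the usual behaviour when a filesystem lacks .tmpfile support.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* Placeholder path: assumes a ramfs (or tmpfs) is mounted here. */
        int fd = open("/mnt/ramfs", O_TMPFILE | O_RDWR, 0600);

        if (fd < 0) {
                perror("open(O_TMPFILE)");      /* EOPNOTSUPP without .tmpfile */
                return 1;
        }

        /* The inode has no name; it disappears when the last fd is closed. */
        if (write(fd, "scratch data\n", 13) != 13)
                perror("write");

        close(fd);
        return 0;
}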
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 05e61e6c843f..fa782fba51ee 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -606,7 +606,7 @@ static inline int pmd_none_or_clear_bad(pmd_t *pmd)
return 0;
}
-static inline pte_t __ptep_modify_prot_start(struct mm_struct *mm,
+static inline pte_t __ptep_modify_prot_start(struct vm_area_struct *vma,
unsigned long addr,
pte_t *ptep)
{
@@ -615,10 +615,10 @@ static inline pte_t __ptep_modify_prot_start(struct mm_struct *mm,
* non-present, preventing the hardware from asynchronously
* updating it.
*/
- return ptep_get_and_clear(mm, addr, ptep);
+ return ptep_get_and_clear(vma->vm_mm, addr, ptep);
}
-static inline void __ptep_modify_prot_commit(struct mm_struct *mm,
+static inline void __ptep_modify_prot_commit(struct vm_area_struct *vma,
unsigned long addr,
pte_t *ptep, pte_t pte)
{
@@ -626,7 +626,7 @@ static inline void __ptep_modify_prot_commit(struct mm_struct *mm,
* The pte is non-present, so there's no hardware state to
* preserve.
*/
- set_pte_at(mm, addr, ptep, pte);
+ set_pte_at(vma->vm_mm, addr, ptep, pte);
}
#ifndef __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
@@ -644,22 +644,22 @@ static inline void __ptep_modify_prot_commit(struct mm_struct *mm,
* queue the update to be done at some later time. The update must be
* actually committed before the pte lock is released, however.
*/
-static inline pte_t ptep_modify_prot_start(struct mm_struct *mm,
+static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma,
unsigned long addr,
pte_t *ptep)
{
- return __ptep_modify_prot_start(mm, addr, ptep);
+ return __ptep_modify_prot_start(vma, addr, ptep);
}
/*
* Commit an update to a pte, leaving any hardware-controlled bits in
* the PTE unmodified.
*/
-static inline void ptep_modify_prot_commit(struct mm_struct *mm,
+static inline void ptep_modify_prot_commit(struct vm_area_struct *vma,
unsigned long addr,
- pte_t *ptep, pte_t pte)
+ pte_t *ptep, pte_t old_pte, pte_t pte)
{
- __ptep_modify_prot_commit(mm, addr, ptep, pte);
+ __ptep_modify_prot_commit(vma, addr, ptep, pte);
}
#endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */
#endif /* CONFIG_MMU */
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 7a3dffe8f971..6ac47f5ea514 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -953,9 +953,6 @@ acpi_handle_printk(const char *level, void *handle, const char *fmt, ...) {}
#if defined(CONFIG_ACPI) && defined(CONFIG_DYNAMIC_DEBUG)
__printf(3, 4)
void __acpi_handle_debug(struct _ddebug *descriptor, acpi_handle handle, const char *fmt, ...);
-#else
-#define __acpi_handle_debug(descriptor, handle, fmt, ...) \
- acpi_handle_printk(KERN_DEBUG, handle, fmt, ##__VA_ARGS__);
#endif
/*
@@ -985,12 +982,8 @@ void __acpi_handle_debug(struct _ddebug *descriptor, acpi_handle handle, const c
#else
#if defined(CONFIG_DYNAMIC_DEBUG)
#define acpi_handle_debug(handle, fmt, ...) \
-do { \
- DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
- if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \
- __acpi_handle_debug(&descriptor, handle, pr_fmt(fmt), \
- ##__VA_ARGS__); \
-} while (0)
+ _dynamic_func_call(fmt, __acpi_handle_debug, \
+ handle, pr_fmt(fmt), ##__VA_ARGS__)
#else
#define acpi_handle_debug(handle, fmt, ...) \
({ \
diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
index 53051f3d8f25..f111c780ef1d 100644
--- a/include/linux/balloon_compaction.h
+++ b/include/linux/balloon_compaction.h
@@ -4,15 +4,18 @@
*
* Common interface definitions for making balloon pages movable by compaction.
*
- * Despite being perfectly possible to perform ballooned pages migration, they
- * make a special corner case to compaction scans because balloon pages are not
- * enlisted at any LRU list like the other pages we do compact / migrate.
+ * Balloon page migration makes use of the general non-lru movable page
+ * feature.
+ *
+ * page->private is used to reference the responsible balloon device.
+ * page->mapping is used in the context of non-lru page migration to reference
+ * the address space operations for page isolation/migration/compaction.
*
* As the page isolation scanning step a compaction thread does is a lockless
* procedure (from a page standpoint), it might bring some racy situations while
* performing balloon page compaction. In order to sort out these racy scenarios
* and safely perform balloon's page compaction and migration we must, always,
- * ensure following these three simple rules:
+ * ensure that we follow these simple rules:
*
* i. when updating a balloon's page ->mapping element, strictly do it under
* the following lock order, independently of the far superior
@@ -21,19 +24,8 @@
* +--spin_lock_irq(&b_dev_info->pages_lock);
* ... page->mapping updates here ...
*
- * ii. before isolating or dequeueing a balloon page from the balloon device
- * pages list, the page reference counter must be raised by one and the
- * extra refcount must be dropped when the page is enqueued back into
- * the balloon device page list, thus a balloon page keeps its reference
- * counter raised only while it is under our special handling;
- *
- * iii. after the lockless scan step have selected a potential balloon page for
- * isolation, re-test the PageBalloon mark and the PagePrivate flag
- * under the proper page lock, to ensure isolating a valid balloon page
- * (not yet isolated, nor under release procedure)
- *
- * iv. isolation or dequeueing procedure must clear PagePrivate flag under
- * page lock together with removing page from balloon device page list.
+ * ii. isolation or dequeueing procedure must remove the page from balloon
+ * device page list under b_dev_info->pages_lock.
*
* The functions provided by this interface are placed to help on coping with
* the aforementioned balloon page corner case, as well as to ensure the simple
@@ -103,7 +95,7 @@ extern int balloon_page_migrate(struct address_space *mapping,
static inline void balloon_page_insert(struct balloon_dev_info *balloon,
struct page *page)
{
- __SetPageBalloon(page);
+ __SetPageOffline(page);
__SetPageMovable(page, balloon->inode->i_mapping);
set_page_private(page, (unsigned long)balloon);
list_add(&page->lru, &balloon->pages);
@@ -119,7 +111,7 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon,
*/
static inline void balloon_page_delete(struct page *page)
{
- __ClearPageBalloon(page);
+ __ClearPageOffline(page);
__ClearPageMovable(page);
set_page_private(page, 0);
/*
@@ -149,13 +141,13 @@ static inline gfp_t balloon_mapping_gfp_mask(void)
static inline void balloon_page_insert(struct balloon_dev_info *balloon,
struct page *page)
{
- __SetPageBalloon(page);
+ __SetPageOffline(page);
list_add(&page->lru, &balloon->pages);
}
static inline void balloon_page_delete(struct page *page)
{
- __ClearPageBalloon(page);
+ __ClearPageOffline(page);
list_del(&page->lru);
}
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 705f7c442691..602af23b98c7 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -246,7 +246,7 @@ static __always_inline void __assign_bit(long nr, volatile unsigned long *addr,
new__ = (old__ & ~mask__) | bits__; \
} while (cmpxchg(ptr, old__, new__) != old__); \
\
- new__; \
+ old__; \
})
#endif
diff --git a/include/linux/build_bug.h b/include/linux/build_bug.h
index faeec7433aab..0fe5426f2bdc 100644
--- a/include/linux/build_bug.h
+++ b/include/linux/build_bug.h
@@ -58,4 +58,23 @@
*/
#define BUILD_BUG() BUILD_BUG_ON_MSG(1, "BUILD_BUG failed")
+/**
+ * static_assert - check integer constant expression at build time
+ *
+ * static_assert() is a wrapper for the C11 _Static_assert, with a
+ * little macro magic to make the message optional (defaulting to the
+ * stringification of the tested expression).
+ *
+ * Contrary to BUILD_BUG_ON(), static_assert() can be used at global
+ * scope, but requires the expression to be an integer constant
+ * expression (i.e., it is not enough that __builtin_constant_p() is
+ * true for expr).
+ *
+ * Also note that BUILD_BUG_ON() fails the build if the condition is
+ * true, while static_assert() fails the build if the expression is
+ * false.
+ */
+#define static_assert(expr, ...) __static_assert(expr, ##__VA_ARGS__, #expr)
+#define __static_assert(expr, msg, ...) _Static_assert(expr, msg)
+
#endif /* _LINUX_BUILD_BUG_H */
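The static_assert() wrapper added above can sit at global scope, which is what lets the BUILD_BUG_ON() from fs/namei.c move into include/linux/fs.h later in this series. A standalone C11 sketch of the same check, using an illustrative filename_demo struct rather than the kernel's struct filename, is shown below.

#include <assert.h>
#include <stddef.h>

struct filename_demo {
        const char *name;       /* pointer to the full, possibly long, name */
        long refcnt;
        char iname[];           /* short names are stored inline */
};

/* Usable at file scope, unlike BUILD_BUG_ON(). */
static_assert(offsetof(struct filename_demo, iname) % sizeof(long) == 0,
              "iname must start on a long-aligned boundary");

int main(void)
{
        return 0;
}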
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 120d1d40704b..1c70803e9f77 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -32,6 +32,7 @@ struct kernfs_node;
struct kernfs_ops;
struct kernfs_open_file;
struct seq_file;
+struct poll_table_struct;
#define MAX_CGROUP_TYPE_NAMELEN 32
#define MAX_CGROUP_ROOT_NAMELEN 64
@@ -574,6 +575,9 @@ struct cftype {
ssize_t (*write)(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off);
+ __poll_t (*poll)(struct kernfs_open_file *of,
+ struct poll_table_struct *pt);
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lock_class_key lockdep_key;
#endif
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 68250a57aace..9569e7c786d3 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -88,14 +88,13 @@ extern int sysctl_compact_memory;
extern int sysctl_compaction_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos);
extern int sysctl_extfrag_threshold;
-extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos);
extern int sysctl_compact_unevictable_allowed;
extern int fragmentation_index(struct zone *zone, unsigned int order);
extern enum compact_result try_to_compact_pages(gfp_t gfp_mask,
unsigned int order, unsigned int alloc_flags,
- const struct alloc_context *ac, enum compact_priority prio);
+ const struct alloc_context *ac, enum compact_priority prio,
+ struct page **page);
extern void reset_isolation_suitable(pg_data_t *pgdat);
extern enum compact_result compaction_suitable(struct zone *zone, int order,
unsigned int alloc_flags, int classzone_idx);
@@ -227,8 +226,8 @@ static inline void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_i
#endif /* CONFIG_COMPACTION */
-#if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
struct node;
+#if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
extern int compaction_register_node(struct node *node);
extern void compaction_unregister_node(struct node *node);
diff --git a/include/linux/delay.h b/include/linux/delay.h
index b78bab4395d8..8e6828094c1e 100644
--- a/include/linux/delay.h
+++ b/include/linux/delay.h
@@ -55,6 +55,7 @@ static inline void ndelay(unsigned long x)
extern unsigned long lpj_fine;
void calibrate_delay(void);
+void __attribute__((weak)) calibration_delay_done(void);
void msleep(unsigned int msecs);
unsigned long msleep_interruptible(unsigned int msecs);
void usleep_range(unsigned long min, unsigned long max);
diff --git a/include/linux/device.h b/include/linux/device.h
index 4cf0f3d5cb45..07083d14888c 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1103,7 +1103,7 @@ static inline void set_dev_node(struct device *dev, int node)
#else
static inline int dev_to_node(struct device *dev)
{
- return -1;
+ return NUMA_NO_NODE;
}
static inline void set_dev_node(struct device *dev, int node)
{
@@ -1556,7 +1556,7 @@ do { \
DEFAULT_RATELIMIT_INTERVAL, \
DEFAULT_RATELIMIT_BURST); \
DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
- if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT) && \
+ if (DYNAMIC_DEBUG_BRANCH(descriptor) && \
__ratelimit(&_rs)) \
__dynamic_dev_dbg(&descriptor, dev, dev_fmt(fmt), \
##__VA_ARGS__); \
diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h
index b3419da1a776..c2be029b9b53 100644
--- a/include/linux/dynamic_debug.h
+++ b/include/linux/dynamic_debug.h
@@ -47,10 +47,10 @@ struct _ddebug {
} __attribute__((aligned(8)));
-int ddebug_add_module(struct _ddebug *tab, unsigned int n,
- const char *modname);
#if defined(CONFIG_DYNAMIC_DEBUG)
+int ddebug_add_module(struct _ddebug *tab, unsigned int n,
+ const char *modname);
extern int ddebug_remove_module(const char *mod_name);
extern __printf(2, 3)
void __dynamic_pr_debug(struct _ddebug *descriptor, const char *fmt, ...);
@@ -71,7 +71,7 @@ void __dynamic_netdev_dbg(struct _ddebug *descriptor,
const struct net_device *dev,
const char *fmt, ...);
-#define DEFINE_DYNAMIC_DEBUG_METADATA_KEY(name, fmt, key, init) \
+#define DEFINE_DYNAMIC_DEBUG_METADATA(name, fmt) \
static struct _ddebug __aligned(8) \
__attribute__((section("__verbose"))) name = { \
.modname = KBUILD_MODNAME, \
@@ -80,35 +80,27 @@ void __dynamic_netdev_dbg(struct _ddebug *descriptor,
.format = (fmt), \
.lineno = __LINE__, \
.flags = _DPRINTK_FLAGS_DEFAULT, \
- dd_key_init(key, init) \
+ _DPRINTK_KEY_INIT \
}
#ifdef CONFIG_JUMP_LABEL
-#define dd_key_init(key, init) key = (init)
-
#ifdef DEBUG
-#define DEFINE_DYNAMIC_DEBUG_METADATA(name, fmt) \
- DEFINE_DYNAMIC_DEBUG_METADATA_KEY(name, fmt, .key.dd_key_true, \
- (STATIC_KEY_TRUE_INIT))
+
+#define _DPRINTK_KEY_INIT .key.dd_key_true = (STATIC_KEY_TRUE_INIT)
#define DYNAMIC_DEBUG_BRANCH(descriptor) \
static_branch_likely(&descriptor.key.dd_key_true)
#else
-#define DEFINE_DYNAMIC_DEBUG_METADATA(name, fmt) \
- DEFINE_DYNAMIC_DEBUG_METADATA_KEY(name, fmt, .key.dd_key_false, \
- (STATIC_KEY_FALSE_INIT))
+#define _DPRINTK_KEY_INIT .key.dd_key_false = (STATIC_KEY_FALSE_INIT)
#define DYNAMIC_DEBUG_BRANCH(descriptor) \
static_branch_unlikely(&descriptor.key.dd_key_false)
#endif
-#else
+#else /* !CONFIG_JUMP_LABEL */
-#define dd_key_init(key, init)
-
-#define DEFINE_DYNAMIC_DEBUG_METADATA(name, fmt) \
- DEFINE_DYNAMIC_DEBUG_METADATA_KEY(name, fmt, 0, 0)
+#define _DPRINTK_KEY_INIT
#ifdef DEBUG
#define DYNAMIC_DEBUG_BRANCH(descriptor) \
@@ -120,46 +112,66 @@ void __dynamic_netdev_dbg(struct _ddebug *descriptor,
#endif
-#define dynamic_pr_debug(fmt, ...) \
-do { \
- DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
- if (DYNAMIC_DEBUG_BRANCH(descriptor)) \
- __dynamic_pr_debug(&descriptor, pr_fmt(fmt), \
- ##__VA_ARGS__); \
+#define __dynamic_func_call(id, fmt, func, ...) do { \
+ DEFINE_DYNAMIC_DEBUG_METADATA(id, fmt); \
+ if (DYNAMIC_DEBUG_BRANCH(id)) \
+ func(&id, ##__VA_ARGS__); \
} while (0)
-#define dynamic_dev_dbg(dev, fmt, ...) \
-do { \
- DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
- if (DYNAMIC_DEBUG_BRANCH(descriptor)) \
- __dynamic_dev_dbg(&descriptor, dev, fmt, \
- ##__VA_ARGS__); \
+#define __dynamic_func_call_no_desc(id, fmt, func, ...) do { \
+ DEFINE_DYNAMIC_DEBUG_METADATA(id, fmt); \
+ if (DYNAMIC_DEBUG_BRANCH(id)) \
+ func(__VA_ARGS__); \
} while (0)
+/*
+ * "Factory macro" for generating a call to func, guarded by a
+ * DYNAMIC_DEBUG_BRANCH. The dynamic debug descriptor will be
+ * initialized using the fmt argument. The function will be called with
+ * the address of the descriptor as first argument, followed by all
+ * the varargs. Note that fmt is repeated in invocations of this
+ * macro.
+ */
+#define _dynamic_func_call(fmt, func, ...) \
+ __dynamic_func_call(__UNIQUE_ID(ddebug), fmt, func, ##__VA_ARGS__)
+/*
+ * A variant that does the same, except that the descriptor is not
+ * passed as the first argument to the function; it is only called
+ * with precisely the macro's varargs.
+ */
+#define _dynamic_func_call_no_desc(fmt, func, ...) \
+ __dynamic_func_call_no_desc(__UNIQUE_ID(ddebug), fmt, func, ##__VA_ARGS__)
+
+#define dynamic_pr_debug(fmt, ...) \
+ _dynamic_func_call(fmt, __dynamic_pr_debug, \
+ pr_fmt(fmt), ##__VA_ARGS__)
+
+#define dynamic_dev_dbg(dev, fmt, ...) \
+ _dynamic_func_call(fmt,__dynamic_dev_dbg, \
+ dev, fmt, ##__VA_ARGS__)
+
#define dynamic_netdev_dbg(dev, fmt, ...) \
-do { \
- DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
- if (DYNAMIC_DEBUG_BRANCH(descriptor)) \
- __dynamic_netdev_dbg(&descriptor, dev, fmt, \
- ##__VA_ARGS__); \
-} while (0)
+ _dynamic_func_call(fmt, __dynamic_netdev_dbg, \
+ dev, fmt, ##__VA_ARGS__)
-#define dynamic_hex_dump(prefix_str, prefix_type, rowsize, \
- groupsize, buf, len, ascii) \
-do { \
- DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, \
- __builtin_constant_p(prefix_str) ? prefix_str : "hexdump");\
- if (DYNAMIC_DEBUG_BRANCH(descriptor)) \
- print_hex_dump(KERN_DEBUG, prefix_str, \
- prefix_type, rowsize, groupsize, \
- buf, len, ascii); \
-} while (0)
+#define dynamic_hex_dump(prefix_str, prefix_type, rowsize, \
+ groupsize, buf, len, ascii) \
+ _dynamic_func_call_no_desc(__builtin_constant_p(prefix_str) ? prefix_str : "hexdump", \
+ print_hex_dump, \
+ KERN_DEBUG, prefix_str, prefix_type, \
+ rowsize, groupsize, buf, len, ascii)
#else
#include <linux/string.h>
#include <linux/errno.h>
+static inline int ddebug_add_module(struct _ddebug *tab, unsigned int n,
+ const char *modname)
+{
+ return 0;
+}
+
static inline int ddebug_remove_module(const char *mod)
{
return 0;
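
To make the factory-macro pattern above easier to follow, here is a hedged, self-contained userspace sketch of the same idea; it is not kernel code, and my_debug_desc, MY_UNIQUE_ID(), my_pr_debug() and my_pr_dbg() are invented stand-ins for struct _ddebug, __UNIQUE_ID(), __dynamic_pr_debug() and dynamic_pr_debug(). Each call site defines its own static descriptor under a unique name, the guard is tested, and the callee gets the descriptor's address followed by the varargs; note how the format string appears twice, once for the descriptor and once for the callee, exactly as the comment above points out. Like the kernel macros it mirrors, it relies on the GNU ##__VA_ARGS__ extension.

        #include <stdio.h>

        /* Hypothetical userspace stand-in for struct _ddebug. */
        struct my_debug_desc {
                const char *fmt;
                int enabled;
        };

        #define __PASTE2(a, b)          a##b
        #define __PASTE(a, b)           __PASTE2(a, b)
        #define MY_UNIQUE_ID(prefix)    __PASTE(prefix, __COUNTER__)

        /* One static descriptor per call site, named uniquely by the caller. */
        #define MY_DEFINE_DESC(name, fmt_str) \
                static struct my_debug_desc name = { .fmt = (fmt_str), .enabled = 1 }

        /* The "factory": define the descriptor, test the guard, call func on it. */
        #define __my_func_call(id, fmt_str, func, ...) do {     \
                MY_DEFINE_DESC(id, fmt_str);                    \
                if (id.enabled)                                 \
                        func(&id, ##__VA_ARGS__);               \
        } while (0)

        #define _my_func_call(fmt_str, func, ...) \
                __my_func_call(MY_UNIQUE_ID(my_desc_), fmt_str, func, ##__VA_ARGS__)

        /* Wrapper in the style of dynamic_pr_debug(): fmt is repeated. */
        #define my_pr_dbg(fmt_str, ...) \
                _my_func_call(fmt_str, my_pr_debug, fmt_str, ##__VA_ARGS__)

        static void my_pr_debug(struct my_debug_desc *desc, const char *fmt, int val)
        {
                /* Real code would be variadic; one int keeps the sketch short. */
                printf("desc@%p: ", (void *)desc);
                printf(fmt, val);
        }

        int main(void)
        {
                my_pr_dbg("x=%d\n", 42);        /* this call site gets its own descriptor */
                my_pr_dbg("y=%d\n", 7);         /* and so does this one */
                return 0;
        }

Because MY_UNIQUE_ID() is expanded once per invocation of _my_func_call(), the two call sites end up with two distinct static descriptor objects, which is the property the kernel macros depend on.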
diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h
index 011965c08b93..6d775984905b 100644
--- a/include/linux/frontswap.h
+++ b/include/linux/frontswap.h
@@ -7,6 +7,13 @@
#include <linux/bitops.h>
#include <linux/jump_label.h>
+/*
+ * Return code to denote that requested number of
+ * frontswap pages are unused (moved to page cache).
+ * Used in shmem_unuse and try_to_unuse.
+ */
+#define FRONTSWAP_PAGES_UNUSED 2
+
struct frontswap_ops {
void (*init)(unsigned); /* this swap type was just swapon'ed */
int (*store)(unsigned, pgoff_t, struct page *); /* store a page */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9d362564f461..7bf12829a622 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -30,6 +30,7 @@
#include <linux/shrinker.h>
#include <linux/migrate_mode.h>
#include <linux/uidgid.h>
+#include <linux/fs_types.h>
#include <linux/lockdep.h>
#include <linux/percpu-rwsem.h>
#include <linux/workqueue.h>
@@ -37,7 +38,8 @@
#include <linux/uuid.h>
#include <linux/errseq.h>
#include <linux/ioprio.h>
-#include <linux/fs_types.h>
+#include <linux/build_bug.h>
+#include <linux/stddef.h>
#include <asm/byteorder.h>
#include <uapi/linux/fs.h>
@@ -2484,6 +2486,7 @@ struct filename {
struct audit_names *aname;
const char iname[];
};
+static_assert(offsetof(struct filename, iname) % sizeof(long) == 0);
extern long vfs_truncate(const struct path *, loff_t);
extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs,
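
As an aside on the static_assert() added above: the same compile-time layout check can be exercised in plain userspace C11. The sketch below is illustrative only; struct demo_filename merely mimics the shape of struct filename (a few pointer/scalar members followed by a flexible char array) and is not the kernel definition.

        #include <assert.h>     /* C11 static_assert */
        #include <stddef.h>     /* offsetof */

        struct demo_filename {
                const char *name;
                int refcnt;
                void *aname;
                const char iname[];     /* inline name storage, like iname[] above */
        };

        /* Fails the build if the flexible array stops being long-aligned. */
        static_assert(offsetof(struct demo_filename, iname) % sizeof(long) == 0,
                      "iname must be long-aligned");

        int main(void) { return 0; }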
diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h
index dd0a452373e7..c5e5bed82fbc 100644
--- a/include/linux/genalloc.h
+++ b/include/linux/genalloc.h
@@ -156,7 +156,7 @@ extern struct gen_pool *devm_gen_pool_create(struct device *dev,
int min_alloc_order, int nid, const char *name);
extern struct gen_pool *gen_pool_get(struct device *dev, const char *name);
-bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start,
+extern bool gen_pool_has_addr(struct gen_pool *pool, unsigned long start,
size_t size);
#ifdef CONFIG_OF
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 5f5e25fd6149..0af1a276aab5 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -24,21 +24,21 @@ struct vm_area_struct;
#define ___GFP_HIGH 0x20u
#define ___GFP_IO 0x40u
#define ___GFP_FS 0x80u
-#define ___GFP_WRITE 0x100u
-#define ___GFP_NOWARN 0x200u
-#define ___GFP_RETRY_MAYFAIL 0x400u
-#define ___GFP_NOFAIL 0x800u
-#define ___GFP_NORETRY 0x1000u
-#define ___GFP_MEMALLOC 0x2000u
-#define ___GFP_COMP 0x4000u
-#define ___GFP_ZERO 0x8000u
-#define ___GFP_NOMEMALLOC 0x10000u
-#define ___GFP_HARDWALL 0x20000u
-#define ___GFP_THISNODE 0x40000u
-#define ___GFP_ATOMIC 0x80000u
-#define ___GFP_ACCOUNT 0x100000u
-#define ___GFP_DIRECT_RECLAIM 0x200000u
-#define ___GFP_KSWAPD_RECLAIM 0x400000u
+#define ___GFP_ZERO 0x100u
+#define ___GFP_ATOMIC 0x200u
+#define ___GFP_DIRECT_RECLAIM 0x400u
+#define ___GFP_KSWAPD_RECLAIM 0x800u
+#define ___GFP_WRITE 0x1000u
+#define ___GFP_NOWARN 0x2000u
+#define ___GFP_RETRY_MAYFAIL 0x4000u
+#define ___GFP_NOFAIL 0x8000u
+#define ___GFP_NORETRY 0x10000u
+#define ___GFP_MEMALLOC 0x20000u
+#define ___GFP_COMP 0x40000u
+#define ___GFP_NOMEMALLOC 0x80000u
+#define ___GFP_HARDWALL 0x100000u
+#define ___GFP_THISNODE 0x200000u
+#define ___GFP_ACCOUNT 0x400000u
#ifdef CONFIG_LOCKDEP
#define ___GFP_NOLOCKDEP 0x800000u
#else
@@ -589,8 +589,8 @@ static inline bool pm_suspended_storage(void)
/* The below functions must be run on a range from a single zone. */
extern int alloc_contig_range(unsigned long start, unsigned long end,
unsigned migratetype, gfp_t gfp_mask);
-extern void free_contig_range(unsigned long pfn, unsigned nr_pages);
#endif
+extern void free_contig_range(unsigned long pfn, unsigned int nr_pages);
#ifdef CONFIG_CMA
/* CMA stuff */
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 66f9ebbb1df3..7aadf18b29cb 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -77,8 +77,34 @@
#include <linux/migrate.h>
#include <linux/memremap.h>
#include <linux/completion.h>
+#include <linux/mmu_notifier.h>
-struct hmm;
+
+/*
+ * struct hmm - HMM per mm struct
+ *
+ * @mm: mm struct this HMM struct is bound to
+ * @lock: lock protecting ranges list
+ * @ranges: list of range being snapshotted
+ * @mirrors: list of mirrors for this mm
+ * @mmu_notifier: mmu notifier to track updates to CPU page table
+ * @mirrors_sem: read/write semaphore protecting the mirrors list
+ * @wq: wait queue for user waiting on a range invalidation
+ * @notifiers: count of active mmu notifiers
+ * @dead: is the mm dead?
+ */
+struct hmm {
+ struct mm_struct *mm;
+ struct kref kref;
+ struct mutex lock;
+ struct list_head ranges;
+ struct list_head mirrors;
+ struct mmu_notifier mmu_notifier;
+ struct rw_semaphore mirrors_sem;
+ wait_queue_head_t wq;
+ long notifiers;
+ bool dead;
+};
/*
* hmm_pfn_flag_e - HMM flag enums
@@ -131,6 +157,7 @@ enum hmm_pfn_value_e {
/*
* struct hmm_range - track invalidation lock on virtual address range
*
+ * @hmm: the core HMM structure this range is active against
* @vma: the vm area struct for the range
* @list: all range lock are on a list
* @start: range virtual start address (inclusive)
@@ -138,10 +165,13 @@ enum hmm_pfn_value_e {
* @pfns: array of pfns (big enough for the range)
* @flags: pfn flags to match device driver page table
* @values: pfn value for some special case (none, special, error, ...)
+ * @default_flags: default flags for the range (write, read, ...)
+ * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter
* @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT)
+ * @valid: pfns array did not change since it has been filled by an HMM function
*/
struct hmm_range {
+ struct hmm *hmm;
struct vm_area_struct *vma;
struct list_head list;
unsigned long start;
@@ -149,11 +179,66 @@ struct hmm_range {
uint64_t *pfns;
const uint64_t *flags;
const uint64_t *values;
+ uint64_t default_flags;
+ uint64_t pfn_flags_mask;
+ uint8_t page_shift;
uint8_t pfn_shift;
bool valid;
};
/*
+ * hmm_range_page_shift() - return the page shift for the range
+ * @range: range being queried
+ * Returns: page shift (page size = 1 << page shift) for the range
+ */
+static inline unsigned hmm_range_page_shift(const struct hmm_range *range)
+{
+ return range->page_shift;
+}
+
+/*
+ * hmm_range_page_size() - return the page size for the range
+ * @range: range being queried
+ * Returns: page size for the range in bytes
+ */
+static inline unsigned long hmm_range_page_size(const struct hmm_range *range)
+{
+ return 1UL << hmm_range_page_shift(range);
+}
+
+/*
+ * hmm_range_wait_until_valid() - wait for range to be valid
+ * @range: range affected by invalidation to wait on
+ * @timeout: timeout for the wait in ms (i.e. abort the wait after that period of time)
+ * Returns: true if the range is valid, false otherwise.
+ */
+static inline bool hmm_range_wait_until_valid(struct hmm_range *range,
+ unsigned long timeout)
+{
+ /* Check whether the mm is dead. */
+ if (range->hmm == NULL || range->hmm->dead || range->hmm->mm == NULL) {
+ range->valid = false;
+ return false;
+ }
+ if (range->valid)
+ return true;
+ wait_event_timeout(range->hmm->wq, range->valid || range->hmm->dead,
+ msecs_to_jiffies(timeout));
+ /* Return current valid status just in case we get lucky */
+ return range->valid;
+}
+
+/*
+ * hmm_range_valid() - test if a range is valid or not
+ * @range: range
+ * Returns: true if the range is valid, false otherwise.
+ */
+static inline bool hmm_range_valid(struct hmm_range *range)
+{
+ return range->valid;
+}
+
+/*
* hmm_pfn_to_page() - return struct page pointed to by a valid HMM pfn
* @range: range use to decode HMM pfn value
* @pfn: HMM pfn value to get corresponding struct page from
@@ -353,43 +438,194 @@ struct hmm_mirror {
int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm);
void hmm_mirror_unregister(struct hmm_mirror *mirror);
+/*
+ * hmm_mirror_mm_down_read() - lock the mmap_sem in read mode
+ * @mirror: the HMM mm mirror for which we want to lock the mmap_sem
+ * Returns: -EINVAL if the mm is dead, 0 otherwise (lock taken).
+ *
+ * The device driver context which holds a reference to the mirror, and thus to
+ * the core hmm struct, might outlive the mm against which it was created. To
+ * avoid having every driver check for that case, provide a helper that checks
+ * whether the mm is still alive and, if so, takes the mmap_sem in read mode.
+ * If the mm has been destroyed (the mmu_notifier release callback has run) we
+ * return -EINVAL so that the calling code knows it is trying to operate
+ * against an mm that is no longer valid.
+ */
+static inline int hmm_mirror_mm_down_read(struct hmm_mirror *mirror)
+{
+ struct mm_struct *mm;
+
+ /* Sanity check ... */
+ if (!mirror || !mirror->hmm)
+ return -EINVAL;
+ /*
+ * Before trying to take the mmap_sem make sure the mm is still
+ * alive as device driver context might outlive the mm lifetime.
+ *
+ * FIXME: should we also check for an mm that outlives its owning
+ * task?
+ */
+ mm = READ_ONCE(mirror->hmm->mm);
+ if (mirror->hmm->dead || !mm)
+ return -EINVAL;
+
+ down_read(&mm->mmap_sem);
+ return 0;
+}
/*
- * To snapshot the CPU page table, call hmm_vma_get_pfns(), then take a device
- * driver lock that serializes device page table updates, then call
- * hmm_vma_range_done(), to check if the snapshot is still valid. The same
- * device driver page table update lock must also be used in the
- * hmm_mirror_ops.sync_cpu_device_pagetables() callback, so that CPU page
- * table invalidation serializes on it.
- *
- * YOU MUST CALL hmm_vma_range_done() ONCE AND ONLY ONCE EACH TIME YOU CALL
- * hmm_vma_get_pfns() WITHOUT ERROR !
- *
- * IF YOU DO NOT FOLLOW THE ABOVE RULE THE SNAPSHOT CONTENT MIGHT BE INVALID !
+ * hmm_mirror_mm_up_read() - unlock the mmap_sem from read mode
+ * @mirror: the HMM mm mirror for which we want to unlock the mmap_sem
*/
-int hmm_vma_get_pfns(struct hmm_range *range);
-bool hmm_vma_range_done(struct hmm_range *range);
+static inline void hmm_mirror_mm_up_read(struct hmm_mirror *mirror)
+{
+ up_read(&mirror->hmm->mm->mmap_sem);
+}
/*
- * Fault memory on behalf of device driver. Unlike handle_mm_fault(), this will
- * not migrate any device memory back to system memory. The HMM pfn array will
- * be updated with the fault result and current snapshot of the CPU page table
- * for the range.
+ * To snapshot the CPU page table you first have to call hmm_range_register()
+ * to register the range. If hmm_range_register() returns an error then
+ * something is horribly wrong and you should fail loudly. If it succeeds you
+ * can wait for the range to be stable with the hmm_range_wait_until_valid()
+ * function; a range is valid when there are no concurrent changes to the CPU
+ * page table for the range.
+ *
+ * Once the range is valid you can call hmm_range_snapshot(); if that returns
+ * without error then you can take your device page table lock (the same lock
+ * you use in the HMM mirror sync_cpu_device_pagetables() callback). After
+ * taking that lock you have to check the range validity; if it is still valid
+ * (i.e. hmm_range_valid() returns true) then you can program the device page
+ * table, otherwise you have to start again. Pseudo code:
+ *
+ * mydevice_prefault(mydevice, mm, start, end)
+ * {
+ * struct hmm_range range;
+ * ...
+ *
+ * ret = hmm_range_register(&range, mm, start, end, page_shift);
+ * if (ret)
+ * return ret;
+ *
+ * hmm_mirror_mm_down_read(mirror);
+ * again:
+ *
+ * if (!hmm_range_wait_until_valid(&range, TIMEOUT)) {
+ * up_read(&mm->mmap_sem);
+ * hmm_range_unregister(range);
+ * // Handle timeout, either sleep or retry or something else
+ * ...
+ * return -ESOMETHING; || goto again;
+ * }
+ *
+ * ret = hmm_range_snapshot(&range); or hmm_range_fault(&range);
+ * if (ret == -EAGAIN) {
+ * hmm_mirror_mm_down_read(mirror);
+ * goto again;
+ * } else if (ret == -EBUSY) {
+ * goto again;
+ * }
+ *
+ * hmm_mirror_mm_up_read(mirror);
+ * if (ret) {
+ * hmm_range_unregister(range);
+ * return ret;
+ * }
+ *
+ * // It might not have snapshotted the whole range but only the first
+ * // npages; the return value is the number of valid pages from the
+ * // start of the range.
+ * npages = ret;
*
- * The mmap_sem must be taken in read mode before entering and it might be
- * dropped by the function if the block argument is false. In that case, the
- * function returns -EAGAIN.
+ * ...
*
- * Return value does not reflect if the fault was successful for every single
- * address or not. Therefore, the caller must to inspect the HMM pfn array to
- * determine fault status for each address.
+ * mydevice_page_table_lock(mydevice);
+ * if (!hmm_range_valid(range)) {
+ * mydevice_page_table_unlock(mydevice);
+ * goto again;
+ * }
*
- * Trying to fault inside an invalid vma will result in -EINVAL.
+ * mydevice_populate_page_table(mydevice, range, npages);
+ * ...
+ * mydevice_page_table_unlock(mydevice);
+ * hmm_range_unregister(range);
+ *
+ * return 0;
+ * }
*
- * See the function description in mm/hmm.c for further documentation.
+ * The same scheme applies to hmm_range_fault() (i.e. replace
+ * hmm_range_snapshot() with hmm_range_fault() in the above pseudo code).
+ *
+ * YOU MUST CALL hmm_range_unregister() ONCE AND ONLY ONCE EACH TIME YOU CALL
+ * hmm_range_register() AND hmm_range_register() SUCCEEDED! IF YOU DO NOT
+ * FOLLOW THIS RULE MEMORY CORRUPTION WILL ENSUE!
*/
-int hmm_vma_fault(struct hmm_range *range, bool block);
+int hmm_range_register(struct hmm_range *range,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end,
+ unsigned page_shift);
+void hmm_range_unregister(struct hmm_range *range);
+long hmm_range_snapshot(struct hmm_range *range);
+long hmm_range_fault(struct hmm_range *range, bool block);
+long hmm_range_dma_map(struct hmm_range *range,
+ struct device *device,
+ dma_addr_t *daddrs,
+ bool block);
+long hmm_range_dma_unmap(struct hmm_range *range,
+ struct vm_area_struct *vma,
+ struct device *device,
+ dma_addr_t *daddrs,
+ bool dirty);
+
+/*
+ * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range
+ *
+ * When waiting for mmu notifiers we need some kind of timeout, otherwise we
+ * could potentially wait forever; 1000ms (i.e. 1s) already sounds like a long
+ * time to wait.
+ */
+#define HMM_RANGE_DEFAULT_TIMEOUT 1000
+
+/* This is a temporary helper to avoid merge conflict between trees. */
+static inline bool hmm_vma_range_done(struct hmm_range *range)
+{
+ bool ret = hmm_range_valid(range);
+
+ hmm_range_unregister(range);
+ return ret;
+}
+
+static inline int hmm_vma_fault(struct hmm_range *range, bool block)
+{
+ long ret;
+
+ range->default_flags = 0;
+ range->pfn_flags_mask = -1UL;
+
+ ret = hmm_range_register(range, range->vma->vm_mm,
+ range->start, range->end,
+ PAGE_SHIFT);
+ if (ret)
+ return (int)ret;
+
+ if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) {
+ up_read(&range->vma->vm_mm->mmap_sem);
+ return -EAGAIN;
+ }
+
+ ret = hmm_range_fault(range, block);
+ if (ret <= 0) {
+ if (ret == -EBUSY || !ret) {
+ up_read(&range->vma->vm_mm->mmap_sem);
+ ret = -EBUSY;
+ } else if (ret == -EAGAIN)
+ ret = -EBUSY;
+ hmm_range_unregister(range);
+ return ret;
+ }
+ return 0;
+}
/* Below are for HMM internal use only! Not to be used by device driver! */
void hmm_mm_destroy(struct mm_struct *mm);
@@ -468,7 +704,7 @@ struct hmm_devmem_ops {
* Note that mmap semaphore is held in read mode at least when this
* callback occurs, hence the vma is valid upon callback entry.
*/
- int (*fault)(struct hmm_devmem *devmem,
+ vm_fault_t (*fault)(struct hmm_devmem *devmem,
struct vm_area_struct *vma,
unsigned long addr,
const struct page *page,
@@ -511,7 +747,7 @@ struct hmm_devmem_ops {
* chunk, as an optimization. It must, however, prioritize the faulting address
* over all the others.
*/
-typedef int (*dev_page_fault_t)(struct vm_area_struct *vma,
+typedef vm_fault_t (*dev_page_fault_t)(struct vm_area_struct *vma,
unsigned long addr,
const struct page *page,
unsigned int flags,
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 087fd5f48c91..ea35263eb76b 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -371,6 +371,8 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
nodemask_t *nmask);
struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
unsigned long address);
+struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nmask);
int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
pgoff_t idx);
@@ -493,17 +495,54 @@ static inline pgoff_t basepage_index(struct page *page)
extern int dissolve_free_huge_page(struct page *page);
extern int dissolve_free_huge_pages(unsigned long start_pfn,
unsigned long end_pfn);
-static inline bool hugepage_migration_supported(struct hstate *h)
-{
+
#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
+#ifndef arch_hugetlb_migration_supported
+static inline bool arch_hugetlb_migration_supported(struct hstate *h)
+{
if ((huge_page_shift(h) == PMD_SHIFT) ||
- (huge_page_shift(h) == PGDIR_SHIFT))
+ (huge_page_shift(h) == PUD_SHIFT) ||
+ (huge_page_shift(h) == PGDIR_SHIFT))
return true;
else
return false;
+}
+#endif
#else
+static inline bool arch_hugetlb_migration_supported(struct hstate *h)
+{
return false;
+}
#endif
+
+static inline bool hugepage_migration_supported(struct hstate *h)
+{
+ return arch_hugetlb_migration_supported(h);
+}
+
+/*
+ * The movability check is different from the migration check.
+ * It determines whether or not a huge page should be placed in
+ * a movable zone. Movability of any huge page should be
+ * required only if the huge page size is supported for migration.
+ * There won't be any reason for the huge page to be movable if
+ * it is not migratable to start with. Also the size of the huge
+ * page should be large enough to be placed under a movable zone
+ * and still feasible enough to be migratable. Just the presence
+ * in a movable zone does not make the migration feasible.
+ *
+ * So even though large huge page sizes like the gigantic ones
+ * are migratable, they should not be movable because it's not
+ * feasible to migrate them out of a movable zone.
+ */
+static inline bool hugepage_movable_supported(struct hstate *h)
+{
+ if (!hugepage_migration_supported(h))
+ return false;
+
+ if (hstate_is_gigantic(h))
+ return false;
+ return true;
}
static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
@@ -543,6 +582,26 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr
set_huge_pte_at(mm, addr, ptep, pte);
}
#endif
+
+#ifndef huge_ptep_modify_prot_start
+#define huge_ptep_modify_prot_start huge_ptep_modify_prot_start
+static inline pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep);
+}
+#endif
+
+#ifndef huge_ptep_modify_prot_commit
+#define huge_ptep_modify_prot_commit huge_ptep_modify_prot_commit
+static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t old_pte, pte_t pte)
+{
+ set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
+}
+#endif
+
#else /* CONFIG_HUGETLB_PAGE */
struct hstate {};
#define alloc_huge_page(v, a, r) NULL
@@ -602,6 +661,11 @@ static inline bool hugepage_migration_supported(struct hstate *h)
return false;
}
+static inline bool hugepage_movable_supported(struct hstate *h)
+{
+ return false;
+}
+
static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
struct mm_struct *mm, pte_t *pte)
{
diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index 6ab8c1bada3f..7d5f553a3b24 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -16,6 +16,7 @@ struct user_namespace;
struct ipc_ids {
int in_use;
unsigned short seq;
+ unsigned short deleted;
struct rw_semaphore rwsem;
struct idr ipcs_idr;
int max_idx;
diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h
index d314150658a4..a61dc075e2ce 100644
--- a/include/linux/kasan-checks.h
+++ b/include/linux/kasan-checks.h
@@ -2,7 +2,7 @@
#ifndef _LINUX_KASAN_CHECKS_H
#define _LINUX_KASAN_CHECKS_H
-#ifdef CONFIG_KASAN
+#if defined(__SANITIZE_ADDRESS__) || defined(__KASAN_INTERNAL)
void kasan_check_read(const volatile void *p, unsigned int size);
void kasan_check_write(const volatile void *p, unsigned int size);
#else
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 8f0e68e250a7..b4776cdee6ca 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -4,6 +4,7 @@
#include <stdarg.h>
+#include <linux/limits.h>
#include <linux/linkage.h>
#include <linux/stddef.h>
#include <linux/types.h>
@@ -14,36 +15,9 @@
#include <linux/printk.h>
#include <linux/build_bug.h>
#include <asm/byteorder.h>
+#include <asm/div64.h>
#include <uapi/linux/kernel.h>
-#define USHRT_MAX ((u16)(~0U))
-#define SHRT_MAX ((s16)(USHRT_MAX>>1))
-#define SHRT_MIN ((s16)(-SHRT_MAX - 1))
-#define INT_MAX ((int)(~0U>>1))
-#define INT_MIN (-INT_MAX - 1)
-#define UINT_MAX (~0U)
-#define LONG_MAX ((long)(~0UL>>1))
-#define LONG_MIN (-LONG_MAX - 1)
-#define ULONG_MAX (~0UL)
-#define LLONG_MAX ((long long)(~0ULL>>1))
-#define LLONG_MIN (-LLONG_MAX - 1)
-#define ULLONG_MAX (~0ULL)
-#define SIZE_MAX (~(size_t)0)
-#define PHYS_ADDR_MAX (~(phys_addr_t)0)
-
-#define U8_MAX ((u8)~0U)
-#define S8_MAX ((s8)(U8_MAX>>1))
-#define S8_MIN ((s8)(-S8_MAX - 1))
-#define U16_MAX ((u16)~0U)
-#define S16_MAX ((s16)(U16_MAX>>1))
-#define S16_MIN ((s16)(-S16_MAX - 1))
-#define U32_MAX ((u32)~0U)
-#define S32_MAX ((s32)(U32_MAX>>1))
-#define S32_MIN ((s32)(-S32_MAX - 1))
-#define U64_MAX ((u64)~0ULL)
-#define S64_MAX ((s64)(U64_MAX>>1))
-#define S64_MIN ((s64)(-S64_MAX - 1))
-
#define STACK_MAGIC 0xdeadbeef
/**
@@ -204,7 +178,6 @@
#define _THIS_IP_ ({ __label__ __here; __here: (unsigned long)&&__here; })
#ifdef CONFIG_LBDAF
-# include <asm/div64.h>
# define sector_div(a, b) do_div(a, b)
#else
# define sector_div(n, b)( \
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 822a64e65b41..14d07d38e8f2 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -27,7 +27,8 @@ struct super_block;
struct file_system_type;
struct fs_context;
-struct kernfs_fs_context;
+struct poll_table_struct;
+
struct kernfs_open_node;
struct kernfs_iattrs;
@@ -262,6 +263,9 @@ struct kernfs_ops {
ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t bytes,
loff_t off);
+ __poll_t (*poll)(struct kernfs_open_file *of,
+ struct poll_table_struct *pt);
+
int (*mmap)(struct kernfs_open_file *of, struct vm_area_struct *vma);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -363,6 +367,8 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
const char *new_name, const void *new_ns);
int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr);
+__poll_t kernfs_generic_poll(struct kernfs_open_file *of,
+ struct poll_table_struct *pt);
void kernfs_notify(struct kernfs_node *kn);
const void *kernfs_super_ns(struct super_block *sb);
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 161e8164abcf..e48b1e453ff5 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -53,6 +53,8 @@ struct page *ksm_might_need_to_copy(struct page *page,
void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc);
void ksm_migrate_page(struct page *newpage, struct page *oldpage);
+bool reuse_ksm_page(struct page *page,
+ struct vm_area_struct *vma, unsigned long address);
#else /* !CONFIG_KSM */
@@ -86,6 +88,11 @@ static inline void rmap_walk_ksm(struct page *page,
static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage)
{
}
+static inline bool reuse_ksm_page(struct page *page,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ return false;
+}
#endif /* CONFIG_MMU */
#endif /* !CONFIG_KSM */
diff --git a/include/linux/limits.h b/include/linux/limits.h
new file mode 100644
index 000000000000..76afcd24ff8c
--- /dev/null
+++ b/include/linux/limits.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_LIMITS_H
+#define _LINUX_LIMITS_H
+
+#include <uapi/linux/limits.h>
+#include <linux/types.h>
+
+#define USHRT_MAX ((unsigned short)~0U)
+#define SHRT_MAX ((short)(USHRT_MAX >> 1))
+#define SHRT_MIN ((short)(-SHRT_MAX - 1))
+#define INT_MAX ((int)(~0U >> 1))
+#define INT_MIN (-INT_MAX - 1)
+#define UINT_MAX (~0U)
+#define LONG_MAX ((long)(~0UL >> 1))
+#define LONG_MIN (-LONG_MAX - 1)
+#define ULONG_MAX (~0UL)
+#define LLONG_MAX ((long long)(~0ULL >> 1))
+#define LLONG_MIN (-LLONG_MAX - 1)
+#define ULLONG_MAX (~0ULL)
+#define SIZE_MAX (~(size_t)0)
+#define PHYS_ADDR_MAX (~(phys_addr_t)0)
+
+#define U8_MAX ((u8)~0U)
+#define S8_MAX ((s8)(U8_MAX >> 1))
+#define S8_MIN ((s8)(-S8_MAX - 1))
+#define U16_MAX ((u16)~0U)
+#define S16_MAX ((s16)(U16_MAX >> 1))
+#define S16_MIN ((s16)(-S16_MAX - 1))
+#define U32_MAX ((u32)~0U)
+#define S32_MAX ((s32)(U32_MAX >> 1))
+#define S32_MIN ((s32)(-S32_MAX - 1))
+#define U64_MAX ((u64)~0ULL)
+#define S64_MAX ((s64)(U64_MAX >> 1))
+#define S64_MIN ((s64)(-S64_MAX - 1))
+
+#endif /* _LINUX_LIMITS_H */
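
A quick, hedged sanity check of the bit arithmetic behind these definitions, compared against the C library's own limits (assuming the usual 32-bit int / 64-bit long long ABI): shifting an all-ones unsigned value right by one clears the sign bit, which is exactly the maximum of the corresponding signed type, and negating that and subtracting one gives the minimum.

        #include <assert.h>
        #include <limits.h>     /* the C library's INT_MAX etc., for comparison */
        #include <stdio.h>

        int main(void)
        {
                assert((int)(~0U >> 1) == INT_MAX);             /* same trick as above */
                assert((-(int)(~0U >> 1) - 1) == INT_MIN);
                assert((long long)(~0ULL >> 1) == LLONG_MAX);
                printf("INT_MAX=%d INT_MIN=%d\n",
                       (int)(~0U >> 1), -(int)(~0U >> 1) - 1);
                return 0;
        }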
diff --git a/include/linux/list.h b/include/linux/list.h
index edb7628e46ed..9a2b3913d99c 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -151,6 +151,23 @@ static inline void list_replace_init(struct list_head *old,
}
/**
+ * list_swap - replace entry1 with entry2 and re-add entry1 at entry2's position
+ * @entry1: the location to place entry2
+ * @entry2: the location to place entry1
+ */
+static inline void list_swap(struct list_head *entry1,
+ struct list_head *entry2)
+{
+ struct list_head *pos = entry2->prev;
+
+ list_del(entry2);
+ list_replace(entry1, entry2);
+ if (pos == entry1)
+ pos = entry2;
+ list_add(entry1, pos);
+}
+
+/**
* list_del_init - deletes entry from list and reinitialize it.
* @entry: the element to delete from the list.
*/
@@ -207,6 +224,17 @@ static inline void list_bulk_move_tail(struct list_head *head,
}
/**
+ * list_is_first - tests whether @list is the first entry in list @head
+ * @list: the entry to test
+ * @head: the head of the list
+ */
+static inline int list_is_first(const struct list_head *list,
+ const struct list_head *head)
+{
+ return list->prev == head;
+}
+
+/**
* list_is_last - tests whether @list is the last entry in list @head
* @list: the entry to test
* @head: the head of the list
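
To see what the new list_swap() helper does, here is a hedged, self-contained userspace sketch that re-creates only the list primitives it needs; struct item, the demo values and the hand-rolled container_of arithmetic are invented for the example and are not kernel code.

        #include <stddef.h>
        #include <stdio.h>

        struct list_head { struct list_head *next, *prev; };

        static void __list_add(struct list_head *new,
                               struct list_head *prev, struct list_head *next)
        {
                next->prev = new;
                new->next = next;
                new->prev = prev;
                prev->next = new;
        }

        static void list_add(struct list_head *new, struct list_head *head)
        {
                __list_add(new, head, head->next);
        }

        static void list_add_tail(struct list_head *new, struct list_head *head)
        {
                __list_add(new, head->prev, head);
        }

        static void list_del(struct list_head *entry)
        {
                entry->prev->next = entry->next;
                entry->next->prev = entry->prev;
        }

        static void list_replace(struct list_head *old, struct list_head *new)
        {
                new->next = old->next;
                new->next->prev = new;
                new->prev = old->prev;
                new->prev->next = new;
        }

        /* Same body as the list_swap() helper added in the hunk above. */
        static void list_swap(struct list_head *entry1, struct list_head *entry2)
        {
                struct list_head *pos = entry2->prev;

                list_del(entry2);
                list_replace(entry1, entry2);
                if (pos == entry1)
                        pos = entry2;
                list_add(entry1, pos);
        }

        struct item { int val; struct list_head node; };

        int main(void)
        {
                struct list_head head = { &head, &head };
                struct item a = { .val = 1 }, b = { .val = 2 }, c = { .val = 3 };
                struct list_head *p;

                list_add_tail(&a.node, &head);
                list_add_tail(&b.node, &head);
                list_add_tail(&c.node, &head);

                list_swap(&a.node, &c.node);    /* 1,2,3 becomes 3,2,1 */

                for (p = head.next; p != &head; p = p->next) {
                        struct item *it = (struct item *)((char *)p -
                                                offsetof(struct item, node));
                        printf("%d\n", it->val);
                }
                return 0;
        }

Swapping the first and last entries of the 1,2,3 list prints 3, 2, 1 and leaves the middle entry in place; the if (pos == entry1) branch handles the adjacent-entry case, e.g. list_swap(&a.node, &b.node) on a fresh list yields 2, 1, 3.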
diff --git a/include/linux/lzo.h b/include/linux/lzo.h
index 2ae27cb89927..e95c7d1092b2 100644
--- a/include/linux/lzo.h
+++ b/include/linux/lzo.h
@@ -18,12 +18,16 @@
#define LZO1X_1_MEM_COMPRESS (8192 * sizeof(unsigned short))
#define LZO1X_MEM_COMPRESS LZO1X_1_MEM_COMPRESS
-#define lzo1x_worst_compress(x) ((x) + ((x) / 16) + 64 + 3)
+#define lzo1x_worst_compress(x) ((x) + ((x) / 16) + 64 + 3 + 2)
/* This requires 'wrkmem' of size LZO1X_1_MEM_COMPRESS */
int lzo1x_1_compress(const unsigned char *src, size_t src_len,
unsigned char *dst, size_t *dst_len, void *wrkmem);
+/* This requires 'wrkmem' of size LZO1X_1_MEM_COMPRESS */
+int lzorle1x_1_compress(const unsigned char *src, size_t src_len,
+ unsigned char *dst, size_t *dst_len, void *wrkmem);
+
/* safe decompression with overrun testing */
int lzo1x_decompress_safe(const unsigned char *src, size_t src_len,
unsigned char *dst, size_t *dst_len);
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 859b55b66db2..eb9ac119a875 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -244,6 +244,47 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid))
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
+ unsigned long *out_spfn,
+ unsigned long *out_epfn);
+/**
+ * for_each_free_mem_pfn_range_in_zone - iterate through zone specific free
+ * memblock areas
+ * @i: u64 used as loop variable
+ * @zone: zone in which all of the memory blocks reside
+ * @p_start: ptr to ulong for start pfn of the range, can be %NULL
+ * @p_end: ptr to ulong for end pfn of the range, can be %NULL
+ *
+ * Walks over free (memory && !reserved) areas of memblock in a specific
+ * zone. Available once memblock and an empty zone are initialized. The main
+ * assumption is that the zone start, end, and pgdat have been associated.
+ * This way we can use the zone to determine the NUMA node, and whether a given
+ * part of the memblock is valid for the zone.
+ */
+#define for_each_free_mem_pfn_range_in_zone(i, zone, p_start, p_end) \
+ for (i = 0, \
+ __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end); \
+ i != U64_MAX; \
+ __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end))
+
+/**
+ * for_each_free_mem_pfn_range_in_zone_from - iterate through zone specific
+ * free memblock areas from a given point
+ * @i: u64 used as loop variable
+ * @zone: zone in which all of the memory blocks reside
+ * @p_start: ptr to ulong for start pfn of the range, can be %NULL
+ * @p_end: ptr to ulong for end pfn of the range, can be %NULL
+ *
+ * Walks over free (memory && !reserved) areas of memblock in a specific
+ * zone, continuing from current position. Available as soon as memblock is
+ * initialized.
+ */
+#define for_each_free_mem_pfn_range_in_zone_from(i, zone, p_start, p_end) \
+ for (; i != U64_MAX; \
+ __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end))
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+
/**
* for_each_free_mem_range - iterate through free memblock areas
* @i: u64 used as loop variable
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 83ae11cbd12c..1803abfd7c00 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -333,6 +333,19 @@ static inline bool mem_cgroup_disabled(void)
return !cgroup_subsys_enabled(memory_cgrp_subsys);
}
+static inline void mem_cgroup_protection(struct mem_cgroup *memcg,
+ unsigned long *min, unsigned long *low)
+{
+ if (mem_cgroup_disabled()) {
+ *min = 0;
+ *low = 0;
+ return;
+ }
+
+ *min = READ_ONCE(memcg->memory.emin);
+ *low = READ_ONCE(memcg->memory.elow);
+}
+
enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
struct mem_cgroup *memcg);
@@ -429,6 +442,11 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
}
struct mem_cgroup *mem_cgroup_from_id(unsigned short id);
+static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m)
+{
+ return mem_cgroup_from_css(seq_css(m));
+}
+
static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec)
{
struct mem_cgroup_per_node *mz;
@@ -526,6 +544,8 @@ void mem_cgroup_handle_over_high(void);
unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);
+unsigned long mem_cgroup_size(struct mem_cgroup *memcg);
+
void mem_cgroup_print_oom_context(struct mem_cgroup *memcg,
struct task_struct *p);
@@ -819,6 +839,13 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm,
{
}
+static inline void mem_cgroup_protection(struct mem_cgroup *memcg,
+ unsigned long *min, unsigned long *low)
+{
+ *min = 0;
+ *low = 0;
+}
+
static inline enum mem_cgroup_protection mem_cgroup_protected(
struct mem_cgroup *root, struct mem_cgroup *memcg)
{
@@ -937,6 +964,11 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
return NULL;
}
+static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m)
+{
+ return NULL;
+}
+
static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec)
{
return NULL;
@@ -971,6 +1003,11 @@ static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
return 0;
}
+static inline unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
+{
+ return 0;
+}
+
static inline void
mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
{
@@ -1273,12 +1310,12 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep);
void memcg_kmem_put_cache(struct kmem_cache *cachep);
-int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
- struct mem_cgroup *memcg);
#ifdef CONFIG_MEMCG_KMEM
-int memcg_kmem_charge(struct page *page, gfp_t gfp, int order);
-void memcg_kmem_uncharge(struct page *page, int order);
+int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order);
+void __memcg_kmem_uncharge(struct page *page, int order);
+int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
+ struct mem_cgroup *memcg);
extern struct static_key_false memcg_kmem_enabled_key;
extern struct workqueue_struct *memcg_kmem_cache_wq;
@@ -1300,6 +1337,26 @@ static inline bool memcg_kmem_enabled(void)
return static_branch_unlikely(&memcg_kmem_enabled_key);
}
+static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
+{
+ if (memcg_kmem_enabled())
+ return __memcg_kmem_charge(page, gfp, order);
+ return 0;
+}
+
+static inline void memcg_kmem_uncharge(struct page *page, int order)
+{
+ if (memcg_kmem_enabled())
+ __memcg_kmem_uncharge(page, order);
+}
+
+static inline int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp,
+ int order, struct mem_cgroup *memcg)
+{
+ if (memcg_kmem_enabled())
+ return __memcg_kmem_charge_memcg(page, gfp, order, memcg);
+ return 0;
+}
/*
* helper for accessing a memcg's index. It will be used as an index in the
* child cache array in kmem_cache, and also to derive its name. This function
@@ -1325,6 +1382,15 @@ static inline void memcg_kmem_uncharge(struct page *page, int order)
{
}
+static inline int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
+{
+ return 0;
+}
+
+static inline void __memcg_kmem_uncharge(struct page *page, int order)
+{
+}
+
#define for_each_memcg_cache_index(_idx) \
for (; NULL; )
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index cfd12078172a..8ade08c50d26 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -89,7 +89,7 @@ extern int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
unsigned long *valid_start, unsigned long *valid_end);
extern void __offline_isolated_pages(unsigned long, unsigned long);
-typedef void (*online_page_callback_t)(struct page *page);
+typedef void (*online_page_callback_t)(struct page *page, unsigned int order);
extern int set_online_page_callback(online_page_callback_t callback);
extern int restore_online_page_callback(online_page_callback_t callback);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 80bb6408fe73..154015295554 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -123,10 +123,45 @@ extern int mmap_rnd_compat_bits __read_mostly;
/*
* On some architectures it is expensive to call memset() for small sizes.
- * Those architectures should provide their own implementation of "struct page"
- * zeroing by defining this macro in <asm/pgtable.h>.
+ * If an architecture decides to implement its own version of
+ * mm_zero_struct_page, it should wrap the defines below in an #ifndef and
+ * define its own version of this macro in <asm/pgtable.h>.
*/
-#ifndef mm_zero_struct_page
+#if BITS_PER_LONG == 64
+/* This function must be updated when the size of struct page grows above 80
+ * or shrinks below 56. The idea is that the compiler optimizes out the
+ * switch() statement and only leaves move/store instructions. The compiler can
+ * also combine write statements if they are both assignments and can be
+ * reordered, which can result in several of the writes here being dropped.
+ */
+#define mm_zero_struct_page(pp) __mm_zero_struct_page(pp)
+static inline void __mm_zero_struct_page(struct page *page)
+{
+ unsigned long *_pp = (void *)page;
+
+ /* Check that struct page is either 56, 64, 72, or 80 bytes */
+ BUILD_BUG_ON(sizeof(struct page) & 7);
+ BUILD_BUG_ON(sizeof(struct page) < 56);
+ BUILD_BUG_ON(sizeof(struct page) > 80);
+
+ switch (sizeof(struct page)) {
+ case 80:
+ _pp[9] = 0; /* fallthrough */
+ case 72:
+ _pp[8] = 0; /* fallthrough */
+ case 64:
+ _pp[7] = 0; /* fallthrough */
+ case 56:
+ _pp[6] = 0;
+ _pp[5] = 0;
+ _pp[4] = 0;
+ _pp[3] = 0;
+ _pp[2] = 0;
+ _pp[1] = 0;
+ _pp[0] = 0;
+ }
+}
+#else
#define mm_zero_struct_page(pp) ((void)memset((pp), 0, sizeof(struct page)))
#endif
@@ -500,9 +535,6 @@ static inline void vma_set_anonymous(struct vm_area_struct *vma)
struct mmu_gather;
struct inode;
-#define page_private(page) ((page)->private)
-#define set_page_private(page, v) ((page)->private = (v))
-
#if !defined(__HAVE_ARCH_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE)
static inline int pmd_devmap(pmd_t pmd)
{
@@ -1192,11 +1224,18 @@ static inline void set_page_node(struct page *page, unsigned long node)
page->flags |= (node & NODES_MASK) << NODES_PGSHIFT;
}
+static inline void set_page_reserved(struct page *page, bool reserved)
+{
+ page->flags &= ~(1ul << PG_reserved);
+ page->flags |= (unsigned long)(!!reserved) << PG_reserved;
+}
+
static inline void set_page_links(struct page *page, enum zone_type zone,
- unsigned long node, unsigned long pfn)
+ unsigned long node, unsigned long pfn, bool reserved)
{
set_page_zone(page, zone);
set_page_node(page, node);
+ set_page_reserved(page, reserved);
#ifdef SECTION_IN_PAGE_FLAGS
set_page_section(page, pfn_to_section_nr(pfn));
#endif
@@ -1323,52 +1362,6 @@ static inline void clear_page_pfmemalloc(struct page *page)
}
/*
- * Different kinds of faults, as returned by handle_mm_fault().
- * Used to decide whether a process gets delivered SIGBUS or
- * just gets major/minor fault counters bumped up.
- */
-
-#define VM_FAULT_OOM 0x0001
-#define VM_FAULT_SIGBUS 0x0002
-#define VM_FAULT_MAJOR 0x0004
-#define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */
-#define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned small page */
-#define VM_FAULT_HWPOISON_LARGE 0x0020 /* Hit poisoned large page. Index encoded in upper bits */
-#define VM_FAULT_SIGSEGV 0x0040
-
-#define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */
-#define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */
-#define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */
-#define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */
-#define VM_FAULT_DONE_COW 0x1000 /* ->fault has fully handled COW */
-#define VM_FAULT_NEEDDSYNC 0x2000 /* ->fault did not modify page tables
- * and needs fsync() to complete (for
- * synchronous page faults in DAX) */
-
-#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | \
- VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \
- VM_FAULT_FALLBACK)
-
-#define VM_FAULT_RESULT_TRACE \
- { VM_FAULT_OOM, "OOM" }, \
- { VM_FAULT_SIGBUS, "SIGBUS" }, \
- { VM_FAULT_MAJOR, "MAJOR" }, \
- { VM_FAULT_WRITE, "WRITE" }, \
- { VM_FAULT_HWPOISON, "HWPOISON" }, \
- { VM_FAULT_HWPOISON_LARGE, "HWPOISON_LARGE" }, \
- { VM_FAULT_SIGSEGV, "SIGSEGV" }, \
- { VM_FAULT_NOPAGE, "NOPAGE" }, \
- { VM_FAULT_LOCKED, "LOCKED" }, \
- { VM_FAULT_RETRY, "RETRY" }, \
- { VM_FAULT_FALLBACK, "FALLBACK" }, \
- { VM_FAULT_DONE_COW, "DONE_COW" }, \
- { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" }
-
-/* Encode hstate index for a hwpoisoned large page */
-#define VM_FAULT_SET_HINDEX(x) ((x) << 12)
-#define VM_FAULT_GET_HINDEX(x) (((x) >> 12) & 0xf)
-
-/*
* Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
*/
extern void pagefault_out_of_memory(void);
@@ -1536,7 +1529,8 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages, int *locked);
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags);
-#ifdef CONFIG_FS_DAX
+
+#if defined(CONFIG_FS_DAX) || defined(CONFIG_CMA)
long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 74a059661924..fe0f672f53ce 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -22,7 +22,6 @@
#endif
#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1))
-typedef int vm_fault_t;
struct address_space;
struct mem_cgroup;
@@ -221,6 +220,9 @@ struct page {
#define PAGE_FRAG_CACHE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK)
#define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE)
+#define page_private(page) ((page)->private)
+#define set_page_private(page, v) ((page)->private = (v))
+
struct page_frag_cache {
void * va;
#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
@@ -621,6 +623,78 @@ static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
struct vm_fault;
+/**
+ * typedef vm_fault_t - Return type for page fault handlers.
+ *
+ * Page fault handlers return a bitmask of %VM_FAULT values.
+ */
+typedef __bitwise unsigned int vm_fault_t;
+
+/**
+ * enum vm_fault_reason - Page fault handlers return a bitmask of
+ * these values to tell the core VM what happened when handling the
+ * fault. Used to decide whether a process gets delivered SIGBUS or
+ * just gets major/minor fault counters bumped up.
+ *
+ * @VM_FAULT_OOM: Out Of Memory
+ * @VM_FAULT_SIGBUS: Bad access
+ * @VM_FAULT_MAJOR: Page read from storage
+ * @VM_FAULT_WRITE: Special case for get_user_pages
+ * @VM_FAULT_HWPOISON: Hit poisoned small page
+ * @VM_FAULT_HWPOISON_LARGE: Hit poisoned large page. Index encoded
+ * in upper bits
+ * @VM_FAULT_SIGSEGV: segmentation fault
+ * @VM_FAULT_NOPAGE: ->fault installed the pte, not return page
+ * @VM_FAULT_LOCKED: ->fault locked the returned page
+ * @VM_FAULT_RETRY: ->fault blocked, must retry
+ * @VM_FAULT_FALLBACK: huge page fault failed, fall back to small
+ * @VM_FAULT_DONE_COW: ->fault has fully handled COW
+ * @VM_FAULT_NEEDDSYNC: ->fault did not modify page tables and needs
+ * fsync() to complete (for synchronous page faults
+ * in DAX)
+ * @VM_FAULT_HINDEX_MASK: mask HINDEX value
+ *
+ */
+enum vm_fault_reason {
+ VM_FAULT_OOM = (__force vm_fault_t)0x000001,
+ VM_FAULT_SIGBUS = (__force vm_fault_t)0x000002,
+ VM_FAULT_MAJOR = (__force vm_fault_t)0x000004,
+ VM_FAULT_WRITE = (__force vm_fault_t)0x000008,
+ VM_FAULT_HWPOISON = (__force vm_fault_t)0x000010,
+ VM_FAULT_HWPOISON_LARGE = (__force vm_fault_t)0x000020,
+ VM_FAULT_SIGSEGV = (__force vm_fault_t)0x000040,
+ VM_FAULT_NOPAGE = (__force vm_fault_t)0x000100,
+ VM_FAULT_LOCKED = (__force vm_fault_t)0x000200,
+ VM_FAULT_RETRY = (__force vm_fault_t)0x000400,
+ VM_FAULT_FALLBACK = (__force vm_fault_t)0x000800,
+ VM_FAULT_DONE_COW = (__force vm_fault_t)0x001000,
+ VM_FAULT_NEEDDSYNC = (__force vm_fault_t)0x002000,
+ VM_FAULT_HINDEX_MASK = (__force vm_fault_t)0x0f0000,
+};
+
+/* Encode hstate index for a hwpoisoned large page */
+#define VM_FAULT_SET_HINDEX(x) ((__force vm_fault_t)((x) << 16))
+#define VM_FAULT_GET_HINDEX(x) (((x) >> 16) & 0xf)
+
+#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | \
+ VM_FAULT_SIGSEGV | VM_FAULT_HWPOISON | \
+ VM_FAULT_HWPOISON_LARGE | VM_FAULT_FALLBACK)
+
+#define VM_FAULT_RESULT_TRACE \
+ { VM_FAULT_OOM, "OOM" }, \
+ { VM_FAULT_SIGBUS, "SIGBUS" }, \
+ { VM_FAULT_MAJOR, "MAJOR" }, \
+ { VM_FAULT_WRITE, "WRITE" }, \
+ { VM_FAULT_HWPOISON, "HWPOISON" }, \
+ { VM_FAULT_HWPOISON_LARGE, "HWPOISON_LARGE" }, \
+ { VM_FAULT_SIGSEGV, "SIGSEGV" }, \
+ { VM_FAULT_NOPAGE, "NOPAGE" }, \
+ { VM_FAULT_LOCKED, "LOCKED" }, \
+ { VM_FAULT_RETRY, "RETRY" }, \
+ { VM_FAULT_FALLBACK, "FALLBACK" }, \
+ { VM_FAULT_DONE_COW, "DONE_COW" }, \
+ { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" }
+
struct vm_special_mapping {
const char *name; /* The name, e.g. "[vdso]". */
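
The HINDEX helpers above pack an hstate index into bits 16..19 of the fault code. Below is a hedged userspace sketch of that packing; plain unsigned int stands in for vm_fault_t because the sparse __bitwise/__force annotations are dropped, and only the constants needed for the demo are reproduced.

        #include <assert.h>
        #include <stdio.h>

        typedef unsigned int vm_fault_t;        /* sketch: no sparse annotations */

        #define VM_FAULT_HWPOISON_LARGE ((vm_fault_t)0x000020)
        #define VM_FAULT_HINDEX_MASK    ((vm_fault_t)0x0f0000)

        #define VM_FAULT_SET_HINDEX(x)  ((vm_fault_t)((x) << 16))
        #define VM_FAULT_GET_HINDEX(x)  (((x) >> 16) & 0xf)

        int main(void)
        {
                unsigned int hstate_idx = 5;    /* arbitrary hstate index, 0..15 */
                vm_fault_t ret;

                /* A hwpoisoned huge page report carries the hstate index in
                 * bits 16..19, alongside the fault reason bits. */
                ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_idx);

                assert((ret & VM_FAULT_HINDEX_MASK) == VM_FAULT_SET_HINDEX(hstate_idx));
                assert(VM_FAULT_GET_HINDEX(ret) == hstate_idx);
                printf("hindex=%u reason=0x%x\n",
                       VM_FAULT_GET_HINDEX(ret), ret & ~VM_FAULT_HINDEX_MASK);
                return 0;
        }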
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 842f9189537b..da5321c747f8 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -18,6 +18,8 @@
#include <linux/pageblock-flags.h>
#include <linux/page-flags-layout.h>
#include <linux/atomic.h>
+#include <linux/mm_types.h>
+#include <linux/page-flags.h>
#include <asm/page.h>
/* Free memory management - zoned buddy allocator. */
@@ -98,6 +100,62 @@ struct free_area {
unsigned long nr_free;
};
+/* Used for pages not on another list */
+static inline void add_to_free_area(struct page *page, struct free_area *area,
+ int migratetype)
+{
+ list_add(&page->lru, &area->free_list[migratetype]);
+ area->nr_free++;
+}
+
+/* Used for pages not on another list */
+static inline void add_to_free_area_tail(struct page *page, struct free_area *area,
+ int migratetype)
+{
+ list_add_tail(&page->lru, &area->free_list[migratetype]);
+ area->nr_free++;
+}
+
+#ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR
+/* Used to preserve page allocation order entropy */
+void add_to_free_area_random(struct page *page, struct free_area *area,
+ int migratetype);
+#else
+static inline void add_to_free_area_random(struct page *page,
+ struct free_area *area, int migratetype)
+{
+ add_to_free_area(page, area, migratetype);
+}
+#endif
+
+/* Used for pages which are on another list */
+static inline void move_to_free_area(struct page *page, struct free_area *area,
+ int migratetype)
+{
+ list_move(&page->lru, &area->free_list[migratetype]);
+}
+
+static inline struct page *get_page_from_free_area(struct free_area *area,
+ int migratetype)
+{
+ return list_first_entry_or_null(&area->free_list[migratetype],
+ struct page, lru);
+}
+
+static inline void del_page_from_free_area(struct page *page,
+ struct free_area *area, int migratetype)
+{
+ list_del(&page->lru);
+ __ClearPageBuddy(page);
+ set_page_private(page, 0);
+ area->nr_free--;
+}
+
+static inline bool free_area_empty(struct free_area *area, int migratetype)
+{
+ return list_empty(&area->free_list[migratetype]);
+}
+
struct pglist_data;
/*
@@ -480,6 +538,8 @@ struct zone {
unsigned long compact_cached_free_pfn;
/* pfn where async and sync compaction migration scanner should start */
unsigned long compact_cached_migrate_pfn[2];
+ unsigned long compact_init_migrate_pfn;
+ unsigned long compact_init_free_pfn;
#endif
#ifdef CONFIG_COMPACTION
@@ -1278,6 +1338,7 @@ void sparse_init(void);
#else
#define sparse_init() do {} while (0)
#define sparse_index_init(_sec, _nid) do {} while (0)
+#define pfn_present pfn_valid
#endif /* CONFIG_SPARSEMEM */
/*
@@ -1299,7 +1360,7 @@ void memory_present(int nid, unsigned long start, unsigned long end);
/*
* If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we
- * need to check pfn validility within that MAX_ORDER_NR_PAGES block.
+ * need to check pfn validity within that MAX_ORDER_NR_PAGES block.
* pfn_valid_within() should be used in this case; we optimise this away
* when we have no holes within a MAX_ORDER_NR_PAGES block.
*/
diff --git a/include/linux/net.h b/include/linux/net.h
index e0930678c8bf..651fca72286c 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -263,7 +263,7 @@ do { \
#define net_dbg_ratelimited(fmt, ...) \
do { \
DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
- if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT) && \
+ if (DYNAMIC_DEBUG_BRANCH(descriptor) && \
net_ratelimit()) \
__dynamic_pr_debug(&descriptor, pr_fmt(fmt), \
##__VA_ARGS__); \
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 5a30ad594ccc..ec74777c3907 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -98,6 +98,14 @@
typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;
extern nodemask_t _unused_nodemask_arg_;
+#if MAX_NUMNODES > 1
+extern unsigned int nr_node_ids;
+extern unsigned int nr_online_nodes;
+#else
+#define nr_node_ids 1U
+#define nr_online_nodes 1U
+#endif
+
/**
* nodemask_pr_args - printf args to output a nodemask
* @maskp: nodemask to be printed
@@ -108,7 +116,7 @@ extern nodemask_t _unused_nodemask_arg_;
__nodemask_pr_bits(maskp)
static inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m)
{
- return m ? MAX_NUMNODES : 0;
+ return m ? nr_node_ids : 0;
}
static inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m)
{
@@ -444,9 +452,6 @@ static inline int next_memory_node(int nid)
return next_node(nid, node_states[N_MEMORY]);
}
-extern int nr_node_ids;
-extern int nr_online_nodes;
-
static inline void node_set_online(int nid)
{
node_set_state(nid, N_ONLINE);
@@ -485,8 +490,6 @@ static inline int num_node_state(enum node_states state)
#define first_online_node 0
#define first_memory_node 0
#define next_online_node(nid) (MAX_NUMNODES)
-#define nr_node_ids 1
-#define nr_online_nodes 1
#define node_set_online(node) node_set_state((node), N_ONLINE)
#define node_set_offline(node) node_clear_state((node), N_ONLINE)
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 39b4494e29f1..9f8712a4b1a5 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -17,8 +17,37 @@
/*
* Various page->flags bits:
*
- * PG_reserved is set for special pages, which can never be swapped out. Some
- * of them might not even exist...
+ * PG_reserved is set for special pages. The "struct page" of such a page
+ * should in general not be touched (e.g. set dirty) except by its owner.
+ * Pages marked as PG_reserved include:
+ * - Pages part of the kernel image (including vDSO) and similar (e.g. BIOS,
+ * initrd, HW tables)
+ * - Pages reserved or allocated early during boot (before the page allocator
+ * was initialized). This includes (depending on the architecture) the
+ * initial vmemmap, initial page tables, crashkernel, elfcorehdr, and much
+ * more. Once (if ever) freed, PG_reserved is cleared and they will
+ * be given to the page allocator.
+ * - Pages falling into physical memory gaps - not IORESOURCE_SYSRAM. Trying
+ * to read/write these pages might end badly. Don't touch!
+ * - The zero page(s)
+ * - Pages not added to the page allocator when onlining a section because
+ * they were excluded via the online_page_callback() or because they are
+ * PG_hwpoison.
+ * - Pages allocated in the context of kexec/kdump (loaded kernel image,
+ * control pages, vmcoreinfo)
+ * - MMIO/DMA pages. Some architectures don't allow ioremapping pages that are
+ * not marked PG_reserved (as they might be in use by somebody else who does
+ * not respect the caching strategy).
+ * - Pages part of an offline section (struct pages of offline sections should
+ * not be trusted as they will be initialized when first onlined).
+ * - MCA pages on ia64
+ * - Pages holding CPU notes for POWER Firmware Assisted Dump
+ * - Device memory (e.g. PMEM, DAX, HMM)
+ * Some PG_reserved pages will be excluded from the hibernation image.
+ * PG_reserved does in general not hinder anybody from dumping or swapping
+ * and is no longer required for remap_pfn_range(). ioremap might require it.
+ * Consequently, PG_reserved for a page mapped into user space can indicate
+ * the zero page, the vDSO, MMIO pages or device memory.
*
* The PG_private bitflag is set on pagecache pages if they contain filesystem
* specific data (which is normally at page->private). It can be used by
@@ -671,7 +700,7 @@ PAGEFLAG_FALSE(DoubleMap)
/* Reserve 0x0000007f to catch underflows of page_mapcount */
#define PAGE_MAPCOUNT_RESERVE -128
#define PG_buddy 0x00000080
-#define PG_balloon 0x00000100
+#define PG_offline 0x00000100
#define PG_kmemcg 0x00000200
#define PG_table 0x00000400
@@ -706,10 +735,13 @@ static __always_inline void __ClearPage##uname(struct page *page) \
PAGE_TYPE_OPS(Buddy, buddy)
/*
- * PageBalloon() is true for pages that are on the balloon page list
- * (see mm/balloon_compaction.c).
+ * PageOffline() indicates that the page is logically offline although the
+ * containing section is online (e.g. inflated in a balloon driver or
+ * not onlined when onlining the section).
+ * The content of these pages is effectively stale. Such pages should not
+ * be touched (read/write/dump/save) except by their owner.
*/
-PAGE_TYPE_OPS(Balloon, balloon)
+PAGE_TYPE_OPS(Offline, offline)
/*
* If kmemcg is enabled, the buddy allocator will set PageKmemcg() on
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index e2d7039af6a3..bcf909d0de5f 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -164,7 +164,7 @@ void release_pages(struct page **pages, int nr);
* will find the page or it will not. Likewise, the old find_get_page could run
* either before the insertion or afterwards, depending on timing.
*/
-static inline int page_cache_get_speculative(struct page *page)
+static inline int __page_cache_add_speculative(struct page *page, int count)
{
#ifdef CONFIG_TINY_RCU
# ifdef CONFIG_PREEMPT_COUNT
@@ -180,10 +180,10 @@ static inline int page_cache_get_speculative(struct page *page)
* SMP requires.
*/
VM_BUG_ON_PAGE(page_count(page) == 0, page);
- page_ref_inc(page);
+ page_ref_add(page, count);
#else
- if (unlikely(!get_page_unless_zero(page))) {
+ if (unlikely(!page_ref_add_unless(page, count, 0))) {
/*
* Either the page has been freed, or will be freed.
* In either case, retry here and the caller should
@@ -197,27 +197,14 @@ static inline int page_cache_get_speculative(struct page *page)
return 1;
}
-/*
- * Same as above, but add instead of inc (could just be merged)
- */
-static inline int page_cache_add_speculative(struct page *page, int count)
+static inline int page_cache_get_speculative(struct page *page)
{
- VM_BUG_ON(in_interrupt());
-
-#if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU)
-# ifdef CONFIG_PREEMPT_COUNT
- VM_BUG_ON(!in_atomic() && !irqs_disabled());
-# endif
- VM_BUG_ON_PAGE(page_count(page) == 0, page);
- page_ref_add(page, count);
-
-#else
- if (unlikely(!page_ref_add_unless(page, count, 0)))
- return 0;
-#endif
- VM_BUG_ON_PAGE(PageCompound(page) && page != compound_head(page), page);
+ return __page_cache_add_speculative(page, 1);
+}
- return 1;
+static inline int page_cache_add_speculative(struct page *page, int count)
+{
+ return __page_cache_add_speculative(page, count);
}
#ifdef CONFIG_NUMA
@@ -252,6 +239,7 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
#define FGP_WRITE 0x00000008
#define FGP_NOFS 0x00000010
#define FGP_NOWAIT 0x00000020
+#define FGP_FOR_MMAP 0x00000040
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
int fgp_flags, gfp_t cache_gfp_mask);
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 14a9a39da9c7..b6f4ba16065a 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -109,7 +109,6 @@ extern struct pid *find_vpid(int nr);
*/
extern struct pid *find_get_pid(int nr);
extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
-int next_pidmap(struct pid_namespace *pid_ns, unsigned int last);
extern struct pid *alloc_pid(struct pid_namespace *ns);
extern void free_pid(struct pid *pid);
diff --git a/include/linux/poison.h b/include/linux/poison.h
index 15927ebc22f2..5046bad0c1c5 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -30,7 +30,7 @@
*/
#define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA)
-/********** mm/debug-pagealloc.c **********/
+/********** mm/page_poison.c **********/
#ifdef CONFIG_PAGE_POISONING_ZERO
#define PAGE_POISON 0x00
#else
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 77740a506ebb..02b5c115d89b 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -461,7 +461,7 @@ do { \
DEFAULT_RATELIMIT_INTERVAL, \
DEFAULT_RATELIMIT_BURST); \
DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, pr_fmt(fmt)); \
- if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT) && \
+ if (DYNAMIC_DEBUG_BRANCH(descriptor) && \
__ratelimit(&_rs)) \
__dynamic_pr_debug(&descriptor, pr_fmt(fmt), ##__VA_ARGS__); \
} while (0)
diff --git a/include/linux/psi.h b/include/linux/psi.h
index 7006008d5b72..e9029e19e60c 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -4,6 +4,7 @@
#include <linux/jump_label.h>
#include <linux/psi_types.h>
#include <linux/sched.h>
+#include <linux/poll.h>
struct seq_file;
struct css_set;
@@ -26,6 +27,13 @@ int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);
int psi_cgroup_alloc(struct cgroup *cgrp);
void psi_cgroup_free(struct cgroup *cgrp);
void cgroup_move_task(struct task_struct *p, struct css_set *to);
+
+struct psi_trigger *psi_trigger_create(struct psi_group *group,
+ char *buf, size_t nbytes, enum psi_res res);
+void psi_trigger_destroy(struct psi_trigger *t);
+
+__poll_t psi_trigger_poll(struct psi_trigger *t, struct file *file,
+ poll_table *wait);
#endif
#else /* CONFIG_PSI */
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 2cf422db5d18..92750524af27 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -3,6 +3,7 @@
#include <linux/seqlock.h>
#include <linux/types.h>
+#include <linux/wait.h>
#ifdef CONFIG_PSI
@@ -11,7 +12,7 @@ enum psi_task_count {
NR_IOWAIT,
NR_MEMSTALL,
NR_RUNNING,
- NR_PSI_TASK_COUNTS,
+ NR_PSI_TASK_COUNTS = 3,
};
/* Task state bitmasks */
@@ -24,7 +25,7 @@ enum psi_res {
PSI_IO,
PSI_MEM,
PSI_CPU,
- NR_PSI_RESOURCES,
+ NR_PSI_RESOURCES = 3,
};
/*
@@ -41,7 +42,7 @@ enum psi_states {
PSI_CPU_SOME,
/* Only per-CPU, to weigh the CPU in the global average: */
PSI_NONIDLE,
- NR_PSI_STATES,
+ NR_PSI_STATES = 6,
};
struct psi_group_cpu {
@@ -53,6 +54,9 @@ struct psi_group_cpu {
/* States of the tasks belonging to this group */
unsigned int tasks[NR_PSI_TASK_COUNTS];
+ /* Aggregate pressure state derived from the tasks */
+ u32 state_mask;
+
/* Period time sampling buckets for each state of interest (ns) */
u32 times[NR_PSI_STATES];
@@ -65,22 +69,81 @@ struct psi_group_cpu {
u32 times_prev[NR_PSI_STATES] ____cacheline_aligned_in_smp;
};
+/* PSI growth tracking window */
+struct psi_window {
+ /* Window size in ns */
+ u64 size;
+
+ /* Start time of the current window in ns */
+ u64 start_time;
+
+ /* Value at the start of the window */
+ u64 start_value;
+
+ /* Value growth in the previous window */
+ u64 prev_growth;
+};
+
+struct psi_trigger {
+ /* PSI state being monitored by the trigger */
+ enum psi_states state;
+
+ /* User-specified threshold in ns */
+ u64 threshold;
+
+ /* List node inside triggers list */
+ struct list_head node;
+
+ /* Backpointer needed during trigger destruction */
+ struct psi_group *group;
+
+ /* Wait queue for polling */
+ wait_queue_head_t event_wait;
+
+ /* Pending event flag */
+ int event;
+
+ /* Tracking window */
+ struct psi_window win;
+
+ /*
+ * Time last event was generated. Used for rate-limiting
+ * events to one per window
+ */
+ u64 last_event_time;
+};
+
struct psi_group {
- /* Protects data updated during an aggregation */
- struct mutex stat_lock;
+ /* Protects data used by the aggregator */
+ struct mutex update_lock;
/* Per-cpu task state & time tracking */
struct psi_group_cpu __percpu *pcpu;
- /* Periodic aggregation state */
- u64 total_prev[NR_PSI_STATES - 1];
- u64 last_update;
- u64 next_update;
+ /* Periodic work control */
+ atomic_t polling;
struct delayed_work clock_work;
- /* Total stall times and sampled pressure averages */
+ /* Total stall times observed */
u64 total[NR_PSI_STATES - 1];
+
+ /* Running pressure averages */
+ u64 avg_total[NR_PSI_STATES - 1];
+ u64 avg_last_update;
+ u64 avg_next_update;
unsigned long avg[NR_PSI_STATES - 1][3];
+
+ /* Configured polling triggers */
+ struct list_head triggers;
+ u32 nr_triggers[NR_PSI_STATES - 1];
+ u32 trigger_states;
+ u64 trigger_min_period;
+
+ /* Polling state */
+ /* Total stall times at the start of monitor activation */
+ u64 polling_total[NR_PSI_STATES - 1];
+ u64 polling_next_update;
+ u64 polling_until;
};
#else /* CONFIG_PSI */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6302b9e8db9a..53522645f96e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -48,6 +48,7 @@ struct pid_namespace;
struct pipe_inode_info;
struct rcu_node;
struct reclaim_state;
+struct capture_control;
struct robust_list_head;
struct sched_attr;
struct sched_param;
@@ -958,6 +959,9 @@ struct task_struct {
struct io_context *io_context;
+#ifdef CONFIG_COMPACTION
+ struct capture_control *capture_control;
+#endif
/* Ptrace state: */
unsigned long ptrace_message;
kernel_siginfo_t *last_siginfo;
@@ -1162,6 +1166,9 @@ struct task_struct {
/* Used by memcontrol for targeted memcg charge: */
struct mem_cgroup *active_memcg;
+
+ /* Used by memcontrol for high reclaim: */
+ struct mem_cgroup *memcg_high_reclaim;
#endif
#ifdef CONFIG_BLK_CGROUP
@@ -1202,6 +1209,13 @@ struct task_struct {
unsigned long prev_lowest_stack;
#endif
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ unsigned long getblk_stamp;
+ unsigned int getblk_executed;
+ unsigned int getblk_bh_count;
+ unsigned long getblk_bh_state;
+#endif
+
/*
* New fields for task_struct should be added above here, so that
* they are included in the randomized portion of task_struct.
@@ -1403,6 +1417,7 @@ extern struct pid *cad_pid;
#define PF_UMH 0x02000000 /* I'm an Usermodehelper process */
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
+#define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation requests will have __GFP_MOVABLE cleared */
#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
#define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 3bfa6a0cbba4..0cd9f10423fb 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -148,17 +148,25 @@ static inline bool in_vfork(struct task_struct *tsk)
* Applies per-task gfp context to the given allocation flags.
* PF_MEMALLOC_NOIO implies GFP_NOIO
* PF_MEMALLOC_NOFS implies GFP_NOFS
+ * PF_MEMALLOC_NOCMA implies no allocation from CMA region.
*/
static inline gfp_t current_gfp_context(gfp_t flags)
{
- /*
- * NOIO implies both NOIO and NOFS and it is a weaker context
- * so always make sure it makes precedence
- */
- if (unlikely(current->flags & PF_MEMALLOC_NOIO))
- flags &= ~(__GFP_IO | __GFP_FS);
- else if (unlikely(current->flags & PF_MEMALLOC_NOFS))
- flags &= ~__GFP_FS;
+ if (unlikely(current->flags &
+ (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_NOCMA))) {
+ /*
+ * NOIO implies both NOIO and NOFS and it is a weaker context
+ * so always make sure it takes precedence
+ */
+ if (current->flags & PF_MEMALLOC_NOIO)
+ flags &= ~(__GFP_IO | __GFP_FS);
+ else if (current->flags & PF_MEMALLOC_NOFS)
+ flags &= ~__GFP_FS;
+#ifdef CONFIG_CMA
+ if (current->flags & PF_MEMALLOC_NOCMA)
+ flags &= ~__GFP_MOVABLE;
+#endif
+ }
return flags;
}
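As a quick illustration of the reordered masking above, here is a minimal userspace sketch of the precedence rules. The bit values are invented for the example (they are not the kernel's real __GFP_*/PF_* constants); only the logic mirrors the function: NOIO strips both IO and FS, NOFS strips FS alone, and NOCMA independently strips the movable bit.

    #include <stdio.h>

    /* Illustrative bit values only -- not the real kernel constants. */
    #define __GFP_IO          0x1u
    #define __GFP_FS          0x2u
    #define __GFP_MOVABLE     0x4u
    #define PF_MEMALLOC_NOIO  0x10u
    #define PF_MEMALLOC_NOFS  0x20u
    #define PF_MEMALLOC_NOCMA 0x40u

    /* Userspace model of current_gfp_context(): apply per-task context. */
    static unsigned int gfp_context(unsigned int task_flags, unsigned int gfp)
    {
            if (task_flags & PF_MEMALLOC_NOIO)      /* weaker context wins */
                    gfp &= ~(__GFP_IO | __GFP_FS);
            else if (task_flags & PF_MEMALLOC_NOFS)
                    gfp &= ~__GFP_FS;
            if (task_flags & PF_MEMALLOC_NOCMA)     /* no movable/CMA pages */
                    gfp &= ~__GFP_MOVABLE;
            return gfp;
    }

    int main(void)
    {
            unsigned int gfp = __GFP_IO | __GFP_FS | __GFP_MOVABLE;

            printf("NOFS:       %#x\n", gfp_context(PF_MEMALLOC_NOFS, gfp));
            printf("NOIO|NOCMA: %#x\n",
                   gfp_context(PF_MEMALLOC_NOIO | PF_MEMALLOC_NOCMA, gfp));
            return 0;
    }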
@@ -248,6 +256,30 @@ static inline void memalloc_noreclaim_restore(unsigned int flags)
current->flags = (current->flags & ~PF_MEMALLOC) | flags;
}
+#ifdef CONFIG_CMA
+static inline unsigned int memalloc_nocma_save(void)
+{
+ unsigned int flags = current->flags & PF_MEMALLOC_NOCMA;
+
+ current->flags |= PF_MEMALLOC_NOCMA;
+ return flags;
+}
+
+static inline void memalloc_nocma_restore(unsigned int flags)
+{
+ current->flags = (current->flags & ~PF_MEMALLOC_NOCMA) | flags;
+}
+#else
+static inline unsigned int memalloc_nocma_save(void)
+{
+ return 0;
+}
+
+static inline void memalloc_nocma_restore(unsigned int flags)
+{
+}
+#endif
+
#ifdef CONFIG_MEMCG
/**
* memalloc_use_memcg - Starts the remote memcg charging scope.
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index ae5655197698..e412c092c1e8 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -418,10 +418,20 @@ static inline void set_restore_sigmask(void)
set_thread_flag(TIF_RESTORE_SIGMASK);
WARN_ON(!test_thread_flag(TIF_SIGPENDING));
}
+
+static inline void clear_tsk_restore_sigmask(struct task_struct *tsk)
+{
+ clear_tsk_thread_flag(tsk, TIF_RESTORE_SIGMASK);
+}
+
static inline void clear_restore_sigmask(void)
{
clear_thread_flag(TIF_RESTORE_SIGMASK);
}
+static inline bool test_tsk_restore_sigmask(struct task_struct *tsk)
+{
+ return test_tsk_thread_flag(tsk, TIF_RESTORE_SIGMASK);
+}
static inline bool test_restore_sigmask(void)
{
return test_thread_flag(TIF_RESTORE_SIGMASK);
@@ -439,6 +449,10 @@ static inline void set_restore_sigmask(void)
current->restore_sigmask = true;
WARN_ON(!test_thread_flag(TIF_SIGPENDING));
}
+static inline void clear_tsk_restore_sigmask(struct task_struct *tsk)
+{
+ tsk->restore_sigmask = false;
+}
static inline void clear_restore_sigmask(void)
{
current->restore_sigmask = false;
@@ -447,6 +461,10 @@ static inline bool test_restore_sigmask(void)
{
return current->restore_sigmask;
}
+static inline bool test_tsk_restore_sigmask(struct task_struct *tsk)
+{
+ return tsk->restore_sigmask;
+}
static inline bool test_and_clear_restore_sigmask(void)
{
if (!current->restore_sigmask)
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index f155dc607112..f3fb1edb3526 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -72,7 +72,8 @@ extern void shmem_unlock_mapping(struct address_space *mapping);
extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
pgoff_t index, gfp_t gfp_mask);
extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
-extern int shmem_unuse(swp_entry_t entry, struct page *page);
+extern int shmem_unuse(unsigned int type, bool frontswap,
+ unsigned long *fs_pages_to_unuse);
extern unsigned long shmem_swap_usage(struct vm_area_struct *vma);
extern unsigned long shmem_partial_swap_usage(struct address_space *mapping,
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 11b45f7ae405..1fd9911890c6 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -109,6 +109,9 @@
#define SLAB_KASAN 0
#endif
+/* WARN_ON slab error */
+#define SLAB_WARN_ON_ERROR ((slab_flags_t __force)0x10000000U)
+
/* The following flags affect the page allocator grouping pages by mobility */
/* Objects are reclaimable */
#define SLAB_RECLAIM_ACCOUNT ((slab_flags_t __force)0x00020000U)
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 3a1a1dbc6f49..d2153789bd9f 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -81,12 +81,12 @@ struct kmem_cache_order_objects {
*/
struct kmem_cache {
struct kmem_cache_cpu __percpu *cpu_slab;
- /* Used for retriving partial slabs etc */
+ /* Used for retrieving partial slabs, etc. */
slab_flags_t flags;
unsigned long min_partial;
- unsigned int size; /* The size of an object including meta data */
- unsigned int object_size;/* The size of an object without meta data */
- unsigned int offset; /* Free pointer offset. */
+ unsigned int size; /* The size of an object including metadata */
+ unsigned int object_size;/* The size of an object without metadata */
+ unsigned int offset; /* Free pointer offset */
#ifdef CONFIG_SLUB_CPU_PARTIAL
/* Number of per cpu partial objects to keep around */
unsigned int cpu_partial;
@@ -110,7 +110,7 @@ struct kmem_cache {
#endif
#ifdef CONFIG_MEMCG
struct memcg_cache_params memcg_params;
- /* for propagation, maximum size of a stored attr */
+ /* For propagation, maximum size of a stored attr */
unsigned int max_attr_size;
#ifdef CONFIG_SYSFS
struct kset *memcg_kset;
@@ -151,7 +151,7 @@ struct kmem_cache {
#else
#define slub_cpu_partial(s) (0)
#define slub_set_cpu_partial(s, n)
-#endif // CONFIG_SLUB_CPU_PARTIAL
+#endif /* CONFIG_SLUB_CPU_PARTIAL */
#ifdef CONFIG_SYSFS
#define SLAB_SUPPORTS_SYSFS
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 622025ac1461..649529be91f2 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -625,7 +625,7 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
return vm_swappiness;
/* root ? */
- if (mem_cgroup_disabled() || !memcg->css.parent)
+ if (mem_cgroup_disabled() || mem_cgroup_is_root(memcg))
return vm_swappiness;
return memcg->swappiness;
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 37b226e8df13..4c8b02f4edeb 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -263,6 +263,40 @@ extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count);
#define probe_kernel_address(addr, retval) \
probe_kernel_read(&retval, addr, sizeof(retval))
+/**
+ * probe_user_read(): safely attempt to read from a user location
+ * @dst: pointer to the buffer that shall take the data
+ * @src: address to read from
+ * @size: size of the data chunk
+ *
+ * Safely read from address @src to the buffer at @dst. If a kernel fault
+ * happens, handle that and return -EFAULT.
+ *
+ * We ensure that the copy_from_user is executed in atomic context so that
+ * do_page_fault() doesn't attempt to take mmap_sem. This makes
+ * probe_user_read() suitable for use within regions where the caller
+ * already holds mmap_sem, or other locks which nest inside mmap_sem.
+ *
+ * Returns 0 on success, -EFAULT on error.
+ */
+
+#ifndef probe_user_read
+static __always_inline long probe_user_read(void *dst, const void __user *src,
+ size_t size)
+{
+ long ret;
+
+ if (!access_ok(src, size))
+ return -EFAULT;
+
+ pagefault_disable();
+ ret = __copy_from_user_inatomic(dst, src, size);
+ pagefault_enable();
+
+ return ret ? -EFAULT : 0;
+}
+#endif
+
#ifndef user_access_begin
#define user_access_begin(ptr,len) access_ok(ptr, len)
#define user_access_end() do { } while (0)
diff --git a/include/uapi/linux/auto_fs.h b/include/uapi/linux/auto_fs.h
index 082119630b49..1f7925afad2d 100644
--- a/include/uapi/linux/auto_fs.h
+++ b/include/uapi/linux/auto_fs.h
@@ -23,7 +23,7 @@
#define AUTOFS_MIN_PROTO_VERSION 3
#define AUTOFS_MAX_PROTO_VERSION 5
-#define AUTOFS_PROTO_SUBVERSION 4
+#define AUTOFS_PROTO_SUBVERSION 5
/*
* The wait_queue_token (autofs_wqt_t) is part of a structure which is passed
diff --git a/include/uapi/linux/binfmts.h b/include/uapi/linux/binfmts.h
index 4abad03a8853..689025d9c185 100644
--- a/include/uapi/linux/binfmts.h
+++ b/include/uapi/linux/binfmts.h
@@ -16,6 +16,6 @@ struct pt_regs;
#define MAX_ARG_STRINGS 0x7FFFFFFF
/* sizeof(linux_binprm->buf) */
-#define BINPRM_BUF_SIZE 128
+#define BINPRM_BUF_SIZE 256
#endif /* _UAPI_LINUX_BINFMTS_H */
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 6448cdd9a350..a2f8658f1c55 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -41,6 +41,7 @@
#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */
#define F_SEAL_GROW 0x0004 /* prevent file from growing */
#define F_SEAL_WRITE 0x0008 /* prevent writes */
+#define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */
/* (1U << 31) is reserved for signed error codes */
/*
diff --git a/include/uapi/linux/kernel-page-flags.h b/include/uapi/linux/kernel-page-flags.h
index 21b9113c69da..6f2f2720f3ac 100644
--- a/include/uapi/linux/kernel-page-flags.h
+++ b/include/uapi/linux/kernel-page-flags.h
@@ -32,7 +32,7 @@
#define KPF_KSM 21
#define KPF_THP 22
-#define KPF_BALLOON 23
+#define KPF_OFFLINE 23
#define KPF_ZERO_PAGE 24
#define KPF_IDLE 25
#define KPF_PGTABLE 26
diff --git a/include/uapi/linux/limits.h b/include/uapi/linux/limits.h
index c3547f07605c..6bcbe3068761 100644
--- a/include/uapi/linux/limits.h
+++ b/include/uapi/linux/limits.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _LINUX_LIMITS_H
-#define _LINUX_LIMITS_H
+#ifndef _UAPI_LINUX_LIMITS_H
+#define _UAPI_LINUX_LIMITS_H
#define NR_OPEN 1024
diff --git a/init/Kconfig b/init/Kconfig
index 8bcaf22bb94e..5d8f947033ce 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1735,6 +1735,29 @@ config SLAB_FREELIST_HARDENED
sacrifices to harden the kernel slab allocator against common
freelist exploit methods.
+config SHUFFLE_PAGE_ALLOCATOR
+ bool "Page allocator randomization"
+ default SLAB_FREELIST_RANDOM && ACPI_NUMA
+ help
+ Randomization of the page allocator improves the average
+ utilization of a direct-mapped memory-side-cache. See section
+ 5.2.27 Heterogeneous Memory Attribute Table (HMAT) in the ACPI
+ 6.2a specification for an example of how a platform advertises
+ the presence of a memory-side-cache. There are also incidental
+ security benefits as it reduces the predictability of page
+ allocations to compliment SLAB_FREELIST_RANDOM, but the
+ default granularity of shuffling on 4MB (MAX_ORDER) pages is
+ selected based on cache utilization benefits.
+
+ While the randomization improves cache utilization it may
+ negatively impact workloads on platforms without a cache. For
+ this reason, by default, the randomization is enabled only
+ after runtime detection of a direct-mapped memory-side-cache.
+ Otherwise, the randomization may be force enabled with the
+ 'page_alloc.shuffle' kernel command line parameter.
+
+ Say Y if unsure.
+
config SLUB_CPU_PARTIAL
default y
depends on SLUB && SMP
diff --git a/init/init_task.c b/init/init_task.c
index 9d2d1dafdaa0..c70ef656d0f4 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -10,6 +10,7 @@
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/audit.h>
+#include <linux/numa.h>
#include <asm/pgtable.h>
#include <linux/uaccess.h>
@@ -154,7 +155,7 @@ struct task_struct init_task
.vtime.state = VTIME_SYS,
#endif
#ifdef CONFIG_NUMA_BALANCING
- .numa_preferred_nid = -1,
+ .numa_preferred_nid = NUMA_NO_NODE,
.numa_group = NULL,
.numa_faults = NULL,
#endif
diff --git a/init/initramfs.c b/init/initramfs.c
index 7cea802d00ef..4749e1115eef 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -431,7 +431,7 @@ static long __init flush_buffer(void *bufv, unsigned long len)
len -= written;
state = Reset;
} else
- error("junk in compressed archive");
+ error("junk within compressed archive");
}
return origLen;
}
@@ -488,9 +488,9 @@ static char * __init unpack_to_rootfs(char *buf, unsigned long len)
message = msg_buf;
}
} else
- error("junk in compressed archive");
+ error("invalid magic at start of compressed archive");
if (state != Reset)
- error("junk in compressed archive");
+ error("junk at the end of compressed archive");
this_header = saved_offset + my_inptr;
buf += my_inptr;
len -= my_inptr;
@@ -550,6 +550,7 @@ skip:
initrd_end = 0;
}
+#ifdef CONFIG_BLK_DEV_RAM
#define BUF_SIZE 1024
static void __init clean_rootfs(void)
{
@@ -596,6 +597,7 @@ static void __init clean_rootfs(void)
ksys_close(fd);
kfree(buf);
}
+#endif
static int __init populate_rootfs(void)
{
@@ -638,10 +640,8 @@ static int __init populate_rootfs(void)
printk(KERN_INFO "Unpacking initramfs...\n");
err = unpack_to_rootfs((char *)initrd_start,
initrd_end - initrd_start);
- if (err) {
+ if (err)
printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err);
- clean_rootfs();
- }
free_initrd();
#endif
}
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index 49f9bf4ffc7f..d9ac6caf6a5e 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -120,7 +120,9 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write,
static int zero;
static int one = 1;
static int int_max = INT_MAX;
-static int ipc_mni = IPCMNI;
+int ipc_mni = IPCMNI;
+int ipc_mni_shift = IPCMNI_SHIFT;
+bool ipc_mni_extended;
static struct ctl_table ipc_kern_table[] = {
{
@@ -246,3 +248,13 @@ static int __init ipc_sysctl_init(void)
}
device_initcall(ipc_sysctl_init);
+
+static int __init ipc_mni_extend(char *str)
+{
+ ipc_mni = IPCMNI_EXTEND;
+ ipc_mni_shift = IPCMNI_EXTEND_SHIFT;
+ ipc_mni_extended = true;
+ pr_info("IPCMNI extended to %d.\n", ipc_mni);
+ return 0;
+}
+early_param("ipcmni_extend", ipc_mni_extend);
diff --git a/ipc/sem.c b/ipc/sem.c
index 80909464acff..7da4504bcc7c 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -488,18 +488,14 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s)
static struct sem_array *sem_alloc(size_t nsems)
{
struct sem_array *sma;
- size_t size;
if (nsems > (INT_MAX - sizeof(*sma)) / sizeof(sma->sems[0]))
return NULL;
- size = sizeof(*sma) + nsems * sizeof(sma->sems[0]);
- sma = kvmalloc(size, GFP_KERNEL);
+ sma = kvzalloc(struct_size(sma, sems, nsems), GFP_KERNEL);
if (unlikely(!sma))
return NULL;
- memset(sma, 0, size);
-
return sma;
}
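For readers unfamiliar with struct_size(), the conversion above replaces an open-coded size computation plus memset() with one zeroing allocation whose size is derived from the flexible array member (struct_size() additionally saturates on overflow). A userspace sketch of the same shape, with calloc() standing in for kvzalloc():

    #include <stdio.h>
    #include <stdlib.h>

    struct sem { int semval; };

    struct sem_array {              /* trailing flexible array member */
            size_t nsems;
            struct sem sems[];
    };

    /* Stand-in for kvzalloc(struct_size(sma, sems, nsems), GFP_KERNEL);
     * calloc() zeroes the buffer, so no separate memset() is needed. */
    static struct sem_array *sem_alloc_demo(size_t nsems)
    {
            struct sem_array *sma;

            sma = calloc(1, sizeof(*sma) + nsems * sizeof(sma->sems[0]));
            if (sma)
                    sma->nsems = nsems;
            return sma;
    }

    int main(void)
    {
            struct sem_array *sma = sem_alloc_demo(4);

            if (!sma)
                    return 1;
            printf("nsems=%zu semval[3]=%d\n", sma->nsems, sma->sems[3].semval);
            free(sma);
            return 0;
    }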
@@ -1680,6 +1676,7 @@ static long ksys_semctl(int semid, int semnum, int cmd, unsigned long arg, int v
case IPC_SET:
if (copy_semid_from_user(&semid64, p, version))
return -EFAULT;
+ /* fall through */
case IPC_RMID:
return semctl_down(ns, semid, cmd, &semid64);
default:
diff --git a/ipc/util.c b/ipc/util.c
index 0af05752969f..3f11a81cf9b7 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -110,12 +110,13 @@ static const struct rhashtable_params ipc_kht_params = {
* @ids: ipc identifier set
*
* Set up the sequence range to use for the ipc identifier range (limited
- * below IPCMNI) then initialise the keys hashtable and ids idr.
+ * below ipc_mni) then initialise the keys hashtable and ids idr.
*/
void ipc_init_ids(struct ipc_ids *ids)
{
ids->in_use = 0;
- ids->seq = 0;
+ ids->deleted = false;
+ ids->seq = ipc_mni_extended ? 0 : -1; /* seq # is pre-incremented */
init_rwsem(&ids->rwsem);
rhashtable_init(&ids->key_ht, &ipc_kht_params);
idr_init(&ids->ipcs_idr);
@@ -198,6 +199,11 @@ static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new)
{
int idx, next_id = -1;
+/*
+ * To conserve sequence number space with extended ipc_mni when new ID
+ * is built, the sequence number is incremented only when one or more
+ * IDs have been removed previously.
+ */
#ifdef CONFIG_CHECKPOINT_RESTORE
next_id = ids->next_id;
ids->next_id = -1;
@@ -216,9 +222,13 @@ static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new)
*/
if (next_id < 0) { /* !CHECKPOINT_RESTORE or next_id is unset */
- new->seq = ids->seq++;
- if (ids->seq > IPCID_SEQ_MAX)
- ids->seq = 0;
+ if (!ipc_mni_extended || ids->deleted) {
+ ids->seq++;
+ if (ids->seq > IPCID_SEQ_MAX)
+ ids->seq = 0;
+ ids->deleted = false;
+ }
+ new->seq = ids->seq;
idx = idr_alloc(&ids->ipcs_idr, new, 0, 0, GFP_NOWAIT);
} else {
new->seq = ipcid_to_seqx(next_id);
@@ -226,7 +236,7 @@ static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new)
0, GFP_NOWAIT);
}
if (idx >= 0)
- new->id = SEQ_MULTIPLIER * new->seq + idx;
+ new->id = (new->seq << IPCMNI_SEQ_SHIFT) + idx;
return idx;
}
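The comment above boils down to: in extended mode the sequence number is bumped only when an ID has been removed since the previous allocation, so back-to-back allocations reuse the same sequence value. A toy model (not kernel code) of that rule:

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy model: seq is bumped only if an ID was removed since the
     * previous allocation (the ipc_mni_extended behaviour). */
    static int seq;
    static bool deleted;

    static int alloc_id(void)
    {
            if (deleted) {
                    seq++;
                    deleted = false;
            }
            return seq;
    }

    static void remove_id(void)
    {
            deleted = true;
    }

    int main(void)
    {
            printf("%d %d %d", alloc_id(), alloc_id(), alloc_id()); /* 0 0 0 */
            remove_id();
            printf(" %d\n", alloc_id());                            /* 1 */
            return 0;
    }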
@@ -254,8 +264,8 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit)
/* 1) Initialize the refcount so that ipc_rcu_putref works */
refcount_set(&new->refcount, 1);
- if (limit > IPCMNI)
- limit = IPCMNI;
+ if (limit > ipc_mni)
+ limit = ipc_mni;
if (ids->in_use >= limit)
return -ENOSPC;
@@ -436,6 +446,7 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
idr_remove(&ids->ipcs_idr, idx);
ipc_kht_remove(ids, ipcp);
ids->in_use--;
+ ids->deleted = true;
ipcp->deleted = true;
if (unlikely(idx == ids->max_idx)) {
@@ -738,7 +749,7 @@ static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos,
if (total >= ids->in_use)
return NULL;
- for (; pos < IPCMNI; pos++) {
+ for (; pos < ipc_mni; pos++) {
ipc = idr_find(&ids->ipcs_idr, pos);
if (ipc != NULL) {
*new_pos = pos + 1;
diff --git a/ipc/util.h b/ipc/util.h
index e272be622ae7..99e3632b080d 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -15,8 +15,36 @@
#include <linux/err.h>
#include <linux/ipc_namespace.h>
-#define IPCMNI 32768 /* <= MAX_INT limit for ipc arrays (including sysctl changes) */
-#define SEQ_MULTIPLIER (IPCMNI)
+/*
+ * The IPC ID contains 2 separate numbers - index and sequence number.
+ * By default,
+ * bits 0-14: index (32k, 15 bits)
+ * bits 15-30: sequence number (64k, 16 bits)
+ *
+ * When IPCMNI extension mode is turned on, the composition changes:
+ * bits 0-22: index (8M, 23 bits)
+ * bits 23-30: sequence number (256, 8 bits)
+ */
+#define IPCMNI_SHIFT 15
+#define IPCMNI_EXTEND_SHIFT 23
+#define IPCMNI (1 << IPCMNI_SHIFT)
+#define IPCMNI_EXTEND (1 << IPCMNI_EXTEND_SHIFT)
+
+#ifdef CONFIG_SYSVIPC_SYSCTL
+extern int ipc_mni;
+extern int ipc_mni_shift;
+extern bool ipc_mni_extended;
+
+#define IPCMNI_SEQ_SHIFT ipc_mni_shift
+#define IPCMNI_IDX_MASK ((1 << ipc_mni_shift) - 1)
+
+#else /* CONFIG_SYSVIPC_SYSCTL */
+
+#define ipc_mni IPCMNI
+#define ipc_mni_extended false
+#define IPCMNI_SEQ_SHIFT IPCMNI_SHIFT
+#define IPCMNI_IDX_MASK ((1 << IPCMNI_SHIFT) - 1)
+#endif /* CONFIG_SYSVIPC_SYSCTL */
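The new ID layout is easy to sanity-check outside the kernel. The sketch below composes and decomposes an IPC ID with both shifts, mirroring ipcid_to_idx()/ipcid_to_seqx(); the seq/idx values are arbitrary examples.

    #include <stdio.h>

    #define IPCMNI_SHIFT        15      /* default: 15-bit index */
    #define IPCMNI_EXTEND_SHIFT 23      /* ipcmni_extend: 23-bit index */

    static void show(int seq, int idx, int shift)
    {
            int id = (seq << shift) + idx;          /* as in ipc_idr_alloc() */

            printf("shift=%d id=%d -> idx=%d seq=%d\n", shift, id,
                   id & ((1 << shift) - 1),         /* ipcid_to_idx()  */
                   id >> shift);                    /* ipcid_to_seqx() */
    }

    int main(void)
    {
            show(3, 42, IPCMNI_SHIFT);
            show(3, 42, IPCMNI_EXTEND_SHIFT);
            return 0;
    }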
void sem_init(void);
void msg_init(void);
@@ -96,9 +124,9 @@ struct pid_namespace *ipc_seq_pid_ns(struct seq_file *);
#define IPC_MSG_IDS 1
#define IPC_SHM_IDS 2
-#define ipcid_to_idx(id) ((id) % SEQ_MULTIPLIER)
-#define ipcid_to_seqx(id) ((id) / SEQ_MULTIPLIER)
-#define IPCID_SEQ_MAX min_t(int, INT_MAX/SEQ_MULTIPLIER, USHRT_MAX)
+#define ipcid_to_idx(id) ((id) & IPCMNI_IDX_MASK)
+#define ipcid_to_seqx(id) ((id) >> IPCMNI_SEQ_SHIFT)
+#define IPCID_SEQ_MAX (INT_MAX >> IPCMNI_SEQ_SHIFT)
/* must be called with ids->rwsem acquired for writing */
int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int);
@@ -123,8 +151,8 @@ static inline int ipc_get_maxidx(struct ipc_ids *ids)
if (ids->in_use == 0)
return -1;
- if (ids->in_use == IPCMNI)
- return IPCMNI - 1;
+ if (ids->in_use == ipc_mni)
+ return ipc_mni - 1;
return ids->max_idx;
}
@@ -216,10 +244,10 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
static inline int sem_check_semmni(struct ipc_namespace *ns) {
/*
- * Check semmni range [0, IPCMNI]
+ * Check semmni range [0, ipc_mni]
* semmni is the last element of sem_ctls[4] array
*/
- return ((ns->sem_ctls[3] < 0) || (ns->sem_ctls[3] > IPCMNI))
+ return ((ns->sem_ctls[3] < 0) || (ns->sem_ctls[3] > ipc_mni))
? -ERANGE : 0;
}
diff --git a/kernel/.gitignore b/kernel/.gitignore
index b3097bde4e9c..6e699100872f 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -1,7 +1,5 @@
#
# Generated files
#
-config_data.h
-config_data.gz
timeconst.h
hz.bc
diff --git a/kernel/Makefile b/kernel/Makefile
index 6aa7543bcdb2..6c57e78817da 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -116,17 +116,8 @@ obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o
KASAN_SANITIZE_stackleak.o := n
KCOV_INSTRUMENT_stackleak.o := n
-$(obj)/configs.o: $(obj)/config_data.h
+$(obj)/configs.o: $(obj)/config_data.gz
targets += config_data.gz
$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
$(call if_changed,gzip)
-
-filechk_ikconfiggz = \
- echo "static const char kernel_config_data[] __used = MAGIC_START"; \
- cat $< | scripts/bin2c; \
- echo "MAGIC_END;"
-
-targets += config_data.h
-$(obj)/config_data.h: $(obj)/config_data.gz FORCE
- $(call filechk,ikconfiggz)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 5a8e35e61835..7adc9ac7c413 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3508,7 +3508,89 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
{
return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU);
}
-#endif
+
+static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, enum psi_res res)
+{
+ struct psi_trigger *old;
+ struct psi_trigger *new;
+ struct cgroup *cgrp;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENODEV;
+
+ cgroup_get(cgrp);
+ cgroup_kn_unlock(of->kn);
+
+ new = psi_trigger_create(&cgrp->psi, buf, nbytes, res);
+ if (IS_ERR(new)) {
+ cgroup_put(cgrp);
+ return PTR_ERR(new);
+ }
+
+ old = of->priv;
+ rcu_assign_pointer(of->priv, new);
+ if (old) {
+ synchronize_rcu();
+ psi_trigger_destroy(old);
+ }
+
+ cgroup_put(cgrp);
+
+ return nbytes;
+}
+
+static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
+}
+
+static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
+}
+
+static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
+}
+
+static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
+ poll_table *pt)
+{
+ struct psi_trigger *t;
+ __poll_t ret;
+
+ rcu_read_lock();
+ t = rcu_dereference_raw(of->priv);
+ if (t)
+ ret = psi_trigger_poll(t, of->file, pt);
+ else
+ ret = DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static void cgroup_pressure_release(struct kernfs_open_file *of)
+{
+ struct psi_trigger *t = of->priv;
+
+ if (!t)
+ return;
+
+ rcu_assign_pointer(of->priv, NULL);
+ synchronize_rcu();
+ psi_trigger_destroy(t);
+}
+#endif /* CONFIG_PSI */
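The write/poll/release handlers above implement the PSI trigger interface for the cgroup pressure files. A userspace consumer, sketched below, writes a trigger ("some 150000 1000000" asks for an event when 150ms of partial memory stall accumulates within a 1s window) and then waits for POLLPRI; the cgroup path is only an example, and /proc/pressure/memory accepts the same trigger format.

    #include <fcntl.h>
    #include <poll.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            /* Example path; /proc/pressure/memory takes the same trigger. */
            const char *path = "/sys/fs/cgroup/demo/memory.pressure";
            const char *trig = "some 150000 1000000"; /* 150ms in 1s window */
            struct pollfd fds;
            int fd;

            fd = open(path, O_RDWR | O_NONBLOCK);
            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            if (write(fd, trig, strlen(trig) + 1) < 0) {
                    perror("write trigger");
                    return 1;
            }
            fds.fd = fd;
            fds.events = POLLPRI;

            for (;;) {
                    if (poll(&fds, 1, -1) < 0) {
                            perror("poll");
                            break;
                    }
                    if (fds.revents & POLLERR) {
                            fprintf(stderr, "trigger went away\n");
                            break;
                    }
                    if (fds.revents & POLLPRI)
                            printf("memory pressure event\n");
            }
            close(fd);
            return 0;
    }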
static int cgroup_file_open(struct kernfs_open_file *of)
{
@@ -3577,6 +3659,16 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
return ret ?: nbytes;
}
+static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt)
+{
+ struct cftype *cft = of->kn->priv;
+
+ if (cft->poll)
+ return cft->poll(of, pt);
+
+ return kernfs_generic_poll(of, pt);
+}
+
static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
{
return seq_cft(seq)->seq_start(seq, ppos);
@@ -3615,6 +3707,7 @@ static struct kernfs_ops cgroup_kf_single_ops = {
.open = cgroup_file_open,
.release = cgroup_file_release,
.write = cgroup_file_write,
+ .poll = cgroup_file_poll,
.seq_show = cgroup_seqfile_show,
};
@@ -3623,6 +3716,7 @@ static struct kernfs_ops cgroup_kf_ops = {
.open = cgroup_file_open,
.release = cgroup_file_release,
.write = cgroup_file_write,
+ .poll = cgroup_file_poll,
.seq_start = cgroup_seqfile_start,
.seq_next = cgroup_seqfile_next,
.seq_stop = cgroup_seqfile_stop,
@@ -4651,18 +4745,27 @@ static struct cftype cgroup_base_files[] = {
.name = "io.pressure",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_io_pressure_show,
+ .write = cgroup_io_pressure_write,
+ .poll = cgroup_pressure_poll,
+ .release = cgroup_pressure_release,
},
{
.name = "memory.pressure",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_memory_pressure_show,
+ .write = cgroup_memory_pressure_write,
+ .poll = cgroup_pressure_poll,
+ .release = cgroup_pressure_release,
},
{
.name = "cpu.pressure",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_cpu_pressure_show,
+ .write = cgroup_cpu_pressure_write,
+ .poll = cgroup_pressure_poll,
+ .release = cgroup_pressure_release,
},
-#endif
+#endif /* CONFIG_PSI */
{ } /* terminate */
};
diff --git a/kernel/configs.c b/kernel/configs.c
index 2df132b20217..b062425ccf8d 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -30,37 +30,35 @@
#include <linux/init.h>
#include <linux/uaccess.h>
-/**************************************************/
-/* the actual current config file */
-
/*
- * Define kernel_config_data and kernel_config_data_size, which contains the
- * wrapped and compressed configuration file. The file is first compressed
- * with gzip and then bounded by two eight byte magic numbers to allow
- * extraction from a binary kernel image:
- *
- * IKCFG_ST
- * <image>
- * IKCFG_ED
+ * "IKCFG_ST" and "IKCFG_ED" are used to extract the config data from
+ * a binary kernel image or a module. See scripts/extract-ikconfig.
*/
-#define MAGIC_START "IKCFG_ST"
-#define MAGIC_END "IKCFG_ED"
-#include "config_data.h"
-
-
-#define MAGIC_SIZE (sizeof(MAGIC_START) - 1)
-#define kernel_config_data_size \
- (sizeof(kernel_config_data) - 1 - MAGIC_SIZE * 2)
+asm (
+" .pushsection .rodata, \"a\" \n"
+" .ascii \"IKCFG_ST\" \n"
+" .global kernel_config_data \n"
+"kernel_config_data: \n"
+" .incbin \"kernel/config_data.gz\" \n"
+" .global kernel_config_data_end \n"
+"kernel_config_data_end: \n"
+" .ascii \"IKCFG_ED\" \n"
+" .popsection \n"
+);
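scripts/extract-ikconfig is the canonical tool for pulling the configuration back out of an image. As a rough, assumption-laden simplification of what it does (not that script itself), the standalone sketch below merely locates the bytes between the two magics and writes the embedded config_data.gz to stdout for gunzip:

    #define _GNU_SOURCE             /* for memmem() */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    int main(int argc, char **argv)
    {
            FILE *f;
            long len;
            char *buf, *start, *end;

            if (argc != 2) {
                    fprintf(stderr, "usage: %s <vmlinux>\n", argv[0]);
                    return 1;
            }
            f = fopen(argv[1], "rb");
            if (!f) {
                    perror("fopen");
                    return 1;
            }
            fseek(f, 0, SEEK_END);
            len = ftell(f);
            rewind(f);
            buf = malloc(len);
            if (!buf || fread(buf, 1, len, f) != (size_t)len) {
                    fprintf(stderr, "read failed\n");
                    return 1;
            }
            start = memmem(buf, len, "IKCFG_ST", 8);
            end = start ? memmem(start + 8, len - (start + 8 - buf),
                                 "IKCFG_ED", 8) : NULL;
            if (!start || !end) {
                    fprintf(stderr, "no in-kernel config found\n");
                    return 1;
            }
            fwrite(start + 8, 1, end - (start + 8), stdout);
            fclose(f);
            free(buf);
            return 0;
    }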
#ifdef CONFIG_IKCONFIG_PROC
+extern char kernel_config_data;
+extern char kernel_config_data_end;
+
static ssize_t
ikconfig_read_current(struct file *file, char __user *buf,
size_t len, loff_t * offset)
{
return simple_read_from_buffer(buf, len, offset,
- kernel_config_data + MAGIC_SIZE,
- kernel_config_data_size);
+ &kernel_config_data,
+ &kernel_config_data_end -
+ &kernel_config_data);
}
static const struct file_operations ikconfig_file_ops = {
@@ -79,7 +77,7 @@ static int __init ikconfig_init(void)
if (!entry)
return -ENOMEM;
- proc_set_size(entry, kernel_config_data_size);
+ proc_set_size(entry, &kernel_config_data_end - &kernel_config_data);
return 0;
}
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 933cb3e45b98..093c9f917ed0 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -464,6 +464,8 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
#ifdef CONFIG_HUGETLB_PAGE
VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
+#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline)
+ VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE);
#endif
arch_crash_save_vmcoreinfo();
diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
index 7a723194ecbe..2b750f13bc8f 100644
--- a/kernel/dma/remap.c
+++ b/kernel/dma/remap.c
@@ -158,7 +158,7 @@ out:
bool dma_in_atomic_pool(void *start, size_t size)
{
- return addr_in_gen_pool(atomic_pool, (unsigned long)start, size);
+ return gen_pool_has_addr(atomic_pool, (unsigned long)start, size);
}
void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags)
diff --git a/kernel/fork.c b/kernel/fork.c
index 08e1f9f22ddf..97d1a29babe2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -919,6 +919,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#ifdef CONFIG_MEMCG
tsk->active_memcg = NULL;
+ tsk->memcg_high_reclaim = NULL;
#endif
return tsk;
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
index 1e32e66c9563..2dddecbdbe6e 100644
--- a/kernel/gcov/gcc_3_4.c
+++ b/kernel/gcov/gcc_3_4.c
@@ -245,8 +245,7 @@ struct gcov_info *gcov_info_dup(struct gcov_info *info)
/* Duplicate gcov_info. */
active = num_counter_active(info);
- dup = kzalloc(sizeof(struct gcov_info) +
- sizeof(struct gcov_ctr_info) * active, GFP_KERNEL);
+ dup = kzalloc(struct_size(dup, counts, active), GFP_KERNEL);
if (!dup)
return NULL;
dup->version = info->version;
@@ -364,8 +363,7 @@ struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
{
struct gcov_iterator *iter;
- iter = kzalloc(sizeof(struct gcov_iterator) +
- num_counter_active(info) * sizeof(struct type_info),
+ iter = kzalloc(struct_size(iter, type_info, num_counter_active(info)),
GFP_KERNEL);
if (iter)
iter->info = info;
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 4a9191617076..0c11216171c9 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -19,6 +19,7 @@
#include <linux/utsname.h>
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>
+#include <linux/sched/sysctl.h>
#include <trace/events/sched.h>
diff --git a/kernel/kcov.c b/kernel/kcov.c
index c2277dbdbfb1..2ee38727844a 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -20,6 +20,7 @@
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/kcov.h>
+#include <linux/refcount.h>
#include <asm/setup.h>
/* Number of 64-bit words written per one comparison: */
@@ -44,7 +45,7 @@ struct kcov {
* - opened file descriptor
* - task with enabled coverage (we can't unwire it from another task)
*/
- atomic_t refcount;
+ refcount_t refcount;
/* The lock protects mode, size, area and t. */
spinlock_t lock;
enum kcov_mode mode;
@@ -228,12 +229,12 @@ EXPORT_SYMBOL(__sanitizer_cov_trace_switch);
static void kcov_get(struct kcov *kcov)
{
- atomic_inc(&kcov->refcount);
+ refcount_inc(&kcov->refcount);
}
static void kcov_put(struct kcov *kcov)
{
- if (atomic_dec_and_test(&kcov->refcount)) {
+ if (refcount_dec_and_test(&kcov->refcount)) {
vfree(kcov->area);
kfree(kcov);
}
@@ -312,7 +313,7 @@ static int kcov_open(struct inode *inode, struct file *filep)
if (!kcov)
return -ENOMEM;
kcov->mode = KCOV_MODE_DISABLED;
- atomic_set(&kcov->refcount, 1);
+ refcount_set(&kcov->refcount, 1);
spin_lock_init(&kcov->lock);
filep->private_data = kcov;
return nonseekable_open(inode, filep);
@@ -444,10 +445,8 @@ static int __init kcov_init(void)
* there is no need to protect it against removal races. The
* use of debugfs_create_file_unsafe() is actually safe here.
*/
- if (!debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops)) {
- pr_err("failed to create kcov in debugfs\n");
- return -ENOMEM;
- }
+ debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops);
+
return 0;
}
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 65234c89d85b..103773a8e5e9 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -20,6 +20,7 @@
#include <linux/freezer.h>
#include <linux/ptrace.h>
#include <linux/uaccess.h>
+#include <linux/numa.h>
#include <trace/events/sched.h>
static DEFINE_SPINLOCK(kthread_create_lock);
@@ -681,7 +682,7 @@ __kthread_create_worker(int cpu, unsigned int flags,
{
struct kthread_worker *worker;
struct task_struct *task;
- int node = -1;
+ int node = NUMA_NO_NODE;
worker = kzalloc(sizeof(*worker), GFP_KERNEL);
if (!worker)
diff --git a/kernel/module.c b/kernel/module.c
index 2ad1b5239910..0b9aa8ab89f0 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2719,11 +2719,7 @@ static void dynamic_debug_setup(struct module *mod, struct _ddebug *debug, unsig
{
if (!debug)
return;
-#ifdef CONFIG_DYNAMIC_DEBUG
- if (ddebug_add_module(debug, num, mod->name))
- pr_err("dynamic debug error adding module: %s\n",
- debug->modname);
-#endif
+ ddebug_add_module(debug, num, mod->name);
}
static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug)
diff --git a/kernel/panic.c b/kernel/panic.c
index f121e6ba7e11..0ae0d7332f12 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -642,16 +642,14 @@ static int clear_warn_once_set(void *data, u64 val)
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(clear_warn_once_fops,
- NULL,
- clear_warn_once_set,
- "%lld\n");
+DEFINE_DEBUGFS_ATTRIBUTE(clear_warn_once_fops, NULL, clear_warn_once_set,
+ "%lld\n");
static __init int register_warn_debugfs(void)
{
/* Don't care about failure */
- debugfs_create_file("clear_warn_once", 0200, NULL,
- NULL, &clear_warn_once_fops);
+ debugfs_create_file_unsafe("clear_warn_once", 0200, NULL, NULL,
+ &clear_warn_once_fops);
return 0;
}
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 640b2034edd6..4802b039b89f 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1215,14 +1215,16 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
if (!pfn_valid(pfn))
return NULL;
- page = pfn_to_page(pfn);
- if (page_zone(page) != zone)
+ page = pfn_to_online_page(pfn);
+ if (!page || page_zone(page) != zone)
return NULL;
BUG_ON(!PageHighMem(page));
- if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page) ||
- PageReserved(page))
+ if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page))
+ return NULL;
+
+ if (PageReserved(page) || PageOffline(page))
return NULL;
if (page_is_guard(page))
@@ -1277,8 +1279,8 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
if (!pfn_valid(pfn))
return NULL;
- page = pfn_to_page(pfn);
- if (page_zone(page) != zone)
+ page = pfn_to_online_page(pfn);
+ if (!page || page_zone(page) != zone)
return NULL;
BUG_ON(PageHighMem(page));
@@ -1286,6 +1288,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page))
return NULL;
+ if (PageOffline(page))
+ return NULL;
+
if (PageReserved(page)
&& (!kernel_page_present(page) || pfn_is_nosave(pfn)))
return NULL;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 771e93f9c43f..6f357f4fc859 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -29,6 +29,7 @@
#include <linux/hw_breakpoint.h>
#include <linux/cn_proc.h>
#include <linux/compat.h>
+#include <linux/sched/signal.h>
/*
* Access another process' address space via ptrace.
@@ -924,18 +925,26 @@ int ptrace_request(struct task_struct *child, long request,
ret = ptrace_setsiginfo(child, &siginfo);
break;
- case PTRACE_GETSIGMASK:
+ case PTRACE_GETSIGMASK: {
+ sigset_t *mask;
+
if (addr != sizeof(sigset_t)) {
ret = -EINVAL;
break;
}
- if (copy_to_user(datavp, &child->blocked, sizeof(sigset_t)))
+ if (test_tsk_restore_sigmask(child))
+ mask = &child->saved_sigmask;
+ else
+ mask = &child->blocked;
+
+ if (copy_to_user(datavp, mask, sizeof(sigset_t)))
ret = -EFAULT;
else
ret = 0;
break;
+ }
case PTRACE_SETSIGMASK: {
sigset_t new_set;
@@ -961,6 +970,8 @@ int ptrace_request(struct task_struct *child, long request,
child->blocked = new_set;
spin_unlock_irq(&child->sighand->siglock);
+ clear_tsk_restore_sigmask(child);
+
ret = 0;
break;
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e7aafbeeda2f..22d7b09d3540 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2220,6 +2220,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif
+#ifdef CONFIG_COMPACTION
+ p->capture_control = NULL;
+#endif
init_numa_balancing(clone_flags, p);
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8213ff6e365d..ea74d43924b2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1173,7 +1173,7 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
/* New address space, reset the preferred nid */
if (!(clone_flags & CLONE_VM)) {
- p->numa_preferred_nid = -1;
+ p->numa_preferred_nid = NUMA_NO_NODE;
return;
}
@@ -1193,13 +1193,13 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
- rq->nr_numa_running += (p->numa_preferred_nid != -1);
+ rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
}
static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
- rq->nr_numa_running -= (p->numa_preferred_nid != -1);
+ rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
}
@@ -1413,7 +1413,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
* two full passes of the "multi-stage node selection" test that is
* executed below.
*/
- if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) &&
+ if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
(cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
return true;
@@ -1861,7 +1861,7 @@ static void numa_migrate_preferred(struct task_struct *p)
unsigned long interval = HZ;
/* This task has no NUMA fault statistics yet */
- if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
+ if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
return;
/* Periodically retry migrating the task to the preferred node */
@@ -2108,7 +2108,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
static void task_numa_placement(struct task_struct *p)
{
- int seq, nid, max_nid = -1;
+ int seq, nid, max_nid = NUMA_NO_NODE;
unsigned long max_faults = 0;
unsigned long fault_types[2] = { 0, 0 };
unsigned long total_faults;
@@ -2651,7 +2651,8 @@ static void update_scan_period(struct task_struct *p, int new_cpu)
* the preferred node.
*/
if (dst_nid == p->numa_preferred_nid ||
- (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid))
+ (p->numa_preferred_nid != NUMA_NO_NODE &&
+ src_nid != p->numa_preferred_nid))
return;
}
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index c3484785b179..0bb130692ad6 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -4,6 +4,9 @@
* Copyright (c) 2018 Facebook, Inc.
* Author: Johannes Weiner <hannes@cmpxchg.org>
*
+ * Polling support by Suren Baghdasaryan <surenb@google.com>
+ * Copyright (c) 2018 Google, Inc.
+ *
* When CPU, memory and IO are contended, tasks experience delays that
* reduce throughput and introduce latencies into the workload. Memory
* and IO contention, in addition, can cause a full loss of forward
@@ -127,11 +130,16 @@
#include "../workqueue_internal.h"
#include <linux/sched/loadavg.h>
#include <linux/seq_file.h>
+#include <linux/eventfd.h>
#include <linux/proc_fs.h>
#include <linux/seqlock.h>
+#include <linux/uaccess.h>
#include <linux/cgroup.h>
#include <linux/module.h>
#include <linux/sched.h>
+#include <linux/ctype.h>
+#include <linux/file.h>
+#include <linux/poll.h>
#include <linux/psi.h>
#include "sched.h"
@@ -140,9 +148,9 @@ static int psi_bug __read_mostly;
DEFINE_STATIC_KEY_FALSE(psi_disabled);
#ifdef CONFIG_PSI_DEFAULT_DISABLED
-bool psi_enable;
+static bool psi_enable;
#else
-bool psi_enable = true;
+static bool psi_enable = true;
#endif
static int __init setup_psi(char *str)
{
@@ -151,11 +159,16 @@ static int __init setup_psi(char *str)
__setup("psi=", setup_psi);
/* Running averages - we need to be higher-res than loadavg */
-#define PSI_FREQ (2*HZ+1) /* 2 sec intervals */
+#define PSI_FREQ (2*HZ+1UL) /* 2 sec intervals */
#define EXP_10s 1677 /* 1/exp(2s/10s) as fixed-point */
#define EXP_60s 1981 /* 1/exp(2s/60s) */
#define EXP_300s 2034 /* 1/exp(2s/300s) */
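The EXP_* constants are the usual loadavg-style fixed-point decay factors, 1/exp(sampling period / horizon) scaled by FIXED_1. Assuming FIXED_1 = 1 << 11 as in linux/sched/loadavg.h, a few lines of C reproduce them (compile with -lm):

    #include <math.h>
    #include <stdio.h>

    #define FIXED_1 (1 << 11)       /* loadavg fixed-point unit, FSHIFT == 11 */

    int main(void)
    {
            int horizon[] = { 10, 60, 300 };        /* averaging horizons, s */

            for (int i = 0; i < 3; i++)             /* 2s sampling interval */
                    printf("EXP_%ds ~= %.0f\n", horizon[i],
                           exp(-2.0 / horizon[i]) * FIXED_1);
            return 0;                               /* 1677, 1981, 2034 */
    }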
+/* PSI trigger definitions */
+#define WINDOW_MIN_US 500000 /* Min window size is 500ms */
+#define WINDOW_MAX_US 10000000 /* Max window size is 10s */
+#define UPDATES_PER_WINDOW 10 /* 10 updates per window */
+
/* Sampling frequency in nanoseconds */
static u64 psi_period __read_mostly;
@@ -173,9 +186,18 @@ static void group_init(struct psi_group *group)
for_each_possible_cpu(cpu)
seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
- group->next_update = sched_clock() + psi_period;
+ group->avg_next_update = sched_clock() + psi_period;
+ atomic_set(&group->polling, 0);
INIT_DELAYED_WORK(&group->clock_work, psi_update_work);
- mutex_init(&group->stat_lock);
+ mutex_init(&group->update_lock);
+ /* Init trigger-related members */
+ INIT_LIST_HEAD(&group->triggers);
+ memset(group->nr_triggers, 0, sizeof(group->nr_triggers));
+ group->trigger_states = 0;
+ group->trigger_min_period = U32_MAX;
+ memset(group->polling_total, 0, sizeof(group->polling_total));
+ group->polling_next_update = ULLONG_MAX;
+ group->polling_until = 0;
}
void __init psi_init(void)
@@ -210,20 +232,23 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
}
}
-static void get_recent_times(struct psi_group *group, int cpu, u32 *times)
+static void get_recent_times(struct psi_group *group, int cpu, u32 *times,
+ u32 *pchanged_states)
{
struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
- unsigned int tasks[NR_PSI_TASK_COUNTS];
u64 now, state_start;
+ enum psi_states s;
unsigned int seq;
- int s;
+ u32 state_mask;
+
+ *pchanged_states = 0;
/* Snapshot a coherent view of the CPU state */
do {
seq = read_seqcount_begin(&groupc->seq);
now = cpu_clock(cpu);
memcpy(times, groupc->times, sizeof(groupc->times));
- memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
+ state_mask = groupc->state_mask;
state_start = groupc->state_start;
} while (read_seqcount_retry(&groupc->seq, seq));
@@ -239,13 +264,15 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times)
* (u32) and our reported pressure close to what's
* actually happening.
*/
- if (test_state(tasks, s))
+ if (state_mask & (1 << s))
times[s] += now - state_start;
delta = times[s] - groupc->times_prev[s];
groupc->times_prev[s] = times[s];
times[s] = delta;
+ if (delta)
+ *pchanged_states |= (1 << s);
}
}
@@ -269,17 +296,14 @@ static void calc_avgs(unsigned long avg[3], int missed_periods,
avg[2] = calc_load(avg[2], EXP_300s, pct);
}
-static bool update_stats(struct psi_group *group)
+static void collect_percpu_times(struct psi_group *group, u32 *pchanged_states)
{
u64 deltas[NR_PSI_STATES - 1] = { 0, };
- unsigned long missed_periods = 0;
unsigned long nonidle_total = 0;
- u64 now, expires, period;
+ u32 changed_states = 0;
int cpu;
int s;
- mutex_lock(&group->stat_lock);
-
/*
* Collect the per-cpu time buckets and average them into a
* single time sample that is normalized to wallclock time.
@@ -291,8 +315,10 @@ static bool update_stats(struct psi_group *group)
for_each_possible_cpu(cpu) {
u32 times[NR_PSI_STATES];
u32 nonidle;
+ u32 cpu_changed_states;
- get_recent_times(group, cpu, times);
+ get_recent_times(group, cpu, times, &cpu_changed_states);
+ changed_states |= cpu_changed_states;
nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]);
nonidle_total += nonidle;
@@ -317,12 +343,19 @@ static bool update_stats(struct psi_group *group)
for (s = 0; s < NR_PSI_STATES - 1; s++)
group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL));
- /* avgX= */
- now = sched_clock();
- expires = group->next_update;
- if (now < expires)
- goto out;
- if (now - expires > psi_period)
+ if (pchanged_states)
+ *pchanged_states = changed_states;
+}
+
+static u64 update_averages(struct psi_group *group, u64 now)
+{
+ unsigned long missed_periods = 0;
+ u64 expires, period;
+ u64 avg_next_update;
+ int s;
+
+ expires = group->avg_next_update;
+ if (now - expires >= psi_period)
missed_periods = div_u64(now - expires, psi_period);
/*
@@ -332,14 +365,14 @@ static bool update_stats(struct psi_group *group)
* But the deltas we sample out of the per-cpu buckets above
* are based on the actual time elapsing between clock ticks.
*/
- group->next_update = expires + ((1 + missed_periods) * psi_period);
- period = now - (group->last_update + (missed_periods * psi_period));
- group->last_update = now;
+ avg_next_update = expires + ((1 + missed_periods) * psi_period);
+ period = now - (group->avg_last_update + (missed_periods * psi_period));
+ group->avg_last_update = now;
for (s = 0; s < NR_PSI_STATES - 1; s++) {
u32 sample;
- sample = group->total[s] - group->total_prev[s];
+ sample = group->total[s] - group->avg_total[s];
/*
* Due to the lockless sampling of the time buckets,
* recorded time deltas can slip into the next period,
@@ -359,23 +392,244 @@ static bool update_stats(struct psi_group *group)
*/
if (sample > period)
sample = period;
- group->total_prev[s] += sample;
+ group->avg_total[s] += sample;
calc_avgs(group->avg[s], missed_periods, sample, period);
}
-out:
- mutex_unlock(&group->stat_lock);
- return nonidle_total;
+
+ return avg_next_update;
+}
+
+/* Trigger tracking window manipulations */
+static void window_reset(struct psi_window *win, u64 now, u64 value,
+ u64 prev_growth)
+{
+ win->start_time = now;
+ win->start_value = value;
+ win->prev_growth = prev_growth;
+}
+
+/*
+ * PSI growth tracking window update and growth calculation routine.
+ *
+ * This approximates a sliding tracking window by interpolating
+ * partially elapsed windows using historical growth data from the
+ * previous intervals. This minimizes memory requirements (by not storing
+ * all the intermediate values in the previous window) and simplifies
+ * the calculations. It works well because the PSI signal changes only
+ * in the positive direction and, over relatively small window sizes,
+ * the growth is close to linear.
+ */
+static u64 window_update(struct psi_window *win, u64 now, u64 value)
+{
+ u64 elapsed;
+ u64 growth;
+
+ elapsed = now - win->start_time;
+ growth = value - win->start_value;
+ /*
+ * After each tracking window passes win->start_value and
+ * win->start_time get reset and win->prev_growth stores
+ * the average per-window growth of the previous window.
+ * win->prev_growth is then used to interpolate additional
+ * growth from the previous window assuming it was linear.
+ */
+ if (elapsed > win->size)
+ window_reset(win, now, value, growth);
+ else {
+ u32 remaining;
+
+ remaining = win->size - elapsed;
+ growth += div_u64(win->prev_growth * remaining, win->size);
+ }
+
+ return growth;
+}
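A worked example helps here: the userspace model below reproduces the interpolation in window_update(). Halfway through a 1000-unit window whose predecessor saw 400 units of growth, 100 units of new growth report as 100 + 400 * 500/1000 = 300.

    #include <stdint.h>
    #include <stdio.h>

    /* Userspace model of the interpolation in window_update() above. */
    struct win { uint64_t size, start_time, start_value, prev_growth; };

    static uint64_t win_update(struct win *w, uint64_t now, uint64_t value)
    {
            uint64_t elapsed = now - w->start_time;
            uint64_t growth = value - w->start_value;

            if (elapsed > w->size) {
                    /* window rolled over: remember its growth and restart */
                    w->start_time = now;
                    w->start_value = value;
                    w->prev_growth = growth;
            } else {
                    /* borrow a share of the previous window's growth */
                    uint64_t remaining = w->size - elapsed;

                    growth += w->prev_growth * remaining / w->size;
            }
            return growth;
    }

    int main(void)
    {
            struct win w = { .size = 1000, .prev_growth = 400 };

            /* halfway through: 100 new + 400 * 500/1000 = 300 */
            printf("growth = %llu\n",
                   (unsigned long long)win_update(&w, 500, 100));
            return 0;
    }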
+
+static void init_triggers(struct psi_group *group, u64 now)
+{
+ struct psi_trigger *t;
+
+ list_for_each_entry(t, &group->triggers, node)
+ window_reset(&t->win, now, group->total[t->state], 0);
+ memcpy(group->polling_total, group->total,
+ sizeof(group->polling_total));
+ group->polling_next_update = now + group->trigger_min_period;
}
+static u64 update_triggers(struct psi_group *group, u64 now)
+{
+ struct psi_trigger *t;
+ bool new_stall = false;
+
+ /*
+ * On subsequent updates, calculate growth deltas and let
+ * watchers know when their specified thresholds are exceeded.
+ */
+ list_for_each_entry(t, &group->triggers, node) {
+ u64 growth;
+
+ /* Check for stall activity */
+ if (group->polling_total[t->state] == group->total[t->state])
+ continue;
+
+ /*
+ * Multiple triggers might be looking at the same state,
+ * remember to update group->polling_total[] once we've
+ * been through all of them. Also remember to extend the
+ * polling time if we see new stall activity.
+ */
+ new_stall = true;
+
+ /* Calculate growth since last update */
+ growth = window_update(&t->win, now, group->total[t->state]);
+ if (growth < t->threshold)
+ continue;
+
+ /* Limit event signaling to once per window */
+ if (now < t->last_event_time + t->win.size)
+ continue;
+
+ /* Generate an event */
+ if (cmpxchg(&t->event, 0, 1) == 0)
+ wake_up_interruptible(&t->event_wait);
+ t->last_event_time = now;
+ }
+
+ if (new_stall) {
+ memcpy(group->polling_total, group->total,
+ sizeof(group->polling_total));
+ }
+
+ return now + group->trigger_min_period;
+}
+
+/*
+ * psi_update_work represents slowpath accounting part while psi_group_change
+ * represents hotpath part. There are two potential races between them:
+ * 1. Changes to group->polling when slowpath checks for new stall, then hotpath
+ * records new stall and then slowpath resets group->polling flag. This leads
+ * to the exit from the polling mode while monitored state is still changing.
+ * 2. Slowpath overwriting an immediate update scheduled from the hotpath with
+ * a regular update further in the future and missing the immediate update.
+ * Both races are handled with a retry cycle in the slowpath:
+ *
+ * HOTPATH: | SLOWPATH:
+ * |
+ * A) times[cpu] += delta | E) delta = times[*]
+ * B) start_poll = (delta[poll_mask] &&| polling = g->polling
+ * cmpxchg(g->polling, 0, 1) == 0)| if delta[poll_mask]:
+ * if start_poll: | F) polling_until = now + grace_period
+ * C) mod_delayed_work(1) | if now > polling_until:
+ * else if !delayed_work_pending():| if polling:
+ * D) schedule_delayed_work(PSI_FREQ)| G) g->polling = polling = 0
+ * | smp_mb
+ * | H) goto SLOWPATH
+ * | else:
+ * | if !polling:
+ * | I) g->polling = polling = 1
+ * | J) if delta && first_pass:
+ * | next_avg = update_averages()
+ * | if polling:
+ * | next_poll = update_triggers()
+ * | if (delta && first_pass) || polling:
+ * | K) mod_delayed_work(
+ * | min(next_avg, next_poll))
+ * | if !polling:
+ * | first_pass = false
+ * | L) goto SLOWPATH
+ *
+ * Race #1 is represented by (EABGD) sequence in which case slowpath deactivates
+ * polling mode because it misses new monitored stall and hotpath doesn't
+ * activate it because at (B) g->polling is not yet reset by slowpath in (G).
+ * This race is handled by the (H) retry, which in the race described above
+ * results in the new sequence of (EABGDHEIK) that reactivates polling mode.
+ *
+ * Race #2 is represented by polling==false && (JABCK) sequence which overwrites
+ * immediate update scheduled at (C) with a later (next_avg) update scheduled at
+ * (K). This race is handled by the (L) retry which results in the new sequence
+ * of polling==false && (JABCKLEIK) that reactivates polling mode and
+ * reschedules the next polling update (next_poll).
+ *
+ * Note that retries can't result in an infinite loop because retry #1 happens
+ * only during polling reactivation and retry #2 happens only on the first pass.
+ * Constant reactivations are impossible because polling will stay active for at
+ * least grace_period. Worst case scenario involves two retries (HEJKLE)
+ */
static void psi_update_work(struct work_struct *work)
{
struct delayed_work *dwork;
struct psi_group *group;
+ bool first_pass = true;
+ u64 next_update;
+ u32 changed_states;
+ int polling;
bool nonidle;
+ u64 now;
dwork = to_delayed_work(work);
group = container_of(dwork, struct psi_group, clock_work);
+ mutex_lock(&group->update_lock);
+
+ now = sched_clock();
+
+retry:
+ collect_percpu_times(group, &changed_states);
+ polling = atomic_read(&group->polling);
+
+ if (changed_states & group->trigger_states) {
+ /* Initialize trigger windows when entering polling mode */
+ if (now > group->polling_until)
+ init_triggers(group, now);
+
+ /*
+ * Keep the monitor active for at least the duration of the
+ * minimum tracking window as long as monitor states are
+ * changing. This prevents frequent changes to polling flag
+ * when system bounces in and out of stall states.
+ */
+ group->polling_until = now +
+ group->trigger_min_period * UPDATES_PER_WINDOW;
+ }
+
+ /* Handle polling flag transitions */
+ if (now > group->polling_until) {
+ if (polling) {
+ group->polling_next_update = ULLONG_MAX;
+ polling = 0;
+ atomic_set(&group->polling, polling);
+ /*
+ * Memory barrier is needed to order group->polling=0
+ * write before times[] reads in collect_percpu_times()
+ * to detect possible race with hotpath that modifies
+ * times[] before it sets group->polling=1 (see Race #1
+ * description in the comments at the top).
+ */
+ smp_mb();
+ /*
+ * Check if we missed stall recorded by hotpath while
+ * polling flag was set to 1 causing hotpath to skip
+ * entering polling mode
+ */
+ goto retry;
+ }
+ } else {
+ if (!polling) {
+ /*
+ * This can happen as a fixup in the retry cycle after
+ * new stall is discovered
+ */
+ polling = 1;
+ atomic_set(&group->polling, polling);
+ }
+ }
+ /*
+ * At this point the group->polling race with the hotpath is resolved
+ * and we rely on the local polling flag, ignoring possible further
+ * changes to group->polling
+ */
+
+ nonidle = (changed_states & (1 << PSI_NONIDLE));
/*
* If there is task activity, periodically fold the per-cpu
* times and feed samples into the running averages. If things
@@ -383,18 +637,34 @@ static void psi_update_work(struct work_struct *work)
* Once restarted, we'll catch up the running averages in one
* go - see calc_avgs() and missed_periods.
*/
+ if (nonidle && first_pass) {
+ if (now >= group->avg_next_update)
+ group->avg_next_update = update_averages(group, now);
- nonidle = update_stats(group);
-
- if (nonidle) {
- unsigned long delay = 0;
- u64 now;
-
- now = sched_clock();
- if (group->next_update > now)
- delay = nsecs_to_jiffies(group->next_update - now) + 1;
- schedule_delayed_work(dwork, delay);
+ if (now >= group->polling_next_update) {
+ group->polling_next_update = update_triggers(
+ group, now);
+ }
}
+ if ((nonidle && first_pass) || polling) {
+ /* Calculate closest update time */
+ next_update = min(group->polling_next_update,
+ group->avg_next_update);
+ mod_delayed_work(system_wq, dwork, nsecs_to_jiffies(
+ next_update - now) + 1);
+ if (!polling) {
+ /*
+ * We might have overwritten an immediate update
+ * scheduled from the hotpath with a longer regular
+ * update (group->avg_next_update). Execute second pass
+ * retry to discover that and resume polling.
+ */
+ first_pass = false;
+ goto retry;
+ }
+ }
+
+ mutex_unlock(&group->update_lock);
}
static void record_times(struct psi_group_cpu *groupc, int cpu,
@@ -407,15 +677,15 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
delta = now - groupc->state_start;
groupc->state_start = now;
- if (test_state(groupc->tasks, PSI_IO_SOME)) {
+ if (groupc->state_mask & (1 << PSI_IO_SOME)) {
groupc->times[PSI_IO_SOME] += delta;
- if (test_state(groupc->tasks, PSI_IO_FULL))
+ if (groupc->state_mask & (1 << PSI_IO_FULL))
groupc->times[PSI_IO_FULL] += delta;
}
- if (test_state(groupc->tasks, PSI_MEM_SOME)) {
+ if (groupc->state_mask & (1 << PSI_MEM_SOME)) {
groupc->times[PSI_MEM_SOME] += delta;
- if (test_state(groupc->tasks, PSI_MEM_FULL))
+ if (groupc->state_mask & (1 << PSI_MEM_FULL))
groupc->times[PSI_MEM_FULL] += delta;
else if (memstall_tick) {
u32 sample;
@@ -436,18 +706,20 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
}
}
- if (test_state(groupc->tasks, PSI_CPU_SOME))
+ if (groupc->state_mask & (1 << PSI_CPU_SOME))
groupc->times[PSI_CPU_SOME] += delta;
- if (test_state(groupc->tasks, PSI_NONIDLE))
+ if (groupc->state_mask & (1 << PSI_NONIDLE))
groupc->times[PSI_NONIDLE] += delta;
}
-static void psi_group_change(struct psi_group *group, int cpu,
+static u32 psi_group_change(struct psi_group *group, int cpu,
unsigned int clear, unsigned int set)
{
struct psi_group_cpu *groupc;
unsigned int t, m;
+ enum psi_states s;
+ u32 state_mask = 0;
groupc = per_cpu_ptr(group->pcpu, cpu);
@@ -480,7 +752,16 @@ static void psi_group_change(struct psi_group *group, int cpu,
if (set & (1 << t))
groupc->tasks[t]++;
+ /* Calculate state mask representing active states */
+ for (s = 0; s < NR_PSI_STATES; s++) {
+ if (test_state(groupc->tasks, s))
+ state_mask |= (1 << s);
+ }
+ groupc->state_mask = state_mask;
+
write_seqcount_end(&groupc->seq);
+
+ return state_mask;
}
static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
@@ -541,7 +822,27 @@ void psi_task_change(struct task_struct *task, int clear, int set)
wake_clock = false;
while ((group = iterate_groups(task, &iter))) {
- psi_group_change(group, cpu, clear, set);
+ u32 state_mask = psi_group_change(group, cpu, clear, set);
+
+ /*
+ * The polling flag resets to 0 at a max rate of once per update
+ * window (at least a 500ms interval). An smp_wmb is required after
+ * the group->polling 0-to-1 transition to order the groupc->times
+ * and group->polling writes, because the stall detection logic in
+ * the slowpath relies on groupc->times changing before
+ * group->polling. There is no explicit smp_wmb() here because
+ * cmpxchg() implies smp_mb().
+ */
+ if ((state_mask & group->trigger_states) &&
+ atomic_cmpxchg(&group->polling, 0, 1) == 0) {
+ /*
+ * Start polling immediately even if the work is already
+ * scheduled
+ */
+ mod_delayed_work(system_wq, &group->clock_work, 1);
+ continue;
+ }
+
if (wake_clock && !delayed_work_pending(&group->clock_work))
schedule_delayed_work(&group->clock_work, PSI_FREQ);
}
@@ -642,6 +943,8 @@ void psi_cgroup_free(struct cgroup *cgroup)
cancel_delayed_work_sync(&cgroup->psi.clock_work);
free_percpu(cgroup->psi.pcpu);
+ /* All triggers must be removed by now by psi_trigger_destroy */
+ WARN_ONCE(cgroup->psi.trigger_states, "psi: trigger leak\n");
}
/**
@@ -701,7 +1004,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
if (static_branch_likely(&psi_disabled))
return -EOPNOTSUPP;
- update_stats(group);
+ /* Update averages before reporting them */
+ mutex_lock(&group->update_lock);
+ collect_percpu_times(group, NULL);
+ update_averages(group, sched_clock());
+ mutex_unlock(&group->update_lock);
for (full = 0; full < 2 - (res == PSI_CPU); full++) {
unsigned long avg[3];
@@ -753,25 +1060,223 @@ static int psi_cpu_open(struct inode *inode, struct file *file)
return single_open(file, psi_cpu_show, NULL);
}
+struct psi_trigger *psi_trigger_create(struct psi_group *group,
+ char *buf, size_t nbytes, enum psi_res res)
+{
+ struct psi_trigger *t;
+ enum psi_states state;
+ u32 threshold_us;
+ u32 window_us;
+
+ if (static_branch_likely(&psi_disabled))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2)
+ state = PSI_IO_SOME + res * 2;
+ else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2)
+ state = PSI_IO_FULL + res * 2;
+ else
+ return ERR_PTR(-EINVAL);
+
+ if (state >= PSI_NONIDLE)
+ return ERR_PTR(-EINVAL);
+
+ if (window_us < WINDOW_MIN_US ||
+ window_us > WINDOW_MAX_US)
+ return ERR_PTR(-EINVAL);
+
+ /* Check threshold */
+ if (threshold_us == 0 || threshold_us > window_us)
+ return ERR_PTR(-EINVAL);
+
+ t = kmalloc(sizeof(*t), GFP_KERNEL);
+ if (!t)
+ return ERR_PTR(-ENOMEM);
+
+ t->group = group;
+ t->state = state;
+ t->threshold = threshold_us * NSEC_PER_USEC;
+ t->win.size = window_us * NSEC_PER_USEC;
+ window_reset(&t->win, 0, 0, 0);
+
+ t->event = 0;
+ t->last_event_time = 0;
+ init_waitqueue_head(&t->event_wait);
+
+ mutex_lock(&group->update_lock);
+
+ list_add(&t->node, &group->triggers);
+ group->trigger_min_period = min(group->trigger_min_period,
+ div_u64(t->win.size, UPDATES_PER_WINDOW));
+ group->nr_triggers[t->state]++;
+ group->trigger_states |= (1 << t->state);
+
+ mutex_unlock(&group->update_lock);
+
+ return t;
+}
+
+void psi_trigger_destroy(struct psi_trigger *t)
+{
+ struct psi_group *group = t->group;
+
+ if (static_branch_likely(&psi_disabled))
+ return;
+
+ mutex_lock(&group->update_lock);
+ if (!list_empty(&t->node)) {
+ struct psi_trigger *tmp;
+ u64 period = ULLONG_MAX;
+
+ list_del(&t->node);
+ group->nr_triggers[t->state]--;
+ if (!group->nr_triggers[t->state])
+ group->trigger_states &= ~(1 << t->state);
+ /* reset min update period for the remaining triggers */
+ list_for_each_entry(tmp, &group->triggers, node) {
+ period = min(period, div_u64(tmp->win.size,
+ UPDATES_PER_WINDOW));
+ }
+ group->trigger_min_period = period;
+ /*
+ * Wake up waiters to stop polling.
+ * This can happen if the cgroup is deleted from under
+ * a polling process.
+ */
+ wake_up_interruptible(&t->event_wait);
+ kfree(t);
+ }
+ mutex_unlock(&group->update_lock);
+}
+
+__poll_t psi_trigger_poll(struct psi_trigger *t,
+ struct file *file, poll_table *wait)
+{
+ if (static_branch_likely(&psi_disabled))
+ return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
+
+ poll_wait(file, &t->event_wait, wait);
+
+ if (cmpxchg(&t->event, 1, 0) == 1)
+ return DEFAULT_POLLMASK | EPOLLPRI;
+
+ /* Wait */
+ return DEFAULT_POLLMASK;
+}
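For illustration (not part of the patch), a minimal userspace consumer of these poll semantics might look like the sketch below. It assumes the pressure files end up exposed as /proc/pressure/memory and friends by psi_proc_init() (not shown in full in this excerpt), and that a 1-second window is within the WINDOW_MIN_US..WINDOW_MAX_US bounds enforced in psi_trigger_create() above.

/* Illustrative only: register a "some" memory trigger and wait for events. */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* 150ms of "some" memory stall within a 1s window, in microseconds. */
	const char trig[] = "some 150000 1000000";
	struct pollfd fds;
	int fd;

	fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
	if (fd < 0)
		return 1;
	if (write(fd, trig, strlen(trig) + 1) < 0)
		return 1;

	fds.fd = fd;
	fds.events = POLLPRI;	/* psi_trigger_poll() signals events as EPOLLPRI */

	while (poll(&fds, 1, -1) >= 0) {
		if (fds.revents & POLLERR)
			break;			/* e.g. the trigger went away */
		if (fds.revents & POLLPRI)
			printf("memory pressure event\n");
	}
	close(fd);
	return 0;
}

Writing a new trigger string replaces the previously installed trigger for that file, which is what psi_write() below implements by swapping the old trigger out under RCU.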
+
+static ssize_t psi_write(struct file *file, const char __user *user_buf,
+ size_t nbytes, enum psi_res res)
+{
+ char buf[32];
+ size_t buf_size;
+ struct seq_file *seq;
+ struct psi_trigger *old;
+ struct psi_trigger *new;
+
+ if (static_branch_likely(&psi_disabled))
+ return -EOPNOTSUPP;
+
+ /* Reject empty writes: buf_size - 1 below would underflow otherwise */
+ if (!nbytes)
+ return -EINVAL;
+
+ buf_size = min(nbytes, (sizeof(buf) - 1));
+ if (copy_from_user(buf, user_buf, buf_size))
+ return -EFAULT;
+
+ buf[buf_size - 1] = '\0';
+
+ new = psi_trigger_create(&psi_system, buf, nbytes, res);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ seq = file->private_data;
+ /* Take seq->lock to protect seq->private from concurrent writes */
+ mutex_lock(&seq->lock);
+ old = seq->private;
+ rcu_assign_pointer(seq->private, new);
+ mutex_unlock(&seq->lock);
+
+ if (old) {
+ synchronize_rcu();
+ psi_trigger_destroy(old);
+ }
+
+ return nbytes;
+}
+
+static ssize_t psi_io_write(struct file *file,
+ const char __user *user_buf, size_t nbytes, loff_t *ppos)
+{
+ return psi_write(file, user_buf, nbytes, PSI_IO);
+}
+
+static ssize_t psi_memory_write(struct file *file,
+ const char __user *user_buf, size_t nbytes, loff_t *ppos)
+{
+ return psi_write(file, user_buf, nbytes, PSI_MEM);
+}
+
+static ssize_t psi_cpu_write(struct file *file,
+ const char __user *user_buf, size_t nbytes, loff_t *ppos)
+{
+ return psi_write(file, user_buf, nbytes, PSI_CPU);
+}
+
+static __poll_t psi_fop_poll(struct file *file, poll_table *wait)
+{
+ struct seq_file *seq = file->private_data;
+ struct psi_trigger *t;
+ __poll_t ret;
+
+ rcu_read_lock();
+ t = rcu_dereference_raw(seq->private);
+ if (t)
+ ret = psi_trigger_poll(t, file, wait);
+ else
+ ret = DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
+ rcu_read_unlock();
+
+ return ret;
+
+}
+
+static int psi_fop_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+ struct psi_trigger *t = seq->private;
+
+ if (static_branch_likely(&psi_disabled) || !t)
+ goto out;
+
+ rcu_assign_pointer(seq->private, NULL);
+ synchronize_rcu();
+ psi_trigger_destroy(t);
+out:
+ return single_release(inode, file);
+}
+
static const struct file_operations psi_io_fops = {
.open = psi_io_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .write = psi_io_write,
+ .poll = psi_fop_poll,
+ .release = psi_fop_release,
};
static const struct file_operations psi_memory_fops = {
.open = psi_memory_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .write = psi_memory_write,
+ .poll = psi_fop_poll,
+ .release = psi_fop_release,
};
static const struct file_operations psi_cpu_fops = {
.open = psi_cpu_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .write = psi_cpu_write,
+ .poll = psi_fop_poll,
+ .release = psi_fop_release,
};
static int __init psi_proc_init(void)
diff --git a/kernel/signal.c b/kernel/signal.c
index b7953934aa99..3a9e41197d46 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3003,7 +3003,7 @@ static bool known_siginfo_layout(unsigned sig, int si_code)
if (si_code == SI_KERNEL)
return true;
else if ((si_code > SI_USER)) {
- if (sig_specific_sicodes(sig)) {
+ if (sig && sig_specific_sicodes(sig)) {
if (si_code <= sig_sicodes[sig].limit)
return true;
}
diff --git a/kernel/sys.c b/kernel/sys.c
index c5f875048aef..12df0e5434b8 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1747,6 +1747,7 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
if (who == RUSAGE_CHILDREN)
break;
+ /* fall through */
case RUSAGE_SELF:
thread_group_cputime_adjusted(p, &tgutime, &tgstime);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 987ae08147bf..4fce842c9bfb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -67,6 +67,8 @@
#include <linux/bpf.h>
#include <linux/mount.h>
+#include "../lib/kstrtox.h"
+
#include <linux/uaccess.h>
#include <asm/processor.h>
@@ -127,6 +129,7 @@ static int __maybe_unused one = 1;
static int __maybe_unused two = 2;
static int __maybe_unused four = 4;
static unsigned long one_ul = 1;
+static unsigned long long_max = LONG_MAX;
static int one_hundred = 100;
static int one_thousand = 1000;
#ifdef CONFIG_PRINTK
@@ -1457,7 +1460,7 @@ static struct ctl_table vm_table[] = {
.data = &sysctl_extfrag_threshold,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = sysctl_extfrag_handler,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = &min_extfrag_threshold,
.extra2 = &max_extfrag_threshold,
},
@@ -1733,6 +1736,8 @@ static struct ctl_table fs_table[] = {
.maxlen = sizeof(files_stat.max_files),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &long_max,
},
{
.procname = "nr_open",
@@ -2103,6 +2108,41 @@ static void proc_skip_char(char **buf, size_t *size, const char v)
}
}
+/**
+ * strtoul_lenient - parse an ASCII formatted integer from a buffer and only
+ * fail on overflow
+ *
+ * @cp: kernel buffer containing the string to parse
+ * @endp: pointer to store the trailing characters
+ * @base: the base to use
+ * @res: where the parsed integer will be stored
+ *
+ * On success 0 is returned, @res contains the parsed integer and @endp
+ * holds any trailing characters.
+ * This function only fails the parse on overflow. If there was no overflow,
+ * the decision of which characters count as invalid is deferred to the
+ * caller.
+ */
+static int strtoul_lenient(const char *cp, char **endp, unsigned int base,
+ unsigned long *res)
+{
+ unsigned long long result;
+ unsigned int rv;
+
+ cp = _parse_integer_fixup_radix(cp, &base);
+ rv = _parse_integer(cp, base, &result);
+ if ((rv & KSTRTOX_OVERFLOW) || (result != (unsigned long)result))
+ return -ERANGE;
+
+ cp += rv;
+
+ if (endp)
+ *endp = (char *)cp;
+
+ *res = (unsigned long)result;
+ return 0;
+}
+
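For illustration (not part of the patch), the contract above can be approximated in userspace with strtoull(); leading whitespace/sign handling differs slightly from the internal _parse_integer() helpers, but the overflow-only failure mode is the same.

/* Illustrative sketch of strtoul_lenient()'s contract using libc. */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int strtoul_lenient_demo(const char *cp, char **endp, int base,
				unsigned long *res)
{
	unsigned long long v;

	errno = 0;
	v = strtoull(cp, endp, base);
	/* Fail only on overflow; trailing junk is left for the caller. */
	if (errno == ERANGE || v != (unsigned long)v)
		return -ERANGE;
	*res = (unsigned long)v;
	return 0;
}

int main(void)
{
	unsigned long val;
	char *end;

	if (!strtoul_lenient_demo("123abc", &end, 0, &val))
		printf("val=%lu trailing=\"%s\"\n", val, end);	/* 123, "abc" */
	return 0;
}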
#define TMPBUFLEN 22
/**
* proc_get_long - reads an ASCII formatted integer from a user buffer
@@ -2146,7 +2186,8 @@ static int proc_get_long(char **buf, size_t *size,
if (!isdigit(*p))
return -EINVAL;
- *val = simple_strtoul(p, &p, 0);
+ if (strtoul_lenient(p, &p, 0, val))
+ return -EINVAL;
len = p - tmp;
@@ -2588,23 +2629,25 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
int *valp,
int write, void *data)
{
+ int tmp, ret;
struct do_proc_dointvec_minmax_conv_param *param = data;
+ /*
+ * If writing, first do so via a temporary local int so we can
+ * bounds-check it before touching *valp.
+ */
+ int *ip = write ? &tmp : valp;
+
+ ret = do_proc_dointvec_conv(negp, lvalp, ip, write, data);
+ if (ret)
+ return ret;
+
if (write) {
- int val = *negp ? -*lvalp : *lvalp;
- if ((param->min && *param->min > val) ||
- (param->max && *param->max < val))
+ if ((param->min && *param->min > tmp) ||
+ (param->max && *param->max < tmp))
return -EINVAL;
- *valp = val;
- } else {
- int val = *valp;
- if (val < 0) {
- *negp = true;
- *lvalp = -(unsigned long)val;
- } else {
- *negp = false;
- *lvalp = (unsigned long)val;
- }
+ *valp = tmp;
}
+
return 0;
}
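The conversion above follows a simple pattern worth spelling out: convert into a temporary, bounds-check the temporary, and only then publish it to the caller-visible location. A minimal standalone sketch of that pattern, with hypothetical names not taken from the patch:

#include <errno.h>

/* Validate a candidate against optional bounds before committing it. */
static int set_bounded_int(int *valp, int candidate, const int *min, const int *max)
{
	if ((min && *min > candidate) || (max && *max < candidate))
		return -EINVAL;
	*valp = candidate;	/* commit only after validation */
	return 0;
}

int main(void)
{
	int value = 0, lo = 1, hi = 100;

	return set_bounded_int(&value, 42, &lo, &hi);	/* value becomes 42 */
}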
@@ -2653,22 +2696,22 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp,
unsigned int *valp,
int write, void *data)
{
+ int ret;
+ unsigned int tmp;
struct do_proc_douintvec_minmax_conv_param *param = data;
+ /* write via temporary local uint for bounds-checking */
+ unsigned int *up = write ? &tmp : valp;
- if (write) {
- unsigned int val = *lvalp;
-
- if (*lvalp > UINT_MAX)
- return -EINVAL;
+ ret = do_proc_douintvec_conv(lvalp, up, write, data);
+ if (ret)
+ return ret;
- if ((param->min && *param->min > val) ||
- (param->max && *param->max < val))
+ if (write) {
+ if ((param->min && *param->min > tmp) ||
+ (param->max && *param->max < tmp))
return -ERANGE;
- *valp = val;
- } else {
- unsigned int val = *valp;
- *lvalp = (unsigned long) val;
+ *valp = tmp;
}
return 0;
@@ -2816,8 +2859,10 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
if (neg)
continue;
val = convmul * val / convdiv;
- if ((min && val < *min) || (max && val > *max))
- continue;
+ if ((min && val < *min) || (max && val > *max)) {
+ err = -EINVAL;
+ break;
+ }
*i = val;
} else {
val = convdiv * (*i) / convmul;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8cecbe309e30..0c85fb6e2737 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -918,6 +918,16 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task)
* CONTEXT:
* spin_lock_irq(rq->lock)
*
+ * This function is called during schedule() when a kworker is going
+ * to sleep. It's used by psi to identify aggregation workers during
+ * dequeuing, to allow periodic aggregation to shut off when that
+ * worker is the last task in the system or cgroup to go to sleep.
+ *
+ * As this function doesn't involve any workqueue-related locking, it
+ * only returns stable values when called from inside the scheduler's
+ * queuing and dequeuing paths, when @task, which must be a kworker,
+ * is guaranteed to not be processing any work items.
+ *
* Return:
* The last work function %current executed as a worker, NULL if it
* hasn't executed any work yet.
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 03d99dfb7373..4c9256e94eec 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -239,7 +239,6 @@ config ENABLE_MUST_CHECK
config FRAME_WARN
int "Warn for stack frames larger than (needs gcc 4.4)"
range 0 8192
- default 3072 if KASAN_EXTRA
default 2048 if GCC_PLUGIN_LATENT_ENTROPY
default 1280 if (!64BIT && PARISC)
default 1024 if (!64BIT && !PARISC)
@@ -283,23 +282,6 @@ config UNUSED_SYMBOLS
you really need it, and what the merge plan to the mainline kernel for
your module is.
-config PAGE_OWNER
- bool "Track page owner"
- depends on DEBUG_KERNEL && STACKTRACE_SUPPORT
- select DEBUG_FS
- select STACKTRACE
- select STACKDEPOT
- select PAGE_EXTENSION
- help
- This keeps track of what call chain is the owner of a page, may
- help to find bare alloc_page(s) leaks. Even if you include this
- feature on your build, it is disabled in default. You should pass
- "page_owner=on" to boot parameter in order to enable it. Eats
- a fair amount of memory if enabled. See tools/vm/page_owner_sort.c
- for user-space helper.
-
- If unsure, say N.
-
config DEBUG_FS
bool "Debug Filesystem"
help
@@ -1856,6 +1838,18 @@ config TEST_LKM
If unsure, say N.
+config TEST_VMALLOC
+ tristate "Test module for stress/performance analysis of vmalloc allocator"
+ default n
+ depends on m
+ help
+ This builds the "test_vmalloc" module that should be used for
+ stress and performance analysis. Any new change to the vmalloc
+ subsystem can then be evaluated from a performance and stability
+ point of view.
+
+ If unsure, say N.
+
config TEST_USER_COPY
tristate "Test user/kernel boundary protections"
depends on m
@@ -2080,6 +2074,12 @@ config IO_STRICT_DEVMEM
If in doubt, say Y.
+config DEBUG_AID_FOR_SYZBOT
+ bool "Additional debug code for syzbot"
+ default n
+ help
+ This option is intended for testing by syzbot.
+
source "arch/$(SRCARCH)/Kconfig.debug"
endmenu # Kernel hacking
diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan
index d8c474b6691e..67d7d1309c52 100644
--- a/lib/Kconfig.kasan
+++ b/lib/Kconfig.kasan
@@ -78,16 +78,6 @@ config KASAN_SW_TAGS
endchoice
-config KASAN_EXTRA
- bool "KASAN: extra checks"
- depends on KASAN_GENERIC && DEBUG_KERNEL && !COMPILE_TEST
- help
- This enables further checks in generic KASAN, for now it only
- includes the address-use-after-scope check that can lead to
- excessive kernel stack usage, frame size warnings and longer
- compile time.
- See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81715
-
choice
prompt "Instrumentation type"
depends on KASAN
diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan
index 98fa559ebd80..a2ae4a8e4fa6 100644
--- a/lib/Kconfig.ubsan
+++ b/lib/Kconfig.ubsan
@@ -27,15 +27,19 @@ config UBSAN_SANITIZE_ALL
Enabling this option will get kernel image size increased
significantly.
-config UBSAN_ALIGNMENT
- bool "Enable checking of pointers alignment"
+config UBSAN_NO_ALIGNMENT
+ bool "Disable checking of pointers alignment"
depends on UBSAN
- default y if !HAVE_EFFICIENT_UNALIGNED_ACCESS
+ default y if HAVE_EFFICIENT_UNALIGNED_ACCESS
help
- This option enables detection of unaligned memory accesses.
- Enabling this option on architectures that support unaligned
+ This option disables the check of unaligned memory accesses.
+ This option should be used when building allmodconfig.
+ Disabling this option on architectures that support unaligned
accesses may produce a lot of false positives.
+config UBSAN_ALIGNMENT
+ def_bool !UBSAN_NO_ALIGNMENT
+
config TEST_UBSAN
tristate "Module for testing for undefined behavior detection"
depends on m && UBSAN
diff --git a/lib/Makefile b/lib/Makefile
index 2649faa7d404..647517940b29 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -60,6 +60,7 @@ UBSAN_SANITIZE_test_ubsan.o := y
obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o
obj-$(CONFIG_TEST_LIST_SORT) += test_list_sort.o
obj-$(CONFIG_TEST_LKM) += test_module.o
+obj-$(CONFIG_TEST_VMALLOC) += test_vmalloc.o
obj-$(CONFIG_TEST_OVERFLOW) += test_overflow.o
obj-$(CONFIG_TEST_RHASHTABLE) += test_rhashtable.o
obj-$(CONFIG_TEST_SORT) += test_sort.o
diff --git a/lib/assoc_array.c b/lib/assoc_array.c
index c6659cb37033..0b61d3b77e5c 100644
--- a/lib/assoc_array.c
+++ b/lib/assoc_array.c
@@ -1115,6 +1115,7 @@ struct assoc_array_edit *assoc_array_delete(struct assoc_array *array,
index_key))
goto found_leaf;
}
+ /* fall through */
case assoc_array_walk_tree_empty:
case assoc_array_walk_found_wrong_shortcut:
default:
diff --git a/lib/cpumask.c b/lib/cpumask.c
index 8d666ab84b5c..087a3e9a0202 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -5,6 +5,7 @@
#include <linux/cpumask.h>
#include <linux/export.h>
#include <linux/memblock.h>
+#include <linux/numa.h>
/**
* cpumask_next - get the next cpu in a cpumask
@@ -206,7 +207,7 @@ unsigned int cpumask_local_spread(unsigned int i, int node)
/* Wrap: we always want a cpu. */
i %= num_online_cpus();
- if (node == -1) {
+ if (node == NUMA_NO_NODE) {
for_each_cpu(cpu, cpu_online_mask)
if (i-- == 0)
return cpu;
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index 55437fd5128b..1546a3c00709 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -375,6 +375,7 @@ __debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack)
struct debug_bucket *db;
struct debug_obj *obj;
unsigned long flags;
+ bool check_object_on_stack = false;
fill_pool();
@@ -391,7 +392,7 @@ __debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack)
debug_objects_oom();
return;
}
- debug_object_is_on_stack(addr, onstack);
+ check_object_on_stack = true;
}
switch (obj->state) {
@@ -402,20 +403,24 @@ __debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack)
break;
case ODEBUG_STATE_ACTIVE:
- debug_print_object(obj, "init");
state = obj->state;
raw_spin_unlock_irqrestore(&db->lock, flags);
+ debug_print_object(obj, "init");
debug_object_fixup(descr->fixup_init, addr, state);
return;
case ODEBUG_STATE_DESTROYED:
+ raw_spin_unlock_irqrestore(&db->lock, flags);
debug_print_object(obj, "init");
- break;
+ return;
default:
break;
}
raw_spin_unlock_irqrestore(&db->lock, flags);
+ if (check_object_on_stack)
+ debug_object_is_on_stack(addr, onstack);
+
}
/**
@@ -473,33 +478,34 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr)
obj = lookup_object(addr, db);
if (obj) {
+ ret = 0;
switch (obj->state) {
case ODEBUG_STATE_INIT:
case ODEBUG_STATE_INACTIVE:
obj->state = ODEBUG_STATE_ACTIVE;
- ret = 0;
break;
case ODEBUG_STATE_ACTIVE:
- debug_print_object(obj, "activate");
state = obj->state;
raw_spin_unlock_irqrestore(&db->lock, flags);
+ debug_print_object(obj, "activate");
ret = debug_object_fixup(descr->fixup_activate, addr, state);
return ret ? 0 : -EINVAL;
case ODEBUG_STATE_DESTROYED:
- debug_print_object(obj, "activate");
ret = -EINVAL;
break;
default:
- ret = 0;
break;
}
raw_spin_unlock_irqrestore(&db->lock, flags);
+ if (ret)
+ debug_print_object(obj, "activate");
return ret;
}
raw_spin_unlock_irqrestore(&db->lock, flags);
+
/*
* We are here when a static object is activated. We
* let the type specific code confirm whether this is
@@ -548,24 +554,30 @@ void debug_object_deactivate(void *addr, struct debug_obj_descr *descr)
if (!obj->astate)
obj->state = ODEBUG_STATE_INACTIVE;
else
- debug_print_object(obj, "deactivate");
+ goto out_unlock_print;
break;
case ODEBUG_STATE_DESTROYED:
- debug_print_object(obj, "deactivate");
- break;
+ goto out_unlock_print;
+
default:
break;
}
- } else {
+ }
+
+ raw_spin_unlock_irqrestore(&db->lock, flags);
+ if (!obj) {
struct debug_obj o = { .object = addr,
.state = ODEBUG_STATE_NOTAVAILABLE,
.descr = descr };
debug_print_object(&o, "deactivate");
}
+ return;
+out_unlock_print:
raw_spin_unlock_irqrestore(&db->lock, flags);
+ debug_print_object(obj, "deactivate");
}
EXPORT_SYMBOL_GPL(debug_object_deactivate);
@@ -599,15 +611,16 @@ void debug_object_destroy(void *addr, struct debug_obj_descr *descr)
obj->state = ODEBUG_STATE_DESTROYED;
break;
case ODEBUG_STATE_ACTIVE:
- debug_print_object(obj, "destroy");
state = obj->state;
raw_spin_unlock_irqrestore(&db->lock, flags);
+ debug_print_object(obj, "destroy");
debug_object_fixup(descr->fixup_destroy, addr, state);
return;
case ODEBUG_STATE_DESTROYED:
+ raw_spin_unlock_irqrestore(&db->lock, flags);
debug_print_object(obj, "destroy");
- break;
+ return;
default:
break;
}
@@ -641,9 +654,9 @@ void debug_object_free(void *addr, struct debug_obj_descr *descr)
switch (obj->state) {
case ODEBUG_STATE_ACTIVE:
- debug_print_object(obj, "free");
state = obj->state;
raw_spin_unlock_irqrestore(&db->lock, flags);
+ debug_print_object(obj, "free");
debug_object_fixup(descr->fixup_free, addr, state);
return;
default:
@@ -731,22 +744,27 @@ debug_object_active_state(void *addr, struct debug_obj_descr *descr,
if (obj->astate == expect)
obj->astate = next;
else
- debug_print_object(obj, "active_state");
+ goto out_unlock_print;
break;
default:
- debug_print_object(obj, "active_state");
- break;
+ goto out_unlock_print;
}
- } else {
+ }
+
+ raw_spin_unlock_irqrestore(&db->lock, flags);
+ if (!obj) {
struct debug_obj o = { .object = addr,
.state = ODEBUG_STATE_NOTAVAILABLE,
.descr = descr };
debug_print_object(&o, "active_state");
}
+ return;
+out_unlock_print:
raw_spin_unlock_irqrestore(&db->lock, flags);
+ debug_print_object(obj, "active_state");
}
EXPORT_SYMBOL_GPL(debug_object_active_state);
@@ -782,10 +800,10 @@ repeat:
switch (obj->state) {
case ODEBUG_STATE_ACTIVE:
- debug_print_object(obj, "free");
descr = obj->descr;
state = obj->state;
raw_spin_unlock_irqrestore(&db->lock, flags);
+ debug_print_object(obj, "free");
debug_object_fixup(descr->fixup_free,
(void *) oaddr, state);
goto repeat;
@@ -978,19 +996,24 @@ check_results(void *addr, enum debug_obj_state state, int fixups, int warnings)
struct debug_obj *obj;
unsigned long flags;
int res = -EINVAL;
+ enum debug_obj_state obj_state;
db = get_bucket((unsigned long) addr);
raw_spin_lock_irqsave(&db->lock, flags);
obj = lookup_object(addr, db);
+ obj_state = obj ? obj->state : state;
+
+ raw_spin_unlock_irqrestore(&db->lock, flags);
+
if (!obj && state != ODEBUG_STATE_NONE) {
WARN(1, KERN_ERR "ODEBUG: selftest object not found\n");
goto out;
}
- if (obj && obj->state != state) {
+ if (obj_state != state) {
WARN(1, KERN_ERR "ODEBUG: selftest wrong state: %d != %d\n",
- obj->state, state);
+ obj_state, state);
goto out;
}
if (fixups != debug_objects_fixups) {
@@ -1005,7 +1028,6 @@ check_results(void *addr, enum debug_obj_state state, int fixups, int warnings)
}
res = 0;
out:
- raw_spin_unlock_irqrestore(&db->lock, flags);
if (res)
debug_objects_enabled = 0;
return res;
diff --git a/lib/div64.c b/lib/div64.c
index 01c8602bb6ff..ee146bb4c558 100644
--- a/lib/div64.c
+++ b/lib/div64.c
@@ -109,7 +109,7 @@ u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
quot = div_u64_rem(dividend, divisor, &rem32);
*remainder = rem32;
} else {
- int n = 1 + fls(high);
+ int n = fls(high);
quot = div_u64(dividend >> n, divisor >> n);
if (quot != 0)
@@ -147,7 +147,7 @@ u64 div64_u64(u64 dividend, u64 divisor)
if (high == 0) {
quot = div_u64(dividend, divisor);
} else {
- int n = 1 + fls(high);
+ int n = fls(high);
quot = div_u64(dividend >> n, divisor >> n);
if (quot != 0)
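The reasoning behind dropping the "1 +" can be stated directly: high is the upper 32 bits of the divisor, and fls(high) is the 1-based index of its highest set bit, so divisor >> fls(high) is already guaranteed to fit in 32 bits; the extra shift only discarded one more bit of precision from the scaled operands. A small standalone check of that invariant (fls32() is a stand-in for the kernel's fls(), not part of the patch):

#include <assert.h>
#include <stdint.h>

static int fls32(uint32_t x)		/* 1-based, like the kernel's fls() */
{
	return x ? 32 - __builtin_clz(x) : 0;
}

int main(void)
{
	uint64_t divisor = 0x100000000ULL;	/* high word = 1 */
	uint32_t high = divisor >> 32;
	int n = fls32(high);			/* 1 here; the old code used 2 */

	assert(high != 0);
	assert((divisor >> n) <= 0xffffffffULL);
	return 0;
}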
diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c
index dbf2b457e47e..7bdf98c37e91 100644
--- a/lib/dynamic_debug.c
+++ b/lib/dynamic_debug.c
@@ -847,17 +847,19 @@ int ddebug_add_module(struct _ddebug *tab, unsigned int n,
const char *name)
{
struct ddebug_table *dt;
- const char *new_name;
dt = kzalloc(sizeof(*dt), GFP_KERNEL);
- if (dt == NULL)
- return -ENOMEM;
- new_name = kstrdup_const(name, GFP_KERNEL);
- if (new_name == NULL) {
- kfree(dt);
+ if (dt == NULL) {
+ pr_err("error adding module: %s\n", name);
return -ENOMEM;
}
- dt->mod_name = new_name;
+ /*
+ * For built-in modules, name lives in .rodata and is
+ * immortal. For loaded modules, name points at the name[]
+ * member of struct module, which lives at least as long as
+ * this struct ddebug_table.
+ */
+ dt->mod_name = name;
dt->num_ddebugs = n;
dt->ddebugs = tab;
@@ -868,7 +870,6 @@ int ddebug_add_module(struct _ddebug *tab, unsigned int n,
vpr_info("%u debug prints in module %s\n", n, dt->mod_name);
return 0;
}
-EXPORT_SYMBOL_GPL(ddebug_add_module);
/* helper for ddebug_dyndbg_(boot|module)_param_cb */
static int ddebug_dyndbg_param_cb(char *param, char *val,
@@ -913,7 +914,6 @@ int ddebug_dyndbg_module_param_cb(char *param, char *val, const char *module)
static void ddebug_table_free(struct ddebug_table *dt)
{
list_del_init(&dt->link);
- kfree_const(dt->mod_name);
kfree(dt);
}
@@ -930,15 +930,15 @@ int ddebug_remove_module(const char *mod_name)
mutex_lock(&ddebug_lock);
list_for_each_entry_safe(dt, nextdt, &ddebug_tables, link) {
- if (!strcmp(dt->mod_name, mod_name)) {
+ if (dt->mod_name == mod_name) {
ddebug_table_free(dt);
ret = 0;
+ break;
}
}
mutex_unlock(&ddebug_lock);
return ret;
}
-EXPORT_SYMBOL_GPL(ddebug_remove_module);
static void ddebug_remove_all_tables(void)
{
diff --git a/lib/genalloc.c b/lib/genalloc.c
index 7e85d1e37a6e..2e45b3a94313 100644
--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@@ -424,7 +424,7 @@ void gen_pool_for_each_chunk(struct gen_pool *pool,
EXPORT_SYMBOL(gen_pool_for_each_chunk);
/**
- * addr_in_gen_pool - checks if an address falls within the range of a pool
+ * gen_pool_has_addr - checks if an address falls within the range of a pool
* @pool: the generic memory pool
* @start: start address
* @size: size of the region
@@ -432,7 +432,7 @@ EXPORT_SYMBOL(gen_pool_for_each_chunk);
* Check if the range of addresses falls within the specified pool. Returns
* true if the entire range is contained in the pool and false otherwise.
*/
-bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start,
+bool gen_pool_has_addr(struct gen_pool *pool, unsigned long start,
size_t size)
{
bool found = false;
@@ -451,6 +451,7 @@ bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start,
rcu_read_unlock();
return found;
}
+EXPORT_SYMBOL(gen_pool_has_addr);
/**
* gen_pool_avail - get available free space of the pool
diff --git a/lib/lzo/lzo1x_compress.c b/lib/lzo/lzo1x_compress.c
index 236eb21167b5..4525fb094844 100644
--- a/lib/lzo/lzo1x_compress.c
+++ b/lib/lzo/lzo1x_compress.c
@@ -20,7 +20,8 @@
static noinline size_t
lzo1x_1_do_compress(const unsigned char *in, size_t in_len,
unsigned char *out, size_t *out_len,
- size_t ti, void *wrkmem)
+ size_t ti, void *wrkmem, signed char *state_offset,
+ const unsigned char bitstream_version)
{
const unsigned char *ip;
unsigned char *op;
@@ -35,27 +36,85 @@ lzo1x_1_do_compress(const unsigned char *in, size_t in_len,
ip += ti < 4 ? 4 - ti : 0;
for (;;) {
- const unsigned char *m_pos;
+ const unsigned char *m_pos = NULL;
size_t t, m_len, m_off;
u32 dv;
+ u32 run_length = 0;
literal:
ip += 1 + ((ip - ii) >> 5);
next:
if (unlikely(ip >= ip_end))
break;
dv = get_unaligned_le32(ip);
- t = ((dv * 0x1824429d) >> (32 - D_BITS)) & D_MASK;
- m_pos = in + dict[t];
- dict[t] = (lzo_dict_t) (ip - in);
- if (unlikely(dv != get_unaligned_le32(m_pos)))
- goto literal;
+
+ if (dv == 0 && bitstream_version) {
+ const unsigned char *ir = ip + 4;
+ const unsigned char *limit = ip_end
+ < (ip + MAX_ZERO_RUN_LENGTH + 1)
+ ? ip_end : ip + MAX_ZERO_RUN_LENGTH + 1;
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && \
+ defined(LZO_FAST_64BIT_MEMORY_ACCESS)
+ u64 dv64;
+
+ for (; (ir + 32) <= limit; ir += 32) {
+ dv64 = get_unaligned((u64 *)ir);
+ dv64 |= get_unaligned((u64 *)ir + 1);
+ dv64 |= get_unaligned((u64 *)ir + 2);
+ dv64 |= get_unaligned((u64 *)ir + 3);
+ if (dv64)
+ break;
+ }
+ for (; (ir + 8) <= limit; ir += 8) {
+ dv64 = get_unaligned((u64 *)ir);
+ if (dv64) {
+# if defined(__LITTLE_ENDIAN)
+ ir += __builtin_ctzll(dv64) >> 3;
+# elif defined(__BIG_ENDIAN)
+ ir += __builtin_clzll(dv64) >> 3;
+# else
+# error "missing endian definition"
+# endif
+ break;
+ }
+ }
+#else
+ while ((ir < (const unsigned char *)
+ ALIGN((uintptr_t)ir, 4)) &&
+ (ir < limit) && (*ir == 0))
+ ir++;
+ for (; (ir + 4) <= limit; ir += 4) {
+ dv = *((u32 *)ir);
+ if (dv) {
+# if defined(__LITTLE_ENDIAN)
+ ir += __builtin_ctz(dv) >> 3;
+# elif defined(__BIG_ENDIAN)
+ ir += __builtin_clz(dv) >> 3;
+# else
+# error "missing endian definition"
+# endif
+ break;
+ }
+ }
+#endif
+ while (likely(ir < limit) && unlikely(*ir == 0))
+ ir++;
+ run_length = ir - ip;
+ if (run_length > MAX_ZERO_RUN_LENGTH)
+ run_length = MAX_ZERO_RUN_LENGTH;
+ } else {
+ t = ((dv * 0x1824429d) >> (32 - D_BITS)) & D_MASK;
+ m_pos = in + dict[t];
+ dict[t] = (lzo_dict_t) (ip - in);
+ if (unlikely(dv != get_unaligned_le32(m_pos)))
+ goto literal;
+ }
ii -= ti;
ti = 0;
t = ip - ii;
if (t != 0) {
if (t <= 3) {
- op[-2] |= t;
+ op[*state_offset] |= t;
COPY4(op, ii);
op += t;
} else if (t <= 16) {
@@ -88,6 +147,17 @@ next:
}
}
+ if (unlikely(run_length)) {
+ ip += run_length;
+ run_length -= MIN_ZERO_RUN_LENGTH;
+ put_unaligned_le32((run_length << 21) | 0xfffc18
+ | (run_length & 0x7), op);
+ op += 4;
+ run_length = 0;
+ *state_offset = -3;
+ goto finished_writing_instruction;
+ }
+
m_len = 4;
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && defined(LZO_USE_CTZ64)
@@ -170,7 +240,6 @@ m_len_done:
m_off = ip - m_pos;
ip += m_len;
- ii = ip;
if (m_len <= M2_MAX_LEN && m_off <= M2_MAX_OFFSET) {
m_off -= 1;
*op++ = (((m_len - 1) << 5) | ((m_off & 7) << 2));
@@ -207,29 +276,45 @@ m_len_done:
*op++ = (m_off << 2);
*op++ = (m_off >> 6);
}
+ *state_offset = -2;
+finished_writing_instruction:
+ ii = ip;
goto next;
}
*out_len = op - out;
return in_end - (ii - ti);
}
-int lzo1x_1_compress(const unsigned char *in, size_t in_len,
+int lzogeneric1x_1_compress(const unsigned char *in, size_t in_len,
unsigned char *out, size_t *out_len,
- void *wrkmem)
+ void *wrkmem, const unsigned char bitstream_version)
{
const unsigned char *ip = in;
unsigned char *op = out;
size_t l = in_len;
size_t t = 0;
+ signed char state_offset = -2;
+ unsigned int m4_max_offset;
+
+ // LZO v0 will never write 17 as the first byte,
+ // so this is used to version the bitstream
+ if (bitstream_version > 0) {
+ *op++ = 17;
+ *op++ = bitstream_version;
+ m4_max_offset = M4_MAX_OFFSET_V1;
+ } else {
+ m4_max_offset = M4_MAX_OFFSET_V0;
+ }
while (l > 20) {
- size_t ll = l <= (M4_MAX_OFFSET + 1) ? l : (M4_MAX_OFFSET + 1);
+ size_t ll = l <= (m4_max_offset + 1) ? l : (m4_max_offset + 1);
uintptr_t ll_end = (uintptr_t) ip + ll;
if ((ll_end + ((t + ll) >> 5)) <= ll_end)
break;
BUILD_BUG_ON(D_SIZE * sizeof(lzo_dict_t) > LZO1X_1_MEM_COMPRESS);
memset(wrkmem, 0, D_SIZE * sizeof(lzo_dict_t));
- t = lzo1x_1_do_compress(ip, ll, op, out_len, t, wrkmem);
+ t = lzo1x_1_do_compress(ip, ll, op, out_len, t, wrkmem,
+ &state_offset, bitstream_version);
ip += ll;
op += *out_len;
l -= ll;
@@ -242,7 +327,7 @@ int lzo1x_1_compress(const unsigned char *in, size_t in_len,
if (op == out && t <= 238) {
*op++ = (17 + t);
} else if (t <= 3) {
- op[-2] |= t;
+ op[state_offset] |= t;
} else if (t <= 18) {
*op++ = (t - 3);
} else {
@@ -273,7 +358,24 @@ int lzo1x_1_compress(const unsigned char *in, size_t in_len,
*out_len = op - out;
return LZO_E_OK;
}
+
+int lzo1x_1_compress(const unsigned char *in, size_t in_len,
+ unsigned char *out, size_t *out_len,
+ void *wrkmem)
+{
+ return lzogeneric1x_1_compress(in, in_len, out, out_len, wrkmem, 0);
+}
+
+int lzorle1x_1_compress(const unsigned char *in, size_t in_len,
+ unsigned char *out, size_t *out_len,
+ void *wrkmem)
+{
+ return lzogeneric1x_1_compress(in, in_len, out, out_len,
+ wrkmem, LZO_VERSION);
+}
+
EXPORT_SYMBOL_GPL(lzo1x_1_compress);
+EXPORT_SYMBOL_GPL(lzorle1x_1_compress);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("LZO1X-1 Compressor");
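For illustration (not part of the patch), the word-at-a-time zero scan above relies on ctz/clz giving the byte index of the first nonzero byte inside a loaded word. A tiny standalone check of the little-endian case (the __LITTLE_ENDIAN branch above); it assumes a little-endian host:

#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
	uint8_t buf[8] = { 0, 0, 0, 0x5a, 0, 0, 0, 0 };
	uint64_t v;

	memcpy(&v, buf, sizeof(v));		/* emulate the unaligned load */
	assert(v != 0);
	/* On a little-endian host, ctz/8 is the offset of the first nonzero byte. */
	assert((__builtin_ctzll(v) >> 3) == 3);
	return 0;
}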
diff --git a/lib/lzo/lzo1x_decompress_safe.c b/lib/lzo/lzo1x_decompress_safe.c
index a1c387f6afba..6d2600ea3b55 100644
--- a/lib/lzo/lzo1x_decompress_safe.c
+++ b/lib/lzo/lzo1x_decompress_safe.c
@@ -46,11 +46,23 @@ int lzo1x_decompress_safe(const unsigned char *in, size_t in_len,
const unsigned char * const ip_end = in + in_len;
unsigned char * const op_end = out + *out_len;
+ unsigned char bitstream_version;
+
op = out;
ip = in;
if (unlikely(in_len < 3))
goto input_overrun;
+
+ if (likely(*ip == 17)) {
+ bitstream_version = ip[1];
+ ip += 2;
+ if (unlikely(in_len < 5))
+ goto input_overrun;
+ } else {
+ bitstream_version = 0;
+ }
+
if (*ip > 17) {
t = *ip++ - 17;
if (t < 4) {
@@ -154,32 +166,49 @@ copy_literal_run:
m_pos -= next >> 2;
next &= 3;
} else {
- m_pos = op;
- m_pos -= (t & 8) << 11;
- t = (t & 7) + (3 - 1);
- if (unlikely(t == 2)) {
- size_t offset;
- const unsigned char *ip_last = ip;
+ NEED_IP(2);
+ next = get_unaligned_le16(ip);
+ if (((next & 0xfffc) == 0xfffc) &&
+ ((t & 0xf8) == 0x18) &&
+ likely(bitstream_version)) {
+ NEED_IP(3);
+ t &= 7;
+ t |= ip[2] << 3;
+ t += MIN_ZERO_RUN_LENGTH;
+ NEED_OP(t);
+ memset(op, 0, t);
+ op += t;
+ next &= 3;
+ ip += 3;
+ goto match_next;
+ } else {
+ m_pos = op;
+ m_pos -= (t & 8) << 11;
+ t = (t & 7) + (3 - 1);
+ if (unlikely(t == 2)) {
+ size_t offset;
+ const unsigned char *ip_last = ip;
- while (unlikely(*ip == 0)) {
- ip++;
- NEED_IP(1);
- }
- offset = ip - ip_last;
- if (unlikely(offset > MAX_255_COUNT))
- return LZO_E_ERROR;
+ while (unlikely(*ip == 0)) {
+ ip++;
+ NEED_IP(1);
+ }
+ offset = ip - ip_last;
+ if (unlikely(offset > MAX_255_COUNT))
+ return LZO_E_ERROR;
- offset = (offset << 8) - offset;
- t += offset + 7 + *ip++;
- NEED_IP(2);
+ offset = (offset << 8) - offset;
+ t += offset + 7 + *ip++;
+ NEED_IP(2);
+ next = get_unaligned_le16(ip);
+ }
+ ip += 2;
+ m_pos -= next >> 2;
+ next &= 3;
+ if (m_pos == op)
+ goto eof_found;
+ m_pos -= 0x4000;
}
- next = get_unaligned_le16(ip);
- ip += 2;
- m_pos -= next >> 2;
- next &= 3;
- if (m_pos == op)
- goto eof_found;
- m_pos -= 0x4000;
}
TEST_LB(m_pos);
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
diff --git a/lib/lzo/lzodefs.h b/lib/lzo/lzodefs.h
index 4edefd2f540c..b60851fcf6ce 100644
--- a/lib/lzo/lzodefs.h
+++ b/lib/lzo/lzodefs.h
@@ -13,9 +13,15 @@
*/
+/* Version
+ * 0: original lzo version
+ * 1: lzo with support for RLE
+ */
+#define LZO_VERSION 1
+
#define COPY4(dst, src) \
put_unaligned(get_unaligned((const u32 *)(src)), (u32 *)(dst))
-#if defined(__x86_64__)
+#if defined(CONFIG_X86_64) || defined(CONFIG_ARM64)
#define COPY8(dst, src) \
put_unaligned(get_unaligned((const u64 *)(src)), (u64 *)(dst))
#else
@@ -25,19 +31,21 @@
#if defined(__BIG_ENDIAN) && defined(__LITTLE_ENDIAN)
#error "conflicting endian definitions"
-#elif defined(__x86_64__)
+#elif defined(CONFIG_X86_64) || defined(CONFIG_ARM64)
#define LZO_USE_CTZ64 1
#define LZO_USE_CTZ32 1
-#elif defined(__i386__) || defined(__powerpc__)
+#define LZO_FAST_64BIT_MEMORY_ACCESS
+#elif defined(CONFIG_X86) || defined(CONFIG_PPC)
#define LZO_USE_CTZ32 1
-#elif defined(__arm__) && (__LINUX_ARM_ARCH__ >= 5)
+#elif defined(CONFIG_ARM) && (__LINUX_ARM_ARCH__ >= 5)
#define LZO_USE_CTZ32 1
#endif
#define M1_MAX_OFFSET 0x0400
#define M2_MAX_OFFSET 0x0800
#define M3_MAX_OFFSET 0x4000
-#define M4_MAX_OFFSET 0xbfff
+#define M4_MAX_OFFSET_V0 0xbfff
+#define M4_MAX_OFFSET_V1 0xbffe
#define M1_MIN_LEN 2
#define M1_MAX_LEN 2
@@ -53,6 +61,9 @@
#define M3_MARKER 32
#define M4_MARKER 16
+#define MIN_ZERO_RUN_LENGTH 4
+#define MAX_ZERO_RUN_LENGTH (2047 + MIN_ZERO_RUN_LENGTH)
+
#define lzo_dict_t unsigned short
#define D_BITS 13
#define D_SIZE (1u << D_BITS)
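Reading the encoder and decoder hunks above together, the new RLE instruction for bitstream version 1 appears to be a fixed 4-byte sequence. The sketch below is derived from those hunks and is illustrative only: with R = zero_run_length - MIN_ZERO_RUN_LENGTH (0..2047), byte 0 is 0x18 | (R & 7), bytes 1-2 are the 0xfffc marker, and byte 3 holds R >> 3.

#include <assert.h>
#include <stdint.h>

#define MIN_ZERO_RUN_LENGTH 4

static void encode_zero_run(uint8_t *op, unsigned int zero_run_len)
{
	unsigned int r = zero_run_len - MIN_ZERO_RUN_LENGTH;

	op[0] = 0x18 | (r & 0x7);	/* marker bits + low 3 bits of R */
	op[1] = 0xfc;			/* 0xfffc marker, little-endian */
	op[2] = 0xff;
	op[3] = r >> 3;			/* high 8 bits of R */
}

static unsigned int decode_zero_run(const uint8_t *ip)
{
	return ((ip[0] & 0x7) | (ip[3] << 3)) + MIN_ZERO_RUN_LENGTH;
}

int main(void)
{
	uint8_t insn[4];

	encode_zero_run(insn, 100);
	assert(decode_zero_run(insn) == 100);
	return 0;
}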
diff --git a/lib/test_kasan.c b/lib/test_kasan.c
index 51b78405bf24..7de2702621dc 100644
--- a/lib/test_kasan.c
+++ b/lib/test_kasan.c
@@ -480,29 +480,6 @@ static noinline void __init copy_user_test(void)
kfree(kmem);
}
-static noinline void __init use_after_scope_test(void)
-{
- volatile char *volatile p;
-
- pr_info("use-after-scope on int\n");
- {
- int local = 0;
-
- p = (char *)&local;
- }
- p[0] = 1;
- p[3] = 1;
-
- pr_info("use-after-scope on array\n");
- {
- char local[1024] = {0};
-
- p = local;
- }
- p[0] = 1;
- p[1023] = 1;
-}
-
static noinline void __init kasan_alloca_oob_left(void)
{
volatile int i = 10;
@@ -682,7 +659,6 @@ static int __init kmalloc_tests_init(void)
kasan_alloca_oob_right();
ksize_unpoisons_memory();
copy_user_test();
- use_after_scope_test();
kmem_cache_double_free();
kmem_cache_invalid_free();
kasan_memchr();
diff --git a/lib/test_ubsan.c b/lib/test_ubsan.c
index 280f4979d00e..9ea10adf7a66 100644
--- a/lib/test_ubsan.c
+++ b/lib/test_ubsan.c
@@ -42,14 +42,6 @@ static void test_ubsan_divrem_overflow(void)
val /= val2;
}
-static void test_ubsan_vla_bound_not_positive(void)
-{
- volatile int size = -1;
- char buf[size];
-
- (void)buf;
-}
-
static void test_ubsan_shift_out_of_bounds(void)
{
volatile int val = -1;
@@ -61,7 +53,7 @@ static void test_ubsan_shift_out_of_bounds(void)
static void test_ubsan_out_of_bounds(void)
{
volatile int i = 4, j = 5;
- volatile int arr[i];
+ volatile int arr[4];
arr[j] = i;
}
@@ -113,7 +105,6 @@ static const test_ubsan_fp test_ubsan_array[] = {
test_ubsan_mul_overflow,
test_ubsan_negate_overflow,
test_ubsan_divrem_overflow,
- test_ubsan_vla_bound_not_positive,
test_ubsan_shift_out_of_bounds,
test_ubsan_out_of_bounds,
test_ubsan_load_invalid_value,
diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c
new file mode 100644
index 000000000000..83cdcaa82bf6
--- /dev/null
+++ b/lib/test_vmalloc.c
@@ -0,0 +1,551 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Test module to stress and analyze the performance of the vmalloc allocator.
+ * (C) 2018 Uladzislau Rezki (Sony) <urezki@gmail.com>
+ */
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/random.h>
+#include <linux/kthread.h>
+#include <linux/moduleparam.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/rwsem.h>
+#include <linux/mm.h>
+
+#define __param(type, name, init, msg) \
+ static type name = init; \
+ module_param(name, type, 0444); \
+ MODULE_PARM_DESC(name, msg) \
+
+__param(bool, single_cpu_test, false,
+ "Use single first online CPU to run tests");
+
+__param(bool, sequential_test_order, false,
+ "Use sequential stress tests order");
+
+__param(int, test_repeat_count, 1,
+ "Set test repeat counter");
+
+__param(int, test_loop_count, 1000000,
+ "Set test loop counter");
+
+__param(int, run_test_mask, INT_MAX,
+ "Set tests specified in the mask.\n\n"
+ "\t\tid: 1, name: fix_size_alloc_test\n"
+ "\t\tid: 2, name: full_fit_alloc_test\n"
+ "\t\tid: 4, name: long_busy_list_alloc_test\n"
+ "\t\tid: 8, name: random_size_alloc_test\n"
+ "\t\tid: 16, name: fix_align_alloc_test\n"
+ "\t\tid: 32, name: random_size_align_alloc_test\n"
+ "\t\tid: 64, name: align_shift_alloc_test\n"
+ "\t\tid: 128, name: pcpu_alloc_test\n"
+ /* Add a new test case description here. */
+);
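Since the test ids above are bit values, a mask selecting several tests is just their OR. The snippet below only illustrates the arithmetic; the module and parameter names are taken from this file:

#include <stdio.h>

int main(void)
{
	/* fix_size_alloc_test (1) plus random_size_alloc_test (8) */
	unsigned int mask = 1 | 8;

	printf("insmod test_vmalloc.ko run_test_mask=%u\n", mask);	/* 9 */
	return 0;
}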
+
+/*
+ * Depends on the single_cpu_test parameter. If it is true, the
+ * tests are run only on the first online CPU, otherwise they run
+ * on all online CPUs.
+ */
+static cpumask_t cpus_run_test_mask = CPU_MASK_NONE;
+
+/*
+ * Read-write semaphore used to synchronize the setup phase,
+ * which is done in the main thread, with the worker threads.
+ */
+static DECLARE_RWSEM(prepare_for_test_rwsem);
+
+/*
+ * Completion tracking for worker threads.
+ */
+static DECLARE_COMPLETION(test_all_done_comp);
+static atomic_t test_n_undone = ATOMIC_INIT(0);
+
+static inline void
+test_report_one_done(void)
+{
+ if (atomic_dec_and_test(&test_n_undone))
+ complete(&test_all_done_comp);
+}
+
+static int random_size_align_alloc_test(void)
+{
+ unsigned long size, align, rnd;
+ void *ptr;
+ int i;
+
+ for (i = 0; i < test_loop_count; i++) {
+ get_random_bytes(&rnd, sizeof(rnd));
+
+ /*
+ * Maximum 1024 pages, if PAGE_SIZE is 4096.
+ */
+ align = 1 << (rnd % 23);
+
+ /*
+ * Maximum 10 pages.
+ */
+ size = ((rnd % 10) + 1) * PAGE_SIZE;
+
+ ptr = __vmalloc_node_range(size, align,
+ VMALLOC_START, VMALLOC_END,
+ GFP_KERNEL | __GFP_ZERO,
+ PAGE_KERNEL,
+ 0, 0, __builtin_return_address(0));
+
+ if (!ptr)
+ return -1;
+
+ vfree(ptr);
+ }
+
+ return 0;
+}
+
+/*
+ * This test case is supposed to fail.
+ */
+static int align_shift_alloc_test(void)
+{
+ unsigned long align;
+ void *ptr;
+ int i;
+
+ for (i = 0; i < BITS_PER_LONG; i++) {
+ align = ((unsigned long) 1) << i;
+
+ ptr = __vmalloc_node_range(PAGE_SIZE, align,
+ VMALLOC_START, VMALLOC_END,
+ GFP_KERNEL | __GFP_ZERO,
+ PAGE_KERNEL,
+ 0, 0, __builtin_return_address(0));
+
+ if (!ptr)
+ return -1;
+
+ vfree(ptr);
+ }
+
+ return 0;
+}
+
+static int fix_align_alloc_test(void)
+{
+ void *ptr;
+ int i;
+
+ for (i = 0; i < test_loop_count; i++) {
+ ptr = __vmalloc_node_range(5 * PAGE_SIZE,
+ THREAD_ALIGN << 1,
+ VMALLOC_START, VMALLOC_END,
+ GFP_KERNEL | __GFP_ZERO,
+ PAGE_KERNEL,
+ 0, 0, __builtin_return_address(0));
+
+ if (!ptr)
+ return -1;
+
+ vfree(ptr);
+ }
+
+ return 0;
+}
+
+static int random_size_alloc_test(void)
+{
+ unsigned int n;
+ void *p;
+ int i;
+
+ for (i = 0; i < test_loop_count; i++) {
+ get_random_bytes(&n, sizeof(n));
+ n = (n % 100) + 1;
+
+ p = vmalloc(n * PAGE_SIZE);
+
+ if (!p)
+ return -1;
+
+ *((__u8 *)p) = 1;
+ vfree(p);
+ }
+
+ return 0;
+}
+
+static int long_busy_list_alloc_test(void)
+{
+ void *ptr_1, *ptr_2;
+ void **ptr;
+ int rv = -1;
+ int i;
+
+ ptr = vmalloc(sizeof(void *) * 15000);
+ if (!ptr)
+ return rv;
+
+ for (i = 0; i < 15000; i++)
+ ptr[i] = vmalloc(1 * PAGE_SIZE);
+
+ for (i = 0; i < test_loop_count; i++) {
+ ptr_1 = vmalloc(100 * PAGE_SIZE);
+ if (!ptr_1)
+ goto leave;
+
+ ptr_2 = vmalloc(1 * PAGE_SIZE);
+ if (!ptr_2) {
+ vfree(ptr_1);
+ goto leave;
+ }
+
+ *((__u8 *)ptr_1) = 0;
+ *((__u8 *)ptr_2) = 1;
+
+ vfree(ptr_1);
+ vfree(ptr_2);
+ }
+
+ /* Success */
+ rv = 0;
+
+leave:
+ for (i = 0; i < 15000; i++)
+ vfree(ptr[i]);
+
+ vfree(ptr);
+ return rv;
+}
+
+static int full_fit_alloc_test(void)
+{
+ void **ptr, **junk_ptr, *tmp;
+ int junk_length;
+ int rv = -1;
+ int i;
+
+ junk_length = fls(num_online_cpus());
+ junk_length *= (32 * 1024 * 1024 / PAGE_SIZE);
+
+ ptr = vmalloc(sizeof(void *) * junk_length);
+ if (!ptr)
+ return rv;
+
+ junk_ptr = vmalloc(sizeof(void *) * junk_length);
+ if (!junk_ptr) {
+ vfree(ptr);
+ return rv;
+ }
+
+ for (i = 0; i < junk_length; i++) {
+ ptr[i] = vmalloc(1 * PAGE_SIZE);
+ junk_ptr[i] = vmalloc(1 * PAGE_SIZE);
+ }
+
+ for (i = 0; i < junk_length; i++)
+ vfree(junk_ptr[i]);
+
+ for (i = 0; i < test_loop_count; i++) {
+ tmp = vmalloc(1 * PAGE_SIZE);
+
+ if (!tmp)
+ goto error;
+
+ *((__u8 *)tmp) = 1;
+ vfree(tmp);
+ }
+
+ /* Success */
+ rv = 0;
+
+error:
+ for (i = 0; i < junk_length; i++)
+ vfree(ptr[i]);
+
+ vfree(ptr);
+ vfree(junk_ptr);
+
+ return rv;
+}
+
+static int fix_size_alloc_test(void)
+{
+ void *ptr;
+ int i;
+
+ for (i = 0; i < test_loop_count; i++) {
+ ptr = vmalloc(3 * PAGE_SIZE);
+
+ if (!ptr)
+ return -1;
+
+ *((__u8 *)ptr) = 0;
+
+ vfree(ptr);
+ }
+
+ return 0;
+}
+
+static int
+pcpu_alloc_test(void)
+{
+ int rv = 0;
+#ifndef CONFIG_NEED_PER_CPU_KM
+ void __percpu **pcpu;
+ size_t size, align;
+ int i;
+
+ pcpu = vmalloc(sizeof(void __percpu *) * 35000);
+ if (!pcpu)
+ return -1;
+
+ for (i = 0; i < 35000; i++) {
+ unsigned int r;
+
+ get_random_bytes(&r, sizeof(r));
+ size = (r % (PAGE_SIZE / 4)) + 1;
+
+ /*
+ * Maximum PAGE_SIZE
+ */
+ get_random_bytes(&r, sizeof(r));
+ align = 1 << ((r % 11) + 1);
+
+ pcpu[i] = __alloc_percpu(size, align);
+ if (!pcpu[i])
+ rv = -1;
+ }
+
+ for (i = 0; i < 35000; i++)
+ free_percpu(pcpu[i]);
+
+ vfree(pcpu);
+#endif
+ return rv;
+}
+
+struct test_case_desc {
+ const char *test_name;
+ int (*test_func)(void);
+};
+
+static struct test_case_desc test_case_array[] = {
+ { "fix_size_alloc_test", fix_size_alloc_test },
+ { "full_fit_alloc_test", full_fit_alloc_test },
+ { "long_busy_list_alloc_test", long_busy_list_alloc_test },
+ { "random_size_alloc_test", random_size_alloc_test },
+ { "fix_align_alloc_test", fix_align_alloc_test },
+ { "random_size_align_alloc_test", random_size_align_alloc_test },
+ { "align_shift_alloc_test", align_shift_alloc_test },
+ { "pcpu_alloc_test", pcpu_alloc_test },
+ /* Add a new test case here. */
+};
+
+struct test_case_data {
+ int test_failed;
+ int test_passed;
+ u64 time;
+};
+
+/* Split it to get rid of: WARNING: line over 80 characters */
+static struct test_case_data
+ per_cpu_test_data[NR_CPUS][ARRAY_SIZE(test_case_array)];
+
+static struct test_driver {
+ struct task_struct *task;
+ unsigned long start;
+ unsigned long stop;
+ int cpu;
+} per_cpu_test_driver[NR_CPUS];
+
+static void shuffle_array(int *arr, int n)
+{
+ unsigned int rnd;
+ int i, j, x;
+
+ for (i = n - 1; i > 0; i--) {
+ get_random_bytes(&rnd, sizeof(rnd));
+
+ /* Cut the range. */
+ j = rnd % i;
+
+ /* Swap indexes. */
+ x = arr[i];
+ arr[i] = arr[j];
+ arr[j] = x;
+ }
+}
+
+static int test_func(void *private)
+{
+ struct test_driver *t = private;
+ cpumask_t newmask = CPU_MASK_NONE;
+ int random_array[ARRAY_SIZE(test_case_array)];
+ int index, i, j, ret;
+ ktime_t kt;
+ u64 delta;
+
+ cpumask_set_cpu(t->cpu, &newmask);
+ set_cpus_allowed_ptr(current, &newmask);
+
+ for (i = 0; i < ARRAY_SIZE(test_case_array); i++)
+ random_array[i] = i;
+
+ if (!sequential_test_order)
+ shuffle_array(random_array, ARRAY_SIZE(test_case_array));
+
+ /*
+ * Block until initialization is done.
+ */
+ down_read(&prepare_for_test_rwsem);
+
+ t->start = get_cycles();
+ for (i = 0; i < ARRAY_SIZE(test_case_array); i++) {
+ index = random_array[i];
+
+ /*
+ * Skip tests if run_test_mask has been specified.
+ */
+ if (!((run_test_mask & (1 << index)) >> index))
+ continue;
+
+ kt = ktime_get();
+ for (j = 0; j < test_repeat_count; j++) {
+ ret = test_case_array[index].test_func();
+ if (!ret)
+ per_cpu_test_data[t->cpu][index].test_passed++;
+ else
+ per_cpu_test_data[t->cpu][index].test_failed++;
+ }
+
+ /*
+ * Compute the average time the test took.
+ */
+ delta = (u64) ktime_us_delta(ktime_get(), kt);
+ do_div(delta, (u32) test_repeat_count);
+
+ per_cpu_test_data[t->cpu][index].time = delta;
+ }
+ t->stop = get_cycles();
+
+ up_read(&prepare_for_test_rwsem);
+ test_report_one_done();
+
+ /*
+ * Wait for the kthread_stop() call.
+ */
+ while (!kthread_should_stop())
+ msleep(10);
+
+ return 0;
+}
+
+static void
+init_test_configuration(void)
+{
+ /*
+ * Reset all data of all CPUs.
+ */
+ memset(per_cpu_test_data, 0, sizeof(per_cpu_test_data));
+
+ if (single_cpu_test)
+ cpumask_set_cpu(cpumask_first(cpu_online_mask),
+ &cpus_run_test_mask);
+ else
+ cpumask_and(&cpus_run_test_mask, cpu_online_mask,
+ cpu_online_mask);
+
+ if (test_repeat_count <= 0)
+ test_repeat_count = 1;
+
+ if (test_loop_count <= 0)
+ test_loop_count = 1;
+}
+
+static void do_concurrent_test(void)
+{
+ int cpu, ret;
+
+ /*
+ * Set some basic configuration and do sanity checks.
+ */
+ init_test_configuration();
+
+ /*
+ * Put all workers on hold.
+ */
+ down_write(&prepare_for_test_rwsem);
+
+ for_each_cpu(cpu, &cpus_run_test_mask) {
+ struct test_driver *t = &per_cpu_test_driver[cpu];
+
+ t->cpu = cpu;
+ t->task = kthread_run(test_func, t, "vmalloc_test/%d", cpu);
+
+ if (!IS_ERR(t->task))
+ /* Success. */
+ atomic_inc(&test_n_undone);
+ else
+ pr_err("Failed to start kthread for %d CPU\n", cpu);
+ }
+
+ /*
+ * Now let the workers do their job.
+ */
+ up_write(&prepare_for_test_rwsem);
+
+ /*
+ * Sleep quietly until all workers are done, checking at a
+ * 1 second interval. Since the tests can take a lot of time,
+ * we could otherwise run into a hung-task stack trace. That is
+ * why we use wait_for_completion_timeout() with an HZ timeout.
+ */
+ do {
+ ret = wait_for_completion_timeout(&test_all_done_comp, HZ);
+ } while (!ret);
+
+ for_each_cpu(cpu, &cpus_run_test_mask) {
+ struct test_driver *t = &per_cpu_test_driver[cpu];
+ int i;
+
+ if (!IS_ERR(t->task))
+ kthread_stop(t->task);
+
+ for (i = 0; i < ARRAY_SIZE(test_case_array); i++) {
+ if (!((run_test_mask & (1 << i)) >> i))
+ continue;
+
+ pr_info(
+ "Summary: %s passed: %d failed: %d repeat: %d loops: %d avg: %llu usec\n",
+ test_case_array[i].test_name,
+ per_cpu_test_data[cpu][i].test_passed,
+ per_cpu_test_data[cpu][i].test_failed,
+ test_repeat_count, test_loop_count,
+ per_cpu_test_data[cpu][i].time);
+ }
+
+ pr_info("All test took CPU%d=%lu cycles\n",
+ cpu, t->stop - t->start);
+ }
+}
+
+static int vmalloc_test_init(void)
+{
+ do_concurrent_test();
+ return -EAGAIN; /* Failing here will directly unload the module */
+}
+
+static void vmalloc_test_exit(void)
+{
+}
+
+module_init(vmalloc_test_init)
+module_exit(vmalloc_test_exit)
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Uladzislau Rezki");
+MODULE_DESCRIPTION("vmalloc test module");
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 3add92329bae..30b00de4f321 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -17,6 +17,7 @@
*/
#include <stdarg.h>
+#include <linux/build_bug.h>
#include <linux/clk.h>
#include <linux/clk-provider.h>
#include <linux/module.h> /* for KSYM_SYMBOL_LEN */
@@ -405,6 +406,8 @@ struct printf_spec {
unsigned int base:8; /* number base, 8, 10 or 16 only */
signed int precision:16; /* # of digits/chars */
} __packed;
+static_assert(sizeof(struct printf_spec) == 8);
+
#define FIELD_WIDTH_MAX ((1 << 23) - 1)
#define PRECISION_MAX ((1 << 15) - 1)
@@ -422,8 +425,6 @@ char *number(char *buf, char *end, unsigned long long num,
int field_width = spec.field_width;
int precision = spec.precision;
- BUILD_BUG_ON(sizeof(struct printf_spec) != 8);
-
/* locase = 0 or 0x20. ORing digits or letters with 'locase'
* produces same digits or (maybe lowercased) letters */
locase = (spec.flags & SMALL);
diff --git a/mm/Kconfig b/mm/Kconfig
index 25c71eb8a7db..7e537ec7aea3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -695,6 +695,7 @@ config DEV_PAGEMAP_OPS
config HMM
bool
select MIGRATE_VMA_HELPER
+ select MMU_NOTIFIER
config HMM_MIRROR
bool "HMM mirror CPU page table into a device page table"
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 9a7b8b049d04..e3df921208c0 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -39,6 +39,23 @@ config DEBUG_PAGEALLOC_ENABLE_DEFAULT
Enable debug page memory allocations by default? This value
can be overridden by debug_pagealloc=off|on.
+config PAGE_OWNER
+ bool "Track page owner"
+ depends on DEBUG_KERNEL && STACKTRACE_SUPPORT
+ select DEBUG_FS
+ select STACKTRACE
+ select STACKDEPOT
+ select PAGE_EXTENSION
+ help
+ This keeps track of what call chain is the owner of a page, may
+ help to find bare alloc_page(s) leaks. Even if you include this
+ feature on your build, it is disabled in default. You should pass
+ "page_owner=on" to boot parameter in order to enable it. Eats
+ a fair amount of memory if enabled. See tools/vm/page_owner_sort.c
+ for user-space helper.
+
+ If unsure, say N.
+
config PAGE_POISONING
bool "Poison pages after freeing"
select PAGE_POISONING_NO_SANITY if HIBERNATION
diff --git a/mm/Makefile b/mm/Makefile
index d210cc9d6f80..ac5e5ba78874 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -33,7 +33,7 @@ mmu-$(CONFIG_MMU) += process_vm_access.o
endif
obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
- maccess.o page_alloc.o page-writeback.o \
+ maccess.o page-writeback.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o mmu_context.o percpu.o slab_common.o \
@@ -41,6 +41,11 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
interval_tree.o list_lru.o workingset.o \
debug.o $(mmu-y)
+# Give 'page_alloc' its own module-parameter namespace
+page-alloc-y := page_alloc.o
+page-alloc-$(CONFIG_SHUFFLE_PAGE_ALLOCATOR) += shuffle.o
+
+obj-y += page-alloc.o
obj-y += init-mm.o
obj-y += memblock.o
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index ad6723e9d110..b55f28fbe831 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -191,8 +191,6 @@ static int __init cma_debugfs_init(void)
int i;
cma_debugfs_root = debugfs_create_dir("cma", NULL);
- if (!cma_debugfs_root)
- return -ENOMEM;
for (i = 0; i < cma_area_count; i++)
cma_debugfs_add_one(&cma_areas[i], i);
diff --git a/mm/compaction.c b/mm/compaction.c
index ef29490b0f46..98f99f41dfdc 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -66,7 +66,7 @@ static unsigned long release_freepages(struct list_head *freelist)
return high_pfn;
}
-static void map_pages(struct list_head *list)
+static void split_map_pages(struct list_head *list)
{
unsigned int i, order, nr_pages;
struct page *page, *next;
@@ -237,6 +237,70 @@ static bool pageblock_skip_persistent(struct page *page)
return false;
}
+static bool
+__reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source,
+ bool check_target)
+{
+ struct page *page = pfn_to_online_page(pfn);
+ struct page *end_page;
+ unsigned long block_pfn;
+
+ if (!page)
+ return false;
+ if (zone != page_zone(page))
+ return false;
+ if (pageblock_skip_persistent(page))
+ return false;
+
+ /*
+ * If skip is already cleared do no further checking once the
+ * restart points have been set.
+ */
+ if (check_source && check_target && !get_pageblock_skip(page))
+ return true;
+
+ /*
+ * If clearing skip for the target scanner, do not select a
+ * non-movable pageblock as the starting point.
+ */
+ if (!check_source && check_target &&
+ get_pageblock_migratetype(page) != MIGRATE_MOVABLE)
+ return false;
+
+ /*
+ * Only clear the hint if a sample indicates there is either a
+ * free page or an LRU page in the block. One or other condition
+ * is necessary for the block to be a migration source/target.
+ */
+ block_pfn = pageblock_start_pfn(pfn);
+ pfn = max(block_pfn, zone->zone_start_pfn);
+ page = pfn_to_page(pfn);
+ if (zone != page_zone(page))
+ return false;
+ pfn = block_pfn + pageblock_nr_pages;
+ pfn = min(pfn, zone_end_pfn(zone));
+ end_page = pfn_to_page(pfn);
+
+ do {
+ if (pfn_valid_within(pfn)) {
+ if (check_source && PageLRU(page)) {
+ clear_pageblock_skip(page);
+ return true;
+ }
+
+ if (check_target && PageBuddy(page)) {
+ clear_pageblock_skip(page);
+ return true;
+ }
+ }
+
+ page += (1 << PAGE_ALLOC_COSTLY_ORDER);
+ pfn += (1 << PAGE_ALLOC_COSTLY_ORDER);
+ } while (page < end_page);
+
+ return false;
+}
+
/*
* This function is called to clear all cached information on pageblocks that
* should be skipped for page isolation when the migrate and free page scanner
@@ -244,30 +308,54 @@ static bool pageblock_skip_persistent(struct page *page)
*/
static void __reset_isolation_suitable(struct zone *zone)
{
- unsigned long start_pfn = zone->zone_start_pfn;
- unsigned long end_pfn = zone_end_pfn(zone);
- unsigned long pfn;
+ unsigned long migrate_pfn = zone->zone_start_pfn;
+ unsigned long free_pfn = zone_end_pfn(zone);
+ unsigned long reset_migrate = free_pfn;
+ unsigned long reset_free = migrate_pfn;
+ bool source_set = false;
+ bool free_set = false;
+
+ if (!zone->compact_blockskip_flush)
+ return;
zone->compact_blockskip_flush = false;
- /* Walk the zone and mark every pageblock as suitable for isolation */
- for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
- struct page *page;
-
+ /*
+ * Walk the zone and update pageblock skip information. Source looks
+ * for PageLRU while target looks for PageBuddy. When the scanner
+ * is found, both PageBuddy and PageLRU are checked as the pageblock
+ * is suitable as both source and target.
+ */
+ for (; migrate_pfn < free_pfn; migrate_pfn += pageblock_nr_pages,
+ free_pfn -= pageblock_nr_pages) {
cond_resched();
- page = pfn_to_online_page(pfn);
- if (!page)
- continue;
- if (zone != page_zone(page))
- continue;
- if (pageblock_skip_persistent(page))
- continue;
+ /* Update the migrate PFN */
+ if (__reset_isolation_pfn(zone, migrate_pfn, true, source_set) &&
+ migrate_pfn < reset_migrate) {
+ source_set = true;
+ reset_migrate = migrate_pfn;
+ zone->compact_init_migrate_pfn = reset_migrate;
+ zone->compact_cached_migrate_pfn[0] = reset_migrate;
+ zone->compact_cached_migrate_pfn[1] = reset_migrate;
+ }
- clear_pageblock_skip(page);
+ /* Update the free PFN */
+ if (__reset_isolation_pfn(zone, free_pfn, free_set, true) &&
+ free_pfn > reset_free) {
+ free_set = true;
+ reset_free = free_pfn;
+ zone->compact_init_free_pfn = reset_free;
+ zone->compact_cached_free_pfn = reset_free;
+ }
}
- reset_cached_positions(zone);
+ /* Leave no distance if no suitable block was reset */
+ if (reset_migrate >= reset_free) {
+ zone->compact_cached_migrate_pfn[0] = migrate_pfn;
+ zone->compact_cached_migrate_pfn[1] = migrate_pfn;
+ zone->compact_cached_free_pfn = free_pfn;
+ }
}
void reset_isolation_suitable(pg_data_t *pgdat)
@@ -286,15 +374,53 @@ void reset_isolation_suitable(pg_data_t *pgdat)
}
/*
+ * Sets the pageblock skip bit if it was clear. Note that this is a hint as
+ * no locks are required for readers or writers. Returns true if it was already set.
+ */
+static bool test_and_set_skip(struct compact_control *cc, struct page *page,
+ unsigned long pfn)
+{
+ bool skip;
+
+ /* Do not update if the skip hint is being ignored */
+ if (cc->ignore_skip_hint)
+ return false;
+
+ if (!IS_ALIGNED(pfn, pageblock_nr_pages))
+ return false;
+
+ skip = get_pageblock_skip(page);
+ if (!skip && !cc->no_set_skip_hint)
+ set_pageblock_skip(page);
+
+ return skip;
+}
+
+static void update_cached_migrate(struct compact_control *cc, unsigned long pfn)
+{
+ struct zone *zone = cc->zone;
+
+ pfn = pageblock_end_pfn(pfn);
+
+ /* Set for isolation rather than compaction */
+ if (cc->no_set_skip_hint)
+ return;
+
+ if (pfn > zone->compact_cached_migrate_pfn[0])
+ zone->compact_cached_migrate_pfn[0] = pfn;
+ if (cc->mode != MIGRATE_ASYNC &&
+ pfn > zone->compact_cached_migrate_pfn[1])
+ zone->compact_cached_migrate_pfn[1] = pfn;
+}
+
+/*
* If no pages were isolated then mark this pageblock to be skipped in the
* future. The information is later cleared by __reset_isolation_suitable().
*/
static void update_pageblock_skip(struct compact_control *cc,
- struct page *page, unsigned long nr_isolated,
- bool migrate_scanner)
+ struct page *page, unsigned long pfn)
{
struct zone *zone = cc->zone;
- unsigned long pfn;
if (cc->no_set_skip_hint)
return;
@@ -302,24 +428,11 @@ static void update_pageblock_skip(struct compact_control *cc,
if (!page)
return;
- if (nr_isolated)
- return;
-
set_pageblock_skip(page);
- pfn = page_to_pfn(page);
-
/* Update where async and sync compaction should restart */
- if (migrate_scanner) {
- if (pfn > zone->compact_cached_migrate_pfn[0])
- zone->compact_cached_migrate_pfn[0] = pfn;
- if (cc->mode != MIGRATE_ASYNC &&
- pfn > zone->compact_cached_migrate_pfn[1])
- zone->compact_cached_migrate_pfn[1] = pfn;
- } else {
- if (pfn < zone->compact_cached_free_pfn)
- zone->compact_cached_free_pfn = pfn;
- }
+ if (pfn < zone->compact_cached_free_pfn)
+ zone->compact_cached_free_pfn = pfn;
}
#else
static inline bool isolation_suitable(struct compact_control *cc,
@@ -334,32 +447,42 @@ static inline bool pageblock_skip_persistent(struct page *page)
}
static inline void update_pageblock_skip(struct compact_control *cc,
- struct page *page, unsigned long nr_isolated,
- bool migrate_scanner)
+ struct page *page, unsigned long pfn)
+{
+}
+
+static void update_cached_migrate(struct compact_control *cc, unsigned long pfn)
+{
+}
+
+static bool test_and_set_skip(struct compact_control *cc, struct page *page,
+ unsigned long pfn)
{
+ return false;
}
#endif /* CONFIG_COMPACTION */
/*
* Compaction requires the taking of some coarse locks that are potentially
- * very heavily contended. For async compaction, back out if the lock cannot
- * be taken immediately. For sync compaction, spin on the lock if needed.
+ * very heavily contended. For async compaction, trylock and record if the
+ * lock is contended. The lock will still be acquired but compaction will
+ * abort when the current block is finished regardless of success rate.
+ * Sync compaction acquires the lock.
*
- * Returns true if the lock is held
- * Returns false if the lock is not held and compaction should abort
+ * Always returns true which makes it easier to track lock state in callers.
*/
-static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags,
+static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags,
struct compact_control *cc)
{
- if (cc->mode == MIGRATE_ASYNC) {
- if (!spin_trylock_irqsave(lock, *flags)) {
- cc->contended = true;
- return false;
- }
- } else {
- spin_lock_irqsave(lock, *flags);
+ /* Track if the lock is contended in async mode */
+ if (cc->mode == MIGRATE_ASYNC && !cc->contended) {
+ if (spin_trylock_irqsave(lock, *flags))
+ return true;
+
+ cc->contended = true;
}
+ spin_lock_irqsave(lock, *flags);
return true;
}
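The reworked compact_lock_irqsave() above changes async behaviour from "trylock or abort" to "trylock once, note the contention, then take the lock anyway"; the recorded cc->contended flag is acted on later, at a pageblock boundary, rather than aborting mid-block. A minimal userspace analogue of that pattern using POSIX threads; the names are illustrative, and the kernel itself uses spin_trylock_irqsave()/spin_lock_irqsave():

#include <pthread.h>
#include <stdbool.h>

struct scan_state {
        bool async;
        bool contended;        /* consulted later, at a safe stopping point */
};

/* Always acquires the lock; in async mode it records whether it had to wait. */
void scan_lock(pthread_mutex_t *lock, struct scan_state *st)
{
        if (st->async && !st->contended) {
                if (pthread_mutex_trylock(lock) == 0)
                        return;
                st->contended = true;
        }
        pthread_mutex_lock(lock);
}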
@@ -391,37 +514,7 @@ static bool compact_unlock_should_abort(spinlock_t *lock,
return true;
}
- if (need_resched()) {
- if (cc->mode == MIGRATE_ASYNC) {
- cc->contended = true;
- return true;
- }
- cond_resched();
- }
-
- return false;
-}
-
-/*
- * Aside from avoiding lock contention, compaction also periodically checks
- * need_resched() and either schedules in sync compaction or aborts async
- * compaction. This is similar to what compact_unlock_should_abort() does, but
- * is used where no lock is concerned.
- *
- * Returns false when no scheduling was needed, or sync compaction scheduled.
- * Returns true when async compaction should abort.
- */
-static inline bool compact_should_abort(struct compact_control *cc)
-{
- /* async compaction aborts if contended */
- if (need_resched()) {
- if (cc->mode == MIGRATE_ASYNC) {
- cc->contended = true;
- return true;
- }
-
- cond_resched();
- }
+ cond_resched();
return false;
}
@@ -435,19 +528,24 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
unsigned long *start_pfn,
unsigned long end_pfn,
struct list_head *freelist,
+ unsigned int stride,
bool strict)
{
int nr_scanned = 0, total_isolated = 0;
- struct page *cursor, *valid_page = NULL;
+ struct page *cursor;
unsigned long flags = 0;
bool locked = false;
unsigned long blockpfn = *start_pfn;
unsigned int order;
+ /* Strict mode is for isolation, speed is secondary */
+ if (strict)
+ stride = 1;
+
cursor = pfn_to_page(blockpfn);
/* Isolate free pages. */
- for (; blockpfn < end_pfn; blockpfn++, cursor++) {
+ for (; blockpfn < end_pfn; blockpfn += stride, cursor += stride) {
int isolated;
struct page *page = cursor;
@@ -465,9 +563,6 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
if (!pfn_valid_within(blockpfn))
goto isolate_fail;
- if (!valid_page)
- valid_page = page;
-
/*
* For compound pages such as THP and hugetlbfs, we can save
* potentially a lot of iterations if we skip them at once.
@@ -495,18 +590,8 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
* recheck as well.
*/
if (!locked) {
- /*
- * The zone lock must be held to isolate freepages.
- * Unfortunately this is a very coarse lock and can be
- * heavily contended if there are parallel allocations
- * or parallel compactions. For async compaction do not
- * spin on the lock and we acquire the lock as late as
- * possible.
- */
- locked = compact_trylock_irqsave(&cc->zone->lock,
+ locked = compact_lock_irqsave(&cc->zone->lock,
&flags, cc);
- if (!locked)
- break;
/* Recheck this is a buddy page under lock */
if (!PageBuddy(page))
@@ -565,10 +650,6 @@ isolate_fail:
if (strict && blockpfn < end_pfn)
total_isolated = 0;
- /* Update the pageblock-skip if the whole pageblock was scanned */
- if (blockpfn == end_pfn)
- update_pageblock_skip(cc, valid_page, total_isolated, false);
-
cc->total_free_scanned += nr_scanned;
if (total_isolated)
count_compact_events(COMPACTISOLATED, total_isolated);
@@ -626,7 +707,7 @@ isolate_freepages_range(struct compact_control *cc,
break;
isolated = isolate_freepages_block(cc, &isolate_start_pfn,
- block_end_pfn, &freelist, true);
+ block_end_pfn, &freelist, 0, true);
/*
* In strict mode, isolate_freepages_block() returns 0 if
@@ -644,7 +725,7 @@ isolate_freepages_range(struct compact_control *cc,
}
/* __isolate_free_page() does not map the pages */
- map_pages(&freelist);
+ split_map_pages(&freelist);
if (pfn < end_pfn) {
/* Loop terminated early, cleanup. */
@@ -702,6 +783,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
unsigned long start_pfn = low_pfn;
bool skip_on_failure = false;
unsigned long next_skip_pfn = 0;
+ bool skip_updated = false;
/*
* Ensure that there are not too many pages isolated from the LRU
@@ -719,8 +801,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
return 0;
}
- if (compact_should_abort(cc))
- return 0;
+ cond_resched();
if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) {
skip_on_failure = true;
@@ -768,8 +849,19 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
page = pfn_to_page(low_pfn);
- if (!valid_page)
+ /*
+ * Check if the pageblock has already been marked skipped.
+ * Only the aligned PFN is checked as the caller isolates
+ * COMPACT_CLUSTER_MAX at a time so the second call must
+ * not falsely conclude that the block should be skipped.
+ */
+ if (!valid_page && IS_ALIGNED(low_pfn, pageblock_nr_pages)) {
+ if (!cc->ignore_skip_hint && get_pageblock_skip(page)) {
+ low_pfn = end_pfn;
+ goto isolate_abort;
+ }
valid_page = page;
+ }
/*
* Skip if free. We read page order here without zone lock
@@ -848,10 +940,15 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/* If we already hold the lock, we can skip some rechecking */
if (!locked) {
- locked = compact_trylock_irqsave(zone_lru_lock(zone),
+ locked = compact_lock_irqsave(zone_lru_lock(zone),
&flags, cc);
- if (!locked)
- break;
+
+ /* Try to get exclusive access under lock */
+ if (!skip_updated) {
+ skip_updated = true;
+ if (test_and_set_skip(cc, page, low_pfn))
+ goto isolate_abort;
+ }
/* Recheck PageLRU and PageCompound under lock */
if (!PageLRU(page))
@@ -887,16 +984,13 @@ isolate_success:
nr_isolated++;
/*
- * Record where we could have freed pages by migration and not
- * yet flushed them to buddy allocator.
- * - this is the lowest page that was isolated and likely be
- * then freed by migration.
+ * Avoid isolating too much unless this block is being
+ * rescanned (e.g. dirty/writeback pages, parallel allocation)
+ * or a lock is contended. For contention, isolate quickly to
+ * potentially remove one source of contention.
*/
- if (!cc->last_migrated_pfn)
- cc->last_migrated_pfn = low_pfn;
-
- /* Avoid isolating too much */
- if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
+ if (cc->nr_migratepages == COMPACT_CLUSTER_MAX &&
+ !cc->rescan && !cc->contended) {
++low_pfn;
break;
}
@@ -918,7 +1012,6 @@ isolate_fail:
}
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
- cc->last_migrated_pfn = 0;
nr_isolated = 0;
}
@@ -939,15 +1032,23 @@ isolate_fail:
if (unlikely(low_pfn > end_pfn))
low_pfn = end_pfn;
+isolate_abort:
if (locked)
spin_unlock_irqrestore(zone_lru_lock(zone), flags);
/*
- * Update the pageblock-skip information and cached scanner pfn,
- * if the whole pageblock was scanned without isolating any page.
+ * Update the cached scanner pfn once the pageblock has been scanned.
+ * Pages will either be migrated, in which case there is no point
+ * scanning in the near future, or migration failed, in which case the
+ * failure reason may persist. The block is marked for skipping if
+ * there were no pages isolated in the block or if the block is
+ * rescanned twice in a row.
*/
- if (low_pfn == end_pfn)
- update_pageblock_skip(cc, valid_page, nr_isolated, true);
+ if (low_pfn == end_pfn && (!nr_isolated || cc->rescan)) {
+ if (valid_page && !skip_updated)
+ set_pageblock_skip(valid_page);
+ update_cached_migrate(cc, low_pfn);
+ }
trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
nr_scanned, nr_isolated);
@@ -1013,6 +1114,9 @@ static bool suitable_migration_source(struct compact_control *cc,
{
int block_mt;
+ if (pageblock_skip_persistent(page))
+ return false;
+
if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction)
return true;
@@ -1050,6 +1154,12 @@ static bool suitable_migration_target(struct compact_control *cc,
return false;
}
+static inline unsigned int
+freelist_scan_limit(struct compact_control *cc)
+{
+ return (COMPACT_CLUSTER_MAX >> cc->fast_search_fail) + 1;
+}
+
/*
* Test whether the free scanner has reached the same or lower pageblock than
* the migration scanner, and compaction should thus terminate.
@@ -1061,6 +1171,248 @@ static inline bool compact_scanners_met(struct compact_control *cc)
}
/*
+ * Used when scanning for a suitable migration target which scans freelists
+ * in reverse. Reorders the list so that the unscanned pages are scanned
+ * first on the next iteration of the free scanner
+ */
+static void
+move_freelist_head(struct list_head *freelist, struct page *freepage)
+{
+ LIST_HEAD(sublist);
+
+ if (!list_is_last(freelist, &freepage->lru)) {
+ list_cut_before(&sublist, freelist, &freepage->lru);
+ if (!list_empty(&sublist))
+ list_splice_tail(&sublist, freelist);
+ }
+}
+
+/*
+ * Similar to move_freelist_head except used by the migration scanner
+ * when scanning forward. It's possible for these list operations to
+ * move against each other if they search the free list exactly in
+ * lockstep.
+ */
+static void
+move_freelist_tail(struct list_head *freelist, struct page *freepage)
+{
+ LIST_HEAD(sublist);
+
+ if (!list_is_first(freelist, &freepage->lru)) {
+ list_cut_position(&sublist, freelist, &freepage->lru);
+ if (!list_empty(&sublist))
+ list_splice_tail(&sublist, freelist);
+ }
+}
+
+static void
+fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long nr_isolated)
+{
+ unsigned long start_pfn, end_pfn;
+ struct page *page = pfn_to_page(pfn);
+
+ /* Do not search around if there are enough pages already */
+ if (cc->nr_freepages >= cc->nr_migratepages)
+ return;
+
+ /* Minimise scanning during async compaction */
+ if (cc->direct_compaction && cc->mode == MIGRATE_ASYNC)
+ return;
+
+ /* Pageblock boundaries */
+ start_pfn = pageblock_start_pfn(pfn);
+ end_pfn = min(start_pfn + pageblock_nr_pages, zone_end_pfn(cc->zone));
+
+ /* Scan before */
+ if (start_pfn != pfn) {
+ isolate_freepages_block(cc, &start_pfn, pfn, &cc->freepages, 1, false);
+ if (cc->nr_freepages >= cc->nr_migratepages)
+ return;
+ }
+
+ /* Scan after */
+ start_pfn = pfn + nr_isolated;
+ if (start_pfn != end_pfn)
+ isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false);
+
+ /* Skip this pageblock in the future as it's full or nearly full */
+ if (cc->nr_freepages < cc->nr_migratepages)
+ set_pageblock_skip(page);
+}
+
+/* Search orders in round-robin fashion */
+static int next_search_order(struct compact_control *cc, int order)
+{
+ order--;
+ if (order < 0)
+ order = cc->order - 1;
+
+ /* Search wrapped around? */
+ if (order == cc->search_order) {
+ cc->search_order--;
+ if (cc->search_order < 0)
+ cc->search_order = cc->order - 1;
+ return -1;
+ }
+
+ return order;
+}
+
+static unsigned long
+fast_isolate_freepages(struct compact_control *cc)
+{
+ unsigned int limit = min(1U, freelist_scan_limit(cc) >> 1);
+ unsigned int nr_scanned = 0;
+ unsigned long low_pfn, min_pfn, high_pfn = 0, highest = 0;
+ unsigned long nr_isolated = 0;
+ unsigned long distance;
+ struct page *page = NULL;
+ bool scan_start = false;
+ int order;
+
+ /* Full compaction passes in a negative order */
+ if (cc->order <= 0)
+ return cc->free_pfn;
+
+ /*
+ * If starting the scan, use a deeper search and use the highest
+ * PFN found if a suitable one is not found.
+ */
+ if (cc->free_pfn >= cc->zone->compact_init_free_pfn) {
+ limit = pageblock_nr_pages >> 1;
+ scan_start = true;
+ }
+
+ /*
+ * Preferred point is in the top quarter of the scan space but take
+ * a pfn from the top half if the search is problematic.
+ */
+ distance = (cc->free_pfn - cc->migrate_pfn);
+ low_pfn = pageblock_start_pfn(cc->free_pfn - (distance >> 2));
+ min_pfn = pageblock_start_pfn(cc->free_pfn - (distance >> 1));
+
+ if (WARN_ON_ONCE(min_pfn > low_pfn))
+ low_pfn = min_pfn;
+
+ /*
+ * Search starts from the last successful isolation order or the next
+ * order to search after a previous failure
+ */
+ cc->search_order = min_t(unsigned int, cc->order - 1, cc->search_order);
+
+ for (order = cc->search_order;
+ !page && order >= 0;
+ order = next_search_order(cc, order)) {
+ struct free_area *area = &cc->zone->free_area[order];
+ struct list_head *freelist;
+ struct page *freepage;
+ unsigned long flags;
+ unsigned int order_scanned = 0;
+
+ if (!area->nr_free)
+ continue;
+
+ spin_lock_irqsave(&cc->zone->lock, flags);
+ freelist = &area->free_list[MIGRATE_MOVABLE];
+ list_for_each_entry_reverse(freepage, freelist, lru) {
+ unsigned long pfn;
+
+ order_scanned++;
+ nr_scanned++;
+ pfn = page_to_pfn(freepage);
+
+ if (pfn >= highest)
+ highest = pageblock_start_pfn(pfn);
+
+ if (pfn >= low_pfn) {
+ cc->fast_search_fail = 0;
+ cc->search_order = order;
+ page = freepage;
+ break;
+ }
+
+ if (pfn >= min_pfn && pfn > high_pfn) {
+ high_pfn = pfn;
+
+ /* Shorten the scan if a candidate is found */
+ limit >>= 1;
+ }
+
+ if (order_scanned >= limit)
+ break;
+ }
+
+ /* Use a minimum pfn if a preferred one was not found */
+ if (!page && high_pfn) {
+ page = pfn_to_page(high_pfn);
+
+ /* Update freepage for the list reorder below */
+ freepage = page;
+ }
+
+ /* Reorder so that a future search skips recent pages */
+ move_freelist_head(freelist, freepage);
+
+ /* Isolate the page if available */
+ if (page) {
+ if (__isolate_free_page(page, order)) {
+ set_page_private(page, order);
+ nr_isolated = 1 << order;
+ cc->nr_freepages += nr_isolated;
+ list_add_tail(&page->lru, &cc->freepages);
+ count_compact_events(COMPACTISOLATED, nr_isolated);
+ } else {
+ /* If isolation fails, abort the search */
+ order = -1;
+ page = NULL;
+ }
+ }
+
+ spin_unlock_irqrestore(&cc->zone->lock, flags);
+
+ /*
+ * Smaller scan on next order so the total scan is related
+ * to freelist_scan_limit.
+ */
+ if (order_scanned >= limit)
+ limit = min(1U, limit >> 1);
+ }
+
+ if (!page) {
+ cc->fast_search_fail++;
+ if (scan_start) {
+ /*
+ * Use the highest PFN found above min. If one was
+ * not found, be pessimistic for direct compaction
+ * and use the min mark.
+ */
+ if (highest) {
+ page = pfn_to_page(highest);
+ cc->free_pfn = highest;
+ } else {
+ if (cc->direct_compaction) {
+ page = pfn_to_page(min_pfn);
+ cc->free_pfn = min_pfn;
+ }
+ }
+ }
+ }
+
+ if (highest && highest >= cc->zone->compact_cached_free_pfn) {
+ highest -= pageblock_nr_pages;
+ cc->zone->compact_cached_free_pfn = highest;
+ }
+
+ cc->total_free_scanned += nr_scanned;
+ if (!page)
+ return cc->free_pfn;
+
+ low_pfn = page_to_pfn(page);
+ fast_isolate_around(cc, low_pfn, nr_isolated);
+ return low_pfn;
+}
+
+/*
* Based on information in the current compact_control, find blocks
* suitable for isolating free pages from and then isolate them.
*/
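next_search_order() above walks the allocation orders downwards, wraps back to cc->order - 1, and stops once the walk returns to the order it started from, while cc->search_order remembers where the next invocation should begin. A small self-contained sketch of the same round-robin walk outside the kernel, with illustrative names:

#include <stdio.h>

struct search {
        int order;              /* requested order; orders 0..order-1 are scanned */
        int search_order;       /* order this round started from */
};

/* Step to the next lower order, wrapping; -1 means the walk is complete. */
static int next_search_order(struct search *s, int order)
{
        order--;
        if (order < 0)
                order = s->order - 1;

        /* Wrapped all the way around: start one lower next time. */
        if (order == s->search_order) {
                s->search_order--;
                if (s->search_order < 0)
                        s->search_order = s->order - 1;
                return -1;
        }
        return order;
}

int main(void)
{
        struct search s = { .order = 5, .search_order = 2 };
        int order;

        for (order = s.search_order; order >= 0;
             order = next_search_order(&s, order))
                printf("scan order %d\n", order);        /* prints 2 1 0 4 3 */
        return 0;
}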
@@ -1073,6 +1425,12 @@ static void isolate_freepages(struct compact_control *cc)
unsigned long block_end_pfn; /* end of current pageblock */
unsigned long low_pfn; /* lowest pfn scanner is able to scan */
struct list_head *freelist = &cc->freepages;
+ unsigned int stride;
+
+ /* Try a small search of the free lists for a candidate */
+ isolate_start_pfn = fast_isolate_freepages(cc);
+ if (cc->nr_freepages)
+ goto splitmap;
/*
* Initialise the free scanner. The starting point is where we last
@@ -1086,10 +1444,11 @@ static void isolate_freepages(struct compact_control *cc)
* is using.
*/
isolate_start_pfn = cc->free_pfn;
- block_start_pfn = pageblock_start_pfn(cc->free_pfn);
+ block_start_pfn = pageblock_start_pfn(isolate_start_pfn);
block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
zone_end_pfn(zone));
low_pfn = pageblock_end_pfn(cc->migrate_pfn);
+ stride = cc->mode == MIGRATE_ASYNC ? COMPACT_CLUSTER_MAX : 1;
/*
* Isolate free pages until enough are available to migrate the
@@ -1100,14 +1459,14 @@ static void isolate_freepages(struct compact_control *cc)
block_end_pfn = block_start_pfn,
block_start_pfn -= pageblock_nr_pages,
isolate_start_pfn = block_start_pfn) {
+ unsigned long nr_isolated;
+
/*
* This can iterate a massively long zone without finding any
- * suitable migration targets, so periodically check if we need
- * to schedule, or even abort async compaction.
+ * suitable migration targets, so periodically check resched.
*/
- if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
- && compact_should_abort(cc))
- break;
+ if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)))
+ cond_resched();
page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
zone);
@@ -1123,15 +1482,15 @@ static void isolate_freepages(struct compact_control *cc)
continue;
/* Found a block suitable for isolating free pages from. */
- isolate_freepages_block(cc, &isolate_start_pfn, block_end_pfn,
- freelist, false);
+ nr_isolated = isolate_freepages_block(cc, &isolate_start_pfn,
+ block_end_pfn, freelist, stride, false);
- /*
- * If we isolated enough freepages, or aborted due to lock
- * contention, terminate.
- */
- if ((cc->nr_freepages >= cc->nr_migratepages)
- || cc->contended) {
+ /* Update the skip hint if the full pageblock was scanned */
+ if (isolate_start_pfn == block_end_pfn)
+ update_pageblock_skip(cc, page, block_start_pfn);
+
+ /* Are enough freepages isolated? */
+ if (cc->nr_freepages >= cc->nr_migratepages) {
if (isolate_start_pfn >= block_end_pfn) {
/*
* Restart at previous pageblock if more
@@ -1148,10 +1507,14 @@ static void isolate_freepages(struct compact_control *cc)
*/
break;
}
- }
- /* __isolate_free_page() does not map the pages */
- map_pages(freelist);
+ /* Adjust stride depending on isolation */
+ if (nr_isolated) {
+ stride = 1;
+ continue;
+ }
+ stride = min_t(unsigned int, COMPACT_CLUSTER_MAX, stride << 1);
+ }
/*
* Record where the free scanner will restart next time. Either we
@@ -1160,6 +1523,10 @@ static void isolate_freepages(struct compact_control *cc)
* and the loop terminated due to isolate_start_pfn < low_pfn
*/
cc->free_pfn = isolate_start_pfn;
+
+splitmap:
+ /* __isolate_free_page() does not map the pages */
+ split_map_pages(freelist);
}
/*
@@ -1172,13 +1539,8 @@ static struct page *compaction_alloc(struct page *migratepage,
struct compact_control *cc = (struct compact_control *)data;
struct page *freepage;
- /*
- * Isolate free pages if necessary, and if we are not aborting due to
- * contention.
- */
if (list_empty(&cc->freepages)) {
- if (!cc->contended)
- isolate_freepages(cc);
+ isolate_freepages(cc);
if (list_empty(&cc->freepages))
return NULL;
@@ -1217,6 +1579,147 @@ typedef enum {
*/
int sysctl_compact_unevictable_allowed __read_mostly = 1;
+static inline void
+update_fast_start_pfn(struct compact_control *cc, unsigned long pfn)
+{
+ if (cc->fast_start_pfn == ULONG_MAX)
+ return;
+
+ if (!cc->fast_start_pfn)
+ cc->fast_start_pfn = pfn;
+
+ cc->fast_start_pfn = min(cc->fast_start_pfn, pfn);
+}
+
+static inline unsigned long
+reinit_migrate_pfn(struct compact_control *cc)
+{
+ if (!cc->fast_start_pfn || cc->fast_start_pfn == ULONG_MAX)
+ return cc->migrate_pfn;
+
+ cc->migrate_pfn = cc->fast_start_pfn;
+ cc->fast_start_pfn = ULONG_MAX;
+
+ return cc->migrate_pfn;
+}
+
+/*
+ * Briefly search the free lists for a migration source that already has
+ * some free pages to reduce the number of pages that need migration
+ * before a pageblock is free.
+ */
+static unsigned long fast_find_migrateblock(struct compact_control *cc)
+{
+ unsigned int limit = freelist_scan_limit(cc);
+ unsigned int nr_scanned = 0;
+ unsigned long distance;
+ unsigned long pfn = cc->migrate_pfn;
+ unsigned long high_pfn;
+ int order;
+
+ /* Skip hints are relied on to avoid repeats on the fast search */
+ if (cc->ignore_skip_hint)
+ return pfn;
+
+ /*
+ * If the migrate_pfn is not at the start of a zone or the start
+ * of a pageblock then assume this is a continuation of a previous
+ * scan restarted due to COMPACT_CLUSTER_MAX.
+ */
+ if (pfn != cc->zone->zone_start_pfn && pfn != pageblock_start_pfn(pfn))
+ return pfn;
+
+ /*
+ * For smaller orders, just linearly scan as the number of pages
+ * to migrate should be relatively small and does not necessarily
+ * justify freeing up a large block for a small allocation.
+ */
+ if (cc->order <= PAGE_ALLOC_COSTLY_ORDER)
+ return pfn;
+
+ /*
+ * Only allow kcompactd and direct requests for movable pages to
+ * quickly clear out a MOVABLE pageblock for allocation. This
+ * reduces the risk that a large movable pageblock is freed for
+ * an unmovable/reclaimable small allocation.
+ */
+ if (cc->direct_compaction && cc->migratetype != MIGRATE_MOVABLE)
+ return pfn;
+
+ /*
+ * When starting the migration scanner, pick any pageblock within the
+ * first half of the search space. Otherwise try to pick a pageblock
+ * within the first eighth to reduce the chances that a migration
+ * target later becomes a source.
+ */
+ distance = (cc->free_pfn - cc->migrate_pfn) >> 1;
+ if (cc->migrate_pfn != cc->zone->zone_start_pfn)
+ distance >>= 2;
+ high_pfn = pageblock_start_pfn(cc->migrate_pfn + distance);
+
+ for (order = cc->order - 1;
+ order >= PAGE_ALLOC_COSTLY_ORDER && pfn == cc->migrate_pfn && nr_scanned < limit;
+ order--) {
+ struct free_area *area = &cc->zone->free_area[order];
+ struct list_head *freelist;
+ unsigned long flags;
+ struct page *freepage;
+
+ if (!area->nr_free)
+ continue;
+
+ spin_lock_irqsave(&cc->zone->lock, flags);
+ freelist = &area->free_list[MIGRATE_MOVABLE];
+ list_for_each_entry(freepage, freelist, lru) {
+ unsigned long free_pfn;
+
+ nr_scanned++;
+ free_pfn = page_to_pfn(freepage);
+ if (free_pfn < high_pfn) {
+ /*
+ * Avoid if skipped recently. Ideally it would
+ * move to the tail but even safe iteration of
+ * the list assumes an entry is deleted, not
+ * reordered.
+ */
+ if (get_pageblock_skip(freepage)) {
+ if (list_is_last(freelist, &freepage->lru))
+ break;
+
+ continue;
+ }
+
+ /* Reorder so that a future search skips recent pages */
+ move_freelist_tail(freelist, freepage);
+
+ update_fast_start_pfn(cc, free_pfn);
+ pfn = pageblock_start_pfn(free_pfn);
+ cc->fast_search_fail = 0;
+ set_pageblock_skip(freepage);
+ break;
+ }
+
+ if (nr_scanned >= limit) {
+ cc->fast_search_fail++;
+ move_freelist_tail(freelist, freepage);
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&cc->zone->lock, flags);
+ }
+
+ cc->total_migrate_scanned += nr_scanned;
+
+ /*
+ * If fast scanning failed then use a cached entry for a page block
+ * that had free pages as the basis for starting a linear scan.
+ */
+ if (pfn == cc->migrate_pfn)
+ pfn = reinit_migrate_pfn(cc);
+
+ return pfn;
+}
+
/*
* Isolate all pages that can be migrated from the first suitable block,
* starting at the block pointed to by the migrate scanner pfn within
@@ -1232,16 +1735,25 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
const isolate_mode_t isolate_mode =
(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
(cc->mode != MIGRATE_SYNC ? ISOLATE_ASYNC_MIGRATE : 0);
+ bool fast_find_block;
/*
* Start at where we last stopped, or beginning of the zone as
- * initialized by compact_zone()
+ * initialized by compact_zone(). The first failure will use
+ * the lowest PFN as the starting point for linear scanning.
*/
- low_pfn = cc->migrate_pfn;
+ low_pfn = fast_find_migrateblock(cc);
block_start_pfn = pageblock_start_pfn(low_pfn);
if (block_start_pfn < zone->zone_start_pfn)
block_start_pfn = zone->zone_start_pfn;
+ /*
+ * fast_find_migrateblock() marks a pageblock as skipped, so to avoid
+ * the isolation_suitable() check below, check whether the fast
+ * search was successful.
+ */
+ fast_find_block = low_pfn != cc->migrate_pfn && !cc->fast_search_fail;
+
/* Only scan within a pageblock boundary */
block_end_pfn = pageblock_end_pfn(low_pfn);
@@ -1250,6 +1762,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
* Do not cross the free scanner.
*/
for (; block_end_pfn <= cc->free_pfn;
+ fast_find_block = false,
low_pfn = block_end_pfn,
block_start_pfn = block_end_pfn,
block_end_pfn += pageblock_nr_pages) {
@@ -1257,34 +1770,45 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
/*
* This can potentially iterate a massively long zone with
* many pageblocks unsuitable, so periodically check if we
- * need to schedule, or even abort async compaction.
+ * need to schedule.
*/
- if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
- && compact_should_abort(cc))
- break;
+ if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)))
+ cond_resched();
page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
zone);
if (!page)
continue;
- /* If isolation recently failed, do not retry */
- if (!isolation_suitable(cc, page))
+ /*
+ * If isolation recently failed, do not retry. Only check the
+ * pageblock once. COMPACT_CLUSTER_MAX causes a pageblock
+ * to be visited multiple times. Assume skip was checked
+ * before making it "skip" so other compaction instances do
+ * not scan the same block.
+ */
+ if (IS_ALIGNED(low_pfn, pageblock_nr_pages) &&
+ !fast_find_block && !isolation_suitable(cc, page))
continue;
/*
- * For async compaction, also only scan in MOVABLE blocks.
- * Async compaction is optimistic to see if the minimum amount
- * of work satisfies the allocation.
+ * For async compaction, also only scan in MOVABLE blocks
+ * without huge pages. Async compaction is optimistic to see
+ * if the minimum amount of work satisfies the allocation.
+ * The cached PFN is updated as it's possible that all
+ * remaining blocks between source and target are unsuitable
+ * and the compaction scanners fail to meet.
*/
- if (!suitable_migration_source(cc, page))
+ if (!suitable_migration_source(cc, page)) {
+ update_cached_migrate(cc, block_end_pfn);
continue;
+ }
/* Perform the isolation */
low_pfn = isolate_migratepages_block(cc, low_pfn,
block_end_pfn, isolate_mode);
- if (!low_pfn || cc->contended)
+ if (!low_pfn)
return ISOLATE_ABORT;
/*
@@ -1310,19 +1834,16 @@ static inline bool is_via_compact_memory(int order)
return order == -1;
}
-static enum compact_result __compact_finished(struct zone *zone,
- struct compact_control *cc)
+static enum compact_result __compact_finished(struct compact_control *cc)
{
unsigned int order;
const int migratetype = cc->migratetype;
-
- if (cc->contended || fatal_signal_pending(current))
- return COMPACT_CONTENDED;
+ int ret;
/* Compaction run completes if the migrate and free scanner meet */
if (compact_scanners_met(cc)) {
/* Let the next compaction start anew. */
- reset_cached_positions(zone);
+ reset_cached_positions(cc->zone);
/*
* Mark that the PG_migrate_skip information should be cleared
@@ -1331,7 +1852,7 @@ static enum compact_result __compact_finished(struct zone *zone,
* based on an allocation request.
*/
if (cc->direct_compaction)
- zone->compact_blockskip_flush = true;
+ cc->zone->compact_blockskip_flush = true;
if (cc->whole_zone)
return COMPACT_COMPLETE;
@@ -1342,30 +1863,29 @@ static enum compact_result __compact_finished(struct zone *zone,
if (is_via_compact_memory(cc->order))
return COMPACT_CONTINUE;
- if (cc->finishing_block) {
- /*
- * We have finished the pageblock, but better check again that
- * we really succeeded.
- */
- if (IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages))
- cc->finishing_block = false;
- else
- return COMPACT_CONTINUE;
- }
+ /*
+ * Always finish scanning a pageblock to reduce the possibility of
+ * fallbacks in the future. This is particularly important when
+ * migration source is unmovable/reclaimable but it's not worth
+ * special casing.
+ */
+ if (!IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages))
+ return COMPACT_CONTINUE;
/* Direct compactor: Is a suitable page free? */
+ ret = COMPACT_NO_SUITABLE_PAGE;
for (order = cc->order; order < MAX_ORDER; order++) {
- struct free_area *area = &zone->free_area[order];
+ struct free_area *area = &cc->zone->free_area[order];
bool can_steal;
/* Job done if page is free of the right migratetype */
- if (!list_empty(&area->free_list[migratetype]))
+ if (!free_area_empty(area, migratetype))
return COMPACT_SUCCESS;
#ifdef CONFIG_CMA
/* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
if (migratetype == MIGRATE_MOVABLE &&
- !list_empty(&area->free_list[MIGRATE_CMA]))
+ !free_area_empty(area, MIGRATE_CMA))
return COMPACT_SUCCESS;
#endif
/*
@@ -1393,21 +1913,23 @@ static enum compact_result __compact_finished(struct zone *zone,
return COMPACT_SUCCESS;
}
- cc->finishing_block = true;
- return COMPACT_CONTINUE;
+ ret = COMPACT_CONTINUE;
+ break;
}
}
- return COMPACT_NO_SUITABLE_PAGE;
+ if (cc->contended || fatal_signal_pending(current))
+ ret = COMPACT_CONTENDED;
+
+ return ret;
}
-static enum compact_result compact_finished(struct zone *zone,
- struct compact_control *cc)
+static enum compact_result compact_finished(struct compact_control *cc)
{
int ret;
- ret = __compact_finished(zone, cc);
- trace_mm_compaction_finished(zone, cc->order, ret);
+ ret = __compact_finished(cc);
+ trace_mm_compaction_finished(cc->zone, cc->order, ret);
if (ret == COMPACT_NO_SUITABLE_PAGE)
ret = COMPACT_CONTINUE;
@@ -1534,15 +2056,18 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
return false;
}
-static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc)
+static enum compact_result
+compact_zone(struct compact_control *cc, struct capture_control *capc)
{
enum compact_result ret;
- unsigned long start_pfn = zone->zone_start_pfn;
- unsigned long end_pfn = zone_end_pfn(zone);
+ unsigned long start_pfn = cc->zone->zone_start_pfn;
+ unsigned long end_pfn = zone_end_pfn(cc->zone);
+ unsigned long last_migrated_pfn;
const bool sync = cc->mode != MIGRATE_ASYNC;
+ bool update_cached;
cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask);
- ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
+ ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags,
cc->classzone_idx);
/* Compaction is likely to fail */
if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
@@ -1555,8 +2080,8 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
* Clear pageblock skip if there were failures recently and compaction
* is about to be retried after being deferred.
*/
- if (compaction_restarting(zone, cc->order))
- __reset_isolation_suitable(zone);
+ if (compaction_restarting(cc->zone, cc->order))
+ __reset_isolation_suitable(cc->zone);
/*
* Setup to move all movable pages to the end of the zone. Used cached
@@ -1564,43 +2089,76 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
* want to compact the whole zone), but check that it is initialised
* by ensuring the values are within zone boundaries.
*/
+ cc->fast_start_pfn = 0;
if (cc->whole_zone) {
cc->migrate_pfn = start_pfn;
cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
} else {
- cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
- cc->free_pfn = zone->compact_cached_free_pfn;
+ cc->migrate_pfn = cc->zone->compact_cached_migrate_pfn[sync];
+ cc->free_pfn = cc->zone->compact_cached_free_pfn;
if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
- zone->compact_cached_free_pfn = cc->free_pfn;
+ cc->zone->compact_cached_free_pfn = cc->free_pfn;
}
if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
cc->migrate_pfn = start_pfn;
- zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
- zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
+ cc->zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
+ cc->zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
}
- if (cc->migrate_pfn == start_pfn)
+ if (cc->migrate_pfn <= cc->zone->compact_init_migrate_pfn)
cc->whole_zone = true;
}
- cc->last_migrated_pfn = 0;
+ last_migrated_pfn = 0;
+
+ /*
+ * Migrate has separate cached PFNs for ASYNC and SYNC* migration on
+ * the basis that some migrations will fail in ASYNC mode. However,
+ * if the cached PFNs match and pageblocks are skipped due to having
+ * no isolation candidates, then the sync state does not matter.
+ * Until a pageblock with isolation candidates is found, keep the
+ * cached PFNs in sync to avoid revisiting the same blocks.
+ */
+ update_cached = !sync &&
+ cc->zone->compact_cached_migrate_pfn[0] == cc->zone->compact_cached_migrate_pfn[1];
trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
cc->free_pfn, end_pfn, sync);
migrate_prep_local();
- while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
+ while ((ret = compact_finished(cc)) == COMPACT_CONTINUE) {
int err;
+ unsigned long start_pfn = cc->migrate_pfn;
+
+ /*
+ * Avoid multiple rescans which can happen if a page cannot be
+ * isolated (dirty/writeback in async mode) or if the migrated
+ * pages are being allocated before the pageblock is cleared.
+ * The first rescan will capture the entire pageblock for
+ * migration. If it fails, it'll be marked skip and scanning
+ * will proceed as normal.
+ */
+ cc->rescan = false;
+ if (pageblock_start_pfn(last_migrated_pfn) ==
+ pageblock_start_pfn(start_pfn)) {
+ cc->rescan = true;
+ }
- switch (isolate_migratepages(zone, cc)) {
+ switch (isolate_migratepages(cc->zone, cc)) {
case ISOLATE_ABORT:
ret = COMPACT_CONTENDED;
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
+ last_migrated_pfn = 0;
goto out;
case ISOLATE_NONE:
+ if (update_cached) {
+ cc->zone->compact_cached_migrate_pfn[1] =
+ cc->zone->compact_cached_migrate_pfn[0];
+ }
+
/*
* We haven't isolated and migrated anything, but
* there might still be unflushed migrations from
@@ -1608,6 +2166,8 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
*/
goto check_drain;
case ISOLATE_SUCCESS:
+ update_cached = false;
+ last_migrated_pfn = start_pfn;
;
}
@@ -1639,8 +2199,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
cc->migrate_pfn = block_end_pfn(
cc->migrate_pfn - 1, cc->order);
/* Draining pcplists is useless in this case */
- cc->last_migrated_pfn = 0;
-
+ last_migrated_pfn = 0;
}
}
@@ -1652,21 +2211,26 @@ check_drain:
* compact_finished() can detect immediately if allocation
* would succeed.
*/
- if (cc->order > 0 && cc->last_migrated_pfn) {
+ if (cc->order > 0 && last_migrated_pfn) {
int cpu;
unsigned long current_block_start =
block_start_pfn(cc->migrate_pfn, cc->order);
- if (cc->last_migrated_pfn < current_block_start) {
+ if (last_migrated_pfn < current_block_start) {
cpu = get_cpu();
lru_add_drain_cpu(cpu);
- drain_local_pages(zone);
+ drain_local_pages(cc->zone);
put_cpu();
/* No more flushing until we migrate again */
- cc->last_migrated_pfn = 0;
+ last_migrated_pfn = 0;
}
}
+ /* Stop if a page has been captured */
+ if (capc && capc->page) {
+ ret = COMPACT_SUCCESS;
+ break;
+ }
}
out:
@@ -1685,8 +2249,8 @@ out:
* Only go back, not forward. The cached pfn might have been
* already reset to zone end in compact_finished()
*/
- if (free_pfn > zone->compact_cached_free_pfn)
- zone->compact_cached_free_pfn = free_pfn;
+ if (free_pfn > cc->zone->compact_cached_free_pfn)
+ cc->zone->compact_cached_free_pfn = free_pfn;
}
count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned);
@@ -1700,7 +2264,8 @@ out:
static enum compact_result compact_zone_order(struct zone *zone, int order,
gfp_t gfp_mask, enum compact_priority prio,
- unsigned int alloc_flags, int classzone_idx)
+ unsigned int alloc_flags, int classzone_idx,
+ struct page **capture)
{
enum compact_result ret;
struct compact_control cc = {
@@ -1709,6 +2274,7 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
.total_migrate_scanned = 0,
.total_free_scanned = 0,
.order = order,
+ .search_order = order,
.gfp_mask = gfp_mask,
.zone = zone,
.mode = (prio == COMPACT_PRIO_ASYNC) ?
@@ -1720,14 +2286,24 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
.ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY),
.ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY)
};
+ struct capture_control capc = {
+ .cc = &cc,
+ .page = NULL,
+ };
+
+ if (capture)
+ current->capture_control = &capc;
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
- ret = compact_zone(zone, &cc);
+ ret = compact_zone(&cc, &capc);
VM_BUG_ON(!list_empty(&cc.freepages));
VM_BUG_ON(!list_empty(&cc.migratepages));
+ *capture = capc.page;
+ current->capture_control = NULL;
+
return ret;
}
@@ -1745,7 +2321,7 @@ int sysctl_extfrag_threshold = 500;
*/
enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
- enum compact_priority prio)
+ enum compact_priority prio, struct page **capture)
{
int may_perform_io = gfp_mask & __GFP_IO;
struct zoneref *z;
@@ -1773,7 +2349,7 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
}
status = compact_zone_order(zone, order, gfp_mask, prio,
- alloc_flags, ac_classzone_idx(ac));
+ alloc_flags, ac_classzone_idx(ac), capture);
rc = max(status, rc);
/* The allocation should succeed, stop compacting */
@@ -1841,7 +2417,7 @@ static void compact_node(int nid)
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
- compact_zone(zone, &cc);
+ compact_zone(&cc, NULL);
VM_BUG_ON(!list_empty(&cc.freepages));
VM_BUG_ON(!list_empty(&cc.migratepages));
@@ -1876,14 +2452,6 @@ int sysctl_compaction_handler(struct ctl_table *table, int write,
return 0;
}
-int sysctl_extfrag_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
-{
- proc_dointvec_minmax(table, write, buffer, length, ppos);
-
- return 0;
-}
-
#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
static ssize_t sysfs_compact_node(struct device *dev,
struct device_attribute *attr,
@@ -1948,6 +2516,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
struct zone *zone;
struct compact_control cc = {
.order = pgdat->kcompactd_max_order,
+ .search_order = pgdat->kcompactd_max_order,
.total_migrate_scanned = 0,
.total_free_scanned = 0,
.classzone_idx = pgdat->kcompactd_classzone_idx,
@@ -1983,7 +2552,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
if (kthread_should_stop())
return;
- status = compact_zone(zone, &cc);
+ status = compact_zone(&cc, NULL);
if (status == COMPACT_SUCCESS) {
compaction_defer_reset(zone, cc.order, false);
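Both fast search paths added above derive their scan budget from freelist_scan_limit(), which halves the number of free-list entries examined for each consecutive fast-search failure: (COMPACT_CLUSTER_MAX >> fast_search_fail) + 1. Assuming COMPACT_CLUSTER_MAX equals SWAP_CLUSTER_MAX (32) on this configuration, a tiny standalone calculation shows how quickly the budget decays:

#include <stdio.h>

#define COMPACT_CLUSTER_MAX 32        /* assumed: SWAP_CLUSTER_MAX here */

static unsigned int freelist_scan_limit(unsigned int fast_search_fail)
{
        return (COMPACT_CLUSTER_MAX >> fast_search_fail) + 1;
}

int main(void)
{
        unsigned int fail;

        for (fail = 0; fail <= 6; fail++)
                printf("failures=%u -> scan limit %u\n",
                       fail, freelist_scan_limit(fail));
        /* 33, 17, 9, 5, 3, 2, 1: repeated failures shrink the fast path quickly */
        return 0;
}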
diff --git a/mm/debug.c b/mm/debug.c
index 7d13941a72f9..c0b31b6c3877 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -44,7 +44,7 @@ const struct trace_print_flags vmaflag_names[] = {
void __dump_page(struct page *page, const char *reason)
{
- struct address_space *mapping = page_mapping(page);
+ struct address_space *mapping;
bool page_poisoned = PagePoisoned(page);
int mapcount;
@@ -58,6 +58,8 @@ void __dump_page(struct page *page, const char *reason)
goto hex_only;
}
+ mapping = page_mapping(page);
+
/*
* Avoid VM_BUG_ON() in page_mapcount().
* page->_mapcount space in struct page is used by sl[aou]b pages to
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 6d4b97e7e9e9..76a160083506 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -114,10 +114,9 @@ static DEVICE_ATTR(pools, 0444, show_pools, NULL);
* @size: size of the blocks in this pool.
* @align: alignment requirement for blocks; must be a power of two
* @boundary: returned blocks won't cross this power of two boundary
- * Context: !in_interrupt()
+ * Context: not in_interrupt()
*
- * Returns a dma allocation pool with the requested characteristics, or
- * null if one can't be created. Given one of these pools, dma_pool_alloc()
+ * Given one of these pools, dma_pool_alloc()
* may be used to allocate memory. Such memory will all have "consistent"
* DMA mappings, accessible by the device and its driver without using
* cache flushing primitives. The actual size of blocks allocated may be
@@ -127,6 +126,9 @@ static DEVICE_ATTR(pools, 0444, show_pools, NULL);
* cross that size boundary. This is useful for devices which have
* addressing restrictions on individual DMA transfers, such as not crossing
* boundaries of 4KBytes.
+ *
+ * Return: a dma allocation pool with the requested characteristics, or
+ * %NULL if one can't be created.
*/
struct dma_pool *dma_pool_create(const char *name, struct device *dev,
size_t size, size_t align, size_t boundary)
@@ -313,7 +315,7 @@ EXPORT_SYMBOL(dma_pool_destroy);
* @mem_flags: GFP_* bitmask
* @handle: pointer to dma address of block
*
- * This returns the kernel virtual address of a currently unused block,
+ * Return: the kernel virtual address of a currently unused block,
* and reports its dma address through the handle.
* If such a memory block can't be allocated, %NULL is returned.
*/
@@ -498,6 +500,9 @@ static int dmam_pool_match(struct device *dev, void *res, void *match_data)
*
* Managed dma_pool_create(). DMA pool created with this function is
* automatically destroyed on driver detach.
+ *
+ * Return: a managed dma allocation pool with the requested
+ * characteristics, or %NULL if one can't be created.
*/
struct dma_pool *dmam_pool_create(const char *name, struct device *dev,
size_t size, size_t align, size_t allocation)
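The dmapool.c hunks above, like most of the filemap.c ones that follow, reshape free-form return-value prose into the kernel-doc "Return:" section that scripts/kernel-doc extracts. A minimal illustration of the convention on a made-up function:

/**
 * example_lookup - find a cached object by id
 * @id: identifier to look up
 *
 * Context: may sleep.
 *
 * Return: pointer to the cached object, or %NULL if @id is not cached.
 */
struct example *example_lookup(unsigned long id);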
diff --git a/mm/failslab.c b/mm/failslab.c
index b135ebb88b6f..ec5aad211c5b 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -48,18 +48,12 @@ static int __init failslab_debugfs_init(void)
if (IS_ERR(dir))
return PTR_ERR(dir);
- if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
- &failslab.ignore_gfp_reclaim))
- goto fail;
- if (!debugfs_create_bool("cache-filter", mode, dir,
- &failslab.cache_filter))
- goto fail;
+ debugfs_create_bool("ignore-gfp-wait", mode, dir,
+ &failslab.ignore_gfp_reclaim);
+ debugfs_create_bool("cache-filter", mode, dir,
+ &failslab.cache_filter);
return 0;
-fail:
- debugfs_remove_recursive(dir);
-
- return -ENOMEM;
}
late_initcall(failslab_debugfs_init);
diff --git a/mm/filemap.c b/mm/filemap.c
index 9f5e323e883e..663f3b84990d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -392,6 +392,8 @@ static int filemap_check_and_keep_errors(struct address_space *mapping)
* opposed to a regular memory cleansing writeback. The difference between
* these two operations is that if a dirty page/buffer is encountered, it must
* be waited upon, and not just skipped over.
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
loff_t end, int sync_mode)
@@ -438,6 +440,8 @@ EXPORT_SYMBOL(filemap_fdatawrite_range);
*
* This is a mostly non-blocking flush. Not suitable for data-integrity
* purposes - I/O may not be started against all dirty pages.
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
int filemap_flush(struct address_space *mapping)
{
@@ -453,6 +457,9 @@ EXPORT_SYMBOL(filemap_flush);
*
* Find at least one page in the range supplied, usually used to check if
* direct writing in this range will trigger a writeback.
+ *
+ * Return: %true if at least one page exists in the specified range,
+ * %false otherwise.
*/
bool filemap_range_has_page(struct address_space *mapping,
loff_t start_byte, loff_t end_byte)
@@ -529,6 +536,8 @@ static void __filemap_fdatawait_range(struct address_space *mapping,
* Since the error status of the address space is cleared by this function,
* callers are responsible for checking the return value and handling and/or
* reporting the error.
+ *
+ * Return: error status of the address space.
*/
int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
loff_t end_byte)
@@ -551,6 +560,8 @@ EXPORT_SYMBOL(filemap_fdatawait_range);
* Since the error status of the file is advanced by this function,
* callers are responsible for checking the return value and handling and/or
* reporting the error.
+ *
+ * Return: error status of the address space vs. the file->f_wb_err cursor.
*/
int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
{
@@ -572,6 +583,8 @@ EXPORT_SYMBOL(file_fdatawait_range);
* Use this function if callers don't handle errors themselves. Expected
* call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
* fsfreeze(8)
+ *
+ * Return: error status of the address space.
*/
int filemap_fdatawait_keep_errors(struct address_space *mapping)
{
@@ -623,6 +636,8 @@ EXPORT_SYMBOL(filemap_write_and_wait);
*
* Note that @lend is inclusive (describes the last byte to be written) so
* that this function can be used to write to the very end-of-file (end = -1).
+ *
+ * Return: error status of the address space.
*/
int filemap_write_and_wait_range(struct address_space *mapping,
loff_t lstart, loff_t lend)
@@ -678,6 +693,8 @@ EXPORT_SYMBOL(__filemap_set_wb_err);
* While we handle mapping->wb_err with atomic operations, the f_wb_err
* value is protected by the f_lock since we must ensure that it reflects
* the latest value swapped in for this file descriptor.
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
int file_check_and_advance_wb_err(struct file *file)
{
@@ -720,6 +737,8 @@ EXPORT_SYMBOL(file_check_and_advance_wb_err);
*
* After writing out and waiting on the data, we check and advance the
* f_wb_err cursor to the latest value, and return any errors detected there.
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
{
@@ -753,6 +772,8 @@ EXPORT_SYMBOL(file_write_and_wait_range);
* caller must do that.
*
* The remove + add is atomic. This function cannot fail.
+ *
+ * Return: %0
*/
int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
{
@@ -867,6 +888,8 @@ error:
*
* This function is used to add a page to the pagecache. It must be locked.
* This function does not add the page to the LRU. The caller must do that.
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
pgoff_t offset, gfp_t gfp_mask)
@@ -1463,7 +1486,7 @@ EXPORT_SYMBOL(page_cache_prev_miss);
* If the slot holds a shadow entry of a previously evicted page, or a
* swap entry from shmem/tmpfs, it is returned.
*
- * Otherwise, %NULL is returned.
+ * Return: the found page or shadow entry, %NULL if nothing is found.
*/
struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
{
@@ -1521,9 +1544,9 @@ EXPORT_SYMBOL(find_get_entry);
* If the slot holds a shadow entry of a previously evicted page, or a
* swap entry from shmem/tmpfs, it is returned.
*
- * Otherwise, %NULL is returned.
- *
* find_lock_entry() may sleep.
+ *
+ * Return: the found page or shadow entry, %NULL if nothing is found.
*/
struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
{
@@ -1563,12 +1586,17 @@ EXPORT_SYMBOL(find_lock_entry);
* - FGP_CREAT: If page is not present then a new page is allocated using
* @gfp_mask and added to the page cache and the VM's LRU
* list. The page is returned locked and with an increased
- * refcount. Otherwise, NULL is returned.
+ * refcount.
+ * - FGP_FOR_MMAP: Similar to FGP_CREAT, only we want to allow the caller to do
+ * its own locking dance if the page is already in cache, or unlock the page
+ * before returning if we had to add the page to pagecache.
*
* If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
* if the GFP flags specified for FGP_CREAT are atomic.
*
* If there is a page cache page, it is returned with an increased refcount.
+ *
+ * Return: the found page or %NULL otherwise.
*/
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
int fgp_flags, gfp_t gfp_mask)
@@ -1616,7 +1644,7 @@ no_page:
if (!page)
return NULL;
- if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK)))
+ if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
fgp_flags |= FGP_LOCK;
/* Init accessed so avoid atomic mark_page_accessed later */
@@ -1630,6 +1658,13 @@ no_page:
if (err == -EEXIST)
goto repeat;
}
+
+ /*
+ * add_to_page_cache_lru locks the page, and for mmap we expect
+ * an unlocked page.
+ */
+ if (fgp_flags & FGP_FOR_MMAP)
+ unlock_page(page);
}
return page;
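With FGP_FOR_MMAP handled above, a page that had to be created is inserted into the page cache but handed back unlocked so that the fault path can take the page lock itself. A hedged sketch of how a caller combines the flags; it mirrors the fault-path usage this series introduces, but the wrapper below is illustrative rather than quoted from it:

#include <linux/pagemap.h>

/*
 * Look up @index, allocating and inserting a new page if needed.  The
 * page is returned unlocked because of FGP_FOR_MMAP.
 */
static struct page *fault_get_page(struct address_space *mapping,
                                   pgoff_t index, gfp_t gfp)
{
        return pagecache_get_page(mapping, index,
                                  FGP_CREAT | FGP_FOR_MMAP, gfp);
}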
@@ -1656,8 +1691,7 @@ EXPORT_SYMBOL(pagecache_get_page);
* Any shadow entries of evicted pages, or swap entries from
* shmem/tmpfs, are included in the returned array.
*
- * find_get_entries() returns the number of pages and shadow entries
- * which were found.
+ * Return: the number of pages and shadow entries which were found.
*/
unsigned find_get_entries(struct address_space *mapping,
pgoff_t start, unsigned int nr_entries,
@@ -1727,8 +1761,8 @@ retry:
* indexes. There may be holes in the indices due to not-present pages.
* We also update @start to index the next page for the traversal.
*
- * find_get_pages_range() returns the number of pages which were found. If this
- * number is smaller than @nr_pages, the end of specified range has been
+ * Return: the number of pages which were found. If this number is
+ * smaller than @nr_pages, the end of specified range has been
* reached.
*/
unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
@@ -1765,7 +1799,7 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
pages[ret] = page;
if (++ret == nr_pages) {
- *start = page->index + 1;
+ *start = xas.xa_index + 1;
goto out;
}
continue;
@@ -1801,7 +1835,7 @@ out:
* find_get_pages_contig() works exactly like find_get_pages(), except
* that the returned number of pages are guaranteed to be contiguous.
*
- * find_get_pages_contig() returns the number of pages which were found.
+ * Return: the number of pages which were found.
*/
unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
unsigned int nr_pages, struct page **pages)
@@ -1837,16 +1871,6 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
if (unlikely(page != xas_reload(&xas)))
goto put_page;
- /*
- * must check mapping and index after taking the ref.
- * otherwise we can get both false positives and false
- * negatives, which is just confusing to the caller.
- */
- if (!page->mapping || page_to_pgoff(page) != xas.xa_index) {
- put_page(page);
- break;
- }
-
pages[ret] = page;
if (++ret == nr_pages)
break;
@@ -1872,6 +1896,8 @@ EXPORT_SYMBOL(find_get_pages_contig);
*
* Like find_get_pages, except we only return pages which are tagged with
* @tag. We update @index to index the next page for the traversal.
+ *
+ * Return: the number of pages which were found.
*/
unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
pgoff_t end, xa_mark_t tag, unsigned int nr_pages,
@@ -1911,7 +1937,7 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
pages[ret] = page;
if (++ret == nr_pages) {
- *index = page->index + 1;
+ *index = xas.xa_index + 1;
goto out;
}
continue;
@@ -1949,6 +1975,8 @@ EXPORT_SYMBOL(find_get_pages_range_tag);
*
* Like find_get_entries, except we only return entries which are tagged with
* @tag.
+ *
+ * Return: the number of entries which were found.
*/
unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
xa_mark_t tag, unsigned int nr_entries,
@@ -2034,6 +2062,10 @@ static void shrink_readahead_size_eio(struct file *filp,
*
* This is really ugly. But the goto's actually try to clarify some
* of the logic when it comes to error handling etc.
+ *
+ * Return:
+ * * total number of bytes copied, including those that were already @written
+ * * negative error code if nothing was copied
*/
static ssize_t generic_file_buffered_read(struct kiocb *iocb,
struct iov_iter *iter, ssize_t written)
@@ -2295,6 +2327,9 @@ out:
*
* This is the "read_iter()" routine for all filesystems
* that can use the page cache directly.
+ * Return:
+ * * number of bytes copied, even for partial reads
+ * * negative error code if nothing was read
*/
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
@@ -2354,62 +2389,93 @@ out:
EXPORT_SYMBOL(generic_file_read_iter);
#ifdef CONFIG_MMU
-/**
- * page_cache_read - adds requested page to the page cache if not already there
- * @file: file to read
- * @offset: page index
- * @gfp_mask: memory allocation flags
- *
- * This adds the requested page to the page cache if it isn't already there,
- * and schedules an I/O to read in its contents from disk.
- */
-static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
+#define MMAP_LOTSAMISS (100)
+static struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
+ struct file *fpin)
{
- struct address_space *mapping = file->f_mapping;
- struct page *page;
- int ret;
+ int flags = vmf->flags;
- do {
- page = __page_cache_alloc(gfp_mask);
- if (!page)
- return -ENOMEM;
+ if (fpin)
+ return fpin;
- ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask);
- if (ret == 0)
- ret = mapping->a_ops->readpage(file, page);
- else if (ret == -EEXIST)
- ret = 0; /* losing race to add is OK */
+ /*
+ * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
+ * anything, so we only pin the file and drop the mmap_sem if only
+ * FAULT_FLAG_ALLOW_RETRY is set.
+ */
+ if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) ==
+ FAULT_FLAG_ALLOW_RETRY) {
+ fpin = get_file(vmf->vma->vm_file);
+ up_read(&vmf->vma->vm_mm->mmap_sem);
+ }
+ return fpin;
+}
- put_page(page);
+/*
+ * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_sem
+ * @vmf - the vm_fault for this fault.
+ * @page - the page to lock.
+ * @fpin - the pointer to the file we may pin (or is already pinned).
+ *
+ * This works similarly to lock_page_or_retry in that it can drop the mmap_sem.
+ * It differs in that it actually returns the page locked if it returns 1, and 0
+ * if it couldn't lock the page. If we did have to drop the mmap_sem then fpin
+ * will point to the pinned file and needs to be fput()'ed at a later point.
+ */
+static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
+ struct file **fpin)
+{
+ if (trylock_page(page))
+ return 1;
- } while (ret == AOP_TRUNCATED_PAGE);
+ if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
+ return 0;
- return ret;
+ *fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
+ if (vmf->flags & FAULT_FLAG_KILLABLE) {
+ if (__lock_page_killable(page)) {
+ /*
+ * We didn't have the right flags to drop the mmap_sem,
+ * but all fault_handlers only check for fatal signals
+ * if we return VM_FAULT_RETRY, so we need to drop the
+ * mmap_sem here and return 0 if we don't have a fpin.
+ */
+ if (*fpin == NULL)
+ up_read(&vmf->vma->vm_mm->mmap_sem);
+ return 0;
+ }
+ } else
+ __lock_page(page);
+ return 1;
}
-#define MMAP_LOTSAMISS (100)
/*
- * Synchronous readahead happens when we don't even find
- * a page in the page cache at all.
+ * Synchronous readahead happens when we don't even find a page in the page
+ * cache at all. We don't want to perform IO under the mmap sem, so if we have
+ * to drop the mmap sem we return the file that was pinned in order for us to do
+ * that. If we didn't pin a file then we return NULL. The file that is
+ * returned needs to be fput()'ed when we're done with it.
*/
-static void do_sync_mmap_readahead(struct vm_area_struct *vma,
- struct file_ra_state *ra,
- struct file *file,
- pgoff_t offset)
+static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
{
+ struct file *file = vmf->vma->vm_file;
+ struct file_ra_state *ra = &file->f_ra;
struct address_space *mapping = file->f_mapping;
+ struct file *fpin = NULL;
+ pgoff_t offset = vmf->pgoff;
/* If we don't want any read-ahead, don't bother */
- if (vma->vm_flags & VM_RAND_READ)
- return;
+ if (vmf->vma->vm_flags & VM_RAND_READ)
+ return fpin;
if (!ra->ra_pages)
- return;
+ return fpin;
- if (vma->vm_flags & VM_SEQ_READ) {
+ if (vmf->vma->vm_flags & VM_SEQ_READ) {
+ fpin = maybe_unlock_mmap_for_io(vmf, fpin);
page_cache_sync_readahead(mapping, ra, file, offset,
ra->ra_pages);
- return;
+ return fpin;
}
/* Avoid banging the cache line if not needed */
@@ -2421,37 +2487,44 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
* stop bothering with read-ahead. It will only hurt.
*/
if (ra->mmap_miss > MMAP_LOTSAMISS)
- return;
+ return fpin;
/*
* mmap read-around
*/
+ fpin = maybe_unlock_mmap_for_io(vmf, fpin);
ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
ra->size = ra->ra_pages;
ra->async_size = ra->ra_pages / 4;
ra_submit(ra, mapping, file);
+ return fpin;
}
/*
* Asynchronous readahead happens when we find the page and PG_readahead,
- * so we want to possibly extend the readahead further..
+ * so we want to possibly extend the readahead further. We return the file that
+ * was pinned if we have to drop the mmap_sem in order to do IO.
*/
-static void do_async_mmap_readahead(struct vm_area_struct *vma,
- struct file_ra_state *ra,
- struct file *file,
- struct page *page,
- pgoff_t offset)
+static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
+ struct page *page)
{
+ struct file *file = vmf->vma->vm_file;
+ struct file_ra_state *ra = &file->f_ra;
struct address_space *mapping = file->f_mapping;
+ struct file *fpin = NULL;
+ pgoff_t offset = vmf->pgoff;
/* If we don't want any read-ahead, don't bother */
- if (vma->vm_flags & VM_RAND_READ)
- return;
+ if (vmf->vma->vm_flags & VM_RAND_READ)
+ return fpin;
if (ra->mmap_miss > 0)
ra->mmap_miss--;
- if (PageReadahead(page))
+ if (PageReadahead(page)) {
+ fpin = maybe_unlock_mmap_for_io(vmf, fpin);
page_cache_async_readahead(mapping, ra, file,
page, offset, ra->ra_pages);
+ }
+ return fpin;
}
/**
@@ -2476,11 +2549,14 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma,
* has not been released.
*
* We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
+ *
+ * Return: bitwise-OR of %VM_FAULT_ codes.
*/
vm_fault_t filemap_fault(struct vm_fault *vmf)
{
int error;
struct file *file = vmf->vma->vm_file;
+ struct file *fpin = NULL;
struct address_space *mapping = file->f_mapping;
struct file_ra_state *ra = &file->f_ra;
struct inode *inode = mapping->host;
@@ -2502,23 +2578,26 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
* We found the page, so try async readahead before
* waiting for the lock.
*/
- do_async_mmap_readahead(vmf->vma, ra, file, page, offset);
+ fpin = do_async_mmap_readahead(vmf, page);
} else if (!page) {
/* No page in the page cache at all */
- do_sync_mmap_readahead(vmf->vma, ra, file, offset);
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
ret = VM_FAULT_MAJOR;
+ fpin = do_sync_mmap_readahead(vmf);
retry_find:
- page = find_get_page(mapping, offset);
- if (!page)
- goto no_cached_page;
+ page = pagecache_get_page(mapping, offset,
+ FGP_CREAT|FGP_FOR_MMAP,
+ vmf->gfp_mask);
+ if (!page) {
+ if (fpin)
+ goto out_retry;
+ return vmf_error(-ENOMEM);
+ }
}
- if (!lock_page_or_retry(page, vmf->vma->vm_mm, vmf->flags)) {
- put_page(page);
- return ret | VM_FAULT_RETRY;
- }
+ if (!lock_page_maybe_drop_mmap(vmf, page, &fpin))
+ goto out_retry;
/* Did it get truncated? */
if (unlikely(page->mapping != mapping)) {
@@ -2536,6 +2615,16 @@ retry_find:
goto page_not_uptodate;
/*
+ * We've made it this far and we had to drop our mmap_sem; now is the
+ * time to return to the upper layer and have it re-find the vma and
+ * redo the fault.
+ */
+ if (fpin) {
+ unlock_page(page);
+ goto out_retry;
+ }
+
+ /*
* Found the page and have a reference on it.
* We must recheck i_size under page lock.
*/
@@ -2549,28 +2638,6 @@ retry_find:
vmf->page = page;
return ret | VM_FAULT_LOCKED;
-no_cached_page:
- /*
- * We're only likely to ever get here if MADV_RANDOM is in
- * effect.
- */
- error = page_cache_read(file, offset, vmf->gfp_mask);
-
- /*
- * The page we want has now been added to the page cache.
- * In the unlikely event that someone removed it in the
- * meantime, we'll just come back here and read it again.
- */
- if (error >= 0)
- goto retry_find;
-
- /*
- * An error return from page_cache_read can result if the
- * system is low on memory, or a problem occurs while trying
- * to schedule I/O.
- */
- return vmf_error(error);
-
page_not_uptodate:
/*
* Umm, take care of errors if the page isn't up-to-date.
@@ -2579,12 +2646,15 @@ page_not_uptodate:
* and we need to check for errors.
*/
ClearPageError(page);
+ fpin = maybe_unlock_mmap_for_io(vmf, fpin);
error = mapping->a_ops->readpage(file, page);
if (!error) {
wait_on_page_locked(page);
if (!PageUptodate(page))
error = -EIO;
}
+ if (fpin)
+ goto out_retry;
put_page(page);
if (!error || error == AOP_TRUNCATED_PAGE)
@@ -2593,6 +2663,18 @@ page_not_uptodate:
/* Things didn't work out. Return zero to tell the mm layer so. */
shrink_readahead_size_eio(file, ra);
return VM_FAULT_SIGBUS;
+
+out_retry:
+ /*
+ * We dropped the mmap_sem, so we need to return to the fault handler to
+ * re-find the vma and come back and find our hopefully still populated
+ * page.
+ */
+ if (page)
+ put_page(page);
+ if (fpin)
+ fput(fpin);
+ return ret | VM_FAULT_RETRY;
}
EXPORT_SYMBOL(filemap_fault);
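
For context, a hedged sketch (not part of this patch; example_fault_retry() is hypothetical and heavily simplified) of the arch fault-handler retry loop that the new out_retry path relies on: once filemap_fault() returns VM_FAULT_RETRY with the mmap_sem already dropped, the upper layer re-takes the semaphore, re-finds the vma and retries the fault.

static vm_fault_t example_fault_retry(struct mm_struct *mm, unsigned long address)
{
	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
	struct vm_area_struct *vma;
	vm_fault_t fault;

retry:
	down_read(&mm->mmap_sem);
	vma = find_vma(mm, address);
	if (!vma || vma->vm_start > address) {
		up_read(&mm->mmap_sem);
		return VM_FAULT_SIGSEGV;
	}
	fault = handle_mm_fault(vma, address, flags);
	if (fault & VM_FAULT_RETRY) {
		/* mmap_sem was dropped for us; retry once without ALLOW_RETRY */
		flags &= ~FAULT_FLAG_ALLOW_RETRY;
		flags |= FAULT_FLAG_TRIED;
		goto retry;
	}
	up_read(&mm->mmap_sem);
	return fault;
}
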
@@ -2861,6 +2943,8 @@ out:
* not set, try to fill the page and wait for it to become unlocked.
*
* If the page does not get brought uptodate, return -EIO.
+ *
+ * Return: up to date page on success, ERR_PTR() on failure.
*/
struct page *read_cache_page(struct address_space *mapping,
pgoff_t index,
@@ -2881,6 +2965,8 @@ EXPORT_SYMBOL(read_cache_page);
* any new page allocations done using the specified allocation flags.
*
* If the page does not get brought uptodate, return -EIO.
+ *
+ * Return: up to date page on success, ERR_PTR() on failure.
*/
struct page *read_cache_page_gfp(struct address_space *mapping,
pgoff_t index,
@@ -3081,7 +3167,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
if (iocb->ki_flags & IOCB_NOWAIT) {
/* If there are pages to writeback, return */
if (filemap_range_has_page(inode->i_mapping, pos,
- pos + write_len))
+ pos + write_len - 1))
return -EAGAIN;
} else {
written = filemap_write_and_wait_range(mapping, pos,
@@ -3264,6 +3350,10 @@ EXPORT_SYMBOL(generic_perform_write);
* This function does *not* take care of syncing data in case of O_SYNC write.
* A caller has to handle it. This is mainly due to the fact that we want to
* avoid syncing under i_mutex.
+ *
+ * Return:
+ * * number of bytes written, even for truncated writes
+ * * negative error code if no data has been written at all
*/
ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
@@ -3348,6 +3438,10 @@ EXPORT_SYMBOL(__generic_file_write_iter);
* This is a wrapper around __generic_file_write_iter() to be used by most
* filesystems. It takes care of syncing the file in case of O_SYNC file
* and acquires i_mutex as needed.
+ * Return:
+ * * negative error code if no data has been written at all or
+ * vfs_fsync_range() failed for a synchronous write
+ * * number of bytes written, even for truncated writes
*/
ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
@@ -3374,8 +3468,7 @@ EXPORT_SYMBOL(generic_file_write_iter);
* @gfp_mask: memory allocation flags (and I/O mode)
*
* The address_space is to try to release any data against the page
- * (presumably at page->private). If the release was successful, return '1'.
- * Otherwise return zero.
+ * (presumably at page->private).
*
* This may also be called if PG_fscache is set on a page, indicating that the
* page is known to the local caching routines.
@@ -3383,6 +3476,7 @@ EXPORT_SYMBOL(generic_file_write_iter);
* The @gfp_mask argument specifies whether I/O may be performed to release
* this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS).
*
+ * Return: %1 if the release was successful, otherwise return zero.
*/
int try_to_release_page(struct page *page, gfp_t gfp_mask)
{
diff --git a/mm/gup.c b/mm/gup.c
index 23d3eb3a9e51..f84e22685aaa 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -13,6 +13,9 @@
#include <linux/sched/signal.h>
#include <linux/rwsem.h>
#include <linux/hugetlb.h>
+#include <linux/migrate.h>
+#include <linux/mm_inline.h>
+#include <linux/sched/mm.h>
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
@@ -1126,7 +1129,167 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
}
EXPORT_SYMBOL(get_user_pages);
+#if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA)
+
#ifdef CONFIG_FS_DAX
+static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
+{
+ long i;
+ struct vm_area_struct *vma_prev = NULL;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct vm_area_struct *vma = vmas[i];
+
+ if (vma == vma_prev)
+ continue;
+
+ vma_prev = vma;
+
+ if (vma_is_fsdax(vma))
+ return true;
+ }
+ return false;
+}
+#else
+static inline bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
+{
+ return false;
+}
+#endif
+
+#ifdef CONFIG_CMA
+static struct page *new_non_cma_page(struct page *page, unsigned long private)
+{
+ /*
+ * We want to make sure we allocate the new page from the same node
+ * as the source page.
+ */
+ int nid = page_to_nid(page);
+ /*
+ * Trying to allocate a page for migration. Ignore allocation
+ * failure warnings. We don't force __GFP_THISNODE here because
+ * this node here is the node where we have the CMA reservation and
+ * in some cases these nodes will have very little non-movable
+ * memory available for allocation.
+ */
+ gfp_t gfp_mask = GFP_USER | __GFP_NOWARN;
+
+ if (PageHighMem(page))
+ gfp_mask |= __GFP_HIGHMEM;
+
+#ifdef CONFIG_HUGETLB_PAGE
+ if (PageHuge(page)) {
+ struct hstate *h = page_hstate(page);
+ /*
+ * We don't want to dequeue from the pool because pool pages will
+ * mostly be from the CMA region.
+ */
+ return alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
+ }
+#endif
+ if (PageTransHuge(page)) {
+ struct page *thp;
+ /*
+ * ignore allocation failure warnings
+ */
+ gfp_t thp_gfpmask = GFP_TRANSHUGE | __GFP_NOWARN;
+
+ /*
+ * Remove the movable mask so that we don't allocate from
+ * CMA area again.
+ */
+ thp_gfpmask &= ~__GFP_MOVABLE;
+ thp = __alloc_pages_node(nid, thp_gfpmask, HPAGE_PMD_ORDER);
+ if (!thp)
+ return NULL;
+ prep_transhuge_page(thp);
+ return thp;
+ }
+
+ return __alloc_pages_node(nid, gfp_mask, 0);
+}
+
+static long check_and_migrate_cma_pages(unsigned long start, long nr_pages,
+ unsigned int gup_flags,
+ struct page **pages,
+ struct vm_area_struct **vmas)
+{
+ long i;
+ bool drain_allow = true;
+ bool migrate_allow = true;
+ LIST_HEAD(cma_page_list);
+
+check_again:
+ for (i = 0; i < nr_pages; i++) {
+ /*
+ * If we get a page from the CMA zone, since we are going to
+ * be pinning these entries, we might as well move them out
+ * of the CMA zone if possible.
+ */
+ if (is_migrate_cma_page(pages[i])) {
+
+ struct page *head = compound_head(pages[i]);
+
+ if (PageHuge(head)) {
+ isolate_huge_page(head, &cma_page_list);
+ } else {
+ if (!PageLRU(head) && drain_allow) {
+ lru_add_drain_all();
+ drain_allow = false;
+ }
+
+ if (!isolate_lru_page(head)) {
+ list_add_tail(&head->lru, &cma_page_list);
+ mod_node_page_state(page_pgdat(head),
+ NR_ISOLATED_ANON +
+ page_is_file_cache(head),
+ hpage_nr_pages(head));
+ }
+ }
+ }
+ }
+
+ if (!list_empty(&cma_page_list)) {
+ /*
+ * drop the above get_user_pages reference.
+ */
+ for (i = 0; i < nr_pages; i++)
+ put_page(pages[i]);
+
+ if (migrate_pages(&cma_page_list, new_non_cma_page,
+ NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE)) {
+ /*
+ * some of the pages failed migration. Do get_user_pages
+ * without migration.
+ */
+ migrate_allow = false;
+
+ if (!list_empty(&cma_page_list))
+ putback_movable_pages(&cma_page_list);
+ }
+ /*
+ * We did migrate all the pages; try to get the page references again,
+ * migrating any new CMA pages which we failed to isolate earlier.
+ */
+ nr_pages = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
+ if ((nr_pages > 0) && migrate_allow) {
+ drain_allow = true;
+ goto check_again;
+ }
+ }
+
+ return nr_pages;
+}
+#else
+static inline long check_and_migrate_cma_pages(unsigned long start, long nr_pages,
+ unsigned int gup_flags,
+ struct page **pages,
+ struct vm_area_struct **vmas)
+{
+ return nr_pages;
+}
+#endif
+
/*
* This is the same as get_user_pages() in that it assumes we are
* operating on the current task's mm, but it goes further to validate
@@ -1140,11 +1303,11 @@ EXPORT_SYMBOL(get_user_pages);
* Contrast this to iov_iter_get_pages() usages which are transient.
*/
long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas_arg)
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas_arg)
{
struct vm_area_struct **vmas = vmas_arg;
- struct vm_area_struct *vma_prev = NULL;
+ unsigned long flags;
long rc, i;
if (!pages)
@@ -1157,31 +1320,20 @@ long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
return -ENOMEM;
}
+ flags = memalloc_nocma_save();
rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
+ memalloc_nocma_restore(flags);
+ if (rc < 0)
+ goto out;
- for (i = 0; i < rc; i++) {
- struct vm_area_struct *vma = vmas[i];
-
- if (vma == vma_prev)
- continue;
-
- vma_prev = vma;
-
- if (vma_is_fsdax(vma))
- break;
- }
-
- /*
- * Either get_user_pages() failed, or the vma validation
- * succeeded, in either case we don't need to put_page() before
- * returning.
- */
- if (i >= rc)
+ if (check_dax_vmas(vmas, rc)) {
+ for (i = 0; i < rc; i++)
+ put_page(pages[i]);
+ rc = -EOPNOTSUPP;
goto out;
+ }
- for (i = 0; i < rc; i++)
- put_page(pages[i]);
- rc = -EOPNOTSUPP;
+ rc = check_and_migrate_cma_pages(start, rc, gup_flags, pages, vmas);
out:
if (vmas != vmas_arg)
kfree(vmas);
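
A hedged sketch of the caller side (not part of this patch; example_longterm_pin() is hypothetical): long-term pins still go through get_user_pages_longterm(), but with this change any CMA pages are migrated out of the CMA area before the pin is returned, and FS DAX vmas still fail with -EOPNOTSUPP.

static long example_longterm_pin(unsigned long uaddr, unsigned long npages,
				 struct page **pages)
{
	long pinned;

	down_read(&current->mm->mmap_sem);
	pinned = get_user_pages_longterm(uaddr, npages, FOLL_WRITE, pages, NULL);
	up_read(&current->mm->mmap_sem);

	/* Caller must put_page() each pinned page when it is done with it. */
	return pinned;
}
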
diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c
index 5b42d3d4b60a..6c0279e70cc4 100644
--- a/mm/gup_benchmark.c
+++ b/mm/gup_benchmark.c
@@ -122,12 +122,8 @@ static const struct file_operations gup_benchmark_fops = {
static int gup_benchmark_init(void)
{
- void *ret;
-
- ret = debugfs_create_file_unsafe("gup_benchmark", 0600, NULL, NULL,
- &gup_benchmark_fops);
- if (!ret)
- pr_warn("Failed to create gup_benchmark in debugfs");
+ debugfs_create_file_unsafe("gup_benchmark", 0600, NULL, NULL,
+ &gup_benchmark_fops);
return 0;
}
diff --git a/mm/hmm.c b/mm/hmm.c
index a04e4b810610..3c9781037918 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -30,6 +30,7 @@
#include <linux/hugetlb.h>
#include <linux/memremap.h>
#include <linux/jump_label.h>
+#include <linux/dma-mapping.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>
@@ -38,24 +39,15 @@
#if IS_ENABLED(CONFIG_HMM_MIRROR)
static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
-/*
- * struct hmm - HMM per mm struct
- *
- * @mm: mm struct this HMM struct is bound to
- * @lock: lock protecting ranges list
- * @ranges: list of range being snapshotted
- * @mirrors: list of mirrors for this mm
- * @mmu_notifier: mmu notifier to track updates to CPU page table
- * @mirrors_sem: read/write semaphore protecting the mirrors list
- */
-struct hmm {
- struct mm_struct *mm;
- spinlock_t lock;
- struct list_head ranges;
- struct list_head mirrors;
- struct mmu_notifier mmu_notifier;
- struct rw_semaphore mirrors_sem;
-};
+static inline struct hmm *hmm_get(struct mm_struct *mm)
+{
+ struct hmm *hmm = READ_ONCE(mm->hmm);
+
+ if (hmm && kref_get_unless_zero(&hmm->kref))
+ return hmm;
+
+ return NULL;
+}
/*
* hmm_register - register HMM against an mm (HMM internal)
@@ -67,25 +59,24 @@ struct hmm {
*/
static struct hmm *hmm_register(struct mm_struct *mm)
{
- struct hmm *hmm = READ_ONCE(mm->hmm);
+ struct hmm *hmm = hmm_get(mm);
bool cleanup = false;
- /*
- * The hmm struct can only be freed once the mm_struct goes away,
- * hence we should always have pre-allocated an new hmm struct
- * above.
- */
if (hmm)
return hmm;
hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
if (!hmm)
return NULL;
+ init_waitqueue_head(&hmm->wq);
INIT_LIST_HEAD(&hmm->mirrors);
init_rwsem(&hmm->mirrors_sem);
hmm->mmu_notifier.ops = NULL;
INIT_LIST_HEAD(&hmm->ranges);
- spin_lock_init(&hmm->lock);
+ mutex_init(&hmm->lock);
+ kref_init(&hmm->kref);
+ hmm->notifiers = 0;
+ hmm->dead = false;
hmm->mm = mm;
spin_lock(&mm->page_table_lock);
@@ -106,7 +97,7 @@ static struct hmm *hmm_register(struct mm_struct *mm)
if (__mmu_notifier_register(&hmm->mmu_notifier, mm))
goto error_mm;
- return mm->hmm;
+ return hmm;
error_mm:
spin_lock(&mm->page_table_lock);
@@ -118,54 +109,60 @@ error:
return NULL;
}
-void hmm_mm_destroy(struct mm_struct *mm)
-{
- kfree(mm->hmm);
-}
-
-static int hmm_invalidate_range(struct hmm *hmm, bool device,
- const struct hmm_update *update)
+static void hmm_free(struct kref *kref)
{
- struct hmm_mirror *mirror;
- struct hmm_range *range;
+ struct hmm *hmm = container_of(kref, struct hmm, kref);
+ struct mm_struct *mm = hmm->mm;
- spin_lock(&hmm->lock);
- list_for_each_entry(range, &hmm->ranges, list) {
- unsigned long addr, idx, npages;
+ mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);
- if (update->end < range->start || update->start >= range->end)
- continue;
+ spin_lock(&mm->page_table_lock);
+ if (mm->hmm == hmm)
+ mm->hmm = NULL;
+ spin_unlock(&mm->page_table_lock);
- range->valid = false;
- addr = max(update->start, range->start);
- idx = (addr - range->start) >> PAGE_SHIFT;
- npages = (min(range->end, update->end) - addr) >> PAGE_SHIFT;
- memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages);
- }
- spin_unlock(&hmm->lock);
+ kfree(hmm);
+}
- if (!device)
- return 0;
+static inline void hmm_put(struct hmm *hmm)
+{
+ kref_put(&hmm->kref, hmm_free);
+}
- down_read(&hmm->mirrors_sem);
- list_for_each_entry(mirror, &hmm->mirrors, list) {
- int ret;
+void hmm_mm_destroy(struct mm_struct *mm)
+{
+ struct hmm *hmm;
- ret = mirror->ops->sync_cpu_device_pagetables(mirror, update);
- if (!update->blockable && ret == -EAGAIN) {
- up_read(&hmm->mirrors_sem);
- return -EAGAIN;
- }
+ spin_lock(&mm->page_table_lock);
+ hmm = hmm_get(mm);
+ mm->hmm = NULL;
+ if (hmm) {
+ hmm->mm = NULL;
+ hmm->dead = true;
+ spin_unlock(&mm->page_table_lock);
+ hmm_put(hmm);
+ return;
}
- up_read(&hmm->mirrors_sem);
- return 0;
+ spin_unlock(&mm->page_table_lock);
}
static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
+ struct hmm *hmm = hmm_get(mm);
struct hmm_mirror *mirror;
- struct hmm *hmm = mm->hmm;
+ struct hmm_range *range;
+
+ /* Report this HMM as dying. */
+ hmm->dead = true;
+
+ /* Wake-up everyone waiting on any range. */
+ mutex_lock(&hmm->lock);
+ list_for_each_entry(range, &hmm->ranges, list) {
+ range->valid = false;
+ }
+ wake_up_all(&hmm->wq);
+ mutex_unlock(&hmm->lock);
down_write(&hmm->mirrors_sem);
mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror,
@@ -186,36 +183,95 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
struct hmm_mirror, list);
}
up_write(&hmm->mirrors_sem);
+
+ hmm_put(hmm);
}
static int hmm_invalidate_range_start(struct mmu_notifier *mn,
- const struct mmu_notifier_range *range)
+ const struct mmu_notifier_range *nrange)
{
+ struct hmm *hmm = hmm_get(nrange->mm);
+ struct hmm_mirror *mirror;
struct hmm_update update;
- struct hmm *hmm = range->mm->hmm;
+ struct hmm_range *range;
+ int ret = 0;
VM_BUG_ON(!hmm);
- update.start = range->start;
- update.end = range->end;
+ /* Check if hmm_mm_destroy() was called. */
+ if (hmm->mm == NULL)
+ goto out;
+
+ update.start = nrange->start;
+ update.end = nrange->end;
update.event = HMM_UPDATE_INVALIDATE;
- update.blockable = range->blockable;
- return hmm_invalidate_range(hmm, true, &update);
+ update.blockable = nrange->blockable;
+
+ if (nrange->blockable)
+ mutex_lock(&hmm->lock);
+ else if (!mutex_trylock(&hmm->lock)) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ hmm->notifiers++;
+ list_for_each_entry(range, &hmm->ranges, list) {
+ if (update.end < range->start || update.start >= range->end)
+ continue;
+
+ range->valid = false;
+ }
+ mutex_unlock(&hmm->lock);
+
+ if (nrange->blockable)
+ down_read(&hmm->mirrors_sem);
+ else if (!down_read_trylock(&hmm->mirrors_sem)) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ list_for_each_entry(mirror, &hmm->mirrors, list) {
+ int ret;
+
+ ret = mirror->ops->sync_cpu_device_pagetables(mirror, &update);
+ if (!update.blockable && ret == -EAGAIN) {
+ up_read(&hmm->mirrors_sem);
+ ret = -EAGAIN;
+ goto out;
+ }
+ }
+ up_read(&hmm->mirrors_sem);
+
+out:
+ hmm_put(hmm);
+ return ret;
}
static void hmm_invalidate_range_end(struct mmu_notifier *mn,
- const struct mmu_notifier_range *range)
+ const struct mmu_notifier_range *nrange)
{
- struct hmm_update update;
- struct hmm *hmm = range->mm->hmm;
+ struct hmm *hmm = hmm_get(nrange->mm);
VM_BUG_ON(!hmm);
- update.start = range->start;
- update.end = range->end;
- update.event = HMM_UPDATE_INVALIDATE;
- update.blockable = true;
- hmm_invalidate_range(hmm, false, &update);
+ /* Check if hmm_mm_destroy() was called. */
+ if (hmm->mm == NULL)
+ goto out;
+
+ mutex_lock(&hmm->lock);
+ hmm->notifiers--;
+ if (!hmm->notifiers) {
+ struct hmm_range *range;
+
+ list_for_each_entry(range, &hmm->ranges, list) {
+ if (range->valid)
+ continue;
+ range->valid = true;
+ }
+ wake_up_all(&hmm->wq);
+ }
+ mutex_unlock(&hmm->lock);
+
+out:
+ hmm_put(hmm);
}
static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
@@ -241,24 +297,13 @@ int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
if (!mm || !mirror || !mirror->ops)
return -EINVAL;
-again:
mirror->hmm = hmm_register(mm);
if (!mirror->hmm)
return -ENOMEM;
down_write(&mirror->hmm->mirrors_sem);
- if (mirror->hmm->mm == NULL) {
- /*
- * A racing hmm_mirror_unregister() is about to destroy the hmm
- * struct. Try again to allocate a new one.
- */
- up_write(&mirror->hmm->mirrors_sem);
- mirror->hmm = NULL;
- goto again;
- } else {
- list_add(&mirror->list, &mirror->hmm->mirrors);
- up_write(&mirror->hmm->mirrors_sem);
- }
+ list_add(&mirror->list, &mirror->hmm->mirrors);
+ up_write(&mirror->hmm->mirrors_sem);
return 0;
}
@@ -273,38 +318,24 @@ EXPORT_SYMBOL(hmm_mirror_register);
*/
void hmm_mirror_unregister(struct hmm_mirror *mirror)
{
- bool should_unregister = false;
- struct mm_struct *mm;
- struct hmm *hmm;
+ struct hmm *hmm = READ_ONCE(mirror->hmm);
- if (mirror->hmm == NULL)
+ if (hmm == NULL)
return;
- hmm = mirror->hmm;
down_write(&hmm->mirrors_sem);
list_del_init(&mirror->list);
- should_unregister = list_empty(&hmm->mirrors);
+ /* To protect us against double unregister ... */
mirror->hmm = NULL;
- mm = hmm->mm;
- hmm->mm = NULL;
up_write(&hmm->mirrors_sem);
- if (!should_unregister || mm == NULL)
- return;
-
- mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);
-
- spin_lock(&mm->page_table_lock);
- if (mm->hmm == hmm)
- mm->hmm = NULL;
- spin_unlock(&mm->page_table_lock);
-
- kfree(hmm);
+ hmm_put(hmm);
}
EXPORT_SYMBOL(hmm_mirror_unregister);
struct hmm_vma_walk {
struct hmm_range *range;
+ struct dev_pagemap *pgmap;
unsigned long last;
bool fault;
bool block;
@@ -323,13 +354,13 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
flags |= write_fault ? FAULT_FLAG_WRITE : 0;
ret = handle_mm_fault(vma, addr, flags);
if (ret & VM_FAULT_RETRY)
- return -EBUSY;
+ return -EAGAIN;
if (ret & VM_FAULT_ERROR) {
*pfn = range->values[HMM_PFN_ERROR];
return -EFAULT;
}
- return -EAGAIN;
+ return -EBUSY;
}
static int hmm_pfns_bad(unsigned long addr,
@@ -355,7 +386,7 @@ static int hmm_pfns_bad(unsigned long addr,
* @fault: should we fault or not ?
* @write_fault: write fault ?
* @walk: mm_walk structure
- * Returns: 0 on success, -EAGAIN after page fault, or page fault error
+ * Returns: 0 on success, -EBUSY after page fault, or page fault error
*
* This function will be called whenever pmd_none() or pte_none() returns true,
* or whenever there is no page directory covering the virtual address range.
@@ -367,23 +398,25 @@ static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
uint64_t *pfns = range->pfns;
- unsigned long i;
+ unsigned long i, page_size;
hmm_vma_walk->last = addr;
- i = (addr - range->start) >> PAGE_SHIFT;
- for (; addr < end; addr += PAGE_SIZE, i++) {
+ page_size = 1UL << range->page_shift;
+ i = (addr - range->start) >> range->page_shift;
+
+ for (; addr < end; addr += page_size, i++) {
pfns[i] = range->values[HMM_PFN_NONE];
if (fault || write_fault) {
int ret;
ret = hmm_vma_do_fault(walk, addr, write_fault,
&pfns[i]);
- if (ret != -EAGAIN)
+ if (ret != -EBUSY)
return ret;
}
}
- return (fault || write_fault) ? -EAGAIN : 0;
+ return (fault || write_fault) ? -EBUSY : 0;
}
static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
@@ -392,10 +425,21 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
{
struct hmm_range *range = hmm_vma_walk->range;
- *fault = *write_fault = false;
if (!hmm_vma_walk->fault)
return;
+ /*
+ * We not only consider the individual per-page request, we also
+ * consider the default flags requested for the range. The API can
+ * be used in two fashions: the first one where the HMM user coalesces
+ * multiple page faults into one request and sets flags per pfn for
+ * each of those faults; the second one where the HMM user wants to
+ * pre-fault a range with specific flags. For the latter it is a
+ * waste to have the user pre-fill the pfn array with a default
+ * flags value.
+ */
+ pfns = (pfns & range->pfn_flags_mask) | range->default_flags;
+
/* We aren't ask to do anything ... */
if (!(pfns & range->flags[HMM_PFN_VALID]))
return;
@@ -431,10 +475,11 @@ static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
return;
}
+ *fault = *write_fault = false;
for (i = 0; i < npages; ++i) {
hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags,
fault, write_fault);
- if ((*fault) || (*write_fault))
+ if ((*write_fault))
return;
}
}
@@ -465,6 +510,15 @@ static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
range->flags[HMM_PFN_VALID];
}
+static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
+{
+ if (!pud_present(pud))
+ return 0;
+ return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
+ range->flags[HMM_PFN_WRITE] :
+ range->flags[HMM_PFN_VALID];
+}
+
static int hmm_vma_handle_pmd(struct mm_walk *walk,
unsigned long addr,
unsigned long end,
@@ -486,8 +540,19 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
pfn = pmd_pfn(pmd) + pte_index(addr);
- for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
+ for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
+ if (pmd_devmap(pmd)) {
+ hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
+ hmm_vma_walk->pgmap);
+ if (unlikely(!hmm_vma_walk->pgmap))
+ return -EBUSY;
+ }
pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
+ }
+ if (hmm_vma_walk->pgmap) {
+ put_dev_pagemap(hmm_vma_walk->pgmap);
+ hmm_vma_walk->pgmap = NULL;
+ }
hmm_vma_walk->last = end;
return 0;
}
@@ -514,11 +579,11 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
uint64_t orig_pfn = *pfn;
*pfn = range->values[HMM_PFN_NONE];
- cpu_flags = pte_to_hmm_pfn_flags(range, pte);
- hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
- &fault, &write_fault);
+ fault = write_fault = false;
if (pte_none(pte)) {
+ hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0,
+ &fault, &write_fault);
if (fault || write_fault)
goto fault;
return 0;
@@ -557,7 +622,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
hmm_vma_walk->last = addr;
migration_entry_wait(vma->vm_mm,
pmdp, addr);
- return -EAGAIN;
+ return -EBUSY;
}
return 0;
}
@@ -565,15 +630,33 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
/* Report error for everything else */
*pfn = range->values[HMM_PFN_ERROR];
return -EFAULT;
+ } else {
+ cpu_flags = pte_to_hmm_pfn_flags(range, pte);
+ hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
+ &fault, &write_fault);
}
if (fault || write_fault)
goto fault;
+ if (pte_devmap(pte)) {
+ hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte),
+ hmm_vma_walk->pgmap);
+ if (unlikely(!hmm_vma_walk->pgmap))
+ return -EBUSY;
+ } else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) {
+ *pfn = range->values[HMM_PFN_SPECIAL];
+ return -EFAULT;
+ }
+
*pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags;
return 0;
fault:
+ if (hmm_vma_walk->pgmap) {
+ put_dev_pagemap(hmm_vma_walk->pgmap);
+ hmm_vma_walk->pgmap = NULL;
+ }
pte_unmap(ptep);
/* Fault any virtual address we were asked to fault */
return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
@@ -615,7 +698,7 @@ again:
if (fault || write_fault) {
hmm_vma_walk->last = addr;
pmd_migration_entry_wait(vma->vm_mm, pmdp);
- return -EAGAIN;
+ return -EBUSY;
}
return 0;
} else if (!pmd_present(pmd))
@@ -661,12 +744,147 @@ again:
return r;
}
}
+ if (hmm_vma_walk->pgmap) {
+ put_dev_pagemap(hmm_vma_walk->pgmap);
+ hmm_vma_walk->pgmap = NULL;
+ }
pte_unmap(ptep - 1);
hmm_vma_walk->last = addr;
return 0;
}
+static int hmm_vma_walk_pud(pud_t *pudp,
+ unsigned long start,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ struct vm_area_struct *vma = walk->vma;
+ unsigned long addr = start, next;
+ pmd_t *pmdp;
+ pud_t pud;
+ int ret;
+
+again:
+ pud = READ_ONCE(*pudp);
+ if (pud_none(pud))
+ return hmm_vma_walk_hole(start, end, walk);
+
+ if (pud_huge(pud) && pud_devmap(pud)) {
+ unsigned long i, npages, pfn;
+ uint64_t *pfns, cpu_flags;
+ bool fault, write_fault;
+
+ if (!pud_present(pud))
+ return hmm_vma_walk_hole(start, end, walk);
+
+ i = (addr - range->start) >> PAGE_SHIFT;
+ npages = (end - addr) >> PAGE_SHIFT;
+ pfns = &range->pfns[i];
+
+ cpu_flags = pud_to_hmm_pfn_flags(range, pud);
+ hmm_range_need_fault(hmm_vma_walk, pfns, npages,
+ cpu_flags, &fault, &write_fault);
+ if (fault || write_fault)
+ return hmm_vma_walk_hole_(addr, end, fault,
+ write_fault, walk);
+
+ pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+ for (i = 0; i < npages; ++i, ++pfn) {
+ hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
+ hmm_vma_walk->pgmap);
+ if (unlikely(!hmm_vma_walk->pgmap))
+ return -EBUSY;
+ pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
+ }
+ if (hmm_vma_walk->pgmap) {
+ put_dev_pagemap(hmm_vma_walk->pgmap);
+ hmm_vma_walk->pgmap = NULL;
+ }
+ hmm_vma_walk->last = end;
+ return 0;
+ }
+
+ split_huge_pud(vma, pudp, addr);
+ if (pud_none(*pudp))
+ goto again;
+
+ pmdp = pmd_offset(pudp, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ ret = hmm_vma_walk_pmd(pmdp, addr, next, walk);
+ if (ret)
+ return ret;
+ } while (pmdp++, addr = next, addr != end);
+
+ return 0;
+}
+
+static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
+ unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+ unsigned long addr = start, i, pfn, mask, size, pfn_inc;
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ struct vm_area_struct *vma = walk->vma;
+ struct hstate *h = hstate_vma(vma);
+ uint64_t orig_pfn, cpu_flags;
+ bool fault, write_fault;
+ spinlock_t *ptl;
+ pte_t entry;
+ int ret = 0;
+
+ size = 1UL << huge_page_shift(h);
+ mask = size - 1;
+ if (range->page_shift != PAGE_SHIFT) {
+ /* Make sure we are looking at full page. */
+ if (start & mask)
+ return -EINVAL;
+ if (end < (start + size))
+ return -EINVAL;
+ pfn_inc = size >> PAGE_SHIFT;
+ } else {
+ pfn_inc = 1;
+ size = PAGE_SIZE;
+ }
+
+
+ ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
+ entry = huge_ptep_get(pte);
+
+ i = (start - range->start) >> range->page_shift;
+ orig_pfn = range->pfns[i];
+ range->pfns[i] = range->values[HMM_PFN_NONE];
+ cpu_flags = pte_to_hmm_pfn_flags(range, entry);
+ fault = write_fault = false;
+ hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
+ &fault, &write_fault);
+ if (fault || write_fault) {
+ ret = -ENOENT;
+ goto unlock;
+ }
+
+ pfn = pte_pfn(entry) + (start & mask);
+ for (; addr < end; addr += size, i++, pfn += pfn_inc)
+ range->pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
+ hmm_vma_walk->last = end;
+
+unlock:
+ spin_unlock(ptl);
+
+ if (ret == -ENOENT)
+ return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
+
+ return ret;
+#else /* CONFIG_HUGETLB_PAGE */
+ return -EINVAL;
+#endif
+}
+
static void hmm_pfns_clear(struct hmm_range *range,
uint64_t *pfns,
unsigned long addr,
@@ -676,279 +894,442 @@ static void hmm_pfns_clear(struct hmm_range *range,
*pfns = range->values[HMM_PFN_NONE];
}
-static void hmm_pfns_special(struct hmm_range *range)
-{
- unsigned long addr = range->start, i = 0;
-
- for (; addr < range->end; addr += PAGE_SIZE, i++)
- range->pfns[i] = range->values[HMM_PFN_SPECIAL];
-}
-
/*
- * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses
- * @range: range being snapshotted
- * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
- * vma permission, 0 success
+ * hmm_range_register() - start tracking change to CPU page table over a range
+ * @range: range
+ * @mm: the mm struct for the range of virtual addresses
+ * @start: start virtual address (inclusive)
+ * @end: end virtual address (exclusive)
+ * @page_shift: expected page shift for the range
+ * Returns 0 on success, -EFAULT if the address space is no longer valid
*
- * This snapshots the CPU page table for a range of virtual addresses. Snapshot
- * validity is tracked by range struct. See hmm_vma_range_done() for further
- * information.
- *
- * The range struct is initialized here. It tracks the CPU page table, but only
- * if the function returns success (0), in which case the caller must then call
- * hmm_vma_range_done() to stop CPU page table update tracking on this range.
- *
- * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS
- * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED !
+ * Track updates to the CPU page table, see include/linux/hmm.h
*/
-int hmm_vma_get_pfns(struct hmm_range *range)
+int hmm_range_register(struct hmm_range *range,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end,
+ unsigned page_shift)
{
- struct vm_area_struct *vma = range->vma;
- struct hmm_vma_walk hmm_vma_walk;
- struct mm_walk mm_walk;
- struct hmm *hmm;
+ unsigned long mask = ((1UL << page_shift) - 1UL);
+
+ range->valid = false;
+ range->hmm = NULL;
- /* Sanity check, this really should not happen ! */
- if (range->start < vma->vm_start || range->start >= vma->vm_end)
+ if ((start & mask) || (end & mask))
return -EINVAL;
- if (range->end < vma->vm_start || range->end > vma->vm_end)
+ if (start >= end)
return -EINVAL;
- hmm = hmm_register(vma->vm_mm);
- if (!hmm)
- return -ENOMEM;
- /* Caller must have registered a mirror, via hmm_mirror_register() ! */
- if (!hmm->mmu_notifier.ops)
- return -EINVAL;
+ range->page_shift = page_shift;
+ range->start = start;
+ range->end = end;
- /* FIXME support hugetlb fs */
- if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
- vma_is_dax(vma)) {
- hmm_pfns_special(range);
- return -EINVAL;
- }
+ range->hmm = hmm_register(mm);
+ if (!range->hmm)
+ return -EFAULT;
- if (!(vma->vm_flags & VM_READ)) {
- /*
- * If vma do not allow read access, then assume that it does
- * not allow write access, either. Architecture that allow
- * write without read access are not supported by HMM, because
- * operations such has atomic access would not work.
- */
- hmm_pfns_clear(range, range->pfns, range->start, range->end);
- return -EPERM;
+ /* Check if hmm_mm_destroy() was called. */
+ if (range->hmm->mm == NULL || range->hmm->dead) {
+ hmm_put(range->hmm);
+ return -EFAULT;
}
/* Initialize range to track CPU page table update */
- spin_lock(&hmm->lock);
- range->valid = true;
- list_add_rcu(&range->list, &hmm->ranges);
- spin_unlock(&hmm->lock);
-
- hmm_vma_walk.fault = false;
- hmm_vma_walk.range = range;
- mm_walk.private = &hmm_vma_walk;
-
- mm_walk.vma = vma;
- mm_walk.mm = vma->vm_mm;
- mm_walk.pte_entry = NULL;
- mm_walk.test_walk = NULL;
- mm_walk.hugetlb_entry = NULL;
- mm_walk.pmd_entry = hmm_vma_walk_pmd;
- mm_walk.pte_hole = hmm_vma_walk_hole;
-
- walk_page_range(range->start, range->end, &mm_walk);
+ mutex_lock(&range->hmm->lock);
+
+ list_add_rcu(&range->list, &range->hmm->ranges);
+
+ /*
+ * If there are any concurrent notifiers we have to wait for them before
+ * the range can become valid (see hmm_range_wait_until_valid()).
+ */
+ if (!range->hmm->notifiers)
+ range->valid = true;
+ mutex_unlock(&range->hmm->lock);
+
return 0;
}
-EXPORT_SYMBOL(hmm_vma_get_pfns);
+EXPORT_SYMBOL(hmm_range_register);
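
A hedged sketch of the new registration API (not part of this patch; example_track_range() is hypothetical and assumes the caller already registered an hmm_mirror for @mm): a range is registered before it is snapshotted or faulted, and unregistered when the driver is done with it.

static int example_track_range(struct hmm_range *range, struct mm_struct *mm,
			       unsigned long start, unsigned long end)
{
	int ret;

	ret = hmm_range_register(range, mm, start, end, PAGE_SHIFT);
	if (ret)
		return ret;

	/* ... hmm_range_snapshot()/hmm_range_fault() the range here ... */

	hmm_range_unregister(range);
	return 0;
}
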
/*
- * hmm_vma_range_done() - stop tracking change to CPU page table over a range
- * @range: range being tracked
- * Returns: false if range data has been invalidated, true otherwise
+ * hmm_range_unregister() - stop tracking change to CPU page table over a range
+ * @range: range
*
* Range struct is used to track updates to the CPU page table after a call to
- * either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is done
- * using the data, or wants to lock updates to the data it got from those
- * functions, it must call the hmm_vma_range_done() function, which will then
- * stop tracking CPU page table updates.
- *
- * Note that device driver must still implement general CPU page table update
- * tracking either by using hmm_mirror (see hmm_mirror_register()) or by using
- * the mmu_notifier API directly.
- *
- * CPU page table update tracking done through hmm_range is only temporary and
- * to be used while trying to duplicate CPU page table contents for a range of
- * virtual addresses.
- *
- * There are two ways to use this :
- * again:
- * hmm_vma_get_pfns(range); or hmm_vma_fault(...);
- * trans = device_build_page_table_update_transaction(pfns);
- * device_page_table_lock();
- * if (!hmm_vma_range_done(range)) {
- * device_page_table_unlock();
- * goto again;
- * }
- * device_commit_transaction(trans);
- * device_page_table_unlock();
+ * hmm_range_register(). See include/linux/hmm.h for how to use it.
+ */
+void hmm_range_unregister(struct hmm_range *range)
+{
+ /* Sanity check this really should not happen. */
+ if (range->hmm == NULL || range->end <= range->start)
+ return;
+
+ mutex_lock(&range->hmm->lock);
+ list_del_rcu(&range->list);
+ mutex_unlock(&range->hmm->lock);
+
+ /* Drop reference taken by hmm_range_register() */
+ range->valid = false;
+ hmm_put(range->hmm);
+ range->hmm = NULL;
+}
+EXPORT_SYMBOL(hmm_range_unregister);
+
+/*
+ * hmm_range_snapshot() - snapshot CPU page table for a range
+ * @range: range
+ * Returns: the number of valid pages in range->pfns[] (from range start
+ * address) on success, otherwise -EINVAL if invalid argument, -ENOMEM out
+ * of memory, -EPERM invalid permission (for instance asking for write and
+ * range is read only), -EAGAIN if you need to retry, or -EFAULT if invalid
+ * (ie either no valid vma or it is illegal to access that range).
*
- * Or:
- * hmm_vma_get_pfns(range); or hmm_vma_fault(...);
- * device_page_table_lock();
- * hmm_vma_range_done(range);
- * device_update_page_table(range->pfns);
- * device_page_table_unlock();
+ * This snapshots the CPU page table for a range of virtual addresses. Snapshot
+ * validity is tracked by the range struct. See include/linux/hmm.h for an
+ * example of how to use it.
*/
-bool hmm_vma_range_done(struct hmm_range *range)
+long hmm_range_snapshot(struct hmm_range *range)
{
- unsigned long npages = (range->end - range->start) >> PAGE_SHIFT;
- struct hmm *hmm;
+ const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
+ unsigned long start = range->start, end;
+ struct hmm_vma_walk hmm_vma_walk;
+ struct hmm *hmm = range->hmm;
+ struct vm_area_struct *vma;
+ struct mm_walk mm_walk;
- if (range->end <= range->start) {
- BUG();
- return false;
- }
+ /* Check if hmm_mm_destroy() was called. */
+ if (hmm->mm == NULL || hmm->dead)
+ return -EFAULT;
- hmm = hmm_register(range->vma->vm_mm);
- if (!hmm) {
- memset(range->pfns, 0, sizeof(*range->pfns) * npages);
- return false;
- }
+ do {
+ /* If range is no longer valid force retry. */
+ if (!range->valid)
+ return -EAGAIN;
- spin_lock(&hmm->lock);
- list_del_rcu(&range->list);
- spin_unlock(&hmm->lock);
+ vma = find_vma(hmm->mm, start);
+ if (vma == NULL || (vma->vm_flags & device_vma))
+ return -EFAULT;
- return range->valid;
+ if (is_vm_hugetlb_page(vma)) {
+ struct hstate *h = hstate_vma(vma);
+
+ if (huge_page_shift(h) != range->page_shift &&
+ range->page_shift != PAGE_SHIFT)
+ return -EINVAL;
+ } else {
+ if (range->page_shift != PAGE_SHIFT)
+ return -EINVAL;
+ }
+
+ if (!(vma->vm_flags & VM_READ)) {
+ /*
+ * If the vma does not allow read access, then assume that it
+ * does not allow write access, either. HMM does not
+ * support architectures that allow write without read.
+ */
+ hmm_pfns_clear(range, range->pfns,
+ range->start, range->end);
+ return -EPERM;
+ }
+
+ range->vma = vma;
+ hmm_vma_walk.pgmap = NULL;
+ hmm_vma_walk.last = start;
+ hmm_vma_walk.fault = false;
+ hmm_vma_walk.range = range;
+ mm_walk.private = &hmm_vma_walk;
+ end = min(range->end, vma->vm_end);
+
+ mm_walk.vma = vma;
+ mm_walk.mm = vma->vm_mm;
+ mm_walk.pte_entry = NULL;
+ mm_walk.test_walk = NULL;
+ mm_walk.hugetlb_entry = NULL;
+ mm_walk.pud_entry = hmm_vma_walk_pud;
+ mm_walk.pmd_entry = hmm_vma_walk_pmd;
+ mm_walk.pte_hole = hmm_vma_walk_hole;
+ mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
+
+ walk_page_range(start, end, &mm_walk);
+ start = end;
+ } while (start < range->end);
+
+ return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
}
-EXPORT_SYMBOL(hmm_vma_range_done);
+EXPORT_SYMBOL(hmm_range_snapshot);
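
A hedged usage sketch (not part of this patch; example_snapshot() is hypothetical): hmm_range_snapshot() is called with the mmap_sem held and returns -EAGAIN when a concurrent invalidation raced with it, so callers simply retry.

static long example_snapshot(struct hmm_range *range, struct mm_struct *mm)
{
	long ret;

	do {
		down_read(&mm->mmap_sem);
		ret = hmm_range_snapshot(range);
		up_read(&mm->mmap_sem);
	} while (ret == -EAGAIN);

	/* On success ret is the number of valid pages in range->pfns[]. */
	return ret;
}
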
/*
- * hmm_vma_fault() - try to fault some address in a virtual address range
+ * hmm_range_fault() - try to fault some address in a virtual address range
* @range: range being faulted
* @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
- * Returns: 0 success, error otherwise (-EAGAIN means mmap_sem have been drop)
+ * Returns: the number of valid pages in range->pfns[] (from range start
+ * address) on success, otherwise one of the following error codes:
+ * -EINVAL:
+ * Invalid argument.
+ * -ENOMEM:
+ * Out of memory.
+ * -EPERM:
+ * Invalid permission (for instance asking for write and range
+ * is read only).
+ * -EAGAIN:
+ * If you need to retry and mmap_sem was dropped. This can only
+ * happen if the block argument is false.
+ * -EBUSY:
+ * If the range is being invalidated and you should wait for
+ * invalidation to finish.
+ * -EFAULT:
+ * Invalid (ie either no valid vma or it is illegal to access that
+ * range).
*
* This is similar to a regular CPU page fault except that it will not trigger
- * any memory migration if the memory being faulted is not accessible by CPUs.
+ * any memory migration if the memory being faulted is not accessible by CPUs
+ * and caller does not ask for migration.
*
* On error, for one virtual address in the range, the function will mark the
* corresponding HMM pfn entry with an error flag.
- *
- * Expected use pattern:
- * retry:
- * down_read(&mm->mmap_sem);
- * // Find vma and address device wants to fault, initialize hmm_pfn_t
- * // array accordingly
- * ret = hmm_vma_fault(range, write, block);
- * switch (ret) {
- * case -EAGAIN:
- * hmm_vma_range_done(range);
- * // You might want to rate limit or yield to play nicely, you may
- * // also commit any valid pfn in the array assuming that you are
- * // getting true from hmm_vma_range_monitor_end()
- * goto retry;
- * case 0:
- * break;
- * case -ENOMEM:
- * case -EINVAL:
- * case -EPERM:
- * default:
- * // Handle error !
- * up_read(&mm->mmap_sem)
- * return;
- * }
- * // Take device driver lock that serialize device page table update
- * driver_lock_device_page_table_update();
- * hmm_vma_range_done(range);
- * // Commit pfns we got from hmm_vma_fault()
- * driver_unlock_device_page_table_update();
- * up_read(&mm->mmap_sem)
- *
- * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURN SUCCESS (0)
- * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION !
- *
- * YOU HAVE BEEN WARNED !
*/
-int hmm_vma_fault(struct hmm_range *range, bool block)
+long hmm_range_fault(struct hmm_range *range, bool block)
{
- struct vm_area_struct *vma = range->vma;
- unsigned long start = range->start;
+ const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
+ unsigned long start = range->start, end;
struct hmm_vma_walk hmm_vma_walk;
+ struct hmm *hmm = range->hmm;
+ struct vm_area_struct *vma;
struct mm_walk mm_walk;
- struct hmm *hmm;
int ret;
- /* Sanity check, this really should not happen ! */
- if (range->start < vma->vm_start || range->start >= vma->vm_end)
- return -EINVAL;
- if (range->end < vma->vm_start || range->end > vma->vm_end)
- return -EINVAL;
+ /* Check if hmm_mm_destroy() was called. */
+ if (hmm->mm == NULL || hmm->dead)
+ return -EFAULT;
- hmm = hmm_register(vma->vm_mm);
- if (!hmm) {
- hmm_pfns_clear(range, range->pfns, range->start, range->end);
- return -ENOMEM;
- }
- /* Caller must have registered a mirror using hmm_mirror_register() */
- if (!hmm->mmu_notifier.ops)
- return -EINVAL;
+ do {
+ /* If range is no longer valid force retry. */
+ if (!range->valid) {
+ up_read(&hmm->mm->mmap_sem);
+ return -EAGAIN;
+ }
- /* FIXME support hugetlb fs */
- if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
- vma_is_dax(vma)) {
- hmm_pfns_special(range);
- return -EINVAL;
- }
+ vma = find_vma(hmm->mm, start);
+ if (vma == NULL || (vma->vm_flags & device_vma))
+ return -EFAULT;
+
+ if (is_vm_hugetlb_page(vma)) {
+ struct hstate *h = hstate_vma(vma);
+
+ if (huge_page_shift(h) != range->page_shift &&
+ range->page_shift != PAGE_SHIFT)
+ return -EINVAL;
+ } else {
+ if (range->page_shift != PAGE_SHIFT)
+ return -EINVAL;
+ }
+
+ if (!(vma->vm_flags & VM_READ)) {
+ /*
+ * If the vma does not allow read access, then assume that it
+ * does not allow write access, either. HMM does not
+ * support architectures that allow write without read.
+ */
+ hmm_pfns_clear(range, range->pfns,
+ range->start, range->end);
+ return -EPERM;
+ }
+
+ range->vma = vma;
+ hmm_vma_walk.pgmap = NULL;
+ hmm_vma_walk.last = start;
+ hmm_vma_walk.fault = true;
+ hmm_vma_walk.block = block;
+ hmm_vma_walk.range = range;
+ mm_walk.private = &hmm_vma_walk;
+ end = min(range->end, vma->vm_end);
+
+ mm_walk.vma = vma;
+ mm_walk.mm = vma->vm_mm;
+ mm_walk.pte_entry = NULL;
+ mm_walk.test_walk = NULL;
+ mm_walk.hugetlb_entry = NULL;
+ mm_walk.pud_entry = hmm_vma_walk_pud;
+ mm_walk.pmd_entry = hmm_vma_walk_pmd;
+ mm_walk.pte_hole = hmm_vma_walk_hole;
+ mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
+
+ do {
+ ret = walk_page_range(start, end, &mm_walk);
+ start = hmm_vma_walk.last;
+
+ /* Keep trying while the range is valid. */
+ } while (ret == -EBUSY && range->valid);
+
+ if (ret) {
+ unsigned long i;
+
+ i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
+ hmm_pfns_clear(range, &range->pfns[i],
+ hmm_vma_walk.last, range->end);
+ return ret;
+ }
+ start = end;
+
+ } while (start < range->end);
+
+ return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
+}
+EXPORT_SYMBOL(hmm_range_fault);
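
A hedged sketch of the fault side (not part of this patch; example_fault_range() is hypothetical): with block == true, -EAGAIN is only returned after the range was invalidated and the mmap_sem has already been dropped, so the caller simply re-takes the semaphore and retries.

static long example_fault_range(struct hmm_range *range, struct mm_struct *mm)
{
	long ret;

again:
	down_read(&mm->mmap_sem);
	ret = hmm_range_fault(range, true);
	if (ret == -EAGAIN)
		goto again;	/* mmap_sem was already dropped for us */
	if (ret <= 0) {
		up_read(&mm->mmap_sem);
		return ret ? ret : -EBUSY;
	}

	/*
	 * range->pfns[] is populated; commit it to the device page table
	 * under the driver lock while range->valid is still true.
	 */
	up_read(&mm->mmap_sem);
	return ret;
}
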
+
+/*
+ * hmm_range_dma_map() - hmm_range_fault() and dma map page all in one.
+ * @range: range being faulted
+ * @device: device to dma map the pages to
+ * @daddrs: dma address of mapped pages
+ * @block: allow blocking on fault (if true it sleeps and does not drop mmap_sem)
+ * Returns: number of pages mapped on success, -EAGAIN if mmap_sem has been
+ * dropped and you need to try again, some other error value otherwise
+ *
+ * Note same usage pattern as hmm_range_fault().
+ */
+long hmm_range_dma_map(struct hmm_range *range,
+ struct device *device,
+ dma_addr_t *daddrs,
+ bool block)
+{
+ unsigned long i, npages, mapped, page_size;
+ long ret;
+
+ ret = hmm_range_fault(range, block);
+ if (ret <= 0)
+ return ret ? ret : -EBUSY;
+
+ page_size = hmm_range_page_size(range);
+ npages = (range->end - range->start) >> range->page_shift;
+ for (i = 0, mapped = 0; i < npages; ++i) {
+ enum dma_data_direction dir = DMA_FROM_DEVICE;
+ struct page *page;
- if (!(vma->vm_flags & VM_READ)) {
/*
- * If vma do not allow read access, then assume that it does
- * not allow write access, either. Architecture that allow
- * write without read access are not supported by HMM, because
- * operations such has atomic access would not work.
+ * FIXME need to update DMA API to provide invalid DMA address
+ * value instead of a function to test dma address value. This
+ * would remove a lot of dumb code duplicated across many arches.
+ *
+ * For now setting it to 0 here is good enough as the pfns[]
+ * value is what is used to check what is valid and what isn't.
*/
- hmm_pfns_clear(range, range->pfns, range->start, range->end);
- return -EPERM;
+ daddrs[i] = 0;
+
+ page = hmm_pfn_to_page(range, range->pfns[i]);
+ if (page == NULL)
+ continue;
+
+ /* Check if range is being invalidated */
+ if (!range->valid) {
+ ret = -EBUSY;
+ goto unmap;
+ }
+
+ /* If it is read and write then map it bi-directionally. */
+ if (range->pfns[i] & range->values[HMM_PFN_WRITE])
+ dir = DMA_BIDIRECTIONAL;
+
+ daddrs[i] = dma_map_page(device, page, 0, page_size, dir);
+ if (dma_mapping_error(device, daddrs[i])) {
+ ret = -EFAULT;
+ goto unmap;
+ }
+
+ mapped++;
}
- /* Initialize range to track CPU page table update */
- spin_lock(&hmm->lock);
- range->valid = true;
- list_add_rcu(&range->list, &hmm->ranges);
- spin_unlock(&hmm->lock);
-
- hmm_vma_walk.fault = true;
- hmm_vma_walk.block = block;
- hmm_vma_walk.range = range;
- mm_walk.private = &hmm_vma_walk;
- hmm_vma_walk.last = range->start;
-
- mm_walk.vma = vma;
- mm_walk.mm = vma->vm_mm;
- mm_walk.pte_entry = NULL;
- mm_walk.test_walk = NULL;
- mm_walk.hugetlb_entry = NULL;
- mm_walk.pmd_entry = hmm_vma_walk_pmd;
- mm_walk.pte_hole = hmm_vma_walk_hole;
+ return mapped;
- do {
- ret = walk_page_range(start, range->end, &mm_walk);
- start = hmm_vma_walk.last;
- } while (ret == -EAGAIN);
+unmap:
+ for (npages = i, i = 0; (i < npages) && mapped; ++i) {
+ enum dma_data_direction dir = DMA_FROM_DEVICE;
+ struct page *page;
+
+ page = hmm_pfn_to_page(range, range->pfns[i]);
+ if (page == NULL)
+ continue;
+
+ if (dma_mapping_error(device, daddrs[i]))
+ continue;
- if (ret) {
- unsigned long i;
+ /* If it is read and write then map it bi-directionally. */
+ if (range->pfns[i] & range->values[HMM_PFN_WRITE])
+ dir = DMA_BIDIRECTIONAL;
- i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
- hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last,
- range->end);
- hmm_vma_range_done(range);
+ dma_unmap_page(device, daddrs[i], page_size, dir);
+ mapped--;
}
+
return ret;
}
-EXPORT_SYMBOL(hmm_vma_fault);
+EXPORT_SYMBOL(hmm_range_dma_map);
+
+/*
+ * hmm_range_dma_unmap() - unmap a range that was mapped with hmm_range_dma_map()
+ * @range: range being unmapped
+ * @vma: the vma the range applies to (optional)
+ * @device: device against which dma map was done
+ * @daddrs: dma address of mapped pages
+ * @dirty: dirty pages if they had the write flag set
+ * Returns: number of pages unmapped on success, -EINVAL otherwise
+ *
+ * Note that the caller MUST abide by the mmu notifier, or use HMM mirror and
+ * abide by the sync_cpu_device_pagetables() callback, so that it is safe here
+ * to call set_page_dirty(). The caller must also take appropriate locks to
+ * prevent concurrent mmu notifier or sync_cpu_device_pagetables() callbacks
+ * from making progress.
+ */
+long hmm_range_dma_unmap(struct hmm_range *range,
+ struct vm_area_struct *vma,
+ struct device *device,
+ dma_addr_t *daddrs,
+ bool dirty)
+{
+ unsigned long i, npages, page_size;
+ long cpages = 0;
+
+ /* Sanity check. */
+ if (range->end <= range->start)
+ return -EINVAL;
+ if (!daddrs)
+ return -EINVAL;
+ if (!range->pfns)
+ return -EINVAL;
+
+ page_size = hmm_range_page_size(range);
+ npages = (range->end - range->start) >> range->page_shift;
+ for (i = 0; i < npages; ++i) {
+ enum dma_data_direction dir = DMA_FROM_DEVICE;
+ struct page *page;
+
+ page = hmm_pfn_to_page(range, range->pfns[i]);
+ if (page == NULL)
+ continue;
+
+ /* If it is read and write then map bi-directional. */
+ if (range->pfns[i] & range->values[HMM_PFN_WRITE]) {
+ dir = DMA_BIDIRECTIONAL;
+
+ /*
+ * See comments in function description on why it is
+ * safe here to call set_page_dirty()
+ */
+ if (dirty)
+ set_page_dirty(page);
+ }
+
+ /* Unmap and clear pfns/dma address */
+ dma_unmap_page(device, daddrs[i], page_size, dir);
+ range->pfns[i] = range->values[HMM_PFN_NONE];
+ /* FIXME see comments in hmm_range_dma_map() */
+ daddrs[i] = 0;
+ cpages++;
+ }
+
+ return cpages;
+}
+EXPORT_SYMBOL(hmm_range_dma_unmap);
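A hypothetical pairing of the two helpers above (again, not part of this patch): the daddrs array has one slot per page of the range and must survive until the matching unmap. hmm_range_dma_unmap() never touches its vma argument in the code added here, so NULL is passed; sizing the array with kvmalloc_array() is just an assumption about how a driver might do it.

static long my_driver_dma_map_range(struct hmm_range *range,
                                    struct device *dev, bool block)
{
        unsigned long npages = (range->end - range->start) >> range->page_shift;
        dma_addr_t *daddrs;
        long ret;

        daddrs = kvmalloc_array(npages, sizeof(*daddrs), GFP_KERNEL);
        if (!daddrs)
                return -ENOMEM;

        ret = hmm_range_dma_map(range, dev, daddrs, block);
        if (ret < 0)
                goto out_free;

        /* ... program the device with daddrs[0..npages-1] here ... */

        /* Once the device mapping is torn down, dirty writable pages. */
        ret = hmm_range_dma_unmap(range, NULL, dev, daddrs, true);
out_free:
        kvfree(daddrs);
        return ret;
}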
#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
@@ -990,7 +1371,7 @@ static void hmm_devmem_ref_kill(struct percpu_ref *ref)
percpu_ref_kill(ref);
}
-static int hmm_devmem_fault(struct vm_area_struct *vma,
+static vm_fault_t hmm_devmem_fault(struct vm_area_struct *vma,
unsigned long addr,
const struct page *page,
unsigned int flags,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index faf357eaf0ce..d4847026d4b1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -33,6 +33,7 @@
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
#include <linux/oom.h>
+#include <linux/numa.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
@@ -616,6 +617,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
mm_inc_nr_ptes(vma->vm_mm);
spin_unlock(vmf->ptl);
count_vm_event(THP_FAULT_ALLOC);
+ count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
}
return 0;
@@ -1337,6 +1339,7 @@ alloc:
}
count_vm_event(THP_FAULT_ALLOC);
+ count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
if (!page)
clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
@@ -1475,7 +1478,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
struct anon_vma *anon_vma = NULL;
struct page *page;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
- int page_nid = -1, this_nid = numa_node_id();
+ int page_nid = NUMA_NO_NODE, this_nid = numa_node_id();
int target_nid, last_cpupid = -1;
bool page_locked;
bool migrated = false;
@@ -1520,7 +1523,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
*/
page_locked = trylock_page(page);
target_nid = mpol_misplaced(page, vma, haddr);
- if (target_nid == -1) {
+ if (target_nid == NUMA_NO_NODE) {
/* If the page was locked, there are no parallel migrations */
if (page_locked)
goto clear_pmdnuma;
@@ -1528,7 +1531,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
/* Migration could have started since the pmd_trans_migrating check */
if (!page_locked) {
- page_nid = -1;
+ page_nid = NUMA_NO_NODE;
if (!get_page_unless_zero(page))
goto out_unlock;
spin_unlock(vmf->ptl);
@@ -1549,14 +1552,14 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
unlock_page(page);
put_page(page);
- page_nid = -1;
+ page_nid = NUMA_NO_NODE;
goto out_unlock;
}
/* Bail if we fail to protect against THP splits for any reason */
if (unlikely(!anon_vma)) {
put_page(page);
- page_nid = -1;
+ page_nid = NUMA_NO_NODE;
goto clear_pmdnuma;
}
@@ -1618,7 +1621,7 @@ out:
if (anon_vma)
page_unlock_anon_vma_read(anon_vma);
- if (page_nid != -1)
+ if (page_nid != NUMA_NO_NODE)
task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
flags);
@@ -2886,12 +2889,8 @@ DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
static int __init split_huge_pages_debugfs(void)
{
- void *ret;
-
- ret = debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
- &split_huge_pages_fops);
- if (!ret)
- pr_warn("Failed to create split_huge_pages in debugfs");
+ debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
+ &split_huge_pages_fops);
return 0;
}
late_initcall(split_huge_pages_debugfs);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index afef61656c1e..1c5219193b9e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -25,6 +25,7 @@
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/jhash.h>
+#include <linux/numa.h>
#include <asm/page.h>
#include <asm/pgtable.h>
@@ -887,7 +888,7 @@ static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask,
struct zonelist *zonelist;
struct zone *zone;
struct zoneref *z;
- int node = -1;
+ int node = NUMA_NO_NODE;
zonelist = node_zonelist(nid, gfp_mask);
@@ -919,7 +920,7 @@ retry_cpuset:
/* Movability of hugepages depends on migration support. */
static inline gfp_t htlb_alloc_mask(struct hstate *h)
{
- if (hugepage_migration_supported(h))
+ if (hugepage_movable_supported(h))
return GFP_HIGHUSER_MOVABLE;
else
return GFP_HIGHUSER;
@@ -1035,7 +1036,6 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
((node = hstate_next_node_to_free(hs, mask)) || 1); \
nr_nodes--)
-#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
static void destroy_compound_gigantic_page(struct page *page,
unsigned int order)
{
@@ -1058,6 +1058,7 @@ static void free_gigantic_page(struct page *page, unsigned int order)
free_contig_range(page_to_pfn(page), 1 << order);
}
+#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE_RUNTIME_ALLOCATION
static int __alloc_gigantic_page(unsigned long start_pfn,
unsigned long nr_pages, gfp_t gfp_mask)
{
@@ -1143,22 +1144,19 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
static void prep_compound_gigantic_page(struct page *page, unsigned int order);
-#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
-static inline bool gigantic_page_supported(void) { return false; }
+#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE_RUNTIME_ALLOCATION */
+static inline bool gigantic_page_runtime_allocation_supported(void)
+{
+ return false;
+}
static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nodemask) { return NULL; }
-static inline void free_gigantic_page(struct page *page, unsigned int order) { }
-static inline void destroy_compound_gigantic_page(struct page *page,
- unsigned int order) { }
#endif
static void update_and_free_page(struct hstate *h, struct page *page)
{
int i;
- if (hstate_is_gigantic(h) && !gigantic_page_supported())
- return;
-
h->nr_huge_pages--;
h->nr_huge_pages_node[page_to_nid(page)]--;
for (i = 0; i < pages_per_huge_page(h); i++) {
@@ -1586,8 +1584,8 @@ out_unlock:
return page;
}
-static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
- int nid, nodemask_t *nmask)
+struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nmask)
{
struct page *page;
@@ -2276,13 +2274,20 @@ found:
}
#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
-static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
+static int set_max_huge_pages(struct hstate *h, unsigned long count,
nodemask_t *nodes_allowed)
{
unsigned long min_count, ret;
- if (hstate_is_gigantic(h) && !gigantic_page_supported())
- return h->max_huge_pages;
+ if (hstate_is_gigantic(h) &&
+ !gigantic_page_runtime_allocation_supported()) {
+ spin_lock(&hugetlb_lock);
+ if (count > persistent_huge_pages(h)) {
+ spin_unlock(&hugetlb_lock);
+ return -EINVAL;
+ }
+ goto decrease_pool;
+ }
/*
* Increase the pool size
@@ -2322,6 +2327,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
goto out;
}
+decrease_pool:
/*
* Decrease the pool size
* First return free pages to the buddy allocator (being careful
@@ -2350,9 +2356,10 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
break;
}
out:
- ret = persistent_huge_pages(h);
+ h->max_huge_pages = persistent_huge_pages(h);
spin_unlock(&hugetlb_lock);
- return ret;
+
+ return 0;
}
#define HSTATE_ATTR_RO(_name) \
@@ -2404,11 +2411,6 @@ static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
int err;
NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
- if (hstate_is_gigantic(h) && !gigantic_page_supported()) {
- err = -EINVAL;
- goto out;
- }
-
if (nid == NUMA_NO_NODE) {
/*
* global hstate attribute
@@ -2428,7 +2430,9 @@ static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
} else
nodes_allowed = &node_states[N_MEMORY];
- h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
+ err = set_max_huge_pages(h, count, nodes_allowed);
+ if (err)
+ goto out;
if (nodes_allowed != &node_states[N_MEMORY])
NODEMASK_FREE(nodes_allowed);
@@ -3624,7 +3628,6 @@ retry_avoidcopy:
copy_user_huge_page(new_page, old_page, address, vma,
pages_per_huge_page(h));
__SetPageUptodate(new_page);
- set_page_huge_active(new_page);
mmu_notifier_range_init(&range, mm, haddr, haddr + huge_page_size(h));
mmu_notifier_invalidate_range_start(&range);
@@ -3645,6 +3648,7 @@ retry_avoidcopy:
make_huge_pte(vma, new_page, 1));
page_remove_rmap(old_page, true);
hugepage_add_new_anon_rmap(new_page, vma, haddr);
+ set_page_huge_active(new_page);
/* Make the old page be freed below */
new_page = old_page;
}
@@ -3790,7 +3794,6 @@ retry:
}
clear_huge_page(page, address, pages_per_huge_page(h));
__SetPageUptodate(page);
- set_page_huge_active(page);
if (vma->vm_flags & VM_MAYSHARE) {
int err = huge_add_to_page_cache(page, mapping, idx);
@@ -3861,6 +3864,10 @@ retry:
}
spin_unlock(ptl);
+
+ /* May already be set if not newly allocated page */
+ set_page_huge_active(page);
+
unlock_page(page);
out:
return ret;
@@ -4095,7 +4102,6 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
* the set_pte_at() write.
*/
__SetPageUptodate(page);
- set_page_huge_active(page);
mapping = dst_vma->vm_file->f_mapping;
idx = vma_hugecache_offset(h, dst_vma, dst_addr);
@@ -4163,6 +4169,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
update_mmu_cache(dst_vma, dst_addr, dst_pte);
spin_unlock(ptl);
+ set_page_huge_active(page);
if (vm_shared)
unlock_page(page);
ret = 0;
@@ -4388,10 +4395,12 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
continue;
}
if (!huge_pte_none(pte)) {
- pte = huge_ptep_get_and_clear(mm, address, ptep);
- pte = pte_mkhuge(huge_pte_modify(pte, newprot));
+ pte_t old_pte;
+
+ old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
+ pte = pte_mkhuge(huge_pte_modify(old_pte, newprot));
pte = arch_make_huge_pte(pte, vma, NULL, 0);
- set_huge_pte_at(mm, address, ptep, pte);
+ huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
pages++;
}
spin_unlock(ptl);
diff --git a/mm/internal.h b/mm/internal.h
index f4a7bb02decf..9eeaf2b95166 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -163,6 +163,7 @@ static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
extern int __isolate_free_page(struct page *page, unsigned int order);
extern void memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order);
+extern void __free_pages_core(struct page *page, unsigned int order);
extern void prep_compound_page(struct page *page, unsigned int order);
extern void post_alloc_hook(struct page *page, unsigned int order,
gfp_t gfp_flags);
@@ -183,14 +184,16 @@ extern int user_min_free_kbytes;
struct compact_control {
struct list_head freepages; /* List of free pages to migrate to */
struct list_head migratepages; /* List of pages being migrated */
+ unsigned int nr_freepages; /* Number of isolated free pages */
+ unsigned int nr_migratepages; /* Number of pages to migrate */
+ unsigned long free_pfn; /* isolate_freepages search base */
+ unsigned long migrate_pfn; /* isolate_migratepages search base */
+ unsigned long fast_start_pfn; /* a pfn to start linear scan from */
struct zone *zone;
- unsigned long nr_freepages; /* Number of isolated free pages */
- unsigned long nr_migratepages; /* Number of pages to migrate */
unsigned long total_migrate_scanned;
unsigned long total_free_scanned;
- unsigned long free_pfn; /* isolate_freepages search base */
- unsigned long migrate_pfn; /* isolate_migratepages search base */
- unsigned long last_migrated_pfn;/* Not yet flushed page being freed */
+ unsigned short fast_search_fail;/* failures to use free list searches */
+ short search_order; /* order to start a fast search at */
const gfp_t gfp_mask; /* gfp mask of a direct compactor */
int order; /* order a direct compactor needs */
int migratetype; /* migratetype of direct compactor */
@@ -203,7 +206,16 @@ struct compact_control {
bool direct_compaction; /* False from kcompactd or /proc/... */
bool whole_zone; /* Whole zone should/has been scanned */
bool contended; /* Signal lock or sched contention */
- bool finishing_block; /* Finishing current pageblock */
+ bool rescan; /* Rescanning the same pageblock */
+};
+
+/*
+ * Used in direct compaction when a page should be taken from the freelists
+ * immediately when one is created during the free path.
+ */
+struct capture_control {
+ struct compact_control *cc;
+ struct page *page;
};
unsigned long
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 73c9cbfdedf4..80bbe62b16cd 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -14,6 +14,8 @@
*
*/
+#define __KASAN_INTERNAL
+
#include <linux/export.h>
#include <linux/interrupt.h>
#include <linux/init.h>
@@ -361,10 +363,15 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object)
* get different tags.
*/
static u8 assign_tag(struct kmem_cache *cache, const void *object,
- bool init, bool krealloc)
+ bool init, bool keep_tag)
{
- /* Reuse the same tag for krealloc'ed objects. */
- if (krealloc)
+ /*
+ * 1. When an object is kmalloc()'ed, two hooks are called:
+ * kasan_slab_alloc() and kasan_kmalloc(). We assign the
+ * tag only in the first one.
+ * 2. We reuse the same tag for krealloc'ed objects.
+ */
+ if (keep_tag)
return get_tag(object);
/*
@@ -405,12 +412,6 @@ void * __must_check kasan_init_slab_obj(struct kmem_cache *cache,
return (void *)object;
}
-void * __must_check kasan_slab_alloc(struct kmem_cache *cache, void *object,
- gfp_t flags)
-{
- return kasan_kmalloc(cache, object, cache->object_size, flags);
-}
-
static inline bool shadow_invalid(u8 tag, s8 shadow_byte)
{
if (IS_ENABLED(CONFIG_KASAN_GENERIC))
@@ -467,7 +468,7 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip)
}
static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object,
- size_t size, gfp_t flags, bool krealloc)
+ size_t size, gfp_t flags, bool keep_tag)
{
unsigned long redzone_start;
unsigned long redzone_end;
@@ -485,7 +486,7 @@ static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object,
KASAN_SHADOW_SCALE_SIZE);
if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
- tag = assign_tag(cache, object, false, krealloc);
+ tag = assign_tag(cache, object, false, keep_tag);
/* Tag is ignored in set_tag without CONFIG_KASAN_SW_TAGS */
kasan_unpoison_shadow(set_tag(object, tag), size);
@@ -498,10 +499,16 @@ static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object,
return set_tag(object, tag);
}
+void * __must_check kasan_slab_alloc(struct kmem_cache *cache, void *object,
+ gfp_t flags)
+{
+ return __kasan_kmalloc(cache, object, cache->object_size, flags, false);
+}
+
void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
size_t size, gfp_t flags)
{
- return __kasan_kmalloc(cache, object, size, flags, false);
+ return __kasan_kmalloc(cache, object, size, flags, true);
}
EXPORT_SYMBOL(kasan_kmalloc);
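In call-flow terms, and only as far as these hunks show: a slab allocation now gets its tag from kasan_slab_alloc() (keep_tag=false assigns a fresh tag), and when kasan_kmalloc() runs as the second hook for a kmalloc()'ed object it passes keep_tag=true so the already-chosen tag is preserved rather than reassigned; krealloc() continues to reuse the tag through the same keep_tag path.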
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index ccb6207276e3..504c79363a34 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -275,25 +275,6 @@ EXPORT_SYMBOL(__asan_storeN_noabort);
void __asan_handle_no_return(void) {}
EXPORT_SYMBOL(__asan_handle_no_return);
-/* Emitted by compiler to poison large objects when they go out of scope. */
-void __asan_poison_stack_memory(const void *addr, size_t size)
-{
- /*
- * Addr is KASAN_SHADOW_SCALE_SIZE-aligned and the object is surrounded
- * by redzones, so we simply round up size to simplify logic.
- */
- kasan_poison_shadow(addr, round_up(size, KASAN_SHADOW_SCALE_SIZE),
- KASAN_USE_AFTER_SCOPE);
-}
-EXPORT_SYMBOL(__asan_poison_stack_memory);
-
-/* Emitted by compiler to unpoison large objects when they go into scope. */
-void __asan_unpoison_stack_memory(const void *addr, size_t size)
-{
- kasan_unpoison_shadow(addr, size);
-}
-EXPORT_SYMBOL(__asan_unpoison_stack_memory);
-
/* Emitted by compiler to poison alloca()ed objects. */
void __asan_alloca_poison(unsigned long addr, size_t size)
{
diff --git a/mm/kasan/generic_report.c b/mm/kasan/generic_report.c
index 5e12035888f2..36c645939bc9 100644
--- a/mm/kasan/generic_report.c
+++ b/mm/kasan/generic_report.c
@@ -82,9 +82,6 @@ static const char *get_shadow_bug_type(struct kasan_access_info *info)
case KASAN_KMALLOC_FREE:
bug_type = "use-after-free";
break;
- case KASAN_USE_AFTER_SCOPE:
- bug_type = "use-after-scope";
- break;
case KASAN_ALLOCA_LEFT:
case KASAN_ALLOCA_RIGHT:
bug_type = "alloca-out-of-bounds";
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index ea51b2d898ec..3e0c11f7d7a1 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -34,7 +34,6 @@
#define KASAN_STACK_MID 0xF2
#define KASAN_STACK_RIGHT 0xF3
#define KASAN_STACK_PARTIAL 0xF4
-#define KASAN_USE_AFTER_SCOPE 0xF8
/*
* alloca redzone shadow values
@@ -187,8 +186,6 @@ void __asan_unregister_globals(struct kasan_global *globals, size_t size);
void __asan_loadN(unsigned long addr, size_t size);
void __asan_storeN(unsigned long addr, size_t size);
void __asan_handle_no_return(void);
-void __asan_poison_stack_memory(const void *addr, size_t size);
-void __asan_unpoison_stack_memory(const void *addr, size_t size);
void __asan_alloca_poison(unsigned long addr, size_t size);
void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 4f017339ddb2..449044378782 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1074,6 +1074,7 @@ static void collapse_huge_page(struct mm_struct *mm,
BUG_ON(!pmd_none(*pmd));
page_add_new_anon_rmap(new_page, vma, address, true);
mem_cgroup_commit_charge(new_page, memcg, false, true);
+ count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1);
lru_cache_add_active_or_unevictable(new_page, vma);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
@@ -1502,6 +1503,7 @@ xa_unlocked:
page_ref_add(new_page, HPAGE_PMD_NR - 1);
set_page_dirty(new_page);
mem_cgroup_commit_charge(new_page, memcg, false, true);
+ count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1);
lru_cache_add_anon(new_page);
/*
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index f9d9dc250428..707fa5579f66 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -574,6 +574,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
unsigned long flags;
struct kmemleak_object *object, *parent;
struct rb_node **link, *rb_parent;
+ unsigned long untagged_ptr;
object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
if (!object) {
@@ -619,8 +620,9 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
write_lock_irqsave(&kmemleak_lock, flags);
- min_addr = min(min_addr, ptr);
- max_addr = max(max_addr, ptr + size);
+ untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);
+ min_addr = min(min_addr, untagged_ptr);
+ max_addr = max(max_addr, untagged_ptr + size);
link = &object_tree_root.rb_node;
rb_parent = NULL;
while (*link) {
@@ -1333,6 +1335,7 @@ static void scan_block(void *_start, void *_end,
unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER);
unsigned long *end = _end - (BYTES_PER_POINTER - 1);
unsigned long flags;
+ unsigned long untagged_ptr;
read_lock_irqsave(&kmemleak_lock, flags);
for (ptr = start; ptr < end; ptr++) {
@@ -1347,7 +1350,8 @@ static void scan_block(void *_start, void *_end,
pointer = *ptr;
kasan_enable_current();
- if (pointer < min_addr || pointer >= max_addr)
+ untagged_ptr = (unsigned long)kasan_reset_tag((void *)pointer);
+ if (untagged_ptr < min_addr || untagged_ptr >= max_addr)
continue;
/*
diff --git a/mm/ksm.c b/mm/ksm.c
index 6c48ad13b4c9..fa78626da9f0 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -598,7 +598,7 @@ static struct stable_node *alloc_stable_node_chain(struct stable_node *dup,
chain->chain_prune_time = jiffies;
chain->rmap_hlist_len = STABLE_NODE_CHAIN;
#if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA)
- chain->nid = -1; /* debug */
+ chain->nid = NUMA_NO_NODE; /* debug */
#endif
ksm_stable_node_chains++;
@@ -667,6 +667,12 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
free_stable_node(stable_node);
}
+enum get_ksm_page_flags {
+ GET_KSM_PAGE_NOLOCK,
+ GET_KSM_PAGE_LOCK,
+ GET_KSM_PAGE_TRYLOCK
+};
+
/*
* get_ksm_page: checks if the page indicated by the stable node
* is still its ksm page, despite having held no reference to it.
@@ -686,7 +692,8 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
* a page to put something that might look like our key in page->mapping.
* is on its way to being freed; but it is an anomaly to bear in mind.
*/
-static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)
+static struct page *get_ksm_page(struct stable_node *stable_node,
+ enum get_ksm_page_flags flags)
{
struct page *page;
void *expected_mapping;
@@ -706,8 +713,9 @@ again:
* case this node is no longer referenced, and should be freed;
* however, it might mean that the page is under page_ref_freeze().
* The __remove_mapping() case is easy, again the node is now stale;
- * but if page is swapcache in migrate_page_move_mapping(), it might
- * still be our page, in which case it's essential to keep the node.
+ * the same is true in the reuse_ksm_page() case; but if page is swapcache
+ * in migrate_page_move_mapping(), it might still be our page,
+ * in which case it's essential to keep the node.
*/
while (!get_page_unless_zero(page)) {
/*
@@ -728,8 +736,15 @@ again:
goto stale;
}
- if (lock_it) {
+ if (flags == GET_KSM_PAGE_TRYLOCK) {
+ if (!trylock_page(page)) {
+ put_page(page);
+ return ERR_PTR(-EBUSY);
+ }
+ } else if (flags == GET_KSM_PAGE_LOCK)
lock_page(page);
+
+ if (flags != GET_KSM_PAGE_NOLOCK) {
if (READ_ONCE(page->mapping) != expected_mapping) {
unlock_page(page);
put_page(page);
@@ -763,7 +778,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
struct page *page;
stable_node = rmap_item->head;
- page = get_ksm_page(stable_node, true);
+ page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
if (!page)
goto out;
@@ -863,7 +878,7 @@ static int remove_stable_node(struct stable_node *stable_node)
struct page *page;
int err;
- page = get_ksm_page(stable_node, true);
+ page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
if (!page) {
/*
* get_ksm_page did remove_node_from_stable_tree itself.
@@ -1385,7 +1400,7 @@ static struct page *stable_node_dup(struct stable_node **_stable_node_dup,
* stable_node parameter itself will be freed from
* under us if it returns NULL.
*/
- _tree_page = get_ksm_page(dup, false);
+ _tree_page = get_ksm_page(dup, GET_KSM_PAGE_NOLOCK);
if (!_tree_page)
continue;
nr += 1;
@@ -1508,7 +1523,7 @@ static struct page *__stable_node_chain(struct stable_node **_stable_node_dup,
if (!is_stable_node_chain(stable_node)) {
if (is_page_sharing_candidate(stable_node)) {
*_stable_node_dup = stable_node;
- return get_ksm_page(stable_node, false);
+ return get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK);
}
/*
* _stable_node_dup set to NULL means the stable_node
@@ -1613,7 +1628,8 @@ again:
* wrprotected at all times. Any will work
* fine to continue the walk.
*/
- tree_page = get_ksm_page(stable_node_any, false);
+ tree_page = get_ksm_page(stable_node_any,
+ GET_KSM_PAGE_NOLOCK);
}
VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
if (!tree_page) {
@@ -1673,7 +1689,12 @@ again:
* It would be more elegant to return stable_node
* than kpage, but that involves more changes.
*/
- tree_page = get_ksm_page(stable_node_dup, true);
+ tree_page = get_ksm_page(stable_node_dup,
+ GET_KSM_PAGE_TRYLOCK);
+
+ if (PTR_ERR(tree_page) == -EBUSY)
+ return ERR_PTR(-EBUSY);
+
if (unlikely(!tree_page))
/*
* The tree may have been rebalanced,
@@ -1842,7 +1863,8 @@ again:
* wrprotected at all times. Any will work
* fine to continue the walk.
*/
- tree_page = get_ksm_page(stable_node_any, false);
+ tree_page = get_ksm_page(stable_node_any,
+ GET_KSM_PAGE_NOLOCK);
}
VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
if (!tree_page) {
@@ -2060,6 +2082,10 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
/* We first start with searching the page inside the stable tree */
kpage = stable_tree_search(page);
+
+ if (PTR_ERR(kpage) == -EBUSY)
+ return;
+
if (kpage == page && rmap_item->head == stable_node) {
put_page(kpage);
return;
@@ -2242,7 +2268,8 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
list_for_each_entry_safe(stable_node, next,
&migrate_nodes, list) {
- page = get_ksm_page(stable_node, false);
+ page = get_ksm_page(stable_node,
+ GET_KSM_PAGE_NOLOCK);
if (page)
put_page(page);
cond_resched();
@@ -2642,6 +2669,31 @@ again:
goto again;
}
+bool reuse_ksm_page(struct page *page,
+ struct vm_area_struct *vma,
+ unsigned long address)
+{
+#ifdef CONFIG_DEBUG_VM
+ if (WARN_ON(is_zero_pfn(page_to_pfn(page))) ||
+ WARN_ON(!page_mapped(page)) ||
+ WARN_ON(!PageLocked(page))) {
+ dump_page(page, "reuse_ksm_page");
+ return false;
+ }
+#endif
+
+ if (PageSwapCache(page) || !page_stable_node(page))
+ return false;
+ /* Prohibit parallel get_ksm_page() */
+ if (!page_ref_freeze(page, 1))
+ return false;
+
+ page_move_anon_rmap(page, vma);
+ page->index = linear_page_index(vma, address);
+ page_ref_unfreeze(page, 1);
+
+ return true;
+}
#ifdef CONFIG_MIGRATION
void ksm_migrate_page(struct page *newpage, struct page *oldpage)
{
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 5b30625fd365..0730bf8ff39f 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -601,7 +601,6 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware,
struct lock_class_key *key, struct shrinker *shrinker)
{
int i;
- size_t size = sizeof(*lru->node) * nr_node_ids;
int err = -ENOMEM;
#ifdef CONFIG_MEMCG_KMEM
@@ -612,7 +611,7 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware,
#endif
memcg_get_cache_ids();
- lru->node = kzalloc(size, GFP_KERNEL);
+ lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL);
if (!lru->node)
goto out;
diff --git a/mm/memblock.c b/mm/memblock.c
index ea31045ba704..224816f80972 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1254,6 +1254,70 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
return 0;
}
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+/**
+ * __next_mem_pfn_range_in_zone - iterator for for_each_*_range_in_zone()
+ *
+ * @idx: pointer to u64 loop variable
+ * @zone: zone in which all of the memory blocks reside
+ * @out_spfn: ptr to ulong for start pfn of the range, can be %NULL
+ * @out_epfn: ptr to ulong for end pfn of the range, can be %NULL
+ *
+ * This function is meant to be a zone/pfn specific wrapper for the
+ * for_each_mem_range type iterators. It is used by the deferred memory
+ * init routines, which previously duplicated much of this logic; rather
+ * than keeping that logic in multiple locations, it is centralized here
+ * in one new iterator that does everything those callers need.
+ */
+void __init_memblock
+__next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
+ unsigned long *out_spfn, unsigned long *out_epfn)
+{
+ int zone_nid = zone_to_nid(zone);
+ phys_addr_t spa, epa;
+ int nid;
+
+ __next_mem_range(idx, zone_nid, MEMBLOCK_NONE,
+ &memblock.memory, &memblock.reserved,
+ &spa, &epa, &nid);
+
+ while (*idx != U64_MAX) {
+ unsigned long epfn = PFN_DOWN(epa);
+ unsigned long spfn = PFN_UP(spa);
+
+ /*
+ * Verify the end is at least past the start of the zone and
+ * that we have at least one PFN to initialize.
+ */
+ if (zone->zone_start_pfn < epfn && spfn < epfn) {
+ /* if we went too far just stop searching */
+ if (zone_end_pfn(zone) <= spfn) {
+ *idx = U64_MAX;
+ break;
+ }
+
+ if (out_spfn)
+ *out_spfn = max(zone->zone_start_pfn, spfn);
+ if (out_epfn)
+ *out_epfn = min(zone_end_pfn(zone), epfn);
+
+ return;
+ }
+
+ __next_mem_range(idx, zone_nid, MEMBLOCK_NONE,
+ &memblock.memory, &memblock.reserved,
+ &spa, &epa, &nid);
+ }
+
+ /* signal end of iteration */
+ if (out_spfn)
+ *out_spfn = ULONG_MAX;
+ if (out_epfn)
+ *out_epfn = 0;
+}
+
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
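A hypothetical open-coded consumer of the iterator above (not part of this patch; real callers are expected to wrap it in the for_each_*_range_in_zone() style macro the kernel-doc mentions):

static unsigned long __init count_deferred_pfns(struct zone *zone)
{
        unsigned long spfn, epfn, nr = 0;
        u64 i = 0;

        /* Each call yields the next free (memory minus reserved) pfn range,
         * clipped to @zone; i == U64_MAX signals the end of iteration. */
        __next_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn);
        while (i != U64_MAX) {
                nr += epfn - spfn;
                __next_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn);
        }
        return nr;
}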
static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
phys_addr_t align, phys_addr_t start,
@@ -2005,8 +2069,7 @@ DEFINE_SHOW_ATTRIBUTE(memblock_debug);
static int __init memblock_init_debugfs(void)
{
struct dentry *root = debugfs_create_dir("memblock", NULL);
- if (!root)
- return -ENXIO;
+
debugfs_create_file("memory", 0444, root,
&memblock.memory, &memblock_debug_fops);
debugfs_create_file("reserved", 0444, root,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index af7f18b32389..6464de2648b2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -39,6 +39,7 @@
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
+#include <linux/vm_event_item.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
@@ -248,6 +249,12 @@ enum res_type {
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
+static inline bool should_force_charge(void)
+{
+ return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
+ (current->flags & PF_EXITING);
+}
+
/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
@@ -1377,6 +1384,11 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
return max;
}
+unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
+{
+ return page_counter_read(&memcg->memory);
+}
+
static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
int order)
{
@@ -1389,8 +1401,13 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
};
bool ret;
- mutex_lock(&oom_lock);
- ret = out_of_memory(&oc);
+ if (mutex_lock_killable(&oom_lock))
+ return true;
+ /*
+ * A few threads which were not waiting at mutex_lock_killable() can
+ * fail to bail out. Therefore, check again after holding oom_lock.
+ */
+ ret = should_force_charge() || out_of_memory(&oc);
mutex_unlock(&oom_lock);
return ret;
}
@@ -2156,14 +2173,17 @@ static void high_work_func(struct work_struct *work)
void mem_cgroup_handle_over_high(void)
{
unsigned int nr_pages = current->memcg_nr_pages_over_high;
- struct mem_cgroup *memcg;
+ struct mem_cgroup *memcg = current->memcg_high_reclaim;
if (likely(!nr_pages))
return;
- memcg = get_mem_cgroup_from_mm(current->mm);
+ if (!memcg)
+ memcg = get_mem_cgroup_from_mm(current->mm);
+
reclaim_high(memcg, nr_pages, GFP_KERNEL);
css_put(&memcg->css);
+ current->memcg_high_reclaim = NULL;
current->memcg_nr_pages_over_high = 0;
}
@@ -2209,9 +2229,7 @@ retry:
* bypass the last charges so that they can exit quickly and
* free their memory.
*/
- if (unlikely(tsk_is_oom_victim(current) ||
- fatal_signal_pending(current) ||
- current->flags & PF_EXITING))
+ if (unlikely(should_force_charge()))
goto force;
/*
@@ -2317,10 +2335,10 @@ done_restock:
* If the hierarchy is above the normal consumption range, schedule
* reclaim on returning to userland. We can perform reclaim here
* if __GFP_RECLAIM but let's always punt for simplicity and so that
- * GFP_KERNEL can consistently be used during reclaim. @memcg is
- * not recorded as it most likely matches current's and won't
- * change in the meantime. As high limit is checked again before
- * reclaim, the cost of mismatch is negligible.
+ * GFP_KERNEL can consistently be used during reclaim. Record the memcg
+ * for the return-to-userland high reclaim. If the memcg is already
+ * recorded and the recorded memcg is not a descendant of the memcg
+ * needing high reclaim, punt the high reclaim to the work queue.
*/
do {
if (page_counter_read(&memcg->memory) > memcg->high) {
@@ -2328,6 +2346,13 @@ done_restock:
if (in_interrupt()) {
schedule_work(&memcg->high_work);
break;
+ } else if (!current->memcg_high_reclaim) {
+ css_get(&memcg->css);
+ current->memcg_high_reclaim = memcg;
+ } else if (!mem_cgroup_is_descendant(
+ current->memcg_high_reclaim, memcg)) {
+ schedule_work(&memcg->high_work);
+ break;
}
current->memcg_nr_pages_over_high += batch;
set_notify_resume(current);
@@ -2573,7 +2598,7 @@ void memcg_kmem_put_cache(struct kmem_cache *cachep)
}
/**
- * memcg_kmem_charge_memcg: charge a kmem page
+ * __memcg_kmem_charge_memcg: charge a kmem page
* @page: page to charge
* @gfp: reclaim mode
* @order: allocation order
@@ -2581,7 +2606,7 @@ void memcg_kmem_put_cache(struct kmem_cache *cachep)
*
* Returns 0 on success, an error code on failure.
*/
-int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
+int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
struct mem_cgroup *memcg)
{
unsigned int nr_pages = 1 << order;
@@ -2604,24 +2629,24 @@ int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
}
/**
- * memcg_kmem_charge: charge a kmem page to the current memory cgroup
+ * __memcg_kmem_charge: charge a kmem page to the current memory cgroup
* @page: page to charge
* @gfp: reclaim mode
* @order: allocation order
*
* Returns 0 on success, an error code on failure.
*/
-int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
+int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
{
struct mem_cgroup *memcg;
int ret = 0;
- if (mem_cgroup_disabled() || memcg_kmem_bypass())
+ if (memcg_kmem_bypass())
return 0;
memcg = get_mem_cgroup_from_current();
if (!mem_cgroup_is_root(memcg)) {
- ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
+ ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
if (!ret)
__SetPageKmemcg(page);
}
@@ -2629,11 +2654,11 @@ int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
return ret;
}
/**
- * memcg_kmem_uncharge: uncharge a kmem page
+ * __memcg_kmem_uncharge: uncharge a kmem page
* @page: page to uncharge
* @order: allocation order
*/
-void memcg_kmem_uncharge(struct page *page, int order)
+void __memcg_kmem_uncharge(struct page *page, int order)
{
struct mem_cgroup *memcg = page->mem_cgroup;
unsigned int nr_pages = 1 << order;
@@ -3337,7 +3362,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
const struct numa_stat *stat;
int nid;
unsigned long nr;
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
@@ -3388,7 +3413,7 @@ static const char *const memcg1_event_names[] = {
static int memcg_stat_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
unsigned long memory, memsw;
struct mem_cgroup *mi;
unsigned int i;
@@ -3626,8 +3651,7 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
size = thresholds->primary ? thresholds->primary->size + 1 : 1;
/* Allocate memory for new array of thresholds */
- new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
- GFP_KERNEL);
+ new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
if (!new) {
ret = -ENOMEM;
goto unlock;
@@ -3821,7 +3845,7 @@ static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
@@ -4420,7 +4444,7 @@ static void mem_cgroup_free(struct mem_cgroup *memcg)
static struct mem_cgroup *mem_cgroup_alloc(void)
{
struct mem_cgroup *memcg;
- size_t size;
+ unsigned int size;
int node;
size = sizeof(struct mem_cgroup);
@@ -5354,6 +5378,16 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
root_mem_cgroup->use_hierarchy = false;
}
+static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
+{
+ if (value == PAGE_COUNTER_MAX)
+ seq_puts(m, "max\n");
+ else
+ seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
+
+ return 0;
+}
+
static u64 memory_current_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
@@ -5364,15 +5398,8 @@ static u64 memory_current_read(struct cgroup_subsys_state *css,
static int memory_min_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long min = READ_ONCE(memcg->memory.min);
-
- if (min == PAGE_COUNTER_MAX)
- seq_puts(m, "max\n");
- else
- seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE);
-
- return 0;
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
}
static ssize_t memory_min_write(struct kernfs_open_file *of,
@@ -5394,15 +5421,8 @@ static ssize_t memory_min_write(struct kernfs_open_file *of,
static int memory_low_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long low = READ_ONCE(memcg->memory.low);
-
- if (low == PAGE_COUNTER_MAX)
- seq_puts(m, "max\n");
- else
- seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE);
-
- return 0;
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
}
static ssize_t memory_low_write(struct kernfs_open_file *of,
@@ -5424,15 +5444,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of,
static int memory_high_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long high = READ_ONCE(memcg->high);
-
- if (high == PAGE_COUNTER_MAX)
- seq_puts(m, "max\n");
- else
- seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE);
-
- return 0;
+ return seq_puts_memcg_tunable(m, READ_ONCE(mem_cgroup_from_seq(m)->high));
}
static ssize_t memory_high_write(struct kernfs_open_file *of,
@@ -5461,15 +5473,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
static int memory_max_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long max = READ_ONCE(memcg->memory.max);
-
- if (max == PAGE_COUNTER_MAX)
- seq_puts(m, "max\n");
- else
- seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
-
- return 0;
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
}
static ssize_t memory_max_write(struct kernfs_open_file *of,
@@ -5523,7 +5528,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
static int memory_events_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
seq_printf(m, "low %lu\n",
atomic_long_read(&memcg->memory_events[MEMCG_LOW]));
@@ -5541,7 +5546,7 @@ static int memory_events_show(struct seq_file *m, void *v)
static int memory_stat_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
struct accumulated_stats acc;
int i;
@@ -5582,6 +5587,15 @@ static int memory_stat_show(struct seq_file *m, void *v)
seq_printf(m, "file_writeback %llu\n",
(u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE);
+ /*
+ * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
+ * with the NR_ANON_THP vm counter, but right now it's a pain in the
+ * arse because it requires migrating the work out of rmap to a place
+ * where the page->mem_cgroup is set up and stable.
+ */
+ seq_printf(m, "anon_thp %llu\n",
+ (u64)acc.stat[MEMCG_RSS_HUGE] * PAGE_SIZE);
+
for (i = 0; i < NR_LRU_LISTS; i++)
seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
(u64)acc.lru_pages[i] * PAGE_SIZE);
@@ -5613,12 +5627,18 @@ static int memory_stat_show(struct seq_file *m, void *v)
seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ seq_printf(m, "thp_fault_alloc %lu\n", acc.events[THP_FAULT_ALLOC]);
+ seq_printf(m, "thp_collapse_alloc %lu\n",
+ acc.events[THP_COLLAPSE_ALLOC]);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
return 0;
}
static int memory_oom_group_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
seq_printf(m, "%d\n", memcg->oom_group);
@@ -6601,15 +6621,8 @@ static u64 swap_current_read(struct cgroup_subsys_state *css,
static int swap_max_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long max = READ_ONCE(memcg->swap.max);
-
- if (max == PAGE_COUNTER_MAX)
- seq_puts(m, "max\n");
- else
- seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
-
- return 0;
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
}
static ssize_t swap_max_write(struct kernfs_open_file *of,
@@ -6631,7 +6644,7 @@ static ssize_t swap_max_write(struct kernfs_open_file *of,
static int swap_events_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
seq_printf(m, "max %lu\n",
atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
diff --git a/mm/memfd.c b/mm/memfd.c
index 97264c79d2cd..650e65a46b9c 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -131,7 +131,8 @@ static unsigned int *memfd_file_seals_ptr(struct file *file)
#define F_ALL_SEALS (F_SEAL_SEAL | \
F_SEAL_SHRINK | \
F_SEAL_GROW | \
- F_SEAL_WRITE)
+ F_SEAL_WRITE | \
+ F_SEAL_FUTURE_WRITE)
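The new F_SEAL_FUTURE_WRITE seal becomes part of F_ALL_SEALS here. A hypothetical userspace usage sketch follows; it assumes uapi headers from a kernel carrying this patch (so the F_SEAL_FUTURE_WRITE constant exists) and a libc that exposes memfd_create().

#define _GNU_SOURCE
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

int make_sealed_memfd(const void *payload, size_t len)
{
        int fd = memfd_create("blob", MFD_ALLOW_SEALING);

        if (fd < 0)
                return -1;
        if (write(fd, payload, len) != (ssize_t)len)
                goto err;
        /*
         * Deny all future writes (write() and new PROT_WRITE mappings);
         * mappings that are already writable keep working.
         */
        if (fcntl(fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE) < 0)
                goto err;
        return fd;      /* safe to hand out read-only */
err:
        close(fd);
        return -1;
}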
static int memfd_add_seals(struct file *file, unsigned int seals)
{
diff --git a/mm/memory.c b/mm/memory.c
index e11ca9dd823f..34ced1369883 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -69,6 +69,7 @@
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
#include <linux/oom.h>
+#include <linux/numa.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
@@ -1451,7 +1452,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
spinlock_t *ptl;
retval = -EINVAL;
- if (PageAnon(page))
+ if (PageAnon(page) || PageSlab(page) || page_has_type(page))
goto out;
retval = -ENOMEM;
flush_dcache_page(page);
@@ -1503,6 +1504,8 @@ out:
* under mm->mmap_sem write-lock, so it can change vma->vm_flags.
* Caller must set VM_MIXEDMAP on vma if it wants to call this
* function from other places, for example from page-fault handler.
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
struct page *page)
@@ -1830,7 +1833,9 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
* @size: size of map area
* @prot: page protection flags for this mapping
*
- * Note: this is only safe if the mm semaphore is held when called.
+ * Note: this is only safe if the mm semaphore is held when called.
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t prot)
@@ -1903,6 +1908,8 @@ EXPORT_SYMBOL(remap_pfn_range);
*
* NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
* whatever write-combining details or similar.
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
{
@@ -2381,12 +2388,13 @@ oom:
*
* This function handles all that is needed to finish a write page fault in a
* shared mapping due to PTE being read-only once the mapped page is prepared.
- * It handles locking of PTE and modifying it. The function returns
- * VM_FAULT_WRITE on success, 0 when PTE got changed before we acquired PTE
- * lock.
+ * It handles locking of PTE and modifying it.
*
* The function expects the page to be locked or other protection against
* concurrent faults / writeback (such as DAX radix tree locks).
+ *
+ * Return: %VM_FAULT_WRITE on success, %0 when PTE got changed before
+ * we acquired PTE lock.
*/
vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
{
@@ -2504,8 +2512,11 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
* Take out anonymous pages first, anonymous shared vmas are
* not dirty accountable.
*/
- if (PageAnon(vmf->page) && !PageKsm(vmf->page)) {
+ if (PageAnon(vmf->page)) {
int total_map_swapcount;
+ if (PageKsm(vmf->page) && (PageSwapCache(vmf->page) ||
+ page_count(vmf->page) != 1))
+ goto copy;
if (!trylock_page(vmf->page)) {
get_page(vmf->page);
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2520,6 +2531,15 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
}
put_page(vmf->page);
}
+ if (PageKsm(vmf->page)) {
+ bool reused = reuse_ksm_page(vmf->page, vmf->vma,
+ vmf->address);
+ unlock_page(vmf->page);
+ if (!reused)
+ goto copy;
+ wp_page_reuse(vmf);
+ return VM_FAULT_WRITE;
+ }
if (reuse_swap_page(vmf->page, &total_map_swapcount)) {
if (total_map_swapcount == 1) {
/*
@@ -2540,7 +2560,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
(VM_WRITE|VM_SHARED))) {
return wp_page_shared(vmf);
}
-
+copy:
/*
* Ok, we need to copy. Oh, well..
*/
@@ -2809,7 +2829,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
flush_icache_page(vma, page);
if (pte_swp_soft_dirty(vmf->orig_pte))
pte = pte_mksoft_dirty(pte);
- set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
vmf->orig_pte = pte;
@@ -2823,6 +2842,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
mem_cgroup_commit_charge(page, memcg, true, false);
activate_page(page);
}
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
swap_free(entry);
if (mem_cgroup_swap_full(page) ||
@@ -3201,6 +3221,8 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
*
* Target users are page handler itself and implementations of
* vm_ops->map_pages.
+ *
+ * Return: %0 on success, %VM_FAULT_ code in case of error.
*/
vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
struct page *page)
@@ -3261,11 +3283,12 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
* This function handles all that is needed to finish a page fault once the
* page to fault in is prepared. It handles locking of PTEs, inserts PTE for
* given page, adds reverse page mapping, handles memcg charges and LRU
- * addition. The function returns 0 on success, VM_FAULT_ code in case of
- * error.
+ * addition.
*
* The function expects the page to be locked and on success it consumes a
* reference of a page being mapped (for the PTE which maps it).
+ *
+ * Return: %0 on success, %VM_FAULT_ code in case of error.
*/
vm_fault_t finish_fault(struct vm_fault *vmf)
{
@@ -3321,12 +3344,8 @@ DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
static int __init fault_around_debugfs(void)
{
- void *ret;
-
- ret = debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
- &fault_around_bytes_fops);
- if (!ret)
- pr_warn("Failed to create fault_around_bytes in debugfs");
+ debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
+ &fault_around_bytes_fops);
return 0;
}
late_initcall(fault_around_debugfs);
@@ -3586,11 +3605,11 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL;
- int page_nid = -1;
+ int page_nid = NUMA_NO_NODE;
int last_cpupid;
int target_nid;
bool migrated = false;
- pte_t pte;
+ pte_t pte, old_pte;
bool was_writable = pte_savedwrite(vmf->orig_pte);
int flags = 0;
@@ -3610,12 +3629,12 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
* Make it present again. Depending on how the arch implements
* non-accessible ptes, some can allow access by kernel mode.
*/
- pte = ptep_modify_prot_start(vma->vm_mm, vmf->address, vmf->pte);
- pte = pte_modify(pte, vma->vm_page_prot);
+ old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
+ pte = pte_modify(old_pte, vma->vm_page_prot);
pte = pte_mkyoung(pte);
if (was_writable)
pte = pte_mkwrite(pte);
- ptep_modify_prot_commit(vma->vm_mm, vmf->address, vmf->pte, pte);
+ ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
update_mmu_cache(vma, vmf->address, vmf->pte);
page = vm_normal_page(vma, vmf->address, pte);
@@ -3653,7 +3672,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
&flags);
pte_unmap_unlock(vmf->pte, vmf->ptl);
- if (target_nid == -1) {
+ if (target_nid == NUMA_NO_NODE) {
put_page(page);
goto out;
}
@@ -3667,7 +3686,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
flags |= TNF_MIGRATE_FAIL;
out:
- if (page_nid != -1)
+ if (page_nid != NUMA_NO_NODE)
task_numa_fault(last_cpupid, page_nid, 1, flags);
return 0;
}
@@ -4150,7 +4169,7 @@ EXPORT_SYMBOL(follow_pte_pmd);
*
* Only IO mappings and raw PFN mappings are allowed.
*
- * Returns zero and the pfn at @pfn on success, -ve otherwise.
+ * Return: zero and the pfn at @pfn on success, -ve otherwise.
*/
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
unsigned long *pfn)
@@ -4300,6 +4319,8 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
* @gup_flags: flags modifying lookup behaviour
*
* The caller must hold a reference on @mm.
+ *
+ * Return: number of bytes copied from source to destination.
*/
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
void *buf, int len, unsigned int gup_flags)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 519f9db063ff..6c6e267753cf 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -39,6 +39,7 @@
#include <asm/tlbflush.h>
#include "internal.h"
+#include "shuffle.h"
/*
* online_page_callback contains pointer to current page onlining function.
@@ -47,7 +48,7 @@
* and restore_online_page_callback() for generic callback restore.
*/
-static void generic_online_page(struct page *page);
+static void generic_online_page(struct page *page, unsigned int order);
static online_page_callback_t online_page_callback = generic_online_page;
static DEFINE_MUTEX(online_page_callback_lock);
@@ -662,26 +663,39 @@ void __online_page_free(struct page *page)
}
EXPORT_SYMBOL_GPL(__online_page_free);
-static void generic_online_page(struct page *page)
+static void generic_online_page(struct page *page, unsigned int order)
{
- __online_page_set_limits(page);
- __online_page_increment_counters(page);
- __online_page_free(page);
+ __free_pages_core(page, order);
+ totalram_pages_add(1UL << order);
+#ifdef CONFIG_HIGHMEM
+ if (PageHighMem(page))
+ totalhigh_pages_add(1UL << order);
+#endif
+}
+
+static int online_pages_blocks(unsigned long start, unsigned long nr_pages)
+{
+ unsigned long end = start + nr_pages;
+ int order, onlined_pages = 0;
+
+ while (start < end) {
+ order = min(MAX_ORDER - 1,
+ get_order(PFN_PHYS(end) - PFN_PHYS(start)));
+ (*online_page_callback)(pfn_to_page(start), order);
+
+ onlined_pages += (1UL << order);
+ start += (1UL << order);
+ }
+ return onlined_pages;
}
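As a worked example of the arithmetic above, assuming 4 KiB pages and the default x86-64 MAX_ORDER of 11: onlining a 128 MiB section is 32768 pages, get_order() of the remaining span is 15 on the first pass, the min() clamps it to 10, and the loop therefore makes 32 callback invocations of 1024 pages each instead of 32768 single-page calls.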
static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
void *arg)
{
- unsigned long i;
unsigned long onlined_pages = *(unsigned long *)arg;
- struct page *page;
if (PageReserved(pfn_to_page(start_pfn)))
- for (i = 0; i < nr_pages; i++) {
- page = pfn_to_page(start_pfn + i);
- (*online_page_callback)(page);
- onlined_pages++;
- }
+ onlined_pages += online_pages_blocks(start_pfn, nr_pages);
online_mem_sections(start_pfn, start_pfn + nr_pages);
@@ -695,9 +709,9 @@ static void node_states_check_changes_online(unsigned long nr_pages,
{
int nid = zone_to_nid(zone);
- arg->status_change_nid = -1;
- arg->status_change_nid_normal = -1;
- arg->status_change_nid_high = -1;
+ arg->status_change_nid = NUMA_NO_NODE;
+ arg->status_change_nid_normal = NUMA_NO_NODE;
+ arg->status_change_nid_high = NUMA_NO_NODE;
if (!node_state(nid, N_MEMORY))
arg->status_change_nid = nid;
@@ -901,6 +915,8 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
zone->zone_pgdat->node_present_pages += onlined_pages;
pgdat_resize_unlock(zone->zone_pgdat, &flags);
+ shuffle_zone(zone);
+
if (onlined_pages) {
node_states_set_node(nid, &arg);
if (need_zonelists_rebuild)
@@ -1368,12 +1384,12 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
if (PageHuge(page)) {
struct page *head = compound_head(page);
- pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
if (compound_order(head) > PFN_SECTION_SHIFT) {
ret = -EBUSY;
break;
}
- isolate_huge_page(page, &source);
+ pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
+ isolate_huge_page(head, &source);
continue;
} else if (PageTransHuge(page))
pfn = page_to_pfn(compound_head(page))
@@ -1499,9 +1515,9 @@ static void node_states_check_changes_offline(unsigned long nr_pages,
unsigned long present_pages = 0;
enum zone_type zt;
- arg->status_change_nid = -1;
- arg->status_change_nid_normal = -1;
- arg->status_change_nid_high = -1;
+ arg->status_change_nid = NUMA_NO_NODE;
+ arg->status_change_nid_normal = NUMA_NO_NODE;
+ arg->status_change_nid_high = NUMA_NO_NODE;
/*
* Check whether node_states[N_NORMAL_MEMORY] will be changed.
@@ -1615,7 +1631,6 @@ static int __ref __offline_pages(unsigned long start_pfn,
cond_resched();
lru_add_drain_all();
- drain_all_pages(zone);
pfn = scan_movable_pages(pfn, end_pfn);
if (pfn) {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d4496d9d34f5..af171ccb56a2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -350,7 +350,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
if (!pol)
return;
- if (!mpol_store_user_nodemask(pol) &&
+ if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
return;
@@ -1314,7 +1314,7 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
nodemask_t *nodes)
{
unsigned long copy = ALIGN(maxnode-1, 64) / 8;
- const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
+ unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
if (copy > nbytes) {
if (copy > PAGE_SIZE)
@@ -1491,7 +1491,7 @@ static int kernel_get_mempolicy(int __user *policy,
int uninitialized_var(pval);
nodemask_t nodes;
- if (nmask != NULL && maxnode < MAX_NUMNODES)
+ if (nmask != NULL && maxnode < nr_node_ids)
return -EINVAL;
err = do_get_mempolicy(&pval, &nodes, addr, flags);
@@ -1527,7 +1527,7 @@ COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
unsigned long nr_bits, alloc_size;
DECLARE_BITMAP(bm, MAX_NUMNODES);
- nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
+ nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
if (nmask)
@@ -2304,7 +2304,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
unsigned long pgoff;
int thiscpu = raw_smp_processor_id();
int thisnid = cpu_to_node(thiscpu);
- int polnid = -1;
+ int polnid = NUMA_NO_NODE;
int ret = -1;
pol = get_vma_policy(vma, addr);
diff --git a/mm/mempool.c b/mm/mempool.c
index 0ef8cc8d1602..85efab3da720 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -222,6 +222,8 @@ EXPORT_SYMBOL(mempool_init_node);
*
* Like mempool_create(), but initializes the pool in (i.e. embedded in another
* structure).
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data)
@@ -245,6 +247,8 @@ EXPORT_SYMBOL(mempool_init);
* functions. This function might sleep. Both the alloc_fn() and the free_fn()
* functions might sleep - as long as the mempool_alloc() function is not called
* from IRQ contexts.
+ *
+ * Return: pointer to the created memory pool object or %NULL on error.
*/
mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data)
@@ -289,6 +293,8 @@ EXPORT_SYMBOL(mempool_create_node);
* Note, the caller must guarantee that no mempool_destroy is called
* while this function is running. mempool_alloc() & mempool_free()
* might be called (eg. from IRQ contexts) while this function executes.
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
int mempool_resize(mempool_t *pool, int new_min_nr)
{
@@ -363,6 +369,8 @@ EXPORT_SYMBOL(mempool_resize);
* *never* fails when called from process contexts. (it might
* fail if called from an IRQ context.)
* Note: using __GFP_ZERO is not supported.
+ *
+ * Return: pointer to the allocated element or %NULL on error.
*/
void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
{
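The mempool.c hunks are documentation-only: they add kernel-doc Return: sections spelling out the usual conventions, 0 or a negative errno for init/resize and a pointer or NULL for create/alloc. For orientation, here is a minimal userspace pool that follows the same return conventions; toy_pool_* are invented names and this is a sketch, not the kernel mempool.

#include <stdlib.h>
#include <errno.h>

struct toy_pool {
    void **elems;   /* pre-allocated elements held in reserve */
    int nr;         /* how many are currently available */
    size_t size;    /* element size */
};

/* Return: 0 on success, negative error code otherwise. */
static int toy_pool_init(struct toy_pool *pool, int min_nr, size_t size)
{
    pool->elems = calloc(min_nr, sizeof(*pool->elems));
    if (!pool->elems)
        return -ENOMEM;
    pool->size = size;
    for (pool->nr = 0; pool->nr < min_nr; pool->nr++) {
        pool->elems[pool->nr] = malloc(size);
        if (!pool->elems[pool->nr])
            return -ENOMEM;  /* caller is expected to tear down */
    }
    return 0;
}

/* Return: pointer to an element or NULL when the reserve is empty. */
static void *toy_pool_alloc(struct toy_pool *pool)
{
    return pool->nr ? pool->elems[--pool->nr] : NULL;
}

/* Put an element back into the reserve. */
static void toy_pool_free(struct toy_pool *pool, void *elem)
{
    pool->elems[pool->nr++] = elem;
}

int main(void)
{
    struct toy_pool pool;

    if (toy_pool_init(&pool, 4, 64))
        return 1;
    toy_pool_free(&pool, toy_pool_alloc(&pool));
    return 0;
}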
diff --git a/mm/migrate.c b/mm/migrate.c
index d4fd680be3b0..76517bf03621 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -100,7 +100,7 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode)
/*
* Check PageMovable before holding a PG_lock because page's owner
* assumes anybody doesn't touch PG_lock of newly allocated page
- * so unconditionally grapping the lock ruins page's owner side.
+ * so unconditionally grabbing the lock ruins page's owner side.
*/
if (unlikely(!__PageMovable(page)))
goto out_putpage;
@@ -374,7 +374,7 @@ unlock:
}
#endif
-static int expected_page_refs(struct page *page)
+static int expected_page_refs(struct address_space *mapping, struct page *page)
{
int expected_count = 1;
@@ -384,7 +384,7 @@ static int expected_page_refs(struct page *page)
*/
expected_count += is_device_private_page(page);
expected_count += is_device_public_page(page);
- if (page_mapping(page))
+ if (mapping)
expected_count += hpage_nr_pages(page) + page_has_private(page);
return expected_count;
@@ -405,7 +405,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
XA_STATE(xas, &mapping->i_pages, page_index(page));
struct zone *oldzone, *newzone;
int dirty;
- int expected_count = expected_page_refs(page) + extra_count;
+ int expected_count = expected_page_refs(mapping, page) + extra_count;
if (!mapping) {
/* Anonymous page without mapping */
@@ -750,7 +750,7 @@ static int __buffer_migrate_page(struct address_space *mapping,
return migrate_page(mapping, newpage, page, mode);
/* Check whether page does not have extra refs before we do more work */
- expected_count = expected_page_refs(page);
+ expected_count = expected_page_refs(mapping, page);
if (page_count(page) != expected_count)
return -EAGAIN;
@@ -911,7 +911,7 @@ static int fallback_migrate_page(struct address_space *mapping,
*/
if (page_has_private(page) &&
!try_to_release_page(page, GFP_KERNEL))
- return -EAGAIN;
+ return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
return migrate_page(mapping, newpage, page, mode);
}
@@ -1287,7 +1287,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
struct anon_vma *anon_vma = NULL;
/*
- * Movability of hugepages depends on architectures and hugepage size.
+ * Migratability of hugepages depends on the architecture and the hugepage size.
* This check is necessary because some callers of hugepage migration
* like soft offline and memory hotremove don't walk through page
* tables or check whether the hugepage is pmd-based or not before
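Passing the already-looked-up mapping into expected_page_refs() keeps the reference arithmetic consistent with the caller's own !mapping check. The count itself is simple bookkeeping; the sketch below re-derives it under stated assumptions (one reference held by the migration path, one per sub-page from the page cache when a mapping exists, one more for attached private data) and is not the kernel helper.

#include <assert.h>
#include <stdbool.h>

/*
 * Expected reference count for a page about to be migrated: the migration
 * code holds one reference, the page cache holds one per sub-page when the
 * page has a mapping, and fs-private data (buffer heads) pins one more.
 */
static int expected_refs(bool has_mapping, int nr_subpages, bool has_private)
{
    int expected = 1;                        /* isolation reference */

    if (has_mapping)
        expected += nr_subpages + (has_private ? 1 : 0);
    return expected;
}

int main(void)
{
    assert(expected_refs(false, 1, false) == 1);   /* page without a mapping */
    assert(expected_refs(true, 1, false) == 2);    /* small page-cache page */
    assert(expected_refs(true, 512, true) == 514); /* THP with private data */
    return 0;
}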
diff --git a/mm/mmap.c b/mm/mmap.c
index f901065c4c64..5b7ee55367ad 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -438,7 +438,7 @@ static void vma_gap_update(struct vm_area_struct *vma)
{
/*
* As it turns out, RB_DECLARE_CALLBACKS() already created a callback
- * function that does exacltly what we want.
+ * function that does exactly what we want.
*/
vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
}
@@ -1012,7 +1012,7 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
* VM_SOFTDIRTY should not prevent from VMA merging, if we
* match the flags but dirty bit -- the caller should mark
* merged VMA as dirty. If dirty bit won't be excluded from
- * comparison, we increase pressue on the memory system forcing
+ * comparison, we increase pressure on the memory system forcing
* the kernel to generate new VMAs when old one could be
* extended instead.
*/
@@ -1115,7 +1115,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
* PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN
* might become case 1 below case 2 below case 3 below
*
- * It is important for case 8 that the the vma NNNN overlapping the
+ * It is important for case 8 that the vma NNNN overlapping the
* region AAAA is never going to extended over XXXX. Instead XXXX must
* be extended in region AAAA and NNNN must be removed. This way in
* all cases where vma_merge succeeds, the moment vma_adjust drops the
@@ -1645,7 +1645,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
#endif /* __ARCH_WANT_SYS_OLD_MMAP */
/*
- * Some shared mappigns will want the pages marked read-only
+ * Some shared mappings will want the pages marked read-only
* to track write events. If so, we'll downgrade vm_page_prot
* to the private version (using protection_map[] without the
* VM_SHARED bit).
@@ -2126,13 +2126,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
*/
#ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
unsigned long
-arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
- const unsigned long len, const unsigned long pgoff,
- const unsigned long flags)
+arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
{
struct vm_area_struct *vma, *prev;
struct mm_struct *mm = current->mm;
- unsigned long addr = addr0;
struct vm_unmapped_area_info info;
const unsigned long mmap_end = arch_get_mmap_end(addr);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 36cb358db170..028c724dcb1a 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -110,8 +110,8 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
continue;
}
- ptent = ptep_modify_prot_start(mm, addr, pte);
- ptent = pte_modify(ptent, newprot);
+ oldpte = ptep_modify_prot_start(vma, addr, pte);
+ ptent = pte_modify(oldpte, newprot);
if (preserve_write)
ptent = pte_mk_savedwrite(ptent);
@@ -121,7 +121,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
!(vma->vm_flags & VM_SOFTDIRTY))) {
ptent = pte_mkwrite(ptent);
}
- ptep_modify_prot_commit(mm, addr, pte, ptent);
+ ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
pages++;
} else if (IS_ENABLED(CONFIG_MIGRATION)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
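change_pte_range() now keeps the value returned by ptep_modify_prot_start() and passes it, together with the vma, to ptep_modify_prot_commit(), so an architecture can treat the pair as a start/commit transaction over a single PTE. The toy_prot_start/toy_prot_commit helpers below are hypothetical stand-ins that model only the shape of that contract: start returns the old entry, the caller derives the new one, commit sees both.

#include <stdio.h>

typedef unsigned long toy_pte_t;

/* Clear the entry and hand the old value back to the caller. */
static toy_pte_t toy_prot_start(toy_pte_t *slot)
{
    toy_pte_t old = *slot;

    *slot = 0;    /* entry is "not present" while we work on it */
    return old;
}

/* Install the new value; old is available for validation or logging. */
static void toy_prot_commit(toy_pte_t *slot, toy_pte_t old, toy_pte_t new)
{
    printf("prot change: %#lx -> %#lx\n", old, new);
    *slot = new;
}

int main(void)
{
    toy_pte_t pte = 0x1003;    /* pfn 1, some permission bits */
    toy_pte_t old, new;

    old = toy_prot_start(&pte);
    new = (old & ~0xfUL) | 0x1;    /* keep the pfn, change low prot bits */
    toy_prot_commit(&pte, old, new);
    return 0;
}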
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 26ea8636758f..3a2484884cfd 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -843,7 +843,7 @@ static bool task_will_free_mem(struct task_struct *task)
return ret;
}
-static void __oom_kill_process(struct task_struct *victim)
+static void __oom_kill_process(struct task_struct *victim, const char *message)
{
struct task_struct *p;
struct mm_struct *mm;
@@ -874,8 +874,9 @@ static void __oom_kill_process(struct task_struct *victim)
*/
do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID);
mark_oom_victim(victim);
- pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
- task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
+ pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
+ message, task_pid_nr(victim), victim->comm,
+ K(victim->mm->total_vm),
K(get_mm_counter(victim->mm, MM_ANONPAGES)),
K(get_mm_counter(victim->mm, MM_FILEPAGES)),
K(get_mm_counter(victim->mm, MM_SHMEMPAGES)));
@@ -926,24 +927,20 @@ static void __oom_kill_process(struct task_struct *victim)
* Kill provided task unless it's secured by setting
* oom_score_adj to OOM_SCORE_ADJ_MIN.
*/
-static int oom_kill_memcg_member(struct task_struct *task, void *unused)
+static int oom_kill_memcg_member(struct task_struct *task, void *message)
{
- if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
+ if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN &&
+ !is_global_init(task)) {
get_task_struct(task);
- __oom_kill_process(task);
+ __oom_kill_process(task, message);
}
return 0;
}
static void oom_kill_process(struct oom_control *oc, const char *message)
{
- struct task_struct *p = oc->chosen;
- unsigned int points = oc->chosen_points;
- struct task_struct *victim = p;
- struct task_struct *child;
- struct task_struct *t;
+ struct task_struct *victim = oc->chosen;
struct mem_cgroup *oom_group;
- unsigned int victim_points = 0;
static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
@@ -952,57 +949,18 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
* its children or threads, just give it access to memory reserves
* so it can die quickly
*/
- task_lock(p);
- if (task_will_free_mem(p)) {
- mark_oom_victim(p);
- wake_oom_reaper(p);
- task_unlock(p);
- put_task_struct(p);
+ task_lock(victim);
+ if (task_will_free_mem(victim)) {
+ mark_oom_victim(victim);
+ wake_oom_reaper(victim);
+ task_unlock(victim);
+ put_task_struct(victim);
return;
}
- task_unlock(p);
+ task_unlock(victim);
if (__ratelimit(&oom_rs))
- dump_header(oc, p);
-
- pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
- message, task_pid_nr(p), p->comm, points);
-
- /*
- * If any of p's children has a different mm and is eligible for kill,
- * the one with the highest oom_badness() score is sacrificed for its
- * parent. This attempts to lose the minimal amount of work done while
- * still freeing memory.
- */
- read_lock(&tasklist_lock);
-
- /*
- * The task 'p' might have already exited before reaching here. The
- * put_task_struct() will free task_struct 'p' while the loop still try
- * to access the field of 'p', so, get an extra reference.
- */
- get_task_struct(p);
- for_each_thread(p, t) {
- list_for_each_entry(child, &t->children, sibling) {
- unsigned int child_points;
-
- if (process_shares_mm(child, p->mm))
- continue;
- /*
- * oom_badness() returns 0 if the thread is unkillable
- */
- child_points = oom_badness(child,
- oc->memcg, oc->nodemask, oc->totalpages);
- if (child_points > victim_points) {
- put_task_struct(victim);
- victim = child;
- victim_points = child_points;
- get_task_struct(victim);
- }
- }
- }
- put_task_struct(p);
- read_unlock(&tasklist_lock);
+ dump_header(oc, victim);
/*
* Do we need to kill the entire memory cgroup?
@@ -1011,14 +969,15 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
*/
oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);
- __oom_kill_process(victim);
+ __oom_kill_process(victim, message);
/*
* If necessary, kill all tasks in the selected memory cgroup.
*/
if (oom_group) {
mem_cgroup_print_oom_group(oom_group);
- mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, NULL);
+ mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member,
+ (void*)message);
mem_cgroup_put(oom_group);
}
}
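oom_kill_memcg_member() now receives the kill reason through the opaque pointer that mem_cgroup_scan_tasks() already passes to its callback, the usual way to thread per-call context through an iterator without changing its signature, and it additionally refuses to touch the global init task. A generic illustration of the callback-plus-void-pointer pattern (plain C, not the memcg API):

#include <stdio.h>

struct task { const char *comm; int oom_score_adj; };

/* Walk every task and hand each one, plus the caller's context, to fn. */
static void scan_tasks(struct task *tasks, int n,
                       int (*fn)(struct task *, void *), void *arg)
{
    for (int i = 0; i < n; i++)
        if (fn(&tasks[i], arg))
            break;
}

/* Callback: the void pointer carries the human-readable kill reason. */
static int kill_member(struct task *t, void *message)
{
    if (t->oom_score_adj > -1000)    /* skip tasks marked unkillable */
        printf("%s: would kill %s\n", (const char *)message, t->comm);
    return 0;                        /* keep iterating */
}

int main(void)
{
    struct task tasks[] = {
        { "worker", 0 }, { "init", -1000 }, { "cache", 300 },
    };

    scan_tasks(tasks, 3, kill_member, (void *)"Memory cgroup out of memory");
    return 0;
}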
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 7d1010453fb9..9f61dfec6a1f 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -270,7 +270,7 @@ static void wb_min_max_ratio(struct bdi_writeback *wb,
* node_dirtyable_memory - number of dirtyable pages in a node
* @pgdat: the node
*
- * Returns the node's number of pages potentially available for dirty
+ * Return: the node's number of pages potentially available for dirty
* page cache. This is the base value for the per-node dirty limits.
*/
static unsigned long node_dirtyable_memory(struct pglist_data *pgdat)
@@ -355,7 +355,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
/**
* global_dirtyable_memory - number of globally dirtyable pages
*
- * Returns the global number of pages potentially available for dirty
+ * Return: the global number of pages potentially available for dirty
* page cache. This is the base value for the global dirty limits.
*/
static unsigned long global_dirtyable_memory(void)
@@ -470,7 +470,7 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
* node_dirty_limit - maximum number of dirty pages allowed in a node
* @pgdat: the node
*
- * Returns the maximum number of dirty pages allowed in a node, based
+ * Return: the maximum number of dirty pages allowed in a node, based
* on the node's dirtyable memory.
*/
static unsigned long node_dirty_limit(struct pglist_data *pgdat)
@@ -495,7 +495,7 @@ static unsigned long node_dirty_limit(struct pglist_data *pgdat)
* node_dirty_ok - tells whether a node is within its dirty limits
* @pgdat: the node to check
*
- * Returns %true when the dirty pages in @pgdat are within the node's
+ * Return: %true when the dirty pages in @pgdat are within the node's
* dirty limit, %false if the limit is exceeded.
*/
bool node_dirty_ok(struct pglist_data *pgdat)
@@ -743,9 +743,6 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc,
* __wb_calc_thresh - @wb's share of dirty throttling threshold
* @dtc: dirty_throttle_context of interest
*
- * Returns @wb's dirty limit in pages. The term "dirty" in the context of
- * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
- *
* Note that balance_dirty_pages() will only seriously take it as a hard limit
* when sleeping max_pause per page is not enough to keep the dirty pages under
* control. For example, when the device is completely stalled due to some error
@@ -759,6 +756,9 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc,
*
* The wb's share of dirty limit will be adapting to its throughput and
* bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
+ *
+ * Return: @wb's dirty limit in pages. The term "dirty" in the context of
+ * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
*/
static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
{
@@ -1918,7 +1918,9 @@ EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
* @wb: bdi_writeback of interest
*
* Determines whether background writeback should keep writing @wb or it's
- * clean enough. Returns %true if writeback should continue.
+ * clean enough.
+ *
+ * Return: %true if writeback should continue.
*/
bool wb_over_bg_thresh(struct bdi_writeback *wb)
{
@@ -2147,6 +2149,8 @@ EXPORT_SYMBOL(tag_pages_for_writeback);
* lock/page writeback access order inversion - we should only ever lock
* multiple pages in ascending page->index order, and looping back to the start
* of the file violates that rule and causes deadlocks.
+ *
+ * Return: %0 on success, negative error code otherwise
*/
int write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc, writepage_t writepage,
@@ -2305,6 +2309,8 @@ static int __writepage(struct page *page, struct writeback_control *wbc,
*
* This is a library function, which implements the writepages()
* address_space_operation.
+ *
+ * Return: %0 on success, negative error code otherwise
*/
int generic_writepages(struct address_space *mapping,
struct writeback_control *wbc)
@@ -2351,6 +2357,8 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
*
* Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this
* function returns.
+ *
+ * Return: %0 on success, negative error code otherwise
*/
int write_one_page(struct page *page)
{
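Every page-writeback.c hunk above converts free-form "Returns ..." prose into the structured kernel-doc Return: section. For reference, the expected comment shape is sketched below; the function is a made-up example and only the comment layout matters.

/**
 * node_under_limit - tell whether a node's dirty pages are within bounds
 * @dirty: number of dirty pages on the node
 * @limit: the node's dirty limit
 *
 * Helper used purely to illustrate the kernel-doc layout.
 *
 * Return: %true when @dirty is below @limit, %false otherwise.
 */
static inline _Bool node_under_limit(unsigned long dirty, unsigned long limit)
{
    return dirty < limit;
}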
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 46285d28e43b..0d18630ef1fa 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -43,6 +43,7 @@
#include <linux/mempolicy.h>
#include <linux/memremap.h>
#include <linux/stop_machine.h>
+#include <linux/random.h>
#include <linux/sort.h>
#include <linux/pfn.h>
#include <linux/backing-dev.h>
@@ -72,6 +73,7 @@
#include <asm/tlbflush.h>
#include <asm/div64.h>
#include "internal.h"
+#include "shuffle.h"
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
@@ -289,8 +291,8 @@ EXPORT_SYMBOL(movable_zone);
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
#if MAX_NUMNODES > 1
-int nr_node_ids __read_mostly = MAX_NUMNODES;
-int nr_online_nodes __read_mostly = 1;
+unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
+unsigned int nr_online_nodes __read_mostly = 1;
EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif
@@ -336,38 +338,33 @@ static inline bool __meminit early_page_uninitialised(unsigned long pfn)
}
/*
- * Returns true when the remaining initialisation should be deferred until
- * later in the boot cycle when it can be parallelised.
+ * Calculate first_deferred_pfn in case:
+ * - in MEMMAP_EARLY context
+ * - this is the last zone
+ *
+ * If the first aligned section doesn't exceed the end_pfn, set it to
+ * first_deferred_pfn and return it.
*/
-static bool __meminit
-defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
+unsigned long __meminit
+defer_pfn(int nid, unsigned long start_pfn, unsigned long end_pfn,
+ enum memmap_context context)
{
- static unsigned long prev_end_pfn, nr_initialised;
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ unsigned long pfn;
- /*
- * prev_end_pfn static that contains the end of previous zone
- * No need to protect because called very early in boot before smp_init.
- */
- if (prev_end_pfn != end_pfn) {
- prev_end_pfn = end_pfn;
- nr_initialised = 0;
- }
+ if (context != MEMMAP_EARLY)
+ return end_pfn;
- /* Always populate low zones for address-constrained allocations */
- if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
- return false;
+ /* Always populate low zones */
+ if (end_pfn < pgdat_end_pfn(pgdat))
+ return end_pfn;
- /*
- * We start only with one section of pages, more pages are added as
- * needed until the rest of deferred pages are initialized.
- */
- nr_initialised++;
- if ((nr_initialised > PAGES_PER_SECTION) &&
- (pfn & (PAGES_PER_SECTION - 1)) == 0) {
- NODE_DATA(nid)->first_deferred_pfn = pfn;
- return true;
+ pfn = roundup(start_pfn + PAGES_PER_SECTION - 1, PAGES_PER_SECTION);
+ if (end_pfn > pfn) {
+ pgdat->first_deferred_pfn = pfn;
+ end_pfn = pfn;
}
- return false;
+ return end_pfn;
}
#else
#define kasan_free_nondeferred_pages(p, o) kasan_free_pages(p, o)
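The rewritten deferral helper no longer counts initialised pages one at a time; it rounds the range start up past one full section and, if that boundary lies before end_pfn, records it as first_deferred_pfn and clamps the early init there. The rounding is ordinary power-of-two alignment arithmetic, shown below on sample numbers (PAGES_PER_SECTION assumed to be 32768, i.e. 128MB sections with 4K pages; this is not the kernel helper).

#include <stdio.h>

#define PAGES_PER_SECTION 32768UL    /* 128MB sections with 4K pages */

/* Round x up to the next multiple of align (align must be a power of two). */
static unsigned long roundup_pow2(unsigned long x, unsigned long align)
{
    return (x + align - 1) & ~(align - 1);
}

int main(void)
{
    unsigned long start_pfn = 0x40000 + 100;    /* zone starts mid-section */
    unsigned long end_pfn = 0x100000;
    unsigned long defer;

    /* First boundary at least one full section past start_pfn. */
    defer = roundup_pow2(start_pfn + PAGES_PER_SECTION - 1, PAGES_PER_SECTION);
    if (end_pfn > defer) {
        printf("init %#lx-%#lx now, defer %#lx-%#lx\n",
               start_pfn, defer, defer, end_pfn);
        end_pfn = defer;    /* early init stops here */
    }
    return 0;
}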
@@ -377,9 +374,11 @@ static inline bool early_page_uninitialised(unsigned long pfn)
return false;
}
-static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
+unsigned long __meminit
+defer_pfn(int nid, unsigned long start_pfn, unsigned long end_pfn,
+ enum memmap_context context)
{
- return false;
+ return end_pfn;
}
#endif
@@ -742,12 +741,6 @@ static inline void set_page_order(struct page *page, unsigned int order)
__SetPageBuddy(page);
}
-static inline void rmv_page_order(struct page *page)
-{
- __ClearPageBuddy(page);
- set_page_private(page, 0);
-}
-
/*
* This function checks whether a page is free && is the buddy
* we can coalesce a page and its buddy if
@@ -789,6 +782,57 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
return 0;
}
+#ifdef CONFIG_COMPACTION
+static inline struct capture_control *task_capc(struct zone *zone)
+{
+ struct capture_control *capc = current->capture_control;
+
+ return capc &&
+ !(current->flags & PF_KTHREAD) &&
+ !capc->page &&
+ capc->cc->zone == zone &&
+ capc->cc->direct_compaction ? capc : NULL;
+}
+
+static inline bool
+compaction_capture(struct capture_control *capc, struct page *page,
+ int order, int migratetype)
+{
+ if (!capc || order != capc->cc->order)
+ return false;
+
+ /* Do not accidentally pollute CMA or isolated regions */
+ if (is_migrate_cma(migratetype) ||
+ is_migrate_isolate(migratetype))
+ return false;
+
+ /*
+ * Do not let lower order allocations pollute a movable pageblock.
+ * This might let an unmovable request use a reclaimable pageblock
+ * and vice-versa but no more than normal fallback logic which can
+ * have trouble finding a high-order free page.
+ */
+ if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
+ return false;
+
+ capc->page = page;
+ return true;
+}
+
+#else
+static inline struct capture_control *task_capc(struct zone *zone)
+{
+ return NULL;
+}
+
+static inline bool
+compaction_capture(struct capture_control *capc, struct page *page,
+ int order, int migratetype)
+{
+ return false;
+}
+#endif /* CONFIG_COMPACTION */
+
/*
* Freeing function for a buddy system allocator.
*
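The capture_control logic above lets a task doing direct compaction intercept a page of exactly its requested order while it is being freed, before the buddy merge would put it back on a free list where another CPU could grab it first. The userspace sketch below models the idea with a process-local capture slot (toy names, single-threaded, no migratetype or CMA rules): the free path first offers the block to the slot and only falls back to the free list when nothing claims it.

#include <stdio.h>
#include <stddef.h>

struct capture { unsigned int order; void *page; };

/* Set while the current context is compacting for a specific order. */
static struct capture *current_capc;

static int try_capture(void *page, unsigned int order)
{
    if (!current_capc || current_capc->page ||
        current_capc->order != order)
        return 0;
    current_capc->page = page;    /* hand the block straight over */
    return 1;
}

static void free_block(void *page, unsigned int order)
{
    if (try_capture(page, order))
        return;                   /* never reaches the free list */
    printf("order-%u block %p returned to the free list\n", order, page);
}

int main(void)
{
    struct capture capc = { .order = 2, .page = NULL };
    char block_a[16], block_b[64];

    free_block(block_a, 0);       /* wrong order: goes to the free list */

    current_capc = &capc;         /* "direct compaction" begins */
    free_block(block_b, 2);       /* right order: captured */
    current_capc = NULL;

    printf("captured: %p\n", capc.page);
    return 0;
}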
@@ -822,6 +866,7 @@ static inline void __free_one_page(struct page *page,
unsigned long uninitialized_var(buddy_pfn);
struct page *buddy;
unsigned int max_order;
+ struct capture_control *capc = task_capc(zone);
max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
@@ -837,6 +882,11 @@ static inline void __free_one_page(struct page *page,
continue_merging:
while (order < max_order - 1) {
+ if (compaction_capture(capc, page, order, migratetype)) {
+ __mod_zone_freepage_state(zone, -(1 << order),
+ migratetype);
+ return;
+ }
buddy_pfn = __find_buddy_pfn(pfn, order);
buddy = page + (buddy_pfn - pfn);
@@ -848,13 +898,11 @@ continue_merging:
* Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
* merge with it and move up one order.
*/
- if (page_is_guard(buddy)) {
+ if (page_is_guard(buddy))
clear_page_guard(zone, buddy, order, migratetype);
- } else {
- list_del(&buddy->lru);
- zone->free_area[order].nr_free--;
- rmv_page_order(buddy);
- }
+ else
+ del_page_from_free_area(buddy, &zone->free_area[order],
+ migratetype);
combined_pfn = buddy_pfn & pfn;
page = page + (combined_pfn - pfn);
pfn = combined_pfn;
@@ -896,7 +944,8 @@ done_merging:
* so it's less likely to be used soon and more likely to be merged
* as a higher order page
*/
- if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) {
+ if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)
+ && !is_shuffle_order(order)) {
struct page *higher_page, *higher_buddy;
combined_pfn = buddy_pfn & pfn;
higher_page = page + (combined_pfn - pfn);
@@ -904,15 +953,18 @@ done_merging:
higher_buddy = higher_page + (buddy_pfn - combined_pfn);
if (pfn_valid_within(buddy_pfn) &&
page_is_buddy(higher_page, higher_buddy, order + 1)) {
- list_add_tail(&page->lru,
- &zone->free_area[order].free_list[migratetype]);
- goto out;
+ add_to_free_area_tail(page, &zone->free_area[order],
+ migratetype);
+ return;
}
}
- list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
-out:
- zone->free_area[order].nr_free++;
+ if (is_shuffle_order(order))
+ add_to_free_area_random(page, &zone->free_area[order],
+ migratetype);
+ else
+ add_to_free_area(page, &zone->free_area[order], migratetype);
+
}
/*
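For the shuffle order, the free path above skips the keep-mergeable-pages-at-the-tail heuristic and asks add_to_free_area_random() to pick head or tail placement, which is what randomises the order in which pages are later handed out. A userspace sketch of that coin-flip insertion on a doubly linked free list (toy list type, rand() standing in for the kernel's cached random bits):

#include <stdio.h>
#include <stdlib.h>

struct node { int id; struct node *prev, *next; };
struct list { struct node head; };    /* circular list with a dummy head */

static void list_init(struct list *l)
{
    l->head.prev = l->head.next = &l->head;
}

static void insert_between(struct node *n, struct node *prev, struct node *next)
{
    n->prev = prev;  n->next = next;
    prev->next = n;  next->prev = n;
}

/* Head or tail insertion chosen by one random bit per free. */
static void add_random(struct list *l, struct node *n)
{
    if (rand() & 1)
        insert_between(n, &l->head, l->head.next);    /* head */
    else
        insert_between(n, l->head.prev, &l->head);    /* tail */
}

int main(void)
{
    struct list freelist;
    struct node nodes[8];

    srand(1);
    list_init(&freelist);
    for (int i = 0; i < 8; i++) {
        nodes[i].id = i;
        add_random(&freelist, &nodes[i]);
    }
    /* Allocation order now differs from free order. */
    for (struct node *n = freelist.head.next; n != &freelist.head; n = n->next)
        printf("%d ", n->id);
    printf("\n");
    return 0;
}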
@@ -1056,7 +1108,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
if (PageMappingFlags(page))
page->mapping = NULL;
if (memcg_kmem_enabled() && PageKmemcg(page))
- memcg_kmem_uncharge(page, order);
+ __memcg_kmem_uncharge(page, order);
if (check_free)
bad += free_pages_check(page);
if (bad)
@@ -1213,17 +1265,23 @@ static void free_one_page(struct zone *zone,
spin_unlock(&zone->lock);
}
-static void __meminit __init_single_page(struct page *page, unsigned long pfn,
- unsigned long zone, int nid)
+static void __meminit __init_struct_page_nolru(struct page *page,
+ unsigned long pfn,
+ unsigned long zone, int nid,
+ bool is_reserved)
{
mm_zero_struct_page(page);
- set_page_links(page, zone, nid, pfn);
+
+ /*
+ * We can use a non-atomic operation for setting the
+ * PG_reserved flag as we are still initializing the pages.
+ */
+ set_page_links(page, zone, nid, pfn, is_reserved);
init_page_count(page);
page_mapcount_reset(page);
page_cpupid_reset_last(page);
page_kasan_tag_reset(page);
- INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
/* The shift won't overflow because ZONE_NORMAL is below 4G. */
if (!is_highmem_idx(zone))
@@ -1231,6 +1289,74 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn,
#endif
}
+static void __meminit __init_single_page(struct page *page, unsigned long pfn,
+ unsigned long zone, int nid)
+{
+ __init_struct_page_nolru(page, pfn, zone, nid, false);
+ INIT_LIST_HEAD(&page->lru);
+}
+
+static void __meminit __init_pageblock(unsigned long start_pfn,
+ unsigned long nr_pages,
+ unsigned long zone, int nid,
+ struct dev_pagemap *pgmap,
+ bool is_reserved)
+{
+ unsigned long nr_pgmask = pageblock_nr_pages - 1;
+ struct page *start_page = pfn_to_page(start_pfn);
+ unsigned long pfn = start_pfn + nr_pages - 1;
+ struct page *page;
+
+ /*
+ * Enforce the following requirements:
+ * size > 0
+ * size <= pageblock_nr_pages
+ * start_pfn -> pfn does not cross pageblock_nr_pages boundary
+ */
+ VM_BUG_ON(((start_pfn ^ pfn) | (nr_pages - 1)) > nr_pgmask);
+
+ /*
+ * Work from highest page to lowest, this way we will still be
+ * warm in the cache when we call set_pageblock_migratetype
+ * below.
+ *
+ * The loop is based around the page pointer as the main index
+ * instead of the pfn because pfn is not used inside the loop if
+ * the section number is not in page flags and WANT_PAGE_VIRTUAL
+ * is not defined.
+ */
+ for (page = start_page + nr_pages; page-- != start_page; pfn--) {
+ __init_struct_page_nolru(page, pfn, zone, nid, is_reserved);
+
+ /*
+ * ZONE_DEVICE pages union ->lru with a ->pgmap back
+ * pointer and hmm_data. It is a bug if a ZONE_DEVICE
+ * page is ever freed or placed on a driver-private list.
+ */
+ page->pgmap = pgmap;
+ if (!pgmap)
+ INIT_LIST_HEAD(&page->lru);
+ }
+
+ /*
+ * Mark the block movable so that blocks are reserved for
+ * movable at startup. This will force kernel allocations
+ * to reserve their blocks rather than leaking throughout
+ * the address space during boot when many long-lived
+ * kernel allocations are made.
+ *
+ * bitmap is created for zone's valid pfn range. but memmap
+ * can be created for invalid pages (for alignment)
+ * check here not to call set_pageblock_migratetype() against
+ * pfn out of zone.
+ *
+ * Please note that MEMMAP_HOTPLUG path doesn't clear memmap
+ * because this is done early in sparse_add_one_section
+ */
+ if (!(start_pfn & nr_pgmask))
+ set_pageblock_migratetype(start_page, MIGRATE_MOVABLE);
+}
+
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static void __meminit init_reserved_page(unsigned long pfn)
{
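The VM_BUG_ON in __init_pageblock() packs the range checks into one expression: XOR of the first and last pfn only has bits above the pageblock mask when the two pfns sit in different pageblocks, and (nr_pages - 1) only exceeds the mask when the range is longer than a pageblock. A small check program, assuming 512-page blocks (order-9 with 4K pages):

#include <assert.h>

#define PAGEBLOCK_NR_PAGES 512UL    /* order-9 blocks with 4K pages */

/* True when [start, start + nr) stays inside one aligned pageblock. */
static int within_one_block(unsigned long start, unsigned long nr)
{
    unsigned long mask = PAGEBLOCK_NR_PAGES - 1;
    unsigned long last = start + nr - 1;

    return (((start ^ last) | (nr - 1)) & ~mask) == 0;
}

int main(void)
{
    assert(within_one_block(1024, 512));    /* exactly one aligned block */
    assert(within_one_block(1030, 100));    /* strictly inside a block */
    assert(!within_one_block(1500, 100));   /* crosses into the next block */
    assert(!within_one_block(1024, 513));   /* longer than a block */
    return 0;
}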
@@ -1303,7 +1429,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
local_irq_restore(flags);
}
-static void __init __free_pages_boot_core(struct page *page, unsigned int order)
+void __free_pages_core(struct page *page, unsigned int order)
{
unsigned int nr_pages = 1 << order;
struct page *p = page;
@@ -1344,36 +1470,22 @@ int __meminit early_pfn_to_nid(unsigned long pfn)
#endif
#ifdef CONFIG_NODES_SPAN_OTHER_NODES
-static inline bool __meminit __maybe_unused
-meminit_pfn_in_nid(unsigned long pfn, int node,
- struct mminit_pfnnid_cache *state)
+/* Only safe to use early in boot when initialisation is single-threaded */
+static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
{
int nid;
- nid = __early_pfn_to_nid(pfn, state);
+ nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
if (nid >= 0 && nid != node)
return false;
return true;
}
-/* Only safe to use early in boot when initialisation is single-threaded */
-static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
-{
- return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache);
-}
-
#else
-
static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
{
return true;
}
-static inline bool __meminit __maybe_unused
-meminit_pfn_in_nid(unsigned long pfn, int node,
- struct mminit_pfnnid_cache *state)
-{
- return true;
-}
#endif
@@ -1382,7 +1494,7 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn,
{
if (early_page_uninitialised(pfn))
return;
- return __free_pages_boot_core(page, order);
+ __free_pages_core(page, order);
}
/*
@@ -1457,32 +1569,6 @@ void clear_zone_contiguous(struct zone *zone)
}
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-static void __init deferred_free_range(unsigned long pfn,
- unsigned long nr_pages)
-{
- struct page *page;
- unsigned long i;
-
- if (!nr_pages)
- return;
-
- page = pfn_to_page(pfn);
-
- /* Free a large naturally-aligned chunk if possible */
- if (nr_pages == pageblock_nr_pages &&
- (pfn & (pageblock_nr_pages - 1)) == 0) {
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- __free_pages_boot_core(page, pageblock_order);
- return;
- }
-
- for (i = 0; i < nr_pages; i++, page++, pfn++) {
- if ((pfn & (pageblock_nr_pages - 1)) == 0)
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- __free_pages_boot_core(page, 0);
- }
-}
-
/* Completion tracking for deferred_init_memmap() threads */
static atomic_t pgdat_init_n_undone __initdata;
static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
@@ -1494,57 +1580,89 @@ static inline void __init pgdat_init_report_one_done(void)
}
/*
- * Returns true if page needs to be initialized or freed to buddy allocator.
+ * Returns count if page range needs to be initialized or freed
*
- * First we check if pfn is valid on architectures where it is possible to have
- * holes within pageblock_nr_pages. On systems where it is not possible, this
- * function is optimized out.
+ * First we check if the contiguous pfns are valid on architectures where it
+ * is possible to have holes within pageblock_nr_pages. On systems where it
+ * is not possible, this function is optimized out.
*
- * Then, we check if a current large page is valid by only checking the validity
- * of the head pfn.
+ * Then, we check if a current large page is valid by only checking the
+ * validity of the head pfn.
*
- * Finally, meminit_pfn_in_nid is checked on systems where pfns can interleave
- * within a node: a pfn is between start and end of a node, but does not belong
- * to this memory node.
*/
-static inline bool __init
-deferred_pfn_valid(int nid, unsigned long pfn,
- struct mminit_pfnnid_cache *nid_init_state)
+static unsigned long __next_pfn_valid_range(unsigned long *pfn,
+ unsigned long *i,
+ unsigned long end_pfn)
{
- if (!pfn_valid_within(pfn))
- return false;
- if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn))
- return false;
- if (!meminit_pfn_in_nid(pfn, nid, nid_init_state))
- return false;
- return true;
+ unsigned long start_pfn = *i;
+
+ while (start_pfn < end_pfn) {
+ unsigned long t = ALIGN(start_pfn + 1, pageblock_nr_pages);
+ unsigned long pageblock_pfn = min(t, end_pfn);
+ unsigned long count = 0;
+
+#ifndef CONFIG_HOLES_IN_ZONE
+ if (pfn_valid(start_pfn))
+ count = pageblock_pfn - start_pfn;
+ start_pfn = pageblock_pfn;
+#else
+ while (start_pfn < pageblock_pfn) {
+ if (pfn_valid(start_pfn++)) {
+ count++;
+ continue;
+ }
+
+ if (!count)
+ continue;
+
+ /*
+ * The last PFN was invalid, report the block of
+ * PFNs we currently have available and skip over
+ * the invalid one.
+ */
+ *pfn = start_pfn - (count + 1);
+ *i = start_pfn;
+ return count;
+ }
+#endif
+ if (!count)
+ continue;
+
+ *pfn = start_pfn - count;
+ *i = start_pfn;
+ return count;
+ }
+
+ return 0;
}
+#define for_each_deferred_pfn_valid_range(pfn, count, i, start_pfn, end_pfn) \
+ for (i = (start_pfn), \
+ count = __next_pfn_valid_range(&pfn, &i, (end_pfn)); \
+ count; \
+ count = __next_pfn_valid_range(&pfn, &i, (end_pfn)))
+
/*
* Free pages to buddy allocator. Try to free aligned pages in
* pageblock_nr_pages sizes.
*/
-static void __init deferred_free_pages(int nid, int zid, unsigned long pfn,
+static void __init deferred_free_pages(unsigned long start_pfn,
unsigned long end_pfn)
{
- struct mminit_pfnnid_cache nid_init_state = { };
- unsigned long nr_pgmask = pageblock_nr_pages - 1;
- unsigned long nr_free = 0;
-
- for (; pfn < end_pfn; pfn++) {
- if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
- deferred_free_range(pfn - nr_free, nr_free);
- nr_free = 0;
- } else if (!(pfn & nr_pgmask)) {
- deferred_free_range(pfn - nr_free, nr_free);
- nr_free = 1;
- touch_nmi_watchdog();
+ unsigned long i, pfn, count;
+
+ for_each_deferred_pfn_valid_range(pfn, count, i, start_pfn, end_pfn) {
+ struct page *page = pfn_to_page(pfn);
+
+ if (count == pageblock_nr_pages) {
+ __free_pages_core(page, pageblock_order);
} else {
- nr_free++;
+ while (count--)
+ __free_pages_core(page++, 0);
}
+
+ touch_nmi_watchdog();
}
- /* Free the last block of pages to allocator */
- deferred_free_range(pfn - nr_free, nr_free);
}
/*
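The new range iterator above hands back contiguous runs of valid pfns that never cross a pageblock boundary, so deferred_free_pages() can release a whole naturally aligned pageblock with a single order-9 free and only falls back to order-0 frees for partial runs. The sketch below reproduces just that chunking arithmetic with a local ALIGN_UP and a stand-in block size; it is not the kernel iterator.

#include <stdio.h>

#define BLOCK 512UL    /* stand-in for pageblock_nr_pages */

/* Round x up to the next multiple of the power-of-two a. */
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
    unsigned long start_pfn = 1000, end_pfn = 2600;
    unsigned long pfn = start_pfn;

    while (pfn < end_pfn) {
        /* Never let a chunk cross a pageblock boundary. */
        unsigned long next = ALIGN_UP(pfn + 1, BLOCK);
        unsigned long chunk_end = next < end_pfn ? next : end_pfn;
        unsigned long count = chunk_end - pfn;

        if (count == BLOCK)
            printf("free pfn %lu as one order-9 block\n", pfn);
        else
            printf("free %lu single pages from pfn %lu\n", count, pfn);
        pfn = chunk_end;
    }
    return 0;
}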
@@ -1552,43 +1670,121 @@ static void __init deferred_free_pages(int nid, int zid, unsigned long pfn,
* by performing it only once every pageblock_nr_pages.
* Return number of pages initialized.
*/
-static unsigned long __init deferred_init_pages(int nid, int zid,
- unsigned long pfn,
+static unsigned long __init deferred_init_pages(struct zone *zone,
+ unsigned long start_pfn,
unsigned long end_pfn)
{
- struct mminit_pfnnid_cache nid_init_state = { };
- unsigned long nr_pgmask = pageblock_nr_pages - 1;
+ int nid = zone_to_nid(zone);
+ unsigned long i, pfn, count;
unsigned long nr_pages = 0;
- struct page *page = NULL;
+ int zid = zone_idx(zone);
+
+ for_each_deferred_pfn_valid_range(pfn, count, i, start_pfn, end_pfn) {
+ nr_pages += count;
+ __init_pageblock(pfn, count, zid, nid, NULL, false);
+
+ touch_nmi_watchdog();
+ }
+
+ return nr_pages;
+}
+
+/*
+ * This function is meant to pre-load the iterator for the zone init.
+ * Specifically it walks through the ranges until we are caught up to the
+ * first_init_pfn value and exits there. If we never encounter the value we
+ * return false indicating there are no valid ranges left.
+ */
+static bool __init
+deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
+ unsigned long *spfn, unsigned long *epfn,
+ unsigned long first_init_pfn)
+{
+ u64 j;
- for (; pfn < end_pfn; pfn++) {
- if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
- page = NULL;
+ /*
+ * Start out by walking through the ranges in this zone that have
+ * already been initialized. We don't need to do anything with them
+ * so we just need to flush them out of the system.
+ */
+ for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
+ if (*epfn <= first_init_pfn)
continue;
- } else if (!page || !(pfn & nr_pgmask)) {
- page = pfn_to_page(pfn);
- touch_nmi_watchdog();
- } else {
- page++;
- }
- __init_single_page(page, pfn, zid, nid);
- nr_pages++;
+ if (*spfn < first_init_pfn)
+ *spfn = first_init_pfn;
+ *i = j;
+ return true;
}
- return (nr_pages);
+
+ return false;
+}
+
+/*
+ * Initialize and free pages. We do it in two loops: first we initialize
+ * struct page, than free to buddy allocator, because while we are
+ * freeing pages we can access pages that are ahead (computing buddy
+ * page in __free_one_page()).
+ *
+ * In order to try and keep some memory in the cache we have the loop
+ * broken along max page order boundaries. This way we will not cause
+ * any issues with the buddy page computation.
+ */
+static unsigned long __init
+deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
+ unsigned long *end_pfn)
+{
+ unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
+ unsigned long spfn = *start_pfn, epfn = *end_pfn;
+ unsigned long nr_pages = 0;
+ u64 j = *i;
+
+ /* First we loop through and initialize the page values */
+ for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
+ unsigned long t;
+
+ if (mo_pfn <= spfn)
+ break;
+
+ t = min(mo_pfn, epfn);
+ nr_pages += deferred_init_pages(zone, spfn, t);
+
+ if (mo_pfn <= epfn)
+ break;
+ }
+
+ /* Reset values and now loop through freeing pages as needed */
+ j = *i;
+
+ for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
+ unsigned long t;
+
+ if (mo_pfn <= *start_pfn)
+ break;
+
+ t = min(mo_pfn, *end_pfn);
+ deferred_free_pages(*start_pfn, t);
+ *start_pfn = t;
+
+ if (mo_pfn < *end_pfn)
+ break;
+ }
+
+ /* Store our current values to be reused on the next iteration */
+ *i = j;
+
+ return nr_pages;
}
/* Initialise remaining memory on a node */
static int __init deferred_init_memmap(void *data)
{
pg_data_t *pgdat = data;
- int nid = pgdat->node_id;
+ const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+ unsigned long spfn = 0, epfn = 0, nr_pages = 0;
+ unsigned long first_init_pfn, flags;
unsigned long start = jiffies;
- unsigned long nr_pages = 0;
- unsigned long spfn, epfn, first_init_pfn, flags;
- phys_addr_t spa, epa;
- int zid;
struct zone *zone;
- const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+ int zid;
u64 i;
/* Bind memory initialisation thread to a local node if possible */
@@ -1614,31 +1810,27 @@ static int __init deferred_init_memmap(void *data)
if (first_init_pfn < zone_end_pfn(zone))
break;
}
- first_init_pfn = max(zone->zone_start_pfn, first_init_pfn);
+
+ /* If the zone is empty somebody else may have cleared out the zone */
+ if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
+ first_init_pfn))
+ goto zone_empty;
/*
- * Initialize and free pages. We do it in two loops: first we initialize
- * struct page, than free to buddy allocator, because while we are
- * freeing pages we can access pages that are ahead (computing buddy
- * page in __free_one_page()).
+ * Initialize and free pages in MAX_ORDER sized increments so
+ * that we can avoid introducing any issues with the buddy
+ * allocator.
*/
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
- nr_pages += deferred_init_pages(nid, zid, spfn, epfn);
- }
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
- deferred_free_pages(nid, zid, spfn, epfn);
- }
+ while (spfn < epfn)
+ nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
+zone_empty:
pgdat_resize_unlock(pgdat, &flags);
/* Sanity check that the next zone really is unpopulated */
WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
- pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
- jiffies_to_msecs(jiffies - start));
+ pr_info("node %d initialised, %lu pages in %ums\n",
+ pgdat->node_id, nr_pages, jiffies_to_msecs(jiffies - start));
pgdat_init_report_one_done();
return 0;
@@ -1662,14 +1854,11 @@ static int __init deferred_init_memmap(void *data)
static noinline bool __init
deferred_grow_zone(struct zone *zone, unsigned int order)
{
- int zid = zone_idx(zone);
- int nid = zone_to_nid(zone);
- pg_data_t *pgdat = NODE_DATA(nid);
unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
- unsigned long nr_pages = 0;
- unsigned long first_init_pfn, spfn, epfn, t, flags;
+ pg_data_t *pgdat = zone->zone_pgdat;
unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
- phys_addr_t spa, epa;
+ unsigned long spfn, epfn, flags;
+ unsigned long nr_pages = 0;
u64 i;
/* Only the last zone may have deferred pages */
@@ -1698,37 +1887,24 @@ deferred_grow_zone(struct zone *zone, unsigned int order)
return true;
}
- first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn);
-
- if (first_init_pfn >= pgdat_end_pfn(pgdat)) {
+ /* If the zone is empty somebody else may have cleared out the zone */
+ if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
+ first_deferred_pfn)) {
+ pgdat->first_deferred_pfn = ULONG_MAX;
pgdat_resize_unlock(pgdat, &flags);
- return false;
+ return true;
}
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
-
- while (spfn < epfn && nr_pages < nr_pages_needed) {
- t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION);
- first_deferred_pfn = min(t, epfn);
- nr_pages += deferred_init_pages(nid, zid, spfn,
- first_deferred_pfn);
- spfn = first_deferred_pfn;
- }
-
- if (nr_pages >= nr_pages_needed)
- break;
+ /*
+ * Initialize and free pages in MAX_ORDER sized increments so
+ * that we can avoid introducing any issues with the buddy
+ * allocator.
+ */
+ while (spfn < epfn && nr_pages < nr_pages_needed) {
+ nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
+ first_deferred_pfn = spfn;
}
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
- epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa));
- deferred_free_pages(nid, zid, spfn, epfn);
-
- if (first_deferred_pfn == epfn)
- break;
- }
pgdat->first_deferred_pfn = first_deferred_pfn;
pgdat_resize_unlock(pgdat, &flags);
@@ -1752,9 +1928,9 @@ _deferred_grow_zone(struct zone *zone, unsigned int order)
void __init page_alloc_init_late(void)
{
struct zone *zone;
+ int nid;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
- int nid;
/* There will be num_node_state(N_MEMORY) threads */
atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
@@ -1779,6 +1955,9 @@ void __init page_alloc_init_late(void)
memblock_discard();
#endif
+ for_each_node_state(nid, N_MEMORY)
+ shuffle_free_memory(NODE_DATA(nid));
+
for_each_populated_zone(zone)
set_zone_contiguous(zone);
}
@@ -1849,7 +2028,7 @@ static inline void expand(struct zone *zone, struct page *page,
if (set_page_guard(zone, &page[size], high, migratetype))
continue;
- list_add(&page[size].lru, &area->free_list[migratetype]);
+ add_to_free_area(&page[size], area, migratetype);
area->nr_free++;
set_page_order(&page[size], high);
}
@@ -1945,8 +2124,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
arch_alloc_page(page, order);
kernel_map_pages(page, 1 << order, 1);
- kernel_poison_pages(page, 1 << order, 1);
kasan_alloc_pages(page, order);
+ kernel_poison_pages(page, 1 << order, 1);
set_page_owner(page, order, gfp_flags);
}
@@ -1991,13 +2170,10 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
/* Find a page of the appropriate size in the preferred list */
for (current_order = order; current_order < MAX_ORDER; ++current_order) {
area = &(zone->free_area[current_order]);
- page = list_first_entry_or_null(&area->free_list[migratetype],
- struct page, lru);
+ page = get_page_from_free_area(area, migratetype);
if (!page)
continue;
- list_del(&page->lru);
- rmv_page_order(page);
- area->nr_free--;
+ del_page_from_free_area(page, area, migratetype);
expand(zone, page, order, current_order, area, migratetype);
set_pcppage_migratetype(page, migratetype);
return page;
@@ -2083,8 +2259,7 @@ static int move_freepages(struct zone *zone,
}
order = page_order(page);
- list_move(&page->lru,
- &zone->free_area[order].free_list[migratetype]);
+ move_to_free_area(page, &zone->free_area[order], migratetype);
page += 1 << order;
pages_moved += 1 << order;
}
@@ -2170,6 +2345,18 @@ static inline void boost_watermark(struct zone *zone)
max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
watermark_boost_factor, 10000);
+
+ /*
+ * high watermark may be uninitialised if fragmentation occurs
+ * very early in boot so do not boost. We do not fall
+ * through and boost by pageblock_nr_pages as failing
+ * allocations that early means that reclaim is not going
+ * to help and it may even be impossible to reclaim the
+ * boosted watermark resulting in a hang.
+ */
+ if (!max_boost)
+ return;
+
max_boost = max(pageblock_nr_pages, max_boost);
zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
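boost_watermark() scales the high watermark by watermark_boost_factor, expressed in basis points so 15000 means 1.5x, and now bails out while the product is still zero, which only happens before the watermarks are set up very early in boot. The arithmetic, including the multiply-then-divide that mult_frac() performs to avoid overflowing the intermediate product, sketched with toy values:

#include <stdio.h>

/* Multiply x by numer/denom without overflowing the intermediate product. */
static unsigned long mult_frac(unsigned long x, unsigned long numer,
                               unsigned long denom)
{
    unsigned long quot = x / denom;
    unsigned long rem = x % denom;

    return quot * numer + rem * numer / denom;
}

int main(void)
{
    unsigned long pageblock_nr_pages = 512;
    unsigned long boost_factor = 15000;          /* 150% in basis points */
    unsigned long high_wmarks[] = { 0, 12800 };  /* pre-init vs normal */

    for (int i = 0; i < 2; i++) {
        unsigned long max_boost = mult_frac(high_wmarks[i],
                                            boost_factor, 10000);

        if (!max_boost) {
            /* Watermarks not set up yet: boosting cannot help. */
            printf("high=%lu: skip boosting\n", high_wmarks[i]);
            continue;
        }
        if (max_boost < pageblock_nr_pages)
            max_boost = pageblock_nr_pages;
        printf("high=%lu: cap boost at %lu pages\n",
               high_wmarks[i], max_boost);
    }
    return 0;
}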
@@ -2260,7 +2447,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
single_page:
area = &zone->free_area[current_order];
- list_move(&page->lru, &area->free_list[start_type]);
+ move_to_free_area(page, area, start_type);
}
/*
@@ -2284,7 +2471,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
if (fallback_mt == MIGRATE_TYPES)
break;
- if (list_empty(&area->free_list[fallback_mt]))
+ if (free_area_empty(area, fallback_mt))
continue;
if (can_steal_fallback(order, migratetype))
@@ -2371,9 +2558,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
for (order = 0; order < MAX_ORDER; order++) {
struct free_area *area = &(zone->free_area[order]);
- page = list_first_entry_or_null(
- &area->free_list[MIGRATE_HIGHATOMIC],
- struct page, lru);
+ page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
if (!page)
continue;
@@ -2496,8 +2681,7 @@ find_smallest:
VM_BUG_ON(current_order == MAX_ORDER);
do_steal:
- page = list_first_entry(&area->free_list[fallback_mt],
- struct page, lru);
+ page = get_page_from_free_area(area, fallback_mt);
steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
can_steal);
@@ -2934,6 +3118,7 @@ EXPORT_SYMBOL_GPL(split_page);
int __isolate_free_page(struct page *page, unsigned int order)
{
+ struct free_area *area = &page_zone(page)->free_area[order];
unsigned long watermark;
struct zone *zone;
int mt;
@@ -2950,7 +3135,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
* watermark, because we already know our high-order page
* exists.
*/
- watermark = min_wmark_pages(zone) + (1UL << order);
+ watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
return 0;
@@ -2958,9 +3143,8 @@ int __isolate_free_page(struct page *page, unsigned int order)
}
/* Remove page from free list */
- list_del(&page->lru);
- zone->free_area[order].nr_free--;
- rmv_page_order(page);
+
+ del_page_from_free_area(page, area, mt);
/*
* Set the pageblock if the isolated page is at least half of a
@@ -3161,24 +3345,14 @@ static int __init fail_page_alloc_debugfs(void)
dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
&fail_page_alloc.attr);
- if (IS_ERR(dir))
- return PTR_ERR(dir);
-
- if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
- &fail_page_alloc.ignore_gfp_reclaim))
- goto fail;
- if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
- &fail_page_alloc.ignore_gfp_highmem))
- goto fail;
- if (!debugfs_create_u32("min-order", mode, dir,
- &fail_page_alloc.min_order))
- goto fail;
- return 0;
-fail:
- debugfs_remove_recursive(dir);
+ debugfs_create_bool("ignore-gfp-wait", mode, dir,
+ &fail_page_alloc.ignore_gfp_reclaim);
+ debugfs_create_bool("ignore-gfp-highmem", mode, dir,
+ &fail_page_alloc.ignore_gfp_highmem);
+ debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order);
- return -ENOMEM;
+ return 0;
}
late_initcall(fail_page_alloc_debugfs);
@@ -3268,13 +3442,13 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
continue;
for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
- if (!list_empty(&area->free_list[mt]))
+ if (!free_area_empty(area, mt))
return true;
}
#ifdef CONFIG_CMA
if ((alloc_flags & ALLOC_CMA) &&
- !list_empty(&area->free_list[MIGRATE_CMA])) {
+ !free_area_empty(area, MIGRATE_CMA)) {
return true;
}
#endif
@@ -3698,7 +3872,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
enum compact_priority prio, enum compact_result *compact_result)
{
- struct page *page;
+ struct page *page = NULL;
unsigned long pflags;
unsigned int noreclaim_flag;
@@ -3709,13 +3883,15 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
noreclaim_flag = memalloc_noreclaim_save();
*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
- prio);
+ prio, &page);
memalloc_noreclaim_restore(noreclaim_flag);
psi_memstall_leave(&pflags);
- if (*compact_result <= COMPACT_INACTIVE)
+ if (*compact_result <= COMPACT_INACTIVE) {
+ WARN_ON_ONCE(page);
return NULL;
+ }
/*
* At least in one zone compaction wasn't deferred or skipped, so let's
@@ -3723,7 +3899,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
*/
count_vm_event(COMPACTSTALL);
- page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
+ /* Prep a captured page if available */
+ if (page)
+ prep_new_page(page, order, gfp_mask, alloc_flags);
+
+ /* Try get a page from the freelist if available */
+ if (!page)
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page) {
struct zone *zone = page_zone(page);
@@ -4556,7 +4738,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
out:
if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
- unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) {
+ unlikely(__memcg_kmem_charge(page, gfp_mask, order) != 0)) {
__free_pages(page, order);
page = NULL;
}
@@ -4749,6 +4931,8 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order,
* This function is also limited by MAX_ORDER.
*
* Memory allocated by this function must be released by free_pages_exact().
+ *
+ * Return: pointer to the allocated area or %NULL in case of error.
*/
void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
{
@@ -4769,6 +4953,8 @@ EXPORT_SYMBOL(alloc_pages_exact);
*
* Like alloc_pages_exact(), but try to allocate on node nid first before falling
* back.
+ *
+ * Return: pointer to the allocated area or %NULL in case of error.
*/
void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
{
@@ -4802,11 +4988,13 @@ EXPORT_SYMBOL(free_pages_exact);
* nr_free_zone_pages - count number of pages beyond high watermark
* @offset: The zone index of the highest zone
*
- * nr_free_zone_pages() counts the number of counts pages which are beyond the
+ * nr_free_zone_pages() counts the number of pages which are beyond the
* high watermark within all zones at or below a given zone index. For each
* zone, the number of pages is calculated as:
*
* nr_free_zone_pages = managed_pages - high_pages
+ *
+ * Return: number of pages beyond high watermark.
*/
static unsigned long nr_free_zone_pages(int offset)
{
@@ -4833,6 +5021,9 @@ static unsigned long nr_free_zone_pages(int offset)
*
* nr_free_buffer_pages() counts the number of pages which are beyond the high
* watermark within ZONE_DMA and ZONE_NORMAL.
+ *
+ * Return: number of pages beyond high watermark within ZONE_DMA and
+ * ZONE_NORMAL.
*/
unsigned long nr_free_buffer_pages(void)
{
@@ -4845,6 +5036,8 @@ EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
*
* nr_free_pagecache_pages() counts the number of pages which are beyond the
* high watermark within all zones.
+ *
+ * Return: number of pages beyond high watermark within all zones.
*/
unsigned long nr_free_pagecache_pages(void)
{
@@ -5176,7 +5369,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
types[order] = 0;
for (type = 0; type < MIGRATE_TYPES; type++) {
- if (!list_empty(&area->free_list[type]))
+ if (!free_area_empty(area, type))
types[order] |= 1 << type;
}
}
@@ -5291,7 +5484,8 @@ static int node_load[MAX_NUMNODES];
* from each node to each node in the system), and should also prefer nodes
* with no CPUs, since presumably they'll have very little allocation pressure
* on them otherwise.
- * It returns -1 if no node is found.
+ *
+ * Return: node id of the found node or %NUMA_NO_NODE if no node is found.
*/
static int find_next_best_node(int node, nodemask_t *used_node_mask)
{
@@ -5597,7 +5791,7 @@ void __ref build_all_zonelists(pg_data_t *pgdat)
else
page_group_by_mobility_disabled = 0;
- pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n",
+ pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n",
nr_online_nodes,
page_group_by_mobility_disabled ? "off" : "on",
vm_total_pages);
@@ -5630,6 +5824,36 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
return false;
}
+static void __meminit __memmap_init_hotplug(unsigned long size, int nid,
+ unsigned long zone,
+ unsigned long start_pfn,
+ struct dev_pagemap *pgmap)
+{
+ unsigned long pfn = start_pfn + size;
+
+ while (pfn != start_pfn) {
+ unsigned long stride = pfn;
+
+ pfn = max(ALIGN_DOWN(pfn - 1, pageblock_nr_pages), start_pfn);
+ stride -= pfn;
+
+ /*
+ * The last argument of __init_pageblock is a boolean
+ * value indicating if the page will be marked as reserved.
+ *
+ * Mark page reserved as it will need to wait for onlining
+ * phase for it to be fully associated with a zone.
+ *
+ * Under certain circumstances ZONE_DEVICE pages may not
+ * need to be marked as reserved, however there is still
+ * code that is depending on this being set for now.
+ */
+ __init_pageblock(pfn, stride, zone, nid, pgmap, true);
+
+ cond_resched();
+ }
+}
+
/*
* Initially all pages are reserved - free ones are freed
* up by memblock_free_all() once the early boot process is
@@ -5640,49 +5864,59 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
struct vmem_altmap *altmap)
{
unsigned long pfn, end_pfn = start_pfn + size;
- struct page *page;
if (highest_memmap_pfn < end_pfn - 1)
highest_memmap_pfn = end_pfn - 1;
+ if (context == MEMMAP_HOTPLUG) {
#ifdef CONFIG_ZONE_DEVICE
- /*
- * Honor reservation requested by the driver for this ZONE_DEVICE
- * memory. We limit the total number of pages to initialize to just
- * those that might contain the memory mapping. We will defer the
- * ZONE_DEVICE page initialization until after we have released
- * the hotplug lock.
- */
- if (zone == ZONE_DEVICE) {
- if (!altmap)
- return;
+ /*
+ * Honor reservation requested by the driver for this
+ * ZONE_DEVICE memory. We limit the total number of pages to
+ * initialize to just those that might contain the memory
+ * mapping. We will defer the ZONE_DEVICE page initialization
+ * until after we have released the hotplug lock.
+ */
+ if (zone == ZONE_DEVICE) {
+ if (!altmap)
+ return;
+
+ if (start_pfn == altmap->base_pfn)
+ start_pfn += altmap->reserve;
+ end_pfn = altmap->base_pfn +
+ vmem_altmap_offset(altmap);
+ }
+#endif
+ /*
+ * For these ZONE_DEVICE pages we don't need to record the
+ * pgmap as they should represent only those pages used to
+ * store the memory map. The actual ZONE_DEVICE pages will
+ * be initialized later.
+ */
+ __memmap_init_hotplug(end_pfn - start_pfn, nid, zone,
+ start_pfn, NULL);
- if (start_pfn == altmap->base_pfn)
- start_pfn += altmap->reserve;
- end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
+ return;
}
-#endif
+
+ end_pfn = defer_pfn(nid, start_pfn, end_pfn, context);
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ struct page *page;
+
/*
* There can be holes in boot-time mem_map[]s handed to this
* function. They do not exist on hotplugged memory.
*/
- if (context == MEMMAP_EARLY) {
- if (!early_pfn_valid(pfn))
- continue;
- if (!early_pfn_in_nid(pfn, nid))
- continue;
- if (overlap_memmap_init(zone, &pfn))
- continue;
- if (defer_init(nid, pfn, end_pfn))
- break;
- }
+ if (!early_pfn_valid(pfn))
+ continue;
+ if (!early_pfn_in_nid(pfn, nid))
+ continue;
+ if (overlap_memmap_init(zone, &pfn))
+ continue;
page = pfn_to_page(pfn);
__init_single_page(page, pfn, zone, nid);
- if (context == MEMMAP_HOTPLUG)
- __SetPageReserved(page);
/*
* Mark the block movable so that blocks are reserved for
@@ -5709,7 +5943,6 @@ void __ref memmap_init_zone_device(struct zone *zone,
unsigned long size,
struct dev_pagemap *pgmap)
{
- unsigned long pfn, end_pfn = start_pfn + size;
struct pglist_data *pgdat = zone->zone_pgdat;
unsigned long zone_idx = zone_idx(zone);
unsigned long start = jiffies;
@@ -5725,53 +5958,13 @@ void __ref memmap_init_zone_device(struct zone *zone,
*/
if (pgmap->altmap_valid) {
struct vmem_altmap *altmap = &pgmap->altmap;
+ unsigned long end_pfn = start_pfn + size;
start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
size = end_pfn - start_pfn;
}
- for (pfn = start_pfn; pfn < end_pfn; pfn++) {
- struct page *page = pfn_to_page(pfn);
-
- __init_single_page(page, pfn, zone_idx, nid);
-
- /*
- * Mark page reserved as it will need to wait for onlining
- * phase for it to be fully associated with a zone.
- *
- * We can use the non-atomic __set_bit operation for setting
- * the flag as we are still initializing the pages.
- */
- __SetPageReserved(page);
-
- /*
- * ZONE_DEVICE pages union ->lru with a ->pgmap back
- * pointer and hmm_data. It is a bug if a ZONE_DEVICE
- * page is ever freed or placed on a driver-private list.
- */
- page->pgmap = pgmap;
- page->hmm_data = 0;
-
- /*
- * Mark the block movable so that blocks are reserved for
- * movable at startup. This will force kernel allocations
- * to reserve their blocks rather than leaking throughout
- * the address space during boot when many long-lived
- * kernel allocations are made.
- *
- * bitmap is created for zone's valid pfn range. but memmap
- * can be created for invalid pages (for alignment)
- * check here not to call set_pageblock_migratetype() against
- * pfn out of zone.
- *
- * Please note that MEMMAP_HOTPLUG path doesn't clear memmap
- * because this is done early in sparse_add_one_section
- */
- if (!(pfn & (pageblock_nr_pages - 1))) {
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- cond_resched();
- }
- }
+ __memmap_init_hotplug(size, nid, zone_idx, start_pfn, pgmap);
pr_info("%s initialised, %lu pages in %ums\n", dev_name(pgmap->dev),
size, jiffies_to_msecs(jiffies - start));
@@ -6004,7 +6197,7 @@ int __meminit __early_pfn_to_nid(unsigned long pfn,
return state->last_nid;
nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
- if (nid != -1) {
+ if (nid != NUMA_NO_NODE) {
state->last_start = start_pfn;
state->last_end = end_pfn;
state->last_nid = nid;
@@ -6202,7 +6395,7 @@ unsigned long __init __absent_pages_in_range(int nid,
* @start_pfn: The start PFN to start searching for holes
* @end_pfn: The end PFN to stop searching for holes
*
- * It returns the number of pages frames in memory holes within a range.
+ * Return: the number of page frames in memory holes within a range.
*/
unsigned long __init absent_pages_in_range(unsigned long start_pfn,
unsigned long end_pfn)
@@ -6364,10 +6557,14 @@ static void __ref setup_usemap(struct pglist_data *pgdat,
{
unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
zone->pageblock_flags = NULL;
- if (usemapsize)
+ if (usemapsize) {
zone->pageblock_flags =
memblock_alloc_node_nopanic(usemapsize,
pgdat->node_id);
+ if (!zone->pageblock_flags)
+ panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
+ usemapsize, zone->name, pgdat->node_id);
+ }
}
#else
static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
@@ -6597,6 +6794,9 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
end = ALIGN(end, MAX_ORDER_NR_PAGES);
size = (end - start) * sizeof(struct page);
map = memblock_alloc_node_nopanic(size, pgdat->node_id);
+ if (!map)
+ panic("Failed to allocate %ld bytes for node %d memory map\n",
+ size, pgdat->node_id);
pgdat->node_mem_map = map + offset;
}
pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
@@ -6752,14 +6952,14 @@ void __init setup_nr_node_ids(void)
* model has fine enough granularity to avoid incorrect mapping for the
* populated node map.
*
- * Returns the determined alignment in pfn's. 0 if there is no alignment
+ * Return: the determined alignment in pfn's. 0 if there is no alignment
* requirement (single node).
*/
unsigned long __init node_map_pfn_alignment(void)
{
unsigned long accl_mask = 0, last_end = 0;
unsigned long start, end, mask;
- int last_nid = -1;
+ int last_nid = NUMA_NO_NODE;
int i, nid;
for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
@@ -6807,7 +7007,7 @@ static unsigned long __init find_min_pfn_for_node(int nid)
/**
* find_min_pfn_with_active_regions - Find the minimum PFN registered
*
- * It returns the minimum PFN based on information provided via
+ * Return: the minimum PFN based on information provided via
* memblock_set_node().
*/
unsigned long __init find_min_pfn_with_active_regions(void)
@@ -7255,7 +7455,6 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char
return pages;
}
-EXPORT_SYMBOL(free_reserved_area);
#ifdef CONFIG_HIGHMEM
void free_highmem_page(struct page *page)
@@ -7484,7 +7683,7 @@ static void __setup_per_zone_wmarks(void)
* value here.
*
* The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
- * deltas control asynch page reclaim, and so should
+ * deltas control async page reclaim, and so should
* not be capped for highmem.
*/
unsigned long min_pages;
@@ -7961,7 +8160,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
/*
* Hugepages are not in LRU lists, but they're movable.
- * We need not scan over tail pages bacause we don't
+ * We need not scan over tail pages because we don't
* handle each tail page individually in migration.
*/
if (PageHuge(page)) {
@@ -8100,7 +8299,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
* pageblocks in the range. Once isolated, the pageblocks should not
* be modified by others.
*
- * Returns zero on success or negative error code. On success all
+ * Return: zero on success or negative error code. On success all
* pages which PFN is in [start, end) are allocated for the caller and
* need to be freed with free_contig_range().
*/
@@ -8184,7 +8383,6 @@ int alloc_contig_range(unsigned long start, unsigned long end,
*/
lru_add_drain_all();
- drain_all_pages(cc.zone);
order = 0;
outer_start = start;
@@ -8235,8 +8433,9 @@ done:
pfn_max_align_up(end), migratetype);
return ret;
}
+#endif
-void free_contig_range(unsigned long pfn, unsigned nr_pages)
+void free_contig_range(unsigned long pfn, unsigned int nr_pages)
{
unsigned int count = 0;
@@ -8248,7 +8447,6 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
}
WARN(count != 0, "%d pages are still in use!\n", count);
}
-#endif
#ifdef CONFIG_MEMORY_HOTPLUG
/*
@@ -8309,6 +8507,9 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
spin_lock_irqsave(&zone->lock, flags);
pfn = start_pfn;
while (pfn < end_pfn) {
+ struct free_area *area;
+ int mt;
+
if (!pfn_valid(pfn)) {
pfn++;
continue;
@@ -8327,13 +8528,13 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
BUG_ON(page_count(page));
BUG_ON(!PageBuddy(page));
order = page_order(page);
+ area = &zone->free_area[order];
#ifdef CONFIG_DEBUG_VM
pr_info("remove from free list %lx %d %lx\n",
pfn, 1 << order, end_pfn);
#endif
- list_del(&page->lru);
- rmv_page_order(page);
- zone->free_area[order].nr_free--;
+ mt = get_pageblock_migratetype(page);
+ del_page_from_free_area(page, area, mt);
for (i = 0; i < (1 << order); i++)
SetPageReserved((page+i));
pfn += (1 << order);
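
For illustration only — a minimal userspace sketch of the pattern the conversion above relies on (the shape of del_page_from_free_area() is assumed here; its real body lives outside this hunk): one helper that both unlinks an entry from its per-migratetype free list and keeps the nr_free counter in sync.

    #include <stdio.h>

    struct list_node {
        struct list_node *prev, *next;
    };

    struct free_area_sketch {
        struct list_node free_list;     /* one list per migratetype in the kernel */
        unsigned long nr_free;
    };

    static void del_from_free_area(struct list_node *entry,
                                   struct free_area_sketch *area)
    {
        entry->prev->next = entry->next;        /* unlink */
        entry->next->prev = entry->prev;
        entry->prev = entry->next = NULL;
        area->nr_free--;                        /* counter stays consistent */
    }

    int main(void)
    {
        struct free_area_sketch area = { .nr_free = 1 };
        struct list_node page;

        /* circular list holding a single free "page" */
        area.free_list.prev = area.free_list.next = &page;
        page.prev = page.next = &area.free_list;

        del_from_free_area(&page, &area);
        printf("nr_free = %lu\n", area.nr_free);        /* prints 0 */
        return 0;
    }
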
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 8c78b8d45117..762d5b7eb523 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -300,7 +300,7 @@ static int __meminit online_page_ext(unsigned long start_pfn,
start = SECTION_ALIGN_DOWN(start_pfn);
end = SECTION_ALIGN_UP(start_pfn + nr_pages);
- if (nid == -1) {
+ if (nid == NUMA_NO_NODE) {
/*
* In this case, "nid" already exists and contains valid memory.
* "start_pfn" passed to us is a pfn which is an arg for
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 28b06524939f..925b6f44a444 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -625,16 +625,14 @@ static const struct file_operations proc_page_owner_operations = {
static int __init pageowner_init(void)
{
- struct dentry *dentry;
-
if (!static_branch_unlikely(&page_owner_inited)) {
pr_info("page_owner is disabled\n");
return 0;
}
- dentry = debugfs_create_file("page_owner", 0400, NULL,
- NULL, &proc_page_owner_operations);
+ debugfs_create_file("page_owner", 0400, NULL, NULL,
+ &proc_page_owner_operations);
- return PTR_ERR_OR_ZERO(dentry);
+ return 0;
}
late_initcall(pageowner_init)
diff --git a/mm/page_poison.c b/mm/page_poison.c
index f0c15e9017c0..21d4f97cb49b 100644
--- a/mm/page_poison.c
+++ b/mm/page_poison.c
@@ -6,6 +6,7 @@
#include <linux/page_ext.h>
#include <linux/poison.h>
#include <linux/ratelimit.h>
+#include <linux/kasan.h>
static bool want_page_poisoning __read_mostly;
@@ -40,7 +41,10 @@ static void poison_page(struct page *page)
{
void *addr = kmap_atomic(page);
+ /* KASAN still thinks the page is in use, so skip it. */
+ kasan_disable_current();
memset(addr, PAGE_POISON, PAGE_SIZE);
+ kasan_enable_current();
kunmap_atomic(addr);
}
diff --git a/mm/readahead.c b/mm/readahead.c
index 1ae16522412a..a4593654a26c 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -81,6 +81,8 @@ static void read_cache_pages_invalidate_pages(struct address_space *mapping,
* @data: private data for the callback routine.
*
* Hides the details of the LRU cache etc from the filesystems.
+ *
+ * Returns: %0 on success, or the error returned by @filler otherwise
*/
int read_cache_pages(struct address_space *mapping, struct list_head *pages,
int (*filler)(void *, struct page *), void *data)
diff --git a/mm/shmem.c b/mm/shmem.c
index 6ece1e2fe76e..c8cdaa012f18 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -36,6 +36,7 @@
#include <linux/uio.h>
#include <linux/khugepaged.h>
#include <linux/hugetlb.h>
+#include <linux/frontswap.h>
#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
@@ -123,6 +124,10 @@ static unsigned long shmem_default_max_inodes(void)
static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
static int shmem_replace_page(struct page **pagep, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index);
+static int shmem_swapin_page(struct inode *inode, pgoff_t index,
+ struct page **pagep, enum sgp_type sgp,
+ gfp_t gfp, struct vm_area_struct *vma,
+ vm_fault_t *fault_type);
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
struct page **pagep, enum sgp_type sgp,
gfp_t gfp, struct vm_area_struct *vma,
@@ -1089,159 +1094,184 @@ static void shmem_evict_inode(struct inode *inode)
clear_inode(inode);
}
-static unsigned long find_swap_entry(struct xarray *xa, void *item)
+extern struct swap_info_struct *swap_info[];
+
+static int shmem_find_swap_entries(struct address_space *mapping,
+ pgoff_t start, unsigned int nr_entries,
+ struct page **entries, pgoff_t *indices,
+ bool frontswap)
{
- XA_STATE(xas, xa, 0);
- unsigned int checked = 0;
- void *entry;
+ XA_STATE(xas, &mapping->i_pages, start);
+ struct page *page;
+ unsigned int ret = 0;
+
+ if (!nr_entries)
+ return 0;
rcu_read_lock();
- xas_for_each(&xas, entry, ULONG_MAX) {
- if (xas_retry(&xas, entry))
+ xas_for_each(&xas, page, ULONG_MAX) {
+ if (xas_retry(&xas, page))
continue;
- if (entry == item)
- break;
- checked++;
- if ((checked % XA_CHECK_SCHED) != 0)
+
+ if (!xa_is_value(page))
continue;
- xas_pause(&xas);
- cond_resched_rcu();
+
+ if (frontswap) {
+ swp_entry_t entry = radix_to_swp_entry(page);
+
+ if (!frontswap_test(swap_info[swp_type(entry)],
+ swp_offset(entry)))
+ continue;
+ }
+
+ indices[ret] = xas.xa_index;
+ entries[ret] = page;
+
+ if (need_resched()) {
+ xas_pause(&xas);
+ cond_resched_rcu();
+ }
+ if (++ret == nr_entries)
+ break;
}
rcu_read_unlock();
- return entry ? xas.xa_index : -1;
+ return ret;
}
/*
- * If swap found in inode, free it and move page from swapcache to filecache.
+ * Move the swapped pages for an inode to page cache. Returns the count
+ * of pages swapped in, or the error in case of failure.
*/
-static int shmem_unuse_inode(struct shmem_inode_info *info,
- swp_entry_t swap, struct page **pagep)
+static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec,
+ pgoff_t *indices)
{
- struct address_space *mapping = info->vfs_inode.i_mapping;
- void *radswap;
- pgoff_t index;
- gfp_t gfp;
+ int i = 0;
+ int ret = 0;
int error = 0;
+ struct address_space *mapping = inode->i_mapping;
- radswap = swp_to_radix_entry(swap);
- index = find_swap_entry(&mapping->i_pages, radswap);
- if (index == -1)
- return -EAGAIN; /* tell shmem_unuse we found nothing */
-
- /*
- * Move _head_ to start search for next from here.
- * But be careful: shmem_evict_inode checks list_empty without taking
- * mutex, and there's an instant in list_move_tail when info->swaplist
- * would appear empty, if it were the only one on shmem_swaplist.
- */
- if (shmem_swaplist.next != &info->swaplist)
- list_move_tail(&shmem_swaplist, &info->swaplist);
+ for (i = 0; i < pvec.nr; i++) {
+ struct page *page = pvec.pages[i];
- gfp = mapping_gfp_mask(mapping);
- if (shmem_should_replace_page(*pagep, gfp)) {
- mutex_unlock(&shmem_swaplist_mutex);
- error = shmem_replace_page(pagep, gfp, info, index);
- mutex_lock(&shmem_swaplist_mutex);
- /*
- * We needed to drop mutex to make that restrictive page
- * allocation, but the inode might have been freed while we
- * dropped it: although a racing shmem_evict_inode() cannot
- * complete without emptying the page cache, our page lock
- * on this swapcache page is not enough to prevent that -
- * free_swap_and_cache() of our swap entry will only
- * trylock_page(), removing swap from page cache whatever.
- *
- * We must not proceed to shmem_add_to_page_cache() if the
- * inode has been freed, but of course we cannot rely on
- * inode or mapping or info to check that. However, we can
- * safely check if our swap entry is still in use (and here
- * it can't have got reused for another page): if it's still
- * in use, then the inode cannot have been freed yet, and we
- * can safely proceed (if it's no longer in use, that tells
- * nothing about the inode, but we don't need to unuse swap).
- */
- if (!page_swapcount(*pagep))
- error = -ENOENT;
+ if (!xa_is_value(page))
+ continue;
+ error = shmem_swapin_page(inode, indices[i],
+ &page, SGP_CACHE,
+ mapping_gfp_mask(mapping),
+ NULL, NULL);
+ if (error == 0) {
+ unlock_page(page);
+ put_page(page);
+ ret++;
+ }
+ if (error == -ENOMEM)
+ break;
+ error = 0;
}
+ return error ? error : ret;
+}
- /*
- * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
- * but also to hold up shmem_evict_inode(): so inode cannot be freed
- * beneath us (pagelock doesn't help until the page is in pagecache).
- */
- if (!error)
- error = shmem_add_to_page_cache(*pagep, mapping, index,
- radswap, gfp);
- if (error != -ENOMEM) {
- /*
- * Truncation and eviction use free_swap_and_cache(), which
- * only does trylock page: if we raced, best clean up here.
- */
- delete_from_swap_cache(*pagep);
- set_page_dirty(*pagep);
- if (!error) {
- spin_lock_irq(&info->lock);
- info->swapped--;
- spin_unlock_irq(&info->lock);
- swap_free(swap);
+/*
+ * If swap found in inode, free it and move page from swapcache to filecache.
+ */
+static int shmem_unuse_inode(struct inode *inode, unsigned int type,
+ bool frontswap, unsigned long *fs_pages_to_unuse)
+{
+ struct address_space *mapping = inode->i_mapping;
+ pgoff_t start = 0;
+ struct pagevec pvec;
+ pgoff_t indices[PAGEVEC_SIZE];
+ bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0);
+ int ret = 0;
+
+ pagevec_init(&pvec);
+ do {
+ unsigned int nr_entries = PAGEVEC_SIZE;
+
+ if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE)
+ nr_entries = *fs_pages_to_unuse;
+
+ pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries,
+ pvec.pages, indices,
+ frontswap);
+ if (pvec.nr == 0) {
+ ret = 0;
+ break;
}
- }
- return error;
+
+ ret = shmem_unuse_swap_entries(inode, pvec, indices);
+ if (ret < 0)
+ break;
+
+ if (frontswap_partial) {
+ *fs_pages_to_unuse -= ret;
+ if (*fs_pages_to_unuse == 0) {
+ ret = FRONTSWAP_PAGES_UNUSED;
+ break;
+ }
+ }
+
+ start = indices[pvec.nr - 1];
+ } while (true);
+
+ return ret;
}
/*
- * Search through swapped inodes to find and replace swap by page.
+ * Read all the shared memory data that resides in the swap
+ * device 'type' back into memory, so the swap device can be
+ * unused.
*/
-int shmem_unuse(swp_entry_t swap, struct page *page)
+int shmem_unuse(unsigned int type, bool frontswap,
+ unsigned long *fs_pages_to_unuse)
{
- struct list_head *this, *next;
- struct shmem_inode_info *info;
- struct mem_cgroup *memcg;
+ struct shmem_inode_info *info, *next;
+ struct inode *inode;
+ struct inode *prev_inode = NULL;
int error = 0;
- /*
- * There's a faint possibility that swap page was replaced before
- * caller locked it: caller will come back later with the right page.
- */
- if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
- goto out;
+ if (list_empty(&shmem_swaplist))
+ return 0;
+
+ mutex_lock(&shmem_swaplist_mutex);
/*
- * Charge page using GFP_KERNEL while we can wait, before taking
- * the shmem_swaplist_mutex which might hold up shmem_writepage().
- * Charged back to the user (not to caller) when swap account is used.
+ * The extra refcount on the inode is necessary to safely dereference
+ * p->next after re-acquiring the lock. New shmem inodes with swap
+ * get added to the end of the list and we will scan them all.
*/
- error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL,
- &memcg, false);
- if (error)
- goto out;
- /* No memory allocation: swap entry occupies the slot for the page */
- error = -EAGAIN;
-
- mutex_lock(&shmem_swaplist_mutex);
- list_for_each_safe(this, next, &shmem_swaplist) {
- info = list_entry(this, struct shmem_inode_info, swaplist);
- if (info->swapped)
- error = shmem_unuse_inode(info, swap, &page);
- else
+ list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
+ if (!info->swapped) {
list_del_init(&info->swaplist);
+ continue;
+ }
+
+ inode = igrab(&info->vfs_inode);
+ if (!inode)
+ continue;
+
+ mutex_unlock(&shmem_swaplist_mutex);
+ if (prev_inode)
+ iput(prev_inode);
+ prev_inode = inode;
+
+ error = shmem_unuse_inode(inode, type, frontswap,
+ fs_pages_to_unuse);
cond_resched();
- if (error != -EAGAIN)
+
+ mutex_lock(&shmem_swaplist_mutex);
+ next = list_next_entry(info, swaplist);
+ if (!info->swapped)
+ list_del_init(&info->swaplist);
+ if (error)
break;
- /* found nothing in this: move on to search the next */
}
mutex_unlock(&shmem_swaplist_mutex);
- if (error) {
- if (error != -ENOMEM)
- error = 0;
- mem_cgroup_cancel_charge(page, memcg, false);
- } else
- mem_cgroup_commit_charge(page, memcg, true, false);
-out:
- unlock_page(page);
- put_page(page);
+ if (prev_inode)
+ iput(prev_inode);
+
return error;
}
@@ -1325,7 +1355,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
*/
mutex_lock(&shmem_swaplist_mutex);
if (list_empty(&info->swaplist))
- list_add_tail(&info->swaplist, &shmem_swaplist);
+ list_add(&info->swaplist, &shmem_swaplist);
if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
spin_lock_irq(&info->lock);
@@ -1576,6 +1606,116 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
}
/*
+ * Swap in the page pointed to by *pagep.
+ * Caller has to make sure that *pagep contains a valid swapped page.
+ * Returns 0 and the page in *pagep on success. On failure, returns the
+ * error code and NULL in *pagep.
+ */
+static int shmem_swapin_page(struct inode *inode, pgoff_t index,
+ struct page **pagep, enum sgp_type sgp,
+ gfp_t gfp, struct vm_area_struct *vma,
+ vm_fault_t *fault_type)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm;
+ struct mem_cgroup *memcg;
+ struct page *page;
+ swp_entry_t swap;
+ int error;
+
+ VM_BUG_ON(!*pagep || !xa_is_value(*pagep));
+ swap = radix_to_swp_entry(*pagep);
+ *pagep = NULL;
+
+ /* Look it up and read it in.. */
+ page = lookup_swap_cache(swap, NULL, 0);
+ if (!page) {
+ /* Or update major stats only when swapin succeeds?? */
+ if (fault_type) {
+ *fault_type |= VM_FAULT_MAJOR;
+ count_vm_event(PGMAJFAULT);
+ count_memcg_event_mm(charge_mm, PGMAJFAULT);
+ }
+ /* Here we actually start the io */
+ page = shmem_swapin(swap, gfp, info, index);
+ if (!page) {
+ error = -ENOMEM;
+ goto failed;
+ }
+ }
+
+ /* We have to do this with page locked to prevent races */
+ lock_page(page);
+ if (!PageSwapCache(page) || page_private(page) != swap.val ||
+ !shmem_confirm_swap(mapping, index, swap)) {
+ error = -EEXIST;
+ goto unlock;
+ }
+ if (!PageUptodate(page)) {
+ error = -EIO;
+ goto failed;
+ }
+ wait_on_page_writeback(page);
+
+ if (shmem_should_replace_page(page, gfp)) {
+ error = shmem_replace_page(&page, gfp, info, index);
+ if (error)
+ goto failed;
+ }
+
+ error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
+ false);
+ if (!error) {
+ error = shmem_add_to_page_cache(page, mapping, index,
+ swp_to_radix_entry(swap), gfp);
+ /*
+ * We already confirmed swap under page lock, and make
+ * no memory allocation here, so usually no possibility
+ * of error; but free_swap_and_cache() only trylocks a
+ * page, so it is just possible that the entry has been
+ * truncated or holepunched since swap was confirmed.
+ * shmem_undo_range() will have done some of the
+ * unaccounting, now delete_from_swap_cache() will do
+ * the rest.
+ */
+ if (error) {
+ mem_cgroup_cancel_charge(page, memcg, false);
+ delete_from_swap_cache(page);
+ }
+ }
+ if (error)
+ goto failed;
+
+ mem_cgroup_commit_charge(page, memcg, true, false);
+
+ spin_lock_irq(&info->lock);
+ info->swapped--;
+ shmem_recalc_inode(inode);
+ spin_unlock_irq(&info->lock);
+
+ if (sgp == SGP_WRITE)
+ mark_page_accessed(page);
+
+ delete_from_swap_cache(page);
+ set_page_dirty(page);
+ swap_free(swap);
+
+ *pagep = page;
+ return 0;
+failed:
+ if (!shmem_confirm_swap(mapping, index, swap))
+ error = -EEXIST;
+unlock:
+ if (page) {
+ unlock_page(page);
+ put_page(page);
+ }
+
+ return error;
+}
+
+/*
* shmem_getpage_gfp - find page in cache, or get from swap, or allocate
*
* If we allocate a new one we do not mark it dirty. That's up to the
@@ -1596,7 +1736,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
struct mm_struct *charge_mm;
struct mem_cgroup *memcg;
struct page *page;
- swp_entry_t swap;
enum sgp_type sgp_huge = sgp;
pgoff_t hindex = index;
int error;
@@ -1608,17 +1747,23 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
if (sgp == SGP_NOHUGE || sgp == SGP_HUGE)
sgp = SGP_CACHE;
repeat:
- swap.val = 0;
+ if (sgp <= SGP_CACHE &&
+ ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
+ return -EINVAL;
+ }
+
+ sbinfo = SHMEM_SB(inode->i_sb);
+ charge_mm = vma ? vma->vm_mm : current->mm;
+
page = find_lock_entry(mapping, index);
if (xa_is_value(page)) {
- swap = radix_to_swp_entry(page);
- page = NULL;
- }
+ error = shmem_swapin_page(inode, index, &page,
+ sgp, gfp, vma, fault_type);
+ if (error == -EEXIST)
+ goto repeat;
- if (sgp <= SGP_CACHE &&
- ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
- error = -EINVAL;
- goto unlock;
+ *pagep = page;
+ return error;
}
if (page && sgp == SGP_WRITE)
@@ -1632,7 +1777,7 @@ repeat:
put_page(page);
page = NULL;
}
- if (page || (sgp == SGP_READ && !swap.val)) {
+ if (page || sgp == SGP_READ) {
*pagep = page;
return 0;
}
@@ -1641,215 +1786,138 @@ repeat:
* Fast cache lookup did not find it:
* bring it back from swap or allocate.
*/
- sbinfo = SHMEM_SB(inode->i_sb);
- charge_mm = vma ? vma->vm_mm : current->mm;
- if (swap.val) {
- /* Look it up and read it in.. */
- page = lookup_swap_cache(swap, NULL, 0);
- if (!page) {
- /* Or update major stats only when swapin succeeds?? */
- if (fault_type) {
- *fault_type |= VM_FAULT_MAJOR;
- count_vm_event(PGMAJFAULT);
- count_memcg_event_mm(charge_mm, PGMAJFAULT);
- }
- /* Here we actually start the io */
- page = shmem_swapin(swap, gfp, info, index);
- if (!page) {
- error = -ENOMEM;
- goto failed;
- }
- }
-
- /* We have to do this with page locked to prevent races */
- lock_page(page);
- if (!PageSwapCache(page) || page_private(page) != swap.val ||
- !shmem_confirm_swap(mapping, index, swap)) {
- error = -EEXIST; /* try again */
- goto unlock;
- }
- if (!PageUptodate(page)) {
- error = -EIO;
- goto failed;
- }
- wait_on_page_writeback(page);
-
- if (shmem_should_replace_page(page, gfp)) {
- error = shmem_replace_page(&page, gfp, info, index);
- if (error)
- goto failed;
- }
-
- error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
- false);
- if (!error) {
- error = shmem_add_to_page_cache(page, mapping, index,
- swp_to_radix_entry(swap), gfp);
- /*
- * We already confirmed swap under page lock, and make
- * no memory allocation here, so usually no possibility
- * of error; but free_swap_and_cache() only trylocks a
- * page, so it is just possible that the entry has been
- * truncated or holepunched since swap was confirmed.
- * shmem_undo_range() will have done some of the
- * unaccounting, now delete_from_swap_cache() will do
- * the rest.
- * Reset swap.val? No, leave it so "failed" goes back to
- * "repeat": reading a hole and writing should succeed.
- */
- if (error) {
- mem_cgroup_cancel_charge(page, memcg, false);
- delete_from_swap_cache(page);
- }
- }
- if (error)
- goto failed;
-
- mem_cgroup_commit_charge(page, memcg, true, false);
-
- spin_lock_irq(&info->lock);
- info->swapped--;
- shmem_recalc_inode(inode);
- spin_unlock_irq(&info->lock);
-
- if (sgp == SGP_WRITE)
- mark_page_accessed(page);
-
- delete_from_swap_cache(page);
- set_page_dirty(page);
- swap_free(swap);
-
- } else {
- if (vma && userfaultfd_missing(vma)) {
- *fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
- return 0;
- }
+ if (vma && userfaultfd_missing(vma)) {
+ *fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
+ return 0;
+ }
- /* shmem_symlink() */
- if (mapping->a_ops != &shmem_aops)
- goto alloc_nohuge;
- if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE)
- goto alloc_nohuge;
- if (shmem_huge == SHMEM_HUGE_FORCE)
+ /* shmem_symlink() */
+ if (mapping->a_ops != &shmem_aops)
+ goto alloc_nohuge;
+ if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE)
+ goto alloc_nohuge;
+ if (shmem_huge == SHMEM_HUGE_FORCE)
+ goto alloc_huge;
+ switch (sbinfo->huge) {
+ loff_t i_size;
+ pgoff_t off;
+ case SHMEM_HUGE_NEVER:
+ goto alloc_nohuge;
+ case SHMEM_HUGE_WITHIN_SIZE:
+ off = round_up(index, HPAGE_PMD_NR);
+ i_size = round_up(i_size_read(inode), PAGE_SIZE);
+ if (i_size >= HPAGE_PMD_SIZE &&
+ i_size >> PAGE_SHIFT >= off)
goto alloc_huge;
- switch (sbinfo->huge) {
- loff_t i_size;
- pgoff_t off;
- case SHMEM_HUGE_NEVER:
- goto alloc_nohuge;
- case SHMEM_HUGE_WITHIN_SIZE:
- off = round_up(index, HPAGE_PMD_NR);
- i_size = round_up(i_size_read(inode), PAGE_SIZE);
- if (i_size >= HPAGE_PMD_SIZE &&
- i_size >> PAGE_SHIFT >= off)
- goto alloc_huge;
- /* fallthrough */
- case SHMEM_HUGE_ADVISE:
- if (sgp_huge == SGP_HUGE)
- goto alloc_huge;
- /* TODO: implement fadvise() hints */
- goto alloc_nohuge;
- }
+ /* fallthrough */
+ case SHMEM_HUGE_ADVISE:
+ if (sgp_huge == SGP_HUGE)
+ goto alloc_huge;
+ /* TODO: implement fadvise() hints */
+ goto alloc_nohuge;
+ }
alloc_huge:
- page = shmem_alloc_and_acct_page(gfp, inode, index, true);
- if (IS_ERR(page)) {
-alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode,
- index, false);
- }
- if (IS_ERR(page)) {
- int retry = 5;
- error = PTR_ERR(page);
- page = NULL;
- if (error != -ENOSPC)
- goto failed;
- /*
- * Try to reclaim some spece by splitting a huge page
- * beyond i_size on the filesystem.
- */
- while (retry--) {
- int ret;
- ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
- if (ret == SHRINK_STOP)
- break;
- if (ret)
- goto alloc_nohuge;
- }
- goto failed;
- }
-
- if (PageTransHuge(page))
- hindex = round_down(index, HPAGE_PMD_NR);
- else
- hindex = index;
+ page = shmem_alloc_and_acct_page(gfp, inode, index, true);
+ if (IS_ERR(page)) {
+alloc_nohuge:
+ page = shmem_alloc_and_acct_page(gfp, inode,
+ index, false);
+ }
+ if (IS_ERR(page)) {
+ int retry = 5;
- if (sgp == SGP_WRITE)
- __SetPageReferenced(page);
+ error = PTR_ERR(page);
+ page = NULL;
+ if (error != -ENOSPC)
+ goto unlock;
+ /*
+ * Try to reclaim some space by splitting a huge page
+ * beyond i_size on the filesystem.
+ */
+ while (retry--) {
+ int ret;
- error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
- PageTransHuge(page));
- if (error)
- goto unacct;
- error = shmem_add_to_page_cache(page, mapping, hindex,
- NULL, gfp & GFP_RECLAIM_MASK);
- if (error) {
- mem_cgroup_cancel_charge(page, memcg,
- PageTransHuge(page));
- goto unacct;
+ ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
+ if (ret == SHRINK_STOP)
+ break;
+ if (ret)
+ goto alloc_nohuge;
}
- mem_cgroup_commit_charge(page, memcg, false,
- PageTransHuge(page));
- lru_cache_add_anon(page);
+ goto unlock;
+ }
- spin_lock_irq(&info->lock);
- info->alloced += 1 << compound_order(page);
- inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
- shmem_recalc_inode(inode);
- spin_unlock_irq(&info->lock);
- alloced = true;
+ if (PageTransHuge(page))
+ hindex = round_down(index, HPAGE_PMD_NR);
+ else
+ hindex = index;
- if (PageTransHuge(page) &&
- DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
- hindex + HPAGE_PMD_NR - 1) {
- /*
- * Part of the huge page is beyond i_size: subject
- * to shrink under memory pressure.
- */
- spin_lock(&sbinfo->shrinklist_lock);
- /*
- * _careful to defend against unlocked access to
- * ->shrink_list in shmem_unused_huge_shrink()
- */
- if (list_empty_careful(&info->shrinklist)) {
- list_add_tail(&info->shrinklist,
- &sbinfo->shrinklist);
- sbinfo->shrinklist_len++;
- }
- spin_unlock(&sbinfo->shrinklist_lock);
- }
+ if (sgp == SGP_WRITE)
+ __SetPageReferenced(page);
+
+ error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
+ PageTransHuge(page));
+ if (error)
+ goto unacct;
+ error = shmem_add_to_page_cache(page, mapping, hindex,
+ NULL, gfp & GFP_RECLAIM_MASK);
+ if (error) {
+ mem_cgroup_cancel_charge(page, memcg,
+ PageTransHuge(page));
+ goto unacct;
+ }
+ mem_cgroup_commit_charge(page, memcg, false,
+ PageTransHuge(page));
+ lru_cache_add_anon(page);
+ spin_lock_irq(&info->lock);
+ info->alloced += 1 << compound_order(page);
+ inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
+ shmem_recalc_inode(inode);
+ spin_unlock_irq(&info->lock);
+ alloced = true;
+
+ if (PageTransHuge(page) &&
+ DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
+ hindex + HPAGE_PMD_NR - 1) {
/*
- * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
+ * Part of the huge page is beyond i_size: subject
+ * to shrink under memory pressure.
*/
- if (sgp == SGP_FALLOC)
- sgp = SGP_WRITE;
-clear:
+ spin_lock(&sbinfo->shrinklist_lock);
/*
- * Let SGP_WRITE caller clear ends if write does not fill page;
- * but SGP_FALLOC on a page fallocated earlier must initialize
- * it now, lest undo on failure cancel our earlier guarantee.
+ * _careful to defend against unlocked access to
+ * ->shrink_list in shmem_unused_huge_shrink()
*/
- if (sgp != SGP_WRITE && !PageUptodate(page)) {
- struct page *head = compound_head(page);
- int i;
+ if (list_empty_careful(&info->shrinklist)) {
+ list_add_tail(&info->shrinklist,
+ &sbinfo->shrinklist);
+ sbinfo->shrinklist_len++;
+ }
+ spin_unlock(&sbinfo->shrinklist_lock);
+ }
- for (i = 0; i < (1 << compound_order(head)); i++) {
- clear_highpage(head + i);
- flush_dcache_page(head + i);
- }
- SetPageUptodate(head);
+ /*
+ * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
+ */
+ if (sgp == SGP_FALLOC)
+ sgp = SGP_WRITE;
+clear:
+ /*
+ * Let SGP_WRITE caller clear ends if write does not fill page;
+ * but SGP_FALLOC on a page fallocated earlier must initialize
+ * it now, lest undo on failure cancel our earlier guarantee.
+ */
+ if (sgp != SGP_WRITE && !PageUptodate(page)) {
+ struct page *head = compound_head(page);
+ int i;
+
+ for (i = 0; i < (1 << compound_order(head)); i++) {
+ clear_highpage(head + i);
+ flush_dcache_page(head + i);
}
+ SetPageUptodate(head);
}
/* Perhaps the file has been truncated since we checked */
@@ -1879,9 +1947,6 @@ unacct:
put_page(page);
goto alloc_nohuge;
}
-failed:
- if (swap.val && !shmem_confirm_swap(mapping, index, swap))
- error = -EEXIST;
unlock:
if (page) {
unlock_page(page);
@@ -2125,6 +2190,24 @@ out_nomem:
static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{
+ struct shmem_inode_info *info = SHMEM_I(file_inode(file));
+
+ if (info->seals & F_SEAL_FUTURE_WRITE) {
+ /*
+ * New PROT_WRITE and MAP_SHARED mmaps are not allowed when the
+ * "future write" seal is active.
+ */
+ if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
+ return -EPERM;
+
+ /*
+ * Since the F_SEAL_FUTURE_WRITE seals allow for a MAP_SHARED
+ * read-only mapping, take care to not allow mprotect to revert
+ * protections.
+ */
+ vma->vm_flags &= ~(VM_MAYWRITE);
+ }
+
file_accessed(file);
vma->vm_ops = &shmem_vm_ops;
if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
@@ -2375,8 +2458,9 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
pgoff_t index = pos >> PAGE_SHIFT;
/* i_mutex is held by caller */
- if (unlikely(info->seals & (F_SEAL_WRITE | F_SEAL_GROW))) {
- if (info->seals & F_SEAL_WRITE)
+ if (unlikely(info->seals & (F_SEAL_GROW |
+ F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
+ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
return -EPERM;
if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
return -EPERM;
@@ -2639,7 +2723,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
/* protected by i_mutex */
- if (info->seals & F_SEAL_WRITE) {
+ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
error = -EPERM;
goto out;
}
@@ -3843,7 +3927,8 @@ int __init shmem_init(void)
return 0;
}
-int shmem_unuse(swp_entry_t swap, struct page *page)
+int shmem_unuse(unsigned int type, bool frontswap,
+ unsigned long *fs_pages_to_unuse)
{
return 0;
}
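
As a usage note for the F_SEAL_FUTURE_WRITE checks added to shmem_mmap() and shmem_write_begin() above, here is a minimal userspace sketch of the expected behaviour (assumes a libc with memfd_create(); the local #define is only a fallback for headers that predate the seal): once the seal is set, a new shared writable mapping is refused while a read-only one still succeeds.

    #define _GNU_SOURCE
    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    #ifndef F_SEAL_FUTURE_WRITE
    #define F_SEAL_FUTURE_WRITE 0x0010      /* assumed value for older headers */
    #endif

    int main(void)
    {
        int fd = memfd_create("sealed", MFD_ALLOW_SEALING);
        void *rw, *ro;

        if (fd < 0) {
            perror("memfd_create");
            return 1;
        }
        if (ftruncate(fd, 4096) < 0 || write(fd, "hello", 5) < 0)
            perror("setup");

        /* Existing writers keep their mappings; only *new* write access
         * is refused once the seal is in place. */
        if (fcntl(fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE) < 0)
            perror("F_ADD_SEALS");

        rw = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        printf("new shared writable mmap:  %s\n",
               rw == MAP_FAILED ? strerror(errno) : "unexpectedly succeeded");

        ro = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
        printf("new shared read-only mmap: %s\n",
               ro == MAP_FAILED ? strerror(errno) : "ok");
        return 0;
    }
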
diff --git a/mm/shuffle.c b/mm/shuffle.c
new file mode 100644
index 000000000000..3ce12481b1dc
--- /dev/null
+++ b/mm/shuffle.c
@@ -0,0 +1,207 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright(c) 2018 Intel Corporation. All rights reserved.
+
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/mmzone.h>
+#include <linux/random.h>
+#include <linux/moduleparam.h>
+#include "internal.h"
+#include "shuffle.h"
+
+DEFINE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
+static unsigned long shuffle_state __ro_after_init;
+
+/*
+ * Depending on the architecture, module parameter parsing may run
+ * before or after the cache detection. SHUFFLE_FORCE_DISABLE prevents,
+ * or reverts, the enabling of the shuffle implementation. SHUFFLE_ENABLE
+ * attempts to turn on the implementation, but aborts if it finds
+ * SHUFFLE_FORCE_DISABLE already set.
+ */
+__meminit void page_alloc_shuffle(enum mm_shuffle_ctl ctl)
+{
+ if (ctl == SHUFFLE_FORCE_DISABLE)
+ set_bit(SHUFFLE_FORCE_DISABLE, &shuffle_state);
+
+ if (test_bit(SHUFFLE_FORCE_DISABLE, &shuffle_state)) {
+ if (test_and_clear_bit(SHUFFLE_ENABLE, &shuffle_state))
+ static_branch_disable(&page_alloc_shuffle_key);
+ } else if (ctl == SHUFFLE_ENABLE
+ && !test_and_set_bit(SHUFFLE_ENABLE, &shuffle_state))
+ static_branch_enable(&page_alloc_shuffle_key);
+}
+
+static bool shuffle_param;
+extern int shuffle_show(char *buffer, const struct kernel_param *kp)
+{
+ return sprintf(buffer, "%c\n", test_bit(SHUFFLE_ENABLE, &shuffle_state)
+ ? 'Y' : 'N');
+}
+
+static __meminit int shuffle_store(const char *val,
+ const struct kernel_param *kp)
+{
+ int rc = param_set_bool(val, kp);
+
+ if (rc < 0)
+ return rc;
+ if (shuffle_param)
+ page_alloc_shuffle(SHUFFLE_ENABLE);
+ else
+ page_alloc_shuffle(SHUFFLE_FORCE_DISABLE);
+ return 0;
+}
+module_param_call(shuffle, shuffle_store, shuffle_show, &shuffle_param, 0400);
+
+/*
+ * For two pages to be swapped in the shuffle, they must be free (on a
+ * 'free_area' lru), have the same order, and have the same migratetype.
+ */
+static struct page * __meminit shuffle_valid_page(unsigned long pfn, int order)
+{
+ struct page *page;
+
+ /*
+ * Given we're dealing with randomly selected pfns in a zone we
+ * need to ask questions like...
+ */
+
+ /* ...is the pfn even in the memmap? */
+ if (!pfn_valid_within(pfn))
+ return NULL;
+
+ /* ...is the pfn in a present section or a hole? */
+ if (!pfn_present(pfn))
+ return NULL;
+
+ /* ...is the page free and currently on a free_area list? */
+ page = pfn_to_page(pfn);
+ if (!PageBuddy(page))
+ return NULL;
+
+ /*
+ * ...is the page on the same list as the page we will
+ * shuffle it with?
+ */
+ if (page_order(page) != order)
+ return NULL;
+
+ return page;
+}
+
+/*
+ * Fisher-Yates shuffle the freelist which prescribes iterating through an
+ * array, pfns in this case, and randomly swapping each entry with another in
+ * the span, end_pfn - start_pfn.
+ *
+ * To keep the implementation simple it does not attempt to correct for sources
+ * of bias in the distribution, like modulo bias or pseudo-random number
+ * generator bias. I.e. the expectation is that this shuffling raises the bar
+ * for attacks that exploit the predictability of page allocations, but need not
+ * be a perfect shuffle.
+ */
+#define SHUFFLE_RETRY 10
+void __meminit __shuffle_zone(struct zone *z)
+{
+ unsigned long i, flags;
+ unsigned long start_pfn = z->zone_start_pfn;
+ unsigned long end_pfn = zone_end_pfn(z);
+ const int order = SHUFFLE_ORDER;
+ const int order_pages = 1 << order;
+
+ spin_lock_irqsave(&z->lock, flags);
+ start_pfn = ALIGN(start_pfn, order_pages);
+ for (i = start_pfn; i < end_pfn; i += order_pages) {
+ unsigned long j;
+ int migratetype, retry;
+ struct page *page_i, *page_j;
+
+ /*
+ * We expect page_i, in the sub-range of a zone being added
+ * (@start_pfn to @end_pfn), to more likely be valid compared to
+ * page_j randomly selected in the span @zone_start_pfn to
+ * @spanned_pages.
+ */
+ page_i = shuffle_valid_page(i, order);
+ if (!page_i)
+ continue;
+
+ for (retry = 0; retry < SHUFFLE_RETRY; retry++) {
+ /*
+ * Pick a random order aligned page in the zone span as
+ * a swap target. If the selected pfn is a hole, retry
+ * up to SHUFFLE_RETRY attempts to find a random valid pfn
+ * in the zone.
+ */
+ j = z->zone_start_pfn +
+ ALIGN_DOWN(get_random_long() % z->spanned_pages,
+ order_pages);
+ page_j = shuffle_valid_page(j, order);
+ if (page_j && page_j != page_i)
+ break;
+ }
+ if (retry >= SHUFFLE_RETRY) {
+ pr_debug("%s: failed to swap %#lx\n", __func__, i);
+ continue;
+ }
+
+ /*
+ * Each migratetype corresponds to its own list; make sure the
+ * types match, otherwise we're moving pages to lists where they
+ * do not belong.
+ */
+ migratetype = get_pageblock_migratetype(page_i);
+ if (get_pageblock_migratetype(page_j) != migratetype) {
+ pr_debug("%s: migratetype mismatch %#lx\n", __func__, i);
+ continue;
+ }
+
+ list_swap(&page_i->lru, &page_j->lru);
+
+ pr_debug("%s: swap: %#lx -> %#lx\n", __func__, i, j);
+
+ /* take it easy on the zone lock */
+ if ((i % (100 * order_pages)) == 0) {
+ spin_unlock_irqrestore(&z->lock, flags);
+ cond_resched();
+ spin_lock_irqsave(&z->lock, flags);
+ }
+ }
+ spin_unlock_irqrestore(&z->lock, flags);
+}
+
+/**
+ * shuffle_free_memory - reduce the predictability of the page allocator
+ * @pgdat: node page data
+ */
+void __meminit __shuffle_free_memory(pg_data_t *pgdat)
+{
+ struct zone *z;
+
+ for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
+ shuffle_zone(z);
+}
+
+void add_to_free_area_random(struct page *page, struct free_area *area,
+ int migratetype)
+{
+ static u64 rand;
+ static u8 rand_bits;
+
+ /*
+ * The lack of locking is deliberate. If 2 threads race to
+ * update the rand state it just adds to the entropy.
+ */
+ if (rand_bits == 0) {
+ rand_bits = 64;
+ rand = get_random_u64();
+ }
+
+ if (rand & 1)
+ add_to_free_area(page, area, migratetype);
+ else
+ add_to_free_area_tail(page, area, migratetype);
+ rand_bits--;
+ rand >>= 1;
+}
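
A self-contained userspace sketch of the plain Fisher-Yates shuffle named in the comment above (illustrative only: the kernel variant swaps buddy pages under the zone lock, picks its partner anywhere in the zone span, and retries on holes via shuffle_valid_page()).

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>

    /* Swap each index with a randomly chosen index at or after it. Modulo
     * bias from rand() is ignored, mirroring the "need not be a perfect
     * shuffle" note in __shuffle_zone(). */
    static void fisher_yates(unsigned long *pfn, size_t n)
    {
        size_t i;

        for (i = 0; i + 1 < n; i++) {
            size_t j = i + (size_t)rand() % (n - i);
            unsigned long tmp = pfn[i];

            pfn[i] = pfn[j];
            pfn[j] = tmp;
        }
    }

    int main(void)
    {
        unsigned long pfn[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
        int i;

        srand((unsigned int)time(NULL));
        fisher_yates(pfn, 8);
        for (i = 0; i < 8; i++)
            printf("%lu ", pfn[i]);
        printf("\n");
        return 0;
    }
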
diff --git a/mm/shuffle.h b/mm/shuffle.h
new file mode 100644
index 000000000000..777a257a0d2f
--- /dev/null
+++ b/mm/shuffle.h
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright(c) 2018 Intel Corporation. All rights reserved.
+#ifndef _MM_SHUFFLE_H
+#define _MM_SHUFFLE_H
+#include <linux/jump_label.h>
+
+/*
+ * SHUFFLE_ENABLE is called from the command line enabling path, or by
+ * platform-firmware enabling that indicates the presence of a
+ * direct-mapped memory-side-cache. SHUFFLE_FORCE_DISABLE is called from
+ * the command line path and overrides any previous or future
+ * SHUFFLE_ENABLE.
+ */
+enum mm_shuffle_ctl {
+ SHUFFLE_ENABLE,
+ SHUFFLE_FORCE_DISABLE,
+};
+
+#define SHUFFLE_ORDER (MAX_ORDER-1)
+
+#ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR
+DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
+extern void page_alloc_shuffle(enum mm_shuffle_ctl ctl);
+extern void __shuffle_free_memory(pg_data_t *pgdat);
+static inline void shuffle_free_memory(pg_data_t *pgdat)
+{
+ if (!static_branch_unlikely(&page_alloc_shuffle_key))
+ return;
+ __shuffle_free_memory(pgdat);
+}
+
+extern void __shuffle_zone(struct zone *z);
+static inline void shuffle_zone(struct zone *z)
+{
+ if (!static_branch_unlikely(&page_alloc_shuffle_key))
+ return;
+ __shuffle_zone(z);
+}
+
+static inline bool is_shuffle_order(int order)
+{
+ if (!static_branch_unlikely(&page_alloc_shuffle_key))
+ return false;
+ return order >= SHUFFLE_ORDER;
+}
+#else
+static inline void shuffle_free_memory(pg_data_t *pgdat)
+{
+}
+
+static inline void shuffle_zone(struct zone *z)
+{
+}
+
+static inline void page_alloc_shuffle(enum mm_shuffle_ctl ctl)
+{
+}
+
+static inline bool is_shuffle_order(int order)
+{
+ return false;
+}
+#endif
+#endif /* _MM_SHUFFLE_H */
diff --git a/mm/slab.c b/mm/slab.c
index 78eb8c5bf4e4..7f4b31cd2565 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -550,14 +550,6 @@ static void start_cpu_timer(int cpu)
static void init_arraycache(struct array_cache *ac, int limit, int batch)
{
- /*
- * The array_cache structures contain pointers to free object.
- * However, when such objects are allocated or transferred to another
- * cache the pointers are not cleared and they could be counted as
- * valid references during a kmemleak scan. Therefore, kmemleak must
- * not scan such objects.
- */
- kmemleak_no_scan(ac);
if (ac) {
ac->avail = 0;
ac->limit = limit;
@@ -573,6 +565,14 @@ static struct array_cache *alloc_arraycache(int node, int entries,
struct array_cache *ac = NULL;
ac = kmalloc_node(memsize, gfp, node);
+ /*
+ * The array_cache structures contain pointers to free objects.
+ * However, when such objects are allocated or transferred to another
+ * cache the pointers are not cleared and they could be counted as
+ * valid references during a kmemleak scan. Therefore, kmemleak must
+ * not scan such objects.
+ */
+ kmemleak_no_scan(ac);
init_arraycache(ac, entries, batchcount);
return ac;
}
@@ -667,6 +667,7 @@ static struct alien_cache *__alloc_alien_cache(int node, int entries,
alc = kmalloc_node(memsize, gfp, node);
if (alc) {
+ kmemleak_no_scan(alc);
init_arraycache(&alc->ac, entries, batch);
spin_lock_init(&alc->lock);
}
@@ -676,12 +677,11 @@ static struct alien_cache *__alloc_alien_cache(int node, int entries,
static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
{
struct alien_cache **alc_ptr;
- size_t memsize = sizeof(void *) * nr_node_ids;
int i;
if (limit > 1)
limit = 12;
- alc_ptr = kzalloc_node(memsize, gfp, node);
+ alc_ptr = kcalloc_node(nr_node_ids, sizeof(void *), gfp, node);
if (!alc_ptr)
return NULL;
@@ -1727,6 +1727,8 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
* This could be made much more intelligent. For now, try to avoid using
* high order pages for slabs. When the gfp() functions are more friendly
* towards high-order requests, this should be changed.
+ *
+ * Return: number of left-over bytes in a slab
*/
static size_t calculate_slab_order(struct kmem_cache *cachep,
size_t size, slab_flags_t flags)
@@ -1975,6 +1977,8 @@ static bool set_on_slab_cache(struct kmem_cache *cachep,
* %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
* cacheline. This can be beneficial if you're counting cycles as closely
* as davem.
+ *
+ * Return: a pointer to the created cache or %NULL in case of error
*/
int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags)
{
@@ -3535,6 +3539,8 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
*
* Allocate an object from this cache. The flags are only relevant
* if the cache has no available objects.
+ *
+ * Return: pointer to the new object or %NULL in case of error
*/
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
@@ -3625,6 +3631,8 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace);
* node, which can improve the performance for cpu bound structures.
*
* Fallback to other node is possible if __GFP_THISNODE is not set.
+ *
+ * Return: pointer to the new object or %NULL in case of error
*/
void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
@@ -3694,6 +3702,8 @@ EXPORT_SYMBOL(__kmalloc_node_track_caller);
* @size: how many bytes of memory are required.
* @flags: the type of memory to allocate (see kmalloc).
* @caller: function caller for debug tracking of the caller
+ *
+ * Return: pointer to the allocated memory or %NULL in case of error
*/
static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
unsigned long caller)
@@ -4159,6 +4169,8 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep)
* @buffer: user buffer
* @count: data length
* @ppos: unused
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
ssize_t slabinfo_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
@@ -4450,6 +4462,8 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
* The caller must guarantee that objp points to a valid object previously
* allocated with either kmalloc() or kmem_cache_alloc(). The object
* must not be freed during the duration of the call.
+ *
+ * Return: size of the actual memory used by @objp in bytes
*/
size_t ksize(const void *objp)
{
diff --git a/mm/slab.h b/mm/slab.h
index 4190c24ef0e9..e5e6658eeacc 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -276,8 +276,6 @@ static __always_inline int memcg_charge_slab(struct page *page,
gfp_t gfp, int order,
struct kmem_cache *s)
{
- if (!memcg_kmem_enabled())
- return 0;
if (is_root_cache(s))
return 0;
return memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg);
@@ -286,8 +284,6 @@ static __always_inline int memcg_charge_slab(struct page *page,
static __always_inline void memcg_uncharge_slab(struct page *page, int order,
struct kmem_cache *s)
{
- if (!memcg_kmem_enabled())
- return;
memcg_kmem_uncharge(page, order);
}
@@ -437,11 +433,10 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
flags &= gfp_allowed_mask;
for (i = 0; i < size; i++) {
- void *object = p[i];
-
- kmemleak_alloc_recursive(object, s->object_size, 1,
+ p[i] = kasan_slab_alloc(s, p[i], flags);
+ /* As p[i] might get tagged, call kmemleak hook after KASAN. */
+ kmemleak_alloc_recursive(p[i], s->object_size, 1,
s->flags, flags);
- p[i] = kasan_slab_alloc(s, object, flags);
}
if (memcg_kmem_enabled())
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 81732d05e74a..03eeb8b7b4b1 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -939,6 +939,8 @@ EXPORT_SYMBOL(kmem_cache_destroy);
*
* Releases as many slabs as possible for a cache.
* To help debugging, a zero exit status indicates all slabs were released.
+ *
+ * Return: %0 if all slabs were released, non-zero otherwise
*/
int kmem_cache_shrink(struct kmem_cache *cachep)
{
@@ -1228,8 +1230,9 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
flags |= __GFP_COMP;
page = alloc_pages(flags, order);
ret = page ? page_address(page) : NULL;
- kmemleak_alloc(ret, size, 1, flags);
ret = kasan_kmalloc_large(ret, size, flags);
+ /* As ret might get tagged, call kmemleak hook after KASAN. */
+ kmemleak_alloc(ret, size, 1, flags);
return ret;
}
EXPORT_SYMBOL(kmalloc_order);
@@ -1424,7 +1427,7 @@ void dump_unreclaimable_slab(void)
#if defined(CONFIG_MEMCG)
void *memcg_slab_start(struct seq_file *m, loff_t *pos)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
mutex_lock(&slab_mutex);
return seq_list_start(&memcg->kmem_caches, *pos);
@@ -1432,7 +1435,7 @@ void *memcg_slab_start(struct seq_file *m, loff_t *pos)
void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
return seq_list_next(p, &memcg->kmem_caches, pos);
}
@@ -1446,7 +1449,7 @@ int memcg_slab_show(struct seq_file *m, void *p)
{
struct kmem_cache *s = list_entry(p, struct kmem_cache,
memcg_params.kmem_caches_node);
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
if (p == memcg->kmem_caches.next)
print_slabinfo_header(m);
@@ -1527,6 +1530,8 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size,
* This function is like krealloc() except it never frees the originally
* allocated buffer. Use this if you don't want to free the buffer immediately
* like, for example, with RCU.
+ *
+ * Return: pointer to the allocated memory or %NULL in case of error
*/
void *__krealloc(const void *p, size_t new_size, gfp_t flags)
{
@@ -1548,6 +1553,8 @@ EXPORT_SYMBOL(__krealloc);
* lesser of the new and old sizes. If @p is %NULL, krealloc()
* behaves exactly like kmalloc(). If @new_size is 0 and @p is not a
* %NULL pointer, the object pointed to is freed.
+ *
+ * Return: pointer to the allocated memory or %NULL in case of error
*/
void *krealloc(const void *p, size_t new_size, gfp_t flags)
{
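
A userspace model of the krealloc() rules spelled out in the kerneldoc above (an editor's sketch, not the kernel implementation: the real krealloc() learns the old size from the allocator itself, so the explicit old_size parameter here is an assumption of the model, and the kernel returns ZERO_SIZE_PTR rather than NULL for a zero-sized request).

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* NULL pointer acts like a fresh allocation, size 0 frees, otherwise
     * contents are preserved up to the smaller of the two sizes. */
    static void *krealloc_model(void *p, size_t old_size, size_t new_size)
    {
        void *n;

        if (!p)
            return malloc(new_size);
        if (new_size == 0) {
            free(p);
            return NULL;
        }
        n = malloc(new_size);
        if (n)
            memcpy(n, p, old_size < new_size ? old_size : new_size);
        free(p);
        return n;
    }

    int main(void)
    {
        char *buf = krealloc_model(NULL, 0, 8);   /* behaves like malloc() */

        if (!buf)
            return 1;
        strcpy(buf, "abcdefg");
        buf = krealloc_model(buf, 8, 16);         /* grow, contents kept */
        printf("%s\n", buf ? buf : "(alloc failed)");
        krealloc_model(buf, 16, 0);               /* size 0 frees */
        return 0;
    }
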
diff --git a/mm/slub.c b/mm/slub.c
index 1e3d0ec4e200..d9131be3b710 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -249,7 +249,18 @@ static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr,
unsigned long ptr_addr)
{
#ifdef CONFIG_SLAB_FREELIST_HARDENED
- return (void *)((unsigned long)ptr ^ s->random ^ ptr_addr);
+ /*
+ * When CONFIG_KASAN_SW_TAGS is enabled, ptr_addr might be tagged.
+ * Normally, this doesn't cause any issues, as both set_freepointer()
+ * and get_freepointer() are called with a pointer with the same tag.
+ * However, there are some issues with CONFIG_SLUB_DEBUG code. For
+ * example, when __free_slub() iterates over objects in a cache, it
+ * passes untagged pointers to check_object(). check_object() in turn
+ * calls get_freepointer() with an untagged pointer, which causes the
+ * freepointer to be restored incorrectly.
+ */
+ return (void *)((unsigned long)ptr ^ s->random ^
+ (unsigned long)kasan_reset_tag((void *)ptr_addr));
#else
return ptr;
#endif
@@ -303,11 +314,6 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
__p < (__addr) + (__objects) * (__s)->size; \
__p += (__s)->size)
-#define for_each_object_idx(__p, __idx, __s, __addr, __objects) \
- for (__p = fixup_red_left(__s, __addr), __idx = 1; \
- __idx <= __objects; \
- __p += (__s)->size, __idx++)
-
/* Determine object index from a given position */
static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr)
{
@@ -507,6 +513,7 @@ static inline int check_valid_pointer(struct kmem_cache *s,
return 1;
base = page_address(page);
+ object = kasan_reset_tag(object);
object = restore_red_left(s, object);
if (object < base || object >= base + page->objects * s->size ||
(object - base) % s->size) {
@@ -684,7 +691,10 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
print_section(KERN_ERR, "Padding ", p + off,
size_from_object(s) - off);
- dump_stack();
+ if (unlikely(s->flags & SLAB_WARN_ON_ERROR))
+ WARN_ON(1);
+ else
+ dump_stack();
}
void object_err(struct kmem_cache *s, struct page *page,
@@ -705,7 +715,11 @@ static __printf(3, 4) void slab_err(struct kmem_cache *s, struct page *page,
va_end(args);
slab_bug(s, "%s", buf);
print_page_info(page);
- dump_stack();
+
+ if (unlikely(s->flags & SLAB_WARN_ON_ERROR))
+ WARN_ON(1);
+ else
+ dump_stack();
}
static void init_object(struct kmem_cache *s, void *object, u8 val)
@@ -1075,9 +1089,18 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page,
init_tracking(s, object);
}
+static void setup_page_debug(struct kmem_cache *s, void *addr, int order)
+{
+ if (!(s->flags & SLAB_POISON))
+ return;
+
+ metadata_access_enable();
+ memset(addr, POISON_INUSE, PAGE_SIZE << order);
+ metadata_access_disable();
+}
+
static inline int alloc_consistency_checks(struct kmem_cache *s,
- struct page *page,
- void *object, unsigned long addr)
+ struct page *page, void *object)
{
if (!check_slab(s, page))
return 0;
@@ -1098,7 +1121,7 @@ static noinline int alloc_debug_processing(struct kmem_cache *s,
void *object, unsigned long addr)
{
if (s->flags & SLAB_CONSISTENCY_CHECKS) {
- if (!alloc_consistency_checks(s, page, object, addr))
+ if (!alloc_consistency_checks(s, page, object))
goto bad;
}
@@ -1254,6 +1277,9 @@ static int __init setup_slub_debug(char *str)
case 'a':
slub_debug |= SLAB_FAILSLAB;
break;
+ case 'w':
+ slub_debug |= SLAB_WARN_ON_ERROR;
+ break;
case 'o':
/*
* Avoid enabling debugging on caches if its minimum
@@ -1330,6 +1356,8 @@ slab_flags_t kmem_cache_flags(unsigned int object_size,
#else /* !CONFIG_SLUB_DEBUG */
static inline void setup_object_debug(struct kmem_cache *s,
struct page *page, void *object) {}
+static inline void setup_page_debug(struct kmem_cache *s,
+ void *addr, int order) {}
static inline int alloc_debug_processing(struct kmem_cache *s,
struct page *page, void *object, unsigned long addr) { return 0; }
@@ -1374,8 +1402,10 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node,
*/
static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
{
+ ptr = kasan_kmalloc_large(ptr, size, flags);
+ /* As ptr might get tagged, call kmemleak hook after KASAN. */
kmemleak_alloc(ptr, size, 1, flags);
- return kasan_kmalloc_large(ptr, size, flags);
+ return ptr;
}
static __always_inline void kfree_hook(void *x)
@@ -1641,27 +1671,25 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
if (page_is_pfmemalloc(page))
SetPageSlabPfmemalloc(page);
- start = page_address(page);
+ kasan_poison_slab(page);
- if (unlikely(s->flags & SLAB_POISON))
- memset(start, POISON_INUSE, PAGE_SIZE << order);
+ start = page_address(page);
- kasan_poison_slab(page);
+ setup_page_debug(s, start, order);
shuffle = shuffle_freelist(s, page);
if (!shuffle) {
- for_each_object_idx(p, idx, s, start, page->objects) {
- if (likely(idx < page->objects)) {
- next = p + s->size;
- next = setup_object(s, page, next);
- set_freepointer(s, p, next);
- } else
- set_freepointer(s, p, NULL);
- }
start = fixup_red_left(s, start);
start = setup_object(s, page, start);
page->freelist = start;
+ for (idx = 0, p = start; idx < page->objects - 1; idx++) {
+ next = p + s->size;
+ next = setup_object(s, page, next);
+ set_freepointer(s, p, next);
+ p = next;
+ }
+ set_freepointer(s, p, NULL);
}
page->inuse = page->objects;
@@ -2111,7 +2139,7 @@ redo:
if (!lock) {
lock = 1;
/*
- * Taking the spinlock removes the possiblity
+ * Taking the spinlock removes the possibility
* that acquire_slab() will see a slab page that
* is frozen
*/
@@ -2235,8 +2263,8 @@ static void unfreeze_partials(struct kmem_cache *s,
}
/*
- * Put a page that was just frozen (in __slab_free) into a partial page
- * slot if available.
+ * Put a page that was just frozen (in __slab_free|get_partial_node) into a
+ * partial page slot if available.
*
* If we did not find a slot then simply move all the partials to the
* per node partial list.
@@ -2463,8 +2491,7 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
stat(s, ALLOC_SLAB);
c->page = page;
*pc = c;
- } else
- freelist = NULL;
+ }
return freelist;
}
@@ -4245,7 +4272,7 @@ void __init kmem_cache_init(void)
cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL,
slub_cpu_dead);
- pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%d\n",
+ pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%u\n",
cache_line_size(),
slub_min_order, slub_max_order, slub_min_objects,
nr_cpu_ids, nr_node_ids);
@@ -5220,6 +5247,25 @@ static ssize_t store_user_store(struct kmem_cache *s,
}
SLAB_ATTR(store_user);
+static ssize_t warn_on_error_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_WARN_ON_ERROR));
+}
+
+static ssize_t warn_on_error_store(struct kmem_cache *s,
+ const char *buf, size_t length)
+{
+ if (any_slab_objects(s))
+ return -EBUSY;
+
+ s->flags &= ~SLAB_WARN_ON_ERROR;
+ if (buf[0] == '1')
+ s->flags |= SLAB_WARN_ON_ERROR;
+
+ return length;
+}
+SLAB_ATTR(warn_on_error);
+
static ssize_t validate_show(struct kmem_cache *s, char *buf)
{
return 0;
@@ -5428,6 +5474,7 @@ static struct attribute *slab_attrs[] = {
&validate_attr.attr,
&alloc_calls_attr.attr,
&free_calls_attr.attr,
+ &warn_on_error_attr.attr,
#endif
#ifdef CONFIG_ZONE_DMA
&cache_dma_attr.attr,
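
To make the freelist_ptr() comment above concrete, a small userspace sketch of the XOR mangling idea behind SLAB_FREELIST_HARDENED (illustrative only; it leaves out SLUB's per-cache state, and the kasan_reset_tag() change in the patch simply ensures the encode and decode paths feed the same untagged slot address into this XOR).

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* The stored link is XORed with a per-cache secret and the address of
     * the slot holding it; applying the same operation twice restores the
     * original pointer, while forging a link requires both inputs. */
    static void *mangle(void *ptr, uintptr_t secret, void *slot)
    {
        return (void *)((uintptr_t)ptr ^ secret ^ (uintptr_t)slot);
    }

    int main(void)
    {
        uintptr_t secret = 0x5aa5f00dUL;        /* s->random in the kernel */
        void *next = malloc(32);                /* next free object */
        void *slot = malloc(32);                /* where the link is stored */

        void *stored  = mangle(next, secret, slot);
        void *decoded = mangle(stored, secret, slot);

        printf("stored %p, round-trip ok: %d\n", stored, decoded == next);
        free(next);
        free(slot);
        return 0;
    }
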
diff --git a/mm/swap.c b/mm/swap.c
index 4929bc1be60e..4d7d37eb3c40 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -320,11 +320,6 @@ static inline void activate_page_drain(int cpu)
{
}
-static bool need_activate_page_drain(int cpu)
-{
- return false;
-}
-
void activate_page(struct page *page)
{
struct zone *zone = page_zone(page);
@@ -653,13 +648,15 @@ void lru_add_drain(void)
put_cpu();
}
+#ifdef CONFIG_SMP
+
+static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
+
static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
lru_add_drain();
}
-static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
-
/*
* Doesn't need any cpu hotplug locking because we do rely on per-cpu
* kworkers being shut down before our page_alloc_cpu_dead callback is
@@ -702,6 +699,12 @@ void lru_add_drain_all(void)
mutex_unlock(&lock);
}
+#else
+void lru_add_drain_all(void)
+{
+ lru_add_drain();
+}
+#endif
/**
* release_pages - batched put_page()
diff --git a/mm/swap_state.c b/mm/swap_state.c
index fd2f21e1c60a..85245fdec8d9 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -523,7 +523,7 @@ static unsigned long swapin_nr_pages(unsigned long offset)
* This has been extended to use the NUMA policies from the mm triggering
* the readahead.
*
- * Caller must hold down_read on the vma->vm_mm if vmf->vma is not NULL.
+ * Caller must hold read mmap_sem if vmf->vma is not NULL.
*/
struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
struct vm_fault *vmf)
@@ -543,6 +543,13 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
if (!mask)
goto skip;
+ /* Test swap type to make sure the dereference is safe */
+ if (likely(si->flags & (SWP_BLKDEV | SWP_FS))) {
+ struct inode *inode = si->swap_file->f_mapping->host;
+ if (inode_read_congested(inode))
+ goto skip;
+ }
+
do_poll = false;
/* Read a page_cluster sized and aligned cluster around offset. */
start_offset = offset & ~mask;
@@ -691,6 +698,20 @@ static void swap_ra_info(struct vm_fault *vmf,
pte_unmap(orig_pte);
}
+/**
+ * swap_vma_readahead - swap in pages in hope we need them soon
+ * @entry: swap entry of this memory
+ * @gfp_mask: memory allocation flags
+ * @vmf: fault information
+ *
+ * Returns the struct page for entry and addr, after queueing swapin.
+ *
+ * Primitive swap readahead code. We simply read in a few pages whose
+ * virtual addresses are around the fault address in the same vma.
+ *
+ * Caller must hold read mmap_sem if vmf->vma is not NULL.
+ *
+ */
static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
struct vm_fault *vmf)
{
diff --git a/mm/swapfile.c b/mm/swapfile.c
index dbac1d49469d..cca8420b12db 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -98,6 +98,15 @@ static atomic_t proc_poll_event = ATOMIC_INIT(0);
atomic_t nr_rotate_swap = ATOMIC_INIT(0);
+static struct swap_info_struct *swap_type_to_swap_info(int type)
+{
+ if (type >= READ_ONCE(nr_swapfiles))
+ return NULL;
+
+ smp_rmb(); /* Pairs with smp_wmb in alloc_swap_info. */
+ return READ_ONCE(swap_info[type]);
+}
+
static inline unsigned char swap_count(unsigned char ent)
{
return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */
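The new swap_type_to_swap_info() helper above pairs a READ_ONCE() of nr_swapfiles and an smp_rmb() with the WRITE_ONCE()/smp_wmb() publication in alloc_swap_info() later in this file, so a reader never dereferences a swap_info[] slot before it has been fully published. A hedged userspace analogue of that ordering follows, expressed with C11 release/acquire atomics rather than the kernel barrier primitives; names and sizes are illustrative only.

#include <stdatomic.h>
#include <stdio.h>

#define MAX_TYPES 8	/* illustrative bound, not MAX_SWAPFILES */

struct slot { int data; };

static struct slot *slots[MAX_TYPES];	/* analogue of swap_info[] */
static atomic_int nr_slots;		/* analogue of nr_swapfiles */

/* writer: fill the slot first, then publish the new count (release) */
static void publish_slot(int type, struct slot *p)
{
	slots[type] = p;
	atomic_store_explicit(&nr_slots, type + 1, memory_order_release);
}

/* reader: check the count (acquire) before touching the slot */
static struct slot *lookup_slot(int type)
{
	if (type >= atomic_load_explicit(&nr_slots, memory_order_acquire))
		return NULL;
	return slots[type];
}

int main(void)
{
	static struct slot s = { .data = 42 };

	publish_slot(0, &s);
	printf("slot 0 data: %d\n", lookup_slot(0) ? lookup_slot(0)->data : -1);
	return 0;
}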
@@ -1044,12 +1053,14 @@ noswap:
/* The only caller of this function is now suspend routine */
swp_entry_t get_swap_page_of_type(int type)
{
- struct swap_info_struct *si;
+ struct swap_info_struct *si = swap_type_to_swap_info(type);
pgoff_t offset;
- si = swap_info[type];
+ if (!si)
+ goto fail;
+
spin_lock(&si->lock);
- if (si && (si->flags & SWP_WRITEOK)) {
+ if (si->flags & SWP_WRITEOK) {
atomic_long_dec(&nr_swap_pages);
/* This is called for allocating swap entry, not cache */
offset = scan_swap_map(si, 1);
@@ -1060,6 +1071,7 @@ swp_entry_t get_swap_page_of_type(int type)
atomic_long_inc(&nr_swap_pages);
}
spin_unlock(&si->lock);
+fail:
return (swp_entry_t) {0};
}
@@ -1071,9 +1083,9 @@ static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
if (!entry.val)
goto out;
type = swp_type(entry);
- if (type >= nr_swapfiles)
+ p = swap_type_to_swap_info(type);
+ if (!p)
goto bad_nofile;
- p = swap_info[type];
if (!(p->flags & SWP_USED))
goto bad_device;
offset = swp_offset(entry);
@@ -1697,10 +1709,9 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
sector_t swapdev_block(int type, pgoff_t offset)
{
struct block_device *bdev;
+ struct swap_info_struct *si = swap_type_to_swap_info(type);
- if ((unsigned int)type >= nr_swapfiles)
- return 0;
- if (!(swap_info[type]->flags & SWP_WRITEOK))
+ if (!si || !(si->flags & SWP_WRITEOK))
return 0;
return map_swap_entry(swp_entry(type, offset), &bdev);
}
@@ -1772,8 +1783,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
get_page(page);
- set_pte_at(vma->vm_mm, addr, pte,
- pte_mkold(mk_pte(page, vma->vm_page_prot)));
if (page == swapcache) {
page_add_anon_rmap(page, vma, addr, false);
mem_cgroup_commit_charge(page, memcg, true, false);
@@ -1782,6 +1791,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
}
+ set_pte_at(vma->vm_mm, addr, pte,
+ pte_mkold(mk_pte(page, vma->vm_page_prot)));
swap_free(entry);
/*
* Move the page to the active list so it is not
@@ -1799,44 +1810,77 @@ out_nolock:
}
static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, unsigned long end,
- swp_entry_t entry, struct page *page)
+ unsigned long addr, unsigned long end,
+ unsigned int type, bool frontswap,
+ unsigned long *fs_pages_to_unuse)
{
- pte_t swp_pte = swp_entry_to_pte(entry);
+ struct page *page;
+ swp_entry_t entry;
pte_t *pte;
+ struct swap_info_struct *si;
+ unsigned long offset;
int ret = 0;
+ volatile unsigned char *swap_map;
- /*
- * We don't actually need pte lock while scanning for swp_pte: since
- * we hold page lock and mmap_sem, swp_pte cannot be inserted into the
- * page table while we're scanning; though it could get zapped, and on
- * some architectures (e.g. x86_32 with PAE) we might catch a glimpse
- * of unmatched parts which look like swp_pte, so unuse_pte must
- * recheck under pte lock. Scanning without pte lock lets it be
- * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
- */
+ si = swap_info[type];
pte = pte_offset_map(pmd, addr);
do {
- /*
- * swapoff spends a _lot_ of time in this loop!
- * Test inline before going to call unuse_pte.
- */
- if (unlikely(pte_same_as_swp(*pte, swp_pte))) {
- pte_unmap(pte);
- ret = unuse_pte(vma, pmd, addr, entry, page);
- if (ret)
- goto out;
- pte = pte_offset_map(pmd, addr);
+ struct vm_fault vmf;
+
+ if (!is_swap_pte(*pte))
+ continue;
+
+ entry = pte_to_swp_entry(*pte);
+ if (swp_type(entry) != type)
+ continue;
+
+ offset = swp_offset(entry);
+ if (frontswap && !frontswap_test(si, offset))
+ continue;
+
+ pte_unmap(pte);
+ swap_map = &si->swap_map[offset];
+ vmf.vma = vma;
+ vmf.address = addr;
+ vmf.pmd = pmd;
+ page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf);
+ if (!page) {
+ if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
+ goto try_next;
+ return -ENOMEM;
+ }
+
+ lock_page(page);
+ wait_on_page_writeback(page);
+ ret = unuse_pte(vma, pmd, addr, entry, page);
+ if (ret < 0) {
+ unlock_page(page);
+ put_page(page);
+ goto out;
+ }
+
+ try_to_free_swap(page);
+ unlock_page(page);
+ put_page(page);
+
+ if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) {
+ ret = FRONTSWAP_PAGES_UNUSED;
+ goto out;
}
+try_next:
+ pte = pte_offset_map(pmd, addr);
} while (pte++, addr += PAGE_SIZE, addr != end);
pte_unmap(pte - 1);
+
+ ret = 0;
out:
return ret;
}
static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end,
- swp_entry_t entry, struct page *page)
+ unsigned int type, bool frontswap,
+ unsigned long *fs_pages_to_unuse)
{
pmd_t *pmd;
unsigned long next;
@@ -1848,7 +1892,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
next = pmd_addr_end(addr, end);
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
continue;
- ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
+ ret = unuse_pte_range(vma, pmd, addr, next, type,
+ frontswap, fs_pages_to_unuse);
if (ret)
return ret;
} while (pmd++, addr = next, addr != end);
@@ -1857,7 +1902,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
unsigned long addr, unsigned long end,
- swp_entry_t entry, struct page *page)
+ unsigned int type, bool frontswap,
+ unsigned long *fs_pages_to_unuse)
{
pud_t *pud;
unsigned long next;
@@ -1868,7 +1914,8 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
- ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
+ ret = unuse_pmd_range(vma, pud, addr, next, type,
+ frontswap, fs_pages_to_unuse);
if (ret)
return ret;
} while (pud++, addr = next, addr != end);
@@ -1877,7 +1924,8 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
unsigned long addr, unsigned long end,
- swp_entry_t entry, struct page *page)
+ unsigned int type, bool frontswap,
+ unsigned long *fs_pages_to_unuse)
{
p4d_t *p4d;
unsigned long next;
@@ -1888,78 +1936,66 @@ static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
next = p4d_addr_end(addr, end);
if (p4d_none_or_clear_bad(p4d))
continue;
- ret = unuse_pud_range(vma, p4d, addr, next, entry, page);
+ ret = unuse_pud_range(vma, p4d, addr, next, type,
+ frontswap, fs_pages_to_unuse);
if (ret)
return ret;
} while (p4d++, addr = next, addr != end);
return 0;
}
-static int unuse_vma(struct vm_area_struct *vma,
- swp_entry_t entry, struct page *page)
+static int unuse_vma(struct vm_area_struct *vma, unsigned int type,
+ bool frontswap, unsigned long *fs_pages_to_unuse)
{
pgd_t *pgd;
unsigned long addr, end, next;
int ret;
- if (page_anon_vma(page)) {
- addr = page_address_in_vma(page, vma);
- if (addr == -EFAULT)
- return 0;
- else
- end = addr + PAGE_SIZE;
- } else {
- addr = vma->vm_start;
- end = vma->vm_end;
- }
+ addr = vma->vm_start;
+ end = vma->vm_end;
pgd = pgd_offset(vma->vm_mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- ret = unuse_p4d_range(vma, pgd, addr, next, entry, page);
+ ret = unuse_p4d_range(vma, pgd, addr, next, type,
+ frontswap, fs_pages_to_unuse);
if (ret)
return ret;
} while (pgd++, addr = next, addr != end);
return 0;
}
-static int unuse_mm(struct mm_struct *mm,
- swp_entry_t entry, struct page *page)
+static int unuse_mm(struct mm_struct *mm, unsigned int type,
+ bool frontswap, unsigned long *fs_pages_to_unuse)
{
struct vm_area_struct *vma;
int ret = 0;
- if (!down_read_trylock(&mm->mmap_sem)) {
- /*
- * Activate page so shrink_inactive_list is unlikely to unmap
- * its ptes while lock is dropped, so swapoff can make progress.
- */
- activate_page(page);
- unlock_page(page);
- down_read(&mm->mmap_sem);
- lock_page(page);
- }
+ down_read(&mm->mmap_sem);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
- if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
- break;
+ if (vma->anon_vma) {
+ ret = unuse_vma(vma, type, frontswap,
+ fs_pages_to_unuse);
+ if (ret)
+ break;
+ }
cond_resched();
}
up_read(&mm->mmap_sem);
- return (ret < 0)? ret: 0;
+ return ret;
}
/*
* Scan swap_map (or frontswap_map if frontswap parameter is true)
- * from current position to next entry still in use.
- * Recycle to start on reaching the end, returning 0 when empty.
+ * from current position to next entry still in use. Return 0
+ * if there are no inuse entries after prev till end of the map.
*/
static unsigned int find_next_to_unuse(struct swap_info_struct *si,
unsigned int prev, bool frontswap)
{
- unsigned int max = si->max;
- unsigned int i = prev;
+ unsigned int i;
unsigned char count;
/*
@@ -1968,20 +2004,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
* hits are okay, and sys_swapoff() has already prevented new
* allocations from this area (while holding swap_lock).
*/
- for (;;) {
- if (++i >= max) {
- if (!prev) {
- i = 0;
- break;
- }
- /*
- * No entries in use at top of swap_map,
- * loop back to start and recheck there.
- */
- max = prev + 1;
- prev = 0;
- i = 1;
- }
+ for (i = prev + 1; i < si->max; i++) {
count = READ_ONCE(si->swap_map[i]);
if (count && swap_count(count) != SWAP_MAP_BAD)
if (!frontswap || frontswap_test(si, i))
@@ -1989,240 +2012,121 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
if ((i % LATENCY_LIMIT) == 0)
cond_resched();
}
+
+ if (i == si->max)
+ i = 0;
+
return i;
}
/*
- * We completely avoid races by reading each swap page in advance,
- * and then search for the process using it. All the necessary
- * page table adjustments can then be made atomically.
- *
- * if the boolean frontswap is true, only unuse pages_to_unuse pages;
+ * If the boolean frontswap is true, only unuse pages_to_unuse pages;
* pages_to_unuse==0 means all pages; ignored if frontswap is false
*/
+#define SWAP_UNUSE_MAX_TRIES 3
int try_to_unuse(unsigned int type, bool frontswap,
unsigned long pages_to_unuse)
{
+ struct mm_struct *prev_mm;
+ struct mm_struct *mm;
+ struct list_head *p;
+ int retval = 0;
struct swap_info_struct *si = swap_info[type];
- struct mm_struct *start_mm;
- volatile unsigned char *swap_map; /* swap_map is accessed without
- * locking. Mark it as volatile
- * to prevent compiler doing
- * something odd.
- */
- unsigned char swcount;
struct page *page;
swp_entry_t entry;
- unsigned int i = 0;
- int retval = 0;
+ unsigned int i;
+ int retries = 0;
- /*
- * When searching mms for an entry, a good strategy is to
- * start at the first mm we freed the previous entry from
- * (though actually we don't notice whether we or coincidence
- * freed the entry). Initialize this start_mm with a hold.
- *
- * A simpler strategy would be to start at the last mm we
- * freed the previous entry from; but that would take less
- * advantage of mmlist ordering, which clusters forked mms
- * together, child after parent. If we race with dup_mmap(), we
- * prefer to resolve parent before child, lest we miss entries
- * duplicated after we scanned child: using last mm would invert
- * that.
- */
- start_mm = &init_mm;
- mmget(&init_mm);
+ if (!si->inuse_pages)
+ return 0;
- /*
- * Keep on scanning until all entries have gone. Usually,
- * one pass through swap_map is enough, but not necessarily:
- * there are races when an instance of an entry might be missed.
- */
- while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
+ if (!frontswap)
+ pages_to_unuse = 0;
+
+retry:
+ retval = shmem_unuse(type, frontswap, &pages_to_unuse);
+ if (retval)
+ goto out;
+
+ prev_mm = &init_mm;
+ mmget(prev_mm);
+
+ spin_lock(&mmlist_lock);
+ p = &init_mm.mmlist;
+ while ((p = p->next) != &init_mm.mmlist) {
if (signal_pending(current)) {
retval = -EINTR;
break;
}
- /*
- * Get a page for the entry, using the existing swap
- * cache page if there is one. Otherwise, get a clean
- * page and read the swap into it.
- */
- swap_map = &si->swap_map[i];
- entry = swp_entry(type, i);
- page = read_swap_cache_async(entry,
- GFP_HIGHUSER_MOVABLE, NULL, 0, false);
- if (!page) {
- /*
- * Either swap_duplicate() failed because entry
- * has been freed independently, and will not be
- * reused since sys_swapoff() already disabled
- * allocation from here, or alloc_page() failed.
- */
- swcount = *swap_map;
- /*
- * We don't hold lock here, so the swap entry could be
- * SWAP_MAP_BAD (when the cluster is discarding).
- * Instead of fail out, We can just skip the swap
- * entry because swapoff will wait for discarding
- * finish anyway.
- */
- if (!swcount || swcount == SWAP_MAP_BAD)
- continue;
- retval = -ENOMEM;
- break;
- }
+ mm = list_entry(p, struct mm_struct, mmlist);
+ if (!mmget_not_zero(mm))
+ continue;
+ spin_unlock(&mmlist_lock);
+ mmput(prev_mm);
+ prev_mm = mm;
+ retval = unuse_mm(mm, type, frontswap, &pages_to_unuse);
- /*
- * Don't hold on to start_mm if it looks like exiting.
- */
- if (atomic_read(&start_mm->mm_users) == 1) {
- mmput(start_mm);
- start_mm = &init_mm;
- mmget(&init_mm);
+ if (retval) {
+ mmput(prev_mm);
+ goto out;
}
/*
- * Wait for and lock page. When do_swap_page races with
- * try_to_unuse, do_swap_page can handle the fault much
- * faster than try_to_unuse can locate the entry. This
- * apparently redundant "wait_on_page_locked" lets try_to_unuse
- * defer to do_swap_page in such a case - in some tests,
- * do_swap_page and try_to_unuse repeatedly compete.
- */
- wait_on_page_locked(page);
- wait_on_page_writeback(page);
- lock_page(page);
- wait_on_page_writeback(page);
-
- /*
- * Remove all references to entry.
+ * Make sure that we aren't completely killing
+ * interactive performance.
*/
- swcount = *swap_map;
- if (swap_count(swcount) == SWAP_MAP_SHMEM) {
- retval = shmem_unuse(entry, page);
- /* page has already been unlocked and released */
- if (retval < 0)
- break;
- continue;
- }
- if (swap_count(swcount) && start_mm != &init_mm)
- retval = unuse_mm(start_mm, entry, page);
-
- if (swap_count(*swap_map)) {
- int set_start_mm = (*swap_map >= swcount);
- struct list_head *p = &start_mm->mmlist;
- struct mm_struct *new_start_mm = start_mm;
- struct mm_struct *prev_mm = start_mm;
- struct mm_struct *mm;
-
- mmget(new_start_mm);
- mmget(prev_mm);
- spin_lock(&mmlist_lock);
- while (swap_count(*swap_map) && !retval &&
- (p = p->next) != &start_mm->mmlist) {
- mm = list_entry(p, struct mm_struct, mmlist);
- if (!mmget_not_zero(mm))
- continue;
- spin_unlock(&mmlist_lock);
- mmput(prev_mm);
- prev_mm = mm;
+ cond_resched();
+ spin_lock(&mmlist_lock);
+ }
+ spin_unlock(&mmlist_lock);
- cond_resched();
+ mmput(prev_mm);
- swcount = *swap_map;
- if (!swap_count(swcount)) /* any usage ? */
- ;
- else if (mm == &init_mm)
- set_start_mm = 1;
- else
- retval = unuse_mm(mm, entry, page);
-
- if (set_start_mm && *swap_map < swcount) {
- mmput(new_start_mm);
- mmget(mm);
- new_start_mm = mm;
- set_start_mm = 0;
- }
- spin_lock(&mmlist_lock);
- }
- spin_unlock(&mmlist_lock);
- mmput(prev_mm);
- mmput(start_mm);
- start_mm = new_start_mm;
- }
- if (retval) {
- unlock_page(page);
- put_page(page);
- break;
- }
+ i = 0;
+ while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
- /*
- * If a reference remains (rare), we would like to leave
- * the page in the swap cache; but try_to_unmap could
- * then re-duplicate the entry once we drop page lock,
- * so we might loop indefinitely; also, that page could
- * not be swapped out to other storage meanwhile. So:
- * delete from cache even if there's another reference,
- * after ensuring that the data has been saved to disk -
- * since if the reference remains (rarer), it will be
- * read from disk into another page. Splitting into two
- * pages would be incorrect if swap supported "shared
- * private" pages, but they are handled by tmpfs files.
- *
- * Given how unuse_vma() targets one particular offset
- * in an anon_vma, once the anon_vma has been determined,
- * this splitting happens to be just what is needed to
- * handle where KSM pages have been swapped out: re-reading
- * is unnecessarily slow, but we can fix that later on.
- */
- if (swap_count(*swap_map) &&
- PageDirty(page) && PageSwapCache(page)) {
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE,
- };
-
- swap_writepage(compound_head(page), &wbc);
- lock_page(page);
- wait_on_page_writeback(page);
- }
+ entry = swp_entry(type, i);
+ page = find_get_page(swap_address_space(entry), i);
+ if (!page)
+ continue;
/*
* It is conceivable that a racing task removed this page from
- * swap cache just before we acquired the page lock at the top,
- * or while we dropped it in unuse_mm(). The page might even
- * be back in swap cache on another swap area: that we must not
- * delete, since it may not have been written out to swap yet.
+ * swap cache just before we acquired the page lock. The page
+ * might even be back in swap cache on another swap area. But
+ * that is okay, try_to_free_swap() only removes stale pages.
*/
- if (PageSwapCache(page) &&
- likely(page_private(page) == entry.val) &&
- (!PageTransCompound(page) ||
- !swap_page_trans_huge_swapped(si, entry)))
- delete_from_swap_cache(compound_head(page));
-
- /*
- * So we could skip searching mms once swap count went
- * to 1, we did not mark any present ptes as dirty: must
- * mark page dirty so shrink_page_list will preserve it.
- */
- SetPageDirty(page);
+ lock_page(page);
+ wait_on_page_writeback(page);
+ try_to_free_swap(page);
unlock_page(page);
put_page(page);
/*
- * Make sure that we aren't completely killing
- * interactive performance.
+ * For frontswap, we just need to unuse pages_to_unuse, if
+ * it was specified. Need not check frontswap again here as
+ * we already zeroed out pages_to_unuse if not frontswap.
*/
- cond_resched();
- if (frontswap && pages_to_unuse > 0) {
- if (!--pages_to_unuse)
- break;
- }
+ if (pages_to_unuse && --pages_to_unuse == 0)
+ goto out;
}
- mmput(start_mm);
- return retval;
+ /*
+ * Let's check again to see if there are still swap entries in the map.
+ * If yes, we need to retry the unuse logic.
+ * Under global memory pressure, swap entries can be reinserted back
+ * into process space after the mmlist loop above passes over them.
+ * It is not worth continuously retrying to unuse the swap in this case,
+ * so we try at most SWAP_UNUSE_MAX_TRIES times.
+ */
+ if (++retries >= SWAP_UNUSE_MAX_TRIES)
+ retval = -EBUSY;
+ else if (si->inuse_pages)
+ goto retry;
+
+out:
+ return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval;
}
/*
@@ -2258,7 +2162,7 @@ static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
struct swap_extent *se;
pgoff_t offset;
- sis = swap_info[swp_type(entry)];
+ sis = swp_swap_info(entry);
*bdev = sis->bdev;
offset = swp_offset(entry);
@@ -2700,9 +2604,7 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
if (!l)
return SEQ_START_TOKEN;
- for (type = 0; type < nr_swapfiles; type++) {
- smp_rmb(); /* read nr_swapfiles before swap_info[type] */
- si = swap_info[type];
+ for (type = 0; (si = swap_type_to_swap_info(type)); type++) {
if (!(si->flags & SWP_USED) || !si->swap_map)
continue;
if (!--l)
@@ -2722,9 +2624,7 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
else
type = si->type + 1;
- for (; type < nr_swapfiles; type++) {
- smp_rmb(); /* read nr_swapfiles before swap_info[type] */
- si = swap_info[type];
+ for (; (si = swap_type_to_swap_info(type)); type++) {
if (!(si->flags & SWP_USED) || !si->swap_map)
continue;
++*pos;
@@ -2813,7 +2713,7 @@ static struct swap_info_struct *alloc_swap_info(void)
struct swap_info_struct *p;
unsigned int type;
int i;
- int size = sizeof(*p) + nr_node_ids * sizeof(struct plist_node);
+ unsigned int size = sizeof(*p) + nr_node_ids * sizeof(struct plist_node);
p = kvzalloc(size, GFP_KERNEL);
if (!p)
@@ -2831,14 +2731,14 @@ static struct swap_info_struct *alloc_swap_info(void)
}
if (type >= nr_swapfiles) {
p->type = type;
- swap_info[type] = p;
+ WRITE_ONCE(swap_info[type], p);
/*
* Write swap_info[type] before nr_swapfiles, in case a
* racing procfs swap_start() or swap_next() is reading them.
* (We never shrink nr_swapfiles, we never free this entry.)
*/
smp_wmb();
- nr_swapfiles++;
+ WRITE_ONCE(nr_swapfiles, nr_swapfiles + 1);
} else {
kvfree(p);
p = swap_info[type];
@@ -3358,7 +3258,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
{
struct swap_info_struct *p;
struct swap_cluster_info *ci;
- unsigned long offset, type;
+ unsigned long offset;
unsigned char count;
unsigned char has_cache;
int err = -EINVAL;
@@ -3366,10 +3266,10 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
if (non_swap_entry(entry))
goto out;
- type = swp_type(entry);
- if (type >= nr_swapfiles)
+ p = swp_swap_info(entry);
+ if (!p)
goto bad_file;
- p = swap_info[type];
+
offset = swp_offset(entry);
if (unlikely(offset >= p->max))
goto out;
@@ -3466,7 +3366,7 @@ int swapcache_prepare(swp_entry_t entry)
struct swap_info_struct *swp_swap_info(swp_entry_t entry)
{
- return swap_info[swp_type(entry)];
+ return swap_type_to_swap_info(swp_type(entry));
}
struct swap_info_struct *page_swap_info(struct page *page)
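The rewritten try_to_unuse() above drops the old start_mm bookkeeping in favour of a simpler three-phase pass: drain shmem, walk every mm on init_mm.mmlist, then sweep whatever is left in the swap cache, retrying a bounded number of times if entries remain in use. Below is a hedged control-flow sketch of that shape; every helper is a stub standing in for the kernel function with the same role, not the real API.

/* Sketch only: the three-phase, bounded-retry structure of try_to_unuse(). */
#include <stdio.h>

#define MAX_TRIES 3	/* mirrors SWAP_UNUSE_MAX_TRIES */

static int shmem_unuse_stub(void)      { return 0; }	/* phase 1: tmpfs pages */
static int unuse_one_mm_stub(int mm)   { (void)mm; return 0; }
static int sweep_swap_cache_stub(void) { return 0; }	/* phase 3: leftovers */
static int pages_still_in_use(void)    { static int left = 1; return left-- > 0; }

static int try_to_unuse_sketch(int nr_mms)
{
	int retries = 0;
	int ret, mm;

retry:
	ret = shmem_unuse_stub();
	if (ret)
		return ret;

	/* phase 2: walk every mm on the mmlist */
	for (mm = 0; mm < nr_mms; mm++) {
		ret = unuse_one_mm_stub(mm);
		if (ret)
			return ret;
	}

	ret = sweep_swap_cache_stub();
	if (ret)
		return ret;

	if (++retries >= MAX_TRIES)
		return -1;		/* give up: -EBUSY in the kernel */
	if (pages_still_in_use())
		goto retry;		/* entries were re-inserted, try again */
	return 0;
}

int main(void)
{
	printf("sketch returned %d\n", try_to_unuse_sketch(4));
	return 0;
}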
diff --git a/mm/truncate.c b/mm/truncate.c
index 798e7ccfb030..b7d3c99f00c9 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -539,6 +539,8 @@ EXPORT_SYMBOL(truncate_inode_pages_final);
* invalidate_mapping_pages() will not block on IO activity. It will not
* invalidate pages which are dirty, locked, under writeback or mapped into
* pagetables.
+ *
+ * Return: the number of pages that were invalidated
*/
unsigned long invalidate_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t end)
@@ -664,7 +666,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page)
* Any pages which are found to be mapped into pagetables are unmapped prior to
* invalidation.
*
- * Returns -EBUSY if any pages could not be invalidated.
+ * Return: -EBUSY if any pages could not be invalidated.
*/
int invalidate_inode_pages2_range(struct address_space *mapping,
pgoff_t start, pgoff_t end)
@@ -761,7 +763,7 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
* Any pages which are found to be mapped into pagetables are unmapped prior to
* invalidation.
*
- * Returns -EBUSY if any pages could not be invalidated.
+ * Return: -EBUSY if any pages could not be invalidated.
*/
int invalidate_inode_pages2(struct address_space *mapping)
{
diff --git a/mm/util.c b/mm/util.c
index 1ea055138043..2c64393adf58 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -36,6 +36,8 @@ EXPORT_SYMBOL(kfree_const);
* kstrdup - allocate space for and copy an existing string
* @s: the string to duplicate
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * Return: newly allocated copy of @s or %NULL in case of error
*/
char *kstrdup(const char *s, gfp_t gfp)
{
@@ -58,9 +60,10 @@ EXPORT_SYMBOL(kstrdup);
* @s: the string to duplicate
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
*
- * Function returns source string if it is in .rodata section otherwise it
- * fallbacks to kstrdup.
- * Strings allocated by kstrdup_const should be freed by kfree_const.
+ * Note: Strings allocated by kstrdup_const should be freed by kfree_const.
+ *
+ * Return: source string if it is in .rodata section, otherwise it
+ * falls back to kstrdup.
*/
const char *kstrdup_const(const char *s, gfp_t gfp)
{
@@ -78,6 +81,8 @@ EXPORT_SYMBOL(kstrdup_const);
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
*
* Note: Use kmemdup_nul() instead if the size is known exactly.
+ *
+ * Return: newly allocated copy of @s or %NULL in case of error
*/
char *kstrndup(const char *s, size_t max, gfp_t gfp)
{
@@ -103,6 +108,8 @@ EXPORT_SYMBOL(kstrndup);
* @src: memory region to duplicate
* @len: memory region length
* @gfp: GFP mask to use
+ *
+ * Return: newly allocated copy of @src or %NULL in case of error
*/
void *kmemdup(const void *src, size_t len, gfp_t gfp)
{
@@ -120,6 +127,9 @@ EXPORT_SYMBOL(kmemdup);
* @s: The data to stringify
* @len: The size of the data
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * Return: newly allocated copy of @s with NUL-termination or %NULL in
+ * case of error
*/
char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
{
@@ -143,7 +153,7 @@ EXPORT_SYMBOL(kmemdup_nul);
* @src: source address in user space
* @len: number of bytes to copy
*
- * Returns an ERR_PTR() on failure. Result is physically
+ * Return: an ERR_PTR() on failure. Result is physically
* contiguous, to be freed by kfree().
*/
void *memdup_user(const void __user *src, size_t len)
@@ -169,7 +179,7 @@ EXPORT_SYMBOL(memdup_user);
* @src: source address in user space
* @len: number of bytes to copy
*
- * Returns an ERR_PTR() on failure. Result may be not
+ * Return: an ERR_PTR() on failure. Result may be not
* physically contiguous. Use kvfree() to free.
*/
void *vmemdup_user(const void __user *src, size_t len)
@@ -193,6 +203,8 @@ EXPORT_SYMBOL(vmemdup_user);
* strndup_user - duplicate an existing string from user space
* @s: The string to duplicate
* @n: Maximum number of bytes to copy, including the trailing NUL.
+ *
+ * Return: newly allocated copy of @s or %NULL in case of error
*/
char *strndup_user(const char __user *s, long n)
{
@@ -224,7 +236,7 @@ EXPORT_SYMBOL(strndup_user);
* @src: source address in user space
* @len: number of bytes to copy
*
- * Returns an ERR_PTR() on failure.
+ * Return: an ERR_PTR() on failure.
*/
void *memdup_user_nul(const void __user *src, size_t len)
{
@@ -310,10 +322,6 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast);
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
*
- * Returns number of pages pinned. This may be fewer than the number
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
- * were pinned, returns -errno.
- *
* get_user_pages_fast provides equivalent functionality to get_user_pages,
* operating on current and current->mm, with force=0 and vma=NULL. However
* unlike get_user_pages, it must be called without mmap_sem held.
@@ -325,6 +333,10 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast);
* pages have to be faulted in, it may turn out to be slightly slower so
* callers need to carefully consider what to use. On many architectures,
* get_user_pages_fast simply falls back to get_user_pages.
+ *
+ * Return: number of pages pinned. This may be fewer than the number
+ * requested. If nr_pages is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno.
*/
int __weak get_user_pages_fast(unsigned long start,
int nr_pages, int write, struct page **pages)
@@ -386,6 +398,8 @@ EXPORT_SYMBOL(vm_mmap);
*
* Please note that any use of gfp flags outside of GFP_KERNEL is careful to not
* fall back to vmalloc.
+ *
+ * Return: pointer to the allocated memory or %NULL in case of failure
*/
void *kvmalloc_node(size_t size, gfp_t flags, int node)
{
@@ -729,7 +743,8 @@ error:
* @buffer: the buffer to copy to.
* @buflen: the length of the buffer. Larger cmdline values are truncated
* to this length.
- * Returns the size of the cmdline field copied. Note that the copy does
+ *
+ * Return: the size of the cmdline field copied. Note that the copy does
* not guarantee an ending NULL byte.
*/
int get_cmdline(struct task_struct *task, char *buffer, int buflen)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 871e41c55e23..b7455d4c8c12 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -498,7 +498,11 @@ nocache:
}
found:
- if (addr + size > vend)
+ /*
+ * Also check the calculated address against vstart,
+ * because it can be 0 as a result of a big align request.
+ */
+ if (addr + size > vend || addr < vstart)
goto overflow;
va->va_start = addr;
@@ -628,7 +632,7 @@ static unsigned long lazy_max_pages(void)
return log * (32UL * 1024 * 1024 / PAGE_SIZE);
}
-static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
+static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
/*
* Serialize vmap purging. There is no actual critical section protected
@@ -646,7 +650,7 @@ static void purge_fragmented_blocks_allcpus(void);
*/
void set_iounmap_nonlazy(void)
{
- atomic_set(&vmap_lazy_nr, lazy_max_pages()+1);
+ atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1);
}
/*
@@ -654,34 +658,40 @@ void set_iounmap_nonlazy(void)
*/
static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
{
+ unsigned long resched_threshold;
struct llist_node *valist;
struct vmap_area *va;
struct vmap_area *n_va;
- bool do_free = false;
lockdep_assert_held(&vmap_purge_lock);
valist = llist_del_all(&vmap_purge_list);
+ if (unlikely(valist == NULL))
+ return false;
+
+ /*
+ * TODO: calculate the flush range without looping.
+ * The list can be up to lazy_max_pages() elements.
+ */
llist_for_each_entry(va, valist, purge_list) {
if (va->va_start < start)
start = va->va_start;
if (va->va_end > end)
end = va->va_end;
- do_free = true;
}
- if (!do_free)
- return false;
-
flush_tlb_kernel_range(start, end);
+ resched_threshold = lazy_max_pages() << 1;
spin_lock(&vmap_area_lock);
llist_for_each_entry_safe(va, n_va, valist, purge_list) {
- int nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
+ unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
__free_vmap_area(va);
- atomic_sub(nr, &vmap_lazy_nr);
- cond_resched_lock(&vmap_area_lock);
+ atomic_long_sub(nr, &vmap_lazy_nr);
+
+ if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
+ cond_resched_lock(&vmap_area_lock);
}
spin_unlock(&vmap_area_lock);
return true;
@@ -717,10 +727,10 @@ static void purge_vmap_area_lazy(void)
*/
static void free_vmap_area_noflush(struct vmap_area *va)
{
- int nr_lazy;
+ unsigned long nr_lazy;
- nr_lazy = atomic_add_return((va->va_end - va->va_start) >> PAGE_SHIFT,
- &vmap_lazy_nr);
+ nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
+ PAGE_SHIFT, &vmap_lazy_nr);
/* After this point, we may free va at any time */
llist_add(&va->purge_list, &vmap_purge_list);
@@ -840,7 +850,7 @@ static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
* @order: how many 2^order pages should be occupied in newly allocated block
* @gfp_mask: flags for the page level allocator
*
- * Returns: virtual address in a newly allocated block or ERR_PTR(-errno)
+ * Return: virtual address in a newly allocated block or ERR_PTR(-errno)
*/
static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
{
@@ -1187,6 +1197,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro
EXPORT_SYMBOL(vm_map_ram);
static struct vm_struct *vmlist __initdata;
+
/**
* vm_area_add_early - add vmap area early during boot
* @vm: vm_struct to add
@@ -1421,13 +1432,15 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
}
/**
- * get_vm_area - reserve a contiguous kernel virtual area
- * @size: size of the area
- * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
+ * get_vm_area - reserve a contiguous kernel virtual area
+ * @size: size of the area
+ * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
*
- * Search an area of @size in the kernel virtual mapping area,
- * and reserved it for out purposes. Returns the area descriptor
- * on success or %NULL on failure.
+ * Search an area of @size in the kernel virtual mapping area,
+ * and reserve it for our purposes. Returns the area descriptor
+ * on success or %NULL on failure.
+ *
+ * Return: the area descriptor on success or %NULL on failure.
*/
struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
{
@@ -1444,12 +1457,14 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
}
/**
- * find_vm_area - find a continuous kernel virtual area
- * @addr: base address
+ * find_vm_area - find a continuous kernel virtual area
+ * @addr: base address
+ *
+ * Search for the kernel VM area starting at @addr, and return it.
+ * It is up to the caller to do all required locking to keep the returned
+ * pointer valid.
*
- * Search for the kernel VM area starting at @addr, and return it.
- * It is up to the caller to do all required locking to keep the returned
- * pointer valid.
+ * Return: pointer to the found area or %NULL on failure
*/
struct vm_struct *find_vm_area(const void *addr)
{
@@ -1463,12 +1478,14 @@ struct vm_struct *find_vm_area(const void *addr)
}
/**
- * remove_vm_area - find and remove a continuous kernel virtual area
- * @addr: base address
+ * remove_vm_area - find and remove a continuous kernel virtual area
+ * @addr: base address
+ *
+ * Search for the kernel VM area starting at @addr, and remove it.
+ * This function returns the found VM area, but using it is NOT safe
+ * on SMP machines, except for its size or flags.
*
- * Search for the kernel VM area starting at @addr, and remove it.
- * This function returns the found VM area, but using it is NOT safe
- * on SMP machines, except for its size or flags.
+ * Return: pointer to the found area or %NULL on failure
*/
struct vm_struct *remove_vm_area(const void *addr)
{
@@ -1505,7 +1522,7 @@ static void __vunmap(const void *addr, int deallocate_pages)
addr))
return;
- area = find_vmap_area((unsigned long)addr)->vm;
+ area = find_vm_area(addr);
if (unlikely(!area)) {
WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
addr);
@@ -1548,11 +1565,11 @@ static inline void __vfree_deferred(const void *addr)
}
/**
- * vfree_atomic - release memory allocated by vmalloc()
- * @addr: memory base address
+ * vfree_atomic - release memory allocated by vmalloc()
+ * @addr: memory base address
*
- * This one is just like vfree() but can be called in any atomic context
- * except NMIs.
+ * This one is just like vfree() but can be called in any atomic context
+ * except NMIs.
*/
void vfree_atomic(const void *addr)
{
@@ -1565,21 +1582,29 @@ void vfree_atomic(const void *addr)
__vfree_deferred(addr);
}
+static void __vfree(const void *addr)
+{
+ if (unlikely(in_interrupt()))
+ __vfree_deferred(addr);
+ else
+ __vunmap(addr, 1);
+}
+
/**
- * vfree - release memory allocated by vmalloc()
- * @addr: memory base address
+ * vfree - release memory allocated by vmalloc()
+ * @addr: memory base address
*
- * Free the virtually continuous memory area starting at @addr, as
- * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
- * NULL, no operation is performed.
+ * Free the virtually continuous memory area starting at @addr, as
+ * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
+ * NULL, no operation is performed.
*
- * Must not be called in NMI context (strictly speaking, only if we don't
- * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
- * conventions for vfree() arch-depenedent would be a really bad idea)
+ * Must not be called in NMI context (strictly speaking, only if we don't
+ * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
+ * conventions for vfree() arch-dependent would be a really bad idea)
*
- * May sleep if called *not* from interrupt context.
+ * May sleep if called *not* from interrupt context.
*
- * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node)
+ * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node)
*/
void vfree(const void *addr)
{
@@ -1591,21 +1616,19 @@ void vfree(const void *addr)
if (!addr)
return;
- if (unlikely(in_interrupt()))
- __vfree_deferred(addr);
- else
- __vunmap(addr, 1);
+
+ __vfree(addr);
}
EXPORT_SYMBOL(vfree);
/**
- * vunmap - release virtual mapping obtained by vmap()
- * @addr: memory base address
+ * vunmap - release virtual mapping obtained by vmap()
+ * @addr: memory base address
*
- * Free the virtually contiguous memory area starting at @addr,
- * which was created from the page array passed to vmap().
+ * Free the virtually contiguous memory area starting at @addr,
+ * which was created from the page array passed to vmap().
*
- * Must not be called in interrupt context.
+ * Must not be called in interrupt context.
*/
void vunmap(const void *addr)
{
@@ -1617,17 +1640,19 @@ void vunmap(const void *addr)
EXPORT_SYMBOL(vunmap);
/**
- * vmap - map an array of pages into virtually contiguous space
- * @pages: array of page pointers
- * @count: number of pages to map
- * @flags: vm_area->flags
- * @prot: page protection for the mapping
- *
- * Maps @count pages from @pages into contiguous kernel virtual
- * space.
+ * vmap - map an array of pages into virtually contiguous space
+ * @pages: array of page pointers
+ * @count: number of pages to map
+ * @flags: vm_area->flags
+ * @prot: page protection for the mapping
+ *
+ * Maps @count pages from @pages into contiguous kernel virtual
+ * space.
+ *
+ * Return: the address of the area or %NULL on failure
*/
void *vmap(struct page **pages, unsigned int count,
- unsigned long flags, pgprot_t prot)
+ unsigned long flags, pgprot_t prot)
{
struct vm_struct *area;
unsigned long size; /* In bytes */
@@ -1709,25 +1734,27 @@ fail:
warn_alloc(gfp_mask, NULL,
"vmalloc: allocation failure, allocated %ld of %ld bytes",
(area->nr_pages*PAGE_SIZE), area->size);
- vfree(area->addr);
+ __vfree(area->addr);
return NULL;
}
/**
- * __vmalloc_node_range - allocate virtually contiguous memory
- * @size: allocation size
- * @align: desired alignment
- * @start: vm area range start
- * @end: vm area range end
- * @gfp_mask: flags for the page level allocator
- * @prot: protection mask for the allocated pages
- * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD)
- * @node: node to use for allocation or NUMA_NO_NODE
- * @caller: caller's return address
- *
- * Allocate enough pages to cover @size from the page level
- * allocator with @gfp_mask flags. Map them into contiguous
- * kernel virtual space, using a pagetable protection of @prot.
+ * __vmalloc_node_range - allocate virtually contiguous memory
+ * @size: allocation size
+ * @align: desired alignment
+ * @start: vm area range start
+ * @end: vm area range end
+ * @gfp_mask: flags for the page level allocator
+ * @prot: protection mask for the allocated pages
+ * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD)
+ * @node: node to use for allocation or NUMA_NO_NODE
+ * @caller: caller's return address
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator with @gfp_mask flags. Map them into contiguous
+ * kernel virtual space, using a pagetable protection of @prot.
+ *
+ * Return: the address of the area or %NULL on failure
*/
void *__vmalloc_node_range(unsigned long size, unsigned long align,
unsigned long start, unsigned long end, gfp_t gfp_mask,
@@ -1768,25 +1795,35 @@ fail:
return NULL;
}
+/*
+ * This is only for performance analysis and stress testing of vmalloc.
+ * It is required by the vmalloc test module, therefore do not use it
+ * for anything else.
+ */
+#ifdef CONFIG_TEST_VMALLOC_MODULE
+EXPORT_SYMBOL_GPL(__vmalloc_node_range);
+#endif
+
/**
- * __vmalloc_node - allocate virtually contiguous memory
- * @size: allocation size
- * @align: desired alignment
- * @gfp_mask: flags for the page level allocator
- * @prot: protection mask for the allocated pages
- * @node: node to use for allocation or NUMA_NO_NODE
- * @caller: caller's return address
+ * __vmalloc_node - allocate virtually contiguous memory
+ * @size: allocation size
+ * @align: desired alignment
+ * @gfp_mask: flags for the page level allocator
+ * @prot: protection mask for the allocated pages
+ * @node: node to use for allocation or NUMA_NO_NODE
+ * @caller: caller's return address
*
- * Allocate enough pages to cover @size from the page level
- * allocator with @gfp_mask flags. Map them into contiguous
- * kernel virtual space, using a pagetable protection of @prot.
+ * Allocate enough pages to cover @size from the page level
+ * allocator with @gfp_mask flags. Map them into contiguous
+ * kernel virtual space, using a pagetable protection of @prot.
*
- * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
- * and __GFP_NOFAIL are not supported
+ * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
+ * and __GFP_NOFAIL are not supported
*
- * Any use of gfp flags outside of GFP_KERNEL should be consulted
- * with mm people.
+ * Any use of gfp flags outside of GFP_KERNEL should be consulted
+ * with mm people.
*
+ * Return: pointer to the allocated memory or %NULL on error
*/
static void *__vmalloc_node(unsigned long size, unsigned long align,
gfp_t gfp_mask, pgprot_t prot,
@@ -1818,13 +1855,16 @@ void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags,
}
/**
- * vmalloc - allocate virtually contiguous memory
- * @size: allocation size
- * Allocate enough pages to cover @size from the page level
- * allocator and map them into contiguous kernel virtual space.
+ * vmalloc - allocate virtually contiguous memory
+ * @size: allocation size
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
*
- * For tight control over page level allocator and protection flags
- * use __vmalloc() instead.
+ * Return: pointer to the allocated memory or %NULL on error
*/
void *vmalloc(unsigned long size)
{
@@ -1834,14 +1874,17 @@ void *vmalloc(unsigned long size)
EXPORT_SYMBOL(vmalloc);
/**
- * vzalloc - allocate virtually contiguous memory with zero fill
- * @size: allocation size
- * Allocate enough pages to cover @size from the page level
- * allocator and map them into contiguous kernel virtual space.
- * The memory allocated is set to zero.
- *
- * For tight control over page level allocator and protection flags
- * use __vmalloc() instead.
+ * vzalloc - allocate virtually contiguous memory with zero fill
+ * @size: allocation size
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ * The memory allocated is set to zero.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
*/
void *vzalloc(unsigned long size)
{
@@ -1856,34 +1899,30 @@ EXPORT_SYMBOL(vzalloc);
*
* The resulting memory area is zeroed so it can be mapped to userspace
* without leaking data.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
*/
void *vmalloc_user(unsigned long size)
{
- struct vm_struct *area;
- void *ret;
-
- ret = __vmalloc_node(size, SHMLBA,
- GFP_KERNEL | __GFP_ZERO,
- PAGE_KERNEL, NUMA_NO_NODE,
- __builtin_return_address(0));
- if (ret) {
- area = find_vm_area(ret);
- area->flags |= VM_USERMAP;
- }
- return ret;
+ return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
+ GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL,
+ VM_USERMAP, NUMA_NO_NODE,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_user);
/**
- * vmalloc_node - allocate memory on a specific node
- * @size: allocation size
- * @node: numa node
+ * vmalloc_node - allocate memory on a specific node
+ * @size: allocation size
+ * @node: numa node
*
- * Allocate enough pages to cover @size from the page level
- * allocator and map them into contiguous kernel virtual space.
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
*
- * For tight control over page level allocator and protection flags
- * use __vmalloc() instead.
+ * Return: pointer to the allocated memory or %NULL on error
*/
void *vmalloc_node(unsigned long size, int node)
{
@@ -1903,6 +1942,8 @@ EXPORT_SYMBOL(vmalloc_node);
*
* For tight control over page level allocator and protection flags
* use __vmalloc_node() instead.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
*/
void *vzalloc_node(unsigned long size, int node)
{
@@ -1912,17 +1953,18 @@ void *vzalloc_node(unsigned long size, int node)
EXPORT_SYMBOL(vzalloc_node);
/**
- * vmalloc_exec - allocate virtually contiguous, executable memory
- * @size: allocation size
+ * vmalloc_exec - allocate virtually contiguous, executable memory
+ * @size: allocation size
*
- * Kernel-internal function to allocate enough pages to cover @size
- * the page level allocator and map them into contiguous and
- * executable kernel virtual space.
+ * Kernel-internal function to allocate enough pages to cover @size
+ * the page level allocator and map them into contiguous and
+ * executable kernel virtual space.
*
- * For tight control over page level allocator and protection flags
- * use __vmalloc() instead.
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
*/
-
void *vmalloc_exec(unsigned long size)
{
return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC,
@@ -1942,11 +1984,13 @@ void *vmalloc_exec(unsigned long size)
#endif
/**
- * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
- * @size: allocation size
+ * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
+ * @size: allocation size
+ *
+ * Allocate enough 32bit PA addressable pages to cover @size from the
+ * page level allocator and map them into contiguous kernel virtual space.
*
- * Allocate enough 32bit PA addressable pages to cover @size from the
- * page level allocator and map them into contiguous kernel virtual space.
+ * Return: pointer to the allocated memory or %NULL on error
*/
void *vmalloc_32(unsigned long size)
{
@@ -1957,23 +2001,19 @@ EXPORT_SYMBOL(vmalloc_32);
/**
* vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
- * @size: allocation size
+ * @size: allocation size
*
* The resulting memory area is 32bit addressable and zeroed so it can be
* mapped to userspace without leaking data.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
*/
void *vmalloc_32_user(unsigned long size)
{
- struct vm_struct *area;
- void *ret;
-
- ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
- NUMA_NO_NODE, __builtin_return_address(0));
- if (ret) {
- area = find_vm_area(ret);
- area->flags |= VM_USERMAP;
- }
- return ret;
+ return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
+ GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
+ VM_USERMAP, NUMA_NO_NODE,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_32_user);
@@ -2059,31 +2099,29 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count)
}
/**
- * vread() - read vmalloc area in a safe way.
- * @buf: buffer for reading data
- * @addr: vm address.
- * @count: number of bytes to be read.
- *
- * Returns # of bytes which addr and buf should be increased.
- * (same number to @count). Returns 0 if [addr...addr+count) doesn't
- * includes any intersect with alive vmalloc area.
- *
- * This function checks that addr is a valid vmalloc'ed area, and
- * copy data from that area to a given buffer. If the given memory range
- * of [addr...addr+count) includes some valid address, data is copied to
- * proper area of @buf. If there are memory holes, they'll be zero-filled.
- * IOREMAP area is treated as memory hole and no copy is done.
- *
- * If [addr...addr+count) doesn't includes any intersects with alive
- * vm_struct area, returns 0. @buf should be kernel's buffer.
- *
- * Note: In usual ops, vread() is never necessary because the caller
- * should know vmalloc() area is valid and can use memcpy().
- * This is for routines which have to access vmalloc area without
- * any informaion, as /dev/kmem.
- *
+ * vread() - read vmalloc area in a safe way.
+ * @buf: buffer for reading data
+ * @addr: vm address.
+ * @count: number of bytes to be read.
+ *
+ * This function checks that addr is a valid vmalloc'ed area, and
+ * copies data from that area to a given buffer. If the given memory range
+ * of [addr...addr+count) includes some valid address, data is copied to
+ * the proper area of @buf. If there are memory holes, they'll be zero-filled.
+ * IOREMAP areas are treated as memory holes and no copy is done.
+ *
+ * If [addr...addr+count) doesn't include any intersection with a live
+ * vm_struct area, it returns 0. @buf should be a kernel buffer.
+ *
+ * Note: In usual ops, vread() is never necessary because the caller
+ * should know the vmalloc() area is valid and can use memcpy().
+ * This is for routines which have to access the vmalloc area without
+ * any information, such as /dev/kmem.
+ *
+ * Return: number of bytes for which addr and buf should be increased
+ * (same number as @count) or %0 if [addr...addr+count) doesn't
+ * include any intersection with valid vmalloc area
*/
-
long vread(char *buf, char *addr, unsigned long count)
{
struct vmap_area *va;
@@ -2140,31 +2178,29 @@ finished:
}
/**
- * vwrite() - write vmalloc area in a safe way.
- * @buf: buffer for source data
- * @addr: vm address.
- * @count: number of bytes to be read.
- *
- * Returns # of bytes which addr and buf should be incresed.
- * (same number to @count).
- * If [addr...addr+count) doesn't includes any intersect with valid
- * vmalloc area, returns 0.
- *
- * This function checks that addr is a valid vmalloc'ed area, and
- * copy data from a buffer to the given addr. If specified range of
- * [addr...addr+count) includes some valid address, data is copied from
- * proper area of @buf. If there are memory holes, no copy to hole.
- * IOREMAP area is treated as memory hole and no copy is done.
- *
- * If [addr...addr+count) doesn't includes any intersects with alive
- * vm_struct area, returns 0. @buf should be kernel's buffer.
- *
- * Note: In usual ops, vwrite() is never necessary because the caller
- * should know vmalloc() area is valid and can use memcpy().
- * This is for routines which have to access vmalloc area without
- * any informaion, as /dev/kmem.
+ * vwrite() - write vmalloc area in a safe way.
+ * @buf: buffer for source data
+ * @addr: vm address.
+ * @count: number of bytes to be written.
+ *
+ * This function checks that addr is a valid vmalloc'ed area, and
+ * copies data from a buffer to the given addr. If the specified range of
+ * [addr...addr+count) includes some valid address, data is copied from
+ * the proper area of @buf. If there are memory holes, no copy is done
+ * into the holes. IOREMAP areas are treated as memory holes as well.
+ *
+ * If [addr...addr+count) doesn't include any intersection with a live
+ * vm_struct area, it returns 0. @buf should be a kernel buffer.
+ *
+ * Note: In usual ops, vwrite() is never necessary because the caller
+ * should know the vmalloc() area is valid and can use memcpy().
+ * This is for routines which have to access the vmalloc area without
+ * any information, such as /dev/kmem.
+ *
+ * Return: number of bytes for which addr and buf should be
+ * increased (same number as @count) or %0 if [addr...addr+count)
+ * doesn't include any intersection with valid vmalloc area
*/
-
long vwrite(char *buf, char *addr, unsigned long count)
{
struct vmap_area *va;
@@ -2216,20 +2252,20 @@ finished:
}
/**
- * remap_vmalloc_range_partial - map vmalloc pages to userspace
- * @vma: vma to cover
- * @uaddr: target user address to start at
- * @kaddr: virtual address of vmalloc kernel memory
- * @size: size of map area
+ * remap_vmalloc_range_partial - map vmalloc pages to userspace
+ * @vma: vma to cover
+ * @uaddr: target user address to start at
+ * @kaddr: virtual address of vmalloc kernel memory
+ * @size: size of map area
*
- * Returns: 0 for success, -Exxx on failure
+ * Returns: 0 for success, -Exxx on failure
*
- * This function checks that @kaddr is a valid vmalloc'ed area,
- * and that it is big enough to cover the range starting at
- * @uaddr in @vma. Will return failure if that criteria isn't
- * met.
+ * This function checks that @kaddr is a valid vmalloc'ed area,
+ * and that it is big enough to cover the range starting at
+ * @uaddr in @vma. Will return failure if those criteria aren't
+ * met.
*
- * Similar to remap_pfn_range() (see mm/memory.c)
+ * Similar to remap_pfn_range() (see mm/memory.c)
*/
int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
void *kaddr, unsigned long size)
@@ -2248,7 +2284,7 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
if (!(area->flags & VM_USERMAP))
return -EINVAL;
- if (kaddr + size > area->addr + area->size)
+ if (kaddr + size > area->addr + get_vm_area_size(area))
return -EINVAL;
do {
@@ -2271,18 +2307,18 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
EXPORT_SYMBOL(remap_vmalloc_range_partial);
/**
- * remap_vmalloc_range - map vmalloc pages to userspace
- * @vma: vma to cover (map full range of vma)
- * @addr: vmalloc memory
- * @pgoff: number of pages into addr before first page to map
+ * remap_vmalloc_range - map vmalloc pages to userspace
+ * @vma: vma to cover (map full range of vma)
+ * @addr: vmalloc memory
+ * @pgoff: number of pages into addr before first page to map
*
- * Returns: 0 for success, -Exxx on failure
+ * Returns: 0 for success, -Exxx on failure
*
- * This function checks that addr is a valid vmalloc'ed area, and
- * that it is big enough to cover the vma. Will return failure if
- * that criteria isn't met.
+ * This function checks that addr is a valid vmalloc'ed area, and
+ * that it is big enough to cover the vma. Will return failure if
+ * those criteria aren't met.
*
- * Similar to remap_pfn_range() (see mm/memory.c)
+ * Similar to remap_pfn_range() (see mm/memory.c)
*/
int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
unsigned long pgoff)
@@ -2314,18 +2350,18 @@ static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data)
}
/**
- * alloc_vm_area - allocate a range of kernel address space
- * @size: size of the area
- * @ptes: returns the PTEs for the address space
+ * alloc_vm_area - allocate a range of kernel address space
+ * @size: size of the area
+ * @ptes: returns the PTEs for the address space
*
- * Returns: NULL on failure, vm_struct on success
+ * Returns: NULL on failure, vm_struct on success
*
- * This function reserves a range of kernel address space, and
- * allocates pagetables to map that range. No actual mappings
- * are created.
+ * This function reserves a range of kernel address space, and
+ * allocates pagetables to map that range. No actual mappings
+ * are created.
*
- * If @ptes is non-NULL, pointers to the PTEs (in init_mm)
- * allocated for the VM area are returned.
+ * If @ptes is non-NULL, pointers to the PTEs (in init_mm)
+ * allocated for the VM area are returned.
*/
struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
{
@@ -2751,4 +2787,3 @@ static int __init proc_vmalloc_init(void)
module_init(proc_vmalloc_init);
#endif
-
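
The kernel-doc above covers the userspace-mapping helpers, and the bound check now uses get_vm_area_size() so that the trailing guard page of the vm area is no longer counted as mappable. As a minimal, hypothetical sketch of how these helpers are consumed, here is a driver mmap handler (the mydrv_* names are invented for this sketch; the buffer is assumed to come from vmalloc_user(), which sets the VM_USERMAP flag that remap_vmalloc_range() requires):

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

static void *mydrv_buf;        /* assumed: allocated with vmalloc_user() at probe time */

static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
{
        /*
         * remap_vmalloc_range() verifies that mydrv_buf is vmalloc'ed,
         * carries VM_USERMAP, and is large enough to cover the whole vma;
         * vm_pgoff selects the page offset into the buffer to start at.
         */
        return remap_vmalloc_range(vma, mydrv_buf, vma->vm_pgoff);
}
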
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e979705bbf32..ac4806f0f332 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -374,7 +374,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone
*/
int prealloc_shrinker(struct shrinker *shrinker)
{
- size_t size = sizeof(*shrinker->nr_deferred);
+ unsigned int size = sizeof(*shrinker->nr_deferred);
if (shrinker->flags & SHRINKER_NUMA_AWARE)
size *= nr_node_ids;
@@ -1106,16 +1106,9 @@ static unsigned long shrink_page_list(struct list_head *page_list,
{
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
- int pgactivate = 0;
- unsigned nr_unqueued_dirty = 0;
- unsigned nr_dirty = 0;
- unsigned nr_congested = 0;
unsigned nr_reclaimed = 0;
- unsigned nr_writeback = 0;
- unsigned nr_immediate = 0;
- unsigned nr_ref_keep = 0;
- unsigned nr_unmap_fail = 0;
+ memset(stat, 0, sizeof(*stat));
cond_resched();
while (!list_empty(page_list)) {
@@ -1159,10 +1152,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
*/
page_check_dirty_writeback(page, &dirty, &writeback);
if (dirty || writeback)
- nr_dirty++;
+ stat->nr_dirty++;
if (dirty && !writeback)
- nr_unqueued_dirty++;
+ stat->nr_unqueued_dirty++;
/*
* Treat this page as congested if the underlying BDI is or if
@@ -1174,7 +1167,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
if (((dirty || writeback) && mapping &&
inode_write_congested(mapping->host)) ||
(writeback && PageReclaim(page)))
- nr_congested++;
+ stat->nr_congested++;
/*
* If a page at the tail of the LRU is under writeback, there
@@ -1223,7 +1216,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
if (current_is_kswapd() &&
PageReclaim(page) &&
test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
- nr_immediate++;
+ stat->nr_immediate++;
goto activate_locked;
/* Case 2 above */
@@ -1241,7 +1234,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* and it's also appropriate in global reclaim.
*/
SetPageReclaim(page);
- nr_writeback++;
+ stat->nr_writeback++;
goto activate_locked;
/* Case 3 above */
@@ -1261,7 +1254,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
case PAGEREF_ACTIVATE:
goto activate_locked;
case PAGEREF_KEEP:
- nr_ref_keep++;
+ stat->nr_ref_keep++;
goto keep_locked;
case PAGEREF_RECLAIM:
case PAGEREF_RECLAIM_CLEAN:
@@ -1326,7 +1319,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
if (unlikely(PageTransHuge(page)))
flags |= TTU_SPLIT_HUGE_PMD;
if (!try_to_unmap(page, flags)) {
- nr_unmap_fail++;
+ stat->nr_unmap_fail++;
goto activate_locked;
}
}
@@ -1474,7 +1467,7 @@ activate_locked:
VM_BUG_ON_PAGE(PageActive(page), page);
if (!PageMlocked(page)) {
SetPageActive(page);
- pgactivate++;
+ stat->nr_activate++;
count_memcg_page_event(page, PGACTIVATE);
}
keep_locked:
@@ -1489,18 +1482,8 @@ keep:
free_unref_page_list(&free_pages);
list_splice(&ret_pages, page_list);
- count_vm_events(PGACTIVATE, pgactivate);
-
- if (stat) {
- stat->nr_dirty = nr_dirty;
- stat->nr_congested = nr_congested;
- stat->nr_unqueued_dirty = nr_unqueued_dirty;
- stat->nr_writeback = nr_writeback;
- stat->nr_immediate = nr_immediate;
- stat->nr_activate = pgactivate;
- stat->nr_ref_keep = nr_ref_keep;
- stat->nr_unmap_fail = nr_unmap_fail;
- }
+ count_vm_events(PGACTIVATE, stat->nr_activate);
+
return nr_reclaimed;
}
@@ -1512,6 +1495,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
.priority = DEF_PRIORITY,
.may_unmap = 1,
};
+ struct reclaim_stat dummy_stat;
unsigned long ret;
struct page *page, *next;
LIST_HEAD(clean_pages);
@@ -1525,7 +1509,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
}
ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
- TTU_IGNORE_ACCESS, NULL, true);
+ TTU_IGNORE_ACCESS, &dummy_stat, true);
list_splice(&clean_pages, page_list);
mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
return ret;
@@ -1653,7 +1637,7 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
struct lruvec *lruvec, struct list_head *dst,
unsigned long *nr_scanned, struct scan_control *sc,
- isolate_mode_t mode, enum lru_list lru)
+ enum lru_list lru)
{
struct list_head *src = &lruvec->lists[lru];
unsigned long nr_taken = 0;
@@ -1662,6 +1646,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
unsigned long skipped = 0;
unsigned long scan, total_scan, nr_pages;
LIST_HEAD(pages_skipped);
+ isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
scan = 0;
for (total_scan = 0;
@@ -1899,8 +1884,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
unsigned long nr_scanned;
unsigned long nr_reclaimed = 0;
unsigned long nr_taken;
- struct reclaim_stat stat = {};
- isolate_mode_t isolate_mode = 0;
+ struct reclaim_stat stat;
int file = is_file_lru(lru);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
@@ -1921,13 +1905,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
lru_add_drain();
- if (!sc->may_unmap)
- isolate_mode |= ISOLATE_UNMAPPED;
-
spin_lock_irq(&pgdat->lru_lock);
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
- &nr_scanned, sc, isolate_mode, lru);
+ &nr_scanned, sc, lru);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
reclaim_stat->recent_scanned[file] += nr_taken;
@@ -2084,19 +2065,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
unsigned nr_deactivate, nr_activate;
unsigned nr_rotated = 0;
- isolate_mode_t isolate_mode = 0;
int file = is_file_lru(lru);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
lru_add_drain();
- if (!sc->may_unmap)
- isolate_mode |= ISOLATE_UNMAPPED;
-
spin_lock_irq(&pgdat->lru_lock);
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
- &nr_scanned, sc, isolate_mode, lru);
+ &nr_scanned, sc, lru);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
reclaim_stat->recent_scanned[file] += nr_taken;
@@ -2435,17 +2412,91 @@ out:
*lru_pages = 0;
for_each_evictable_lru(lru) {
int file = is_file_lru(lru);
- unsigned long size;
+ unsigned long lruvec_size;
unsigned long scan;
+ unsigned long min, low;
+
+ lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
+ mem_cgroup_protection(memcg, &min, &low);
+
+ if (min || low) {
+ /*
+ * Scale a cgroup's reclaim pressure by proportioning
+ * its current usage to its memory.low or memory.min
+ * setting.
+ *
+ * This is important, as otherwise scanning aggression
+ * becomes extremely binary -- from nothing as we
+ * approach the memory protection threshold, to totally
+ * nominal as we exceed it. This results in requiring
+ * setting extremely liberal protection thresholds. It
+ * also means we simply get no protection at all if we
+ * set it too low, which is not ideal.
+ */
+ unsigned long cgroup_size = mem_cgroup_size(memcg);
+
+ /*
+ * If there is any protection in place, we adjust scan
+ * pressure in proportion to how much a group's current
+ * usage exceeds that, in percent.
+ *
+ * There is one special case: in the first reclaim pass,
+ * we skip over all groups that are within their low
+ * protection. If that fails to reclaim enough pages to
+ * satisfy the reclaim goal, we come back and override
+ * the best-effort low protection. However, we still
+ * ideally want to honor how well-behaved groups are in
+ * that case instead of simply punishing them all
+ * equally. As such, we reclaim them based on how much
+ * of their best-effort protection they are using. Usage
+ * below memory.min is excluded from consideration when
+ * calculating utilisation, as it isn't ever
+ * reclaimable, so it might as well not exist for our
+ * purposes.
+ */
+ if (sc->memcg_low_reclaim && low > min) {
+ /*
+ * Reclaim according to utilisation between min
+ * and low
+ */
+ scan = lruvec_size * (cgroup_size - min) /
+ (low - min);
+ } else {
+ /* Reclaim according to protection overage */
+ scan = lruvec_size * cgroup_size /
+ max(min, low) - lruvec_size;
+ }
+
+ /*
+ * Don't allow the scan target to exceed the lruvec
+ * size, which otherwise could happen if we have >200%
+ * overage in the normal case, or >100% overage when
+ * sc->memcg_low_reclaim is set.
+ *
+ * This is important because other cgroups without
+ * memory.low have their scan target initially set to
+ * their lruvec size, so allowing values >100% of the
+ * lruvec size here could result in penalising cgroups
+ * with memory.low set even *more* than their peers in
+ * the case of large overages.
+ *
+ * Also, minimally target SWAP_CLUSTER_MAX pages to keep
+ * reclaim moving forwards, avoiding decrementing
+ * sc->priority further than desirable.
+ */
+ scan = clamp(scan, SWAP_CLUSTER_MAX, lruvec_size);
+ } else {
+ scan = lruvec_size;
+ }
+
+ scan >>= sc->priority;
- size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
- scan = size >> sc->priority;
/*
* If the cgroup's already been deleted, make sure to
* scrape out the remaining cache.
*/
if (!scan && !mem_cgroup_online(memcg))
- scan = min(size, SWAP_CLUSTER_MAX);
+ scan = min(lruvec_size, SWAP_CLUSTER_MAX);
switch (scan_balance) {
case SCAN_EQUAL:
@@ -2465,7 +2516,7 @@ out:
case SCAN_ANON:
/* Scan one type exclusively */
if ((scan_balance == SCAN_FILE) != file) {
- size = 0;
+ lruvec_size = 0;
scan = 0;
}
break;
@@ -2474,7 +2525,7 @@ out:
BUG();
}
- *lru_pages += size;
+ *lru_pages += lruvec_size;
nr[lru] = scan;
}
}
@@ -2735,6 +2786,13 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
memcg_memory_event(memcg, MEMCG_LOW);
break;
case MEMCG_PROT_NONE:
+ /*
+ * All protection thresholds breached. We may
+ * still choose to vary the scan pressure
+ * applied based on by how much the cgroup in
+ * question has exceeded its protection
+ * thresholds (see get_scan_count).
+ */
break;
}
@@ -2754,16 +2812,15 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
sc->nr_reclaimed - reclaimed);
/*
- * Direct reclaim and kswapd have to scan all memory
- * cgroups to fulfill the overall scan target for the
- * node.
+ * Kswapd has to scan all memory cgroups to fulfill
+ * the overall scan target for the node.
*
* Limit reclaim, on the other hand, only cares about
* nr_to_reclaim pages to be reclaimed and it will
* retry with decreasing priority if one round over the
* whole hierarchy is not sufficient.
*/
- if (!global_reclaim(sc) &&
+ if (!current_is_kswapd() &&
sc->nr_reclaimed >= sc->nr_to_reclaim) {
mem_cgroup_iter_break(root, memcg);
break;
@@ -3527,7 +3584,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
*
* kswapd scans the zones in the highmem->normal->dma direction. It skips
* zones which have free_pages > high_wmark_pages(zone), but once a zone is
- * found to have free_pages <= high_wmark_pages(zone), any page is that zone
+ * found to have free_pages <= high_wmark_pages(zone), any page in that zone
* or lower is eligible for reclaim until at least one usable zone is
* balanced.
*/
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 83b30edc2f7f..36b56f858f0f 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -2121,21 +2121,14 @@ static int __init extfrag_debug_init(void)
struct dentry *extfrag_debug_root;
extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
- if (!extfrag_debug_root)
- return -ENOMEM;
- if (!debugfs_create_file("unusable_index", 0444,
- extfrag_debug_root, NULL, &unusable_file_ops))
- goto fail;
+ debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL,
+ &unusable_file_ops);
- if (!debugfs_create_file("extfrag_index", 0444,
- extfrag_debug_root, NULL, &extfrag_file_ops))
- goto fail;
+ debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL,
+ &extfrag_file_ops);
return 0;
-fail:
- debugfs_remove_recursive(extfrag_debug_root);
- return -ENOMEM;
}
module_init(extfrag_debug_init);
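
The extfrag_debug_init() simplification above drops the return-value checks because the debugfs creation helpers are intended to be fire-and-forget: they tolerate missing or errored parents, and callers are not expected to unwind on failure. A sketch of the resulting pattern for a hypothetical module (the my_debug_* names and the counter are invented for illustration):

#include <linux/debugfs.h>
#include <linux/module.h>

static struct dentry *my_debug_root;
static u32 some_counter;

static int __init my_debug_init(void)
{
        /* No error checks: if debugfs is unavailable these calls degrade
         * gracefully and the rest of the module keeps working. */
        my_debug_root = debugfs_create_dir("my_debug", NULL);
        debugfs_create_u32("counter", 0444, my_debug_root, &some_counter);
        return 0;
}

static void __exit my_debug_exit(void)
{
        debugfs_remove_recursive(my_debug_root);
}

module_init(my_debug_init);
module_exit(my_debug_exit);
MODULE_LICENSE("GPL");
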
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 6ac919847ce6..f3f5a78cd062 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -158,6 +158,7 @@
#include <linux/etherdevice.h>
#include <linux/kthread.h>
#include <linux/prefetch.h>
+#include <linux/mmzone.h>
#include <net/net_namespace.h>
#include <net/checksum.h>
#include <net/ipv6.h>
@@ -3625,7 +3626,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
pkt_dev->svlan_cfi = 0;
pkt_dev->svlan_id = 0xffff;
pkt_dev->burst = 1;
- pkt_dev->node = -1;
+ pkt_dev->node = NUMA_NO_NODE;
err = pktgen_setup_dev(t->net, pkt_dev, ifname);
if (err)
diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
index 86e1e37eb4e8..b37e6e0a1026 100644
--- a/net/qrtr/qrtr.c
+++ b/net/qrtr/qrtr.c
@@ -15,6 +15,7 @@
#include <linux/netlink.h>
#include <linux/qrtr.h>
#include <linux/termios.h> /* For TIOCINQ/OUTQ */
+#include <linux/numa.h>
#include <net/sock.h>
@@ -101,7 +102,7 @@ static inline struct qrtr_sock *qrtr_sk(struct sock *sk)
return container_of(sk, struct qrtr_sock, sk);
}
-static unsigned int qrtr_local_nid = -1;
+static unsigned int qrtr_local_nid = NUMA_NO_NODE;
/* for node ids */
static RADIX_TREE(qrtr_nodes, GFP_KERNEL);
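
Both hunks above replace a literal -1 node id with NUMA_NO_NODE. The constant expands to -1 (see the tools/include/linux/numa.h addition later in this series), so behaviour is unchanged; the gain is readability. A hypothetical kernel-side sketch of the idiom:

#include <linux/numa.h>
#include <linux/slab.h>

/* Hypothetical helper: allocate near a device's node when it is known,
 * otherwise let the allocator pick (NUMA_NO_NODE == -1). */
static void *alloc_ctx_for_node(size_t size, int node)
{
        if (node == NUMA_NO_NODE)
                return kzalloc(size, GFP_KERNEL);

        return kzalloc_node(size, GFP_KERNEL, node);
}
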
diff --git a/scripts/Makefile.kasan b/scripts/Makefile.kasan
index 25c259df8ffa..f1fb8e502657 100644
--- a/scripts/Makefile.kasan
+++ b/scripts/Makefile.kasan
@@ -27,14 +27,9 @@ else
$(call cc-param,asan-globals=1) \
$(call cc-param,asan-instrumentation-with-call-threshold=$(call_threshold)) \
$(call cc-param,asan-stack=1) \
- $(call cc-param,asan-use-after-scope=1) \
$(call cc-param,asan-instrument-allocas=1)
endif
-ifdef CONFIG_KASAN_EXTRA
-CFLAGS_KASAN += $(call cc-option, -fsanitize-address-use-after-scope)
-endif
-
endif # CONFIG_KASAN_GENERIC
ifdef CONFIG_KASAN_SW_TAGS
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index d62abd032885..d0001fd1112d 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -61,7 +61,7 @@ my $codespellfile = "/usr/share/codespell/dictionary.txt";
my $conststructsfile = "$D/const_structs.checkpatch";
my $typedefsfile = "";
my $color = "auto";
-my $allow_c99_comments = 1;
+my $allow_c99_comments = 1; # Can be overridden by --ignore C99_COMMENT_TOLERANCE
sub help {
my ($exitcode) = @_;
@@ -466,6 +466,16 @@ our $logFunctions = qr{(?x:
seq_vprintf|seq_printf|seq_puts
)};
+our $allocFunctions = qr{(?x:
+ (?:(?:devm_)?
+ (?:kv|k|v)[czm]alloc(?:_node|_array)? |
+ kstrdup(?:_const)? |
+ kmemdup(?:_nul)?) |
+ (?:\w+)?alloc_skb(?:ip_align)? |
+ # dev_alloc_skb/netdev_alloc_skb, et al
+ dma_alloc_coherent
+)};
+
our $signature_tags = qr{(?xi:
Signed-off-by:|
Co-developed-by:|
@@ -1011,6 +1021,7 @@ if ($git) {
}
my $vname;
+$allow_c99_comments = !defined $ignore_type{"C99_COMMENT_TOLERANCE"};
for my $filename (@ARGV) {
my $FILE;
if ($git) {
@@ -2696,8 +2707,10 @@ sub process {
($line =~ /^\s*(?:WARNING:|BUG:)/ ||
$line =~ /^\s*\[\s*\d+\.\d{6,6}\s*\]/ ||
# timestamp
- $line =~ /^\s*\[\<[0-9a-fA-F]{8,}\>\]/)) {
- # stack dump address
+ $line =~ /^\s*\[\<[0-9a-fA-F]{8,}\>\]/) ||
+ $line =~ /^(?:\s+\w+:\s+[0-9a-fA-F]+){3,3}/ ||
+ $line =~ /^\s*\#\d+\s*\[[0-9a-fA-F]+\]\s*\w+ at [0-9a-fA-F]+/) {
+ # stack dump address styles
$commit_log_possible_stack_dump = 1;
}
@@ -3037,16 +3050,24 @@ sub process {
$comment = '..';
}
+# check SPDX comment style for .[chsS] files
+ if ($realfile =~ /\.[chsS]$/ &&
+ $rawline =~ /SPDX-License-Identifier:/ &&
+ $rawline !~ m@^\+\s*\Q$comment\E\s*@) {
+ WARN("SPDX_LICENSE_TAG",
+ "Improper SPDX comment style for '$realfile', please use '$comment' instead\n" . $herecurr);
+ }
+
if ($comment !~ /^$/ &&
- $rawline !~ /^\+\Q$comment\E SPDX-License-Identifier: /) {
- WARN("SPDX_LICENSE_TAG",
- "Missing or malformed SPDX-License-Identifier tag in line $checklicenseline\n" . $herecurr);
+ $rawline !~ m@^\+\Q$comment\E SPDX-License-Identifier: @) {
+ WARN("SPDX_LICENSE_TAG",
+ "Missing or malformed SPDX-License-Identifier tag in line $checklicenseline\n" . $herecurr);
} elsif ($rawline =~ /(SPDX-License-Identifier: .*)/) {
- my $spdx_license = $1;
- if (!is_SPDX_License_valid($spdx_license)) {
- WARN("SPDX_LICENSE_TAG",
- "'$spdx_license' is not supported in LICENSES/...\n" . $herecurr);
- }
+ my $spdx_license = $1;
+ if (!is_SPDX_License_valid($spdx_license)) {
+ WARN("SPDX_LICENSE_TAG",
+ "'$spdx_license' is not supported in LICENSES/...\n" . $herecurr);
+ }
}
}
}
@@ -3054,6 +3075,14 @@ sub process {
# check we are in a valid source file if not then ignore this hunk
next if ($realfile !~ /\.(h|c|s|S|sh|dtsi|dts)$/);
+# check for using SPDX-License-Identifier on the wrong line number
+ if ($realline != $checklicenseline &&
+ $rawline =~ /\bSPDX-License-Identifier:/ &&
+ substr($line, @-, @+ - @-) eq "$;" x (@+ - @-)) {
+ WARN("SPDX_LICENSE_TAG",
+ "Misplaced SPDX-License-Identifier tag - use line $checklicenseline instead\n" . $herecurr);
+ }
+
# line length limit (with some exclusions)
#
# There are a few types of lines that may extend beyond $max_line_length:
@@ -5545,7 +5574,8 @@ sub process {
my ($s, $c) = ctx_statement_block($linenr - 3, $realcnt, 0);
# print("line: <$line>\nprevline: <$prevline>\ns: <$s>\nc: <$c>\n\n\n");
- if ($s =~ /(?:^|\n)[ \+]\s*(?:$Type\s*)?\Q$testval\E\s*=\s*(?:\([^\)]*\)\s*)?\s*(?:devm_)?(?:[kv][czm]alloc(?:_node|_array)?\b|kstrdup|kmemdup|(?:dev_)?alloc_skb)/) {
+ if ($s =~ /(?:^|\n)[ \+]\s*(?:$Type\s*)?\Q$testval\E\s*=\s*(?:\([^\)]*\)\s*)?\s*$allocFunctions\s*\(/ &&
+ $s !~ /\b__GFP_NOWARN\b/ ) {
WARN("OOM_MESSAGE",
"Possible unnecessary 'out of memory' message\n" . $hereprev);
}
@@ -6196,8 +6226,8 @@ sub process {
}
}
-# check for pointless casting of kmalloc return
- if ($line =~ /\*\s*\)\s*[kv][czm]alloc(_node){0,1}\b/) {
+# check for pointless casting of alloc functions
+ if ($line =~ /\*\s*\)\s*$allocFunctions\b/) {
WARN("UNNECESSARY_CASTS",
"unnecessary cast may hide bugs, see http://c-faq.com/malloc/mallocnocast.html\n" . $herecurr);
}
@@ -6205,7 +6235,7 @@ sub process {
# alloc style
# p = alloc(sizeof(struct foo), ...) should be p = alloc(sizeof(*p), ...)
if ($perl_version_ok &&
- $line =~ /\b($Lval)\s*\=\s*(?:$balanced_parens)?\s*([kv][mz]alloc(?:_node)?)\s*\(\s*(sizeof\s*\(\s*struct\s+$Lval\s*\))/) {
+ $line =~ /\b($Lval)\s*\=\s*(?:$balanced_parens)?\s*((?:kv|k|v)[mz]alloc(?:_node)?)\s*\(\s*(sizeof\s*\(\s*struct\s+$Lval\s*\))/) {
CHK("ALLOC_SIZEOF_STRUCT",
"Prefer $3(sizeof(*$1)...) over $3($4...)\n" . $herecurr);
}
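
The checkpatch changes above collect the recognised allocator calls into $allocFunctions and exempt allocations that pass __GFP_NOWARN from the OOM_MESSAGE check. A hypothetical driver fragment showing the two cases (the mydrv names are invented; only the first form is flagged, since a failing allocation without __GFP_NOWARN already logs a generic warning):

#include <linux/printk.h>
#include <linux/slab.h>

static int mydrv_alloc_buffers(size_t len)
{
        void *buf;

        /* Flagged by OOM_MESSAGE: a failing kmalloc() already logs a
         * generic warning, so this message is redundant. */
        buf = kmalloc(len, GFP_KERNEL);
        if (!buf) {
                pr_err("mydrv: out of memory\n");
                return -ENOMEM;
        }
        kfree(buf);

        /* Not flagged: __GFP_NOWARN suppresses the generic warning, so a
         * driver-specific message actually adds information. */
        buf = kmalloc(len, GFP_KERNEL | __GFP_NOWARN);
        if (!buf) {
                pr_warn("mydrv: no memory for %zu-byte buffer\n", len);
                return -ENOMEM;
        }
        kfree(buf);
        return 0;
}
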
diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh
index 98a7d63a723e..bcdd45df3f51 100755
--- a/scripts/decode_stacktrace.sh
+++ b/scripts/decode_stacktrace.sh
@@ -37,6 +37,13 @@ parse_symbol() {
symbol=${symbol#\(}
symbol=${symbol%\)}
+ # Strip segment
+ local segment
+ if [[ $symbol == *:* ]] ; then
+ segment=${symbol%%:*}:
+ symbol=${symbol#*:}
+ fi
+
# Strip the symbol name so that we could look it up
local name=${symbol%+*}
@@ -84,7 +91,7 @@ parse_symbol() {
code=${code//$'\n'/' '}
# Replace old address with pretty line numbers
- symbol="$name ($code)"
+ symbol="$segment$name ($code)"
}
decode_code() {
diff --git a/scripts/gcc-plugins/Kconfig b/scripts/gcc-plugins/Kconfig
index d0cc92e48f6f..74271dba4f94 100644
--- a/scripts/gcc-plugins/Kconfig
+++ b/scripts/gcc-plugins/Kconfig
@@ -68,10 +68,6 @@ config GCC_PLUGIN_LATENT_ENTROPY
config GCC_PLUGIN_STRUCTLEAK
bool "Zero initialize stack variables"
- # Currently STRUCTLEAK inserts initialization out of live scope of
- # variables from KASAN point of view. This leads to KASAN false
- # positive reports. Prohibit this combination for now.
- depends on !KASAN_EXTRA
help
While the kernel is built with warnings enabled for any missed
stack variable initializations, this warning is silenced for
diff --git a/scripts/spelling.txt b/scripts/spelling.txt
index 517d0c3f83df..86b87332b9e5 100644
--- a/scripts/spelling.txt
+++ b/scripts/spelling.txt
@@ -10,6 +10,8 @@
abandonning||abandoning
abigious||ambiguous
abitrate||arbitrate
+abnornally||abnormally
+abnrormal||abnormal
abord||abort
aboslute||absolute
abov||above
@@ -107,6 +109,7 @@ ambigious||ambiguous
amoung||among
amout||amount
amplifer||amplifier
+amplifyer||amplifier
an union||a union
an user||a user
an userspace||a userspace
@@ -145,6 +148,7 @@ artillary||artillery
asign||assign
asser||assert
assertation||assertion
+assertting||asserting
assiged||assigned
assigment||assignment
assigments||assignments
@@ -168,6 +172,8 @@ attachement||attachment
attched||attached
attemps||attempts
attemping||attempting
+attepmpt||attempt
+attnetion||attention
attruibutes||attributes
authentification||authentication
automaticaly||automatically
@@ -217,6 +223,7 @@ boardcast||broadcast
borad||board
boundry||boundary
brievely||briefly
+broadcase||broadcast
broadcat||broadcast
bufufer||buffer
cacluated||calculated
@@ -234,6 +241,7 @@ cancle||cancel
capabilites||capabilities
capabilty||capability
capabitilies||capabilities
+capablity||capability
capatibilities||capabilities
capapbilities||capabilities
caputure||capture
@@ -274,6 +282,7 @@ clared||cleared
closeing||closing
clustred||clustered
coexistance||coexistence
+colescing||coalescing
collapsable||collapsible
colorfull||colorful
comand||command
@@ -290,6 +299,7 @@ comsumer||consumer
comsuming||consuming
compability||compatibility
compaibility||compatibility
+comparsion||comparison
compatability||compatibility
compatable||compatible
compatibiliy||compatibility
@@ -303,6 +313,7 @@ completly||completely
complient||compliant
componnents||components
compoment||component
+comppatible||compatible
compres||compress
compresion||compression
comression||compression
@@ -368,6 +379,8 @@ decsribed||described
decription||description
dectected||detected
defailt||default
+deferal||deferral
+deffered||deferred
defferred||deferred
definate||definite
definately||definitely
@@ -400,6 +413,7 @@ descritptor||descriptor
desctiptor||descriptor
desriptor||descriptor
desriptors||descriptors
+desination||destination
destionation||destination
destoried||destroyed
destory||destroy
@@ -426,7 +440,9 @@ diffrent||different
differenciate||differentiate
diffrentiate||differentiate
difinition||definition
+dimention||dimension
dimesions||dimensions
+dispalying||displaying
diplay||display
directon||direction
direectly||directly
@@ -442,6 +458,7 @@ disbled||disabled
disconnet||disconnect
discontinous||discontinuous
disharge||discharge
+disnabled||disabled
dispertion||dispersion
dissapears||disappears
distiction||distinction
@@ -456,6 +473,7 @@ dorp||drop
dosen||doesn
downlad||download
downlads||downloads
+droped||dropped
druing||during
dynmaic||dynamic
eanable||enable
@@ -471,6 +489,7 @@ elementry||elementary
eletronic||electronic
embeded||embedded
enabledi||enabled
+enble||enable
enchanced||enhanced
encorporating||incorporating
encrupted||encrypted
@@ -479,6 +498,9 @@ encryptio||encryption
endianess||endianness
enhaced||enhanced
enlightnment||enlightenment
+enqueing||enqueuing
+entires||entries
+entites||entities
entrys||entries
enocded||encoded
enterily||entirely
@@ -498,6 +520,8 @@ etsbalishment||establishment
excecutable||executable
exceded||exceeded
excellant||excellent
+execeeded||exceeded
+execeeds||exceeds
exeed||exceed
existance||existence
existant||existent
@@ -506,6 +530,7 @@ exlcude||exclude
exlcusive||exclusive
exmaple||example
expecially||especially
+experies||expires
explicite||explicit
explicitely||explicitly
explict||explicit
@@ -521,6 +546,7 @@ extracter||extractor
faield||failed
falied||failed
faild||failed
+failded||failed
failer||failure
faill||fail
failied||failed
@@ -540,6 +566,7 @@ fetaure||feature
fetaures||features
fileystem||filesystem
fimware||firmware
+firmare||firmware
firware||firmware
finanize||finalize
findn||find
@@ -574,6 +601,7 @@ funtions||functions
furthur||further
futhermore||furthermore
futrue||future
+gauage||gauge
gaurenteed||guaranteed
generiously||generously
genereate||generate
@@ -645,6 +673,7 @@ independed||independent
indiate||indicate
indicat||indicate
inexpect||inexpected
+inferface||interface
infomation||information
informatiom||information
informations||information
@@ -662,14 +691,17 @@ initialiazation||initialization
initializiation||initialization
initialze||initialize
initialzed||initialized
+initialzing||initializing
initilization||initialization
initilize||initialize
inofficial||unofficial
inrerface||interface
insititute||institute
+instace||instance
instal||install
instanciate||instantiate
instanciated||instantiated
+insufficent||insufficient
inteface||interface
integreated||integrated
integrety||integrity
@@ -684,6 +716,8 @@ intermittant||intermittent
internel||internal
interoprability||interoperability
interuupt||interrupt
+interupt||interrupt
+interupts||interrupts
interrface||interface
interrrupt||interrupt
interrup||interrupt
@@ -699,11 +733,14 @@ intialization||initialization
intialized||initialized
intialize||initialize
intregral||integral
+intrerrupt||interrupt
intrrupt||interrupt
intterrupt||interrupt
intuative||intuitive
inavlid||invalid
invaid||invalid
+invaild||invalid
+invailid||invalid
invald||invalid
invalde||invalid
invalide||invalid
@@ -712,6 +749,7 @@ invalud||invalid
invididual||individual
invokation||invocation
invokations||invocations
+ireelevant||irrelevant
irrelevent||irrelevant
isnt||isn't
isssue||issue
@@ -747,6 +785,7 @@ loobpack||loopback
loosing||losing
losted||lost
machinary||machinery
+maibox||mailbox
maintainance||maintenance
maintainence||maintenance
maintan||maintain
@@ -758,14 +797,19 @@ managable||manageable
managment||management
mangement||management
manoeuvering||maneuvering
+manufaucturing||manufacturing
mappping||mapping
matchs||matches
mathimatical||mathematical
mathimatic||mathematic
mathimatics||mathematics
+maximium||maximum
maxium||maximum
mechamism||mechanism
meetign||meeting
+memeory||memory
+memmber||member
+memoery||memory
ment||meant
mergable||mergeable
mesage||message
@@ -779,6 +823,7 @@ migrateable||migratable
milliseonds||milliseconds
minium||minimum
minimam||minimum
+miniumum||minimum
minumum||minimum
misalinged||misaligned
miscelleneous||miscellaneous
@@ -839,6 +884,7 @@ occurence||occurrence
occure||occurred
occured||occurred
occuring||occurring
+offser||offset
offet||offset
offloded||offloaded
omited||omitted
@@ -855,6 +901,7 @@ optmizations||optimizations
orientatied||orientated
orientied||oriented
orignal||original
+originial||original
otherise||otherwise
ouput||output
oustanding||outstanding
@@ -874,6 +921,7 @@ packege||package
packge||package
packtes||packets
pakage||package
+paket||packet
pallette||palette
paln||plan
paramameters||parameters
@@ -886,6 +934,8 @@ paramters||parameters
parmaters||parameters
particuarly||particularly
particularily||particularly
+partion||partition
+partions||partitions
partiton||partition
pased||passed
passin||passing
@@ -897,10 +947,12 @@ peice||piece
pendantic||pedantic
peprocessor||preprocessor
perfoming||performing
+peripherial||peripheral
permissons||permissions
peroid||period
persistance||persistence
persistant||persistent
+phoneticly||phonetically
plalform||platform
platfoem||platform
platfrom||platform
@@ -915,6 +967,7 @@ posible||possible
positon||position
possibilites||possibilities
powerfull||powerful
+pramater||parameter
preamle||preamble
preample||preamble
preapre||prepare
@@ -976,6 +1029,7 @@ psudo||pseudo
psuedo||pseudo
psychadelic||psychedelic
pwoer||power
+queing||queuing
quering||querying
randomally||randomly
raoming||roaming
@@ -1004,6 +1058,7 @@ refering||referring
refernces||references
refernnce||reference
refrence||reference
+registed||registered
registerd||registered
registeration||registration
registeresd||registered
@@ -1018,6 +1073,7 @@ regulamentations||regulations
reigstration||registration
releated||related
relevent||relevant
+reloade||reload
remoote||remote
remore||remote
removeable||removable
@@ -1036,19 +1092,23 @@ requried||required
requst||request
reregisteration||reregistration
reseting||resetting
+reseved||reserved
reseverd||reserved
resizeable||resizable
resouce||resource
resouces||resources
resoures||resources
responce||response
+resrouce||resource
ressizes||resizes
ressource||resource
ressources||resources
restesting||retesting
+resumbmitting||resubmitting
retransmited||retransmitted
retreived||retrieved
retreive||retrieve
+retreiving||retrieving
retrive||retrieve
retuned||returned
reudce||reduce
@@ -1120,6 +1180,7 @@ sleeped||slept
softwares||software
speach||speech
specfic||specific
+specfield||specified
speciefied||specified
specifc||specific
specifed||specified
@@ -1142,7 +1203,10 @@ staion||station
standardss||standards
standartization||standardization
standart||standard
+standy||standby
+stardard||standard
staticly||statically
+statuss||status
stoped||stopped
stoping||stopping
stoppped||stopped
@@ -1227,12 +1291,14 @@ tipically||typically
timeing||timing
timout||timeout
tmis||this
+toogle||toggle
torerable||tolerable
traking||tracking
tramsmitted||transmitted
tramsmit||transmit
tranasction||transaction
tranfer||transfer
+transcevier||transceiver
transciever||transceiver
transferd||transferred
transfered||transferred
@@ -1267,6 +1333,7 @@ unfortunatelly||unfortunately
unifiy||unify
uniterrupted||uninterrupted
unintialized||uninitialized
+unitialized||uninitialized
unkmown||unknown
unknonw||unknown
unknow||unknown
@@ -1291,7 +1358,9 @@ unsuccessfull||unsuccessful
unsuported||unsupported
untill||until
unuseful||useless
+unvalid||invalid
upate||update
+upsupported||unsupported
usefule||useful
usefull||useful
usege||usage
diff --git a/tools/include/linux/numa.h b/tools/include/linux/numa.h
new file mode 100644
index 000000000000..110b0e5d0fb0
--- /dev/null
+++ b/tools/include/linux/numa.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_NUMA_H
+#define _LINUX_NUMA_H
+
+
+#ifdef CONFIG_NODES_SHIFT
+#define NODES_SHIFT CONFIG_NODES_SHIFT
+#else
+#define NODES_SHIFT 0
+#endif
+
+#define MAX_NUMNODES (1 << NODES_SHIFT)
+
+#define NUMA_NO_NODE (-1)
+
+#endif /* _LINUX_NUMA_H */
diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c
index 44195514b19e..98ad783efc69 100644
--- a/tools/perf/bench/numa.c
+++ b/tools/perf/bench/numa.c
@@ -34,6 +34,7 @@
#include <sys/types.h>
#include <linux/kernel.h>
#include <linux/time64.h>
+#include <linux/numa.h>
#include <numa.h>
#include <numaif.h>
@@ -298,7 +299,7 @@ static cpu_set_t bind_to_node(int target_node)
CPU_ZERO(&mask);
- if (target_node == -1) {
+ if (target_node == NUMA_NO_NODE) {
for (cpu = 0; cpu < g->p.nr_cpus; cpu++)
CPU_SET(cpu, &mask);
} else {
@@ -339,7 +340,7 @@ static void bind_to_memnode(int node)
unsigned long nodemask;
int ret;
- if (node == -1)
+ if (node == NUMA_NO_NODE)
return;
BUG_ON(g->p.nr_nodes > (int)sizeof(nodemask)*8);
@@ -1363,7 +1364,7 @@ static void init_thread_data(void)
int cpu;
/* Allow all nodes by default: */
- td->bind_node = -1;
+ td->bind_node = NUMA_NO_NODE;
/* Allow all CPUs by default: */
CPU_ZERO(&td->bind_cpumask);
diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c
index 10baa1652fc2..c67d32eeb668 100644
--- a/tools/testing/selftests/memfd/memfd_test.c
+++ b/tools/testing/selftests/memfd/memfd_test.c
@@ -54,6 +54,22 @@ static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags)
return fd;
}
+static int mfd_assert_reopen_fd(int fd_in)
+{
+ int r, fd;
+ char path[100];
+
+ sprintf(path, "/proc/self/fd/%d", fd_in);
+
+ fd = open(path, O_RDWR);
+ if (fd < 0) {
+ printf("re-open of existing fd %d failed\n", fd_in);
+ abort();
+ }
+
+ return fd;
+}
+
static void mfd_fail_new(const char *name, unsigned int flags)
{
int r;
@@ -255,6 +271,25 @@ static void mfd_assert_read(int fd)
munmap(p, mfd_def_size);
}
+/* Test that PROT_READ + MAP_SHARED mappings work. */
+static void mfd_assert_read_shared(int fd)
+{
+ void *p;
+
+ /* verify PROT_READ and MAP_SHARED *is* allowed */
+ p = mmap(NULL,
+ mfd_def_size,
+ PROT_READ,
+ MAP_SHARED,
+ fd,
+ 0);
+ if (p == MAP_FAILED) {
+ printf("mmap() failed: %m\n");
+ abort();
+ }
+ munmap(p, mfd_def_size);
+}
+
static void mfd_assert_write(int fd)
{
ssize_t l;
@@ -693,6 +728,44 @@ static void test_seal_write(void)
}
/*
+ * Test SEAL_FUTURE_WRITE
+ * Test whether SEAL_FUTURE_WRITE actually prevents modifications.
+ */
+static void test_seal_future_write(void)
+{
+ int fd, fd2;
+ void *p;
+
+ printf("%s SEAL-FUTURE-WRITE\n", memfd_str);
+
+ fd = mfd_assert_new("kern_memfd_seal_future_write",
+ mfd_def_size,
+ MFD_CLOEXEC | MFD_ALLOW_SEALING);
+
+ p = mfd_assert_mmap_shared(fd);
+
+ mfd_assert_has_seals(fd, 0);
+
+ mfd_assert_add_seals(fd, F_SEAL_FUTURE_WRITE);
+ mfd_assert_has_seals(fd, F_SEAL_FUTURE_WRITE);
+
+ /* read should pass, writes should fail */
+ mfd_assert_read(fd);
+ mfd_assert_read_shared(fd);
+ mfd_fail_write(fd);
+
+ fd2 = mfd_assert_reopen_fd(fd);
+ /* read should pass, writes should still fail */
+ mfd_assert_read(fd2);
+ mfd_assert_read_shared(fd2);
+ mfd_fail_write(fd2);
+
+ munmap(p, mfd_def_size);
+ close(fd2);
+ close(fd);
+}
+
+/*
* Test SEAL_SHRINK
* Test whether SEAL_SHRINK actually prevents shrinking
*/
@@ -945,6 +1018,7 @@ int main(int argc, char **argv)
test_basic();
test_seal_write();
+ test_seal_future_write();
test_seal_shrink();
test_seal_grow();
test_seal_resize();
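
test_seal_future_write() above exercises the new F_SEAL_FUTURE_WRITE seal, including re-opening the memfd through /proc/self/fd. For orientation, a stripped-down userspace sketch of the seal's semantics; it assumes a glibc that provides memfd_create(), and the fallback seal value mirrors the definition merged with this series:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef F_SEAL_FUTURE_WRITE
#define F_SEAL_FUTURE_WRITE 0x0010      /* assumed value, see linux/fcntl.h */
#endif

int main(void)
{
        int fd = memfd_create("demo", MFD_CLOEXEC | MFD_ALLOW_SEALING);
        char *early;

        if (fd < 0 || ftruncate(fd, 4096) < 0)
                return 1;

        /* A writable shared mapping taken before sealing stays usable... */
        early = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (early == MAP_FAILED)
                return 1;

        if (fcntl(fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE) < 0)
                return 1;
        strcpy(early, "still writable through the old mapping");

        /* ...but any new write access is refused from now on. */
        if (write(fd, "x", 1) < 0)
                perror("write() after F_SEAL_FUTURE_WRITE");            /* EPERM */
        if (mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0) == MAP_FAILED)
                perror("PROT_WRITE mmap() after seal");                 /* EPERM */

        return 0;
}
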
diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore
index 29bac5ef9a93..444ad39d3700 100644
--- a/tools/testing/selftests/proc/.gitignore
+++ b/tools/testing/selftests/proc/.gitignore
@@ -2,6 +2,7 @@
/fd-002-posix-eq
/fd-003-kthread
/proc-loadavg-001
+/proc-pid-vm
/proc-self-map-files-001
/proc-self-map-files-002
/proc-self-syscall
diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile
index 434d033ee067..5163dc887aa3 100644
--- a/tools/testing/selftests/proc/Makefile
+++ b/tools/testing/selftests/proc/Makefile
@@ -6,6 +6,7 @@ TEST_GEN_PROGS += fd-001-lookup
TEST_GEN_PROGS += fd-002-posix-eq
TEST_GEN_PROGS += fd-003-kthread
TEST_GEN_PROGS += proc-loadavg-001
+TEST_GEN_PROGS += proc-pid-vm
TEST_GEN_PROGS += proc-self-map-files-001
TEST_GEN_PROGS += proc-self-map-files-002
TEST_GEN_PROGS += proc-self-syscall
diff --git a/tools/testing/selftests/proc/proc-loadavg-001.c b/tools/testing/selftests/proc/proc-loadavg-001.c
index fcff7047000d..471e2aa28077 100644
--- a/tools/testing/selftests/proc/proc-loadavg-001.c
+++ b/tools/testing/selftests/proc/proc-loadavg-001.c
@@ -30,7 +30,7 @@ int main(void)
if (unshare(CLONE_NEWPID) == -1) {
if (errno == ENOSYS || errno == EPERM)
- return 2;
+ return 4;
return 1;
}
diff --git a/tools/testing/selftests/proc/proc-pid-vm.c b/tools/testing/selftests/proc/proc-pid-vm.c
new file mode 100644
index 000000000000..bbe8150d18aa
--- /dev/null
+++ b/tools/testing/selftests/proc/proc-pid-vm.c
@@ -0,0 +1,406 @@
+/*
+ * Copyright (c) 2019 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+/*
+ * Fork and exec a tiny 1-page executable which precisely controls its VM.
+ * Test /proc/$PID/maps
+ * Test /proc/$PID/smaps
+ * Test /proc/$PID/smaps_rollup
+ * Test /proc/$PID/statm
+ *
+ * FIXME require CONFIG_TMPFS which can be disabled
+ * FIXME test other values from "smaps"
+ * FIXME support other archs
+ */
+#undef NDEBUG
+#include <assert.h>
+#include <errno.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <sys/mount.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/uio.h>
+#include <linux/kdev_t.h>
+
+static inline long sys_execveat(int dirfd, const char *pathname, char **argv, char **envp, int flags)
+{
+ return syscall(SYS_execveat, dirfd, pathname, argv, envp, flags);
+}
+
+static void make_private_tmp(void)
+{
+ if (unshare(CLONE_NEWNS) == -1) {
+ if (errno == ENOSYS || errno == EPERM) {
+ exit(4);
+ }
+ exit(1);
+ }
+ if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) == -1) {
+ exit(1);
+ }
+ if (mount(NULL, "/tmp", "tmpfs", 0, NULL) == -1) {
+ exit(1);
+ }
+}
+
+static pid_t pid = -1;
+static void ate(void)
+{
+ if (pid > 0) {
+ kill(pid, SIGTERM);
+ }
+}
+
+struct elf64_hdr {
+ uint8_t e_ident[16];
+ uint16_t e_type;
+ uint16_t e_machine;
+ uint32_t e_version;
+ uint64_t e_entry;
+ uint64_t e_phoff;
+ uint64_t e_shoff;
+ uint32_t e_flags;
+ uint16_t e_ehsize;
+ uint16_t e_phentsize;
+ uint16_t e_phnum;
+ uint16_t e_shentsize;
+ uint16_t e_shnum;
+ uint16_t e_shstrndx;
+};
+
+struct elf64_phdr {
+ uint32_t p_type;
+ uint32_t p_flags;
+ uint64_t p_offset;
+ uint64_t p_vaddr;
+ uint64_t p_paddr;
+ uint64_t p_filesz;
+ uint64_t p_memsz;
+ uint64_t p_align;
+};
+
+#ifdef __x86_64__
+#define PAGE_SIZE 4096
+#define VADDR (1UL << 32)
+#define MAPS_OFFSET 73
+
+#define syscall 0x0f, 0x05
+#define mov_rdi(x) \
+ 0x48, 0xbf, \
+ (x)&0xff, ((x)>>8)&0xff, ((x)>>16)&0xff, ((x)>>24)&0xff, \
+ ((x)>>32)&0xff, ((x)>>40)&0xff, ((x)>>48)&0xff, ((x)>>56)&0xff
+
+#define mov_rsi(x) \
+ 0x48, 0xbe, \
+ (x)&0xff, ((x)>>8)&0xff, ((x)>>16)&0xff, ((x)>>24)&0xff, \
+ ((x)>>32)&0xff, ((x)>>40)&0xff, ((x)>>48)&0xff, ((x)>>56)&0xff
+
+#define mov_eax(x) \
+ 0xb8, (x)&0xff, ((x)>>8)&0xff, ((x)>>16)&0xff, ((x)>>24)&0xff
+
+static const uint8_t payload[] = {
+ /* Casually unmap stack, vDSO and everything else. */
+ /* munmap */
+ mov_rdi(VADDR + 4096),
+ mov_rsi((1ULL << 47) - 4096 - VADDR - 4096),
+ mov_eax(11),
+ syscall,
+
+ /* Ping parent. */
+ /* write(0, &c, 1); */
+ 0x31, 0xff, /* xor edi, edi */
+ 0x48, 0x8d, 0x35, 0x00, 0x00, 0x00, 0x00, /* lea rsi, [rip] */
+ 0xba, 0x01, 0x00, 0x00, 0x00, /* mov edx, 1 */
+ mov_eax(1),
+ syscall,
+
+ /* 1: pause(); */
+ mov_eax(34),
+ syscall,
+
+ 0xeb, 0xf7, /* jmp 1b */
+};
+
+static int make_exe(const uint8_t *payload, size_t len)
+{
+ struct elf64_hdr h;
+ struct elf64_phdr ph;
+
+ struct iovec iov[3] = {
+ {&h, sizeof(struct elf64_hdr)},
+ {&ph, sizeof(struct elf64_phdr)},
+ {(void *)payload, len},
+ };
+ int fd, fd1;
+ char buf[64];
+
+ memset(&h, 0, sizeof(h));
+ h.e_ident[0] = 0x7f;
+ h.e_ident[1] = 'E';
+ h.e_ident[2] = 'L';
+ h.e_ident[3] = 'F';
+ h.e_ident[4] = 2;
+ h.e_ident[5] = 1;
+ h.e_ident[6] = 1;
+ h.e_ident[7] = 0;
+ h.e_type = 2;
+ h.e_machine = 0x3e;
+ h.e_version = 1;
+ h.e_entry = VADDR + sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr);
+ h.e_phoff = sizeof(struct elf64_hdr);
+ h.e_shoff = 0;
+ h.e_flags = 0;
+ h.e_ehsize = sizeof(struct elf64_hdr);
+ h.e_phentsize = sizeof(struct elf64_phdr);
+ h.e_phnum = 1;
+ h.e_shentsize = 0;
+ h.e_shnum = 0;
+ h.e_shstrndx = 0;
+
+ memset(&ph, 0, sizeof(ph));
+ ph.p_type = 1;
+ ph.p_flags = (1<<2)|1;
+ ph.p_offset = 0;
+ ph.p_vaddr = VADDR;
+ ph.p_paddr = 0;
+ ph.p_filesz = sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr) + sizeof(payload);
+ ph.p_memsz = sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr) + sizeof(payload);
+ ph.p_align = 4096;
+
+ fd = openat(AT_FDCWD, "/tmp", O_WRONLY|O_EXCL|O_TMPFILE, 0700);
+ if (fd == -1) {
+ exit(1);
+ }
+
+ if (writev(fd, iov, 3) != sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr) + len) {
+ exit(1);
+ }
+
+ /* Avoid ETXTBSY on exec. */
+ snprintf(buf, sizeof(buf), "/proc/self/fd/%u", fd);
+ fd1 = open(buf, O_RDONLY|O_CLOEXEC);
+ close(fd);
+
+ return fd1;
+}
+#endif
+
+#ifdef __x86_64__
+int main(void)
+{
+ int pipefd[2];
+ int exec_fd;
+
+ atexit(ate);
+
+ make_private_tmp();
+
+ /* Reserve fd 0 for 1-byte pipe ping from child. */
+ close(0);
+ if (open("/", O_RDONLY|O_DIRECTORY|O_PATH) != 0) {
+ return 1;
+ }
+
+ exec_fd = make_exe(payload, sizeof(payload));
+
+ if (pipe(pipefd) == -1) {
+ return 1;
+ }
+ if (dup2(pipefd[1], 0) != 0) {
+ return 1;
+ }
+
+ pid = fork();
+ if (pid == -1) {
+ return 1;
+ }
+ if (pid == 0) {
+ sys_execveat(exec_fd, "", NULL, NULL, AT_EMPTY_PATH);
+ return 1;
+ }
+
+ char _;
+ if (read(pipefd[0], &_, 1) != 1) {
+ return 1;
+ }
+
+ struct stat st;
+ if (fstat(exec_fd, &st) == -1) {
+ return 1;
+ }
+
+ /* Generate "head -n1 /proc/$PID/maps" */
+ char buf0[256];
+ memset(buf0, ' ', sizeof(buf0));
+ int len = snprintf(buf0, sizeof(buf0),
+ "%08lx-%08lx r-xp 00000000 %02lx:%02lx %llu",
+ VADDR, VADDR + PAGE_SIZE,
+ MAJOR(st.st_dev), MINOR(st.st_dev),
+ (unsigned long long)st.st_ino);
+ buf0[len] = ' ';
+ snprintf(buf0 + MAPS_OFFSET, sizeof(buf0) - MAPS_OFFSET,
+ "/tmp/#%llu (deleted)\n", (unsigned long long)st.st_ino);
+
+
+ /* Test /proc/$PID/maps */
+ {
+ char buf[256];
+ ssize_t rv;
+ int fd;
+
+ snprintf(buf, sizeof(buf), "/proc/%u/maps", pid);
+ fd = open(buf, O_RDONLY);
+ if (fd == -1) {
+ return 1;
+ }
+ rv = read(fd, buf, sizeof(buf));
+ assert(rv == strlen(buf0));
+ assert(memcmp(buf, buf0, strlen(buf0)) == 0);
+ }
+
+ /* Test /proc/$PID/smaps */
+ {
+ char buf[1024];
+ ssize_t rv;
+ int fd;
+
+ snprintf(buf, sizeof(buf), "/proc/%u/smaps", pid);
+ fd = open(buf, O_RDONLY);
+ if (fd == -1) {
+ return 1;
+ }
+ rv = read(fd, buf, sizeof(buf));
+ assert(0 <= rv && rv <= sizeof(buf));
+
+ assert(rv >= strlen(buf0));
+ assert(memcmp(buf, buf0, strlen(buf0)) == 0);
+
+#define RSS1 "Rss: 4 kB\n"
+#define RSS2 "Rss: 0 kB\n"
+#define PSS1 "Pss: 4 kB\n"
+#define PSS2 "Pss: 0 kB\n"
+ assert(memmem(buf, rv, RSS1, strlen(RSS1)) ||
+ memmem(buf, rv, RSS2, strlen(RSS2)));
+ assert(memmem(buf, rv, PSS1, strlen(PSS1)) ||
+ memmem(buf, rv, PSS2, strlen(PSS2)));
+
+ static const char *S[] = {
+ "Size: 4 kB\n",
+ "KernelPageSize: 4 kB\n",
+ "MMUPageSize: 4 kB\n",
+ "Anonymous: 0 kB\n",
+ "AnonHugePages: 0 kB\n",
+ "Shared_Hugetlb: 0 kB\n",
+ "Private_Hugetlb: 0 kB\n",
+ "Locked: 0 kB\n",
+ };
+ int i;
+
+ for (i = 0; i < sizeof(S)/sizeof(S[0]); i++) {
+ assert(memmem(buf, rv, S[i], strlen(S[i])));
+ }
+ }
+
+ /* Test /proc/$PID/smaps_rollup */
+ {
+ char bufr[256];
+ memset(bufr, ' ', sizeof(bufr));
+ len = snprintf(bufr, sizeof(bufr),
+ "%08lx-%08lx ---p 00000000 00:00 0",
+ VADDR, VADDR + PAGE_SIZE);
+ bufr[len] = ' ';
+ snprintf(bufr + MAPS_OFFSET, sizeof(bufr) - MAPS_OFFSET,
+ "[rollup]\n");
+
+ char buf[1024];
+ ssize_t rv;
+ int fd;
+
+ snprintf(buf, sizeof(buf), "/proc/%u/smaps_rollup", pid);
+ fd = open(buf, O_RDONLY);
+ if (fd == -1) {
+ return 1;
+ }
+ rv = read(fd, buf, sizeof(buf));
+ assert(0 <= rv && rv <= sizeof(buf));
+
+ assert(rv >= strlen(bufr));
+ assert(memcmp(buf, bufr, strlen(bufr)) == 0);
+
+ assert(memmem(buf, rv, RSS1, strlen(RSS1)) ||
+ memmem(buf, rv, RSS2, strlen(RSS2)));
+ assert(memmem(buf, rv, PSS1, strlen(PSS1)) ||
+ memmem(buf, rv, PSS2, strlen(PSS2)));
+
+ static const char *S[] = {
+ "Anonymous: 0 kB\n",
+ "AnonHugePages: 0 kB\n",
+ "Shared_Hugetlb: 0 kB\n",
+ "Private_Hugetlb: 0 kB\n",
+ "Locked: 0 kB\n",
+ };
+ int i;
+
+ for (i = 0; i < sizeof(S)/sizeof(S[0]); i++) {
+ assert(memmem(buf, rv, S[i], strlen(S[i])));
+ }
+ }
+
+ /* Test /proc/$PID/statm */
+ {
+ char buf[64];
+ ssize_t rv;
+ int fd;
+
+ snprintf(buf, sizeof(buf), "/proc/%u/statm", pid);
+ fd = open(buf, O_RDONLY);
+ if (fd == -1) {
+ return 1;
+ }
+ rv = read(fd, buf, sizeof(buf));
+ assert(rv == 7 * 2);
+
+ assert(buf[0] == '1'); /* ->total_vm */
+ assert(buf[1] == ' ');
+ assert(buf[2] == '0' || buf[2] == '1'); /* rss */
+ assert(buf[3] == ' ');
+	assert(buf[4] == '0' || buf[4] == '1');	/* file rss */
+ assert(buf[5] == ' ');
+ assert(buf[6] == '1'); /* ELF executable segments */
+ assert(buf[7] == ' ');
+ assert(buf[8] == '0');
+ assert(buf[9] == ' ');
+ assert(buf[10] == '0'); /* ->data_vm + ->stack_vm */
+ assert(buf[11] == ' ');
+ assert(buf[12] == '0');
+ assert(buf[13] == '\n');
+ }
+
+ return 0;
+}
+#else
+int main(void)
+{
+ return 4;
+}
+#endif
diff --git a/tools/testing/selftests/proc/proc-self-map-files-002.c b/tools/testing/selftests/proc/proc-self-map-files-002.c
index 85744425b08d..762cb01f2ca7 100644
--- a/tools/testing/selftests/proc/proc-self-map-files-002.c
+++ b/tools/testing/selftests/proc/proc-self-map-files-002.c
@@ -63,7 +63,7 @@ int main(void)
p = mmap((void *)va, PAGE_SIZE, PROT_NONE, MAP_PRIVATE|MAP_FILE|MAP_FIXED, fd, 0);
if (p == MAP_FAILED) {
if (errno == EPERM)
- return 2;
+ return 4;
return 1;
}
diff --git a/tools/testing/selftests/proc/proc-self-syscall.c b/tools/testing/selftests/proc/proc-self-syscall.c
index 5ab5f4810e43..b2993a67a4b3 100644
--- a/tools/testing/selftests/proc/proc-self-syscall.c
+++ b/tools/testing/selftests/proc/proc-self-syscall.c
@@ -39,7 +39,7 @@ int main(void)
fd = open("/proc/self/syscall", O_RDONLY);
if (fd == -1) {
if (errno == ENOENT)
- return 2;
+ return 4;
return 1;
}
diff --git a/tools/testing/selftests/proc/proc-self-wchan.c b/tools/testing/selftests/proc/proc-self-wchan.c
index a38b2fbaa7ad..b467b98a457d 100644
--- a/tools/testing/selftests/proc/proc-self-wchan.c
+++ b/tools/testing/selftests/proc/proc-self-wchan.c
@@ -27,7 +27,7 @@ int main(void)
fd = open("/proc/self/wchan", O_RDONLY);
if (fd == -1) {
if (errno == ENOENT)
- return 2;
+ return 4;
return 1;
}
diff --git a/tools/testing/selftests/proc/read.c b/tools/testing/selftests/proc/read.c
index 563e752e6eba..b3ef9e14d6cc 100644
--- a/tools/testing/selftests/proc/read.c
+++ b/tools/testing/selftests/proc/read.c
@@ -26,8 +26,10 @@
#include <dirent.h>
#include <stdbool.h>
#include <stdlib.h>
+#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
+#include <sys/vfs.h>
#include <fcntl.h>
#include <unistd.h>
@@ -123,10 +125,22 @@ static void f(DIR *d, unsigned int level)
int main(void)
{
DIR *d;
+ struct statfs sfs;
d = opendir("/proc");
if (!d)
+ return 4;
+
+ /* Ensure /proc is proc. */
+ if (fstatfs(dirfd(d), &sfs) == -1) {
+ return 1;
+ }
+ if (sfs.f_type != 0x9fa0) {
+ fprintf(stderr, "error: unexpected f_type %lx\n", (long)sfs.f_type);
return 2;
+ }
+
f(d, 0);
+
return 0;
}
diff --git a/tools/testing/selftests/sysctl/sysctl.sh b/tools/testing/selftests/sysctl/sysctl.sh
index 584eb8ea780a..780ce7123374 100755
--- a/tools/testing/selftests/sysctl/sysctl.sh
+++ b/tools/testing/selftests/sysctl/sysctl.sh
@@ -290,6 +290,58 @@ run_numerictests()
test_rc
}
+check_failure()
+{
+ echo -n "Testing that $1 fails as expected..."
+ reset_vals
+ TEST_STR="$1"
+ orig="$(cat $TARGET)"
+ echo -n "$TEST_STR" > $TARGET 2> /dev/null
+
+ # write should fail and $TARGET should retain its original value
+ if [ $? = 0 ] || [ "$(cat $TARGET)" != "$orig" ]; then
+ echo "FAIL" >&2
+ rc=1
+ else
+ echo "ok"
+ fi
+ test_rc
+}
+
+run_wideint_tests()
+{
+ # sysctl conversion functions receive a boolean sign and ulong
+ # magnitude; here we list the magnitudes we want to test (each of
+ # which will be tested in both positive and negative forms). Since
+ # none of these values fit in 32 bits, writing them to an int- or
+ # uint-typed sysctl should fail.
+ local magnitudes=(
+ # common boundary-condition values (zero, +1, -1, INT_MIN,
+ # and INT_MAX respectively) if truncated to lower 32 bits
+ # (potential for being falsely deemed in range)
+ 0x0000000100000000
+ 0x0000000100000001
+ 0x00000001ffffffff
+ 0x0000000180000000
+ 0x000000017fffffff
+
+ # these look like negatives, but without a leading '-' are
+ # actually large positives (should be rejected as above
+ # despite being zero/+1/-1/INT_MIN/INT_MAX in the lower 32)
+ 0xffffffff00000000
+ 0xffffffff00000001
+ 0xffffffffffffffff
+ 0xffffffff80000000
+ 0xffffffff7fffffff
+ )
+
+ for sign in '' '-'; do
+ for mag in "${magnitudes[@]}"; do
+ check_failure "${sign}${mag}"
+ done
+ done
+}
+
# Your test must accept digits 3 and 4 to use this
run_limit_digit()
{
@@ -556,6 +608,7 @@ sysctl_test_0001()
TEST_STR=$(( $ORIG + 1 ))
run_numerictests
+ run_wideint_tests
run_limit_digit
}
@@ -580,6 +633,7 @@ sysctl_test_0003()
TEST_STR=$(( $ORIG + 1 ))
run_numerictests
+ run_wideint_tests
run_limit_digit
run_limit_digit_int
}
@@ -592,6 +646,7 @@ sysctl_test_0004()
TEST_STR=$(( $ORIG + 1 ))
run_numerictests
+ run_wideint_tests
run_limit_digit
run_limit_digit_uint
}
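
The comment in run_wideint_tests() above notes that the chosen magnitudes would be wrongly accepted by a conversion path that truncates to 32 bits instead of range-checking. A small, standalone C illustration of that hazard (illustrative only, not part of the selftest):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* One of the magnitudes from the list above: far outside int range,
         * yet its low 32 bits look like a perfectly valid value of 1. */
        uint64_t wide = 0xffffffff00000001ULL;
        int32_t truncated = (int32_t)wide;

        printf("wide = %llu, truncated to 32 bits = %d\n",
               (unsigned long long)wide, truncated);
        return 0;
}
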
diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests
index 584a91ae4a8f..951c507a27f7 100755
--- a/tools/testing/selftests/vm/run_vmtests
+++ b/tools/testing/selftests/vm/run_vmtests
@@ -211,4 +211,20 @@ else
echo "[PASS]"
fi
+echo "------------------------------------"
+echo "running vmalloc stability smoke test"
+echo "------------------------------------"
+./test_vmalloc.sh smoke
+ret_val=$?
+
+if [ $ret_val -eq 0 ]; then
+ echo "[PASS]"
+elif [ $ret_val -eq $ksft_skip ]; then
+ echo "[SKIP]"
+ exitcode=$ksft_skip
+else
+ echo "[FAIL]"
+ exitcode=1
+fi
+
exit $exitcode
diff --git a/tools/testing/selftests/vm/test_vmalloc.sh b/tools/testing/selftests/vm/test_vmalloc.sh
new file mode 100644
index 000000000000..06d2bb109f06
--- /dev/null
+++ b/tools/testing/selftests/vm/test_vmalloc.sh
@@ -0,0 +1,176 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2018 Uladzislau Rezki (Sony) <urezki@gmail.com>
+#
+# This is a test script for the kernel test driver used to analyse the
+# vmalloc allocator. As such it is just a kernel module loader. You can
+# specify and pass different parameters in order to:
+# a) analyse the performance of vmalloc allocations;
+# b) stress-test and check the stability of the vmalloc subsystem.
+
+TEST_NAME="vmalloc"
+DRIVER="test_${TEST_NAME}"
+
+# 1 if fails
+exitcode=1
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+#
+# Static templates for performance, stress and smoke tests.
+# It is also possible to pass any supported parameters manually.
+#
+PERF_PARAM="single_cpu_test=1 sequential_test_order=1 test_repeat_count=3"
+SMOKE_PARAM="single_cpu_test=1 test_loop_count=10000 test_repeat_count=10"
+STRESS_PARAM="test_repeat_count=20"
+
+check_test_requirements()
+{
+ uid=$(id -u)
+ if [ $uid -ne 0 ]; then
+ echo "$0: Must be run as root"
+ exit $ksft_skip
+ fi
+
+ if ! which modprobe > /dev/null 2>&1; then
+ echo "$0: You need modprobe installed"
+ exit $ksft_skip
+ fi
+
+ if ! modinfo $DRIVER > /dev/null 2>&1; then
+ echo "$0: You must have the following enabled in your kernel:"
+ echo "CONFIG_TEST_VMALLOC=m"
+ exit $ksft_skip
+ fi
+}
+
+run_performance_check()
+{
+ echo "Run performance tests to evaluate how fast vmalloc allocation is."
+ echo "It runs all test cases on a single CPU in sequential order."
+
+ modprobe $DRIVER $PERF_PARAM > /dev/null 2>&1
+ echo "Done."
+ echo "Check the kernel ring buffer to see the summary."
+}
+
+run_stability_check()
+{
+ echo "Run stability tests. In order to stress vmalloc subsystem we run"
+ echo "all available test cases on all available CPUs simultaneously."
+ echo "It will take time, so be patient."
+
+ modprobe $DRIVER $STRESS_PARAM > /dev/null 2>&1
+ echo "Done."
+ echo "Check the kernel ring buffer to see the summary."
+}
+
+run_smoke_check()
+{
+ echo "Run smoke test. Note, this test provides basic coverage."
+ echo "Please check $0 output how it can be used"
+ echo "for deep performance analysis as well as stress testing."
+
+ modprobe $DRIVER $SMOKE_PARAM > /dev/null 2>&1
+ echo "Done."
+ echo "Check the kernel ring buffer to see the summary."
+}
+
+usage()
+{
+ echo -n "Usage: $0 [ performance ] | [ stress ] | | [ smoke ] | "
+ echo "manual parameters"
+ echo
+ echo "Valid tests and parameters:"
+ echo
+ modinfo $DRIVER
+ echo
+ echo "Example usage:"
+ echo
+ echo "# Shows help message"
+ echo "./${DRIVER}.sh"
+ echo
+ echo "# Runs 1 test(id_1), repeats it 5 times on all online CPUs"
+ echo "./${DRIVER}.sh run_test_mask=1 test_repeat_count=5"
+ echo
+ echo -n "# Runs 4 tests(id_1|id_2|id_4|id_16) on one CPU with "
+ echo "sequential order"
+ echo -n "./${DRIVER}.sh single_cpu_test=1 sequential_test_order=1 "
+ echo "run_test_mask=23"
+ echo
+ echo -n "# Runs all tests on all online CPUs, shuffled order, repeats "
+ echo "20 times"
+ echo "./${DRIVER}.sh test_repeat_count=20"
+ echo
+ echo "# Performance analysis"
+ echo "./${DRIVER}.sh performance"
+ echo
+ echo "# Stress testing"
+ echo "./${DRIVER}.sh stress"
+ echo
+ exit 0
+}
+
+function validate_passed_args()
+{
+ VALID_ARGS=`modinfo $DRIVER | awk '/parm:/ {print $2}' | sed 's/:.*//'`
+
+ #
+ # Something has been passed, check it.
+ #
+ for passed_arg in $@; do
+ key=${passed_arg//=*/}
+ val="${passed_arg:$((${#key}+1))}"
+ valid=0
+
+ for valid_arg in $VALID_ARGS; do
+ if [[ $key = $valid_arg ]] && [[ $val -gt 0 ]]; then
+ valid=1
+ break
+ fi
+ done
+
+ if [[ $valid -ne 1 ]]; then
+ echo "Error: key or value is not correct: ${key} $val"
+ exit $exitcode
+ fi
+ done
+}
+
+function run_manual_check()
+{
+ #
+ # Validate passed parameters. If there is a wrong one,
+ # the script exits and does not execute further.
+ #
+ validate_passed_args $@
+
+ echo "Run the test with following parameters: $@"
+ modprobe $DRIVER $@ > /dev/null 2>&1
+ echo "Done."
+ echo "Check the kernel ring buffer to see the summary."
+}
+
+function run_test()
+{
+ if [ $# -eq 0 ]; then
+ usage
+ else
+ if [[ "$1" = "performance" ]]; then
+ run_performance_check
+ elif [[ "$1" = "stress" ]]; then
+ run_stability_check
+ elif [[ "$1" = "smoke" ]]; then
+ run_smoke_check
+ else
+ run_manual_check $@
+ fi
+ fi
+}
+
+check_test_requirements
+run_test $@
+
+exit 0
diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c
index 1ff3a6c0367b..6f64b2b93234 100644
--- a/tools/vm/page-types.c
+++ b/tools/vm/page-types.c
@@ -133,7 +133,7 @@ static const char * const page_flag_names[] = {
[KPF_NOPAGE] = "n:nopage",
[KPF_KSM] = "x:ksm",
[KPF_THP] = "t:thp",
- [KPF_BALLOON] = "o:balloon",
+ [KPF_OFFLINE] = "o:offline",
[KPF_PGTABLE] = "g:pgtable",
[KPF_ZERO_PAGE] = "z:zero_page",
[KPF_IDLE] = "i:idle_page",
diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c
index 334b16db0ebb..73818f1b2ef8 100644
--- a/tools/vm/slabinfo.c
+++ b/tools/vm/slabinfo.c
@@ -110,39 +110,42 @@ static void fatal(const char *x, ...)
static void usage(void)
{
printf("slabinfo 4/15/2011. (c) 2007 sgi/(c) 2011 Linux Foundation.\n\n"
- "slabinfo [-ahnpvtsz] [-d debugopts] [slab-regexp]\n"
+ "slabinfo [-aADefhilnosrStTvz1LXBU] [N=K] [-dafzput] [slab-regexp]\n"
"-a|--aliases Show aliases\n"
"-A|--activity Most active slabs first\n"
- "-d<options>|--debug=<options> Set/Clear Debug options\n"
+ "-B|--Bytes Show size in bytes\n"
"-D|--display-active Switch line format to activity\n"
"-e|--empty Show empty slabs\n"
"-f|--first-alias Show first alias\n"
"-h|--help Show usage information\n"
"-i|--inverted Inverted list\n"
"-l|--slabs Show slabs\n"
+ "-L|--Loss Sort by loss\n"
"-n|--numa Show NUMA information\n"
- "-o|--ops Show kmem_cache_ops\n"
+ "-N|--lines=K Show the first K slabs\n"
+ "-o|--ops Show kmem_cache_ops\n"
+ "-r|--report Detailed report on single slabs\n"
"-s|--shrink Shrink slabs\n"
- "-r|--report Detailed report on single slabs\n"
"-S|--Size Sort by size\n"
"-t|--tracking Show alloc/free information\n"
"-T|--Totals Show summary information\n"
+ "-U|--Unreclaim Show unreclaimable slabs only\n"
"-v|--validate Validate slabs\n"
"-z|--zero Include empty slabs\n"
"-1|--1ref Single reference\n"
- "-N|--lines=K Show the first K slabs\n"
- "-L|--Loss Sort by loss\n"
"-X|--Xtotals Show extended summary information\n"
- "-B|--Bytes Show size in bytes\n"
- "-U|--Unreclaim Show unreclaimable slabs only\n"
- "\nValid debug options (FZPUT may be combined)\n"
- "a / A Switch on all debug options (=FZUP)\n"
- "- Switch off all debug options\n"
- "f / F Sanity Checks (SLAB_CONSISTENCY_CHECKS)\n"
- "z / Z Redzoning\n"
- "p / P Poisoning\n"
- "u / U Tracking\n"
- "t / T Tracing\n"
+
+ "\n"
+ "-d | --debug Switch off all debug options\n"
+ "-da | --debug=a Switch on all debug options (--debug=FZPU)\n"
+
+ "\n"
+ "-d[afzput] | --debug=[afzput]\n"
+ " f | F Sanity Checks (SLAB_CONSISTENCY_CHECKS)\n"
+ " z | Z Redzoning\n"
+ " p | P Poisoning\n"
+ " u | U Tracking\n"
+ " t | T Tracing\n"
);
}