author    Stephen Rothwell <sfr@canb.auug.org.au>    2019-05-14 13:37:22 +1000
committer Stephen Rothwell <sfr@canb.auug.org.au>    2019-05-14 13:37:22 +1000
commit    c32d7d0b94c4a228969f4c0cd88c078316b4c322 (patch)
tree      c2daf214671787d9c2bea8f8919c44fd39c4b689
parent    c372a7379a1ce0a00ee21ef44272499825365a8d (diff)
parent    b78fc46079e444c5bdf3c2b843cccc2a195d1f75 (diff)
Merge branch 'akpm-current/current'
-rw-r--r--Documentation/ABI/testing/sysfs-kernel-slab9
-rw-r--r--Documentation/accounting/psi.txt107
-rw-r--r--Documentation/admin-guide/cgroup-v2.rst20
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt18
-rw-r--r--Documentation/core-api/genalloc.rst2
-rw-r--r--Documentation/core-api/kernel-api.rst4
-rw-r--r--Documentation/dev-tools/gcov.rst18
-rw-r--r--Documentation/devicetree/bindings/pps/pps-gpio.txt7
-rw-r--r--Documentation/filesystems/autofs-mount-control.txt6
-rw-r--r--Documentation/filesystems/autofs.txt66
-rw-r--r--Documentation/sysctl/vm.txt12
-rw-r--r--Documentation/trace/postprocess/trace-vmscan-postprocess.pl7
-rw-r--r--Documentation/vm/hmm.rst94
-rw-r--r--arch/Kconfig7
-rw-r--r--arch/alpha/mm/init.c14
-rw-r--r--arch/arc/mm/init.c15
-rw-r--r--arch/arm/Kconfig3
-rw-r--r--arch/arm/include/asm/hardirq.h1
-rw-r--r--arch/arm/kernel/atags.h2
-rw-r--r--arch/arm/kernel/smp.c6
-rw-r--r--arch/arm/mm/dma-mapping.c24
-rw-r--r--arch/arm/mm/init.c25
-rw-r--r--arch/arm64/Kconfig5
-rw-r--r--arch/arm64/include/asm/Kbuild1
-rw-r--r--arch/arm64/include/asm/cpufeature.h4
-rw-r--r--arch/arm64/include/asm/hugetlb.h4
-rw-r--r--arch/arm64/mm/init.c17
-rw-r--r--arch/arm64/mm/mmu.c6
-rw-r--r--arch/c6x/mm/init.c12
-rw-r--r--arch/h8300/mm/init.c14
-rw-r--r--arch/hexagon/Kconfig1
-rw-r--r--arch/hexagon/mm/init.c10
-rw-r--r--arch/ia64/Kconfig1
-rw-r--r--arch/ia64/mm/init.c17
-rw-r--r--arch/m68k/Kconfig3
-rw-r--r--arch/m68k/mm/init.c7
-rw-r--r--arch/microblaze/mm/init.c12
-rw-r--r--arch/mips/Kconfig1
-rw-r--r--arch/mips/include/asm/bitops.h4
-rw-r--r--arch/mips/kernel/cpu-bugs64.c4
-rw-r--r--arch/mips/mm/gup.c11
-rw-r--r--arch/mips/mm/init.c8
-rw-r--r--arch/nds32/mm/init.c12
-rw-r--r--arch/nios2/Kconfig3
-rw-r--r--arch/nios2/mm/init.c12
-rw-r--r--arch/openrisc/mm/init.c12
-rw-r--r--arch/parisc/mm/init.c7
-rw-r--r--arch/powerpc/Kconfig2
-rw-r--r--arch/powerpc/include/asm/Kbuild1
-rw-r--r--arch/powerpc/include/asm/book3s/64/hugetlb.h5
-rw-r--r--arch/powerpc/kernel/process.c12
-rw-r--r--arch/powerpc/kernel/prom_init.c6
-rw-r--r--arch/powerpc/kernel/sysfs.c8
-rw-r--r--arch/powerpc/kernel/traps.c2
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_hv.c4
-rw-r--r--arch/powerpc/kvm/book3s_64_vio.c34
-rw-r--r--arch/powerpc/kvm/e500_mmu.c2
-rw-r--r--arch/powerpc/mm/book3s64/iommu_api.c33
-rw-r--r--arch/powerpc/mm/book3s64/radix_tlb.c12
-rw-r--r--arch/powerpc/mm/fault.c6
-rw-r--r--arch/powerpc/mm/mem.c24
-rw-r--r--arch/powerpc/perf/callchain.c20
-rw-r--r--arch/powerpc/perf/core-book3s.c8
-rw-r--r--arch/powerpc/platforms/Kconfig.cputype2
-rw-r--r--arch/powerpc/sysdev/fsl_pci.c10
-rw-r--r--arch/riscv/mm/init.c5
-rw-r--r--arch/s390/Kconfig3
-rw-r--r--arch/s390/include/asm/cpacf.h2
-rw-r--r--arch/s390/include/asm/hugetlb.h8
-rw-r--r--arch/s390/kvm/interrupt.c2
-rw-r--r--arch/s390/mm/init.c19
-rw-r--r--arch/sh/Kconfig2
-rw-r--r--arch/sh/boards/mach-dreamcast/irq.c1
-rw-r--r--arch/sh/mm/gup.c11
-rw-r--r--arch/sh/mm/init.c29
-rw-r--r--arch/sparc/Kconfig1
-rw-r--r--arch/sparc/include/asm/pgtable_64.h30
-rw-r--r--arch/sparc/mm/gup.c9
-rw-r--r--arch/sparc/mm/init_32.c13
-rw-r--r--arch/sparc/mm/init_64.c8
-rw-r--r--arch/um/kernel/mem.c7
-rw-r--r--arch/unicore32/Kconfig1
-rw-r--r--arch/unicore32/mm/init.c24
-rw-r--r--arch/x86/Kconfig7
-rw-r--r--arch/x86/Kconfig.debug14
-rw-r--r--arch/x86/entry/vdso/vdso32/vclock_gettime.c1
-rw-r--r--arch/x86/include/asm/Kbuild1
-rw-r--r--arch/x86/include/asm/hugetlb.h4
-rw-r--r--arch/x86/include/asm/hyperv-tlfs.h2
-rw-r--r--arch/x86/kvm/paging_tmpl.h2
-rw-r--r--arch/x86/kvm/svm.c2
-rw-r--r--arch/x86/mm/hugetlbpage.c2
-rw-r--r--arch/x86/mm/init_32.c11
-rw-r--r--arch/x86/mm/init_64.c20
-rw-r--r--arch/x86/mm/numa.c27
-rw-r--r--arch/xtensa/include/asm/irqflags.h2
-rw-r--r--arch/xtensa/kernel/smp.c2
-rw-r--r--arch/xtensa/mm/init.c5
-rw-r--r--drivers/base/memory.c24
-rw-r--r--drivers/firewire/core-iso.c15
-rw-r--r--drivers/fpga/dfl-afu-dma-region.c42
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c8
-rw-r--r--drivers/gpu/drm/i915/i915_gem_userptr.c2
-rw-r--r--drivers/gpu/drm/radeon/radeon_mn.c4
-rw-r--r--drivers/gpu/drm/rockchip/rockchip_drm_gem.c17
-rw-r--r--drivers/gpu/drm/via/via_dmablit.c3
-rw-r--r--drivers/gpu/drm/xen/xen_drm_front_gem.c18
-rw-r--r--drivers/infiniband/core/umem.c5
-rw-r--r--drivers/infiniband/core/umem_odp.c5
-rw-r--r--drivers/infiniband/hw/hfi1/user_pages.c3
-rw-r--r--drivers/infiniband/hw/mthca/mthca_memfree.c3
-rw-r--r--drivers/infiniband/hw/qib/qib_user_pages.c8
-rw-r--r--drivers/infiniband/hw/qib/qib_user_sdma.c2
-rw-r--r--drivers/infiniband/hw/usnic/usnic_uiom.c9
-rw-r--r--drivers/iommu/dma-iommu.c12
-rw-r--r--drivers/media/common/videobuf2/videobuf2-core.c7
-rw-r--r--drivers/media/common/videobuf2/videobuf2-dma-contig.c6
-rw-r--r--drivers/media/common/videobuf2/videobuf2-dma-sg.c22
-rw-r--r--drivers/media/v4l2-core/videobuf-dma-sg.c6
-rw-r--r--drivers/misc/genwqe/card_utils.c2
-rw-r--r--drivers/misc/sram-exec.c2
-rw-r--r--drivers/misc/vmw_vmci/vmci_host.c2
-rw-r--r--drivers/misc/vmw_vmci/vmci_queue_pair.c6
-rw-r--r--drivers/mtd/nand/raw/vf610_nfc.c2
-rw-r--r--drivers/platform/goldfish/goldfish_pipe.c3
-rw-r--r--drivers/pps/clients/pps-gpio.c153
-rw-r--r--drivers/rapidio/devices/rio_mport_cdev.c4
-rw-r--r--drivers/rapidio/rio_cm.c8
-rw-r--r--drivers/sbus/char/oradax.c2
-rw-r--r--drivers/scsi/st.c3
-rw-r--r--drivers/spi/spi-rockchip.c1
-rw-r--r--drivers/staging/gasket/gasket_page_table.c4
-rw-r--r--drivers/tee/tee_shm.c2
-rw-r--r--drivers/tty/sysrq.c6
-rw-r--r--drivers/vfio/vfio_iommu_spapr_tce.c40
-rw-r--r--drivers/vfio/vfio_iommu_type1.c34
-rw-r--r--drivers/vhost/vhost.c2
-rw-r--r--drivers/video/backlight/pwm_bl.c15
-rw-r--r--drivers/video/fbdev/pvr2fb.c2
-rw-r--r--drivers/virt/fsl_hypervisor.c2
-rw-r--r--drivers/xen/gntdev.c19
-rw-r--r--drivers/xen/privcmd-buf.c8
-rw-r--r--fs/binfmt_elf.c180
-rw-r--r--fs/buffer.c56
-rw-r--r--fs/dax.c2
-rw-r--r--fs/eventfd.c8
-rw-r--r--fs/exec.c4
-rw-r--r--fs/fat/file.c11
-rw-r--r--fs/hugetlbfs/inode.c18
-rw-r--r--fs/io_uring.c5
-rw-r--r--fs/ocfs2/alloc.c15
-rw-r--r--fs/ocfs2/aops.c22
-rw-r--r--fs/ocfs2/dir.c20
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c23
-rw-r--r--fs/ocfs2/dlmglue.c21
-rw-r--r--fs/ocfs2/export.c30
-rw-r--r--fs/ocfs2/ocfs2.h1
-rw-r--r--fs/ocfs2/ocfs2_fs.h28
-rw-r--r--fs/orangefs/orangefs-bufmap.c2
-rw-r--r--fs/proc/base.c2
-rw-r--r--fs/proc/meminfo.c2
-rw-r--r--fs/proc/task_mmu.c5
-rw-r--r--fs/ramfs/inode.c12
-rw-r--r--fs/reiserfs/xattr.c9
-rw-r--r--fs/sync.c21
-rw-r--r--fs/userfaultfd.c5
-rw-r--r--include/asm-generic/dynamic_debug.h116
-rw-r--r--include/asm-generic/hugetlb.h7
-rw-r--r--include/asm-generic/shmparam.h2
-rw-r--r--include/linux/balloon_compaction.h15
-rw-r--r--include/linux/binfmts.h3
-rw-r--r--include/linux/bitops.h16
-rw-r--r--include/linux/compiler_types.h3
-rw-r--r--include/linux/console.h7
-rw-r--r--include/linux/cpumask.h3
-rw-r--r--include/linux/device.h4
-rw-r--r--include/linux/dynamic_debug.h27
-rw-r--r--include/linux/genalloc.h2
-rw-r--r--include/linux/gfp.h4
-rw-r--r--include/linux/hmm.h310
-rw-r--r--include/linux/hugetlb.h4
-rw-r--r--include/linux/ipc_namespace.h1
-rw-r--r--include/linux/jump_label.h2
-rw-r--r--include/linux/kernel.h1
-rw-r--r--include/linux/kthread.h3
-rw-r--r--include/linux/latencytop.h4
-rw-r--r--include/linux/list.h35
-rw-r--r--include/linux/list_sort.h1
-rw-r--r--include/linux/memblock.h44
-rw-r--r--include/linux/memcontrol.h54
-rw-r--r--include/linux/memory.h2
-rw-r--r--include/linux/memory_hotplug.h42
-rw-r--r--include/linux/mm.h117
-rw-r--r--include/linux/mm_inline.h2
-rw-r--r--include/linux/mm_types.h7
-rw-r--r--include/linux/mmu_notifier.h63
-rw-r--r--include/linux/mmzone.h64
-rw-r--r--include/linux/net.h4
-rw-r--r--include/linux/pagemap.h13
-rw-r--r--include/linux/pid.h5
-rw-r--r--include/linux/plist.h4
-rw-r--r--include/linux/poll.h4
-rw-r--r--include/linux/pps-gpio.h5
-rw-r--r--include/linux/printk.h6
-rw-r--r--include/linux/psi.h8
-rw-r--r--include/linux/psi_types.h105
-rw-r--r--include/linux/qcom-geni-se.h2
-rw-r--r--include/linux/reboot.h2
-rw-r--r--include/linux/sched.h11
-rw-r--r--include/linux/uaccess.h34
-rw-r--r--include/linux/userfaultfd_k.h2
-rw-r--r--include/linux/vmalloc.h8
-rw-r--r--include/linux/vmstat.h2
-rw-r--r--include/trace/events/compaction.h10
-rw-r--r--include/trace/events/vmscan.h98
-rw-r--r--include/uapi/linux/byteorder/big_endian.h4
-rw-r--r--include/uapi/linux/byteorder/little_endian.h4
-rw-r--r--include/uapi/linux/fs.h3
-rw-r--r--init/Kconfig24
-rw-r--r--init/initramfs.c147
-rw-r--r--init/main.c5
-rw-r--r--ipc/ipc_sysctl.c14
-rw-r--r--ipc/mqueue.c72
-rw-r--r--ipc/msgutil.c6
-rw-r--r--ipc/util.c48
-rw-r--r--ipc/util.h47
-rw-r--r--kernel/cgroup/cgroup.c71
-rw-r--r--kernel/dma/remap.c2
-rw-r--r--kernel/events/uprobes.c3
-rw-r--r--kernel/exit.c6
-rw-r--r--kernel/fork.c39
-rw-r--r--kernel/futex.c2
-rw-r--r--kernel/gcov/Kconfig3
-rw-r--r--kernel/gcov/Makefile5
-rw-r--r--kernel/gcov/base.c86
-rw-r--r--kernel/gcov/clang.c581
-rw-r--r--kernel/gcov/gcc_3_4.c12
-rw-r--r--kernel/gcov/gcc_4_7.c12
-rw-r--r--kernel/gcov/gcc_base.c86
-rw-r--r--kernel/gcov/gcov.h5
-rw-r--r--kernel/kexec_file.c16
-rw-r--r--kernel/kthread.c1
-rw-r--r--kernel/latencytop.c8
-rw-r--r--kernel/memremap.c12
-rw-r--r--kernel/notifier.c1
-rw-r--r--kernel/panic.c11
-rw-r--r--kernel/pid.c10
-rw-r--r--kernel/printk/printk.c18
-rw-r--r--kernel/reboot.c20
-rw-r--r--kernel/sched/psi.c615
-rw-r--r--kernel/signal.c1
-rw-r--r--kernel/sys.c2
-rw-r--r--kernel/sysctl.c53
-rw-r--r--kernel/user.c7
-rw-r--r--lib/Kconfig14
-rw-r--r--lib/Kconfig.debug34
-rw-r--r--lib/Makefile15
-rw-r--r--lib/bitmap.c280
-rw-r--r--lib/debugobjects.c66
-rw-r--r--lib/dynamic_debug.c111
-rw-r--r--lib/genalloc.c5
-rw-r--r--lib/iov_iter.c7
-rw-r--r--lib/list_sort.c242
-rw-r--r--lib/math/Kconfig11
-rw-r--r--lib/math/Makefile5
-rw-r--r--lib/math/cordic.c (renamed from lib/cordic.c)0
-rw-r--r--lib/math/div64.c (renamed from lib/div64.c)2
-rw-r--r--lib/math/gcd.c (renamed from lib/gcd.c)0
-rw-r--r--lib/math/int_pow.c32
-rw-r--r--lib/math/int_sqrt.c (renamed from lib/int_sqrt.c)0
-rw-r--r--lib/math/lcm.c (renamed from lib/lcm.c)0
-rw-r--r--lib/math/prime_numbers.c (renamed from lib/prime_numbers.c)0
-rw-r--r--lib/math/rational.c102
-rw-r--r--lib/math/reciprocal_div.c (renamed from lib/reciprocal_div.c)0
-rw-r--r--lib/plist.c4
-rw-r--r--lib/rational.c65
-rw-r--r--lib/sort.c254
-rw-r--r--lib/test_bitmap.c67
-rw-r--r--lib/test_sysctl.c18
-rw-r--r--lib/test_vmalloc.c8
-rw-r--r--mm/Kconfig82
-rw-r--r--mm/Kconfig.debug1
-rw-r--r--mm/Makefile7
-rw-r--r--mm/cma.c23
-rw-r--r--mm/cma_debug.c2
-rw-r--r--mm/compaction.c8
-rw-r--r--mm/debug.c5
-rw-r--r--mm/filemap.c163
-rw-r--r--mm/gup.c393
-rw-r--r--mm/gup_benchmark.c5
-rw-r--r--mm/hmm.c1086
-rw-r--r--mm/huge_memory.c17
-rw-r--r--mm/hugetlb.c177
-rw-r--r--mm/khugepaged.c7
-rw-r--r--mm/ksm.c6
-rw-r--r--mm/madvise.c3
-rw-r--r--mm/memblock.c70
-rw-r--r--mm/memcontrol.c112
-rw-r--r--mm/memfd.c2
-rw-r--r--mm/memory.c108
-rw-r--r--mm/memory_hotplug.c132
-rw-r--r--mm/migrate.c7
-rw-r--r--mm/mincore.c23
-rw-r--r--mm/mlock.c4
-rw-r--r--mm/mmap.c18
-rw-r--r--mm/mmu_notifier.c12
-rw-r--r--mm/mprotect.c4
-rw-r--r--mm/mremap.c9
-rw-r--r--mm/nommu.c14
-rw-r--r--mm/oom_kill.c3
-rw-r--r--mm/page_alloc.c372
-rw-r--r--mm/page_isolation.c2
-rw-r--r--mm/rmap.c10
-rw-r--r--mm/shmem.c2
-rw-r--r--mm/shuffle.c207
-rw-r--r--mm/shuffle.h64
-rw-r--r--mm/slab.c61
-rw-r--r--mm/slob.c59
-rw-r--r--mm/slub.c83
-rw-r--r--mm/sparse.c16
-rw-r--r--mm/swap.c2
-rw-r--r--mm/swap_state.c4
-rw-r--r--mm/swapfile.c4
-rw-r--r--mm/userfaultfd.c3
-rw-r--r--mm/util.c59
-rw-r--r--mm/vmalloc.c1187
-rw-r--r--mm/vmscan.c290
-rw-r--r--mm/workingset.c5
-rw-r--r--mm/z3fold.c638
-rw-r--r--net/ceph/pagevec.c2
-rw-r--r--net/netfilter/core.c2
-rw-r--r--net/rds/info.c2
-rw-r--r--net/rds/rdma.c3
-rw-r--r--net/xdp/xdp_umem.c4
-rwxr-xr-xscripts/checkpatch.pl24
-rw-r--r--scripts/gdb/linux/clk.py69
-rw-r--r--scripts/gdb/linux/config.py44
-rw-r--r--scripts/gdb/linux/constants.py.in13
-rw-r--r--scripts/gdb/linux/cpus.py1
-rw-r--r--scripts/gdb/linux/lists.py24
-rw-r--r--scripts/gdb/linux/proc.py10
-rw-r--r--scripts/gdb/linux/rbtree.py177
-rw-r--r--scripts/gdb/linux/symbols.py6
-rw-r--r--scripts/gdb/linux/tasks.py2
-rw-r--r--scripts/gdb/linux/timerlist.py219
-rw-r--r--scripts/gdb/linux/utils.py7
-rw-r--r--scripts/gdb/vmlinux-gdb.py4
-rw-r--r--scripts/spelling.txt579
-rw-r--r--tools/testing/selftests/exec/.gitignore3
-rw-r--r--tools/testing/selftests/exec/Makefile4
-rw-r--r--tools/testing/selftests/exec/recursion-depth.c67
-rwxr-xr-xtools/testing/selftests/sysctl/sysctl.sh161
-rw-r--r--tools/vm/slabinfo.c7
-rw-r--r--virt/kvm/kvm_main.c3
354 files changed, 10010 insertions, 3858 deletions
diff --git a/Documentation/ABI/testing/sysfs-kernel-slab b/Documentation/ABI/testing/sysfs-kernel-slab
index 29601d93a1c2..d742c6cfdffb 100644
--- a/Documentation/ABI/testing/sysfs-kernel-slab
+++ b/Documentation/ABI/testing/sysfs-kernel-slab
@@ -106,6 +106,15 @@ Description:
are from ZONE_DMA.
Available when CONFIG_ZONE_DMA is enabled.
+What: /sys/kernel/slab/cache/cache_dma32
+Date: December 2018
+KernelVersion: 4.21
+Contact: Nicolas Boichat <drinkcat@chromium.org>
+Description:
+ The cache_dma32 file is read-only and specifies whether objects
+ are from ZONE_DMA32.
+ Available when CONFIG_ZONE_DMA32 is enabled.
+
What: /sys/kernel/slab/cache/cpu_slabs
Date: May 2007
KernelVersion: 2.6.22
diff --git a/Documentation/accounting/psi.txt b/Documentation/accounting/psi.txt
index 7e71c9c1d8e9..5cbe5659e3b7 100644
--- a/Documentation/accounting/psi.txt
+++ b/Documentation/accounting/psi.txt
@@ -63,6 +63,110 @@ as well as medium and long term trends. The total absolute stall time
spikes which wouldn't necessarily make a dent in the time averages,
or to average trends over custom time frames.
+Monitoring for pressure thresholds
+==================================
+
+Users can register triggers and use poll() to be woken up when resource
+pressure exceeds certain thresholds.
+
+A trigger describes the maximum cumulative stall time over a specific
+time window, e.g. 100ms of total stall time within any 500ms window to
+generate a wakeup event.
+
+To register a trigger, the user has to open the psi interface file under
+/proc/pressure/ representing the resource to be monitored and write the
+desired threshold and time window. The open file descriptor should be
+used to wait for trigger events using select(), poll() or epoll().
+The following format is used:
+
+<some|full> <stall amount in us> <time window in us>
+
+For example writing "some 150000 1000000" into /proc/pressure/memory
+would add 150ms threshold for partial memory stall measured within
+1sec time window. Writing "full 50000 1000000" into /proc/pressure/io
+would add 50ms threshold for full io stall measured within 1sec time window.
+
+Triggers can be set on more than one psi metric and more than one trigger
+for the same psi metric can be specified. However for each trigger a separate
+file descriptor is required to be able to poll it separately from others,
+therefore for each trigger a separate open() syscall should be made even
+when opening the same psi interface file.
+
+Monitors activate only when the system enters a stall state for the monitored
+psi metric and deactivate upon exit from the stall state. While the system is
+in the stall state, psi signal growth is monitored at a rate of 10 times per
+tracking window.
+
+The kernel accepts window sizes ranging from 500ms to 10s, therefore min
+monitoring update interval is 50ms and max is 1s. Min limit is set to
+prevent overly frequent polling. Max limit is chosen as a high enough number
+after which monitors are most likely not needed and psi averages can be used
+instead.
+
+When activated, psi monitor stays active for at least the duration of one
+tracking window to avoid repeated activations/deactivations when system is
+bouncing in and out of the stall state.
+
+Notifications to the userspace are rate-limited to one per tracking window.
+
+The trigger will de-register when the file descriptor used to define the
+trigger is closed.
+
+Userspace monitor usage example
+===============================
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <poll.h>
+#include <string.h>
+#include <unistd.h>
+
+/*
+ * Monitor memory partial stall with 1s tracking window size
+ * and 150ms threshold.
+ */
+int main() {
+ const char trig[] = "some 150000 1000000";
+ struct pollfd fds;
+ int n;
+
+ fds.fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
+ if (fds.fd < 0) {
+ printf("/proc/pressure/memory open error: %s\n",
+ strerror(errno));
+ return 1;
+ }
+ fds.events = POLLPRI;
+
+ if (write(fds.fd, trig, strlen(trig) + 1) < 0) {
+ printf("/proc/pressure/memory write error: %s\n",
+ strerror(errno));
+ return 1;
+ }
+
+ printf("waiting for events...\n");
+ while (1) {
+ n = poll(&fds, 1, -1);
+ if (n < 0) {
+ printf("poll error: %s\n", strerror(errno));
+ return 1;
+ }
+ if (fds.revents & POLLERR) {
+ printf("got POLLERR, event source is gone\n");
+ return 0;
+ }
+ if (fds.revents & POLLPRI) {
+ printf("event triggered!\n");
+ } else {
+ printf("unknown event received: 0x%x\n", fds.revents);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
Cgroup2 interface
=================
@@ -71,3 +175,6 @@ mounted, pressure stall information is also tracked for tasks grouped
into cgroups. Each subdirectory in the cgroupfs mountpoint contains
cpu.pressure, memory.pressure, and io.pressure files; the format is
the same as the /proc/pressure/ files.
+
+Per-cgroup psi monitors can be specified and used the same way as
+system-wide ones.
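
As a minimal sketch of the per-cgroup case (assuming cgroup2 is mounted at
/sys/fs/cgroup and a hypothetical cgroup named "workload" already exists), the
trigger format and poll() handling are exactly the same as for the
/proc/pressure/ files:

#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/*
 * Sketch: register a "some" memory trigger on a hypothetical cgroup
 * "workload" and wait for a single pressure event.
 */
int main() {
	const char trig[] = "some 150000 1000000";
	struct pollfd fds;

	fds.fd = open("/sys/fs/cgroup/workload/memory.pressure",
		      O_RDWR | O_NONBLOCK);
	if (fds.fd < 0) {
		printf("open error: %s\n", strerror(errno));
		return 1;
	}
	if (write(fds.fd, trig, strlen(trig) + 1) < 0) {
		printf("trigger write error: %s\n", strerror(errno));
		return 1;
	}
	fds.events = POLLPRI;
	if (poll(&fds, 1, -1) < 0) {
		printf("poll error: %s\n", strerror(errno));
		return 1;
	}
	if (fds.revents & POLLPRI)
		printf("memory pressure event in cgroup\n");
	return 0;
}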
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 88e746074252..c55427269cc8 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -606,8 +606,8 @@ on an IO device and is an example of this type.
Protections
-----------
-A cgroup is protected to be allocated upto the configured amount of
-the resource if the usages of all its ancestors are under their
+A cgroup is protected upto the configured amount of the resource
+as long as the usages of all its ancestors are under their
protected levels. Protections can be hard guarantees or best effort
soft boundaries. Protections can also be over-committed in which case
only upto the amount available to the parent is protected among
@@ -1047,7 +1047,10 @@ PAGE_SIZE multiple when read back.
is within its effective min boundary, the cgroup's memory
won't be reclaimed under any conditions. If there is no
unprotected reclaimable memory available, OOM killer
- is invoked.
+ is invoked. Above the effective min boundary (or
+ effective low boundary if it is higher), pages are reclaimed
+ proportionally to the overage, reducing reclaim pressure for
+ smaller overages.
Effective min boundary is limited by memory.min values of
all ancestor cgroups. If there is memory.min overcommitment
@@ -1069,7 +1072,10 @@ PAGE_SIZE multiple when read back.
Best-effort memory protection. If the memory usage of a
cgroup is within its effective low boundary, the cgroup's
memory won't be reclaimed unless memory can be reclaimed
- from unprotected cgroups.
+ from unprotected cgroups. Above the effective low boundary (or
+ effective min boundary if it is higher), pages are reclaimed
+ proportionally to the overage, reducing reclaim pressure for
+ smaller overages.
Effective low boundary is limited by memory.low values of
all ancestor cgroups. If there is memory.low overcommitment
@@ -2326,8 +2332,10 @@ system performance due to overreclaim, to the point where the feature
becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
-reserve. A cgroup enjoys reclaim protection when it's within its low,
-which makes delegation of subtrees possible.
+reserve. A cgroup enjoys reclaim protection when it's within its
+effective low, which makes delegation of subtrees possible. It also
+enjoys having reclaim pressure proportional to its overage when
+above its effective low.
The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 64e4df0c8488..268b10aa7a3d 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1830,6 +1830,9 @@
ip= [IP_PNP]
See Documentation/filesystems/nfs/nfsroot.txt.
+ ipcmni_extend [KNL] Extend the maximum number of unique System V
+ IPC identifiers from 32,768 to 16,777,216.
+
irqaffinity= [SMP] Set the default irq affinity mask
The argument is a cpu list, as described above.
@@ -3146,6 +3149,16 @@
This will also cause panics on machine check exceptions.
Useful together with panic=30 to trigger a reboot.
+ page_alloc.shuffle=
+ [KNL] Boolean flag to control whether the page allocator
+ should randomize its free lists. The randomization may
+ be automatically enabled if the kernel detects it is
+ running on a platform with a direct-mapped memory-side
+ cache, and this parameter can be used to
+ override/disable that behavior. The state of the flag
+ can be read from sysfs at:
+ /sys/module/page_alloc/parameters/shuffle.
+
page_owner= [KNL] Boot-time page_owner enabling option.
Storage of the information about who allocated
each page is disabled in default. With this switch,
@@ -3171,6 +3184,7 @@
bit 2: print timer info
bit 3: print locks info if CONFIG_LOCKDEP is on
bit 4: print ftrace buffer
+ bit 5: print all printk messages in buffer
panic_on_warn panic() instead of WARN(). Useful to cause kdump
on a WARN().
@@ -4026,7 +4040,9 @@
[[,]s[mp]#### \
[[,]b[ios] | a[cpi] | k[bd] | t[riple] | e[fi] | p[ci]] \
[[,]f[orce]
- Where reboot_mode is one of warm (soft) or cold (hard) or gpio,
+ Where reboot_mode is one of warm (soft) or cold (hard) or gpio
+ (prefix with 'panic_' to set mode for panic
+ reboot only),
reboot_type is one of bios, acpi, kbd, triple, efi, or pci,
reboot_force is either force or not specified,
reboot_cpu is s[mp]#### with #### being the processor
diff --git a/Documentation/core-api/genalloc.rst b/Documentation/core-api/genalloc.rst
index 6b38a39fab24..a534cc7ebd05 100644
--- a/Documentation/core-api/genalloc.rst
+++ b/Documentation/core-api/genalloc.rst
@@ -129,7 +129,7 @@ writing of special-purpose memory allocators in the future.
:functions: gen_pool_for_each_chunk
.. kernel-doc:: lib/genalloc.c
- :functions: addr_in_gen_pool
+ :functions: gen_pool_has_addr
.. kernel-doc:: lib/genalloc.c
:functions: gen_pool_avail
diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst
index 71f5d2fe39b7..a29c99d13331 100644
--- a/Documentation/core-api/kernel-api.rst
+++ b/Documentation/core-api/kernel-api.rst
@@ -147,10 +147,10 @@ Division Functions
.. kernel-doc:: include/linux/math64.h
:internal:
-.. kernel-doc:: lib/div64.c
+.. kernel-doc:: lib/math/div64.c
:functions: div_s64_rem div64_u64_rem div64_u64 div64_s64
-.. kernel-doc:: lib/gcd.c
+.. kernel-doc:: lib/math/gcd.c
:export:
UUID/GUID
diff --git a/Documentation/dev-tools/gcov.rst b/Documentation/dev-tools/gcov.rst
index 69a7d90c320a..46aae52a41d0 100644
--- a/Documentation/dev-tools/gcov.rst
+++ b/Documentation/dev-tools/gcov.rst
@@ -34,10 +34,6 @@ Configure the kernel with::
CONFIG_DEBUG_FS=y
CONFIG_GCOV_KERNEL=y
-select the gcc's gcov format, default is autodetect based on gcc version::
-
- CONFIG_GCOV_FORMAT_AUTODETECT=y
-
and to get coverage data for the entire kernel::
CONFIG_GCOV_PROFILE_ALL=y
@@ -169,6 +165,20 @@ b) gcov is run on the BUILD machine
[user@build] gcov -o /tmp/coverage/tmp/out/init main.c
+Note on compilers
+-----------------
+
+GCC and LLVM gcov tools are not necessarily compatible. Use gcov_ to work with
+GCC-generated .gcno and .gcda files, and use llvm-cov_ for Clang.
+
+.. _gcov: http://gcc.gnu.org/onlinedocs/gcc/Gcov.html
+.. _llvm-cov: https://llvm.org/docs/CommandGuide/llvm-cov.html
+
+Build differences between GCC and Clang gcov are handled by Kconfig. It
+automatically selects the appropriate gcov format depending on the detected
+toolchain.
+
+
Troubleshooting
---------------
diff --git a/Documentation/devicetree/bindings/pps/pps-gpio.txt b/Documentation/devicetree/bindings/pps/pps-gpio.txt
index 3683874832ae..9012a2a02e14 100644
--- a/Documentation/devicetree/bindings/pps/pps-gpio.txt
+++ b/Documentation/devicetree/bindings/pps/pps-gpio.txt
@@ -7,6 +7,10 @@ Required properties:
- compatible: should be "pps-gpio"
- gpios: one PPS GPIO in the format described by ../gpio/gpio.txt
+Additional required properties for the PPS ECHO functionality:
+- echo-gpios: one PPS ECHO GPIO in the format described by ../gpio/gpio.txt
+- echo-active-ms: duration in ms of the active portion of the echo pulse
+
Optional properties:
- assert-falling-edge: when present, assert is indicated by a falling edge
(instead of by a rising edge)
@@ -19,5 +23,8 @@ Example:
gpios = <&gpio1 26 GPIO_ACTIVE_HIGH>;
assert-falling-edge;
+ echo-gpios = <&gpio1 27 GPIO_ACTIVE_HIGH>;
+ echo-active-ms = <100>;
+
compatible = "pps-gpio";
};
diff --git a/Documentation/filesystems/autofs-mount-control.txt b/Documentation/filesystems/autofs-mount-control.txt
index 45edad6933cc..acc02fc57993 100644
--- a/Documentation/filesystems/autofs-mount-control.txt
+++ b/Documentation/filesystems/autofs-mount-control.txt
@@ -354,8 +354,10 @@ this ioctl is called until no further expire candidates are found.
The call requires an initialized struct autofs_dev_ioctl with the
ioctlfd field set to the descriptor obtained from the open call. In
-addition an immediate expire, independent of the mount timeout, can be
-requested by setting the how field of struct args_expire to 1. If no
+addition an immediate expire that's independent of the mount timeout,
+and a forced expire that's independent of whether the mount is busy,
+can be requested by setting the how field of struct args_expire to
+AUTOFS_EXP_IMMEDIATE or AUTOFS_EXP_FORCED, respectively. If no
expire candidates can be found the ioctl returns -1 with errno set to
EAGAIN.
diff --git a/Documentation/filesystems/autofs.txt b/Documentation/filesystems/autofs.txt
index 373ad25852d3..3af38c7fd26d 100644
--- a/Documentation/filesystems/autofs.txt
+++ b/Documentation/filesystems/autofs.txt
@@ -116,7 +116,7 @@ that purpose there is another flag.
**DCACHE_MANAGE_TRANSIT**
If a dentry has DCACHE_MANAGE_TRANSIT set then two very different but
-related behaviors are invoked, both using the `d_op->d_manage()`
+related behaviours are invoked, both using the `d_op->d_manage()`
dentry operation.
Firstly, before checking to see if any filesystem is mounted on the
@@ -193,8 +193,8 @@ VFS remain in RCU-walk mode, but can only tell it to get out of
RCU-walk mode by returning `-ECHILD`.
So `d_manage()`, when called with `rcu_walk` set, should either return
--ECHILD if there is any reason to believe it is unsafe to end the
-mounted filesystem, and otherwise should return 0.
+-ECHILD if there is any reason to believe it is unsafe to enter the
+mounted filesystem, otherwise it should return 0.
autofs will return `-ECHILD` if an expiry of the filesystem has been
initiated or is being considered, otherwise it returns 0.
@@ -210,7 +210,7 @@ mounts that were created by `d_automount()` returning a filesystem to be
mounted. As autofs doesn't return such a filesystem but leaves the
mounting to the automount daemon, it must involve the automount daemon
in unmounting as well. This also means that autofs has more control
-of expiry.
+over expiry.
The VFS also supports "expiry" of mounts using the MNT_EXPIRE flag to
the `umount` system call. Unmounting with MNT_EXPIRE will fail unless
@@ -225,7 +225,7 @@ unmount any filesystems mounted on the autofs filesystem or remove any
symbolic links or empty directories any time it likes. If the unmount
or removal is successful the filesystem will be returned to the state
it was before the mount or creation, so that any access of the name
-will trigger normal auto-mount processing. In particlar, `rmdir` and
+will trigger normal auto-mount processing. In particular, `rmdir` and
`unlink` do not leave negative entries in the dcache as a normal
filesystem would, so an attempt to access a recently-removed object is
passed to autofs for handling.
@@ -240,11 +240,18 @@ Normally the daemon only wants to remove entries which haven't been
used for a while. For this purpose autofs maintains a "`last_used`"
time stamp on each directory or symlink. For symlinks it genuinely
does record the last time the symlink was "used" or followed to find
-out where it points to. For directories the field is a slight
-misnomer. It actually records the last time that autofs checked if
-the directory or one of its descendents was busy and found that it
-was. This is just as useful and doesn't require updating the field so
-often.
+out where it points to. For directories the field is used slightly
+differently. The field is updated at mount time and during expire
+checks if it is found to be in use (ie. open file descriptor or
+process working directory) and during path walks. The update done
+during path walks prevents frequent expire and immediate mount of
+frequently accessed automounts. But in the case where a GUI continually
+accesses or an application frequently scans an autofs directory tree,
+there can be an accumulation of mounts that aren't actually being
+used. To cater for this case the "`strictexpire`" autofs mount option
+can be used to avoid the "`last_used`" update on path walk thereby
+preventing this apparent inability to expire mounts that aren't
+really in use.
The daemon is able to ask autofs if anything is due to be expired,
using an `ioctl` as discussed later. For a *direct* mount, autofs
@@ -255,8 +262,12 @@ up.
There is an option with indirect mounts to consider each of the leaves
that has been mounted on instead of considering the top-level names.
-This is intended for compatability with version 4 of autofs and should
-be considered as deprecated.
+This was originally intended for compatibility with version 4 of autofs
+and should be considered as deprecated for Sun Format automount maps.
+However, it may be used again for amd format mount maps (which are
+generally indirect maps) because the amd automounter allows for the
+setting of an expire timeout for individual mounts. But there are
+some difficulties in making the needed changes for this.
When autofs considers a directory it checks the `last_used` time and
compares it with the "timeout" value set when the filesystem was
@@ -273,7 +284,7 @@ mounts. If it finds something in the root directory to expire it will
return the name of that thing. Once a name has been returned the
automount daemon needs to unmount any filesystems mounted below the
name normally. As described above, this is unsafe for non-toplevel
-mounts in a version-5 autofs. For this reason the current `automountd`
+mounts in a version-5 autofs. For this reason the current `automount(8)`
does not use this ioctl.
The second mechanism uses either the **AUTOFS_DEV_IOCTL_EXPIRE_CMD** or
@@ -345,7 +356,7 @@ The `wait_queue_token` is a unique number which can identify a
particular request to be acknowledged. When a message is sent over
the pipe the affected dentry is marked as either "active" or
"expiring" and other accesses to it block until the message is
-acknowledged using one of the ioctls below and the relevant
+acknowledged using one of the ioctls below with the relevant
`wait_queue_token`.
Communicating with autofs: root directory ioctls
@@ -367,15 +378,14 @@ The available ioctl commands are:
This mode is also entered if a write to the pipe fails.
- **AUTOFS_IOC_PROTOVER**: This returns the protocol version in use.
- **AUTOFS_IOC_PROTOSUBVER**: Returns the protocol sub-version which
- is really a version number for the implementation. It is
- currently 2.
+ is really a version number for the implementation.
- **AUTOFS_IOC_SETTIMEOUT**: This passes a pointer to an unsigned
long. The value is used to set the timeout for expiry, and
the current timeout value is stored back through the pointer.
- **AUTOFS_IOC_ASKUMOUNT**: Returns, in the pointed-to `int`, 1 if
the filesystem could be unmounted. This is only a hint as
the situation could change at any instant. This call can be
- use to avoid a more expensive full unmount attempt.
+ used to avoid a more expensive full unmount attempt.
- **AUTOFS_IOC_EXPIRE**: as described above, this asks if there is
anything suitable to expire. A pointer to a packet:
@@ -400,6 +410,11 @@ The available ioctl commands are:
**AUTOFS_EXP_IMMEDIATE** causes `last_used` time to be ignored
and objects are expired if they are not in use.
+ **AUTOFS_EXP_FORCED** causes the in use status to be ignored
+ and objects are expired even if they are in use. This assumes
+ that the daemon has requested this because it is capable of
+ performing the umount.
+
**AUTOFS_EXP_LEAVES** will select a leaf rather than a top-level
name to expire. This is only safe when *maxproto* is 4.
@@ -415,7 +430,7 @@ which can be used to communicate directly with the autofs filesystem.
It requires CAP_SYS_ADMIN for access.
The `ioctl`s that can be used on this device are described in a separate
-document `autofs-mount-control.txt`, and are summarized briefly here.
+document `autofs-mount-control.txt`, and are summarised briefly here.
Each ioctl is passed a pointer to an `autofs_dev_ioctl` structure:
struct autofs_dev_ioctl {
@@ -511,6 +526,21 @@ directories.
Catatonic mode can only be left via the
**AUTOFS_DEV_IOCTL_OPENMOUNT_CMD** ioctl on the `/dev/autofs`.
+The "ignore" mount option
+-------------------------
+
+The "ignore" mount option can be used to provide a generic indicator
+to applications that the mount entry should be ignored when displaying
+mount information.
+
+In other OSes that provide autofs and that provide a mount list to user
+space based on the kernel mount list, a no-op mount option ("ignore" is
+the one used on the most common OSes) is allowed so that autofs file
+system users can optionally use it.
+
+This is intended to be used by user space programs to exclude autofs
+mounts from consideration when reading the mounts list.
+
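
As a rough illustration only (not how any particular tool actually implements
it), a program reading /proc/self/mountinfo could skip autofs entries whose
super options carry "ignore" along these lines:

#include <stdio.h>
#include <string.h>

/* Sketch: print mount entries, skipping autofs mounts marked "ignore". */
int main(void)
{
	char line[4096];
	FILE *f = fopen("/proc/self/mountinfo", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		/* Fields after " - " are: fstype, source, super options. */
		char *tail = strstr(line, " - ");

		if (tail && strstr(tail, "autofs") && strstr(tail, "ignore"))
			continue;	/* entry asked to be ignored */
		fputs(line, stdout);
	}
	fclose(f);
	return 0;
}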
autofs, name spaces, and shared mounts
--------------------------------------
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 3f13d8599337..749322060f10 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -61,6 +61,7 @@ Currently, these files are in /proc/sys/vm:
- stat_refresh
- numa_stat
- swappiness
+- unprivileged_userfaultfd
- user_reserve_kbytes
- vfs_cache_pressure
- watermark_boost_factor
@@ -818,6 +819,17 @@ The default value is 60.
==============================================================
+unprivileged_userfaultfd
+
+This flag controls whether unprivileged users can use the userfaultfd
+system calls. Set this to 1 to allow unprivileged users to use the
+userfaultfd system calls, or set this to 0 to restrict userfaultfd to only
+privileged users (with CAP_SYS_PTRACE capability).
+
+The default value is 1.
+
+==============================================================
+
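
A small userspace sketch of what the restriction looks like in practice; the
assumption here is that an unprivileged caller sees the syscall fail (EPERM)
when the sysctl is set to 0:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	/* Try to create a userfaultfd as the current (possibly unprivileged) user. */
	int fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

	if (fd < 0) {
		/* With vm.unprivileged_userfaultfd=0 this is the expected outcome. */
		printf("userfaultfd failed: %s\n", strerror(errno));
		return 1;
	}
	printf("userfaultfd available, fd=%d\n", fd);
	close(fd);
	return 0;
}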
- user_reserve_kbytes
When overcommit_memory is set to 2, "never overcommit" mode, reserve
diff --git a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
index 66bfd8396877..995da15b16ca 100644
--- a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
+++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
@@ -113,7 +113,7 @@ my $regex_kswapd_wake_default = 'nid=([0-9]*) order=([0-9]*)';
my $regex_kswapd_sleep_default = 'nid=([0-9]*)';
my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*) gfp_flags=([A-Z_|]*)';
my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) classzone_idx=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_skipped=([0-9]*) nr_taken=([0-9]*) lru=([a-z_]*)';
-my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) nr_dirty=([0-9]*) nr_writeback=([0-9]*) nr_congested=([0-9]*) nr_immediate=([0-9]*) nr_activate=([0-9]*) nr_ref_keep=([0-9]*) nr_unmap_fail=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)';
+my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) nr_dirty=([0-9]*) nr_writeback=([0-9]*) nr_congested=([0-9]*) nr_immediate=([0-9]*) nr_activate_anon=([0-9]*) nr_activate_file=([0-9]*) nr_ref_keep=([0-9]*) nr_unmap_fail=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)';
my $regex_lru_shrink_active_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_rotated=([0-9]*) priority=([0-9]*)';
my $regex_writepage_default = 'page=([0-9a-f]*) pfn=([0-9]*) flags=([A-Z_|]*)';
@@ -212,7 +212,8 @@ $regex_lru_shrink_inactive = generate_traceevent_regex(
"vmscan/mm_vmscan_lru_shrink_inactive",
$regex_lru_shrink_inactive_default,
"nid", "nr_scanned", "nr_reclaimed", "nr_dirty", "nr_writeback",
- "nr_congested", "nr_immediate", "nr_activate", "nr_ref_keep",
+ "nr_congested", "nr_immediate", "nr_activate_anon",
+ "nr_activate_file", "nr_ref_keep",
"nr_unmap_fail", "priority", "flags");
$regex_lru_shrink_active = generate_traceevent_regex(
"vmscan/mm_vmscan_lru_shrink_active",
@@ -407,7 +408,7 @@ EVENT_PROCESS:
}
my $nr_reclaimed = $3;
- my $flags = $12;
+ my $flags = $13;
my $file = 0;
if ($flags =~ /RECLAIM_WB_FILE/) {
$file = 1;
diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
index 44205f0b671f..ec1efa32af3c 100644
--- a/Documentation/vm/hmm.rst
+++ b/Documentation/vm/hmm.rst
@@ -189,20 +189,10 @@ the driver callback returns.
When the device driver wants to populate a range of virtual addresses, it can
use either::
- int hmm_vma_get_pfns(struct vm_area_struct *vma,
- struct hmm_range *range,
- unsigned long start,
- unsigned long end,
- hmm_pfn_t *pfns);
- int hmm_vma_fault(struct vm_area_struct *vma,
- struct hmm_range *range,
- unsigned long start,
- unsigned long end,
- hmm_pfn_t *pfns,
- bool write,
- bool block);
-
-The first one (hmm_vma_get_pfns()) will only fetch present CPU page table
+ long hmm_range_snapshot(struct hmm_range *range);
+ long hmm_range_fault(struct hmm_range *range, bool block);
+
+The first one (hmm_range_snapshot()) will only fetch present CPU page table
entries and will not trigger a page fault on missing or non-present entries.
The second one does trigger a page fault on missing or read-only entry if the
write parameter is true. Page faults use the generic mm page fault code path
@@ -220,25 +210,56 @@ respect in order to keep things properly synchronized. The usage pattern is::
{
struct hmm_range range;
...
+
+ range.start = ...;
+ range.end = ...;
+ range.pfns = ...;
+ range.flags = ...;
+ range.values = ...;
+ range.pfn_shift = ...;
+ hmm_range_register(&range);
+
+ /*
+ * Just wait for range to be valid, safe to ignore return value as we
+ * will use the return value of hmm_range_snapshot() below under the
+ * mmap_sem to ascertain the validity of the range.
+ */
+ hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC);
+
again:
- ret = hmm_vma_get_pfns(vma, &range, start, end, pfns);
- if (ret)
+ down_read(&mm->mmap_sem);
+ ret = hmm_range_snapshot(&range);
+ if (ret) {
+ up_read(&mm->mmap_sem);
+ if (ret == -EAGAIN) {
+ /*
+ * No need to check hmm_range_wait_until_valid() return value
+ * on retry we will get proper error with hmm_range_snapshot()
+ */
+ hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC);
+ goto again;
+ }
+ hmm_mirror_unregister(&range);
return ret;
+ }
take_lock(driver->update);
- if (!hmm_vma_range_done(vma, &range)) {
+ if (!range.valid) {
release_lock(driver->update);
+ up_read(&mm->mmap_sem);
goto again;
}
// Use pfns array content to update device page table
+ hmm_mirror_unregister(&range);
release_lock(driver->update);
+ up_read(&mm->mmap_sem);
return 0;
}
The driver->update lock is the same lock that the driver takes inside its
-update() callback. That lock must be held before hmm_vma_range_done() to avoid
-any race with a concurrent CPU page table update.
+update() callback. That lock must be held before checking the range.valid
+field to avoid any race with a concurrent CPU page table update.
HMM implements all this on top of the mmu_notifier API because we wanted a
simpler API and also to be able to perform optimizations latter on like doing
@@ -255,6 +276,41 @@ report commands as executed is serialized (there is no point in doing this
concurrently).
+Leverage default_flags and pfn_flags_mask
+=========================================
+
+The hmm_range struct has 2 fields, default_flags and pfn_flags_mask, that allow
+setting a fault or snapshot policy for a whole range instead of having to set
+them for each entry in the range.
+
+For instance, if the device flags for device entries are:
+ VALID (1 << 63)
+ WRITE (1 << 62)
+
+Now let's say that the device driver wants to fault in a range with at least
+read permission; it then sets:
+ range->default_flags = (1 << 63)
+ range->pfn_flags_mask = 0;
+
+and calls hmm_range_fault() as described above. This will fault in all pages
+in the range with at least read permission.
+
+Now let's say the driver wants to do the same except for one page in the range,
+for which it wants write permission. The driver then sets:
+ range->default_flags = (1 << 63);
+ range->pfn_flags_mask = (1 << 62);
+ range->pfns[index_of_write] = (1 << 62);
+
+With this, HMM will fault in all pages with at least read (ie valid), and for
+the address == range->start + (index_of_write << PAGE_SHIFT) it will fault with
+write permission, ie if the CPU pte does not have write permission set then HMM
+will call handle_mm_fault().
+
+Note that HMM will populate the pfns array with write permission for any entry
+that has write permission within the CPU pte, no matter what values are set
+in default_flags or pfn_flags_mask.
+
+
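
Condensed into a sketch (the registration, hmm_range_wait_until_valid() and
mmap_sem steps from the usage pattern above are omitted, and the VALID/WRITE
bit names are the hypothetical ones used in this example):

#define EXAMPLE_PFN_VALID (1UL << 63)	/* hypothetical device "valid" bit */
#define EXAMPLE_PFN_WRITE (1UL << 62)	/* hypothetical device "write" bit */

/*
 * Sketch: fault the whole range with at least read permission, and
 * additionally request write permission for a single page.
 */
static long fault_range_one_writable(struct hmm_range *range,
				     unsigned long index_of_write)
{
	range->default_flags = EXAMPLE_PFN_VALID;	/* read everywhere */
	range->pfn_flags_mask = EXAMPLE_PFN_WRITE;	/* honour per-entry WRITE */
	range->pfns[index_of_write] = EXAMPLE_PFN_WRITE;

	return hmm_range_fault(range, true);		/* true: may block */
}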
Represent and manage device memory from core kernel point of view
=================================================================
diff --git a/arch/Kconfig b/arch/Kconfig
index 5e43fcbad4ca..f11f0698b148 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -245,6 +245,13 @@ config ARCH_HAS_FORTIFY_SOURCE
An architecture should select this when it can successfully
build and run with CONFIG_FORTIFY_SOURCE.
+#
+# Select if the arch provides a historic keepinit alias for the retain_initrd
+# command line option
+#
+config ARCH_HAS_KEEPINITRD
+ bool
+
# Select if arch has all set_memory_ro/rw/x/nx() functions in asm/cacheflush.h
config ARCH_HAS_SET_MEMORY
bool
diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c
index a42fc5c4db89..e2cbec3789e8 100644
--- a/arch/alpha/mm/init.c
+++ b/arch/alpha/mm/init.c
@@ -285,17 +285,3 @@ mem_init(void)
memblock_free_all();
mem_init_print_info(NULL);
}
-
-void
-free_initmem(void)
-{
- free_initmem_default(-1);
-}
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void
-free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c
index e1ab2d7f1d64..02b7a3b20d7c 100644
--- a/arch/arc/mm/init.c
+++ b/arch/arc/mm/init.c
@@ -206,18 +206,3 @@ void __init mem_init(void)
memblock_free_all();
mem_init_print_info(NULL);
}
-
-/*
- * free_initmem: Free all the __init memory.
- */
-void __ref free_initmem(void)
-{
- free_initmem_default(-1);
-}
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void __init free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 901bf1b79138..8869742a85df 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -4,11 +4,11 @@ config ARM
default y
select ARCH_32BIT_OFF_T
select ARCH_CLOCKSOURCE_DATA
- select ARCH_DISCARD_MEMBLOCK if !HAVE_ARCH_PFN_VALID && !KEXEC
select ARCH_HAS_DEBUG_VIRTUAL if MMU
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FORTIFY_SOURCE
+ select ARCH_HAS_KEEPINITRD
select ARCH_HAS_KCOV
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_PTE_SPECIAL if ARM_LPAE
@@ -21,6 +21,7 @@ config ARM
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAVE_CUSTOM_GPIO_H
select ARCH_HAS_GCOV_PROFILE_ALL
+ select ARCH_KEEP_MEMBLOCK if HAVE_ARCH_PFN_VALID || KEXEC
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_NO_SG_CHAIN if !ARM_HAS_SG_CHAIN
select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
diff --git a/arch/arm/include/asm/hardirq.h b/arch/arm/include/asm/hardirq.h
index cba23eaa6072..7a88f160b1fb 100644
--- a/arch/arm/include/asm/hardirq.h
+++ b/arch/arm/include/asm/hardirq.h
@@ -6,6 +6,7 @@
#include <linux/threads.h>
#include <asm/irq.h>
+/* number of IPIS _not_ including IPI_CPU_BACKTRACE */
#define NR_IPI 7
typedef struct {
diff --git a/arch/arm/kernel/atags.h b/arch/arm/kernel/atags.h
index 201100226301..067e12edc341 100644
--- a/arch/arm/kernel/atags.h
+++ b/arch/arm/kernel/atags.h
@@ -5,7 +5,7 @@ void convert_to_tag_list(struct tag *tags);
const struct machine_desc *setup_machine_tags(phys_addr_t __atags_pointer,
unsigned int machine_nr);
#else
-static inline const struct machine_desc *
+static inline const struct machine_desc * __init __noreturn
setup_machine_tags(phys_addr_t __atags_pointer, unsigned int machine_nr)
{
early_print("no ATAGS support: can't continue\n");
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index c6d37563610a..ebc53804d57b 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -70,6 +70,10 @@ enum ipi_msg_type {
IPI_CPU_STOP,
IPI_IRQ_WORK,
IPI_COMPLETION,
+ /*
+ * CPU_BACKTRACE is special and not included in NR_IPI
+ * or tracable with trace_ipi_*
+ */
IPI_CPU_BACKTRACE,
/*
* SGI8-15 can be reserved by secure firmware, and thus may
@@ -803,7 +807,7 @@ core_initcall(register_cpufreq_notifier);
static void raise_nmi(cpumask_t *mask)
{
- smp_cross_call(mask, IPI_CPU_BACKTRACE);
+ __smp_cross_call(mask, IPI_CPU_BACKTRACE);
}
void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self)
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 43f46aa7ef33..12e0812e988c 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -565,7 +565,7 @@ static void *__alloc_from_pool(size_t size, struct page **ret_page)
static bool __in_atomic_pool(void *start, size_t size)
{
- return addr_in_gen_pool(atomic_pool, (unsigned long)start, size);
+ return gen_pool_has_addr(atomic_pool, (unsigned long)start, size);
}
static int __free_from_pool(void *start, size_t size)
@@ -1577,31 +1577,21 @@ static int __arm_iommu_mmap_attrs(struct device *dev, struct vm_area_struct *vma
void *cpu_addr, dma_addr_t dma_addr, size_t size,
unsigned long attrs)
{
- unsigned long uaddr = vma->vm_start;
- unsigned long usize = vma->vm_end - vma->vm_start;
struct page **pages = __iommu_get_pages(cpu_addr, attrs);
unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
- unsigned long off = vma->vm_pgoff;
+ int err;
if (!pages)
return -ENXIO;
- if (off >= nr_pages || (usize >> PAGE_SHIFT) > nr_pages - off)
+ if (vma->vm_pgoff >= nr_pages)
return -ENXIO;
- pages += off;
-
- do {
- int ret = vm_insert_page(vma, uaddr, *pages++);
- if (ret) {
- pr_err("Remapping memory failed: %d\n", ret);
- return ret;
- }
- uaddr += PAGE_SIZE;
- usize -= PAGE_SIZE;
- } while (usize > 0);
+ err = vm_map_pages(vma, pages, nr_pages);
+ if (err)
+ pr_err("Remapping memory failed: %d\n", err);
- return 0;
+ return err;
}
static int arm_iommu_mmap_attrs(struct device *dev,
struct vm_area_struct *vma, void *cpu_addr,
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index b2fbb4a711fc..be0b42937888 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -680,27 +680,14 @@ void free_initmem(void)
}
#ifdef CONFIG_BLK_DEV_INITRD
-
-static int keep_initrd;
-
void free_initrd_mem(unsigned long start, unsigned long end)
{
- if (!keep_initrd) {
- if (start == initrd_start)
- start = round_down(start, PAGE_SIZE);
- if (end == initrd_end)
- end = round_up(end, PAGE_SIZE);
-
- poison_init_mem((void *)start, PAGE_ALIGN(end) - start);
- free_reserved_area((void *)start, (void *)end, -1, "initrd");
- }
-}
+ if (start == initrd_start)
+ start = round_down(start, PAGE_SIZE);
+ if (end == initrd_end)
+ end = round_up(end, PAGE_SIZE);
-static int __init keepinitrd_setup(char *__unused)
-{
- keep_initrd = 1;
- return 1;
+ poison_init_mem((void *)start, PAGE_ALIGN(end) - start);
+ free_reserved_area((void *)start, (void *)end, -1, "initrd");
}
-
-__setup("keepinitrd", keepinitrd_setup);
#endif
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 9504cf52a7a3..42551c89a5a2 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -19,8 +19,9 @@ config ARM64
select ARCH_HAS_FAST_MULTIPLIER
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
- select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA
+ select ARCH_HAS_GIGANTIC_PAGE
select ARCH_HAS_KCOV
+ select ARCH_HAS_KEEPINITRD
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_SETUP_DMA_OPS
@@ -59,6 +60,7 @@ config ARM64
select ARCH_INLINE_SPIN_UNLOCK_BH if !PREEMPT
select ARCH_INLINE_SPIN_UNLOCK_IRQ if !PREEMPT
select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPT
+ select ARCH_KEEP_MEMBLOCK
select ARCH_USE_CMPXCHG_LOCKREF
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
@@ -84,6 +86,7 @@ config ARM64
select CRC32
select DCACHE_WORD_ACCESS
select DMA_DIRECT_REMAP
+ select DYNAMIC_DEBUG_RELATIVE_POINTERS
select EDAC_SUPPORT
select FRAME_POINTER
select GENERIC_ALLOCATOR
diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild
index cc3f9d864224..50695e4d34c4 100644
--- a/arch/arm64/include/asm/Kbuild
+++ b/arch/arm64/include/asm/Kbuild
@@ -3,6 +3,7 @@ generic-y += delay.h
generic-y += div64.h
generic-y += dma.h
generic-y += dma-contiguous.h
+generic-y += dynamic_debug.h
generic-y += early_ioremap.h
generic-y += emergency-restart.h
generic-y += hw_irq.h
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index f210bcf096f7..bc895c869892 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -401,7 +401,7 @@ unsigned long cpu_get_elf_hwcap2(void);
#define cpu_have_named_feature(name) cpu_have_feature(cpu_feature(name))
/* System capability check for constant caps */
-static inline bool __cpus_have_const_cap(int num)
+static __always_inline bool __cpus_have_const_cap(int num)
{
if (num >= ARM64_NCAPS)
return false;
@@ -415,7 +415,7 @@ static inline bool cpus_have_cap(unsigned int num)
return test_bit(num, cpu_hwcaps);
}
-static inline bool cpus_have_const_cap(int num)
+static __always_inline bool cpus_have_const_cap(int num)
{
if (static_branch_likely(&arm64_const_caps_ready))
return __cpus_have_const_cap(num);
diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index c6a07a3b433e..4aad6382f631 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -70,8 +70,4 @@ extern void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
#include <asm-generic/hugetlb.h>
-#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
-static inline bool gigantic_page_supported(void) { return true; }
-#endif
-
#endif /* __ASM_HUGETLB_H */
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 40e2d7e5efcb..007c05a4cce0 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -578,24 +578,11 @@ void free_initmem(void)
}
#ifdef CONFIG_BLK_DEV_INITRD
-
-static int keep_initrd __initdata;
-
void __init free_initrd_mem(unsigned long start, unsigned long end)
{
- if (!keep_initrd) {
- free_reserved_area((void *)start, (void *)end, 0, "initrd");
- memblock_free(__virt_to_phys(start), end - start);
- }
-}
-
-static int __init keepinitrd_setup(char *__unused)
-{
- keep_initrd = 1;
- return 1;
+ free_reserved_area((void *)start, (void *)end, 0, "initrd");
+ memblock_free(__virt_to_phys(start), end - start);
}
-
-__setup("keepinitrd", keepinitrd_setup);
#endif
/*
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index ef82312860ac..ef32d4839c3f 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1065,8 +1065,8 @@ int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
}
#ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
- bool want_memblock)
+int arch_add_memory(int nid, u64 start, u64 size,
+ struct mhp_restrictions *restrictions)
{
int flags = 0;
@@ -1077,6 +1077,6 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
size, PAGE_KERNEL, __pgd_pgtable_alloc, flags);
return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
- altmap, want_memblock);
+ restrictions);
}
#endif
diff --git a/arch/c6x/mm/init.c b/arch/c6x/mm/init.c
index fe582c3a1794..573242b160e1 100644
--- a/arch/c6x/mm/init.c
+++ b/arch/c6x/mm/init.c
@@ -68,15 +68,3 @@ void __init mem_init(void)
mem_init_print_info(NULL);
}
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void __init free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
-
-void __init free_initmem(void)
-{
- free_initmem_default(-1);
-}
diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c
index 0f04a5e9aa4f..1eab16b1a0bc 100644
--- a/arch/h8300/mm/init.c
+++ b/arch/h8300/mm/init.c
@@ -102,17 +102,3 @@ void __init mem_init(void)
mem_init_print_info(NULL);
}
-
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
-
-void
-free_initmem(void)
-{
- free_initmem_default(-1);
-}
diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig
index 3e54a53208d5..b7d404bbaa0f 100644
--- a/arch/hexagon/Kconfig
+++ b/arch/hexagon/Kconfig
@@ -22,7 +22,6 @@ config HEXAGON
select GENERIC_IRQ_SHOW
select HAVE_ARCH_KGDB
select HAVE_ARCH_TRACEHOOK
- select ARCH_DISCARD_MEMBLOCK
select NEED_SG_DMA_LENGTH
select NO_IOPORT_MAP
select GENERIC_IOMAP
diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c
index 1719ede9e9bd..41cf34243ea1 100644
--- a/arch/hexagon/mm/init.c
+++ b/arch/hexagon/mm/init.c
@@ -85,16 +85,6 @@ void __init mem_init(void)
}
/*
- * free_initmem - frees memory used by stuff declared with __init
- *
- * Todo: free pages between __init_begin and __init_end; possibly
- * some devtree related stuff as well.
- */
-void __ref free_initmem(void)
-{
-}
-
-/*
* free_initrd_mem - frees... initrd memory.
* @start - start of init memory
* @end - end of init memory
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 73a26f04644e..7468d8e50467 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -33,7 +33,6 @@ config IA64
select ARCH_HAS_DMA_COHERENT_TO_PFN if SWIOTLB
select ARCH_HAS_SYNC_DMA_FOR_CPU if SWIOTLB
select VIRT_TO_BUS
- select ARCH_DISCARD_MEMBLOCK
select GENERIC_IRQ_PROBE
select GENERIC_PENDING_IRQ if SMP
select GENERIC_IRQ_SHOW
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index e49200e31750..d28e29103bdb 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -666,14 +666,14 @@ mem_init (void)
}
#ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
- bool want_memblock)
+int arch_add_memory(int nid, u64 start, u64 size,
+ struct mhp_restrictions *restrictions)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
- ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+ ret = __add_pages(nid, start_pfn, nr_pages, restrictions);
if (ret)
printk("%s: Problem encountered in __add_pages() as ret=%d\n",
__func__, ret);
@@ -682,20 +682,15 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
}
#ifdef CONFIG_MEMORY_HOTREMOVE
-int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap)
+void arch_remove_memory(int nid, u64 start, u64 size,
+ struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
struct zone *zone;
- int ret;
zone = page_zone(pfn_to_page(start_pfn));
- ret = __remove_pages(zone, start_pfn, nr_pages, altmap);
- if (ret)
- pr_warn("%s: Problem encountered in __remove_pages() as"
- " ret=%d\n", __func__, ret);
-
- return ret;
+ __remove_pages(zone, start_pfn, nr_pages, altmap);
}
#endif
#endif
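
For reference, a minimal caller sketch of the reworked hotplug entry point used in the hunks above. It assumes the struct mhp_restrictions introduced by this series bundles the old altmap pointer together with a flags word, with MHP_MEMBLOCK_API standing in for the former want_memblock=true; the wrapper name and flag spelling are assumptions, not part of the patch.

#include <linux/memory_hotplug.h>

/* Illustrative only: request memblock-backed hotplug of [start, start + size). */
static int example_hotplug_add(int nid, u64 start, u64 size,
			       struct vmem_altmap *altmap)
{
	struct mhp_restrictions restrictions = {
		.flags	= MHP_MEMBLOCK_API,	/* assumed replacement for want_memblock */
		.altmap	= altmap,
	};

	return arch_add_memory(nid, start, size, &restrictions);
}

The matching arch_remove_memory() now returns void, so callers in this series stop checking a return value; failures are reported via WARN/BUG inside the arch code instead.
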
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index fe5cc2da6d10..f8020d261e08 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -22,12 +22,11 @@ config M68K
select ARCH_WANT_IPC_PARSE_VERSION
select HAVE_FUTEX_CMPXCHG if MMU && FUTEX
select HAVE_MOD_ARCH_SPECIFIC
+ select MMU_GATHER_NO_RANGE if MMU
select MODULES_USE_ELF_REL
select MODULES_USE_ELF_RELA
select OLD_SIGSUSPEND3
select OLD_SIGACTION
- select ARCH_DISCARD_MEMBLOCK
- select MMU_GATHER_NO_RANGE if MMU
config CPU_BIG_ENDIAN
def_bool y
diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c
index 8868a4c9adae..778cacb7d57b 100644
--- a/arch/m68k/mm/init.c
+++ b/arch/m68k/mm/init.c
@@ -147,10 +147,3 @@ void __init mem_init(void)
init_pointer_tables();
mem_init_print_info(NULL);
}
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c
index 7e97d44f6538..a015a951c8b7 100644
--- a/arch/microblaze/mm/init.c
+++ b/arch/microblaze/mm/init.c
@@ -186,18 +186,6 @@ void __init setup_memory(void)
paging_init();
}
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
-
-void free_initmem(void)
-{
- free_initmem_default(-1);
-}
-
void __init mem_init(void)
{
high_memory = (void *)__va(memory_start + lowmem_size - 1);
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 25bd9cfea5d7..70d3200476bf 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -5,7 +5,6 @@ config MIPS
select ARCH_32BIT_OFF_T if !64BIT
select ARCH_BINFMT_ELF_STATE if MIPS_FP_SUPPORT
select ARCH_CLOCKSOURCE_DATA
- select ARCH_DISCARD_MEMBLOCK
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAS_UBSAN_SANITIZE_ALL
diff --git a/arch/mips/include/asm/bitops.h b/arch/mips/include/asm/bitops.h
index 830c93a010c3..9a466dde9b96 100644
--- a/arch/mips/include/asm/bitops.h
+++ b/arch/mips/include/asm/bitops.h
@@ -482,7 +482,7 @@ static inline void __clear_bit_unlock(unsigned long nr, volatile unsigned long *
* Return the bit position (0..63) of the most significant 1 bit in a word
* Returns -1 if no 1 bit exists
*/
-static inline unsigned long __fls(unsigned long word)
+static __always_inline unsigned long __fls(unsigned long word)
{
int num;
@@ -548,7 +548,7 @@ static inline unsigned long __fls(unsigned long word)
* Returns 0..SZLONG-1
* Undefined if no bit exists, so code should check against 0 first.
*/
-static inline unsigned long __ffs(unsigned long word)
+static __always_inline unsigned long __ffs(unsigned long word)
{
return __fls(word & -word);
}
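
A small usage sketch of the semantics documented in the comments above: __fls() returns the index of the most significant set bit, __ffs() the least significant one, and both are undefined for a zero word, so callers check for zero first. The helper below is illustrative only.

#include <linux/bitops.h>

/* Illustrative only: distance between the most and least significant set bits. */
static unsigned long example_bit_span(unsigned long word)
{
	if (!word)
		return 0;

	return __fls(word) - __ffs(word);
}
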
diff --git a/arch/mips/kernel/cpu-bugs64.c b/arch/mips/kernel/cpu-bugs64.c
index bada74af7641..c04b97aace4a 100644
--- a/arch/mips/kernel/cpu-bugs64.c
+++ b/arch/mips/kernel/cpu-bugs64.c
@@ -42,8 +42,8 @@ static inline void align_mod(const int align, const int mod)
: "n"(align), "n"(mod));
}
-static inline void mult_sh_align_mod(long *v1, long *v2, long *w,
- const int align, const int mod)
+static __always_inline void mult_sh_align_mod(long *v1, long *v2, long *w,
+ const int align, const int mod)
{
unsigned long flags;
int m1, m2;
diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c
index 0d14e0d8eacf..4c2b4483683c 100644
--- a/arch/mips/mm/gup.c
+++ b/arch/mips/mm/gup.c
@@ -235,7 +235,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
* @nr_pages: number of pages from start to pin
- * @write: whether pages will be written to
+ * @gup_flags: flags modifying pin behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
*
@@ -247,8 +247,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
* requested. If nr_pages is 0 or negative, returns 0. If no pages
* were pinned, returns -errno.
*/
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages)
+int get_user_pages_fast(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
{
struct mm_struct *mm = current->mm;
unsigned long addr, len, end;
@@ -273,7 +273,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
goto slow;
- if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+ if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE,
+ pages, &nr))
goto slow;
} while (pgdp++, addr = next, addr != end);
local_irq_enable();
@@ -289,7 +290,7 @@ slow_irqon:
pages += nr;
ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT,
- pages, write ? FOLL_WRITE : 0);
+ pages, gup_flags);
/* Have to be a bit careful with return values */
if (nr > 0) {
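
A minimal caller sketch of the flags-based calling convention these gup hunks move to: the old write argument becomes a gup_flags word, with FOLL_WRITE carrying the write intent. The helper name and the single-page case are illustrative only.

#include <linux/mm.h>

/* Illustrative only: pin one user page for writing with the new convention. */
static int example_pin_page_for_write(unsigned long uaddr, struct page **page)
{
	int npinned;

	npinned = get_user_pages_fast(uaddr, 1, FOLL_WRITE, page);
	if (npinned < 1)
		return npinned < 0 ? npinned : -EFAULT;

	return 0;
}
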
diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
index bbb196ad5f26..8a038b30d3c4 100644
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -504,14 +504,6 @@ void free_init_pages(const char *what, unsigned long begin, unsigned long end)
printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
}
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM,
- "initrd");
-}
-#endif
-
void (*free_init_pages_eva)(void *begin, void *end) = NULL;
void __ref free_initmem(void)
diff --git a/arch/nds32/mm/init.c b/arch/nds32/mm/init.c
index f32e40e36fe1..55703b03d172 100644
--- a/arch/nds32/mm/init.c
+++ b/arch/nds32/mm/init.c
@@ -252,18 +252,6 @@ void __init mem_init(void)
return;
}
-void free_initmem(void)
-{
- free_initmem_default(-1);
-}
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
-
void __set_fixmap(enum fixed_addresses idx,
phys_addr_t phys, pgprot_t flags)
{
diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig
index ea37394ff3ea..c2a8b9a95865 100644
--- a/arch/nios2/Kconfig
+++ b/arch/nios2/Kconfig
@@ -16,6 +16,7 @@ config NIOS2
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_KGDB
select IRQ_DOMAIN
+ select MMU_GATHER_NO_RANGE if MMU
select MODULES_USE_ELF_RELA
select OF
select OF_EARLY_FLATTREE
@@ -23,8 +24,6 @@ config NIOS2
select SPARSE_IRQ
select USB_ARCH_HAS_HCD if USB_SUPPORT
select CPU_NO_EFFICIENT_FFS
- select ARCH_DISCARD_MEMBLOCK
- select MMU_GATHER_NO_RANGE if MMU
config GENERIC_CSUM
def_bool y
diff --git a/arch/nios2/mm/init.c b/arch/nios2/mm/init.c
index 16cea5776b87..2c609c2516b2 100644
--- a/arch/nios2/mm/init.c
+++ b/arch/nios2/mm/init.c
@@ -82,18 +82,6 @@ void __init mmu_init(void)
flush_tlb_all();
}
-#ifdef CONFIG_BLK_DEV_INITRD
-void __init free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
-
-void __ref free_initmem(void)
-{
- free_initmem_default(-1);
-}
-
#define __page_aligned(order) __aligned(PAGE_SIZE << (order))
pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned(PGD_ORDER);
pte_t invalid_pte_table[PTRS_PER_PTE] __page_aligned(PTE_ORDER);
diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c
index e8a640c7e74c..e63cb4a91a3e 100644
--- a/arch/openrisc/mm/init.c
+++ b/arch/openrisc/mm/init.c
@@ -222,15 +222,3 @@ void __init mem_init(void)
mem_init_done = 1;
return;
}
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
-
-void free_initmem(void)
-{
- free_initmem_default(-1);
-}
diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index 5a68185295a9..ddca8287d43b 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -920,10 +920,3 @@ void flush_tlb_all(void)
spin_unlock(&sid_lock);
}
#endif
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index d7996cfaceca..6a66a2da5b1a 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -137,6 +137,7 @@ config PPC
select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARCH_HAS_ZONE_DEVICE if PPC_BOOK3S_64
select ARCH_HAVE_NMI_SAFE_CMPXCHG
+ select ARCH_KEEP_MEMBLOCK
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
@@ -149,6 +150,7 @@ config PPC
select BUILDTIME_EXTABLE_SORT
select CLONE_BACKWARDS
select DCACHE_WORD_ACCESS if PPC64 && CPU_LITTLE_ENDIAN
+ select DYNAMIC_DEBUG_RELATIVE_POINTERS if PPC64
select DYNAMIC_FTRACE if FUNCTION_TRACER
select EDAC_ATOMIC_SCRUB
select EDAC_SUPPORT
diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
index b9f6e72bf4e5..3efffcd675c2 100644
--- a/arch/powerpc/include/asm/Kbuild
+++ b/arch/powerpc/include/asm/Kbuild
@@ -3,6 +3,7 @@ generated-y += syscall_table_64.h
generated-y += syscall_table_c32.h
generated-y += syscall_table_spu.h
generic-y += div64.h
+generic-y += dynamic_debug.h
generic-y += export.h
generic-y += irq_regs.h
generic-y += local64.h
diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h
index 56140d19c85f..12e150e615b7 100644
--- a/arch/powerpc/include/asm/book3s/64/hugetlb.h
+++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h
@@ -36,8 +36,8 @@ static inline int hstate_get_psize(struct hstate *hstate)
}
}
-#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
-static inline bool gigantic_page_supported(void)
+#define __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED
+static inline bool gigantic_page_runtime_supported(void)
{
/*
* We used gigantic page reservation with hypervisor assist in some case.
@@ -49,7 +49,6 @@ static inline bool gigantic_page_supported(void)
return true;
}
-#endif
/* hugepd entry valid bit */
#define HUGEPD_VAL_BITS (0x8000000000000000UL)
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 87da40129927..7dee64f1e4cb 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1306,16 +1306,6 @@ void show_user_instructions(struct pt_regs *regs)
pc = regs->nip - (NR_INSN_TO_PRINT * 3 / 4 * sizeof(int));
- /*
- * Make sure the NIP points at userspace, not kernel text/data or
- * elsewhere.
- */
- if (!__access_ok(pc, NR_INSN_TO_PRINT * sizeof(int), USER_DS)) {
- pr_info("%s[%d]: Bad NIP, not dumping instructions.\n",
- current->comm, current->pid);
- return;
- }
-
seq_buf_init(&s, buf, sizeof(buf));
while (n) {
@@ -1326,7 +1316,7 @@ void show_user_instructions(struct pt_regs *regs)
for (i = 0; i < 8 && n; i++, n--, pc += sizeof(int)) {
int instr;
- if (probe_kernel_address((const void *)pc, instr)) {
+ if (probe_user_read(&instr, (void __user *)pc, sizeof(instr))) {
seq_buf_printf(&s, "XXXXXXXX ");
continue;
}
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 523bb99d7676..00682b8df330 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -628,14 +628,14 @@ static int __init prom_next_node(phandle *nodep)
}
}
-static inline int prom_getprop(phandle node, const char *pname,
- void *value, size_t valuelen)
+static inline int __init prom_getprop(phandle node, const char *pname,
+ void *value, size_t valuelen)
{
return call_prom("getprop", 4, 1, node, ADDR(pname),
(u32)(unsigned long) value, (u32) valuelen);
}
-static inline int prom_getproplen(phandle node, const char *pname)
+static inline int __init prom_getproplen(phandle node, const char *pname)
{
return call_prom("getproplen", 2, 1, node, ADDR(pname));
}
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index e8e93c2c7d03..7a1708875d27 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -610,7 +610,7 @@ SYSFS_PMCSETUP(pa6t_pmc2, SPRN_PA6T_PMC2);
SYSFS_PMCSETUP(pa6t_pmc3, SPRN_PA6T_PMC3);
SYSFS_PMCSETUP(pa6t_pmc4, SPRN_PA6T_PMC4);
SYSFS_PMCSETUP(pa6t_pmc5, SPRN_PA6T_PMC5);
-#ifdef CONFIG_DEBUG_KERNEL
+#ifdef CONFIG_DEBUG_MISC
SYSFS_SPRSETUP(hid0, SPRN_HID0);
SYSFS_SPRSETUP(hid1, SPRN_HID1);
SYSFS_SPRSETUP(hid4, SPRN_HID4);
@@ -639,7 +639,7 @@ SYSFS_SPRSETUP(tsr0, SPRN_PA6T_TSR0);
SYSFS_SPRSETUP(tsr1, SPRN_PA6T_TSR1);
SYSFS_SPRSETUP(tsr2, SPRN_PA6T_TSR2);
SYSFS_SPRSETUP(tsr3, SPRN_PA6T_TSR3);
-#endif /* CONFIG_DEBUG_KERNEL */
+#endif /* CONFIG_DEBUG_MISC */
#endif /* HAS_PPC_PMC_PA6T */
#ifdef HAS_PPC_PMC_IBM
@@ -680,7 +680,7 @@ static struct device_attribute pa6t_attrs[] = {
__ATTR(pmc3, 0600, show_pa6t_pmc3, store_pa6t_pmc3),
__ATTR(pmc4, 0600, show_pa6t_pmc4, store_pa6t_pmc4),
__ATTR(pmc5, 0600, show_pa6t_pmc5, store_pa6t_pmc5),
-#ifdef CONFIG_DEBUG_KERNEL
+#ifdef CONFIG_DEBUG_MISC
__ATTR(hid0, 0600, show_hid0, store_hid0),
__ATTR(hid1, 0600, show_hid1, store_hid1),
__ATTR(hid4, 0600, show_hid4, store_hid4),
@@ -709,7 +709,7 @@ static struct device_attribute pa6t_attrs[] = {
__ATTR(tsr1, 0600, show_tsr1, store_tsr1),
__ATTR(tsr2, 0600, show_tsr2, store_tsr2),
__ATTR(tsr3, 0600, show_tsr3, store_tsr3),
-#endif /* CONFIG_DEBUG_KERNEL */
+#endif /* CONFIG_DEBUG_MISC */
};
#endif /* HAS_PPC_PMC_PA6T */
#endif /* HAS_PPC_PMC_CLASSIC */
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 665f294725cb..83e59fdaa62d 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -179,7 +179,7 @@ extern void panic_flush_kmsg_end(void)
kmsg_dump(KMSG_DUMP_PANIC);
bust_spinlocks(0);
debug_locks_off();
- console_flush_on_panic();
+ console_flush_on_panic(CONSOLE_FLUSH_PENDING);
}
static unsigned long oops_begin(struct pt_regs *regs)
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index be7bc070eae5..ab3d484c5e2e 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -600,7 +600,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
/* If writing != 0, then the HPTE must allow writing, if we get here */
write_ok = writing;
hva = gfn_to_hva_memslot(memslot, gfn);
- npages = get_user_pages_fast(hva, 1, writing, pages);
+ npages = get_user_pages_fast(hva, 1, writing ? FOLL_WRITE : 0, pages);
if (npages < 1) {
/* Check if it's an I/O mapping */
down_read(&current->mm->mmap_sem);
@@ -1193,7 +1193,7 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
goto err;
hva = gfn_to_hva_memslot(memslot, gfn);
- npages = get_user_pages_fast(hva, 1, 1, pages);
+ npages = get_user_pages_fast(hva, 1, FOLL_WRITE, pages);
if (npages < 1)
goto err;
page = pages[0];
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 66270e07449a..38af1c2eb652 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -56,39 +56,31 @@ static unsigned long kvmppc_stt_pages(unsigned long tce_pages)
return tce_pages + ALIGN(stt_bytes, PAGE_SIZE) / PAGE_SIZE;
}
-static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc)
+static long kvmppc_account_memlimit(unsigned long pages, bool inc)
{
long ret = 0;
+ s64 locked_vm;
if (!current || !current->mm)
return ret; /* process exited */
- down_write(&current->mm->mmap_sem);
-
if (inc) {
- unsigned long locked, lock_limit;
+ unsigned long lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
- locked = current->mm->locked_vm + stt_pages;
- lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+ locked_vm = atomic64_add_return(pages, &current->mm->locked_vm);
+ if (locked_vm > lock_limit && !capable(CAP_IPC_LOCK)) {
ret = -ENOMEM;
- else
- current->mm->locked_vm += stt_pages;
+ atomic64_sub(pages, &current->mm->locked_vm);
+ }
} else {
- if (WARN_ON_ONCE(stt_pages > current->mm->locked_vm))
- stt_pages = current->mm->locked_vm;
-
- current->mm->locked_vm -= stt_pages;
+ locked_vm = atomic64_sub_return(pages, &current->mm->locked_vm);
+ WARN_ON_ONCE(locked_vm < 0);
}
- pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%ld %ld/%ld%s\n", current->pid,
- inc ? '+' : '-',
- stt_pages << PAGE_SHIFT,
- current->mm->locked_vm << PAGE_SHIFT,
- rlimit(RLIMIT_MEMLOCK),
- ret ? " - exceeded" : "");
-
- up_write(&current->mm->mmap_sem);
+ pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%lu %lld/%lu%s\n", current->pid,
+ inc ? '+' : '-', pages << PAGE_SHIFT,
+ locked_vm << PAGE_SHIFT,
+ rlimit(RLIMIT_MEMLOCK), ret ? " - exceeded" : "");
return ret;
}
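
A condensed sketch of the lockless accounting pattern the locked_vm hunks in this series converge on, assuming mm->locked_vm has been converted to an atomic64_t elsewhere in the series; the helper name is illustrative only.

#include <linux/mm_types.h>
#include <linux/sched/signal.h>	/* rlimit() */
#include <linux/capability.h>

/* Illustrative only: charge or uncharge pages against RLIMIT_MEMLOCK. */
static long example_account_locked_vm(struct mm_struct *mm,
				      unsigned long pages, bool inc)
{
	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	s64 locked_vm;

	if (inc) {
		locked_vm = atomic64_add_return(pages, &mm->locked_vm);
		if (locked_vm > limit && !capable(CAP_IPC_LOCK)) {
			atomic64_sub(pages, &mm->locked_vm);	/* roll back the charge */
			return -ENOMEM;
		}
	} else {
		locked_vm = atomic64_sub_return(pages, &mm->locked_vm);
		WARN_ON_ONCE(locked_vm < 0);
	}

	return 0;
}
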
diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c
index 24296f4cadc6..e0af53fd78c5 100644
--- a/arch/powerpc/kvm/e500_mmu.c
+++ b/arch/powerpc/kvm/e500_mmu.c
@@ -783,7 +783,7 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
if (!pages)
return -ENOMEM;
- ret = get_user_pages_fast(cfg->array, num_pages, 1, pages);
+ ret = get_user_pages_fast(cfg->array, num_pages, FOLL_WRITE, pages);
if (ret < 0)
goto free_pages;
diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c
index 8330f135294f..cd370d66d797 100644
--- a/arch/powerpc/mm/book3s64/iommu_api.c
+++ b/arch/powerpc/mm/book3s64/iommu_api.c
@@ -54,33 +54,29 @@ struct mm_iommu_table_group_mem_t {
static long mm_iommu_adjust_locked_vm(struct mm_struct *mm,
unsigned long npages, bool incr)
{
- long ret = 0, locked, lock_limit;
+ long ret = 0;
+ unsigned long lock_limit;
+ s64 locked_vm;
if (!npages)
return 0;
- down_write(&mm->mmap_sem);
-
if (incr) {
- locked = mm->locked_vm + npages;
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+ locked_vm = atomic64_add_return(npages, &mm->locked_vm);
+ if (locked_vm > lock_limit && !capable(CAP_IPC_LOCK)) {
ret = -ENOMEM;
- else
- mm->locked_vm += npages;
+ atomic64_sub(npages, &mm->locked_vm);
+ }
} else {
- if (WARN_ON_ONCE(npages > mm->locked_vm))
- npages = mm->locked_vm;
- mm->locked_vm -= npages;
+ locked_vm = atomic64_sub_return(npages, &mm->locked_vm);
+ WARN_ON_ONCE(locked_vm < 0);
}
- pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n",
- current ? current->pid : 0,
- incr ? '+' : '-',
- npages << PAGE_SHIFT,
- mm->locked_vm << PAGE_SHIFT,
+ pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%lu %lld/%lu\n",
+ current ? current->pid : 0, incr ? '+' : '-',
+ npages << PAGE_SHIFT, locked_vm << PAGE_SHIFT,
rlimit(RLIMIT_MEMLOCK));
- up_write(&mm->mmap_sem);
return ret;
}
@@ -141,8 +137,9 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
for (entry = 0; entry < entries; entry += chunk) {
unsigned long n = min(entries - entry, chunk);
- ret = get_user_pages_longterm(ua + (entry << PAGE_SHIFT), n,
- FOLL_WRITE, mem->hpages + entry, NULL);
+ ret = get_user_pages(ua + (entry << PAGE_SHIFT), n,
+ FOLL_WRITE | FOLL_LONGTERM,
+ mem->hpages + entry, NULL);
if (ret == n) {
pinned += n;
continue;
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index 6a23b9ebd2a1..14ff414d1545 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -90,8 +90,8 @@ void radix__tlbiel_all(unsigned int action)
asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
}
-static inline void __tlbiel_pid(unsigned long pid, int set,
- unsigned long ric)
+static __always_inline void __tlbiel_pid(unsigned long pid, int set,
+ unsigned long ric)
{
unsigned long rb,rs,prs,r;
@@ -106,7 +106,7 @@ static inline void __tlbiel_pid(unsigned long pid, int set,
trace_tlbie(0, 1, rb, rs, ric, prs, r);
}
-static inline void __tlbie_pid(unsigned long pid, unsigned long ric)
+static __always_inline void __tlbie_pid(unsigned long pid, unsigned long ric)
{
unsigned long rb,rs,prs,r;
@@ -136,7 +136,7 @@ static inline void __tlbiel_lpid(unsigned long lpid, int set,
trace_tlbie(lpid, 1, rb, rs, ric, prs, r);
}
-static inline void __tlbie_lpid(unsigned long lpid, unsigned long ric)
+static __always_inline void __tlbie_lpid(unsigned long lpid, unsigned long ric)
{
unsigned long rb,rs,prs,r;
@@ -239,7 +239,7 @@ static inline void fixup_tlbie_lpid(unsigned long lpid)
/*
* We use 128 set in radix mode and 256 set in hpt mode.
*/
-static inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
+static __always_inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
{
int set;
@@ -928,7 +928,7 @@ void radix__tlb_flush(struct mmu_gather *tlb)
tlb->need_flush_all = 0;
}
-static inline void __radix__flush_tlb_range_psize(struct mm_struct *mm,
+static __always_inline void __radix__flush_tlb_range_psize(struct mm_struct *mm,
unsigned long start, unsigned long end,
int psize, bool also_pwc)
{
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index b5d3578d9f65..8785def75916 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -304,12 +304,8 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) &&
access_ok(nip, sizeof(*nip))) {
unsigned int inst;
- int res;
- pagefault_disable();
- res = __get_user_inatomic(inst, nip);
- pagefault_enable();
- if (!res)
+ if (!probe_user_read(&inst, nip, sizeof(inst)))
return !store_updates_sp(inst);
*must_retry = true;
}
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index cd525d709072..879872e72d40 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -109,8 +109,8 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end)
return -ENODEV;
}
-int __ref arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
- bool want_memblock)
+int __ref arch_add_memory(int nid, u64 start, u64 size,
+ struct mhp_restrictions *restrictions)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
@@ -127,12 +127,12 @@ int __ref arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altm
}
flush_inval_dcache_range(start, start + size);
- return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+ return __add_pages(nid, start_pfn, nr_pages, restrictions);
}
#ifdef CONFIG_MEMORY_HOTREMOVE
-int __ref arch_remove_memory(int nid, u64 start, u64 size,
- struct vmem_altmap *altmap)
+void __ref arch_remove_memory(int nid, u64 start, u64 size,
+ struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
@@ -147,14 +147,13 @@ int __ref arch_remove_memory(int nid, u64 start, u64 size,
if (altmap)
page += vmem_altmap_offset(altmap);
- ret = __remove_pages(page_zone(page), start_pfn, nr_pages, altmap);
- if (ret)
- return ret;
+ __remove_pages(page_zone(page), start_pfn, nr_pages, altmap);
/* Remove htab bolted mappings for this section of memory */
start = (unsigned long)__va(start);
flush_inval_dcache_range(start, start + size);
ret = remove_section_mapping(start, start + size);
+ WARN_ON_ONCE(ret);
/* Ensure all vmalloc mappings are flushed in case they also
* hit that section of memory
@@ -163,8 +162,6 @@ int __ref arch_remove_memory(int nid, u64 start, u64 size,
if (resize_hpt_for_hotplug(memblock_phys_mem_size()) == -ENOSPC)
pr_warn("Hash collision while resizing HPT\n");
-
- return ret;
}
#endif
#endif /* CONFIG_MEMORY_HOTPLUG */
@@ -338,13 +335,6 @@ void free_initmem(void)
free_initmem_default(POISON_FREE_INITMEM);
}
-#ifdef CONFIG_BLK_DEV_INITRD
-void __init free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
-
/*
* This is called when a page has been modified by the kernel.
* It just marks the page as not i-cache clean. We do the i-cache
diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index 0af051a1974e..0680efb2237b 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -159,12 +159,8 @@ static int read_user_stack_64(unsigned long __user *ptr, unsigned long *ret)
((unsigned long)ptr & 7))
return -EFAULT;
- pagefault_disable();
- if (!__get_user_inatomic(*ret, ptr)) {
- pagefault_enable();
+ if (!probe_user_read(ret, ptr, sizeof(*ret)))
return 0;
- }
- pagefault_enable();
return read_user_stack_slow(ptr, ret, 8);
}
@@ -175,12 +171,8 @@ static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret)
((unsigned long)ptr & 3))
return -EFAULT;
- pagefault_disable();
- if (!__get_user_inatomic(*ret, ptr)) {
- pagefault_enable();
+ if (!probe_user_read(ret, ptr, sizeof(*ret)))
return 0;
- }
- pagefault_enable();
return read_user_stack_slow(ptr, ret, 4);
}
@@ -307,17 +299,11 @@ static inline int current_is_64bit(void)
*/
static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret)
{
- int rc;
-
if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) ||
((unsigned long)ptr & 3))
return -EFAULT;
- pagefault_disable();
- rc = __get_user_inatomic(*ret, ptr);
- pagefault_enable();
-
- return rc;
+ return probe_user_read(ret, ptr, sizeof(*ret));
}
static inline void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry,
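
The callchain, BHRB and fault-handler hunks above all collapse the same open-coded sequence into one call; a small sketch, assuming the generic probe_user_read() helper that returns 0 on success and -EFAULT when the user access faults. The wrapper below is illustrative only.

#include <linux/uaccess.h>

/* Illustrative only: read one word from user memory without taking a page fault. */
static int example_read_user_word(unsigned int __user *ptr, unsigned int *val)
{
	/* Old pattern:
	 *	pagefault_disable();
	 *	rc = __get_user_inatomic(*val, ptr);
	 *	pagefault_enable();
	 * New pattern:
	 */
	return probe_user_read(val, ptr, sizeof(*val));
}
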
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index a66fb9c01c9e..73db84d24491 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -420,7 +420,6 @@ static void power_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
static __u64 power_pmu_bhrb_to(u64 addr)
{
unsigned int instr;
- int ret;
__u64 target;
if (is_kernel_addr(addr)) {
@@ -431,13 +430,8 @@ static __u64 power_pmu_bhrb_to(u64 addr)
}
/* Userspace: need copy instruction here then translate it */
- pagefault_disable();
- ret = __get_user_inatomic(instr, (unsigned int __user *)addr);
- if (ret) {
- pagefault_enable();
+ if (probe_user_read(&instr, (unsigned int __user *)addr, sizeof(instr)))
return 0;
- }
- pagefault_enable();
target = branch_target(&instr);
if ((!target) || (instr & BRANCH_ABSOLUTE))
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index d0e172d47574..2794235e9d3e 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -331,7 +331,7 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK
config PPC_RADIX_MMU
bool "Radix MMU Support"
depends on PPC_BOOK3S_64 && HUGETLB_PAGE
- select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA
+ select ARCH_HAS_GIGANTIC_PAGE
select PPC_HAVE_KUEP
select PPC_HAVE_KUAP
default y
diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c
index f49aec251a5a..37b0ba5f4ff9 100644
--- a/arch/powerpc/sysdev/fsl_pci.c
+++ b/arch/powerpc/sysdev/fsl_pci.c
@@ -1069,13 +1069,11 @@ int fsl_pci_mcheck_exception(struct pt_regs *regs)
addr += mfspr(SPRN_MCAR);
if (is_in_pci_mem_space(addr)) {
- if (user_mode(regs)) {
- pagefault_disable();
- ret = get_user(inst, (__u32 __user *)regs->nip);
- pagefault_enable();
- } else {
+ if (user_mode(regs))
+ ret = probe_user_read(&inst, (void __user *)regs->nip,
+ sizeof(inst));
+ else
ret = probe_kernel_address((void *)regs->nip, inst);
- }
if (!ret && mcheck_handle_load(regs, inst)) {
regs->nip += 4;
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index bc7b77e34d09..8bf6f9c2d48c 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -66,11 +66,6 @@ void __init mem_init(void)
mem_init_print_info(NULL);
}
-void free_initmem(void)
-{
- free_initmem_default(0);
-}
-
#ifdef CONFIG_BLK_DEV_INITRD
static void __init setup_initrd(void)
{
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 07485582d027..109243fdb6ec 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -63,7 +63,7 @@ config S390
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
- select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA
+ select ARCH_HAS_GIGANTIC_PAGE
select ARCH_HAS_KCOV
select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_SET_MEMORY
@@ -100,6 +100,7 @@ config S390
select ARCH_INLINE_WRITE_UNLOCK_BH
select ARCH_INLINE_WRITE_UNLOCK_IRQ
select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
+ select ARCH_KEEP_MEMBLOCK
select ARCH_SAVE_PAGE_KEYS if HIBERNATION
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_NUMA_BALANCING
diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h
index ce2770743cc5..27696755daa9 100644
--- a/arch/s390/include/asm/cpacf.h
+++ b/arch/s390/include/asm/cpacf.h
@@ -203,7 +203,7 @@ static inline int __cpacf_check_opcode(unsigned int opcode)
}
}
-static inline int cpacf_query(unsigned int opcode, cpacf_mask_t *mask)
+static __always_inline int cpacf_query(unsigned int opcode, cpacf_mask_t *mask)
{
if (__cpacf_check_opcode(opcode)) {
__cpacf_query(opcode, mask);
diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h
index 2d1afa58a4b6..bb59dd964590 100644
--- a/arch/s390/include/asm/hugetlb.h
+++ b/arch/s390/include/asm/hugetlb.h
@@ -116,7 +116,9 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot)
return pte_modify(pte, newprot);
}
-#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
-static inline bool gigantic_page_supported(void) { return true; }
-#endif
+static inline bool gigantic_page_runtime_supported(void)
+{
+ return true;
+}
+
#endif /* _ASM_S390_HUGETLB_H */
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index f30679a3a939..9dde4d7d8704 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -2383,7 +2383,7 @@ static int kvm_s390_adapter_map(struct kvm *kvm, unsigned int id, __u64 addr)
ret = -EFAULT;
goto out;
}
- ret = get_user_pages_fast(map->addr, 1, 1, &map->page);
+ ret = get_user_pages_fast(map->addr, 1, FOLL_WRITE, &map->page);
if (ret < 0)
goto out;
BUG_ON(ret != 1);
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 7cf48eefec8f..14d1eae9fe43 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -157,14 +157,6 @@ void free_initmem(void)
free_initmem_default(POISON_FREE_INITMEM);
}
-#ifdef CONFIG_BLK_DEV_INITRD
-void __init free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM,
- "initrd");
-}
-#endif
-
unsigned long memory_block_size_bytes(void)
{
/*
@@ -227,8 +219,8 @@ device_initcall(s390_cma_mem_init);
#endif /* CONFIG_CMA */
-int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
- bool want_memblock)
+int arch_add_memory(int nid, u64 start, u64 size,
+ struct mhp_restrictions *restrictions)
{
unsigned long start_pfn = PFN_DOWN(start);
unsigned long size_pages = PFN_DOWN(size);
@@ -238,21 +230,22 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
if (rc)
return rc;
- rc = __add_pages(nid, start_pfn, size_pages, altmap, want_memblock);
+ rc = __add_pages(nid, start_pfn, size_pages, restrictions);
if (rc)
vmem_remove_mapping(start, size);
return rc;
}
#ifdef CONFIG_MEMORY_HOTREMOVE
-int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap)
+void arch_remove_memory(int nid, u64 start, u64 size,
+ struct vmem_altmap *altmap)
{
/*
* There is no hardware or firmware interface which could trigger a
* hot memory remove on s390. So there is nothing that needs to be
* implemented.
*/
- return -EBUSY;
+ BUG();
}
#endif
#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 0be08d586d40..b77f512bb176 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -10,7 +10,6 @@ config SUPERH
select DMA_DECLARE_COHERENT
select HAVE_IDE if HAS_IOPORT_MAP
select HAVE_MEMBLOCK_NODE_MAP
- select ARCH_DISCARD_MEMBLOCK
select HAVE_OPROFILE
select HAVE_ARCH_TRACEHOOK
select HAVE_PERF_EVENTS
@@ -53,6 +52,7 @@ config SUPERH
select HAVE_FUTEX_CMPXCHG if FUTEX
select HAVE_NMI
select NEED_SG_DMA_LENGTH
+ select ARCH_HAS_GIGANTIC_PAGE
help
The SuperH is a RISC processor targeted for use in embedded systems
diff --git a/arch/sh/boards/mach-dreamcast/irq.c b/arch/sh/boards/mach-dreamcast/irq.c
index a929f764ae04..cc06e4cdb4cd 100644
--- a/arch/sh/boards/mach-dreamcast/irq.c
+++ b/arch/sh/boards/mach-dreamcast/irq.c
@@ -10,7 +10,6 @@
*/
#include <linux/irq.h>
#include <linux/io.h>
-#include <linux/irq.h>
#include <linux/export.h>
#include <linux/err.h>
#include <mach/sysasic.h>
diff --git a/arch/sh/mm/gup.c b/arch/sh/mm/gup.c
index 3e27f6d1f1ec..277c882f7489 100644
--- a/arch/sh/mm/gup.c
+++ b/arch/sh/mm/gup.c
@@ -204,7 +204,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
* @nr_pages: number of pages from start to pin
- * @write: whether pages will be written to
+ * @gup_flags: flags modifying pin behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
*
@@ -216,8 +216,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
* requested. If nr_pages is 0 or negative, returns 0. If no pages
* were pinned, returns -errno.
*/
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages)
+int get_user_pages_fast(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
{
struct mm_struct *mm = current->mm;
unsigned long addr, len, end;
@@ -241,7 +241,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
goto slow;
- if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+ if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE,
+ pages, &nr))
goto slow;
} while (pgdp++, addr = next, addr != end);
local_irq_enable();
@@ -261,7 +262,7 @@ slow_irqon:
ret = get_user_pages_unlocked(start,
(end - start) >> PAGE_SHIFT, pages,
- write ? FOLL_WRITE : 0);
+ gup_flags);
/* Have to be a bit careful with return values */
if (nr > 0) {
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index 70621324db41..b95e343e3c9d 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -403,28 +403,16 @@ void __init mem_init(void)
mem_init_done = 1;
}
-void free_initmem(void)
-{
- free_initmem_default(-1);
-}
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
-
#ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
- bool want_memblock)
+int arch_add_memory(int nid, u64 start, u64 size,
+ struct mhp_restrictions *restrictions)
{
unsigned long start_pfn = PFN_DOWN(start);
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
/* We only have ZONE_NORMAL, so this is easy.. */
- ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+ ret = __add_pages(nid, start_pfn, nr_pages, restrictions);
if (unlikely(ret))
printk("%s: Failed, __add_pages() == %d\n", __func__, ret);
@@ -441,20 +429,15 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif
#ifdef CONFIG_MEMORY_HOTREMOVE
-int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap)
+void arch_remove_memory(int nid, u64 start, u64 size,
+ struct vmem_altmap *altmap)
{
unsigned long start_pfn = PFN_DOWN(start);
unsigned long nr_pages = size >> PAGE_SHIFT;
struct zone *zone;
- int ret;
zone = page_zone(pfn_to_page(start_pfn));
- ret = __remove_pages(zone, start_pfn, nr_pages, altmap);
- if (unlikely(ret))
- pr_warn("%s: Failed, __remove_pages() == %d\n", __func__,
- ret);
-
- return ret;
+ __remove_pages(zone, start_pfn, nr_pages, altmap);
}
#endif
#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index f6421c9ce5d3..7c93f3121ee6 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -92,6 +92,7 @@ config SPARC64
select ARCH_CLOCKSOURCE_DATA
select ARCH_HAS_PTE_SPECIAL
select PCI_DOMAINS if PCI
+ select ARCH_HAS_GIGANTIC_PAGE
config ARCH_DEFCONFIG
string
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 1393a8ac596b..22500c3be7a9 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -231,36 +231,6 @@ extern unsigned long _PAGE_ALL_SZ_BITS;
extern struct page *mem_map_zero;
#define ZERO_PAGE(vaddr) (mem_map_zero)
-/* This macro must be updated when the size of struct page grows above 80
- * or reduces below 64.
- * The idea that compiler optimizes out switch() statement, and only
- * leaves clrx instructions
- */
-#define mm_zero_struct_page(pp) do { \
- unsigned long *_pp = (void *)(pp); \
- \
- /* Check that struct page is either 64, 72, or 80 bytes */ \
- BUILD_BUG_ON(sizeof(struct page) & 7); \
- BUILD_BUG_ON(sizeof(struct page) < 64); \
- BUILD_BUG_ON(sizeof(struct page) > 80); \
- \
- switch (sizeof(struct page)) { \
- case 80: \
- _pp[9] = 0; /* fallthrough */ \
- case 72: \
- _pp[8] = 0; /* fallthrough */ \
- default: \
- _pp[7] = 0; \
- _pp[6] = 0; \
- _pp[5] = 0; \
- _pp[4] = 0; \
- _pp[3] = 0; \
- _pp[2] = 0; \
- _pp[1] = 0; \
- _pp[0] = 0; \
- } \
-} while (0)
-
/* PFNs are real physical page numbers. However, mem_map only begins to record
* per-page information starting at pfn_base. This is to handle systems where
* the first physical page in the machine is at some huge physical address,
diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c
index aee6dba83d0e..1e770a517d4a 100644
--- a/arch/sparc/mm/gup.c
+++ b/arch/sparc/mm/gup.c
@@ -245,8 +245,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
return nr;
}
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages)
+int get_user_pages_fast(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
{
struct mm_struct *mm = current->mm;
unsigned long addr, len, end;
@@ -303,7 +303,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
goto slow;
- if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+ if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE,
+ pages, &nr))
goto slow;
} while (pgdp++, addr = next, addr != end);
@@ -324,7 +325,7 @@ slow:
ret = get_user_pages_unlocked(start,
(end - start) >> PAGE_SHIFT, pages,
- write ? FOLL_WRITE : 0);
+ gup_flags);
/* Have to be a bit careful with return values */
if (nr > 0) {
diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c
index a8ff29821bdb..046ab116cc8c 100644
--- a/arch/sparc/mm/init_32.c
+++ b/arch/sparc/mm/init_32.c
@@ -294,19 +294,6 @@ void __init mem_init(void)
mem_init_print_info(NULL);
}
-void free_initmem (void)
-{
- free_initmem_default(POISON_FREE_INITMEM);
-}
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM,
- "initrd");
-}
-#endif
-
void sparc_flush_page_to_ram(struct page *page)
{
unsigned long vaddr = (unsigned long)page_address(page);
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index bc2aaa47bc8a..4b099dd7a767 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2572,14 +2572,6 @@ void free_initmem(void)
}
}
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM,
- "initrd");
-}
-#endif
-
pgprot_t PAGE_KERNEL __read_mostly;
EXPORT_SYMBOL(PAGE_KERNEL);
diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c
index 99aa11bf53d1..a9c9a94c096f 100644
--- a/arch/um/kernel/mem.c
+++ b/arch/um/kernel/mem.c
@@ -188,13 +188,6 @@ void free_initmem(void)
{
}
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
-
/* Allocate and free page tables. */
pgd_t *pgd_alloc(struct mm_struct *mm)
diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig
index e85f89f96eca..41fe944005f8 100644
--- a/arch/unicore32/Kconfig
+++ b/arch/unicore32/Kconfig
@@ -3,6 +3,7 @@ config UNICORE32
def_bool y
select ARCH_32BIT_OFF_T
select ARCH_HAS_DEVMEM_IS_ALLOWED
+ select ARCH_HAS_KEEPINITRD
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
select HAVE_KERNEL_GZIP
diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c
index 74b6a2e29809..b4442f3060ce 100644
--- a/arch/unicore32/mm/init.c
+++ b/arch/unicore32/mm/init.c
@@ -287,27 +287,3 @@ void __init mem_init(void)
sysctl_overcommit_memory = OVERCOMMIT_ALWAYS;
}
}
-
-void free_initmem(void)
-{
- free_initmem_default(-1);
-}
-
-#ifdef CONFIG_BLK_DEV_INITRD
-
-static int keep_initrd;
-
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- if (!keep_initrd)
- free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-
-static int __init keepinitrd_setup(char *__unused)
-{
- keep_initrd = 1;
- return 1;
-}
-
-__setup("keepinitrd", keepinitrd_setup);
-#endif
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e7212731cffb..98b86e819772 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -22,7 +22,7 @@ config X86_64
def_bool y
depends on 64BIT
# Options that are inherently 64-bit kernel only:
- select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA
+ select ARCH_HAS_GIGANTIC_PAGE
select ARCH_SUPPORTS_INT128
select ARCH_USE_CMPXCHG_LOCKREF
select HAVE_ARCH_SOFT_DIRTY
@@ -30,6 +30,7 @@ config X86_64
select NEED_DMA_MAP_STATE
select SWIOTLB
select ARCH_HAS_SYSCALL_WRAPPER
+ select DYNAMIC_DEBUG_RELATIVE_POINTERS
#
# Arch settings
@@ -47,7 +48,6 @@ config X86
select ARCH_32BIT_OFF_T if X86_32
select ARCH_CLOCKSOURCE_DATA
select ARCH_CLOCKSOURCE_INIT
- select ARCH_DISCARD_MEMBLOCK
select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
select ARCH_HAS_DEBUG_VIRTUAL
select ARCH_HAS_DEVMEM_IS_ALLOWED
@@ -306,9 +306,6 @@ config ZONE_DMA32
config AUDIT_ARCH
def_bool y if X86_64
-config ARCH_SUPPORTS_OPTIMIZED_INLINING
- def_bool y
-
config ARCH_SUPPORTS_DEBUG_PAGEALLOC
def_bool y
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 15d0fbe27872..f730680dc818 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -266,20 +266,6 @@ config CPA_DEBUG
---help---
Do change_page_attr() self-tests every 30 seconds.
-config OPTIMIZE_INLINING
- bool "Allow gcc to uninline functions marked 'inline'"
- ---help---
- This option determines if the kernel forces gcc to inline the functions
- developers have marked 'inline'. Doing so takes away freedom from gcc to
- do what it thinks is best, which is desirable for the gcc 3.x series of
- compilers. The gcc 4.x series have a rewritten inlining algorithm and
- enabling this option will generate a smaller kernel there. Hopefully
- this algorithm is so good that allowing gcc 4.x and above to make the
- decision will become the default in the future. Until then this option
- is there to test gcc for this.
-
- If unsure, say N.
-
config DEBUG_ENTRY
bool "Debug low-level entry code"
depends on DEBUG_KERNEL
diff --git a/arch/x86/entry/vdso/vdso32/vclock_gettime.c b/arch/x86/entry/vdso/vdso32/vclock_gettime.c
index 9242b28418d5..9acec4426206 100644
--- a/arch/x86/entry/vdso/vdso32/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vdso32/vclock_gettime.c
@@ -17,6 +17,7 @@
#undef CONFIG_ILLEGAL_POINTER_VALUE
#undef CONFIG_SPARSEMEM_VMEMMAP
#undef CONFIG_NR_CPUS
+#undef CONFIG_DYNAMIC_DEBUG_RELATIVE_POINTERS
#define CONFIG_X86_32 1
#define CONFIG_PGTABLE_LEVELS 2
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index eebd05942e6c..60232bddbf88 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -7,6 +7,7 @@ generated-y += unistd_64_x32.h
generated-y += xen-hypercalls.h
generic-y += dma-contiguous.h
+generic-y += dynamic_debug.h
generic-y += early_ioremap.h
generic-y += export.h
generic-y += mcs_spinlock.h
diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
index 7469d321f072..f65cfb48cfdd 100644
--- a/arch/x86/include/asm/hugetlb.h
+++ b/arch/x86/include/asm/hugetlb.h
@@ -17,8 +17,4 @@ static inline void arch_clear_hugepage_flags(struct page *page)
{
}
-#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
-static inline bool gigantic_page_supported(void) { return true; }
-#endif
-
#endif /* _ASM_X86_HUGETLB_H */
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index 2bdbbbcfa393..cdf44aa9a501 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* This file contains definitions from Hyper-V Hypervisor Top-Level Functional
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 6bdca39829bc..08715034e315 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -140,7 +140,7 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
pt_element_t *table;
struct page *page;
- npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page);
+ npages = get_user_pages_fast((unsigned long)ptep_user, 1, FOLL_WRITE, &page);
/* Check if the user is doing something meaningless. */
if (unlikely(npages != 1))
return -EFAULT;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 406b558abfef..6b92eaf4a3b1 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1805,7 +1805,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
return NULL;
/* Pin the user virtual address. */
- npinned = get_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages);
+ npinned = get_user_pages_fast(uaddr, npages, FOLL_WRITE, pages);
if (npinned != npages) {
pr_err("SEV: Failure locking %lu pages.\n", npages);
goto err;
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 92e4c4b85bba..fab095362c50 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -203,7 +203,7 @@ static __init int setup_hugepagesz(char *opt)
}
__setup("hugepagesz=", setup_hugepagesz);
-#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
+#ifdef CONFIG_CONTIG_ALLOC
static __init int gigantic_pages_init(void)
{
/* With compaction or CMA we can allocate gigantic pages at runtime */
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 85c94f9a87f8..075e568098f2 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -850,24 +850,25 @@ void __init mem_init(void)
}
#ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
- bool want_memblock)
+int arch_add_memory(int nid, u64 start, u64 size,
+ struct mhp_restrictions *restrictions)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
- return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+ return __add_pages(nid, start_pfn, nr_pages, restrictions);
}
#ifdef CONFIG_MEMORY_HOTREMOVE
-int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap)
+void arch_remove_memory(int nid, u64 start, u64 size,
+ struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
struct zone *zone;
zone = page_zone(pfn_to_page(start_pfn));
- return __remove_pages(zone, start_pfn, nr_pages, altmap);
+ __remove_pages(zone, start_pfn, nr_pages, altmap);
}
#endif
#endif
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index bccff68e3267..20d14254b686 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -777,11 +777,11 @@ static void update_end_of_memory_vars(u64 start, u64 size)
}
int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
- struct vmem_altmap *altmap, bool want_memblock)
+ struct mhp_restrictions *restrictions)
{
int ret;
- ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+ ret = __add_pages(nid, start_pfn, nr_pages, restrictions);
WARN_ON_ONCE(ret);
/* update max_pfn, max_low_pfn and high_memory */
@@ -791,15 +791,15 @@ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
return ret;
}
-int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
- bool want_memblock)
+int arch_add_memory(int nid, u64 start, u64 size,
+ struct mhp_restrictions *restrictions)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
init_memory_mapping(start, start + size);
- return add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+ return add_pages(nid, start_pfn, nr_pages, restrictions);
}
#define PAGE_INUSE 0xFD
@@ -1141,24 +1141,20 @@ kernel_physical_mapping_remove(unsigned long start, unsigned long end)
remove_pagetable(start, end, true, NULL);
}
-int __ref arch_remove_memory(int nid, u64 start, u64 size,
- struct vmem_altmap *altmap)
+void __ref arch_remove_memory(int nid, u64 start, u64 size,
+ struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
struct page *page = pfn_to_page(start_pfn);
struct zone *zone;
- int ret;
/* With altmap the first mapped page is offset from @start */
if (altmap)
page += vmem_altmap_offset(altmap);
zone = page_zone(page);
- ret = __remove_pages(zone, start_pfn, nr_pages, altmap);
- WARN_ON_ONCE(ret);
+ __remove_pages(zone, start_pfn, nr_pages, altmap);
kernel_physical_mapping_remove(start, start + size);
-
- return ret;
}
#endif
#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index dfb6c4df639a..53277862de40 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -212,8 +212,6 @@ static void __init alloc_node_data(int nid)
node_data[nid] = nd;
memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
-
- node_set_online(nid);
}
/**
@@ -566,7 +564,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
return -EINVAL;
/* Finally register nodes. */
- for_each_node_mask(nid, node_possible_map) {
+ for_each_node_mask(nid, numa_nodes_parsed) {
u64 start = PFN_PHYS(max_pfn);
u64 end = 0;
@@ -577,9 +575,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
end = max(mi->blk[i].end, end);
}
- if (start >= end)
- continue;
-
/*
* Don't confuse VM with a node that doesn't have the
* minimum amount of memory:
@@ -588,6 +583,8 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
continue;
alloc_node_data(nid);
+ if (end)
+ node_set_online(nid);
}
/* Dump memblock with node info and return. */
@@ -717,21 +714,6 @@ void __init x86_numa_init(void)
numa_init(dummy_numa_init);
}
-static void __init init_memory_less_node(int nid)
-{
- unsigned long zones_size[MAX_NR_ZONES] = {0};
- unsigned long zholes_size[MAX_NR_ZONES] = {0};
-
- /* Allocate and initialize node data. Memory-less node is now online.*/
- alloc_node_data(nid);
- free_area_init_node(nid, zones_size, 0, zholes_size);
-
- /*
- * All zonelists will be built later in start_kernel() after per cpu
- * areas are initialized.
- */
-}
-
/*
* Setup early cpu_to_node.
*
@@ -759,9 +741,6 @@ void __init init_cpu_to_node(void)
if (node == NUMA_NO_NODE)
continue;
- if (!node_online(node))
- init_memory_less_node(node);
-
numa_set_node(cpu, node);
}
}
diff --git a/arch/xtensa/include/asm/irqflags.h b/arch/xtensa/include/asm/irqflags.h
index 9b5e8526afe5..12890681587b 100644
--- a/arch/xtensa/include/asm/irqflags.h
+++ b/arch/xtensa/include/asm/irqflags.h
@@ -27,7 +27,7 @@ static inline unsigned long arch_local_irq_save(void)
{
unsigned long flags;
#if XTENSA_FAKE_NMI
-#if defined(CONFIG_DEBUG_KERNEL) && (LOCKLEVEL | TOPLEVEL) >= XCHAL_DEBUGLEVEL
+#if defined(CONFIG_DEBUG_MISC) && (LOCKLEVEL | TOPLEVEL) >= XCHAL_DEBUGLEVEL
unsigned long tmp;
asm volatile("rsr %0, ps\t\n"
diff --git a/arch/xtensa/kernel/smp.c b/arch/xtensa/kernel/smp.c
index 3699d6d3e479..83b244ce61ee 100644
--- a/arch/xtensa/kernel/smp.c
+++ b/arch/xtensa/kernel/smp.c
@@ -126,7 +126,7 @@ void secondary_start_kernel(void)
init_mmu();
-#ifdef CONFIG_DEBUG_KERNEL
+#ifdef CONFIG_DEBUG_MISC
if (boot_secondary_processors == 0) {
pr_debug("%s: boot_secondary_processors:%d; Hanging cpu:%d\n",
__func__, boot_secondary_processors, cpu);
diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c
index d49861099684..b51746f2b80b 100644
--- a/arch/xtensa/mm/init.c
+++ b/arch/xtensa/mm/init.c
@@ -216,11 +216,6 @@ void free_initrd_mem(unsigned long start, unsigned long end)
}
#endif
-void free_initmem(void)
-{
- free_initmem_default(-1);
-}
-
static void __init parse_memmap_one(char *p)
{
char *oldp;
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index e49028a60429..f180427e48f4 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -231,13 +231,14 @@ static bool pages_correctly_probed(unsigned long start_pfn)
* OK to have direct references to sparsemem variables in here.
*/
static int
-memory_block_action(unsigned long phys_index, unsigned long action, int online_type)
+memory_block_action(unsigned long start_section_nr, unsigned long action,
+ int online_type)
{
unsigned long start_pfn;
unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
int ret;
- start_pfn = section_nr_to_pfn(phys_index);
+ start_pfn = section_nr_to_pfn(start_section_nr);
switch (action) {
case MEM_ONLINE:
@@ -251,7 +252,7 @@ memory_block_action(unsigned long phys_index, unsigned long action, int online_t
break;
default:
WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
- "%ld\n", __func__, phys_index, action, action);
+ "%ld\n", __func__, start_section_nr, action, action);
ret = -EINVAL;
}
@@ -733,16 +734,18 @@ unregister_memory(struct memory_block *memory)
{
BUG_ON(memory->dev.bus != &memory_subsys);
- /* drop the ref. we got in remove_memory_section() */
+ /* drop the ref. we got via find_memory_block() */
put_device(&memory->dev);
device_unregister(&memory->dev);
}
-static int remove_memory_section(unsigned long node_id,
- struct mem_section *section, int phys_device)
+void unregister_memory_section(struct mem_section *section)
{
struct memory_block *mem;
+ if (WARN_ON_ONCE(!present_section(section)))
+ return;
+
mutex_lock(&mem_sysfs_mutex);
/*
@@ -763,15 +766,6 @@ static int remove_memory_section(unsigned long node_id,
out_unlock:
mutex_unlock(&mem_sysfs_mutex);
- return 0;
-}
-
-int unregister_memory_section(struct mem_section *section)
-{
- if (!present_section(section))
- return -EINVAL;
-
- return remove_memory_section(0, section, 0);
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
diff --git a/drivers/firewire/core-iso.c b/drivers/firewire/core-iso.c
index 11b921727183..23fd996df8d2 100644
--- a/drivers/firewire/core-iso.c
+++ b/drivers/firewire/core-iso.c
@@ -107,19 +107,8 @@ EXPORT_SYMBOL(fw_iso_buffer_init);
int fw_iso_buffer_map_vma(struct fw_iso_buffer *buffer,
struct vm_area_struct *vma)
{
- unsigned long uaddr;
- int i, err;
-
- uaddr = vma->vm_start;
- for (i = 0; i < buffer->page_count; i++) {
- err = vm_insert_page(vma, uaddr, buffer->pages[i]);
- if (err)
- return err;
-
- uaddr += PAGE_SIZE;
- }
-
- return 0;
+ return vm_map_pages_zero(vma, buffer->pages,
+ buffer->page_count);
}
void fw_iso_buffer_destroy(struct fw_iso_buffer *buffer,
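
A minimal mmap-handler sketch of the helper this hunk switches to: vm_map_pages_zero() inserts a whole page array starting at vma->vm_start, replacing the open-coded vm_insert_page() loop. The buffer layout (a page array plus count kept in driver data) is assumed for illustration.

#include <linux/mm.h>
#include <linux/fs.h>

/* Illustrative only: assumed per-file buffer with a page array and its length. */
struct example_buffer {
	struct page **pages;
	unsigned long page_count;
};

static int example_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct example_buffer *buf = file->private_data;

	return vm_map_pages_zero(vma, buf->pages, buf->page_count);
}
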
diff --git a/drivers/fpga/dfl-afu-dma-region.c b/drivers/fpga/dfl-afu-dma-region.c
index e18a786fc943..0bbc7142f1dc 100644
--- a/drivers/fpga/dfl-afu-dma-region.c
+++ b/drivers/fpga/dfl-afu-dma-region.c
@@ -35,45 +35,37 @@ void afu_dma_region_init(struct dfl_feature_platform_data *pdata)
* afu_dma_adjust_locked_vm - adjust locked memory
* @dev: port device
* @npages: number of pages
- * @incr: increase or decrease locked memory
*
* Increase or decrease the locked memory size with npages input.
*
* Return 0 on success.
* Return -ENOMEM if locked memory size is over the limit and no CAP_IPC_LOCK.
*/
-static int afu_dma_adjust_locked_vm(struct device *dev, long npages, bool incr)
+static int afu_dma_adjust_locked_vm(struct device *dev, long pages)
{
- unsigned long locked, lock_limit;
+ unsigned long lock_limit;
+ s64 locked_vm;
int ret = 0;
/* the task is exiting. */
- if (!current->mm)
+ if (!current->mm || !pages)
return 0;
- down_write(&current->mm->mmap_sem);
-
- if (incr) {
- locked = current->mm->locked_vm + npages;
+ locked_vm = atomic64_add_return(pages, &current->mm->locked_vm);
+ WARN_ON_ONCE(locked_vm < 0);
+ if (pages > 0 && !capable(CAP_IPC_LOCK)) {
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
- if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+ if (locked_vm > lock_limit) {
ret = -ENOMEM;
- else
- current->mm->locked_vm += npages;
- } else {
- if (WARN_ON_ONCE(npages > current->mm->locked_vm))
- npages = current->mm->locked_vm;
- current->mm->locked_vm -= npages;
+ atomic64_sub(pages, &current->mm->locked_vm);
+ }
}
- dev_dbg(dev, "[%d] RLIMIT_MEMLOCK %c%ld %ld/%ld%s\n", current->pid,
- incr ? '+' : '-', npages << PAGE_SHIFT,
- current->mm->locked_vm << PAGE_SHIFT, rlimit(RLIMIT_MEMLOCK),
+ dev_dbg(dev, "[%d] RLIMIT_MEMLOCK %c%ld %lld/%lu%s\n", current->pid,
+ (pages > 0) ? '+' : '-', pages << PAGE_SHIFT,
+ locked_vm << PAGE_SHIFT, rlimit(RLIMIT_MEMLOCK),
ret ? "- exceeded" : "");
- up_write(&current->mm->mmap_sem);
-
return ret;
}
@@ -92,7 +84,7 @@ static int afu_dma_pin_pages(struct dfl_feature_platform_data *pdata,
struct device *dev = &pdata->dev->dev;
int ret, pinned;
- ret = afu_dma_adjust_locked_vm(dev, npages, true);
+ ret = afu_dma_adjust_locked_vm(dev, npages);
if (ret)
return ret;
@@ -102,7 +94,7 @@ static int afu_dma_pin_pages(struct dfl_feature_platform_data *pdata,
goto unlock_vm;
}
- pinned = get_user_pages_fast(region->user_addr, npages, 1,
+ pinned = get_user_pages_fast(region->user_addr, npages, FOLL_WRITE,
region->pages);
if (pinned < 0) {
ret = pinned;
@@ -121,7 +113,7 @@ put_pages:
free_pages:
kfree(region->pages);
unlock_vm:
- afu_dma_adjust_locked_vm(dev, npages, false);
+ afu_dma_adjust_locked_vm(dev, -npages);
return ret;
}
@@ -141,7 +133,7 @@ static void afu_dma_unpin_pages(struct dfl_feature_platform_data *pdata,
put_all_pages(region->pages, npages);
kfree(region->pages);
- afu_dma_adjust_locked_vm(dev, npages, false);
+ afu_dma_adjust_locked_vm(dev, -npages);
dev_dbg(dev, "%ld pages unpinned\n", npages);
}
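With mm->locked_vm now an atomic64_t, the accounting above no longer takes mmap_sem. A reduced sketch of the pattern, using a hypothetical helper name and eliding the debug output:

static int demo_account_locked_vm(struct mm_struct *mm, long pages)
{
	unsigned long lock_limit;
	s64 locked_vm;

	locked_vm = atomic64_add_return(pages, &mm->locked_vm);
	if (pages > 0 && !capable(CAP_IPC_LOCK)) {
		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
		if (locked_vm > lock_limit) {
			/* back the increment out when over the rlimit */
			atomic64_sub(pages, &mm->locked_vm);
			return -ENOMEM;
		}
	}
	return 0;
}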
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
index 3e6823fdd939..58ed401c5996 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
@@ -256,14 +256,14 @@ static int amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn,
/* TODO we should be able to split locking for interval tree and
* amdgpu_mn_invalidate_node
*/
- if (amdgpu_mn_read_lock(amn, range->blockable))
+ if (amdgpu_mn_read_lock(amn, mmu_notifier_range_blockable(range)))
return -EAGAIN;
it = interval_tree_iter_first(&amn->objects, range->start, end);
while (it) {
struct amdgpu_mn_node *node;
- if (!range->blockable) {
+ if (!mmu_notifier_range_blockable(range)) {
amdgpu_mn_read_unlock(amn);
return -EAGAIN;
}
@@ -299,7 +299,7 @@ static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn,
/* notification is exclusive, but interval is inclusive */
end = range->end - 1;
- if (amdgpu_mn_read_lock(amn, range->blockable))
+ if (amdgpu_mn_read_lock(amn, mmu_notifier_range_blockable(range)))
return -EAGAIN;
it = interval_tree_iter_first(&amn->objects, range->start, end);
@@ -307,7 +307,7 @@ static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn,
struct amdgpu_mn_node *node;
struct amdgpu_bo *bo;
- if (!range->blockable) {
+ if (!mmu_notifier_range_blockable(range)) {
amdgpu_mn_read_unlock(amn);
return -EAGAIN;
}
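range->blockable is replaced by the mmu_notifier_range_blockable() accessor in all notifier implementations touched here; the typical invalidate_range_start shape is roughly the following (hypothetical notifier structure):

static int demo_invalidate_range_start(struct mmu_notifier *mn,
				const struct mmu_notifier_range *range)
{
	struct demo_mn *dmn = container_of(mn, struct demo_mn, mn);

	if (mmu_notifier_range_blockable(range))
		mutex_lock(&dmn->lock);
	else if (!mutex_trylock(&dmn->lock))
		return -EAGAIN;		/* may not sleep in this invalidation */

	/* ... invalidate mappings in [range->start, range->end) ... */

	mutex_unlock(&dmn->lock);
	return 0;
}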
diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 215bf3fef10c..8079ea3af103 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -122,7 +122,7 @@ userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
while (it) {
struct drm_i915_gem_object *obj;
- if (!range->blockable) {
+ if (!mmu_notifier_range_blockable(range)) {
ret = -EAGAIN;
break;
}
diff --git a/drivers/gpu/drm/radeon/radeon_mn.c b/drivers/gpu/drm/radeon/radeon_mn.c
index b3019505065a..c9bd1278f573 100644
--- a/drivers/gpu/drm/radeon/radeon_mn.c
+++ b/drivers/gpu/drm/radeon/radeon_mn.c
@@ -133,7 +133,7 @@ static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
/* TODO we should be able to split locking for interval tree and
* the tear down.
*/
- if (range->blockable)
+ if (mmu_notifier_range_blockable(range))
mutex_lock(&rmn->lock);
else if (!mutex_trylock(&rmn->lock))
return -EAGAIN;
@@ -144,7 +144,7 @@ static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
struct radeon_bo *bo;
long r;
- if (!range->blockable) {
+ if (!mmu_notifier_range_blockable(range)) {
ret = -EAGAIN;
goto out_unlock;
}
diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
index a8db758d523e..a2ebb08990e9 100644
--- a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
+++ b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
@@ -221,26 +221,13 @@ static int rockchip_drm_gem_object_mmap_iommu(struct drm_gem_object *obj,
struct vm_area_struct *vma)
{
struct rockchip_gem_object *rk_obj = to_rockchip_obj(obj);
- unsigned int i, count = obj->size >> PAGE_SHIFT;
+ unsigned int count = obj->size >> PAGE_SHIFT;
unsigned long user_count = vma_pages(vma);
- unsigned long uaddr = vma->vm_start;
- unsigned long offset = vma->vm_pgoff;
- unsigned long end = user_count + offset;
- int ret;
if (user_count == 0)
return -ENXIO;
- if (end > count)
- return -ENXIO;
- for (i = offset; i < end; i++) {
- ret = vm_insert_page(vma, uaddr, rk_obj->pages[i]);
- if (ret)
- return ret;
- uaddr += PAGE_SIZE;
- }
-
- return 0;
+ return vm_map_pages(vma, rk_obj->pages, count);
}
static int rockchip_drm_gem_object_mmap_dma(struct drm_gem_object *obj,
diff --git a/drivers/gpu/drm/via/via_dmablit.c b/drivers/gpu/drm/via/via_dmablit.c
index 8bf3a7c23ed3..062067438f1d 100644
--- a/drivers/gpu/drm/via/via_dmablit.c
+++ b/drivers/gpu/drm/via/via_dmablit.c
@@ -243,7 +243,8 @@ via_lock_all_dma_pages(drm_via_sg_info_t *vsg, drm_via_dmablit_t *xfer)
if (NULL == vsg->pages)
return -ENOMEM;
ret = get_user_pages_fast((unsigned long)xfer->mem_addr,
- vsg->num_pages, vsg->direction == DMA_FROM_DEVICE,
+ vsg->num_pages,
+ vsg->direction == DMA_FROM_DEVICE ? FOLL_WRITE : 0,
vsg->pages);
if (ret != vsg->num_pages) {
if (ret < 0)
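The third argument of get_user_pages_fast() changes from a write boolean to gup_flags in this series, so callers now pass FOLL_WRITE (optionally OR'ed with other FOLL_* flags). Before/after, illustratively:

/* old: int write */
ret = get_user_pages_fast(uaddr, nr_pages, 1, pages);

/* new: unsigned int gup_flags */
ret = get_user_pages_fast(uaddr, nr_pages, write ? FOLL_WRITE : 0, pages);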
diff --git a/drivers/gpu/drm/xen/xen_drm_front_gem.c b/drivers/gpu/drm/xen/xen_drm_front_gem.c
index 53c376d55fcf..a24548489dde 100644
--- a/drivers/gpu/drm/xen/xen_drm_front_gem.c
+++ b/drivers/gpu/drm/xen/xen_drm_front_gem.c
@@ -224,8 +224,7 @@ xen_drm_front_gem_import_sg_table(struct drm_device *dev,
static int gem_mmap_obj(struct xen_gem_object *xen_obj,
struct vm_area_struct *vma)
{
- unsigned long addr = vma->vm_start;
- int i;
+ int ret;
/*
* clear the VM_PFNMAP flag that was set by drm_gem_mmap(), and set the
@@ -252,18 +251,11 @@ static int gem_mmap_obj(struct xen_gem_object *xen_obj,
* FIXME: as we insert all the pages now then no .fault handler must
* be called, so don't provide one
*/
- for (i = 0; i < xen_obj->num_pages; i++) {
- int ret;
-
- ret = vm_insert_page(vma, addr, xen_obj->pages[i]);
- if (ret < 0) {
- DRM_ERROR("Failed to insert pages into vma: %d\n", ret);
- return ret;
- }
+ ret = vm_map_pages(vma, xen_obj->pages, xen_obj->num_pages);
+ if (ret < 0)
+ DRM_ERROR("Failed to map pages into vma: %d\n", ret);
- addr += PAGE_SIZE;
- }
- return 0;
+ return ret;
}
int xen_drm_front_gem_mmap(struct file *filp, struct vm_area_struct *vma)
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 0a23048db523..e7ea819fcb11 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -295,10 +295,11 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
while (npages) {
down_read(&mm->mmap_sem);
- ret = get_user_pages_longterm(cur_base,
+ ret = get_user_pages(cur_base,
min_t(unsigned long, npages,
PAGE_SIZE / sizeof (struct page *)),
- gup_flags, page_list, NULL);
+ gup_flags | FOLL_LONGTERM,
+ page_list, NULL);
if (ret < 0) {
up_read(&mm->mmap_sem);
goto umem_release;
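get_user_pages_longterm() goes away in favour of passing FOLL_LONGTERM to get_user_pages(); the conversion in this and the following callers is mechanical:

/* before */
ret = get_user_pages_longterm(start, nr_pages, gup_flags, pages, vmas);

/* after */
ret = get_user_pages(start, nr_pages, gup_flags | FOLL_LONGTERM, pages, vmas);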
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index c7226cf52acc..f962b5bbfa40 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -152,7 +152,7 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
struct ib_ucontext_per_mm *per_mm =
container_of(mn, struct ib_ucontext_per_mm, mn);
- if (range->blockable)
+ if (mmu_notifier_range_blockable(range))
down_read(&per_mm->umem_rwsem);
else if (!down_read_trylock(&per_mm->umem_rwsem))
return -EAGAIN;
@@ -170,7 +170,8 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
return rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start,
range->end,
invalidate_range_start_trampoline,
- range->blockable, NULL);
+ mmu_notifier_range_blockable(range),
+ NULL);
}
static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start,
diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c
index 24b592c6522e..02eee8eff1db 100644
--- a/drivers/infiniband/hw/hfi1/user_pages.c
+++ b/drivers/infiniband/hw/hfi1/user_pages.c
@@ -104,8 +104,9 @@ int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t np
bool writable, struct page **pages)
{
int ret;
+ unsigned int gup_flags = FOLL_LONGTERM | (writable ? FOLL_WRITE : 0);
- ret = get_user_pages_fast(vaddr, npages, writable, pages);
+ ret = get_user_pages_fast(vaddr, npages, gup_flags, pages);
if (ret < 0)
return ret;
diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c
index 112d2f38e0de..8ff0e90d7564 100644
--- a/drivers/infiniband/hw/mthca/mthca_memfree.c
+++ b/drivers/infiniband/hw/mthca/mthca_memfree.c
@@ -472,7 +472,8 @@ int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
goto out;
}
- ret = get_user_pages_fast(uaddr & PAGE_MASK, 1, FOLL_WRITE, pages);
+ ret = get_user_pages_fast(uaddr & PAGE_MASK, 1,
+ FOLL_WRITE | FOLL_LONGTERM, pages);
if (ret < 0)
goto out;
diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c
index 123ca8f64f75..f712fb7fa82f 100644
--- a/drivers/infiniband/hw/qib/qib_user_pages.c
+++ b/drivers/infiniband/hw/qib/qib_user_pages.c
@@ -114,10 +114,10 @@ int qib_get_user_pages(unsigned long start_page, size_t num_pages,
down_read(&current->mm->mmap_sem);
for (got = 0; got < num_pages; got += ret) {
- ret = get_user_pages_longterm(start_page + got * PAGE_SIZE,
- num_pages - got,
- FOLL_WRITE | FOLL_FORCE,
- p + got, NULL);
+ ret = get_user_pages(start_page + got * PAGE_SIZE,
+ num_pages - got,
+ FOLL_LONGTERM | FOLL_WRITE | FOLL_FORCE,
+ p + got, NULL);
if (ret < 0) {
up_read(&current->mm->mmap_sem);
goto bail_release;
diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.c b/drivers/infiniband/hw/qib/qib_user_sdma.c
index ef19d39a44b1..0c204776263f 100644
--- a/drivers/infiniband/hw/qib/qib_user_sdma.c
+++ b/drivers/infiniband/hw/qib/qib_user_sdma.c
@@ -670,7 +670,7 @@ static int qib_user_sdma_pin_pages(const struct qib_devdata *dd,
else
j = npages;
- ret = get_user_pages_fast(addr, j, 0, pages);
+ ret = get_user_pages_fast(addr, j, FOLL_LONGTERM, pages);
if (ret != j) {
i = 0;
j = ret;
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c
index da35d6fdfc5e..e312f522a66d 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom.c
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.c
@@ -143,10 +143,11 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
ret = 0;
while (npages) {
- ret = get_user_pages_longterm(cur_base,
- min_t(unsigned long, npages,
- PAGE_SIZE / sizeof(struct page *)),
- gup_flags, page_list, NULL);
+ ret = get_user_pages(cur_base,
+ min_t(unsigned long, npages,
+ PAGE_SIZE / sizeof(struct page *)),
+ gup_flags | FOLL_LONGTERM,
+ page_list, NULL);
if (ret < 0)
goto out;
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 542b0005e492..129c4badf9ae 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -648,17 +648,7 @@ out_free_pages:
int iommu_dma_mmap(struct page **pages, size_t size, struct vm_area_struct *vma)
{
- unsigned long uaddr = vma->vm_start;
- unsigned int i, count = PAGE_ALIGN(size) >> PAGE_SHIFT;
- int ret = -ENXIO;
-
- for (i = vma->vm_pgoff; i < count && uaddr < vma->vm_end; i++) {
- ret = vm_insert_page(vma, uaddr, pages[i]);
- if (ret)
- break;
- uaddr += PAGE_SIZE;
- }
- return ret;
+ return vm_map_pages(vma, pages, PAGE_ALIGN(size) >> PAGE_SHIFT);
}
static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys,
diff --git a/drivers/media/common/videobuf2/videobuf2-core.c b/drivers/media/common/videobuf2/videobuf2-core.c
index 7ebd58a1c431..3cf25abf5807 100644
--- a/drivers/media/common/videobuf2/videobuf2-core.c
+++ b/drivers/media/common/videobuf2/videobuf2-core.c
@@ -2201,6 +2201,13 @@ int vb2_mmap(struct vb2_queue *q, struct vm_area_struct *vma)
goto unlock;
}
+ /*
+ * vm_pgoff is treated in V4L2 API as a 'cookie' to select a buffer,
+	 * not as an in-buffer offset. We always want to mmap a whole buffer
+ * from its beginning.
+ */
+ vma->vm_pgoff = 0;
+
ret = call_memop(vb, mmap, vb->planes[plane].mem_priv, vma);
unlock:
diff --git a/drivers/media/common/videobuf2/videobuf2-dma-contig.c b/drivers/media/common/videobuf2/videobuf2-dma-contig.c
index 82389aead6ed..ecbef266130b 100644
--- a/drivers/media/common/videobuf2/videobuf2-dma-contig.c
+++ b/drivers/media/common/videobuf2/videobuf2-dma-contig.c
@@ -186,12 +186,6 @@ static int vb2_dc_mmap(void *buf_priv, struct vm_area_struct *vma)
return -EINVAL;
}
- /*
- * dma_mmap_* uses vm_pgoff as in-buffer offset, but we want to
- * map whole buffer
- */
- vma->vm_pgoff = 0;
-
ret = dma_mmap_attrs(buf->dev, vma, buf->cookie,
buf->dma_addr, buf->size, buf->attrs);
diff --git a/drivers/media/common/videobuf2/videobuf2-dma-sg.c b/drivers/media/common/videobuf2/videobuf2-dma-sg.c
index 270c3162fdcb..4a4c49d6085c 100644
--- a/drivers/media/common/videobuf2/videobuf2-dma-sg.c
+++ b/drivers/media/common/videobuf2/videobuf2-dma-sg.c
@@ -328,28 +328,18 @@ static unsigned int vb2_dma_sg_num_users(void *buf_priv)
static int vb2_dma_sg_mmap(void *buf_priv, struct vm_area_struct *vma)
{
struct vb2_dma_sg_buf *buf = buf_priv;
- unsigned long uaddr = vma->vm_start;
- unsigned long usize = vma->vm_end - vma->vm_start;
- int i = 0;
+ int err;
if (!buf) {
printk(KERN_ERR "No memory to map\n");
return -EINVAL;
}
- do {
- int ret;
-
- ret = vm_insert_page(vma, uaddr, buf->pages[i++]);
- if (ret) {
- printk(KERN_ERR "Remapping memory, error: %d\n", ret);
- return ret;
- }
-
- uaddr += PAGE_SIZE;
- usize -= PAGE_SIZE;
- } while (usize > 0);
-
+ err = vm_map_pages(vma, buf->pages, buf->num_pages);
+ if (err) {
+ printk(KERN_ERR "Remapping memory, error: %d\n", err);
+ return err;
+ }
/*
* Use common vm_area operations to track buffer refcount.
diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c
index 08929c087e27..870a2a526e0b 100644
--- a/drivers/media/v4l2-core/videobuf-dma-sg.c
+++ b/drivers/media/v4l2-core/videobuf-dma-sg.c
@@ -186,12 +186,12 @@ static int videobuf_dma_init_user_locked(struct videobuf_dmabuf *dma,
dprintk(1, "init user [0x%lx+0x%lx => %d pages]\n",
data, size, dma->nr_pages);
- err = get_user_pages_longterm(data & PAGE_MASK, dma->nr_pages,
- flags, dma->pages, NULL);
+ err = get_user_pages(data & PAGE_MASK, dma->nr_pages,
+ flags | FOLL_LONGTERM, dma->pages, NULL);
if (err != dma->nr_pages) {
dma->nr_pages = (err >= 0) ? err : 0;
- dprintk(1, "get_user_pages_longterm: err=%d [%d]\n", err,
+ dprintk(1, "get_user_pages: err=%d [%d]\n", err,
dma->nr_pages);
return err < 0 ? err : -EINVAL;
}
diff --git a/drivers/misc/genwqe/card_utils.c b/drivers/misc/genwqe/card_utils.c
index 25265fd0fd6e..89cff9d1012b 100644
--- a/drivers/misc/genwqe/card_utils.c
+++ b/drivers/misc/genwqe/card_utils.c
@@ -603,7 +603,7 @@ int genwqe_user_vmap(struct genwqe_dev *cd, struct dma_mapping *m, void *uaddr,
/* pin user pages in memory */
rc = get_user_pages_fast(data & PAGE_MASK, /* page aligned addr */
m->nr_pages,
- m->write, /* readable/writable */
+ m->write ? FOLL_WRITE : 0, /* readable/writable */
m->page_list); /* ptrs to pages */
if (rc < 0)
goto fail_get_user_pages;
diff --git a/drivers/misc/sram-exec.c b/drivers/misc/sram-exec.c
index 426ad912b441..d054e2842a5f 100644
--- a/drivers/misc/sram-exec.c
+++ b/drivers/misc/sram-exec.c
@@ -96,7 +96,7 @@ void *sram_exec_copy(struct gen_pool *pool, void *dst, void *src,
if (!part)
return NULL;
- if (!addr_in_gen_pool(pool, (unsigned long)dst, size))
+ if (!gen_pool_has_addr(pool, (unsigned long)dst, size))
return NULL;
base = (unsigned long)part->base;
diff --git a/drivers/misc/vmw_vmci/vmci_host.c b/drivers/misc/vmw_vmci/vmci_host.c
index 997f92543dd4..422d08da3244 100644
--- a/drivers/misc/vmw_vmci/vmci_host.c
+++ b/drivers/misc/vmw_vmci/vmci_host.c
@@ -242,7 +242,7 @@ static int vmci_host_setup_notify(struct vmci_ctx *context,
/*
* Lock physical page backing a given user VA.
*/
- retval = get_user_pages_fast(uva, 1, 1, &context->notify_page);
+ retval = get_user_pages_fast(uva, 1, FOLL_WRITE, &context->notify_page);
if (retval != 1) {
context->notify_page = NULL;
return VMCI_ERROR_GENERIC;
diff --git a/drivers/misc/vmw_vmci/vmci_queue_pair.c b/drivers/misc/vmw_vmci/vmci_queue_pair.c
index f5f1aac9d163..1174735f003d 100644
--- a/drivers/misc/vmw_vmci/vmci_queue_pair.c
+++ b/drivers/misc/vmw_vmci/vmci_queue_pair.c
@@ -659,7 +659,8 @@ static int qp_host_get_user_memory(u64 produce_uva,
int err = VMCI_SUCCESS;
retval = get_user_pages_fast((uintptr_t) produce_uva,
- produce_q->kernel_if->num_pages, 1,
+ produce_q->kernel_if->num_pages,
+ FOLL_WRITE,
produce_q->kernel_if->u.h.header_page);
if (retval < (int)produce_q->kernel_if->num_pages) {
pr_debug("get_user_pages_fast(produce) failed (retval=%d)",
@@ -671,7 +672,8 @@ static int qp_host_get_user_memory(u64 produce_uva,
}
retval = get_user_pages_fast((uintptr_t) consume_uva,
- consume_q->kernel_if->num_pages, 1,
+ consume_q->kernel_if->num_pages,
+ FOLL_WRITE,
consume_q->kernel_if->u.h.header_page);
if (retval < (int)consume_q->kernel_if->num_pages) {
pr_debug("get_user_pages_fast(consume) failed (retval=%d)",
diff --git a/drivers/mtd/nand/raw/vf610_nfc.c b/drivers/mtd/nand/raw/vf610_nfc.c
index 6d43ddb3332f..e4fe8c4bc711 100644
--- a/drivers/mtd/nand/raw/vf610_nfc.c
+++ b/drivers/mtd/nand/raw/vf610_nfc.c
@@ -364,7 +364,7 @@ static int vf610_nfc_cmd(struct nand_chip *chip,
{
const struct nand_op_instr *instr;
struct vf610_nfc *nfc = chip_to_nfc(chip);
- int op_id = -1, trfr_sz = 0, offset;
+ int op_id = -1, trfr_sz = 0, offset = 0;
u32 col = 0, row = 0, cmd1 = 0, cmd2 = 0, code = 0;
bool force8bit = false;
diff --git a/drivers/platform/goldfish/goldfish_pipe.c b/drivers/platform/goldfish/goldfish_pipe.c
index 321bc673c417..cef0133aa47a 100644
--- a/drivers/platform/goldfish/goldfish_pipe.c
+++ b/drivers/platform/goldfish/goldfish_pipe.c
@@ -274,7 +274,8 @@ static int pin_user_pages(unsigned long first_page,
*iter_last_page_size = last_page_size;
}
- ret = get_user_pages_fast(first_page, requested_pages, !is_write,
+ ret = get_user_pages_fast(first_page, requested_pages,
+ !is_write ? FOLL_WRITE : 0,
pages);
if (ret <= 0)
return -EFAULT;
diff --git a/drivers/pps/clients/pps-gpio.c b/drivers/pps/clients/pps-gpio.c
index dd5d1103e02b..4b6418039387 100644
--- a/drivers/pps/clients/pps-gpio.c
+++ b/drivers/pps/clients/pps-gpio.c
@@ -31,19 +31,25 @@
#include <linux/slab.h>
#include <linux/pps_kernel.h>
#include <linux/pps-gpio.h>
-#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
#include <linux/list.h>
#include <linux/of_device.h>
#include <linux/of_gpio.h>
+#include <linux/timer.h>
+#include <linux/jiffies.h>
/* Info for each registered platform device */
struct pps_gpio_device_data {
int irq; /* IRQ used as PPS source */
struct pps_device *pps; /* PPS source device */
struct pps_source_info info; /* PPS source information */
+ struct gpio_desc *gpio_pin; /* GPIO port descriptors */
+ struct gpio_desc *echo_pin;
+ struct timer_list echo_timer; /* timer to reset echo active state */
bool assert_falling_edge;
bool capture_clear;
- unsigned int gpio_pin;
+ unsigned int echo_active_ms; /* PPS echo active duration */
+ unsigned long echo_timeout; /* timer timeout value in jiffies */
};
/*
@@ -61,18 +67,101 @@ static irqreturn_t pps_gpio_irq_handler(int irq, void *data)
info = data;
- rising_edge = gpio_get_value(info->gpio_pin);
+ rising_edge = gpiod_get_value(info->gpio_pin);
if ((rising_edge && !info->assert_falling_edge) ||
(!rising_edge && info->assert_falling_edge))
- pps_event(info->pps, &ts, PPS_CAPTUREASSERT, NULL);
+ pps_event(info->pps, &ts, PPS_CAPTUREASSERT, data);
else if (info->capture_clear &&
((rising_edge && info->assert_falling_edge) ||
- (!rising_edge && !info->assert_falling_edge)))
- pps_event(info->pps, &ts, PPS_CAPTURECLEAR, NULL);
+ (!rising_edge && !info->assert_falling_edge)))
+ pps_event(info->pps, &ts, PPS_CAPTURECLEAR, data);
return IRQ_HANDLED;
}
+/* This function will only be called when an ECHO GPIO is defined */
+static void pps_gpio_echo(struct pps_device *pps, int event, void *data)
+{
+ /* add_timer() needs to write into info->echo_timer */
+ struct pps_gpio_device_data *info = data;
+
+ switch (event) {
+ case PPS_CAPTUREASSERT:
+ if (pps->params.mode & PPS_ECHOASSERT)
+ gpiod_set_value(info->echo_pin, 1);
+ break;
+
+ case PPS_CAPTURECLEAR:
+ if (pps->params.mode & PPS_ECHOCLEAR)
+ gpiod_set_value(info->echo_pin, 1);
+ break;
+ }
+
+ /* fire the timer */
+ if (info->pps->params.mode & (PPS_ECHOASSERT | PPS_ECHOCLEAR)) {
+ info->echo_timer.expires = jiffies + info->echo_timeout;
+ add_timer(&info->echo_timer);
+ }
+}
+
+/* Timer callback to reset the echo pin to the inactive state */
+static void pps_gpio_echo_timer_callback(struct timer_list *t)
+{
+ const struct pps_gpio_device_data *info;
+
+ info = from_timer(info, t, echo_timer);
+
+ gpiod_set_value(info->echo_pin, 0);
+}
+
+static int pps_gpio_setup(struct platform_device *pdev)
+{
+ struct pps_gpio_device_data *data = platform_get_drvdata(pdev);
+ struct device_node *np = pdev->dev.of_node;
+ int ret;
+ u32 value;
+
+ data->gpio_pin = devm_gpiod_get(&pdev->dev,
+ NULL, /* request "gpios" */
+ GPIOD_IN);
+ if (IS_ERR(data->gpio_pin)) {
+ dev_err(&pdev->dev,
+ "failed to request PPS GPIO\n");
+ return PTR_ERR(data->gpio_pin);
+ }
+
+ data->echo_pin = devm_gpiod_get_optional(&pdev->dev,
+ "echo",
+ GPIOD_OUT_LOW);
+ if (data->echo_pin) {
+ if (IS_ERR(data->echo_pin)) {
+ dev_err(&pdev->dev, "failed to request ECHO GPIO\n");
+ return PTR_ERR(data->echo_pin);
+ }
+
+ ret = of_property_read_u32(np,
+ "echo-active-ms",
+ &value);
+ if (ret) {
+ dev_err(&pdev->dev,
+ "failed to get echo-active-ms from OF\n");
+ return ret;
+ }
+ data->echo_active_ms = value;
+ /* sanity check on echo_active_ms */
+ if (!data->echo_active_ms || data->echo_active_ms > 999) {
+ dev_err(&pdev->dev,
+ "echo-active-ms: %u - bad value from OF\n",
+ data->echo_active_ms);
+ return -EINVAL;
+ }
+ }
+
+ if (of_property_read_bool(np, "assert-falling-edge"))
+ data->assert_falling_edge = true;
+ return 0;
+}
+
static unsigned long
get_irqf_trigger_flags(const struct pps_gpio_device_data *data)
{
@@ -90,53 +179,32 @@ get_irqf_trigger_flags(const struct pps_gpio_device_data *data)
static int pps_gpio_probe(struct platform_device *pdev)
{
struct pps_gpio_device_data *data;
- const char *gpio_label;
int ret;
int pps_default_params;
const struct pps_gpio_platform_data *pdata = pdev->dev.platform_data;
- struct device_node *np = pdev->dev.of_node;
/* allocate space for device info */
- data = devm_kzalloc(&pdev->dev, sizeof(struct pps_gpio_device_data),
- GFP_KERNEL);
+ data = devm_kzalloc(&pdev->dev, sizeof(*data), GFP_KERNEL);
if (!data)
return -ENOMEM;
+ platform_set_drvdata(pdev, data);
+ /* GPIO setup */
if (pdata) {
data->gpio_pin = pdata->gpio_pin;
- gpio_label = pdata->gpio_label;
+ data->echo_pin = pdata->echo_pin;
data->assert_falling_edge = pdata->assert_falling_edge;
data->capture_clear = pdata->capture_clear;
+ data->echo_active_ms = pdata->echo_active_ms;
} else {
- ret = of_get_gpio(np, 0);
- if (ret < 0) {
- dev_err(&pdev->dev, "failed to get GPIO from device tree\n");
- return ret;
- }
- data->gpio_pin = ret;
- gpio_label = PPS_GPIO_NAME;
-
- if (of_get_property(np, "assert-falling-edge", NULL))
- data->assert_falling_edge = true;
- }
-
- /* GPIO setup */
- ret = devm_gpio_request(&pdev->dev, data->gpio_pin, gpio_label);
- if (ret) {
- dev_err(&pdev->dev, "failed to request GPIO %u\n",
- data->gpio_pin);
- return ret;
- }
-
- ret = gpio_direction_input(data->gpio_pin);
- if (ret) {
- dev_err(&pdev->dev, "failed to set pin direction\n");
- return -EINVAL;
+ ret = pps_gpio_setup(pdev);
+ if (ret)
+ return -EINVAL;
}
/* IRQ setup */
- ret = gpio_to_irq(data->gpio_pin);
+ ret = gpiod_to_irq(data->gpio_pin);
if (ret < 0) {
dev_err(&pdev->dev, "failed to map GPIO to IRQ: %d\n", ret);
return -EINVAL;
@@ -152,6 +220,11 @@ static int pps_gpio_probe(struct platform_device *pdev)
data->info.owner = THIS_MODULE;
snprintf(data->info.name, PPS_MAX_NAME_LEN - 1, "%s.%d",
pdev->name, pdev->id);
+ if (data->echo_pin) {
+ data->info.echo = pps_gpio_echo;
+ data->echo_timeout = msecs_to_jiffies(data->echo_active_ms);
+ timer_setup(&data->echo_timer, pps_gpio_echo_timer_callback, 0);
+ }
/* register PPS source */
pps_default_params = PPS_CAPTUREASSERT | PPS_OFFSETASSERT;
@@ -173,7 +246,6 @@ static int pps_gpio_probe(struct platform_device *pdev)
return -EINVAL;
}
- platform_set_drvdata(pdev, data);
dev_info(data->pps->dev, "Registered IRQ %d as PPS source\n",
data->irq);
@@ -185,6 +257,11 @@ static int pps_gpio_remove(struct platform_device *pdev)
struct pps_gpio_device_data *data = platform_get_drvdata(pdev);
pps_unregister_source(data->pps);
+ if (data->echo_pin) {
+ del_timer_sync(&data->echo_timer);
+ /* reset echo pin in any case */
+ gpiod_set_value(data->echo_pin, 0);
+ }
dev_info(&pdev->dev, "removed IRQ %d as PPS source\n", data->irq);
return 0;
}
@@ -209,4 +286,4 @@ MODULE_AUTHOR("Ricardo Martins <rasm@fe.up.pt>");
MODULE_AUTHOR("James Nuss <jamesnuss@nanometrics.ca>");
MODULE_DESCRIPTION("Use GPIO pin as PPS source");
MODULE_LICENSE("GPL");
-MODULE_VERSION("1.0.0");
+MODULE_VERSION("1.2.0");
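The echo support added above follows the standard timer_list pattern; condensed to its essentials (a sketch, not the full driver):

static void pps_gpio_echo_timer_callback(struct timer_list *t)
{
	struct pps_gpio_device_data *info = from_timer(info, t, echo_timer);

	gpiod_set_value(info->echo_pin, 0);	/* deassert echo */
}

/* probe: */
timer_setup(&data->echo_timer, pps_gpio_echo_timer_callback, 0);
data->echo_timeout = msecs_to_jiffies(data->echo_active_ms);

/* echo callback: assert, then arm the timer to deassert later */
gpiod_set_value(info->echo_pin, 1);
info->echo_timer.expires = jiffies + info->echo_timeout;
add_timer(&info->echo_timer);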
diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c
index 1e1f42e210a0..4a4a75fa26d5 100644
--- a/drivers/rapidio/devices/rio_mport_cdev.c
+++ b/drivers/rapidio/devices/rio_mport_cdev.c
@@ -868,7 +868,9 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode,
pinned = get_user_pages_fast(
(unsigned long)xfer->loc_addr & PAGE_MASK,
- nr_pages, dir == DMA_FROM_DEVICE, page_list);
+ nr_pages,
+ dir == DMA_FROM_DEVICE ? FOLL_WRITE : 0,
+ page_list);
if (pinned != nr_pages) {
if (pinned < 0) {
diff --git a/drivers/rapidio/rio_cm.c b/drivers/rapidio/rio_cm.c
index cf45829585cb..b29fc258eeba 100644
--- a/drivers/rapidio/rio_cm.c
+++ b/drivers/rapidio/rio_cm.c
@@ -2147,6 +2147,14 @@ static int riocm_add_mport(struct device *dev,
mutex_init(&cm->rx_lock);
riocm_rx_fill(cm, RIOCM_RX_RING_SIZE);
cm->rx_wq = create_workqueue(DRV_NAME "/rxq");
+ if (!cm->rx_wq) {
+ riocm_error("failed to allocate IBMBOX_%d on %s",
+ cmbox, mport->name);
+ rio_release_outb_mbox(mport, cmbox);
+ kfree(cm);
+ return -ENOMEM;
+ }
+
INIT_WORK(&cm->rx_work, rio_ibmsg_handler);
cm->tx_slot = 0;
diff --git a/drivers/sbus/char/oradax.c b/drivers/sbus/char/oradax.c
index acd9ba40eabe..8090dc9a1514 100644
--- a/drivers/sbus/char/oradax.c
+++ b/drivers/sbus/char/oradax.c
@@ -437,7 +437,7 @@ static int dax_lock_page(void *va, struct page **p)
dax_dbg("uva %p", va);
- ret = get_user_pages_fast((unsigned long)va, 1, 1, p);
+ ret = get_user_pages_fast((unsigned long)va, 1, FOLL_WRITE, p);
if (ret == 1) {
dax_dbg("locked page %p, for VA %p", *p, va);
return 0;
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 19c022e66d63..3c6a18ad9a87 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -4922,7 +4922,8 @@ static int sgl_map_user_pages(struct st_buffer *STbp,
/* Try to fault in all of the necessary pages */
/* rw==READ means read from drive, write into memory area */
- res = get_user_pages_fast(uaddr, nr_pages, rw == READ, pages);
+ res = get_user_pages_fast(uaddr, nr_pages, rw == READ ? FOLL_WRITE : 0,
+ pages);
/* Errors and no page mapped should return here */
if (res < nr_pages)
diff --git a/drivers/spi/spi-rockchip.c b/drivers/spi/spi-rockchip.c
index 3912526ead66..cdb613d38062 100644
--- a/drivers/spi/spi-rockchip.c
+++ b/drivers/spi/spi-rockchip.c
@@ -15,6 +15,7 @@
#include <linux/clk.h>
#include <linux/dmaengine.h>
+#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/of.h>
#include <linux/pinctrl/consumer.h>
diff --git a/drivers/staging/gasket/gasket_page_table.c b/drivers/staging/gasket/gasket_page_table.c
index 600928f63577..d35c4fb19e28 100644
--- a/drivers/staging/gasket/gasket_page_table.c
+++ b/drivers/staging/gasket/gasket_page_table.c
@@ -486,8 +486,8 @@ static int gasket_perform_mapping(struct gasket_page_table *pg_tbl,
ptes[i].dma_addr = pg_tbl->coherent_pages[0].paddr +
off + i * PAGE_SIZE;
} else {
- ret = get_user_pages_fast(page_addr - offset, 1, 1,
- &page);
+ ret = get_user_pages_fast(page_addr - offset, 1,
+ FOLL_WRITE, &page);
if (ret <= 0) {
dev_err(pg_tbl->device,
diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c
index 0b9ab1d0dd45..49fd7312e2aa 100644
--- a/drivers/tee/tee_shm.c
+++ b/drivers/tee/tee_shm.c
@@ -273,7 +273,7 @@ struct tee_shm *tee_shm_register(struct tee_context *ctx, unsigned long addr,
goto err;
}
- rc = get_user_pages_fast(start, num_pages, 1, shm->pages);
+ rc = get_user_pages_fast(start, num_pages, FOLL_WRITE, shm->pages);
if (rc > 0)
shm->num_pages = rc;
if (rc != num_pages) {
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index 59e82e6d776d..573b2055173c 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -527,8 +527,12 @@ void __handle_sysrq(int key, bool check_mask)
{
struct sysrq_key_op *op_p;
int orig_log_level;
+ int orig_suppress_printk;
int i;
+ orig_suppress_printk = suppress_printk;
+ suppress_printk = 0;
+
rcu_sysrq_start();
rcu_read_lock();
/*
@@ -574,6 +578,8 @@ void __handle_sysrq(int key, bool check_mask)
}
rcu_read_unlock();
rcu_sysrq_end();
+
+ suppress_printk = orig_suppress_printk;
}
void handle_sysrq(int key)
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 6b64e45a5269..7073c1472984 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -36,7 +36,9 @@ static void tce_iommu_detach_group(void *iommu_data,
static long try_increment_locked_vm(struct mm_struct *mm, long npages)
{
- long ret = 0, locked, lock_limit;
+ long ret = 0;
+ s64 locked;
+ unsigned long lock_limit;
if (WARN_ON_ONCE(!mm))
return -EPERM;
@@ -44,39 +46,32 @@ static long try_increment_locked_vm(struct mm_struct *mm, long npages)
if (!npages)
return 0;
- down_write(&mm->mmap_sem);
- locked = mm->locked_vm + npages;
+ locked = atomic64_add_return(npages, &mm->locked_vm);
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+ if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
ret = -ENOMEM;
- else
- mm->locked_vm += npages;
-
- pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
- npages << PAGE_SHIFT,
- mm->locked_vm << PAGE_SHIFT,
- rlimit(RLIMIT_MEMLOCK),
- ret ? " - exceeded" : "");
+ atomic64_sub(npages, &mm->locked_vm);
+ }
- up_write(&mm->mmap_sem);
+ pr_debug("[%d] RLIMIT_MEMLOCK +%ld %lld/%lu%s\n", current->pid,
+ npages << PAGE_SHIFT, locked << PAGE_SHIFT,
+ lock_limit, ret ? " - exceeded" : "");
return ret;
}
static void decrement_locked_vm(struct mm_struct *mm, long npages)
{
+ s64 locked;
+
if (!mm || !npages)
return;
- down_write(&mm->mmap_sem);
- if (WARN_ON_ONCE(npages > mm->locked_vm))
- npages = mm->locked_vm;
- mm->locked_vm -= npages;
- pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
- npages << PAGE_SHIFT,
- mm->locked_vm << PAGE_SHIFT,
+ locked = atomic64_sub_return(npages, &mm->locked_vm);
+ WARN_ON_ONCE(locked < 0);
+ pr_debug("[%d] RLIMIT_MEMLOCK -%ld %lld/%lu\n", current->pid,
+ npages << PAGE_SHIFT, locked << PAGE_SHIFT,
rlimit(RLIMIT_MEMLOCK));
- up_write(&mm->mmap_sem);
}
/*
@@ -532,7 +527,8 @@ static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
enum dma_data_direction direction = iommu_tce_direction(tce);
if (get_user_pages_fast(tce & PAGE_MASK, 1,
- direction != DMA_TO_DEVICE, &page) != 1)
+ direction != DMA_TO_DEVICE ? FOLL_WRITE : 0,
+ &page) != 1)
return -EFAULT;
*hpa = __pa((unsigned long) page_address(page));
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 3be1db3501cc..0237ace12998 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -264,7 +264,8 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
{
struct mm_struct *mm;
- int ret;
+ s64 locked_vm;
+ int ret = 0;
if (!npage)
return 0;
@@ -273,24 +274,15 @@ static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
if (!mm)
return -ESRCH; /* process exited */
- ret = down_write_killable(&mm->mmap_sem);
- if (!ret) {
- if (npage > 0) {
- if (!dma->lock_cap) {
- unsigned long limit;
-
- limit = task_rlimit(dma->task,
- RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ locked_vm = atomic64_add_return(npage, &mm->locked_vm);
- if (mm->locked_vm + npage > limit)
- ret = -ENOMEM;
- }
+ if (npage > 0 && !dma->lock_cap) {
+ unsigned long limit = task_rlimit(dma->task, RLIMIT_MEMLOCK) >>
+ PAGE_SHIFT;
+ if (locked_vm > limit) {
+ atomic64_sub(npage, &mm->locked_vm);
+ ret = -ENOMEM;
}
-
- if (!ret)
- mm->locked_vm += npage;
-
- up_write(&mm->mmap_sem);
}
if (async)
@@ -358,7 +350,8 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
down_read(&mm->mmap_sem);
if (mm == current->mm) {
- ret = get_user_pages_longterm(vaddr, 1, flags, page, vmas);
+ ret = get_user_pages(vaddr, 1, flags | FOLL_LONGTERM, page,
+ vmas);
} else {
ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page,
vmas, NULL);
@@ -408,6 +401,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
long ret, pinned = 0, lock_acct = 0;
bool rsvd;
dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
+ atomic64_t *locked_vm = &current->mm->locked_vm;
/* This code path is only user initiated */
if (!current->mm)
@@ -425,7 +419,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
* pages are already counted against the user.
*/
if (!rsvd && !vfio_find_vpfn(dma, iova)) {
- if (!dma->lock_cap && current->mm->locked_vm + 1 > limit) {
+ if (!dma->lock_cap && atomic64_read(locked_vm) + 1 > limit) {
put_pfn(*pfn_base, dma->prot);
pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
limit << PAGE_SHIFT);
@@ -452,7 +446,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
if (!rsvd && !vfio_find_vpfn(dma, iova)) {
if (!dma->lock_cap &&
- current->mm->locked_vm + lock_acct + 1 > limit) {
+ atomic64_read(locked_vm) + lock_acct + 1 > limit) {
put_pfn(pfn, dma->prot);
pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
__func__, limit << PAGE_SHIFT);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 351af88231ad..1e3ed41ae1f3 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1704,7 +1704,7 @@ static int set_bit_to_user(int nr, void __user *addr)
int bit = nr + (log % PAGE_SIZE) * 8;
int r;
- r = get_user_pages_fast(log, 1, 1, &page);
+ r = get_user_pages_fast(log, 1, FOLL_WRITE, &page);
if (r < 0)
return r;
BUG_ON(r != 1);
diff --git a/drivers/video/backlight/pwm_bl.c b/drivers/video/backlight/pwm_bl.c
index 53b8ceea9bde..fb45f866b923 100644
--- a/drivers/video/backlight/pwm_bl.c
+++ b/drivers/video/backlight/pwm_bl.c
@@ -155,21 +155,6 @@ static const struct backlight_ops pwm_backlight_ops = {
#ifdef CONFIG_OF
#define PWM_LUMINANCE_SCALE 10000 /* luminance scale */
-/* An integer based power function */
-static u64 int_pow(u64 base, int exp)
-{
- u64 result = 1;
-
- while (exp) {
- if (exp & 1)
- result *= base;
- exp >>= 1;
- base *= base;
- }
-
- return result;
-}
-
/*
* CIE lightness to PWM conversion.
*
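The driver-local int_pow() is dropped here because an equivalent generic helper is expected to be provided by the core kernel in this series; assuming a shared u64 int_pow(u64 base, unsigned int exp), the luminance code keeps calling it unchanged, e.g.:

u64 lightness = 4000;			/* example value on the luminance scale */
u64 cube = int_pow(lightness, 3);	/* exponentiation by squaring under the hood */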
diff --git a/drivers/video/fbdev/pvr2fb.c b/drivers/video/fbdev/pvr2fb.c
index dfed532ed606..4e4d6a0df978 100644
--- a/drivers/video/fbdev/pvr2fb.c
+++ b/drivers/video/fbdev/pvr2fb.c
@@ -686,7 +686,7 @@ static ssize_t pvr2fb_write(struct fb_info *info, const char *buf,
if (!pages)
return -ENOMEM;
- ret = get_user_pages_fast((unsigned long)buf, nr_pages, true, pages);
+ ret = get_user_pages_fast((unsigned long)buf, nr_pages, FOLL_WRITE, pages);
if (ret < nr_pages) {
nr_pages = ret;
ret = -EINVAL;
diff --git a/drivers/virt/fsl_hypervisor.c b/drivers/virt/fsl_hypervisor.c
index 8ba726e600e9..6446bcab4185 100644
--- a/drivers/virt/fsl_hypervisor.c
+++ b/drivers/virt/fsl_hypervisor.c
@@ -244,7 +244,7 @@ static long ioctl_memcpy(struct fsl_hv_ioctl_memcpy __user *p)
/* Get the physical addresses of the source buffer */
num_pinned = get_user_pages_fast(param.local_vaddr - lb_offset,
- num_pages, param.source != -1, pages);
+ num_pages, param.source != -1 ? FOLL_WRITE : 0, pages);
if (num_pinned != num_pages) {
/* get_user_pages() failed */
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index 7cf9c51318aa..469dfbd6cf90 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -526,20 +526,20 @@ static int mn_invl_range_start(struct mmu_notifier *mn,
struct gntdev_grant_map *map;
int ret = 0;
- if (range->blockable)
+ if (mmu_notifier_range_blockable(range))
mutex_lock(&priv->lock);
else if (!mutex_trylock(&priv->lock))
return -EAGAIN;
list_for_each_entry(map, &priv->maps, next) {
ret = unmap_if_in_range(map, range->start, range->end,
- range->blockable);
+ mmu_notifier_range_blockable(range));
if (ret)
goto out_unlock;
}
list_for_each_entry(map, &priv->freeable_maps, next) {
ret = unmap_if_in_range(map, range->start, range->end,
- range->blockable);
+ mmu_notifier_range_blockable(range));
if (ret)
goto out_unlock;
}
@@ -852,7 +852,7 @@ static int gntdev_get_page(struct gntdev_copy_batch *batch, void __user *virt,
unsigned long xen_pfn;
int ret;
- ret = get_user_pages_fast(addr, 1, writeable, &page);
+ ret = get_user_pages_fast(addr, 1, writeable ? FOLL_WRITE : 0, &page);
if (ret < 0)
return ret;
@@ -1084,7 +1084,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
int index = vma->vm_pgoff;
int count = vma_pages(vma);
struct gntdev_grant_map *map;
- int i, err = -EINVAL;
+ int err = -EINVAL;
if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED))
return -EINVAL;
@@ -1145,12 +1145,9 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
goto out_put_map;
if (!use_ptemod) {
- for (i = 0; i < count; i++) {
- err = vm_insert_page(vma, vma->vm_start + i*PAGE_SIZE,
- map->pages[i]);
- if (err)
- goto out_put_map;
- }
+ err = vm_map_pages(vma, map->pages, map->count);
+ if (err)
+ goto out_put_map;
} else {
#ifdef CONFIG_X86
/*
diff --git a/drivers/xen/privcmd-buf.c b/drivers/xen/privcmd-buf.c
index a1c61e351d3f..dd5bbb6e1b6b 100644
--- a/drivers/xen/privcmd-buf.c
+++ b/drivers/xen/privcmd-buf.c
@@ -165,12 +165,8 @@ static int privcmd_buf_mmap(struct file *file, struct vm_area_struct *vma)
if (vma_priv->n_pages != count)
ret = -ENOMEM;
else
- for (i = 0; i < vma_priv->n_pages; i++) {
- ret = vm_insert_page(vma, vma->vm_start + i * PAGE_SIZE,
- vma_priv->pages[i]);
- if (ret)
- break;
- }
+ ret = vm_map_pages_zero(vma, vma_priv->pages,
+ vma_priv->n_pages);
if (ret)
privcmd_buf_vmapriv_free(vma_priv);
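Note the two flavours used in this series: vm_map_pages() respects vma->vm_pgoff, while vm_map_pages_zero() (used here and in the firewire change earlier) always maps from the first page, which suits interfaces that treat the offset as a cookie. Illustratively:

/* maps pages[vma->vm_pgoff ...]; fails if the VMA spans past the array */
err = vm_map_pages(vma, pages, num_pages);

/* ignores vma->vm_pgoff and maps pages[0 ...] at vma->vm_start */
err = vm_map_pages_zero(vma, pages, num_pages);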
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 7d09d125f148..fa9e99a962e0 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -524,6 +524,19 @@ static inline int arch_check_elf(struct elfhdr *ehdr, bool has_interp,
#endif /* !CONFIG_ARCH_BINFMT_ELF_STATE */
+static inline int make_prot(u32 p_flags)
+{
+ int prot = 0;
+
+ if (p_flags & PF_R)
+ prot |= PROT_READ;
+ if (p_flags & PF_W)
+ prot |= PROT_WRITE;
+ if (p_flags & PF_X)
+ prot |= PROT_EXEC;
+ return prot;
+}
+
/* This is much more generalized than the library routine read function,
so we keep this separate. Technically the library read function
is only provided so that we can read a.out libraries that have
@@ -563,16 +576,10 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
if (eppnt->p_type == PT_LOAD) {
int elf_type = MAP_PRIVATE | MAP_DENYWRITE;
- int elf_prot = 0;
+ int elf_prot = make_prot(eppnt->p_flags);
unsigned long vaddr = 0;
unsigned long k, map_addr;
- if (eppnt->p_flags & PF_R)
- elf_prot = PROT_READ;
- if (eppnt->p_flags & PF_W)
- elf_prot |= PROT_WRITE;
- if (eppnt->p_flags & PF_X)
- elf_prot |= PROT_EXEC;
vaddr = eppnt->p_vaddr;
if (interp_elf_ex->e_type == ET_EXEC || load_addr_set)
elf_type |= MAP_FIXED_NOREPLACE;
@@ -687,7 +694,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
struct file *interpreter = NULL; /* to shut gcc up */
unsigned long load_addr = 0, load_bias = 0;
int load_addr_set = 0;
- char * elf_interpreter = NULL;
unsigned long error;
struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
unsigned long elf_bss, elf_brk;
@@ -698,13 +704,12 @@ static int load_elf_binary(struct linux_binprm *bprm)
unsigned long start_code, end_code, start_data, end_data;
unsigned long reloc_func_desc __maybe_unused = 0;
int executable_stack = EXSTACK_DEFAULT;
- struct pt_regs *regs = current_pt_regs();
struct {
struct elfhdr elf_ex;
struct elfhdr interp_elf_ex;
} *loc;
struct arch_elf_state arch_state = INIT_ARCH_ELF_STATE;
- loff_t pos;
+ struct pt_regs *regs;
loc = kmalloc(sizeof(*loc), GFP_KERNEL);
if (!loc) {
@@ -734,69 +739,66 @@ static int load_elf_binary(struct linux_binprm *bprm)
goto out;
elf_ppnt = elf_phdata;
- elf_bss = 0;
- elf_brk = 0;
+ for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
+ char *elf_interpreter;
+ loff_t pos;
- start_code = ~0UL;
- end_code = 0;
- start_data = 0;
- end_data = 0;
+ if (elf_ppnt->p_type != PT_INTERP)
+ continue;
- for (i = 0; i < loc->elf_ex.e_phnum; i++) {
- if (elf_ppnt->p_type == PT_INTERP) {
- /* This is the program interpreter used for
- * shared libraries - for now assume that this
- * is an a.out format binary
- */
- retval = -ENOEXEC;
- if (elf_ppnt->p_filesz > PATH_MAX ||
- elf_ppnt->p_filesz < 2)
- goto out_free_ph;
-
- retval = -ENOMEM;
- elf_interpreter = kmalloc(elf_ppnt->p_filesz,
- GFP_KERNEL);
- if (!elf_interpreter)
- goto out_free_ph;
-
- pos = elf_ppnt->p_offset;
- retval = kernel_read(bprm->file, elf_interpreter,
- elf_ppnt->p_filesz, &pos);
- if (retval != elf_ppnt->p_filesz) {
- if (retval >= 0)
- retval = -EIO;
- goto out_free_interp;
- }
- /* make sure path is NULL terminated */
- retval = -ENOEXEC;
- if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0')
- goto out_free_interp;
+ /*
+ * This is the program interpreter used for shared libraries -
+ * for now assume that this is an a.out format binary.
+ */
+ retval = -ENOEXEC;
+ if (elf_ppnt->p_filesz > PATH_MAX || elf_ppnt->p_filesz < 2)
+ goto out_free_ph;
- interpreter = open_exec(elf_interpreter);
- retval = PTR_ERR(interpreter);
- if (IS_ERR(interpreter))
- goto out_free_interp;
+ retval = -ENOMEM;
+ elf_interpreter = kmalloc(elf_ppnt->p_filesz, GFP_KERNEL);
+ if (!elf_interpreter)
+ goto out_free_ph;
- /*
- * If the binary is not readable then enforce
- * mm->dumpable = 0 regardless of the interpreter's
- * permissions.
- */
- would_dump(bprm, interpreter);
-
- /* Get the exec headers */
- pos = 0;
- retval = kernel_read(interpreter, &loc->interp_elf_ex,
- sizeof(loc->interp_elf_ex), &pos);
- if (retval != sizeof(loc->interp_elf_ex)) {
- if (retval >= 0)
- retval = -EIO;
- goto out_free_dentry;
- }
+ pos = elf_ppnt->p_offset;
+ retval = kernel_read(bprm->file, elf_interpreter,
+ elf_ppnt->p_filesz, &pos);
+ if (retval != elf_ppnt->p_filesz) {
+ if (retval >= 0)
+ retval = -EIO;
+ goto out_free_interp;
+ }
+ /* make sure path is NULL terminated */
+ retval = -ENOEXEC;
+ if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0')
+ goto out_free_interp;
- break;
+ interpreter = open_exec(elf_interpreter);
+ kfree(elf_interpreter);
+ retval = PTR_ERR(interpreter);
+ if (IS_ERR(interpreter))
+ goto out_free_ph;
+
+ /*
+ * If the binary is not readable then enforce mm->dumpable = 0
+ * regardless of the interpreter's permissions.
+ */
+ would_dump(bprm, interpreter);
+
+ /* Get the exec headers */
+ pos = 0;
+ retval = kernel_read(interpreter, &loc->interp_elf_ex,
+ sizeof(loc->interp_elf_ex), &pos);
+ if (retval != sizeof(loc->interp_elf_ex)) {
+ if (retval >= 0)
+ retval = -EIO;
+ goto out_free_dentry;
}
- elf_ppnt++;
+
+ break;
+
+out_free_interp:
+ kfree(elf_interpreter);
+ goto out_free_ph;
}
elf_ppnt = elf_phdata;
@@ -819,7 +821,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
}
/* Some simple consistency checks for the interpreter */
- if (elf_interpreter) {
+ if (interpreter) {
retval = -ELIBBAD;
/* Not an ELF interpreter */
if (memcmp(loc->interp_elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
@@ -884,13 +886,19 @@ static int load_elf_binary(struct linux_binprm *bprm)
if (retval < 0)
goto out_free_dentry;
- current->mm->start_stack = bprm->p;
+ elf_bss = 0;
+ elf_brk = 0;
+
+ start_code = ~0UL;
+ end_code = 0;
+ start_data = 0;
+ end_data = 0;
/* Now we do a little grungy work by mmapping the ELF image into
the correct location in memory. */
for(i = 0, elf_ppnt = elf_phdata;
i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
- int elf_prot = 0, elf_flags, elf_fixed = MAP_FIXED_NOREPLACE;
+ int elf_prot, elf_flags, elf_fixed = MAP_FIXED_NOREPLACE;
unsigned long k, vaddr;
unsigned long total_size = 0;
@@ -931,12 +939,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
elf_fixed = MAP_FIXED;
}
- if (elf_ppnt->p_flags & PF_R)
- elf_prot |= PROT_READ;
- if (elf_ppnt->p_flags & PF_W)
- elf_prot |= PROT_WRITE;
- if (elf_ppnt->p_flags & PF_X)
- elf_prot |= PROT_EXEC;
+ elf_prot = make_prot(elf_ppnt->p_flags);
elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE;
@@ -978,7 +981,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
* independently randomized mmap region (0 load_bias
* without MAP_FIXED).
*/
- if (elf_interpreter) {
+ if (interpreter) {
load_bias = ELF_ET_DYN_BASE;
if (current->flags & PF_RANDOMIZE)
load_bias += arch_mmap_rnd();
@@ -1076,7 +1079,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
goto out_free_dentry;
}
- if (elf_interpreter) {
+ if (interpreter) {
unsigned long interp_map_addr = 0;
elf_entry = load_elf_interp(&loc->interp_elf_ex,
@@ -1100,7 +1103,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
allow_write_access(interpreter);
fput(interpreter);
- kfree(elf_interpreter);
} else {
elf_entry = loc->elf_ex.e_entry;
if (BAD_ADDR(elf_entry)) {
@@ -1115,7 +1117,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
set_binfmt(&elf_format);
#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
- retval = arch_setup_additional_pages(bprm, !!elf_interpreter);
+ retval = arch_setup_additional_pages(bprm, !!interpreter);
if (retval < 0)
goto out;
#endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */
@@ -1132,6 +1134,17 @@ static int load_elf_binary(struct linux_binprm *bprm)
current->mm->start_stack = bprm->p;
if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) {
+ /*
+ * For architectures with ELF randomization, when executing
+ * a loader directly (i.e. no interpreter listed in ELF
+ * headers), move the brk area out of the mmap region
+ * (since it grows up, and may collide early with the stack
+ * growing down), and into the unused ELF_ET_DYN_BASE region.
+ */
+ if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && !interpreter)
+ current->mm->brk = current->mm->start_brk =
+ ELF_ET_DYN_BASE;
+
current->mm->brk = current->mm->start_brk =
arch_randomize_brk(current->mm);
#ifdef compat_brk_randomized
@@ -1148,6 +1161,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
MAP_FIXED | MAP_PRIVATE, 0);
}
+ regs = current_pt_regs();
#ifdef ELF_PLAT_INIT
/*
* The ABI may specify that certain registers be set up in special
@@ -1176,8 +1190,6 @@ out_free_dentry:
allow_write_access(interpreter);
if (interpreter)
fput(interpreter);
-out_free_interp:
- kfree(elf_interpreter);
out_free_ph:
kfree(elf_phdata);
goto out;
@@ -1456,8 +1468,6 @@ static void fill_elf_header(struct elfhdr *elf, int segs,
elf->e_ehsize = sizeof(struct elfhdr);
elf->e_phentsize = sizeof(struct elf_phdr);
elf->e_phnum = segs;
-
- return;
}
static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset)
@@ -1470,7 +1480,6 @@ static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset)
phdr->p_memsz = 0;
phdr->p_flags = 0;
phdr->p_align = 0;
- return;
}
static void fill_note(struct memelfnote *note, const char *name, int type,
@@ -1480,7 +1489,6 @@ static void fill_note(struct memelfnote *note, const char *name, int type,
note->type = type;
note->datasz = sz;
note->data = data;
- return;
}
/*
diff --git a/fs/buffer.c b/fs/buffer.c
index 0faa41fb4c88..2e088db43646 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -955,10 +955,20 @@ grow_dev_page(struct block_device *bdev, sector_t block,
end_block = init_page_buffers(page, bdev,
(sector_t)index << sizebits,
size);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x01;
+#endif
goto done;
}
- if (!try_to_free_buffers(page))
+ if (!try_to_free_buffers(page)) {
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x02;
+#endif
goto failed;
+ }
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x04;
+#endif
}
/*
@@ -978,6 +988,9 @@ grow_dev_page(struct block_device *bdev, sector_t block,
spin_unlock(&inode->i_mapping->private_lock);
done:
ret = (block < end_block) ? 1 : -ENXIO;
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x08;
+#endif
failed:
unlock_page(page);
put_page(page);
@@ -1033,6 +1046,12 @@ __getblk_slow(struct block_device *bdev, sector_t block,
return NULL;
}
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_stamp = jiffies;
+ current->getblk_executed = 0;
+ current->getblk_bh_count = 0;
+ current->getblk_bh_state = 0;
+#endif
for (;;) {
struct buffer_head *bh;
int ret;
@@ -1044,6 +1063,24 @@ __getblk_slow(struct block_device *bdev, sector_t block,
ret = grow_buffers(bdev, block, size, gfp);
if (ret < 0)
return NULL;
+
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ if (!time_after(jiffies, current->getblk_stamp + 3 * HZ))
+ continue;
+ printk(KERN_ERR "%s(%u): getblk(): executed=%x bh_count=%d bh_state=%lx bdev_super_blocksize=%ld size=%u bdev_super_blocksize_bits=%d bdev_inode_blkbits=%d\n",
+ current->comm, current->pid, current->getblk_executed,
+ current->getblk_bh_count, current->getblk_bh_state,
+ IS_ERR_OR_NULL(bdev->bd_super) ? -1L :
+ bdev->bd_super->s_blocksize, size,
+ IS_ERR_OR_NULL(bdev->bd_super) ? -1 :
+ bdev->bd_super->s_blocksize_bits,
+ IS_ERR_OR_NULL(bdev->bd_inode) ? -1 :
+ bdev->bd_inode->i_blkbits);
+ current->getblk_executed = 0;
+ current->getblk_bh_count = 0;
+ current->getblk_bh_state = 0;
+ current->getblk_stamp = jiffies;
+#endif
}
}
@@ -3226,6 +3263,11 @@ EXPORT_SYMBOL(sync_dirty_buffer);
*/
static inline int buffer_busy(struct buffer_head *bh)
{
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x80;
+ current->getblk_bh_count = atomic_read(&bh->b_count);
+ current->getblk_bh_state = bh->b_state;
+#endif
return atomic_read(&bh->b_count) |
(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
}
@@ -3264,11 +3306,18 @@ int try_to_free_buffers(struct page *page)
int ret = 0;
BUG_ON(!PageLocked(page));
- if (PageWriteback(page))
+ if (PageWriteback(page)) {
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x10;
+#endif
return 0;
+ }
if (mapping == NULL) { /* can this still happen? */
ret = drop_buffers(page, &buffers_to_free);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x20;
+#endif
goto out;
}
@@ -3292,6 +3341,9 @@ int try_to_free_buffers(struct page *page)
if (ret)
cancel_dirty_page(page);
spin_unlock(&mapping->private_lock);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x40;
+#endif
out:
if (buffers_to_free) {
struct buffer_head *bh = buffers_to_free;
diff --git a/fs/dax.c b/fs/dax.c
index e5e54da1715f..6975c8f4757d 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -814,7 +814,7 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
goto unlock_pmd;
flush_cache_page(vma, address, pfn);
- pmd = pmdp_huge_clear_flush(vma, address, pmdp);
+ pmd = pmdp_invalidate(vma, address, pmdp);
pmd = pmd_wrprotect(pmd);
pmd = pmd_mkclean(pmd);
set_pmd_at(vma->vm_mm, address, pmdp, pmd);
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 08d3bd602f73..ce8fa15cebe5 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -21,6 +21,9 @@
#include <linux/eventfd.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <linux/idr.h>
+
+DEFINE_IDA(eventfd_ida);
struct eventfd_ctx {
struct kref kref;
@@ -35,6 +38,7 @@ struct eventfd_ctx {
*/
__u64 count;
unsigned int flags;
+ int id;
};
/**
@@ -69,6 +73,8 @@ EXPORT_SYMBOL_GPL(eventfd_signal);
static void eventfd_free_ctx(struct eventfd_ctx *ctx)
{
+ if (ctx->id >= 0)
+ ida_simple_remove(&eventfd_ida, ctx->id);
kfree(ctx);
}
@@ -297,6 +303,7 @@ static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
seq_printf(m, "eventfd-count: %16llx\n",
(unsigned long long)ctx->count);
spin_unlock_irq(&ctx->wqh.lock);
+ seq_printf(m, "eventfd-id: %d\n", ctx->id);
}
#endif
@@ -400,6 +407,7 @@ static int do_eventfd(unsigned int count, int flags)
init_waitqueue_head(&ctx->wqh);
ctx->count = count;
ctx->flags = flags;
+ ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);
fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx,
O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS));
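The new eventfd-id is backed by an IDA; the allocate/free lifecycle is the usual ida_simple_get()/ida_simple_remove() pairing, roughly:

static DEFINE_IDA(demo_ida);

static int demo_get_id(void)
{
	/* third argument 0 means no upper bound on the allocated id */
	return ida_simple_get(&demo_ida, 0, 0, GFP_KERNEL);
}

static void demo_put_id(int id)
{
	if (id >= 0)
		ida_simple_remove(&demo_ida, id);
}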
diff --git a/fs/exec.c b/fs/exec.c
index 2e0033348d8e..d88584ebf07f 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1652,11 +1652,13 @@ int search_binary_handler(struct linux_binprm *bprm)
if (!try_module_get(fmt->module))
continue;
read_unlock(&binfmt_lock);
+
bprm->recursion_depth++;
retval = fmt->load_binary(bprm);
+ bprm->recursion_depth--;
+
read_lock(&binfmt_lock);
put_binfmt(fmt);
- bprm->recursion_depth--;
if (retval < 0 && !bprm->mm) {
/* we got to flush_old_exec() and failed after it */
read_unlock(&binfmt_lock);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index b3bed32946b1..0e3ed79fcc3f 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -193,12 +193,17 @@ static int fat_file_release(struct inode *inode, struct file *filp)
int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
struct inode *inode = filp->f_mapping->host;
- int res, err;
+ int err;
+
+ err = __generic_file_fsync(filp, start, end, datasync);
+ if (err)
+ return err;
- res = generic_file_fsync(filp, start, end, datasync);
err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping);
+ if (err)
+ return err;
- return res ? res : err;
+ return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index c74ef4426282..1dcc57189382 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -440,9 +440,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
u32 hash;
index = page->index;
- hash = hugetlb_fault_mutex_hash(h, current->mm,
- &pseudo_vma,
- mapping, index, 0);
+ hash = hugetlb_fault_mutex_hash(h, mapping, index, 0);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
/*
@@ -499,8 +497,15 @@ static void hugetlbfs_evict_inode(struct inode *inode)
struct resv_map *resv_map;
remove_inode_hugepages(inode, 0, LLONG_MAX);
- resv_map = (struct resv_map *)inode->i_mapping->private_data;
- /* root inode doesn't have the resv_map, so we should check it */
+
+ /*
+ * Get the resv_map from the address space embedded in the inode.
+ * This is the address space which points to any resv_map allocated
+ * at inode creation time. If this is a device special inode,
+ * i_mapping may not point to the original address space.
+ */
+ resv_map = (struct resv_map *)(&inode->i_data)->private_data;
+ /* Only regular and link inodes have associated reserve maps */
if (resv_map)
resv_map_release(&resv_map->refs);
clear_inode(inode);
@@ -639,8 +644,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
addr = index * hpage_size;
/* mutex taken here, fault path and hole punch */
- hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
- index, addr);
+ hash = hugetlb_fault_mutex_hash(h, mapping, index, addr);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
/* See if already present in mapping to avoid alloc/free */
diff --git a/fs/io_uring.c b/fs/io_uring.c
index c57eda9d0dc4..f55bbc1b934d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2698,8 +2698,9 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
ret = 0;
down_read(&current->mm->mmap_sem);
- pret = get_user_pages_longterm(ubuf, nr_pages, FOLL_WRITE,
- pages, vmas);
+ pret = get_user_pages(ubuf, nr_pages,
+ FOLL_WRITE | FOLL_LONGTERM,
+ pages, vmas);
if (pret == nr_pages) {
/* don't support file backed memory */
for (j = 0; j < nr_pages; j++) {
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 6f0999015a44..c89fa18c9b03 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6007,6 +6007,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
struct buffer_head *data_alloc_bh = NULL;
struct ocfs2_dinode *di;
struct ocfs2_truncate_log *tl;
+ struct ocfs2_journal *journal = osb->journal;
BUG_ON(inode_trylock(tl_inode));
@@ -6027,6 +6028,20 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
goto out;
}
+ /* Appending to the truncate log (TA) and flushing the truncate log (TF)
+ * are two separate transactions. Both can be committed but not yet
+ * checkpointed. If a crash occurs at that point, both transactions will
+ * be replayed, with some clusters already released back to the global
+ * bitmap. Replaying the truncate log then frees those clusters a second
+ * time, resulting in a cluster double free.
+ */
+ jbd2_journal_lock_updates(journal->j_journal);
+ status = jbd2_journal_flush(journal->j_journal);
+ jbd2_journal_unlock_updates(journal->j_journal);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+
data_alloc_inode = ocfs2_get_system_file_inode(osb,
GLOBAL_BITMAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 832c1759a09a..e8ea6f783f19 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2153,13 +2153,30 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
struct ocfs2_dio_write_ctxt *dwc = NULL;
struct buffer_head *di_bh = NULL;
u64 p_blkno;
- loff_t pos = iblock << inode->i_sb->s_blocksize_bits;
+ unsigned int i_blkbits = inode->i_sb->s_blocksize_bits;
+ loff_t pos = iblock << i_blkbits;
+ sector_t endblk = (i_size_read(inode) - 1) >> i_blkbits;
unsigned len, total_len = bh_result->b_size;
int ret = 0, first_get_block = 0;
len = osb->s_clustersize - (pos & (osb->s_clustersize - 1));
len = min(total_len, len);
+ /*
+ * bh_result->b_size is computed in get_more_blocks() from the write
+ * "pos" and "end". We may need to map the request twice so that we can
+ * return different buffer states:
+ * 1. the area within i_size is not marked NEW;
+ * 2. the area beyond i_size is marked NEW.
+ *
+ * iblock                 endblk
+ * |--------|---------|---------|---------
+ * |<-------area in file------->|
+ */
+
+ if ((iblock <= endblk) &&
+ ((iblock + ((len - 1) >> i_blkbits)) > endblk))
+ len = (endblk - iblock + 1) << i_blkbits;
+
mlog(0, "get block of %lu at %llu:%u req %u\n",
inode->i_ino, pos, len, total_len);
@@ -2243,6 +2260,9 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
if (desc->c_needs_zero)
set_buffer_new(bh_result);
+ if (iblock > endblk)
+ set_buffer_new(bh_result);
+
/* May sleep in end_io. It should not happen in a irq context. So defer
* it to dio work queue. */
set_buffer_defer_completion(bh_result);
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index c121abbdfc7d..85f21caaa6ec 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -69,10 +69,6 @@
#define NAMEI_RA_BLOCKS 4
#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
-static unsigned char ocfs2_filetype_table[] = {
- DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
-};
-
static int ocfs2_do_extend_dir(struct super_block *sb,
handle_t *handle,
struct inode *dir,
@@ -1718,7 +1714,7 @@ int __ocfs2_add_entry(handle_t *handle,
de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
de = de1;
}
- de->file_type = OCFS2_FT_UNKNOWN;
+ de->file_type = FT_UNKNOWN;
if (blkno) {
de->inode = cpu_to_le64(blkno);
ocfs2_set_de_type(de, inode->i_mode);
@@ -1803,13 +1799,9 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
}
offset += le16_to_cpu(de->rec_len);
if (le64_to_cpu(de->inode)) {
- unsigned char d_type = DT_UNKNOWN;
-
- if (de->file_type < OCFS2_FT_MAX)
- d_type = ocfs2_filetype_table[de->file_type];
-
if (!dir_emit(ctx, de->name, de->name_len,
- le64_to_cpu(de->inode), d_type))
+ le64_to_cpu(de->inode),
+ fs_ftype_to_dtype(de->file_type)))
goto out;
}
ctx->pos += le16_to_cpu(de->rec_len);
@@ -1900,14 +1892,10 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
break;
}
if (le64_to_cpu(de->inode)) {
- unsigned char d_type = DT_UNKNOWN;
-
- if (de->file_type < OCFS2_FT_MAX)
- d_type = ocfs2_filetype_table[de->file_type];
if (!dir_emit(ctx, de->name,
de->name_len,
le64_to_cpu(de->inode),
- d_type)) {
+ fs_ftype_to_dtype(de->file_type))) {
brelse(bh);
return 0;
}
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 63d701cd1e2e..c8e9b7031d9a 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -105,7 +105,8 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
enum dlm_status status;
int actions = 0;
int in_use;
- u8 owner;
+ u8 owner;
+ int recovery_wait = 0;
mlog(0, "master_node = %d, valblk = %d\n", master_node,
flags & LKM_VALBLK);
@@ -208,9 +209,12 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
}
if (flags & LKM_CANCEL)
lock->cancel_pending = 0;
- else
- lock->unlock_pending = 0;
-
+ else {
+ if (!lock->unlock_pending)
+ recovery_wait = 1;
+ else
+ lock->unlock_pending = 0;
+ }
}
/* get an extra ref on lock. if we are just switching
@@ -244,6 +248,17 @@ leave:
spin_unlock(&res->spinlock);
wake_up(&res->wq);
+ if (recovery_wait) {
+ spin_lock(&res->spinlock);
+ /* The unlock request succeeds immediately once the owner dies, and the
+ * lock has already been removed from the grant list. We must wait for
+ * RECOVERING to finish, or we miss the chance to purge the lockres,
+ * since the removal is much faster than the RECOVERING process.
+ */
+ __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_RECOVERING);
+ spin_unlock(&res->spinlock);
+ }
+
/* let the caller's final dlm_lock_put handle the actual kfree */
if (actions & DLM_UNLOCK_FREE_LOCK) {
/* this should always be coupled with list removal */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index af405586c5b1..dccf4136f8c1 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -448,7 +448,7 @@ static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int level,
struct ocfs2_mask_waiter *mw, int ret)
{
u32 usec;
- ktime_t kt;
+ ktime_t last, kt;
struct ocfs2_lock_stats *stats;
if (level == LKM_PRMODE)
@@ -458,7 +458,8 @@ static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int level,
else
return;
- kt = ktime_sub(ktime_get(), mw->mw_lock_start);
+ last = ktime_get();
+ kt = ktime_sub(last, mw->mw_lock_start);
usec = ktime_to_us(kt);
stats->ls_gets++;
@@ -474,6 +475,8 @@ static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int level,
if (ret)
stats->ls_fail++;
+
+ stats->ls_last = ktime_to_timespec(last).tv_sec;
}
static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
@@ -3093,8 +3096,10 @@ static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos)
* - Lock stats printed
* New in version 3
* - Max time in lock stats is in usecs (instead of nsecs)
+ * New in version 4
+ * - Add last pr/ex unlock times in secs
*/
-#define OCFS2_DLM_DEBUG_STR_VERSION 3
+#define OCFS2_DLM_DEBUG_STR_VERSION 4
static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
{
int i;
@@ -3145,6 +3150,8 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
# define lock_max_prmode(_l) ((_l)->l_lock_prmode.ls_max)
# define lock_max_exmode(_l) ((_l)->l_lock_exmode.ls_max)
# define lock_refresh(_l) ((_l)->l_lock_refresh)
+# define lock_last_prmode(_l) ((_l)->l_lock_prmode.ls_last)
+# define lock_last_exmode(_l) ((_l)->l_lock_exmode.ls_last)
#else
# define lock_num_prmode(_l) (0)
# define lock_num_exmode(_l) (0)
@@ -3155,6 +3162,8 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
# define lock_max_prmode(_l) (0)
# define lock_max_exmode(_l) (0)
# define lock_refresh(_l) (0)
+# define lock_last_prmode(_l) (0)
+# define lock_last_exmode(_l) (0)
#endif
/* The following seq_print was added in version 2 of this output */
seq_printf(m, "%u\t"
@@ -3165,6 +3174,8 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
"%llu\t"
"%u\t"
"%u\t"
+ "%u\t"
+ "%u\t"
"%u\t",
lock_num_prmode(lockres),
lock_num_exmode(lockres),
@@ -3174,7 +3185,9 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
lock_total_exmode(lockres),
lock_max_prmode(lockres),
lock_max_exmode(lockres),
- lock_refresh(lockres));
+ lock_refresh(lockres),
+ lock_last_prmode(lockres),
+ lock_last_exmode(lockres));
/* End the line */
seq_printf(m, "\n");
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 4bf8d5854b27..af2888d23de3 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -148,16 +148,24 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
u64 blkno;
struct dentry *parent;
struct inode *dir = d_inode(child);
+ int set;
trace_ocfs2_get_parent(child, child->d_name.len, child->d_name.name,
(unsigned long long)OCFS2_I(dir)->ip_blkno);
+ status = ocfs2_nfs_sync_lock(OCFS2_SB(dir->i_sb), 1);
+ if (status < 0) {
+ mlog(ML_ERROR, "getting nfs sync lock(EX) failed %d\n", status);
+ parent = ERR_PTR(status);
+ goto bail;
+ }
+
status = ocfs2_inode_lock(dir, NULL, 0);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
parent = ERR_PTR(status);
- goto bail;
+ goto unlock_nfs_sync;
}
status = ocfs2_lookup_ino_from_name(dir, "..", 2, &blkno);
@@ -166,11 +174,31 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
goto bail_unlock;
}
+ status = ocfs2_test_inode_bit(OCFS2_SB(dir->i_sb), blkno, &set);
+ if (status < 0) {
+ if (status == -EINVAL) {
+ status = -ESTALE;
+ } else
+ mlog(ML_ERROR, "test inode bit failed %d\n", status);
+ parent = ERR_PTR(status);
+ goto bail_unlock;
+ }
+
+ trace_ocfs2_get_dentry_test_bit(status, set);
+ if (!set) {
+ status = -ESTALE;
+ parent = ERR_PTR(status);
+ goto bail_unlock;
+ }
+
parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0));
bail_unlock:
ocfs2_inode_unlock(dir, 0);
+unlock_nfs_sync:
+ ocfs2_nfs_sync_unlock(OCFS2_SB(dir->i_sb), 1);
+
bail:
trace_ocfs2_get_parent_end(parent);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 1f029fbe8b8d..8efa022684f4 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -164,6 +164,7 @@ struct ocfs2_lock_stats {
/* Storing max wait in usecs saves 24 bytes per inode */
u32 ls_max; /* Max wait in USEC */
+ u32 ls_last; /* Last unlock time in SEC */
};
#endif
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 7071ad0dec90..b86bf5e74348 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -392,21 +392,6 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
#define OCFS2_HB_GLOBAL "heartbeat=global"
/*
- * OCFS2 directory file types. Only the low 3 bits are used. The
- * other bits are reserved for now.
- */
-#define OCFS2_FT_UNKNOWN 0
-#define OCFS2_FT_REG_FILE 1
-#define OCFS2_FT_DIR 2
-#define OCFS2_FT_CHRDEV 3
-#define OCFS2_FT_BLKDEV 4
-#define OCFS2_FT_FIFO 5
-#define OCFS2_FT_SOCK 6
-#define OCFS2_FT_SYMLINK 7
-
-#define OCFS2_FT_MAX 8
-
-/*
* OCFS2_DIR_PAD defines the directory entries boundaries
*
* NOTE: It must be a multiple of 4
@@ -424,17 +409,6 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
#define OCFS2_LINKS_HI_SHIFT 16
#define OCFS2_DX_ENTRIES_MAX (0xffffffffU)
-#define S_SHIFT 12
-static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
- [S_IFREG >> S_SHIFT] = OCFS2_FT_REG_FILE,
- [S_IFDIR >> S_SHIFT] = OCFS2_FT_DIR,
- [S_IFCHR >> S_SHIFT] = OCFS2_FT_CHRDEV,
- [S_IFBLK >> S_SHIFT] = OCFS2_FT_BLKDEV,
- [S_IFIFO >> S_SHIFT] = OCFS2_FT_FIFO,
- [S_IFSOCK >> S_SHIFT] = OCFS2_FT_SOCK,
- [S_IFLNK >> S_SHIFT] = OCFS2_FT_SYMLINK,
-};
-
/*
* Convenience casts
@@ -1629,7 +1603,7 @@ static inline int ocfs2_sprintf_system_inode_name(char *buf, int len,
static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de,
umode_t mode)
{
- de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+ de->file_type = fs_umode_to_ftype(mode);
}
static inline int ocfs2_gd_is_discontig(struct ocfs2_group_desc *gd)
diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c
index d4811f981608..2bb916d68576 100644
--- a/fs/orangefs/orangefs-bufmap.c
+++ b/fs/orangefs/orangefs-bufmap.c
@@ -269,7 +269,7 @@ orangefs_bufmap_map(struct orangefs_bufmap *bufmap,
/* map the pages */
ret = get_user_pages_fast((unsigned long)user_desc->ptr,
- bufmap->page_count, 1, bufmap->page_array);
+ bufmap->page_count, FOLL_WRITE, bufmap->page_array);
if (ret < 0)
return ret;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index b6ccb6c57706..9c8ca6cd3ce4 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -510,7 +510,7 @@ static ssize_t lstats_write(struct file *file, const char __user *buf,
if (!task)
return -ESRCH;
- clear_all_latency_tracing(task);
+ clear_tsk_latency_tracing(task);
put_task_struct(task);
return count;
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 568d90e17c17..465ea0153b2a 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -120,7 +120,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "Committed_AS: ", committed);
seq_printf(m, "VmallocTotal: %8lu kB\n",
(unsigned long)VMALLOC_TOTAL >> 10);
- show_val_kb(m, "VmallocUsed: ", 0ul);
+ show_val_kb(m, "VmallocUsed: ", vmalloc_nr_pages());
show_val_kb(m, "VmallocChunk: ", 0ul);
show_val_kb(m, "Percpu: ", pcpu_nr_pages());
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 95ca1fe7283c..a1c2ad9f960a 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -58,7 +58,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
swap = get_mm_counter(mm, MM_SWAPENTS);
SEQ_PUT_DEC("VmPeak:\t", hiwater_vm);
SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm);
- SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm);
+ SEQ_PUT_DEC(" kB\nVmLck:\t", atomic64_read(&mm->locked_vm));
SEQ_PUT_DEC(" kB\nVmPin:\t", atomic64_read(&mm->pinned_vm));
SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss);
SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss);
@@ -1169,7 +1169,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
break;
}
- mmu_notifier_range_init(&range, mm, 0, -1UL);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY,
+ 0, NULL, mm, 0, -1UL);
mmu_notifier_invalidate_range_start(&range);
}
walk_page_range(0, mm->highest_vm_end, &clear_refs_walk);
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 11201b2d06b9..a26ae0b83f8f 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -145,6 +145,17 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char *
return error;
}
+static int ramfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ struct inode *inode;
+
+ inode = ramfs_get_inode(dir->i_sb, dir, mode, 0);
+ if (!inode)
+ return -ENOSPC;
+ d_tmpfile(dentry, inode);
+ return 0;
+}
+
static const struct inode_operations ramfs_dir_inode_operations = {
.create = ramfs_create,
.lookup = simple_lookup,
@@ -155,6 +166,7 @@ static const struct inode_operations ramfs_dir_inode_operations = {
.rmdir = simple_rmdir,
.mknod = ramfs_mknod,
.rename = simple_rename,
+ .tmpfile = ramfs_tmpfile,
};
/*
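With ->tmpfile wired up, O_TMPFILE starts working on ramfs. A minimal userspace sketch, assuming "/mnt/ramfs" stands in for any directory on a ramfs mount (the path is illustrative, not from the patch):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* "/mnt/ramfs" is a placeholder for a directory on a ramfs mount. */
	int fd = open("/mnt/ramfs", O_TMPFILE | O_RDWR, 0600);

	if (fd < 0) {
		perror("open(O_TMPFILE)");
		return 1;
	}
	/* The inode has no name; it is freed when the last fd is closed. */
	(void)write(fd, "scratch", 7);
	close(fd);
	return 0;
}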
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 32d8986c26fb..b5b26d8a192c 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -450,6 +450,15 @@ fail:
static inline __u32 xattr_hash(const char *msg, int len)
{
+ /*
+ * csum_partial() gives different results for little-endian and
+ * big-endian hosts. Images created on little-endian hosts and mounted
+ * on big-endian hosts (and vice versa) will see csum mismatches when
+ * trying to fetch xattrs. Treating the hash as __wsum_t would lower the
+ * frequency of mismatches. This is an endianness bug in reiserfs. The
+ * return statement would result in a sparse warning; do not fix the
+ * sparse warning so as not to hide a reminder of the bug.
+ */
return csum_partial(msg, len, 0);
}
diff --git a/fs/sync.c b/fs/sync.c
index 01e82170545a..39df8c2958fe 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -292,8 +292,14 @@ int sync_file_range(struct file *file, loff_t offset, loff_t nbytes,
}
if (flags & SYNC_FILE_RANGE_WRITE) {
+ int sync_mode = WB_SYNC_NONE;
+
+ if ((flags & SYNC_FILE_RANGE_WRITE_AND_WAIT) ==
+ SYNC_FILE_RANGE_WRITE_AND_WAIT)
+ sync_mode = WB_SYNC_ALL;
+
ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
- WB_SYNC_NONE);
+ sync_mode);
if (ret < 0)
goto out;
}
@@ -306,9 +312,9 @@ out:
}
/*
- * sys_sync_file_range() permits finely controlled syncing over a segment of
+ * ksys_sync_file_range() permits finely controlled syncing over a segment of
* a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is
- * zero then sys_sync_file_range() will operate from offset out to EOF.
+ * zero then ksys_sync_file_range() will operate from offset out to EOF.
*
* The flag bits are:
*
@@ -325,7 +331,7 @@ out:
* Useful combinations of the flag bits are:
*
* SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages
- * in the range which were dirty on entry to sys_sync_file_range() are placed
+ * in the range which were dirty on entry to ksys_sync_file_range() are placed
* under writeout. This is a start-write-for-data-integrity operation.
*
* SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which
@@ -337,10 +343,13 @@ out:
* earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait
* for that operation to complete and to return the result.
*
- * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER:
+ * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER
+ * (a.k.a. SYNC_FILE_RANGE_WRITE_AND_WAIT):
* a traditional sync() operation. This is a write-for-data-integrity operation
* which will ensure that all pages in the range which were dirty on entry to
- * sys_sync_file_range() are committed to disk.
+ * ksys_sync_file_range() are committed to disk. It should be noted that disk
+ * caches are not flushed by this call, so there are no guarantees here that the
+ * data will be available on disk after a crash.
*
*
* SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any
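The net effect for userspace is that a single call can now start writeback and wait for it. A hedged sketch, assuming the updated UAPI header exports SYNC_FILE_RANGE_WRITE_AND_WAIT as the OR of the three existing flags (the fallback define makes that assumption explicit):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#ifndef SYNC_FILE_RANGE_WRITE_AND_WAIT
#define SYNC_FILE_RANGE_WRITE_AND_WAIT \
	(SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE | \
	 SYNC_FILE_RANGE_WAIT_AFTER)
#endif

int main(void)
{
	int fd = open("data.bin", O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (fd < 0)
		return 1;
	(void)write(fd, "payload", 7);
	/* Start writeback for the range and wait for it to finish; note
	 * that the disk write cache is still not flushed. */
	if (sync_file_range(fd, 0, 7, SYNC_FILE_RANGE_WRITE_AND_WAIT) < 0)
		perror("sync_file_range");
	close(fd);
	return 0;
}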
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index f5de1e726356..3b30301c90ec 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -30,6 +30,8 @@
#include <linux/security.h>
#include <linux/hugetlb.h>
+int sysctl_unprivileged_userfaultfd __read_mostly = 1;
+
static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
enum userfaultfd_state {
@@ -1930,6 +1932,9 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
struct userfaultfd_ctx *ctx;
int fd;
+ if (!sysctl_unprivileged_userfaultfd && !capable(CAP_SYS_PTRACE))
+ return -EPERM;
+
BUG_ON(!current->mm);
/* Check the UFFD_* constants for consistency. */
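A hedged userspace check of the new behaviour: with the knob set to forbid unprivileged use, userfaultfd(2) fails with EPERM for callers lacking CAP_SYS_PTRACE. The snippet assumes __NR_userfaultfd is provided by the installed kernel headers:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	long fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

	if (fd < 0 && errno == EPERM) {
		fprintf(stderr, "userfaultfd: disabled for unprivileged users\n");
		return 1;
	}
	if (fd >= 0)
		close(fd);
	return 0;
}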
diff --git a/include/asm-generic/dynamic_debug.h b/include/asm-generic/dynamic_debug.h
new file mode 100644
index 000000000000..f1dd3019cd98
--- /dev/null
+++ b/include/asm-generic/dynamic_debug.h
@@ -0,0 +1,116 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_GENERIC_DYNAMIC_DEBUG_H
+#define _ASM_GENERIC_DYNAMIC_DEBUG_H
+
+#ifndef _DYNAMIC_DEBUG_H
+#error "don't include asm/dynamic_debug.h directly"
+#endif
+
+#include <linux/build_bug.h>
+#ifdef CONFIG_JUMP_LABEL
+#include <linux/jump_label.h>
+#endif
+
+/*
+ * We need to know the exact layout of struct _ddebug in order to
+ * initialize it in assembly. Check that all members are at expected
+ * offsets - if any of these fail, the arch cannot use this generic
+ * dynamic_debug.h. DYNAMIC_DEBUG_RELATIVE_POINTERS is pointless for
+ * !64BIT, so we expect the static_key to be at an 8-byte boundary
+ * since it contains stuff which is long-aligned.
+ */
+
+static_assert(BITS_PER_LONG == 64);
+static_assert(offsetof(struct _ddebug, modname_disp) == 0);
+static_assert(offsetof(struct _ddebug, function_disp) == 4);
+static_assert(offsetof(struct _ddebug, filename_disp) == 8);
+static_assert(offsetof(struct _ddebug, format_disp) == 12);
+static_assert(offsetof(struct _ddebug, flags_lineno) == 16);
+
+#ifdef CONFIG_JUMP_LABEL
+static_assert(offsetof(struct _ddebug, key) == 24);
+static_assert(offsetof(struct static_key, enabled) == 0);
+static_assert(offsetof(struct static_key, type) == 8);
+
+/* <4 bytes padding>, .enabled, <4 bytes padding>, .type */
+# ifdef DEBUG
+# define _DPRINTK_ASM_KEY_INIT \
+ "\t.int 0\n" "\t.int 1\n" "\t.int 0\n" "\t.quad "__stringify(__JUMP_TYPE_TRUE)"\n"
+# else
+# define _DPRINTK_ASM_KEY_INIT \
+ "\t.int 0\n" "\t.int 0\n" "\t.int 0\n" "\t.quad "__stringify(__JUMP_TYPE_FALSE)"\n"
+# endif
+#else /* !CONFIG_JUMP_LABEL */
+#define _DPRINTK_ASM_KEY_INIT ""
+#endif
+
+/*
+ * There's a bit of magic involved here.
+ *
+ * First, unlike the bug table entries, we need to define an object in
+ * assembly which we can reference from C code (for use by the
+ * DYNAMIC_DEBUG_BRANCH macro), but we don't want 'name' to have
+ * external linkage (that would require use of globally unique
+ * identifiers, which we can't guarantee). Fortunately, the "extern"
+ * keyword just tells the compiler that _somebody_ provides that
+ * symbol - usually that somebody is the linker, but in this case it's
+ * the assembler, and since we do not do .globl name, the symbol gets
+ * internal linkage.
+ *
+ * So far so good. The next problem is that there's no scope in
+ * assembly, so the identifier 'name' has to be unique within each
+ * translation unit - otherwise all uses of that identifier end up
+ * referring to the same struct _ddebug instance. pr_debug and friends
+ * do this by use of indirection and __UNIQUE_ID(), and new users of
+ * DEFINE_DYNAMIC_DEBUG_METADATA() should do something similar. We
+ * need to catch cases where this is not done at build time.
+ *
+ * With assembly-level .ifndef we can ensure that we only define a
+ * given identifier once, preventing "symbol 'foo' already defined"
+ * errors. But we still need to detect and fail on multiple uses of
+ * the same identifier. The simplest, and wrong, solution to that is to
+ * add an .else .error branch to the .ifndef. The problem is that just
+ * because the DEFINE_DYNAMIC_DEBUG_METADATA() macro is only expanded
+ * once with a given identifier, the compiler may emit the assembly
+ * code multiple times, e.g. if the macro appears in an inline
+ * function. Now, in a normal case like
+ *
+ * static inline int get_next_id(void) { static int v; return ++v; }
+ *
+ * all inlined copies of get_next_id are _supposed_ to refer to the
+ * same object 'v'. So we do need to allow this chunk of assembly to
+ * appear multiple times with the same 'name', as long as they all
+ * came from the same DEFINE_DYNAMIC_DEBUG_METADATA() instance. To do
+ * that, we pass __COUNTER__ to the asm(), and set an assembler symbol
+ * name.ddebug.once to that value when we first define 'name'. When we
+ * meet a second attempt at defining 'name', we compare
+ * name.ddebug.once to %6 and error out if they are different.
+ */
+
+#define DEFINE_DYNAMIC_DEBUG_METADATA(name, fmt) \
+ extern struct _ddebug name; \
+ asm volatile(".ifndef " __stringify(name) "\n" \
+ ".pushsection __verbose,\"aw\"\n" \
+ ".type "__stringify(name)", STT_OBJECT\n" \
+ ".size "__stringify(name)", %c5\n" \
+ "1:\n" \
+ __stringify(name) ":\n" \
+ "\t.int %c0 - 1b /* _ddebug::modname_disp */\n" \
+ "\t.int %c1 - 1b /* _ddebug::function_disp */\n" \
+ "\t.int %c2 - 1b /* _ddebug::filename_disp */\n" \
+ "\t.int %c3 - 1b /* _ddebug::format_disp */\n" \
+ "\t.int %c4 /* _ddebug::flags_lineno */\n" \
+ _DPRINTK_ASM_KEY_INIT \
+ "\t.org 1b+%c5\n" \
+ ".popsection\n" \
+ ".set "__stringify(name)".ddebug.once, %c6\n" \
+ ".elseif "__stringify(name)".ddebug.once - %c6\n" \
+ ".line "__stringify(__LINE__) " - 1\n" \
+ ".error \"'"__stringify(name)"' used as _ddebug identifier more than once\"\n" \
+ ".endif\n" \
+ : : "i" (KBUILD_MODNAME), "i" (__func__), \
+ "i" (__FILE__), "i" (fmt), \
+ "i" (_DPRINTK_FLAGS_LINENO_INIT), \
+ "i" (sizeof(struct _ddebug)), "i" (__COUNTER__))
+
+#endif /* _ASM_GENERIC_DYNAMIC_DEBUG_H */
diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h
index 71d7b77eea50..822f433ac95c 100644
--- a/include/asm-generic/hugetlb.h
+++ b/include/asm-generic/hugetlb.h
@@ -126,4 +126,11 @@ static inline pte_t huge_ptep_get(pte_t *ptep)
}
#endif
+#ifndef __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED
+static inline bool gigantic_page_runtime_supported(void)
+{
+ return IS_ENABLED(CONFIG_ARCH_HAS_GIGANTIC_PAGE);
+}
+#endif /* __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED */
+
#endif /* _ASM_GENERIC_HUGETLB_H */
diff --git a/include/asm-generic/shmparam.h b/include/asm-generic/shmparam.h
index 8b78c0ba08b1..b8f9035ffc2c 100644
--- a/include/asm-generic/shmparam.h
+++ b/include/asm-generic/shmparam.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_GENERIC_SHMPARAM_H
#define __ASM_GENERIC_SHMPARAM_H
diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
index f111c780ef1d..f31521dcb09a 100644
--- a/include/linux/balloon_compaction.h
+++ b/include/linux/balloon_compaction.h
@@ -151,21 +151,6 @@ static inline void balloon_page_delete(struct page *page)
list_del(&page->lru);
}
-static inline bool __is_movable_balloon_page(struct page *page)
-{
- return false;
-}
-
-static inline bool balloon_page_movable(struct page *page)
-{
- return false;
-}
-
-static inline bool isolated_balloon_page(struct page *page)
-{
- return false;
-}
-
static inline bool balloon_page_isolate(struct page *page)
{
return false;
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 688ab0de7810..b40fc633f3be 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -15,7 +15,6 @@ struct filename;
* This structure is used to hold the arguments that are used when loading binaries.
*/
struct linux_binprm {
- char buf[BINPRM_BUF_SIZE];
#ifdef CONFIG_MMU
struct vm_area_struct *vma;
unsigned long vma_pages;
@@ -64,6 +63,8 @@ struct linux_binprm {
unsigned long loader, exec;
struct rlimit rlim_stack; /* Saved RLIMIT_STACK used during exec. */
+
+ char buf[BINPRM_BUF_SIZE];
} __randomize_layout;
#define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 602af23b98c7..cf074bce3eb3 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -60,7 +60,7 @@ static __always_inline unsigned long hweight_long(unsigned long w)
*/
static inline __u64 rol64(__u64 word, unsigned int shift)
{
- return (word << shift) | (word >> (64 - shift));
+ return (word << (shift & 63)) | (word >> ((-shift) & 63));
}
/**
@@ -70,7 +70,7 @@ static inline __u64 rol64(__u64 word, unsigned int shift)
*/
static inline __u64 ror64(__u64 word, unsigned int shift)
{
- return (word >> shift) | (word << (64 - shift));
+ return (word >> (shift & 63)) | (word << ((-shift) & 63));
}
/**
@@ -80,7 +80,7 @@ static inline __u64 ror64(__u64 word, unsigned int shift)
*/
static inline __u32 rol32(__u32 word, unsigned int shift)
{
- return (word << shift) | (word >> ((-shift) & 31));
+ return (word << (shift & 31)) | (word >> ((-shift) & 31));
}
/**
@@ -90,7 +90,7 @@ static inline __u32 rol32(__u32 word, unsigned int shift)
*/
static inline __u32 ror32(__u32 word, unsigned int shift)
{
- return (word >> shift) | (word << (32 - shift));
+ return (word >> (shift & 31)) | (word << ((-shift) & 31));
}
/**
@@ -100,7 +100,7 @@ static inline __u32 ror32(__u32 word, unsigned int shift)
*/
static inline __u16 rol16(__u16 word, unsigned int shift)
{
- return (word << shift) | (word >> (16 - shift));
+ return (word << (shift & 15)) | (word >> ((-shift) & 15));
}
/**
@@ -110,7 +110,7 @@ static inline __u16 rol16(__u16 word, unsigned int shift)
*/
static inline __u16 ror16(__u16 word, unsigned int shift)
{
- return (word >> shift) | (word << (16 - shift));
+ return (word >> (shift & 15)) | (word << ((-shift) & 15));
}
/**
@@ -120,7 +120,7 @@ static inline __u16 ror16(__u16 word, unsigned int shift)
*/
static inline __u8 rol8(__u8 word, unsigned int shift)
{
- return (word << shift) | (word >> (8 - shift));
+ return (word << (shift & 7)) | (word >> ((-shift) & 7));
}
/**
@@ -130,7 +130,7 @@ static inline __u8 rol8(__u8 word, unsigned int shift)
*/
static inline __u8 ror8(__u8 word, unsigned int shift)
{
- return (word >> shift) | (word << (8 - shift));
+ return (word >> (shift & 7)) | (word << ((-shift) & 7));
}
/**
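The point of the masking is to keep a shift count of 0 (or a multiple of the width) from producing an out-of-range shift, which is undefined behaviour in C. A minimal userspace sketch of the same idiom (not part of the patch):

#include <stdint.h>
#include <stdio.h>

/* Same masking idiom as the patched rol32(): (-shift) & 31 maps a shift
 * count of 0 to 0, so we never evaluate the undefined "word >> 32". */
static inline uint32_t rol32_masked(uint32_t word, unsigned int shift)
{
	return (word << (shift & 31)) | (word >> ((-shift) & 31));
}

int main(void)
{
	printf("%08x\n", rol32_masked(0x80000001u, 0)); /* 80000001 */
	printf("%08x\n", rol32_masked(0x80000001u, 4)); /* 00000018 */
	return 0;
}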
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index ba814f18cb4c..19e58b9138a0 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -140,8 +140,7 @@ struct ftrace_likely_data {
* Do not use __always_inline here, since currently it expands to inline again
* (which would break users of __always_inline).
*/
-#if !defined(CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING) || \
- !defined(CONFIG_OPTIMIZE_INLINING)
+#if !defined(CONFIG_OPTIMIZE_INLINING)
#define inline inline __attribute__((__always_inline__)) __gnu_inline \
__maybe_unused notrace
#else
diff --git a/include/linux/console.h b/include/linux/console.h
index ec9bdb3d7bab..d09951d5a94e 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -166,6 +166,11 @@ struct console {
extern int console_set_on_cmdline;
extern struct console *early_console;
+enum con_flush_mode {
+ CONSOLE_FLUSH_PENDING,
+ CONSOLE_REPLAY_ALL,
+};
+
extern int add_preferred_console(char *name, int idx, char *options);
extern void register_console(struct console *);
extern int unregister_console(struct console *);
@@ -175,7 +180,7 @@ extern int console_trylock(void);
extern void console_unlock(void);
extern void console_conditional_schedule(void);
extern void console_unblank(void);
-extern void console_flush_on_panic(void);
+extern void console_flush_on_panic(enum con_flush_mode mode);
extern struct tty_driver *console_device(int *);
extern void console_stop(struct console *);
extern void console_start(struct console *);
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 147bdec42215..21755471b1c3 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -633,8 +633,7 @@ static inline int cpumask_parselist_user(const char __user *buf, int len,
*/
static inline int cpumask_parse(const char *buf, struct cpumask *dstp)
{
- char *nl = strchr(buf, '\n');
- unsigned int len = nl ? (unsigned int)(nl - buf) : strlen(buf);
+ unsigned int len = strchrnul(buf, '\n') - buf;
return bitmap_parse(buf, len, cpumask_bits(dstp), nr_cpumask_bits);
}
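strchrnul() returns a pointer to the terminating NUL when the character is absent, so the single expression covers both the newline and no-newline cases. A small userspace illustration, assuming glibc's strchrnul via _GNU_SOURCE:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>

/* Length up to (but excluding) the first '\n', or the full string length
 * when there is no newline -- the form used by the patched cpumask_parse(). */
static size_t len_to_newline(const char *buf)
{
	return strchrnul(buf, '\n') - buf;
}

int main(void)
{
	printf("%zu\n", len_to_newline("0-3,7\n")); /* 5 */
	printf("%zu\n", len_to_newline("0-3,7"));   /* 5 */
	return 0;
}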
diff --git a/include/linux/device.h b/include/linux/device.h
index e85264fb6616..72a6260f2b4d 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1562,7 +1562,7 @@ do { \
dev_level_ratelimited(dev_info, dev, fmt, ##__VA_ARGS__)
#if defined(CONFIG_DYNAMIC_DEBUG)
/* descriptor check is first to prevent flooding with "callbacks suppressed" */
-#define dev_dbg_ratelimited(dev, fmt, ...) \
+#define _dev_dbg_ratelimited(descriptor, dev, fmt, ...) \
do { \
static DEFINE_RATELIMIT_STATE(_rs, \
DEFAULT_RATELIMIT_INTERVAL, \
@@ -1573,6 +1573,8 @@ do { \
__dynamic_dev_dbg(&descriptor, dev, dev_fmt(fmt), \
##__VA_ARGS__); \
} while (0)
+#define dev_dbg_ratelimited(dev, fmt, ...) \
+ _dev_dbg_ratelimited(__UNIQUE_ID(ddebug), dev, fmt, ##__VA_ARGS__)
#elif defined(DEBUG)
#define dev_dbg_ratelimited(dev, fmt, ...) \
do { \
diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h
index 6c809440f319..f048ff06f521 100644
--- a/include/linux/dynamic_debug.h
+++ b/include/linux/dynamic_debug.h
@@ -16,11 +16,17 @@ struct _ddebug {
* These fields are used to drive the user interface
* for selecting and displaying debug callsites.
*/
+#ifdef CONFIG_DYNAMIC_DEBUG_RELATIVE_POINTERS
+ signed int modname_disp;
+ signed int function_disp;
+ signed int filename_disp;
+ signed int format_disp;
+#else
const char *modname;
const char *function;
const char *filename;
const char *format;
- unsigned int lineno:18;
+#endif
/*
* The flags field controls the behaviour at the callsite.
* The bits here are changed dynamically when the user
@@ -37,7 +43,7 @@ struct _ddebug {
#else
#define _DPRINTK_FLAGS_DEFAULT 0
#endif
- unsigned int flags:8;
+ unsigned int flags_lineno; /* flags in lower 8 bits, lineno in upper 24 */
#ifdef CONFIG_JUMP_LABEL
union {
struct static_key_true dd_key_true;
@@ -46,7 +52,7 @@ struct _ddebug {
#endif
} __attribute__((aligned(8)));
-
+#define _DPRINTK_FLAGS_LINENO_INIT (((unsigned)__LINE__ << 8) | _DPRINTK_FLAGS_DEFAULT)
#if defined(CONFIG_DYNAMIC_DEBUG)
int ddebug_add_module(struct _ddebug *tab, unsigned int n,
@@ -78,6 +84,13 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor,
const struct ib_device *ibdev,
const char *fmt, ...);
+
+#ifdef CONFIG_DYNAMIC_DEBUG_RELATIVE_POINTERS
+#include <asm/dynamic_debug.h>
+#ifndef DEFINE_DYNAMIC_DEBUG_METADATA
+# error "asm/dynamic_debug.h must provide definition of DEFINE_DYNAMIC_DEBUG_METADATA"
+#endif
+#else
#define DEFINE_DYNAMIC_DEBUG_METADATA(name, fmt) \
static struct _ddebug __aligned(8) \
__attribute__((section("__verbose"))) name = { \
@@ -85,10 +98,10 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor,
.function = __func__, \
.filename = __FILE__, \
.format = (fmt), \
- .lineno = __LINE__, \
- .flags = _DPRINTK_FLAGS_DEFAULT, \
+ .flags_lineno = _DPRINTK_FLAGS_LINENO_INIT, \
_DPRINTK_KEY_INIT \
}
+#endif
#ifdef CONFIG_JUMP_LABEL
@@ -111,10 +124,10 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor,
#ifdef DEBUG
#define DYNAMIC_DEBUG_BRANCH(descriptor) \
- likely(descriptor.flags & _DPRINTK_FLAGS_PRINT)
+ likely(descriptor.flags_lineno & _DPRINTK_FLAGS_PRINT)
#else
#define DYNAMIC_DEBUG_BRANCH(descriptor) \
- unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)
+ unlikely(descriptor.flags_lineno & _DPRINTK_FLAGS_PRINT)
#endif
#endif
diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h
index dd0a452373e7..c5e5bed82fbc 100644
--- a/include/linux/genalloc.h
+++ b/include/linux/genalloc.h
@@ -156,7 +156,7 @@ extern struct gen_pool *devm_gen_pool_create(struct device *dev,
int min_alloc_order, int nid, const char *name);
extern struct gen_pool *gen_pool_get(struct device *dev, const char *name);
-bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start,
+extern bool gen_pool_has_addr(struct gen_pool *pool, unsigned long start,
size_t size);
#ifdef CONFIG_OF
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index fdab7de7490d..fb07b503dc45 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -585,12 +585,12 @@ static inline bool pm_suspended_storage(void)
}
#endif /* CONFIG_PM_SLEEP */
-#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
+#ifdef CONFIG_CONTIG_ALLOC
/* The below functions must be run on a range from a single zone. */
extern int alloc_contig_range(unsigned long start, unsigned long end,
unsigned migratetype, gfp_t gfp_mask);
-extern void free_contig_range(unsigned long pfn, unsigned nr_pages);
#endif
+void free_contig_range(unsigned long pfn, unsigned int nr_pages);
#ifdef CONFIG_CMA
/* CMA stuff */
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index ad50b7b4f141..51ec27a84668 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -77,8 +77,34 @@
#include <linux/migrate.h>
#include <linux/memremap.h>
#include <linux/completion.h>
+#include <linux/mmu_notifier.h>
-struct hmm;
+
+/*
+ * struct hmm - HMM per mm struct
+ *
+ * @mm: mm struct this HMM struct is bound to
+ * @lock: lock protecting ranges list
+ * @ranges: list of range being snapshotted
+ * @mirrors: list of mirrors for this mm
+ * @mmu_notifier: mmu notifier to track updates to CPU page table
+ * @mirrors_sem: read/write semaphore protecting the mirrors list
+ * @wq: wait queue for user waiting on a range invalidation
+ * @notifiers: count of active mmu notifiers
+ * @dead: is the mm dead?
+ */
+struct hmm {
+ struct mm_struct *mm;
+ struct kref kref;
+ struct mutex lock;
+ struct list_head ranges;
+ struct list_head mirrors;
+ struct mmu_notifier mmu_notifier;
+ struct rw_semaphore mirrors_sem;
+ wait_queue_head_t wq;
+ long notifiers;
+ bool dead;
+};
/*
* hmm_pfn_flag_e - HMM flag enums
@@ -131,6 +157,7 @@ enum hmm_pfn_value_e {
/*
* struct hmm_range - track invalidation lock on virtual address range
*
+ * @hmm: the core HMM structure this range is active against
* @vma: the vm area struct for the range
* @list: all range lock are on a list
* @start: range virtual start address (inclusive)
@@ -138,10 +165,13 @@ enum hmm_pfn_value_e {
* @pfns: array of pfns (big enough for the range)
* @flags: pfn flags to match device driver page table
* @values: pfn value for some special case (none, special, error, ...)
+ * @default_flags: default flags for the range (write, read, ... see hmm doc)
+ * @pfn_flags_mask: allows masking pfn flags so that only default_flags matter
* @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT)
* @valid: pfns array did not change since it has been fill by an HMM function
*/
struct hmm_range {
+ struct hmm *hmm;
struct vm_area_struct *vma;
struct list_head list;
unsigned long start;
@@ -149,41 +179,96 @@ struct hmm_range {
uint64_t *pfns;
const uint64_t *flags;
const uint64_t *values;
+ uint64_t default_flags;
+ uint64_t pfn_flags_mask;
+ uint8_t page_shift;
uint8_t pfn_shift;
bool valid;
};
/*
- * hmm_pfn_to_page() - return struct page pointed to by a valid HMM pfn
- * @range: range use to decode HMM pfn value
- * @pfn: HMM pfn value to get corresponding struct page from
- * Returns: struct page pointer if pfn is a valid HMM pfn, NULL otherwise
+ * hmm_range_page_shift() - return the page shift for the range
+ * @range: range being queried
+ * Returns: page shift (page size = 1 << page shift) for the range
+ */
+static inline unsigned hmm_range_page_shift(const struct hmm_range *range)
+{
+ return range->page_shift;
+}
+
+/*
+ * hmm_range_page_size() - return the page size for the range
+ * @range: range being queried
+ * Returns: page size for the range in bytes
+ */
+static inline unsigned long hmm_range_page_size(const struct hmm_range *range)
+{
+ return 1UL << hmm_range_page_shift(range);
+}
+
+/*
+ * hmm_range_wait_until_valid() - wait for range to be valid
+ * @range: range affected by invalidation to wait on
+ * @timeout: time out for wait in ms (ie abort wait after that period of time)
+ * Returns: true if the range is valid, false otherwise.
+ */
+static inline bool hmm_range_wait_until_valid(struct hmm_range *range,
+ unsigned long timeout)
+{
+ /* Check whether the mm is dead */
+ if (range->hmm == NULL || range->hmm->dead || range->hmm->mm == NULL) {
+ range->valid = false;
+ return false;
+ }
+ if (range->valid)
+ return true;
+ wait_event_timeout(range->hmm->wq, range->valid || range->hmm->dead,
+ msecs_to_jiffies(timeout));
+ /* Return current valid status just in case we get lucky */
+ return range->valid;
+}
+
+/*
+ * hmm_range_valid() - test if a range is valid or not
+ * @range: range
+ * Returns: true if the range is valid, false otherwise.
+ */
+static inline bool hmm_range_valid(struct hmm_range *range)
+{
+ return range->valid;
+}
+
+/*
+ * hmm_device_entry_to_page() - return struct page pointed to by a device entry
+ * @range: range use to decode device entry value
+ * @entry: device entry value to get corresponding struct page from
+ * Returns: struct page pointer if entry is a valid, NULL otherwise
*
- * If the HMM pfn is valid (ie valid flag set) then return the struct page
- * matching the pfn value stored in the HMM pfn. Otherwise return NULL.
+ * If the device entry is valid (ie valid flag set) then return the struct page
+ * matching the entry value. Otherwise return NULL.
*/
-static inline struct page *hmm_pfn_to_page(const struct hmm_range *range,
- uint64_t pfn)
+static inline struct page *hmm_device_entry_to_page(const struct hmm_range *range,
+ uint64_t entry)
{
- if (pfn == range->values[HMM_PFN_NONE])
+ if (entry == range->values[HMM_PFN_NONE])
return NULL;
- if (pfn == range->values[HMM_PFN_ERROR])
+ if (entry == range->values[HMM_PFN_ERROR])
return NULL;
- if (pfn == range->values[HMM_PFN_SPECIAL])
+ if (entry == range->values[HMM_PFN_SPECIAL])
return NULL;
- if (!(pfn & range->flags[HMM_PFN_VALID]))
+ if (!(entry & range->flags[HMM_PFN_VALID]))
return NULL;
- return pfn_to_page(pfn >> range->pfn_shift);
+ return pfn_to_page(entry >> range->pfn_shift);
}
/*
- * hmm_pfn_to_pfn() - return pfn value store in a HMM pfn
- * @range: range use to decode HMM pfn value
- * @pfn: HMM pfn value to extract pfn from
- * Returns: pfn value if HMM pfn is valid, -1UL otherwise
+ * hmm_device_entry_to_pfn() - return pfn value store in a device entry
+ * @range: range use to decode device entry value
+ * @entry: device entry to extract pfn from
+ * Returns: pfn value if device entry is valid, -1UL otherwise
*/
-static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range,
- uint64_t pfn)
+static inline unsigned long
+hmm_device_entry_to_pfn(const struct hmm_range *range, uint64_t pfn)
{
if (pfn == range->values[HMM_PFN_NONE])
return -1UL;
@@ -197,31 +282,66 @@ static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range,
}
/*
- * hmm_pfn_from_page() - create a valid HMM pfn value from struct page
+ * hmm_device_entry_from_page() - create a valid device entry for a page
* @range: range use to encode HMM pfn value
- * @page: struct page pointer for which to create the HMM pfn
- * Returns: valid HMM pfn for the page
+ * @page: page for which to create the device entry
+ * Returns: valid device entry for the page
*/
-static inline uint64_t hmm_pfn_from_page(const struct hmm_range *range,
- struct page *page)
+static inline uint64_t hmm_device_entry_from_page(const struct hmm_range *range,
+ struct page *page)
{
return (page_to_pfn(page) << range->pfn_shift) |
range->flags[HMM_PFN_VALID];
}
/*
- * hmm_pfn_from_pfn() - create a valid HMM pfn value from pfn
+ * hmm_device_entry_from_pfn() - create a valid device entry value from pfn
* @range: range use to encode HMM pfn value
- * @pfn: pfn value for which to create the HMM pfn
- * Returns: valid HMM pfn for the pfn
+ * @pfn: pfn value for which to create the device entry
+ * Returns: valid device entry for the pfn
*/
-static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range,
- unsigned long pfn)
+static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range,
+ unsigned long pfn)
{
return (pfn << range->pfn_shift) |
range->flags[HMM_PFN_VALID];
}
+/*
+ * Old API:
+ * hmm_pfn_to_page()
+ * hmm_pfn_to_pfn()
+ * hmm_pfn_from_page()
+ * hmm_pfn_from_pfn()
+ *
+ * These are the OLD API helpers; please use the new API. They are kept
+ * here to avoid cross-tree merge pain, i.e. we convert callers to the
+ * new API in stages.
+ */
+static inline struct page *hmm_pfn_to_page(const struct hmm_range *range,
+ uint64_t pfn)
+{
+ return hmm_device_entry_to_page(range, pfn);
+}
+
+static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range,
+ uint64_t pfn)
+{
+ return hmm_device_entry_to_pfn(range, pfn);
+}
+
+static inline uint64_t hmm_pfn_from_page(const struct hmm_range *range,
+ struct page *page)
+{
+ return hmm_device_entry_from_page(range, page);
+}
+
+static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range,
+ unsigned long pfn)
+{
+ return hmm_device_entry_from_pfn(range, pfn);
+}
+
+
#if IS_ENABLED(CONFIG_HMM_MIRROR)
/*
@@ -353,43 +473,113 @@ struct hmm_mirror {
int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm);
void hmm_mirror_unregister(struct hmm_mirror *mirror);
-
/*
- * To snapshot the CPU page table, call hmm_vma_get_pfns(), then take a device
- * driver lock that serializes device page table updates, then call
- * hmm_vma_range_done(), to check if the snapshot is still valid. The same
- * device driver page table update lock must also be used in the
- * hmm_mirror_ops.sync_cpu_device_pagetables() callback, so that CPU page
- * table invalidation serializes on it.
- *
- * YOU MUST CALL hmm_vma_range_done() ONCE AND ONLY ONCE EACH TIME YOU CALL
- * hmm_vma_get_pfns() WITHOUT ERROR !
- *
- * IF YOU DO NOT FOLLOW THE ABOVE RULE THE SNAPSHOT CONTENT MIGHT BE INVALID !
+ * hmm_mirror_mm_is_alive() - test if the mm is still alive
+ * @mirror: the HMM mm mirror to check
+ * Returns: false if the mm is dead, true otherwise
+ *
+ * This is an optimization: it does not always accurately report a dead mm,
+ * i.e. there can be false negatives (the process is being killed but HMM
+ * has not yet been informed of that). It is only intended to let a driver
+ * skip time-consuming work when the mm is already dead.
*/
-int hmm_vma_get_pfns(struct hmm_range *range);
-bool hmm_vma_range_done(struct hmm_range *range);
+static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror *mirror)
+{
+ struct mm_struct *mm;
+
+ if (!mirror || !mirror->hmm)
+ return false;
+ mm = READ_ONCE(mirror->hmm->mm);
+ if (mirror->hmm->dead || !mm)
+ return false;
+
+ return true;
+}
/*
- * Fault memory on behalf of device driver. Unlike handle_mm_fault(), this will
- * not migrate any device memory back to system memory. The HMM pfn array will
- * be updated with the fault result and current snapshot of the CPU page table
- * for the range.
- *
- * The mmap_sem must be taken in read mode before entering and it might be
- * dropped by the function if the block argument is false. In that case, the
- * function returns -EAGAIN.
- *
- * Return value does not reflect if the fault was successful for every single
- * address or not. Therefore, the caller must to inspect the HMM pfn array to
- * determine fault status for each address.
- *
- * Trying to fault inside an invalid vma will result in -EINVAL.
+ * Please see Documentation/vm/hmm.rst for how to use the range API.
+ */
+int hmm_range_register(struct hmm_range *range,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end,
+ unsigned page_shift);
+void hmm_range_unregister(struct hmm_range *range);
+long hmm_range_snapshot(struct hmm_range *range);
+long hmm_range_fault(struct hmm_range *range, bool block);
+long hmm_range_dma_map(struct hmm_range *range,
+ struct device *device,
+ dma_addr_t *daddrs,
+ bool block);
+long hmm_range_dma_unmap(struct hmm_range *range,
+ struct vm_area_struct *vma,
+ struct device *device,
+ dma_addr_t *daddrs,
+ bool dirty);
+
+/*
+ * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range
*
- * See the function description in mm/hmm.c for further documentation.
+ * When waiting for mmu notifiers we need some kind of timeout, otherwise
+ * we could potentially wait forever; 1000ms (i.e. 1s) already sounds like
+ * a long time to wait.
*/
-int hmm_vma_fault(struct hmm_range *range, bool block);
+#define HMM_RANGE_DEFAULT_TIMEOUT 1000
+
+/* This is a temporary helper to avoid merge conflict between trees. */
+static inline bool hmm_vma_range_done(struct hmm_range *range)
+{
+ bool ret = hmm_range_valid(range);
+
+ hmm_range_unregister(range);
+ return ret;
+}
+
+/* This is a temporary helper to avoid merge conflict between trees. */
+static inline int hmm_vma_fault(struct hmm_range *range, bool block)
+{
+ long ret;
+
+ /*
+ * With the old API the driver must set each individual entry with the
+ * requested flags (valid, write, ...). So here we set the mask to keep
+ * the entries provided by the driver intact and zero out the
+ * default_flags.
+ */
+ range->default_flags = 0;
+ range->pfn_flags_mask = -1UL;
+
+ ret = hmm_range_register(range, range->vma->vm_mm,
+ range->start, range->end,
+ PAGE_SHIFT);
+ if (ret)
+ return (int)ret;
+
+ if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) {
+ /*
+ * The mmap_sem was taken by the driver; we release it here and
+ * return -EAGAIN, which in the old API corresponds to the mmap_sem
+ * having been dropped.
+ */
+ up_read(&range->vma->vm_mm->mmap_sem);
+ return -EAGAIN;
+ }
+
+ ret = hmm_range_fault(range, block);
+ if (ret <= 0) {
+ if (ret == -EBUSY || !ret) {
+ /* Same as above, drop mmap_sem to match the old API. */
+ up_read(&range->vma->vm_mm->mmap_sem);
+ ret = -EBUSY;
+ } else if (ret == -EAGAIN)
+ ret = -EBUSY;
+ hmm_range_unregister(range);
+ return ret;
+ }
+ return 0;
+}
/* Below are for HMM internal use only! Not to be used by device driver! */
void hmm_mm_destroy(struct mm_struct *mm);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 11943b60f208..edf476c8cfb9 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -123,9 +123,7 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason);
void free_huge_page(struct page *page);
void hugetlb_fix_reserve_counts(struct inode *inode);
extern struct mutex *hugetlb_fault_mutex_table;
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
- struct vm_area_struct *vma,
- struct address_space *mapping,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
pgoff_t idx, unsigned long address);
pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);
diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index 6ab8c1bada3f..c309f43bde45 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -19,6 +19,7 @@ struct ipc_ids {
struct rw_semaphore rwsem;
struct idr ipcs_idr;
int max_idx;
+ int last_idx; /* For wrap around detection */
#ifdef CONFIG_CHECKPOINT_RESTORE
int next_id;
#endif
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index 3e113a1fa0f1..2b027d2ef3d0 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -190,6 +190,8 @@ struct module;
#ifdef CONFIG_JUMP_LABEL
+#define __JUMP_TYPE_FALSE 0
+#define __JUMP_TYPE_TRUE 1
#define JUMP_TYPE_FALSE 0UL
#define JUMP_TYPE_TRUE 1UL
#define JUMP_TYPE_LINKED 2UL
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index a3b59d143afb..74b1ee9027f5 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -484,6 +484,7 @@ extern int __kernel_text_address(unsigned long addr);
extern int kernel_text_address(unsigned long addr);
extern int func_ptr_is_kernel_text(void *ptr);
+u64 int_pow(u64 base, unsigned int exp);
unsigned long int_sqrt(unsigned long);
#if BITS_PER_LONG < 64
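A hedged userspace sketch of integer exponentiation by squaring, the natural O(log exp) implementation for such a helper (this is an illustration, not the kernel's lib/math source):

#include <stdint.h>
#include <stdio.h>

/* Exponentiation by squaring in O(log exp) multiplications; results wrap
 * modulo 2^64 on overflow, as with any u64 arithmetic. */
static uint64_t int_pow_demo(uint64_t base, unsigned int exp)
{
	uint64_t result = 1;

	while (exp) {
		if (exp & 1)
			result *= base;
		exp >>= 1;
		base *= base;
	}
	return result;
}

int main(void)
{
	printf("%llu\n", (unsigned long long)int_pow_demo(3, 7));  /* 2187 */
	printf("%llu\n", (unsigned long long)int_pow_demo(10, 6)); /* 1000000 */
	return 0;
}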
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 2c89e60bc752..0f9da966934e 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -4,7 +4,6 @@
/* Simple interface for creating and stopping kernel threads without mess. */
#include <linux/err.h>
#include <linux/sched.h>
-#include <linux/cgroup.h>
__printf(4, 5)
struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
@@ -198,6 +197,8 @@ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *work);
void kthread_destroy_worker(struct kthread_worker *worker);
+struct cgroup_subsys_state;
+
#ifdef CONFIG_BLK_CGROUP
void kthread_associate_blkcg(struct cgroup_subsys_state *css);
struct cgroup_subsys_state *kthread_blkcg(void);
diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h
index 7c560e0dc8f4..9022f0c2e2e4 100644
--- a/include/linux/latencytop.h
+++ b/include/linux/latencytop.h
@@ -36,7 +36,7 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter)
__account_scheduler_latency(task, usecs, inter);
}
-void clear_all_latency_tracing(struct task_struct *p);
+void clear_tsk_latency_tracing(struct task_struct *p);
extern int sysctl_latencytop(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -48,7 +48,7 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter)
{
}
-static inline void clear_all_latency_tracing(struct task_struct *p)
+static inline void clear_tsk_latency_tracing(struct task_struct *p)
{
}
diff --git a/include/linux/list.h b/include/linux/list.h
index e137b98cd31a..e951228db4b2 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -151,6 +151,23 @@ static inline void list_replace_init(struct list_head *old,
}
/**
+ * list_swap - replace entry1 with entry2 and re-add entry1 at entry2's position
+ * @entry1: the location to place entry2
+ * @entry2: the location to place entry1
+ */
+static inline void list_swap(struct list_head *entry1,
+ struct list_head *entry2)
+{
+ struct list_head *pos = entry2->prev;
+
+ list_del(entry2);
+ list_replace(entry1, entry2);
+ if (pos == entry1)
+ pos = entry2;
+ list_add(entry1, pos);
+}
+
+/**
* list_del_init - deletes entry from list and reinitialize it.
* @entry: the element to delete from the list.
*/
@@ -271,6 +288,24 @@ static inline void list_rotate_left(struct list_head *head)
}
/**
+ * list_rotate_to_front() - Rotate list to specific item.
+ * @list: The desired new front of the list.
+ * @head: The head of the list.
+ *
+ * Rotates list so that @list becomes the new front of the list.
+ */
+static inline void list_rotate_to_front(struct list_head *list,
+ struct list_head *head)
+{
+ /*
+ * Deletes the list head from the list denoted by @head and
+ * places it as the tail of @list; this effectively rotates the
+ * list so that @list is at the front.
+ */
+ list_move_tail(head, list);
+}
+
+/**
* list_is_singular - tests whether a list has just one entry.
* @head: the list to test.
*/
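A self-contained userspace sketch demonstrating what list_rotate_to_front() does; the list primitives below only mimic <linux/list.h> as far as needed and are not the kernel implementation:

#include <stdio.h>

/* Minimal stand-in for <linux/list.h>, just enough for the demo. */
struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h; h->prev = h; }

static void __list_add(struct list_head *new, struct list_head *prev,
		       struct list_head *next)
{
	next->prev = new;
	new->next = next;
	new->prev = prev;
	prev->next = new;
}

static void list_del(struct list_head *entry)
{
	entry->prev->next = entry->next;
	entry->next->prev = entry->prev;
}

static void list_add_tail(struct list_head *new, struct list_head *head)
{
	__list_add(new, head->prev, head);
}

static void list_move_tail(struct list_head *entry, struct list_head *head)
{
	list_del(entry);
	list_add_tail(entry, head);
}

/* Same one-liner as the new kernel helper: moving the list head behind
 * @list makes @list the new first entry. */
static void list_rotate_to_front(struct list_head *list, struct list_head *head)
{
	list_move_tail(head, list);
}

struct item { struct list_head node; char name; };

int main(void)
{
	struct list_head head, *p;
	struct item a = { .name = 'a' }, b = { .name = 'b' }, c = { .name = 'c' };

	INIT_LIST_HEAD(&head);
	list_add_tail(&a.node, &head);
	list_add_tail(&b.node, &head);
	list_add_tail(&c.node, &head);

	list_rotate_to_front(&b.node, &head);
	for (p = head.next; p != &head; p = p->next)
		printf("%c ", ((struct item *)p)->name);
	printf("\n");	/* prints: b c a */
	return 0;
}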
diff --git a/include/linux/list_sort.h b/include/linux/list_sort.h
index ba79956e848d..20f178c24e9d 100644
--- a/include/linux/list_sort.h
+++ b/include/linux/list_sort.h
@@ -6,6 +6,7 @@
struct list_head;
+__attribute__((nonnull(2,3)))
void list_sort(void *priv, struct list_head *head,
int (*cmp)(void *priv, struct list_head *a,
struct list_head *b));
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 294d5d80e150..676d3900e1bd 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -96,13 +96,14 @@ struct memblock {
extern struct memblock memblock;
extern int memblock_debug;
-#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+#ifndef CONFIG_ARCH_KEEP_MEMBLOCK
#define __init_memblock __meminit
#define __initdata_memblock __meminitdata
void memblock_discard(void);
#else
#define __init_memblock
#define __initdata_memblock
+static inline void memblock_discard(void) {}
#endif
#define memblock_dbg(fmt, ...) \
@@ -240,6 +241,47 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid))
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
+ unsigned long *out_spfn,
+ unsigned long *out_epfn);
+/**
+ * for_each_free_mem_pfn_range_in_zone - iterate through zone specific free
+ * memblock areas
+ * @i: u64 used as loop variable
+ * @zone: zone in which all of the memory blocks reside
+ * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ *
+ * Walks over free (memory && !reserved) areas of memblock in a specific
+ * zone. Available once memblock and an empty zone have been initialized.
+ * The main assumption is that the zone start, end, and pgdat have been
+ * associated. This way we can use the zone to determine the NUMA node,
+ * and whether a given part of the memblock is valid for the zone.
+ */
+#define for_each_free_mem_pfn_range_in_zone(i, zone, p_start, p_end) \
+ for (i = 0, \
+ __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end); \
+ i != U64_MAX; \
+ __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end))
+
+/**
+ * for_each_free_mem_pfn_range_in_zone_from - iterate through zone specific
+ * free memblock areas from a given point
+ * @i: u64 used as loop variable
+ * @zone: zone in which all of the memory blocks reside
+ * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ *
+ * Walks over free (memory && !reserved) areas of memblock in a specific
+ * zone, continuing from current position. Available as soon as memblock is
+ * initialized.
+ */
+#define for_each_free_mem_pfn_range_in_zone_from(i, zone, p_start, p_end) \
+ for (; i != U64_MAX; \
+ __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end))
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+
/**
* for_each_free_mem_range - iterate through free memblock areas
* @i: u64 used as loop variable
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index dbb6118370c1..d2f644de7f9d 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -333,6 +333,19 @@ static inline bool mem_cgroup_disabled(void)
return !cgroup_subsys_enabled(memory_cgrp_subsys);
}
+static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg,
+ bool in_low_reclaim)
+{
+ if (mem_cgroup_disabled())
+ return 0;
+
+ if (in_low_reclaim)
+ return READ_ONCE(memcg->memory.emin);
+
+ return max(READ_ONCE(memcg->memory.emin),
+ READ_ONCE(memcg->memory.elow));
+}
+
enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
struct mem_cgroup *memcg);
@@ -501,22 +514,6 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
int zid, int nr_pages);
-unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
- int nid, unsigned int lru_mask);
-
-static inline
-unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
-{
- struct mem_cgroup_per_node *mz;
- unsigned long nr_pages = 0;
- int zid;
-
- mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
- for (zid = 0; zid < MAX_NR_ZONES; zid++)
- nr_pages += mz->lru_zone_size[zid][lru];
- return nr_pages;
-}
-
static inline
unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
enum lru_list lru, int zone_idx)
@@ -531,6 +528,8 @@ void mem_cgroup_handle_over_high(void);
unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);
+unsigned long mem_cgroup_size(struct mem_cgroup *memcg);
+
void mem_cgroup_print_oom_context(struct mem_cgroup *memcg,
struct task_struct *p);
@@ -827,6 +826,12 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm,
{
}
+static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg,
+ bool in_low_reclaim)
+{
+ return 0;
+}
+
static inline enum mem_cgroup_protection mem_cgroup_protected(
struct mem_cgroup *root, struct mem_cgroup *memcg)
{
@@ -960,11 +965,6 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
return true;
}
-static inline unsigned long
-mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
-{
- return 0;
-}
static inline
unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
enum lru_list lru, int zone_idx)
@@ -972,14 +972,12 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
return 0;
}
-static inline unsigned long
-mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
- int nid, unsigned int lru_mask)
+static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
{
return 0;
}
-static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
+static inline unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
{
return 0;
}
@@ -1117,6 +1115,12 @@ static inline void count_memcg_events(struct mem_cgroup *memcg,
{
}
+static inline void __count_memcg_events(struct mem_cgroup *memcg,
+ enum vm_event_item idx,
+ unsigned long count)
+{
+}
+
static inline void count_memcg_page_event(struct page *page,
int idx)
{
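A hedged sketch of how reclaim code might consume the new mem_cgroup_protection() and mem_cgroup_size() helpers; the proportional-scan arithmetic below is illustrative only, not the scheme this series implements:

    /* Sketch: scale a reclaim target by the cgroup's effective protection. */
    static unsigned long scale_scan_target(struct mem_cgroup *memcg,
                                           unsigned long scan,
                                           bool in_low_reclaim)
    {
            unsigned long protection = mem_cgroup_protection(memcg, in_low_reclaim);
            unsigned long usage = mem_cgroup_size(memcg);

            if (!protection || !usage)
                    return scan;

            if (protection >= usage)
                    return 0;       /* fully protected: nothing to scan */

            /* leave the protected share of the cgroup untouched */
            return scan - min(scan, scan * protection / usage);
    }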
diff --git a/include/linux/memory.h b/include/linux/memory.h
index a6ddefc60517..e1dc1bb2b787 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -113,7 +113,7 @@ extern int register_memory_isolate_notifier(struct notifier_block *nb);
extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
int hotplug_memory_register(int nid, struct mem_section *section);
#ifdef CONFIG_MEMORY_HOTREMOVE
-extern int unregister_memory_section(struct mem_section *);
+extern void unregister_memory_section(struct mem_section *);
#endif
extern int memory_dev_init(void);
extern int memory_notify(unsigned long val, void *v);
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 8ade08c50d26..ae892eef8b82 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -54,6 +54,16 @@ enum {
};
/*
+ * Restrictions for memory hotplug:
+ * flags: MHP_ flags
+ * altmap: alternative allocator for memmap array
+ */
+struct mhp_restrictions {
+ unsigned long flags;
+ struct vmem_altmap *altmap;
+};
+
+/*
* Zone resizing functions
*
* Note: any attempt to resize a zone should hold pgdat_resize_lock()
@@ -87,7 +97,8 @@ extern int add_one_highpage(struct page *page, int pfn, int bad_ppro);
extern int online_pages(unsigned long, unsigned long, int);
extern int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
unsigned long *valid_start, unsigned long *valid_end);
-extern void __offline_isolated_pages(unsigned long, unsigned long);
+extern unsigned long __offline_isolated_pages(unsigned long start_pfn,
+ unsigned long end_pfn);
typedef void (*online_page_callback_t)(struct page *page, unsigned int order);
@@ -100,6 +111,8 @@ extern void __online_page_free(struct page *page);
extern int try_online_node(int nid);
+extern int arch_add_memory(int nid, u64 start, u64 size,
+ struct mhp_restrictions *restrictions);
extern u64 max_mem_size;
extern bool memhp_auto_online;
@@ -111,26 +124,33 @@ static inline bool movable_node_is_enabled(void)
}
#ifdef CONFIG_MEMORY_HOTREMOVE
-extern int arch_remove_memory(int nid, u64 start, u64 size,
- struct vmem_altmap *altmap);
-extern int __remove_pages(struct zone *zone, unsigned long start_pfn,
- unsigned long nr_pages, struct vmem_altmap *altmap);
+extern void arch_remove_memory(int nid, u64 start, u64 size,
+ struct vmem_altmap *altmap);
+extern void __remove_pages(struct zone *zone, unsigned long start_pfn,
+ unsigned long nr_pages, struct vmem_altmap *altmap);
#endif /* CONFIG_MEMORY_HOTREMOVE */
+/*
+ * Do we want sysfs memblock files created? This will allow userspace to online
+ * and offline memory explicitly. Lack of this bit means that the caller has to
+ * call move_pfn_range_to_zone to finish the initialization.
+ */
+
+#define MHP_MEMBLOCK_API (1<<0)
+
/* reasonably generic interface to expand the physical pages */
extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
- struct vmem_altmap *altmap, bool want_memblock);
+ struct mhp_restrictions *restrictions);
#ifndef CONFIG_ARCH_HAS_ADD_PAGES
static inline int add_pages(int nid, unsigned long start_pfn,
- unsigned long nr_pages, struct vmem_altmap *altmap,
- bool want_memblock)
+ unsigned long nr_pages, struct mhp_restrictions *restrictions)
{
- return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+ return __add_pages(nid, start_pfn, nr_pages, restrictions);
}
#else /* ARCH_HAS_ADD_PAGES */
int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
- struct vmem_altmap *altmap, bool want_memblock);
+ struct mhp_restrictions *restrictions);
#endif /* ARCH_HAS_ADD_PAGES */
#ifdef CONFIG_NUMA
@@ -331,8 +351,6 @@ extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
extern int __add_memory(int nid, u64 start, u64 size);
extern int add_memory(int nid, u64 start, u64 size);
extern int add_memory_resource(int nid, struct resource *resource);
-extern int arch_add_memory(int nid, u64 start, u64 size,
- struct vmem_altmap *altmap, bool want_memblock);
extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
unsigned long nr_pages, struct vmem_altmap *altmap);
extern bool is_memblock_offlined(struct memory_block *mem);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 083d7b4863ed..0e8834ac32b7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -124,10 +124,45 @@ extern int mmap_rnd_compat_bits __read_mostly;
/*
* On some architectures it is expensive to call memset() for small sizes.
- * Those architectures should provide their own implementation of "struct page"
- * zeroing by defining this macro in <asm/pgtable.h>.
+ * If an architecture decides to implement its own version of
+ * mm_zero_struct_page, it should wrap the defines below in an #ifndef and
+ * define its own version of this macro in <asm/pgtable.h>.
*/
-#ifndef mm_zero_struct_page
+#if BITS_PER_LONG == 64
+/* This function must be updated when the size of struct page grows above 80
+ * or shrinks below 56. The idea is that the compiler optimizes out the
+ * switch() statement and leaves only move/store instructions. The compiler
+ * can also combine writes if they are both assignments and can be reordered,
+ * which can result in several of the writes here being dropped.
+ */
+#define mm_zero_struct_page(pp) __mm_zero_struct_page(pp)
+static inline void __mm_zero_struct_page(struct page *page)
+{
+ unsigned long *_pp = (void *)page;
+
+ /* Check that struct page is either 56, 64, 72, or 80 bytes */
+ BUILD_BUG_ON(sizeof(struct page) & 7);
+ BUILD_BUG_ON(sizeof(struct page) < 56);
+ BUILD_BUG_ON(sizeof(struct page) > 80);
+
+ switch (sizeof(struct page)) {
+ case 80:
+ _pp[9] = 0; /* fallthrough */
+ case 72:
+ _pp[8] = 0; /* fallthrough */
+ case 64:
+ _pp[7] = 0; /* fallthrough */
+ case 56:
+ _pp[6] = 0;
+ _pp[5] = 0;
+ _pp[4] = 0;
+ _pp[3] = 0;
+ _pp[2] = 0;
+ _pp[1] = 0;
+ _pp[0] = 0;
+ }
+}
+#else
#define mm_zero_struct_page(pp) ((void)memset((pp), 0, sizeof(struct page)))
#endif
@@ -501,9 +536,6 @@ static inline void vma_set_anonymous(struct vm_area_struct *vma)
struct mmu_gather;
struct inode;
-#define page_private(page) ((page)->private)
-#define set_page_private(page, v) ((page)->private = (v))
-
#if !defined(__HAVE_ARCH_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE)
static inline int pmd_devmap(pmd_t pmd)
{
@@ -1007,6 +1039,30 @@ static inline void put_page(struct page *page)
__put_page(page);
}
+/**
+ * put_user_page() - release a gup-pinned page
+ * @page: pointer to page to be released
+ *
+ * Pages that were pinned via get_user_pages*() must be released via
+ * either put_user_page(), or one of the put_user_pages*() routines
+ * below. This is so that eventually, pages that are pinned via
+ * get_user_pages*() can be separately tracked and uniquely handled. In
+ * particular, interactions with RDMA and filesystems need special
+ * handling.
+ *
+ * put_user_page() and put_page() are not interchangeable, despite this early
+ * implementation that makes them look the same. put_user_page() calls must
+ * be perfectly matched up with get_user_page() calls.
+ */
+static inline void put_user_page(struct page *page)
+{
+ put_page(page);
+}
+
+void put_user_pages_dirty(struct page **pages, unsigned long npages);
+void put_user_pages_dirty_lock(struct page **pages, unsigned long npages);
+void put_user_pages(struct page **pages, unsigned long npages);
+
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define SECTION_IN_PAGE_FLAGS
#endif
@@ -1505,21 +1561,8 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags);
-#if defined(CONFIG_FS_DAX) || defined(CONFIG_CMA)
-long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas);
-#else
-static inline long get_user_pages_longterm(unsigned long start,
- unsigned long nr_pages, unsigned int gup_flags,
- struct page **pages, struct vm_area_struct **vmas)
-{
- return get_user_pages(start, nr_pages, gup_flags, pages, vmas);
-}
-#endif /* CONFIG_FS_DAX */
-
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages);
+int get_user_pages_fast(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages);
/* Container for pinned pfns / pages */
struct frame_vector {
@@ -2533,6 +2576,10 @@ struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t);
int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
+int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
+ unsigned long num);
+int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
+ unsigned long num);
vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn);
vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
@@ -2583,6 +2630,34 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
#define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */
#define FOLL_COW 0x4000 /* internal GUP flag */
#define FOLL_ANON 0x8000 /* don't do file mappings */
+#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */
+
+/*
+ * NOTE on FOLL_LONGTERM:
+ *
+ * FOLL_LONGTERM indicates that the page will be held for an indefinite time
+ * period, _often_ under userspace control. This is in contrast to
+ * iov_iter_get_pages(), whose usages are transient.
+ *
+ * FIXME: For pages which are part of a filesystem, mappings are subject to the
+ * lifetime enforced by the filesystem and we need guarantees that longterm
+ * users like RDMA and V4L2 only establish mappings which coordinate usage with
+ * the filesystem. Ideas for this coordination include revoking the longterm
+ * pin, delaying writeback, bounce buffer page writeback, etc. Because FS DAX
+ * was added after the problem with filesystems was found, FS DAX VMAs are
+ * specifically failed. Filesystem pages are still subject to bugs and use of
+ * FOLL_LONGTERM should be avoided on those pages.
+ *
+ * FIXME: Also NOTE that FOLL_LONGTERM is not supported in every GUP call.
+ * Currently only get_user_pages() and get_user_pages_fast() support this flag
+ * and calls to get_user_pages_[un]locked are specifically not allowed. This
+ * is due to an incompatibility with the FS DAX check and
+ * FAULT_FLAG_ALLOW_RETRY.
+ *
+ * In the CMA case: longterm pins in a CMA region would unnecessarily fragment
+ * that region. And so CMA attempts to migrate the page before pinning when
+ * FOLL_LONGTERM is specified.
+ */
static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
{
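Taken together with the new gup_flags argument to get_user_pages_fast(), a long-lived pin might look roughly like the sketch below (error handling trimmed; the buffer-registration context is hypothetical, not part of this series):

    /* Sketch: pin user pages for a long-lived (e.g. RDMA-style) mapping. */
    static int pin_user_buffer(unsigned long start, int nr_pages,
                               struct page **pages)
    {
            int pinned;

            pinned = get_user_pages_fast(start, nr_pages,
                                         FOLL_WRITE | FOLL_LONGTERM, pages);
            if (pinned < 0)
                    return pinned;

            /* ... use the pages ... */

            /* release with the new helpers, not put_page() */
            put_user_pages_dirty_lock(pages, pinned);
            return 0;
    }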
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 04ec454d44ce..6f2fef7b0784 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -29,7 +29,7 @@ static __always_inline void __update_lru_size(struct lruvec *lruvec,
{
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- __mod_node_page_state(pgdat, NR_LRU_BASE + lru, nr_pages);
+ __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
__mod_zone_page_state(&pgdat->node_zones[zid],
NR_ZONE_LRU_BASE + lru, nr_pages);
}
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 4ef4bbe78a1d..1815fbc40926 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -103,7 +103,7 @@ struct page {
};
struct { /* slab, slob and slub */
union {
- struct list_head slab_list; /* uses lru */
+ struct list_head slab_list;
struct { /* Partial pages */
struct page *next;
#ifdef CONFIG_64BIT
@@ -220,6 +220,9 @@ struct page {
#define PAGE_FRAG_CACHE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK)
#define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE)
+#define page_private(page) ((page)->private)
+#define set_page_private(page, v) ((page)->private = (v))
+
struct page_frag_cache {
void * va;
#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
@@ -410,7 +413,7 @@ struct mm_struct {
unsigned long hiwater_vm; /* High-water virtual memory usage */
unsigned long total_vm; /* Total pages mapped */
- unsigned long locked_vm; /* Pages that have PG_mlocked set */
+ atomic64_t locked_vm; /* Pages that have PG_mlocked set */
atomic64_t pinned_vm; /* Refcount permanently increased */
unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */
unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */
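With locked_vm now an atomic64_t, callers that used plain additions and reads need the atomic64 accessors; a hedged sketch of the conversion pattern, with an illustrative limit check:

    /* Sketch: account locked pages against a pages-based limit atomically. */
    static int account_locked_vm_sketch(struct mm_struct *mm, unsigned long npages,
                                        unsigned long lock_limit_pages)
    {
            unsigned long locked;

            locked = atomic64_add_return(npages, &mm->locked_vm);
            if (locked > lock_limit_pages && !capable(CAP_IPC_LOCK)) {
                    atomic64_sub(npages, &mm->locked_vm);
                    return -ENOMEM;
            }
            return 0;
    }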
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 4050ec1c3b45..b6c004bd9f6a 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -10,6 +10,36 @@
struct mmu_notifier;
struct mmu_notifier_ops;
+/**
+ * enum mmu_notifier_event - reason for the mmu notifier callback
+ * @MMU_NOTIFY_UNMAP: either munmap() that unmaps the range or a mremap() that
+ * moves the range
+ *
+ * @MMU_NOTIFY_CLEAR: clear page table entry (many reasons for this like
+ * madvise() or replacing a page by another one, ...).
+ *
+ * @MMU_NOTIFY_PROTECTION_VMA: update is due to a protection change for the
+ * range, i.e. using the vma access permission (vm_page_prot) to update the
+ * whole range is enough; there is no need to inspect changes to the CPU page
+ * table (mprotect() syscall)
+ *
+ * @MMU_NOTIFY_PROTECTION_PAGE: update is due to a change in the read/write
+ * flags for pages in the range, so to mirror those changes the user must
+ * inspect the CPU page table (from the end callback).
+ *
+ * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still the same page and the
+ * same access flags). The user should soft dirty the page in the end callback
+ * to make sure that anyone relying on soft dirtiness catches pages that might
+ * be written through non-CPU mappings.
+ */
+enum mmu_notifier_event {
+ MMU_NOTIFY_UNMAP = 0,
+ MMU_NOTIFY_CLEAR,
+ MMU_NOTIFY_PROTECTION_VMA,
+ MMU_NOTIFY_PROTECTION_PAGE,
+ MMU_NOTIFY_SOFT_DIRTY,
+};
+
#ifdef CONFIG_MMU_NOTIFIER
/*
@@ -25,11 +55,15 @@ struct mmu_notifier_mm {
spinlock_t lock;
};
+#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)
+
struct mmu_notifier_range {
+ struct vm_area_struct *vma;
struct mm_struct *mm;
unsigned long start;
unsigned long end;
- bool blockable;
+ unsigned flags;
+ enum mmu_notifier_event event;
};
struct mmu_notifier_ops {
@@ -225,6 +259,14 @@ extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r,
bool only_end);
extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
unsigned long start, unsigned long end);
+extern bool
+mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range);
+
+static inline bool
+mmu_notifier_range_blockable(const struct mmu_notifier_range *range)
+{
+ return (range->flags & MMU_NOTIFIER_RANGE_BLOCKABLE);
+}
static inline void mmu_notifier_release(struct mm_struct *mm)
{
@@ -269,7 +311,7 @@ static inline void
mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
{
if (mm_has_notifiers(range->mm)) {
- range->blockable = true;
+ range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE;
__mmu_notifier_invalidate_range_start(range);
}
}
@@ -278,7 +320,7 @@ static inline int
mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range)
{
if (mm_has_notifiers(range->mm)) {
- range->blockable = false;
+ range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE;
return __mmu_notifier_invalidate_range_start(range);
}
return 0;
@@ -318,13 +360,19 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
static inline void mmu_notifier_range_init(struct mmu_notifier_range *range,
+ enum mmu_notifier_event event,
+ unsigned flags,
+ struct vm_area_struct *vma,
struct mm_struct *mm,
unsigned long start,
unsigned long end)
{
+ range->vma = vma;
+ range->event = event;
range->mm = mm;
range->start = start;
range->end = end;
+ range->flags = flags;
}
#define ptep_clear_flush_young_notify(__vma, __address, __ptep) \
@@ -452,9 +500,14 @@ static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range,
range->end = end;
}
-#define mmu_notifier_range_init(range, mm, start, end) \
+#define mmu_notifier_range_init(range,event,flags,vma,mm,start,end) \
_mmu_notifier_range_init(range, start, end)
+static inline bool
+mmu_notifier_range_blockable(const struct mmu_notifier_range *range)
+{
+ return true;
+}
static inline int mm_has_notifiers(struct mm_struct *mm)
{
@@ -517,6 +570,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
{
}
+#define mmu_notifier_range_update_to_read_only(r) false
+
#define ptep_clear_flush_young_notify ptep_clear_flush_young
#define pmdp_clear_flush_young_notify pmdp_clear_flush_young
#define ptep_clear_young_notify ptep_test_and_clear_young
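A sketch of the two sides of the reworked interface: callers now pass the event, flags and vma to mmu_notifier_range_init(), and notifier implementations test blockability through the helper rather than a bool field. Both bodies below are illustrative stand-ins, not code from this series:

    /* Caller side: invalidate a range with the extended init signature. */
    static void invalidate_range_sketch(struct vm_area_struct *vma,
                                        unsigned long start, unsigned long end)
    {
            struct mmu_notifier_range range;

            mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
                                    start, end);
            mmu_notifier_invalidate_range_start(&range);
            /* ... clear the page table entries ... */
            mmu_notifier_invalidate_range_end(&range);
    }

    /* Notifier side: honour non-blockable invalidations. */
    static int sketch_invalidate_start(struct mmu_notifier *mn,
                                       const struct mmu_notifier_range *range)
    {
            if (!mmu_notifier_range_blockable(range))
                    return -EAGAIN;
            /* ... work that may sleep goes here ... */
            return 0;
    }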
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index fba7741533be..70394cabaf4e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -18,6 +18,8 @@
#include <linux/pageblock-flags.h>
#include <linux/page-flags-layout.h>
#include <linux/atomic.h>
+#include <linux/mm_types.h>
+#include <linux/page-flags.h>
#include <asm/page.h>
/* Free memory management - zoned buddy allocator. */
@@ -98,6 +100,62 @@ struct free_area {
unsigned long nr_free;
};
+/* Used for pages not on another list */
+static inline void add_to_free_area(struct page *page, struct free_area *area,
+ int migratetype)
+{
+ list_add(&page->lru, &area->free_list[migratetype]);
+ area->nr_free++;
+}
+
+/* Used for pages not on another list */
+static inline void add_to_free_area_tail(struct page *page, struct free_area *area,
+ int migratetype)
+{
+ list_add_tail(&page->lru, &area->free_list[migratetype]);
+ area->nr_free++;
+}
+
+#ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR
+/* Used to preserve page allocation order entropy */
+void add_to_free_area_random(struct page *page, struct free_area *area,
+ int migratetype);
+#else
+static inline void add_to_free_area_random(struct page *page,
+ struct free_area *area, int migratetype)
+{
+ add_to_free_area(page, area, migratetype);
+}
+#endif
+
+/* Used for pages which are on another list */
+static inline void move_to_free_area(struct page *page, struct free_area *area,
+ int migratetype)
+{
+ list_move(&page->lru, &area->free_list[migratetype]);
+}
+
+static inline struct page *get_page_from_free_area(struct free_area *area,
+ int migratetype)
+{
+ return list_first_entry_or_null(&area->free_list[migratetype],
+ struct page, lru);
+}
+
+static inline void del_page_from_free_area(struct page *page,
+ struct free_area *area)
+{
+ list_del(&page->lru);
+ __ClearPageBuddy(page);
+ set_page_private(page, 0);
+ area->nr_free--;
+}
+
+static inline bool free_area_empty(struct free_area *area, int migratetype)
+{
+ return list_empty(&area->free_list[migratetype]);
+}
+
struct pglist_data;
/*
@@ -247,11 +305,6 @@ struct lruvec {
#endif
};
-/* Mask used at gathering information at once (see memcontrol.c) */
-#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
-#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
-#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
-
/* Isolate unmapped file */
#define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x2)
/* Isolate for asynchronous migration */
@@ -1276,6 +1329,7 @@ void sparse_init(void);
#else
#define sparse_init() do {} while (0)
#define sparse_index_init(_sec, _nid) do {} while (0)
+#define pfn_present pfn_valid
#endif /* CONFIG_SPARSEMEM */
/*
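A hedged sketch of how the buddy allocator's hot paths might use the new free_area helpers instead of open-coded list operations; the wrapper functions here are illustrative only:

    /* Sketch: take the first page of a given migratetype off a free area. */
    static struct page *take_free_page(struct free_area *area, int migratetype)
    {
            struct page *page;

            page = get_page_from_free_area(area, migratetype);
            if (!page)
                    return NULL;

            del_page_from_free_area(page, area);
            return page;
    }

    /* Sketch: give a freed page back, preserving shuffle entropy when enabled. */
    static void return_free_page(struct page *page, struct free_area *area,
                                 int migratetype)
    {
            add_to_free_area_random(page, area, migratetype);
    }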
diff --git a/include/linux/net.h b/include/linux/net.h
index 50bf5206ead6..60db423a2d94 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -268,7 +268,7 @@ do { \
#define net_info_ratelimited(fmt, ...) \
net_ratelimited_function(pr_info, fmt, ##__VA_ARGS__)
#if defined(CONFIG_DYNAMIC_DEBUG)
-#define net_dbg_ratelimited(fmt, ...) \
+#define _net_dbg_ratelimited(descriptor, fmt, ...) \
do { \
DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
if (DYNAMIC_DEBUG_BRANCH(descriptor) && \
@@ -276,6 +276,8 @@ do { \
__dynamic_pr_debug(&descriptor, pr_fmt(fmt), \
##__VA_ARGS__); \
} while (0)
+#define net_dbg_ratelimited(fmt, ...) \
+ _net_dbg_ratelimited(__UNIQUE_ID(ddebug), fmt, ##__VA_ARGS__)
#elif defined(DEBUG)
#define net_dbg_ratelimited(fmt, ...) \
net_ratelimited_function(pr_debug, fmt, ##__VA_ARGS__)
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index bcf909d0de5f..2e8438a1216a 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -333,6 +333,19 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
mapping_gfp_mask(mapping));
}
+static inline struct page *find_subpage(struct page *page, pgoff_t offset)
+{
+ unsigned long mask;
+
+ if (PageHuge(page))
+ return page;
+
+ VM_BUG_ON_PAGE(PageTail(page), page);
+
+ mask = (1UL << compound_order(page)) - 1;
+ return page + (offset & mask);
+}
+
struct page *find_get_entry(struct address_space *mapping, pgoff_t offset);
struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset);
unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
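For context, a hedged sketch of where find_subpage() is meant to sit in a page-cache lookup: the cache may hold a compound (e.g. THP) page, and the caller wants the subpage covering a particular index. cache_lookup_head() is a hypothetical stand-in for the xarray lookup:

    /* Sketch: narrow the compound page a lookup returned to the page for @offset. */
    static struct page *get_cached_subpage(struct address_space *mapping,
                                           pgoff_t offset)
    {
            struct page *head = cache_lookup_head(mapping, offset);

            if (!head)
                    return NULL;
            /* hugetlbfs pages are handled inside find_subpage() itself */
            return find_subpage(head, offset);
    }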
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 3c8ef5a199ca..0be5829ddd80 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -3,6 +3,7 @@
#define _LINUX_PID_H
#include <linux/rculist.h>
+#include <linux/refcount.h>
enum pid_type
{
@@ -56,7 +57,7 @@ struct upid {
struct pid
{
- atomic_t count;
+ refcount_t count;
unsigned int level;
/* lists of tasks that use this pid */
struct hlist_head tasks[PIDTYPE_MAX];
@@ -71,7 +72,7 @@ extern const struct file_operations pidfd_fops;
static inline struct pid *get_pid(struct pid *pid)
{
if (pid)
- atomic_inc(&pid->count);
+ refcount_inc(&pid->count);
return pid;
}
diff --git a/include/linux/plist.h b/include/linux/plist.h
index 97883604a3c5..9365df5a823f 100644
--- a/include/linux/plist.h
+++ b/include/linux/plist.h
@@ -231,7 +231,7 @@ static inline int plist_node_empty(const struct plist_node *node)
* @type: the type of the struct this is embedded in
* @member: the name of the list_head within the struct
*/
-#ifdef CONFIG_DEBUG_PI_LIST
+#ifdef CONFIG_DEBUG_PLIST
# define plist_first_entry(head, type, member) \
({ \
WARN_ON(plist_head_empty(head)); \
@@ -248,7 +248,7 @@ static inline int plist_node_empty(const struct plist_node *node)
* @type: the type of the struct this is embedded in
* @member: the name of the list_head within the struct
*/
-#ifdef CONFIG_DEBUG_PI_LIST
+#ifdef CONFIG_DEBUG_PLIST
# define plist_last_entry(head, type, member) \
({ \
WARN_ON(plist_head_empty(head)); \
diff --git a/include/linux/poll.h b/include/linux/poll.h
index 7e0fdcf905d2..1cdc32b1f1b0 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -16,7 +16,11 @@
extern struct ctl_table epoll_table[]; /* for sysctl */
/* ~832 bytes of stack space used max in sys_select/sys_poll before allocating
additional memory. */
+#ifdef __clang__
+#define MAX_STACK_ALLOC 768
+#else
#define MAX_STACK_ALLOC 832
+#endif
#define FRONTEND_STACK_ALLOC 256
#define SELECT_STACK_ALLOC FRONTEND_STACK_ALLOC
#define POLL_STACK_ALLOC FRONTEND_STACK_ALLOC
diff --git a/include/linux/pps-gpio.h b/include/linux/pps-gpio.h
index 56f35dd3d01d..44171e6b7197 100644
--- a/include/linux/pps-gpio.h
+++ b/include/linux/pps-gpio.h
@@ -23,10 +23,11 @@
#define _PPS_GPIO_H
struct pps_gpio_platform_data {
+ struct gpio_desc *gpio_pin;
+ struct gpio_desc *echo_pin;
bool assert_falling_edge;
bool capture_clear;
- unsigned int gpio_pin;
- const char *gpio_label;
+ unsigned int echo_active_ms;
};
#endif /* _PPS_GPIO_H */
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 84ea4d094af3..54499c1a74fd 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -82,6 +82,8 @@ static inline void console_verbose(void)
extern char devkmsg_log_str[];
struct ctl_table;
+extern int suppress_printk;
+
struct va_format {
const char *fmt;
va_list *va;
@@ -454,7 +456,7 @@ extern int kptr_restrict;
/* If you are writing a driver, please use dev_dbg instead */
#if defined(CONFIG_DYNAMIC_DEBUG)
/* descriptor check is first to prevent flooding with "callbacks suppressed" */
-#define pr_debug_ratelimited(fmt, ...) \
+#define _pr_debug_ratelimited(descriptor, fmt, ...) \
do { \
static DEFINE_RATELIMIT_STATE(_rs, \
DEFAULT_RATELIMIT_INTERVAL, \
@@ -464,6 +466,8 @@ do { \
__ratelimit(&_rs)) \
__dynamic_pr_debug(&descriptor, pr_fmt(fmt), ##__VA_ARGS__); \
} while (0)
+#define pr_debug_ratelimited(fmt, ...) \
+ _pr_debug_ratelimited(__UNIQUE_ID(ddebug), fmt, ##__VA_ARGS__)
#elif defined(DEBUG)
#define pr_debug_ratelimited(fmt, ...) \
printk_ratelimited(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
diff --git a/include/linux/psi.h b/include/linux/psi.h
index 7006008d5b72..af892c290116 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -4,6 +4,7 @@
#include <linux/jump_label.h>
#include <linux/psi_types.h>
#include <linux/sched.h>
+#include <linux/poll.h>
struct seq_file;
struct css_set;
@@ -26,6 +27,13 @@ int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);
int psi_cgroup_alloc(struct cgroup *cgrp);
void psi_cgroup_free(struct cgroup *cgrp);
void cgroup_move_task(struct task_struct *p, struct css_set *to);
+
+struct psi_trigger *psi_trigger_create(struct psi_group *group,
+ char *buf, size_t nbytes, enum psi_res res);
+void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *t);
+
+__poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
+ poll_table *wait);
#endif
#else /* CONFIG_PSI */
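A hedged sketch of the trigger lifecycle these declarations imply, roughly as a pressure-file implementation might use it; the file/private_data plumbing is simplified, and the trigger string follows the "<some|full> <threshold_us> <window_us>" format described in the psi documentation:

    /* Sketch: create a trigger from user input and wire it up for poll(). */
    static ssize_t pressure_write_sketch(struct psi_group *group,
                                         void **trigger_slot,
                                         char *buf, size_t nbytes)
    {
            struct psi_trigger *new;

            /* e.g. buf = "some 150000 1000000": 150ms of stall in a 1s window */
            new = psi_trigger_create(group, buf, nbytes, PSI_MEM);
            if (IS_ERR(new))
                    return PTR_ERR(new);

            psi_trigger_replace(trigger_slot, new); /* drops any old trigger */
            return nbytes;
    }

    static __poll_t pressure_poll_sketch(void **trigger_slot, struct file *file,
                                         poll_table *wait)
    {
            return psi_trigger_poll(trigger_slot, file, wait);
    }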
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 2cf422db5d18..07aaf9b82241 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -1,8 +1,11 @@
#ifndef _LINUX_PSI_TYPES_H
#define _LINUX_PSI_TYPES_H
+#include <linux/kthread.h>
#include <linux/seqlock.h>
#include <linux/types.h>
+#include <linux/kref.h>
+#include <linux/wait.h>
#ifdef CONFIG_PSI
@@ -11,7 +14,7 @@ enum psi_task_count {
NR_IOWAIT,
NR_MEMSTALL,
NR_RUNNING,
- NR_PSI_TASK_COUNTS,
+ NR_PSI_TASK_COUNTS = 3,
};
/* Task state bitmasks */
@@ -24,7 +27,7 @@ enum psi_res {
PSI_IO,
PSI_MEM,
PSI_CPU,
- NR_PSI_RESOURCES,
+ NR_PSI_RESOURCES = 3,
};
/*
@@ -41,7 +44,13 @@ enum psi_states {
PSI_CPU_SOME,
/* Only per-CPU, to weigh the CPU in the global average: */
PSI_NONIDLE,
- NR_PSI_STATES,
+ NR_PSI_STATES = 6,
+};
+
+enum psi_aggregators {
+ PSI_AVGS = 0,
+ PSI_POLL,
+ NR_PSI_AGGREGATORS,
};
struct psi_group_cpu {
@@ -53,6 +62,9 @@ struct psi_group_cpu {
/* States of the tasks belonging to this group */
unsigned int tasks[NR_PSI_TASK_COUNTS];
+ /* Aggregate pressure state derived from the tasks */
+ u32 state_mask;
+
/* Period time sampling buckets for each state of interest (ns) */
u32 times[NR_PSI_STATES];
@@ -62,25 +74,94 @@ struct psi_group_cpu {
/* 2nd cacheline updated by the aggregator */
/* Delta detection against the sampling buckets */
- u32 times_prev[NR_PSI_STATES] ____cacheline_aligned_in_smp;
+ u32 times_prev[NR_PSI_AGGREGATORS][NR_PSI_STATES]
+ ____cacheline_aligned_in_smp;
+};
+
+/* PSI growth tracking window */
+struct psi_window {
+ /* Window size in ns */
+ u64 size;
+
+ /* Start time of the current window in ns */
+ u64 start_time;
+
+ /* Value at the start of the window */
+ u64 start_value;
+
+ /* Value growth in the previous window */
+ u64 prev_growth;
+};
+
+struct psi_trigger {
+ /* PSI state being monitored by the trigger */
+ enum psi_states state;
+
+ /* User-specified threshold in ns */
+ u64 threshold;
+
+ /* List node inside triggers list */
+ struct list_head node;
+
+ /* Backpointer needed during trigger destruction */
+ struct psi_group *group;
+
+ /* Wait queue for polling */
+ wait_queue_head_t event_wait;
+
+ /* Pending event flag */
+ int event;
+
+ /* Tracking window */
+ struct psi_window win;
+
+ /*
+ * Time last event was generated. Used for rate-limiting
+ * events to one per window
+ */
+ u64 last_event_time;
+
+ /* Refcounting to prevent premature destruction */
+ struct kref refcount;
};
struct psi_group {
- /* Protects data updated during an aggregation */
- struct mutex stat_lock;
+ /* Protects data used by the aggregator */
+ struct mutex avgs_lock;
/* Per-cpu task state & time tracking */
struct psi_group_cpu __percpu *pcpu;
- /* Periodic aggregation state */
- u64 total_prev[NR_PSI_STATES - 1];
- u64 last_update;
- u64 next_update;
- struct delayed_work clock_work;
+ /* Running pressure averages */
+ u64 avg_total[NR_PSI_STATES - 1];
+ u64 avg_last_update;
+ u64 avg_next_update;
+
+ /* Aggregator work control */
+ struct delayed_work avgs_work;
/* Total stall times and sampled pressure averages */
- u64 total[NR_PSI_STATES - 1];
+ u64 total[NR_PSI_AGGREGATORS][NR_PSI_STATES - 1];
unsigned long avg[NR_PSI_STATES - 1][3];
+
+ /* Monitor work control */
+ atomic_t poll_scheduled;
+ struct kthread_worker __rcu *poll_kworker;
+ struct kthread_delayed_work poll_work;
+
+ /* Protects data used by the monitor */
+ struct mutex trigger_lock;
+
+ /* Configured polling triggers */
+ struct list_head triggers;
+ u32 nr_triggers[NR_PSI_STATES - 1];
+ u32 poll_states;
+ u64 poll_min_period;
+
+ /* Total stall times at the start of monitor activation */
+ u64 polling_total[NR_PSI_STATES - 1];
+ u64 polling_next_update;
+ u64 polling_until;
};
#else /* CONFIG_PSI */
diff --git a/include/linux/qcom-geni-se.h b/include/linux/qcom-geni-se.h
index 3bcd67fd5548..dd464943f717 100644
--- a/include/linux/qcom-geni-se.h
+++ b/include/linux/qcom-geni-se.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2017-2018, The Linux Foundation. All rights reserved.
*/
diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index e63799a6e895..3734cd8f38a8 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -14,6 +14,7 @@ struct device;
#define SYS_POWER_OFF 0x0003 /* Notify of system power off */
enum reboot_mode {
+ REBOOT_UNDEFINED = -1,
REBOOT_COLD = 0,
REBOOT_WARM,
REBOOT_HARD,
@@ -21,6 +22,7 @@ enum reboot_mode {
REBOOT_GPIO,
};
extern enum reboot_mode reboot_mode;
+extern enum reboot_mode panic_reboot_mode;
enum reboot_type {
BOOT_TRIPLE = 't',
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a2cd15855bad..9bc2ba94382b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -26,7 +26,6 @@
#include <linux/latencytop.h>
#include <linux/sched/prio.h>
#include <linux/signal_types.h>
-#include <linux/psi_types.h>
#include <linux/mm_types_task.h>
#include <linux/task_io_accounting.h>
#include <linux/rseq.h>
@@ -1161,6 +1160,9 @@ struct task_struct {
/* Used by memcontrol for targeted memcg charge: */
struct mem_cgroup *active_memcg;
+
+ /* Used by memcontrol for high reclaim: */
+ struct mem_cgroup *memcg_high_reclaim;
#endif
#ifdef CONFIG_BLK_CGROUP
@@ -1201,6 +1203,13 @@ struct task_struct {
unsigned long prev_lowest_stack;
#endif
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ unsigned long getblk_stamp;
+ unsigned int getblk_executed;
+ unsigned int getblk_bh_count;
+ unsigned long getblk_bh_state;
+#endif
+
/*
* New fields for task_struct should be added above here, so that
* they are included in the randomized portion of task_struct.
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 2b70130af585..de29cca9c297 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -263,6 +263,40 @@ extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count);
#define probe_kernel_address(addr, retval) \
probe_kernel_read(&retval, addr, sizeof(retval))
+/**
+ * probe_user_read(): safely attempt to read from a user location
+ * @dst: pointer to the buffer that shall take the data
+ * @src: address to read from
+ * @size: size of the data chunk
+ *
+ * Safely read from address @src to the buffer at @dst. If a kernel fault
+ * happens, handle that and return -EFAULT.
+ *
+ * We ensure that the copy_from_user is executed in atomic context so that
+ * do_page_fault() doesn't attempt to take mmap_sem. This makes
+ * probe_user_read() suitable for use within regions where the caller
+ * already holds mmap_sem, or other locks which nest inside mmap_sem.
+ *
+ * Returns 0 on success, -EFAULT on error.
+ */
+
+#ifndef probe_user_read
+static __always_inline long probe_user_read(void *dst, const void __user *src,
+ size_t size)
+{
+ long ret;
+
+ if (!access_ok(src, size))
+ return -EFAULT;
+
+ pagefault_disable();
+ ret = __copy_from_user_inatomic(dst, src, size);
+ pagefault_enable();
+
+ return ret ? -EFAULT : 0;
+}
+#endif
+
#ifndef user_access_begin
#define user_access_begin(ptr,len) access_ok(ptr, len)
#define user_access_end() do { } while (0)
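A minimal sketch of the intended use of probe_user_read(): reading a word from a user address in a context where a page fault must not be serviced (for instance while mmap_sem is already held), so plain copy_from_user() would be unsafe. The surrounding helper is hypothetical:

    /* Sketch: safely peek at a user-space value from fault-averse context. */
    static int read_user_word(const void __user *uptr, unsigned long *out)
    {
            unsigned long val;

            if (probe_user_read(&val, uptr, sizeof(val)))
                    return -EFAULT; /* fault or failed access_ok() */

            *out = val;
            return 0;
    }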
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 37c9eba75c98..ac9d71e24b81 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -28,6 +28,8 @@
#define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
#define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS)
+extern int sysctl_unprivileged_userfaultfd;
+
extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index c6eebb839552..9b21d0047710 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -50,12 +50,16 @@ struct vm_struct {
struct vmap_area {
unsigned long va_start;
unsigned long va_end;
+
+ /*
+ * Largest available free size in subtree.
+ */
+ unsigned long subtree_max_size;
unsigned long flags;
struct rb_node rb_node; /* address sorted rbtree */
struct list_head list; /* address sorted list */
struct llist_node purge_list; /* "lazy purge" list */
struct vm_struct *vm;
- struct rcu_head rcu_head;
};
/*
@@ -68,10 +72,12 @@ extern void vm_unmap_aliases(void);
#ifdef CONFIG_MMU
extern void __init vmalloc_init(void);
+extern unsigned long vmalloc_nr_pages(void);
#else
static inline void vmalloc_init(void)
{
}
+static inline unsigned long vmalloc_nr_pages(void) { return 0; }
#endif
extern void *vmalloc(unsigned long size);
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 2db8d60981fe..bdeda4b079fe 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -26,7 +26,7 @@ struct reclaim_stat {
unsigned nr_congested;
unsigned nr_writeback;
unsigned nr_immediate;
- unsigned nr_activate;
+ unsigned nr_activate[2];
unsigned nr_ref_keep;
unsigned nr_unmap_fail;
};
diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
index 6074eff3d766..e5bf6ee4e814 100644
--- a/include/trace/events/compaction.h
+++ b/include/trace/events/compaction.h
@@ -64,6 +64,7 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages,
TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken)
);
+#ifdef CONFIG_COMPACTION
TRACE_EVENT(mm_compaction_migratepages,
TP_PROTO(unsigned long nr_all,
@@ -132,7 +133,6 @@ TRACE_EVENT(mm_compaction_begin,
__entry->sync ? "sync" : "async")
);
-#ifdef CONFIG_COMPACTION
TRACE_EVENT(mm_compaction_end,
TP_PROTO(unsigned long zone_start, unsigned long migrate_pfn,
unsigned long free_pfn, unsigned long zone_end, bool sync,
@@ -166,7 +166,6 @@ TRACE_EVENT(mm_compaction_end,
__entry->sync ? "sync" : "async",
__print_symbolic(__entry->status, COMPACTION_STATUS))
);
-#endif
TRACE_EVENT(mm_compaction_try_to_compact_pages,
@@ -189,13 +188,12 @@ TRACE_EVENT(mm_compaction_try_to_compact_pages,
__entry->prio = prio;
),
- TP_printk("order=%d gfp_mask=0x%x priority=%d",
+ TP_printk("order=%d gfp_mask=%s priority=%d",
__entry->order,
- __entry->gfp_mask,
+ show_gfp_flags(__entry->gfp_mask),
__entry->prio)
);
-#ifdef CONFIG_COMPACTION
DECLARE_EVENT_CLASS(mm_compaction_suitable_template,
TP_PROTO(struct zone *zone,
@@ -296,7 +294,6 @@ DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_defer_reset,
TP_ARGS(zone, order)
);
-#endif
TRACE_EVENT(mm_compaction_kcompactd_sleep,
@@ -352,6 +349,7 @@ DEFINE_EVENT(kcompactd_wake_template, mm_compaction_kcompactd_wake,
TP_ARGS(nid, order, classzone_idx)
);
+#endif
#endif /* _TRACE_COMPACTION_H */
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index 252327dbfa51..a5ab2973e8dc 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -27,17 +27,11 @@
{RECLAIM_WB_ASYNC, "RECLAIM_WB_ASYNC"} \
) : "RECLAIM_WB_NONE"
-#define trace_reclaim_flags(page) ( \
- (page_is_file_cache(page) ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \
+#define trace_reclaim_flags(file) ( \
+ (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \
(RECLAIM_WB_ASYNC) \
)
-#define trace_shrink_flags(file) \
- ( \
- (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \
- (RECLAIM_WB_ASYNC) \
- )
-
TRACE_EVENT(mm_vmscan_kswapd_sleep,
TP_PROTO(int nid),
@@ -73,7 +67,9 @@ TRACE_EVENT(mm_vmscan_kswapd_wake,
__entry->order = order;
),
- TP_printk("nid=%d zid=%d order=%d", __entry->nid, __entry->zid, __entry->order)
+ TP_printk("nid=%d order=%d",
+ __entry->nid,
+ __entry->order)
);
TRACE_EVENT(mm_vmscan_wakeup_kswapd,
@@ -96,60 +92,53 @@ TRACE_EVENT(mm_vmscan_wakeup_kswapd,
__entry->gfp_flags = gfp_flags;
),
- TP_printk("nid=%d zid=%d order=%d gfp_flags=%s",
+ TP_printk("nid=%d order=%d gfp_flags=%s",
__entry->nid,
- __entry->zid,
__entry->order,
show_gfp_flags(__entry->gfp_flags))
);
DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template,
- TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx),
+ TP_PROTO(int order, gfp_t gfp_flags),
- TP_ARGS(order, may_writepage, gfp_flags, classzone_idx),
+ TP_ARGS(order, gfp_flags),
TP_STRUCT__entry(
__field( int, order )
- __field( int, may_writepage )
__field( gfp_t, gfp_flags )
- __field( int, classzone_idx )
),
TP_fast_assign(
__entry->order = order;
- __entry->may_writepage = may_writepage;
__entry->gfp_flags = gfp_flags;
- __entry->classzone_idx = classzone_idx;
),
- TP_printk("order=%d may_writepage=%d gfp_flags=%s classzone_idx=%d",
+ TP_printk("order=%d gfp_flags=%s",
__entry->order,
- __entry->may_writepage,
- show_gfp_flags(__entry->gfp_flags),
- __entry->classzone_idx)
+ show_gfp_flags(__entry->gfp_flags))
);
DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_direct_reclaim_begin,
- TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx),
+ TP_PROTO(int order, gfp_t gfp_flags),
- TP_ARGS(order, may_writepage, gfp_flags, classzone_idx)
+ TP_ARGS(order, gfp_flags)
);
#ifdef CONFIG_MEMCG
DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_reclaim_begin,
- TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx),
+ TP_PROTO(int order, gfp_t gfp_flags),
- TP_ARGS(order, may_writepage, gfp_flags, classzone_idx)
+ TP_ARGS(order, gfp_flags)
);
DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_softlimit_reclaim_begin,
- TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx),
+ TP_PROTO(int order, gfp_t gfp_flags),
- TP_ARGS(order, may_writepage, gfp_flags, classzone_idx)
+ TP_ARGS(order, gfp_flags)
);
#endif /* CONFIG_MEMCG */
@@ -333,7 +322,8 @@ TRACE_EVENT(mm_vmscan_writepage,
TP_fast_assign(
__entry->pfn = page_to_pfn(page);
- __entry->reclaim_flags = trace_reclaim_flags(page);
+ __entry->reclaim_flags = trace_reclaim_flags(
+ page_is_file_cache(page));
),
TP_printk("page=%p pfn=%lu flags=%s",
@@ -358,7 +348,8 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive,
__field(unsigned long, nr_writeback)
__field(unsigned long, nr_congested)
__field(unsigned long, nr_immediate)
- __field(unsigned long, nr_activate)
+ __field(unsigned int, nr_activate0)
+ __field(unsigned int, nr_activate1)
__field(unsigned long, nr_ref_keep)
__field(unsigned long, nr_unmap_fail)
__field(int, priority)
@@ -373,20 +364,22 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive,
__entry->nr_writeback = stat->nr_writeback;
__entry->nr_congested = stat->nr_congested;
__entry->nr_immediate = stat->nr_immediate;
- __entry->nr_activate = stat->nr_activate;
+ __entry->nr_activate0 = stat->nr_activate[0];
+ __entry->nr_activate1 = stat->nr_activate[1];
__entry->nr_ref_keep = stat->nr_ref_keep;
__entry->nr_unmap_fail = stat->nr_unmap_fail;
__entry->priority = priority;
- __entry->reclaim_flags = trace_shrink_flags(file);
+ __entry->reclaim_flags = trace_reclaim_flags(file);
),
- TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld nr_dirty=%ld nr_writeback=%ld nr_congested=%ld nr_immediate=%ld nr_activate=%ld nr_ref_keep=%ld nr_unmap_fail=%ld priority=%d flags=%s",
+ TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld nr_dirty=%ld nr_writeback=%ld nr_congested=%ld nr_immediate=%ld nr_activate_anon=%d nr_activate_file=%d nr_ref_keep=%ld nr_unmap_fail=%ld priority=%d flags=%s",
__entry->nid,
__entry->nr_scanned, __entry->nr_reclaimed,
__entry->nr_dirty, __entry->nr_writeback,
__entry->nr_congested, __entry->nr_immediate,
- __entry->nr_activate, __entry->nr_ref_keep,
- __entry->nr_unmap_fail, __entry->priority,
+ __entry->nr_activate0, __entry->nr_activate1,
+ __entry->nr_ref_keep, __entry->nr_unmap_fail,
+ __entry->priority,
show_reclaim_flags(__entry->reclaim_flags))
);
@@ -415,7 +408,7 @@ TRACE_EVENT(mm_vmscan_lru_shrink_active,
__entry->nr_deactivated = nr_deactivated;
__entry->nr_referenced = nr_referenced;
__entry->priority = priority;
- __entry->reclaim_flags = trace_shrink_flags(file);
+ __entry->reclaim_flags = trace_reclaim_flags(file);
),
TP_printk("nid=%d nr_taken=%ld nr_active=%ld nr_deactivated=%ld nr_referenced=%ld priority=%d flags=%s",
@@ -454,7 +447,8 @@ TRACE_EVENT(mm_vmscan_inactive_list_is_low,
__entry->total_active = total_active;
__entry->active = active;
__entry->ratio = ratio;
- __entry->reclaim_flags = trace_shrink_flags(file) & RECLAIM_WB_LRU;
+ __entry->reclaim_flags = trace_reclaim_flags(file) &
+ RECLAIM_WB_LRU;
),
TP_printk("nid=%d reclaim_idx=%d total_inactive=%ld inactive=%ld total_active=%ld active=%ld ratio=%ld flags=%s",
@@ -465,6 +459,38 @@ TRACE_EVENT(mm_vmscan_inactive_list_is_low,
__entry->ratio,
show_reclaim_flags(__entry->reclaim_flags))
);
+
+TRACE_EVENT(mm_vmscan_node_reclaim_begin,
+
+ TP_PROTO(int nid, int order, gfp_t gfp_flags),
+
+ TP_ARGS(nid, order, gfp_flags),
+
+ TP_STRUCT__entry(
+ __field(int, nid)
+ __field(int, order)
+ __field(gfp_t, gfp_flags)
+ ),
+
+ TP_fast_assign(
+ __entry->nid = nid;
+ __entry->order = order;
+ __entry->gfp_flags = gfp_flags;
+ ),
+
+ TP_printk("nid=%d order=%d gfp_flags=%s",
+ __entry->nid,
+ __entry->order,
+ show_gfp_flags(__entry->gfp_flags))
+);
+
+DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_node_reclaim_end,
+
+ TP_PROTO(unsigned long nr_reclaimed),
+
+ TP_ARGS(nr_reclaimed)
+);
+
#endif /* _TRACE_VMSCAN_H */
/* This part must be outside protection */
diff --git a/include/uapi/linux/byteorder/big_endian.h b/include/uapi/linux/byteorder/big_endian.h
index 2199adc6a6c2..3e58b717fb5a 100644
--- a/include/uapi/linux/byteorder/big_endian.h
+++ b/include/uapi/linux/byteorder/big_endian.h
@@ -2,6 +2,10 @@
#ifndef _UAPI_LINUX_BYTEORDER_BIG_ENDIAN_H
#define _UAPI_LINUX_BYTEORDER_BIG_ENDIAN_H
+#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
+#error "Unsupported endianness, check your toolchain"
+#endif
+
#ifndef __BIG_ENDIAN
#define __BIG_ENDIAN 4321
#endif
diff --git a/include/uapi/linux/byteorder/little_endian.h b/include/uapi/linux/byteorder/little_endian.h
index 601c904fd5cd..c9ee440716a5 100644
--- a/include/uapi/linux/byteorder/little_endian.h
+++ b/include/uapi/linux/byteorder/little_endian.h
@@ -2,6 +2,10 @@
#ifndef _UAPI_LINUX_BYTEORDER_LITTLE_ENDIAN_H
#define _UAPI_LINUX_BYTEORDER_LITTLE_ENDIAN_H
+#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
+#error "Unsupported endianness, check your toolchain"
+#endif
+
#ifndef __LITTLE_ENDIAN
#define __LITTLE_ENDIAN 1234
#endif
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 121e82ce296b..59c71fa8c553 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -320,6 +320,9 @@ struct fscrypt_key {
#define SYNC_FILE_RANGE_WAIT_BEFORE 1
#define SYNC_FILE_RANGE_WRITE 2
#define SYNC_FILE_RANGE_WAIT_AFTER 4
+#define SYNC_FILE_RANGE_WRITE_AND_WAIT (SYNC_FILE_RANGE_WRITE | \
+ SYNC_FILE_RANGE_WAIT_BEFORE | \
+ SYNC_FILE_RANGE_WAIT_AFTER)
/*
* Flags for preadv2/pwritev2:
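The new combined flag is shorthand for the write-out-and-wait pattern userspace already spells out by hand; a hedged userspace sketch, with a fallback definition in case the libc headers do not yet carry the new name:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <errno.h>

    #ifndef SYNC_FILE_RANGE_WRITE_AND_WAIT
    #define SYNC_FILE_RANGE_WRITE_AND_WAIT \
            (SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE | \
             SYNC_FILE_RANGE_WAIT_AFTER)
    #endif

    /* Sketch (userspace): write out a byte range and wait for it to complete. */
    static int flush_range(int fd, off64_t offset, off64_t nbytes)
    {
            return sync_file_range(fd, offset, nbytes,
                                   SYNC_FILE_RANGE_WRITE_AND_WAIT) ? -errno : 0;
    }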
diff --git a/init/Kconfig b/init/Kconfig
index 82b84e5ee30d..8b9ffe236e4f 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1752,6 +1752,30 @@ config SLAB_FREELIST_HARDENED
sacrifices to harden the kernel slab allocator against common
freelist exploit methods.
+config SHUFFLE_PAGE_ALLOCATOR
+ bool "Page allocator randomization"
+ default SLAB_FREELIST_RANDOM && ACPI_NUMA
+ help
+ Randomization of the page allocator improves the average
+ utilization of a direct-mapped memory-side-cache. See section
+ 5.2.27 Heterogeneous Memory Attribute Table (HMAT) in the ACPI
+ 6.2a specification for an example of how a platform advertises
+ the presence of a memory-side-cache. There are also incidental
+ security benefits as it reduces the predictability of page
+ allocations to complement SLAB_FREELIST_RANDOM, but the
+ default granularity of shuffling on the "MAX_ORDER - 1" i.e.,
+ 10th order of pages is selected based on cache utilization
+ benefits on x86.
+
+ While the randomization improves cache utilization it may
+ negatively impact workloads on platforms without a cache. For
+ this reason, by default, the randomization is enabled only
+ after runtime detection of a direct-mapped memory-side-cache.
+ Otherwise, the randomization may be force enabled with the
+ 'page_alloc.shuffle' kernel command line parameter.
+
+ Say Y if unsure.
+
config SLUB_CPU_PARTIAL
default y
depends on SLUB && SMP
diff --git a/init/initramfs.c b/init/initramfs.c
index 4749e1115eef..435a428c2af1 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -513,42 +513,55 @@ static int __init retain_initrd_param(char *str)
}
__setup("retain_initrd", retain_initrd_param);
+#ifdef CONFIG_ARCH_HAS_KEEPINITRD
+static int __init keepinitrd_setup(char *__unused)
+{
+ do_retain_initrd = 1;
+ return 1;
+}
+__setup("keepinitrd", keepinitrd_setup);
+#endif
+
extern char __initramfs_start[];
extern unsigned long __initramfs_size;
#include <linux/initrd.h>
#include <linux/kexec.h>
-static void __init free_initrd(void)
+void __weak free_initrd_mem(unsigned long start, unsigned long end)
{
+ free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM,
+ "initrd");
+}
+
#ifdef CONFIG_KEXEC_CORE
+static bool kexec_free_initrd(void)
+{
unsigned long crashk_start = (unsigned long)__va(crashk_res.start);
unsigned long crashk_end = (unsigned long)__va(crashk_res.end);
-#endif
- if (do_retain_initrd)
- goto skip;
-#ifdef CONFIG_KEXEC_CORE
/*
* If the initrd region is overlapped with crashkernel reserved region,
* free only memory that is not part of crashkernel region.
*/
- if (initrd_start < crashk_end && initrd_end > crashk_start) {
- /*
- * Initialize initrd memory region since the kexec boot does
- * not do.
- */
- memset((void *)initrd_start, 0, initrd_end - initrd_start);
- if (initrd_start < crashk_start)
- free_initrd_mem(initrd_start, crashk_start);
- if (initrd_end > crashk_end)
- free_initrd_mem(crashk_end, initrd_end);
- } else
-#endif
- free_initrd_mem(initrd_start, initrd_end);
-skip:
- initrd_start = 0;
- initrd_end = 0;
+ if (initrd_start >= crashk_end || initrd_end <= crashk_start)
+ return false;
+
+ /*
+ * Initialize initrd memory region since the kexec boot does not do so.
+ */
+ memset((void *)initrd_start, 0, initrd_end - initrd_start);
+ if (initrd_start < crashk_start)
+ free_initrd_mem(initrd_start, crashk_start);
+ if (initrd_end > crashk_end)
+ free_initrd_mem(crashk_end, initrd_end);
+ return true;
+}
+#else
+static inline bool kexec_free_initrd(void)
+{
+ return false;
}
+#endif /* CONFIG_KEXEC_CORE */
#ifdef CONFIG_BLK_DEV_RAM
#define BUF_SIZE 1024
@@ -597,7 +610,38 @@ static void __init clean_rootfs(void)
ksys_close(fd);
kfree(buf);
}
-#endif
+#else
+static inline void clean_rootfs(void)
+{
+}
+#endif /* CONFIG_BLK_DEV_RAM */
+
+#ifdef CONFIG_BLK_DEV_RAM
+static void populate_initrd_image(char *err)
+{
+ ssize_t written;
+ int fd;
+
+ unpack_to_rootfs(__initramfs_start, __initramfs_size);
+
+ printk(KERN_INFO "rootfs image is not initramfs (%s); looks like an initrd\n",
+ err);
+ fd = ksys_open("/initrd.image", O_WRONLY | O_CREAT, 0700);
+ if (fd < 0)
+ return;
+
+ written = xwrite(fd, (char *)initrd_start, initrd_end - initrd_start);
+ if (written != initrd_end - initrd_start)
+ pr_err("/initrd.image: incomplete write (%zd != %ld)\n",
+ written, initrd_end - initrd_start);
+ ksys_close(fd);
+}
+#else
+static void populate_initrd_image(char *err)
+{
+ printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err);
+}
+#endif /* CONFIG_BLK_DEV_RAM */
static int __init populate_rootfs(void)
{
@@ -605,46 +649,31 @@ static int __init populate_rootfs(void)
char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size);
if (err)
panic("%s", err); /* Failed to decompress INTERNAL initramfs */
- /* If available load the bootloader supplied initrd */
- if (initrd_start && !IS_ENABLED(CONFIG_INITRAMFS_FORCE)) {
-#ifdef CONFIG_BLK_DEV_RAM
- int fd;
+
+ if (!initrd_start || IS_ENABLED(CONFIG_INITRAMFS_FORCE))
+ goto done;
+
+ if (IS_ENABLED(CONFIG_BLK_DEV_RAM))
printk(KERN_INFO "Trying to unpack rootfs image as initramfs...\n");
- err = unpack_to_rootfs((char *)initrd_start,
- initrd_end - initrd_start);
- if (!err) {
- free_initrd();
- goto done;
- } else {
- clean_rootfs();
- unpack_to_rootfs(__initramfs_start, __initramfs_size);
- }
- printk(KERN_INFO "rootfs image is not initramfs (%s)"
- "; looks like an initrd\n", err);
- fd = ksys_open("/initrd.image",
- O_WRONLY|O_CREAT, 0700);
- if (fd >= 0) {
- ssize_t written = xwrite(fd, (char *)initrd_start,
- initrd_end - initrd_start);
-
- if (written != initrd_end - initrd_start)
- pr_err("/initrd.image: incomplete write (%zd != %ld)\n",
- written, initrd_end - initrd_start);
-
- ksys_close(fd);
- free_initrd();
- }
- done:
- /* empty statement */;
-#else
+ else
printk(KERN_INFO "Unpacking initramfs...\n");
- err = unpack_to_rootfs((char *)initrd_start,
- initrd_end - initrd_start);
- if (err)
- printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err);
- free_initrd();
-#endif
+
+ err = unpack_to_rootfs((char *)initrd_start, initrd_end - initrd_start);
+ if (err) {
+ clean_rootfs();
+ populate_initrd_image(err);
}
+
+done:
+ /*
+ * If the initrd region overlaps the crashkernel reserved region,
+ * free only memory that is not part of the crashkernel region.
+ */
+ if (!do_retain_initrd && !kexec_free_initrd())
+ free_initrd_mem(initrd_start, initrd_end);
+ initrd_start = 0;
+ initrd_end = 0;
+
flush_delayed_fput();
return 0;
}
diff --git a/init/main.c b/init/main.c
index 33c87e91dc37..5a2c69b4d7b3 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1074,6 +1074,11 @@ static inline void mark_readonly(void)
}
#endif
+void __weak free_initmem(void)
+{
+ free_initmem_default(POISON_FREE_INITMEM);
+}
+
static int __ref kernel_init(void *unused)
{
int ret;
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index 49f9bf4ffc7f..bfaae457810c 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -120,7 +120,9 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write,
static int zero;
static int one = 1;
static int int_max = INT_MAX;
-static int ipc_mni = IPCMNI;
+int ipc_mni = IPCMNI;
+int ipc_mni_shift = IPCMNI_SHIFT;
+int ipc_min_cycle = RADIX_TREE_MAP_SIZE;
static struct ctl_table ipc_kern_table[] = {
{
@@ -246,3 +248,13 @@ static int __init ipc_sysctl_init(void)
}
device_initcall(ipc_sysctl_init);
+
+static int __init ipc_mni_extend(char *str)
+{
+ ipc_mni = IPCMNI_EXTEND;
+ ipc_mni_shift = IPCMNI_EXTEND_SHIFT;
+ ipc_min_cycle = IPCMNI_EXTEND_MIN_CYCLE;
+ pr_info("IPCMNI extended to %d.\n", ipc_mni);
+ return 0;
+}
+early_param("ipcmni_extend", ipc_mni_extend);
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index ba44164ea1f9..216cad1ff0d0 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -76,6 +76,7 @@ struct mqueue_inode_info {
wait_queue_head_t wait_q;
struct rb_root msg_tree;
+ struct rb_node *msg_tree_rightmost;
struct posix_msg_tree_node *node_cache;
struct mq_attr attr;
@@ -131,6 +132,7 @@ static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info)
{
struct rb_node **p, *parent = NULL;
struct posix_msg_tree_node *leaf;
+ bool rightmost = true;
p = &info->msg_tree.rb_node;
while (*p) {
@@ -139,9 +141,10 @@ static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info)
if (likely(leaf->priority == msg->m_type))
goto insert_msg;
- else if (msg->m_type < leaf->priority)
+ else if (msg->m_type < leaf->priority) {
p = &(*p)->rb_left;
- else
+ rightmost = false;
+ } else
p = &(*p)->rb_right;
}
if (info->node_cache) {
@@ -154,6 +157,10 @@ static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info)
INIT_LIST_HEAD(&leaf->msg_list);
}
leaf->priority = msg->m_type;
+
+ if (rightmost)
+ info->msg_tree_rightmost = &leaf->rb_node;
+
rb_link_node(&leaf->rb_node, parent, p);
rb_insert_color(&leaf->rb_node, &info->msg_tree);
insert_msg:
@@ -163,23 +170,35 @@ insert_msg:
return 0;
}
+static inline void msg_tree_erase(struct posix_msg_tree_node *leaf,
+ struct mqueue_inode_info *info)
+{
+ struct rb_node *node = &leaf->rb_node;
+
+ if (info->msg_tree_rightmost == node)
+ info->msg_tree_rightmost = rb_prev(node);
+
+ rb_erase(node, &info->msg_tree);
+ if (info->node_cache) {
+ kfree(leaf);
+ } else {
+ info->node_cache = leaf;
+ }
+}
+
static inline struct msg_msg *msg_get(struct mqueue_inode_info *info)
{
- struct rb_node **p, *parent = NULL;
+ struct rb_node *parent = NULL;
struct posix_msg_tree_node *leaf;
struct msg_msg *msg;
try_again:
- p = &info->msg_tree.rb_node;
- while (*p) {
- parent = *p;
- /*
- * During insert, low priorities go to the left and high to the
- * right. On receive, we want the highest priorities first, so
- * walk all the way to the right.
- */
- p = &(*p)->rb_right;
- }
+ /*
+ * During insert, low priorities go to the left and high to the
+ * right. On receive, we want the highest priorities first, so
+ * walk all the way to the right.
+ */
+ parent = info->msg_tree_rightmost;
if (!parent) {
if (info->attr.mq_curmsgs) {
pr_warn_once("Inconsistency in POSIX message queue, "
@@ -194,24 +213,14 @@ try_again:
pr_warn_once("Inconsistency in POSIX message queue, "
"empty leaf node but we haven't implemented "
"lazy leaf delete!\n");
- rb_erase(&leaf->rb_node, &info->msg_tree);
- if (info->node_cache) {
- kfree(leaf);
- } else {
- info->node_cache = leaf;
- }
+ msg_tree_erase(leaf, info);
goto try_again;
} else {
msg = list_first_entry(&leaf->msg_list,
struct msg_msg, m_list);
list_del(&msg->m_list);
if (list_empty(&leaf->msg_list)) {
- rb_erase(&leaf->rb_node, &info->msg_tree);
- if (info->node_cache) {
- kfree(leaf);
- } else {
- info->node_cache = leaf;
- }
+ msg_tree_erase(leaf, info);
}
}
info->attr.mq_curmsgs--;
@@ -254,6 +263,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
info->qsize = 0;
info->user = NULL; /* set when all is ok */
info->msg_tree = RB_ROOT;
+ info->msg_tree_rightmost = NULL;
info->node_cache = NULL;
memset(&info->attr, 0, sizeof(info->attr));
info->attr.mq_maxmsg = min(ipc_ns->mq_msg_max,
@@ -430,7 +440,8 @@ static void mqueue_evict_inode(struct inode *inode)
struct user_struct *user;
unsigned long mq_bytes, mq_treesize;
struct ipc_namespace *ipc_ns;
- struct msg_msg *msg;
+ struct msg_msg *msg, *nmsg;
+ LIST_HEAD(tmp_msg);
clear_inode(inode);
@@ -441,10 +452,15 @@ static void mqueue_evict_inode(struct inode *inode)
info = MQUEUE_I(inode);
spin_lock(&info->lock);
while ((msg = msg_get(info)) != NULL)
- free_msg(msg);
+ list_add_tail(&msg->m_list, &tmp_msg);
kfree(info->node_cache);
spin_unlock(&info->lock);
+ list_for_each_entry_safe(msg, nmsg, &tmp_msg, m_list) {
+ list_del(&msg->m_list);
+ free_msg(msg);
+ }
+
/* Total amount of bytes accounted for the mqueue */
mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) +
min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) *
@@ -605,8 +621,6 @@ static void wq_add(struct mqueue_inode_info *info, int sr,
{
struct ext_wait_queue *walk;
- ewp->task = current;
-
list_for_each_entry(walk, &info->e_wait_q[sr].list, list) {
if (walk->task->prio <= current->prio) {
list_add_tail(&ewp->list, &walk->list);
diff --git a/ipc/msgutil.c b/ipc/msgutil.c
index 84598025a6ad..e65593742e2b 100644
--- a/ipc/msgutil.c
+++ b/ipc/msgutil.c
@@ -18,6 +18,7 @@
#include <linux/utsname.h>
#include <linux/proc_ns.h>
#include <linux/uaccess.h>
+#include <linux/sched.h>
#include "util.h"
@@ -64,6 +65,9 @@ static struct msg_msg *alloc_msg(size_t len)
pseg = &msg->next;
while (len > 0) {
struct msg_msgseg *seg;
+
+ cond_resched();
+
alen = min(len, DATALEN_SEG);
seg = kmalloc(sizeof(*seg) + alen, GFP_KERNEL_ACCOUNT);
if (seg == NULL)
@@ -176,6 +180,8 @@ void free_msg(struct msg_msg *msg)
kfree(msg);
while (seg != NULL) {
struct msg_msgseg *tmp = seg->next;
+
+ cond_resched();
kfree(seg);
seg = tmp;
}
diff --git a/ipc/util.c b/ipc/util.c
index 095274a871f8..d126d156efc6 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -109,7 +109,7 @@ static const struct rhashtable_params ipc_kht_params = {
* @ids: ipc identifier set
*
* Set up the sequence range to use for the ipc identifier range (limited
- * below IPCMNI) then initialise the keys hashtable and ids idr.
+ * below ipc_mni) then initialise the keys hashtable and ids idr.
*/
void ipc_init_ids(struct ipc_ids *ids)
{
@@ -119,6 +119,7 @@ void ipc_init_ids(struct ipc_ids *ids)
rhashtable_init(&ids->key_ht, &ipc_kht_params);
idr_init(&ids->ipcs_idr);
ids->max_idx = -1;
+ ids->last_idx = -1;
#ifdef CONFIG_CHECKPOINT_RESTORE
ids->next_id = -1;
#endif
@@ -192,6 +193,10 @@ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key)
*
* The caller must own kern_ipc_perm.lock of the new object.
* On error, the function returns a (negative) error code.
+ *
+ * To conserve sequence number space, especially with extended ipc_mni,
+ * the sequence number is incremented only when the returned index wraps
+ * back to or below the last one.
*/
static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new)
{
@@ -215,17 +220,42 @@ static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new)
*/
if (next_id < 0) { /* !CHECKPOINT_RESTORE or next_id is unset */
- new->seq = ids->seq++;
- if (ids->seq > IPCID_SEQ_MAX)
- ids->seq = 0;
- idx = idr_alloc(&ids->ipcs_idr, new, 0, 0, GFP_NOWAIT);
+ int max_idx;
+
+ max_idx = max(ids->in_use*3/2, ipc_min_cycle);
+ max_idx = min(max_idx, ipc_mni);
+
+ /* allocate the idx, with a NULL struct kern_ipc_perm */
+ idx = idr_alloc_cyclic(&ids->ipcs_idr, NULL, 0, max_idx,
+ GFP_NOWAIT);
+
+ if (idx >= 0) {
+ /*
+ * idx got allocated successfully.
+ * Now calculate the sequence number and set the
+ * pointer for real.
+ */
+ if (idx <= ids->last_idx) {
+ ids->seq++;
+ if (ids->seq >= ipcid_seq_max())
+ ids->seq = 0;
+ }
+ ids->last_idx = idx;
+
+ new->seq = ids->seq;
+ /* no need for smp_wmb(), this is done
+ * inside idr_replace, as part of
+ * rcu_assign_pointer
+ */
+ idr_replace(&ids->ipcs_idr, new, idx);
+ }
} else {
new->seq = ipcid_to_seqx(next_id);
idx = idr_alloc(&ids->ipcs_idr, new, ipcid_to_idx(next_id),
0, GFP_NOWAIT);
}
if (idx >= 0)
- new->id = SEQ_MULTIPLIER * new->seq + idx;
+ new->id = (new->seq << ipcmni_seq_shift()) + idx;
return idx;
}
@@ -253,8 +283,8 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit)
/* 1) Initialize the refcount so that ipc_rcu_putref works */
refcount_set(&new->refcount, 1);
- if (limit > IPCMNI)
- limit = IPCMNI;
+ if (limit > ipc_mni)
+ limit = ipc_mni;
if (ids->in_use >= limit)
return -ENOSPC;
@@ -737,7 +767,7 @@ static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos,
if (total >= ids->in_use)
return NULL;
- for (; pos < IPCMNI; pos++) {
+ for (; pos < ipc_mni; pos++) {
ipc = idr_find(&ids->ipcs_idr, pos);
if (ipc != NULL) {
*new_pos = pos + 1;
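To make the cyclic-allocation path above concrete: with ipc_min_cycle defaulting to RADIX_TREE_MAP_SIZE (typically 64) and, say, 10 ids in use, max_idx = min(max(15, 64), ipc_mni) = 64, so idr_alloc_cyclic() hands out indices round-robin within [0, 64) and ids->seq is bumped only when the returned index wraps back to or below last_idx — one sequence increment per cycle instead of one per allocation. A minimal standalone sketch of that bookkeeping (the names and the user-space framing are illustrative only, not kernel API):

#include <limits.h>
#include <stdio.h>

static int last_idx = -1, seq;

/* Mirrors the "bump seq only when the cyclic index wraps" rule above. */
static int alloc_id(int idx, int shift)
{
	if (idx <= last_idx) {
		seq++;
		if (seq >= (INT_MAX >> shift))	/* ipcid_seq_max() */
			seq = 0;
	}
	last_idx = idx;
	return (seq << shift) + idx;		/* same composition as new->id */
}

int main(void)
{
	int i;

	for (i = 0; i < 130; i++)		/* two full cycles of 64 slots */
		printf("id=%d seq=%d\n", alloc_id(i % 64, 15), seq);
	return 0;
}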
diff --git a/ipc/util.h b/ipc/util.h
index e272be622ae7..0fcf8e719b76 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -15,8 +15,37 @@
#include <linux/err.h>
#include <linux/ipc_namespace.h>
-#define IPCMNI 32768 /* <= MAX_INT limit for ipc arrays (including sysctl changes) */
-#define SEQ_MULTIPLIER (IPCMNI)
+/*
+ * The IPC ID contains 2 separate numbers - index and sequence number.
+ * By default,
+ * bits 0-14: index (32k, 15 bits)
+ * bits 15-30: sequence number (64k, 16 bits)
+ *
+ * When IPCMNI extension mode is turned on, the composition changes:
+ * bits 0-23: index (16M, 24 bits)
+ * bits 24-30: sequence number (128, 7 bits)
+ */
+#define IPCMNI_SHIFT 15
+#define IPCMNI_EXTEND_SHIFT 24
+#define IPCMNI_EXTEND_MIN_CYCLE (RADIX_TREE_MAP_SIZE * RADIX_TREE_MAP_SIZE)
+#define IPCMNI (1 << IPCMNI_SHIFT)
+#define IPCMNI_EXTEND (1 << IPCMNI_EXTEND_SHIFT)
+
+#ifdef CONFIG_SYSVIPC_SYSCTL
+extern int ipc_mni;
+extern int ipc_mni_shift;
+extern int ipc_min_cycle;
+
+#define ipcmni_seq_shift() ipc_mni_shift
+#define IPCMNI_IDX_MASK ((1 << ipc_mni_shift) - 1)
+
+#else /* CONFIG_SYSVIPC_SYSCTL */
+
+#define ipc_mni IPCMNI
+#define ipc_min_cycle ((int)RADIX_TREE_MAP_SIZE)
+#define ipcmni_seq_shift() IPCMNI_SHIFT
+#define IPCMNI_IDX_MASK ((1 << IPCMNI_SHIFT) - 1)
+#endif /* CONFIG_SYSVIPC_SYSCTL */
void sem_init(void);
void msg_init(void);
@@ -96,9 +125,9 @@ struct pid_namespace *ipc_seq_pid_ns(struct seq_file *);
#define IPC_MSG_IDS 1
#define IPC_SHM_IDS 2
-#define ipcid_to_idx(id) ((id) % SEQ_MULTIPLIER)
-#define ipcid_to_seqx(id) ((id) / SEQ_MULTIPLIER)
-#define IPCID_SEQ_MAX min_t(int, INT_MAX/SEQ_MULTIPLIER, USHRT_MAX)
+#define ipcid_to_idx(id) ((id) & IPCMNI_IDX_MASK)
+#define ipcid_to_seqx(id) ((id) >> ipcmni_seq_shift())
+#define ipcid_seq_max() (INT_MAX >> ipcmni_seq_shift())
/* must be called with ids->rwsem acquired for writing */
int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int);
@@ -123,8 +152,8 @@ static inline int ipc_get_maxidx(struct ipc_ids *ids)
if (ids->in_use == 0)
return -1;
- if (ids->in_use == IPCMNI)
- return IPCMNI - 1;
+ if (ids->in_use == ipc_mni)
+ return ipc_mni - 1;
return ids->max_idx;
}
@@ -216,10 +245,10 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
static inline int sem_check_semmni(struct ipc_namespace *ns) {
/*
- * Check semmni range [0, IPCMNI]
+ * Check semmni range [0, ipc_mni]
* semmni is the last element of sem_ctls[4] array
*/
- return ((ns->sem_ctls[3] < 0) || (ns->sem_ctls[3] > IPCMNI))
+ return ((ns->sem_ctls[3] < 0) || (ns->sem_ctls[3] > ipc_mni))
? -ERANGE : 0;
}
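A worked example of the id layout defined above, as a standalone sketch (the helper below is illustrative; it simply mirrors ipcid_to_idx()/ipcid_to_seqx() with the shift passed explicitly):

#include <stdio.h>

static int make_id(int seq, int idx, int shift)
{
	return (seq << shift) + idx;	/* same as new->id in ipc_idr_alloc() */
}

int main(void)
{
	int id = make_id(3, 1, 15);	/* default: IPCMNI_SHIFT = 15 */

	printf("id=%d idx=%d seq=%d\n",
	       id, id & ((1 << 15) - 1), id >> 15);	/* id=98305 idx=1 seq=3 */

	id = make_id(3, 1, 24);		/* ipcmni_extend: shift = 24 */
	printf("id=%d idx=%d seq=%d\n",
	       id, id & ((1 << 24) - 1), id >> 24);	/* id=50331649 idx=1 seq=3 */
	return 0;
}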
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 327f37c9fdfa..1140357d46f4 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3550,7 +3550,65 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
{
return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU);
}
-#endif
+
+static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, enum psi_res res)
+{
+ struct psi_trigger *new;
+ struct cgroup *cgrp;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENODEV;
+
+ cgroup_get(cgrp);
+ cgroup_kn_unlock(of->kn);
+
+ new = psi_trigger_create(&cgrp->psi, buf, nbytes, res);
+ if (IS_ERR(new)) {
+ cgroup_put(cgrp);
+ return PTR_ERR(new);
+ }
+
+ psi_trigger_replace(&of->priv, new);
+
+ cgroup_put(cgrp);
+
+ return nbytes;
+}
+
+static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
+}
+
+static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
+}
+
+static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
+}
+
+static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
+ poll_table *pt)
+{
+ return psi_trigger_poll(&of->priv, of->file, pt);
+}
+
+static void cgroup_pressure_release(struct kernfs_open_file *of)
+{
+ psi_trigger_replace(&of->priv, NULL);
+}
+#endif /* CONFIG_PSI */
static int cgroup_freeze_show(struct seq_file *seq, void *v)
{
@@ -4745,18 +4803,27 @@ static struct cftype cgroup_base_files[] = {
.name = "io.pressure",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_io_pressure_show,
+ .write = cgroup_io_pressure_write,
+ .poll = cgroup_pressure_poll,
+ .release = cgroup_pressure_release,
},
{
.name = "memory.pressure",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_memory_pressure_show,
+ .write = cgroup_memory_pressure_write,
+ .poll = cgroup_pressure_poll,
+ .release = cgroup_pressure_release,
},
{
.name = "cpu.pressure",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_cpu_pressure_show,
+ .write = cgroup_cpu_pressure_write,
+ .poll = cgroup_pressure_poll,
+ .release = cgroup_pressure_release,
},
-#endif
+#endif /* CONFIG_PSI */
{ } /* terminate */
};
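For the new write/poll/release hooks on the cgroup pressure files, a user-space sketch of how a trigger would be armed and waited on is shown below. The trigger string format ("some <stall us> <window us>") and the POLLPRI wakeup follow the PSI trigger documentation rather than the hunks above, and the cgroup path is only an example, so treat those details as assumptions:

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char trig[] = "some 150000 1000000"; /* 150ms of stall per 1s window */
	struct pollfd pfd;
	int fd;

	fd = open("/sys/fs/cgroup/test/memory.pressure", O_RDWR);
	if (fd < 0)
		return 1;
	/* cgroup_memory_pressure_write() -> psi_trigger_create() */
	if (write(fd, trig, strlen(trig) + 1) < 0)
		return 1;

	pfd.fd = fd;
	pfd.events = POLLPRI;
	/* cgroup_pressure_poll() -> psi_trigger_poll() */
	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLPRI))
		printf("memory pressure threshold crossed\n");

	close(fd);	/* cgroup_pressure_release() drops the trigger */
	return 0;
}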
diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
index 7a723194ecbe..2b750f13bc8f 100644
--- a/kernel/dma/remap.c
+++ b/kernel/dma/remap.c
@@ -158,7 +158,7 @@ out:
bool dma_in_atomic_pool(void *start, size_t size)
{
- return addr_in_gen_pool(atomic_pool, (unsigned long)start, size);
+ return gen_pool_has_addr(atomic_pool, (unsigned long)start, size);
}
void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 4ca7364c956d..78f61bfc6b79 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -161,7 +161,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
struct mmu_notifier_range range;
struct mem_cgroup *memcg;
- mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
+ addr + PAGE_SIZE);
VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page);
diff --git a/kernel/exit.c b/kernel/exit.c
index 2166c2d92ddc..8361a560cd1d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -422,7 +422,7 @@ retry:
* freed task structure.
*/
if (atomic_read(&mm->mm_users) <= 1) {
- mm->owner = NULL;
+ WRITE_ONCE(mm->owner, NULL);
return;
}
@@ -462,7 +462,7 @@ retry:
* most likely racing with swapoff (try_to_unuse()) or /proc or
* ptrace or page migration (get_task_mm()). Mark owner as NULL.
*/
- mm->owner = NULL;
+ WRITE_ONCE(mm->owner, NULL);
return;
assign_new_owner:
@@ -483,7 +483,7 @@ assign_new_owner:
put_task_struct(c);
goto retry;
}
- mm->owner = c;
+ WRITE_ONCE(mm->owner, c);
task_unlock(c);
put_task_struct(c);
}
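The WRITE_ONCE() conversions above matter because mm->owner can be read locklessly by other code; the sketch below is only an illustration of the matching reader side (a hypothetical helper, not something added by this patch), where READ_ONCE() keeps the compiler from tearing or re-reading the pointer:

/* Hypothetical lockless reader, for illustration only; real readers would
 * typically also hold rcu_read_lock() before dereferencing the result.
 */
static struct task_struct *mm_owner_peek(struct mm_struct *mm)
{
	return READ_ONCE(mm->owner);	/* may legitimately be NULL */
}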
diff --git a/kernel/fork.c b/kernel/fork.c
index 737db1828437..b17c5224569c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -921,6 +921,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#ifdef CONFIG_MEMCG
tsk->active_memcg = NULL;
+ tsk->memcg_high_reclaim = NULL;
#endif
return tsk;
@@ -955,6 +956,15 @@ static void mm_init_aio(struct mm_struct *mm)
#endif
}
+static __always_inline void mm_clear_owner(struct mm_struct *mm,
+ struct task_struct *p)
+{
+#ifdef CONFIG_MEMCG
+ if (mm->owner == p)
+ WRITE_ONCE(mm->owner, NULL);
+#endif
+}
+
static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
{
#ifdef CONFIG_MEMCG
@@ -982,7 +992,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm->core_state = NULL;
mm_pgtables_bytes_init(mm);
mm->map_count = 0;
- mm->locked_vm = 0;
+ atomic64_set(&mm->locked_vm, 0);
atomic64_set(&mm->pinned_vm, 0);
memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
spin_lock_init(&mm->page_table_lock);
@@ -1343,6 +1353,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk,
free_pt:
/* don't put binfmt in mmput, we haven't got module yet */
mm->binfmt = NULL;
+ mm_init_owner(mm, NULL);
mmput(mm);
fail_nomem:
@@ -1726,6 +1737,24 @@ static int pidfd_create(struct pid *pid)
return fd;
}
+#ifdef CONFIG_MEMCG
+static void __delayed_free_task(struct rcu_head *rhp)
+{
+ struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
+
+ free_task(tsk);
+}
+#endif /* CONFIG_MEMCG */
+
+static __always_inline void delayed_free_task(struct task_struct *tsk)
+{
+#ifdef CONFIG_MEMCG
+ call_rcu(&tsk->rcu, __delayed_free_task);
+#else /* CONFIG_MEMCG */
+ free_task(tsk);
+#endif /* CONFIG_MEMCG */
+}
+
/*
* This creates a new process as a copy of the old one,
* but does not actually start it yet.
@@ -2068,7 +2097,7 @@ static __latent_entropy struct task_struct *copy_process(
#ifdef TIF_SYSCALL_EMU
clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
#endif
- clear_all_latency_tracing(p);
+ clear_tsk_latency_tracing(p);
/* ok, now we should be set up.. */
p->pid = pid_nr(pid);
@@ -2233,8 +2262,10 @@ bad_fork_cleanup_io:
bad_fork_cleanup_namespaces:
exit_task_namespaces(p);
bad_fork_cleanup_mm:
- if (p->mm)
+ if (p->mm) {
+ mm_clear_owner(p->mm, p);
mmput(p->mm);
+ }
bad_fork_cleanup_signal:
if (!(clone_flags & CLONE_THREAD))
free_signal_struct(p->signal);
@@ -2265,7 +2296,7 @@ bad_fork_cleanup_count:
bad_fork_free:
p->state = TASK_DEAD;
put_task_stack(p);
- free_task(p);
+ delayed_free_task(p);
fork_out:
spin_lock_irq(&current->sighand->siglock);
hlist_del_init(&delayed.node);
diff --git a/kernel/futex.c b/kernel/futex.c
index 6262f1534ac9..2268b97d5439 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -543,7 +543,7 @@ again:
if (unlikely(should_fail_futex(fshared)))
return -EFAULT;
- err = get_user_pages_fast(address, 1, 1, &page);
+ err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
/*
* If write access is not required (eg. FUTEX_WAIT), try
* and get read-only access.
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 1e3823fa799b..f71c1adcff31 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -53,6 +53,7 @@ config GCOV_PROFILE_ALL
choice
prompt "Specify GCOV format"
depends on GCOV_KERNEL
+ depends on CC_IS_GCC
---help---
The gcov format is usually determined by the GCC version, and the
default is chosen according to your GCC version. However, there are
@@ -62,7 +63,7 @@ choice
config GCOV_FORMAT_3_4
bool "GCC 3.4 format"
- depends on CC_IS_GCC && GCC_VERSION < 40700
+ depends on GCC_VERSION < 40700
---help---
Select this option to use the format defined by GCC 3.4.
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index ff06d64df397..d66a74b0f100 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -2,5 +2,6 @@
ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
obj-y := base.o fs.o
-obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_3_4.o
-obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_4_7.o
+obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_base.o gcc_3_4.o
+obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_base.o gcc_4_7.o
+obj-$(CONFIG_CC_IS_CLANG) += clang.o
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index 9c7c8d5c18f2..0ffe9f194080 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -22,88 +22,8 @@
#include <linux/sched.h>
#include "gcov.h"
-static int gcov_events_enabled;
-static DEFINE_MUTEX(gcov_lock);
-
-/*
- * __gcov_init is called by gcc-generated constructor code for each object
- * file compiled with -fprofile-arcs.
- */
-void __gcov_init(struct gcov_info *info)
-{
- static unsigned int gcov_version;
-
- mutex_lock(&gcov_lock);
- if (gcov_version == 0) {
- gcov_version = gcov_info_version(info);
- /*
- * Printing gcc's version magic may prove useful for debugging
- * incompatibility reports.
- */
- pr_info("version magic: 0x%x\n", gcov_version);
- }
- /*
- * Add new profiling data structure to list and inform event
- * listener.
- */
- gcov_info_link(info);
- if (gcov_events_enabled)
- gcov_event(GCOV_ADD, info);
- mutex_unlock(&gcov_lock);
-}
-EXPORT_SYMBOL(__gcov_init);
-
-/*
- * These functions may be referenced by gcc-generated profiling code but serve
- * no function for kernel profiling.
- */
-void __gcov_flush(void)
-{
- /* Unused. */
-}
-EXPORT_SYMBOL(__gcov_flush);
-
-void __gcov_merge_add(gcov_type *counters, unsigned int n_counters)
-{
- /* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_add);
-
-void __gcov_merge_single(gcov_type *counters, unsigned int n_counters)
-{
- /* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_single);
-
-void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
-{
- /* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_delta);
-
-void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters)
-{
- /* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_ior);
-
-void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters)
-{
- /* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_time_profile);
-
-void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters)
-{
- /* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_icall_topn);
-
-void __gcov_exit(void)
-{
- /* Unused. */
-}
-EXPORT_SYMBOL(__gcov_exit);
+int gcov_events_enabled;
+DEFINE_MUTEX(gcov_lock);
/**
* gcov_enable_events - enable event reporting through gcov_event()
@@ -144,7 +64,7 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
/* Remove entries located in module from linked list. */
while ((info = gcov_info_next(info))) {
- if (within_module((unsigned long)info, mod)) {
+ if (gcov_info_within_module(info, mod)) {
gcov_info_unlink(prev, info);
if (gcov_events_enabled)
gcov_event(GCOV_REMOVE, info);
diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c
new file mode 100644
index 000000000000..c94b820a1b62
--- /dev/null
+++ b/kernel/gcov/clang.c
@@ -0,0 +1,581 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Google, Inc.
+ * modified from kernel/gcov/gcc_4_7.c
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ *
+ * LLVM uses profiling data that's deliberately similar to GCC, but has a
+ * very different way of exporting that data. LLVM calls llvm_gcov_init() once
+ * per module, and provides a couple of callbacks that we can use to ask for
+ * more data.
+ *
+ * We care about the "writeout" callback, which in turn calls back into
+ * compiler-rt/this module to dump all the gathered coverage data to disk:
+ *
+ * llvm_gcda_start_file()
+ * llvm_gcda_emit_function()
+ * llvm_gcda_emit_arcs()
+ * llvm_gcda_emit_function()
+ * llvm_gcda_emit_arcs()
+ * [... repeats for each function ...]
+ * llvm_gcda_summary_info()
+ * llvm_gcda_end_file()
+ *
+ * This design is much more stateless and unstructured than gcc's, and is
+ * intended to run at process exit. This forces us to keep some local state
+ * about which module we're dealing with at the moment. On the other hand, it
+ * also means we don't depend as much on how LLVM represents profiling data
+ * internally.
+ *
+ * See LLVM's lib/Transforms/Instrumentation/GCOVProfiling.cpp for more
+ * details on how this works, particularly GCOVProfiler::emitProfileArcs(),
+ * GCOVProfiler::insertCounterWriteout(), and
+ * GCOVProfiler::insertFlush().
+ */
+
+#define pr_fmt(fmt) "gcov: " fmt
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/printk.h>
+#include <linux/ratelimit.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include "gcov.h"
+
+typedef void (*llvm_gcov_callback)(void);
+
+struct gcov_info {
+ struct list_head head;
+
+ const char *filename;
+ unsigned int version;
+ u32 checksum;
+
+ struct list_head functions;
+};
+
+struct gcov_fn_info {
+ struct list_head head;
+
+ u32 ident;
+ u32 checksum;
+ u8 use_extra_checksum;
+ u32 cfg_checksum;
+
+ u32 num_counters;
+ u64 *counters;
+ const char *function_name;
+};
+
+static struct gcov_info *current_info;
+
+static LIST_HEAD(clang_gcov_list);
+
+void llvm_gcov_init(llvm_gcov_callback writeout, llvm_gcov_callback flush)
+{
+ struct gcov_info *info = kzalloc(sizeof(*info), GFP_KERNEL);
+
+ if (!info)
+ return;
+
+ INIT_LIST_HEAD(&info->head);
+ INIT_LIST_HEAD(&info->functions);
+
+ mutex_lock(&gcov_lock);
+
+ list_add_tail(&info->head, &clang_gcov_list);
+ current_info = info;
+ writeout();
+ current_info = NULL;
+ if (gcov_events_enabled)
+ gcov_event(GCOV_ADD, info);
+
+ mutex_unlock(&gcov_lock);
+}
+EXPORT_SYMBOL(llvm_gcov_init);
+
+void llvm_gcda_start_file(const char *orig_filename, const char version[4],
+ u32 checksum)
+{
+ current_info->filename = orig_filename;
+ memcpy(&current_info->version, version, sizeof(current_info->version));
+ current_info->checksum = checksum;
+}
+EXPORT_SYMBOL(llvm_gcda_start_file);
+
+void llvm_gcda_emit_function(u32 ident, const char *function_name,
+ u32 func_checksum, u8 use_extra_checksum, u32 cfg_checksum)
+{
+ struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL);
+
+ if (!info)
+ return;
+
+ INIT_LIST_HEAD(&info->head);
+ info->ident = ident;
+ info->checksum = func_checksum;
+ info->use_extra_checksum = use_extra_checksum;
+ info->cfg_checksum = cfg_checksum;
+ if (function_name)
+ info->function_name = kstrdup(function_name, GFP_KERNEL);
+
+ list_add_tail(&info->head, &current_info->functions);
+}
+EXPORT_SYMBOL(llvm_gcda_emit_function);
+
+void llvm_gcda_emit_arcs(u32 num_counters, u64 *counters)
+{
+ struct gcov_fn_info *info = list_last_entry(&current_info->functions,
+ struct gcov_fn_info, head);
+
+ info->num_counters = num_counters;
+ info->counters = counters;
+}
+EXPORT_SYMBOL(llvm_gcda_emit_arcs);
+
+void llvm_gcda_summary_info(void)
+{
+}
+EXPORT_SYMBOL(llvm_gcda_summary_info);
+
+void llvm_gcda_end_file(void)
+{
+}
+EXPORT_SYMBOL(llvm_gcda_end_file);
+
+/**
+ * gcov_info_filename - return info filename
+ * @info: profiling data set
+ */
+const char *gcov_info_filename(struct gcov_info *info)
+{
+ return info->filename;
+}
+
+/**
+ * gcov_info_version - return info version
+ * @info: profiling data set
+ */
+unsigned int gcov_info_version(struct gcov_info *info)
+{
+ return info->version;
+}
+
+/**
+ * gcov_info_next - return next profiling data set
+ * @info: profiling data set
+ *
+ * Returns next gcov_info following @info or first gcov_info in the chain if
+ * @info is %NULL.
+ */
+struct gcov_info *gcov_info_next(struct gcov_info *info)
+{
+ if (!info)
+ return list_first_entry_or_null(&clang_gcov_list,
+ struct gcov_info, head);
+ if (list_is_last(&info->head, &clang_gcov_list))
+ return NULL;
+ return list_next_entry(info, head);
+}
+
+/**
+ * gcov_info_link - link/add profiling data set to the list
+ * @info: profiling data set
+ */
+void gcov_info_link(struct gcov_info *info)
+{
+ list_add_tail(&info->head, &clang_gcov_list);
+}
+
+/**
+ * gcov_info_unlink - unlink/remove profiling data set from the list
+ * @prev: previous profiling data set
+ * @info: profiling data set
+ */
+void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
+{
+ /* Generic code unlinks while iterating. */
+ __list_del_entry(&info->head);
+}
+
+/**
+ * gcov_info_within_module - check if a profiling data set belongs to a module
+ * @info: profiling data set
+ * @mod: module
+ *
+ * Returns true if the profiling data belongs to the module, false otherwise.
+ */
+bool gcov_info_within_module(struct gcov_info *info, struct module *mod)
+{
+ return within_module((unsigned long)info->filename, mod);
+}
+
+/* Symbolic links to be created for each profiling data file. */
+const struct gcov_link gcov_link[] = {
+ { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
+ { 0, NULL},
+};
+
+/**
+ * gcov_info_reset - reset profiling data to zero
+ * @info: profiling data set
+ */
+void gcov_info_reset(struct gcov_info *info)
+{
+ struct gcov_fn_info *fn;
+
+ list_for_each_entry(fn, &info->functions, head)
+ memset(fn->counters, 0,
+ sizeof(fn->counters[0]) * fn->num_counters);
+}
+
+/**
+ * gcov_info_is_compatible - check if profiling data can be added
+ * @info1: first profiling data set
+ * @info2: second profiling data set
+ *
+ * Returns non-zero if profiling data can be added, zero otherwise.
+ */
+int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
+{
+ struct gcov_fn_info *fn_ptr1 = list_first_entry_or_null(
+ &info1->functions, struct gcov_fn_info, head);
+ struct gcov_fn_info *fn_ptr2 = list_first_entry_or_null(
+ &info2->functions, struct gcov_fn_info, head);
+
+ if (info1->checksum != info2->checksum)
+ return false;
+ if (!fn_ptr1)
+ return fn_ptr1 == fn_ptr2;
+ while (!list_is_last(&fn_ptr1->head, &info1->functions) &&
+ !list_is_last(&fn_ptr2->head, &info2->functions)) {
+ if (fn_ptr1->checksum != fn_ptr2->checksum)
+ return false;
+ if (fn_ptr1->use_extra_checksum != fn_ptr2->use_extra_checksum)
+ return false;
+ if (fn_ptr1->use_extra_checksum &&
+ fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum)
+ return false;
+ fn_ptr1 = list_next_entry(fn_ptr1, head);
+ fn_ptr2 = list_next_entry(fn_ptr2, head);
+ }
+ return list_is_last(&fn_ptr1->head, &info1->functions) &&
+ list_is_last(&fn_ptr2->head, &info2->functions);
+}
+
+/**
+ * gcov_info_add - add up profiling data
+ * @dest: profiling data set to which data is added
+ * @source: profiling data set which is added
+ *
+ * Adds profiling counts of @source to @dest.
+ */
+void gcov_info_add(struct gcov_info *dst, struct gcov_info *src)
+{
+ struct gcov_fn_info *dfn_ptr;
+ struct gcov_fn_info *sfn_ptr = list_first_entry_or_null(&src->functions,
+ struct gcov_fn_info, head);
+
+ list_for_each_entry(dfn_ptr, &dst->functions, head) {
+ u32 i;
+
+ for (i = 0; i < sfn_ptr->num_counters; i++)
+ dfn_ptr->counters[i] += sfn_ptr->counters[i];
+
+ /* keep the source function in lockstep with the destination */
+ sfn_ptr = list_next_entry(sfn_ptr, head);
+ }
+}
+
+static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn)
+{
+ size_t cv_size; /* counter values size */
+ struct gcov_fn_info *fn_dup = kmemdup(fn, sizeof(*fn),
+ GFP_KERNEL);
+ if (!fn_dup)
+ return NULL;
+ INIT_LIST_HEAD(&fn_dup->head);
+
+ fn_dup->function_name = kstrdup(fn->function_name, GFP_KERNEL);
+ if (!fn_dup->function_name)
+ goto err_name;
+
+ cv_size = fn->num_counters * sizeof(fn->counters[0]);
+ fn_dup->counters = vmalloc(cv_size);
+ if (!fn_dup->counters)
+ goto err_counters;
+ memcpy(fn_dup->counters, fn->counters, cv_size);
+
+ return fn_dup;
+
+err_counters:
+ kfree(fn_dup->function_name);
+err_name:
+ kfree(fn_dup);
+ return NULL;
+}
+
+/**
+ * gcov_info_dup - duplicate profiling data set
+ * @info: profiling data set to duplicate
+ *
+ * Return newly allocated duplicate on success, %NULL on error.
+ */
+struct gcov_info *gcov_info_dup(struct gcov_info *info)
+{
+ struct gcov_info *dup;
+ struct gcov_fn_info *fn;
+
+ dup = kmemdup(info, sizeof(*dup), GFP_KERNEL);
+ if (!dup)
+ return NULL;
+ INIT_LIST_HEAD(&dup->head);
+ INIT_LIST_HEAD(&dup->functions);
+ dup->filename = kstrdup(info->filename, GFP_KERNEL);
+ if (!dup->filename)
+ goto err;
+
+ list_for_each_entry(fn, &info->functions, head) {
+ struct gcov_fn_info *fn_dup = gcov_fn_info_dup(fn);
+
+ if (!fn_dup)
+ goto err;
+ list_add_tail(&fn_dup->head, &dup->functions);
+ }
+
+ return dup;
+
+err:
+ gcov_info_free(dup);
+ return NULL;
+}
+
+/**
+ * gcov_info_free - release memory for profiling data set duplicate
+ * @info: profiling data set duplicate to free
+ */
+void gcov_info_free(struct gcov_info *info)
+{
+ struct gcov_fn_info *fn, *tmp;
+
+ list_for_each_entry_safe(fn, tmp, &info->functions, head) {
+ kfree(fn->function_name);
+ vfree(fn->counters);
+ list_del(&fn->head);
+ kfree(fn);
+ }
+ kfree(info->filename);
+ kfree(info);
+}
+
+#define ITER_STRIDE PAGE_SIZE
+
+/**
+ * struct gcov_iterator - specifies current file position in logical records
+ * @info: associated profiling data
+ * @buffer: buffer containing file data
+ * @size: size of buffer
+ * @pos: current position in file
+ */
+struct gcov_iterator {
+ struct gcov_info *info;
+ void *buffer;
+ size_t size;
+ loff_t pos;
+};
+
+/**
+ * store_gcov_u32 - store 32 bit number in gcov format to buffer
+ * @buffer: target buffer or NULL
+ * @off: offset into the buffer
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't
+ * store anything.
+ */
+static size_t store_gcov_u32(void *buffer, size_t off, u32 v)
+{
+ u32 *data;
+
+ if (buffer) {
+ data = buffer + off;
+ *data = v;
+ }
+
+ return sizeof(*data);
+}
+
+/**
+ * store_gcov_u64 - store 64 bit number in gcov format to buffer
+ * @buffer: target buffer or NULL
+ * @off: offset into the buffer
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file. 64 bit numbers are stored as two 32 bit numbers, the low part
+ * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store
+ * anything.
+ */
+static size_t store_gcov_u64(void *buffer, size_t off, u64 v)
+{
+ u32 *data;
+
+ if (buffer) {
+ data = buffer + off;
+
+ data[0] = (v & 0xffffffffUL);
+ data[1] = (v >> 32);
+ }
+
+ return sizeof(*data) * 2;
+}
+
+/**
+ * convert_to_gcda - convert profiling data set to gcda file format
+ * @buffer: the buffer to store file data or %NULL if no data should be stored
+ * @info: profiling data set to be converted
+ *
+ * Returns the number of bytes that were/would have been stored into the buffer.
+ */
+static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
+{
+ struct gcov_fn_info *fi_ptr;
+ size_t pos = 0;
+
+ /* File header. */
+ pos += store_gcov_u32(buffer, pos, GCOV_DATA_MAGIC);
+ pos += store_gcov_u32(buffer, pos, info->version);
+ pos += store_gcov_u32(buffer, pos, info->checksum);
+
+ list_for_each_entry(fi_ptr, &info->functions, head) {
+ u32 i;
+ u32 len = 2;
+
+ if (fi_ptr->use_extra_checksum)
+ len++;
+
+ pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION);
+ pos += store_gcov_u32(buffer, pos, len);
+ pos += store_gcov_u32(buffer, pos, fi_ptr->ident);
+ pos += store_gcov_u32(buffer, pos, fi_ptr->checksum);
+ if (fi_ptr->use_extra_checksum)
+ pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
+
+ pos += store_gcov_u32(buffer, pos, GCOV_TAG_COUNTER_BASE);
+ pos += store_gcov_u32(buffer, pos, fi_ptr->num_counters * 2);
+ for (i = 0; i < fi_ptr->num_counters; i++)
+ pos += store_gcov_u64(buffer, pos, fi_ptr->counters[i]);
+ }
+
+ return pos;
+}
+
+/**
+ * gcov_iter_new - allocate and initialize profiling data iterator
+ * @info: profiling data set to be iterated
+ *
+ * Return file iterator on success, %NULL otherwise.
+ */
+struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
+{
+ struct gcov_iterator *iter;
+
+ iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL);
+ if (!iter)
+ goto err_free;
+
+ iter->info = info;
+ /* Dry-run to get the actual buffer size. */
+ iter->size = convert_to_gcda(NULL, info);
+ iter->buffer = vmalloc(iter->size);
+ if (!iter->buffer)
+ goto err_free;
+
+ convert_to_gcda(iter->buffer, info);
+
+ return iter;
+
+err_free:
+ kfree(iter);
+ return NULL;
+}
+
+
+/**
+ * gcov_iter_free - free the iterator and its buffer
+ * @iter: file iterator
+ */
+void gcov_iter_free(struct gcov_iterator *iter)
+{
+ vfree(iter->buffer);
+ kfree(iter);
+}
+
+/**
+ * gcov_iter_get_info - return profiling data set for given file iterator
+ * @iter: file iterator
+ */
+struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
+{
+ return iter->info;
+}
+
+/**
+ * gcov_iter_start - reset file iterator to starting position
+ * @iter: file iterator
+ */
+void gcov_iter_start(struct gcov_iterator *iter)
+{
+ iter->pos = 0;
+}
+
+/**
+ * gcov_iter_next - advance file iterator to next logical record
+ * @iter: file iterator
+ *
+ * Return zero if new position is valid, non-zero if iterator has reached end.
+ */
+int gcov_iter_next(struct gcov_iterator *iter)
+{
+ if (iter->pos < iter->size)
+ iter->pos += ITER_STRIDE;
+
+ if (iter->pos >= iter->size)
+ return -EINVAL;
+
+ return 0;
+}
+
+/**
+ * gcov_iter_write - write data for current pos to seq_file
+ * @iter: file iterator
+ * @seq: seq_file handle
+ *
+ * Return zero on success, non-zero otherwise.
+ */
+int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
+{
+ size_t len;
+
+ if (iter->pos >= iter->size)
+ return -EINVAL;
+
+ len = ITER_STRIDE;
+ if (iter->pos + len > iter->size)
+ len = iter->size - iter->pos;
+
+ seq_write(seq, iter->buffer + iter->pos, len);
+
+ return 0;
+}
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
index 2dddecbdbe6e..801ee4b0b969 100644
--- a/kernel/gcov/gcc_3_4.c
+++ b/kernel/gcov/gcc_3_4.c
@@ -137,6 +137,18 @@ void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
gcov_info_head = info->next;
}
+/**
+ * gcov_info_within_module - check if a profiling data set belongs to a module
+ * @info: profiling data set
+ * @mod: module
+ *
+ * Returns true if the profiling data belongs to the module, false otherwise.
+ */
+bool gcov_info_within_module(struct gcov_info *info, struct module *mod)
+{
+ return within_module((unsigned long)info, mod);
+}
+
/* Symbolic links to be created for each profiling data file. */
const struct gcov_link gcov_link[] = {
{ OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index ca5e5c0ef853..ec37563674d6 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -150,6 +150,18 @@ void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
gcov_info_head = info->next;
}
+/**
+ * gcov_info_within_module - check if a profiling data set belongs to a module
+ * @info: profiling data set
+ * @mod: module
+ *
+ * Returns true if the profiling data belongs to the module, false otherwise.
+ */
+bool gcov_info_within_module(struct gcov_info *info, struct module *mod)
+{
+ return within_module((unsigned long)info, mod);
+}
+
/* Symbolic links to be created for each profiling data file. */
const struct gcov_link gcov_link[] = {
{ OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
diff --git a/kernel/gcov/gcc_base.c b/kernel/gcov/gcc_base.c
new file mode 100644
index 000000000000..3cf736b9f880
--- /dev/null
+++ b/kernel/gcov/gcc_base.c
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/mutex.h>
+#include "gcov.h"
+
+/*
+ * __gcov_init is called by gcc-generated constructor code for each object
+ * file compiled with -fprofile-arcs.
+ */
+void __gcov_init(struct gcov_info *info)
+{
+ static unsigned int gcov_version;
+
+ mutex_lock(&gcov_lock);
+ if (gcov_version == 0) {
+ gcov_version = gcov_info_version(info);
+ /*
+ * Printing gcc's version magic may prove useful for debugging
+ * incompatibility reports.
+ */
+ pr_info("version magic: 0x%x\n", gcov_version);
+ }
+ /*
+ * Add new profiling data structure to list and inform event
+ * listener.
+ */
+ gcov_info_link(info);
+ if (gcov_events_enabled)
+ gcov_event(GCOV_ADD, info);
+ mutex_unlock(&gcov_lock);
+}
+EXPORT_SYMBOL(__gcov_init);
+
+/*
+ * These functions may be referenced by gcc-generated profiling code but serve
+ * no function for kernel profiling.
+ */
+void __gcov_flush(void)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_flush);
+
+void __gcov_merge_add(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_add);
+
+void __gcov_merge_single(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_single);
+
+void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_delta);
+
+void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_ior);
+
+void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_time_profile);
+
+void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_icall_topn);
+
+void __gcov_exit(void)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_exit);
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h
index de118ad4a024..6ab2c1808c9d 100644
--- a/kernel/gcov/gcov.h
+++ b/kernel/gcov/gcov.h
@@ -15,6 +15,7 @@
#ifndef GCOV_H
#define GCOV_H GCOV_H
+#include <linux/module.h>
#include <linux/types.h>
/*
@@ -46,6 +47,7 @@ unsigned int gcov_info_version(struct gcov_info *info);
struct gcov_info *gcov_info_next(struct gcov_info *info);
void gcov_info_link(struct gcov_info *info);
void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info);
+bool gcov_info_within_module(struct gcov_info *info, struct module *mod);
/* Base interface. */
enum gcov_action {
@@ -83,4 +85,7 @@ struct gcov_link {
};
extern const struct gcov_link gcov_link[];
+extern int gcov_events_enabled;
+extern struct mutex gcov_lock;
+
#endif /* GCOV_H */
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index f7fb8f6a688f..072b6ee55e3f 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -500,13 +500,7 @@ static int locate_mem_hole_callback(struct resource *res, void *arg)
return locate_mem_hole_bottom_up(start, end, kbuf);
}
-#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
-static int kexec_walk_memblock(struct kexec_buf *kbuf,
- int (*func)(struct resource *, void *))
-{
- return 0;
-}
-#else
+#ifdef CONFIG_ARCH_KEEP_MEMBLOCK
static int kexec_walk_memblock(struct kexec_buf *kbuf,
int (*func)(struct resource *, void *))
{
@@ -550,6 +544,12 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf,
return ret;
}
+#else
+static int kexec_walk_memblock(struct kexec_buf *kbuf,
+ int (*func)(struct resource *, void *))
+{
+ return 0;
+}
#endif
/**
@@ -589,7 +589,7 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
if (kbuf->mem != KEXEC_BUF_MEM_UNKNOWN)
return 0;
- if (IS_ENABLED(CONFIG_ARCH_DISCARD_MEMBLOCK))
+ if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
ret = kexec_walk_resources(kbuf, locate_mem_hole_callback);
else
ret = kexec_walk_memblock(kbuf, locate_mem_hole_callback);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 5942eeafb9ac..be4e8795561a 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -11,6 +11,7 @@
#include <linux/kthread.h>
#include <linux/completion.h>
#include <linux/err.h>
+#include <linux/cgroup.h>
#include <linux/cpuset.h>
#include <linux/unistd.h>
#include <linux/file.h>
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 99a5b5f46dc5..871734ea2f04 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -67,13 +67,10 @@ static struct latency_record latency_record[MAXLR];
int latencytop_enabled;
-void clear_all_latency_tracing(struct task_struct *p)
+void clear_tsk_latency_tracing(struct task_struct *p)
{
unsigned long flags;
- if (!latencytop_enabled)
- return;
-
raw_spin_lock_irqsave(&latency_lock, flags);
memset(&p->latency_record, 0, sizeof(p->latency_record));
p->latency_record_count = 0;
@@ -96,9 +93,6 @@ account_global_scheduler_latency(struct task_struct *tsk,
int firstnonnull = MAXLR + 1;
int i;
- if (!latencytop_enabled)
- return;
-
/* skip kernel threads for now */
if (!tsk->mm)
return;
diff --git a/kernel/memremap.c b/kernel/memremap.c
index a856cb5ff192..4e59d29245f4 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -148,6 +148,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
&pgmap->altmap : NULL;
struct resource *res = &pgmap->res;
struct dev_pagemap *conflict_pgmap;
+ struct mhp_restrictions restrictions = {
+ /*
+ * We do not want any optional features, only our own memmap
+ */
+ .altmap = altmap,
+ };
pgprot_t pgprot = PAGE_KERNEL;
int error, nid, is_ram;
@@ -214,7 +220,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
*/
if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
error = add_pages(nid, align_start >> PAGE_SHIFT,
- align_size >> PAGE_SHIFT, NULL, false);
+ align_size >> PAGE_SHIFT, &restrictions);
} else {
error = kasan_add_zero_shadow(__va(align_start), align_size);
if (error) {
@@ -222,8 +228,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
goto err_kasan;
}
- error = arch_add_memory(nid, align_start, align_size, altmap,
- false);
+ error = arch_add_memory(nid, align_start, align_size,
+ &restrictions);
}
if (!error) {
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 6196af8a8223..bfc95b3e4235 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -22,6 +22,7 @@ static int notifier_chain_register(struct notifier_block **nl,
struct notifier_block *n)
{
while ((*nl) != NULL) {
+ WARN_ONCE(((*nl) == n), "double register detected");
if (n->priority > (*nl)->priority)
break;
nl = &((*nl)->next);
diff --git a/kernel/panic.c b/kernel/panic.c
index c1fcaad337b7..b4543a31a495 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -51,6 +51,7 @@ EXPORT_SYMBOL_GPL(panic_timeout);
#define PANIC_PRINT_TIMER_INFO 0x00000004
#define PANIC_PRINT_LOCK_INFO 0x00000008
#define PANIC_PRINT_FTRACE_INFO 0x00000010
+#define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020
unsigned long panic_print;
ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
@@ -134,6 +135,9 @@ EXPORT_SYMBOL(nmi_panic);
static void panic_print_sys_info(void)
{
+ if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG)
+ console_flush_on_panic(CONSOLE_REPLAY_ALL);
+
if (panic_print & PANIC_PRINT_TASK_INFO)
show_state();
@@ -277,7 +281,7 @@ void panic(const char *fmt, ...)
* panic() is not being callled from OOPS.
*/
debug_locks_off();
- console_flush_on_panic();
+ console_flush_on_panic(CONSOLE_FLUSH_PENDING);
panic_print_sys_info();
@@ -306,6 +310,8 @@ void panic(const char *fmt, ...)
* shutting down. But if there is a chance of
* rebooting the system it will be rebooted.
*/
+ if (panic_reboot_mode != REBOOT_UNDEFINED)
+ reboot_mode = panic_reboot_mode;
emergency_restart();
}
#ifdef __sparc__
@@ -321,6 +327,9 @@ void panic(const char *fmt, ...)
disabled_wait();
#endif
pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf);
+
+ /* Do not scroll important messages printed above */
+ suppress_printk = 1;
local_irq_enable();
for (i = 0; ; i += PANIC_TIMER_STEP) {
touch_softlockup_watchdog();
diff --git a/kernel/pid.c b/kernel/pid.c
index 20881598bdfa..5b063f0556ba 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -37,12 +37,12 @@
#include <linux/init_task.h>
#include <linux/syscalls.h>
#include <linux/proc_ns.h>
-#include <linux/proc_fs.h>
+#include <linux/refcount.h>
#include <linux/sched/task.h>
#include <linux/idr.h>
struct pid init_struct_pid = {
- .count = ATOMIC_INIT(1),
+ .count = REFCOUNT_INIT(1),
.tasks = {
{ .first = NULL },
{ .first = NULL },
@@ -106,8 +106,8 @@ void put_pid(struct pid *pid)
return;
ns = pid->numbers[pid->level].ns;
- if ((atomic_read(&pid->count) == 1) ||
- atomic_dec_and_test(&pid->count)) {
+ if ((refcount_read(&pid->count) == 1) ||
+ refcount_dec_and_test(&pid->count)) {
kmem_cache_free(ns->pid_cachep, pid);
put_pid_ns(ns);
}
@@ -210,7 +210,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
}
get_pid_ns(ns);
- atomic_set(&pid->count, 1);
+ refcount_set(&pid->count, 1);
for (type = 0; type < PIDTYPE_MAX; ++type)
INIT_HLIST_HEAD(&pid->tasks[type]);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 02ca827b8fac..e1e82506b2dd 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -86,6 +86,12 @@ static DEFINE_SEMAPHORE(console_sem);
struct console *console_drivers;
EXPORT_SYMBOL_GPL(console_drivers);
+/*
+ * The system may need to suppress printk messages under certain
+ * circumstances, like after a kernel panic happens.
+ */
+int __read_mostly suppress_printk;
+
#ifdef CONFIG_LOCKDEP
static struct lockdep_map console_lock_dep_map = {
.name = "console_lock"
@@ -1943,6 +1949,10 @@ asmlinkage int vprintk_emit(int facility, int level,
unsigned long flags;
u64 curr_log_seq;
+ /* Suppress unimportant messages after panic happens */
+ if (unlikely(suppress_printk))
+ return 0;
+
if (level == LOGLEVEL_SCHED) {
level = LOGLEVEL_DEFAULT;
in_sched = true;
@@ -2525,10 +2535,11 @@ void console_unblank(void)
/**
* console_flush_on_panic - flush console content on panic
+ * @mode: flush all messages in buffer or just the pending ones
*
* Immediately output all pending messages no matter what.
*/
-void console_flush_on_panic(void)
+void console_flush_on_panic(enum con_flush_mode mode)
{
/*
* If someone else is holding the console lock, trylock will fail
@@ -2539,6 +2550,11 @@ void console_flush_on_panic(void)
*/
console_trylock();
console_may_schedule = 0;
+
+ if (mode == CONSOLE_REPLAY_ALL) {
+ console_seq = log_first_seq;
+ console_idx = log_first_idx;
+ }
console_unlock();
}
diff --git a/kernel/reboot.c b/kernel/reboot.c
index e1b79b6a2735..b9e79e8c7226 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -31,6 +31,7 @@ EXPORT_SYMBOL(cad_pid);
#define DEFAULT_REBOOT_MODE
#endif
enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE;
+enum reboot_mode panic_reboot_mode = REBOOT_UNDEFINED;
/*
* This variable is used privately to keep track of whether or not
@@ -519,6 +520,8 @@ EXPORT_SYMBOL_GPL(orderly_reboot);
static int __init reboot_setup(char *str)
{
for (;;) {
+ enum reboot_mode *mode;
+
/*
* Having anything passed on the command line via
* reboot= will cause us to disable DMI checking
@@ -526,17 +529,24 @@ static int __init reboot_setup(char *str)
*/
reboot_default = 0;
+ if (!strncmp(str, "panic_", 6)) {
+ mode = &panic_reboot_mode;
+ str += 6;
+ } else {
+ mode = &reboot_mode;
+ }
+
switch (*str) {
case 'w':
- reboot_mode = REBOOT_WARM;
+ *mode = REBOOT_WARM;
break;
case 'c':
- reboot_mode = REBOOT_COLD;
+ *mode = REBOOT_COLD;
break;
case 'h':
- reboot_mode = REBOOT_HARD;
+ *mode = REBOOT_HARD;
break;
case 's':
@@ -553,11 +563,11 @@ static int __init reboot_setup(char *str)
if (rc)
return rc;
} else
- reboot_mode = REBOOT_SOFT;
+ *mode = REBOOT_SOFT;
break;
}
case 'g':
- reboot_mode = REBOOT_GPIO;
+ *mode = REBOOT_GPIO;
break;
case 'b':
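(Per the parsing added above, each reboot= token may now carry a "panic_" prefix that sets panic_reboot_mode instead of reboot_mode; for example, a command line such as reboot=panic_cold — syntax inferred from the parser, not spelled out in these hunks — leaves normal reboots at the default mode while panic() switches reboot_mode to the cold variant just before emergency_restart(), as shown in the kernel/panic.c hunk earlier.)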
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 0e97ca9306ef..e88918e0bb6d 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -4,6 +4,9 @@
* Copyright (c) 2018 Facebook, Inc.
* Author: Johannes Weiner <hannes@cmpxchg.org>
*
+ * Polling support by Suren Baghdasaryan <surenb@google.com>
+ * Copyright (c) 2018 Google, Inc.
+ *
* When CPU, memory and IO are contended, tasks experience delays that
* reduce throughput and introduce latencies into the workload. Memory
* and IO contention, in addition, can cause a full loss of forward
@@ -129,9 +132,13 @@
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/seqlock.h>
+#include <linux/uaccess.h>
#include <linux/cgroup.h>
#include <linux/module.h>
#include <linux/sched.h>
+#include <linux/ctype.h>
+#include <linux/file.h>
+#include <linux/poll.h>
#include <linux/psi.h>
#include "sched.h"
@@ -140,9 +147,9 @@ static int psi_bug __read_mostly;
DEFINE_STATIC_KEY_FALSE(psi_disabled);
#ifdef CONFIG_PSI_DEFAULT_DISABLED
-bool psi_enable;
+static bool psi_enable;
#else
-bool psi_enable = true;
+static bool psi_enable = true;
#endif
static int __init setup_psi(char *str)
{
@@ -156,6 +163,11 @@ __setup("psi=", setup_psi);
#define EXP_60s 1981 /* 1/exp(2s/60s) */
#define EXP_300s 2034 /* 1/exp(2s/300s) */
+/* PSI trigger definitions */
+#define WINDOW_MIN_US 500000 /* Min window size is 500ms */
+#define WINDOW_MAX_US 10000000 /* Max window size is 10s */
+#define UPDATES_PER_WINDOW 10 /* 10 updates per window */
+
/* Sampling frequency in nanoseconds */
static u64 psi_period __read_mostly;
@@ -165,7 +177,7 @@ static struct psi_group psi_system = {
.pcpu = &system_group_pcpu,
};
-static void psi_update_work(struct work_struct *work);
+static void psi_avgs_work(struct work_struct *work);
static void group_init(struct psi_group *group)
{
@@ -173,9 +185,20 @@ static void group_init(struct psi_group *group)
for_each_possible_cpu(cpu)
seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
- group->next_update = sched_clock() + psi_period;
- INIT_DELAYED_WORK(&group->clock_work, psi_update_work);
- mutex_init(&group->stat_lock);
+ group->avg_next_update = sched_clock() + psi_period;
+ INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
+ mutex_init(&group->avgs_lock);
+ /* Init trigger-related members */
+ atomic_set(&group->poll_scheduled, 0);
+ mutex_init(&group->trigger_lock);
+ INIT_LIST_HEAD(&group->triggers);
+ memset(group->nr_triggers, 0, sizeof(group->nr_triggers));
+ group->poll_states = 0;
+ group->poll_min_period = U32_MAX;
+ memset(group->polling_total, 0, sizeof(group->polling_total));
+ group->polling_next_update = ULLONG_MAX;
+ group->polling_until = 0;
+ rcu_assign_pointer(group->poll_kworker, NULL);
}
void __init psi_init(void)
@@ -210,20 +233,24 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
}
}
-static void get_recent_times(struct psi_group *group, int cpu, u32 *times)
+static void get_recent_times(struct psi_group *group, int cpu,
+ enum psi_aggregators aggregator, u32 *times,
+ u32 *pchanged_states)
{
struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
- unsigned int tasks[NR_PSI_TASK_COUNTS];
u64 now, state_start;
+ enum psi_states s;
unsigned int seq;
- int s;
+ u32 state_mask;
+
+ *pchanged_states = 0;
/* Snapshot a coherent view of the CPU state */
do {
seq = read_seqcount_begin(&groupc->seq);
now = cpu_clock(cpu);
memcpy(times, groupc->times, sizeof(groupc->times));
- memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
+ state_mask = groupc->state_mask;
state_start = groupc->state_start;
} while (read_seqcount_retry(&groupc->seq, seq));
@@ -239,13 +266,15 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times)
* (u32) and our reported pressure close to what's
* actually happening.
*/
- if (test_state(tasks, s))
+ if (state_mask & (1 << s))
times[s] += now - state_start;
- delta = times[s] - groupc->times_prev[s];
- groupc->times_prev[s] = times[s];
+ delta = times[s] - groupc->times_prev[aggregator][s];
+ groupc->times_prev[aggregator][s] = times[s];
times[s] = delta;
+ if (delta)
+ *pchanged_states |= (1 << s);
}
}
@@ -269,17 +298,16 @@ static void calc_avgs(unsigned long avg[3], int missed_periods,
avg[2] = calc_load(avg[2], EXP_300s, pct);
}
-static bool update_stats(struct psi_group *group)
+static void collect_percpu_times(struct psi_group *group,
+ enum psi_aggregators aggregator,
+ u32 *pchanged_states)
{
u64 deltas[NR_PSI_STATES - 1] = { 0, };
- unsigned long missed_periods = 0;
unsigned long nonidle_total = 0;
- u64 now, expires, period;
+ u32 changed_states = 0;
int cpu;
int s;
- mutex_lock(&group->stat_lock);
-
/*
* Collect the per-cpu time buckets and average them into a
* single time sample that is normalized to wallclock time.
@@ -291,8 +319,11 @@ static bool update_stats(struct psi_group *group)
for_each_possible_cpu(cpu) {
u32 times[NR_PSI_STATES];
u32 nonidle;
+ u32 cpu_changed_states;
- get_recent_times(group, cpu, times);
+ get_recent_times(group, cpu, aggregator, times,
+ &cpu_changed_states);
+ changed_states |= cpu_changed_states;
nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]);
nonidle_total += nonidle;
@@ -315,13 +346,22 @@ static bool update_stats(struct psi_group *group)
/* total= */
for (s = 0; s < NR_PSI_STATES - 1; s++)
- group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL));
+ group->total[aggregator][s] +=
+ div_u64(deltas[s], max(nonidle_total, 1UL));
+
+ if (pchanged_states)
+ *pchanged_states = changed_states;
+}
+
+static u64 update_averages(struct psi_group *group, u64 now)
+{
+ unsigned long missed_periods = 0;
+ u64 expires, period;
+ u64 avg_next_update;
+ int s;
/* avgX= */
- now = sched_clock();
- expires = group->next_update;
- if (now < expires)
- goto out;
+ expires = group->avg_next_update;
if (now - expires >= psi_period)
missed_periods = div_u64(now - expires, psi_period);
@@ -332,14 +372,14 @@ static bool update_stats(struct psi_group *group)
* But the deltas we sample out of the per-cpu buckets above
* are based on the actual time elapsing between clock ticks.
*/
- group->next_update = expires + ((1 + missed_periods) * psi_period);
- period = now - (group->last_update + (missed_periods * psi_period));
- group->last_update = now;
+ avg_next_update = expires + ((1 + missed_periods) * psi_period);
+ period = now - (group->avg_last_update + (missed_periods * psi_period));
+ group->avg_last_update = now;
for (s = 0; s < NR_PSI_STATES - 1; s++) {
u32 sample;
- sample = group->total[s] - group->total_prev[s];
+ sample = group->total[PSI_AVGS][s] - group->avg_total[s];
/*
* Due to the lockless sampling of the time buckets,
* recorded time deltas can slip into the next period,
@@ -359,23 +399,30 @@ static bool update_stats(struct psi_group *group)
*/
if (sample > period)
sample = period;
- group->total_prev[s] += sample;
+ group->avg_total[s] += sample;
calc_avgs(group->avg[s], missed_periods, sample, period);
}
-out:
- mutex_unlock(&group->stat_lock);
- return nonidle_total;
+
+ return avg_next_update;
}
-static void psi_update_work(struct work_struct *work)
+static void psi_avgs_work(struct work_struct *work)
{
struct delayed_work *dwork;
struct psi_group *group;
+ u32 changed_states;
bool nonidle;
+ u64 now;
dwork = to_delayed_work(work);
- group = container_of(dwork, struct psi_group, clock_work);
+ group = container_of(dwork, struct psi_group, avgs_work);
+
+ mutex_lock(&group->avgs_lock);
+ now = sched_clock();
+
+ collect_percpu_times(group, PSI_AVGS, &changed_states);
+ nonidle = changed_states & (1 << PSI_NONIDLE);
/*
* If there is task activity, periodically fold the per-cpu
* times and feed samples into the running averages. If things
@@ -383,18 +430,196 @@ static void psi_update_work(struct work_struct *work)
* Once restarted, we'll catch up the running averages in one
* go - see calc_avgs() and missed_periods.
*/
-
- nonidle = update_stats(group);
+ if (now >= group->avg_next_update)
+ group->avg_next_update = update_averages(group, now);
if (nonidle) {
- unsigned long delay = 0;
- u64 now;
+ schedule_delayed_work(dwork, nsecs_to_jiffies(
+ group->avg_next_update - now) + 1);
+ }
+
+ mutex_unlock(&group->avgs_lock);
+}
+
+/* Trigger tracking window manipulations */
+static void window_reset(struct psi_window *win, u64 now, u64 value,
+ u64 prev_growth)
+{
+ win->start_time = now;
+ win->start_value = value;
+ win->prev_growth = prev_growth;
+}
+
+/*
+ * PSI growth tracking window update and growth calculation routine.
+ *
+ * This approximates a sliding tracking window by interpolating
+ * partially elapsed windows using historical growth data from the
+ * previous intervals. This minimizes memory requirements (by not storing
+ * all the intermediate values in the previous window) and simplifies
+ * the calculations. It works well because the PSI signal changes only in
+ * the positive direction and, over relatively small window sizes, the growth
+ * is close to linear.
+ */
+static u64 window_update(struct psi_window *win, u64 now, u64 value)
+{
+ u64 elapsed;
+ u64 growth;
+
+ elapsed = now - win->start_time;
+ growth = value - win->start_value;
+ /*
+ * After each tracking window passes win->start_value and
+ * win->start_time get reset and win->prev_growth stores
+ * the average per-window growth of the previous window.
+ * win->prev_growth is then used to interpolate additional
+ * growth from the previous window assuming it was linear.
+ */
+ if (elapsed > win->size)
+ window_reset(win, now, value, growth);
+ else {
+ u32 remaining;
+
+ remaining = win->size - elapsed;
+ growth += div_u64(win->prev_growth * remaining, win->size);
+ }
+
+ return growth;
+}
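A worked example of the interpolation above may help; this is a minimal
userspace sketch with assumed numbers (a 1 s window, 600 ms elapsed, 100 ms of
new stall, 200 ms of growth in the previous window), only restating the
arithmetic of window_update():

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t win_size    = 1000000000ULL;	/* 1 s window */
	uint64_t elapsed     =  600000000ULL;	/* 600 ms into the window */
	uint64_t growth      =  100000000ULL;	/* new stall observed so far */
	uint64_t prev_growth =  200000000ULL;	/* growth of the previous window */
	uint64_t remaining   = win_size - elapsed;

	/* Credit the fraction of the previous window that still overlaps the
	 * sliding window, assuming its growth was linear. */
	growth += prev_growth * remaining / win_size;

	/* Prints 180000000 ns: 100 ms observed plus 80 ms interpolated. */
	printf("interpolated growth: %llu ns\n", (unsigned long long)growth);
	return 0;
}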
+
+static void init_triggers(struct psi_group *group, u64 now)
+{
+ struct psi_trigger *t;
+
+ list_for_each_entry(t, &group->triggers, node)
+ window_reset(&t->win, now,
+ group->total[PSI_POLL][t->state], 0);
+ memcpy(group->polling_total, group->total[PSI_POLL],
+ sizeof(group->polling_total));
+ group->polling_next_update = now + group->poll_min_period;
+}
+
+static u64 update_triggers(struct psi_group *group, u64 now)
+{
+ struct psi_trigger *t;
+ bool new_stall = false;
+ u64 *total = group->total[PSI_POLL];
+
+ /*
+ * On subsequent updates, calculate growth deltas and let
+ * watchers know when their specified thresholds are exceeded.
+ */
+ list_for_each_entry(t, &group->triggers, node) {
+ u64 growth;
+
+ /* Check for stall activity */
+ if (group->polling_total[t->state] == total[t->state])
+ continue;
+
+ /*
+ * Multiple triggers might be looking at the same state, so
+ * remember to update group->polling_total[] once we've
+ * been through all of them. Also remember to extend the
+ * polling time if we see new stall activity.
+ */
+ new_stall = true;
+
+ /* Calculate growth since last update */
+ growth = window_update(&t->win, now, total[t->state]);
+ if (growth < t->threshold)
+ continue;
+
+ /* Limit event signaling to once per window */
+ if (now < t->last_event_time + t->win.size)
+ continue;
+
+ /* Generate an event */
+ if (cmpxchg(&t->event, 0, 1) == 0)
+ wake_up_interruptible(&t->event_wait);
+ t->last_event_time = now;
+ }
+
+ if (new_stall)
+ memcpy(group->polling_total, total,
+ sizeof(group->polling_total));
+
+ return now + group->poll_min_period;
+}
+
+/*
+ * Schedule polling if it's not already scheduled. It's safe to call this
+ * even from a hotpath: although kthread_queue_delayed_work() takes the
+ * worker->lock spinlock, that lock is never contended because the
+ * poll_scheduled atomic prevents such competition.
+ */
+static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
+{
+ struct kthread_worker *kworker;
+
+ /* Do not reschedule if already scheduled */
+ if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0)
+ return;
+
+ rcu_read_lock();
- now = sched_clock();
- if (group->next_update > now)
- delay = nsecs_to_jiffies(group->next_update - now) + 1;
- schedule_delayed_work(dwork, delay);
+ kworker = rcu_dereference(group->poll_kworker);
+ /*
+ * kworker might be NULL if psi_trigger_destroy() races with
+ * psi_task_change() (the hotpath), which cannot take locks
+ */
+ if (likely(kworker))
+ kthread_queue_delayed_work(kworker, &group->poll_work, delay);
+ else
+ atomic_set(&group->poll_scheduled, 0);
+
+ rcu_read_unlock();
+}
+
+static void psi_poll_work(struct kthread_work *work)
+{
+ struct kthread_delayed_work *dwork;
+ struct psi_group *group;
+ u32 changed_states;
+ u64 now;
+
+ dwork = container_of(work, struct kthread_delayed_work, work);
+ group = container_of(dwork, struct psi_group, poll_work);
+
+ atomic_set(&group->poll_scheduled, 0);
+
+ mutex_lock(&group->trigger_lock);
+
+ now = sched_clock();
+
+ collect_percpu_times(group, PSI_POLL, &changed_states);
+
+ if (changed_states & group->poll_states) {
+ /* Initialize trigger windows when entering polling mode */
+ if (now > group->polling_until)
+ init_triggers(group, now);
+
+ /*
+ * Keep the monitor active for at least the duration of the
+ * minimum tracking window as long as monitor states are
+ * changing.
+ */
+ group->polling_until = now +
+ group->poll_min_period * UPDATES_PER_WINDOW;
+ }
+
+ if (now > group->polling_until) {
+ group->polling_next_update = ULLONG_MAX;
+ goto out;
}
+
+ if (now >= group->polling_next_update)
+ group->polling_next_update = update_triggers(group, now);
+
+ psi_schedule_poll_work(group,
+ nsecs_to_jiffies(group->polling_next_update - now) + 1);
+
+out:
+ mutex_unlock(&group->trigger_lock);
}
static void record_times(struct psi_group_cpu *groupc, int cpu,
@@ -407,15 +632,15 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
delta = now - groupc->state_start;
groupc->state_start = now;
- if (test_state(groupc->tasks, PSI_IO_SOME)) {
+ if (groupc->state_mask & (1 << PSI_IO_SOME)) {
groupc->times[PSI_IO_SOME] += delta;
- if (test_state(groupc->tasks, PSI_IO_FULL))
+ if (groupc->state_mask & (1 << PSI_IO_FULL))
groupc->times[PSI_IO_FULL] += delta;
}
- if (test_state(groupc->tasks, PSI_MEM_SOME)) {
+ if (groupc->state_mask & (1 << PSI_MEM_SOME)) {
groupc->times[PSI_MEM_SOME] += delta;
- if (test_state(groupc->tasks, PSI_MEM_FULL))
+ if (groupc->state_mask & (1 << PSI_MEM_FULL))
groupc->times[PSI_MEM_FULL] += delta;
else if (memstall_tick) {
u32 sample;
@@ -436,18 +661,20 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
}
}
- if (test_state(groupc->tasks, PSI_CPU_SOME))
+ if (groupc->state_mask & (1 << PSI_CPU_SOME))
groupc->times[PSI_CPU_SOME] += delta;
- if (test_state(groupc->tasks, PSI_NONIDLE))
+ if (groupc->state_mask & (1 << PSI_NONIDLE))
groupc->times[PSI_NONIDLE] += delta;
}
-static void psi_group_change(struct psi_group *group, int cpu,
- unsigned int clear, unsigned int set)
+static u32 psi_group_change(struct psi_group *group, int cpu,
+ unsigned int clear, unsigned int set)
{
struct psi_group_cpu *groupc;
unsigned int t, m;
+ enum psi_states s;
+ u32 state_mask = 0;
groupc = per_cpu_ptr(group->pcpu, cpu);
@@ -480,7 +707,16 @@ static void psi_group_change(struct psi_group *group, int cpu,
if (set & (1 << t))
groupc->tasks[t]++;
+ /* Calculate state mask representing active states */
+ for (s = 0; s < NR_PSI_STATES; s++) {
+ if (test_state(groupc->tasks, s))
+ state_mask |= (1 << s);
+ }
+ groupc->state_mask = state_mask;
+
write_seqcount_end(&groupc->seq);
+
+ return state_mask;
}
static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
@@ -537,13 +773,17 @@ void psi_task_change(struct task_struct *task, int clear, int set)
*/
if (unlikely((clear & TSK_RUNNING) &&
(task->flags & PF_WQ_WORKER) &&
- wq_worker_last_func(task) == psi_update_work))
+ wq_worker_last_func(task) == psi_avgs_work))
wake_clock = false;
while ((group = iterate_groups(task, &iter))) {
- psi_group_change(group, cpu, clear, set);
- if (wake_clock && !delayed_work_pending(&group->clock_work))
- schedule_delayed_work(&group->clock_work, PSI_FREQ);
+ u32 state_mask = psi_group_change(group, cpu, clear, set);
+
+ if (state_mask & group->poll_states)
+ psi_schedule_poll_work(group, 1);
+
+ if (wake_clock && !delayed_work_pending(&group->avgs_work))
+ schedule_delayed_work(&group->avgs_work, PSI_FREQ);
}
}
@@ -640,8 +880,10 @@ void psi_cgroup_free(struct cgroup *cgroup)
if (static_branch_likely(&psi_disabled))
return;
- cancel_delayed_work_sync(&cgroup->psi.clock_work);
+ cancel_delayed_work_sync(&cgroup->psi.avgs_work);
free_percpu(cgroup->psi.pcpu);
+ /* All triggers must be removed by now */
+ WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n");
}
/**
@@ -697,11 +939,18 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
{
int full;
+ u64 now;
if (static_branch_likely(&psi_disabled))
return -EOPNOTSUPP;
- update_stats(group);
+ /* Update averages before reporting them */
+ mutex_lock(&group->avgs_lock);
+ now = sched_clock();
+ collect_percpu_times(group, PSI_AVGS, NULL);
+ if (now >= group->avg_next_update)
+ group->avg_next_update = update_averages(group, now);
+ mutex_unlock(&group->avgs_lock);
for (full = 0; full < 2 - (res == PSI_CPU); full++) {
unsigned long avg[3];
@@ -710,7 +959,8 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
for (w = 0; w < 3; w++)
avg[w] = group->avg[res * 2 + full][w];
- total = div_u64(group->total[res * 2 + full], NSEC_PER_USEC);
+ total = div_u64(group->total[PSI_AVGS][res * 2 + full],
+ NSEC_PER_USEC);
seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
full ? "full" : "some",
@@ -753,25 +1003,270 @@ static int psi_cpu_open(struct inode *inode, struct file *file)
return single_open(file, psi_cpu_show, NULL);
}
+struct psi_trigger *psi_trigger_create(struct psi_group *group,
+ char *buf, size_t nbytes, enum psi_res res)
+{
+ struct psi_trigger *t;
+ enum psi_states state;
+ u32 threshold_us;
+ u32 window_us;
+
+ if (static_branch_likely(&psi_disabled))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2)
+ state = PSI_IO_SOME + res * 2;
+ else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2)
+ state = PSI_IO_FULL + res * 2;
+ else
+ return ERR_PTR(-EINVAL);
+
+ if (state >= PSI_NONIDLE)
+ return ERR_PTR(-EINVAL);
+
+ if (window_us < WINDOW_MIN_US ||
+ window_us > WINDOW_MAX_US)
+ return ERR_PTR(-EINVAL);
+
+ /* Check threshold */
+ if (threshold_us == 0 || threshold_us > window_us)
+ return ERR_PTR(-EINVAL);
+
+ t = kmalloc(sizeof(*t), GFP_KERNEL);
+ if (!t)
+ return ERR_PTR(-ENOMEM);
+
+ t->group = group;
+ t->state = state;
+ t->threshold = threshold_us * NSEC_PER_USEC;
+ t->win.size = window_us * NSEC_PER_USEC;
+ window_reset(&t->win, 0, 0, 0);
+
+ t->event = 0;
+ t->last_event_time = 0;
+ init_waitqueue_head(&t->event_wait);
+ kref_init(&t->refcount);
+
+ mutex_lock(&group->trigger_lock);
+
+ if (!rcu_access_pointer(group->poll_kworker)) {
+ struct sched_param param = {
+ .sched_priority = MAX_RT_PRIO - 1,
+ };
+ struct kthread_worker *kworker;
+
+ kworker = kthread_create_worker(0, "psimon");
+ if (IS_ERR(kworker)) {
+ kfree(t);
+ mutex_unlock(&group->trigger_lock);
+ return ERR_CAST(kworker);
+ }
+ sched_setscheduler(kworker->task, SCHED_FIFO, &param);
+ kthread_init_delayed_work(&group->poll_work,
+ psi_poll_work);
+ rcu_assign_pointer(group->poll_kworker, kworker);
+ }
+
+ list_add(&t->node, &group->triggers);
+ group->poll_min_period = min(group->poll_min_period,
+ div_u64(t->win.size, UPDATES_PER_WINDOW));
+ group->nr_triggers[t->state]++;
+ group->poll_states |= (1 << t->state);
+
+ mutex_unlock(&group->trigger_lock);
+
+ return t;
+}
+
+static void psi_trigger_destroy(struct kref *ref)
+{
+ struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount);
+ struct psi_group *group = t->group;
+ struct kthread_worker *kworker_to_destroy = NULL;
+
+ if (static_branch_likely(&psi_disabled))
+ return;
+
+ /*
+ * Wake up waiters to stop polling. This can happen if a cgroup
+ * is deleted from under a polling process.
+ */
+ wake_up_interruptible(&t->event_wait);
+
+ mutex_lock(&group->trigger_lock);
+
+ if (!list_empty(&t->node)) {
+ struct psi_trigger *tmp;
+ u64 period = ULLONG_MAX;
+
+ list_del(&t->node);
+ group->nr_triggers[t->state]--;
+ if (!group->nr_triggers[t->state])
+ group->poll_states &= ~(1 << t->state);
+ /* reset min update period for the remaining triggers */
+ list_for_each_entry(tmp, &group->triggers, node)
+ period = min(period, div_u64(tmp->win.size,
+ UPDATES_PER_WINDOW));
+ group->poll_min_period = period;
+ /* Destroy poll_kworker when the last trigger is destroyed */
+ if (group->poll_states == 0) {
+ group->polling_until = 0;
+ kworker_to_destroy = rcu_dereference_protected(
+ group->poll_kworker,
+ lockdep_is_held(&group->trigger_lock));
+ rcu_assign_pointer(group->poll_kworker, NULL);
+ }
+ }
+
+ mutex_unlock(&group->trigger_lock);
+
+ /*
+ * Wait for both the *trigger_ptr readers (from psi_trigger_replace())
+ * and the poll_kworker readers to complete their RCU read-side critical
+ * sections before destroying the trigger and, optionally, the poll_kworker.
+ */
+ synchronize_rcu();
+ /*
+ * Destroy the kworker after releasing trigger_lock to prevent a
+ * deadlock while waiting for psi_poll_work to acquire trigger_lock
+ */
+ if (kworker_to_destroy) {
+ kthread_cancel_delayed_work_sync(&group->poll_work);
+ kthread_destroy_worker(kworker_to_destroy);
+ }
+ kfree(t);
+}
+
+void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new)
+{
+ struct psi_trigger *old = *trigger_ptr;
+
+ if (static_branch_likely(&psi_disabled))
+ return;
+
+ rcu_assign_pointer(*trigger_ptr, new);
+ if (old)
+ kref_put(&old->refcount, psi_trigger_destroy);
+}
+
+__poll_t psi_trigger_poll(void **trigger_ptr,
+ struct file *file, poll_table *wait)
+{
+ __poll_t ret = DEFAULT_POLLMASK;
+ struct psi_trigger *t;
+
+ if (static_branch_likely(&psi_disabled))
+ return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
+
+ rcu_read_lock();
+
+ t = rcu_dereference(*(void __rcu __force **)trigger_ptr);
+ if (!t) {
+ rcu_read_unlock();
+ return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
+ }
+ kref_get(&t->refcount);
+
+ rcu_read_unlock();
+
+ poll_wait(file, &t->event_wait, wait);
+
+ if (cmpxchg(&t->event, 1, 0) == 1)
+ ret |= EPOLLPRI;
+
+ kref_put(&t->refcount, psi_trigger_destroy);
+
+ return ret;
+}
+
+static ssize_t psi_write(struct file *file, const char __user *user_buf,
+ size_t nbytes, enum psi_res res)
+{
+ char buf[32];
+ size_t buf_size;
+ struct seq_file *seq;
+ struct psi_trigger *new;
+
+ if (static_branch_likely(&psi_disabled))
+ return -EOPNOTSUPP;
+
+ buf_size = min(nbytes, (sizeof(buf) - 1));
+ if (copy_from_user(buf, user_buf, buf_size))
+ return -EFAULT;
+
+ buf[buf_size - 1] = '\0';
+
+ new = psi_trigger_create(&psi_system, buf, nbytes, res);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ seq = file->private_data;
+ /* Take seq->lock to protect seq->private from concurrent writes */
+ mutex_lock(&seq->lock);
+ psi_trigger_replace(&seq->private, new);
+ mutex_unlock(&seq->lock);
+
+ return nbytes;
+}
+
+static ssize_t psi_io_write(struct file *file, const char __user *user_buf,
+ size_t nbytes, loff_t *ppos)
+{
+ return psi_write(file, user_buf, nbytes, PSI_IO);
+}
+
+static ssize_t psi_memory_write(struct file *file, const char __user *user_buf,
+ size_t nbytes, loff_t *ppos)
+{
+ return psi_write(file, user_buf, nbytes, PSI_MEM);
+}
+
+static ssize_t psi_cpu_write(struct file *file, const char __user *user_buf,
+ size_t nbytes, loff_t *ppos)
+{
+ return psi_write(file, user_buf, nbytes, PSI_CPU);
+}
+
+static __poll_t psi_fop_poll(struct file *file, poll_table *wait)
+{
+ struct seq_file *seq = file->private_data;
+
+ return psi_trigger_poll(&seq->private, file, wait);
+}
+
+static int psi_fop_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+
+ psi_trigger_replace(&seq->private, NULL);
+ return single_release(inode, file);
+}
+
static const struct file_operations psi_io_fops = {
.open = psi_io_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .write = psi_io_write,
+ .poll = psi_fop_poll,
+ .release = psi_fop_release,
};
static const struct file_operations psi_memory_fops = {
.open = psi_memory_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .write = psi_memory_write,
+ .poll = psi_fop_poll,
+ .release = psi_fop_release,
};
static const struct file_operations psi_cpu_fops = {
.open = psi_cpu_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .write = psi_cpu_write,
+ .poll = psi_fop_poll,
+ .release = psi_fop_release,
};
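For reference, a hedged userspace sketch of the trigger interface wired up
above: write a "some <threshold_us> <window_us>" trigger (the format parsed by
psi_trigger_create()) and poll() for POLLPRI events. The /proc/pressure/memory
path and the chosen threshold/window values are assumptions, not taken from
this hunk.

#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *trig = "some 150000 1000000";  /* 150 ms stall per 1 s window */
	struct pollfd fds;

	fds.fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
	if (fds.fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fds.fd, trig, strlen(trig) + 1) < 0) {
		perror("write trigger");
		return 1;
	}
	fds.events = POLLPRI;

	for (;;) {
		if (poll(&fds, 1, -1) < 0) {
			perror("poll");
			return 1;
		}
		if (fds.revents & POLLERR) {
			fprintf(stderr, "event source went away\n");
			return 0;
		}
		if (fds.revents & POLLPRI)
			printf("pressure event triggered\n");
	}
}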
static int __init psi_proc_init(void)
diff --git a/kernel/signal.c b/kernel/signal.c
index 62f9aea4a15a..c4dd66436fc5 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -840,6 +840,7 @@ static int check_kill_permission(int sig, struct kernel_siginfo *info,
*/
if (!sid || sid == task_session(current))
break;
+ /* fall through */
default:
return -EPERM;
}
diff --git a/kernel/sys.c b/kernel/sys.c
index 12df0e5434b8..bdbfe8d37418 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1924,7 +1924,7 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map)
((unsigned long)prctl_map->__m1 __op \
(unsigned long)prctl_map->__m2) ? 0 : -EINVAL
error = __prctl_check_order(start_code, <, end_code);
- error |= __prctl_check_order(start_data, <, end_data);
+ error |= __prctl_check_order(start_data, <=, end_data);
error |= __prctl_check_order(start_brk, <=, brk);
error |= __prctl_check_order(arg_start, <=, arg_end);
error |= __prctl_check_order(env_start, <=, env_end);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 599510a3355e..943c89178e3d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -66,6 +66,7 @@
#include <linux/kexec.h>
#include <linux/bpf.h>
#include <linux/mount.h>
+#include <linux/userfaultfd_k.h>
#include "../lib/kstrtox.h"
@@ -1720,6 +1721,17 @@ static struct ctl_table vm_table[] = {
.extra2 = (void *)&mmap_rnd_compat_bits_max,
},
#endif
+#ifdef CONFIG_USERFAULTFD
+ {
+ .procname = "unprivileged_userfaultfd",
+ .data = &sysctl_unprivileged_userfaultfd,
+ .maxlen = sizeof(sysctl_unprivileged_userfaultfd),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
{ }
};
@@ -2874,8 +2886,10 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
if (neg)
continue;
val = convmul * val / convdiv;
- if ((min && val < *min) || (max && val > *max))
- continue;
+ if ((min && val < *min) || (max && val > *max)) {
+ err = -EINVAL;
+ break;
+ }
*i = val;
} else {
val = convdiv * (*i) / convmul;
@@ -3158,17 +3172,19 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
if (write) {
char *kbuf, *p;
+ size_t skipped = 0;
- if (left > PAGE_SIZE - 1)
+ if (left > PAGE_SIZE - 1) {
left = PAGE_SIZE - 1;
+ /* How much of the buffer we'll skip this pass */
+ skipped = *lenp - left;
+ }
p = kbuf = memdup_user_nul(buffer, left);
if (IS_ERR(kbuf))
return PTR_ERR(kbuf);
- tmp_bitmap = kcalloc(BITS_TO_LONGS(bitmap_len),
- sizeof(unsigned long),
- GFP_KERNEL);
+ tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL);
if (!tmp_bitmap) {
kfree(kbuf);
return -ENOMEM;
@@ -3177,9 +3193,22 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
while (!err && left) {
unsigned long val_a, val_b;
bool neg;
+ size_t saved_left;
+ /* In case we stop parsing mid-number, we can reset */
+ saved_left = left;
err = proc_get_long(&p, &left, &val_a, &neg, tr_a,
sizeof(tr_a), &c);
+ /*
+ * If we consumed the entirety of a truncated buffer or
+ * only one char is left (may be a "-"), then stop here,
+ * reset, & come back for more.
+ */
+ if ((left <= 1) && skipped) {
+ left = saved_left;
+ break;
+ }
+
if (err)
break;
if (val_a >= bitmap_len || neg) {
@@ -3197,6 +3226,15 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
err = proc_get_long(&p, &left, &val_b,
&neg, tr_b, sizeof(tr_b),
&c);
+ /*
+ * If we consumed all of a truncated buffer, then stop
+ * here, reset, & come back for more.
+ */
+ if (!left && skipped) {
+ left = saved_left;
+ break;
+ }
+
if (err)
break;
if (val_b >= bitmap_len || neg ||
@@ -3215,6 +3253,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
proc_skip_char(&p, &left, '\n');
}
kfree(kbuf);
+ left += skipped;
} else {
unsigned long bit_a, bit_b = 0;
@@ -3259,7 +3298,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
*ppos += *lenp;
}
- kfree(tmp_bitmap);
+ bitmap_free(tmp_bitmap);
return err;
}
diff --git a/kernel/user.c b/kernel/user.c
index 0df9b1640b2a..88b834f0eebc 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -185,7 +185,7 @@ struct user_struct *alloc_uid(kuid_t uid)
if (!up) {
new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL);
if (!new)
- goto out_unlock;
+ return NULL;
new->uid = uid;
refcount_set(&new->__count, 1);
@@ -199,8 +199,6 @@ struct user_struct *alloc_uid(kuid_t uid)
spin_lock_irq(&uidhash_lock);
up = uid_hash_find(uid, hashent);
if (up) {
- key_put(new->uid_keyring);
- key_put(new->session_keyring);
kmem_cache_free(uid_cachep, new);
} else {
uid_hash_insert(new, hashent);
@@ -210,9 +208,6 @@ struct user_struct *alloc_uid(kuid_t uid)
}
return up;
-
-out_unlock:
- return NULL;
}
static int __init uid_cache_init(void)
diff --git a/lib/Kconfig b/lib/Kconfig
index f871282e0612..8d9239a4156c 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -46,9 +46,6 @@ config HAVE_ARCH_BITREVERSE
This option enables the use of hardware bit-reversal instructions on
architectures which support such operations.
-config RATIONAL
- bool
-
config GENERIC_STRNCPY_FROM_USER
bool
@@ -61,6 +58,8 @@ config GENERIC_NET_UTILS
config GENERIC_FIND_FIRST_BIT
bool
+source "lib/math/Kconfig"
+
config NO_GENERIC_PCI_IOPORT_MAP
bool
@@ -531,12 +530,6 @@ config LRU_CACHE
config CLZ_TAB
bool
-config CORDIC
- tristate "CORDIC algorithm"
- help
- This option provides an implementation of the CORDIC algorithm;
- calculations are in fixed point. Module will be called cordic.
-
config DDR
bool "JEDEC DDR data"
help
@@ -632,9 +625,6 @@ config SBITMAP
config PARMAN
tristate "parman" if COMPILE_TEST
-config PRIME_NUMBERS
- tristate
-
config STRING_SELFTEST
tristate "Test string functions"
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index d695ec1477f3..2a75cfef2e20 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -163,6 +163,9 @@ config DYNAMIC_DEBUG
See Documentation/admin-guide/dynamic-debug-howto.rst for additional
information.
+config DYNAMIC_DEBUG_RELATIVE_POINTERS
+ bool
+
endmenu # "printk and dmesg options"
menu "Compile-time checks and compiler options"
@@ -318,6 +321,20 @@ config HEADERS_CHECK
exported to $(INSTALL_HDR_PATH) (usually 'usr/include' in
your build tree), to make sure they're suitable.
+config OPTIMIZE_INLINING
+ bool "Allow compiler to uninline functions marked 'inline'"
+ help
+ This option determines if the kernel forces gcc to inline the functions
+ developers have marked 'inline'. Doing so takes away freedom from gcc to
+ do what it thinks is best, which is desirable for the gcc 3.x series of
+ compilers. The gcc 4.x series have a rewritten inlining algorithm and
+ enabling this option will generate a smaller kernel there. Hopefully
+ this algorithm is so good that allowing gcc 4.x and above to make the
+ decision will become the default in the future. Until then this option
+ is there to test gcc for this.
+
+ If unsure, say N.
+
config DEBUG_SECTION_MISMATCH
bool "Enable full Section mismatch analysis"
help
@@ -446,6 +463,15 @@ config DEBUG_KERNEL
Say Y here if you are developing drivers or trying to debug and
identify kernel problems.
+config DEBUG_MISC
+ bool "Miscellaneous debug code"
+ default DEBUG_KERNEL
+ depends on DEBUG_KERNEL
+ help
+ Say Y here if you need to enable miscellaneous debug code that should
+ be under a more specific debug option but isn't.
+
+
menu "Memory Debugging"
source "mm/Kconfig.debug"
@@ -1358,7 +1384,7 @@ config DEBUG_LIST
If unsure, say N.
-config DEBUG_PI_LIST
+config DEBUG_PLIST
bool "Debug priority linked list manipulation"
depends on DEBUG_KERNEL
help
@@ -2088,6 +2114,12 @@ config IO_STRICT_DEVMEM
If in doubt, say Y.
+config DEBUG_AID_FOR_SYZBOT
+ bool "Additional debug code for syzbot"
+ default n
+ help
+ This option is intended for testing by syzbot.
+
source "arch/$(SRCARCH)/Kconfig.debug"
endmenu # Kernel hacking
diff --git a/lib/Makefile b/lib/Makefile
index 83d7df2661ff..fb7697031a79 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -30,7 +30,7 @@ endif
lib-y := ctype.o string.o vsprintf.o cmdline.o \
rbtree.o radix-tree.o timerqueue.o xarray.o \
- idr.o int_sqrt.o extable.o \
+ idr.o extable.o \
sha1.o chacha.o irq_regs.o argv_split.o \
flex_proportions.o ratelimit.o show_mem.o \
is_single_threaded.o plist.o decompress.o kobject_uevent.o \
@@ -44,11 +44,11 @@ lib-$(CONFIG_SMP) += cpumask.o
lib-y += kobject.o klist.o
obj-y += lockref.o
-obj-y += bcd.o div64.o sort.o parser.o debug_locks.o random32.o \
+obj-y += bcd.o sort.o parser.o debug_locks.o random32.o \
bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \
- gcd.o lcm.o list_sort.o uuid.o iov_iter.o clz_ctz.o \
+ list_sort.o uuid.o iov_iter.o clz_ctz.o \
bsearch.o find_bit.o llist.o memweight.o kfifo.o \
- percpu-refcount.o rhashtable.o reciprocal_div.o \
+ percpu-refcount.o rhashtable.o \
once.o refcount.o usercopy.o errseq.o bucket_locks.o \
generic-radix-tree.o
obj-$(CONFIG_STRING_SELFTEST) += test_string.o
@@ -102,6 +102,8 @@ endif
obj-$(CONFIG_DEBUG_INFO_REDUCED) += debug_info.o
CFLAGS_debug_info.o += $(call cc-option, -femit-struct-debug-detailed=any)
+obj-y += math/
+
obj-$(CONFIG_GENERIC_IOMAP) += iomap.o
obj-$(CONFIG_GENERIC_PCI_IOMAP) += pci_iomap.o
obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o
@@ -121,7 +123,6 @@ obj-$(CONFIG_DEBUG_OBJECTS) += debugobjects.o
obj-$(CONFIG_BITREVERSE) += bitrev.o
obj-$(CONFIG_PACKING) += packing.o
-obj-$(CONFIG_RATIONAL) += rational.o
obj-$(CONFIG_CRC_CCITT) += crc-ccitt.o
obj-$(CONFIG_CRC16) += crc16.o
obj-$(CONFIG_CRC_T10DIF)+= crc-t10dif.o
@@ -195,8 +196,6 @@ obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o
obj-$(CONFIG_CPU_RMAP) += cpu_rmap.o
-obj-$(CONFIG_CORDIC) += cordic.o
-
obj-$(CONFIG_DQL) += dynamic_queue_limits.o
obj-$(CONFIG_GLOB) += glob.o
@@ -238,8 +237,6 @@ obj-$(CONFIG_ASN1) += asn1_decoder.o
obj-$(CONFIG_FONT_SUPPORT) += fonts/
-obj-$(CONFIG_PRIME_NUMBERS) += prime_numbers.o
-
hostprogs-y := gen_crc32table
hostprogs-y += gen_crc64table
clean-files := crc32table.h
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 98872e9025da..f235434df87b 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -20,6 +20,8 @@
#include <asm/page.h>
+#include "kstrtox.h"
+
/**
* DOC: bitmap introduction
*
@@ -477,12 +479,128 @@ int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp,
}
EXPORT_SYMBOL(bitmap_print_to_pagebuf);
+/*
+ * Region 9-38:4/10 describes the following bitmap structure:
+ * 0        9  12    18                  38
+ * .........****......****......****......
+ *          ^  ^     ^                   ^
+ *      start  off   group_len          end
+ */
+struct region {
+ unsigned int start;
+ unsigned int off;
+ unsigned int group_len;
+ unsigned int end;
+};
+
+static int bitmap_set_region(const struct region *r,
+ unsigned long *bitmap, int nbits)
+{
+ unsigned int start;
+
+ if (r->end >= nbits)
+ return -ERANGE;
+
+ for (start = r->start; start <= r->end; start += r->group_len)
+ bitmap_set(bitmap, start, min(r->end - start + 1, r->off));
+
+ return 0;
+}
+
+static int bitmap_check_region(const struct region *r)
+{
+ if (r->start > r->end || r->group_len == 0 || r->off > r->group_len)
+ return -EINVAL;
+
+ return 0;
+}
+
+static const char *bitmap_getnum(const char *str, unsigned int *num)
+{
+ unsigned long long n;
+ unsigned int len;
+
+ len = _parse_integer(str, 10, &n);
+ if (!len)
+ return ERR_PTR(-EINVAL);
+ if (len & KSTRTOX_OVERFLOW || n != (unsigned int)n)
+ return ERR_PTR(-EOVERFLOW);
+
+ *num = n;
+ return str + len;
+}
+
+static inline bool end_of_str(char c)
+{
+ return c == '\0' || c == '\n';
+}
+
+static inline bool __end_of_region(char c)
+{
+ return isspace(c) || c == ',';
+}
+
+static inline bool end_of_region(char c)
+{
+ return __end_of_region(c) || end_of_str(c);
+}
+
+/*
+ * The format allows commas and whitespaces at the beginning
+ * of the region.
+ */
+static const char *bitmap_find_region(const char *str)
+{
+ while (__end_of_region(*str))
+ str++;
+
+ return end_of_str(*str) ? NULL : str;
+}
+
+static const char *bitmap_parse_region(const char *str, struct region *r)
+{
+ str = bitmap_getnum(str, &r->start);
+ if (IS_ERR(str))
+ return str;
+
+ if (end_of_region(*str))
+ goto no_end;
+
+ if (*str != '-')
+ return ERR_PTR(-EINVAL);
+
+ str = bitmap_getnum(str + 1, &r->end);
+ if (IS_ERR(str))
+ return str;
+
+ if (end_of_region(*str))
+ goto no_pattern;
+
+ if (*str != ':')
+ return ERR_PTR(-EINVAL);
+
+ str = bitmap_getnum(str + 1, &r->off);
+ if (IS_ERR(str))
+ return str;
+
+ if (*str != '/')
+ return ERR_PTR(-EINVAL);
+
+ return bitmap_getnum(str + 1, &r->group_len);
+
+no_end:
+ r->end = r->start;
+no_pattern:
+ r->off = r->end + 1;
+ r->group_len = r->end + 1;
+
+ return end_of_str(*str) ? NULL : str;
+}
+
/**
- * __bitmap_parselist - convert list format ASCII string to bitmap
- * @buf: read nul-terminated user string from this buffer
- * @buflen: buffer size in bytes. If string is smaller than this
- * then it must be terminated with a \0.
- * @is_user: location of buffer, 0 indicates kernel space
+ * bitmap_parselist - convert list format ASCII string to bitmap
+ * @buf: read user string from this buffer; must be terminated
+ * with a \0 or \n.
* @maskp: write resulting mask here
* @nmaskbits: number of bits in mask to be written
*
@@ -498,127 +616,38 @@ EXPORT_SYMBOL(bitmap_print_to_pagebuf);
*
* Returns: 0 on success, -errno on invalid input strings. Error values:
*
- * - ``-EINVAL``: second number in range smaller than first
+ * - ``-EINVAL``: wrong region format
* - ``-EINVAL``: invalid character in string
* - ``-ERANGE``: bit number specified too large for mask
+ * - ``-EOVERFLOW``: integer overflow in the input parameters
*/
-static int __bitmap_parselist(const char *buf, unsigned int buflen,
- int is_user, unsigned long *maskp,
- int nmaskbits)
+int bitmap_parselist(const char *buf, unsigned long *maskp, int nmaskbits)
{
- unsigned int a, b, old_a, old_b;
- unsigned int group_size, used_size, off;
- int c, old_c, totaldigits, ndigits;
- const char __user __force *ubuf = (const char __user __force *)buf;
- int at_start, in_range, in_partial_range;
+ struct region r;
+ long ret;
- totaldigits = c = 0;
- old_a = old_b = 0;
- group_size = used_size = 0;
bitmap_zero(maskp, nmaskbits);
- do {
- at_start = 1;
- in_range = 0;
- in_partial_range = 0;
- a = b = 0;
- ndigits = totaldigits;
-
- /* Get the next cpu# or a range of cpu#'s */
- while (buflen) {
- old_c = c;
- if (is_user) {
- if (__get_user(c, ubuf++))
- return -EFAULT;
- } else
- c = *buf++;
- buflen--;
- if (isspace(c))
- continue;
-
- /* A '\0' or a ',' signal the end of a cpu# or range */
- if (c == '\0' || c == ',')
- break;
- /*
- * whitespaces between digits are not allowed,
- * but it's ok if whitespaces are on head or tail.
- * when old_c is whilespace,
- * if totaldigits == ndigits, whitespace is on head.
- * if whitespace is on tail, it should not run here.
- * as c was ',' or '\0',
- * the last code line has broken the current loop.
- */
- if ((totaldigits != ndigits) && isspace(old_c))
- return -EINVAL;
- if (c == '/') {
- used_size = a;
- at_start = 1;
- in_range = 0;
- a = b = 0;
- continue;
- }
+ while (buf) {
+ buf = bitmap_find_region(buf);
+ if (buf == NULL)
+ return 0;
- if (c == ':') {
- old_a = a;
- old_b = b;
- at_start = 1;
- in_range = 0;
- in_partial_range = 1;
- a = b = 0;
- continue;
- }
+ buf = bitmap_parse_region(buf, &r);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
- if (c == '-') {
- if (at_start || in_range)
- return -EINVAL;
- b = 0;
- in_range = 1;
- at_start = 1;
- continue;
- }
+ ret = bitmap_check_region(&r);
+ if (ret)
+ return ret;
- if (!isdigit(c))
- return -EINVAL;
+ ret = bitmap_set_region(&r, maskp, nmaskbits);
+ if (ret)
+ return ret;
+ }
- b = b * 10 + (c - '0');
- if (!in_range)
- a = b;
- at_start = 0;
- totaldigits++;
- }
- if (ndigits == totaldigits)
- continue;
- if (in_partial_range) {
- group_size = a;
- a = old_a;
- b = old_b;
- old_a = old_b = 0;
- } else {
- used_size = group_size = b - a + 1;
- }
- /* if no digit is after '-', it's wrong*/
- if (at_start && in_range)
- return -EINVAL;
- if (!(a <= b) || group_size == 0 || !(used_size <= group_size))
- return -EINVAL;
- if (b >= nmaskbits)
- return -ERANGE;
- while (a <= b) {
- off = min(b - a + 1, used_size);
- bitmap_set(maskp, a, off);
- a += group_size;
- }
- } while (buflen && c == ',');
return 0;
}
-
-int bitmap_parselist(const char *bp, unsigned long *maskp, int nmaskbits)
-{
- char *nl = strchrnul(bp, '\n');
- int len = nl - bp;
-
- return __bitmap_parselist(bp, len, 0, maskp, nmaskbits);
-}
EXPORT_SYMBOL(bitmap_parselist);
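A minimal in-kernel sketch (assumed caller context) of the new region syntax,
reusing the 9-38:4/10 example from the struct region comment above:

#include <linux/bitmap.h>

static int parselist_example(void)
{
	DECLARE_BITMAP(mask, 40);
	int err;

	err = bitmap_parselist("9-38:4/10", mask, 40);
	if (err)
		return err;

	/* Bits 9-12, 19-22 and 29-32 are now set: "off" == 4 bits on,
	 * repeating every "group_len" == 10 bits from 9 up to 38. */
	return 0;
}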
@@ -632,23 +661,27 @@ EXPORT_SYMBOL(bitmap_parselist);
* @nmaskbits: size of bitmap, in bits.
*
* Wrapper for bitmap_parselist(), providing it with user buffer.
- *
- * We cannot have this as an inline function in bitmap.h because it needs
- * linux/uaccess.h to get the access_ok() declaration and this causes
- * cyclic dependencies.
*/
int bitmap_parselist_user(const char __user *ubuf,
unsigned int ulen, unsigned long *maskp,
int nmaskbits)
{
- if (!access_ok(ubuf, ulen))
- return -EFAULT;
- return __bitmap_parselist((const char __force *)ubuf,
- ulen, 1, maskp, nmaskbits);
+ char *buf;
+ int ret;
+
+ buf = memdup_user_nul(ubuf, ulen);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
+
+ ret = bitmap_parselist(buf, maskp, nmaskbits);
+
+ kfree(buf);
+ return ret;
}
EXPORT_SYMBOL(bitmap_parselist_user);
+#ifdef CONFIG_NUMA
/**
* bitmap_pos_to_ord - find ordinal of set bit at given position in bitmap
* @buf: pointer to a bitmap
@@ -757,7 +790,6 @@ void bitmap_remap(unsigned long *dst, const unsigned long *src,
set_bit(bitmap_ord_to_pos(new, n % w, nbits), dst);
}
}
-EXPORT_SYMBOL(bitmap_remap);
/**
* bitmap_bitremap - Apply map defined by a pair of bitmaps to a single bit
@@ -795,7 +827,6 @@ int bitmap_bitremap(int oldbit, const unsigned long *old,
else
return bitmap_ord_to_pos(new, n % w, bits);
}
-EXPORT_SYMBOL(bitmap_bitremap);
/**
* bitmap_onto - translate one bitmap relative to another
@@ -930,7 +961,6 @@ void bitmap_onto(unsigned long *dst, const unsigned long *orig,
m++;
}
}
-EXPORT_SYMBOL(bitmap_onto);
/**
* bitmap_fold - fold larger bitmap into smaller, modulo specified size
@@ -955,7 +985,7 @@ void bitmap_fold(unsigned long *dst, const unsigned long *orig,
for_each_set_bit(oldbit, orig, nbits)
set_bit(oldbit % sz, dst);
}
-EXPORT_SYMBOL(bitmap_fold);
+#endif /* CONFIG_NUMA */
/*
* Common code for bitmap_*_region() routines.
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index 55437fd5128b..1546a3c00709 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -375,6 +375,7 @@ __debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack)
struct debug_bucket *db;
struct debug_obj *obj;
unsigned long flags;
+ bool check_object_on_stack = false;
fill_pool();
@@ -391,7 +392,7 @@ __debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack)
debug_objects_oom();
return;
}
- debug_object_is_on_stack(addr, onstack);
+ check_object_on_stack = true;
}
switch (obj->state) {
@@ -402,20 +403,24 @@ __debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack)
break;
case ODEBUG_STATE_ACTIVE:
- debug_print_object(obj, "init");
state = obj->state;
raw_spin_unlock_irqrestore(&db->lock, flags);
+ debug_print_object(obj, "init");
debug_object_fixup(descr->fixup_init, addr, state);
return;
case ODEBUG_STATE_DESTROYED:
+ raw_spin_unlock_irqrestore(&db->lock, flags);
debug_print_object(obj, "init");
- break;
+ return;
default:
break;
}
raw_spin_unlock_irqrestore(&db->lock, flags);
+ if (check_object_on_stack)
+ debug_object_is_on_stack(addr, onstack);
+
}
/**
@@ -473,33 +478,34 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr)
obj = lookup_object(addr, db);
if (obj) {
+ ret = 0;
switch (obj->state) {
case ODEBUG_STATE_INIT:
case ODEBUG_STATE_INACTIVE:
obj->state = ODEBUG_STATE_ACTIVE;
- ret = 0;
break;
case ODEBUG_STATE_ACTIVE:
- debug_print_object(obj, "activate");
state = obj->state;
raw_spin_unlock_irqrestore(&db->lock, flags);
+ debug_print_object(obj, "activate");
ret = debug_object_fixup(descr->fixup_activate, addr, state);
return ret ? 0 : -EINVAL;
case ODEBUG_STATE_DESTROYED:
- debug_print_object(obj, "activate");
ret = -EINVAL;
break;
default:
- ret = 0;
break;
}
raw_spin_unlock_irqrestore(&db->lock, flags);
+ if (ret)
+ debug_print_object(obj, "activate");
return ret;
}
raw_spin_unlock_irqrestore(&db->lock, flags);
+
/*
* We are here when a static object is activated. We
* let the type specific code confirm whether this is
@@ -548,24 +554,30 @@ void debug_object_deactivate(void *addr, struct debug_obj_descr *descr)
if (!obj->astate)
obj->state = ODEBUG_STATE_INACTIVE;
else
- debug_print_object(obj, "deactivate");
+ goto out_unlock_print;
break;
case ODEBUG_STATE_DESTROYED:
- debug_print_object(obj, "deactivate");
- break;
+ goto out_unlock_print;
+
default:
break;
}
- } else {
+ }
+
+ raw_spin_unlock_irqrestore(&db->lock, flags);
+ if (!obj) {
struct debug_obj o = { .object = addr,
.state = ODEBUG_STATE_NOTAVAILABLE,
.descr = descr };
debug_print_object(&o, "deactivate");
}
+ return;
+out_unlock_print:
raw_spin_unlock_irqrestore(&db->lock, flags);
+ debug_print_object(obj, "deactivate");
}
EXPORT_SYMBOL_GPL(debug_object_deactivate);
@@ -599,15 +611,16 @@ void debug_object_destroy(void *addr, struct debug_obj_descr *descr)
obj->state = ODEBUG_STATE_DESTROYED;
break;
case ODEBUG_STATE_ACTIVE:
- debug_print_object(obj, "destroy");
state = obj->state;
raw_spin_unlock_irqrestore(&db->lock, flags);
+ debug_print_object(obj, "destroy");
debug_object_fixup(descr->fixup_destroy, addr, state);
return;
case ODEBUG_STATE_DESTROYED:
+ raw_spin_unlock_irqrestore(&db->lock, flags);
debug_print_object(obj, "destroy");
- break;
+ return;
default:
break;
}
@@ -641,9 +654,9 @@ void debug_object_free(void *addr, struct debug_obj_descr *descr)
switch (obj->state) {
case ODEBUG_STATE_ACTIVE:
- debug_print_object(obj, "free");
state = obj->state;
raw_spin_unlock_irqrestore(&db->lock, flags);
+ debug_print_object(obj, "free");
debug_object_fixup(descr->fixup_free, addr, state);
return;
default:
@@ -731,22 +744,27 @@ debug_object_active_state(void *addr, struct debug_obj_descr *descr,
if (obj->astate == expect)
obj->astate = next;
else
- debug_print_object(obj, "active_state");
+ goto out_unlock_print;
break;
default:
- debug_print_object(obj, "active_state");
- break;
+ goto out_unlock_print;
}
- } else {
+ }
+
+ raw_spin_unlock_irqrestore(&db->lock, flags);
+ if (!obj) {
struct debug_obj o = { .object = addr,
.state = ODEBUG_STATE_NOTAVAILABLE,
.descr = descr };
debug_print_object(&o, "active_state");
}
+ return;
+out_unlock_print:
raw_spin_unlock_irqrestore(&db->lock, flags);
+ debug_print_object(obj, "active_state");
}
EXPORT_SYMBOL_GPL(debug_object_active_state);
@@ -782,10 +800,10 @@ repeat:
switch (obj->state) {
case ODEBUG_STATE_ACTIVE:
- debug_print_object(obj, "free");
descr = obj->descr;
state = obj->state;
raw_spin_unlock_irqrestore(&db->lock, flags);
+ debug_print_object(obj, "free");
debug_object_fixup(descr->fixup_free,
(void *) oaddr, state);
goto repeat;
@@ -978,19 +996,24 @@ check_results(void *addr, enum debug_obj_state state, int fixups, int warnings)
struct debug_obj *obj;
unsigned long flags;
int res = -EINVAL;
+ enum debug_obj_state obj_state;
db = get_bucket((unsigned long) addr);
raw_spin_lock_irqsave(&db->lock, flags);
obj = lookup_object(addr, db);
+ obj_state = obj ? obj->state : state;
+
+ raw_spin_unlock_irqrestore(&db->lock, flags);
+
if (!obj && state != ODEBUG_STATE_NONE) {
WARN(1, KERN_ERR "ODEBUG: selftest object not found\n");
goto out;
}
- if (obj && obj->state != state) {
+ if (obj_state != state) {
WARN(1, KERN_ERR "ODEBUG: selftest wrong state: %d != %d\n",
- obj->state, state);
+ obj_state, state);
goto out;
}
if (fixups != debug_objects_fixups) {
@@ -1005,7 +1028,6 @@ check_results(void *addr, enum debug_obj_state state, int fixups, int warnings)
}
res = 0;
out:
- raw_spin_unlock_irqrestore(&db->lock, flags);
if (res)
debug_objects_enabled = 0;
return res;
diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c
index 8a16c2d498e9..f1646795692c 100644
--- a/lib/dynamic_debug.c
+++ b/lib/dynamic_debug.c
@@ -39,6 +39,55 @@
#include <rdma/ib_verbs.h>
+#ifdef CONFIG_DYNAMIC_DEBUG_RELATIVE_POINTERS
+static inline const char *dd_modname(const struct _ddebug *dd)
+{
+ return (const char *)dd + dd->modname_disp;
+}
+static inline const char *dd_function(const struct _ddebug *dd)
+{
+ return (const char *)dd + dd->function_disp;
+}
+static inline const char *dd_filename(const struct _ddebug *dd)
+{
+ return (const char *)dd + dd->filename_disp;
+}
+static inline const char *dd_format(const struct _ddebug *dd)
+{
+ return (const char *)dd + dd->format_disp;
+}
+#else
+static inline const char *dd_modname(const struct _ddebug *dd)
+{
+ return dd->modname;
+}
+static inline const char *dd_function(const struct _ddebug *dd)
+{
+ return dd->function;
+}
+static inline const char *dd_filename(const struct _ddebug *dd)
+{
+ return dd->filename;
+}
+static inline const char *dd_format(const struct _ddebug *dd)
+{
+ return dd->format;
+}
+#endif
+
+static inline unsigned dd_lineno(const struct _ddebug *dd)
+{
+ return dd->flags_lineno >> 8;
+}
+static inline unsigned dd_flags(const struct _ddebug *dd)
+{
+ return dd->flags_lineno & 0xff;
+}
+static inline void dd_set_flags(struct _ddebug *dd, unsigned newflags)
+{
+ dd->flags_lineno = (dd_lineno(dd) << 8) | newflags;
+}
+
extern struct _ddebug __start___verbose[];
extern struct _ddebug __stop___verbose[];
@@ -96,7 +145,7 @@ static char *ddebug_describe_flags(struct _ddebug *dp, char *buf,
BUG_ON(maxlen < 6);
for (i = 0; i < ARRAY_SIZE(opt_array); ++i)
- if (dp->flags & opt_array[i].flag)
+ if (dd_flags(dp) & opt_array[i].flag)
*p++ = opt_array[i].opt_char;
if (p == buf)
*p++ = '_';
@@ -142,7 +191,7 @@ static int ddebug_change(const struct ddebug_query *query,
{
int i;
struct ddebug_table *dt;
- unsigned int newflags;
+ unsigned int newflags, oldflags;
unsigned int nfound = 0;
char flagbuf[10];
@@ -160,47 +209,48 @@ static int ddebug_change(const struct ddebug_query *query,
/* match against the source filename */
if (query->filename &&
- !match_wildcard(query->filename, dp->filename) &&
+ !match_wildcard(query->filename, dd_filename(dp)) &&
!match_wildcard(query->filename,
- kbasename(dp->filename)) &&
+ kbasename(dd_filename(dp))) &&
!match_wildcard(query->filename,
- trim_prefix(dp->filename)))
+ trim_prefix(dd_filename(dp))))
continue;
/* match against the function */
if (query->function &&
- !match_wildcard(query->function, dp->function))
+ !match_wildcard(query->function, dd_function(dp)))
continue;
/* match against the format */
if (query->format &&
- !strstr(dp->format, query->format))
+ !strstr(dd_format(dp), query->format))
continue;
/* match against the line number range */
if (query->first_lineno &&
- dp->lineno < query->first_lineno)
+ dd_lineno(dp) < query->first_lineno)
continue;
if (query->last_lineno &&
- dp->lineno > query->last_lineno)
+ dd_lineno(dp) > query->last_lineno)
continue;
nfound++;
- newflags = (dp->flags & mask) | flags;
- if (newflags == dp->flags)
+ oldflags = dd_flags(dp);
+ newflags = (oldflags & mask) | flags;
+ if (newflags == oldflags)
continue;
#ifdef CONFIG_JUMP_LABEL
- if (dp->flags & _DPRINTK_FLAGS_PRINT) {
+ if (oldflags & _DPRINTK_FLAGS_PRINT) {
if (!(flags & _DPRINTK_FLAGS_PRINT))
static_branch_disable(&dp->key.dd_key_true);
} else if (flags & _DPRINTK_FLAGS_PRINT)
static_branch_enable(&dp->key.dd_key_true);
#endif
- dp->flags = newflags;
+ dd_set_flags(dp, newflags);
vpr_info("changed %s:%d [%s]%s =%s\n",
- trim_prefix(dp->filename), dp->lineno,
- dt->mod_name, dp->function,
+ trim_prefix(dd_filename(dp)), dd_lineno(dp),
+ dt->mod_name, dd_function(dp),
ddebug_describe_flags(dp, flagbuf,
sizeof(flagbuf)));
}
@@ -522,10 +572,11 @@ static char *dynamic_emit_prefix(const struct _ddebug *desc, char *buf)
{
int pos_after_tid;
int pos = 0;
+ unsigned flags = dd_flags(desc);
*buf = '\0';
- if (desc->flags & _DPRINTK_FLAGS_INCL_TID) {
+ if (flags & _DPRINTK_FLAGS_INCL_TID) {
if (in_interrupt())
pos += snprintf(buf + pos, remaining(pos), "<intr> ");
else
@@ -533,15 +584,15 @@ static char *dynamic_emit_prefix(const struct _ddebug *desc, char *buf)
task_pid_vnr(current));
}
pos_after_tid = pos;
- if (desc->flags & _DPRINTK_FLAGS_INCL_MODNAME)
+ if (flags & _DPRINTK_FLAGS_INCL_MODNAME)
pos += snprintf(buf + pos, remaining(pos), "%s:",
- desc->modname);
- if (desc->flags & _DPRINTK_FLAGS_INCL_FUNCNAME)
+ dd_modname(desc));
+ if (flags & _DPRINTK_FLAGS_INCL_FUNCNAME)
pos += snprintf(buf + pos, remaining(pos), "%s:",
- desc->function);
- if (desc->flags & _DPRINTK_FLAGS_INCL_LINENO)
+ dd_function(desc));
+ if (flags & _DPRINTK_FLAGS_INCL_LINENO)
pos += snprintf(buf + pos, remaining(pos), "%d:",
- desc->lineno);
+ dd_lineno(desc));
if (pos - pos_after_tid)
pos += snprintf(buf + pos, remaining(pos), " ");
if (pos >= PREFIX_SIZE)
@@ -827,10 +878,10 @@ static int ddebug_proc_show(struct seq_file *m, void *p)
}
seq_printf(m, "%s:%u [%s]%s =%s \"",
- trim_prefix(dp->filename), dp->lineno,
- iter->table->mod_name, dp->function,
+ trim_prefix(dd_filename(dp)), dd_lineno(dp),
+ iter->table->mod_name, dd_function(dp),
ddebug_describe_flags(dp, flagsbuf, sizeof(flagsbuf)));
- seq_escape(m, dp->format, "\t\r\n\"");
+ seq_escape(m, dd_format(dp), "\t\r\n\"");
seq_puts(m, "\"\n");
return 0;
@@ -1024,20 +1075,20 @@ static int __init dynamic_debug_init(void)
return 1;
}
iter = __start___verbose;
- modname = iter->modname;
+ modname = dd_modname(iter);
iter_start = iter;
for (; iter < __stop___verbose; iter++) {
entries++;
- verbose_bytes += strlen(iter->modname) + strlen(iter->function)
- + strlen(iter->filename) + strlen(iter->format);
+ verbose_bytes += strlen(dd_modname(iter)) + strlen(dd_function(iter))
+ + strlen(dd_filename(iter)) + strlen(dd_format(iter));
- if (strcmp(modname, iter->modname)) {
+ if (strcmp(modname, dd_modname(iter))) {
modct++;
ret = ddebug_add_module(iter_start, n, modname);
if (ret)
goto out_err;
n = 0;
- modname = iter->modname;
+ modname = dd_modname(iter);
iter_start = iter;
}
n++;
diff --git a/lib/genalloc.c b/lib/genalloc.c
index 7e85d1e37a6e..2e45b3a94313 100644
--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@@ -424,7 +424,7 @@ void gen_pool_for_each_chunk(struct gen_pool *pool,
EXPORT_SYMBOL(gen_pool_for_each_chunk);
/**
- * addr_in_gen_pool - checks if an address falls within the range of a pool
+ * gen_pool_has_addr - checks if an address falls within the range of a pool
* @pool: the generic memory pool
* @start: start address
* @size: size of the region
@@ -432,7 +432,7 @@ EXPORT_SYMBOL(gen_pool_for_each_chunk);
* Check if the range of addresses falls within the specified pool. Returns
* true if the entire range is contained in the pool and false otherwise.
*/
-bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start,
+bool gen_pool_has_addr(struct gen_pool *pool, unsigned long start,
size_t size)
{
bool found = false;
@@ -451,6 +451,7 @@ bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start,
rcu_read_unlock();
return found;
}
+EXPORT_SYMBOL(gen_pool_has_addr);
/**
* gen_pool_avail - get available free space of the pool
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index b396d328a764..f74fa832f3aa 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1293,7 +1293,9 @@ ssize_t iov_iter_get_pages(struct iov_iter *i,
len = maxpages * PAGE_SIZE;
addr &= ~(PAGE_SIZE - 1);
n = DIV_ROUND_UP(len, PAGE_SIZE);
- res = get_user_pages_fast(addr, n, iov_iter_rw(i) != WRITE, pages);
+ res = get_user_pages_fast(addr, n,
+ iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0,
+ pages);
if (unlikely(res < 0))
return res;
return (res == n ? len : res * PAGE_SIZE) - *start;
@@ -1374,7 +1376,8 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
p = get_pages_array(n);
if (!p)
return -ENOMEM;
- res = get_user_pages_fast(addr, n, iov_iter_rw(i) != WRITE, p);
+ res = get_user_pages_fast(addr, n,
+ iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, p);
if (unlikely(res < 0)) {
kvfree(p);
return res;
diff --git a/lib/list_sort.c b/lib/list_sort.c
index 85759928215b..06e900c5587b 100644
--- a/lib/list_sort.c
+++ b/lib/list_sort.c
@@ -7,33 +7,41 @@
#include <linux/list_sort.h>
#include <linux/list.h>
-#define MAX_LIST_LENGTH_BITS 20
+typedef int __attribute__((nonnull(2,3))) (*cmp_func)(void *,
+ struct list_head const *, struct list_head const *);
/*
* Returns a list organized in an intermediate format suited
* to chaining of merge() calls: null-terminated, no reserved or
* sentinel head node, "prev" links not maintained.
*/
-static struct list_head *merge(void *priv,
- int (*cmp)(void *priv, struct list_head *a,
- struct list_head *b),
+__attribute__((nonnull(2,3,4)))
+static struct list_head *merge(void *priv, cmp_func cmp,
struct list_head *a, struct list_head *b)
{
- struct list_head head, *tail = &head;
+ struct list_head *head, **tail = &head;
- while (a && b) {
+ for (;;) {
/* if equal, take 'a' -- important for sort stability */
- if ((*cmp)(priv, a, b) <= 0) {
- tail->next = a;
+ if (cmp(priv, a, b) <= 0) {
+ *tail = a;
+ tail = &a->next;
a = a->next;
+ if (!a) {
+ *tail = b;
+ break;
+ }
} else {
- tail->next = b;
+ *tail = b;
+ tail = &b->next;
b = b->next;
+ if (!b) {
+ *tail = a;
+ break;
+ }
}
- tail = tail->next;
}
- tail->next = a?:b;
- return head.next;
+ return head;
}
/*
@@ -43,44 +51,52 @@ static struct list_head *merge(void *priv,
* prev-link restoration pass, or maintaining the prev links
* throughout.
*/
-static void merge_and_restore_back_links(void *priv,
- int (*cmp)(void *priv, struct list_head *a,
- struct list_head *b),
- struct list_head *head,
- struct list_head *a, struct list_head *b)
+__attribute__((nonnull(2,3,4,5)))
+static void merge_final(void *priv, cmp_func cmp, struct list_head *head,
+ struct list_head *a, struct list_head *b)
{
struct list_head *tail = head;
u8 count = 0;
- while (a && b) {
+ for (;;) {
/* if equal, take 'a' -- important for sort stability */
- if ((*cmp)(priv, a, b) <= 0) {
+ if (cmp(priv, a, b) <= 0) {
tail->next = a;
a->prev = tail;
+ tail = a;
a = a->next;
+ if (!a)
+ break;
} else {
tail->next = b;
b->prev = tail;
+ tail = b;
b = b->next;
+ if (!b) {
+ b = a;
+ break;
+ }
}
- tail = tail->next;
}
- tail->next = a ? : b;
+ /* Finish linking remainder of list b on to tail */
+ tail->next = b;
do {
/*
- * In worst cases this loop may run many iterations.
+ * If the merge is highly unbalanced (e.g. the input is
+ * already sorted), this loop may run many iterations.
* Continue callbacks to the client even though no
* element comparison is needed, so the client's cmp()
* routine can invoke cond_resched() periodically.
*/
- if (unlikely(!(++count)))
- (*cmp)(priv, tail->next, tail->next);
-
- tail->next->prev = tail;
- tail = tail->next;
- } while (tail->next);
-
+ if (unlikely(!++count))
+ cmp(priv, b, b);
+ b->prev = tail;
+ tail = b;
+ b = b->next;
+ } while (b);
+
+ /* And the final links to make a circular doubly-linked list */
tail->next = head;
head->prev = tail;
}
@@ -91,55 +107,149 @@ static void merge_and_restore_back_links(void *priv,
* @head: the list to sort
* @cmp: the elements comparison function
*
- * This function implements "merge sort", which has O(nlog(n))
- * complexity.
+ * The comparison function @cmp must return > 0 if @a should sort after
+ * @b ("@a > @b" if you want an ascending sort), and <= 0 if @a should
+ * sort before @b *or* their original order should be preserved. It is
+ * always called with the element that came first in the input in @a,
+ * and list_sort is a stable sort, so it is not necessary to distinguish
+ * the @a < @b and @a == @b cases.
+ *
+ * This is compatible with two styles of @cmp function:
+ * - The traditional style which returns <0 / =0 / >0, or
+ * - Returning a boolean 0/1.
+ * The latter offers a chance to save a few cycles in the comparison
+ * (which is used by e.g. plug_ctx_cmp() in block/blk-mq.c).
+ *
+ * A good way to write a multi-word comparison is
+ * if (a->high != b->high)
+ * return a->high > b->high;
+ * if (a->middle != b->middle)
+ * return a->middle > b->middle;
+ * return a->low > b->low;
+ *
+ *
+ * This mergesort is as eager as possible while always performing at least
+ * 2:1 balanced merges. Given two pending sublists of size 2^k, they are
+ * merged to a size-2^(k+1) list as soon as we have 2^k following elements.
+ *
+ * Thus, it will avoid cache thrashing as long as 3*2^k elements can
+ * fit into the cache. Not quite as good as a fully-eager bottom-up
+ * mergesort, but it does use 0.2*n fewer comparisons, so is faster in
+ * the common case that everything fits into L1.
+ *
+ *
+ * The merging is controlled by "count", the number of elements in the
+ * pending lists. This is beautifully simple code, but rather subtle.
*
- * The comparison function @cmp must return a negative value if @a
- * should sort before @b, and a positive value if @a should sort after
- * @b. If @a and @b are equivalent, and their original relative
- * ordering is to be preserved, @cmp must return 0.
+ * Each time we increment "count", we set one bit (bit k) and clear
+ * bits k-1 .. 0. Each time this happens (except the very first time
+ * for each bit, when count increments to 2^k), we merge two lists of
+ * size 2^k into one list of size 2^(k+1).
+ *
+ * This merge happens exactly when the count reaches an odd multiple of
+ * 2^k, which is when we have 2^k elements pending in smaller lists,
+ * so it's safe to merge away two lists of size 2^k.
+ *
+ * After this happens twice, we have created two lists of size 2^(k+1),
+ * which will be merged into a list of size 2^(k+2) before we create
+ * a third list of size 2^(k+1), so there are never more than two pending.
+ *
+ * The number of pending lists of size 2^k is determined by the
+ * state of bit k of "count" plus two extra pieces of information:
+ * - The state of bit k-1 (when k == 0, consider bit -1 always set), and
+ * - Whether the higher-order bits are zero or non-zero (i.e.
+ * is count >= 2^(k+1)).
+ * There are six states we distinguish. "x" represents some arbitrary
+ * bits, and "y" represents some arbitrary non-zero bits:
+ * 0: 00x: 0 pending of size 2^k; x pending of sizes < 2^k
+ * 1: 01x: 0 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k
+ * 2: x10x: 0 pending of size 2^k; 2^k + x pending of sizes < 2^k
+ * 3: x11x: 1 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k
+ * 4: y00x: 1 pending of size 2^k; 2^k + x pending of sizes < 2^k
+ * 5: y01x: 2 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k
+ * (merge and loop back to state 2)
+ *
+ * We gain lists of size 2^k in the 2->3 and 4->5 transitions (because
+ * bit k-1 is set while the more significant bits are non-zero) and
+ * merge them away in the 5->2 transition. Note in particular that just
+ * before the 5->2 transition, all lower-order bits are 11 (state 3),
+ * so there is one list of each smaller size.
+ *
+ * When we reach the end of the input, we merge all the pending
+ * lists, from smallest to largest. If you work through cases 2 to
+ * 5 above, you can see that the number of elements we merge with a list
+ * of size 2^k varies from 2^(k-1) (cases 3 and 5 when x == 0) to
+ * 2^(k+1) - 1 (second merge of case 5 when x == 2^(k-1) - 1).
*/
+__attribute__((nonnull(2,3)))
void list_sort(void *priv, struct list_head *head,
int (*cmp)(void *priv, struct list_head *a,
struct list_head *b))
{
- struct list_head *part[MAX_LIST_LENGTH_BITS+1]; /* sorted partial lists
- -- last slot is a sentinel */
- int lev; /* index into part[] */
- int max_lev = 0;
- struct list_head *list;
+ struct list_head *list = head->next, *pending = NULL;
+ size_t count = 0; /* Count of pending */
- if (list_empty(head))
+ if (list == head->prev) /* Zero or one elements */
return;
- memset(part, 0, sizeof(part));
-
+ /* Convert to a null-terminated singly-linked list. */
head->prev->next = NULL;
- list = head->next;
-
- while (list) {
- struct list_head *cur = list;
- list = list->next;
- cur->next = NULL;
- for (lev = 0; part[lev]; lev++) {
- cur = merge(priv, cmp, part[lev], cur);
- part[lev] = NULL;
- }
- if (lev > max_lev) {
- if (unlikely(lev >= ARRAY_SIZE(part)-1)) {
- printk_once(KERN_DEBUG "list too long for efficiency\n");
- lev--;
- }
- max_lev = lev;
+ /*
+ * Data structure invariants:
+ * - All lists are singly linked and null-terminated; prev
+ * pointers are not maintained.
+ * - pending is a prev-linked "list of lists" of sorted
+ * sublists awaiting further merging.
+ * - Each of the sorted sublists is power-of-two in size.
+ * - Sublists are sorted by size and age, smallest & newest at front.
+ * - There are zero to two sublists of each size.
+ * - A pair of pending sublists are merged as soon as the number
+ * of following pending elements equals their size (i.e.
+ * each time count reaches an odd multiple of that size).
+ * That ensures each later final merge will be at worst 2:1.
+ * - Each round consists of:
+ * - Merging the two sublists selected by the highest bit
+ * which flips when count is incremented, and
+ * - Adding an element from the input as a size-1 sublist.
+ */
+ do {
+ size_t bits;
+ struct list_head **tail = &pending;
+
+ /* Find the least-significant clear bit in count */
+ for (bits = count; bits & 1; bits >>= 1)
+ tail = &(*tail)->prev;
+ /* Do the indicated merge */
+ if (likely(bits)) {
+ struct list_head *a = *tail, *b = a->prev;
+
+ a = merge(priv, (cmp_func)cmp, b, a);
+ /* Install the merged result in place of the inputs */
+ a->prev = b->prev;
+ *tail = a;
}
- part[lev] = cur;
- }
- for (lev = 0; lev < max_lev; lev++)
- if (part[lev])
- list = merge(priv, cmp, part[lev], list);
-
- merge_and_restore_back_links(priv, cmp, head, part[max_lev], list);
+ /* Move one element from input list to pending */
+ list->prev = pending;
+ pending = list;
+ list = list->next;
+ pending->next = NULL;
+ count++;
+ } while (list);
+
+ /* End of input; merge together all the pending lists. */
+ list = pending;
+ pending = pending->prev;
+ for (;;) {
+ struct list_head *next = pending->prev;
+
+ if (!next)
+ break;
+ list = merge(priv, (cmp_func)cmp, pending, list);
+ pending = next;
+ }
+ /* The final merge, rebuilding prev links */
+ merge_final(priv, (cmp_func)cmp, head, pending, list);
}
EXPORT_SYMBOL(list_sort);
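
As a worked illustration of the counting rule above (a hedged userspace sketch, not part of the patch): the position of the least-significant clear bit of count equals the number of trailing set bits, and that is exactly the size class 2^k of the two pending runs that get merged; no merge fires when count is of the form 2^m - 1.

	#include <stdio.h>

	int main(void)
	{
		for (unsigned int count = 1; count <= 16; count++) {
			unsigned int bits = count, k = 0;

			while (bits & 1) {	/* walk past trailing 1 bits */
				bits >>= 1;
				k++;
			}
			if (bits)
				printf("count=%2u: merge two runs of %u element(s)\n",
				       count, 1u << k);
			else
				printf("count=%2u: no merge (count == 2^%u - 1)\n",
				       count, k);
		}
		return 0;
	}
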
diff --git a/lib/math/Kconfig b/lib/math/Kconfig
new file mode 100644
index 000000000000..73bdf37178d1
--- /dev/null
+++ b/lib/math/Kconfig
@@ -0,0 +1,11 @@
+config CORDIC
+ tristate "CORDIC algorithm"
+ help
+ This option provides an implementation of the CORDIC algorithm;
+ calculations are in fixed point. Module will be called cordic.
+
+config PRIME_NUMBERS
+ tristate
+
+config RATIONAL
+ bool
diff --git a/lib/math/Makefile b/lib/math/Makefile
new file mode 100644
index 000000000000..583bbfebfc09
--- /dev/null
+++ b/lib/math/Makefile
@@ -0,0 +1,5 @@
+obj-y += div64.o gcd.o lcm.o int_pow.o int_sqrt.o reciprocal_div.o
+
+obj-$(CONFIG_CORDIC) += cordic.o
+obj-$(CONFIG_PRIME_NUMBERS) += prime_numbers.o
+obj-$(CONFIG_RATIONAL) += rational.o
diff --git a/lib/cordic.c b/lib/math/cordic.c
index 8ef27c12956f..8ef27c12956f 100644
--- a/lib/cordic.c
+++ b/lib/math/cordic.c
diff --git a/lib/div64.c b/lib/math/div64.c
index ee146bb4c558..368ca7fd0d82 100644
--- a/lib/div64.c
+++ b/lib/math/div64.c
@@ -10,7 +10,7 @@
* Generic C version of 64bit/32bit division and modulo, with
* 64bit result and 32bit remainder.
*
- * The fast case for (n>>32 == 0) is handled inline by do_div().
+ * The fast case for (n>>32 == 0) is handled inline by do_div().
*
* Code generated for this function might be very inefficient
* for some CPUs. __div64_32() can be overridden by linking arch-specific
diff --git a/lib/gcd.c b/lib/math/gcd.c
index 7948ab27f0a4..7948ab27f0a4 100644
--- a/lib/gcd.c
+++ b/lib/math/gcd.c
diff --git a/lib/math/int_pow.c b/lib/math/int_pow.c
new file mode 100644
index 000000000000..622fc1ab3c74
--- /dev/null
+++ b/lib/math/int_pow.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * An integer based power function
+ *
+ * Derived from drivers/video/backlight/pwm_bl.c
+ */
+
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+/**
+ * int_pow - computes the exponentiation of the given base and exponent
+ * @base: base which will be raised to the given power
+ * @exp: power to be raised to
+ *
+ * Computes: pow(base, exp), i.e. @base raised to the @exp power
+ */
+u64 int_pow(u64 base, unsigned int exp)
+{
+ u64 result = 1;
+
+ while (exp) {
+ if (exp & 1)
+ result *= base;
+ exp >>= 1;
+ base *= base;
+ }
+
+ return result;
+}
+EXPORT_SYMBOL_GPL(int_pow);
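
As a hedged aside, the same square-and-multiply loop can be checked in plain userspace C; the set bits of @exp pick which squared powers are folded into the result, e.g. 3^13 = 3^8 * 3^4 * 3^1 = 1594323.

	#include <assert.h>
	#include <stdint.h>

	static uint64_t int_pow_demo(uint64_t base, unsigned int exp)
	{
		uint64_t result = 1;

		while (exp) {
			if (exp & 1)		/* bit set: fold in this power */
				result *= base;
			exp >>= 1;
			base *= base;		/* square for the next bit */
		}
		return result;
	}

	int main(void)
	{
		assert(int_pow_demo(3, 13) == 1594323);
		assert(int_pow_demo(2, 32) == 4294967296ULL);
		return 0;
	}
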
diff --git a/lib/int_sqrt.c b/lib/math/int_sqrt.c
index 30e0f9770f88..30e0f9770f88 100644
--- a/lib/int_sqrt.c
+++ b/lib/math/int_sqrt.c
diff --git a/lib/lcm.c b/lib/math/lcm.c
index 03d7fcb420b5..03d7fcb420b5 100644
--- a/lib/lcm.c
+++ b/lib/math/lcm.c
diff --git a/lib/prime_numbers.c b/lib/math/prime_numbers.c
index 550eec457c2e..550eec457c2e 100644
--- a/lib/prime_numbers.c
+++ b/lib/math/prime_numbers.c
diff --git a/lib/math/rational.c b/lib/math/rational.c
new file mode 100644
index 000000000000..31fb27db2deb
--- /dev/null
+++ b/lib/math/rational.c
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * rational fractions
+ *
+ * Copyright (C) 2009 emlix GmbH, Oskar Schirmer <oskar@scara.com>
+ * Copyright (C) 2019 Trent Piepho <tpiepho@gmail.com>
+ *
+ * helper functions when coping with rational numbers
+ */
+
+#include <linux/rational.h>
+#include <linux/compiler.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+
+/*
+ * calculate best rational approximation for a given fraction
+ * taking into account restricted register size, e.g. to find
+ * appropriate values for a pll with 5 bit denominator and
+ * 8 bit numerator register fields, trying to set up with a
+ * frequency ratio of 3.1415, one would say:
+ *
+ * rational_best_approximation(31415, 10000,
+ * (1 << 8) - 1, (1 << 5) - 1, &n, &d);
+ *
+ * you may look at given_numerator as a fixed point number,
+ * with the fractional part size described in given_denominator.
+ *
+ * for theoretical background, see:
+ * http://en.wikipedia.org/wiki/Continued_fraction
+ */
+
+void rational_best_approximation(
+ unsigned long given_numerator, unsigned long given_denominator,
+ unsigned long max_numerator, unsigned long max_denominator,
+ unsigned long *best_numerator, unsigned long *best_denominator)
+{
+ /* n/d is the starting rational, which is continually
+ * decreased each iteration using the Euclidean algorithm.
+ *
+ * dp is the value of d from the prior iteration.
+ *
+ * n2/d2, n1/d1, and n0/d0 are our successively more accurate
+ * approximations of the rational. They are, respectively,
+ * the current, previous, and two prior iterations of it.
+ *
+ * a is current term of the continued fraction.
+ */
+ unsigned long n, d, n0, d0, n1, d1, n2, d2;
+ n = given_numerator;
+ d = given_denominator;
+ n0 = d1 = 0;
+ n1 = d0 = 1;
+
+ for (;;) {
+ unsigned long dp, a;
+
+ if (d == 0)
+ break;
+ /* Find next term in continued fraction, 'a', via
+ * Euclidean algorithm.
+ */
+ dp = d;
+ a = n / d;
+ d = n % d;
+ n = dp;
+
+ /* Calculate the current rational approximation (aka
+ * convergent), n2/d2, using the term just found and
+ * the two prior approximations.
+ */
+ n2 = n0 + a * n1;
+ d2 = d0 + a * d1;
+
+ /* If the current convergent exceeds the maxes, then
+ * return either the previous convergent or the
+ * largest semi-convergent, the final term of which is
+ * found below as 't'.
+ */
+ if ((n2 > max_numerator) || (d2 > max_denominator)) {
+ unsigned long t = min((max_numerator - n0) / n1,
+ (max_denominator - d0) / d1);
+
+ /* This tests if the semi-convergent is closer
+ * than the previous convergent.
+ */
+ if (2u * t > a || (2u * t == a && d0 * dp > d1 * d)) {
+ n1 = n0 + t * n1;
+ d1 = d0 + t * d1;
+ }
+ break;
+ }
+ n0 = n1;
+ n1 = n2;
+ d0 = d1;
+ d1 = d2;
+ }
+ *best_numerator = n1;
+ *best_denominator = d1;
+}
+
+EXPORT_SYMBOL(rational_best_approximation);
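
To make the loop concrete, here is a hedged userspace sketch (only the Euclidean/convergent part, not the kernel function) that prints the continued-fraction terms and convergents of the 31415/10000 example from the comment: 3/1, 22/7, 311/99, 333/106, 2975/947 and 6283/2000. With the 8-bit numerator and 5-bit denominator limits, 311/99 already overflows, and since t = min((255 - 3)/22, (31 - 1)/7) = 4 gives 2*t = 8 < a = 14, the semi-convergent test keeps 22/7 as the returned approximation.

	#include <stdio.h>

	int main(void)
	{
		unsigned long n = 31415, d = 10000;
		unsigned long n0 = 0, d0 = 1, n1 = 1, d1 = 0;

		while (d) {
			unsigned long a = n / d, r = n % d;
			unsigned long n2 = n0 + a * n1, d2 = d0 + a * d1;

			printf("a=%2lu  convergent %lu/%lu\n", a, n2, d2);
			n = d;   d = r;
			n0 = n1; n1 = n2;
			d0 = d1; d1 = d2;
		}
		return 0;
	}
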
diff --git a/lib/reciprocal_div.c b/lib/math/reciprocal_div.c
index bf043258fa00..bf043258fa00 100644
--- a/lib/reciprocal_div.c
+++ b/lib/math/reciprocal_div.c
diff --git a/lib/plist.c b/lib/plist.c
index 199408f91057..d3bd8827186f 100644
--- a/lib/plist.c
+++ b/lib/plist.c
@@ -26,7 +26,7 @@
#include <linux/bug.h>
#include <linux/plist.h>
-#ifdef CONFIG_DEBUG_PI_LIST
+#ifdef CONFIG_DEBUG_PLIST
static struct plist_head test_head;
@@ -173,7 +173,7 @@ void plist_requeue(struct plist_node *node, struct plist_head *head)
plist_check_head(head);
}
-#ifdef CONFIG_DEBUG_PI_LIST
+#ifdef CONFIG_DEBUG_PLIST
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/module.h>
diff --git a/lib/rational.c b/lib/rational.c
deleted file mode 100644
index ba7443677c90..000000000000
--- a/lib/rational.c
+++ /dev/null
@@ -1,65 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * rational fractions
- *
- * Copyright (C) 2009 emlix GmbH, Oskar Schirmer <oskar@scara.com>
- *
- * helper functions when coping with rational numbers
- */
-
-#include <linux/rational.h>
-#include <linux/compiler.h>
-#include <linux/export.h>
-
-/*
- * calculate best rational approximation for a given fraction
- * taking into account restricted register size, e.g. to find
- * appropriate values for a pll with 5 bit denominator and
- * 8 bit numerator register fields, trying to set up with a
- * frequency ratio of 3.1415, one would say:
- *
- * rational_best_approximation(31415, 10000,
- * (1 << 8) - 1, (1 << 5) - 1, &n, &d);
- *
- * you may look at given_numerator as a fixed point number,
- * with the fractional part size described in given_denominator.
- *
- * for theoretical background, see:
- * http://en.wikipedia.org/wiki/Continued_fraction
- */
-
-void rational_best_approximation(
- unsigned long given_numerator, unsigned long given_denominator,
- unsigned long max_numerator, unsigned long max_denominator,
- unsigned long *best_numerator, unsigned long *best_denominator)
-{
- unsigned long n, d, n0, d0, n1, d1;
- n = given_numerator;
- d = given_denominator;
- n0 = d1 = 0;
- n1 = d0 = 1;
- for (;;) {
- unsigned long t, a;
- if ((n1 > max_numerator) || (d1 > max_denominator)) {
- n1 = n0;
- d1 = d0;
- break;
- }
- if (d == 0)
- break;
- t = d;
- a = n / d;
- d = n % d;
- n = t;
- t = n0 + a * n1;
- n0 = n1;
- n1 = t;
- t = d0 + a * d1;
- d0 = d1;
- d1 = t;
- }
- *best_numerator = n1;
- *best_denominator = d1;
-}
-
-EXPORT_SYMBOL(rational_best_approximation);
diff --git a/lib/sort.c b/lib/sort.c
index d6b7a202b0b6..50855ea8c262 100644
--- a/lib/sort.c
+++ b/lib/sort.c
@@ -1,8 +1,13 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * A fast, small, non-recursive O(nlog n) sort for the Linux kernel
+ * A fast, small, non-recursive O(n log n) sort for the Linux kernel
*
- * Jan 23 2005 Matt Mackall <mpm@selenic.com>
+ * This performs n*log2(n) + 0.37*n + o(n) comparisons on average,
+ * and 1.5*n*log2(n) + O(n) in the (very contrived) worst case.
+ *
+ * Glibc qsort() manages n*log2(n) - 1.26*n for random inputs (1.63*n
+ * better) at the expense of stack usage and much larger code to avoid
+ * quicksort's O(n^2) worst case.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -11,35 +16,155 @@
#include <linux/export.h>
#include <linux/sort.h>
-static int alignment_ok(const void *base, int align)
+/**
+ * is_aligned - is this pointer & size okay for word-wide copying?
+ * @base: pointer to data
+ * @size: size of each element
+ * @align: required alignment (typically 4 or 8)
+ *
+ * Returns true if elements can be copied using word loads and stores.
+ * The size must be a multiple of the alignment, and the base address must
+ * be aligned as well unless CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is set.
+ *
+ * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
+ * to "if ((a | b) & mask)", so we do that by hand.
+ */
+__attribute_const__ __always_inline
+static bool is_aligned(const void *base, size_t size, unsigned char align)
{
- return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
- ((unsigned long)base & (align - 1)) == 0;
+ unsigned char lsbits = (unsigned char)size;
+
+ (void)base;
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ lsbits |= (unsigned char)(uintptr_t)base;
+#endif
+ return (lsbits & (align - 1)) == 0;
}
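
A hedged userspace sketch of the same test, with the strict-alignment branch compiled in unconditionally (an assumption): OR-ing the size and base address together lets a single mask comparison decide whether 8-byte, 4-byte or byte-at-a-time copies are safe.

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	static bool is_aligned_demo(const void *base, size_t size,
				    unsigned char align)
	{
		unsigned char lsbits = (unsigned char)size;

		lsbits |= (unsigned char)(uintptr_t)base;
		return (lsbits & (align - 1)) == 0;
	}

	int main(void)
	{
		long long v[4];		/* 8-byte aligned storage */

		printf("%d\n", is_aligned_demo(v, 8, 8));		/* 1 */
		printf("%d\n", is_aligned_demo((char *)v + 4, 8, 8));	/* 0 */
		printf("%d\n", is_aligned_demo((char *)v + 4, 12, 4));	/* 1 */
		return 0;
	}
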
-static void u32_swap(void *a, void *b, int size)
+/**
+ * swap_words_32 - swap two elements in 32-bit chunks
+ * @a, @b: pointers to the elements
+ * @size: element size (must be a multiple of 4)
+ *
+ * Exchange the two objects in memory. This exploits base+index addressing,
+ * which basically all CPUs have, to minimize loop overhead computations.
+ *
+ * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
+ * bottom of the loop, even though the zero flag is stil valid from the
+ * subtract (since the intervening mov instructions don't alter the flags).
+ * Gcc 8.1.0 doesn't have that problem.
+ */
+static void swap_words_32(void *a, void *b, size_t n)
{
- u32 t = *(u32 *)a;
- *(u32 *)a = *(u32 *)b;
- *(u32 *)b = t;
+ do {
+ u32 t = *(u32 *)(a + (n -= 4));
+ *(u32 *)(a + n) = *(u32 *)(b + n);
+ *(u32 *)(b + n) = t;
+ } while (n);
}
-static void u64_swap(void *a, void *b, int size)
+/**
+ * swap_words_64 - swap two elements in 64-bit chunks
+ * @a, @b: pointers to the elements
+ * @size: element size (must be a multiple of 8)
+ *
+ * Exchange the two objects in memory. This exploits base+index
+ * addressing, which basically all CPUs have, to minimize loop overhead
+ * computations.
+ *
+ * We'd like to use 64-bit loads if possible. If they're not available, emulating
+ * one requires base+index+4 addressing which x86 has but most other
+ * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads,
+ * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
+ * x32 ABI). Are there any cases the kernel needs to worry about?
+ */
+static void swap_words_64(void *a, void *b, size_t n)
{
- u64 t = *(u64 *)a;
- *(u64 *)a = *(u64 *)b;
- *(u64 *)b = t;
+ do {
+#ifdef CONFIG_64BIT
+ u64 t = *(u64 *)(a + (n -= 8));
+ *(u64 *)(a + n) = *(u64 *)(b + n);
+ *(u64 *)(b + n) = t;
+#else
+ /* Use two 32-bit transfers to avoid base+index+4 addressing */
+ u32 t = *(u32 *)(a + (n -= 4));
+ *(u32 *)(a + n) = *(u32 *)(b + n);
+ *(u32 *)(b + n) = t;
+
+ t = *(u32 *)(a + (n -= 4));
+ *(u32 *)(a + n) = *(u32 *)(b + n);
+ *(u32 *)(b + n) = t;
+#endif
+ } while (n);
}
-static void generic_swap(void *a, void *b, int size)
+/**
+ * swap_bytes - swap two elements a byte at a time
+ * @a, @b: pointers to the elements
+ * @size: element size
+ *
+ * This is the fallback if alignment doesn't allow using larger chunks.
+ */
+static void swap_bytes(void *a, void *b, size_t n)
{
- char t;
-
do {
- t = *(char *)a;
- *(char *)a++ = *(char *)b;
- *(char *)b++ = t;
- } while (--size > 0);
+ char t = ((char *)a)[--n];
+ ((char *)a)[n] = ((char *)b)[n];
+ ((char *)b)[n] = t;
+ } while (n);
+}
+
+typedef void (*swap_func_t)(void *a, void *b, int size);
+
+/*
+ * The values are arbitrary as long as they can't be confused with
+ * a pointer, but small integers make for the smallest compare
+ * instructions.
+ */
+#define SWAP_WORDS_64 (swap_func_t)0
+#define SWAP_WORDS_32 (swap_func_t)1
+#define SWAP_BYTES (swap_func_t)2
+
+/*
+ * The function pointer is last to make tail calls most efficient if the
+ * compiler decides not to inline this function.
+ */
+static void do_swap(void *a, void *b, size_t size, swap_func_t swap_func)
+{
+ if (swap_func == SWAP_WORDS_64)
+ swap_words_64(a, b, size);
+ else if (swap_func == SWAP_WORDS_32)
+ swap_words_32(a, b, size);
+ else if (swap_func == SWAP_BYTES)
+ swap_bytes(a, b, size);
+ else
+ swap_func(a, b, (int)size);
+}
+
+/**
+ * parent - given the offset of the child, find the offset of the parent.
+ * @i: the offset of the heap element whose parent is sought. Non-zero.
+ * @lsbit: a precomputed 1-bit mask, equal to "size & -size"
+ * @size: size of each element
+ *
+ * In terms of array indexes, the parent of element j = @i/@size is simply
+ * (j-1)/2. But when working in byte offsets, we can't use implicit
+ * truncation of integer divides.
+ *
+ * Fortunately, we only need one bit of the quotient, not the full divide.
+ * @size has a least significant bit. That bit will be clear if @i is
+ * an even multiple of @size, and set if it's an odd multiple.
+ *
+ * Logically, we're doing "if (i & lsbit) i -= size;", but since the
+ * branch is unpredictable, it's done with a bit of clever branch-free
+ * code instead.
+ */
+__attribute_const__ __always_inline
+static size_t parent(size_t i, unsigned int lsbit, size_t size)
+{
+ i -= size;
+ i -= size & -(i & lsbit);
+ return i / 2;
}
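
A hedged userspace check of the branch-free arithmetic above, assuming an element size of 12 bytes (so lsbit == 4): the computed byte offset always agrees with the plain ((j - 1)/2) * size index formula.

	#include <assert.h>
	#include <stddef.h>

	static size_t parent_demo(size_t i, unsigned int lsbit, size_t size)
	{
		i -= size;
		i -= size & -(i & lsbit);
		return i / 2;
	}

	int main(void)
	{
		const size_t size = 12;
		const unsigned int lsbit = size & -size;	/* == 4 */

		for (size_t j = 1; j < 1000; j++)	/* j is the array index */
			assert(parent_demo(j * size, lsbit, size) ==
			       ((j - 1) / 2) * size);
		return 0;
	}
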
/**
@@ -50,57 +175,78 @@ static void generic_swap(void *a, void *b, int size)
* @cmp_func: pointer to comparison function
* @swap_func: pointer to swap function or NULL
*
- * This function does a heapsort on the given array. You may provide a
- * swap_func function optimized to your element type.
+ * This function does a heapsort on the given array. You may provide
+ * a swap_func function if you need to do something more than a memory
+ * copy (e.g. fix up pointers or auxiliary data), but the built-in swap
+ * avoids a slow retpoline and so is significantly faster.
*
* Sorting time is O(n log n) both on average and worst-case. While
- * qsort is about 20% faster on average, it suffers from exploitable
+ * quicksort is slightly faster on average, it suffers from exploitable
* O(n*n) worst-case behavior and extra memory requirements that make
* it less suitable for kernel use.
*/
-
void sort(void *base, size_t num, size_t size,
int (*cmp_func)(const void *, const void *),
void (*swap_func)(void *, void *, int size))
{
/* pre-scale counters for performance */
- int i = (num/2 - 1) * size, n = num * size, c, r;
+ size_t n = num * size, a = (num/2) * size;
+ const unsigned int lsbit = size & -size; /* Used to find parent */
+
+ if (!a) /* num < 2 || size == 0 */
+ return;
if (!swap_func) {
- if (size == 4 && alignment_ok(base, 4))
- swap_func = u32_swap;
- else if (size == 8 && alignment_ok(base, 8))
- swap_func = u64_swap;
+ if (is_aligned(base, size, 8))
+ swap_func = SWAP_WORDS_64;
+ else if (is_aligned(base, size, 4))
+ swap_func = SWAP_WORDS_32;
else
- swap_func = generic_swap;
+ swap_func = SWAP_BYTES;
}
- /* heapify */
- for ( ; i >= 0; i -= size) {
- for (r = i; r * 2 + size < n; r = c) {
- c = r * 2 + size;
- if (c < n - size &&
- cmp_func(base + c, base + c + size) < 0)
- c += size;
- if (cmp_func(base + r, base + c) >= 0)
- break;
- swap_func(base + r, base + c, size);
- }
- }
+ /*
+ * Loop invariants:
+ * 1. elements [a,n) satisfy the heap property (compare greater than
+ * all of their children),
+ * 2. elements [n,num*size) are sorted, and
+ * 3. a <= b <= c <= d <= n (whenever they are valid).
+ */
+ for (;;) {
+ size_t b, c, d;
+
+ if (a) /* Building heap: sift down --a */
+ a -= size;
+ else if (n -= size) /* Sorting: Extract root to --n */
+ do_swap(base, base + n, size, swap_func);
+ else /* Sort complete */
+ break;
- /* sort */
- for (i = n - size; i > 0; i -= size) {
- swap_func(base, base + i, size);
- for (r = 0; r * 2 + size < i; r = c) {
- c = r * 2 + size;
- if (c < i - size &&
- cmp_func(base + c, base + c + size) < 0)
- c += size;
- if (cmp_func(base + r, base + c) >= 0)
- break;
- swap_func(base + r, base + c, size);
+ /*
+ * Sift element at "a" down into heap. This is the
+ * "bottom-up" variant, which significantly reduces
+ * calls to cmp_func(): we find the sift-down path all
+ * the way to the leaves (one compare per level), then
+ * backtrack to find where to insert the target element.
+ *
+ * Because elements tend to sift down close to the leaves,
+ * this uses fewer compares than doing two per level
+ * on the way down. (A bit more than half as many on
+ * average, 3/4 worst-case.)
+ */
+ for (b = a; c = 2*b + size, (d = c + size) < n;)
+ b = cmp_func(base + c, base + d) >= 0 ? c : d;
+ if (d == n) /* Special case last leaf with no sibling */
+ b = c;
+
+ /* Now backtrack from "b" to the correct location for "a" */
+ while (b != a && cmp_func(base + a, base + b) >= 0)
+ b = parent(b, lsbit, size);
+ c = b; /* Where "a" belongs */
+ while (b != a) { /* Shift it into place */
+ b = parent(b, lsbit, size);
+ do_swap(base + b, base + c, size, swap_func);
}
}
}
-
EXPORT_SYMBOL(sort);
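
A minimal usage sketch (hedged; sort_demo and cmp_int are illustrative names): passing a NULL swap_func lets sort() pick one of the built-in word-based swaps through the sentinel values above instead of making an indirect call per swap.

	#include <linux/kernel.h>
	#include <linux/sort.h>

	static int cmp_int(const void *a, const void *b)
	{
		int x = *(const int *)a, y = *(const int *)b;

		return (x > y) - (x < y);	/* avoids overflow of x - y */
	}

	static void sort_demo(void)
	{
		int v[] = { 42, 7, 19, 3, 3, 88 };

		sort(v, ARRAY_SIZE(v), sizeof(v[0]), cmp_int, NULL);
		/* v is now { 3, 3, 7, 19, 42, 88 } */
	}
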
diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c
index 792d90608052..d3a501f2a81a 100644
--- a/lib/test_bitmap.c
+++ b/lib/test_bitmap.c
@@ -11,6 +11,7 @@
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/string.h>
+#include <linux/uaccess.h>
#include "../tools/testing/selftests/kselftest_module.h"
@@ -226,7 +227,8 @@ static const unsigned long exp[] __initconst = {
BITMAP_FROM_U64(0xffffffff),
BITMAP_FROM_U64(0xfffffffe),
BITMAP_FROM_U64(0x3333333311111111ULL),
- BITMAP_FROM_U64(0xffffffff77777777ULL)
+ BITMAP_FROM_U64(0xffffffff77777777ULL),
+ BITMAP_FROM_U64(0),
};
static const unsigned long exp2[] __initconst = {
@@ -249,55 +251,93 @@ static const struct test_bitmap_parselist parselist_tests[] __initconst = {
{0, "1-31:4/4", &exp[9 * step], 32, 0},
{0, "0-31:1/4,32-63:2/4", &exp[10 * step], 64, 0},
{0, "0-31:3/4,32-63:4/4", &exp[11 * step], 64, 0},
+ {0, " ,, 0-31:3/4 ,, 32-63:4/4 ,, ", &exp[11 * step], 64, 0},
{0, "0-31:1/4,32-63:2/4,64-95:3/4,96-127:4/4", exp2, 128, 0},
{0, "0-2047:128/256", NULL, 2048, PARSE_TIME},
+ {0, "", &exp[12 * step], 8, 0},
+ {0, "\n", &exp[12 * step], 8, 0},
+ {0, ",, ,, , , ,", &exp[12 * step], 8, 0},
+ {0, " , ,, , , ", &exp[12 * step], 8, 0},
+ {0, " , ,, , , \n", &exp[12 * step], 8, 0},
+
{-EINVAL, "-1", NULL, 8, 0},
{-EINVAL, "-0", NULL, 8, 0},
{-EINVAL, "10-1", NULL, 8, 0},
{-EINVAL, "0-31:", NULL, 8, 0},
{-EINVAL, "0-31:0", NULL, 8, 0},
+ {-EINVAL, "0-31:0/", NULL, 8, 0},
{-EINVAL, "0-31:0/0", NULL, 8, 0},
{-EINVAL, "0-31:1/0", NULL, 8, 0},
{-EINVAL, "0-31:10/1", NULL, 8, 0},
+ {-EOVERFLOW, "0-98765432123456789:10/1", NULL, 8, 0},
+
+ {-EINVAL, "a-31", NULL, 8, 0},
+ {-EINVAL, "0-a1", NULL, 8, 0},
+ {-EINVAL, "a-31:10/1", NULL, 8, 0},
+ {-EINVAL, "0-31:a/1", NULL, 8, 0},
+ {-EINVAL, "0-\n", NULL, 8, 0},
};
-static void __init test_bitmap_parselist(void)
+static void __init __test_bitmap_parselist(int is_user)
{
int i;
int err;
- cycles_t cycles;
+ ktime_t time;
DECLARE_BITMAP(bmap, 2048);
+ char *mode = is_user ? "_user" : "";
for (i = 0; i < ARRAY_SIZE(parselist_tests); i++) {
#define ptest parselist_tests[i]
- cycles = get_cycles();
- err = bitmap_parselist(ptest.in, bmap, ptest.nbits);
- cycles = get_cycles() - cycles;
+ if (is_user) {
+ mm_segment_t orig_fs = get_fs();
+ size_t len = strlen(ptest.in);
+
+ set_fs(KERNEL_DS);
+ time = ktime_get();
+ err = bitmap_parselist_user(ptest.in, len,
+ bmap, ptest.nbits);
+ time = ktime_get() - time;
+ set_fs(orig_fs);
+ } else {
+ time = ktime_get();
+ err = bitmap_parselist(ptest.in, bmap, ptest.nbits);
+ time = ktime_get() - time;
+ }
if (err != ptest.errno) {
- pr_err("test %d: input is %s, errno is %d, expected %d\n",
- i, ptest.in, err, ptest.errno);
+ pr_err("parselist%s: %d: input is %s, errno is %d, expected %d\n",
+ mode, i, ptest.in, err, ptest.errno);
continue;
}
if (!err && ptest.expected
&& !__bitmap_equal(bmap, ptest.expected, ptest.nbits)) {
- pr_err("test %d: input is %s, result is 0x%lx, expected 0x%lx\n",
- i, ptest.in, bmap[0], *ptest.expected);
+ pr_err("parselist%s: %d: input is %s, result is 0x%lx, expected 0x%lx\n",
+ mode, i, ptest.in, bmap[0],
+ *ptest.expected);
continue;
}
if (ptest.flags & PARSE_TIME)
- pr_err("test %d: input is '%s' OK, Time: %llu\n",
- i, ptest.in,
- (unsigned long long)cycles);
+ pr_err("parselist%s: %d: input is '%s' OK, Time: %llu\n",
+ mode, i, ptest.in, time);
}
}
+static void __init test_bitmap_parselist(void)
+{
+ __test_bitmap_parselist(0);
+}
+
+static void __init test_bitmap_parselist_user(void)
+{
+ __test_bitmap_parselist(1);
+}
+
#define EXP_BYTES (sizeof(exp) * 8)
static void __init test_bitmap_arr32(void)
@@ -370,6 +410,7 @@ static void __init selftest(void)
test_copy();
test_bitmap_arr32();
test_bitmap_parselist();
+ test_bitmap_parselist_user();
test_mem_optimisations();
}
diff --git a/lib/test_sysctl.c b/lib/test_sysctl.c
index 3dd801c1c85b..566dad3f4196 100644
--- a/lib/test_sysctl.c
+++ b/lib/test_sysctl.c
@@ -47,6 +47,9 @@ struct test_sysctl_data {
unsigned int uint_0001;
char string_0001[65];
+
+#define SYSCTL_TEST_BITMAP_SIZE 65536
+ unsigned long *bitmap_0001;
};
static struct test_sysctl_data test_data = {
@@ -102,6 +105,13 @@ static struct ctl_table test_table[] = {
.mode = 0644,
.proc_handler = proc_dostring,
},
+ {
+ .procname = "bitmap_0001",
+ .data = &test_data.bitmap_0001,
+ .maxlen = SYSCTL_TEST_BITMAP_SIZE,
+ .mode = 0644,
+ .proc_handler = proc_do_large_bitmap,
+ },
{ }
};
@@ -129,15 +139,21 @@ static struct ctl_table_header *test_sysctl_header;
static int __init test_sysctl_init(void)
{
+ test_data.bitmap_0001 = kzalloc(SYSCTL_TEST_BITMAP_SIZE/8, GFP_KERNEL);
+ if (!test_data.bitmap_0001)
+ return -ENOMEM;
test_sysctl_header = register_sysctl_table(test_sysctl_root_table);
- if (!test_sysctl_header)
+ if (!test_sysctl_header) {
+ kfree(test_data.bitmap_0001);
return -ENOMEM;
+ }
return 0;
}
late_initcall(test_sysctl_init);
static void __exit test_sysctl_exit(void)
{
+ kfree(test_data.bitmap_0001);
if (test_sysctl_header)
unregister_sysctl_table(test_sysctl_header);
}
diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c
index f832b095afba..8bbefcaddfe8 100644
--- a/lib/test_vmalloc.c
+++ b/lib/test_vmalloc.c
@@ -384,12 +384,11 @@ static int test_func(void *private)
{
struct test_driver *t = private;
int random_array[ARRAY_SIZE(test_case_array)];
- int index, i, j, ret;
+ int index, i, j;
ktime_t kt;
u64 delta;
- ret = set_cpus_allowed_ptr(current, cpumask_of(t->cpu));
- if (ret < 0)
+ if (set_cpus_allowed_ptr(current, cpumask_of(t->cpu)) < 0)
pr_err("Failed to set affinity to %d CPU\n", t->cpu);
for (i = 0; i < ARRAY_SIZE(test_case_array); i++)
@@ -415,8 +414,7 @@ static int test_func(void *private)
kt = ktime_get();
for (j = 0; j < test_repeat_count; j++) {
- ret = test_case_array[index].test_func();
- if (!ret)
+ if (!test_case_array[index].test_func())
per_cpu_test_data[t->cpu][index].test_passed++;
else
per_cpu_test_data[t->cpu][index].test_failed++;
diff --git a/mm/Kconfig b/mm/Kconfig
index 25c71eb8a7db..ee8d1f311858 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -11,23 +11,24 @@ choice
default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT
default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT
default FLATMEM_MANUAL
+ help
+ This option allows you to change some of the ways that
+ Linux manages its memory internally. Most users will
+ only have one option here selected by the architecture
+ configuration. This is normal.
config FLATMEM_MANUAL
bool "Flat Memory"
depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE
help
- This option allows you to change some of the ways that
- Linux manages its memory internally. Most users will
- only have one option here: FLATMEM. This is normal
- and a correct option.
-
- Some users of more advanced features like NUMA and
- memory hotplug may have different options here.
- DISCONTIGMEM is a more mature, better tested system,
- but is incompatible with memory hotplug and may suffer
- decreased performance over SPARSEMEM. If unsure between
- "Sparse Memory" and "Discontiguous Memory", choose
- "Discontiguous Memory".
+ This option is best suited for non-NUMA systems with
+ a flat address space. FLATMEM is the most efficient
+ memory model in terms of performance and resource
+ consumption and it is the best option for smaller systems.
+
+ For systems that have holes in their physical address
+ spaces and for features like NUMA and memory hotplug,
+ choose "Sparse Memory".
If unsure, choose this option (Flat Memory) over any other.
@@ -38,29 +39,26 @@ config DISCONTIGMEM_MANUAL
This option provides enhanced support for discontiguous
memory systems, over FLATMEM. These systems have holes
in their physical address spaces, and this option provides
- more efficient handling of these holes. However, the vast
- majority of hardware has quite flat address spaces, and
- can have degraded performance from the extra overhead that
- this option imposes.
+ more efficient handling of these holes.
- Many NUMA configurations will have this as the only option.
+ Although "Discontiguous Memory" is still used by several
+ architectures, it is considered deprecated in favor of
+ "Sparse Memory".
- If unsure, choose "Flat Memory" over this option.
+ If unsure, choose "Sparse Memory" over this option.
config SPARSEMEM_MANUAL
bool "Sparse Memory"
depends on ARCH_SPARSEMEM_ENABLE
help
This will be the only option for some systems, including
- memory hotplug systems. This is normal.
+ memory hot-plug systems. This is normal.
- For many other systems, this will be an alternative to
- "Discontiguous Memory". This option provides some potential
- performance benefits, along with decreased code complexity,
- but it is newer, and more experimental.
+ This option provides efficient support for systems with
+ holes in their physical address space and allows memory
+ hot-plug and hot-remove.
- If unsure, choose "Discontiguous Memory" or "Flat Memory"
- over this option.
+ If unsure, choose "Flat Memory" over this option.
endchoice
@@ -136,7 +134,7 @@ config HAVE_MEMBLOCK_PHYS_MAP
config HAVE_GENERIC_GUP
bool
-config ARCH_DISCARD_MEMBLOCK
+config ARCH_KEEP_MEMBLOCK
bool
config MEMORY_ISOLATION
@@ -161,7 +159,6 @@ config MEMORY_HOTPLUG_SPARSE
config MEMORY_HOTPLUG_DEFAULT_ONLINE
bool "Online the newly added memory blocks by default"
- default n
depends on MEMORY_HOTPLUG
help
This option sets the default policy setting for memory hotplug
@@ -258,6 +255,9 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION
config ARCH_ENABLE_THP_MIGRATION
bool
+config CONTIG_ALLOC
+ def_bool (MEMORY_ISOLATION && COMPACTION) || CMA
+
config PHYS_ADDR_T_64BIT
def_bool 64BIT
@@ -436,7 +436,6 @@ config NEED_PER_CPU_KM
config CLEANCACHE
bool "Enable cleancache driver to cache clean pages if tmem is present"
- default n
help
Cleancache can be thought of as a page-granularity victim cache
for clean pages that the kernel's pageframe replacement algorithm
@@ -460,7 +459,6 @@ config CLEANCACHE
config FRONTSWAP
bool "Enable frontswap to cache swap pages if tmem is present"
depends on SWAP
- default n
help
Frontswap is so named because it can be thought of as the opposite
of a "backing" store for a swap device. The data is stored into
@@ -532,7 +530,6 @@ config ZSWAP
depends on FRONTSWAP && CRYPTO=y
select CRYPTO_LZO
select ZPOOL
- default n
help
A lightweight compressed cache for swap pages. It takes
pages that are in the process of being swapped out and attempts to
@@ -549,14 +546,12 @@ config ZSWAP
config ZPOOL
tristate "Common API for compressed memory storage"
- default n
help
Compressed memory storage API. This allows using either zbud or
zsmalloc.
config ZBUD
tristate "Low (Up to 2x) density storage for compressed pages"
- default n
help
A special purpose allocator for storing compressed pages.
It is designed to store up to two compressed pages per physical
@@ -567,7 +562,6 @@ config ZBUD
config Z3FOLD
tristate "Up to 3x density storage for compressed pages"
depends on ZPOOL
- default n
help
A special purpose allocator for storing compressed pages.
It is designed to store up to three compressed pages per physical
@@ -577,7 +571,6 @@ config Z3FOLD
config ZSMALLOC
tristate "Memory allocator for compressed pages"
depends on MMU
- default n
help
zsmalloc is a slab-based memory allocator designed to store
compressed RAM pages. zsmalloc uses virtual memory mapping
@@ -628,7 +621,6 @@ config MAX_STACK_SIZE_MB
config DEFERRED_STRUCT_PAGE_INIT
bool "Defer initialisation of struct pages to kthreads"
- default n
depends on SPARSEMEM
depends on !NEED_PER_CPU_KM
depends on 64BIT
@@ -676,6 +668,22 @@ config ZONE_DEVICE
If FS_DAX is enabled, then say Y.
+config ARCH_HAS_HMM_MIRROR
+ bool
+ default y
+ depends on (X86_64 || PPC64)
+ depends on MMU && 64BIT
+
+config ARCH_HAS_HMM_DEVICE
+ bool
+ default y
+ depends on (X86_64 || PPC64)
+ depends on MEMORY_HOTPLUG
+ depends on MEMORY_HOTREMOVE
+ depends on SPARSEMEM_VMEMMAP
+ depends on ARCH_HAS_ZONE_DEVICE
+ select XARRAY_MULTI
+
config ARCH_HAS_HMM
bool
default y
@@ -694,12 +702,12 @@ config DEV_PAGEMAP_OPS
config HMM
bool
+ select MMU_NOTIFIER
select MIGRATE_VMA_HELPER
config HMM_MIRROR
bool "HMM mirror CPU page table into a device page table"
depends on ARCH_HAS_HMM
- select MMU_NOTIFIER
select HMM
help
Select HMM_MIRROR if you want to mirror range of the CPU page table of a
@@ -740,7 +748,6 @@ config ARCH_HAS_PKEYS
config PERCPU_STATS
bool "Collect percpu memory statistics"
- default n
help
This feature collects and exposes statistics via debugfs. The
information includes global and per chunk statistics, which can
@@ -748,7 +755,6 @@ config PERCPU_STATS
config GUP_BENCHMARK
bool "Enable infrastructure for get_user_pages_fast() benchmarking"
- default n
help
Provides /sys/kernel/debug/gup_benchmark that helps with testing
performance of get_user_pages_fast().
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index e3df921208c0..e980ceb775a4 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -33,7 +33,6 @@ config DEBUG_PAGEALLOC
config DEBUG_PAGEALLOC_ENABLE_DEFAULT
bool "Enable debug page memory allocations by default?"
- default n
depends on DEBUG_PAGEALLOC
---help---
Enable debug page memory allocations by default? This value
diff --git a/mm/Makefile b/mm/Makefile
index d210cc9d6f80..ac5e5ba78874 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -33,7 +33,7 @@ mmu-$(CONFIG_MMU) += process_vm_access.o
endif
obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
- maccess.o page_alloc.o page-writeback.o \
+ maccess.o page-writeback.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o mmu_context.o percpu.o slab_common.o \
@@ -41,6 +41,11 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
interval_tree.o list_lru.o workingset.o \
debug.o $(mmu-y)
+# Give 'page_alloc' its own module-parameter namespace
+page-alloc-y := page_alloc.o
+page-alloc-$(CONFIG_SHUFFLE_PAGE_ALLOCATOR) += shuffle.o
+
+obj-y += page-alloc.o
obj-y += init-mm.o
obj-y += memblock.o
diff --git a/mm/cma.c b/mm/cma.c
index bb2d333ffcb3..5e36d7418031 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -106,8 +106,10 @@ static int __init cma_activate_area(struct cma *cma)
cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
- if (!cma->bitmap)
+ if (!cma->bitmap) {
+ cma->count = 0;
return -ENOMEM;
+ }
WARN_ON_ONCE(!pfn_valid(pfn));
zone = page_zone(pfn_to_page(pfn));
@@ -367,23 +369,26 @@ err:
#ifdef CONFIG_CMA_DEBUG
static void cma_debug_show_areas(struct cma *cma)
{
- unsigned long next_zero_bit, next_set_bit;
+ unsigned long next_zero_bit, next_set_bit, nr_zero;
unsigned long start = 0;
- unsigned int nr_zero, nr_total = 0;
+ unsigned long nr_part, nr_total = 0;
+ unsigned long nbits = cma_bitmap_maxno(cma);
mutex_lock(&cma->lock);
pr_info("number of available pages: ");
for (;;) {
- next_zero_bit = find_next_zero_bit(cma->bitmap, cma->count, start);
- if (next_zero_bit >= cma->count)
+ next_zero_bit = find_next_zero_bit(cma->bitmap, nbits, start);
+ if (next_zero_bit >= nbits)
break;
- next_set_bit = find_next_bit(cma->bitmap, cma->count, next_zero_bit);
+ next_set_bit = find_next_bit(cma->bitmap, nbits, next_zero_bit);
nr_zero = next_set_bit - next_zero_bit;
- pr_cont("%s%u@%lu", nr_total ? "+" : "", nr_zero, next_zero_bit);
- nr_total += nr_zero;
+ nr_part = nr_zero << cma->order_per_bit;
+ pr_cont("%s%lu@%lu", nr_total ? "+" : "", nr_part,
+ next_zero_bit);
+ nr_total += nr_part;
start = next_zero_bit + nr_zero;
}
- pr_cont("=> %u free of %lu total pages\n", nr_total, cma->count);
+ pr_cont("=> %lu free of %lu total pages\n", nr_total, cma->count);
mutex_unlock(&cma->lock);
}
#else
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index 8d7b2fd52225..a7dd9e8e10d5 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -56,7 +56,7 @@ static int cma_maxchunk_get(void *data, u64 *val)
mutex_lock(&cma->lock);
for (;;) {
start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end);
- if (start >= cma->count)
+ if (start >= bitmap_maxno)
break;
end = find_next_bit(cma->bitmap, bitmap_maxno, start);
maxchunk = max(end - start, maxchunk);
diff --git a/mm/compaction.c b/mm/compaction.c
index 3319e0872d01..cbac7277978a 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1164,7 +1164,9 @@ static bool suitable_migration_target(struct compact_control *cc,
static inline unsigned int
freelist_scan_limit(struct compact_control *cc)
{
- return (COMPACT_CLUSTER_MAX >> cc->fast_search_fail) + 1;
+ unsigned short shift = BITS_PER_LONG - 1;
+
+ return (COMPACT_CLUSTER_MAX >> min(shift, cc->fast_search_fail)) + 1;
}
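
The min() matters because shifting by BITS_PER_LONG or more is undefined behaviour in C, and fast_search_fail can keep growing. A hedged userspace sketch of the clamped computation, assuming COMPACT_CLUSTER_MAX is 32 and a 64-bit build:

	#include <stdio.h>

	#define COMPACT_CLUSTER_MAX	32	/* assumed value */
	#define BITS_PER_LONG		64	/* assumed 64-bit build */

	static unsigned int freelist_scan_limit_demo(unsigned short fails)
	{
		unsigned short shift = BITS_PER_LONG - 1;

		if (fails < shift)
			shift = fails;
		return (COMPACT_CLUSTER_MAX >> shift) + 1;
	}

	int main(void)
	{
		/* prints "33 2 1": the limit saturates instead of hitting UB */
		printf("%u %u %u\n", freelist_scan_limit_demo(0),
		       freelist_scan_limit_demo(5),
		       freelist_scan_limit_demo(1000));
		return 0;
	}
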
/*
@@ -1886,13 +1888,13 @@ static enum compact_result __compact_finished(struct compact_control *cc)
bool can_steal;
/* Job done if page is free of the right migratetype */
- if (!list_empty(&area->free_list[migratetype]))
+ if (!free_area_empty(area, migratetype))
return COMPACT_SUCCESS;
#ifdef CONFIG_CMA
/* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
if (migratetype == MIGRATE_MOVABLE &&
- !list_empty(&area->free_list[MIGRATE_CMA]))
+ !free_area_empty(area, MIGRATE_CMA))
return COMPACT_SUCCESS;
#endif
/*
diff --git a/mm/debug.c b/mm/debug.c
index eee9c221280c..b9cd71927d3c 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -136,7 +136,7 @@ void dump_mm(const struct mm_struct *mm)
#endif
"mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
"pgd %px mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n"
- "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
+ "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %llx\n"
"pinned_vm %llx data_vm %lx exec_vm %lx stack_vm %lx\n"
"start_code %lx end_code %lx start_data %lx end_data %lx\n"
"start_brk %lx brk %lx start_stack %lx\n"
@@ -167,7 +167,8 @@ void dump_mm(const struct mm_struct *mm)
atomic_read(&mm->mm_count),
mm_pgtables_bytes(mm),
mm->map_count,
- mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
+ mm->hiwater_rss, mm->hiwater_vm, mm->total_vm,
+ (u64)atomic64_read(&mm->locked_vm),
(u64)atomic64_read(&mm->pinned_vm),
mm->data_vm, mm->exec_vm, mm->stack_vm,
mm->start_code, mm->end_code, mm->start_data, mm->end_data,
diff --git a/mm/filemap.c b/mm/filemap.c
index d78f577baef2..3ad18fa56057 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -24,6 +24,7 @@
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/uio.h>
+#include <linux/error-injection.h>
#include <linux/hash.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
@@ -279,11 +280,11 @@ EXPORT_SYMBOL(delete_from_page_cache);
* @pvec: pagevec with pages to delete
*
* The function walks over mapping->i_pages and removes pages passed in @pvec
- * from the mapping. The function expects @pvec to be sorted by page index.
+ * from the mapping. The function expects @pvec to be sorted by page index
+ * and is optimised for it to be dense.
* It tolerates holes in @pvec (mapping entries at those indices are not
* modified). The function expects only THP head pages to be present in the
- * @pvec and takes care to delete all corresponding tail pages from the
- * mapping as well.
+ * @pvec.
*
* The function expects the i_pages lock to be held.
*/
@@ -292,40 +293,44 @@ static void page_cache_delete_batch(struct address_space *mapping,
{
XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index);
int total_pages = 0;
- int i = 0, tail_pages = 0;
+ int i = 0;
struct page *page;
mapping_set_update(&xas, mapping);
xas_for_each(&xas, page, ULONG_MAX) {
- if (i >= pagevec_count(pvec) && !tail_pages)
+ if (i >= pagevec_count(pvec))
break;
+
+ /* A swap/dax/shadow entry got inserted? Skip it. */
if (xa_is_value(page))
continue;
- if (!tail_pages) {
- /*
- * Some page got inserted in our range? Skip it. We
- * have our pages locked so they are protected from
- * being removed.
- */
- if (page != pvec->pages[i]) {
- VM_BUG_ON_PAGE(page->index >
- pvec->pages[i]->index, page);
- continue;
- }
- WARN_ON_ONCE(!PageLocked(page));
- if (PageTransHuge(page) && !PageHuge(page))
- tail_pages = HPAGE_PMD_NR - 1;
+ /*
+ * A page got inserted in our range? Skip it. We have our
+ * pages locked so they are protected from being removed.
+ * If we see a page whose index is higher than ours, it
+ * means our page has been removed, which shouldn't be
+ * possible because we're holding the PageLock.
+ */
+ if (page != pvec->pages[i]) {
+ VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index,
+ page);
+ continue;
+ }
+
+ WARN_ON_ONCE(!PageLocked(page));
+
+ if (page->index == xas.xa_index)
page->mapping = NULL;
- /*
- * Leave page->index set: truncation lookup relies
- * upon it
- */
+ /* Leave page->index set: truncation lookup relies on it */
+
+ /*
+ * Move to the next page in the vector if this is a regular
+ * page or the index is of the last sub-page of this compound
+ * page.
+ */
+ if (page->index + (1UL << compound_order(page)) - 1 ==
+ xas.xa_index)
i++;
- } else {
- VM_BUG_ON_PAGE(page->index + HPAGE_PMD_NR - tail_pages
- != pvec->pages[i]->index, page);
- tail_pages--;
- }
xas_store(&xas, NULL);
total_pages++;
}
@@ -878,6 +883,7 @@ error:
put_page(page);
return xas_error(&xas);
}
+ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO);
/**
* add_to_page_cache_locked - add a locked page to the pagecache
@@ -1440,7 +1446,7 @@ pgoff_t page_cache_next_miss(struct address_space *mapping,
EXPORT_SYMBOL(page_cache_next_miss);
/**
- * page_cache_prev_miss() - Find the next gap in the page cache.
+ * page_cache_prev_miss() - Find the previous gap in the page cache.
* @mapping: Mapping.
* @index: Index.
* @max_scan: Maximum range to search.
@@ -1491,7 +1497,7 @@ EXPORT_SYMBOL(page_cache_prev_miss);
struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
{
XA_STATE(xas, &mapping->i_pages, offset);
- struct page *head, *page;
+ struct page *page;
rcu_read_lock();
repeat:
@@ -1506,25 +1512,19 @@ repeat:
if (!page || xa_is_value(page))
goto out;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto repeat;
- /* The page was split under us? */
- if (compound_head(page) != head) {
- put_page(head);
- goto repeat;
- }
-
/*
- * Has the page moved?
+ * Has the page moved or been split?
* This is part of the lockless pagecache protocol. See
* include/linux/pagemap.h for details.
*/
if (unlikely(page != xas_reload(&xas))) {
- put_page(head);
+ put_page(page);
goto repeat;
}
+ page = find_subpage(page, offset);
out:
rcu_read_unlock();
@@ -1706,7 +1706,6 @@ unsigned find_get_entries(struct address_space *mapping,
rcu_read_lock();
xas_for_each(&xas, page, ULONG_MAX) {
- struct page *head;
if (xas_retry(&xas, page))
continue;
/*
@@ -1717,17 +1716,13 @@ unsigned find_get_entries(struct address_space *mapping,
if (xa_is_value(page))
goto export;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto retry;
- /* The page was split under us? */
- if (compound_head(page) != head)
- goto put_page;
-
- /* Has the page moved? */
+ /* Has the page moved or been split? */
if (unlikely(page != xas_reload(&xas)))
goto put_page;
+ page = find_subpage(page, xas.xa_index);
export:
indices[ret] = xas.xa_index;
@@ -1736,7 +1731,7 @@ export:
break;
continue;
put_page:
- put_page(head);
+ put_page(page);
retry:
xas_reset(&xas);
}
@@ -1778,33 +1773,27 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
rcu_read_lock();
xas_for_each(&xas, page, end) {
- struct page *head;
if (xas_retry(&xas, page))
continue;
/* Skip over shadow, swap and DAX entries */
if (xa_is_value(page))
continue;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto retry;
- /* The page was split under us? */
- if (compound_head(page) != head)
- goto put_page;
-
- /* Has the page moved? */
+ /* Has the page moved or been split? */
if (unlikely(page != xas_reload(&xas)))
goto put_page;
- pages[ret] = page;
+ pages[ret] = find_subpage(page, xas.xa_index);
if (++ret == nr_pages) {
*start = xas.xa_index + 1;
goto out;
}
continue;
put_page:
- put_page(head);
+ put_page(page);
retry:
xas_reset(&xas);
}
@@ -1849,7 +1838,6 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
rcu_read_lock();
for (page = xas_load(&xas); page; page = xas_next(&xas)) {
- struct page *head;
if (xas_retry(&xas, page))
continue;
/*
@@ -1859,24 +1847,19 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
if (xa_is_value(page))
break;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto retry;
- /* The page was split under us? */
- if (compound_head(page) != head)
- goto put_page;
-
- /* Has the page moved? */
+ /* Has the page moved or been split? */
if (unlikely(page != xas_reload(&xas)))
goto put_page;
- pages[ret] = page;
+ pages[ret] = find_subpage(page, xas.xa_index);
if (++ret == nr_pages)
break;
continue;
put_page:
- put_page(head);
+ put_page(page);
retry:
xas_reset(&xas);
}
@@ -1912,7 +1895,6 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
rcu_read_lock();
xas_for_each_marked(&xas, page, end, tag) {
- struct page *head;
if (xas_retry(&xas, page))
continue;
/*
@@ -1923,26 +1905,21 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
if (xa_is_value(page))
continue;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto retry;
- /* The page was split under us? */
- if (compound_head(page) != head)
- goto put_page;
-
- /* Has the page moved? */
+ /* Has the page moved or been split? */
if (unlikely(page != xas_reload(&xas)))
goto put_page;
- pages[ret] = page;
+ pages[ret] = find_subpage(page, xas.xa_index);
if (++ret == nr_pages) {
*index = xas.xa_index + 1;
goto out;
}
continue;
put_page:
- put_page(head);
+ put_page(page);
retry:
xas_reset(&xas);
}
@@ -1991,7 +1968,6 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
rcu_read_lock();
xas_for_each_marked(&xas, page, ULONG_MAX, tag) {
- struct page *head;
if (xas_retry(&xas, page))
continue;
/*
@@ -2002,17 +1978,13 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
if (xa_is_value(page))
goto export;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto retry;
- /* The page was split under us? */
- if (compound_head(page) != head)
- goto put_page;
-
- /* Has the page moved? */
+ /* Has the page moved or been split? */
if (unlikely(page != xas_reload(&xas)))
goto put_page;
+ page = find_subpage(page, xas.xa_index);
export:
indices[ret] = xas.xa_index;
@@ -2021,7 +1993,7 @@ export:
break;
continue;
put_page:
- put_page(head);
+ put_page(page);
retry:
xas_reset(&xas);
}
@@ -2691,7 +2663,7 @@ void filemap_map_pages(struct vm_fault *vmf,
pgoff_t last_pgoff = start_pgoff;
unsigned long max_idx;
XA_STATE(xas, &mapping->i_pages, start_pgoff);
- struct page *head, *page;
+ struct page *page;
rcu_read_lock();
xas_for_each(&xas, page, end_pgoff) {
@@ -2700,24 +2672,19 @@ void filemap_map_pages(struct vm_fault *vmf,
if (xa_is_value(page))
goto next;
- head = compound_head(page);
-
/*
* Check for a locked page first, as a speculative
* reference may adversely influence page migration.
*/
- if (PageLocked(head))
+ if (PageLocked(page))
goto next;
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto next;
- /* The page was split under us? */
- if (compound_head(page) != head)
- goto skip;
-
- /* Has the page moved? */
+ /* Has the page moved or been split? */
if (unlikely(page != xas_reload(&xas)))
goto skip;
+ page = find_subpage(page, xas.xa_index);
if (!PageUptodate(page) ||
PageReadahead(page) ||
diff --git a/mm/gup.c b/mm/gup.c
index 91819b8ad9cc..2c08248d4fa2 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -28,6 +28,111 @@ struct follow_page_context {
unsigned int page_mask;
};
+typedef int (*set_dirty_func_t)(struct page *page);
+
+static void __put_user_pages_dirty(struct page **pages,
+ unsigned long npages,
+ set_dirty_func_t sdf)
+{
+ unsigned long index;
+
+ for (index = 0; index < npages; index++) {
+ struct page *page = compound_head(pages[index]);
+
+ /*
+ * Checking PageDirty at this point may race with
+ * clear_page_dirty_for_io(), but that's OK. Two key cases:
+ *
+ * 1) This code sees the page as already dirty, so it skips
+ * the call to sdf(). That could happen because
+ * clear_page_dirty_for_io() called page_mkclean(),
+ * followed by set_page_dirty(). However, now the page is
+ * going to get written back, which meets the original
+ * intention of setting it dirty, so all is well:
+ * clear_page_dirty_for_io() goes on to call
+ * TestClearPageDirty(), and write the page back.
+ *
+ * 2) This code sees the page as clean, so it calls sdf().
+ * The page stays dirty, despite being written back, so it
+ * gets written back again in the next writeback cycle.
+ * This is harmless.
+ */
+ if (!PageDirty(page))
+ sdf(page);
+
+ put_user_page(page);
+ }
+}
+
+/**
+ * put_user_pages_dirty() - release and dirty an array of gup-pinned pages
+ * @pages: array of pages to be marked dirty and released.
+ * @npages: number of pages in the @pages array.
+ *
+ * "gup-pinned page" refers to a page that has had one of the get_user_pages()
+ * variants called on that page.
+ *
+ * For each page in the @pages array, make that page (or its head page, if a
+ * compound page) dirty, if it was previously listed as clean. Then, release
+ * the page using put_user_page().
+ *
+ * Please see the put_user_page() documentation for details.
+ *
+ * set_page_dirty(), which does not lock the page, is used here.
+ * Therefore, it is the caller's responsibility to ensure that this is
+ * safe. If not, then put_user_pages_dirty_lock() should be called instead.
+ *
+ */
+void put_user_pages_dirty(struct page **pages, unsigned long npages)
+{
+ __put_user_pages_dirty(pages, npages, set_page_dirty);
+}
+EXPORT_SYMBOL(put_user_pages_dirty);
+
+/**
+ * put_user_pages_dirty_lock() - release and dirty an array of gup-pinned pages
+ * @pages: array of pages to be marked dirty and released.
+ * @npages: number of pages in the @pages array.
+ *
+ * For each page in the @pages array, make that page (or its head page, if a
+ * compound page) dirty, if it was previously listed as clean. Then, release
+ * the page using put_user_page().
+ *
+ * Please see the put_user_page() documentation for details.
+ *
+ * This is just like put_user_pages_dirty(), except that it invokes
+ * set_page_dirty_lock(), instead of set_page_dirty().
+ *
+ */
+void put_user_pages_dirty_lock(struct page **pages, unsigned long npages)
+{
+ __put_user_pages_dirty(pages, npages, set_page_dirty_lock);
+}
+EXPORT_SYMBOL(put_user_pages_dirty_lock);
+
+/**
+ * put_user_pages() - release an array of gup-pinned pages.
+ * @pages: array of pages to be released.
+ * @npages: number of pages in the @pages array.
+ *
+ * For each page in the @pages array, release the page using put_user_page().
+ *
+ * Please see the put_user_page() documentation for details.
+ */
+void put_user_pages(struct page **pages, unsigned long npages)
+{
+ unsigned long index;
+
+ /*
+ * TODO: this can be optimized for huge pages: if a series of pages is
+ * physically contiguous and part of the same compound page, then a
+ * single operation to the head page should suffice.
+ */
+ for (index = 0; index < npages; index++)
+ put_user_page(pages[index]);
+}
+EXPORT_SYMBOL(put_user_pages);
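
A hedged usage sketch of the new helpers (demo_pin_fill_release is an illustrative name): pin writable user pages, write into them, then dirty and release them in one call instead of open-coding set_page_dirty_lock() plus put_page() for each page.

	#include <linux/mm.h>
	#include <linux/sched.h>
	#include <linux/slab.h>

	static int demo_pin_fill_release(unsigned long uaddr, unsigned long nr)
	{
		struct page **pages;
		long pinned;

		pages = kcalloc(nr, sizeof(*pages), GFP_KERNEL);
		if (!pages)
			return -ENOMEM;

		down_read(&current->mm->mmap_sem);
		pinned = get_user_pages(uaddr, nr, FOLL_WRITE, pages, NULL);
		up_read(&current->mm->mmap_sem);

		if (pinned > 0) {
			/* ... write into the pinned pages here ... */
			put_user_pages_dirty_lock(pages, pinned);
		}
		kfree(pages);
		return pinned < 0 ? pinned : 0;
	}
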
+
static struct page *no_page_table(struct vm_area_struct *vma,
unsigned int flags)
{
@@ -1018,6 +1123,15 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
int *locked)
{
+ /*
+ * FIXME: Current FOLL_LONGTERM behavior is incompatible with
+ * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
+ * vmas. As there are no users of this flag in this call we simply
+ * disallow this option for now.
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
+ return -EINVAL;
+
return __get_user_pages_locked(current, current->mm, start, nr_pages,
pages, NULL, locked,
gup_flags | FOLL_TOUCH);
@@ -1046,6 +1160,15 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
int locked = 1;
long ret;
+ /*
+ * FIXME: Current FOLL_LONGTERM behavior is incompatible with
+ * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
+ * vmas. As there are no users of this flag in this call we simply
+ * disallow this option for now.
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
+ return -EINVAL;
+
down_read(&mm->mmap_sem);
ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL,
&locked, gup_flags | FOLL_TOUCH);
@@ -1116,32 +1239,22 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
{
+ /*
+ * FIXME: Current FOLL_LONGTERM behavior is incompatible with
+ * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
+ * vmas. As there are no users of this flag in this call we simply
+ * disallow this option for now.
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
+ return -EINVAL;
+
return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
locked,
gup_flags | FOLL_TOUCH | FOLL_REMOTE);
}
EXPORT_SYMBOL(get_user_pages_remote);
-/*
- * This is the same as get_user_pages_remote(), just with a
- * less-flexible calling convention where we assume that the task
- * and mm being operated on are the current task's and don't allow
- * passing of a locked parameter. We also obviously don't pass
- * FOLL_REMOTE in here.
- */
-long get_user_pages(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas)
-{
- return __get_user_pages_locked(current, current->mm, start, nr_pages,
- pages, vmas, NULL,
- gup_flags | FOLL_TOUCH);
-}
-EXPORT_SYMBOL(get_user_pages);
-
#if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA)
-
-#ifdef CONFIG_FS_DAX
static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
{
long i;
@@ -1160,12 +1273,6 @@ static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
}
return false;
}
-#else
-static inline bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
-{
- return false;
-}
-#endif
#ifdef CONFIG_CMA
static struct page *new_non_cma_page(struct page *page, unsigned long private)
@@ -1219,10 +1326,13 @@ static struct page *new_non_cma_page(struct page *page, unsigned long private)
return __alloc_pages_node(nid, gfp_mask, 0);
}
-static long check_and_migrate_cma_pages(unsigned long start, long nr_pages,
- unsigned int gup_flags,
+static long check_and_migrate_cma_pages(struct task_struct *tsk,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long nr_pages,
struct page **pages,
- struct vm_area_struct **vmas)
+ struct vm_area_struct **vmas,
+ unsigned int gup_flags)
{
long i;
bool drain_allow = true;
@@ -1278,10 +1388,14 @@ check_again:
putback_movable_pages(&cma_page_list);
}
/*
- * We did migrate all the pages, Try to get the page references again
- * migrating any new CMA pages which we failed to isolate earlier.
+ * We did migrate all the pages. Try to get the page references
+ * again, migrating any new CMA pages which we failed to isolate
+ * earlier.
*/
- nr_pages = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
+ nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages,
+ pages, vmas, NULL,
+ gup_flags);
+
if ((nr_pages > 0) && migrate_allow) {
drain_allow = true;
goto check_again;
@@ -1291,66 +1405,101 @@ check_again:
return nr_pages;
}
#else
-static inline long check_and_migrate_cma_pages(unsigned long start, long nr_pages,
- unsigned int gup_flags,
- struct page **pages,
- struct vm_area_struct **vmas)
+static long check_and_migrate_cma_pages(struct task_struct *tsk,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long nr_pages,
+ struct page **pages,
+ struct vm_area_struct **vmas,
+ unsigned int gup_flags)
{
return nr_pages;
}
#endif
/*
- * This is the same as get_user_pages() in that it assumes we are
- * operating on the current task's mm, but it goes further to validate
- * that the vmas associated with the address range are suitable for
- * longterm elevated page reference counts. For example, filesystem-dax
- * mappings are subject to the lifetime enforced by the filesystem and
- * we need guarantees that longterm users like RDMA and V4L2 only
- * establish mappings that have a kernel enforced revocation mechanism.
- *
- * "longterm" == userspace controlled elevated page count lifetime.
- * Contrast this to iov_iter_get_pages() usages which are transient.
+ * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
+ * allows us to process the FOLL_LONGTERM flag.
*/
-long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas_arg)
+static long __gup_longterm_locked(struct task_struct *tsk,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long nr_pages,
+ struct page **pages,
+ struct vm_area_struct **vmas,
+ unsigned int gup_flags)
{
- struct vm_area_struct **vmas = vmas_arg;
- unsigned long flags;
+ struct vm_area_struct **vmas_tmp = vmas;
+ unsigned long flags = 0;
long rc, i;
- if (!pages)
- return -EINVAL;
-
- if (!vmas) {
- vmas = kcalloc(nr_pages, sizeof(struct vm_area_struct *),
- GFP_KERNEL);
- if (!vmas)
- return -ENOMEM;
+ if (gup_flags & FOLL_LONGTERM) {
+ if (!pages)
+ return -EINVAL;
+
+ if (!vmas_tmp) {
+ vmas_tmp = kcalloc(nr_pages,
+ sizeof(struct vm_area_struct *),
+ GFP_KERNEL);
+ if (!vmas_tmp)
+ return -ENOMEM;
+ }
+ flags = memalloc_nocma_save();
}
- flags = memalloc_nocma_save();
- rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
- memalloc_nocma_restore(flags);
- if (rc < 0)
- goto out;
+ rc = __get_user_pages_locked(tsk, mm, start, nr_pages, pages,
+ vmas_tmp, NULL, gup_flags);
- if (check_dax_vmas(vmas, rc)) {
- for (i = 0; i < rc; i++)
- put_page(pages[i]);
- rc = -EOPNOTSUPP;
- goto out;
+ if (gup_flags & FOLL_LONGTERM) {
+ memalloc_nocma_restore(flags);
+ if (rc < 0)
+ goto out;
+
+ if (check_dax_vmas(vmas_tmp, rc)) {
+ for (i = 0; i < rc; i++)
+ put_page(pages[i]);
+ rc = -EOPNOTSUPP;
+ goto out;
+ }
+
+ rc = check_and_migrate_cma_pages(tsk, mm, start, rc, pages,
+ vmas_tmp, gup_flags);
}
- rc = check_and_migrate_cma_pages(start, rc, gup_flags, pages, vmas);
out:
- if (vmas != vmas_arg)
- kfree(vmas);
+ if (vmas_tmp != vmas)
+ kfree(vmas_tmp);
return rc;
}
-EXPORT_SYMBOL(get_user_pages_longterm);
-#endif /* CONFIG_FS_DAX */
+#else /* !CONFIG_FS_DAX && !CONFIG_CMA */
+static __always_inline long __gup_longterm_locked(struct task_struct *tsk,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long nr_pages,
+ struct page **pages,
+ struct vm_area_struct **vmas,
+ unsigned int flags)
+{
+ return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
+ NULL, flags);
+}
+#endif /* CONFIG_FS_DAX || CONFIG_CMA */
+
+/*
+ * This is the same as get_user_pages_remote(), just with a
+ * less-flexible calling convention where we assume that the task
+ * and mm being operated on are the current task's and don't allow
+ * passing of a locked parameter. We also obviously don't pass
+ * FOLL_REMOTE in here.
+ */
+long get_user_pages(unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas)
+{
+ return __gup_longterm_locked(current, current->mm, start, nr_pages,
+ pages, vmas, gup_flags | FOLL_TOUCH);
+}
+EXPORT_SYMBOL(get_user_pages);
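For illustration only: with get_user_pages_longterm() removed by this series, a caller that needs a long-lived pin is expected to pass FOLL_LONGTERM to plain get_user_pages(), as the gup_benchmark hunk further below does. A minimal, hypothetical driver-style sketch follows; the pin_user_buffer/unpin_user_buffer names, buffer address and error handling are invented for the example.

#include <linux/mm.h>
#include <linux/sched.h>

/* Sketch only: pin npages starting at uaddr for long-term use. */
static long pin_user_buffer(unsigned long uaddr, unsigned long npages,
			    struct page **pages)
{
	long pinned;

	down_read(&current->mm->mmap_sem);
	pinned = get_user_pages(uaddr, npages,
				FOLL_WRITE | FOLL_LONGTERM, pages, NULL);
	up_read(&current->mm->mmap_sem);

	return pinned;		/* pages pinned, or -errno */
}

/* Release a pin taken above. */
static void unpin_user_buffer(struct page **pages, long npages)
{
	long i;

	for (i = 0; i < npages; i++)
		put_page(pages[i]);
}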
/**
* populate_vma_page_range() - populate a range of pages in the vma.
@@ -1571,7 +1720,7 @@ static inline struct page *try_get_compound_head(struct page *page, int refs)
#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
+ unsigned int flags, struct page **pages, int *nr)
{
struct dev_pagemap *pgmap = NULL;
int nr_start = *nr, ret = 0;
@@ -1589,10 +1738,13 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
if (pte_protnone(pte))
goto pte_unmap;
- if (!pte_access_permitted(pte, write))
+ if (!pte_access_permitted(pte, flags & FOLL_WRITE))
goto pte_unmap;
if (pte_devmap(pte)) {
+ if (unlikely(flags & FOLL_LONGTERM))
+ goto pte_unmap;
+
pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
if (unlikely(!pgmap)) {
undo_dev_pagemap(nr, nr_start, pages);
@@ -1641,7 +1793,7 @@ pte_unmap:
* useful to have gup_huge_pmd even if we can't operate on ptes.
*/
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
+ unsigned int flags, struct page **pages, int *nr)
{
return 0;
}
@@ -1724,16 +1876,19 @@ static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
#endif
static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
+ unsigned long end, unsigned int flags, struct page **pages, int *nr)
{
struct page *head, *page;
int refs;
- if (!pmd_access_permitted(orig, write))
+ if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
return 0;
- if (pmd_devmap(orig))
+ if (pmd_devmap(orig)) {
+ if (unlikely(flags & FOLL_LONGTERM))
+ return 0;
return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr);
+ }
refs = 0;
page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
@@ -1762,16 +1917,19 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
}
static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
+ unsigned long end, unsigned int flags, struct page **pages, int *nr)
{
struct page *head, *page;
int refs;
- if (!pud_access_permitted(orig, write))
+ if (!pud_access_permitted(orig, flags & FOLL_WRITE))
return 0;
- if (pud_devmap(orig))
+ if (pud_devmap(orig)) {
+ if (unlikely(flags & FOLL_LONGTERM))
+ return 0;
return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr);
+ }
refs = 0;
page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
@@ -1800,13 +1958,13 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
}
static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
- unsigned long end, int write,
+ unsigned long end, unsigned int flags,
struct page **pages, int *nr)
{
int refs;
struct page *head, *page;
- if (!pgd_access_permitted(orig, write))
+ if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
return 0;
BUILD_BUG_ON(pgd_devmap(orig));
@@ -1837,7 +1995,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
}
static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
+ unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
pmd_t *pmdp;
@@ -1860,7 +2018,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
if (pmd_protnone(pmd))
return 0;
- if (!gup_huge_pmd(pmd, pmdp, addr, next, write,
+ if (!gup_huge_pmd(pmd, pmdp, addr, next, flags,
pages, nr))
return 0;
@@ -1870,9 +2028,9 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
* pmd format and THP pmd format
*/
if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
- PMD_SHIFT, next, write, pages, nr))
+ PMD_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+ } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
return 0;
} while (pmdp++, addr = next, addr != end);
@@ -1880,7 +2038,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
}
static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
+ unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
pud_t *pudp;
@@ -1893,14 +2051,14 @@ static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
if (pud_none(pud))
return 0;
if (unlikely(pud_huge(pud))) {
- if (!gup_huge_pud(pud, pudp, addr, next, write,
+ if (!gup_huge_pud(pud, pudp, addr, next, flags,
pages, nr))
return 0;
} else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
- PUD_SHIFT, next, write, pages, nr))
+ PUD_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+ } else if (!gup_pmd_range(pud, addr, next, flags, pages, nr))
return 0;
} while (pudp++, addr = next, addr != end);
@@ -1908,7 +2066,7 @@ static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
}
static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
+ unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
p4d_t *p4dp;
@@ -1923,9 +2081,9 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
BUILD_BUG_ON(p4d_huge(p4d));
if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
- P4D_SHIFT, next, write, pages, nr))
+ P4D_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pud_range(p4d, addr, next, write, pages, nr))
+ } else if (!gup_pud_range(p4d, addr, next, flags, pages, nr))
return 0;
} while (p4dp++, addr = next, addr != end);
@@ -1933,7 +2091,7 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
}
static void gup_pgd_range(unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
+ unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
pgd_t *pgdp;
@@ -1946,14 +2104,14 @@ static void gup_pgd_range(unsigned long addr, unsigned long end,
if (pgd_none(pgd))
return;
if (unlikely(pgd_huge(pgd))) {
- if (!gup_huge_pgd(pgd, pgdp, addr, next, write,
+ if (!gup_huge_pgd(pgd, pgdp, addr, next, flags,
pages, nr))
return;
} else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
- PGDIR_SHIFT, next, write, pages, nr))
+ PGDIR_SHIFT, next, flags, pages, nr))
return;
- } else if (!gup_p4d_range(pgd, addr, next, write, pages, nr))
+ } else if (!gup_p4d_range(pgd, addr, next, flags, pages, nr))
return;
} while (pgdp++, addr = next, addr != end);
}
@@ -2007,18 +2165,41 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
if (gup_fast_permitted(start, nr_pages)) {
local_irq_save(flags);
- gup_pgd_range(start, end, write, pages, &nr);
+ gup_pgd_range(start, end, write ? FOLL_WRITE : 0, pages, &nr);
local_irq_restore(flags);
}
return nr;
}
+static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
+{
+ int ret;
+
+ /*
+ * FIXME: FOLL_LONGTERM does not work with
+ * get_user_pages_unlocked() (see comments in that function)
+ */
+ if (gup_flags & FOLL_LONGTERM) {
+ down_read(&current->mm->mmap_sem);
+ ret = __gup_longterm_locked(current, current->mm,
+ start, nr_pages,
+ pages, NULL, gup_flags);
+ up_read(&current->mm->mmap_sem);
+ } else {
+ ret = get_user_pages_unlocked(start, nr_pages,
+ pages, gup_flags);
+ }
+
+ return ret;
+}
+
/**
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
* @nr_pages: number of pages from start to pin
- * @write: whether pages will be written to
+ * @gup_flags: flags modifying pin behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
*
@@ -2030,8 +2211,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
* requested. If nr_pages is 0 or negative, returns 0. If no pages
* were pinned, returns -errno.
*/
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages)
+int get_user_pages_fast(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
{
unsigned long addr, len, end;
int nr = 0, ret = 0;
@@ -2049,7 +2230,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
if (gup_fast_permitted(start, nr_pages)) {
local_irq_disable();
- gup_pgd_range(addr, end, write, pages, &nr);
+ gup_pgd_range(addr, end, gup_flags, pages, &nr);
local_irq_enable();
ret = nr;
}
@@ -2059,8 +2240,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
start += nr << PAGE_SHIFT;
pages += nr;
- ret = get_user_pages_unlocked(start, nr_pages - nr, pages,
- write ? FOLL_WRITE : 0);
+ ret = __gup_longterm_unlocked(start, nr_pages - nr,
+ gup_flags, pages);
/* Have to be a bit careful with return values */
if (nr > 0) {
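The hunks above also switch get_user_pages_fast() from an int write argument to unsigned int gup_flags. A hedged sketch of how a caller would be converted; grab_pages_fast and its arguments are made up for the example.

#include <linux/mm.h>

/* Sketch of converting a caller to the new gup_flags convention. */
static int grab_pages_fast(unsigned long uaddr, int nr, struct page **pages,
			   bool longterm)
{
	unsigned int gup_flags = FOLL_WRITE;	/* was: write = 1 */

	/*
	 * Long-term pins can now be requested here as well; entries the
	 * fast path cannot handle fall back to the slow path internally.
	 */
	if (longterm)
		gup_flags |= FOLL_LONGTERM;

	return get_user_pages_fast(uaddr, nr, gup_flags, pages);
}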
diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c
index 6c0279e70cc4..7dd602d7f8db 100644
--- a/mm/gup_benchmark.c
+++ b/mm/gup_benchmark.c
@@ -54,8 +54,9 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
pages + i);
break;
case GUP_LONGTERM_BENCHMARK:
- nr = get_user_pages_longterm(addr, nr, gup->flags & 1,
- pages + i, NULL);
+ nr = get_user_pages(addr, nr,
+ (gup->flags & 1) | FOLL_LONGTERM,
+ pages + i, NULL);
break;
case GUP_BENCHMARK:
nr = get_user_pages(addr, nr, gup->flags & 1, pages + i,
diff --git a/mm/hmm.c b/mm/hmm.c
index fe1cd87e49ac..0db8491090b8 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -30,6 +30,7 @@
#include <linux/hugetlb.h>
#include <linux/memremap.h>
#include <linux/jump_label.h>
+#include <linux/dma-mapping.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>
@@ -38,54 +39,48 @@
#if IS_ENABLED(CONFIG_HMM_MIRROR)
static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
-/*
- * struct hmm - HMM per mm struct
- *
- * @mm: mm struct this HMM struct is bound to
- * @lock: lock protecting ranges list
- * @ranges: list of range being snapshotted
- * @mirrors: list of mirrors for this mm
- * @mmu_notifier: mmu notifier to track updates to CPU page table
- * @mirrors_sem: read/write semaphore protecting the mirrors list
- */
-struct hmm {
- struct mm_struct *mm;
- spinlock_t lock;
- struct list_head ranges;
- struct list_head mirrors;
- struct mmu_notifier mmu_notifier;
- struct rw_semaphore mirrors_sem;
-};
+static inline struct hmm *mm_get_hmm(struct mm_struct *mm)
+{
+ struct hmm *hmm = READ_ONCE(mm->hmm);
-/*
- * hmm_register - register HMM against an mm (HMM internal)
+ if (hmm && kref_get_unless_zero(&hmm->kref))
+ return hmm;
+
+ return NULL;
+}
+
+/**
+ * hmm_get_or_create - register HMM against an mm (HMM internal)
*
* @mm: mm struct to attach to
+ * Returns: an HMM object, either by referencing the existing
+ * (per-process) object, or by creating a new one.
*
- * This is not intended to be used directly by device drivers. It allocates an
- * HMM struct if mm does not have one, and initializes it.
+ * This is not intended to be used directly by device drivers. If mm already
+ * has an HMM struct then it gets a reference on it and returns it. Otherwise
+ * it allocates an HMM struct, initializes it, associates it with the mm and
+ * returns it.
*/
-static struct hmm *hmm_register(struct mm_struct *mm)
+static struct hmm *hmm_get_or_create(struct mm_struct *mm)
{
- struct hmm *hmm = READ_ONCE(mm->hmm);
+ struct hmm *hmm = mm_get_hmm(mm);
bool cleanup = false;
- /*
- * The hmm struct can only be freed once the mm_struct goes away,
- * hence we should always have pre-allocated an new hmm struct
- * above.
- */
if (hmm)
return hmm;
hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
if (!hmm)
return NULL;
+ init_waitqueue_head(&hmm->wq);
INIT_LIST_HEAD(&hmm->mirrors);
init_rwsem(&hmm->mirrors_sem);
hmm->mmu_notifier.ops = NULL;
INIT_LIST_HEAD(&hmm->ranges);
- spin_lock_init(&hmm->lock);
+ mutex_init(&hmm->lock);
+ kref_init(&hmm->kref);
+ hmm->notifiers = 0;
+ hmm->dead = false;
hmm->mm = mm;
spin_lock(&mm->page_table_lock);
@@ -106,7 +101,7 @@ static struct hmm *hmm_register(struct mm_struct *mm)
if (__mmu_notifier_register(&hmm->mmu_notifier, mm))
goto error_mm;
- return mm->hmm;
+ return hmm;
error_mm:
spin_lock(&mm->page_table_lock);
@@ -118,54 +113,60 @@ error:
return NULL;
}
-void hmm_mm_destroy(struct mm_struct *mm)
+static void hmm_free(struct kref *kref)
{
- kfree(mm->hmm);
-}
+ struct hmm *hmm = container_of(kref, struct hmm, kref);
+ struct mm_struct *mm = hmm->mm;
-static int hmm_invalidate_range(struct hmm *hmm, bool device,
- const struct hmm_update *update)
-{
- struct hmm_mirror *mirror;
- struct hmm_range *range;
-
- spin_lock(&hmm->lock);
- list_for_each_entry(range, &hmm->ranges, list) {
- unsigned long addr, idx, npages;
+ mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);
- if (update->end < range->start || update->start >= range->end)
- continue;
+ spin_lock(&mm->page_table_lock);
+ if (mm->hmm == hmm)
+ mm->hmm = NULL;
+ spin_unlock(&mm->page_table_lock);
- range->valid = false;
- addr = max(update->start, range->start);
- idx = (addr - range->start) >> PAGE_SHIFT;
- npages = (min(range->end, update->end) - addr) >> PAGE_SHIFT;
- memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages);
- }
- spin_unlock(&hmm->lock);
+ kfree(hmm);
+}
- if (!device)
- return 0;
+static inline void hmm_put(struct hmm *hmm)
+{
+ kref_put(&hmm->kref, hmm_free);
+}
- down_read(&hmm->mirrors_sem);
- list_for_each_entry(mirror, &hmm->mirrors, list) {
- int ret;
+void hmm_mm_destroy(struct mm_struct *mm)
+{
+ struct hmm *hmm;
- ret = mirror->ops->sync_cpu_device_pagetables(mirror, update);
- if (!update->blockable && ret == -EAGAIN) {
- up_read(&hmm->mirrors_sem);
- return -EAGAIN;
- }
+ spin_lock(&mm->page_table_lock);
+ hmm = mm_get_hmm(mm);
+ mm->hmm = NULL;
+ if (hmm) {
+ hmm->mm = NULL;
+ hmm->dead = true;
+ spin_unlock(&mm->page_table_lock);
+ hmm_put(hmm);
+ return;
}
- up_read(&hmm->mirrors_sem);
- return 0;
+ spin_unlock(&mm->page_table_lock);
}
static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
+ struct hmm *hmm = mm_get_hmm(mm);
struct hmm_mirror *mirror;
- struct hmm *hmm = mm->hmm;
+ struct hmm_range *range;
+
+ /* Report this HMM as dying. */
+ hmm->dead = true;
+
+ /* Wake-up everyone waiting on any range. */
+ mutex_lock(&hmm->lock);
+ list_for_each_entry(range, &hmm->ranges, list) {
+ range->valid = false;
+ }
+ wake_up_all(&hmm->wq);
+ mutex_unlock(&hmm->lock);
down_write(&hmm->mirrors_sem);
mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror,
@@ -186,36 +187,86 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
struct hmm_mirror, list);
}
up_write(&hmm->mirrors_sem);
+
+ hmm_put(hmm);
}
static int hmm_invalidate_range_start(struct mmu_notifier *mn,
- const struct mmu_notifier_range *range)
+ const struct mmu_notifier_range *nrange)
{
+ struct hmm *hmm = mm_get_hmm(nrange->mm);
+ struct hmm_mirror *mirror;
struct hmm_update update;
- struct hmm *hmm = range->mm->hmm;
+ struct hmm_range *range;
+ int ret = 0;
VM_BUG_ON(!hmm);
- update.start = range->start;
- update.end = range->end;
+ update.start = nrange->start;
+ update.end = nrange->end;
update.event = HMM_UPDATE_INVALIDATE;
- update.blockable = range->blockable;
- return hmm_invalidate_range(hmm, true, &update);
+ update.blockable = mmu_notifier_range_blockable(nrange);
+
+ if (mmu_notifier_range_blockable(nrange))
+ mutex_lock(&hmm->lock);
+ else if (!mutex_trylock(&hmm->lock)) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ hmm->notifiers++;
+ list_for_each_entry(range, &hmm->ranges, list) {
+ if (update.end < range->start || update.start >= range->end)
+ continue;
+
+ range->valid = false;
+ }
+ mutex_unlock(&hmm->lock);
+
+ if (mmu_notifier_range_blockable(nrange))
+ down_read(&hmm->mirrors_sem);
+ else if (!down_read_trylock(&hmm->mirrors_sem)) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ list_for_each_entry(mirror, &hmm->mirrors, list) {
+ int ret;
+
+ ret = mirror->ops->sync_cpu_device_pagetables(mirror, &update);
+ if (!update.blockable && ret == -EAGAIN) {
+ up_read(&hmm->mirrors_sem);
+ ret = -EAGAIN;
+ goto out;
+ }
+ }
+ up_read(&hmm->mirrors_sem);
+
+out:
+ hmm_put(hmm);
+ return ret;
}
static void hmm_invalidate_range_end(struct mmu_notifier *mn,
- const struct mmu_notifier_range *range)
+ const struct mmu_notifier_range *nrange)
{
- struct hmm_update update;
- struct hmm *hmm = range->mm->hmm;
+ struct hmm *hmm = mm_get_hmm(nrange->mm);
VM_BUG_ON(!hmm);
- update.start = range->start;
- update.end = range->end;
- update.event = HMM_UPDATE_INVALIDATE;
- update.blockable = true;
- hmm_invalidate_range(hmm, false, &update);
+ mutex_lock(&hmm->lock);
+ hmm->notifiers--;
+ if (!hmm->notifiers) {
+ struct hmm_range *range;
+
+ list_for_each_entry(range, &hmm->ranges, list) {
+ if (range->valid)
+ continue;
+ range->valid = true;
+ }
+ wake_up_all(&hmm->wq);
+ }
+ mutex_unlock(&hmm->lock);
+
+ hmm_put(hmm);
}
static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
@@ -241,24 +292,13 @@ int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
if (!mm || !mirror || !mirror->ops)
return -EINVAL;
-again:
- mirror->hmm = hmm_register(mm);
+ mirror->hmm = hmm_get_or_create(mm);
if (!mirror->hmm)
return -ENOMEM;
down_write(&mirror->hmm->mirrors_sem);
- if (mirror->hmm->mm == NULL) {
- /*
- * A racing hmm_mirror_unregister() is about to destroy the hmm
- * struct. Try again to allocate a new one.
- */
- up_write(&mirror->hmm->mirrors_sem);
- mirror->hmm = NULL;
- goto again;
- } else {
- list_add(&mirror->list, &mirror->hmm->mirrors);
- up_write(&mirror->hmm->mirrors_sem);
- }
+ list_add(&mirror->list, &mirror->hmm->mirrors);
+ up_write(&mirror->hmm->mirrors_sem);
return 0;
}
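With the kref-based lifetime introduced above, a mirror registers once and drops its reference through hmm_mirror_unregister(). A minimal sketch of a driver mirror, assuming the hmm_mirror_ops layout in include/linux/hmm.h at this point in the series; all my_* names are invented for the example.

#include <linux/hmm.h>

static void my_mirror_release(struct hmm_mirror *mirror)
{
	/* The mm is going away; stop using this mirror from now on. */
}

static int my_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
					 const struct hmm_update *update)
{
	/*
	 * A real driver would invalidate its device page table for
	 * [update->start, update->end) here. When update->blockable is
	 * false this callback must not sleep; returning -EAGAIN lets
	 * hmm_invalidate_range_start() back off and retry.
	 */
	if (!update->blockable)
		return -EAGAIN;
	return 0;
}

static const struct hmm_mirror_ops my_mirror_ops = {
	.release = my_mirror_release,
	.sync_cpu_device_pagetables = my_sync_cpu_device_pagetables,
};

static struct hmm_mirror my_mirror = { .ops = &my_mirror_ops };

static int my_mirror_init(struct mm_struct *mm)
{
	return hmm_mirror_register(&my_mirror, mm);
}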
@@ -273,38 +313,24 @@ EXPORT_SYMBOL(hmm_mirror_register);
*/
void hmm_mirror_unregister(struct hmm_mirror *mirror)
{
- bool should_unregister = false;
- struct mm_struct *mm;
- struct hmm *hmm;
+ struct hmm *hmm = READ_ONCE(mirror->hmm);
- if (mirror->hmm == NULL)
+ if (hmm == NULL)
return;
- hmm = mirror->hmm;
down_write(&hmm->mirrors_sem);
list_del_init(&mirror->list);
- should_unregister = list_empty(&hmm->mirrors);
+ /* To protect us against double unregister ... */
mirror->hmm = NULL;
- mm = hmm->mm;
- hmm->mm = NULL;
up_write(&hmm->mirrors_sem);
- if (!should_unregister || mm == NULL)
- return;
-
- mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);
-
- spin_lock(&mm->page_table_lock);
- if (mm->hmm == hmm)
- mm->hmm = NULL;
- spin_unlock(&mm->page_table_lock);
-
- kfree(hmm);
+ hmm_put(hmm);
}
EXPORT_SYMBOL(hmm_mirror_unregister);
struct hmm_vma_walk {
struct hmm_range *range;
+ struct dev_pagemap *pgmap;
unsigned long last;
bool fault;
bool block;
@@ -323,13 +349,13 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
flags |= write_fault ? FAULT_FLAG_WRITE : 0;
ret = handle_mm_fault(vma, addr, flags);
if (ret & VM_FAULT_RETRY)
- return -EBUSY;
+ return -EAGAIN;
if (ret & VM_FAULT_ERROR) {
*pfn = range->values[HMM_PFN_ERROR];
return -EFAULT;
}
- return -EAGAIN;
+ return -EBUSY;
}
static int hmm_pfns_bad(unsigned long addr,
@@ -355,7 +381,7 @@ static int hmm_pfns_bad(unsigned long addr,
* @fault: should we fault or not ?
* @write_fault: write fault ?
* @walk: mm_walk structure
- * Returns: 0 on success, -EAGAIN after page fault, or page fault error
+ * Returns: 0 on success, -EBUSY after page fault, or page fault error
*
* This function will be called whenever pmd_none() or pte_none() returns true,
* or whenever there is no page directory covering the virtual address range.
@@ -367,23 +393,25 @@ static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
uint64_t *pfns = range->pfns;
- unsigned long i;
+ unsigned long i, page_size;
hmm_vma_walk->last = addr;
- i = (addr - range->start) >> PAGE_SHIFT;
- for (; addr < end; addr += PAGE_SIZE, i++) {
+ page_size = hmm_range_page_size(range);
+ i = (addr - range->start) >> range->page_shift;
+
+ for (; addr < end; addr += page_size, i++) {
pfns[i] = range->values[HMM_PFN_NONE];
if (fault || write_fault) {
int ret;
ret = hmm_vma_do_fault(walk, addr, write_fault,
&pfns[i]);
- if (ret != -EAGAIN)
+ if (ret != -EBUSY)
return ret;
}
}
- return (fault || write_fault) ? -EAGAIN : 0;
+ return (fault || write_fault) ? -EBUSY : 0;
}
static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
@@ -392,10 +420,21 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
{
struct hmm_range *range = hmm_vma_walk->range;
- *fault = *write_fault = false;
if (!hmm_vma_walk->fault)
return;
+ /*
+	 * We consider not only the individual per-page request but also the
+	 * default flags requested for the range. The API can be used in two
+	 * fashions. In the first, the HMM user coalesces multiple page faults
+	 * into one request and sets flags per pfn for those faults. In the
+	 * second, the HMM user wants to pre-fault a range with specific
+	 * flags. For the latter it is a waste to have the user pre-fill the
+	 * pfn array with a default flags value.
+ */
+ pfns = (pfns & range->pfn_flags_mask) | range->default_flags;
+
/* We aren't ask to do anything ... */
if (!(pfns & range->flags[HMM_PFN_VALID]))
return;
@@ -431,10 +470,11 @@ static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
return;
}
+ *fault = *write_fault = false;
for (i = 0; i < npages; ++i) {
hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags,
fault, write_fault);
- if ((*fault) || (*write_fault))
+ if ((*write_fault))
return;
}
}
@@ -465,12 +505,22 @@ static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
range->flags[HMM_PFN_VALID];
}
+static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
+{
+ if (!pud_present(pud))
+ return 0;
+ return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
+ range->flags[HMM_PFN_WRITE] :
+ range->flags[HMM_PFN_VALID];
+}
+
static int hmm_vma_handle_pmd(struct mm_walk *walk,
unsigned long addr,
unsigned long end,
uint64_t *pfns,
pmd_t pmd)
{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
unsigned long pfn, npages, i;
@@ -486,10 +536,25 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
pfn = pmd_pfn(pmd) + pte_index(addr);
- for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
- pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
+ for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
+ if (pmd_devmap(pmd)) {
+ hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
+ hmm_vma_walk->pgmap);
+ if (unlikely(!hmm_vma_walk->pgmap))
+ return -EBUSY;
+ }
+ pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
+ }
+ if (hmm_vma_walk->pgmap) {
+ put_dev_pagemap(hmm_vma_walk->pgmap);
+ hmm_vma_walk->pgmap = NULL;
+ }
hmm_vma_walk->last = end;
return 0;
+#else
+	/* If THP is not enabled then we should never reach this code! */
+ return -EINVAL;
+#endif
}
static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
@@ -514,11 +579,11 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
uint64_t orig_pfn = *pfn;
*pfn = range->values[HMM_PFN_NONE];
- cpu_flags = pte_to_hmm_pfn_flags(range, pte);
- hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
- &fault, &write_fault);
+ fault = write_fault = false;
if (pte_none(pte)) {
+ hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0,
+ &fault, &write_fault);
if (fault || write_fault)
goto fault;
return 0;
@@ -546,7 +611,8 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
&fault, &write_fault);
if (fault || write_fault)
goto fault;
- *pfn = hmm_pfn_from_pfn(range, swp_offset(entry));
+ *pfn = hmm_device_entry_from_pfn(range,
+ swp_offset(entry));
*pfn |= cpu_flags;
return 0;
}
@@ -557,7 +623,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
hmm_vma_walk->last = addr;
migration_entry_wait(vma->vm_mm,
pmdp, addr);
- return -EAGAIN;
+ return -EBUSY;
}
return 0;
}
@@ -565,15 +631,33 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
/* Report error for everything else */
*pfn = range->values[HMM_PFN_ERROR];
return -EFAULT;
+ } else {
+ cpu_flags = pte_to_hmm_pfn_flags(range, pte);
+ hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
+ &fault, &write_fault);
}
if (fault || write_fault)
goto fault;
- *pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags;
+ if (pte_devmap(pte)) {
+ hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte),
+ hmm_vma_walk->pgmap);
+ if (unlikely(!hmm_vma_walk->pgmap))
+ return -EBUSY;
+ } else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) {
+ *pfn = range->values[HMM_PFN_SPECIAL];
+ return -EFAULT;
+ }
+
+ *pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags;
return 0;
fault:
+ if (hmm_vma_walk->pgmap) {
+ put_dev_pagemap(hmm_vma_walk->pgmap);
+ hmm_vma_walk->pgmap = NULL;
+ }
pte_unmap(ptep);
/* Fault any virtual address we were asked to fault */
return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
@@ -615,7 +699,7 @@ again:
if (fault || write_fault) {
hmm_vma_walk->last = addr;
pmd_migration_entry_wait(vma->vm_mm, pmdp);
- return -EAGAIN;
+ return -EBUSY;
}
return 0;
} else if (!pmd_present(pmd))
@@ -661,12 +745,158 @@ again:
return r;
}
}
+ if (hmm_vma_walk->pgmap) {
+ /*
+ * We do put_dev_pagemap() here and not in hmm_vma_handle_pte()
+ * so that we can leverage get_dev_pagemap() optimization which
+ * will not re-take a reference on a pgmap if we already have
+ * one.
+ */
+ put_dev_pagemap(hmm_vma_walk->pgmap);
+ hmm_vma_walk->pgmap = NULL;
+ }
pte_unmap(ptep - 1);
hmm_vma_walk->last = addr;
return 0;
}
+static int hmm_vma_walk_pud(pud_t *pudp,
+ unsigned long start,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ unsigned long addr = start, next;
+ pmd_t *pmdp;
+ pud_t pud;
+ int ret;
+
+again:
+ pud = READ_ONCE(*pudp);
+ if (pud_none(pud))
+ return hmm_vma_walk_hole(start, end, walk);
+
+ if (pud_huge(pud) && pud_devmap(pud)) {
+ unsigned long i, npages, pfn;
+ uint64_t *pfns, cpu_flags;
+ bool fault, write_fault;
+
+ if (!pud_present(pud))
+ return hmm_vma_walk_hole(start, end, walk);
+
+ i = (addr - range->start) >> PAGE_SHIFT;
+ npages = (end - addr) >> PAGE_SHIFT;
+ pfns = &range->pfns[i];
+
+ cpu_flags = pud_to_hmm_pfn_flags(range, pud);
+ hmm_range_need_fault(hmm_vma_walk, pfns, npages,
+ cpu_flags, &fault, &write_fault);
+ if (fault || write_fault)
+ return hmm_vma_walk_hole_(addr, end, fault,
+ write_fault, walk);
+
+#ifdef CONFIG_HUGETLB_PAGE
+ pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+ for (i = 0; i < npages; ++i, ++pfn) {
+ hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
+ hmm_vma_walk->pgmap);
+ if (unlikely(!hmm_vma_walk->pgmap))
+ return -EBUSY;
+ pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
+ cpu_flags;
+ }
+ if (hmm_vma_walk->pgmap) {
+ put_dev_pagemap(hmm_vma_walk->pgmap);
+ hmm_vma_walk->pgmap = NULL;
+ }
+ hmm_vma_walk->last = end;
+ return 0;
+#else
+ return -EINVAL;
+#endif
+ }
+
+ split_huge_pud(walk->vma, pudp, addr);
+ if (pud_none(*pudp))
+ goto again;
+
+ pmdp = pmd_offset(pudp, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ ret = hmm_vma_walk_pmd(pmdp, addr, next, walk);
+ if (ret)
+ return ret;
+ } while (pmdp++, addr = next, addr != end);
+
+ return 0;
+}
+
+static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
+ unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+ unsigned long addr = start, i, pfn, mask, size, pfn_inc;
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ struct vm_area_struct *vma = walk->vma;
+ struct hstate *h = hstate_vma(vma);
+ uint64_t orig_pfn, cpu_flags;
+ bool fault, write_fault;
+ spinlock_t *ptl;
+ pte_t entry;
+ int ret = 0;
+
+ size = 1UL << huge_page_shift(h);
+ mask = size - 1;
+ if (range->page_shift != PAGE_SHIFT) {
+		/* Make sure we are looking at a full page. */
+ if (start & mask)
+ return -EINVAL;
+ if (end < (start + size))
+ return -EINVAL;
+ pfn_inc = size >> PAGE_SHIFT;
+ } else {
+ pfn_inc = 1;
+ size = PAGE_SIZE;
+ }
+
+
+ ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
+ entry = huge_ptep_get(pte);
+
+ i = (start - range->start) >> range->page_shift;
+ orig_pfn = range->pfns[i];
+ range->pfns[i] = range->values[HMM_PFN_NONE];
+ cpu_flags = pte_to_hmm_pfn_flags(range, entry);
+ fault = write_fault = false;
+ hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
+ &fault, &write_fault);
+ if (fault || write_fault) {
+ ret = -ENOENT;
+ goto unlock;
+ }
+
+ pfn = pte_pfn(entry) + ((start & mask) >> range->page_shift);
+ for (; addr < end; addr += size, i++, pfn += pfn_inc)
+ range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
+ cpu_flags;
+ hmm_vma_walk->last = end;
+
+unlock:
+ spin_unlock(ptl);
+
+ if (ret == -ENOENT)
+ return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
+
+ return ret;
+#else /* CONFIG_HUGETLB_PAGE */
+ return -EINVAL;
+#endif
+}
+
static void hmm_pfns_clear(struct hmm_range *range,
uint64_t *pfns,
unsigned long addr,
@@ -676,279 +906,437 @@ static void hmm_pfns_clear(struct hmm_range *range,
*pfns = range->values[HMM_PFN_NONE];
}
-static void hmm_pfns_special(struct hmm_range *range)
-{
- unsigned long addr = range->start, i = 0;
-
- for (; addr < range->end; addr += PAGE_SIZE, i++)
- range->pfns[i] = range->values[HMM_PFN_SPECIAL];
-}
-
/*
- * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses
- * @range: range being snapshotted
- * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
- * vma permission, 0 success
- *
- * This snapshots the CPU page table for a range of virtual addresses. Snapshot
- * validity is tracked by range struct. See hmm_vma_range_done() for further
- * information.
- *
- * The range struct is initialized here. It tracks the CPU page table, but only
- * if the function returns success (0), in which case the caller must then call
- * hmm_vma_range_done() to stop CPU page table update tracking on this range.
+ * hmm_range_register() - start tracking changes to CPU page table over a range
+ * @range: range
+ * @mm: the mm struct for the range of virtual addresses
+ * @start: start virtual address (inclusive)
+ * @end: end virtual address (exclusive)
+ * @page_shift: expected page shift for the range
+ * Returns 0 on success, -EFAULT if the address space is no longer valid
*
- * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS
- * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED !
+ * Track updates to the CPU page table; see include/linux/hmm.h
*/
-int hmm_vma_get_pfns(struct hmm_range *range)
+int hmm_range_register(struct hmm_range *range,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end,
+ unsigned page_shift)
{
- struct vm_area_struct *vma = range->vma;
- struct hmm_vma_walk hmm_vma_walk;
- struct mm_walk mm_walk;
- struct hmm *hmm;
+ unsigned long mask = ((1UL << page_shift) - 1UL);
+
+ range->valid = false;
+ range->hmm = NULL;
- /* Sanity check, this really should not happen ! */
- if (range->start < vma->vm_start || range->start >= vma->vm_end)
+ if ((start & mask) || (end & mask))
return -EINVAL;
- if (range->end < vma->vm_start || range->end > vma->vm_end)
+ if (start >= end)
return -EINVAL;
- hmm = hmm_register(vma->vm_mm);
- if (!hmm)
- return -ENOMEM;
- /* Caller must have registered a mirror, via hmm_mirror_register() ! */
- if (!hmm->mmu_notifier.ops)
- return -EINVAL;
+ range->page_shift = page_shift;
+ range->start = start;
+ range->end = end;
- /* FIXME support hugetlb fs */
- if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
- vma_is_dax(vma)) {
- hmm_pfns_special(range);
- return -EINVAL;
- }
+ range->hmm = hmm_get_or_create(mm);
+ if (!range->hmm)
+ return -EFAULT;
- if (!(vma->vm_flags & VM_READ)) {
- /*
- * If vma do not allow read access, then assume that it does
- * not allow write access, either. Architecture that allow
- * write without read access are not supported by HMM, because
- * operations such has atomic access would not work.
- */
- hmm_pfns_clear(range, range->pfns, range->start, range->end);
- return -EPERM;
+	/* Check if hmm_mm_destroy() was called. */
+ if (range->hmm->mm == NULL || range->hmm->dead) {
+ hmm_put(range->hmm);
+ return -EFAULT;
}
/* Initialize range to track CPU page table update */
- spin_lock(&hmm->lock);
- range->valid = true;
- list_add_rcu(&range->list, &hmm->ranges);
- spin_unlock(&hmm->lock);
-
- hmm_vma_walk.fault = false;
- hmm_vma_walk.range = range;
- mm_walk.private = &hmm_vma_walk;
-
- mm_walk.vma = vma;
- mm_walk.mm = vma->vm_mm;
- mm_walk.pte_entry = NULL;
- mm_walk.test_walk = NULL;
- mm_walk.hugetlb_entry = NULL;
- mm_walk.pmd_entry = hmm_vma_walk_pmd;
- mm_walk.pte_hole = hmm_vma_walk_hole;
-
- walk_page_range(range->start, range->end, &mm_walk);
+ mutex_lock(&range->hmm->lock);
+
+ list_add_rcu(&range->list, &range->hmm->ranges);
+
+ /*
+	 * If there are any concurrent notifiers we have to wait for them
+	 * before the range can become valid (see hmm_range_wait_until_valid()).
+ */
+ if (!range->hmm->notifiers)
+ range->valid = true;
+ mutex_unlock(&range->hmm->lock);
+
return 0;
}
-EXPORT_SYMBOL(hmm_vma_get_pfns);
+EXPORT_SYMBOL(hmm_range_register);
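A hedged sketch of the new registration API, assuming the hmm_range layout used by this series; the pfns allocation policy and the my_track_range wrapper are illustrative, and a real driver also fills range->flags, range->values, range->default_flags and range->pfn_flags_mask before snapshotting or faulting.

#include <linux/hmm.h>
#include <linux/slab.h>

static int my_track_range(struct hmm_range *range, struct mm_struct *mm,
			  unsigned long start, unsigned long end)
{
	unsigned long npages = (end - start) >> PAGE_SHIFT;
	int ret;

	range->pfns = kcalloc(npages, sizeof(*range->pfns), GFP_KERNEL);
	if (!range->pfns)
		return -ENOMEM;

	/* Takes a reference on the mm's struct hmm and starts tracking. */
	ret = hmm_range_register(range, mm, start, end, PAGE_SHIFT);
	if (ret) {
		kfree(range->pfns);
		return ret;
	}

	/* ... hmm_range_snapshot()/hmm_range_fault() and consume pfns ... */

	hmm_range_unregister(range);
	kfree(range->pfns);
	return 0;
}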
/*
- * hmm_vma_range_done() - stop tracking change to CPU page table over a range
- * @range: range being tracked
- * Returns: false if range data has been invalidated, true otherwise
+ * hmm_range_unregister() - stop tracking changes to CPU page table over a range
+ * @range: range
*
* Range struct is used to track updates to the CPU page table after a call to
- * either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is done
- * using the data, or wants to lock updates to the data it got from those
- * functions, it must call the hmm_vma_range_done() function, which will then
- * stop tracking CPU page table updates.
- *
- * Note that device driver must still implement general CPU page table update
- * tracking either by using hmm_mirror (see hmm_mirror_register()) or by using
- * the mmu_notifier API directly.
- *
- * CPU page table update tracking done through hmm_range is only temporary and
- * to be used while trying to duplicate CPU page table contents for a range of
- * virtual addresses.
- *
- * There are two ways to use this :
- * again:
- * hmm_vma_get_pfns(range); or hmm_vma_fault(...);
- * trans = device_build_page_table_update_transaction(pfns);
- * device_page_table_lock();
- * if (!hmm_vma_range_done(range)) {
- * device_page_table_unlock();
- * goto again;
- * }
- * device_commit_transaction(trans);
- * device_page_table_unlock();
+ * hmm_range_register(). See include/linux/hmm.h for how to use it.
+ */
+void hmm_range_unregister(struct hmm_range *range)
+{
+	/* Sanity check; this really should not happen. */
+ if (range->hmm == NULL || range->end <= range->start)
+ return;
+
+ mutex_lock(&range->hmm->lock);
+ list_del_rcu(&range->list);
+ mutex_unlock(&range->hmm->lock);
+
+ /* Drop reference taken by hmm_range_register() */
+ range->valid = false;
+ hmm_put(range->hmm);
+ range->hmm = NULL;
+}
+EXPORT_SYMBOL(hmm_range_unregister);
+
+/*
+ * hmm_range_snapshot() - snapshot CPU page table for a range
+ * @range: range
+ * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
+ * permission (for instance asking for write and range is read only),
+ *          -EAGAIN if you need to retry, -EFAULT invalid (i.e. either no valid
+ *          vma or it is illegal to access that range); otherwise the number of
+ *          valid pages in range->pfns[] (from range start address).
*
- * Or:
- * hmm_vma_get_pfns(range); or hmm_vma_fault(...);
- * device_page_table_lock();
- * hmm_vma_range_done(range);
- * device_update_page_table(range->pfns);
- * device_page_table_unlock();
+ * This snapshots the CPU page table for a range of virtual addresses. Snapshot
+ * validity is tracked by the range struct. See include/linux/hmm.h for an
+ * example of how to use it.
*/
-bool hmm_vma_range_done(struct hmm_range *range)
+long hmm_range_snapshot(struct hmm_range *range)
{
- unsigned long npages = (range->end - range->start) >> PAGE_SHIFT;
- struct hmm *hmm;
+ const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
+ unsigned long start = range->start, end;
+ struct hmm_vma_walk hmm_vma_walk;
+ struct hmm *hmm = range->hmm;
+ struct vm_area_struct *vma;
+ struct mm_walk mm_walk;
- if (range->end <= range->start) {
- BUG();
- return false;
- }
+	/* Check if hmm_mm_destroy() was called. */
+ if (hmm->mm == NULL || hmm->dead)
+ return -EFAULT;
- hmm = hmm_register(range->vma->vm_mm);
- if (!hmm) {
- memset(range->pfns, 0, sizeof(*range->pfns) * npages);
- return false;
- }
+ do {
+ /* If range is no longer valid force retry. */
+ if (!range->valid)
+ return -EAGAIN;
- spin_lock(&hmm->lock);
- list_del_rcu(&range->list);
- spin_unlock(&hmm->lock);
+ vma = find_vma(hmm->mm, start);
+ if (vma == NULL || (vma->vm_flags & device_vma))
+ return -EFAULT;
+
+ if (is_vm_hugetlb_page(vma)) {
+ struct hstate *h = hstate_vma(vma);
- return range->valid;
+ if (huge_page_shift(h) != range->page_shift &&
+ range->page_shift != PAGE_SHIFT)
+ return -EINVAL;
+ } else {
+ if (range->page_shift != PAGE_SHIFT)
+ return -EINVAL;
+ }
+
+ if (!(vma->vm_flags & VM_READ)) {
+ /*
+			 * If the vma does not allow read access, then assume
+			 * that it does not allow write access either. HMM does
+			 * not support architectures that allow write without read.
+ */
+ hmm_pfns_clear(range, range->pfns,
+ range->start, range->end);
+ return -EPERM;
+ }
+
+ range->vma = vma;
+ hmm_vma_walk.pgmap = NULL;
+ hmm_vma_walk.last = start;
+ hmm_vma_walk.fault = false;
+ hmm_vma_walk.range = range;
+ mm_walk.private = &hmm_vma_walk;
+ end = min(range->end, vma->vm_end);
+
+ mm_walk.vma = vma;
+ mm_walk.mm = vma->vm_mm;
+ mm_walk.pte_entry = NULL;
+ mm_walk.test_walk = NULL;
+ mm_walk.hugetlb_entry = NULL;
+ mm_walk.pud_entry = hmm_vma_walk_pud;
+ mm_walk.pmd_entry = hmm_vma_walk_pmd;
+ mm_walk.pte_hole = hmm_vma_walk_hole;
+ mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
+
+ walk_page_range(start, end, &mm_walk);
+ start = end;
+ } while (start < range->end);
+
+ return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
}
-EXPORT_SYMBOL(hmm_vma_range_done);
+EXPORT_SYMBOL(hmm_range_snapshot);
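A sketch of the read-only snapshot loop implied by the return codes above; the retry policy is illustrative only, and a real driver would typically wait for the range to become valid again (hmm_range_wait_until_valid() in include/linux/hmm.h) rather than spinning.

static long my_snapshot(struct hmm_range *range, struct mm_struct *mm)
{
	long ret;

	do {
		down_read(&mm->mmap_sem);
		ret = hmm_range_snapshot(range);
		up_read(&mm->mmap_sem);
		/* -EAGAIN: a concurrent invalidation raced with us. */
	} while (ret == -EAGAIN);

	return ret;	/* valid entries in range->pfns[], or -errno */
}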
/*
- * hmm_vma_fault() - try to fault some address in a virtual address range
+ * hmm_range_fault() - try to fault some address in a virtual address range
* @range: range being faulted
* @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
- * Returns: 0 success, error otherwise (-EAGAIN means mmap_sem have been drop)
+ * Returns: number of valid pages in range->pfns[] (from range start
+ * address). This may be zero. If the return value is negative,
+ * then one of the following values may be returned:
+ *
+ * -EINVAL: Invalid arguments, or the mm or virtual address is in an
+ *          invalid vma (for instance a device file vma).
+ * -ENOMEM: Out of memory.
+ * -EPERM: Invalid permission (for instance asking for write and
+ *         range is read only).
+ * -EAGAIN: If you need to retry and mmap_sem was dropped. This can
+ *          only happen if the block argument is false.
+ * -EBUSY: If the range is being invalidated and you should wait
+ *         for invalidation to finish.
+ * -EFAULT: Invalid (i.e. either no valid vma or it is illegal to
+ *          access that range).
*
* This is similar to a regular CPU page fault except that it will not trigger
- * any memory migration if the memory being faulted is not accessible by CPUs.
+ * any memory migration if the memory being faulted is not accessible by CPUs
+ * and the caller does not ask for migration.
*
* On error, for one virtual address in the range, the function will mark the
* corresponding HMM pfn entry with an error flag.
- *
- * Expected use pattern:
- * retry:
- * down_read(&mm->mmap_sem);
- * // Find vma and address device wants to fault, initialize hmm_pfn_t
- * // array accordingly
- * ret = hmm_vma_fault(range, write, block);
- * switch (ret) {
- * case -EAGAIN:
- * hmm_vma_range_done(range);
- * // You might want to rate limit or yield to play nicely, you may
- * // also commit any valid pfn in the array assuming that you are
- * // getting true from hmm_vma_range_monitor_end()
- * goto retry;
- * case 0:
- * break;
- * case -ENOMEM:
- * case -EINVAL:
- * case -EPERM:
- * default:
- * // Handle error !
- * up_read(&mm->mmap_sem)
- * return;
- * }
- * // Take device driver lock that serialize device page table update
- * driver_lock_device_page_table_update();
- * hmm_vma_range_done(range);
- * // Commit pfns we got from hmm_vma_fault()
- * driver_unlock_device_page_table_update();
- * up_read(&mm->mmap_sem)
- *
- * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURN SUCCESS (0)
- * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION !
- *
- * YOU HAVE BEEN WARNED !
*/
-int hmm_vma_fault(struct hmm_range *range, bool block)
+long hmm_range_fault(struct hmm_range *range, bool block)
{
- struct vm_area_struct *vma = range->vma;
- unsigned long start = range->start;
+ const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
+ unsigned long start = range->start, end;
struct hmm_vma_walk hmm_vma_walk;
+ struct hmm *hmm = range->hmm;
+ struct vm_area_struct *vma;
struct mm_walk mm_walk;
- struct hmm *hmm;
int ret;
- /* Sanity check, this really should not happen ! */
- if (range->start < vma->vm_start || range->start >= vma->vm_end)
- return -EINVAL;
- if (range->end < vma->vm_start || range->end > vma->vm_end)
- return -EINVAL;
+	/* Check if hmm_mm_destroy() was called. */
+ if (hmm->mm == NULL || hmm->dead)
+ return -EFAULT;
- hmm = hmm_register(vma->vm_mm);
- if (!hmm) {
- hmm_pfns_clear(range, range->pfns, range->start, range->end);
- return -ENOMEM;
- }
- /* Caller must have registered a mirror using hmm_mirror_register() */
- if (!hmm->mmu_notifier.ops)
- return -EINVAL;
+ do {
+ /* If range is no longer valid force retry. */
+ if (!range->valid) {
+ up_read(&hmm->mm->mmap_sem);
+ return -EAGAIN;
+ }
- /* FIXME support hugetlb fs */
- if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
- vma_is_dax(vma)) {
- hmm_pfns_special(range);
- return -EINVAL;
- }
+ vma = find_vma(hmm->mm, start);
+ if (vma == NULL || (vma->vm_flags & device_vma))
+ return -EFAULT;
+
+ if (is_vm_hugetlb_page(vma)) {
+ if (huge_page_shift(hstate_vma(vma)) !=
+ range->page_shift &&
+ range->page_shift != PAGE_SHIFT)
+ return -EINVAL;
+ } else {
+ if (range->page_shift != PAGE_SHIFT)
+ return -EINVAL;
+ }
+
+ if (!(vma->vm_flags & VM_READ)) {
+ /*
+			 * If the vma does not allow read access, then assume
+			 * that it does not allow write access either. HMM does
+			 * not support architectures that allow write without read.
+ */
+ hmm_pfns_clear(range, range->pfns,
+ range->start, range->end);
+ return -EPERM;
+ }
+
+ range->vma = vma;
+ hmm_vma_walk.pgmap = NULL;
+ hmm_vma_walk.last = start;
+ hmm_vma_walk.fault = true;
+ hmm_vma_walk.block = block;
+ hmm_vma_walk.range = range;
+ mm_walk.private = &hmm_vma_walk;
+ end = min(range->end, vma->vm_end);
+
+ mm_walk.vma = vma;
+ mm_walk.mm = vma->vm_mm;
+ mm_walk.pte_entry = NULL;
+ mm_walk.test_walk = NULL;
+ mm_walk.hugetlb_entry = NULL;
+ mm_walk.pud_entry = hmm_vma_walk_pud;
+ mm_walk.pmd_entry = hmm_vma_walk_pmd;
+ mm_walk.pte_hole = hmm_vma_walk_hole;
+ mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
+
+ do {
+ ret = walk_page_range(start, end, &mm_walk);
+ start = hmm_vma_walk.last;
+
+ /* Keep trying while the range is valid. */
+ } while (ret == -EBUSY && range->valid);
+
+ if (ret) {
+ unsigned long i;
+
+ i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
+ hmm_pfns_clear(range, &range->pfns[i],
+ hmm_vma_walk.last, range->end);
+ return ret;
+ }
+ start = end;
+
+ } while (start < range->end);
+
+ return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
+}
+EXPORT_SYMBOL(hmm_range_fault);
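A corresponding sketch for the faulting path; the consumption loop is illustrative, and a real driver would re-check range->valid under its own page-table lock before committing the result to the device.

static long my_fault_range(struct hmm_range *range, struct mm_struct *mm)
{
	unsigned long i, npages = (range->end - range->start) >> PAGE_SHIFT;
	long ret;

	down_read(&mm->mmap_sem);
	ret = hmm_range_fault(range, true /* block */);
	if (ret <= 0)
		goto out;

	for (i = 0; i < npages; i++) {
		struct page *page;

		page = hmm_device_entry_to_page(range, range->pfns[i]);
		if (!page)
			continue;
		/* ... program the device page table with this page ... */
	}
out:
	up_read(&mm->mmap_sem);
	return ret;
}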
+
+/**
+ * hmm_range_dma_map() - hmm_range_fault() and dma map page all in one.
+ * @range: range being faulted
+ * @device: device to dma map pages to
+ * @daddrs: dma address of mapped pages
+ * @block: allow blocking on fault (if true it sleeps and does not drop mmap_sem)
+ * Returns: number of pages mapped on success, -EAGAIN if mmap_sem has been
+ *          dropped and you need to try again, some other error value otherwise
+ *
+ * Note same usage pattern as hmm_range_fault().
+ */
+long hmm_range_dma_map(struct hmm_range *range,
+ struct device *device,
+ dma_addr_t *daddrs,
+ bool block)
+{
+ unsigned long i, npages, mapped;
+ long ret;
+
+ ret = hmm_range_fault(range, block);
+ if (ret <= 0)
+ return ret ? ret : -EBUSY;
+
+ npages = (range->end - range->start) >> PAGE_SHIFT;
+ for (i = 0, mapped = 0; i < npages; ++i) {
+ enum dma_data_direction dir = DMA_TO_DEVICE;
+ struct page *page;
- if (!(vma->vm_flags & VM_READ)) {
/*
- * If vma do not allow read access, then assume that it does
- * not allow write access, either. Architecture that allow
- * write without read access are not supported by HMM, because
- * operations such has atomic access would not work.
+		 * FIXME: need to update the DMA API to provide an invalid DMA
+		 * address value instead of a function to test the dma address
+		 * value. This would remove a lot of duplicated code across
+		 * many architectures.
+		 *
+		 * For now setting it to 0 here is good enough as the pfns[]
+		 * value is what is used to check what is valid and what isn't.
*/
- hmm_pfns_clear(range, range->pfns, range->start, range->end);
- return -EPERM;
+ daddrs[i] = 0;
+
+ page = hmm_device_entry_to_page(range, range->pfns[i]);
+ if (page == NULL)
+ continue;
+
+ /* Check if range is being invalidated */
+ if (!range->valid) {
+ ret = -EBUSY;
+ goto unmap;
+ }
+
+		/* If it is read and write then map bi-directional. */
+ if (range->pfns[i] & range->flags[HMM_PFN_WRITE])
+ dir = DMA_BIDIRECTIONAL;
+
+ daddrs[i] = dma_map_page(device, page, 0, PAGE_SIZE, dir);
+ if (dma_mapping_error(device, daddrs[i])) {
+ ret = -EFAULT;
+ goto unmap;
+ }
+
+ mapped++;
}
- /* Initialize range to track CPU page table update */
- spin_lock(&hmm->lock);
- range->valid = true;
- list_add_rcu(&range->list, &hmm->ranges);
- spin_unlock(&hmm->lock);
-
- hmm_vma_walk.fault = true;
- hmm_vma_walk.block = block;
- hmm_vma_walk.range = range;
- mm_walk.private = &hmm_vma_walk;
- hmm_vma_walk.last = range->start;
-
- mm_walk.vma = vma;
- mm_walk.mm = vma->vm_mm;
- mm_walk.pte_entry = NULL;
- mm_walk.test_walk = NULL;
- mm_walk.hugetlb_entry = NULL;
- mm_walk.pmd_entry = hmm_vma_walk_pmd;
- mm_walk.pte_hole = hmm_vma_walk_hole;
+ return mapped;
- do {
- ret = walk_page_range(start, range->end, &mm_walk);
- start = hmm_vma_walk.last;
- } while (ret == -EAGAIN);
+unmap:
+ for (npages = i, i = 0; (i < npages) && mapped; ++i) {
+ enum dma_data_direction dir = DMA_TO_DEVICE;
+ struct page *page;
- if (ret) {
- unsigned long i;
+ page = hmm_device_entry_to_page(range, range->pfns[i]);
+ if (page == NULL)
+ continue;
+
+ if (dma_mapping_error(device, daddrs[i]))
+ continue;
- i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
- hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last,
- range->end);
- hmm_vma_range_done(range);
+		/* If it is read and write then map bi-directional. */
+ if (range->pfns[i] & range->flags[HMM_PFN_WRITE])
+ dir = DMA_BIDIRECTIONAL;
+
+ dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
+ mapped--;
}
+
return ret;
}
-EXPORT_SYMBOL(hmm_vma_fault);
+EXPORT_SYMBOL(hmm_range_dma_map);
+
+/**
+ * hmm_range_dma_unmap() - unmap a range that was mapped with hmm_range_dma_map()
+ * @range: range being unmapped
+ * @vma: the vma against which the range is mapped (optional)
+ * @device: device against which dma map was done
+ * @daddrs: dma address of mapped pages
+ * @dirty: dirty page if it had the write flag set
+ * Returns: number of pages unmapped on success, -EINVAL otherwise
+ *
+ * Note that the caller MUST abide by the mmu notifier or use an HMM mirror
+ * and abide by the sync_cpu_device_pagetables() callback so that it is safe
+ * here to call set_page_dirty(). The caller must also take appropriate locks
+ * to prevent concurrent mmu notifier or sync_cpu_device_pagetables() calls
+ * from making progress.
+ */
+long hmm_range_dma_unmap(struct hmm_range *range,
+ struct vm_area_struct *vma,
+ struct device *device,
+ dma_addr_t *daddrs,
+ bool dirty)
+{
+ unsigned long i, npages;
+ long cpages = 0;
+
+ /* Sanity check. */
+ if (range->end <= range->start)
+ return -EINVAL;
+ if (!daddrs)
+ return -EINVAL;
+ if (!range->pfns)
+ return -EINVAL;
+
+ npages = (range->end - range->start) >> PAGE_SHIFT;
+ for (i = 0; i < npages; ++i) {
+ enum dma_data_direction dir = DMA_TO_DEVICE;
+ struct page *page;
+
+ page = hmm_device_entry_to_page(range, range->pfns[i]);
+ if (page == NULL)
+ continue;
+
+		/* If it is read and write then map bi-directional. */
+ if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) {
+ dir = DMA_BIDIRECTIONAL;
+
+ /*
+			 * See the comments in the function description on why
+			 * it is safe to call set_page_dirty() here.
+ */
+ if (dirty)
+ set_page_dirty(page);
+ }
+
+ /* Unmap and clear pfns/dma address */
+ dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
+ range->pfns[i] = range->values[HMM_PFN_NONE];
+		/* FIXME see comments in hmm_range_dma_map() */
+ daddrs[i] = 0;
+ cpages++;
+ }
+
+ return cpages;
+}
+EXPORT_SYMBOL(hmm_range_dma_unmap);
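A sketch pairing the two helpers above; the daddrs array mirrors range->pfns in size, the dev pointer is whatever the driver already holds, and the locking shown is only what hmm_range_fault() itself needs (all names are illustrative).

static long my_dma_map_range(struct hmm_range *range, struct device *dev,
			     dma_addr_t *daddrs, struct mm_struct *mm)
{
	long mapped;

	down_read(&mm->mmap_sem);
	mapped = hmm_range_dma_map(range, dev, daddrs, true /* block */);
	up_read(&mm->mmap_sem);
	if (mapped <= 0)
		return mapped;

	/* ... hand daddrs[i] to the device for every valid entry ... */

	return hmm_range_dma_unmap(range, NULL, dev, daddrs,
				   true /* dirty pages that were written */);
}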
#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b6a34b32d8ac..093507983dd2 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1220,8 +1220,8 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
cond_resched();
}
- mmu_notifier_range_init(&range, vma->vm_mm, haddr,
- haddr + HPAGE_PMD_SIZE);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ haddr, haddr + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
@@ -1384,8 +1384,8 @@ alloc:
vma, HPAGE_PMD_NR);
__SetPageUptodate(new_page);
- mmu_notifier_range_init(&range, vma->vm_mm, haddr,
- haddr + HPAGE_PMD_SIZE);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ haddr, haddr + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
spin_lock(vmf->ptl);
@@ -2060,7 +2060,8 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
spinlock_t *ptl;
struct mmu_notifier_range range;
- mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PUD_MASK,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ address & HPAGE_PUD_MASK,
(address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
mmu_notifier_invalidate_range_start(&range);
ptl = pud_lock(vma->vm_mm, pud);
@@ -2278,7 +2279,8 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
spinlock_t *ptl;
struct mmu_notifier_range range;
- mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PMD_MASK,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ address & HPAGE_PMD_MASK,
(address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
ptl = pmd_lock(vma->vm_mm, pmd);
@@ -2492,6 +2494,9 @@ static void __split_huge_page(struct page *page, struct list_head *list,
if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
shmem_uncharge(head->mapping->host, 1);
put_page(head + i);
+ } else if (!PageAnon(page)) {
+ __xa_store(&head->mapping->i_pages, head[i].index,
+ head + i, 0);
}
}
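The mm/huge_memory.c hunks above convert mmu_notifier_range_init() to the extended form that also carries an event type, flags and the vma. A sketch of the updated call pattern; my_clear_range and its arguments are illustrative only.

#include <linux/mm.h>
#include <linux/mmu_notifier.h>

static void my_clear_range(struct vm_area_struct *vma,
			   unsigned long start, unsigned long end)
{
	struct mmu_notifier_range range;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
				start, end);
	mmu_notifier_invalidate_range_start(&range);
	/* ... clear the page table entries for [start, end) ... */
	mmu_notifier_invalidate_range_end(&range);
}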
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 641cedfc8c0f..bf58cee30f65 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -740,7 +740,15 @@ void resv_map_release(struct kref *ref)
static inline struct resv_map *inode_resv_map(struct inode *inode)
{
- return inode->i_mapping->private_data;
+ /*
+ * At inode evict time, i_mapping may not point to the original
+ * address space within the inode. This original address space
+ * contains the pointer to the resv_map. So, always use the
+ * address space embedded within the inode.
+	 * The VERY common case is inode->mapping == &inode->i_data, but
+	 * this may not be true for device special inodes.
+ */
+ return (struct resv_map *)(&inode->i_data)->private_data;
}
static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
@@ -1059,6 +1067,7 @@ static void free_gigantic_page(struct page *page, unsigned int order)
free_contig_range(page_to_pfn(page), 1 << order);
}
+#ifdef CONFIG_CONTIG_ALLOC
static int __alloc_gigantic_page(unsigned long start_pfn,
unsigned long nr_pages, gfp_t gfp_mask)
{
@@ -1143,11 +1152,20 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
static void prep_compound_gigantic_page(struct page *page, unsigned int order);
+#else /* !CONFIG_CONTIG_ALLOC */
+static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nodemask)
+{
+ return NULL;
+}
+#endif /* CONFIG_CONTIG_ALLOC */
#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
-static inline bool gigantic_page_supported(void) { return false; }
static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
- int nid, nodemask_t *nodemask) { return NULL; }
+ int nid, nodemask_t *nodemask)
+{
+ return NULL;
+}
static inline void free_gigantic_page(struct page *page, unsigned int order) { }
static inline void destroy_compound_gigantic_page(struct page *page,
unsigned int order) { }
@@ -1157,7 +1175,7 @@ static void update_and_free_page(struct hstate *h, struct page *page)
{
int i;
- if (hstate_is_gigantic(h) && !gigantic_page_supported())
+ if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
return;
h->nr_huge_pages--;
@@ -1258,12 +1276,23 @@ void free_huge_page(struct page *page)
ClearPagePrivate(page);
/*
- * A return code of zero implies that the subpool will be under its
- * minimum size if the reservation is not restored after page is free.
- * Therefore, force restore_reserve operation.
+ * If PagePrivate() was set on page, page allocation consumed a
+ * reservation. If the page was associated with a subpool, there
+ * would have been a page reserved in the subpool before allocation
+ * via hugepage_subpool_get_pages(). Since we are 'restoring' the
+	 * reservation, do not call hugepage_subpool_put_pages() as this will
+ * remove the reserved page from the subpool.
*/
- if (hugepage_subpool_put_pages(spool, 1) == 0)
- restore_reserve = true;
+ if (!restore_reserve) {
+ /*
+ * A return code of zero implies that the subpool will be
+ * under its minimum size if the reservation is not restored
+ * after page is free. Therefore, force restore_reserve
+ * operation.
+ */
+ if (hugepage_subpool_put_pages(spool, 1) == 0)
+ restore_reserve = true;
+ }
spin_lock(&hugetlb_lock);
clear_page_huge_active(page);
@@ -2277,13 +2306,47 @@ found:
}
#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
-static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
- nodemask_t *nodes_allowed)
+static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
+ nodemask_t *nodes_allowed)
{
unsigned long min_count, ret;
- if (hstate_is_gigantic(h) && !gigantic_page_supported())
- return h->max_huge_pages;
+ spin_lock(&hugetlb_lock);
+
+ /*
+ * Check for a node specific request.
+ * Changing node specific huge page count may require a corresponding
+ * change to the global count. In any case, the passed node mask
+ * (nodes_allowed) will restrict alloc/free to the specified node.
+ */
+ if (nid != NUMA_NO_NODE) {
+ unsigned long old_count = count;
+
+ count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
+ /*
+ * User may have specified a large count value which caused the
+ * above calculation to overflow. In this case, they wanted
+ * to allocate as many huge pages as possible. Set count to
+ * largest possible value to align with their intention.
+ */
+ if (count < old_count)
+ count = ULONG_MAX;
+ }
+
+ /*
+	 * Gigantic page runtime allocation depends on the capability for large
+ * page range allocation.
+ * If the system does not provide this feature, return an error when
+ * the user tries to allocate gigantic pages but let the user free the
+ * boottime allocated gigantic pages.
+ */
+ if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
+ if (count > persistent_huge_pages(h)) {
+ spin_unlock(&hugetlb_lock);
+ return -EINVAL;
+ }
+ /* Fall through to decrease pool */
+ }
/*
* Increase the pool size
@@ -2296,7 +2359,6 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
* pool might be one hugepage larger than it needs to be, but
* within all the constraints specified by the sysctls.
*/
- spin_lock(&hugetlb_lock);
while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
if (!adjust_pool_surplus(h, nodes_allowed, -1))
break;
@@ -2351,9 +2413,10 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
break;
}
out:
- ret = persistent_huge_pages(h);
+ h->max_huge_pages = persistent_huge_pages(h);
spin_unlock(&hugetlb_lock);
- return ret;
+
+ return 0;
}
#define HSTATE_ATTR_RO(_name) \
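The per-node adjustment above rebases a node-local request onto the global count and relies on unsigned wrap-around to detect an oversized request. A self-contained sketch of that clamping arithmetic, with illustrative values that are not taken from the kernel:

#include <limits.h>
#include <stdio.h>

/* Sketch of the clamp in set_max_huge_pages(): a per-node request is
 * rebased to a global count; if the addition wraps, the caller asked
 * for "as many pages as possible", so clamp to ULONG_MAX. */
static unsigned long rebase_node_request(unsigned long count,
					 unsigned long global_pages,
					 unsigned long node_pages)
{
	unsigned long old_count = count;

	count += global_pages - node_pages;
	if (count < old_count)		/* overflowed */
		count = ULONG_MAX;
	return count;
}

int main(void)
{
	/* The addition wraps, so the request is clamped to the maximum. */
	printf("%lu\n", rebase_node_request(ULONG_MAX - 10, 100, 20));
	return 0;
}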
@@ -2403,41 +2466,32 @@ static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
unsigned long count, size_t len)
{
int err;
- NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
+ nodemask_t nodes_allowed, *n_mask;
- if (hstate_is_gigantic(h) && !gigantic_page_supported()) {
- err = -EINVAL;
- goto out;
- }
+ if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
+ return -EINVAL;
if (nid == NUMA_NO_NODE) {
/*
* global hstate attribute
*/
if (!(obey_mempolicy &&
- init_nodemask_of_mempolicy(nodes_allowed))) {
- NODEMASK_FREE(nodes_allowed);
- nodes_allowed = &node_states[N_MEMORY];
- }
- } else if (nodes_allowed) {
+ init_nodemask_of_mempolicy(&nodes_allowed)))
+ n_mask = &node_states[N_MEMORY];
+ else
+ n_mask = &nodes_allowed;
+ } else {
/*
- * per node hstate attribute: adjust count to global,
- * but restrict alloc/free to the specified node.
+ * Node specific request. count adjustment happens in
+ * set_max_huge_pages() after acquiring hugetlb_lock.
*/
- count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
- init_nodemask_of_node(nodes_allowed, nid);
- } else
- nodes_allowed = &node_states[N_MEMORY];
-
- h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
+ init_nodemask_of_node(&nodes_allowed, nid);
+ n_mask = &nodes_allowed;
+ }
- if (nodes_allowed != &node_states[N_MEMORY])
- NODEMASK_FREE(nodes_allowed);
+ err = set_max_huge_pages(h, count, nid, n_mask);
- return len;
-out:
- NODEMASK_FREE(nodes_allowed);
- return err;
+ return err ? err : len;
}
static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
@@ -3247,7 +3301,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
if (cow) {
- mmu_notifier_range_init(&range, src, vma->vm_start,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
+ vma->vm_start,
vma->vm_end);
mmu_notifier_invalidate_range_start(&range);
}
@@ -3359,7 +3414,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
/*
* If sharing possible, alert mmu notifiers of worst case.
*/
- mmu_notifier_range_init(&range, mm, start, end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
+ end);
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
mmu_notifier_invalidate_range_start(&range);
address = start;
@@ -3626,7 +3682,8 @@ retry_avoidcopy:
pages_per_huge_page(h));
__SetPageUptodate(new_page);
- mmu_notifier_range_init(&range, mm, haddr, haddr + huge_page_size(h));
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr,
+ haddr + huge_page_size(h));
mmu_notifier_invalidate_range_start(&range);
/*
@@ -3777,8 +3834,7 @@ retry:
* handling userfault. Reacquire after handling
* fault to make calling code simpler.
*/
- hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
- idx, haddr);
+ hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
ret = handle_userfault(&vmf, VM_UFFD_MISSING);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -3886,21 +3942,14 @@ backout_unlocked:
}
#ifdef CONFIG_SMP
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
- struct vm_area_struct *vma,
- struct address_space *mapping,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
pgoff_t idx, unsigned long address)
{
unsigned long key[2];
u32 hash;
- if (vma->vm_flags & VM_SHARED) {
- key[0] = (unsigned long) mapping;
- key[1] = idx;
- } else {
- key[0] = (unsigned long) mm;
- key[1] = address >> huge_page_shift(h);
- }
+ key[0] = (unsigned long) mapping;
+ key[1] = idx;
hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0);
@@ -3911,9 +3960,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
 * For uniprocessor systems we always use a single mutex, so just
* return 0 and avoid the hashing overhead.
*/
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
- struct vm_area_struct *vma,
- struct address_space *mapping,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
pgoff_t idx, unsigned long address)
{
return 0;
@@ -3958,7 +4005,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* get spurious allocation failures if two CPUs race to instantiate
* the same page in the page cache.
*/
- hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr);
+ hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
entry = huge_ptep_get(ptep);
@@ -4371,7 +4418,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
* start/end. Set range.start/range.end to cover the maximum possible
* range if PMD sharing is possible.
*/
- mmu_notifier_range_init(&range, mm, start, end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA,
+ 0, vma, mm, start, end);
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
BUG_ON(address >= end);
@@ -4477,6 +4525,11 @@ int hugetlb_reserve_pages(struct inode *inode,
* called to make the mapping read-write. Assume !vma is a shm mapping
*/
if (!vma || vma->vm_flags & VM_MAYSHARE) {
+ /*
+	 * resv_map cannot be NULL as hugetlb_reserve_pages is only
+ * called for inodes for which resv_maps were created (see
+ * hugetlbfs_get_inode).
+ */
resv_map = inode_resv_map(inode);
chg = region_chg(resv_map, from, to);
@@ -4568,6 +4621,10 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
struct hugepage_subpool *spool = subpool_inode(inode);
long gbl_reserve;
+ /*
+ * Since this routine can be called in the evict inode path for all
+ * hugetlbfs inodes, resv_map could be NULL.
+ */
if (resv_map) {
chg = region_del(resv_map, start, end);
/*
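With the key reduced to (mapping, index), every faulter on the same file offset hashes to the same mutex, regardless of which mm the fault comes from. The caller-side pattern, mirroring the hugetlb_no_page()/hugetlb_fault() hunks above (local variable names are illustrative):

	u32 hash;

	/* Serialize faulters racing to instantiate the same page. */
	hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);
	mutex_lock(&hugetlb_fault_mutex_table[hash]);

	/* ... look up or allocate the page for (mapping, idx) ... */

	mutex_unlock(&hugetlb_fault_mutex_table[hash]);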
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 449044378782..a335f7c1fac4 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1016,7 +1016,8 @@ static void collapse_huge_page(struct mm_struct *mm,
pte = pte_offset_map(pmd, address);
pte_ptl = pte_lockptr(mm, pmd);
- mmu_notifier_range_init(&range, mm, address, address + HPAGE_PMD_SIZE);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
+ address, address + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
/*
@@ -1374,7 +1375,7 @@ static void collapse_shmem(struct mm_struct *mm,
result = SCAN_FAIL;
goto xa_locked;
}
- xas_store(&xas, new_page + (index % HPAGE_PMD_NR));
+ xas_store(&xas, new_page);
nr_none++;
continue;
}
@@ -1450,7 +1451,7 @@ static void collapse_shmem(struct mm_struct *mm,
list_add_tail(&page->lru, &pagelist);
/* Finally, replace with the new page. */
- xas_store(&xas, new_page + (index % HPAGE_PMD_NR));
+ xas_store(&xas, new_page);
continue;
out_unlock:
unlock_page(page);
diff --git a/mm/ksm.c b/mm/ksm.c
index fc64874dc6f4..81c20ed57bf6 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1066,7 +1066,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
BUG_ON(PageTransCompound(page));
- mmu_notifier_range_init(&range, mm, pvmw.address,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+ pvmw.address,
pvmw.address + PAGE_SIZE);
mmu_notifier_invalidate_range_start(&range);
@@ -1154,7 +1155,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
if (!pmd)
goto out;
- mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
+ addr + PAGE_SIZE);
mmu_notifier_invalidate_range_start(&range);
ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
diff --git a/mm/madvise.c b/mm/madvise.c
index bb3a4554d5d5..628022e674a7 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -472,7 +472,8 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
range.end = min(vma->vm_end, end_addr);
if (range.end <= vma->vm_start)
return -EINVAL;
- mmu_notifier_range_init(&range, mm, range.start, range.end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+ range.start, range.end);
lru_add_drain();
tlb_gather_mmu(&tlb, mm, range.start, range.end);
diff --git a/mm/memblock.c b/mm/memblock.c
index a48f520c2d01..6bbad46f4d2c 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -94,7 +94,7 @@
* :c:func:`mem_init` function frees all the memory to the buddy page
* allocator.
*
- * If an architecure enables %CONFIG_ARCH_DISCARD_MEMBLOCK, the
+ * Unless an architecture enables %CONFIG_ARCH_KEEP_MEMBLOCK, the
* memblock data structures will be discarded after the system
 * initialization completes.
*/
@@ -375,7 +375,7 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
}
}
-#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+#ifndef CONFIG_ARCH_KEEP_MEMBLOCK
/**
* memblock_discard - discard memory and reserved arrays if they were allocated
*/
@@ -1255,6 +1255,70 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
return 0;
}
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+/**
+ * __next_mem_pfn_range_in_zone - iterator for for_each_*_range_in_zone()
+ *
+ * @idx: pointer to u64 loop variable
+ * @zone: zone in which all of the memory blocks reside
+ * @out_spfn: ptr to ulong for start pfn of the range, can be %NULL
+ * @out_epfn: ptr to ulong for end pfn of the range, can be %NULL
+ *
+ * This function is meant to be a zone/pfn specific wrapper for the
+ * for_each_mem_range type iterators. Specifically they are used in the
+ * deferred memory init routines and as such we were duplicating much of
+ * this logic throughout the code. So instead of having it in multiple
+ * locations it seemed like it would make more sense to centralize this to
+ * one new iterator that does everything they need.
+ */
+void __init_memblock
+__next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
+ unsigned long *out_spfn, unsigned long *out_epfn)
+{
+ int zone_nid = zone_to_nid(zone);
+ phys_addr_t spa, epa;
+ int nid;
+
+ __next_mem_range(idx, zone_nid, MEMBLOCK_NONE,
+ &memblock.memory, &memblock.reserved,
+ &spa, &epa, &nid);
+
+ while (*idx != U64_MAX) {
+ unsigned long epfn = PFN_DOWN(epa);
+ unsigned long spfn = PFN_UP(spa);
+
+ /*
+ * Verify the end is at least past the start of the zone and
+ * that we have at least one PFN to initialize.
+ */
+ if (zone->zone_start_pfn < epfn && spfn < epfn) {
+ /* if we went too far just stop searching */
+ if (zone_end_pfn(zone) <= spfn) {
+ *idx = U64_MAX;
+ break;
+ }
+
+ if (out_spfn)
+ *out_spfn = max(zone->zone_start_pfn, spfn);
+ if (out_epfn)
+ *out_epfn = min(zone_end_pfn(zone), epfn);
+
+ return;
+ }
+
+ __next_mem_range(idx, zone_nid, MEMBLOCK_NONE,
+ &memblock.memory, &memblock.reserved,
+ &spa, &epa, &nid);
+ }
+
+ /* signal end of iteration */
+ if (out_spfn)
+ *out_spfn = ULONG_MAX;
+ if (out_epfn)
+ *out_epfn = 0;
+}
+
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
/**
* memblock_alloc_range_nid - allocate boot memory block
@@ -1923,7 +1987,7 @@ unsigned long __init memblock_free_all(void)
return pages;
}
-#if defined(CONFIG_DEBUG_FS) && !defined(CONFIG_ARCH_DISCARD_MEMBLOCK)
+#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_ARCH_KEEP_MEMBLOCK)
static int memblock_debug_show(struct seq_file *m, void *private)
{
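__next_mem_pfn_range_in_zone() is meant to be driven by a for_each-style wrapper in memblock.h; a hedged sketch of what such a wrapper could look like (the exact macro body is an assumption, only the iterator function and its U64_MAX end marker come from the hunk above):

/* Sketch only: iterate free PFN ranges of @zone until *idx hits U64_MAX. */
#define for_each_free_mem_pfn_range_in_zone(i, zone, p_start, p_end)	\
	for (i = 0,							\
	     __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end);	\
	     i != U64_MAX;						\
	     __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end))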
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 81a0d3914ec9..2bfe21f3e5ec 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -725,34 +725,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
__this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages);
}
-unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
- int nid, unsigned int lru_mask)
-{
- struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
- unsigned long nr = 0;
- enum lru_list lru;
-
- VM_BUG_ON((unsigned)nid >= nr_node_ids);
-
- for_each_lru(lru) {
- if (!(BIT(lru) & lru_mask))
- continue;
- nr += mem_cgroup_get_lru_size(lruvec, lru);
- }
- return nr;
-}
-
-static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
- unsigned int lru_mask)
-{
- unsigned long nr = 0;
- int nid;
-
- for_each_node_state(nid, N_MEMORY)
- nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
- return nr;
-}
-
static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
enum mem_cgroup_events_target target)
{
@@ -1358,7 +1330,7 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
for (i = 0; i < NR_LRU_LISTS; i++)
pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
- K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
+ K(memcg_page_state(iter, NR_LRU_BASE + i)));
pr_cont("\n");
}
@@ -1384,6 +1356,11 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
return max;
}
+unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
+{
+ return page_counter_read(&memcg->memory);
+}
+
static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
int order)
{
@@ -1422,11 +1399,15 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
int nid, bool noswap)
{
- if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
+ struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
+
+ if (lruvec_page_state(lruvec, NR_INACTIVE_FILE) ||
+ lruvec_page_state(lruvec, NR_ACTIVE_FILE))
return true;
if (noswap || !total_swap_pages)
return false;
- if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
+ if (lruvec_page_state(lruvec, NR_INACTIVE_ANON) ||
+ lruvec_page_state(lruvec, NR_ACTIVE_ANON))
return true;
return false;
@@ -2168,14 +2149,17 @@ static void high_work_func(struct work_struct *work)
void mem_cgroup_handle_over_high(void)
{
unsigned int nr_pages = current->memcg_nr_pages_over_high;
- struct mem_cgroup *memcg;
+ struct mem_cgroup *memcg = current->memcg_high_reclaim;
if (likely(!nr_pages))
return;
- memcg = get_mem_cgroup_from_mm(current->mm);
+ if (!memcg)
+ memcg = get_mem_cgroup_from_mm(current->mm);
+
reclaim_high(memcg, nr_pages, GFP_KERNEL);
css_put(&memcg->css);
+ current->memcg_high_reclaim = NULL;
current->memcg_nr_pages_over_high = 0;
}
@@ -2327,10 +2311,10 @@ done_restock:
* If the hierarchy is above the normal consumption range, schedule
* reclaim on returning to userland. We can perform reclaim here
* if __GFP_RECLAIM but let's always punt for simplicity and so that
- * GFP_KERNEL can consistently be used during reclaim. @memcg is
- * not recorded as it most likely matches current's and won't
- * change in the meantime. As high limit is checked again before
- * reclaim, the cost of mismatch is negligible.
+ * GFP_KERNEL can consistently be used during reclaim. Record the memcg
+ * for the return-to-userland high reclaim. If the memcg is already
+	 * recorded and the recorded memcg is not a descendant of the memcg
+ * needing high reclaim, punt the high reclaim to the work queue.
*/
do {
if (page_counter_read(&memcg->memory) > memcg->high) {
@@ -2338,6 +2322,13 @@ done_restock:
if (in_interrupt()) {
schedule_work(&memcg->high_work);
break;
+ } else if (!current->memcg_high_reclaim) {
+ css_get(&memcg->css);
+ current->memcg_high_reclaim = memcg;
+ } else if (!mem_cgroup_is_descendant(
+ current->memcg_high_reclaim, memcg)) {
+ schedule_work(&memcg->high_work);
+ break;
}
current->memcg_nr_pages_over_high += batch;
set_notify_resume(current);
@@ -2990,8 +2981,8 @@ static void accumulate_memcg_tree(struct mem_cgroup *memcg,
acc->events_array ? acc->events_array[i] : i);
for (i = 0; i < NR_LRU_LISTS; i++)
- acc->lru_pages[i] +=
- mem_cgroup_nr_lru_pages(mi, BIT(i));
+ acc->lru_pages[i] += memcg_page_state(mi,
+ NR_LRU_BASE + i);
}
}
@@ -3331,6 +3322,42 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
#endif
#ifdef CONFIG_NUMA
+
+#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
+#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
+#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
+
+static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
+ int nid, unsigned int lru_mask)
+{
+ struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
+ unsigned long nr = 0;
+ enum lru_list lru;
+
+ VM_BUG_ON((unsigned)nid >= nr_node_ids);
+
+ for_each_lru(lru) {
+ if (!(BIT(lru) & lru_mask))
+ continue;
+ nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
+ }
+ return nr;
+}
+
+static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
+ unsigned int lru_mask)
+{
+ unsigned long nr = 0;
+ enum lru_list lru;
+
+ for_each_lru(lru) {
+ if (!(BIT(lru) & lru_mask))
+ continue;
+ nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
+ }
+ return nr;
+}
+
static int memcg_numa_stat_show(struct seq_file *m, void *v)
{
struct numa_stat {
@@ -3421,7 +3448,8 @@ static int memcg_stat_show(struct seq_file *m, void *v)
for (i = 0; i < NR_LRU_LISTS; i++)
seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
- mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
+ memcg_page_state(memcg, NR_LRU_BASE + i) *
+ PAGE_SIZE);
/* Hierarchical information */
memory = memsw = PAGE_COUNTER_MAX;
@@ -3927,8 +3955,8 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
/* this should eventually include NR_UNSTABLE_NFS */
*pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
- *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
- (1 << LRU_ACTIVE_FILE));
+ *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
+ memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
*pheadroom = PAGE_COUNTER_MAX;
while ((parent = parent_mem_cgroup(memcg))) {
diff --git a/mm/memfd.c b/mm/memfd.c
index 650e65a46b9c..2647c898990c 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -39,6 +39,7 @@ static void memfd_tag_pins(struct xa_state *xas)
xas_for_each(xas, page, ULONG_MAX) {
if (xa_is_value(page))
continue;
+ page = find_subpage(page, xas->xa_index);
if (page_count(page) - page_mapcount(page) > 1)
xas_set_mark(xas, MEMFD_TAG_PINNED);
@@ -88,6 +89,7 @@ static int memfd_wait_for_pins(struct address_space *mapping)
bool clear = true;
if (xa_is_value(page))
continue;
+ page = find_subpage(page, xas.xa_index);
if (page_count(page) - page_mapcount(page) != 1) {
/*
* On the last scan, we clean up all those tags
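find_subpage() maps the compound head entry now stored in the XArray back to the tail page covering a given index. A hedged sketch of such a helper (the real definition lives elsewhere in this series and may differ; the compound_order()-based indexing is an assumption):

/* Sketch: resolve (head page, file index) to the subpage for that index. */
static inline struct page *find_subpage(struct page *head, pgoff_t index)
{
	if (PageHuge(head))
		return head;	/* hugetlbfs keeps one entry per huge page */

	return head + (index & ((1UL << compound_order(head)) - 1));
}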
diff --git a/mm/memory.c b/mm/memory.c
index f7d962d7de19..0d0711a912de 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1010,7 +1010,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
is_cow = is_cow_mapping(vma->vm_flags);
if (is_cow) {
- mmu_notifier_range_init(&range, src_mm, addr, end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
+ 0, vma, src_mm, addr, end);
mmu_notifier_invalidate_range_start(&range);
}
@@ -1334,7 +1335,8 @@ void unmap_vmas(struct mmu_gather *tlb,
{
struct mmu_notifier_range range;
- mmu_notifier_range_init(&range, vma->vm_mm, start_addr, end_addr);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
+ start_addr, end_addr);
mmu_notifier_invalidate_range_start(&range);
for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
@@ -1356,7 +1358,8 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
struct mmu_gather tlb;
lru_add_drain();
- mmu_notifier_range_init(&range, vma->vm_mm, start, start + size);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ start, start + size);
tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end);
update_hiwater_rss(vma->vm_mm);
mmu_notifier_invalidate_range_start(&range);
@@ -1382,7 +1385,8 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
struct mmu_gather tlb;
lru_add_drain();
- mmu_notifier_range_init(&range, vma->vm_mm, address, address + size);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ address, address + size);
tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end);
update_hiwater_rss(vma->vm_mm);
mmu_notifier_invalidate_range_start(&range);
@@ -1523,6 +1527,87 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
}
EXPORT_SYMBOL(vm_insert_page);
+/*
+ * __vm_map_pages - maps a range of kernel pages into a user vma
+ * @vma: user vma to map to
+ * @pages: pointer to array of source kernel pages
+ * @num: number of pages in page array
+ * @offset: user's requested vm_pgoff
+ *
+ * This allows drivers to map a range of kernel pages into a user vma.
+ *
+ * Return: 0 on success and error code otherwise.
+ */
+static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
+ unsigned long num, unsigned long offset)
+{
+ unsigned long count = vma_pages(vma);
+ unsigned long uaddr = vma->vm_start;
+ int ret, i;
+
+ /* Fail if the user requested offset is beyond the end of the object */
+ if (offset > num)
+ return -ENXIO;
+
+ /* Fail if the user requested size exceeds available object size */
+ if (count > num - offset)
+ return -ENXIO;
+
+ for (i = 0; i < count; i++) {
+ ret = vm_insert_page(vma, uaddr, pages[offset + i]);
+ if (ret < 0)
+ return ret;
+ uaddr += PAGE_SIZE;
+ }
+
+ return 0;
+}
+
+/**
+ * vm_map_pages - map a range of kernel pages starting at a non-zero offset
+ * @vma: user vma to map to
+ * @pages: pointer to array of source kernel pages
+ * @num: number of pages in page array
+ *
+ * Maps an object consisting of @num pages, catering for the user's
+ * requested vm_pgoff
+ *
+ * If we fail to insert any page into the vma, the function will return
+ * immediately leaving any previously inserted pages present. Callers
+ * from the mmap handler may immediately return the error as their caller
+ * will destroy the vma, removing any successfully inserted pages. Other
+ * callers should make their own arrangements for calling unmap_region().
+ *
+ * Context: Process context. Called by mmap handlers.
+ * Return: 0 on success and error code otherwise.
+ */
+int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
+ unsigned long num)
+{
+ return __vm_map_pages(vma, pages, num, vma->vm_pgoff);
+}
+EXPORT_SYMBOL(vm_map_pages);
+
+/**
+ * vm_map_pages_zero - map a range of kernel pages starting at offset zero
+ * @vma: user vma to map to
+ * @pages: pointer to array of source kernel pages
+ * @num: number of pages in page array
+ *
+ * Similar to vm_map_pages(), except that it explicitly sets the offset
+ * to 0. This function is intended for drivers that did not consider
+ * vm_pgoff.
+ *
+ * Context: Process context. Called by mmap handlers.
+ * Return: 0 on success and error code otherwise.
+ */
+int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
+ unsigned long num)
+{
+ return __vm_map_pages(vma, pages, num, 0);
+}
+EXPORT_SYMBOL(vm_map_pages_zero);
+
static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn, pgprot_t prot, bool mkwrite)
{
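A hypothetical driver ->mmap() handler built on the new helper; struct my_buf and its fields are assumptions, only vm_map_pages() itself is introduced above:

static int my_drv_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct my_buf *buf = file->private_data;	/* hypothetical buffer */

	/* vm_pgoff is honoured, so mapping a subset of the buffer at a
	 * non-zero offset works without extra bookkeeping in the driver. */
	return vm_map_pages(vma, buf->pages, buf->page_count);
}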
@@ -2279,7 +2364,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
__SetPageUptodate(new_page);
- mmu_notifier_range_init(&range, mm, vmf->address & PAGE_MASK,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+ vmf->address & PAGE_MASK,
(vmf->address & PAGE_MASK) + PAGE_SIZE);
mmu_notifier_invalidate_range_start(&range);
@@ -2830,7 +2916,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
flush_icache_page(vma, page);
if (pte_swp_soft_dirty(vmf->orig_pte))
pte = pte_mksoft_dirty(pte);
- set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
vmf->orig_pte = pte;
@@ -2844,6 +2929,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
mem_cgroup_commit_charge(page, memcg, true, false);
activate_page(page);
}
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
swap_free(entry);
if (mem_cgroup_swap_full(page) ||
@@ -4104,8 +4190,9 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
goto out;
if (range) {
- mmu_notifier_range_init(range, mm, address & PMD_MASK,
- (address & PMD_MASK) + PMD_SIZE);
+ mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0,
+ NULL, mm, address & PMD_MASK,
+ (address & PMD_MASK) + PMD_SIZE);
mmu_notifier_invalidate_range_start(range);
}
*ptlp = pmd_lock(mm, pmd);
@@ -4122,8 +4209,9 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
goto out;
if (range) {
- mmu_notifier_range_init(range, mm, address & PAGE_MASK,
- (address & PAGE_MASK) + PAGE_SIZE);
+ mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
+ address & PAGE_MASK,
+ (address & PAGE_MASK) + PAGE_SIZE);
mmu_notifier_invalidate_range_start(range);
}
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b236069ff0d8..328878b6799d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -39,6 +39,7 @@
#include <asm/tlbflush.h>
#include "internal.h"
+#include "shuffle.h"
/*
* online_page_callback contains pointer to current page onlining function.
@@ -273,12 +274,12 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
* add the new pages.
*/
int __ref __add_pages(int nid, unsigned long phys_start_pfn,
- unsigned long nr_pages, struct vmem_altmap *altmap,
- bool want_memblock)
+ unsigned long nr_pages, struct mhp_restrictions *restrictions)
{
unsigned long i;
int err = 0;
int start_sec, end_sec;
+ struct vmem_altmap *altmap = restrictions->altmap;
/* during initialize mem_map, align hot-added range to section */
start_sec = pfn_to_section_nr(phys_start_pfn);
@@ -299,7 +300,7 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn,
for (i = start_sec; i <= end_sec; i++) {
err = __add_section(nid, section_nr_to_pfn(i), altmap,
- want_memblock);
+ restrictions->flags & MHP_MEMBLOCK_API);
/*
* EEXIST is finally dealt with by ioresource collision
@@ -516,26 +517,23 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn)
pgdat_resize_unlock(zone->zone_pgdat, &flags);
}
-static int __remove_section(struct zone *zone, struct mem_section *ms,
- unsigned long map_offset, struct vmem_altmap *altmap)
+static void __remove_section(struct zone *zone, struct mem_section *ms,
+ unsigned long map_offset,
+ struct vmem_altmap *altmap)
{
unsigned long start_pfn;
int scn_nr;
- int ret = -EINVAL;
- if (!valid_section(ms))
- return ret;
+ if (WARN_ON_ONCE(!valid_section(ms)))
+ return;
- ret = unregister_memory_section(ms);
- if (ret)
- return ret;
+ unregister_memory_section(ms);
scn_nr = __section_nr(ms);
start_pfn = section_nr_to_pfn((unsigned long)scn_nr);
__remove_zone(zone, start_pfn);
sparse_remove_one_section(zone, ms, map_offset, altmap);
- return 0;
}
/**
@@ -550,31 +548,17 @@ static int __remove_section(struct zone *zone, struct mem_section *ms,
 * sure that pages are marked reserved and zones are adjusted properly by
* calling offline_pages().
*/
-int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
- unsigned long nr_pages, struct vmem_altmap *altmap)
+void __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
+ unsigned long nr_pages, struct vmem_altmap *altmap)
{
unsigned long i;
unsigned long map_offset = 0;
- int sections_to_remove, ret = 0;
+ int sections_to_remove;
/* In the ZONE_DEVICE case device driver owns the memory region */
if (is_dev_zone(zone)) {
if (altmap)
map_offset = vmem_altmap_offset(altmap);
- } else {
- resource_size_t start, size;
-
- start = phys_start_pfn << PAGE_SHIFT;
- size = nr_pages * PAGE_SIZE;
-
- ret = release_mem_region_adjustable(&iomem_resource, start,
- size);
- if (ret) {
- resource_size_t endres = start + size - 1;
-
- pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
- &start, &endres, ret);
- }
}
clear_zone_contiguous(zone);
@@ -590,16 +574,12 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
cond_resched();
- ret = __remove_section(zone, __pfn_to_section(pfn), map_offset,
- altmap);
+ __remove_section(zone, __pfn_to_section(pfn), map_offset,
+ altmap);
map_offset = 0;
- if (ret)
- break;
}
set_zone_contiguous(zone);
-
- return ret;
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
@@ -714,7 +694,7 @@ static void node_states_check_changes_online(unsigned long nr_pages,
if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
arg->status_change_nid_normal = nid;
#ifdef CONFIG_HIGHMEM
- if (zone_idx(zone) <= N_HIGH_MEMORY && !node_state(nid, N_HIGH_MEMORY))
+ if (zone_idx(zone) <= ZONE_HIGHMEM && !node_state(nid, N_HIGH_MEMORY))
arg->status_change_nid_high = nid;
#endif
}
@@ -912,6 +892,8 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
zone->zone_pgdat->node_present_pages += onlined_pages;
pgdat_resize_unlock(zone->zone_pgdat, &flags);
+ shuffle_zone(zone);
+
if (onlined_pages) {
node_states_set_node(nid, &arg);
if (need_zonelists_rebuild)
@@ -1097,6 +1079,9 @@ static int online_memory_block(struct memory_block *mem, void *arg)
*/
int __ref add_memory_resource(int nid, struct resource *res)
{
+ struct mhp_restrictions restrictions = {
+ .flags = MHP_MEMBLOCK_API,
+ };
u64 start, size;
bool new_node = false;
int ret;
@@ -1124,7 +1109,7 @@ int __ref add_memory_resource(int nid, struct resource *res)
new_node = ret;
/* call arch's memory hotadd */
- ret = arch_add_memory(nid, start, size, NULL, true);
+ ret = arch_add_memory(nid, start, size, &restrictions);
if (ret < 0)
goto error;
@@ -1341,8 +1326,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
if (!PageHuge(page))
continue;
head = compound_head(page);
- if (hugepage_migration_supported(page_hstate(head)) &&
- page_huge_active(head))
+ if (page_huge_active(head))
return pfn;
skip = (1 << compound_order(head)) - (page - head);
pfn += skip - 1;
@@ -1382,10 +1366,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
if (PageHuge(page)) {
struct page *head = compound_head(page);
- if (compound_order(head) > PFN_SECTION_SHIFT) {
- ret = -EBUSY;
- break;
- }
pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
isolate_huge_page(head, &source);
continue;
@@ -1454,15 +1434,10 @@ static int
offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
void *data)
{
- __offline_isolated_pages(start, start + nr_pages);
- return 0;
-}
+ unsigned long *offlined_pages = (unsigned long *)data;
-static void
-offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
-{
- walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
- offline_isolated_pages_cb);
+ *offlined_pages += __offline_isolated_pages(start, start + nr_pages);
+ return 0;
}
/*
@@ -1472,26 +1447,7 @@ static int
check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
void *data)
{
- int ret;
- long offlined = *(long *)data;
- ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
- offlined = nr_pages;
- if (!ret)
- *(long *)data += offlined;
- return ret;
-}
-
-static long
-check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
-{
- long offlined = 0;
- int ret;
-
- ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
- check_pages_isolated_cb);
- if (ret < 0)
- offlined = (long)ret;
- return offlined;
+ return test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
}
static int __init cmdline_parse_movable_node(char *p)
@@ -1576,7 +1532,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
unsigned long end_pfn)
{
unsigned long pfn, nr_pages;
- long offlined_pages;
+ unsigned long offlined_pages = 0;
int ret, node, nr_isolate_pageblock;
unsigned long flags;
unsigned long valid_start, valid_end;
@@ -1652,14 +1608,15 @@ static int __ref __offline_pages(unsigned long start_pfn,
goto failed_removal_isolated;
}
/* check again */
- offlined_pages = check_pages_isolated(start_pfn, end_pfn);
- } while (offlined_pages < 0);
+ ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
+ NULL, check_pages_isolated_cb);
+ } while (ret);
- pr_info("Offlined Pages %ld\n", offlined_pages);
/* Ok, all of our target is isolated.
We cannot do rollback at this point. */
- offline_isolated_pages(start_pfn, end_pfn);
-
+ walk_system_ram_range(start_pfn, end_pfn - start_pfn,
+ &offlined_pages, offline_isolated_pages_cb);
+ pr_info("Offlined Pages %ld\n", offlined_pages);
/*
* Onlining will reset pagetype flags and makes migrate type
* MOVABLE, so just need to decrease the number of isolated
@@ -1843,6 +1800,26 @@ void try_offline_node(int nid)
}
EXPORT_SYMBOL(try_offline_node);
+static void __release_memory_resource(resource_size_t start,
+ resource_size_t size)
+{
+ int ret;
+
+ /*
+ * When removing memory in the same granularity as it was added,
+ * this function never fails. It might only fail if resources
+ * have to be adjusted or split. We'll ignore the error, as
+	 * removing memory cannot fail.
+ */
+ ret = release_mem_region_adjustable(&iomem_resource, start, size);
+ if (ret) {
+ resource_size_t endres = start + size - 1;
+
+ pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
+ &start, &endres, ret);
+ }
+}
+
/**
* remove_memory
* @nid: the node ID
@@ -1877,6 +1854,7 @@ void __ref __remove_memory(int nid, u64 start, u64 size)
memblock_remove(start, size);
arch_remove_memory(nid, start, size, NULL);
+ __release_memory_resource(start, size);
try_offline_node(nid);
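For callers that hot-add device memory, the altmap now travels inside the restrictions structure rather than as a separate argument; a hedged sketch of such a call site (the altmap value and surrounding code are assumptions, the structure fields come from the hunks above):

	struct mhp_restrictions restrictions = {
		.altmap = altmap,	/* NULL for ordinary hotplug */
		.flags	= 0,		/* no MHP_MEMBLOCK_API for device memory */
	};

	ret = arch_add_memory(nid, res->start, resource_size(res),
			      &restrictions);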
diff --git a/mm/migrate.c b/mm/migrate.c
index 663a5449367a..f2ecc2855a12 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -463,7 +463,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
for (i = 1; i < HPAGE_PMD_NR; i++) {
xas_next(&xas);
- xas_store(&xas, newpage + i);
+ xas_store(&xas, newpage);
}
}
@@ -2356,7 +2356,8 @@ static void migrate_vma_collect(struct migrate_vma *migrate)
mm_walk.mm = migrate->vma->vm_mm;
mm_walk.private = migrate;
- mmu_notifier_range_init(&range, mm_walk.mm, migrate->start,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm_walk.mm,
+ migrate->start,
migrate->end);
mmu_notifier_invalidate_range_start(&range);
walk_page_range(migrate->start, migrate->end, &mm_walk);
@@ -2764,6 +2765,8 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
notified = true;
mmu_notifier_range_init(&range,
+ MMU_NOTIFY_CLEAR, 0,
+ NULL,
migrate->vma->vm_mm,
addr, migrate->end);
mmu_notifier_invalidate_range_start(&range);
diff --git a/mm/mincore.c b/mm/mincore.c
index 218099b5ed31..c3f058bd0faf 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -169,6 +169,22 @@ out:
return 0;
}
+static inline bool can_do_mincore(struct vm_area_struct *vma)
+{
+ if (vma_is_anonymous(vma))
+ return true;
+ if (!vma->vm_file)
+ return false;
+ /*
+ * Reveal pagecache information only for non-anonymous mappings that
+	 * correspond to files the calling process could (if it tried) open
+ * for writing; otherwise we'd be including shared non-exclusive
+ * mappings, which opens a side channel.
+ */
+ return inode_owner_or_capable(file_inode(vma->vm_file)) ||
+ inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
+}
+
/*
* Do a chunk of "sys_mincore()". We've already checked
* all the arguments, we hold the mmap semaphore: we should
@@ -189,8 +205,13 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
vma = find_vma(current->mm, addr);
if (!vma || addr < vma->vm_start)
return -ENOMEM;
- mincore_walk.mm = vma->vm_mm;
end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
+ if (!can_do_mincore(vma)) {
+ unsigned long pages = DIV_ROUND_UP(end - addr, PAGE_SIZE);
+ memset(vec, 1, pages);
+ return pages;
+ }
+ mincore_walk.mm = vma->vm_mm;
err = walk_page_range(addr, end, &mincore_walk);
if (err < 0)
return err;
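The can_do_mincore() check is observable from userspace: when the caller could not open the mapped file for writing, mincore(2) now reports every page as resident rather than exposing pagecache state. A small, hypothetical userspace probe:

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

/* Probe residency of the first page of a mapping; after this patch a
 * non-owner probing a shared read-only file mapping always sees 1. */
static int probe_resident(void *addr)
{
	unsigned char vec[1];

	if (mincore(addr, getpagesize(), vec) != 0) {
		perror("mincore");
		return -1;
	}
	return vec[0] & 1;
}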
diff --git a/mm/mlock.c b/mm/mlock.c
index 080f3b36415b..e492a155c51a 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -562,7 +562,7 @@ success:
nr_pages = -nr_pages;
else if (old_flags & VM_LOCKED)
nr_pages = 0;
- mm->locked_vm += nr_pages;
+ atomic64_add(nr_pages, &mm->locked_vm);
/*
* vm_flags is protected by the mmap_sem held in write mode.
@@ -687,7 +687,7 @@ static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t fla
if (down_write_killable(&current->mm->mmap_sem))
return -EINTR;
- locked += current->mm->locked_vm;
+ locked += atomic64_read(&current->mm->locked_vm);
if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) {
/*
* It is possible that the regions requested intersect with
diff --git a/mm/mmap.c b/mm/mmap.c
index bd7b9f293b39..9cf52bdb22a8 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1340,7 +1340,7 @@ static inline int mlock_future_check(struct mm_struct *mm,
/* mlock MCL_FUTURE? */
if (flags & VM_LOCKED) {
locked = len >> PAGE_SHIFT;
- locked += mm->locked_vm;
+ locked += atomic64_read(&mm->locked_vm);
lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
if (locked > lock_limit && !capable(CAP_IPC_LOCK))
@@ -1826,7 +1826,7 @@ out:
vma == get_gate_vma(current->mm))
vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
else
- mm->locked_vm += (len >> PAGE_SHIFT);
+ atomic64_add(len >> PAGE_SHIFT, &mm->locked_vm);
}
if (file)
@@ -2302,7 +2302,7 @@ static int acct_stack_growth(struct vm_area_struct *vma,
if (vma->vm_flags & VM_LOCKED) {
unsigned long locked;
unsigned long limit;
- locked = mm->locked_vm + grow;
+ locked = atomic64_read(&mm->locked_vm) + grow;
limit = rlimit(RLIMIT_MEMLOCK);
limit >>= PAGE_SHIFT;
if (locked > limit && !capable(CAP_IPC_LOCK))
@@ -2396,7 +2396,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
*/
spin_lock(&mm->page_table_lock);
if (vma->vm_flags & VM_LOCKED)
- mm->locked_vm += grow;
+ atomic64_add(grow, &mm->locked_vm);
vm_stat_account(mm, vma->vm_flags, grow);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_end = address;
@@ -2476,7 +2476,7 @@ int expand_downwards(struct vm_area_struct *vma,
*/
spin_lock(&mm->page_table_lock);
if (vma->vm_flags & VM_LOCKED)
- mm->locked_vm += grow;
+ atomic64_add(grow, &mm->locked_vm);
vm_stat_account(mm, vma->vm_flags, grow);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_start = address;
@@ -2801,11 +2801,11 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
/*
* unlock any mlock()ed ranges before detaching vmas
*/
- if (mm->locked_vm) {
+ if (atomic64_read(&mm->locked_vm)) {
struct vm_area_struct *tmp = vma;
while (tmp && tmp->vm_start < end) {
if (tmp->vm_flags & VM_LOCKED) {
- mm->locked_vm -= vma_pages(tmp);
+ atomic64_sub(vma_pages(tmp), &mm->locked_vm);
munlock_vma_pages_all(tmp);
}
@@ -3048,7 +3048,7 @@ out:
mm->total_vm += len >> PAGE_SHIFT;
mm->data_vm += len >> PAGE_SHIFT;
if (flags & VM_LOCKED)
- mm->locked_vm += (len >> PAGE_SHIFT);
+ atomic64_add(len >> PAGE_SHIFT, &mm->locked_vm);
vma->vm_flags |= VM_SOFTDIRTY;
return 0;
}
@@ -3120,7 +3120,7 @@ void exit_mmap(struct mm_struct *mm)
up_write(&mm->mmap_sem);
}
- if (mm->locked_vm) {
+ if (atomic64_read(&mm->locked_vm)) {
vma = mm->mmap;
while (vma) {
if (vma->vm_flags & VM_LOCKED)
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 9c884abc7850..ee36068077b6 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -180,7 +180,7 @@ int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
if (_ret) {
pr_info("%pS callback failed with %d in %sblockable context.\n",
mn->ops->invalidate_range_start, _ret,
- !range->blockable ? "non-" : "");
+ !mmu_notifier_range_blockable(range) ? "non-" : "");
ret = _ret;
}
}
@@ -395,3 +395,13 @@ void mmu_notifier_unregister_no_release(struct mmu_notifier *mn,
mmdrop(mm);
}
EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release);
+
+bool
+mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range)
+{
+ if (!range->vma || range->event != MMU_NOTIFY_PROTECTION_VMA)
+ return false;
+ /* Return true if the vma still have the read flag set. */
+ return range->vma->vm_flags & VM_READ;
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_range_update_to_read_only);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 028c724dcb1a..65242f1e4457 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -185,7 +185,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
/* invoke the mmu notifier if the pmd is populated */
if (!range.start) {
- mmu_notifier_range_init(&range, vma->vm_mm, addr, end);
+ mmu_notifier_range_init(&range,
+ MMU_NOTIFY_PROTECTION_VMA, 0,
+ vma, vma->vm_mm, addr, end);
mmu_notifier_invalidate_range_start(&range);
}
diff --git a/mm/mremap.c b/mm/mremap.c
index e3edef6b7a12..37b5b2ad91be 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -249,7 +249,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
old_end = old_addr + len;
flush_cache_range(vma, old_addr, old_end);
- mmu_notifier_range_init(&range, vma->vm_mm, old_addr, old_end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
+ old_addr, old_end);
mmu_notifier_invalidate_range_start(&range);
for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
@@ -422,7 +423,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
}
if (vm_flags & VM_LOCKED) {
- mm->locked_vm += new_len >> PAGE_SHIFT;
+ atomic64_add(new_len >> PAGE_SHIFT, &mm->locked_vm);
*locked = true;
}
@@ -473,7 +474,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
if (vma->vm_flags & VM_LOCKED) {
unsigned long locked, lock_limit;
- locked = mm->locked_vm << PAGE_SHIFT;
+ locked = atomic64_read(&mm->locked_vm) << PAGE_SHIFT;
lock_limit = rlimit(RLIMIT_MEMLOCK);
locked += new_len - old_len;
if (locked > lock_limit && !capable(CAP_IPC_LOCK))
@@ -679,7 +680,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
vm_stat_account(mm, vma->vm_flags, pages);
if (vma->vm_flags & VM_LOCKED) {
- mm->locked_vm += pages;
+ atomic64_add(pages, &mm->locked_vm);
locked = true;
new_addr = addr;
}
diff --git a/mm/nommu.c b/mm/nommu.c
index 749276beb109..b492fd1fcf9f 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -473,6 +473,20 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
}
EXPORT_SYMBOL(vm_insert_page);
+int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
+ unsigned long num)
+{
+ return -EINVAL;
+}
+EXPORT_SYMBOL(vm_map_pages);
+
+int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
+ unsigned long num)
+{
+ return -EINVAL;
+}
+EXPORT_SYMBOL(vm_map_pages_zero);
+
/*
* sys_brk() for the most part doesn't need the global kernel
* lock, except when an application is doing something nasty
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 3a2484884cfd..539c91d0b26a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -531,7 +531,8 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
struct mmu_notifier_range range;
struct mmu_gather tlb;
- mmu_notifier_range_init(&range, mm, vma->vm_start,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0,
+ vma, mm, vma->vm_start,
vma->vm_end);
tlb_gather_mmu(&tlb, mm, range.start, range.end);
if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 59661106da16..1f99db76b1ff 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -43,6 +43,7 @@
#include <linux/mempolicy.h>
#include <linux/memremap.h>
#include <linux/stop_machine.h>
+#include <linux/random.h>
#include <linux/sort.h>
#include <linux/pfn.h>
#include <linux/backing-dev.h>
@@ -72,6 +73,7 @@
#include <asm/tlbflush.h>
#include <asm/div64.h>
#include "internal.h"
+#include "shuffle.h"
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
@@ -755,12 +757,6 @@ static inline void set_page_order(struct page *page, unsigned int order)
__SetPageBuddy(page);
}
-static inline void rmv_page_order(struct page *page)
-{
- __ClearPageBuddy(page);
- set_page_private(page, 0);
-}
-
/*
* This function checks whether a page is free && is the buddy
* we can coalesce a page and its buddy if
@@ -918,13 +914,10 @@ continue_merging:
* Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
* merge with it and move up one order.
*/
- if (page_is_guard(buddy)) {
+ if (page_is_guard(buddy))
clear_page_guard(zone, buddy, order, migratetype);
- } else {
- list_del(&buddy->lru);
- zone->free_area[order].nr_free--;
- rmv_page_order(buddy);
- }
+ else
+ del_page_from_free_area(buddy, &zone->free_area[order]);
combined_pfn = buddy_pfn & pfn;
page = page + (combined_pfn - pfn);
pfn = combined_pfn;
@@ -966,7 +959,8 @@ done_merging:
* so it's less likely to be used soon and more likely to be merged
* as a higher order page
*/
- if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) {
+ if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)
+ && !is_shuffle_order(order)) {
struct page *higher_page, *higher_buddy;
combined_pfn = buddy_pfn & pfn;
higher_page = page + (combined_pfn - pfn);
@@ -974,15 +968,18 @@ done_merging:
higher_buddy = higher_page + (buddy_pfn - combined_pfn);
if (pfn_valid_within(buddy_pfn) &&
page_is_buddy(higher_page, higher_buddy, order + 1)) {
- list_add_tail(&page->lru,
- &zone->free_area[order].free_list[migratetype]);
- goto out;
+ add_to_free_area_tail(page, &zone->free_area[order],
+ migratetype);
+ return;
}
}
- list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
-out:
- zone->free_area[order].nr_free++;
+ if (is_shuffle_order(order))
+ add_to_free_area_random(page, &zone->free_area[order],
+ migratetype);
+ else
+ add_to_free_area(page, &zone->free_area[order], migratetype);
+
}
/*
@@ -1416,36 +1413,22 @@ int __meminit early_pfn_to_nid(unsigned long pfn)
#endif
#ifdef CONFIG_NODES_SPAN_OTHER_NODES
-static inline bool __meminit __maybe_unused
-meminit_pfn_in_nid(unsigned long pfn, int node,
- struct mminit_pfnnid_cache *state)
+/* Only safe to use early in boot when initialisation is single-threaded */
+static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
{
int nid;
- nid = __early_pfn_to_nid(pfn, state);
+ nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
if (nid >= 0 && nid != node)
return false;
return true;
}
-/* Only safe to use early in boot when initialisation is single-threaded */
-static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
-{
- return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache);
-}
-
#else
-
static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
{
return true;
}
-static inline bool __meminit __maybe_unused
-meminit_pfn_in_nid(unsigned long pfn, int node,
- struct mminit_pfnnid_cache *state)
-{
- return true;
-}
#endif
@@ -1574,21 +1557,13 @@ static inline void __init pgdat_init_report_one_done(void)
*
* Then, we check if a current large page is valid by only checking the validity
* of the head pfn.
- *
- * Finally, meminit_pfn_in_nid is checked on systems where pfns can interleave
- * within a node: a pfn is between start and end of a node, but does not belong
- * to this memory node.
*/
-static inline bool __init
-deferred_pfn_valid(int nid, unsigned long pfn,
- struct mminit_pfnnid_cache *nid_init_state)
+static inline bool __init deferred_pfn_valid(unsigned long pfn)
{
if (!pfn_valid_within(pfn))
return false;
if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn))
return false;
- if (!meminit_pfn_in_nid(pfn, nid, nid_init_state))
- return false;
return true;
}
@@ -1596,15 +1571,14 @@ deferred_pfn_valid(int nid, unsigned long pfn,
* Free pages to buddy allocator. Try to free aligned pages in
* pageblock_nr_pages sizes.
*/
-static void __init deferred_free_pages(int nid, int zid, unsigned long pfn,
+static void __init deferred_free_pages(unsigned long pfn,
unsigned long end_pfn)
{
- struct mminit_pfnnid_cache nid_init_state = { };
unsigned long nr_pgmask = pageblock_nr_pages - 1;
unsigned long nr_free = 0;
for (; pfn < end_pfn; pfn++) {
- if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
+ if (!deferred_pfn_valid(pfn)) {
deferred_free_range(pfn - nr_free, nr_free);
nr_free = 0;
} else if (!(pfn & nr_pgmask)) {
@@ -1624,17 +1598,18 @@ static void __init deferred_free_pages(int nid, int zid, unsigned long pfn,
* by performing it only once every pageblock_nr_pages.
* Return number of pages initialized.
*/
-static unsigned long __init deferred_init_pages(int nid, int zid,
+static unsigned long __init deferred_init_pages(struct zone *zone,
unsigned long pfn,
unsigned long end_pfn)
{
- struct mminit_pfnnid_cache nid_init_state = { };
unsigned long nr_pgmask = pageblock_nr_pages - 1;
+ int nid = zone_to_nid(zone);
unsigned long nr_pages = 0;
+ int zid = zone_idx(zone);
struct page *page = NULL;
for (; pfn < end_pfn; pfn++) {
- if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
+ if (!deferred_pfn_valid(pfn)) {
page = NULL;
continue;
} else if (!page || !(pfn & nr_pgmask)) {
@@ -1649,18 +1624,100 @@ static unsigned long __init deferred_init_pages(int nid, int zid,
return (nr_pages);
}
+/*
+ * This function is meant to pre-load the iterator for the zone init.
+ * Specifically it walks through the ranges until we are caught up to the
+ * first_init_pfn value and exits there. If we never encounter the value,
+ * we return false, indicating there are no valid ranges left.
+ */
+static bool __init
+deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
+ unsigned long *spfn, unsigned long *epfn,
+ unsigned long first_init_pfn)
+{
+ u64 j;
+
+ /*
+ * Start out by walking through the ranges in this zone that have
+ * already been initialized. We don't need to do anything with them
+ * so we just need to flush them out of the system.
+ */
+ for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
+ if (*epfn <= first_init_pfn)
+ continue;
+ if (*spfn < first_init_pfn)
+ *spfn = first_init_pfn;
+ *i = j;
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Initialize and free pages. We do it in two loops: first we initialize
+ * struct page, then free them to the buddy allocator, because while we are
+ * freeing pages we can access pages that are ahead (computing buddy
+ * page in __free_one_page()).
+ *
+ * In order to try and keep some memory in the cache we have the loop
+ * broken along max page order boundaries. This way we will not cause
+ * any issues with the buddy page computation.
+ */
+static unsigned long __init
+deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
+ unsigned long *end_pfn)
+{
+ unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
+ unsigned long spfn = *start_pfn, epfn = *end_pfn;
+ unsigned long nr_pages = 0;
+ u64 j = *i;
+
+ /* First we loop through and initialize the page values */
+ for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
+ unsigned long t;
+
+ if (mo_pfn <= *start_pfn)
+ break;
+
+ t = min(mo_pfn, *end_pfn);
+ nr_pages += deferred_init_pages(zone, *start_pfn, t);
+
+ if (mo_pfn < *end_pfn) {
+ *start_pfn = mo_pfn;
+ break;
+ }
+ }
+
+ /* Reset values and now loop through freeing pages as needed */
+ swap(j, *i);
+
+ for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
+ unsigned long t;
+
+ if (mo_pfn <= spfn)
+ break;
+
+ t = min(mo_pfn, epfn);
+ deferred_free_pages(spfn, t);
+
+ if (mo_pfn <= epfn)
+ break;
+ }
+
+ return nr_pages;
+}
+
/* Initialise remaining memory on a node */
static int __init deferred_init_memmap(void *data)
{
pg_data_t *pgdat = data;
- int nid = pgdat->node_id;
+ const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+ unsigned long spfn = 0, epfn = 0, nr_pages = 0;
+ unsigned long first_init_pfn, flags;
unsigned long start = jiffies;
- unsigned long nr_pages = 0;
- unsigned long spfn, epfn, first_init_pfn, flags;
- phys_addr_t spa, epa;
- int zid;
struct zone *zone;
- const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+ int zid;
u64 i;
/* Bind memory initialisation thread to a local node if possible */
@@ -1686,31 +1743,27 @@ static int __init deferred_init_memmap(void *data)
if (first_init_pfn < zone_end_pfn(zone))
break;
}
- first_init_pfn = max(zone->zone_start_pfn, first_init_pfn);
+
+ /* If the zone is empty somebody else may have cleared out the zone */
+ if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
+ first_init_pfn))
+ goto zone_empty;
/*
- * Initialize and free pages. We do it in two loops: first we initialize
- * struct page, than free to buddy allocator, because while we are
- * freeing pages we can access pages that are ahead (computing buddy
- * page in __free_one_page()).
+ * Initialize and free pages in MAX_ORDER sized increments so
+ * that we can avoid introducing any issues with the buddy
+ * allocator.
*/
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
- nr_pages += deferred_init_pages(nid, zid, spfn, epfn);
- }
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
- deferred_free_pages(nid, zid, spfn, epfn);
- }
+ while (spfn < epfn)
+ nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
+zone_empty:
pgdat_resize_unlock(pgdat, &flags);
/* Sanity check that the next zone really is unpopulated */
WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
- pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
- jiffies_to_msecs(jiffies - start));
+ pr_info("node %d initialised, %lu pages in %ums\n",
+ pgdat->node_id, nr_pages, jiffies_to_msecs(jiffies - start));
pgdat_init_report_one_done();
return 0;
@@ -1734,14 +1787,11 @@ static int __init deferred_init_memmap(void *data)
static noinline bool __init
deferred_grow_zone(struct zone *zone, unsigned int order)
{
- int zid = zone_idx(zone);
- int nid = zone_to_nid(zone);
- pg_data_t *pgdat = NODE_DATA(nid);
unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
- unsigned long nr_pages = 0;
- unsigned long first_init_pfn, spfn, epfn, t, flags;
+ pg_data_t *pgdat = zone->zone_pgdat;
unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
- phys_addr_t spa, epa;
+ unsigned long spfn, epfn, flags;
+ unsigned long nr_pages = 0;
u64 i;
/* Only the last zone may have deferred pages */
@@ -1770,38 +1820,35 @@ deferred_grow_zone(struct zone *zone, unsigned int order)
return true;
}
- first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn);
-
- if (first_init_pfn >= pgdat_end_pfn(pgdat)) {
+ /* If the zone is empty somebody else may have cleared out the zone */
+ if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
+ first_deferred_pfn)) {
+ pgdat->first_deferred_pfn = ULONG_MAX;
pgdat_resize_unlock(pgdat, &flags);
- return false;
+ return true;
}
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
+ /*
+ * Initialize and free pages in MAX_ORDER sized increments so
+ * that we can avoid introducing any issues with the buddy
+ * allocator.
+ */
+ while (spfn < epfn) {
+ /* update our first deferred PFN for this section */
+ first_deferred_pfn = spfn;
- while (spfn < epfn && nr_pages < nr_pages_needed) {
- t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION);
- first_deferred_pfn = min(t, epfn);
- nr_pages += deferred_init_pages(nid, zid, spfn,
- first_deferred_pfn);
- spfn = first_deferred_pfn;
- }
+ nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
+
+ /* We should only stop along section boundaries */
+ if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
+ continue;
+ /* If our quota has been met we can stop here */
if (nr_pages >= nr_pages_needed)
break;
}
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
- epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa));
- deferred_free_pages(nid, zid, spfn, epfn);
-
- if (first_deferred_pfn == epfn)
- break;
- }
- pgdat->first_deferred_pfn = first_deferred_pfn;
+ pgdat->first_deferred_pfn = spfn;
pgdat_resize_unlock(pgdat, &flags);
return nr_pages > 0;
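
The `(first_deferred_pfn ^ spfn) < PAGES_PER_SECTION` test above relies on PAGES_PER_SECTION being a power of two: two pfns lie in the same sparsemem section exactly when they agree in every bit at or above the section-size bit, i.e. when their XOR is smaller than the section size, so the quota check is only reached once the loop has crossed a section boundary. A standalone illustration (the section size below is an arbitrary stand-in, not a particular architecture's value):

#include <stdio.h>
#include <stdbool.h>

#define PAGES_PER_SECTION (1UL << 15)    /* stand-in; a power of two, as in the kernel */

/* Two pfns share a section iff they only differ below the section-size bit. */
static bool same_section(unsigned long a, unsigned long b)
{
    return (a ^ b) < PAGES_PER_SECTION;
}

int main(void)
{
    printf("%d\n", same_section(0x8000, 0x8abc));    /* 1: both in section 1 */
    printf("%d\n", same_section(0x7fff, 0x8000));    /* 0: a section boundary was crossed */
    return 0;
}
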
@@ -1824,9 +1871,9 @@ _deferred_grow_zone(struct zone *zone, unsigned int order)
void __init page_alloc_init_late(void)
{
struct zone *zone;
+ int nid;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
- int nid;
/* There will be num_node_state(N_MEMORY) threads */
atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
@@ -1846,10 +1893,12 @@ void __init page_alloc_init_late(void)
/* Reinit limits that are based on free pages after the kernel is up */
files_maxfiles_init();
#endif
-#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+
/* Discard memblock private memory */
memblock_discard();
-#endif
+
+ for_each_node_state(nid, N_MEMORY)
+ shuffle_free_memory(NODE_DATA(nid));
for_each_populated_zone(zone)
set_zone_contiguous(zone);
@@ -1921,8 +1970,7 @@ static inline void expand(struct zone *zone, struct page *page,
if (set_page_guard(zone, &page[size], high, migratetype))
continue;
- list_add(&page[size].lru, &area->free_list[migratetype]);
- area->nr_free++;
+ add_to_free_area(&page[size], area, migratetype);
set_page_order(&page[size], high);
}
}
@@ -2064,13 +2112,10 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
/* Find a page of the appropriate size in the preferred list */
for (current_order = order; current_order < MAX_ORDER; ++current_order) {
area = &(zone->free_area[current_order]);
- page = list_first_entry_or_null(&area->free_list[migratetype],
- struct page, lru);
+ page = get_page_from_free_area(area, migratetype);
if (!page)
continue;
- list_del(&page->lru);
- rmv_page_order(page);
- area->nr_free--;
+ del_page_from_free_area(page, area);
expand(zone, page, order, current_order, area, migratetype);
set_pcppage_migratetype(page, migratetype);
return page;
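
The open-coded list_add()/list_del() plus nr_free bookkeeping is folded into free_area helpers (add_to_free_area(), del_page_from_free_area(), get_page_from_free_area(), free_area_empty()) introduced elsewhere in this series, in include/linux/mmzone.h, not shown here; the hunks in this file only convert the call sites. A rough userspace model of what such wrappers buy, reconstructed from the sequences they replace (the struct names and fields are simplified stand-ins):

#include <stdio.h>

struct list_node { struct list_node *prev, *next; };

/* Simplified stand-in for struct free_area: one list plus its counter. */
struct free_area_model {
    struct list_node list;
    unsigned long nr_free;
};

static void area_init(struct free_area_model *area)
{
    area->list.prev = area->list.next = &area->list;
    area->nr_free = 0;
}

/* The counter update travels with the list operation, so call sites
 * can no longer forget one half of the pair. */
static void add_to_area(struct list_node *page, struct free_area_model *area)
{
    page->next = area->list.next;
    page->prev = &area->list;
    area->list.next->prev = page;
    area->list.next = page;
    area->nr_free++;
}

static void del_from_area(struct list_node *page, struct free_area_model *area)
{
    page->prev->next = page->next;
    page->next->prev = page->prev;
    area->nr_free--;
}

int main(void)
{
    struct free_area_model area;
    struct list_node page;

    area_init(&area);
    add_to_area(&page, &area);
    printf("after add: nr_free=%lu\n", area.nr_free);    /* 1 */
    del_from_area(&page, &area);
    printf("after del: nr_free=%lu\n", area.nr_free);    /* 0 */
    return 0;
}
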
@@ -2156,8 +2201,7 @@ static int move_freepages(struct zone *zone,
}
order = page_order(page);
- list_move(&page->lru,
- &zone->free_area[order].free_list[migratetype]);
+ move_to_free_area(page, &zone->free_area[order], migratetype);
page += 1 << order;
pages_moved += 1 << order;
}
@@ -2345,7 +2389,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
single_page:
area = &zone->free_area[current_order];
- list_move(&page->lru, &area->free_list[start_type]);
+ move_to_free_area(page, area, start_type);
}
/*
@@ -2369,7 +2413,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
if (fallback_mt == MIGRATE_TYPES)
break;
- if (list_empty(&area->free_list[fallback_mt]))
+ if (free_area_empty(area, fallback_mt))
continue;
if (can_steal_fallback(order, migratetype))
@@ -2456,9 +2500,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
for (order = 0; order < MAX_ORDER; order++) {
struct free_area *area = &(zone->free_area[order]);
- page = list_first_entry_or_null(
- &area->free_list[MIGRATE_HIGHATOMIC],
- struct page, lru);
+ page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
if (!page)
continue;
@@ -2581,8 +2623,7 @@ find_smallest:
VM_BUG_ON(current_order == MAX_ORDER);
do_steal:
- page = list_first_entry(&area->free_list[fallback_mt],
- struct page, lru);
+ page = get_page_from_free_area(area, fallback_mt);
steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
can_steal);
@@ -3019,6 +3060,7 @@ EXPORT_SYMBOL_GPL(split_page);
int __isolate_free_page(struct page *page, unsigned int order)
{
+ struct free_area *area = &page_zone(page)->free_area[order];
unsigned long watermark;
struct zone *zone;
int mt;
@@ -3043,9 +3085,8 @@ int __isolate_free_page(struct page *page, unsigned int order)
}
/* Remove page from free list */
- list_del(&page->lru);
- zone->free_area[order].nr_free--;
- rmv_page_order(page);
+
+ del_page_from_free_area(page, area);
/*
* Set the pageblock if the isolated page is at least half of a
@@ -3120,9 +3161,8 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
/* Lock and remove page from the per-cpu list */
static struct page *rmqueue_pcplist(struct zone *preferred_zone,
- struct zone *zone, unsigned int order,
- gfp_t gfp_flags, int migratetype,
- unsigned int alloc_flags)
+ struct zone *zone, gfp_t gfp_flags,
+ int migratetype, unsigned int alloc_flags)
{
struct per_cpu_pages *pcp;
struct list_head *list;
@@ -3134,7 +3174,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
list = &pcp->lists[migratetype];
page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list);
if (page) {
- __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
+ __count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
zone_statistics(preferred_zone, zone);
}
local_irq_restore(flags);
@@ -3154,8 +3194,8 @@ struct page *rmqueue(struct zone *preferred_zone,
struct page *page;
if (likely(order == 0)) {
- page = rmqueue_pcplist(preferred_zone, zone, order,
- gfp_flags, migratetype, alloc_flags);
+ page = rmqueue_pcplist(preferred_zone, zone, gfp_flags,
+ migratetype, alloc_flags);
goto out;
}
@@ -3343,13 +3383,13 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
continue;
for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
- if (!list_empty(&area->free_list[mt]))
+ if (!free_area_empty(area, mt))
return true;
}
#ifdef CONFIG_CMA
if ((alloc_flags & ALLOC_CMA) &&
- !list_empty(&area->free_list[MIGRATE_CMA])) {
+ !free_area_empty(area, MIGRATE_CMA)) {
return true;
}
#endif
@@ -4821,7 +4861,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order,
/**
* alloc_pages_exact - allocate an exact number physically-contiguous pages.
* @size: the number of bytes to allocate
- * @gfp_mask: GFP flags for the allocation
+ * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
*
* This function is similar to alloc_pages(), except that it allocates the
* minimum number of pages to satisfy the request. alloc_pages() can only
@@ -4838,6 +4878,9 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
unsigned int order = get_order(size);
unsigned long addr;
+ if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
+ gfp_mask &= ~__GFP_COMP;
+
addr = __get_free_pages(gfp_mask, order);
return make_alloc_exact(addr, order, size);
}
@@ -4848,7 +4891,7 @@ EXPORT_SYMBOL(alloc_pages_exact);
* pages on a node.
* @nid: the preferred node ID where memory should be allocated
* @size: the number of bytes to allocate
- * @gfp_mask: GFP flags for the allocation
+ * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
*
* Like alloc_pages_exact(), but try to allocate on node nid first before falling
* back.
@@ -4858,7 +4901,12 @@ EXPORT_SYMBOL(alloc_pages_exact);
void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
{
unsigned int order = get_order(size);
- struct page *p = alloc_pages_node(nid, gfp_mask, order);
+ struct page *p;
+
+ if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
+ gfp_mask &= ~__GFP_COMP;
+
+ p = alloc_pages_node(nid, gfp_mask, order);
if (!p)
return NULL;
return make_alloc_exact((unsigned long)page_address(p), order, size);
@@ -5268,7 +5316,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
types[order] = 0;
for (type = 0; type < MIGRATE_TYPES; type++) {
- if (!list_empty(&area->free_list[type]))
+ if (!free_area_empty(area, type))
types[order] |= 1 << type;
}
}
@@ -5486,6 +5534,8 @@ static void build_zonelists(pg_data_t *pgdat)
int node, load, nr_nodes = 0;
nodemask_t used_mask;
int local_node, prev_node;
+ struct zone *zone;
+ struct zoneref *z;
/* NUMA-aware ordering of nodes */
local_node = pgdat->node_id;
@@ -5511,6 +5561,11 @@ static void build_zonelists(pg_data_t *pgdat)
build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
build_thisnode_zonelists(pgdat);
+
+ pr_info("node[%d] zonelist: ", pgdat->node_id);
+ for_each_zone_zonelist(zone, z, &pgdat->node_zonelists[ZONELIST_FALLBACK], MAX_NR_ZONES-1)
+ pr_cont("%d:%s ", zone_to_nid(zone), zone->name);
+ pr_cont("\n");
}
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
@@ -5613,10 +5668,11 @@ static void __build_all_zonelists(void *data)
if (self && !node_online(self->node_id)) {
build_zonelists(self);
} else {
- for_each_online_node(nid) {
+ for_each_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
- build_zonelists(pgdat);
+ if (pgdat)
+ build_zonelists(pgdat);
}
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
@@ -6247,13 +6303,15 @@ static unsigned long __init zone_spanned_pages_in_node(int nid,
unsigned long *zone_end_pfn,
unsigned long *ignored)
{
+ unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
+ unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
/* When hotadd a new node from cpu_up(), the node should be empty */
if (!node_start_pfn && !node_end_pfn)
return 0;
/* Get the start and end of the zone */
- *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
- *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+ *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
+ *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
adjust_zone_range_for_zone_movable(nid, zone_type,
node_start_pfn, node_end_pfn,
zone_start_pfn, zone_end_pfn);
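
The clamp() change above makes zone_spanned_pages_in_node() intersect the architectural zone limits with the node's own pfn range, instead of handing every node the full architectural span. A small worked example with made-up numbers:

#include <stdio.h>

static unsigned long clamp_pfn(unsigned long v, unsigned long lo, unsigned long hi)
{
    return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
    /* Hypothetical layout: the zone covers pfn 0..0x100000,
     * while node 1 covers pfn 0xc0000..0x200000. */
    unsigned long zone_low = 0x0, zone_high = 0x100000;
    unsigned long node_start = 0xc0000, node_end = 0x200000;

    unsigned long zone_start_pfn = clamp_pfn(node_start, zone_low, zone_high);
    unsigned long zone_end_pfn = clamp_pfn(node_end, zone_low, zone_high);

    /* Only 0xc0000..0x100000 of the zone actually lies on this node. */
    printf("zone span on node: %#lx..%#lx (%lu pages)\n",
           zone_start_pfn, zone_end_pfn, zone_end_pfn - zone_start_pfn);
    return 0;
}
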
@@ -6897,10 +6955,8 @@ static unsigned long __init find_min_pfn_for_node(int nid)
for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
min_pfn = min(min_pfn, start_pfn);
- if (min_pfn == ULONG_MAX) {
- pr_warn("Could not find start_pfn for node %d\n", nid);
+ if (min_pfn == ULONG_MAX)
return 0;
- }
return min_pfn;
}
@@ -7244,8 +7300,12 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
mminit_verify_pageflags_layout();
setup_nr_node_ids();
zero_resv_unavail();
- for_each_online_node(nid) {
+ for_each_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
+
+ if (!pgdat)
+ continue;
+
free_area_init_node(nid, NULL,
find_min_pfn_for_node(nid), NULL);
@@ -8129,8 +8189,7 @@ unmovable:
return true;
}
-#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
-
+#ifdef CONFIG_CONTIG_ALLOC
static unsigned long pfn_max_align_down(unsigned long pfn)
{
return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
@@ -8339,8 +8398,9 @@ done:
pfn_max_align_up(end), migratetype);
return ret;
}
+#endif /* CONFIG_CONTIG_ALLOC */
-void free_contig_range(unsigned long pfn, unsigned nr_pages)
+void free_contig_range(unsigned long pfn, unsigned int nr_pages)
{
unsigned int count = 0;
@@ -8352,7 +8412,6 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
}
WARN(count != 0, "%d pages are still in use!\n", count);
}
-#endif
#ifdef CONFIG_MEMORY_HOTPLUG
/*
@@ -8394,7 +8453,7 @@ void zone_pcp_reset(struct zone *zone)
* All pages in the range must be in a single zone and isolated
* before calling this.
*/
-void
+unsigned long
__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
struct page *page;
@@ -8402,12 +8461,15 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
unsigned int order, i;
unsigned long pfn;
unsigned long flags;
+ unsigned long offlined_pages = 0;
+
/* find the first valid pfn */
for (pfn = start_pfn; pfn < end_pfn; pfn++)
if (pfn_valid(pfn))
break;
if (pfn == end_pfn)
- return;
+ return offlined_pages;
+
offline_mem_sections(pfn, end_pfn);
zone = page_zone(pfn_to_page(pfn));
spin_lock_irqsave(&zone->lock, flags);
@@ -8425,24 +8487,26 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
pfn++;
SetPageReserved(page);
+ offlined_pages++;
continue;
}
BUG_ON(page_count(page));
BUG_ON(!PageBuddy(page));
order = page_order(page);
+ offlined_pages += 1 << order;
#ifdef CONFIG_DEBUG_VM
pr_info("remove from free list %lx %d %lx\n",
pfn, 1 << order, end_pfn);
#endif
- list_del(&page->lru);
- rmv_page_order(page);
- zone->free_area[order].nr_free--;
+ del_page_from_free_area(page, &zone->free_area[order]);
for (i = 0; i < (1 << order); i++)
SetPageReserved((page+i));
pfn += (1 << order);
}
spin_unlock_irqrestore(&zone->lock, flags);
+
+ return offlined_pages;
}
#endif
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 019280712e1b..e3638a5bafff 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -151,8 +151,6 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
for (i = 0; i < nr_pages; i++) {
struct page *page;
- if (!pfn_valid_within(pfn + i))
- continue;
page = pfn_to_online_page(pfn + i);
if (!page)
continue;
diff --git a/mm/rmap.c b/mm/rmap.c
index b30c7c71d1d9..e5dfe2ae6b0d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -850,7 +850,7 @@ int page_referenced(struct page *page,
};
*vm_flags = 0;
- if (!page_mapped(page))
+ if (!pra.mapcount)
return 0;
if (!page_rmapping(page))
@@ -896,7 +896,8 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
* We have to assume the worse case ie pmd for invalidation. Note that
* the page can not be free from this function.
*/
- mmu_notifier_range_init(&range, vma->vm_mm, address,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
+ 0, vma, vma->vm_mm, address,
min(vma->vm_end, address +
(PAGE_SIZE << compound_order(page))));
mmu_notifier_invalidate_range_start(&range);
@@ -928,7 +929,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
continue;
flush_cache_page(vma, address, page_to_pfn(page));
- entry = pmdp_huge_clear_flush(vma, address, pmd);
+ entry = pmdp_invalidate(vma, address, pmd);
entry = pmd_wrprotect(entry);
entry = pmd_mkclean(entry);
set_pmd_at(vma->vm_mm, address, pmd, entry);
@@ -1371,7 +1372,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* Note that the page can not be free in this function as call of
* try_to_unmap() must hold a reference on the page.
*/
- mmu_notifier_range_init(&range, vma->vm_mm, address,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ address,
min(vma->vm_end, address +
(PAGE_SIZE << compound_order(page))));
if (PageHuge(page)) {
diff --git a/mm/shmem.c b/mm/shmem.c
index f4dce9c8670d..1bb3b8dc8bb2 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -614,7 +614,7 @@ static int shmem_add_to_page_cache(struct page *page,
if (xas_error(&xas))
goto unlock;
next:
- xas_store(&xas, page + i);
+ xas_store(&xas, page);
if (++i < nr) {
xas_next(&xas);
goto next;
diff --git a/mm/shuffle.c b/mm/shuffle.c
new file mode 100644
index 000000000000..3ce12481b1dc
--- /dev/null
+++ b/mm/shuffle.c
@@ -0,0 +1,207 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright(c) 2018 Intel Corporation. All rights reserved.
+
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/mmzone.h>
+#include <linux/random.h>
+#include <linux/moduleparam.h>
+#include "internal.h"
+#include "shuffle.h"
+
+DEFINE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
+static unsigned long shuffle_state __ro_after_init;
+
+/*
+ * Depending on the architecture, module parameter parsing may run
+ * before or after the cache detection. SHUFFLE_FORCE_DISABLE prevents
+ * or reverts the enabling of the shuffle implementation. SHUFFLE_ENABLE
+ * attempts to turn on the implementation, but aborts if it finds
+ * SHUFFLE_FORCE_DISABLE already set.
+ */
+__meminit void page_alloc_shuffle(enum mm_shuffle_ctl ctl)
+{
+ if (ctl == SHUFFLE_FORCE_DISABLE)
+ set_bit(SHUFFLE_FORCE_DISABLE, &shuffle_state);
+
+ if (test_bit(SHUFFLE_FORCE_DISABLE, &shuffle_state)) {
+ if (test_and_clear_bit(SHUFFLE_ENABLE, &shuffle_state))
+ static_branch_disable(&page_alloc_shuffle_key);
+ } else if (ctl == SHUFFLE_ENABLE
+ && !test_and_set_bit(SHUFFLE_ENABLE, &shuffle_state))
+ static_branch_enable(&page_alloc_shuffle_key);
+}
+
+static bool shuffle_param;
+extern int shuffle_show(char *buffer, const struct kernel_param *kp)
+{
+ return sprintf(buffer, "%c\n", test_bit(SHUFFLE_ENABLE, &shuffle_state)
+ ? 'Y' : 'N');
+}
+
+static __meminit int shuffle_store(const char *val,
+ const struct kernel_param *kp)
+{
+ int rc = param_set_bool(val, kp);
+
+ if (rc < 0)
+ return rc;
+ if (shuffle_param)
+ page_alloc_shuffle(SHUFFLE_ENABLE);
+ else
+ page_alloc_shuffle(SHUFFLE_FORCE_DISABLE);
+ return 0;
+}
+module_param_call(shuffle, shuffle_store, shuffle_show, &shuffle_param, 0400);
+
+/*
+ * For two pages to be swapped in the shuffle, they must be free (on a
+ * 'free_area' lru), have the same order, and have the same migratetype.
+ */
+static struct page * __meminit shuffle_valid_page(unsigned long pfn, int order)
+{
+ struct page *page;
+
+ /*
+ * Given we're dealing with randomly selected pfns in a zone we
+ * need to ask questions like...
+ */
+
+ /* ...is the pfn even in the memmap? */
+ if (!pfn_valid_within(pfn))
+ return NULL;
+
+ /* ...is the pfn in a present section or a hole? */
+ if (!pfn_present(pfn))
+ return NULL;
+
+ /* ...is the page free and currently on a free_area list? */
+ page = pfn_to_page(pfn);
+ if (!PageBuddy(page))
+ return NULL;
+
+ /*
+ * ...is the page on the same list as the page we will
+ * shuffle it with?
+ */
+ if (page_order(page) != order)
+ return NULL;
+
+ return page;
+}
+
+/*
+ * Fisher-Yates shuffle the freelist which prescribes iterating through an
+ * array, pfns in this case, and randomly swapping each entry with another in
+ * the span, end_pfn - start_pfn.
+ *
+ * To keep the implementation simple it does not attempt to correct for sources
+ * of bias in the distribution, like modulo bias or pseudo-random number
+ * generator bias. I.e. the expectation is that this shuffling raises the bar
+ * for attacks that exploit the predictability of page allocations, but need not
+ * be a perfect shuffle.
+ */
+#define SHUFFLE_RETRY 10
+void __meminit __shuffle_zone(struct zone *z)
+{
+ unsigned long i, flags;
+ unsigned long start_pfn = z->zone_start_pfn;
+ unsigned long end_pfn = zone_end_pfn(z);
+ const int order = SHUFFLE_ORDER;
+ const int order_pages = 1 << order;
+
+ spin_lock_irqsave(&z->lock, flags);
+ start_pfn = ALIGN(start_pfn, order_pages);
+ for (i = start_pfn; i < end_pfn; i += order_pages) {
+ unsigned long j;
+ int migratetype, retry;
+ struct page *page_i, *page_j;
+
+ /*
+ * We expect page_i, in the sub-range of a zone being added
+ * (@start_pfn to @end_pfn), to more likely be valid compared to
+ * page_j randomly selected in the span @zone_start_pfn to
+ * @spanned_pages.
+ */
+ page_i = shuffle_valid_page(i, order);
+ if (!page_i)
+ continue;
+
+ for (retry = 0; retry < SHUFFLE_RETRY; retry++) {
+ /*
+ * Pick a random order aligned page in the zone span as
+ * a swap target. If the selected pfn is a hole, retry
+ * up to SHUFFLE_RETRY attempts to find a random valid pfn
+ * in the zone.
+ */
+ j = z->zone_start_pfn +
+ ALIGN_DOWN(get_random_long() % z->spanned_pages,
+ order_pages);
+ page_j = shuffle_valid_page(j, order);
+ if (page_j && page_j != page_i)
+ break;
+ }
+ if (retry >= SHUFFLE_RETRY) {
+ pr_debug("%s: failed to swap %#lx\n", __func__, i);
+ continue;
+ }
+
+ /*
+ * Each migratetype corresponds to its own list, make sure the
+ * types match otherwise we're moving pages to lists where they
+ * do not belong.
+ */
+ migratetype = get_pageblock_migratetype(page_i);
+ if (get_pageblock_migratetype(page_j) != migratetype) {
+ pr_debug("%s: migratetype mismatch %#lx\n", __func__, i);
+ continue;
+ }
+
+ list_swap(&page_i->lru, &page_j->lru);
+
+ pr_debug("%s: swap: %#lx -> %#lx\n", __func__, i, j);
+
+ /* take it easy on the zone lock */
+ if ((i % (100 * order_pages)) == 0) {
+ spin_unlock_irqrestore(&z->lock, flags);
+ cond_resched();
+ spin_lock_irqsave(&z->lock, flags);
+ }
+ }
+ spin_unlock_irqrestore(&z->lock, flags);
+}
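
For comparison with the pfn-span walk above, the textbook Fisher-Yates shuffle iterates a dense array and swaps each element with a randomly chosen earlier one; the zone variant has to probe random pfns instead and retries (up to SHUFFLE_RETRY) when it lands in a hole or on a non-buddy page. A minimal dense-array version, with the same tolerated modulo bias:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static void fisher_yates(int *a, int n)
{
    for (int i = n - 1; i > 0; i--) {
        int j = rand() % (i + 1);    /* modulo bias accepted, as in the kernel code */
        int t = a[i];

        a[i] = a[j];
        a[j] = t;
    }
}

int main(void)
{
    int a[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };

    srand((unsigned int)time(NULL));
    fisher_yates(a, 8);
    for (int i = 0; i < 8; i++)
        printf("%d ", a[i]);
    printf("\n");
    return 0;
}
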
+
+/**
+ * shuffle_free_memory - reduce the predictability of the page allocator
+ * @pgdat: node page data
+ */
+void __meminit __shuffle_free_memory(pg_data_t *pgdat)
+{
+ struct zone *z;
+
+ for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
+ shuffle_zone(z);
+}
+
+void add_to_free_area_random(struct page *page, struct free_area *area,
+ int migratetype)
+{
+ static u64 rand;
+ static u8 rand_bits;
+
+ /*
+ * The lack of locking is deliberate. If 2 threads race to
+ * update the rand state it just adds to the entropy.
+ */
+ if (rand_bits == 0) {
+ rand_bits = 64;
+ rand = get_random_u64();
+ }
+
+ if (rand & 1)
+ add_to_free_area(page, area, migratetype);
+ else
+ add_to_free_area_tail(page, area, migratetype);
+ rand_bits--;
+ rand >>= 1;
+}
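
add_to_free_area_random() amortizes the cost of get_random_u64() by buffering 64 random bits and spending one per call to decide head versus tail placement. A userspace model of the same pattern; rand() composed into a 64-bit word is a crude stand-in for get_random_u64(), and the deliberate lack of locking in the kernel version is irrelevant in this single-threaded sketch:

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

static int random_bit(void)
{
    static uint64_t word;
    static uint8_t bits;
    int bit;

    if (bits == 0) {
        /* Refill the buffer: 64 fresh bits, consumed one per call. */
        bits = 64;
        word = ((uint64_t)rand() << 32) | (uint64_t)rand();
    }

    bit = word & 1;
    word >>= 1;
    bits--;
    return bit;
}

int main(void)
{
    int heads = 0;

    for (int i = 0; i < 1000; i++)
        heads += random_bit();    /* roughly 500 expected */
    printf("heads: %d / 1000\n", heads);
    return 0;
}
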
diff --git a/mm/shuffle.h b/mm/shuffle.h
new file mode 100644
index 000000000000..777a257a0d2f
--- /dev/null
+++ b/mm/shuffle.h
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright(c) 2018 Intel Corporation. All rights reserved.
+#ifndef _MM_SHUFFLE_H
+#define _MM_SHUFFLE_H
+#include <linux/jump_label.h>
+
+/*
+ * SHUFFLE_ENABLE is called from the command line enabling path, or by
+ * platform-firmware enabling that indicates the presence of a
+ * direct-mapped memory-side-cache. SHUFFLE_FORCE_DISABLE is called from
+ * the command line path and overrides any previous or future
+ * SHUFFLE_ENABLE.
+ */
+enum mm_shuffle_ctl {
+ SHUFFLE_ENABLE,
+ SHUFFLE_FORCE_DISABLE,
+};
+
+#define SHUFFLE_ORDER (MAX_ORDER-1)
+
+#ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR
+DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
+extern void page_alloc_shuffle(enum mm_shuffle_ctl ctl);
+extern void __shuffle_free_memory(pg_data_t *pgdat);
+static inline void shuffle_free_memory(pg_data_t *pgdat)
+{
+ if (!static_branch_unlikely(&page_alloc_shuffle_key))
+ return;
+ __shuffle_free_memory(pgdat);
+}
+
+extern void __shuffle_zone(struct zone *z);
+static inline void shuffle_zone(struct zone *z)
+{
+ if (!static_branch_unlikely(&page_alloc_shuffle_key))
+ return;
+ __shuffle_zone(z);
+}
+
+static inline bool is_shuffle_order(int order)
+{
+ if (!static_branch_unlikely(&page_alloc_shuffle_key))
+ return false;
+ return order >= SHUFFLE_ORDER;
+}
+#else
+static inline void shuffle_free_memory(pg_data_t *pgdat)
+{
+}
+
+static inline void shuffle_zone(struct zone *z)
+{
+}
+
+static inline void page_alloc_shuffle(enum mm_shuffle_ctl ctl)
+{
+}
+
+static inline bool is_shuffle_order(int order)
+{
+ return false;
+}
+#endif
+#endif /* _MM_SHUFFLE_H */
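
shuffle.h keeps the common case cheap by routing every call through a static-branch check before the out-of-line __shuffle_*() helpers, so kernels without shuffling enabled pay essentially nothing on these paths. As a rough analogy only, a plain boolean below stands in for the jump-label mechanism, which patches the disabled branch out of the hot path entirely:

#include <stdio.h>
#include <stdbool.h>

static bool shuffle_enabled;    /* stand-in for page_alloc_shuffle_key */

static void __shuffle_zone_model(int zone_id)
{
    printf("shuffling zone %d\n", zone_id);
}

/* Inline wrapper: cheap early-out when the feature is off. */
static inline void shuffle_zone_model(int zone_id)
{
    if (!shuffle_enabled)
        return;
    __shuffle_zone_model(zone_id);
}

int main(void)
{
    shuffle_zone_model(0);    /* no output: feature disabled */
    shuffle_enabled = true;
    shuffle_zone_model(0);    /* "shuffling zone 0" */
    return 0;
}
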
diff --git a/mm/slab.c b/mm/slab.c
index 284ab737faee..2915d912e89a 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -990,10 +990,8 @@ static void cpuup_canceled(long cpu)
/* cpu is dead; no one can alloc from it. */
nc = per_cpu_ptr(cachep->cpu_cache, cpu);
- if (nc) {
- free_block(cachep, nc->entry, nc->avail, node, &list);
- nc->avail = 0;
- }
+ free_block(cachep, nc->entry, nc->avail, node, &list);
+ nc->avail = 0;
if (!cpumask_empty(mask)) {
spin_unlock_irq(&n->list_lock);
@@ -1674,8 +1672,8 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
{
struct page *page, *n;
- list_for_each_entry_safe(page, n, list, lru) {
- list_del(&page->lru);
+ list_for_each_entry_safe(page, n, list, slab_list) {
+ list_del(&page->slab_list);
slab_destroy(cachep, page);
}
}
@@ -2231,8 +2229,8 @@ static int drain_freelist(struct kmem_cache *cache,
goto out;
}
- page = list_entry(p, struct page, lru);
- list_del(&page->lru);
+ page = list_entry(p, struct page, slab_list);
+ list_del(&page->slab_list);
n->free_slabs--;
n->total_slabs--;
/*
@@ -2691,13 +2689,13 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page)
if (!page)
return;
- INIT_LIST_HEAD(&page->lru);
+ INIT_LIST_HEAD(&page->slab_list);
n = get_node(cachep, page_to_nid(page));
spin_lock(&n->list_lock);
n->total_slabs++;
if (!page->active) {
- list_add_tail(&page->lru, &(n->slabs_free));
+ list_add_tail(&page->slab_list, &n->slabs_free);
n->free_slabs++;
} else
fixup_slab_list(cachep, n, page, &list);
@@ -2806,9 +2804,9 @@ static inline void fixup_slab_list(struct kmem_cache *cachep,
void **list)
{
/* move slabp to correct slabp list: */
- list_del(&page->lru);
+ list_del(&page->slab_list);
if (page->active == cachep->num) {
- list_add(&page->lru, &n->slabs_full);
+ list_add(&page->slab_list, &n->slabs_full);
if (OBJFREELIST_SLAB(cachep)) {
#if DEBUG
/* Poisoning will be done without holding the lock */
@@ -2822,7 +2820,7 @@ static inline void fixup_slab_list(struct kmem_cache *cachep,
page->freelist = NULL;
}
} else
- list_add(&page->lru, &n->slabs_partial);
+ list_add(&page->slab_list, &n->slabs_partial);
}
/* Try to find non-pfmemalloc slab if needed */
@@ -2845,20 +2843,20 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
}
/* Move pfmemalloc slab to the end of list to speed up next search */
- list_del(&page->lru);
+ list_del(&page->slab_list);
if (!page->active) {
- list_add_tail(&page->lru, &n->slabs_free);
+ list_add_tail(&page->slab_list, &n->slabs_free);
n->free_slabs++;
} else
- list_add_tail(&page->lru, &n->slabs_partial);
+ list_add_tail(&page->slab_list, &n->slabs_partial);
- list_for_each_entry(page, &n->slabs_partial, lru) {
+ list_for_each_entry(page, &n->slabs_partial, slab_list) {
if (!PageSlabPfmemalloc(page))
return page;
}
n->free_touched = 1;
- list_for_each_entry(page, &n->slabs_free, lru) {
+ list_for_each_entry(page, &n->slabs_free, slab_list) {
if (!PageSlabPfmemalloc(page)) {
n->free_slabs--;
return page;
@@ -2873,11 +2871,12 @@ static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
struct page *page;
assert_spin_locked(&n->list_lock);
- page = list_first_entry_or_null(&n->slabs_partial, struct page, lru);
+ page = list_first_entry_or_null(&n->slabs_partial, struct page,
+ slab_list);
if (!page) {
n->free_touched = 1;
page = list_first_entry_or_null(&n->slabs_free, struct page,
- lru);
+ slab_list);
if (page)
n->free_slabs--;
}
@@ -3378,29 +3377,29 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
objp = objpp[i];
page = virt_to_head_page(objp);
- list_del(&page->lru);
+ list_del(&page->slab_list);
check_spinlock_acquired_node(cachep, node);
slab_put_obj(cachep, page, objp);
STATS_DEC_ACTIVE(cachep);
/* fixup slab chains */
if (page->active == 0) {
- list_add(&page->lru, &n->slabs_free);
+ list_add(&page->slab_list, &n->slabs_free);
n->free_slabs++;
} else {
/* Unconditionally move a slab to the end of the
* partial list on free - maximum time for the
* other objects to be freed, too.
*/
- list_add_tail(&page->lru, &n->slabs_partial);
+ list_add_tail(&page->slab_list, &n->slabs_partial);
}
}
while (n->free_objects > n->free_limit && !list_empty(&n->slabs_free)) {
n->free_objects -= cachep->num;
- page = list_last_entry(&n->slabs_free, struct page, lru);
- list_move(&page->lru, list);
+ page = list_last_entry(&n->slabs_free, struct page, slab_list);
+ list_move(&page->slab_list, list);
n->free_slabs--;
n->total_slabs--;
}
@@ -3438,7 +3437,7 @@ free_done:
int i = 0;
struct page *page;
- list_for_each_entry(page, &n->slabs_free, lru) {
+ list_for_each_entry(page, &n->slabs_free, slab_list) {
BUG_ON(page->active);
i++;
@@ -4292,8 +4291,12 @@ static int leaks_show(struct seq_file *m, void *p)
* whole processing.
*/
do {
- set_store_user_clean(cachep);
drain_cpu_caches(cachep);
+ /*
+ * drain_cpu_caches() could make kmemleak_object and
+ * debug_objects_cache dirty, so reset afterwards.
+ */
+ set_store_user_clean(cachep);
x[1] = 0;
@@ -4302,9 +4305,9 @@ static int leaks_show(struct seq_file *m, void *p)
check_irq_on();
spin_lock_irq(&n->list_lock);
- list_for_each_entry(page, &n->slabs_full, lru)
+ list_for_each_entry(page, &n->slabs_full, slab_list)
handle_slab(x, cachep, page);
- list_for_each_entry(page, &n->slabs_partial, lru)
+ list_for_each_entry(page, &n->slabs_partial, slab_list)
handle_slab(x, cachep, page);
spin_unlock_irq(&n->list_lock);
}
diff --git a/mm/slob.c b/mm/slob.c
index 307c2c9feb44..84aefd9b91ee 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -112,13 +112,13 @@ static inline int slob_page_free(struct page *sp)
static void set_slob_page_free(struct page *sp, struct list_head *list)
{
- list_add(&sp->lru, list);
+ list_add(&sp->slab_list, list);
__SetPageSlobFree(sp);
}
static inline void clear_slob_page_free(struct page *sp)
{
- list_del(&sp->lru);
+ list_del(&sp->slab_list);
__ClearPageSlobFree(sp);
}
@@ -213,13 +213,26 @@ static void slob_free_pages(void *b, int order)
}
/*
- * Allocate a slob block within a given slob_page sp.
+ * slob_page_alloc() - Allocate a slob block within a given slob_page sp.
+ * @sp: Page to look in.
+ * @size: Size of the allocation.
+ * @align: Allocation alignment.
+ * @page_removed_from_list: Return parameter.
+ *
+ * Tries to find a chunk of memory at least @size bytes big within @sp.
+ *
+ * Return: Pointer to memory if allocated, %NULL otherwise. If the
+ * allocation fills up @sp then the page is removed from the
+ * freelist; in this case @page_removed_from_list will be set to
+ * true (set to false otherwise).
*/
-static void *slob_page_alloc(struct page *sp, size_t size, int align)
+static void *slob_page_alloc(struct page *sp, size_t size, int align,
+ bool *page_removed_from_list)
{
slob_t *prev, *cur, *aligned = NULL;
int delta = 0, units = SLOB_UNITS(size);
+ *page_removed_from_list = false;
for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) {
slobidx_t avail = slob_units(cur);
@@ -254,8 +267,10 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align)
}
sp->units -= units;
- if (!sp->units)
+ if (!sp->units) {
clear_slob_page_free(sp);
+ *page_removed_from_list = true;
+ }
return cur;
}
if (slob_last(cur))
@@ -269,10 +284,10 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align)
static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
{
struct page *sp;
- struct list_head *prev;
struct list_head *slob_list;
slob_t *b = NULL;
unsigned long flags;
+ bool _unused;
if (size < SLOB_BREAK1)
slob_list = &free_slob_small;
@@ -283,7 +298,8 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
spin_lock_irqsave(&slob_lock, flags);
/* Iterate through each partially free page, try to find room */
- list_for_each_entry(sp, slob_list, lru) {
+ list_for_each_entry(sp, slob_list, slab_list) {
+ bool page_removed_from_list = false;
#ifdef CONFIG_NUMA
/*
* If there's a node specification, search for a partial
@@ -296,18 +312,25 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
if (sp->units < SLOB_UNITS(size))
continue;
- /* Attempt to alloc */
- prev = sp->lru.prev;
- b = slob_page_alloc(sp, size, align);
+ b = slob_page_alloc(sp, size, align, &page_removed_from_list);
if (!b)
continue;
- /* Improve fragment distribution and reduce our average
- * search time by starting our next search here. (see
- * Knuth vol 1, sec 2.5, pg 449) */
- if (prev != slob_list->prev &&
- slob_list->next != prev->next)
- list_move_tail(slob_list, prev->next);
+ /*
+ * If slob_page_alloc() removed sp from the list then we
+ * cannot call list functions on sp. If so allocation
+ * did not fragment the page anyway so optimisation is
+ * unnecessary.
+ */
+ if (!page_removed_from_list) {
+ /*
+ * Improve fragment distribution and reduce our average
+ * search time by starting our next search here. (see
+ * Knuth vol 1, sec 2.5, pg 449)
+ */
+ if (!list_is_first(&sp->slab_list, slob_list))
+ list_rotate_to_front(&sp->slab_list, slob_list);
+ }
break;
}
spin_unlock_irqrestore(&slob_lock, flags);
@@ -323,10 +346,10 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
spin_lock_irqsave(&slob_lock, flags);
sp->units = SLOB_UNITS(PAGE_SIZE);
sp->freelist = b;
- INIT_LIST_HEAD(&sp->lru);
+ INIT_LIST_HEAD(&sp->slab_list);
set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
set_slob_page_free(sp, slob_list);
- b = slob_page_alloc(sp, size, align);
+ b = slob_page_alloc(sp, size, align, &_unused);
BUG_ON(!b);
spin_unlock_irqrestore(&slob_lock, flags);
}
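
list_rotate_to_front(), used above, is added to include/linux/list.h by this series (not shown in this excerpt); its intended effect is to rotate the circular list so that the given entry becomes the first element while the relative order of all entries is preserved, which is what the old prev-pointer dance approximated. A small userspace model of that rotation, with simplified list primitives:

#include <stdio.h>

struct list_node {
    int id;
    struct list_node *prev, *next;
};

/* Rotate so that @entry becomes the first element after @head,
 * by re-inserting the head sentinel just before @entry. */
static void rotate_to_front(struct list_node *entry, struct list_node *head)
{
    /* unlink the head sentinel */
    head->prev->next = head->next;
    head->next->prev = head->prev;
    /* re-link it right before entry */
    head->next = entry;
    head->prev = entry->prev;
    entry->prev->next = head;
    entry->prev = head;
}

int main(void)
{
    struct list_node head = { 0 }, a = { 1 }, b = { 2 }, c = { 3 };
    struct list_node *order[] = { &head, &a, &b, &c };

    /* build the circular list: head -> a -> b -> c -> head */
    for (int i = 0; i < 4; i++) {
        order[i]->next = order[(i + 1) % 4];
        order[i]->prev = order[(i + 3) % 4];
    }

    rotate_to_front(&c, &head);    /* now head -> c -> a -> b -> head */
    for (struct list_node *p = head.next; p != &head; p = p->next)
        printf("%d ", p->id);
    printf("\n");                  /* prints: 3 1 2 */
    return 0;
}
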
diff --git a/mm/slub.c b/mm/slub.c
index 6b28cd2b5a58..e6ce13c54cb0 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -58,10 +58,11 @@
* D. page->frozen -> frozen state
*
* If a slab is frozen then it is exempt from list management. It is not
- * on any list. The processor that froze the slab is the one who can
- * perform list operations on the page. Other processors may put objects
- * onto the freelist but the processor that froze the slab is the only
- * one that can retrieve the objects from the page's freelist.
+ * on any list except per cpu partial list. The processor that froze the
+ * slab is the one who can perform list operations on the page. Other
+ * processors may put objects onto the freelist but the processor that
+ * froze the slab is the only one that can retrieve the objects from the
+ * page's freelist.
*
* The list_lock protects the partial and full list on each node and
* the partial slab counter. If taken then no new slabs may be added or
@@ -1014,7 +1015,7 @@ static void add_full(struct kmem_cache *s,
return;
lockdep_assert_held(&n->list_lock);
- list_add(&page->lru, &n->full);
+ list_add(&page->slab_list, &n->full);
}
static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page)
@@ -1023,7 +1024,7 @@ static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct
return;
lockdep_assert_held(&n->list_lock);
- list_del(&page->lru);
+ list_del(&page->slab_list);
}
/* Tracking of the number of slabs for debugging purposes */
@@ -1764,9 +1765,9 @@ __add_partial(struct kmem_cache_node *n, struct page *page, int tail)
{
n->nr_partial++;
if (tail == DEACTIVATE_TO_TAIL)
- list_add_tail(&page->lru, &n->partial);
+ list_add_tail(&page->slab_list, &n->partial);
else
- list_add(&page->lru, &n->partial);
+ list_add(&page->slab_list, &n->partial);
}
static inline void add_partial(struct kmem_cache_node *n,
@@ -1780,7 +1781,7 @@ static inline void remove_partial(struct kmem_cache_node *n,
struct page *page)
{
lockdep_assert_held(&n->list_lock);
- list_del(&page->lru);
+ list_del(&page->slab_list);
n->nr_partial--;
}
@@ -1854,7 +1855,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
return NULL;
spin_lock(&n->list_lock);
- list_for_each_entry_safe(page, page2, &n->partial, lru) {
+ list_for_each_entry_safe(page, page2, &n->partial, slab_list) {
void *t;
if (!pfmemalloc_match(page, flags))
@@ -1942,7 +1943,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
}
}
} while (read_mems_allowed_retry(cpuset_mems_cookie));
-#endif
+#endif /* CONFIG_NUMA */
return NULL;
}
@@ -2240,7 +2241,7 @@ static void unfreeze_partials(struct kmem_cache *s,
discard_slab(s, page);
stat(s, FREE_SLAB);
}
-#endif
+#endif /* CONFIG_SLUB_CPU_PARTIAL */
}
/*
@@ -2299,7 +2300,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
local_irq_restore(flags);
}
preempt_enable();
-#endif
+#endif /* CONFIG_SLUB_CPU_PARTIAL */
}
static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
@@ -2398,7 +2399,7 @@ static unsigned long count_partial(struct kmem_cache_node *n,
struct page *page;
spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->partial, lru)
+ list_for_each_entry(page, &n->partial, slab_list)
x += get_count(page);
spin_unlock_irqrestore(&n->list_lock, flags);
return x;
@@ -2804,7 +2805,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
}
EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
#endif
-#endif
+#endif /* CONFIG_NUMA */
/*
* Slow path handling. This may still be called frequently since objects
@@ -2903,8 +2904,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
* then add it.
*/
if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
- if (kmem_cache_debug(s))
- remove_full(s, n, page);
+ remove_full(s, n, page);
add_partial(n, page, DEACTIVATE_TO_TAIL);
stat(s, FREE_ADD_PARTIAL);
}
@@ -3696,10 +3696,10 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
BUG_ON(irqs_disabled());
spin_lock_irq(&n->list_lock);
- list_for_each_entry_safe(page, h, &n->partial, lru) {
+ list_for_each_entry_safe(page, h, &n->partial, slab_list) {
if (!page->inuse) {
remove_partial(n, page);
- list_add(&page->lru, &discard);
+ list_add(&page->slab_list, &discard);
} else {
list_slab_objects(s, page,
"Objects remaining in %s on __kmem_cache_shutdown()");
@@ -3707,7 +3707,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
}
spin_unlock_irq(&n->list_lock);
- list_for_each_entry_safe(page, h, &discard, lru)
+ list_for_each_entry_safe(page, h, &discard, slab_list)
discard_slab(s, page);
}
@@ -3839,7 +3839,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
return ret;
}
EXPORT_SYMBOL(__kmalloc_node);
-#endif
+#endif /* CONFIG_NUMA */
#ifdef CONFIG_HARDENED_USERCOPY
/*
@@ -3987,7 +3987,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
* Note that concurrent frees may occur while we hold the
* list_lock. page->inuse here is the upper limit.
*/
- list_for_each_entry_safe(page, t, &n->partial, lru) {
+ list_for_each_entry_safe(page, t, &n->partial, slab_list) {
int free = page->objects - page->inuse;
/* Do not reread page->inuse */
@@ -3997,10 +3997,10 @@ int __kmem_cache_shrink(struct kmem_cache *s)
BUG_ON(free <= 0);
if (free == page->objects) {
- list_move(&page->lru, &discard);
+ list_move(&page->slab_list, &discard);
n->nr_partial--;
} else if (free <= SHRINK_PROMOTE_MAX)
- list_move(&page->lru, promote + free - 1);
+ list_move(&page->slab_list, promote + free - 1);
}
/*
@@ -4013,7 +4013,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
spin_unlock_irqrestore(&n->list_lock, flags);
/* Release empty slabs */
- list_for_each_entry_safe(page, t, &discard, lru)
+ list_for_each_entry_safe(page, t, &discard, slab_list)
discard_slab(s, page);
if (slabs_node(s, node))
@@ -4057,7 +4057,7 @@ void __kmemcg_cache_deactivate(struct kmem_cache *s)
*/
slab_deactivate_memcg_cache_rcu_sched(s, kmemcg_cache_deact_after_rcu);
}
-#endif
+#endif /* CONFIG_MEMCG */
static int slab_mem_going_offline_callback(void *arg)
{
@@ -4205,11 +4205,11 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
for_each_kmem_cache_node(s, node, n) {
struct page *p;
- list_for_each_entry(p, &n->partial, lru)
+ list_for_each_entry(p, &n->partial, slab_list)
p->slab_cache = s;
#ifdef CONFIG_SLUB_DEBUG
- list_for_each_entry(p, &n->full, lru)
+ list_for_each_entry(p, &n->full, slab_list)
p->slab_cache = s;
#endif
}
@@ -4426,7 +4426,7 @@ static int validate_slab_node(struct kmem_cache *s,
spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->partial, lru) {
+ list_for_each_entry(page, &n->partial, slab_list) {
validate_slab_slab(s, page, map);
count++;
}
@@ -4437,7 +4437,7 @@ static int validate_slab_node(struct kmem_cache *s,
if (!(s->flags & SLAB_STORE_USER))
goto out;
- list_for_each_entry(page, &n->full, lru) {
+ list_for_each_entry(page, &n->full, slab_list) {
validate_slab_slab(s, page, map);
count++;
}
@@ -4633,9 +4633,9 @@ static int list_locations(struct kmem_cache *s, char *buf,
continue;
spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->partial, lru)
+ list_for_each_entry(page, &n->partial, slab_list)
process_slab(&t, s, page, alloc, map);
- list_for_each_entry(page, &n->full, lru)
+ list_for_each_entry(page, &n->full, slab_list)
process_slab(&t, s, page, alloc, map);
spin_unlock_irqrestore(&n->list_lock, flags);
}
@@ -4690,7 +4690,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
len += sprintf(buf, "No data\n");
return len;
}
-#endif
+#endif /* CONFIG_SLUB_DEBUG */
#ifdef SLUB_RESILIENCY_TEST
static void __init resiliency_test(void)
@@ -4750,7 +4750,7 @@ static void __init resiliency_test(void)
#ifdef CONFIG_SYSFS
static void resiliency_test(void) {};
#endif
-#endif
+#endif /* SLUB_RESILIENCY_TEST */
#ifdef CONFIG_SYSFS
enum slab_stat_type {
@@ -5103,6 +5103,14 @@ static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
SLAB_ATTR_RO(cache_dma);
#endif
+#ifdef CONFIG_ZONE_DMA32
+static ssize_t cache_dma32_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA32));
+}
+SLAB_ATTR_RO(cache_dma32);
+#endif
+
static ssize_t usersize_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%u\n", s->usersize);
@@ -5407,7 +5415,7 @@ STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
-#endif
+#endif /* CONFIG_SLUB_STATS */
static struct attribute *slab_attrs[] = {
&slab_size_attr.attr,
@@ -5443,6 +5451,9 @@ static struct attribute *slab_attrs[] = {
#ifdef CONFIG_ZONE_DMA
&cache_dma_attr.attr,
#endif
+#ifdef CONFIG_ZONE_DMA32
+ &cache_dma32_attr.attr,
+#endif
#ifdef CONFIG_NUMA
&remote_node_defrag_ratio_attr.attr,
#endif
@@ -5608,7 +5619,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
if (buffer)
free_page((unsigned long)buffer);
-#endif
+#endif /* CONFIG_MEMCG */
}
static void kmem_cache_release(struct kobject *k)
diff --git a/mm/sparse.c b/mm/sparse.c
index 56e057c432f9..fd13166949b5 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -684,10 +684,18 @@ static void free_map_bootmem(struct page *memmap)
#endif /* CONFIG_MEMORY_HOTREMOVE */
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
-/*
- * returns the number of sections whose mem_maps were properly
- * set. If this is <=0, then that means that the passed-in
- * map was not consumed and must be freed.
+/**
+ * sparse_add_one_section - add a memory section
+ * @nid: The node to add section on
+ * @start_pfn: start pfn of the memory range
+ * @altmap: device page map
+ *
+ * This is only intended for hotplug.
+ *
+ * Return:
+ * * 0 - On success.
+ * * -EEXIST - Section is already present.
+ * * -ENOMEM - Out of memory.
*/
int __meminit sparse_add_one_section(int nid, unsigned long start_pfn,
struct vmem_altmap *altmap)
diff --git a/mm/swap.c b/mm/swap.c
index 301ed4e04320..3a75722e68a9 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -867,7 +867,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
SetPageLRU(page);
/*
* Page becomes evictable in two ways:
- * 1) Within LRU lock [munlock_vma_pages() and __munlock_pagevec()].
+ * 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()].
* 2) Before acquiring LRU lock to put the page to correct LRU and then
* a) do PageLRU check with lock [check_move_unevictable_pages]
* b) do PageLRU check before lock [clear_page_mlock]
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 85245fdec8d9..eb714165afd2 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -132,7 +132,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp)
for (i = 0; i < nr; i++) {
VM_BUG_ON_PAGE(xas.xa_index != idx + i, page);
set_page_private(page + i, entry.val + i);
- xas_store(&xas, page + i);
+ xas_store(&xas, page);
xas_next(&xas);
}
address_space->nrpages += nr;
@@ -167,7 +167,7 @@ void __delete_from_swap_cache(struct page *page, swp_entry_t entry)
for (i = 0; i < nr; i++) {
void *entry = xas_store(&xas, NULL);
- VM_BUG_ON_PAGE(entry != page + i, entry);
+ VM_BUG_ON_PAGE(entry != page, entry);
set_page_private(page + i, 0);
xas_next(&xas);
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index cf63b5f01adf..be36f6fe2f8c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1783,8 +1783,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
get_page(page);
- set_pte_at(vma->vm_mm, addr, pte,
- pte_mkold(mk_pte(page, vma->vm_page_prot)));
if (page == swapcache) {
page_add_anon_rmap(page, vma, addr, false);
mem_cgroup_commit_charge(page, memcg, true, false);
@@ -1793,6 +1791,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
}
+ set_pte_at(vma->vm_mm, addr, pte,
+ pte_mkold(mk_pte(page, vma->vm_page_prot)));
swap_free(entry);
/*
* Move the page to the active list so it is not
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index d59b5a73dfb3..9932d5755e4c 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -271,8 +271,7 @@ retry:
*/
idx = linear_page_index(dst_vma, dst_addr);
mapping = dst_vma->vm_file->f_mapping;
- hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
- idx, dst_addr);
+ hash = hugetlb_fault_mutex_hash(h, mapping, idx, dst_addr);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
err = -ENOMEM;
diff --git a/mm/util.c b/mm/util.c
index 43a2984bccaa..e2e4f8c3fa12 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -318,7 +318,7 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast);
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
* @nr_pages: number of pages from start to pin
- * @write: whether pages will be written to
+ * @gup_flags: flags modifying pin behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
*
@@ -339,10 +339,10 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast);
* were pinned, returns -errno.
*/
int __weak get_user_pages_fast(unsigned long start,
- int nr_pages, int write, struct page **pages)
+ int nr_pages, unsigned int gup_flags,
+ struct page **pages)
{
- return get_user_pages_unlocked(start, nr_pages, pages,
- write ? FOLL_WRITE : 0);
+ return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
}
EXPORT_SYMBOL_GPL(get_user_pages_fast);
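
With this change the third argument of get_user_pages_fast() becomes a gup_flags bitmask rather than a write boolean, so callers pass FOLL_WRITE (or 0) instead of 1 (or 0). A hypothetical call-site conversion, shown for illustration only; the real conversions live in other hunks of this series:

-	got = get_user_pages_fast(uaddr, nr_pages, 1, pages);
+	got = get_user_pages_fast(uaddr, nr_pages, FOLL_WRITE, pages);
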
@@ -652,7 +652,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed);
*/
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
- long free, allowed, reserve;
+ long allowed;
VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
-(s64)vm_committed_as_batch * num_online_cpus(),
@@ -667,51 +667,9 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
return 0;
if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
- free = global_zone_page_state(NR_FREE_PAGES);
- free += global_node_page_state(NR_FILE_PAGES);
-
- /*
- * shmem pages shouldn't be counted as free in this
- * case, they can't be purged, only swapped out, and
- * that won't affect the overall amount of available
- * memory in the system.
- */
- free -= global_node_page_state(NR_SHMEM);
-
- free += get_nr_swap_pages();
-
- /*
- * Any slabs which are created with the
- * SLAB_RECLAIM_ACCOUNT flag claim to have contents
- * which are reclaimable, under pressure. The dentry
- * cache and most inode caches should fall into this
- */
- free += global_node_page_state(NR_SLAB_RECLAIMABLE);
-
- /*
- * Part of the kernel memory, which can be released
- * under memory pressure.
- */
- free += global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
-
- /*
- * Leave reserved pages. The pages are not for anonymous pages.
- */
- if (free <= totalreserve_pages)
+ if (pages > totalram_pages() + total_swap_pages)
goto error;
- else
- free -= totalreserve_pages;
-
- /*
- * Reserve some for root
- */
- if (!cap_sys_admin)
- free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
-
- if (free > pages)
- return 0;
-
- goto error;
+ return 0;
}
allowed = vm_commit_limit();
@@ -725,7 +683,8 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
* Don't let a single process grow so big a user can't recover
*/
if (mm) {
- reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
+ long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
+
allowed -= min_t(long, mm->total_vm / 32, reserve);
}
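
The OVERCOMMIT_GUESS branch of __vm_enough_memory() is reduced above to a single comparison: a request fails only if it exceeds total RAM plus total swap, with no free-page accounting and no root reserve. A toy model of the new check, with made-up sizes:

#include <stdio.h>
#include <stdbool.h>

/* Toy of the simplified OVERCOMMIT_GUESS rule: only requests larger than
 * RAM + swap are refused (all values below are invented for illustration). */
static bool enough_memory_guess(long pages, long totalram, long totalswap)
{
    return pages <= totalram + totalswap;
}

int main(void)
{
    long totalram = 4L << 18;    /* 4 GiB in 4 KiB pages */
    long totalswap = 1L << 18;   /* 1 GiB of swap */

    printf("%d\n", enough_memory_guess(1L << 18, totalram, totalswap));    /* 1: fits */
    printf("%d\n", enough_memory_guess(6L << 18, totalram, totalswap));    /* 0: too big */
    return 0;
}
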
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e5e9e1fcac01..22caff913f7d 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -32,6 +32,7 @@
#include <linux/compiler.h>
#include <linux/llist.h>
#include <linux/bitops.h>
+#include <linux/rbtree_augmented.h>
#include <linux/uaccess.h>
#include <asm/tlbflush.h>
@@ -324,6 +325,9 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
/*** Global kva allocator ***/
+#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
+#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
+
#define VM_LAZY_FREE 0x02
#define VM_VM_AREA 0x04
@@ -332,14 +336,74 @@ static DEFINE_SPINLOCK(vmap_area_lock);
LIST_HEAD(vmap_area_list);
static LLIST_HEAD(vmap_purge_list);
static struct rb_root vmap_area_root = RB_ROOT;
+static bool vmap_initialized __read_mostly;
+
+/*
+ * This kmem_cache is used for vmap_area objects. Instead of
+ * allocating from slab we reuse an object from this cache to
+ * make things faster. Especially in "no edge" splitting of
+ * free block.
+ */
+static struct kmem_cache *vmap_area_cachep;
+
+/*
+ * This linked list is used in pair with free_vmap_area_root.
+ * It gives O(1) access to prev/next to perform fast coalescing.
+ */
+static LIST_HEAD(free_vmap_area_list);
+
+/*
+ * This augment red-black tree represents the free vmap space.
+ * All vmap_area objects in this tree are sorted by va->va_start
+ * address. It is used for allocation and merging when a vmap
+ * object is released.
+ *
+ * Each vmap_area node contains a maximum available free block
+ * of its sub-tree, right or left. Therefore it is possible to
+ * find a lowest match of free area.
+ */
+static struct rb_root free_vmap_area_root = RB_ROOT;
+
+static __always_inline unsigned long
+va_size(struct vmap_area *va)
+{
+ return (va->va_end - va->va_start);
+}
+
+static __always_inline unsigned long
+get_subtree_max_size(struct rb_node *node)
+{
+ struct vmap_area *va;
+
+ va = rb_entry_safe(node, struct vmap_area, rb_node);
+ return va ? va->subtree_max_size : 0;
+}
+
+/*
+ * Gets called when remove the node and rotate.
+ */
+static __always_inline unsigned long
+compute_subtree_max_size(struct vmap_area *va)
+{
+ return max3(va_size(va),
+ get_subtree_max_size(va->rb_node.rb_left),
+ get_subtree_max_size(va->rb_node.rb_right));
+}
+
+RB_DECLARE_CALLBACKS(static, free_vmap_area_rb_augment_cb,
+ struct vmap_area, rb_node, unsigned long, subtree_max_size,
+ compute_subtree_max_size)
-/* The vmap cache globals are protected by vmap_area_lock */
-static struct rb_node *free_vmap_cache;
-static unsigned long cached_hole_size;
-static unsigned long cached_vstart;
-static unsigned long cached_align;
+static void purge_vmap_area_lazy(void);
+static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
+static unsigned long lazy_max_pages(void);
+
+static atomic_long_t nr_vmalloc_pages;
-static unsigned long vmap_area_pcpu_hole;
+unsigned long vmalloc_nr_pages(void)
+{
+ return atomic_long_read(&nr_vmalloc_pages);
+}
static struct vmap_area *__find_vmap_area(unsigned long addr)
{
@@ -360,41 +424,610 @@ static struct vmap_area *__find_vmap_area(unsigned long addr)
return NULL;
}
-static void __insert_vmap_area(struct vmap_area *va)
-{
- struct rb_node **p = &vmap_area_root.rb_node;
- struct rb_node *parent = NULL;
- struct rb_node *tmp;
+/*
+ * This function returns back addresses of parent node
+ * and its left or right link for further processing.
+ */
+static __always_inline struct rb_node **
+find_va_links(struct vmap_area *va,
+ struct rb_root *root, struct rb_node *from,
+ struct rb_node **parent)
+{
+ struct vmap_area *tmp_va;
+ struct rb_node **link;
+
+ if (root) {
+ link = &root->rb_node;
+ if (unlikely(!*link)) {
+ *parent = NULL;
+ return link;
+ }
+ } else {
+ link = &from;
+ }
- while (*p) {
- struct vmap_area *tmp_va;
+ /*
+ * Go to the bottom of the tree. When we hit the last point
+ * we end up with the parent rb_node and the correct direction,
+ * named link here, to which the new va->rb_node will be attached.
+ */
+ do {
+ tmp_va = rb_entry(*link, struct vmap_area, rb_node);
- parent = *p;
- tmp_va = rb_entry(parent, struct vmap_area, rb_node);
- if (va->va_start < tmp_va->va_end)
- p = &(*p)->rb_left;
- else if (va->va_end > tmp_va->va_start)
- p = &(*p)->rb_right;
+ /*
+ * During the traversal we also do some sanity checks.
+ * Trigger the BUG() if the new VA partially or fully
+ * overlaps an existing one.
+ */
+ if (va->va_start < tmp_va->va_end &&
+ va->va_end <= tmp_va->va_start)
+ link = &(*link)->rb_left;
+ else if (va->va_end > tmp_va->va_start &&
+ va->va_start >= tmp_va->va_end)
+ link = &(*link)->rb_right;
else
BUG();
+ } while (*link);
+
+ *parent = &tmp_va->rb_node;
+ return link;
+}
+
+static __always_inline struct list_head *
+get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
+{
+ struct list_head *list;
+
+ if (unlikely(!parent))
+ /*
+ * The red-black tree where we try to find VA neighbors
+ * before merging or inserting is empty, i.e. it means
+ * there is no free vmap space. Normally it does not
+ * happen but we handle this case anyway.
+ */
+ return NULL;
+
+ list = &rb_entry(parent, struct vmap_area, rb_node)->list;
+ return (&parent->rb_right == link ? list->next : list);
+}
+
+static __always_inline void
+link_va(struct vmap_area *va, struct rb_root *root,
+ struct rb_node *parent, struct rb_node **link, struct list_head *head)
+{
+ /*
+ * VA is still not in the list, but we can
+ * identify its future previous list_head node.
+ */
+ if (likely(parent)) {
+ head = &rb_entry(parent, struct vmap_area, rb_node)->list;
+ if (&parent->rb_right != link)
+ head = head->prev;
}
- rb_link_node(&va->rb_node, parent, p);
- rb_insert_color(&va->rb_node, &vmap_area_root);
+ /* Insert to the rb-tree */
+ rb_link_node(&va->rb_node, parent, link);
+ if (root == &free_vmap_area_root) {
+ /*
+ * Some explanation here. Just perform simple insertion
+ * to the tree. We do not set va->subtree_max_size to
+ * its current size before calling rb_insert_augmented().
+ * This is because we populate the tree from the bottom
+ * to parent levels when the node _is_ in the tree.
+ *
+ * Therefore we set subtree_max_size to zero after insertion,
+ * to let augment_tree_propagate_from() put everything in
+ * the correct order later on.
+ */
+ rb_insert_augmented(&va->rb_node,
+ root, &free_vmap_area_rb_augment_cb);
+ va->subtree_max_size = 0;
+ } else {
+ rb_insert_color(&va->rb_node, root);
+ }
- /* address-sort this list */
- tmp = rb_prev(&va->rb_node);
- if (tmp) {
- struct vmap_area *prev;
- prev = rb_entry(tmp, struct vmap_area, rb_node);
- list_add_rcu(&va->list, &prev->list);
- } else
- list_add_rcu(&va->list, &vmap_area_list);
+ /* Address-sort this list */
+ list_add(&va->list, head);
}
-static void purge_vmap_area_lazy(void);
+static __always_inline void
+unlink_va(struct vmap_area *va, struct rb_root *root)
+{
+ /*
+ * During merging a VA node can be empty, therefore
+ * not linked with the tree nor list. Just check it.
+ */
+ if (!RB_EMPTY_NODE(&va->rb_node)) {
+ if (root == &free_vmap_area_root)
+ rb_erase_augmented(&va->rb_node,
+ root, &free_vmap_area_rb_augment_cb);
+ else
+ rb_erase(&va->rb_node, root);
-static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
+ list_del(&va->list);
+ RB_CLEAR_NODE(&va->rb_node);
+ }
+}
+
+#if DEBUG_AUGMENT_PROPAGATE_CHECK
+static void
+augment_tree_propagate_check(struct rb_node *n)
+{
+ struct vmap_area *va;
+ struct rb_node *node;
+ unsigned long size;
+ bool found = false;
+
+ if (n == NULL)
+ return;
+
+ va = rb_entry(n, struct vmap_area, rb_node);
+ size = va->subtree_max_size;
+ node = n;
+
+ while (node) {
+ va = rb_entry(node, struct vmap_area, rb_node);
+
+ if (get_subtree_max_size(node->rb_left) == size) {
+ node = node->rb_left;
+ } else {
+ if (va_size(va) == size) {
+ found = true;
+ break;
+ }
+
+ node = node->rb_right;
+ }
+ }
+
+ if (!found) {
+ va = rb_entry(n, struct vmap_area, rb_node);
+ pr_emerg("tree is corrupted: %lu, %lu\n",
+ va_size(va), va->subtree_max_size);
+ }
+
+ augment_tree_propagate_check(n->rb_left);
+ augment_tree_propagate_check(n->rb_right);
+}
+#endif
+
+/*
+ * This function populates subtree_max_size from bottom to upper
+ * levels starting from VA point. The propagation must be done
+ * when VA size is modified by changing its va_start/va_end, or
+ * when a new VA is inserted into the tree.
+ *
+ * It means that augment_tree_propagate_from() must be called:
+ * - After VA has been inserted into the tree (free path);
+ * - After VA has been shrunk (allocation path);
+ * - After VA has been increased (merging path).
+ *
+ * Please note that, it does not mean that upper parent nodes
+ * and their subtree_max_size are recalculated all the time up
+ * to the root node.
+ *
+ * 4--8
+ * /\
+ * / \
+ * / \
+ * 2--2 8--8
+ *
+ * For example if we modify the node 4, shrinking it to 2, then
+ * no modification is required. If we shrink the node 2 to 1,
+ * only its subtree_max_size is updated, and set to 1. If we shrink
+ * the node 8 to 6, then its subtree_max_size is set to 6 and parent
+ * node becomes 4--6.
+ */
+static __always_inline void
+augment_tree_propagate_from(struct vmap_area *va)
+{
+ struct rb_node *node = &va->rb_node;
+ unsigned long new_va_sub_max_size;
+
+ while (node) {
+ va = rb_entry(node, struct vmap_area, rb_node);
+ new_va_sub_max_size = compute_subtree_max_size(va);
+
+ /*
+ * If the newly calculated maximum available size of the
+ * subtree is equal to the current one, then it means that
+ * the tree is propagated correctly. So we have to stop at
+ * this point to save cycles.
+ */
+ if (va->subtree_max_size == new_va_sub_max_size)
+ break;
+
+ va->subtree_max_size = new_va_sub_max_size;
+ node = rb_parent(&va->rb_node);
+ }
+
+#if DEBUG_AUGMENT_PROPAGATE_CHECK
+ augment_tree_propagate_check(free_vmap_area_root.rb_node);
+#endif
+}
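
The propagation above relies on every rb-tree node caching the largest free
size found anywhere in its subtree. As a rough standalone sketch (not part of
the patch; node_size(), subtree_max() and compute_max() are simplified
stand-ins for the kernel's va_size(), get_subtree_max_size() and
compute_subtree_max_size() helpers referenced above), this is how that cached
value is recomputed for a single node:

/*
 * Simplified, userspace-style sketch of the per-node maximum
 * used by the augmented free-space tree. Not kernel code.
 */
struct node {
	unsigned long va_start, va_end;	/* free range covered by this node */
	unsigned long subtree_max_size;	/* max free size in this subtree   */
	struct node *left, *right;
};

static unsigned long node_size(const struct node *n)
{
	return n->va_end - n->va_start;
}

static unsigned long subtree_max(const struct node *n)
{
	return n ? n->subtree_max_size : 0;
}

/* Max of the node's own size and both children's cached maxima. */
static unsigned long compute_max(const struct node *n)
{
	unsigned long max = node_size(n);

	if (subtree_max(n->left) > max)
		max = subtree_max(n->left);
	if (subtree_max(n->right) > max)
		max = subtree_max(n->right);
	return max;
}

Walking from a modified node toward the root and stopping as soon as the
recomputed value matches the cached one is exactly what keeps the propagation
cheap in the common case.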
+
+static void
+insert_vmap_area(struct vmap_area *va,
+ struct rb_root *root, struct list_head *head)
+{
+ struct rb_node **link;
+ struct rb_node *parent;
+
+ link = find_va_links(va, root, NULL, &parent);
+ link_va(va, root, parent, link, head);
+}
+
+static void
+insert_vmap_area_augment(struct vmap_area *va,
+ struct rb_node *from, struct rb_root *root,
+ struct list_head *head)
+{
+ struct rb_node **link;
+ struct rb_node *parent;
+
+ if (from)
+ link = find_va_links(va, NULL, from, &parent);
+ else
+ link = find_va_links(va, root, NULL, &parent);
+
+ link_va(va, root, parent, link, head);
+ augment_tree_propagate_from(va);
+}
+
+/*
+ * Merge a de-allocated chunk of VA memory with the previous
+ * and next free blocks. If no coalescing is possible, a new
+ * free area is inserted instead. If the VA has been merged,
+ * its vmap_area object is freed.
+ */
+static __always_inline void
+merge_or_add_vmap_area(struct vmap_area *va,
+ struct rb_root *root, struct list_head *head)
+{
+ struct vmap_area *sibling;
+ struct list_head *next;
+ struct rb_node **link;
+ struct rb_node *parent;
+ bool merged = false;
+
+ /*
+ * Find a place in the tree where the VA would be inserted,
+ * unless it can be merged with its sibling(s).
+ */
+ link = find_va_links(va, root, NULL, &parent);
+
+ /*
+ * Get the next list node of the VA to check whether merging can be done.
+ */
+ next = get_va_next_sibling(parent, link);
+ if (unlikely(next == NULL))
+ goto insert;
+
+ /*
+ * start end
+ * | |
+ * |<------VA------>|<-----Next----->|
+ * | |
+ * start end
+ */
+ if (next != head) {
+ sibling = list_entry(next, struct vmap_area, list);
+ if (sibling->va_start == va->va_end) {
+ sibling->va_start = va->va_start;
+
+ /* Check and update the tree if needed. */
+ augment_tree_propagate_from(sibling);
+
+ /* Remove this VA, it has been merged. */
+ unlink_va(va, root);
+
+ /* Free vmap_area object. */
+ kmem_cache_free(vmap_area_cachep, va);
+
+ /* Point to the new merged area. */
+ va = sibling;
+ merged = true;
+ }
+ }
+
+ /*
+ * start end
+ * | |
+ * |<-----Prev----->|<------VA------>|
+ * | |
+ * start end
+ */
+ if (next->prev != head) {
+ sibling = list_entry(next->prev, struct vmap_area, list);
+ if (sibling->va_end == va->va_start) {
+ sibling->va_end = va->va_end;
+
+ /* Check and update the tree if needed. */
+ augment_tree_propagate_from(sibling);
+
+ /* Remove this VA, it has been merged. */
+ unlink_va(va, root);
+
+ /* Free vmap_area object. */
+ kmem_cache_free(vmap_area_cachep, va);
+
+ return;
+ }
+ }
+
+insert:
+ if (!merged) {
+ link_va(va, root, parent, link, head);
+ augment_tree_propagate_from(va);
+ }
+}
+
+static __always_inline bool
+is_within_this_va(struct vmap_area *va, unsigned long size,
+ unsigned long align, unsigned long vstart)
+{
+ unsigned long nva_start_addr;
+
+ if (va->va_start > vstart)
+ nva_start_addr = ALIGN(va->va_start, align);
+ else
+ nva_start_addr = ALIGN(vstart, align);
+
+ /* Can overflow due to a big size or alignment. */
+ if (nva_start_addr + size < nva_start_addr ||
+ nva_start_addr < vstart)
+ return false;
+
+ return (nva_start_addr + size <= va->va_end);
+}
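
The overflow guard above depends on unsigned wrap-around: a request that would
run past the top of the address space makes the sum smaller than its base. A
minimal standalone illustration (not part of the patch, made-up values):

#include <stdio.h>

int main(void)
{
	unsigned long addr = ~0UL - 10;	/* near the top of the address space */
	unsigned long size = 100;

	if (addr + size < addr)
		printf("request overflows the address space\n");
	return 0;
}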
+
+/*
+ * Find the first free block (lowest start address) in the tree
+ * that can satisfy the request described by the passed
+ * parameters.
+ */
+static __always_inline struct vmap_area *
+find_vmap_lowest_match(unsigned long size,
+ unsigned long align, unsigned long vstart)
+{
+ struct vmap_area *va;
+ struct rb_node *node;
+ unsigned long length;
+
+ /* Start from the root. */
+ node = free_vmap_area_root.rb_node;
+
+ /* Adjust the search size for alignment overhead. */
+ length = size + align - 1;
+
+ while (node) {
+ va = rb_entry(node, struct vmap_area, rb_node);
+
+ if (get_subtree_max_size(node->rb_left) >= length &&
+ vstart < va->va_start) {
+ node = node->rb_left;
+ } else {
+ if (is_within_this_va(va, size, align, vstart))
+ return va;
+
+ /*
+ * It does not make sense to descend into the right
+ * sub-tree if it does not have a free block that is
+ * equal to or bigger than the requested search length.
+ */
+ if (get_subtree_max_size(node->rb_right) >= length) {
+ node = node->rb_right;
+ continue;
+ }
+
+ /*
+ * OK. We roll back and find the first right sub-tree
+ * that satisfies the search criteria. This can happen
+ * only once due to the "vstart" restriction.
+ */
+ while ((node = rb_parent(node))) {
+ va = rb_entry(node, struct vmap_area, rb_node);
+ if (is_within_this_va(va, size, align, vstart))
+ return va;
+
+ if (get_subtree_max_size(node->rb_right) >= length &&
+ vstart <= va->va_start) {
+ node = node->rb_right;
+ break;
+ }
+ }
+ }
+ }
+
+ return NULL;
+}
+
+#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
+#include <linux/random.h>
+
+static struct vmap_area *
+find_vmap_lowest_linear_match(unsigned long size,
+ unsigned long align, unsigned long vstart)
+{
+ struct vmap_area *va;
+
+ list_for_each_entry(va, &free_vmap_area_list, list) {
+ if (!is_within_this_va(va, size, align, vstart))
+ continue;
+
+ return va;
+ }
+
+ return NULL;
+}
+
+static void
+find_vmap_lowest_match_check(unsigned long size)
+{
+ struct vmap_area *va_1, *va_2;
+ unsigned long vstart;
+ unsigned int rnd;
+
+ get_random_bytes(&rnd, sizeof(rnd));
+ vstart = VMALLOC_START + rnd;
+
+ va_1 = find_vmap_lowest_match(size, 1, vstart);
+ va_2 = find_vmap_lowest_linear_match(size, 1, vstart);
+
+ if (va_1 != va_2)
+ pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
+ va_1, va_2, vstart);
+}
+#endif
+
+enum fit_type {
+ NOTHING_FIT = 0,
+ FL_FIT_TYPE = 1, /* full fit */
+ LE_FIT_TYPE = 2, /* left edge fit */
+ RE_FIT_TYPE = 3, /* right edge fit */
+ NE_FIT_TYPE = 4 /* no edge fit */
+};
+
+static __always_inline enum fit_type
+classify_va_fit_type(struct vmap_area *va,
+ unsigned long nva_start_addr, unsigned long size)
+{
+ enum fit_type type;
+
+ /* Check if it is within VA. */
+ if (nva_start_addr < va->va_start ||
+ nva_start_addr + size > va->va_end)
+ return NOTHING_FIT;
+
+ /* Now classify. */
+ if (va->va_start == nva_start_addr) {
+ if (va->va_end == nva_start_addr + size)
+ type = FL_FIT_TYPE;
+ else
+ type = LE_FIT_TYPE;
+ } else if (va->va_end == nva_start_addr + size) {
+ type = RE_FIT_TYPE;
+ } else {
+ type = NE_FIT_TYPE;
+ }
+
+ return type;
+}
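
To make the four fit types concrete, here is a small standalone userspace
program (illustration only, not part of the patch) that re-implements the same
classification over a plain [start, end) free block; the enum values below are
local stand-ins for the kernel's fit_type names:

#include <stdio.h>

enum fit { NOTHING, FULL, LEFT_EDGE, RIGHT_EDGE, NO_EDGE };

static const char * const names[] = {
	"NOTHING", "FULL", "LEFT_EDGE", "RIGHT_EDGE", "NO_EDGE"
};

static enum fit classify(unsigned long s, unsigned long e,
			 unsigned long a, unsigned long sz)
{
	if (a < s || a + sz > e)
		return NOTHING;
	if (a == s)
		return (a + sz == e) ? FULL : LEFT_EDGE;
	if (a + sz == e)
		return RIGHT_EDGE;
	return NO_EDGE;
}

int main(void)
{
	/* Free block [100, 200), requests of various offsets/sizes. */
	printf("%s\n", names[classify(100, 200, 100, 100)]); /* FULL       */
	printf("%s\n", names[classify(100, 200, 100,  50)]); /* LEFT_EDGE  */
	printf("%s\n", names[classify(100, 200, 150,  50)]); /* RIGHT_EDGE */
	printf("%s\n", names[classify(100, 200, 120,  50)]); /* NO_EDGE    */
	return 0;
}

A full fit removes the free block entirely, the two edge fits shrink it in
place, and the no-edge case forces the split handled by
adjust_va_to_fit_type() below.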
+
+static __always_inline int
+adjust_va_to_fit_type(struct vmap_area *va,
+ unsigned long nva_start_addr, unsigned long size,
+ enum fit_type type)
+{
+ struct vmap_area *lva;
+
+ if (type == FL_FIT_TYPE) {
+ /*
+ * No need to split VA, it fully fits.
+ *
+ * | |
+ * V NVA V
+ * |---------------|
+ */
+ unlink_va(va, &free_vmap_area_root);
+ kmem_cache_free(vmap_area_cachep, va);
+ } else if (type == LE_FIT_TYPE) {
+ /*
+ * Split left edge of fit VA.
+ *
+ * | |
+ * V NVA V R
+ * |-------|-------|
+ */
+ va->va_start += size;
+ } else if (type == RE_FIT_TYPE) {
+ /*
+ * Split right edge of fit VA.
+ *
+ * | |
+ * L V NVA V
+ * |-------|-------|
+ */
+ va->va_end = nva_start_addr;
+ } else if (type == NE_FIT_TYPE) {
+ /*
+ * Split no edge of fit VA.
+ *
+ * | |
+ * L V NVA V R
+ * |---|-------|---|
+ */
+ lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
+ if (unlikely(!lva))
+ return -1;
+
+ /*
+ * Build the remainder.
+ */
+ lva->va_start = va->va_start;
+ lva->va_end = nva_start_addr;
+
+ /*
+ * Shrink this VA to the remaining size.
+ */
+ va->va_start = nva_start_addr + size;
+ } else {
+ return -1;
+ }
+
+ if (type != FL_FIT_TYPE) {
+ augment_tree_propagate_from(va);
+
+ if (type == NE_FIT_TYPE)
+ insert_vmap_area_augment(lva, &va->rb_node,
+ &free_vmap_area_root, &free_vmap_area_list);
+ }
+
+ return 0;
+}
+
+/*
+ * Returns the start address of the newly allocated area on success.
+ * Otherwise "vend" is returned to indicate failure.
+ */
+static __always_inline unsigned long
+__alloc_vmap_area(unsigned long size, unsigned long align,
+ unsigned long vstart, unsigned long vend, int node)
+{
+ unsigned long nva_start_addr;
+ struct vmap_area *va;
+ enum fit_type type;
+ int ret;
+
+ va = find_vmap_lowest_match(size, align, vstart);
+ if (unlikely(!va))
+ return vend;
+
+ if (va->va_start > vstart)
+ nva_start_addr = ALIGN(va->va_start, align);
+ else
+ nva_start_addr = ALIGN(vstart, align);
+
+ /* Check the "vend" restriction. */
+ if (nva_start_addr + size > vend)
+ return vend;
+
+ /* Classify what we have found. */
+ type = classify_va_fit_type(va, nva_start_addr, size);
+ if (WARN_ON_ONCE(type == NOTHING_FIT))
+ return vend;
+
+ /* Update the free vmap_area. */
+ ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);
+ if (ret)
+ return vend;
+
+#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
+ find_vmap_lowest_match_check(size);
+#endif
+
+ return nva_start_addr;
+}
/*
* Allocate a region of KVA of the specified size and alignment, within the
@@ -406,18 +1039,19 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
int node, gfp_t gfp_mask)
{
struct vmap_area *va;
- struct rb_node *n;
unsigned long addr;
int purged = 0;
- struct vmap_area *first;
BUG_ON(!size);
BUG_ON(offset_in_page(size));
BUG_ON(!is_power_of_2(align));
+ if (unlikely(!vmap_initialized))
+ return ERR_PTR(-EBUSY);
+
might_sleep();
- va = kmalloc_node(sizeof(struct vmap_area),
+ va = kmem_cache_alloc_node(vmap_area_cachep,
gfp_mask & GFP_RECLAIM_MASK, node);
if (unlikely(!va))
return ERR_PTR(-ENOMEM);
@@ -430,87 +1064,20 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
retry:
spin_lock(&vmap_area_lock);
- /*
- * Invalidate cache if we have more permissive parameters.
- * cached_hole_size notes the largest hole noticed _below_
- * the vmap_area cached in free_vmap_cache: if size fits
- * into that hole, we want to scan from vstart to reuse
- * the hole instead of allocating above free_vmap_cache.
- * Note that __free_vmap_area may update free_vmap_cache
- * without updating cached_hole_size or cached_align.
- */
- if (!free_vmap_cache ||
- size < cached_hole_size ||
- vstart < cached_vstart ||
- align < cached_align) {
-nocache:
- cached_hole_size = 0;
- free_vmap_cache = NULL;
- }
- /* record if we encounter less permissive parameters */
- cached_vstart = vstart;
- cached_align = align;
-
- /* find starting point for our search */
- if (free_vmap_cache) {
- first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
- addr = ALIGN(first->va_end, align);
- if (addr < vstart)
- goto nocache;
- if (addr + size < addr)
- goto overflow;
-
- } else {
- addr = ALIGN(vstart, align);
- if (addr + size < addr)
- goto overflow;
-
- n = vmap_area_root.rb_node;
- first = NULL;
-
- while (n) {
- struct vmap_area *tmp;
- tmp = rb_entry(n, struct vmap_area, rb_node);
- if (tmp->va_end >= addr) {
- first = tmp;
- if (tmp->va_start <= addr)
- break;
- n = n->rb_left;
- } else
- n = n->rb_right;
- }
-
- if (!first)
- goto found;
- }
-
- /* from the starting point, walk areas until a suitable hole is found */
- while (addr + size > first->va_start && addr + size <= vend) {
- if (addr + cached_hole_size < first->va_start)
- cached_hole_size = first->va_start - addr;
- addr = ALIGN(first->va_end, align);
- if (addr + size < addr)
- goto overflow;
-
- if (list_is_last(&first->list, &vmap_area_list))
- goto found;
- first = list_next_entry(first, list);
- }
-
-found:
/*
- * Check also calculated address against the vstart,
- * because it can be 0 because of big align request.
+ * If an allocation fails, the "vend" address is
+ * returned. In that case trigger the overflow path.
*/
- if (addr + size > vend || addr < vstart)
+ addr = __alloc_vmap_area(size, align, vstart, vend, node);
+ if (unlikely(addr == vend))
goto overflow;
va->va_start = addr;
va->va_end = addr + size;
va->flags = 0;
- __insert_vmap_area(va);
- free_vmap_cache = &va->rb_node;
+ insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+
spin_unlock(&vmap_area_lock);
BUG_ON(!IS_ALIGNED(va->va_start, align));
@@ -539,7 +1106,8 @@ overflow:
if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
size);
- kfree(va);
+
+ kmem_cache_free(vmap_area_cachep, va);
return ERR_PTR(-EBUSY);
}
@@ -559,35 +1127,16 @@ static void __free_vmap_area(struct vmap_area *va)
{
BUG_ON(RB_EMPTY_NODE(&va->rb_node));
- if (free_vmap_cache) {
- if (va->va_end < cached_vstart) {
- free_vmap_cache = NULL;
- } else {
- struct vmap_area *cache;
- cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
- if (va->va_start <= cache->va_start) {
- free_vmap_cache = rb_prev(&va->rb_node);
- /*
- * We don't try to update cached_hole_size or
- * cached_align, but it won't go very wrong.
- */
- }
- }
- }
- rb_erase(&va->rb_node, &vmap_area_root);
- RB_CLEAR_NODE(&va->rb_node);
- list_del_rcu(&va->list);
-
/*
- * Track the highest possible candidate for pcpu area
- * allocation. Areas outside of vmalloc area can be returned
- * here too, consider only end addresses which fall inside
- * vmalloc area proper.
+ * Remove from the busy tree/list.
*/
- if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
- vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);
+ unlink_va(va, &vmap_area_root);
- kfree_rcu(va, rcu_head);
+ /*
+ * Merge VA with its neighbors, otherwise just add it.
+ */
+ merge_or_add_vmap_area(va,
+ &free_vmap_area_root, &free_vmap_area_list);
}
/*
@@ -633,7 +1182,7 @@ static unsigned long lazy_max_pages(void)
return log * (32UL * 1024 * 1024 / PAGE_SIZE);
}
-static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
+static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
/*
* Serialize vmap purging. There is no actual criticial section protected
@@ -651,7 +1200,7 @@ static void purge_fragmented_blocks_allcpus(void);
*/
void set_iounmap_nonlazy(void)
{
- atomic_set(&vmap_lazy_nr, lazy_max_pages()+1);
+ atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1);
}
/*
@@ -659,34 +1208,40 @@ void set_iounmap_nonlazy(void)
*/
static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
{
+ unsigned long resched_threshold;
struct llist_node *valist;
struct vmap_area *va;
struct vmap_area *n_va;
- bool do_free = false;
lockdep_assert_held(&vmap_purge_lock);
valist = llist_del_all(&vmap_purge_list);
+ if (unlikely(valist == NULL))
+ return false;
+
+ /*
+ * TODO: calculate the flush range without looping.
+ * The list can contain up to lazy_max_pages() elements.
+ */
llist_for_each_entry(va, valist, purge_list) {
if (va->va_start < start)
start = va->va_start;
if (va->va_end > end)
end = va->va_end;
- do_free = true;
}
- if (!do_free)
- return false;
-
flush_tlb_kernel_range(start, end);
+ resched_threshold = lazy_max_pages() << 1;
spin_lock(&vmap_area_lock);
llist_for_each_entry_safe(va, n_va, valist, purge_list) {
- int nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
+ unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
__free_vmap_area(va);
- atomic_sub(nr, &vmap_lazy_nr);
- cond_resched_lock(&vmap_area_lock);
+ atomic_long_sub(nr, &vmap_lazy_nr);
+
+ if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
+ cond_resched_lock(&vmap_area_lock);
}
spin_unlock(&vmap_area_lock);
return true;
@@ -722,10 +1277,10 @@ static void purge_vmap_area_lazy(void)
*/
static void free_vmap_area_noflush(struct vmap_area *va)
{
- int nr_lazy;
+ unsigned long nr_lazy;
- nr_lazy = atomic_add_return((va->va_end - va->va_start) >> PAGE_SHIFT,
- &vmap_lazy_nr);
+ nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
+ PAGE_SHIFT, &vmap_lazy_nr);
/* After this point, we may free va at any time */
llist_add(&va->purge_list, &vmap_purge_list);
@@ -788,8 +1343,6 @@ static struct vmap_area *find_vmap_area(unsigned long addr)
#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
-static bool vmap_initialized __read_mostly = false;
-
struct vmap_block_queue {
spinlock_t lock;
struct list_head free;
@@ -1250,12 +1803,58 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align)
vm_area_add_early(vm);
}
+static void vmap_init_free_space(void)
+{
+ unsigned long vmap_start = 1;
+ const unsigned long vmap_end = ULONG_MAX;
+ struct vmap_area *busy, *free;
+
+ /*
+ * B F B B B F
+ * -|-----|.....|-----|-----|-----|.....|-
+ * | The KVA space |
+ * |<--------------------------------->|
+ */
+ list_for_each_entry(busy, &vmap_area_list, list) {
+ if (busy->va_start - vmap_start > 0) {
+ free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
+ if (!WARN_ON_ONCE(!free)) {
+ free->va_start = vmap_start;
+ free->va_end = busy->va_start;
+
+ insert_vmap_area_augment(free, NULL,
+ &free_vmap_area_root,
+ &free_vmap_area_list);
+ }
+ }
+
+ vmap_start = busy->va_end;
+ }
+
+ if (vmap_end - vmap_start > 0) {
+ free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
+ if (!WARN_ON_ONCE(!free)) {
+ free->va_start = vmap_start;
+ free->va_end = vmap_end;
+
+ insert_vmap_area_augment(free, NULL,
+ &free_vmap_area_root,
+ &free_vmap_area_list);
+ }
+ }
+}
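
vmap_init_free_space() above walks the address-sorted busy list and turns
every hole between busy blocks, plus the tail up to the end of the space, into
a free vmap_area. The standalone sketch below (illustration only, not part of
the patch, with made-up ranges) performs the same gap walk over plain
[start, end) pairs:

#include <stdio.h>

struct range { unsigned long start, end; };

int main(void)
{
	const struct range busy[] = {
		{ 0x100, 0x200 }, { 0x200, 0x300 }, { 0x500, 0x600 },
	};
	unsigned long cursor = 1, space_end = 0x1000;
	unsigned long i;

	for (i = 0; i < sizeof(busy) / sizeof(busy[0]); i++) {
		if (busy[i].start > cursor)
			printf("free: [0x%lx, 0x%lx)\n", cursor, busy[i].start);
		cursor = busy[i].end;
	}
	if (space_end > cursor)
		printf("free: [0x%lx, 0x%lx)\n", cursor, space_end);
	return 0;
}

Adjacent busy blocks (0x200 here) produce no free area, matching the
"busy->va_start - vmap_start > 0" check above.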
+
void __init vmalloc_init(void)
{
struct vmap_area *va;
struct vm_struct *tmp;
int i;
+ /*
+ * Create the cache for vmap_area objects.
+ */
+ vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
+
for_each_possible_cpu(i) {
struct vmap_block_queue *vbq;
struct vfree_deferred *p;
@@ -1270,16 +1869,21 @@ void __init vmalloc_init(void)
/* Import existing vmlist entries. */
for (tmp = vmlist; tmp; tmp = tmp->next) {
- va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
+ va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
+ if (WARN_ON_ONCE(!va))
+ continue;
+
va->flags = VM_VM_AREA;
va->va_start = (unsigned long)tmp->addr;
va->va_end = va->va_start + tmp->size;
va->vm = tmp;
- __insert_vmap_area(va);
+ insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
}
- vmap_area_pcpu_hole = VMALLOC_END;
-
+ /*
+ * Now we can initialize the free vmap space.
+ */
+ vmap_init_free_space();
vmap_initialized = true;
}
@@ -1478,6 +2082,22 @@ struct vm_struct *find_vm_area(const void *addr)
return NULL;
}
+static struct vm_struct *__remove_vm_area(struct vmap_area *va)
+{
+ struct vm_struct *vm = va->vm;
+
+ spin_lock(&vmap_area_lock);
+ va->vm = NULL;
+ va->flags &= ~VM_VM_AREA;
+ va->flags |= VM_LAZY_FREE;
+ spin_unlock(&vmap_area_lock);
+
+ kasan_free_shadow(vm);
+ free_unmap_vmap_area(va);
+
+ return vm;
+}
+
/**
* remove_vm_area - find and remove a continuous kernel virtual area
* @addr: base address
@@ -1490,26 +2110,14 @@ struct vm_struct *find_vm_area(const void *addr)
*/
struct vm_struct *remove_vm_area(const void *addr)
{
+ struct vm_struct *vm = NULL;
struct vmap_area *va;
- might_sleep();
-
va = find_vmap_area((unsigned long)addr);
- if (va && va->flags & VM_VM_AREA) {
- struct vm_struct *vm = va->vm;
-
- spin_lock(&vmap_area_lock);
- va->vm = NULL;
- va->flags &= ~VM_VM_AREA;
- va->flags |= VM_LAZY_FREE;
- spin_unlock(&vmap_area_lock);
-
- kasan_free_shadow(vm);
- free_unmap_vmap_area(va);
+ if (va && va->flags & VM_VM_AREA)
+ vm = __remove_vm_area(va);
- return vm;
- }
- return NULL;
+ return vm;
}
static inline void set_area_direct_map(const struct vm_struct *area,
@@ -1523,8 +2131,9 @@ static inline void set_area_direct_map(const struct vm_struct *area,
}
/* Handle removing and resetting vm mappings related to the vm_struct. */
-static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
+static void vm_remove_mappings(struct vmap_area *va, int deallocate_pages)
{
+ struct vm_struct *area = va->vm;
unsigned long addr = (unsigned long)area->addr;
unsigned long start = ULONG_MAX, end = 0;
int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
@@ -1541,7 +2150,7 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
set_memory_rw(addr, area->nr_pages);
}
- remove_vm_area(area->addr);
+ __remove_vm_area(va);
/* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */
if (!flush_reset)
@@ -1581,6 +2190,7 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
static void __vunmap(const void *addr, int deallocate_pages)
{
struct vm_struct *area;
+ struct vmap_area *va;
if (!addr)
return;
@@ -1589,17 +2199,18 @@ static void __vunmap(const void *addr, int deallocate_pages)
addr))
return;
- area = find_vm_area(addr);
- if (unlikely(!area)) {
+ va = find_vmap_area((unsigned long)addr);
+ if (unlikely(!va || !(va->flags & VM_VM_AREA))) {
WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
addr);
return;
}
+ area = va->vm;
debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
- vm_remove_mappings(area, deallocate_pages);
+ vm_remove_mappings(va, deallocate_pages);
if (deallocate_pages) {
int i;
@@ -1610,12 +2221,12 @@ static void __vunmap(const void *addr, int deallocate_pages)
BUG_ON(!page);
__free_pages(page, 0);
}
+ atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
kvfree(area->pages);
}
kfree(area);
- return;
}
static inline void __vfree_deferred(const void *addr)
@@ -1787,12 +2398,14 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
if (unlikely(!page)) {
/* Successfully allocated i pages, free them in __vunmap() */
area->nr_pages = i;
+ atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
goto fail;
}
area->pages[i] = page;
if (gfpflags_allow_blocking(gfp_mask|highmem_mask))
cond_resched();
}
+ atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
if (map_vm_area(area, prot, pages))
goto fail;
@@ -2471,81 +3084,64 @@ static struct vmap_area *node_to_va(struct rb_node *n)
}
/**
- * pvm_find_next_prev - find the next and prev vmap_area surrounding @end
- * @end: target address
- * @pnext: out arg for the next vmap_area
- * @pprev: out arg for the previous vmap_area
+ * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to
+ * @addr: target address
*
- * Returns: %true if either or both of next and prev are found,
- * %false if no vmap_area exists
- *
- * Find vmap_areas end addresses of which enclose @end. ie. if not
- * NULL, *pnext->va_end > @end and *pprev->va_end <= @end.
+ * Returns: the vmap_area that encloses @addr, if found. If there is
+ * no such area, the highest vmap_area below @addr is returned
+ * instead, i.e. va->va_start < addr && va->va_end < addr, or NULL
+ * if there are no areas below @addr at all.
*/
-static bool pvm_find_next_prev(unsigned long end,
- struct vmap_area **pnext,
- struct vmap_area **pprev)
+static struct vmap_area *
+pvm_find_va_enclose_addr(unsigned long addr)
{
- struct rb_node *n = vmap_area_root.rb_node;
- struct vmap_area *va = NULL;
+ struct vmap_area *va, *tmp;
+ struct rb_node *n;
+
+ n = free_vmap_area_root.rb_node;
+ va = NULL;
while (n) {
- va = rb_entry(n, struct vmap_area, rb_node);
- if (end < va->va_end)
- n = n->rb_left;
- else if (end > va->va_end)
+ tmp = rb_entry(n, struct vmap_area, rb_node);
+ if (tmp->va_start <= addr) {
+ va = tmp;
+ if (tmp->va_end >= addr)
+ break;
+
n = n->rb_right;
- else
- break;
+ } else {
+ n = n->rb_left;
+ }
}
- if (!va)
- return false;
-
- if (va->va_end > end) {
- *pnext = va;
- *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
- } else {
- *pprev = va;
- *pnext = node_to_va(rb_next(&(*pprev)->rb_node));
- }
- return true;
+ return va;
}
/**
- * pvm_determine_end - find the highest aligned address between two vmap_areas
- * @pnext: in/out arg for the next vmap_area
- * @pprev: in/out arg for the previous vmap_area
- * @align: alignment
+ * pvm_determine_end_from_reverse - find the highest aligned address
+ * of a free block below VMALLOC_END
+ * @va:
+ * in - the VA we start the search from (reverse order);
+ * out - the VA with the highest aligned end address.
*
- * Returns: determined end address
- *
- * Find the highest aligned address between *@pnext and *@pprev below
- * VMALLOC_END. *@pnext and *@pprev are adjusted so that the aligned
- * down address is between the end addresses of the two vmap_areas.
- *
- * Please note that the address returned by this function may fall
- * inside *@pnext vmap_area. The caller is responsible for checking
- * that.
+ * Returns: determined end address within vmap_area
*/
-static unsigned long pvm_determine_end(struct vmap_area **pnext,
- struct vmap_area **pprev,
- unsigned long align)
+static unsigned long
+pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align)
{
- const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
+ unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
unsigned long addr;
- if (*pnext)
- addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end);
- else
- addr = vmalloc_end;
-
- while (*pprev && (*pprev)->va_end > addr) {
- *pnext = *pprev;
- *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
+ if (likely(*va)) {
+ list_for_each_entry_from_reverse((*va),
+ &free_vmap_area_list, list) {
+ addr = min((*va)->va_end & ~(align - 1), vmalloc_end);
+ if ((*va)->va_start < addr)
+ return addr;
+ }
}
- return addr;
+ return 0;
}
/**
@@ -2565,12 +3161,12 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
* to gigabytes. To avoid interacting with regular vmallocs, these
* areas are allocated from top.
*
- * Despite its complicated look, this allocator is rather simple. It
- * does everything top-down and scans areas from the end looking for
- * matching slot. While scanning, if any of the areas overlaps with
- * existing vmap_area, the base address is pulled down to fit the
- * area. Scanning is repeated till all the areas fit and then all
- * necessary data structures are inserted and the result is returned.
+ * Despite its complicated look, this allocator is rather simple. It
+ * does everything top-down and scans free blocks from the end looking
+ * for a matching base. While scanning, if any of the areas does not
+ * fit, the base address is pulled down to fit that area. Scanning is
+ * repeated until all the areas fit; then all necessary data structures
+ * are inserted and the result is returned.
*/
struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
const size_t *sizes, int nr_vms,
@@ -2578,11 +3174,12 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
{
const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
- struct vmap_area **vas, *prev, *next;
+ struct vmap_area **vas, *va;
struct vm_struct **vms;
int area, area2, last_area, term_area;
- unsigned long base, start, end, last_end;
+ unsigned long base, start, size, end, last_end;
bool purged = false;
+ enum fit_type type;
/* verify parameters and allocate data structures */
BUG_ON(offset_in_page(align) || !is_power_of_2(align));
@@ -2618,7 +3215,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
goto err_free2;
for (area = 0; area < nr_vms; area++) {
- vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL);
+ vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL);
vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
if (!vas[area] || !vms[area])
goto err_free;
@@ -2631,49 +3228,29 @@ retry:
start = offsets[area];
end = start + sizes[area];
- if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) {
- base = vmalloc_end - last_end;
- goto found;
- }
- base = pvm_determine_end(&next, &prev, align) - end;
+ va = pvm_find_va_enclose_addr(vmalloc_end);
+ base = pvm_determine_end_from_reverse(&va, align) - end;
while (true) {
- BUG_ON(next && next->va_end <= base + end);
- BUG_ON(prev && prev->va_end > base + end);
-
/*
* base might have underflowed, add last_end before
* comparing.
*/
- if (base + last_end < vmalloc_start + last_end) {
- spin_unlock(&vmap_area_lock);
- if (!purged) {
- purge_vmap_area_lazy();
- purged = true;
- goto retry;
- }
- goto err_free;
- }
+ if (base + last_end < vmalloc_start + last_end)
+ goto overflow;
/*
- * If next overlaps, move base downwards so that it's
- * right below next and then recheck.
+ * A fitting base has not been found.
*/
- if (next && next->va_start < base + end) {
- base = pvm_determine_end(&next, &prev, align) - end;
- term_area = area;
- continue;
- }
+ if (va == NULL)
+ goto overflow;
/*
- * If prev overlaps, shift down next and prev and move
- * base so that it's right below new next and then
- * recheck.
+ * If this VA does not fit, move base downwards and recheck.
*/
- if (prev && prev->va_end > base + start) {
- next = prev;
- prev = node_to_va(rb_prev(&next->rb_node));
- base = pvm_determine_end(&next, &prev, align) - end;
+ if (base + start < va->va_start || base + end > va->va_end) {
+ va = node_to_va(rb_prev(&va->rb_node));
+ base = pvm_determine_end_from_reverse(&va, align) - end;
term_area = area;
continue;
}
@@ -2685,21 +3262,40 @@ retry:
area = (area + nr_vms - 1) % nr_vms;
if (area == term_area)
break;
+
start = offsets[area];
end = start + sizes[area];
- pvm_find_next_prev(base + end, &next, &prev);
+ va = pvm_find_va_enclose_addr(base + end);
}
-found:
+
/* we've found a fitting base, insert all va's */
for (area = 0; area < nr_vms; area++) {
- struct vmap_area *va = vas[area];
+ int ret;
- va->va_start = base + offsets[area];
- va->va_end = va->va_start + sizes[area];
- __insert_vmap_area(va);
- }
+ start = base + offsets[area];
+ size = sizes[area];
+
+ va = pvm_find_va_enclose_addr(start);
+ if (WARN_ON_ONCE(va == NULL))
+ /* It is a BUG(), but trigger recovery instead. */
+ goto recovery;
+
+ type = classify_va_fit_type(va, start, size);
+ if (WARN_ON_ONCE(type == NOTHING_FIT))
+ /* It is a BUG(), but trigger recovery instead. */
+ goto recovery;
- vmap_area_pcpu_hole = base + offsets[last_area];
+ ret = adjust_va_to_fit_type(va, start, size, type);
+ if (unlikely(ret))
+ goto recovery;
+
+ /* Allocated area. */
+ va = vas[area];
+ va->va_start = start;
+ va->va_end = start + size;
+
+ insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+ }
spin_unlock(&vmap_area_lock);
@@ -2711,9 +3307,38 @@ found:
kfree(vas);
return vms;
+recovery:
+ /* Remove previously inserted areas. */
+ while (area--) {
+ __free_vmap_area(vas[area]);
+ vas[area] = NULL;
+ }
+
+overflow:
+ spin_unlock(&vmap_area_lock);
+ if (!purged) {
+ purge_vmap_area_lazy();
+ purged = true;
+
+ /* Before "retry", check if we recover. */
+ for (area = 0; area < nr_vms; area++) {
+ if (vas[area])
+ continue;
+
+ vas[area] = kmem_cache_zalloc(
+ vmap_area_cachep, GFP_KERNEL);
+ if (!vas[area])
+ goto err_free;
+ }
+
+ goto retry;
+ }
+
err_free:
for (area = 0; area < nr_vms; area++) {
- kfree(vas[area]);
+ if (vas[area])
+ kmem_cache_free(vmap_area_cachep, vas[area]);
+
kfree(vms[area]);
}
err_free2:
diff --git a/mm/vmscan.c b/mm/vmscan.c
index fd9de504e516..df9941179fc5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -346,7 +346,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone
int zid;
if (!mem_cgroup_disabled())
- lru_size = mem_cgroup_get_lru_size(lruvec, lru);
+ lru_size = lruvec_page_state(lruvec, NR_LRU_BASE + lru);
else
lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
@@ -1107,6 +1107,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
unsigned nr_reclaimed = 0;
+ unsigned pgactivate = 0;
memset(stat, 0, sizeof(*stat));
cond_resched();
@@ -1466,8 +1467,10 @@ activate_locked:
try_to_free_swap(page);
VM_BUG_ON_PAGE(PageActive(page), page);
if (!PageMlocked(page)) {
+ int type = page_is_file_cache(page);
SetPageActive(page);
- stat->nr_activate++;
+ pgactivate++;
+ stat->nr_activate[type] += hpage_nr_pages(page);
count_memcg_page_event(page, PGACTIVATE);
}
keep_locked:
@@ -1482,7 +1485,7 @@ keep:
free_unref_page_list(&free_pages);
list_splice(&ret_pages, page_list);
- count_vm_events(PGACTIVATE, stat->nr_activate);
+ count_vm_events(PGACTIVATE, pgactivate);
return nr_reclaimed;
}
@@ -1804,40 +1807,54 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
return isolated > inactive;
}
-static noinline_for_stack void
-putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
+/*
+ * This moves pages from @list to the corresponding LRU list.
+ *
+ * We move them the other way if the page is referenced by one or more
+ * processes, from rmap.
+ *
+ * If the pages are mostly unmapped, the processing is fast and it is
+ * appropriate to hold pgdat->lru_lock across the whole operation. But if
+ * the pages are mapped, the processing is slow (page_referenced()) so we
+ * should drop pgdat->lru_lock around each page. It's impossible to balance
+ * this, so instead we remove the pages from the LRU while processing them.
+ * It is safe to rely on PG_active against the non-LRU pages in here because
+ * nobody will play with that bit on a non-LRU page.
+ *
+ * The downside is that we have to touch page->_refcount against each page.
+ * But we had to alter page->flags anyway.
+ *
+ * Returns the number of pages moved to the given lruvec.
+ */
+
+static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
+ struct list_head *list)
{
- struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+ int nr_pages, nr_moved = 0;
LIST_HEAD(pages_to_free);
+ struct page *page;
+ enum lru_list lru;
- /*
- * Put back any unfreeable pages.
- */
- while (!list_empty(page_list)) {
- struct page *page = lru_to_page(page_list);
- int lru;
-
+ while (!list_empty(list)) {
+ page = lru_to_page(list);
VM_BUG_ON_PAGE(PageLRU(page), page);
- list_del(&page->lru);
if (unlikely(!page_evictable(page))) {
+ list_del(&page->lru);
spin_unlock_irq(&pgdat->lru_lock);
putback_lru_page(page);
spin_lock_irq(&pgdat->lru_lock);
continue;
}
-
lruvec = mem_cgroup_page_lruvec(page, pgdat);
SetPageLRU(page);
lru = page_lru(page);
- add_page_to_lru_list(page, lruvec, lru);
- if (is_active_lru(lru)) {
- int file = is_file_lru(lru);
- int numpages = hpage_nr_pages(page);
- reclaim_stat->recent_rotated[file] += numpages;
- }
+ nr_pages = hpage_nr_pages(page);
+ update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
+ list_move(&page->lru, &lruvec->lists[lru]);
+
if (put_page_testzero(page)) {
__ClearPageLRU(page);
__ClearPageActive(page);
@@ -1850,13 +1867,17 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
spin_lock_irq(&pgdat->lru_lock);
} else
list_add(&page->lru, &pages_to_free);
+ } else {
+ nr_moved += nr_pages;
}
}
/*
* To save our caller's stack, now use input list for pages to free.
*/
- list_splice(&pages_to_free, page_list);
+ list_splice(&pages_to_free, list);
+
+ return nr_moved;
}
/*
@@ -1886,6 +1907,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
unsigned long nr_taken;
struct reclaim_stat stat;
int file = is_file_lru(lru);
+ enum vm_event_item item;
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
bool stalled = false;
@@ -1913,17 +1935,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
reclaim_stat->recent_scanned[file] += nr_taken;
- if (current_is_kswapd()) {
- if (global_reclaim(sc))
- __count_vm_events(PGSCAN_KSWAPD, nr_scanned);
- count_memcg_events(lruvec_memcg(lruvec), PGSCAN_KSWAPD,
- nr_scanned);
- } else {
- if (global_reclaim(sc))
- __count_vm_events(PGSCAN_DIRECT, nr_scanned);
- count_memcg_events(lruvec_memcg(lruvec), PGSCAN_DIRECT,
- nr_scanned);
- }
+ item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
+ if (global_reclaim(sc))
+ __count_vm_events(item, nr_scanned);
+ __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
spin_unlock_irq(&pgdat->lru_lock);
if (nr_taken == 0)
@@ -1934,19 +1949,14 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
spin_lock_irq(&pgdat->lru_lock);
- if (current_is_kswapd()) {
- if (global_reclaim(sc))
- __count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed);
- count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_KSWAPD,
- nr_reclaimed);
- } else {
- if (global_reclaim(sc))
- __count_vm_events(PGSTEAL_DIRECT, nr_reclaimed);
- count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_DIRECT,
- nr_reclaimed);
- }
+ item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
+ if (global_reclaim(sc))
+ __count_vm_events(item, nr_reclaimed);
+ __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
+ reclaim_stat->recent_rotated[0] = stat.nr_activate[0];
+ reclaim_stat->recent_rotated[1] = stat.nr_activate[1];
- putback_inactive_pages(lruvec, &page_list);
+ move_pages_to_lru(lruvec, &page_list);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
@@ -1983,73 +1993,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
return nr_reclaimed;
}
-/*
- * This moves pages from the active list to the inactive list.
- *
- * We move them the other way if the page is referenced by one or more
- * processes, from rmap.
- *
- * If the pages are mostly unmapped, the processing is fast and it is
- * appropriate to hold pgdat->lru_lock across the whole operation. But if
- * the pages are mapped, the processing is slow (page_referenced()) so we
- * should drop pgdat->lru_lock around each page. It's impossible to balance
- * this, so instead we remove the pages from the LRU while processing them.
- * It is safe to rely on PG_active against the non-LRU pages in here because
- * nobody will play with that bit on a non-LRU page.
- *
- * The downside is that we have to touch page->_refcount against each page.
- * But we had to alter page->flags anyway.
- *
- * Returns the number of pages moved to the given lru.
- */
-
-static unsigned move_active_pages_to_lru(struct lruvec *lruvec,
- struct list_head *list,
- struct list_head *pages_to_free,
- enum lru_list lru)
-{
- struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- struct page *page;
- int nr_pages;
- int nr_moved = 0;
-
- while (!list_empty(list)) {
- page = lru_to_page(list);
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
-
- VM_BUG_ON_PAGE(PageLRU(page), page);
- SetPageLRU(page);
-
- nr_pages = hpage_nr_pages(page);
- update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
- list_move(&page->lru, &lruvec->lists[lru]);
-
- if (put_page_testzero(page)) {
- __ClearPageLRU(page);
- __ClearPageActive(page);
- del_page_from_lru_list(page, lruvec, lru);
-
- if (unlikely(PageCompound(page))) {
- spin_unlock_irq(&pgdat->lru_lock);
- mem_cgroup_uncharge(page);
- (*get_compound_page_dtor(page))(page);
- spin_lock_irq(&pgdat->lru_lock);
- } else
- list_add(&page->lru, pages_to_free);
- } else {
- nr_moved += nr_pages;
- }
- }
-
- if (!is_active_lru(lru)) {
- __count_vm_events(PGDEACTIVATE, nr_moved);
- count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
- nr_moved);
- }
-
- return nr_moved;
-}
-
static void shrink_active_list(unsigned long nr_to_scan,
struct lruvec *lruvec,
struct scan_control *sc,
@@ -2079,7 +2022,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
reclaim_stat->recent_scanned[file] += nr_taken;
__count_vm_events(PGREFILL, nr_scanned);
- count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
+ __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
spin_unlock_irq(&pgdat->lru_lock);
@@ -2136,13 +2079,19 @@ static void shrink_active_list(unsigned long nr_to_scan,
*/
reclaim_stat->recent_rotated[file] += nr_rotated;
- nr_activate = move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
- nr_deactivate = move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
+ nr_activate = move_pages_to_lru(lruvec, &l_active);
+ nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
+ /* Keep all free pages in l_active list */
+ list_splice(&l_inactive, &l_active);
+
+ __count_vm_events(PGDEACTIVATE, nr_deactivate);
+ __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
+
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&pgdat->lru_lock);
- mem_cgroup_uncharge_list(&l_hold);
- free_unref_page_list(&l_hold);
+ mem_cgroup_uncharge_list(&l_active);
+ free_unref_page_list(&l_active);
trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
nr_deactivate, nr_rotated, sc->priority, file);
}
@@ -2250,8 +2199,7 @@ enum scan_balance {
* nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
*/
static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
- struct scan_control *sc, unsigned long *nr,
- unsigned long *lru_pages)
+ struct scan_control *sc, unsigned long *nr)
{
int swappiness = mem_cgroup_swappiness(memcg);
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
@@ -2402,20 +2350,72 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
fraction[1] = fp;
denominator = ap + fp + 1;
out:
- *lru_pages = 0;
for_each_evictable_lru(lru) {
int file = is_file_lru(lru);
- unsigned long size;
+ unsigned long lruvec_size;
unsigned long scan;
+ unsigned long protection;
+
+ lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
+ protection = mem_cgroup_protection(memcg,
+ sc->memcg_low_reclaim);
+
+ if (protection) {
+ /*
+ * Scale a cgroup's reclaim pressure by proportioning
+ * its current usage to its memory.low or memory.min
+ * setting.
+ *
+ * This is important, as otherwise scanning aggression
+ * becomes extremely binary -- from nothing as we
+ * approach the memory protection threshold, to totally
+ * nominal as we exceed it. This results in requiring
+ * setting extremely liberal protection thresholds. It
+ * also means we simply get no protection at all if we
+ * set it too low, which is not ideal.
+ *
+ * If there is any protection in place, we reduce scan
+ * pressure by how much of the total memory used is
+ * within protection thresholds.
+ *
+ * There is one special case: in the first reclaim pass,
+ * we skip over all groups that are within their low
+ * protection. If that fails to reclaim enough pages to
+ * satisfy the reclaim goal, we come back and override
+ * the best-effort low protection. However, we still
+ * ideally want to honor how well-behaved groups are in
+ * that case instead of simply punishing them all
+ * equally. As such, we reclaim them based on how much
+ * memory they are using, reducing the scan pressure
+ * again by how much of the total memory used is under
+ * hard protection.
+ */
+ unsigned long cgroup_size = mem_cgroup_size(memcg);
+
+ /* Avoid TOCTOU with earlier protection check */
+ cgroup_size = max(cgroup_size, protection);
+
+ scan = lruvec_size - lruvec_size * protection /
+ cgroup_size;
+
+ /*
+ * Minimally target SWAP_CLUSTER_MAX pages to keep
+ * reclaim moving forwards, avoiding decrementing
+ * sc->priority further than desirable.
+ */
+ scan = max(scan, SWAP_CLUSTER_MAX);
+ } else {
+ scan = lruvec_size;
+ }
+
+ scan >>= sc->priority;
- size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
- scan = size >> sc->priority;
/*
* If the cgroup's already been deleted, make sure to
* scrape out the remaining cache.
*/
if (!scan && !mem_cgroup_online(memcg))
- scan = min(size, SWAP_CLUSTER_MAX);
+ scan = min(lruvec_size, SWAP_CLUSTER_MAX);
switch (scan_balance) {
case SCAN_EQUAL:
@@ -2435,7 +2435,7 @@ out:
case SCAN_ANON:
/* Scan one type exclusively */
if ((scan_balance == SCAN_FILE) != file) {
- size = 0;
+ lruvec_size = 0;
scan = 0;
}
break;
@@ -2444,7 +2444,6 @@ out:
BUG();
}
- *lru_pages += size;
nr[lru] = scan;
}
}
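
The proportional protection scaling added above reduces the scan target by the
protected fraction of the cgroup's usage, scan = lruvec_size - lruvec_size *
protection / cgroup_size. A purely arithmetic sketch with made-up numbers (not
part of the patch):

#include <stdio.h>

int main(void)
{
	unsigned long lruvec_size = 200000;	/* pages on this LRU list        */
	unsigned long protection  = 600 << 8;	/* memory.low: 600 MB in 4K pages */
	unsigned long cgroup_size = 800 << 8;	/* current usage: 800 MB in pages */
	unsigned long scan;

	scan = lruvec_size - lruvec_size * protection / cgroup_size;
	printf("scan target: %lu pages\n", scan);	/* 50000, 1/4 of the list */
	return 0;
}

In the kernel the result is additionally clamped to at least SWAP_CLUSTER_MAX
and then shifted right by sc->priority, as the hunk above shows.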
@@ -2453,7 +2452,7 @@ out:
* This is a basic per-node page freer. Used by both kswapd and direct reclaim.
*/
static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg,
- struct scan_control *sc, unsigned long *lru_pages)
+ struct scan_control *sc)
{
struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
unsigned long nr[NR_LRU_LISTS];
@@ -2465,7 +2464,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
struct blk_plug plug;
bool scan_adjusted;
- get_scan_count(lruvec, memcg, sc, nr, lru_pages);
+ get_scan_count(lruvec, memcg, sc, nr);
/* Record the original scan target for proportional adjustments later */
memcpy(targets, nr, sizeof(nr));
@@ -2670,7 +2669,6 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
.pgdat = pgdat,
.priority = sc->priority,
};
- unsigned long node_lru_pages = 0;
struct mem_cgroup *memcg;
memset(&sc->nr, 0, sizeof(sc->nr));
@@ -2680,7 +2678,6 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
memcg = mem_cgroup_iter(root, NULL, &reclaim);
do {
- unsigned long lru_pages;
unsigned long reclaimed;
unsigned long scanned;
@@ -2705,13 +2702,19 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
memcg_memory_event(memcg, MEMCG_LOW);
break;
case MEMCG_PROT_NONE:
+ /*
+ * All protection thresholds breached. We may
+ * still choose to vary the scan pressure
+ * applied based on by how much the cgroup in
+ * question has exceeded its protection
+ * thresholds (see get_scan_count).
+ */
break;
}
reclaimed = sc->nr_reclaimed;
scanned = sc->nr_scanned;
- shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
- node_lru_pages += lru_pages;
+ shrink_node_memcg(pgdat, memcg, sc);
if (sc->may_shrinkslab) {
shrink_slab(sc->gfp_mask, pgdat->node_id,
@@ -3212,10 +3215,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
return 1;
- trace_mm_vmscan_direct_reclaim_begin(order,
- sc.may_writepage,
- sc.gfp_mask,
- sc.reclaim_idx);
+ trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
@@ -3240,15 +3240,12 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
.may_swap = !noswap,
.may_shrinkslab = 1,
};
- unsigned long lru_pages;
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
- sc.may_writepage,
- sc.gfp_mask,
- sc.reclaim_idx);
+ sc.gfp_mask);
/*
* NOTE: Although we can get the priority field, using it
@@ -3257,7 +3254,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
* will pick up pages from other mem cgroup's as well. We hack
* the priority and make it zero.
*/
- shrink_node_memcg(pgdat, memcg, &sc, &lru_pages);
+ shrink_node_memcg(pgdat, memcg, &sc);
trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
@@ -3297,10 +3294,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
- trace_mm_vmscan_memcg_reclaim_begin(0,
- sc.may_writepage,
- sc.gfp_mask,
- sc.reclaim_idx);
+ trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
psi_memstall_enter(&pflags);
noreclaim_flag = memalloc_noreclaim_save();
@@ -4149,6 +4143,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
.reclaim_idx = gfp_zone(gfp_mask),
};
+ trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
+ sc.gfp_mask);
+
cond_resched();
fs_reclaim_acquire(sc.gfp_mask);
/*
@@ -4175,6 +4172,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
current->flags &= ~PF_SWAPWRITE;
memalloc_noreclaim_restore(noreclaim_flag);
fs_reclaim_release(sc.gfp_mask);
+
+ trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);
+
return sc.nr_reclaimed >= nr_pages;
}
diff --git a/mm/workingset.c b/mm/workingset.c
index 0bedf67502d5..6419baebd306 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -426,10 +426,11 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
#ifdef CONFIG_MEMCG
if (sc->memcg) {
struct lruvec *lruvec;
+ int i;
- pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
- LRU_ALL);
lruvec = mem_cgroup_lruvec(NODE_DATA(sc->nid), sc->memcg);
+ for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
+ pages += lruvec_page_state(lruvec, NR_LRU_BASE + i);
pages += lruvec_page_state(lruvec, NR_SLAB_RECLAIMABLE);
pages += lruvec_page_state(lruvec, NR_SLAB_UNRECLAIMABLE);
} else
diff --git a/mm/z3fold.c b/mm/z3fold.c
index aee9b0b8d907..1ffecd6333e5 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -24,16 +24,47 @@
#include <linux/atomic.h>
#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <linux/dcache.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/module.h>
+#include <linux/page-flags.h>
+#include <linux/migrate.h>
+#include <linux/node.h>
+#include <linux/compaction.h>
#include <linux/percpu.h>
+#include <linux/mount.h>
+#include <linux/fs.h>
#include <linux/preempt.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/zpool.h>
+/*
+ * NCHUNKS_ORDER determines the internal allocation granularity, effectively
+ * adjusting internal fragmentation. It also determines the number of
+ * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
+ * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks
+ * in the beginning of an allocated page are occupied by z3fold header, so
+ * NCHUNKS will be calculated to 63 (or 62 in case CONFIG_DEBUG_SPINLOCK=y),
+ * which shows the max number of free chunks in z3fold page, also there will
+ * be 63, or 62, respectively, freelists per pool.
+ */
+#define NCHUNKS_ORDER 6
+
+#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER)
+#define CHUNK_SIZE (1 << CHUNK_SHIFT)
+#define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE)
+#define ZHDR_CHUNKS (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT)
+#define TOTAL_CHUNKS (PAGE_SIZE >> CHUNK_SHIFT)
+#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
+
+#define BUDDY_MASK (0x3)
+#define BUDDY_SHIFT 2
+#define SLOTS_ALIGN (0x40)
+
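
A quick standalone check of the arithmetic described in the NCHUNKS_ORDER
comment above, for a 4 KiB page (illustration only, not part of the patch; the
48-byte header size is an assumption standing in for
sizeof(struct z3fold_header), which only needs to fit within one chunk):

#include <stdio.h>

#define PAGE_SHIFT_	12
#define PAGE_SIZE_	(1UL << PAGE_SHIFT_)
#define NCHUNKS_ORDER_	6
#define CHUNK_SHIFT_	(PAGE_SHIFT_ - NCHUNKS_ORDER_)
#define CHUNK_SIZE_	(1UL << CHUNK_SHIFT_)

int main(void)
{
	unsigned long hdr = 48;	/* assumed header size in bytes */
	unsigned long zhdr_aligned = (hdr + CHUNK_SIZE_ - 1) & ~(CHUNK_SIZE_ - 1);
	unsigned long nchunks = (PAGE_SIZE_ - zhdr_aligned) >> CHUNK_SHIFT_;

	printf("chunk size %lu, header occupies %lu chunk(s), NCHUNKS %lu\n",
	       CHUNK_SIZE_, zhdr_aligned >> CHUNK_SHIFT_, nchunks);
	return 0;
}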
/*****************
* Structures
*****************/
@@ -47,8 +78,18 @@ enum buddy {
FIRST,
MIDDLE,
LAST,
- BUDDIES_MAX
+ BUDDIES_MAX = LAST
+};
+
+struct z3fold_buddy_slots {
+ /*
+ * We are using BUDDY_MASK in handle_to_buddy() etc., so there should
+ * be enough slots to hold all possible variants.
+ */
+ unsigned long slot[BUDDY_MASK + 1];
+ unsigned long pool; /* back link + flags */
};
+#define HANDLE_FLAG_MASK (0x03)
/*
* struct z3fold_header - z3fold page metadata occupying first chunks of each
@@ -58,49 +99,29 @@ enum buddy {
* @page_lock: per-page lock
* @refcount: reference count for the z3fold page
* @work: work_struct for page layout optimization
- * @pool: pointer to the pool which this page belongs to
+ * @slots: pointer to the structure holding buddy slots
* @cpu: CPU which this page "belongs" to
* @first_chunks: the size of the first buddy in chunks, 0 if free
* @middle_chunks: the size of the middle buddy in chunks, 0 if free
* @last_chunks: the size of the last buddy in chunks, 0 if free
* @first_num: the starting number (for the first handle)
+ * @mapped_count: the number of objects currently mapped
*/
struct z3fold_header {
struct list_head buddy;
spinlock_t page_lock;
struct kref refcount;
struct work_struct work;
- struct z3fold_pool *pool;
+ struct z3fold_buddy_slots *slots;
short cpu;
unsigned short first_chunks;
unsigned short middle_chunks;
unsigned short last_chunks;
unsigned short start_middle;
unsigned short first_num:2;
+ unsigned short mapped_count:2;
};
-/*
- * NCHUNKS_ORDER determines the internal allocation granularity, effectively
- * adjusting internal fragmentation. It also determines the number of
- * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
- * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks
- * in the beginning of an allocated page are occupied by z3fold header, so
- * NCHUNKS will be calculated to 63 (or 62 in case CONFIG_DEBUG_SPINLOCK=y),
- * which shows the max number of free chunks in z3fold page, also there will
- * be 63, or 62, respectively, freelists per pool.
- */
-#define NCHUNKS_ORDER 6
-
-#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER)
-#define CHUNK_SIZE (1 << CHUNK_SHIFT)
-#define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE)
-#define ZHDR_CHUNKS (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT)
-#define TOTAL_CHUNKS (PAGE_SIZE >> CHUNK_SHIFT)
-#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
-
-#define BUDDY_MASK (0x3)
-#define BUDDY_SHIFT 2
-
/**
* struct z3fold_pool - stores metadata for each z3fold pool
* @name: pool name
@@ -113,11 +134,13 @@ struct z3fold_header {
* added buddy.
* @stale: list of pages marked for freeing
* @pages_nr: number of z3fold pages in the pool.
+ * @c_handle: cache for z3fold_buddy_slots allocation
* @ops: pointer to a structure of user defined operations specified at
* pool creation time.
* @compact_wq: workqueue for page layout background optimization
* @release_wq: workqueue for safe page release
* @work: work_struct for safe page release
+ * @inode: inode for z3fold pseudo filesystem
*
* This structure is allocated at pool creation time and maintains metadata
* pertaining to a particular z3fold pool.
@@ -130,12 +153,14 @@ struct z3fold_pool {
struct list_head lru;
struct list_head stale;
atomic64_t pages_nr;
+ struct kmem_cache *c_handle;
const struct z3fold_ops *ops;
struct zpool *zpool;
const struct zpool_ops *zpool_ops;
struct workqueue_struct *compact_wq;
struct workqueue_struct *release_wq;
struct work_struct work;
+ struct inode *inode;
};
/*
@@ -164,11 +189,118 @@ static int size_to_chunks(size_t size)
static void compact_page_work(struct work_struct *w);
+static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool)
+{
+ struct z3fold_buddy_slots *slots = kmem_cache_alloc(pool->c_handle,
+ GFP_KERNEL);
+
+ if (slots) {
+ memset(slots->slot, 0, sizeof(slots->slot));
+ slots->pool = (unsigned long)pool;
+ }
+
+ return slots;
+}
+
+static inline struct z3fold_pool *slots_to_pool(struct z3fold_buddy_slots *s)
+{
+ return (struct z3fold_pool *)(s->pool & ~HANDLE_FLAG_MASK);
+}
+
+static inline struct z3fold_buddy_slots *handle_to_slots(unsigned long handle)
+{
+ return (struct z3fold_buddy_slots *)(handle & ~(SLOTS_ALIGN - 1));
+}
+
+static inline void free_handle(unsigned long handle)
+{
+ struct z3fold_buddy_slots *slots;
+ int i;
+ bool is_free;
+
+ if (handle & (1 << PAGE_HEADLESS))
+ return;
+
+ WARN_ON(*(unsigned long *)handle == 0);
+ *(unsigned long *)handle = 0;
+ slots = handle_to_slots(handle);
+ is_free = true;
+ for (i = 0; i <= BUDDY_MASK; i++) {
+ if (slots->slot[i]) {
+ is_free = false;
+ break;
+ }
+ }
+
+ if (is_free) {
+ struct z3fold_pool *pool = slots_to_pool(slots);
+
+ kmem_cache_free(pool->c_handle, slots);
+ }
+}
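
With this scheme a handle is the address of one slot inside a SLOTS_ALIGN
(0x40) aligned z3fold_buddy_slots object, so handle_to_slots() recovers the
owning structure simply by masking off the low bits. A standalone userspace
sketch of that round trip (illustration only, not part of the patch;
aligned_alloc() stands in for the aligned kmem cache assumed here):

#include <stdio.h>
#include <stdlib.h>

#define SLOTS_ALIGN 0x40

struct buddy_slots {
	unsigned long slot[4];
	unsigned long pool;
};

int main(void)
{
	/* One 64-byte aligned block, large enough for the 40-byte struct. */
	struct buddy_slots *s = aligned_alloc(SLOTS_ALIGN, SLOTS_ALIGN);
	unsigned long handle;
	struct buddy_slots *back;

	if (!s)
		return 1;

	handle = (unsigned long)&s->slot[2];
	back = (struct buddy_slots *)(handle & ~(SLOTS_ALIGN - 1));

	printf("%s\n", back == s ? "recovered owning slots" : "mismatch");
	free(s);
	return 0;
}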
+
+static struct dentry *z3fold_do_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
+{
+ static const struct dentry_operations ops = {
+ .d_dname = simple_dname,
+ };
+
+ return mount_pseudo(fs_type, "z3fold:", NULL, &ops, 0x33);
+}
+
+static struct file_system_type z3fold_fs = {
+ .name = "z3fold",
+ .mount = z3fold_do_mount,
+ .kill_sb = kill_anon_super,
+};
+
+static struct vfsmount *z3fold_mnt;
+static int z3fold_mount(void)
+{
+ int ret = 0;
+
+ z3fold_mnt = kern_mount(&z3fold_fs);
+ if (IS_ERR(z3fold_mnt))
+ ret = PTR_ERR(z3fold_mnt);
+
+ return ret;
+}
+
+static void z3fold_unmount(void)
+{
+ kern_unmount(z3fold_mnt);
+}
+
+static const struct address_space_operations z3fold_aops;
+static int z3fold_register_migration(struct z3fold_pool *pool)
+{
+ pool->inode = alloc_anon_inode(z3fold_mnt->mnt_sb);
+ if (IS_ERR(pool->inode)) {
+ pool->inode = NULL;
+ return 1;
+ }
+
+ pool->inode->i_mapping->private_data = pool;
+ pool->inode->i_mapping->a_ops = &z3fold_aops;
+ return 0;
+}
+
+static void z3fold_unregister_migration(struct z3fold_pool *pool)
+{
+ if (pool->inode)
+ iput(pool->inode);
+}
+
/* Initializes the z3fold header of a newly allocated z3fold page */
static struct z3fold_header *init_z3fold_page(struct page *page,
struct z3fold_pool *pool)
{
struct z3fold_header *zhdr = page_address(page);
+ struct z3fold_buddy_slots *slots = alloc_slots(pool);
+
+ if (!slots)
+ return NULL;
INIT_LIST_HEAD(&page->lru);
clear_bit(PAGE_HEADLESS, &page->private);
@@ -185,15 +317,21 @@ static struct z3fold_header *init_z3fold_page(struct page *page,
zhdr->first_num = 0;
zhdr->start_middle = 0;
zhdr->cpu = -1;
- zhdr->pool = pool;
+ zhdr->slots = slots;
INIT_LIST_HEAD(&zhdr->buddy);
INIT_WORK(&zhdr->work, compact_page_work);
return zhdr;
}
/* Resets the struct page fields and frees the page */
-static void free_z3fold_page(struct page *page)
+static void free_z3fold_page(struct page *page, bool headless)
{
+ if (!headless) {
+ lock_page(page);
+ __ClearPageMovable(page);
+ unlock_page(page);
+ }
+ ClearPagePrivate(page);
__free_page(page);
}
@@ -215,33 +353,57 @@ static inline void z3fold_page_unlock(struct z3fold_header *zhdr)
spin_unlock(&zhdr->page_lock);
}
+/* Helper function to build the index */
+static inline int __idx(struct z3fold_header *zhdr, enum buddy bud)
+{
+ return (bud + zhdr->first_num) & BUDDY_MASK;
+}
+
/*
* Encodes the handle of a particular buddy within a z3fold page
* Pool lock should be held as this function accesses first_num
*/
static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
{
- unsigned long handle;
+ struct z3fold_buddy_slots *slots;
+ unsigned long h = (unsigned long)zhdr;
+ int idx = 0;
- handle = (unsigned long)zhdr;
- if (bud != HEADLESS) {
- handle |= (bud + zhdr->first_num) & BUDDY_MASK;
- if (bud == LAST)
- handle |= (zhdr->last_chunks << BUDDY_SHIFT);
- }
- return handle;
+ /*
+ * For a headless page, its handle is its pointer with the extra
+ * PAGE_HEADLESS bit set
+ */
+ if (bud == HEADLESS)
+ return h | (1 << PAGE_HEADLESS);
+
+ /* otherwise, return pointer to encoded handle */
+ idx = __idx(zhdr, bud);
+ h += idx;
+ if (bud == LAST)
+ h |= (zhdr->last_chunks << BUDDY_SHIFT);
+
+ slots = zhdr->slots;
+ slots->slot[idx] = h;
+ return (unsigned long)&slots->slot[idx];
}
/* Returns the z3fold page where a given handle is stored */
-static struct z3fold_header *handle_to_z3fold_header(unsigned long handle)
+static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h)
{
- return (struct z3fold_header *)(handle & PAGE_MASK);
+ unsigned long addr = h;
+
+ if (!(addr & (1 << PAGE_HEADLESS)))
+ addr = *(unsigned long *)h;
+
+ return (struct z3fold_header *)(addr & PAGE_MASK);
}
/* only for LAST bud, returns zero otherwise */
static unsigned short handle_to_chunks(unsigned long handle)
{
- return (handle & ~PAGE_MASK) >> BUDDY_SHIFT;
+ unsigned long addr = *(unsigned long *)handle;
+
+ return (addr & ~PAGE_MASK) >> BUDDY_SHIFT;
}
/*
@@ -251,21 +413,31 @@ static unsigned short handle_to_chunks(unsigned long handle)
*/
static enum buddy handle_to_buddy(unsigned long handle)
{
- struct z3fold_header *zhdr = handle_to_z3fold_header(handle);
- return (handle - zhdr->first_num) & BUDDY_MASK;
+ struct z3fold_header *zhdr;
+ unsigned long addr;
+
+ WARN_ON(handle & (1 << PAGE_HEADLESS));
+ addr = *(unsigned long *)handle;
+ zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
+ return (addr - zhdr->first_num) & BUDDY_MASK;
+}
+
+static inline struct z3fold_pool *zhdr_to_pool(struct z3fold_header *zhdr)
+{
+ return slots_to_pool(zhdr->slots);
}
static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
{
struct page *page = virt_to_page(zhdr);
- struct z3fold_pool *pool = zhdr->pool;
+ struct z3fold_pool *pool = zhdr_to_pool(zhdr);
WARN_ON(!list_empty(&zhdr->buddy));
set_bit(PAGE_STALE, &page->private);
clear_bit(NEEDS_COMPACTING, &page->private);
spin_lock(&pool->lock);
if (!list_empty(&page->lru))
- list_del(&page->lru);
+ list_del_init(&page->lru);
spin_unlock(&pool->lock);
if (locked)
z3fold_page_unlock(zhdr);
@@ -295,9 +467,10 @@ static void release_z3fold_page_locked_list(struct kref *ref)
{
struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
refcount);
- spin_lock(&zhdr->pool->lock);
+ struct z3fold_pool *pool = zhdr_to_pool(zhdr);
+ spin_lock(&pool->lock);
list_del_init(&zhdr->buddy);
- spin_unlock(&zhdr->pool->lock);
+ spin_unlock(&pool->lock);
WARN_ON(z3fold_page_trylock(zhdr));
__release_z3fold_page(zhdr, true);
@@ -318,7 +491,7 @@ static void free_pages_work(struct work_struct *w)
continue;
spin_unlock(&pool->stale_lock);
cancel_work_sync(&zhdr->work);
- free_z3fold_page(page);
+ free_z3fold_page(page, false);
cond_resched();
spin_lock(&pool->stale_lock);
}
@@ -349,6 +522,23 @@ static int num_free_chunks(struct z3fold_header *zhdr)
return nfree;
}
+/* Add to the appropriate unbuddied list */
+static inline void add_to_unbuddied(struct z3fold_pool *pool,
+ struct z3fold_header *zhdr)
+{
+ if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
+ zhdr->middle_chunks == 0) {
+ struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied);
+
+ int freechunks = num_free_chunks(zhdr);
+ spin_lock(&pool->lock);
+ list_add(&zhdr->buddy, &unbuddied[freechunks]);
+ spin_unlock(&pool->lock);
+ zhdr->cpu = smp_processor_id();
+ put_cpu_ptr(pool->unbuddied);
+ }
+}
+
static inline void *mchunk_memmove(struct z3fold_header *zhdr,
unsigned short dst_chunk)
{
@@ -367,6 +557,9 @@ static int z3fold_compact_page(struct z3fold_header *zhdr)
if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private))
return 0; /* can't move middle chunk, it's used */
+ if (unlikely(PageIsolated(page)))
+ return 0;
+
if (zhdr->middle_chunks == 0)
return 0; /* nothing to compact */
@@ -406,10 +599,8 @@ static int z3fold_compact_page(struct z3fold_header *zhdr)
static void do_compact_page(struct z3fold_header *zhdr, bool locked)
{
- struct z3fold_pool *pool = zhdr->pool;
+ struct z3fold_pool *pool = zhdr_to_pool(zhdr);
struct page *page;
- struct list_head *unbuddied;
- int fchunks;
page = virt_to_page(zhdr);
if (locked)
@@ -429,19 +620,14 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked)
return;
}
- z3fold_compact_page(zhdr);
- unbuddied = get_cpu_ptr(pool->unbuddied);
- fchunks = num_free_chunks(zhdr);
- if (fchunks < NCHUNKS &&
- (!zhdr->first_chunks || !zhdr->middle_chunks ||
- !zhdr->last_chunks)) {
- /* the page's not completely free and it's unbuddied */
- spin_lock(&pool->lock);
- list_add(&zhdr->buddy, &unbuddied[fchunks]);
- spin_unlock(&pool->lock);
- zhdr->cpu = smp_processor_id();
+ if (unlikely(PageIsolated(page) ||
+ test_bit(PAGE_STALE, &page->private))) {
+ z3fold_page_unlock(zhdr);
+ return;
}
- put_cpu_ptr(pool->unbuddied);
+
+ z3fold_compact_page(zhdr);
+ add_to_unbuddied(pool, zhdr);
z3fold_page_unlock(zhdr);
}
@@ -453,6 +639,103 @@ static void compact_page_work(struct work_struct *w)
do_compact_page(zhdr, false);
}
+/* returns _locked_ z3fold page header or NULL */
+static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
+ size_t size, bool can_sleep)
+{
+ struct z3fold_header *zhdr = NULL;
+ struct page *page;
+ struct list_head *unbuddied;
+ int chunks = size_to_chunks(size), i;
+
+lookup:
+ /* First, try to find an unbuddied z3fold page. */
+ unbuddied = get_cpu_ptr(pool->unbuddied);
+ for_each_unbuddied_list(i, chunks) {
+ struct list_head *l = &unbuddied[i];
+
+ zhdr = list_first_entry_or_null(READ_ONCE(l),
+ struct z3fold_header, buddy);
+
+ if (!zhdr)
+ continue;
+
+ /* Re-check under lock. */
+ spin_lock(&pool->lock);
+ l = &unbuddied[i];
+ if (unlikely(zhdr != list_first_entry(READ_ONCE(l),
+ struct z3fold_header, buddy)) ||
+ !z3fold_page_trylock(zhdr)) {
+ spin_unlock(&pool->lock);
+ zhdr = NULL;
+ put_cpu_ptr(pool->unbuddied);
+ if (can_sleep)
+ cond_resched();
+ goto lookup;
+ }
+ list_del_init(&zhdr->buddy);
+ zhdr->cpu = -1;
+ spin_unlock(&pool->lock);
+
+ page = virt_to_page(zhdr);
+ if (test_bit(NEEDS_COMPACTING, &page->private)) {
+ z3fold_page_unlock(zhdr);
+ zhdr = NULL;
+ put_cpu_ptr(pool->unbuddied);
+ if (can_sleep)
+ cond_resched();
+ goto lookup;
+ }
+
+ /*
+ * this page could not be removed from its unbuddied
+ * list while pool lock was held, and then we've taken
+ * page lock so kref_put could not be called before
+ * we got here, so it's safe to just call kref_get()
+ */
+ kref_get(&zhdr->refcount);
+ break;
+ }
+ put_cpu_ptr(pool->unbuddied);
+
+ if (!zhdr) {
+ int cpu;
+
+ /* look for _exact_ match on other cpus' lists */
+ for_each_online_cpu(cpu) {
+ struct list_head *l;
+
+ unbuddied = per_cpu_ptr(pool->unbuddied, cpu);
+ spin_lock(&pool->lock);
+ l = &unbuddied[chunks];
+
+ zhdr = list_first_entry_or_null(READ_ONCE(l),
+ struct z3fold_header, buddy);
+
+ if (!zhdr || !z3fold_page_trylock(zhdr)) {
+ spin_unlock(&pool->lock);
+ zhdr = NULL;
+ continue;
+ }
+ list_del_init(&zhdr->buddy);
+ zhdr->cpu = -1;
+ spin_unlock(&pool->lock);
+
+ page = virt_to_page(zhdr);
+ if (test_bit(NEEDS_COMPACTING, &page->private)) {
+ z3fold_page_unlock(zhdr);
+ zhdr = NULL;
+ if (can_sleep)
+ cond_resched();
+ continue;
+ }
+ kref_get(&zhdr->refcount);
+ break;
+ }
+ }
+
+ return zhdr;
+}
/*
* API Functions
@@ -476,6 +759,11 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
pool = kzalloc(sizeof(struct z3fold_pool), gfp);
if (!pool)
goto out;
+ pool->c_handle = kmem_cache_create("z3fold_handle",
+ sizeof(struct z3fold_buddy_slots),
+ SLOTS_ALIGN, 0, NULL);
+ if (!pool->c_handle)
+ goto out_c;
spin_lock_init(&pool->lock);
spin_lock_init(&pool->stale_lock);
pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2);
@@ -497,15 +785,21 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
pool->release_wq = create_singlethread_workqueue(pool->name);
if (!pool->release_wq)
goto out_wq;
+ if (z3fold_register_migration(pool))
+ goto out_rwq;
INIT_WORK(&pool->work, free_pages_work);
pool->ops = ops;
return pool;
+out_rwq:
+ destroy_workqueue(pool->release_wq);
out_wq:
destroy_workqueue(pool->compact_wq);
out_unbuddied:
free_percpu(pool->unbuddied);
out_pool:
+ kmem_cache_destroy(pool->c_handle);
+out_c:
kfree(pool);
out:
return NULL;
@@ -519,6 +813,8 @@ out:
*/
static void z3fold_destroy_pool(struct z3fold_pool *pool)
{
+ kmem_cache_destroy(pool->c_handle);
+ z3fold_unregister_migration(pool);
destroy_workqueue(pool->release_wq);
destroy_workqueue(pool->compact_wq);
kfree(pool);
@@ -546,7 +842,7 @@ static void z3fold_destroy_pool(struct z3fold_pool *pool)
static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
unsigned long *handle)
{
- int chunks = 0, i, freechunks;
+ int chunks = size_to_chunks(size);
struct z3fold_header *zhdr = NULL;
struct page *page = NULL;
enum buddy bud;
@@ -561,56 +857,8 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
bud = HEADLESS;
else {
- struct list_head *unbuddied;
- chunks = size_to_chunks(size);
-
-lookup:
- /* First, try to find an unbuddied z3fold page. */
- unbuddied = get_cpu_ptr(pool->unbuddied);
- for_each_unbuddied_list(i, chunks) {
- struct list_head *l = &unbuddied[i];
-
- zhdr = list_first_entry_or_null(READ_ONCE(l),
- struct z3fold_header, buddy);
-
- if (!zhdr)
- continue;
-
- /* Re-check under lock. */
- spin_lock(&pool->lock);
- l = &unbuddied[i];
- if (unlikely(zhdr != list_first_entry(READ_ONCE(l),
- struct z3fold_header, buddy)) ||
- !z3fold_page_trylock(zhdr)) {
- spin_unlock(&pool->lock);
- put_cpu_ptr(pool->unbuddied);
- goto lookup;
- }
- list_del_init(&zhdr->buddy);
- zhdr->cpu = -1;
- spin_unlock(&pool->lock);
-
- page = virt_to_page(zhdr);
- if (test_bit(NEEDS_COMPACTING, &page->private)) {
- z3fold_page_unlock(zhdr);
- zhdr = NULL;
- put_cpu_ptr(pool->unbuddied);
- if (can_sleep)
- cond_resched();
- goto lookup;
- }
-
- /*
- * this page could not be removed from its unbuddied
- * list while pool lock was held, and then we've taken
- * page lock so kref_put could not be called before
- * we got here, so it's safe to just call kref_get()
- */
- kref_get(&zhdr->refcount);
- break;
- }
- put_cpu_ptr(pool->unbuddied);
-
+retry:
+ zhdr = __z3fold_alloc(pool, size, can_sleep);
if (zhdr) {
if (zhdr->first_chunks == 0) {
if (zhdr->middle_chunks != 0 &&
@@ -630,8 +878,9 @@ lookup:
z3fold_page_unlock(zhdr);
pr_err("No free chunks in unbuddied\n");
WARN_ON(1);
- goto lookup;
+ goto retry;
}
+ page = virt_to_page(zhdr);
goto found;
}
bud = FIRST;
@@ -662,13 +911,18 @@ lookup:
if (!page)
return -ENOMEM;
- atomic64_inc(&pool->pages_nr);
zhdr = init_z3fold_page(page, pool);
+ if (!zhdr) {
+ __free_page(page);
+ return -ENOMEM;
+ }
+ atomic64_inc(&pool->pages_nr);
if (bud == HEADLESS) {
set_bit(PAGE_HEADLESS, &page->private);
goto headless;
}
+ __SetPageMovable(page, pool->inode->i_mapping);
z3fold_page_lock(zhdr);
found:
@@ -680,19 +934,7 @@ found:
zhdr->middle_chunks = chunks;
zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
}
-
- if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
- zhdr->middle_chunks == 0) {
- struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied);
-
- /* Add to unbuddied list */
- freechunks = num_free_chunks(zhdr);
- spin_lock(&pool->lock);
- list_add(&zhdr->buddy, &unbuddied[freechunks]);
- spin_unlock(&pool->lock);
- zhdr->cpu = smp_processor_id();
- put_cpu_ptr(pool->unbuddied);
- }
+ add_to_unbuddied(pool, zhdr);
headless:
spin_lock(&pool->lock);
@@ -739,7 +981,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
spin_lock(&pool->lock);
list_del(&page->lru);
spin_unlock(&pool->lock);
- free_z3fold_page(page);
+ free_z3fold_page(page, true);
atomic64_dec(&pool->pages_nr);
}
return;
@@ -766,6 +1008,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
return;
}
+ free_handle(handle);
if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) {
atomic64_dec(&pool->pages_nr);
return;
@@ -774,7 +1017,8 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
z3fold_page_unlock(zhdr);
return;
}
- if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
+ if (unlikely(PageIsolated(page)) ||
+ test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
z3fold_page_unlock(zhdr);
return;
}
@@ -855,10 +1099,12 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
if (test_and_set_bit(PAGE_CLAIMED, &page->private))
continue;
- zhdr = page_address(page);
+ if (unlikely(PageIsolated(page)))
+ continue;
if (test_bit(PAGE_HEADLESS, &page->private))
break;
+ zhdr = page_address(page);
if (!z3fold_page_trylock(zhdr)) {
zhdr = NULL;
continue; /* can't evict at this point */
@@ -919,7 +1165,7 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
next:
if (test_bit(PAGE_HEADLESS, &page->private)) {
if (ret == 0) {
- free_z3fold_page(page);
+ free_z3fold_page(page, true);
atomic64_dec(&pool->pages_nr);
return 0;
}
@@ -996,6 +1242,8 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
break;
}
+ if (addr)
+ zhdr->mapped_count++;
z3fold_page_unlock(zhdr);
out:
return addr;
@@ -1022,6 +1270,7 @@ static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle)
buddy = handle_to_buddy(handle);
if (buddy == MIDDLE)
clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
+ zhdr->mapped_count--;
z3fold_page_unlock(zhdr);
}
@@ -1036,6 +1285,128 @@ static u64 z3fold_get_pool_size(struct z3fold_pool *pool)
return atomic64_read(&pool->pages_nr);
}
+static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
+{
+ struct z3fold_header *zhdr;
+ struct z3fold_pool *pool;
+
+ VM_BUG_ON_PAGE(!PageMovable(page), page);
+ VM_BUG_ON_PAGE(PageIsolated(page), page);
+
+ if (test_bit(PAGE_HEADLESS, &page->private))
+ return false;
+
+ zhdr = page_address(page);
+ z3fold_page_lock(zhdr);
+ if (test_bit(NEEDS_COMPACTING, &page->private) ||
+ test_bit(PAGE_STALE, &page->private))
+ goto out;
+
+ pool = zhdr_to_pool(zhdr);
+
+ if (zhdr->mapped_count == 0) {
+ kref_get(&zhdr->refcount);
+ if (!list_empty(&zhdr->buddy))
+ list_del_init(&zhdr->buddy);
+ spin_lock(&pool->lock);
+ if (!list_empty(&page->lru))
+ list_del(&page->lru);
+ spin_unlock(&pool->lock);
+ z3fold_page_unlock(zhdr);
+ return true;
+ }
+out:
+ z3fold_page_unlock(zhdr);
+ return false;
+}
+
+static int z3fold_page_migrate(struct address_space *mapping, struct page *newpage,
+ struct page *page, enum migrate_mode mode)
+{
+ struct z3fold_header *zhdr, *new_zhdr;
+ struct z3fold_pool *pool;
+ struct address_space *new_mapping;
+
+ VM_BUG_ON_PAGE(!PageMovable(page), page);
+ VM_BUG_ON_PAGE(!PageIsolated(page), page);
+
+ zhdr = page_address(page);
+ pool = zhdr_to_pool(zhdr);
+
+ if (!trylock_page(page))
+ return -EAGAIN;
+
+ if (!z3fold_page_trylock(zhdr)) {
+ unlock_page(page);
+ return -EAGAIN;
+ }
+ if (zhdr->mapped_count != 0) {
+ z3fold_page_unlock(zhdr);
+ unlock_page(page);
+ return -EBUSY;
+ }
+ new_zhdr = page_address(newpage);
+ memcpy(new_zhdr, zhdr, PAGE_SIZE);
+ newpage->private = page->private;
+ page->private = 0;
+ z3fold_page_unlock(zhdr);
+ spin_lock_init(&new_zhdr->page_lock);
+ new_mapping = page_mapping(page);
+ __ClearPageMovable(page);
+ ClearPagePrivate(page);
+
+ get_page(newpage);
+ z3fold_page_lock(new_zhdr);
+ if (new_zhdr->first_chunks)
+ encode_handle(new_zhdr, FIRST);
+ if (new_zhdr->last_chunks)
+ encode_handle(new_zhdr, LAST);
+ if (new_zhdr->middle_chunks)
+ encode_handle(new_zhdr, MIDDLE);
+ set_bit(NEEDS_COMPACTING, &newpage->private);
+ new_zhdr->cpu = smp_processor_id();
+ spin_lock(&pool->lock);
+ list_add(&newpage->lru, &pool->lru);
+ spin_unlock(&pool->lock);
+ __SetPageMovable(newpage, new_mapping);
+ z3fold_page_unlock(new_zhdr);
+
+ queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);
+
+ page_mapcount_reset(page);
+ unlock_page(page);
+ put_page(page);
+ return 0;
+}
+
+static void z3fold_page_putback(struct page *page)
+{
+ struct z3fold_header *zhdr;
+ struct z3fold_pool *pool;
+
+ zhdr = page_address(page);
+ pool = zhdr_to_pool(zhdr);
+
+ z3fold_page_lock(zhdr);
+ if (!list_empty(&zhdr->buddy))
+ list_del_init(&zhdr->buddy);
+ INIT_LIST_HEAD(&page->lru);
+ if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
+ atomic64_dec(&pool->pages_nr);
+ return;
+ }
+ spin_lock(&pool->lock);
+ list_add(&page->lru, &pool->lru);
+ spin_unlock(&pool->lock);
+ z3fold_page_unlock(zhdr);
+}
+
+static const struct address_space_operations z3fold_aops = {
+ .isolate_page = z3fold_page_isolate,
+ .migratepage = z3fold_page_migrate,
+ .putback_page = z3fold_page_putback,
+};
+
/*****************
* zpool
****************/
@@ -1133,8 +1504,14 @@ MODULE_ALIAS("zpool-z3fold");
static int __init init_z3fold(void)
{
+ int ret;
+
/* Make sure the z3fold header is not larger than the page size */
BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE);
+ ret = z3fold_mount();
+ if (ret)
+ return ret;
+
zpool_register_driver(&z3fold_zpool_driver);
return 0;
@@ -1142,6 +1519,7 @@ static int __init init_z3fold(void)
static void __exit exit_z3fold(void)
{
+ z3fold_unmount();
zpool_unregister_driver(&z3fold_zpool_driver);
}
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
index d3736f5bffec..74cafc0142ea 100644
--- a/net/ceph/pagevec.c
+++ b/net/ceph/pagevec.c
@@ -27,7 +27,7 @@ struct page **ceph_get_direct_page_vector(const void __user *data,
while (got < num_pages) {
rc = get_user_pages_fast(
(unsigned long)data + ((unsigned long)got * PAGE_SIZE),
- num_pages - got, write_page, pages + got);
+ num_pages - got, write_page ? FOLL_WRITE : 0, pages + got);
if (rc < 0)
break;
BUG_ON(rc == 0);
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 71f06900473e..b96fd3f54705 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -163,7 +163,7 @@ nf_hook_entries_grow(const struct nf_hook_entries *old,
static void hooks_validate(const struct nf_hook_entries *hooks)
{
-#ifdef CONFIG_DEBUG_KERNEL
+#ifdef CONFIG_DEBUG_MISC
struct nf_hook_ops **orig_ops;
int prio = INT_MIN;
size_t i = 0;
diff --git a/net/rds/info.c b/net/rds/info.c
index e367a97a18c8..03f6fd56d237 100644
--- a/net/rds/info.c
+++ b/net/rds/info.c
@@ -193,7 +193,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
ret = -ENOMEM;
goto out;
}
- ret = get_user_pages_fast(start, nr_pages, 1, pages);
+ ret = get_user_pages_fast(start, nr_pages, FOLL_WRITE, pages);
if (ret != nr_pages) {
if (ret > 0)
nr_pages = ret;
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 182ab8430594..b340ed4fc43a 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -158,7 +158,8 @@ static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages,
{
int ret;
- ret = get_user_pages_fast(user_addr, nr_pages, write, pages);
+ ret = get_user_pages_fast(user_addr, nr_pages, write ? FOLL_WRITE : 0,
+ pages);
if (ret >= 0 && ret < nr_pages) {
while (ret--)
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 989e52386c35..2b18223e7eb8 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -253,8 +253,8 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem)
return -ENOMEM;
down_read(&current->mm->mmap_sem);
- npgs = get_user_pages_longterm(umem->address, umem->npgs,
- gup_flags, &umem->pgs[0], NULL);
+ npgs = get_user_pages(umem->address, umem->npgs,
+ gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL);
up_read(&current->mm->mmap_sem);
if (npgs != umem->npgs) {
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index bb28b178d929..1c421ac42b07 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -2725,8 +2725,10 @@ sub process {
($line =~ /^\s*(?:WARNING:|BUG:)/ ||
$line =~ /^\s*\[\s*\d+\.\d{6,6}\s*\]/ ||
# timestamp
- $line =~ /^\s*\[\<[0-9a-fA-F]{8,}\>\]/)) {
- # stack dump address
+ $line =~ /^\s*\[\<[0-9a-fA-F]{8,}\>\]/) ||
+ $line =~ /^(?:\s+\w+:\s+[0-9a-fA-F]+){3,3}/ ||
+ $line =~ /^\s*\#\d+\s*\[[0-9a-fA-F]+\]\s*\w+ at [0-9a-fA-F]+/) {
+ # stack dump address styles
$commit_log_possible_stack_dump = 1;
}
@@ -3069,21 +3071,21 @@ sub process {
# check SPDX comment style for .[chsS] files
if ($realfile =~ /\.[chsS]$/ &&
$rawline =~ /SPDX-License-Identifier:/ &&
- $rawline !~ /^\+\s*\Q$comment\E\s*/) {
+ $rawline !~ m@^\+\s*\Q$comment\E\s*@) {
WARN("SPDX_LICENSE_TAG",
"Improper SPDX comment style for '$realfile', please use '$comment' instead\n" . $herecurr);
}
if ($comment !~ /^$/ &&
- $rawline !~ /^\+\Q$comment\E SPDX-License-Identifier: /) {
- WARN("SPDX_LICENSE_TAG",
- "Missing or malformed SPDX-License-Identifier tag in line $checklicenseline\n" . $herecurr);
+ $rawline !~ m@^\+\Q$comment\E SPDX-License-Identifier: @) {
+ WARN("SPDX_LICENSE_TAG",
+ "Missing or malformed SPDX-License-Identifier tag in line $checklicenseline\n" . $herecurr);
} elsif ($rawline =~ /(SPDX-License-Identifier: .*)/) {
- my $spdx_license = $1;
- if (!is_SPDX_License_valid($spdx_license)) {
- WARN("SPDX_LICENSE_TAG",
- "'$spdx_license' is not supported in LICENSES/...\n" . $herecurr);
- }
+ my $spdx_license = $1;
+ if (!is_SPDX_License_valid($spdx_license)) {
+ WARN("SPDX_LICENSE_TAG",
+ "'$spdx_license' is not supported in LICENSES/...\n" . $herecurr);
+ }
}
}
}
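
The checkpatch.pl hunk above widens commit-log stack-dump detection: besides timestamps and plain "[<address>]" frames, a line now also counts when it looks like a register dump ("RAX: ... RBX: ... RCX: ...") or like a crash(8) backtrace frame ("#3 [addr] func at addr"). As a rough illustration only (not part of the patch), the two new Perl patterns carry over almost verbatim to Python, which makes it easy to check which sample lines would now set $commit_log_possible_stack_dump; the sample lines below are invented:

    import re

    # Python equivalents of the two patterns added to checkpatch.pl above.
    reg_dump = re.compile(r'^(?:\s+\w+:\s+[0-9a-fA-F]+){3,3}')
    crash_frame = re.compile(r'^\s*\#\d+\s*\[[0-9a-fA-F]+\]\s*\w+ at [0-9a-fA-F]+')

    samples = [
        " RAX: 0000000000000000 RBX: ffff8800b8b44000 RCX: 0000000000000006",
        " #3 [ffff880087a2f9f8] oops_end at ffffffff81008ae7",
        "An ordinary sentence in a commit log.",
    ]
    for line in samples:
        hit = bool(reg_dump.search(line) or crash_frame.search(line))
        print("stack dump" if hit else "plain text", "|", line)

Only the first two sample lines are classified as stack-dump output; the last one is not.
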
diff --git a/scripts/gdb/linux/clk.py b/scripts/gdb/linux/clk.py
new file mode 100644
index 000000000000..6bf71976b55d
--- /dev/null
+++ b/scripts/gdb/linux/clk.py
@@ -0,0 +1,69 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (c) NXP 2019
+
+import gdb
+import sys
+
+from linux import utils, lists
+
+clk_core_type = utils.CachedType("struct clk_core")
+
+
+def clk_core_for_each_child(hlist_head):
+ return lists.hlist_for_each_entry(hlist_head,
+ clk_core_type.get_type().pointer(), "child_node")
+
+
+class LxClkSummary(gdb.Command):
+ """Print Linux kernel log buffer."""
+
+ def __init__(self):
+ super(LxClkSummary, self).__init__("lx-clk-summary", gdb.COMMAND_DATA)
+
+ def show_subtree(self, clk, level):
+ gdb.write("%*s%-*s %7d %8d %8d\n" % (
+ level * 3 + 1, "",
+ 30 - level * 3,
+ clk['name'].string(),
+ clk['enable_count'],
+ clk['prepare_count'],
+ clk['protect_count']))
+
+ for child in clk_core_for_each_child(clk['children']):
+ self.show_subtree(child, level + 1)
+
+ def invoke(self, arg, from_tty):
+ gdb.write(" enable prepare protect\n")
+ gdb.write(" clock count count count\n")
+ gdb.write("---------------------------------------------------------\n")
+ for clk in clk_core_for_each_child(gdb.parse_and_eval("clk_root_list")):
+ self.show_subtree(clk, 0)
+ for clk in clk_core_for_each_child(gdb.parse_and_eval("clk_orphan_list")):
+ self.show_subtree(clk, 0)
+
+
+LxClkSummary()
+
+
+class LxClkCoreLookup(gdb.Function):
+ """Find struct clk_core by name"""
+
+ def __init__(self):
+ super(LxClkCoreLookup, self).__init__("lx_clk_core_lookup")
+
+ def lookup_hlist(self, hlist_head, name):
+ for child in clk_core_for_each_child(hlist_head):
+ if child['name'].string() == name:
+ return child
+ result = self.lookup_hlist(child['children'], name)
+ if result:
+ return result
+
+ def invoke(self, name):
+ name = name.string()
+ return (self.lookup_hlist(gdb.parse_and_eval("clk_root_list"), name) or
+ self.lookup_hlist(gdb.parse_and_eval("clk_orphan_list"), name))
+
+
+LxClkCoreLookup()
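
For context, a sketch (not part of the patch) of how the new clk helpers are meant to be driven once vmlinux-gdb.py has been sourced and gdb is attached to a running kernel; the clock name "uart1" is only a placeholder and depends on the target's clk tree:

    import gdb

    # Dump the whole clk tree (enable/prepare/protect counts per clock).
    gdb.execute("lx-clk-summary")

    # Look up one clock by name via the new convenience function and
    # inspect its struct clk_core fields directly.
    clk = gdb.parse_and_eval('$lx_clk_core_lookup("uart1")')
    gdb.write("{}: enable_count={}\n".format(clk['name'].string(),
                                             int(clk['enable_count'])))
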
diff --git a/scripts/gdb/linux/config.py b/scripts/gdb/linux/config.py
new file mode 100644
index 000000000000..90e1565b1967
--- /dev/null
+++ b/scripts/gdb/linux/config.py
@@ -0,0 +1,44 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright 2019 Google LLC.
+
+import gdb
+import zlib
+
+from linux import utils
+
+
+class LxConfigDump(gdb.Command):
+ """Output kernel config to the filename specified as the command
+ argument. Equivalent to 'zcat /proc/config.gz > config.txt' on
+ a running target"""
+
+ def __init__(self):
+ super(LxConfigDump, self).__init__("lx-configdump", gdb.COMMAND_DATA,
+ gdb.COMPLETE_FILENAME)
+
+ def invoke(self, arg, from_tty):
+ if len(arg) == 0:
+ filename = "config.txt"
+ else:
+ filename = arg
+
+ try:
+ py_config_ptr = gdb.parse_and_eval("kernel_config_data + 8")
+ py_config_size = gdb.parse_and_eval(
+ "sizeof(kernel_config_data) - 1 - 8 * 2")
+ except gdb.error as e:
+ raise gdb.GdbError("Can't find config, enable CONFIG_IKCONFIG?")
+
+ inf = gdb.inferiors()[0]
+ zconfig_buf = utils.read_memoryview(inf, py_config_ptr,
+ py_config_size).tobytes()
+
+ config_buf = zlib.decompress(zconfig_buf, 16)
+ with open(filename, 'wb') as f:
+ f.write(config_buf)
+
+ gdb.write("Dumped config to " + filename + "\n")
+
+
+LxConfigDump()
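
A brief usage sketch (not part of the patch), assuming the target kernel was built with CONFIG_IKCONFIG and vmlinux-gdb.py has been sourced; the output file name is arbitrary:

    import gdb

    # Write the embedded .config to a file, then inspect it like any text file.
    gdb.execute("lx-configdump target-config.txt")
    with open("target-config.txt") as f:
        for line in f:
            if line.startswith("CONFIG_IKCONFIG"):
                print(line.rstrip())
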
diff --git a/scripts/gdb/linux/constants.py.in b/scripts/gdb/linux/constants.py.in
index d3319a80788a..76f46f427b96 100644
--- a/scripts/gdb/linux/constants.py.in
+++ b/scripts/gdb/linux/constants.py.in
@@ -13,8 +13,10 @@
*/
#include <linux/fs.h>
+#include <linux/hrtimer.h>
#include <linux/mount.h>
#include <linux/of_fdt.h>
+#include <linux/threads.h>
/* We need to stringify expanded macros so that they can be parsed */
@@ -44,6 +46,9 @@ LX_VALUE(SB_DIRSYNC)
LX_VALUE(SB_NOATIME)
LX_VALUE(SB_NODIRATIME)
+/* linux/htimer.h */
+LX_GDBPARSED(hrtimer_resolution)
+
/* linux/mount.h */
LX_VALUE(MNT_NOSUID)
LX_VALUE(MNT_NODEV)
@@ -52,8 +57,16 @@ LX_VALUE(MNT_NOATIME)
LX_VALUE(MNT_NODIRATIME)
LX_VALUE(MNT_RELATIME)
+/* linux/threads.h */
+LX_VALUE(NR_CPUS)
+
/* linux/of_fdt.h> */
LX_VALUE(OF_DT_HEADER)
/* Kernel Configs */
+LX_CONFIG(CONFIG_GENERIC_CLOCKEVENTS)
+LX_CONFIG(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST)
+LX_CONFIG(CONFIG_HIGH_RES_TIMERS)
+LX_CONFIG(CONFIG_NR_CPUS)
LX_CONFIG(CONFIG_OF)
+LX_CONFIG(CONFIG_TICK_ONESHOT)
diff --git a/scripts/gdb/linux/cpus.py b/scripts/gdb/linux/cpus.py
index ca11e8df31b6..008e62f3190d 100644
--- a/scripts/gdb/linux/cpus.py
+++ b/scripts/gdb/linux/cpus.py
@@ -135,6 +135,7 @@ and can help identify the state of hotplugged CPUs"""
gdb.write("Online CPUs : {}\n".format(list(each_online_cpu())))
gdb.write("Active CPUs : {}\n".format(list(each_active_cpu())))
+
LxCpus()
diff --git a/scripts/gdb/linux/lists.py b/scripts/gdb/linux/lists.py
index 2f335fbd86fd..55356b66f8ea 100644
--- a/scripts/gdb/linux/lists.py
+++ b/scripts/gdb/linux/lists.py
@@ -16,6 +16,8 @@ import gdb
from linux import utils
list_head = utils.CachedType("struct list_head")
+hlist_head = utils.CachedType("struct hlist_head")
+hlist_node = utils.CachedType("struct hlist_node")
def list_for_each(head):
@@ -39,6 +41,27 @@ def list_for_each_entry(head, gdbtype, member):
yield utils.container_of(node, gdbtype, member)
+def hlist_for_each(head):
+ if head.type == hlist_head.get_type().pointer():
+ head = head.dereference()
+ elif head.type != hlist_head.get_type():
+ raise gdb.GdbError("Must be struct hlist_head not {}"
+ .format(head.type))
+
+ node = head['first'].dereference()
+ while node.address:
+ yield node.address
+ node = node['next'].dereference()
+
+
+def hlist_for_each_entry(head, gdbtype, member):
+ for node in hlist_for_each(head):
+ if node.type != hlist_node.get_type().pointer():
+ raise TypeError("Type {} found. Expected struct hlist_head *."
+ .format(node.type))
+ yield utils.container_of(node, gdbtype, member)
+
+
def list_check(head):
nb = 0
if (head.type == list_head.get_type().pointer()):
@@ -110,4 +133,5 @@ class LxListChk(gdb.Command):
raise gdb.GdbError("lx-list-check takes one argument")
list_check(gdb.parse_and_eval(argv[0]))
+
LxListChk()
diff --git a/scripts/gdb/linux/proc.py b/scripts/gdb/linux/proc.py
index 2f01a958eb22..6a56bba233a9 100644
--- a/scripts/gdb/linux/proc.py
+++ b/scripts/gdb/linux/proc.py
@@ -29,6 +29,7 @@ class LxCmdLine(gdb.Command):
def invoke(self, arg, from_tty):
gdb.write(gdb.parse_and_eval("saved_command_line").string() + "\n")
+
LxCmdLine()
@@ -43,6 +44,7 @@ class LxVersion(gdb.Command):
# linux_banner should contain a newline
gdb.write(gdb.parse_and_eval("(char *)linux_banner").string())
+
LxVersion()
@@ -86,6 +88,7 @@ Equivalent to cat /proc/iomem on a running target"""
def invoke(self, arg, from_tty):
return show_lx_resources("iomem_resource")
+
LxIOMem()
@@ -100,6 +103,7 @@ Equivalent to cat /proc/ioports on a running target"""
def invoke(self, arg, from_tty):
return show_lx_resources("ioport_resource")
+
LxIOPorts()
@@ -149,7 +153,7 @@ values of that process namespace"""
if len(argv) >= 1:
try:
pid = int(argv[0])
- except:
+ except ValueError:
raise gdb.GdbError("Provide a PID as integer value")
else:
pid = 1
@@ -195,6 +199,7 @@ values of that process namespace"""
info_opts(FS_INFO, s_flags),
info_opts(MNT_INFO, m_flags)))
+
LxMounts()
@@ -259,7 +264,7 @@ class LxFdtDump(gdb.Command):
try:
f = open(filename, 'wb')
- except:
+ except IOError:
raise gdb.GdbError("Could not open file to dump fdt")
f.write(fdt_buf)
@@ -267,4 +272,5 @@ class LxFdtDump(gdb.Command):
gdb.write("Dumped fdt blob to " + filename + "\n")
+
LxFdtDump()
diff --git a/scripts/gdb/linux/rbtree.py b/scripts/gdb/linux/rbtree.py
new file mode 100644
index 000000000000..39db889b874c
--- /dev/null
+++ b/scripts/gdb/linux/rbtree.py
@@ -0,0 +1,177 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright 2019 Google LLC.
+
+import gdb
+
+from linux import utils
+
+rb_root_type = utils.CachedType("struct rb_root")
+rb_node_type = utils.CachedType("struct rb_node")
+
+
+def rb_first(root):
+ if root.type == rb_root_type.get_type():
+ root = root.address.cast(rb_root_type.get_type().pointer())
+ elif root.type != rb_root_type.get_type().pointer():
+ raise gdb.GdbError("Must be struct rb_root not {}".format(root.type))
+
+ node = root['rb_node']
+ if node == 0:
+ return None
+
+ while node['rb_left']:
+ node = node['rb_left']
+
+ return node
+
+
+def rb_last(root):
+ if root.type == rb_root_type.get_type():
+ root = root.address.cast(rb_root_type.get_type().pointer())
+ elif root.type != rb_root_type.get_type().pointer():
+ raise gdb.GdbError("Must be struct rb_root not {}".format(root.type))
+
+ node = root['rb_node']
+ if node == 0:
+ return None
+
+ while node['rb_right']:
+ node = node['rb_right']
+
+ return node
+
+
+def rb_parent(node):
+ parent = gdb.Value(node['__rb_parent_color'] & ~3)
+ return parent.cast(rb_node_type.get_type().pointer())
+
+
+def rb_empty_node(node):
+ return node['__rb_parent_color'] == node.address
+
+
+def rb_next(node):
+ if node.type == rb_node_type.get_type():
+ node = node.address.cast(rb_node_type.get_type().pointer())
+ elif node.type != rb_node_type.get_type().pointer():
+ raise gdb.GdbError("Must be struct rb_node not {}".format(node.type))
+
+ if rb_empty_node(node):
+ return None
+
+ if node['rb_right']:
+ node = node['rb_right']
+ while node['rb_left']:
+ node = node['rb_left']
+ return node
+
+ parent = rb_parent(node)
+ while parent and node == parent['rb_right']:
+ node = parent
+ parent = rb_parent(node)
+
+ return parent
+
+
+def rb_prev(node):
+ if node.type == rb_node_type.get_type():
+ node = node.address.cast(rb_node_type.get_type().pointer())
+ elif node.type != rb_node_type.get_type().pointer():
+ raise gdb.GdbError("Must be struct rb_node not {}".format(node.type))
+
+ if rb_empty_node(node):
+ return None
+
+ if node['rb_left']:
+ node = node['rb_left']
+ while node['rb_right']:
+ node = node['rb_right']
+ return node
+
+ parent = rb_parent(node)
+ while parent and node == parent['rb_left']:
+ node = parent
+ parent = rb_parent(node)
+
+ return parent
+
+
+class LxRbFirst(gdb.Function):
+ """Lookup and return a node from an RBTree
+
+$lx_rb_first(root): Return a pointer to the leftmost (first) node of the
+rb tree whose root is at the given struct rb_root."""
+
+ def __init__(self):
+ super(LxRbFirst, self).__init__("lx_rb_first")
+
+ def invoke(self, root):
+ result = rb_first(root)
+ if result is None:
+ raise gdb.GdbError("No entry in tree")
+
+ return result
+
+
+LxRbFirst()
+
+
+class LxRbLast(gdb.Function):
+ """Lookup and return a node from an RBTree.
+
+$lx_rb_last(root): Return a pointer to the rightmost (last) node of the
+rb tree whose root is at the given struct rb_root."""
+
+ def __init__(self):
+ super(LxRbLast, self).__init__("lx_rb_last")
+
+ def invoke(self, root):
+ result = rb_last(root)
+ if result is None:
+ raise gdb.GdbError("No entry in tree")
+
+ return result
+
+
+LxRbLast()
+
+
+class LxRbNext(gdb.Function):
+ """Lookup and return a node from an RBTree.
+
+$lx_rb_next(node): Return a pointer to the in-order successor of the given
+struct rb_node, or raise an error if it is the last node in its tree."""
+
+ def __init__(self):
+ super(LxRbNext, self).__init__("lx_rb_next")
+
+ def invoke(self, node):
+ result = rb_next(node)
+ if result is None:
+ raise gdb.GdbError("No entry in tree")
+
+ return result
+
+
+LxRbNext()
+
+
+class LxRbPrev(gdb.Function):
+ """Lookup and return a node from an RBTree.
+
+$lx_rb_prev(node): Return a pointer to the in-order predecessor of the given
+struct rb_node, or raise an error if it is the first node in its tree."""
+
+ def __init__(self):
+ super(LxRbPrev, self).__init__("lx_rb_prev")
+
+ def invoke(self, node):
+ result = rb_prev(node)
+ if result is None:
+ raise gdb.GdbError("No entry in tree")
+
+ return result
+
+
+LxRbPrev()
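
As an illustration (not part of the patch) of how the new helpers compose with utils.container_of, the sketch below walks one well-known kernel rbtree, vmap_area_root, in order from a gdb session; any structure embedding a struct rb_node can be walked the same way, and the chosen tree and member names are only an example:

    import gdb
    from linux import rbtree, utils

    vmap_area_type = utils.CachedType("struct vmap_area")

    node = rbtree.rb_first(gdb.parse_and_eval("&vmap_area_root"))
    while node:
        # Convert the embedded rb_node pointer back to its containing struct.
        va = utils.container_of(node, vmap_area_type.get_type().pointer(),
                                "rb_node")
        gdb.write("0x{:x} - 0x{:x}\n".format(int(va['va_start']),
                                             int(va['va_end'])))
        node = rbtree.rb_next(node)
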
diff --git a/scripts/gdb/linux/symbols.py b/scripts/gdb/linux/symbols.py
index 004b0ac7fa72..2f5b95f09fa0 100644
--- a/scripts/gdb/linux/symbols.py
+++ b/scripts/gdb/linux/symbols.py
@@ -139,8 +139,12 @@ lx-symbols command."""
saved_states.append({'breakpoint': bp, 'enabled': bp.enabled})
# drop all current symbols and reload vmlinux
+ orig_vmlinux = 'vmlinux'
+ for obj in gdb.objfiles():
+ if obj.filename.endswith('vmlinux'):
+ orig_vmlinux = obj.filename
gdb.execute("symbol-file", to_string=True)
- gdb.execute("symbol-file vmlinux")
+ gdb.execute("symbol-file {0}".format(orig_vmlinux))
self.loaded_modules = []
module_list = modules.module_list()
diff --git a/scripts/gdb/linux/tasks.py b/scripts/gdb/linux/tasks.py
index f6ab3ccf698f..0301dc1e0138 100644
--- a/scripts/gdb/linux/tasks.py
+++ b/scripts/gdb/linux/tasks.py
@@ -79,6 +79,7 @@ class LxPs(gdb.Command):
pid=task["pid"],
comm=task["comm"].string()))
+
LxPs()
@@ -134,4 +135,5 @@ variable."""
else:
raise gdb.GdbError("No task of PID " + str(pid))
+
LxThreadInfoByPidFunc()
diff --git a/scripts/gdb/linux/timerlist.py b/scripts/gdb/linux/timerlist.py
new file mode 100644
index 000000000000..071d0dd5a634
--- /dev/null
+++ b/scripts/gdb/linux/timerlist.py
@@ -0,0 +1,219 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright 2019 Google LLC.
+
+import binascii
+import gdb
+
+from linux import constants
+from linux import cpus
+from linux import rbtree
+from linux import utils
+
+timerqueue_node_type = utils.CachedType("struct timerqueue_node").get_type()
+hrtimer_type = utils.CachedType("struct hrtimer").get_type()
+
+
+def ktime_get():
+ """Returns the current time, but not very accurately
+
+ We can't read the hardware timer itself to add any nanoseconds
+ that need to be added since we last stored the time in the
+ timekeeper. But this is probably good enough for debug purposes."""
+ tk_core = gdb.parse_and_eval("&tk_core")
+
+ return tk_core['timekeeper']['tkr_mono']['base']
+
+
+def print_timer(rb_node, idx):
+ timerqueue = utils.container_of(rb_node, timerqueue_node_type.pointer(),
+ "node")
+ timer = utils.container_of(timerqueue, hrtimer_type.pointer(), "node")
+
+ function = str(timer['function']).split(" ")[1].strip("<>")
+ softexpires = timer['_softexpires']
+ expires = timer['node']['expires']
+ now = ktime_get()
+
+ text = " #{}: <{}>, {}, ".format(idx, timer, function)
+ text += "S:{:02x}\n".format(int(timer['state']))
+ text += " # expires at {}-{} nsecs [in {} to {} nsecs]\n".format(
+ softexpires, expires, softexpires - now, expires - now)
+ return text
+
+
+def print_active_timers(base):
+ curr = base['active']['next']['node']
+ curr = curr.address.cast(rbtree.rb_node_type.get_type().pointer())
+ idx = 0
+ while curr:
+ yield print_timer(curr, idx)
+ curr = rbtree.rb_next(curr)
+ idx += 1
+
+
+def print_base(base):
+ text = " .base: {}\n".format(base.address)
+ text += " .index: {}\n".format(base['index'])
+
+ text += " .resolution: {} nsecs\n".format(constants.LX_hrtimer_resolution)
+
+ text += " .get_time: {}\n".format(base['get_time'])
+ if constants.LX_CONFIG_HIGH_RES_TIMERS:
+ text += " .offset: {} nsecs\n".format(base['offset'])
+ text += "active timers:\n"
+ text += "".join([x for x in print_active_timers(base)])
+ return text
+
+
+def print_cpu(hrtimer_bases, cpu, max_clock_bases):
+ cpu_base = cpus.per_cpu(hrtimer_bases, cpu)
+ jiffies = gdb.parse_and_eval("jiffies_64")
+ tick_sched_ptr = gdb.parse_and_eval("&tick_cpu_sched")
+ ts = cpus.per_cpu(tick_sched_ptr, cpu)
+
+ text = "cpu: {}\n".format(cpu)
+ for i in range(int(max_clock_bases)):
+ text += " clock {}:\n".format(i)
+ text += print_base(cpu_base['clock_base'][i])
+
+ if constants.LX_CONFIG_HIGH_RES_TIMERS:
+ fmts = [(" .{} : {} nsecs", 'expires_next'),
+ (" .{} : {}", 'hres_active'),
+ (" .{} : {}", 'nr_events'),
+ (" .{} : {}", 'nr_retries'),
+ (" .{} : {}", 'nr_hangs'),
+ (" .{} : {}", 'max_hang_time')]
+ text += "\n".join([s.format(f, cpu_base[f]) for s, f in fmts])
+ text += "\n"
+
+ if constants.LX_CONFIG_TICK_ONESHOT:
+ fmts = [(" .{} : {}", 'nohz_mode'),
+ (" .{} : {} nsecs", 'last_tick'),
+ (" .{} : {}", 'tick_stopped'),
+ (" .{} : {}", 'idle_jiffies'),
+ (" .{} : {}", 'idle_calls'),
+ (" .{} : {}", 'idle_sleeps'),
+ (" .{} : {} nsecs", 'idle_entrytime'),
+ (" .{} : {} nsecs", 'idle_waketime'),
+ (" .{} : {} nsecs", 'idle_exittime'),
+ (" .{} : {} nsecs", 'idle_sleeptime'),
+ (" .{}: {} nsecs", 'iowait_sleeptime'),
+ (" .{} : {}", 'last_jiffies'),
+ (" .{} : {}", 'next_timer'),
+ (" .{} : {} nsecs", 'idle_expires')]
+ text += "\n".join([s.format(f, ts[f]) for s, f in fmts])
+ text += "\njiffies: {}\n".format(jiffies)
+
+ text += "\n"
+
+ return text
+
+
+def print_tickdevice(td, cpu):
+ dev = td['evtdev']
+ text = "Tick Device: mode: {}\n".format(td['mode'])
+ if cpu < 0:
+ text += "Broadcast device\n"
+ else:
+ text += "Per CPU device: {}\n".format(cpu)
+
+ text += "Clock Event Device: "
+ if dev == 0:
+ text += "<NULL>\n"
+ return text
+
+ text += "{}\n".format(dev['name'])
+ text += " max_delta_ns: {}\n".format(dev['max_delta_ns'])
+ text += " min_delta_ns: {}\n".format(dev['min_delta_ns'])
+ text += " mult: {}\n".format(dev['mult'])
+ text += " shift: {}\n".format(dev['shift'])
+ text += " mode: {}\n".format(dev['state_use_accessors'])
+ text += " next_event: {} nsecs\n".format(dev['next_event'])
+
+ text += " set_next_event: {}\n".format(dev['set_next_event'])
+
+ members = [('set_state_shutdown', " shutdown: {}\n"),
+ ('set_state_periodic', " periodic: {}\n"),
+ ('set_state_oneshot', " oneshot: {}\n"),
+ ('set_state_oneshot_stopped', " oneshot stopped: {}\n"),
+ ('tick_resume', " resume: {}\n")]
+ for member, fmt in members:
+ if dev[member]:
+ text += fmt.format(dev[member])
+
+ text += " event_handler: {}\n".format(dev['event_handler'])
+ text += " retries: {}\n".format(dev['retries'])
+
+ return text
+
+
+def pr_cpumask(mask):
+ nr_cpu_ids = 1
+ if constants.LX_NR_CPUS > 1:
+ nr_cpu_ids = gdb.parse_and_eval("nr_cpu_ids")
+
+ inf = gdb.inferiors()[0]
+ bits = mask['bits']
+ num_bytes = (int(nr_cpu_ids) + 7) // 8
+ buf = utils.read_memoryview(inf, bits, num_bytes).tobytes()
+ buf = binascii.b2a_hex(buf).decode()
+
+ chunks = []
+ i = num_bytes
+ while i > 0:
+ i -= 1
+ start = i * 2
+ end = start + 2
+ chunks.append(buf[start:end])
+ if i != 0 and i % 4 == 0:
+ chunks.append(',')
+
+ extra = nr_cpu_ids % 8
+ if 0 < extra <= 4:
+ chunks[0] = chunks[0][1] # Cut off the leading 0
+
+ return "".join(chunks)
+
+
+class LxTimerList(gdb.Command):
+ """Print /proc/timer_list"""
+
+ def __init__(self):
+ super(LxTimerList, self).__init__("lx-timerlist", gdb.COMMAND_DATA)
+
+ def invoke(self, arg, from_tty):
+ hrtimer_bases = gdb.parse_and_eval("&hrtimer_bases")
+ max_clock_bases = gdb.parse_and_eval("HRTIMER_MAX_CLOCK_BASES")
+
+ text = "Timer List Version: gdb scripts\n"
+ text += "HRTIMER_MAX_CLOCK_BASES: {}\n".format(max_clock_bases)
+ text += "now at {} nsecs\n".format(ktime_get())
+
+ for cpu in cpus.each_online_cpu():
+ text += print_cpu(hrtimer_bases, cpu, max_clock_bases)
+
+ if constants.LX_CONFIG_GENERIC_CLOCKEVENTS:
+ if constants.LX_CONFIG_GENERIC_CLOCKEVENTS_BROADCAST:
+ bc_dev = gdb.parse_and_eval("&tick_broadcast_device")
+ text += print_tickdevice(bc_dev, -1)
+ text += "\n"
+ mask = gdb.parse_and_eval("tick_broadcast_mask")
+ mask = pr_cpumask(mask)
+ text += "tick_broadcast_mask: {}\n".format(mask)
+ if constants.LX_CONFIG_TICK_ONESHOT:
+ mask = gdb.parse_and_eval("tick_broadcast_oneshot_mask")
+ mask = pr_cpumask(mask)
+ text += "tick_broadcast_oneshot_mask: {}\n".format(mask)
+ text += "\n"
+
+ tick_cpu_devices = gdb.parse_and_eval("&tick_cpu_device")
+ for cpu in cpus.each_online_cpu():
+ tick_dev = cpus.per_cpu(tick_cpu_devices, cpu)
+ text += print_tickdevice(tick_dev, cpu)
+ text += "\n"
+
+ gdb.write(text)
+
+
+LxTimerList()
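
The cpumask rendering done by pr_cpumask() above is easy to check in isolation. Below is a simplified, standalone re-rendering of the same output format (comma-separated 32-bit groups, most significant first, as in /proc/timer_list); it is an illustration, not a byte-for-byte copy of pr_cpumask():

    def format_cpumask(mask_bytes, nr_cpu_ids):
        # mask_bytes is the little-endian raw bitmap, as read from the target.
        value = int.from_bytes(mask_bytes, "little")
        nibbles = (nr_cpu_ids + 3) // 4          # hex digits actually needed
        digits = format(value, "0{}x".format(nibbles))
        groups = []
        while digits:
            groups.append(digits[-8:])           # 8 hex digits = 32 CPUs
            digits = digits[:-8]
        return ",".join(reversed(groups))

    print(format_cpumask(bytes([0x4f]), 8))         # CPUs 0-3 and 6 -> "4f"
    print(format_cpumask(bytes([0xff, 0x01]), 12))  # CPUs 0-8      -> "1ff"
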
diff --git a/scripts/gdb/linux/utils.py b/scripts/gdb/linux/utils.py
index 50805874cfc3..bc67126118c4 100644
--- a/scripts/gdb/linux/utils.py
+++ b/scripts/gdb/linux/utils.py
@@ -66,6 +66,7 @@ Note that TYPE and ELEMENT have to be quoted as strings."""
return container_of(ptr, gdb.lookup_type(typename.string()).pointer(),
elementname.string())
+
ContainerOf()
@@ -148,14 +149,14 @@ def get_gdbserver_type():
def probe_qemu():
try:
return gdb.execute("monitor info version", to_string=True) != ""
- except:
+ except gdb.error:
return False
def probe_kgdb():
try:
thread_info = gdb.execute("info thread 2", to_string=True)
return "shadowCPU0" in thread_info
- except:
+ except gdb.error:
return False
global gdbserver_type
@@ -172,7 +173,7 @@ def get_gdbserver_type():
def gdb_eval_or_none(expresssion):
try:
return gdb.parse_and_eval(expresssion)
- except:
+ except gdb.error:
return None
diff --git a/scripts/gdb/vmlinux-gdb.py b/scripts/gdb/vmlinux-gdb.py
index 6e0b0afd888a..eff5a48ac026 100644
--- a/scripts/gdb/vmlinux-gdb.py
+++ b/scripts/gdb/vmlinux-gdb.py
@@ -27,7 +27,11 @@ else:
import linux.modules
import linux.dmesg
import linux.tasks
+ import linux.config
import linux.cpus
import linux.lists
+ import linux.rbtree
import linux.proc
import linux.constants
+ import linux.timerlist
+ import linux.clk
diff --git a/scripts/spelling.txt b/scripts/spelling.txt
index 86b87332b9e5..dcd25a31f9f5 100644
--- a/scripts/spelling.txt
+++ b/scripts/spelling.txt
@@ -21,11 +21,14 @@ absolut||absolute
absoulte||absolute
acccess||access
acceess||access
+accelaration||acceleration
acceleratoin||acceleration
+acceleratrion||acceleration
accelleration||acceleration
accesing||accessing
accesnt||accent
accessable||accessible
+accessiable||accessible
accesss||access
accidentaly||accidentally
accidentually||accidentally
@@ -39,45 +42,58 @@ accout||account
accquire||acquire
accquired||acquired
accross||across
+accss||access
acessable||accessible
acess||access
+acheived||achieved
achitecture||architecture
acient||ancient
acitions||actions
acitve||active
acknowldegement||acknowledgment
acknowledgement||acknowledgment
+acknowleding||acknowledging
ackowledge||acknowledge
ackowledged||acknowledged
+ackowledgement||acknowledgement
+ackowledges||acknowledges
acording||according
-activete||activate
actived||activated
+activete||activate
actualy||actually
acumulating||accumulating
acumulator||accumulator
+acutally||actually
adapater||adapter
+adddress||address
+adderss||address
addional||additional
additionaly||additionally
+additionnally||additionally
+additoinal||additional
additonal||additional
addres||address
-adddress||address
addreses||addresses
addresss||address
addrress||address
aditional||additional
aditionally||additionally
aditionaly||additionally
+adjusment||adjustment
adminstrative||administrative
adress||address
adresses||addresses
adrresses||addresses
advertisment||advertisement
adviced||advised
+advnace||advance
afecting||affecting
+affvecs||affects
againt||against
agaist||against
aggreataon||aggregation
aggreation||aggregation
+agruments||arguments
albumns||albums
alegorical||allegorical
algined||aligned
@@ -87,12 +103,15 @@ algoritm||algorithm
algoritms||algorithms
algorrithm||algorithm
algorritm||algorithm
+algrthm||algorithm
aligment||alignment
alignement||alignment
allign||align
alligned||aligned
alllocate||allocate
alloated||allocated
+allocagtor||allocator
+allocas||allices
allocatote||allocate
allocatrd||allocated
allocte||allocate
@@ -103,23 +122,29 @@ alogrithm||algorithm
alot||a lot
alow||allow
alows||allows
+alreaady||already
altough||although
alue||value
+amacout||mahout
ambigious||ambiguous
+ammount||amount
+ammounts||amounts
amoung||among
amout||amount
amplifer||amplifier
amplifyer||amplifier
-an union||a union
-an user||a user
-an userspace||a userspace
-an one||a one
+analagous||analogous
+analogous||analogous
analysator||analyzer
ang||and
anniversery||anniversary
annoucement||announcement
anomolies||anomalies
anomoly||anomaly
+an one||a one
+an union||a union
+an user||a user
+an userspace||a userspace
anway||anyway
aplication||application
appearence||appearance
@@ -134,48 +159,68 @@ approriate||appropriate
approriately||appropriately
apropriate||appropriate
aquainted||acquainted
+aquaris||Aquarius
+aquire||acquire
aquired||acquired
aquisition||acquisition
arbitary||arbitrary
+arbitralrily||arbitrarily
+arbitray||arbitrary
architechture||architecture
arguement||argument
arguements||arguments
+argumetns||arguments
aritmetic||arithmetic
arne't||aren't
arraival||arrival
artifical||artificial
artillary||artillery
asign||assign
+assebmly||assembly
asser||assert
assertation||assertion
assertting||asserting
+assgined||assigned
assiged||assigned
assigment||assignment
assigments||assignments
assistent||assistant
+assmebly||assembly
+assocaition||association
assocation||association
+assocativity||associativity
associcated||associated
+assoicatied||associated
assotiated||associated
asssert||assert
assum||assume
+assumming||assuming
assumtpion||assumption
+asume||assume
asuming||assuming
asycronous||asynchronous
-asynchnous||asynchronous
-asynchromous||asynchronous
asymetric||asymmetric
asymmeric||asymmetric
+asynchnous||asynchronous
+asynchromous||asynchronous
+asynchrounous||asynchronous
+atempt||attempt
atomatically||automatically
atomicly||atomically
-atempt||attempt
+atribute||attribute
attachement||attachment
attched||attached
-attemps||attempts
attemping||attempting
+attemppt||attempt
+attemps||attempts
attepmpt||attempt
+attirbutes||attributes
attnetion||attention
+attribut||attribute
attruibutes||attributes
authentification||authentication
+autherr||author
+automagicaly||automatically
automaticaly||automatically
automaticly||automatically
automatize||automate
@@ -185,9 +230,11 @@ autonymous||autonomous
auxillary||auxiliary
auxilliary||auxiliary
avaiable||available
+avaialable||available
avaible||available
availabe||available
availabled||available
+availablility||availability
availablity||availability
availaible||available
availale||available
@@ -205,6 +252,7 @@ bahavior||behavior
bakup||backup
baloon||balloon
baloons||balloons
+bandwitdh||bandwidth
bandwith||bandwidth
banlance||balance
batery||battery
@@ -215,17 +263,24 @@ becuase||because
beeing||being
befor||before
begining||beginning
+beginnning||beginning
+beiginning||beginning
beter||better
betweeen||between
bianries||binaries
+bidirectionnal||bidirectional
bitmast||bitmask
boardcast||broadcast
+bondary||boundary
+boook||book
borad||board
boundry||boundary
brievely||briefly
broadcase||broadcast
broadcat||broadcast
bufufer||buffer
+byteen||between
+cachces||caches
cacluated||calculated
caculate||calculate
caculation||calculation
@@ -238,20 +293,27 @@ calucate||calculate
calulate||calculate
cancelation||cancellation
cancle||cancel
+capabale||capable
+capabiliities||capabilities
capabilites||capabilities
+capabilitie||capabilities
+capabilitites||capabilities
+capabilitiy||capability
+capabilties||capabilities
capabilty||capability
capabitilies||capabilities
capablity||capability
-capatibilities||capabilities
capapbilities||capabilities
+capatibilities||capabilities
caputure||capture
carefuly||carefully
cariage||carriage
catagory||category
+cconcentrator||concentrator
cehck||check
+chache||cache
challange||challenge
challanges||challenges
-chache||cache
chanell||channel
changable||changeable
chanined||chained
@@ -259,6 +321,7 @@ channle||channel
channnel||channel
charachter||character
charachters||characters
+characteristcs||characteristics
charactor||character
charater||character
charaters||characters
@@ -268,6 +331,7 @@ chck||check
checksumed||checksummed
checksuming||checksumming
childern||children
+child'ren||children
childs||children
chiled||child
chked||checked
@@ -282,138 +346,212 @@ clared||cleared
closeing||closing
clustred||clustered
coexistance||coexistence
+cofigured||configured
colescing||coalescing
collapsable||collapsible
+collpased||collapsed
colorfull||colorful
+comamnd||command
comand||command
+combatibility||compatibility
comit||commit
commerical||commercial
comming||coming
+comminicate||communicate
comminucation||communication
commited||committed
commiting||committing
committ||commit
commoditiy||commodity
-comsume||consume
-comsumer||consumer
-comsuming||consuming
+communation||communication
+communicaion||communication
compability||compatibility
compaibility||compatibility
+comparation||comparison
comparsion||comparison
compatability||compatibility
compatable||compatible
+compatbility||compatibility
+compatibile||compatible
compatibiliy||compatibility
compatibilty||compatibility
+compatibl||compatible
compatiblity||compatibility
+compatile||compatible
competion||completion
compilant||compliant
compleatly||completely
+completey||completely
+completiom||completion
+completited||completed
completition||completion
completly||completely
complient||compliant
-componnents||components
compoment||component
+componnents||components
+componse||compose
comppatible||compatible
compres||compress
compresion||compression
comression||compression
+comsume||consume
+comsumer||consumer
+comsuming||consuming
comunication||communication
conbination||combination
+conditinal||conditional
conditionaly||conditionally
conected||connected
conector||connector
-connecetd||connected
+conerter||converter
+configuared||configured
configuartion||configuration
configuation||configuration
+configued||configured
+configuraions||configurations
configuratoin||configuration
configuraton||configuration
+configuredi||configured
configuretion||configuration
+configurtion||configuration
configutation||configuration
+congifured||configured
conider||consider
conjuction||conjunction
+connecetd||connected
+connecteded||connected
connectinos||connections
connnection||connection
connnections||connections
+consecutivity||consecutively
+consel||console
consistancy||consistency
consistant||consistent
+consitional||conditional
+constitue||constitute
containes||contains
containts||contains
contaisn||contains
contant||contact
contence||contents
+contigious||contiguous
continious||continuous
continous||continuous
continously||continuously
continueing||continuing
-contraints||constraints
-contruct||construct
+continuosuly||continuously
+continuying||continuing
contol||control
contoller||controller
+contraints||constraints
controled||controlled
controler||controller
+controlers||controllers
controll||control
+controlller||controller
+controllor||controller
+contruct||construct
contruction||construction
contry||country
conuntry||country
+converstion||conversation
convertion||conversion
convertor||converter
+conveter||converter
convienient||convenient
convinient||convenient
+convnetional||conventional
+convrate||concrete
+convrates||concretes
+coommon||common
+coonnected||connected
+coordiante||coordinate
corected||corrected
+correcly||correctly
correponding||corresponding
correponds||corresponds
+corresonding||corresponding
correspoding||corresponding
cotrol||control
cound||could
couter||counter
coutner||counter
+coutry||country
+creater||creator
+criticial||critical
cryptocraphic||cryptographic
cunter||counter
+curcuit||circuit
+cureent||current
curently||currently
cylic||cyclic
+daemonise||daemonize
dafault||default
deafult||default
deamon||daemon
+debufs||debugfs
+decapsulates||encapsulates
+decendents||descendants
decompres||decompress
-decsribed||described
+decomression||decompression
+decrementation||defragmentation
+decremeter||decrementer
+decribed||described
decription||description
+decriptor||descriptor
+decsribed||described
dectected||detected
defailt||default
+defaiult||default
deferal||deferral
deffered||deferred
defferred||deferred
definate||definite
definately||definitely
+defin||define
+definetly||definitely
+definision||definition
+defintiion||definition
defintion||definition
defintions||definitions
+defragmentaion||defragmentation
defualt||default
defult||default
-deintializing||deinitializing
-deintialize||deinitialize
deintialized||deinitialized
+deintialize||deinitialize
+deintializing||deinitializing
deivce||device
delared||declared
delare||declare
delares||declares
delaring||declaring
delemiter||delimiter
-demodualtor||demodulator
+delievers||delivers
demension||dimension
+demodualtor||demodulator
+depdendency||dependency
dependancies||dependencies
dependancy||dependency
dependant||dependent
+depenedent||dependent
+depenencies||dependencies
depreacted||deprecated
depreacte||deprecate
desactivate||deactivate
+descibe||describe
desciptor||descriptor
desciptors||descriptors
+descpriptor||descriptor
+descripition||description
descripton||description
descrition||description
descritptor||descriptor
desctiptor||descriptor
+desination||destination
+desribe||describe
desriptor||descriptor
desriptors||descriptors
-desination||destination
destionation||destination
destoried||destroyed
destory||destroy
@@ -422,6 +560,8 @@ destorys||destroys
destroied||destroyed
detabase||database
deteced||detected
+detemine||determine
+detroyed||destroyed
develope||develop
developement||development
developped||developed
@@ -435,79 +575,102 @@ diable||disable
dictionnary||dictionary
didnt||didn't
diferent||different
+differenciate||differentiate
differrence||difference
diffrent||different
-differenciate||differentiate
diffrentiate||differentiate
difinition||definition
+digigram||decigram
dimention||dimension
dimesions||dimensions
-dispalying||displaying
diplay||display
+direcly||directly
directon||direction
direectly||directly
diregard||disregard
-disassocation||disassociation
disapear||disappear
disapeared||disappeared
disappared||disappeared
-disbale||disable
+disassocation||disassociation
disbaled||disabled
-disble||disable
+disbale||disable
disbled||disabled
+disble||disable
disconnet||disconnect
discontinous||discontinuous
+discpline||discipline
+discription||description
+discsignr||discoing
disharge||discharge
+disinguish||distinguish
disnabled||disabled
+dispalying||displaying
dispertion||dispersion
dissapears||disappears
distiction||distinction
+distinquish||distinguish
+distributon||distribution
divisable||divisible
divsiors||divisors
docuentation||documentation
documantation||documentation
+documenation||documentation
documentaion||documentation
documment||document
+documnetation||documentation
doesnt||doesn't
+dont't||don't
dorp||drop
dosen||doesn
downlad||download
downlads||downloads
droped||dropped
druing||during
+dyanamically||dynamically
dynmaic||dynamic
eanable||enable
+eariler||earlier
easilly||easily
ecspecially||especially
edditable||editable
editting||editing
efective||effective
+effectiv||effective
efficently||efficiently
ehther||ether
eigth||eight
+ejectcd||ejected
elementry||elementary
eletronic||electronic
embeded||embedded
+emergerncy||emergency
+emualted||emulated
+emultor||emulator
enabledi||enabled
enble||enable
enchanced||enhanced
+encomapssing||encompassing
encorporating||incorporating
encrupted||encrypted
encrypiton||encryption
encryptio||encryption
endianess||endianness
enhaced||enhanced
+enlightment||enlightenment
enlightnment||enlightenment
+enocded||encoded
enqueing||enqueuing
+enterily||entirely
+entimate||estimate
entires||entries
entites||entities
+entrancy||entrance
entrys||entries
-enocded||encoded
-enterily||entirely
enviroiment||environment
enviroment||environment
environement||environment
environent||environment
+epected||expected
eqivalent||equivalent
equiped||equipped
equivelant||equivalent
@@ -515,63 +678,78 @@ equivilant||equivalent
eror||error
errorr||error
estbalishment||establishment
+ethernel||ethernet
etsablishment||establishment
etsbalishment||establishment
+evntse||vents
+exacltly||exactly
excecutable||executable
exceded||exceeded
excellant||excellent
+excluevt||exclave
execeeded||exceeded
execeeds||exceeds
exeed||exceed
+exiample||example
existance||existence
existant||existent
exixt||exist
exlcude||exclude
exlcusive||exclusive
exmaple||example
+exmaples||examples
expecially||especially
experies||expires
+explainig||explaining
explicite||explicit
explicitely||explicitly
-explict||explicit
+explicity||explicitly
explictely||explicitly
+explict||explicit
explictly||explicitly
expresion||expression
exprimental||experimental
extened||extended
extensability||extensibility
-extention||extension
extenstion||extension
+extention||extension
extracter||extractor
faield||failed
-falied||failed
-faild||failed
failded||failed
+faild||failed
failer||failure
-faill||fail
failied||failed
+faill||fail
faillure||failure
+failng||failing
failue||failure
failuer||failure
-failng||failing
faireness||fairness
falied||failed
faliure||failure
fallbck||fallback
familar||familiar
+familiy||family
fatser||faster
+faulure||failure
+favorit||favorite
+featues||features
feauture||feature
feautures||features
+ferequencies||frequencies
+feroceon||ferrocene
fetaure||feature
fetaures||features
fileystem||filesystem
fimware||firmware
-firmare||firmware
-firware||firmware
finanize||finalize
findn||find
finilizes||finalizes
finsih||finish
+firmare||firmware
+firware||firmware
+firwmware||firmware
+flexsible||flexible
flusing||flushing
folloing||following
followign||following
@@ -582,19 +760,27 @@ forseeable||foreseeable
forse||force
fortan||fortran
forwardig||forwarding
+fotmat||format
frambuffer||framebuffer
+framewor||framework
framming||framing
framwork||framework
frequncy||frequency
frome||from
+frramework||framework
fucntion||function
fuction||function
fuctions||functions
funcation||function
+funcional||functional
funcion||function
functionallity||functionality
+functionalty||functionality
functionaly||functionally
+functionlity||functionality
functionnality||functionality
+functionnals||functional
+functiuon||function
functonality||functionality
funtion||function
funtions||functions
@@ -603,104 +789,143 @@ futhermore||furthermore
futrue||future
gauage||gauge
gaurenteed||guaranteed
-generiously||generously
genereate||generate
genereted||generated
+genericization||generalization
+generiously||generously
genric||generic
globel||global
grabing||grabbing
grahical||graphical
grahpical||graphical
+grammathical||grammatical
grapic||graphic
grranted||granted
guage||gauge
+guaratees||guarantees
guarenteed||guaranteed
guarentee||guarantee
+guarentees||guarantees
+gurantee||guarantee
halfs||halves
hander||handler
handfull||handful
hanlde||handle
hanled||handled
happend||happened
+hardocde||hardcode
harware||hardware
heirarchically||hierarchically
helpfull||helpful
-hybernate||hibernate
+hertergenous||heterogeneous
+hieararchy||hierarchy
+hiearchy||hierarchy
hierachy||hierarchy
hierarchie||hierarchy
homogenous||homogeneous
howver||however
hsould||should
+hybernate||hibernate
hypervior||hypervisor
hypter||hyper
+idential||identical
identidier||identifier
+identificator||identification
+iinclude||include
iligal||illegal
-illigal||illegal
illgal||illegal
-iomaped||iomapped
+illigal||illegal
imblance||imbalance
+imeediate||immediate
+imlemented||implemented
immeadiately||immediately
immedaite||immediate
immediatelly||immediately
immediatly||immediately
immidiate||immediate
+impatients||impatient
impelentation||implementation
impementated||implemented
implemantation||implementation
implemenation||implementation
implementaiton||implementation
implementated||implemented
-implemention||implementation
implementd||implemented
+implemention||implementation
implemetation||implementation
implemntation||implementation
implentation||implementation
implmentation||implementation
implmenting||implementing
+inavlid||invalid
incative||inactive
+incluing||including
incomming||incoming
incompatabilities||incompatibilities
incompatable||incompatible
+incompleted||incomplete
inconsistant||inconsistent
+incoporate||incorporate
increas||increase
+incrementes||increments
incremeted||incremented
+incresed||increased
incrment||increment
+indecated||indicated
indendation||indentation
indended||intended
+indentifier||identifier
+indentifies||identifies
+indentify||identify
independant||independent
independantly||independently
independed||independent
+independend||independent
indiate||indicate
indicat||indicate
+indicies||indices
+indix||index
+indle||idle
+inefficint||inefficient
inexpect||inexpected
inferface||interface
infomation||information
+informaion||information
informatiom||information
informations||information
informtion||information
infromation||information
ingore||ignore
+inialized||initialized
inital||initial
-initalized||initialized
initalised||initialized
initalise||initialize
+initalized||initialized
initalize||initialize
initation||initiation
initators||initiators
+initerface||interface
initialiazation||initialization
+initializatiion||initialization
initializiation||initialization
-initialze||initialize
initialzed||initialized
+initialze||initialize
initialzing||initializing
initilization||initialization
initilize||initialize
+inititalizing||initializing
+initted||initialized
+inmediately||immediately
inofficial||unofficial
inrerface||interface
insititute||institute
instace||instance
instal||install
-instanciate||instantiate
instanciated||instantiated
+instanciate||instantiate
+instantanous||instantaneous
+instituto||institute
+instuction||instruction
insufficent||insufficient
inteface||interface
integreated||integrated
@@ -710,14 +935,14 @@ intendet||intended
intented||intended
interanl||internal
interchangable||interchangeable
+intercs||inters
interferring||interfering
interger||integer
intermittant||intermittent
+internallly||internally
internel||internal
interoprability||interoperability
-interuupt||interrupt
-interupt||interrupt
-interupts||interrupts
+interprete||interpreted
interrface||interface
interrrupt||interrupt
interrup||interrupt
@@ -725,6 +950,8 @@ interrups||interrupts
interruptted||interrupted
interupted||interrupted
interupt||interrupt
+interupts||interrupts
+interuupt||interrupt
intial||initial
intialisation||initialisation
intialised||initialised
@@ -732,23 +959,26 @@ intialise||initialise
intialization||initialization
intialized||initialized
intialize||initialize
+intializes||initializes
intregral||integral
intrerrupt||interrupt
intrrupt||interrupt
+intruction||instruction
+intstruments||instruments
intterrupt||interrupt
intuative||intuitive
-inavlid||invalid
invaid||invalid
invaild||invalid
invailid||invalid
-invald||invalid
invalde||invalid
+invald||invalid
invalide||invalid
invalidiate||invalidate
invalud||invalid
invididual||individual
invokation||invocation
invokations||invocations
+iomaped||iomapped
ireelevant||irrelevant
irrelevent||irrelevant
isnt||isn't
@@ -757,16 +987,20 @@ iternations||iterations
itertation||iteration
itslef||itself
jave||java
+jection||ejection
jeffies||jiffies
+journalid||journaled
juse||just
jus||just
kown||known
+kthreadd's||thread's
langage||language
langauage||language
langauge||language
langugage||language
lauch||launch
layed||laid
+leftoever||leftover
legnth||length
leightweight||lightweight
lengh||length
@@ -778,6 +1012,7 @@ libary||library
librairies||libraries
libraris||libraries
licenceing||licencing
+limitiaion||limitation
loggging||logging
loggin||login
logile||logfile
@@ -786,23 +1021,29 @@ loosing||losing
losted||lost
machinary||machinery
maibox||mailbox
+mailformed||malformed
maintainance||maintenance
maintainence||maintenance
+maintanence||maintenance
maintan||maintain
makeing||making
-mailformed||malformed
malplaced||misplaced
malplace||misplace
managable||manageable
+managament||management
+managememt||management
managment||management
mangement||management
manoeuvering||maneuvering
+manufactor||manufacturer
manufaucturing||manufacturing
+mappigns||mappings
mappping||mapping
matchs||matches
mathimatical||mathematical
mathimatic||mathematic
mathimatics||mathematics
+maximim||maximum
maximium||maximum
maxium||maximum
mechamism||mechanism
@@ -810,45 +1051,60 @@ meetign||meeting
memeory||memory
memmber||member
memoery||memory
+memomry||memory
ment||meant
+menual||manual
mergable||mergeable
mesage||message
+messagges||messages
messags||messages
+messgae||message
messgaes||messages
messsage||message
messsages||messages
micropone||microphone
microprocesspr||microprocessor
+migarion||migration
migrateable||migratable
+migratecd||migrated
+miliseconds||milliseconds
+millenium||millennium
milliseonds||milliseconds
-minium||minimum
minimam||minimum
+minimim||minimum
+minium||minimum
miniumum||minimum
minumum||minimum
misalinged||misaligned
miscelleneous||miscellaneous
misformed||malformed
-mispelled||misspelled
-mispelt||misspelt
mising||missing
mismactch||mismatch
+mispelled||misspelled
+mispelt||misspelt
missmanaged||mismanaged
missmatch||mismatch
+misterious||mysterious
miximum||maximum
mmnemonic||mnemonic
mnay||many
modfiy||modify
modulues||modules
+modyfying||modifying
momery||memory
-memomry||memory
monochorome||monochrome
monochromo||monochrome
monocrome||monochrome
mopdule||module
+mordern||modern
mroe||more
+muliptle||multiple
mulitplied||multiplied
multidimensionnal||multidimensional
+multipe||multiple
+multipy||multiply
multple||multiple
+muma||numa
mumber||number
muticast||multicast
mutilcast||multicast
@@ -860,19 +1116,27 @@ nead||need
neccecary||necessary
neccesary||necessary
neccessary||necessary
+nececssary||necessary
necesary||necessary
neded||needed
negaive||negative
+negitive||negative
+neglibigle||negligible
+negligeable||negligible
negoitation||negotiation
negotation||negotiation
+negtive||negative
nerver||never
nescessary||necessary
nessessary||necessary
noticable||noticeable
notications||notifications
notifed||notified
+notifer||notifier
+notifys||notifies
numebr||number
numner||number
+numnodes||nimrods
obtaion||obtain
obusing||abusing
occassionally||occasionally
@@ -881,12 +1145,14 @@ occurance||occurrence
occurances||occurrences
occured||occurred
occurence||occurrence
+occurences||occurrences
occure||occurred
-occured||occurred
+occures||occurs
occuring||occurring
-offser||offset
offet||offset
offloded||offloaded
+offseet||offset
+offser||offset
omited||omitted
omiting||omitting
omitt||omit
@@ -894,26 +1160,35 @@ ommiting||omitting
ommitted||omitted
onself||oneself
ony||only
+openattr||operator
operatione||operation
opertaions||operations
+optimzation||optimization
optionnal||optional
optmizations||optimizations
+orginally||originally
orientatied||orientated
orientied||oriented
-orignal||original
originial||original
+orignal||original
otherise||otherwise
+othrewise||otherwise
ouput||output
oustanding||outstanding
overaall||overall
+overcommited||overcommitted
overhread||overhead
-overlaping||overlapping
+overidden||overridden
overide||override
+overlaping||overlapping
+overriddenp||overridden
overrided||overridden
overriden||overridden
overun||overrun
-overwritting||overwriting
overwriten||overwritten
+overwritting||overwriting
+ovewrite||overwrite
+ovrie||ovaries
pacakge||package
pachage||package
packacge||package
@@ -922,11 +1197,12 @@ packge||package
packtes||packets
pakage||package
paket||packet
+palaeontologic||paleontological
pallette||palette
paln||plan
paramameters||parameters
-paramaters||parameters
paramater||parameter
+paramaters||parameters
parametes||parameters
parametised||parametrised
paramter||parameter
@@ -936,6 +1212,7 @@ particuarly||particularly
particularily||particularly
partion||partition
partions||partitions
+partititon||partition
partiton||partition
pased||passed
passin||passing
@@ -947,12 +1224,19 @@ peice||piece
pendantic||pedantic
peprocessor||preprocessor
perfoming||performing
+periperal||peripheral
peripherial||peripheral
+permision||permission
permissons||permissions
peroid||period
persistance||persistence
persistant||persistent
+pessemistic||pessimistic
phoneticly||phonetically
+phyical||physical
+physcial||physical
+physial||physical
+pipleline||pipeline
plalform||platform
platfoem||platform
platfrom||platform
@@ -961,11 +1245,17 @@ pleaes||please
ploting||plotting
plugable||pluggable
poinnter||pointer
+pointee||pointer
pointeur||pointer
poiter||pointer
+polluate||pollute
+poroperties||properties
posible||possible
+posioned||poisoned
positon||position
+positve||positive
possibilites||possibilities
+posssible||possible
powerfull||powerful
pramater||parameter
preamle||preamble
@@ -976,26 +1266,36 @@ preceeding||preceding
preceed||precede
precendence||precedence
precission||precision
+predictible||predictable
preemptable||preemptible
prefered||preferred
+prefetcht||prefetch
prefferably||preferably
premption||preemption
prepaired||prepared
+prepapre||prepare
preperation||preparation
+presentes||presents
pressre||pressure
+pretetch||prefetch
+previosuly||previously
+prileged||privileged
primative||primitive
princliple||principle
+princples||principles
priorty||priority
+privatep||private
privilaged||privileged
privilage||privilege
priviledge||privilege
priviledges||privileges
+privilged||privileged
probaly||probably
procceed||proceed
proccesors||processors
procesed||processed
-proces||process
procesing||processing
+proces||process
processessing||processing
processess||processes
processpr||processor
@@ -1006,30 +1306,39 @@ prodecure||procedure
progams||programs
progess||progress
programers||programmers
+programmble||programmable
programm||program
programms||programs
progresss||progress
+promiscouos||promiscuous
promiscous||promiscuous
+promixity||proximity
promps||prompts
+promsicuous||promiscuous
pronnounced||pronounced
prononciation||pronunciation
pronouce||pronounce
pronunce||pronounce
+properies||properties
+properites||properties
+properities||properties
propery||property
propigate||propagate
propigation||propagation
propogate||propagate
+proptery||property
prosess||process
protable||portable
protcol||protocol
protecion||protection
protocoll||protocol
-promixity||proximity
psudo||pseudo
psuedo||pseudo
psychadelic||psychedelic
+publishhed||published
pwoer||power
queing||queuing
+quently||queenly
quering||querying
randomally||randomly
raoming||roaming
@@ -1045,6 +1354,7 @@ recieves||receives
recogniced||recognised
recognizeable||recognizable
recommanded||recommended
+recurive||recursive
recyle||recycle
redircet||redirect
redirectrion||redirection
@@ -1054,23 +1364,33 @@ refcounf||refcount
refence||reference
refered||referred
referenace||reference
+refererence||reference
refering||referring
refernces||references
refernnce||reference
refrence||reference
registed||registered
-registerd||registered
registeration||registration
+registerd||registered
+registe||register
registeresd||registered
registerred||registered
registes||registers
registraration||registration
regsiter||register
regster||register
+regsters||registers
+regstiers||registers
regualar||regular
+regualtor||regulator
+regualtors||regulators
reguator||regulator
+regulaltor||regulator
regulamentations||regulations
+regularily||regularly
reigstration||registration
+reinititalise||reinitialize
+relatime||realtime
releated||related
relevent||relevant
reloade||reload
@@ -1086,60 +1406,78 @@ representaion||representation
reqeust||request
requestied||requested
requiere||require
+requiremenst||requirements
requirment||requirement
requred||required
requried||required
+requsted||requested
requst||request
reregisteration||reregistration
+reselient||resilient
+reservd||reserved
reseting||resetting
reseved||reserved
reseverd||reserved
resizeable||resizable
+resoltuion||resolution
resouce||resource
resouces||resources
resoures||resources
+resovle||resolve
responce||response
resrouce||resource
ressizes||resizes
ressource||resource
ressources||resources
restesting||retesting
+restrct||restrict
resumbmitting||resubmitting
+resvered||reserved
retransmited||retransmitted
retreived||retrieved
retreive||retrieve
retreiving||retrieving
retrive||retrieve
+retuerned||returned
retuned||returned
reudce||reduce
reuest||request
reuqest||request
reutnred||returned
revsion||revision
+rgister||register
rmeoved||removed
rmeove||remove
rmeoves||removes
+robustneess||robustness
rountine||routine
routins||routines
rquest||request
runing||running
runned||ran
+runnig||running
runnning||running
runtine||runtime
sacrifying||sacrificing
safly||safely
safty||safety
+sasitfy||satisfy
savable||saveable
+scalabilty||scalability
scaleing||scaling
scaned||scanned
scaning||scanning
scarch||search
+scehduled||scheduled
+schd||scud
+schduler||scheduler
seach||search
searchs||searches
secquence||sequence
secund||second
segement||segment
semaphone||semaphore
+sempahore||semaphore
senario||scenario
senarios||scenarios
sentivite||sensitive
@@ -1152,15 +1490,21 @@ seperate||separate
seperatly||separately
seperator||separator
sepperate||separate
-seqeunce||sequence
-seqeuncer||sequencer
seqeuencer||sequencer
+seqeuncer||sequencer
+seqeunce||sequence
sequece||sequence
sequencial||sequential
+serivced||serviced
+serliazed||serialized
serveral||several
servive||service
setts||sets
settting||setting
+severly||severely
+shapping||shaping
+sharability||shareability
+shimphy||simply
shotdown||shutdown
shoud||should
shouldnt||shouldn't
@@ -1168,44 +1512,64 @@ shoule||should
shrinked||shrunk
siginificantly||significantly
signabl||signal
+signifcant||significant
similary||similarly
similiar||similar
simlar||similar
simliar||similar
simpified||simplified
+simplely||simply
singaled||signaled
singal||signal
singed||signed
+singnalled||signaled
sleeped||slept
softwares||software
+solidum||solidus
+spcified||specified
speach||speech
specfic||specific
+specfied||specified
specfield||specified
+specfiers||specifiers
+speciall||special
speciefied||specified
+specifcally||specifically
specifc||specific
specifed||specified
+specifer||specifier
specificatin||specification
specificaton||specification
+specifiction||specification
specifing||specifying
specifiying||specifying
+specifiy||specify
+specyfing||specifying
speficied||specified
+speicific||specific
speicify||specify
speling||spelling
+spiint||splint
spinlcok||spinlock
spinock||spinlock
+splited||split
splitted||split
spreaded||spread
+spuriuous||spurious
+spurriously||spuriously
spurrious||spurious
sructure||structure
stablilization||stabilization
staically||statically
staion||station
standardss||standards
+standarized||standardized
standartization||standardization
standart||standard
standy||standby
stardard||standard
staticly||statically
+statistcs||statistics
statuss||status
stoped||stopped
stoping||stopping
@@ -1213,56 +1577,70 @@ stoppped||stopped
straming||streaming
struc||struct
structres||structures
-stuct||struct
strucuture||structure
+stuct||struct
stucture||structure
sturcture||structure
+subcribed||subscribed
subdirectoires||subdirectories
suble||subtle
-substract||subtract
submition||submission
-suceed||succeed
+submmitted||submitted
+substract||subtract
+subsytem||subsystem
+subystem||subsystem
+succecss||success
succesfully||successfully
succesful||successful
successed||succeeded
successfull||successful
successfuly||successfully
+successully||successfully
+succsful||successful
+suceed||succeed
sucessfully||successfully
+sucessful||successful
sucess||success
superflous||superfluous
superseeded||superseded
suplied||supplied
suported||supported
suport||support
-supportet||supported
suppored||supported
+supportet||supported
supportin||supporting
suppoted||supported
suppported||supported
suppport||support
+supressed||suppressed
supress||suppress
surpressed||suppressed
surpresses||suppresses
susbsystem||subsystem
+suseconds||seconds
+susepnd||suspend
suspeneded||suspended
-suspsend||suspend
suspicously||suspiciously
+suspsend||suspend
swaping||swapping
switchs||switches
-swith||switch
swithable||switchable
-swithc||switch
swithced||switched
swithcing||switching
+swithc||switch
swithed||switched
swithing||switching
+swith||switch
swtich||switch
+sychronize||synchronize
symetric||symmetric
synax||syntax
+synchonization||synchronization
synchonized||synchronized
+synchornization||synchronization
synchronuously||synchronously
-syncronize||synchronize
syncronized||synchronized
+syncronize||synchronize
syncronizing||synchronizing
syncronus||synchronous
syste||system
@@ -1273,23 +1651,30 @@ tansmit||transmit
targetted||targeted
targetting||targeting
taskelt||tasklet
+technologie||technologies
+technologt||technology
teh||the
temorary||temporary
temproarily||temporarily
+terminage||terminate
+terminted||terminated
thead||thread
+theorical||theoretical
therfore||therefore
+theroretical||theoretical
thier||their
+thourgh||through
threds||threads
threshhold||threshold
thresold||threshold
throught||through
-troughput||throughput
thses||these
-tiggers||triggers
tiggered||triggered
-tipically||typically
+tiggers||triggers
timeing||timing
+timestampns||timestamps
timout||timeout
+tipically||typically
tmis||this
toogle||toggle
torerable||tolerable
@@ -1298,19 +1683,31 @@ tramsmitted||transmitted
tramsmit||transmit
tranasction||transaction
tranfer||transfer
+tranlsate||translate
+transceive||transceiver
transcevier||transceiver
transciever||transceiver
transferd||transferred
transfered||transferred
transfering||transferring
+transferrs||transfers
transision||transition
+transistions||transitions
+translarion||translation
transmittd||transmitted
+transofrmed||transformed
transormed||transformed
+transparantly||transparently
+transportt||transport
+transpport||transport
trasfer||transfer
trasmission||transmission
treshold||threshold
trigerred||triggered
trigerring||triggering
+trivialy||trivially
+troughput||throughput
+trsfr||transfer
trun||turn
tunning||tuning
ture||true
@@ -1318,11 +1715,13 @@ tyep||type
udpate||update
uesd||used
uknown||unknown
-usupported||unsupported
+ulility||utility
+unamed||unnamed
uncommited||uncommitted
unconditionaly||unconditionally
underun||underrun
unecessary||unnecessary
+uneeded||unneeded
unexecpted||unexpected
unexepected||unexpected
unexpcted||unexpected
@@ -1330,37 +1729,40 @@ unexpectd||unexpected
unexpeted||unexpected
unexpexted||unexpected
unfortunatelly||unfortunately
+unhandeable||unhandleable
unifiy||unify
-uniterrupted||uninterrupted
+uninitialzed||uninitialized
unintialized||uninitialized
+uniterrupted||uninterrupted
unitialized||uninitialized
unkmown||unknown
unknonw||unknown
unknow||unknown
unkown||unknown
-unamed||unnamed
-uneeded||unneeded
-unneded||unneeded
+unmached||unmatched
unneccecary||unnecessary
unneccesary||unnecessary
unneccessary||unnecessary
unnecesary||unnecessary
+unneded||unneeded
unneedingly||unnecessarily
unnsupported||unsupported
-unmached||unmatched
unregester||unregister
unresgister||unregister
unrgesiter||unregister
unsinged||unsigned
-unstabel||unstable
unsolicitied||unsolicited
+unstabel||unstable
unsuccessfull||unsuccessful
unsuported||unsupported
+unsychronized||unsynchronized
untill||until
unuseful||useless
unvalid||invalid
+upadted||updated
upate||update
upsupported||unsupported
+uptodate||up-to-date
usefule||useful
usefull||useful
usege||usage
@@ -1383,24 +1785,27 @@ vaule||value
verbse||verbose
verisons||versions
verison||version
+veriton||version
verson||version
vicefersa||vice-versa
virtal||virtual
virtaul||virtual
virtiual||virtual
+virulization||virtualization
visiters||visitors
vitual||virtual
+vmext||vmexit
vunerable||vulnerable
wakeus||wakeups
wathdog||watchdog
wating||waiting
-wiat||wait
wether||whether
whataver||whatever
whcih||which
whenver||whenever
wheter||whether
whe||when
+wiat||wait
wierd||weird
wiil||will
wirte||write
diff --git a/tools/testing/selftests/exec/.gitignore b/tools/testing/selftests/exec/.gitignore
index 64073e050c6a..b02279da6fa1 100644
--- a/tools/testing/selftests/exec/.gitignore
+++ b/tools/testing/selftests/exec/.gitignore
@@ -6,4 +6,5 @@ execveat.moved
execveat.path.ephemeral
execveat.ephemeral
execveat.denatured
-xxxxxxxx*
\ No newline at end of file
+/recursion-depth
+xxxxxxxx*
diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile
index 427c41ba5151..33339e31e365 100644
--- a/tools/testing/selftests/exec/Makefile
+++ b/tools/testing/selftests/exec/Makefile
@@ -1,11 +1,15 @@
# SPDX-License-Identifier: GPL-2.0
CFLAGS = -Wall
+CFLAGS += -Wno-nonnull
+CFLAGS += -D_GNU_SOURCE
TEST_GEN_PROGS := execveat
TEST_GEN_FILES := execveat.symlink execveat.denatured script subdir
# Makefile is a run-time dependency, since it's accessed by the execveat test
TEST_FILES := Makefile
+TEST_GEN_PROGS += recursion-depth
+
EXTRA_CLEAN := $(OUTPUT)/subdir.moved $(OUTPUT)/execveat.moved $(OUTPUT)/xxxxx*
include ../lib.mk
diff --git a/tools/testing/selftests/exec/recursion-depth.c b/tools/testing/selftests/exec/recursion-depth.c
new file mode 100644
index 000000000000..2dbd5bc45b3e
--- /dev/null
+++ b/tools/testing/selftests/exec/recursion-depth.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2019 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+/* Test that pointing #! script interpreter to self doesn't recurse. */
+#include <errno.h>
+#include <sched.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/mount.h>
+#include <unistd.h>
+
+int main(void)
+{
+ if (unshare(CLONE_NEWNS) == -1) {
+ if (errno == ENOSYS || errno == EPERM) {
+ fprintf(stderr, "error: unshare, errno %d\n", errno);
+ return 4;
+ }
+ fprintf(stderr, "error: unshare, errno %d\n", errno);
+ return 1;
+ }
+ if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) == -1) {
+ fprintf(stderr, "error: mount '/', errno %d\n", errno);
+ return 1;
+ }
+ /* Require "exec" filesystem. */
+ if (mount(NULL, "/tmp", "ramfs", 0, NULL) == -1) {
+ fprintf(stderr, "error: mount ramfs, errno %d\n", errno);
+ return 1;
+ }
+
+#define FILENAME "/tmp/1"
+
+ int fd = creat(FILENAME, 0700);
+ if (fd == -1) {
+ fprintf(stderr, "error: creat, errno %d\n", errno);
+ return 1;
+ }
+#define S "#!" FILENAME "\n"
+ if (write(fd, S, strlen(S)) != strlen(S)) {
+ fprintf(stderr, "error: write, errno %d\n", errno);
+ return 1;
+ }
+ close(fd);
+
+ int rv = execve(FILENAME, NULL, NULL);
+ if (rv == -1 && errno == ELOOP) {
+ return 0;
+ }
+ fprintf(stderr, "error: execve, rv %d, errno %d\n", rv, errno);
+ return 1;
+}
diff --git a/tools/testing/selftests/sysctl/sysctl.sh b/tools/testing/selftests/sysctl/sysctl.sh
index 780ce7123374..6a970b127c9b 100755
--- a/tools/testing/selftests/sysctl/sysctl.sh
+++ b/tools/testing/selftests/sysctl/sysctl.sh
@@ -24,19 +24,21 @@ TEST_FILE=$(mktemp)
# This represents
#
-# TEST_ID:TEST_COUNT:ENABLED
+# TEST_ID:TEST_COUNT:ENABLED:TARGET
#
# TEST_ID: is the test id number
# TEST_COUNT: number of times we should run the test
# ENABLED: 1 if enabled, 0 otherwise
+# TARGET: test target file required on the test_sysctl module
#
# Once these are enabled please leave them as-is. Write your own test,
# we have tons of space.
-ALL_TESTS="0001:1:1"
-ALL_TESTS="$ALL_TESTS 0002:1:1"
-ALL_TESTS="$ALL_TESTS 0003:1:1"
-ALL_TESTS="$ALL_TESTS 0004:1:1"
-ALL_TESTS="$ALL_TESTS 0005:3:1"
+ALL_TESTS="0001:1:1:int_0001"
+ALL_TESTS="$ALL_TESTS 0002:1:1:string_0001"
+ALL_TESTS="$ALL_TESTS 0003:1:1:int_0002"
+ALL_TESTS="$ALL_TESTS 0004:1:1:uint_0001"
+ALL_TESTS="$ALL_TESTS 0005:3:1:int_0003"
+ALL_TESTS="$ALL_TESTS 0006:50:1:bitmap_0001"
test_modprobe()
{
@@ -149,6 +151,9 @@ reset_vals()
string_0001)
VAL="(none)"
;;
+ bitmap_0001)
+ VAL=""
+ ;;
*)
;;
esac
@@ -157,8 +162,10 @@ reset_vals()
set_orig()
{
- if [ ! -z $TARGET ]; then
- echo "${ORIG}" > "${TARGET}"
+ if [ ! -z $TARGET ] && [ ! -z $ORIG ]; then
+ if [ -f ${TARGET} ]; then
+ echo "${ORIG}" > "${TARGET}"
+ fi
fi
}
@@ -177,9 +184,25 @@ verify()
return 0
}
+# proc files get read a page at a time, which can confuse diff,
+# and get you incorrect results on proc files with long data. To use
+# diff against them you must first extract the output to a file, and
+# then compare against that file.
+verify_diff_proc_file()
+{
+ TMP_DUMP_FILE=$(mktemp)
+ cat $1 > $TMP_DUMP_FILE
+
+ if ! diff -w -q $TMP_DUMP_FILE $2; then
+ return 1
+ else
+ return 0
+ fi
+}
+
verify_diff_w()
{
- echo "$TEST_STR" | diff -q -w -u - $1
+ echo "$TEST_STR" | diff -q -w -u - $1 > /dev/null
return $?
}
@@ -600,9 +623,70 @@ run_stringtests()
test_rc
}
+target_exists()
+{
+ TARGET="${SYSCTL}/$1"
+ TEST_ID="$2"
+
+ if [ ! -f ${TARGET} ] ; then
+ echo "Target for test $TEST_ID: $TARGET not exist, skipping test ..."
+ return 0
+ fi
+ return 1
+}
+
+run_bitmaptest() {
+ # Total length of bitmaps string to use, a bit under
+ # the maximum input size of the test node
+ LENGTH=$((RANDOM % 65000))
+
+ # First bit to set
+ BIT=$((RANDOM % 1024))
+
+ # String containing our list of bits to set
+ TEST_STR=$BIT
+
+ # build up the string
+ while [ "${#TEST_STR}" -le "$LENGTH" ]; do
+ # Make sure next entry is discontiguous,
+ # skip ahead at least 2
+ BIT=$((BIT + $((2 + RANDOM % 10))))
+
+ # Add new bit to the list
+ TEST_STR="${TEST_STR},${BIT}"
+
+ # Randomly make it a range
+ if [ "$((RANDOM % 2))" -eq "1" ]; then
+ RANGE_END=$((BIT + $((1 + RANDOM % 10))))
+ TEST_STR="${TEST_STR}-${RANGE_END}"
+ BIT=$RANGE_END
+ fi
+ done
+
+ echo -n "Checking bitmap handler... "
+ TEST_FILE=$(mktemp)
+ echo -n "$TEST_STR" > $TEST_FILE
+
+ cat $TEST_FILE > $TARGET 2> /dev/null
+ if [ $? -ne 0 ]; then
+ echo "FAIL" >&2
+ rc=1
+ test_rc
+ fi
+
+ if ! verify_diff_proc_file "$TARGET" "$TEST_FILE"; then
+ echo "FAIL" >&2
+ rc=1
+ else
+ echo "ok"
+ rc=0
+ fi
+ test_rc
+}
+
sysctl_test_0001()
{
- TARGET="${SYSCTL}/int_0001"
+ TARGET="${SYSCTL}/$(get_test_target 0001)"
reset_vals
ORIG=$(cat "${TARGET}")
TEST_STR=$(( $ORIG + 1 ))
@@ -614,7 +698,7 @@ sysctl_test_0001()
sysctl_test_0002()
{
- TARGET="${SYSCTL}/string_0001"
+ TARGET="${SYSCTL}/$(get_test_target 0002)"
reset_vals
ORIG=$(cat "${TARGET}")
TEST_STR="Testing sysctl"
@@ -627,7 +711,7 @@ sysctl_test_0002()
sysctl_test_0003()
{
- TARGET="${SYSCTL}/int_0002"
+ TARGET="${SYSCTL}/$(get_test_target 0003)"
reset_vals
ORIG=$(cat "${TARGET}")
TEST_STR=$(( $ORIG + 1 ))
@@ -640,7 +724,7 @@ sysctl_test_0003()
sysctl_test_0004()
{
- TARGET="${SYSCTL}/uint_0001"
+ TARGET="${SYSCTL}/$(get_test_target 0004)"
reset_vals
ORIG=$(cat "${TARGET}")
TEST_STR=$(( $ORIG + 1 ))
@@ -653,13 +737,21 @@ sysctl_test_0004()
sysctl_test_0005()
{
- TARGET="${SYSCTL}/int_0003"
+ TARGET="${SYSCTL}/$(get_test_target 0005)"
reset_vals
ORIG=$(cat "${TARGET}")
run_limit_digit_int_array
}
+sysctl_test_0006()
+{
+ TARGET="${SYSCTL}/bitmap_0001"
+ reset_vals
+ ORIG=""
+ run_bitmaptest
+}
+
list_tests()
{
echo "Test ID list:"
@@ -673,10 +765,9 @@ list_tests()
echo "0003 x $(get_test_count 0003) - tests proc_dointvec()"
echo "0004 x $(get_test_count 0004) - tests proc_douintvec()"
echo "0005 x $(get_test_count 0005) - tests proc_douintvec() array"
+ echo "0006 x $(get_test_count 0006) - tests proc_do_large_bitmap()"
}
-test_reqs
-
usage()
{
NUM_TESTS=$(grep -o ' ' <<<"$ALL_TESTS" | grep -c .)
@@ -724,25 +815,35 @@ function get_test_count()
{
test_num $1
TEST_DATA=$(echo $ALL_TESTS | awk '{print $'$1'}')
- LAST_TWO=${TEST_DATA#*:*}
- echo ${LAST_TWO%:*}
+ echo ${TEST_DATA} | awk -F":" '{print $2}'
}
function get_test_enabled()
{
test_num $1
TEST_DATA=$(echo $ALL_TESTS | awk '{print $'$1'}')
- echo ${TEST_DATA#*:*:}
+ echo ${TEST_DATA} | awk -F":" '{print $3}'
+}
+
+function get_test_target()
+{
+ test_num $1
+ TEST_DATA=$(echo $ALL_TESTS | awk '{print $'$1'}')
+ echo ${TEST_DATA} | awk -F":" '{print $4}'
}
function run_all_tests()
{
for i in $ALL_TESTS ; do
- TEST_ID=${i%:*:*}
+ TEST_ID=${i%:*:*:*}
ENABLED=$(get_test_enabled $TEST_ID)
TEST_COUNT=$(get_test_count $TEST_ID)
+ TEST_TARGET=$(get_test_target $TEST_ID)
+ if target_exists $TEST_TARGET $TEST_ID; then
+ continue
+ fi
if [[ $ENABLED -eq "1" ]]; then
- test_case $TEST_ID $TEST_COUNT
+ test_case $TEST_ID $TEST_COUNT $TEST_TARGET
fi
done
}
@@ -775,12 +876,14 @@ function watch_case()
function test_case()
{
- NUM_TESTS=$DEFAULT_NUM_TESTS
- if [ $# -eq 2 ]; then
- NUM_TESTS=$2
- fi
+ NUM_TESTS=$2
i=0
+
+ if target_exists $3 $1; then
+ continue
+ fi
+
while [ $i -lt $NUM_TESTS ]; do
test_num $1
watch_log $i ${TEST_NAME}_test_$1 noclear
@@ -803,15 +906,15 @@ function parse_args()
elif [[ "$1" = "-t" ]]; then
shift
test_num $1
- test_case $1 $(get_test_count $1)
+ test_case $1 $(get_test_count $1) $(get_test_target $1)
elif [[ "$1" = "-c" ]]; then
shift
test_num $1
test_num $2
- test_case $1 $2
+ test_case $1 $2 $(get_test_target $1)
elif [[ "$1" = "-s" ]]; then
shift
- test_case $1 1
+ test_case $1 1 $(get_test_target $1)
elif [[ "$1" = "-l" ]]; then
list_tests
elif [[ "$1" = "-h" || "$1" = "--help" ]]; then
@@ -825,8 +928,8 @@ function parse_args()
test_reqs
allow_user_defaults
check_production_sysctl_writes_strict
-test_modprobe
load_req_mod
+test_modprobe
trap "test_finish" EXIT
diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c
index 73818f1b2ef8..3bf2a1fef8ed 100644
--- a/tools/vm/slabinfo.c
+++ b/tools/vm/slabinfo.c
@@ -29,7 +29,7 @@ struct slabinfo {
char *name;
int alias;
int refs;
- int aliases, align, cache_dma, cpu_slabs, destroy_by_rcu;
+ int aliases, align, cache_dma, cache_dma32, cpu_slabs, destroy_by_rcu;
unsigned int hwcache_align, object_size, objs_per_slab;
unsigned int sanity_checks, slab_size, store_user, trace;
int order, poison, reclaim_account, red_zone;
@@ -534,6 +534,8 @@ static void report(struct slabinfo *s)
printf("** Hardware cacheline aligned\n");
if (s->cache_dma)
printf("** Memory is allocated in a special DMA zone\n");
+ if (s->cache_dma32)
+ printf("** Memory is allocated in a special DMA32 zone\n");
if (s->destroy_by_rcu)
printf("** Slabs are destroyed via RCU\n");
if (s->reclaim_account)
@@ -602,6 +604,8 @@ static void slabcache(struct slabinfo *s)
*p++ = '*';
if (s->cache_dma)
*p++ = 'd';
+ if (s->cache_dma32)
+ *p++ = 'D';
if (s->hwcache_align)
*p++ = 'A';
if (s->poison)
@@ -1208,6 +1212,7 @@ static void read_slab_dir(void)
slab->aliases = get_obj("aliases");
slab->align = get_obj("align");
slab->cache_dma = get_obj("cache_dma");
+ slab->cache_dma32 = get_obj("cache_dma32");
slab->cpu_slabs = get_obj("cpu_slabs");
slab->destroy_by_rcu = get_obj("destroy_by_rcu");
slab->hwcache_align = get_obj("hwcache_align");
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2b18a923ab7a..4bf728e3d857 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -391,7 +391,8 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
spin_unlock(&kvm->mmu_lock);
ret = kvm_arch_mmu_notifier_invalidate_range(kvm, range->start,
- range->end, range->blockable);
+ range->end,
+ mmu_notifier_range_blockable(range));
srcu_read_unlock(&kvm->srcu, idx);