-rw-r--r--Documentation/admin-guide/cgroup-v1/memory.rst19
-rw-r--r--Documentation/admin-guide/cgroup-v2.rst4
-rw-r--r--Documentation/admin-guide/dynamic-debug-howto.rst5
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt40
-rw-r--r--Documentation/admin-guide/mm/hugetlbpage.rst35
-rw-r--r--Documentation/admin-guide/mm/transhuge.rst7
-rw-r--r--Documentation/admin-guide/sysctl/vm.rst23
-rw-r--r--Documentation/core-api/cachetlb.rst2
-rw-r--r--Documentation/core-api/padata.rst41
-rw-r--r--Documentation/dev-tools/kcov.rst17
-rw-r--r--Documentation/features/debug/debug-vm-pgtable/arch-support.txt34
-rw-r--r--Documentation/features/vm/numa-memblock/arch-support.txt34
-rw-r--r--Documentation/filesystems/locking.rst6
-rw-r--r--Documentation/filesystems/vfs.rst15
-rw-r--r--Documentation/lzo.txt8
-rw-r--r--Documentation/vm/memory-model.rst9
-rw-r--r--Documentation/vm/page_owner.rst3
-rw-r--r--Documentation/vm/slub.rst2
-rw-r--r--arch/alpha/mm/init.c16
-rw-r--r--arch/alpha/mm/numa.c22
-rw-r--r--arch/arc/Kconfig1
-rw-r--r--arch/arc/include/asm/highmem.h18
-rw-r--r--arch/arc/include/asm/hugepage.h2
-rw-r--r--arch/arc/mm/highmem.c28
-rw-r--r--arch/arc/mm/init.c41
-rw-r--r--arch/arm/configs/omap2plus_defconfig2
-rw-r--r--arch/arm/include/asm/highmem.h9
-rw-r--r--arch/arm/include/asm/hugetlb.h7
-rw-r--r--arch/arm/include/asm/pgtable-3level.h2
-rw-r--r--arch/arm/mm/highmem.c35
-rw-r--r--arch/arm/mm/init.c66
-rw-r--r--arch/arm64/Kconfig3
-rw-r--r--arch/arm64/Kconfig.debug29
-rw-r--r--arch/arm64/include/asm/hugetlb.h13
-rw-r--r--arch/arm64/include/asm/pgtable.h5
-rw-r--r--arch/arm64/include/asm/vmap_stack.h6
-rw-r--r--arch/arm64/mm/dump.c2
-rw-r--r--arch/arm64/mm/hugetlbpage.c30
-rw-r--r--arch/arm64/mm/init.c56
-rw-r--r--arch/arm64/mm/numa.c9
-rw-r--r--arch/c6x/mm/init.c8
-rw-r--r--arch/csky/include/asm/highmem.h12
-rw-r--r--arch/csky/kernel/setup.c26
-rw-r--r--arch/csky/mm/highmem.c56
-rw-r--r--arch/h8300/mm/init.c6
-rw-r--r--arch/hexagon/mm/init.c6
-rw-r--r--arch/ia64/Kconfig1
-rw-r--r--arch/ia64/include/asm/hugetlb.h5
-rw-r--r--arch/ia64/mm/contig.c2
-rw-r--r--arch/ia64/mm/discontig.c2
-rw-r--r--arch/m68k/mm/init.c6
-rw-r--r--arch/m68k/mm/mcfmmu.c9
-rw-r--r--arch/m68k/mm/motorola.c15
-rw-r--r--arch/m68k/mm/sun3mmu.c10
-rw-r--r--arch/microblaze/Kconfig1
-rw-r--r--arch/microblaze/include/asm/highmem.h27
-rw-r--r--arch/microblaze/mm/highmem.c21
-rw-r--r--arch/microblaze/mm/init.c5
-rw-r--r--arch/mips/Kconfig1
-rw-r--r--arch/mips/include/asm/highmem.h11
-rw-r--r--arch/mips/include/asm/hugetlb.h11
-rw-r--r--arch/mips/include/asm/pgtable.h2
-rw-r--r--arch/mips/loongson64/numa.c2
-rw-r--r--arch/mips/mm/cache.c6
-rw-r--r--arch/mips/mm/highmem.c54
-rw-r--r--arch/mips/mm/init.c2
-rw-r--r--arch/mips/sgi-ip27/ip27-memory.c2
-rw-r--r--arch/nds32/include/asm/highmem.h9
-rw-r--r--arch/nds32/mm/highmem.c41
-rw-r--r--arch/nds32/mm/init.c11
-rw-r--r--arch/nios2/mm/init.c8
-rw-r--r--arch/openrisc/mm/init.c9
-rw-r--r--arch/parisc/include/asm/cacheflush.h30
-rw-r--r--arch/parisc/include/asm/hugetlb.h10
-rw-r--r--arch/parisc/include/asm/pgtable.h2
-rw-r--r--arch/parisc/mm/init.c22
-rw-r--r--arch/powerpc/Kconfig11
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgtable.h4
-rw-r--r--arch/powerpc/include/asm/highmem.h28
-rw-r--r--arch/powerpc/include/asm/hugetlb.h5
-rw-r--r--arch/powerpc/include/asm/io.h10
-rw-r--r--arch/powerpc/include/asm/pci-bridge.h2
-rw-r--r--arch/powerpc/kernel/irq.c5
-rw-r--r--arch/powerpc/kernel/isa-bridge.c28
-rw-r--r--arch/powerpc/kernel/pci_64.c54
-rw-r--r--arch/powerpc/mm/highmem.c26
-rw-r--r--arch/powerpc/mm/hugetlbpage.c30
-rw-r--r--arch/powerpc/mm/ioremap_64.c50
-rw-r--r--arch/powerpc/mm/mem.c5
-rw-r--r--arch/powerpc/platforms/pseries/hotplug-memory.c26
-rw-r--r--arch/riscv/Kconfig2
-rw-r--r--arch/riscv/include/asm/hugetlb.h10
-rw-r--r--arch/riscv/include/asm/pgtable.h4
-rw-r--r--arch/riscv/include/asm/ptdump.h11
-rw-r--r--arch/riscv/mm/hugetlbpage.c24
-rw-r--r--arch/riscv/mm/init.c5
-rw-r--r--arch/s390/Kconfig2
-rw-r--r--arch/s390/include/asm/hugetlb.h8
-rw-r--r--arch/s390/kernel/setup.c9
-rw-r--r--arch/s390/mm/hugetlbpage.c24
-rw-r--r--arch/s390/mm/init.c2
-rw-r--r--arch/sh/Kconfig1
-rw-r--r--arch/sh/include/asm/hugetlb.h7
-rw-r--r--arch/sh/kernel/cpu/sh4/sq.c3
-rw-r--r--arch/sh/mm/init.c2
-rw-r--r--arch/sparc/Kconfig10
-rw-r--r--arch/sparc/include/asm/highmem.h25
-rw-r--r--arch/sparc/include/asm/hugetlb.h10
-rw-r--r--arch/sparc/mm/highmem.c25
-rw-r--r--arch/sparc/mm/init_64.c45
-rw-r--r--arch/sparc/mm/io-unit.c1
-rw-r--r--arch/sparc/mm/iommu.c1
-rw-r--r--arch/sparc/mm/srmmu.c21
-rw-r--r--arch/um/kernel/mem.c12
-rw-r--r--arch/unicore32/include/asm/memory.h2
-rw-r--r--arch/unicore32/include/mach/memory.h6
-rw-r--r--arch/unicore32/kernel/pci.c14
-rw-r--r--arch/unicore32/mm/init.c43
-rw-r--r--arch/x86/Kconfig12
-rw-r--r--arch/x86/Kconfig.debug27
-rw-r--r--arch/x86/hyperv/hv_init.c5
-rw-r--r--arch/x86/include/asm/fixmap.h1
-rw-r--r--arch/x86/include/asm/highmem.h9
-rw-r--r--arch/x86/include/asm/hugetlb.h10
-rw-r--r--arch/x86/include/asm/kvm_host.h3
-rw-r--r--arch/x86/include/asm/pgtable-2level_types.h2
-rw-r--r--arch/x86/include/asm/pgtable-3level_types.h2
-rw-r--r--arch/x86/include/asm/pgtable.h2
-rw-r--r--arch/x86/include/asm/pgtable_64.h6
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h2
-rw-r--r--arch/x86/include/asm/pgtable_types.h8
-rw-r--r--arch/x86/include/asm/switch_to.h23
-rw-r--r--arch/x86/kernel/irq_64.c2
-rw-r--r--arch/x86/kernel/setup_percpu.c6
-rw-r--r--arch/x86/kvm/svm/sev.c3
-rw-r--r--arch/x86/mm/dump_pagetables.c33
-rw-r--r--arch/x86/mm/fault.c176
-rw-r--r--arch/x86/mm/highmem_32.c50
-rw-r--r--arch/x86/mm/hugetlbpage.c23
-rw-r--r--arch/x86/mm/init.c2
-rw-r--r--arch/x86/mm/init_64.c17
-rw-r--r--arch/x86/mm/kmmio.c2
-rw-r--r--arch/x86/mm/numa.c11
-rw-r--r--arch/x86/mm/pti.c8
-rw-r--r--arch/x86/mm/tlb.c37
-rw-r--r--arch/xtensa/include/asm/highmem.h27
-rw-r--r--arch/xtensa/mm/highmem.c22
-rw-r--r--arch/xtensa/mm/init.c8
-rw-r--r--block/blk-core.c1
-rw-r--r--drivers/acpi/apei/ghes.c6
-rw-r--r--drivers/base/memory.c44
-rw-r--r--drivers/block/drbd/drbd_bitmap.c4
-rw-r--r--drivers/block/zram/zcomp.c7
-rw-r--r--drivers/dax/dax-private.h1
-rw-r--r--drivers/dax/device.c1
-rw-r--r--drivers/dax/kmem.c28
-rw-r--r--drivers/gpu/drm/drm_scatter.c11
-rw-r--r--drivers/gpu/drm/etnaviv/etnaviv_dump.c4
-rw-r--r--drivers/gpu/drm/i915/gem/i915_gem_userptr.c22
-rw-r--r--drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c2
-rw-r--r--drivers/gpu/drm/ttm/ttm_bo_util.c56
-rw-r--r--drivers/gpu/drm/vmwgfx/vmwgfx_blit.c17
-rw-r--r--drivers/lightnvm/pblk-init.c5
-rw-r--r--drivers/md/dm-bufio.c4
-rw-r--r--drivers/md/md-bitmap.c12
-rw-r--r--drivers/media/common/videobuf2/videobuf2-dma-sg.c3
-rw-r--r--drivers/media/common/videobuf2/videobuf2-vmalloc.c3
-rw-r--r--drivers/media/pci/ivtv/ivtv-udma.c19
-rw-r--r--drivers/media/pci/ivtv/ivtv-yuv.c17
-rw-r--r--drivers/media/pci/ivtv/ivtvfb.c4
-rw-r--r--drivers/mtd/ubi/io.c4
-rw-r--r--drivers/pcmcia/electra_cf.c45
-rw-r--r--drivers/rapidio/devices/rio_mport_cdev.c27
-rw-r--r--drivers/scsi/sd_zbc.c3
-rw-r--r--drivers/staging/android/ion/ion_heap.c4
-rw-r--r--drivers/staging/media/ipu3/ipu3-css-pool.h4
-rw-r--r--drivers/staging/media/ipu3/ipu3-dmamap.c30
-rw-r--r--drivers/tty/serial/sh-sci.c2
-rw-r--r--drivers/tty/vt/keyboard.c2
-rw-r--r--drivers/usb/core/hcd.c3
-rw-r--r--fs/aio.c9
-rw-r--r--fs/binfmt_elf.c6
-rw-r--r--fs/binfmt_em86.c6
-rw-r--r--fs/binfmt_misc.c4
-rw-r--r--fs/binfmt_script.c6
-rw-r--r--fs/block_dev.c7
-rw-r--r--fs/btrfs/disk-io.c4
-rw-r--r--fs/btrfs/extent_io.c64
-rw-r--r--fs/btrfs/extent_io.h3
-rw-r--r--fs/btrfs/inode.c39
-rw-r--r--fs/buffer.c79
-rw-r--r--fs/cifs/file.c10
-rw-r--r--fs/erofs/data.c39
-rw-r--r--fs/erofs/decompressor.c2
-rw-r--r--fs/erofs/zdata.c29
-rw-r--r--fs/exec.c79
-rw-r--r--fs/exfat/inode.c7
-rw-r--r--fs/ext2/inode.c10
-rw-r--r--fs/ext4/ext4.h5
-rw-r--r--fs/ext4/inode.c21
-rw-r--r--fs/ext4/readpage.c25
-rw-r--r--fs/ext4/verity.c35
-rw-r--r--fs/f2fs/data.c50
-rw-r--r--fs/f2fs/f2fs.h14
-rw-r--r--fs/f2fs/verity.c35
-rw-r--r--fs/fat/fatent.c103
-rw-r--r--fs/fat/inode.c13
-rw-r--r--fs/file_table.c1
-rw-r--r--fs/fuse/dev.c2
-rw-r--r--fs/fuse/file.c100
-rw-r--r--fs/gfs2/aops.c23
-rw-r--r--fs/gfs2/dir.c9
-rw-r--r--fs/gfs2/quota.c2
-rw-r--r--fs/hpfs/file.c7
-rw-r--r--fs/hugetlbfs/inode.c67
-rw-r--r--fs/iomap/buffered-io.c111
-rw-r--r--fs/iomap/trace.h2
-rw-r--r--fs/isofs/inode.c7
-rw-r--r--fs/jfs/inode.c7
-rw-r--r--fs/mpage.c38
-rw-r--r--fs/namei.c5
-rw-r--r--fs/nfs/blocklayout/extent_tree.c2
-rw-r--r--fs/nilfs2/inode.c15
-rw-r--r--fs/ntfs/aops.c2
-rw-r--r--fs/ntfs/malloc.h2
-rw-r--r--fs/ntfs/mft.c2
-rw-r--r--fs/ocfs2/aops.c34
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c1
-rw-r--r--fs/ocfs2/ocfs2.h4
-rw-r--r--fs/ocfs2/slot_map.c46
-rw-r--r--fs/ocfs2/super.c21
-rw-r--r--fs/omfs/file.c7
-rw-r--r--fs/open.c9
-rw-r--r--fs/orangefs/inode.c32
-rw-r--r--fs/proc/array.c8
-rw-r--r--fs/proc/page.c40
-rw-r--r--fs/proc/task_mmu.c16
-rw-r--r--fs/qnx6/inode.c7
-rw-r--r--fs/ramfs/inode.c12
-rw-r--r--fs/reiserfs/inode.c8
-rw-r--r--fs/seq_file.c7
-rw-r--r--fs/squashfs/block.c273
-rw-r--r--fs/squashfs/decompressor.h5
-rw-r--r--fs/squashfs/decompressor_multi.c9
-rw-r--r--fs/squashfs/decompressor_multi_percpu.c17
-rw-r--r--fs/squashfs/decompressor_single.c9
-rw-r--r--fs/squashfs/lz4_wrapper.c17
-rw-r--r--fs/squashfs/lzo_wrapper.c17
-rw-r--r--fs/squashfs/squashfs.h4
-rw-r--r--fs/squashfs/xz_wrapper.c51
-rw-r--r--fs/squashfs/zlib_wrapper.c63
-rw-r--r--fs/squashfs/zstd_wrapper.c64
-rw-r--r--fs/sync.c6
-rw-r--r--fs/ubifs/debug.c2
-rw-r--r--fs/ubifs/lprops.c2
-rw-r--r--fs/ubifs/lpt_commit.c4
-rw-r--r--fs/ubifs/orphan.c2
-rw-r--r--fs/udf/inode.c7
-rw-r--r--fs/xfs/kmem.c2
-rw-r--r--fs/xfs/xfs_aops.c13
-rw-r--r--fs/xfs/xfs_buf.c2
-rw-r--r--fs/zonefs/super.c7
-rw-r--r--include/asm-generic/5level-fixup.h5
-rw-r--r--include/asm-generic/hugetlb.h2
-rw-r--r--include/asm-generic/pgtable.h27
-rw-r--r--include/drm/ttm/ttm_bo_api.h4
-rw-r--r--include/linux/binfmts.h3
-rw-r--r--include/linux/bitops.h7
-rw-r--r--include/linux/buffer_head.h8
-rw-r--r--include/linux/compaction.h9
-rw-r--r--include/linux/dev_printk.h6
-rw-r--r--include/linux/dynamic_debug.h2
-rw-r--r--include/linux/elfnote.h2
-rw-r--r--include/linux/fs.h21
-rw-r--r--include/linux/fsnotify.h4
-rw-r--r--include/linux/gfp.h7
-rw-r--r--include/linux/highmem.h81
-rw-r--r--include/linux/hugetlb.h16
-rw-r--r--include/linux/iomap.h3
-rw-r--r--include/linux/ioport.h1
-rw-r--r--include/linux/ipc_namespace.h12
-rw-r--r--include/linux/memblock.h15
-rw-r--r--include/linux/memcontrol.h66
-rw-r--r--include/linux/memory_hotplug.h9
-rw-r--r--include/linux/memremap.h6
-rw-r--r--include/linux/mm.h133
-rw-r--r--include/linux/mm_types.h6
-rw-r--r--include/linux/mmzone.h60
-rw-r--r--include/linux/mpage.h4
-rw-r--r--include/linux/net.h3
-rw-r--r--include/linux/netdevice.h6
-rw-r--r--include/linux/nmi.h1
-rw-r--r--include/linux/padata.h43
-rw-r--r--include/linux/pagemap.h193
-rw-r--r--include/linux/printk.h9
-rw-r--r--include/linux/ptdump.h3
-rw-r--r--include/linux/sched.h11
-rw-r--r--include/linux/seq_file.h19
-rw-r--r--include/linux/string.h60
-rw-r--r--include/linux/swap.h27
-rw-r--r--include/linux/vm_event_item.h8
-rw-r--r--include/linux/vmalloc.h49
-rw-r--r--include/linux/vmstat.h2
-rw-r--r--include/linux/wait.h4
-rw-r--r--include/linux/zsmalloc.h2
-rw-r--r--include/rdma/ib_verbs.h6
-rw-r--r--include/trace/events/compaction.h22
-rw-r--r--include/trace/events/erofs.h6
-rw-r--r--include/trace/events/f2fs.h6
-rw-r--r--include/trace/events/huge_memory.h3
-rw-r--r--include/trace/events/vmscan.h14
-rw-r--r--init/Kconfig27
-rw-r--r--init/main.c12
-rw-r--r--ipc/ipc_sysctl.c14
-rw-r--r--ipc/msg.c3
-rw-r--r--ipc/namespace.c37
-rw-r--r--ipc/sem.c1
-rw-r--r--ipc/shm.c60
-rw-r--r--ipc/util.c175
-rw-r--r--ipc/util.h4
-rw-r--r--kernel/bpf/core.c6
-rw-r--r--kernel/bpf/syscall.c25
-rw-r--r--kernel/dma/remap.c48
-rw-r--r--kernel/events/uprobes.c10
-rw-r--r--kernel/groups.c2
-rw-r--r--kernel/hung_task.c44
-rw-r--r--kernel/kcov.c266
-rw-r--r--kernel/kexec_file.c5
-rw-r--r--kernel/kprobes.c33
-rw-r--r--kernel/module.c3
-rw-r--r--kernel/notifier.c1
-rw-r--r--kernel/padata.c277
-rw-r--r--kernel/relay.c17
-rw-r--r--kernel/trace/trace.c12
-rw-r--r--kernel/user.c2
-rw-r--r--kernel/watchdog.c22
-rw-r--r--kernel/watchdog_hld.c1
-rw-r--r--lib/Kconfig.debug64
-rw-r--r--lib/Kconfig.twist26
-rw-r--r--lib/Makefile4
-rw-r--r--lib/cpumask.c102
-rw-r--r--lib/dynamic_debug.c9
-rw-r--r--lib/flex_proportions.c7
-rw-r--r--lib/ioremap.c46
-rw-r--r--lib/lzo/lzo1x_compress.c13
-rw-r--r--lib/math/prime_numbers.c10
-rw-r--r--lib/percpu-refcount.c6
-rw-r--r--lib/strncpy_from_user.c1
-rw-r--r--lib/test_bitops.c60
-rw-r--r--lib/test_kasan.c29
-rw-r--r--lib/test_lockup.c2
-rw-r--r--lib/test_vmalloc.c26
-rw-r--r--lib/ubsan.c33
-rw-r--r--lib/zlib_inflate/inffast.c91
-rw-r--r--mm/Kconfig16
-rw-r--r--mm/Kconfig.debug32
-rw-r--r--mm/Makefile1
-rw-r--r--mm/compaction.c89
-rw-r--r--mm/debug.c56
-rw-r--r--mm/debug_vm_pgtable.c382
-rw-r--r--mm/fadvise.c6
-rw-r--r--mm/filemap.c45
-rw-r--r--mm/frontswap.c6
-rw-r--r--mm/gup.c238
-rw-r--r--mm/huge_memory.c272
-rw-r--r--mm/hugetlb.c224
-rw-r--r--mm/internal.h37
-rw-r--r--mm/kasan/common.c19
-rw-r--r--mm/kasan/report.c22
-rw-r--r--mm/khugepaged.c263
-rw-r--r--mm/ksm.c10
-rw-r--r--mm/list_lru.c2
-rw-r--r--mm/memblock.c21
-rw-r--r--mm/memcontrol.c539
-rw-r--r--mm/memory.c57
-rw-r--r--mm/memory_hotplug.c169
-rw-r--r--mm/mempolicy.c5
-rw-r--r--mm/memremap.c20
-rw-r--r--mm/migrate.c52
-rw-r--r--mm/mm_init.c16
-rw-r--r--mm/mmap.c45
-rw-r--r--mm/mremap.c2
-rw-r--r--mm/nommu.c46
-rw-r--r--mm/oom_kill.c4
-rw-r--r--mm/page-writeback.c2
-rw-r--r--mm/page_alloc.c611
-rw-r--r--mm/page_owner.c7
-rw-r--r--mm/percpu.c2
-rw-r--r--mm/pgtable-generic.c2
-rw-r--r--mm/ptdump.c17
-rw-r--r--mm/readahead.c275
-rw-r--r--mm/rmap.c53
-rw-r--r--mm/shmem.c110
-rw-r--r--mm/slab.c4
-rw-r--r--mm/slab_common.c3
-rw-r--r--mm/slub.c73
-rw-r--r--mm/sparse.c2
-rw-r--r--mm/swap.c98
-rw-r--r--mm/swap_cgroup.c6
-rw-r--r--mm/swap_state.c109
-rw-r--r--mm/swapfile.c206
-rw-r--r--mm/userfaultfd.c5
-rw-r--r--mm/util.c20
-rw-r--r--mm/vmalloc.c367
-rw-r--r--mm/vmscan.c284
-rw-r--r--mm/vmstat.c52
-rw-r--r--mm/workingset.c21
-rw-r--r--mm/z3fold.c3
-rw-r--r--mm/zbud.c2
-rw-r--r--mm/zsmalloc.c12
-rw-r--r--net/bridge/netfilter/ebtables.c6
-rw-r--r--net/ceph/ceph_common.c3
-rwxr-xr-xscripts/checkpatch.pl74
-rwxr-xr-xscripts/get_maintainer.pl46
-rw-r--r--security/keys/internal.h11
-rw-r--r--security/keys/keyctl.c16
-rw-r--r--sound/core/memalloc.c2
-rw-r--r--sound/core/pcm_memory.c2
-rw-r--r--tools/testing/selftests/lib/config1
-rw-r--r--tools/testing/selftests/vm/.gitignore2
-rw-r--r--tools/testing/selftests/vm/Makefile74
-rw-r--r--tools/testing/selftests/vm/khugepaged.c1035
-rw-r--r--tools/testing/selftests/vm/mremap_dontunmap.c1
-rw-r--r--tools/testing/selftests/vm/pkey-helpers.h225
-rw-r--r--tools/testing/selftests/vm/pkey-powerpc.h133
-rw-r--r--tools/testing/selftests/vm/pkey-x86.h181
-rw-r--r--tools/testing/selftests/vm/protection_keys.c (renamed from tools/testing/selftests/x86/protection_keys.c)696
-rw-r--r--tools/testing/selftests/x86/.gitignore1
-rw-r--r--tools/testing/selftests/x86/Makefile2
-rw-r--r--tools/testing/selftests/x86/pkey-helpers.h219
-rw-r--r--tools/vm/page_owner_sort.c5
431 files changed, 8461 insertions, 6608 deletions
diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst
index 0ae4f564c2d6..12757e63b26c 100644
--- a/Documentation/admin-guide/cgroup-v1/memory.rst
+++ b/Documentation/admin-guide/cgroup-v1/memory.rst
@@ -199,11 +199,11 @@ An RSS page is unaccounted when it's fully unmapped. A PageCache page is
unaccounted when it's removed from radix-tree. Even if RSS pages are fully
unmapped (by kswapd), they may exist as SwapCache in the system until they
are really freed. Such SwapCaches are also accounted.
-A swapped-in page is not accounted until it's mapped.
+A swapped-in page is accounted after being added to the swapcache.
Note: The kernel does swapin-readahead and reads multiple swaps at once.
-This means swapped-in pages may contain pages for other tasks than a task
-causing page fault. So, we avoid accounting at swap-in I/O.
+Since the page's memcg is recorded into swap regardless of whether memsw is
+enabled, the page will be accounted after swapin.
At page migration, accounting information is kept.
@@ -222,18 +222,13 @@ the cgroup that brought it in -- this will happen on memory pressure).
But see section 8.2: when moving a task to another cgroup, its pages may
be recharged to the new cgroup, if move_charge_at_immigrate has been chosen.
-Exception: If CONFIG_MEMCG_SWAP is not used.
-When you do swapoff and make swapped-out pages of shmem(tmpfs) to
-be backed into memory in force, charges for pages are accounted against the
-caller of swapoff rather than the users of shmem.
-
-2.4 Swap Extension (CONFIG_MEMCG_SWAP)
+2.4 Swap Extension
--------------------------------------
-Swap Extension allows you to record charge for swap. A swapped-in page is
-charged back to original page allocator if possible.
+Swap usage is always recorded for each cgroup. Swap Extension allows you to
+read and limit it.
-When swap is accounted, following files are added.
+When CONFIG_SWAP is enabled, the following files are added.
- memory.memsw.usage_in_bytes.
- memory.memsw.limit_in_bytes.
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index bcc80269bb6a..5f12f203822e 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1329,6 +1329,10 @@ PAGE_SIZE multiple when read back.
workingset_activate
Number of refaulted pages that were immediately activated
+ workingset_restore
+ Number of restored pages which have been detected as part of an
+ active workingset before they were reclaimed.
+
workingset_nodereclaim
Number of times a shadow node has been reclaimed
diff --git a/Documentation/admin-guide/dynamic-debug-howto.rst b/Documentation/admin-guide/dynamic-debug-howto.rst
index 0dc2eb8e44e5..1012bd9305e9 100644
--- a/Documentation/admin-guide/dynamic-debug-howto.rst
+++ b/Documentation/admin-guide/dynamic-debug-howto.rst
@@ -13,6 +13,11 @@ kernel code to obtain additional kernel information. Currently, if
``print_hex_dump_debug()``/``print_hex_dump_bytes()`` calls can be dynamically
enabled per-callsite.
+If you do not want to enable dynamic debug globally (e.g. on an embedded
+system), you may set ``CONFIG_DYNAMIC_DEBUG_CORE`` to get basic dynamic debug
+support and add ``ccflags := -DDYNAMIC_DEBUG_MODULE`` to the Makefile of any
+module that you would like to debug dynamically later.
+
If ``CONFIG_DYNAMIC_DEBUG`` is not set, ``print_hex_dump_debug()`` is just
shortcut for ``print_hex_dump(KERN_DEBUG)``.
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 5e550e0bdbef..9b1c473f2fb2 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -834,12 +834,15 @@
See also Documentation/networking/decnet.rst.
default_hugepagesz=
- [same as hugepagesz=] The size of the default
- HugeTLB page size. This is the size represented by
- the legacy /proc/ hugepages APIs, used for SHM, and
- default size when mounting hugetlbfs filesystems.
- Defaults to the default architecture's huge page size
- if not specified.
+ [HW] The size of the default HugeTLB page. This is
+ the size represented by the legacy /proc/ hugepages
+ APIs. In addition, this is the default hugetlb size
+ used for shmget(), mmap() and mounting hugetlbfs
+ filesystems. If not specified, defaults to the
+ architecture's default huge page size. Huge page
+ sizes are architecture dependent. See also
+ Documentation/admin-guide/mm/hugetlbpage.rst.
+ Format: size[KMG]
deferred_probe_timeout=
[KNL] Debugging option to set a timeout in seconds for
@@ -1484,13 +1487,24 @@
hugepages using the cma allocator. If enabled, the
boot-time allocation of gigantic hugepages is skipped.
- hugepages= [HW,X86-32,IA-64] HugeTLB pages to allocate at boot.
- hugepagesz= [HW,IA-64,PPC,X86-64] The size of the HugeTLB pages.
- On x86-64 and powerpc, this option can be specified
- multiple times interleaved with hugepages= to reserve
- huge pages of different sizes. Valid pages sizes on
- x86-64 are 2M (when the CPU supports "pse") and 1G
- (when the CPU supports the "pdpe1gb" cpuinfo flag).
+ hugepages= [HW] Number of HugeTLB pages to allocate at boot.
+ If this follows hugepagesz (below), it specifies
+ the number of pages of hugepagesz to be allocated.
+ If this is the first HugeTLB parameter on the command
+ line, it specifies the number of pages to allocate for
+ the default huge page size. See also
+ Documentation/admin-guide/mm/hugetlbpage.rst.
+ Format: <integer>
+
+ hugepagesz=
+ [HW] The size of the HugeTLB pages. This is used in
+ conjunction with hugepages (above) to allocate huge
+ pages of a specific size at boot. The pair
+ hugepagesz=X hugepages=Y can be specified once for
+ each supported huge page size. Huge page sizes are
+ architecture dependent. See also
+ Documentation/admin-guide/mm/hugetlbpage.rst.
+ Format: size[KMG]
hung_task_panic=
[KNL] Should the hung task detector generate panics.
diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst
index 1cc0bc78d10e..5026e58826e2 100644
--- a/Documentation/admin-guide/mm/hugetlbpage.rst
+++ b/Documentation/admin-guide/mm/hugetlbpage.rst
@@ -100,6 +100,41 @@ with a huge page size selection parameter "hugepagesz=<size>". <size> must
be specified in bytes with optional scale suffix [kKmMgG]. The default huge
page size may be selected with the "default_hugepagesz=<size>" boot parameter.
+Hugetlb boot command line parameter semantics
+hugepagesz - Specify a huge page size. Used in conjunction with hugepages
+ parameter to preallocate a number of huge pages of the specified
+ size. Hence, hugepagesz and hugepages are typically specified in
+ pairs such as:
+ hugepagesz=2M hugepages=512
+ hugepagesz can only be specified once on the command line for a
+ specific huge page size. Valid huge page sizes are architecture
+ dependent.
+hugepages - Specify the number of huge pages to preallocate. This typically
+ follows a valid hugepagesz or default_hugepagesz parameter. However,
+ if hugepages is the first or only hugetlb command line parameter it
+ implicitly specifies the number of huge pages of default size to
+ allocate. If the number of huge pages of default size is implicitly
+ specified, it can not be overwritten by a hugepagesz,hugepages
+ parameter pair for the default size.
+ For example, on an architecture with 2M default huge page size:
+ hugepages=256 hugepagesz=2M hugepages=512
+ will result in 256 2M huge pages being allocated and a warning message
+ indicating that the hugepages=512 parameter is ignored. If a hugepages
+ parameter is preceded by an invalid hugepagesz parameter, it will
+ be ignored.
+default_hugepagesz - Specify the default huge page size. This parameter can
+ only be specified once on the command line. default_hugepagesz can
+ optionally be followed by the hugepages parameter to preallocate a
+ specific number of huge pages of default size. The number of default
+ sized huge pages to preallocate can also be implicitly specified as
+ mentioned in the hugepages section above. Therefore, on an
+ architecture with 2M default huge page size:
+ hugepages=256
+ default_hugepagesz=2M hugepages=256
+ hugepages=256 default_hugepagesz=2M
+ will all result in 256 2M huge pages being allocated. Valid default
+ huge page size is architecture dependent.
+
When multiple huge page sizes are supported, ``/proc/sys/vm/nr_hugepages``
indicates the current number of pre-allocated huge pages of the default size.
Thus, one can use the following command to dynamically allocate/deallocate
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 2f31de8f7c74..6a233e42be08 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -220,6 +220,13 @@ memory. A lower value can prevent THPs from being
collapsed, resulting fewer pages being collapsed into
THPs, and lower memory access performance.
+``max_ptes_shared`` specifies how many pages can be shared across multiple
+processes. Exceeding this number blocks the collapse::
+
+ /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_shared
+
+A higher value may increase memory footprint for some workloads.
+
Boot parameter
==============
diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index 0329a4d3fa9e..d46d5b7013c6 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -831,14 +831,27 @@ tooling to work, you can do::
swappiness
==========
-This control is used to define how aggressive the kernel will swap
-memory pages. Higher values will increase aggressiveness, lower values
-decrease the amount of swap. A value of 0 instructs the kernel not to
-initiate swap until the amount of free and file-backed pages is less
-than the high water mark in a zone.
+This control is used to define the rough relative IO cost of swapping
+and filesystem paging, as a value between 0 and 200. At 100, the VM
+assumes equal IO cost and will thus apply memory pressure to the page
+cache and swap-backed pages equally; lower values signify more
+expensive swap IO, higher values indicate cheaper swap IO.
+
+Keep in mind that filesystem IO patterns under memory pressure tend to
+be more efficient than swap's random IO. An optimal value will require
+experimentation and will also be workload-dependent.
The default value is 60.
+For in-memory swap, like zram or zswap, as well as hybrid setups that
+have swap on faster devices than the filesystem, values beyond 100 can
+be considered. For example, if the random IO against the swap device
+is on average 2x faster than IO from the filesystem, swappiness should
+be 133: swap then gets twice the filesystem's share of the 0..200 range,
+so x + 2x = 200 and 2x = 133.33.
+
+At 0, the kernel will not initiate swap until the amount of free and
+file-backed pages is less than the high watermark in a zone.
+
unprivileged_userfaultfd
========================
diff --git a/Documentation/core-api/cachetlb.rst b/Documentation/core-api/cachetlb.rst
index 93cb65d52720..a1582cc79f0f 100644
--- a/Documentation/core-api/cachetlb.rst
+++ b/Documentation/core-api/cachetlb.rst
@@ -213,7 +213,7 @@ Here are the routines, one by one:
there will be no entries in the cache for the kernel address
space for virtual addresses in the range 'start' to 'end-1'.
- The first of these two routines is invoked after map_vm_area()
+ The first of these two routines is invoked after map_kernel_range()
has installed the page table entries. The second is invoked
before unmap_kernel_range() deletes the page table entries.
diff --git a/Documentation/core-api/padata.rst b/Documentation/core-api/padata.rst
index 9a24c111781d..b7e047af993e 100644
--- a/Documentation/core-api/padata.rst
+++ b/Documentation/core-api/padata.rst
@@ -4,23 +4,26 @@
The padata parallel execution mechanism
=======================================
-:Date: December 2019
+:Date: April 2020
Padata is a mechanism by which the kernel can farm jobs out to be done in
-parallel on multiple CPUs while retaining their ordering. It was developed for
-use with the IPsec code, which needs to be able to perform encryption and
-decryption on large numbers of packets without reordering those packets. The
-crypto developers made a point of writing padata in a sufficiently general
-fashion that it could be put to other uses as well.
+parallel on multiple CPUs while optionally retaining their ordering.
-Usage
-=====
+It was originally developed for IPsec, which needs to perform encryption and
+decryption on large numbers of packets without reordering those packets. This
+is currently the sole consumer of padata's serialized job support.
+
+Padata also supports multithreaded jobs, splitting up the job evenly while load
+balancing and coordinating between threads.
+
+Running Serialized Jobs
+=======================
Initializing
------------
-The first step in using padata is to set up a padata_instance structure for
-overall control of how jobs are to be run::
+The first step in using padata to run parallel jobs is to set up a
+padata_instance structure for overall control of how jobs are to be run::
#include <linux/padata.h>
@@ -162,6 +165,24 @@ functions that correspond to the allocation in reverse::
It is the user's responsibility to ensure all outstanding jobs are complete
before any of the above are called.
+Running Multithreaded Jobs
+==========================
+
+A multithreaded job has a main thread and zero or more helper threads, with the
+main thread participating in the job and then waiting until all helpers have
+finished. padata splits the job into units called chunks, where a chunk is a
+piece of the job that one thread completes in one call to the thread function.
+
+A user has to do three things to run a multithreaded job. First, describe the
+job by defining a padata_mt_job structure, which is explained in the Interface
+section. This includes a pointer to the thread function, which padata will
+call each time it assigns a job chunk to a thread. Then, define the thread
+function, which accepts three arguments, ``start``, ``end``, and ``arg``, where
+the first two delimit the range that the thread operates on and the last is a
+pointer to the job's shared state, if any. Prepare the shared state, which is
+typically a stack-allocated structure that wraps the required data. Last, call
+padata_do_multithreaded(), which will return once the job is finished.
+
Interface
=========
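As a rough illustration of the multithreaded-job flow just described (a sketch, not taken from this series): it assumes the padata_mt_job fields named in the Interface section (thread_fn, fn_arg, start, size, align, min_chunk, max_threads); padata_do_multithreaded() is only called from boot-time code at this point, so the helpers are marked __init, and the zero_*() names are hypothetical::

	#include <linux/padata.h>

	struct zero_state {
		unsigned long *array;		/* shared state, wrapped on the stack */
	};

	/* Thread function: handles one chunk, delimited by [start, end). */
	static void __init zero_chunk(unsigned long start, unsigned long end,
				      void *arg)
	{
		struct zero_state *s = arg;
		unsigned long i;

		for (i = start; i < end; i++)
			s->array[i] = 0;
	}

	static void __init zero_array_mt(unsigned long *array, unsigned long nr)
	{
		struct zero_state s = { .array = array };
		struct padata_mt_job job = {
			.thread_fn	= zero_chunk,
			.fn_arg		= &s,
			.start		= 0,		/* first unit of work */
			.size		= nr,		/* total units of work */
			.align		= 1,		/* no alignment constraint */
			.min_chunk	= 1024,		/* smallest chunk worth a thread */
			.max_threads	= 4,
		};

		/* Returns once the main thread and all helpers have finished. */
		padata_do_multithreaded(&job);
	}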
diff --git a/Documentation/dev-tools/kcov.rst b/Documentation/dev-tools/kcov.rst
index 1c4e1825d769..8548b0b04e43 100644
--- a/Documentation/dev-tools/kcov.rst
+++ b/Documentation/dev-tools/kcov.rst
@@ -217,14 +217,15 @@ This allows to collect coverage from two types of kernel background
threads: the global ones, that are spawned during kernel boot in a limited
number of instances (e.g. one USB hub_event() worker thread is spawned per
USB HCD); and the local ones, that are spawned when a user interacts with
-some kernel interface (e.g. vhost workers).
+some kernel interface (e.g. vhost workers); as well as from soft
+interrupts.
-To enable collecting coverage from a global background thread, a unique
-global handle must be assigned and passed to the corresponding
-kcov_remote_start() call. Then a userspace process can pass a list of such
-handles to the KCOV_REMOTE_ENABLE ioctl in the handles array field of the
-kcov_remote_arg struct. This will attach the used kcov device to the code
-sections, that are referenced by those handles.
+To enable collecting coverage from a global background thread or from a
+softirq, a unique global handle must be assigned and passed to the
+corresponding kcov_remote_start() call. Then a userspace process can pass
+a list of such handles to the KCOV_REMOTE_ENABLE ioctl in the handles
+array field of the kcov_remote_arg struct. This will attach the used kcov
+device to the code sections that are referenced by those handles.
Since there might be many local background threads spawned from different
userspace processes, we can't use a single global handle per annotation.
@@ -242,7 +243,7 @@ handles as they don't belong to a particular subsystem. The bytes 4-7 are
currently reserved and must be zero. In the future the number of bytes
used for the subsystem or handle ids might be increased.
-When a particular userspace proccess collects coverage by via a common
+When a particular userspace process collects coverage via a common
handle, kcov will collect coverage for each code section that is annotated
to use the common handle obtained as kcov_handle from the current
task_struct. However non common handles allow to collect coverage
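A short userspace sketch of the KCOV_REMOTE_ENABLE flow described above (an illustration in the style of the fuller example in kcov.rst; COVER_SIZE and the instance ids 0x42 and 1 are made-up values, while the ioctls and kcov_remote_handle() come from <linux/kcov.h>)::

	#include <stdint.h>
	#include <stdlib.h>
	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <linux/kcov.h>

	#define COVER_SIZE (64 << 10)	/* coverage area size, in 8-byte words */

	int enable_remote_coverage(void)
	{
		struct kcov_remote_arg *arg;
		unsigned long *cover;
		int fd;

		fd = open("/sys/kernel/debug/kcov", O_RDWR);
		if (fd == -1)
			return -1;
		if (ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE))
			return -1;
		cover = mmap(NULL, COVER_SIZE * sizeof(unsigned long),
			     PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		if (cover == MAP_FAILED)
			return -1;

		/* Room for one entry in the handles array. */
		arg = calloc(1, sizeof(*arg) + sizeof(uint64_t));
		arg->trace_mode = KCOV_TRACE_PC;
		arg->area_size = COVER_SIZE;
		arg->num_handles = 1;
		/* Common handle: local threads/softirqs working on our behalf. */
		arg->common_handle = kcov_remote_handle(KCOV_SUBSYSTEM_COMMON, 0x42);
		/* Global handle: e.g. the hub_event() worker of USB bus 1. */
		arg->handles[0] = kcov_remote_handle(KCOV_SUBSYSTEM_USB, 1);

		if (ioctl(fd, KCOV_REMOTE_ENABLE, arg))
			return -1;
		/* ... trigger kernel activity, then read coverage from 'cover' ... */
		return fd;
	}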
diff --git a/Documentation/features/debug/debug-vm-pgtable/arch-support.txt b/Documentation/features/debug/debug-vm-pgtable/arch-support.txt
new file mode 100644
index 000000000000..c527d05c0459
--- /dev/null
+++ b/Documentation/features/debug/debug-vm-pgtable/arch-support.txt
@@ -0,0 +1,34 @@
+#
+# Feature name: debug-vm-pgtable
+# Kconfig: ARCH_HAS_DEBUG_VM_PGTABLE
+# description: arch supports pgtable tests for semantics compliance
+#
+ -----------------------
+ | arch |status|
+ -----------------------
+ | alpha: | TODO |
+ | arc: | ok |
+ | arm: | TODO |
+ | arm64: | ok |
+ | c6x: | TODO |
+ | csky: | TODO |
+ | h8300: | TODO |
+ | hexagon: | TODO |
+ | ia64: | TODO |
+ | m68k: | TODO |
+ | microblaze: | TODO |
+ | mips: | TODO |
+ | nds32: | TODO |
+ | nios2: | TODO |
+ | openrisc: | TODO |
+ | parisc: | TODO |
+ | powerpc: | ok |
+ | riscv: | TODO |
+ | s390: | ok |
+ | sh: | TODO |
+ | sparc: | TODO |
+ | um: | TODO |
+ | unicore32: | TODO |
+ | x86: | ok |
+ | xtensa: | TODO |
+ -----------------------
diff --git a/Documentation/features/vm/numa-memblock/arch-support.txt b/Documentation/features/vm/numa-memblock/arch-support.txt
deleted file mode 100644
index 3004beb0fd71..000000000000
--- a/Documentation/features/vm/numa-memblock/arch-support.txt
+++ /dev/null
@@ -1,34 +0,0 @@
-#
-# Feature name: numa-memblock
-# Kconfig: HAVE_MEMBLOCK_NODE_MAP
-# description: arch supports NUMA aware memblocks
-#
- -----------------------
- | arch |status|
- -----------------------
- | alpha: | TODO |
- | arc: | .. |
- | arm: | .. |
- | arm64: | ok |
- | c6x: | .. |
- | csky: | .. |
- | h8300: | .. |
- | hexagon: | .. |
- | ia64: | ok |
- | m68k: | .. |
- | microblaze: | ok |
- | mips: | ok |
- | nds32: | TODO |
- | nios2: | .. |
- | openrisc: | .. |
- | parisc: | .. |
- | powerpc: | ok |
- | riscv: | ok |
- | s390: | ok |
- | sh: | ok |
- | sparc: | ok |
- | um: | .. |
- | unicore32: | .. |
- | x86: | ok |
- | xtensa: | .. |
- -----------------------
diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index 9fdcec416614..3ffb248eec8c 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -239,6 +239,7 @@ prototypes::
int (*readpage)(struct file *, struct page *);
int (*writepages)(struct address_space *, struct writeback_control *);
int (*set_page_dirty)(struct page *page);
+ void (*readahead)(struct readahead_control *);
int (*readpages)(struct file *filp, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages);
int (*write_begin)(struct file *, struct address_space *mapping,
@@ -271,7 +272,8 @@ writepage: yes, unlocks (see below)
readpage: yes, unlocks
writepages:
set_page_dirty no
-readpages:
+readahead: yes, unlocks
+readpages: no
write_begin: locks the page exclusive
write_end: yes, unlocks exclusive
bmap:
@@ -295,6 +297,8 @@ the request handler (/dev/loop).
->readpage() unlocks the page, either synchronously or via I/O
completion.
+->readahead() unlocks the pages that I/O is attempted on like ->readpage().
+
->readpages() populates the pagecache with the passed pages and starts
I/O against them. They come unlocked upon I/O completion.
diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
index 7d4d09dd5e6d..ed17771c212b 100644
--- a/Documentation/filesystems/vfs.rst
+++ b/Documentation/filesystems/vfs.rst
@@ -706,6 +706,7 @@ cache in your filesystem. The following members are defined:
int (*readpage)(struct file *, struct page *);
int (*writepages)(struct address_space *, struct writeback_control *);
int (*set_page_dirty)(struct page *page);
+ void (*readahead)(struct readahead_control *);
int (*readpages)(struct file *filp, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages);
int (*write_begin)(struct file *, struct address_space *mapping,
@@ -781,12 +782,26 @@ cache in your filesystem. The following members are defined:
If defined, it should set the PageDirty flag, and the
PAGECACHE_TAG_DIRTY tag in the radix tree.
+``readahead``
+ Called by the VM to read pages associated with the address_space
+ object. The pages are consecutive in the page cache and are
+ locked. The implementation should decrement the page refcount
+ after starting I/O on each page. Usually the page will be
+ unlocked by the I/O completion handler. If the filesystem decides
+ to stop attempting I/O before reaching the end of the readahead
+ window, it can simply return. The caller will decrement the page
+ refcount and unlock the remaining pages for you. Set PageUptodate
+ if the I/O completes successfully. Setting PageError on any page
+ will be ignored; simply unlock the page if an I/O error occurs.
+
``readpages``
called by the VM to read pages associated with the address_space
object. This is essentially just a vector version of readpage.
Instead of just one page, several pages are requested.
readpages is only used for read-ahead, so read errors are
ignored. If anything goes wrong, feel free to give up.
+ This interface is deprecated and will be removed by the end of
+ 2020; implement readahead instead.
``write_begin``
Called by the generic buffered write code to ask the filesystem
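To make the new ->readahead() hook described above concrete, here is a sketch of a trivial implementation following that contract; the myfs_* names and the myfs_submit_read() helper are placeholders, and the loop shape mirrors the filesystems converted elsewhere in this series::

	static void myfs_readahead(struct readahead_control *rac)
	{
		struct page *page;

		/* Pages come locked and consecutive in the page cache. */
		while ((page = readahead_page(rac))) {
			if (myfs_submit_read(page)) {
				/*
				 * No I/O was started: unlock and release this
				 * page, then give up; pages we never pulled
				 * are cleaned up by the caller.
				 */
				unlock_page(page);
				put_page(page);
				break;
			}
			/*
			 * I/O is in flight; the completion handler unlocks the
			 * page and sets PageUptodate. Drop our reference now.
			 */
			put_page(page);
		}
	}

	static const struct address_space_operations myfs_aops = {
		.readpage	= myfs_readpage,	/* still needed for single pages */
		.readahead	= myfs_readahead,
		/* ... */
	};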
diff --git a/Documentation/lzo.txt b/Documentation/lzo.txt
index ca983328976b..f65b51523014 100644
--- a/Documentation/lzo.txt
+++ b/Documentation/lzo.txt
@@ -159,11 +159,15 @@ Byte sequences
distance = 16384 + (H << 14) + D
state = S (copy S literals after this block)
End of stream is reached if distance == 16384
+ In version 1 only, to prevent ambiguity with the RLE case when
+ ((distance & 0x803f) == 0x803f) && (261 <= length <= 264), the
+ compressor must not emit block copies where distance and length
+ meet these conditions.
In version 1 only, this instruction is also used to encode a run of
- zeros if distance = 0xbfff, i.e. H = 1 and the D bits are all 1.
+ zeros if distance = 0xbfff, i.e. H = 1 and the D bits are all 1.
In this case, it is followed by a fourth byte, X.
- run length = ((X << 3) | (0 0 0 0 0 L L L)) + 4.
+ run length = ((X << 3) | (0 0 0 0 0 L L L)) + 4
0 0 1 L L L L L (32..63)
Copy of small block within 16kB distance (preferably less than 34B)
diff --git a/Documentation/vm/memory-model.rst b/Documentation/vm/memory-model.rst
index 58a12376b7df..91228044ed16 100644
--- a/Documentation/vm/memory-model.rst
+++ b/Documentation/vm/memory-model.rst
@@ -46,11 +46,10 @@ maps the entire physical memory. For most architectures, the holes
have entries in the `mem_map` array. The `struct page` objects
corresponding to the holes are never fully initialized.
-To allocate the `mem_map` array, architecture specific setup code
-should call :c:func:`free_area_init_node` function or its convenience
-wrapper :c:func:`free_area_init`. Yet, the mappings array is not
-usable until the call to :c:func:`memblock_free_all` that hands all
-the memory to the page allocator.
+To allocate the `mem_map` array, architecture specific setup code should
+call the :c:func:`free_area_init` function. Yet, the mappings array is not
+usable until the call to :c:func:`memblock_free_all` that hands all the
+memory to the page allocator.
If an architecture enables `CONFIG_ARCH_HAS_HOLES_MEMORYMODEL` option,
it may free parts of the `mem_map` array that do not cover the
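The architecture conversions later in this diff (alpha, arc, arm, ...) all take the same shape; roughly, with example_paging_init() and the pfn values as placeholders::

	void __init example_paging_init(void)
	{
		unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 };

		/* Highest pfn of each populated zone. */
		max_zone_pfn[ZONE_DMA]    = dma_pfn;
		max_zone_pfn[ZONE_NORMAL] = max_low_pfn;

		/* One call replaces the per-node free_area_init_node() calls. */
		free_area_init(max_zone_pfn);
	}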
diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst
index 0ed5ab8c7ab4..079f3f8c4784 100644
--- a/Documentation/vm/page_owner.rst
+++ b/Documentation/vm/page_owner.rst
@@ -83,8 +83,7 @@ Usage
4) Analyze information from page owner::
cat /sys/kernel/debug/page_owner > page_owner_full.txt
- grep -v ^PFN page_owner_full.txt > page_owner.txt
- ./page_owner_sort page_owner.txt sorted_page_owner.txt
+ ./page_owner_sort page_owner_full.txt sorted_page_owner.txt
See the result about who allocated each page
in the ``sorted_page_owner.txt``.
diff --git a/Documentation/vm/slub.rst b/Documentation/vm/slub.rst
index 933ada4368ff..4eee598555c9 100644
--- a/Documentation/vm/slub.rst
+++ b/Documentation/vm/slub.rst
@@ -49,7 +49,7 @@ Possible debug options are::
P Poisoning (object and padding)
U User tracking (free and alloc)
T Trace (please only use on single slabs)
- A Toggle failslab filter mark for the cache
+ A Enable failslab filter mark for the cache
O Switch debugging off for caches that would have
caused higher minimum slab orders
- Switch all debugging off (useful if the kernel is
diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c
index 12e218d3792a..667cd21393b5 100644
--- a/arch/alpha/mm/init.c
+++ b/arch/alpha/mm/init.c
@@ -243,21 +243,17 @@ callback_init(void * kernel_end)
*/
void __init paging_init(void)
{
- unsigned long zones_size[MAX_NR_ZONES] = {0, };
- unsigned long dma_pfn, high_pfn;
+ unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, };
+ unsigned long dma_pfn;
dma_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
- high_pfn = max_pfn = max_low_pfn;
+ max_pfn = max_low_pfn;
- if (dma_pfn >= high_pfn)
- zones_size[ZONE_DMA] = high_pfn;
- else {
- zones_size[ZONE_DMA] = dma_pfn;
- zones_size[ZONE_NORMAL] = high_pfn - dma_pfn;
- }
+ max_zone_pfn[ZONE_DMA] = dma_pfn;
+ max_zone_pfn[ZONE_NORMAL] = max_pfn;
/* Initialize mem_map[]. */
- free_area_init(zones_size);
+ free_area_init(max_zone_pfn);
/* Initialize the kernel's ZERO_PGE. */
memset((void *)ZERO_PGE, 0, PAGE_SIZE);
diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c
index d0b73371e985..5ad6087de1d6 100644
--- a/arch/alpha/mm/numa.c
+++ b/arch/alpha/mm/numa.c
@@ -144,8 +144,8 @@ setup_memory_node(int nid, void *kernel_end)
if (!nid && (node_max_pfn < end_kernel_pfn || node_min_pfn > start_kernel_pfn))
panic("kernel loaded out of ram");
- memblock_add(PFN_PHYS(node_min_pfn),
- (node_max_pfn - node_min_pfn) << PAGE_SHIFT);
+ memblock_add_node(PFN_PHYS(node_min_pfn),
+ (node_max_pfn - node_min_pfn) << PAGE_SHIFT, nid);
/* Zone start phys-addr must be 2^(MAX_ORDER-1) aligned.
Note that we round this down, not up - node memory
@@ -202,8 +202,7 @@ setup_memory(void *kernel_end)
void __init paging_init(void)
{
- unsigned int nid;
- unsigned long zones_size[MAX_NR_ZONES] = {0, };
+ unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, };
unsigned long dma_local_pfn;
/*
@@ -215,19 +214,10 @@ void __init paging_init(void)
*/
dma_local_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
- for_each_online_node(nid) {
- unsigned long start_pfn = NODE_DATA(nid)->node_start_pfn;
- unsigned long end_pfn = start_pfn + NODE_DATA(nid)->node_present_pages;
+ max_zone_pfn[ZONE_DMA] = dma_local_pfn;
+ max_zone_pfn[ZONE_NORMAL] = max_pfn;
- if (dma_local_pfn >= end_pfn - start_pfn)
- zones_size[ZONE_DMA] = end_pfn - start_pfn;
- else {
- zones_size[ZONE_DMA] = dma_local_pfn;
- zones_size[ZONE_NORMAL] = (end_pfn - start_pfn) - dma_local_pfn;
- }
- node_set_state(nid, N_NORMAL_MEMORY);
- free_area_init_node(nid, zones_size, start_pfn, NULL);
- }
+ free_area_init(max_zone_pfn);
/* Initialize the kernel's ZERO_PGE. */
memset((void *)ZERO_PGE, 0, PAGE_SIZE);
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index ff306246d0f8..471ef22216c4 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -6,6 +6,7 @@
config ARC
def_bool y
select ARC_TIMERS
+ select ARCH_HAS_DEBUG_VM_PGTABLE
select ARCH_HAS_DMA_PREP_COHERENT
select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_SETUP_DMA_OPS
diff --git a/arch/arc/include/asm/highmem.h b/arch/arc/include/asm/highmem.h
index 1af00accb37f..6e5eafb3afdd 100644
--- a/arch/arc/include/asm/highmem.h
+++ b/arch/arc/include/asm/highmem.h
@@ -25,17 +25,8 @@
#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT))
#define PKMAP_NR(virt) (((virt) - PKMAP_BASE) >> PAGE_SHIFT)
-#define kmap_prot PAGE_KERNEL
-
-
#include <asm/cacheflush.h>
-extern void *kmap(struct page *page);
-extern void *kmap_high(struct page *page);
-extern void *kmap_atomic(struct page *page);
-extern void __kunmap_atomic(void *kvaddr);
-extern void kunmap_high(struct page *page);
-
extern void kmap_init(void);
static inline void flush_cache_kmaps(void)
@@ -43,15 +34,6 @@ static inline void flush_cache_kmaps(void)
flush_cache_all();
}
-static inline void kunmap(struct page *page)
-{
- BUG_ON(in_interrupt());
- if (!PageHighMem(page))
- return;
- kunmap_high(page);
-}
-
-
#endif
#endif
diff --git a/arch/arc/include/asm/hugepage.h b/arch/arc/include/asm/hugepage.h
index 30ac40fed2c5..4eef17c5c1da 100644
--- a/arch/arc/include/asm/hugepage.h
+++ b/arch/arc/include/asm/hugepage.h
@@ -26,7 +26,7 @@ static inline pmd_t pte_pmd(pte_t pte)
#define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd)))
#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd)))
#define pmd_mkhuge(pmd) pte_pmd(pte_mkhuge(pmd_pte(pmd)))
-#define pmd_mknotpresent(pmd) pte_pmd(pte_mknotpresent(pmd_pte(pmd)))
+#define pmd_mkinvalid(pmd) pte_pmd(pte_mknotpresent(pmd_pte(pmd)))
#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd)))
#define pmd_write(pmd) pte_write(pmd_pte(pmd))
diff --git a/arch/arc/mm/highmem.c b/arch/arc/mm/highmem.c
index fc8849e4f72e..479b0d72d3cf 100644
--- a/arch/arc/mm/highmem.c
+++ b/arch/arc/mm/highmem.c
@@ -49,38 +49,23 @@
extern pte_t * pkmap_page_table;
static pte_t * fixmap_page_table;
-void *kmap(struct page *page)
-{
- BUG_ON(in_interrupt());
- if (!PageHighMem(page))
- return page_address(page);
-
- return kmap_high(page);
-}
-EXPORT_SYMBOL(kmap);
-
-void *kmap_atomic(struct page *page)
+void *kmap_atomic_high_prot(struct page *page, pgprot_t prot)
{
int idx, cpu_idx;
unsigned long vaddr;
- preempt_disable();
- pagefault_disable();
- if (!PageHighMem(page))
- return page_address(page);
-
cpu_idx = kmap_atomic_idx_push();
idx = cpu_idx + KM_TYPE_NR * smp_processor_id();
vaddr = FIXMAP_ADDR(idx);
set_pte_at(&init_mm, vaddr, fixmap_page_table + idx,
- mk_pte(page, kmap_prot));
+ mk_pte(page, prot));
return (void *)vaddr;
}
-EXPORT_SYMBOL(kmap_atomic);
+EXPORT_SYMBOL(kmap_atomic_high_prot);
-void __kunmap_atomic(void *kv)
+void kunmap_atomic_high(void *kv)
{
unsigned long kvaddr = (unsigned long)kv;
@@ -102,11 +87,8 @@ void __kunmap_atomic(void *kv)
kmap_atomic_idx_pop();
}
-
- pagefault_enable();
- preempt_enable();
}
-EXPORT_SYMBOL(__kunmap_atomic);
+EXPORT_SYMBOL(kunmap_atomic_high);
static noinline pte_t * __init alloc_kmap_pgtable(unsigned long kvaddr)
{
diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c
index 0920c969c466..e7bdc2ac1c87 100644
--- a/arch/arc/mm/init.c
+++ b/arch/arc/mm/init.c
@@ -63,11 +63,13 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size)
low_mem_sz = size;
in_use = 1;
+ memblock_add_node(base, size, 0);
} else {
#ifdef CONFIG_HIGHMEM
high_mem_start = base;
high_mem_sz = size;
in_use = 1;
+ memblock_add_node(base, size, 1);
#endif
}
@@ -75,6 +77,11 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size)
base, TO_MB(size), !in_use ? "Not used":"");
}
+bool arch_has_descending_max_zone_pfns(void)
+{
+ return !IS_ENABLED(CONFIG_ARC_HAS_PAE40);
+}
+
/*
* First memory setup routine called from setup_arch()
* 1. setup swapper's mm @init_mm
@@ -83,8 +90,7 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size)
*/
void __init setup_arch_memory(void)
{
- unsigned long zones_size[MAX_NR_ZONES];
- unsigned long zones_holes[MAX_NR_ZONES];
+ unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 };
init_mm.start_code = (unsigned long)_text;
init_mm.end_code = (unsigned long)_etext;
@@ -115,7 +121,6 @@ void __init setup_arch_memory(void)
* the crash
*/
- memblock_add_node(low_mem_start, low_mem_sz, 0);
memblock_reserve(CONFIG_LINUX_LINK_BASE,
__pa(_end) - CONFIG_LINUX_LINK_BASE);
@@ -133,22 +138,7 @@ void __init setup_arch_memory(void)
memblock_dump_all();
/*----------------- node/zones setup --------------------------*/
- memset(zones_size, 0, sizeof(zones_size));
- memset(zones_holes, 0, sizeof(zones_holes));
-
- zones_size[ZONE_NORMAL] = max_low_pfn - min_low_pfn;
- zones_holes[ZONE_NORMAL] = 0;
-
- /*
- * We can't use the helper free_area_init(zones[]) because it uses
- * PAGE_OFFSET to compute the @min_low_pfn which would be wrong
- * when our kernel doesn't start at PAGE_OFFSET, i.e.
- * PAGE_OFFSET != CONFIG_LINUX_RAM_BASE
- */
- free_area_init_node(0, /* node-id */
- zones_size, /* num pages per zone */
- min_low_pfn, /* first pfn of node */
- zones_holes); /* holes */
+ max_zone_pfn[ZONE_NORMAL] = max_low_pfn;
#ifdef CONFIG_HIGHMEM
/*
@@ -168,20 +158,13 @@ void __init setup_arch_memory(void)
min_high_pfn = PFN_DOWN(high_mem_start);
max_high_pfn = PFN_DOWN(high_mem_start + high_mem_sz);
- zones_size[ZONE_NORMAL] = 0;
- zones_holes[ZONE_NORMAL] = 0;
-
- zones_size[ZONE_HIGHMEM] = max_high_pfn - min_high_pfn;
- zones_holes[ZONE_HIGHMEM] = 0;
-
- free_area_init_node(1, /* node-id */
- zones_size, /* num pages per zone */
- min_high_pfn, /* first pfn of node */
- zones_holes); /* holes */
+ max_zone_pfn[ZONE_HIGHMEM] = max_high_pfn;
high_memory = (void *)(min_high_pfn << PAGE_SHIFT);
kmap_init();
#endif
+
+ free_area_init(max_zone_pfn);
}
/*
diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig
index 8b83d4a5d309..fe383f5a92fb 100644
--- a/arch/arm/configs/omap2plus_defconfig
+++ b/arch/arm/configs/omap2plus_defconfig
@@ -81,7 +81,7 @@ CONFIG_PARTITION_ADVANCED=y
CONFIG_BINFMT_MISC=y
CONFIG_CMA=y
CONFIG_ZSMALLOC=m
-CONFIG_PGTABLE_MAPPING=y
+CONFIG_ZSMALLOC_PGTABLE_MAPPING=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_UNIX=y
diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h
index eb4e4207cd3c..31811be38d78 100644
--- a/arch/arm/include/asm/highmem.h
+++ b/arch/arm/include/asm/highmem.h
@@ -10,8 +10,6 @@
#define PKMAP_NR(virt) (((virt) - PKMAP_BASE) >> PAGE_SHIFT)
#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT))
-#define kmap_prot PAGE_KERNEL
-
#define flush_cache_kmaps() \
do { \
if (cache_is_vivt()) \
@@ -20,9 +18,6 @@
extern pte_t *pkmap_page_table;
-extern void *kmap_high(struct page *page);
-extern void kunmap_high(struct page *page);
-
/*
* The reason for kmap_high_get() is to ensure that the currently kmap'd
* page usage count does not decrease to zero while we're using its
@@ -63,10 +58,6 @@ static inline void *kmap_high_get(struct page *page)
* when CONFIG_HIGHMEM is not set.
*/
#ifdef CONFIG_HIGHMEM
-extern void *kmap(struct page *page);
-extern void kunmap(struct page *page);
-extern void *kmap_atomic(struct page *page);
-extern void __kunmap_atomic(void *kvaddr);
extern void *kmap_atomic_pfn(unsigned long pfn);
#endif
diff --git a/arch/arm/include/asm/hugetlb.h b/arch/arm/include/asm/hugetlb.h
index 318dcf5921ab..d02d6ca88e92 100644
--- a/arch/arm/include/asm/hugetlb.h
+++ b/arch/arm/include/asm/hugetlb.h
@@ -14,15 +14,10 @@
#include <asm/hugetlb-3level.h>
#include <asm-generic/hugetlb.h>
-static inline int is_hugepage_only_range(struct mm_struct *mm,
- unsigned long addr, unsigned long len)
-{
- return 0;
-}
-
static inline void arch_clear_hugepage_flags(struct page *page)
{
clear_bit(PG_dcache_clean, &page->flags);
}
+#define arch_clear_hugepage_flags arch_clear_hugepage_flags
#endif /* _ASM_ARM_HUGETLB_H */
diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index 36805f94939e..1933aed9f68d 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -221,7 +221,7 @@ PMD_BIT_FUNC(mkyoung, |= PMD_SECT_AF);
#define pmdp_establish generic_pmdp_establish
/* represent a notpresent pmd by faulting entry, this is used by pmdp_invalidate */
-static inline pmd_t pmd_mknotpresent(pmd_t pmd)
+static inline pmd_t pmd_mkinvalid(pmd_t pmd)
{
return __pmd(pmd_val(pmd) & ~L_PMD_SECT_VALID);
}
diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c
index a76f8ace9ce6..e013f6b81328 100644
--- a/arch/arm/mm/highmem.c
+++ b/arch/arm/mm/highmem.c
@@ -31,36 +31,13 @@ static inline pte_t get_fixmap_pte(unsigned long vaddr)
return *ptep;
}
-void *kmap(struct page *page)
-{
- might_sleep();
- if (!PageHighMem(page))
- return page_address(page);
- return kmap_high(page);
-}
-EXPORT_SYMBOL(kmap);
-
-void kunmap(struct page *page)
-{
- BUG_ON(in_interrupt());
- if (!PageHighMem(page))
- return;
- kunmap_high(page);
-}
-EXPORT_SYMBOL(kunmap);
-
-void *kmap_atomic(struct page *page)
+void *kmap_atomic_high_prot(struct page *page, pgprot_t prot)
{
unsigned int idx;
unsigned long vaddr;
void *kmap;
int type;
- preempt_disable();
- pagefault_disable();
- if (!PageHighMem(page))
- return page_address(page);
-
#ifdef CONFIG_DEBUG_HIGHMEM
/*
* There is no cache coherency issue when non VIVT, so force the
@@ -90,13 +67,13 @@ void *kmap_atomic(struct page *page)
* in place, so the contained TLB flush ensures the TLB is updated
* with the new mapping.
*/
- set_fixmap_pte(idx, mk_pte(page, kmap_prot));
+ set_fixmap_pte(idx, mk_pte(page, prot));
return (void *)vaddr;
}
-EXPORT_SYMBOL(kmap_atomic);
+EXPORT_SYMBOL(kmap_atomic_high_prot);
-void __kunmap_atomic(void *kvaddr)
+void kunmap_atomic_high(void *kvaddr)
{
unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
int idx, type;
@@ -118,10 +95,8 @@ void __kunmap_atomic(void *kvaddr)
/* this address was obtained through kmap_high_get() */
kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)]));
}
- pagefault_enable();
- preempt_enable();
}
-EXPORT_SYMBOL(__kunmap_atomic);
+EXPORT_SYMBOL(kunmap_atomic_high);
void *kmap_atomic_pfn(unsigned long pfn)
{
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 054be44d1cdb..4e43455fab84 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -92,18 +92,6 @@ EXPORT_SYMBOL(arm_dma_zone_size);
*/
phys_addr_t arm_dma_limit;
unsigned long arm_dma_pfn_limit;
-
-static void __init arm_adjust_dma_zone(unsigned long *size, unsigned long *hole,
- unsigned long dma_size)
-{
- if (size[0] <= dma_size)
- return;
-
- size[ZONE_NORMAL] = size[0] - dma_size;
- size[ZONE_DMA] = dma_size;
- hole[ZONE_NORMAL] = hole[0];
- hole[ZONE_DMA] = 0;
-}
#endif
void __init setup_dma_zone(const struct machine_desc *mdesc)
@@ -121,56 +109,16 @@ void __init setup_dma_zone(const struct machine_desc *mdesc)
static void __init zone_sizes_init(unsigned long min, unsigned long max_low,
unsigned long max_high)
{
- unsigned long zone_size[MAX_NR_ZONES], zhole_size[MAX_NR_ZONES];
- struct memblock_region *reg;
-
- /*
- * initialise the zones.
- */
- memset(zone_size, 0, sizeof(zone_size));
+ unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 };
- /*
- * The memory size has already been determined. If we need
- * to do anything fancy with the allocation of this memory
- * to the zones, now is the time to do it.
- */
- zone_size[0] = max_low - min;
-#ifdef CONFIG_HIGHMEM
- zone_size[ZONE_HIGHMEM] = max_high - max_low;
+#ifdef CONFIG_ZONE_DMA
+ max_zone_pfn[ZONE_DMA] = min(arm_dma_pfn_limit, max_low);
#endif
-
- /*
- * Calculate the size of the holes.
- * holes = node_size - sum(bank_sizes)
- */
- memcpy(zhole_size, zone_size, sizeof(zhole_size));
- for_each_memblock(memory, reg) {
- unsigned long start = memblock_region_memory_base_pfn(reg);
- unsigned long end = memblock_region_memory_end_pfn(reg);
-
- if (start < max_low) {
- unsigned long low_end = min(end, max_low);
- zhole_size[0] -= low_end - start;
- }
+ max_zone_pfn[ZONE_NORMAL] = max_low;
#ifdef CONFIG_HIGHMEM
- if (end > max_low) {
- unsigned long high_start = max(start, max_low);
- zhole_size[ZONE_HIGHMEM] -= end - high_start;
- }
+ max_zone_pfn[ZONE_HIGHMEM] = max_high;
#endif
- }
-
-#ifdef CONFIG_ZONE_DMA
- /*
- * Adjust the sizes according to any special requirements for
- * this machine type.
- */
- if (arm_dma_zone_size)
- arm_adjust_dma_zone(zone_size, zhole_size,
- arm_dma_zone_size >> PAGE_SHIFT);
-#endif
-
- free_area_init_node(0, zone_size, min, zhole_size);
+ free_area_init(max_zone_pfn);
}
#ifdef CONFIG_HAVE_ARCH_PFN_VALID
@@ -306,7 +254,7 @@ void __init bootmem_init(void)
sparse_init();
/*
- * Now free the memory - free_area_init_node needs
+ * Now free the memory - free_area_init needs
* the sparse mem_map arrays initialized by sparse_init()
* for memmap_init_zone(), otherwise all PFNs are invalid.
*/
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 552d36cacc05..45d304524295 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -10,7 +10,9 @@ config ARM64
select ACPI_SPCR_TABLE if ACPI
select ACPI_PPTT if ACPI
select ARCH_BINFMT_ELF_STATE
+ select ARCH_HAS_DEBUG_WX
select ARCH_HAS_DEBUG_VIRTUAL
+ select ARCH_HAS_DEBUG_VM_PGTABLE
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_DMA_PREP_COHERENT
select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
@@ -162,7 +164,6 @@ config ARM64
select HAVE_GCC_PLUGINS
select HAVE_HW_BREAKPOINT if PERF_EVENTS
select HAVE_IRQ_TIME_ACCOUNTING
- select HAVE_MEMBLOCK_NODE_MAP if NUMA
select HAVE_NMI
select HAVE_PATA_PLATFORM
select HAVE_PERF_EVENTS
diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug
index a1efa246c9ed..cdf7ec0b975e 100644
--- a/arch/arm64/Kconfig.debug
+++ b/arch/arm64/Kconfig.debug
@@ -23,35 +23,6 @@ config ARM64_RANDOMIZE_TEXT_OFFSET
of TEXT_OFFSET and platforms must not require a specific
value.
-config DEBUG_WX
- bool "Warn on W+X mappings at boot"
- select PTDUMP_CORE
- ---help---
- Generate a warning if any W+X mappings are found at boot.
-
- This is useful for discovering cases where the kernel is leaving
- W+X mappings after applying NX, as such mappings are a security risk.
- This check also includes UXN, which should be set on all kernel
- mappings.
-
- Look for a message in dmesg output like this:
-
- arm64/mm: Checked W+X mappings: passed, no W+X pages found.
-
- or like this, if the check failed:
-
- arm64/mm: Checked W+X mappings: FAILED, <N> W+X pages found.
-
- Note that even if the check fails, your kernel is possibly
- still fine, as W+X mappings are not a security hole in
- themselves, what they do is that they make the exploitation
- of other unfixed kernel bugs easier.
-
- There is no runtime or memory usage effect of this option
- once the kernel has booted up - it's a one time check.
-
- If in doubt, say "Y".
-
config DEBUG_EFI
depends on EFI && DEBUG_INFO
bool "UEFI debugging"
diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index 2eb6c234d594..94ba0c5bced2 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -17,22 +17,11 @@
extern bool arch_hugetlb_migration_supported(struct hstate *h);
#endif
-#define __HAVE_ARCH_HUGE_PTEP_GET
-static inline pte_t huge_ptep_get(pte_t *ptep)
-{
- return READ_ONCE(*ptep);
-}
-
-static inline int is_hugepage_only_range(struct mm_struct *mm,
- unsigned long addr, unsigned long len)
-{
- return 0;
-}
-
static inline void arch_clear_hugepage_flags(struct page *page)
{
clear_bit(PG_dcache_clean, &page->flags);
}
+#define arch_clear_hugepage_flags arch_clear_hugepage_flags
extern pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
struct page *page, int writable);
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index e50e4dda90c2..9ce000f22d9e 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -366,7 +366,7 @@ static inline int pmd_protnone(pmd_t pmd)
#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd)))
#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd)))
#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd)))
-#define pmd_mknotpresent(pmd) (__pmd(pmd_val(pmd) & ~PMD_SECT_VALID))
+#define pmd_mkinvalid(pmd) (__pmd(pmd_val(pmd) & ~PMD_SECT_VALID))
#define pmd_thp_or_huge(pmd) (pmd_huge(pmd) || pmd_trans_huge(pmd))
@@ -407,6 +407,9 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd)
#define __pgprot_modify(prot,mask,bits) \
__pgprot((pgprot_val(prot) & ~(mask)) | (bits))
+#define pgprot_nx(prot) \
+ __pgprot_modify(prot, 0, PTE_PXN)
+
/*
* Mark the prot value as uncacheable and unbufferable.
*/
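The pmd_mknotpresent() -> pmd_mkinvalid() rename reflects what the helper is actually used for: marking a huge PMD invalid (while keeping its contents) during THP splitting, rather than clearing the present bit for its own sake. The main generic consumer, paraphrased from mm/pgtable-generic.c (shape assumed, not part of this diff)::

    pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
                          pmd_t *pmdp)
    {
            pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));

            flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
            return old;
    }

The new pgprot_nx() helper simply strips execute rights from a protection value; on arm64 that means setting PTE_PXN, as the hunk above shows.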
diff --git a/arch/arm64/include/asm/vmap_stack.h b/arch/arm64/include/asm/vmap_stack.h
index 0a12115d9638..0cc6636e3f15 100644
--- a/arch/arm64/include/asm/vmap_stack.h
+++ b/arch/arm64/include/asm/vmap_stack.h
@@ -19,10 +19,8 @@ static inline unsigned long *arch_alloc_vmap_stack(size_t stack_size, int node)
{
BUILD_BUG_ON(!IS_ENABLED(CONFIG_VMAP_STACK));
- return __vmalloc_node_range(stack_size, THREAD_ALIGN,
- VMALLOC_START, VMALLOC_END,
- THREADINFO_GFP, PAGE_KERNEL, 0, node,
- __builtin_return_address(0));
+ return __vmalloc_node(stack_size, THREAD_ALIGN, THREADINFO_GFP, node,
+ __builtin_return_address(0));
}
#endif /* __ASM_VMAP_STACK_H */
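arch_alloc_vmap_stack() only needs "vmalloc this much, with this alignment, on this node"; the VMALLOC_START/VMALLOC_END range and PAGE_KERNEL protection it used to spell out are exactly the defaults the simpler helper applies. Its signature, as assumed from the series that re-exported it::

    void *__vmalloc_node(unsigned long size, unsigned long align,
                         gfp_t gfp_mask, int node, const void *caller);

The powerpc irq-stack allocation later in this patch is converted the same way.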
diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c
index 78163b7a7dde..0da020c563e6 100644
--- a/arch/arm64/mm/dump.c
+++ b/arch/arm64/mm/dump.c
@@ -252,7 +252,7 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr)
}
static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
- unsigned long val)
+ u64 val)
{
struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
static const char units[] = "KMGTPE";
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 0be3355e3499..07f154b8b84a 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -443,44 +443,30 @@ void huge_ptep_clear_flush(struct vm_area_struct *vma,
clear_flush(vma->vm_mm, addr, ptep, pgsize, ncontig);
}
-static void __init add_huge_page_size(unsigned long size)
-{
- if (size_to_hstate(size))
- return;
-
- hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
-}
-
static int __init hugetlbpage_init(void)
{
#ifdef CONFIG_ARM64_4K_PAGES
- add_huge_page_size(PUD_SIZE);
+ hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
#endif
- add_huge_page_size(CONT_PMD_SIZE);
- add_huge_page_size(PMD_SIZE);
- add_huge_page_size(CONT_PTE_SIZE);
+ hugetlb_add_hstate((CONT_PMD_SHIFT + PMD_SHIFT) - PAGE_SHIFT);
+ hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
+ hugetlb_add_hstate((CONT_PTE_SHIFT + PAGE_SHIFT) - PAGE_SHIFT);
return 0;
}
arch_initcall(hugetlbpage_init);
-static __init int setup_hugepagesz(char *opt)
+bool __init arch_hugetlb_valid_size(unsigned long size)
{
- unsigned long ps = memparse(opt, &opt);
-
- switch (ps) {
+ switch (size) {
#ifdef CONFIG_ARM64_4K_PAGES
case PUD_SIZE:
#endif
case CONT_PMD_SIZE:
case PMD_SIZE:
case CONT_PTE_SIZE:
- add_huge_page_size(ps);
- return 1;
+ return true;
}
- hugetlb_bad_size();
- pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10);
- return 0;
+ return false;
}
-__setup("hugepagesz=", setup_hugepagesz);
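Instead of every architecture parsing "hugepagesz=" itself, the generic code now does the parsing and asks the architecture whether a size is valid via arch_hugetlb_valid_size(). A paraphrased sketch of the generic side (mm/hugetlb.c; wording assumed, extra bookkeeping omitted)::

    static int __init hugepagesz_setup(char *s)
    {
            unsigned long size = memparse(s, &s);

            if (!arch_hugetlb_valid_size(size)) {
                    pr_err("HugeTLB: unsupported hugepagesz %lu\n", size);
                    return 0;
            }

            hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
            return 1;
    }
    __setup("hugepagesz=", hugepagesz_setup);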
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index d2df416b840e..e631e6425165 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -192,8 +192,6 @@ static phys_addr_t __init max_zone_phys(unsigned int zone_bits)
return min(offset + (1ULL << zone_bits), memblock_end_of_DRAM());
}
-#ifdef CONFIG_NUMA
-
static void __init zone_sizes_init(unsigned long min, unsigned long max)
{
unsigned long max_zone_pfns[MAX_NR_ZONES] = {0};
@@ -206,61 +204,9 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max)
#endif
max_zone_pfns[ZONE_NORMAL] = max;
- free_area_init_nodes(max_zone_pfns);
-}
-
-#else
-
-static void __init zone_sizes_init(unsigned long min, unsigned long max)
-{
- struct memblock_region *reg;
- unsigned long zone_size[MAX_NR_ZONES], zhole_size[MAX_NR_ZONES];
- unsigned long __maybe_unused max_dma, max_dma32;
-
- memset(zone_size, 0, sizeof(zone_size));
-
- max_dma = max_dma32 = min;
-#ifdef CONFIG_ZONE_DMA
- max_dma = max_dma32 = PFN_DOWN(arm64_dma_phys_limit);
- zone_size[ZONE_DMA] = max_dma - min;
-#endif
-#ifdef CONFIG_ZONE_DMA32
- max_dma32 = PFN_DOWN(arm64_dma32_phys_limit);
- zone_size[ZONE_DMA32] = max_dma32 - max_dma;
-#endif
- zone_size[ZONE_NORMAL] = max - max_dma32;
-
- memcpy(zhole_size, zone_size, sizeof(zhole_size));
-
- for_each_memblock(memory, reg) {
- unsigned long start = memblock_region_memory_base_pfn(reg);
- unsigned long end = memblock_region_memory_end_pfn(reg);
-
-#ifdef CONFIG_ZONE_DMA
- if (start >= min && start < max_dma) {
- unsigned long dma_end = min(end, max_dma);
- zhole_size[ZONE_DMA] -= dma_end - start;
- start = dma_end;
- }
-#endif
-#ifdef CONFIG_ZONE_DMA32
- if (start >= max_dma && start < max_dma32) {
- unsigned long dma32_end = min(end, max_dma32);
- zhole_size[ZONE_DMA32] -= dma32_end - start;
- start = dma32_end;
- }
-#endif
- if (start >= max_dma32 && start < max) {
- unsigned long normal_end = min(end, max);
- zhole_size[ZONE_NORMAL] -= normal_end - start;
- }
- }
-
- free_area_init_node(0, zone_size, min, zhole_size);
+ free_area_init(max_zone_pfns);
}
-#endif /* CONFIG_NUMA */
-
int pfn_valid(unsigned long pfn)
{
phys_addr_t addr = pfn << PAGE_SHIFT;
diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c
index 4decf1659700..aafcee3e3f7e 100644
--- a/arch/arm64/mm/numa.c
+++ b/arch/arm64/mm/numa.c
@@ -350,13 +350,16 @@ static int __init numa_register_nodes(void)
struct memblock_region *mblk;
/* Check that valid nid is set to memblks */
- for_each_memblock(memory, mblk)
- if (mblk->nid == NUMA_NO_NODE || mblk->nid >= MAX_NUMNODES) {
+ for_each_memblock(memory, mblk) {
+ int mblk_nid = memblock_get_region_node(mblk);
+
+ if (mblk_nid == NUMA_NO_NODE || mblk_nid >= MAX_NUMNODES) {
pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
- mblk->nid, mblk->base,
+ mblk_nid, mblk->base,
mblk->base + mblk->size - 1);
return -EINVAL;
}
+ }
/* Finally register nodes. */
for_each_node_mask(nid, numa_nodes_parsed) {
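Going through the accessor instead of dereferencing mblk->nid matters because, with HAVE_MEMBLOCK_NODE_MAP gone, the nid member of struct memblock_region is only present on configurations with multiple nodes; the helper compiles down to 0 otherwise. Roughly (config gate assumed)::

    #ifdef CONFIG_NEED_MULTIPLE_NODES
    static inline int memblock_get_region_node(const struct memblock_region *r)
    {
            return r->nid;
    }
    #else
    static inline int memblock_get_region_node(const struct memblock_region *r)
    {
            return 0;
    }
    #endif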
diff --git a/arch/c6x/mm/init.c b/arch/c6x/mm/init.c
index 9b374393a8f4..a97e51a3e26d 100644
--- a/arch/c6x/mm/init.c
+++ b/arch/c6x/mm/init.c
@@ -33,7 +33,7 @@ EXPORT_SYMBOL(empty_zero_page);
void __init paging_init(void)
{
struct pglist_data *pgdat = NODE_DATA(0);
- unsigned long zones_size[MAX_NR_ZONES] = {0, };
+ unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, };
empty_zero_page = (unsigned long) memblock_alloc(PAGE_SIZE,
PAGE_SIZE);
@@ -49,11 +49,9 @@ void __init paging_init(void)
/*
* Define zones
*/
- zones_size[ZONE_NORMAL] = (memory_end - PAGE_OFFSET) >> PAGE_SHIFT;
- pgdat->node_zones[ZONE_NORMAL].zone_start_pfn =
- __pa(PAGE_OFFSET) >> PAGE_SHIFT;
+ max_zone_pfn[ZONE_NORMAL] = memory_end >> PAGE_SHIFT;
- free_area_init(zones_size);
+ free_area_init(max_zone_pfn);
}
void __init mem_init(void)
diff --git a/arch/csky/include/asm/highmem.h b/arch/csky/include/asm/highmem.h
index a345a2f2c22e..14645e3d5cd5 100644
--- a/arch/csky/include/asm/highmem.h
+++ b/arch/csky/include/asm/highmem.h
@@ -30,22 +30,14 @@ extern pte_t *pkmap_page_table;
#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT)
#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT))
-extern void *kmap_high(struct page *page);
-extern void kunmap_high(struct page *page);
-
-extern void *kmap(struct page *page);
-extern void kunmap(struct page *page);
-extern void *kmap_atomic(struct page *page);
-extern void __kunmap_atomic(void *kvaddr);
+#define ARCH_HAS_KMAP_FLUSH_TLB
+extern void kmap_flush_tlb(unsigned long addr);
extern void *kmap_atomic_pfn(unsigned long pfn);
-extern struct page *kmap_atomic_to_page(void *ptr);
#define flush_cache_kmaps() do {} while (0)
extern void kmap_init(void);
-#define kmap_prot PAGE_KERNEL
-
#endif /* __KERNEL__ */
#endif /* __ASM_CSKY_HIGHMEM_H */
diff --git a/arch/csky/kernel/setup.c b/arch/csky/kernel/setup.c
index 819a9a7bf786..0481f4e34538 100644
--- a/arch/csky/kernel/setup.c
+++ b/arch/csky/kernel/setup.c
@@ -26,7 +26,9 @@ struct screen_info screen_info = {
static void __init csky_memblock_init(void)
{
- unsigned long zone_size[MAX_NR_ZONES];
+ unsigned long lowmem_size = PFN_DOWN(LOWMEM_LIMIT - PHYS_OFFSET_OFFSET);
+ unsigned long sseg_size = PFN_DOWN(SSEG_SIZE - PHYS_OFFSET_OFFSET);
+ unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 };
signed long size;
memblock_reserve(__pa(_stext), _end - _stext);
@@ -36,28 +38,22 @@ static void __init csky_memblock_init(void)
memblock_dump_all();
- memset(zone_size, 0, sizeof(zone_size));
-
min_low_pfn = PFN_UP(memblock_start_of_DRAM());
max_low_pfn = max_pfn = PFN_DOWN(memblock_end_of_DRAM());
size = max_pfn - min_low_pfn;
- if (size <= PFN_DOWN(SSEG_SIZE - PHYS_OFFSET_OFFSET))
- zone_size[ZONE_NORMAL] = size;
- else if (size < PFN_DOWN(LOWMEM_LIMIT - PHYS_OFFSET_OFFSET)) {
- zone_size[ZONE_NORMAL] =
- PFN_DOWN(SSEG_SIZE - PHYS_OFFSET_OFFSET);
- max_low_pfn = min_low_pfn + zone_size[ZONE_NORMAL];
- } else {
- zone_size[ZONE_NORMAL] =
- PFN_DOWN(LOWMEM_LIMIT - PHYS_OFFSET_OFFSET);
- max_low_pfn = min_low_pfn + zone_size[ZONE_NORMAL];
+ if (size >= lowmem_size) {
+ max_low_pfn = min_low_pfn + lowmem_size;
write_mmu_msa1(read_mmu_msa0() + SSEG_SIZE);
+ } else if (size > sseg_size) {
+ max_low_pfn = min_low_pfn + sseg_size;
}
+ max_zone_pfn[ZONE_NORMAL] = max_low_pfn;
+
#ifdef CONFIG_HIGHMEM
- zone_size[ZONE_HIGHMEM] = max_pfn - max_low_pfn;
+ max_zone_pfn[ZONE_HIGHMEM] = max_pfn;
highstart_pfn = max_low_pfn;
highend_pfn = max_pfn;
@@ -66,7 +62,7 @@ static void __init csky_memblock_init(void)
dma_contiguous_reserve(0);
- free_area_init_node(0, zone_size, min_low_pfn, NULL);
+ free_area_init(max_zone_pfn);
}
void __init setup_arch(char **cmdline_p)
diff --git a/arch/csky/mm/highmem.c b/arch/csky/mm/highmem.c
index 813129145f3d..3b3f622f5ae9 100644
--- a/arch/csky/mm/highmem.c
+++ b/arch/csky/mm/highmem.c
@@ -13,59 +13,39 @@ static pte_t *kmap_pte;
unsigned long highstart_pfn, highend_pfn;
-void *kmap(struct page *page)
+void kmap_flush_tlb(unsigned long addr)
{
- void *addr;
-
- might_sleep();
- if (!PageHighMem(page))
- return page_address(page);
- addr = kmap_high(page);
- flush_tlb_one((unsigned long)addr);
-
- return addr;
+ flush_tlb_one(addr);
}
-EXPORT_SYMBOL(kmap);
+EXPORT_SYMBOL(kmap_flush_tlb);
-void kunmap(struct page *page)
-{
- BUG_ON(in_interrupt());
- if (!PageHighMem(page))
- return;
- kunmap_high(page);
-}
-EXPORT_SYMBOL(kunmap);
+EXPORT_SYMBOL(kmap);
-void *kmap_atomic(struct page *page)
+void *kmap_atomic_high_prot(struct page *page, pgprot_t prot)
{
unsigned long vaddr;
int idx, type;
- preempt_disable();
- pagefault_disable();
- if (!PageHighMem(page))
- return page_address(page);
-
type = kmap_atomic_idx_push();
idx = type + KM_TYPE_NR*smp_processor_id();
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
#ifdef CONFIG_DEBUG_HIGHMEM
BUG_ON(!pte_none(*(kmap_pte - idx)));
#endif
- set_pte(kmap_pte-idx, mk_pte(page, PAGE_KERNEL));
+ set_pte(kmap_pte-idx, mk_pte(page, prot));
flush_tlb_one((unsigned long)vaddr);
return (void *)vaddr;
}
-EXPORT_SYMBOL(kmap_atomic);
+EXPORT_SYMBOL(kmap_atomic_high_prot);
-void __kunmap_atomic(void *kvaddr)
+void kunmap_atomic_high(void *kvaddr)
{
unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
int idx;
if (vaddr < FIXADDR_START)
- goto out;
+ return;
#ifdef CONFIG_DEBUG_HIGHMEM
idx = KM_TYPE_NR*smp_processor_id() + kmap_atomic_idx();
@@ -78,11 +58,8 @@ void __kunmap_atomic(void *kvaddr)
(void) idx; /* to kill a warning */
#endif
kmap_atomic_idx_pop();
-out:
- pagefault_enable();
- preempt_enable();
}
-EXPORT_SYMBOL(__kunmap_atomic);
+EXPORT_SYMBOL(kunmap_atomic_high);
/*
* This is the same as kmap_atomic() but can map memory that doesn't
@@ -104,19 +81,6 @@ void *kmap_atomic_pfn(unsigned long pfn)
return (void *) vaddr;
}
-struct page *kmap_atomic_to_page(void *ptr)
-{
- unsigned long idx, vaddr = (unsigned long)ptr;
- pte_t *pte;
-
- if (vaddr < FIXADDR_START)
- return virt_to_page(ptr);
-
- idx = virt_to_fix(vaddr);
- pte = kmap_pte - (idx - FIX_KMAP_BEGIN);
- return pte_page(*pte);
-}
-
static void __init kmap_pages_init(void)
{
unsigned long vaddr;
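After this consolidation the architecture only provides the genuinely highmem-specific pieces (kmap_atomic_high_prot(), kunmap_atomic_high(), and optionally kmap_flush_tlb()); the lowmem fast path and the preemption/pagefault handling live in the common header. A paraphrased sketch of the generic entry point (include/linux/highmem.h, shape assumed)::

    static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot)
    {
            preempt_disable();
            pagefault_disable();
            if (!PageHighMem(page))
                    return page_address(page);
            return kmap_atomic_high_prot(page, prot);
    }

    /*
     * kunmap_atomic() is symmetric: it calls kunmap_atomic_high() and then
     * re-enables page faults and preemption, which is why the arch copies
     * above no longer do so themselves.
     */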
diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c
index 1eab16b1a0bc..27a0020e3771 100644
--- a/arch/h8300/mm/init.c
+++ b/arch/h8300/mm/init.c
@@ -83,10 +83,10 @@ void __init paging_init(void)
start_mem, end_mem);
{
- unsigned long zones_size[MAX_NR_ZONES] = {0, };
+ unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, };
- zones_size[ZONE_NORMAL] = (end_mem - PAGE_OFFSET) >> PAGE_SHIFT;
- free_area_init(zones_size);
+ max_zone_pfn[ZONE_NORMAL] = end_mem >> PAGE_SHIFT;
+ free_area_init(max_zone_pfn);
}
}
diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c
index c961773a6fff..f2e6c868e477 100644
--- a/arch/hexagon/mm/init.c
+++ b/arch/hexagon/mm/init.c
@@ -91,7 +91,7 @@ void sync_icache_dcache(pte_t pte)
*/
void __init paging_init(void)
{
- unsigned long zones_sizes[MAX_NR_ZONES] = {0, };
+ unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, };
/*
* This is not particularly well documented anywhere, but
@@ -101,9 +101,9 @@ void __init paging_init(void)
* adjust accordingly.
*/
- zones_sizes[ZONE_NORMAL] = max_low_pfn;
+ max_zone_pfn[ZONE_NORMAL] = max_low_pfn;
- free_area_init(zones_sizes); /* sets up the zonelists and mem_map */
+ free_area_init(max_zone_pfn); /* sets up the zonelists and mem_map */
/*
* Start of high memory area. Will probably need something more
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index bab7cd878464..88b05b5256a9 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -31,7 +31,6 @@ config IA64
select HAVE_FUNCTION_TRACER
select TTY
select HAVE_ARCH_TRACEHOOK
- select HAVE_MEMBLOCK_NODE_MAP
select HAVE_VIRT_CPU_ACCOUNTING
select DMA_NONCOHERENT_MMAP
select ARCH_HAS_SYNC_DMA_FOR_CPU
diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h
index 36cc0396b214..7e46ebde8c0c 100644
--- a/arch/ia64/include/asm/hugetlb.h
+++ b/arch/ia64/include/asm/hugetlb.h
@@ -20,6 +20,7 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
return (REGION_NUMBER(addr) == RGN_HPAGE ||
REGION_NUMBER((addr)+(len)-1) == RGN_HPAGE);
}
+#define is_hugepage_only_range is_hugepage_only_range
#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
@@ -27,10 +28,6 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
{
}
-static inline void arch_clear_hugepage_flags(struct page *page)
-{
-}
-
#include <asm-generic/hugetlb.h>
#endif /* _ASM_IA64_HUGETLB_H */
diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c
index 5b00dc3898e1..8786fa5c7612 100644
--- a/arch/ia64/mm/contig.c
+++ b/arch/ia64/mm/contig.c
@@ -210,6 +210,6 @@ paging_init (void)
printk("Virtual mem_map starts at 0x%p\n", mem_map);
}
#endif /* !CONFIG_VIRTUAL_MEM_MAP */
- free_area_init_nodes(max_zone_pfns);
+ free_area_init(max_zone_pfns);
zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
}
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index 4f33f6e7e206..dd8284bcbf16 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -627,7 +627,7 @@ void __init paging_init(void)
max_zone_pfns[ZONE_DMA32] = max_dma;
#endif
max_zone_pfns[ZONE_NORMAL] = max_pfn;
- free_area_init_nodes(max_zone_pfns);
+ free_area_init(max_zone_pfns);
zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
}
diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c
index b88d510d4fe3..6d3147662ff2 100644
--- a/arch/m68k/mm/init.c
+++ b/arch/m68k/mm/init.c
@@ -84,7 +84,7 @@ void __init paging_init(void)
* page_alloc get different views of the world.
*/
unsigned long end_mem = memory_end & PAGE_MASK;
- unsigned long zones_size[MAX_NR_ZONES] = { 0, };
+ unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, };
high_memory = (void *) end_mem;
@@ -98,8 +98,8 @@ void __init paging_init(void)
*/
set_fs (USER_DS);
- zones_size[ZONE_DMA] = (end_mem - PAGE_OFFSET) >> PAGE_SHIFT;
- free_area_init(zones_size);
+ max_zone_pfn[ZONE_DMA] = end_mem >> PAGE_SHIFT;
+ free_area_init(max_zone_pfn);
}
#endif /* CONFIG_MMU */
diff --git a/arch/m68k/mm/mcfmmu.c b/arch/m68k/mm/mcfmmu.c
index 0ea375607767..80064e6d064f 100644
--- a/arch/m68k/mm/mcfmmu.c
+++ b/arch/m68k/mm/mcfmmu.c
@@ -39,7 +39,7 @@ void __init paging_init(void)
pte_t *pg_table;
unsigned long address, size;
unsigned long next_pgtable, bootmem_end;
- unsigned long zones_size[MAX_NR_ZONES];
+ unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 };
enum zone_type zone;
int i;
@@ -80,11 +80,8 @@ void __init paging_init(void)
}
current->mm = NULL;
-
- for (zone = 0; zone < MAX_NR_ZONES; zone++)
- zones_size[zone] = 0x0;
- zones_size[ZONE_DMA] = num_pages;
- free_area_init(zones_size);
+ max_zone_pfn[ZONE_DMA] = PFN_DOWN(_ramend);
+ free_area_init(max_zone_pfn);
}
int cf_tlb_miss(struct pt_regs *regs, int write, int dtlb, int extension_word)
diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c
index fc16190ec2d6..904c2a663977 100644
--- a/arch/m68k/mm/motorola.c
+++ b/arch/m68k/mm/motorola.c
@@ -365,7 +365,7 @@ static void __init map_node(int node)
*/
void __init paging_init(void)
{
- unsigned long zones_size[MAX_NR_ZONES] = { 0, };
+ unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, };
unsigned long min_addr, max_addr;
unsigned long addr;
int i;
@@ -386,7 +386,7 @@ void __init paging_init(void)
min_addr = m68k_memory[0].addr;
max_addr = min_addr + m68k_memory[0].size;
- memblock_add(m68k_memory[0].addr, m68k_memory[0].size);
+ memblock_add_node(m68k_memory[0].addr, m68k_memory[0].size, 0);
for (i = 1; i < m68k_num_memory;) {
if (m68k_memory[i].addr < min_addr) {
printk("Ignoring memory chunk at 0x%lx:0x%lx before the first chunk\n",
@@ -397,7 +397,7 @@ void __init paging_init(void)
(m68k_num_memory - i) * sizeof(struct m68k_mem_info));
continue;
}
- memblock_add(m68k_memory[i].addr, m68k_memory[i].size);
+ memblock_add_node(m68k_memory[i].addr, m68k_memory[i].size, i);
addr = m68k_memory[i].addr + m68k_memory[i].size;
if (addr > max_addr)
max_addr = addr;
@@ -448,11 +448,10 @@ void __init paging_init(void)
#ifdef DEBUG
printk ("before free_area_init\n");
#endif
- for (i = 0; i < m68k_num_memory; i++) {
- zones_size[ZONE_DMA] = m68k_memory[i].size >> PAGE_SHIFT;
- free_area_init_node(i, zones_size,
- m68k_memory[i].addr >> PAGE_SHIFT, NULL);
+ for (i = 0; i < m68k_num_memory; i++)
if (node_present_pages(i))
node_set_state(i, N_NORMAL_MEMORY);
- }
+
+ max_zone_pfn[ZONE_DMA] = memblock_end_of_DRAM();
+ free_area_init(max_zone_pfn);
}
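Registering each chunk with its node id up front is what lets the single free_area_init() call build the per-node zones that the removed per-node free_area_init_node() loop used to set up explicitly. The helper used here::

    int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid);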
diff --git a/arch/m68k/mm/sun3mmu.c b/arch/m68k/mm/sun3mmu.c
index eca1c46bb90a..5d8d956d9329 100644
--- a/arch/m68k/mm/sun3mmu.c
+++ b/arch/m68k/mm/sun3mmu.c
@@ -42,7 +42,7 @@ void __init paging_init(void)
unsigned long address;
unsigned long next_pgtable;
unsigned long bootmem_end;
- unsigned long zones_size[MAX_NR_ZONES] = { 0, };
+ unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, };
unsigned long size;
empty_zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
@@ -89,14 +89,10 @@ void __init paging_init(void)
current->mm = NULL;
/* memory sizing is a hack stolen from motorola.c.. hope it works for us */
- zones_size[ZONE_DMA] = ((unsigned long)high_memory - PAGE_OFFSET) >> PAGE_SHIFT;
+ max_zone_pfn[ZONE_DMA] = ((unsigned long)high_memory) >> PAGE_SHIFT;
/* I really wish I knew why the following change made things better... -- Sam */
-/* free_area_init(zones_size); */
- free_area_init_node(0, zones_size,
- (__pa(PAGE_OFFSET) >> PAGE_SHIFT) + 1, NULL);
+ free_area_init(max_zone_pfn);
}
-
-
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index 9606c244b5b8..d262ac0c8714 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -32,7 +32,6 @@ config MICROBLAZE
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER
- select HAVE_MEMBLOCK_NODE_MAP
select HAVE_OPROFILE
select HAVE_PCI
select IRQ_DOMAIN
diff --git a/arch/microblaze/include/asm/highmem.h b/arch/microblaze/include/asm/highmem.h
index 332c78e15198..284ca8fb54c1 100644
--- a/arch/microblaze/include/asm/highmem.h
+++ b/arch/microblaze/include/asm/highmem.h
@@ -26,7 +26,6 @@
#include <asm/fixmap.h>
extern pte_t *kmap_pte;
-extern pgprot_t kmap_prot;
extern pte_t *pkmap_page_table;
/*
@@ -51,32 +50,6 @@ extern pte_t *pkmap_page_table;
#define PKMAP_NR(virt) ((virt - PKMAP_BASE) >> PAGE_SHIFT)
#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT))
-extern void *kmap_high(struct page *page);
-extern void kunmap_high(struct page *page);
-extern void *kmap_atomic_prot(struct page *page, pgprot_t prot);
-extern void __kunmap_atomic(void *kvaddr);
-
-static inline void *kmap(struct page *page)
-{
- might_sleep();
- if (!PageHighMem(page))
- return page_address(page);
- return kmap_high(page);
-}
-
-static inline void kunmap(struct page *page)
-{
- BUG_ON(in_interrupt());
- if (!PageHighMem(page))
- return;
- kunmap_high(page);
-}
-
-static inline void *kmap_atomic(struct page *page)
-{
- return kmap_atomic_prot(page, kmap_prot);
-}
-
#define flush_cache_kmaps() { flush_icache(); flush_dcache(); }
#endif /* __KERNEL__ */
diff --git a/arch/microblaze/mm/highmem.c b/arch/microblaze/mm/highmem.c
index d7569f77fa15..92e0890416c9 100644
--- a/arch/microblaze/mm/highmem.c
+++ b/arch/microblaze/mm/highmem.c
@@ -32,18 +32,12 @@
*/
#include <asm/tlbflush.h>
-void *kmap_atomic_prot(struct page *page, pgprot_t prot)
+void *kmap_atomic_high_prot(struct page *page, pgprot_t prot)
{
unsigned long vaddr;
int idx, type;
- preempt_disable();
- pagefault_disable();
- if (!PageHighMem(page))
- return page_address(page);
-
-
type = kmap_atomic_idx_push();
idx = type + KM_TYPE_NR*smp_processor_id();
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
@@ -55,19 +49,16 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
return (void *) vaddr;
}
-EXPORT_SYMBOL(kmap_atomic_prot);
+EXPORT_SYMBOL(kmap_atomic_high_prot);
-void __kunmap_atomic(void *kvaddr)
+void kunmap_atomic_high(void *kvaddr)
{
unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
int type;
unsigned int idx;
- if (vaddr < __fix_to_virt(FIX_KMAP_END)) {
- pagefault_enable();
- preempt_enable();
+ if (vaddr < __fix_to_virt(FIX_KMAP_END))
return;
- }
type = kmap_atomic_idx();
@@ -83,7 +74,5 @@ void __kunmap_atomic(void *kvaddr)
local_flush_tlb_page(NULL, vaddr);
kmap_atomic_idx_pop();
- pagefault_enable();
- preempt_enable();
}
-EXPORT_SYMBOL(__kunmap_atomic);
+EXPORT_SYMBOL(kunmap_atomic_high);
diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c
index 1ffbfa96b9b8..d943f69784b1 100644
--- a/arch/microblaze/mm/init.c
+++ b/arch/microblaze/mm/init.c
@@ -49,8 +49,6 @@ unsigned long lowmem_size;
#ifdef CONFIG_HIGHMEM
pte_t *kmap_pte;
EXPORT_SYMBOL(kmap_pte);
-pgprot_t kmap_prot;
-EXPORT_SYMBOL(kmap_prot);
static inline pte_t *virt_to_kpte(unsigned long vaddr)
{
@@ -68,7 +66,6 @@ static void __init highmem_init(void)
pkmap_page_table = virt_to_kpte(PKMAP_BASE);
kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN));
- kmap_prot = PAGE_KERNEL;
}
static void highmem_setup(void)
@@ -112,7 +109,7 @@ static void __init paging_init(void)
#endif
/* We don't have holes in memory map */
- free_area_init_nodes(zones_size);
+ free_area_init(zones_size);
}
void __init setup_memory(void)
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 3d31297f49d4..3ca59b610a67 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -72,7 +72,6 @@ config MIPS
select HAVE_KPROBES
select HAVE_KRETPROBES
select HAVE_LD_DEAD_CODE_DATA_ELIMINATION
- select HAVE_MEMBLOCK_NODE_MAP
select HAVE_MOD_ARCH_SPECIFIC
select HAVE_NMI
select HAVE_OPROFILE
diff --git a/arch/mips/include/asm/highmem.h b/arch/mips/include/asm/highmem.h
index 9d84aafc33d0..f1f788b57166 100644
--- a/arch/mips/include/asm/highmem.h
+++ b/arch/mips/include/asm/highmem.h
@@ -46,21 +46,14 @@ extern pte_t *pkmap_page_table;
#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT)
#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT))
-extern void * kmap_high(struct page *page);
-extern void kunmap_high(struct page *page);
-
-extern void *kmap(struct page *page);
-extern void kunmap(struct page *page);
-extern void *kmap_atomic(struct page *page);
-extern void __kunmap_atomic(void *kvaddr);
+#define ARCH_HAS_KMAP_FLUSH_TLB
+extern void kmap_flush_tlb(unsigned long addr);
extern void *kmap_atomic_pfn(unsigned long pfn);
#define flush_cache_kmaps() BUG_ON(cpu_has_dc_aliases)
extern void kmap_init(void);
-#define kmap_prot PAGE_KERNEL
-
#endif /* __KERNEL__ */
#endif /* _ASM_HIGHMEM_H */
diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h
index 425bb6fc3bda..10e3be870df7 100644
--- a/arch/mips/include/asm/hugetlb.h
+++ b/arch/mips/include/asm/hugetlb.h
@@ -11,13 +11,6 @@
#include <asm/page.h>
-static inline int is_hugepage_only_range(struct mm_struct *mm,
- unsigned long addr,
- unsigned long len)
-{
- return 0;
-}
-
#define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE
static inline int prepare_hugepage_range(struct file *file,
unsigned long addr,
@@ -82,10 +75,6 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
return changed;
}
-static inline void arch_clear_hugepage_flags(struct page *page)
-{
-}
-
#include <asm-generic/hugetlb.h>
#endif /* __ASM_HUGETLB_H */
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index 9b01d2d78753..63c42054b1a4 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -672,7 +672,7 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
return pmd;
}
-static inline pmd_t pmd_mknotpresent(pmd_t pmd)
+static inline pmd_t pmd_mkinvalid(pmd_t pmd)
{
pmd_val(pmd) &= ~(_PAGE_PRESENT | _PAGE_VALID | _PAGE_DIRTY);
diff --git a/arch/mips/loongson64/numa.c b/arch/mips/loongson64/numa.c
index 1ae072df4831..901f5be5ee76 100644
--- a/arch/mips/loongson64/numa.c
+++ b/arch/mips/loongson64/numa.c
@@ -247,7 +247,7 @@ void __init paging_init(void)
zones_size[ZONE_DMA32] = MAX_DMA32_PFN;
#endif
zones_size[ZONE_NORMAL] = max_low_pfn;
- free_area_init_nodes(zones_size);
+ free_area_init(zones_size);
}
void __init mem_init(void)
diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c
index ad6df1cea866..3e81ba000096 100644
--- a/arch/mips/mm/cache.c
+++ b/arch/mips/mm/cache.c
@@ -14,9 +14,9 @@
#include <linux/sched.h>
#include <linux/syscalls.h>
#include <linux/mm.h>
+#include <linux/highmem.h>
#include <asm/cacheflush.h>
-#include <asm/highmem.h>
#include <asm/processor.h>
#include <asm/cpu.h>
#include <asm/cpu-features.h>
@@ -103,7 +103,7 @@ void __flush_dcache_page(struct page *page)
flush_data_cache_page(addr);
if (PageHighMem(page))
- __kunmap_atomic((void *)addr);
+ kunmap_atomic((void *)addr);
}
EXPORT_SYMBOL(__flush_dcache_page);
@@ -146,7 +146,7 @@ void __update_cache(unsigned long address, pte_t pte)
flush_data_cache_page(addr);
if (PageHighMem(page))
- __kunmap_atomic((void *)addr);
+ kunmap_atomic((void *)addr);
ClearPageDcacheDirty(page);
}
diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c
index d08e6d7d533b..8e8726992720 100644
--- a/arch/mips/mm/highmem.c
+++ b/arch/mips/mm/highmem.c
@@ -12,71 +12,37 @@ static pte_t *kmap_pte;
unsigned long highstart_pfn, highend_pfn;
-void *kmap(struct page *page)
+void kmap_flush_tlb(unsigned long addr)
{
- void *addr;
-
- might_sleep();
- if (!PageHighMem(page))
- return page_address(page);
- addr = kmap_high(page);
- flush_tlb_one((unsigned long)addr);
-
- return addr;
+ flush_tlb_one(addr);
}
-EXPORT_SYMBOL(kmap);
+EXPORT_SYMBOL(kmap_flush_tlb);
-void kunmap(struct page *page)
-{
- BUG_ON(in_interrupt());
- if (!PageHighMem(page))
- return;
- kunmap_high(page);
-}
-EXPORT_SYMBOL(kunmap);
-
-/*
- * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
- * no global lock is needed and because the kmap code must perform a global TLB
- * invalidation when the kmap pool wraps.
- *
- * However when holding an atomic kmap is is not legal to sleep, so atomic
- * kmaps are appropriate for short, tight code paths only.
- */
-
-void *kmap_atomic(struct page *page)
+void *kmap_atomic_high_prot(struct page *page, pgprot_t prot)
{
unsigned long vaddr;
int idx, type;
- preempt_disable();
- pagefault_disable();
- if (!PageHighMem(page))
- return page_address(page);
-
type = kmap_atomic_idx_push();
idx = type + KM_TYPE_NR*smp_processor_id();
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
#ifdef CONFIG_DEBUG_HIGHMEM
BUG_ON(!pte_none(*(kmap_pte - idx)));
#endif
- set_pte(kmap_pte-idx, mk_pte(page, PAGE_KERNEL));
+ set_pte(kmap_pte-idx, mk_pte(page, prot));
local_flush_tlb_one((unsigned long)vaddr);
return (void*) vaddr;
}
-EXPORT_SYMBOL(kmap_atomic);
+EXPORT_SYMBOL(kmap_atomic_high_prot);
-void __kunmap_atomic(void *kvaddr)
+void kunmap_atomic_high(void *kvaddr)
{
unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
int type __maybe_unused;
- if (vaddr < FIXADDR_START) { // FIXME
- pagefault_enable();
- preempt_enable();
+ if (vaddr < FIXADDR_START)
return;
- }
type = kmap_atomic_idx();
#ifdef CONFIG_DEBUG_HIGHMEM
@@ -94,10 +60,8 @@ void __kunmap_atomic(void *kvaddr)
}
#endif
kmap_atomic_idx_pop();
- pagefault_enable();
- preempt_enable();
}
-EXPORT_SYMBOL(__kunmap_atomic);
+EXPORT_SYMBOL(kunmap_atomic_high);
/*
* This is the same as kmap_atomic() but can map memory that doesn't
diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
index 620ebfa45ec1..7c9f0c0a6cd3 100644
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -424,7 +424,7 @@ void __init paging_init(void)
}
#endif
- free_area_init_nodes(max_zone_pfns);
+ free_area_init(max_zone_pfns);
}
#ifdef CONFIG_64BIT
diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c
index a45691e6ab90..1213215ea965 100644
--- a/arch/mips/sgi-ip27/ip27-memory.c
+++ b/arch/mips/sgi-ip27/ip27-memory.c
@@ -419,7 +419,7 @@ void __init paging_init(void)
pagetable_init();
zones_size[ZONE_NORMAL] = max_low_pfn;
- free_area_init_nodes(zones_size);
+ free_area_init(zones_size);
}
void __init mem_init(void)
diff --git a/arch/nds32/include/asm/highmem.h b/arch/nds32/include/asm/highmem.h
index b3a82c97ded3..5717647d14d1 100644
--- a/arch/nds32/include/asm/highmem.h
+++ b/arch/nds32/include/asm/highmem.h
@@ -32,7 +32,6 @@
#define LAST_PKMAP_MASK (LAST_PKMAP - 1)
#define PKMAP_NR(virt) (((virt) - (PKMAP_BASE)) >> PAGE_SHIFT)
#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT))
-#define kmap_prot PAGE_KERNEL
static inline void flush_cache_kmaps(void)
{
@@ -44,9 +43,6 @@ extern unsigned long highstart_pfn, highend_pfn;
extern pte_t *pkmap_page_table;
-extern void *kmap_high(struct page *page);
-extern void kunmap_high(struct page *page);
-
extern void kmap_init(void);
/*
@@ -54,12 +50,7 @@ extern void kmap_init(void);
* when CONFIG_HIGHMEM is not set.
*/
#ifdef CONFIG_HIGHMEM
-extern void *kmap(struct page *page);
-extern void kunmap(struct page *page);
-extern void *kmap_atomic(struct page *page);
-extern void __kunmap_atomic(void *kvaddr);
extern void *kmap_atomic_pfn(unsigned long pfn);
-extern struct page *kmap_atomic_to_page(void *ptr);
#endif
#endif
diff --git a/arch/nds32/mm/highmem.c b/arch/nds32/mm/highmem.c
index 022779af6148..4284cd59e21a 100644
--- a/arch/nds32/mm/highmem.c
+++ b/arch/nds32/mm/highmem.c
@@ -10,45 +10,18 @@
#include <asm/fixmap.h>
#include <asm/tlbflush.h>
-void *kmap(struct page *page)
-{
- unsigned long vaddr;
- might_sleep();
- if (!PageHighMem(page))
- return page_address(page);
- vaddr = (unsigned long)kmap_high(page);
- return (void *)vaddr;
-}
-
-EXPORT_SYMBOL(kmap);
-
-void kunmap(struct page *page)
-{
- BUG_ON(in_interrupt());
- if (!PageHighMem(page))
- return;
- kunmap_high(page);
-}
-
-EXPORT_SYMBOL(kunmap);
-
-void *kmap_atomic(struct page *page)
+void *kmap_atomic_high_prot(struct page *page, pgprot_t prot)
{
unsigned int idx;
unsigned long vaddr, pte;
int type;
pte_t *ptep;
- preempt_disable();
- pagefault_disable();
- if (!PageHighMem(page))
- return page_address(page);
-
type = kmap_atomic_idx_push();
idx = type + KM_TYPE_NR * smp_processor_id();
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
- pte = (page_to_pfn(page) << PAGE_SHIFT) | (PAGE_KERNEL);
+ pte = (page_to_pfn(page) << PAGE_SHIFT) | prot;
ptep = pte_offset_kernel(pmd_off_k(vaddr), vaddr);
set_pte(ptep, pte);
@@ -58,10 +31,9 @@ void *kmap_atomic(struct page *page)
__nds32__isb();
return (void *)vaddr;
}
+EXPORT_SYMBOL(kmap_atomic_high_prot);
-EXPORT_SYMBOL(kmap_atomic);
-
-void __kunmap_atomic(void *kvaddr)
+void kunmap_atomic_high(void *kvaddr)
{
if (kvaddr >= (void *)FIXADDR_START) {
unsigned long vaddr = (unsigned long)kvaddr;
@@ -72,8 +44,5 @@ void __kunmap_atomic(void *kvaddr)
ptep = pte_offset_kernel(pmd_off_k(vaddr), vaddr);
set_pte(ptep, 0);
}
- pagefault_enable();
- preempt_enable();
}
-
-EXPORT_SYMBOL(__kunmap_atomic);
+EXPORT_SYMBOL(kunmap_atomic_high);
diff --git a/arch/nds32/mm/init.c b/arch/nds32/mm/init.c
index 0be3833f6814..91147cca4b64 100644
--- a/arch/nds32/mm/init.c
+++ b/arch/nds32/mm/init.c
@@ -31,16 +31,13 @@ EXPORT_SYMBOL(empty_zero_page);
static void __init zone_sizes_init(void)
{
- unsigned long zones_size[MAX_NR_ZONES];
+ unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 };
- /* Clear the zone sizes */
- memset(zones_size, 0, sizeof(zones_size));
-
- zones_size[ZONE_NORMAL] = max_low_pfn;
+ max_zone_pfn[ZONE_NORMAL] = max_low_pfn;
#ifdef CONFIG_HIGHMEM
- zones_size[ZONE_HIGHMEM] = max_pfn;
+ max_zone_pfn[ZONE_HIGHMEM] = max_pfn;
#endif
- free_area_init(zones_size);
+ free_area_init(max_zone_pfn);
}
diff --git a/arch/nios2/mm/init.c b/arch/nios2/mm/init.c
index 2c609c2516b2..9afca77d10b1 100644
--- a/arch/nios2/mm/init.c
+++ b/arch/nios2/mm/init.c
@@ -46,17 +46,15 @@ pgd_t *pgd_current;
*/
void __init paging_init(void)
{
- unsigned long zones_size[MAX_NR_ZONES];
-
- memset(zones_size, 0, sizeof(zones_size));
+ unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 };
pagetable_init();
pgd_current = swapper_pg_dir;
- zones_size[ZONE_NORMAL] = max_mapnr;
+ max_zone_pfn[ZONE_NORMAL] = max_mapnr;
/* pass the memory from the bootmem allocator to the main allocator */
- free_area_init(zones_size);
+ free_area_init(max_zone_pfn);
flush_dcache_range((unsigned long)empty_zero_page,
(unsigned long)empty_zero_page + PAGE_SIZE);
diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c
index 1f87b524db78..f94fe6d3f499 100644
--- a/arch/openrisc/mm/init.c
+++ b/arch/openrisc/mm/init.c
@@ -45,17 +45,14 @@ DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
static void __init zone_sizes_init(void)
{
- unsigned long zones_size[MAX_NR_ZONES];
-
- /* Clear the zone sizes */
- memset(zones_size, 0, sizeof(zones_size));
+ unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 };
/*
* We use only ZONE_NORMAL
*/
- zones_size[ZONE_NORMAL] = max_low_pfn;
+ max_zone_pfn[ZONE_NORMAL] = max_low_pfn;
- free_area_init(zones_size);
+ free_area_init(max_zone_pfn);
}
extern const char _s_kernel_ro[], _e_kernel_ro[];
diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h
index 0c83644bfa5c..99663fc1f997 100644
--- a/arch/parisc/include/asm/cacheflush.h
+++ b/arch/parisc/include/asm/cacheflush.h
@@ -100,37 +100,11 @@ flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vma
}
}
-#include <asm/kmap_types.h>
-
-#define ARCH_HAS_KMAP
-
-static inline void *kmap(struct page *page)
-{
- might_sleep();
- return page_address(page);
-}
-
-static inline void kunmap(struct page *page)
-{
- flush_kernel_dcache_page_addr(page_address(page));
-}
-
-static inline void *kmap_atomic(struct page *page)
-{
- preempt_disable();
- pagefault_disable();
- return page_address(page);
-}
-
-static inline void __kunmap_atomic(void *addr)
+#define ARCH_HAS_FLUSH_ON_KUNMAP
+static inline void kunmap_flush_on_unmap(void *addr)
{
flush_kernel_dcache_page_addr(addr);
- pagefault_enable();
- preempt_enable();
}
-#define kmap_atomic_prot(page, prot) kmap_atomic(page)
-#define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn))
-
#endif /* _PARISC_CACHEFLUSH_H */
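parisc has no highmem, so all its kmap helpers ever did beyond page_address() was flush the kernel dcache on unmap. That single requirement is now expressed as one hook which the generic !CONFIG_HIGHMEM code invokes, roughly (paraphrased, shape assumed)::

    static inline void kunmap_sketch(struct page *page)
    {
    #ifdef ARCH_HAS_FLUSH_ON_KUNMAP
            kunmap_flush_on_unmap(page_address(page));
    #endif
    }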
diff --git a/arch/parisc/include/asm/hugetlb.h b/arch/parisc/include/asm/hugetlb.h
index 7cb595dcb7d7..a69cf9efb0c1 100644
--- a/arch/parisc/include/asm/hugetlb.h
+++ b/arch/parisc/include/asm/hugetlb.h
@@ -12,12 +12,6 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep);
-static inline int is_hugepage_only_range(struct mm_struct *mm,
- unsigned long addr,
- unsigned long len) {
- return 0;
-}
-
/*
* If the arch doesn't supply something else, assume that hugepage
* size aligned regions are ok without further preparation.
@@ -48,10 +42,6 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep,
pte_t pte, int dirty);
-static inline void arch_clear_hugepage_flags(struct page *page)
-{
-}
-
#include <asm-generic/hugetlb.h>
#endif /* _ASM_PARISC64_HUGETLB_H */
diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h
index 9832c73a7021..cd7df48dc874 100644
--- a/arch/parisc/include/asm/pgtable.h
+++ b/arch/parisc/include/asm/pgtable.h
@@ -93,10 +93,8 @@ static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr)
#define set_pte_at(mm, addr, ptep, pteval) \
do { \
- pte_t old_pte; \
unsigned long flags; \
spin_lock_irqsave(pgd_spinlock((mm)->pgd), flags);\
- old_pte = *ptep; \
set_pte(ptep, pteval); \
purge_tlb_entries(mm, addr); \
spin_unlock_irqrestore(pgd_spinlock((mm)->pgd), flags);\
diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index 5224fb38d766..02d2fdb85dcc 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -675,27 +675,11 @@ static void __init gateway_init(void)
static void __init parisc_bootmem_free(void)
{
- unsigned long zones_size[MAX_NR_ZONES] = { 0, };
- unsigned long holes_size[MAX_NR_ZONES] = { 0, };
- unsigned long mem_start_pfn = ~0UL, mem_end_pfn = 0, mem_size_pfn = 0;
- int i;
-
- for (i = 0; i < npmem_ranges; i++) {
- unsigned long start = pmem_ranges[i].start_pfn;
- unsigned long size = pmem_ranges[i].pages;
- unsigned long end = start + size;
-
- if (mem_start_pfn > start)
- mem_start_pfn = start;
- if (mem_end_pfn < end)
- mem_end_pfn = end;
- mem_size_pfn += size;
- }
+ unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, };
- zones_size[0] = mem_end_pfn - mem_start_pfn;
- holes_size[0] = zones_size[0] - mem_size_pfn;
+ max_zone_pfn[0] = memblock_end_of_DRAM();
- free_area_init_node(0, zones_size, mem_start_pfn, holes_size);
+ free_area_init(max_zone_pfn);
}
void __init paging_init(void)
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 1dfa59126fcf..782df1cc4674 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -116,6 +116,7 @@ config PPC
#
select ARCH_32BIT_OFF_T if PPC32
select ARCH_HAS_DEBUG_VIRTUAL
+ select ARCH_HAS_DEBUG_VM_PGTABLE
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FORTIFY_SOURCE
@@ -210,7 +211,6 @@ config PPC
select HAVE_KRETPROBES
select HAVE_LD_DEAD_CODE_DATA_ELIMINATION
select HAVE_LIVEPATCH if HAVE_DYNAMIC_FTRACE_WITH_REGS
- select HAVE_MEMBLOCK_NODE_MAP
select HAVE_MOD_ARCH_SPECIFIC
select HAVE_NMI if PERF_EVENTS || (PPC64 && PPC_BOOK3S)
select HAVE_HARDLOCKUP_DETECTOR_ARCH if (PPC64 && PPC_BOOK3S)
@@ -686,15 +686,6 @@ config ARCH_MEMORY_PROBE
def_bool y
depends on MEMORY_HOTPLUG
-# Some NUMA nodes have memory ranges that span
-# other nodes. Even though a pfn is valid and
-# between a node's start and end pfns, it may not
-# reside on that node. See memmap_init_zone()
-# for details.
-config NODES_SPAN_OTHER_NODES
- def_bool y
- depends on NEED_MULTIPLE_NODES
-
config STDBINUTILS
bool "Using standard binutils settings"
depends on 44x
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index e1f551159f7d..83b605806d9c 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1173,10 +1173,6 @@ static inline int pmd_large(pmd_t pmd)
return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE));
}
-static inline pmd_t pmd_mknotpresent(pmd_t pmd)
-{
- return __pmd(pmd_val(pmd) & ~_PAGE_PRESENT);
-}
/*
* For radix we should always find H_PAGE_HASHPTE zero. Hence
* the below will work for radix too
diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h
index a4b65b186ec6..104026f7d6bc 100644
--- a/arch/powerpc/include/asm/highmem.h
+++ b/arch/powerpc/include/asm/highmem.h
@@ -30,7 +30,6 @@
#include <asm/fixmap.h>
extern pte_t *kmap_pte;
-extern pgprot_t kmap_prot;
extern pte_t *pkmap_page_table;
/*
@@ -59,33 +58,6 @@ extern pte_t *pkmap_page_table;
#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT)
#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT))
-extern void *kmap_high(struct page *page);
-extern void kunmap_high(struct page *page);
-extern void *kmap_atomic_prot(struct page *page, pgprot_t prot);
-extern void __kunmap_atomic(void *kvaddr);
-
-static inline void *kmap(struct page *page)
-{
- might_sleep();
- if (!PageHighMem(page))
- return page_address(page);
- return kmap_high(page);
-}
-
-static inline void kunmap(struct page *page)
-{
- BUG_ON(in_interrupt());
- if (!PageHighMem(page))
- return;
- kunmap_high(page);
-}
-
-static inline void *kmap_atomic(struct page *page)
-{
- return kmap_atomic_prot(page, kmap_prot);
-}
-
-
#define flush_cache_kmaps() flush_cache_all()
#endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index bd6504c28c2f..e6dfa63da552 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -30,6 +30,7 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
return slice_is_hugepage_only_range(mm, addr, len);
return 0;
}
+#define is_hugepage_only_range is_hugepage_only_range
#define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE
void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
@@ -60,10 +61,6 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep,
pte_t pte, int dirty);
-static inline void arch_clear_hugepage_flags(struct page *page)
-{
-}
-
#include <asm-generic/hugetlb.h>
#else /* ! CONFIG_HUGETLB_PAGE */
diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index 635969b5b58e..13f90dd03450 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -699,10 +699,6 @@ static inline void iosync(void)
*
* * iounmap undoes such a mapping and can be hooked
*
- * * __ioremap_at (and the pending __iounmap_at) are low level functions to
- * create hand-made mappings for use only by the PCI code and cannot
- * currently be hooked. Must be page aligned.
- *
* * __ioremap_caller is the same as above but takes an explicit caller
* reference rather than using __builtin_return_address(0)
*
@@ -719,6 +715,8 @@ void __iomem *ioremap_coherent(phys_addr_t address, unsigned long size);
extern void iounmap(volatile void __iomem *addr);
+void __iomem *ioremap_phb(phys_addr_t paddr, unsigned long size);
+
int early_ioremap_range(unsigned long ea, phys_addr_t pa,
unsigned long size, pgprot_t prot);
void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size,
@@ -727,10 +725,6 @@ void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size,
extern void __iomem *__ioremap_caller(phys_addr_t, unsigned long size,
pgprot_t prot, void *caller);
-extern void __iomem * __ioremap_at(phys_addr_t pa, void *ea,
- unsigned long size, pgprot_t prot);
-extern void __iounmap_at(void *ea, unsigned long size);
-
/*
* When CONFIG_PPC_INDIRECT_PIO is set, we use the generic iomap implementation
* which needs some additional definitions here. They basically allow PIO
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 69f4cb3b7c56..b92e81b256e5 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -66,7 +66,7 @@ struct pci_controller {
void __iomem *io_base_virt;
#ifdef CONFIG_PPC64
- void *io_base_alloc;
+ void __iomem *io_base_alloc;
#endif
resource_size_t io_base_phys;
resource_size_t pci_io_size;
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 1f1169856dc8..112d150354b2 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -748,9 +748,8 @@ void do_IRQ(struct pt_regs *regs)
static void *__init alloc_vm_stack(void)
{
- return __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN, VMALLOC_START,
- VMALLOC_END, THREADINFO_GFP, PAGE_KERNEL,
- 0, NUMA_NO_NODE, (void*)_RET_IP_);
+ return __vmalloc_node(THREAD_SIZE, THREAD_ALIGN, THREADINFO_GFP,
+ NUMA_NO_NODE, (void *)_RET_IP_);
}
static void __init vmap_irqstack_init(void)
diff --git a/arch/powerpc/kernel/isa-bridge.c b/arch/powerpc/kernel/isa-bridge.c
index 773671b512df..2257d24e6a26 100644
--- a/arch/powerpc/kernel/isa-bridge.c
+++ b/arch/powerpc/kernel/isa-bridge.c
@@ -18,6 +18,7 @@
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/notifier.h>
+#include <linux/vmalloc.h>
#include <asm/processor.h>
#include <asm/io.h>
@@ -38,6 +39,22 @@ EXPORT_SYMBOL_GPL(isa_bridge_pcidev);
#define ISA_SPACE_MASK 0x1
#define ISA_SPACE_IO 0x1
+static void remap_isa_base(phys_addr_t pa, unsigned long size)
+{
+ WARN_ON_ONCE(ISA_IO_BASE & ~PAGE_MASK);
+ WARN_ON_ONCE(pa & ~PAGE_MASK);
+ WARN_ON_ONCE(size & ~PAGE_MASK);
+
+ if (slab_is_available()) {
+ if (ioremap_page_range(ISA_IO_BASE, ISA_IO_BASE + size, pa,
+ pgprot_noncached(PAGE_KERNEL)))
+ unmap_kernel_range(ISA_IO_BASE, size);
+ } else {
+ early_ioremap_range(ISA_IO_BASE, pa, size,
+ pgprot_noncached(PAGE_KERNEL));
+ }
+}
+
static void pci_process_ISA_OF_ranges(struct device_node *isa_node,
unsigned long phb_io_base_phys)
{
@@ -105,15 +122,13 @@ static void pci_process_ISA_OF_ranges(struct device_node *isa_node,
if (size > 0x10000)
size = 0x10000;
- __ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE,
- size, pgprot_noncached(PAGE_KERNEL));
+ remap_isa_base(phb_io_base_phys, size);
return;
inval_range:
printk(KERN_ERR "no ISA IO ranges or unexpected isa range, "
"mapping 64k\n");
- __ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE,
- 0x10000, pgprot_noncached(PAGE_KERNEL));
+ remap_isa_base(phb_io_base_phys, 0x10000);
}
@@ -248,8 +263,7 @@ void __init isa_bridge_init_non_pci(struct device_node *np)
* and map it
*/
isa_io_base = ISA_IO_BASE;
- __ioremap_at(pbase, (void *)ISA_IO_BASE,
- size, pgprot_noncached(PAGE_KERNEL));
+ remap_isa_base(pbase, size);
pr_debug("ISA: Non-PCI bridge is %pOF\n", np);
}
@@ -297,7 +311,7 @@ static void isa_bridge_remove(void)
isa_bridge_pcidev = NULL;
/* Unmap the ISA area */
- __iounmap_at((void *)ISA_IO_BASE, 0x10000);
+ unmap_kernel_range(ISA_IO_BASE, 0x10000);
}
/**
diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c
index 6a932de18aa6..9312e6eda7ff 100644
--- a/arch/powerpc/kernel/pci_64.c
+++ b/arch/powerpc/kernel/pci_64.c
@@ -109,23 +109,47 @@ int pcibios_unmap_io_space(struct pci_bus *bus)
/* Get the host bridge */
hose = pci_bus_to_host(bus);
- /* Check if we have IOs allocated */
- if (hose->io_base_alloc == NULL)
- return 0;
-
pr_debug("IO unmapping for PHB %pOF\n", hose->dn);
pr_debug(" alloc=0x%p\n", hose->io_base_alloc);
- /* This is a PHB, we fully unmap the IO area */
- vunmap(hose->io_base_alloc);
-
+ iounmap(hose->io_base_alloc);
return 0;
}
EXPORT_SYMBOL_GPL(pcibios_unmap_io_space);
-static int pcibios_map_phb_io_space(struct pci_controller *hose)
+void __iomem *ioremap_phb(phys_addr_t paddr, unsigned long size)
{
struct vm_struct *area;
+ unsigned long addr;
+
+ WARN_ON_ONCE(paddr & ~PAGE_MASK);
+ WARN_ON_ONCE(size & ~PAGE_MASK);
+
+ /*
+ * Let's allocate some IO space for that guy. We don't pass VM_IOREMAP
+ * because we don't care about alignment tricks that the core does in
+ * that case. Maybe we should due to stupid card with incomplete
+ * address decoding but I'd rather not deal with those outside of the
+ * reserved 64K legacy region.
+ */
+ area = __get_vm_area_caller(size, 0, PHB_IO_BASE, PHB_IO_END,
+ __builtin_return_address(0));
+ if (!area)
+ return NULL;
+
+ addr = (unsigned long)area->addr;
+ if (ioremap_page_range(addr, addr + size, paddr,
+ pgprot_noncached(PAGE_KERNEL))) {
+ unmap_kernel_range(addr, size);
+ return NULL;
+ }
+
+ return (void __iomem *)addr;
+}
+EXPORT_SYMBOL_GPL(ioremap_phb);
+
+static int pcibios_map_phb_io_space(struct pci_controller *hose)
+{
unsigned long phys_page;
unsigned long size_page;
unsigned long io_virt_offset;
@@ -146,12 +170,11 @@ static int pcibios_map_phb_io_space(struct pci_controller *hose)
* with incomplete address decoding but I'd rather not deal with
* those outside of the reserved 64K legacy region.
*/
- area = __get_vm_area(size_page, 0, PHB_IO_BASE, PHB_IO_END);
- if (area == NULL)
+ hose->io_base_alloc = ioremap_phb(phys_page, size_page);
+ if (!hose->io_base_alloc)
return -ENOMEM;
- hose->io_base_alloc = area->addr;
- hose->io_base_virt = (void __iomem *)(area->addr +
- hose->io_base_phys - phys_page);
+ hose->io_base_virt = hose->io_base_alloc +
+ hose->io_base_phys - phys_page;
pr_debug("IO mapping for PHB %pOF\n", hose->dn);
pr_debug(" phys=0x%016llx, virt=0x%p (alloc=0x%p)\n",
@@ -159,11 +182,6 @@ static int pcibios_map_phb_io_space(struct pci_controller *hose)
pr_debug(" size=0x%016llx (alloc=0x%016lx)\n",
hose->pci_io_size, size_page);
- /* Establish the mapping */
- if (__ioremap_at(phys_page, area->addr, size_page,
- pgprot_noncached(PAGE_KERNEL)) == NULL)
- return -ENOMEM;
-
/* Fixup hose IO resource */
io_virt_offset = pcibios_io_space_offset(hose);
hose->io_resource.start += io_virt_offset;
diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c
index 320c1672b2ae..624b4438aff9 100644
--- a/arch/powerpc/mm/highmem.c
+++ b/arch/powerpc/mm/highmem.c
@@ -24,22 +24,11 @@
#include <linux/highmem.h>
#include <linux/module.h>
-/*
- * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap
- * gives a more generic (and caching) interface. But kmap_atomic can
- * be used in IRQ contexts, so in some (very limited) cases we need
- * it.
- */
-void *kmap_atomic_prot(struct page *page, pgprot_t prot)
+void *kmap_atomic_high_prot(struct page *page, pgprot_t prot)
{
unsigned long vaddr;
int idx, type;
- preempt_disable();
- pagefault_disable();
- if (!PageHighMem(page))
- return page_address(page);
-
type = kmap_atomic_idx_push();
idx = type + KM_TYPE_NR*smp_processor_id();
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
@@ -49,17 +38,14 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
return (void*) vaddr;
}
-EXPORT_SYMBOL(kmap_atomic_prot);
+EXPORT_SYMBOL(kmap_atomic_high_prot);
-void __kunmap_atomic(void *kvaddr)
+void kunmap_atomic_high(void *kvaddr)
{
unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
- if (vaddr < __fix_to_virt(FIX_KMAP_END)) {
- pagefault_enable();
- preempt_enable();
+ if (vaddr < __fix_to_virt(FIX_KMAP_END))
return;
- }
if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM)) {
int type = kmap_atomic_idx();
@@ -77,7 +63,5 @@ void __kunmap_atomic(void *kvaddr)
}
kmap_atomic_idx_pop();
- pagefault_enable();
- preempt_enable();
}
-EXPORT_SYMBOL(__kunmap_atomic);
+EXPORT_SYMBOL(kunmap_atomic_high);
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index d06efb946c7d..3b11e01611af 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -558,7 +558,7 @@ unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
return vma_kernel_pagesize(vma);
}
-static int __init add_huge_page_size(unsigned long long size)
+bool __init arch_hugetlb_valid_size(unsigned long size)
{
int shift = __ffs(size);
int mmu_psize;
@@ -566,37 +566,27 @@ static int __init add_huge_page_size(unsigned long long size)
/* Check that it is a page size supported by the hardware and
* that it fits within pagetable and slice limits. */
if (size <= PAGE_SIZE || !is_power_of_2(size))
- return -EINVAL;
+ return false;
mmu_psize = check_and_get_huge_psize(shift);
if (mmu_psize < 0)
- return -EINVAL;
+ return false;
BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);
- /* Return if huge page size has already been setup */
- if (size_to_hstate(size))
- return 0;
-
- hugetlb_add_hstate(shift - PAGE_SHIFT);
-
- return 0;
+ return true;
}
-static int __init hugepage_setup_sz(char *str)
+static int __init add_huge_page_size(unsigned long long size)
{
- unsigned long long size;
-
- size = memparse(str, &str);
+ int shift = __ffs(size);
- if (add_huge_page_size(size) != 0) {
- hugetlb_bad_size();
- pr_err("Invalid huge page size specified(%llu)\n", size);
- }
+ if (!arch_hugetlb_valid_size((unsigned long)size))
+ return -EINVAL;
- return 1;
+ hugetlb_add_hstate(shift - PAGE_SHIFT);
+ return 0;
}
-__setup("hugepagesz=", hugepage_setup_sz);
static int __init hugetlbpage_init(void)
{
diff --git a/arch/powerpc/mm/ioremap_64.c b/arch/powerpc/mm/ioremap_64.c
index 50a99d9684f7..ba5cbb0d66bd 100644
--- a/arch/powerpc/mm/ioremap_64.c
+++ b/arch/powerpc/mm/ioremap_64.c
@@ -4,56 +4,6 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
-/**
- * Low level function to establish the page tables for an IO mapping
- */
-void __iomem *__ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_t prot)
-{
- int ret;
- unsigned long va = (unsigned long)ea;
-
- /* We don't support the 4K PFN hack with ioremap */
- if (pgprot_val(prot) & H_PAGE_4K_PFN)
- return NULL;
-
- if ((ea + size) >= (void *)IOREMAP_END) {
- pr_warn("Outside the supported range\n");
- return NULL;
- }
-
- WARN_ON(pa & ~PAGE_MASK);
- WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
- WARN_ON(size & ~PAGE_MASK);
-
- if (slab_is_available()) {
- ret = ioremap_page_range(va, va + size, pa, prot);
- if (ret)
- unmap_kernel_range(va, size);
- } else {
- ret = early_ioremap_range(va, pa, size, prot);
- }
-
- if (ret)
- return NULL;
-
- return (void __iomem *)ea;
-}
-EXPORT_SYMBOL(__ioremap_at);
-
-/**
- * Low level function to tear down the page tables for an IO mapping. This is
- * used for mappings that are manipulated manually, like partial unmapping of
- * PCI IOs or ISA space.
- */
-void __iounmap_at(void *ea, unsigned long size)
-{
- WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
- WARN_ON(size & ~PAGE_MASK);
-
- unmap_kernel_range((unsigned long)ea, size);
-}
-EXPORT_SYMBOL(__iounmap_at);
-
void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size,
pgprot_t prot, void *caller)
{
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 041ed7cfd341..7cebb9c818d3 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -64,8 +64,6 @@ bool init_mem_is_free;
#ifdef CONFIG_HIGHMEM
pte_t *kmap_pte;
EXPORT_SYMBOL(kmap_pte);
-pgprot_t kmap_prot;
-EXPORT_SYMBOL(kmap_prot);
#endif
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
@@ -245,7 +243,6 @@ void __init paging_init(void)
pkmap_page_table = virt_to_kpte(PKMAP_BASE);
kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN));
- kmap_prot = PAGE_KERNEL;
#endif /* CONFIG_HIGHMEM */
printk(KERN_DEBUG "Top of RAM: 0x%llx, Total RAM: 0x%llx\n",
@@ -271,7 +268,7 @@ void __init paging_init(void)
max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
#endif
- free_area_init_nodes(max_zone_pfns);
+ free_area_init(max_zone_pfns);
mark_nonram_nosave();
}
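For context, this is the first of many conversions in this diff from free_area_init_nodes()/free_area_init_node(), which took per-zone sizes and hole sizes, to the simplified free_area_init(), which only takes the highest PFN each zone may cover and derives node extents and holes from memblock. The signature and typical use assumed by all of these call sites (the caller below is illustrative only):

void __init free_area_init(unsigned long *max_zone_pfn);

	unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0 };

	max_zone_pfns[ZONE_DMA]    = dma_limit_pfn;	/* arch-specific limit, if any */
	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
	free_area_init(max_zone_pfns);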
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
index b2cde1732301..5ace2f9a277e 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -337,39 +337,19 @@ static int pseries_remove_mem_node(struct device_node *np)
static bool lmb_is_removable(struct drmem_lmb *lmb)
{
- int i, scns_per_block;
- bool rc = true;
- unsigned long pfn, block_sz;
- u64 phys_addr;
-
if (!(lmb->flags & DRCONF_MEM_ASSIGNED))
return false;
- block_sz = memory_block_size_bytes();
- scns_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
- phys_addr = lmb->base_addr;
-
#ifdef CONFIG_FA_DUMP
/*
* Don't hot-remove memory that falls in fadump boot memory area
* and memory that is reserved for capturing old kernel memory.
*/
- if (is_fadump_memory_area(phys_addr, block_sz))
+ if (is_fadump_memory_area(lmb->base_addr, memory_block_size_bytes()))
return false;
#endif
-
- for (i = 0; i < scns_per_block; i++) {
- pfn = PFN_DOWN(phys_addr);
- if (!pfn_in_present_section(pfn)) {
- phys_addr += MIN_MEMORY_BLOCK_SIZE;
- continue;
- }
-
- rc = rc && is_mem_section_removable(pfn, PAGES_PER_SECTION);
- phys_addr += MIN_MEMORY_BLOCK_SIZE;
- }
-
- return rc;
+ /* device_offline() will determine if we can actually remove this lmb */
+ return true;
}
static int dlpar_add_lmb(struct drmem_lmb *);
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index f7144354938f..c733007b90ab 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -16,6 +16,7 @@ config RISCV
select OF_EARLY_FLATTREE
select OF_IRQ
select ARCH_HAS_BINFMT_FLAT
+ select ARCH_HAS_DEBUG_WX
select ARCH_WANT_FRAME_POINTERS
select CLONE_BACKWARDS
select COMMON_CLK
@@ -32,7 +33,6 @@ config RISCV
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ASM_MODVERSIONS
- select HAVE_MEMBLOCK_NODE_MAP
select HAVE_DMA_CONTIGUOUS if MMU
select HAVE_FUTEX_CMPXCHG if FUTEX
select HAVE_PERF_EVENTS
diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h
index 728a5db66597..a5c2ca1d1cd8 100644
--- a/arch/riscv/include/asm/hugetlb.h
+++ b/arch/riscv/include/asm/hugetlb.h
@@ -5,14 +5,4 @@
#include <asm-generic/hugetlb.h>
#include <asm/page.h>
-static inline int is_hugepage_only_range(struct mm_struct *mm,
- unsigned long addr,
- unsigned long len) {
- return 0;
-}
-
-static inline void arch_clear_hugepage_flags(struct page *page)
-{
-}
-
#endif /* _ASM_RISCV_HUGETLB_H */
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 35b60035b6b0..d50706ea1c94 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -473,9 +473,9 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
#define PAGE_SHARED __pgprot(0)
#define PAGE_KERNEL __pgprot(0)
#define swapper_pg_dir NULL
+#define TASK_SIZE 0xffffffffUL
#define VMALLOC_START 0
-
-#define TASK_SIZE 0xffffffffUL
+#define VMALLOC_END TASK_SIZE
static inline void __kernel_map_pages(struct page *page, int numpages, int enable) {}
diff --git a/arch/riscv/include/asm/ptdump.h b/arch/riscv/include/asm/ptdump.h
index e29af7191909..3c9ea6dd5af7 100644
--- a/arch/riscv/include/asm/ptdump.h
+++ b/arch/riscv/include/asm/ptdump.h
@@ -8,4 +8,15 @@
void ptdump_check_wx(void);
+#ifdef CONFIG_DEBUG_WX
+static inline void debug_checkwx(void)
+{
+ ptdump_check_wx();
+}
+#else
+static inline void debug_checkwx(void)
+{
+}
+#endif
+
#endif /* _ASM_RISCV_PTDUMP_H */
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index a6189ed36c5f..932dadfdca54 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -12,29 +12,21 @@ int pmd_huge(pmd_t pmd)
return pmd_leaf(pmd);
}
-static __init int setup_hugepagesz(char *opt)
+bool __init arch_hugetlb_valid_size(unsigned long size)
{
- unsigned long ps = memparse(opt, &opt);
-
- if (ps == HPAGE_SIZE) {
- hugetlb_add_hstate(HPAGE_SHIFT - PAGE_SHIFT);
- } else if (IS_ENABLED(CONFIG_64BIT) && ps == PUD_SIZE) {
- hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
- } else {
- hugetlb_bad_size();
- pr_err("hugepagesz: Unsupported page size %lu M\n", ps >> 20);
- return 0;
- }
-
- return 1;
+ if (size == HPAGE_SIZE)
+ return true;
+ else if (IS_ENABLED(CONFIG_64BIT) && size == PUD_SIZE)
+ return true;
+ else
+ return false;
}
-__setup("hugepagesz=", setup_hugepagesz);
#ifdef CONFIG_CONTIG_ALLOC
static __init int gigantic_pages_init(void)
{
/* With CONTIG_ALLOC, we can allocate gigantic pages at runtime */
- if (IS_ENABLED(CONFIG_64BIT) && !size_to_hstate(1UL << PUD_SHIFT))
+ if (IS_ENABLED(CONFIG_64BIT))
hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
return 0;
}
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index a870a514fdbc..34327407b0c5 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -20,6 +20,7 @@
#include <asm/soc.h>
#include <asm/pgtable.h>
#include <asm/io.h>
+#include <asm/ptdump.h>
#include "../kernel/head.h"
@@ -40,7 +41,7 @@ static void __init zone_sizes_init(void)
#endif
max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
- free_area_init_nodes(max_zone_pfns);
+ free_area_init(max_zone_pfns);
}
static void setup_zero_page(void)
@@ -523,6 +524,8 @@ void mark_rodata_ro(void)
set_memory_ro(rodata_start, (data_start - rodata_start) >> PAGE_SHIFT);
set_memory_nx(rodata_start, (data_start - rodata_start) >> PAGE_SHIFT);
set_memory_nx(data_start, (max_low - data_start) >> PAGE_SHIFT);
+
+ debug_checkwx();
}
#endif
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 2167bce993ff..f854faff38c3 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -59,6 +59,7 @@ config KASAN_SHADOW_OFFSET
config S390
def_bool y
select ARCH_BINFMT_ELF_STATE
+ select ARCH_HAS_DEBUG_VM_PGTABLE
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FORTIFY_SOURCE
@@ -162,7 +163,6 @@ config S390
select HAVE_LIVEPATCH
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
- select HAVE_MEMBLOCK_NODE_MAP
select HAVE_MEMBLOCK_PHYS_MAP
select MMU_GATHER_NO_GATHER
select HAVE_MOD_ARCH_SPECIFIC
diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h
index de8f0bf5f238..9ddf4a43a590 100644
--- a/arch/s390/include/asm/hugetlb.h
+++ b/arch/s390/include/asm/hugetlb.h
@@ -21,13 +21,6 @@ pte_t huge_ptep_get(pte_t *ptep);
pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep);
-static inline bool is_hugepage_only_range(struct mm_struct *mm,
- unsigned long addr,
- unsigned long len)
-{
- return false;
-}
-
/*
* If the arch doesn't supply something else, assume that hugepage
* size aligned regions are ok without further preparation.
@@ -46,6 +39,7 @@ static inline void arch_clear_hugepage_flags(struct page *page)
{
clear_bit(PG_arch_1, &page->flags);
}
+#define arch_clear_hugepage_flags arch_clear_hugepage_flags
static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned long sz)
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 12f07565ef64..c0881f0a3175 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -305,12 +305,9 @@ void *restart_stack __section(.data);
unsigned long stack_alloc(void)
{
#ifdef CONFIG_VMAP_STACK
- return (unsigned long)
- __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
- VMALLOC_START, VMALLOC_END,
- THREADINFO_GFP,
- PAGE_KERNEL, 0, NUMA_NO_NODE,
- __builtin_return_address(0));
+ return (unsigned long)__vmalloc_node(THREAD_SIZE, THREAD_SIZE,
+ THREADINFO_GFP, NUMA_NO_NODE,
+ __builtin_return_address(0));
#else
return __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER);
#endif
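For context, the open-coded __vmalloc_node_range() call above is replaced by __vmalloc_node(), which fills in the default VMALLOC_START/VMALLOC_END range and PAGE_KERNEL protections. The prototype assumed by this hunk:

void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
		     int node, const void *caller);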
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index 4632d4e26b66..82df06d720e8 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -254,25 +254,15 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
return pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
}
-static __init int setup_hugepagesz(char *opt)
+bool __init arch_hugetlb_valid_size(unsigned long size)
{
- unsigned long size;
- char *string = opt;
-
- size = memparse(opt, &opt);
- if (MACHINE_HAS_EDAT1 && size == PMD_SIZE) {
- hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
- } else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE) {
- hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
- } else {
- hugetlb_bad_size();
- pr_err("hugepagesz= specifies an unsupported page size %s\n",
- string);
- return 0;
- }
- return 1;
+ if (MACHINE_HAS_EDAT1 && size == PMD_SIZE)
+ return true;
+ else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE)
+ return true;
+ else
+ return false;
}
-__setup("hugepagesz=", setup_hugepagesz);
static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
unsigned long addr, unsigned long len,
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 87b2d024e75a..b11bcf4da531 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -122,7 +122,7 @@ void __init paging_init(void)
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
max_zone_pfns[ZONE_DMA] = PFN_DOWN(MAX_DMA_ADDRESS);
max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
- free_area_init_nodes(max_zone_pfns);
+ free_area_init(max_zone_pfns);
}
void mark_rodata_ro(void)
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 97656d20b9ea..0424b8f2f8d3 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -9,7 +9,6 @@ config SUPERH
select CLKDEV_LOOKUP
select DMA_DECLARE_COHERENT
select HAVE_IDE if HAS_IOPORT_MAP
- select HAVE_MEMBLOCK_NODE_MAP
select HAVE_OPROFILE
select HAVE_ARCH_TRACEHOOK
select HAVE_PERF_EVENTS
diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h
index 6f025fe18146..ae4de7b89210 100644
--- a/arch/sh/include/asm/hugetlb.h
+++ b/arch/sh/include/asm/hugetlb.h
@@ -5,12 +5,6 @@
#include <asm/cacheflush.h>
#include <asm/page.h>
-static inline int is_hugepage_only_range(struct mm_struct *mm,
- unsigned long addr,
- unsigned long len) {
- return 0;
-}
-
/*
* If the arch doesn't supply something else, assume that hugepage
* size aligned regions are ok without further preparation.
@@ -36,6 +30,7 @@ static inline void arch_clear_hugepage_flags(struct page *page)
{
clear_bit(PG_dcache_clean, &page->flags);
}
+#define arch_clear_hugepage_flags arch_clear_hugepage_flags
#include <asm-generic/hugetlb.h>
diff --git a/arch/sh/kernel/cpu/sh4/sq.c b/arch/sh/kernel/cpu/sh4/sq.c
index 934ff84844fa..d432164b23b7 100644
--- a/arch/sh/kernel/cpu/sh4/sq.c
+++ b/arch/sh/kernel/cpu/sh4/sq.c
@@ -103,7 +103,8 @@ static int __sq_remap(struct sq_mapping *map, pgprot_t prot)
#if defined(CONFIG_MMU)
struct vm_struct *vma;
- vma = __get_vm_area(map->size, VM_ALLOC, map->sq_addr, SQ_ADDRMAX);
+ vma = __get_vm_area_caller(map->size, VM_ALLOC, map->sq_addr,
+ SQ_ADDRMAX, __builtin_return_address(0));
if (!vma)
return -ENOMEM;
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index 8d2a68aea1fc..628f461b8993 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -334,7 +334,7 @@ void __init paging_init(void)
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
- free_area_init_nodes(max_zone_pfns);
+ free_area_init(max_zone_pfns);
}
unsigned int mem_init_done = 0;
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index da515fdad83d..0e4f3891b904 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -65,7 +65,6 @@ config SPARC64
select HAVE_KRETPROBES
select HAVE_KPROBES
select MMU_GATHER_RCU_TABLE_FREE if SMP
- select HAVE_MEMBLOCK_NODE_MAP
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select HAVE_DYNAMIC_FTRACE
select HAVE_FTRACE_MCOUNT_RECORD
@@ -287,15 +286,6 @@ config NODES_SHIFT
Specify the maximum number of NUMA Nodes available on the target
system. Increases memory reserved to accommodate various tables.
-# Some NUMA nodes have memory ranges that span
-# other nodes. Even though a pfn is valid and
-# between a node's start and end pfns, it may not
-# reside on that node. See memmap_init_zone()
-# for details.
-config NODES_SPAN_OTHER_NODES
- def_bool y
- depends on NEED_MULTIPLE_NODES
-
config ARCH_SPARSEMEM_ENABLE
def_bool y if SPARC64
select SPARSEMEM_VMEMMAP_ENABLE
diff --git a/arch/sparc/include/asm/highmem.h b/arch/sparc/include/asm/highmem.h
index 18d776925c45..ddb03c04f1f3 100644
--- a/arch/sparc/include/asm/highmem.h
+++ b/arch/sparc/include/asm/highmem.h
@@ -25,11 +25,12 @@
#include <asm/vaddrs.h>
#include <asm/kmap_types.h>
#include <asm/pgtable.h>
+#include <asm/pgtsrmmu.h>
/* declarations for highmem.c */
extern unsigned long highstart_pfn, highend_pfn;
-extern pgprot_t kmap_prot;
+#define kmap_prot __pgprot(SRMMU_ET_PTE | SRMMU_PRIV | SRMMU_CACHE)
extern pte_t *pkmap_page_table;
void kmap_init(void) __init;
@@ -50,28 +51,6 @@ void kmap_init(void) __init;
#define PKMAP_END (PKMAP_ADDR(LAST_PKMAP))
-void *kmap_high(struct page *page);
-void kunmap_high(struct page *page);
-
-static inline void *kmap(struct page *page)
-{
- BUG_ON(in_interrupt());
- if (!PageHighMem(page))
- return page_address(page);
- return kmap_high(page);
-}
-
-static inline void kunmap(struct page *page)
-{
- BUG_ON(in_interrupt());
- if (!PageHighMem(page))
- return;
- kunmap_high(page);
-}
-
-void *kmap_atomic(struct page *page);
-void __kunmap_atomic(void *kvaddr);
-
#define flush_cache_kmaps() flush_cache_all()
#endif /* __KERNEL__ */
diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h
index 3963f80d1cb3..53838a173f62 100644
--- a/arch/sparc/include/asm/hugetlb.h
+++ b/arch/sparc/include/asm/hugetlb.h
@@ -20,12 +20,6 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep);
-static inline int is_hugepage_only_range(struct mm_struct *mm,
- unsigned long addr,
- unsigned long len) {
- return 0;
-}
-
#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
@@ -53,10 +47,6 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
return changed;
}
-static inline void arch_clear_hugepage_flags(struct page *page)
-{
-}
-
#define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE
void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
unsigned long end, unsigned long floor,
diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c
index d4a80adea7e5..6ff6e2a9f9b3 100644
--- a/arch/sparc/mm/highmem.c
+++ b/arch/sparc/mm/highmem.c
@@ -32,8 +32,6 @@
#include <asm/pgalloc.h>
#include <asm/vaddrs.h>
-pgprot_t kmap_prot;
-
static pte_t *kmap_pte;
void __init kmap_init(void)
@@ -50,19 +48,13 @@ void __init kmap_init(void)
/* cache the first kmap pte */
kmap_pte = pte_offset_kernel(dir, address);
- kmap_prot = __pgprot(SRMMU_ET_PTE | SRMMU_PRIV | SRMMU_CACHE);
}
-void *kmap_atomic(struct page *page)
+void *kmap_atomic_high_prot(struct page *page, pgprot_t prot)
{
unsigned long vaddr;
long idx, type;
- preempt_disable();
- pagefault_disable();
- if (!PageHighMem(page))
- return page_address(page);
-
type = kmap_atomic_idx_push();
idx = type + KM_TYPE_NR*smp_processor_id();
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
@@ -77,7 +69,7 @@ void *kmap_atomic(struct page *page)
#ifdef CONFIG_DEBUG_HIGHMEM
BUG_ON(!pte_none(*(kmap_pte-idx)));
#endif
- set_pte(kmap_pte-idx, mk_pte(page, kmap_prot));
+ set_pte(kmap_pte-idx, mk_pte(page, prot));
/* XXX Fix - Anton */
#if 0
__flush_tlb_one(vaddr);
@@ -87,18 +79,15 @@ void *kmap_atomic(struct page *page)
return (void*) vaddr;
}
-EXPORT_SYMBOL(kmap_atomic);
+EXPORT_SYMBOL(kmap_atomic_high_prot);
-void __kunmap_atomic(void *kvaddr)
+void kunmap_atomic_high(void *kvaddr)
{
unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
int type;
- if (vaddr < FIXADDR_START) { // FIXME
- pagefault_enable();
- preempt_enable();
+ if (vaddr < FIXADDR_START)
return;
- }
type = kmap_atomic_idx();
@@ -131,7 +120,5 @@ void __kunmap_atomic(void *kvaddr)
#endif
kmap_atomic_idx_pop();
- pagefault_enable();
- preempt_enable();
}
-EXPORT_SYMBOL(__kunmap_atomic);
+EXPORT_SYMBOL(kunmap_atomic_high);
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index b488d4587793..2ef6826a6ca6 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -325,23 +325,12 @@ static void __update_mmu_tsb_insert(struct mm_struct *mm, unsigned long tsb_inde
}
#ifdef CONFIG_HUGETLB_PAGE
-static void __init add_huge_page_size(unsigned long size)
-{
- unsigned int order;
-
- if (size_to_hstate(size))
- return;
-
- order = ilog2(size) - PAGE_SHIFT;
- hugetlb_add_hstate(order);
-}
-
static int __init hugetlbpage_init(void)
{
- add_huge_page_size(1UL << HPAGE_64K_SHIFT);
- add_huge_page_size(1UL << HPAGE_SHIFT);
- add_huge_page_size(1UL << HPAGE_256MB_SHIFT);
- add_huge_page_size(1UL << HPAGE_2GB_SHIFT);
+ hugetlb_add_hstate(HPAGE_64K_SHIFT - PAGE_SHIFT);
+ hugetlb_add_hstate(HPAGE_SHIFT - PAGE_SHIFT);
+ hugetlb_add_hstate(HPAGE_256MB_SHIFT - PAGE_SHIFT);
+ hugetlb_add_hstate(HPAGE_2GB_SHIFT - PAGE_SHIFT);
return 0;
}
@@ -360,16 +349,11 @@ static void __init pud_huge_patch(void)
__asm__ __volatile__("flush %0" : : "r" (addr));
}
-static int __init setup_hugepagesz(char *string)
+bool __init arch_hugetlb_valid_size(unsigned long size)
{
- unsigned long long hugepage_size;
- unsigned int hugepage_shift;
+ unsigned int hugepage_shift = ilog2(size);
unsigned short hv_pgsz_idx;
unsigned int hv_pgsz_mask;
- int rc = 0;
-
- hugepage_size = memparse(string, &string);
- hugepage_shift = ilog2(hugepage_size);
switch (hugepage_shift) {
case HPAGE_16GB_SHIFT:
@@ -397,20 +381,11 @@ static int __init setup_hugepagesz(char *string)
hv_pgsz_mask = 0;
}
- if ((hv_pgsz_mask & cpu_pgsz_mask) == 0U) {
- hugetlb_bad_size();
- pr_err("hugepagesz=%llu not supported by MMU.\n",
- hugepage_size);
- goto out;
- }
-
- add_huge_page_size(hugepage_size);
- rc = 1;
+ if ((hv_pgsz_mask & cpu_pgsz_mask) == 0U)
+ return false;
-out:
- return rc;
+ return true;
}
-__setup("hugepagesz=", setup_hugepagesz);
#endif /* CONFIG_HUGETLB_PAGE */
void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep)
@@ -2488,7 +2463,7 @@ void __init paging_init(void)
max_zone_pfns[ZONE_NORMAL] = end_pfn;
- free_area_init_nodes(max_zone_pfns);
+ free_area_init(max_zone_pfns);
}
printk("Booting Linux...\n");
diff --git a/arch/sparc/mm/io-unit.c b/arch/sparc/mm/io-unit.c
index 289276b99b01..08238d989cfd 100644
--- a/arch/sparc/mm/io-unit.c
+++ b/arch/sparc/mm/io-unit.c
@@ -10,7 +10,6 @@
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
-#include <linux/highmem.h> /* pte_offset_map => kmap_atomic */
#include <linux/bitops.h>
#include <linux/dma-mapping.h>
#include <linux/of.h>
diff --git a/arch/sparc/mm/iommu.c b/arch/sparc/mm/iommu.c
index b00dde13681b..f1e08e30b64e 100644
--- a/arch/sparc/mm/iommu.c
+++ b/arch/sparc/mm/iommu.c
@@ -12,7 +12,6 @@
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/slab.h>
-#include <linux/highmem.h> /* pte_offset_map => kmap_atomic */
#include <linux/dma-mapping.h>
#include <linux/of.h>
#include <linux/of_device.h>
diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c
index ad34753f20c9..3ce4db5c16d7 100644
--- a/arch/sparc/mm/srmmu.c
+++ b/arch/sparc/mm/srmmu.c
@@ -977,24 +977,13 @@ void __init srmmu_paging_init(void)
kmap_init();
{
- unsigned long zones_size[MAX_NR_ZONES];
- unsigned long zholes_size[MAX_NR_ZONES];
- unsigned long npages;
- int znum;
+ unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 };
- for (znum = 0; znum < MAX_NR_ZONES; znum++)
- zones_size[znum] = zholes_size[znum] = 0;
+ max_zone_pfn[ZONE_DMA] = max_low_pfn;
+ max_zone_pfn[ZONE_NORMAL] = max_low_pfn;
+ max_zone_pfn[ZONE_HIGHMEM] = highend_pfn;
- npages = max_low_pfn - pfn_base;
-
- zones_size[ZONE_DMA] = npages;
- zholes_size[ZONE_DMA] = npages - pages_avail;
-
- npages = highend_pfn - max_low_pfn;
- zones_size[ZONE_HIGHMEM] = npages;
- zholes_size[ZONE_HIGHMEM] = npages - calc_highpages();
-
- free_area_init_node(0, zones_size, pfn_base, zholes_size);
+ free_area_init(max_zone_pfn);
}
}
diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c
index 30885d0b94ac..401b22f14743 100644
--- a/arch/um/kernel/mem.c
+++ b/arch/um/kernel/mem.c
@@ -158,8 +158,8 @@ static void __init fixaddr_user_init( void)
void __init paging_init(void)
{
- unsigned long zones_size[MAX_NR_ZONES], vaddr;
- int i;
+ unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 };
+ unsigned long vaddr;
empty_zero_page = (unsigned long *) memblock_alloc_low(PAGE_SIZE,
PAGE_SIZE);
@@ -167,12 +167,8 @@ void __init paging_init(void)
panic("%s: Failed to allocate %lu bytes align=%lx\n",
__func__, PAGE_SIZE, PAGE_SIZE);
- for (i = 0; i < ARRAY_SIZE(zones_size); i++)
- zones_size[i] = 0;
-
- zones_size[ZONE_NORMAL] = (end_iomem >> PAGE_SHIFT) -
- (uml_physmem >> PAGE_SHIFT);
- free_area_init(zones_size);
+ max_zone_pfn[ZONE_NORMAL] = end_iomem >> PAGE_SHIFT;
+ free_area_init(max_zone_pfn);
/*
* Fixed mappings, only the page table structure has to be
diff --git a/arch/unicore32/include/asm/memory.h b/arch/unicore32/include/asm/memory.h
index 23c93105f98f..66285178dd9b 100644
--- a/arch/unicore32/include/asm/memory.h
+++ b/arch/unicore32/include/asm/memory.h
@@ -60,7 +60,7 @@
#ifndef __ASSEMBLY__
#ifndef arch_adjust_zones
-#define arch_adjust_zones(size, holes) do { } while (0)
+#define arch_adjust_zones(max_zone_pfn) do { } while (0)
#endif
/*
diff --git a/arch/unicore32/include/mach/memory.h b/arch/unicore32/include/mach/memory.h
index 2b527cedd03d..b4e6035cb9a3 100644
--- a/arch/unicore32/include/mach/memory.h
+++ b/arch/unicore32/include/mach/memory.h
@@ -25,10 +25,10 @@
#if !defined(__ASSEMBLY__) && defined(CONFIG_PCI)
-void puv3_pci_adjust_zones(unsigned long *size, unsigned long *holes);
+void puv3_pci_adjust_zones(unsigned long *max_zone_pfn);
-#define arch_adjust_zones(size, holes) \
- puv3_pci_adjust_zones(size, holes)
+#define arch_adjust_zones(max_zone_pfn) \
+ puv3_pci_adjust_zones(max_zone_pfn)
#endif
diff --git a/arch/unicore32/kernel/pci.c b/arch/unicore32/kernel/pci.c
index efa04a94dcdb..0d098aa05b47 100644
--- a/arch/unicore32/kernel/pci.c
+++ b/arch/unicore32/kernel/pci.c
@@ -133,21 +133,11 @@ static int pci_puv3_map_irq(const struct pci_dev *dev, u8 slot, u8 pin)
* This is really ugly and we need a better way of specifying
* DMA-capable regions of memory.
*/
-void __init puv3_pci_adjust_zones(unsigned long *zone_size,
- unsigned long *zhole_size)
+void __init puv3_pci_adjust_zones(unsigned long *max_zone_pfn)
{
unsigned int sz = SZ_128M >> PAGE_SHIFT;
- /*
- * Only adjust if > 128M on current system
- */
- if (zone_size[0] <= sz)
- return;
-
- zone_size[1] = zone_size[0] - sz;
- zone_size[0] = sz;
- zhole_size[1] = zhole_size[0];
- zhole_size[0] = 0;
+ max_zone_pfn[ZONE_DMA] = sz;
}
/*
diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c
index 6cf010fadc7a..52425d383cea 100644
--- a/arch/unicore32/mm/init.c
+++ b/arch/unicore32/mm/init.c
@@ -61,46 +61,21 @@ static void __init find_limits(unsigned long *min, unsigned long *max_low,
}
}
-static void __init uc32_bootmem_free(unsigned long min, unsigned long max_low,
- unsigned long max_high)
+static void __init uc32_bootmem_free(unsigned long max_low)
{
- unsigned long zone_size[MAX_NR_ZONES], zhole_size[MAX_NR_ZONES];
- struct memblock_region *reg;
+ unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 };
- /*
- * initialise the zones.
- */
- memset(zone_size, 0, sizeof(zone_size));
-
- /*
- * The memory size has already been determined. If we need
- * to do anything fancy with the allocation of this memory
- * to the zones, now is the time to do it.
- */
- zone_size[0] = max_low - min;
-
- /*
- * Calculate the size of the holes.
- * holes = node_size - sum(bank_sizes)
- */
- memcpy(zhole_size, zone_size, sizeof(zhole_size));
- for_each_memblock(memory, reg) {
- unsigned long start = memblock_region_memory_base_pfn(reg);
- unsigned long end = memblock_region_memory_end_pfn(reg);
-
- if (start < max_low) {
- unsigned long low_end = min(end, max_low);
- zhole_size[0] -= low_end - start;
- }
- }
+ max_zone_pfn[ZONE_DMA] = max_low;
+ max_zone_pfn[ZONE_NORMAL] = max_low;
/*
* Adjust the sizes according to any special requirements for
* this machine type.
+ * This might lower the ZONE_DMA limit.
*/
- arch_adjust_zones(zone_size, zhole_size);
+ arch_adjust_zones(max_zone_pfn);
- free_area_init_node(0, zone_size, min, zhole_size);
+ free_area_init(max_zone_pfn);
}
int pfn_valid(unsigned long pfn)
@@ -176,11 +151,11 @@ void __init bootmem_init(void)
sparse_init();
/*
- * Now free the memory - free_area_init_node needs
+ * Now free the memory - free_area_init needs
* the sparse mem_map arrays initialized by sparse_init()
* for memmap_init_zone(), otherwise all PFNs are invalid.
*/
- uc32_bootmem_free(min, max_low, max_high);
+ uc32_bootmem_free(max_low);
high_memory = __va((max_low << PAGE_SHIFT) - 1) + 1;
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 3b7295dfc86f..8e3d40ff1eea 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -59,6 +59,7 @@ config X86
select ARCH_CLOCKSOURCE_INIT
select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
select ARCH_HAS_DEBUG_VIRTUAL
+ select ARCH_HAS_DEBUG_VM_PGTABLE if !X86_PAE
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_EARLY_DEBUG if KGDB
select ARCH_HAS_ELF_RANDOMIZE
@@ -82,6 +83,7 @@ config X86
select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
select ARCH_HAS_SYSCALL_WRAPPER
select ARCH_HAS_UBSAN_SANITIZE_ALL
+ select ARCH_HAS_DEBUG_WX
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
select ARCH_MIGHT_HAVE_PC_PARPORT
@@ -194,7 +196,6 @@ config X86
select HAVE_KRETPROBES
select HAVE_KVM
select HAVE_LIVEPATCH if X86_64
- select HAVE_MEMBLOCK_NODE_MAP
select HAVE_MIXED_BREAKPOINTS_REGS
select HAVE_MOD_ARCH_SPECIFIC
select HAVE_MOVE_PMD
@@ -1588,15 +1589,6 @@ config X86_64_ACPI_NUMA
---help---
Enable ACPI SRAT based node topology detection.
-# Some NUMA nodes have memory ranges that span
-# other nodes. Even though a pfn is valid and
-# between a node's start and end pfns, it may not
-# reside on that node. See memmap_init_zone()
-# for details.
-config NODES_SPAN_OTHER_NODES
- def_bool y
- depends on X86_64_ACPI_NUMA
-
config NUMA_EMU
bool "NUMA emulation"
depends on NUMA
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index f909d3ce36e6..fdf1431ac8c2 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -72,33 +72,6 @@ config EFI_PGT_DUMP
issues with the mapping of the EFI runtime regions into that
table.
-config DEBUG_WX
- bool "Warn on W+X mappings at boot"
- select PTDUMP_CORE
- ---help---
- Generate a warning if any W+X mappings are found at boot.
-
- This is useful for discovering cases where the kernel is leaving
- W+X mappings after applying NX, as such mappings are a security risk.
-
- Look for a message in dmesg output like this:
-
- x86/mm: Checked W+X mappings: passed, no W+X pages found.
-
- or like this, if the check failed:
-
- x86/mm: Checked W+X mappings: FAILED, <N> W+X pages found.
-
- Note that even if the check fails, your kernel is possibly
- still fine, as W+X mappings are not a security hole in
- themselves, what they do is that they make the exploitation
- of other unfixed kernel bugs easier.
-
- There is no runtime or memory usage effect of this option
- once the kernel has booted up - it's a one time check.
-
- If in doubt, say "Y".
-
config DEBUG_TLBFLUSH
bool "Set upper limit of TLB entries to flush one-by-one"
depends on DEBUG_KERNEL
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index acf76b466db6..e2137070386a 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -97,8 +97,7 @@ static int hv_cpu_init(unsigned int cpu)
* not be stopped in the case of CPU offlining and the VM will hang.
*/
if (!*hvp) {
- *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO,
- PAGE_KERNEL);
+ *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO);
}
if (*hvp) {
@@ -379,7 +378,7 @@ void __init hyperv_init(void)
guest_id = generate_guest_id(0, LINUX_VERSION_CODE, 0);
wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id);
- hv_hypercall_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_RX);
+ hv_hypercall_pg = vmalloc_exec(PAGE_SIZE);
if (hv_hypercall_pg == NULL) {
wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
goto remove_cpuhp_state;
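For context, __vmalloc() drops its pgprot_t argument in this series, so PAGE_KERNEL callers simply lose the third argument, and the one caller here that wanted executable pages switches to vmalloc_exec(). Prototypes assumed by this hunk:

void *__vmalloc(unsigned long size, gfp_t gfp_mask);	/* always PAGE_KERNEL */
void *vmalloc_exec(unsigned long size);			/* PAGE_KERNEL_EXEC */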
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 28183ee3cc42..b9527a54db99 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -152,7 +152,6 @@ extern void reserve_top_address(unsigned long reserve);
extern int fixmaps_set;
extern pte_t *kmap_pte;
-#define kmap_prot PAGE_KERNEL
extern pte_t *pkmap_page_table;
void __native_set_fixmap(enum fixed_addresses idx, pte_t pte);
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
index a8059930056d..0f420b24e0fc 100644
--- a/arch/x86/include/asm/highmem.h
+++ b/arch/x86/include/asm/highmem.h
@@ -58,15 +58,6 @@ extern unsigned long highstart_pfn, highend_pfn;
#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT)
#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT))
-extern void *kmap_high(struct page *page);
-extern void kunmap_high(struct page *page);
-
-void *kmap(struct page *page);
-void kunmap(struct page *page);
-
-void *kmap_atomic_prot(struct page *page, pgprot_t prot);
-void *kmap_atomic(struct page *page);
-void __kunmap_atomic(void *kvaddr);
void *kmap_atomic_pfn(unsigned long pfn);
void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot);
diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
index f65cfb48cfdd..1721b1aadeb1 100644
--- a/arch/x86/include/asm/hugetlb.h
+++ b/arch/x86/include/asm/hugetlb.h
@@ -7,14 +7,4 @@
#define hugepages_supported() boot_cpu_has(X86_FEATURE_PSE)
-static inline int is_hugepage_only_range(struct mm_struct *mm,
- unsigned long addr,
- unsigned long len) {
- return 0;
-}
-
-static inline void arch_clear_hugepage_flags(struct page *page)
-{
-}
-
#endif /* _ASM_X86_HUGETLB_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4c8eea28983a..07cc4813232e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1293,8 +1293,7 @@ extern struct kmem_cache *x86_fpu_cache;
#define __KVM_HAVE_ARCH_VM_ALLOC
static inline struct kvm *kvm_arch_alloc_vm(void)
{
- return __vmalloc(kvm_x86_ops.vm_size,
- GFP_KERNEL_ACCOUNT | __GFP_ZERO, PAGE_KERNEL);
+ return __vmalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
}
void kvm_arch_free_vm(struct kvm *kvm);
diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h
index 6deb6cd236e3..7f6ccff0ba72 100644
--- a/arch/x86/include/asm/pgtable-2level_types.h
+++ b/arch/x86/include/asm/pgtable-2level_types.h
@@ -20,6 +20,8 @@ typedef union {
#define SHARED_KERNEL_PMD 0
+#define ARCH_PAGE_TABLE_SYNC_MASK PGTBL_PMD_MODIFIED
+
/*
* traditional i386 two-level paging structure:
*/
diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h
index 33845d36897c..80fbb4a9ed87 100644
--- a/arch/x86/include/asm/pgtable-3level_types.h
+++ b/arch/x86/include/asm/pgtable-3level_types.h
@@ -27,6 +27,8 @@ typedef union {
#define SHARED_KERNEL_PMD (!static_cpu_has(X86_FEATURE_PTI))
#endif
+#define ARCH_PAGE_TABLE_SYNC_MASK (SHARED_KERNEL_PMD ? 0 : PGTBL_PMD_MODIFIED)
+
/*
* PGDIR_SHIFT determines what a top-level page table entry can map
*/
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 4d02e64af1b3..f51d8997ed00 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -624,7 +624,7 @@ static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot)
return __pud(pfn | check_pgprot(pgprot));
}
-static inline pmd_t pmd_mknotpresent(pmd_t pmd)
+static inline pmd_t pmd_mkinvalid(pmd_t pmd)
{
return pfn_pmd(pmd_pfn(pmd),
__pgprot(pmd_flags(pmd) & ~(_PAGE_PRESENT|_PAGE_PROTNONE)));
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index df1373415f11..8d03ffd43794 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -53,6 +53,12 @@ static inline void sync_initial_page_table(void) { }
struct mm_struct;
+#define mm_p4d_folded mm_p4d_folded
+static inline bool mm_p4d_folded(struct mm_struct *mm)
+{
+ return !pgtable_l5_enabled();
+}
+
void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte);
void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 52e5f5f2240d..8f63efb2a2cc 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -159,4 +159,6 @@ extern unsigned int ptrs_per_p4d;
#define PGD_KERNEL_START ((PAGE_SIZE / 2) / sizeof(pgd_t))
+#define ARCH_PAGE_TABLE_SYNC_MASK (pgtable_l5_enabled() ? PGTBL_PGD_MODIFIED : PGTBL_P4D_MODIFIED)
+
#endif /* _ASM_X86_PGTABLE_64_DEFS_H */
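For context, ARCH_PAGE_TABLE_SYNC_MASK tells the generic vmalloc/ioremap code which page-table levels require an explicit sync when they are newly populated; together with arch_sync_kernel_mappings() (added to arch/x86/mm/fault.c and init_64.c below) it replaces the lazy vmalloc_fault()/vmalloc_sync_mappings() scheme. A sketch of the generic contract (the helper name here is illustrative; the real code lives in mm/vmalloc.c):

/* 'mask' accumulates PGTBL_*_MODIFIED bits while a new mapping's
 * page tables are populated; sync only when the arch asked for it. */
static void sync_kernel_mappings_if_needed(unsigned long start,
					   unsigned long end,
					   pgtbl_mod_mask mask)
{
	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, end);
}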
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 7b6ddcf77d70..2da1f95b88d7 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -194,7 +194,6 @@ enum page_cache_mode {
#define _PAGE_TABLE_NOENC (__PP|__RW|_USR|___A| 0|___D| 0| 0)
#define _PAGE_TABLE (__PP|__RW|_USR|___A| 0|___D| 0| 0| _ENC)
#define __PAGE_KERNEL_RO (__PP| 0| 0|___A|__NX|___D| 0|___G)
-#define __PAGE_KERNEL_RX (__PP| 0| 0|___A| 0|___D| 0|___G)
#define __PAGE_KERNEL_NOCACHE (__PP|__RW| 0|___A|__NX|___D| 0|___G| __NC)
#define __PAGE_KERNEL_VVAR (__PP| 0|_USR|___A|__NX|___D| 0|___G)
#define __PAGE_KERNEL_LARGE (__PP|__RW| 0|___A|__NX|___D|_PSE|___G)
@@ -220,7 +219,6 @@ enum page_cache_mode {
#define PAGE_KERNEL_RO __pgprot_mask(__PAGE_KERNEL_RO | _ENC)
#define PAGE_KERNEL_EXEC __pgprot_mask(__PAGE_KERNEL_EXEC | _ENC)
#define PAGE_KERNEL_EXEC_NOENC __pgprot_mask(__PAGE_KERNEL_EXEC | 0)
-#define PAGE_KERNEL_RX __pgprot_mask(__PAGE_KERNEL_RX | _ENC)
#define PAGE_KERNEL_NOCACHE __pgprot_mask(__PAGE_KERNEL_NOCACHE | _ENC)
#define PAGE_KERNEL_LARGE __pgprot_mask(__PAGE_KERNEL_LARGE | _ENC)
#define PAGE_KERNEL_LARGE_EXEC __pgprot_mask(__PAGE_KERNEL_LARGE_EXEC | _ENC)
@@ -284,6 +282,12 @@ typedef struct pgprot { pgprotval_t pgprot; } pgprot_t;
typedef struct { pgdval_t pgd; } pgd_t;
+static inline pgprot_t pgprot_nx(pgprot_t prot)
+{
+ return __pgprot(pgprot_val(prot) | _PAGE_NX);
+}
+#define pgprot_nx pgprot_nx
+
#ifdef CONFIG_X86_PAE
/*
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 0e059b73437b..9f69cc497f4b 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -12,27 +12,6 @@ struct task_struct *__switch_to_asm(struct task_struct *prev,
__visible struct task_struct *__switch_to(struct task_struct *prev,
struct task_struct *next);
-/* This runs on the previous thread's stack. */
-static inline void prepare_switch_to(struct task_struct *next)
-{
-#ifdef CONFIG_VMAP_STACK
- /*
- * If we switch to a stack that has a top-level paging entry
- * that is not present in the current mm, the resulting #PF will
- * be promoted to a double-fault and we'll panic. Probe
- * the new stack now so that vmalloc_fault can fix up the page
- * tables if needed. This can only happen if we use a stack
- * in vmap space.
- *
- * We assume that the stack is aligned so that it never spans
- * more than one top-level paging entry.
- *
- * To minimize cache pollution, just follow the stack pointer.
- */
- READ_ONCE(*(unsigned char *)next->thread.sp);
-#endif
-}
-
asmlinkage void ret_from_fork(void);
/*
@@ -67,8 +46,6 @@ struct fork_frame {
#define switch_to(prev, next, last) \
do { \
- prepare_switch_to(next); \
- \
((last) = __switch_to_asm((prev), (next))); \
} while (0)
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 12df3a4abfdd..6b32ab009c19 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -43,7 +43,7 @@ static int map_irq_stack(unsigned int cpu)
pages[i] = pfn_to_page(pa >> PAGE_SHIFT);
}
- va = vmap(pages, IRQ_STACK_SIZE / PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL);
+ va = vmap(pages, IRQ_STACK_SIZE / PAGE_SIZE, VM_MAP, PAGE_KERNEL);
if (!va)
return -ENOMEM;
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index e6d7894ad127..fd945ce78554 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -287,9 +287,9 @@ void __init setup_per_cpu_areas(void)
/*
* Sync back kernel address range again. We already did this in
* setup_arch(), but percpu data also needs to be available in
- * the smpboot asm. We can't reliably pick up percpu mappings
- * using vmalloc_fault(), because exception dispatch needs
- * percpu data.
+ * the smpboot asm and arch_sync_kernel_mappings() doesn't sync to
+ * swapper_pg_dir on 32-bit. The per-cpu mappings need to be available
+ * there too.
*
* FIXME: Can the later sync in setup_cpu_entry_areas() replace
* this call?
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 89f7f3aebd31..5573a97f1520 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -336,8 +336,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
/* Avoid using vmalloc for smaller buffers. */
size = npages * sizeof(struct page *);
if (size > PAGE_SIZE)
- pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO,
- PAGE_KERNEL);
+ pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
else
pages = kmalloc(size, GFP_KERNEL_ACCOUNT);
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 69309cd56fdf..17aa7ac94a34 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -249,10 +249,22 @@ static void note_wx(struct pg_state *st, unsigned long addr)
(void *)st->start_address);
}
-static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2)
+static void effective_prot(struct ptdump_state *pt_st, int level, u64 val)
{
- return (prot1 & prot2 & (_PAGE_USER | _PAGE_RW)) |
- ((prot1 | prot2) & _PAGE_NX);
+ struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
+ pgprotval_t prot = val & PTE_FLAGS_MASK;
+ pgprotval_t effective;
+
+ if (level > 0) {
+ pgprotval_t higher_prot = st->prot_levels[level - 1];
+
+ effective = (higher_prot & prot & (_PAGE_USER | _PAGE_RW)) |
+ ((higher_prot | prot) & _PAGE_NX);
+ } else {
+ effective = prot;
+ }
+
+ st->prot_levels[level] = effective;
}
/*
@@ -261,7 +273,7 @@ static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2)
* print what we collected so far.
*/
static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
- unsigned long val)
+ u64 val)
{
struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
pgprotval_t new_prot, new_eff;
@@ -270,16 +282,10 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
struct seq_file *m = st->seq;
new_prot = val & PTE_FLAGS_MASK;
+ new_eff = st->prot_levels[level];
- if (level > 0) {
- new_eff = effective_prot(st->prot_levels[level - 1],
- new_prot);
- } else {
- new_eff = new_prot;
- }
-
- if (level >= 0)
- st->prot_levels[level] = new_eff;
+ if (!val)
+ new_eff = 0;
/*
* If we have a "break" in the series, we need to flush the state that
@@ -374,6 +380,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m,
struct pg_state st = {
.ptdump = {
.note_page = note_page,
+ .effective_prot = effective_prot,
.range = ptdump_ranges
},
.level = -1,
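For context, effective_prot() becomes an optional callback in struct ptdump_state so that the generic walker, not note_page(), computes the effective permissions at each level. The walker side is not shown in this diff; it is assumed to call the hook just before note_page(), roughly:

	/* sketch: generic ptdump walker, once per entry at each level */
	if (st->effective_prot)
		st->effective_prot(st, level, val);
	st->note_page(st, addr, level, val);

With that ordering, st->prot_levels[level] is already up to date when note_page() reads it above.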
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 7c3ac7f251b4..ca01be033edd 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -191,16 +191,13 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
return pmd_k;
}
-static void vmalloc_sync(void)
+void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
{
- unsigned long address;
-
- if (SHARED_KERNEL_PMD)
- return;
+ unsigned long addr;
- for (address = VMALLOC_START & PMD_MASK;
- address >= TASK_SIZE_MAX && address < VMALLOC_END;
- address += PMD_SIZE) {
+ for (addr = start & PMD_MASK;
+ addr >= TASK_SIZE_MAX && addr < VMALLOC_END;
+ addr += PMD_SIZE) {
struct page *page;
spin_lock(&pgd_lock);
@@ -211,61 +208,13 @@ static void vmalloc_sync(void)
pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
spin_lock(pgt_lock);
- vmalloc_sync_one(page_address(page), address);
+ vmalloc_sync_one(page_address(page), addr);
spin_unlock(pgt_lock);
}
spin_unlock(&pgd_lock);
}
}
-void vmalloc_sync_mappings(void)
-{
- vmalloc_sync();
-}
-
-void vmalloc_sync_unmappings(void)
-{
- vmalloc_sync();
-}
-
-/*
- * 32-bit:
- *
- * Handle a fault on the vmalloc or module mapping area
- */
-static noinline int vmalloc_fault(unsigned long address)
-{
- unsigned long pgd_paddr;
- pmd_t *pmd_k;
- pte_t *pte_k;
-
- /* Make sure we are in vmalloc area: */
- if (!(address >= VMALLOC_START && address < VMALLOC_END))
- return -1;
-
- /*
- * Synchronize this task's top level page-table
- * with the 'reference' page table.
- *
- * Do _not_ use "current" here. We might be inside
- * an interrupt in the middle of a task switch..
- */
- pgd_paddr = read_cr3_pa();
- pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
- if (!pmd_k)
- return -1;
-
- if (pmd_large(*pmd_k))
- return 0;
-
- pte_k = pte_offset_kernel(pmd_k, address);
- if (!pte_present(*pte_k))
- return -1;
-
- return 0;
-}
-NOKPROBE_SYMBOL(vmalloc_fault);
-
/*
* Did it hit the DOS screen memory VA from vm86 mode?
*/
@@ -330,96 +279,6 @@ out:
#else /* CONFIG_X86_64: */
-void vmalloc_sync_mappings(void)
-{
- /*
- * 64-bit mappings might allocate new p4d/pud pages
- * that need to be propagated to all tasks' PGDs.
- */
- sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
-}
-
-void vmalloc_sync_unmappings(void)
-{
- /*
- * Unmappings never allocate or free p4d/pud pages.
- * No work is required here.
- */
-}
-
-/*
- * 64-bit:
- *
- * Handle a fault on the vmalloc area
- */
-static noinline int vmalloc_fault(unsigned long address)
-{
- pgd_t *pgd, *pgd_k;
- p4d_t *p4d, *p4d_k;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
- /* Make sure we are in vmalloc area: */
- if (!(address >= VMALLOC_START && address < VMALLOC_END))
- return -1;
-
- /*
- * Copy kernel mappings over when needed. This can also
- * happen within a race in page table update. In the latter
- * case just flush:
- */
- pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
- pgd_k = pgd_offset_k(address);
- if (pgd_none(*pgd_k))
- return -1;
-
- if (pgtable_l5_enabled()) {
- if (pgd_none(*pgd)) {
- set_pgd(pgd, *pgd_k);
- arch_flush_lazy_mmu_mode();
- } else {
- BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_k));
- }
- }
-
- /* With 4-level paging, copying happens on the p4d level. */
- p4d = p4d_offset(pgd, address);
- p4d_k = p4d_offset(pgd_k, address);
- if (p4d_none(*p4d_k))
- return -1;
-
- if (p4d_none(*p4d) && !pgtable_l5_enabled()) {
- set_p4d(p4d, *p4d_k);
- arch_flush_lazy_mmu_mode();
- } else {
- BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_k));
- }
-
- BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4);
-
- pud = pud_offset(p4d, address);
- if (pud_none(*pud))
- return -1;
-
- if (pud_large(*pud))
- return 0;
-
- pmd = pmd_offset(pud, address);
- if (pmd_none(*pmd))
- return -1;
-
- if (pmd_large(*pmd))
- return 0;
-
- pte = pte_offset_kernel(pmd, address);
- if (!pte_present(*pte))
- return -1;
-
- return 0;
-}
-NOKPROBE_SYMBOL(vmalloc_fault);
-
#ifdef CONFIG_CPU_SUP_AMD
static const char errata93_warning[] =
KERN_ERR
@@ -1260,29 +1119,6 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
*/
WARN_ON_ONCE(hw_error_code & X86_PF_PK);
- /*
- * We can fault-in kernel-space virtual memory on-demand. The
- * 'reference' page table is init_mm.pgd.
- *
- * NOTE! We MUST NOT take any locks for this case. We may
- * be in an interrupt or a critical region, and should
- * only copy the information from the master page table,
- * nothing more.
- *
- * Before doing this on-demand faulting, ensure that the
- * fault is not any of the following:
- * 1. A fault on a PTE with a reserved bit set.
- * 2. A fault caused by a user-mode access. (Do not demand-
- * fault kernel memory due to user-mode accesses).
- * 3. A fault caused by a page-level protection violation.
- * (A demand fault would be on a non-present page which
- * would have X86_PF_PROT==0).
- */
- if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
- if (vmalloc_fault(address) >= 0)
- return;
- }
-
/* Was the fault spurious, caused by lazy TLB invalidation? */
if (spurious_kernel_fault(hw_error_code, address))
return;
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 0a1898b8552e..075fe51317b0 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -4,44 +4,11 @@
#include <linux/swap.h> /* for totalram_pages */
#include <linux/memblock.h>
-void *kmap(struct page *page)
-{
- might_sleep();
- if (!PageHighMem(page))
- return page_address(page);
- return kmap_high(page);
-}
-EXPORT_SYMBOL(kmap);
-
-void kunmap(struct page *page)
-{
- if (in_interrupt())
- BUG();
- if (!PageHighMem(page))
- return;
- kunmap_high(page);
-}
-EXPORT_SYMBOL(kunmap);
-
-/*
- * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
- * no global lock is needed and because the kmap code must perform a global TLB
- * invalidation when the kmap pool wraps.
- *
- * However when holding an atomic kmap it is not legal to sleep, so atomic
- * kmaps are appropriate for short, tight code paths only.
- */
-void *kmap_atomic_prot(struct page *page, pgprot_t prot)
+void *kmap_atomic_high_prot(struct page *page, pgprot_t prot)
{
unsigned long vaddr;
int idx, type;
- preempt_disable();
- pagefault_disable();
-
- if (!PageHighMem(page))
- return page_address(page);
-
type = kmap_atomic_idx_push();
idx = type + KM_TYPE_NR*smp_processor_id();
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
@@ -51,13 +18,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
return (void *)vaddr;
}
-EXPORT_SYMBOL(kmap_atomic_prot);
-
-void *kmap_atomic(struct page *page)
-{
- return kmap_atomic_prot(page, kmap_prot);
-}
-EXPORT_SYMBOL(kmap_atomic);
+EXPORT_SYMBOL(kmap_atomic_high_prot);
/*
* This is the same as kmap_atomic() but can map memory that doesn't
@@ -69,7 +30,7 @@ void *kmap_atomic_pfn(unsigned long pfn)
}
EXPORT_SYMBOL_GPL(kmap_atomic_pfn);
-void __kunmap_atomic(void *kvaddr)
+void kunmap_atomic_high(void *kvaddr)
{
unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
@@ -99,11 +60,8 @@ void __kunmap_atomic(void *kvaddr)
BUG_ON(vaddr >= (unsigned long)high_memory);
}
#endif
-
- pagefault_enable();
- preempt_enable();
}
-EXPORT_SYMBOL(__kunmap_atomic);
+EXPORT_SYMBOL(kunmap_atomic_high);
void __init set_highmem_pages_init(void)
{
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 5bfd5aef5378..cf5781142716 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -181,28 +181,21 @@ get_unmapped_area:
#endif /* CONFIG_HUGETLB_PAGE */
#ifdef CONFIG_X86_64
-static __init int setup_hugepagesz(char *opt)
+bool __init arch_hugetlb_valid_size(unsigned long size)
{
- unsigned long ps = memparse(opt, &opt);
- if (ps == PMD_SIZE) {
- hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
- } else if (ps == PUD_SIZE && boot_cpu_has(X86_FEATURE_GBPAGES)) {
- hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
- } else {
- hugetlb_bad_size();
- printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n",
- ps >> 20);
- return 0;
- }
- return 1;
+ if (size == PMD_SIZE)
+ return true;
+ else if (size == PUD_SIZE && boot_cpu_has(X86_FEATURE_GBPAGES))
+ return true;
+ else
+ return false;
}
-__setup("hugepagesz=", setup_hugepagesz);
#ifdef CONFIG_CONTIG_ALLOC
static __init int gigantic_pages_init(void)
{
/* With compaction or CMA we can allocate gigantic pages at runtime */
- if (boot_cpu_has(X86_FEATURE_GBPAGES) && !size_to_hstate(1UL << PUD_SHIFT))
+ if (boot_cpu_has(X86_FEATURE_GBPAGES))
hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
return 0;
}
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 8ce055e4117b..112d3b98a3b6 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -982,7 +982,7 @@ void __init zone_sizes_init(void)
max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
#endif
- free_area_init_nodes(max_zone_pfns);
+ free_area_init(max_zone_pfns);
}
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 5e0a43b141ae..add03c35aa34 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -218,6 +218,11 @@ void sync_global_pgds(unsigned long start, unsigned long end)
sync_global_pgds_l4(start, end);
}
+void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
+{
+ sync_global_pgds(start, end);
+}
+
/*
* NOTE: This function is marked __ref because it calls __init function
* (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
@@ -1260,6 +1265,18 @@ void __init mem_init(void)
mem_init_print_info(NULL);
}
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
+{
+ /*
+ * More CPUs always led to greater speedups on tested systems, up to
+ * all the nodes' CPUs. Use all since the system is otherwise idle
+ * now.
+ */
+ return max_t(int, cpumask_weight(node_cpumask), 1);
+}
+#endif
+
int kernel_set_to_readonly;
void mark_rodata_ro(void)
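For context, deferred_page_init_max_threads() is a weak hook used by the padata-based parallel initialization of deferred struct pages; this x86 override asks for one thread per CPU of the node, while the generic default is assumed to stay single-threaded:

/* sketch of the generic default (mm/page_alloc.c) */
int __init __weak deferred_page_init_max_threads(const struct cpumask *node_cpumask)
{
	return 1;
}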
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index dd625898425a..be020a7bc414 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -130,7 +130,7 @@ static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
pmdval_t v = pmd_val(*pmd);
if (clear) {
*old = v;
- new_pmd = pmd_mknotpresent(*pmd);
+ new_pmd = pmd_mkinvalid(*pmd);
} else {
/* Presume this has been called with clear==true previously */
new_pmd = __pmd(*old);
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 59ba008504dc..8ee952038c80 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -517,8 +517,10 @@ static void __init numa_clear_kernel_node_hotplug(void)
* reserve specific pages for Sandy Bridge graphics. ]
*/
for_each_memblock(reserved, mb_region) {
- if (mb_region->nid != MAX_NUMNODES)
- node_set(mb_region->nid, reserved_nodemask);
+ int nid = memblock_get_region_node(mb_region);
+
+ if (nid != MAX_NUMNODES)
+ node_set(nid, reserved_nodemask);
}
/*
@@ -735,12 +737,9 @@ void __init x86_numa_init(void)
static void __init init_memory_less_node(int nid)
{
- unsigned long zones_size[MAX_NR_ZONES] = {0};
- unsigned long zholes_size[MAX_NR_ZONES] = {0};
-
/* Allocate and initialize node data. Memory-less node is now online.*/
alloc_node_data(nid);
- free_area_init_node(nid, zones_size, 0, zholes_size);
+ free_area_init_memoryless_node(nid);
/*
* All zonelists will be built later in start_kernel() after per cpu
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 843aa10a4cb6..da0fb17a1a36 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -448,13 +448,7 @@ static void __init pti_clone_user_shared(void)
* the sp1 and sp2 slots.
*
* This is done for all possible CPUs during boot to ensure
- * that it's propagated to all mms. If we were to add one of
- * these mappings during CPU hotplug, we would need to take
- * some measure to make sure that every mm that subsequently
- * ran on that CPU would have the relevant PGD entry in its
- * pagetables. The usual vmalloc_fault() mechanism would not
- * work for page faults taken in entry_SYSCALL_64 before RSP
- * is set up.
+ * that it's propagated to all mms.
*/
unsigned long va = (unsigned long)&per_cpu(cpu_tss_rw, cpu);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index c8524c506ab0..ff1ff8c83452 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -345,34 +345,6 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
local_irq_restore(flags);
}
-static void sync_current_stack_to_mm(struct mm_struct *mm)
-{
- unsigned long sp = current_stack_pointer;
- pgd_t *pgd = pgd_offset(mm, sp);
-
- if (pgtable_l5_enabled()) {
- if (unlikely(pgd_none(*pgd))) {
- pgd_t *pgd_ref = pgd_offset_k(sp);
-
- set_pgd(pgd, *pgd_ref);
- }
- } else {
- /*
- * "pgd" is faked. The top level entries are "p4d"s, so sync
- * the p4d. This compiles to approximately the same code as
- * the 5-level case.
- */
- p4d_t *p4d = p4d_offset(pgd, sp);
-
- if (unlikely(p4d_none(*p4d))) {
- pgd_t *pgd_ref = pgd_offset_k(sp);
- p4d_t *p4d_ref = p4d_offset(pgd_ref, sp);
-
- set_p4d(p4d, *p4d_ref);
- }
- }
-}
-
static inline unsigned long mm_mangle_tif_spec_bits(struct task_struct *next)
{
unsigned long next_tif = task_thread_info(next)->flags;
@@ -592,15 +564,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
*/
cond_mitigation(tsk);
- if (IS_ENABLED(CONFIG_VMAP_STACK)) {
- /*
- * If our current stack is in vmalloc space and isn't
- * mapped in the new pgd, we'll double-fault. Forcibly
- * map it.
- */
- sync_current_stack_to_mm(next);
- }
-
/*
* Stop remote flushes for the previous mm.
* Skip kernel threads; we never send init_mm TLB flushing IPIs,
diff --git a/arch/xtensa/include/asm/highmem.h b/arch/xtensa/include/asm/highmem.h
index 04e9340eac4b..d6a10704307a 100644
--- a/arch/xtensa/include/asm/highmem.h
+++ b/arch/xtensa/include/asm/highmem.h
@@ -63,38 +63,11 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color)
extern pte_t *pkmap_page_table;
-void *kmap_high(struct page *page);
-void kunmap_high(struct page *page);
-
-static inline void *kmap(struct page *page)
-{
- /* Check if this memory layout is broken because PKMAP overlaps
- * page table.
- */
- BUILD_BUG_ON(PKMAP_BASE <
- TLBTEMP_BASE_1 + TLBTEMP_SIZE);
- BUG_ON(in_interrupt());
- if (!PageHighMem(page))
- return page_address(page);
- return kmap_high(page);
-}
-
-static inline void kunmap(struct page *page)
-{
- BUG_ON(in_interrupt());
- if (!PageHighMem(page))
- return;
- kunmap_high(page);
-}
-
static inline void flush_cache_kmaps(void)
{
flush_cache_all();
}
-void *kmap_atomic(struct page *page);
-void __kunmap_atomic(void *kvaddr);
-
void kmap_init(void);
#endif
diff --git a/arch/xtensa/mm/highmem.c b/arch/xtensa/mm/highmem.c
index 184ceadccc1a..99b5ad137ab5 100644
--- a/arch/xtensa/mm/highmem.c
+++ b/arch/xtensa/mm/highmem.c
@@ -37,29 +37,24 @@ static inline enum fixed_addresses kmap_idx(int type, unsigned long color)
color;
}
-void *kmap_atomic(struct page *page)
+void *kmap_atomic_high_prot(struct page *page, pgprot_t prot)
{
enum fixed_addresses idx;
unsigned long vaddr;
- preempt_disable();
- pagefault_disable();
- if (!PageHighMem(page))
- return page_address(page);
-
idx = kmap_idx(kmap_atomic_idx_push(),
DCACHE_ALIAS(page_to_phys(page)));
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
#ifdef CONFIG_DEBUG_HIGHMEM
BUG_ON(!pte_none(*(kmap_pte + idx)));
#endif
- set_pte(kmap_pte + idx, mk_pte(page, PAGE_KERNEL_EXEC));
+ set_pte(kmap_pte + idx, mk_pte(page, prot));
return (void *)vaddr;
}
-EXPORT_SYMBOL(kmap_atomic);
+EXPORT_SYMBOL(kmap_atomic_high_prot);
-void __kunmap_atomic(void *kvaddr)
+void kunmap_atomic_high(void *kvaddr)
{
if (kvaddr >= (void *)FIXADDR_START &&
kvaddr < (void *)FIXADDR_TOP) {
@@ -78,16 +73,17 @@ void __kunmap_atomic(void *kvaddr)
kmap_atomic_idx_pop();
}
-
- pagefault_enable();
- preempt_enable();
}
-EXPORT_SYMBOL(__kunmap_atomic);
+EXPORT_SYMBOL(kunmap_atomic_high);
void __init kmap_init(void)
{
unsigned long kmap_vstart;
+ /* Check if this memory layout is broken because PKMAP overlaps
+ * page table.
+ */
+ BUILD_BUG_ON(PKMAP_BASE < TLBTEMP_BASE_1 + TLBTEMP_SIZE);
/* cache the first kmap pte */
kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c
index 19c625e6d81f..a05b306cf371 100644
--- a/arch/xtensa/mm/init.c
+++ b/arch/xtensa/mm/init.c
@@ -70,13 +70,13 @@ void __init bootmem_init(void)
void __init zones_init(void)
{
/* All pages are DMA-able, so we put them all in the DMA zone. */
- unsigned long zones_size[MAX_NR_ZONES] = {
- [ZONE_NORMAL] = max_low_pfn - ARCH_PFN_OFFSET,
+ unsigned long max_zone_pfn[MAX_NR_ZONES] = {
+ [ZONE_NORMAL] = max_low_pfn,
#ifdef CONFIG_HIGHMEM
- [ZONE_HIGHMEM] = max_pfn - max_low_pfn,
+ [ZONE_HIGHMEM] = max_pfn,
#endif
};
- free_area_init_node(0, zones_size, ARCH_PFN_OFFSET, NULL);
+ free_area_init(max_zone_pfn);
}
#ifdef CONFIG_HIGHMEM
diff --git a/block/blk-core.c b/block/blk-core.c
index 77e57c2e8d60..d250ed6eccc6 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -20,6 +20,7 @@
#include <linux/blk-mq.h>
#include <linux/highmem.h>
#include <linux/mm.h>
+#include <linux/pagemap.h>
#include <linux/kernel_stat.h>
#include <linux/string.h>
#include <linux/init.h>
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 5abca09455ad..81bf71b10d44 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -168,12 +168,6 @@ int ghes_estatus_pool_init(int num_ghes)
if (!addr)
goto err_pool_alloc;
- /*
- * New allocation must be visible in all pgd before it can be found by
- * an NMI allocating from the pool.
- */
- vmalloc_sync_mappings();
-
rc = gen_pool_add(ghes_estatus_pool, addr, PAGE_ALIGN(len), -1);
if (rc)
goto err_pool_add;
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index dbec3a05590a..2b09b68b9f78 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -21,6 +21,7 @@
#include <linux/mm.h>
#include <linux/stat.h>
#include <linux/slab.h>
+#include <linux/xarray.h>
#include <linux/atomic.h>
#include <linux/uaccess.h>
@@ -74,6 +75,13 @@ static struct bus_type memory_subsys = {
.offline = memory_subsys_offline,
};
+/*
+ * Memory blocks are cached in a local radix tree to avoid
+ * a costly linear search for the corresponding device on
+ * the subsystem bus.
+ */
+static DEFINE_XARRAY(memory_blocks);
+
static BLOCKING_NOTIFIER_HEAD(memory_chain);
int register_memory_notifier(struct notifier_block *nb)
@@ -489,22 +497,23 @@ int __weak arch_get_memory_phys_device(unsigned long start_pfn)
return 0;
}
-/* A reference for the returned memory block device is acquired. */
+/*
+ * A reference for the returned memory block device is acquired.
+ *
+ * Called under device_hotplug_lock.
+ */
static struct memory_block *find_memory_block_by_id(unsigned long block_id)
{
- struct device *dev;
+ struct memory_block *mem;
- dev = subsys_find_device_by_id(&memory_subsys, block_id, NULL);
- return dev ? to_memory_block(dev) : NULL;
+ mem = xa_load(&memory_blocks, block_id);
+ if (mem)
+ get_device(&mem->dev);
+ return mem;
}
/*
- * For now, we have a linear search to go find the appropriate
- * memory_block corresponding to a particular phys_index. If
- * this gets to be a real problem, we can always use a radix
- * tree or something here.
- *
- * This could be made generic for all device subsystems.
+ * Called under device_hotplug_lock.
*/
struct memory_block *find_memory_block(struct mem_section *section)
{
@@ -548,9 +557,16 @@ int register_memory(struct memory_block *memory)
memory->dev.offline = memory->state == MEM_OFFLINE;
ret = device_register(&memory->dev);
- if (ret)
+ if (ret) {
put_device(&memory->dev);
-
+ return ret;
+ }
+ ret = xa_err(xa_store(&memory_blocks, memory->dev.id, memory,
+ GFP_KERNEL));
+ if (ret) {
+ put_device(&memory->dev);
+ device_unregister(&memory->dev);
+ }
return ret;
}
@@ -604,6 +620,8 @@ static void unregister_memory(struct memory_block *memory)
if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
return;
+ WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL);
+
/* drop the ref. we got via find_memory_block() */
put_device(&memory->dev);
device_unregister(&memory->dev);
@@ -750,6 +768,8 @@ void __init memory_dev_init(void)
*
* In case func() returns an error, walking is aborted and the error is
* returned.
+ *
+ * Called under device_hotplug_lock.
*/
int walk_memory_blocks(unsigned long start, unsigned long size,
void *arg, walk_memory_blocks_func_t func)
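The lookup above swaps a linear walk of the subsystem bus for an xarray keyed by block id. A minimal sketch of the same id-to-object cache pattern, using a hypothetical struct foo in place of struct memory_block:

#include <linux/xarray.h>

struct foo { unsigned long id; };

static DEFINE_XARRAY(foo_cache);

static int foo_cache_add(struct foo *foo)
{
	/* xa_store() returns the old entry or an xa_err()-encoded error. */
	return xa_err(xa_store(&foo_cache, foo->id, foo, GFP_KERNEL));
}

static struct foo *foo_cache_lookup(unsigned long id)
{
	return xa_load(&foo_cache, id);	/* no bus iteration needed */
}

static void foo_cache_remove(unsigned long id)
{
	xa_erase(&foo_cache, id);
}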
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 15e99697234a..df53dca5d02c 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -396,9 +396,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
bytes = sizeof(struct page *)*want;
new_pages = kzalloc(bytes, GFP_NOIO | __GFP_NOWARN);
if (!new_pages) {
- new_pages = __vmalloc(bytes,
- GFP_NOIO | __GFP_ZERO,
- PAGE_KERNEL);
+ new_pages = __vmalloc(bytes, GFP_NOIO | __GFP_ZERO);
if (!new_pages)
return NULL;
}
diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
index 1a8564a79d8d..e78e7a2ccfd5 100644
--- a/drivers/block/zram/zcomp.c
+++ b/drivers/block/zram/zcomp.c
@@ -29,7 +29,6 @@ static const char * const backends[] = {
#if IS_ENABLED(CONFIG_CRYPTO_ZSTD)
"zstd",
#endif
- NULL
};
static void zcomp_strm_free(struct zcomp_strm *zstrm)
@@ -67,7 +66,7 @@ bool zcomp_available_algorithm(const char *comp)
{
int i;
- i = __sysfs_match_string(backends, -1, comp);
+ i = sysfs_match_string(backends, comp);
if (i >= 0)
return true;
@@ -86,9 +85,9 @@ ssize_t zcomp_available_show(const char *comp, char *buf)
{
bool known_algorithm = false;
ssize_t sz = 0;
- int i = 0;
+ int i;
- for (; backends[i]; i++) {
+ for (i = 0; i < ARRAY_SIZE(backends); i++) {
if (!strcmp(comp, backends[i])) {
known_algorithm = true;
sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2,
diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h
index 3107ce80e809..16850d5388ab 100644
--- a/drivers/dax/dax-private.h
+++ b/drivers/dax/dax-private.h
@@ -44,6 +44,7 @@ struct dax_region {
* @dev - device core
* @pgmap - pgmap for memmap setup / lifetime (driver owned)
* @dax_mem_res: physical address range of hotadded DAX memory
+ * @dax_mem_name: name for hotadded DAX memory via add_memory_driver_managed()
*/
struct dev_dax {
struct dax_region *region;
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 1af823b2fe6b..4c0af2eb7e19 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -377,6 +377,7 @@ static int dax_open(struct inode *inode, struct file *filp)
inode->i_mapping->a_ops = &dev_dax_aops;
filp->f_mapping = inode->i_mapping;
filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
+ filp->f_sb_err = file_sample_sb_err(filp);
filp->private_data = dev_dax;
inode->i_flags = S_DAX;
diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
index 1e678bdf5aed..275aa5f87399 100644
--- a/drivers/dax/kmem.c
+++ b/drivers/dax/kmem.c
@@ -14,6 +14,11 @@
#include "dax-private.h"
#include "bus.h"
+/* Memory resource name used for add_memory_driver_managed(). */
+static const char *kmem_name;
+/* Set if any memory will remain added when the driver is unloaded. */
+static bool any_hotremove_failed;
+
int dev_dax_kmem_probe(struct device *dev)
{
struct dev_dax *dev_dax = to_dev_dax(dev);
@@ -70,7 +75,12 @@ int dev_dax_kmem_probe(struct device *dev)
*/
new_res->flags = IORESOURCE_SYSTEM_RAM;
- rc = add_memory(numa_node, new_res->start, resource_size(new_res));
+ /*
+ * Ensure that future kexec'd kernels will not treat this as RAM
+ * automatically.
+ */
+ rc = add_memory_driver_managed(numa_node, new_res->start,
+ resource_size(new_res), kmem_name);
if (rc) {
release_resource(new_res);
kfree(new_res);
@@ -100,6 +110,7 @@ static int dev_dax_kmem_remove(struct device *dev)
*/
rc = remove_memory(dev_dax->target_node, kmem_start, kmem_size);
if (rc) {
+ any_hotremove_failed = true;
dev_err(dev,
"DAX region %pR cannot be hotremoved until the next reboot\n",
res);
@@ -124,6 +135,7 @@ static int dev_dax_kmem_remove(struct device *dev)
* permanently pinned as reserved by the unreleased
* request_mem_region().
*/
+ any_hotremove_failed = true;
return 0;
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
@@ -137,12 +149,24 @@ static struct dax_device_driver device_dax_kmem_driver = {
static int __init dax_kmem_init(void)
{
- return dax_driver_register(&device_dax_kmem_driver);
+ int rc;
+
+ /* Resource name is permanently allocated if any hotremove fails. */
+ kmem_name = kstrdup_const("System RAM (kmem)", GFP_KERNEL);
+ if (!kmem_name)
+ return -ENOMEM;
+
+ rc = dax_driver_register(&device_dax_kmem_driver);
+ if (rc)
+ kfree_const(kmem_name);
+ return rc;
}
static void __exit dax_kmem_exit(void)
{
dax_driver_unregister(&device_dax_kmem_driver);
+ if (!any_hotremove_failed)
+ kfree_const(kmem_name);
}
MODULE_AUTHOR("Intel Corporation");
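The driver now publishes its memory under a distinct resource name so kexec tooling can tell driver-managed RAM apart, and that name string has to outlive the resource if hotremove fails. A condensed sketch of the lifetime rule, with hypothetical example_* names:

static const char *res_name;		/* referenced by the resource tree */
static bool removal_failed;

static int example_init(void)
{
	res_name = kstrdup_const("System RAM (kmem)", GFP_KERNEL);
	return res_name ? 0 : -ENOMEM;
}

static void example_exit(void)
{
	/* A failed hotremove leaves a resource pointing at res_name. */
	if (!removal_failed)
		kfree_const(res_name);
}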
diff --git a/drivers/gpu/drm/drm_scatter.c b/drivers/gpu/drm/drm_scatter.c
index ca520028b2cb..f4e6184d1877 100644
--- a/drivers/gpu/drm/drm_scatter.c
+++ b/drivers/gpu/drm/drm_scatter.c
@@ -43,15 +43,6 @@
#define DEBUG_SCATTER 0
-static inline void *drm_vmalloc_dma(unsigned long size)
-{
-#if defined(__powerpc__) && defined(CONFIG_NOT_COHERENT_CACHE)
- return __vmalloc(size, GFP_KERNEL, pgprot_noncached_wc(PAGE_KERNEL));
-#else
- return vmalloc_32(size);
-#endif
-}
-
static void drm_sg_cleanup(struct drm_sg_mem * entry)
{
struct page *page;
@@ -126,7 +117,7 @@ int drm_legacy_sg_alloc(struct drm_device *dev, void *data,
return -ENOMEM;
}
- entry->virtual = drm_vmalloc_dma(pages << PAGE_SHIFT);
+ entry->virtual = vmalloc_32(pages << PAGE_SHIFT);
if (!entry->virtual) {
kfree(entry->busaddr);
kfree(entry->pagelist);
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_dump.c b/drivers/gpu/drm/etnaviv/etnaviv_dump.c
index 648cf0207309..706af0304ca4 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_dump.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_dump.c
@@ -154,8 +154,8 @@ void etnaviv_core_dump(struct etnaviv_gem_submit *submit)
file_size += sizeof(*iter.hdr) * n_obj;
/* Allocate the file in vmalloc memory, it's likely to be big */
- iter.start = __vmalloc(file_size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
- PAGE_KERNEL);
+ iter.start = __vmalloc(file_size, GFP_KERNEL | __GFP_NOWARN |
+ __GFP_NORETRY);
if (!iter.start) {
mutex_unlock(&gpu->mmu_context->lock);
dev_warn(gpu->dev, "failed to allocate devcoredump file\n");
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
index 7ffd7afeb7a5..b55ac7563189 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
@@ -471,7 +471,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work)
down_read(&mm->mmap_sem);
locked = 1;
}
- ret = get_user_pages_remote
+ ret = pin_user_pages_remote
(work->task, mm,
obj->userptr.ptr + pinned * PAGE_SIZE,
npages - pinned,
@@ -507,7 +507,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work)
}
mutex_unlock(&obj->mm.lock);
- release_pages(pvec, pinned);
+ unpin_user_pages(pvec, pinned);
kvfree(pvec);
i915_gem_object_put(obj);
@@ -564,6 +564,7 @@ static int i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj)
struct sg_table *pages;
bool active;
int pinned;
+ unsigned int gup_flags = 0;
/* If userspace should engineer that these pages are replaced in
* the vma between us binding this page into the GTT and completion
@@ -598,11 +599,14 @@ static int i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj)
GFP_KERNEL |
__GFP_NORETRY |
__GFP_NOWARN);
- if (pvec) /* defer to worker if malloc fails */
- pinned = __get_user_pages_fast(obj->userptr.ptr,
- num_pages,
- !i915_gem_object_is_readonly(obj),
- pvec);
+ /* defer to worker if malloc fails */
+ if (pvec) {
+ if (!i915_gem_object_is_readonly(obj))
+ gup_flags |= FOLL_WRITE;
+ pinned = pin_user_pages_fast_only(obj->userptr.ptr,
+ num_pages, gup_flags,
+ pvec);
+ }
}
active = false;
@@ -620,7 +624,7 @@ static int i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj)
__i915_gem_userptr_set_active(obj, true);
if (IS_ERR(pages))
- release_pages(pvec, pinned);
+ unpin_user_pages(pvec, pinned);
kvfree(pvec);
return PTR_ERR_OR_ZERO(pages);
@@ -675,7 +679,7 @@ i915_gem_userptr_put_pages(struct drm_i915_gem_object *obj,
}
mark_page_accessed(page);
- put_page(page);
+ unpin_user_page(page);
}
obj->mm.dirty = false;
diff --git a/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c b/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c
index 9272bef57092..debaf7b18ab5 100644
--- a/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c
+++ b/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c
@@ -66,7 +66,7 @@ static void *mock_dmabuf_vmap(struct dma_buf *dma_buf)
{
struct mock_dmabuf *mock = to_mock(dma_buf);
- return vm_map_ram(mock->pages, mock->npages, 0, PAGE_KERNEL);
+ return vm_map_ram(mock->pages, mock->npages, 0);
}
static void mock_dmabuf_vunmap(struct dma_buf *dma_buf, void *vaddr)
diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c
index 52d2b71f1588..f09b096ba4fd 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_util.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_util.c
@@ -257,54 +257,6 @@ static int ttm_copy_io_page(void *dst, void *src, unsigned long page)
return 0;
}
-#ifdef CONFIG_X86
-#define __ttm_kmap_atomic_prot(__page, __prot) kmap_atomic_prot(__page, __prot)
-#define __ttm_kunmap_atomic(__addr) kunmap_atomic(__addr)
-#else
-#define __ttm_kmap_atomic_prot(__page, __prot) vmap(&__page, 1, 0, __prot)
-#define __ttm_kunmap_atomic(__addr) vunmap(__addr)
-#endif
-
-
-/**
- * ttm_kmap_atomic_prot - Efficient kernel map of a single page with
- * specified page protection.
- *
- * @page: The page to map.
- * @prot: The page protection.
- *
- * This function maps a TTM page using the kmap_atomic api if available,
- * otherwise falls back to vmap. The user must make sure that the
- * specified page does not have an aliased mapping with a different caching
- * policy unless the architecture explicitly allows it. Also mapping and
- * unmapping using this api must be correctly nested. Unmapping should
- * occur in the reverse order of mapping.
- */
-void *ttm_kmap_atomic_prot(struct page *page, pgprot_t prot)
-{
- if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL))
- return kmap_atomic(page);
- else
- return __ttm_kmap_atomic_prot(page, prot);
-}
-EXPORT_SYMBOL(ttm_kmap_atomic_prot);
-
-/**
- * ttm_kunmap_atomic_prot - Unmap a page that was mapped using
- * ttm_kmap_atomic_prot.
- *
- * @addr: The virtual address from the map.
- * @prot: The page protection.
- */
-void ttm_kunmap_atomic_prot(void *addr, pgprot_t prot)
-{
- if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL))
- kunmap_atomic(addr);
- else
- __ttm_kunmap_atomic(addr);
-}
-EXPORT_SYMBOL(ttm_kunmap_atomic_prot);
-
static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, void *src,
unsigned long page,
pgprot_t prot)
@@ -316,13 +268,13 @@ static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, void *src,
return -ENOMEM;
src = (void *)((unsigned long)src + (page << PAGE_SHIFT));
- dst = ttm_kmap_atomic_prot(d, prot);
+ dst = kmap_atomic_prot(d, prot);
if (!dst)
return -ENOMEM;
memcpy_fromio(dst, src, PAGE_SIZE);
- ttm_kunmap_atomic_prot(dst, prot);
+ kunmap_atomic(dst);
return 0;
}
@@ -338,13 +290,13 @@ static int ttm_copy_ttm_io_page(struct ttm_tt *ttm, void *dst,
return -ENOMEM;
dst = (void *)((unsigned long)dst + (page << PAGE_SHIFT));
- src = ttm_kmap_atomic_prot(s, prot);
+ src = kmap_atomic_prot(s, prot);
if (!src)
return -ENOMEM;
memcpy_toio(dst, src, PAGE_SIZE);
- ttm_kunmap_atomic_prot(src, prot);
+ kunmap_atomic(src);
return 0;
}
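With the TTM wrapper gone, the copy helpers call kmap_atomic_prot()/kunmap_atomic() directly. A minimal sketch of the resulting per-page pattern, using a hypothetical copy_page_from_io() helper:

static int copy_page_from_io(struct page *dst_page, const void __iomem *src,
			     pgprot_t prot)
{
	void *dst = kmap_atomic_prot(dst_page, prot);

	if (!dst)
		return -ENOMEM;
	memcpy_fromio(dst, src, PAGE_SIZE);
	kunmap_atomic(dst);	/* unmap in the reverse order of mapping */
	return 0;
}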
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c b/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c
index bb46ca0c458f..1629427d5734 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c
@@ -27,6 +27,7 @@
**************************************************************************/
#include "vmwgfx_drv.h"
+#include <linux/highmem.h>
/*
* Template that implements find_first_diff() for a generic
@@ -374,12 +375,12 @@ static int vmw_bo_cpu_blit_line(struct vmw_bo_blit_line_data *d,
copy_size = min_t(u32, copy_size, PAGE_SIZE - src_page_offset);
if (unmap_src) {
- ttm_kunmap_atomic_prot(d->src_addr, d->src_prot);
+ kunmap_atomic(d->src_addr);
d->src_addr = NULL;
}
if (unmap_dst) {
- ttm_kunmap_atomic_prot(d->dst_addr, d->dst_prot);
+ kunmap_atomic(d->dst_addr);
d->dst_addr = NULL;
}
@@ -388,8 +389,8 @@ static int vmw_bo_cpu_blit_line(struct vmw_bo_blit_line_data *d,
return -EINVAL;
d->dst_addr =
- ttm_kmap_atomic_prot(d->dst_pages[dst_page],
- d->dst_prot);
+ kmap_atomic_prot(d->dst_pages[dst_page],
+ d->dst_prot);
if (!d->dst_addr)
return -ENOMEM;
@@ -401,8 +402,8 @@ static int vmw_bo_cpu_blit_line(struct vmw_bo_blit_line_data *d,
return -EINVAL;
d->src_addr =
- ttm_kmap_atomic_prot(d->src_pages[src_page],
- d->src_prot);
+ kmap_atomic_prot(d->src_pages[src_page],
+ d->src_prot);
if (!d->src_addr)
return -ENOMEM;
@@ -499,9 +500,9 @@ int vmw_bo_cpu_blit(struct ttm_buffer_object *dst,
}
out:
if (d.src_addr)
- ttm_kunmap_atomic_prot(d.src_addr, d.src_prot);
+ kunmap_atomic(d.src_addr);
if (d.dst_addr)
- ttm_kunmap_atomic_prot(d.dst_addr, d.dst_prot);
+ kunmap_atomic(d.dst_addr);
return ret;
}
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index 9a967a2e83dd..6e677ff62cc9 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -145,9 +145,8 @@ static int pblk_l2p_init(struct pblk *pblk, bool factory_init)
int ret = 0;
map_size = pblk_trans_map_size(pblk);
- pblk->trans_map = __vmalloc(map_size, GFP_KERNEL | __GFP_NOWARN
- | __GFP_RETRY_MAYFAIL | __GFP_HIGHMEM,
- PAGE_KERNEL);
+ pblk->trans_map = __vmalloc(map_size, GFP_KERNEL | __GFP_NOWARN |
+ __GFP_RETRY_MAYFAIL | __GFP_HIGHMEM);
if (!pblk->trans_map) {
pblk_err(pblk, "failed to allocate L2P (need %zu of memory)\n",
map_size);
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index bf289be1ee3a..c35f224400c1 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -400,13 +400,13 @@ static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
*/
if (gfp_mask & __GFP_NORETRY) {
unsigned noio_flag = memalloc_noio_save();
- void *ptr = __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
+ void *ptr = __vmalloc(c->block_size, gfp_mask);
memalloc_noio_restore(noio_flag);
return ptr;
}
- return __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
+ return __vmalloc(c->block_size, gfp_mask);
}
/*
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index b952bd45bd6a..95a5f3757fa3 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -324,14 +324,6 @@ static void end_bitmap_write(struct buffer_head *bh, int uptodate)
wake_up(&bitmap->write_wait);
}
-/* copied from buffer.c */
-static void
-__clear_page_buffers(struct page *page)
-{
- ClearPagePrivate(page);
- set_page_private(page, 0);
- put_page(page);
-}
static void free_buffers(struct page *page)
{
struct buffer_head *bh;
@@ -345,7 +337,7 @@ static void free_buffers(struct page *page)
free_buffer_head(bh);
bh = next;
}
- __clear_page_buffers(page);
+ detach_page_private(page);
put_page(page);
}
@@ -374,7 +366,7 @@ static int read_page(struct file *file, unsigned long index,
ret = -ENOMEM;
goto out;
}
- attach_page_buffers(page, bh);
+ attach_page_private(page, bh);
blk_cur = index << (PAGE_SHIFT - inode->i_blkbits);
while (bh) {
block = blk_cur;
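The open-coded ClearPagePrivate()/set_page_private()/put_page() sequence is replaced by the attach_page_private()/detach_page_private() pair. A short sketch of what that pair does, with hypothetical example_* wrappers:

static void example_attach(struct page *page, struct buffer_head *head)
{
	/* Takes a page reference, sets PG_private and page->private. */
	attach_page_private(page, head);
}

static struct buffer_head *example_detach(struct page *page)
{
	/* Clears PG_private, drops the reference, returns the old pointer. */
	return detach_page_private(page);
}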
diff --git a/drivers/media/common/videobuf2/videobuf2-dma-sg.c b/drivers/media/common/videobuf2/videobuf2-dma-sg.c
index 6db60e9d5183..92072a08af25 100644
--- a/drivers/media/common/videobuf2/videobuf2-dma-sg.c
+++ b/drivers/media/common/videobuf2/videobuf2-dma-sg.c
@@ -309,8 +309,7 @@ static void *vb2_dma_sg_vaddr(void *buf_priv)
if (buf->db_attach)
buf->vaddr = dma_buf_vmap(buf->db_attach->dmabuf);
else
- buf->vaddr = vm_map_ram(buf->pages,
- buf->num_pages, -1, PAGE_KERNEL);
+ buf->vaddr = vm_map_ram(buf->pages, buf->num_pages, -1);
}
/* add offset in case userptr is not page-aligned */
diff --git a/drivers/media/common/videobuf2/videobuf2-vmalloc.c b/drivers/media/common/videobuf2/videobuf2-vmalloc.c
index 1a4f0ca87c7c..c66fda4a65e4 100644
--- a/drivers/media/common/videobuf2/videobuf2-vmalloc.c
+++ b/drivers/media/common/videobuf2/videobuf2-vmalloc.c
@@ -107,8 +107,7 @@ static void *vb2_vmalloc_get_userptr(struct device *dev, unsigned long vaddr,
buf->vaddr = (__force void *)
ioremap(__pfn_to_phys(nums[0]), size + offset);
} else {
- buf->vaddr = vm_map_ram(frame_vector_pages(vec), n_pages, -1,
- PAGE_KERNEL);
+ buf->vaddr = vm_map_ram(frame_vector_pages(vec), n_pages, -1);
}
if (!buf->vaddr)
diff --git a/drivers/media/pci/ivtv/ivtv-udma.c b/drivers/media/pci/ivtv/ivtv-udma.c
index 5f8883031c9c..0d8372cc364a 100644
--- a/drivers/media/pci/ivtv/ivtv-udma.c
+++ b/drivers/media/pci/ivtv/ivtv-udma.c
@@ -92,7 +92,7 @@ int ivtv_udma_setup(struct ivtv *itv, unsigned long ivtv_dest_addr,
{
struct ivtv_dma_page_info user_dma;
struct ivtv_user_dma *dma = &itv->udma;
- int i, err;
+ int err;
IVTV_DEBUG_DMA("ivtv_udma_setup, dst: 0x%08x\n", (unsigned int)ivtv_dest_addr);
@@ -111,16 +111,15 @@ int ivtv_udma_setup(struct ivtv *itv, unsigned long ivtv_dest_addr,
return -EINVAL;
}
- /* Get user pages for DMA Xfer */
- err = get_user_pages_unlocked(user_dma.uaddr, user_dma.page_count,
+ /* Pin user pages for DMA Xfer */
+ err = pin_user_pages_unlocked(user_dma.uaddr, user_dma.page_count,
dma->map, FOLL_FORCE);
if (user_dma.page_count != err) {
IVTV_DEBUG_WARN("failed to map user pages, returned %d instead of %d\n",
err, user_dma.page_count);
if (err >= 0) {
- for (i = 0; i < err; i++)
- put_page(dma->map[i]);
+ unpin_user_pages(dma->map, err);
return -EINVAL;
}
return err;
@@ -130,9 +129,7 @@ int ivtv_udma_setup(struct ivtv *itv, unsigned long ivtv_dest_addr,
/* Fill SG List with new values */
if (ivtv_udma_fill_sg_list(dma, &user_dma, 0) < 0) {
- for (i = 0; i < dma->page_count; i++) {
- put_page(dma->map[i]);
- }
+ unpin_user_pages(dma->map, dma->page_count);
dma->page_count = 0;
return -ENOMEM;
}
@@ -153,7 +150,6 @@ int ivtv_udma_setup(struct ivtv *itv, unsigned long ivtv_dest_addr,
void ivtv_udma_unmap(struct ivtv *itv)
{
struct ivtv_user_dma *dma = &itv->udma;
- int i;
IVTV_DEBUG_INFO("ivtv_unmap_user_dma\n");
@@ -169,10 +165,7 @@ void ivtv_udma_unmap(struct ivtv *itv)
/* sync DMA */
ivtv_udma_sync_for_cpu(itv);
- /* Release User Pages */
- for (i = 0; i < dma->page_count; i++) {
- put_page(dma->map[i]);
- }
+ unpin_user_pages(dma->map, dma->page_count);
dma->page_count = 0;
}
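The conversion above follows the general pin_user_pages rule: pages pinned for DMA are released with unpin_user_pages() rather than put_page(). A minimal sketch of the pin/cleanup pairing, using a hypothetical pin_buffer() helper:

static long pin_buffer(unsigned long uaddr, long nr_pages, struct page **pages)
{
	long pinned = pin_user_pages_unlocked(uaddr, nr_pages, pages, FOLL_FORCE);

	if (pinned != nr_pages) {
		if (pinned > 0)
			unpin_user_pages(pages, pinned);	/* partial pin */
		return pinned < 0 ? pinned : -EINVAL;
	}
	return 0;
}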
diff --git a/drivers/media/pci/ivtv/ivtv-yuv.c b/drivers/media/pci/ivtv/ivtv-yuv.c
index cd2fe2d444c0..5f7dc9771f8d 100644
--- a/drivers/media/pci/ivtv/ivtv-yuv.c
+++ b/drivers/media/pci/ivtv/ivtv-yuv.c
@@ -30,7 +30,6 @@ static int ivtv_yuv_prep_user_dma(struct ivtv *itv, struct ivtv_user_dma *dma,
struct yuv_playback_info *yi = &itv->yuv_info;
u8 frame = yi->draw_frame;
struct yuv_frame_info *f = &yi->new_frame_info[frame];
- int i;
int y_pages, uv_pages;
unsigned long y_buffer_offset, uv_buffer_offset;
int y_decode_height, uv_decode_height, y_size;
@@ -62,12 +61,12 @@ static int ivtv_yuv_prep_user_dma(struct ivtv *itv, struct ivtv_user_dma *dma,
ivtv_udma_get_page_info (&y_dma, (unsigned long)args->y_source, 720 * y_decode_height);
ivtv_udma_get_page_info (&uv_dma, (unsigned long)args->uv_source, 360 * uv_decode_height);
- /* Get user pages for DMA Xfer */
- y_pages = get_user_pages_unlocked(y_dma.uaddr,
+ /* Pin user pages for DMA Xfer */
+ y_pages = pin_user_pages_unlocked(y_dma.uaddr,
y_dma.page_count, &dma->map[0], FOLL_FORCE);
uv_pages = 0; /* silence gcc. value is set and consumed only if: */
if (y_pages == y_dma.page_count) {
- uv_pages = get_user_pages_unlocked(uv_dma.uaddr,
+ uv_pages = pin_user_pages_unlocked(uv_dma.uaddr,
uv_dma.page_count, &dma->map[y_pages],
FOLL_FORCE);
}
@@ -81,8 +80,7 @@ static int ivtv_yuv_prep_user_dma(struct ivtv *itv, struct ivtv_user_dma *dma,
uv_pages, uv_dma.page_count);
if (uv_pages >= 0) {
- for (i = 0; i < uv_pages; i++)
- put_page(dma->map[y_pages + i]);
+ unpin_user_pages(&dma->map[y_pages], uv_pages);
rc = -EFAULT;
} else {
rc = uv_pages;
@@ -93,8 +91,7 @@ static int ivtv_yuv_prep_user_dma(struct ivtv *itv, struct ivtv_user_dma *dma,
y_pages, y_dma.page_count);
}
if (y_pages >= 0) {
- for (i = 0; i < y_pages; i++)
- put_page(dma->map[i]);
+ unpin_user_pages(dma->map, y_pages);
/*
* Inherit the -EFAULT from rc's
* initialization, but allow it to be
@@ -112,9 +109,7 @@ static int ivtv_yuv_prep_user_dma(struct ivtv *itv, struct ivtv_user_dma *dma,
/* Fill & map SG List */
if (ivtv_udma_fill_sg_list (dma, &uv_dma, ivtv_udma_fill_sg_list (dma, &y_dma, 0)) < 0) {
IVTV_DEBUG_WARN("could not allocate bounce buffers for highmem userspace buffers\n");
- for (i = 0; i < dma->page_count; i++) {
- put_page(dma->map[i]);
- }
+ unpin_user_pages(dma->map, dma->page_count);
dma->page_count = 0;
return -ENOMEM;
}
diff --git a/drivers/media/pci/ivtv/ivtvfb.c b/drivers/media/pci/ivtv/ivtvfb.c
index 0c2859844081..e2d56dca5be4 100644
--- a/drivers/media/pci/ivtv/ivtvfb.c
+++ b/drivers/media/pci/ivtv/ivtvfb.c
@@ -281,10 +281,10 @@ static int ivtvfb_prep_dec_dma_to_device(struct ivtv *itv,
/* Map User DMA */
if (ivtv_udma_setup(itv, ivtv_dest_addr, userbuf, size_in_bytes) <= 0) {
mutex_unlock(&itv->udma.lock);
- IVTVFB_WARN("ivtvfb_prep_dec_dma_to_device, Error with get_user_pages: %d bytes, %d pages returned\n",
+ IVTVFB_WARN("ivtvfb_prep_dec_dma_to_device, Error with pin_user_pages: %d bytes, %d pages returned\n",
size_in_bytes, itv->udma.page_count);
- /* get_user_pages must have failed completely */
+ /* pin_user_pages must have failed completely */
return -EIO;
}
diff --git a/drivers/mtd/ubi/io.c b/drivers/mtd/ubi/io.c
index b57b84fb97d0..14d890b00d2c 100644
--- a/drivers/mtd/ubi/io.c
+++ b/drivers/mtd/ubi/io.c
@@ -1297,7 +1297,7 @@ static int self_check_write(struct ubi_device *ubi, const void *buf, int pnum,
if (!ubi_dbg_chk_io(ubi))
return 0;
- buf1 = __vmalloc(len, GFP_NOFS, PAGE_KERNEL);
+ buf1 = __vmalloc(len, GFP_NOFS);
if (!buf1) {
ubi_err(ubi, "cannot allocate memory to check writes");
return 0;
@@ -1361,7 +1361,7 @@ int ubi_self_check_all_ff(struct ubi_device *ubi, int pnum, int offset, int len)
if (!ubi_dbg_chk_io(ubi))
return 0;
- buf = __vmalloc(len, GFP_NOFS, PAGE_KERNEL);
+ buf = __vmalloc(len, GFP_NOFS);
if (!buf) {
ubi_err(ubi, "cannot allocate memory to check for 0xFFs");
return 0;
diff --git a/drivers/pcmcia/electra_cf.c b/drivers/pcmcia/electra_cf.c
index f2741c04289d..35158cfd9c1a 100644
--- a/drivers/pcmcia/electra_cf.c
+++ b/drivers/pcmcia/electra_cf.c
@@ -178,10 +178,9 @@ static int electra_cf_probe(struct platform_device *ofdev)
struct device_node *np = ofdev->dev.of_node;
struct electra_cf_socket *cf;
struct resource mem, io;
- int status;
+ int status = -ENOMEM;
const unsigned int *prop;
int err;
- struct vm_struct *area;
err = of_address_to_resource(np, 0, &mem);
if (err)
@@ -202,30 +201,19 @@ static int electra_cf_probe(struct platform_device *ofdev)
cf->mem_phys = mem.start;
cf->mem_size = PAGE_ALIGN(resource_size(&mem));
cf->mem_base = ioremap(cf->mem_phys, cf->mem_size);
+ if (!cf->mem_base)
+ goto out_free_cf;
cf->io_size = PAGE_ALIGN(resource_size(&io));
-
- area = __get_vm_area(cf->io_size, 0, PHB_IO_BASE, PHB_IO_END);
- if (area == NULL) {
- status = -ENOMEM;
- goto fail1;
- }
-
- cf->io_virt = (void __iomem *)(area->addr);
+ cf->io_virt = ioremap_phb(io.start, cf->io_size);
+ if (!cf->io_virt)
+ goto out_unmap_mem;
cf->gpio_base = ioremap(0xfc103000, 0x1000);
+ if (!cf->gpio_base)
+ goto out_unmap_virt;
dev_set_drvdata(device, cf);
- if (!cf->mem_base || !cf->io_virt || !cf->gpio_base ||
- (__ioremap_at(io.start, cf->io_virt, cf->io_size,
- pgprot_noncached(PAGE_KERNEL)) == NULL)) {
- dev_err(device, "can't ioremap ranges\n");
- status = -ENOMEM;
- goto fail1;
- }
-
-
cf->io_base = (unsigned long)cf->io_virt - VMALLOC_END;
-
cf->iomem.start = (unsigned long)cf->mem_base;
cf->iomem.end = (unsigned long)cf->mem_base + (mem.end - mem.start);
cf->iomem.flags = IORESOURCE_MEM;
@@ -305,14 +293,13 @@ fail1:
if (cf->irq)
free_irq(cf->irq, cf);
- if (cf->io_virt)
- __iounmap_at(cf->io_virt, cf->io_size);
- if (cf->mem_base)
- iounmap(cf->mem_base);
- if (cf->gpio_base)
- iounmap(cf->gpio_base);
- if (area)
- device_init_wakeup(&ofdev->dev, 0);
+ iounmap(cf->gpio_base);
+out_unmap_virt:
+ device_init_wakeup(&ofdev->dev, 0);
+ iounmap(cf->io_virt);
+out_unmap_mem:
+ iounmap(cf->mem_base);
+out_free_cf:
kfree(cf);
return status;
@@ -330,7 +317,7 @@ static int electra_cf_remove(struct platform_device *ofdev)
free_irq(cf->irq, cf);
del_timer_sync(&cf->timer);
- __iounmap_at(cf->io_virt, cf->io_size);
+ iounmap(cf->io_virt);
iounmap(cf->mem_base);
iounmap(cf->gpio_base);
release_mem_region(cf->mem_phys, cf->mem_size);
diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c
index 10af330153b5..451608e960a1 100644
--- a/drivers/rapidio/devices/rio_mport_cdev.c
+++ b/drivers/rapidio/devices/rio_mport_cdev.c
@@ -572,14 +572,12 @@ static void dma_req_free(struct kref *ref)
struct mport_dma_req *req = container_of(ref, struct mport_dma_req,
refcount);
struct mport_cdev_priv *priv = req->priv;
- unsigned int i;
dma_unmap_sg(req->dmach->device->dev,
req->sgt.sgl, req->sgt.nents, req->dir);
sg_free_table(&req->sgt);
if (req->page_list) {
- for (i = 0; i < req->nr_pages; i++)
- put_page(req->page_list[i]);
+ unpin_user_pages(req->page_list, req->nr_pages);
kfree(req->page_list);
}
@@ -815,7 +813,7 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode,
struct mport_dma_req *req;
struct mport_dev *md = priv->md;
struct dma_chan *chan;
- int i, ret;
+ int ret;
int nents;
if (xfer->length == 0)
@@ -862,7 +860,7 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode,
goto err_req;
}
- pinned = get_user_pages_fast(
+ pinned = pin_user_pages_fast(
(unsigned long)xfer->loc_addr & PAGE_MASK,
nr_pages,
dir == DMA_FROM_DEVICE ? FOLL_WRITE : 0,
@@ -870,7 +868,7 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode,
if (pinned != nr_pages) {
if (pinned < 0) {
- rmcd_error("get_user_pages_unlocked err=%ld",
+ rmcd_error("pin_user_pages_fast err=%ld",
pinned);
nr_pages = 0;
} else
@@ -951,8 +949,7 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode,
err_pg:
if (!req->page_list) {
- for (i = 0; i < nr_pages; i++)
- put_page(page_list[i]);
+ unpin_user_pages(page_list, nr_pages);
kfree(page_list);
}
err_req:
@@ -2384,13 +2381,6 @@ static struct mport_dev *mport_cdev_add(struct rio_mport *mport)
cdev_init(&md->cdev, &mport_fops);
md->cdev.owner = THIS_MODULE;
- ret = cdev_device_add(&md->cdev, &md->dev);
- if (ret) {
- rmcd_error("Failed to register mport %d (err=%d)",
- mport->id, ret);
- goto err_cdev;
- }
-
INIT_LIST_HEAD(&md->doorbells);
spin_lock_init(&md->db_lock);
INIT_LIST_HEAD(&md->portwrites);
@@ -2410,6 +2400,13 @@ static struct mport_dev *mport_cdev_add(struct rio_mport *mport)
#else
md->properties.transfer_mode |= RIO_TRANSFER_MODE_TRANSFER;
#endif
+
+ ret = cdev_device_add(&md->cdev, &md->dev);
+ if (ret) {
+ rmcd_error("Failed to register mport %d (err=%d)",
+ mport->id, ret);
+ goto err_cdev;
+ }
ret = rio_query_mport(mport, &attr);
if (!ret) {
md->properties.flags = attr.flags;
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index bb87fbba2a09..6f7eba66687e 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -169,8 +169,7 @@ static void *sd_zbc_alloc_report_buffer(struct scsi_disk *sdkp,
while (bufsize >= SECTOR_SIZE) {
buf = __vmalloc(bufsize,
- GFP_KERNEL | __GFP_ZERO | __GFP_NORETRY,
- PAGE_KERNEL);
+ GFP_KERNEL | __GFP_ZERO | __GFP_NORETRY);
if (buf) {
*buflen = bufsize;
return buf;
diff --git a/drivers/staging/android/ion/ion_heap.c b/drivers/staging/android/ion/ion_heap.c
index 473b465724f1..0755b11348ed 100644
--- a/drivers/staging/android/ion/ion_heap.c
+++ b/drivers/staging/android/ion/ion_heap.c
@@ -99,12 +99,12 @@ int ion_heap_map_user(struct ion_heap *heap, struct ion_buffer *buffer,
static int ion_heap_clear_pages(struct page **pages, int num, pgprot_t pgprot)
{
- void *addr = vm_map_ram(pages, num, -1, pgprot);
+ void *addr = vmap(pages, num, VM_MAP, pgprot);
if (!addr)
return -ENOMEM;
memset(addr, 0, PAGE_SIZE * num);
- vm_unmap_ram(addr, num);
+ vunmap(addr);
return 0;
}
diff --git a/drivers/staging/media/ipu3/ipu3-css-pool.h b/drivers/staging/media/ipu3/ipu3-css-pool.h
index f4a60b41401b..a8ccd4f70320 100644
--- a/drivers/staging/media/ipu3/ipu3-css-pool.h
+++ b/drivers/staging/media/ipu3/ipu3-css-pool.h
@@ -15,14 +15,12 @@ struct imgu_device;
* @size: size of the buffer in bytes.
* @vaddr: kernel virtual address.
* @daddr: iova dma address to access IPU3.
- * @vma: private, a pointer to &struct vm_struct,
- * used for imgu_dmamap_free.
*/
struct imgu_css_map {
size_t size;
void *vaddr;
dma_addr_t daddr;
- struct vm_struct *vma;
+ struct page **pages;
};
/**
diff --git a/drivers/staging/media/ipu3/ipu3-dmamap.c b/drivers/staging/media/ipu3/ipu3-dmamap.c
index 7431322379f6..8a19b0024152 100644
--- a/drivers/staging/media/ipu3/ipu3-dmamap.c
+++ b/drivers/staging/media/ipu3/ipu3-dmamap.c
@@ -96,6 +96,7 @@ void *imgu_dmamap_alloc(struct imgu_device *imgu, struct imgu_css_map *map,
unsigned long shift = iova_shift(&imgu->iova_domain);
struct device *dev = &imgu->pci_dev->dev;
size_t size = PAGE_ALIGN(len);
+ int count = size >> PAGE_SHIFT;
struct page **pages;
dma_addr_t iovaddr;
struct iova *iova;
@@ -114,7 +115,7 @@ void *imgu_dmamap_alloc(struct imgu_device *imgu, struct imgu_css_map *map,
/* Call IOMMU driver to setup pgt */
iovaddr = iova_dma_addr(&imgu->iova_domain, iova);
- for (i = 0; i < size / PAGE_SIZE; ++i) {
+ for (i = 0; i < count; ++i) {
rval = imgu_mmu_map(imgu->mmu, iovaddr,
page_to_phys(pages[i]), PAGE_SIZE);
if (rval)
@@ -123,33 +124,23 @@ void *imgu_dmamap_alloc(struct imgu_device *imgu, struct imgu_css_map *map,
iovaddr += PAGE_SIZE;
}
- /* Now grab a virtual region */
- map->vma = __get_vm_area(size, VM_USERMAP, VMALLOC_START, VMALLOC_END);
- if (!map->vma)
+ map->vaddr = vmap(pages, count, VM_USERMAP, PAGE_KERNEL);
+ if (!map->vaddr)
goto out_unmap;
- map->vma->pages = pages;
- /* And map it in KVA */
- if (map_vm_area(map->vma, PAGE_KERNEL, pages))
- goto out_vunmap;
-
+ map->pages = pages;
map->size = size;
map->daddr = iova_dma_addr(&imgu->iova_domain, iova);
- map->vaddr = map->vma->addr;
dev_dbg(dev, "%s: allocated %zu @ IOVA %pad @ VA %p\n", __func__,
- size, &map->daddr, map->vma->addr);
-
- return map->vma->addr;
+ size, &map->daddr, map->vaddr);
-out_vunmap:
- vunmap(map->vma->addr);
+ return map->vaddr;
out_unmap:
imgu_dmamap_free_buffer(pages, size);
imgu_mmu_unmap(imgu->mmu, iova_dma_addr(&imgu->iova_domain, iova),
i * PAGE_SIZE);
- map->vma = NULL;
out_free_iova:
__free_iova(&imgu->iova_domain, iova);
@@ -177,8 +168,6 @@ void imgu_dmamap_unmap(struct imgu_device *imgu, struct imgu_css_map *map)
*/
void imgu_dmamap_free(struct imgu_device *imgu, struct imgu_css_map *map)
{
- struct vm_struct *area = map->vma;
-
dev_dbg(&imgu->pci_dev->dev, "%s: freeing %zu @ IOVA %pad @ VA %p\n",
__func__, map->size, &map->daddr, map->vaddr);
@@ -187,11 +176,8 @@ void imgu_dmamap_free(struct imgu_device *imgu, struct imgu_css_map *map)
imgu_dmamap_unmap(imgu, map);
- if (WARN_ON(!area) || WARN_ON(!area->pages))
- return;
-
- imgu_dmamap_free_buffer(area->pages, map->size);
vunmap(map->vaddr);
+ imgu_dmamap_free_buffer(map->pages, map->size);
map->vaddr = NULL;
}
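Instead of reserving a vm area and mapping pages into it by hand, the page array is now mapped in one call and remembered for the free path. A minimal sketch of the pattern, with hypothetical map/unmap helpers:

static void *map_page_array(struct page **pages, unsigned int count)
{
	/* One call replaces __get_vm_area() + map_vm_area(). */
	return vmap(pages, count, VM_USERMAP, PAGE_KERNEL);
}

static void unmap_page_array(void *vaddr)
{
	vunmap(vaddr);	/* the caller still owns and frees the pages */
}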
diff --git a/drivers/tty/serial/sh-sci.c b/drivers/tty/serial/sh-sci.c
index ae8463aa8e0f..bb000b8f44cc 100644
--- a/drivers/tty/serial/sh-sci.c
+++ b/drivers/tty/serial/sh-sci.c
@@ -1352,7 +1352,7 @@ static int sci_dma_rx_submit(struct sci_port *s, bool port_lock_held)
{
struct dma_chan *chan = s->chan_rx;
struct uart_port *port = &s->port;
- unsigned long flags;
+ unsigned long uninitialized_var(flags);
int i;
for (i = 0; i < 2; i++) {
diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c
index 15d33fa0c925..0262bc23eca2 100644
--- a/drivers/tty/vt/keyboard.c
+++ b/drivers/tty/vt/keyboard.c
@@ -633,6 +633,8 @@ static void k_spec(struct vc_data *vc, unsigned char value, char up_flag)
kbd->kbdmode == VC_OFF) &&
value != KVAL(K_SAK))
return; /* SAK is allowed even in raw mode */
+ if (IS_ENABLED(CONFIG_TWIST_DISABLE_KBD_K_SPEC_HANDLER))
+ return;
fn_handler[value](vc);
}
diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index aa45840d8273..de624c47e190 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -31,6 +31,7 @@
#include <linux/types.h>
#include <linux/genalloc.h>
#include <linux/io.h>
+#include <linux/kcov.h>
#include <linux/phy/phy.h>
#include <linux/usb.h>
@@ -1645,7 +1646,9 @@ static void __usb_hcd_giveback_urb(struct urb *urb)
/* pass ownership to the completion handler */
urb->status = status;
+ kcov_remote_start_usb((u64)urb->dev->bus->busnum);
urb->complete(urb);
+ kcov_remote_stop();
usb_anchor_resume_wakeups(anchor);
atomic_dec(&urb->use_count);
diff --git a/fs/aio.c b/fs/aio.c
index 1a9ab2460330..e0c38bbd1546 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1288,12 +1288,9 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr,
* the ringbuffer empty. So in practice we should be ok, but it's
* something to be aware of when touching this code.
*/
- if (until == 0)
- aio_read_events(ctx, min_nr, nr, event, &ret);
- else
- wait_event_interruptible_hrtimeout(ctx->wait,
- aio_read_events(ctx, min_nr, nr, event, &ret),
- until);
+ wait_event_interruptible_hrtimeout(ctx->wait,
+ aio_read_events(ctx, min_nr, nr, event, &ret),
+ until);
return ret;
}
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 8bf92048e444..32c8029e5867 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -353,8 +353,6 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
return 0;
}
-#ifndef elf_map
-
static unsigned long elf_map(struct file *filep, unsigned long addr,
const struct elf_phdr *eppnt, int prot, int type,
unsigned long total_size)
@@ -394,8 +392,6 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
return(map_addr);
}
-#endif /* !elf_map */
-
static unsigned long total_mapping_size(const struct elf_phdr *cmds, int nr)
{
int i, first_idx = -1, last_idx = -1;
@@ -1855,7 +1851,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
(!regset->active || regset->active(t->task, regset) > 0)) {
int ret;
size_t size = regset_size(t->task, regset);
- void *data = kmalloc(size, GFP_KERNEL);
+ void *data = kzalloc(size, GFP_KERNEL);
if (unlikely(!data))
return 0;
ret = regset->get(t->task, regset,
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index 995883693cb2..06b9b9fddf70 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -64,15 +64,15 @@ static int load_em86(struct linux_binprm *bprm)
* user environment and arguments are stored.
*/
remove_arg_zero(bprm);
- retval = copy_strings_kernel(1, &bprm->filename, bprm);
+ retval = copy_string_kernel(bprm->filename, bprm);
if (retval < 0) return retval;
bprm->argc++;
if (i_arg) {
- retval = copy_strings_kernel(1, &i_arg, bprm);
+ retval = copy_string_kernel(i_arg, bprm);
if (retval < 0) return retval;
bprm->argc++;
}
- retval = copy_strings_kernel(1, &i_name, bprm);
+ retval = copy_string_kernel(i_name, bprm);
if (retval < 0) return retval;
bprm->argc++;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 53968ea07b57..f69a043f562b 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -163,13 +163,13 @@ static int load_misc_binary(struct linux_binprm *bprm)
bprm->have_execfd = 1;
/* make argv[1] be the path to the binary */
- retval = copy_strings_kernel(1, &bprm->interp, bprm);
+ retval = copy_string_kernel(bprm->interp, bprm);
if (retval < 0)
goto ret;
bprm->argc++;
/* add the interp as argv[0] */
- retval = copy_strings_kernel(1, &fmt->interpreter, bprm);
+ retval = copy_string_kernel(fmt->interpreter, bprm);
if (retval < 0)
goto ret;
bprm->argc++;
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 0e8b953d12cf..1b6625e95958 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -106,19 +106,19 @@ static int load_script(struct linux_binprm *bprm)
retval = remove_arg_zero(bprm);
if (retval)
return retval;
- retval = copy_strings_kernel(1, &bprm->interp, bprm);
+ retval = copy_string_kernel(bprm->interp, bprm);
if (retval < 0)
return retval;
bprm->argc++;
*((char *)i_end) = '\0';
if (i_arg) {
*((char *)i_sep) = '\0';
- retval = copy_strings_kernel(1, &i_arg, bprm);
+ retval = copy_string_kernel(i_arg, bprm);
if (retval < 0)
return retval;
bprm->argc++;
}
- retval = copy_strings_kernel(1, &i_name, bprm);
+ retval = copy_string_kernel(i_name, bprm);
if (retval)
return retval;
bprm->argc++;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index ec8dccc81b65..a333a648244e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -614,10 +614,9 @@ static int blkdev_readpage(struct file * file, struct page * page)
return block_read_full_page(page, blkdev_get_block);
}
-static int blkdev_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+static void blkdev_readahead(struct readahead_control *rac)
{
- return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block);
+ mpage_readahead(rac, blkdev_get_block);
}
static int blkdev_write_begin(struct file *file, struct address_space *mapping,
@@ -2068,7 +2067,7 @@ static int blkdev_writepages(struct address_space *mapping,
static const struct address_space_operations def_blk_aops = {
.readpage = blkdev_readpage,
- .readpages = blkdev_readpages,
+ .readahead = blkdev_readahead,
.writepage = blkdev_writepage,
.write_begin = blkdev_write_begin,
.write_end = blkdev_write_end,
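The address_space_operations entry changes from ->readpages(file, mapping, pages, nr) to ->readahead(rac), where the readahead_control carries the mapping, start index and page count. A rough sketch of the wiring for an mpage-based filesystem, assuming a hypothetical example_get_block() stand-in for the filesystem's get_block_t:

static int example_get_block(struct inode *inode, sector_t iblock,
			     struct buffer_head *bh_result, int create);

static void example_readahead(struct readahead_control *rac)
{
	/* mpage_readahead() pulls the locked pages out of rac itself. */
	mpage_readahead(rac, example_get_block);
}

static const struct address_space_operations example_aops = {
	.readahead = example_readahead,
	/* .readpage, .writepage, ... as before */
};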
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f8ec2d8606fd..7c6f0bbb54a5 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -976,9 +976,7 @@ static void btree_invalidatepage(struct page *page, unsigned int offset,
btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info,
"page private not zero on page %llu",
(unsigned long long)page_offset(page));
- ClearPagePrivate(page);
- set_page_private(page, 0);
- put_page(page);
+ detach_page_private(page);
}
}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d73fb7f49582..704239546093 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3099,22 +3099,16 @@ static int submit_extent_page(unsigned int opf,
static void attach_extent_buffer_page(struct extent_buffer *eb,
struct page *page)
{
- if (!PagePrivate(page)) {
- SetPagePrivate(page);
- get_page(page);
- set_page_private(page, (unsigned long)eb);
- } else {
+ if (!PagePrivate(page))
+ attach_page_private(page, eb);
+ else
WARN_ON(page->private != (unsigned long)eb);
- }
}
void set_page_extent_mapped(struct page *page)
{
- if (!PagePrivate(page)) {
- SetPagePrivate(page);
- get_page(page);
- set_page_private(page, EXTENT_PAGE_PRIVATE);
- }
+ if (!PagePrivate(page))
+ attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
}
static struct extent_map *
@@ -4390,51 +4384,32 @@ int extent_writepages(struct address_space *mapping,
return ret;
}
-int extent_readpages(struct address_space *mapping, struct list_head *pages,
- unsigned nr_pages)
+void extent_readahead(struct readahead_control *rac)
{
struct bio *bio = NULL;
unsigned long bio_flags = 0;
struct page *pagepool[16];
struct extent_map *em_cached = NULL;
- int nr = 0;
u64 prev_em_start = (u64)-1;
+ int nr;
- while (!list_empty(pages)) {
- u64 contig_end = 0;
-
- for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) {
- struct page *page = lru_to_page(pages);
-
- prefetchw(&page->flags);
- list_del(&page->lru);
- if (add_to_page_cache_lru(page, mapping, page->index,
- readahead_gfp_mask(mapping))) {
- put_page(page);
- break;
- }
-
- pagepool[nr++] = page;
- contig_end = page_offset(page) + PAGE_SIZE - 1;
- }
+ while ((nr = readahead_page_batch(rac, pagepool))) {
+ u64 contig_start = page_offset(pagepool[0]);
+ u64 contig_end = page_offset(pagepool[nr - 1]) + PAGE_SIZE - 1;
- if (nr) {
- u64 contig_start = page_offset(pagepool[0]);
+ ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end);
- ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end);
-
- contiguous_readpages(pagepool, nr, contig_start,
- contig_end, &em_cached, &bio, &bio_flags,
- &prev_em_start);
- }
+ contiguous_readpages(pagepool, nr, contig_start, contig_end,
+ &em_cached, &bio, &bio_flags, &prev_em_start);
}
if (em_cached)
free_extent_map(em_cached);
- if (bio)
- return submit_one_bio(bio, 0, bio_flags);
- return 0;
+ if (bio) {
+ if (submit_one_bio(bio, 0, bio_flags))
+ return;
+ }
}
/*
@@ -4954,10 +4929,7 @@ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
* We need to make sure we haven't be attached
* to a new eb.
*/
- ClearPagePrivate(page);
- set_page_private(page, 0);
- /* One for the page private */
- put_page(page);
+ detach_page_private(page);
}
if (mapped)
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 107838008e27..87f60a48f750 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -202,8 +202,7 @@ int extent_writepages(struct address_space *mapping,
struct writeback_control *wbc);
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc);
-int extent_readpages(struct address_space *mapping, struct list_head *pages,
- unsigned nr_pages);
+void extent_readahead(struct readahead_control *rac);
int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
void set_page_extent_mapped(struct page *page);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index fb95efeb63ed..9ecccb03e0f8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4843,8 +4843,8 @@ static void evict_inode_truncate_pages(struct inode *inode)
/*
* Keep looping until we have no more ranges in the io tree.
- * We can have ongoing bios started by readpages (called from readahead)
- * that have their endio callback (extent_io.c:end_bio_extent_readpage)
+ * We can have ongoing bios started by readahead that have
+ * their endio callback (extent_io.c:end_bio_extent_readpage)
* still in progress (unlocked the pages in the bio but did not yet
* unlocked the ranges in the io tree). Therefore this means some
* ranges can still be locked and eviction started because before
@@ -7039,11 +7039,11 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
* for it to complete) and then invalidate the pages for
* this range (through invalidate_inode_pages2_range()),
* but that can lead us to a deadlock with a concurrent
- * call to readpages() (a buffered read or a defrag call
+ * call to readahead (a buffered read or a defrag call
* triggered a readahead) on a page lock due to an
* ordered dio extent we created before but did not have
* yet a corresponding bio submitted (whence it can not
- * complete), which makes readpages() wait for that
+ * complete), which makes readahead wait for that
* ordered extent to complete while holding a lock on
* that page.
*/
@@ -7815,21 +7815,16 @@ static int btrfs_writepages(struct address_space *mapping,
return extent_writepages(mapping, wbc);
}
-static int
-btrfs_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+static void btrfs_readahead(struct readahead_control *rac)
{
- return extent_readpages(mapping, pages, nr_pages);
+ extent_readahead(rac);
}
static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
{
int ret = try_release_extent_mapping(page, gfp_flags);
- if (ret == 1) {
- ClearPagePrivate(page);
- set_page_private(page, 0);
- put_page(page);
- }
+ if (ret == 1)
+ detach_page_private(page);
return ret;
}
@@ -7851,14 +7846,8 @@ static int btrfs_migratepage(struct address_space *mapping,
if (ret != MIGRATEPAGE_SUCCESS)
return ret;
- if (page_has_private(page)) {
- ClearPagePrivate(page);
- get_page(newpage);
- set_page_private(newpage, page_private(page));
- set_page_private(page, 0);
- put_page(page);
- SetPagePrivate(newpage);
- }
+ if (page_has_private(page))
+ attach_page_private(newpage, detach_page_private(page));
if (PagePrivate2(page)) {
ClearPagePrivate2(page);
@@ -7980,11 +7969,7 @@ again:
}
ClearPageChecked(page);
- if (PagePrivate(page)) {
- ClearPagePrivate(page);
- set_page_private(page, 0);
- put_page(page);
- }
+ detach_page_private(page);
}
/*
@@ -10075,7 +10060,7 @@ static const struct address_space_operations btrfs_aops = {
.readpage = btrfs_readpage,
.writepage = btrfs_writepage,
.writepages = btrfs_writepages,
- .readpages = btrfs_readpages,
+ .readahead = btrfs_readahead,
.direct_IO = noop_direct_IO,
.invalidatepage = btrfs_invalidatepage,
.releasepage = btrfs_releasepage,
diff --git a/fs/buffer.c b/fs/buffer.c
index a60f60396cfa..fc8831c392d7 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -123,14 +123,6 @@ void __wait_on_buffer(struct buffer_head * bh)
}
EXPORT_SYMBOL(__wait_on_buffer);
-static void
-__clear_page_buffers(struct page *page)
-{
- ClearPagePrivate(page);
- set_page_private(page, 0);
- put_page(page);
-}
-
static void buffer_io_error(struct buffer_head *bh, char *msg)
{
if (!test_bit(BH_Quiet, &bh->b_state))
@@ -906,7 +898,7 @@ link_dev_buffers(struct page *page, struct buffer_head *head)
bh = bh->b_this_page;
} while (bh);
tail->b_this_page = head;
- attach_page_buffers(page, head);
+ attach_page_private(page, head);
}
static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
@@ -990,10 +982,20 @@ grow_dev_page(struct block_device *bdev, sector_t block,
end_block = init_page_buffers(page, bdev,
(sector_t)index << sizebits,
size);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x01;
+#endif
goto done;
}
- if (!try_to_free_buffers(page))
+ if (!try_to_free_buffers(page)) {
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x02;
+#endif
goto failed;
+ }
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x04;
+#endif
}
/*
@@ -1013,6 +1015,9 @@ grow_dev_page(struct block_device *bdev, sector_t block,
spin_unlock(&inode->i_mapping->private_lock);
done:
ret = (block < end_block) ? 1 : -ENXIO;
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x08;
+#endif
failed:
unlock_page(page);
put_page(page);
@@ -1068,6 +1073,12 @@ __getblk_slow(struct block_device *bdev, sector_t block,
return NULL;
}
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_stamp = jiffies;
+ current->getblk_executed = 0;
+ current->getblk_bh_count = 0;
+ current->getblk_bh_state = 0;
+#endif
for (;;) {
struct buffer_head *bh;
int ret;
@@ -1079,6 +1090,24 @@ __getblk_slow(struct block_device *bdev, sector_t block,
ret = grow_buffers(bdev, block, size, gfp);
if (ret < 0)
return NULL;
+
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ if (!time_after(jiffies, current->getblk_stamp + 3 * HZ))
+ continue;
+ printk(KERN_ERR "%s(%u): getblk(): executed=%x bh_count=%d bh_state=%lx bdev_super_blocksize=%ld size=%u bdev_super_blocksize_bits=%d bdev_inode_blkbits=%d\n",
+ current->comm, current->pid, current->getblk_executed,
+ current->getblk_bh_count, current->getblk_bh_state,
+ IS_ERR_OR_NULL(bdev->bd_super) ? -1L :
+ bdev->bd_super->s_blocksize, size,
+ IS_ERR_OR_NULL(bdev->bd_super) ? -1 :
+ bdev->bd_super->s_blocksize_bits,
+ IS_ERR_OR_NULL(bdev->bd_inode) ? -1 :
+ bdev->bd_inode->i_blkbits);
+ current->getblk_executed = 0;
+ current->getblk_bh_count = 0;
+ current->getblk_bh_state = 0;
+ current->getblk_stamp = jiffies;
+#endif
}
}
@@ -1154,12 +1183,19 @@ EXPORT_SYMBOL(mark_buffer_dirty);
void mark_buffer_write_io_error(struct buffer_head *bh)
{
+ struct super_block *sb;
+
set_buffer_write_io_error(bh);
/* FIXME: do we need to set this in both places? */
if (bh->b_page && bh->b_page->mapping)
mapping_set_error(bh->b_page->mapping, -EIO);
if (bh->b_assoc_map)
mapping_set_error(bh->b_assoc_map, -EIO);
+ rcu_read_lock();
+ sb = READ_ONCE(bh->b_bdev->bd_super);
+ if (sb)
+ errseq_set(&sb->s_wb_err, -EIO);
+ rcu_read_unlock();
}
EXPORT_SYMBOL(mark_buffer_write_io_error);
@@ -1580,7 +1616,7 @@ void create_empty_buffers(struct page *page,
bh = bh->b_this_page;
} while (bh != head);
}
- attach_page_buffers(page, head);
+ attach_page_private(page, head);
spin_unlock(&page->mapping->private_lock);
}
EXPORT_SYMBOL(create_empty_buffers);
@@ -2567,7 +2603,7 @@ static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
bh->b_this_page = head;
bh = bh->b_this_page;
} while (bh != head);
- attach_page_buffers(page, head);
+ attach_page_private(page, head);
spin_unlock(&page->mapping->private_lock);
}
@@ -3202,6 +3238,11 @@ EXPORT_SYMBOL(sync_dirty_buffer);
*/
static inline int buffer_busy(struct buffer_head *bh)
{
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x80;
+ current->getblk_bh_count = atomic_read(&bh->b_count);
+ current->getblk_bh_state = bh->b_state;
+#endif
return atomic_read(&bh->b_count) |
(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
}
@@ -3227,7 +3268,7 @@ drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
bh = next;
} while (bh != head);
*buffers_to_free = head;
- __clear_page_buffers(page);
+ detach_page_private(page);
return 1;
failed:
return 0;
@@ -3240,11 +3281,18 @@ int try_to_free_buffers(struct page *page)
int ret = 0;
BUG_ON(!PageLocked(page));
- if (PageWriteback(page))
+ if (PageWriteback(page)) {
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x10;
+#endif
return 0;
+ }
if (mapping == NULL) { /* can this still happen? */
ret = drop_buffers(page, &buffers_to_free);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x20;
+#endif
goto out;
}
@@ -3268,6 +3316,9 @@ int try_to_free_buffers(struct page *page)
if (ret)
cancel_dirty_page(page);
spin_unlock(&mapping->private_lock);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x40;
+#endif
out:
if (buffers_to_free) {
struct buffer_head *bh = buffers_to_free;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 75ddce8ef456..17a4f49c34f5 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -4162,7 +4162,7 @@ cifs_readv_complete(struct work_struct *work)
for (i = 0; i < rdata->nr_pages; i++) {
struct page *page = rdata->pages[i];
- lru_cache_add_file(page);
+ lru_cache_add(page);
if (rdata->result == 0 ||
(rdata->result == -EAGAIN && got_bytes)) {
@@ -4232,7 +4232,7 @@ readpages_fill_pages(struct TCP_Server_Info *server,
* fill them until the writes are flushed.
*/
zero_user(page, 0, PAGE_SIZE);
- lru_cache_add_file(page);
+ lru_cache_add(page);
flush_dcache_page(page);
SetPageUptodate(page);
unlock_page(page);
@@ -4242,7 +4242,7 @@ readpages_fill_pages(struct TCP_Server_Info *server,
continue;
} else {
/* no need to hold page hostage */
- lru_cache_add_file(page);
+ lru_cache_add(page);
unlock_page(page);
put_page(page);
rdata->pages[i] = NULL;
@@ -4437,7 +4437,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
/* best to give up if we're out of mem */
list_for_each_entry_safe(page, tpage, &tmplist, lru) {
list_del(&page->lru);
- lru_cache_add_file(page);
+ lru_cache_add(page);
unlock_page(page);
put_page(page);
}
@@ -4475,7 +4475,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
add_credits_and_wake_if(server, &rdata->credits, 0);
for (i = 0; i < rdata->nr_pages; i++) {
page = rdata->pages[i];
- lru_cache_add_file(page);
+ lru_cache_add(page);
unlock_page(page);
put_page(page);
}
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index fc3a8d8064f8..d0542151e8c4 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -280,47 +280,36 @@ static int erofs_raw_access_readpage(struct file *file, struct page *page)
return 0;
}
-static int erofs_raw_access_readpages(struct file *filp,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned int nr_pages)
+static void erofs_raw_access_readahead(struct readahead_control *rac)
{
erofs_off_t last_block;
struct bio *bio = NULL;
- gfp_t gfp = readahead_gfp_mask(mapping);
- struct page *page = list_last_entry(pages, struct page, lru);
-
- trace_erofs_readpages(mapping->host, page, nr_pages, true);
+ struct page *page;
- for (; nr_pages; --nr_pages) {
- page = list_entry(pages->prev, struct page, lru);
+ trace_erofs_readpages(rac->mapping->host, readahead_index(rac),
+ readahead_count(rac), true);
+ while ((page = readahead_page(rac))) {
prefetchw(&page->flags);
- list_del(&page->lru);
- if (!add_to_page_cache_lru(page, mapping, page->index, gfp)) {
- bio = erofs_read_raw_page(bio, mapping, page,
- &last_block, nr_pages, true);
+ bio = erofs_read_raw_page(bio, rac->mapping, page, &last_block,
+ readahead_count(rac), true);
- /* all the page errors are ignored when readahead */
- if (IS_ERR(bio)) {
- pr_err("%s, readahead error at page %lu of nid %llu\n",
- __func__, page->index,
- EROFS_I(mapping->host)->nid);
+ /* all the page errors are ignored when readahead */
+ if (IS_ERR(bio)) {
+ pr_err("%s, readahead error at page %lu of nid %llu\n",
+ __func__, page->index,
+ EROFS_I(rac->mapping->host)->nid);
- bio = NULL;
- }
+ bio = NULL;
}
- /* pages could still be locked */
put_page(page);
}
- DBG_BUGON(!list_empty(pages));
/* the rare case (end in gaps) */
if (bio)
submit_bio(bio);
- return 0;
}
static int erofs_get_block(struct inode *inode, sector_t iblock,
@@ -358,7 +347,7 @@ static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
/* for uncompressed (aligned) files and raw access for other files */
const struct address_space_operations erofs_raw_access_aops = {
.readpage = erofs_raw_access_readpage,
- .readpages = erofs_raw_access_readpages,
+ .readahead = erofs_raw_access_readahead,
.bmap = erofs_bmap,
};
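
The erofs conversion above shows the loop that every filesystem in this series ends up with: the page list and the add_to_page_cache_lru() calls disappear, and pages are pulled one at a time from the readahead_control with readahead_page(), already locked and present in the page cache. A minimal sketch of that shape, with hypothetical myfs_* names standing in for a real filesystem's read path (not part of the patch):

	static void myfs_readahead(struct readahead_control *rac)
	{
		struct inode *inode = rac->mapping->host;
		struct page *page;

		/*
		 * readahead_page() hands back each page already locked and
		 * inserted in the page cache; the filesystem only submits the
		 * I/O (which unlocks the page on completion) and drops the
		 * reference it was handed.
		 */
		while ((page = readahead_page(rac))) {
			myfs_read_one_page(inode, page);	/* hypothetical helper */
			put_page(page);
		}
	}
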
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 5d2d81940679..7628816f2453 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -274,7 +274,7 @@ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq,
i = 0;
while (1) {
- dst = vm_map_ram(rq->out, nrpages_out, -1, PAGE_KERNEL);
+ dst = vm_map_ram(rq->out, nrpages_out, -1);
/* retry two more times (totally 3 times) */
if (dst || ++i >= 3)
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 5086b1218aac..be50a4d9d273 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -1305,28 +1305,23 @@ static bool should_decompress_synchronously(struct erofs_sb_info *sbi,
return nr <= sbi->ctx.max_sync_decompress_pages;
}
-static int z_erofs_readpages(struct file *filp, struct address_space *mapping,
- struct list_head *pages, unsigned int nr_pages)
+static void z_erofs_readahead(struct readahead_control *rac)
{
- struct inode *const inode = mapping->host;
+ struct inode *const inode = rac->mapping->host;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
- bool sync = should_decompress_synchronously(sbi, nr_pages);
+ bool sync = should_decompress_synchronously(sbi, readahead_count(rac));
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
- gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
- struct page *head = NULL;
+ struct page *page, *head = NULL;
LIST_HEAD(pagepool);
- trace_erofs_readpages(mapping->host, lru_to_page(pages),
- nr_pages, false);
+ trace_erofs_readpages(inode, readahead_index(rac),
+ readahead_count(rac), false);
- f.headoffset = (erofs_off_t)lru_to_page(pages)->index << PAGE_SHIFT;
-
- for (; nr_pages; --nr_pages) {
- struct page *page = lru_to_page(pages);
+ f.headoffset = readahead_pos(rac);
+ while ((page = readahead_page(rac))) {
prefetchw(&page->flags);
- list_del(&page->lru);
/*
* A pure asynchronous readahead is indicated if
@@ -1335,11 +1330,6 @@ static int z_erofs_readpages(struct file *filp, struct address_space *mapping,
*/
sync &= !(PageReadahead(page) && !head);
- if (add_to_page_cache_lru(page, mapping, page->index, gfp)) {
- list_add(&page->lru, &pagepool);
- continue;
- }
-
set_page_private(page, (unsigned long)head);
head = page;
}
@@ -1368,11 +1358,10 @@ static int z_erofs_readpages(struct file *filp, struct address_space *mapping,
/* clean up the remaining free pages */
put_pages_list(&pagepool);
- return 0;
}
const struct address_space_operations z_erofs_aops = {
.readpage = z_erofs_readpage,
- .readpages = z_erofs_readpages,
+ .readahead = z_erofs_readahead,
};
diff --git a/fs/exec.c b/fs/exec.c
index a2bdf1d72054..239fd98bdf8a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -139,11 +139,13 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
if (IS_ERR(file))
goto out;
- error = -EINVAL;
+ /*
+ * do_open() has already checked for these, but we can be extra
+ * cautious and check again at the very end too.
+ */
+ error = -EACCES;
if (!S_ISREG(file_inode(file)->i_mode))
goto exit;
-
- error = -EACCES;
if (path_noexec(&file->f_path))
goto exit;
@@ -588,24 +590,48 @@ out:
}
/*
- * Like copy_strings, but get argv and its values from kernel memory.
+ * Copy an argument/environment string from the kernel to the process's stack.
*/
-int copy_strings_kernel(int argc, const char *const *__argv,
- struct linux_binprm *bprm)
+int copy_string_kernel(const char *arg, struct linux_binprm *bprm)
{
- int r;
- mm_segment_t oldfs = get_fs();
- struct user_arg_ptr argv = {
- .ptr.native = (const char __user *const __user *)__argv,
- };
+ int len = strnlen(arg, MAX_ARG_STRLEN) + 1 /* terminating NUL */;
+ unsigned long pos = bprm->p;
- set_fs(KERNEL_DS);
- r = copy_strings(argc, argv, bprm);
- set_fs(oldfs);
+ if (len == 0)
+ return -EFAULT;
+ if (!valid_arg_len(bprm, len))
+ return -E2BIG;
+
+ /* We're going to work our way backwards. */
+ arg += len;
+ bprm->p -= len;
+ if (IS_ENABLED(CONFIG_MMU) && bprm->p < bprm->argmin)
+ return -E2BIG;
- return r;
+ while (len > 0) {
+ unsigned int bytes_to_copy = min_t(unsigned int, len,
+ min_not_zero(offset_in_page(pos), PAGE_SIZE));
+ struct page *page;
+ char *kaddr;
+
+ pos -= bytes_to_copy;
+ arg -= bytes_to_copy;
+ len -= bytes_to_copy;
+
+ page = get_arg_page(bprm, pos, 1);
+ if (!page)
+ return -E2BIG;
+ kaddr = kmap_atomic(page);
+ flush_arg_page(bprm, pos & PAGE_MASK, page);
+ memcpy(kaddr + offset_in_page(pos), arg, bytes_to_copy);
+ flush_kernel_dcache_page(page);
+ kunmap_atomic(kaddr);
+ put_arg_page(page);
+ }
+
+ return 0;
}
-EXPORT_SYMBOL(copy_strings_kernel);
+EXPORT_SYMBOL(copy_string_kernel);
#ifdef CONFIG_MMU
@@ -861,10 +887,13 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
if (IS_ERR(file))
goto out;
+ /*
+ * do_open() has already checked for these, but we can be extra
+ * cautious and check again at the very end too.
+ */
err = -EACCES;
if (!S_ISREG(file_inode(file)->i_mode))
goto exit;
-
if (path_noexec(&file->f_path))
goto exit;
@@ -1845,11 +1874,17 @@ static int __do_execve_file(int fd, struct filename *filename,
check_unsafe_exec(bprm);
current->in_execve = 1;
- if (!file)
+ if (!file) {
file = do_open_execat(fd, filename, flags);
- retval = PTR_ERR(file);
- if (IS_ERR(file))
- goto out_unmark;
+ retval = PTR_ERR(file);
+ if (IS_ERR(file))
+ goto out_unmark;
+ } else {
+ retval = deny_write_access(file);
+ if (retval)
+ goto out_unmark;
+ get_file(file);
+ }
sched_exec();
@@ -1892,7 +1927,7 @@ static int __do_execve_file(int fd, struct filename *filename,
if (retval)
goto out;
- retval = copy_strings_kernel(1, &bprm->filename, bprm);
+ retval = copy_string_kernel(bprm->filename, bprm);
if (retval < 0)
goto out;
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index 3f367d081cd6..a7d58adf560a 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -372,10 +372,9 @@ static int exfat_readpage(struct file *file, struct page *page)
return mpage_readpage(page, exfat_get_block);
}
-static int exfat_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned int nr_pages)
+static void exfat_readahead(struct readahead_control *rac)
{
- return mpage_readpages(mapping, pages, nr_pages, exfat_get_block);
+ mpage_readahead(rac, exfat_get_block);
}
static int exfat_writepage(struct page *page, struct writeback_control *wbc)
@@ -502,7 +501,7 @@ int exfat_block_truncate_page(struct inode *inode, loff_t from)
static const struct address_space_operations exfat_aops = {
.readpage = exfat_readpage,
- .readpages = exfat_readpages,
+ .readahead = exfat_readahead,
.writepage = exfat_writepage,
.writepages = exfat_writepages,
.write_begin = exfat_write_begin,
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 0f12a0e8a8d9..c8b371c82b4f 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -878,11 +878,9 @@ static int ext2_readpage(struct file *file, struct page *page)
return mpage_readpage(page, ext2_get_block);
}
-static int
-ext2_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+static void ext2_readahead(struct readahead_control *rac)
{
- return mpage_readpages(mapping, pages, nr_pages, ext2_get_block);
+ mpage_readahead(rac, ext2_get_block);
}
static int
@@ -968,7 +966,7 @@ ext2_dax_writepages(struct address_space *mapping, struct writeback_control *wbc
const struct address_space_operations ext2_aops = {
.readpage = ext2_readpage,
- .readpages = ext2_readpages,
+ .readahead = ext2_readahead,
.writepage = ext2_writepage,
.write_begin = ext2_write_begin,
.write_end = ext2_write_end,
@@ -982,7 +980,7 @@ const struct address_space_operations ext2_aops = {
const struct address_space_operations ext2_nobh_aops = {
.readpage = ext2_readpage,
- .readpages = ext2_readpages,
+ .readahead = ext2_readahead,
.writepage = ext2_nobh_writepage,
.write_begin = ext2_nobh_write_begin,
.write_end = nobh_write_end,
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index cb07778bca3d..a2f8182ecc5b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3323,9 +3323,8 @@ static inline void ext4_set_de_type(struct super_block *sb,
}
/* readpages.c */
-extern int ext4_mpage_readpages(struct address_space *mapping,
- struct list_head *pages, struct page *page,
- unsigned nr_pages, bool is_readahead);
+extern int ext4_mpage_readpages(struct inode *inode,
+ struct readahead_control *rac, struct page *page);
extern int __init ext4_init_post_read_processing(void);
extern void ext4_exit_post_read_processing(void);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 456e8a6b4809..9c6fc4e7c196 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3230,23 +3230,20 @@ static int ext4_readpage(struct file *file, struct page *page)
ret = ext4_readpage_inline(inode, page);
if (ret == -EAGAIN)
- return ext4_mpage_readpages(page->mapping, NULL, page, 1,
- false);
+ return ext4_mpage_readpages(inode, NULL, page);
return ret;
}
-static int
-ext4_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+static void ext4_readahead(struct readahead_control *rac)
{
- struct inode *inode = mapping->host;
+ struct inode *inode = rac->mapping->host;
- /* If the file has inline data, no need to do readpages. */
+ /* If the file has inline data, no need to do readahead. */
if (ext4_has_inline_data(inode))
- return 0;
+ return;
- return ext4_mpage_readpages(mapping, pages, NULL, nr_pages, true);
+ ext4_mpage_readpages(inode, rac, NULL);
}
static void ext4_invalidatepage(struct page *page, unsigned int offset,
@@ -3611,7 +3608,7 @@ static int ext4_set_page_dirty(struct page *page)
static const struct address_space_operations ext4_aops = {
.readpage = ext4_readpage,
- .readpages = ext4_readpages,
+ .readahead = ext4_readahead,
.writepage = ext4_writepage,
.writepages = ext4_writepages,
.write_begin = ext4_write_begin,
@@ -3628,7 +3625,7 @@ static const struct address_space_operations ext4_aops = {
static const struct address_space_operations ext4_journalled_aops = {
.readpage = ext4_readpage,
- .readpages = ext4_readpages,
+ .readahead = ext4_readahead,
.writepage = ext4_writepage,
.writepages = ext4_writepages,
.write_begin = ext4_write_begin,
@@ -3644,7 +3641,7 @@ static const struct address_space_operations ext4_journalled_aops = {
static const struct address_space_operations ext4_da_aops = {
.readpage = ext4_readpage,
- .readpages = ext4_readpages,
+ .readahead = ext4_readahead,
.writepage = ext4_writepage,
.writepages = ext4_writepages,
.write_begin = ext4_da_write_begin,
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index c1769afbf799..5761e9961682 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -7,8 +7,8 @@
*
* This was originally taken from fs/mpage.c
*
- * The intent is the ext4_mpage_readpages() function here is intended
- * to replace mpage_readpages() in the general case, not just for
+ * The ext4_mpage_readpages() function here is intended to
+ * replace mpage_readahead() in the general case, not just for
* encrypted files. It has some limitations (see below), where it
* will fall back to read_block_full_page(), but these limitations
* should only be hit when page_size != block_size.
@@ -221,14 +221,12 @@ static inline loff_t ext4_readpage_limit(struct inode *inode)
return i_size_read(inode);
}
-int ext4_mpage_readpages(struct address_space *mapping,
- struct list_head *pages, struct page *page,
- unsigned nr_pages, bool is_readahead)
+int ext4_mpage_readpages(struct inode *inode,
+ struct readahead_control *rac, struct page *page)
{
struct bio *bio = NULL;
sector_t last_block_in_bio = 0;
- struct inode *inode = mapping->host;
const unsigned blkbits = inode->i_blkbits;
const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
const unsigned blocksize = 1 << blkbits;
@@ -241,6 +239,7 @@ int ext4_mpage_readpages(struct address_space *mapping,
int length;
unsigned relative_block = 0;
struct ext4_map_blocks map;
+ unsigned int nr_pages = rac ? readahead_count(rac) : 1;
map.m_pblk = 0;
map.m_lblk = 0;
@@ -251,14 +250,9 @@ int ext4_mpage_readpages(struct address_space *mapping,
int fully_mapped = 1;
unsigned first_hole = blocks_per_page;
- if (pages) {
- page = lru_to_page(pages);
-
+ if (rac) {
+ page = readahead_page(rac);
prefetchw(&page->flags);
- list_del(&page->lru);
- if (add_to_page_cache_lru(page, mapping, page->index,
- readahead_gfp_mask(mapping)))
- goto next_page;
}
if (page_has_buffers(page))
@@ -381,7 +375,7 @@ int ext4_mpage_readpages(struct address_space *mapping,
bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
bio->bi_end_io = mpage_end_io;
bio_set_op_attrs(bio, REQ_OP_READ,
- is_readahead ? REQ_RAHEAD : 0);
+ rac ? REQ_RAHEAD : 0);
}
length = first_hole << blkbits;
@@ -406,10 +400,9 @@ int ext4_mpage_readpages(struct address_space *mapping,
else
unlock_page(page);
next_page:
- if (pages)
+ if (rac)
put_page(page);
}
- BUG_ON(pages && !list_empty(pages));
if (bio)
submit_bio(bio);
return 0;
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index dc5ec724d889..dec1244dd062 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -342,37 +342,6 @@ static int ext4_get_verity_descriptor(struct inode *inode, void *buf,
return desc_size;
}
-/*
- * Prefetch some pages from the file's Merkle tree.
- *
- * This is basically a stripped-down version of __do_page_cache_readahead()
- * which works on pages past i_size.
- */
-static void ext4_merkle_tree_readahead(struct address_space *mapping,
- pgoff_t start_index, unsigned long count)
-{
- LIST_HEAD(pages);
- unsigned int nr_pages = 0;
- struct page *page;
- pgoff_t index;
- struct blk_plug plug;
-
- for (index = start_index; index < start_index + count; index++) {
- page = xa_load(&mapping->i_pages, index);
- if (!page || xa_is_value(page)) {
- page = __page_cache_alloc(readahead_gfp_mask(mapping));
- if (!page)
- break;
- page->index = index;
- list_add(&page->lru, &pages);
- nr_pages++;
- }
- }
- blk_start_plug(&plug);
- ext4_mpage_readpages(mapping, &pages, NULL, nr_pages, true);
- blk_finish_plug(&plug);
-}
-
static struct page *ext4_read_merkle_tree_page(struct inode *inode,
pgoff_t index,
unsigned long num_ra_pages)
@@ -386,8 +355,8 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode,
if (page)
put_page(page);
else if (num_ra_pages > 1)
- ext4_merkle_tree_readahead(inode->i_mapping, index,
- num_ra_pages);
+ page_cache_readahead_unbounded(inode->i_mapping, NULL,
+ index, num_ra_pages, 0);
page = read_mapping_page(inode->i_mapping, index, NULL);
}
return page;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index cea43caf23a6..cb05f71cf850 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -2263,13 +2263,11 @@ out:
* use ->readpage() or do the necessary surgery to decouple ->readpages()
* from read-ahead.
*/
-int f2fs_mpage_readpages(struct address_space *mapping,
- struct list_head *pages, struct page *page,
- unsigned nr_pages, bool is_readahead)
+static int f2fs_mpage_readpages(struct inode *inode,
+ struct readahead_control *rac, struct page *page)
{
struct bio *bio = NULL;
sector_t last_block_in_bio = 0;
- struct inode *inode = mapping->host;
struct f2fs_map_blocks map;
#ifdef CONFIG_F2FS_FS_COMPRESSION
struct compress_ctx cc = {
@@ -2283,6 +2281,7 @@ int f2fs_mpage_readpages(struct address_space *mapping,
.nr_cpages = 0,
};
#endif
+ unsigned nr_pages = rac ? readahead_count(rac) : 1;
unsigned max_nr_pages = nr_pages;
int ret = 0;
@@ -2296,15 +2295,9 @@ int f2fs_mpage_readpages(struct address_space *mapping,
map.m_may_create = false;
for (; nr_pages; nr_pages--) {
- if (pages) {
- page = list_last_entry(pages, struct page, lru);
-
+ if (rac) {
+ page = readahead_page(rac);
prefetchw(&page->flags);
- list_del(&page->lru);
- if (add_to_page_cache_lru(page, mapping,
- page_index(page),
- readahead_gfp_mask(mapping)))
- goto next_page;
}
#ifdef CONFIG_F2FS_FS_COMPRESSION
@@ -2314,7 +2307,7 @@ int f2fs_mpage_readpages(struct address_space *mapping,
ret = f2fs_read_multi_pages(&cc, &bio,
max_nr_pages,
&last_block_in_bio,
- is_readahead, false);
+ rac != NULL, false);
f2fs_destroy_compress_ctx(&cc);
if (ret)
goto set_error_page;
@@ -2337,7 +2330,7 @@ read_single_page:
#endif
ret = f2fs_read_single_page(inode, page, max_nr_pages, &map,
- &bio, &last_block_in_bio, is_readahead);
+ &bio, &last_block_in_bio, rac);
if (ret) {
#ifdef CONFIG_F2FS_FS_COMPRESSION
set_error_page:
@@ -2346,8 +2339,10 @@ set_error_page:
zero_user_segment(page, 0, PAGE_SIZE);
unlock_page(page);
}
+#ifdef CONFIG_F2FS_FS_COMPRESSION
next_page:
- if (pages)
+#endif
+ if (rac)
put_page(page);
#ifdef CONFIG_F2FS_FS_COMPRESSION
@@ -2357,16 +2352,15 @@ next_page:
ret = f2fs_read_multi_pages(&cc, &bio,
max_nr_pages,
&last_block_in_bio,
- is_readahead, false);
+ rac != NULL, false);
f2fs_destroy_compress_ctx(&cc);
}
}
#endif
}
- BUG_ON(pages && !list_empty(pages));
if (bio)
__submit_bio(F2FS_I_SB(inode), bio, DATA);
- return pages ? 0 : ret;
+ return ret;
}
static int f2fs_read_data_page(struct file *file, struct page *page)
@@ -2385,28 +2379,24 @@ static int f2fs_read_data_page(struct file *file, struct page *page)
if (f2fs_has_inline_data(inode))
ret = f2fs_read_inline_data(inode, page);
if (ret == -EAGAIN)
- ret = f2fs_mpage_readpages(page_file_mapping(page),
- NULL, page, 1, false);
+ ret = f2fs_mpage_readpages(inode, NULL, page);
return ret;
}
-static int f2fs_read_data_pages(struct file *file,
- struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+static void f2fs_readahead(struct readahead_control *rac)
{
- struct inode *inode = mapping->host;
- struct page *page = list_last_entry(pages, struct page, lru);
+ struct inode *inode = rac->mapping->host;
- trace_f2fs_readpages(inode, page, nr_pages);
+ trace_f2fs_readpages(inode, readahead_index(rac), readahead_count(rac));
if (!f2fs_is_compress_backend_ready(inode))
- return 0;
+ return;
/* If the file has inline data, skip readpages */
if (f2fs_has_inline_data(inode))
- return 0;
+ return;
- return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages, true);
+ f2fs_mpage_readpages(inode, rac, NULL);
}
int f2fs_encrypt_one_page(struct f2fs_io_info *fio)
@@ -3928,7 +3918,7 @@ static void f2fs_swap_deactivate(struct file *file)
const struct address_space_operations f2fs_dblock_aops = {
.readpage = f2fs_read_data_page,
- .readpages = f2fs_read_data_pages,
+ .readahead = f2fs_readahead,
.writepage = f2fs_write_data_page,
.writepages = f2fs_write_data_pages,
.write_begin = f2fs_write_begin,
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index c86e4963aeb5..c6c6a28832fd 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -3129,19 +3129,12 @@ static inline void f2fs_set_page_private(struct page *page,
if (PagePrivate(page))
return;
- get_page(page);
- SetPagePrivate(page);
- set_page_private(page, data);
+ attach_page_private(page, (void *)data);
}
static inline void f2fs_clear_page_private(struct page *page)
{
- if (!PagePrivate(page))
- return;
-
- set_page_private(page, 0);
- ClearPagePrivate(page);
- f2fs_put_page(page, 0);
+ detach_page_private(page);
}
/*
@@ -3452,9 +3445,6 @@ int f2fs_reserve_new_block(struct dnode_of_data *dn);
int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index);
int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from);
int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index);
-int f2fs_mpage_readpages(struct address_space *mapping,
- struct list_head *pages, struct page *page,
- unsigned nr_pages, bool is_readahead);
struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
int op_flags, bool for_write);
struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index);
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index d7d430a6f130..865c9fb774fb 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -222,37 +222,6 @@ static int f2fs_get_verity_descriptor(struct inode *inode, void *buf,
return size;
}
-/*
- * Prefetch some pages from the file's Merkle tree.
- *
- * This is basically a stripped-down version of __do_page_cache_readahead()
- * which works on pages past i_size.
- */
-static void f2fs_merkle_tree_readahead(struct address_space *mapping,
- pgoff_t start_index, unsigned long count)
-{
- LIST_HEAD(pages);
- unsigned int nr_pages = 0;
- struct page *page;
- pgoff_t index;
- struct blk_plug plug;
-
- for (index = start_index; index < start_index + count; index++) {
- page = xa_load(&mapping->i_pages, index);
- if (!page || xa_is_value(page)) {
- page = __page_cache_alloc(readahead_gfp_mask(mapping));
- if (!page)
- break;
- page->index = index;
- list_add(&page->lru, &pages);
- nr_pages++;
- }
- }
- blk_start_plug(&plug);
- f2fs_mpage_readpages(mapping, &pages, NULL, nr_pages, true);
- blk_finish_plug(&plug);
-}
-
static struct page *f2fs_read_merkle_tree_page(struct inode *inode,
pgoff_t index,
unsigned long num_ra_pages)
@@ -266,8 +235,8 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode,
if (page)
put_page(page);
else if (num_ra_pages > 1)
- f2fs_merkle_tree_readahead(inode->i_mapping, index,
- num_ra_pages);
+ page_cache_readahead_unbounded(inode->i_mapping, NULL,
+ index, num_ra_pages, 0);
page = read_mapping_page(inode->i_mapping, index, NULL);
}
return page;
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 3647c65a0f48..bbfe18c07417 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -632,20 +632,80 @@ error:
}
EXPORT_SYMBOL_GPL(fat_free_clusters);
-/* 128kb is the whole sectors for FAT12 and FAT16 */
-#define FAT_READA_SIZE (128 * 1024)
+struct fatent_ra {
+ sector_t cur;
+ sector_t limit;
+
+ unsigned int ra_blocks;
+ sector_t ra_advance;
+ sector_t ra_next;
+ sector_t ra_limit;
+};
-static void fat_ent_reada(struct super_block *sb, struct fat_entry *fatent,
- unsigned long reada_blocks)
+static void fat_ra_init(struct super_block *sb, struct fatent_ra *ra,
+ struct fat_entry *fatent, int ent_limit)
{
- const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
- sector_t blocknr;
- int i, offset;
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ const struct fatent_operations *ops = sbi->fatent_ops;
+ sector_t blocknr, block_end;
+ int offset;
+ /*
+ * This is a sequential read, so use ra_pages * 2 (but try to
+ * align to the optimal hardware IO size).
+ * [BTW, 128kb covers all of the FAT sectors for FAT12 and FAT16]
+ */
+ unsigned long ra_pages = sb->s_bdi->ra_pages;
+ unsigned int reada_blocks;
+ if (ra_pages > sb->s_bdi->io_pages)
+ ra_pages = rounddown(ra_pages, sb->s_bdi->io_pages);
+ reada_blocks = ra_pages << (PAGE_SHIFT - sb->s_blocksize_bits + 1);
+
+ /* Initialize the range for sequential read */
ops->ent_blocknr(sb, fatent->entry, &offset, &blocknr);
+ ops->ent_blocknr(sb, ent_limit - 1, &offset, &block_end);
+ ra->cur = 0;
+ ra->limit = (block_end + 1) - blocknr;
- for (i = 0; i < reada_blocks; i++)
- sb_breadahead(sb, blocknr + i);
+ /* Advancing the window at half size */
+ ra->ra_blocks = reada_blocks >> 1;
+ ra->ra_advance = ra->cur;
+ ra->ra_next = ra->cur;
+ ra->ra_limit = ra->cur + min_t(sector_t, reada_blocks, ra->limit);
+}
+
+/* Assumed to be called before reading a new block (increments ->cur). */
+static void fat_ent_reada(struct super_block *sb, struct fatent_ra *ra,
+ struct fat_entry *fatent)
+{
+ if (ra->ra_next >= ra->ra_limit)
+ return;
+
+ if (ra->cur >= ra->ra_advance) {
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ const struct fatent_operations *ops = sbi->fatent_ops;
+ struct blk_plug plug;
+ sector_t blocknr, diff;
+ int offset;
+
+ ops->ent_blocknr(sb, fatent->entry, &offset, &blocknr);
+
+ diff = blocknr - ra->cur;
+ blk_start_plug(&plug);
+ /*
+ * FIXME: we would want to directly use the bio with
+ * pages to reduce the number of segments.
+ */
+ for (; ra->ra_next < ra->ra_limit; ra->ra_next++)
+ sb_breadahead(sb, ra->ra_next + diff);
+ blk_finish_plug(&plug);
+
+ /* Advance the readahead window */
+ ra->ra_advance += ra->ra_blocks;
+ ra->ra_limit += min_t(sector_t,
+ ra->ra_blocks, ra->limit - ra->ra_limit);
+ }
+ ra->cur++;
}
int fat_count_free_clusters(struct super_block *sb)
@@ -653,27 +713,20 @@ int fat_count_free_clusters(struct super_block *sb)
struct msdos_sb_info *sbi = MSDOS_SB(sb);
const struct fatent_operations *ops = sbi->fatent_ops;
struct fat_entry fatent;
- unsigned long reada_blocks, reada_mask, cur_block;
+ struct fatent_ra fatent_ra;
int err = 0, free;
lock_fat(sbi);
if (sbi->free_clusters != -1 && sbi->free_clus_valid)
goto out;
- reada_blocks = FAT_READA_SIZE >> sb->s_blocksize_bits;
- reada_mask = reada_blocks - 1;
- cur_block = 0;
-
free = 0;
fatent_init(&fatent);
fatent_set_entry(&fatent, FAT_START_ENT);
+ fat_ra_init(sb, &fatent_ra, &fatent, sbi->max_cluster);
while (fatent.entry < sbi->max_cluster) {
/* readahead of fat blocks */
- if ((cur_block & reada_mask) == 0) {
- unsigned long rest = sbi->fat_length - cur_block;
- fat_ent_reada(sb, &fatent, min(reada_blocks, rest));
- }
- cur_block++;
+ fat_ent_reada(sb, &fatent_ra, &fatent);
err = fat_ent_read_block(sb, &fatent);
if (err)
@@ -707,9 +760,9 @@ int fat_trim_fs(struct inode *inode, struct fstrim_range *range)
struct msdos_sb_info *sbi = MSDOS_SB(sb);
const struct fatent_operations *ops = sbi->fatent_ops;
struct fat_entry fatent;
+ struct fatent_ra fatent_ra;
u64 ent_start, ent_end, minlen, trimmed = 0;
u32 free = 0;
- unsigned long reada_blocks, reada_mask, cur_block = 0;
int err = 0;
/*
@@ -727,19 +780,13 @@ int fat_trim_fs(struct inode *inode, struct fstrim_range *range)
if (ent_end >= sbi->max_cluster)
ent_end = sbi->max_cluster - 1;
- reada_blocks = FAT_READA_SIZE >> sb->s_blocksize_bits;
- reada_mask = reada_blocks - 1;
-
fatent_init(&fatent);
lock_fat(sbi);
fatent_set_entry(&fatent, ent_start);
+ fat_ra_init(sb, &fatent_ra, &fatent, ent_end + 1);
while (fatent.entry <= ent_end) {
/* readahead of fat blocks */
- if ((cur_block & reada_mask) == 0) {
- unsigned long rest = sbi->fat_length - cur_block;
- fat_ent_reada(sb, &fatent, min(reada_blocks, rest));
- }
- cur_block++;
+ fat_ent_reada(sb, &fatent_ra, &fatent);
err = fat_ent_read_block(sb, &fatent);
if (err)
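
To make the window sizing in fat_ra_init() above concrete, assume the common case of 4KiB pages, 512-byte FAT blocks (sb->s_blocksize_bits == 9) and the default bdi readahead of 32 pages; these numbers are illustrative assumptions, not part of the patch:

	reada_blocks = ra_pages << (PAGE_SHIFT - s_blocksize_bits + 1)
	             = 32 << (12 - 9 + 1) = 32 << 4 = 512 blocks = 256KiB

which is the "ra_pages * 2" the comment refers to (32 pages = 128KiB of readahead, doubled in FAT blocks). The window then advances in half-window steps, ra_blocks = reada_blocks >> 1 = 256 blocks, so a fresh batch of sb_breadahead() calls is issued roughly every 128KiB of FAT traversal.
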
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 71946da84388..a0cf99debb1e 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -210,10 +210,9 @@ static int fat_readpage(struct file *file, struct page *page)
return mpage_readpage(page, fat_get_block);
}
-static int fat_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+static void fat_readahead(struct readahead_control *rac)
{
- return mpage_readpages(mapping, pages, nr_pages, fat_get_block);
+ mpage_readahead(rac, fat_get_block);
}
static void fat_write_failed(struct address_space *mapping, loff_t to)
@@ -344,7 +343,7 @@ int fat_block_truncate_page(struct inode *inode, loff_t from)
static const struct address_space_operations fat_aops = {
.readpage = fat_readpage,
- .readpages = fat_readpages,
+ .readahead = fat_readahead,
.writepage = fat_writepage,
.writepages = fat_writepages,
.write_begin = fat_write_begin,
@@ -1520,6 +1519,12 @@ static int fat_read_bpb(struct super_block *sb, struct fat_boot_sector *b,
goto out;
}
+ if (bpb->fat_fat_length == 0 && bpb->fat32_length == 0) {
+ if (!silent)
+ fat_msg(sb, KERN_ERR, "bogus number of FAT sectors");
+ goto out;
+ }
+
error = 0;
out:
diff --git a/fs/file_table.c b/fs/file_table.c
index 3b612535391f..656647f9575a 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -198,6 +198,7 @@ static struct file *alloc_file(const struct path *path, int flags,
file->f_inode = path->dentry->d_inode;
file->f_mapping = path->dentry->d_inode->i_mapping;
file->f_wb_err = filemap_sample_wb_err(file->f_mapping);
+ file->f_sb_err = file_sample_sb_err(file);
if ((file->f_mode & FMODE_READ) &&
likely(fop->read || fop->read_iter))
file->f_mode |= FMODE_CAN_READ;
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 10eb35fd473f..02b3c36b3676 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -839,7 +839,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
get_page(newpage);
if (!(buf->flags & PIPE_BUF_FLAG_LRU))
- lru_cache_add_file(newpage);
+ lru_cache_add(newpage);
err = 0;
spin_lock(&cs->req->waitq.lock);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 336d1cf72da0..e573b0cd2737 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -931,84 +931,40 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
fuse_readpages_end(fc, &ap->args, err);
}
-struct fuse_fill_data {
- struct fuse_io_args *ia;
- struct file *file;
- struct inode *inode;
- unsigned int nr_pages;
- unsigned int max_pages;
-};
-
-static int fuse_readpages_fill(void *_data, struct page *page)
+static void fuse_readahead(struct readahead_control *rac)
{
- struct fuse_fill_data *data = _data;
- struct fuse_io_args *ia = data->ia;
- struct fuse_args_pages *ap = &ia->ap;
- struct inode *inode = data->inode;
+ struct inode *inode = rac->mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
+ unsigned int i, max_pages, nr_pages = 0;
- fuse_wait_on_page_writeback(inode, page->index);
-
- if (ap->num_pages &&
- (ap->num_pages == fc->max_pages ||
- (ap->num_pages + 1) * PAGE_SIZE > fc->max_read ||
- ap->pages[ap->num_pages - 1]->index + 1 != page->index)) {
- data->max_pages = min_t(unsigned int, data->nr_pages,
- fc->max_pages);
- fuse_send_readpages(ia, data->file);
- data->ia = ia = fuse_io_alloc(NULL, data->max_pages);
- if (!ia) {
- unlock_page(page);
- return -ENOMEM;
- }
- ap = &ia->ap;
- }
-
- if (WARN_ON(ap->num_pages >= data->max_pages)) {
- unlock_page(page);
- fuse_io_free(ia);
- return -EIO;
- }
-
- get_page(page);
- ap->pages[ap->num_pages] = page;
- ap->descs[ap->num_pages].length = PAGE_SIZE;
- ap->num_pages++;
- data->nr_pages--;
- return 0;
-}
-
-static int fuse_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
-{
- struct inode *inode = mapping->host;
- struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_fill_data data;
- int err;
-
- err = -EIO;
if (is_bad_inode(inode))
- goto out;
+ return;
- data.file = file;
- data.inode = inode;
- data.nr_pages = nr_pages;
- data.max_pages = min_t(unsigned int, nr_pages, fc->max_pages);
-;
- data.ia = fuse_io_alloc(NULL, data.max_pages);
- err = -ENOMEM;
- if (!data.ia)
- goto out;
+ max_pages = min_t(unsigned int, fc->max_pages,
+ fc->max_read / PAGE_SIZE);
- err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data);
- if (!err) {
- if (data.ia->ap.num_pages)
- fuse_send_readpages(data.ia, file);
- else
- fuse_io_free(data.ia);
+ for (;;) {
+ struct fuse_io_args *ia;
+ struct fuse_args_pages *ap;
+
+ nr_pages = readahead_count(rac) - nr_pages;
+ if (nr_pages > max_pages)
+ nr_pages = max_pages;
+ if (nr_pages == 0)
+ break;
+ ia = fuse_io_alloc(NULL, nr_pages);
+ if (!ia)
+ return;
+ ap = &ia->ap;
+ nr_pages = __readahead_batch(rac, ap->pages, nr_pages);
+ for (i = 0; i < nr_pages; i++) {
+ fuse_wait_on_page_writeback(inode,
+ readahead_index(rac) + i);
+ ap->descs[i].length = PAGE_SIZE;
+ }
+ ap->num_pages = nr_pages;
+ fuse_send_readpages(ia, rac->file);
}
-out:
- return err;
}
static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
@@ -3437,10 +3393,10 @@ static const struct file_operations fuse_file_operations = {
static const struct address_space_operations fuse_file_aops = {
.readpage = fuse_readpage,
+ .readahead = fuse_readahead,
.writepage = fuse_writepage,
.writepages = fuse_writepages,
.launder_page = fuse_launder_page,
- .readpages = fuse_readpages,
.set_page_dirty = __set_page_dirty_nobuffers,
.bmap = fuse_bmap,
.direct_IO = fuse_direct_IO,
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 786c1ce8f030..72c9560f4467 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -577,7 +577,7 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos,
}
/**
- * gfs2_readpages - Read a bunch of pages at once
+ * gfs2_readahead - Read a bunch of pages at once
* @file: The file to read from
* @mapping: Address space info
* @pages: List of pages to read
@@ -590,31 +590,24 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos,
* obviously not something we'd want to do on too regular a basis.
* Any I/O we ignore at this time will be done via readpage later.
* 2. We don't handle stuffed files here we let readpage do the honours.
- * 3. mpage_readpages() does most of the heavy lifting in the common case.
+ * 3. mpage_readahead() does most of the heavy lifting in the common case.
* 4. gfs2_block_map() is relied upon to set BH_Boundary in the right places.
*/
-static int gfs2_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+static void gfs2_readahead(struct readahead_control *rac)
{
- struct inode *inode = mapping->host;
+ struct inode *inode = rac->mapping->host;
struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_sbd *sdp = GFS2_SB(inode);
struct gfs2_holder gh;
- int ret;
gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
- ret = gfs2_glock_nq(&gh);
- if (unlikely(ret))
+ if (gfs2_glock_nq(&gh))
goto out_uninit;
if (!gfs2_is_stuffed(ip))
- ret = mpage_readpages(mapping, pages, nr_pages, gfs2_block_map);
+ mpage_readahead(rac, gfs2_block_map);
gfs2_glock_dq(&gh);
out_uninit:
gfs2_holder_uninit(&gh);
- if (unlikely(gfs2_withdrawn(sdp)))
- ret = -EIO;
- return ret;
}
/**
@@ -833,7 +826,7 @@ static const struct address_space_operations gfs2_aops = {
.writepage = gfs2_writepage,
.writepages = gfs2_writepages,
.readpage = gfs2_readpage,
- .readpages = gfs2_readpages,
+ .readahead = gfs2_readahead,
.bmap = gfs2_bmap,
.invalidatepage = gfs2_invalidatepage,
.releasepage = gfs2_releasepage,
@@ -847,7 +840,7 @@ static const struct address_space_operations gfs2_jdata_aops = {
.writepage = gfs2_jdata_writepage,
.writepages = gfs2_jdata_writepages,
.readpage = gfs2_readpage,
- .readpages = gfs2_readpages,
+ .readahead = gfs2_readahead,
.set_page_dirty = jdata_set_page_dirty,
.bmap = gfs2_bmap,
.invalidatepage = gfs2_invalidatepage,
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index c3f7732415be..c0f2875c946c 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -354,7 +354,7 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
hc = kmalloc(hsize, GFP_NOFS | __GFP_NOWARN);
if (hc == NULL)
- hc = __vmalloc(hsize, GFP_NOFS, PAGE_KERNEL);
+ hc = __vmalloc(hsize, GFP_NOFS);
if (hc == NULL)
return ERR_PTR(-ENOMEM);
@@ -1166,7 +1166,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
hc2 = kmalloc_array(hsize_bytes, 2, GFP_NOFS | __GFP_NOWARN);
if (hc2 == NULL)
- hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS, PAGE_KERNEL);
+ hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS);
if (!hc2)
return -ENOMEM;
@@ -1327,7 +1327,7 @@ static void *gfs2_alloc_sort_buffer(unsigned size)
if (size < KMALLOC_MAX_SIZE)
ptr = kmalloc(size, GFP_NOFS | __GFP_NOWARN);
if (!ptr)
- ptr = __vmalloc(size, GFP_NOFS, PAGE_KERNEL);
+ ptr = __vmalloc(size, GFP_NOFS);
return ptr;
}
@@ -1987,8 +1987,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
ht = kzalloc(size, GFP_NOFS | __GFP_NOWARN);
if (ht == NULL)
- ht = __vmalloc(size, GFP_NOFS | __GFP_NOWARN | __GFP_ZERO,
- PAGE_KERNEL);
+ ht = __vmalloc(size, GFP_NOFS | __GFP_NOWARN | __GFP_ZERO);
if (!ht)
return -ENOMEM;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 8259fef3f986..4b67d47a7e00 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1365,7 +1365,7 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
sdp->sd_quota_bitmap = kzalloc(bm_size, GFP_NOFS | __GFP_NOWARN);
if (sdp->sd_quota_bitmap == NULL)
sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS |
- __GFP_ZERO, PAGE_KERNEL);
+ __GFP_ZERO);
if (!sdp->sd_quota_bitmap)
return error;
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 62959a8e43ad..077c25128eb7 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -126,10 +126,9 @@ static int hpfs_writepage(struct page *page, struct writeback_control *wbc)
return block_write_full_page(page, hpfs_get_block, wbc);
}
-static int hpfs_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+static void hpfs_readahead(struct readahead_control *rac)
{
- return mpage_readpages(mapping, pages, nr_pages, hpfs_get_block);
+ mpage_readahead(rac, hpfs_get_block);
}
static int hpfs_writepages(struct address_space *mapping,
@@ -199,7 +198,7 @@ static int hpfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
const struct address_space_operations hpfs_aops = {
.readpage = hpfs_readpage,
.writepage = hpfs_writepage,
- .readpages = hpfs_readpages,
+ .readahead = hpfs_readahead,
.writepages = hpfs_writepages,
.write_begin = hpfs_write_begin,
.write_end = hpfs_write_end,
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 991c60c7ffe0..f3420a643b4f 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -38,6 +38,7 @@
#include <linux/uio.h>
#include <linux/uaccess.h>
+#include <linux/sched/mm.h>
static const struct super_operations hugetlbfs_ops;
static const struct address_space_operations hugetlbfs_aops;
@@ -191,13 +192,60 @@ out:
#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
static unsigned long
+hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+ struct hstate *h = hstate_file(file);
+ struct vm_unmapped_area_info info;
+
+ info.flags = 0;
+ info.length = len;
+ info.low_limit = current->mm->mmap_base;
+ info.high_limit = TASK_SIZE;
+ info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+ info.align_offset = 0;
+ return vm_unmapped_area(&info);
+}
+
+static unsigned long
+hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+ struct hstate *h = hstate_file(file);
+ struct vm_unmapped_area_info info;
+
+ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ info.length = len;
+ info.low_limit = max(PAGE_SIZE, mmap_min_addr);
+ info.high_limit = current->mm->mmap_base;
+ info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+ info.align_offset = 0;
+ addr = vm_unmapped_area(&info);
+
+ /*
+ * A failed mmap() very likely causes application failure,
+ * so fall back to the bottom-up function here. This scenario
+ * can happen with large stack limits and large mmap()
+ * allocations.
+ */
+ if (unlikely(offset_in_page(addr))) {
+ VM_BUG_ON(addr != -ENOMEM);
+ info.flags = 0;
+ info.low_limit = current->mm->mmap_base;
+ info.high_limit = TASK_SIZE;
+ addr = vm_unmapped_area(&info);
+ }
+
+ return addr;
+}
+
+static unsigned long
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
struct hstate *h = hstate_file(file);
- struct vm_unmapped_area_info info;
if (len & ~huge_page_mask(h))
return -EINVAL;
@@ -218,13 +266,16 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
return addr;
}
- info.flags = 0;
- info.length = len;
- info.low_limit = TASK_UNMAPPED_BASE;
- info.high_limit = TASK_SIZE;
- info.align_mask = PAGE_MASK & ~huge_page_mask(h);
- info.align_offset = 0;
- return vm_unmapped_area(&info);
+ /*
+ * Use the mm->get_unmapped_area value as a hint to use the topdown routine.
+ * If architectures have special needs, they should define their own
+ * version of hugetlb_get_unmapped_area.
+ */
+ if (mm->get_unmapped_area == arch_get_unmapped_area_topdown)
+ return hugetlb_get_unmapped_area_topdown(file, addr, len,
+ pgoff, flags);
+ return hugetlb_get_unmapped_area_bottomup(file, addr, len,
+ pgoff, flags);
}
#endif
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 89e21961d1ad..a1ed7620fbac 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -59,24 +59,19 @@ iomap_page_create(struct inode *inode, struct page *page)
* migrate_page_move_mapping() assumes that pages with private data have
* their count elevated by 1.
*/
- get_page(page);
- set_page_private(page, (unsigned long)iop);
- SetPagePrivate(page);
+ attach_page_private(page, iop);
return iop;
}
static void
iomap_page_release(struct page *page)
{
- struct iomap_page *iop = to_iomap_page(page);
+ struct iomap_page *iop = detach_page_private(page);
if (!iop)
return;
WARN_ON_ONCE(atomic_read(&iop->read_count));
WARN_ON_ONCE(atomic_read(&iop->write_count));
- ClearPagePrivate(page);
- set_page_private(page, 0);
- put_page(page);
kfree(iop);
}
@@ -214,9 +209,8 @@ iomap_read_end_io(struct bio *bio)
struct iomap_readpage_ctx {
struct page *cur_page;
bool cur_page_in_bio;
- bool is_readahead;
struct bio *bio;
- struct list_head *pages;
+ struct readahead_control *rac;
};
static void
@@ -308,7 +302,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
if (ctx->bio)
submit_bio(ctx->bio);
- if (ctx->is_readahead) /* same as readahead_gfp_mask */
+ if (ctx->rac) /* same as readahead_gfp_mask */
gfp |= __GFP_NORETRY | __GFP_NOWARN;
ctx->bio = bio_alloc(gfp, min(BIO_MAX_PAGES, nr_vecs));
/*
@@ -319,7 +313,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
if (!ctx->bio)
ctx->bio = bio_alloc(orig_gfp, 1);
ctx->bio->bi_opf = REQ_OP_READ;
- if (ctx->is_readahead)
+ if (ctx->rac)
ctx->bio->bi_opf |= REQ_RAHEAD;
ctx->bio->bi_iter.bi_sector = sector;
bio_set_dev(ctx->bio, iomap->bdev);
@@ -367,7 +361,7 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops)
}
/*
- * Just like mpage_readpages and block_read_full_page we always
+ * Just like mpage_readahead and block_read_full_page we always
* return 0 and just mark the page as PageError on errors. This
* should be cleaned up all through the stack eventually.
*/
@@ -375,36 +369,8 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops)
}
EXPORT_SYMBOL_GPL(iomap_readpage);
-static struct page *
-iomap_next_page(struct inode *inode, struct list_head *pages, loff_t pos,
- loff_t length, loff_t *done)
-{
- while (!list_empty(pages)) {
- struct page *page = lru_to_page(pages);
-
- if (page_offset(page) >= (u64)pos + length)
- break;
-
- list_del(&page->lru);
- if (!add_to_page_cache_lru(page, inode->i_mapping, page->index,
- GFP_NOFS))
- return page;
-
- /*
- * If we already have a page in the page cache at index we are
- * done. Upper layers don't care if it is uptodate after the
- * readpages call itself as every page gets checked again once
- * actually needed.
- */
- *done += PAGE_SIZE;
- put_page(page);
- }
-
- return NULL;
-}
-
static loff_t
-iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
+iomap_readahead_actor(struct inode *inode, loff_t pos, loff_t length,
void *data, struct iomap *iomap, struct iomap *srcmap)
{
struct iomap_readpage_ctx *ctx = data;
@@ -418,10 +384,7 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
ctx->cur_page = NULL;
}
if (!ctx->cur_page) {
- ctx->cur_page = iomap_next_page(inode, ctx->pages,
- pos, length, &done);
- if (!ctx->cur_page)
- break;
+ ctx->cur_page = readahead_page(ctx->rac);
ctx->cur_page_in_bio = false;
}
ret = iomap_readpage_actor(inode, pos + done, length - done,
@@ -431,32 +394,43 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
return done;
}
-int
-iomap_readpages(struct address_space *mapping, struct list_head *pages,
- unsigned nr_pages, const struct iomap_ops *ops)
+/**
+ * iomap_readahead - Attempt to read pages from a file.
+ * @rac: Describes the pages to be read.
+ * @ops: The operations vector for the filesystem.
+ *
+ * This function is for filesystems to call to implement their readahead
+ * address_space operation.
+ *
+ * Context: The @ops callbacks may submit I/O (eg to read the addresses of
+ * blocks from disc), and may wait for it. The caller may be trying to
+ * access a different page, and so sleeping excessively should be avoided.
+ * It may allocate memory, but should avoid costly allocations. This
+ * function is called with memalloc_nofs set, so allocations will not cause
+ * the filesystem to be reentered.
+ */
+void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
{
+ struct inode *inode = rac->mapping->host;
+ loff_t pos = readahead_pos(rac);
+ loff_t length = readahead_length(rac);
struct iomap_readpage_ctx ctx = {
- .pages = pages,
- .is_readahead = true,
+ .rac = rac,
};
- loff_t pos = page_offset(list_entry(pages->prev, struct page, lru));
- loff_t last = page_offset(list_entry(pages->next, struct page, lru));
- loff_t length = last - pos + PAGE_SIZE, ret = 0;
- trace_iomap_readpages(mapping->host, nr_pages);
+ trace_iomap_readahead(inode, readahead_count(rac));
while (length > 0) {
- ret = iomap_apply(mapping->host, pos, length, 0, ops,
- &ctx, iomap_readpages_actor);
+ loff_t ret = iomap_apply(inode, pos, length, 0, ops,
+ &ctx, iomap_readahead_actor);
if (ret <= 0) {
WARN_ON_ONCE(ret == 0);
- goto done;
+ break;
}
pos += ret;
length -= ret;
}
- ret = 0;
-done:
+
if (ctx.bio)
submit_bio(ctx.bio);
if (ctx.cur_page) {
@@ -464,15 +438,8 @@ done:
unlock_page(ctx.cur_page);
put_page(ctx.cur_page);
}
-
- /*
- * Check that we didn't lose a page due to the arcane calling
- * conventions..
- */
- WARN_ON_ONCE(!ret && !list_empty(ctx.pages));
- return ret;
}
-EXPORT_SYMBOL_GPL(iomap_readpages);
+EXPORT_SYMBOL_GPL(iomap_readahead);
/*
* iomap_is_partially_uptodate checks whether blocks within a page are
@@ -554,14 +521,8 @@ iomap_migrate_page(struct address_space *mapping, struct page *newpage,
if (ret != MIGRATEPAGE_SUCCESS)
return ret;
- if (page_has_private(page)) {
- ClearPagePrivate(page);
- get_page(newpage);
- set_page_private(newpage, page_private(page));
- set_page_private(page, 0);
- put_page(page);
- SetPagePrivate(newpage);
- }
+ if (page_has_private(page))
+ attach_page_private(newpage, detach_page_private(page));
if (mode != MIGRATE_SYNC_NO_COPY)
migrate_page_copy(newpage, page);
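
The kernel-doc added for iomap_readahead() above describes the calling convention; wiring it up from a filesystem is a one-line wrapper plus an aops entry. A sketch under assumed names (examplefs_iomap_ops, examplefs_readpage and the aops table are placeholders for whatever the filesystem already defines, not part of this patch):

	/* Hypothetical example only. */
	static void examplefs_readahead(struct readahead_control *rac)
	{
		iomap_readahead(rac, &examplefs_iomap_ops);
	}

	static const struct address_space_operations examplefs_aops = {
		.readpage	= examplefs_readpage,
		.readahead	= examplefs_readahead,
	};

iomap_readahead() then does all of the page extraction and bio construction internally, as the new iomap_readahead_actor() shows.
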
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index 4df19c66f597..5693a39d52fb 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -39,7 +39,7 @@ DEFINE_EVENT(iomap_readpage_class, name, \
TP_PROTO(struct inode *inode, int nr_pages), \
TP_ARGS(inode, nr_pages))
DEFINE_READPAGE_EVENT(iomap_readpage);
-DEFINE_READPAGE_EVENT(iomap_readpages);
+DEFINE_READPAGE_EVENT(iomap_readahead);
DECLARE_EVENT_CLASS(iomap_range_class,
TP_PROTO(struct inode *inode, unsigned long off, unsigned int len),
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 276107cdaaf1..d634561f871a 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -1183,10 +1183,9 @@ static int isofs_readpage(struct file *file, struct page *page)
return mpage_readpage(page, isofs_get_block);
}
-static int isofs_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+static void isofs_readahead(struct readahead_control *rac)
{
- return mpage_readpages(mapping, pages, nr_pages, isofs_get_block);
+ mpage_readahead(rac, isofs_get_block);
}
static sector_t _isofs_bmap(struct address_space *mapping, sector_t block)
@@ -1196,7 +1195,7 @@ static sector_t _isofs_bmap(struct address_space *mapping, sector_t block)
static const struct address_space_operations isofs_aops = {
.readpage = isofs_readpage,
- .readpages = isofs_readpages,
+ .readahead = isofs_readahead,
.bmap = _isofs_bmap
};
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 9486afcdac76..6f65bfa9f18d 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -296,10 +296,9 @@ static int jfs_readpage(struct file *file, struct page *page)
return mpage_readpage(page, jfs_get_block);
}
-static int jfs_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+static void jfs_readahead(struct readahead_control *rac)
{
- return mpage_readpages(mapping, pages, nr_pages, jfs_get_block);
+ mpage_readahead(rac, jfs_get_block);
}
static void jfs_write_failed(struct address_space *mapping, loff_t to)
@@ -358,7 +357,7 @@ static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
const struct address_space_operations jfs_aops = {
.readpage = jfs_readpage,
- .readpages = jfs_readpages,
+ .readahead = jfs_readahead,
.writepage = jfs_writepage,
.writepages = jfs_writepages,
.write_begin = jfs_write_begin,
diff --git a/fs/mpage.c b/fs/mpage.c
index ccba3c4c4479..830e6cc2a9e7 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -91,7 +91,7 @@ mpage_alloc(struct block_device *bdev,
}
/*
- * support function for mpage_readpages. The fs supplied get_block might
+ * support function for mpage_readahead. The fs supplied get_block might
* return an up to date buffer. This is used to map that buffer into
* the page, which allows readpage to avoid triggering a duplicate call
* to get_block.
@@ -338,13 +338,8 @@ confused:
}
/**
- * mpage_readpages - populate an address space with some pages & start reads against them
- * @mapping: the address_space
- * @pages: The address of a list_head which contains the target pages. These
- * pages have their ->index populated and are otherwise uninitialised.
- * The page at @pages->prev has the lowest file offset, and reads should be
- * issued in @pages->prev to @pages->next order.
- * @nr_pages: The number of pages at *@pages
+ * mpage_readahead - start reads against pages
+ * @rac: Describes which pages to read.
* @get_block: The filesystem's block mapper function.
*
* This function walks the pages and the blocks within each page, building and
@@ -381,36 +376,25 @@ confused:
*
* This all causes the disk requests to be issued in the correct order.
*/
-int
-mpage_readpages(struct address_space *mapping, struct list_head *pages,
- unsigned nr_pages, get_block_t get_block)
+void mpage_readahead(struct readahead_control *rac, get_block_t get_block)
{
+ struct page *page;
struct mpage_readpage_args args = {
.get_block = get_block,
.is_readahead = true,
};
- unsigned page_idx;
-
- for (page_idx = 0; page_idx < nr_pages; page_idx++) {
- struct page *page = lru_to_page(pages);
+ while ((page = readahead_page(rac))) {
prefetchw(&page->flags);
- list_del(&page->lru);
- if (!add_to_page_cache_lru(page, mapping,
- page->index,
- readahead_gfp_mask(mapping))) {
- args.page = page;
- args.nr_pages = nr_pages - page_idx;
- args.bio = do_mpage_readpage(&args);
- }
+ args.page = page;
+ args.nr_pages = readahead_count(rac);
+ args.bio = do_mpage_readpage(&args);
put_page(page);
}
- BUG_ON(!list_empty(pages));
if (args.bio)
mpage_bio_submit(REQ_OP_READ, REQ_RAHEAD, args.bio);
- return 0;
}
-EXPORT_SYMBOL(mpage_readpages);
+EXPORT_SYMBOL(mpage_readahead);
/*
* This isn't called much at all
@@ -563,7 +547,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
* Page has buffers, but they are all unmapped. The page was
* created by pagein or read over a hole which was handled by
* block_read_full_page(). If this address_space is also
- * using mpage_readpages then this can rarely happen.
+ * using mpage_readahead then this can rarely happen.
*/
goto confused;
}
diff --git a/fs/namei.c b/fs/namei.c
index d81f73ff1a8b..0cdd76984cd3 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3212,6 +3212,11 @@ static int do_open(struct nameidata *nd,
if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
return -ENOTDIR;
+ /* Opening for execution requires a regular file on an exec mnt. */
+ if ((file->f_mode & FMODE_EXEC) && (!d_is_reg(nd->path.dentry) ||
+ path_noexec(&nd->path)))
+ return -EACCES;
+
do_truncate = false;
acc_mode = op->acc_mode;
if (file->f_mode & FMODE_CREATED) {
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index 7a57ff2528af..8f7cff7a4293 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -582,7 +582,7 @@ retry:
if (!arg->layoutupdate_pages)
return -ENOMEM;
- start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL);
+ start_p = __vmalloc(buffer_size, GFP_NOFS);
if (!start_p) {
kfree(arg->layoutupdate_pages);
return -ENOMEM;
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 25b0d368ecdb..28009ec54420 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -146,18 +146,9 @@ static int nilfs_readpage(struct file *file, struct page *page)
return mpage_readpage(page, nilfs_get_block);
}
-/**
- * nilfs_readpages() - implement readpages() method of nilfs_aops {}
- * address_space_operations.
- * @file - file struct of the file to be read
- * @mapping - address_space struct used for reading multiple pages
- * @pages - the pages to be read
- * @nr_pages - number of pages to be read
- */
-static int nilfs_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned int nr_pages)
+static void nilfs_readahead(struct readahead_control *rac)
{
- return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block);
+ mpage_readahead(rac, nilfs_get_block);
}
static int nilfs_writepages(struct address_space *mapping,
@@ -309,7 +300,7 @@ const struct address_space_operations nilfs_aops = {
.readpage = nilfs_readpage,
.writepages = nilfs_writepages,
.set_page_dirty = nilfs_set_page_dirty,
- .readpages = nilfs_readpages,
+ .readahead = nilfs_readahead,
.write_begin = nilfs_write_begin,
.write_end = nilfs_write_end,
/* .releasepage = nilfs_releasepage, */
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 554b744f41bf..bb0a43860ad2 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1732,7 +1732,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
bh = bh->b_this_page;
} while (bh);
tail->b_this_page = head;
- attach_page_buffers(page, head);
+ attach_page_private(page, head);
} else
buffers_to_free = bh;
}
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
index 842b0bfc3ac9..7068425735f1 100644
--- a/fs/ntfs/malloc.h
+++ b/fs/ntfs/malloc.h
@@ -34,7 +34,7 @@ static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask)
/* return (void *)__get_free_page(gfp_mask); */
}
if (likely((size >> PAGE_SHIFT) < totalram_pages()))
- return __vmalloc(size, gfp_mask, PAGE_KERNEL);
+ return __vmalloc(size, gfp_mask);
return NULL;
}
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 3aac5c917afe..fbb9f1bc623d 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -504,7 +504,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
bh = bh->b_this_page;
} while (bh);
tail->b_this_page = head;
- attach_page_buffers(page, head);
+ attach_page_private(page, head);
}
bh = head = page_buffers(page);
BUG_ON(!bh);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 3a67a6518ddf..3bfb4147895a 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -350,14 +350,11 @@ out:
* grow out to a tree. If need be, detecting boundary extents could
* trivially be added in a future version of ocfs2_get_block().
*/
-static int ocfs2_readpages(struct file *filp, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+static void ocfs2_readahead(struct readahead_control *rac)
{
- int ret, err = -EIO;
- struct inode *inode = mapping->host;
+ int ret;
+ struct inode *inode = rac->mapping->host;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- loff_t start;
- struct page *last;
/*
* Use the nonblocking flag for the dlm code to avoid page
@@ -365,36 +362,31 @@ static int ocfs2_readpages(struct file *filp, struct address_space *mapping,
*/
ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK);
if (ret)
- return err;
+ return;
- if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
- ocfs2_inode_unlock(inode, 0);
- return err;
- }
+ if (down_read_trylock(&oi->ip_alloc_sem) == 0)
+ goto out_unlock;
/*
* Don't bother with inline-data. There isn't anything
* to read-ahead in that case anyway...
*/
if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
- goto out_unlock;
+ goto out_up;
/*
* Check whether a remote node truncated this file - we just
* drop out in that case as it's not worth handling here.
*/
- last = lru_to_page(pages);
- start = (loff_t)last->index << PAGE_SHIFT;
- if (start >= i_size_read(inode))
- goto out_unlock;
+ if (readahead_pos(rac) >= i_size_read(inode))
+ goto out_up;
- err = mpage_readpages(mapping, pages, nr_pages, ocfs2_get_block);
+ mpage_readahead(rac, ocfs2_get_block);
-out_unlock:
+out_up:
up_read(&oi->ip_alloc_sem);
+out_unlock:
ocfs2_inode_unlock(inode, 0);
-
- return err;
}
/* Note: Because we don't support holes, our allocation has
@@ -2474,7 +2466,7 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
const struct address_space_operations ocfs2_aops = {
.readpage = ocfs2_readpage,
- .readpages = ocfs2_readpages,
+ .readahead = ocfs2_readahead,
.writepage = ocfs2_writepage,
.write_begin = ocfs2_write_begin,
.write_end = ocfs2_write_end,
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 55a6512e9fde..f105746063ed 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2760,6 +2760,7 @@ leave:
* Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped
*/
int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
+ __must_hold(&dlm->spinlock)
{
int ret;
int lock_dropped = 0;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 9150cfa4df7d..ee5d98516212 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -279,6 +279,7 @@ enum ocfs2_mount_options
OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */
OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */
OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */
+ OCFS2_MOUNT_NOCLUSTER = 1 << 18, /* No cluster aware filesystem mount */
};
#define OCFS2_OSB_SOFT_RO 0x0001
@@ -673,7 +674,8 @@ static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb)
static inline int ocfs2_mount_local(struct ocfs2_super *osb)
{
- return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT);
+ return ((osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT)
+ || (osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER));
}
static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 8caeceeaeda7..4da0e4b1e79b 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -254,14 +254,16 @@ static int __ocfs2_find_empty_slot(struct ocfs2_slot_info *si,
int i, ret = -ENOSPC;
if ((preferred >= 0) && (preferred < si->si_num_slots)) {
- if (!si->si_slots[preferred].sl_valid) {
+ if (!si->si_slots[preferred].sl_valid ||
+ !si->si_slots[preferred].sl_node_num) {
ret = preferred;
goto out;
}
}
for(i = 0; i < si->si_num_slots; i++) {
- if (!si->si_slots[i].sl_valid) {
+ if (!si->si_slots[i].sl_valid ||
+ !si->si_slots[i].sl_node_num) {
ret = i;
break;
}
@@ -456,24 +458,30 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
spin_lock(&osb->osb_lock);
ocfs2_update_slot_info(si);
- /* search for ourselves first and take the slot if it already
- * exists. Perhaps we need to mark this in a variable for our
- * own journal recovery? Possibly not, though we certainly
- * need to warn to the user */
- slot = __ocfs2_node_num_to_slot(si, osb->node_num);
- if (slot < 0) {
- /* if no slot yet, then just take 1st available
- * one. */
- slot = __ocfs2_find_empty_slot(si, osb->preferred_slot);
+ if (ocfs2_mount_local(osb))
+ /* use slot 0 directly in local mode */
+ slot = 0;
+ else {
+ /* search for ourselves first and take the slot if it already
+ * exists. Perhaps we need to mark this in a variable for our
+ * own journal recovery? Possibly not, though we certainly
+ * need to warn to the user */
+ slot = __ocfs2_node_num_to_slot(si, osb->node_num);
if (slot < 0) {
- spin_unlock(&osb->osb_lock);
- mlog(ML_ERROR, "no free slots available!\n");
- status = -EINVAL;
- goto bail;
- }
- } else
- printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already "
- "allocated to this node!\n", slot, osb->dev_str);
+ /* if no slot yet, then just take 1st available
+ * one. */
+ slot = __ocfs2_find_empty_slot(si, osb->preferred_slot);
+ if (slot < 0) {
+ spin_unlock(&osb->osb_lock);
+ mlog(ML_ERROR, "no free slots available!\n");
+ status = -EINVAL;
+ goto bail;
+ }
+ } else
+ printk(KERN_INFO "ocfs2: Slot %d on device (%s) was "
+ "already allocated to this node!\n",
+ slot, osb->dev_str);
+ }
ocfs2_set_slot(si, slot, osb->node_num);
osb->slot_num = slot;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index ac61eeaf3837..71ea9ce71a6b 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -175,6 +175,7 @@ enum {
Opt_dir_resv_level,
Opt_journal_async_commit,
Opt_err_cont,
+ Opt_nocluster,
Opt_err,
};
@@ -208,6 +209,7 @@ static const match_table_t tokens = {
{Opt_dir_resv_level, "dir_resv_level=%u"},
{Opt_journal_async_commit, "journal_async_commit"},
{Opt_err_cont, "errors=continue"},
+ {Opt_nocluster, "nocluster"},
{Opt_err, NULL}
};
@@ -619,6 +621,13 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
goto out;
}
+ tmp = OCFS2_MOUNT_NOCLUSTER;
+ if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) {
+ ret = -EINVAL;
+ mlog(ML_ERROR, "Cannot change nocluster option on remount\n");
+ goto out;
+ }
+
tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
OCFS2_MOUNT_HB_NONE;
if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) {
@@ -859,6 +868,7 @@ static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb,
}
if (ocfs2_userspace_stack(osb) &&
+ !(osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER) &&
strncmp(osb->osb_cluster_stack, mopt->cluster_stack,
OCFS2_STACK_LABEL_LEN)) {
mlog(ML_ERROR,
@@ -1139,6 +1149,11 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" :
"ordered");
+ if ((osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER) &&
+ !(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT))
+ printk(KERN_NOTICE "ocfs2: The shared device (%s) is mounted "
+ "without cluster aware mode.\n", osb->dev_str);
+
atomic_set(&osb->vol_state, VOLUME_MOUNTED);
wake_up(&osb->osb_mount_event);
@@ -1445,6 +1460,9 @@ static int ocfs2_parse_options(struct super_block *sb,
case Opt_journal_async_commit:
mopt->mount_opt |= OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT;
break;
+ case Opt_nocluster:
+ mopt->mount_opt |= OCFS2_MOUNT_NOCLUSTER;
+ break;
default:
mlog(ML_ERROR,
"Unrecognized mount option \"%s\" "
@@ -1556,6 +1574,9 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
if (opts & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT)
seq_printf(s, ",journal_async_commit");
+ if (opts & OCFS2_MOUNT_NOCLUSTER)
+ seq_printf(s, ",nocluster");
+
return 0;
}
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index d640b9388238..d7b5f09d298c 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -289,10 +289,9 @@ static int omfs_readpage(struct file *file, struct page *page)
return block_read_full_page(page, omfs_get_block);
}
-static int omfs_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+static void omfs_readahead(struct readahead_control *rac)
{
- return mpage_readpages(mapping, pages, nr_pages, omfs_get_block);
+ mpage_readahead(rac, omfs_get_block);
}
static int omfs_writepage(struct page *page, struct writeback_control *wbc)
@@ -373,7 +372,7 @@ const struct inode_operations omfs_file_inops = {
const struct address_space_operations omfs_aops = {
.readpage = omfs_readpage,
- .readpages = omfs_readpages,
+ .readahead = omfs_readahead,
.writepage = omfs_writepage,
.writepages = omfs_writepages,
.write_begin = omfs_write_begin,
diff --git a/fs/open.c b/fs/open.c
index e62b1db06638..623b7506a6db 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -775,9 +775,8 @@ static int do_dentry_open(struct file *f,
path_get(&f->f_path);
f->f_inode = inode;
f->f_mapping = inode->i_mapping;
-
- /* Ensure that we skip any errors that predate opening of the file */
f->f_wb_err = filemap_sample_wb_err(f->f_mapping);
+ f->f_sb_err = file_sample_sb_err(f);
if (unlikely(f->f_flags & O_PATH)) {
f->f_mode = FMODE_PATH | FMODE_OPENED;
@@ -785,12 +784,6 @@ static int do_dentry_open(struct file *f,
return 0;
}
- /* Any file opened for execve()/uselib() has to be a regular file. */
- if (unlikely(f->f_flags & FMODE_EXEC && !S_ISREG(inode->i_mode))) {
- error = -EACCES;
- goto cleanup_file;
- }
-
if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
error = get_write_access(inode);
if (unlikely(error))
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 12ae630fbed7..48f0547d4850 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -62,12 +62,7 @@ static int orangefs_writepage_locked(struct page *page,
} else {
ret = 0;
}
- if (wr) {
- kfree(wr);
- set_page_private(page, 0);
- ClearPagePrivate(page);
- put_page(page);
- }
+ kfree(detach_page_private(page));
return ret;
}
@@ -409,9 +404,7 @@ static int orangefs_write_begin(struct file *file,
wr->len = len;
wr->uid = current_fsuid();
wr->gid = current_fsgid();
- SetPagePrivate(page);
- set_page_private(page, (unsigned long)wr);
- get_page(page);
+ attach_page_private(page, wr);
okay:
return 0;
}
@@ -459,18 +452,12 @@ static void orangefs_invalidatepage(struct page *page,
wr = (struct orangefs_write_range *)page_private(page);
if (offset == 0 && length == PAGE_SIZE) {
- kfree((struct orangefs_write_range *)page_private(page));
- set_page_private(page, 0);
- ClearPagePrivate(page);
- put_page(page);
+ kfree(detach_page_private(page));
return;
/* write range entirely within invalidate range (or equal) */
} else if (page_offset(page) + offset <= wr->pos &&
wr->pos + wr->len <= page_offset(page) + offset + length) {
- kfree((struct orangefs_write_range *)page_private(page));
- set_page_private(page, 0);
- ClearPagePrivate(page);
- put_page(page);
+ kfree(detach_page_private(page));
/* XXX is this right? only caller in fs */
cancel_dirty_page(page);
return;
@@ -535,12 +522,7 @@ static int orangefs_releasepage(struct page *page, gfp_t foo)
static void orangefs_freepage(struct page *page)
{
- if (PagePrivate(page)) {
- kfree((struct orangefs_write_range *)page_private(page));
- set_page_private(page, 0);
- ClearPagePrivate(page);
- put_page(page);
- }
+ kfree(detach_page_private(page));
}
static int orangefs_launder_page(struct page *page)
@@ -740,9 +722,7 @@ vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf)
wr->len = PAGE_SIZE;
wr->uid = current_fsuid();
wr->gid = current_fsgid();
- SetPagePrivate(page);
- set_page_private(page, (unsigned long)wr);
- get_page(page);
+ attach_page_private(page, wr);
okay:
file_update_time(vmf->vma->vm_file);
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 043311014db2..713ffac59bbb 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -248,8 +248,8 @@ void render_sigset_t(struct seq_file *m, const char *header,
seq_putc(m, '\n');
}
-static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign,
- sigset_t *catch)
+static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *sigign,
+ sigset_t *sigcatch)
{
struct k_sigaction *k;
int i;
@@ -257,9 +257,9 @@ static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign,
k = p->sighand->action;
for (i = 1; i <= _NSIG; ++i, ++k) {
if (k->sa.sa_handler == SIG_IGN)
- sigaddset(ign, i);
+ sigaddset(sigign, i);
else if (k->sa.sa_handler != SIG_DFL)
- sigaddset(catch, i);
+ sigaddset(sigcatch, i);
}
}
diff --git a/fs/proc/page.c b/fs/proc/page.c
index f909243d4a66..f3b39a7d2bf3 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -115,7 +115,10 @@ u64 stable_page_flags(struct page *page)
* it differentiates a memory hole from a page with no flags
*/
if (!page)
- return 1 << KPF_NOPAGE;
+ return BIT_ULL(KPF_NOPAGE);
+
+ if (pfn_zone_device_reserved(page_to_pfn(page)))
+ return BIT_ULL(KPF_RESERVED);
k = page->flags;
u = 0;
@@ -127,22 +130,22 @@ u64 stable_page_flags(struct page *page)
* simple test in page_mapped() is not enough.
*/
if (!PageSlab(page) && page_mapped(page))
- u |= 1 << KPF_MMAP;
+ u |= BIT_ULL(KPF_MMAP);
if (PageAnon(page))
- u |= 1 << KPF_ANON;
+ u |= BIT_ULL(KPF_ANON);
if (PageKsm(page))
- u |= 1 << KPF_KSM;
+ u |= BIT_ULL(KPF_KSM);
/*
* compound pages: export both head/tail info
* they together define a compound page's start/end pos and order
*/
if (PageHead(page))
- u |= 1 << KPF_COMPOUND_HEAD;
+ u |= BIT_ULL(KPF_COMPOUND_HEAD);
if (PageTail(page))
- u |= 1 << KPF_COMPOUND_TAIL;
+ u |= BIT_ULL(KPF_COMPOUND_TAIL);
if (PageHuge(page))
- u |= 1 << KPF_HUGE;
+ u |= BIT_ULL(KPF_HUGE);
/*
* PageTransCompound can be true for non-huge compound pages (slab
* pages or pages allocated by drivers with __GFP_COMP) because it
@@ -153,14 +156,13 @@ u64 stable_page_flags(struct page *page)
struct page *head = compound_head(page);
if (PageLRU(head) || PageAnon(head))
- u |= 1 << KPF_THP;
+ u |= BIT_ULL(KPF_THP);
else if (is_huge_zero_page(head)) {
- u |= 1 << KPF_ZERO_PAGE;
- u |= 1 << KPF_THP;
+ u |= BIT_ULL(KPF_ZERO_PAGE);
+ u |= BIT_ULL(KPF_THP);
}
} else if (is_zero_pfn(page_to_pfn(page)))
- u |= 1 << KPF_ZERO_PAGE;
-
+ u |= BIT_ULL(KPF_ZERO_PAGE);
/*
* Caveats on high order pages: page->_refcount will only be set
@@ -168,23 +170,23 @@ u64 stable_page_flags(struct page *page)
* SLOB won't set PG_slab at all on compound pages.
*/
if (PageBuddy(page))
- u |= 1 << KPF_BUDDY;
+ u |= BIT_ULL(KPF_BUDDY);
else if (page_count(page) == 0 && is_free_buddy_page(page))
- u |= 1 << KPF_BUDDY;
+ u |= BIT_ULL(KPF_BUDDY);
if (PageOffline(page))
- u |= 1 << KPF_OFFLINE;
+ u |= BIT_ULL(KPF_OFFLINE);
if (PageTable(page))
- u |= 1 << KPF_PGTABLE;
+ u |= BIT_ULL(KPF_PGTABLE);
if (page_is_idle(page))
- u |= 1 << KPF_IDLE;
+ u |= BIT_ULL(KPF_IDLE);
u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
if (PageTail(page) && PageSlab(compound_head(page)))
- u |= 1 << KPF_SLAB;
+ u |= BIT_ULL(KPF_SLAB);
u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty);
@@ -197,7 +199,7 @@ u64 stable_page_flags(struct page *page)
u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim);
if (PageSwapCache(page))
- u |= 1 << KPF_SWAPCACHE;
+ u |= BIT_ULL(KPF_SWAPCACHE);
u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked);
u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable);
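The 1 << KPF_* to BIT_ULL(KPF_*) conversion is more than style: stable_page_flags() returns a u64, while a plain 1 << n is an int-width shift, so a flag bit at position 31 or above would be undefined behaviour and silently dropped. A small illustration (KPF_EXAMPLE is a hypothetical flag, not part of this patch):

#define KPF_EXAMPLE     33                      /* hypothetical bit above 31 */

u64 wrong = 1 << KPF_EXAMPLE;                   /* 32-bit shift: undefined, bit lost */
u64 right = BIT_ULL(KPF_EXAMPLE);               /* 1ULL << 33: the intended bit */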
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 10a6d472397f..6ad407d5efe2 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -546,10 +546,17 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
struct mem_size_stats *mss = walk->private;
struct vm_area_struct *vma = walk->vma;
bool locked = !!(vma->vm_flags & VM_LOCKED);
- struct page *page;
+ struct page *page = NULL;
+
+ if (pmd_present(*pmd)) {
+ /* FOLL_DUMP will return -EFAULT on huge zero page */
+ page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP);
+ } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
+ swp_entry_t entry = pmd_to_swp_entry(*pmd);
- /* FOLL_DUMP will return -EFAULT on huge zero page */
- page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP);
+ if (is_migration_entry(entry))
+ page = migration_entry_to_page(entry);
+ }
if (IS_ERR_OR_NULL(page))
return;
if (PageAnon(page))
@@ -578,8 +585,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
- if (pmd_present(*pmd))
- smaps_pmd_entry(pmd, addr, walk);
+ smaps_pmd_entry(pmd, addr, walk);
spin_unlock(ptl);
goto out;
}
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 345db56c98fd..755293c8c71a 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -99,10 +99,9 @@ static int qnx6_readpage(struct file *file, struct page *page)
return mpage_readpage(page, qnx6_get_block);
}
-static int qnx6_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+static void qnx6_readahead(struct readahead_control *rac)
{
- return mpage_readpages(mapping, pages, nr_pages, qnx6_get_block);
+ mpage_readahead(rac, qnx6_get_block);
}
/*
@@ -499,7 +498,7 @@ static sector_t qnx6_bmap(struct address_space *mapping, sector_t block)
}
static const struct address_space_operations qnx6_aops = {
.readpage = qnx6_readpage,
- .readpages = qnx6_readpages,
+ .readahead = qnx6_readahead,
.bmap = qnx6_bmap
};
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index ee179a81b3da..55c1dc329bd2 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -147,6 +147,17 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char *
return error;
}
+static int ramfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ struct inode *inode;
+
+ inode = ramfs_get_inode(dir->i_sb, dir, mode, 0);
+ if (!inode)
+ return -ENOSPC;
+ d_tmpfile(dentry, inode);
+ return 0;
+}
+
static const struct inode_operations ramfs_dir_inode_operations = {
.create = ramfs_create,
.lookup = simple_lookup,
@@ -157,6 +168,7 @@ static const struct inode_operations ramfs_dir_inode_operations = {
.rmdir = simple_rmdir,
.mknod = ramfs_mknod,
.rename = simple_rename,
+ .tmpfile = ramfs_tmpfile,
};
/*
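With a .tmpfile operation wired up, ramfs mounts can now service O_TMPFILE opens. A userspace sketch, assuming a ramfs instance mounted at /mnt/ramfs (the path is illustrative):

#define _GNU_SOURCE                     /* for O_TMPFILE */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* Unnamed, unlinked scratch file on the ramfs mount. */
        int fd = open("/mnt/ramfs", O_TMPFILE | O_RDWR, 0600);

        if (fd < 0) {
                perror("O_TMPFILE");
                return 1;
        }
        write(fd, "scratch", 7);
        close(fd);                      /* inode disappears with the last reference */
        return 0;
}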
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 6419e6dacc39..0031070b3692 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1160,11 +1160,9 @@ failure:
return retval;
}
-static int
-reiserfs_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+static void reiserfs_readahead(struct readahead_control *rac)
{
- return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block);
+ mpage_readahead(rac, reiserfs_get_block);
}
/*
@@ -3434,7 +3432,7 @@ out:
const struct address_space_operations reiserfs_address_space_operations = {
.writepage = reiserfs_writepage,
.readpage = reiserfs_readpage,
- .readpages = reiserfs_readpages,
+ .readahead = reiserfs_readahead,
.releasepage = reiserfs_releasepage,
.invalidatepage = reiserfs_invalidatepage,
.write_begin = reiserfs_write_begin,
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 70f5fdf99bf6..4e6239f33c06 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -6,6 +6,8 @@
* initial implementation -- AV, Oct 2001.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/cache.h>
#include <linux/fs.h>
#include <linux/export.h>
@@ -233,9 +235,8 @@ Fill:
p = m->op->next(m, p, &m->index);
if (pos == m->index) {
- pr_info_ratelimited("buggy seq_file .next function %ps "
- "did not updated position index\n",
- m->op->next);
+ pr_info_ratelimited("buggy .next function %ps did not update position index\n",
+ m->op->next);
m->index++;
}
if (!p || IS_ERR(p)) {
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 4f9b9fb59362..64f61330564a 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -13,6 +13,7 @@
* datablocks and metadata blocks.
*/
+#include <linux/blkdev.h>
#include <linux/fs.h>
#include <linux/vfs.h>
#include <linux/slab.h>
@@ -27,44 +28,103 @@
#include "page_actor.h"
/*
- * Read the metadata block length, this is stored in the first two
- * bytes of the metadata block.
+ * Returns the number of bytes copied to the page actor.
*/
-static struct buffer_head *get_block_length(struct super_block *sb,
- u64 *cur_index, int *offset, int *length)
+static int copy_bio_to_actor(struct bio *bio,
+ struct squashfs_page_actor *actor,
+ int offset, int req_length)
+{
+ void *actor_addr = squashfs_first_page(actor);
+ struct bvec_iter_all iter_all = {};
+ struct bio_vec *bvec = bvec_init_iter_all(&iter_all);
+ int copied_bytes = 0;
+ int actor_offset = 0;
+
+ if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all)))
+ return 0;
+
+ while (copied_bytes < req_length) {
+ int bytes_to_copy = min_t(int, bvec->bv_len - offset,
+ PAGE_SIZE - actor_offset);
+
+ bytes_to_copy = min_t(int, bytes_to_copy,
+ req_length - copied_bytes);
+ memcpy(actor_addr + actor_offset,
+ page_address(bvec->bv_page) + bvec->bv_offset + offset,
+ bytes_to_copy);
+
+ actor_offset += bytes_to_copy;
+ copied_bytes += bytes_to_copy;
+ offset += bytes_to_copy;
+
+ if (actor_offset >= PAGE_SIZE) {
+ actor_addr = squashfs_next_page(actor);
+ if (!actor_addr)
+ break;
+ actor_offset = 0;
+ }
+ if (offset >= bvec->bv_len) {
+ if (!bio_next_segment(bio, &iter_all))
+ break;
+ offset = 0;
+ }
+ }
+ squashfs_finish_page(actor);
+ return copied_bytes;
+}
+
+static int squashfs_bio_read(struct super_block *sb, u64 index, int length,
+ struct bio **biop, int *block_offset)
{
struct squashfs_sb_info *msblk = sb->s_fs_info;
- struct buffer_head *bh;
-
- bh = sb_bread(sb, *cur_index);
- if (bh == NULL)
- return NULL;
-
- if (msblk->devblksize - *offset == 1) {
- *length = (unsigned char) bh->b_data[*offset];
- put_bh(bh);
- bh = sb_bread(sb, ++(*cur_index));
- if (bh == NULL)
- return NULL;
- *length |= (unsigned char) bh->b_data[0] << 8;
- *offset = 1;
- } else {
- *length = (unsigned char) bh->b_data[*offset] |
- (unsigned char) bh->b_data[*offset + 1] << 8;
- *offset += 2;
-
- if (*offset == msblk->devblksize) {
- put_bh(bh);
- bh = sb_bread(sb, ++(*cur_index));
- if (bh == NULL)
- return NULL;
- *offset = 0;
+ const u64 read_start = round_down(index, msblk->devblksize);
+ const sector_t block = read_start >> msblk->devblksize_log2;
+ const u64 read_end = round_up(index + length, msblk->devblksize);
+ const sector_t block_end = read_end >> msblk->devblksize_log2;
+ int offset = read_start - round_down(index, PAGE_SIZE);
+ int total_len = (block_end - block) << msblk->devblksize_log2;
+ const int page_count = DIV_ROUND_UP(total_len + offset, PAGE_SIZE);
+ int error, i;
+ struct bio *bio;
+
+ bio = bio_alloc(GFP_NOIO, page_count);
+ if (!bio)
+ return -ENOMEM;
+
+ bio_set_dev(bio, sb->s_bdev);
+ bio->bi_opf = READ;
+ bio->bi_iter.bi_sector = block * (msblk->devblksize >> SECTOR_SHIFT);
+
+ for (i = 0; i < page_count; ++i) {
+ unsigned int len =
+ min_t(unsigned int, PAGE_SIZE - offset, total_len);
+ struct page *page = alloc_page(GFP_NOIO);
+
+ if (!page) {
+ error = -ENOMEM;
+ goto out_free_bio;
+ }
+ if (!bio_add_page(bio, page, len, offset)) {
+ error = -EIO;
+ goto out_free_bio;
}
+ offset = 0;
+ total_len -= len;
}
- return bh;
-}
+ error = submit_bio_wait(bio);
+ if (error)
+ goto out_free_bio;
+ *biop = bio;
+ *block_offset = index & ((1 << msblk->devblksize_log2) - 1);
+ return 0;
+
+out_free_bio:
+ bio_free_pages(bio);
+ bio_put(bio);
+ return error;
+}
/*
* Read and decompress a metadata block or datablock. Length is non-zero
@@ -76,129 +136,88 @@ static struct buffer_head *get_block_length(struct super_block *sb,
* algorithms).
*/
int squashfs_read_data(struct super_block *sb, u64 index, int length,
- u64 *next_index, struct squashfs_page_actor *output)
+ u64 *next_index, struct squashfs_page_actor *output)
{
struct squashfs_sb_info *msblk = sb->s_fs_info;
- struct buffer_head **bh;
- int offset = index & ((1 << msblk->devblksize_log2) - 1);
- u64 cur_index = index >> msblk->devblksize_log2;
- int bytes, compressed, b = 0, k = 0, avail, i;
-
- bh = kcalloc(((output->length + msblk->devblksize - 1)
- >> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL);
- if (bh == NULL)
- return -ENOMEM;
+ struct bio *bio = NULL;
+ int compressed;
+ int res;
+ int offset;
if (length) {
/*
* Datablock.
*/
- bytes = -offset;
compressed = SQUASHFS_COMPRESSED_BLOCK(length);
length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length);
- if (next_index)
- *next_index = index + length;
-
TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n",
index, compressed ? "" : "un", length, output->length);
-
- if (length < 0 || length > output->length ||
- (index + length) > msblk->bytes_used)
- goto read_failure;
-
- for (b = 0; bytes < length; b++, cur_index++) {
- bh[b] = sb_getblk(sb, cur_index);
- if (bh[b] == NULL)
- goto block_release;
- bytes += msblk->devblksize;
- }
- ll_rw_block(REQ_OP_READ, 0, b, bh);
} else {
/*
* Metadata block.
*/
- if ((index + 2) > msblk->bytes_used)
- goto read_failure;
+ const u8 *data;
+ struct bvec_iter_all iter_all = {};
+ struct bio_vec *bvec = bvec_init_iter_all(&iter_all);
- bh[0] = get_block_length(sb, &cur_index, &offset, &length);
- if (bh[0] == NULL)
- goto read_failure;
- b = 1;
+ if (index + 2 > msblk->bytes_used) {
+ res = -EIO;
+ goto out;
+ }
+ res = squashfs_bio_read(sb, index, 2, &bio, &offset);
+ if (res)
+ goto out;
+
+ if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all))) {
+ res = -EIO;
+ goto out_free_bio;
+ }
+ /* Extract the length of the metadata block */
+ data = page_address(bvec->bv_page) + bvec->bv_offset;
+ length = data[offset];
+ if (offset < bvec->bv_len - 1) {
+ length |= data[offset + 1] << 8;
+ } else {
+ if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all))) {
+ res = -EIO;
+ goto out_free_bio;
+ }
+ data = page_address(bvec->bv_page) + bvec->bv_offset;
+ length |= data[0] << 8;
+ }
+ bio_free_pages(bio);
+ bio_put(bio);
- bytes = msblk->devblksize - offset;
compressed = SQUASHFS_COMPRESSED(length);
length = SQUASHFS_COMPRESSED_SIZE(length);
- if (next_index)
- *next_index = index + length + 2;
+ index += 2;
TRACE("Block @ 0x%llx, %scompressed size %d\n", index,
- compressed ? "" : "un", length);
-
- if (length < 0 || length > output->length ||
- (index + length) > msblk->bytes_used)
- goto block_release;
-
- for (; bytes < length; b++) {
- bh[b] = sb_getblk(sb, ++cur_index);
- if (bh[b] == NULL)
- goto block_release;
- bytes += msblk->devblksize;
- }
- ll_rw_block(REQ_OP_READ, 0, b - 1, bh + 1);
+ compressed ? "" : "un", length);
}
+ if (next_index)
+ *next_index = index + length;
- for (i = 0; i < b; i++) {
- wait_on_buffer(bh[i]);
- if (!buffer_uptodate(bh[i]))
- goto block_release;
- }
+ res = squashfs_bio_read(sb, index, length, &bio, &offset);
+ if (res)
+ goto out;
if (compressed) {
- if (!msblk->stream)
- goto read_failure;
- length = squashfs_decompress(msblk, bh, b, offset, length,
- output);
- if (length < 0)
- goto read_failure;
- } else {
- /*
- * Block is uncompressed.
- */
- int in, pg_offset = 0;
- void *data = squashfs_first_page(output);
-
- for (bytes = length; k < b; k++) {
- in = min(bytes, msblk->devblksize - offset);
- bytes -= in;
- while (in) {
- if (pg_offset == PAGE_SIZE) {
- data = squashfs_next_page(output);
- pg_offset = 0;
- }
- avail = min_t(int, in, PAGE_SIZE -
- pg_offset);
- memcpy(data + pg_offset, bh[k]->b_data + offset,
- avail);
- in -= avail;
- pg_offset += avail;
- offset += avail;
- }
- offset = 0;
- put_bh(bh[k]);
+ if (!msblk->stream) {
+ res = -EIO;
+ goto out_free_bio;
}
- squashfs_finish_page(output);
+ res = squashfs_decompress(msblk, bio, offset, length, output);
+ } else {
+ res = copy_bio_to_actor(bio, output, offset, length);
}
- kfree(bh);
- return length;
-
-block_release:
- for (; k < b; k++)
- put_bh(bh[k]);
+out_free_bio:
+ bio_free_pages(bio);
+ bio_put(bio);
+out:
+ if (res < 0)
+ ERROR("Failed to read block 0x%llx: %d\n", index, res);
-read_failure:
- ERROR("squashfs_read_data failed to read block 0x%llx\n",
- (unsigned long long) index);
- kfree(bh);
- return -EIO;
+ return res;
}
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
index ec8617523e56..1b9ccfd0aa51 100644
--- a/fs/squashfs/decompressor.h
+++ b/fs/squashfs/decompressor.h
@@ -10,13 +10,14 @@
* decompressor.h
*/
+#include <linux/bio.h>
+
struct squashfs_decompressor {
void *(*init)(struct squashfs_sb_info *, void *);
void *(*comp_opts)(struct squashfs_sb_info *, void *, int);
void (*free)(void *);
int (*decompress)(struct squashfs_sb_info *, void *,
- struct buffer_head **, int, int, int,
- struct squashfs_page_actor *);
+ struct bio *, int, int, struct squashfs_page_actor *);
int id;
char *name;
int supported;
diff --git a/fs/squashfs/decompressor_multi.c b/fs/squashfs/decompressor_multi.c
index c181dee235bb..db9f12a3ea05 100644
--- a/fs/squashfs/decompressor_multi.c
+++ b/fs/squashfs/decompressor_multi.c
@@ -6,7 +6,7 @@
#include <linux/types.h>
#include <linux/mutex.h>
#include <linux/slab.h>
-#include <linux/buffer_head.h>
+#include <linux/bio.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/cpumask.h>
@@ -180,14 +180,15 @@ wait:
}
-int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh,
- int b, int offset, int length, struct squashfs_page_actor *output)
+int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio,
+ int offset, int length,
+ struct squashfs_page_actor *output)
{
int res;
struct squashfs_stream *stream = msblk->stream;
struct decomp_stream *decomp_stream = get_decomp_stream(msblk, stream);
res = msblk->decompressor->decompress(msblk, decomp_stream->stream,
- bh, b, offset, length, output);
+ bio, offset, length, output);
put_decomp_stream(decomp_stream, stream);
if (res < 0)
ERROR("%s decompression failed, data probably corrupt\n",
diff --git a/fs/squashfs/decompressor_multi_percpu.c b/fs/squashfs/decompressor_multi_percpu.c
index 2a2a2d106440..d93e12d9b712 100644
--- a/fs/squashfs/decompressor_multi_percpu.c
+++ b/fs/squashfs/decompressor_multi_percpu.c
@@ -72,14 +72,17 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
}
}
-int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh,
- int b, int offset, int length, struct squashfs_page_actor *output)
+int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio,
+ int offset, int length, struct squashfs_page_actor *output)
{
- struct squashfs_stream __percpu *percpu =
- (struct squashfs_stream __percpu *) msblk->stream;
- struct squashfs_stream *stream = get_cpu_ptr(percpu);
- int res = msblk->decompressor->decompress(msblk, stream->stream, bh, b,
- offset, length, output);
+ struct squashfs_stream __percpu *percpu;
+ struct squashfs_stream *stream;
+ int res;
+
+ percpu = (struct squashfs_stream __percpu *)msblk->stream;
+ stream = get_cpu_ptr(percpu);
+ res = msblk->decompressor->decompress(msblk, stream->stream, bio,
+ offset, length, output);
put_cpu_ptr(stream);
if (res < 0)
diff --git a/fs/squashfs/decompressor_single.c b/fs/squashfs/decompressor_single.c
index 550c3e592032..4eb3d083d45e 100644
--- a/fs/squashfs/decompressor_single.c
+++ b/fs/squashfs/decompressor_single.c
@@ -7,7 +7,7 @@
#include <linux/types.h>
#include <linux/mutex.h>
#include <linux/slab.h>
-#include <linux/buffer_head.h>
+#include <linux/bio.h>
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
@@ -59,14 +59,15 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
}
}
-int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh,
- int b, int offset, int length, struct squashfs_page_actor *output)
+int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio,
+ int offset, int length,
+ struct squashfs_page_actor *output)
{
int res;
struct squashfs_stream *stream = msblk->stream;
mutex_lock(&stream->mutex);
- res = msblk->decompressor->decompress(msblk, stream->stream, bh, b,
+ res = msblk->decompressor->decompress(msblk, stream->stream, bio,
offset, length, output);
mutex_unlock(&stream->mutex);
diff --git a/fs/squashfs/lz4_wrapper.c b/fs/squashfs/lz4_wrapper.c
index c4e47e0588c7..233d5582fbee 100644
--- a/fs/squashfs/lz4_wrapper.c
+++ b/fs/squashfs/lz4_wrapper.c
@@ -4,7 +4,7 @@
* Phillip Lougher <phillip@squashfs.org.uk>
*/
-#include <linux/buffer_head.h>
+#include <linux/bio.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
@@ -89,20 +89,23 @@ static void lz4_free(void *strm)
static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm,
- struct buffer_head **bh, int b, int offset, int length,
+ struct bio *bio, int offset, int length,
struct squashfs_page_actor *output)
{
+ struct bvec_iter_all iter_all = {};
+ struct bio_vec *bvec = bvec_init_iter_all(&iter_all);
struct squashfs_lz4 *stream = strm;
void *buff = stream->input, *data;
- int avail, i, bytes = length, res;
+ int bytes = length, res;
- for (i = 0; i < b; i++) {
- avail = min(bytes, msblk->devblksize - offset);
- memcpy(buff, bh[i]->b_data + offset, avail);
+ while (bio_next_segment(bio, &iter_all)) {
+ int avail = min(bytes, ((int)bvec->bv_len) - offset);
+
+ data = page_address(bvec->bv_page) + bvec->bv_offset;
+ memcpy(buff, data + offset, avail);
buff += avail;
bytes -= avail;
offset = 0;
- put_bh(bh[i]);
}
res = LZ4_decompress_safe(stream->input, stream->output,
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
index aa3c3dafc33d..97bb7d92ddcd 100644
--- a/fs/squashfs/lzo_wrapper.c
+++ b/fs/squashfs/lzo_wrapper.c
@@ -9,7 +9,7 @@
*/
#include <linux/mutex.h>
-#include <linux/buffer_head.h>
+#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/lzo.h>
@@ -63,21 +63,24 @@ static void lzo_free(void *strm)
static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm,
- struct buffer_head **bh, int b, int offset, int length,
+ struct bio *bio, int offset, int length,
struct squashfs_page_actor *output)
{
+ struct bvec_iter_all iter_all = {};
+ struct bio_vec *bvec = bvec_init_iter_all(&iter_all);
struct squashfs_lzo *stream = strm;
void *buff = stream->input, *data;
- int avail, i, bytes = length, res;
+ int bytes = length, res;
size_t out_len = output->length;
- for (i = 0; i < b; i++) {
- avail = min(bytes, msblk->devblksize - offset);
- memcpy(buff, bh[i]->b_data + offset, avail);
+ while (bio_next_segment(bio, &iter_all)) {
+ int avail = min(bytes, ((int)bvec->bv_len) - offset);
+
+ data = page_address(bvec->bv_page) + bvec->bv_offset;
+ memcpy(buff, data + offset, avail);
buff += avail;
bytes -= avail;
offset = 0;
- put_bh(bh[i]);
}
res = lzo1x_decompress_safe(stream->input, (size_t)length,
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 2797763ed046..9783e01c8100 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -40,8 +40,8 @@ extern void *squashfs_decompressor_setup(struct super_block *, unsigned short);
/* decompressor_xxx.c */
extern void *squashfs_decompressor_create(struct squashfs_sb_info *, void *);
extern void squashfs_decompressor_destroy(struct squashfs_sb_info *);
-extern int squashfs_decompress(struct squashfs_sb_info *, struct buffer_head **,
- int, int, int, struct squashfs_page_actor *);
+extern int squashfs_decompress(struct squashfs_sb_info *, struct bio *,
+ int, int, struct squashfs_page_actor *);
extern int squashfs_max_decompressors(void);
/* export.c */
diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
index 4b2f2051a6dc..e80419aed862 100644
--- a/fs/squashfs/xz_wrapper.c
+++ b/fs/squashfs/xz_wrapper.c
@@ -10,7 +10,7 @@
#include <linux/mutex.h>
-#include <linux/buffer_head.h>
+#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/xz.h>
#include <linux/bitops.h>
@@ -117,11 +117,12 @@ static void squashfs_xz_free(void *strm)
static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
- struct buffer_head **bh, int b, int offset, int length,
+ struct bio *bio, int offset, int length,
struct squashfs_page_actor *output)
{
- enum xz_ret xz_err;
- int avail, total = 0, k = 0;
+ struct bvec_iter_all iter_all = {};
+ struct bio_vec *bvec = bvec_init_iter_all(&iter_all);
+ int total = 0, error = 0;
struct squashfs_xz *stream = strm;
xz_dec_reset(stream->state);
@@ -131,11 +132,23 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
stream->buf.out_size = PAGE_SIZE;
stream->buf.out = squashfs_first_page(output);
- do {
- if (stream->buf.in_pos == stream->buf.in_size && k < b) {
- avail = min(length, msblk->devblksize - offset);
+ for (;;) {
+ enum xz_ret xz_err;
+
+ if (stream->buf.in_pos == stream->buf.in_size) {
+ const void *data;
+ int avail;
+
+ if (!bio_next_segment(bio, &iter_all)) {
+ /* XZ_STREAM_END must be reached. */
+ error = -EIO;
+ break;
+ }
+
+ avail = min(length, ((int)bvec->bv_len) - offset);
+ data = page_address(bvec->bv_page) + bvec->bv_offset;
length -= avail;
- stream->buf.in = bh[k]->b_data + offset;
+ stream->buf.in = data + offset;
stream->buf.in_size = avail;
stream->buf.in_pos = 0;
offset = 0;
@@ -150,23 +163,17 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
}
xz_err = xz_dec_run(stream->state, &stream->buf);
-
- if (stream->buf.in_pos == stream->buf.in_size && k < b)
- put_bh(bh[k++]);
- } while (xz_err == XZ_OK);
+ if (xz_err == XZ_STREAM_END)
+ break;
+ if (xz_err != XZ_OK) {
+ error = -EIO;
+ break;
+ }
+ }
squashfs_finish_page(output);
- if (xz_err != XZ_STREAM_END || k < b)
- goto out;
-
- return total + stream->buf.out_pos;
-
-out:
- for (; k < b; k++)
- put_bh(bh[k]);
-
- return -EIO;
+ return error ? error : total + stream->buf.out_pos;
}
const struct squashfs_decompressor squashfs_xz_comp_ops = {
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index f2226afa1625..bcb881ec47f2 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -10,7 +10,7 @@
#include <linux/mutex.h>
-#include <linux/buffer_head.h>
+#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/zlib.h>
#include <linux/vmalloc.h>
@@ -50,21 +50,35 @@ static void zlib_free(void *strm)
static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
- struct buffer_head **bh, int b, int offset, int length,
+ struct bio *bio, int offset, int length,
struct squashfs_page_actor *output)
{
- int zlib_err, zlib_init = 0, k = 0;
+ struct bvec_iter_all iter_all = {};
+ struct bio_vec *bvec = bvec_init_iter_all(&iter_all);
+ int zlib_init = 0, error = 0;
z_stream *stream = strm;
stream->avail_out = PAGE_SIZE;
stream->next_out = squashfs_first_page(output);
stream->avail_in = 0;
- do {
- if (stream->avail_in == 0 && k < b) {
- int avail = min(length, msblk->devblksize - offset);
+ for (;;) {
+ int zlib_err;
+
+ if (stream->avail_in == 0) {
+ const void *data;
+ int avail;
+
+ if (!bio_next_segment(bio, &iter_all)) {
+ /* Z_STREAM_END must be reached. */
+ error = -EIO;
+ break;
+ }
+
+ avail = min(length, ((int)bvec->bv_len) - offset);
+ data = page_address(bvec->bv_page) + bvec->bv_offset;
length -= avail;
- stream->next_in = bh[k]->b_data + offset;
+ stream->next_in = data + offset;
stream->avail_in = avail;
offset = 0;
}
@@ -78,37 +92,28 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
if (!zlib_init) {
zlib_err = zlib_inflateInit(stream);
if (zlib_err != Z_OK) {
- squashfs_finish_page(output);
- goto out;
+ error = -EIO;
+ break;
}
zlib_init = 1;
}
zlib_err = zlib_inflate(stream, Z_SYNC_FLUSH);
-
- if (stream->avail_in == 0 && k < b)
- put_bh(bh[k++]);
- } while (zlib_err == Z_OK);
+ if (zlib_err == Z_STREAM_END)
+ break;
+ if (zlib_err != Z_OK) {
+ error = -EIO;
+ break;
+ }
+ }
squashfs_finish_page(output);
- if (zlib_err != Z_STREAM_END)
- goto out;
-
- zlib_err = zlib_inflateEnd(stream);
- if (zlib_err != Z_OK)
- goto out;
-
- if (k < b)
- goto out;
-
- return stream->total_out;
-
-out:
- for (; k < b; k++)
- put_bh(bh[k]);
+ if (!error)
+ if (zlib_inflateEnd(stream) != Z_OK)
+ error = -EIO;
- return -EIO;
+ return error ? error : stream->total_out;
}
const struct squashfs_decompressor squashfs_zlib_comp_ops = {
diff --git a/fs/squashfs/zstd_wrapper.c b/fs/squashfs/zstd_wrapper.c
index b448c2a1d0ed..b7cb1faa652d 100644
--- a/fs/squashfs/zstd_wrapper.c
+++ b/fs/squashfs/zstd_wrapper.c
@@ -9,7 +9,7 @@
*/
#include <linux/mutex.h>
-#include <linux/buffer_head.h>
+#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/zstd.h>
#include <linux/vmalloc.h>
@@ -59,33 +59,44 @@ static void zstd_free(void *strm)
static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm,
- struct buffer_head **bh, int b, int offset, int length,
+ struct bio *bio, int offset, int length,
struct squashfs_page_actor *output)
{
struct workspace *wksp = strm;
ZSTD_DStream *stream;
size_t total_out = 0;
- size_t zstd_err;
- int k = 0;
+ int error = 0;
ZSTD_inBuffer in_buf = { NULL, 0, 0 };
ZSTD_outBuffer out_buf = { NULL, 0, 0 };
+ struct bvec_iter_all iter_all = {};
+ struct bio_vec *bvec = bvec_init_iter_all(&iter_all);
stream = ZSTD_initDStream(wksp->window_size, wksp->mem, wksp->mem_size);
if (!stream) {
ERROR("Failed to initialize zstd decompressor\n");
- goto out;
+ return -EIO;
}
out_buf.size = PAGE_SIZE;
out_buf.dst = squashfs_first_page(output);
- do {
- if (in_buf.pos == in_buf.size && k < b) {
- int avail = min(length, msblk->devblksize - offset);
+ for (;;) {
+ size_t zstd_err;
+ if (in_buf.pos == in_buf.size) {
+ const void *data;
+ int avail;
+
+ if (!bio_next_segment(bio, &iter_all)) {
+ error = -EIO;
+ break;
+ }
+
+ avail = min(length, ((int)bvec->bv_len) - offset);
+ data = page_address(bvec->bv_page) + bvec->bv_offset;
length -= avail;
- in_buf.src = bh[k]->b_data + offset;
+ in_buf.src = data + offset;
in_buf.size = avail;
in_buf.pos = 0;
offset = 0;
@@ -97,8 +108,8 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm,
/* Shouldn't run out of pages
* before stream is done.
*/
- squashfs_finish_page(output);
- goto out;
+ error = -EIO;
+ break;
}
out_buf.pos = 0;
out_buf.size = PAGE_SIZE;
@@ -107,29 +118,20 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm,
total_out -= out_buf.pos;
zstd_err = ZSTD_decompressStream(stream, &out_buf, &in_buf);
total_out += out_buf.pos; /* add the additional data produced */
-
- if (in_buf.pos == in_buf.size && k < b)
- put_bh(bh[k++]);
- } while (zstd_err != 0 && !ZSTD_isError(zstd_err));
-
- squashfs_finish_page(output);
-
- if (ZSTD_isError(zstd_err)) {
- ERROR("zstd decompression error: %d\n",
- (int)ZSTD_getErrorCode(zstd_err));
- goto out;
+ if (zstd_err == 0)
+ break;
+
+ if (ZSTD_isError(zstd_err)) {
+ ERROR("zstd decompression error: %d\n",
+ (int)ZSTD_getErrorCode(zstd_err));
+ error = -EIO;
+ break;
+ }
}
- if (k < b)
- goto out;
-
- return (int)total_out;
-
-out:
- for (; k < b; k++)
- put_bh(bh[k]);
+ squashfs_finish_page(output);
- return -EIO;
+ return error ? error : total_out;
}
const struct squashfs_decompressor squashfs_zstd_comp_ops = {
diff --git a/fs/sync.c b/fs/sync.c
index 16c2630ee4bf..1373a610dc78 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -162,7 +162,7 @@ SYSCALL_DEFINE1(syncfs, int, fd)
{
struct fd f = fdget(fd);
struct super_block *sb;
- int ret;
+ int ret, ret2;
if (!f.file)
return -EBADF;
@@ -172,8 +172,10 @@ SYSCALL_DEFINE1(syncfs, int, fd)
ret = sync_filesystem(sb);
up_read(&sb->s_umount);
+ ret2 = errseq_check_and_advance(&sb->s_wb_err, &f.file->f_sb_err);
+
fdput(f);
- return ret;
+ return ret ? ret : ret2;
}
/**
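Together with the f_sb_err and s_wb_err fields added to struct file and struct super_block later in this series, syncfs() can now return a writeback error once per open file description instead of always succeeding. A userspace sketch (the mount point is illustrative):

#define _GNU_SOURCE                     /* for syncfs() */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/mnt/data", O_RDONLY | O_DIRECTORY);

        if (fd < 0)
                return 1;
        if (syncfs(fd) < 0)
                perror("syncfs");       /* may now report an earlier writeback error */
        close(fd);
        return 0;
}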
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 0f5a480fe264..31288d8fa2ce 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -815,7 +815,7 @@ void ubifs_dump_leb(const struct ubifs_info *c, int lnum)
pr_err("(pid %d) start dumping LEB %d\n", current->pid, lnum);
- buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
+ buf = __vmalloc(c->leb_size, GFP_NOFS);
if (!buf) {
ubifs_err(c, "cannot allocate memory for dumping LEB %d", lnum);
return;
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 29826c51883a..22bfda158f7f 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -1095,7 +1095,7 @@ static int scan_check_cb(struct ubifs_info *c,
return LPT_SCAN_CONTINUE;
}
- buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
+ buf = __vmalloc(c->leb_size, GFP_NOFS);
if (!buf)
return -ENOMEM;
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index ff5e0411cf2d..d76a19e460cd 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -1596,7 +1596,7 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
if (!dbg_is_chk_lprops(c))
return 0;
- buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
+ buf = p = __vmalloc(c->leb_size, GFP_NOFS);
if (!buf) {
ubifs_err(c, "cannot allocate memory for ltab checking");
return 0;
@@ -1845,7 +1845,7 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
void *buf, *p;
pr_err("(pid %d) start dumping LEB %d\n", current->pid, lnum);
- buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
+ buf = p = __vmalloc(c->leb_size, GFP_NOFS);
if (!buf) {
ubifs_err(c, "cannot allocate memory to dump LPT");
return;
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 283f9eb48410..2c294085ffed 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -977,7 +977,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci)
if (c->no_orphs)
return 0;
- buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
+ buf = __vmalloc(c->leb_size, GFP_NOFS);
if (!buf) {
ubifs_err(c, "cannot allocate memory to check orphans");
return 0;
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index e875bc5668ee..adaba8e8b326 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -195,10 +195,9 @@ static int udf_readpage(struct file *file, struct page *page)
return mpage_readpage(page, udf_get_block);
}
-static int udf_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+static void udf_readahead(struct readahead_control *rac)
{
- return mpage_readpages(mapping, pages, nr_pages, udf_get_block);
+ mpage_readahead(rac, udf_get_block);
}
static int udf_write_begin(struct file *file, struct address_space *mapping,
@@ -234,7 +233,7 @@ static sector_t udf_bmap(struct address_space *mapping, sector_t block)
const struct address_space_operations udf_aops = {
.readpage = udf_readpage,
- .readpages = udf_readpages,
+ .readahead = udf_readahead,
.writepage = udf_writepage,
.writepages = udf_writepages,
.write_begin = udf_write_begin,
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 1da94237a8cf..f1366475c389 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -48,7 +48,7 @@ __kmem_vmalloc(size_t size, xfs_km_flags_t flags)
if (flags & KM_NOFS)
nofs_flag = memalloc_nofs_save();
- ptr = __vmalloc(size, lflags, PAGE_KERNEL);
+ ptr = __vmalloc(size, lflags);
if (flags & KM_NOFS)
memalloc_nofs_restore(nofs_flag);
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 2834cbf1212e..b35611882ff9 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -621,14 +621,11 @@ xfs_vm_readpage(
return iomap_readpage(page, &xfs_read_iomap_ops);
}
-STATIC int
-xfs_vm_readpages(
- struct file *unused,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned nr_pages)
+STATIC void
+xfs_vm_readahead(
+ struct readahead_control *rac)
{
- return iomap_readpages(mapping, pages, nr_pages, &xfs_read_iomap_ops);
+ iomap_readahead(rac, &xfs_read_iomap_ops);
}
static int
@@ -644,7 +641,7 @@ xfs_iomap_swapfile_activate(
const struct address_space_operations xfs_address_space_operations = {
.readpage = xfs_vm_readpage,
- .readpages = xfs_vm_readpages,
+ .readahead = xfs_vm_readahead,
.writepage = xfs_vm_writepage,
.writepages = xfs_vm_writepages,
.set_page_dirty = iomap_set_page_dirty,
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 9c2fbb6bbf89..20b748f7e186 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -477,7 +477,7 @@ _xfs_buf_map_pages(
nofs_flag = memalloc_nofs_save();
do {
bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
- -1, PAGE_KERNEL);
+ -1);
if (bp->b_addr)
break;
vm_unmap_aliases();
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 1a3ecedd47c2..07bc42d62673 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -79,10 +79,9 @@ static int zonefs_readpage(struct file *unused, struct page *page)
return iomap_readpage(page, &zonefs_iomap_ops);
}
-static int zonefs_readpages(struct file *unused, struct address_space *mapping,
- struct list_head *pages, unsigned int nr_pages)
+static void zonefs_readahead(struct readahead_control *rac)
{
- return iomap_readpages(mapping, pages, nr_pages, &zonefs_iomap_ops);
+ iomap_readahead(rac, &zonefs_iomap_ops);
}
/*
@@ -129,7 +128,7 @@ static int zonefs_writepages(struct address_space *mapping,
static const struct address_space_operations zonefs_file_aops = {
.readpage = zonefs_readpage,
- .readpages = zonefs_readpages,
+ .readahead = zonefs_readahead,
.writepage = zonefs_writepage,
.writepages = zonefs_writepages,
.set_page_dirty = iomap_set_page_dirty,
diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h
index 4c74b1c1d13b..58046ddc08d0 100644
--- a/include/asm-generic/5level-fixup.h
+++ b/include/asm-generic/5level-fixup.h
@@ -17,8 +17,9 @@
((unlikely(pgd_none(*(p4d))) && __pud_alloc(mm, p4d, address)) ? \
NULL : pud_offset(p4d, address))
-#define p4d_alloc(mm, pgd, address) (pgd)
-#define p4d_offset(pgd, start) (pgd)
+#define p4d_alloc(mm, pgd, address) (pgd)
+#define p4d_alloc_track(mm, pgd, address, mask) (pgd)
+#define p4d_offset(pgd, start) (pgd)
#ifndef __ASSEMBLY__
static inline int p4d_none(p4d_t p4d)
diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h
index 822f433ac95c..40f85decc2ee 100644
--- a/include/asm-generic/hugetlb.h
+++ b/include/asm-generic/hugetlb.h
@@ -122,7 +122,7 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
#ifndef __HAVE_ARCH_HUGE_PTEP_GET
static inline pte_t huge_ptep_get(pte_t *ptep)
{
- return *ptep;
+ return READ_ONCE(*ptep);
}
#endif
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index d10be362eafa..b6616d820dd0 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -491,6 +491,10 @@ static inline int arch_unmap_one(struct mm_struct *mm,
#define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address)
#endif
+#ifndef pgprot_nx
+#define pgprot_nx(prot) (prot)
+#endif
+
#ifndef pgprot_noncached
#define pgprot_noncached(prot) (prot)
#endif
@@ -1209,6 +1213,29 @@ static inline bool arch_has_pfn_modify_check(void)
# define PAGE_KERNEL_EXEC PAGE_KERNEL
#endif
+/*
+ * Page Table Modification bits for pgtbl_mod_mask.
+ *
+ * These are used by the p?d_alloc_track*() set of functions and in the generic
+ * vmalloc/ioremap code to track at which page-table levels entries have been
+ * modified. Based on that the code can better decide when vmalloc and ioremap
+ * mapping changes need to be synchronized to other page-tables in the system.
+ */
+#define __PGTBL_PGD_MODIFIED 0
+#define __PGTBL_P4D_MODIFIED 1
+#define __PGTBL_PUD_MODIFIED 2
+#define __PGTBL_PMD_MODIFIED 3
+#define __PGTBL_PTE_MODIFIED 4
+
+#define PGTBL_PGD_MODIFIED BIT(__PGTBL_PGD_MODIFIED)
+#define PGTBL_P4D_MODIFIED BIT(__PGTBL_P4D_MODIFIED)
+#define PGTBL_PUD_MODIFIED BIT(__PGTBL_PUD_MODIFIED)
+#define PGTBL_PMD_MODIFIED BIT(__PGTBL_PMD_MODIFIED)
+#define PGTBL_PTE_MODIFIED BIT(__PGTBL_PTE_MODIFIED)
+
+/* Page-Table Modification Mask */
+typedef unsigned int pgtbl_mod_mask;
+
#endif /* !__ASSEMBLY__ */
#ifndef io_remap_pfn_range
diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h
index 0a9d042e075a..de1ccdcd5703 100644
--- a/include/drm/ttm/ttm_bo_api.h
+++ b/include/drm/ttm/ttm_bo_api.h
@@ -668,10 +668,6 @@ int ttm_bo_mmap_obj(struct vm_area_struct *vma, struct ttm_buffer_object *bo);
int ttm_bo_mmap(struct file *filp, struct vm_area_struct *vma,
struct ttm_bo_device *bdev);
-void *ttm_kmap_atomic_prot(struct page *page, pgprot_t prot);
-
-void ttm_kunmap_atomic_prot(void *addr, pgprot_t prot);
-
/**
* ttm_bo_io
*
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 7fc05929c967..bfc9283074da 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -135,8 +135,7 @@ extern int setup_arg_pages(struct linux_binprm * bprm,
extern int transfer_args_to_stack(struct linux_binprm *bprm,
unsigned long *sp_location);
extern int bprm_change_interp(const char *interp, struct linux_binprm *bprm);
-extern int copy_strings_kernel(int argc, const char *const *argv,
- struct linux_binprm *bprm);
+int copy_string_kernel(const char *arg, struct linux_binprm *bprm);
extern void set_binfmt(struct linux_binfmt *new);
extern ssize_t read_code(struct file *, unsigned long, loff_t, size_t);
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 9acf654f0b19..030a98f0c452 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -72,7 +72,7 @@ static inline int get_bitmask_order(unsigned int count)
static __always_inline unsigned long hweight_long(unsigned long w)
{
- return sizeof(w) == 4 ? hweight32(w) : hweight64(w);
+ return sizeof(w) == 4 ? hweight32(w) : hweight64((__u64)w);
}
/**
@@ -206,10 +206,7 @@ static inline int get_count_order_long(unsigned long l)
{
if (l == 0UL)
return -1;
- else if (l & (l - 1UL))
- return (int)fls_long(l);
- else
- return (int)fls_long(l) - 1;
+ return (int)fls_long(--l);
}
/**
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 15b765a181b8..22fb11e2d2e0 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -272,14 +272,6 @@ void buffer_init(void);
* inline definitions
*/
-static inline void attach_page_buffers(struct page *page,
- struct buffer_head *head)
-{
- get_page(page);
- SetPagePrivate(page);
- set_page_private(page, (unsigned long)head);
-}
-
static inline void get_bh(struct buffer_head *bh)
{
atomic_inc(&bh->b_count);
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index a0eabfbeb0e1..6fa0eea3f530 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -97,7 +97,7 @@ extern enum compact_result try_to_compact_pages(gfp_t gfp_mask,
struct page **page);
extern void reset_isolation_suitable(pg_data_t *pgdat);
extern enum compact_result compaction_suitable(struct zone *zone, int order,
- unsigned int alloc_flags, int classzone_idx);
+ unsigned int alloc_flags, int highest_zoneidx);
extern void defer_compaction(struct zone *zone, int order);
extern bool compaction_deferred(struct zone *zone, int order);
@@ -182,7 +182,7 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
extern int kcompactd_run(int nid);
extern void kcompactd_stop(int nid);
-extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx);
+extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx);
#else
static inline void reset_isolation_suitable(pg_data_t *pgdat)
@@ -190,7 +190,7 @@ static inline void reset_isolation_suitable(pg_data_t *pgdat)
}
static inline enum compact_result compaction_suitable(struct zone *zone, int order,
- int alloc_flags, int classzone_idx)
+ int alloc_flags, int highest_zoneidx)
{
return COMPACT_SKIPPED;
}
@@ -232,7 +232,8 @@ static inline void kcompactd_stop(int nid)
{
}
-static inline void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
+static inline void wakeup_kcompactd(pg_data_t *pgdat,
+ int order, int highest_zoneidx)
{
}
diff --git a/include/linux/dev_printk.h b/include/linux/dev_printk.h
index 5aad06b4ca7b..3028b644b4fb 100644
--- a/include/linux/dev_printk.h
+++ b/include/linux/dev_printk.h
@@ -109,7 +109,8 @@ void _dev_info(const struct device *dev, const char *fmt, ...)
#define dev_info(dev, fmt, ...) \
_dev_info(dev, dev_fmt(fmt), ##__VA_ARGS__)
-#if defined(CONFIG_DYNAMIC_DEBUG)
+#if defined(CONFIG_DYNAMIC_DEBUG) || \
+ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
#define dev_dbg(dev, fmt, ...) \
dynamic_dev_dbg(dev, dev_fmt(fmt), ##__VA_ARGS__)
#elif defined(DEBUG)
@@ -181,7 +182,8 @@ do { \
dev_level_ratelimited(dev_notice, dev, fmt, ##__VA_ARGS__)
#define dev_info_ratelimited(dev, fmt, ...) \
dev_level_ratelimited(dev_info, dev, fmt, ##__VA_ARGS__)
-#if defined(CONFIG_DYNAMIC_DEBUG)
+#if defined(CONFIG_DYNAMIC_DEBUG) || \
+ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
/* descriptor check is first to prevent flooding with "callbacks suppressed" */
#define dev_dbg_ratelimited(dev, fmt, ...) \
do { \
diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h
index 4cf02ecd67de..abcd5fde30eb 100644
--- a/include/linux/dynamic_debug.h
+++ b/include/linux/dynamic_debug.h
@@ -48,7 +48,7 @@ struct _ddebug {
-#if defined(CONFIG_DYNAMIC_DEBUG)
+#if defined(CONFIG_DYNAMIC_DEBUG_CORE)
int ddebug_add_module(struct _ddebug *tab, unsigned int n,
const char *modname);
extern int ddebug_remove_module(const char *mod_name);
diff --git a/include/linux/elfnote.h b/include/linux/elfnote.h
index 594d4e78654f..69b136e4dd2b 100644
--- a/include/linux/elfnote.h
+++ b/include/linux/elfnote.h
@@ -54,7 +54,7 @@
.popsection ;
#define ELFNOTE(name, type, desc) \
- ELFNOTE_START(name, type, "") \
+ ELFNOTE_START(name, type, "a") \
desc ; \
ELFNOTE_END
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f04b27264162..ac64fa29321b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -297,6 +297,7 @@ enum positive_aop_returns {
struct page;
struct address_space;
struct writeback_control;
+struct readahead_control;
/*
* Write life time hint values.
@@ -385,6 +386,7 @@ struct address_space_operations {
*/
int (*readpages)(struct file *filp, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages);
+ void (*readahead)(struct readahead_control *);
int (*write_begin)(struct file *, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
@@ -986,6 +988,7 @@ struct file {
#endif /* #ifdef CONFIG_EPOLL */
struct address_space *f_mapping;
errseq_t f_wb_err;
+ errseq_t f_sb_err; /* for syncfs */
} __randomize_layout
__attribute__((aligned(4))); /* lest something weird decides that 2 is OK */
@@ -1533,6 +1536,9 @@ struct super_block {
/* Being remounted read-only */
int s_readonly_remount;
+ /* per-sb errseq_t for reporting writeback errors via syncfs */
+ errseq_t s_wb_err;
+
/* AIO completions deferred from interrupt context */
struct workqueue_struct *s_dio_done_wq;
struct hlist_head s_pins;
@@ -2843,6 +2849,18 @@ static inline errseq_t filemap_sample_wb_err(struct address_space *mapping)
return errseq_sample(&mapping->wb_err);
}
+/**
+ * file_sample_sb_err - sample the current errseq_t to test for later errors
+ * @file: file to be sampled
+ *
+ * Grab the most current superblock-level errseq_t value for the given
+ * struct file.
+ */
+static inline errseq_t file_sample_sb_err(struct file *file)
+{
+ return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err);
+}
+
static inline int filemap_nr_thps(struct address_space *mapping)
{
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
@@ -3550,10 +3568,11 @@ int __init get_filesystem_list(char *buf);
#define __FMODE_EXEC ((__force int) FMODE_EXEC)
#define __FMODE_NONOTIFY ((__force int) FMODE_NONOTIFY)
+#define __SMUGGLED_FMODE_FLAGS (__FMODE_EXEC | __FMODE_NONOTIFY)
#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE])
#define OPEN_FMODE(flag) ((__force fmode_t)(((flag + 1) & O_ACCMODE) | \
- (flag & __FMODE_NONOTIFY)))
+ (flag & __SMUGGLED_FMODE_FLAGS)))
static inline bool is_sxid(umode_t mode)
{
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 5ab28f6c7d26..86761ed4b434 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -90,7 +90,7 @@ static inline int fsnotify_perm(struct file *file, int mask)
if (mask & MAY_OPEN) {
fsnotify_mask = FS_OPEN_PERM;
- if (file->f_flags & __FMODE_EXEC) {
+ if (file->f_mode & FMODE_EXEC) {
ret = fsnotify_file(file, FS_OPEN_EXEC_PERM);
if (ret)
@@ -264,7 +264,7 @@ static inline void fsnotify_open(struct file *file)
{
__u32 mask = FS_OPEN;
- if (file->f_flags & __FMODE_EXEC)
+ if (file->f_mode & FMODE_EXEC)
mask |= FS_OPEN_EXEC;
fsnotify_file(file, mask);
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 4aba4c86c626..67a0774e080b 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -110,6 +110,11 @@ struct vm_area_struct;
* the caller guarantees the allocation will allow more memory to be freed
* very shortly e.g. process exiting or swapping. Users either should
* be the MM or co-ordinating closely with the VM (e.g. swap over NFS).
+ * Users of this flag have to be extremely careful to not deplete the reserve
+ * completely and implement a throttling mechanism which controls the
+ * consumption of the reserve based on the amount of freed memory.
+ * Usage of a pre-allocated pool (e.g. mempool) should always be considered
+ * before using this flag.
*
* %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves.
* This takes precedence over the %__GFP_MEMALLOC flag if both are set.
@@ -307,7 +312,7 @@ struct vm_area_struct;
#define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
#define GFP_MOVABLE_SHIFT 3
-static inline int gfpflags_to_migratetype(const gfp_t gfp_flags)
+static inline int gfp_migratetype(const gfp_t gfp_flags)
{
VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);
BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE);
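
The new __GFP_MEMALLOC wording above points callers at a pre-allocated pool instead. A minimal mempool-backed reserve might look like the sketch below (struct my_object, my_cache, my_pool and the reserve size of 16 are all illustrative, not from the patch):

#include <linux/mempool.h>
#include <linux/slab.h>

struct my_object {
	struct list_head list;
	unsigned long payload;
};

static struct kmem_cache *my_cache;
static mempool_t *my_pool;

static int __init my_reserve_init(void)
{
	my_cache = KMEM_CACHE(my_object, 0);
	if (!my_cache)
		return -ENOMEM;
	/* keep 16 objects available even under reclaim pressure */
	my_pool = mempool_create_slab_pool(16, my_cache);
	if (!my_pool) {
		kmem_cache_destroy(my_cache);
		return -ENOMEM;
	}
	return 0;
}
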
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index ea5cdbd8c2c3..d6e82e3de027 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -32,8 +32,65 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size)
#include <asm/kmap_types.h>
#ifdef CONFIG_HIGHMEM
+extern void *kmap_atomic_high_prot(struct page *page, pgprot_t prot);
+extern void kunmap_atomic_high(void *kvaddr);
#include <asm/highmem.h>
+#ifndef ARCH_HAS_KMAP_FLUSH_TLB
+static inline void kmap_flush_tlb(unsigned long addr) { }
+#endif
+
+#ifndef kmap_prot
+#define kmap_prot PAGE_KERNEL
+#endif
+
+void *kmap_high(struct page *page);
+static inline void *kmap(struct page *page)
+{
+ void *addr;
+
+ might_sleep();
+ if (!PageHighMem(page))
+ addr = page_address(page);
+ else
+ addr = kmap_high(page);
+ kmap_flush_tlb((unsigned long)addr);
+ return addr;
+}
+
+void kunmap_high(struct page *page);
+
+static inline void kunmap(struct page *page)
+{
+ might_sleep();
+ if (!PageHighMem(page))
+ return;
+ kunmap_high(page);
+}
+
+/*
+ * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
+ * no global lock is needed and because the kmap code must perform a global TLB
+ * invalidation when the kmap pool wraps.
+ *
+ * However, when holding an atomic kmap it is not legal to sleep, so atomic
+ * kmaps are appropriate for short, tight code paths only.
+ *
+ * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap
+ * gives a more generic (and caching) interface. But kmap_atomic can
+ * be used in IRQ contexts, so in some (very limited) cases we need
+ * it.
+ */
+static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot)
+{
+ preempt_disable();
+ pagefault_disable();
+ if (!PageHighMem(page))
+ return page_address(page);
+ return kmap_atomic_high_prot(page, prot);
+}
+#define kmap_atomic(page) kmap_atomic_prot(page, kmap_prot)
+
/* declarations for linux/mm/highmem.c */
unsigned int nr_free_highpages(void);
extern atomic_long_t _totalhigh_pages;
@@ -77,15 +134,21 @@ static inline struct page *kmap_to_page(void *addr)
static inline unsigned long totalhigh_pages(void) { return 0UL; }
-#ifndef ARCH_HAS_KMAP
static inline void *kmap(struct page *page)
{
might_sleep();
return page_address(page);
}
+static inline void kunmap_high(struct page *page)
+{
+}
+
static inline void kunmap(struct page *page)
{
+#ifdef ARCH_HAS_FLUSH_ON_KUNMAP
+ kunmap_flush_on_unmap(page_address(page));
+#endif
}
static inline void *kmap_atomic(struct page *page)
@@ -96,16 +159,20 @@ static inline void *kmap_atomic(struct page *page)
}
#define kmap_atomic_prot(page, prot) kmap_atomic(page)
-static inline void __kunmap_atomic(void *addr)
+static inline void kunmap_atomic_high(void *addr)
{
- pagefault_enable();
- preempt_enable();
+ /*
+ * Mostly nothing to do in the CONFIG_HIGHMEM=n case as kunmap_atomic()
+ * handles re-enabling faults + preemption
+ */
+#ifdef ARCH_HAS_FLUSH_ON_KUNMAP
+ kunmap_flush_on_unmap(addr);
+#endif
}
#define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn))
#define kmap_flush_unused() do {} while(0)
-#endif
#endif /* CONFIG_HIGHMEM */
@@ -149,7 +216,9 @@ static inline void kmap_atomic_idx_pop(void)
#define kunmap_atomic(addr) \
do { \
BUILD_BUG_ON(__same_type((addr), struct page *)); \
- __kunmap_atomic(addr); \
+ kunmap_atomic_high(addr); \
+ pagefault_enable(); \
+ preempt_enable(); \
} while (0)
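
To illustrate the consolidated interfaces above, a minimal sketch of the two usage patterns (function names are illustrative): kmap() may sleep and suits longer-lived mappings, while kmap_atomic() is for short non-sleeping sections and must be paired with kunmap_atomic(), which now re-enables page faults and preemption itself.

static void example_zero_highpage(struct page *page)
{
	void *addr = kmap_atomic(page);

	memset(addr, 0, PAGE_SIZE);
	kunmap_atomic(addr);
}

static void example_copy_from_page(struct page *page, void *dst, size_t len)
{
	char *addr = kmap(page);	/* may sleep */

	memcpy(dst, addr, len);
	kunmap(page);
}
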
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 92c21c5ccc58..0cced410e0bd 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -518,8 +518,8 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
int __init __alloc_bootmem_huge_page(struct hstate *h);
int __init alloc_bootmem_huge_page(struct hstate *h);
-void __init hugetlb_bad_size(void);
void __init hugetlb_add_hstate(unsigned order);
+bool __init arch_hugetlb_valid_size(unsigned long size);
struct hstate *size_to_hstate(unsigned long size);
#ifndef HUGE_MAX_HSTATE
@@ -590,6 +590,20 @@ static inline unsigned int blocks_per_huge_page(struct hstate *h)
#include <asm/hugetlb.h>
+#ifndef is_hugepage_only_range
+static inline int is_hugepage_only_range(struct mm_struct *mm,
+ unsigned long addr, unsigned long len)
+{
+ return 0;
+}
+#define is_hugepage_only_range is_hugepage_only_range
+#endif
+
+#ifndef arch_clear_hugepage_flags
+static inline void arch_clear_hugepage_flags(struct page *page) { }
+#define arch_clear_hugepage_flags arch_clear_hugepage_flags
+#endif
+
#ifndef arch_make_huge_pte
static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
struct page *page, int writable)
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index fa08dabb933b..4d1d3c3469e9 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -155,8 +155,7 @@ loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
const struct iomap_ops *ops);
int iomap_readpage(struct page *page, const struct iomap_ops *ops);
-int iomap_readpages(struct address_space *mapping, struct list_head *pages,
- unsigned nr_pages, const struct iomap_ops *ops);
+void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops);
int iomap_set_page_dirty(struct page *page);
int iomap_is_partially_uptodate(struct page *page, unsigned long from,
unsigned long count);
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index a9b9170b5dd2..cc9a5b4593ca 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -103,6 +103,7 @@ struct resource {
#define IORESOURCE_MEM_32BIT (3<<3)
#define IORESOURCE_MEM_SHADOWABLE (1<<5) /* dup: IORESOURCE_SHADOWABLE */
#define IORESOURCE_MEM_EXPANSIONROM (1<<6)
+#define IORESOURCE_MEM_DRIVER_MANAGED (1<<7)
/* PnP I/O specific bits (IORESOURCE_BITS) */
#define IORESOURCE_IO_16BIT_ADDR (1<<0)
diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index c309f43bde45..d3c0e5b86ee4 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -3,13 +3,13 @@
#define __IPC_NAMESPACE_H__
#include <linux/err.h>
-#include <linux/idr.h>
-#include <linux/rwsem.h>
#include <linux/notifier.h>
#include <linux/nsproxy.h>
#include <linux/ns_common.h>
#include <linux/refcount.h>
#include <linux/rhashtable-types.h>
+#include <linux/rwsem.h>
+#include <linux/xarray.h>
struct user_namespace;
@@ -17,11 +17,11 @@ struct ipc_ids {
int in_use;
unsigned short seq;
struct rw_semaphore rwsem;
- struct idr ipcs_idr;
+ struct xarray ipcs;
int max_idx;
- int last_idx; /* For wrap around detection */
+ int next_idx;
#ifdef CONFIG_CHECKPOINT_RESTORE
- int next_id;
+ int restore_id;
#endif
struct rhashtable key_ht;
};
@@ -68,6 +68,8 @@ struct ipc_namespace {
struct user_namespace *user_ns;
struct ucounts *ucounts;
+ struct llist_node mnt_llist;
+
struct ns_common ns;
} __randomize_layout;
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 6bc37a731d27..017fae833d4a 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -41,7 +41,7 @@ enum memblock_flags {
/**
* struct memblock_region - represents a memory region
- * @base: physical address of the region
+ * @base: base address of the region
* @size: size of the region
* @flags: memory region attributes
* @nid: NUMA node id
@@ -50,7 +50,7 @@ struct memblock_region {
phys_addr_t base;
phys_addr_t size;
enum memblock_flags flags;
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+#ifdef CONFIG_NEED_MULTIPLE_NODES
int nid;
#endif
};
@@ -75,7 +75,7 @@ struct memblock_type {
* struct memblock - memblock allocator metadata
* @bottom_up: is bottom up direction?
* @current_limit: physical address of the current allocation limit
- * @memory: usabe memory regions
+ * @memory: usable memory regions
* @reserved: reserved memory regions
* @physmem: all physical memory
*/
@@ -215,7 +215,6 @@ static inline bool memblock_is_nomap(struct memblock_region *m)
return m->flags & MEMBLOCK_NOMAP;
}
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn,
unsigned long *end_pfn);
void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
@@ -234,7 +233,6 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
#define for_each_mem_pfn_range(i, nid, p_start, p_end, p_nid) \
for (i = -1, __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid); \
i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid))
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
@@ -275,6 +273,9 @@ void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
#define for_each_free_mem_pfn_range_in_zone_from(i, zone, p_start, p_end) \
for (; i != U64_MAX; \
__next_mem_pfn_range_in_zone(&i, zone, p_start, p_end))
+
+int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask);
+
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
/**
@@ -310,10 +311,10 @@ void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \
nid, flags, p_start, p_end, p_nid)
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
int memblock_set_node(phys_addr_t base, phys_addr_t size,
struct memblock_type *type, int nid);
+#ifdef CONFIG_NEED_MULTIPLE_NODES
static inline void memblock_set_region_node(struct memblock_region *r, int nid)
{
r->nid = nid;
@@ -332,7 +333,7 @@ static inline int memblock_get_region_node(const struct memblock_region *r)
{
return 0;
}
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+#endif /* CONFIG_NEED_MULTIPLE_NODES */
/* Flags for memblock allocation APIs */
#define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 977edd3b7bd8..0ba84f1c3f91 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -29,10 +29,7 @@ struct kmem_cache;
/* Cgroup-specific page state, on top of universal node page state */
enum memcg_stat_item {
- MEMCG_CACHE = NR_VM_NODE_STAT_ITEMS,
- MEMCG_RSS,
- MEMCG_RSS_HUGE,
- MEMCG_SWAP,
+ MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS,
MEMCG_SOCK,
/* XXX: why are these zone and not node counters? */
MEMCG_KERNEL_STACK_KB,
@@ -360,16 +357,8 @@ static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg,
enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
struct mem_cgroup *memcg);
-int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, struct mem_cgroup **memcgp,
- bool compound);
-int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, struct mem_cgroup **memcgp,
- bool compound);
-void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
- bool lrucare, bool compound);
-void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
- bool compound);
+int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask);
+
void mem_cgroup_uncharge(struct page *page);
void mem_cgroup_uncharge_list(struct list_head *page_list);
@@ -570,7 +559,7 @@ struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
void mem_cgroup_print_oom_group(struct mem_cgroup *memcg);
#ifdef CONFIG_MEMCG_SWAP
-extern int do_swap_account;
+extern bool cgroup_memory_noswap;
#endif
struct mem_cgroup *lock_page_memcg(struct page *page);
@@ -710,16 +699,17 @@ static inline void mod_lruvec_state(struct lruvec *lruvec,
static inline void __mod_lruvec_page_state(struct page *page,
enum node_stat_item idx, int val)
{
+ struct page *head = compound_head(page); /* rmap on tail pages */
pg_data_t *pgdat = page_pgdat(page);
struct lruvec *lruvec;
/* Untracked pages have no memcg, no lruvec. Update only the node */
- if (!page->mem_cgroup) {
+ if (!head->mem_cgroup) {
__mod_node_page_state(pgdat, idx, val);
return;
}
- lruvec = mem_cgroup_lruvec(page->mem_cgroup, pgdat);
+ lruvec = mem_cgroup_lruvec(head->mem_cgroup, pgdat);
__mod_lruvec_state(lruvec, idx, val);
}
@@ -849,37 +839,12 @@ static inline enum mem_cgroup_protection mem_cgroup_protected(
return MEMCG_PROT_NONE;
}
-static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask,
- struct mem_cgroup **memcgp,
- bool compound)
-{
- *memcgp = NULL;
- return 0;
-}
-
-static inline int mem_cgroup_try_charge_delay(struct page *page,
- struct mm_struct *mm,
- gfp_t gfp_mask,
- struct mem_cgroup **memcgp,
- bool compound)
+static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
+ gfp_t gfp_mask)
{
- *memcgp = NULL;
return 0;
}
-static inline void mem_cgroup_commit_charge(struct page *page,
- struct mem_cgroup *memcg,
- bool lrucare, bool compound)
-{
-}
-
-static inline void mem_cgroup_cancel_charge(struct page *page,
- struct mem_cgroup *memcg,
- bool compound)
-{
-}
-
static inline void mem_cgroup_uncharge(struct page *page)
{
}
@@ -1279,6 +1244,19 @@ static inline void dec_lruvec_page_state(struct page *page,
mod_lruvec_page_state(page, idx, -1);
}
+static inline struct lruvec *parent_lruvec(struct lruvec *lruvec)
+{
+ struct mem_cgroup *memcg;
+
+ memcg = lruvec_memcg(lruvec);
+ if (!memcg)
+ return NULL;
+ memcg = parent_mem_cgroup(memcg);
+ if (!memcg)
+ return NULL;
+ return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec));
+}
+
#ifdef CONFIG_CGROUP_WRITEBACK
struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 93d9ada74ddd..fee7fab5d706 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -314,19 +314,12 @@ static inline void pgdat_resize_init(struct pglist_data *pgdat) {}
#ifdef CONFIG_MEMORY_HOTREMOVE
-extern bool is_mem_section_removable(unsigned long pfn, unsigned long nr_pages);
extern void try_offline_node(int nid);
extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
extern int remove_memory(int nid, u64 start, u64 size);
extern void __remove_memory(int nid, u64 start, u64 size);
#else
-static inline bool is_mem_section_removable(unsigned long pfn,
- unsigned long nr_pages)
-{
- return false;
-}
-
static inline void try_offline_node(int nid) {}
static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
@@ -349,6 +342,8 @@ extern void __ref free_area_init_core_hotplug(int nid);
extern int __add_memory(int nid, u64 start, u64 size);
extern int add_memory(int nid, u64 start, u64 size);
extern int add_memory_resource(int nid, struct resource *resource);
+extern int add_memory_driver_managed(int nid, u64 start, u64 size,
+ const char *resource_name);
extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
unsigned long nr_pages, struct vmem_altmap *altmap);
extern void remove_pfn_range_from_zone(struct zone *zone,
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 5f5b2df06e61..fac75e5a723b 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -125,6 +125,7 @@ static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
}
#ifdef CONFIG_ZONE_DEVICE
+bool pfn_zone_device_reserved(unsigned long pfn);
void *memremap_pages(struct dev_pagemap *pgmap, int nid);
void memunmap_pages(struct dev_pagemap *pgmap);
void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
@@ -136,6 +137,11 @@ unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns);
unsigned long memremap_compat_align(void);
#else
+static inline bool pfn_zone_device_reserved(unsigned long pfn)
+{
+ return false;
+}
+
static inline void *devm_memremap_pages(struct device *dev,
struct dev_pagemap *pgmap)
{
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b32449a366b1..fd14d870cc9c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -501,7 +501,6 @@ struct vm_fault {
pte_t orig_pte; /* Value of PTE at the time of fault */
struct page *cow_page; /* Page handler may use for COW fault */
- struct mem_cgroup *memcg; /* Cgroup cow_page belongs to */
struct page *page; /* ->fault handlers should return a
* page here, unless VM_FAULT_NOPAGE
* is set (which is also implied by
@@ -777,7 +776,13 @@ static inline void *kvcalloc(size_t n, size_t size, gfp_t flags)
}
extern void kvfree(const void *addr);
+extern void kvfree_sensitive(const void *addr, size_t len);
+/*
+ * Mapcount of a compound page as a whole; does not include mapped sub-pages.
+ *
+ * Must be called only for compound pages or any of their tail sub-pages.
+ */
static inline int compound_mapcount(struct page *page)
{
VM_BUG_ON_PAGE(!PageCompound(page), page);
@@ -797,10 +802,16 @@ static inline void page_mapcount_reset(struct page *page)
int __page_mapcount(struct page *page);
+/*
+ * Mapcount of a 0-order page; when called on a compound sub-page, this
+ * includes compound_mapcount().
+ *
+ * The result is undefined for pages which cannot be mapped into userspace,
+ * for example SLAB pages or other special page types; see page_has_type().
+ * Such pages use this field in struct page differently.
+ */
static inline int page_mapcount(struct page *page)
{
- VM_BUG_ON_PAGE(PageSlab(page), page);
-
if (unlikely(PageCompound(page)))
return __page_mapcount(page);
return atomic_read(&page->_mapcount) + 1;
@@ -856,7 +867,7 @@ enum compound_dtor_id {
#endif
NR_COMPOUND_DTORS,
};
-extern compound_page_dtor * const compound_page_dtors[];
+extern compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS];
static inline void set_compound_page_dtor(struct page *page,
enum compound_dtor_id compound_dtor)
@@ -865,10 +876,10 @@ static inline void set_compound_page_dtor(struct page *page,
page[1].compound_dtor = compound_dtor;
}
-static inline compound_page_dtor *get_compound_page_dtor(struct page *page)
+static inline void destroy_compound_page(struct page *page)
{
VM_BUG_ON_PAGE(page[1].compound_dtor >= NR_COMPOUND_DTORS, page);
- return compound_page_dtors[page[1].compound_dtor];
+ compound_page_dtors[page[1].compound_dtor](page);
}
static inline unsigned int compound_order(struct page *page)
@@ -935,8 +946,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
return pte;
}
-vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
- struct page *page);
+vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page);
vm_fault_t finish_fault(struct vm_fault *vmf);
vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
#endif
@@ -1363,7 +1373,7 @@ static inline int cpu_pid_to_cpupid(int nid, int pid)
static inline bool cpupid_pid_unset(int cpupid)
{
- return 1;
+ return true;
}
static inline void page_cpupid_reset_last(struct page *page)
@@ -1698,6 +1708,8 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages, int *locked);
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags);
+long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
+ struct page **pages, unsigned int gup_flags);
int get_user_pages_fast(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages);
@@ -1814,6 +1826,8 @@ extern int mprotect_fixup(struct vm_area_struct *vma,
*/
int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages);
+int pin_user_pages_fast_only(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages);
/*
* per-process(per-mm_struct) statistics.
*/
@@ -2074,13 +2088,54 @@ static inline pud_t *pud_alloc(struct mm_struct *mm, p4d_t *p4d,
return (unlikely(p4d_none(*p4d)) && __pud_alloc(mm, p4d, address)) ?
NULL : pud_offset(p4d, address);
}
+
+static inline p4d_t *p4d_alloc_track(struct mm_struct *mm, pgd_t *pgd,
+ unsigned long address,
+ pgtbl_mod_mask *mod_mask)
+
+{
+ if (unlikely(pgd_none(*pgd))) {
+ if (__p4d_alloc(mm, pgd, address))
+ return NULL;
+ *mod_mask |= PGTBL_PGD_MODIFIED;
+ }
+
+ return p4d_offset(pgd, address);
+}
+
#endif /* !__ARCH_HAS_5LEVEL_HACK */
+static inline pud_t *pud_alloc_track(struct mm_struct *mm, p4d_t *p4d,
+ unsigned long address,
+ pgtbl_mod_mask *mod_mask)
+{
+ if (unlikely(p4d_none(*p4d))) {
+ if (__pud_alloc(mm, p4d, address))
+ return NULL;
+ *mod_mask |= PGTBL_P4D_MODIFIED;
+ }
+
+ return pud_offset(p4d, address);
+}
+
static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))?
NULL: pmd_offset(pud, address);
}
+
+static inline pmd_t *pmd_alloc_track(struct mm_struct *mm, pud_t *pud,
+ unsigned long address,
+ pgtbl_mod_mask *mod_mask)
+{
+ if (unlikely(pud_none(*pud))) {
+ if (__pmd_alloc(mm, pud, address))
+ return NULL;
+ *mod_mask |= PGTBL_PUD_MODIFIED;
+ }
+
+ return pmd_offset(pud, address);
+}
#endif /* CONFIG_MMU */
#if USE_SPLIT_PTE_PTLOCKS
@@ -2196,6 +2251,11 @@ static inline void pgtable_pte_page_dtor(struct page *page)
((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \
NULL: pte_offset_kernel(pmd, address))
+#define pte_alloc_kernel_track(pmd, address, mask) \
+ ((unlikely(pmd_none(*(pmd))) && \
+ (__pte_alloc_kernel(pmd) || ({*(mask)|=PGTBL_PMD_MODIFIED;0;})))?\
+ NULL: pte_offset_kernel(pmd, address))
+
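
A sketch of the intended calling pattern for the new *_alloc_track() helpers when populating kernel page tables, assuming the pgtbl_mod_mask, ARCH_PAGE_TABLE_SYNC_MASK and arch_sync_kernel_mappings() machinery introduced elsewhere in this series (example_map_kernel_pte is illustrative only):

static int example_map_kernel_pte(unsigned long addr)
{
	pgtbl_mod_mask mask = 0;
	pgd_t *pgd = pgd_offset_k(addr);
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	p4d = p4d_alloc_track(&init_mm, pgd, addr, &mask);
	if (!p4d)
		return -ENOMEM;
	pud = pud_alloc_track(&init_mm, p4d, addr, &mask);
	if (!pud)
		return -ENOMEM;
	pmd = pmd_alloc_track(&init_mm, pud, addr, &mask);
	if (!pmd)
		return -ENOMEM;
	pte = pte_alloc_kernel_track(pmd, addr, &mask);
	if (!pte)
		return -ENOMEM;

	/* set_pte_at(&init_mm, addr, pte, ...) would go here */

	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(addr, addr + PAGE_SIZE);
	return 0;
}
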
#if USE_SPLIT_PMD_PTLOCKS
static struct page *pmd_to_page(pmd_t *pmd)
@@ -2268,9 +2328,7 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud)
}
extern void __init pagecache_init(void);
-extern void free_area_init(unsigned long * zones_size);
-extern void __init free_area_init_node(int nid, unsigned long * zones_size,
- unsigned long zone_start_pfn, unsigned long *zholes_size);
+extern void __init free_area_init_memoryless_node(int nid);
extern void free_initmem(void);
/*
@@ -2340,34 +2398,26 @@ static inline unsigned long get_num_physpages(void)
return phys_pages;
}
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
/*
- * With CONFIG_HAVE_MEMBLOCK_NODE_MAP set, an architecture may initialise its
- * zones, allocate the backing mem_map and account for memory holes in a more
- * architecture independent manner. This is a substitute for creating the
- * zone_sizes[] and zholes_size[] arrays and passing them to
- * free_area_init_node()
+ * Using memblock node mappings, an architecture may initialise its
+ * zones, allocate the backing mem_map and account for memory holes in an
+ * architecture independent manner.
*
* An architecture is expected to register range of page frames backed by
* physical memory with memblock_add[_node]() before calling
- * free_area_init_nodes() passing in the PFN each zone ends at. At a basic
+ * free_area_init() passing in the PFN each zone ends at. At a basic
* usage, an architecture is expected to do something like
*
* unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn,
* max_highmem_pfn};
* for_each_valid_physical_page_range()
* memblock_add_node(base, size, nid)
- * free_area_init_nodes(max_zone_pfns);
+ * free_area_init(max_zone_pfns);
*
- * free_bootmem_with_active_regions() calls free_bootmem_node() for each
- * registered physical page range. Similarly
* sparse_memory_present_with_active_regions() calls memory_present() for
* each range when SPARSEMEM is enabled.
- *
- * See mm/page_alloc.c for more information on each function exposed by
- * CONFIG_HAVE_MEMBLOCK_NODE_MAP.
*/
-extern void free_area_init_nodes(unsigned long *max_zone_pfn);
+void free_area_init(unsigned long *max_zone_pfn);
unsigned long node_map_pfn_alignment(void);
unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn,
unsigned long end_pfn);
@@ -2376,16 +2426,10 @@ extern unsigned long absent_pages_in_range(unsigned long start_pfn,
extern void get_pfn_range_for_nid(unsigned int nid,
unsigned long *start_pfn, unsigned long *end_pfn);
extern unsigned long find_min_pfn_with_active_regions(void);
-extern void free_bootmem_with_active_regions(int nid,
- unsigned long max_low_pfn);
extern void sparse_memory_present_with_active_regions(int nid);
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
-
-#if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \
- !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID)
-static inline int __early_pfn_to_nid(unsigned long pfn,
- struct mminit_pfnnid_cache *state)
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+static inline int early_pfn_to_nid(unsigned long pfn)
{
return 0;
}
@@ -2421,6 +2465,7 @@ extern void setup_per_cpu_pageset(void);
extern int min_free_kbytes;
extern int watermark_boost_factor;
extern int watermark_scale_factor;
+extern bool arch_has_descending_max_zone_pfns(void);
/* nommu.c */
extern atomic_long_t mmap_pages_allocated;
@@ -2597,25 +2642,6 @@ extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf);
int __must_check write_one_page(struct page *page);
void task_dirty_inc(struct task_struct *tsk);
-/* readahead.c */
-#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE)
-
-int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
- pgoff_t offset, unsigned long nr_to_read);
-
-void page_cache_sync_readahead(struct address_space *mapping,
- struct file_ra_state *ra,
- struct file *filp,
- pgoff_t offset,
- unsigned long size);
-
-void page_cache_async_readahead(struct address_space *mapping,
- struct file_ra_state *ra,
- struct file *filp,
- struct page *pg,
- pgoff_t offset,
- unsigned long size);
-
extern unsigned long stack_guard_gap;
/* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
@@ -2776,6 +2802,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */
#define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */
#define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */
+#define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */
/*
* FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 4aba6c0c2ba8..ef6d3aface8a 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -240,7 +240,11 @@ static inline atomic_t *compound_pincount_ptr(struct page *page)
#define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE)
#define page_private(page) ((page)->private)
-#define set_page_private(page, v) ((page)->private = (v))
+
+static inline void set_page_private(struct page *page, unsigned long private)
+{
+ page->private = private;
+}
struct page_frag_cache {
void * va;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e90604729b7e..e8d7d0b0acf4 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -243,19 +243,6 @@ static inline bool is_active_lru(enum lru_list lru)
return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE);
}
-struct zone_reclaim_stat {
- /*
- * The pageout code in vmscan.c keeps track of how many of the
- * mem/swap backed and file backed pages are referenced.
- * The higher the rotated/scanned ratio, the more valuable
- * that cache is.
- *
- * The anon LRU stats live in [0], file LRU stats in [1]
- */
- unsigned long recent_rotated[2];
- unsigned long recent_scanned[2];
-};
-
enum lruvec_flags {
LRUVEC_CONGESTED, /* lruvec has many dirty pages
* backed by a congested BDI
@@ -264,7 +251,13 @@ enum lruvec_flags {
struct lruvec {
struct list_head lists[NR_LRU_LISTS];
- struct zone_reclaim_stat reclaim_stat;
+ /*
+ * These track the cost of reclaiming one LRU - file or anon -
+ * over the other. As the observed cost of reclaiming one LRU
+ * increases, the reclaim scan balance tips toward the other.
+ */
+ unsigned long anon_cost;
+ unsigned long file_cost;
/* Evictions & activations on the inactive file list */
atomic_long_t inactive_age;
/* Refaults at the time of last reclaim cycle */
@@ -668,9 +661,21 @@ struct deferred_split {
* per-zone basis.
*/
typedef struct pglist_data {
+ /*
+ * node_zones contains just the zones for THIS node. Not all of the
+ * zones may be populated, but it is the full list. It is referenced by
+ * this node's node_zonelists as well as other nodes' node_zonelists.
+ */
struct zone node_zones[MAX_NR_ZONES];
+
+ /*
+ * node_zonelists contains references to all zones in all nodes.
+ * Generally the first zones will be references to this node's
+ * node_zones.
+ */
struct zonelist node_zonelists[MAX_ZONELISTS];
- int nr_zones;
+
+ int nr_zones; /* number of populated zones in this node */
#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
struct page *node_mem_map;
#ifdef CONFIG_PAGE_EXTENSION
@@ -681,6 +686,8 @@ typedef struct pglist_data {
/*
* Must be held any time you expect node_start_pfn,
* node_present_pages, node_spanned_pages or nr_zones to stay constant.
+ * Also synchronizes pgdat->first_deferred_pfn during deferred page
+ * init.
*
* pgdat_resize_lock() and pgdat_resize_unlock() are provided to
* manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG
@@ -700,13 +707,13 @@ typedef struct pglist_data {
struct task_struct *kswapd; /* Protected by
mem_hotplug_begin/end() */
int kswapd_order;
- enum zone_type kswapd_classzone_idx;
+ enum zone_type kswapd_highest_zoneidx;
int kswapd_failures; /* Number of 'reclaimed == 0' runs */
#ifdef CONFIG_COMPACTION
int kcompactd_max_order;
- enum zone_type kcompactd_classzone_idx;
+ enum zone_type kcompactd_highest_zoneidx;
wait_queue_head_t kcompactd_wait;
struct task_struct *kcompactd;
#endif
@@ -784,15 +791,15 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat)
void build_all_zonelists(pg_data_t *pgdat);
void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
- enum zone_type classzone_idx);
+ enum zone_type highest_zoneidx);
bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
- int classzone_idx, unsigned int alloc_flags,
+ int highest_zoneidx, unsigned int alloc_flags,
long free_pages);
bool zone_watermark_ok(struct zone *z, unsigned int order,
- unsigned long mark, int classzone_idx,
+ unsigned long mark, int highest_zoneidx,
unsigned int alloc_flags);
bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
- unsigned long mark, int classzone_idx);
+ unsigned long mark, int highest_zoneidx);
enum memmap_context {
MEMMAP_EARLY,
MEMMAP_HOTPLUG,
@@ -877,7 +884,7 @@ extern int movable_zone;
#ifdef CONFIG_HIGHMEM
static inline int zone_movable_is_highmem(void)
{
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+#ifdef CONFIG_NEED_MULTIPLE_NODES
return movable_zone == ZONE_HIGHMEM;
#else
return (ZONE_MOVABLE - 1) == ZONE_HIGHMEM;
@@ -1080,15 +1087,6 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
#include <asm/sparsemem.h>
#endif
-#if !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) && \
- !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
-static inline unsigned long early_pfn_to_nid(unsigned long pfn)
-{
- BUILD_BUG_ON(IS_ENABLED(CONFIG_NUMA));
- return 0;
-}
-#endif
-
#ifdef CONFIG_FLATMEM
#define pfn_to_nid(pfn) (0)
#endif
diff --git a/include/linux/mpage.h b/include/linux/mpage.h
index 001f1fcf9836..f4f5e90a6844 100644
--- a/include/linux/mpage.h
+++ b/include/linux/mpage.h
@@ -13,9 +13,9 @@
#ifdef CONFIG_BLOCK
struct writeback_control;
+struct readahead_control;
-int mpage_readpages(struct address_space *mapping, struct list_head *pages,
- unsigned nr_pages, get_block_t get_block);
+void mpage_readahead(struct readahead_control *, get_block_t get_block);
int mpage_readpage(struct page *page, get_block_t get_block);
int mpage_writepages(struct address_space *mapping,
struct writeback_control *wbc, get_block_t get_block);
diff --git a/include/linux/net.h b/include/linux/net.h
index 6451425e828f..7b7b21a81736 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -264,7 +264,8 @@ do { \
net_ratelimited_function(pr_warn, fmt, ##__VA_ARGS__)
#define net_info_ratelimited(fmt, ...) \
net_ratelimited_function(pr_info, fmt, ##__VA_ARGS__)
-#if defined(CONFIG_DYNAMIC_DEBUG)
+#if defined(CONFIG_DYNAMIC_DEBUG) || \
+ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
#define net_dbg_ratelimited(fmt, ...) \
do { \
DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1a96e9c4ec36..5b364a2e0006 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4942,7 +4942,8 @@ do { \
#define MODULE_ALIAS_NETDEV(device) \
MODULE_ALIAS("netdev-" device)
-#if defined(CONFIG_DYNAMIC_DEBUG)
+#if defined(CONFIG_DYNAMIC_DEBUG) || \
+ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
#define netdev_dbg(__dev, format, args...) \
do { \
dynamic_netdev_dbg(__dev, format, ##args); \
@@ -5012,7 +5013,8 @@ do { \
#define netif_info(priv, type, dev, fmt, args...) \
netif_level(info, priv, type, dev, fmt, ##args)
-#if defined(CONFIG_DYNAMIC_DEBUG)
+#if defined(CONFIG_DYNAMIC_DEBUG) || \
+ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
#define netif_dbg(priv, type, netdev, format, args...) \
do { \
if (netif_msg_##type(priv)) \
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 750c7f395ca9..0c2a1d915453 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -84,6 +84,7 @@ static inline void reset_hung_task_detector(void) { }
#if defined(CONFIG_HARDLOCKUP_DETECTOR)
extern void hardlockup_detector_disable(void);
extern unsigned int hardlockup_panic;
+extern atomic_t hardlockup_detected;
#else
static inline void hardlockup_detector_disable(void) {}
#endif
diff --git a/include/linux/padata.h b/include/linux/padata.h
index 693cae9bfe66..7302efff5e65 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -4,6 +4,9 @@
*
* Copyright (C) 2008, 2009 secunet Security Networks AG
* Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * Copyright (c) 2020 Oracle and/or its affiliates.
+ * Author: Daniel Jordan <daniel.m.jordan@oracle.com>
*/
#ifndef PADATA_H
@@ -24,7 +27,6 @@
* @list: List entry, to attach to the padata lists.
* @pd: Pointer to the internal control structure.
 * @cb_cpu: Callback cpu for serialization.
- * @cpu: Cpu for parallelization.
* @seq_nr: Sequence number of the parallelized data object.
* @info: Used to pass information from the parallel to the serial function.
* @parallel: Parallel execution function.
@@ -34,7 +36,6 @@ struct padata_priv {
struct list_head list;
struct parallel_data *pd;
int cb_cpu;
- int cpu;
unsigned int seq_nr;
int info;
void (*parallel)(struct padata_priv *padata);
@@ -68,15 +69,11 @@ struct padata_serial_queue {
/**
* struct padata_parallel_queue - The percpu padata parallel queue
*
- * @parallel: List to wait for parallelization.
* @reorder: List to wait for reordering after parallel processing.
- * @work: work struct for parallelization.
* @num_obj: Number of objects that are processed by this cpu.
*/
struct padata_parallel_queue {
- struct padata_list parallel;
struct padata_list reorder;
- struct work_struct work;
atomic_t num_obj;
};
@@ -111,7 +108,7 @@ struct parallel_data {
struct padata_parallel_queue __percpu *pqueue;
struct padata_serial_queue __percpu *squeue;
atomic_t refcnt;
- atomic_t seq_nr;
+ unsigned int seq_nr;
unsigned int processed;
int cpu;
struct padata_cpumask cpumask;
@@ -137,6 +134,31 @@ struct padata_shell {
};
/**
+ * struct padata_mt_job - represents one multithreaded job
+ *
+ * @thread_fn: Called for each chunk of work that a padata thread does.
+ * @fn_arg: The thread function argument.
+ * @start: The start of the job (units are job-specific).
+ * @size: size of this node's work (units are job-specific).
+ * @align: Ranges passed to the thread function fall on this boundary, with the
+ * possible exceptions of the beginning and end of the job.
+ * @min_chunk: The minimum chunk size in job-specific units. This allows
+ * the client to communicate the minimum amount of work that's
+ * appropriate for one worker thread to do at once.
+ * @max_threads: Max threads to use for the job; the actual number may be less
+ * depending on task size and minimum chunk size.
+ */
+struct padata_mt_job {
+ void (*thread_fn)(unsigned long start, unsigned long end, void *arg);
+ void *fn_arg;
+ unsigned long start;
+ unsigned long size;
+ unsigned long align;
+ unsigned long min_chunk;
+ int max_threads;
+};
+
+/**
* struct padata_instance - The overall control structure.
*
* @cpu_online_node: Linkage for CPU online callback.
@@ -166,6 +188,12 @@ struct padata_instance {
#define PADATA_INVALID 4
};
+#ifdef CONFIG_PADATA
+extern void __init padata_init(void);
+#else
+static inline void __init padata_init(void) {}
+#endif
+
extern struct padata_instance *padata_alloc_possible(const char *name);
extern void padata_free(struct padata_instance *pinst);
extern struct padata_shell *padata_alloc_shell(struct padata_instance *pinst);
@@ -173,6 +201,7 @@ extern void padata_free_shell(struct padata_shell *ps);
extern int padata_do_parallel(struct padata_shell *ps,
struct padata_priv *padata, int *cb_cpu);
extern void padata_do_serial(struct padata_priv *padata);
+extern void __init padata_do_multithreaded(struct padata_mt_job *job);
extern int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
cpumask_var_t cpumask);
extern int padata_start(struct padata_instance *pinst);
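
A minimal sketch of how an early-boot caller might use the new multithreaded job interface; the thread function, the work units, and the chunk sizes below are illustrative rather than taken from the patch:

static void __init example_init_chunk(unsigned long start, unsigned long end,
				      void *arg)
{
	/* initialise items in [start, end); arg carries shared state */
}

static void __init example_init_all(unsigned long start, unsigned long nr_items)
{
	struct padata_mt_job job = {
		.thread_fn	= example_init_chunk,
		.fn_arg		= NULL,
		.start		= start,
		.size		= nr_items,
		.align		= 1,	/* no alignment constraint */
		.min_chunk	= 1024,	/* don't wake a thread for less */
		.max_threads	= num_online_cpus(),
	};

	padata_do_multithreaded(&job);
}
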
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 8b65420410ee..68c6cf612711 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -51,7 +51,10 @@ static inline void mapping_set_error(struct address_space *mapping, int error)
return;
/* Record in wb_err for checkers using errseq_t based tracking */
- filemap_set_wb_err(mapping, error);
+ __filemap_set_wb_err(mapping, error);
+
+ /* Record it in superblock */
+ errseq_set(&mapping->host->i_sb->s_wb_err, error);
/* Record it in flags for now, for legacy callers */
if (error == -ENOSPC)
@@ -205,6 +208,43 @@ static inline int page_cache_add_speculative(struct page *page, int count)
return __page_cache_add_speculative(page, count);
}
+/**
+ * attach_page_private - Attach private data to a page.
+ * @page: Page to attach data to.
+ * @data: Data to attach to page.
+ *
+ * Attaching private data to a page increments the page's reference count.
+ * The data must be detached before the page will be freed.
+ */
+static inline void attach_page_private(struct page *page, void *data)
+{
+ get_page(page);
+ set_page_private(page, (unsigned long)data);
+ SetPagePrivate(page);
+}
+
+/**
+ * detach_page_private - Detach private data from a page.
+ * @page: Page to detach data from.
+ *
+ * Removes the data that was previously attached to the page and decrements
+ * the refcount on the page.
+ *
+ * Return: Data that was attached to the page.
+ */
+static inline void *detach_page_private(struct page *page)
+{
+ void *data = (void *)page_private(page);
+
+ if (!PagePrivate(page))
+ return NULL;
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ put_page(page);
+
+ return data;
+}
+
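
A sketch of the pairing these helpers are meant to enforce in a filesystem: attach per-page state when it is created, detach it when the page is released (struct example_state and the function names are illustrative only):

struct example_state {
	unsigned long flags;
};

static void example_attach(struct page *page, struct example_state *state)
{
	/* takes a page reference on behalf of the attached data */
	attach_page_private(page, state);
}

static int example_releasepage(struct page *page, gfp_t gfp)
{
	struct example_state *state = detach_page_private(page);

	kfree(state);	/* detach dropped the page reference */
	return 1;
}
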
#ifdef CONFIG_NUMA
extern struct page *__page_cache_alloc(gfp_t gfp);
#else
@@ -682,6 +722,17 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
void delete_from_page_cache_batch(struct address_space *mapping,
struct pagevec *pvec);
+#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE)
+
+void page_cache_sync_readahead(struct address_space *, struct file_ra_state *,
+ struct file *, pgoff_t index, unsigned long req_count);
+void page_cache_async_readahead(struct address_space *, struct file_ra_state *,
+ struct file *, struct page *, pgoff_t index,
+ unsigned long req_count);
+void page_cache_readahead_unbounded(struct address_space *, struct file *,
+ pgoff_t index, unsigned long nr_to_read,
+ unsigned long lookahead_count);
+
/*
* Like add_to_page_cache_locked, but used to add newly allocated pages:
* the page is new, so we can just run __SetPageLocked() against it.
@@ -698,6 +749,146 @@ static inline int add_to_page_cache(struct page *page,
return error;
}
+/**
+ * struct readahead_control - Describes a readahead request.
+ *
+ * A readahead request is for consecutive pages. Filesystems which
+ * implement the ->readahead method should call readahead_page() or
+ * readahead_page_batch() in a loop and attempt to start I/O against
+ * each page in the request.
+ *
+ * Most of the fields in this struct are private and should be accessed
+ * by the functions below.
+ *
+ * @file: The file, used primarily by network filesystems for authentication.
+ * May be NULL if invoked internally by the filesystem.
+ * @mapping: Readahead this filesystem object.
+ */
+struct readahead_control {
+ struct file *file;
+ struct address_space *mapping;
+/* private: use the readahead_* accessors instead */
+ pgoff_t _index;
+ unsigned int _nr_pages;
+ unsigned int _batch_count;
+};
+
+/**
+ * readahead_page - Get the next page to read.
+ * @rac: The current readahead request.
+ *
+ * Context: The page is locked and has an elevated refcount. The caller
+ * should decrease the refcount once the page has been submitted for I/O
+ * and unlock the page once all I/O to that page has completed.
+ * Return: A pointer to the next page, or %NULL if we are done.
+ */
+static inline struct page *readahead_page(struct readahead_control *rac)
+{
+ struct page *page;
+
+ BUG_ON(rac->_batch_count > rac->_nr_pages);
+ rac->_nr_pages -= rac->_batch_count;
+ rac->_index += rac->_batch_count;
+
+ if (!rac->_nr_pages) {
+ rac->_batch_count = 0;
+ return NULL;
+ }
+
+ page = xa_load(&rac->mapping->i_pages, rac->_index);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ rac->_batch_count = hpage_nr_pages(page);
+
+ return page;
+}
+
+static inline unsigned int __readahead_batch(struct readahead_control *rac,
+ struct page **array, unsigned int array_sz)
+{
+ unsigned int i = 0;
+ XA_STATE(xas, &rac->mapping->i_pages, 0);
+ struct page *page;
+
+ BUG_ON(rac->_batch_count > rac->_nr_pages);
+ rac->_nr_pages -= rac->_batch_count;
+ rac->_index += rac->_batch_count;
+ rac->_batch_count = 0;
+
+ xas_set(&xas, rac->_index);
+ rcu_read_lock();
+ xas_for_each(&xas, page, rac->_index + rac->_nr_pages - 1) {
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ array[i++] = page;
+ rac->_batch_count += hpage_nr_pages(page);
+
+ /*
+ * The page cache isn't using multi-index entries yet,
+ * so the xas cursor needs to be manually moved to the
+ * next index. This can be removed once the page cache
+ * is converted.
+ */
+ if (PageHead(page))
+ xas_set(&xas, rac->_index + rac->_batch_count);
+
+ if (i == array_sz)
+ break;
+ }
+ rcu_read_unlock();
+
+ return i;
+}
+
+/**
+ * readahead_page_batch - Get a batch of pages to read.
+ * @rac: The current readahead request.
+ * @array: An array of pointers to struct page.
+ *
+ * Context: The pages are locked and have an elevated refcount. The caller
+ * should decrease the refcount once the page has been submitted for I/O
+ * and unlock the page once all I/O to that page has completed.
+ * Return: The number of pages placed in the array. 0 indicates the request
+ * is complete.
+ */
+#define readahead_page_batch(rac, array) \
+ __readahead_batch(rac, array, ARRAY_SIZE(array))
+
+/**
+ * readahead_pos - The byte offset into the file of this readahead request.
+ * @rac: The readahead request.
+ */
+static inline loff_t readahead_pos(struct readahead_control *rac)
+{
+ return (loff_t)rac->_index * PAGE_SIZE;
+}
+
+/**
+ * readahead_length - The number of bytes in this readahead request.
+ * @rac: The readahead request.
+ */
+static inline loff_t readahead_length(struct readahead_control *rac)
+{
+ return (loff_t)rac->_nr_pages * PAGE_SIZE;
+}
+
+/**
+ * readahead_index - The index of the first page in this readahead request.
+ * @rac: The readahead request.
+ */
+static inline pgoff_t readahead_index(struct readahead_control *rac)
+{
+ return rac->_index;
+}
+
+/**
+ * readahead_count - The number of pages in this readahead request.
+ * @rac: The readahead request.
+ */
+static inline unsigned int readahead_count(struct readahead_control *rac)
+{
+ return rac->_nr_pages;
+}
+
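
Taken together, an ->readahead implementation is expected to have roughly the shape below; example_start_read() stands in for whatever mechanism the filesystem uses to start I/O on one locked page and is purely illustrative:

static void example_readahead(struct readahead_control *rac)
{
	struct inode *inode = rac->mapping->host;
	struct page *page;

	while ((page = readahead_page(rac))) {
		/* start read I/O; completion unlocks the page and drops
		 * the reference taken by the readahead code */
		example_start_read(inode, page, readahead_pos(rac));
	}
}
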
static inline unsigned long dir_pages(struct inode *inode)
{
return (unsigned long)(inode->i_size + PAGE_SIZE - 1) >>
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 3cc2f178bf06..fc8f03c54543 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -399,7 +399,8 @@ extern int kptr_restrict;
/* If you are writing a driver, please use dev_dbg instead */
-#if defined(CONFIG_DYNAMIC_DEBUG)
+#if defined(CONFIG_DYNAMIC_DEBUG) || \
+ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
#include <linux/dynamic_debug.h>
/**
@@ -535,7 +536,8 @@ extern int kptr_restrict;
#endif
/* If you are writing a driver, please use dev_dbg instead */
-#if defined(CONFIG_DYNAMIC_DEBUG)
+#if defined(CONFIG_DYNAMIC_DEBUG) || \
+ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
/* descriptor check is first to prevent flooding with "callbacks suppressed" */
#define pr_debug_ratelimited(fmt, ...) \
do { \
@@ -582,7 +584,8 @@ static inline void print_hex_dump_bytes(const char *prefix_str, int prefix_type,
#endif
-#if defined(CONFIG_DYNAMIC_DEBUG)
+#if defined(CONFIG_DYNAMIC_DEBUG) || \
+ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
#define print_hex_dump_debug(prefix_str, prefix_type, rowsize, \
groupsize, buf, len, ascii) \
dynamic_hex_dump(prefix_str, prefix_type, rowsize, \
diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h
index a67065c403c3..2a3a95586425 100644
--- a/include/linux/ptdump.h
+++ b/include/linux/ptdump.h
@@ -13,7 +13,8 @@ struct ptdump_range {
struct ptdump_state {
/* level is 0:PGD to 4:PTE, or -1 if unknown */
void (*note_page)(struct ptdump_state *st, unsigned long addr,
- int level, unsigned long val);
+ int level, u64 val);
+ void (*effective_prot)(struct ptdump_state *st, int level, u64 val);
const struct ptdump_range *range;
};
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f38d62c4632c..46420ae8de56 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -920,6 +920,7 @@ struct task_struct {
#ifdef CONFIG_DETECT_HUNG_TASK
unsigned long last_switch_count;
unsigned long last_switch_time;
+ unsigned long killed_time;
#endif
/* Filesystem information: */
struct fs_struct *fs;
@@ -1250,6 +1251,9 @@ struct task_struct {
/* KCOV sequence number: */
int kcov_sequence;
+
+ /* Collect coverage from softirq context: */
+ unsigned int kcov_softirq;
#endif
#ifdef CONFIG_MEMCG
@@ -1308,6 +1312,13 @@ struct task_struct {
struct callback_head mce_kill_me;
#endif
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ unsigned long getblk_stamp;
+ unsigned int getblk_executed;
+ unsigned int getblk_bh_count;
+ unsigned long getblk_bh_state;
+#endif
+
/*
* New fields for task_struct should be added above here, so that
* they are included in the randomized portion of task_struct.
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 1672cf6f7614..813614d4b71f 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -145,6 +145,25 @@ void *__seq_open_private(struct file *, const struct seq_operations *, int);
int seq_open_private(struct file *, const struct seq_operations *, int);
int seq_release_private(struct inode *, struct file *);
+#define DEFINE_SEQ_ATTRIBUTE(__name) \
+static int __name ## _open(struct inode *inode, struct file *file) \
+{ \
+ int ret = seq_open(file, &__name ## _sops); \
+ if (!ret && inode->i_private) { \
+ struct seq_file *seq_f = file->private_data; \
+ seq_f->private = inode->i_private; \
+ } \
+ return ret; \
+} \
+ \
+static const struct file_operations __name ## _fops = { \
+ .owner = THIS_MODULE, \
+ .open = __name ## _open, \
+ .read = seq_read, \
+ .llseek = seq_lseek, \
+ .release = seq_release, \
+}
+
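
The new macro expects a <name>_sops seq_operations table and generates <name>_open plus <name>_fops. A minimal usage sketch (all "example" names are illustrative):

static void *example_start(struct seq_file *m, loff_t *pos)
{
	return *pos ? NULL : SEQ_START_TOKEN;
}

static void *example_next(struct seq_file *m, void *v, loff_t *pos)
{
	++*pos;
	return NULL;
}

static void example_stop(struct seq_file *m, void *v)
{
}

static int example_show(struct seq_file *m, void *v)
{
	seq_puts(m, "hello\n");
	return 0;
}

static const struct seq_operations example_sops = {
	.start	= example_start,
	.next	= example_next,
	.stop	= example_stop,
	.show	= example_show,
};

DEFINE_SEQ_ATTRIBUTE(example);

/* later: debugfs_create_file("example", 0444, parent, NULL, &example_fops); */
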
#define DEFINE_SHOW_ATTRIBUTE(__name) \
static int __name ## _open(struct inode *inode, struct file *file) \
{ \
diff --git a/include/linux/string.h b/include/linux/string.h
index 6dfbb2efa815..9b7a0632e87a 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -272,6 +272,31 @@ void __read_overflow3(void) __compiletime_error("detected read beyond size of ob
void __write_overflow(void) __compiletime_error("detected write beyond size of object passed as 1st parameter");
#if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE)
+
+#ifdef CONFIG_KASAN
+extern void *__underlying_memchr(const void *p, int c, __kernel_size_t size) __RENAME(memchr);
+extern int __underlying_memcmp(const void *p, const void *q, __kernel_size_t size) __RENAME(memcmp);
+extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(memcpy);
+extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(memmove);
+extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(memset);
+extern char *__underlying_strcat(char *p, const char *q) __RENAME(strcat);
+extern char *__underlying_strcpy(char *p, const char *q) __RENAME(strcpy);
+extern __kernel_size_t __underlying_strlen(const char *p) __RENAME(strlen);
+extern char *__underlying_strncat(char *p, const char *q, __kernel_size_t count) __RENAME(strncat);
+extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __RENAME(strncpy);
+#else
+#define __underlying_memchr __builtin_memchr
+#define __underlying_memcmp __builtin_memcmp
+#define __underlying_memcpy __builtin_memcpy
+#define __underlying_memmove __builtin_memmove
+#define __underlying_memset __builtin_memset
+#define __underlying_strcat __builtin_strcat
+#define __underlying_strcpy __builtin_strcpy
+#define __underlying_strlen __builtin_strlen
+#define __underlying_strncat __builtin_strncat
+#define __underlying_strncpy __builtin_strncpy
+#endif
+
__FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size)
{
size_t p_size = __builtin_object_size(p, 0);
@@ -279,14 +304,14 @@ __FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size)
__write_overflow();
if (p_size < size)
fortify_panic(__func__);
- return __builtin_strncpy(p, q, size);
+ return __underlying_strncpy(p, q, size);
}
__FORTIFY_INLINE char *strcat(char *p, const char *q)
{
size_t p_size = __builtin_object_size(p, 0);
if (p_size == (size_t)-1)
- return __builtin_strcat(p, q);
+ return __underlying_strcat(p, q);
if (strlcat(p, q, p_size) >= p_size)
fortify_panic(__func__);
return p;
@@ -300,7 +325,7 @@ __FORTIFY_INLINE __kernel_size_t strlen(const char *p)
/* Work around gcc excess stack consumption issue */
if (p_size == (size_t)-1 ||
(__builtin_constant_p(p[p_size - 1]) && p[p_size - 1] == '\0'))
- return __builtin_strlen(p);
+ return __underlying_strlen(p);
ret = strnlen(p, p_size);
if (p_size <= ret)
fortify_panic(__func__);
@@ -333,7 +358,7 @@ __FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size)
__write_overflow();
if (len >= p_size)
fortify_panic(__func__);
- __builtin_memcpy(p, q, len);
+ __underlying_memcpy(p, q, len);
p[len] = '\0';
}
return ret;
@@ -346,12 +371,12 @@ __FORTIFY_INLINE char *strncat(char *p, const char *q, __kernel_size_t count)
size_t p_size = __builtin_object_size(p, 0);
size_t q_size = __builtin_object_size(q, 0);
if (p_size == (size_t)-1 && q_size == (size_t)-1)
- return __builtin_strncat(p, q, count);
+ return __underlying_strncat(p, q, count);
p_len = strlen(p);
copy_len = strnlen(q, count);
if (p_size < p_len + copy_len + 1)
fortify_panic(__func__);
- __builtin_memcpy(p + p_len, q, copy_len);
+ __underlying_memcpy(p + p_len, q, copy_len);
p[p_len + copy_len] = '\0';
return p;
}
@@ -363,7 +388,7 @@ __FORTIFY_INLINE void *memset(void *p, int c, __kernel_size_t size)
__write_overflow();
if (p_size < size)
fortify_panic(__func__);
- return __builtin_memset(p, c, size);
+ return __underlying_memset(p, c, size);
}
__FORTIFY_INLINE void *memcpy(void *p, const void *q, __kernel_size_t size)
@@ -378,7 +403,7 @@ __FORTIFY_INLINE void *memcpy(void *p, const void *q, __kernel_size_t size)
}
if (p_size < size || q_size < size)
fortify_panic(__func__);
- return __builtin_memcpy(p, q, size);
+ return __underlying_memcpy(p, q, size);
}
__FORTIFY_INLINE void *memmove(void *p, const void *q, __kernel_size_t size)
@@ -393,7 +418,7 @@ __FORTIFY_INLINE void *memmove(void *p, const void *q, __kernel_size_t size)
}
if (p_size < size || q_size < size)
fortify_panic(__func__);
- return __builtin_memmove(p, q, size);
+ return __underlying_memmove(p, q, size);
}
extern void *__real_memscan(void *, int, __kernel_size_t) __RENAME(memscan);
@@ -419,7 +444,7 @@ __FORTIFY_INLINE int memcmp(const void *p, const void *q, __kernel_size_t size)
}
if (p_size < size || q_size < size)
fortify_panic(__func__);
- return __builtin_memcmp(p, q, size);
+ return __underlying_memcmp(p, q, size);
}
__FORTIFY_INLINE void *memchr(const void *p, int c, __kernel_size_t size)
@@ -429,7 +454,7 @@ __FORTIFY_INLINE void *memchr(const void *p, int c, __kernel_size_t size)
__read_overflow();
if (p_size < size)
fortify_panic(__func__);
- return __builtin_memchr(p, c, size);
+ return __underlying_memchr(p, c, size);
}
void *__real_memchr_inv(const void *s, int c, size_t n) __RENAME(memchr_inv);
@@ -460,11 +485,22 @@ __FORTIFY_INLINE char *strcpy(char *p, const char *q)
size_t p_size = __builtin_object_size(p, 0);
size_t q_size = __builtin_object_size(q, 0);
if (p_size == (size_t)-1 && q_size == (size_t)-1)
- return __builtin_strcpy(p, q);
+ return __underlying_strcpy(p, q);
memcpy(p, q, strlen(q) + 1);
return p;
}
+/* Don't use these outside the FORTIFY_SOURCE implementation */
+#undef __underlying_memchr
+#undef __underlying_memcmp
+#undef __underlying_memcpy
+#undef __underlying_memmove
+#undef __underlying_memset
+#undef __underlying_strcat
+#undef __underlying_strcpy
+#undef __underlying_strlen
+#undef __underlying_strncat
+#undef __underlying_strncpy
#endif
/**
diff --git a/include/linux/swap.h b/include/linux/swap.h
index e1bbf7a16b27..d9e362c7439c 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -183,12 +183,17 @@ enum {
#define SWAP_CLUSTER_MAX 32UL
#define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX
-#define SWAP_MAP_MAX 0x3e /* Max duplication count, in first swap_map */
-#define SWAP_MAP_BAD 0x3f /* Note pageblock is bad, in first swap_map */
+/* Bit flag in swap_map */
#define SWAP_HAS_CACHE 0x40 /* Flag page is cached, in first swap_map */
-#define SWAP_CONT_MAX 0x7f /* Max count, in each swap_map continuation */
-#define COUNT_CONTINUED 0x80 /* See swap_map continuation for full count */
-#define SWAP_MAP_SHMEM 0xbf /* Owned by shmem/tmpfs, in first swap_map */
+#define COUNT_CONTINUED 0x80 /* Flag swap_map continuation for full count */
+
+/* Special value in first swap_map */
+#define SWAP_MAP_MAX 0x3e /* Max count */
+#define SWAP_MAP_BAD 0x3f /* Note page is bad */
+#define SWAP_MAP_SHMEM 0xbf /* Owned by shmem/tmpfs */
+
+/* Special value in each swap_map continuation */
+#define SWAP_CONT_MAX 0x7f /* Max count */
/*
* We use this to track usage of a cluster. A cluster is a block of swap disk
@@ -247,6 +252,7 @@ struct swap_info_struct {
unsigned int inuse_pages; /* number of those currently in use */
unsigned int cluster_next; /* likely index for next allocation */
unsigned int cluster_nr; /* countdown to next cluster search */
+ unsigned int __percpu *cluster_next_cpu; /* percpu index for next allocation */
struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
struct rb_root swap_extent_root;/* root of the swap extent rbtree */
struct block_device *bdev; /* swap device or bdev of swap file */
@@ -328,9 +334,10 @@ extern unsigned long nr_free_pagecache_pages(void);
/* linux/mm/swap.c */
+extern void lru_note_cost(struct lruvec *lruvec, bool file,
+ unsigned int nr_pages);
+extern void lru_note_cost_page(struct page *);
extern void lru_cache_add(struct page *);
-extern void lru_cache_add_anon(struct page *page);
-extern void lru_cache_add_file(struct page *page);
extern void lru_add_page_tail(struct page *page, struct page *page_tail,
struct lruvec *lruvec, struct list_head *head);
extern void activate_page(struct page *);
@@ -645,11 +652,9 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
#endif
#if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
-extern void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
- gfp_t gfp_mask);
+extern void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask);
#else
-static inline void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg,
- int node, gfp_t gfp_mask)
+static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
{
}
#endif
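The regrouped swap_map constants above read as a pair of flag bits plus a count field with a few reserved values. A minimal decoding sketch, assuming the usual first-level swap_map semantics; decode_swap_map() is an invented helper for illustration only, not code from this series:

#include <linux/printk.h>
#include <linux/swap.h>
#include <linux/types.h>

/* Illustrative only: interpret one first-level swap_map byte. */
static void decode_swap_map(unsigned char ent)
{
	bool has_cache = ent & SWAP_HAS_CACHE;		/* page also sits in the swap cache */
	unsigned char count = ent & ~SWAP_HAS_CACHE;	/* remaining bits hold the count */

	if (count == SWAP_MAP_BAD)
		pr_info("bad swap slot\n");
	else if (count == SWAP_MAP_SHMEM)
		pr_info("slot owned by shmem/tmpfs\n");
	else if (count & COUNT_CONTINUED)
		pr_info("count above SWAP_MAP_MAX, remainder in continuation pages (cache=%d)\n",
			has_cache);
	else
		pr_info("count=%u cache=%d\n", count, has_cache);
}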
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index ffef0f279747..2b7a7b3d5444 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -35,6 +35,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
PGSCAN_KSWAPD,
PGSCAN_DIRECT,
PGSCAN_DIRECT_THROTTLE,
+ PGSCAN_ANON,
+ PGSCAN_FILE,
+ PGSTEAL_ANON,
+ PGSTEAL_FILE,
#ifdef CONFIG_NUMA
PGSCAN_ZONE_RECLAIM_FAILED,
#endif
@@ -91,6 +95,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
THP_ZERO_PAGE_ALLOC_FAILED,
THP_SWPOUT,
THP_SWPOUT_FALLBACK,
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+ THP_PMD_MIGRATION_SUCCESS,
+ THP_PMD_MIGRATION_FAILURE,
+#endif
#endif
#ifdef CONFIG_MEMORY_BALLOON
BALLOON_INFLATE,
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index a95d3cc74d79..48bb681e6c2a 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -88,8 +88,7 @@ struct vmap_area {
* Highlevel APIs for driver use
*/
extern void vm_unmap_ram(const void *mem, unsigned int count);
-extern void *vm_map_ram(struct page **pages, unsigned int count,
- int node, pgprot_t prot);
+extern void *vm_map_ram(struct page **pages, unsigned int count, int node);
extern void vm_unmap_aliases(void);
#ifdef CONFIG_MMU
@@ -107,26 +106,16 @@ extern void *vzalloc(unsigned long size);
extern void *vmalloc_user(unsigned long size);
extern void *vmalloc_node(unsigned long size, int node);
extern void *vzalloc_node(unsigned long size, int node);
-extern void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags);
extern void *vmalloc_exec(unsigned long size);
extern void *vmalloc_32(unsigned long size);
extern void *vmalloc_32_user(unsigned long size);
-extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot);
+extern void *__vmalloc(unsigned long size, gfp_t gfp_mask);
extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
unsigned long start, unsigned long end, gfp_t gfp_mask,
pgprot_t prot, unsigned long vm_flags, int node,
const void *caller);
-#ifndef CONFIG_MMU
-extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags);
-static inline void *__vmalloc_node_flags_caller(unsigned long size, int node,
- gfp_t flags, void *caller)
-{
- return __vmalloc_node_flags(size, node, flags);
-}
-#else
-extern void *__vmalloc_node_flags_caller(unsigned long size,
- int node, gfp_t flags, void *caller);
-#endif
+void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
+ int node, const void *caller);
extern void vfree(const void *addr);
extern void vfree_atomic(const void *addr);
@@ -141,8 +130,22 @@ extern int remap_vmalloc_range_partial(struct vm_area_struct *vma,
extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
unsigned long pgoff);
-void vmalloc_sync_mappings(void);
-void vmalloc_sync_unmappings(void);
+
+/*
+ * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values
+ * and let generic vmalloc and ioremap code know when arch_sync_kernel_mappings()
+ * needs to be called.
+ */
+#ifndef ARCH_PAGE_TABLE_SYNC_MASK
+#define ARCH_PAGE_TABLE_SYNC_MASK 0
+#endif
+
+/*
+ * There is no default implementation for arch_sync_kernel_mappings(); the
+ * compiler is relied upon to optimize the calls away when
+ * ARCH_PAGE_TABLE_SYNC_MASK is 0.
+ */
+void arch_sync_kernel_mappings(unsigned long start, unsigned long end);
/*
* Lowlevel-APIs (not for driver use!)
@@ -161,8 +164,6 @@ static inline size_t get_vm_area_size(const struct vm_struct *area)
extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags);
extern struct vm_struct *get_vm_area_caller(unsigned long size,
unsigned long flags, const void *caller);
-extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
- unsigned long start, unsigned long end);
extern struct vm_struct *__get_vm_area_caller(unsigned long size,
unsigned long flags,
unsigned long start, unsigned long end,
@@ -170,11 +171,11 @@ extern struct vm_struct *__get_vm_area_caller(unsigned long size,
extern struct vm_struct *remove_vm_area(const void *addr);
extern struct vm_struct *find_vm_area(const void *addr);
-extern int map_vm_area(struct vm_struct *area, pgprot_t prot,
- struct page **pages);
#ifdef CONFIG_MMU
extern int map_kernel_range_noflush(unsigned long start, unsigned long size,
pgprot_t prot, struct page **pages);
+int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
+ struct page **pages);
extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size);
extern void unmap_kernel_range(unsigned long addr, unsigned long size);
static inline void set_vm_flush_reset_perms(void *addr)
@@ -191,14 +192,12 @@ map_kernel_range_noflush(unsigned long start, unsigned long size,
{
return size >> PAGE_SHIFT;
}
+#define map_kernel_range map_kernel_range_noflush
static inline void
unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
{
}
-static inline void
-unmap_kernel_range(unsigned long addr, unsigned long size)
-{
-}
+#define unmap_kernel_range unmap_kernel_range_noflush
static inline void set_vm_flush_reset_perms(void *addr)
{
}
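The ARCH_PAGE_TABLE_SYNC_MASK / arch_sync_kernel_mappings() pair introduced above replaces vmalloc_sync_mappings()/vmalloc_sync_unmappings(): an architecture opts in by setting the mask and providing the hook. A rough sketch of the arch side, assuming the PGTBL_*_MODIFIED bits from this series and using sync_global_pgds() purely as a stand-in for whatever mechanism the architecture really uses:

/* In the arch's pgtable header: request a callback whenever generic
 * vmalloc/ioremap code modifies PGD or P4D entries. */
#define ARCH_PAGE_TABLE_SYNC_MASK	(PGTBL_PGD_MODIFIED | PGTBL_P4D_MODIFIED)

/* In the arch's mm code: propagate the changed range into every page-table
 * root that shares the kernel mappings. */
void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
{
	sync_global_pgds(start, end);	/* placeholder for the arch-specific sync */
}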
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index cb507151710f..aa961088c551 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -26,9 +26,11 @@ struct reclaim_stat {
unsigned nr_congested;
unsigned nr_writeback;
unsigned nr_immediate;
+ unsigned nr_pageout;
unsigned nr_activate[2];
unsigned nr_ref_keep;
unsigned nr_unmap_fail;
+ unsigned nr_lazyfree_fail;
};
enum writeback_stat_item {
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 898c890fc153..74935bc9757b 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -544,7 +544,7 @@ do { \
({ \
int __ret = 0; \
might_sleep(); \
- if (!(condition)) \
+ if (!(condition) && (timeout)) \
__ret = __wait_event_hrtimeout(wq_head, condition, timeout, \
TASK_UNINTERRUPTIBLE); \
__ret; \
@@ -570,7 +570,7 @@ do { \
({ \
long __ret = 0; \
might_sleep(); \
- if (!(condition)) \
+ if (!(condition) && (timeout)) \
__ret = __wait_event_hrtimeout(wq, condition, timeout, \
TASK_INTERRUPTIBLE); \
__ret; \
diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
index 2219cce81ca4..0fdbf653b173 100644
--- a/include/linux/zsmalloc.h
+++ b/include/linux/zsmalloc.h
@@ -20,7 +20,7 @@
* zsmalloc mapping modes
*
* NOTE: These only make a difference when a mapped object spans pages.
- * They also have no effect when PGTABLE_MAPPING is selected.
+ * They also have no effect when ZSMALLOC_PGTABLE_MAPPING is selected.
*/
enum zs_mapmode {
ZS_MM_RW, /* normal read-write mapping */
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 94533ae16697..c45010b402ee 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -100,7 +100,8 @@ void ibdev_notice(const struct ib_device *ibdev, const char *format, ...);
__printf(2, 3) __cold
void ibdev_info(const struct ib_device *ibdev, const char *format, ...);
-#if defined(CONFIG_DYNAMIC_DEBUG)
+#if defined(CONFIG_DYNAMIC_DEBUG) || \
+ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
#define ibdev_dbg(__dev, format, args...) \
dynamic_ibdev_dbg(__dev, format, ##args)
#else
@@ -133,7 +134,8 @@ do { \
#define ibdev_info_ratelimited(ibdev, fmt, ...) \
ibdev_level_ratelimited(ibdev_info, ibdev, fmt, ##__VA_ARGS__)
-#if defined(CONFIG_DYNAMIC_DEBUG)
+#if defined(CONFIG_DYNAMIC_DEBUG) || \
+ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
/* descriptor check is first to prevent flooding with "callbacks suppressed" */
#define ibdev_dbg_ratelimited(ibdev, fmt, ...) \
do { \
diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
index e5bf6ee4e814..54e5bf081171 100644
--- a/include/trace/events/compaction.h
+++ b/include/trace/events/compaction.h
@@ -314,40 +314,44 @@ TRACE_EVENT(mm_compaction_kcompactd_sleep,
DECLARE_EVENT_CLASS(kcompactd_wake_template,
- TP_PROTO(int nid, int order, enum zone_type classzone_idx),
+ TP_PROTO(int nid, int order, enum zone_type highest_zoneidx),
- TP_ARGS(nid, order, classzone_idx),
+ TP_ARGS(nid, order, highest_zoneidx),
TP_STRUCT__entry(
__field(int, nid)
__field(int, order)
- __field(enum zone_type, classzone_idx)
+ __field(enum zone_type, highest_zoneidx)
),
TP_fast_assign(
__entry->nid = nid;
__entry->order = order;
- __entry->classzone_idx = classzone_idx;
+ __entry->highest_zoneidx = highest_zoneidx;
),
+ /*
+ * classzone_idx is the previous name of highest_zoneidx; the printed
+ * format keeps it for the tracepoint ABI.
+ */
TP_printk("nid=%d order=%d classzone_idx=%-8s",
__entry->nid,
__entry->order,
- __print_symbolic(__entry->classzone_idx, ZONE_TYPE))
+ __print_symbolic(__entry->highest_zoneidx, ZONE_TYPE))
);
DEFINE_EVENT(kcompactd_wake_template, mm_compaction_wakeup_kcompactd,
- TP_PROTO(int nid, int order, enum zone_type classzone_idx),
+ TP_PROTO(int nid, int order, enum zone_type highest_zoneidx),
- TP_ARGS(nid, order, classzone_idx)
+ TP_ARGS(nid, order, highest_zoneidx)
);
DEFINE_EVENT(kcompactd_wake_template, mm_compaction_kcompactd_wake,
- TP_PROTO(int nid, int order, enum zone_type classzone_idx),
+ TP_PROTO(int nid, int order, enum zone_type highest_zoneidx),
- TP_ARGS(nid, order, classzone_idx)
+ TP_ARGS(nid, order, highest_zoneidx)
);
#endif
diff --git a/include/trace/events/erofs.h b/include/trace/events/erofs.h
index 27f5caa6299a..bf9806fd1306 100644
--- a/include/trace/events/erofs.h
+++ b/include/trace/events/erofs.h
@@ -113,10 +113,10 @@ TRACE_EVENT(erofs_readpage,
TRACE_EVENT(erofs_readpages,
- TP_PROTO(struct inode *inode, struct page *page, unsigned int nrpage,
+ TP_PROTO(struct inode *inode, pgoff_t start, unsigned int nrpage,
bool raw),
- TP_ARGS(inode, page, nrpage, raw),
+ TP_ARGS(inode, start, nrpage, raw),
TP_STRUCT__entry(
__field(dev_t, dev )
@@ -129,7 +129,7 @@ TRACE_EVENT(erofs_readpages,
TP_fast_assign(
__entry->dev = inode->i_sb->s_dev;
__entry->nid = EROFS_I(inode)->nid;
- __entry->start = page->index;
+ __entry->start = start;
__entry->nrpage = nrpage;
__entry->raw = raw;
),
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index a8e2e83c91ce..8639ab962a71 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -1379,9 +1379,9 @@ TRACE_EVENT(f2fs_writepages,
TRACE_EVENT(f2fs_readpages,
- TP_PROTO(struct inode *inode, struct page *page, unsigned int nrpage),
+ TP_PROTO(struct inode *inode, pgoff_t start, unsigned int nrpage),
- TP_ARGS(inode, page, nrpage),
+ TP_ARGS(inode, start, nrpage),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -1393,7 +1393,7 @@ TRACE_EVENT(f2fs_readpages,
TP_fast_assign(
__entry->dev = inode->i_sb->s_dev;
__entry->ino = inode->i_ino;
- __entry->start = page->index;
+ __entry->start = start;
__entry->nrpage = nrpage;
),
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index 70e32ff096ec..4fdb14a81108 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -12,6 +12,8 @@
EM( SCAN_SUCCEED, "succeeded") \
EM( SCAN_PMD_NULL, "pmd_null") \
EM( SCAN_EXCEED_NONE_PTE, "exceed_none_pte") \
+ EM( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte") \
+ EM( SCAN_EXCEED_SHARED_PTE, "exceed_shared_pte") \
EM( SCAN_PTE_NON_PRESENT, "pte_non_present") \
EM( SCAN_PTE_UFFD_WP, "pte_uffd_wp") \
EM( SCAN_PAGE_RO, "no_writable_page") \
@@ -31,7 +33,6 @@
EM( SCAN_DEL_PAGE_LRU, "could_not_delete_page_from_lru")\
EM( SCAN_ALLOC_HUGE_PAGE_FAIL, "alloc_huge_page_failed") \
EM( SCAN_CGROUP_CHARGE_FAIL, "ccgroup_charge_failed") \
- EM( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte") \
EM( SCAN_TRUNCATED, "truncated") \
EMe(SCAN_PAGE_HAS_PRIVATE, "page_has_private") \
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index 74bb594ccb25..2070df64958e 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -265,7 +265,7 @@ TRACE_EVENT(mm_shrink_slab_end,
);
TRACE_EVENT(mm_vmscan_lru_isolate,
- TP_PROTO(int classzone_idx,
+ TP_PROTO(int highest_zoneidx,
int order,
unsigned long nr_requested,
unsigned long nr_scanned,
@@ -274,10 +274,10 @@ TRACE_EVENT(mm_vmscan_lru_isolate,
isolate_mode_t isolate_mode,
int lru),
- TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_skipped, nr_taken, isolate_mode, lru),
+ TP_ARGS(highest_zoneidx, order, nr_requested, nr_scanned, nr_skipped, nr_taken, isolate_mode, lru),
TP_STRUCT__entry(
- __field(int, classzone_idx)
+ __field(int, highest_zoneidx)
__field(int, order)
__field(unsigned long, nr_requested)
__field(unsigned long, nr_scanned)
@@ -288,7 +288,7 @@ TRACE_EVENT(mm_vmscan_lru_isolate,
),
TP_fast_assign(
- __entry->classzone_idx = classzone_idx;
+ __entry->highest_zoneidx = highest_zoneidx;
__entry->order = order;
__entry->nr_requested = nr_requested;
__entry->nr_scanned = nr_scanned;
@@ -298,9 +298,13 @@ TRACE_EVENT(mm_vmscan_lru_isolate,
__entry->lru = lru;
),
+ /*
+ * classzone is the previous name of highest_zoneidx; the printed format
+ * keeps it for the tracepoint ABI.
+ */
TP_printk("isolate_mode=%d classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_skipped=%lu nr_taken=%lu lru=%s",
__entry->isolate_mode,
- __entry->classzone_idx,
+ __entry->highest_zoneidx,
__entry->order,
__entry->nr_requested,
__entry->nr_scanned,
diff --git a/init/Kconfig b/init/Kconfig
index 5cfde7b00549..a8136131c108 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -284,6 +284,16 @@ config KERNEL_UNCOMPRESSED
endchoice
+config DEFAULT_INIT
+ string "Default init path"
+ default ""
+ help
+ This option determines the default init for the system if no init=
+ option is passed on the kernel command line. If the requested path is
+ not present, the kernel still falls back to trying the usual further
+ locations (e.g. /sbin/init). If this is left empty, only the fallback
+ list is used when init= is not passed.
+
config DEFAULT_HOSTNAME
string "Default hostname"
default "(none)"
@@ -858,24 +868,9 @@ config MEMCG
Provides control over the memory footprint of tasks in a cgroup.
config MEMCG_SWAP
- bool "Swap controller"
+ bool
depends on MEMCG && SWAP
- help
- Provides control over the swap space consumed by tasks in a cgroup.
-
-config MEMCG_SWAP_ENABLED
- bool "Swap controller enabled by default"
- depends on MEMCG_SWAP
default y
- help
- Memory Resource Controller Swap Extension comes with its price in
- a bigger memory consumption. General purpose distribution kernels
- which want to enable the feature but keep it disabled by default
- and let the user enable it by swapaccount=1 boot command line
- parameter should have this option unselected.
- For those who want to have the feature enabled by default should
- select this option (if, for some reason, they need to disable it
- then swapaccount=0 does the trick).
config MEMCG_KMEM
bool
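With CONFIG_DEFAULT_INIT a distribution can bake its preferred init path into the kernel image while keeping the usual fallbacks; a hypothetical configuration fragment (the path is purely illustrative):

CONFIG_DEFAULT_INIT="/usr/lib/systemd/systemd"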
diff --git a/init/main.c b/init/main.c
index 73e6182c8381..30271bcb30de 100644
--- a/init/main.c
+++ b/init/main.c
@@ -63,6 +63,7 @@
#include <linux/debugobjects.h>
#include <linux/lockdep.h>
#include <linux/kmemleak.h>
+#include <linux/padata.h>
#include <linux/pid_namespace.h>
#include <linux/device/driver.h>
#include <linux/kthread.h>
@@ -1434,6 +1435,16 @@ static int __ref kernel_init(void *unused)
panic("Requested init %s failed (error %d).",
execute_command, ret);
}
+
+ if (CONFIG_DEFAULT_INIT[0] != '\0') {
+ ret = run_init_process(CONFIG_DEFAULT_INIT);
+ if (ret)
+ pr_err("Default init %s failed (error %d)\n",
+ CONFIG_DEFAULT_INIT, ret);
+ else
+ return 0;
+ }
+
if (!try_to_run_init_process("/sbin/init") ||
!try_to_run_init_process("/etc/init") ||
!try_to_run_init_process("/bin/init") ||
@@ -1484,6 +1495,7 @@ static noinline void __init kernel_init_freeable(void)
smp_init();
sched_init_smp();
+ padata_init();
page_alloc_init_late();
/* Initialize page ext after all struct pages are initialized. */
page_ext_init();
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index d1b8644bfb88..4aeb5375fa8b 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -115,7 +115,7 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write,
int ipc_mni = IPCMNI;
int ipc_mni_shift = IPCMNI_SHIFT;
-int ipc_min_cycle = RADIX_TREE_MAP_SIZE;
+int ipc_min_cycle = XA_CHUNK_SIZE;
static struct ctl_table ipc_kern_table[] = {
{
@@ -196,8 +196,8 @@ static struct ctl_table ipc_kern_table[] = {
#ifdef CONFIG_CHECKPOINT_RESTORE
{
.procname = "sem_next_id",
- .data = &init_ipc_ns.ids[IPC_SEM_IDS].next_id,
- .maxlen = sizeof(init_ipc_ns.ids[IPC_SEM_IDS].next_id),
+ .data = &init_ipc_ns.ids[IPC_SEM_IDS].restore_id,
+ .maxlen = sizeof(init_ipc_ns.ids[IPC_SEM_IDS].restore_id),
.mode = 0644,
.proc_handler = proc_ipc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
@@ -205,8 +205,8 @@ static struct ctl_table ipc_kern_table[] = {
},
{
.procname = "msg_next_id",
- .data = &init_ipc_ns.ids[IPC_MSG_IDS].next_id,
- .maxlen = sizeof(init_ipc_ns.ids[IPC_MSG_IDS].next_id),
+ .data = &init_ipc_ns.ids[IPC_MSG_IDS].restore_id,
+ .maxlen = sizeof(init_ipc_ns.ids[IPC_MSG_IDS].restore_id),
.mode = 0644,
.proc_handler = proc_ipc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
@@ -214,8 +214,8 @@ static struct ctl_table ipc_kern_table[] = {
},
{
.procname = "shm_next_id",
- .data = &init_ipc_ns.ids[IPC_SHM_IDS].next_id,
- .maxlen = sizeof(init_ipc_ns.ids[IPC_SHM_IDS].next_id),
+ .data = &init_ipc_ns.ids[IPC_SHM_IDS].restore_id,
+ .maxlen = sizeof(init_ipc_ns.ids[IPC_SHM_IDS].restore_id),
.mode = 0644,
.proc_handler = proc_ipc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
diff --git a/ipc/msg.c b/ipc/msg.c
index caca67368cb5..b8f7b18cd778 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -268,6 +268,8 @@ static void expunge_all(struct msg_queue *msq, int res,
* before freeque() is called. msg_ids.rwsem remains locked on exit.
*/
static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
+ __releases(RCU)
+ __releases(&msq->q_perm)
{
struct msg_msg *msg, *t;
struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm);
@@ -1308,7 +1310,6 @@ void msg_init_ns(struct ipc_namespace *ns)
void msg_exit_ns(struct ipc_namespace *ns)
{
free_ipcs(ns, &msg_ids(ns), freeque);
- idr_destroy(&ns->ids[IPC_MSG_IDS].ipcs_idr);
rhashtable_destroy(&ns->ids[IPC_MSG_IDS].key_ht);
}
#endif
diff --git a/ipc/namespace.c b/ipc/namespace.c
index fdc3b5f3f53a..e0d5de4c26b9 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -96,27 +96,26 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
void (*free)(struct ipc_namespace *, struct kern_ipc_perm *))
{
struct kern_ipc_perm *perm;
- int next_id;
- int total, in_use;
+ unsigned long index;
down_write(&ids->rwsem);
- in_use = ids->in_use;
-
- for (total = 0, next_id = 0; total < in_use; next_id++) {
- perm = idr_find(&ids->ipcs_idr, next_id);
- if (perm == NULL)
- continue;
+ xa_for_each(&ids->ipcs, index, perm) {
rcu_read_lock();
ipc_lock_object(perm);
free(ns, perm);
- total++;
}
+ BUG_ON(!xa_empty(&ids->ipcs));
+
up_write(&ids->rwsem);
}
static void free_ipc_ns(struct ipc_namespace *ns)
{
+ /* mq_put_mnt() waits for a grace period as kern_unmount()
+ * uses synchronize_rcu().
+ */
+ mq_put_mnt(ns);
sem_exit_ns(ns);
msg_exit_ns(ns);
shm_exit_ns(ns);
@@ -127,6 +126,21 @@ static void free_ipc_ns(struct ipc_namespace *ns)
kfree(ns);
}
+static LLIST_HEAD(free_ipc_list);
+static void free_ipc(struct work_struct *unused)
+{
+ struct llist_node *node = llist_del_all(&free_ipc_list);
+ struct ipc_namespace *n, *t;
+
+ llist_for_each_entry_safe(n, t, node, mnt_llist)
+ free_ipc_ns(n);
+}
+
+/*
+ * The work queue is used to avoid the cost of synchronize_rcu in kern_unmount.
+ */
+static DECLARE_WORK(free_ipc_work, free_ipc);
+
/*
* put_ipc_ns - drop a reference to an ipc namespace.
* @ns: the namespace to put
@@ -148,8 +162,9 @@ void put_ipc_ns(struct ipc_namespace *ns)
if (refcount_dec_and_lock(&ns->count, &mq_lock)) {
mq_clear_sbinfo(ns);
spin_unlock(&mq_lock);
- mq_put_mnt(ns);
- free_ipc_ns(ns);
+
+ if (llist_add(&ns->mnt_llist, &free_ipc_list))
+ schedule_work(&free_ipc_work);
}
}
diff --git a/ipc/sem.c b/ipc/sem.c
index 3687b71151b3..8d6550ac035a 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -258,7 +258,6 @@ void sem_init_ns(struct ipc_namespace *ns)
void sem_exit_ns(struct ipc_namespace *ns)
{
free_ipcs(ns, &sem_ids(ns), freeary);
- idr_destroy(&ns->ids[IPC_SEM_IDS].ipcs_idr);
rhashtable_destroy(&ns->ids[IPC_SEM_IDS].key_ht);
}
#endif
diff --git a/ipc/shm.c b/ipc/shm.c
index 0ba6add05b35..7922c4f65234 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -129,7 +129,6 @@ static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
void shm_exit_ns(struct ipc_namespace *ns)
{
free_ipcs(ns, &shm_ids(ns), do_shm_rmid);
- idr_destroy(&ns->ids[IPC_SHM_IDS].ipcs_idr);
rhashtable_destroy(&ns->ids[IPC_SHM_IDS].key_ht);
}
#endif
@@ -348,34 +347,30 @@ done:
up_write(&shm_ids(ns).rwsem);
}
-/* Called with ns->shm_ids(ns).rwsem locked */
-static int shm_try_destroy_orphaned(int id, void *p, void *data)
+void shm_destroy_orphaned(struct ipc_namespace *ns)
{
- struct ipc_namespace *ns = data;
- struct kern_ipc_perm *ipcp = p;
- struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm);
+ struct kern_ipc_perm *ipcp;
+ unsigned long index;
- /*
- * We want to destroy segments without users and with already
- * exit'ed originating process.
- *
- * As shp->* are changed under rwsem, it's safe to skip shp locking.
- */
- if (shp->shm_creator != NULL)
- return 0;
+ down_write(&shm_ids(ns).rwsem);
+ xa_for_each(&shm_ids(ns).ipcs, index, ipcp) {
+ struct shmid_kernel *shp;
- if (shm_may_destroy(ns, shp)) {
- shm_lock_by_ptr(shp);
- shm_destroy(ns, shp);
- }
- return 0;
-}
+ shp = container_of(ipcp, struct shmid_kernel, shm_perm);
-void shm_destroy_orphaned(struct ipc_namespace *ns)
-{
- down_write(&shm_ids(ns).rwsem);
- if (shm_ids(ns).in_use)
- idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns);
+ /*
+ * We want to destroy segments without users and with already
+ * exit'ed originating process. As shp->* are changed under
+ * rwsem, it's safe to skip shp locking.
+ */
+ if (shp->shm_creator != NULL)
+ continue;
+
+ if (shm_may_destroy(ns, shp)) {
+ shm_lock_by_ptr(shp);
+ shm_destroy(ns, shp);
+ }
+ }
up_write(&shm_ids(ns).rwsem);
}
@@ -860,26 +855,17 @@ static void shm_add_rss_swap(struct shmid_kernel *shp,
static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss,
unsigned long *swp)
{
- int next_id;
- int total, in_use;
+ struct kern_ipc_perm *ipc;
+ unsigned long index;
*rss = 0;
*swp = 0;
- in_use = shm_ids(ns).in_use;
-
- for (total = 0, next_id = 0; total < in_use; next_id++) {
- struct kern_ipc_perm *ipc;
+ xa_for_each(&shm_ids(ns).ipcs, index, ipc) {
struct shmid_kernel *shp;
- ipc = idr_find(&shm_ids(ns).ipcs_idr, next_id);
- if (ipc == NULL)
- continue;
shp = container_of(ipc, struct shmid_kernel, shm_perm);
-
shm_add_rss_swap(shp, rss, swp);
-
- total++;
}
}
diff --git a/ipc/util.c b/ipc/util.c
index cfa0045e748d..ae357c18dcdd 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -104,12 +104,20 @@ static const struct rhashtable_params ipc_kht_params = {
.automatic_shrinking = true,
};
+#ifdef CONFIG_CHECKPOINT_RESTORE
+#define set_restore_id(ids, x) ids->restore_id = x
+#define get_restore_id(ids) ids->restore_id
+#else
+#define set_restore_id(ids, x) do { } while (0)
+#define get_restore_id(ids) (-1)
+#endif
+
/**
* ipc_init_ids - initialise ipc identifiers
* @ids: ipc identifier set
*
* Set up the sequence range to use for the ipc identifier range (limited
- * below ipc_mni) then initialise the keys hashtable and ids idr.
+ * below ipc_mni) then initialise the keys hashtable and ids xarray.
*/
void ipc_init_ids(struct ipc_ids *ids)
{
@@ -117,12 +125,10 @@ void ipc_init_ids(struct ipc_ids *ids)
ids->seq = 0;
init_rwsem(&ids->rwsem);
rhashtable_init(&ids->key_ht, &ipc_kht_params);
- idr_init(&ids->ipcs_idr);
+ xa_init_flags(&ids->ipcs, XA_FLAGS_ALLOC);
ids->max_idx = -1;
- ids->last_idx = -1;
-#ifdef CONFIG_CHECKPOINT_RESTORE
- ids->next_id = -1;
-#endif
+ ids->next_idx = 0;
+ set_restore_id(ids, -1);
}
#ifdef CONFIG_PROC_FS
@@ -183,12 +189,12 @@ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key)
}
/*
- * Insert new IPC object into idr tree, and set sequence number and id
+ * Insert new IPC object into xarray, and set sequence number and id
* in the correct order.
* Especially:
- * - the sequence number must be set before inserting the object into the idr,
- * because the sequence number is accessed without a lock.
- * - the id can/must be set after inserting the object into the idr.
+ * - the sequence number must be set before inserting the object into the
+ * xarray, because the sequence number is accessed without a lock.
+ * - the id can/must be set after inserting the object into the xarray.
* All accesses must be done after getting kern_ipc_perm.lock.
*
* The caller must own kern_ipc_perm.lock of the new object.
@@ -198,64 +204,48 @@ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key)
* the sequence number is incremented only when the returned ID is less than
* the last one.
*/
-static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new)
+static inline int ipc_id_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new)
{
- int idx, next_id = -1;
-
-#ifdef CONFIG_CHECKPOINT_RESTORE
- next_id = ids->next_id;
- ids->next_id = -1;
-#endif
-
- /*
- * As soon as a new object is inserted into the idr,
- * ipc_obtain_object_idr() or ipc_obtain_object_check() can find it,
- * and the lockless preparations for ipc operations can start.
- * This means especially: permission checks, audit calls, allocation
- * of undo structures, ...
- *
- * Thus the object must be fully initialized, and if something fails,
- * then the full tear-down sequence must be followed.
- * (i.e.: set new->deleted, reduce refcount, call_rcu())
- */
+ u32 idx;
+ int err;
- if (next_id < 0) { /* !CHECKPOINT_RESTORE or next_id is unset */
+ if (get_restore_id(ids) < 0) {
int max_idx;
max_idx = max(ids->in_use*3/2, ipc_min_cycle);
- max_idx = min(max_idx, ipc_mni);
-
- /* allocate the idx, with a NULL struct kern_ipc_perm */
- idx = idr_alloc_cyclic(&ids->ipcs_idr, NULL, 0, max_idx,
- GFP_NOWAIT);
-
- if (idx >= 0) {
- /*
- * idx got allocated successfully.
- * Now calculate the sequence number and set the
- * pointer for real.
- */
- if (idx <= ids->last_idx) {
- ids->seq++;
- if (ids->seq >= ipcid_seq_max())
- ids->seq = 0;
- }
- ids->last_idx = idx;
+ max_idx = min(max_idx, ipc_mni) - 1;
+
+ xa_lock(&ids->ipcs);
+ err = __xa_alloc_cyclic(&ids->ipcs, &idx, NULL,
+ XA_LIMIT(0, max_idx), &ids->next_idx,
+ GFP_KERNEL);
+ if (err == 1) {
+ ids->seq++;
+ if (ids->seq >= ipcid_seq_max())
+ ids->seq = 0;
+ }
+
+ if (err >= 0) {
new->seq = ids->seq;
- /* no need for smp_wmb(), this is done
- * inside idr_replace, as part of
- * rcu_assign_pointer
- */
- idr_replace(&ids->ipcs_idr, new, idx);
+ new->id = (new->seq << ipcmni_seq_shift()) + idx;
+ /* xa_store contains a write barrier */
+ __xa_store(&ids->ipcs, idx, new, GFP_KERNEL);
}
+
+ xa_unlock(&ids->ipcs);
} else {
- new->seq = ipcid_to_seqx(next_id);
- idx = idr_alloc(&ids->ipcs_idr, new, ipcid_to_idx(next_id),
- 0, GFP_NOWAIT);
+ new->id = get_restore_id(ids);
+ new->seq = ipcid_to_seqx(new->id);
+ idx = ipcid_to_idx(new->id);
+ err = xa_insert(&ids->ipcs, idx, new, GFP_KERNEL);
+ set_restore_id(ids, -1);
}
- if (idx >= 0)
- new->id = (new->seq << ipcmni_seq_shift()) + idx;
+
+ if (err == -EBUSY)
+ return -ENOSPC;
+ if (err < 0)
+ return err;
return idx;
}
@@ -278,7 +268,7 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit)
{
kuid_t euid;
kgid_t egid;
- int idx, err;
+ int idx;
/* 1) Initialize the refcount so that ipc_rcu_putref works */
refcount_set(&new->refcount, 1);
@@ -289,29 +279,42 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit)
if (ids->in_use >= limit)
return -ENOSPC;
- idr_preload(GFP_KERNEL);
-
+ /*
+ * 2) Hold the spinlock so that nobody else can access the object
+ * once they can find it
+ */
spin_lock_init(&new->lock);
- rcu_read_lock();
spin_lock(&new->lock);
-
current_euid_egid(&euid, &egid);
new->cuid = new->uid = euid;
new->gid = new->cgid = egid;
-
new->deleted = false;
- idx = ipc_idr_alloc(ids, new);
- idr_preload_end();
+ idx = ipc_id_alloc(ids, new);
+
+ rcu_read_lock();
+
+ /*
+ * As soon as a new object is inserted into the XArray,
+ * ipc_obtain_object_idr() or ipc_obtain_object_check() can find it,
+ * and the lockless preparations for ipc operations can start.
+ * This means especially: permission checks, audit calls, allocation
+ * of undo structures, ...
+ *
+ * Thus the object must be fully initialized, and if something fails,
+ * then the full tear-down sequence must be followed.
+ * (i.e.: set new->deleted, reduce refcount, call_rcu())
+ */
if (idx >= 0 && new->key != IPC_PRIVATE) {
- err = rhashtable_insert_fast(&ids->key_ht, &new->khtnode,
+ int err = rhashtable_insert_fast(&ids->key_ht, &new->khtnode,
ipc_kht_params);
if (err < 0) {
- idr_remove(&ids->ipcs_idr, idx);
+ xa_erase(&ids->ipcs, idx);
idx = err;
}
}
+
if (idx < 0) {
new->deleted = true;
spin_unlock(&new->lock);
@@ -462,7 +465,7 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
{
int idx = ipcid_to_idx(ipcp->id);
- idr_remove(&ids->ipcs_idr, idx);
+ xa_erase(&ids->ipcs, idx);
ipc_kht_remove(ids, ipcp);
ids->in_use--;
ipcp->deleted = true;
@@ -472,7 +475,7 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
idx--;
if (idx == -1)
break;
- } while (!idr_find(&ids->ipcs_idr, idx));
+ } while (!xa_load(&ids->ipcs, idx));
ids->max_idx = idx;
}
}
@@ -595,7 +598,7 @@ struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id)
struct kern_ipc_perm *out;
int idx = ipcid_to_idx(id);
- out = idr_find(&ids->ipcs_idr, idx);
+ out = xa_load(&ids->ipcs, idx);
if (!out)
return ERR_PTR(-EINVAL);
@@ -754,30 +757,16 @@ struct pid_namespace *ipc_seq_pid_ns(struct seq_file *s)
static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos,
loff_t *new_pos)
{
+ unsigned long index = pos;
struct kern_ipc_perm *ipc;
- int total, id;
-
- total = 0;
- for (id = 0; id < pos && total < ids->in_use; id++) {
- ipc = idr_find(&ids->ipcs_idr, id);
- if (ipc != NULL)
- total++;
- }
-
- ipc = NULL;
- if (total >= ids->in_use)
- goto out;
- for (; pos < ipc_mni; pos++) {
- ipc = idr_find(&ids->ipcs_idr, pos);
- if (ipc != NULL) {
- rcu_read_lock();
- ipc_lock_object(ipc);
- break;
- }
- }
-out:
- *new_pos = pos + 1;
+ rcu_read_lock();
+ ipc = xa_find(&ids->ipcs, &index, ULONG_MAX, XA_PRESENT);
+ if (ipc)
+ ipc_lock_object(ipc);
+ else
+ rcu_read_unlock();
+ *new_pos = index + 1;
return ipc;
}
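The idr-to-XArray conversion above hinges on __xa_alloc_cyclic() returning 1 when the allocated index wraps, which is what drives the sequence-number bump in ipc_id_alloc(). A stand-alone sketch of that pattern, with invented names and none of the IPC-specific locking:

#include <linux/xarray.h>

static DEFINE_XARRAY_ALLOC(objs);
static u32 next_idx;
static u32 seq;

/* Allocate a cyclic index for @obj, bumping @seq whenever the allocator
 * wraps, so that recycled indices still yield unique (seq, idx) ids. */
static int obj_alloc(void *obj, u32 max_idx)
{
	u32 idx;
	int err;

	err = xa_alloc_cyclic(&objs, &idx, obj, XA_LIMIT(0, max_idx),
			      &next_idx, GFP_KERNEL);
	if (err == 1)		/* wrapped around: low indices are being reused */
		seq++;
	if (err < 0)		/* -EBUSY (range full) or -ENOMEM */
		return err;
	return idx;
}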
diff --git a/ipc/util.h b/ipc/util.h
index 5766c61aed0e..04d49db4cefa 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -27,7 +27,7 @@
*/
#define IPCMNI_SHIFT 15
#define IPCMNI_EXTEND_SHIFT 24
-#define IPCMNI_EXTEND_MIN_CYCLE (RADIX_TREE_MAP_SIZE * RADIX_TREE_MAP_SIZE)
+#define IPCMNI_EXTEND_MIN_CYCLE (XA_CHUNK_SIZE * XA_CHUNK_SIZE)
#define IPCMNI (1 << IPCMNI_SHIFT)
#define IPCMNI_EXTEND (1 << IPCMNI_EXTEND_SHIFT)
@@ -42,7 +42,7 @@ extern int ipc_min_cycle;
#else /* CONFIG_SYSVIPC_SYSCTL */
#define ipc_mni IPCMNI
-#define ipc_min_cycle ((int)RADIX_TREE_MAP_SIZE)
+#define ipc_min_cycle ((int)XA_CHUNK_SIZE)
#define ipcmni_seq_shift() IPCMNI_SHIFT
#define IPCMNI_IDX_MASK ((1 << IPCMNI_SHIFT) - 1)
#endif /* CONFIG_SYSVIPC_SYSCTL */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 2f43e889ba42..e17ed7df5369 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -82,7 +82,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
struct bpf_prog *fp;
size = round_up(size, PAGE_SIZE);
- fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
+ fp = __vmalloc(size, gfp_flags);
if (fp == NULL)
return NULL;
@@ -232,7 +232,7 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
if (ret)
return NULL;
- fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
+ fp = __vmalloc(size, gfp_flags);
if (fp == NULL) {
__bpf_prog_uncharge(fp_old->aux->user, delta);
} else {
@@ -1089,7 +1089,7 @@ static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other,
gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
struct bpf_prog *fp;
- fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL);
+ fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags);
if (fp != NULL) {
/* aux->prog still points to the fp_other one, so
* when promoting the clone to the real program,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index d13b804ff045..93b6461d5fab 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -25,6 +25,7 @@
#include <linux/nospec.h>
#include <linux/audit.h>
#include <uapi/linux/btf.h>
+#include <asm/pgtable.h>
#include <linux/bpf_lsm.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
@@ -285,27 +286,29 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
* __GFP_RETRY_MAYFAIL to avoid such situations.
*/
- const gfp_t flags = __GFP_NOWARN | __GFP_ZERO;
+ const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO;
+ unsigned int flags = 0;
+ unsigned long align = 1;
void *area;
if (size >= SIZE_MAX)
return NULL;
/* kmalloc()'ed memory can't be mmap()'ed */
- if (!mmapable && size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
- area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags,
+ if (mmapable) {
+ BUG_ON(!PAGE_ALIGNED(size));
+ align = SHMLBA;
+ flags = VM_USERMAP;
+ } else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
+ area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY,
numa_node);
if (area != NULL)
return area;
}
- if (mmapable) {
- BUG_ON(!PAGE_ALIGNED(size));
- return vmalloc_user_node_flags(size, numa_node, GFP_KERNEL |
- __GFP_RETRY_MAYFAIL | flags);
- }
- return __vmalloc_node_flags_caller(size, numa_node,
- GFP_KERNEL | __GFP_RETRY_MAYFAIL |
- flags, __builtin_return_address(0));
+
+ return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
+ gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL,
+ flags, numa_node, __builtin_return_address(0));
}
void *bpf_map_area_alloc(u64 size, int numa_node)
diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
index f7b402849891..e739a6eea6e7 100644
--- a/kernel/dma/remap.c
+++ b/kernel/dma/remap.c
@@ -15,23 +15,6 @@ struct page **dma_common_find_pages(void *cpu_addr)
return area->pages;
}
-static struct vm_struct *__dma_common_pages_remap(struct page **pages,
- size_t size, pgprot_t prot, const void *caller)
-{
- struct vm_struct *area;
-
- area = get_vm_area_caller(size, VM_DMA_COHERENT, caller);
- if (!area)
- return NULL;
-
- if (map_vm_area(area, prot, pages)) {
- vunmap(area->addr);
- return NULL;
- }
-
- return area;
-}
-
/*
* Remaps an array of PAGE_SIZE pages into another vm_area.
* Cannot be used in non-sleeping contexts
@@ -39,15 +22,12 @@ static struct vm_struct *__dma_common_pages_remap(struct page **pages,
void *dma_common_pages_remap(struct page **pages, size_t size,
pgprot_t prot, const void *caller)
{
- struct vm_struct *area;
+ void *vaddr;
- area = __dma_common_pages_remap(pages, size, prot, caller);
- if (!area)
- return NULL;
-
- area->pages = pages;
-
- return area->addr;
+ vaddr = vmap(pages, size >> PAGE_SHIFT, VM_DMA_COHERENT, prot);
+ if (vaddr)
+ find_vm_area(vaddr)->pages = pages;
+ return vaddr;
}
/*
@@ -57,24 +37,20 @@ void *dma_common_pages_remap(struct page **pages, size_t size,
void *dma_common_contiguous_remap(struct page *page, size_t size,
pgprot_t prot, const void *caller)
{
- int i;
+ int count = size >> PAGE_SHIFT;
struct page **pages;
- struct vm_struct *area;
+ void *vaddr;
+ int i;
- pages = kmalloc(sizeof(struct page *) << get_order(size), GFP_KERNEL);
+ pages = kmalloc_array(count, sizeof(struct page *), GFP_KERNEL);
if (!pages)
return NULL;
-
- for (i = 0; i < (size >> PAGE_SHIFT); i++)
+ for (i = 0; i < count; i++)
pages[i] = nth_page(page, i);
-
- area = __dma_common_pages_remap(pages, size, prot, caller);
-
+ vaddr = vmap(pages, count, VM_DMA_COHERENT, prot);
kfree(pages);
- if (!area)
- return NULL;
- return area->addr;
+ return vaddr;
}
/*
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index ece7e13f6e4a..eddc8db96027 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -162,14 +162,12 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
};
int err;
struct mmu_notifier_range range;
- struct mem_cgroup *memcg;
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
addr + PAGE_SIZE);
if (new_page) {
- err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL,
- &memcg, false);
+ err = mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL);
if (err)
return err;
}
@@ -179,17 +177,13 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
mmu_notifier_invalidate_range_start(&range);
err = -EAGAIN;
- if (!page_vma_mapped_walk(&pvmw)) {
- if (new_page)
- mem_cgroup_cancel_charge(new_page, memcg, false);
+ if (!page_vma_mapped_walk(&pvmw))
goto unlock;
- }
VM_BUG_ON_PAGE(addr != pvmw.address, old_page);
if (new_page) {
get_page(new_page);
page_add_new_anon_rmap(new_page, vma, addr, false);
- mem_cgroup_commit_charge(new_page, memcg, false, false);
lru_cache_add_active_or_unevictable(new_page, vma);
} else
/* no new page, just dec_mm_counter for old_page */
diff --git a/kernel/groups.c b/kernel/groups.c
index daae2f2dc6d4..6ee6691f6839 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -20,7 +20,7 @@ struct group_info *groups_alloc(int gidsetsize)
len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize;
gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY);
if (!gi)
- gi = __vmalloc(len, GFP_KERNEL_ACCOUNT, PAGE_KERNEL);
+ gi = __vmalloc(len, GFP_KERNEL_ACCOUNT);
if (!gi)
return NULL;
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 14a625c16cb3..69f54848af79 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -142,6 +142,47 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
touch_nmi_watchdog();
}
+static void check_killed_task(struct task_struct *t, unsigned long timeout)
+{
+ unsigned long stamp = t->killed_time;
+
+ /*
+ * Ensure the task is not frozen.
+ * Also, skip vfork and any other user process that freezer should skip.
+ */
+ if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP)))
+ return;
+ /*
+ * Skip threads which are already inside do_exit(), for exit_mm() etc.
+ * might take many seconds.
+ */
+ if (t->flags & PF_EXITING)
+ return;
+ if (!stamp) {
+ stamp = jiffies;
+ if (!stamp)
+ stamp++;
+ t->killed_time = stamp;
+ return;
+ }
+ if (time_is_after_jiffies(stamp + timeout * HZ))
+ return;
+ trace_sched_process_hang(t);
+ if (sysctl_hung_task_panic) {
+ console_verbose();
+ hung_task_call_panic = true;
+ }
+ /*
+ * This thread failed to terminate for more than
+ * sysctl_hung_task_timeout_secs seconds, complain:
+ */
+ pr_err("INFO: task %s:%d can't die for more than %ld seconds.\n",
+ t->comm, t->pid, (jiffies - stamp) / HZ);
+ sched_show_task(t);
+ hung_task_show_lock = true;
+ touch_nmi_watchdog();
+}
+
/*
* To avoid extending the RCU grace period for an unbounded amount of time,
* periodically exit the critical section and enter a new one.
@@ -193,6 +234,9 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
goto unlock;
last_break = jiffies;
}
+ /* Check threads which are about to terminate. */
+ if (unlikely(fatal_signal_pending(t)))
+ check_killed_task(t, timeout);
/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
if (t->state == TASK_UNINTERRUPTIBLE)
check_hung_task(t, timeout);
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 8accc9722a81..55c5d883a93e 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -86,6 +86,18 @@ static DEFINE_SPINLOCK(kcov_remote_lock);
static DEFINE_HASHTABLE(kcov_remote_map, 4);
static struct list_head kcov_remote_areas = LIST_HEAD_INIT(kcov_remote_areas);
+struct kcov_percpu_data {
+ void *irq_area;
+
+ unsigned int saved_mode;
+ unsigned int saved_size;
+ void *saved_area;
+ struct kcov *saved_kcov;
+ int saved_sequence;
+};
+
+DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data);
+
/* Must be called with kcov_remote_lock locked. */
static struct kcov_remote *kcov_remote_find(u64 handle)
{
@@ -98,6 +110,7 @@ static struct kcov_remote *kcov_remote_find(u64 handle)
return NULL;
}
+/* Must be called with kcov_remote_lock locked. */
static struct kcov_remote *kcov_remote_add(struct kcov *kcov, u64 handle)
{
struct kcov_remote *remote;
@@ -119,16 +132,13 @@ static struct kcov_remote_area *kcov_remote_area_get(unsigned int size)
struct kcov_remote_area *area;
struct list_head *pos;
- kcov_debug("size = %u\n", size);
list_for_each(pos, &kcov_remote_areas) {
area = list_entry(pos, struct kcov_remote_area, list);
if (area->size == size) {
list_del(&area->list);
- kcov_debug("rv = %px\n", area);
return area;
}
}
- kcov_debug("rv = NULL\n");
return NULL;
}
@@ -136,7 +146,6 @@ static struct kcov_remote_area *kcov_remote_area_get(unsigned int size)
static void kcov_remote_area_put(struct kcov_remote_area *area,
unsigned int size)
{
- kcov_debug("area = %px, size = %u\n", area, size);
INIT_LIST_HEAD(&area->list);
area->size = size;
list_add(&area->list, &kcov_remote_areas);
@@ -148,9 +157,10 @@ static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_stru
/*
* We are interested in code coverage as a function of a syscall inputs,
- * so we ignore code executed in interrupts.
+ * so we ignore code executed in interrupts, unless we are in a remote
+ * coverage collection section in a softirq.
*/
- if (!in_task())
+ if (!in_task() && !(in_serving_softirq() && t->kcov_softirq))
return false;
mode = READ_ONCE(t->kcov_mode);
/*
@@ -312,23 +322,26 @@ void notrace __sanitizer_cov_trace_switch(u64 val, u64 *cases)
EXPORT_SYMBOL(__sanitizer_cov_trace_switch);
#endif /* ifdef CONFIG_KCOV_ENABLE_COMPARISONS */
-static void kcov_start(struct task_struct *t, unsigned int size,
- void *area, enum kcov_mode mode, int sequence)
+static void kcov_start(struct task_struct *t, struct kcov *kcov,
+ unsigned int size, void *area, enum kcov_mode mode,
+ int sequence)
{
kcov_debug("t = %px, size = %u, area = %px\n", t, size, area);
+ t->kcov = kcov;
/* Cache in task struct for performance. */
t->kcov_size = size;
t->kcov_area = area;
+ t->kcov_sequence = sequence;
/* See comment in check_kcov_mode(). */
barrier();
WRITE_ONCE(t->kcov_mode, mode);
- t->kcov_sequence = sequence;
}
static void kcov_stop(struct task_struct *t)
{
WRITE_ONCE(t->kcov_mode, KCOV_MODE_DISABLED);
barrier();
+ t->kcov = NULL;
t->kcov_size = 0;
t->kcov_area = NULL;
}
@@ -336,7 +349,6 @@ static void kcov_stop(struct task_struct *t)
static void kcov_task_reset(struct task_struct *t)
{
kcov_stop(t);
- t->kcov = NULL;
t->kcov_sequence = 0;
t->kcov_handle = 0;
}
@@ -361,18 +373,18 @@ static void kcov_remote_reset(struct kcov *kcov)
int bkt;
struct kcov_remote *remote;
struct hlist_node *tmp;
+ unsigned long flags;
- spin_lock(&kcov_remote_lock);
+ spin_lock_irqsave(&kcov_remote_lock, flags);
hash_for_each_safe(kcov_remote_map, bkt, tmp, remote, hnode) {
if (remote->kcov != kcov)
continue;
- kcov_debug("removing handle %llx\n", remote->handle);
hash_del(&remote->hnode);
kfree(remote);
}
/* Do reset before unlock to prevent races with kcov_remote_start(). */
kcov_reset(kcov);
- spin_unlock(&kcov_remote_lock);
+ spin_unlock_irqrestore(&kcov_remote_lock, flags);
}
static void kcov_disable(struct task_struct *t, struct kcov *kcov)
@@ -401,12 +413,13 @@ static void kcov_put(struct kcov *kcov)
void kcov_task_exit(struct task_struct *t)
{
struct kcov *kcov;
+ unsigned long flags;
kcov = t->kcov;
if (kcov == NULL)
return;
- spin_lock(&kcov->lock);
+ spin_lock_irqsave(&kcov->lock, flags);
kcov_debug("t = %px, kcov->t = %px\n", t, kcov->t);
/*
* For KCOV_ENABLE devices we want to make sure that t->kcov->t == t,
@@ -430,12 +443,12 @@ void kcov_task_exit(struct task_struct *t)
* By combining all three checks into one we get:
*/
if (WARN_ON(kcov->t != t)) {
- spin_unlock(&kcov->lock);
+ spin_unlock_irqrestore(&kcov->lock, flags);
return;
}
/* Just to not leave dangling references behind. */
kcov_disable(t, kcov);
- spin_unlock(&kcov->lock);
+ spin_unlock_irqrestore(&kcov->lock, flags);
kcov_put(kcov);
}
@@ -446,12 +459,13 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
struct kcov *kcov = vma->vm_file->private_data;
unsigned long size, off;
struct page *page;
+ unsigned long flags;
area = vmalloc_user(vma->vm_end - vma->vm_start);
if (!area)
return -ENOMEM;
- spin_lock(&kcov->lock);
+ spin_lock_irqsave(&kcov->lock, flags);
size = kcov->size * sizeof(unsigned long);
if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 ||
vma->vm_end - vma->vm_start != size) {
@@ -461,7 +475,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
if (!kcov->area) {
kcov->area = area;
vma->vm_flags |= VM_DONTEXPAND;
- spin_unlock(&kcov->lock);
+ spin_unlock_irqrestore(&kcov->lock, flags);
for (off = 0; off < size; off += PAGE_SIZE) {
page = vmalloc_to_page(kcov->area + off);
if (vm_insert_page(vma, vma->vm_start + off, page))
@@ -470,7 +484,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
return 0;
}
exit:
- spin_unlock(&kcov->lock);
+ spin_unlock_irqrestore(&kcov->lock, flags);
vfree(area);
return res;
}
@@ -550,10 +564,10 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
int mode, i;
struct kcov_remote_arg *remote_arg;
struct kcov_remote *remote;
+ unsigned long flags;
switch (cmd) {
case KCOV_INIT_TRACE:
- kcov_debug("KCOV_INIT_TRACE\n");
/*
* Enable kcov in trace mode and setup buffer size.
* Must happen before anything else.
@@ -572,7 +586,6 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
kcov->mode = KCOV_MODE_INIT;
return 0;
case KCOV_ENABLE:
- kcov_debug("KCOV_ENABLE\n");
/*
* Enable coverage for the current task.
* At this point the user must have enabled trace mode,
@@ -590,15 +603,13 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
return mode;
kcov_fault_in_area(kcov);
kcov->mode = mode;
- kcov_start(t, kcov->size, kcov->area, kcov->mode,
+ kcov_start(t, kcov, kcov->size, kcov->area, kcov->mode,
kcov->sequence);
- t->kcov = kcov;
kcov->t = t;
/* Put either in kcov_task_exit() or in KCOV_DISABLE. */
kcov_get(kcov);
return 0;
case KCOV_DISABLE:
- kcov_debug("KCOV_DISABLE\n");
/* Disable coverage for the current task. */
unused = arg;
if (unused != 0 || current->kcov != kcov)
@@ -610,7 +621,6 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
kcov_put(kcov);
return 0;
case KCOV_REMOTE_ENABLE:
- kcov_debug("KCOV_REMOTE_ENABLE\n");
if (kcov->mode != KCOV_MODE_INIT || !kcov->area)
return -EINVAL;
t = current;
@@ -627,41 +637,42 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
kcov->t = t;
kcov->remote = true;
kcov->remote_size = remote_arg->area_size;
- spin_lock(&kcov_remote_lock);
+ spin_lock_irqsave(&kcov_remote_lock, flags);
for (i = 0; i < remote_arg->num_handles; i++) {
- kcov_debug("handle %llx\n", remote_arg->handles[i]);
if (!kcov_check_handle(remote_arg->handles[i],
false, true, false)) {
- spin_unlock(&kcov_remote_lock);
+ spin_unlock_irqrestore(&kcov_remote_lock,
+ flags);
kcov_disable(t, kcov);
return -EINVAL;
}
remote = kcov_remote_add(kcov, remote_arg->handles[i]);
if (IS_ERR(remote)) {
- spin_unlock(&kcov_remote_lock);
+ spin_unlock_irqrestore(&kcov_remote_lock,
+ flags);
kcov_disable(t, kcov);
return PTR_ERR(remote);
}
}
if (remote_arg->common_handle) {
- kcov_debug("common handle %llx\n",
- remote_arg->common_handle);
if (!kcov_check_handle(remote_arg->common_handle,
true, false, false)) {
- spin_unlock(&kcov_remote_lock);
+ spin_unlock_irqrestore(&kcov_remote_lock,
+ flags);
kcov_disable(t, kcov);
return -EINVAL;
}
remote = kcov_remote_add(kcov,
remote_arg->common_handle);
if (IS_ERR(remote)) {
- spin_unlock(&kcov_remote_lock);
+ spin_unlock_irqrestore(&kcov_remote_lock,
+ flags);
kcov_disable(t, kcov);
return PTR_ERR(remote);
}
t->kcov_handle = remote_arg->common_handle;
}
- spin_unlock(&kcov_remote_lock);
+ spin_unlock_irqrestore(&kcov_remote_lock, flags);
/* Put either in kcov_task_exit() or in KCOV_DISABLE. */
kcov_get(kcov);
return 0;
@@ -677,6 +688,7 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
struct kcov_remote_arg *remote_arg = NULL;
unsigned int remote_num_handles;
unsigned long remote_arg_size;
+ unsigned long flags;
if (cmd == KCOV_REMOTE_ENABLE) {
if (get_user(remote_num_handles, (unsigned __user *)(arg +
@@ -697,9 +709,9 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
}
kcov = filep->private_data;
- spin_lock(&kcov->lock);
+ spin_lock_irqsave(&kcov->lock, flags);
res = kcov_ioctl_locked(kcov, cmd, arg);
- spin_unlock(&kcov->lock);
+ spin_unlock_irqrestore(&kcov->lock, flags);
kfree(remote_arg);
@@ -716,8 +728,8 @@ static const struct file_operations kcov_fops = {
/*
* kcov_remote_start() and kcov_remote_stop() can be used to annotate a section
- * of code in a kernel background thread to allow kcov to be used to collect
- * coverage from that part of code.
+ * of code in a kernel background thread or in a softirq to allow kcov to be
+ * used to collect coverage from that part of code.
*
* The handle argument of kcov_remote_start() identifies a code section that is
* used for coverage collection. A userspace process passes this handle to
@@ -728,9 +740,9 @@ static const struct file_operations kcov_fops = {
* the type of the kernel thread whose code is being annotated.
*
* For global kernel threads that are spawned in a limited number of instances
- * (e.g. one USB hub_event() worker thread is spawned per USB HCD), each
- * instance must be assigned a unique 4-byte instance id. The instance id is
- * then combined with a 1-byte subsystem id to get a handle via
+ * (e.g. one USB hub_event() worker thread is spawned per USB HCD) and for
+ * softirqs, each instance must be assigned a unique 4-byte instance id. The
+ * instance id is then combined with a 1-byte subsystem id to get a handle via
* kcov_remote_handle(subsystem_id, instance_id).
*
* For local kernel threads that are spawned from system calls handler when a
@@ -749,70 +761,136 @@ static const struct file_operations kcov_fops = {
*
* See Documentation/dev-tools/kcov.rst for more details.
*
- * Internally, this function looks up the kcov device associated with the
+ * Internally, kcov_remote_start() looks up the kcov device associated with the
* provided handle, allocates an area for coverage collection, and saves the
* pointers to kcov and area into the current task_struct to allow coverage to
* be collected via __sanitizer_cov_trace_pc().
* In turn, kcov_remote_stop() clears those pointers from task_struct to stop
* collecting coverage and copies all collected coverage into the kcov area.
*/
+
+static inline bool kcov_mode_enabled(unsigned int mode)
+{
+ return (mode & ~KCOV_IN_CTXSW) != KCOV_MODE_DISABLED;
+}
+
+void kcov_remote_softirq_start(struct task_struct *t)
+{
+ struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data);
+ unsigned int mode;
+
+ mode = READ_ONCE(t->kcov_mode);
+ barrier();
+ if (kcov_mode_enabled(mode)) {
+ data->saved_mode = mode;
+ data->saved_size = t->kcov_size;
+ data->saved_area = t->kcov_area;
+ data->saved_sequence = t->kcov_sequence;
+ data->saved_kcov = t->kcov;
+ kcov_stop(t);
+ }
+}
+
+void kcov_remote_softirq_stop(struct task_struct *t)
+{
+ struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data);
+
+ if (data->saved_kcov) {
+ kcov_start(t, data->saved_kcov, data->saved_size,
+ data->saved_area, data->saved_mode,
+ data->saved_sequence);
+ data->saved_mode = 0;
+ data->saved_size = 0;
+ data->saved_area = NULL;
+ data->saved_sequence = 0;
+ data->saved_kcov = NULL;
+ }
+}
+
void kcov_remote_start(u64 handle)
{
+ struct task_struct *t = current;
struct kcov_remote *remote;
+ struct kcov *kcov;
+ unsigned int mode;
void *area;
- struct task_struct *t;
unsigned int size;
- enum kcov_mode mode;
int sequence;
+ unsigned long flags;
if (WARN_ON(!kcov_check_handle(handle, true, true, true)))
return;
- if (WARN_ON(!in_task()))
+ if (!in_task() && !in_serving_softirq())
return;
- t = current;
+
+ local_irq_save(flags);
+
/*
- * Check that kcov_remote_start is not called twice
- * nor called by user tasks (with enabled kcov).
+ * Check that kcov_remote_start() is not called twice in background
+ * threads nor called by user tasks (with kcov enabled).
*/
- if (WARN_ON(t->kcov))
+ mode = READ_ONCE(t->kcov_mode);
+ if (WARN_ON(in_task() && kcov_mode_enabled(mode))) {
+ local_irq_restore(flags);
return;
-
- kcov_debug("handle = %llx\n", handle);
+ }
+ /*
+ * Check that kcov_remote_start() is not called twice in softirqs.
+ * Note that kcov_remote_start() can be called from a softirq that
+ * happened while collecting coverage from a background thread.
+ */
+ if (WARN_ON(in_serving_softirq() && t->kcov_softirq)) {
+ local_irq_restore(flags);
+ return;
+ }
spin_lock(&kcov_remote_lock);
remote = kcov_remote_find(handle);
if (!remote) {
- kcov_debug("no remote found");
- spin_unlock(&kcov_remote_lock);
+ spin_unlock_irqrestore(&kcov_remote_lock, flags);
return;
}
+ kcov_debug("handle = %llx, context: %s\n", handle,
+ in_task() ? "task" : "softirq");
+ kcov = remote->kcov;
/* Put in kcov_remote_stop(). */
- kcov_get(remote->kcov);
- t->kcov = remote->kcov;
+ kcov_get(kcov);
/*
* Read kcov fields before unlock to prevent races with
* KCOV_DISABLE / kcov_remote_reset().
*/
- size = remote->kcov->remote_size;
- mode = remote->kcov->mode;
- sequence = remote->kcov->sequence;
- area = kcov_remote_area_get(size);
- spin_unlock(&kcov_remote_lock);
+ mode = kcov->mode;
+ sequence = kcov->sequence;
+ if (in_task()) {
+ size = kcov->remote_size;
+ area = kcov_remote_area_get(size);
+ } else {
+ size = CONFIG_KCOV_IRQ_AREA_SIZE;
+ area = this_cpu_ptr(&kcov_percpu_data)->irq_area;
+ }
+ spin_unlock_irqrestore(&kcov_remote_lock, flags);
+ /* Can only happen when in_task(). */
if (!area) {
area = vmalloc(size * sizeof(unsigned long));
if (!area) {
- t->kcov = NULL;
- kcov_put(remote->kcov);
+ kcov_put(kcov);
return;
}
}
+
+ local_irq_save(flags);
+
/* Reset coverage size. */
*(u64 *)area = 0;
- kcov_debug("area = %px, size = %u", area, size);
+ if (in_serving_softirq()) {
+ kcov_remote_softirq_start(t);
+ t->kcov_softirq = 1;
+ }
+ kcov_start(t, kcov, size, area, mode, sequence);
- kcov_start(t, size, area, mode, sequence);
+ local_irq_restore(flags);
}
EXPORT_SYMBOL(kcov_remote_start);
@@ -876,34 +954,58 @@ static void kcov_move_area(enum kcov_mode mode, void *dst_area,
void kcov_remote_stop(void)
{
struct task_struct *t = current;
- struct kcov *kcov = t->kcov;
- void *area = t->kcov_area;
- unsigned int size = t->kcov_size;
- int sequence = t->kcov_sequence;
+ struct kcov *kcov;
+ unsigned int mode;
+ void *area;
+ unsigned int size;
+ int sequence;
+ unsigned long flags;
- if (!kcov) {
- kcov_debug("no kcov found\n");
+ if (!in_task() && !in_serving_softirq())
+ return;
+
+ local_irq_save(flags);
+
+ mode = READ_ONCE(t->kcov_mode);
+ barrier();
+ if (!kcov_mode_enabled(mode)) {
+ local_irq_restore(flags);
+ return;
+ }
+ kcov = t->kcov;
+ area = t->kcov_area;
+ size = t->kcov_size;
+ sequence = t->kcov_sequence;
+
+ if (WARN_ON(!in_serving_softirq() && t->kcov_softirq)) {
+ local_irq_restore(flags);
return;
}
kcov_stop(t);
- t->kcov = NULL;
+ if (in_serving_softirq()) {
+ t->kcov_softirq = 0;
+ kcov_remote_softirq_stop(t);
+ }
spin_lock(&kcov->lock);
/*
* KCOV_DISABLE could have been called between kcov_remote_start()
- * and kcov_remote_stop(), hence the check.
+ * and kcov_remote_stop(), hence the sequence check.
*/
- kcov_debug("move if: %d == %d && %d\n",
- sequence, kcov->sequence, (int)kcov->remote);
if (sequence == kcov->sequence && kcov->remote)
kcov_move_area(kcov->mode, kcov->area, kcov->size, area);
spin_unlock(&kcov->lock);
- spin_lock(&kcov_remote_lock);
- kcov_remote_area_put(area, size);
- spin_unlock(&kcov_remote_lock);
+ if (in_task()) {
+ spin_lock(&kcov_remote_lock);
+ kcov_remote_area_put(area, size);
+ spin_unlock(&kcov_remote_lock);
+ }
+
+ local_irq_restore(flags);
+ /* Get in kcov_remote_start(). */
kcov_put(kcov);
}
EXPORT_SYMBOL(kcov_remote_stop);
@@ -917,6 +1019,16 @@ EXPORT_SYMBOL(kcov_common_handle);
static int __init kcov_init(void)
{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ void *area = vmalloc(CONFIG_KCOV_IRQ_AREA_SIZE *
+ sizeof(unsigned long));
+ if (!area)
+ return -ENOMEM;
+ per_cpu_ptr(&kcov_percpu_data, cpu)->irq_area = area;
+ }
+
/*
* The kcov debugfs file won't ever get removed and thus,
* there is no need to protect it against removal races. The
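For context, a minimal usage sketch (not taken from this diff) of how a subsystem annotates a background or softirq-served section for remote KCOV collection; it assumes the kcov_remote_start_common()/kcov_remote_stop() helpers declared in include/linux/kcov.h, and the instance id is made up:

#include <linux/kcov.h>
#include <linux/workqueue.h>

static void example_background_work(struct work_struct *work)
{
	/*
	 * Attribute coverage of this section to a remote handle that a
	 * userspace fuzzer can subscribe to via KCOV_REMOTE_ENABLE.
	 */
	kcov_remote_start_common(0x42);		/* illustrative instance id */

	/* ... work whose coverage should be collected remotely ... */

	kcov_remote_stop();
}

With the changes above, the same annotation also works when the section runs in softirq context, using the per-cpu areas set up in kcov_init().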
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index faa74d5f6941..bb05fd52de85 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -540,6 +540,11 @@ static int locate_mem_hole_callback(struct resource *res, void *arg)
unsigned long sz = end - start + 1;
/* Returning 0 will take to next memory range */
+
+ /* Don't use memory that will be detected and handled by a driver. */
+ if (res->flags & IORESOURCE_MEM_DRIVER_MANAGED)
+ return 0;
+
if (sz < kbuf->memsz)
return 0;
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 0fbdee78266b..8bef07cf33cb 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2475,24 +2475,14 @@ static int show_kprobe_addr(struct seq_file *pi, void *v)
return 0;
}
-static const struct seq_operations kprobes_seq_ops = {
+static const struct seq_operations kprobes_sops = {
.start = kprobe_seq_start,
.next = kprobe_seq_next,
.stop = kprobe_seq_stop,
.show = show_kprobe_addr
};
-static int kprobes_open(struct inode *inode, struct file *filp)
-{
- return seq_open(filp, &kprobes_seq_ops);
-}
-
-static const struct file_operations debugfs_kprobes_operations = {
- .open = kprobes_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
+DEFINE_SEQ_ATTRIBUTE(kprobes);
/* kprobes/blacklist -- shows which functions can not be probed */
static void *kprobe_blacklist_seq_start(struct seq_file *m, loff_t *pos)
@@ -2529,24 +2519,14 @@ static void kprobe_blacklist_seq_stop(struct seq_file *f, void *v)
mutex_unlock(&kprobe_mutex);
}
-static const struct seq_operations kprobe_blacklist_seq_ops = {
+static const struct seq_operations kprobe_blacklist_sops = {
.start = kprobe_blacklist_seq_start,
.next = kprobe_blacklist_seq_next,
.stop = kprobe_blacklist_seq_stop,
.show = kprobe_blacklist_seq_show,
};
-static int kprobe_blacklist_open(struct inode *inode, struct file *filp)
-{
- return seq_open(filp, &kprobe_blacklist_seq_ops);
-}
-
-static const struct file_operations debugfs_kprobe_blacklist_ops = {
- .open = kprobe_blacklist_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
+DEFINE_SEQ_ATTRIBUTE(kprobe_blacklist);
static int arm_all_kprobes(void)
{
@@ -2705,13 +2685,12 @@ static int __init debugfs_kprobe_init(void)
dir = debugfs_create_dir("kprobes", NULL);
- debugfs_create_file("list", 0400, dir, NULL,
- &debugfs_kprobes_operations);
+ debugfs_create_file("list", 0400, dir, NULL, &kprobes_fops);
debugfs_create_file("enabled", 0600, dir, &value, &fops_kp);
debugfs_create_file("blacklist", 0400, dir, NULL,
- &debugfs_kprobe_blacklist_ops);
+ &kprobe_blacklist_fops);
return 0;
}
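For readers unfamiliar with the macro used in the conversion above: DEFINE_SEQ_ATTRIBUTE(name) from <linux/seq_file.h> generates the open helper and file_operations that the removed boilerplate spelled out by hand. A simplified sketch of what DEFINE_SEQ_ATTRIBUTE(kprobes) roughly expands to (the real macro additionally forwards inode->i_private to the seq_file's private pointer):

static int kprobes_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &kprobes_sops);
}

static const struct file_operations kprobes_fops = {
	.owner   = THIS_MODULE,
	.open    = kprobes_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};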
diff --git a/kernel/module.c b/kernel/module.c
index 39b65d81fba8..bb4a830e7490 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2970,8 +2970,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len,
return err;
/* Suck in entire file: we'll want most of it. */
- info->hdr = __vmalloc(info->len,
- GFP_KERNEL | __GFP_NOWARN, PAGE_KERNEL);
+ info->hdr = __vmalloc(info->len, GFP_KERNEL | __GFP_NOWARN);
if (!info->hdr)
return -ENOMEM;
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 5989bbb93039..84c987dfbe03 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -519,7 +519,6 @@ NOKPROBE_SYMBOL(notify_die);
int register_die_notifier(struct notifier_block *nb)
{
- vmalloc_sync_mappings();
return atomic_notifier_chain_register(&die_chain, nb);
}
EXPORT_SYMBOL_GPL(register_die_notifier);
diff --git a/kernel/padata.c b/kernel/padata.c
index aae789896616..29fc5d87a4cd 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -7,6 +7,9 @@
* Copyright (C) 2008, 2009 secunet Security Networks AG
* Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com>
*
+ * Copyright (c) 2020 Oracle and/or its affiliates.
+ * Author: Daniel Jordan <daniel.m.jordan@oracle.com>
+ *
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
@@ -21,6 +24,7 @@
* 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
*/
+#include <linux/completion.h>
#include <linux/export.h>
#include <linux/cpumask.h>
#include <linux/err.h>
@@ -31,11 +35,30 @@
#include <linux/slab.h>
#include <linux/sysfs.h>
#include <linux/rcupdate.h>
-#include <linux/module.h>
-#define MAX_OBJ_NUM 1000
+#define PADATA_WORK_ONSTACK 1 /* Work's memory is on stack */
+
+struct padata_work {
+ struct work_struct pw_work;
+ struct list_head pw_list; /* padata_free_works linkage */
+ void *pw_data;
+};
+
+static DEFINE_SPINLOCK(padata_works_lock);
+static struct padata_work *padata_works;
+static LIST_HEAD(padata_free_works);
+
+struct padata_mt_job_state {
+ spinlock_t lock;
+ struct completion completion;
+ struct padata_mt_job *job;
+ int nworks;
+ int nworks_fini;
+ unsigned long chunk_size;
+};
static void padata_free_pd(struct parallel_data *pd);
+static void __init padata_mt_helper(struct work_struct *work);
static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
{
@@ -59,30 +82,82 @@ static int padata_cpu_hash(struct parallel_data *pd, unsigned int seq_nr)
return padata_index_to_cpu(pd, cpu_index);
}
-static void padata_parallel_worker(struct work_struct *parallel_work)
+static struct padata_work *padata_work_alloc(void)
{
- struct padata_parallel_queue *pqueue;
- LIST_HEAD(local_list);
+ struct padata_work *pw;
- local_bh_disable();
- pqueue = container_of(parallel_work,
- struct padata_parallel_queue, work);
+ lockdep_assert_held(&padata_works_lock);
- spin_lock(&pqueue->parallel.lock);
- list_replace_init(&pqueue->parallel.list, &local_list);
- spin_unlock(&pqueue->parallel.lock);
+ if (list_empty(&padata_free_works))
+ return NULL; /* No more work items allowed to be queued. */
- while (!list_empty(&local_list)) {
- struct padata_priv *padata;
+ pw = list_first_entry(&padata_free_works, struct padata_work, pw_list);
+ list_del(&pw->pw_list);
+ return pw;
+}
- padata = list_entry(local_list.next,
- struct padata_priv, list);
+static void padata_work_init(struct padata_work *pw, work_func_t work_fn,
+ void *data, int flags)
+{
+ if (flags & PADATA_WORK_ONSTACK)
+ INIT_WORK_ONSTACK(&pw->pw_work, work_fn);
+ else
+ INIT_WORK(&pw->pw_work, work_fn);
+ pw->pw_data = data;
+}
- list_del_init(&padata->list);
+static int __init padata_work_alloc_mt(int nworks, void *data,
+ struct list_head *head)
+{
+ int i;
- padata->parallel(padata);
+ spin_lock(&padata_works_lock);
+ /* Start at 1 because the current task participates in the job. */
+ for (i = 1; i < nworks; ++i) {
+ struct padata_work *pw = padata_work_alloc();
+
+ if (!pw)
+ break;
+ padata_work_init(pw, padata_mt_helper, data, 0);
+ list_add(&pw->pw_list, head);
+ }
+ spin_unlock(&padata_works_lock);
+
+ return i;
+}
+
+static void padata_work_free(struct padata_work *pw)
+{
+ lockdep_assert_held(&padata_works_lock);
+ list_add(&pw->pw_list, &padata_free_works);
+}
+
+static void __init padata_works_free(struct list_head *works)
+{
+ struct padata_work *cur, *next;
+
+ if (list_empty(works))
+ return;
+
+ spin_lock(&padata_works_lock);
+ list_for_each_entry_safe(cur, next, works, pw_list) {
+ list_del(&cur->pw_list);
+ padata_work_free(cur);
}
+ spin_unlock(&padata_works_lock);
+}
+static void padata_parallel_worker(struct work_struct *parallel_work)
+{
+ struct padata_work *pw = container_of(parallel_work, struct padata_work,
+ pw_work);
+ struct padata_priv *padata = pw->pw_data;
+
+ local_bh_disable();
+ padata->parallel(padata);
+ spin_lock(&padata_works_lock);
+ padata_work_free(pw);
+ spin_unlock(&padata_works_lock);
local_bh_enable();
}
@@ -106,9 +181,9 @@ int padata_do_parallel(struct padata_shell *ps,
struct padata_priv *padata, int *cb_cpu)
{
struct padata_instance *pinst = ps->pinst;
- int i, cpu, cpu_index, target_cpu, err;
- struct padata_parallel_queue *queue;
+ int i, cpu, cpu_index, err;
struct parallel_data *pd;
+ struct padata_work *pw;
rcu_read_lock_bh();
@@ -136,25 +211,25 @@ int padata_do_parallel(struct padata_shell *ps,
if ((pinst->flags & PADATA_RESET))
goto out;
- if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM)
- goto out;
-
- err = 0;
atomic_inc(&pd->refcnt);
padata->pd = pd;
padata->cb_cpu = *cb_cpu;
- padata->seq_nr = atomic_inc_return(&pd->seq_nr);
- target_cpu = padata_cpu_hash(pd, padata->seq_nr);
- padata->cpu = target_cpu;
- queue = per_cpu_ptr(pd->pqueue, target_cpu);
-
- spin_lock(&queue->parallel.lock);
- list_add_tail(&padata->list, &queue->parallel.list);
- spin_unlock(&queue->parallel.lock);
+ rcu_read_unlock_bh();
- queue_work(pinst->parallel_wq, &queue->work);
+ spin_lock(&padata_works_lock);
+ padata->seq_nr = ++pd->seq_nr;
+ pw = padata_work_alloc();
+ spin_unlock(&padata_works_lock);
+ if (pw) {
+ padata_work_init(pw, padata_parallel_worker, padata, 0);
+ queue_work(pinst->parallel_wq, &pw->pw_work);
+ } else {
+ /* Maximum works limit exceeded, run in the current task. */
+ padata->parallel(padata);
+ }
+ return 0;
out:
rcu_read_unlock_bh();
@@ -325,8 +400,9 @@ static void padata_serial_worker(struct work_struct *serial_work)
void padata_do_serial(struct padata_priv *padata)
{
struct parallel_data *pd = padata->pd;
+ int hashed_cpu = padata_cpu_hash(pd, padata->seq_nr);
struct padata_parallel_queue *pqueue = per_cpu_ptr(pd->pqueue,
- padata->cpu);
+ hashed_cpu);
struct padata_priv *cur;
spin_lock(&pqueue->reorder.lock);
@@ -387,6 +463,98 @@ out:
return err;
}
+static void __init padata_mt_helper(struct work_struct *w)
+{
+ struct padata_work *pw = container_of(w, struct padata_work, pw_work);
+ struct padata_mt_job_state *ps = pw->pw_data;
+ struct padata_mt_job *job = ps->job;
+ bool done;
+
+ spin_lock(&ps->lock);
+
+ while (job->size > 0) {
+ unsigned long start, size, end;
+
+ start = job->start;
+ /* So end is chunk size aligned if enough work remains. */
+ size = roundup(start + 1, ps->chunk_size) - start;
+ size = min(size, job->size);
+ end = start + size;
+
+ job->start = end;
+ job->size -= size;
+
+ spin_unlock(&ps->lock);
+ job->thread_fn(start, end, job->fn_arg);
+ spin_lock(&ps->lock);
+ }
+
+ ++ps->nworks_fini;
+ done = (ps->nworks_fini == ps->nworks);
+ spin_unlock(&ps->lock);
+
+ if (done)
+ complete(&ps->completion);
+}
+
+/**
+ * padata_do_multithreaded - run a multithreaded job
+ * @job: Description of the job.
+ *
+ * See the definition of struct padata_mt_job for more details.
+ */
+void __init padata_do_multithreaded(struct padata_mt_job *job)
+{
+ /* In case threads finish at different times. */
+ static const unsigned long load_balance_factor = 4;
+ struct padata_work my_work, *pw;
+ struct padata_mt_job_state ps;
+ LIST_HEAD(works);
+ int nworks;
+
+ if (job->size == 0)
+ return;
+
+ /* Ensure at least one thread when size < min_chunk. */
+ nworks = max(job->size / job->min_chunk, 1ul);
+ nworks = min(nworks, job->max_threads);
+
+ if (nworks == 1) {
+ /* Single thread, no coordination needed, cut to the chase. */
+ job->thread_fn(job->start, job->start + job->size, job->fn_arg);
+ return;
+ }
+
+ spin_lock_init(&ps.lock);
+ init_completion(&ps.completion);
+ ps.job = job;
+ ps.nworks = padata_work_alloc_mt(nworks, &ps, &works);
+ ps.nworks_fini = 0;
+
+ /*
+ * Chunk size is the amount of work a helper does per call to the
+ * thread function. Load balance large jobs between threads by
+ * increasing the number of chunks, guarantee at least the minimum
+ * chunk size from the caller, and honor the caller's alignment.
+ */
+ ps.chunk_size = job->size / (ps.nworks * load_balance_factor);
+ ps.chunk_size = max(ps.chunk_size, job->min_chunk);
+ ps.chunk_size = roundup(ps.chunk_size, job->align);
+
+ list_for_each_entry(pw, &works, pw_list)
+ queue_work(system_unbound_wq, &pw->pw_work);
+
+ /* Use the current thread, which saves starting a workqueue worker. */
+ padata_work_init(&my_work, padata_mt_helper, &ps, PADATA_WORK_ONSTACK);
+ padata_mt_helper(&my_work.pw_work);
+
+ /* Wait for all the helpers to finish. */
+ wait_for_completion(&ps.completion);
+
+ destroy_work_on_stack(&my_work.pw_work);
+ padata_works_free(&works);
+}
+
static void __padata_list_init(struct padata_list *pd_list)
{
INIT_LIST_HEAD(&pd_list->list);
@@ -417,8 +585,6 @@ static void padata_init_pqueues(struct parallel_data *pd)
pqueue = per_cpu_ptr(pd->pqueue, cpu);
__padata_list_init(&pqueue->reorder);
- __padata_list_init(&pqueue->parallel);
- INIT_WORK(&pqueue->work, padata_parallel_worker);
atomic_set(&pqueue->num_obj, 0);
}
}
@@ -452,7 +618,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps)
padata_init_pqueues(pd);
padata_init_squeues(pd);
- atomic_set(&pd->seq_nr, -1);
+ pd->seq_nr = -1;
atomic_set(&pd->refcnt, 1);
spin_lock_init(&pd->lock);
pd->cpu = cpumask_first(pd->cpumask.pcpu);
@@ -1052,32 +1218,41 @@ void padata_free_shell(struct padata_shell *ps)
}
EXPORT_SYMBOL(padata_free_shell);
-#ifdef CONFIG_HOTPLUG_CPU
-
-static __init int padata_driver_init(void)
+void __init padata_init(void)
{
+ unsigned int i, possible_cpus;
+#ifdef CONFIG_HOTPLUG_CPU
int ret;
ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "padata:online",
padata_cpu_online, NULL);
if (ret < 0)
- return ret;
+ goto err;
hp_online = ret;
ret = cpuhp_setup_state_multi(CPUHP_PADATA_DEAD, "padata:dead",
NULL, padata_cpu_dead);
- if (ret < 0) {
- cpuhp_remove_multi_state(hp_online);
- return ret;
- }
- return 0;
-}
-module_init(padata_driver_init);
+ if (ret < 0)
+ goto remove_online_state;
+#endif
-static __exit void padata_driver_exit(void)
-{
+ possible_cpus = num_possible_cpus();
+ padata_works = kmalloc_array(possible_cpus, sizeof(struct padata_work),
+ GFP_KERNEL);
+ if (!padata_works)
+ goto remove_dead_state;
+
+ for (i = 0; i < possible_cpus; ++i)
+ list_add(&padata_works[i].pw_list, &padata_free_works);
+
+ return;
+
+remove_dead_state:
+#ifdef CONFIG_HOTPLUG_CPU
cpuhp_remove_multi_state(CPUHP_PADATA_DEAD);
+remove_online_state:
cpuhp_remove_multi_state(hp_online);
-}
-module_exit(padata_driver_exit);
+err:
#endif
+ pr_warn("padata: initialization failed\n");
+}
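A hedged usage sketch of the new multithreaded-job interface: the field names below follow struct padata_mt_job as introduced by this series in include/linux/padata.h, and the helper, ranges and numbers are purely illustrative (deferred struct-page initialisation is the intended first caller, see the mm/Kconfig change below):

#include <linux/padata.h>

static void __init example_init_range(unsigned long start, unsigned long end,
				      void *arg)
{
	/*
	 * Initialise items in [start, end); may run concurrently on
	 * several workqueue workers plus the submitting task.
	 */
}

static void __init example_parallel_init(unsigned long first, unsigned long nr)
{
	struct padata_mt_job job = {
		.thread_fn   = example_init_range,
		.fn_arg      = NULL,
		.start       = first,
		.size        = nr,
		.align       = 1,	/* no special chunk alignment */
		.min_chunk   = 1024,	/* smallest range worth a thread */
		.max_threads = 4,
	};

	padata_do_multithreaded(&job);
}

With nr = 1,000,000 the code above picks nworks = min(1000000 / 1024, 4) = 4, and with the load_balance_factor of 4 the chunk size becomes roughly 1000000 / (4 * 4) = 62500 items per thread_fn invocation.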
diff --git a/kernel/relay.c b/kernel/relay.c
index 90c7a002436d..c29a6ea6624e 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -991,14 +991,14 @@ static void relay_file_read_consume(struct rchan_buf *buf,
/*
* relay_file_read_avail - boolean, are there unconsumed bytes available?
*/
-static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
+static int relay_file_read_avail(struct rchan_buf *buf)
{
size_t subbuf_size = buf->chan->subbuf_size;
size_t n_subbufs = buf->chan->n_subbufs;
size_t produced = buf->subbufs_produced;
size_t consumed = buf->subbufs_consumed;
- relay_file_read_consume(buf, read_pos, 0);
+ relay_file_read_consume(buf, 0, 0);
consumed = buf->subbufs_consumed;
@@ -1059,23 +1059,20 @@ static size_t relay_file_read_subbuf_avail(size_t read_pos,
/**
* relay_file_read_start_pos - find the first available byte to read
- * @read_pos: file read position
* @buf: relay channel buffer
*
- * If the @read_pos is in the middle of padding, return the
+ * If the read_pos is in the middle of padding, return the
* position of the first actually available byte, otherwise
* return the original value.
*/
-static size_t relay_file_read_start_pos(size_t read_pos,
- struct rchan_buf *buf)
+static size_t relay_file_read_start_pos(struct rchan_buf *buf)
{
size_t read_subbuf, padding, padding_start, padding_end;
size_t subbuf_size = buf->chan->subbuf_size;
size_t n_subbufs = buf->chan->n_subbufs;
size_t consumed = buf->subbufs_consumed % n_subbufs;
+ size_t read_pos = consumed * subbuf_size + buf->bytes_consumed;
- if (!read_pos)
- read_pos = consumed * subbuf_size + buf->bytes_consumed;
read_subbuf = read_pos / subbuf_size;
padding = buf->padding[read_subbuf];
padding_start = (read_subbuf + 1) * subbuf_size - padding;
@@ -1131,10 +1128,10 @@ static ssize_t relay_file_read(struct file *filp,
do {
void *from;
- if (!relay_file_read_avail(buf, *ppos))
+ if (!relay_file_read_avail(buf))
break;
- read_start = relay_file_read_start_pos(*ppos, buf);
+ read_start = relay_file_read_start_pos(buf);
avail = relay_file_read_subbuf_avail(read_start, buf);
if (!avail)
break;
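To make the new read-position derivation concrete: with subbuf_size = 4096, n_subbufs = 4, subbufs_consumed = 5 and bytes_consumed = 100, consumed becomes 5 % 4 = 1 and read_pos = 1 * 4096 + 100 = 4196, so a read always resumes exactly where the previous consume left off instead of trusting the file offset supplied by the caller.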
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f3b7593eddea..3ab27022c20f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8518,18 +8518,6 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
allocate_snapshot = false;
#endif
- /*
- * Because of some magic with the way alloc_percpu() works on
- * x86_64, we need to synchronize the pgd of all the tables,
- * otherwise the trace events that happen in x86_64 page fault
- * handlers can't cope with accessing the chance that a
- * alloc_percpu()'d memory might be touched in the page fault trace
- * event. Oh, and we need to audit all other alloc_percpu() and vmalloc()
- * calls in tracing, because something might get triggered within a
- * page fault trace event!
- */
- vmalloc_sync_mappings();
-
return 0;
}
diff --git a/kernel/user.c b/kernel/user.c
index 5235d7f49982..b1635d94a1f2 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -82,7 +82,7 @@ EXPORT_SYMBOL_GPL(init_user_ns);
#define uidhashentry(uid) (uidhash_table + __uidhashfn((__kuid_val(uid))))
static struct kmem_cache *uid_cachep;
-struct hlist_head uidhash_table[UIDHASH_SZ];
+static struct hlist_head uidhash_table[UIDHASH_SZ];
/*
* The uidhash_lock is mostly taken from process context, but it is
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 53ff2c81b084..fa5aacbfd000 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -92,6 +92,26 @@ static int __init hardlockup_all_cpu_backtrace_setup(char *str)
}
__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
# endif /* CONFIG_SMP */
+
+atomic_t hardlockup_detected = ATOMIC_INIT(0);
+
+static inline void flush_hardlockup_messages(void)
+{
+ static atomic_t flushed = ATOMIC_INIT(0);
+
+ /* flush messages from hard lockup detector */
+ if (atomic_read(&hardlockup_detected) != atomic_read(&flushed)) {
+ atomic_set(&flushed, atomic_read(&hardlockup_detected));
+ printk_safe_flush();
+ }
+}
+
+#else /* CONFIG_HARDLOCKUP_DETECTOR */
+
+static inline void flush_hardlockup_messages(void)
+{
+}
+
#endif /* CONFIG_HARDLOCKUP_DETECTOR */
/*
@@ -370,6 +390,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
/* kick the hardlockup detector */
watchdog_interrupt_count();
+ flush_hardlockup_messages();
+
/* kick the softlockup detector */
if (completion_done(this_cpu_ptr(&softlockup_completion))) {
reinit_completion(this_cpu_ptr(&softlockup_completion));
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 247bf0b1582c..a546bc54f6ff 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -154,6 +154,7 @@ static void watchdog_overflow_callback(struct perf_event *event,
if (hardlockup_panic)
nmi_panic(regs, "Hard LOCKUP");
+ atomic_inc(&hardlockup_detected);
__this_cpu_write(hard_watchdog_warn, true);
return;
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 217175560434..cf77b3881a21 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -99,6 +99,7 @@ config DYNAMIC_DEBUG
default n
depends on PRINTK
depends on (DEBUG_FS || PROC_FS)
+ select DYNAMIC_DEBUG_CORE
help
Compiles debug level messages into the kernel, which would not
@@ -165,6 +166,17 @@ config DYNAMIC_DEBUG
See Documentation/admin-guide/dynamic-debug-howto.rst for additional
information.
+config DYNAMIC_DEBUG_CORE
+	bool "Enable core functionality of dynamic debug support"
+ depends on PRINTK
+ depends on (DEBUG_FS || PROC_FS)
+ help
+	  Enable the core functionality of dynamic debug. This is useful
+	  when you want to tie dynamic debug to individual kernel modules,
+	  with DYNAMIC_DEBUG_MODULE defined for each of them, for example
+	  on embedded systems where kernel image size is a concern.
+
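As a hedged illustration of the opt-in described in the help text (the module name and callsite are made up): with only DYNAMIC_DEBUG_CORE enabled, a module turns its own pr_debug() callsites into dynamic-debug callsites by defining DYNAMIC_DEBUG_MODULE before the printk headers are included, typically via ccflags in its Makefile:

/* Hypothetical module opting in to dynamic debug under DYNAMIC_DEBUG_CORE. */
#define DYNAMIC_DEBUG_MODULE	/* or -DDYNAMIC_DEBUG_MODULE in the Makefile */

#include <linux/module.h>
#include <linux/printk.h>

static int __init example_init(void)
{
	pr_debug("this callsite is controlled via dynamic debug\n");
	return 0;
}
module_init(example_init);

MODULE_LICENSE("GPL");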
config SYMBOLIC_ERRNAME
bool "Support symbolic error names in printf"
default y if PRINTK
@@ -658,6 +670,12 @@ config SCHED_STACK_END_CHECK
data corruption or a sporadic crash at a later stage once the region
is examined. The runtime overhead introduced is minimal.
+config ARCH_HAS_DEBUG_VM_PGTABLE
+ bool
+ help
+ An architecture should select this when it can successfully
+ build and run DEBUG_VM_PGTABLE.
+
config DEBUG_VM
bool "Debug VM"
depends on DEBUG_KERNEL
@@ -693,6 +711,22 @@ config DEBUG_VM_PGFLAGS
If unsure, say N.
+config DEBUG_VM_PGTABLE
+ bool "Debug arch page table for semantics compliance"
+ depends on MMU
+ depends on ARCH_HAS_DEBUG_VM_PGTABLE
+ default y if DEBUG_VM
+ help
+ This option provides a debug method which can be used to test
+	  This option provides a debug method which can be used to test
+	  architecture page table helper functions on various platforms and
+	  verify that they comply with the expected generic MM semantics.
+	  This helps architecture code ensure that any changes or new
+	  additions of these helpers still conform to the expected
+	  semantics of the generic MM. Platforms have to opt in to this
+	  through ARCH_HAS_DEBUG_VM_PGTABLE.
+ If unsure, say N.
+
config ARCH_HAS_DEBUG_VIRTUAL
bool
@@ -1562,6 +1596,12 @@ config IO_STRICT_DEVMEM
menu "$(SRCARCH) Debugging"
+config DEBUG_AID_FOR_SYZBOT
+ bool "Additional debug code for syzbot"
+ default n
+ help
+ This option is intended for testing by syzbot.
+
source "arch/$(SRCARCH)/Kconfig.debug"
endmenu
@@ -1776,6 +1816,15 @@ config KCOV_INSTRUMENT_ALL
filesystem fuzzing with AFL) then you will want to enable coverage
for more specific subsets of files, and should say n here.
+config KCOV_IRQ_AREA_SIZE
+ hex "Size of interrupt coverage collection area in words"
+ depends on KCOV
+ default 0x40000
+ help
+ KCOV uses preallocated per-cpu areas to collect coverage from
+	  soft interrupts. This specifies the size of those areas, in
+	  units of unsigned long words.
+
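For scale, the default of 0x40000 words is 256 Ki unsigned longs, i.e. 2 MiB per possible CPU on a 64-bit kernel (1 MiB on 32-bit), preallocated by kcov_init() above.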
menuconfig RUNTIME_TESTING_MENU
bool "Runtime Testing"
def_bool y
@@ -1993,6 +2042,19 @@ config TEST_LKM
If unsure, say N.
+config TEST_BITOPS
+ tristate "Test module for compilation of clear_bit/set_bit operations"
+ depends on m
+ help
+ This builds the "test_bitops" module that is much like the
+ TEST_LKM module except that it does a basic exercise of the
+ clear_bit and set_bit macros to make sure there are no compiler
+	  warnings from the C=1 sparse checker or -Wextra compilations. It
+	  has no dependencies and doesn't run or load unless explicitly
+	  requested by name, for example: modprobe test_bitops.
+
+ If unsure, say N.
+
config TEST_VMALLOC
tristate "Test module for stress/performance analysis of vmalloc allocator"
default n
@@ -2257,4 +2319,6 @@ config HYPERV_TESTING
endmenu # "Kernel Testing and Coverage"
+source "lib/Kconfig.twist"
+
endmenu # Kernel hacking
diff --git a/lib/Kconfig.twist b/lib/Kconfig.twist
new file mode 100644
index 000000000000..f20e0d003581
--- /dev/null
+++ b/lib/Kconfig.twist
@@ -0,0 +1,26 @@
+menuconfig TWIST_KERNEL_BEHAVIOR
+ bool "Twist kernel behavior"
+ help
+ Saying Y here allows modifying kernel behavior via kernel
+	  config options which become visible once this option is
+	  selected. Since these config options are intended to help
+	  e.g. fuzz testing, behavior twisted by them might be unstable.
+	  Userspace applications should not count on this option being
+	  selected.
+
+if TWIST_KERNEL_BEHAVIOR
+
+config TWIST_FOR_SYZKALLER_TESTING
+ bool "Select all twist options suitable for syzkaller testing"
+ select TWIST_DISABLE_KBD_K_SPEC_HANDLER
+ help
+ Say N unless you are building kernels for syzkaller's testing.
+
+config TWIST_DISABLE_KBD_K_SPEC_HANDLER
+ bool "Disable k_spec() function in drivers/tty/vt/keyboard.c"
+ help
+	  The k_spec() function allows triggering events such as
+	  Ctrl-Alt-Del. Such events are disruptive for fuzz testing,
+	  which wants to exercise kernel code without rebooting the
+	  system.
+
+endif # TWIST_KERNEL_BEHAVIOR
diff --git a/lib/Makefile b/lib/Makefile
index c0da0ea30654..7796f75f556d 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -79,6 +79,8 @@ obj-$(CONFIG_TEST_SORT) += test_sort.o
obj-$(CONFIG_TEST_USER_COPY) += test_user_copy.o
obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_keys.o
obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_key_base.o
+obj-$(CONFIG_TEST_BITOPS) += test_bitops.o
+CFLAGS_test_bitops.o += -Werror
obj-$(CONFIG_TEST_PRINTF) += test_printf.o
obj-$(CONFIG_TEST_BITMAP) += test_bitmap.o
obj-$(CONFIG_TEST_STRSCPY) += test_strscpy.o
@@ -191,7 +193,7 @@ lib-$(CONFIG_GENERIC_BUG) += bug.o
obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o
-obj-$(CONFIG_DYNAMIC_DEBUG) += dynamic_debug.o
+obj-$(CONFIG_DYNAMIC_DEBUG_CORE) += dynamic_debug.o
obj-$(CONFIG_SYMBOLIC_ERRNAME) += errname.o
obj-$(CONFIG_NLATTR) += nlattr.o
diff --git a/lib/cpumask.c b/lib/cpumask.c
index fb22fb266f93..2fecbcd8c160 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -6,6 +6,7 @@
#include <linux/export.h>
#include <linux/memblock.h>
#include <linux/numa.h>
+#include <linux/spinlock.h>
/**
* cpumask_next - get the next cpu in a cpumask
@@ -192,18 +193,39 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
}
#endif
-/**
- * cpumask_local_spread - select the i'th cpu with local numa cpu's first
- * @i: index number
- * @node: local numa_node
- *
- * This function selects an online CPU according to a numa aware policy;
- * local cpus are returned first, followed by non-local ones, then it
- * wraps around.
- *
- * It's not very efficient, but useful for setup.
- */
-unsigned int cpumask_local_spread(unsigned int i, int node)
+static void calc_node_distance(int *node_dist, int node)
+{
+ int i;
+
+ for (i = 0; i < nr_node_ids; i++)
+ node_dist[i] = node_distance(node, i);
+}
+
+static int find_nearest_node(int *node_dist, bool *used)
+{
+ int i, min_dist = node_dist[0], node_id = -1;
+
+ /* Choose the first unused node to compare */
+ for (i = 0; i < nr_node_ids; i++) {
+ if (used[i] == 0) {
+ min_dist = node_dist[i];
+ node_id = i;
+ break;
+ }
+ }
+
+ /* Compare and return the nearest node */
+ for (i = 0; i < nr_node_ids; i++) {
+ if (node_dist[i] < min_dist && used[i] == 0) {
+ min_dist = node_dist[i];
+ node_id = i;
+ }
+ }
+
+ return node_id;
+}
+
+static unsigned int __cpumask_local_spread(unsigned int i, int node)
{
int cpu;
@@ -231,6 +253,62 @@ unsigned int cpumask_local_spread(unsigned int i, int node)
}
BUG();
}
+
+/**
+ * cpumask_local_spread - select the i'th cpu with local numa cpu's first
+ * @i: index number
+ * @node: local numa_node
+ *
+ * This function selects an online CPU according to a numa aware policy;
+ * local cpus are returned first, followed by the nearest non-local ones,
+ * then it wraps around.
+ *
+ * It's not very efficient, but useful for setup.
+ */
+unsigned int cpumask_local_spread(unsigned int i, int node)
+{
+ static DEFINE_SPINLOCK(spread_lock);
+ static int node_dist[MAX_NUMNODES];
+ static bool used[MAX_NUMNODES];
+ unsigned long flags;
+ int cpu, j, id;
+
+ /* Wrap: we always want a cpu. */
+ i %= num_online_cpus();
+
+ if (node == NUMA_NO_NODE) {
+ for_each_cpu(cpu, cpu_online_mask)
+ if (i-- == 0)
+ return cpu;
+ } else {
+ if (nr_node_ids > MAX_NUMNODES)
+ return __cpumask_local_spread(i, node);
+
+ spin_lock_irqsave(&spread_lock, flags);
+ memset(used, 0, nr_node_ids * sizeof(bool));
+ calc_node_distance(node_dist, node);
+ for (j = 0; j < nr_node_ids; j++) {
+ id = find_nearest_node(node_dist, used);
+ if (id < 0)
+ break;
+
+ for_each_cpu_and(cpu, cpumask_of_node(id),
+ cpu_online_mask)
+ if (i-- == 0) {
+ spin_unlock_irqrestore(&spread_lock,
+ flags);
+ return cpu;
+ }
+ used[id] = 1;
+ }
+ spin_unlock_irqrestore(&spread_lock, flags);
+
+ for_each_cpu(cpu, cpu_online_mask)
+ if (i-- == 0)
+ return cpu;
+ }
+ BUG();
+}
EXPORT_SYMBOL(cpumask_local_spread);
static DEFINE_PER_CPU(int, distribute_cpu_mask_prev);
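A brief, hedged sketch of the kind of caller this change targets: a driver spreading per-queue interrupts with locality preference (the function and parameter names here are illustrative, not from this diff):

#include <linux/cpumask.h>
#include <linux/interrupt.h>

/* Hypothetical helper: place queue i as close to the device's node as possible. */
static void example_set_queue_affinity(int irq, unsigned int i, int numa_node)
{
	unsigned int cpu = cpumask_local_spread(i, numa_node);

	/* Local CPUs are used first, then CPUs of the nearest remote nodes. */
	irq_set_affinity_hint(irq, cpumask_of(cpu));
}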
diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c
index 8f199f403ab5..321437bbf87d 100644
--- a/lib/dynamic_debug.c
+++ b/lib/dynamic_debug.c
@@ -1032,8 +1032,13 @@ static int __init dynamic_debug_init(void)
int verbose_bytes = 0;
if (&__start___verbose == &__stop___verbose) {
- pr_warn("_ddebug table is empty in a CONFIG_DYNAMIC_DEBUG build\n");
- return 1;
+ if (IS_ENABLED(CONFIG_DYNAMIC_DEBUG)) {
+ pr_warn("_ddebug table is empty in a CONFIG_DYNAMIC_DEBUG build\n");
+ return 1;
+ }
+ pr_info("Ignore empty _ddebug table in a CONFIG_DYNAMIC_DEBUG_CORE build\n");
+ ddebug_init_success = 1;
+ return 0;
}
iter = __start___verbose;
modname = iter->modname;
diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c
index 7852bfff50b1..451543937524 100644
--- a/lib/flex_proportions.c
+++ b/lib/flex_proportions.c
@@ -266,8 +266,7 @@ void __fprop_inc_percpu_max(struct fprop_global *p,
if (numerator >
(((u64)denominator) * max_frac) >> FPROP_FRAC_SHIFT)
return;
- } else
- fprop_reflect_period_percpu(p, pl);
- percpu_counter_add_batch(&pl->events, 1, PROP_BATCH);
- percpu_counter_add(&p->events, 1);
+ }
+
+ __fprop_inc_percpu(p, pl);
}
diff --git a/lib/ioremap.c b/lib/ioremap.c
index 3f0e18543de8..ad485f08173b 100644
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -61,13 +61,14 @@ static inline int ioremap_pmd_enabled(void) { return 0; }
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
+ unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
+ pgtbl_mod_mask *mask)
{
pte_t *pte;
u64 pfn;
pfn = phys_addr >> PAGE_SHIFT;
- pte = pte_alloc_kernel(pmd, addr);
+ pte = pte_alloc_kernel_track(pmd, addr, mask);
if (!pte)
return -ENOMEM;
do {
@@ -75,6 +76,7 @@ static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
pfn++;
} while (pte++, addr += PAGE_SIZE, addr != end);
+ *mask |= PGTBL_PTE_MODIFIED;
return 0;
}
@@ -101,21 +103,24 @@ static int ioremap_try_huge_pmd(pmd_t *pmd, unsigned long addr,
}
static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
+ unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
+ pgtbl_mod_mask *mask)
{
pmd_t *pmd;
unsigned long next;
- pmd = pmd_alloc(&init_mm, pud, addr);
+ pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
if (!pmd)
return -ENOMEM;
do {
next = pmd_addr_end(addr, end);
- if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr, prot))
+ if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr, prot)) {
+ *mask |= PGTBL_PMD_MODIFIED;
continue;
+ }
- if (ioremap_pte_range(pmd, addr, next, phys_addr, prot))
+ if (ioremap_pte_range(pmd, addr, next, phys_addr, prot, mask))
return -ENOMEM;
} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
return 0;
@@ -144,21 +149,24 @@ static int ioremap_try_huge_pud(pud_t *pud, unsigned long addr,
}
static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
+ unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
+ pgtbl_mod_mask *mask)
{
pud_t *pud;
unsigned long next;
- pud = pud_alloc(&init_mm, p4d, addr);
+ pud = pud_alloc_track(&init_mm, p4d, addr, mask);
if (!pud)
return -ENOMEM;
do {
next = pud_addr_end(addr, end);
- if (ioremap_try_huge_pud(pud, addr, next, phys_addr, prot))
+ if (ioremap_try_huge_pud(pud, addr, next, phys_addr, prot)) {
+ *mask |= PGTBL_PUD_MODIFIED;
continue;
+ }
- if (ioremap_pmd_range(pud, addr, next, phys_addr, prot))
+ if (ioremap_pmd_range(pud, addr, next, phys_addr, prot, mask))
return -ENOMEM;
} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
return 0;
@@ -187,21 +195,24 @@ static int ioremap_try_huge_p4d(p4d_t *p4d, unsigned long addr,
}
static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
+ unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
+ pgtbl_mod_mask *mask)
{
p4d_t *p4d;
unsigned long next;
- p4d = p4d_alloc(&init_mm, pgd, addr);
+ p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
if (!p4d)
return -ENOMEM;
do {
next = p4d_addr_end(addr, end);
- if (ioremap_try_huge_p4d(p4d, addr, next, phys_addr, prot))
+ if (ioremap_try_huge_p4d(p4d, addr, next, phys_addr, prot)) {
+ *mask |= PGTBL_P4D_MODIFIED;
continue;
+ }
- if (ioremap_pud_range(p4d, addr, next, phys_addr, prot))
+ if (ioremap_pud_range(p4d, addr, next, phys_addr, prot, mask))
return -ENOMEM;
} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
return 0;
@@ -214,6 +225,7 @@ int ioremap_page_range(unsigned long addr,
unsigned long start;
unsigned long next;
int err;
+ pgtbl_mod_mask mask = 0;
might_sleep();
BUG_ON(addr >= end);
@@ -222,13 +234,17 @@ int ioremap_page_range(unsigned long addr,
pgd = pgd_offset_k(addr);
do {
next = pgd_addr_end(addr, end);
- err = ioremap_p4d_range(pgd, addr, next, phys_addr, prot);
+ err = ioremap_p4d_range(pgd, addr, next, phys_addr, prot,
+ &mask);
if (err)
break;
} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
flush_cache_vmap(start, end);
+ if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
+ arch_sync_kernel_mappings(start, end);
+
return err;
}
diff --git a/lib/lzo/lzo1x_compress.c b/lib/lzo/lzo1x_compress.c
index 717c940112f9..8ad5ba2b86e2 100644
--- a/lib/lzo/lzo1x_compress.c
+++ b/lib/lzo/lzo1x_compress.c
@@ -268,6 +268,19 @@ m_len_done:
*op++ = (M4_MARKER | ((m_off >> 11) & 8)
| (m_len - 2));
else {
+ if (unlikely(((m_off & 0x403f) == 0x403f)
+ && (m_len >= 261)
+ && (m_len <= 264))
+ && likely(bitstream_version)) {
+ // Under lzo-rle, block copies
+ // for 261 <= length <= 264 and
+ // (distance & 0x80f3) == 0x80f3
+ // can result in ambiguous
+ // output. Adjust length
+ // to 260 to prevent ambiguity.
+ ip -= m_len - 260;
+ m_len = 260;
+ }
m_len -= M4_MAX_LEN;
*op++ = (M4_MARKER | ((m_off >> 11) & 8));
while (unlikely(m_len > 255)) {
diff --git a/lib/math/prime_numbers.c b/lib/math/prime_numbers.c
index 052f5b727be7..d42cebf7407f 100644
--- a/lib/math/prime_numbers.c
+++ b/lib/math/prime_numbers.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-#define pr_fmt(fmt) "prime numbers: " fmt "\n"
+#define pr_fmt(fmt) "prime numbers: " fmt
#include <linux/module.h>
#include <linux/mutex.h>
@@ -253,7 +253,7 @@ static void dump_primes(void)
if (buf)
bitmap_print_to_pagebuf(true, buf, p->primes, p->sz);
- pr_info("primes.{last=%lu, .sz=%lu, .primes[]=...x%lx} = %s",
+ pr_info("primes.{last=%lu, .sz=%lu, .primes[]=...x%lx} = %s\n",
p->last, p->sz, p->primes[BITS_TO_LONGS(p->sz) - 1], buf);
rcu_read_unlock();
@@ -273,7 +273,7 @@ static int selftest(unsigned long max)
bool fast = is_prime_number(x);
if (slow != fast) {
- pr_err("inconsistent result for is-prime(%lu): slow=%s, fast=%s!",
+ pr_err("inconsistent result for is-prime(%lu): slow=%s, fast=%s!\n",
x, slow ? "yes" : "no", fast ? "yes" : "no");
goto err;
}
@@ -282,14 +282,14 @@ static int selftest(unsigned long max)
continue;
if (next_prime_number(last) != x) {
- pr_err("incorrect result for next-prime(%lu): expected %lu, got %lu",
+ pr_err("incorrect result for next-prime(%lu): expected %lu, got %lu\n",
last, x, next_prime_number(last));
goto err;
}
last = x;
}
- pr_info("selftest(%lu) passed, last prime was %lu", x, last);
+ pr_info("%s(%lu) passed, last prime was %lu\n", __func__, x, last);
return 0;
err:
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index 8d092609928e..0ba686b8fe57 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-#define pr_fmt(fmt) "%s: " fmt "\n", __func__
+#define pr_fmt(fmt) "%s: " fmt, __func__
#include <linux/kernel.h>
#include <linux/sched.h>
@@ -141,8 +141,8 @@ static void percpu_ref_switch_to_atomic_rcu(struct rcu_head *rcu)
for_each_possible_cpu(cpu)
count += *per_cpu_ptr(percpu_count, cpu);
- pr_debug("global %ld percpu %ld",
- atomic_long_read(&ref->count), (long)count);
+ pr_debug("global %lu percpu %lu\n",
+ atomic_long_read(&ref->count), count);
/*
* It's crucial that we sum the percpu counters _before_ adding the sum
diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c
index b90ec550183a..34696a348864 100644
--- a/lib/strncpy_from_user.c
+++ b/lib/strncpy_from_user.c
@@ -98,6 +98,7 @@ long strncpy_from_user(char *dst, const char __user *src, long count)
{
unsigned long max_addr, src_addr;
+ might_fault();
if (unlikely(count <= 0))
return 0;
diff --git a/lib/test_bitops.c b/lib/test_bitops.c
new file mode 100644
index 000000000000..fd50b3ae4a14
--- /dev/null
+++ b/lib/test_bitops.c
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Intel Corporation
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+
+/* a tiny module only meant to test set/clear_bit */
+
+/* use an enum because that's the most common BITMAP usage */
+enum bitops_fun {
+ BITOPS_4 = 4,
+ BITOPS_7 = 7,
+ BITOPS_11 = 11,
+ BITOPS_31 = 31,
+ BITOPS_88 = 88,
+ BITOPS_LAST = 255,
+ BITOPS_LENGTH = 256
+};
+
+static DECLARE_BITMAP(g_bitmap, BITOPS_LENGTH);
+
+static int __init test_bitops_startup(void)
+{
+ pr_warn("Loaded test module\n");
+ set_bit(BITOPS_4, g_bitmap);
+ set_bit(BITOPS_7, g_bitmap);
+ set_bit(BITOPS_11, g_bitmap);
+ set_bit(BITOPS_31, g_bitmap);
+ set_bit(BITOPS_88, g_bitmap);
+ return 0;
+}
+
+static void __exit test_bitops_unstartup(void)
+{
+ int bit_set;
+
+ clear_bit(BITOPS_4, g_bitmap);
+ clear_bit(BITOPS_7, g_bitmap);
+ clear_bit(BITOPS_11, g_bitmap);
+ clear_bit(BITOPS_31, g_bitmap);
+ clear_bit(BITOPS_88, g_bitmap);
+
+ bit_set = find_first_bit(g_bitmap, BITOPS_LAST);
+ if (bit_set != BITOPS_LAST)
+ pr_err("ERROR: FOUND SET BIT %d\n", bit_set);
+
+ pr_warn("Unloaded test module\n");
+}
+
+module_init(test_bitops_startup);
+module_exit(test_bitops_unstartup);
+
+MODULE_AUTHOR("Jesse Brandeburg <jesse.brandeburg@intel.com>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Bit testing module");
diff --git a/lib/test_kasan.c b/lib/test_kasan.c
index e3087d90e00d..dc2c6a51d11a 100644
--- a/lib/test_kasan.c
+++ b/lib/test_kasan.c
@@ -24,6 +24,14 @@
#include <asm/page.h>
/*
+ * We assign some test results to these globals to make sure the tests
+ * are not eliminated as dead code.
+ */
+
+int kasan_int_result;
+void *kasan_ptr_result;
+
+/*
* Note: test functions are marked noinline so that their names appear in
* reports.
*/
@@ -622,7 +630,7 @@ static noinline void __init kasan_memchr(void)
if (!ptr)
return;
- memchr(ptr, '1', size + 1);
+ kasan_ptr_result = memchr(ptr, '1', size + 1);
kfree(ptr);
}
@@ -638,7 +646,7 @@ static noinline void __init kasan_memcmp(void)
return;
memset(arr, 0, sizeof(arr));
- memcmp(ptr, arr, size+1);
+ kasan_int_result = memcmp(ptr, arr, size + 1);
kfree(ptr);
}
@@ -661,22 +669,22 @@ static noinline void __init kasan_strings(void)
* will likely point to zeroed byte.
*/
ptr += 16;
- strchr(ptr, '1');
+ kasan_ptr_result = strchr(ptr, '1');
pr_info("use-after-free in strrchr\n");
- strrchr(ptr, '1');
+ kasan_ptr_result = strrchr(ptr, '1');
pr_info("use-after-free in strcmp\n");
- strcmp(ptr, "2");
+ kasan_int_result = strcmp(ptr, "2");
pr_info("use-after-free in strncmp\n");
- strncmp(ptr, "2", 1);
+ kasan_int_result = strncmp(ptr, "2", 1);
pr_info("use-after-free in strlen\n");
- strlen(ptr);
+ kasan_int_result = strlen(ptr);
pr_info("use-after-free in strnlen\n");
- strnlen(ptr, 1);
+ kasan_int_result = strnlen(ptr, 1);
}
static noinline void __init kasan_bitops(void)
@@ -743,11 +751,12 @@ static noinline void __init kasan_bitops(void)
__test_and_change_bit(BITS_PER_LONG + BITS_PER_BYTE, bits);
pr_info("out-of-bounds in test_bit\n");
- (void)test_bit(BITS_PER_LONG + BITS_PER_BYTE, bits);
+ kasan_int_result = test_bit(BITS_PER_LONG + BITS_PER_BYTE, bits);
#if defined(clear_bit_unlock_is_negative_byte)
pr_info("out-of-bounds in clear_bit_unlock_is_negative_byte\n");
- clear_bit_unlock_is_negative_byte(BITS_PER_LONG + BITS_PER_BYTE, bits);
+ kasan_int_result = clear_bit_unlock_is_negative_byte(BITS_PER_LONG +
+ BITS_PER_BYTE, bits);
#endif
kfree(bits);
}
diff --git a/lib/test_lockup.c b/lib/test_lockup.c
index ea09ca335b21..419fbaceba73 100644
--- a/lib/test_lockup.c
+++ b/lib/test_lockup.c
@@ -142,7 +142,7 @@ module_param(reallocate_pages, bool, 0400);
MODULE_PARM_DESC(reallocate_pages, "free and allocate pages between iterations");
struct file *test_file;
-struct inode *test_inode;
+static struct inode *test_inode;
static char test_file_path[256];
module_param_string(file_path, test_file_path, sizeof(test_file_path), 0400);
MODULE_PARM_DESC(file_path, "file path to test");
diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c
index 8bbefcaddfe8..ddc9685702b1 100644
--- a/lib/test_vmalloc.c
+++ b/lib/test_vmalloc.c
@@ -91,12 +91,8 @@ static int random_size_align_alloc_test(void)
*/
size = ((rnd % 10) + 1) * PAGE_SIZE;
- ptr = __vmalloc_node_range(size, align,
- VMALLOC_START, VMALLOC_END,
- GFP_KERNEL | __GFP_ZERO,
- PAGE_KERNEL,
- 0, 0, __builtin_return_address(0));
-
+ ptr = __vmalloc_node(size, align, GFP_KERNEL | __GFP_ZERO, 0,
+ __builtin_return_address(0));
if (!ptr)
return -1;
@@ -118,12 +114,8 @@ static int align_shift_alloc_test(void)
for (i = 0; i < BITS_PER_LONG; i++) {
align = ((unsigned long) 1) << i;
- ptr = __vmalloc_node_range(PAGE_SIZE, align,
- VMALLOC_START, VMALLOC_END,
- GFP_KERNEL | __GFP_ZERO,
- PAGE_KERNEL,
- 0, 0, __builtin_return_address(0));
-
+ ptr = __vmalloc_node(PAGE_SIZE, align, GFP_KERNEL|__GFP_ZERO, 0,
+ __builtin_return_address(0));
if (!ptr)
return -1;
@@ -139,13 +131,9 @@ static int fix_align_alloc_test(void)
int i;
for (i = 0; i < test_loop_count; i++) {
- ptr = __vmalloc_node_range(5 * PAGE_SIZE,
- THREAD_ALIGN << 1,
- VMALLOC_START, VMALLOC_END,
- GFP_KERNEL | __GFP_ZERO,
- PAGE_KERNEL,
- 0, 0, __builtin_return_address(0));
-
+ ptr = __vmalloc_node(5 * PAGE_SIZE, THREAD_ALIGN << 1,
+ GFP_KERNEL | __GFP_ZERO, 0,
+ __builtin_return_address(0));
if (!ptr)
return -1;
diff --git a/lib/ubsan.c b/lib/ubsan.c
index f8c0ccf35f29..cb9af3f6b77e 100644
--- a/lib/ubsan.c
+++ b/lib/ubsan.c
@@ -189,7 +189,7 @@ static void handle_overflow(struct overflow_data *data, void *lhs,
ubsan_epilogue();
}
-void __ubsan_handle_add_overflow(struct overflow_data *data,
+void __ubsan_handle_add_overflow(void *data,
void *lhs, void *rhs)
{
@@ -197,23 +197,23 @@ void __ubsan_handle_add_overflow(struct overflow_data *data,
}
EXPORT_SYMBOL(__ubsan_handle_add_overflow);
-void __ubsan_handle_sub_overflow(struct overflow_data *data,
+void __ubsan_handle_sub_overflow(void *data,
void *lhs, void *rhs)
{
handle_overflow(data, lhs, rhs, '-');
}
EXPORT_SYMBOL(__ubsan_handle_sub_overflow);
-void __ubsan_handle_mul_overflow(struct overflow_data *data,
+void __ubsan_handle_mul_overflow(void *data,
void *lhs, void *rhs)
{
handle_overflow(data, lhs, rhs, '*');
}
EXPORT_SYMBOL(__ubsan_handle_mul_overflow);
-void __ubsan_handle_negate_overflow(struct overflow_data *data,
- void *old_val)
+void __ubsan_handle_negate_overflow(void *_data, void *old_val)
{
+ struct overflow_data *data = _data;
char old_val_str[VALUE_LENGTH];
if (suppress_report(&data->location))
@@ -231,9 +231,9 @@ void __ubsan_handle_negate_overflow(struct overflow_data *data,
EXPORT_SYMBOL(__ubsan_handle_negate_overflow);
-void __ubsan_handle_divrem_overflow(struct overflow_data *data,
- void *lhs, void *rhs)
+void __ubsan_handle_divrem_overflow(void *_data, void *lhs, void *rhs)
{
+ struct overflow_data *data = _data;
char rhs_val_str[VALUE_LENGTH];
if (suppress_report(&data->location))
@@ -326,10 +326,9 @@ void __ubsan_handle_type_mismatch(struct type_mismatch_data *data,
}
EXPORT_SYMBOL(__ubsan_handle_type_mismatch);
-void __ubsan_handle_type_mismatch_v1(struct type_mismatch_data_v1 *data,
- void *ptr)
+void __ubsan_handle_type_mismatch_v1(void *_data, void *ptr)
{
-
+ struct type_mismatch_data_v1 *data = _data;
struct type_mismatch_data_common common_data = {
.location = &data->location,
.type = data->type,
@@ -341,8 +340,9 @@ void __ubsan_handle_type_mismatch_v1(struct type_mismatch_data_v1 *data,
}
EXPORT_SYMBOL(__ubsan_handle_type_mismatch_v1);
-void __ubsan_handle_out_of_bounds(struct out_of_bounds_data *data, void *index)
+void __ubsan_handle_out_of_bounds(void *_data, void *index)
{
+ struct out_of_bounds_data *data = _data;
char index_str[VALUE_LENGTH];
if (suppress_report(&data->location))
@@ -357,9 +357,9 @@ void __ubsan_handle_out_of_bounds(struct out_of_bounds_data *data, void *index)
}
EXPORT_SYMBOL(__ubsan_handle_out_of_bounds);
-void __ubsan_handle_shift_out_of_bounds(struct shift_out_of_bounds_data *data,
- void *lhs, void *rhs)
+void __ubsan_handle_shift_out_of_bounds(void *_data, void *lhs, void *rhs)
{
+ struct shift_out_of_bounds_data *data = _data;
struct type_descriptor *rhs_type = data->rhs_type;
struct type_descriptor *lhs_type = data->lhs_type;
char rhs_str[VALUE_LENGTH];
@@ -399,8 +399,9 @@ out:
EXPORT_SYMBOL(__ubsan_handle_shift_out_of_bounds);
-void __ubsan_handle_builtin_unreachable(struct unreachable_data *data)
+void __ubsan_handle_builtin_unreachable(void *_data)
{
+ struct unreachable_data *data = _data;
ubsan_prologue(&data->location, "unreachable");
pr_err("calling __builtin_unreachable()\n");
ubsan_epilogue();
@@ -408,9 +409,9 @@ void __ubsan_handle_builtin_unreachable(struct unreachable_data *data)
}
EXPORT_SYMBOL(__ubsan_handle_builtin_unreachable);
-void __ubsan_handle_load_invalid_value(struct invalid_value_data *data,
- void *val)
+void __ubsan_handle_load_invalid_value(void *_data, void *val)
{
+ struct invalid_value_data *data = _data;
char val_str[VALUE_LENGTH];
if (suppress_report(&data->location))
diff --git a/lib/zlib_inflate/inffast.c b/lib/zlib_inflate/inffast.c
index 2c13ecc5bb2c..ed1f3df27260 100644
--- a/lib/zlib_inflate/inffast.c
+++ b/lib/zlib_inflate/inffast.c
@@ -10,17 +10,6 @@
#ifndef ASMINF
-/* Allow machine dependent optimization for post-increment or pre-increment.
- Based on testing to date,
- Pre-increment preferred for:
- - PowerPC G3 (Adler)
- - MIPS R5000 (Randers-Pehrson)
- Post-increment preferred for:
- - none
- No measurable difference:
- - Pentium III (Anderson)
- - M68060 (Nikl)
- */
union uu {
unsigned short us;
unsigned char b[2];
@@ -38,16 +27,6 @@ get_unaligned16(const unsigned short *p)
return mm.us;
}
-#ifdef POSTINC
-# define OFF 0
-# define PUP(a) *(a)++
-# define UP_UNALIGNED(a) get_unaligned16((a)++)
-#else
-# define OFF 1
-# define PUP(a) *++(a)
-# define UP_UNALIGNED(a) get_unaligned16(++(a))
-#endif
-
/*
Decode literal, length, and distance codes and write out the resulting
literal and match bytes until either not enough input or output is
@@ -115,9 +94,9 @@ void inflate_fast(z_streamp strm, unsigned start)
/* copy state to local variables */
state = (struct inflate_state *)strm->state;
- in = strm->next_in - OFF;
+ in = strm->next_in;
last = in + (strm->avail_in - 5);
- out = strm->next_out - OFF;
+ out = strm->next_out;
beg = out - (start - strm->avail_out);
end = out + (strm->avail_out - 257);
#ifdef INFLATE_STRICT
@@ -138,9 +117,9 @@ void inflate_fast(z_streamp strm, unsigned start)
input data or output space */
do {
if (bits < 15) {
- hold += (unsigned long)(PUP(in)) << bits;
+ hold += (unsigned long)(*in++) << bits;
bits += 8;
- hold += (unsigned long)(PUP(in)) << bits;
+ hold += (unsigned long)(*in++) << bits;
bits += 8;
}
this = lcode[hold & lmask];
@@ -150,14 +129,14 @@ void inflate_fast(z_streamp strm, unsigned start)
bits -= op;
op = (unsigned)(this.op);
if (op == 0) { /* literal */
- PUP(out) = (unsigned char)(this.val);
+ *out++ = (unsigned char)(this.val);
}
else if (op & 16) { /* length base */
len = (unsigned)(this.val);
op &= 15; /* number of extra bits */
if (op) {
if (bits < op) {
- hold += (unsigned long)(PUP(in)) << bits;
+ hold += (unsigned long)(*in++) << bits;
bits += 8;
}
len += (unsigned)hold & ((1U << op) - 1);
@@ -165,9 +144,9 @@ void inflate_fast(z_streamp strm, unsigned start)
bits -= op;
}
if (bits < 15) {
- hold += (unsigned long)(PUP(in)) << bits;
+ hold += (unsigned long)(*in++) << bits;
bits += 8;
- hold += (unsigned long)(PUP(in)) << bits;
+ hold += (unsigned long)(*in++) << bits;
bits += 8;
}
this = dcode[hold & dmask];
@@ -180,10 +159,10 @@ void inflate_fast(z_streamp strm, unsigned start)
dist = (unsigned)(this.val);
op &= 15; /* number of extra bits */
if (bits < op) {
- hold += (unsigned long)(PUP(in)) << bits;
+ hold += (unsigned long)(*in++) << bits;
bits += 8;
if (bits < op) {
- hold += (unsigned long)(PUP(in)) << bits;
+ hold += (unsigned long)(*in++) << bits;
bits += 8;
}
}
@@ -205,13 +184,13 @@ void inflate_fast(z_streamp strm, unsigned start)
state->mode = BAD;
break;
}
- from = window - OFF;
+ from = window;
if (write == 0) { /* very common case */
from += wsize - op;
if (op < len) { /* some from window */
len -= op;
do {
- PUP(out) = PUP(from);
+ *out++ = *from++;
} while (--op);
from = out - dist; /* rest from output */
}
@@ -222,14 +201,14 @@ void inflate_fast(z_streamp strm, unsigned start)
if (op < len) { /* some from end of window */
len -= op;
do {
- PUP(out) = PUP(from);
+ *out++ = *from++;
} while (--op);
- from = window - OFF;
+ from = window;
if (write < len) { /* some from start of window */
op = write;
len -= op;
do {
- PUP(out) = PUP(from);
+ *out++ = *from++;
} while (--op);
from = out - dist; /* rest from output */
}
@@ -240,21 +219,21 @@ void inflate_fast(z_streamp strm, unsigned start)
if (op < len) { /* some from window */
len -= op;
do {
- PUP(out) = PUP(from);
+ *out++ = *from++;
} while (--op);
from = out - dist; /* rest from output */
}
}
while (len > 2) {
- PUP(out) = PUP(from);
- PUP(out) = PUP(from);
- PUP(out) = PUP(from);
+ *out++ = *from++;
+ *out++ = *from++;
+ *out++ = *from++;
len -= 3;
}
if (len) {
- PUP(out) = PUP(from);
+ *out++ = *from++;
if (len > 1)
- PUP(out) = PUP(from);
+ *out++ = *from++;
}
}
else {
@@ -264,29 +243,29 @@ void inflate_fast(z_streamp strm, unsigned start)
from = out - dist; /* copy direct from output */
/* minimum length is three */
/* Align out addr */
- if (!((long)(out - 1 + OFF) & 1)) {
- PUP(out) = PUP(from);
+ if (!((long)(out - 1) & 1)) {
+ *out++ = *from++;
len--;
}
- sout = (unsigned short *)(out - OFF);
+ sout = (unsigned short *)(out);
if (dist > 2) {
unsigned short *sfrom;
- sfrom = (unsigned short *)(from - OFF);
+ sfrom = (unsigned short *)(from);
loops = len >> 1;
do
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
- PUP(sout) = PUP(sfrom);
+ *sout++ = *sfrom++;
#else
- PUP(sout) = UP_UNALIGNED(sfrom);
+ *sout++ = get_unaligned16(sfrom++);
#endif
while (--loops);
- out = (unsigned char *)sout + OFF;
- from = (unsigned char *)sfrom + OFF;
+ out = (unsigned char *)sout;
+ from = (unsigned char *)sfrom;
} else { /* dist == 1 or dist == 2 */
unsigned short pat16;
- pat16 = *(sout-1+OFF);
+ pat16 = *(sout-1);
if (dist == 1) {
union uu mm;
/* copy one char pattern to both bytes */
@@ -296,12 +275,12 @@ void inflate_fast(z_streamp strm, unsigned start)
}
loops = len >> 1;
do
- PUP(sout) = pat16;
+ *sout++ = pat16;
while (--loops);
- out = (unsigned char *)sout + OFF;
+ out = (unsigned char *)sout;
}
if (len & 1)
- PUP(out) = PUP(from);
+ *out++ = *from++;
}
}
else if ((op & 64) == 0) { /* 2nd level distance code */
@@ -336,8 +315,8 @@ void inflate_fast(z_streamp strm, unsigned start)
hold &= (1U << bits) - 1;
/* update state and return */
- strm->next_in = in + OFF;
- strm->next_out = out + OFF;
+ strm->next_in = in;
+ strm->next_out = out;
strm->avail_in = (unsigned)(in < last ? 5 + (last - in) : 5 - (in - last));
strm->avail_out = (unsigned)(out < end ?
257 + (end - out) : 257 - (out - end));
diff --git a/mm/Kconfig b/mm/Kconfig
index c1acc34c1c35..5b28240d2af8 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -126,9 +126,6 @@ config SPARSEMEM_VMEMMAP
pfn_to_page and page_to_pfn operations. This is the most
efficient option when sufficient kernel resources are available.
-config HAVE_MEMBLOCK_NODE_MAP
- bool
-
config HAVE_MEMBLOCK_PHYS_MAP
bool
@@ -136,6 +133,9 @@ config HAVE_FAST_GUP
depends on MMU
bool
+# Don't discard allocated memory used to track "memory" and "reserved" memblocks
+# after early boot, so it can still be used to test for validity of memory.
+# Also, memblocks are updated with memory hot(un)plug.
config ARCH_KEEP_MEMBLOCK
bool
@@ -705,9 +705,9 @@ config ZSMALLOC
returned by an alloc(). This handle must be mapped in order to
access the allocated space.
-config PGTABLE_MAPPING
+config ZSMALLOC_PGTABLE_MAPPING
bool "Use page table mapping to access object in zsmalloc"
- depends on ZSMALLOC
+ depends on ZSMALLOC=y
help
By default, zsmalloc uses a copy-based object mapping method to
access allocations that span two pages. However, if a particular
@@ -750,13 +750,13 @@ config DEFERRED_STRUCT_PAGE_INIT
depends on SPARSEMEM
depends on !NEED_PER_CPU_KM
depends on 64BIT
+ select PADATA
help
Ordinarily all struct pages are initialised during early boot in a
single thread. On very large machines this can take a considerable
amount of time. If this option is set, large machines will bring up
- a subset of memmap at boot and then initialise the rest in parallel
- by starting one-off "pgdatinitX" kernel thread for each node X. This
- has a potential performance impact on processes running early in the
+ a subset of memmap at boot and then initialise the rest in parallel.
+ This has a potential performance impact on tasks running early in the
lifetime of the system until these kthreads finish the
initialisation.
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 0271b22e063f..2409f7fc1567 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -118,6 +118,38 @@ config DEBUG_RODATA_TEST
---help---
This option enables a testcase for the setting rodata read-only.
+config ARCH_HAS_DEBUG_WX
+ bool
+
+config DEBUG_WX
+ bool "Warn on W+X mappings at boot"
+ depends on ARCH_HAS_DEBUG_WX
+ depends on MMU
+ select PTDUMP_CORE
+ help
+ Generate a warning if any W+X mappings are found at boot.
+
+ This is useful for discovering cases where the kernel is leaving W+X
+ mappings after applying NX, as such mappings are a security risk.
+
+ Look for a message in dmesg output like this:
+
+ <arch>/mm: Checked W+X mappings: passed, no W+X pages found.
+
+ or like this, if the check failed:
+
+ <arch>/mm: Checked W+X mappings: failed, <N> W+X pages found.
+
+	  Note that even if the check fails, your kernel is possibly
+	  still fine: W+X mappings are not a security hole in
+	  themselves, but they make the exploitation of other unfixed
+	  kernel bugs easier.
+
+ There is no runtime or memory usage effect of this option
+ once the kernel has booted up - it's a one time check.
+
+ If in doubt, say "Y".
+
config GENERIC_PTDUMP
bool
diff --git a/mm/Makefile b/mm/Makefile
index 7881b8ede627..fa91e963c2f9 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -96,6 +96,7 @@ obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o
+obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o
obj-$(CONFIG_PAGE_OWNER) += page_owner.o
obj-$(CONFIG_CLEANCACHE) += cleancache.o
obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
diff --git a/mm/compaction.c b/mm/compaction.c
index d8cfb7b99a83..9ce4cff4d407 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -935,12 +935,16 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
}
/*
- * Migration will fail if an anonymous page is pinned in memory,
+ * Migration will fail if a page is pinned in memory,
* so avoid taking lru_lock and isolating it unnecessarily in an
- * admittedly racy check.
+ * admittedly racy check, the simplest case for 0-order pages.
+ *
+ * Open code page_mapcount() to avoid VM_BUG_ON(PageSlab(page)).
+ * Page could have extra reference from mapping or swap cache.
*/
- if (!page_mapping(page) &&
- page_count(page) > page_mapcount(page))
+ if (!PageCompound(page) &&
+ page_count(page) > atomic_read(&page->_mapcount) + 1 +
+ (!PageAnon(page) || PageSwapCache(page)))
goto isolate_fail;
/*
@@ -975,6 +979,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
low_pfn += compound_nr(page) - 1;
goto isolate_fail;
}
+
+ /* Recheck page extra references under lock */
+ if (page_count(page) > page_mapcount(page) +
+ (!PageAnon(page) || PageSwapCache(page)))
+ goto isolate_fail;
}
lruvec = mem_cgroup_page_lruvec(page, pgdat);
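
The open-coded check above compares the page's reference count against the references the kernel is expected to hold: one per PTE mapping, plus one if a file mapping or the swap cache also references the page. That arithmetic in isolation, as a stand-alone C sketch with made-up values (illustrative only, not kernel code):

	#include <stdbool.h>
	#include <stdio.h>

	/* Expected refs: one per PTE mapping, plus one if a file mapping
	 * or the swap cache also holds the page. Anything above that is
	 * treated as an extra pin. */
	static bool looks_pinned(int refcount, int mapcount, bool file_or_swapcache)
	{
		int expected = mapcount + (file_or_swapcache ? 1 : 0);

		return refcount > expected;
	}

	int main(void)
	{
		/* Anonymous page mapped once, no extra user: not pinned. */
		printf("%d\n", looks_pinned(1, 1, false));
		/* Same page with an additional get_user_pages() reference: pinned. */
		printf("%d\n", looks_pinned(2, 1, false));
		return 0;
	}
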
@@ -1401,7 +1410,7 @@ fast_isolate_freepages(struct compact_control *cc)
if (scan_start) {
/*
* Use the highest PFN found above min. If one was
- * not found, be pessemistic for direct compaction
+ * not found, be pessimistic for direct compaction
* and use the min mark.
*/
if (highest) {
@@ -1409,7 +1418,9 @@ fast_isolate_freepages(struct compact_control *cc)
cc->free_pfn = highest;
} else {
if (cc->direct_compaction && pfn_valid(min_pfn)) {
- page = pfn_to_page(min_pfn);
+ page = pageblock_pfn_to_page(min_pfn,
+ pageblock_end_pfn(min_pfn),
+ cc->zone);
cc->free_pfn = min_pfn;
}
}
@@ -1966,7 +1977,7 @@ static enum compact_result compact_finished(struct compact_control *cc)
*/
static enum compact_result __compaction_suitable(struct zone *zone, int order,
unsigned int alloc_flags,
- int classzone_idx,
+ int highest_zoneidx,
unsigned long wmark_target)
{
unsigned long watermark;
@@ -1979,7 +1990,7 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
* If watermarks for high-order allocation are already met, there
* should be no need for compaction at all.
*/
- if (zone_watermark_ok(zone, order, watermark, classzone_idx,
+ if (zone_watermark_ok(zone, order, watermark, highest_zoneidx,
alloc_flags))
return COMPACT_SUCCESS;
@@ -1989,9 +2000,9 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
* watermark and alloc_flags have to match, or be more pessimistic than
* the check in __isolate_free_page(). We don't use the direct
* compactor's alloc_flags, as they are not relevant for freepage
- * isolation. We however do use the direct compactor's classzone_idx to
- * skip over zones where lowmem reserves would prevent allocation even
- * if compaction succeeds.
+ * isolation. We however do use the direct compactor's highest_zoneidx
+ * to skip over zones where lowmem reserves would prevent allocation
+ * even if compaction succeeds.
* For costly orders, we require low watermark instead of min for
* compaction to proceed to increase its chances.
* ALLOC_CMA is used, as pages in CMA pageblocks are considered
@@ -2000,7 +2011,7 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
low_wmark_pages(zone) : min_wmark_pages(zone);
watermark += compact_gap(order);
- if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx,
+ if (!__zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
ALLOC_CMA, wmark_target))
return COMPACT_SKIPPED;
@@ -2009,12 +2020,12 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
enum compact_result compaction_suitable(struct zone *zone, int order,
unsigned int alloc_flags,
- int classzone_idx)
+ int highest_zoneidx)
{
enum compact_result ret;
int fragindex;
- ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx,
+ ret = __compaction_suitable(zone, order, alloc_flags, highest_zoneidx,
zone_page_state(zone, NR_FREE_PAGES));
/*
* fragmentation index determines if allocation failures are due to
@@ -2055,8 +2066,8 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
* Make sure at least one zone would pass __compaction_suitable if we continue
* retrying the reclaim.
*/
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
- ac->nodemask) {
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
+ ac->highest_zoneidx, ac->nodemask) {
unsigned long available;
enum compact_result compact_result;
@@ -2069,7 +2080,7 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
available = zone_reclaimable_pages(zone) / order;
available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
compact_result = __compaction_suitable(zone, order, alloc_flags,
- ac_classzone_idx(ac), available);
+ ac->highest_zoneidx, available);
if (compact_result != COMPACT_SKIPPED)
return true;
}
@@ -2098,9 +2109,9 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
INIT_LIST_HEAD(&cc->freepages);
INIT_LIST_HEAD(&cc->migratepages);
- cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask);
+ cc->migratetype = gfp_migratetype(cc->gfp_mask);
ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags,
- cc->classzone_idx);
+ cc->highest_zoneidx);
/* Compaction is likely to fail */
if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
return ret;
@@ -2295,7 +2306,7 @@ out:
static enum compact_result compact_zone_order(struct zone *zone, int order,
gfp_t gfp_mask, enum compact_priority prio,
- unsigned int alloc_flags, int classzone_idx,
+ unsigned int alloc_flags, int highest_zoneidx,
struct page **capture)
{
enum compact_result ret;
@@ -2307,7 +2318,7 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
.mode = (prio == COMPACT_PRIO_ASYNC) ?
MIGRATE_ASYNC : MIGRATE_SYNC_LIGHT,
.alloc_flags = alloc_flags,
- .classzone_idx = classzone_idx,
+ .highest_zoneidx = highest_zoneidx,
.direct_compaction = true,
.whole_zone = (prio == MIN_COMPACT_PRIORITY),
.ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY),
@@ -2363,8 +2374,8 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio);
/* Compact each zone in the list */
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
- ac->nodemask) {
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
+ ac->highest_zoneidx, ac->nodemask) {
enum compact_result status;
if (prio > MIN_COMPACT_PRIORITY
@@ -2374,7 +2385,7 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
}
status = compact_zone_order(zone, order, gfp_mask, prio,
- alloc_flags, ac_classzone_idx(ac), capture);
+ alloc_flags, ac->highest_zoneidx, capture);
rc = max(status, rc);
/* The allocation should succeed, stop compacting */
@@ -2509,16 +2520,16 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat)
{
int zoneid;
struct zone *zone;
- enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx;
+ enum zone_type highest_zoneidx = pgdat->kcompactd_highest_zoneidx;
- for (zoneid = 0; zoneid <= classzone_idx; zoneid++) {
+ for (zoneid = 0; zoneid <= highest_zoneidx; zoneid++) {
zone = &pgdat->node_zones[zoneid];
if (!populated_zone(zone))
continue;
if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0,
- classzone_idx) == COMPACT_CONTINUE)
+ highest_zoneidx) == COMPACT_CONTINUE)
return true;
}
@@ -2536,16 +2547,16 @@ static void kcompactd_do_work(pg_data_t *pgdat)
struct compact_control cc = {
.order = pgdat->kcompactd_max_order,
.search_order = pgdat->kcompactd_max_order,
- .classzone_idx = pgdat->kcompactd_classzone_idx,
+ .highest_zoneidx = pgdat->kcompactd_highest_zoneidx,
.mode = MIGRATE_SYNC_LIGHT,
.ignore_skip_hint = false,
.gfp_mask = GFP_KERNEL,
};
trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
- cc.classzone_idx);
+ cc.highest_zoneidx);
count_compact_event(KCOMPACTD_WAKE);
- for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) {
+ for (zoneid = 0; zoneid <= cc.highest_zoneidx; zoneid++) {
int status;
zone = &pgdat->node_zones[zoneid];
@@ -2594,16 +2605,16 @@ static void kcompactd_do_work(pg_data_t *pgdat)
/*
* Regardless of success, we are done until woken up next. But remember
- * the requested order/classzone_idx in case it was higher/tighter than
- * our current ones
+ * the requested order/highest_zoneidx in case it was higher/tighter
+ * than our current ones
*/
if (pgdat->kcompactd_max_order <= cc.order)
pgdat->kcompactd_max_order = 0;
- if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx)
- pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
+ if (pgdat->kcompactd_highest_zoneidx >= cc.highest_zoneidx)
+ pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1;
}
-void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
+void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx)
{
if (!order)
return;
@@ -2611,8 +2622,8 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
if (pgdat->kcompactd_max_order < order)
pgdat->kcompactd_max_order = order;
- if (pgdat->kcompactd_classzone_idx > classzone_idx)
- pgdat->kcompactd_classzone_idx = classzone_idx;
+ if (pgdat->kcompactd_highest_zoneidx > highest_zoneidx)
+ pgdat->kcompactd_highest_zoneidx = highest_zoneidx;
/*
* Pairs with implicit barrier in wait_event_freezable()
@@ -2625,7 +2636,7 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
return;
trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order,
- classzone_idx);
+ highest_zoneidx);
wake_up_interruptible(&pgdat->kcompactd_wait);
}
@@ -2646,7 +2657,7 @@ static int kcompactd(void *p)
set_freezable();
pgdat->kcompactd_max_order = 0;
- pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
+ pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1;
while (!kthread_should_stop()) {
unsigned long pflags;
diff --git a/mm/debug.c b/mm/debug.c
index 2189357f0987..f2ede2df585a 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -110,13 +110,57 @@ void __dump_page(struct page *page, const char *reason)
else if (PageAnon(page))
type = "anon ";
else if (mapping) {
- if (mapping->host && mapping->host->i_dentry.first) {
- struct dentry *dentry;
- dentry = container_of(mapping->host->i_dentry.first, struct dentry, d_u.d_alias);
- pr_warn("%ps name:\"%pd\"\n", mapping->a_ops, dentry);
- } else
- pr_warn("%ps\n", mapping->a_ops);
+ const struct inode *host;
+ const struct address_space_operations *a_ops;
+ const struct hlist_node *dentry_first;
+ const struct dentry *dentry_ptr;
+ struct dentry dentry;
+
+ /*
+ * mapping can be an invalid pointer and we don't want to crash
+ * accessing it, so probe everything depending on it carefully
+ */
+ if (probe_kernel_read_strict(&host, &mapping->host,
+ sizeof(struct inode *)) ||
+ probe_kernel_read_strict(&a_ops, &mapping->a_ops,
+ sizeof(struct address_space_operations *))) {
+ pr_warn("failed to read mapping->host or a_ops, mapping not a valid kernel address?\n");
+ goto out_mapping;
+ }
+
+ if (!host) {
+ pr_warn("mapping->a_ops:%ps\n", a_ops);
+ goto out_mapping;
+ }
+
+ if (probe_kernel_read_strict(&dentry_first,
+ &host->i_dentry.first, sizeof(struct hlist_node *))) {
+ pr_warn("mapping->a_ops:%ps with invalid mapping->host inode address %px\n",
+ a_ops, host);
+ goto out_mapping;
+ }
+
+ if (!dentry_first) {
+ pr_warn("mapping->a_ops:%ps\n", a_ops);
+ goto out_mapping;
+ }
+
+ dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias);
+ if (probe_kernel_read_strict(&dentry, dentry_ptr,
+ sizeof(struct dentry))) {
+ pr_warn("mapping->aops:%ps with invalid mapping->host->i_dentry.first %px\n",
+ a_ops, dentry_ptr);
+ } else {
+ /*
+ * if dentry is corrupted, the %pd handler may still
+ * crash, but it's unlikely that we reach here with a
+ * corrupted struct page
+ */
+ pr_warn("mapping->aops:%ps dentry name:\"%pd\"\n",
+ a_ops, &dentry);
+ }
}
+out_mapping:
BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);
pr_warn("%sflags: %#lx(%pGp)%s\n", type, page->flags, &page->flags,
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
new file mode 100644
index 000000000000..188c18908964
--- /dev/null
+++ b/mm/debug_vm_pgtable.c
@@ -0,0 +1,382 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * This kernel test validates architecture page table helpers and
+ * accessors and helps in verifying their continued compliance with
+ * expected generic MM semantics.
+ *
+ * Copyright (C) 2019 ARM Ltd.
+ *
+ * Author: Anshuman Khandual <anshuman.khandual@arm.com>
+ */
+#define pr_fmt(fmt) "debug_vm_pgtable: %s: " fmt, __func__
+
+#include <linux/gfp.h>
+#include <linux/highmem.h>
+#include <linux/hugetlb.h>
+#include <linux/kernel.h>
+#include <linux/kconfig.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/mm_types.h>
+#include <linux/module.h>
+#include <linux/pfn_t.h>
+#include <linux/printk.h>
+#include <linux/random.h>
+#include <linux/spinlock.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/start_kernel.h>
+#include <linux/sched/mm.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+
+#define VMFLAGS (VM_READ|VM_WRITE|VM_EXEC)
+
+/*
+ * On the s390 platform, the lower 4 bits are used to identify a given page
+ * table entry type. But these bits might affect the ability to clear entries
+ * with pxx_clear() because of how dynamic page table folding works on s390.
+ * So while loading up the entries, do not change the lower 4 bits. This does
+ * not affect any other platform.
+ */
+#define S390_MASK_BITS 4
+#define RANDOM_ORVALUE GENMASK(BITS_PER_LONG - 1, S390_MASK_BITS)
+#define RANDOM_NZVALUE GENMASK(7, 0)
+
+static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot)
+{
+ pte_t pte = pfn_pte(pfn, prot);
+
+ WARN_ON(!pte_same(pte, pte));
+ WARN_ON(!pte_young(pte_mkyoung(pte_mkold(pte))));
+ WARN_ON(!pte_dirty(pte_mkdirty(pte_mkclean(pte))));
+ WARN_ON(!pte_write(pte_mkwrite(pte_wrprotect(pte))));
+ WARN_ON(pte_young(pte_mkold(pte_mkyoung(pte))));
+ WARN_ON(pte_dirty(pte_mkclean(pte_mkdirty(pte))));
+ WARN_ON(pte_write(pte_wrprotect(pte_mkwrite(pte))));
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
+{
+ pmd_t pmd = pfn_pmd(pfn, prot);
+
+ WARN_ON(!pmd_same(pmd, pmd));
+ WARN_ON(!pmd_young(pmd_mkyoung(pmd_mkold(pmd))));
+ WARN_ON(!pmd_dirty(pmd_mkdirty(pmd_mkclean(pmd))));
+ WARN_ON(!pmd_write(pmd_mkwrite(pmd_wrprotect(pmd))));
+ WARN_ON(pmd_young(pmd_mkold(pmd_mkyoung(pmd))));
+ WARN_ON(pmd_dirty(pmd_mkclean(pmd_mkdirty(pmd))));
+ WARN_ON(pmd_write(pmd_wrprotect(pmd_mkwrite(pmd))));
+ /*
+ * A huge page does not point to next level page table
+ * entry. Hence this must qualify as pmd_bad().
+ */
+ WARN_ON(!pmd_bad(pmd_mkhuge(pmd)));
+}
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot)
+{
+ pud_t pud = pfn_pud(pfn, prot);
+
+ WARN_ON(!pud_same(pud, pud));
+ WARN_ON(!pud_young(pud_mkyoung(pud_mkold(pud))));
+ WARN_ON(!pud_write(pud_mkwrite(pud_wrprotect(pud))));
+ WARN_ON(pud_write(pud_wrprotect(pud_mkwrite(pud))));
+ WARN_ON(pud_young(pud_mkold(pud_mkyoung(pud))));
+
+ if (mm_pmd_folded(mm))
+ return;
+
+ /*
+ * A huge page does not point to next level page table
+ * entry. Hence this must qualify as pud_bad().
+ */
+ WARN_ON(!pud_bad(pud_mkhuge(pud)));
+}
+#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+static void __init p4d_basic_tests(unsigned long pfn, pgprot_t prot)
+{
+ p4d_t p4d;
+
+ memset(&p4d, RANDOM_NZVALUE, sizeof(p4d_t));
+ WARN_ON(!p4d_same(p4d, p4d));
+}
+
+static void __init pgd_basic_tests(unsigned long pfn, pgprot_t prot)
+{
+ pgd_t pgd;
+
+ memset(&pgd, RANDOM_NZVALUE, sizeof(pgd_t));
+ WARN_ON(!pgd_same(pgd, pgd));
+}
+
+#ifndef __PAGETABLE_PUD_FOLDED
+static void __init pud_clear_tests(struct mm_struct *mm, pud_t *pudp)
+{
+ pud_t pud = READ_ONCE(*pudp);
+
+ if (mm_pmd_folded(mm))
+ return;
+
+ pud = __pud(pud_val(pud) | RANDOM_ORVALUE);
+ WRITE_ONCE(*pudp, pud);
+ pud_clear(pudp);
+ pud = READ_ONCE(*pudp);
+ WARN_ON(!pud_none(pud));
+}
+
+static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp,
+ pmd_t *pmdp)
+{
+ pud_t pud;
+
+ if (mm_pmd_folded(mm))
+ return;
+ /*
+ * This entry points to next level page table page.
+ * Hence this must not qualify as pud_bad().
+ */
+ pmd_clear(pmdp);
+ pud_clear(pudp);
+ pud_populate(mm, pudp, pmdp);
+ pud = READ_ONCE(*pudp);
+ WARN_ON(pud_bad(pud));
+}
+#else /* !__PAGETABLE_PUD_FOLDED */
+static void __init pud_clear_tests(struct mm_struct *mm, pud_t *pudp) { }
+static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp,
+ pmd_t *pmdp)
+{
+}
+#endif /* __PAGETABLE_PUD_FOLDED */
+
+#ifndef __PAGETABLE_P4D_FOLDED
+static void __init p4d_clear_tests(struct mm_struct *mm, p4d_t *p4dp)
+{
+ p4d_t p4d = READ_ONCE(*p4dp);
+
+ if (mm_pud_folded(mm))
+ return;
+
+ p4d = __p4d(p4d_val(p4d) | RANDOM_ORVALUE);
+ WRITE_ONCE(*p4dp, p4d);
+ p4d_clear(p4dp);
+ p4d = READ_ONCE(*p4dp);
+ WARN_ON(!p4d_none(p4d));
+}
+
+static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp,
+ pud_t *pudp)
+{
+ p4d_t p4d;
+
+ if (mm_pud_folded(mm))
+ return;
+
+ /*
+ * This entry points to next level page table page.
+ * Hence this must not qualify as p4d_bad().
+ */
+ pud_clear(pudp);
+ p4d_clear(p4dp);
+ p4d_populate(mm, p4dp, pudp);
+ p4d = READ_ONCE(*p4dp);
+ WARN_ON(p4d_bad(p4d));
+}
+
+static void __init pgd_clear_tests(struct mm_struct *mm, pgd_t *pgdp)
+{
+ pgd_t pgd = READ_ONCE(*pgdp);
+
+ if (mm_p4d_folded(mm))
+ return;
+
+ pgd = __pgd(pgd_val(pgd) | RANDOM_ORVALUE);
+ WRITE_ONCE(*pgdp, pgd);
+ pgd_clear(pgdp);
+ pgd = READ_ONCE(*pgdp);
+ WARN_ON(!pgd_none(pgd));
+}
+
+static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp,
+ p4d_t *p4dp)
+{
+ pgd_t pgd;
+
+ if (mm_p4d_folded(mm))
+ return;
+
+ /*
+ * This entry points to next level page table page.
+ * Hence this must not qualify as pgd_bad().
+ */
+ p4d_clear(p4dp);
+ pgd_clear(pgdp);
+ pgd_populate(mm, pgdp, p4dp);
+ pgd = READ_ONCE(*pgdp);
+ WARN_ON(pgd_bad(pgd));
+}
+#else /* !__PAGETABLE_P4D_FOLDED */
+static void __init p4d_clear_tests(struct mm_struct *mm, p4d_t *p4dp) { }
+static void __init pgd_clear_tests(struct mm_struct *mm, pgd_t *pgdp) { }
+static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp,
+ pud_t *pudp)
+{
+}
+static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp,
+ p4d_t *p4dp)
+{
+}
+#endif /* __PAGETABLE_P4D_FOLDED */
+
+static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep,
+ unsigned long vaddr)
+{
+ pte_t pte = READ_ONCE(*ptep);
+
+ pte = __pte(pte_val(pte) | RANDOM_ORVALUE);
+ set_pte_at(mm, vaddr, ptep, pte);
+ barrier();
+ pte_clear(mm, vaddr, ptep);
+ pte = READ_ONCE(*ptep);
+ WARN_ON(!pte_none(pte));
+}
+
+static void __init pmd_clear_tests(struct mm_struct *mm, pmd_t *pmdp)
+{
+ pmd_t pmd = READ_ONCE(*pmdp);
+
+ pmd = __pmd(pmd_val(pmd) | RANDOM_ORVALUE);
+ WRITE_ONCE(*pmdp, pmd);
+ pmd_clear(pmdp);
+ pmd = READ_ONCE(*pmdp);
+ WARN_ON(!pmd_none(pmd));
+}
+
+static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp,
+ pgtable_t pgtable)
+{
+ pmd_t pmd;
+
+ /*
+ * This entry points to next level page table page.
+ * Hence this must not qualify as pmd_bad().
+ */
+ pmd_clear(pmdp);
+ pmd_populate(mm, pmdp, pgtable);
+ pmd = READ_ONCE(*pmdp);
+ WARN_ON(pmd_bad(pmd));
+}
+
+static unsigned long __init get_random_vaddr(void)
+{
+ unsigned long random_vaddr, random_pages, total_user_pages;
+
+ total_user_pages = (TASK_SIZE - FIRST_USER_ADDRESS) / PAGE_SIZE;
+
+ random_pages = get_random_long() % total_user_pages;
+ random_vaddr = FIRST_USER_ADDRESS + random_pages * PAGE_SIZE;
+
+ return random_vaddr;
+}
+
+static int __init debug_vm_pgtable(void)
+{
+ struct mm_struct *mm;
+ pgd_t *pgdp;
+ p4d_t *p4dp, *saved_p4dp;
+ pud_t *pudp, *saved_pudp;
+ pmd_t *pmdp, *saved_pmdp, pmd;
+ pte_t *ptep;
+ pgtable_t saved_ptep;
+ pgprot_t prot;
+ phys_addr_t paddr;
+ unsigned long vaddr, pte_aligned, pmd_aligned;
+ unsigned long pud_aligned, p4d_aligned, pgd_aligned;
+ spinlock_t *uninitialized_var(ptl);
+
+ pr_info("Validating architecture page table helpers\n");
+ prot = vm_get_page_prot(VMFLAGS);
+ vaddr = get_random_vaddr();
+ mm = mm_alloc();
+ if (!mm) {
+ pr_err("mm_struct allocation failed\n");
+ return 1;
+ }
+
+ /*
+ * PFN for mapping at PTE level is determined from a standard kernel
+ * text symbol. But pfns for higher page table levels are derived by
+ * masking lower bits of this real pfn. These derived pfns might not
+ * exist on the platform but that does not really matter as pfn_pxx()
+ * helpers will still create appropriate entries for the test. This
+ * helps avoid large memory block allocations to be used for mapping
+ * at higher page table levels.
+ */
+ paddr = __pa_symbol(&start_kernel);
+
+ pte_aligned = (paddr & PAGE_MASK) >> PAGE_SHIFT;
+ pmd_aligned = (paddr & PMD_MASK) >> PAGE_SHIFT;
+ pud_aligned = (paddr & PUD_MASK) >> PAGE_SHIFT;
+ p4d_aligned = (paddr & P4D_MASK) >> PAGE_SHIFT;
+ pgd_aligned = (paddr & PGDIR_MASK) >> PAGE_SHIFT;
+ WARN_ON(!pfn_valid(pte_aligned));
+
+ pgdp = pgd_offset(mm, vaddr);
+ p4dp = p4d_alloc(mm, pgdp, vaddr);
+ pudp = pud_alloc(mm, p4dp, vaddr);
+ pmdp = pmd_alloc(mm, pudp, vaddr);
+ ptep = pte_alloc_map_lock(mm, pmdp, vaddr, &ptl);
+
+ /*
+ * Save all the page table page addresses as the page table
+ * entries will be used for testing with random or garbage
+ * values. These saved addresses will be used for freeing
+ * page table pages.
+ */
+ pmd = READ_ONCE(*pmdp);
+ saved_p4dp = p4d_offset(pgdp, 0UL);
+ saved_pudp = pud_offset(p4dp, 0UL);
+ saved_pmdp = pmd_offset(pudp, 0UL);
+ saved_ptep = pmd_pgtable(pmd);
+
+ pte_basic_tests(pte_aligned, prot);
+ pmd_basic_tests(pmd_aligned, prot);
+ pud_basic_tests(pud_aligned, prot);
+ p4d_basic_tests(p4d_aligned, prot);
+ pgd_basic_tests(pgd_aligned, prot);
+
+ pte_clear_tests(mm, ptep, vaddr);
+ pmd_clear_tests(mm, pmdp);
+ pud_clear_tests(mm, pudp);
+ p4d_clear_tests(mm, p4dp);
+ pgd_clear_tests(mm, pgdp);
+
+ pte_unmap_unlock(ptep, ptl);
+
+ pmd_populate_tests(mm, pmdp, saved_ptep);
+ pud_populate_tests(mm, pudp, saved_pmdp);
+ p4d_populate_tests(mm, p4dp, saved_pudp);
+ pgd_populate_tests(mm, pgdp, saved_p4dp);
+
+ p4d_free(mm, saved_p4dp);
+ pud_free(mm, saved_pudp);
+ pmd_free(mm, saved_pmdp);
+ pte_free(mm, saved_ptep);
+
+ mm_dec_nr_puds(mm);
+ mm_dec_nr_pmds(mm);
+ mm_dec_nr_ptes(mm);
+ mmdrop(mm);
+ return 0;
+}
+late_initcall(debug_vm_pgtable);
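
The comment in debug_vm_pgtable() above derives the PMD/PUD/P4D/PGD test pfns purely by masking low bits off a single real physical address. The masking arithmetic can be seen on its own in a small user-space sketch (the address, shifts and mask widths below are made-up example values):

	#include <stdio.h>

	#define PAGE_SHIFT	12
	#define PAGE_MASK	(~((1UL << PAGE_SHIFT) - 1))
	#define PMD_MASK	(~((1UL << 21) - 1))	/* assuming 2 MiB PMDs */
	#define PUD_MASK	(~((1UL << 30) - 1))	/* assuming 1 GiB PUDs */

	int main(void)
	{
		unsigned long paddr = 0x123456789aUL;	/* stand-in for __pa_symbol() */

		/* Each level simply drops more low bits of the same address. */
		printf("pte pfn: %lx\n", (paddr & PAGE_MASK) >> PAGE_SHIFT);
		printf("pmd pfn: %lx\n", (paddr & PMD_MASK) >> PAGE_SHIFT);
		printf("pud pfn: %lx\n", (paddr & PUD_MASK) >> PAGE_SHIFT);
		return 0;
	}
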
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 4f17c83db575..0e66f2aaeea3 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -22,6 +22,8 @@
#include <asm/unistd.h>
+#include "internal.h"
+
/*
* POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
* deactivate the pages and clear PG_Referenced.
@@ -102,10 +104,6 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
if (!nrpages)
nrpages = ~0UL;
- /*
- * Ignore return value because fadvise() shall return
- * success even if filesystem can't retrieve a hint,
- */
force_page_cache_readahead(mapping, file, start_index, nrpages);
break;
case POSIX_FADV_NOREUSE:
diff --git a/mm/filemap.c b/mm/filemap.c
index e0ac68f9c89e..bd85a83f457b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -199,9 +199,9 @@ static void unaccount_page_cache_page(struct address_space *mapping,
nr = hpage_nr_pages(page);
- __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
+ __mod_lruvec_page_state(page, NR_FILE_PAGES, -nr);
if (PageSwapBacked(page)) {
- __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
+ __mod_lruvec_page_state(page, NR_SHMEM, -nr);
if (PageTransHuge(page))
__dec_node_page_state(page, NR_SHMEM_THPS);
} else if (PageTransHuge(page)) {
@@ -802,21 +802,22 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
new->mapping = mapping;
new->index = offset;
+ mem_cgroup_migrate(old, new);
+
xas_lock_irqsave(&xas, flags);
xas_store(&xas, new);
old->mapping = NULL;
/* hugetlb pages do not participate in page cache accounting. */
if (!PageHuge(old))
- __dec_node_page_state(new, NR_FILE_PAGES);
+ __dec_lruvec_page_state(old, NR_FILE_PAGES);
if (!PageHuge(new))
- __inc_node_page_state(new, NR_FILE_PAGES);
+ __inc_lruvec_page_state(new, NR_FILE_PAGES);
if (PageSwapBacked(old))
- __dec_node_page_state(new, NR_SHMEM);
+ __dec_lruvec_page_state(old, NR_SHMEM);
if (PageSwapBacked(new))
- __inc_node_page_state(new, NR_SHMEM);
+ __inc_lruvec_page_state(new, NR_SHMEM);
xas_unlock_irqrestore(&xas, flags);
- mem_cgroup_migrate(old, new);
if (freepage)
freepage(old);
put_page(old);
@@ -832,7 +833,6 @@ static int __add_to_page_cache_locked(struct page *page,
{
XA_STATE(xas, &mapping->i_pages, offset);
int huge = PageHuge(page);
- struct mem_cgroup *memcg;
int error;
void *old;
@@ -840,17 +840,16 @@ static int __add_to_page_cache_locked(struct page *page,
VM_BUG_ON_PAGE(PageSwapBacked(page), page);
mapping_set_update(&xas, mapping);
- if (!huge) {
- error = mem_cgroup_try_charge(page, current->mm,
- gfp_mask, &memcg, false);
- if (error)
- return error;
- }
-
get_page(page);
page->mapping = mapping;
page->index = offset;
+ if (!huge) {
+ error = mem_cgroup_charge(page, current->mm, gfp_mask);
+ if (error)
+ goto error;
+ }
+
do {
xas_lock_irq(&xas);
old = xas_load(&xas);
@@ -869,25 +868,23 @@ static int __add_to_page_cache_locked(struct page *page,
/* hugetlb pages do not participate in page cache accounting */
if (!huge)
- __inc_node_page_state(page, NR_FILE_PAGES);
+ __inc_lruvec_page_state(page, NR_FILE_PAGES);
unlock:
xas_unlock_irq(&xas);
} while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK));
- if (xas_error(&xas))
+ if (xas_error(&xas)) {
+ error = xas_error(&xas);
goto error;
+ }
- if (!huge)
- mem_cgroup_commit_charge(page, memcg, false, false);
trace_mm_filemap_add_to_page_cache(page);
return 0;
error:
page->mapping = NULL;
/* Leave page->index set: truncation relies upon it */
- if (!huge)
- mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);
- return xas_error(&xas);
+ return error;
}
ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO);
@@ -1276,7 +1273,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue);
* instead.
*
* The read of PG_waiters has to be after (or concurrently with) PG_locked
- * being cleared, but a memory barrier should be unneccssary since it is
+ * being cleared, but a memory barrier should be unnecessary since it is
* in the same byte as PG_locked.
*/
static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem)
@@ -2668,7 +2665,7 @@ void filemap_map_pages(struct vm_fault *vmf,
if (vmf->pte)
vmf->pte += xas.xa_index - last_pgoff;
last_pgoff = xas.xa_index;
- if (alloc_set_pte(vmf, NULL, page))
+ if (alloc_set_pte(vmf, page))
goto unlock;
unlock_page(page);
goto next;
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 60bb20e8a951..bfa3a339253e 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -87,7 +87,7 @@ static inline void inc_frontswap_invalidates(void) { }
*
* This would not guards us against the user deciding to call swapoff right as
* we are calling the backend to initialize (so swapon is in action).
- * Fortunatly for us, the swapon_mutex has been taked by the callee so we are
+ * Fortunately for us, the swapon_mutex has been taken by the callee so we are
* OK. The other scenario where calls to frontswap_store (called via
* swap_writepage) is racing with frontswap_invalidate_area (called via
* swapoff) is again guarded by the swap subsystem.
@@ -413,8 +413,8 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
}
/*
- * Used to check if it's necessory and feasible to unuse pages.
- * Return 1 when nothing to do, 0 when need to shink pages,
+ * Used to check if it's necessary and feasible to unuse pages.
+ * Return 1 when nothing to do, 0 when need to shrink pages,
* error code when there is an error.
*/
static int __frontswap_shrink(unsigned long target_pages,
diff --git a/mm/gup.c b/mm/gup.c
index a99acdc1b568..13ab8ad5491b 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -980,6 +980,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
* -- If nr_pages is >0, but no pages were pinned, returns -errno.
* -- If nr_pages is >0, and some pages were pinned, returns the number of
* pages pinned. Again, this may be less than nr_pages.
+ * -- 0 return value is possible when the fault would need to be retried.
*
* The caller is responsible for releasing returned @pages, via put_page().
*
@@ -1168,7 +1169,7 @@ static bool vma_permits_fault(struct vm_area_struct *vma,
return true;
}
-/*
+/**
* fixup_user_fault() - manually resolve a user page fault
* @tsk: the task_struct to use for page fault accounting, or
* NULL if faults are not to be recorded.
@@ -1176,7 +1177,8 @@ static bool vma_permits_fault(struct vm_area_struct *vma,
* @address: user address
* @fault_flags:flags to pass down to handle_mm_fault()
* @unlocked: did we unlock the mmap_sem while retrying, maybe NULL if caller
- * does not allow retry
+ * does not allow retry. If NULL, the caller must guarantee
+ * that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY.
*
* This is meant to be called in the specific scenario where for locking reasons
* we try to access user memory in atomic context (within a pagefault_disable()
@@ -1249,6 +1251,10 @@ retry:
}
EXPORT_SYMBOL_GPL(fixup_user_fault);
+/*
+ * Please note that this function, unlike __get_user_pages(), will not
+ * return 0 for nr_pages > 0 without FOLL_NOWAIT.
+ */
static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
struct mm_struct *mm,
unsigned long start,
@@ -1839,7 +1845,7 @@ static long __get_user_pages_remote(struct task_struct *tsk,
gup_flags | FOLL_TOUCH | FOLL_REMOTE);
}
-/*
+/**
* get_user_pages_remote() - pin user pages in memory
* @tsk: the task_struct to use for page fault accounting, or
* NULL if faults are not to be recorded.
@@ -1870,13 +1876,13 @@ static long __get_user_pages_remote(struct task_struct *tsk,
*
* Must be called with mmap_sem held for read or write.
*
- * get_user_pages walks a process's page tables and takes a reference to
- * each struct page that each user address corresponds to at a given
+ * get_user_pages_remote walks a process's page tables and takes a reference
+ * to each struct page that each user address corresponds to at a given
* instant. That is, it takes the page that would be accessed if a user
* thread accesses the given user virtual address at that instant.
*
* This does not guarantee that the page exists in the user mappings when
- * get_user_pages returns, and there may even be a completely different
+ * get_user_pages_remote returns, and there may even be a completely different
* page there in some cases (eg. if mmapped pagecache has been invalidated
* and subsequently re faulted). However it does guarantee that the page
* won't be freed completely. And mostly callers simply care that the page
@@ -1888,17 +1894,17 @@ static long __get_user_pages_remote(struct task_struct *tsk,
* is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
* be called after the page is finished with, and before put_page is called.
*
- * get_user_pages is typically used for fewer-copy IO operations, to get a
- * handle on the memory by some means other than accesses via the user virtual
- * addresses. The pages may be submitted for DMA to devices or accessed via
- * their kernel linear mapping (via the kmap APIs). Care should be taken to
- * use the correct cache flushing APIs.
+ * get_user_pages_remote is typically used for fewer-copy IO operations,
+ * to get a handle on the memory by some means other than accesses
+ * via the user virtual addresses. The pages may be submitted for
+ * DMA to devices or accessed via their kernel linear mapping (via the
+ * kmap APIs). Care should be taken to use the correct cache flushing APIs.
*
* See also get_user_pages_fast, for performance critical applications.
*
- * get_user_pages should be phased out in favor of
+ * get_user_pages_remote should be phased out in favor of
* get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
- * should use get_user_pages because it cannot pass
+ * should use get_user_pages_remote because it cannot pass
* FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
*/
long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
@@ -1937,7 +1943,17 @@ static long __get_user_pages_remote(struct task_struct *tsk,
}
#endif /* !CONFIG_MMU */
-/*
+/**
+ * get_user_pages() - pin user pages in memory
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying lookup behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long. Or NULL, if caller
+ * only intends to ensure the pages are faulted in.
+ * @vmas: array of pointers to vmas corresponding to each page.
+ * Or NULL if the caller does not require them.
+ *
* This is the same as get_user_pages_remote(), just with a
* less-flexible calling convention where we assume that the task
* and mm being operated on are the current task's and don't allow
@@ -1960,11 +1976,7 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
}
EXPORT_SYMBOL(get_user_pages);
-/*
- * We can leverage the VM_FAULT_RETRY functionality in the page fault
- * paths better by using either get_user_pages_locked() or
- * get_user_pages_unlocked().
- *
+/**
* get_user_pages_locked() is suitable to replace the form:
*
* down_read(&mm->mmap_sem);
@@ -1980,6 +1992,21 @@ EXPORT_SYMBOL(get_user_pages);
* get_user_pages_locked(tsk, mm, ..., pages, &locked);
* if (locked)
* up_read(&mm->mmap_sem);
+ *
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying lookup behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long. Or NULL, if caller
+ * only intends to ensure the pages are faulted in.
+ * @locked: pointer to lock flag indicating whether lock is held and
+ * subsequently whether VM_FAULT_RETRY functionality can be
+ * utilised. Lock must initially be held.
+ *
+ * We can leverage the VM_FAULT_RETRY functionality in the page fault
+ * paths better by using either get_user_pages_locked() or
+ * get_user_pages_unlocked().
+ *
*/
long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
@@ -2666,62 +2693,6 @@ static bool gup_fast_permitted(unsigned long start, unsigned long end)
}
#endif
-/*
- * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
- * the regular GUP.
- * Note a difference with get_user_pages_fast: this always returns the
- * number of pages pinned, 0 if no pages were pinned.
- *
- * If the architecture does not support this function, simply return with no
- * pages pinned.
- */
-int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages)
-{
- unsigned long len, end;
- unsigned long flags;
- int nr_pinned = 0;
- /*
- * Internally (within mm/gup.c), gup fast variants must set FOLL_GET,
- * because gup fast is always a "pin with a +1 page refcount" request.
- */
- unsigned int gup_flags = FOLL_GET;
-
- if (write)
- gup_flags |= FOLL_WRITE;
-
- start = untagged_addr(start) & PAGE_MASK;
- len = (unsigned long) nr_pages << PAGE_SHIFT;
- end = start + len;
-
- if (end <= start)
- return 0;
- if (unlikely(!access_ok((void __user *)start, len)))
- return 0;
-
- /*
- * Disable interrupts. We use the nested form as we can already have
- * interrupts disabled by get_futex_key.
- *
- * With interrupts disabled, we block page table pages from being
- * freed from under us. See struct mmu_table_batch comments in
- * include/asm-generic/tlb.h for more details.
- *
- * We do not adopt an rcu_read_lock(.) here as we also want to
- * block IPIs that come from THPs splitting.
- */
-
- if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) &&
- gup_fast_permitted(start, end)) {
- local_irq_save(flags);
- gup_pgd_range(start, end, gup_flags, pages, &nr_pinned);
- local_irq_restore(flags);
- }
-
- return nr_pinned;
-}
-EXPORT_SYMBOL_GPL(__get_user_pages_fast);
-
static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages)
{
@@ -2750,12 +2721,17 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
struct page **pages)
{
unsigned long addr, len, end;
+ unsigned long flags;
int nr_pinned = 0, ret = 0;
if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
- FOLL_FORCE | FOLL_PIN | FOLL_GET)))
+ FOLL_FORCE | FOLL_PIN | FOLL_GET |
+ FOLL_FAST_ONLY)))
return -EINVAL;
+ if (!(gup_flags & FOLL_FAST_ONLY))
+ might_lock_read(&current->mm->mmap_sem);
+
start = untagged_addr(start) & PAGE_MASK;
addr = start;
len = (unsigned long) nr_pages << PAGE_SHIFT;
@@ -2766,15 +2742,26 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
if (unlikely(!access_ok((void __user *)start, len)))
return -EFAULT;
+ /*
+ * Disable interrupts. The nested form is used, in order to allow full,
+ * general purpose use of this routine.
+ *
+ * With interrupts disabled, we block page table pages from being
+ * freed from under us. See struct mmu_table_batch comments in
+ * include/asm-generic/tlb.h for more details.
+ *
+ * We do not adopt an rcu_read_lock(.) here as we also want to
+ * block IPIs that come from THPs splitting.
+ */
if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) &&
gup_fast_permitted(start, end)) {
- local_irq_disable();
+ local_irq_save(flags);
gup_pgd_range(addr, end, gup_flags, pages, &nr_pinned);
- local_irq_enable();
+ local_irq_restore(flags);
ret = nr_pinned;
}
- if (nr_pinned < nr_pages) {
+ if (nr_pinned < nr_pages && !(gup_flags & FOLL_FAST_ONLY)) {
/* Try to get the remaining pages with get_user_pages */
start += nr_pinned << PAGE_SHIFT;
pages += nr_pinned;
@@ -2794,6 +2781,46 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
return ret;
}
+/*
+ * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
+ * the regular GUP.
+ * Note a difference with get_user_pages_fast: this always returns the
+ * number of pages pinned, 0 if no pages were pinned.
+ *
+ * If the architecture does not support this function, simply return with no
+ * pages pinned.
+ */
+int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
+ struct page **pages)
+{
+ int nr_pinned;
+ /*
+ * Internally (within mm/gup.c), gup fast variants must set FOLL_GET,
+ * because gup fast is always a "pin with a +1 page refcount" request.
+ *
+ * FOLL_FAST_ONLY is required in order to match the API description of
+ * this routine: no fall back to regular ("slow") GUP.
+ */
+ unsigned int gup_flags = FOLL_GET | FOLL_FAST_ONLY;
+
+ if (write)
+ gup_flags |= FOLL_WRITE;
+
+ nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags,
+ pages);
+ /*
+ * As specified in the API description above, this routine is not
+ * allowed to return negative values. However, the common core
+ * routine internal_get_user_pages_fast() *can* return -errno.
+ * Therefore, correct for that here:
+ */
+ if (nr_pinned < 0)
+ nr_pinned = 0;
+
+ return nr_pinned;
+}
+EXPORT_SYMBOL_GPL(__get_user_pages_fast);
+
/**
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
@@ -2862,6 +2889,42 @@ int pin_user_pages_fast(unsigned long start, int nr_pages,
}
EXPORT_SYMBOL_GPL(pin_user_pages_fast);
+/*
+ * This is the FOLL_PIN equivalent of __get_user_pages_fast(). Behavior is the
+ * same, except that this one sets FOLL_PIN instead of FOLL_GET.
+ *
+ * The API rules are the same, too: no negative values may be returned.
+ */
+int pin_user_pages_fast_only(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
+{
+ int nr_pinned;
+
+ /*
+ * FOLL_GET and FOLL_PIN are mutually exclusive. Note that the API
+ * rules require returning 0, rather than -errno:
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_GET))
+ return 0;
+ /*
+ * FOLL_FAST_ONLY is required in order to match the API description of
+ * this routine: no fall back to regular ("slow") GUP.
+ */
+ gup_flags |= (FOLL_PIN | FOLL_FAST_ONLY);
+ nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags,
+ pages);
+ /*
+ * This routine is not allowed to return negative values. However,
+ * internal_get_user_pages_fast() *can* return -errno. Therefore,
+ * correct for that here:
+ */
+ if (nr_pinned < 0)
+ nr_pinned = 0;
+
+ return nr_pinned;
+}
+EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
+
/**
* pin_user_pages_remote() - pin pages of a remote process (task != current)
*
@@ -2939,3 +3002,20 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages,
pages, vmas, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages);
+
+/*
+ * pin_user_pages_unlocked() is the FOLL_PIN variant of
+ * get_user_pages_unlocked(). Behavior is the same, except that this one sets
+ * FOLL_PIN and rejects FOLL_GET.
+ */
+long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
+ struct page **pages, unsigned int gup_flags)
+{
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ if (WARN_ON_ONCE(gup_flags & FOLL_GET))
+ return -EINVAL;
+
+ gup_flags |= FOLL_PIN;
+ return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
+}
+EXPORT_SYMBOL(pin_user_pages_unlocked);
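
Both __get_user_pages_fast() and the new pin_user_pages_fast_only() promise a non-negative return: the number of pages actually pinned, possibly short of the request. A caller therefore only has to handle a short count; a sketch of such a caller (the driver context and error policy are assumed, not taken from this patch):

	#include <linux/errno.h>
	#include <linux/mm.h>

	/* Sketch: opportunistically pin a user buffer without falling back
	 * to the sleeping GUP path. */
	static int try_pin_buffer(unsigned long uaddr, int nr_pages,
				  struct page **pages)
	{
		int pinned;

		pinned = pin_user_pages_fast_only(uaddr, nr_pages,
						  FOLL_WRITE, pages);
		if (pinned < nr_pages) {
			/* Partial pin: release what was pinned and let the
			 * caller retry from a context that may fault. */
			unpin_user_pages(pages, pinned);
			return -EAGAIN;
		}
		return 0;
	}
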
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 16f2bd6f1549..23763452b52a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -522,7 +522,7 @@ void prep_transhuge_page(struct page *page)
bool is_transparent_hugepage(struct page *page)
{
if (!PageCompound(page))
- return 0;
+ return false;
page = compound_head(page);
return is_huge_zero_page(page) ||
@@ -587,19 +587,19 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
struct page *page, gfp_t gfp)
{
struct vm_area_struct *vma = vmf->vma;
- struct mem_cgroup *memcg;
pgtable_t pgtable;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
vm_fault_t ret = 0;
VM_BUG_ON_PAGE(!PageCompound(page), page);
- if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) {
+ if (mem_cgroup_charge(page, vma->vm_mm, gfp)) {
put_page(page);
count_vm_event(THP_FAULT_FALLBACK);
count_vm_event(THP_FAULT_FALLBACK_CHARGE);
return VM_FAULT_FALLBACK;
}
+ cgroup_throttle_swaprate(page, gfp);
pgtable = pte_alloc_one(vma->vm_mm);
if (unlikely(!pgtable)) {
@@ -630,7 +630,6 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
vm_fault_t ret2;
spin_unlock(vmf->ptl);
- mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
pte_free(vma->vm_mm, pgtable);
ret2 = handle_userfault(vmf, VM_UFFD_MISSING);
@@ -641,7 +640,6 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
entry = mk_huge_pmd(page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
page_add_new_anon_rmap(page, vma, haddr, true);
- mem_cgroup_commit_charge(page, memcg, false, true);
lru_cache_add_active_or_unevictable(page, vma);
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
@@ -649,7 +647,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
mm_inc_nr_ptes(vma->vm_mm);
spin_unlock(vmf->ptl);
count_vm_event(THP_FAULT_ALLOC);
- count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
+ count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
}
return 0;
@@ -658,7 +656,6 @@ unlock_release:
release:
if (pgtable)
pte_free(vma->vm_mm, pgtable);
- mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
return ret;
@@ -1255,263 +1252,63 @@ unlock:
spin_unlock(vmf->ptl);
}
-static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
- pmd_t orig_pmd, struct page *page)
-{
- struct vm_area_struct *vma = vmf->vma;
- unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
- struct mem_cgroup *memcg;
- pgtable_t pgtable;
- pmd_t _pmd;
- int i;
- vm_fault_t ret = 0;
- struct page **pages;
- struct mmu_notifier_range range;
-
- pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *),
- GFP_KERNEL);
- if (unlikely(!pages)) {
- ret |= VM_FAULT_OOM;
- goto out;
- }
-
- for (i = 0; i < HPAGE_PMD_NR; i++) {
- pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
- vmf->address, page_to_nid(page));
- if (unlikely(!pages[i] ||
- mem_cgroup_try_charge_delay(pages[i], vma->vm_mm,
- GFP_KERNEL, &memcg, false))) {
- if (pages[i])
- put_page(pages[i]);
- while (--i >= 0) {
- memcg = (void *)page_private(pages[i]);
- set_page_private(pages[i], 0);
- mem_cgroup_cancel_charge(pages[i], memcg,
- false);
- put_page(pages[i]);
- }
- kfree(pages);
- ret |= VM_FAULT_OOM;
- goto out;
- }
- set_page_private(pages[i], (unsigned long)memcg);
- }
-
- for (i = 0; i < HPAGE_PMD_NR; i++) {
- copy_user_highpage(pages[i], page + i,
- haddr + PAGE_SIZE * i, vma);
- __SetPageUptodate(pages[i]);
- cond_resched();
- }
-
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
- haddr, haddr + HPAGE_PMD_SIZE);
- mmu_notifier_invalidate_range_start(&range);
-
- vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
- if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
- goto out_free_pages;
- VM_BUG_ON_PAGE(!PageHead(page), page);
-
- /*
- * Leave pmd empty until pte is filled note we must notify here as
- * concurrent CPU thread might write to new page before the call to
- * mmu_notifier_invalidate_range_end() happens which can lead to a
- * device seeing memory write in different order than CPU.
- *
- * See Documentation/vm/mmu_notifier.rst
- */
- pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
-
- pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
- pmd_populate(vma->vm_mm, &_pmd, pgtable);
-
- for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
- pte_t entry;
- entry = mk_pte(pages[i], vma->vm_page_prot);
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- memcg = (void *)page_private(pages[i]);
- set_page_private(pages[i], 0);
- page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
- mem_cgroup_commit_charge(pages[i], memcg, false, false);
- lru_cache_add_active_or_unevictable(pages[i], vma);
- vmf->pte = pte_offset_map(&_pmd, haddr);
- VM_BUG_ON(!pte_none(*vmf->pte));
- set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
- pte_unmap(vmf->pte);
- }
- kfree(pages);
-
- smp_wmb(); /* make pte visible before pmd */
- pmd_populate(vma->vm_mm, vmf->pmd, pgtable);
- page_remove_rmap(page, true);
- spin_unlock(vmf->ptl);
-
- /*
- * No need to double call mmu_notifier->invalidate_range() callback as
- * the above pmdp_huge_clear_flush_notify() did already call it.
- */
- mmu_notifier_invalidate_range_only_end(&range);
-
- ret |= VM_FAULT_WRITE;
- put_page(page);
-
-out:
- return ret;
-
-out_free_pages:
- spin_unlock(vmf->ptl);
- mmu_notifier_invalidate_range_end(&range);
- for (i = 0; i < HPAGE_PMD_NR; i++) {
- memcg = (void *)page_private(pages[i]);
- set_page_private(pages[i], 0);
- mem_cgroup_cancel_charge(pages[i], memcg, false);
- put_page(pages[i]);
- }
- kfree(pages);
- goto out;
-}
-
vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
{
struct vm_area_struct *vma = vmf->vma;
- struct page *page = NULL, *new_page;
- struct mem_cgroup *memcg;
+ struct page *page;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
- struct mmu_notifier_range range;
- gfp_t huge_gfp; /* for allocation and charge */
- vm_fault_t ret = 0;
vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
VM_BUG_ON_VMA(!vma->anon_vma, vma);
+
if (is_huge_zero_pmd(orig_pmd))
- goto alloc;
+ goto fallback;
+
spin_lock(vmf->ptl);
- if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
- goto out_unlock;
+
+ if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+ spin_unlock(vmf->ptl);
+ return 0;
+ }
page = pmd_page(orig_pmd);
VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
- /*
- * We can only reuse the page if nobody else maps the huge page or it's
- * part.
- */
+
+ /* Lock page for reuse_swap_page() */
if (!trylock_page(page)) {
get_page(page);
spin_unlock(vmf->ptl);
lock_page(page);
spin_lock(vmf->ptl);
if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+ spin_unlock(vmf->ptl);
unlock_page(page);
put_page(page);
- goto out_unlock;
+ return 0;
}
put_page(page);
}
+
+ /*
+ * We can only reuse the page if nobody else maps the huge page or it's
+ * part.
+ */
if (reuse_swap_page(page, NULL)) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
+ if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
- ret |= VM_FAULT_WRITE;
unlock_page(page);
- goto out_unlock;
- }
- unlock_page(page);
- get_page(page);
- spin_unlock(vmf->ptl);
-alloc:
- if (__transparent_hugepage_enabled(vma) &&
- !transparent_hugepage_debug_cow()) {
- huge_gfp = alloc_hugepage_direct_gfpmask(vma);
- new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
- } else
- new_page = NULL;
-
- if (likely(new_page)) {
- prep_transhuge_page(new_page);
- } else {
- if (!page) {
- split_huge_pmd(vma, vmf->pmd, vmf->address);
- ret |= VM_FAULT_FALLBACK;
- } else {
- ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page);
- if (ret & VM_FAULT_OOM) {
- split_huge_pmd(vma, vmf->pmd, vmf->address);
- ret |= VM_FAULT_FALLBACK;
- }
- put_page(page);
- }
- count_vm_event(THP_FAULT_FALLBACK);
- goto out;
- }
-
- if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm,
- huge_gfp, &memcg, true))) {
- put_page(new_page);
- split_huge_pmd(vma, vmf->pmd, vmf->address);
- if (page)
- put_page(page);
- ret |= VM_FAULT_FALLBACK;
- count_vm_event(THP_FAULT_FALLBACK);
- count_vm_event(THP_FAULT_FALLBACK_CHARGE);
- goto out;
- }
-
- count_vm_event(THP_FAULT_ALLOC);
- count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
-
- if (!page)
- clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
- else
- copy_user_huge_page(new_page, page, vmf->address,
- vma, HPAGE_PMD_NR);
- __SetPageUptodate(new_page);
-
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
- haddr, haddr + HPAGE_PMD_SIZE);
- mmu_notifier_invalidate_range_start(&range);
-
- spin_lock(vmf->ptl);
- if (page)
- put_page(page);
- if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
spin_unlock(vmf->ptl);
- mem_cgroup_cancel_charge(new_page, memcg, true);
- put_page(new_page);
- goto out_mn;
- } else {
- pmd_t entry;
- entry = mk_huge_pmd(new_page, vma->vm_page_prot);
- entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
- page_add_new_anon_rmap(new_page, vma, haddr, true);
- mem_cgroup_commit_charge(new_page, memcg, false, true);
- lru_cache_add_active_or_unevictable(new_page, vma);
- set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
- update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
- if (!page) {
- add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
- } else {
- VM_BUG_ON_PAGE(!PageHead(page), page);
- page_remove_rmap(page, true);
- put_page(page);
- }
- ret |= VM_FAULT_WRITE;
+ return VM_FAULT_WRITE;
}
+
+ unlock_page(page);
spin_unlock(vmf->ptl);
-out_mn:
- /*
- * No need to double call mmu_notifier->invalidate_range() callback as
- * the above pmdp_huge_clear_flush_notify() did already call it.
- */
- mmu_notifier_invalidate_range_only_end(&range);
-out:
- return ret;
-out_unlock:
- spin_unlock(vmf->ptl);
- return ret;
+fallback:
+ __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
+ return VM_FAULT_FALLBACK;
}
/*
@@ -1582,7 +1379,6 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
goto skip_mlock;
if (!trylock_page(page))
goto skip_mlock;
- lru_add_drain();
if (page->mapping && !PageDoubleMap(page))
mlock_vma_page(page);
unlock_page(page);
@@ -2360,15 +2156,17 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
atomic_inc(&page[i]._mapcount);
}
+ lock_page_memcg(page);
if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
/* Last compound_mapcount is gone. */
- __dec_node_page_state(page, NR_ANON_THPS);
+ __dec_lruvec_page_state(page, NR_ANON_THPS);
if (TestClearPageDoubleMap(page)) {
/* No need in mapcount reference anymore */
for (i = 0; i < HPAGE_PMD_NR; i++)
atomic_dec(&page[i]._mapcount);
}
}
+ unlock_page_memcg(page);
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
@@ -2784,7 +2582,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
int count, mapcount, extra_pins, ret;
- bool mlocked;
unsigned long flags;
pgoff_t end;
@@ -2843,14 +2640,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
goto out_unlock;
}
- mlocked = PageMlocked(head);
unmap_page(head);
VM_BUG_ON_PAGE(compound_mapcount(head), head);
- /* Make sure the page is not on per-CPU pagevec as it takes pin */
- if (mlocked)
- lru_add_drain();
-
/* prevent PageLRU to go away from under us, and freeze lru stats */
spin_lock_irqsave(&pgdata->lru_lock, flags);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f9a97320e1de..dcb34d7f5562 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -59,8 +59,8 @@ __initdata LIST_HEAD(huge_boot_pages);
/* for command line parsing */
static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
-static unsigned long __initdata default_hstate_size;
static bool __initdata parsed_valid_hugepagesz = true;
+static bool __initdata parsed_default_hugepagesz;
/*
* Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
@@ -85,7 +85,7 @@ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
spin_unlock(&spool->lock);
/* If no pages are used, and no other handles to the subpool
- * remain, give up any reservations mased on minimum size and
+ * remain, give up any reservations based on minimum size and
* free the subpool */
if (free) {
if (spool->min_hpages != -1)
@@ -133,7 +133,7 @@ void hugepage_put_subpool(struct hugepage_subpool *spool)
* the request. Otherwise, return the number of pages by which the
* global pools must be adjusted (upward). The returned value may
* only be different than the passed value (delta) in the case where
- * a subpool minimum size must be manitained.
+ * a subpool minimum size must be maintained.
*/
static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
long delta)
@@ -473,7 +473,7 @@ out_of_memory:
*
* Return the number of new huge pages added to the map. This number is greater
* than or equal to zero. If file_region entries needed to be allocated for
- * this operation and we were not able to allocate, it ruturns -ENOMEM.
+ * this operation and we were not able to allocate, it returns -ENOMEM.
* region_add of regions of length 1 never allocate file_regions and cannot
* fail; region_chg will always allocate at least 1 entry and a region_add for
* 1 page will only require at most 1 entry.
@@ -988,7 +988,7 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
* We know VM_NORESERVE is not set. Therefore, there SHOULD
* be a region map for all pages. The only situation where
* there is no region map is if a hole was punched via
- * fallocate. In this case, there really are no reverves to
+ * fallocate. In this case, there really are no reserves to
* use. This situation is indicated if chg != 0.
*/
if (chg)
@@ -1519,7 +1519,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order)
* For gigantic hugepages allocated through bootmem at
* boot, it's safer to be consistent with the not-gigantic
* hugepages and clear the PG_reserved bit from all tail pages
- * too. Otherwse drivers using get_user_pages() to access tail
+ * too. Otherwise drivers using get_user_pages() to access tail
* pages may get the reference counting wrong if they see
* PG_reserved set on a tail page (despite the head page not
* having PG_reserved set). Enforcing this consistency between
@@ -3060,7 +3060,7 @@ static void __init hugetlb_sysfs_init(void)
err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
hstate_kobjs, &hstate_attr_group);
if (err)
- pr_err("Hugetlb: Unable to add hstate %s", h->name);
+ pr_err("HugeTLB: Unable to add hstate %s", h->name);
}
}
@@ -3164,7 +3164,7 @@ static void hugetlb_register_node(struct node *node)
nhs->hstate_kobjs,
&per_node_hstate_attr_group);
if (err) {
- pr_err("Hugetlb: Unable to add hstate %s for node %d\n",
+ pr_err("HugeTLB: Unable to add hstate %s for node %d\n",
h->name, node->dev.id);
hugetlb_unregister_node(node);
break;
@@ -3212,23 +3212,41 @@ static int __init hugetlb_init(void)
{
int i;
- if (!hugepages_supported())
+ if (!hugepages_supported()) {
+ if (hugetlb_max_hstate || default_hstate_max_huge_pages)
+ pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n");
return 0;
+ }
- if (!size_to_hstate(default_hstate_size)) {
- if (default_hstate_size != 0) {
- pr_err("HugeTLB: unsupported default_hugepagesz %lu. Reverting to %lu\n",
- default_hstate_size, HPAGE_SIZE);
+ /*
+ * Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists. Some
+ * architectures depend on setup being done here.
+ */
+ hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
+ if (!parsed_default_hugepagesz) {
+ /*
+ * If we did not parse a default huge page size, set
+ * default_hstate_idx to HPAGE_SIZE hstate. And, if the
+ * number of huge pages for this default size was implicitly
+ * specified, set that here as well.
+ * Note that the implicit setting will overwrite an explicit
+ * setting. A warning will be printed in this case.
+ */
+ default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE));
+ if (default_hstate_max_huge_pages) {
+ if (default_hstate.max_huge_pages) {
+ char buf[32];
+
+ string_get_size(huge_page_size(&default_hstate),
+ 1, STRING_UNITS_2, buf, 32);
+ pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n",
+ default_hstate.max_huge_pages, buf);
+ pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n",
+ default_hstate_max_huge_pages);
+ }
+ default_hstate.max_huge_pages =
+ default_hstate_max_huge_pages;
}
-
- default_hstate_size = HPAGE_SIZE;
- if (!size_to_hstate(default_hstate_size))
- hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
- }
- default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
- if (default_hstate_max_huge_pages) {
- if (!default_hstate.max_huge_pages)
- default_hstate.max_huge_pages = default_hstate_max_huge_pages;
}
hugetlb_cma_check();
@@ -3256,10 +3274,10 @@ static int __init hugetlb_init(void)
}
subsys_initcall(hugetlb_init);
-/* Should be called on processing a hugepagesz=... option */
-void __init hugetlb_bad_size(void)
+/* Overwritten by architectures with more huge page sizes */
+bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size)
{
- parsed_valid_hugepagesz = false;
+ return size == HPAGE_SIZE;
}
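
The weak default above only accepts HPAGE_SIZE; architectures with more huge page sizes are expected to supply their own arch_hugetlb_valid_size(). As a rough illustrative sketch (not taken from this patch; real per-arch implementations differ), an architecture supporting PMD- and PUD-sized huge pages might do:

/* Illustrative sketch only: accept the two sizes this arch can map. */
bool __init arch_hugetlb_valid_size(unsigned long size)
{
	return size == PMD_SIZE || size == PUD_SIZE;
}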
void __init hugetlb_add_hstate(unsigned int order)
@@ -3268,7 +3286,6 @@ void __init hugetlb_add_hstate(unsigned int order)
unsigned long i;
if (size_to_hstate(PAGE_SIZE << order)) {
- pr_warn("hugepagesz= specified twice, ignoring\n");
return;
}
BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
@@ -3289,20 +3306,29 @@ void __init hugetlb_add_hstate(unsigned int order)
parsed_hstate = h;
}
-static int __init hugetlb_nrpages_setup(char *s)
+/*
+ * hugepages command line processing
+ * hugepages normally follows a valid hugepagesz or default_hugepagesz
+ * specification. If not, ignore the hugepages value. hugepages can also
+ * be the first huge page command line option in which case it implicitly
+ * specifies the number of huge pages for the default size.
+ */
+static int __init hugepages_setup(char *s)
{
unsigned long *mhp;
static unsigned long *last_mhp;
if (!parsed_valid_hugepagesz) {
- pr_warn("hugepages = %s preceded by "
- "an unsupported hugepagesz, ignoring\n", s);
+ pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
parsed_valid_hugepagesz = true;
- return 1;
+ return 0;
}
+
/*
- * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
- * so this hugepages= parameter goes to the "default hstate".
+ * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter
+ * yet, so this hugepages= parameter goes to the "default hstate".
+ * Otherwise, it goes with the previously parsed hugepagesz or
+ * default_hugepagesz.
*/
else if (!hugetlb_max_hstate)
mhp = &default_hstate_max_huge_pages;
@@ -3310,8 +3336,8 @@ static int __init hugetlb_nrpages_setup(char *s)
mhp = &parsed_hstate->max_huge_pages;
if (mhp == last_mhp) {
- pr_warn("hugepages= specified twice without interleaving hugepagesz=, ignoring\n");
- return 1;
+ pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s);
+ return 0;
}
if (sscanf(s, "%lu", mhp) <= 0)
@@ -3329,14 +3355,102 @@ static int __init hugetlb_nrpages_setup(char *s)
return 1;
}
-__setup("hugepages=", hugetlb_nrpages_setup);
+__setup("hugepages=", hugepages_setup);
-static int __init hugetlb_default_setup(char *s)
+/*
+ * hugepagesz command line processing
+ * A specific huge page size can only be specified once with hugepagesz.
+ * hugepagesz is followed by hugepages on the command line. The global
+ * variable 'parsed_valid_hugepagesz' is used to determine if a prior
+ * hugepagesz argument was valid.
+ */
+static int __init hugepagesz_setup(char *s)
{
- default_hstate_size = memparse(s, &s);
+ unsigned long size;
+ struct hstate *h;
+
+ parsed_valid_hugepagesz = false;
+ size = (unsigned long)memparse(s, NULL);
+
+ if (!arch_hugetlb_valid_size(size)) {
+ pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
+ return 0;
+ }
+
+ h = size_to_hstate(size);
+ if (h) {
+ /*
+ * hstate for this size already exists. This is normally
+ * an error, but is allowed if the existing hstate is the
+ * default hstate. More specifically, it is only allowed if
+ * the number of huge pages for the default hstate was not
+ * previously specified.
+ */
+ if (!parsed_default_hugepagesz || h != &default_hstate ||
+ default_hstate.max_huge_pages) {
+ pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
+ return 0;
+ }
+
+ /*
+ * No need to call hugetlb_add_hstate() as hstate already
+ * exists. But, do set parsed_hstate so that a following
+ * hugepages= parameter will be applied to this hstate.
+ */
+ parsed_hstate = h;
+ parsed_valid_hugepagesz = true;
+ return 1;
+ }
+
+ hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
+ parsed_valid_hugepagesz = true;
return 1;
}
-__setup("default_hugepagesz=", hugetlb_default_setup);
+__setup("hugepagesz=", hugepagesz_setup);
+
+/*
+ * default_hugepagesz command line input
+ * Only one instance of default_hugepagesz allowed on command line.
+ */
+static int __init default_hugepagesz_setup(char *s)
+{
+ unsigned long size;
+
+ parsed_valid_hugepagesz = false;
+ if (parsed_default_hugepagesz) {
+ pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
+ return 0;
+ }
+
+ size = (unsigned long)memparse(s, NULL);
+
+ if (!arch_hugetlb_valid_size(size)) {
+ pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
+ return 0;
+ }
+
+ hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
+ parsed_valid_hugepagesz = true;
+ parsed_default_hugepagesz = true;
+ default_hstate_idx = hstate_index(size_to_hstate(size));
+
+ /*
+ * The number of default huge pages (for this size) could have been
+ * specified as the first hugetlb parameter: hugepages=X. If so,
+ * then default_hstate_max_huge_pages is set. If the default huge
+ * page size is gigantic (>= MAX_ORDER), then the pages must be
+ * allocated here from bootmem allocator.
+ */
+ if (default_hstate_max_huge_pages) {
+ default_hstate.max_huge_pages = default_hstate_max_huge_pages;
+ if (hstate_is_gigantic(&default_hstate))
+ hugetlb_hstate_alloc_pages(&default_hstate);
+ default_hstate_max_huge_pages = 0;
+ }
+
+ return 1;
+}
+__setup("default_hugepagesz=", default_hugepagesz_setup);
static unsigned int cpuset_mems_nr(unsigned int *array)
{
@@ -4465,9 +4579,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
/*
* entry could be a migration/hwpoison entry at this point, so this
* check prevents the kernel from going below assuming that we have
- * a active hugepage in pagecache. This goto expects the 2nd page fault,
- * and is_hugetlb_entry_(migration|hwpoisoned) check will properly
- * handle it.
+ * an active hugepage in pagecache. This goto expects the 2nd page
+ * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
+ * properly handle it.
*/
if (!pte_present(entry))
goto out_mutex;
@@ -5354,8 +5468,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
* huge_pte_offset() - Walk the page table to resolve the hugepage
* entry at address @addr
*
- * Return: Pointer to page table or swap entry (PUD or PMD) for
- * address @addr, or NULL if a p*d_none() entry is encountered and the
+ * Return: Pointer to page table entry (PUD or PMD) for
+ * address @addr, or NULL if a !p*d_present() entry is encountered and the
* size @sz doesn't match the hugepage size at this level of the page
* table.
*/
@@ -5364,8 +5478,8 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
{
pgd_t *pgd;
p4d_t *p4d;
- pud_t *pud, pud_entry;
- pmd_t *pmd, pmd_entry;
+ pud_t *pud;
+ pmd_t *pmd;
pgd = pgd_offset(mm, addr);
if (!pgd_present(*pgd))
@@ -5375,22 +5489,16 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
return NULL;
pud = pud_offset(p4d, addr);
- pud_entry = READ_ONCE(*pud);
- if (sz != PUD_SIZE && pud_none(pud_entry))
- return NULL;
- /* hugepage or swap? */
- if (pud_huge(pud_entry) || !pud_present(pud_entry))
+ if (sz == PUD_SIZE)
+ /* must be pud huge, non-present or none */
return (pte_t *)pud;
-
- pmd = pmd_offset(pud, addr);
- pmd_entry = READ_ONCE(*pmd);
- if (sz != PMD_SIZE && pmd_none(pmd_entry))
+ if (!pud_present(*pud))
return NULL;
- /* hugepage or swap? */
- if (pmd_huge(pmd_entry) || !pmd_present(pmd_entry))
- return (pte_t *)pmd;
+ /* must have a valid entry and size to go further */
- return NULL;
+ pmd = pmd_offset(pud, addr);
+ /* must be pmd huge, non-present or none */
+ return (pte_t *)pmd;
}
#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
diff --git a/mm/internal.h b/mm/internal.h
index b5634e78f01d..791e4b5a807c 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -49,18 +49,20 @@ void unmap_page_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
struct zap_details *details);
-extern unsigned int __do_page_cache_readahead(struct address_space *mapping,
- struct file *filp, pgoff_t offset, unsigned long nr_to_read,
+void force_page_cache_readahead(struct address_space *, struct file *,
+ pgoff_t index, unsigned long nr_to_read);
+void __do_page_cache_readahead(struct address_space *, struct file *,
+ pgoff_t index, unsigned long nr_to_read,
unsigned long lookahead_size);
/*
* Submit IO for the read-ahead request in file_ra_state.
*/
-static inline unsigned long ra_submit(struct file_ra_state *ra,
+static inline void ra_submit(struct file_ra_state *ra,
struct address_space *mapping, struct file *filp)
{
- return __do_page_cache_readahead(mapping, filp,
- ra->start, ra->size, ra->async_size);
+ __do_page_cache_readahead(mapping, filp,
+ ra->start, ra->size, ra->async_size);
}
/**
@@ -125,12 +127,12 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
* between functions involved in allocations, including the alloc_pages*
* family of functions.
*
- * nodemask, migratetype and high_zoneidx are initialized only once in
+ * nodemask, migratetype and highest_zoneidx are initialized only once in
* __alloc_pages_nodemask() and then never change.
*
- * zonelist, preferred_zone and classzone_idx are set first in
+ * zonelist, preferred_zone and highest_zoneidx are set first in
* __alloc_pages_nodemask() for the fast path, and might be later changed
- * in __alloc_pages_slowpath(). All other functions pass the whole strucure
+ * in __alloc_pages_slowpath(). All other functions pass the whole structure
* by a const pointer.
*/
struct alloc_context {
@@ -138,12 +140,21 @@ struct alloc_context {
nodemask_t *nodemask;
struct zoneref *preferred_zoneref;
int migratetype;
- enum zone_type high_zoneidx;
+
+ /*
+ * highest_zoneidx represents highest usable zone index of
+ * the allocation request. Due to the nature of the zone,
+ * memory on lower zone than the highest_zoneidx will be
+ * protected by lowmem_reserve[highest_zoneidx].
+ *
+ * highest_zoneidx is also used by reclaim/compaction to limit
+ * the target zone since higher zone than this index cannot be
+ * usable for this allocation request.
+ */
+ enum zone_type highest_zoneidx;
bool spread_dirty_pages;
};
-#define ac_classzone_idx(ac) zonelist_zone_idx(ac->preferred_zoneref)
-
/*
* Locate the struct page for both the matching buddy in our
* pair (buddy1) and the combined O(n+1) page they form (page).
@@ -222,7 +233,7 @@ struct compact_control {
int order; /* order a direct compactor needs */
int migratetype; /* migratetype of direct compactor */
const unsigned int alloc_flags; /* alloc flags of a direct compactor */
- const int classzone_idx; /* zone index of a direct compactor */
+ const int highest_zoneidx; /* zone index of a direct compactor */
enum migrate_mode mode; /* Async or sync migration mode */
bool ignore_skip_hint; /* Scan blocks even if marked skip */
bool no_set_skip_hint; /* Don't mark blocks for skipping */
@@ -527,7 +538,7 @@ extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long,
unsigned long, unsigned long);
extern void set_pageblock_order(void);
-unsigned long reclaim_clean_pages_from_list(struct zone *zone,
+unsigned int reclaim_clean_pages_from_list(struct zone *zone,
struct list_head *page_list);
/* The ALLOC_WMARK bits are used as an index to zone->watermark */
#define ALLOC_WMARK_MIN WMARK_MIN
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 2906358e42f0..757d4074fe28 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -33,7 +33,6 @@
#include <linux/types.h>
#include <linux/vmalloc.h>
#include <linux/bug.h>
-#include <linux/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
@@ -613,24 +612,6 @@ void kasan_free_shadow(const struct vm_struct *vm)
}
#endif
-extern void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip);
-extern bool report_enabled(void);
-
-bool kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip)
-{
- unsigned long flags = user_access_save();
- bool ret = false;
-
- if (likely(report_enabled())) {
- __kasan_report(addr, size, is_write, ip);
- ret = true;
- }
-
- user_access_restore(flags);
-
- return ret;
-}
-
#ifdef CONFIG_MEMORY_HOTPLUG
static bool shadow_mapped(unsigned long addr)
{
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 80f23c9da6b0..51ec45407a0b 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -29,6 +29,7 @@
#include <linux/kasan.h>
#include <linux/module.h>
#include <linux/sched/task_stack.h>
+#include <linux/uaccess.h>
#include <asm/sections.h>
@@ -454,7 +455,7 @@ static void print_shadow_for_address(const void *addr)
}
}
-bool report_enabled(void)
+static bool report_enabled(void)
{
if (current->kasan_depth)
return false;
@@ -479,7 +480,8 @@ void kasan_report_invalid_free(void *object, unsigned long ip)
end_report(&flags);
}
-void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip)
+static void __kasan_report(unsigned long addr, size_t size, bool is_write,
+ unsigned long ip)
{
struct kasan_access_info info;
void *tagged_addr;
@@ -518,6 +520,22 @@ void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned lon
end_report(&flags);
}
+bool kasan_report(unsigned long addr, size_t size, bool is_write,
+ unsigned long ip)
+{
+ unsigned long flags = user_access_save();
+ bool ret = false;
+
+ if (likely(report_enabled())) {
+ __kasan_report(addr, size, is_write, ip);
+ ret = true;
+ }
+
+ user_access_restore(flags);
+
+ return ret;
+}
+
#ifdef CONFIG_KASAN_INLINE
/*
* With CONFIG_KASAN_INLINE, accesses to bogus pointers (outside the high
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 99d77ffb79c2..3f032487825b 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -28,6 +28,8 @@ enum scan_result {
SCAN_SUCCEED,
SCAN_PMD_NULL,
SCAN_EXCEED_NONE_PTE,
+ SCAN_EXCEED_SWAP_PTE,
+ SCAN_EXCEED_SHARED_PTE,
SCAN_PTE_NON_PRESENT,
SCAN_PTE_UFFD_WP,
SCAN_PAGE_RO,
@@ -47,7 +49,6 @@ enum scan_result {
SCAN_DEL_PAGE_LRU,
SCAN_ALLOC_HUGE_PAGE_FAIL,
SCAN_CGROUP_CHARGE_FAIL,
- SCAN_EXCEED_SWAP_PTE,
SCAN_TRUNCATED,
SCAN_PAGE_HAS_PRIVATE,
};
@@ -72,6 +73,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
*/
static unsigned int khugepaged_max_ptes_none __read_mostly;
static unsigned int khugepaged_max_ptes_swap __read_mostly;
+static unsigned int khugepaged_max_ptes_shared __read_mostly;
#define MM_SLOTS_HASH_BITS 10
static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
@@ -291,15 +293,43 @@ static struct kobj_attribute khugepaged_max_ptes_swap_attr =
__ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show,
khugepaged_max_ptes_swap_store);
+static ssize_t khugepaged_max_ptes_shared_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_max_ptes_shared);
+}
+
+static ssize_t khugepaged_max_ptes_shared_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ unsigned long max_ptes_shared;
+
+ err = kstrtoul(buf, 10, &max_ptes_shared);
+ if (err || max_ptes_shared > HPAGE_PMD_NR-1)
+ return -EINVAL;
+
+ khugepaged_max_ptes_shared = max_ptes_shared;
+
+ return count;
+}
+
+static struct kobj_attribute khugepaged_max_ptes_shared_attr =
+ __ATTR(max_ptes_shared, 0644, khugepaged_max_ptes_shared_show,
+ khugepaged_max_ptes_shared_store);
+
static struct attribute *khugepaged_attr[] = {
&khugepaged_defrag_attr.attr,
&khugepaged_max_ptes_none_attr.attr,
+ &khugepaged_max_ptes_swap_attr.attr,
+ &khugepaged_max_ptes_shared_attr.attr,
&pages_to_scan_attr.attr,
&pages_collapsed_attr.attr,
&full_scans_attr.attr,
&scan_sleep_millisecs_attr.attr,
&alloc_sleep_millisecs_attr.attr,
- &khugepaged_max_ptes_swap_attr.attr,
NULL,
};
@@ -359,6 +389,7 @@ int __init khugepaged_init(void)
khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
+ khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;
return 0;
}
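
For reference, khugepaged_max_ptes_shared defaults to HPAGE_PMD_NR/2 (256 with 2M THP on x86-64) and, like the other khugepaged tunables above, is adjustable at runtime through sysfs; writing a smaller value to /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_shared makes khugepaged skip any PMD range in which more than that many PTEs map pages shared with another process (SCAN_EXCEED_SHARED_PTE).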
@@ -512,27 +543,52 @@ void __khugepaged_exit(struct mm_struct *mm)
static void release_pte_page(struct page *page)
{
- dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_lru(page));
+ mod_node_page_state(page_pgdat(page),
+ NR_ISOLATED_ANON + page_is_file_lru(page),
+ -compound_nr(page));
unlock_page(page);
putback_lru_page(page);
}
-static void release_pte_pages(pte_t *pte, pte_t *_pte)
+static void release_pte_pages(pte_t *pte, pte_t *_pte,
+ struct list_head *compound_pagelist)
{
+ struct page *page, *tmp;
+
while (--_pte >= pte) {
pte_t pteval = *_pte;
- if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)))
- release_pte_page(pte_page(pteval));
+
+ page = pte_page(pteval);
+ if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)) &&
+ !PageCompound(page))
+ release_pte_page(page);
+ }
+
+ list_for_each_entry_safe(page, tmp, compound_pagelist, lru) {
+ list_del(&page->lru);
+ release_pte_page(page);
}
}
+static bool is_refcount_suitable(struct page *page)
+{
+ int expected_refcount;
+
+ expected_refcount = total_mapcount(page);
+ if (PageSwapCache(page))
+ expected_refcount += compound_nr(page);
+
+ return page_count(page) == expected_refcount;
+}
+
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
unsigned long address,
- pte_t *pte)
+ pte_t *pte,
+ struct list_head *compound_pagelist)
{
struct page *page = NULL;
pte_t *_pte;
- int none_or_zero = 0, result = 0, referenced = 0;
+ int none_or_zero = 0, shared = 0, result = 0, referenced = 0;
bool writable = false;
for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
@@ -558,13 +614,27 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
goto out;
}
- /* TODO: teach khugepaged to collapse THP mapped with pte */
- if (PageCompound(page)) {
- result = SCAN_PAGE_COMPOUND;
+ VM_BUG_ON_PAGE(!PageAnon(page), page);
+
+ if (page_mapcount(page) > 1 &&
+ ++shared > khugepaged_max_ptes_shared) {
+ result = SCAN_EXCEED_SHARED_PTE;
goto out;
}
- VM_BUG_ON_PAGE(!PageAnon(page), page);
+ if (PageCompound(page)) {
+ struct page *p;
+ page = compound_head(page);
+
+ /*
+ * Check if we have dealt with the compound page
+ * already
+ */
+ list_for_each_entry(p, compound_pagelist, lru) {
+ if (page == p)
+ goto next;
+ }
+ }
/*
* We can do it before isolate_lru_page because the
@@ -578,28 +648,30 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
}
/*
- * cannot use mapcount: can't collapse if there's a gup pin.
- * The page must only be referenced by the scanned process
- * and page swap cache.
+ * Check if the page has any GUP (or other external) pins.
+ *
+ * The page table that maps the page has already been unlinked
+ * from the page table tree and this process cannot get
+ * an additional pin on the page.
+ *
+ * New pins can come later if the page is shared across fork,
+ * but not from this process. The other process cannot write to
+ * the page, only trigger CoW.
*/
- if (page_count(page) != 1 + PageSwapCache(page)) {
+ if (!is_refcount_suitable(page)) {
unlock_page(page);
result = SCAN_PAGE_COUNT;
goto out;
}
- if (pte_write(pteval)) {
- writable = true;
- } else {
- if (PageSwapCache(page) &&
- !reuse_swap_page(page, NULL)) {
- unlock_page(page);
- result = SCAN_SWAP_CACHE_PAGE;
- goto out;
- }
+ if (!pte_write(pteval) && PageSwapCache(page) &&
+ !reuse_swap_page(page, NULL)) {
/*
- * Page is not in the swap cache. It can be collapsed
- * into a THP.
+ * Page is in the swap cache and cannot be re-used.
+ * It cannot be collapsed into a THP.
*/
+ unlock_page(page);
+ result = SCAN_SWAP_CACHE_PAGE;
+ goto out;
}
/*
@@ -611,16 +683,23 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
result = SCAN_DEL_PAGE_LRU;
goto out;
}
- inc_node_page_state(page,
- NR_ISOLATED_ANON + page_is_file_lru(page));
+ mod_node_page_state(page_pgdat(page),
+ NR_ISOLATED_ANON + page_is_file_lru(page),
+ compound_nr(page));
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageLRU(page), page);
+ if (PageCompound(page))
+ list_add_tail(&page->lru, compound_pagelist);
+next:
/* There should be enough young pte to collapse the page */
if (pte_young(pteval) ||
page_is_young(page) || PageReferenced(page) ||
mmu_notifier_test_young(vma->vm_mm, address))
referenced++;
+
+ if (pte_write(pteval))
+ writable = true;
}
if (likely(writable)) {
if (likely(referenced)) {
@@ -634,7 +713,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
}
out:
- release_pte_pages(pte, _pte);
+ release_pte_pages(pte, _pte, compound_pagelist);
trace_mm_collapse_huge_page_isolate(page, none_or_zero,
referenced, writable, result);
return 0;
@@ -643,13 +722,14 @@ out:
static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
struct vm_area_struct *vma,
unsigned long address,
- spinlock_t *ptl)
+ spinlock_t *ptl,
+ struct list_head *compound_pagelist)
{
+ struct page *src_page, *tmp;
pte_t *_pte;
for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
_pte++, page++, address += PAGE_SIZE) {
pte_t pteval = *_pte;
- struct page *src_page;
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
clear_user_highpage(page, address);
@@ -669,8 +749,8 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
} else {
src_page = pte_page(pteval);
copy_user_highpage(page, src_page, address, vma);
- VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
- release_pte_page(src_page);
+ if (!PageCompound(src_page))
+ release_pte_page(src_page);
/*
* ptl mostly unnecessary, but preempt has to
* be disabled to update the per-cpu stats
@@ -687,6 +767,11 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
free_page_and_swap_cache(src_page);
}
}
+
+ list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) {
+ list_del(&src_page->lru);
+ release_pte_page(src_page);
+ }
}
static void khugepaged_alloc_sleep(void)
@@ -899,11 +984,6 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
.pgoff = linear_page_index(vma, address),
};
- /* we only decide to swapin, if there is enough young ptes */
- if (referenced < HPAGE_PMD_NR/2) {
- trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
- return false;
- }
vmf.pte = pte_offset_map(pmd, address);
for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE;
vmf.pte++, vmf.address += PAGE_SIZE) {
@@ -936,6 +1016,11 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
}
vmf.pte--;
pte_unmap(vmf.pte);
+
+ /* Drain LRU add pagevec to remove extra pin on the swapped-in pages */
+ if (swapped_in)
+ lru_add_drain();
+
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1);
return true;
}
@@ -943,15 +1028,15 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
static void collapse_huge_page(struct mm_struct *mm,
unsigned long address,
struct page **hpage,
- int node, int referenced)
+ int node, int referenced, int unmapped)
{
+ LIST_HEAD(compound_pagelist);
pmd_t *pmd, _pmd;
pte_t *pte;
pgtable_t pgtable;
struct page *new_page;
spinlock_t *pmd_ptl, *pte_ptl;
int isolated = 0, result = 0;
- struct mem_cgroup *memcg;
struct vm_area_struct *vma;
struct mmu_notifier_range range;
gfp_t gfp;
@@ -974,15 +1059,15 @@ static void collapse_huge_page(struct mm_struct *mm,
goto out_nolock;
}
- if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
+ if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) {
result = SCAN_CGROUP_CHARGE_FAIL;
goto out_nolock;
}
+ count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
down_read(&mm->mmap_sem);
result = hugepage_vma_revalidate(mm, address, &vma);
if (result) {
- mem_cgroup_cancel_charge(new_page, memcg, true);
up_read(&mm->mmap_sem);
goto out_nolock;
}
@@ -990,7 +1075,6 @@ static void collapse_huge_page(struct mm_struct *mm,
pmd = mm_find_pmd(mm, address);
if (!pmd) {
result = SCAN_PMD_NULL;
- mem_cgroup_cancel_charge(new_page, memcg, true);
up_read(&mm->mmap_sem);
goto out_nolock;
}
@@ -1000,8 +1084,8 @@ static void collapse_huge_page(struct mm_struct *mm,
* If it fails, we release mmap_sem and jump out_nolock.
* Continuing to collapse causes inconsistency.
*/
- if (!__collapse_huge_page_swapin(mm, vma, address, pmd, referenced)) {
- mem_cgroup_cancel_charge(new_page, memcg, true);
+ if (unmapped && !__collapse_huge_page_swapin(mm, vma, address,
+ pmd, referenced)) {
up_read(&mm->mmap_sem);
goto out_nolock;
}
@@ -1044,7 +1128,8 @@ static void collapse_huge_page(struct mm_struct *mm,
mmu_notifier_invalidate_range_end(&range);
spin_lock(pte_ptl);
- isolated = __collapse_huge_page_isolate(vma, address, pte);
+ isolated = __collapse_huge_page_isolate(vma, address, pte,
+ &compound_pagelist);
spin_unlock(pte_ptl);
if (unlikely(!isolated)) {
@@ -1069,7 +1154,8 @@ static void collapse_huge_page(struct mm_struct *mm,
*/
anon_vma_unlock_write(vma->anon_vma);
- __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl);
+ __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl,
+ &compound_pagelist);
pte_unmap(pte);
__SetPageUptodate(new_page);
pgtable = pmd_pgtable(_pmd);
@@ -1087,8 +1173,6 @@ static void collapse_huge_page(struct mm_struct *mm,
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
page_add_new_anon_rmap(new_page, vma, address, true);
- mem_cgroup_commit_charge(new_page, memcg, false, true);
- count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1);
lru_cache_add_active_or_unevictable(new_page, vma);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
@@ -1102,10 +1186,11 @@ static void collapse_huge_page(struct mm_struct *mm,
out_up_write:
up_write(&mm->mmap_sem);
out_nolock:
+ if (!IS_ERR_OR_NULL(*hpage))
+ mem_cgroup_uncharge(*hpage);
trace_mm_collapse_huge_page(mm, isolated, result);
return;
out:
- mem_cgroup_cancel_charge(new_page, memcg, true);
goto out_up_write;
}
@@ -1116,7 +1201,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
{
pmd_t *pmd;
pte_t *pte, *_pte;
- int ret = 0, none_or_zero = 0, result = 0, referenced = 0;
+ int ret = 0, result = 0, referenced = 0;
+ int none_or_zero = 0, shared = 0;
struct page *page = NULL;
unsigned long _address;
spinlock_t *ptl;
@@ -1188,12 +1274,14 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
goto out_unmap;
}
- /* TODO: teach khugepaged to collapse THP mapped with pte */
- if (PageCompound(page)) {
- result = SCAN_PAGE_COMPOUND;
+ if (page_mapcount(page) > 1 &&
+ ++shared > khugepaged_max_ptes_shared) {
+ result = SCAN_EXCEED_SHARED_PTE;
goto out_unmap;
}
+ page = compound_head(page);
+
/*
* Record which node the original page is from and save this
* information to khugepaged_node_load[].
@@ -1220,11 +1308,23 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
}
/*
- * cannot use mapcount: can't collapse if there's a gup pin.
- * The page must only be referenced by the scanned process
- * and page swap cache.
+ * Check if the page has any GUP (or other external) pins.
+ *
+ * Here the check is racy; it may see total_mapcount > refcount
+ * in some cases.
+ * For example, take one process with one forked child process.
+ * The parent has the PMD split due to MADV_DONTNEED, then
+ * the child is trying to unmap the whole PMD, but khugepaged
+ * may be scanning the parent between the child clearing the
+ * PageDoubleMap flag and decrementing the mapcount. So
+ * khugepaged may see total_mapcount > refcount.
+ *
+ * But such a case is ephemeral; we could always retry the
+ * collapse later. However, it may report a false positive if
+ * the page has excessive GUP pins (e.g. 512). Anyway, the same
+ * check will be done again later, so the risk seems low.
*/
- if (page_count(page) != 1 + PageSwapCache(page)) {
+ if (!is_refcount_suitable(page)) {
result = SCAN_PAGE_COUNT;
goto out_unmap;
}
@@ -1233,22 +1333,21 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
mmu_notifier_test_young(vma->vm_mm, address))
referenced++;
}
- if (writable) {
- if (referenced) {
- result = SCAN_SUCCEED;
- ret = 1;
- } else {
- result = SCAN_LACK_REFERENCED_PAGE;
- }
- } else {
+ if (!writable) {
result = SCAN_PAGE_RO;
+ } else if (!referenced || (unmapped && referenced < HPAGE_PMD_NR/2)) {
+ result = SCAN_LACK_REFERENCED_PAGE;
+ } else {
+ result = SCAN_SUCCEED;
+ ret = 1;
}
out_unmap:
pte_unmap_unlock(pte, ptl);
if (ret) {
node = khugepaged_find_target_node();
/* collapse_huge_page will return with the mmap_sem released */
- collapse_huge_page(mm, address, hpage, node, referenced);
+ collapse_huge_page(mm, address, hpage, node,
+ referenced, unmapped);
}
out:
trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
@@ -1515,7 +1614,6 @@ static void collapse_file(struct mm_struct *mm,
struct address_space *mapping = file->f_mapping;
gfp_t gfp;
struct page *new_page;
- struct mem_cgroup *memcg;
pgoff_t index, end = start + HPAGE_PMD_NR;
LIST_HEAD(pagelist);
XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
@@ -1534,10 +1632,11 @@ static void collapse_file(struct mm_struct *mm,
goto out;
}
- if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
+ if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) {
result = SCAN_CGROUP_CHARGE_FAIL;
goto out;
}
+ count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
/* This will be less messy when we use multi-index entries */
do {
@@ -1547,7 +1646,6 @@ static void collapse_file(struct mm_struct *mm,
break;
xas_unlock_irq(&xas);
if (!xas_nomem(&xas, GFP_KERNEL)) {
- mem_cgroup_cancel_charge(new_page, memcg, true);
result = SCAN_FAIL;
goto out;
}
@@ -1692,6 +1790,7 @@ static void collapse_file(struct mm_struct *mm,
if (page_has_private(page) &&
!try_to_release_page(page, GFP_KERNEL)) {
result = SCAN_PAGE_HAS_PRIVATE;
+ putback_lru_page(page);
goto out_unlock;
}
@@ -1740,12 +1839,9 @@ out_unlock:
}
if (nr_none) {
- struct zone *zone = page_zone(new_page);
-
- __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none);
+ __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none);
if (is_shmem)
- __mod_node_page_state(zone->zone_pgdat,
- NR_SHMEM, nr_none);
+ __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
}
xa_locked:
@@ -1783,15 +1879,9 @@ xa_unlocked:
SetPageUptodate(new_page);
page_ref_add(new_page, HPAGE_PMD_NR - 1);
- mem_cgroup_commit_charge(new_page, memcg, false, true);
-
- if (is_shmem) {
+ if (is_shmem)
set_page_dirty(new_page);
- lru_cache_add_anon(new_page);
- } else {
- lru_cache_add_file(new_page);
- }
- count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1);
+ lru_cache_add(new_page);
/*
* Remove pte page tables, so we can re-fault the page as huge.
@@ -1838,13 +1928,14 @@ xa_unlocked:
VM_BUG_ON(nr_none);
xas_unlock_irq(&xas);
- mem_cgroup_cancel_charge(new_page, memcg, true);
new_page->mapping = NULL;
}
unlock_page(new_page);
out:
VM_BUG_ON(!list_empty(&pagelist));
+ if (!IS_ERR_OR_NULL(*hpage))
+ mem_cgroup_uncharge(*hpage);
/* TODO: tracepoints */
}
@@ -2083,6 +2174,8 @@ static void khugepaged_do_scan(void)
barrier(); /* write khugepaged_pages_to_scan to local stack */
+ lru_add_drain_all();
+
while (progress < pages) {
if (!khugepaged_prealloc_page(&hpage, &wait))
break;
diff --git a/mm/ksm.c b/mm/ksm.c
index 281c00129a2e..18c5d005bd01 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -612,7 +612,7 @@ static struct stable_node *alloc_stable_node_chain(struct stable_node *dup,
* Move the old stable node to the second dimension
* queued in the hlist_dup. The invariant is that all
* dup stable_nodes in the chain->hlist point to pages
- * that are wrprotected and have the exact same
+ * that are write protected and have the exact same
* content.
*/
stable_node_chain_add_dup(dup, chain);
@@ -1148,7 +1148,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
/*
* No need to check ksm_use_zero_pages here: we can only have a
- * zero_page here if ksm_use_zero_pages was enabled alreaady.
+ * zero_page here if ksm_use_zero_pages was enabled already.
*/
if (!is_zero_pfn(page_to_pfn(kpage))) {
get_page(kpage);
@@ -1608,7 +1608,7 @@ again:
* continue. All KSM pages belonging to the
* stable_node dups in a stable_node chain
* have the same content and they're
- * wrprotected at all times. Any will work
+ * write protected at all times. Any will work
* fine to continue the walk.
*/
tree_page = get_ksm_page(stable_node_any,
@@ -1843,7 +1843,7 @@ again:
* continue. All KSM pages belonging to the
* stable_node dups in a stable_node chain
* have the same content and they're
- * wrprotected at all times. Any will work
+ * write protected at all times. Any will work
* fine to continue the walk.
*/
tree_page = get_ksm_page(stable_node_any,
@@ -2001,7 +2001,7 @@ static void stable_tree_append(struct rmap_item *rmap_item,
* duplicate. page_migration could break later if rmap breaks,
* so we can as well crash here. We really need to check for
* rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check
- * for other negative values as an undeflow if detected here
+ * for other negative values as an underflow if detected here
* for the first time (and not when decreasing rmap_hlist_len)
* would be sign of memory corruption in the stable_node.
*/
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 4d5294c39bba..9222910ab1cb 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -213,7 +213,7 @@ restart:
/*
* decrement nr_to_walk first so that we don't livelock if we
- * get stuck on large numbesr of LRU_RETRY items
+ * get stuck on large numbers of LRU_RETRY items
*/
if (!*nr_to_walk)
break;
diff --git a/mm/memblock.c b/mm/memblock.c
index c79ba6f9920c..39aceafc57f6 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -78,7 +78,7 @@
* * memblock_alloc*() - these functions return the **virtual** address
* of the allocated memory.
*
- * Note, that both API variants use implict assumptions about allowed
+ * Note, that both API variants use implicit assumptions about allowed
* memory ranges and the fallback methods. Consult the documentation
* of memblock_alloc_internal() and memblock_alloc_range_nid()
* functions for more elaborate description.
@@ -620,7 +620,7 @@ repeat:
* area, insert that portion.
*/
if (rbase > base) {
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+#ifdef CONFIG_NEED_MULTIPLE_NODES
WARN_ON(nid != memblock_get_region_node(rgn));
#endif
WARN_ON(flags != rgn->flags);
@@ -1197,7 +1197,6 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
*idx = ULLONG_MAX;
}
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
/*
* Common iterator interface used to define for_each_mem_pfn_range().
*/
@@ -1207,13 +1206,15 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
{
struct memblock_type *type = &memblock.memory;
struct memblock_region *r;
+ int r_nid;
while (++*idx < type->cnt) {
r = &type->regions[*idx];
+ r_nid = memblock_get_region_node(r);
if (PFN_UP(r->base) >= PFN_DOWN(r->base + r->size))
continue;
- if (nid == MAX_NUMNODES || nid == r->nid)
+ if (nid == MAX_NUMNODES || nid == r_nid)
break;
}
if (*idx >= type->cnt) {
@@ -1226,7 +1227,7 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
if (out_end_pfn)
*out_end_pfn = PFN_DOWN(r->base + r->size);
if (out_nid)
- *out_nid = r->nid;
+ *out_nid = r_nid;
}
/**
@@ -1245,6 +1246,7 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
struct memblock_type *type, int nid)
{
+#ifdef CONFIG_NEED_MULTIPLE_NODES
int start_rgn, end_rgn;
int i, ret;
@@ -1256,9 +1258,10 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
memblock_set_region_node(&type->regions[i], nid);
memblock_merge_regions(type);
+#endif
return 0;
}
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/**
* __next_mem_pfn_range_in_zone - iterator for for_each_*_range_in_zone()
@@ -1797,7 +1800,6 @@ bool __init_memblock memblock_is_map_memory(phys_addr_t addr)
return !memblock_is_nomap(&memblock.memory.regions[i]);
}
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
int __init_memblock memblock_search_pfn_nid(unsigned long pfn,
unsigned long *start_pfn, unsigned long *end_pfn)
{
@@ -1810,9 +1812,8 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn,
*start_pfn = PFN_DOWN(type->regions[mid].base);
*end_pfn = PFN_DOWN(type->regions[mid].base + type->regions[mid].size);
- return type->regions[mid].nid;
+ return memblock_get_region_node(&type->regions[mid]);
}
-#endif
/**
* memblock_is_region_memory - check if a region is a subset of memory
@@ -1903,7 +1904,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type)
size = rgn->size;
end = base + size - 1;
flags = rgn->flags;
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+#ifdef CONFIG_NEED_MULTIPLE_NODES
if (memblock_get_region_node(rgn) != MAX_NUMNODES)
snprintf(nid_buf, sizeof(nid_buf), " on node %d",
memblock_get_region_node(rgn));
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a3b97f103966..d04f1e242d47 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -83,9 +83,9 @@ static bool cgroup_memory_nokmem;
/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
-int do_swap_account __read_mostly;
+bool cgroup_memory_noswap __read_mostly;
#else
-#define do_swap_account 0
+#define cgroup_memory_noswap 1
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -95,7 +95,7 @@ static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
- return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
+ return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
}
#define THRESHOLDS_EVENTS_TARGET 128
@@ -834,25 +834,8 @@ static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
struct page *page,
- bool compound, int nr_pages)
+ int nr_pages)
{
- /*
- * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
- * counted as CACHE even if it's on ANON LRU.
- */
- if (PageAnon(page))
- __mod_memcg_state(memcg, MEMCG_RSS, nr_pages);
- else {
- __mod_memcg_state(memcg, MEMCG_CACHE, nr_pages);
- if (PageSwapBacked(page))
- __mod_memcg_state(memcg, NR_SHMEM, nr_pages);
- }
-
- if (compound) {
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- __mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages);
- }
-
/* pagein of a big page is an event. So, ignore page size */
if (nr_pages > 0)
__count_memcg_events(memcg, PGPGIN, 1);
@@ -1218,9 +1201,8 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
* @page: the page
* @pgdat: pgdat of the page
*
- * This function is only safe when following the LRU page isolation
- * and putback protocol: the LRU lock must be held, and the page must
- * either be PageLRU() or the caller must have isolated/allocated it.
+ * This function relies on page->mem_cgroup being stable - see the
+ * access rules in commit_charge().
*/
struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
{
@@ -1314,7 +1296,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
if (do_memsw_account()) {
count = page_counter_read(&memcg->memsw);
limit = READ_ONCE(memcg->memsw.max);
- if (count <= limit)
+ if (count < limit)
margin = min(margin, limit - count);
else
margin = 0;
@@ -1389,10 +1371,10 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
*/
seq_buf_printf(&s, "anon %llu\n",
- (u64)memcg_page_state(memcg, MEMCG_RSS) *
+ (u64)memcg_page_state(memcg, NR_ANON_MAPPED) *
PAGE_SIZE);
seq_buf_printf(&s, "file %llu\n",
- (u64)memcg_page_state(memcg, MEMCG_CACHE) *
+ (u64)memcg_page_state(memcg, NR_FILE_PAGES) *
PAGE_SIZE);
seq_buf_printf(&s, "kernel_stack %llu\n",
(u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) *
@@ -1418,15 +1400,11 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
(u64)memcg_page_state(memcg, NR_WRITEBACK) *
PAGE_SIZE);
- /*
- * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
- * with the NR_ANON_THP vm counter, but right now it's a pain in the
- * arse because it requires migrating the work out of rmap to a place
- * where the page->mem_cgroup is set up and stable.
- */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
seq_buf_printf(&s, "anon_thp %llu\n",
- (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) *
- PAGE_SIZE);
+ (u64)memcg_page_state(memcg, NR_ANON_THPS) *
+ HPAGE_PMD_SIZE);
+#endif
for (i = 0; i < NR_LRU_LISTS; i++)
seq_buf_printf(&s, "%s %llu\n", lru_list_name(i),
@@ -1451,6 +1429,8 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
memcg_page_state(memcg, WORKINGSET_REFAULT));
seq_buf_printf(&s, "workingset_activate %lu\n",
memcg_page_state(memcg, WORKINGSET_ACTIVATE));
+ seq_buf_printf(&s, "workingset_restore %lu\n",
+ memcg_page_state(memcg, WORKINGSET_RESTORE));
seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
@@ -1979,6 +1959,7 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
*/
struct mem_cgroup *lock_page_memcg(struct page *page)
{
+ struct page *head = compound_head(page); /* rmap on tail pages */
struct mem_cgroup *memcg;
unsigned long flags;
@@ -1998,7 +1979,7 @@ struct mem_cgroup *lock_page_memcg(struct page *page)
if (mem_cgroup_disabled())
return NULL;
again:
- memcg = page->mem_cgroup;
+ memcg = head->mem_cgroup;
if (unlikely(!memcg))
return NULL;
@@ -2006,7 +1987,7 @@ again:
return memcg;
spin_lock_irqsave(&memcg->move_lock, flags);
- if (memcg != page->mem_cgroup) {
+ if (memcg != head->mem_cgroup) {
spin_unlock_irqrestore(&memcg->move_lock, flags);
goto again;
}
@@ -2049,7 +2030,9 @@ void __unlock_page_memcg(struct mem_cgroup *memcg)
*/
void unlock_page_memcg(struct page *page)
{
- __unlock_page_memcg(page->mem_cgroup);
+ struct page *head = compound_head(page);
+
+ __unlock_page_memcg(head->mem_cgroup);
}
EXPORT_SYMBOL(unlock_page_memcg);
@@ -2621,69 +2604,18 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
css_put_many(&memcg->css, nr_pages);
}
-static void lock_page_lru(struct page *page, int *isolated)
-{
- pg_data_t *pgdat = page_pgdat(page);
-
- spin_lock_irq(&pgdat->lru_lock);
- if (PageLRU(page)) {
- struct lruvec *lruvec;
-
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
- ClearPageLRU(page);
- del_page_from_lru_list(page, lruvec, page_lru(page));
- *isolated = 1;
- } else
- *isolated = 0;
-}
-
-static void unlock_page_lru(struct page *page, int isolated)
-{
- pg_data_t *pgdat = page_pgdat(page);
-
- if (isolated) {
- struct lruvec *lruvec;
-
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
- VM_BUG_ON_PAGE(PageLRU(page), page);
- SetPageLRU(page);
- add_page_to_lru_list(page, lruvec, page_lru(page));
- }
- spin_unlock_irq(&pgdat->lru_lock);
-}
-
-static void commit_charge(struct page *page, struct mem_cgroup *memcg,
- bool lrucare)
+static void commit_charge(struct page *page, struct mem_cgroup *memcg)
{
- int isolated;
-
VM_BUG_ON_PAGE(page->mem_cgroup, page);
-
/*
- * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
- * may already be on some other mem_cgroup's LRU. Take care of it.
- */
- if (lrucare)
- lock_page_lru(page, &isolated);
-
- /*
- * Nobody should be changing or seriously looking at
- * page->mem_cgroup at this point:
- *
- * - the page is uncharged
- *
- * - the page is off-LRU
+ * Any of the following ensures page->mem_cgroup stability:
*
- * - an anonymous fault has exclusive page access, except for
- * a locked page table
- *
- * - a page cache insertion, a swapin fault, or a migration
- * have the page locked
+ * - the page lock
+ * - LRU isolation
+ * - lock_page_memcg()
+ * - exclusive reference
*/
page->mem_cgroup = memcg;
-
- if (lrucare)
- unlock_page_lru(page, isolated);
}
#ifdef CONFIG_MEMCG_KMEM
@@ -3015,8 +2947,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
for (i = 1; i < HPAGE_PMD_NR; i++)
head[i].mem_cgroup = head->mem_cgroup;
-
- __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -3201,7 +3131,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
* Test whether @memcg has children, dead or alive. Note that this
* function doesn't care whether @memcg has use_hierarchy enabled and
* returns %true if there are child csses according to the cgroup
- * hierarchy. Testing use_hierarchy is the caller's responsiblity.
+ * hierarchy. Testing use_hierarchy is the caller's responsibility.
*/
static inline bool memcg_has_children(struct mem_cgroup *memcg)
{
@@ -3299,8 +3229,8 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
unsigned long val;
if (mem_cgroup_is_root(memcg)) {
- val = memcg_page_state(memcg, MEMCG_CACHE) +
- memcg_page_state(memcg, MEMCG_RSS);
+ val = memcg_page_state(memcg, NR_FILE_PAGES) +
+ memcg_page_state(memcg, NR_ANON_MAPPED);
if (swap)
val += memcg_page_state(memcg, MEMCG_SWAP);
} else {
@@ -3688,7 +3618,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
- int nid, unsigned int lru_mask)
+ int nid, unsigned int lru_mask, bool tree)
{
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
unsigned long nr = 0;
@@ -3699,13 +3629,17 @@ static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
for_each_lru(lru) {
if (!(BIT(lru) & lru_mask))
continue;
- nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
+ if (tree)
+ nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
+ else
+ nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
}
return nr;
}
static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
- unsigned int lru_mask)
+ unsigned int lru_mask,
+ bool tree)
{
unsigned long nr = 0;
enum lru_list lru;
@@ -3713,7 +3647,10 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
for_each_lru(lru) {
if (!(BIT(lru) & lru_mask))
continue;
- nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
+ if (tree)
+ nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
+ else
+ nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
}
return nr;
}
@@ -3733,34 +3670,28 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
};
const struct numa_stat *stat;
int nid;
- unsigned long nr;
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
- nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
- seq_printf(m, "%s=%lu", stat->name, nr);
- for_each_node_state(nid, N_MEMORY) {
- nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
- stat->lru_mask);
- seq_printf(m, " N%d=%lu", nid, nr);
- }
+ seq_printf(m, "%s=%lu", stat->name,
+ mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
+ false));
+ for_each_node_state(nid, N_MEMORY)
+ seq_printf(m, " N%d=%lu", nid,
+ mem_cgroup_node_nr_lru_pages(memcg, nid,
+ stat->lru_mask, false));
seq_putc(m, '\n');
}
for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
- struct mem_cgroup *iter;
-
- nr = 0;
- for_each_mem_cgroup_tree(iter, memcg)
- nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
- seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
- for_each_node_state(nid, N_MEMORY) {
- nr = 0;
- for_each_mem_cgroup_tree(iter, memcg)
- nr += mem_cgroup_node_nr_lru_pages(
- iter, nid, stat->lru_mask);
- seq_printf(m, " N%d=%lu", nid, nr);
- }
+
+ seq_printf(m, "hierarchical_%s=%lu", stat->name,
+ mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
+ true));
+ for_each_node_state(nid, N_MEMORY)
+ seq_printf(m, " N%d=%lu", nid,
+ mem_cgroup_node_nr_lru_pages(memcg, nid,
+ stat->lru_mask, true));
seq_putc(m, '\n');
}
@@ -3769,9 +3700,11 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
#endif /* CONFIG_NUMA */
static const unsigned int memcg1_stats[] = {
- MEMCG_CACHE,
- MEMCG_RSS,
- MEMCG_RSS_HUGE,
+ NR_FILE_PAGES,
+ NR_ANON_MAPPED,
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ NR_ANON_THPS,
+#endif
NR_SHMEM,
NR_FILE_MAPPED,
NR_FILE_DIRTY,
@@ -3782,7 +3715,9 @@ static const unsigned int memcg1_stats[] = {
static const char *const memcg1_stat_names[] = {
"cache",
"rss",
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
"rss_huge",
+#endif
"shmem",
"mapped_file",
"dirty",
@@ -3808,11 +3743,16 @@ static int memcg_stat_show(struct seq_file *m, void *v)
BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
+ unsigned long nr;
+
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
- seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
- memcg_page_state_local(memcg, memcg1_stats[i]) *
- PAGE_SIZE);
+ nr = memcg_page_state_local(memcg, memcg1_stats[i]);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (memcg1_stats[i] == NR_ANON_THPS)
+ nr *= HPAGE_PMD_NR;
+#endif
+ seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
@@ -3858,23 +3798,17 @@ static int memcg_stat_show(struct seq_file *m, void *v)
{
pg_data_t *pgdat;
struct mem_cgroup_per_node *mz;
- struct zone_reclaim_stat *rstat;
- unsigned long recent_rotated[2] = {0, 0};
- unsigned long recent_scanned[2] = {0, 0};
+ unsigned long anon_cost = 0;
+ unsigned long file_cost = 0;
for_each_online_pgdat(pgdat) {
mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
- rstat = &mz->lruvec.reclaim_stat;
- recent_rotated[0] += rstat->recent_rotated[0];
- recent_rotated[1] += rstat->recent_rotated[1];
- recent_scanned[0] += rstat->recent_scanned[0];
- recent_scanned[1] += rstat->recent_scanned[1];
+ anon_cost += mz->lruvec.anon_cost;
+ file_cost += mz->lruvec.file_cost;
}
- seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
- seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
- seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
- seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
+ seq_printf(m, "anon_cost %lu\n", anon_cost);
+ seq_printf(m, "file_cost %lu\n", file_cost);
}
#endif
@@ -4850,7 +4784,7 @@ static struct cftype mem_cgroup_legacy_files[] = {
* limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
* memory-controlled cgroups to 64k.
*
- * However, there usually are many references to the oflline CSS after
+ * However, there usually are many references to the offline CSS after
* the cgroup has been destroyed, such as page cache or reclaimable
* slab objects, that don't need to hang on to the ID. We want to keep
* those dead CSS from occupying IDs, or we might quickly exhaust the
@@ -5308,8 +5242,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
* we call find_get_page() with swapper_space directly.
*/
page = find_get_page(swap_address_space(ent), swp_offset(ent));
- if (do_memsw_account())
- entry->val = ent.val;
+ entry->val = ent.val;
return page;
}
@@ -5343,8 +5276,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
page = find_get_entry(mapping, pgoff);
if (xa_is_value(page)) {
swp_entry_t swp = radix_to_swp_entry(page);
- if (do_memsw_account())
- *entry = swp;
+ *entry = swp;
page = find_get_page(swap_address_space(swp),
swp_offset(swp));
}
@@ -5375,10 +5307,8 @@ static int mem_cgroup_move_account(struct page *page,
{
struct lruvec *from_vec, *to_vec;
struct pglist_data *pgdat;
- unsigned long flags;
unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
int ret;
- bool anon;
VM_BUG_ON(from == to);
VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -5396,30 +5326,47 @@ static int mem_cgroup_move_account(struct page *page,
if (page->mem_cgroup != from)
goto out_unlock;
- anon = PageAnon(page);
-
pgdat = page_pgdat(page);
from_vec = mem_cgroup_lruvec(from, pgdat);
to_vec = mem_cgroup_lruvec(to, pgdat);
- spin_lock_irqsave(&from->move_lock, flags);
+ lock_page_memcg(page);
- if (!anon && page_mapped(page)) {
- __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
- __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
- }
+ if (PageAnon(page)) {
+ if (page_mapped(page)) {
+ __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
+ if (PageTransHuge(page)) {
+ __mod_lruvec_state(from_vec, NR_ANON_THPS,
+ -nr_pages);
+ __mod_lruvec_state(to_vec, NR_ANON_THPS,
+ nr_pages);
+ }
- /*
- * move_lock grabbed above and caller set from->moving_account, so
- * mod_memcg_page_state will serialize updates to PageDirty.
- * So mapping should be stable for dirty pages.
- */
- if (!anon && PageDirty(page)) {
- struct address_space *mapping = page_mapping(page);
+ }
+ } else {
+ __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
+
+ if (PageSwapBacked(page)) {
+ __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
+ }
- if (mapping_cap_account_dirty(mapping)) {
- __mod_lruvec_state(from_vec, NR_FILE_DIRTY, -nr_pages);
- __mod_lruvec_state(to_vec, NR_FILE_DIRTY, nr_pages);
+ if (page_mapped(page)) {
+ __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
+ }
+
+ if (PageDirty(page)) {
+ struct address_space *mapping = page_mapping(page);
+
+ if (mapping_cap_account_dirty(mapping)) {
+ __mod_lruvec_state(from_vec, NR_FILE_DIRTY,
+ -nr_pages);
+ __mod_lruvec_state(to_vec, NR_FILE_DIRTY,
+ nr_pages);
+ }
}
}
@@ -5429,22 +5376,30 @@ static int mem_cgroup_move_account(struct page *page,
}
/*
+ * All state has been migrated, let's switch to the new memcg.
+ *
* It is safe to change page->mem_cgroup here because the page
- * is referenced, charged, and isolated - we can't race with
- * uncharging, charging, migration, or LRU putback.
+ * is referenced, charged, isolated, and locked: we can't race
+ * with (un)charging, migration, LRU putback, or anything else
+ * that would rely on a stable page->mem_cgroup.
+ *
+ * Note that lock_page_memcg is a memcg lock, not a page lock,
+ * to save space. As soon as we switch page->mem_cgroup to a
+ * new memcg that isn't locked, the above state can change
+ * concurrently again. Make sure we're truly done with it.
*/
+ smp_mb();
- /* caller should have done css_get */
- page->mem_cgroup = to;
+ page->mem_cgroup = to; /* caller should have done css_get */
- spin_unlock_irqrestore(&from->move_lock, flags);
+ __unlock_page_memcg(from);
ret = 0;
local_irq_disable();
- mem_cgroup_charge_statistics(to, page, compound, nr_pages);
+ mem_cgroup_charge_statistics(to, page, nr_pages);
memcg_check_events(to, page);
- mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
+ mem_cgroup_charge_statistics(from, page, -nr_pages);
memcg_check_events(from, page);
local_irq_enable();
out_unlock:
@@ -6227,7 +6182,6 @@ static struct cftype memory_files[] = {
},
{
.name = "stat",
- .flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_stat_show,
},
{
@@ -6430,125 +6384,63 @@ out:
}
/**
- * mem_cgroup_try_charge - try charging a page
+ * mem_cgroup_charge - charge a newly allocated page to a cgroup
* @page: page to charge
* @mm: mm context of the victim
* @gfp_mask: reclaim mode
- * @memcgp: charged memcg return
- * @compound: charge the page as compound or small page
*
* Try to charge @page to the memcg that @mm belongs to, reclaiming
* pages according to @gfp_mask if necessary.
*
- * Returns 0 on success, with *@memcgp pointing to the charged memcg.
- * Otherwise, an error code is returned.
- *
- * After page->mapping has been set up, the caller must finalize the
- * charge with mem_cgroup_commit_charge(). Or abort the transaction
- * with mem_cgroup_cancel_charge() in case page instantiation fails.
+ * Returns 0 on success. Otherwise, an error code is returned.
*/
-int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, struct mem_cgroup **memcgp,
- bool compound)
+int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
{
+ unsigned int nr_pages = hpage_nr_pages(page);
struct mem_cgroup *memcg = NULL;
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
int ret = 0;
if (mem_cgroup_disabled())
goto out;
if (PageSwapCache(page)) {
+ swp_entry_t ent = { .val = page_private(page), };
+ unsigned short id;
+
/*
* Every swap fault against a single page tries to charge the
* page, bail as early as possible. shmem_unuse() encounters
- * already charged pages, too. The USED bit is protected by
- * the page lock, which serializes swap cache removal, which
+ * already charged pages, too. page->mem_cgroup is protected
+ * by the page lock, which serializes swap cache removal, which
* in turn serializes uncharging.
*/
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (compound_head(page)->mem_cgroup)
goto out;
- if (do_swap_account) {
- swp_entry_t ent = { .val = page_private(page), };
- unsigned short id = lookup_swap_cgroup_id(ent);
-
- rcu_read_lock();
- memcg = mem_cgroup_from_id(id);
- if (memcg && !css_tryget_online(&memcg->css))
- memcg = NULL;
- rcu_read_unlock();
- }
+ id = lookup_swap_cgroup_id(ent);
+ rcu_read_lock();
+ memcg = mem_cgroup_from_id(id);
+ if (memcg && !css_tryget_online(&memcg->css))
+ memcg = NULL;
+ rcu_read_unlock();
}
if (!memcg)
memcg = get_mem_cgroup_from_mm(mm);
ret = try_charge(memcg, gfp_mask, nr_pages);
+ if (ret)
+ goto out_put;
- css_put(&memcg->css);
-out:
- *memcgp = memcg;
- return ret;
-}
-
-int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, struct mem_cgroup **memcgp,
- bool compound)
-{
- struct mem_cgroup *memcg;
- int ret;
-
- ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound);
- memcg = *memcgp;
- mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask);
- return ret;
-}
-
-/**
- * mem_cgroup_commit_charge - commit a page charge
- * @page: page to charge
- * @memcg: memcg to charge the page to
- * @lrucare: page might be on LRU already
- * @compound: charge the page as compound or small page
- *
- * Finalize a charge transaction started by mem_cgroup_try_charge(),
- * after page->mapping has been set up. This must happen atomically
- * as part of the page instantiation, i.e. under the page table lock
- * for anonymous pages, under the page lock for page and swap cache.
- *
- * In addition, the page must not be on the LRU during the commit, to
- * prevent racing with task migration. If it might be, use @lrucare.
- *
- * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
- */
-void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
- bool lrucare, bool compound)
-{
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
-
- VM_BUG_ON_PAGE(!page->mapping, page);
- VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
-
- if (mem_cgroup_disabled())
- return;
- /*
- * Swap faults will attempt to charge the same page multiple
- * times. But reuse_swap_page() might have removed the page
- * from swapcache already, so we can't check PageSwapCache().
- */
- if (!memcg)
- return;
-
- commit_charge(page, memcg, lrucare);
+ commit_charge(page, memcg);
local_irq_disable();
- mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
+ mem_cgroup_charge_statistics(memcg, page, nr_pages);
memcg_check_events(memcg, page);
local_irq_enable();
- if (do_memsw_account() && PageSwapCache(page)) {
+ if (PageSwapCache(page)) {
swp_entry_t entry = { .val = page_private(page) };
/*
* The swap entry might not get freed for a long time,
@@ -6557,42 +6449,18 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
*/
mem_cgroup_uncharge_swap(entry, nr_pages);
}
-}
-/**
- * mem_cgroup_cancel_charge - cancel a page charge
- * @page: page to charge
- * @memcg: memcg to charge the page to
- * @compound: charge the page as compound or small page
- *
- * Cancel a charge transaction started by mem_cgroup_try_charge().
- */
-void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
- bool compound)
-{
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
-
- if (mem_cgroup_disabled())
- return;
- /*
- * Swap faults will attempt to charge the same page multiple
- * times. But reuse_swap_page() might have removed the page
- * from swapcache already, so we can't check PageSwapCache().
- */
- if (!memcg)
- return;
-
- cancel_charge(memcg, nr_pages);
+out_put:
+ css_put(&memcg->css);
+out:
+ return ret;
}
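
For reference, a minimal sketch of how a caller uses the consolidated charging API above (the old try/commit/cancel sequence becomes a single call); the helper name and error handling are illustrative, patterned on the fault-path conversions in mm/memory.c further down:

/* Illustrative only: charge a newly allocated anonymous page in one step. */
static vm_fault_t example_charge_new_page(struct vm_fault *vmf,
					   struct page *page)
{
	if (mem_cgroup_charge(page, vmf->vma->vm_mm, GFP_KERNEL)) {
		put_page(page);		/* no cancel_charge step anymore */
		return VM_FAULT_OOM;
	}
	cgroup_throttle_swaprate(page, GFP_KERNEL);

	/* ... set up the PTE, rmap and LRU as the fault handlers do ... */
	return 0;
}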
struct uncharge_gather {
struct mem_cgroup *memcg;
+ unsigned long nr_pages;
unsigned long pgpgout;
- unsigned long nr_anon;
- unsigned long nr_file;
unsigned long nr_kmem;
- unsigned long nr_huge;
- unsigned long nr_shmem;
struct page *dummy_page;
};
@@ -6603,37 +6471,32 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug)
static void uncharge_batch(const struct uncharge_gather *ug)
{
- unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
unsigned long flags;
if (!mem_cgroup_is_root(ug->memcg)) {
- page_counter_uncharge(&ug->memcg->memory, nr_pages);
+ page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
if (do_memsw_account())
- page_counter_uncharge(&ug->memcg->memsw, nr_pages);
+ page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
memcg_oom_recover(ug->memcg);
}
local_irq_save(flags);
- __mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon);
- __mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file);
- __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
- __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem);
__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
- __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, nr_pages);
+ __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages);
memcg_check_events(ug->memcg, ug->dummy_page);
local_irq_restore(flags);
if (!mem_cgroup_is_root(ug->memcg))
- css_put_many(&ug->memcg->css, nr_pages);
+ css_put_many(&ug->memcg->css, ug->nr_pages);
}
static void uncharge_page(struct page *page, struct uncharge_gather *ug)
{
+ unsigned long nr_pages;
+
VM_BUG_ON_PAGE(PageLRU(page), page);
- VM_BUG_ON_PAGE(page_count(page) && !is_zone_device_page(page) &&
- !PageHWPoison(page) , page);
if (!page->mem_cgroup)
return;
@@ -6652,23 +6515,13 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
ug->memcg = page->mem_cgroup;
}
- if (!PageKmemcg(page)) {
- unsigned int nr_pages = 1;
+ nr_pages = compound_nr(page);
+ ug->nr_pages += nr_pages;
- if (PageTransHuge(page)) {
- nr_pages = compound_nr(page);
- ug->nr_huge += nr_pages;
- }
- if (PageAnon(page))
- ug->nr_anon += nr_pages;
- else {
- ug->nr_file += nr_pages;
- if (PageSwapBacked(page))
- ug->nr_shmem += nr_pages;
- }
+ if (!PageKmemcg(page)) {
ug->pgpgout++;
} else {
- ug->nr_kmem += compound_nr(page);
+ ug->nr_kmem += nr_pages;
__ClearPageKmemcg(page);
}
@@ -6705,8 +6558,7 @@ static void uncharge_list(struct list_head *page_list)
* mem_cgroup_uncharge - uncharge a page
* @page: page to uncharge
*
- * Uncharge a page previously charged with mem_cgroup_try_charge() and
- * mem_cgroup_commit_charge().
+ * Uncharge a page previously charged with mem_cgroup_charge().
*/
void mem_cgroup_uncharge(struct page *page)
{
@@ -6729,7 +6581,7 @@ void mem_cgroup_uncharge(struct page *page)
* @page_list: list of pages to uncharge
*
* Uncharge a list of pages previously charged with
- * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
+ * mem_cgroup_charge().
*/
void mem_cgroup_uncharge_list(struct list_head *page_list)
{
@@ -6782,11 +6634,10 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
page_counter_charge(&memcg->memsw, nr_pages);
css_get_many(&memcg->css, nr_pages);
- commit_charge(newpage, memcg, false);
+ commit_charge(newpage, memcg);
local_irq_save(flags);
- mem_cgroup_charge_statistics(memcg, newpage, PageTransHuge(newpage),
- nr_pages);
+ mem_cgroup_charge_statistics(memcg, newpage, nr_pages);
memcg_check_events(memcg, newpage);
local_irq_restore(flags);
}
@@ -6974,7 +6825,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
VM_BUG_ON_PAGE(PageLRU(page), page);
VM_BUG_ON_PAGE(page_count(page), page);
- if (!do_memsw_account())
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
return;
memcg = page->mem_cgroup;
@@ -7003,7 +6854,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
if (!mem_cgroup_is_root(memcg))
page_counter_uncharge(&memcg->memory, nr_entries);
- if (memcg != swap_memcg) {
+ if (!cgroup_memory_noswap && memcg != swap_memcg) {
if (!mem_cgroup_is_root(swap_memcg))
page_counter_charge(&swap_memcg->memsw, nr_entries);
page_counter_uncharge(&memcg->memsw, nr_entries);
@@ -7016,8 +6867,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
* only synchronisation we have for updating the per-CPU variables.
*/
VM_BUG_ON(!irqs_disabled());
- mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
- -nr_entries);
+ mem_cgroup_charge_statistics(memcg, page, -nr_entries);
memcg_check_events(memcg, page);
if (!mem_cgroup_is_root(memcg))
@@ -7040,7 +6890,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
struct mem_cgroup *memcg;
unsigned short oldid;
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return 0;
memcg = page->mem_cgroup;
@@ -7056,7 +6906,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
memcg = mem_cgroup_id_get_online(memcg);
- if (!mem_cgroup_is_root(memcg) &&
+ if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) &&
!page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
memcg_memory_event(memcg, MEMCG_SWAP_MAX);
memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
@@ -7084,14 +6934,11 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
struct mem_cgroup *memcg;
unsigned short id;
- if (!do_swap_account)
- return;
-
id = swap_cgroup_record(entry, 0, nr_pages);
rcu_read_lock();
memcg = mem_cgroup_from_id(id);
if (memcg) {
- if (!mem_cgroup_is_root(memcg)) {
+ if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) {
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
page_counter_uncharge(&memcg->swap, nr_pages);
else
@@ -7107,7 +6954,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
{
long nr_swap_pages = get_nr_swap_pages();
- if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
return nr_swap_pages;
for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
nr_swap_pages = min_t(long, nr_swap_pages,
@@ -7124,7 +6971,7 @@ bool mem_cgroup_swap_full(struct page *page)
if (vm_swap_full())
return true;
- if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
return false;
memcg = page->mem_cgroup;
@@ -7139,22 +6986,15 @@ bool mem_cgroup_swap_full(struct page *page)
return false;
}
-/* for remember boot option*/
-#ifdef CONFIG_MEMCG_SWAP_ENABLED
-static int really_do_swap_account __initdata = 1;
-#else
-static int really_do_swap_account __initdata;
-#endif
-
-static int __init enable_swap_account(char *s)
+static int __init setup_swap_account(char *s)
{
if (!strcmp(s, "1"))
- really_do_swap_account = 1;
+ cgroup_memory_noswap = 0;
else if (!strcmp(s, "0"))
- really_do_swap_account = 0;
+ cgroup_memory_noswap = 1;
return 1;
}
-__setup("swapaccount=", enable_swap_account);
+__setup("swapaccount=", setup_swap_account);
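
With the above, swapaccount= simply seeds cgroup_memory_noswap; the accepted values on the kernel command line (taken from the parser above) are:

	swapaccount=1	account swap  (cgroup_memory_noswap = 0)
	swapaccount=0	do not account swap  (cgroup_memory_noswap = 1)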
static u64 swap_current_read(struct cgroup_subsys_state *css,
struct cftype *cft)
@@ -7220,7 +7060,7 @@ static struct cftype swap_files[] = {
{ } /* terminate */
};
-static struct cftype memsw_cgroup_files[] = {
+static struct cftype memsw_files[] = {
{
.name = "memsw.usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
@@ -7249,13 +7089,16 @@ static struct cftype memsw_cgroup_files[] = {
static int __init mem_cgroup_swap_init(void)
{
- if (!mem_cgroup_disabled() && really_do_swap_account) {
- do_swap_account = 1;
- WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
- swap_files));
- WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
- memsw_cgroup_files));
- }
+ /* No memory control -> no swap control */
+ if (mem_cgroup_disabled())
+ cgroup_memory_noswap = true;
+
+ if (cgroup_memory_noswap)
+ return 0;
+
+ WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
+ WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
+
return 0;
}
subsys_initcall(mem_cgroup_swap_init);
diff --git a/mm/memory.c b/mm/memory.c
index f703fe8c8346..8d52a91d707b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -802,8 +802,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
get_page(page);
page_dup_rmap(page, false);
rss[mm_counter(page)]++;
- } else if (pte_devmap(pte)) {
- page = pte_page(pte);
}
out_set_pte:
@@ -2469,7 +2467,7 @@ static inline bool cow_user_page(struct page *dst, struct page *src,
}
/*
- * The same page can be mapped back since last copy attampt.
+ * The same page can be mapped back since last copy attempt.
* Try to copy again under PTL.
*/
if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
@@ -2647,7 +2645,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
struct page *new_page = NULL;
pte_t entry;
int page_copied = 0;
- struct mem_cgroup *memcg;
struct mmu_notifier_range range;
if (unlikely(anon_vma_prepare(vma)))
@@ -2678,8 +2675,9 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
}
}
- if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false))
+ if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
goto oom_free_new;
+ cgroup_throttle_swaprate(new_page, GFP_KERNEL);
__SetPageUptodate(new_page);
@@ -2713,7 +2711,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
*/
ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
page_add_new_anon_rmap(new_page, vma, vmf->address, false);
- mem_cgroup_commit_charge(new_page, memcg, false, false);
lru_cache_add_active_or_unevictable(new_page, vma);
/*
* We call the notify macro here because, when using secondary
@@ -2751,8 +2748,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
/* Free the old page.. */
new_page = old_page;
page_copied = 1;
- } else {
- mem_cgroup_cancel_charge(new_page, memcg, false);
}
if (new_page)
@@ -3090,7 +3085,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL, *swapcache;
- struct mem_cgroup *memcg;
swp_entry_t entry;
pte_t pte;
int locked;
@@ -3131,10 +3125,21 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
vmf->address);
if (page) {
+ int err;
+
__SetPageLocked(page);
__SetPageSwapBacked(page);
set_page_private(page, entry.val);
- lru_cache_add_anon(page);
+
+ /* Tell memcg to use swap ownership records */
+ SetPageSwapCache(page);
+ err = mem_cgroup_charge(page, vma->vm_mm,
+ GFP_KERNEL);
+ ClearPageSwapCache(page);
+ if (err)
+ goto out_page;
+
+ lru_cache_add(page);
swap_readpage(page, true);
}
} else {
@@ -3195,11 +3200,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
goto out_page;
}
- if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL,
- &memcg, false)) {
- ret = VM_FAULT_OOM;
- goto out_page;
- }
+ cgroup_throttle_swaprate(page, GFP_KERNEL);
/*
* Back out if somebody else already faulted in this pte.
@@ -3247,11 +3248,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
/* ksm created a completely new copy */
if (unlikely(page != swapcache && swapcache)) {
page_add_new_anon_rmap(page, vma, vmf->address, false);
- mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
} else {
do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
- mem_cgroup_commit_charge(page, memcg, true, false);
activate_page(page);
}
@@ -3287,7 +3286,6 @@ unlock:
out:
return ret;
out_nomap:
- mem_cgroup_cancel_charge(page, memcg, false);
pte_unmap_unlock(vmf->pte, vmf->ptl);
out_page:
unlock_page(page);
@@ -3308,7 +3306,6 @@ out_release:
static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- struct mem_cgroup *memcg;
struct page *page;
vm_fault_t ret = 0;
pte_t entry;
@@ -3361,9 +3358,9 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
if (!page)
goto oom;
- if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg,
- false))
+ if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
goto oom_free_page;
+ cgroup_throttle_swaprate(page, GFP_KERNEL);
/*
* The memory barrier inside __SetPageUptodate makes sure that
@@ -3388,14 +3385,12 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
- mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);
return handle_userfault(vmf, VM_UFFD_MISSING);
}
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address, false);
- mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
setpte:
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
@@ -3406,7 +3401,6 @@ unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
release:
- mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);
goto unlock;
oom_free_page:
@@ -3611,7 +3605,6 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
 * mapping. If needed, the function allocates a page table or uses a pre-allocated one.
*
* @vmf: fault environment
- * @memcg: memcg to charge page (only for private mappings)
* @page: page to map
*
* Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
@@ -3622,8 +3615,7 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
*
* Return: %0 on success, %VM_FAULT_ code in case of error.
*/
-vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
- struct page *page)
+vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
{
struct vm_area_struct *vma = vmf->vma;
bool write = vmf->flags & FAULT_FLAG_WRITE;
@@ -3631,9 +3623,6 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
vm_fault_t ret;
if (pmd_none(*vmf->pmd) && PageTransCompound(page)) {
- /* THP on COW? */
- VM_BUG_ON_PAGE(memcg, page);
-
ret = do_set_pmd(vmf, page);
if (ret != VM_FAULT_FALLBACK)
return ret;
@@ -3657,7 +3646,6 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
if (write && !(vma->vm_flags & VM_SHARED)) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address, false);
- mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
@@ -3706,7 +3694,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
if (!(vmf->vma->vm_flags & VM_SHARED))
ret = check_stable_address_space(vmf->vma->vm_mm);
if (!ret)
- ret = alloc_set_pte(vmf, vmf->memcg, page);
+ ret = alloc_set_pte(vmf, page);
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
@@ -3866,11 +3854,11 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf)
if (!vmf->cow_page)
return VM_FAULT_OOM;
- if (mem_cgroup_try_charge_delay(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
- &vmf->memcg, false)) {
+ if (mem_cgroup_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL)) {
put_page(vmf->cow_page);
return VM_FAULT_OOM;
}
+ cgroup_throttle_swaprate(vmf->cow_page, GFP_KERNEL);
ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
@@ -3888,7 +3876,6 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf)
goto uncharge_out;
return ret;
uncharge_out:
- mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false);
put_page(vmf->cow_page);
return ret;
}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index fc0aad0bc1f5..c4d5c45820d0 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -98,11 +98,14 @@ void mem_hotplug_done(void)
u64 max_mem_size = U64_MAX;
/* add this memory to iomem resource */
-static struct resource *register_memory_resource(u64 start, u64 size)
+static struct resource *register_memory_resource(u64 start, u64 size,
+ const char *resource_name)
{
struct resource *res;
unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
- char *resource_name = "System RAM";
+
+ if (strcmp(resource_name, "System RAM"))
+ flags |= IORESOURCE_MEM_DRIVER_MANAGED;
/*
* Make sure value parsed from 'mem=' only restricts memory adding
@@ -862,10 +865,9 @@ static void reset_node_present_pages(pg_data_t *pgdat)
}
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
-static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
+static pg_data_t __ref *hotadd_new_pgdat(int nid)
{
struct pglist_data *pgdat;
- unsigned long start_pfn = PFN_DOWN(start);
pgdat = NODE_DATA(nid);
if (!pgdat) {
@@ -879,13 +881,13 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
} else {
int cpu;
/*
- * Reset the nr_zones, order and classzone_idx before reuse.
- * Note that kswapd will init kswapd_classzone_idx properly
+ * Reset the nr_zones, order and highest_zoneidx before reuse.
+ * Note that kswapd will init kswapd_highest_zoneidx properly
* when it starts in the near future.
*/
pgdat->nr_zones = 0;
pgdat->kswapd_order = 0;
- pgdat->kswapd_classzone_idx = 0;
+ pgdat->kswapd_highest_zoneidx = 0;
for_each_online_cpu(cpu) {
struct per_cpu_nodestat *p;
@@ -895,9 +897,8 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
}
/* we can use NODE_DATA(nid) from here */
-
pgdat->node_id = nid;
- pgdat->node_start_pfn = start_pfn;
+ pgdat->node_start_pfn = 0;
/* init node's zones as empty zones, we don't have any present pages.*/
free_area_init_core_hotplug(nid);
@@ -932,7 +933,6 @@ static void rollback_node_hotadd(int nid)
/**
* try_online_node - online a node if offlined
* @nid: the node ID
- * @start: start addr of the node
* @set_node_online: Whether we want to online the node
* called by cpu_up() to online a node without onlined memory.
*
@@ -941,7 +941,7 @@ static void rollback_node_hotadd(int nid)
* 0 -> the node is already online
* -ENOMEM -> the node could not be allocated
*/
-static int __try_online_node(int nid, u64 start, bool set_node_online)
+static int __try_online_node(int nid, bool set_node_online)
{
pg_data_t *pgdat;
int ret = 1;
@@ -949,7 +949,7 @@ static int __try_online_node(int nid, u64 start, bool set_node_online)
if (node_online(nid))
return 0;
- pgdat = hotadd_new_pgdat(nid, start);
+ pgdat = hotadd_new_pgdat(nid);
if (!pgdat) {
pr_err("Cannot online node %d due to NULL pgdat\n", nid);
ret = -ENOMEM;
@@ -973,7 +973,7 @@ int try_online_node(int nid)
int ret;
mem_hotplug_begin();
- ret = __try_online_node(nid, 0, true);
+ ret = __try_online_node(nid, true);
mem_hotplug_done();
return ret;
}
@@ -1017,17 +1017,17 @@ int __ref add_memory_resource(int nid, struct resource *res)
if (ret)
return ret;
+ if (!node_possible(nid)) {
+ WARN(1, "node %d was absent from the node_possible_map\n", nid);
+ return -EINVAL;
+ }
+
mem_hotplug_begin();
- /*
- * Add new range to memblock so that when hotadd_new_pgdat() is called
- * to allocate new pgdat, get_pfn_range_for_nid() will be able to find
- * this new range and calculate total pages correctly. The range will
- * be removed at hot-remove time.
- */
- memblock_add_node(start, size, nid);
+ if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
+ memblock_add_node(start, size, nid);
- ret = __try_online_node(nid, start, false);
+ ret = __try_online_node(nid, false);
if (ret < 0)
goto error;
new_node = ret;
@@ -1060,7 +1060,8 @@ int __ref add_memory_resource(int nid, struct resource *res)
BUG_ON(ret);
/* create new memmap entry */
- firmware_map_add_hotplug(start, start + size, "System RAM");
+ if (!strcmp(res->name, "System RAM"))
+ firmware_map_add_hotplug(start, start + size, "System RAM");
/* device_online() will take the lock when calling online_pages() */
mem_hotplug_done();
@@ -1074,7 +1075,8 @@ error:
/* rollback pgdat allocation and others */
if (new_node)
rollback_node_hotadd(nid);
- memblock_remove(start, size);
+ if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
+ memblock_remove(start, size);
mem_hotplug_done();
return ret;
}
@@ -1085,7 +1087,7 @@ int __ref __add_memory(int nid, u64 start, u64 size)
struct resource *res;
int ret;
- res = register_memory_resource(start, size);
+ res = register_memory_resource(start, size, "System RAM");
if (IS_ERR(res))
return PTR_ERR(res);
@@ -1107,82 +1109,57 @@ int add_memory(int nid, u64 start, u64 size)
}
EXPORT_SYMBOL_GPL(add_memory);
-#ifdef CONFIG_MEMORY_HOTREMOVE
/*
- * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
- * set and the size of the free page is given by page_order(). Using this,
- * the function determines if the pageblock contains only free pages.
- * Due to buddy contraints, a free page at least the size of a pageblock will
- * be located at the start of the pageblock
+ * Add special, driver-managed memory to the system as system RAM. Such
+ * memory is not exposed via the raw firmware-provided memmap as system
+ * RAM, instead, it is detected and added by a driver - during cold boot,
+ * after a reboot, and after kexec.
+ *
+ * Reasons why this memory should not be used for the initial memmap of a
+ * kexec kernel or for placing kexec images:
+ * - The booting kernel is in charge of determining how this memory will be
+ * used (e.g., use persistent memory as system RAM)
+ * - Coordination with a hypervisor is required before this memory
+ * can be used (e.g., inaccessible parts).
+ *
+ * For this memory, no entries in /sys/firmware/memmap ("raw firmware-provided
+ * memory map") are created. Also, the created memory resource is flagged
+ * with IORESOURCE_MEM_DRIVER_MANAGED, so in-kernel users can special-case
+ * this memory as well (esp., not place kexec images onto it).
+ *
+ * The resource_name (visible via /proc/iomem) has to have the format
+ * "System RAM ($DRIVER)".
*/
-static inline int pageblock_free(struct page *page)
+int add_memory_driver_managed(int nid, u64 start, u64 size,
+ const char *resource_name)
{
- return PageBuddy(page) && page_order(page) >= pageblock_order;
-}
+ struct resource *res;
+ int rc;
-/* Return the pfn of the start of the next active pageblock after a given pfn */
-static unsigned long next_active_pageblock(unsigned long pfn)
-{
- struct page *page = pfn_to_page(pfn);
+ if (!resource_name ||
+ strstr(resource_name, "System RAM (") != resource_name ||
+ resource_name[strlen(resource_name) - 1] != ')')
+ return -EINVAL;
- /* Ensure the starting page is pageblock-aligned */
- BUG_ON(pfn & (pageblock_nr_pages - 1));
+ lock_device_hotplug();
- /* If the entire pageblock is free, move to the end of free page */
- if (pageblock_free(page)) {
- int order;
- /* be careful. we don't have locks, page_order can be changed.*/
- order = page_order(page);
- if ((order < MAX_ORDER) && (order >= pageblock_order))
- return pfn + (1 << order);
+ res = register_memory_resource(start, size, resource_name);
+ if (IS_ERR(res)) {
+ rc = PTR_ERR(res);
+ goto out_unlock;
}
- return pfn + pageblock_nr_pages;
-}
-
-static bool is_pageblock_removable_nolock(unsigned long pfn)
-{
- struct page *page = pfn_to_page(pfn);
- struct zone *zone;
-
- /*
- * We have to be careful here because we are iterating over memory
- * sections which are not zone aware so we might end up outside of
- * the zone but still within the section.
- * We have to take care about the node as well. If the node is offline
- * its NODE_DATA will be NULL - see page_zone.
- */
- if (!node_online(page_to_nid(page)))
- return false;
-
- zone = page_zone(page);
- pfn = page_to_pfn(page);
- if (!zone_spans_pfn(zone, pfn))
- return false;
-
- return !has_unmovable_pages(zone, page, MIGRATE_MOVABLE,
- MEMORY_OFFLINE);
-}
-
-/* Checks if this range of memory is likely to be hot-removable. */
-bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
-{
- unsigned long end_pfn, pfn;
-
- end_pfn = min(start_pfn + nr_pages,
- zone_end_pfn(page_zone(pfn_to_page(start_pfn))));
-
- /* Check the starting page of each pageblock within the range */
- for (pfn = start_pfn; pfn < end_pfn; pfn = next_active_pageblock(pfn)) {
- if (!is_pageblock_removable_nolock(pfn))
- return false;
- cond_resched();
- }
+ rc = add_memory_resource(nid, res);
+ if (rc < 0)
+ release_memory_resource(res);
- /* All pageblocks in the memory block are likely to be hot-removable */
- return true;
+out_unlock:
+ unlock_device_hotplug();
+ return rc;
}
+EXPORT_SYMBOL_GPL(add_memory_driver_managed);
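
A hedged sketch of a caller, e.g. a memory driver hot-adding the ranges it detected; nid, start, size and the driver name are placeholders, only the "System RAM ($DRIVER)" naming rule is mandated by the interface above:

static int example_driver_add_memory(int nid, u64 start, u64 size)
{
	int rc;

	/* Illustrative only: expose driver-detected memory as driver-managed RAM. */
	rc = add_memory_driver_managed(nid, start, size,
				       "System RAM (example_driver)");
	if (rc)
		pr_err("example_driver: adding memory failed: %d\n", rc);
	return rc;
}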
+#ifdef CONFIG_MEMORY_HOTREMOVE
/*
* Confirm all pages in a range [start, end) belong to the same zone (skipping
* memory holes). When true, return the zone.
@@ -1360,7 +1337,7 @@ offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
}
/*
- * Check all pages in range, recoreded as memory resource, are isolated.
+ * Check all pages in range, recorded as memory resource, are isolated.
*/
static int
check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
@@ -1372,11 +1349,7 @@ check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
static int __init cmdline_parse_movable_node(char *p)
{
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
movable_node_enabled = true;
-#else
- pr_warn("movable_node parameter depends on CONFIG_HAVE_MEMBLOCK_NODE_MAP to work properly\n");
-#endif
return 0;
}
early_param("movable_node", cmdline_parse_movable_node);
@@ -1750,8 +1723,12 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
mem_hotplug_begin();
arch_remove_memory(nid, start, size, NULL);
- memblock_free(start, size);
- memblock_remove(start, size);
+
+ if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
+ memblock_free(start, size);
+ memblock_remove(start, size);
+ }
+
__release_memory_resource(start, size);
try_offline_node(nid);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 48ba9729062e..1965e2681877 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -927,10 +927,7 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr)
int locked = 1;
err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
- if (err == 0) {
- /* E.g. GUP interrupted by fatal signal */
- err = -EFAULT;
- } else if (err > 0) {
+ if (err > 0) {
err = page_to_nid(p);
put_page(p);
}
diff --git a/mm/memremap.c b/mm/memremap.c
index 03e38b7a38f1..8afcc54c8928 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -97,6 +97,26 @@ static unsigned long pfn_next(unsigned long pfn)
return pfn + 1;
}
+/*
+ * This returns true if the page is reserved by a ZONE_DEVICE driver.
+ */
+bool pfn_zone_device_reserved(unsigned long pfn)
+{
+ struct dev_pagemap *pgmap;
+ struct vmem_altmap *altmap;
+ bool ret = false;
+
+ pgmap = get_dev_pagemap(pfn, NULL);
+ if (!pgmap)
+ return ret;
+ altmap = pgmap_altmap(pgmap);
+ if (altmap && pfn < (altmap->base_pfn + altmap->reserve))
+ ret = true;
+ put_dev_pagemap(pgmap);
+
+ return ret;
+}
+
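
A minimal usage sketch, assuming a caller walking a PFN range that must skip pages the ZONE_DEVICE driver reserved for its altmap (loop bounds and variable names are illustrative):

	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
		if (pfn_zone_device_reserved(pfn))
			continue;	/* reserved by the device driver, leave it alone */
		/* ... operate on pfn_to_page(pfn) ... */
	}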
#define for_each_device_pfn(pfn, map) \
for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn))
diff --git a/mm/migrate.c b/mm/migrate.c
index 7160c1556f79..6af62476b471 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -490,11 +490,18 @@ int migrate_page_move_mapping(struct address_space *mapping,
* are mapped to swap space.
*/
if (newzone != oldzone) {
- __dec_node_state(oldzone->zone_pgdat, NR_FILE_PAGES);
- __inc_node_state(newzone->zone_pgdat, NR_FILE_PAGES);
+ struct lruvec *old_lruvec, *new_lruvec;
+ struct mem_cgroup *memcg;
+
+ memcg = page_memcg(page);
+ old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
+ new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
+
+ __dec_lruvec_state(old_lruvec, NR_FILE_PAGES);
+ __inc_lruvec_state(new_lruvec, NR_FILE_PAGES);
if (PageSwapBacked(page) && !PageSwapCache(page)) {
- __dec_node_state(oldzone->zone_pgdat, NR_SHMEM);
- __inc_node_state(newzone->zone_pgdat, NR_SHMEM);
+ __dec_lruvec_state(old_lruvec, NR_SHMEM);
+ __inc_lruvec_state(new_lruvec, NR_SHMEM);
}
if (dirty && mapping_cap_account_dirty(mapping)) {
__dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY);
@@ -797,10 +804,7 @@ recheck_buffers:
if (rc != MIGRATEPAGE_SUCCESS)
goto unlock_buffers;
- ClearPagePrivate(page);
- set_page_private(newpage, page_private(page));
- set_page_private(page, 0);
- put_page(page);
+ attach_page_private(newpage, detach_page_private(page));
get_page(newpage);
bh = head;
@@ -810,8 +814,6 @@ recheck_buffers:
} while (bh != head);
- SetPagePrivate(newpage);
-
if (mode != MIGRATE_SYNC_NO_COPY)
migrate_page_copy(newpage, page);
else
@@ -1032,7 +1034,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
* to the LRU. Later, when the IO completes the pages are
* marked uptodate and unlocked. However, the queueing
* could be merging multiple pages for one bio (e.g.
- * mpage_readpages). If an allocation happens for the
+ * mpage_readahead). If an allocation happens for the
* second or third page, the process can end up locking
* the same page twice and deadlocking. Rather than
* trying to be clever about what pages can be locked,
@@ -1170,6 +1172,20 @@ out:
#define ICE_noinline
#endif
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+static inline void thp_pmd_migration_success(bool success)
+{
+ if (success)
+ count_vm_event(THP_PMD_MIGRATION_SUCCESS);
+ else
+ count_vm_event(THP_PMD_MIGRATION_FAILURE);
+}
+#else
+static inline void thp_pmd_migration_success(bool success)
+{
+}
+#endif
+
/*
* Obtain the lock on page, remove all ptes and migrate the page
* to the newly allocated page in newpage.
@@ -1232,6 +1248,14 @@ out:
* we want to retry.
*/
if (rc == MIGRATEPAGE_SUCCESS) {
+ /*
+ * When the page to be migrated has been freed from under
+ * us, that is considered a MIGRATEPAGE_SUCCESS, but no
+ * newpage has been allocated. It should not be counted
+ * as a successful THP migration.
+ */
+ if (newpage && PageTransHuge(newpage))
+ thp_pmd_migration_success(true);
put_page(page);
if (reason == MR_MEMORY_FAILURE) {
/*
@@ -1474,6 +1498,7 @@ retry:
unlock_page(page);
if (!rc) {
list_safe_reset_next(page, page2, lru);
+ thp_pmd_migration_success(false);
goto retry;
}
}
@@ -2739,7 +2764,6 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
{
struct vm_area_struct *vma = migrate->vma;
struct mm_struct *mm = vma->vm_mm;
- struct mem_cgroup *memcg;
bool flush = false;
spinlock_t *ptl;
pte_t entry;
@@ -2786,7 +2810,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
if (unlikely(anon_vma_prepare(vma)))
goto abort;
- if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
+ if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
goto abort;
/*
@@ -2832,7 +2856,6 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
inc_mm_counter(mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, addr, false);
- mem_cgroup_commit_charge(page, memcg, false, false);
if (!is_zone_device_page(page))
lru_cache_add_active_or_unevictable(page, vma);
get_page(page);
@@ -2854,7 +2877,6 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
unlock_abort:
pte_unmap_unlock(ptep, ptl);
- mem_cgroup_cancel_charge(page, memcg, false);
abort:
*src &= ~MIGRATE_PFN_MIGRATE;
}
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 7da6991d9435..435e5f794b3b 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -67,26 +67,30 @@ void __init mminit_verify_pageflags_layout(void)
unsigned long or_mask, add_mask;
shift = 8 * sizeof(unsigned long);
- width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_CPUPID_SHIFT;
+ width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
+ - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH;
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
- "Section %d Node %d Zone %d Lastcpupid %d Flags %d\n",
+ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n",
SECTIONS_WIDTH,
NODES_WIDTH,
ZONES_WIDTH,
LAST_CPUPID_WIDTH,
+ KASAN_TAG_WIDTH,
NR_PAGEFLAGS);
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
- "Section %d Node %d Zone %d Lastcpupid %d\n",
+ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
SECTIONS_SHIFT,
NODES_SHIFT,
ZONES_SHIFT,
- LAST_CPUPID_SHIFT);
+ LAST_CPUPID_SHIFT,
+ KASAN_TAG_WIDTH);
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts",
- "Section %lu Node %lu Zone %lu Lastcpupid %lu\n",
+ "Section %lu Node %lu Zone %lu Lastcpupid %lu Kasantag %lu\n",
(unsigned long)SECTIONS_PGSHIFT,
(unsigned long)NODES_PGSHIFT,
(unsigned long)ZONES_PGSHIFT,
- (unsigned long)LAST_CPUPID_PGSHIFT);
+ (unsigned long)LAST_CPUPID_PGSHIFT,
+ (unsigned long)KASAN_TAG_PGSHIFT);
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid",
"Node/Zone ID: %lu -> %lu\n",
(unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT),
diff --git a/mm/mmap.c b/mm/mmap.c
index 24834a531268..82c381fd8daa 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1207,7 +1207,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
}
/*
- * Rough compatbility check to quickly see if it's even worth looking
+ * Rough compatibility check to quickly see if it's even worth looking
* at sharing an anon_vma.
*
* They need to have the same vm_file, and the flags can only differ
@@ -1850,6 +1850,22 @@ unacct_error:
return error;
}
+static inline unsigned long gap_start_offset(struct vm_unmapped_area_info *info,
+ unsigned long addr)
+{
+ /* get gap_start offset to adjust gap address to the
+ * desired alignment
+ */
+ return (info->align_offset - addr) & info->align_mask;
+}
+
+static inline unsigned long gap_end_offset(struct vm_unmapped_area_info *info,
+ unsigned long addr)
+{
+ /* get gap_end offset to adjust gap address to the desired alignment */
+ return (addr - info->align_offset) & info->align_mask;
+}
+
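
A worked example of the new offset arithmetic (standalone user-space C, not kernel code), assuming a 2 MiB alignment request with align_mask = 0x1fffff and align_offset = 0:

#include <stdio.h>

int main(void)
{
	unsigned long align_mask = 0x1fffff, align_offset = 0;
	unsigned long gap_start = 0x7f1234567000UL;
	/* Same expression as gap_start_offset(): distance up to the next boundary. */
	unsigned long off = (align_offset - gap_start) & align_mask;

	/* Prints 0x7f1234567000 + 0x99000 = 0x7f1234600000, a 2 MiB aligned address. */
	printf("%#lx + %#lx = %#lx\n", gap_start, off, gap_start + off);
	return 0;
}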
static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
{
/*
@@ -1864,10 +1880,7 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
struct vm_area_struct *vma;
unsigned long length, low_limit, high_limit, gap_start, gap_end;
- /* Adjust search length to account for worst case alignment overhead */
- length = info->length + info->align_mask;
- if (length < info->length)
- return -ENOMEM;
+ length = info->length;
/* Adjust search limits by the desired length */
if (info->high_limit < length)
@@ -1899,6 +1912,7 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
}
gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
+ gap_start += gap_start_offset(info, gap_start);
check_current:
/* Check if current node has a suitable gap */
if (gap_start > high_limit)
@@ -1927,6 +1941,7 @@ check_current:
struct vm_area_struct, vm_rb);
if (prev == vma->vm_rb.rb_left) {
gap_start = vm_end_gap(vma->vm_prev);
+ gap_start += gap_start_offset(info, gap_start);
gap_end = vm_start_gap(vma);
goto check_current;
}
@@ -1936,17 +1951,17 @@ check_current:
check_highest:
/* Check highest gap, which does not precede any rbtree node */
gap_start = mm->highest_vm_end;
+ gap_start += gap_start_offset(info, gap_start);
gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */
if (gap_start > high_limit)
return -ENOMEM;
found:
/* We found a suitable gap. Clip it with the original low_limit. */
- if (gap_start < info->low_limit)
+ if (gap_start < info->low_limit) {
gap_start = info->low_limit;
-
- /* Adjust gap address to the desired alignment */
- gap_start += (info->align_offset - gap_start) & info->align_mask;
+ gap_start += gap_start_offset(info, gap_start);
+ }
VM_BUG_ON(gap_start + info->length > info->high_limit);
VM_BUG_ON(gap_start + info->length > gap_end);
@@ -1959,16 +1974,14 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
struct vm_area_struct *vma;
unsigned long length, low_limit, high_limit, gap_start, gap_end;
- /* Adjust search length to account for worst case alignment overhead */
- length = info->length + info->align_mask;
- if (length < info->length)
- return -ENOMEM;
+ length = info->length;
/*
* Adjust search limits by the desired length.
* See implementation comment at top of unmapped_area().
*/
gap_end = info->high_limit;
+ gap_end -= gap_end_offset(info, gap_end);
if (gap_end < length)
return -ENOMEM;
high_limit = gap_end - length;
@@ -2005,6 +2018,7 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
check_current:
/* Check if current node has a suitable gap */
gap_end = vm_start_gap(vma);
+ gap_end -= gap_end_offset(info, gap_end);
if (gap_end < low_limit)
return -ENOMEM;
if (gap_start <= high_limit &&
@@ -2039,13 +2053,14 @@ check_current:
found:
/* We found a suitable gap. Clip it with the original high_limit. */
- if (gap_end > info->high_limit)
+ if (gap_end > info->high_limit) {
gap_end = info->high_limit;
+ gap_end -= gap_end_offset(info, gap_end);
+ }
found_highest:
/* Compute highest gap address at the desired alignment */
gap_end -= info->length;
- gap_end -= (gap_end - info->align_offset) & info->align_mask;
VM_BUG_ON(gap_end < info->low_limit);
VM_BUG_ON(gap_end < gap_start);
diff --git a/mm/mremap.c b/mm/mremap.c
index 6aa6ea605068..dccb7a4471a0 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -785,7 +785,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
out:
if (offset_in_page(ret)) {
vm_unacct_memory(charged);
- locked = 0;
+ locked = false;
}
if (downgraded)
up_read(&current->mm->mmap_sem);
diff --git a/mm/nommu.c b/mm/nommu.c
index 318df4e236c9..dfae55f41901 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -140,7 +140,7 @@ void vfree(const void *addr)
}
EXPORT_SYMBOL(vfree);
-void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
+void *__vmalloc(unsigned long size, gfp_t gfp_mask)
{
/*
* You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc()
@@ -150,16 +150,25 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
}
EXPORT_SYMBOL(__vmalloc);
-void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags)
+void *__vmalloc_node_range(unsigned long size, unsigned long align,
+ unsigned long start, unsigned long end, gfp_t gfp_mask,
+ pgprot_t prot, unsigned long vm_flags, int node,
+ const void *caller)
{
- return __vmalloc(size, flags, PAGE_KERNEL);
+ return __vmalloc(size, gfp_mask);
+}
+
+void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
+ int node, const void *caller)
+{
+ return __vmalloc(size, gfp_mask);
}
static void *__vmalloc_user_flags(unsigned long size, gfp_t flags)
{
void *ret;
- ret = __vmalloc(size, flags, PAGE_KERNEL);
+ ret = __vmalloc(size, flags);
if (ret) {
struct vm_area_struct *vma;
@@ -179,12 +188,6 @@ void *vmalloc_user(unsigned long size)
}
EXPORT_SYMBOL(vmalloc_user);
-void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags)
-{
- return __vmalloc_user_flags(size, flags | __GFP_ZERO);
-}
-EXPORT_SYMBOL(vmalloc_user_node_flags);
-
struct page *vmalloc_to_page(const void *addr)
{
return virt_to_page(addr);
@@ -230,7 +233,7 @@ long vwrite(char *buf, char *addr, unsigned long count)
*/
void *vmalloc(unsigned long size)
{
- return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
+ return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM);
}
EXPORT_SYMBOL(vmalloc);
@@ -248,8 +251,7 @@ EXPORT_SYMBOL(vmalloc);
*/
void *vzalloc(unsigned long size)
{
- return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
- PAGE_KERNEL);
+ return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
}
EXPORT_SYMBOL(vzalloc);
@@ -302,7 +304,7 @@ EXPORT_SYMBOL(vzalloc_node);
void *vmalloc_exec(unsigned long size)
{
- return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC);
+ return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM);
}
/**
@@ -314,7 +316,7 @@ void *vmalloc_exec(unsigned long size)
*/
void *vmalloc_32(unsigned long size)
{
- return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
+ return __vmalloc(size, GFP_KERNEL);
}
EXPORT_SYMBOL(vmalloc_32);
@@ -351,7 +353,7 @@ void vunmap(const void *addr)
}
EXPORT_SYMBOL(vunmap);
-void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
+void *vm_map_ram(struct page **pages, unsigned int count, int node)
{
BUG();
return NULL;
@@ -369,18 +371,6 @@ void vm_unmap_aliases(void)
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
-/*
- * Implement a stub for vmalloc_sync_[un]mapping() if the architecture
- * chose not to have one.
- */
-void __weak vmalloc_sync_mappings(void)
-{
-}
-
-void __weak vmalloc_sync_unmappings(void)
-{
-}
-
struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
{
BUG();
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index dfc357614e56..4daedf7b91f6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -254,7 +254,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
{
struct zone *zone;
struct zoneref *z;
- enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask);
+ enum zone_type highest_zoneidx = gfp_zone(oc->gfp_mask);
bool cpuset_limited = false;
int nid;
@@ -294,7 +294,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
/* Check this allocation failure is caused by cpuset's wall function */
for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
- high_zoneidx, oc->nodemask)
+ highest_zoneidx, oc->nodemask)
if (!cpuset_zone_allowed(zone, oc->gfp_mask))
cpuset_limited = true;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d3ee4c4dafac..a53f4cfa7628 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -257,7 +257,7 @@ static void wb_min_max_ratio(struct bdi_writeback *wb,
* requiring writeback.
*
* This number of dirtyable pages is the base value of which the
- * user-configurable dirty ratio is the effictive number of pages that
+ * user-configurable dirty ratio is the effective number of pages that
* are allowed to be actually dirtied. Per individual zone, or
* globally by using the sum of dirtyable pages over all zones.
*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1a8c676f34b7..e744da4a2049 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -68,6 +68,7 @@
#include <linux/lockdep.h>
#include <linux/nmi.h>
#include <linux/psi.h>
+#include <linux/padata.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -302,14 +303,14 @@ const char * const migratetype_names[MIGRATE_TYPES] = {
#endif
};
-compound_page_dtor * const compound_page_dtors[] = {
- NULL,
- free_compound_page,
+compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
+ [NULL_COMPOUND_DTOR] = NULL,
+ [COMPOUND_PAGE_DTOR] = free_compound_page,
#ifdef CONFIG_HUGETLB_PAGE
- free_huge_page,
+ [HUGETLB_PAGE_DTOR] = free_huge_page,
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- free_transhuge_page,
+ [TRANSHUGE_PAGE_DTOR] = free_transhuge_page,
#endif
};
@@ -335,7 +336,6 @@ static unsigned long nr_kernel_pages __initdata;
static unsigned long nr_all_pages __initdata;
static unsigned long dma_reserve __initdata;
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
static unsigned long required_kernelcore __initdata;
@@ -348,7 +348,6 @@ static bool mirrored_kernelcore __meminitdata;
/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
EXPORT_SYMBOL(movable_zone);
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
#if MAX_NUMNODES > 1
unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
@@ -609,8 +608,7 @@ static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
}
#endif
-static void bad_page(struct page *page, const char *reason,
- unsigned long bad_flags)
+static void bad_page(struct page *page, const char *reason)
{
static unsigned long resume;
static unsigned long nr_shown;
@@ -639,10 +637,6 @@ static void bad_page(struct page *page, const char *reason,
pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
current->comm, page_to_pfn(page));
__dump_page(page, reason);
- bad_flags &= page->flags;
- if (bad_flags)
- pr_alert("bad because of flags: %#lx(%pGp)\n",
- bad_flags, &bad_flags);
dump_page_owner(page);
print_modules();
@@ -1077,13 +1071,9 @@ static inline bool page_expected_state(struct page *page,
return true;
}
-static void free_pages_check_bad(struct page *page)
+static const char *page_bad_reason(struct page *page, unsigned long flags)
{
- const char *bad_reason;
- unsigned long bad_flags;
-
- bad_reason = NULL;
- bad_flags = 0;
+ const char *bad_reason = NULL;
if (unlikely(atomic_read(&page->_mapcount) != -1))
bad_reason = "nonzero mapcount";
@@ -1091,24 +1081,32 @@ static void free_pages_check_bad(struct page *page)
bad_reason = "non-NULL mapping";
if (unlikely(page_ref_count(page) != 0))
bad_reason = "nonzero _refcount";
- if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
- bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
- bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
+ if (unlikely(page->flags & flags)) {
+ if (flags == PAGE_FLAGS_CHECK_AT_PREP)
+ bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
+ else
+ bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
}
#ifdef CONFIG_MEMCG
if (unlikely(page->mem_cgroup))
bad_reason = "page still charged to cgroup";
#endif
- bad_page(page, bad_reason, bad_flags);
+ return bad_reason;
+}
+
+static void check_free_page_bad(struct page *page)
+{
+ bad_page(page,
+ page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
}
-static inline int free_pages_check(struct page *page)
+static inline int check_free_page(struct page *page)
{
if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
return 0;
/* Something has gone sideways, find it */
- free_pages_check_bad(page);
+ check_free_page_bad(page);
return 1;
}
@@ -1130,7 +1128,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
case 1:
/* the first tail page: ->mapping may be compound_mapcount() */
if (unlikely(compound_mapcount(page))) {
- bad_page(page, "nonzero compound_mapcount", 0);
+ bad_page(page, "nonzero compound_mapcount");
goto out;
}
break;
@@ -1142,17 +1140,17 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
break;
default:
if (page->mapping != TAIL_MAPPING) {
- bad_page(page, "corrupted mapping in tail page", 0);
+ bad_page(page, "corrupted mapping in tail page");
goto out;
}
break;
}
if (unlikely(!PageTail(page))) {
- bad_page(page, "PageTail not set", 0);
+ bad_page(page, "PageTail not set");
goto out;
}
if (unlikely(compound_head(page) != head_page)) {
- bad_page(page, "compound_head not consistent", 0);
+ bad_page(page, "compound_head not consistent");
goto out;
}
ret = 0;
@@ -1194,7 +1192,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
for (i = 1; i < (1 << order); i++) {
if (compound)
bad += free_tail_pages_check(page, page + i);
- if (unlikely(free_pages_check(page + i))) {
+ if (unlikely(check_free_page(page + i))) {
bad++;
continue;
}
@@ -1206,7 +1204,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
if (memcg_kmem_enabled() && PageKmemcg(page))
__memcg_kmem_uncharge_page(page, order);
if (check_free)
- bad += free_pages_check(page);
+ bad += check_free_page(page);
if (bad)
return false;
@@ -1253,7 +1251,7 @@ static bool free_pcp_prepare(struct page *page)
static bool bulkfree_pcp_prepare(struct page *page)
{
if (debug_pagealloc_enabled_static())
- return free_pages_check(page);
+ return check_free_page(page);
else
return false;
}
@@ -1274,7 +1272,7 @@ static bool free_pcp_prepare(struct page *page)
static bool bulkfree_pcp_prepare(struct page *page)
{
- return free_pages_check(page);
+ return check_free_page(page);
}
#endif /* CONFIG_DEBUG_VM */
@@ -1499,45 +1497,49 @@ void __free_pages_core(struct page *page, unsigned int order)
__free_pages(page, order);
}
-#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \
- defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
+#ifdef CONFIG_NEED_MULTIPLE_NODES
static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
-int __meminit early_pfn_to_nid(unsigned long pfn)
+#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+
+/*
+ * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
+ */
+int __meminit __early_pfn_to_nid(unsigned long pfn,
+ struct mminit_pfnnid_cache *state)
{
- static DEFINE_SPINLOCK(early_pfn_lock);
+ unsigned long start_pfn, end_pfn;
int nid;
- spin_lock(&early_pfn_lock);
- nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
- if (nid < 0)
- nid = first_online_node;
- spin_unlock(&early_pfn_lock);
+ if (state->last_start <= pfn && pfn < state->last_end)
+ return state->last_nid;
+
+ nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
+ if (nid != NUMA_NO_NODE) {
+ state->last_start = start_pfn;
+ state->last_end = end_pfn;
+ state->last_nid = nid;
+ }
return nid;
}
-#endif
+#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
-#ifdef CONFIG_NODES_SPAN_OTHER_NODES
-/* Only safe to use early in boot when initialisation is single-threaded */
-static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
+int __meminit early_pfn_to_nid(unsigned long pfn)
{
+ static DEFINE_SPINLOCK(early_pfn_lock);
int nid;
+ spin_lock(&early_pfn_lock);
nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
- if (nid >= 0 && nid != node)
- return false;
- return true;
-}
+ if (nid < 0)
+ nid = first_online_node;
+ spin_unlock(&early_pfn_lock);
-#else
-static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
-{
- return true;
+ return nid;
}
-#endif
-
+#endif /* CONFIG_NEED_MULTIPLE_NODES */
void __init memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order)
@@ -1692,7 +1694,6 @@ static void __init deferred_free_pages(unsigned long pfn,
} else if (!(pfn & nr_pgmask)) {
deferred_free_range(pfn - nr_free, nr_free);
nr_free = 1;
- touch_nmi_watchdog();
} else {
nr_free++;
}
@@ -1722,7 +1723,6 @@ static unsigned long __init deferred_init_pages(struct zone *zone,
continue;
} else if (!page || !(pfn & nr_pgmask)) {
page = pfn_to_page(pfn);
- touch_nmi_watchdog();
} else {
page++;
}
@@ -1816,16 +1816,51 @@ deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
return nr_pages;
}
+struct definit_args {
+ struct zone *zone;
+ atomic_long_t nr_pages;
+};
+
+static void __init
+deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
+ void *arg)
+{
+ unsigned long spfn, epfn, nr_pages = 0;
+ struct definit_args *args = arg;
+ struct zone *zone = args->zone;
+ u64 i;
+
+ deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
+
+ /*
+ * Initialize and free pages in MAX_ORDER sized increments so that we
+ * can avoid introducing any issues with the buddy allocator.
+ */
+ while (spfn < end_pfn) {
+ nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
+ cond_resched();
+ }
+
+ atomic_long_add(nr_pages, &args->nr_pages);
+}
+
+/* An arch may override for more concurrency. */
+__weak int __init
+deferred_page_init_max_threads(const struct cpumask *node_cpumask)
+{
+ return 1;
+}
+
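
The weak default keeps deferred memmap init on a single thread; a hypothetical arch override (not part of this patch) could opt in to one thread per CPU local to the node:

/* Hypothetical override: one init thread per CPU on the node, at least one. */
int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
{
	return max_t(int, cpumask_weight(node_cpumask), 1);
}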
/* Initialise remaining memory on a node */
static int __init deferred_init_memmap(void *data)
{
pg_data_t *pgdat = data;
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
unsigned long spfn = 0, epfn = 0, nr_pages = 0;
- unsigned long first_init_pfn, flags;
+ unsigned long first_init_pfn, flags, epfn_align;
unsigned long start = jiffies;
struct zone *zone;
- int zid;
+ int zid, max_threads;
u64 i;
/* Bind memory initialisation thread to a local node if possible */
@@ -1845,6 +1880,13 @@ static int __init deferred_init_memmap(void *data)
BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
pgdat->first_deferred_pfn = ULONG_MAX;
+ /*
+ * Once we unlock here, the zone cannot be grown anymore, thus if an
+ * interrupt thread must allocate this early in boot, zone must be
+ * pre-grown prior to start of deferred page initialization.
+ */
+ pgdat_resize_unlock(pgdat, &flags);
+
/* Only the highest zone is deferred so find it */
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
zone = pgdat->node_zones + zid;
@@ -1857,16 +1899,33 @@ static int __init deferred_init_memmap(void *data)
first_init_pfn))
goto zone_empty;
- /*
- * Initialize and free pages in MAX_ORDER sized increments so
- * that we can avoid introducing any issues with the buddy
- * allocator.
- */
- while (spfn < epfn)
+ max_threads = deferred_page_init_max_threads(cpumask);
+
+ while (spfn < epfn) {
+ epfn_align = ALIGN_DOWN(epfn, PAGES_PER_SECTION);
+
+ if (IS_ALIGNED(spfn, PAGES_PER_SECTION) &&
+ epfn_align - spfn >= PAGES_PER_SECTION) {
+ struct definit_args arg = { zone, ATOMIC_LONG_INIT(0) };
+ struct padata_mt_job job = {
+ .thread_fn = deferred_init_memmap_chunk,
+ .fn_arg = &arg,
+ .start = spfn,
+ .size = epfn_align - spfn,
+ .align = PAGES_PER_SECTION,
+ .min_chunk = PAGES_PER_SECTION,
+ .max_threads = max_threads,
+ };
+
+ padata_do_multithreaded(&job);
+ nr_pages += atomic_long_read(&arg.nr_pages);
+ spfn = epfn_align;
+ }
+
nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
+ cond_resched();
+ }
zone_empty:
- pgdat_resize_unlock(pgdat, &flags);
-
/* Sanity check that the next zone really is unpopulated */
WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
@@ -1909,17 +1968,6 @@ deferred_grow_zone(struct zone *zone, unsigned int order)
pgdat_resize_lock(pgdat, &flags);
/*
- * If deferred pages have been initialized while we were waiting for
- * the lock, return true, as the zone was grown. The caller will retry
- * this zone. We won't return to this function since the caller also
- * has this static branch.
- */
- if (!static_branch_unlikely(&deferred_pages)) {
- pgdat_resize_unlock(pgdat, &flags);
- return true;
- }
-
- /*
* If someone grew this zone while we were waiting for spinlock, return
* true, as there might be enough pages already.
*/
@@ -1947,6 +1995,7 @@ deferred_grow_zone(struct zone *zone, unsigned int order)
first_deferred_pfn = spfn;
nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
+ touch_nmi_watchdog();
/* We should only stop along section boundaries */
if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
@@ -2092,31 +2141,14 @@ static inline void expand(struct zone *zone, struct page *page,
static void check_new_page_bad(struct page *page)
{
- const char *bad_reason = NULL;
- unsigned long bad_flags = 0;
-
- if (unlikely(atomic_read(&page->_mapcount) != -1))
- bad_reason = "nonzero mapcount";
- if (unlikely(page->mapping != NULL))
- bad_reason = "non-NULL mapping";
- if (unlikely(page_ref_count(page) != 0))
- bad_reason = "nonzero _refcount";
if (unlikely(page->flags & __PG_HWPOISON)) {
- bad_reason = "HWPoisoned (hardware-corrupted)";
- bad_flags = __PG_HWPOISON;
/* Don't complain about hwpoisoned pages */
page_mapcount_reset(page); /* remove PageBuddy */
return;
}
- if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
- bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
- bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
- }
-#ifdef CONFIG_MEMCG
- if (unlikely(page->mem_cgroup))
- bad_reason = "page still charged to cgroup";
-#endif
- bad_page(page, bad_reason, bad_flags);
+
+ bad_page(page,
+ page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP));
}
/*
@@ -2609,7 +2641,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
int order;
bool ret;
- for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
ac->nodemask) {
/*
* Preserve at least one pageblock unless memory pressure
@@ -2768,6 +2800,20 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype,
{
struct page *page;
+#ifdef CONFIG_CMA
+ /*
+ * Balance movable allocations between regular and CMA areas by
+ * allocating from CMA when over half of the zone's free memory
+ * is in the CMA area.
+ */
+ if (migratetype == MIGRATE_MOVABLE &&
+ zone_page_state(zone, NR_FREE_CMA_PAGES) >
+ zone_page_state(zone, NR_FREE_PAGES) / 2) {
+ page = __rmqueue_cma_fallback(zone, order);
+ if (page)
+ return page;
+ }
+#endif
retry:
page = __rmqueue_smallest(zone, order, migratetype);
if (unlikely(!page)) {
@@ -3464,7 +3510,7 @@ ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
* to check in the allocation paths if no pages are free.
*/
bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
- int classzone_idx, unsigned int alloc_flags,
+ int highest_zoneidx, unsigned int alloc_flags,
long free_pages)
{
long min = mark;
@@ -3509,7 +3555,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
* are not met, then a high-order request also cannot go ahead
* even if a suitable page happened to be free.
*/
- if (free_pages <= min + z->lowmem_reserve[classzone_idx])
+ if (free_pages <= min + z->lowmem_reserve[highest_zoneidx])
return false;
/* If this is an order-0 request then the watermark is fine */
@@ -3542,14 +3588,15 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
}
bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
- int classzone_idx, unsigned int alloc_flags)
+ int highest_zoneidx, unsigned int alloc_flags)
{
- return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+ return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
zone_page_state(z, NR_FREE_PAGES));
}
static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
- unsigned long mark, int classzone_idx, unsigned int alloc_flags)
+ unsigned long mark, int highest_zoneidx,
+ unsigned int alloc_flags)
{
long free_pages = zone_page_state(z, NR_FREE_PAGES);
long cma_pages = 0;
@@ -3567,22 +3614,23 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
* the caller is !atomic then it'll uselessly search the free
* list. That corner case is then slower but it is harmless.
*/
- if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
+ if (!order && (free_pages - cma_pages) >
+ mark + z->lowmem_reserve[highest_zoneidx])
return true;
- return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+ return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
free_pages);
}
bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
- unsigned long mark, int classzone_idx)
+ unsigned long mark, int highest_zoneidx)
{
long free_pages = zone_page_state(z, NR_FREE_PAGES);
if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
- return __zone_watermark_ok(z, order, mark, classzone_idx, 0,
+ return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
free_pages);
}
@@ -3659,8 +3707,8 @@ retry:
*/
no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
z = ac->preferred_zoneref;
- for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
- ac->nodemask) {
+ for_next_zone_zonelist_nodemask(zone, z, ac->zonelist,
+ ac->highest_zoneidx, ac->nodemask) {
struct page *page;
unsigned long mark;
@@ -3714,8 +3762,20 @@ retry:
}
mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
+ /*
+ * Allow GFP_ATOMIC order-0 allocations to exclude the
+ * zone->watermark_boost in their watermark calculations.
+ * We rely on the ALLOC_ flags set for GFP_ATOMIC requests in
+ * gfp_to_alloc_flags() for this. The reason we do not check
+ * GFP_ATOMIC directly is that we want to fall back to the slow path
+ * and thus wake up kswapd.
+ */
+ if (unlikely(!order && !(alloc_flags & ALLOC_WMARK_MASK) &&
+ (alloc_flags & (ALLOC_HARDER | ALLOC_HIGH)))) {
+ mark = zone->_watermark[WMARK_MIN];
+ }
if (!zone_watermark_fast(zone, order, mark,
- ac_classzone_idx(ac), alloc_flags)) {
+ ac->highest_zoneidx, alloc_flags)) {
int ret;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
@@ -3748,7 +3808,7 @@ retry:
default:
/* did we reclaim enough */
if (zone_watermark_ok(zone, order, mark,
- ac_classzone_idx(ac), alloc_flags))
+ ac->highest_zoneidx, alloc_flags))
goto try_this_zone;
continue;
@@ -3907,7 +3967,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
if (gfp_mask & __GFP_RETRY_MAYFAIL)
goto out;
/* The OOM killer does not needlessly kill tasks for lowmem */
- if (ac->high_zoneidx < ZONE_NORMAL)
+ if (ac->highest_zoneidx < ZONE_NORMAL)
goto out;
if (pm_suspended_storage())
goto out;
@@ -4110,10 +4170,10 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla
* Let's give them a good hope and keep retrying while the order-0
* watermarks are OK.
*/
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
- ac->nodemask) {
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
+ ac->highest_zoneidx, ac->nodemask) {
if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
- ac_classzone_idx(ac), alloc_flags))
+ ac->highest_zoneidx, alloc_flags))
return true;
}
return false;
@@ -4237,12 +4297,12 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
struct zoneref *z;
struct zone *zone;
pg_data_t *last_pgdat = NULL;
- enum zone_type high_zoneidx = ac->high_zoneidx;
+ enum zone_type highest_zoneidx = ac->highest_zoneidx;
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx,
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
ac->nodemask) {
if (last_pgdat != zone->zone_pgdat)
- wakeup_kswapd(zone, gfp_mask, order, high_zoneidx);
+ wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
last_pgdat = zone->zone_pgdat;
}
}
@@ -4285,7 +4345,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
alloc_flags |= ALLOC_HARDER;
#ifdef CONFIG_CMA
- if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+ if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA;
#endif
return alloc_flags;
@@ -4377,8 +4437,8 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
* request even if all reclaimable pages are considered then we are
* screwed and have to go OOM.
*/
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
- ac->nodemask) {
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
+ ac->highest_zoneidx, ac->nodemask) {
unsigned long available;
unsigned long reclaimable;
unsigned long min_wmark = min_wmark_pages(zone);
@@ -4392,7 +4452,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
* reclaimable pages?
*/
wmark = __zone_watermark_ok(zone, order, min_wmark,
- ac_classzone_idx(ac), alloc_flags, available);
+ ac->highest_zoneidx, alloc_flags, available);
trace_reclaim_retry_zone(z, order, reclaimable,
available, min_wmark, *no_progress_loops, wmark);
if (wmark) {
@@ -4511,7 +4571,7 @@ retry_cpuset:
* could end up iterating over non-eligible zones endlessly.
*/
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
- ac->high_zoneidx, ac->nodemask);
+ ac->highest_zoneidx, ac->nodemask);
if (!ac->preferred_zoneref->zone)
goto nopage;
@@ -4598,7 +4658,7 @@ retry:
if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
ac->nodemask = NULL;
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
- ac->high_zoneidx, ac->nodemask);
+ ac->highest_zoneidx, ac->nodemask);
}
/* Attempt with potentially adjusted zonelist and alloc_flags */
@@ -4732,10 +4792,10 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac, gfp_t *alloc_mask,
unsigned int *alloc_flags)
{
- ac->high_zoneidx = gfp_zone(gfp_mask);
+ ac->highest_zoneidx = gfp_zone(gfp_mask);
ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
ac->nodemask = nodemask;
- ac->migratetype = gfpflags_to_migratetype(gfp_mask);
+ ac->migratetype = gfp_migratetype(gfp_mask);
if (cpusets_enabled()) {
*alloc_mask |= __GFP_HARDWALL;
@@ -4771,7 +4831,7 @@ static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac)
* may get reset for allocations that ignore memory policies.
*/
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
- ac->high_zoneidx, ac->nodemask);
+ ac->highest_zoneidx, ac->nodemask);
}
/*
@@ -5685,14 +5745,13 @@ static void build_zonelists(pg_data_t *pgdat)
{
static int node_order[MAX_NUMNODES];
int node, load, nr_nodes = 0;
- nodemask_t used_mask;
+ nodemask_t used_mask = NODE_MASK_NONE;
int local_node, prev_node;
/* NUMA-aware ordering of nodes */
local_node = pgdat->node_id;
load = nr_online_nodes;
prev_node = local_node;
- nodes_clear(used_mask);
memset(node_order, 0, sizeof(node_order));
while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
@@ -5904,7 +5963,6 @@ void __ref build_all_zonelists(pg_data_t *pgdat)
static bool __meminit
overlap_memmap_init(unsigned long zone, unsigned long *pfn)
{
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
static struct memblock_region *r;
if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
@@ -5920,27 +5978,9 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
return true;
}
}
-#endif
return false;
}
-#ifdef CONFIG_SPARSEMEM
-/* Skip PFNs that belong to non-present sections */
-static inline __meminit unsigned long next_pfn(unsigned long pfn)
-{
- const unsigned long section_nr = pfn_to_section_nr(++pfn);
-
- if (present_section_nr(section_nr))
- return pfn;
- return section_nr_to_pfn(next_present_section_nr(section_nr));
-}
-#else
-static inline __meminit unsigned long next_pfn(unsigned long pfn)
-{
- return pfn++;
-}
-#endif
-
/*
* Initially all pages are reserved - free ones are freed
* up by memblock_free_all() once the early boot process is
@@ -5980,14 +6020,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
* function. They do not exist on hotplugged memory.
*/
if (context == MEMMAP_EARLY) {
- if (!early_pfn_valid(pfn)) {
- pfn = next_pfn(pfn);
- continue;
- }
- if (!early_pfn_in_nid(pfn, nid)) {
- pfn++;
- continue;
- }
if (overlap_memmap_init(zone, &pfn))
continue;
if (defer_init(nid, pfn, end_pfn))
@@ -6103,9 +6135,23 @@ static void __meminit zone_init_free_lists(struct zone *zone)
}
void __meminit __weak memmap_init(unsigned long size, int nid,
- unsigned long zone, unsigned long start_pfn)
+ unsigned long zone,
+ unsigned long range_start_pfn)
{
- memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY, NULL);
+ unsigned long start_pfn, end_pfn;
+ unsigned long range_end_pfn = range_start_pfn + size;
+ int i;
+
+ for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
+ start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
+ end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
+
+ if (end_pfn > start_pfn) {
+ size = end_pfn - start_pfn;
+ memmap_init_zone(size, nid, zone, start_pfn,
+ MEMMAP_EARLY, NULL);
+ }
+ }
}
static int zone_batchsize(struct zone *zone)
@@ -6257,10 +6303,25 @@ void __init setup_per_cpu_pageset(void)
{
struct pglist_data *pgdat;
struct zone *zone;
+ int __maybe_unused cpu;
for_each_populated_zone(zone)
setup_zone_pageset(zone);
+#ifdef CONFIG_NUMA
+ /*
+ * Unpopulated zones continue using the boot pagesets.
+ * The numa stats for these pagesets need to be reset.
+ * Otherwise, they will end up skewing the stats of
+ * the nodes these zones are associated with.
+ */
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_pageset *pcp = &per_cpu(boot_pageset, cpu);
+ memset(pcp->vm_numa_stat_diff, 0,
+ sizeof(pcp->vm_numa_stat_diff));
+ }
+#endif
+
for_each_online_pgdat(pgdat)
pgdat->per_cpu_nodestats =
alloc_percpu(struct per_cpu_nodestat);
@@ -6303,57 +6364,6 @@ void __meminit init_currently_empty_zone(struct zone *zone,
zone->initialized = 1;
}
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
-
-/*
- * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
- */
-int __meminit __early_pfn_to_nid(unsigned long pfn,
- struct mminit_pfnnid_cache *state)
-{
- unsigned long start_pfn, end_pfn;
- int nid;
-
- if (state->last_start <= pfn && pfn < state->last_end)
- return state->last_nid;
-
- nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
- if (nid != NUMA_NO_NODE) {
- state->last_start = start_pfn;
- state->last_end = end_pfn;
- state->last_nid = nid;
- }
-
- return nid;
-}
-#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
-
-/**
- * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
- * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
- * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
- *
- * If an architecture guarantees that all ranges registered contain no holes
- * and may be freed, this this function may be used instead of calling
- * memblock_free_early_nid() manually.
- */
-void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
-{
- unsigned long start_pfn, end_pfn;
- int i, this_nid;
-
- for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
- start_pfn = min(start_pfn, max_low_pfn);
- end_pfn = min(end_pfn, max_low_pfn);
-
- if (start_pfn < end_pfn)
- memblock_free_early_nid(PFN_PHYS(start_pfn),
- (end_pfn - start_pfn) << PAGE_SHIFT,
- this_nid);
- }
-}
-
/**
* sparse_memory_present_with_active_regions - Call memory_present for each active range
* @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
@@ -6466,8 +6476,7 @@ static unsigned long __init zone_spanned_pages_in_node(int nid,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
unsigned long *zone_start_pfn,
- unsigned long *zone_end_pfn,
- unsigned long *ignored)
+ unsigned long *zone_end_pfn)
{
unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
@@ -6531,8 +6540,7 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn,
static unsigned long __init zone_absent_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
- unsigned long node_end_pfn,
- unsigned long *ignored)
+ unsigned long node_end_pfn)
{
unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
@@ -6579,45 +6587,9 @@ static unsigned long __init zone_absent_pages_in_node(int nid,
return nr_absent;
}
-#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
-static inline unsigned long __init zone_spanned_pages_in_node(int nid,
- unsigned long zone_type,
- unsigned long node_start_pfn,
- unsigned long node_end_pfn,
- unsigned long *zone_start_pfn,
- unsigned long *zone_end_pfn,
- unsigned long *zones_size)
-{
- unsigned int zone;
-
- *zone_start_pfn = node_start_pfn;
- for (zone = 0; zone < zone_type; zone++)
- *zone_start_pfn += zones_size[zone];
-
- *zone_end_pfn = *zone_start_pfn + zones_size[zone_type];
-
- return zones_size[zone_type];
-}
-
-static inline unsigned long __init zone_absent_pages_in_node(int nid,
- unsigned long zone_type,
- unsigned long node_start_pfn,
- unsigned long node_end_pfn,
- unsigned long *zholes_size)
-{
- if (!zholes_size)
- return 0;
-
- return zholes_size[zone_type];
-}
-
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
-
static void __init calculate_node_totalpages(struct pglist_data *pgdat,
unsigned long node_start_pfn,
- unsigned long node_end_pfn,
- unsigned long *zones_size,
- unsigned long *zholes_size)
+ unsigned long node_end_pfn)
{
unsigned long realtotalpages = 0, totalpages = 0;
enum zone_type i;
@@ -6625,17 +6597,21 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat,
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
unsigned long zone_start_pfn, zone_end_pfn;
+ unsigned long spanned, absent;
unsigned long size, real_size;
- size = zone_spanned_pages_in_node(pgdat->node_id, i,
- node_start_pfn,
- node_end_pfn,
- &zone_start_pfn,
- &zone_end_pfn,
- zones_size);
- real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
- node_start_pfn, node_end_pfn,
- zholes_size);
+ spanned = zone_spanned_pages_in_node(pgdat->node_id, i,
+ node_start_pfn,
+ node_end_pfn,
+ &zone_start_pfn,
+ &zone_end_pfn);
+ absent = zone_absent_pages_in_node(pgdat->node_id, i,
+ node_start_pfn,
+ node_end_pfn);
+
+ size = spanned;
+ real_size = size - absent;
+
if (size)
zone->zone_start_pfn = zone_start_pfn;
else
@@ -6935,10 +6911,8 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
*/
if (pgdat == NODE_DATA(0)) {
mem_map = NODE_DATA(0)->node_mem_map;
-#if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM)
if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
mem_map -= offset;
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
}
#endif
}
@@ -6955,30 +6929,25 @@ static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
#endif
-void __init free_area_init_node(int nid, unsigned long *zones_size,
- unsigned long node_start_pfn,
- unsigned long *zholes_size)
+static void __init free_area_init_node(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
unsigned long start_pfn = 0;
unsigned long end_pfn = 0;
/* pg_data_t should be reset to zero when it's allocated */
- WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);
+ WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx);
+
+ get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
pgdat->node_id = nid;
- pgdat->node_start_pfn = node_start_pfn;
+ pgdat->node_start_pfn = start_pfn;
pgdat->per_cpu_nodestats = NULL;
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
- get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+
pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
(u64)start_pfn << PAGE_SHIFT,
end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
-#else
- start_pfn = node_start_pfn;
-#endif
- calculate_node_totalpages(pgdat, start_pfn, end_pfn,
- zones_size, zholes_size);
+ calculate_node_totalpages(pgdat, start_pfn, end_pfn);
alloc_node_mem_map(pgdat);
pgdat_set_deferred_range(pgdat);
@@ -6986,6 +6955,11 @@ void __init free_area_init_node(int nid, unsigned long *zones_size,
free_area_init_core(pgdat);
}
+void __init free_area_init_memoryless_node(int nid)
+{
+ free_area_init_node(nid);
+}
+
#if !defined(CONFIG_FLAT_NODE_MEM_MAP)
/*
* Initialize all valid struct pages in the range [spfn, epfn) and mark them
@@ -7069,8 +7043,6 @@ static inline void __init init_unavailable_mem(void)
}
#endif /* !CONFIG_FLAT_NODE_MEM_MAP */
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-
#if MAX_NUMNODES > 1
/*
* Figure out the number of possible node ids.
@@ -7134,24 +7106,6 @@ unsigned long __init node_map_pfn_alignment(void)
return ~accl_mask + 1;
}
-/* Find the lowest pfn for a node */
-static unsigned long __init find_min_pfn_for_node(int nid)
-{
- unsigned long min_pfn = ULONG_MAX;
- unsigned long start_pfn;
- int i;
-
- for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
- min_pfn = min(min_pfn, start_pfn);
-
- if (min_pfn == ULONG_MAX) {
- pr_warn("Could not find start_pfn for node %d\n", nid);
- return 0;
- }
-
- return min_pfn;
-}
-
/**
* find_min_pfn_with_active_regions - Find the minimum PFN registered
*
@@ -7160,7 +7114,7 @@ static unsigned long __init find_min_pfn_for_node(int nid)
*/
unsigned long __init find_min_pfn_with_active_regions(void)
{
- return find_min_pfn_for_node(MAX_NUMNODES);
+ return PHYS_PFN(memblock_start_of_DRAM());
}
/*
@@ -7213,7 +7167,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
if (!memblock_is_hotpluggable(r))
continue;
- nid = r->nid;
+ nid = memblock_get_region_node(r);
usable_startpfn = PFN_DOWN(r->base);
zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
@@ -7234,7 +7188,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
if (memblock_is_mirror(r))
continue;
- nid = r->nid;
+ nid = memblock_get_region_node(r);
usable_startpfn = memblock_region_memory_base_pfn(r);
@@ -7414,8 +7368,17 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
}
}
+/*
+ * Some architectures, e.g. ARC, may have ZONE_HIGHMEM below ZONE_NORMAL. For
+ * such cases we allow max_zone_pfn to be sorted in descending order.
+ */
+bool __weak arch_has_descending_max_zone_pfns(void)
+{
+ return false;
+}
+
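Architectures whose highmem zone sits below normal memory override the weak helper above; a minimal sketch (editor's illustration, not taken from this patch):

/* Report that max_zone_pfn[] on this platform is sorted high-to-low
 * because ZONE_HIGHMEM lies below ZONE_NORMAL in physical memory. */
bool arch_has_descending_max_zone_pfns(void)
{
	return true;
}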
/**
- * free_area_init_nodes - Initialise all pg_data_t and zone data
+ * free_area_init - Initialise all pg_data_t and zone data
* @max_zone_pfn: an array of max PFNs for each zone
*
* This will call free_area_init_node() for each active node in the system.
@@ -7427,10 +7390,11 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
* starts where the previous one ended. For example, ZONE_DMA32 starts
* at arch_max_dma_pfn.
*/
-void __init free_area_init_nodes(unsigned long *max_zone_pfn)
+void __init free_area_init(unsigned long *max_zone_pfn)
{
unsigned long start_pfn, end_pfn;
- int i, nid;
+ int i, nid, zone;
+ bool descending;
/* Record where the zone boundaries are */
memset(arch_zone_lowest_possible_pfn, 0,
@@ -7439,14 +7403,20 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
sizeof(arch_zone_highest_possible_pfn));
start_pfn = find_min_pfn_with_active_regions();
+ descending = arch_has_descending_max_zone_pfns();
for (i = 0; i < MAX_NR_ZONES; i++) {
- if (i == ZONE_MOVABLE)
+ if (descending)
+ zone = MAX_NR_ZONES - i - 1;
+ else
+ zone = i;
+
+ if (zone == ZONE_MOVABLE)
continue;
- end_pfn = max(max_zone_pfn[i], start_pfn);
- arch_zone_lowest_possible_pfn[i] = start_pfn;
- arch_zone_highest_possible_pfn[i] = end_pfn;
+ end_pfn = max(max_zone_pfn[zone], start_pfn);
+ arch_zone_lowest_possible_pfn[zone] = start_pfn;
+ arch_zone_highest_possible_pfn[zone] = end_pfn;
start_pfn = end_pfn;
}
@@ -7499,8 +7469,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
init_unavailable_mem();
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
- free_area_init_node(nid, NULL,
- find_min_pfn_for_node(nid), NULL);
+ free_area_init_node(nid);
/* Any memory on that node */
if (pgdat->node_present_pages)
@@ -7565,8 +7534,6 @@ static int __init cmdline_parse_movablecore(char *p)
early_param("kernelcore", cmdline_parse_kernelcore);
early_param("movablecore", cmdline_parse_movablecore);
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
-
void adjust_managed_page_count(struct page *page, long count)
{
atomic_long_add(count, &page_zone(page)->managed_pages);
@@ -7689,13 +7656,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
dma_reserve = new_dma_reserve;
}
-void __init free_area_init(unsigned long *zones_size)
-{
- init_unavailable_mem();
- free_area_init_node(0, zones_size,
- __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
-}
-
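With the old zones_size/zholes_size variant removed, every architecture funnels through the memblock-based free_area_init() above. A typical arch-side caller would now look roughly like this (editor's sketch; MAX_DMA32_PFN and the exact zone split are assumptions, not taken from this patch):

/* Sketch of an arch zone_sizes_init(): only the top PFN of each zone is
 * reported; spans and holes are derived from memblock by free_area_init(). */
static void __init zone_sizes_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0 };

#ifdef CONFIG_ZONE_DMA32
	max_zone_pfns[ZONE_DMA32] = min(MAX_DMA32_PFN, max_low_pfn);
#endif
	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;

	free_area_init(max_zone_pfns);
}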
static int page_alloc_cpu_dead(unsigned int cpu)
{
@@ -7813,9 +7773,10 @@ static void setup_per_zone_lowmem_reserve(void)
idx--;
lower_zone = pgdat->node_zones + idx;
- if (sysctl_lowmem_reserve_ratio[idx] < 1) {
- sysctl_lowmem_reserve_ratio[idx] = 0;
+ if (!sysctl_lowmem_reserve_ratio[idx] ||
+ !zone_managed_pages(lower_zone)) {
lower_zone->lowmem_reserve[j] = 0;
+ continue;
} else {
lower_zone->lowmem_reserve[j] =
managed_pages / sysctl_lowmem_reserve_ratio[idx];
@@ -7880,9 +7841,9 @@ static void __setup_per_zone_wmarks(void)
mult_frac(zone_managed_pages(zone),
watermark_scale_factor, 10000));
+ zone->watermark_boost = 0;
zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
- zone->watermark_boost = 0;
spin_unlock_irqrestore(&zone->lock, flags);
}
@@ -8067,7 +8028,15 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
+ int i;
+
proc_dointvec_minmax(table, write, buffer, length, ppos);
+
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ if (sysctl_lowmem_reserve_ratio[i] < 1)
+ sysctl_lowmem_reserve_ratio[i] = 0;
+ }
+
setup_per_zone_lowmem_reserve();
return 0;
}
@@ -8231,7 +8200,7 @@ void *__init alloc_large_system_hash(const char *tablename,
table = memblock_alloc_raw(size,
SMP_CACHE_BYTES);
} else if (get_order(size) >= MAX_ORDER || hashdist) {
- table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
+ table = __vmalloc(size, gfp_flags);
virt = true;
} else {
/*
@@ -8395,7 +8364,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
unsigned long start, unsigned long end)
{
/* This function is based on compact_zone() from compaction.c. */
- unsigned long nr_reclaimed;
+ unsigned int nr_reclaimed;
unsigned long pfn = start;
unsigned int tries = 0;
int ret = 0;
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 18ecde9f45b2..360461509423 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -312,8 +312,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
continue;
page_owner = get_page_owner(page_ext);
- page_mt = gfpflags_to_migratetype(
- page_owner->gfp_mask);
+ page_mt = gfp_migratetype(page_owner->gfp_mask);
if (pageblock_mt != page_mt) {
if (is_migrate_cma(pageblock_mt))
count[MIGRATE_MOVABLE]++;
@@ -359,7 +358,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
/* Print information relevant to grouping pages by mobility */
pageblock_mt = get_pageblock_migratetype(page);
- page_mt = gfpflags_to_migratetype(page_owner->gfp_mask);
+ page_mt = gfp_migratetype(page_owner->gfp_mask);
ret += snprintf(kbuf + ret, count - ret,
"PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n",
pfn,
@@ -416,7 +415,7 @@ void __dump_page_owner(struct page *page)
page_owner = get_page_owner(page_ext);
gfp_mask = page_owner->gfp_mask;
- mt = gfpflags_to_migratetype(gfp_mask);
+ mt = gfp_migratetype(gfp_mask);
if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
pr_alert("page_owner info is not present (never set?)\n");
diff --git a/mm/percpu.c b/mm/percpu.c
index 7da7d7737dab..696367b18222 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -482,7 +482,7 @@ static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
if (size <= PAGE_SIZE)
return kzalloc(size, gfp);
else
- return __vmalloc(size, gfp | __GFP_ZERO, PAGE_KERNEL);
+ return __vmalloc(size, gfp | __GFP_ZERO);
}
/**
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 3d7c01e76efc..d18f0e1b6792 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -194,7 +194,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
- pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mknotpresent(*pmdp));
+ pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return old;
}
diff --git a/mm/ptdump.c b/mm/ptdump.c
index 26208d0d03b7..f4ce916f5602 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -36,6 +36,9 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr,
return note_kasan_page_table(walk, addr);
#endif
+ if (st->effective_prot)
+ st->effective_prot(st, 0, pgd_val(val));
+
if (pgd_leaf(val))
st->note_page(st, addr, 0, pgd_val(val));
@@ -53,6 +56,9 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr,
return note_kasan_page_table(walk, addr);
#endif
+ if (st->effective_prot)
+ st->effective_prot(st, 1, p4d_val(val));
+
if (p4d_leaf(val))
st->note_page(st, addr, 1, p4d_val(val));
@@ -70,6 +76,9 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr,
return note_kasan_page_table(walk, addr);
#endif
+ if (st->effective_prot)
+ st->effective_prot(st, 2, pud_val(val));
+
if (pud_leaf(val))
st->note_page(st, addr, 2, pud_val(val));
@@ -87,6 +96,8 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr,
return note_kasan_page_table(walk, addr);
#endif
+ if (st->effective_prot)
+ st->effective_prot(st, 3, pmd_val(val));
if (pmd_leaf(val))
st->note_page(st, addr, 3, pmd_val(val));
@@ -97,8 +108,12 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
struct ptdump_state *st = walk->private;
+ pte_t val = READ_ONCE(*pte);
+
+ if (st->effective_prot)
+ st->effective_prot(st, 4, pte_val(val));
- st->note_page(st, addr, 4, pte_val(READ_ONCE(*pte)));
+ st->note_page(st, addr, 4, pte_val(val));
return 0;
}
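Each walker can now supply an effective_prot() hook that sees the raw entry for every level (0 = PGD ... 4 = PTE) before note_page() runs. A dumper that wants cumulative protections might wire it up like this (editor's sketch; the AND-combining policy is an assumption, real architectures mix AND/OR depending on the bit):

#include <linux/ptdump.h>

struct my_dump_state {
	struct ptdump_state ptdump;
	u64 prot[5];		/* effective entry bits per page-table level */
};

/* Remember the entry combined with the levels above it, so note_page()
 * can report an "effective" protection for leaf mappings. */
static void my_effective_prot(struct ptdump_state *st, int level, u64 val)
{
	struct my_dump_state *ms =
		container_of(st, struct my_dump_state, ptdump);

	ms->prot[level] = level ? (ms->prot[level - 1] & val) : val;
}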
diff --git a/mm/readahead.c b/mm/readahead.c
index 2fe72cd29b47..3c9a8dd7c56c 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -22,6 +22,7 @@
#include <linux/mm_inline.h>
#include <linux/blk-cgroup.h>
#include <linux/fadvise.h>
+#include <linux/sched/mm.h>
#include "internal.h"
@@ -113,94 +114,126 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
EXPORT_SYMBOL(read_cache_pages);
-static int read_pages(struct address_space *mapping, struct file *filp,
- struct list_head *pages, unsigned int nr_pages, gfp_t gfp)
+static void read_pages(struct readahead_control *rac, struct list_head *pages,
+ bool skip_page)
{
+ const struct address_space_operations *aops = rac->mapping->a_ops;
+ struct page *page;
struct blk_plug plug;
- unsigned page_idx;
- int ret;
+
+ if (!readahead_count(rac))
+ goto out;
blk_start_plug(&plug);
- if (mapping->a_ops->readpages) {
- ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
+ if (aops->readahead) {
+ aops->readahead(rac);
+ /* Clean up the remaining pages */
+ while ((page = readahead_page(rac))) {
+ unlock_page(page);
+ put_page(page);
+ }
+ } else if (aops->readpages) {
+ aops->readpages(rac->file, rac->mapping, pages,
+ readahead_count(rac));
/* Clean up the remaining pages */
put_pages_list(pages);
- goto out;
- }
-
- for (page_idx = 0; page_idx < nr_pages; page_idx++) {
- struct page *page = lru_to_page(pages);
- list_del(&page->lru);
- if (!add_to_page_cache_lru(page, mapping, page->index, gfp))
- mapping->a_ops->readpage(filp, page);
- put_page(page);
+ rac->_index += rac->_nr_pages;
+ rac->_nr_pages = 0;
+ } else {
+ while ((page = readahead_page(rac))) {
+ aops->readpage(rac->file, page);
+ put_page(page);
+ }
}
- ret = 0;
-out:
blk_finish_plug(&plug);
- return ret;
+ BUG_ON(!list_empty(pages));
+ BUG_ON(readahead_count(rac));
+
+out:
+ if (skip_page)
+ rac->_index++;
}
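On the filesystem side, the new ->readahead() address_space operation receives a readahead_control whose pages are already locked in the page cache; anything the filesystem does not consume is unlocked and dropped by read_pages() above. A minimal implementation could mirror the ->readpage fallback (editor's sketch; myfs_read_one_page is an assumed helper that, like ->readpage, unlocks the page once the read completes):

static void myfs_readahead(struct readahead_control *rac)
{
	struct page *page;

	/* Pull each page off the request; the reference taken by
	 * readahead_page() is dropped once I/O has been submitted. */
	while ((page = readahead_page(rac))) {
		myfs_read_one_page(rac->file, page);	/* assumed fs helper */
		put_page(page);
	}
}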
-/*
- * __do_page_cache_readahead() actually reads a chunk of disk. It allocates
- * the pages first, then submits them for I/O. This avoids the very bad
- * behaviour which would occur if page allocations are causing VM writeback.
- * We really don't want to intermingle reads and writes like that.
+/**
+ * page_cache_readahead_unbounded - Start unchecked readahead.
+ * @mapping: File address space.
+ * @file: This instance of the open file; used for authentication.
+ * @index: First page index to read.
+ * @nr_to_read: The number of pages to read.
+ * @lookahead_size: Where to start the next readahead.
*
- * Returns the number of pages requested, or the maximum amount of I/O allowed.
+ * This function is for filesystems to call when they want to start
+ * readahead beyond a file's stated i_size. This is almost certainly
+ * not the function you want to call. Use page_cache_async_readahead()
+ * or page_cache_sync_readahead() instead.
+ *
+ * Context: File is referenced by caller. Mutexes may be held by caller.
+ * May sleep, but will not reenter filesystem to reclaim memory.
*/
-unsigned int __do_page_cache_readahead(struct address_space *mapping,
- struct file *filp, pgoff_t offset, unsigned long nr_to_read,
+void page_cache_readahead_unbounded(struct address_space *mapping,
+ struct file *file, pgoff_t index, unsigned long nr_to_read,
unsigned long lookahead_size)
{
- struct inode *inode = mapping->host;
- struct page *page;
- unsigned long end_index; /* The last page we want to read */
LIST_HEAD(page_pool);
- int page_idx;
- unsigned int nr_pages = 0;
- loff_t isize = i_size_read(inode);
gfp_t gfp_mask = readahead_gfp_mask(mapping);
+ struct readahead_control rac = {
+ .mapping = mapping,
+ .file = file,
+ ._index = index,
+ };
+ unsigned long i;
- if (isize == 0)
- goto out;
-
- end_index = ((isize - 1) >> PAGE_SHIFT);
+ /*
+ * Partway through the readahead operation, we will have added
+ * locked pages to the page cache, but will not yet have submitted
+ * them for I/O. Adding another page may need to allocate memory,
+ * which can trigger memory reclaim. Telling the VM we're in
+ * the middle of a filesystem operation will cause it to not
+ * touch file-backed pages, preventing a deadlock. Most (all?)
+ * filesystems already specify __GFP_NOFS in their mapping's
+ * gfp_mask, but let's be explicit here.
+ */
+ unsigned int nofs = memalloc_nofs_save();
/*
* Preallocate as many pages as we will need.
*/
- for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
- pgoff_t page_offset = offset + page_idx;
+ for (i = 0; i < nr_to_read; i++) {
+ struct page *page = xa_load(&mapping->i_pages, index + i);
- if (page_offset > end_index)
- break;
+ BUG_ON(index + i != rac._index + rac._nr_pages);
- page = xa_load(&mapping->i_pages, page_offset);
if (page && !xa_is_value(page)) {
/*
- * Page already present? Kick off the current batch of
- * contiguous pages before continuing with the next
- * batch.
+ * Page already present? Kick off the current batch
+ * of contiguous pages before continuing with the
+ * next batch. This page may be the one we would
+ * have intended to mark as Readahead, but we don't
+ * have a stable reference to this page, and it's
+ * not worth getting one just for that.
*/
- if (nr_pages)
- read_pages(mapping, filp, &page_pool, nr_pages,
- gfp_mask);
- nr_pages = 0;
+ read_pages(&rac, &page_pool, true);
continue;
}
page = __page_cache_alloc(gfp_mask);
if (!page)
break;
- page->index = page_offset;
- list_add(&page->lru, &page_pool);
- if (page_idx == nr_to_read - lookahead_size)
+ if (mapping->a_ops->readpages) {
+ page->index = index + i;
+ list_add(&page->lru, &page_pool);
+ } else if (add_to_page_cache_lru(page, mapping, index + i,
+ gfp_mask) < 0) {
+ put_page(page);
+ read_pages(&rac, &page_pool, true);
+ continue;
+ }
+ if (i == nr_to_read - lookahead_size)
SetPageReadahead(page);
- nr_pages++;
+ rac._nr_pages++;
}
/*
@@ -208,26 +241,53 @@ unsigned int __do_page_cache_readahead(struct address_space *mapping,
* uptodate then the caller will launch readpage again, and
* will then handle the error.
*/
- if (nr_pages)
- read_pages(mapping, filp, &page_pool, nr_pages, gfp_mask);
- BUG_ON(!list_empty(&page_pool));
-out:
- return nr_pages;
+ read_pages(&rac, &page_pool, false);
+ memalloc_nofs_restore(nofs);
+}
+EXPORT_SYMBOL_GPL(page_cache_readahead_unbounded);
+
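Because the bounded entry points below clamp nr_to_read to i_size, a filesystem that keeps data past the stated file size (a metadata tail, for instance) would reach for the unbounded variant. A hypothetical caller (editor's sketch; myfs_prefetch_tail and the tail layout are assumptions):

static void myfs_prefetch_tail(struct file *file, pgoff_t tail_index,
			       unsigned long nr_pages)
{
	/* Kick off readahead of pages beyond i_size, which
	 * page_cache_sync_readahead() would refuse to touch. */
	page_cache_readahead_unbounded(file->f_mapping, file, tail_index,
				       nr_pages, 0);
}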
+/*
+ * __do_page_cache_readahead() actually reads a chunk of disk. It allocates
+ * the pages first, then submits them for I/O. This avoids the very bad
+ * behaviour which would occur if page allocations are causing VM writeback.
+ * We really don't want to intermingle reads and writes like that.
+ */
+void __do_page_cache_readahead(struct address_space *mapping,
+ struct file *file, pgoff_t index, unsigned long nr_to_read,
+ unsigned long lookahead_size)
+{
+ struct inode *inode = mapping->host;
+ loff_t isize = i_size_read(inode);
+ pgoff_t end_index; /* The last page we want to read */
+
+ if (isize == 0)
+ return;
+
+ end_index = (isize - 1) >> PAGE_SHIFT;
+ if (index > end_index)
+ return;
+ /* Don't read past the page containing the last byte of the file */
+ if (nr_to_read > end_index - index)
+ nr_to_read = end_index - index + 1;
+
+ page_cache_readahead_unbounded(mapping, file, index, nr_to_read,
+ lookahead_size);
}
/*
* Chunk the readahead into 2 megabyte units, so that we don't pin too much
* memory at once.
*/
-int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
- pgoff_t offset, unsigned long nr_to_read)
+void force_page_cache_readahead(struct address_space *mapping,
+ struct file *filp, pgoff_t index, unsigned long nr_to_read)
{
struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
struct file_ra_state *ra = &filp->f_ra;
unsigned long max_pages;
- if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
- return -EINVAL;
+ if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages &&
+ !mapping->a_ops->readahead))
+ return;
/*
* If the request exceeds the readahead window, allow the read to
@@ -240,12 +300,11 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
if (this_chunk > nr_to_read)
this_chunk = nr_to_read;
- __do_page_cache_readahead(mapping, filp, offset, this_chunk, 0);
+ __do_page_cache_readahead(mapping, filp, index, this_chunk, 0);
- offset += this_chunk;
+ index += this_chunk;
nr_to_read -= this_chunk;
}
- return 0;
}
/*
@@ -324,21 +383,21 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra,
*/
/*
- * Count contiguously cached pages from @offset-1 to @offset-@max,
+ * Count contiguously cached pages from @index-1 to @index-@max,
* this count is a conservative estimation of
* - length of the sequential read sequence, or
* - thrashing threshold in memory tight systems
*/
static pgoff_t count_history_pages(struct address_space *mapping,
- pgoff_t offset, unsigned long max)
+ pgoff_t index, unsigned long max)
{
pgoff_t head;
rcu_read_lock();
- head = page_cache_prev_miss(mapping, offset - 1, max);
+ head = page_cache_prev_miss(mapping, index - 1, max);
rcu_read_unlock();
- return offset - 1 - head;
+ return index - 1 - head;
}
/*
@@ -346,13 +405,13 @@ static pgoff_t count_history_pages(struct address_space *mapping,
*/
static int try_context_readahead(struct address_space *mapping,
struct file_ra_state *ra,
- pgoff_t offset,
+ pgoff_t index,
unsigned long req_size,
unsigned long max)
{
pgoff_t size;
- size = count_history_pages(mapping, offset, max);
+ size = count_history_pages(mapping, index, max);
/*
* not enough history pages:
@@ -365,10 +424,10 @@ static int try_context_readahead(struct address_space *mapping,
* starts from beginning of file:
* it is a strong indication of long-run stream (or whole-file-read)
*/
- if (size >= offset)
+ if (size >= index)
size *= 2;
- ra->start = offset;
+ ra->start = index;
ra->size = min(size + req_size, max);
ra->async_size = 1;
@@ -378,16 +437,15 @@ static int try_context_readahead(struct address_space *mapping,
/*
* A minimal readahead algorithm for trivial sequential/random reads.
*/
-static unsigned long
-ondemand_readahead(struct address_space *mapping,
- struct file_ra_state *ra, struct file *filp,
- bool hit_readahead_marker, pgoff_t offset,
- unsigned long req_size)
+static void ondemand_readahead(struct address_space *mapping,
+ struct file_ra_state *ra, struct file *filp,
+ bool hit_readahead_marker, pgoff_t index,
+ unsigned long req_size)
{
struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
unsigned long max_pages = ra->ra_pages;
unsigned long add_pages;
- pgoff_t prev_offset;
+ pgoff_t prev_index;
/*
* If the request exceeds the readahead window, allow the read to
@@ -399,15 +457,15 @@ ondemand_readahead(struct address_space *mapping,
/*
* start of file
*/
- if (!offset)
+ if (!index)
goto initial_readahead;
/*
- * It's the expected callback offset, assume sequential access.
+ * It's the expected callback index, assume sequential access.
* Ramp up sizes, and push forward the readahead window.
*/
- if ((offset == (ra->start + ra->size - ra->async_size) ||
- offset == (ra->start + ra->size))) {
+ if ((index == (ra->start + ra->size - ra->async_size) ||
+ index == (ra->start + ra->size))) {
ra->start += ra->size;
ra->size = get_next_ra_size(ra, max_pages);
ra->async_size = ra->size;
@@ -424,14 +482,14 @@ ondemand_readahead(struct address_space *mapping,
pgoff_t start;
rcu_read_lock();
- start = page_cache_next_miss(mapping, offset + 1, max_pages);
+ start = page_cache_next_miss(mapping, index + 1, max_pages);
rcu_read_unlock();
- if (!start || start - offset > max_pages)
- return 0;
+ if (!start || start - index > max_pages)
+ return;
ra->start = start;
- ra->size = start - offset; /* old async_size */
+ ra->size = start - index; /* old async_size */
ra->size += req_size;
ra->size = get_next_ra_size(ra, max_pages);
ra->async_size = ra->size;
@@ -446,28 +504,29 @@ ondemand_readahead(struct address_space *mapping,
/*
* sequential cache miss
- * trivial case: (offset - prev_offset) == 1
- * unaligned reads: (offset - prev_offset) == 0
+ * trivial case: (index - prev_index) == 1
+ * unaligned reads: (index - prev_index) == 0
*/
- prev_offset = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
- if (offset - prev_offset <= 1UL)
+ prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
+ if (index - prev_index <= 1UL)
goto initial_readahead;
/*
* Query the page cache and look for the traces(cached history pages)
* that a sequential stream would leave behind.
*/
- if (try_context_readahead(mapping, ra, offset, req_size, max_pages))
+ if (try_context_readahead(mapping, ra, index, req_size, max_pages))
goto readit;
/*
* standalone, small random read
* Read as is, and do not pollute the readahead state.
*/
- return __do_page_cache_readahead(mapping, filp, offset, req_size, 0);
+ __do_page_cache_readahead(mapping, filp, index, req_size, 0);
+ return;
initial_readahead:
- ra->start = offset;
+ ra->start = index;
ra->size = get_init_ra_size(req_size, max_pages);
ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
@@ -478,7 +537,7 @@ readit:
* the resulted next readahead window into the current one.
* Take care of maximum IO pages as above.
*/
- if (offset == ra->start && ra->size == ra->async_size) {
+ if (index == ra->start && ra->size == ra->async_size) {
add_pages = get_next_ra_size(ra, max_pages);
if (ra->size + add_pages <= max_pages) {
ra->async_size = add_pages;
@@ -489,7 +548,7 @@ readit:
}
}
- return ra_submit(ra, mapping, filp);
+ ra_submit(ra, mapping, filp);
}
/**
@@ -497,9 +556,8 @@ readit:
* @mapping: address_space which holds the pagecache and I/O vectors
* @ra: file_ra_state which holds the readahead state
* @filp: passed on to ->readpage() and ->readpages()
- * @offset: start offset into @mapping, in pagecache page-sized units
- * @req_size: hint: total size of the read which the caller is performing in
- * pagecache pages
+ * @index: Index of first page to be read.
+ * @req_count: Total number of pages being read by the caller.
*
* page_cache_sync_readahead() should be called when a cache miss happened:
* it will submit the read. The readahead logic may decide to piggyback more
@@ -508,7 +566,7 @@ readit:
*/
void page_cache_sync_readahead(struct address_space *mapping,
struct file_ra_state *ra, struct file *filp,
- pgoff_t offset, unsigned long req_size)
+ pgoff_t index, unsigned long req_count)
{
/* no read-ahead */
if (!ra->ra_pages)
@@ -519,12 +577,12 @@ void page_cache_sync_readahead(struct address_space *mapping,
/* be dumb */
if (filp && (filp->f_mode & FMODE_RANDOM)) {
- force_page_cache_readahead(mapping, filp, offset, req_size);
+ force_page_cache_readahead(mapping, filp, index, req_count);
return;
}
/* do read-ahead */
- ondemand_readahead(mapping, ra, filp, false, offset, req_size);
+ ondemand_readahead(mapping, ra, filp, false, index, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_sync_readahead);
@@ -533,21 +591,20 @@ EXPORT_SYMBOL_GPL(page_cache_sync_readahead);
* @mapping: address_space which holds the pagecache and I/O vectors
* @ra: file_ra_state which holds the readahead state
* @filp: passed on to ->readpage() and ->readpages()
- * @page: the page at @offset which has the PG_readahead flag set
- * @offset: start offset into @mapping, in pagecache page-sized units
- * @req_size: hint: total size of the read which the caller is performing in
- * pagecache pages
+ * @page: The page at @index which triggered the readahead call.
+ * @index: Index of first page to be read.
+ * @req_count: Total number of pages being read by the caller.
*
* page_cache_async_readahead() should be called when a page is used which
- * has the PG_readahead flag; this is a marker to suggest that the application
+ * is marked as PageReadahead; this is a marker to suggest that the application
* has used up enough of the readahead window that we should start pulling in
* more pages.
*/
void
page_cache_async_readahead(struct address_space *mapping,
struct file_ra_state *ra, struct file *filp,
- struct page *page, pgoff_t offset,
- unsigned long req_size)
+ struct page *page, pgoff_t index,
+ unsigned long req_count)
{
/* no read-ahead */
if (!ra->ra_pages)
@@ -571,7 +628,7 @@ page_cache_async_readahead(struct address_space *mapping,
return;
/* do read-ahead */
- ondemand_readahead(mapping, ra, filp, true, offset, req_size);
+ ondemand_readahead(mapping, ra, filp, true, index, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_async_readahead);
diff --git a/mm/rmap.c b/mm/rmap.c
index f79a206b271a..ad4a0fdcc94c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1114,6 +1114,11 @@ void do_page_add_anon_rmap(struct page *page,
bool compound = flags & RMAP_COMPOUND;
bool first;
+ if (unlikely(PageKsm(page)))
+ lock_page_memcg(page);
+ else
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+
if (compound) {
atomic_t *mapcount;
VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -1133,13 +1138,14 @@ void do_page_add_anon_rmap(struct page *page,
* disabled.
*/
if (compound)
- __inc_node_page_state(page, NR_ANON_THPS);
- __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr);
+ __inc_lruvec_page_state(page, NR_ANON_THPS);
+ __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
}
- if (unlikely(PageKsm(page)))
- return;
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ if (unlikely(PageKsm(page))) {
+ unlock_page_memcg(page);
+ return;
+ }
/* address might be in next vma when migration races vma_adjust */
if (first)
@@ -1174,14 +1180,14 @@ void page_add_new_anon_rmap(struct page *page,
if (hpage_pincount_available(page))
atomic_set(compound_pincount_ptr(page), 0);
- __inc_node_page_state(page, NR_ANON_THPS);
+ __inc_lruvec_page_state(page, NR_ANON_THPS);
} else {
/* Anon THP always mapped first with PMD */
VM_BUG_ON_PAGE(PageTransCompound(page), page);
/* increment count (starts at -1) */
atomic_set(&page->_mapcount, 0);
}
- __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr);
+ __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
__page_set_anon_rmap(page, vma, address, 1);
}
@@ -1230,13 +1236,12 @@ static void page_remove_file_rmap(struct page *page, bool compound)
int i, nr = 1;
VM_BUG_ON_PAGE(compound && !PageHead(page), page);
- lock_page_memcg(page);
/* Hugepages are not counted in NR_FILE_MAPPED for now. */
if (unlikely(PageHuge(page))) {
/* hugetlb pages are always mapped with pmds */
atomic_dec(compound_mapcount_ptr(page));
- goto out;
+ return;
}
/* page still mapped by someone else? */
@@ -1246,14 +1251,14 @@ static void page_remove_file_rmap(struct page *page, bool compound)
nr++;
}
if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
- goto out;
+ return;
if (PageSwapBacked(page))
__dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
else
__dec_node_page_state(page, NR_FILE_PMDMAPPED);
} else {
if (!atomic_add_negative(-1, &page->_mapcount))
- goto out;
+ return;
}
/*
@@ -1265,8 +1270,6 @@ static void page_remove_file_rmap(struct page *page, bool compound)
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
-out:
- unlock_page_memcg(page);
}
static void page_remove_anon_compound_rmap(struct page *page)
@@ -1283,7 +1286,7 @@ static void page_remove_anon_compound_rmap(struct page *page)
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
return;
- __dec_node_page_state(page, NR_ANON_THPS);
+ __dec_lruvec_page_state(page, NR_ANON_THPS);
if (TestClearPageDoubleMap(page)) {
/*
@@ -1310,7 +1313,7 @@ static void page_remove_anon_compound_rmap(struct page *page)
clear_page_mlock(page);
if (nr)
- __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, -nr);
+ __mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr);
}
/**
@@ -1322,22 +1325,28 @@ static void page_remove_anon_compound_rmap(struct page *page)
*/
void page_remove_rmap(struct page *page, bool compound)
{
- if (!PageAnon(page))
- return page_remove_file_rmap(page, compound);
+ lock_page_memcg(page);
- if (compound)
- return page_remove_anon_compound_rmap(page);
+ if (!PageAnon(page)) {
+ page_remove_file_rmap(page, compound);
+ goto out;
+ }
+
+ if (compound) {
+ page_remove_anon_compound_rmap(page);
+ goto out;
+ }
/* page still mapped by someone else? */
if (!atomic_add_negative(-1, &page->_mapcount))
- return;
+ goto out;
/*
* We use the irq-unsafe __{inc|mod}_zone_page_stat because
* these counters are not modified in interrupt context, and
* pte lock(a spinlock) is held, which implies preemption disabled.
*/
- __dec_node_page_state(page, NR_ANON_MAPPED);
+ __dec_lruvec_page_state(page, NR_ANON_MAPPED);
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
@@ -1354,6 +1363,8 @@ void page_remove_rmap(struct page *page, bool compound)
* Leaving it set also helps swapoff to reinstate ptes
* faster for those pages still in swapcache.
*/
+out:
+ unlock_page_memcg(page);
}
/*
diff --git a/mm/shmem.c b/mm/shmem.c
index bd8840082c94..ea95a3e46fbb 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -605,11 +605,13 @@ static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo)
*/
static int shmem_add_to_page_cache(struct page *page,
struct address_space *mapping,
- pgoff_t index, void *expected, gfp_t gfp)
+ pgoff_t index, void *expected, gfp_t gfp,
+ struct mm_struct *charge_mm)
{
XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
unsigned long i = 0;
unsigned long nr = compound_nr(page);
+ int error;
VM_BUG_ON_PAGE(PageTail(page), page);
VM_BUG_ON_PAGE(index != round_down(index, nr), page);
@@ -621,6 +623,18 @@ static int shmem_add_to_page_cache(struct page *page,
page->mapping = mapping;
page->index = index;
+ if (!PageSwapCache(page)) {
+ error = mem_cgroup_charge(page, charge_mm, gfp);
+ if (error) {
+ if (PageTransHuge(page)) {
+ count_vm_event(THP_FILE_FALLBACK);
+ count_vm_event(THP_FILE_FALLBACK_CHARGE);
+ }
+ goto error;
+ }
+ }
+ cgroup_throttle_swaprate(page, gfp);
+
do {
void *entry;
xas_lock_irq(&xas);
@@ -641,19 +655,22 @@ next:
__inc_node_page_state(page, NR_SHMEM_THPS);
}
mapping->nrpages += nr;
- __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
- __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr);
+ __mod_lruvec_page_state(page, NR_FILE_PAGES, nr);
+ __mod_lruvec_page_state(page, NR_SHMEM, nr);
unlock:
xas_unlock_irq(&xas);
} while (xas_nomem(&xas, gfp));
if (xas_error(&xas)) {
- page->mapping = NULL;
- page_ref_sub(page, nr);
- return xas_error(&xas);
+ error = xas_error(&xas);
+ goto error;
}
return 0;
+error:
+ page->mapping = NULL;
+ page_ref_sub(page, nr);
+ return error;
}
/*
@@ -670,8 +687,8 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap)
error = shmem_replace_entry(mapping, page->index, page, radswap);
page->mapping = NULL;
mapping->nrpages--;
- __dec_node_page_state(page, NR_FILE_PAGES);
- __dec_node_page_state(page, NR_SHMEM);
+ __dec_lruvec_page_state(page, NR_FILE_PAGES);
+ __dec_lruvec_page_state(page, NR_SHMEM);
xa_unlock_irq(&mapping->i_pages);
put_page(page);
BUG_ON(error);
@@ -1578,8 +1595,9 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
xa_lock_irq(&swap_mapping->i_pages);
error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage);
if (!error) {
- __inc_node_page_state(newpage, NR_FILE_PAGES);
- __dec_node_page_state(oldpage, NR_FILE_PAGES);
+ mem_cgroup_migrate(oldpage, newpage);
+ __inc_lruvec_page_state(newpage, NR_FILE_PAGES);
+ __dec_lruvec_page_state(oldpage, NR_FILE_PAGES);
}
xa_unlock_irq(&swap_mapping->i_pages);
@@ -1591,8 +1609,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
*/
oldpage = newpage;
} else {
- mem_cgroup_migrate(oldpage, newpage);
- lru_cache_add_anon(newpage);
+ lru_cache_add(newpage);
*pagep = newpage;
}
@@ -1619,7 +1636,6 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index,
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm;
- struct mem_cgroup *memcg;
struct page *page;
swp_entry_t swap;
int error;
@@ -1664,31 +1680,12 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index,
goto failed;
}
- error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
- false);
- if (!error) {
- error = shmem_add_to_page_cache(page, mapping, index,
- swp_to_radix_entry(swap), gfp);
- /*
- * We already confirmed swap under page lock, and make
- * no memory allocation here, so usually no possibility
- * of error; but free_swap_and_cache() only trylocks a
- * page, so it is just possible that the entry has been
- * truncated or holepunched since swap was confirmed.
- * shmem_undo_range() will have done some of the
- * unaccounting, now delete_from_swap_cache() will do
- * the rest.
- */
- if (error) {
- mem_cgroup_cancel_charge(page, memcg, false);
- delete_from_swap_cache(page);
- }
- }
+ error = shmem_add_to_page_cache(page, mapping, index,
+ swp_to_radix_entry(swap), gfp,
+ charge_mm);
if (error)
goto failed;
- mem_cgroup_commit_charge(page, memcg, true, false);
-
spin_lock_irq(&info->lock);
info->swapped--;
shmem_recalc_inode(inode);
@@ -1734,7 +1731,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo;
struct mm_struct *charge_mm;
- struct mem_cgroup *memcg;
struct page *page;
enum sgp_type sgp_huge = sgp;
pgoff_t hindex = index;
@@ -1859,25 +1855,12 @@ alloc_nohuge:
if (sgp == SGP_WRITE)
__SetPageReferenced(page);
- error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
- PageTransHuge(page));
- if (error) {
- if (PageTransHuge(page)) {
- count_vm_event(THP_FILE_FALLBACK);
- count_vm_event(THP_FILE_FALLBACK_CHARGE);
- }
- goto unacct;
- }
error = shmem_add_to_page_cache(page, mapping, hindex,
- NULL, gfp & GFP_RECLAIM_MASK);
- if (error) {
- mem_cgroup_cancel_charge(page, memcg,
- PageTransHuge(page));
+ NULL, gfp & GFP_RECLAIM_MASK,
+ charge_mm);
+ if (error)
goto unacct;
- }
- mem_cgroup_commit_charge(page, memcg, false,
- PageTransHuge(page));
- lru_cache_add_anon(page);
+ lru_cache_add(page);
spin_lock_irq(&info->lock);
info->alloced += compound_nr(page);
@@ -2314,7 +2297,6 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
struct address_space *mapping = inode->i_mapping;
gfp_t gfp = mapping_gfp_mask(mapping);
pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
- struct mem_cgroup *memcg;
spinlock_t *ptl;
void *page_kaddr;
struct page *page;
@@ -2364,16 +2346,10 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
if (unlikely(offset >= max_off))
goto out_release;
- ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false);
- if (ret)
- goto out_release;
-
ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
- gfp & GFP_RECLAIM_MASK);
+ gfp & GFP_RECLAIM_MASK, dst_mm);
if (ret)
- goto out_release_uncharge;
-
- mem_cgroup_commit_charge(page, memcg, false, false);
+ goto out_release;
_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
if (dst_vma->vm_flags & VM_WRITE)
@@ -2394,13 +2370,13 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
ret = -EFAULT;
max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
if (unlikely(offset >= max_off))
- goto out_release_uncharge_unlock;
+ goto out_release_unlock;
ret = -EEXIST;
if (!pte_none(*dst_pte))
- goto out_release_uncharge_unlock;
+ goto out_release_unlock;
- lru_cache_add_anon(page);
+ lru_cache_add(page);
spin_lock_irq(&info->lock);
info->alloced++;
@@ -2419,12 +2395,10 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
ret = 0;
out:
return ret;
-out_release_uncharge_unlock:
+out_release_unlock:
pte_unmap_unlock(dst_pte, ptl);
ClearPageDirty(page);
delete_from_page_cache(page);
-out_release_uncharge:
- mem_cgroup_cancel_charge(page, memcg, false);
out_release:
unlock_page(page);
put_page(page);
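
Note: the shmem.c hunks above fold memcg charging into shmem_add_to_page_cache() (which now receives the charging mm), collapse the callers' unwind code into a single error label, and switch the NR_FILE_PAGES/NR_SHMEM updates to the lruvec-aware helpers. Below is a minimal userspace sketch of that "prepare, charge, insert, unwind once" shape; struct fake_page, fake_charge() and fake_insert() are hypothetical stand-ins, not kernel code.

#include <stdio.h>

struct fake_page { void *mapping; int refs; };

static int fake_charge(struct fake_page *p) { (void)p; return 0; }
static int fake_insert(struct fake_page *p) { (void)p; return -12; /* force an error for the demo */ }

static int add_to_cache(struct fake_page *page, void *mapping, int nr)
{
	int error;

	page->mapping = mapping;	/* preparatory state, like page->mapping/index */
	page->refs += nr;		/* like the page_ref_add() the real code does  */

	error = fake_charge(page);	/* charge first ...            */
	if (error)
		goto error;

	error = fake_insert(page);	/* ... then insert into the cache */
	if (error)
		goto error;

	return 0;
error:
	/* a single unwind path undoes both preparatory steps */
	page->mapping = NULL;
	page->refs -= nr;
	return error;
}

int main(void)
{
	struct fake_page page = { .mapping = NULL, .refs = 0 };
	int err = add_to_cache(&page, &page, 1);

	printf("error=%d refs-after-unwind=%d\n", err, page.refs);
	return 0;
}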
diff --git a/mm/slab.c b/mm/slab.c
index a89633603b2d..9350062ffc1a 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3106,7 +3106,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
struct zonelist *zonelist;
struct zoneref *z;
struct zone *zone;
- enum zone_type high_zoneidx = gfp_zone(flags);
+ enum zone_type highest_zoneidx = gfp_zone(flags);
void *obj = NULL;
struct page *page;
int nid;
@@ -3124,7 +3124,7 @@ retry:
* Look through allowed nodes for objects available
* from existing per node queues.
*/
- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+ for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
nid = zone_to_nid(zone);
if (cpuset_zone_allowed(zone, flags) &&
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 23c7500eea7d..9e72ba224175 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1303,7 +1303,8 @@ void __init create_kmalloc_caches(slab_flags_t flags)
kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache(
kmalloc_info[i].name[KMALLOC_DMA],
kmalloc_info[i].size,
- SLAB_CACHE_DMA | flags, 0, 0);
+ SLAB_CACHE_DMA | flags, 0,
+ kmalloc_info[i].size);
}
}
#endif
diff --git a/mm/slub.c b/mm/slub.c
index fdedd7e93cd2..03e063cd979f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -679,6 +679,20 @@ static void slab_fix(struct kmem_cache *s, char *fmt, ...)
va_end(args);
}
+static bool freelist_corrupted(struct kmem_cache *s, struct page *page,
+ void *freelist, void *nextfree)
+{
+ if ((s->flags & SLAB_CONSISTENCY_CHECKS) &&
+ !check_valid_pointer(s, page, nextfree)) {
+ object_err(s, page, freelist, "Freechain corrupt");
+ freelist = NULL;
+ slab_fix(s, "Isolate corrupted freechain");
+ return true;
+ }
+
+ return false;
+}
+
static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
{
unsigned int off; /* Offset of last byte */
@@ -1410,6 +1424,11 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node,
static inline void dec_slabs_node(struct kmem_cache *s, int node,
int objects) {}
+static bool freelist_corrupted(struct kmem_cache *s, struct page *page,
+ void *freelist, void *nextfree)
+{
+ return false;
+}
#endif /* CONFIG_SLUB_DEBUG */
/*
@@ -1919,7 +1938,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
struct zonelist *zonelist;
struct zoneref *z;
struct zone *zone;
- enum zone_type high_zoneidx = gfp_zone(flags);
+ enum zone_type highest_zoneidx = gfp_zone(flags);
void *object;
unsigned int cpuset_mems_cookie;
@@ -1948,7 +1967,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
do {
cpuset_mems_cookie = read_mems_allowed_begin();
zonelist = node_zonelist(mempolicy_slab_node(), flags);
- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+ for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
struct kmem_cache_node *n;
n = get_node(s, zone_to_nid(zone));
@@ -1994,7 +2013,7 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
#ifdef CONFIG_PREEMPTION
/*
- * Calculate the next globally unique transaction for disambiguiation
+ * Calculate the next globally unique transaction for disambiguation
* during cmpxchg. The transactions start with the cpu number and are then
* incremented by CONFIG_NR_CPUS.
*/
@@ -2093,6 +2112,14 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
void *prior;
unsigned long counters;
+ /*
+ * If 'nextfree' is invalid, it is possible that the object at
+ * 'freelist' is already corrupted. So isolate all objects
+ * starting at 'freelist'.
+ */
+ if (freelist_corrupted(s, page, freelist, nextfree))
+ break;
+
do {
prior = page->freelist;
counters = page->counters;
@@ -3739,12 +3766,14 @@ error:
}
static void list_slab_objects(struct kmem_cache *s, struct page *page,
- const char *text)
+ const char *text, unsigned long *map)
{
#ifdef CONFIG_SLUB_DEBUG
void *addr = page_address(page);
void *p;
- unsigned long *map;
+
+ if (!map)
+ return;
slab_err(s, page, text, s->name);
slab_lock(page);
@@ -3757,8 +3786,6 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
print_tracking(s, p);
}
}
- put_map(map);
-
slab_unlock(page);
#endif
}
@@ -3772,6 +3799,11 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
{
LIST_HEAD(discard);
struct page *page, *h;
+ unsigned long *map = NULL;
+
+#ifdef CONFIG_SLUB_DEBUG
+ map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);
+#endif
BUG_ON(irqs_disabled());
spin_lock_irq(&n->list_lock);
@@ -3781,11 +3813,16 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
list_add(&page->slab_list, &discard);
} else {
list_slab_objects(s, page,
- "Objects remaining in %s on __kmem_cache_shutdown()");
+ "Objects remaining in %s on __kmem_cache_shutdown()",
+ map);
}
}
spin_unlock_irq(&n->list_lock);
+#ifdef CONFIG_SLUB_DEBUG
+ bitmap_free(map);
+#endif
+
list_for_each_entry_safe(page, h, &discard, slab_list)
discard_slab(s, page);
}
@@ -5656,7 +5693,8 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
*/
if (buffer)
buf = buffer;
- else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf))
+ else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf) &&
+ !IS_ENABLED(CONFIG_SLUB_STATS))
buf = mbuf;
else {
buffer = (char *) get_zeroed_page(GFP_KERNEL);
@@ -5690,19 +5728,6 @@ static struct kobj_type slab_ktype = {
.release = kmem_cache_release,
};
-static int uevent_filter(struct kset *kset, struct kobject *kobj)
-{
- struct kobj_type *ktype = get_ktype(kobj);
-
- if (ktype == &slab_ktype)
- return 1;
- return 0;
-}
-
-static const struct kset_uevent_ops slab_uevent_ops = {
- .filter = uevent_filter,
-};
-
static struct kset *slab_kset;
static inline struct kset *cache_kset(struct kmem_cache *s)
@@ -5770,7 +5795,6 @@ static void sysfs_slab_remove_workfn(struct work_struct *work)
#ifdef CONFIG_MEMCG
kset_unregister(s->memcg_kset);
#endif
- kobject_uevent(&s->kobj, KOBJ_REMOVE);
out:
kobject_put(&s->kobj);
}
@@ -5828,7 +5852,6 @@ static int sysfs_slab_add(struct kmem_cache *s)
}
#endif
- kobject_uevent(&s->kobj, KOBJ_ADD);
if (!unmergeable) {
/* Setup first alias */
sysfs_slab_alias(s, s->name);
@@ -5909,7 +5932,7 @@ static int __init slab_sysfs_init(void)
mutex_lock(&slab_mutex);
- slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
+ slab_kset = kset_create_and_add("slab", NULL, kernel_kobj);
if (!slab_kset) {
mutex_unlock(&slab_mutex);
pr_err("Cannot register slab subsystem.\n");
diff --git a/mm/sparse.c b/mm/sparse.c
index 1aee5a481571..6284328cd9f2 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -288,7 +288,7 @@ void __init memory_present(int nid, unsigned long start, unsigned long end)
/*
* Mark all memblocks as present using memory_present(). This is a
- * convienence function that is useful for a number of arches
+ * convenience function that is useful for a number of arches
* to mark all of the systems memory as present during initialization.
*/
void __init memblocks_present(void)
diff --git a/mm/swap.c b/mm/swap.c
index bf9a79fed62d..acd88873f076 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -83,8 +83,6 @@ static void __put_single_page(struct page *page)
static void __put_compound_page(struct page *page)
{
- compound_page_dtor *dtor;
-
/*
* __page_cache_release() is supposed to be called for thp, not for
* hugetlb. This is because hugetlb page does never have PageLRU set
@@ -93,8 +91,7 @@ static void __put_compound_page(struct page *page)
*/
if (!PageHuge(page))
__page_cache_release(page);
- dtor = get_compound_page_dtor(page);
- (*dtor)(page);
+ destroy_compound_page(page);
}
void __put_page(struct page *page)
@@ -262,21 +259,47 @@ void rotate_reclaimable_page(struct page *page)
}
}
-static void update_page_reclaim_stat(struct lruvec *lruvec,
- int file, int rotated)
+void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
{
- struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
+ do {
+ unsigned long lrusize;
+
+ /* Record cost event */
+ if (file)
+ lruvec->file_cost += nr_pages;
+ else
+ lruvec->anon_cost += nr_pages;
+
+ /*
+ * Decay previous events
+ *
+ * Because workloads change over time (and to avoid
+ * overflow) we keep these statistics as a floating
+ * average, which ends up weighing recent refaults
+ * more than old ones.
+ */
+ lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) +
+ lruvec_page_state(lruvec, NR_ACTIVE_ANON) +
+ lruvec_page_state(lruvec, NR_INACTIVE_FILE) +
+ lruvec_page_state(lruvec, NR_ACTIVE_FILE);
+
+ if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) {
+ lruvec->file_cost /= 2;
+ lruvec->anon_cost /= 2;
+ }
+ } while ((lruvec = parent_lruvec(lruvec)));
+}
- reclaim_stat->recent_scanned[file]++;
- if (rotated)
- reclaim_stat->recent_rotated[file]++;
+void lru_note_cost_page(struct page *page)
+{
+ lru_note_cost(mem_cgroup_page_lruvec(page, page_pgdat(page)),
+ page_is_file_lru(page), hpage_nr_pages(page));
}
static void __activate_page(struct page *page, struct lruvec *lruvec,
void *arg)
{
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
- int file = page_is_file_lru(page);
int lru = page_lru_base_type(page);
del_page_from_lru_list(page, lruvec, lru);
@@ -286,7 +309,6 @@ static void __activate_page(struct page *page, struct lruvec *lruvec,
trace_mm_lru_activate(page);
__count_vm_event(PGACTIVATE);
- update_page_reclaim_stat(lruvec, file, 1);
}
}
@@ -402,35 +424,6 @@ void mark_page_accessed(struct page *page)
}
EXPORT_SYMBOL(mark_page_accessed);
-static void __lru_cache_add(struct page *page)
-{
- struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
-
- get_page(page);
- if (!pagevec_add(pvec, page) || PageCompound(page))
- __pagevec_lru_add(pvec);
- put_cpu_var(lru_add_pvec);
-}
-
-/**
- * lru_cache_add_anon - add a page to the page lists
- * @page: the page to add
- */
-void lru_cache_add_anon(struct page *page)
-{
- if (PageActive(page))
- ClearPageActive(page);
- __lru_cache_add(page);
-}
-
-void lru_cache_add_file(struct page *page)
-{
- if (PageActive(page))
- ClearPageActive(page);
- __lru_cache_add(page);
-}
-EXPORT_SYMBOL(lru_cache_add_file);
-
/**
* lru_cache_add - add a page to a page list
* @page: the page to be added to the LRU.
@@ -442,10 +435,17 @@ EXPORT_SYMBOL(lru_cache_add_file);
*/
void lru_cache_add(struct page *page)
{
+ struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
+
VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
VM_BUG_ON_PAGE(PageLRU(page), page);
- __lru_cache_add(page);
+
+ get_page(page);
+ if (!pagevec_add(pvec, page) || PageCompound(page))
+ __pagevec_lru_add(pvec);
+ put_cpu_var(lru_add_pvec);
}
+EXPORT_SYMBOL(lru_cache_add);
/**
* lru_cache_add_active_or_unevictable
@@ -501,7 +501,7 @@ void lru_cache_add_active_or_unevictable(struct page *page,
static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
void *arg)
{
- int lru, file;
+ int lru;
bool active;
if (!PageLRU(page))
@@ -515,7 +515,6 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
return;
active = PageActive(page);
- file = page_is_file_lru(page);
lru = page_lru_base_type(page);
del_page_from_lru_list(page, lruvec, lru + active);
@@ -541,14 +540,12 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
if (active)
__count_vm_event(PGDEACTIVATE);
- update_page_reclaim_stat(lruvec, file, 0);
}
static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
void *arg)
{
if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
- int file = page_is_file_lru(page);
int lru = page_lru_base_type(page);
del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
@@ -557,7 +554,6 @@ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
add_page_to_lru_list(page, lruvec, lru);
__count_vm_events(PGDEACTIVATE, hpage_nr_pages(page));
- update_page_reclaim_stat(lruvec, file, 0);
}
}
@@ -582,7 +578,6 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
__count_vm_events(PGLAZYFREE, hpage_nr_pages(page));
count_memcg_page_event(page, PGLAZYFREE);
- update_page_reclaim_stat(lruvec, 1, 0);
}
}
@@ -890,8 +885,6 @@ EXPORT_SYMBOL(__pagevec_release);
void lru_add_page_tail(struct page *page, struct page *page_tail,
struct lruvec *lruvec, struct list_head *list)
{
- const int file = 0;
-
VM_BUG_ON_PAGE(!PageHead(page), page);
VM_BUG_ON_PAGE(PageCompound(page_tail), page);
VM_BUG_ON_PAGE(PageLRU(page_tail), page);
@@ -917,9 +910,6 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
add_page_to_lru_list_tail(page_tail, lruvec,
page_lru(page_tail));
}
-
- if (!PageUnevictable(page))
- update_page_reclaim_stat(lruvec, file, PageActive(page_tail));
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -962,8 +952,6 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
if (page_evictable(page)) {
lru = page_lru(page);
- update_page_reclaim_stat(lruvec, page_is_file_lru(page),
- PageActive(page));
if (was_unevictable)
count_vm_event(UNEVICTABLE_PGRESCUED);
} else {
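
Note: lru_note_cost() above replaces the old recent_scanned/recent_rotated bookkeeping with per-lruvec anon/file cost counters that are halved whenever their sum exceeds a quarter of the LRU size, so the averages float toward recent activity, and the accounting propagates up the lruvec hierarchy. A standalone sketch of that accumulate-and-decay step, with arbitrary demo numbers rather than real LRU sizes:

#include <stdbool.h>
#include <stdio.h>

struct costs { unsigned long anon, file; };

static void note_cost(struct costs *c, bool file, unsigned long nr_pages,
		      unsigned long lrusize)
{
	if (file)
		c->file += nr_pages;
	else
		c->anon += nr_pages;

	/* decay previous events so the average weighs recent refaults more */
	if (c->anon + c->file > lrusize / 4) {
		c->anon /= 2;
		c->file /= 2;
	}
}

int main(void)
{
	struct costs c = { 0, 0 };
	unsigned long lrusize = 1000;
	int i;

	for (i = 0; i < 10; i++)
		note_cost(&c, i % 2, 100, lrusize);	/* alternate anon/file cost events */

	printf("anon=%lu file=%lu\n", c.anon, c.file);
	return 0;
}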
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
index 45affaef3bc6..7f34343c075a 100644
--- a/mm/swap_cgroup.c
+++ b/mm/swap_cgroup.c
@@ -171,9 +171,6 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
unsigned long length;
struct swap_cgroup_ctrl *ctrl;
- if (!do_swap_account)
- return 0;
-
length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
array_size = length * sizeof(void *);
@@ -209,9 +206,6 @@ void swap_cgroup_swapoff(int type)
unsigned long i, length;
struct swap_cgroup_ctrl *ctrl;
- if (!do_swap_account)
- return;
-
mutex_lock(&swap_cgroup_mutex);
ctrl = &swap_cgroup_ctrl[type];
map = ctrl->map;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index ebed37bbf7a3..9d20b00627af 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -360,12 +360,13 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma, unsigned long addr,
bool *new_page_allocated)
{
- struct page *found_page = NULL, *new_page = NULL;
struct swap_info_struct *si;
- int err;
+ struct page *page;
+
*new_page_allocated = false;
- do {
+ for (;;) {
+ int err;
/*
* First check the swap cache. Since this is normally
* called after lookup_swap_cache() failed, re-calling
@@ -373,12 +374,12 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
*/
si = get_swap_device(entry);
if (!si)
- break;
- found_page = find_get_page(swap_address_space(entry),
- swp_offset(entry));
+ return NULL;
+ page = find_get_page(swap_address_space(entry),
+ swp_offset(entry));
put_swap_device(si);
- if (found_page)
- break;
+ if (page)
+ return page;
/*
* Just skip read ahead for unused swap slot.
@@ -389,54 +390,71 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* else swap_off will be aborted if we return NULL.
*/
if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
- break;
+ return NULL;
/*
- * Get a new page to read into from swap.
+ * Get a new page to read into from swap. Allocate it now,
+ * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will
+ * cause any racers to loop around until we add it to cache.
*/
- if (!new_page) {
- new_page = alloc_page_vma(gfp_mask, vma, addr);
- if (!new_page)
- break; /* Out of memory */
- }
+ page = alloc_page_vma(gfp_mask, vma, addr);
+ if (!page)
+ return NULL;
/*
* Swap entry may have been freed since our caller observed it.
*/
err = swapcache_prepare(entry);
- if (err == -EEXIST) {
- /*
- * We might race against get_swap_page() and stumble
- * across a SWAP_HAS_CACHE swap_map entry whose page
- * has not been brought into the swapcache yet.
- */
- cond_resched();
- continue;
- } else if (err) /* swp entry is obsolete ? */
+ if (!err)
break;
- /* May fail (-ENOMEM) if XArray node allocation failed. */
- __SetPageLocked(new_page);
- __SetPageSwapBacked(new_page);
- err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
- if (likely(!err)) {
- /* Initiate read into locked page */
- SetPageWorkingset(new_page);
- lru_cache_add_anon(new_page);
- *new_page_allocated = true;
- return new_page;
- }
- __ClearPageLocked(new_page);
+ put_page(page);
+ if (err != -EEXIST)
+ return NULL;
+
/*
- * add_to_swap_cache() doesn't return -EEXIST, so we can safely
- * clear SWAP_HAS_CACHE flag.
+ * We might race against __delete_from_swap_cache(), and
+ * stumble across a swap_map entry whose SWAP_HAS_CACHE
+ * has not yet been cleared. Or race against another
+ * __read_swap_cache_async(), which has set SWAP_HAS_CACHE
+ * in swap_map, but not yet added its page to swap cache.
*/
- put_swap_page(new_page, entry);
- } while (err != -ENOMEM);
+ cond_resched();
+ }
+
+ /*
+ * The swap entry is ours to swap in. Prepare the new page.
+ */
+
+ __SetPageLocked(page);
+ __SetPageSwapBacked(page);
+
+ /* May fail (-ENOMEM) if XArray node allocation failed. */
+ if (add_to_swap_cache(page, entry, gfp_mask & GFP_KERNEL)) {
+ put_swap_page(page, entry);
+ goto fail_unlock;
+ }
+
+ if (mem_cgroup_charge(page, NULL, gfp_mask)) {
+ delete_from_swap_cache(page);
+ goto fail_unlock;
+ }
+
+ /* XXX: Move to lru_cache_add() when it supports new vs putback */
+ spin_lock_irq(&page_pgdat(page)->lru_lock);
+ lru_note_cost_page(page);
+ spin_unlock_irq(&page_pgdat(page)->lru_lock);
+
+ /* Caller will initiate read into locked page */
+ SetPageWorkingset(page);
+ lru_cache_add(page);
+ *new_page_allocated = true;
+ return page;
- if (new_page)
- put_page(new_page);
- return found_page;
+fail_unlock:
+ unlock_page(page);
+ put_page(page);
+ return NULL;
}
/*
@@ -509,10 +527,11 @@ static unsigned long swapin_nr_pages(unsigned long offset)
return 1;
hits = atomic_xchg(&swapin_readahead_hits, 0);
- pages = __swapin_nr_pages(prev_offset, offset, hits, max_pages,
+ pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits,
+ max_pages,
atomic_read(&last_readahead_pages));
if (!hits)
- prev_offset = offset;
+ WRITE_ONCE(prev_offset, offset);
atomic_set(&last_readahead_pages, pages);
return pages;
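
Note: besides restructuring __read_swap_cache_async() so the page is allocated before swapcache_prepare() and -EEXIST racers simply retry, the swap_state.c hunks annotate the unlocked prev_offset accesses with READ_ONCE()/WRITE_ONCE(). The sketch below mimics those once-accessors in userspace with volatile casts; the readahead ramp in swapin_hint() is invented for the demo and is not the kernel's __swapin_nr_pages() heuristic.

#include <stdio.h>

#define READ_ONCE(x)		(*(volatile __typeof__(x) *)&(x))
#define WRITE_ONCE(x, val)	(*(volatile __typeof__(x) *)&(x) = (val))

static unsigned long prev_offset;

static unsigned long swapin_hint(unsigned long offset, int hits)
{
	unsigned long prev = READ_ONCE(prev_offset);	/* one stable, untorn read */
	unsigned long pages = (!hits && offset == prev + 1) ? 4 : 1;

	if (!hits)
		WRITE_ONCE(prev_offset, offset);	/* plain once-store, no lock held */
	return pages;
}

int main(void)
{
	printf("%lu\n", swapin_hint(5, 0));	/* not sequential: minimal readahead  */
	printf("%lu\n", swapin_hint(6, 0));	/* follows remembered offset: ramp up */
	return 0;
}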
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 5871a2aa86a5..0d4863f696a7 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -601,7 +601,6 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
{
struct percpu_cluster *cluster;
struct swap_cluster_info *ci;
- bool found_free;
unsigned long tmp, max;
new_cluster:
@@ -617,14 +616,13 @@ new_cluster:
* discarding, do discard now and reclaim them
*/
swap_do_scheduled_discard(si);
- *scan_base = *offset = si->cluster_next;
+ *scan_base = this_cpu_read(*si->cluster_next_cpu);
+ *offset = *scan_base;
goto new_cluster;
} else
return false;
}
- found_free = false;
-
/*
* Other CPUs can use our cluster if they can't find a free cluster,
* check if there is still free entry in the cluster
@@ -632,27 +630,23 @@ new_cluster:
tmp = cluster->next;
max = min_t(unsigned long, si->max,
(cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
- if (tmp >= max) {
- cluster_set_null(&cluster->index);
- goto new_cluster;
- }
- ci = lock_cluster(si, tmp);
- while (tmp < max) {
- if (!si->swap_map[tmp]) {
- found_free = true;
- break;
+ if (tmp < max) {
+ ci = lock_cluster(si, tmp);
+ while (tmp < max) {
+ if (!si->swap_map[tmp])
+ break;
+ tmp++;
}
- tmp++;
+ unlock_cluster(ci);
}
- unlock_cluster(ci);
- if (!found_free) {
+ if (tmp >= max) {
cluster_set_null(&cluster->index);
goto new_cluster;
}
cluster->next = tmp + 1;
*offset = tmp;
*scan_base = tmp;
- return found_free;
+ return true;
}
static void __del_from_avail_list(struct swap_info_struct *p)
@@ -729,6 +723,34 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
}
}
+static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
+{
+ unsigned long prev;
+
+ if (!(si->flags & SWP_SOLIDSTATE)) {
+ si->cluster_next = next;
+ return;
+ }
+
+ prev = this_cpu_read(*si->cluster_next_cpu);
+ /*
+ * Cross the swap address space size aligned trunk, choose
+ * another trunk randomly to avoid lock contention on swap
+ * address space if possible.
+ */
+ if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) !=
+ (next >> SWAP_ADDRESS_SPACE_SHIFT)) {
+ /* No free swap slots available */
+ if (si->highest_bit <= si->lowest_bit)
+ return;
+ next = si->lowest_bit +
+ prandom_u32_max(si->highest_bit - si->lowest_bit + 1);
+ next = ALIGN(next, SWAP_ADDRESS_SPACE_PAGES);
+ next = max_t(unsigned int, next, si->lowest_bit);
+ }
+ this_cpu_write(*si->cluster_next_cpu, next);
+}
+
static int scan_swap_map_slots(struct swap_info_struct *si,
unsigned char usage, int nr,
swp_entry_t slots[])
@@ -739,9 +761,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
unsigned long last_in_cluster = 0;
int latency_ration = LATENCY_LIMIT;
int n_ret = 0;
-
- if (nr > SWAP_BATCH)
- nr = SWAP_BATCH;
+ bool scanned_many = false;
/*
* We try to cluster swap pages by allocating them sequentially
@@ -755,17 +775,22 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
*/
si->flags += SWP_SCANNING;
- scan_base = offset = si->cluster_next;
+ /*
+ * Use percpu scan base for SSD to reduce lock contention on
+ * cluster and swap cache. For HDD, sequential access is more
+ * important.
+ */
+ if (si->flags & SWP_SOLIDSTATE)
+ scan_base = this_cpu_read(*si->cluster_next_cpu);
+ else
+ scan_base = si->cluster_next;
+ offset = scan_base;
/* SSD algorithm */
if (si->cluster_info) {
- if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
- goto checks;
- else
+ if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
goto scan;
- }
-
- if (unlikely(!si->cluster_nr--)) {
+ } else if (unlikely(!si->cluster_nr--)) {
if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
si->cluster_nr = SWAPFILE_CLUSTER - 1;
goto checks;
@@ -848,7 +873,6 @@ checks:
unlock_cluster(ci);
swap_range_alloc(si, offset, 1);
- si->cluster_next = offset + 1;
slots[n_ret++] = swp_entry(si->type, offset);
/* got enough slots or reach max slots? */
@@ -871,19 +895,33 @@ checks:
if (si->cluster_info) {
if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
goto checks;
- else
- goto done;
- }
- /* non-ssd case */
- ++offset;
-
- /* non-ssd case, still more slots in cluster? */
- if (si->cluster_nr && !si->swap_map[offset]) {
+ } else if (si->cluster_nr && !si->swap_map[++offset]) {
+ /* non-ssd case, still more slots in cluster? */
--si->cluster_nr;
goto checks;
}
+ /*
+ * Even if there's no free clusters available (fragmented),
+ * try to scan a little more quickly with lock held unless we
+ * have scanned too many slots already.
+ */
+ if (!scanned_many) {
+ unsigned long scan_limit;
+
+ if (offset < scan_base)
+ scan_limit = scan_base;
+ else
+ scan_limit = si->highest_bit;
+ for (; offset <= scan_limit && --latency_ration > 0;
+ offset++) {
+ if (!si->swap_map[offset])
+ goto checks;
+ }
+ }
+
done:
+ set_cluster_next(si, offset + 1);
si->flags -= SWP_SCANNING;
return n_ret;
@@ -901,6 +939,7 @@ scan:
if (unlikely(--latency_ration < 0)) {
cond_resched();
latency_ration = LATENCY_LIMIT;
+ scanned_many = true;
}
}
offset = si->lowest_bit;
@@ -916,6 +955,7 @@ scan:
if (unlikely(--latency_ration < 0)) {
cond_resched();
latency_ration = LATENCY_LIMIT;
+ scanned_many = true;
}
offset++;
}
@@ -1004,11 +1044,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
if (avail_pgs <= 0)
goto noswap;
- if (n_goal > SWAP_BATCH)
- n_goal = SWAP_BATCH;
-
- if (n_goal > avail_pgs)
- n_goal = avail_pgs;
+ n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs);
atomic_long_sub(n_goal * size, &nr_swap_pages);
@@ -1275,13 +1311,14 @@ unlock_out:
}
static unsigned char __swap_entry_free(struct swap_info_struct *p,
- swp_entry_t entry, unsigned char usage)
+ swp_entry_t entry)
{
struct swap_cluster_info *ci;
unsigned long offset = swp_offset(entry);
+ unsigned char usage;
ci = lock_cluster_or_swap_info(p, offset);
- usage = __swap_entry_free_locked(p, offset, usage);
+ usage = __swap_entry_free_locked(p, offset, 1);
unlock_cluster_or_swap_info(p, ci);
if (!usage)
free_swap_slot(entry);
@@ -1316,7 +1353,7 @@ void swap_free(swp_entry_t entry)
p = _swap_info_get(entry);
if (p)
- __swap_entry_free(p, entry, 1);
+ __swap_entry_free(p, entry);
}
/*
@@ -1739,7 +1776,7 @@ int free_swap_and_cache(swp_entry_t entry)
p = _swap_info_get(entry);
if (p) {
- count = __swap_entry_free(p, entry, 1);
+ count = __swap_entry_free(p, entry);
if (count == SWAP_HAS_CACHE &&
!swap_page_trans_huge_swapped(p, entry))
__try_to_reclaim_swap(p, swp_offset(entry),
@@ -1854,7 +1891,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, swp_entry_t entry, struct page *page)
{
struct page *swapcache;
- struct mem_cgroup *memcg;
spinlock_t *ptl;
pte_t *pte;
int ret = 1;
@@ -1864,15 +1900,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
if (unlikely(!page))
return -ENOMEM;
- if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
- &memcg, false)) {
- ret = -ENOMEM;
- goto out_nolock;
- }
-
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
- mem_cgroup_cancel_charge(page, memcg, false);
ret = 0;
goto out;
}
@@ -1884,10 +1913,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
pte_mkold(mk_pte(page, vma->vm_page_prot)));
if (page == swapcache) {
page_add_anon_rmap(page, vma, addr, false);
- mem_cgroup_commit_charge(page, memcg, true, false);
} else { /* ksm created a completely new copy */
page_add_new_anon_rmap(page, vma, addr, false);
- mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
}
swap_free(entry);
@@ -1898,7 +1925,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
activate_page(page);
out:
pte_unmap_unlock(pte, ptl);
-out_nolock:
if (page != swapcache) {
unlock_page(page);
put_page(page);
@@ -1937,10 +1963,14 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
pte_unmap(pte);
swap_map = &si->swap_map[offset];
- vmf.vma = vma;
- vmf.address = addr;
- vmf.pmd = pmd;
- page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf);
+ page = lookup_swap_cache(entry, vma, addr);
+ if (!page) {
+ vmf.vma = vma;
+ vmf.address = addr;
+ vmf.pmd = pmd;
+ page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
+ &vmf);
+ }
if (!page) {
if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
goto try_next;
@@ -2650,6 +2680,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
mutex_unlock(&swapon_mutex);
free_percpu(p->percpu_cluster);
p->percpu_cluster = NULL;
+ free_percpu(p->cluster_next_cpu);
+ p->cluster_next_cpu = NULL;
vfree(swap_map);
kvfree(cluster_info);
kvfree(frontswap_map);
@@ -2757,20 +2789,24 @@ static int swap_show(struct seq_file *swap, void *v)
struct swap_info_struct *si = v;
struct file *file;
int len;
+ unsigned int bytes, inuse;
if (si == SEQ_START_TOKEN) {
- seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
+ seq_puts(swap,"Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
return 0;
}
+ bytes = si->pages << (PAGE_SHIFT - 10);
+ inuse = si->inuse_pages << (PAGE_SHIFT - 10);
+
file = si->swap_file;
len = seq_file_path(swap, file, " \t\n\\");
- seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
+ seq_printf(swap, "%*s%s\t%u\t%s%u\t%s%d\n",
len < 40 ? 40 - len : 1, " ",
S_ISBLK(file_inode(file)->i_mode) ?
"partition" : "file\t",
- si->pages << (PAGE_SHIFT - 10),
- si->inuse_pages << (PAGE_SHIFT - 10),
+ bytes, bytes < 10000000 ? "\t" : "",
+ inuse, inuse < 10000000 ? "\t" : "",
si->prio);
return 0;
}
@@ -3202,11 +3238,19 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
unsigned long ci, nr_cluster;
p->flags |= SWP_SOLIDSTATE;
+ p->cluster_next_cpu = alloc_percpu(unsigned int);
+ if (!p->cluster_next_cpu) {
+ error = -ENOMEM;
+ goto bad_swap_unlock_inode;
+ }
/*
* select a random position to start with to help wear leveling
* SSD
*/
- p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
+ for_each_possible_cpu(cpu) {
+ per_cpu(*p->cluster_next_cpu, cpu) =
+ 1 + prandom_u32_max(p->highest_bit);
+ }
nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info),
@@ -3322,6 +3366,8 @@ bad_swap_unlock_inode:
bad_swap:
free_percpu(p->percpu_cluster);
p->percpu_cluster = NULL;
+ free_percpu(p->cluster_next_cpu);
+ p->cluster_next_cpu = NULL;
if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
set_blocksize(p->bdev, p->old_block_size);
blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
@@ -3654,7 +3700,7 @@ static bool swap_count_continued(struct swap_info_struct *si,
spin_lock(&si->cont_lock);
offset &= ~PAGE_MASK;
- page = list_entry(head->lru.next, struct page, lru);
+ page = list_next_entry(head, lru);
map = kmap_atomic(page) + offset;
if (count == SWAP_MAP_MAX) /* initial increment from swap_map */
@@ -3666,13 +3712,13 @@ static bool swap_count_continued(struct swap_info_struct *si,
*/
while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
kunmap_atomic(map);
- page = list_entry(page->lru.next, struct page, lru);
+ page = list_next_entry(page, lru);
BUG_ON(page == head);
map = kmap_atomic(page) + offset;
}
if (*map == SWAP_CONT_MAX) {
kunmap_atomic(map);
- page = list_entry(page->lru.next, struct page, lru);
+ page = list_next_entry(page, lru);
if (page == head) {
ret = false; /* add count continuation */
goto out;
@@ -3682,12 +3728,10 @@ init_map: *map = 0; /* we didn't zero the page */
}
*map += 1;
kunmap_atomic(map);
- page = list_entry(page->lru.prev, struct page, lru);
- while (page != head) {
+ while ((page = list_prev_entry(page, lru)) != head) {
map = kmap_atomic(page) + offset;
*map = COUNT_CONTINUED;
kunmap_atomic(map);
- page = list_entry(page->lru.prev, struct page, lru);
}
ret = true; /* incremented */
@@ -3698,7 +3742,7 @@ init_map: *map = 0; /* we didn't zero the page */
BUG_ON(count != COUNT_CONTINUED);
while (*map == COUNT_CONTINUED) {
kunmap_atomic(map);
- page = list_entry(page->lru.next, struct page, lru);
+ page = list_next_entry(page, lru);
BUG_ON(page == head);
map = kmap_atomic(page) + offset;
}
@@ -3707,13 +3751,11 @@ init_map: *map = 0; /* we didn't zero the page */
if (*map == 0)
count = 0;
kunmap_atomic(map);
- page = list_entry(page->lru.prev, struct page, lru);
- while (page != head) {
+ while ((page = list_prev_entry(page, lru)) != head) {
map = kmap_atomic(page) + offset;
*map = SWAP_CONT_MAX | count;
count = COUNT_CONTINUED;
kunmap_atomic(map);
- page = list_entry(page->lru.prev, struct page, lru);
}
ret = count == COUNT_CONTINUED;
}
@@ -3745,11 +3787,12 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
}
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
-void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
- gfp_t gfp_mask)
+void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
{
struct swap_info_struct *si, *next;
- if (!(gfp_mask & __GFP_IO) || !memcg)
+ int nid = page_to_nid(page);
+
+ if (!(gfp_mask & __GFP_IO))
return;
if (!blk_cgroup_congested())
@@ -3763,11 +3806,10 @@ void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
return;
spin_lock(&swap_avail_lock);
- plist_for_each_entry_safe(si, next, &swap_avail_heads[node],
- avail_lists[node]) {
+ plist_for_each_entry_safe(si, next, &swap_avail_heads[nid],
+ avail_lists[nid]) {
if (si->bdev) {
- blkcg_schedule_throttle(bdev_get_queue(si->bdev),
- true);
+ blkcg_schedule_throttle(bdev_get_queue(si->bdev), true);
break;
}
}
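
Note: for SSD swap, the swapfile.c changes above give each CPU its own cluster_next position and, in set_cluster_next(), hop to a random slot in another SWAP_ADDRESS_SPACE_PAGES-aligned trunk when a boundary is crossed, which reduces swap address-space lock contention and helps wear leveling. A userspace model of that trunk-hop arithmetic; the constants and the RNG are placeholders for the kernel's values:

#include <stdio.h>
#include <stdlib.h>

#define SWAP_ADDRESS_SPACE_SHIFT	14	/* 16384 slots per trunk (stand-in) */
#define SWAP_ADDRESS_SPACE_PAGES	(1UL << SWAP_ADDRESS_SPACE_SHIFT)
#define ALIGN_UP(x, a)			(((x) + (a) - 1) & ~((a) - 1))

static unsigned long pick_next(unsigned long prev, unsigned long next,
			       unsigned long lowest, unsigned long highest)
{
	if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) ==
	    (next >> SWAP_ADDRESS_SPACE_SHIFT))
		return next;			/* still in the same trunk: keep going */

	/* crossed a trunk boundary: restart at a random, trunk-aligned slot */
	next = lowest + (unsigned long)rand() % (highest - lowest + 1);
	next = ALIGN_UP(next, SWAP_ADDRESS_SPACE_PAGES);
	if (next < lowest)
		next = lowest;
	return next;
}

int main(void)
{
	unsigned long lowest = 1, highest = 10 * SWAP_ADDRESS_SPACE_PAGES;

	printf("same trunk: %lu\n", pick_next(100, 101, lowest, highest));
	printf("new trunk : %lu\n",
	       pick_next(SWAP_ADDRESS_SPACE_PAGES - 1, SWAP_ADDRESS_SPACE_PAGES,
			 lowest, highest));
	return 0;
}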
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 512576e171ce..7f5194046b01 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -56,7 +56,6 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
struct page **pagep,
bool wp_copy)
{
- struct mem_cgroup *memcg;
pte_t _dst_pte, *dst_pte;
spinlock_t *ptl;
void *page_kaddr;
@@ -97,7 +96,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
__SetPageUptodate(page);
ret = -ENOMEM;
- if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false))
+ if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL))
goto out_release;
_dst_pte = pte_mkdirty(mk_pte(page, dst_vma->vm_page_prot));
@@ -124,7 +123,6 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
inc_mm_counter(dst_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
- mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, dst_vma);
set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
@@ -138,7 +136,6 @@ out:
return ret;
out_release_uncharge_unlock:
pte_unmap_unlock(dst_pte, ptl);
- mem_cgroup_cancel_charge(page, memcg, false);
out_release:
put_page(page);
goto out;
diff --git a/mm/util.c b/mm/util.c
index 8defc8ec141f..db059539de24 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -580,7 +580,7 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
if (ret || size <= PAGE_SIZE)
return ret;
- return __vmalloc_node_flags_caller(size, node, flags,
+ return __vmalloc_node(size, 1, flags, node,
__builtin_return_address(0));
}
EXPORT_SYMBOL(kvmalloc_node);
@@ -604,6 +604,24 @@ void kvfree(const void *addr)
}
EXPORT_SYMBOL(kvfree);
+/**
+ * kvfree_sensitive - Free a data object containing sensitive information.
+ * @addr: address of the data object to be freed.
+ * @len: length of the data object.
+ *
+ * Use the special memzero_explicit() function to clear the content of a
+ * kvmalloc'ed object containing sensitive data to make sure that the
+ * compiler won't optimize out the data clearing.
+ */
+void kvfree_sensitive(const void *addr, size_t len)
+{
+ if (likely(!ZERO_OR_NULL_PTR(addr))) {
+ memzero_explicit((void *)addr, len);
+ kvfree(addr);
+ }
+}
+EXPORT_SYMBOL(kvfree_sensitive);
+
static inline void *__page_rmapping(struct page *page)
{
unsigned long mapping;
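
Note: kvfree_sensitive() above pairs memzero_explicit() with kvfree() so the compiler cannot drop the clearing of secret data as a dead store before the free. A userspace analogue of the same idea, assuming an empty asm with a "memory" clobber as the barrier (the kernel's memzero_explicit() is implemented differently):

#include <stdlib.h>
#include <string.h>

static void memzero_barrier(void *p, size_t len)
{
	memset(p, 0, len);
	/* the memory clobber keeps the memset from being optimized away
	 * as a dead store right before free() */
	__asm__ __volatile__("" : : "r"(p) : "memory");
}

static void free_sensitive(void *p, size_t len)
{
	if (p) {
		memzero_barrier(p, len);
		free(p);
	}
}

int main(void)
{
	char *key = malloc(32);

	if (!key)
		return 1;
	memset(key, 'K', 32);		/* pretend this held key material */
	free_sensitive(key, 32);	/* wiped before it returns to the heap */
	return 0;
}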
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 9a8227afa073..1e94497b7388 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -69,7 +69,8 @@ static void free_work(struct work_struct *w)
/*** Page table manipulation functions ***/
-static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
+static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ pgtbl_mod_mask *mask)
{
pte_t *pte;
@@ -78,73 +79,118 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
WARN_ON(!pte_none(ptent) && !pte_present(ptent));
} while (pte++, addr += PAGE_SIZE, addr != end);
+ *mask |= PGTBL_PTE_MODIFIED;
}
-static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
+static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+ pgtbl_mod_mask *mask)
{
pmd_t *pmd;
unsigned long next;
+ int cleared;
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (pmd_clear_huge(pmd))
+
+ cleared = pmd_clear_huge(pmd);
+ if (cleared || pmd_bad(*pmd))
+ *mask |= PGTBL_PMD_MODIFIED;
+
+ if (cleared)
continue;
if (pmd_none_or_clear_bad(pmd))
continue;
- vunmap_pte_range(pmd, addr, next);
+ vunmap_pte_range(pmd, addr, next, mask);
} while (pmd++, addr = next, addr != end);
}
-static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end)
+static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
+ pgtbl_mod_mask *mask)
{
pud_t *pud;
unsigned long next;
+ int cleared;
pud = pud_offset(p4d, addr);
do {
next = pud_addr_end(addr, end);
- if (pud_clear_huge(pud))
+
+ cleared = pud_clear_huge(pud);
+ if (cleared || pud_bad(*pud))
+ *mask |= PGTBL_PUD_MODIFIED;
+
+ if (cleared)
continue;
if (pud_none_or_clear_bad(pud))
continue;
- vunmap_pmd_range(pud, addr, next);
+ vunmap_pmd_range(pud, addr, next, mask);
} while (pud++, addr = next, addr != end);
}
-static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end)
+static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+ pgtbl_mod_mask *mask)
{
p4d_t *p4d;
unsigned long next;
+ int cleared;
p4d = p4d_offset(pgd, addr);
do {
next = p4d_addr_end(addr, end);
- if (p4d_clear_huge(p4d))
+
+ cleared = p4d_clear_huge(p4d);
+ if (cleared || p4d_bad(*p4d))
+ *mask |= PGTBL_P4D_MODIFIED;
+
+ if (cleared)
continue;
if (p4d_none_or_clear_bad(p4d))
continue;
- vunmap_pud_range(p4d, addr, next);
+ vunmap_pud_range(p4d, addr, next, mask);
} while (p4d++, addr = next, addr != end);
}
-static void vunmap_page_range(unsigned long addr, unsigned long end)
+/**
+ * unmap_kernel_range_noflush - unmap kernel VM area
+ * @start: start of the VM area to unmap
+ * @size: size of the VM area to unmap
+ *
+ * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size specify
+ * should have been allocated using get_vm_area() and its friends.
+ *
+ * NOTE:
+ * This function does NOT do any cache flushing. The caller is responsible
+ * for calling flush_cache_vunmap() on to-be-mapped areas before calling this
+ * function and flush_tlb_kernel_range() after.
+ */
+void unmap_kernel_range_noflush(unsigned long start, unsigned long size)
{
- pgd_t *pgd;
+ unsigned long end = start + size;
unsigned long next;
+ pgd_t *pgd;
+ unsigned long addr = start;
+ pgtbl_mod_mask mask = 0;
BUG_ON(addr >= end);
+ start = addr;
pgd = pgd_offset_k(addr);
do {
next = pgd_addr_end(addr, end);
+ if (pgd_bad(*pgd))
+ mask |= PGTBL_PGD_MODIFIED;
if (pgd_none_or_clear_bad(pgd))
continue;
- vunmap_p4d_range(pgd, addr, next);
+ vunmap_p4d_range(pgd, addr, next, &mask);
} while (pgd++, addr = next, addr != end);
+
+ if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
+ arch_sync_kernel_mappings(start, end);
}
static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
- unsigned long end, pgprot_t prot, struct page **pages, int *nr)
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr,
+ pgtbl_mod_mask *mask)
{
pte_t *pte;
@@ -153,7 +199,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
* callers keep track of where we're up to.
*/
- pte = pte_alloc_kernel(pmd, addr);
+ pte = pte_alloc_kernel_track(pmd, addr, mask);
if (!pte)
return -ENOMEM;
do {
@@ -166,94 +212,117 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
(*nr)++;
} while (pte++, addr += PAGE_SIZE, addr != end);
+ *mask |= PGTBL_PTE_MODIFIED;
return 0;
}
static int vmap_pmd_range(pud_t *pud, unsigned long addr,
- unsigned long end, pgprot_t prot, struct page **pages, int *nr)
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr,
+ pgtbl_mod_mask *mask)
{
pmd_t *pmd;
unsigned long next;
- pmd = pmd_alloc(&init_mm, pud, addr);
+ pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
if (!pmd)
return -ENOMEM;
do {
next = pmd_addr_end(addr, end);
- if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
+ if (vmap_pte_range(pmd, addr, next, prot, pages, nr, mask))
return -ENOMEM;
} while (pmd++, addr = next, addr != end);
return 0;
}
static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
- unsigned long end, pgprot_t prot, struct page **pages, int *nr)
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr,
+ pgtbl_mod_mask *mask)
{
pud_t *pud;
unsigned long next;
- pud = pud_alloc(&init_mm, p4d, addr);
+ pud = pud_alloc_track(&init_mm, p4d, addr, mask);
if (!pud)
return -ENOMEM;
do {
next = pud_addr_end(addr, end);
- if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
+ if (vmap_pmd_range(pud, addr, next, prot, pages, nr, mask))
return -ENOMEM;
} while (pud++, addr = next, addr != end);
return 0;
}
static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
- unsigned long end, pgprot_t prot, struct page **pages, int *nr)
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr,
+ pgtbl_mod_mask *mask)
{
p4d_t *p4d;
unsigned long next;
- p4d = p4d_alloc(&init_mm, pgd, addr);
+ p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
if (!p4d)
return -ENOMEM;
do {
next = p4d_addr_end(addr, end);
- if (vmap_pud_range(p4d, addr, next, prot, pages, nr))
+ if (vmap_pud_range(p4d, addr, next, prot, pages, nr, mask))
return -ENOMEM;
} while (p4d++, addr = next, addr != end);
return 0;
}
-/*
- * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
- * will have pfns corresponding to the "pages" array.
+/**
+ * map_kernel_range_noflush - map kernel VM area with the specified pages
+ * @addr: start of the VM area to map
+ * @size: size of the VM area to map
+ * @prot: page protection flags to use
+ * @pages: pages to map
*
- * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
+ * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size specify should
+ * have been allocated using get_vm_area() and its friends.
+ *
+ * NOTE:
+ * This function does NOT do any cache flushing. The caller is responsible for
+ * calling flush_cache_vmap() on to-be-mapped areas before calling this
+ * function.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
*/
-static int vmap_page_range_noflush(unsigned long start, unsigned long end,
- pgprot_t prot, struct page **pages)
+int map_kernel_range_noflush(unsigned long addr, unsigned long size,
+ pgprot_t prot, struct page **pages)
{
- pgd_t *pgd;
+ unsigned long start = addr;
+ unsigned long end = addr + size;
unsigned long next;
- unsigned long addr = start;
+ pgd_t *pgd;
int err = 0;
int nr = 0;
+ pgtbl_mod_mask mask = 0;
BUG_ON(addr >= end);
pgd = pgd_offset_k(addr);
do {
next = pgd_addr_end(addr, end);
- err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr);
+ if (pgd_bad(*pgd))
+ mask |= PGTBL_PGD_MODIFIED;
+ err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
if (err)
return err;
} while (pgd++, addr = next, addr != end);
- return nr;
+ if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
+ arch_sync_kernel_mappings(start, end);
+
+ return 0;
}
-static int vmap_page_range(unsigned long start, unsigned long end,
- pgprot_t prot, struct page **pages)
+int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
+ struct page **pages)
{
int ret;
- ret = vmap_page_range_noflush(start, end, prot, pages);
- flush_cache_vmap(start, end);
+ ret = map_kernel_range_noflush(start, size, prot, pages);
+ flush_cache_vmap(start, start + size);
return ret;
}
@@ -1223,14 +1292,6 @@ int unregister_vmap_purge_notifier(struct notifier_block *nb)
EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
/*
- * Clear the pagetable entries of a given vmap_area
- */
-static void unmap_vmap_area(struct vmap_area *va)
-{
- vunmap_page_range(va->va_start, va->va_end);
-}
-
-/*
* lazy_max_pages is the maximum amount of virtual address space we gather up
* before attempting to purge with a TLB flush.
*
@@ -1293,12 +1354,6 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
return false;
/*
- * First make sure the mappings are removed from all page-tables
- * before they are freed.
- */
- vmalloc_sync_unmappings();
-
- /*
* TODO: to calculate a flush range without looping.
* The list can be up to lazy_max_pages() elements.
*/
@@ -1391,7 +1446,7 @@ static void free_vmap_area_noflush(struct vmap_area *va)
static void free_unmap_vmap_area(struct vmap_area *va)
{
flush_cache_vunmap(va->va_start, va->va_end);
- unmap_vmap_area(va);
+ unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start);
if (debug_pagealloc_enabled_static())
flush_tlb_kernel_range(va->va_start, va->va_end);
@@ -1665,7 +1720,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
return vaddr;
}
-static void vb_free(const void *addr, unsigned long size)
+static void vb_free(unsigned long addr, unsigned long size)
{
unsigned long offset;
unsigned long vb_idx;
@@ -1675,24 +1730,22 @@ static void vb_free(const void *addr, unsigned long size)
BUG_ON(offset_in_page(size));
BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
- flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size);
+ flush_cache_vunmap(addr, addr + size);
order = get_order(size);
- offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
- offset >>= PAGE_SHIFT;
+ offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
- vb_idx = addr_to_vb_idx((unsigned long)addr);
+ vb_idx = addr_to_vb_idx(addr);
rcu_read_lock();
vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
rcu_read_unlock();
BUG_ON(!vb);
- vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
+ unmap_kernel_range_noflush(addr, size);
if (debug_pagealloc_enabled_static())
- flush_tlb_kernel_range((unsigned long)addr,
- (unsigned long)addr + size);
+ flush_tlb_kernel_range(addr, addr + size);
spin_lock(&vb->lock);
@@ -1792,7 +1845,7 @@ void vm_unmap_ram(const void *mem, unsigned int count)
if (likely(count <= VMAP_MAX_ALLOC)) {
debug_check_no_locks_freed(mem, size);
- vb_free(mem, size);
+ vb_free(addr, size);
return;
}
@@ -1819,7 +1872,7 @@ EXPORT_SYMBOL(vm_unmap_ram);
*
* Returns: a pointer to the address that has been mapped, or %NULL on failure
*/
-void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
+void *vm_map_ram(struct page **pages, unsigned int count, int node)
{
unsigned long size = (unsigned long)count << PAGE_SHIFT;
unsigned long addr;
@@ -1843,7 +1896,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro
kasan_unpoison_vmalloc(mem, size);
- if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
+ if (map_kernel_range(addr, size, PAGE_KERNEL, pages) < 0) {
vm_unmap_ram(mem, count);
return NULL;
}
@@ -1988,51 +2041,6 @@ void __init vmalloc_init(void)
}
/**
- * map_kernel_range_noflush - map kernel VM area with the specified pages
- * @addr: start of the VM area to map
- * @size: size of the VM area to map
- * @prot: page protection flags to use
- * @pages: pages to map
- *
- * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size
- * specify should have been allocated using get_vm_area() and its
- * friends.
- *
- * NOTE:
- * This function does NOT do any cache flushing. The caller is
- * responsible for calling flush_cache_vmap() on to-be-mapped areas
- * before calling this function.
- *
- * RETURNS:
- * The number of pages mapped on success, -errno on failure.
- */
-int map_kernel_range_noflush(unsigned long addr, unsigned long size,
- pgprot_t prot, struct page **pages)
-{
- return vmap_page_range_noflush(addr, addr + size, prot, pages);
-}
-
-/**
- * unmap_kernel_range_noflush - unmap kernel VM area
- * @addr: start of the VM area to unmap
- * @size: size of the VM area to unmap
- *
- * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size
- * specify should have been allocated using get_vm_area() and its
- * friends.
- *
- * NOTE:
- * This function does NOT do any cache flushing. The caller is
- * responsible for calling flush_cache_vunmap() on to-be-mapped areas
- * before calling this function and flush_tlb_kernel_range() after.
- */
-void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
-{
- vunmap_page_range(addr, addr + size);
-}
-EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush);
-
-/**
* unmap_kernel_range - unmap kernel VM area and flush cache and TLB
* @addr: start of the VM area to unmap
* @size: size of the VM area to unmap
@@ -2045,22 +2053,9 @@ void unmap_kernel_range(unsigned long addr, unsigned long size)
unsigned long end = addr + size;
flush_cache_vunmap(addr, end);
- vunmap_page_range(addr, end);
+ unmap_kernel_range_noflush(addr, size);
flush_tlb_kernel_range(addr, end);
}
-EXPORT_SYMBOL_GPL(unmap_kernel_range);
-
-int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages)
-{
- unsigned long addr = (unsigned long)area->addr;
- unsigned long end = addr + get_vm_area_size(area);
- int err;
-
- err = vmap_page_range(addr, end, prot, pages);
-
- return err > 0 ? 0 : err;
-}
-EXPORT_SYMBOL_GPL(map_vm_area);
static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
struct vmap_area *va, unsigned long flags, const void *caller)
@@ -2128,14 +2123,6 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
return area;
}
-struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
- unsigned long start, unsigned long end)
-{
- return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
- GFP_KERNEL, __builtin_return_address(0));
-}
-EXPORT_SYMBOL_GPL(__get_vm_area);
-
struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
unsigned long start, unsigned long end,
const void *caller)
@@ -2441,7 +2428,8 @@ void *vmap(struct page **pages, unsigned int count,
if (!area)
return NULL;
- if (map_vm_area(area, prot, pages)) {
+ if (map_kernel_range((unsigned long)area->addr, size, pgprot_nx(prot),
+ pages) < 0) {
vunmap(area->addr);
return NULL;
}
@@ -2450,9 +2438,6 @@ void *vmap(struct page **pages, unsigned int count,
}
EXPORT_SYMBOL(vmap);
-static void *__vmalloc_node(unsigned long size, unsigned long align,
- gfp_t gfp_mask, pgprot_t prot,
- int node, const void *caller);
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot, int node)
{
@@ -2470,7 +2455,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
- PAGE_KERNEL, node, area->caller);
+ node, area->caller);
} else {
pages = kmalloc_node(array_size, nested_gfp, node);
}
@@ -2504,8 +2489,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
}
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
- if (map_vm_area(area, prot, pages))
+ if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area),
+ prot, pages) < 0)
goto fail;
+
return area->addr;
fail:
@@ -2573,27 +2560,16 @@ fail:
return NULL;
}
-/*
- * This is only for performance analysis of vmalloc and stress purpose.
- * It is required by vmalloc test module, therefore do not use it other
- * than that.
- */
-#ifdef CONFIG_TEST_VMALLOC_MODULE
-EXPORT_SYMBOL_GPL(__vmalloc_node_range);
-#endif
-
/**
* __vmalloc_node - allocate virtually contiguous memory
* @size: allocation size
* @align: desired alignment
* @gfp_mask: flags for the page level allocator
- * @prot: protection mask for the allocated pages
* @node: node to use for allocation or NUMA_NO_NODE
* @caller: caller's return address
*
- * Allocate enough pages to cover @size from the page level
- * allocator with @gfp_mask flags. Map them into contiguous
- * kernel virtual space, using a pagetable protection of @prot.
+ * Allocate enough pages to cover @size from the page level allocator with
+ * @gfp_mask flags. Map them into contiguous kernel virtual space.
*
* Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
* and __GFP_NOFAIL are not supported
@@ -2603,35 +2579,28 @@ EXPORT_SYMBOL_GPL(__vmalloc_node_range);
*
* Return: pointer to the allocated memory or %NULL on error
*/
-static void *__vmalloc_node(unsigned long size, unsigned long align,
- gfp_t gfp_mask, pgprot_t prot,
- int node, const void *caller)
+void *__vmalloc_node(unsigned long size, unsigned long align,
+ gfp_t gfp_mask, int node, const void *caller)
{
return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
- gfp_mask, prot, 0, node, caller);
+ gfp_mask, PAGE_KERNEL, 0, node, caller);
}
+/*
+ * This is only for performance analysis of vmalloc and stress purpose.
+ * It is required by vmalloc test module, therefore do not use it other
+ * than that.
+ */
+#ifdef CONFIG_TEST_VMALLOC_MODULE
+EXPORT_SYMBOL_GPL(__vmalloc_node);
+#endif
-void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
+void *__vmalloc(unsigned long size, gfp_t gfp_mask)
{
- return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE,
+ return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE,
__builtin_return_address(0));
}
EXPORT_SYMBOL(__vmalloc);
-static inline void *__vmalloc_node_flags(unsigned long size,
- int node, gfp_t flags)
-{
- return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
- node, __builtin_return_address(0));
-}
-
-
-void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags,
- void *caller)
-{
- return __vmalloc_node(size, 1, flags, PAGE_KERNEL, node, caller);
-}
-
/**
* vmalloc - allocate virtually contiguous memory
* @size: allocation size
@@ -2646,8 +2615,8 @@ void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags,
*/
void *vmalloc(unsigned long size)
{
- return __vmalloc_node_flags(size, NUMA_NO_NODE,
- GFP_KERNEL);
+ return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc);
@@ -2666,8 +2635,8 @@ EXPORT_SYMBOL(vmalloc);
*/
void *vzalloc(unsigned long size)
{
- return __vmalloc_node_flags(size, NUMA_NO_NODE,
- GFP_KERNEL | __GFP_ZERO);
+ return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(vzalloc);
@@ -2704,8 +2673,8 @@ EXPORT_SYMBOL(vmalloc_user);
*/
void *vmalloc_node(unsigned long size, int node)
{
- return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL,
- node, __builtin_return_address(0));
+ return __vmalloc_node(size, 1, GFP_KERNEL, node,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_node);
@@ -2718,39 +2687,16 @@ EXPORT_SYMBOL(vmalloc_node);
* allocator and map them into contiguous kernel virtual space.
* The memory allocated is set to zero.
*
- * For tight control over page level allocator and protection flags
- * use __vmalloc_node() instead.
- *
* Return: pointer to the allocated memory or %NULL on error
*/
void *vzalloc_node(unsigned long size, int node)
{
- return __vmalloc_node_flags(size, node,
- GFP_KERNEL | __GFP_ZERO);
+ return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(vzalloc_node);
/**
- * vmalloc_user_node_flags - allocate memory for userspace on a specific node
- * @size: allocation size
- * @node: numa node
- * @flags: flags for the page level allocator
- *
- * The resulting memory area is zeroed so it can be mapped to userspace
- * without leaking data.
- *
- * Return: pointer to the allocated memory or %NULL on error
- */
-void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags)
-{
- return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
- flags | __GFP_ZERO, PAGE_KERNEL,
- VM_USERMAP, node,
- __builtin_return_address(0));
-}
-EXPORT_SYMBOL(vmalloc_user_node_flags);
-
-/**
* vmalloc_exec - allocate virtually contiguous, executable memory
* @size: allocation size
*
@@ -2793,8 +2739,8 @@ void *vmalloc_exec(unsigned long size)
*/
void *vmalloc_32(unsigned long size)
{
- return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL,
- NUMA_NO_NODE, __builtin_return_address(0));
+ return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_32);
@@ -3137,21 +3083,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
}
EXPORT_SYMBOL(remap_vmalloc_range);
-/*
- * Implement stubs for vmalloc_sync_[un]mappings () if the architecture chose
- * not to have one.
- *
- * The purpose of this function is to make sure the vmalloc area
- * mappings are identical in all page-tables in the system.
- */
-void __weak vmalloc_sync_mappings(void)
-{
-}
-
-void __weak vmalloc_sync_unmappings(void)
-{
-}
-
static int f(pte_t *pte, unsigned long addr, void *data)
{
pte_t ***p = data;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a37c87b5aee2..3a482b22fe4e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -79,6 +79,12 @@ struct scan_control {
*/
struct mem_cgroup *target_mem_cgroup;
+ /*
+ * Scan pressure balancing between anon and file LRUs
+ */
+ unsigned long anon_cost;
+ unsigned long file_cost;
+
/* Can active pages be deactivated as part of reclaim? */
#define DEACTIVATE_ANON 1
#define DEACTIVATE_FILE 2
@@ -161,7 +167,7 @@ struct scan_control {
#endif
/*
- * From 0 .. 100. Higher means more swappy.
+ * From 0 .. 200. Higher means more swappy.
*/
int vm_swappiness = 60;
/*
@@ -676,7 +682,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
freed += ret;
/*
* Bail out if someone want to register a new shrinker to
- * prevent the regsitration from being stalled for long periods
+ * prevent the registration from being stalled for long periods
* by parallel ongoing shrinking.
*/
if (rwsem_is_contended(&shrinker_rwsem)) {
@@ -1066,17 +1072,17 @@ static void page_check_dirty_writeback(struct page *page,
/*
* shrink_page_list() returns the number of reclaimed pages
*/
-static unsigned long shrink_page_list(struct list_head *page_list,
- struct pglist_data *pgdat,
- struct scan_control *sc,
- enum ttu_flags ttu_flags,
- struct reclaim_stat *stat,
- bool ignore_references)
+static unsigned int shrink_page_list(struct list_head *page_list,
+ struct pglist_data *pgdat,
+ struct scan_control *sc,
+ enum ttu_flags ttu_flags,
+ struct reclaim_stat *stat,
+ bool ignore_references)
{
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
- unsigned nr_reclaimed = 0;
- unsigned pgactivate = 0;
+ unsigned int nr_reclaimed = 0;
+ unsigned int pgactivate = 0;
memset(stat, 0, sizeof(*stat));
cond_resched();
@@ -1295,11 +1301,15 @@ static unsigned long shrink_page_list(struct list_head *page_list,
*/
if (page_mapped(page)) {
enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH;
+ bool was_swapbacked = PageSwapBacked(page);
if (unlikely(PageTransHuge(page)))
flags |= TTU_SPLIT_HUGE_PMD;
+
if (!try_to_unmap(page, flags)) {
stat->nr_unmap_fail += nr_pages;
+ if (!was_swapbacked && PageSwapBacked(page))
+ stat->nr_lazyfree_fail += nr_pages;
goto activate_locked;
}
}
@@ -1349,6 +1359,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
case PAGE_ACTIVATE:
goto activate_locked;
case PAGE_SUCCESS:
+ stat->nr_pageout += hpage_nr_pages(page);
+
if (PageWriteback(page))
goto keep;
if (PageDirty(page))
@@ -1438,7 +1450,7 @@ free_it:
* appear not as the counts should be low
*/
if (unlikely(PageTransHuge(page)))
- (*get_compound_page_dtor(page))(page);
+ destroy_compound_page(page);
else
list_add(&page->lru, &free_pages);
continue;
@@ -1483,7 +1495,7 @@ keep:
return nr_reclaimed;
}
-unsigned long reclaim_clean_pages_from_list(struct zone *zone,
+unsigned int reclaim_clean_pages_from_list(struct zone *zone,
struct list_head *page_list)
{
struct scan_control sc = {
@@ -1491,8 +1503,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
.priority = DEF_PRIORITY,
.may_unmap = 1,
};
- struct reclaim_stat dummy_stat;
- unsigned long ret;
+ struct reclaim_stat stat;
+ unsigned int nr_reclaimed;
struct page *page, *next;
LIST_HEAD(clean_pages);
@@ -1504,11 +1516,21 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
}
}
- ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
- TTU_IGNORE_ACCESS, &dummy_stat, true);
+ nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
+ TTU_IGNORE_ACCESS, &stat, true);
list_splice(&clean_pages, page_list);
- mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
- return ret;
+ mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -nr_reclaimed);
+ /*
+ * Since lazyfree pages are isolated from the file LRU from the beginning,
+ * they will rotate back to the anonymous LRU in the end if the discard
+ * fails, so the isolated counts will be mismatched.
+ * Compensate the isolated count for both LRU lists.
+ */
+ mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON,
+ stat.nr_lazyfree_fail);
+ mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
+ -stat.nr_lazyfree_fail);
+ return nr_reclaimed;
}
/*
@@ -1591,7 +1613,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
/*
* Update LRU sizes after isolating pages. The LRU size updates must
- * be complete before mem_cgroup_update_lru_size due to a santity check.
+ * be complete before mem_cgroup_update_lru_size due to a sanity check.
*/
static __always_inline void update_lru_sizes(struct lruvec *lruvec,
enum lru_list lru, unsigned long *nr_zone_taken)
@@ -1602,10 +1624,7 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
if (!nr_zone_taken[zid])
continue;
- __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
-#ifdef CONFIG_MEMCG
- mem_cgroup_update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
-#endif
+ update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
}
}
@@ -1859,7 +1878,7 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
if (unlikely(PageCompound(page))) {
spin_unlock_irq(&pgdat->lru_lock);
- (*get_compound_page_dtor(page))(page);
+ destroy_compound_page(page);
spin_lock_irq(&pgdat->lru_lock);
} else
list_add(&page->lru, &pages_to_free);
@@ -1899,13 +1918,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
{
LIST_HEAD(page_list);
unsigned long nr_scanned;
- unsigned long nr_reclaimed = 0;
+ unsigned int nr_reclaimed = 0;
unsigned long nr_taken;
struct reclaim_stat stat;
- int file = is_file_lru(lru);
+ bool file = is_file_lru(lru);
enum vm_event_item item;
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
bool stalled = false;
while (unlikely(too_many_isolated(pgdat, file, sc))) {
@@ -1929,12 +1947,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
&nr_scanned, sc, lru);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
- reclaim_stat->recent_scanned[file] += nr_taken;
-
item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_scanned);
__count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
+ __count_vm_events(PGSCAN_ANON + file, nr_scanned);
+
spin_unlock_irq(&pgdat->lru_lock);
if (nr_taken == 0)
@@ -1945,16 +1963,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
spin_lock_irq(&pgdat->lru_lock);
+ move_pages_to_lru(lruvec, &page_list);
+
+ __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
+ lru_note_cost(lruvec, file, stat.nr_pageout);
item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_reclaimed);
__count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
- reclaim_stat->recent_rotated[0] += stat.nr_activate[0];
- reclaim_stat->recent_rotated[1] += stat.nr_activate[1];
-
- move_pages_to_lru(lruvec, &page_list);
-
- __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
+ __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
spin_unlock_irq(&pgdat->lru_lock);
@@ -2001,7 +2018,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
LIST_HEAD(l_active);
LIST_HEAD(l_inactive);
struct page *page;
- struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
unsigned nr_deactivate, nr_activate;
unsigned nr_rotated = 0;
int file = is_file_lru(lru);
@@ -2015,7 +2031,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
&nr_scanned, sc, lru);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
- reclaim_stat->recent_scanned[file] += nr_taken;
__count_vm_events(PGREFILL, nr_scanned);
__count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
@@ -2042,7 +2057,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
if (page_referenced(page, 0, sc->target_mem_cgroup,
&vm_flags)) {
- nr_rotated += hpage_nr_pages(page);
/*
* Identify referenced, file-backed active pages and
* give them one more trip around the active list. So
@@ -2053,6 +2067,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
* so we ignore them here.
*/
if ((vm_flags & VM_EXEC) && page_is_file_lru(page)) {
+ nr_rotated += hpage_nr_pages(page);
list_add(&page->lru, &l_active);
continue;
}
@@ -2067,13 +2082,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
* Move pages back to the lru list.
*/
spin_lock_irq(&pgdat->lru_lock);
- /*
- * Count referenced pages from currently used mappings as rotated,
- * even though only some of them are actually re-activated. This
- * helps balance scan pressure between file and anonymous pages in
- * get_scan_count.
- */
- reclaim_stat->recent_rotated[file] += nr_rotated;
nr_activate = move_pages_to_lru(lruvec, &l_active);
nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
@@ -2095,7 +2103,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
unsigned long reclaim_pages(struct list_head *page_list)
{
int nid = NUMA_NO_NODE;
- unsigned long nr_reclaimed = 0;
+ unsigned int nr_reclaimed = 0;
LIST_HEAD(node_page_list);
struct reclaim_stat dummy_stat;
struct page *page;
@@ -2229,14 +2237,11 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
unsigned long *nr)
{
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ unsigned long anon_cost, file_cost, total_cost;
int swappiness = mem_cgroup_swappiness(memcg);
- struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
u64 fraction[2];
u64 denominator = 0; /* gcc */
- struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- unsigned long anon_prio, file_prio;
enum scan_balance scan_balance;
- unsigned long anon, file;
unsigned long ap, fp;
enum lru_list lru;
@@ -2286,57 +2291,35 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
}
scan_balance = SCAN_FRACT;
-
- /*
- * With swappiness at 100, anonymous and file have the same priority.
- * This scanning priority is essentially the inverse of IO cost.
- */
- anon_prio = swappiness;
- file_prio = 200 - anon_prio;
-
/*
- * OK, so we have swap space and a fair amount of page cache
- * pages. We use the recently rotated / recently scanned
- * ratios to determine how valuable each cache is.
+ * Calculate the pressure balance between anon and file pages.
+ *
+ * The amount of pressure we put on each LRU is inversely
+ * proportional to the cost of reclaiming each list, as
+ * determined by the share of pages that are refaulting, times
+ * the relative IO cost of bringing back a swapped out
+ * anonymous page vs reloading a filesystem page (swappiness).
*
- * Because workloads change over time (and to avoid overflow)
- * we keep these statistics as a floating average, which ends
- * up weighing recent references more than old ones.
+ * Although we limit that influence to ensure no list gets
+ * left behind completely: at least a third of the pressure is
+ * applied, before swappiness.
*
- * anon in [0], file in [1]
+ * With swappiness at 100, anon and file have equal IO cost.
*/
+ total_cost = sc->anon_cost + sc->file_cost;
+ anon_cost = total_cost + sc->anon_cost;
+ file_cost = total_cost + sc->file_cost;
+ total_cost = anon_cost + file_cost;
- anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +
- lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES);
- file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) +
- lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES);
+ ap = swappiness * (total_cost + 1);
+ ap /= anon_cost + 1;
- spin_lock_irq(&pgdat->lru_lock);
- if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
- reclaim_stat->recent_scanned[0] /= 2;
- reclaim_stat->recent_rotated[0] /= 2;
- }
-
- if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
- reclaim_stat->recent_scanned[1] /= 2;
- reclaim_stat->recent_rotated[1] /= 2;
- }
-
- /*
- * The amount of pressure on anon vs file pages is inversely
- * proportional to the fraction of recently scanned pages on
- * each list that were recently referenced and in active use.
- */
- ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
- ap /= reclaim_stat->recent_rotated[0] + 1;
-
- fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
- fp /= reclaim_stat->recent_rotated[1] + 1;
- spin_unlock_irq(&pgdat->lru_lock);
+ fp = (200 - swappiness) * (total_cost + 1);
+ fp /= file_cost + 1;
fraction[0] = ap;
fraction[1] = fp;
- denominator = ap + fp + 1;
+ denominator = ap + fp;
out:
for_each_evictable_lru(lru) {
int file = is_file_lru(lru);
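To see how the new cost terms turn into scan fractions, here is a small user-space sketch of the arithmetic above with invented cost numbers (not kernel code; sc_anon_cost and sc_file_cost stand in for sc->anon_cost and sc->file_cost):

	#include <stdio.h>

	int main(void)
	{
		/* invented sample values */
		unsigned long sc_anon_cost = 100, sc_file_cost = 300;
		unsigned long swappiness = 60;		/* default vm_swappiness */
		unsigned long anon_cost, file_cost, total_cost, ap, fp;

		/* add the combined cost as a baseline so neither list is starved */
		total_cost = sc_anon_cost + sc_file_cost;
		anon_cost = total_cost + sc_anon_cost;
		file_cost = total_cost + sc_file_cost;
		total_cost = anon_cost + file_cost;

		/* pressure is inversely proportional to cost, scaled by swappiness */
		ap = swappiness * (total_cost + 1) / (anon_cost + 1);
		fp = (200 - swappiness) * (total_cost + 1) / (file_cost + 1);

		printf("anon fraction %lu/%lu, file fraction %lu/%lu\n",
		       ap, ap + fp, fp, ap + fp);
		return 0;
	}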
@@ -2388,7 +2371,7 @@ out:
/*
* Minimally target SWAP_CLUSTER_MAX pages to keep
- * reclaim moving forwards, avoiding decremeting
+ * reclaim moving forwards, avoiding decrementing
* sc->priority further than desirable.
*/
scan = max(scan, SWAP_CLUSTER_MAX);
@@ -2566,7 +2549,7 @@ static bool in_reclaim_compaction(struct scan_control *sc)
* Reclaim/compaction is used for high-order allocation requests. It reclaims
* order-0 pages before compacting the zone. should_continue_reclaim() returns
* true if more pages should be reclaimed such that when the page allocator
- * calls try_to_compact_zone() that it will have enough free pages to succeed.
+ * calls try_to_compact_pages() that it will have enough free pages to succeed.
* It will give up earlier than that if there is difficulty reclaiming pages.
*/
static inline bool should_continue_reclaim(struct pglist_data *pgdat,
@@ -2697,6 +2680,14 @@ again:
nr_scanned = sc->nr_scanned;
/*
+ * Determine the scan balance between anon and file LRUs.
+ */
+ spin_lock_irq(&pgdat->lru_lock);
+ sc->anon_cost = target_lruvec->anon_cost;
+ sc->file_cost = target_lruvec->file_cost;
+ spin_unlock_irq(&pgdat->lru_lock);
+
+ /*
* Target desirable inactive:active list ratios for the anon
* and file LRU lists.
*/
@@ -3131,8 +3122,8 @@ static bool allow_direct_reclaim(pg_data_t *pgdat)
/* kswapd must be awake if processes are being throttled */
if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
- if (READ_ONCE(pgdat->kswapd_classzone_idx) > ZONE_NORMAL)
- WRITE_ONCE(pgdat->kswapd_classzone_idx, ZONE_NORMAL);
+ if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL)
+ WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL);
wake_up_interruptible(&pgdat->kswapd_wait);
}
@@ -3385,7 +3376,7 @@ static void age_active_anon(struct pglist_data *pgdat,
} while (memcg);
}
-static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx)
+static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx)
{
int i;
struct zone *zone;
@@ -3397,7 +3388,7 @@ static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx)
* start prematurely when there is no boosting and a lower
* zone is balanced.
*/
- for (i = classzone_idx; i >= 0; i--) {
+ for (i = highest_zoneidx; i >= 0; i--) {
zone = pgdat->node_zones + i;
if (!managed_zone(zone))
continue;
@@ -3411,9 +3402,9 @@ static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx)
/*
* Returns true if there is an eligible zone balanced for the request order
- * and classzone_idx
+ * and highest_zoneidx
*/
-static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
+static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
{
int i;
unsigned long mark = -1;
@@ -3423,19 +3414,19 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
* Check watermarks bottom-up as lower zones are more likely to
* meet watermarks.
*/
- for (i = 0; i <= classzone_idx; i++) {
+ for (i = 0; i <= highest_zoneidx; i++) {
zone = pgdat->node_zones + i;
if (!managed_zone(zone))
continue;
mark = high_wmark_pages(zone);
- if (zone_watermark_ok_safe(zone, order, mark, classzone_idx))
+ if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
return true;
}
/*
- * If a node has no populated zone within classzone_idx, it does not
+ * If a node has no populated zone within highest_zoneidx, it does not
* need balancing by definition. This can happen if a zone-restricted
* allocation tries to wake a remote kswapd.
*/
@@ -3461,7 +3452,8 @@ static void clear_pgdat_congested(pg_data_t *pgdat)
*
* Returns true if kswapd is ready to sleep
*/
-static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
+static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order,
+ int highest_zoneidx)
{
/*
* The throttled processes are normally woken up in balance_pgdat() as
@@ -3483,7 +3475,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
return true;
- if (pgdat_balanced(pgdat, order, classzone_idx)) {
+ if (pgdat_balanced(pgdat, order, highest_zoneidx)) {
clear_pgdat_congested(pgdat);
return true;
}
@@ -3547,7 +3539,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
* or lower is eligible for reclaim until at least one usable zone is
* balanced.
*/
-static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
+static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
{
int i;
unsigned long nr_soft_reclaimed;
@@ -3575,7 +3567,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
* stall or direct reclaim until kswapd is finished.
*/
nr_boost_reclaim = 0;
- for (i = 0; i <= classzone_idx; i++) {
+ for (i = 0; i <= highest_zoneidx; i++) {
zone = pgdat->node_zones + i;
if (!managed_zone(zone))
continue;
@@ -3593,7 +3585,7 @@ restart:
bool balanced;
bool ret;
- sc.reclaim_idx = classzone_idx;
+ sc.reclaim_idx = highest_zoneidx;
/*
* If the number of buffer_heads exceeds the maximum allowed
@@ -3623,7 +3615,7 @@ restart:
* on the grounds that the normal reclaim should be enough to
* re-evaluate if boosting is required when kswapd next wakes.
*/
- balanced = pgdat_balanced(pgdat, sc.order, classzone_idx);
+ balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx);
if (!balanced && nr_boost_reclaim) {
nr_boost_reclaim = 0;
goto restart;
@@ -3723,7 +3715,7 @@ out:
if (boosted) {
unsigned long flags;
- for (i = 0; i <= classzone_idx; i++) {
+ for (i = 0; i <= highest_zoneidx; i++) {
if (!zone_boosts[i])
continue;
@@ -3738,7 +3730,7 @@ out:
* As there is now likely space, wakeup kcompact to defragment
* pageblocks.
*/
- wakeup_kcompactd(pgdat, pageblock_order, classzone_idx);
+ wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx);
}
snapshot_refaults(NULL, pgdat);
@@ -3756,22 +3748,22 @@ out:
}
/*
- * The pgdat->kswapd_classzone_idx is used to pass the highest zone index to be
- * reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is not
- * a valid index then either kswapd runs for first time or kswapd couldn't sleep
- * after previous reclaim attempt (node is still unbalanced). In that case
- * return the zone index of the previous kswapd reclaim cycle.
+ * The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to
+ * be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES, which
+ * is not a valid index, then either kswapd is running for the first time or it
+ * couldn't sleep after the previous reclaim attempt (the node is still
+ * unbalanced). In that case return the zone index of the previous kswapd
+ * reclaim cycle.
*/
-static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat,
- enum zone_type prev_classzone_idx)
+static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat,
+ enum zone_type prev_highest_zoneidx)
{
- enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_classzone_idx);
+ enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
- return curr_idx == MAX_NR_ZONES ? prev_classzone_idx : curr_idx;
+ return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx;
}
static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
- unsigned int classzone_idx)
+ unsigned int highest_zoneidx)
{
long remaining = 0;
DEFINE_WAIT(wait);
@@ -3788,7 +3780,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
* eligible zone balanced that it's also unlikely that compaction will
* succeed.
*/
- if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
+ if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
/*
* Compaction records what page blocks it recently failed to
* isolate pages from and skips them in the future scanning.
@@ -3801,18 +3793,19 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
* We have freed the memory, now we should compact it to make
* allocation of the requested order possible.
*/
- wakeup_kcompactd(pgdat, alloc_order, classzone_idx);
+ wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx);
remaining = schedule_timeout(HZ/10);
/*
- * If woken prematurely then reset kswapd_classzone_idx and
+ * If woken prematurely then reset kswapd_highest_zoneidx and
* order. The values will either be from a wakeup request or
* the previous request that slept prematurely.
*/
if (remaining) {
- WRITE_ONCE(pgdat->kswapd_classzone_idx,
- kswapd_classzone_idx(pgdat, classzone_idx));
+ WRITE_ONCE(pgdat->kswapd_highest_zoneidx,
+ kswapd_highest_zoneidx(pgdat,
+ highest_zoneidx));
if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
@@ -3827,7 +3820,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
* go fully to sleep until explicitly woken up.
*/
if (!remaining &&
- prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
+ prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
/*
@@ -3869,7 +3862,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
static int kswapd(void *p)
{
unsigned int alloc_order, reclaim_order;
- unsigned int classzone_idx = MAX_NR_ZONES - 1;
+ unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
@@ -3893,22 +3886,24 @@ static int kswapd(void *p)
set_freezable();
WRITE_ONCE(pgdat->kswapd_order, 0);
- WRITE_ONCE(pgdat->kswapd_classzone_idx, MAX_NR_ZONES);
+ WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
for ( ; ; ) {
bool ret;
alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
- classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
+ highest_zoneidx = kswapd_highest_zoneidx(pgdat,
+ highest_zoneidx);
kswapd_try_sleep:
kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
- classzone_idx);
+ highest_zoneidx);
- /* Read the new order and classzone_idx */
+ /* Read the new order and highest_zoneidx */
alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
- classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
+ highest_zoneidx = kswapd_highest_zoneidx(pgdat,
+ highest_zoneidx);
WRITE_ONCE(pgdat->kswapd_order, 0);
- WRITE_ONCE(pgdat->kswapd_classzone_idx, MAX_NR_ZONES);
+ WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
ret = try_to_freeze();
if (kthread_should_stop())
@@ -3929,9 +3924,10 @@ kswapd_try_sleep:
* but kcompactd is woken to compact for the original
* request (alloc_order).
*/
- trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx,
+ trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
alloc_order);
- reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
+ reclaim_order = balance_pgdat(pgdat, alloc_order,
+ highest_zoneidx);
if (reclaim_order < alloc_order)
goto kswapd_try_sleep;
}
@@ -3949,7 +3945,7 @@ kswapd_try_sleep:
* needed.
*/
void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
- enum zone_type classzone_idx)
+ enum zone_type highest_zoneidx)
{
pg_data_t *pgdat;
enum zone_type curr_idx;
@@ -3961,10 +3957,10 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
return;
pgdat = zone->zone_pgdat;
- curr_idx = READ_ONCE(pgdat->kswapd_classzone_idx);
+ curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
- if (curr_idx == MAX_NR_ZONES || curr_idx < classzone_idx)
- WRITE_ONCE(pgdat->kswapd_classzone_idx, classzone_idx);
+ if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx)
+ WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);
if (READ_ONCE(pgdat->kswapd_order) < order)
WRITE_ONCE(pgdat->kswapd_order, order);
@@ -3974,8 +3970,8 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
/* Hopeless node, leave it to direct reclaim if possible */
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
- (pgdat_balanced(pgdat, order, classzone_idx) &&
- !pgdat_watermark_boosted(pgdat, classzone_idx))) {
+ (pgdat_balanced(pgdat, order, highest_zoneidx) &&
+ !pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
/*
* There may be plenty of free memory available, but it's too
* fragmented for high-order allocations. Wake up kcompactd
@@ -3984,11 +3980,11 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
* ratelimit its work.
*/
if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
- wakeup_kcompactd(pgdat, order, classzone_idx);
+ wakeup_kcompactd(pgdat, order, highest_zoneidx);
return;
}
- trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order,
+ trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order,
gfp_flags);
wake_up_interruptible(&pgdat->kswapd_wait);
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9a9a40ef4f55..ffa50284e95b 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1204,6 +1204,10 @@ const char * const vmstat_text[] = {
"pgscan_kswapd",
"pgscan_direct",
"pgscan_direct_throttle",
+ "pgscan_anon",
+ "pgscan_file",
+ "pgsteal_anon",
+ "pgsteal_file",
#ifdef CONFIG_NUMA
"zone_reclaim_failed",
@@ -1277,6 +1281,10 @@ const char * const vmstat_text[] = {
"thp_zero_page_alloc_failed",
"thp_swpout",
"thp_swpout_fallback",
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+ "thp_pmd_migration_success",
+ "thp_pmd_migration_failure",
+#endif
#endif
#ifdef CONFIG_MEMORY_BALLOON
"balloon_inflate",
@@ -1593,6 +1601,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
zone->present_pages,
zone_managed_pages(zone));
+ /* If unpopulated, no other information is useful */
+ if (!populated_zone(zone)) {
+ seq_putc(m, '\n');
+ return;
+ }
+
seq_printf(m,
"\n protection: (%ld",
zone->lowmem_reserve[0]);
@@ -1600,12 +1614,6 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
seq_putc(m, ')');
- /* If unpopulated, no other information is useful */
- if (!populated_zone(zone)) {
- seq_putc(m, '\n');
- return;
- }
-
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
seq_printf(m, "\n %-12s %lu", zone_stat_name(i),
zone_page_state(zone, i));
@@ -2058,24 +2066,14 @@ static int unusable_show(struct seq_file *m, void *arg)
return 0;
}
-static const struct seq_operations unusable_op = {
+static const struct seq_operations unusable_sops = {
.start = frag_start,
.next = frag_next,
.stop = frag_stop,
.show = unusable_show,
};
-static int unusable_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &unusable_op);
-}
-
-static const struct file_operations unusable_file_ops = {
- .open = unusable_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
+DEFINE_SEQ_ATTRIBUTE(unusable);
static void extfrag_show_print(struct seq_file *m,
pg_data_t *pgdat, struct zone *zone)
@@ -2110,24 +2108,14 @@ static int extfrag_show(struct seq_file *m, void *arg)
return 0;
}
-static const struct seq_operations extfrag_op = {
+static const struct seq_operations extfrag_sops = {
.start = frag_start,
.next = frag_next,
.stop = frag_stop,
.show = extfrag_show,
};
-static int extfrag_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &extfrag_op);
-}
-
-static const struct file_operations extfrag_file_ops = {
- .open = extfrag_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
+DEFINE_SEQ_ATTRIBUTE(extfrag);
static int __init extfrag_debug_init(void)
{
@@ -2136,10 +2124,10 @@ static int __init extfrag_debug_init(void)
extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL,
- &unusable_file_ops);
+ &unusable_fops);
debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL,
- &extfrag_file_ops);
+ &extfrag_fops);
return 0;
}
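DEFINE_SEQ_ATTRIBUTE() comes from <linux/seq_file.h> and generates the open helper and file_operations that the removed code spelled out by hand. A simplified sketch of what DEFINE_SEQ_ATTRIBUTE(unusable) expands to (the real macro additionally propagates inode->i_private into seq_file->private):

	static int unusable_open(struct inode *inode, struct file *file)
	{
		return seq_open(file, &unusable_sops);
	}

	static const struct file_operations unusable_fops = {
		.owner		= THIS_MODULE,
		.open		= unusable_open,
		.read		= seq_read,
		.llseek		= seq_lseek,
		.release	= seq_release,
	};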
diff --git a/mm/workingset.c b/mm/workingset.c
index 474186b76ced..d481ea452eeb 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -277,8 +277,8 @@ void workingset_refault(struct page *page, void *shadow)
struct mem_cgroup *eviction_memcg;
struct lruvec *eviction_lruvec;
unsigned long refault_distance;
+ unsigned long workingset_size;
struct pglist_data *pgdat;
- unsigned long active_file;
struct mem_cgroup *memcg;
unsigned long eviction;
struct lruvec *lruvec;
@@ -310,7 +310,6 @@ void workingset_refault(struct page *page, void *shadow)
goto out;
eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
refault = atomic_long_read(&eviction_lruvec->inactive_age);
- active_file = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
/*
* Calculate the refault distance
@@ -345,10 +344,18 @@ void workingset_refault(struct page *page, void *shadow)
/*
* Compare the distance to the existing workingset size. We
- * don't act on pages that couldn't stay resident even if all
- * the memory was available to the page cache.
+ * don't activate pages that couldn't stay resident even if
+ * all the memory was available to the page cache. Whether
+ * cache can compete with anon or not depends on having swap.
*/
- if (refault_distance > active_file)
+ workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
+ if (mem_cgroup_get_nr_swap_pages(memcg) > 0) {
+ workingset_size += lruvec_page_state(eviction_lruvec,
+ NR_INACTIVE_ANON);
+ workingset_size += lruvec_page_state(eviction_lruvec,
+ NR_ACTIVE_ANON);
+ }
+ if (refault_distance > workingset_size)
goto out;
SetPageActive(page);
@@ -358,6 +365,10 @@ void workingset_refault(struct page *page, void *shadow)
/* Page was active prior to eviction */
if (workingset) {
SetPageWorkingset(page);
+ /* XXX: Move to lru_cache_add() when it supports new vs putback */
+ spin_lock_irq(&page_pgdat(page)->lru_lock);
+ lru_note_cost_page(page);
+ spin_unlock_irq(&page_pgdat(page)->lru_lock);
inc_lruvec_state(lruvec, WORKINGSET_RESTORE);
}
out:
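A worked example with invented numbers may help: suppose the eviction lruvec has 20k active file pages, 50k inactive anon and 30k active anon pages, and a page refaults at distance 60k. The old check (distance > active_file) would have rejected it; with swap available the workingset size is 100k, so the page is now activated. As a standalone sketch:

	#include <stdbool.h>
	#include <stdio.h>

	int main(void)
	{
		/* invented page counts */
		unsigned long active_file = 20000;
		unsigned long inactive_anon = 50000, active_anon = 30000;
		unsigned long refault_distance = 60000;
		bool have_swap = true;
		unsigned long workingset_size = active_file;

		/* cache can only compete with anon if swap is available */
		if (have_swap)
			workingset_size += inactive_anon + active_anon;

		printf("activate refault: %s (distance %lu vs workingset %lu)\n",
		       refault_distance <= workingset_size ? "yes" : "no",
		       refault_distance, workingset_size);
		return 0;
	}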
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 8c3bb5e508b8..460b0feced26 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -43,6 +43,7 @@
#include <linux/spinlock.h>
#include <linux/zpool.h>
#include <linux/magic.h>
+#include <linux/kmemleak.h>
/*
* NCHUNKS_ORDER determines the internal allocation granularity, effectively
@@ -215,6 +216,8 @@ static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
(gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE)));
if (slots) {
+ /* It will be freed separately in free_handle(). */
+ kmemleak_not_leak(slots);
memset(slots->slot, 0, sizeof(slots->slot));
slots->pool = (unsigned long)pool;
rwlock_init(&slots->lock);
diff --git a/mm/zbud.c b/mm/zbud.c
index de5dd4ddaa82..bc93aa4e46fc 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -243,7 +243,7 @@ static struct zbud_header *init_zbud_page(struct page *page)
zhdr->last_chunks = 0;
INIT_LIST_HEAD(&zhdr->buddy);
INIT_LIST_HEAD(&zhdr->lru);
- zhdr->under_reclaim = 0;
+ zhdr->under_reclaim = false;
return zhdr;
}
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 2f836a2b993f..f6dc0673e62c 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -293,7 +293,7 @@ struct zspage {
};
struct mapping_area {
-#ifdef CONFIG_PGTABLE_MAPPING
+#ifdef CONFIG_ZSMALLOC_PGTABLE_MAPPING
struct vm_struct *vm; /* vm area for mapping object that span pages */
#else
char *vm_buf; /* copy buffer for objects that span pages */
@@ -1113,7 +1113,7 @@ static struct zspage *find_get_zspage(struct size_class *class)
return zspage;
}
-#ifdef CONFIG_PGTABLE_MAPPING
+#ifdef CONFIG_ZSMALLOC_PGTABLE_MAPPING
static inline int __zs_cpu_up(struct mapping_area *area)
{
/*
@@ -1138,7 +1138,9 @@ static inline void __zs_cpu_down(struct mapping_area *area)
static inline void *__zs_map_object(struct mapping_area *area,
struct page *pages[2], int off, int size)
{
- BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages));
+ unsigned long addr = (unsigned long)area->vm->addr;
+
+ BUG_ON(map_kernel_range(addr, PAGE_SIZE * 2, PAGE_KERNEL, pages) < 0);
area->vm_addr = area->vm->addr;
return area->vm_addr + off;
}
@@ -1151,7 +1153,7 @@ static inline void __zs_unmap_object(struct mapping_area *area,
unmap_kernel_range(addr, PAGE_SIZE * 2);
}
-#else /* CONFIG_PGTABLE_MAPPING */
+#else /* CONFIG_ZSMALLOC_PGTABLE_MAPPING */
static inline int __zs_cpu_up(struct mapping_area *area)
{
@@ -1233,7 +1235,7 @@ out:
pagefault_enable();
}
-#endif /* CONFIG_PGTABLE_MAPPING */
+#endif /* CONFIG_ZSMALLOC_PGTABLE_MAPPING */
static int zs_cpu_prepare(unsigned int cpu)
{
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 4778db5601b0..c83ffe912163 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1089,16 +1089,14 @@ static int do_replace(struct net *net, const void __user *user,
tmp.name[sizeof(tmp.name) - 1] = 0;
countersize = COUNTER_OFFSET(tmp.nentries) * nr_cpu_ids;
- newinfo = __vmalloc(sizeof(*newinfo) + countersize, GFP_KERNEL_ACCOUNT,
- PAGE_KERNEL);
+ newinfo = __vmalloc(sizeof(*newinfo) + countersize, GFP_KERNEL_ACCOUNT);
if (!newinfo)
return -ENOMEM;
if (countersize)
memset(newinfo->counters, 0, countersize);
- newinfo->entries = __vmalloc(tmp.entries_size, GFP_KERNEL_ACCOUNT,
- PAGE_KERNEL);
+ newinfo->entries = __vmalloc(tmp.entries_size, GFP_KERNEL_ACCOUNT);
if (!newinfo->entries) {
ret = -ENOMEM;
goto free_newinfo;
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index a0e97f6c1072..66f22e8aa529 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -190,8 +190,7 @@ EXPORT_SYMBOL(ceph_compare_options);
* kvmalloc() doesn't fall back to the vmalloc allocator unless flags are
* compatible with (a superset of) GFP_KERNEL. This is because while the
* actual pages are allocated with the specified flags, the page table pages
- * are always allocated with GFP_KERNEL. map_vm_area() doesn't even take
- * flags because GFP_KERNEL is hard-coded in {p4d,pud,pmd,pte}_alloc().
+ * are always allocated with GFP_KERNEL.
*
* ceph_kvmalloc() may be called with GFP_KERNEL, GFP_NOFS or GFP_NOIO.
*/
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index ed808439d304..dd750241958b 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -43,6 +43,8 @@ my $list_types = 0;
my $fix = 0;
my $fix_inplace = 0;
my $root;
+my $gitroot = $ENV{'GIT_DIR'};
+$gitroot = ".git" if !defined($gitroot);
my %debug;
my %camelcase = ();
my %use_type = ();
@@ -244,6 +246,8 @@ list_types(0) if ($list_types);
$fix = 1 if ($fix_inplace);
$check_orig = $check;
+die "$P: --git cannot be used with --file or --fix\n" if ($git && ($file || $fix));
+
my $exit = 0;
my $perl_version_ok = 1;
@@ -267,11 +271,11 @@ if ($color =~ /^[01]$/) {
} elsif ($color =~ /^auto$/i) {
$color = (-t STDOUT);
} else {
- die "Invalid color mode: $color\n";
+ die "$P: Invalid color mode: $color\n";
}
# skip TAB size 1 to avoid additional checks on $tabsize - 1
-die "Invalid TAB size: $tabsize\n" if ($tabsize < 2);
+die "$P: Invalid TAB size: $tabsize\n" if ($tabsize < 2);
sub hash_save_array_words {
my ($hashRef, $arrayRef) = @_;
@@ -897,7 +901,7 @@ sub is_maintained_obsolete {
sub is_SPDX_License_valid {
my ($license) = @_;
- return 1 if (!$tree || which("python") eq "" || !(-e "$root/scripts/spdxcheck.py") || !(-e "$root/.git"));
+ return 1 if (!$tree || which("python") eq "" || !(-e "$root/scripts/spdxcheck.py") || !(-e "$gitroot"));
my $root_path = abs_path($root);
my $status = `cd "$root_path"; echo "$license" | python scripts/spdxcheck.py -`;
@@ -915,7 +919,7 @@ sub seed_camelcase_includes {
$camelcase_seeded = 1;
- if (-e ".git") {
+ if (-e "$gitroot") {
my $git_last_include_commit = `${git_command} log --no-merges --pretty=format:"%h%n" -1 -- include`;
chomp $git_last_include_commit;
$camelcase_cache = ".checkpatch-camelcase.git.$git_last_include_commit";
@@ -943,7 +947,7 @@ sub seed_camelcase_includes {
return;
}
- if (-e ".git") {
+ if (-e "$gitroot") {
$files = `${git_command} ls-files "include/*.h"`;
@include_files = split('\n', $files);
}
@@ -966,7 +970,7 @@ sub seed_camelcase_includes {
sub git_commit_info {
my ($commit, $id, $desc) = @_;
- return ($id, $desc) if ((which("git") eq "") || !(-e ".git"));
+ return ($id, $desc) if ((which("git") eq "") || !(-e "$gitroot"));
my $output = `${git_command} log --no-color --format='%H %s' -1 $commit 2>&1`;
$output =~ s/^\s*//gm;
@@ -1005,7 +1009,7 @@ my $fixlinenr = -1;
# If input is git commits, extract all commits from the commit expressions.
# For example, HEAD-3 means we need check 'HEAD, HEAD~1, HEAD~2'.
-die "$P: No git repository found\n" if ($git && !-e ".git");
+die "$P: No git repository found\n" if ($git && !-e "$gitroot");
if ($git) {
my @commits = ();
@@ -1058,6 +1062,7 @@ for my $filename (@ARGV) {
while (<$FILE>) {
chomp;
push(@rawlines, $_);
+ $vname = qq("$1") if ($filename eq '-' && $_ =~ m/^Subject:\s+(.+)/i);
}
close($FILE);
@@ -1674,8 +1679,16 @@ sub ctx_statement_level {
sub ctx_locate_comment {
my ($first_line, $end_line) = @_;
+ # If c99 comment on the current line, or the line before or after
+ my ($current_comment) = ($rawlines[$end_line - 1] =~ m@^\+.*(//.*$)@);
+ return $current_comment if (defined $current_comment);
+ ($current_comment) = ($rawlines[$end_line - 2] =~ m@^[\+ ].*(//.*$)@);
+ return $current_comment if (defined $current_comment);
+ ($current_comment) = ($rawlines[$end_line] =~ m@^[\+ ].*(//.*$)@);
+ return $current_comment if (defined $current_comment);
+
# Catch a comment on the end of the line itself.
- my ($current_comment) = ($rawlines[$end_line - 1] =~ m@.*(/\*.*\*/)\s*(?:\\\s*)?$@);
+ ($current_comment) = ($rawlines[$end_line - 1] =~ m@.*(/\*.*\*/)\s*(?:\\\s*)?$@);
return $current_comment if (defined $current_comment);
# Look through the context and try and figure out if there is a
@@ -3060,14 +3073,43 @@ sub process {
#print "is_start<$is_start> is_end<$is_end> length<$length>\n";
}
-# check for MAINTAINERS entries that don't have the right form
- if ($realfile =~ /^MAINTAINERS$/ &&
- $rawline =~ /^\+[A-Z]:/ &&
- $rawline !~ /^\+[A-Z]:\t\S/) {
- if (WARN("MAINTAINERS_STYLE",
- "MAINTAINERS entries use one tab after TYPE:\n" . $herecurr) &&
- $fix) {
- $fixed[$fixlinenr] =~ s/^(\+[A-Z]):\s*/$1:\t/;
+# check MAINTAINERS entries
+ if ($realfile =~ /^MAINTAINERS$/) {
+# check MAINTAINERS entries for the right form
+ if ($rawline =~ /^\+[A-Z]:/ &&
+ $rawline !~ /^\+[A-Z]:\t\S/) {
+ if (WARN("MAINTAINERS_STYLE",
+ "MAINTAINERS entries use one tab after TYPE:\n" . $herecurr) &&
+ $fix) {
+ $fixed[$fixlinenr] =~ s/^(\+[A-Z]):\s*/$1:\t/;
+ }
+ }
+# check MAINTAINERS entries for the right ordering too
+ my $preferred_order = 'MRLSWQBCPTFXNK';
+ if ($rawline =~ /^\+[A-Z]:/ &&
+ $prevrawline =~ /^[\+ ][A-Z]:/) {
+ $rawline =~ /^\+([A-Z]):\s*(.*)/;
+ my $cur = $1;
+ my $curval = $2;
+ $prevrawline =~ /^[\+ ]([A-Z]):\s*(.*)/;
+ my $prev = $1;
+ my $prevval = $2;
+ my $curindex = index($preferred_order, $cur);
+ my $previndex = index($preferred_order, $prev);
+ if ($curindex < 0) {
+ WARN("MAINTAINERS_STYLE",
+ "Unknown MAINTAINERS entry type: '$cur'\n" . $herecurr);
+ } else {
+ if ($previndex >= 0 && $curindex < $previndex) {
+ WARN("MAINTAINERS_STYLE",
+ "Misordered MAINTAINERS entry - list '$cur:' before '$prev:'\n" . $hereprev);
+ } elsif ((($prev eq 'F' && $cur eq 'F') ||
+ ($prev eq 'X' && $cur eq 'X')) &&
+ ($prevval cmp $curval) > 0) {
+ WARN("MAINTAINERS_STYLE",
+ "Misordered MAINTAINERS entry - list file patterns in alphabetic order\n" . $hereprev);
+ }
+ }
}
}
diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl
index 6cbcd1a3e113..484d2fbf5921 100755
--- a/scripts/get_maintainer.pl
+++ b/scripts/get_maintainer.pl
@@ -19,6 +19,7 @@ my $V = '0.26';
use Getopt::Long qw(:config no_auto_abbrev);
use Cwd;
use File::Find;
+use File::Spec::Functions;
my $cur_path = fastgetcwd() . '/';
my $lk_path = "./";
@@ -57,7 +58,7 @@ my $status = 0;
my $letters = "";
my $keywords = 1;
my $sections = 0;
-my $file_emails = 0;
+my $email_file_emails = 0;
my $from_filename = 0;
my $pattern_depth = 0;
my $self_test = undef;
@@ -69,6 +70,12 @@ my $vcs_used = 0;
my $exit = 0;
+my @files = ();
+my @fixes = (); # If a patch description includes Fixes: lines
+my @range = ();
+my @keyword_tvi = ();
+my @file_emails = ();
+
my %commit_author_hash;
my %commit_signer_hash;
@@ -266,7 +273,7 @@ if (!GetOptions(
'pattern-depth=i' => \$pattern_depth,
'k|keywords!' => \$keywords,
'sections!' => \$sections,
- 'fe|file-emails!' => \$file_emails,
+ 'fe|file-emails!' => \$email_file_emails,
'f|file' => \$from_filename,
'find-maintainer-files' => \$find_maintainer_files,
'mpath|maintainer-path=s' => \$maintainer_path,
@@ -424,6 +431,22 @@ sub read_all_maintainer_files {
}
}
+sub maintainers_in_file {
+ my ($file) = @_;
+
+ return if ($file =~ m@\bMAINTAINERS$@);
+
+ if (-f $file && ($email_file_emails || $file =~ /\.yaml$/)) {
+ open(my $f, '<', $file)
+ or die "$P: Can't open $file: $!\n";
+ my $text = do { local($/) ; <$f> };
+ close($f);
+
+ my @poss_addr = $text =~ m$[A-Za-zÀ-ÿ\"\' \,\.\+-]*\s*[\,]*\s*[\(\<\{]{0,1}[A-Za-z0-9_\.\+-]+\@[A-Za-z0-9\.-]+\.[A-Za-z0-9]+[\)\>\}]{0,1}$g;
+ push(@file_emails, clean_file_emails(@poss_addr));
+ }
+}
+
#
# Read mail address map
#
@@ -504,18 +527,13 @@ sub read_mailmap {
## use the filenames on the command line or find the filenames in the patchfiles
-my @files = ();
-my @fixes = (); # If a patch description includes Fixes: lines
-my @range = ();
-my @keyword_tvi = ();
-my @file_emails = ();
-
if (!@ARGV) {
push(@ARGV, "&STDIN");
}
foreach my $file (@ARGV) {
if ($file ne "&STDIN") {
+ $file = canonpath($file);
##if $file is a directory and it lacks a trailing slash, add one
if ((-d $file)) {
$file =~ s@([^/])$@$1/@;
@@ -527,7 +545,7 @@ foreach my $file (@ARGV) {
$file =~ s/^\Q${cur_path}\E//; #strip any absolute path
$file =~ s/^\Q${lk_path}\E//; #or the path to the lk tree
push(@files, $file);
- if ($file ne "MAINTAINERS" && -f $file && ($keywords || $file_emails)) {
+ if ($file ne "MAINTAINERS" && -f $file && $keywords) {
open(my $f, '<', $file)
or die "$P: Can't open $file: $!\n";
my $text = do { local($/) ; <$f> };
@@ -539,10 +557,6 @@ foreach my $file (@ARGV) {
}
}
}
- if ($file_emails) {
- my @poss_addr = $text =~ m$[A-Za-zÀ-ÿ\"\' \,\.\+-]*\s*[\,]*\s*[\(\<\{]{0,1}[A-Za-z0-9_\.\+-]+\@[A-Za-z0-9\.-]+\.[A-Za-z0-9]+[\)\>\}]{0,1}$g;
- push(@file_emails, clean_file_emails(@poss_addr));
- }
}
} else {
my $file_cnt = @files;
@@ -923,6 +937,8 @@ sub get_maintainers {
print("\n");
}
}
+
+ maintainers_in_file($file);
}
if ($keywords) {
@@ -1835,7 +1851,7 @@ tm toggle maintainers
tg toggle git entries
tl toggle open list entries
ts toggle subscriber list entries
-f emails in file [$file_emails]
+f emails in file [$email_file_emails]
k keywords in file [$keywords]
r remove duplicates [$email_remove_duplicates]
p# pattern match depth [$pattern_depth]
@@ -1960,7 +1976,7 @@ EOT
bool_invert(\$email_git_all_signature_types);
$rerun = 1;
} elsif ($sel eq "f") {
- bool_invert(\$file_emails);
+ bool_invert(\$email_file_emails);
$rerun = 1;
} elsif ($sel eq "r") {
bool_invert(\$email_remove_duplicates);
diff --git a/security/keys/internal.h b/security/keys/internal.h
index 1fc17cb317a9..338a526cbfa5 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -378,15 +378,4 @@ static inline void key_check(const struct key *key)
#define key_check(key) do {} while(0)
#endif
-
-/*
- * Helper function to clear and free a kvmalloc'ed memory object.
- */
-static inline void __kvzfree(const void *addr, size_t len)
-{
- if (addr) {
- memset((void *)addr, 0, len);
- kvfree(addr);
- }
-}
#endif /* _INTERNAL_H */
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 6763ee45e04d..2c6fb90bccdf 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -144,10 +144,7 @@ SYSCALL_DEFINE5(add_key, const char __user *, _type,
key_ref_put(keyring_ref);
error3:
- if (payload) {
- memzero_explicit(payload, plen);
- kvfree(payload);
- }
+ kvfree_sensitive(payload, plen);
error2:
kfree(description);
error:
@@ -362,7 +359,7 @@ long keyctl_update_key(key_serial_t id,
key_ref_put(key_ref);
error2:
- __kvzfree(payload, plen);
+ kvfree_sensitive(payload, plen);
error:
return ret;
}
@@ -917,7 +914,7 @@ can_read_key:
*/
if (ret > key_data_len) {
if (unlikely(key_data))
- __kvzfree(key_data, key_data_len);
+ kvfree_sensitive(key_data, key_data_len);
key_data_len = ret;
continue; /* Allocate buffer */
}
@@ -926,7 +923,7 @@ can_read_key:
ret = -EFAULT;
break;
}
- __kvzfree(key_data, key_data_len);
+ kvfree_sensitive(key_data, key_data_len);
key_put_out:
key_put(key);
@@ -1230,10 +1227,7 @@ long keyctl_instantiate_key_common(key_serial_t id,
keyctl_change_reqkey_auth(NULL);
error2:
- if (payload) {
- memzero_explicit(payload, plen);
- kvfree(payload);
- }
+ kvfree_sensitive(payload, plen);
error:
return ret;
}
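kvfree_sensitive() is the common helper these call sites now share; it is essentially the open-coded pattern being removed (zero the buffer before freeing so key material does not linger), roughly:

	void kvfree_sensitive(const void *addr, size_t len)
	{
		if (likely(!ZERO_OR_NULL_PTR(addr))) {
			memzero_explicit((void *)addr, len);
			kvfree(addr);
		}
	}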
diff --git a/sound/core/memalloc.c b/sound/core/memalloc.c
index a83553fbedf0..bea46ed157a6 100644
--- a/sound/core/memalloc.c
+++ b/sound/core/memalloc.c
@@ -143,7 +143,7 @@ int snd_dma_alloc_pages(int type, struct device *device, size_t size,
break;
case SNDRV_DMA_TYPE_VMALLOC:
gfp = snd_mem_get_gfp_flags(device, GFP_KERNEL | __GFP_HIGHMEM);
- dmab->area = __vmalloc(size, gfp, PAGE_KERNEL);
+ dmab->area = __vmalloc(size, gfp);
dmab->addr = 0;
break;
#ifdef CONFIG_HAS_DMA
diff --git a/sound/core/pcm_memory.c b/sound/core/pcm_memory.c
index fcab37ea6641..860935e3aea4 100644
--- a/sound/core/pcm_memory.c
+++ b/sound/core/pcm_memory.c
@@ -460,7 +460,7 @@ int _snd_pcm_lib_alloc_vmalloc_buffer(struct snd_pcm_substream *substream,
return 0; /* already large enough */
vfree(runtime->dma_area);
}
- runtime->dma_area = __vmalloc(size, gfp_flags, PAGE_KERNEL);
+ runtime->dma_area = __vmalloc(size, gfp_flags);
if (!runtime->dma_area)
return -ENOMEM;
runtime->dma_bytes = size;
diff --git a/tools/testing/selftests/lib/config b/tools/testing/selftests/lib/config
index 14a77ea4a8da..b80ee3f6e265 100644
--- a/tools/testing/selftests/lib/config
+++ b/tools/testing/selftests/lib/config
@@ -2,3 +2,4 @@ CONFIG_TEST_PRINTF=m
CONFIG_TEST_BITMAP=m
CONFIG_PRIME_NUMBERS=m
CONFIG_TEST_STRSCPY=m
+CONFIG_TEST_BITOPS=m
diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore
index 4f1831e62ea5..849e8226395a 100644
--- a/tools/testing/selftests/vm/.gitignore
+++ b/tools/testing/selftests/vm/.gitignore
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
hugepage-mmap
hugepage-shm
+khugepaged
map_hugetlb
map_populate
thuge-gen
@@ -9,6 +10,7 @@ mlock2-tests
mremap_dontunmap
on-fault-limit
transhuge-stress
+protection_keys
userfaultfd
mlock-intersect-test
mlock-random-test
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index c6eb5305a0f6..bb378de60290 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -20,6 +20,31 @@ TEST_GEN_FILES += on-fault-limit
TEST_GEN_FILES += thuge-gen
TEST_GEN_FILES += transhuge-stress
TEST_GEN_FILES += userfaultfd
+TEST_GEN_FILES += khugepaged
+
+ifeq ($(ARCH),x86_64)
+CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_32bit_program.c -m32)
+CAN_BUILD_X86_64 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_64bit_program.c)
+CAN_BUILD_WITH_NOPIE := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_program.c -no-pie)
+
+TARGETS := protection_keys
+BINARIES_32 := $(TARGETS:%=%_32)
+BINARIES_64 := $(TARGETS:%=%_64)
+
+ifeq ($(CAN_BUILD_WITH_NOPIE),1)
+CFLAGS += -no-pie
+endif
+
+ifeq ($(CAN_BUILD_I386),1)
+TEST_GEN_FILES += $(BINARIES_32)
+endif
+
+ifeq ($(CAN_BUILD_X86_64),1)
+TEST_GEN_FILES += $(BINARIES_64)
+endif
+else
+TEST_GEN_FILES += protection_keys
+endif
ifneq (,$(filter $(MACHINE),arm64 ia64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sh64 sparc64 x86_64))
TEST_GEN_FILES += va_128TBswitch
@@ -34,6 +59,55 @@ TEST_FILES := test_vmalloc.sh
KSFT_KHDR_INSTALL := 1
include ../lib.mk
+ifeq ($(ARCH),x86_64)
+BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32))
+BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64))
+
+define gen-target-rule-32
+$(1) $(1)_32: $(OUTPUT)/$(1)_32
+.PHONY: $(1) $(1)_32
+endef
+
+define gen-target-rule-64
+$(1) $(1)_64: $(OUTPUT)/$(1)_64
+.PHONY: $(1) $(1)_64
+endef
+
+ifeq ($(CAN_BUILD_I386),1)
+$(BINARIES_32): CFLAGS += -m32
+$(BINARIES_32): LDLIBS += -lrt -ldl -lm
+$(BINARIES_32): %_32: %.c
+ $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@
+$(foreach t,$(TARGETS),$(eval $(call gen-target-rule-32,$(t))))
+endif
+
+ifeq ($(CAN_BUILD_X86_64),1)
+$(BINARIES_64): CFLAGS += -m64
+$(BINARIES_64): LDLIBS += -lrt -ldl
+$(BINARIES_64): %_64: %.c
+ $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@
+$(foreach t,$(TARGETS),$(eval $(call gen-target-rule-64,$(t))))
+endif
+
+# x86_64 users should be encouraged to install 32-bit libraries
+ifeq ($(CAN_BUILD_I386)$(CAN_BUILD_X86_64),01)
+all: warn_32bit_failure
+
+warn_32bit_failure:
+ @echo "Warning: you seem to have a broken 32-bit build" 2>&1; \
+ echo "environment. This will reduce test coverage of 64-bit" 2>&1; \
+ echo "kernels. If you are using a Debian-like distribution," 2>&1; \
+ echo "try:"; 2>&1; \
+ echo ""; \
+ echo " apt-get install gcc-multilib libc6-i386 libc6-dev-i386"; \
+ echo ""; \
+ echo "If you are using a Fedora-like distribution, try:"; \
+ echo ""; \
+ echo " yum install glibc-devel.*i686"; \
+ exit 0;
+endif
+endif
+
$(OUTPUT)/hmm-tests: LDLIBS += -lhugetlbfs -lpthread
$(OUTPUT)/userfaultfd: LDLIBS += -lpthread
diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c
new file mode 100644
index 000000000000..51b89cedd09d
--- /dev/null
+++ b/tools/testing/selftests/vm/khugepaged.c
@@ -0,0 +1,1035 @@
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <limits.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/mman.h>
+#include <sys/wait.h>
+
+#ifndef MADV_PAGEOUT
+#define MADV_PAGEOUT 21
+#endif
+
+#define BASE_ADDR ((void *)(1UL << 30))
+static unsigned long hpage_pmd_size;
+static unsigned long page_size;
+static int hpage_pmd_nr;
+
+#define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/"
+#define PID_SMAPS "/proc/self/smaps"
+
+enum thp_enabled {
+ THP_ALWAYS,
+ THP_MADVISE,
+ THP_NEVER,
+};
+
+static const char *thp_enabled_strings[] = {
+ "always",
+ "madvise",
+ "never",
+ NULL
+};
+
+enum thp_defrag {
+ THP_DEFRAG_ALWAYS,
+ THP_DEFRAG_DEFER,
+ THP_DEFRAG_DEFER_MADVISE,
+ THP_DEFRAG_MADVISE,
+ THP_DEFRAG_NEVER,
+};
+
+static const char *thp_defrag_strings[] = {
+ "always",
+ "defer",
+ "defer+madvise",
+ "madvise",
+ "never",
+ NULL
+};
+
+enum shmem_enabled {
+ SHMEM_ALWAYS,
+ SHMEM_WITHIN_SIZE,
+ SHMEM_ADVISE,
+ SHMEM_NEVER,
+ SHMEM_DENY,
+ SHMEM_FORCE,
+};
+
+static const char *shmem_enabled_strings[] = {
+ "always",
+ "within_size",
+ "advise",
+ "never",
+ "deny",
+ "force",
+ NULL
+};
+
+struct khugepaged_settings {
+ bool defrag;
+ unsigned int alloc_sleep_millisecs;
+ unsigned int scan_sleep_millisecs;
+ unsigned int max_ptes_none;
+ unsigned int max_ptes_swap;
+ unsigned int max_ptes_shared;
+ unsigned long pages_to_scan;
+};
+
+struct settings {
+ enum thp_enabled thp_enabled;
+ enum thp_defrag thp_defrag;
+ enum shmem_enabled shmem_enabled;
+ bool debug_cow;
+ bool use_zero_page;
+ struct khugepaged_settings khugepaged;
+};
+
+static struct settings default_settings = {
+ .thp_enabled = THP_MADVISE,
+ .thp_defrag = THP_DEFRAG_ALWAYS,
+ .shmem_enabled = SHMEM_NEVER,
+ .debug_cow = 0,
+ .use_zero_page = 0,
+ .khugepaged = {
+ .defrag = 1,
+ .alloc_sleep_millisecs = 10,
+ .scan_sleep_millisecs = 10,
+ },
+};
+
+static struct settings saved_settings;
+static bool skip_settings_restore;
+
+static int exit_status;
+
+static void success(const char *msg)
+{
+ printf(" \e[32m%s\e[0m\n", msg);
+}
+
+static void fail(const char *msg)
+{
+ printf(" \e[31m%s\e[0m\n", msg);
+ exit_status++;
+}
+
+static int read_file(const char *path, char *buf, size_t buflen)
+{
+ int fd;
+ ssize_t numread;
+
+ fd = open(path, O_RDONLY);
+ if (fd == -1)
+ return 0;
+
+ numread = read(fd, buf, buflen - 1);
+ if (numread < 1) {
+ close(fd);
+ return 0;
+ }
+
+ buf[numread] = '\0';
+ close(fd);
+
+ return (unsigned int) numread;
+}
+
+static int write_file(const char *path, const char *buf, size_t buflen)
+{
+ int fd;
+ ssize_t numwritten;
+
+ fd = open(path, O_WRONLY);
+ if (fd == -1)
+ return 0;
+
+ numwritten = write(fd, buf, buflen - 1);
+ close(fd);
+ if (numwritten < 1)
+ return 0;
+
+ return (unsigned int) numwritten;
+}
+
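+/*
+ * Parse a THP sysfs file that reports its setting as a bracketed choice,
+ * e.g. "always [madvise] never", and return the index of the selected
+ * value in the strings[] table.
+ */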
+static int read_string(const char *name, const char *strings[])
+{
+ char path[PATH_MAX];
+ char buf[256];
+ char *c;
+ int ret;
+
+ ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
+ if (ret >= PATH_MAX) {
+ printf("%s: Pathname is too long\n", __func__);
+ exit(EXIT_FAILURE);
+ }
+
+ if (!read_file(path, buf, sizeof(buf))) {
+ perror(path);
+ exit(EXIT_FAILURE);
+ }
+
+ c = strchr(buf, '[');
+ if (!c) {
+ printf("%s: Parse failure\n", __func__);
+ exit(EXIT_FAILURE);
+ }
+
+ c++;
+ memmove(buf, c, sizeof(buf) - (c - buf));
+
+ c = strchr(buf, ']');
+ if (!c) {
+ printf("%s: Parse failure\n", __func__);
+ exit(EXIT_FAILURE);
+ }
+ *c = '\0';
+
+ ret = 0;
+ while (strings[ret]) {
+ if (!strcmp(strings[ret], buf))
+ return ret;
+ ret++;
+ }
+
+ printf("Failed to parse %s\n", name);
+ exit(EXIT_FAILURE);
+}
+
+static void write_string(const char *name, const char *val)
+{
+ char path[PATH_MAX];
+ int ret;
+
+ ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
+ if (ret >= PATH_MAX) {
+ printf("%s: Pathname is too long\n", __func__);
+ exit(EXIT_FAILURE);
+ }
+
+ if (!write_file(path, val, strlen(val) + 1)) {
+ perror(path);
+ exit(EXIT_FAILURE);
+ }
+}
+
+static unsigned long read_num(const char *name)
+{
+ char path[PATH_MAX];
+ char buf[21];
+ int ret;
+
+ ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
+ if (ret >= PATH_MAX) {
+ printf("%s: Pathname is too long\n", __func__);
+ exit(EXIT_FAILURE);
+ }
+
+ ret = read_file(path, buf, sizeof(buf));
+ if (!ret) {
+ perror("read_file(read_num)");
+ exit(EXIT_FAILURE);
+ }
+
+ return strtoul(buf, NULL, 10);
+}
+
+static void write_num(const char *name, unsigned long num)
+{
+ char path[PATH_MAX];
+ char buf[21];
+ int ret;
+
+ ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
+ if (ret >= PATH_MAX) {
+ printf("%s: Pathname is too long\n", __func__);
+ exit(EXIT_FAILURE);
+ }
+
+ sprintf(buf, "%ld", num);
+ if (!write_file(path, buf, strlen(buf) + 1)) {
+ perror(path);
+ exit(EXIT_FAILURE);
+ }
+}
+
+static void write_settings(struct settings *settings)
+{
+ struct khugepaged_settings *khugepaged = &settings->khugepaged;
+
+ write_string("enabled", thp_enabled_strings[settings->thp_enabled]);
+ write_string("defrag", thp_defrag_strings[settings->thp_defrag]);
+ write_string("shmem_enabled",
+ shmem_enabled_strings[settings->shmem_enabled]);
+ write_num("debug_cow", settings->debug_cow);
+ write_num("use_zero_page", settings->use_zero_page);
+
+ write_num("khugepaged/defrag", khugepaged->defrag);
+ write_num("khugepaged/alloc_sleep_millisecs",
+ khugepaged->alloc_sleep_millisecs);
+ write_num("khugepaged/scan_sleep_millisecs",
+ khugepaged->scan_sleep_millisecs);
+ write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none);
+ write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap);
+ write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared);
+ write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan);
+}
+
+static void restore_settings(int sig)
+{
+ if (skip_settings_restore)
+ goto out;
+
+ printf("Restore THP and khugepaged settings...");
+ write_settings(&saved_settings);
+ success("OK");
+ if (sig)
+ exit(EXIT_FAILURE);
+out:
+ exit(exit_status);
+}
+
+static void save_settings(void)
+{
+ printf("Save THP and khugepaged settings...");
+ saved_settings = (struct settings) {
+ .thp_enabled = read_string("enabled", thp_enabled_strings),
+ .thp_defrag = read_string("defrag", thp_defrag_strings),
+ .shmem_enabled =
+ read_string("shmem_enabled", shmem_enabled_strings),
+ .debug_cow = read_num("debug_cow"),
+ .use_zero_page = read_num("use_zero_page"),
+ };
+ saved_settings.khugepaged = (struct khugepaged_settings) {
+ .defrag = read_num("khugepaged/defrag"),
+ .alloc_sleep_millisecs =
+ read_num("khugepaged/alloc_sleep_millisecs"),
+ .scan_sleep_millisecs =
+ read_num("khugepaged/scan_sleep_millisecs"),
+ .max_ptes_none = read_num("khugepaged/max_ptes_none"),
+ .max_ptes_swap = read_num("khugepaged/max_ptes_swap"),
+ .max_ptes_shared = read_num("khugepaged/max_ptes_shared"),
+ .pages_to_scan = read_num("khugepaged/pages_to_scan"),
+ };
+ success("OK");
+
+ signal(SIGTERM, restore_settings);
+ signal(SIGINT, restore_settings);
+ signal(SIGHUP, restore_settings);
+ signal(SIGQUIT, restore_settings);
+}
+
+static void adjust_settings(void)
+{
+
+ printf("Adjust settings...");
+ write_settings(&default_settings);
+ success("OK");
+}
+
+#define MAX_LINE_LENGTH 500
+
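+/*
+ * Scan the remaining lines of @fp for one starting with @pattern; on a
+ * match the line is left in @buf so the caller can inspect it further.
+ */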
+static bool check_for_pattern(FILE *fp, char *pattern, char *buf)
+{
+ while (fgets(buf, MAX_LINE_LENGTH, fp) != NULL) {
+ if (!strncmp(buf, pattern, strlen(pattern)))
+ return true;
+ }
+ return false;
+}
+
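+/*
+ * Parse PID_SMAPS: find the mapping that starts at @addr and verify that
+ * its AnonHugePages field accounts for a full PMD-sized huge page.
+ */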
+static bool check_huge(void *addr)
+{
+ bool thp = false;
+ int ret;
+ FILE *fp;
+ char buffer[MAX_LINE_LENGTH];
+ char addr_pattern[MAX_LINE_LENGTH];
+
+ ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
+ (unsigned long) addr);
+ if (ret >= MAX_LINE_LENGTH) {
+ printf("%s: Pattern is too long\n", __func__);
+ exit(EXIT_FAILURE);
+ }
+
+
+ fp = fopen(PID_SMAPS, "r");
+ if (!fp) {
+ printf("%s: Failed to open file %s\n", __func__, PID_SMAPS);
+ exit(EXIT_FAILURE);
+ }
+ if (!check_for_pattern(fp, addr_pattern, buffer))
+ goto err_out;
+
+ ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "AnonHugePages:%10ld kB",
+ hpage_pmd_size >> 10);
+ if (ret >= MAX_LINE_LENGTH) {
+ printf("%s: Pattern is too long\n", __func__);
+ exit(EXIT_FAILURE);
+ }
+ /*
+ * Fetch the AnonHugePages: field in the same smaps block and check
+ * whether it reports the expected number of huge pages next.
+ */
+ if (!check_for_pattern(fp, "AnonHugePages:", buffer))
+ goto err_out;
+
+ if (strncmp(buffer, addr_pattern, strlen(addr_pattern)))
+ goto err_out;
+
+ thp = true;
+err_out:
+ fclose(fp);
+ return thp;
+}
+
+
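+/*
+ * As above, but verify that the Swap field of the mapping at @addr
+ * reports exactly @size bytes swapped out.
+ */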
+static bool check_swap(void *addr, unsigned long size)
+{
+ bool swap = false;
+ int ret;
+ FILE *fp;
+ char buffer[MAX_LINE_LENGTH];
+ char addr_pattern[MAX_LINE_LENGTH];
+
+ ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
+ (unsigned long) addr);
+ if (ret >= MAX_LINE_LENGTH) {
+ printf("%s: Pattern is too long\n", __func__);
+ exit(EXIT_FAILURE);
+ }
+
+
+ fp = fopen(PID_SMAPS, "r");
+ if (!fp) {
+ printf("%s: Failed to open file %s\n", __func__, PID_SMAPS);
+ exit(EXIT_FAILURE);
+ }
+ if (!check_for_pattern(fp, addr_pattern, buffer))
+ goto err_out;
+
+ ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB",
+ size >> 10);
+ if (ret >= MAX_LINE_LENGTH) {
+ printf("%s: Pattern is too long\n", __func__);
+ exit(EXIT_FAILURE);
+ }
+ /*
+ * Fetch the Swap: field in the same smaps block and check whether it
+ * reports the expected amount of swapped-out memory next.
+ */
+ if (!check_for_pattern(fp, "Swap:", buffer))
+ goto err_out;
+
+ if (strncmp(buffer, addr_pattern, strlen(addr_pattern)))
+ goto err_out;
+
+ swap = true;
+err_out:
+ fclose(fp);
+ return swap;
+}
+
+static void *alloc_mapping(void)
+{
+ void *p;
+
+ p = mmap(BASE_ADDR, hpage_pmd_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (p != BASE_ADDR) {
+ printf("Failed to allocate VMA at %p\n", BASE_ADDR);
+ exit(EXIT_FAILURE);
+ }
+
+ return p;
+}
+
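+/* Stamp a distinct marker value into the first int of every page in [start, end). */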
+static void fill_memory(int *p, unsigned long start, unsigned long end)
+{
+ int i;
+
+ for (i = start / page_size; i < end / page_size; i++)
+ p[i * page_size / sizeof(*p)] = i + 0xdead0000;
+}
+
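+/* Check that the markers written by fill_memory() are still intact. */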
+static void validate_memory(int *p, unsigned long start, unsigned long end)
+{
+ int i;
+
+ for (i = start / page_size; i < end / page_size; i++) {
+ if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) {
+ printf("Page %d is corrupted: %#x\n",
+ i, p[i * page_size / sizeof(*p)]);
+ exit(EXIT_FAILURE);
+ }
+ }
+}
+
+#define TICK 500000
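+/*
+ * Hint khugepaged to collapse the range at @p (MADV_HUGEPAGE) and poll in
+ * TICK-sized steps until either the range becomes huge or two more
+ * khugepaged full scans have completed.
+ */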
+static bool wait_for_scan(const char *msg, char *p)
+{
+ int full_scans;
+ int timeout = 6; /* 3 seconds */
+
+ /* Sanity check */
+ if (check_huge(p)) {
+ printf("Unexpected huge page\n");
+ exit(EXIT_FAILURE);
+ }
+
+ madvise(p, hpage_pmd_size, MADV_HUGEPAGE);
+
+ /* Wait until khugepaged has completed two more full scans */
+ full_scans = read_num("khugepaged/full_scans") + 2;
+
+ printf("%s...", msg);
+ while (timeout--) {
+ if (check_huge(p))
+ break;
+ if (read_num("khugepaged/full_scans") >= full_scans)
+ break;
+ printf(".");
+ usleep(TICK);
+ }
+
+ madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
+
+ return !timeout;
+}
+
+static void alloc_at_fault(void)
+{
+ struct settings settings = default_settings;
+ char *p;
+
+ settings.thp_enabled = THP_ALWAYS;
+ write_settings(&settings);
+
+ p = alloc_mapping();
+ *p = 1;
+ printf("Allocate huge page on fault...");
+ if (check_huge(p))
+ success("OK");
+ else
+ fail("Fail");
+
+ write_settings(&default_settings);
+
+ madvise(p, page_size, MADV_DONTNEED);
+ printf("Split huge PMD on MADV_DONTNEED...");
+ if (!check_huge(p))
+ success("OK");
+ else
+ fail("Fail");
+ munmap(p, hpage_pmd_size);
+}
+
+static void collapse_full(void)
+{
+ void *p;
+
+ p = alloc_mapping();
+ fill_memory(p, 0, hpage_pmd_size);
+ if (wait_for_scan("Collapse fully populated PTE table", p))
+ fail("Timeout");
+ else if (check_huge(p))
+ success("OK");
+ else
+ fail("Fail");
+ validate_memory(p, 0, hpage_pmd_size);
+ munmap(p, hpage_pmd_size);
+}
+
+static void collapse_empty(void)
+{
+ void *p;
+
+ p = alloc_mapping();
+ if (wait_for_scan("Do not collapse empty PTE table", p))
+ fail("Timeout");
+ else if (check_huge(p))
+ fail("Fail");
+ else
+ success("OK");
+ munmap(p, hpage_pmd_size);
+}
+
+static void collapse_single_pte_entry(void)
+{
+ void *p;
+
+ p = alloc_mapping();
+ fill_memory(p, 0, page_size);
+ if (wait_for_scan("Collapse PTE table with single PTE entry present", p))
+ fail("Timeout");
+ else if (check_huge(p))
+ success("OK");
+ else
+ fail("Fail");
+ validate_memory(p, 0, page_size);
+ munmap(p, hpage_pmd_size);
+}
+
+static void collapse_max_ptes_none(void)
+{
+ int max_ptes_none = hpage_pmd_nr / 2;
+ struct settings settings = default_settings;
+ void *p;
+
+ settings.khugepaged.max_ptes_none = max_ptes_none;
+ write_settings(&settings);
+
+ p = alloc_mapping();
+
+ fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size);
+ if (wait_for_scan("Do not collapse with max_ptes_none exceeded", p))
+ fail("Timeout");
+ else if (check_huge(p))
+ fail("Fail");
+ else
+ success("OK");
+ validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size);
+
+ fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size);
+ if (wait_for_scan("Collapse with max_ptes_none PTEs empty", p))
+ fail("Timeout");
+ else if (check_huge(p))
+ success("OK");
+ else
+ fail("Fail");
+ validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size);
+
+ munmap(p, hpage_pmd_size);
+ write_settings(&default_settings);
+}
+
+static void collapse_swapin_single_pte(void)
+{
+ void *p;
+ p = alloc_mapping();
+ fill_memory(p, 0, hpage_pmd_size);
+
+ printf("Swapout one page...");
+ if (madvise(p, page_size, MADV_PAGEOUT)) {
+ perror("madvise(MADV_PAGEOUT)");
+ exit(EXIT_FAILURE);
+ }
+ if (check_swap(p, page_size)) {
+ success("OK");
+ } else {
+ fail("Fail");
+ goto out;
+ }
+
+ if (wait_for_scan("Collapse with swapping in single PTE entry", p))
+ fail("Timeout");
+ else if (check_huge(p))
+ success("OK");
+ else
+ fail("Fail");
+ validate_memory(p, 0, hpage_pmd_size);
+out:
+ munmap(p, hpage_pmd_size);
+}
+
+static void collapse_max_ptes_swap(void)
+{
+ int max_ptes_swap = read_num("khugepaged/max_ptes_swap");
+ void *p;
+
+ p = alloc_mapping();
+
+ fill_memory(p, 0, hpage_pmd_size);
+ printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr);
+ if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) {
+ perror("madvise(MADV_PAGEOUT)");
+ exit(EXIT_FAILURE);
+ }
+ if (check_swap(p, (max_ptes_swap + 1) * page_size)) {
+ success("OK");
+ } else {
+ fail("Fail");
+ goto out;
+ }
+
+ if (wait_for_scan("Do not collapse with max_ptes_swap exceeded", p))
+ fail("Timeout");
+ else if (check_huge(p))
+ fail("Fail");
+ else
+ success("OK");
+ validate_memory(p, 0, hpage_pmd_size);
+
+ fill_memory(p, 0, hpage_pmd_size);
+ printf("Swapout %d of %d pages...", max_ptes_swap, hpage_pmd_nr);
+ if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) {
+ perror("madvise(MADV_PAGEOUT)");
+ exit(EXIT_FAILURE);
+ }
+ if (check_swap(p, max_ptes_swap * page_size)) {
+ success("OK");
+ } else {
+ fail("Fail");
+ goto out;
+ }
+
+ if (wait_for_scan("Collapse with max_ptes_swap pages swapped out", p))
+ fail("Timeout");
+ else if (check_huge(p))
+ success("OK");
+ else
+ fail("Fail");
+ validate_memory(p, 0, hpage_pmd_size);
+out:
+ munmap(p, hpage_pmd_size);
+}
+
+static void collapse_single_pte_entry_compound(void)
+{
+ void *p;
+
+ p = alloc_mapping();
+
+ printf("Allocate huge page...");
+ madvise(p, hpage_pmd_size, MADV_HUGEPAGE);
+ fill_memory(p, 0, hpage_pmd_size);
+ if (check_huge(p))
+ success("OK");
+ else
+ fail("Fail");
+ madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
+
+ printf("Split huge page leaving single PTE mapping compound page...");
+ madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED);
+ if (!check_huge(p))
+ success("OK");
+ else
+ fail("Fail");
+
+ if (wait_for_scan("Collapse PTE table with single PTE mapping compound page", p))
+ fail("Timeout");
+ else if (check_huge(p))
+ success("OK");
+ else
+ fail("Fail");
+ validate_memory(p, 0, page_size);
+ munmap(p, hpage_pmd_size);
+}
+
+static void collapse_full_of_compound(void)
+{
+ void *p;
+
+ p = alloc_mapping();
+
+ printf("Allocate huge page...");
+ madvise(p, hpage_pmd_size, MADV_HUGEPAGE);
+ fill_memory(p, 0, hpage_pmd_size);
+ if (check_huge(p))
+ success("OK");
+ else
+ fail("Fail");
+
+ printf("Split huge page leaving single PTE page table full of compound pages...");
+ madvise(p, page_size, MADV_NOHUGEPAGE);
+ madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
+ if (!check_huge(p))
+ success("OK");
+ else
+ fail("Fail");
+
+ if (wait_for_scan("Collapse PTE table full of compound pages", p))
+ fail("Timeout");
+ else if (check_huge(p))
+ success("OK");
+ else
+ fail("Fail");
+ validate_memory(p, 0, hpage_pmd_size);
+ munmap(p, hpage_pmd_size);
+}
+
+static void collapse_compound_extreme(void)
+{
+ void *p;
+ int i;
+
+ p = alloc_mapping();
+ for (i = 0; i < hpage_pmd_nr; i++) {
+ printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...",
+ i + 1, hpage_pmd_nr);
+
+ madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE);
+ fill_memory(BASE_ADDR, 0, hpage_pmd_size);
+ if (!check_huge(BASE_ADDR)) {
+ printf("Failed to allocate huge page\n");
+ exit(EXIT_FAILURE);
+ }
+ madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE);
+
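+ /*
+ * Shrink-and-move the accumulated pages plus the first page of the
+ * fresh huge page out of the way (shrinking unmaps the rest of the
+ * huge page), then move them back just below BASE_ADDR so the next
+ * iteration can fault in another huge page at BASE_ADDR.
+ */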
+ p = mremap(BASE_ADDR - i * page_size,
+ i * page_size + hpage_pmd_size,
+ (i + 1) * page_size,
+ MREMAP_MAYMOVE | MREMAP_FIXED,
+ BASE_ADDR + 2 * hpage_pmd_size);
+ if (p == MAP_FAILED) {
+ perror("mremap+unmap");
+ exit(EXIT_FAILURE);
+ }
+
+ p = mremap(BASE_ADDR + 2 * hpage_pmd_size,
+ (i + 1) * page_size,
+ (i + 1) * page_size + hpage_pmd_size,
+ MREMAP_MAYMOVE | MREMAP_FIXED,
+ BASE_ADDR - (i + 1) * page_size);
+ if (p == MAP_FAILED) {
+ perror("mremap+alloc");
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ munmap(BASE_ADDR, hpage_pmd_size);
+ fill_memory(p, 0, hpage_pmd_size);
+ if (!check_huge(p))
+ success("OK");
+ else
+ fail("Fail");
+
+ if (wait_for_scan("Collapse PTE table full of different compound pages", p))
+ fail("Timeout");
+ else if (check_huge(p))
+ success("OK");
+ else
+ fail("Fail");
+
+ validate_memory(p, 0, hpage_pmd_size);
+ munmap(p, hpage_pmd_size);
+}
+
+static void collapse_fork(void)
+{
+ int wstatus;
+ void *p;
+
+ p = alloc_mapping();
+
+ printf("Allocate small page...");
+ fill_memory(p, 0, page_size);
+ if (!check_huge(p))
+ success("OK");
+ else
+ fail("Fail");
+
+ printf("Share small page over fork()...");
+ if (!fork()) {
+ /* Do not touch settings on child exit */
+ skip_settings_restore = true;
+ exit_status = 0;
+
+ if (!check_huge(p))
+ success("OK");
+ else
+ fail("Fail");
+
+ fill_memory(p, page_size, 2 * page_size);
+
+ if (wait_for_scan("Collapse PTE table with single page shared with parent process", p))
+ fail("Timeout");
+ else if (check_huge(p))
+ success("OK");
+ else
+ fail("Fail");
+
+ validate_memory(p, 0, page_size);
+ munmap(p, hpage_pmd_size);
+ exit(exit_status);
+ }
+
+ wait(&wstatus);
+ exit_status += WEXITSTATUS(wstatus);
+
+ printf("Check if parent still has small page...");
+ if (!check_huge(p))
+ success("OK");
+ else
+ fail("Fail");
+ validate_memory(p, 0, page_size);
+ munmap(p, hpage_pmd_size);
+}
+
+static void collapse_fork_compound(void)
+{
+ int wstatus;
+ void *p;
+
+ p = alloc_mapping();
+
+ printf("Allocate huge page...");
+ madvise(p, hpage_pmd_size, MADV_HUGEPAGE);
+ fill_memory(p, 0, hpage_pmd_size);
+ if (check_huge(p))
+ success("OK");
+ else
+ fail("Fail");
+
+ printf("Share huge page over fork()...");
+ if (!fork()) {
+ /* Do not touch settings on child exit */
+ skip_settings_restore = true;
+ exit_status = 0;
+
+ if (check_huge(p))
+ success("OK");
+ else
+ fail("Fail");
+
+ printf("Split huge page PMD in child process...");
+ madvise(p, page_size, MADV_NOHUGEPAGE);
+ madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
+ if (!check_huge(p))
+ success("OK");
+ else
+ fail("Fail");
+ fill_memory(p, 0, page_size);
+
+ write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1);
+ if (wait_for_scan("Collapse PTE table full of compound pages in child", p))
+ fail("Timeout");
+ else if (check_huge(p))
+ success("OK");
+ else
+ fail("Fail");
+ write_num("khugepaged/max_ptes_shared",
+ default_settings.khugepaged.max_ptes_shared);
+
+ validate_memory(p, 0, hpage_pmd_size);
+ munmap(p, hpage_pmd_size);
+ exit(exit_status);
+ }
+
+ wait(&wstatus);
+ exit_status += WEXITSTATUS(wstatus);
+
+ printf("Check if parent still has huge page...");
+ if (check_huge(p))
+ success("OK");
+ else
+ fail("Fail");
+ validate_memory(p, 0, hpage_pmd_size);
+ munmap(p, hpage_pmd_size);
+}
+
+static void collapse_max_ptes_shared(void)
+{
+ int max_ptes_shared = read_num("khugepaged/max_ptes_shared");
+ int wstatus;
+ void *p;
+
+ p = alloc_mapping();
+
+ printf("Allocate huge page...");
+ madvise(p, hpage_pmd_size, MADV_HUGEPAGE);
+ fill_memory(p, 0, hpage_pmd_size);
+ if (check_huge(p))
+ success("OK");
+ else
+ fail("Fail");
+
+ printf("Share huge page over fork()...");
+ if (!fork()) {
+ /* Do not touch settings on child exit */
+ skip_settings_restore = true;
+ exit_status = 0;
+
+ if (check_huge(p))
+ success("OK");
+ else
+ fail("Fail");
+
+ printf("Trigger CoW on page %d of %d...",
+ hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr);
+ fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size);
+ if (!check_huge(p))
+ success("OK");
+ else
+ fail("Fail");
+
+ if (wait_for_scan("Do not collapse with max_ptes_shared exceeded", p))
+ fail("Timeout");
+ else if (!check_huge(p))
+ success("OK");
+ else
+ fail("Fail");
+
+ printf("Trigger CoW on page %d of %d...",
+ hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr);
+ fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared) * page_size);
+ if (!check_huge(p))
+ success("OK");
+ else
+ fail("Fail");
+
+
+ if (wait_for_scan("Collapse with max_ptes_shared PTEs shared", p))
+ fail("Timeout");
+ else if (check_huge(p))
+ success("OK");
+ else
+ fail("Fail");
+
+ validate_memory(p, 0, hpage_pmd_size);
+ munmap(p, hpage_pmd_size);
+ exit(exit_status);
+ }
+
+ wait(&wstatus);
+ exit_status += WEXITSTATUS(wstatus);
+
+ printf("Check if parent still has huge page...");
+ if (check_huge(p))
+ success("OK");
+ else
+ fail("Fail");
+ validate_memory(p, 0, hpage_pmd_size);
+ munmap(p, hpage_pmd_size);
+}
+
+int main(void)
+{
+ setbuf(stdout, NULL);
+
+ page_size = getpagesize();
+ hpage_pmd_size = read_num("hpage_pmd_size");
+ hpage_pmd_nr = hpage_pmd_size / page_size;
+
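+ /* Derive the khugepaged tunables used by the tests from the runtime PMD size */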
+ default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1;
+ default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8;
+ default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2;
+ default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8;
+
+ save_settings();
+ adjust_settings();
+
+ alloc_at_fault();
+ collapse_full();
+ collapse_empty();
+ collapse_single_pte_entry();
+ collapse_max_ptes_none();
+ collapse_swapin_single_pte();
+ collapse_max_ptes_swap();
+ collapse_single_pte_entry_compound();
+ collapse_full_of_compound();
+ collapse_compound_extreme();
+ collapse_fork();
+ collapse_fork_compound();
+ collapse_max_ptes_shared();
+
+ restore_settings(0);
+}
diff --git a/tools/testing/selftests/vm/mremap_dontunmap.c b/tools/testing/selftests/vm/mremap_dontunmap.c
index ee06cb0b9efb..3a7b5ef0b0c6 100644
--- a/tools/testing/selftests/vm/mremap_dontunmap.c
+++ b/tools/testing/selftests/vm/mremap_dontunmap.c
@@ -11,7 +11,6 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#include <stdlib.h>
#include <unistd.h>
#include "../kselftest.h"
diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h
new file mode 100644
index 000000000000..622a85848f61
--- /dev/null
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -0,0 +1,225 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _PKEYS_HELPER_H
+#define _PKEYS_HELPER_H
+#define _GNU_SOURCE
+#include <string.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <signal.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <ucontext.h>
+#include <sys/mman.h>
+
+/* Define some kernel-like types */
+#define u8 __u8
+#define u16 __u16
+#define u32 __u32
+#define u64 __u64
+
+#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP)
+
+#ifndef DEBUG_LEVEL
+#define DEBUG_LEVEL 0
+#endif
+#define DPRINT_IN_SIGNAL_BUF_SIZE 4096
+extern int dprint_in_signal;
+extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
+
+extern int test_nr;
+extern int iteration_nr;
+
+#ifdef __GNUC__
+__attribute__((format(printf, 1, 2)))
+#endif
+static inline void sigsafe_printf(const char *format, ...)
+{
+ va_list ap;
+
+ if (!dprint_in_signal) {
+ va_start(ap, format);
+ vprintf(format, ap);
+ va_end(ap);
+ } else {
+ int ret;
+ /*
+ * No printf() functions are signal-safe.
+ * They deadlock easily. Write the format
+ * string to get some output, even if
+ * incomplete.
+ */
+ ret = write(1, format, strlen(format));
+ if (ret < 0)
+ exit(1);
+ }
+}
+#define dprintf_level(level, args...) do { \
+ if (level <= DEBUG_LEVEL) \
+ sigsafe_printf(args); \
+} while (0)
+#define dprintf0(args...) dprintf_level(0, args)
+#define dprintf1(args...) dprintf_level(1, args)
+#define dprintf2(args...) dprintf_level(2, args)
+#define dprintf3(args...) dprintf_level(3, args)
+#define dprintf4(args...) dprintf_level(4, args)
+
+extern void abort_hooks(void);
+#define pkey_assert(condition) do { \
+ if (!(condition)) { \
+ dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \
+ __FILE__, __LINE__, \
+ test_nr, iteration_nr); \
+ dprintf0("errno at assert: %d", errno); \
+ abort_hooks(); \
+ exit(__LINE__); \
+ } \
+} while (0)
+
+__attribute__((noinline)) int read_ptr(int *ptr);
+void expected_pkey_fault(int pkey);
+int sys_pkey_alloc(unsigned long flags, unsigned long init_val);
+int sys_pkey_free(unsigned long pkey);
+int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
+ unsigned long pkey);
+void record_pkey_malloc(void *ptr, long size, int prot);
+
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+#include "pkey-x86.h"
+#elif defined(__powerpc64__) /* arch */
+#include "pkey-powerpc.h"
+#else /* arch */
+#error Architecture not supported
+#endif /* arch */
+
+#define PKEY_MASK (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)
+
+static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags)
+{
+ u32 shift = pkey_bit_position(pkey);
+ /* mask out bits from pkey in old value */
+ reg &= ~((u64)PKEY_MASK << shift);
+ /* OR in new bits for pkey */
+ reg |= (flags & PKEY_MASK) << shift;
+ return reg;
+}
+
+static inline u64 get_pkey_bits(u64 reg, int pkey)
+{
+ u32 shift = pkey_bit_position(pkey);
+ /*
+ * shift down the relevant bits to the lowest two, then
+ * mask off all the other higher bits
+ */
+ return ((reg >> shift) & PKEY_MASK);
+}
+
+extern u64 shadow_pkey_reg;
+
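+/*
+ * Read the hardware pkey register and assert that it still matches the
+ * software shadow copy the tests maintain.
+ */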
+static inline u64 _read_pkey_reg(int line)
+{
+ u64 pkey_reg = __read_pkey_reg();
+
+ dprintf4("read_pkey_reg(line=%d) pkey_reg: %016llx"
+ " shadow: %016llx\n",
+ line, pkey_reg, shadow_pkey_reg);
+ assert(pkey_reg == shadow_pkey_reg);
+
+ return pkey_reg;
+}
+
+#define read_pkey_reg() _read_pkey_reg(__LINE__)
+
+static inline void write_pkey_reg(u64 pkey_reg)
+{
+ dprintf4("%s() changing %016llx to %016llx\n", __func__,
+ __read_pkey_reg(), pkey_reg);
+ /* will do the shadow check for us: */
+ read_pkey_reg();
+ __write_pkey_reg(pkey_reg);
+ shadow_pkey_reg = pkey_reg;
+ dprintf4("%s(%016llx) pkey_reg: %016llx\n", __func__,
+ pkey_reg, __read_pkey_reg());
+}
+
+/*
+ * These are technically racy, since something could change the
+ * PKEY register between the read and the write.
+ */
+static inline void __pkey_access_allow(int pkey, int do_allow)
+{
+ u64 pkey_reg = read_pkey_reg();
+ int bit = pkey * 2;
+
+ if (do_allow)
+ pkey_reg &= (1<<bit);
+ else
+ pkey_reg |= (1<<bit);
+
+ dprintf4("pkey_reg now: %016llx\n", read_pkey_reg());
+ write_pkey_reg(pkey_reg);
+}
+
+static inline void __pkey_write_allow(int pkey, int do_allow_write)
+{
+ u64 pkey_reg = read_pkey_reg();
+ int bit = pkey * 2 + 1;
+
+ if (do_allow_write)
+ pkey_reg &= (1<<bit);
+ else
+ pkey_reg |= (1<<bit);
+
+ write_pkey_reg(pkey_reg);
+ dprintf4("pkey_reg now: %016llx\n", read_pkey_reg());
+}
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+#define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1))
+#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1))
+#define ALIGN_PTR_UP(p, ptr_align_to) \
+ ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to))
+#define ALIGN_PTR_DOWN(p, ptr_align_to) \
+ ((typeof(p))ALIGN_DOWN((unsigned long)(p), ptr_align_to))
+#define __stringify_1(x...) #x
+#define __stringify(x...) __stringify_1(x)
+
+static inline u32 *siginfo_get_pkey_ptr(siginfo_t *si)
+{
+#ifdef si_pkey
+ return &si->si_pkey;
+#else
+ return (u32 *)(((u8 *)si) + si_pkey_offset);
+#endif
+}
+
+static inline int kernel_has_pkeys(void)
+{
+ /* try allocating a key and see if it succeeds */
+ int ret = sys_pkey_alloc(0, 0);
+ if (ret <= 0) {
+ return 0;
+ }
+ sys_pkey_free(ret);
+ return 1;
+}
+
+static inline int is_pkeys_supported(void)
+{
+ /* check if the cpu supports pkeys */
+ if (!cpu_has_pkeys()) {
+ dprintf1("SKIP: %s: no CPU support\n", __func__);
+ return 0;
+ }
+
+ /* check if the kernel supports pkeys */
+ if (!kernel_has_pkeys()) {
+ dprintf1("SKIP: %s: no kernel support\n", __func__);
+ return 0;
+ }
+
+ return 1;
+}
+
+#endif /* _PKEYS_HELPER_H */
diff --git a/tools/testing/selftests/vm/pkey-powerpc.h b/tools/testing/selftests/vm/pkey-powerpc.h
new file mode 100644
index 000000000000..1ebb586b2fbc
--- /dev/null
+++ b/tools/testing/selftests/vm/pkey-powerpc.h
@@ -0,0 +1,133 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _PKEYS_POWERPC_H
+#define _PKEYS_POWERPC_H
+
+#ifndef SYS_mprotect_key
+# define SYS_mprotect_key 386
+#endif
+#ifndef SYS_pkey_alloc
+# define SYS_pkey_alloc 384
+# define SYS_pkey_free 385
+#endif
+#define REG_IP_IDX PT_NIP
+#define REG_TRAPNO PT_TRAP
+#define gregs gp_regs
+#define fpregs fp_regs
+#define si_pkey_offset 0x20
+
+#undef PKEY_DISABLE_ACCESS
+#define PKEY_DISABLE_ACCESS 0x3 /* disable read and write */
+
+#undef PKEY_DISABLE_WRITE
+#define PKEY_DISABLE_WRITE 0x2
+
+#define NR_PKEYS 32
+#define NR_RESERVED_PKEYS_4K 27 /* pkey-0, pkey-1, exec-only-pkey
+ and 24 other keys that cannot be
+ represented in the PTE */
+#define NR_RESERVED_PKEYS_64K_3KEYS 3 /* PowerNV and KVM: pkey-0,
+ pkey-1 and exec-only key */
+#define NR_RESERVED_PKEYS_64K_4KEYS 4 /* PowerVM: pkey-0, pkey-1,
+ pkey-31 and exec-only key */
+#define PKEY_BITS_PER_PKEY 2
+#define HPAGE_SIZE (1UL << 24)
+#define PAGE_SIZE sysconf(_SC_PAGESIZE)
+
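+/* The AMR orders keys from the most-significant end, hence the reversed index */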
+static inline u32 pkey_bit_position(int pkey)
+{
+ return (NR_PKEYS - pkey - 1) * PKEY_BITS_PER_PKEY;
+}
+
+static inline u64 __read_pkey_reg(void)
+{
+ u64 pkey_reg;
+
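+ /* SPR 0xd is the AMR (Authority Mask Register) */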
+ asm volatile("mfspr %0, 0xd" : "=r" (pkey_reg));
+
+ return pkey_reg;
+}
+
+static inline void __write_pkey_reg(u64 pkey_reg)
+{
+ u64 amr = pkey_reg;
+
+ dprintf4("%s() changing %016llx to %016llx\n",
+ __func__, __read_pkey_reg(), pkey_reg);
+
+ asm volatile("isync; mtspr 0xd, %0; isync"
+ : : "r" ((unsigned long)(amr)) : "memory");
+
+ dprintf4("%s() pkey register after changing %016llx to %016llx\n",
+ __func__, __read_pkey_reg(), pkey_reg);
+}
+
+static inline int cpu_has_pkeys(void)
+{
+ /* No simple way to determine this */
+ return 1;
+}
+
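+/*
+ * Heuristic: PowerVM partitions expose ibm,partition-name and hmc-managed?
+ * in the device tree, while the qemu,graphic-width property is taken as a
+ * sign of a qemu/KVM guest.
+ */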
+static inline bool arch_is_powervm(void)
+{
+ struct stat buf;
+
+ if ((stat("/sys/firmware/devicetree/base/ibm,partition-name", &buf) == 0) &&
+ (stat("/sys/firmware/devicetree/base/hmc-managed?", &buf) == 0) &&
+ (stat("/sys/firmware/devicetree/base/chosen/qemu,graphic-width", &buf) == -1) )
+ return true;
+
+ return false;
+}
+
+static inline int get_arch_reserved_keys(void)
+{
+ if (sysconf(_SC_PAGESIZE) == 4096)
+ return NR_RESERVED_PKEYS_4K;
+ else
+ if (arch_is_powervm())
+ return NR_RESERVED_PKEYS_64K_4KEYS;
+ else
+ return NR_RESERVED_PKEYS_64K_3KEYS;
+}
+
+void expect_fault_on_read_execonly_key(void *p1, int pkey)
+{
+ /*
+ * powerpc does not allow userspace to change permissions of exec-only
+ * keys since those keys are not allocated by userspace. The signal
+ * handler won't be able to reset the permissions, which means the code
+ * will infinitely continue to segfault here.
+ */
+ return;
+}
+
+/* 4-byte instructions * 16384 = 64K page */
+#define __page_o_noops() asm(".rept 16384 ; nop; .endr")
+
+void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey)
+{
+ void *ptr;
+ int ret;
+
+ dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
+ size, prot, pkey);
+ pkey_assert(pkey < NR_PKEYS);
+ ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+ pkey_assert(ptr != (void *)-1);
+
+ ret = syscall(__NR_subpage_prot, ptr, size, NULL);
+ if (ret) {
+ perror("subpage_perm");
+ return PTR_ERR_ENOTSUP;
+ }
+
+ ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey);
+ pkey_assert(!ret);
+ record_pkey_malloc(ptr, size, prot);
+
+ dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr);
+ return ptr;
+}
+
+#endif /* _PKEYS_POWERPC_H */
diff --git a/tools/testing/selftests/vm/pkey-x86.h b/tools/testing/selftests/vm/pkey-x86.h
new file mode 100644
index 000000000000..3be20f5d5275
--- /dev/null
+++ b/tools/testing/selftests/vm/pkey-x86.h
@@ -0,0 +1,181 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _PKEYS_X86_H
+#define _PKEYS_X86_H
+
+#ifdef __i386__
+
+#ifndef SYS_mprotect_key
+# define SYS_mprotect_key 380
+#endif
+
+#ifndef SYS_pkey_alloc
+# define SYS_pkey_alloc 381
+# define SYS_pkey_free 382
+#endif
+
+#define REG_IP_IDX REG_EIP
+#define si_pkey_offset 0x14
+
+#else
+
+#ifndef SYS_mprotect_key
+# define SYS_mprotect_key 329
+#endif
+
+#ifndef SYS_pkey_alloc
+# define SYS_pkey_alloc 330
+# define SYS_pkey_free 331
+#endif
+
+#define REG_IP_IDX REG_RIP
+#define si_pkey_offset 0x20
+
+#endif
+
+#ifndef PKEY_DISABLE_ACCESS
+# define PKEY_DISABLE_ACCESS 0x1
+#endif
+
+#ifndef PKEY_DISABLE_WRITE
+# define PKEY_DISABLE_WRITE 0x2
+#endif
+
+#define NR_PKEYS 16
+#define NR_RESERVED_PKEYS 2 /* pkey-0 and exec-only-pkey */
+#define PKEY_BITS_PER_PKEY 2
+#define HPAGE_SIZE (1UL<<21)
+#define PAGE_SIZE 4096
+#define MB (1<<20)
+
+static inline void __page_o_noops(void)
+{
+ /* 8 bytes of instruction * 512 = 1 page */
+ asm(".rept 512 ; nopl 0x7eeeeeee(%eax) ; .endr");
+}
+
+static inline u64 __read_pkey_reg(void)
+{
+ unsigned int eax, edx;
+ unsigned int ecx = 0;
+ unsigned pkey_reg;
+
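+ /* raw opcode bytes for RDPKRU (requires ECX = 0); PKRU is returned in EAX */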
+ asm volatile(".byte 0x0f,0x01,0xee\n\t"
+ : "=a" (eax), "=d" (edx)
+ : "c" (ecx));
+ pkey_reg = eax;
+ return pkey_reg;
+}
+
+static inline void __write_pkey_reg(u64 pkey_reg)
+{
+ unsigned int eax = pkey_reg;
+ unsigned int ecx = 0;
+ unsigned int edx = 0;
+
+ dprintf4("%s() changing %016llx to %016llx\n", __func__,
+ __read_pkey_reg(), pkey_reg);
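+ /* raw opcode bytes for WRPKRU (requires ECX = EDX = 0); writes EAX into PKRU */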
+ asm volatile(".byte 0x0f,0x01,0xef\n\t"
+ : : "a" (eax), "c" (ecx), "d" (edx));
+ assert(pkey_reg == __read_pkey_reg());
+}
+
+static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ /* ecx is often an input as well as an output. */
+ asm volatile(
+ "cpuid;"
+ : "=a" (*eax),
+ "=b" (*ebx),
+ "=c" (*ecx),
+ "=d" (*edx)
+ : "0" (*eax), "2" (*ecx));
+}
+
+/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) */
+#define X86_FEATURE_PKU (1<<3) /* Protection Keys for Userspace */
+#define X86_FEATURE_OSPKE (1<<4) /* OS Protection Keys Enable */
+
+static inline int cpu_has_pkeys(void)
+{
+ unsigned int eax;
+ unsigned int ebx;
+ unsigned int ecx;
+ unsigned int edx;
+
+ eax = 0x7;
+ ecx = 0x0;
+ __cpuid(&eax, &ebx, &ecx, &edx);
+
+ if (!(ecx & X86_FEATURE_PKU)) {
+ dprintf2("cpu does not have PKU\n");
+ return 0;
+ }
+ if (!(ecx & X86_FEATURE_OSPKE)) {
+ dprintf2("cpu does not have OSPKE\n");
+ return 0;
+ }
+ return 1;
+}
+
+static inline u32 pkey_bit_position(int pkey)
+{
+ return pkey * PKEY_BITS_PER_PKEY;
+}
+
+#define XSTATE_PKEY_BIT (9)
+#define XSTATE_PKEY 0x200
+
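+/*
+ * Query CPUID leaf 0xd, sub-leaf XSTATE_PKEY_BIT to find where the PKRU
+ * state component lives inside the XSAVE area (the signal handler uses
+ * this to locate PKRU in the saved FPU state).
+ */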
+int pkey_reg_xstate_offset(void)
+{
+ unsigned int eax;
+ unsigned int ebx;
+ unsigned int ecx;
+ unsigned int edx;
+ int xstate_offset;
+ int xstate_size;
+ unsigned long XSTATE_CPUID = 0xd;
+ int leaf;
+
+ /* assume that XSTATE_PKEY is set in XCR0 */
+ leaf = XSTATE_PKEY_BIT;
+ {
+ eax = XSTATE_CPUID;
+ ecx = leaf;
+ __cpuid(&eax, &ebx, &ecx, &edx);
+
+ if (leaf == XSTATE_PKEY_BIT) {
+ xstate_offset = ebx;
+ xstate_size = eax;
+ }
+ }
+
+ if (xstate_size == 0) {
+ printf("could not find size/offset of PKEY in xsave state\n");
+ return 0;
+ }
+
+ return xstate_offset;
+}
+
+static inline int get_arch_reserved_keys(void)
+{
+ return NR_RESERVED_PKEYS;
+}
+
+void expect_fault_on_read_execonly_key(void *p1, int pkey)
+{
+ int ptr_contents;
+
+ ptr_contents = read_ptr(p1);
+ dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
+ expected_pkey_fault(pkey);
+}
+
+void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey)
+{
+ return PTR_ERR_ENOTSUP;
+}
+
+#endif /* _PKEYS_X86_H */
diff --git a/tools/testing/selftests/x86/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c
index 480995bceefa..fc19addcb5c8 100644
--- a/tools/testing/selftests/x86/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -1,11 +1,11 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Tests x86 Memory Protection Keys (see Documentation/core-api/protection-keys.rst)
+ * Tests Memory Protection Keys (see Documentation/vm/protection-keys.txt)
*
* There are examples in here of:
* * how to set protection keys on memory
- * * how to set/clear bits in PKRU (the rights register)
- * * how to handle SEGV_PKRU signals and extract pkey-relevant
+ * * how to set/clear bits in pkey registers (the rights register)
+ * * how to handle SEGV_PKUERR signals and extract pkey-relevant
* information from the siginfo
*
* Things to add:
@@ -22,8 +22,10 @@
* gcc -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm
*/
#define _GNU_SOURCE
+#define __SANE_USERSPACE_TYPES__
#include <errno.h>
#include <linux/futex.h>
+#include <time.h>
#include <sys/time.h>
#include <sys/syscall.h>
#include <string.h>
@@ -48,34 +50,10 @@
int iteration_nr = 1;
int test_nr;
-unsigned int shadow_pkru;
-
-#define HPAGE_SIZE (1UL<<21)
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
-#define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1))
-#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1))
-#define ALIGN_PTR_UP(p, ptr_align_to) ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to))
-#define ALIGN_PTR_DOWN(p, ptr_align_to) ((typeof(p))ALIGN_DOWN((unsigned long)(p), ptr_align_to))
-#define __stringify_1(x...) #x
-#define __stringify(x...) __stringify_1(x)
-
-#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP)
-
+u64 shadow_pkey_reg;
int dprint_in_signal;
char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
-extern void abort_hooks(void);
-#define pkey_assert(condition) do { \
- if (!(condition)) { \
- dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \
- __FILE__, __LINE__, \
- test_nr, iteration_nr); \
- dprintf0("errno at assert: %d", errno); \
- abort_hooks(); \
- exit(__LINE__); \
- } \
-} while (0)
-
void cat_into_file(char *str, char *file)
{
int fd = open(file, O_RDWR);
@@ -158,12 +136,6 @@ void abort_hooks(void)
#endif
}
-static inline void __page_o_noops(void)
-{
- /* 8-bytes of instruction * 512 bytes = 1 page */
- asm(".rept 512 ; nopl 0x7eeeeeee(%eax) ; .endr");
-}
-
/*
* This attempts to have roughly a page of instructions followed by a few
* instructions that do a write, and another page of instructions. That
@@ -174,7 +146,12 @@ static inline void __page_o_noops(void)
* will then fault, which makes sure that the fault code handles
* execute-only memory properly.
*/
+#ifdef __powerpc64__
+/* This way, both 4K and 64K alignment are maintained */
+__attribute__((__aligned__(65536)))
+#else
__attribute__((__aligned__(PAGE_SIZE)))
+#endif
void lots_o_noops_around_write(int *write_to_me)
{
dprintf3("running %s()\n", __func__);
@@ -186,51 +163,134 @@ void lots_o_noops_around_write(int *write_to_me)
dprintf3("%s() done\n", __func__);
}
-/* Define some kernel-like types */
-#define u8 uint8_t
-#define u16 uint16_t
-#define u32 uint32_t
-#define u64 uint64_t
+void dump_mem(void *dumpme, int len_bytes)
+{
+ char *c = (void *)dumpme;
+ int i;
-#ifdef __i386__
+ for (i = 0; i < len_bytes; i += sizeof(u64)) {
+ u64 *ptr = (u64 *)(c + i);
+ dprintf1("dump[%03d][@%p]: %016llx\n", i, ptr, *ptr);
+ }
+}
-#ifndef SYS_mprotect_key
-# define SYS_mprotect_key 380
-#endif
+static u32 hw_pkey_get(int pkey, unsigned long flags)
+{
+ u64 pkey_reg = __read_pkey_reg();
-#ifndef SYS_pkey_alloc
-# define SYS_pkey_alloc 381
-# define SYS_pkey_free 382
-#endif
+ dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n",
+ __func__, pkey, flags, 0, 0);
+ dprintf2("%s() raw pkey_reg: %016llx\n", __func__, pkey_reg);
-#define REG_IP_IDX REG_EIP
-#define si_pkey_offset 0x14
+ return (u32) get_pkey_bits(pkey_reg, pkey);
+}
-#else
+static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags)
+{
+ u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
+ u64 old_pkey_reg = __read_pkey_reg();
+ u64 new_pkey_reg;
-#ifndef SYS_mprotect_key
-# define SYS_mprotect_key 329
-#endif
+ /* make sure that 'rights' only contains the bits we expect: */
+ assert(!(rights & ~mask));
-#ifndef SYS_pkey_alloc
-# define SYS_pkey_alloc 330
-# define SYS_pkey_free 331
-#endif
+ /* modify bits accordingly in old pkey_reg and assign it */
+ new_pkey_reg = set_pkey_bits(old_pkey_reg, pkey, rights);
-#define REG_IP_IDX REG_RIP
-#define si_pkey_offset 0x20
+ __write_pkey_reg(new_pkey_reg);
-#endif
+ dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x"
+ " pkey_reg now: %016llx old_pkey_reg: %016llx\n",
+ __func__, pkey, rights, flags, 0, __read_pkey_reg(),
+ old_pkey_reg);
+ return 0;
+}
-void dump_mem(void *dumpme, int len_bytes)
+void pkey_disable_set(int pkey, int flags)
{
- char *c = (void *)dumpme;
- int i;
+ unsigned long syscall_flags = 0;
+ int ret;
+ int pkey_rights;
+ u64 orig_pkey_reg = read_pkey_reg();
- for (i = 0; i < len_bytes; i += sizeof(u64)) {
- u64 *ptr = (u64 *)(c + i);
- dprintf1("dump[%03d][@%p]: %016jx\n", i, ptr, *ptr);
- }
+ dprintf1("START->%s(%d, 0x%x)\n", __func__,
+ pkey, flags);
+ pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
+
+ pkey_rights = hw_pkey_get(pkey, syscall_flags);
+
+ dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
+ pkey, pkey, pkey_rights);
+
+ pkey_assert(pkey_rights >= 0);
+
+ pkey_rights |= flags;
+
+ ret = hw_pkey_set(pkey, pkey_rights, syscall_flags);
+ assert(!ret);
+ /* pkey_reg and flags have the same format */
+ shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights);
+ dprintf1("%s(%d) shadow: 0x%016llx\n",
+ __func__, pkey, shadow_pkey_reg);
+
+ pkey_assert(ret >= 0);
+
+ pkey_rights = hw_pkey_get(pkey, syscall_flags);
+ dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
+ pkey, pkey, pkey_rights);
+
+ dprintf1("%s(%d) pkey_reg: 0x%016llx\n",
+ __func__, pkey, read_pkey_reg());
+ if (flags)
+ pkey_assert(read_pkey_reg() >= orig_pkey_reg);
+ dprintf1("END<---%s(%d, 0x%x)\n", __func__,
+ pkey, flags);
+}
+
+void pkey_disable_clear(int pkey, int flags)
+{
+ unsigned long syscall_flags = 0;
+ int ret;
+ int pkey_rights = hw_pkey_get(pkey, syscall_flags);
+ u64 orig_pkey_reg = read_pkey_reg();
+
+ pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
+
+ dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
+ pkey, pkey, pkey_rights);
+ pkey_assert(pkey_rights >= 0);
+
+ pkey_rights &= ~flags;
+
+ ret = hw_pkey_set(pkey, pkey_rights, 0);
+ shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights);
+ pkey_assert(ret >= 0);
+
+ pkey_rights = hw_pkey_get(pkey, syscall_flags);
+ dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
+ pkey, pkey, pkey_rights);
+
+ dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__,
+ pkey, read_pkey_reg());
+ if (flags)
+ assert(read_pkey_reg() <= orig_pkey_reg);
+}
+
+void pkey_write_allow(int pkey)
+{
+ pkey_disable_clear(pkey, PKEY_DISABLE_WRITE);
+}
+void pkey_write_deny(int pkey)
+{
+ pkey_disable_set(pkey, PKEY_DISABLE_WRITE);
+}
+void pkey_access_allow(int pkey)
+{
+ pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS);
+}
+void pkey_access_deny(int pkey)
+{
+ pkey_disable_set(pkey, PKEY_DISABLE_ACCESS);
}
/* Failed address bound checks: */
@@ -255,7 +315,7 @@ static char *si_code_str(int si_code)
return "UNKNOWN";
}
-int pkru_faults;
+int pkey_faults;
int last_si_pkey = -1;
void signal_handler(int signum, siginfo_t *si, void *vucontext)
{
@@ -263,24 +323,28 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext)
int trapno;
unsigned long ip;
char *fpregs;
- u32 *pkru_ptr;
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+ u32 *pkey_reg_ptr;
+ int pkey_reg_offset;
+#endif /* arch */
u64 siginfo_pkey;
u32 *si_pkey_ptr;
- int pkru_offset;
- fpregset_t fpregset;
dprint_in_signal = 1;
dprintf1(">>>>===============SIGSEGV============================\n");
- dprintf1("%s()::%d, pkru: 0x%x shadow: %x\n", __func__, __LINE__,
- __rdpkru(), shadow_pkru);
+ dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n",
+ __func__, __LINE__,
+ __read_pkey_reg(), shadow_pkey_reg);
trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO];
ip = uctxt->uc_mcontext.gregs[REG_IP_IDX];
- fpregset = uctxt->uc_mcontext.fpregs;
- fpregs = (void *)fpregset;
+ fpregs = (char *) uctxt->uc_mcontext.fpregs;
- dprintf2("%s() trapno: %d ip: 0x%lx info->si_code: %s/%d\n", __func__,
- trapno, ip, si_code_str(si->si_code), si->si_code);
+ dprintf2("%s() trapno: %d ip: 0x%016lx info->si_code: %s/%d\n",
+ __func__, trapno, ip, si_code_str(si->si_code),
+ si->si_code);
+
+#if defined(__i386__) || defined(__x86_64__) /* arch */
#ifdef __i386__
/*
* 32-bit has some extra padding so that userspace can tell whether
@@ -288,20 +352,22 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext)
* state. We just assume that it is here.
*/
fpregs += 0x70;
-#endif
- pkru_offset = pkru_xstate_offset();
- pkru_ptr = (void *)(&fpregs[pkru_offset]);
+#endif /* i386 */
+ pkey_reg_offset = pkey_reg_xstate_offset();
+ pkey_reg_ptr = (void *)(&fpregs[pkey_reg_offset]);
- dprintf1("siginfo: %p\n", si);
- dprintf1(" fpregs: %p\n", fpregs);
/*
- * If we got a PKRU fault, we *HAVE* to have at least one bit set in
+ * If we got a PKEY fault, we *HAVE* to have at least one bit set in
* here.
*/
- dprintf1("pkru_xstate_offset: %d\n", pkru_xstate_offset());
+ dprintf1("pkey_reg_xstate_offset: %d\n", pkey_reg_xstate_offset());
if (DEBUG_LEVEL > 4)
- dump_mem(pkru_ptr - 128, 256);
- pkey_assert(*pkru_ptr);
+ dump_mem(pkey_reg_ptr - 128, 256);
+ pkey_assert(*pkey_reg_ptr);
+#endif /* arch */
+
+ dprintf1("siginfo: %p\n", si);
+ dprintf1(" fpregs: %p\n", fpregs);
if ((si->si_code == SEGV_MAPERR) ||
(si->si_code == SEGV_ACCERR) ||
@@ -310,20 +376,29 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext)
exit(4);
}
- si_pkey_ptr = (u32 *)(((u8 *)si) + si_pkey_offset);
+ si_pkey_ptr = siginfo_get_pkey_ptr(si);
dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr);
dump_mem((u8 *)si_pkey_ptr - 8, 24);
siginfo_pkey = *si_pkey_ptr;
pkey_assert(siginfo_pkey < NR_PKEYS);
last_si_pkey = siginfo_pkey;
- dprintf1("signal pkru from xsave: %08x\n", *pkru_ptr);
- /* need __rdpkru() version so we do not do shadow_pkru checking */
- dprintf1("signal pkru from pkru: %08x\n", __rdpkru());
- dprintf1("pkey from siginfo: %jx\n", siginfo_pkey);
- *(u64 *)pkru_ptr = 0x00000000;
- dprintf1("WARNING: set PRKU=0 to allow faulting instruction to continue\n");
- pkru_faults++;
+ /*
+ * need __read_pkey_reg() version so we do not do shadow_pkey_reg
+ * checking
+ */
+ dprintf1("signal pkey_reg from pkey_reg: %016llx\n",
+ __read_pkey_reg());
+ dprintf1("pkey from siginfo: %016llx\n", siginfo_pkey);
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+ dprintf1("signal pkey_reg from xsave: %08x\n", *pkey_reg_ptr);
+ *(u64 *)pkey_reg_ptr = 0x00000000;
+ dprintf1("WARNING: set PKEY_REG=0 to allow faulting instruction to continue\n");
+#elif defined(__powerpc64__) /* arch */
+ /* restore access and let the faulting instruction continue */
+ pkey_access_allow(siginfo_pkey);
+#endif /* arch */
+ pkey_faults++;
dprintf1("<<<<==================================================\n");
dprint_in_signal = 0;
}
@@ -391,143 +466,6 @@ pid_t fork_lazy_child(void)
return forkret;
}
-#ifndef PKEY_DISABLE_ACCESS
-# define PKEY_DISABLE_ACCESS 0x1
-#endif
-
-#ifndef PKEY_DISABLE_WRITE
-# define PKEY_DISABLE_WRITE 0x2
-#endif
-
-static u32 hw_pkey_get(int pkey, unsigned long flags)
-{
- u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
- u32 pkru = __rdpkru();
- u32 shifted_pkru;
- u32 masked_pkru;
-
- dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n",
- __func__, pkey, flags, 0, 0);
- dprintf2("%s() raw pkru: %x\n", __func__, pkru);
-
- shifted_pkru = (pkru >> (pkey * PKRU_BITS_PER_PKEY));
- dprintf2("%s() shifted_pkru: %x\n", __func__, shifted_pkru);
- masked_pkru = shifted_pkru & mask;
- dprintf2("%s() masked pkru: %x\n", __func__, masked_pkru);
- /*
- * shift down the relevant bits to the lowest two, then
- * mask off all the other high bits.
- */
- return masked_pkru;
-}
-
-static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags)
-{
- u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
- u32 old_pkru = __rdpkru();
- u32 new_pkru;
-
- /* make sure that 'rights' only contains the bits we expect: */
- assert(!(rights & ~mask));
-
- /* copy old pkru */
- new_pkru = old_pkru;
- /* mask out bits from pkey in old value: */
- new_pkru &= ~(mask << (pkey * PKRU_BITS_PER_PKEY));
- /* OR in new bits for pkey: */
- new_pkru |= (rights << (pkey * PKRU_BITS_PER_PKEY));
-
- __wrpkru(new_pkru);
-
- dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x pkru now: %x old_pkru: %x\n",
- __func__, pkey, rights, flags, 0, __rdpkru(), old_pkru);
- return 0;
-}
-
-void pkey_disable_set(int pkey, int flags)
-{
- unsigned long syscall_flags = 0;
- int ret;
- int pkey_rights;
- u32 orig_pkru = rdpkru();
-
- dprintf1("START->%s(%d, 0x%x)\n", __func__,
- pkey, flags);
- pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
-
- pkey_rights = hw_pkey_get(pkey, syscall_flags);
-
- dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
- pkey, pkey, pkey_rights);
- pkey_assert(pkey_rights >= 0);
-
- pkey_rights |= flags;
-
- ret = hw_pkey_set(pkey, pkey_rights, syscall_flags);
- assert(!ret);
- /*pkru and flags have the same format */
- shadow_pkru |= flags << (pkey * 2);
- dprintf1("%s(%d) shadow: 0x%x\n", __func__, pkey, shadow_pkru);
-
- pkey_assert(ret >= 0);
-
- pkey_rights = hw_pkey_get(pkey, syscall_flags);
- dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
- pkey, pkey, pkey_rights);
-
- dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru());
- if (flags)
- pkey_assert(rdpkru() > orig_pkru);
- dprintf1("END<---%s(%d, 0x%x)\n", __func__,
- pkey, flags);
-}
-
-void pkey_disable_clear(int pkey, int flags)
-{
- unsigned long syscall_flags = 0;
- int ret;
- int pkey_rights = hw_pkey_get(pkey, syscall_flags);
- u32 orig_pkru = rdpkru();
-
- pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
-
- dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
- pkey, pkey, pkey_rights);
- pkey_assert(pkey_rights >= 0);
-
- pkey_rights |= flags;
-
- ret = hw_pkey_set(pkey, pkey_rights, 0);
- /* pkru and flags have the same format */
- shadow_pkru &= ~(flags << (pkey * 2));
- pkey_assert(ret >= 0);
-
- pkey_rights = hw_pkey_get(pkey, syscall_flags);
- dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
- pkey, pkey, pkey_rights);
-
- dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru());
- if (flags)
- assert(rdpkru() > orig_pkru);
-}
-
-void pkey_write_allow(int pkey)
-{
- pkey_disable_clear(pkey, PKEY_DISABLE_WRITE);
-}
-void pkey_write_deny(int pkey)
-{
- pkey_disable_set(pkey, PKEY_DISABLE_WRITE);
-}
-void pkey_access_allow(int pkey)
-{
- pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS);
-}
-void pkey_access_deny(int pkey)
-{
- pkey_disable_set(pkey, PKEY_DISABLE_ACCESS);
-}
-
int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
unsigned long pkey)
{
@@ -561,33 +499,44 @@ int alloc_pkey(void)
int ret;
unsigned long init_val = 0x0;
- dprintf1("alloc_pkey()::%d, pkru: 0x%x shadow: %x\n",
- __LINE__, __rdpkru(), shadow_pkru);
+ dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n",
+ __func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg);
ret = sys_pkey_alloc(0, init_val);
/*
- * pkey_alloc() sets PKRU, so we need to reflect it in
- * shadow_pkru:
+ * pkey_alloc() sets PKEY register, so we need to reflect it in
+ * shadow_pkey_reg:
*/
- dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n",
- __LINE__, ret, __rdpkru(), shadow_pkru);
+ dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+ " shadow: 0x%016llx\n",
+ __func__, __LINE__, ret, __read_pkey_reg(),
+ shadow_pkey_reg);
if (ret) {
/* clear both the bits: */
- shadow_pkru &= ~(0x3 << (ret * 2));
- dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n",
- __LINE__, ret, __rdpkru(), shadow_pkru);
+ shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret,
+ ~PKEY_MASK);
+ dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+ " shadow: 0x%016llx\n",
+ __func__,
+ __LINE__, ret, __read_pkey_reg(),
+ shadow_pkey_reg);
/*
* move the new state in from init_val
- * (remember, we cheated and init_val == pkru format)
+ * (remember, we cheated and init_val == pkey_reg format)
*/
- shadow_pkru |= (init_val << (ret * 2));
+ shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret,
+ init_val);
}
- dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n",
- __LINE__, ret, __rdpkru(), shadow_pkru);
- dprintf1("alloc_pkey()::%d errno: %d\n", __LINE__, errno);
+ dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+ " shadow: 0x%016llx\n",
+ __func__, __LINE__, ret, __read_pkey_reg(),
+ shadow_pkey_reg);
+ dprintf1("%s()::%d errno: %d\n", __func__, __LINE__, errno);
/* for shadow checking: */
- rdpkru();
- dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n",
- __LINE__, ret, __rdpkru(), shadow_pkru);
+ read_pkey_reg();
+ dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+ " shadow: 0x%016llx\n",
+ __func__, __LINE__, ret, __read_pkey_reg(),
+ shadow_pkey_reg);
return ret;
}
@@ -612,10 +561,10 @@ int alloc_random_pkey(void)
int nr_alloced = 0;
int random_index;
memset(alloced_pkeys, 0, sizeof(alloced_pkeys));
+ srand((unsigned int)time(NULL));
/* allocate every possible key and make a note of which ones we got */
max_nr_pkey_allocs = NR_PKEYS;
- max_nr_pkey_allocs = 1;
for (i = 0; i < max_nr_pkey_allocs; i++) {
int new_pkey = alloc_pkey();
if (new_pkey < 0)
@@ -638,8 +587,9 @@ int alloc_random_pkey(void)
free_ret = sys_pkey_free(alloced_pkeys[i]);
pkey_assert(!free_ret);
}
- dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__,
- __LINE__, ret, __rdpkru(), shadow_pkru);
+ dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+ " shadow: 0x%016llx\n", __func__,
+ __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg);
return ret;
}
@@ -657,11 +607,15 @@ int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
if (nr_iterations-- < 0)
break;
- dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__,
- __LINE__, ret, __rdpkru(), shadow_pkru);
+ dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+ " shadow: 0x%016llx\n",
+ __func__, __LINE__, ret, __read_pkey_reg(),
+ shadow_pkey_reg);
sys_pkey_free(rpkey);
- dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__,
- __LINE__, ret, __rdpkru(), shadow_pkru);
+ dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+ " shadow: 0x%016llx\n",
+ __func__, __LINE__, ret, __read_pkey_reg(),
+ shadow_pkey_reg);
}
pkey_assert(pkey < NR_PKEYS);
@@ -669,8 +623,9 @@ int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n",
ptr, size, orig_prot, pkey, ret);
pkey_assert(!ret);
- dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__,
- __LINE__, ret, __rdpkru(), shadow_pkru);
+ dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+ " shadow: 0x%016llx\n", __func__,
+ __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg);
return ret;
}
@@ -752,7 +707,7 @@ void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey)
void *ptr;
int ret;
- rdpkru();
+ read_pkey_reg();
dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
size, prot, pkey);
pkey_assert(pkey < NR_PKEYS);
@@ -761,7 +716,7 @@ void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey)
ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey);
pkey_assert(!ret);
record_pkey_malloc(ptr, size, prot);
- rdpkru();
+ read_pkey_reg();
dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr);
return ptr;
@@ -798,12 +753,15 @@ void *malloc_pkey_anon_huge(long size, int prot, u16 pkey)
}
int hugetlb_setup_ok;
+#define SYSFS_FMT_NR_HUGE_PAGES "/sys/kernel/mm/hugepages/hugepages-%ldkB/nr_hugepages"
#define GET_NR_HUGE_PAGES 10
void setup_hugetlbfs(void)
{
int err;
int fd;
- char buf[] = "123";
+ char buf[256];
+ long hpagesz_kb;
+ long hpagesz_mb;
if (geteuid() != 0) {
fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n");
@@ -814,11 +772,16 @@ void setup_hugetlbfs(void)
/*
* Now go make sure that we got the pages and that they
- * are 2M pages. Someone might have made 1G the default.
+ * are PMD-level pages. Someone might have made PUD-level
+ * pages the default.
*/
- fd = open("/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages", O_RDONLY);
+ hpagesz_kb = HPAGE_SIZE / 1024;
+ hpagesz_mb = hpagesz_kb / 1024;
+ sprintf(buf, SYSFS_FMT_NR_HUGE_PAGES, hpagesz_kb);
+ fd = open(buf, O_RDONLY);
if (fd < 0) {
- perror("opening sysfs 2M hugetlb config");
+ fprintf(stderr, "opening sysfs %ldM hugetlb config: %s\n",
+ hpagesz_mb, strerror(errno));
return;
}
@@ -826,13 +789,14 @@ void setup_hugetlbfs(void)
err = read(fd, buf, sizeof(buf)-1);
close(fd);
if (err <= 0) {
- perror("reading sysfs 2M hugetlb config");
+ fprintf(stderr, "reading sysfs %ldM hugetlb config: %s\n",
+ hpagesz_mb, strerror(errno));
return;
}
if (atoi(buf) != GET_NR_HUGE_PAGES) {
- fprintf(stderr, "could not confirm 2M pages, got: '%s' expected %d\n",
- buf, GET_NR_HUGE_PAGES);
+ fprintf(stderr, "could not confirm %ldM pages, got: '%s' expected %d\n",
+ hpagesz_mb, buf, GET_NR_HUGE_PAGES);
return;
}
@@ -886,6 +850,7 @@ void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey)
void *(*pkey_malloc[])(long size, int prot, u16 pkey) = {
malloc_pkey_with_mprotect,
+ malloc_pkey_with_mprotect_subpage,
malloc_pkey_anon_huge,
malloc_pkey_hugetlb
/* can not do direct with the pkey_mprotect() API:
@@ -924,14 +889,14 @@ void *malloc_pkey(long size, int prot, u16 pkey)
return ret;
}
-int last_pkru_faults;
+int last_pkey_faults;
#define UNKNOWN_PKEY -2
-void expected_pk_fault(int pkey)
+void expected_pkey_fault(int pkey)
{
- dprintf2("%s(): last_pkru_faults: %d pkru_faults: %d\n",
- __func__, last_pkru_faults, pkru_faults);
+ dprintf2("%s(): last_pkey_faults: %d pkey_faults: %d\n",
+ __func__, last_pkey_faults, pkey_faults);
dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey);
- pkey_assert(last_pkru_faults + 1 == pkru_faults);
+ pkey_assert(last_pkey_faults + 1 == pkey_faults);
/*
* For exec-only memory, we do not know the pkey in
@@ -940,24 +905,28 @@ void expected_pk_fault(int pkey)
if (pkey != UNKNOWN_PKEY)
pkey_assert(last_si_pkey == pkey);
+#if defined(__i386__) || defined(__x86_64__) /* arch */
/*
- * The signal handler shold have cleared out PKRU to let the
+ * The signal handler should have cleared out the PKEY register to let the
* test program continue. We now have to restore it.
*/
- if (__rdpkru() != 0)
+ if (__read_pkey_reg() != 0)
+#else /* arch */
+ if (__read_pkey_reg() != shadow_pkey_reg)
+#endif /* arch */
pkey_assert(0);
- __wrpkru(shadow_pkru);
- dprintf1("%s() set PKRU=%x to restore state after signal nuked it\n",
- __func__, shadow_pkru);
- last_pkru_faults = pkru_faults;
+ __write_pkey_reg(shadow_pkey_reg);
+ dprintf1("%s() set pkey_reg=%016llx to restore state after signal "
+ "nuked it\n", __func__, shadow_pkey_reg);
+ last_pkey_faults = pkey_faults;
last_si_pkey = -1;
}
-#define do_not_expect_pk_fault(msg) do { \
- if (last_pkru_faults != pkru_faults) \
- dprintf0("unexpected PK fault: %s\n", msg); \
- pkey_assert(last_pkru_faults == pkru_faults); \
+#define do_not_expect_pkey_fault(msg) do { \
+ if (last_pkey_faults != pkey_faults) \
+ dprintf0("unexpected PKey fault: %s\n", msg); \
+ pkey_assert(last_pkey_faults == pkey_faults); \
} while (0)
int test_fds[10] = { -1 };
@@ -1000,6 +969,58 @@ __attribute__((noinline)) int read_ptr(int *ptr)
return *ptr;
}
+void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey)
+{
+ int i, err;
+ int max_nr_pkey_allocs;
+ int alloced_pkeys[NR_PKEYS];
+ int nr_alloced = 0;
+ long size;
+
+ pkey_assert(pkey_last_malloc_record);
+ size = pkey_last_malloc_record->size;
+ /*
+ * This is a bit of a hack. But mprotect() requires
+ * huge-page-aligned sizes when operating on hugetlbfs.
+ * So, make sure that we use something that's a multiple
+ * of a huge page when we can.
+ */
+ if (size >= HPAGE_SIZE)
+ size = HPAGE_SIZE;
+
+ /* allocate every possible key and make sure key-0 never got allocated */
+ max_nr_pkey_allocs = NR_PKEYS;
+ for (i = 0; i < max_nr_pkey_allocs; i++) {
+ int new_pkey = alloc_pkey();
+ pkey_assert(new_pkey != 0);
+
+ if (new_pkey < 0)
+ break;
+ alloced_pkeys[nr_alloced++] = new_pkey;
+ }
+ /* free all the allocated keys */
+ for (i = 0; i < nr_alloced; i++) {
+ int free_ret;
+
+ if (!alloced_pkeys[i])
+ continue;
+ free_ret = sys_pkey_free(alloced_pkeys[i]);
+ pkey_assert(!free_ret);
+ }
+
+ /* attach key-0 in various modes */
+ err = sys_mprotect_pkey(ptr, size, PROT_READ, 0);
+ pkey_assert(!err);
+ err = sys_mprotect_pkey(ptr, size, PROT_WRITE, 0);
+ pkey_assert(!err);
+ err = sys_mprotect_pkey(ptr, size, PROT_EXEC, 0);
+ pkey_assert(!err);
+ err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE, 0);
+ pkey_assert(!err);
+ err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE|PROT_EXEC, 0);
+ pkey_assert(!err);
+}
+
void test_read_of_write_disabled_region(int *ptr, u16 pkey)
{
int ptr_contents;
@@ -1015,26 +1036,67 @@ void test_read_of_access_disabled_region(int *ptr, u16 pkey)
int ptr_contents;
dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr);
- rdpkru();
+ read_pkey_reg();
+ pkey_access_deny(pkey);
+ ptr_contents = read_ptr(ptr);
+ dprintf1("*ptr: %d\n", ptr_contents);
+ expected_pkey_fault(pkey);
+}
+
+void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr,
+ u16 pkey)
+{
+ int ptr_contents;
+
+ dprintf1("disabling access to PKEY[%02d], doing read @ %p\n",
+ pkey, ptr);
+ ptr_contents = read_ptr(ptr);
+ dprintf1("reading ptr before disabling the read : %d\n",
+ ptr_contents);
+ read_pkey_reg();
pkey_access_deny(pkey);
ptr_contents = read_ptr(ptr);
dprintf1("*ptr: %d\n", ptr_contents);
- expected_pk_fault(pkey);
+ expected_pkey_fault(pkey);
}
+
+void test_write_of_write_disabled_region_with_page_already_mapped(int *ptr,
+ u16 pkey)
+{
+ *ptr = __LINE__;
+ dprintf1("disabling write access; after accessing the page, "
+ "to PKEY[%02d], doing write\n", pkey);
+ pkey_write_deny(pkey);
+ *ptr = __LINE__;
+ expected_pkey_fault(pkey);
+}
+
void test_write_of_write_disabled_region(int *ptr, u16 pkey)
{
dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey);
pkey_write_deny(pkey);
*ptr = __LINE__;
- expected_pk_fault(pkey);
+ expected_pkey_fault(pkey);
}
void test_write_of_access_disabled_region(int *ptr, u16 pkey)
{
dprintf1("disabling access to PKEY[%02d], doing write\n", pkey);
pkey_access_deny(pkey);
*ptr = __LINE__;
- expected_pk_fault(pkey);
+ expected_pkey_fault(pkey);
}
+
+void test_write_of_access_disabled_region_with_page_already_mapped(int *ptr,
+ u16 pkey)
+{
+ *ptr = __LINE__;
+ dprintf1("disabling access; after accessing the page, "
+ " to PKEY[%02d], doing write\n", pkey);
+ pkey_access_deny(pkey);
+ *ptr = __LINE__;
+ expected_pkey_fault(pkey);
+}
+
void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey)
{
int ret;
@@ -1160,9 +1222,11 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey)
int new_pkey;
dprintf1("%s() alloc loop: %d\n", __func__, i);
new_pkey = alloc_pkey();
- dprintf4("%s()::%d, err: %d pkru: 0x%x shadow: 0x%x\n", __func__,
- __LINE__, err, __rdpkru(), shadow_pkru);
- rdpkru(); /* for shadow checking */
+ dprintf4("%s()::%d, err: %d pkey_reg: 0x%016llx"
+ " shadow: 0x%016llx\n",
+ __func__, __LINE__, err, __read_pkey_reg(),
+ shadow_pkey_reg);
+ read_pkey_reg(); /* for shadow checking */
dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC);
if ((new_pkey == -1) && (errno == ENOSPC)) {
dprintf2("%s() failed to allocate pkey after %d tries\n",
@@ -1188,6 +1252,7 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey)
dprintf3("%s()::%d\n", __func__, __LINE__);
/*
+ * On x86:
* There are 16 pkeys supported in hardware. Three are
* allocated by the time we get here:
* 1. The default key (0)
@@ -1195,13 +1260,21 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey)
* 3. One allocated by the test code and passed in via
* 'pkey' to this function.
* Ensure that we can allocate at least another 13 (16-3).
+ *
+ * On powerpc:
+ * There are either 5, 28, 29 or 32 pkeys supported in
+ * hardware depending on the page size (4K or 64K) and
+ * platform (powernv or powervm). Four are allocated by
+ * the time we get here. These include pkey-0, pkey-1,
+ * exec-only pkey and the one allocated by the test code.
+ * Ensure that we can allocate the remaining.
*/
- pkey_assert(i >= NR_PKEYS-3);
+ pkey_assert(i >= (NR_PKEYS - get_arch_reserved_keys() - 1));
for (i = 0; i < nr_allocated_pkeys; i++) {
err = sys_pkey_free(allocated_pkeys[i]);
pkey_assert(!err);
- rdpkru(); /* for shadow checking */
+ read_pkey_reg(); /* for shadow checking */
}
}
@@ -1287,7 +1360,7 @@ void test_ptrace_of_child(int *ptr, u16 pkey)
pkey_assert(ret != -1);
/* Now access from the current task, and expect an exception: */
peek_result = read_ptr(ptr);
- expected_pk_fault(pkey);
+ expected_pkey_fault(pkey);
/*
* Try to access the NON-pkey-protected "plain_ptr" via ptrace:
@@ -1297,7 +1370,7 @@ void test_ptrace_of_child(int *ptr, u16 pkey)
pkey_assert(ret != -1);
/* Now access from the current task, and expect NO exception: */
peek_result = read_ptr(plain_ptr);
- do_not_expect_pk_fault("read plain pointer after ptrace");
+ do_not_expect_pkey_fault("read plain pointer after ptrace");
ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0);
pkey_assert(ret != -1);
@@ -1347,17 +1420,15 @@ void test_executing_on_unreadable_memory(int *ptr, u16 pkey)
pkey_assert(!ret);
pkey_access_deny(pkey);
- dprintf2("pkru: %x\n", rdpkru());
+ dprintf2("pkey_reg: %016llx\n", read_pkey_reg());
/*
* Make sure this is an *instruction* fault
*/
madvise(p1, PAGE_SIZE, MADV_DONTNEED);
lots_o_noops_around_write(&scratch);
- do_not_expect_pk_fault("executing on PROT_EXEC memory");
- ptr_contents = read_ptr(p1);
- dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
- expected_pk_fault(pkey);
+ do_not_expect_pkey_fault("executing on PROT_EXEC memory");
+ expect_fault_on_read_execonly_key(p1, pkey);
}
void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey)
@@ -1378,15 +1449,13 @@ void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey)
ret = mprotect(p1, PAGE_SIZE, PROT_EXEC);
pkey_assert(!ret);
- dprintf2("pkru: %x\n", rdpkru());
+ dprintf2("pkey_reg: %016llx\n", read_pkey_reg());
/* Make sure this is an *instruction* fault */
madvise(p1, PAGE_SIZE, MADV_DONTNEED);
lots_o_noops_around_write(&scratch);
- do_not_expect_pk_fault("executing on PROT_EXEC memory");
- ptr_contents = read_ptr(p1);
- dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
- expected_pk_fault(UNKNOWN_PKEY);
+ do_not_expect_pkey_fault("executing on PROT_EXEC memory");
+ expect_fault_on_read_execonly_key(p1, UNKNOWN_PKEY);
/*
* Put the memory back to non-PROT_EXEC. Should clear the
@@ -1400,7 +1469,7 @@ void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey)
ret = mprotect(p1, PAGE_SIZE, PROT_READ|PROT_EXEC);
pkey_assert(!ret);
ptr_contents = read_ptr(p1);
- do_not_expect_pk_fault("plain read on recently PROT_EXEC area");
+ do_not_expect_pkey_fault("plain read on recently PROT_EXEC area");
}
void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey)
@@ -1408,7 +1477,7 @@ void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey)
int size = PAGE_SIZE;
int sret;
- if (cpu_has_pku()) {
+ if (cpu_has_pkeys()) {
dprintf1("SKIP: %s: no CPU support\n", __func__);
return;
}
@@ -1420,8 +1489,11 @@ void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey)
void (*pkey_tests[])(int *ptr, u16 pkey) = {
test_read_of_write_disabled_region,
test_read_of_access_disabled_region,
+ test_read_of_access_disabled_region_with_page_already_mapped,
test_write_of_write_disabled_region,
+ test_write_of_write_disabled_region_with_page_already_mapped,
test_write_of_access_disabled_region,
+ test_write_of_access_disabled_region_with_page_already_mapped,
test_kernel_write_of_access_disabled_region,
test_kernel_write_of_write_disabled_region,
test_kernel_gup_of_access_disabled_region,
@@ -1433,6 +1505,7 @@ void (*pkey_tests[])(int *ptr, u16 pkey) = {
test_pkey_syscalls_on_non_allocated_pkey,
test_pkey_syscalls_bad_args,
test_pkey_alloc_exhaust,
+ test_pkey_alloc_free_attach_pkey0,
};
void run_tests_once(void)
@@ -1442,7 +1515,7 @@ void run_tests_once(void)
for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) {
int pkey;
- int orig_pkru_faults = pkru_faults;
+ int orig_pkey_faults = pkey_faults;
dprintf1("======================\n");
dprintf1("test %d preparing...\n", test_nr);
@@ -1457,8 +1530,8 @@ void run_tests_once(void)
free_pkey_malloc(ptr);
sys_pkey_free(pkey);
- dprintf1("pkru_faults: %d\n", pkru_faults);
- dprintf1("orig_pkru_faults: %d\n", orig_pkru_faults);
+ dprintf1("pkey_faults: %d\n", pkey_faults);
+ dprintf1("orig_pkey_faults: %d\n", orig_pkey_faults);
tracing_off();
close_test_fds();
@@ -1471,18 +1544,19 @@ void run_tests_once(void)
void pkey_setup_shadow(void)
{
- shadow_pkru = __rdpkru();
+ shadow_pkey_reg = __read_pkey_reg();
}
int main(void)
{
int nr_iterations = 22;
+ int pkeys_supported = is_pkeys_supported();
setup_handlers();
- printf("has pku: %d\n", cpu_has_pku());
+ printf("has pkeys: %d\n", pkeys_supported);
- if (!cpu_has_pku()) {
+ if (!pkeys_supported) {
int size = PAGE_SIZE;
int *ptr;
@@ -1495,7 +1569,7 @@ int main(void)
}
pkey_setup_shadow();
- printf("startup pkru: %x\n", rdpkru());
+ printf("startup pkey_reg: %016llx\n", read_pkey_reg());
setup_hugetlbfs();
while (nr_iterations-- > 0)
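
For reference only, a minimal sketch of what the renamed x86 accessors could look like once they move out of the deleted x86 pkey-helpers.h below: this is not the actual replacement header (it is outside this hunk), just an illustration assuming __read_pkey_reg()/__write_pkey_reg()/read_pkey_reg() keep the semantics of the removed __rdpkru()/__wrpkru()/rdpkru(), widened to 64 bits to match the %016llx format strings used in the hunks above.

/*
 * Hedged sketch, not the real pkey-helpers.h replacement.
 * The value type is widened to 64 bits so the same prototypes can
 * also carry powerpc's pkey register.
 */
#include <assert.h>

typedef unsigned long long u64;

/* the real selftest keeps this shadow copy as a global in protection_keys.c */
u64 shadow_pkey_reg;

static inline u64 __read_pkey_reg(void)
{
	unsigned int eax, edx;
	unsigned int ecx = 0;

	/* RDPKRU: reads the 32-bit PKRU register into EAX */
	asm volatile(".byte 0x0f,0x01,0xee\n\t"
		     : "=a" (eax), "=d" (edx)
		     : "c" (ecx));
	return eax;
}

static inline void __write_pkey_reg(u64 pkey_reg)
{
	unsigned int eax = pkey_reg;
	unsigned int ecx = 0;
	unsigned int edx = 0;

	/* WRPKRU: writes EAX into PKRU; ECX and EDX must be zero */
	asm volatile(".byte 0x0f,0x01,0xef\n\t"
		     : : "a" (eax), "c" (ecx), "d" (edx));
	assert(pkey_reg == __read_pkey_reg());
}

/* read_pkey_reg() additionally checks the hardware value against the shadow */
static inline u64 read_pkey_reg(void)
{
	u64 pkey_reg = __read_pkey_reg();

	assert(pkey_reg == shadow_pkey_reg);
	return pkey_reg;
}
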
diff --git a/tools/testing/selftests/x86/.gitignore b/tools/testing/selftests/x86/.gitignore
index 022a1f3b64ef..1aaef5bf119a 100644
--- a/tools/testing/selftests/x86/.gitignore
+++ b/tools/testing/selftests/x86/.gitignore
@@ -12,5 +12,4 @@ ldt_gdt
iopl
mpx-mini-test
ioperm
-protection_keys
test_vdso
diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index 5d49bfec1e9a..5f16821c7f63 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -12,7 +12,7 @@ CAN_BUILD_WITH_NOPIE := $(shell ./check_cc.sh $(CC) trivial_program.c -no-pie)
TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap_vdso \
check_initial_reg_state sigreturn iopl ioperm \
- protection_keys test_vdso test_vsyscall mov_ss_trap \
+ test_vdso test_vsyscall mov_ss_trap \
syscall_arg_fault
TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \
test_FCMOV test_FCOMI test_FISTTP \
diff --git a/tools/testing/selftests/x86/pkey-helpers.h b/tools/testing/selftests/x86/pkey-helpers.h
deleted file mode 100644
index 254e5436bdd9..000000000000
--- a/tools/testing/selftests/x86/pkey-helpers.h
+++ /dev/null
@@ -1,219 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _PKEYS_HELPER_H
-#define _PKEYS_HELPER_H
-#define _GNU_SOURCE
-#include <string.h>
-#include <stdarg.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <stdbool.h>
-#include <signal.h>
-#include <assert.h>
-#include <stdlib.h>
-#include <ucontext.h>
-#include <sys/mman.h>
-
-#define NR_PKEYS 16
-#define PKRU_BITS_PER_PKEY 2
-
-#ifndef DEBUG_LEVEL
-#define DEBUG_LEVEL 0
-#endif
-#define DPRINT_IN_SIGNAL_BUF_SIZE 4096
-extern int dprint_in_signal;
-extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
-static inline void sigsafe_printf(const char *format, ...)
-{
- va_list ap;
-
- if (!dprint_in_signal) {
- va_start(ap, format);
- vprintf(format, ap);
- va_end(ap);
- } else {
- int ret;
- /*
- * No printf() functions are signal-safe.
- * They deadlock easily. Write the format
- * string to get some output, even if
- * incomplete.
- */
- ret = write(1, format, strlen(format));
- if (ret < 0)
- exit(1);
- }
-}
-#define dprintf_level(level, args...) do { \
- if (level <= DEBUG_LEVEL) \
- sigsafe_printf(args); \
-} while (0)
-#define dprintf0(args...) dprintf_level(0, args)
-#define dprintf1(args...) dprintf_level(1, args)
-#define dprintf2(args...) dprintf_level(2, args)
-#define dprintf3(args...) dprintf_level(3, args)
-#define dprintf4(args...) dprintf_level(4, args)
-
-extern unsigned int shadow_pkru;
-static inline unsigned int __rdpkru(void)
-{
- unsigned int eax, edx;
- unsigned int ecx = 0;
- unsigned int pkru;
-
- asm volatile(".byte 0x0f,0x01,0xee\n\t"
- : "=a" (eax), "=d" (edx)
- : "c" (ecx));
- pkru = eax;
- return pkru;
-}
-
-static inline unsigned int _rdpkru(int line)
-{
- unsigned int pkru = __rdpkru();
-
- dprintf4("rdpkru(line=%d) pkru: %x shadow: %x\n",
- line, pkru, shadow_pkru);
- assert(pkru == shadow_pkru);
-
- return pkru;
-}
-
-#define rdpkru() _rdpkru(__LINE__)
-
-static inline void __wrpkru(unsigned int pkru)
-{
- unsigned int eax = pkru;
- unsigned int ecx = 0;
- unsigned int edx = 0;
-
- dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru);
- asm volatile(".byte 0x0f,0x01,0xef\n\t"
- : : "a" (eax), "c" (ecx), "d" (edx));
- assert(pkru == __rdpkru());
-}
-
-static inline void wrpkru(unsigned int pkru)
-{
- dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru);
- /* will do the shadow check for us: */
- rdpkru();
- __wrpkru(pkru);
- shadow_pkru = pkru;
- dprintf4("%s(%08x) pkru: %08x\n", __func__, pkru, __rdpkru());
-}
-
-/*
- * These are technically racy. since something could
- * change PKRU between the read and the write.
- */
-static inline void __pkey_access_allow(int pkey, int do_allow)
-{
- unsigned int pkru = rdpkru();
- int bit = pkey * 2;
-
- if (do_allow)
- pkru &= (1<<bit);
- else
- pkru |= (1<<bit);
-
- dprintf4("pkru now: %08x\n", rdpkru());
- wrpkru(pkru);
-}
-
-static inline void __pkey_write_allow(int pkey, int do_allow_write)
-{
- long pkru = rdpkru();
- int bit = pkey * 2 + 1;
-
- if (do_allow_write)
- pkru &= (1<<bit);
- else
- pkru |= (1<<bit);
-
- wrpkru(pkru);
- dprintf4("pkru now: %08x\n", rdpkru());
-}
-
-#define PROT_PKEY0 0x10 /* protection key value (bit 0) */
-#define PROT_PKEY1 0x20 /* protection key value (bit 1) */
-#define PROT_PKEY2 0x40 /* protection key value (bit 2) */
-#define PROT_PKEY3 0x80 /* protection key value (bit 3) */
-
-#define PAGE_SIZE 4096
-#define MB (1<<20)
-
-static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
- unsigned int *ecx, unsigned int *edx)
-{
- /* ecx is often an input as well as an output. */
- asm volatile(
- "cpuid;"
- : "=a" (*eax),
- "=b" (*ebx),
- "=c" (*ecx),
- "=d" (*edx)
- : "0" (*eax), "2" (*ecx));
-}
-
-/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) */
-#define X86_FEATURE_PKU (1<<3) /* Protection Keys for Userspace */
-#define X86_FEATURE_OSPKE (1<<4) /* OS Protection Keys Enable */
-
-static inline int cpu_has_pku(void)
-{
- unsigned int eax;
- unsigned int ebx;
- unsigned int ecx;
- unsigned int edx;
-
- eax = 0x7;
- ecx = 0x0;
- __cpuid(&eax, &ebx, &ecx, &edx);
-
- if (!(ecx & X86_FEATURE_PKU)) {
- dprintf2("cpu does not have PKU\n");
- return 0;
- }
- if (!(ecx & X86_FEATURE_OSPKE)) {
- dprintf2("cpu does not have OSPKE\n");
- return 0;
- }
- return 1;
-}
-
-#define XSTATE_PKRU_BIT (9)
-#define XSTATE_PKRU 0x200
-
-int pkru_xstate_offset(void)
-{
- unsigned int eax;
- unsigned int ebx;
- unsigned int ecx;
- unsigned int edx;
- int xstate_offset;
- int xstate_size;
- unsigned long XSTATE_CPUID = 0xd;
- int leaf;
-
- /* assume that XSTATE_PKRU is set in XCR0 */
- leaf = XSTATE_PKRU_BIT;
- {
- eax = XSTATE_CPUID;
- ecx = leaf;
- __cpuid(&eax, &ebx, &ecx, &edx);
-
- if (leaf == XSTATE_PKRU_BIT) {
- xstate_offset = ebx;
- xstate_size = eax;
- }
- }
-
- if (xstate_size == 0) {
- printf("could not find size/offset of PKRU in xsave state\n");
- return 0;
- }
-
- return xstate_offset;
-}
-
-#endif /* _PKEYS_HELPER_H */
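
The deleted header above supplied the raw PKRU accessors that the renamed helpers replace. Independent of the selftest, the syscalls the tests exercise (pkey_alloc, pkey_mprotect, pkey_free) are also reachable through the glibc wrappers (glibc >= 2.27); a minimal, hedged usage sketch, not part of the patch:

/* Standalone example: allocate a key, attach it to a mapping, read through it. */
#define _GNU_SOURCE
#include <sys/mman.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	int pkey, ret;
	int *ptr;

	ptr = mmap(NULL, page, PROT_READ | PROT_WRITE,
		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (ptr == MAP_FAILED)
		exit(1);

	/* allocate a key whose access rights start out write-disabled */
	pkey = pkey_alloc(0, PKEY_DISABLE_WRITE);
	if (pkey < 0) {
		perror("pkey_alloc");	/* e.g. no hardware/kernel support */
		exit(1);
	}

	/* attach the key to the mapping; the key's rights now gate access */
	ret = pkey_mprotect(ptr, page, PROT_READ | PROT_WRITE, pkey);
	if (ret)
		exit(1);

	printf("read through pkey %d: %d\n", pkey, *ptr);	/* allowed */
	/* *ptr = 1; would fault here with si_code == SEGV_PKUERR */

	pkey_free(pkey);
	munmap(ptr, page);
	return 0;
}
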
diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c
index d3a8755c039c..85eb65ea16d3 100644
--- a/tools/vm/page_owner_sort.c
+++ b/tools/vm/page_owner_sort.c
@@ -4,8 +4,7 @@
*
* Example use:
* cat /sys/kernel/debug/page_owner > page_owner_full.txt
- * grep -v ^PFN page_owner_full.txt > page_owner.txt
- * ./page_owner_sort page_owner.txt sorted_page_owner.txt
+ * ./page_owner_sort page_owner_full.txt sorted_page_owner.txt
*
* See Documentation/vm/page_owner.rst
*/
@@ -38,6 +37,8 @@ int read_block(char *buf, int buf_size, FILE *fin)
while (buf_end - curr > 1 && fgets(curr, buf_end - curr, fin)) {
if (*curr == '\n') /* empty line */
return curr - buf;
+ if (!strncmp(curr, "PFN", 3))
+ continue;
curr += strlen(curr);
}