author    Stephen Rothwell <sfr@canb.auug.org.au>  2016-05-18 14:01:09 +1000
committer Stephen Rothwell <sfr@canb.auug.org.au>  2016-05-18 14:01:11 +1000
commit    d0a783895690d227fb7cadc5482d787fb44fd425 (patch)
tree      7cc168d7bbbe21c87542b9a9d001dd00c2122f5a
parent    8944e06b0b1a046221054633dd28a164508981ec (diff)
parent    a758ba8622449cb31893a0030d0dada82cfb9315 (diff)

Merge branch 'akpm-current/current'
-rw-r--r--Documentation/DocBook/debugobjects.tmpl26
-rw-r--r--Documentation/blockdev/zram.txt27
-rw-r--r--Documentation/filesystems/nilfs2.txt5
-rw-r--r--Documentation/filesystems/proc.txt1
-rw-r--r--Documentation/gdb-kernel-debugging.txt21
-rw-r--r--Documentation/kernel-parameters.txt14
-rw-r--r--Documentation/memory-hotplug.txt9
-rw-r--r--Documentation/sysctl/vm.txt14
-rw-r--r--Documentation/vm/transhuge.txt18
-rw-r--r--Documentation/vm/z3fold.txt27
-rw-r--r--Kbuild10
-rw-r--r--MAINTAINERS10
-rw-r--r--arch/Kconfig12
-rw-r--r--arch/alpha/Kconfig1
-rw-r--r--arch/alpha/kernel/process.c8
-rw-r--r--arch/arc/Kconfig1
-rw-r--r--arch/arc/include/asm/hugepage.h2
-rw-r--r--arch/arc/kernel/process.c7
-rw-r--r--arch/arm/Kconfig2
-rw-r--r--arch/arm/configs/axm55xx_defconfig2
-rw-r--r--arch/arm/configs/bcm2835_defconfig2
-rw-r--r--arch/arm/configs/bcm_defconfig3
-rw-r--r--arch/arm/configs/colibri_pxa270_defconfig2
-rw-r--r--arch/arm/configs/colibri_pxa300_defconfig2
-rw-r--r--arch/arm/configs/dove_defconfig2
-rw-r--r--arch/arm/configs/efm32_defconfig2
-rw-r--r--arch/arm/configs/exynos_defconfig2
-rw-r--r--arch/arm/configs/ezx_defconfig2
-rw-r--r--arch/arm/configs/h5000_defconfig2
-rw-r--r--arch/arm/configs/hisi_defconfig2
-rw-r--r--arch/arm/configs/imote2_defconfig2
-rw-r--r--arch/arm/configs/imx_v6_v7_defconfig2
-rw-r--r--arch/arm/configs/keystone_defconfig2
-rw-r--r--arch/arm/configs/lpc18xx_defconfig2
-rw-r--r--arch/arm/configs/magician_defconfig2
-rw-r--r--arch/arm/configs/mmp2_defconfig2
-rw-r--r--arch/arm/configs/moxart_defconfig2
-rw-r--r--arch/arm/configs/multi_v7_defconfig2
-rw-r--r--arch/arm/configs/mvebu_v7_defconfig2
-rw-r--r--arch/arm/configs/mxs_defconfig2
-rw-r--r--arch/arm/configs/omap2plus_defconfig2
-rw-r--r--arch/arm/configs/pxa168_defconfig2
-rw-r--r--arch/arm/configs/pxa3xx_defconfig2
-rw-r--r--arch/arm/configs/pxa910_defconfig2
-rw-r--r--arch/arm/configs/pxa_defconfig2
-rw-r--r--arch/arm/configs/qcom_defconfig2
-rw-r--r--arch/arm/configs/raumfeld_defconfig2
-rw-r--r--arch/arm/configs/shmobile_defconfig2
-rw-r--r--arch/arm/configs/socfpga_defconfig2
-rw-r--r--arch/arm/configs/stm32_defconfig2
-rw-r--r--arch/arm/configs/sunxi_defconfig2
-rw-r--r--arch/arm/configs/tegra_defconfig2
-rw-r--r--arch/arm/configs/u300_defconfig2
-rw-r--r--arch/arm/configs/u8500_defconfig2
-rw-r--r--arch/arm/configs/vt8500_v6_v7_defconfig2
-rw-r--r--arch/arm/configs/xcep_defconfig2
-rw-r--r--arch/arm/configs/zx_defconfig3
-rw-r--r--arch/arm/include/asm/page.h2
-rw-r--r--arch/arm/include/asm/pgtable-3level.h5
-rw-r--r--arch/arm/kernel/process.c4
-rw-r--r--arch/arm/kernel/smp.c2
-rw-r--r--arch/arm/mm/Kconfig3
-rw-r--r--arch/arm/vfp/vfpmodule.c4
-rw-r--r--arch/arm64/configs/defconfig2
-rw-r--r--arch/arm64/include/asm/pgtable.h5
-rw-r--r--arch/arm64/kernel/process.c7
-rw-r--r--arch/arm64/mm/hugetlbpage.c1
-rw-r--r--arch/avr32/Kconfig2
-rw-r--r--arch/avr32/kernel/process.c4
-rw-r--r--arch/blackfin/Kconfig1
-rw-r--r--arch/blackfin/include/asm/processor.h7
-rw-r--r--arch/c6x/kernel/process.c4
-rw-r--r--arch/cris/Kconfig2
-rw-r--r--arch/cris/arch-v10/kernel/process.c9
-rw-r--r--arch/cris/arch-v32/kernel/process.c4
-rw-r--r--arch/frv/include/asm/processor.h7
-rw-r--r--arch/h8300/Kconfig1
-rw-r--r--arch/h8300/include/asm/processor.h7
-rw-r--r--arch/hexagon/kernel/process.c7
-rw-r--r--arch/ia64/Kconfig1
-rw-r--r--arch/ia64/kernel/perfmon.c4
-rw-r--r--arch/ia64/kernel/process.c14
-rw-r--r--arch/m32r/Kconfig1
-rw-r--r--arch/m32r/kernel/process.c9
-rw-r--r--arch/m68k/Kconfig.cpu11
-rw-r--r--arch/m68k/include/asm/processor.h7
-rw-r--r--arch/metag/Kconfig2
-rw-r--r--arch/metag/include/asm/processor.h2
-rw-r--r--arch/metag/kernel/process.c6
-rw-r--r--arch/metag/mm/hugetlbpage.c1
-rw-r--r--arch/microblaze/Kconfig1
-rw-r--r--arch/microblaze/include/asm/processor.h10
-rw-r--r--arch/mips/Kconfig3
-rw-r--r--arch/mips/configs/bcm47xx_defconfig2
-rw-r--r--arch/mips/configs/bmips_be_defconfig2
-rw-r--r--arch/mips/configs/bmips_stb_defconfig2
-rw-r--r--arch/mips/configs/ci20_defconfig2
-rw-r--r--arch/mips/configs/db1xxx_defconfig1
-rw-r--r--arch/mips/configs/lemote2f_defconfig2
-rw-r--r--arch/mips/configs/loongson3_defconfig3
-rw-r--r--arch/mips/configs/nlm_xlp_defconfig2
-rw-r--r--arch/mips/configs/nlm_xlr_defconfig2
-rw-r--r--arch/mips/configs/pistachio_defconfig2
-rw-r--r--arch/mips/configs/qi_lb60_defconfig2
-rw-r--r--arch/mips/configs/rt305x_defconfig2
-rw-r--r--arch/mips/configs/xway_defconfig2
-rw-r--r--arch/mips/include/asm/cpu-features.h10
-rw-r--r--arch/mips/include/asm/pgtable.h1
-rw-r--r--arch/mips/kernel/process.c4
-rw-r--r--arch/mips/mm/tlb-r4k.c21
-rw-r--r--arch/mn10300/Kconfig1
-rw-r--r--arch/mn10300/configs/asb2364_defconfig1
-rw-r--r--arch/mn10300/include/asm/fpu.h6
-rw-r--r--arch/mn10300/kernel/process.c4
-rw-r--r--arch/nios2/Kconfig1
-rw-r--r--arch/nios2/include/asm/processor.h5
-rw-r--r--arch/openrisc/Kconfig1
-rw-r--r--arch/openrisc/include/asm/processor.h9
-rw-r--r--arch/parisc/Kconfig1
-rw-r--r--arch/parisc/configs/generic-64bit_defconfig2
-rw-r--r--arch/parisc/kernel/process.c7
-rw-r--r--arch/powerpc/Kconfig1
-rw-r--r--arch/powerpc/configs/40x/virtex_defconfig2
-rw-r--r--arch/powerpc/configs/44x/virtex5_defconfig2
-rw-r--r--arch/powerpc/configs/44x/warp_defconfig2
-rw-r--r--arch/powerpc/configs/52xx/cm5200_defconfig2
-rw-r--r--arch/powerpc/configs/52xx/lite5200b_defconfig2
-rw-r--r--arch/powerpc/configs/52xx/motionpro_defconfig2
-rw-r--r--arch/powerpc/configs/52xx/tqm5200_defconfig2
-rw-r--r--arch/powerpc/configs/gamecube_defconfig2
-rw-r--r--arch/powerpc/configs/mpc5200_defconfig2
-rw-r--r--arch/powerpc/configs/pasemi_defconfig2
-rw-r--r--arch/powerpc/configs/wii_defconfig2
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgtable.h1
-rw-r--r--arch/powerpc/include/asm/pgtable.h1
-rw-r--r--arch/powerpc/kernel/process.c4
-rw-r--r--arch/powerpc/mm/hugetlbpage.c6
-rw-r--r--arch/s390/Kconfig3
-rw-r--r--arch/s390/configs/default_defconfig2
-rw-r--r--arch/s390/configs/gcov_defconfig2
-rw-r--r--arch/s390/configs/performance_defconfig2
-rw-r--r--arch/s390/configs/zfcpdump_defconfig2
-rw-r--r--arch/s390/include/asm/pgtable.h1
-rw-r--r--arch/s390/kernel/machine_kexec.c28
-rw-r--r--arch/s390/kernel/process.c5
-rw-r--r--arch/score/Kconfig1
-rw-r--r--arch/score/kernel/process.c2
-rw-r--r--arch/sh/Kconfig3
-rw-r--r--arch/sh/configs/apsh4ad0a_defconfig1
-rw-r--r--arch/sh/configs/edosk7760_defconfig2
-rw-r--r--arch/sh/configs/sdk7786_defconfig3
-rw-r--r--arch/sh/configs/se7206_defconfig1
-rw-r--r--arch/sh/configs/se7722_defconfig2
-rw-r--r--arch/sh/configs/sh7785lcr_32bit_defconfig2
-rw-r--r--arch/sh/configs/shx3_defconfig1
-rw-r--r--arch/sh/configs/urquell_defconfig3
-rw-r--r--arch/sh/kernel/process_32.c7
-rw-r--r--arch/sh/kernel/process_64.c5
-rw-r--r--arch/sparc/Kconfig3
-rw-r--r--arch/sparc/configs/sparc64_defconfig2
-rw-r--r--arch/sparc/include/asm/pgtable_64.h2
-rw-r--r--arch/sparc/kernel/process_32.c12
-rw-r--r--arch/sparc/kernel/process_64.c4
-rw-r--r--arch/tile/Kconfig2
-rw-r--r--arch/tile/configs/tilegx_defconfig1
-rw-r--r--arch/tile/configs/tilepro_defconfig1
-rw-r--r--arch/tile/include/asm/pgtable.h1
-rw-r--r--arch/tile/kernel/process.c4
-rw-r--r--arch/tile/kernel/setup.c4
-rw-r--r--arch/tile/mm/hugetlbpage.c7
-rw-r--r--arch/tile/mm/init.c2
-rw-r--r--arch/um/configs/i386_defconfig1
-rw-r--r--arch/um/configs/x86_64_defconfig1
-rw-r--r--arch/um/kernel/process.c4
-rw-r--r--arch/unicore32/kernel/process.c7
-rw-r--r--arch/x86/Kconfig2
-rw-r--r--arch/x86/configs/i386_defconfig3
-rw-r--r--arch/x86/configs/x86_64_defconfig3
-rw-r--r--arch/x86/include/asm/pgtable.h1
-rw-r--r--arch/x86/include/asm/uaccess.h5
-rw-r--r--arch/x86/include/asm/uaccess_64.h7
-rw-r--r--arch/x86/kernel/apic/hw_nmi.c1
-rw-r--r--arch/x86/kernel/machine_kexec_64.c46
-rw-r--r--arch/x86/kernel/process.c5
-rw-r--r--arch/x86/mm/hugetlbpage.c1
-rw-r--r--arch/x86/mm/numa.c4
-rw-r--r--arch/xtensa/Kconfig1
-rw-r--r--arch/xtensa/configs/audio_kc705_defconfig2
-rw-r--r--arch/xtensa/configs/generic_kc705_defconfig3
-rw-r--r--arch/xtensa/configs/nommu_kc705_defconfig2
-rw-r--r--arch/xtensa/configs/smp_lx200_defconfig3
-rw-r--r--arch/xtensa/kernel/process.c4
-rw-r--r--block/genhd.c2
-rw-r--r--block/partitions/ldm.c60
-rw-r--r--drivers/base/memory.c28
-rw-r--r--drivers/block/aoe/aoecmd.c2
-rw-r--r--drivers/block/zram/zcomp.c300
-rw-r--r--drivers/block/zram/zcomp.h14
-rw-r--r--drivers/block/zram/zram_drv.c83
-rw-r--r--drivers/block/zram/zram_drv.h1
-rw-r--r--drivers/char/random.c21
-rw-r--r--drivers/hwspinlock/hwspinlock_core.c2
-rw-r--r--drivers/hwtracing/intel_th/msu.c2
-rw-r--r--drivers/media/usb/dvb-usb/dib0700_core.c2
-rw-r--r--drivers/memstick/host/rtsx_usb_ms.c2
-rw-r--r--drivers/net/ethernet/cavium/thunder/nicvf_queues.c2
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_rx.c20
-rw-r--r--drivers/net/ethernet/qlogic/qede/qede_main.c6
-rw-r--r--drivers/platform/x86/wmi.c104
-rw-r--r--drivers/scsi/bfa/bfi.h2
-rw-r--r--drivers/staging/comedi/drivers/daqboard2000.c2
-rw-r--r--fs/befs/datastream.c8
-rw-r--r--fs/befs/io.c4
-rw-r--r--fs/befs/linuxvfs.c8
-rw-r--r--fs/block_dev.c1
-rw-r--r--fs/btrfs/volumes.c2
-rw-r--r--fs/buffer.c10
-rw-r--r--fs/dax.c43
-rw-r--r--fs/efivarfs/inode.c40
-rw-r--r--fs/eventpoll.c12
-rw-r--r--fs/exec.c9
-rw-r--r--fs/ext2/file.c1
-rw-r--r--fs/ext4/file.c1
-rw-r--r--fs/ext4/fsync.c5
-rw-r--r--fs/ext4/ioctl.c2
-rw-r--r--fs/f2fs/file.c2
-rw-r--r--fs/fs-writeback.c3
-rw-r--r--fs/nilfs2/alloc.c39
-rw-r--r--fs/nilfs2/alloc.h11
-rw-r--r--fs/nilfs2/bmap.c10
-rw-r--r--fs/nilfs2/bmap.h22
-rw-r--r--fs/nilfs2/btnode.c9
-rw-r--r--fs/nilfs2/btnode.h8
-rw-r--r--fs/nilfs2/btree.c21
-rw-r--r--fs/nilfs2/btree.h6
-rw-r--r--fs/nilfs2/cpfile.c22
-rw-r--r--fs/nilfs2/cpfile.h10
-rw-r--r--fs/nilfs2/dat.c8
-rw-r--r--fs/nilfs2/dat.h8
-rw-r--r--fs/nilfs2/dir.c59
-rw-r--r--fs/nilfs2/direct.c17
-rw-r--r--fs/nilfs2/direct.h6
-rw-r--r--fs/nilfs2/export.h2
-rw-r--r--fs/nilfs2/file.c7
-rw-r--r--fs/nilfs2/gcinode.c9
-rw-r--r--fs/nilfs2/ifile.c14
-rw-r--r--fs/nilfs2/ifile.h9
-rw-r--r--fs/nilfs2/inode.c105
-rw-r--r--fs/nilfs2/ioctl.c17
-rw-r--r--fs/nilfs2/mdt.c63
-rw-r--r--fs/nilfs2/mdt.h21
-rw-r--r--fs/nilfs2/namei.c12
-rw-r--r--fs/nilfs2/nilfs.h33
-rw-r--r--fs/nilfs2/page.c15
-rw-r--r--fs/nilfs2/page.h10
-rw-r--r--fs/nilfs2/recovery.c25
-rw-r--r--fs/nilfs2/segbuf.c10
-rw-r--r--fs/nilfs2/segbuf.h11
-rw-r--r--fs/nilfs2/segment.c125
-rw-r--r--fs/nilfs2/segment.h44
-rw-r--r--fs/nilfs2/sufile.c14
-rw-r--r--fs/nilfs2/sufile.h10
-rw-r--r--fs/nilfs2/super.c21
-rw-r--r--fs/nilfs2/sysfs.c10
-rw-r--r--fs/nilfs2/sysfs.h6
-rw-r--r--fs/nilfs2/the_nilfs.c16
-rw-r--r--fs/nilfs2/the_nilfs.h27
-rw-r--r--fs/notify/fsnotify.h7
-rw-r--r--fs/notify/group.c17
-rw-r--r--fs/notify/mark.c78
-rw-r--r--fs/ocfs2/alloc.c3
-rw-r--r--fs/ocfs2/cluster/heartbeat.c185
-rw-r--r--fs/ocfs2/ocfs2_fs.h2
-rw-r--r--fs/ocfs2/slot_map.c6
-rw-r--r--fs/proc/array.c20
-rw-r--r--fs/proc/base.c42
-rw-r--r--fs/proc/page.c2
-rw-r--r--fs/ramfs/file-nommu.c8
-rw-r--r--fs/reiserfs/objectid.c2
-rw-r--r--fs/select.c67
-rw-r--r--fs/ubifs/sb.c2
-rw-r--r--fs/xfs/xfs_file.c1
-rw-r--r--include/asm-generic/pgtable.h8
-rw-r--r--include/linux/bootmem.h16
-rw-r--r--include/linux/compaction.h151
-rw-r--r--include/linux/compiler-gcc.h1
-rw-r--r--include/linux/compiler.h4
-rw-r--r--include/linux/cpuset.h42
-rw-r--r--include/linux/crc64_ecma.h56
-rw-r--r--include/linux/dax.h3
-rw-r--r--include/linux/debugobjects.h17
-rw-r--r--include/linux/device.h12
-rw-r--r--include/linux/efi.h14
-rw-r--r--include/linux/fsnotify_backend.h2
-rw-r--r--include/linux/genhd.h23
-rw-r--r--include/linux/hardirq.h2
-rw-r--r--include/linux/huge_mm.h4
-rw-r--r--include/linux/hugetlb.h11
-rw-r--r--include/linux/hugetlb_cgroup.h4
-rw-r--r--include/linux/hugetlb_inline.h6
-rw-r--r--include/linux/kasan-checks.h12
-rw-r--r--include/linux/kasan.h13
-rw-r--r--include/linux/kernel.h4
-rw-r--r--include/linux/kexec.h6
-rw-r--r--include/linux/memcontrol.h31
-rw-r--r--include/linux/memory_hotplug.h8
-rw-r--r--include/linux/mempolicy.h16
-rw-r--r--include/linux/mempool.h3
-rw-r--r--include/linux/mm.h35
-rw-r--r--include/linux/mm_inline.h24
-rw-r--r--include/linux/mm_types.h16
-rw-r--r--include/linux/mmzone.h51
-rw-r--r--include/linux/nilfs2_fs.h79
-rw-r--r--include/linux/nodemask.h11
-rw-r--r--include/linux/oom.h17
-rw-r--r--include/linux/padata.h5
-rw-r--r--include/linux/page-flags.h25
-rw-r--r--include/linux/page_ref.h26
-rw-r--r--include/linux/pagemap.h32
-rw-r--r--include/linux/percpu.h3
-rw-r--r--include/linux/poll.h11
-rw-r--r--include/linux/printk.h14
-rw-r--r--include/linux/radix-tree.h171
-rw-r--r--include/linux/random.h1
-rw-r--r--include/linux/sched.h21
-rw-r--r--include/linux/sem.h1
-rw-r--r--include/linux/slab.h16
-rw-r--r--include/linux/slab_def.h4
-rw-r--r--include/linux/string.h2
-rw-r--r--include/linux/swap.h1
-rw-r--r--include/linux/syscalls.h8
-rw-r--r--include/linux/time64.h17
-rw-r--r--include/linux/timekeeping.h1
-rw-r--r--include/linux/types.h1
-rw-r--r--include/linux/uuid.h21
-rw-r--r--include/linux/vmalloc.h3
-rw-r--r--include/linux/vmstat.h12
-rw-r--r--include/linux/zsmalloc.h4
-rw-r--r--include/trace/events/compaction.h3
-rw-r--r--include/trace/events/huge_memory.h38
-rw-r--r--include/uapi/linux/uuid.h4
-rw-r--r--init/Kconfig36
-rw-r--r--init/main.c10
-rw-r--r--ipc/msg.c5
-rw-r--r--ipc/sem.c124
-rw-r--r--kernel/cpuset.c22
-rw-r--r--kernel/exit.c34
-rw-r--r--kernel/fork.c54
-rw-r--r--kernel/futex.c2
-rw-r--r--kernel/irq/irqdomain.c7
-rw-r--r--kernel/kexec.c109
-rw-r--r--kernel/kexec_core.c12
-rw-r--r--kernel/kexec_file.c8
-rw-r--r--kernel/padata.c138
-rw-r--r--kernel/panic.c6
-rw-r--r--kernel/printk/Makefile1
-rw-r--r--kernel/printk/internal.h57
-rw-r--r--kernel/printk/nmi.c260
-rw-r--r--kernel/printk/printk.c107
-rw-r--r--kernel/rcu/update.c26
-rw-r--r--kernel/signal.c10
-rw-r--r--kernel/sysctl.c7
-rw-r--r--kernel/sysctl_binary.c23
-rw-r--r--kernel/time/hrtimer.c23
-rw-r--r--kernel/time/time.c21
-rw-r--r--kernel/time/timekeeping.c13
-rw-r--r--kernel/time/timer.c63
-rw-r--r--kernel/workqueue.c52
-rw-r--r--lib/Kconfig10
-rw-r--r--lib/Kconfig.debug8
-rw-r--r--lib/Makefile3
-rw-r--r--lib/crc64_ecma.c341
-rw-r--r--lib/debugobjects.c92
-rw-r--r--lib/gcd.c77
-rw-r--r--lib/nmi_backtrace.c89
-rw-r--r--lib/nodemask.c30
-rw-r--r--lib/percpu_counter.c6
-rw-r--r--lib/radix-tree.c933
-rw-r--r--lib/strncpy_from_user.c2
-rw-r--r--lib/test_kasan.c98
-rw-r--r--lib/uuid.c91
-rw-r--r--lib/vsprintf.c21
-rw-r--r--mm/Kconfig66
-rw-r--r--mm/Makefile1
-rw-r--r--mm/backing-dev.c20
-rw-r--r--mm/compaction.c258
-rw-r--r--mm/filemap.c33
-rw-r--r--mm/highmem.c12
-rw-r--r--mm/huge_memory.c126
-rw-r--r--mm/hugetlb.c37
-rw-r--r--mm/hugetlb_cgroup.c35
-rw-r--r--mm/internal.h14
-rw-r--r--mm/kasan/Makefile1
-rw-r--r--mm/kasan/kasan.c133
-rw-r--r--mm/kasan/kasan.h21
-rw-r--r--mm/kasan/quarantine.c291
-rw-r--r--mm/kasan/report.c1
-rw-r--r--mm/memblock.c33
-rw-r--r--mm/memcontrol.c41
-rw-r--r--mm/memory-failure.c72
-rw-r--r--mm/memory.c7
-rw-r--r--mm/memory_hotplug.c91
-rw-r--r--mm/mempolicy.c63
-rw-r--r--mm/mempool.c2
-rw-r--r--mm/migrate.c6
-rw-r--r--mm/mmap.c17
-rw-r--r--mm/mmzone.c2
-rw-r--r--mm/mremap.c47
-rw-r--r--mm/oom_kill.c129
-rw-r--r--mm/page-writeback.c8
-rw-r--r--mm/page_alloc.c1233
-rw-r--r--mm/page_isolation.c10
-rw-r--r--mm/page_owner.c5
-rw-r--r--mm/page_poison.c8
-rw-r--r--mm/rmap.c4
-rw-r--r--mm/shmem.c130
-rw-r--r--mm/slab.c785
-rw-r--r--mm/slab.h2
-rw-r--r--mm/slab_common.c2
-rw-r--r--mm/slub.c11
-rw-r--r--mm/swap.c5
-rw-r--r--mm/swap_state.c3
-rw-r--r--mm/util.c29
-rw-r--r--mm/vmalloc.c39
-rw-r--r--mm/vmscan.c164
-rw-r--r--mm/vmstat.c171
-rw-r--r--mm/z3fold.c792
-rw-r--r--mm/zsmalloc.c175
-rw-r--r--mm/zswap.c12
-rw-r--r--net/socket.c8
-rw-r--r--net/wireless/util.c2
-rw-r--r--samples/kprobes/jprobe_example.c2
-rwxr-xr-xscripts/checkpatch.pl142
-rwxr-xr-xscripts/decode_stacktrace.sh55
-rw-r--r--scripts/gdb/linux/Makefile12
-rw-r--r--scripts/gdb/linux/constants.py.in59
-rw-r--r--scripts/gdb/linux/cpus.py38
-rw-r--r--scripts/gdb/linux/dmesg.py11
-rw-r--r--scripts/gdb/linux/lists.py21
-rw-r--r--scripts/gdb/linux/modules.py24
-rw-r--r--scripts/gdb/linux/proc.py156
-rw-r--r--scripts/gdb/linux/radixtree.py97
-rw-r--r--scripts/gdb/linux/tasks.py19
-rw-r--r--scripts/gdb/linux/utils.py32
-rw-r--r--scripts/gdb/vmlinux-gdb.py2
-rw-r--r--scripts/spelling.txt1
-rw-r--r--security/integrity/ima/ima_policy.c14
-rw-r--r--tools/testing/radix-tree/Makefile4
-rw-r--r--tools/testing/radix-tree/generated/autoconf.h3
-rw-r--r--tools/testing/radix-tree/linux/init.h1
-rw-r--r--tools/testing/radix-tree/linux/kernel.h15
-rw-r--r--tools/testing/radix-tree/linux/slab.h1
-rw-r--r--tools/testing/radix-tree/linux/types.h7
-rw-r--r--tools/testing/radix-tree/main.c84
-rw-r--r--tools/testing/radix-tree/multiorder.c337
-rw-r--r--tools/testing/radix-tree/regression2.c7
-rw-r--r--tools/testing/radix-tree/tag_check.c10
-rw-r--r--tools/testing/radix-tree/test.c49
-rw-r--r--tools/testing/radix-tree/test.h11
-rw-r--r--tools/testing/selftests/rcutorture/configs/lock/CFcommon2
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/CFcommon2
461 files changed, 8898 insertions, 4614 deletions
diff --git a/Documentation/DocBook/debugobjects.tmpl b/Documentation/DocBook/debugobjects.tmpl
index 24979f691e3e..7e4f34fde697 100644
--- a/Documentation/DocBook/debugobjects.tmpl
+++ b/Documentation/DocBook/debugobjects.tmpl
@@ -316,8 +316,8 @@
</itemizedlist>
</para>
<para>
- The function returns 1 when the fixup was successful,
- otherwise 0. The return value is used to update the
+ The function returns true when the fixup was successful,
+ otherwise false. The return value is used to update the
statistics.
</para>
<para>
@@ -341,8 +341,8 @@
</itemizedlist>
</para>
<para>
- The function returns 1 when the fixup was successful,
- otherwise 0. The return value is used to update the
+ The function returns true when the fixup was successful,
+ otherwise false. The return value is used to update the
statistics.
</para>
<para>
@@ -359,7 +359,8 @@
statically initialized object or not. In case it is it calls
debug_object_init() and debug_object_activate() to make the
object known to the tracker and marked active. In this case
- the function should return 0 because this is not a real fixup.
+ the function should return false because this is not a real
+ fixup.
</para>
</sect1>
@@ -376,8 +377,8 @@
</itemizedlist>
</para>
<para>
- The function returns 1 when the fixup was successful,
- otherwise 0. The return value is used to update the
+ The function returns true when the fixup was successful,
+ otherwise false. The return value is used to update the
statistics.
</para>
</sect1>
@@ -397,8 +398,8 @@
</itemizedlist>
</para>
<para>
- The function returns 1 when the fixup was successful,
- otherwise 0. The return value is used to update the
+ The function returns true when the fixup was successful,
+ otherwise false. The return value is used to update the
statistics.
</para>
</sect1>
@@ -414,8 +415,8 @@
debug bucket.
</para>
<para>
- The function returns 1 when the fixup was successful,
- otherwise 0. The return value is used to update the
+ The function returns true when the fixup was successful,
+ otherwise false. The return value is used to update the
statistics.
</para>
<para>
@@ -427,7 +428,8 @@
case. The fixup function should check if this is a legitimate
case of a statically initialized object or not. In this case only
debug_object_init() should be called to make the object known to
- the tracker. Then the function should return 0 because this is not
+ the tracker. Then the function should return false because this
+ is not
a real fixup.
</para>
</sect1>
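
The debugobjects changes above switch every fixup callback from returning 1/0 to returning true/false. As a reference point, a callback under the new convention looks roughly like the sketch below; the demo_obj type, descriptor name and wiring are illustrative, only the bool return convention and the static-initialization handling follow the documentation in this patch.

    #include <linux/debugobjects.h>

    struct demo_obj { int dummy; };                 /* illustrative object type */

    static struct debug_obj_descr demo_debug_descr;

    /* Activate-path fixup under the new bool return convention. */
    static bool demo_fixup_activate(void *addr, enum debug_obj_state state)
    {
            struct demo_obj *obj = addr;

            switch (state) {
            case ODEBUG_STATE_NOTAVAILABLE:
                    /*
                     * Probably a statically initialized object: make it known
                     * to the tracker and mark it active, then return false
                     * because this is not a real fixup.
                     */
                    debug_object_init(obj, &demo_debug_descr);
                    debug_object_activate(obj, &demo_debug_descr);
                    return false;
            default:
                    /* Nothing was fixed up. */
                    return false;
            }
    }

    static struct debug_obj_descr demo_debug_descr = {
            .name           = "demo_obj",
            .fixup_activate = demo_fixup_activate,
    };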
diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
index 5bda5031c83d..d88f0c70cd7f 100644
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
@@ -59,27 +59,16 @@ num_devices parameter is optional and tells zram how many devices should be
pre-created. Default: 1.
2) Set max number of compression streams
- Compression backend may use up to max_comp_streams compression streams,
- thus allowing up to max_comp_streams concurrent compression operations.
- By default, compression backend uses single compression stream.
-
- Examples:
- #show max compression streams number
+ Regardless of the value passed to this attribute, ZRAM will always
+ allocate multiple compression streams - one per online CPU - thus
+ allowing several concurrent compression operations. The number of
+ allocated compression streams goes down when some of the CPUs
+ become offline. There is no single-compression-stream mode anymore,
+ unless you are running a UP system or have only 1 CPU online.
+
+ To find out how many streams are currently available:
cat /sys/block/zram0/max_comp_streams
- #set max compression streams number to 3
- echo 3 > /sys/block/zram0/max_comp_streams
-
-Note:
-In order to enable compression backend's multi stream support max_comp_streams
-must be initially set to desired concurrency level before ZRAM device
-initialisation. Once the device initialised as a single stream compression
-backend (max_comp_streams equals to 1), you will see error if you try to change
-the value of max_comp_streams because single stream compression backend
-implemented as a special case by lock overhead issue and does not support
-dynamic max_comp_streams. Only multi stream backend supports dynamic
-max_comp_streams adjustment.
-
3) Select compression algorithm
Using comp_algorithm device attribute one can see available and
currently selected (shown in square brackets) compression algorithms,
diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt
index 41c3d332acc9..5b21ef76f751 100644
--- a/Documentation/filesystems/nilfs2.txt
+++ b/Documentation/filesystems/nilfs2.txt
@@ -268,3 +268,8 @@ among NILFS2 files can be depicted as follows:
( regular file, directory, or symlink )
For detail on the format of each file, please see include/linux/nilfs2_fs.h.
+
+There are no patents or other intellectual property that we protect
+with regard to the design of NILFS2. It is allowed to replicate the
+design in hopes that other operating systems could share (mount, read,
+write, etc.) data stored in this format.
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 7f5607a089b4..e8d00759bfa5 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -225,6 +225,7 @@ Table 1-2: Contents of the status files (as of 4.1)
TracerPid PID of process tracing this process (0 if not)
Uid Real, effective, saved set, and file system UIDs
Gid Real, effective, saved set, and file system GIDs
+ Umask file mode creation mask
FDSize number of file descriptor slots currently allocated
Groups supplementary group list
NStgid descendant namespace thread group ID hierarchy
diff --git a/Documentation/gdb-kernel-debugging.txt b/Documentation/gdb-kernel-debugging.txt
index 7050ce8794b9..4ab7d43d0754 100644
--- a/Documentation/gdb-kernel-debugging.txt
+++ b/Documentation/gdb-kernel-debugging.txt
@@ -139,6 +139,27 @@ Examples of using the Linux-provided gdb helpers
start_comm = "swapper/2\000\000\000\000\000\000"
}
+ o Dig into a radix tree data structure, such as the IRQ descriptors:
+ (gdb) print (struct irq_desc)$lx_radix_tree_lookup(irq_desc_tree, 18)
+ $6 = {
+ irq_common_data = {
+ state_use_accessors = 67584,
+ handler_data = 0x0 <__vectors_start>,
+ msi_desc = 0x0 <__vectors_start>,
+ affinity = {{
+ bits = {65535}
+ }}
+ },
+ irq_data = {
+ mask = 0,
+ irq = 18,
+ hwirq = 27,
+ common = 0xee803d80,
+ chip = 0xc0eb0854 <gic_data>,
+ domain = 0xee808000,
+ parent_data = 0x0 <__vectors_start>,
+ chip_data = 0xc0eb0854 <gic_data>
+ } <... trimmed ...>
List of commands and functions
------------------------------
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index e3faa1bc8be9..5349363b603c 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2174,6 +2174,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
[KNL,SH] Allow user to override the default size for
per-device physically contiguous DMA buffers.
+ memhp_default_state=online/offline
+ [KNL] Set the initial state for the memory hotplug
+ onlining policy. If not specified, the default value is
+ set according to the
+ CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel config
+ option.
+ See Documentation/memory-hotplug.txt.
+
memmap=exactmap [KNL,X86] Enable setting of an exact
E820 memory map, as specified by the user.
Such memmap=exactmap lines can be constructed based on
@@ -3142,8 +3150,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
Format: <bool> (1/Y/y=enable, 0/N/n=disable)
default: disabled
- printk.time= Show timing data prefixed to each printk message line
- Format: <bool> (1/Y/y=enable, 0/N/n=disable)
+ printk.time= Show timestamp prefixed to each printk message line
+ Format: <string>
+ (0/N/n = disable, 1/Y/y = local clock,
+ 2 = monotonic clock, 3 = real clock)
processor.max_cstate= [HW,ACPI]
Limit processor to maximum C-state
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
index 443f4b44ad97..0d7cb955aa01 100644
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -261,10 +261,11 @@ it according to the policy which can be read from "auto_online_blocks" file:
% cat /sys/devices/system/memory/auto_online_blocks
-The default is "offline" which means the newly added memory is not in a
-ready-to-use state and you have to "online" the newly added memory blocks
-manually. Automatic onlining can be requested by writing "online" to
-"auto_online_blocks" file:
+The default depends on the CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel config
+option. If it is disabled the default is "offline" which means the newly added
+memory is not in a ready-to-use state and you have to "online" the newly added
+memory blocks manually. Automatic onlining can be requested by writing "online"
+to "auto_online_blocks" file:
% echo online > /sys/devices/system/memory/auto_online_blocks
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 34a5fece3121..720355cbdf45 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -57,6 +57,7 @@ Currently, these files are in /proc/sys/vm:
- panic_on_oom
- percpu_pagelist_fraction
- stat_interval
+- stat_refresh
- swappiness
- user_reserve_kbytes
- vfs_cache_pressure
@@ -755,6 +756,19 @@ is 1 second.
==============================================================
+stat_refresh
+
+Any read or write (by root only) flushes all the per-cpu vm statistics
+into their global totals, for more accurate reports when testing
+e.g. cat /proc/sys/vm/stat_refresh /proc/meminfo
+
+As a side-effect, it also checks for negative totals (elsewhere reported
+as 0) and "fails" with EINVAL if any are found, with a warning in dmesg.
+(At time of writing, a few stats are known sometimes to be found negative,
+with no ill effects: errors and warnings on these stats are suppressed.)
+
+==============================================================
+
swappiness
This control is used to define how aggressive the kernel will swap
diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
index d9cb65cf5cfd..7c871d6beb63 100644
--- a/Documentation/vm/transhuge.txt
+++ b/Documentation/vm/transhuge.txt
@@ -340,7 +340,7 @@ unaffected. libhugetlbfs will also work fine as usual.
== Graceful fallback ==
-Code walking pagetables but unware about huge pmds can simply call
+Code walking pagetables but unaware about huge pmds can simply call
split_huge_pmd(vma, pmd, addr) where the pmd is the one returned by
pmd_offset. It's trivial to make the code transparent hugepage aware
by just grepping for "pmd_offset" and adding split_huge_pmd where
@@ -394,9 +394,9 @@ hugepage natively. Once finished you can drop the page table lock.
Refcounting on THP is mostly consistent with refcounting on other compound
pages:
- - get_page()/put_page() and GUP operate in head page's ->_count.
+ - get_page()/put_page() and GUP operate in head page's ->_refcount.
- - ->_count in tail pages is always zero: get_page_unless_zero() never
+ - ->_refcount in tail pages is always zero: get_page_unless_zero() never
succeed on tail pages.
- map/unmap of the pages with PTE entry increment/decrement ->_mapcount
@@ -414,7 +414,7 @@ tracking. The alternative is alter ->_mapcount in all subpages on each
map/unmap of the whole compound page.
We set PG_double_map when a PMD of the page got split for the first time,
-but still have PMD mapping. The addtional references go away with last
+but still have PMD mapping. The additional references go away with last
compound_mapcount.
split_huge_page internally has to distribute the refcounts in the head
@@ -426,16 +426,16 @@ requests to split pinned huge page: it expects page count to be equal to
sum of mapcount of all sub-pages plus one (split_huge_page caller must
have reference for head page).
-split_huge_page uses migration entries to stabilize page->_count and
+split_huge_page uses migration entries to stabilize page->_refcount and
page->_mapcount.
We safe against physical memory scanners too: the only legitimate way
scanner can get reference to a page is get_page_unless_zero().
-All tail pages has zero ->_count until atomic_add(). It prevent scanner
-from geting reference to tail page up to the point. After the atomic_add()
-we don't care about ->_count value. We already known how many references
-with should uncharge from head page.
+All tail pages have zero ->_refcount until atomic_add(). This prevents the
+scanner from getting a reference to the tail page up to that point. After the
+atomic_add() we don't care about the ->_refcount value. We already know how
+many references should be uncharged from the head page.
For head page get_page_unless_zero() will succeed and we don't mind. It's
clear where reference should go after split: it will stay on head page.
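
The guarantee described above rests on get_page_unless_zero(); a simplified sketch of its behaviour, assuming the page_ref helpers of this kernel generation (the helper itself is not part of this patch):

    #include <linux/page_ref.h>

    /*
     * Take a reference only if ->_refcount is already non-zero.  Tail pages
     * of a THP keep ->_refcount == 0 until split_huge_page()'s atomic_add(),
     * so a physical-memory scanner can never pin them through this path;
     * the head page can be pinned, and the reference stays on the head page
     * after the split.
     */
    static inline int get_page_unless_zero(struct page *page)
    {
            return page_ref_add_unless(page, 1, 0);
    }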
diff --git a/Documentation/vm/z3fold.txt b/Documentation/vm/z3fold.txt
new file mode 100644
index 000000000000..3afff6e551b0
--- /dev/null
+++ b/Documentation/vm/z3fold.txt
@@ -0,0 +1,27 @@
+z3fold
+------
+
+z3fold is a special purpose allocator for storing compressed pages.
+It is designed to store up to three compressed pages per physical page.
+It is a zbud derivative which allows for a higher compression
+ratio while keeping the simplicity and determinism of its predecessor.
+
+The main differences between z3fold and zbud are:
+* unlike zbud, z3fold allows for up to PAGE_SIZE allocations
+* z3fold can hold up to 3 compressed pages in its page
+* z3fold doesn't export any API itself and is thus intended to be used
+ via the zpool API.
+
+To keep the determinism and simplicity, z3fold, just like zbud, always
+stores an integral number of compressed pages per page, but it can store
+up to 3 pages, unlike zbud which can store at most 2. Therefore the
+compression ratio goes to around 2.7x while zbud's is around 1.7x.
+
+Unlike zbud (but like zsmalloc for that matter) z3fold_alloc() does not
+return a dereferenceable pointer. Instead, it returns an unsigned long
+handle which encodes the actual location of the allocated object.
+
+Keeping the effective compression ratio close to zsmalloc's, z3fold doesn't
+depend on the MMU being enabled and provides more predictable reclaim
+behavior, which makes it a better fit for small and response-critical systems.
+
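
Because z3fold is reached only through zpool, client code looks roughly like the sketch below. The zpool call signatures are assumed from this kernel generation and the "demo" names are illustrative; error handling is trimmed to the essentials.

    #include <linux/zpool.h>
    #include <linux/gfp.h>
    #include <linux/string.h>
    #include <linux/errno.h>

    /* Store one compressed buffer (len must be <= PAGE_SIZE for z3fold). */
    static int demo_store(const void *src, size_t len)
    {
            struct zpool *pool;
            unsigned long handle;
            void *dst;

            /* Ask zpool for a z3fold-backed pool. */
            pool = zpool_create_pool("z3fold", "demo", GFP_KERNEL, NULL);
            if (!pool)
                    return -ENOMEM;

            /* Handles are opaque unsigned longs, not dereferenceable pointers. */
            if (zpool_malloc(pool, len, GFP_KERNEL, &handle)) {
                    zpool_destroy_pool(pool);
                    return -ENOMEM;
            }

            /* Map the handle to copy the data in, then unmap and clean up. */
            dst = zpool_map_handle(pool, handle, ZPOOL_MM_WO);
            memcpy(dst, src, len);
            zpool_unmap_handle(pool, handle);

            zpool_free(pool, handle);
            zpool_destroy_pool(pool);
            return 0;
    }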
diff --git a/Kbuild b/Kbuild
index f55cefd9bf29..3d0ae152af7c 100644
--- a/Kbuild
+++ b/Kbuild
@@ -5,6 +5,7 @@
# 2) Generate timeconst.h
# 3) Generate asm-offsets.h (may need bounds.h and timeconst.h)
# 4) Check for missing system calls
+# 5) Generate constants.py (may need bounds.h)
# Default sed regexp - multiline due to syntax constraints
define sed-y
@@ -96,5 +97,14 @@ quiet_cmd_syscalls = CALL $<
missing-syscalls: scripts/checksyscalls.sh $(offsets-file) FORCE
$(call cmd,syscalls)
+#####
+# 5) Generate constants for Python GDB integration
+#
+
+extra-$(CONFIG_GDB_SCRIPTS) += build_constants_py
+
+build_constants_py: $(obj)/$(timeconst-file) $(obj)/$(bounds-file)
+ @$(MAKE) $(build)=scripts/gdb/linux $@
+
# Keep these three files during make clean
no-clean-files := $(bounds-file) $(offsets-file) $(timeconst-file)
diff --git a/MAINTAINERS b/MAINTAINERS
index 8f2850db5b51..44599eb2856c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4993,6 +4993,7 @@ F: drivers/scsi/gdt*
GDB KERNEL DEBUGGING HELPER SCRIPTS
M: Jan Kiszka <jan.kiszka@siemens.com>
+M: Kieran Bingham <kieran@bingham.xyz>
S: Supported
F: scripts/gdb/
@@ -6398,7 +6399,7 @@ S: Maintained
F: arch/*/include/asm/kasan.h
F: arch/*/mm/kasan_init*
F: Documentation/kasan.txt
-F: include/linux/kasan.h
+F: include/linux/kasan*.h
F: lib/test_kasan.c
F: mm/kasan/
F: scripts/Makefile.kasan
@@ -8038,6 +8039,7 @@ NILFS2 FILESYSTEM
M: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
L: linux-nilfs@vger.kernel.org
W: http://nilfs.sourceforge.net/
+W: http://nilfs.osdn.jp/
T: git git://github.com/konis/nilfs2.git
S: Supported
F: Documentation/filesystems/nilfs2.txt
@@ -8418,7 +8420,6 @@ F: drivers/of/resolver.c
OPENRISC ARCHITECTURE
M: Jonas Bonn <jonas@southpole.se>
W: http://openrisc.net
-L: linux@lists.openrisc.net (moderated for non-subscribers)
S: Maintained
T: git git://openrisc.net/~jonas/linux
F: arch/openrisc/
@@ -8539,7 +8540,6 @@ F: drivers/platform/x86/panasonic-laptop.c
PANASONIC MN10300/AM33/AM34 PORT
M: David Howells <dhowells@redhat.com>
-M: Koichi Yasutake <yasutake.koichi@jp.panasonic.com>
L: linux-am33-list@redhat.com (moderated for non-subscribers)
W: ftp://ftp.redhat.com/pub/redhat/gnupro/AM33/
S: Maintained
@@ -8973,7 +8973,6 @@ F: drivers/pinctrl/pinctrl-single.c
PIN CONTROLLER - ST SPEAR
M: Viresh Kumar <vireshk@kernel.org>
-L: spear-devel@list.st.com
L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
W: http://www.st.com/spear
S: Maintained
@@ -10178,7 +10177,6 @@ F: drivers/mmc/host/sdhci-s3c*
SECURE DIGITAL HOST CONTROLLER INTERFACE (SDHCI) ST SPEAR DRIVER
M: Viresh Kumar <vireshk@kernel.org>
-L: spear-devel@list.st.com
L: linux-mmc@vger.kernel.org
S: Maintained
F: drivers/mmc/host/sdhci-spear.c
@@ -10741,7 +10739,6 @@ F: include/linux/compiler.h
SPEAR PLATFORM SUPPORT
M: Viresh Kumar <vireshk@kernel.org>
M: Shiraz Hashim <shiraz.linux.kernel@gmail.com>
-L: spear-devel@list.st.com
L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
W: http://www.st.com/spear
S: Maintained
@@ -10750,7 +10747,6 @@ F: arch/arm/mach-spear/
SPEAR CLOCK FRAMEWORK SUPPORT
M: Viresh Kumar <vireshk@kernel.org>
-L: spear-devel@list.st.com
L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
W: http://www.st.com/spear
S: Maintained
diff --git a/arch/Kconfig b/arch/Kconfig
index 81869a5e7e17..b16e74e4b5af 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -187,7 +187,11 @@ config HAVE_OPTPROBES
config HAVE_KPROBES_ON_FTRACE
bool
+config HAVE_NMI
+ bool
+
config HAVE_NMI_WATCHDOG
+ depends on HAVE_NMI
bool
#
# An arch should select this if it provides all these things:
@@ -517,6 +521,11 @@ config HAVE_ARCH_MMAP_RND_BITS
- ARCH_MMAP_RND_BITS_MIN
- ARCH_MMAP_RND_BITS_MAX
+config HAVE_EXIT_THREAD
+ bool
+ help
+ An architecture implements exit_thread.
+
config ARCH_MMAP_RND_BITS_MIN
int
@@ -638,4 +647,7 @@ config COMPAT_OLD_SIGACTION
config ARCH_NO_COHERENT_DMA_MMAP
bool
+config CPU_NO_EFFICIENT_FFS
+ def_bool n
+
source "kernel/gcov/Kconfig"
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index fe99f894e57d..7f312d80b43b 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -26,6 +26,7 @@ config ALPHA
select MODULES_USE_ELF_RELA
select ODD_RT_SIGACTION
select OLD_SIGSUSPEND
+ select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67
help
The Alpha is a 64-bit general-purpose processor designed and
marketed by the Digital Equipment Corporation of blessed memory,
diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index 84d13263ce46..b483156698d5 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -210,14 +210,6 @@ start_thread(struct pt_regs * regs, unsigned long pc, unsigned long sp)
}
EXPORT_SYMBOL(start_thread);
-/*
- * Free current thread data structures etc..
- */
-void
-exit_thread(void)
-{
-}
-
void
flush_thread(void)
{
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index 8894f7e7e3de..0dcbacfdea4b 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -107,6 +107,7 @@ choice
config ISA_ARCOMPACT
bool "ARCompact ISA"
+ select CPU_NO_EFFICIENT_FFS
help
The original ARC ISA of ARC600/700 cores
diff --git a/arch/arc/include/asm/hugepage.h b/arch/arc/include/asm/hugepage.h
index 7afe3356b770..317ff773e1ca 100644
--- a/arch/arc/include/asm/hugepage.h
+++ b/arch/arc/include/asm/hugepage.h
@@ -61,8 +61,6 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd);
-#define has_transparent_hugepage() 1
-
/* Generic variants assume pgtable_t is struct page *, hence need for these */
#define __HAVE_ARCH_PGTABLE_DEPOSIT
extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c
index a3f750e76b68..b5db9e7fd649 100644
--- a/arch/arc/kernel/process.c
+++ b/arch/arc/kernel/process.c
@@ -183,13 +183,6 @@ void flush_thread(void)
{
}
-/*
- * Free any architecture-specific thread data structures, etc.
- */
-void exit_thread(void)
-{
-}
-
int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu)
{
return 0;
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index b99d25b4133e..90542db1220d 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -50,6 +50,7 @@ config ARM
select HAVE_DMA_CONTIGUOUS if MMU
select HAVE_DYNAMIC_FTRACE if (!XIP_KERNEL) && !CPU_ENDIAN_BE32 && MMU
select HAVE_EFFICIENT_UNALIGNED_ACCESS if (CPU_V6 || CPU_V6K || CPU_V7) && MMU
+ select HAVE_EXIT_THREAD
select HAVE_FTRACE_MCOUNT_RECORD if (!XIP_KERNEL)
select HAVE_FUNCTION_GRAPH_TRACER if (!THUMB2_KERNEL)
select HAVE_FUNCTION_TRACER if (!XIP_KERNEL)
@@ -66,6 +67,7 @@ config ARM
select HAVE_KRETPROBES if (HAVE_KPROBES)
select HAVE_MEMBLOCK
select HAVE_MOD_ARCH_SPECIFIC
+ select HAVE_NMI
select HAVE_OPROFILE if (HAVE_PERF_EVENTS)
select HAVE_OPTPROBES if !THUMB2_KERNEL
select HAVE_PERF_EVENTS
diff --git a/arch/arm/configs/axm55xx_defconfig b/arch/arm/configs/axm55xx_defconfig
index d3260d7d5af1..55eba3576030 100644
--- a/arch/arm/configs/axm55xx_defconfig
+++ b/arch/arm/configs/axm55xx_defconfig
@@ -232,7 +232,7 @@ CONFIG_NFS_FSCACHE=y
CONFIG_SUNRPC_DEBUG=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_INFO=y
CONFIG_DEBUG_FS=y
CONFIG_MAGIC_SYSRQ=y
diff --git a/arch/arm/configs/bcm2835_defconfig b/arch/arm/configs/bcm2835_defconfig
index 79de828e49ad..cc55e4252fda 100644
--- a/arch/arm/configs/bcm2835_defconfig
+++ b/arch/arm/configs/bcm2835_defconfig
@@ -130,7 +130,7 @@ CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ASCII=y
CONFIG_NLS_ISO8859_1=y
CONFIG_NLS_UTF8=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_BOOT_PRINTK_DELAY=y
CONFIG_DYNAMIC_DEBUG=y
CONFIG_DEBUG_INFO=y
diff --git a/arch/arm/configs/bcm_defconfig b/arch/arm/configs/bcm_defconfig
index 7117662bab2e..7e0797eae75c 100644
--- a/arch/arm/configs/bcm_defconfig
+++ b/arch/arm/configs/bcm_defconfig
@@ -12,7 +12,6 @@ CONFIG_CGROUPS=y
CONFIG_CGROUP_FREEZER=y
CONFIG_CGROUP_DEVICE=y
CONFIG_CGROUP_CPUACCT=y
-CONFIG_RESOURCE_COUNTERS=y
CONFIG_CGROUP_SCHED=y
CONFIG_BLK_CGROUP=y
CONFIG_NAMESPACES=y
@@ -121,7 +120,7 @@ CONFIG_CONFIGFS_FS=y
# CONFIG_MISC_FILESYSTEMS is not set
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_INFO=y
CONFIG_DEBUG_FS=y
CONFIG_MAGIC_SYSRQ=y
diff --git a/arch/arm/configs/colibri_pxa270_defconfig b/arch/arm/configs/colibri_pxa270_defconfig
index 0b9211b2b73b..99b706260084 100644
--- a/arch/arm/configs/colibri_pxa270_defconfig
+++ b/arch/arm/configs/colibri_pxa270_defconfig
@@ -156,7 +156,7 @@ CONFIG_NLS_ASCII=y
CONFIG_NLS_ISO8859_1=m
CONFIG_NLS_ISO8859_15=m
CONFIG_NLS_UTF8=m
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_FS=y
CONFIG_DEBUG_KERNEL=y
diff --git a/arch/arm/configs/colibri_pxa300_defconfig b/arch/arm/configs/colibri_pxa300_defconfig
index be02fe2b14cb..3a0b92b9a86b 100644
--- a/arch/arm/configs/colibri_pxa300_defconfig
+++ b/arch/arm/configs/colibri_pxa300_defconfig
@@ -58,7 +58,7 @@ CONFIG_INOTIFY=y
CONFIG_NFS_FS=y
CONFIG_NFS_V3=y
CONFIG_ROOT_NFS=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_KERNEL=y
CONFIG_DEBUG_INFO=y
# CONFIG_RCU_CPU_STALL_DETECTOR is not set
diff --git a/arch/arm/configs/dove_defconfig b/arch/arm/configs/dove_defconfig
index 701677f9248c..d952d3d911b9 100644
--- a/arch/arm/configs/dove_defconfig
+++ b/arch/arm/configs/dove_defconfig
@@ -119,7 +119,7 @@ CONFIG_NLS_CODEPAGE_850=y
CONFIG_NLS_ISO8859_1=y
CONFIG_NLS_ISO8859_2=y
CONFIG_NLS_UTF8=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_FS=y
# CONFIG_SCHED_DEBUG is not set
diff --git a/arch/arm/configs/efm32_defconfig b/arch/arm/configs/efm32_defconfig
index c0dac0f0f804..306d1566d259 100644
--- a/arch/arm/configs/efm32_defconfig
+++ b/arch/arm/configs/efm32_defconfig
@@ -93,7 +93,7 @@ CONFIG_EXT2_FS=y
CONFIG_ROMFS_FS=y
CONFIG_ROMFS_BACKED_BY_MTD=y
# CONFIG_NETWORK_FILESYSTEMS is not set
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_INFO=y
# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
diff --git a/arch/arm/configs/exynos_defconfig b/arch/arm/configs/exynos_defconfig
index 10f49ab5328e..a8cf295724c5 100644
--- a/arch/arm/configs/exynos_defconfig
+++ b/arch/arm/configs/exynos_defconfig
@@ -240,7 +240,7 @@ CONFIG_ROOT_NFS=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ASCII=y
CONFIG_NLS_ISO8859_1=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_INFO=y
CONFIG_DEBUG_FS=y
CONFIG_MAGIC_SYSRQ=y
diff --git a/arch/arm/configs/ezx_defconfig b/arch/arm/configs/ezx_defconfig
index ea316c4b890e..c4bdecd93207 100644
--- a/arch/arm/configs/ezx_defconfig
+++ b/arch/arm/configs/ezx_defconfig
@@ -379,7 +379,7 @@ CONFIG_NLS_ISO8859_15=m
CONFIG_NLS_KOI8_R=m
CONFIG_NLS_KOI8_U=m
CONFIG_NLS_UTF8=m
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_FS=y
CONFIG_DEBUG_KERNEL=y
# CONFIG_SCHED_DEBUG is not set
diff --git a/arch/arm/configs/h5000_defconfig b/arch/arm/configs/h5000_defconfig
index 37903e3f0efc..e3507cb9835a 100644
--- a/arch/arm/configs/h5000_defconfig
+++ b/arch/arm/configs/h5000_defconfig
@@ -66,7 +66,7 @@ CONFIG_TMPFS=y
CONFIG_JFFS2_FS=y
CONFIG_JFFS2_COMPRESSION_OPTIONS=y
# CONFIG_NETWORK_FILESYSTEMS is not set
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_KERNEL=y
# CONFIG_SCHED_DEBUG is not set
# CONFIG_DEBUG_BUGVERBOSE is not set
diff --git a/arch/arm/configs/hisi_defconfig b/arch/arm/configs/hisi_defconfig
index b2e340b272ee..4dc0a4a26c2f 100644
--- a/arch/arm/configs/hisi_defconfig
+++ b/arch/arm/configs/hisi_defconfig
@@ -82,7 +82,7 @@ CONFIG_NFS_FS=y
CONFIG_NFS_V3_ACL=y
CONFIG_NFS_V4=y
CONFIG_ROOT_NFS=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_FS=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
diff --git a/arch/arm/configs/imote2_defconfig b/arch/arm/configs/imote2_defconfig
index 18e59feaa307..672010d8cbaa 100644
--- a/arch/arm/configs/imote2_defconfig
+++ b/arch/arm/configs/imote2_defconfig
@@ -350,7 +350,7 @@ CONFIG_NLS_ISO8859_15=m
CONFIG_NLS_KOI8_R=m
CONFIG_NLS_KOI8_U=m
CONFIG_NLS_UTF8=m
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_FS=y
CONFIG_DEBUG_KERNEL=y
# CONFIG_SCHED_DEBUG is not set
diff --git a/arch/arm/configs/imx_v6_v7_defconfig b/arch/arm/configs/imx_v6_v7_defconfig
index 21339ce29654..71f8384bd01f 100644
--- a/arch/arm/configs/imx_v6_v7_defconfig
+++ b/arch/arm/configs/imx_v6_v7_defconfig
@@ -366,7 +366,7 @@ CONFIG_NLS_ASCII=y
CONFIG_NLS_ISO8859_1=y
CONFIG_NLS_ISO8859_15=m
CONFIG_NLS_UTF8=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_FS=y
CONFIG_MAGIC_SYSRQ=y
# CONFIG_SCHED_DEBUG is not set
diff --git a/arch/arm/configs/keystone_defconfig b/arch/arm/configs/keystone_defconfig
index faba04d93ad5..042d46c3ff5c 100644
--- a/arch/arm/configs/keystone_defconfig
+++ b/arch/arm/configs/keystone_defconfig
@@ -197,7 +197,7 @@ CONFIG_NFSD_V3=y
CONFIG_NFSD_V3_ACL=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_INFO=y
CONFIG_DEBUG_SHIRQ=y
CONFIG_DEBUG_USER=y
diff --git a/arch/arm/configs/lpc18xx_defconfig b/arch/arm/configs/lpc18xx_defconfig
index 2ae00b09cfc2..c25ff23e2ac4 100644
--- a/arch/arm/configs/lpc18xx_defconfig
+++ b/arch/arm/configs/lpc18xx_defconfig
@@ -159,7 +159,7 @@ CONFIG_EXT2_FS=y
# CONFIG_INOTIFY_USER is not set
CONFIG_JFFS2_FS=y
# CONFIG_NETWORK_FILESYSTEMS is not set
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_INFO=y
# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
diff --git a/arch/arm/configs/magician_defconfig b/arch/arm/configs/magician_defconfig
index a5b4920cd6d4..accbf109abcf 100644
--- a/arch/arm/configs/magician_defconfig
+++ b/arch/arm/configs/magician_defconfig
@@ -167,7 +167,7 @@ CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_CODEPAGE_1251=m
CONFIG_NLS_ISO8859_1=y
CONFIG_NLS_UTF8=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_KERNEL=y
# CONFIG_SCHED_DEBUG is not set
CONFIG_TIMER_STATS=y
diff --git a/arch/arm/configs/mmp2_defconfig b/arch/arm/configs/mmp2_defconfig
index f1cb95e58af0..0385e4a98cb1 100644
--- a/arch/arm/configs/mmp2_defconfig
+++ b/arch/arm/configs/mmp2_defconfig
@@ -81,7 +81,7 @@ CONFIG_NFS_V3=y
CONFIG_NFS_V3_ACL=y
CONFIG_NFS_V4=y
CONFIG_ROOT_NFS=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_FS=y
CONFIG_DEBUG_KERNEL=y
diff --git a/arch/arm/configs/moxart_defconfig b/arch/arm/configs/moxart_defconfig
index a3cb76cfb828..32fab5a0c30e 100644
--- a/arch/arm/configs/moxart_defconfig
+++ b/arch/arm/configs/moxart_defconfig
@@ -124,7 +124,7 @@ CONFIG_EXT3_FS=y
CONFIG_TMPFS=y
CONFIG_CONFIGFS_FS=y
CONFIG_JFFS2_FS=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_INFO=y
# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
diff --git a/arch/arm/configs/multi_v7_defconfig b/arch/arm/configs/multi_v7_defconfig
index 8f857564657f..f7e0d49b515a 100644
--- a/arch/arm/configs/multi_v7_defconfig
+++ b/arch/arm/configs/multi_v7_defconfig
@@ -844,7 +844,7 @@ CONFIG_ROOT_NFS=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
CONFIG_NLS_UTF8=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_FS=y
CONFIG_MAGIC_SYSRQ=y
CONFIG_LOCKUP_DETECTOR=y
diff --git a/arch/arm/configs/mvebu_v7_defconfig b/arch/arm/configs/mvebu_v7_defconfig
index 486a4cabb0dd..d9dfcab96da9 100644
--- a/arch/arm/configs/mvebu_v7_defconfig
+++ b/arch/arm/configs/mvebu_v7_defconfig
@@ -147,7 +147,7 @@ CONFIG_NLS_CODEPAGE_850=y
CONFIG_NLS_ISO8859_1=y
CONFIG_NLS_ISO8859_2=y
CONFIG_NLS_UTF8=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_INFO=y
CONFIG_DEBUG_FS=y
CONFIG_MAGIC_SYSRQ=y
diff --git a/arch/arm/configs/mxs_defconfig b/arch/arm/configs/mxs_defconfig
index 6e0f751be229..8a3d2f21c18f 100644
--- a/arch/arm/configs/mxs_defconfig
+++ b/arch/arm/configs/mxs_defconfig
@@ -166,7 +166,7 @@ CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_CODEPAGE_850=y
CONFIG_NLS_ISO8859_1=y
CONFIG_NLS_ISO8859_15=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_INFO=y
CONFIG_FRAME_WARN=2048
CONFIG_UNUSED_SYMBOLS=y
diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig
index ac717cccd2b5..506f4fa57dc9 100644
--- a/arch/arm/configs/omap2plus_defconfig
+++ b/arch/arm/configs/omap2plus_defconfig
@@ -460,7 +460,7 @@ CONFIG_NFS_V4=y
CONFIG_ROOT_NFS=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_INFO=y
CONFIG_DEBUG_INFO_SPLIT=y
CONFIG_DEBUG_INFO_DWARF4=y
diff --git a/arch/arm/configs/pxa168_defconfig b/arch/arm/configs/pxa168_defconfig
index 74d7e0104f8d..5824b89de1c3 100644
--- a/arch/arm/configs/pxa168_defconfig
+++ b/arch/arm/configs/pxa168_defconfig
@@ -57,7 +57,7 @@ CONFIG_NFS_V3=y
CONFIG_NFS_V3_ACL=y
CONFIG_NFS_V4=y
CONFIG_ROOT_NFS=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_KERNEL=y
# CONFIG_DEBUG_PREEMPT is not set
diff --git a/arch/arm/configs/pxa3xx_defconfig b/arch/arm/configs/pxa3xx_defconfig
index 5f337d7ceb5b..e60d65624f35 100644
--- a/arch/arm/configs/pxa3xx_defconfig
+++ b/arch/arm/configs/pxa3xx_defconfig
@@ -117,7 +117,7 @@ CONFIG_NFS_V3_ACL=y
CONFIG_NFS_V4=y
CONFIG_ROOT_NFS=y
CONFIG_NLS=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_KERNEL=y
CONFIG_DEBUG_SHIRQ=y
diff --git a/arch/arm/configs/pxa910_defconfig b/arch/arm/configs/pxa910_defconfig
index 3bb7771d3c19..a2f6f08e7b5e 100644
--- a/arch/arm/configs/pxa910_defconfig
+++ b/arch/arm/configs/pxa910_defconfig
@@ -65,7 +65,7 @@ CONFIG_NFS_V3=y
CONFIG_NFS_V3_ACL=y
CONFIG_NFS_V4=y
CONFIG_ROOT_NFS=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_KERNEL=y
# CONFIG_DEBUG_PREEMPT is not set
diff --git a/arch/arm/configs/pxa_defconfig b/arch/arm/configs/pxa_defconfig
index dc5517eaf09f..621c4dc1c642 100644
--- a/arch/arm/configs/pxa_defconfig
+++ b/arch/arm/configs/pxa_defconfig
@@ -734,7 +734,7 @@ CONFIG_NLS_ASCII=m
CONFIG_NLS_ISO8859_1=m
CONFIG_NLS_ISO8859_15=m
CONFIG_NLS_UTF8=m
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DYNAMIC_DEBUG=y
CONFIG_DEBUG_INFO=y
CONFIG_FRAME_WARN=0
diff --git a/arch/arm/configs/qcom_defconfig b/arch/arm/configs/qcom_defconfig
index 7bff7bf24a85..5dfac70cc0d5 100644
--- a/arch/arm/configs/qcom_defconfig
+++ b/arch/arm/configs/qcom_defconfig
@@ -181,7 +181,7 @@ CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ASCII=y
CONFIG_NLS_ISO8859_1=y
CONFIG_NLS_UTF8=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DYNAMIC_DEBUG=y
CONFIG_DEBUG_INFO=y
CONFIG_MAGIC_SYSRQ=y
diff --git a/arch/arm/configs/raumfeld_defconfig b/arch/arm/configs/raumfeld_defconfig
index 3d833aea545a..b836f8f0f066 100644
--- a/arch/arm/configs/raumfeld_defconfig
+++ b/arch/arm/configs/raumfeld_defconfig
@@ -196,7 +196,7 @@ CONFIG_NLS_ISO8859_15=y
CONFIG_NLS_KOI8_R=y
CONFIG_NLS_KOI8_U=y
CONFIG_NLS_UTF8=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_KERNEL=y
CONFIG_DEBUG_INFO=y
# CONFIG_RCU_CPU_STALL_DETECTOR is not set
diff --git a/arch/arm/configs/shmobile_defconfig b/arch/arm/configs/shmobile_defconfig
index f2d635566a13..e67b449e957e 100644
--- a/arch/arm/configs/shmobile_defconfig
+++ b/arch/arm/configs/shmobile_defconfig
@@ -210,7 +210,7 @@ CONFIG_NFS_V4_1=y
CONFIG_ROOT_NFS=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
# CONFIG_ARM_UNWIND is not set
diff --git a/arch/arm/configs/socfpga_defconfig b/arch/arm/configs/socfpga_defconfig
index 753f1a5defc7..e277dc607aff 100644
--- a/arch/arm/configs/socfpga_defconfig
+++ b/arch/arm/configs/socfpga_defconfig
@@ -101,7 +101,7 @@ CONFIG_NFS_FS=y
CONFIG_ROOT_NFS=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_INFO=y
CONFIG_MAGIC_SYSRQ=y
CONFIG_DETECT_HUNG_TASK=y
diff --git a/arch/arm/configs/stm32_defconfig b/arch/arm/configs/stm32_defconfig
index 1e5ec2a0e4cf..44b6b037cd3f 100644
--- a/arch/arm/configs/stm32_defconfig
+++ b/arch/arm/configs/stm32_defconfig
@@ -61,7 +61,7 @@ CONFIG_STM32_DMA=y
# CONFIG_DNOTIFY is not set
# CONFIG_INOTIFY_USER is not set
CONFIG_NLS=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_INFO=y
# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
diff --git a/arch/arm/configs/sunxi_defconfig b/arch/arm/configs/sunxi_defconfig
index 81a1b92fd4e6..e20ae3925e36 100644
--- a/arch/arm/configs/sunxi_defconfig
+++ b/arch/arm/configs/sunxi_defconfig
@@ -146,6 +146,6 @@ CONFIG_NFS_V4=y
CONFIG_ROOT_NFS=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_FS=y
CONFIG_CRYPTO_DEV_SUN4I_SS=y
diff --git a/arch/arm/configs/tegra_defconfig b/arch/arm/configs/tegra_defconfig
index 6012a1ec779f..bc80a47957c9 100644
--- a/arch/arm/configs/tegra_defconfig
+++ b/arch/arm/configs/tegra_defconfig
@@ -287,7 +287,7 @@ CONFIG_NFS_V4=y
CONFIG_ROOT_NFS=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_INFO=y
CONFIG_DEBUG_FS=y
CONFIG_MAGIC_SYSRQ=y
diff --git a/arch/arm/configs/u300_defconfig b/arch/arm/configs/u300_defconfig
index aaa95ab606a8..77460061cba4 100644
--- a/arch/arm/configs/u300_defconfig
+++ b/arch/arm/configs/u300_defconfig
@@ -63,7 +63,7 @@ CONFIG_VFAT_FS=y
CONFIG_TMPFS=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_INFO=y
CONFIG_DEBUG_FS=y
# CONFIG_SCHED_DEBUG is not set
diff --git a/arch/arm/configs/u8500_defconfig b/arch/arm/configs/u8500_defconfig
index b7b09189f1c5..d95796881b89 100644
--- a/arch/arm/configs/u8500_defconfig
+++ b/arch/arm/configs/u8500_defconfig
@@ -128,7 +128,7 @@ CONFIG_NFS_FS=y
CONFIG_ROOT_NFS=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_INFO=y
CONFIG_DEBUG_FS=y
CONFIG_MAGIC_SYSRQ=y
diff --git a/arch/arm/configs/vt8500_v6_v7_defconfig b/arch/arm/configs/vt8500_v6_v7_defconfig
index 1bfaa7bfc392..d5a2cc807940 100644
--- a/arch/arm/configs/vt8500_v6_v7_defconfig
+++ b/arch/arm/configs/vt8500_v6_v7_defconfig
@@ -84,6 +84,6 @@ CONFIG_NFS_FS=y
CONFIG_NFS_V3_ACL=y
CONFIG_NFS_V4=y
CONFIG_ROOT_NFS=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_KERNEL=y
CONFIG_LOCKUP_DETECTOR=y
diff --git a/arch/arm/configs/xcep_defconfig b/arch/arm/configs/xcep_defconfig
index 721832ffe2d7..5c8435db7ae9 100644
--- a/arch/arm/configs/xcep_defconfig
+++ b/arch/arm/configs/xcep_defconfig
@@ -85,7 +85,7 @@ CONFIG_NFS_V3=y
CONFIG_NLS=m
CONFIG_NLS_DEFAULT="utf8"
CONFIG_NLS_UTF8=m
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_STRIP_ASM_SYMS=y
CONFIG_DEBUG_KERNEL=y
# CONFIG_SCHED_DEBUG is not set
diff --git a/arch/arm/configs/zx_defconfig b/arch/arm/configs/zx_defconfig
index ab683fbbb954..2684bfa04232 100644
--- a/arch/arm/configs/zx_defconfig
+++ b/arch/arm/configs/zx_defconfig
@@ -7,7 +7,6 @@ CONFIG_CGROUPS=y
CONFIG_CGROUP_DEBUG=y
CONFIG_CGROUP_FREEZER=y
CONFIG_CGROUP_CPUACCT=y
-CONFIG_RESOURCE_COUNTERS=y
CONFIG_CGROUP_SCHED=y
CONFIG_RT_GROUP_SCHED=y
CONFIG_NAMESPACES=y
@@ -98,7 +97,7 @@ CONFIG_TMPFS_POSIX_ACL=y
CONFIG_NLS_CODEPAGE_936=y
CONFIG_NLS_ISO8859_1=y
CONFIG_NLS_UTF8=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_KERNEL=y
CONFIG_DEBUG_INFO=y
diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h
index 4355f0ec44d6..f98baaec0a15 100644
--- a/arch/arm/include/asm/page.h
+++ b/arch/arm/include/asm/page.h
@@ -17,6 +17,8 @@
#ifndef __ASSEMBLY__
+#include <linux/personality.h> /* For READ_IMPLIES_EXEC */
+
#ifndef CONFIG_MMU
#include <asm/page-nommu.h>
diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index dc46398bc3a5..fa70db7c714b 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -281,11 +281,6 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
flush_pmd_entry(pmdp);
}
-static inline int has_transparent_hugepage(void)
-{
- return 1;
-}
-
#endif /* __ASSEMBLY__ */
#endif /* _ASM_PGTABLE_3LEVEL_H */
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 4adfb46e3ee9..a647d6642f3e 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -193,9 +193,9 @@ EXPORT_SYMBOL_GPL(thread_notify_head);
/*
* Free current thread data structures etc..
*/
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
{
- thread_notify(THREAD_NOTIFY_EXIT, current_thread_info());
+ thread_notify(THREAD_NOTIFY_EXIT, task_thread_info(tsk));
}
void flush_thread(void)
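
The arm conversion above is the general shape of this series: the empty per-arch exit_thread() bodies are removed, an architecture that really needs the hook selects HAVE_EXIT_THREAD, and the exiting task is passed explicitly instead of being implied by current. A sketch of the declaration side, as assumed from the rest of the series:

    /* include/linux/sched.h side of the conversion (sketch). */
    #ifdef CONFIG_HAVE_EXIT_THREAD
    extern void exit_thread(struct task_struct *tsk);       /* arch-provided */
    #else
    static inline void exit_thread(struct task_struct *tsk)
    {
    }
    #endif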
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 7afe48ae5d76..861521606c6d 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -644,9 +644,11 @@ void handle_IPI(int ipinr, struct pt_regs *regs)
break;
case IPI_CPU_BACKTRACE:
+ printk_nmi_enter();
irq_enter();
nmi_cpu_backtrace(regs);
irq_exit();
+ printk_nmi_exit();
break;
default:
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index 55347662e5ed..cb569b65a54d 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -421,18 +421,21 @@ config CPU_32v3
select CPU_USE_DOMAINS if MMU
select NEED_KUSER_HELPERS
select TLS_REG_EMUL if SMP || !MMU
+ select CPU_NO_EFFICIENT_FFS
config CPU_32v4
bool
select CPU_USE_DOMAINS if MMU
select NEED_KUSER_HELPERS
select TLS_REG_EMUL if SMP || !MMU
+ select CPU_NO_EFFICIENT_FFS
config CPU_32v4T
bool
select CPU_USE_DOMAINS if MMU
select NEED_KUSER_HELPERS
select TLS_REG_EMUL if SMP || !MMU
+ select CPU_NO_EFFICIENT_FFS
config CPU_32v5
bool
diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c
index 2a61e4b04600..73085d3482ed 100644
--- a/arch/arm/vfp/vfpmodule.c
+++ b/arch/arm/vfp/vfpmodule.c
@@ -156,10 +156,6 @@ static void vfp_thread_copy(struct thread_info *thread)
* - we could be preempted if tree preempt rcu is enabled, so
* it is unsafe to use thread->cpu.
* THREAD_NOTIFY_EXIT
- * - the thread (v) will be running on the local CPU, so
- * v === current_thread_info()
- * - thread->cpu is the local CPU number at the time it is accessed,
- * but may change at any time.
* - we could be preempted if tree preempt rcu is enabled, so
* it is unsafe to use thread->cpu.
*/
diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index fd2d74d0491e..0bd559a57fdb 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -324,7 +324,7 @@ CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
CONFIG_VIRTUALIZATION=y
CONFIG_KVM=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_INFO=y
CONFIG_DEBUG_FS=y
CONFIG_MAGIC_SYSRQ=y
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 1910bf47d4a3..46472a91b6df 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -316,11 +316,6 @@ static inline int pmd_protnone(pmd_t pmd)
#define set_pmd_at(mm, addr, pmdp, pmd) set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd))
-static inline int has_transparent_hugepage(void)
-{
- return 1;
-}
-
#define __pgprot_modify(prot,mask,bits) \
__pgprot((pgprot_val(prot) & ~(mask)) | (bits))
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 48eea6866c67..6cd2612236dc 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -200,13 +200,6 @@ void show_regs(struct pt_regs * regs)
__show_regs(regs);
}
-/*
- * Free current thread data structures etc..
- */
-void exit_thread(void)
-{
-}
-
static void tls_thread_flush(void)
{
asm ("msr tpidr_el0, xzr");
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 589fd28e1fb5..aa8aee7d6929 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -307,6 +307,7 @@ static __init int setup_hugepagesz(char *opt)
} else if (ps == PUD_SIZE) {
hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
} else {
+ hugetlb_bad_size();
pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10);
return 0;
}
diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig
index 18b88779e701..7e75d45e20cd 100644
--- a/arch/avr32/Kconfig
+++ b/arch/avr32/Kconfig
@@ -4,6 +4,7 @@ config AVR32
# that we usually don't need on AVR32.
select EXPERT
select HAVE_CLK
+ select HAVE_EXIT_THREAD
select HAVE_OPROFILE
select HAVE_KPROBES
select VIRT_TO_BUS
@@ -17,6 +18,7 @@ config AVR32
select GENERIC_CLOCKEVENTS
select HAVE_MOD_ARCH_SPECIFIC
select MODULES_USE_ELF_RELA
+ select HAVE_NMI
help
AVR32 is a high-performance 32-bit RISC microprocessor core,
designed for cost-sensitive embedded applications, with particular
diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c
index 42a53e740a7e..68e5b9dac059 100644
--- a/arch/avr32/kernel/process.c
+++ b/arch/avr32/kernel/process.c
@@ -62,9 +62,9 @@ void machine_restart(char *cmd)
/*
* Free current thread data structures etc
*/
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
{
- ocd_disable(current);
+ ocd_disable(tsk);
}
void flush_thread(void)
diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig
index a63c12259e77..28c63fea786d 100644
--- a/arch/blackfin/Kconfig
+++ b/arch/blackfin/Kconfig
@@ -40,6 +40,7 @@ config BLACKFIN
select HAVE_MOD_ARCH_SPECIFIC
select MODULES_USE_ELF_RELA
select HAVE_DEBUG_STACKOVERFLOW
+ select HAVE_NMI
config GENERIC_CSUM
def_bool y
diff --git a/arch/blackfin/include/asm/processor.h b/arch/blackfin/include/asm/processor.h
index 7acd46653df3..0c265aba94ad 100644
--- a/arch/blackfin/include/asm/processor.h
+++ b/arch/blackfin/include/asm/processor.h
@@ -76,13 +76,6 @@ static inline void release_thread(struct task_struct *dead_task)
}
/*
- * Free current thread data structures etc..
- */
-static inline void exit_thread(void)
-{
-}
-
-/*
* Return saved PC of a blocked thread.
*/
#define thread_saved_pc(tsk) (tsk->thread.pc)
diff --git a/arch/c6x/kernel/process.c b/arch/c6x/kernel/process.c
index 3ae9f5a166a0..0ee7686a78f3 100644
--- a/arch/c6x/kernel/process.c
+++ b/arch/c6x/kernel/process.c
@@ -82,10 +82,6 @@ void flush_thread(void)
{
}
-void exit_thread(void)
-{
-}
-
/*
* Do necessary setup to start up a newly executed thread.
*/
diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig
index 99bda1ba3d2f..0776341399a3 100644
--- a/arch/cris/Kconfig
+++ b/arch/cris/Kconfig
@@ -62,6 +62,7 @@ config CRIS
select OLD_SIGSUSPEND
select OLD_SIGACTION
select GPIOLIB
+ select HAVE_EXIT_THREAD if ETRAX_ARCH_V32
select IRQ_DOMAIN if ETRAX_ARCH_V32
select OF if ETRAX_ARCH_V32
select OF_EARLY_FLATTREE if ETRAX_ARCH_V32
@@ -69,6 +70,7 @@ config CRIS
select GENERIC_CLOCKEVENTS if ETRAX_ARCH_V32
select GENERIC_SCHED_CLOCK if ETRAX_ARCH_V32
select HAVE_DEBUG_BUGVERBOSE if ETRAX_ARCH_V32
+ select HAVE_NMI
config HZ
int
diff --git a/arch/cris/arch-v10/kernel/process.c b/arch/cris/arch-v10/kernel/process.c
index 02b783457be0..96e5afef6b47 100644
--- a/arch/cris/arch-v10/kernel/process.c
+++ b/arch/cris/arch-v10/kernel/process.c
@@ -35,15 +35,6 @@ void default_idle(void)
local_irq_enable();
}
-/*
- * Free current thread data structures etc..
- */
-
-void exit_thread(void)
-{
- /* Nothing needs to be done. */
-}
-
/* if the watchdog is enabled, we can simply disable interrupts and go
* into an eternal loop, and the watchdog will reset the CPU after 0.1s
* if on the other hand the watchdog wasn't enabled, we just enable it and wait
diff --git a/arch/cris/arch-v32/kernel/process.c b/arch/cris/arch-v32/kernel/process.c
index c7ce784a393c..4d1afa9f9fd3 100644
--- a/arch/cris/arch-v32/kernel/process.c
+++ b/arch/cris/arch-v32/kernel/process.c
@@ -33,9 +33,9 @@ void default_idle(void)
*/
extern void deconfigure_bp(long pid);
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
{
- deconfigure_bp(current->pid);
+ deconfigure_bp(tsk->pid);
}
/*
diff --git a/arch/frv/include/asm/processor.h b/arch/frv/include/asm/processor.h
index ae8d423e79d9..73f0a79ad8e6 100644
--- a/arch/frv/include/asm/processor.h
+++ b/arch/frv/include/asm/processor.h
@@ -97,13 +97,6 @@ extern asmlinkage void *restore_user_regs(const struct user_context *target, ...
#define forget_segments() do { } while (0)
/*
- * Free current thread data structures etc..
- */
-static inline void exit_thread(void)
-{
-}
-
-/*
* Return saved PC of a blocked thread.
*/
extern unsigned long thread_saved_pc(struct task_struct *tsk);
diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig
index 986ea84caaed..aa232de2d4bc 100644
--- a/arch/h8300/Kconfig
+++ b/arch/h8300/Kconfig
@@ -20,6 +20,7 @@ config H8300
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_LZO
select HAVE_ARCH_KGDB
+ select CPU_NO_EFFICIENT_FFS
config RWSEM_GENERIC_SPINLOCK
def_bool y
diff --git a/arch/h8300/include/asm/processor.h b/arch/h8300/include/asm/processor.h
index 54e3fd83c336..111df7397ac7 100644
--- a/arch/h8300/include/asm/processor.h
+++ b/arch/h8300/include/asm/processor.h
@@ -111,13 +111,6 @@ static inline void release_thread(struct task_struct *dead_task)
}
/*
- * Free current thread data structures etc..
- */
-static inline void exit_thread(void)
-{
-}
-
-/*
* Return saved PC of a blocked thread.
*/
unsigned long thread_saved_pc(struct task_struct *tsk);
diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c
index a9ebd471823a..d9edfd3fc52a 100644
--- a/arch/hexagon/kernel/process.c
+++ b/arch/hexagon/kernel/process.c
@@ -137,13 +137,6 @@ void release_thread(struct task_struct *dead_task)
}
/*
- * Free any architecture-specific thread data structures, etc.
- */
-void exit_thread(void)
-{
-}
-
-/*
* Some archs flush debug and FPU info here
*/
void flush_thread(void)
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index b534ebab36ea..f80758cb7157 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -18,6 +18,7 @@ config IA64
select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
select HAVE_UNSTABLE_SCHED_CLOCK
+ select HAVE_EXIT_THREAD
select HAVE_IDE
select HAVE_OPROFILE
select HAVE_KPROBES
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 9cd607b06964..2436ad5f92c1 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -4542,8 +4542,8 @@ pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *reg
/*
- * called only from exit_thread(): task == current
- * we come here only if current has a context attached (loaded or masked)
+ * called only from exit_thread()
+ * we come here only if the task has a context attached (loaded or masked)
*/
void
pfm_exit_thread(struct task_struct *task)
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index b51514957620..aae6c4dc7ae7 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -570,22 +570,22 @@ flush_thread (void)
}
/*
- * Clean up state associated with current thread. This is called when
+ * Clean up state associated with a thread. This is called when
* the thread calls exit().
*/
void
-exit_thread (void)
+exit_thread (struct task_struct *tsk)
{
- ia64_drop_fpu(current);
+ ia64_drop_fpu(tsk);
#ifdef CONFIG_PERFMON
/* if needed, stop monitoring and flush state to perfmon context */
- if (current->thread.pfm_context)
- pfm_exit_thread(current);
+ if (tsk->thread.pfm_context)
+ pfm_exit_thread(tsk);
/* free debug register resources */
- if (current->thread.flags & IA64_THREAD_DBG_VALID)
- pfm_release_debug_registers(current);
+ if (tsk->thread.flags & IA64_THREAD_DBG_VALID)
+ pfm_release_debug_registers(tsk);
#endif
}
diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig
index c82b29253991..3cc8498fe0fe 100644
--- a/arch/m32r/Kconfig
+++ b/arch/m32r/Kconfig
@@ -17,6 +17,7 @@ config M32R
select ARCH_USES_GETTIMEOFFSET
select MODULES_USE_ELF_RELA
select HAVE_DEBUG_STACKOVERFLOW
+ select CPU_NO_EFFICIENT_FFS
config SBUS
bool
diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c
index e69221d581d5..a88b1f01e91f 100644
--- a/arch/m32r/kernel/process.c
+++ b/arch/m32r/kernel/process.c
@@ -101,15 +101,6 @@ void show_regs(struct pt_regs * regs)
#endif
}
-/*
- * Free current thread data structures etc..
- */
-void exit_thread(void)
-{
- /* Nothing to do. */
- DPRINTK("pid = %d\n", current->pid);
-}
-
void flush_thread(void)
{
DPRINTK("pid = %d\n", current->pid);
diff --git a/arch/m68k/Kconfig.cpu b/arch/m68k/Kconfig.cpu
index c1beb5ae181f..8ace920ca24a 100644
--- a/arch/m68k/Kconfig.cpu
+++ b/arch/m68k/Kconfig.cpu
@@ -40,6 +40,7 @@ config M68000
select CPU_HAS_NO_MULDIV64
select CPU_HAS_NO_UNALIGNED
select GENERIC_CSUM
+ select CPU_NO_EFFICIENT_FFS
help
The Freescale (was Motorola) 68000 CPU is the first generation of
the well known M68K family of processors. The CPU core as well as
@@ -51,6 +52,7 @@ config MCPU32
bool
select CPU_HAS_NO_BITFIELDS
select CPU_HAS_NO_UNALIGNED
+ select CPU_NO_EFFICIENT_FFS
help
The Freescale (was then Motorola) CPU32 is a CPU core that is
based on the 68020 processor. For the most part it is used in
@@ -130,6 +132,7 @@ config M5206
depends on !MMU
select COLDFIRE_SW_A7
select HAVE_MBAR
+ select CPU_NO_EFFICIENT_FFS
help
Motorola ColdFire 5206 processor support.
@@ -138,6 +141,7 @@ config M5206e
depends on !MMU
select COLDFIRE_SW_A7
select HAVE_MBAR
+ select CPU_NO_EFFICIENT_FFS
help
Motorola ColdFire 5206e processor support.
@@ -163,6 +167,7 @@ config M5249
depends on !MMU
select COLDFIRE_SW_A7
select HAVE_MBAR
+ select CPU_NO_EFFICIENT_FFS
help
Motorola ColdFire 5249 processor support.
@@ -171,6 +176,7 @@ config M525x
depends on !MMU
select COLDFIRE_SW_A7
select HAVE_MBAR
+ select CPU_NO_EFFICIENT_FFS
help
Freescale (Motorola) Coldfire 5251/5253 processor support.
@@ -189,6 +195,7 @@ config M5272
depends on !MMU
select COLDFIRE_SW_A7
select HAVE_MBAR
+ select CPU_NO_EFFICIENT_FFS
help
Motorola ColdFire 5272 processor support.
@@ -217,6 +224,7 @@ config M5307
select COLDFIRE_SW_A7
select HAVE_CACHE_CB
select HAVE_MBAR
+ select CPU_NO_EFFICIENT_FFS
help
Motorola ColdFire 5307 processor support.
@@ -242,6 +250,7 @@ config M5407
select COLDFIRE_SW_A7
select HAVE_CACHE_CB
select HAVE_MBAR
+ select CPU_NO_EFFICIENT_FFS
help
Motorola ColdFire 5407 processor support.
@@ -251,6 +260,7 @@ config M547x
select MMU_COLDFIRE if MMU
select HAVE_CACHE_CB
select HAVE_MBAR
+ select CPU_NO_EFFICIENT_FFS
help
Freescale ColdFire 5470/5471/5472/5473/5474/5475 processor support.
@@ -260,6 +270,7 @@ config M548x
select M54xx
select HAVE_CACHE_CB
select HAVE_MBAR
+ select CPU_NO_EFFICIENT_FFS
help
Freescale ColdFire 5480/5481/5482/5483/5484/5485 processor support.
diff --git a/arch/m68k/include/asm/processor.h b/arch/m68k/include/asm/processor.h
index 20dda1d4b860..a6ce2ec8d693 100644
--- a/arch/m68k/include/asm/processor.h
+++ b/arch/m68k/include/asm/processor.h
@@ -153,13 +153,6 @@ static inline void release_thread(struct task_struct *dead_task)
{
}
-/*
- * Free current thread data structures etc..
- */
-static inline void exit_thread(void)
-{
-}
-
extern unsigned long thread_saved_pc(struct task_struct *tsk);
unsigned long get_wchan(struct task_struct *p);
diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig
index a0fa88da3e31..5b7a45d99cfb 100644
--- a/arch/metag/Kconfig
+++ b/arch/metag/Kconfig
@@ -11,6 +11,7 @@ config METAG
select HAVE_DEBUG_KMEMLEAK
select HAVE_DEBUG_STACKOVERFLOW
select HAVE_DYNAMIC_FTRACE
+ select HAVE_EXIT_THREAD
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FUNCTION_TRACER
select HAVE_KERNEL_BZIP2
@@ -29,6 +30,7 @@ config METAG
select OF
select OF_EARLY_FLATTREE
select SPARSE_IRQ
+ select CPU_NO_EFFICIENT_FFS
config STACKTRACE_SUPPORT
def_bool y
diff --git a/arch/metag/include/asm/processor.h b/arch/metag/include/asm/processor.h
index 0838ca699764..a0333ebcac35 100644
--- a/arch/metag/include/asm/processor.h
+++ b/arch/metag/include/asm/processor.h
@@ -134,8 +134,6 @@ static inline void release_thread(struct task_struct *dead_task)
#define copy_segments(tsk, mm) do { } while (0)
#define release_segments(mm) do { } while (0)
-extern void exit_thread(void);
-
/*
* Return saved PC of a blocked thread.
*/
diff --git a/arch/metag/kernel/process.c b/arch/metag/kernel/process.c
index 7f546183a0f0..35062796edf2 100644
--- a/arch/metag/kernel/process.c
+++ b/arch/metag/kernel/process.c
@@ -345,10 +345,10 @@ void flush_thread(void)
/*
* Free current thread data structures etc.
*/
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
{
- clear_fpu(&current->thread);
- clear_dsp(&current->thread);
+ clear_fpu(&tsk->thread);
+ clear_dsp(&tsk->thread);
}
/* TODO: figure out how to unwind the kernel stack here to figure out
diff --git a/arch/metag/mm/hugetlbpage.c b/arch/metag/mm/hugetlbpage.c
index b38700ae4e84..db1b7da91e4f 100644
--- a/arch/metag/mm/hugetlbpage.c
+++ b/arch/metag/mm/hugetlbpage.c
@@ -239,6 +239,7 @@ static __init int setup_hugepagesz(char *opt)
if (ps == (1 << HPAGE_SHIFT)) {
hugetlb_add_hstate(HPAGE_SHIFT - PAGE_SHIFT);
} else {
+ hugetlb_bad_size();
pr_err("hugepagesz: Unsupported page size %lu M\n",
ps >> 20);
return 0;
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index 3d793b55f60c..f17c3a4fb697 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -32,6 +32,7 @@ config MICROBLAZE
select OF_EARLY_FLATTREE
select TRACING_SUPPORT
select VIRT_TO_BUS
+ select CPU_NO_EFFICIENT_FFS
config SWAP
def_bool n
diff --git a/arch/microblaze/include/asm/processor.h b/arch/microblaze/include/asm/processor.h
index 497a988d79c2..c38d0dd91134 100644
--- a/arch/microblaze/include/asm/processor.h
+++ b/arch/microblaze/include/asm/processor.h
@@ -70,11 +70,6 @@ static inline void release_thread(struct task_struct *dead_task)
{
}
-/* Free all resources held by a thread. */
-static inline void exit_thread(void)
-{
-}
-
extern unsigned long thread_saved_pc(struct task_struct *t);
extern unsigned long get_wchan(struct task_struct *p);
@@ -127,11 +122,6 @@ static inline void release_thread(struct task_struct *dead_task)
{
}
-/* Free current thread data structures etc. */
-static inline void exit_thread(void)
-{
-}
-
/* Return saved (kernel) PC of a blocked thread. */
# define thread_saved_pc(tsk) \
((tsk)->thread.regs ? (tsk)->thread.regs->r15 : 0)
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 5663f411c225..fb08bd9d8ef0 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -48,6 +48,7 @@ config MIPS
select GENERIC_SCHED_CLOCK if !CAVIUM_OCTEON_SOC
select GENERIC_CMOS_UPDATE
select HAVE_MOD_ARCH_SPECIFIC
+ select HANDLE_DOMAIN_IRQ
select VIRT_TO_BUS
select MODULES_USE_ELF_REL if MODULES
select MODULES_USE_ELF_RELA if MODULES && 64BIT
@@ -62,7 +63,7 @@ config MIPS
select HAVE_IRQ_TIME_ACCOUNTING
select GENERIC_TIME_VSYSCALL
select ARCH_CLOCKSOURCE_DATA
- select HANDLE_DOMAIN_IRQ
+ select HAVE_NMI
menu "Machine selection"
diff --git a/arch/mips/configs/bcm47xx_defconfig b/arch/mips/configs/bcm47xx_defconfig
index fad8e964f14c..e4a29db1c416 100644
--- a/arch/mips/configs/bcm47xx_defconfig
+++ b/arch/mips/configs/bcm47xx_defconfig
@@ -74,7 +74,7 @@ CONFIG_USB_HCD_BCMA=y
CONFIG_USB_HCD_SSB=y
CONFIG_LEDS_TRIGGER_TIMER=y
CONFIG_LEDS_TRIGGER_DEFAULT_ON=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_INFO=y
CONFIG_DEBUG_INFO_REDUCED=y
CONFIG_STRIP_ASM_SYMS=y
diff --git a/arch/mips/configs/bmips_be_defconfig b/arch/mips/configs/bmips_be_defconfig
index acf7785c4cdb..cabc4ea3f87f 100644
--- a/arch/mips/configs/bmips_be_defconfig
+++ b/arch/mips/configs/bmips_be_defconfig
@@ -33,7 +33,7 @@ CONFIG_DEVTMPFS=y
CONFIG_DEVTMPFS_MOUNT=y
# CONFIG_STANDALONE is not set
# CONFIG_PREVENT_FIRMWARE_BUILD is not set
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_BRCMSTB_GISB_ARB=y
CONFIG_MTD=y
CONFIG_MTD_BCM63XX_PARTS=y
diff --git a/arch/mips/configs/bmips_stb_defconfig b/arch/mips/configs/bmips_stb_defconfig
index 4eb5d6e9cf8f..080066254b8e 100644
--- a/arch/mips/configs/bmips_stb_defconfig
+++ b/arch/mips/configs/bmips_stb_defconfig
@@ -34,7 +34,7 @@ CONFIG_DEVTMPFS=y
CONFIG_DEVTMPFS_MOUNT=y
# CONFIG_STANDALONE is not set
# CONFIG_PREVENT_FIRMWARE_BUILD is not set
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_BRCMSTB_GISB_ARB=y
CONFIG_MTD=y
CONFIG_MTD_CFI=y
diff --git a/arch/mips/configs/ci20_defconfig b/arch/mips/configs/ci20_defconfig
index 43e0ba24470c..bf164feefed2 100644
--- a/arch/mips/configs/ci20_defconfig
+++ b/arch/mips/configs/ci20_defconfig
@@ -148,7 +148,7 @@ CONFIG_NLS_ISO8859_15=y
CONFIG_NLS_KOI8_R=y
CONFIG_NLS_KOI8_U=y
CONFIG_NLS_UTF8=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_INFO=y
CONFIG_STRIP_ASM_SYMS=y
CONFIG_DEBUG_FS=y
diff --git a/arch/mips/configs/db1xxx_defconfig b/arch/mips/configs/db1xxx_defconfig
index 3bdb72a70364..f0c8971030c4 100644
--- a/arch/mips/configs/db1xxx_defconfig
+++ b/arch/mips/configs/db1xxx_defconfig
@@ -18,7 +18,6 @@ CONFIG_CGROUP_FREEZER=y
CONFIG_CGROUP_DEVICE=y
CONFIG_CPUSETS=y
CONFIG_CGROUP_CPUACCT=y
-CONFIG_RESOURCE_COUNTERS=y
CONFIG_MEMCG=y
CONFIG_MEMCG_SWAP=y
CONFIG_MEMCG_KMEM=y
diff --git a/arch/mips/configs/lemote2f_defconfig b/arch/mips/configs/lemote2f_defconfig
index 5da76e0e120f..3b1f697e7bf1 100644
--- a/arch/mips/configs/lemote2f_defconfig
+++ b/arch/mips/configs/lemote2f_defconfig
@@ -404,7 +404,7 @@ CONFIG_NLS_ISO8859_15=m
CONFIG_NLS_KOI8_R=m
CONFIG_NLS_KOI8_U=m
CONFIG_NLS_UTF8=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_FRAME_WARN=1024
CONFIG_STRIP_ASM_SYMS=y
CONFIG_DEBUG_FS=y
diff --git a/arch/mips/configs/loongson3_defconfig b/arch/mips/configs/loongson3_defconfig
index f8bf915c6d6b..b4dfece16396 100644
--- a/arch/mips/configs/loongson3_defconfig
+++ b/arch/mips/configs/loongson3_defconfig
@@ -25,7 +25,6 @@ CONFIG_TASK_XACCT=y
CONFIG_TASK_IO_ACCOUNTING=y
CONFIG_LOG_BUF_SHIFT=14
CONFIG_CPUSETS=y
-CONFIG_RESOURCE_COUNTERS=y
CONFIG_MEMCG=y
CONFIG_MEMCG_SWAP=y
CONFIG_BLK_CGROUP=y
@@ -330,7 +329,7 @@ CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_CODEPAGE_936=y
CONFIG_NLS_ASCII=y
CONFIG_NLS_UTF8=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_FRAME_WARN=1024
CONFIG_STRIP_ASM_SYMS=y
CONFIG_MAGIC_SYSRQ=y
diff --git a/arch/mips/configs/nlm_xlp_defconfig b/arch/mips/configs/nlm_xlp_defconfig
index b496c25fced6..4cfd1d4b9d5d 100644
--- a/arch/mips/configs/nlm_xlp_defconfig
+++ b/arch/mips/configs/nlm_xlp_defconfig
@@ -553,7 +553,7 @@ CONFIG_NLS_ISO8859_14=m
CONFIG_NLS_ISO8859_15=m
CONFIG_NLS_KOI8_R=m
CONFIG_NLS_KOI8_U=m
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_FRAME_WARN=1024
diff --git a/arch/mips/configs/nlm_xlr_defconfig b/arch/mips/configs/nlm_xlr_defconfig
index 8e99ad807a57..16628fa58ccc 100644
--- a/arch/mips/configs/nlm_xlr_defconfig
+++ b/arch/mips/configs/nlm_xlr_defconfig
@@ -513,7 +513,7 @@ CONFIG_NLS_ISO8859_14=m
CONFIG_NLS_ISO8859_15=m
CONFIG_NLS_KOI8_R=m
CONFIG_NLS_KOI8_U=m
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_UNUSED_SYMBOLS=y
diff --git a/arch/mips/configs/pistachio_defconfig b/arch/mips/configs/pistachio_defconfig
index 8b7429127a1d..022907e3df17 100644
--- a/arch/mips/configs/pistachio_defconfig
+++ b/arch/mips/configs/pistachio_defconfig
@@ -299,7 +299,7 @@ CONFIG_NLS_DEFAULT="utf8"
CONFIG_NLS_CODEPAGE_437=m
CONFIG_NLS_ASCII=m
CONFIG_NLS_ISO8859_1=m
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_INFO=y
CONFIG_MAGIC_SYSRQ=y
CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE=0
diff --git a/arch/mips/configs/qi_lb60_defconfig b/arch/mips/configs/qi_lb60_defconfig
index d7bb8cce1068..e9dd20a123ca 100644
--- a/arch/mips/configs/qi_lb60_defconfig
+++ b/arch/mips/configs/qi_lb60_defconfig
@@ -172,7 +172,7 @@ CONFIG_NLS_ISO8859_15=y
CONFIG_NLS_KOI8_R=y
CONFIG_NLS_KOI8_U=y
CONFIG_NLS_UTF8=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_INFO=y
CONFIG_STRIP_ASM_SYMS=y
CONFIG_READABLE_ASM=y
diff --git a/arch/mips/configs/rt305x_defconfig b/arch/mips/configs/rt305x_defconfig
index d14ae2fa7d13..005754cbee06 100644
--- a/arch/mips/configs/rt305x_defconfig
+++ b/arch/mips/configs/rt305x_defconfig
@@ -145,7 +145,7 @@ CONFIG_JFFS2_COMPRESSION_OPTIONS=y
CONFIG_SQUASHFS=y
# CONFIG_SQUASHFS_ZLIB is not set
CONFIG_SQUASHFS_XZ=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_MAGIC_SYSRQ=y
CONFIG_STRIP_ASM_SYMS=y
diff --git a/arch/mips/configs/xway_defconfig b/arch/mips/configs/xway_defconfig
index 8987846240f7..c293c78d7686 100644
--- a/arch/mips/configs/xway_defconfig
+++ b/arch/mips/configs/xway_defconfig
@@ -143,7 +143,7 @@ CONFIG_JFFS2_COMPRESSION_OPTIONS=y
CONFIG_SQUASHFS=y
# CONFIG_SQUASHFS_ZLIB is not set
CONFIG_SQUASHFS_XZ=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_STRIP_ASM_SYMS=y
CONFIG_DEBUG_FS=y
diff --git a/arch/mips/include/asm/cpu-features.h b/arch/mips/include/asm/cpu-features.h
index e6f19fc61bf2..e961c8a7ea66 100644
--- a/arch/mips/include/asm/cpu-features.h
+++ b/arch/mips/include/asm/cpu-features.h
@@ -204,6 +204,16 @@
#endif
#endif
+/* __builtin_constant_p(cpu_has_mips_r) && cpu_has_mips_r */
+#if !((defined(cpu_has_mips32r1) && cpu_has_mips32r1) || \
+ (defined(cpu_has_mips32r2) && cpu_has_mips32r2) || \
+ (defined(cpu_has_mips32r6) && cpu_has_mips32r6) || \
+ (defined(cpu_has_mips64r1) && cpu_has_mips64r1) || \
+ (defined(cpu_has_mips64r2) && cpu_has_mips64r2) || \
+ (defined(cpu_has_mips64r6) && cpu_has_mips64r6))
+#define CPU_NO_EFFICIENT_FFS 1
+#endif
+
#ifndef cpu_has_mips_1
# define cpu_has_mips_1 (!cpu_has_mips_r6)
#endif
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index e07a105cafc2..a6b611f1da43 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -533,6 +533,7 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define has_transparent_hugepage has_transparent_hugepage
extern int has_transparent_hugepage(void);
static inline int pmd_trans_huge(pmd_t pmd)
diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c
index a6b3dc54260a..411c971e3417 100644
--- a/arch/mips/kernel/process.c
+++ b/arch/mips/kernel/process.c
@@ -73,10 +73,6 @@ void start_thread(struct pt_regs * regs, unsigned long pc, unsigned long sp)
regs->regs[29] = sp;
}
-void exit_thread(void)
-{
-}
-
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
/*
diff --git a/arch/mips/mm/tlb-r4k.c b/arch/mips/mm/tlb-r4k.c
index 5a5c7fec645e..e8b335c16295 100644
--- a/arch/mips/mm/tlb-r4k.c
+++ b/arch/mips/mm/tlb-r4k.c
@@ -405,19 +405,20 @@ void add_wired_entry(unsigned long entrylo0, unsigned long entrylo1,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-int __init has_transparent_hugepage(void)
+int has_transparent_hugepage(void)
{
- unsigned int mask;
- unsigned long flags;
-
- local_irq_save(flags);
- write_c0_pagemask(PM_HUGE_MASK);
- back_to_back_c0_hazard();
- mask = read_c0_pagemask();
- write_c0_pagemask(PM_DEFAULT_MASK);
+ static unsigned int mask = -1;
- local_irq_restore(flags);
+ if (mask == -1) { /* first call comes during __init */
+ unsigned long flags;
+ local_irq_save(flags);
+ write_c0_pagemask(PM_HUGE_MASK);
+ back_to_back_c0_hazard();
+ mask = read_c0_pagemask();
+ write_c0_pagemask(PM_DEFAULT_MASK);
+ local_irq_restore(flags);
+ }
return mask == PM_HUGE_MASK;
}
diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig
index 06ddb5501ab1..9627e81a6cbb 100644
--- a/arch/mn10300/Kconfig
+++ b/arch/mn10300/Kconfig
@@ -1,5 +1,6 @@
config MN10300
def_bool y
+ select HAVE_EXIT_THREAD
select HAVE_OPROFILE
select HAVE_UID16
select GENERIC_IRQ_SHOW
diff --git a/arch/mn10300/configs/asb2364_defconfig b/arch/mn10300/configs/asb2364_defconfig
index fbb96ae3122a..cd0a6cb17dee 100644
--- a/arch/mn10300/configs/asb2364_defconfig
+++ b/arch/mn10300/configs/asb2364_defconfig
@@ -11,7 +11,6 @@ CONFIG_CGROUPS=y
CONFIG_CGROUP_FREEZER=y
CONFIG_CGROUP_DEVICE=y
CONFIG_CGROUP_CPUACCT=y
-CONFIG_RESOURCE_COUNTERS=y
CONFIG_RELAY=y
# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
CONFIG_EXPERT=y
diff --git a/arch/mn10300/include/asm/fpu.h b/arch/mn10300/include/asm/fpu.h
index 738ff72659d5..a47e995d45f3 100644
--- a/arch/mn10300/include/asm/fpu.h
+++ b/arch/mn10300/include/asm/fpu.h
@@ -76,11 +76,9 @@ static inline void unlazy_fpu(struct task_struct *tsk)
preempt_enable();
}
-static inline void exit_fpu(void)
+static inline void exit_fpu(struct task_struct *tsk)
{
#ifdef CONFIG_LAZY_SAVE_FPU
- struct task_struct *tsk = current;
-
preempt_disable();
if (fpu_state_owner == tsk)
fpu_state_owner = NULL;
@@ -123,7 +121,7 @@ static inline void fpu_init_state(void) {}
static inline void fpu_save(struct fpu_state_struct *s) {}
static inline void fpu_kill_state(struct task_struct *tsk) {}
static inline void unlazy_fpu(struct task_struct *tsk) {}
-static inline void exit_fpu(void) {}
+static inline void exit_fpu(struct task_struct *tsk) {}
static inline void flush_fpu(void) {}
static inline int fpu_setup_sigcontext(struct fpucontext *buf) { return 0; }
static inline int fpu_restore_sigcontext(struct fpucontext *buf) { return 0; }
diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c
index 3707da583d05..cbede4e88dee 100644
--- a/arch/mn10300/kernel/process.c
+++ b/arch/mn10300/kernel/process.c
@@ -103,9 +103,9 @@ void show_regs(struct pt_regs *regs)
/*
* free current thread data structures etc..
*/
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
{
- exit_fpu();
+ exit_fpu(tsk);
}
void flush_thread(void)
diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig
index 87ca653eb5f3..51a56c8b04b4 100644
--- a/arch/nios2/Kconfig
+++ b/arch/nios2/Kconfig
@@ -15,6 +15,7 @@ config NIOS2
select SOC_BUS
select SPARSE_IRQ
select USB_ARCH_HAS_HCD if USB_SUPPORT
+ select CPU_NO_EFFICIENT_FFS
config GENERIC_CSUM
def_bool y
diff --git a/arch/nios2/include/asm/processor.h b/arch/nios2/include/asm/processor.h
index c2ba45c159c7..1c953f0cadbf 100644
--- a/arch/nios2/include/asm/processor.h
+++ b/arch/nios2/include/asm/processor.h
@@ -75,11 +75,6 @@ static inline void release_thread(struct task_struct *dead_task)
{
}
-/* Free current thread data structures etc.. */
-static inline void exit_thread(void)
-{
-}
-
/* Return saved PC of a blocked thread. */
#define thread_saved_pc(tsk) ((tsk)->thread.kregs->ea)
diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig
index e118c02cc79a..142cb057c41b 100644
--- a/arch/openrisc/Kconfig
+++ b/arch/openrisc/Kconfig
@@ -25,6 +25,7 @@ config OPENRISC
select MODULES_USE_ELF_RELA
select HAVE_DEBUG_STACKOVERFLOW
select OR1K_PIC
+ select CPU_NO_EFFICIENT_FFS if !OPENRISC_HAVE_INST_FF1
config MMU
def_bool y
diff --git a/arch/openrisc/include/asm/processor.h b/arch/openrisc/include/asm/processor.h
index 4d235e3d2534..70334c9f7d24 100644
--- a/arch/openrisc/include/asm/processor.h
+++ b/arch/openrisc/include/asm/processor.h
@@ -85,15 +85,6 @@ void release_thread(struct task_struct *);
unsigned long get_wchan(struct task_struct *p);
/*
- * Free current thread data structures etc..
- */
-
-extern inline void exit_thread(void)
-{
- /* Nothing needs to be done. */
-}
-
-/*
* Return saved PC of a blocked thread. For now, this is the "user" PC
*/
extern unsigned long thread_saved_pc(struct task_struct *t);
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 766da0653599..dc117385ce2e 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -35,6 +35,7 @@ config PARISC
select HAVE_ARCH_TRACEHOOK
select HAVE_UNSTABLE_SCHED_CLOCK if (SMP || !64BIT)
select ARCH_NO_COHERENT_DMA_MMAP
+ select CPU_NO_EFFICIENT_FFS
help
The PA-RISC microprocessor is designed by Hewlett-Packard and used
diff --git a/arch/parisc/configs/generic-64bit_defconfig b/arch/parisc/configs/generic-64bit_defconfig
index e945c08892fa..df76193398cc 100644
--- a/arch/parisc/configs/generic-64bit_defconfig
+++ b/arch/parisc/configs/generic-64bit_defconfig
@@ -279,7 +279,7 @@ CONFIG_NLS_ASCII=m
CONFIG_NLS_ISO8859_1=m
CONFIG_NLS_ISO8859_2=m
CONFIG_NLS_UTF8=m
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_STRIP_ASM_SYMS=y
CONFIG_UNUSED_SYMBOLS=y
CONFIG_DEBUG_FS=y
diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c
index 809905a811ed..40639439d8b3 100644
--- a/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@ -144,13 +144,6 @@ void machine_power_off(void)
void (*pm_power_off)(void) = machine_power_off;
EXPORT_SYMBOL(pm_power_off);
-/*
- * Free current thread data structures etc..
- */
-void exit_thread(void)
-{
-}
-
void flush_thread(void)
{
/* Only needs to handle fpu stuff or perf monitors.
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index f0403b58ae8b..01f7464d9fea 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -155,6 +155,7 @@ config PPC
select NO_BOOTMEM
select HAVE_GENERIC_RCU_GUP
select HAVE_PERF_EVENTS_NMI if PPC64
+ select HAVE_NMI if PERF_EVENTS
select EDAC_SUPPORT
select EDAC_ATOMIC_SCRUB
select ARCH_HAS_DMA_SET_COHERENT_MASK
diff --git a/arch/powerpc/configs/40x/virtex_defconfig b/arch/powerpc/configs/40x/virtex_defconfig
index bcb0c4d854db..bb076ef2e5da 100644
--- a/arch/powerpc/configs/40x/virtex_defconfig
+++ b/arch/powerpc/configs/40x/virtex_defconfig
@@ -72,7 +72,7 @@ CONFIG_CRC_CCITT=y
CONFIG_FONTS=y
CONFIG_FONT_8x8=y
CONFIG_FONT_8x16=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_INFO=y
CONFIG_DEBUG_KERNEL=y
# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/powerpc/configs/44x/virtex5_defconfig b/arch/powerpc/configs/44x/virtex5_defconfig
index 53d0300b3390..f5cf6571c518 100644
--- a/arch/powerpc/configs/44x/virtex5_defconfig
+++ b/arch/powerpc/configs/44x/virtex5_defconfig
@@ -71,7 +71,7 @@ CONFIG_CRC_CCITT=y
CONFIG_FONTS=y
CONFIG_FONT_8x8=y
CONFIG_FONT_8x16=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_INFO=y
CONFIG_DEBUG_KERNEL=y
# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/powerpc/configs/44x/warp_defconfig b/arch/powerpc/configs/44x/warp_defconfig
index ee434375fc24..e925959ed912 100644
--- a/arch/powerpc/configs/44x/warp_defconfig
+++ b/arch/powerpc/configs/44x/warp_defconfig
@@ -93,7 +93,7 @@ CONFIG_NLS_ISO8859_15=y
CONFIG_NLS_UTF8=y
CONFIG_CRC_CCITT=y
CONFIG_CRC_T10DIF=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_INFO=y
CONFIG_DEBUG_FS=y
CONFIG_MAGIC_SYSRQ=y
diff --git a/arch/powerpc/configs/52xx/cm5200_defconfig b/arch/powerpc/configs/52xx/cm5200_defconfig
index 19fad0e0016e..3cc0a427a05c 100644
--- a/arch/powerpc/configs/52xx/cm5200_defconfig
+++ b/arch/powerpc/configs/52xx/cm5200_defconfig
@@ -74,7 +74,7 @@ CONFIG_NFS_V4=y
CONFIG_ROOT_NFS=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DETECT_HUNG_TASK=y
# CONFIG_DEBUG_BUGVERBOSE is not set
CONFIG_CRYPTO_ECB=y
diff --git a/arch/powerpc/configs/52xx/lite5200b_defconfig b/arch/powerpc/configs/52xx/lite5200b_defconfig
index 5f40ba92a39a..5a24f604e5de 100644
--- a/arch/powerpc/configs/52xx/lite5200b_defconfig
+++ b/arch/powerpc/configs/52xx/lite5200b_defconfig
@@ -60,7 +60,7 @@ CONFIG_TMPFS=y
CONFIG_NFS_FS=y
CONFIG_NFS_V4=y
CONFIG_ROOT_NFS=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_INFO=y
CONFIG_DETECT_HUNG_TASK=y
# CONFIG_DEBUG_BUGVERBOSE is not set
diff --git a/arch/powerpc/configs/52xx/motionpro_defconfig b/arch/powerpc/configs/52xx/motionpro_defconfig
index 909e185a88d1..e7ef6c8ae75b 100644
--- a/arch/powerpc/configs/52xx/motionpro_defconfig
+++ b/arch/powerpc/configs/52xx/motionpro_defconfig
@@ -86,7 +86,7 @@ CONFIG_NFS_V4=y
CONFIG_ROOT_NFS=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_INFO=y
CONFIG_DETECT_HUNG_TASK=y
# CONFIG_DEBUG_BUGVERBOSE is not set
diff --git a/arch/powerpc/configs/52xx/tqm5200_defconfig b/arch/powerpc/configs/52xx/tqm5200_defconfig
index efab8388ea92..5a91b504fcd3 100644
--- a/arch/powerpc/configs/52xx/tqm5200_defconfig
+++ b/arch/powerpc/configs/52xx/tqm5200_defconfig
@@ -88,7 +88,7 @@ CONFIG_NFS_V4=y
CONFIG_ROOT_NFS=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_INFO=y
CONFIG_DETECT_HUNG_TASK=y
# CONFIG_DEBUG_BUGVERBOSE is not set
diff --git a/arch/powerpc/configs/gamecube_defconfig b/arch/powerpc/configs/gamecube_defconfig
index 6c6c60f1aba4..d4dbe36fd2d5 100644
--- a/arch/powerpc/configs/gamecube_defconfig
+++ b/arch/powerpc/configs/gamecube_defconfig
@@ -92,7 +92,7 @@ CONFIG_CIFS=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
CONFIG_CRC_CCITT=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_SPINLOCK=y
CONFIG_DEBUG_MUTEXES=y
CONFIG_LATENCYTOP=y
diff --git a/arch/powerpc/configs/mpc5200_defconfig b/arch/powerpc/configs/mpc5200_defconfig
index 9fd041bfd778..4b7dc3363367 100644
--- a/arch/powerpc/configs/mpc5200_defconfig
+++ b/arch/powerpc/configs/mpc5200_defconfig
@@ -128,7 +128,7 @@ CONFIG_NFS_V4=y
CONFIG_ROOT_NFS=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_INFO=y
CONFIG_DEBUG_KERNEL=y
CONFIG_DETECT_HUNG_TASK=y
diff --git a/arch/powerpc/configs/pasemi_defconfig b/arch/powerpc/configs/pasemi_defconfig
index 8f94782eb907..b025444fb2ac 100644
--- a/arch/powerpc/configs/pasemi_defconfig
+++ b/arch/powerpc/configs/pasemi_defconfig
@@ -171,7 +171,7 @@ CONFIG_NFSD_V4=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
CONFIG_CRC_CCITT=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_FS=y
CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_KERNEL=y
diff --git a/arch/powerpc/configs/wii_defconfig b/arch/powerpc/configs/wii_defconfig
index 34eaf528fa87..2d9ee188fc93 100644
--- a/arch/powerpc/configs/wii_defconfig
+++ b/arch/powerpc/configs/wii_defconfig
@@ -113,7 +113,7 @@ CONFIG_CIFS=m
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
CONFIG_CRC_CCITT=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_SPINLOCK=y
CONFIG_DEBUG_MUTEXES=y
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 48dc76c13094..6aed7f6c24f3 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -829,6 +829,7 @@ extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd);
extern int hash__has_transparent_hugepage(void);
+#define has_transparent_hugepage has_transparent_hugepage
static inline int has_transparent_hugepage(void)
{
if (radix_enabled())
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 47897a30982d..ee09e99097f0 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -65,7 +65,6 @@ extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
struct page **pages, int *nr);
#ifndef CONFIG_TRANSPARENT_HUGEPAGE
#define pmd_large(pmd) 0
-#define has_transparent_hugepage() 0
#endif
pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
bool *is_thp, unsigned *shift);
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index ea8a28fd6f31..e2f12cbcade9 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1329,10 +1329,6 @@ void show_regs(struct pt_regs * regs)
show_instructions(regs);
}
-void exit_thread(void)
-{
-}
-
void flush_thread(void)
{
#ifdef CONFIG_HAVE_HW_BREAKPOINT
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 790c5de3b183..5aac1a3f86cd 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -775,8 +775,10 @@ static int __init hugepage_setup_sz(char *str)
size = memparse(str, &str);
- if (add_huge_page_size(size) != 0)
- printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);
+ if (add_huge_page_size(size) != 0) {
+ hugetlb_bad_size();
+ pr_err("Invalid huge page size specified(%llu)\n", size);
+ }
return 1;
}
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index de0fcc08dff5..a8c259059adf 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -123,6 +123,7 @@ config S390
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_EARLY_PFN_TO_NID
select HAVE_ARCH_JUMP_LABEL
+ select CPU_NO_EFFICIENT_FFS if !HAVE_MARCH_Z9_109_FEATURES
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_SOFT_DIRTY
select HAVE_ARCH_TRACEHOOK
@@ -134,6 +135,7 @@ config S390
select HAVE_DMA_API_DEBUG
select HAVE_DYNAMIC_FTRACE
select HAVE_DYNAMIC_FTRACE_WITH_REGS
+ select HAVE_EXIT_THREAD
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER
@@ -165,6 +167,7 @@ config S390
select TTY
select VIRT_CPU_ACCOUNTING
select VIRT_TO_BUS
+ select HAVE_NMI
config SCHED_OMIT_FRAME_POINTER
diff --git a/arch/s390/configs/default_defconfig b/arch/s390/configs/default_defconfig
index 0ac42cc4f880..5e3e73efad61 100644
--- a/arch/s390/configs/default_defconfig
+++ b/arch/s390/configs/default_defconfig
@@ -534,7 +534,7 @@ CONFIG_NLS_ISO8859_1=m
CONFIG_NLS_ISO8859_15=m
CONFIG_NLS_UTF8=m
CONFIG_DLM=m
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DYNAMIC_DEBUG=y
CONFIG_DEBUG_INFO=y
CONFIG_FRAME_WARN=1024
diff --git a/arch/s390/configs/gcov_defconfig b/arch/s390/configs/gcov_defconfig
index a31dcd56f7c0..aedc828fac01 100644
--- a/arch/s390/configs/gcov_defconfig
+++ b/arch/s390/configs/gcov_defconfig
@@ -528,7 +528,7 @@ CONFIG_NLS_ISO8859_1=m
CONFIG_NLS_ISO8859_15=m
CONFIG_NLS_UTF8=m
CONFIG_DLM=m
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_INFO=y
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_FRAME_WARN=1024
diff --git a/arch/s390/configs/performance_defconfig b/arch/s390/configs/performance_defconfig
index 7b73bf353345..b5e9dc1fa8f9 100644
--- a/arch/s390/configs/performance_defconfig
+++ b/arch/s390/configs/performance_defconfig
@@ -528,7 +528,7 @@ CONFIG_NLS_ISO8859_1=m
CONFIG_NLS_ISO8859_15=m
CONFIG_NLS_UTF8=m
CONFIG_DLM=m
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_INFO=y
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_FRAME_WARN=1024
diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig
index 1719843a55a2..ef2a9b65b414 100644
--- a/arch/s390/configs/zfcpdump_defconfig
+++ b/arch/s390/configs/zfcpdump_defconfig
@@ -56,7 +56,7 @@ CONFIG_RAW_DRIVER=y
# CONFIG_INOTIFY_USER is not set
CONFIG_CONFIGFS_FS=y
# CONFIG_MISC_FILESYSTEMS is not set
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DEBUG_INFO=y
CONFIG_DEBUG_FS=y
CONFIG_DEBUG_KERNEL=y
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 2f66645587a2..18d2beb89340 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1223,6 +1223,7 @@ static inline int pmd_trans_huge(pmd_t pmd)
return pmd_val(pmd) & _SEGMENT_ENTRY_LARGE;
}
+#define has_transparent_hugepage has_transparent_hugepage
static inline int has_transparent_hugepage(void)
{
return MACHINE_HAS_HPAGE ? 1 : 0;
diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c
index 2f1b7217c25c..0e64f08d3d69 100644
--- a/arch/s390/kernel/machine_kexec.c
+++ b/arch/s390/kernel/machine_kexec.c
@@ -43,13 +43,13 @@ static int machine_kdump_pm_cb(struct notifier_block *nb, unsigned long action,
switch (action) {
case PM_SUSPEND_PREPARE:
case PM_HIBERNATION_PREPARE:
- if (crashk_res.start)
- crash_map_reserved_pages();
+ if (kexec_crash_image)
+ arch_kexec_unprotect_crashkres();
break;
case PM_POST_SUSPEND:
case PM_POST_HIBERNATION:
- if (crashk_res.start)
- crash_unmap_reserved_pages();
+ if (kexec_crash_image)
+ arch_kexec_protect_crashkres();
break;
default:
return NOTIFY_DONE;
@@ -60,6 +60,8 @@ static int machine_kdump_pm_cb(struct notifier_block *nb, unsigned long action,
static int __init machine_kdump_pm_init(void)
{
pm_notifier(machine_kdump_pm_cb, 0);
+ /* Create initial mapping for crashkernel memory */
+ arch_kexec_unprotect_crashkres();
return 0;
}
arch_initcall(machine_kdump_pm_init);
@@ -146,6 +148,8 @@ static int kdump_csum_valid(struct kimage *image)
#endif
}
+#ifdef CONFIG_CRASH_DUMP
+
/*
* Map or unmap crashkernel memory
*/
@@ -167,21 +171,25 @@ static void crash_map_pages(int enable)
}
/*
- * Map crashkernel memory
+ * Unmap crashkernel memory
*/
-void crash_map_reserved_pages(void)
+void arch_kexec_protect_crashkres(void)
{
- crash_map_pages(1);
+ if (crashk_res.end)
+ crash_map_pages(0);
}
/*
- * Unmap crashkernel memory
+ * Map crashkernel memory
*/
-void crash_unmap_reserved_pages(void)
+void arch_kexec_unprotect_crashkres(void)
{
- crash_map_pages(0);
+ if (crashk_res.end)
+ crash_map_pages(1);
}
+#endif
+
/*
* Give back memory to hypervisor before new kdump is loaded
*/
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 481d7a83efc6..bba4fa74b321 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -68,9 +68,10 @@ extern void kernel_thread_starter(void);
/*
* Free current thread data structures etc..
*/
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
{
- exit_thread_runtime_instr();
+ if (tsk == current)
+ exit_thread_runtime_instr();
}
void flush_thread(void)
diff --git a/arch/score/Kconfig b/arch/score/Kconfig
index 366e1b599a7b..507d63181389 100644
--- a/arch/score/Kconfig
+++ b/arch/score/Kconfig
@@ -14,6 +14,7 @@ config SCORE
select VIRT_TO_BUS
select MODULES_USE_ELF_REL
select CLONE_BACKWARDS
+ select CPU_NO_EFFICIENT_FFS
choice
prompt "System type"
diff --git a/arch/score/kernel/process.c b/arch/score/kernel/process.c
index a1519ad3d49d..aae9480706c2 100644
--- a/arch/score/kernel/process.c
+++ b/arch/score/kernel/process.c
@@ -56,8 +56,6 @@ void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long sp)
regs->regs[0] = sp;
}
-void exit_thread(void) {}
-
/*
* When a process does an "exec", machine state like FPU and debug
* registers need to be reset. This is a hook function for that.
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 7ed20fc3fc81..e803a836cb7c 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -20,6 +20,7 @@ config SUPERH
select PERF_USE_VMALLOC
select HAVE_DEBUG_KMEMLEAK
select HAVE_KERNEL_GZIP
+ select CPU_NO_EFFICIENT_FFS
select HAVE_KERNEL_BZIP2
select HAVE_KERNEL_LZMA
select HAVE_KERNEL_XZ
@@ -44,6 +45,7 @@ config SUPERH
select OLD_SIGSUSPEND
select OLD_SIGACTION
select HAVE_ARCH_AUDITSYSCALL
+ select HAVE_NMI
help
The SuperH is a RISC processor targeted for use in embedded systems
and consumer electronics; it was also used in the Sega Dreamcast
@@ -71,6 +73,7 @@ config SUPERH32
config SUPERH64
def_bool ARCH = "sh64"
+ select HAVE_EXIT_THREAD
select KALLSYMS
config ARCH_DEFCONFIG
diff --git a/arch/sh/configs/apsh4ad0a_defconfig b/arch/sh/configs/apsh4ad0a_defconfig
index a8d975793b6d..fe45d2c9b151 100644
--- a/arch/sh/configs/apsh4ad0a_defconfig
+++ b/arch/sh/configs/apsh4ad0a_defconfig
@@ -10,7 +10,6 @@ CONFIG_CGROUPS=y
CONFIG_CGROUP_FREEZER=y
CONFIG_CGROUP_DEVICE=y
CONFIG_CGROUP_CPUACCT=y
-CONFIG_RESOURCE_COUNTERS=y
CONFIG_CGROUP_MEMCG=y
CONFIG_BLK_CGROUP=y
CONFIG_NAMESPACES=y
diff --git a/arch/sh/configs/edosk7760_defconfig b/arch/sh/configs/edosk7760_defconfig
index e1077a041ac3..e588da2ba566 100644
--- a/arch/sh/configs/edosk7760_defconfig
+++ b/arch/sh/configs/edosk7760_defconfig
@@ -109,7 +109,7 @@ CONFIG_NLS_ASCII=y
CONFIG_NLS_ISO8859_1=y
CONFIG_NLS_ISO8859_15=y
CONFIG_NLS_UTF8=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_MAGIC_SYSRQ=y
CONFIG_UNUSED_SYMBOLS=y
diff --git a/arch/sh/configs/sdk7786_defconfig b/arch/sh/configs/sdk7786_defconfig
index e7e56a4131b4..280695c64d3e 100644
--- a/arch/sh/configs/sdk7786_defconfig
+++ b/arch/sh/configs/sdk7786_defconfig
@@ -17,7 +17,6 @@ CONFIG_CGROUP_DEVICE=y
CONFIG_CPUSETS=y
# CONFIG_PROC_PID_CPUSET is not set
CONFIG_CGROUP_CPUACCT=y
-CONFIG_RESOURCE_COUNTERS=y
CONFIG_CGROUP_MEMCG=y
CONFIG_CGROUP_MEMCG_SWAP=y
CONFIG_CGROUP_SCHED=y
@@ -216,7 +215,7 @@ CONFIG_NLS_ASCII=m
CONFIG_NLS_ISO8859_1=y
CONFIG_NLS_ISO8859_15=m
CONFIG_NLS_UTF8=m
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_MAGIC_SYSRQ=y
diff --git a/arch/sh/configs/se7206_defconfig b/arch/sh/configs/se7206_defconfig
index 6bc30ab9fd18..91853a67ec34 100644
--- a/arch/sh/configs/se7206_defconfig
+++ b/arch/sh/configs/se7206_defconfig
@@ -10,7 +10,6 @@ CONFIG_CGROUPS=y
CONFIG_CGROUP_DEBUG=y
CONFIG_CGROUP_DEVICE=y
CONFIG_CGROUP_CPUACCT=y
-CONFIG_RESOURCE_COUNTERS=y
CONFIG_CGROUP_MEMCG=y
CONFIG_RELAY=y
CONFIG_NAMESPACES=y
diff --git a/arch/sh/configs/se7722_defconfig b/arch/sh/configs/se7722_defconfig
index ae998c7e2ee0..09aac57eddb7 100644
--- a/arch/sh/configs/se7722_defconfig
+++ b/arch/sh/configs/se7722_defconfig
@@ -53,7 +53,7 @@ CONFIG_EXT3_FS=y
CONFIG_PROC_KCORE=y
CONFIG_TMPFS=y
CONFIG_HUGETLBFS=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_FS=y
diff --git a/arch/sh/configs/sh7785lcr_32bit_defconfig b/arch/sh/configs/sh7785lcr_32bit_defconfig
index 9bdcf72ec06a..bdb7783a63ee 100644
--- a/arch/sh/configs/sh7785lcr_32bit_defconfig
+++ b/arch/sh/configs/sh7785lcr_32bit_defconfig
@@ -144,7 +144,7 @@ CONFIG_NFSD_V3=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_CODEPAGE_932=y
CONFIG_NLS_ISO8859_1=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_KERNEL=y
diff --git a/arch/sh/configs/shx3_defconfig b/arch/sh/configs/shx3_defconfig
index cd6c519f8fad..4a4269ad5b04 100644
--- a/arch/sh/configs/shx3_defconfig
+++ b/arch/sh/configs/shx3_defconfig
@@ -12,7 +12,6 @@ CONFIG_CGROUPS=y
CONFIG_CGROUP_FREEZER=y
CONFIG_CGROUP_DEVICE=y
CONFIG_CGROUP_CPUACCT=y
-CONFIG_RESOURCE_COUNTERS=y
CONFIG_CGROUP_MEMCG=y
CONFIG_RELAY=y
CONFIG_NAMESPACES=y
diff --git a/arch/sh/configs/urquell_defconfig b/arch/sh/configs/urquell_defconfig
index 1e843dbed5f0..79723309fcfa 100644
--- a/arch/sh/configs/urquell_defconfig
+++ b/arch/sh/configs/urquell_defconfig
@@ -14,7 +14,6 @@ CONFIG_CGROUP_DEVICE=y
CONFIG_CPUSETS=y
# CONFIG_PROC_PID_CPUSET is not set
CONFIG_CGROUP_CPUACCT=y
-CONFIG_RESOURCE_COUNTERS=y
CONFIG_CGROUP_MEMCG=y
CONFIG_CGROUP_MEMCG_SWAP=y
CONFIG_CGROUP_SCHED=y
@@ -143,7 +142,7 @@ CONFIG_ROOT_NFS=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_CODEPAGE_932=y
CONFIG_NLS_ISO8859_1=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_DEBUG_FS=y
diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c
index 2885fc9d9dcd..ee12e9451874 100644
--- a/arch/sh/kernel/process_32.c
+++ b/arch/sh/kernel/process_32.c
@@ -76,13 +76,6 @@ void start_thread(struct pt_regs *regs, unsigned long new_pc,
}
EXPORT_SYMBOL(start_thread);
-/*
- * Free current thread data structures etc..
- */
-void exit_thread(void)
-{
-}
-
void flush_thread(void)
{
struct task_struct *tsk = current;
diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c
index e2062e643341..9d3e9916555d 100644
--- a/arch/sh/kernel/process_64.c
+++ b/arch/sh/kernel/process_64.c
@@ -288,7 +288,7 @@ void show_regs(struct pt_regs *regs)
/*
* Free current thread data structures etc..
*/
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
{
/*
* See arch/sparc/kernel/process.c for the precedent for doing
@@ -307,9 +307,8 @@ void exit_thread(void)
* which it would get safely nulled.
*/
#ifdef CONFIG_SH_FPU
- if (last_task_used_math == current) {
+ if (last_task_used_math == tsk)
last_task_used_math = NULL;
- }
#endif
}
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index db0a26cffa97..546293d9e6c5 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -20,6 +20,7 @@ config SPARC
select HAVE_OPROFILE
select HAVE_ARCH_KGDB if !SMP || SPARC64
select HAVE_ARCH_TRACEHOOK
+ select HAVE_EXIT_THREAD
select SYSCTL_EXCEPTION_TRACE
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select RTC_CLASS
@@ -41,6 +42,7 @@ config SPARC
select ODD_RT_SIGACTION
select OLD_SIGSUSPEND
select ARCH_HAS_SG_CHAIN
+ select CPU_NO_EFFICIENT_FFS
config SPARC32
def_bool !64BIT
@@ -78,6 +80,7 @@ config SPARC64
select NO_BOOTMEM
select HAVE_ARCH_AUDITSYSCALL
select ARCH_SUPPORTS_ATOMIC_RMW
+ select HAVE_NMI
config ARCH_DEFCONFIG
string
diff --git a/arch/sparc/configs/sparc64_defconfig b/arch/sparc/configs/sparc64_defconfig
index 3583d676a916..e4bbc76eb63c 100644
--- a/arch/sparc/configs/sparc64_defconfig
+++ b/arch/sparc/configs/sparc64_defconfig
@@ -202,7 +202,7 @@ CONFIG_EXT3_FS_SECURITY=y
CONFIG_PROC_KCORE=y
CONFIG_TMPFS=y
CONFIG_HUGETLBFS=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
# CONFIG_ENABLE_WARN_DEPRECATED is not set
CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_KERNEL=y
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index f089cfa249f3..93ce0ada3c63 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -681,8 +681,6 @@ static inline unsigned long pmd_trans_huge(pmd_t pmd)
return pte_val(pte) & _PAGE_PMD_HUGE;
}
-#define has_transparent_hugepage() 1
-
static inline pmd_t pmd_mkold(pmd_t pmd)
{
pte_t pte = __pte(pmd_val(pmd));
diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c
index c5113c7ce2fd..b7780a5bef11 100644
--- a/arch/sparc/kernel/process_32.c
+++ b/arch/sparc/kernel/process_32.c
@@ -184,21 +184,21 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
/*
* Free current thread data structures etc..
*/
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
{
#ifndef CONFIG_SMP
- if(last_task_used_math == current) {
+ if (last_task_used_math == tsk) {
#else
- if (test_thread_flag(TIF_USEDFPU)) {
+ if (test_ti_thread_flag(task_thread_info(tsk), TIF_USEDFPU)) {
#endif
/* Keep process from leaving FPU in a bogon state. */
put_psr(get_psr() | PSR_EF);
- fpsave(&current->thread.float_regs[0], &current->thread.fsr,
- &current->thread.fpqueue[0], &current->thread.fpqdepth);
+ fpsave(&tsk->thread.float_regs[0], &tsk->thread.fsr,
+ &tsk->thread.fpqueue[0], &tsk->thread.fpqdepth);
#ifndef CONFIG_SMP
last_task_used_math = NULL;
#else
- clear_thread_flag(TIF_USEDFPU);
+ clear_ti_thread_flag(task_thread_info(tsk), TIF_USEDFPU);
#endif
}
}
diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c
index c16ef1af1843..fa14402b33f9 100644
--- a/arch/sparc/kernel/process_64.c
+++ b/arch/sparc/kernel/process_64.c
@@ -417,9 +417,9 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
}
/* Free current thread data structures etc.. */
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
{
- struct thread_info *t = current_thread_info();
+ struct thread_info *t = task_thread_info(tsk);
if (t->utraps) {
if (t->utraps[0] < 2)
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index c3bbb295bc4a..4820a02838ac 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -24,7 +24,9 @@ config TILE
select HAVE_DEBUG_KMEMLEAK
select HAVE_DEBUG_STACKOVERFLOW
select HAVE_DMA_API_DEBUG
+ select HAVE_EXIT_THREAD
select HAVE_KVM if !TILEGX
+ select HAVE_NMI if USE_PMC
select HAVE_PERF_EVENTS
select HAVE_SYSCALL_TRACEPOINTS
select MODULES_USE_ELF_RELA
diff --git a/arch/tile/configs/tilegx_defconfig b/arch/tile/configs/tilegx_defconfig
index dea47c31ab16..fd122ef45b00 100644
--- a/arch/tile/configs/tilegx_defconfig
+++ b/arch/tile/configs/tilegx_defconfig
@@ -16,7 +16,6 @@ CONFIG_CGROUP_DEBUG=y
CONFIG_CGROUP_DEVICE=y
CONFIG_CPUSETS=y
CONFIG_CGROUP_CPUACCT=y
-CONFIG_RESOURCE_COUNTERS=y
CONFIG_CGROUP_SCHED=y
CONFIG_RT_GROUP_SCHED=y
CONFIG_BLK_CGROUP=y
diff --git a/arch/tile/configs/tilepro_defconfig b/arch/tile/configs/tilepro_defconfig
index 95743eedf747..eb6a55944191 100644
--- a/arch/tile/configs/tilepro_defconfig
+++ b/arch/tile/configs/tilepro_defconfig
@@ -15,7 +15,6 @@ CONFIG_CGROUP_DEBUG=y
CONFIG_CGROUP_DEVICE=y
CONFIG_CPUSETS=y
CONFIG_CGROUP_CPUACCT=y
-CONFIG_RESOURCE_COUNTERS=y
CONFIG_CGROUP_SCHED=y
CONFIG_RT_GROUP_SCHED=y
CONFIG_BLK_CGROUP=y
diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h
index 96cecf55522e..2a26cc4fefc2 100644
--- a/arch/tile/include/asm/pgtable.h
+++ b/arch/tile/include/asm/pgtable.h
@@ -487,7 +487,6 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define has_transparent_hugepage() 1
#define pmd_trans_huge pmd_huge_page
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c
index b5f30d376ce1..6b705ccc9cc1 100644
--- a/arch/tile/kernel/process.c
+++ b/arch/tile/kernel/process.c
@@ -541,7 +541,7 @@ void flush_thread(void)
/*
* Free current thread data structures etc..
*/
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
{
#ifdef CONFIG_HARDWALL
/*
@@ -550,7 +550,7 @@ void exit_thread(void)
* the last reference to a hardwall fd, it would already have
* been released and deactivated at this point.)
*/
- hardwall_deactivate_all(current);
+ hardwall_deactivate_all(tsk);
#endif
}
diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c
index a992238e9b58..153020abd2f5 100644
--- a/arch/tile/kernel/setup.c
+++ b/arch/tile/kernel/setup.c
@@ -962,9 +962,7 @@ static void __init setup_numa_mapping(void)
cpumask_set_cpu(best_cpu, &node_2_cpu_mask[node]);
cpu_2_node[best_cpu] = node;
cpumask_clear_cpu(best_cpu, &unbound_cpus);
- node = next_node(node, default_nodes);
- if (node == MAX_NUMNODES)
- node = first_node(default_nodes);
+ node = next_node_in(node, default_nodes);
}
/* Print out node assignments and set defaults for disabled cpus */
diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c
index e212c64682c5..77ceaa343fce 100644
--- a/arch/tile/mm/hugetlbpage.c
+++ b/arch/tile/mm/hugetlbpage.c
@@ -308,11 +308,16 @@ static bool saw_hugepagesz;
static __init int setup_hugepagesz(char *opt)
{
+ int rc;
+
if (!saw_hugepagesz) {
saw_hugepagesz = true;
memset(huge_shift, 0, sizeof(huge_shift));
}
- return __setup_hugepagesz(memparse(opt, NULL));
+ rc = __setup_hugepagesz(memparse(opt, NULL));
+ if (rc)
+ hugetlb_bad_size();
+ return rc;
}
__setup("hugepagesz=", setup_hugepagesz);
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
index a0582b7f41d3..adce25462b0d 100644
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -679,7 +679,7 @@ static void __init init_free_pfn_range(unsigned long start, unsigned long end)
* Hacky direct set to avoid unnecessary
* lock take/release for EVERY page here.
*/
- p->_count.counter = 0;
+ p->_refcount.counter = 0;
p->_mapcount.counter = -1;
}
init_page_count(page);
diff --git a/arch/um/configs/i386_defconfig b/arch/um/configs/i386_defconfig
index a12bf68c9f3a..5636221b8785 100644
--- a/arch/um/configs/i386_defconfig
+++ b/arch/um/configs/i386_defconfig
@@ -17,7 +17,6 @@ CONFIG_CGROUP_FREEZER=y
CONFIG_CGROUP_DEVICE=y
CONFIG_CPUSETS=y
CONFIG_CGROUP_CPUACCT=y
-CONFIG_RESOURCE_COUNTERS=y
CONFIG_CGROUP_SCHED=y
CONFIG_BLK_CGROUP=y
# CONFIG_PID_NS is not set
diff --git a/arch/um/configs/x86_64_defconfig b/arch/um/configs/x86_64_defconfig
index 3aab117bd553..7a67b7ac1a7e 100644
--- a/arch/um/configs/x86_64_defconfig
+++ b/arch/um/configs/x86_64_defconfig
@@ -15,7 +15,6 @@ CONFIG_CGROUP_FREEZER=y
CONFIG_CGROUP_DEVICE=y
CONFIG_CPUSETS=y
CONFIG_CGROUP_CPUACCT=y
-CONFIG_RESOURCE_COUNTERS=y
CONFIG_CGROUP_SCHED=y
CONFIG_BLK_CGROUP=y
# CONFIG_PID_NS is not set
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index 48af59aae129..0b04711f1f18 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -103,10 +103,6 @@ void interrupt_end(void)
tracehook_notify_resume(regs);
}
-void exit_thread(void)
-{
-}
-
int get_current_pid(void)
{
return task_pid_nr(current);
diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c
index b008e9961465..00299c927852 100644
--- a/arch/unicore32/kernel/process.c
+++ b/arch/unicore32/kernel/process.c
@@ -201,13 +201,6 @@ void show_regs(struct pt_regs *regs)
__backtrace();
}
-/*
- * Free current thread data structures etc..
- */
-void exit_thread(void)
-{
-}
-
void flush_thread(void)
{
struct thread_info *thread = current_thread_info();
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 15f827678843..48ac29034e1e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -105,6 +105,7 @@ config X86
select HAVE_DYNAMIC_FTRACE
select HAVE_DYNAMIC_FTRACE_WITH_REGS
select HAVE_EFFICIENT_UNALIGNED_ACCESS
+ select HAVE_EXIT_THREAD
select HAVE_FENTRY if X86_64
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FUNCTION_GRAPH_FP_TEST
@@ -130,6 +131,7 @@ config X86
select HAVE_MEMBLOCK
select HAVE_MEMBLOCK_NODE_MAP
select HAVE_MIXED_BREAKPOINTS_REGS
+ select HAVE_NMI
select HAVE_OPROFILE
select HAVE_OPTPROBES
select HAVE_PCSPKR_PLATFORM
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 265901a84f3f..6e7906d10ede 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -17,7 +17,6 @@ CONFIG_CGROUPS=y
CONFIG_CGROUP_FREEZER=y
CONFIG_CPUSETS=y
CONFIG_CGROUP_CPUACCT=y
-CONFIG_RESOURCE_COUNTERS=y
CONFIG_CGROUP_SCHED=y
CONFIG_BLK_DEV_INITRD=y
# CONFIG_COMPAT_BRK is not set
@@ -286,7 +285,7 @@ CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ASCII=y
CONFIG_NLS_ISO8859_1=y
CONFIG_NLS_UTF8=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
# CONFIG_ENABLE_WARN_DEPRECATED is not set
CONFIG_FRAME_WARN=1024
CONFIG_MAGIC_SYSRQ=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 0c8d7963483c..1f912dccc1d7 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -16,7 +16,6 @@ CONFIG_CGROUPS=y
CONFIG_CGROUP_FREEZER=y
CONFIG_CPUSETS=y
CONFIG_CGROUP_CPUACCT=y
-CONFIG_RESOURCE_COUNTERS=y
CONFIG_CGROUP_SCHED=y
CONFIG_BLK_DEV_INITRD=y
# CONFIG_COMPAT_BRK is not set
@@ -285,7 +284,7 @@ CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ASCII=y
CONFIG_NLS_ISO8859_1=y
CONFIG_NLS_UTF8=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
# CONFIG_ENABLE_WARN_DEPRECATED is not set
CONFIG_MAGIC_SYSRQ=y
# CONFIG_UNUSED_SYMBOLS is not set
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index f86491a7bc9d..1a27396b6ea0 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -181,6 +181,7 @@ static inline int pmd_trans_huge(pmd_t pmd)
return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE;
}
+#define has_transparent_hugepage has_transparent_hugepage
static inline int has_transparent_hugepage(void)
{
return boot_cpu_has(X86_FEATURE_PSE);
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 8fd3c0e6d792..d40ec723f799 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -5,6 +5,7 @@
*/
#include <linux/errno.h>
#include <linux/compiler.h>
+#include <linux/kasan-checks.h>
#include <linux/thread_info.h>
#include <linux/string.h>
#include <asm/asm.h>
@@ -740,6 +741,8 @@ copy_from_user(void *to, const void __user *from, unsigned long n)
might_fault();
+ kasan_check_write(to, n);
+
/*
* While we would like to have the compiler do the checking for us
* even in the non-constant size case, any false positives there are
@@ -773,6 +776,8 @@ copy_to_user(void __user *to, const void *from, unsigned long n)
{
int sz = __compiletime_object_size(from);
+ kasan_check_read(from, n);
+
might_fault();
/* See the comment in copy_from_user() above. */
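The new kasan_check_write()/kasan_check_read() calls have KASAN validate the kernel-side buffer before the user copy runs, so a bad destination or source is reported at the copy_*_user() call site rather than as later corruption. A userspace analogue of that ordering, with a hypothetical size-carrying buffer standing in for KASAN's shadow-memory check:

#include <assert.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical stand-in for kasan_check_write(): the destination must be
 * able to hold n bytes. KASAN checks this against shadow memory; here the
 * size is simply carried alongside the pointer. */
struct kbuf {
	char  *ptr;
	size_t size;
};

static void check_write(const struct kbuf *to, size_t n)
{
	assert(n <= to->size);		/* report the bad caller here... */
}

static void copy_from_user_sketch(struct kbuf *to, const char *from, size_t n)
{
	check_write(to, n);		/* ...before the copy can corrupt memory */
	memcpy(to->ptr, from, n);
}

int main(void)
{
	char storage[8];
	struct kbuf dst = { storage, sizeof(storage) };

	copy_from_user_sketch(&dst, "hello", 6);
	printf("%s\n", dst.ptr);
	return 0;
}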
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 307698688fa1..2eac2aa3e37f 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -7,6 +7,7 @@
#include <linux/compiler.h>
#include <linux/errno.h>
#include <linux/lockdep.h>
+#include <linux/kasan-checks.h>
#include <asm/alternative.h>
#include <asm/cpufeatures.h>
#include <asm/page.h>
@@ -109,6 +110,7 @@ static __always_inline __must_check
int __copy_from_user(void *dst, const void __user *src, unsigned size)
{
might_fault();
+ kasan_check_write(dst, size);
return __copy_from_user_nocheck(dst, src, size);
}
@@ -175,6 +177,7 @@ static __always_inline __must_check
int __copy_to_user(void __user *dst, const void *src, unsigned size)
{
might_fault();
+ kasan_check_read(src, size);
return __copy_to_user_nocheck(dst, src, size);
}
@@ -242,12 +245,14 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
static __must_check __always_inline int
__copy_from_user_inatomic(void *dst, const void __user *src, unsigned size)
{
+ kasan_check_write(dst, size);
return __copy_from_user_nocheck(dst, src, size);
}
static __must_check __always_inline int
__copy_to_user_inatomic(void __user *dst, const void *src, unsigned size)
{
+ kasan_check_read(src, size);
return __copy_to_user_nocheck(dst, src, size);
}
@@ -258,6 +263,7 @@ static inline int
__copy_from_user_nocache(void *dst, const void __user *src, unsigned size)
{
might_fault();
+ kasan_check_write(dst, size);
return __copy_user_nocache(dst, src, size, 1);
}
@@ -265,6 +271,7 @@ static inline int
__copy_from_user_inatomic_nocache(void *dst, const void __user *src,
unsigned size)
{
+ kasan_check_write(dst, size);
return __copy_user_nocache(dst, src, size, 0);
}
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 045e424fb368..7788ce643bf4 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -18,7 +18,6 @@
#include <linux/nmi.h>
#include <linux/module.h>
#include <linux/delay.h>
-#include <linux/seq_buf.h>
#ifdef CONFIG_HARDLOCKUP_DETECTOR
u64 hw_nmi_get_sample_period(int watchdog_thresh)
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index ba7fbba9831b..fc3389fc47a2 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -337,6 +337,7 @@ void arch_crash_save_vmcoreinfo(void)
#endif
vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
kaslr_offset());
+ VMCOREINFO_PHYS_BASE(phys_base);
}
/* arch-dependent functionality related to kexec file-based syscall */
@@ -538,3 +539,48 @@ overflow:
return -ENOEXEC;
}
#endif /* CONFIG_KEXEC_FILE */
+
+static int
+kexec_mark_range(unsigned long start, unsigned long end, bool protect)
+{
+ struct page *page;
+ unsigned int nr_pages;
+
+ /*
+ * For physical range: [start, end]. We must skip the unassigned
+ * crashk resource with zero-valued "end" member.
+ */
+ if (!end || start > end)
+ return 0;
+
+ page = pfn_to_page(start >> PAGE_SHIFT);
+ nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
+ if (protect)
+ return set_pages_ro(page, nr_pages);
+ else
+ return set_pages_rw(page, nr_pages);
+}
+
+static void kexec_mark_crashkres(bool protect)
+{
+ unsigned long control;
+
+ kexec_mark_range(crashk_low_res.start, crashk_low_res.end, protect);
+
+ /* Don't touch the control code page used in crash_kexec().*/
+ control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page));
+ /* Control code page is located in the 2nd page. */
+ kexec_mark_range(crashk_res.start, control + PAGE_SIZE - 1, protect);
+ control += KEXEC_CONTROL_PAGE_SIZE;
+ kexec_mark_range(control, crashk_res.end, protect);
+}
+
+void arch_kexec_protect_crashkres(void)
+{
+ kexec_mark_crashkres(true);
+}
+
+void arch_kexec_unprotect_crashkres(void)
+{
+ kexec_mark_crashkres(false);
+}
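kexec_mark_range() converts an inclusive physical byte range into a page count before flipping its protection, skipping unassigned resources whose end is zero. The page arithmetic in isolation, as a userspace sketch:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Same arithmetic as kexec_mark_range(): an inclusive physical byte range
 * [start, end] covers this many whole pages. */
static unsigned long range_nr_pages(unsigned long start, unsigned long end)
{
	return (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
}

int main(void)
{
	/* e.g. a 64 MiB crashkernel region starting at 0x1000000 */
	unsigned long start = 0x1000000;
	unsigned long end   = start + (64UL << 20) - 1;

	printf("%lu pages\n", range_nr_pages(start, end));	/* 16384 on 4K pages */
	return 0;
}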
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 2915d54e9dd5..96becbbb52e0 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -97,10 +97,9 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
/*
* Free current thread data structures etc..
*/
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
{
- struct task_struct *me = current;
- struct thread_struct *t = &me->thread;
+ struct thread_struct *t = &tsk->thread;
unsigned long *bp = t->io_bitmap_ptr;
struct fpu *fpu = &t->fpu;
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 14a95054d4e0..2ae8584b44c7 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -165,6 +165,7 @@ static __init int setup_hugepagesz(char *opt)
} else if (ps == PUD_SIZE && boot_cpu_has(X86_FEATURE_GBPAGES)) {
hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
} else {
+ hugetlb_bad_size();
printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n",
ps >> 20);
return 0;
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index f70c1ff46125..9c086c57105c 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -617,9 +617,7 @@ static void __init numa_init_array(void)
if (early_cpu_to_node(i) != NUMA_NO_NODE)
continue;
numa_set_node(i, rr);
- rr = next_node(rr, node_online_map);
- if (rr == MAX_NUMNODES)
- rr = first_node(node_online_map);
+ rr = next_node_in(rr, node_online_map);
}
}
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index 85257afe71c3..64336f666fb6 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -14,6 +14,7 @@ config XTENSA
select GENERIC_PCI_IOMAP
select GENERIC_SCHED_CLOCK
select HAVE_DMA_API_DEBUG
+ select HAVE_EXIT_THREAD
select HAVE_FUNCTION_TRACER
select HAVE_FUTEX_CMPXCHG if !MMU
select HAVE_HW_BREAKPOINT if PERF_EVENTS
diff --git a/arch/xtensa/configs/audio_kc705_defconfig b/arch/xtensa/configs/audio_kc705_defconfig
index c4904db15582..bc0e69f64cf7 100644
--- a/arch/xtensa/configs/audio_kc705_defconfig
+++ b/arch/xtensa/configs/audio_kc705_defconfig
@@ -123,7 +123,7 @@ CONFIG_ROOT_NFS=y
CONFIG_SUNRPC_DEBUG=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DYNAMIC_DEBUG=y
CONFIG_DEBUG_INFO=y
CONFIG_MAGIC_SYSRQ=y
diff --git a/arch/xtensa/configs/generic_kc705_defconfig b/arch/xtensa/configs/generic_kc705_defconfig
index f4b7b3888da8..f6d88f81e233 100644
--- a/arch/xtensa/configs/generic_kc705_defconfig
+++ b/arch/xtensa/configs/generic_kc705_defconfig
@@ -11,7 +11,6 @@ CONFIG_CGROUP_FREEZER=y
CONFIG_CGROUP_DEVICE=y
CONFIG_CPUSETS=y
CONFIG_CGROUP_CPUACCT=y
-CONFIG_RESOURCE_COUNTERS=y
CONFIG_MEMCG=y
CONFIG_NAMESPACES=y
CONFIG_SCHED_AUTOGROUP=y
@@ -111,7 +110,7 @@ CONFIG_ROOT_NFS=y
CONFIG_SUNRPC_DEBUG=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DYNAMIC_DEBUG=y
CONFIG_DEBUG_INFO=y
CONFIG_MAGIC_SYSRQ=y
diff --git a/arch/xtensa/configs/nommu_kc705_defconfig b/arch/xtensa/configs/nommu_kc705_defconfig
index 337d5ba2d285..072733684308 100644
--- a/arch/xtensa/configs/nommu_kc705_defconfig
+++ b/arch/xtensa/configs/nommu_kc705_defconfig
@@ -107,7 +107,7 @@ CONFIG_ROOT_NFS=y
CONFIG_SUNRPC_DEBUG=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DYNAMIC_DEBUG=y
CONFIG_DEBUG_INFO=y
# CONFIG_FRAME_POINTER is not set
diff --git a/arch/xtensa/configs/smp_lx200_defconfig b/arch/xtensa/configs/smp_lx200_defconfig
index 22eeacba37cc..efbbaf94add1 100644
--- a/arch/xtensa/configs/smp_lx200_defconfig
+++ b/arch/xtensa/configs/smp_lx200_defconfig
@@ -11,7 +11,6 @@ CONFIG_CGROUP_FREEZER=y
CONFIG_CGROUP_DEVICE=y
CONFIG_CPUSETS=y
CONFIG_CGROUP_CPUACCT=y
-CONFIG_RESOURCE_COUNTERS=y
CONFIG_MEMCG=y
CONFIG_NAMESPACES=y
CONFIG_SCHED_AUTOGROUP=y
@@ -115,7 +114,7 @@ CONFIG_ROOT_NFS=y
CONFIG_SUNRPC_DEBUG=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_DYNAMIC_DEBUG=y
CONFIG_DEBUG_INFO=y
CONFIG_MAGIC_SYSRQ=y
diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c
index 5bbfed81c97b..e0ded48561db 100644
--- a/arch/xtensa/kernel/process.c
+++ b/arch/xtensa/kernel/process.c
@@ -115,10 +115,10 @@ void arch_cpu_idle(void)
/*
* This is called when the thread calls exit().
*/
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
{
#if XTENSA_HAVE_COPROCESSORS
- coprocessor_release_all(current_thread_info());
+ coprocessor_release_all(task_thread_info(tsk));
#endif
}
diff --git a/block/genhd.c b/block/genhd.c
index 9f42526b4d62..eb05dfc1dffe 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -877,7 +877,7 @@ static int show_partition(struct seq_file *seqf, void *v)
char buf[BDEVNAME_SIZE];
/* Don't show non-partitionable removable devices or empty devices */
- if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
+ if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) &&
(sgp->flags & GENHD_FL_REMOVABLE)))
return 0;
if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c
index e507cfbd044e..edcea70674c9 100644
--- a/block/partitions/ldm.c
+++ b/block/partitions/ldm.c
@@ -27,6 +27,8 @@
#include <linux/pagemap.h>
#include <linux/stringify.h>
#include <linux/kernel.h>
+#include <linux/uuid.h>
+
#include "ldm.h"
#include "check.h"
#include "msdos.h"
@@ -66,60 +68,6 @@ void _ldm_printk(const char *level, const char *function, const char *fmt, ...)
}
/**
- * ldm_parse_hexbyte - Convert a ASCII hex number to a byte
- * @src: Pointer to at least 2 characters to convert.
- *
- * Convert a two character ASCII hex string to a number.
- *
- * Return: 0-255 Success, the byte was parsed correctly
- * -1 Error, an invalid character was supplied
- */
-static int ldm_parse_hexbyte (const u8 *src)
-{
- unsigned int x; /* For correct wrapping */
- int h;
-
- /* high part */
- x = h = hex_to_bin(src[0]);
- if (h < 0)
- return -1;
-
- /* low part */
- h = hex_to_bin(src[1]);
- if (h < 0)
- return -1;
-
- return (x << 4) + h;
-}
-
-/**
- * ldm_parse_guid - Convert GUID from ASCII to binary
- * @src: 36 char string of the form fa50ff2b-f2e8-45de-83fa-65417f2f49ba
- * @dest: Memory block to hold binary GUID (16 bytes)
- *
- * N.B. The GUID need not be NULL terminated.
- *
- * Return: 'true' @dest contains binary GUID
- * 'false' @dest contents are undefined
- */
-static bool ldm_parse_guid (const u8 *src, u8 *dest)
-{
- static const int size[] = { 4, 2, 2, 2, 6 };
- int i, j, v;
-
- if (src[8] != '-' || src[13] != '-' ||
- src[18] != '-' || src[23] != '-')
- return false;
-
- for (j = 0; j < 5; j++, src++)
- for (i = 0; i < size[j]; i++, src+=2, *dest++ = v)
- if ((v = ldm_parse_hexbyte (src)) < 0)
- return false;
-
- return true;
-}
-
-/**
* ldm_parse_privhead - Read the LDM Database PRIVHEAD structure
* @data: Raw database PRIVHEAD structure loaded from the device
* @ph: In-memory privhead structure in which to return parsed information
@@ -167,7 +115,7 @@ static bool ldm_parse_privhead(const u8 *data, struct privhead *ph)
ldm_error("PRIVHEAD disk size doesn't match real disk size");
return false;
}
- if (!ldm_parse_guid(data + 0x0030, ph->disk_id)) {
+ if (uuid_be_to_bin(data + 0x0030, (uuid_be *)ph->disk_id)) {
ldm_error("PRIVHEAD contains an invalid GUID.");
return false;
}
@@ -944,7 +892,7 @@ static bool ldm_parse_dsk3 (const u8 *buffer, int buflen, struct vblk *vb)
disk = &vb->vblk.disk;
ldm_get_vstr (buffer + 0x18 + r_diskid, disk->alt_name,
sizeof (disk->alt_name));
- if (!ldm_parse_guid (buffer + 0x19 + r_name, disk->disk_id))
+ if (uuid_be_to_bin(buffer + 0x19 + r_name, (uuid_be *)disk->disk_id))
return false;
return true;
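ldm_parse_guid() and its hexbyte helper are replaced by uuid_be_to_bin() from <linux/uuid.h>; the wmi and efivarfs hunks later in this patch make the analogous switch to uuid_le_to_bin(). A userspace sketch of the big-endian-order parsing those helpers perform, simplified to a boolean result (the _le variant additionally byte-swaps the first three fields):

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>

static int hexval(char c)
{
	if (c >= '0' && c <= '9')
		return c - '0';
	c = tolower((unsigned char)c);
	if (c >= 'a' && c <= 'f')
		return c - 'a' + 10;
	return -1;
}

static bool parse_uuid_be(const char *s, unsigned char out[16])
{
	/* positions of the four '-' separators in the 36-char form */
	static const int dash[] = { 8, 13, 18, 23 };
	int si = 0, di = 0, d = 0;

	while (di < 16) {
		if (d < 4 && si == dash[d]) {
			if (s[si++] != '-')
				return false;
			d++;
			continue;
		}
		int hi = hexval(s[si]), lo = hexval(s[si + 1]);

		if (hi < 0 || lo < 0)
			return false;
		out[di++] = (hi << 4) | lo;
		si += 2;
	}
	return true;
}

int main(void)
{
	unsigned char guid[16];

	if (parse_uuid_be("fa50ff2b-f2e8-45de-83fa-65417f2f49ba", guid))
		printf("first byte: %02x\n", guid[0]);	/* fa */
	return 0;
}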
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index f46dba8b7092..dc75de9059cd 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -391,6 +391,7 @@ static ssize_t show_valid_zones(struct device *dev,
unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
struct page *first_page;
struct zone *zone;
+ int zone_shift = 0;
start_pfn = section_nr_to_pfn(mem->start_section_nr);
end_pfn = start_pfn + nr_pages;
@@ -402,21 +403,26 @@ static ssize_t show_valid_zones(struct device *dev,
zone = page_zone(first_page);
- if (zone_idx(zone) == ZONE_MOVABLE - 1) {
- /*The mem block is the last memoryblock of this zone.*/
- if (end_pfn == zone_end_pfn(zone))
- return sprintf(buf, "%s %s\n",
- zone->name, (zone + 1)->name);
+ /* MMOP_ONLINE_KEEP */
+ sprintf(buf, "%s", zone->name);
+
+ /* MMOP_ONLINE_KERNEL */
+ zone_shift = zone_can_shift(start_pfn, nr_pages, ZONE_NORMAL);
+ if (zone_shift) {
+ strcat(buf, " ");
+ strcat(buf, (zone + zone_shift)->name);
}
- if (zone_idx(zone) == ZONE_MOVABLE) {
- /*The mem block is the first memoryblock of ZONE_MOVABLE.*/
- if (start_pfn == zone->zone_start_pfn)
- return sprintf(buf, "%s %s\n",
- zone->name, (zone - 1)->name);
+ /* MMOP_ONLINE_MOVABLE */
+ zone_shift = zone_can_shift(start_pfn, nr_pages, ZONE_MOVABLE);
+ if (zone_shift) {
+ strcat(buf, " ");
+ strcat(buf, (zone + zone_shift)->name);
}
- return sprintf(buf, "%s\n", zone->name);
+ strcat(buf, "\n");
+
+ return strlen(buf);
}
static DEVICE_ATTR(valid_zones, 0444, show_valid_zones, NULL);
#endif
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 437b3a822f44..d597e432e195 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -861,7 +861,7 @@ rqbiocnt(struct request *r)
* discussion.
*
* We cannot use get_page in the workaround, because it insists on a
- * positive page count as a precondition. So we use _count directly.
+ * positive page count as a precondition. So we use _refcount directly.
*/
static void
bio_pageinc(struct bio *bio)
diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
index 3ef42e563bb5..b51a816d766b 100644
--- a/drivers/block/zram/zcomp.c
+++ b/drivers/block/zram/zcomp.c
@@ -13,6 +13,7 @@
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/sched.h>
+#include <linux/cpu.h>
#include "zcomp.h"
#include "zcomp_lzo.h"
@@ -20,29 +21,6 @@
#include "zcomp_lz4.h"
#endif
-/*
- * single zcomp_strm backend
- */
-struct zcomp_strm_single {
- struct mutex strm_lock;
- struct zcomp_strm *zstrm;
-};
-
-/*
- * multi zcomp_strm backend
- */
-struct zcomp_strm_multi {
- /* protect strm list */
- spinlock_t strm_lock;
- /* max possible number of zstrm streams */
- int max_strm;
- /* number of available zstrm streams */
- int avail_strm;
- /* list of available strms */
- struct list_head idle_strm;
- wait_queue_head_t strm_wait;
-};
-
static struct zcomp_backend *backends[] = {
&zcomp_lzo,
#ifdef CONFIG_ZRAM_LZ4_COMPRESS
@@ -93,188 +71,6 @@ static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp, gfp_t flags)
return zstrm;
}
-/*
- * get idle zcomp_strm or wait until other process release
- * (zcomp_strm_release()) one for us
- */
-static struct zcomp_strm *zcomp_strm_multi_find(struct zcomp *comp)
-{
- struct zcomp_strm_multi *zs = comp->stream;
- struct zcomp_strm *zstrm;
-
- while (1) {
- spin_lock(&zs->strm_lock);
- if (!list_empty(&zs->idle_strm)) {
- zstrm = list_entry(zs->idle_strm.next,
- struct zcomp_strm, list);
- list_del(&zstrm->list);
- spin_unlock(&zs->strm_lock);
- return zstrm;
- }
- /* zstrm streams limit reached, wait for idle stream */
- if (zs->avail_strm >= zs->max_strm) {
- spin_unlock(&zs->strm_lock);
- wait_event(zs->strm_wait, !list_empty(&zs->idle_strm));
- continue;
- }
- /* allocate new zstrm stream */
- zs->avail_strm++;
- spin_unlock(&zs->strm_lock);
- /*
- * This function can be called in swapout/fs write path
- * so we can't use GFP_FS|IO. And it assumes we already
- * have at least one stream in zram initialization so we
- * don't do best effort to allocate more stream in here.
- * A default stream will work well without further multiple
- * streams. That's why we use NORETRY | NOWARN.
- */
- zstrm = zcomp_strm_alloc(comp, GFP_NOIO | __GFP_NORETRY |
- __GFP_NOWARN);
- if (!zstrm) {
- spin_lock(&zs->strm_lock);
- zs->avail_strm--;
- spin_unlock(&zs->strm_lock);
- wait_event(zs->strm_wait, !list_empty(&zs->idle_strm));
- continue;
- }
- break;
- }
- return zstrm;
-}
-
-/* add stream back to idle list and wake up waiter or free the stream */
-static void zcomp_strm_multi_release(struct zcomp *comp, struct zcomp_strm *zstrm)
-{
- struct zcomp_strm_multi *zs = comp->stream;
-
- spin_lock(&zs->strm_lock);
- if (zs->avail_strm <= zs->max_strm) {
- list_add(&zstrm->list, &zs->idle_strm);
- spin_unlock(&zs->strm_lock);
- wake_up(&zs->strm_wait);
- return;
- }
-
- zs->avail_strm--;
- spin_unlock(&zs->strm_lock);
- zcomp_strm_free(comp, zstrm);
-}
-
-/* change max_strm limit */
-static bool zcomp_strm_multi_set_max_streams(struct zcomp *comp, int num_strm)
-{
- struct zcomp_strm_multi *zs = comp->stream;
- struct zcomp_strm *zstrm;
-
- spin_lock(&zs->strm_lock);
- zs->max_strm = num_strm;
- /*
- * if user has lowered the limit and there are idle streams,
- * immediately free as much streams (and memory) as we can.
- */
- while (zs->avail_strm > num_strm && !list_empty(&zs->idle_strm)) {
- zstrm = list_entry(zs->idle_strm.next,
- struct zcomp_strm, list);
- list_del(&zstrm->list);
- zcomp_strm_free(comp, zstrm);
- zs->avail_strm--;
- }
- spin_unlock(&zs->strm_lock);
- return true;
-}
-
-static void zcomp_strm_multi_destroy(struct zcomp *comp)
-{
- struct zcomp_strm_multi *zs = comp->stream;
- struct zcomp_strm *zstrm;
-
- while (!list_empty(&zs->idle_strm)) {
- zstrm = list_entry(zs->idle_strm.next,
- struct zcomp_strm, list);
- list_del(&zstrm->list);
- zcomp_strm_free(comp, zstrm);
- }
- kfree(zs);
-}
-
-static int zcomp_strm_multi_create(struct zcomp *comp, int max_strm)
-{
- struct zcomp_strm *zstrm;
- struct zcomp_strm_multi *zs;
-
- comp->destroy = zcomp_strm_multi_destroy;
- comp->strm_find = zcomp_strm_multi_find;
- comp->strm_release = zcomp_strm_multi_release;
- comp->set_max_streams = zcomp_strm_multi_set_max_streams;
- zs = kmalloc(sizeof(struct zcomp_strm_multi), GFP_KERNEL);
- if (!zs)
- return -ENOMEM;
-
- comp->stream = zs;
- spin_lock_init(&zs->strm_lock);
- INIT_LIST_HEAD(&zs->idle_strm);
- init_waitqueue_head(&zs->strm_wait);
- zs->max_strm = max_strm;
- zs->avail_strm = 1;
-
- zstrm = zcomp_strm_alloc(comp, GFP_KERNEL);
- if (!zstrm) {
- kfree(zs);
- return -ENOMEM;
- }
- list_add(&zstrm->list, &zs->idle_strm);
- return 0;
-}
-
-static struct zcomp_strm *zcomp_strm_single_find(struct zcomp *comp)
-{
- struct zcomp_strm_single *zs = comp->stream;
- mutex_lock(&zs->strm_lock);
- return zs->zstrm;
-}
-
-static void zcomp_strm_single_release(struct zcomp *comp,
- struct zcomp_strm *zstrm)
-{
- struct zcomp_strm_single *zs = comp->stream;
- mutex_unlock(&zs->strm_lock);
-}
-
-static bool zcomp_strm_single_set_max_streams(struct zcomp *comp, int num_strm)
-{
- /* zcomp_strm_single support only max_comp_streams == 1 */
- return false;
-}
-
-static void zcomp_strm_single_destroy(struct zcomp *comp)
-{
- struct zcomp_strm_single *zs = comp->stream;
- zcomp_strm_free(comp, zs->zstrm);
- kfree(zs);
-}
-
-static int zcomp_strm_single_create(struct zcomp *comp)
-{
- struct zcomp_strm_single *zs;
-
- comp->destroy = zcomp_strm_single_destroy;
- comp->strm_find = zcomp_strm_single_find;
- comp->strm_release = zcomp_strm_single_release;
- comp->set_max_streams = zcomp_strm_single_set_max_streams;
- zs = kmalloc(sizeof(struct zcomp_strm_single), GFP_KERNEL);
- if (!zs)
- return -ENOMEM;
-
- comp->stream = zs;
- mutex_init(&zs->strm_lock);
- zs->zstrm = zcomp_strm_alloc(comp, GFP_KERNEL);
- if (!zs->zstrm) {
- kfree(zs);
- return -ENOMEM;
- }
- return 0;
-}
-
/* show available compressors */
ssize_t zcomp_available_show(const char *comp, char *buf)
{
@@ -299,19 +95,14 @@ bool zcomp_available_algorithm(const char *comp)
return find_backend(comp) != NULL;
}
-bool zcomp_set_max_streams(struct zcomp *comp, int num_strm)
-{
- return comp->set_max_streams(comp, num_strm);
-}
-
struct zcomp_strm *zcomp_strm_find(struct zcomp *comp)
{
- return comp->strm_find(comp);
+ return *get_cpu_ptr(comp->stream);
}
void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm)
{
- comp->strm_release(comp, zstrm);
+ put_cpu_ptr(comp->stream);
}
int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm,
@@ -327,9 +118,83 @@ int zcomp_decompress(struct zcomp *comp, const unsigned char *src,
return comp->backend->decompress(src, src_len, dst);
}
+static int __zcomp_cpu_notifier(struct zcomp *comp,
+ unsigned long action, unsigned long cpu)
+{
+ struct zcomp_strm *zstrm;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ if (WARN_ON(*per_cpu_ptr(comp->stream, cpu)))
+ break;
+ zstrm = zcomp_strm_alloc(comp, GFP_KERNEL);
+ if (IS_ERR_OR_NULL(zstrm)) {
+ pr_err("Can't allocate a compression stream\n");
+ return NOTIFY_BAD;
+ }
+ *per_cpu_ptr(comp->stream, cpu) = zstrm;
+ break;
+ case CPU_DEAD:
+ case CPU_UP_CANCELED:
+ zstrm = *per_cpu_ptr(comp->stream, cpu);
+ if (!IS_ERR_OR_NULL(zstrm))
+ zcomp_strm_free(comp, zstrm);
+ *per_cpu_ptr(comp->stream, cpu) = NULL;
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static int zcomp_cpu_notifier(struct notifier_block *nb,
+ unsigned long action, void *pcpu)
+{
+ unsigned long cpu = (unsigned long)pcpu;
+ struct zcomp *comp = container_of(nb, typeof(*comp), notifier);
+
+ return __zcomp_cpu_notifier(comp, action, cpu);
+}
+
+static int zcomp_init(struct zcomp *comp)
+{
+ unsigned long cpu;
+ int ret;
+
+ comp->notifier.notifier_call = zcomp_cpu_notifier;
+
+ comp->stream = alloc_percpu(struct zcomp_strm *);
+ if (!comp->stream)
+ return -ENOMEM;
+
+ cpu_notifier_register_begin();
+ for_each_online_cpu(cpu) {
+ ret = __zcomp_cpu_notifier(comp, CPU_UP_PREPARE, cpu);
+ if (ret == NOTIFY_BAD)
+ goto cleanup;
+ }
+ __register_cpu_notifier(&comp->notifier);
+ cpu_notifier_register_done();
+ return 0;
+
+cleanup:
+ for_each_online_cpu(cpu)
+ __zcomp_cpu_notifier(comp, CPU_UP_CANCELED, cpu);
+ cpu_notifier_register_done();
+ return -ENOMEM;
+}
+
void zcomp_destroy(struct zcomp *comp)
{
- comp->destroy(comp);
+ unsigned long cpu;
+
+ cpu_notifier_register_begin();
+ for_each_online_cpu(cpu)
+ __zcomp_cpu_notifier(comp, CPU_UP_CANCELED, cpu);
+ __unregister_cpu_notifier(&comp->notifier);
+ cpu_notifier_register_done();
+
+ free_percpu(comp->stream);
kfree(comp);
}
@@ -339,9 +204,9 @@ void zcomp_destroy(struct zcomp *comp)
* backend pointer or ERR_PTR if things went bad. ERR_PTR(-EINVAL)
* if requested algorithm is not supported, ERR_PTR(-ENOMEM) in
* case of allocation error, or any other error potentially
- * returned by functions zcomp_strm_{multi,single}_create.
+ * returned by zcomp_init().
*/
-struct zcomp *zcomp_create(const char *compress, int max_strm)
+struct zcomp *zcomp_create(const char *compress)
{
struct zcomp *comp;
struct zcomp_backend *backend;
@@ -356,10 +221,7 @@ struct zcomp *zcomp_create(const char *compress, int max_strm)
return ERR_PTR(-ENOMEM);
comp->backend = backend;
- if (max_strm > 1)
- error = zcomp_strm_multi_create(comp, max_strm);
- else
- error = zcomp_strm_single_create(comp);
+ error = zcomp_init(comp);
if (error) {
kfree(comp);
return ERR_PTR(error);
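The multi/single stream machinery is replaced by one stream per CPU: zcomp_init() allocates a stream for every online CPU and registers a hotplug notifier that allocates and frees streams as CPUs come and go, while zcomp_strm_find()/zcomp_strm_release() reduce to get_cpu_ptr()/put_cpu_ptr(). A userspace sketch of that lifecycle, with a plain array standing in for the per-cpu variable (names are illustrative, not kernel APIs):

#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 4

struct strm { void *buffer; };

static struct strm *per_cpu_strm[NR_CPUS];	/* stand-in for alloc_percpu() */

/* CPU_UP_PREPARE: give the incoming CPU its own stream. */
static int cpu_up_prepare(int cpu)
{
	per_cpu_strm[cpu] = calloc(1, sizeof(struct strm));
	return per_cpu_strm[cpu] ? 0 : -1;
}

/* CPU_DEAD / CPU_UP_CANCELED: reclaim the stream of a departing CPU. */
static void cpu_dead(int cpu)
{
	free(per_cpu_strm[cpu]);
	per_cpu_strm[cpu] = NULL;
}

/* zcomp_strm_find() analogue: with preemption disabled, the running CPU's
 * slot can be used without any locking. */
static struct strm *strm_find(int this_cpu)
{
	return per_cpu_strm[this_cpu];
}

int main(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		if (cpu_up_prepare(cpu))
			return 1;

	printf("cpu 2 uses stream %p\n", (void *)strm_find(2));

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		cpu_dead(cpu);
	return 0;
}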
diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h
index b7d2a4bcae54..ffd88cb747fe 100644
--- a/drivers/block/zram/zcomp.h
+++ b/drivers/block/zram/zcomp.h
@@ -10,8 +10,6 @@
#ifndef _ZCOMP_H_
#define _ZCOMP_H_
-#include <linux/mutex.h>
-
struct zcomp_strm {
/* compression/decompression buffer */
void *buffer;
@@ -21,8 +19,6 @@ struct zcomp_strm {
* working memory)
*/
void *private;
- /* used in multi stream backend, protected by backend strm_lock */
- struct list_head list;
};
/* static compression backend */
@@ -41,19 +37,15 @@ struct zcomp_backend {
/* dynamic per-device compression frontend */
struct zcomp {
- void *stream;
+ struct zcomp_strm * __percpu *stream;
struct zcomp_backend *backend;
-
- struct zcomp_strm *(*strm_find)(struct zcomp *comp);
- void (*strm_release)(struct zcomp *comp, struct zcomp_strm *zstrm);
- bool (*set_max_streams)(struct zcomp *comp, int num_strm);
- void (*destroy)(struct zcomp *comp);
+ struct notifier_block notifier;
};
ssize_t zcomp_available_show(const char *comp, char *buf);
bool zcomp_available_algorithm(const char *comp);
-struct zcomp *zcomp_create(const char *comp, int max_strm);
+struct zcomp *zcomp_create(const char *comp);
void zcomp_destroy(struct zcomp *comp);
struct zcomp_strm *zcomp_strm_find(struct zcomp *comp);
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 370c2f76016d..8fcfbebe79cd 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -304,46 +304,25 @@ static ssize_t mem_used_max_store(struct device *dev,
return len;
}
+/*
+ * We switched to per-cpu streams and this attr is not needed anymore.
+ * However, we will keep it around for some time, because:
+ * a) we may revert per-cpu streams in the future
+ * b) it's visible to user space and we need to follow our 2-year
+ * retirement rule; but we already have a number of 'soon to be
+ * altered' attrs, so max_comp_streams needs to wait for the next
+ * layoff cycle.
+ */
static ssize_t max_comp_streams_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- int val;
- struct zram *zram = dev_to_zram(dev);
-
- down_read(&zram->init_lock);
- val = zram->max_comp_streams;
- up_read(&zram->init_lock);
-
- return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+ return scnprintf(buf, PAGE_SIZE, "%d\n", num_online_cpus());
}
static ssize_t max_comp_streams_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len)
{
- int num;
- struct zram *zram = dev_to_zram(dev);
- int ret;
-
- ret = kstrtoint(buf, 0, &num);
- if (ret < 0)
- return ret;
- if (num < 1)
- return -EINVAL;
-
- down_write(&zram->init_lock);
- if (init_done(zram)) {
- if (!zcomp_set_max_streams(zram->comp, num)) {
- pr_info("Cannot change max compression streams\n");
- ret = -EINVAL;
- goto out;
- }
- }
-
- zram->max_comp_streams = num;
- ret = len;
-out:
- up_write(&zram->init_lock);
- return ret;
+ return len;
}
static ssize_t comp_algorithm_show(struct device *dev,
@@ -514,7 +493,7 @@ static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
goto out_error;
}
- meta->mem_pool = zs_create_pool(pool_name, GFP_NOIO | __GFP_HIGHMEM);
+ meta->mem_pool = zs_create_pool(pool_name);
if (!meta->mem_pool) {
pr_err("Error creating memory pool\n");
goto out_error;
@@ -650,7 +629,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
{
int ret = 0;
size_t clen;
- unsigned long handle;
+ unsigned long handle = 0;
struct page *page;
unsigned char *user_mem, *cmem, *src, *uncmem = NULL;
struct zram_meta *meta = zram->meta;
@@ -673,9 +652,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
goto out;
}
- zstrm = zcomp_strm_find(zram->comp);
+compress_again:
user_mem = kmap_atomic(page);
-
if (is_partial_io(bvec)) {
memcpy(uncmem + offset, user_mem + bvec->bv_offset,
bvec->bv_len);
@@ -699,6 +677,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
goto out;
}
+ zstrm = zcomp_strm_find(zram->comp);
ret = zcomp_compress(zram->comp, zstrm, uncmem, &clen);
if (!is_partial_io(bvec)) {
kunmap_atomic(user_mem);
@@ -710,6 +689,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
pr_err("Compression failed! err=%d\n", ret);
goto out;
}
+
src = zstrm->buffer;
if (unlikely(clen > max_zpage_size)) {
clen = PAGE_SIZE;
@@ -717,8 +697,33 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
src = uncmem;
}
- handle = zs_malloc(meta->mem_pool, clen);
+ /*
+ * handle allocation has 2 paths:
+ * a) fast path is executed with preemption disabled (for
+ * per-cpu streams) and has __GFP_DIRECT_RECLAIM bit clear,
+ * since we can't sleep;
+ * b) slow path enables preemption and attempts to allocate
+ * the page with __GFP_DIRECT_RECLAIM bit set. we have to
+ * put per-cpu compression stream and, thus, to re-do
+ * the compression once handle is allocated.
+ *
+ * if we have a 'non-null' handle here then we are coming
+ * from the slow path and handle has already been allocated.
+ */
+ if (!handle)
+ handle = zs_malloc(meta->mem_pool, clen,
+ __GFP_KSWAPD_RECLAIM |
+ __GFP_NOWARN |
+ __GFP_HIGHMEM);
if (!handle) {
+ zcomp_strm_release(zram->comp, zstrm);
+ zstrm = NULL;
+
+ handle = zs_malloc(meta->mem_pool, clen,
+ GFP_NOIO | __GFP_HIGHMEM);
+ if (handle)
+ goto compress_again;
+
pr_err("Error allocating memory for compressed page: %u, size=%zu\n",
index, clen);
ret = -ENOMEM;
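The comment above describes the new two-step allocation in the write path: a non-sleeping zs_malloc() is tried while the per-CPU stream is held; if it fails, the stream is released, a sleeping allocation is made, and control jumps back to compress_again, because the compressed data lived in the released stream's buffer. A userspace sketch of that control flow, with hypothetical stand-ins for the allocator and the stream calls:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-ins: an allocator that may fail when it is not
 * allowed to sleep, and per-CPU stream acquire/release. */
static void *alloc_atomic(size_t len)    { return (rand() % 2) ? malloc(len) : NULL; }
static void *alloc_may_sleep(size_t len) { return malloc(len); }
static void stream_get(void) { /* disables preemption in the kernel */ }
static void stream_put(void) { /* re-enables preemption */ }
static size_t compress(const char *src, char *dst) { (void)src; (void)dst; return 42; }

static void *write_page(const char *src)
{
	char dst[4096];
	size_t clen;
	void *handle = NULL;

compress_again:
	stream_get();
	clen = compress(src, dst);		/* result lives in the stream buffer */

	if (!handle)
		handle = alloc_atomic(clen);	/* fast path: cannot sleep */
	if (!handle) {
		stream_put();			/* drop the stream... */
		handle = alloc_may_sleep(clen);	/* ...then allocate, sleeping if needed */
		if (handle)
			goto compress_again;	/* redo with a fresh stream */
		return NULL;
	}
	/* copy dst into the handle's memory here */
	stream_put();
	return handle;
}

int main(void)
{
	void *h = write_page("payload");

	printf("%s\n", h ? "stored" : "out of memory");
	free(h);
	return 0;
}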
@@ -1009,7 +1014,6 @@ static void zram_reset_device(struct zram *zram)
/* Reset stats */
memset(&zram->stats, 0, sizeof(zram->stats));
zram->disksize = 0;
- zram->max_comp_streams = 1;
set_capacity(zram->disk, 0);
part_stat_set_all(&zram->disk->part0, 0);
@@ -1038,7 +1042,7 @@ static ssize_t disksize_store(struct device *dev,
if (!meta)
return -ENOMEM;
- comp = zcomp_create(zram->compressor, zram->max_comp_streams);
+ comp = zcomp_create(zram->compressor);
if (IS_ERR(comp)) {
pr_err("Cannot initialise %s compressing backend\n",
zram->compressor);
@@ -1273,7 +1277,6 @@ static int zram_add(void)
}
strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor));
zram->meta = NULL;
- zram->max_comp_streams = 1;
pr_info("Added device: %s\n", zram->disk->disk_name);
return device_id;
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 8e92339686d7..06b1636f4722 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -102,7 +102,6 @@ struct zram {
* the number of pages zram can consume for storing compressed data
*/
unsigned long limit_pages;
- int max_comp_streams;
struct zram_stats stats;
atomic_t refcount; /* refcount for zram_meta */
diff --git a/drivers/char/random.c b/drivers/char/random.c
index b583e5336630..0158d3bff7e5 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -260,6 +260,7 @@
#include <linux/irq.h>
#include <linux/syscalls.h>
#include <linux/completion.h>
+#include <linux/uuid.h>
#include <asm/processor.h>
#include <asm/uaccess.h>
@@ -1621,26 +1622,6 @@ SYSCALL_DEFINE3(getrandom, char __user *, buf, size_t, count,
return urandom_read(NULL, buf, count, NULL);
}
-/***************************************************************
- * Random UUID interface
- *
- * Used here for a Boot ID, but can be useful for other kernel
- * drivers.
- ***************************************************************/
-
-/*
- * Generate random UUID
- */
-void generate_random_uuid(unsigned char uuid_out[16])
-{
- get_random_bytes(uuid_out, 16);
- /* Set UUID version to 4 --- truly random generation */
- uuid_out[6] = (uuid_out[6] & 0x0F) | 0x40;
- /* Set the UUID variant to DCE */
- uuid_out[8] = (uuid_out[8] & 0x3F) | 0x80;
-}
-EXPORT_SYMBOL(generate_random_uuid);
-
/********************************************************************
*
* Sysctl interface
diff --git a/drivers/hwspinlock/hwspinlock_core.c b/drivers/hwspinlock/hwspinlock_core.c
index d50c701b19d6..4074441444fe 100644
--- a/drivers/hwspinlock/hwspinlock_core.c
+++ b/drivers/hwspinlock/hwspinlock_core.c
@@ -313,7 +313,7 @@ int of_hwspin_lock_get_id(struct device_node *np, int index)
hwlock = radix_tree_deref_slot(slot);
if (unlikely(!hwlock))
continue;
- if (radix_tree_is_indirect_ptr(hwlock)) {
+ if (radix_tree_deref_retry(hwlock)) {
slot = radix_tree_iter_retry(&iter);
continue;
}
diff --git a/drivers/hwtracing/intel_th/msu.c b/drivers/hwtracing/intel_th/msu.c
index 0974090abc7d..e8d55a153a65 100644
--- a/drivers/hwtracing/intel_th/msu.c
+++ b/drivers/hwtracing/intel_th/msu.c
@@ -1172,7 +1172,7 @@ static void msc_mmap_close(struct vm_area_struct *vma)
if (!atomic_dec_and_mutex_lock(&msc->mmap_count, &msc->buf_mutex))
return;
- /* drop page _counts */
+ /* drop page _refcounts */
for (pg = 0; pg < msc->nr_pages; pg++) {
struct page *page = msc_buffer_get_page(msc, pg);
diff --git a/drivers/media/usb/dvb-usb/dib0700_core.c b/drivers/media/usb/dvb-usb/dib0700_core.c
index c16f999b9d7c..bf890c3d9cda 100644
--- a/drivers/media/usb/dvb-usb/dib0700_core.c
+++ b/drivers/media/usb/dvb-usb/dib0700_core.c
@@ -517,7 +517,7 @@ int dib0700_download_firmware(struct usb_device *udev, const struct firmware *fw
if (nb_packet_buffer_size < 1)
nb_packet_buffer_size = 1;
- /* get the fimware version */
+ /* get the firmware version */
usb_control_msg(udev, usb_rcvctrlpipe(udev, 0),
REQUEST_GET_VERSION,
USB_TYPE_VENDOR | USB_DIR_IN, 0, 0,
diff --git a/drivers/memstick/host/rtsx_usb_ms.c b/drivers/memstick/host/rtsx_usb_ms.c
index 1105db2355d2..d34bc3530385 100644
--- a/drivers/memstick/host/rtsx_usb_ms.c
+++ b/drivers/memstick/host/rtsx_usb_ms.c
@@ -706,7 +706,7 @@ poll_again:
if (host->eject)
break;
- msleep(1000);
+ schedule_timeout_idle(HZ);
}
complete(&host->detect_ms_exit);
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
index 06b819db51b1..0ff8e60deccb 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
@@ -23,7 +23,7 @@ static void nicvf_get_page(struct nicvf *nic)
if (!nic->rb_pageref || !nic->rb_page)
return;
- atomic_add(nic->rb_pageref, &nic->rb_page->_count);
+ page_ref_add(nic->rb_page, nic->rb_pageref);
nic->rb_pageref = 0;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index f3456798c596..74a243d5a593 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -433,8 +433,8 @@ static int mlx5e_alloc_rx_fragmented_mpwqe(struct mlx5e_rq *rq,
for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) {
if (unlikely(mlx5e_alloc_and_map_page(rq, wi, i)))
goto err_unmap;
- atomic_add(mlx5e_mpwqe_strides_per_page(rq),
- &wi->umr.dma_info[i].page->_count);
+ page_ref_add(wi->umr.dma_info[i].page,
+ mlx5e_mpwqe_strides_per_page(rq));
wi->skbs_frags[i] = 0;
}
@@ -452,8 +452,8 @@ err_unmap:
while (--i >= 0) {
dma_unmap_page(rq->pdev, wi->umr.dma_info[i].addr, PAGE_SIZE,
PCI_DMA_FROMDEVICE);
- atomic_sub(mlx5e_mpwqe_strides_per_page(rq),
- &wi->umr.dma_info[i].page->_count);
+ page_ref_sub(wi->umr.dma_info[i].page,
+ mlx5e_mpwqe_strides_per_page(rq));
put_page(wi->umr.dma_info[i].page);
}
dma_unmap_single(rq->pdev, wi->umr.mtt_addr, mtt_sz, PCI_DMA_TODEVICE);
@@ -477,8 +477,8 @@ void mlx5e_free_rx_fragmented_mpwqe(struct mlx5e_rq *rq,
for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) {
dma_unmap_page(rq->pdev, wi->umr.dma_info[i].addr, PAGE_SIZE,
PCI_DMA_FROMDEVICE);
- atomic_sub(mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i],
- &wi->umr.dma_info[i].page->_count);
+ page_ref_sub(wi->umr.dma_info[i].page,
+ mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i]);
put_page(wi->umr.dma_info[i].page);
}
dma_unmap_single(rq->pdev, wi->umr.mtt_addr, mtt_sz, PCI_DMA_TODEVICE);
@@ -527,8 +527,8 @@ static int mlx5e_alloc_rx_linear_mpwqe(struct mlx5e_rq *rq,
*/
split_page(wi->dma_info.page, MLX5_MPWRQ_WQE_PAGE_ORDER);
for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) {
- atomic_add(mlx5e_mpwqe_strides_per_page(rq),
- &wi->dma_info.page[i]._count);
+ page_ref_add(&wi->dma_info.page[i],
+ mlx5e_mpwqe_strides_per_page(rq));
wi->skbs_frags[i] = 0;
}
@@ -551,8 +551,8 @@ void mlx5e_free_rx_linear_mpwqe(struct mlx5e_rq *rq,
dma_unmap_page(rq->pdev, wi->dma_info.addr, rq->wqe_sz,
PCI_DMA_FROMDEVICE);
for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) {
- atomic_sub(mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i],
- &wi->dma_info.page[i]._count);
+ page_ref_sub(&wi->dma_info.page[i],
+ mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i]);
put_page(&wi->dma_info.page[i]);
}
}
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 8114541f327c..73dd525fbf08 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -920,7 +920,7 @@ static inline int qede_realloc_rx_buffer(struct qede_dev *edev,
* network stack to take the ownership of the page
* which can be recycled multiple times by the driver.
*/
- atomic_inc(&curr_cons->data->_count);
+ page_ref_inc(curr_cons->data);
qede_reuse_page(edev, rxq, curr_cons);
}
@@ -1036,7 +1036,7 @@ static int qede_fill_frag_skb(struct qede_dev *edev,
/* Incr page ref count to reuse on allocation failure
* so that it doesn't get freed while freeing SKB.
*/
- atomic_inc(&current_bd->data->_count);
+ page_ref_inc(current_bd->data);
goto out;
}
@@ -1487,7 +1487,7 @@ static int qede_rx_int(struct qede_fastpath *fp, int budget)
* freeing SKB.
*/
- atomic_inc(&sw_rx_data->data->_count);
+ page_ref_inc(sw_rx_data->data);
rxq->rx_alloc_errors++;
qede_recycle_rx_bd_ring(rxq, edev,
fp_cqe->bd_num);
diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c
index eb391a281833..ceeb8c188ef3 100644
--- a/drivers/platform/x86/wmi.c
+++ b/drivers/platform/x86/wmi.c
@@ -37,6 +37,7 @@
#include <linux/acpi.h>
#include <linux/slab.h>
#include <linux/module.h>
+#include <linux/uuid.h>
ACPI_MODULE_NAME("wmi");
MODULE_AUTHOR("Carlos Corbacho");
@@ -115,100 +116,21 @@ static struct acpi_driver acpi_wmi_driver = {
* GUID parsing functions
*/
-/**
- * wmi_parse_hexbyte - Convert a ASCII hex number to a byte
- * @src: Pointer to at least 2 characters to convert.
- *
- * Convert a two character ASCII hex string to a number.
- *
- * Return: 0-255 Success, the byte was parsed correctly
- * -1 Error, an invalid character was supplied
- */
-static int wmi_parse_hexbyte(const u8 *src)
-{
- int h;
- int value;
-
- /* high part */
- h = value = hex_to_bin(src[0]);
- if (value < 0)
- return -1;
-
- /* low part */
- value = hex_to_bin(src[1]);
- if (value >= 0)
- return (h << 4) | value;
- return -1;
-}
-
-/**
- * wmi_swap_bytes - Rearrange GUID bytes to match GUID binary
- * @src: Memory block holding binary GUID (16 bytes)
- * @dest: Memory block to hold byte swapped binary GUID (16 bytes)
- *
- * Byte swap a binary GUID to match it's real GUID value
- */
-static void wmi_swap_bytes(u8 *src, u8 *dest)
-{
- int i;
-
- for (i = 0; i <= 3; i++)
- memcpy(dest + i, src + (3 - i), 1);
-
- for (i = 0; i <= 1; i++)
- memcpy(dest + 4 + i, src + (5 - i), 1);
-
- for (i = 0; i <= 1; i++)
- memcpy(dest + 6 + i, src + (7 - i), 1);
-
- memcpy(dest + 8, src + 8, 8);
-}
-
-/**
- * wmi_parse_guid - Convert GUID from ASCII to binary
- * @src: 36 char string of the form fa50ff2b-f2e8-45de-83fa-65417f2f49ba
- * @dest: Memory block to hold binary GUID (16 bytes)
- *
- * N.B. The GUID need not be NULL terminated.
- *
- * Return: 'true' @dest contains binary GUID
- * 'false' @dest contents are undefined
- */
-static bool wmi_parse_guid(const u8 *src, u8 *dest)
-{
- static const int size[] = { 4, 2, 2, 2, 6 };
- int i, j, v;
-
- if (src[8] != '-' || src[13] != '-' ||
- src[18] != '-' || src[23] != '-')
- return false;
-
- for (j = 0; j < 5; j++, src++) {
- for (i = 0; i < size[j]; i++, src += 2, *dest++ = v) {
- v = wmi_parse_hexbyte(src);
- if (v < 0)
- return false;
- }
- }
-
- return true;
-}
-
static bool find_guid(const char *guid_string, struct wmi_block **out)
{
- char tmp[16], guid_input[16];
+ uuid_le guid_input;
struct wmi_block *wblock;
struct guid_block *block;
struct list_head *p;
- wmi_parse_guid(guid_string, tmp);
- wmi_swap_bytes(tmp, guid_input);
+ if (uuid_le_to_bin(guid_string, &guid_input))
+ return false;
list_for_each(p, &wmi_block_list) {
wblock = list_entry(p, struct wmi_block, list);
block = &wblock->gblock;
- if (memcmp(block->guid, guid_input, 16) == 0) {
+ if (memcmp(block->guid, &guid_input, 16) == 0) {
if (out)
*out = wblock;
return true;
@@ -498,20 +420,20 @@ wmi_notify_handler handler, void *data)
{
struct wmi_block *block;
acpi_status status = AE_NOT_EXIST;
- char tmp[16], guid_input[16];
+ uuid_le guid_input;
struct list_head *p;
if (!guid || !handler)
return AE_BAD_PARAMETER;
- wmi_parse_guid(guid, tmp);
- wmi_swap_bytes(tmp, guid_input);
+ if (uuid_le_to_bin(guid, &guid_input))
+ return AE_BAD_PARAMETER;
list_for_each(p, &wmi_block_list) {
acpi_status wmi_status;
block = list_entry(p, struct wmi_block, list);
- if (memcmp(block->gblock.guid, guid_input, 16) == 0) {
+ if (memcmp(block->gblock.guid, &guid_input, 16) == 0) {
if (block->handler &&
block->handler != wmi_notify_debug)
return AE_ALREADY_ACQUIRED;
@@ -539,20 +461,20 @@ acpi_status wmi_remove_notify_handler(const char *guid)
{
struct wmi_block *block;
acpi_status status = AE_NOT_EXIST;
- char tmp[16], guid_input[16];
+ uuid_le guid_input;
struct list_head *p;
if (!guid)
return AE_BAD_PARAMETER;
- wmi_parse_guid(guid, tmp);
- wmi_swap_bytes(tmp, guid_input);
+ if (uuid_le_to_bin(guid, &guid_input))
+ return AE_BAD_PARAMETER;
list_for_each(p, &wmi_block_list) {
acpi_status wmi_status;
block = list_entry(p, struct wmi_block, list);
- if (memcmp(block->gblock.guid, guid_input, 16) == 0) {
+ if (memcmp(block->gblock.guid, &guid_input, 16) == 0) {
if (!block->handler ||
block->handler == wmi_notify_debug)
return AE_NULL_ENTRY;
diff --git a/drivers/scsi/bfa/bfi.h b/drivers/scsi/bfa/bfi.h
index 97600dcec649..5f698d038b21 100644
--- a/drivers/scsi/bfa/bfi.h
+++ b/drivers/scsi/bfa/bfi.h
@@ -356,7 +356,7 @@ struct bfi_ioc_image_hdr_s {
u8 port0_mode; /* device mode for port 0 */
u8 port1_mode; /* device mode for port 1 */
u32 exec; /* exec vector */
- u32 bootenv; /* fimware boot env */
+ u32 bootenv; /* firmware boot env */
u32 rsvd_b[2];
struct bfi_ioc_fwver_s fwver;
u32 md5sum[BFI_IOC_MD5SUM_SZ];
diff --git a/drivers/staging/comedi/drivers/daqboard2000.c b/drivers/staging/comedi/drivers/daqboard2000.c
index 57ab6680e3ae..a536a15c1d30 100644
--- a/drivers/staging/comedi/drivers/daqboard2000.c
+++ b/drivers/staging/comedi/drivers/daqboard2000.c
@@ -26,7 +26,7 @@
* Much of the functionality of this driver was determined from reading
* the source code for the Windows driver.
*
- * The FPGA on the board requires fimware, which is available from
+ * The FPGA on the board requires firmware, which is available from
* http://www.comedi.org in the comedi_nonfree_firmware tarball.
*
* Configuration options: not applicable, uses PCI auto config
diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c
index dde0b79f3948..af1bc19b7c85 100644
--- a/fs/befs/datastream.c
+++ b/fs/befs/datastream.c
@@ -48,7 +48,7 @@ struct buffer_head *
befs_read_datastream(struct super_block *sb, const befs_data_stream *ds,
befs_off_t pos, uint * off)
{
- struct buffer_head *bh = NULL;
+ struct buffer_head *bh;
befs_block_run run;
befs_blocknr_t block; /* block corresponding to pos */
@@ -127,7 +127,7 @@ befs_read_lsymlink(struct super_block *sb, const befs_data_stream *ds,
{
befs_off_t bytes_read = 0; /* bytes read */
u16 plen;
- struct buffer_head *bh = NULL;
+ struct buffer_head *bh;
befs_debug(sb, "---> %s length: %llu", __func__, len);
while (bytes_read < len) {
@@ -429,7 +429,7 @@ befs_find_brun_dblindirect(struct super_block *sb,
struct buffer_head *dbl_indir_block;
struct buffer_head *indir_block;
befs_block_run indir_run;
- befs_disk_inode_addr *iaddr_array = NULL;
+ befs_disk_inode_addr *iaddr_array;
struct befs_sb_info *befs_sb = BEFS_SB(sb);
befs_blocknr_t indir_start_blk =
@@ -488,7 +488,6 @@ befs_find_brun_dblindirect(struct super_block *sb,
iaddr_array = (befs_disk_inode_addr *) dbl_indir_block->b_data;
indir_run = fsrun_to_cpu(sb, iaddr_array[dbl_block_indx]);
brelse(dbl_indir_block);
- iaddr_array = NULL;
/* Read indirect block */
which_block = indir_indx / befs_iaddrs_per_block(sb);
@@ -513,7 +512,6 @@ befs_find_brun_dblindirect(struct super_block *sb,
iaddr_array = (befs_disk_inode_addr *) indir_block->b_data;
*run = fsrun_to_cpu(sb, iaddr_array[block_indx]);
brelse(indir_block);
- iaddr_array = NULL;
blockno_at_run_start = indir_start_blk;
blockno_at_run_start += diblklen * dblindir_indx;
diff --git a/fs/befs/io.c b/fs/befs/io.c
index 7a5b4ec21c56..523c8af2d770 100644
--- a/fs/befs/io.c
+++ b/fs/befs/io.c
@@ -26,7 +26,7 @@
struct buffer_head *
befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr)
{
- struct buffer_head *bh = NULL;
+ struct buffer_head *bh;
befs_blocknr_t block = 0;
struct befs_sb_info *befs_sb = BEFS_SB(sb);
@@ -63,7 +63,7 @@ befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr)
struct buffer_head *
befs_bread(struct super_block *sb, befs_blocknr_t block)
{
- struct buffer_head *bh = NULL;
+ struct buffer_head *bh;
befs_debug(sb, "---> Enter %s %lu", __func__, (unsigned long)block);
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 71112aa07d84..7da05b159ade 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -155,7 +155,7 @@ befs_get_block(struct inode *inode, sector_t block,
static struct dentry *
befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
- struct inode *inode = NULL;
+ struct inode *inode;
struct super_block *sb = dir->i_sb;
const befs_data_stream *ds = &BEFS_I(dir)->i_data.ds;
befs_off_t offset;
@@ -294,10 +294,10 @@ static void init_once(void *foo)
static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
{
- struct buffer_head *bh = NULL;
- befs_inode *raw_inode = NULL;
+ struct buffer_head *bh;
+ befs_inode *raw_inode;
struct befs_sb_info *befs_sb = BEFS_SB(sb);
- struct befs_inode_info *befs_ino = NULL;
+ struct befs_inode_info *befs_ino;
struct inode *inode;
long ret = -EIO;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index f831d3849c10..97f324642b5f 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1837,6 +1837,7 @@ const struct file_operations def_blk_fops = {
.write_iter = blkdev_write_iter,
.mmap = generic_file_mmap,
.fsync = blkdev_fsync,
+ .get_unmapped_area = dax_get_unmapped_area,
.unlocked_ioctl = block_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = compat_blkdev_ioctl,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e0290303bb35..2b88127bba5b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -20,13 +20,13 @@
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
-#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
+#include <linux/uuid.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
diff --git a/fs/buffer.c b/fs/buffer.c
index af0d9a82a8ed..754813a6962b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -255,17 +255,17 @@ out:
*/
static void free_more_memory(void)
{
- struct zone *zone;
+ struct zoneref *z;
int nid;
wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM);
yield();
for_each_online_node(nid) {
- (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
- gfp_zone(GFP_NOFS), NULL,
- &zone);
- if (zone)
+
+ z = first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
+ gfp_zone(GFP_NOFS), NULL);
+ if (z->zone)
try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
GFP_NOFS, NULL);
}
diff --git a/fs/dax.c b/fs/dax.c
index 5a282260d27e..1bd0786b603d 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1036,3 +1036,46 @@ int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
return dax_zero_page_range(inode, from, length, get_block);
}
EXPORT_SYMBOL_GPL(dax_truncate_page);
+
+/**
+ * dax_get_unmapped_area - handle get_unmapped_area for a DAX file
+ * @filp: The file being mmap'd, if not NULL
+ * @addr: The mmap address. If NULL, the kernel assigns the address
+ * @len: The mmap size in bytes
+ * @pgoff: The page offset in the file where the mapping starts from.
+ * @flags: The mmap flags
+ *
+ * This function can be called by a filesystem for get_unmapped_area().
+ * When a target file is a DAX file, it aligns the mmap address at the
+ * beginning of the file by the pmd size.
+ */
+unsigned long dax_get_unmapped_area(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+ unsigned long off, off_end, off_pmd, len_pmd, addr_pmd;
+
+ if (!IS_ENABLED(CONFIG_FS_DAX_PMD) ||
+ !filp || addr || !IS_DAX(filp->f_mapping->host))
+ goto out;
+
+ off = pgoff << PAGE_SHIFT;
+ off_end = off + len;
+ off_pmd = round_up(off, PMD_SIZE); /* pmd-aligned offset */
+
+ if ((off_end <= off_pmd) || ((off_end - off_pmd) < PMD_SIZE))
+ goto out;
+
+ len_pmd = len + PMD_SIZE;
+ if ((off + len_pmd) < off)
+ goto out;
+
+ addr_pmd = current->mm->get_unmapped_area(filp, addr, len_pmd,
+ pgoff, flags);
+ if (!IS_ERR_VALUE(addr_pmd)) {
+ addr_pmd += (off - addr_pmd) & (PMD_SIZE - 1);
+ return addr_pmd;
+ }
+out:
+ return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
+}
+EXPORT_SYMBOL_GPL(dax_get_unmapped_area);
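dax_get_unmapped_area() asks for PMD_SIZE of extra slack and then shifts the returned address so that the mapping address and the file offset share the same offset within a PMD, which is what makes PMD-sized DAX faults possible. The adjustment in isolation, as a userspace sketch (PMD_SIZE assumed to be 2 MiB, x86_64-style address):

#include <stdio.h>

#define PMD_SIZE (1UL << 21)	/* 2 MiB */

/* Same adjustment as dax_get_unmapped_area(): move the allocator's result
 * forward so that (addr - off) is PMD aligned, i.e. file PMD boundaries
 * coincide with virtual PMD boundaries. The caller requested PMD_SIZE of
 * extra length beforehand, so the shifted mapping still fits. */
static unsigned long pmd_align_hint(unsigned long addr, unsigned long off)
{
	return addr + ((off - addr) & (PMD_SIZE - 1));
}

int main(void)
{
	unsigned long addr = 0x7f1234500000UL;	/* hypothetical allocator result */
	unsigned long off  = 3 * PMD_SIZE + 0x1000;

	unsigned long aligned = pmd_align_hint(addr, off);

	printf("%#lx (offset within PMD: %#lx)\n",
	       aligned, aligned & (PMD_SIZE - 1));	/* same as off's 0x1000 */
	return 0;
}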
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
index e2ab6d0497f2..1d73fc6dba13 100644
--- a/fs/efivarfs/inode.c
+++ b/fs/efivarfs/inode.c
@@ -11,6 +11,7 @@
#include <linux/fs.h>
#include <linux/ctype.h>
#include <linux/slab.h>
+#include <linux/uuid.h>
#include "internal.h"
@@ -46,11 +47,7 @@ struct inode *efivarfs_get_inode(struct super_block *sb,
*/
bool efivarfs_valid_name(const char *str, int len)
{
- static const char dashes[EFI_VARIABLE_GUID_LEN] = {
- [8] = 1, [13] = 1, [18] = 1, [23] = 1
- };
const char *s = str + len - EFI_VARIABLE_GUID_LEN;
- int i;
/*
* We need a GUID, plus at least one letter for the variable name,
@@ -68,37 +65,7 @@ bool efivarfs_valid_name(const char *str, int len)
*
* 12345678-1234-1234-1234-123456789abc
*/
- for (i = 0; i < EFI_VARIABLE_GUID_LEN; i++) {
- if (dashes[i]) {
- if (*s++ != '-')
- return false;
- } else {
- if (!isxdigit(*s++))
- return false;
- }
- }
-
- return true;
-}
-
-static void efivarfs_hex_to_guid(const char *str, efi_guid_t *guid)
-{
- guid->b[0] = hex_to_bin(str[6]) << 4 | hex_to_bin(str[7]);
- guid->b[1] = hex_to_bin(str[4]) << 4 | hex_to_bin(str[5]);
- guid->b[2] = hex_to_bin(str[2]) << 4 | hex_to_bin(str[3]);
- guid->b[3] = hex_to_bin(str[0]) << 4 | hex_to_bin(str[1]);
- guid->b[4] = hex_to_bin(str[11]) << 4 | hex_to_bin(str[12]);
- guid->b[5] = hex_to_bin(str[9]) << 4 | hex_to_bin(str[10]);
- guid->b[6] = hex_to_bin(str[16]) << 4 | hex_to_bin(str[17]);
- guid->b[7] = hex_to_bin(str[14]) << 4 | hex_to_bin(str[15]);
- guid->b[8] = hex_to_bin(str[19]) << 4 | hex_to_bin(str[20]);
- guid->b[9] = hex_to_bin(str[21]) << 4 | hex_to_bin(str[22]);
- guid->b[10] = hex_to_bin(str[24]) << 4 | hex_to_bin(str[25]);
- guid->b[11] = hex_to_bin(str[26]) << 4 | hex_to_bin(str[27]);
- guid->b[12] = hex_to_bin(str[28]) << 4 | hex_to_bin(str[29]);
- guid->b[13] = hex_to_bin(str[30]) << 4 | hex_to_bin(str[31]);
- guid->b[14] = hex_to_bin(str[32]) << 4 | hex_to_bin(str[33]);
- guid->b[15] = hex_to_bin(str[34]) << 4 | hex_to_bin(str[35]);
+ return uuid_is_valid(s);
}
static int efivarfs_create(struct inode *dir, struct dentry *dentry,
@@ -119,8 +86,7 @@ static int efivarfs_create(struct inode *dir, struct dentry *dentry,
/* length of the variable name itself: remove GUID and separator */
namelen = dentry->d_name.len - EFI_VARIABLE_GUID_LEN - 1;
- efivarfs_hex_to_guid(dentry->d_name.name + namelen + 1,
- &var->var.VendorGuid);
+ uuid_le_to_bin(dentry->d_name.name + namelen + 1, &var->var.VendorGuid);
if (efivar_variable_is_removable(var->var.VendorGuid,
dentry->d_name.name, namelen))
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 8a74a2a52e0f..10db91218933 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1583,15 +1583,15 @@ static int ep_send_events(struct eventpoll *ep,
return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);
}
-static inline struct timespec ep_set_mstimeout(long ms)
+static inline struct timespec64 ep_set_mstimeout(long ms)
{
- struct timespec now, ts = {
+ struct timespec64 now, ts = {
.tv_sec = ms / MSEC_PER_SEC,
.tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
};
- ktime_get_ts(&now);
- return timespec_add_safe(now, ts);
+ ktime_get_ts64(&now);
+ return timespec64_add_safe(now, ts);
}
/**
@@ -1621,11 +1621,11 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
ktime_t expires, *to = NULL;
if (timeout > 0) {
- struct timespec end_time = ep_set_mstimeout(timeout);
+ struct timespec64 end_time = ep_set_mstimeout(timeout);
slack = select_estimate_accuracy(&end_time);
to = &expires;
- *to = timespec_to_ktime(end_time);
+ *to = timespec64_to_ktime(end_time);
} else if (timeout == 0) {
/*
* Avoid the unnecessary trip to the wait queue loop, if the
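
The epoll hunk is essentially a type conversion: the timeout is still split into whole seconds plus nanoseconds and added to the current monotonic time, but now in 64-bit timespec64 fields so a large timeout cannot wrap a 32-bit time_t, and timespec64_add_safe() saturates instead of overflowing. A self-contained sketch of that arithmetic is below; the ts64 struct and the saturation logic are local stand-ins, not the kernel definitions.

/*
 * Millisecond-timeout conversion in the spirit of ep_set_mstimeout(),
 * with a saturating add along the lines of timespec64_add_safe().
 * Local stand-ins only; not the kernel implementation.
 */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define MSEC_PER_SEC  1000L
#define NSEC_PER_MSEC 1000000L
#define NSEC_PER_SEC  1000000000L

struct ts64 { int64_t tv_sec; long tv_nsec; };

static struct ts64 ts64_add_safe(struct ts64 a, struct ts64 b)
{
	struct ts64 r;

	if (a.tv_sec > INT64_MAX - b.tv_sec - 1) {	/* would overflow: saturate */
		r.tv_sec = INT64_MAX;
		r.tv_nsec = NSEC_PER_SEC - 1;
		return r;
	}
	r.tv_sec = a.tv_sec + b.tv_sec;
	r.tv_nsec = a.tv_nsec + b.tv_nsec;
	if (r.tv_nsec >= NSEC_PER_SEC) {
		r.tv_sec++;
		r.tv_nsec -= NSEC_PER_SEC;
	}
	return r;
}

static struct ts64 set_mstimeout(long ms)
{
	struct timespec now;
	struct ts64 n, ts = { ms / MSEC_PER_SEC, NSEC_PER_MSEC * (ms % MSEC_PER_SEC) };

	clock_gettime(CLOCK_MONOTONIC, &now);
	n.tv_sec = now.tv_sec;
	n.tv_nsec = now.tv_nsec;
	return ts64_add_safe(n, ts);
}

int main(void)
{
	struct ts64 end = set_mstimeout(2500);		/* 2.5 s from now */

	printf("expires at %lld.%09ld\n", (long long)end.tv_sec, end.tv_nsec);
	return 0;
}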
diff --git a/fs/exec.c b/fs/exec.c
index fdecb7615587..2f44590d88a9 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1495,8 +1495,15 @@ int remove_arg_zero(struct linux_binprm *bprm)
kunmap_atomic(kaddr);
put_arg_page(page);
- if (offset == PAGE_SIZE)
+ if (offset == PAGE_SIZE) {
free_arg_page(bprm, (bprm->p >> PAGE_SHIFT) - 1);
+ } else if (offset == PAGE_SIZE - 1) {
+ /*
+ * The trailing '\0' is the last byte in a page - we're
+ * about to advance past that byte so free its page now
+ */
+ free_arg_page(bprm, (bprm->p >> PAGE_SHIFT));
+ }
} while (offset == PAGE_SIZE);
bprm->p++;
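
A small worked example of the two page indices used in that hunk may help: with 4 KiB pages, a byte index whose offset within its page is PAGE_SIZE - 1 still belongs to that page, so p >> PAGE_SHIFT names the page holding the terminating NUL, whereas once the offset has reached PAGE_SIZE the index has already crossed into the next page and (p >> PAGE_SHIFT) - 1 names the page that was just scanned. The toy program below (PAGE_SHIFT assumed to be 12, p standing in for bprm->p) just prints those indices for the two cases.

/*
 * Toy illustration of the page-index arithmetic in remove_arg_zero().
 * PAGE_SHIFT assumed 12 (4 KiB pages); p stands for bprm->p.
 */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
	unsigned long p;

	/* NUL is the last byte of page 5: offset == PAGE_SIZE - 1 */
	p = 5 * PAGE_SIZE + (PAGE_SIZE - 1);
	printf("offset %lu -> free page %lu\n",
	       p & (PAGE_SIZE - 1), p >> PAGE_SHIFT);

	/* string ran off the end of page 5: p already points into page 6 */
	p = 6 * PAGE_SIZE;
	printf("offset wrapped   -> free page %lu\n", (p >> PAGE_SHIFT) - 1);

	return 0;
}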
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 868c02317b05..cd117abd023a 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -172,6 +172,7 @@ const struct file_operations ext2_file_operations = {
.open = dquot_file_open,
.release = ext2_release_file,
.fsync = ext2_fsync,
+ .get_unmapped_area = dax_get_unmapped_area,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
};
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index df44c877892a..4f57272a6355 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -703,6 +703,7 @@ const struct file_operations ext4_file_operations = {
.open = ext4_file_open,
.release = ext4_release_file,
.fsync = ext4_sync_file,
+ .get_unmapped_area = dax_get_unmapped_area,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = ext4_fallocate,
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 8850254136ae..7002467bfbac 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -106,7 +106,10 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
}
if (!journal) {
- ret = generic_file_fsync(file, start, end, datasync);
+ if (test_opt(inode->i_sb, BARRIER))
+ ret = generic_file_fsync(file, start, end, datasync);
+ else
+ ret = __generic_file_fsync(file, start, end, datasync);
if (!ret && !hlist_empty(&inode->i_dentry))
ret = ext4_sync_parent(inode);
goto out;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 0acf8cacb2be..28cc412852af 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -13,8 +13,8 @@
#include <linux/compat.h>
#include <linux/mount.h>
#include <linux/file.h>
-#include <linux/random.h>
#include <linux/quotaops.h>
+#include <linux/uuid.h>
#include <asm/uaccess.h>
#include "ext4_jbd2.h"
#include "ext4.h"
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index b1442d13a928..f8436c2bbe16 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -20,7 +20,7 @@
#include <linux/uaccess.h>
#include <linux/mount.h>
#include <linux/pagevec.h>
-#include <linux/random.h>
+#include <linux/uuid.h>
#include "f2fs.h"
#include "node.h"
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 592cea54cea0..989a2cef6b76 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -931,7 +931,8 @@ void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
* This is WB_SYNC_NONE writeback, so if allocation fails just
* wakeup the thread for old dirty data writeback
*/
- work = kzalloc(sizeof(*work), GFP_ATOMIC);
+ work = kzalloc(sizeof(*work),
+ GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
if (!work) {
trace_writeback_nowork(wb);
wb_wakeup(wb);
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 2ccbf5531554..1a85d94f5b25 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -13,13 +13,8 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Original code was written by Koji Sato <koji@osrg.net>.
- * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>,
- * Amagai Yoshiji <amagai@osrg.net>.
+ * Originally written by Koji Sato.
+ * Two allocators were unified by Ryusuke Konishi and Amagai Yoshiji.
*/
#include <linux/types.h>
@@ -58,7 +53,7 @@ nilfs_palloc_groups_count(const struct inode *inode)
* @inode: inode of metadata file using this allocator
* @entry_size: size of the persistent object
*/
-int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size)
+int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned int entry_size)
{
struct nilfs_mdt_info *mi = NILFS_MDT(inode);
@@ -73,13 +68,17 @@ int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size)
mi->mi_blocks_per_group =
DIV_ROUND_UP(nilfs_palloc_entries_per_group(inode),
mi->mi_entries_per_block) + 1;
- /* Number of blocks in a group including entry blocks and
- a bitmap block */
+ /*
+ * Number of blocks in a group including entry blocks
+ * and a bitmap block
+ */
mi->mi_blocks_per_desc_block =
nilfs_palloc_groups_per_desc_block(inode) *
mi->mi_blocks_per_group + 1;
- /* Number of blocks per descriptor including the
- descriptor block */
+ /*
+ * Number of blocks per descriptor including the
+ * descriptor block
+ */
return 0;
}
@@ -389,7 +388,7 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
*/
static int nilfs_palloc_find_available_slot(unsigned char *bitmap,
unsigned long target,
- unsigned bsize,
+ unsigned int bsize,
spinlock_t *lock)
{
int pos, end = bsize;
@@ -624,7 +623,7 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap))
nilfs_warning(inode->i_sb, __func__,
- "entry number %llu already freed: ino=%lu\n",
+ "entry number %llu already freed: ino=%lu",
(unsigned long long)req->pr_entry_nr,
(unsigned long)inode->i_ino);
else
@@ -665,7 +664,7 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap))
nilfs_warning(inode->i_sb, __func__,
- "entry number %llu already freed: ino=%lu\n",
+ "entry number %llu already freed: ino=%lu",
(unsigned long long)req->pr_entry_nr,
(unsigned long)inode->i_ino);
else
@@ -740,8 +739,8 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
unsigned long group, group_offset;
__u64 group_min_nr, last_nrs[8];
const unsigned long epg = nilfs_palloc_entries_per_group(inode);
- const unsigned epb = NILFS_MDT(inode)->mi_entries_per_block;
- unsigned entry_start, end, pos;
+ const unsigned int epb = NILFS_MDT(inode)->mi_entries_per_block;
+ unsigned int entry_start, end, pos;
spinlock_t *lock;
int i, j, k, ret;
u32 nfree;
@@ -774,7 +773,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
if (!nilfs_clear_bit_atomic(lock, group_offset,
bitmap)) {
nilfs_warning(inode->i_sb, __func__,
- "entry number %llu already freed: ino=%lu\n",
+ "entry number %llu already freed: ino=%lu",
(unsigned long long)entry_nrs[j],
(unsigned long)inode->i_ino);
} else {
@@ -819,7 +818,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
last_nrs[k]);
if (ret && ret != -ENOENT) {
nilfs_warning(inode->i_sb, __func__,
- "failed to delete block of entry %llu: ino=%lu, err=%d\n",
+ "failed to delete block of entry %llu: ino=%lu, err=%d",
(unsigned long long)last_nrs[k],
(unsigned long)inode->i_ino, ret);
}
@@ -838,7 +837,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
ret = nilfs_palloc_delete_bitmap_block(inode, group);
if (ret && ret != -ENOENT) {
nilfs_warning(inode->i_sb, __func__,
- "failed to delete bitmap block of group %lu: ino=%lu, err=%d\n",
+ "failed to delete bitmap block of group %lu: ino=%lu, err=%d",
group,
(unsigned long)inode->i_ino, ret);
}
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index 6e6f49aa53df..05149e606a78 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -13,13 +13,8 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Original code was written by Koji Sato <koji@osrg.net>.
- * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>,
- * Amagai Yoshiji <amagai@osrg.net>.
+ * Originally written by Koji Sato.
+ * Two allocators were unified by Ryusuke Konishi and Amagai Yoshiji.
*/
#ifndef _NILFS_ALLOC_H
@@ -42,7 +37,7 @@ nilfs_palloc_entries_per_group(const struct inode *inode)
return 1UL << (inode->i_blkbits + 3 /* log2(8 = CHAR_BITS) */);
}
-int nilfs_palloc_init_blockgroup(struct inode *, unsigned);
+int nilfs_palloc_init_blockgroup(struct inode *, unsigned int);
int nilfs_palloc_get_entry_block(struct inode *, __u64, int,
struct buffer_head **);
void *nilfs_palloc_block_get_entry(const struct inode *, __u64,
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index a9fb3636c142..f2a7877e0c8c 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#include <linux/fs.h>
@@ -46,7 +42,7 @@ static int nilfs_bmap_convert_error(struct nilfs_bmap *bmap,
if (err == -EINVAL) {
nilfs_error(inode->i_sb, fname,
- "broken bmap (inode number=%lu)\n", inode->i_ino);
+ "broken bmap (inode number=%lu)", inode->i_ino);
err = -EIO;
}
return err;
@@ -97,7 +93,7 @@ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
}
int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp,
- unsigned maxblocks)
+ unsigned int maxblocks)
{
int ret;
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index bfa817ce40b3..b6a4c8f93ac8 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#ifndef _NILFS_BMAP_H
@@ -61,7 +57,7 @@ struct nilfs_bmap_stats {
struct nilfs_bmap_operations {
int (*bop_lookup)(const struct nilfs_bmap *, __u64, int, __u64 *);
int (*bop_lookup_contig)(const struct nilfs_bmap *, __u64, __u64 *,
- unsigned);
+ unsigned int);
int (*bop_insert)(struct nilfs_bmap *, __u64, __u64);
int (*bop_delete)(struct nilfs_bmap *, __u64);
void (*bop_clear)(struct nilfs_bmap *);
@@ -126,10 +122,14 @@ struct nilfs_bmap {
/* pointer type */
#define NILFS_BMAP_PTR_P 0 /* physical block number (i.e. LBN) */
-#define NILFS_BMAP_PTR_VS 1 /* virtual block number (single
- version) */
-#define NILFS_BMAP_PTR_VM 2 /* virtual block number (has multiple
- versions) */
+#define NILFS_BMAP_PTR_VS 1 /*
+ * virtual block number (single
+ * version)
+ */
+#define NILFS_BMAP_PTR_VM 2 /*
+ * virtual block number (has multiple
+ * versions)
+ */
#define NILFS_BMAP_PTR_U (-1) /* never perform pointer operations */
#define NILFS_BMAP_USE_VBN(bmap) ((bmap)->b_ptr_type > 0)
@@ -154,7 +154,7 @@ struct nilfs_bmap_store {
int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *);
-int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned);
+int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned int);
int nilfs_bmap_insert(struct nilfs_bmap *bmap, __u64 key, unsigned long rec);
int nilfs_bmap_delete(struct nilfs_bmap *bmap, __u64 key);
int nilfs_bmap_seek_key(struct nilfs_bmap *bmap, __u64 start, __u64 *keyp);
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index e0c9daf9aa22..0576033699bc 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -13,13 +13,8 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * This file was originally written by Seiji Kihara <kihara@osrg.net>
- * and fully revised by Ryusuke Konishi <ryusuke@osrg.net> for
- * stabilization and simplification.
+ * Originally written by Seiji Kihara.
+ * Fully revised by Ryusuke Konishi for stabilization and simplification.
*
*/
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index d876b565ce64..2cc1b80e18f7 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -13,12 +13,8 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Seiji Kihara <kihara@osrg.net>
- * Revised by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Seiji Kihara.
+ * Revised by Ryusuke Konishi.
*/
#ifndef _NILFS_BTNODE_H
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 3a3821b00486..eccb1c89ccbb 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#include <linux/slab.h>
@@ -689,7 +685,8 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *btree,
}
static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree,
- __u64 key, __u64 *ptrp, unsigned maxblocks)
+ __u64 key, __u64 *ptrp,
+ unsigned int maxblocks)
{
struct nilfs_btree_path *path;
struct nilfs_btree_node *node;
@@ -1032,12 +1029,12 @@ static __u64 nilfs_btree_find_target_v(const struct nilfs_bmap *btree,
if (ptr != NILFS_BMAP_INVALID_PTR)
/* sequential access */
return ptr;
- else {
- ptr = nilfs_btree_find_near(btree, path);
- if (ptr != NILFS_BMAP_INVALID_PTR)
- /* near */
- return ptr;
- }
+
+ ptr = nilfs_btree_find_near(btree, path);
+ if (ptr != NILFS_BMAP_INVALID_PTR)
+ /* near */
+ return ptr;
+
/* block group */
return nilfs_bmap_find_target_in_group(btree);
}
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index 22c02e35b6ef..df1a25faa83b 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#ifndef _NILFS_BTREE_H
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index b6596cab9e99..8a3d3b65af3f 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#include <linux/kernel.h>
@@ -41,6 +37,7 @@ static unsigned long
nilfs_cpfile_get_blkoff(const struct inode *cpfile, __u64 cno)
{
__u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1;
+
do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile));
return (unsigned long)tcno;
}
@@ -50,6 +47,7 @@ static unsigned long
nilfs_cpfile_get_offset(const struct inode *cpfile, __u64 cno)
{
__u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1;
+
return do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile));
}
@@ -433,7 +431,8 @@ static void nilfs_cpfile_checkpoint_to_cpinfo(struct inode *cpfile,
}
static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
- void *buf, unsigned cisz, size_t nci)
+ void *buf, unsigned int cisz,
+ size_t nci)
{
struct nilfs_checkpoint *cp;
struct nilfs_cpinfo *ci = buf;
@@ -484,7 +483,8 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
}
static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
- void *buf, unsigned cisz, size_t nci)
+ void *buf, unsigned int cisz,
+ size_t nci)
{
struct buffer_head *bh;
struct nilfs_cpfile_header *header;
@@ -570,7 +570,7 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
*/
ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode,
- void *buf, unsigned cisz, size_t nci)
+ void *buf, unsigned int cisz, size_t nci)
{
switch (mode) {
case NILFS_CHECKPOINT:
@@ -870,8 +870,10 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
void *kaddr;
int ret;
- /* CP number is invalid if it's zero or larger than the
- largest exist one.*/
+ /*
+ * CP number is invalid if it's zero or larger than the
+ * largest existing one.
+ */
if (cno == 0 || cno >= nilfs_mdt_cno(cpfile))
return -ENOENT;
down_read(&NILFS_MDT(cpfile)->mi_sem);
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index a242b9a314f9..0249744ae234 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#ifndef _NILFS_CPFILE_H
@@ -37,8 +33,8 @@ int nilfs_cpfile_delete_checkpoint(struct inode *, __u64);
int nilfs_cpfile_change_cpmode(struct inode *, __u64, int);
int nilfs_cpfile_is_snapshot(struct inode *, __u64);
int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *);
-ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned,
- size_t);
+ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *,
+ unsigned int, size_t);
int nilfs_cpfile_read(struct super_block *sb, size_t cpsize,
struct nilfs_inode *raw_inode, struct inode **inodep);
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 7dc23f100e57..7367610ea807 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#include <linux/types.h>
@@ -428,7 +424,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
return ret;
}
-ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz,
+ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned int visz,
size_t nvi)
{
struct buffer_head *entry_bh;
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index cbd8e9732503..abbfdabcabea 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#ifndef _NILFS_DAT_H
@@ -51,7 +47,7 @@ void nilfs_dat_abort_update(struct inode *, struct nilfs_palloc_req *,
int nilfs_dat_mark_dirty(struct inode *, __u64);
int nilfs_dat_freev(struct inode *, __u64 *, size_t);
int nilfs_dat_move(struct inode *, __u64, sector_t);
-ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t);
+ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned int, size_t);
int nilfs_dat_read(struct super_block *sb, size_t entry_size,
struct nilfs_inode *raw_inode, struct inode **inodep);
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 6723d45a631a..e506f4f7120a 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net>
+ * Modified for NILFS by Amagai Yoshiji.
*/
/*
* linux/fs/ext2/dir.c
@@ -50,7 +46,7 @@
* nilfs uses block-sized chunks. Arguably, sector-sized ones would be
* more robust, but we have what we have
*/
-static inline unsigned nilfs_chunk_size(struct inode *inode)
+static inline unsigned int nilfs_chunk_size(struct inode *inode)
{
return inode->i_sb->s_blocksize;
}
@@ -65,9 +61,9 @@ static inline void nilfs_put_page(struct page *page)
* Return the offset into page `page_nr' of the last valid
* byte in that page, plus one.
*/
-static unsigned nilfs_last_byte(struct inode *inode, unsigned long page_nr)
+static unsigned int nilfs_last_byte(struct inode *inode, unsigned long page_nr)
{
- unsigned last_byte = inode->i_size;
+ unsigned int last_byte = inode->i_size;
last_byte -= page_nr << PAGE_SHIFT;
if (last_byte > PAGE_SIZE)
@@ -75,20 +71,22 @@ static unsigned nilfs_last_byte(struct inode *inode, unsigned long page_nr)
return last_byte;
}
-static int nilfs_prepare_chunk(struct page *page, unsigned from, unsigned to)
+static int nilfs_prepare_chunk(struct page *page, unsigned int from,
+ unsigned int to)
{
loff_t pos = page_offset(page) + from;
+
return __block_write_begin(page, pos, to - from, nilfs_get_block);
}
static void nilfs_commit_chunk(struct page *page,
struct address_space *mapping,
- unsigned from, unsigned to)
+ unsigned int from, unsigned int to)
{
struct inode *dir = mapping->host;
loff_t pos = page_offset(page) + from;
- unsigned len = to - from;
- unsigned nr_dirty, copied;
+ unsigned int len = to - from;
+ unsigned int nr_dirty, copied;
int err;
nr_dirty = nilfs_page_count_clean_buffers(page, from, to);
@@ -106,10 +104,10 @@ static bool nilfs_check_page(struct page *page)
{
struct inode *dir = page->mapping->host;
struct super_block *sb = dir->i_sb;
- unsigned chunk_size = nilfs_chunk_size(dir);
+ unsigned int chunk_size = nilfs_chunk_size(dir);
char *kaddr = page_address(page);
- unsigned offs, rec_len;
- unsigned limit = PAGE_SIZE;
+ unsigned int offs, rec_len;
+ unsigned int limit = PAGE_SIZE;
struct nilfs_dir_entry *p;
char *error;
@@ -259,7 +257,6 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx)
unsigned int offset = pos & ~PAGE_MASK;
unsigned long n = pos >> PAGE_SHIFT;
unsigned long npages = dir_pages(inode);
-/* unsigned chunk_mask = ~(nilfs_chunk_size(inode)-1); */
if (pos > inode->i_size - NILFS_DIR_REC_LEN(1))
return 0;
@@ -321,7 +318,7 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr,
{
const unsigned char *name = qstr->name;
int namelen = qstr->len;
- unsigned reclen = NILFS_DIR_REC_LEN(namelen);
+ unsigned int reclen = NILFS_DIR_REC_LEN(namelen);
unsigned long start, n;
unsigned long npages = dir_pages(dir);
struct page *page = NULL;
@@ -340,6 +337,7 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr,
n = start;
do {
char *kaddr;
+
page = nilfs_get_page(dir, n);
if (!IS_ERR(page)) {
kaddr = page_address(page);
@@ -410,8 +408,8 @@ ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr)
void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
struct page *page, struct inode *inode)
{
- unsigned from = (char *) de - (char *) page_address(page);
- unsigned to = from + nilfs_rec_len_from_disk(de->rec_len);
+ unsigned int from = (char *)de - (char *)page_address(page);
+ unsigned int to = from + nilfs_rec_len_from_disk(de->rec_len);
struct address_space *mapping = page->mapping;
int err;
@@ -433,15 +431,15 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode)
struct inode *dir = d_inode(dentry->d_parent);
const unsigned char *name = dentry->d_name.name;
int namelen = dentry->d_name.len;
- unsigned chunk_size = nilfs_chunk_size(dir);
- unsigned reclen = NILFS_DIR_REC_LEN(namelen);
+ unsigned int chunk_size = nilfs_chunk_size(dir);
+ unsigned int reclen = NILFS_DIR_REC_LEN(namelen);
unsigned short rec_len, name_len;
struct page *page = NULL;
struct nilfs_dir_entry *de;
unsigned long npages = dir_pages(dir);
unsigned long n;
char *kaddr;
- unsigned from, to;
+ unsigned int from, to;
int err;
/*
@@ -533,13 +531,14 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
struct address_space *mapping = page->mapping;
struct inode *inode = mapping->host;
char *kaddr = page_address(page);
- unsigned from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1);
- unsigned to = ((char *)dir - kaddr) +
- nilfs_rec_len_from_disk(dir->rec_len);
- struct nilfs_dir_entry *pde = NULL;
- struct nilfs_dir_entry *de = (struct nilfs_dir_entry *)(kaddr + from);
+ unsigned int from, to;
+ struct nilfs_dir_entry *de, *pde = NULL;
int err;
+ from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1);
+ to = ((char *)dir - kaddr) + nilfs_rec_len_from_disk(dir->rec_len);
+ de = (struct nilfs_dir_entry *)(kaddr + from);
+
while ((char *)de < (char *)dir) {
if (de->rec_len == 0) {
nilfs_error(inode->i_sb, __func__,
@@ -572,7 +571,7 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent)
{
struct address_space *mapping = inode->i_mapping;
struct page *page = grab_cache_page(mapping, 0);
- unsigned chunk_size = nilfs_chunk_size(inode);
+ unsigned int chunk_size = nilfs_chunk_size(inode);
struct nilfs_dir_entry *de;
int err;
void *kaddr;
@@ -630,8 +629,8 @@ int nilfs_empty_dir(struct inode *inode)
while ((char *)de <= kaddr) {
if (de->rec_len == 0) {
nilfs_error(inode->i_sb, __func__,
- "zero-length directory entry "
- "(kaddr=%p, de=%p)\n", kaddr, de);
+ "zero-length directory entry (kaddr=%p, de=%p)",
+ kaddr, de);
goto not_empty;
}
if (de->inode != 0) {
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index ebf89fd8ac1a..251a44928405 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#include <linux/errno.h>
@@ -62,7 +58,7 @@ static int nilfs_direct_lookup(const struct nilfs_bmap *direct,
static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct,
__u64 key, __u64 *ptrp,
- unsigned maxblocks)
+ unsigned int maxblocks)
{
struct inode *dat = NULL;
__u64 ptr, ptr2;
@@ -83,7 +79,8 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct,
ptr = blocknr;
}
- maxblocks = min_t(unsigned, maxblocks, NILFS_DIRECT_KEY_MAX - key + 1);
+ maxblocks = min_t(unsigned int, maxblocks,
+ NILFS_DIRECT_KEY_MAX - key + 1);
for (cnt = 1; cnt < maxblocks &&
(ptr2 = nilfs_direct_get_ptr(direct, key + cnt)) !=
NILFS_BMAP_INVALID_PTR;
@@ -110,9 +107,9 @@ nilfs_direct_find_target_v(const struct nilfs_bmap *direct, __u64 key)
if (ptr != NILFS_BMAP_INVALID_PTR)
/* sequential access */
return ptr;
- else
- /* block group */
- return nilfs_bmap_find_target_in_group(direct);
+
+ /* block group */
+ return nilfs_bmap_find_target_in_group(direct);
}
static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h
index dc643de20a25..3015a6e78724 100644
--- a/fs/nilfs2/direct.h
+++ b/fs/nilfs2/direct.h
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#ifndef _NILFS_DIRECT_H
diff --git a/fs/nilfs2/export.h b/fs/nilfs2/export.h
index 19ccbf9522ab..00107fdb9343 100644
--- a/fs/nilfs2/export.h
+++ b/fs/nilfs2/export.h
@@ -20,6 +20,6 @@ struct nilfs_fid {
u32 parent_gen;
u64 parent_ino;
-} __attribute__ ((packed));
+} __packed;
#endif
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 088ba001c6ef..547381f3ce13 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -13,12 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Amagai Yoshiji <amagai@osrg.net>,
- * Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Amagai Yoshiji and Ryusuke Konishi.
*/
#include <linux/fs.h>
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 0224b7826ace..693aded72498 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -13,13 +13,8 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>,
- * and Ryusuke Konishi <ryusuke@osrg.net>.
- * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
+ * Written by Seiji Kihara, Amagai Yoshiji, and Ryusuke Konishi.
+ * Revised by Ryusuke Konishi.
*
*/
/*
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index 6548c7851b48..1d2b1805327a 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -13,12 +13,8 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Amagai Yoshiji <amagai@osrg.net>.
- * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
+ * Written by Amagai Yoshiji.
+ * Revised by Ryusuke Konishi.
*
*/
@@ -68,8 +64,10 @@ int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino,
struct nilfs_palloc_req req;
int ret;
- req.pr_entry_nr = 0; /* 0 says find free inode from beginning of
- a group. dull code!! */
+ req.pr_entry_nr = 0; /*
+ * 0 says find free inode from beginning
+ * of a group. dull code!!
+ */
req.pr_entry_bh = NULL;
ret = nilfs_palloc_prepare_alloc_entry(ifile, &req);
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index 679674d13372..23ad2f091e76 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -13,12 +13,8 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Amagai Yoshiji <amagai@osrg.net>
- * Revised by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Amagai Yoshiji.
+ * Revised by Ryusuke Konishi.
*
*/
@@ -36,6 +32,7 @@ static inline struct nilfs_inode *
nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh)
{
void *kaddr = kmap(ibh->b_page);
+
return nilfs_palloc_block_get_entry(ifile, ino, ibh, kaddr);
}
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index cfebcd2fc7f3..a0ebdb17e912 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
*
*/
@@ -87,7 +83,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
__u64 blknum = 0;
int err = 0, ret;
- unsigned maxblocks = bh_result->b_size >> inode->i_blkbits;
+ unsigned int maxblocks = bh_result->b_size >> inode->i_blkbits;
down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks);
@@ -133,11 +129,14 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
/* Error handling should be detailed */
set_buffer_new(bh_result);
set_buffer_delay(bh_result);
- map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed
- to proper value */
+ map_bh(bh_result, inode->i_sb, 0);
+ /* Disk block number must be changed to proper value */
+
} else if (ret == -ENOENT) {
- /* not found is not error (e.g. hole); must return without
- the mapped state flag. */
+ /*
+ * not found is not error (e.g. hole); must return without
+ * the mapped state flag.
+ */
;
} else {
err = ret;
@@ -167,7 +166,7 @@ static int nilfs_readpage(struct file *file, struct page *page)
* @nr_pages - number of pages to be read
*/
static int nilfs_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+ struct list_head *pages, unsigned int nr_pages)
{
return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block);
}
@@ -226,7 +225,7 @@ static int nilfs_set_page_dirty(struct page *page)
int ret = __set_page_dirty_nobuffers(page);
if (page_has_buffers(page)) {
- unsigned nr_dirty = 0;
+ unsigned int nr_dirty = 0;
struct buffer_head *bh, *head;
/*
@@ -249,7 +248,7 @@ static int nilfs_set_page_dirty(struct page *page)
if (nr_dirty)
nilfs_set_file_dirty(inode, nr_dirty);
} else if (ret) {
- unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits);
+ unsigned int nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits);
nilfs_set_file_dirty(inode, nr_dirty);
}
@@ -291,8 +290,8 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata)
{
struct inode *inode = mapping->host;
- unsigned start = pos & (PAGE_SIZE - 1);
- unsigned nr_dirty;
+ unsigned int start = pos & (PAGE_SIZE - 1);
+ unsigned int nr_dirty;
int err;
nr_dirty = nilfs_page_count_clean_buffers(page, start,
@@ -399,23 +398,26 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
err = nilfs_init_acl(inode, dir);
if (unlikely(err))
- goto failed_after_creation; /* never occur. When supporting
- nilfs_init_acl(), proper cancellation of
- above jobs should be considered */
+ /*
+ * Never occur. When supporting nilfs_init_acl(),
+ * proper cancellation of above jobs should be considered.
+ */
+ goto failed_after_creation;
return inode;
failed_after_creation:
clear_nlink(inode);
unlock_new_inode(inode);
- iput(inode); /* raw_inode will be deleted through
- nilfs_evict_inode() */
+ iput(inode); /*
+ * raw_inode will be deleted through
+ * nilfs_evict_inode().
+ */
goto failed;
failed_ifile_create_inode:
make_bad_inode(inode);
- iput(inode); /* if i_nlink == 1, generic_forget_inode() will be
- called */
+ iput(inode);
failed:
return ERR_PTR(err);
}
@@ -666,8 +668,10 @@ void nilfs_write_inode_common(struct inode *inode,
else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
raw_inode->i_device_code =
cpu_to_le64(huge_encode_dev(inode->i_rdev));
- /* When extending inode, nilfs->ns_inode_size should be checked
- for substitutions of appended fields */
+ /*
+ * When extending inode, nilfs->ns_inode_size should be checked
+ * for substitutions of appended fields.
+ */
}
void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags)
@@ -685,9 +689,12 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags)
set_bit(NILFS_I_INODE_SYNC, &ii->i_state);
nilfs_write_inode_common(inode, raw_inode, 0);
- /* XXX: call with has_bmap = 0 is a workaround to avoid
- deadlock of bmap. This delays update of i_bmap to just
- before writing */
+ /*
+ * XXX: call with has_bmap = 0 is a workaround to avoid
+ * deadlock of bmap. This delays update of i_bmap to just
+ * before writing.
+ */
+
nilfs_ifile_unmap_inode(ifile, ino, ibh);
}
@@ -752,14 +759,15 @@ void nilfs_truncate(struct inode *inode)
nilfs_mark_inode_dirty(inode);
nilfs_set_file_dirty(inode, 0);
nilfs_transaction_commit(sb);
- /* May construct a logical segment and may fail in sync mode.
- But truncate has no return value. */
+ /*
+ * May construct a logical segment and may fail in sync mode.
+ * But truncate has no return value.
+ */
}
static void nilfs_clear_inode(struct inode *inode)
{
struct nilfs_inode_info *ii = NILFS_I(inode);
- struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
/*
* Free resources allocated in nilfs_read_inode(), here.
@@ -768,8 +776,8 @@ static void nilfs_clear_inode(struct inode *inode)
brelse(ii->i_bh);
ii->i_bh = NULL;
- if (mdi && mdi->mi_palloc_cache)
- nilfs_palloc_destroy_cache(inode);
+ if (nilfs_is_metadata_file_inode(inode))
+ nilfs_mdt_clear(inode);
if (test_bit(NILFS_I_BMAP, &ii->i_state))
nilfs_bmap_clear(ii->i_bmap);
@@ -811,8 +819,10 @@ void nilfs_evict_inode(struct inode *inode)
if (IS_SYNC(inode))
nilfs_set_transaction_flag(NILFS_TI_SYNC);
nilfs_transaction_commit(sb);
- /* May construct a logical segment and may fail in sync mode.
- But delete_inode has no return value. */
+ /*
+ * May construct a logical segment and may fail in sync mode.
+ * But delete_inode has no return value.
+ */
}
int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
@@ -856,6 +866,7 @@ out_err:
int nilfs_permission(struct inode *inode, int mask)
{
struct nilfs_root *root = NILFS_I(inode)->i_root;
+
if ((mask & MAY_WRITE) && root &&
root->cno != NILFS_CPTREE_CURRENT_CNO)
return -EROFS; /* snapshot is not writable */
@@ -906,7 +917,7 @@ int nilfs_inode_dirty(struct inode *inode)
return ret;
}
-int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
+int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty)
{
struct nilfs_inode_info *ii = NILFS_I(inode);
struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
@@ -919,17 +930,23 @@ int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
spin_lock(&nilfs->ns_inode_lock);
if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
!test_bit(NILFS_I_BUSY, &ii->i_state)) {
- /* Because this routine may race with nilfs_dispose_list(),
- we have to check NILFS_I_QUEUED here, too. */
+ /*
+ * Because this routine may race with nilfs_dispose_list(),
+ * we have to check NILFS_I_QUEUED here, too.
+ */
if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) {
- /* This will happen when somebody is freeing
- this inode. */
+ /*
+ * This will happen when somebody is freeing
+ * this inode.
+ */
nilfs_warning(inode->i_sb, __func__,
- "cannot get inode (ino=%lu)\n",
+ "cannot get inode (ino=%lu)",
inode->i_ino);
spin_unlock(&nilfs->ns_inode_lock);
- return -EINVAL; /* NILFS_I_DIRTY may remain for
- freeing inode */
+ return -EINVAL; /*
+ * NILFS_I_DIRTY may remain for
+ * freeing inode.
+ */
}
list_move_tail(&ii->i_dirty, &nilfs->ns_dirty_files);
set_bit(NILFS_I_QUEUED, &ii->i_state);
@@ -946,7 +963,7 @@ int __nilfs_mark_inode_dirty(struct inode *inode, int flags)
err = nilfs_load_inode_block(inode, &ibh);
if (unlikely(err)) {
nilfs_warning(inode->i_sb, __func__,
- "failed to reget inode block.\n");
+ "failed to reget inode block.");
return err;
}
nilfs_update_inode(inode, ibh, flags);
@@ -973,7 +990,7 @@ void nilfs_dirty_inode(struct inode *inode, int flags)
if (is_bad_inode(inode)) {
nilfs_warning(inode->i_sb, __func__,
- "tried to mark bad_inode dirty. ignored.\n");
+ "tried to mark bad_inode dirty. ignored.");
dump_stack();
return;
}
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index e8fe24882b5b..358b57e2cdf9 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#include <linux/fs.h>
@@ -783,6 +779,7 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
size_t nmembs = argv->v_nmembs;
struct nilfs_bmap *bmap = NILFS_I(nilfs->ns_dat)->i_bmap;
struct nilfs_bdesc *bdescs = buf;
+ struct buffer_head *bh;
int ret, i;
for (i = 0; i < nmembs; i++) {
@@ -800,12 +797,16 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
/* skip dead block */
continue;
if (bdescs[i].bd_level == 0) {
- ret = nilfs_mdt_mark_block_dirty(nilfs->ns_dat,
- bdescs[i].bd_offset);
- if (ret < 0) {
+ ret = nilfs_mdt_get_block(nilfs->ns_dat,
+ bdescs[i].bd_offset,
+ false, NULL, &bh);
+ if (unlikely(ret)) {
WARN_ON(ret == -ENOENT);
return ret;
}
+ mark_buffer_dirty(bh);
+ nilfs_mdt_mark_dirty(nilfs->ns_dat);
+ put_bh(bh);
} else {
ret = nilfs_bmap_mark(bmap, bdescs[i].bd_offset,
bdescs[i].bd_level);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index f6982b9153d5..3417d859a03c 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
*/
#include <linux/buffer_head.h>
@@ -32,6 +28,7 @@
#include "segment.h"
#include "page.h"
#include "mdt.h"
+#include "alloc.h" /* nilfs_palloc_destroy_cache() */
#include <trace/events/nilfs2.h>
@@ -393,34 +390,6 @@ int nilfs_mdt_forget_block(struct inode *inode, unsigned long block)
return ret;
}
-/**
- * nilfs_mdt_mark_block_dirty - mark a block on the meta data file dirty.
- * @inode: inode of the meta data file
- * @block: block offset
- *
- * Return Value: On success, it returns 0. On error, the following negative
- * error code is returned.
- *
- * %-ENOMEM - Insufficient memory available.
- *
- * %-EIO - I/O error
- *
- * %-ENOENT - the specified block does not exist (hole block)
- */
-int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block)
-{
- struct buffer_head *bh;
- int err;
-
- err = nilfs_mdt_read_block(inode, block, 0, &bh);
- if (unlikely(err))
- return err;
- mark_buffer_dirty(bh);
- nilfs_mdt_mark_dirty(inode);
- brelse(bh);
- return 0;
-}
-
int nilfs_mdt_fetch_dirty(struct inode *inode)
{
struct nilfs_inode_info *ii = NILFS_I(inode);
@@ -497,8 +466,32 @@ int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz)
return 0;
}
-void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
- unsigned header_size)
+/**
+ * nilfs_mdt_clear - do cleanup for the metadata file
+ * @inode: inode of the metadata file
+ */
+void nilfs_mdt_clear(struct inode *inode)
+{
+ struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
+
+ if (mdi->mi_palloc_cache)
+ nilfs_palloc_destroy_cache(inode);
+}
+
+/**
+ * nilfs_mdt_destroy - release resources used by the metadata file
+ * @inode: inode of the metadata file
+ */
+void nilfs_mdt_destroy(struct inode *inode)
+{
+ struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
+
+ kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
+ kfree(mdi);
+}
+
+void nilfs_mdt_set_entry_size(struct inode *inode, unsigned int entry_size,
+ unsigned int header_size)
{
struct nilfs_mdt_info *mi = NILFS_MDT(inode);
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index 03246cac3338..3f67f3932097 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
*/
#ifndef _NILFS_MDT_H
@@ -57,8 +53,8 @@ struct nilfs_shadow_map {
struct nilfs_mdt_info {
struct rw_semaphore mi_sem;
struct blockgroup_lock *mi_bgl;
- unsigned mi_entry_size;
- unsigned mi_first_entry_offset;
+ unsigned int mi_entry_size;
+ unsigned int mi_first_entry_offset;
unsigned long mi_entries_per_block;
struct nilfs_palloc_cache *mi_palloc_cache;
struct nilfs_shadow_map *mi_shadow;
@@ -71,6 +67,11 @@ static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode)
return inode->i_private;
}
+static inline int nilfs_is_metadata_file_inode(const struct inode *inode)
+{
+ return inode->i_private != NULL;
+}
+
/* Default GFP flags using highmem */
#define NILFS_MDT_GFP (__GFP_RECLAIM | __GFP_IO | __GFP_HIGHMEM)
@@ -83,11 +84,13 @@ int nilfs_mdt_find_block(struct inode *inode, unsigned long start,
struct buffer_head **out_bh);
int nilfs_mdt_delete_block(struct inode *, unsigned long);
int nilfs_mdt_forget_block(struct inode *, unsigned long);
-int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
int nilfs_mdt_fetch_dirty(struct inode *);
int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz);
-void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned);
+void nilfs_mdt_clear(struct inode *inode);
+void nilfs_mdt_destroy(struct inode *inode);
+
+void nilfs_mdt_set_entry_size(struct inode *, unsigned int, unsigned int);
int nilfs_mdt_setup_shadow_map(struct inode *inode,
struct nilfs_shadow_map *shadow);
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 3b2af05f9fb4..1ec8ae5995a5 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -13,12 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net>,
- * Ryusuke Konishi <ryusuke@osrg.net>
+ * Modified for NILFS by Amagai Yoshiji and Ryusuke Konishi.
*/
/*
* linux/fs/ext2/namei.c
@@ -49,6 +44,7 @@
static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode)
{
int err = nilfs_add_link(dentry, inode);
+
if (!err) {
d_instantiate(dentry, inode);
unlock_new_inode(inode);
@@ -143,7 +139,7 @@ static int nilfs_symlink(struct inode *dir, struct dentry *dentry,
{
struct nilfs_transaction_info ti;
struct super_block *sb = dir->i_sb;
- unsigned l = strlen(symname)+1;
+ unsigned int l = strlen(symname) + 1;
struct inode *inode;
int err;
@@ -288,7 +284,7 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry)
if (!inode->i_nlink) {
nilfs_warning(inode->i_sb, __func__,
- "deleting nonexistent file (%lu), %d\n",
+ "deleting nonexistent file (%lu), %d",
inode->i_ino, inode->i_nlink);
set_nlink(inode, 1);
}
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 385704027575..b1d48bc0532d 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -13,12 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>
- * Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Koji Sato and Ryusuke Konishi.
*/
#ifndef _NILFS_H
@@ -69,8 +64,10 @@ struct nilfs_inode_info {
*/
struct rw_semaphore xattr_sem;
#endif
- struct buffer_head *i_bh; /* i_bh contains a new or dirty
- disk inode */
+ struct buffer_head *i_bh; /*
+ * i_bh contains a new or dirty
+ * disk inode.
+ */
struct nilfs_root *i_root;
struct inode vfs_inode;
};
@@ -100,8 +97,10 @@ enum {
NILFS_I_NEW = 0, /* Inode is newly created */
NILFS_I_DIRTY, /* The file is dirty */
NILFS_I_QUEUED, /* inode is in dirty_files list */
- NILFS_I_BUSY, /* inode is grabbed by a segment
- constructor */
+ NILFS_I_BUSY, /*
+ * Inode is grabbed by a segment
+ * constructor
+ */
NILFS_I_COLLECTED, /* All dirty blocks are collected */
NILFS_I_UPDATED, /* The file has been written back */
NILFS_I_INODE_SYNC, /* dsync is not allowed for inode */
@@ -145,8 +144,10 @@ enum {
struct nilfs_transaction_info {
u32 ti_magic;
void *ti_save;
- /* This should never used. If this happens,
- one of other filesystems has a bug. */
+ /*
+ * This should never be used. If it happens,
+ * one of other filesystems has a bug.
+ */
unsigned short ti_flags;
unsigned short ti_count;
};
@@ -156,8 +157,10 @@ struct nilfs_transaction_info {
/* ti_flags */
#define NILFS_TI_DYNAMIC_ALLOC 0x0001 /* Allocated from slab */
-#define NILFS_TI_SYNC 0x0002 /* Force to construct segment at the
- end of transaction. */
+#define NILFS_TI_SYNC 0x0002 /*
+ * Force to construct segment at the
+ * end of transaction.
+ */
#define NILFS_TI_GC 0x0004 /* GC context */
#define NILFS_TI_COMMIT 0x0008 /* Change happened or not */
#define NILFS_TI_WRITER 0x0010 /* Constructor context */
@@ -279,7 +282,7 @@ extern void nilfs_write_failed(struct address_space *mapping, loff_t to);
int nilfs_permission(struct inode *inode, int mask);
int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh);
extern int nilfs_inode_dirty(struct inode *);
-int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty);
+int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty);
extern int __nilfs_mark_inode_dirty(struct inode *, int);
extern void nilfs_dirty_inode(struct inode *, int flags);
int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 489391561cda..d97ba5f11b77 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -13,12 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>,
- * Seiji Kihara <kihara@osrg.net>.
+ * Written by Ryusuke Konishi and Seiji Kihara.
*/
#include <linux/pagemap.h>
@@ -440,12 +435,12 @@ void nilfs_clear_dirty_page(struct page *page, bool silent)
__nilfs_clear_page_dirty(page);
}
-unsigned nilfs_page_count_clean_buffers(struct page *page,
- unsigned from, unsigned to)
+unsigned int nilfs_page_count_clean_buffers(struct page *page,
+ unsigned int from, unsigned int to)
{
- unsigned block_start, block_end;
+ unsigned int block_start, block_end;
struct buffer_head *bh, *head;
- unsigned nc = 0;
+ unsigned int nc = 0;
for (bh = head = page_buffers(page), block_start = 0;
bh != head || !block_start;
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index a43b8287d012..f3687c958fa8 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -13,12 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>,
- * Seiji Kihara <kihara@osrg.net>.
+ * Written by Ryusuke Konishi and Seiji Kihara.
*/
#ifndef _NILFS_PAGE_H
@@ -58,7 +53,8 @@ void nilfs_copy_back_pages(struct address_space *, struct address_space *);
void nilfs_clear_dirty_page(struct page *, bool);
void nilfs_clear_dirty_pages(struct address_space *, bool);
void nilfs_mapping_init(struct address_space *mapping, struct inode *inode);
-unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
+unsigned int nilfs_page_count_clean_buffers(struct page *, unsigned int,
+ unsigned int);
unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
sector_t start_blk,
sector_t *blkoff);
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 5afa77fadc11..d893dc912b62 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
*/
#include <linux/buffer_head.h>
@@ -47,8 +43,10 @@ enum {
/* work structure for recovery */
struct nilfs_recovery_block {
- ino_t ino; /* Inode number of the file that this block
- belongs to */
+ ino_t ino; /*
+ * Inode number of the file that this block
+ * belongs to
+ */
sector_t blocknr; /* block number */
__u64 vblocknr; /* virtual block number */
unsigned long blkoff; /* File offset of the data block (per block) */
@@ -156,7 +154,7 @@ int nilfs_read_super_root_block(struct the_nilfs *nilfs, sector_t sr_block,
sr = (struct nilfs_super_root *)bh_sr->b_data;
if (check) {
- unsigned bytes = le16_to_cpu(sr->sr_bytes);
+ unsigned int bytes = le16_to_cpu(sr->sr_bytes);
if (bytes == 0 || bytes > nilfs->ns_blocksize) {
ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT;
@@ -508,7 +506,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
{
struct inode *inode;
struct nilfs_recovery_block *rb, *n;
- unsigned blocksize = nilfs->ns_blocksize;
+ unsigned int blocksize = nilfs->ns_blocksize;
struct page *page;
loff_t pos;
int err = 0, err2 = 0;
@@ -526,6 +524,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
0, &page, nilfs_get_block);
if (unlikely(err)) {
loff_t isize = inode->i_size;
+
if (pos + blocksize > isize)
nilfs_write_failed(inode->i_mapping,
pos + blocksize);
@@ -872,9 +871,11 @@ int nilfs_search_super_root(struct the_nilfs *nilfs,
flags = le16_to_cpu(sum->ss_flags);
if (!(flags & NILFS_SS_SR) && !scan_newer) {
- /* This will never happen because a superblock
- (last_segment) always points to a pseg
- having a super root. */
+ /*
+ * This will never happen because a superblock
+ * (last_segment) always points to a pseg with
+ * a super root.
+ */
ret = NILFS_SEG_FAIL_CONSISTENCY;
goto failed;
}
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index f63620ce3892..bf36df10540b 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
*
*/
@@ -133,7 +129,7 @@ int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf,
return 0;
}
-int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
+int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned int flags,
time_t ctime, __u64 cno)
{
int err;
@@ -240,7 +236,7 @@ nilfs_segbuf_fill_in_super_root_crc(struct nilfs_segment_buffer *segbuf,
{
struct nilfs_super_root *raw_sr;
struct the_nilfs *nilfs = segbuf->sb_super->s_fs_info;
- unsigned srsize;
+ unsigned int srsize;
u32 crc;
raw_sr = (struct nilfs_super_root *)segbuf->sb_super_root->b_data;
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index b04f08cc2397..7bbccc099709 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
*
*/
#ifndef _NILFS_SEGBUF_H
@@ -82,7 +78,7 @@ struct nilfs_segment_buffer {
__u64 sb_nextnum;
sector_t sb_fseg_start, sb_fseg_end;
sector_t sb_pseg_start;
- unsigned sb_rest_blocks;
+ unsigned int sb_rest_blocks;
/* Buffers */
struct list_head sb_segsum_buffers;
@@ -124,7 +120,8 @@ void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf,
struct nilfs_segment_buffer *prev);
void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64,
struct the_nilfs *);
-int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t, __u64);
+int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned int, time_t,
+ __u64);
int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *);
int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *,
struct buffer_head **);
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 4317f72568e6..e78b68a81aec 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
*
*/
@@ -49,18 +45,26 @@
*/
#define SC_N_INODEVEC 16 /* Size of locally allocated inode vector */
-#define SC_MAX_SEGDELTA 64 /* Upper limit of the number of segments
- appended in collection retry loop */
+#define SC_MAX_SEGDELTA 64 /*
+ * Upper limit of the number of segments
+ * appended in collection retry loop
+ */
/* Construction mode */
enum {
SC_LSEG_SR = 1, /* Make a logical segment having a super root */
- SC_LSEG_DSYNC, /* Flush data blocks of a given file and make
- a logical segment without a super root */
- SC_FLUSH_FILE, /* Flush data files, leads to segment writes without
- creating a checkpoint */
- SC_FLUSH_DAT, /* Flush DAT file. This also creates segments without
- a checkpoint */
+ SC_LSEG_DSYNC, /*
+ * Flush data blocks of a given file and make
+ * a logical segment without a super root.
+ */
+ SC_FLUSH_FILE, /*
+ * Flush data files, leads to segment writes without
+ * creating a checkpoint.
+ */
+ SC_FLUSH_DAT, /*
+ * Flush DAT file. This also creates segments
+ * without a checkpoint.
+ */
};
/* Stage numbers of dirty block collection */
@@ -154,17 +158,15 @@ static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
if (cur_ti) {
if (cur_ti->ti_magic == NILFS_TI_MAGIC)
return ++cur_ti->ti_count;
- else {
- /*
- * If journal_info field is occupied by other FS,
- * it is saved and will be restored on
- * nilfs_transaction_commit().
- */
- printk(KERN_WARNING
- "NILFS warning: journal info from a different "
- "FS\n");
- save = current->journal_info;
- }
+
+ /*
+ * If journal_info field is occupied by other FS,
+ * it is saved and will be restored on
+ * nilfs_transaction_commit().
+ */
+ printk(KERN_WARNING
+ "NILFS warning: journal info from a different FS\n");
+ save = current->journal_info;
}
if (!ti) {
ti = kmem_cache_alloc(nilfs_transaction_cachep, GFP_NOFS);
@@ -397,10 +399,10 @@ static void nilfs_transaction_unlock(struct super_block *sb)
static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
struct nilfs_segsum_pointer *ssp,
- unsigned bytes)
+ unsigned int bytes)
{
struct nilfs_segment_buffer *segbuf = sci->sc_curseg;
- unsigned blocksize = sci->sc_super->s_blocksize;
+ unsigned int blocksize = sci->sc_super->s_blocksize;
void *p;
if (unlikely(ssp->offset + bytes > blocksize)) {
@@ -422,8 +424,8 @@ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci)
{
struct nilfs_segment_buffer *segbuf = sci->sc_curseg;
struct buffer_head *sumbh;
- unsigned sumbytes;
- unsigned flags = 0;
+ unsigned int sumbytes;
+ unsigned int flags = 0;
int err;
if (nilfs_doing_gc())
@@ -444,8 +446,10 @@ static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci)
{
sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks;
if (NILFS_SEGBUF_IS_LAST(sci->sc_curseg, &sci->sc_segbufs))
- return -E2BIG; /* The current segment is filled up
- (internal code) */
+ return -E2BIG; /*
+ * The current segment is filled up
+ * (internal code)
+ */
sci->sc_curseg = NILFS_NEXT_SEGBUF(sci->sc_curseg);
return nilfs_segctor_reset_segment_buffer(sci);
}
@@ -472,9 +476,9 @@ static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci)
*/
static int nilfs_segctor_segsum_block_required(
struct nilfs_sc_info *sci, const struct nilfs_segsum_pointer *ssp,
- unsigned binfo_size)
+ unsigned int binfo_size)
{
- unsigned blocksize = sci->sc_super->s_blocksize;
+ unsigned int blocksize = sci->sc_super->s_blocksize;
/* Size of finfo and binfo is enough small against blocksize */
return ssp->offset + binfo_size +
@@ -533,7 +537,7 @@ static void nilfs_segctor_end_finfo(struct nilfs_sc_info *sci,
static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci,
struct buffer_head *bh,
struct inode *inode,
- unsigned binfo_size)
+ unsigned int binfo_size)
{
struct nilfs_segment_buffer *segbuf;
int required, err = 0;
@@ -617,7 +621,7 @@ static void nilfs_write_file_node_binfo(struct nilfs_sc_info *sci,
*vblocknr = binfo->bi_v.bi_vblocknr;
}
-static struct nilfs_sc_operations nilfs_sc_file_ops = {
+static const struct nilfs_sc_operations nilfs_sc_file_ops = {
.collect_data = nilfs_collect_file_data,
.collect_node = nilfs_collect_file_node,
.collect_bmap = nilfs_collect_file_bmap,
@@ -666,7 +670,7 @@ static void nilfs_write_dat_node_binfo(struct nilfs_sc_info *sci,
*binfo_dat = binfo->bi_dat;
}
-static struct nilfs_sc_operations nilfs_sc_dat_ops = {
+static const struct nilfs_sc_operations nilfs_sc_dat_ops = {
.collect_data = nilfs_collect_dat_data,
.collect_node = nilfs_collect_file_node,
.collect_bmap = nilfs_collect_dat_bmap,
@@ -674,7 +678,7 @@ static struct nilfs_sc_operations nilfs_sc_dat_ops = {
.write_node_binfo = nilfs_write_dat_node_binfo,
};
-static struct nilfs_sc_operations nilfs_sc_dsync_ops = {
+static const struct nilfs_sc_operations nilfs_sc_dsync_ops = {
.collect_data = nilfs_collect_file_data,
.collect_node = NULL,
.collect_bmap = NULL,
@@ -777,7 +781,7 @@ static void nilfs_dispose_list(struct the_nilfs *nilfs,
{
struct nilfs_inode_info *ii, *n;
struct nilfs_inode_info *ivec[SC_N_INODEVEC], **pii;
- unsigned nv = 0;
+ unsigned int nv = 0;
while (!list_empty(head)) {
spin_lock(&nilfs->ns_inode_lock);
@@ -875,9 +879,11 @@ static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 1,
&raw_cp, &bh_cp);
if (likely(!err)) {
- /* The following code is duplicated with cpfile. But, it is
- needed to collect the checkpoint even if it was not newly
- created */
+ /*
+ * The following code duplicates code in cpfile, but it is
+ * needed to collect the checkpoint even if it was not newly
+ * created.
+ */
mark_buffer_dirty(bh_cp);
nilfs_mdt_mark_dirty(nilfs->ns_cpfile);
nilfs_cpfile_put_checkpoint(
@@ -958,7 +964,7 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
{
struct buffer_head *bh_sr;
struct nilfs_super_root *raw_sr;
- unsigned isz, srsz;
+ unsigned int isz, srsz;
bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root;
raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
@@ -1043,7 +1049,7 @@ static size_t nilfs_segctor_buffer_rest(struct nilfs_sc_info *sci)
static int nilfs_segctor_scan_file(struct nilfs_sc_info *sci,
struct inode *inode,
- struct nilfs_sc_operations *sc_ops)
+ const struct nilfs_sc_operations *sc_ops)
{
LIST_HEAD(data_buffers);
LIST_HEAD(node_buffers);
@@ -1406,8 +1412,10 @@ static void nilfs_free_incomplete_logs(struct list_head *logs,
if (atomic_read(&segbuf->sb_err)) {
/* Case 1: The first segment failed */
if (segbuf->sb_pseg_start != segbuf->sb_fseg_start)
- /* Case 1a: Partial segment appended into an existing
- segment */
+ /*
+ * Case 1a: Partial segment appended into an existing
+ * segment
+ */
nilfs_terminate_segment(nilfs, segbuf->sb_fseg_start,
segbuf->sb_fseg_end);
else /* Case 1b: New full segment */
@@ -1550,7 +1558,7 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
sector_t blocknr;
unsigned long nfinfo = segbuf->sb_sum.nfinfo;
unsigned long nblocks = 0, ndatablk = 0;
- struct nilfs_sc_operations *sc_op = NULL;
+ const struct nilfs_sc_operations *sc_op = NULL;
struct nilfs_segsum_pointer ssp;
struct nilfs_finfo *finfo = NULL;
union nilfs_binfo binfo;
@@ -1631,8 +1639,10 @@ static int nilfs_segctor_assign(struct nilfs_sc_info *sci, int mode)
static void nilfs_begin_page_io(struct page *page)
{
if (!page || PageWriteback(page))
- /* For split b-tree node pages, this function may be called
- twice. We ignore the 2nd or later calls by this check. */
+ /*
+ * For split b-tree node pages, this function may be called
+ * twice. We ignore the 2nd or later calls by this check.
+ */
return;
lock_page(page);
@@ -1942,7 +1952,7 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci,
ifile, ii->vfs_inode.i_ino, &ibh);
if (unlikely(err)) {
nilfs_warning(sci->sc_super, __func__,
- "failed to get inode block.\n");
+ "failed to get inode block.");
return err;
}
mark_buffer_dirty(ibh);
@@ -2395,6 +2405,7 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
static void nilfs_construction_timeout(unsigned long data)
{
struct task_struct *p = (struct task_struct *)data;
+
wake_up_process(p);
}
@@ -2555,10 +2566,10 @@ static int nilfs_segctor_thread(void *arg)
if (timeout || sci->sc_seq_request != sci->sc_seq_done)
mode = SC_LSEG_SR;
- else if (!sci->sc_flush_request)
- break;
- else
+ else if (sci->sc_flush_request)
mode = nilfs_segctor_flush_mode(sci);
+ else
+ break;
spin_unlock(&sci->sc_state_lock);
nilfs_segctor_thread_construct(sci, mode);
@@ -2684,8 +2695,10 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
{
int ret, retrycount = NILFS_SC_CLEANUP_RETRY;
- /* The segctord thread was stopped and its timer was removed.
- But some tasks remain. */
+ /*
+ * The segctord thread was stopped and its timer was removed.
+ * But some tasks remain.
+ */
do {
struct nilfs_transaction_info ti;
@@ -2727,13 +2740,13 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
if (!list_empty(&sci->sc_dirty_files)) {
nilfs_warning(sci->sc_super, __func__,
- "dirty file(s) after the final construction\n");
+ "dirty file(s) after the final construction");
nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1);
}
if (!list_empty(&sci->sc_iput_queue)) {
nilfs_warning(sci->sc_super, __func__,
- "iput queue is not empty\n");
+ "iput queue is not empty");
nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 1);
}
@@ -2810,7 +2823,7 @@ void nilfs_detach_log_writer(struct super_block *sb)
if (!list_empty(&nilfs->ns_dirty_files)) {
list_splice_init(&nilfs->ns_dirty_files, &garbage_list);
nilfs_warning(sb, __func__,
- "Hit dirty file after stopped log writer\n");
+ "Hit dirty file after stopped log writer");
}
spin_unlock(&nilfs->ns_inode_lock);
up_write(&nilfs->ns_segctor_sem);
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 0408b9b2814b..6565c10b7b76 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
*
*/
#ifndef _NILFS_SEGMENT_H
@@ -75,7 +71,7 @@ struct nilfs_recovery_info {
*/
struct nilfs_cstage {
int scnt;
- unsigned flags;
+ unsigned int flags;
struct nilfs_inode_info *dirty_file_ptr;
struct nilfs_inode_info *gc_inode_ptr;
};
@@ -84,7 +80,7 @@ struct nilfs_segment_buffer;
struct nilfs_segsum_pointer {
struct buffer_head *bh;
- unsigned offset; /* offset in bytes */
+ unsigned int offset; /* offset in bytes */
};
/**
@@ -193,11 +189,15 @@ enum {
NILFS_SC_DIRTY, /* One or more dirty meta-data blocks exist */
NILFS_SC_UNCLOSED, /* Logical segment is not closed */
NILFS_SC_SUPER_ROOT, /* The latest segment has a super root */
- NILFS_SC_PRIOR_FLUSH, /* Requesting immediate flush without making a
- checkpoint */
- NILFS_SC_HAVE_DELTA, /* Next checkpoint will have update of files
- other than DAT, cpfile, sufile, or files
- moved by GC */
+ NILFS_SC_PRIOR_FLUSH, /*
+ * Requesting immediate flush without making a
+ * checkpoint
+ */
+ NILFS_SC_HAVE_DELTA, /*
+ * Next checkpoint will have update of files
+ * other than DAT, cpfile, sufile, or files
+ * moved by GC.
+ */
};
/* sc_state */
@@ -207,17 +207,23 @@ enum {
/*
* Constant parameters
*/
-#define NILFS_SC_CLEANUP_RETRY 3 /* Retry count of construction when
- destroying segctord */
+#define NILFS_SC_CLEANUP_RETRY 3 /*
+ * Retry count of construction when
+ * destroying segctord
+ */
/*
* Default values of timeout, in seconds.
*/
-#define NILFS_SC_DEFAULT_TIMEOUT 5 /* Timeout value of dirty blocks.
- It triggers construction of a
- logical segment with a super root */
-#define NILFS_SC_DEFAULT_SR_FREQ 30 /* Maximum frequency of super root
- creation */
+#define NILFS_SC_DEFAULT_TIMEOUT 5 /*
+ * Timeout value of dirty blocks.
+ * It triggers construction of a
+ * logical segment with a super root.
+ */
+#define NILFS_SC_DEFAULT_SR_FREQ 30 /*
+ * Maximum frequency of super root
+ * creation
+ */
/*
* The default threshold amount of data, in block counts.
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 52821ffc11f4..1963595a1580 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -13,12 +13,8 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
- * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
+ * Written by Koji Sato.
+ * Revised by Ryusuke Konishi.
*/
#include <linux/kernel.h>
@@ -61,6 +57,7 @@ static unsigned long
nilfs_sufile_get_blkoff(const struct inode *sufile, __u64 segnum)
{
__u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset;
+
do_div(t, nilfs_sufile_segment_usages_per_block(sufile));
return (unsigned long)t;
}
@@ -69,6 +66,7 @@ static unsigned long
nilfs_sufile_get_offset(const struct inode *sufile, __u64 segnum)
{
__u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset;
+
return do_div(t, nilfs_sufile_segment_usages_per_block(sufile));
}
@@ -819,7 +817,7 @@ out:
* %-ENOMEM - Insufficient amount of memory available.
*/
ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
- unsigned sisz, size_t nsi)
+ unsigned int sisz, size_t nsi)
{
struct buffer_head *su_bh;
struct nilfs_segment_usage *su;
@@ -897,7 +895,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
* %-EINVAL - Invalid values in input (segment number, flags or nblocks)
*/
ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf,
- unsigned supsz, size_t nsup)
+ unsigned int supsz, size_t nsup)
{
struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
struct buffer_head *header_bh, *bh;
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index b8afd72f2379..46e89872294c 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>.
+ * Written by Koji Sato.
*/
#ifndef _NILFS_SUFILE_H
@@ -42,9 +38,9 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum);
int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
unsigned long nblocks, time_t modtime);
int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *);
-ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned,
+ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned int,
size_t);
-ssize_t nilfs_sufile_set_suinfo(struct inode *, void *, unsigned , size_t);
+ssize_t nilfs_sufile_set_suinfo(struct inode *, void *, unsigned int, size_t);
int nilfs_sufile_updatev(struct inode *, __u64 *, size_t, int, size_t *,
void (*dofunc)(struct inode *, __u64,
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 7f5d3d9f1c37..666107a18a22 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
*/
/*
* linux/fs/ext2/super.c
@@ -173,12 +169,10 @@ struct inode *nilfs_alloc_inode(struct super_block *sb)
static void nilfs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
- if (mdi) {
- kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
- kfree(mdi);
- }
+ if (nilfs_is_metadata_file_inode(inode))
+ nilfs_mdt_destroy(inode);
+
kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
}
@@ -279,7 +273,7 @@ struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb,
}
} else if (sbp[1] &&
sbp[1]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) {
- memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
+ memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
}
if (flip && sbp[1])
@@ -749,6 +743,7 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
while ((p = strsep(&options, ",")) != NULL) {
int token;
+
if (!*p)
continue;
@@ -891,7 +886,7 @@ int nilfs_store_magic_and_option(struct super_block *sb,
nilfs->ns_interval = le32_to_cpu(sbp->s_c_interval);
nilfs->ns_watermark = le32_to_cpu(sbp->s_c_block_max);
- return !parse_options(data, sb, 0) ? -EINVAL : 0 ;
+ return !parse_options(data, sb, 0) ? -EINVAL : 0;
}
int nilfs_check_feature_compatibility(struct super_block *sb,
@@ -1316,7 +1311,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
}
if (!s->s_root) {
- s_new = true;
+ s_new = true;
/* New superblock instance created */
s->s_mode = mode;
diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c
index bbb0dcc35905..8ffa42b704d8 100644
--- a/fs/nilfs2/sysfs.c
+++ b/fs/nilfs2/sysfs.c
@@ -68,7 +68,7 @@ static ssize_t nilfs_##name##_attr_store(struct kobject *kobj, \
static const struct sysfs_ops nilfs_##name##_attr_ops = { \
.show = nilfs_##name##_attr_show, \
.store = nilfs_##name##_attr_store, \
-};
+}
#define NILFS_DEV_INT_GROUP_TYPE(name, parent_name) \
static void nilfs_##name##_attr_release(struct kobject *kobj) \
@@ -84,7 +84,7 @@ static struct kobj_type nilfs_##name##_ktype = { \
.default_attrs = nilfs_##name##_attrs, \
.sysfs_ops = &nilfs_##name##_attr_ops, \
.release = nilfs_##name##_attr_release, \
-};
+}
#define NILFS_DEV_INT_GROUP_FNS(name, parent_name) \
static int nilfs_sysfs_create_##name##_group(struct the_nilfs *nilfs) \
@@ -756,7 +756,7 @@ nilfs_superblock_sb_write_count_show(struct nilfs_superblock_attr *attr,
struct the_nilfs *nilfs,
char *buf)
{
- unsigned sbwcount;
+ unsigned int sbwcount;
down_read(&nilfs->ns_sem);
sbwcount = nilfs->ns_sbwcount;
@@ -770,7 +770,7 @@ nilfs_superblock_sb_update_frequency_show(struct nilfs_superblock_attr *attr,
struct the_nilfs *nilfs,
char *buf)
{
- unsigned sb_update_freq;
+ unsigned int sb_update_freq;
down_read(&nilfs->ns_sem);
sb_update_freq = nilfs->ns_sb_update_freq;
@@ -784,7 +784,7 @@ nilfs_superblock_sb_update_frequency_store(struct nilfs_superblock_attr *attr,
struct the_nilfs *nilfs,
const char *buf, size_t count)
{
- unsigned val;
+ unsigned int val;
int err;
err = kstrtouint(skip_spaces(buf), 0, &val);
diff --git a/fs/nilfs2/sysfs.h b/fs/nilfs2/sysfs.h
index 677e3a1a8370..648cedf9c06e 100644
--- a/fs/nilfs2/sysfs.h
+++ b/fs/nilfs2/sysfs.h
@@ -66,7 +66,7 @@ struct nilfs_##name##_attr { \
char *); \
ssize_t (*store)(struct kobject *, struct attribute *, \
const char *, size_t); \
-};
+}
NILFS_COMMON_ATTR_STRUCT(feature);
@@ -77,7 +77,7 @@ struct nilfs_##name##_attr { \
char *); \
ssize_t (*store)(struct nilfs_##name##_attr *, struct the_nilfs *, \
const char *, size_t); \
-};
+}
NILFS_DEV_ATTR_STRUCT(dev);
NILFS_DEV_ATTR_STRUCT(segments);
@@ -93,7 +93,7 @@ struct nilfs_##name##_attr { \
char *); \
ssize_t (*store)(struct nilfs_##name##_attr *, struct nilfs_root *, \
const char *, size_t); \
-};
+}
NILFS_CP_ATTR_STRUCT(snapshot);
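The sysfs hunks above move the terminating semicolon out of the macro bodies so that each NILFS_*_ATTR_STRUCT()/NILFS_DEV_INT_GROUP_TYPE() use site supplies its own, leaving no stray empty ";" declaration behind. A minimal stand-alone sketch of the same pattern (made-up names, not nilfs code):

#include <stdio.h>

#define DEFINE_COUNTER_TYPE(name)	\
struct name##_counter {			\
	unsigned long count;		\
}					/* no trailing ';' in the macro body */

DEFINE_COUNTER_TYPE(dev);		/* the ';' now comes from the use site */

int main(void)
{
	struct dev_counter c = { 0 };

	printf("count=%lu\n", c.count);
	return 0;
}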
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 69bd801afb53..809bd2de7ad0 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
*
*/
@@ -112,8 +108,8 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs,
struct nilfs_super_root *raw_sr;
struct nilfs_super_block **sbp = nilfs->ns_sbp;
struct nilfs_inode *rawi;
- unsigned dat_entry_size, segment_usage_size, checkpoint_size;
- unsigned inode_size;
+ unsigned int dat_entry_size, segment_usage_size, checkpoint_size;
+ unsigned int inode_size;
int err;
err = nilfs_read_super_root_block(nilfs, sr_block, &bh_sr, 1);
@@ -621,8 +617,10 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp);
if (err)
goto out;
- /* not failed_sbh; sbh is released automatically
- when reloading fails. */
+ /*
+ * Don't jump to failed_sbh; sbh is released automatically
+ * when reloading fails.
+ */
}
nilfs->ns_blocksize_bits = sb->s_blocksize_bits;
nilfs->ns_blocksize = blocksize;
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 23778d385836..79369fd6b13b 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -13,11 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Ryusuke Konishi.
*
*/
@@ -118,10 +114,10 @@ struct the_nilfs {
struct buffer_head *ns_sbh[2];
struct nilfs_super_block *ns_sbp[2];
time_t ns_sbwtime;
- unsigned ns_sbwcount;
- unsigned ns_sbsize;
- unsigned ns_mount_state;
- unsigned ns_sb_update_freq;
+ unsigned int ns_sbwcount;
+ unsigned int ns_sbsize;
+ unsigned int ns_mount_state;
+ unsigned int ns_sb_update_freq;
/*
* Following fields are dedicated to a writable FS-instance.
@@ -226,15 +222,14 @@ THE_NILFS_FNS(SB_DIRTY, sb_dirty)
* Mount option operations
*/
#define nilfs_clear_opt(nilfs, opt) \
- do { (nilfs)->ns_mount_opt &= ~NILFS_MOUNT_##opt; } while (0)
+ ((nilfs)->ns_mount_opt &= ~NILFS_MOUNT_##opt)
#define nilfs_set_opt(nilfs, opt) \
- do { (nilfs)->ns_mount_opt |= NILFS_MOUNT_##opt; } while (0)
+ ((nilfs)->ns_mount_opt |= NILFS_MOUNT_##opt)
#define nilfs_test_opt(nilfs, opt) ((nilfs)->ns_mount_opt & NILFS_MOUNT_##opt)
#define nilfs_write_opt(nilfs, mask, opt) \
- do { (nilfs)->ns_mount_opt = \
+ ((nilfs)->ns_mount_opt = \
(((nilfs)->ns_mount_opt & ~NILFS_MOUNT_##mask) | \
- NILFS_MOUNT_##opt); \
- } while (0)
+ NILFS_MOUNT_##opt)) \
/**
* struct nilfs_root - nilfs root object
@@ -273,6 +268,7 @@ struct nilfs_root {
static inline int nilfs_sb_need_update(struct the_nilfs *nilfs)
{
u64 t = get_seconds();
+
return t < nilfs->ns_sbwtime ||
t > nilfs->ns_sbwtime + nilfs->ns_sb_update_freq;
}
@@ -280,6 +276,7 @@ static inline int nilfs_sb_need_update(struct the_nilfs *nilfs)
static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs)
{
int flip_bits = nilfs->ns_sbwcount & 0x0FL;
+
return (flip_bits != 0x08 && flip_bits != 0x0F);
}
@@ -308,7 +305,7 @@ static inline void nilfs_get_root(struct nilfs_root *root)
static inline int nilfs_valid_fs(struct the_nilfs *nilfs)
{
- unsigned valid_fs;
+ unsigned int valid_fs;
down_read(&nilfs->ns_sem);
valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS);
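The mount-option hunk above turns nilfs_set_opt()/nilfs_clear_opt()/nilfs_write_opt() from do { } while (0) statements into parenthesized expressions. A small userspace sketch of why that matters (the EX_* and ex_* names are invented for illustration):

#include <stdio.h>

#define EX_MOUNT_BARRIER	0x0001
#define EX_MOUNT_ERROR_RO	0x0002

struct ex_fs { unsigned int ns_mount_opt; };

#define ex_set_opt(fs, opt)	((fs)->ns_mount_opt |= EX_MOUNT_##opt)
#define ex_clear_opt(fs, opt)	((fs)->ns_mount_opt &= ~EX_MOUNT_##opt)
#define ex_test_opt(fs, opt)	((fs)->ns_mount_opt & EX_MOUNT_##opt)

int main(void)
{
	struct ex_fs fs = { 0 };
	/* expression context: a statement-style do { } while (0) macro
	 * could not be used on the right-hand side of an assignment */
	unsigned int opts = ex_set_opt(&fs, BARRIER);

	if (ex_test_opt(&fs, BARRIER) && !ex_test_opt(&fs, ERROR_RO))
		printf("mount options: %#x\n", opts);
	ex_clear_opt(&fs, BARRIER);
	return 0;
}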
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index b44c68a857e7..0a3bc2cf192c 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -56,6 +56,13 @@ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks,
&mnt->mnt_root->d_lock);
}
+/* prepare for freeing all marks associated with given group */
+extern void fsnotify_detach_group_marks(struct fsnotify_group *group);
+/*
+ * wait for fsnotify_mark_srcu period to end and free all marks in destroy_list
+ */
+extern void fsnotify_mark_destroy_list(void);
+
/*
* update the dentry->d_flags of all of inode's children to indicate if inode cares
* about events that happen to its children.
diff --git a/fs/notify/group.c b/fs/notify/group.c
index d16b62cb2854..3e2dd85be5dd 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -47,12 +47,21 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group)
*/
void fsnotify_destroy_group(struct fsnotify_group *group)
{
- /* clear all inode marks for this group */
- fsnotify_clear_marks_by_group(group);
+ /* clear all inode marks for this group, attach them to destroy_list */
+ fsnotify_detach_group_marks(group);
- synchronize_srcu(&fsnotify_mark_srcu);
+ /*
+ * Wait for fsnotify_mark_srcu period to end and free all marks in
+ * destroy_list
+ */
+ fsnotify_mark_destroy_list();
- /* clear the notification queue of all events */
+ /*
+ * Since we have waited for fsnotify_mark_srcu in
+ * fsnotify_mark_destroy_list() there can be no outstanding event
+ * notification against this group. So clearing the notification queue
+ * of all events is reliable now.
+ */
fsnotify_flush_notify(group);
/*
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 7115c5d7d373..d3fea0bd89e2 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -97,8 +97,8 @@ struct srcu_struct fsnotify_mark_srcu;
static DEFINE_SPINLOCK(destroy_lock);
static LIST_HEAD(destroy_list);
-static void fsnotify_mark_destroy(struct work_struct *work);
-static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy);
+static void fsnotify_mark_destroy_workfn(struct work_struct *work);
+static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn);
void fsnotify_get_mark(struct fsnotify_mark *mark)
{
@@ -173,11 +173,15 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark)
}
/*
- * Free fsnotify mark. The freeing is actually happening from a kthread which
- * first waits for srcu period end. Caller must have a reference to the mark
- * or be protected by fsnotify_mark_srcu.
+ * Prepare mark for freeing and add it to the list of marks prepared for
+ * freeing. The actual freeing must happen after SRCU period ends and the
+ * caller is responsible for this.
+ *
+ * The function returns true if the mark was added to the list of marks for
+ * freeing. The function returns false if someone else has already called
+ * __fsnotify_free_mark() for the mark.
*/
-void fsnotify_free_mark(struct fsnotify_mark *mark)
+static bool __fsnotify_free_mark(struct fsnotify_mark *mark)
{
struct fsnotify_group *group = mark->group;
@@ -185,17 +189,11 @@ void fsnotify_free_mark(struct fsnotify_mark *mark)
/* something else already called this function on this mark */
if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
spin_unlock(&mark->lock);
- return;
+ return false;
}
mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
spin_unlock(&mark->lock);
- spin_lock(&destroy_lock);
- list_add(&mark->g_list, &destroy_list);
- spin_unlock(&destroy_lock);
- queue_delayed_work(system_unbound_wq, &reaper_work,
- FSNOTIFY_REAPER_DELAY);
-
/*
* Some groups like to know that marks are being freed. This is a
* callback to the group function to let it know that this mark
@@ -203,6 +201,25 @@ void fsnotify_free_mark(struct fsnotify_mark *mark)
*/
if (group->ops->freeing_mark)
group->ops->freeing_mark(mark, group);
+
+ spin_lock(&destroy_lock);
+ list_add(&mark->g_list, &destroy_list);
+ spin_unlock(&destroy_lock);
+
+ return true;
+}
+
+/*
+ * Free fsnotify mark. The freeing is actually happening from a workqueue which
+ * first waits for srcu period end. Caller must have a reference to the mark
+ * or be protected by fsnotify_mark_srcu.
+ */
+void fsnotify_free_mark(struct fsnotify_mark *mark)
+{
+ if (__fsnotify_free_mark(mark)) {
+ queue_delayed_work(system_unbound_wq, &reaper_work,
+ FSNOTIFY_REAPER_DELAY);
+ }
}
void fsnotify_destroy_mark(struct fsnotify_mark *mark,
@@ -468,11 +485,29 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
}
/*
- * Given a group, destroy all of the marks associated with that group.
+ * Given a group, prepare for freeing all the marks associated with that group.
+ * The marks are attached to the list of marks prepared for destruction, the
+ * caller is responsible for freeing marks in that list after SRCU period has
+ * ended.
*/
-void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
+void fsnotify_detach_group_marks(struct fsnotify_group *group)
{
- fsnotify_clear_marks_by_group_flags(group, (unsigned int)-1);
+ struct fsnotify_mark *mark;
+
+ while (1) {
+ mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
+ if (list_empty(&group->marks_list)) {
+ mutex_unlock(&group->mark_mutex);
+ break;
+ }
+ mark = list_first_entry(&group->marks_list,
+ struct fsnotify_mark, g_list);
+ fsnotify_get_mark(mark);
+ fsnotify_detach_mark(mark);
+ mutex_unlock(&group->mark_mutex);
+ __fsnotify_free_mark(mark);
+ fsnotify_put_mark(mark);
+ }
}
void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old)
@@ -499,7 +534,11 @@ void fsnotify_init_mark(struct fsnotify_mark *mark,
mark->free_mark = free_mark;
}
-static void fsnotify_mark_destroy(struct work_struct *work)
+/*
+ * Destroy all marks in destroy_list; wait for the SRCU period to finish
+ * before actually freeing the marks.
+ */
+void fsnotify_mark_destroy_list(void)
{
struct fsnotify_mark *mark, *next;
struct list_head private_destroy_list;
@@ -516,3 +555,8 @@ static void fsnotify_mark_destroy(struct work_struct *work)
fsnotify_put_mark(mark);
}
}
+
+static void fsnotify_mark_destroy_workfn(struct work_struct *work)
+{
+ fsnotify_mark_destroy_list();
+}
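The fsnotify hunks above split mark freeing into two steps: __fsnotify_free_mark() only detaches a mark onto destroy_list, while the actual freeing happens later, either from the delayed reaper work or synchronously from fsnotify_mark_destroy_list() during group teardown. A rough userspace analogy of that "defer onto a list, drain after the grace period" pattern (plain pthreads; none of these names are fsnotify API):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct mark {
	struct mark *next;
	int id;
};

static pthread_mutex_t destroy_lock = PTHREAD_MUTEX_INITIALIZER;
static struct mark *destroy_list;

/* step 1: detach the object and park it on the global destroy list */
static void mark_defer_free(struct mark *m)
{
	pthread_mutex_lock(&destroy_lock);
	m->next = destroy_list;
	destroy_list = m;
	pthread_mutex_unlock(&destroy_lock);
}

/* step 2: grab a private copy of the list and free it, like
 * fsnotify_mark_destroy_list() does after synchronize_srcu() */
static void mark_destroy_list(void)
{
	struct mark *list, *next;

	pthread_mutex_lock(&destroy_lock);
	list = destroy_list;
	destroy_list = NULL;
	pthread_mutex_unlock(&destroy_lock);

	/* ... the grace-period wait would go here in the kernel ... */

	for (; list; list = next) {
		next = list->next;
		printf("freeing mark %d\n", list->id);
		free(list);
	}
}

int main(void)
{
	struct mark *m = malloc(sizeof(*m));

	m->id = 1;
	mark_defer_free(m);
	mark_destroy_list();	/* teardown path drains synchronously */
	return 0;
}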
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index e361d1a0ca09..460c0cedab3a 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5351,7 +5351,7 @@ static int ocfs2_truncate_rec(handle_t *handle,
{
int ret;
u32 left_cpos, rec_range, trunc_range;
- int wants_rotate = 0, is_rightmost_tree_rec = 0;
+ int is_rightmost_tree_rec = 0;
struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
struct ocfs2_path *left_path = NULL;
struct ocfs2_extent_list *el = path_leaf_el(path);
@@ -5457,7 +5457,6 @@ static int ocfs2_truncate_rec(handle_t *handle,
memset(rec, 0, sizeof(*rec));
ocfs2_cleanup_merge(el, index);
- wants_rotate = 1;
next_free = le16_to_cpu(el->l_next_free_rec);
if (is_rightmost_tree_rec && next_free > 1) {
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 1934abb6b680..6aaf3e351391 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -272,10 +272,21 @@ struct o2hb_region {
struct delayed_work hr_write_timeout_work;
unsigned long hr_last_timeout_start;
+ /* negotiate timer, used to negotiate extending hb timeout. */
+ struct delayed_work hr_nego_timeout_work;
+ unsigned long hr_nego_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+
/* Used during o2hb_check_slot to hold a copy of the block
* being checked because we temporarily have to zero out the
* crc field. */
struct o2hb_disk_heartbeat_block *hr_tmp_block;
+
+ /* Message key for negotiate timeout message. */
+ unsigned int hr_key;
+ struct list_head hr_handler_list;
+
+ /* last hb status, 0 for success, other value for error. */
+ int hr_last_hb_status;
};
struct o2hb_bio_wait_ctxt {
@@ -284,6 +295,17 @@ struct o2hb_bio_wait_ctxt {
int wc_error;
};
+#define O2HB_NEGO_TIMEOUT_MS (O2HB_MAX_WRITE_TIMEOUT_MS/2)
+
+enum {
+ O2HB_NEGO_TIMEOUT_MSG = 1,
+ O2HB_NEGO_APPROVE_MSG = 2,
+};
+
+struct o2hb_nego_msg {
+ u8 node_num;
+};
+
static void o2hb_write_timeout(struct work_struct *work)
{
int failed, quorum;
@@ -319,7 +341,7 @@ static void o2hb_write_timeout(struct work_struct *work)
o2quo_disk_timeout();
}
-static void o2hb_arm_write_timeout(struct o2hb_region *reg)
+static void o2hb_arm_timeout(struct o2hb_region *reg)
{
/* Arm writeout only after thread reaches steady state */
if (atomic_read(&reg->hr_steady_iterations) != 0)
@@ -334,14 +356,132 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg)
spin_unlock(&o2hb_live_lock);
}
cancel_delayed_work(&reg->hr_write_timeout_work);
- reg->hr_last_timeout_start = jiffies;
schedule_delayed_work(&reg->hr_write_timeout_work,
msecs_to_jiffies(O2HB_MAX_WRITE_TIMEOUT_MS));
+
+ cancel_delayed_work(&reg->hr_nego_timeout_work);
+ /* negotiate timeout must be less than write timeout. */
+ schedule_delayed_work(&reg->hr_nego_timeout_work,
+ msecs_to_jiffies(O2HB_NEGO_TIMEOUT_MS));
+ memset(reg->hr_nego_node_bitmap, 0, sizeof(reg->hr_nego_node_bitmap));
}
-static void o2hb_disarm_write_timeout(struct o2hb_region *reg)
+static void o2hb_disarm_timeout(struct o2hb_region *reg)
{
cancel_delayed_work_sync(&reg->hr_write_timeout_work);
+ cancel_delayed_work_sync(&reg->hr_nego_timeout_work);
+}
+
+static int o2hb_send_nego_msg(int key, int type, u8 target)
+{
+ struct o2hb_nego_msg msg;
+ int status, ret;
+
+ msg.node_num = o2nm_this_node();
+again:
+ ret = o2net_send_message(type, key, &msg, sizeof(msg),
+ target, &status);
+
+ if (ret == -EAGAIN || ret == -ENOMEM) {
+ msleep(100);
+ goto again;
+ }
+
+ return ret;
+}
+
+static void o2hb_nego_timeout(struct work_struct *work)
+{
+ unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ int master_node, i, ret;
+ struct o2hb_region *reg;
+
+ reg = container_of(work, struct o2hb_region, hr_nego_timeout_work.work);
+ /* don't negotiate the timeout if the last hb write failed, since it is
+ * very likely the io itself failed; let the write timeout fence this node.
+ */
+ if (reg->hr_last_hb_status)
+ return;
+
+ o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap));
+ /* the lowest live node acts as the master and makes the negotiate decision. */
+ master_node = find_next_bit(live_node_bitmap, O2NM_MAX_NODES, 0);
+
+ if (master_node == o2nm_this_node()) {
+ if (!test_bit(master_node, reg->hr_nego_node_bitmap)) {
+ printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%s).\n",
+ o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000,
+ config_item_name(&reg->hr_item), reg->hr_dev_name);
+ set_bit(master_node, reg->hr_nego_node_bitmap);
+ }
+ if (memcmp(reg->hr_nego_node_bitmap, live_node_bitmap,
+ sizeof(reg->hr_nego_node_bitmap))) {
+ /* check the negotiate bitmap every second to decide
+ * whether to approve the timeout.
+ */
+ schedule_delayed_work(&reg->hr_nego_timeout_work,
+ msecs_to_jiffies(1000));
+
+ return;
+ }
+
+ printk(KERN_NOTICE "o2hb: all nodes hb write hung, maybe region %s (%s) is down.\n",
+ config_item_name(&reg->hr_item), reg->hr_dev_name);
+ /* approve negotiate timeout request. */
+ o2hb_arm_timeout(reg);
+
+ i = -1;
+ while ((i = find_next_bit(live_node_bitmap,
+ O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
+ if (i == master_node)
+ continue;
+
+ mlog(ML_HEARTBEAT, "send NEGO_APPROVE msg to node %d\n", i);
+ ret = o2hb_send_nego_msg(reg->hr_key,
+ O2HB_NEGO_APPROVE_MSG, i);
+ if (ret)
+ mlog(ML_ERROR, "send NEGO_APPROVE msg to node %d fail %d\n",
+ i, ret);
+ }
+ } else {
+ /* negotiate timeout with master node. */
+ printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%s), negotiate timeout with node %d.\n",
+ o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, config_item_name(&reg->hr_item),
+ reg->hr_dev_name, master_node);
+ ret = o2hb_send_nego_msg(reg->hr_key, O2HB_NEGO_TIMEOUT_MSG,
+ master_node);
+ if (ret)
+ mlog(ML_ERROR, "send NEGO_TIMEOUT msg to node %d fail %d\n",
+ master_node, ret);
+ }
+}
+
+static int o2hb_nego_timeout_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
+{
+ struct o2hb_region *reg = data;
+ struct o2hb_nego_msg *nego_msg;
+
+ nego_msg = (struct o2hb_nego_msg *)msg->buf;
+ printk(KERN_NOTICE "o2hb: receive negotiate timeout message from node %d on region %s (%s).\n",
+ nego_msg->node_num, config_item_name(&reg->hr_item), reg->hr_dev_name);
+ if (nego_msg->node_num < O2NM_MAX_NODES)
+ set_bit(nego_msg->node_num, reg->hr_nego_node_bitmap);
+ else
+ mlog(ML_ERROR, "got nego timeout message from bad node.\n");
+
+ return 0;
+}
+
+static int o2hb_nego_approve_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
+{
+ struct o2hb_region *reg = data;
+
+ printk(KERN_NOTICE "o2hb: negotiate timeout approved by master node on region %s (%s).\n",
+ config_item_name(&reg->hr_item), reg->hr_dev_name);
+ o2hb_arm_timeout(reg);
+ return 0;
}
static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc)
@@ -1032,7 +1172,8 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
/* Skip disarming the timeout if own slot has stale/bad data */
if (own_slot_ok) {
o2hb_set_quorum_device(reg);
- o2hb_arm_write_timeout(reg);
+ o2hb_arm_timeout(reg);
+ reg->hr_last_timeout_start = jiffies;
}
bail:
@@ -1096,6 +1237,7 @@ static int o2hb_thread(void *data)
before_hb = ktime_get_real();
ret = o2hb_do_disk_heartbeat(reg);
+ reg->hr_last_hb_status = ret;
after_hb = ktime_get_real();
@@ -1114,7 +1256,7 @@ static int o2hb_thread(void *data)
}
}
- o2hb_disarm_write_timeout(reg);
+ o2hb_disarm_timeout(reg);
/* unclean stop is only used in very bad situation */
for(i = 0; !reg->hr_unclean_stop && i < reg->hr_blocks; i++)
@@ -1451,12 +1593,12 @@ static void o2hb_region_release(struct config_item *item)
list_del(&reg->hr_all_item);
spin_unlock(&o2hb_live_lock);
+ o2net_unregister_handler_list(&reg->hr_handler_list);
kfree(reg);
}
static int o2hb_read_block_input(struct o2hb_region *reg,
const char *page,
- size_t count,
unsigned long *ret_bytes,
unsigned int *ret_bits)
{
@@ -1499,8 +1641,8 @@ static ssize_t o2hb_region_block_bytes_store(struct config_item *item,
if (reg->hr_bdev)
return -EINVAL;
- status = o2hb_read_block_input(reg, page, count,
- &block_bytes, &block_bits);
+ status = o2hb_read_block_input(reg, page, &block_bytes,
+ &block_bits);
if (status)
return status;
@@ -1763,6 +1905,7 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
}
INIT_DELAYED_WORK(&reg->hr_write_timeout_work, o2hb_write_timeout);
+ INIT_DELAYED_WORK(&reg->hr_nego_timeout_work, o2hb_nego_timeout);
/*
* A node is considered live after it has beat LIVE_THRESHOLD
@@ -1996,13 +2139,37 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
+ /* the msg key is generated the same way as in the dlm; for local heartbeat
+ * the name is also the same, so make the initial crc value different to
+ * avoid a message key conflict.
+ */
+ reg->hr_key = crc32_le(reg->hr_region_num + O2NM_MAX_REGIONS,
+ name, strlen(name));
+ INIT_LIST_HEAD(&reg->hr_handler_list);
+ ret = o2net_register_handler(O2HB_NEGO_TIMEOUT_MSG, reg->hr_key,
+ sizeof(struct o2hb_nego_msg),
+ o2hb_nego_timeout_handler,
+ reg, NULL, &reg->hr_handler_list);
+ if (ret)
+ goto free;
+
+ ret = o2net_register_handler(O2HB_NEGO_APPROVE_MSG, reg->hr_key,
+ sizeof(struct o2hb_nego_msg),
+ o2hb_nego_approve_handler,
+ reg, NULL, &reg->hr_handler_list);
+ if (ret)
+ goto unregister_handler;
+
ret = o2hb_debug_region_init(reg, o2hb_debug_dir);
if (ret) {
config_item_put(&reg->hr_item);
- goto free;
+ goto unregister_handler;
}
return &reg->hr_item;
+
+unregister_handler:
+ o2net_unregister_handler_list(&reg->hr_handler_list);
free:
kfree(reg);
return ERR_PTR(ret);
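In the negotiate-timeout scheme added above, the lowest live node acts as master: it collects O2HB_NEGO_TIMEOUT_MSG reports in hr_nego_node_bitmap and only approves (re-arms the timeouts everywhere) once every live node has reported a hung write. A tiny stand-alone sketch of that "all live nodes reported" check (made-up names and fixed-size byte bitmaps instead of the kernel bitmap API):

#include <stdio.h>
#include <string.h>

#define MAX_NODES 32

/* master approves extending the heartbeat timeout only once the set of
 * nodes reporting a hung write equals the set of live nodes */
static int all_live_nodes_hung(const unsigned char *nego_map,
			       const unsigned char *live_map, size_t len)
{
	return memcmp(nego_map, live_map, len) == 0;
}

int main(void)
{
	unsigned char live[MAX_NODES / 8] = { 0 }, nego[MAX_NODES / 8] = { 0 };

	live[0] = 0x07;		/* nodes 0, 1 and 2 are live */
	nego[0] = 0x03;		/* only nodes 0 and 1 reported a hung write */
	printf("approve now? %s\n",
	       all_live_nodes_hung(nego, live, sizeof(live)) ? "yes" : "no");

	nego[0] = 0x07;		/* node 2 reports as well */
	printf("approve now? %s\n",
	       all_live_nodes_hung(nego, live, sizeof(live)) ? "yes" : "no");
	return 0;
}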
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 540ab5b75dbb..44d178b8d1aa 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -580,7 +580,7 @@ struct ocfs2_extended_slot {
/*00*/ __u8 es_valid;
__u8 es_reserved1[3];
__le32 es_node_num;
-/*10*/
+/*08*/
};
/*
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 1e09592148ad..d7407994f308 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -535,12 +535,8 @@ void ocfs2_put_slot(struct ocfs2_super *osb)
spin_unlock(&osb->osb_lock);
status = ocfs2_update_disk_slot(osb, si, slot_num);
- if (status < 0) {
+ if (status < 0)
mlog_errno(status);
- goto bail;
- }
-bail:
ocfs2_free_slot_info(osb);
}
-
diff --git a/fs/proc/array.c b/fs/proc/array.c
index b6c00ce0e29e..88c7de12197b 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -83,6 +83,7 @@
#include <linux/tracehook.h>
#include <linux/string_helpers.h>
#include <linux/user_namespace.h>
+#include <linux/fs_struct.h>
#include <asm/pgtable.h>
#include <asm/processor.h>
@@ -139,12 +140,25 @@ static inline const char *get_task_state(struct task_struct *tsk)
return task_state_array[fls(state)];
}
+static inline int get_task_umask(struct task_struct *tsk)
+{
+ struct fs_struct *fs;
+ int umask = -ENOENT;
+
+ task_lock(tsk);
+ fs = tsk->fs;
+ if (fs)
+ umask = fs->umask;
+ task_unlock(tsk);
+ return umask;
+}
+
static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *p)
{
struct user_namespace *user_ns = seq_user_ns(m);
struct group_info *group_info;
- int g;
+ int g, umask;
struct task_struct *tracer;
const struct cred *cred;
pid_t ppid, tpid = 0, tgid, ngid;
@@ -162,6 +176,10 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
ngid = task_numa_group_id(p);
cred = get_task_cred(p);
+ umask = get_task_umask(p);
+ if (umask >= 0)
+ seq_printf(m, "Umask:\t%#04o\n", umask);
+
task_lock(p);
if (p->files)
max_fds = files_fdtable(p->files)->max_fds;
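With the hunk above applied, /proc/<pid>/status gains a "Umask:" line taken from the task's fs_struct. A small userspace check (the line only appears on a kernel carrying this change):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/status", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		if (strncmp(line, "Umask:", 6) == 0) {
			fputs(line, stdout);	/* e.g. "Umask:\t0022" */
			break;
		}
	}
	fclose(f);
	return 0;
}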
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ff4527dd69b7..a11eb7196ec8 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3163,6 +3163,44 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx)
}
/*
+ * proc_tid_comm_permission is a special permission function exclusively
+ * used for the node /proc/<pid>/task/<tid>/comm.
+ * It bypasses generic permission checks in the case where a task of the same
+ * task group attempts to access the node.
+ * The rationale behind this is that glibc and bionic access this node for
+ * cross thread naming (pthread_set/getname_np(!self)). However, if
+ * PR_SET_DUMPABLE gets set to 0 this node among others becomes uid=0 gid=0,
+ * which locks out the cross thread naming implementation.
+ * This function makes sure that the node is always accessible to members of
+ * the same thread group.
+ */
+static int proc_tid_comm_permission(struct inode *inode, int mask)
+{
+ bool is_same_tgroup;
+ struct task_struct *task;
+
+ task = get_proc_task(inode);
+ if (!task)
+ return -ESRCH;
+ is_same_tgroup = same_thread_group(current, task);
+ put_task_struct(task);
+
+ if (likely(is_same_tgroup && !(mask & MAY_EXEC))) {
+ /* This file (/proc/<pid>/task/<tid>/comm) can always be
+ * read or written by the members of the corresponding
+ * thread group.
+ */
+ return 0;
+ }
+
+ return generic_permission(inode, mask);
+}
+
+static const struct inode_operations proc_tid_comm_inode_operations = {
+ .permission = proc_tid_comm_permission,
+};
+
+/*
* Tasks
*/
static const struct pid_entry tid_base_stuff[] = {
@@ -3180,7 +3218,9 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
#endif
- REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
+ NOD("comm", S_IFREG|S_IRUGO|S_IWUSR,
+ &proc_tid_comm_inode_operations,
+ &proc_pid_set_comm_operations, {}),
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
ONE("syscall", S_IRUSR, proc_pid_syscall),
#endif
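The use case described in the comment above can be exercised from userspace: after PR_SET_DUMPABLE is cleared, glibc's pthread_setname_np() still succeeds on a sibling thread because it writes /proc/self/task/<tid>/comm, which the new permission hook keeps accessible within the thread group. A short sketch (behaviour assumes this patch; error checking omitted):

#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <sys/prctl.h>
#include <unistd.h>

static void *worker(void *arg)
{
	(void)arg;
	sleep(1);
	return NULL;
}

int main(void)
{
	pthread_t t;
	char name[16] = "";

	prctl(PR_SET_DUMPABLE, 0);		/* makes most /proc/<pid> files uid 0 */
	pthread_create(&t, NULL, worker, NULL);
	pthread_setname_np(t, "renamed");	/* writes task/<tid>/comm */
	pthread_getname_np(t, name, sizeof(name));
	printf("worker comm: %s\n", name);
	pthread_join(t, NULL);
	return 0;
}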
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 712f1b9992cc..3ecd445e830d 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -142,7 +142,7 @@ u64 stable_page_flags(struct page *page)
/*
- * Caveats on high order pages: page->_count will only be set
+ * Caveats on high order pages: page->_refcount will only be set
* -1 on the head page; SLUB/SLQB do the same for PG_slab;
* SLOB won't set PG_slab at all on compound pages.
*/
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index a586467f6ff6..be3ddd189cd4 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -211,14 +211,11 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
struct page **pages = NULL, **ptr, *page;
loff_t isize;
- if (!(flags & MAP_SHARED))
- return addr;
-
/* the mapping mustn't extend beyond the EOF */
lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
isize = i_size_read(inode);
- ret = -EINVAL;
+ ret = -ENOSYS;
maxpages = (isize + PAGE_SIZE - 1) >> PAGE_SHIFT;
if (pgoff >= maxpages)
goto out;
@@ -227,7 +224,6 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
goto out;
/* gang-find the pages */
- ret = -ENOMEM;
pages = kcalloc(lpages, sizeof(struct page *), GFP_KERNEL);
if (!pages)
goto out_free;
@@ -263,7 +259,7 @@ out:
*/
static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma)
{
- if (!(vma->vm_flags & VM_SHARED))
+ if (!(vma->vm_flags & (VM_SHARED | VM_MAYSHARE)))
return -ENOSYS;
file_accessed(file);
diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c
index 99a5d5dae46a..415d66ca87d1 100644
--- a/fs/reiserfs/objectid.c
+++ b/fs/reiserfs/objectid.c
@@ -3,8 +3,8 @@
*/
#include <linux/string.h>
-#include <linux/random.h>
#include <linux/time.h>
+#include <linux/uuid.h>
#include "reiserfs.h"
/* find where objectid map starts */
diff --git a/fs/select.c b/fs/select.c
index 869293988c2a..8ed9da50896a 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -47,7 +47,7 @@
#define MAX_SLACK (100 * NSEC_PER_MSEC)
-static long __estimate_accuracy(struct timespec *tv)
+static long __estimate_accuracy(struct timespec64 *tv)
{
long slack;
int divfactor = 1000;
@@ -70,10 +70,10 @@ static long __estimate_accuracy(struct timespec *tv)
return slack;
}
-u64 select_estimate_accuracy(struct timespec *tv)
+u64 select_estimate_accuracy(struct timespec64 *tv)
{
u64 ret;
- struct timespec now;
+ struct timespec64 now;
/*
* Realtime tasks get a slack of 0 for obvious reasons.
@@ -82,8 +82,8 @@ u64 select_estimate_accuracy(struct timespec *tv)
if (rt_task(current))
return 0;
- ktime_get_ts(&now);
- now = timespec_sub(*tv, now);
+ ktime_get_ts64(&now);
+ now = timespec64_sub(*tv, now);
ret = __estimate_accuracy(&now);
if (ret < current->timer_slack_ns)
return current->timer_slack_ns;
@@ -260,7 +260,7 @@ EXPORT_SYMBOL(poll_schedule_timeout);
/**
* poll_select_set_timeout - helper function to setup the timeout value
- * @to: pointer to timespec variable for the final timeout
+ * @to: pointer to timespec64 variable for the final timeout
* @sec: seconds (from user space)
* @nsec: nanoseconds (from user space)
*
@@ -269,26 +269,28 @@ EXPORT_SYMBOL(poll_schedule_timeout);
*
* Returns -EINVAL if sec/nsec are not normalized. Otherwise 0.
*/
-int poll_select_set_timeout(struct timespec *to, long sec, long nsec)
+int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec)
{
- struct timespec ts = {.tv_sec = sec, .tv_nsec = nsec};
+ struct timespec64 ts = {.tv_sec = sec, .tv_nsec = nsec};
- if (!timespec_valid(&ts))
+ if (!timespec64_valid(&ts))
return -EINVAL;
/* Optimize for the zero timeout value here */
if (!sec && !nsec) {
to->tv_sec = to->tv_nsec = 0;
} else {
- ktime_get_ts(to);
- *to = timespec_add_safe(*to, ts);
+ ktime_get_ts64(to);
+ *to = timespec64_add_safe(*to, ts);
}
return 0;
}
-static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
+static int poll_select_copy_remaining(struct timespec64 *end_time,
+ void __user *p,
int timeval, int ret)
{
+ struct timespec64 rts64;
struct timespec rts;
struct timeval rtv;
@@ -302,16 +304,18 @@ static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
if (!end_time->tv_sec && !end_time->tv_nsec)
return ret;
- ktime_get_ts(&rts);
- rts = timespec_sub(*end_time, rts);
- if (rts.tv_sec < 0)
- rts.tv_sec = rts.tv_nsec = 0;
+ ktime_get_ts64(&rts64);
+ rts64 = timespec64_sub(*end_time, rts64);
+ if (rts64.tv_sec < 0)
+ rts64.tv_sec = rts64.tv_nsec = 0;
+
+ rts = timespec64_to_timespec(rts64);
if (timeval) {
if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
memset(&rtv, 0, sizeof(rtv));
- rtv.tv_sec = rts.tv_sec;
- rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
+ rtv.tv_sec = rts64.tv_sec;
+ rtv.tv_usec = rts64.tv_nsec / NSEC_PER_USEC;
if (!copy_to_user(p, &rtv, sizeof(rtv)))
return ret;
@@ -396,7 +400,7 @@ static inline void wait_key_set(poll_table *wait, unsigned long in,
wait->_key |= POLLOUT_SET;
}
-int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
+int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
{
ktime_t expire, *to = NULL;
struct poll_wqueues table;
@@ -522,7 +526,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
* pointer to the expiry value.
*/
if (end_time && !to) {
- expire = timespec_to_ktime(*end_time);
+ expire = timespec64_to_ktime(*end_time);
to = &expire;
}
@@ -545,7 +549,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
* I'm trying ERESTARTNOHAND which restart only when you want to.
*/
int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
- fd_set __user *exp, struct timespec *end_time)
+ fd_set __user *exp, struct timespec64 *end_time)
{
fd_set_bits fds;
void *bits;
@@ -622,7 +626,7 @@ out_nofds:
SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
fd_set __user *, exp, struct timeval __user *, tvp)
{
- struct timespec end_time, *to = NULL;
+ struct timespec64 end_time, *to = NULL;
struct timeval tv;
int ret;
@@ -648,15 +652,17 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
const sigset_t __user *sigmask, size_t sigsetsize)
{
sigset_t ksigmask, sigsaved;
- struct timespec ts, end_time, *to = NULL;
+ struct timespec ts;
+ struct timespec64 ts64, end_time, *to = NULL;
int ret;
if (tsp) {
if (copy_from_user(&ts, tsp, sizeof(ts)))
return -EFAULT;
+ ts64 = timespec_to_timespec64(ts);
to = &end_time;
- if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+ if (poll_select_set_timeout(to, ts64.tv_sec, ts64.tv_nsec))
return -EINVAL;
}
@@ -779,7 +785,7 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
}
static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
- struct timespec *end_time)
+ struct timespec64 *end_time)
{
poll_table* pt = &wait->pt;
ktime_t expire, *to = NULL;
@@ -854,7 +860,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
* pointer to the expiry value.
*/
if (end_time && !to) {
- expire = timespec_to_ktime(*end_time);
+ expire = timespec64_to_ktime(*end_time);
to = &expire;
}
@@ -868,7 +874,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
sizeof(struct pollfd))
int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
- struct timespec *end_time)
+ struct timespec64 *end_time)
{
struct poll_wqueues table;
int err = -EFAULT, fdcount, len, size;
@@ -936,7 +942,7 @@ static long do_restart_poll(struct restart_block *restart_block)
{
struct pollfd __user *ufds = restart_block->poll.ufds;
int nfds = restart_block->poll.nfds;
- struct timespec *to = NULL, end_time;
+ struct timespec64 *to = NULL, end_time;
int ret;
if (restart_block->poll.has_timeout) {
@@ -957,7 +963,7 @@ static long do_restart_poll(struct restart_block *restart_block)
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
int, timeout_msecs)
{
- struct timespec end_time, *to = NULL;
+ struct timespec64 end_time, *to = NULL;
int ret;
if (timeout_msecs >= 0) {
@@ -993,7 +999,8 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
size_t, sigsetsize)
{
sigset_t ksigmask, sigsaved;
- struct timespec ts, end_time, *to = NULL;
+ struct timespec ts;
+ struct timespec64 end_time, *to = NULL;
int ret;
if (tsp) {
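The select/poll conversion above swaps struct timespec for struct timespec64 so the internal deadline keeps working past 2038 on 32-bit, and uses timespec64_add_safe() so that adding a huge user-supplied timeout saturates instead of wrapping into the past. A stand-alone sketch of that saturating-add idea (ts64 and ts64_add_safe are invented here; the real kernel helper differs in detail):

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000L

struct ts64 { int64_t tv_sec; long tv_nsec; };

/* Saturating add: if the seconds would overflow, clamp to the largest
 * representable time instead of wrapping around. */
static struct ts64 ts64_add_safe(struct ts64 a, struct ts64 b)
{
	struct ts64 r;

	if (b.tv_sec > 0 && a.tv_sec > INT64_MAX - b.tv_sec) {
		r.tv_sec = INT64_MAX;
		r.tv_nsec = NSEC_PER_SEC - 1;
		return r;
	}

	r.tv_sec = a.tv_sec + b.tv_sec;
	r.tv_nsec = a.tv_nsec + b.tv_nsec;
	if (r.tv_nsec >= NSEC_PER_SEC) {
		r.tv_nsec -= NSEC_PER_SEC;
		r.tv_sec++;
	}
	return r;
}

int main(void)
{
	struct ts64 now = { 1463500000, 0 };	/* some time in 2016 */
	struct ts64 timeout = { INT64_MAX, 0 };	/* absurdly large user timeout */
	struct ts64 end = ts64_add_safe(now, timeout);

	printf("deadline: %lld.%09ld\n", (long long)end.tv_sec, end.tv_nsec);
	return 0;
}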
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index f4fbc7b6b794..3cbb904a6d7d 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -28,8 +28,8 @@
#include "ubifs.h"
#include <linux/slab.h>
-#include <linux/random.h>
#include <linux/math64.h>
+#include <linux/uuid.h>
/*
* Default journal size in logical eraseblocks as a percent of total
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 47fc63295422..5e3dd473743d 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1700,6 +1700,7 @@ const struct file_operations xfs_file_operations = {
.open = xfs_file_open,
.release = xfs_file_release,
.fsync = xfs_file_fsync,
+ .get_unmapped_area = dax_get_unmapped_area,
.fallocate = xfs_file_fallocate,
};
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 9401f4819891..d4458b6dbfb4 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -806,4 +806,12 @@ static inline int pmd_clear_huge(pmd_t *pmd)
#define io_remap_pfn_range remap_pfn_range
#endif
+#ifndef has_transparent_hugepage
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define has_transparent_hugepage() 1
+#else
+#define has_transparent_hugepage() 0
+#endif
+#endif
+
#endif /* _ASM_GENERIC_PGTABLE_H */
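The asm-generic fallback above lets common code test for THP support unconditionally, even on architectures that do not define their own helper. A trivial sketch of how such a guard reads at a call site (CONFIG_TRANSPARENT_HUGEPAGE stands in for the real Kconfig symbol; the caller shown is hypothetical):

#include <stdio.h>

#ifndef has_transparent_hugepage
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define has_transparent_hugepage() 1
#else
#define has_transparent_hugepage() 0
#endif
#endif

int main(void)
{
	if (has_transparent_hugepage())
		printf("THP-only code paths may run\n");
	else
		printf("skipping THP-only code paths\n");
	return 0;
}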
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 35b22f94d2d2..f9be32691718 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -83,34 +83,34 @@ extern void *__alloc_bootmem(unsigned long size,
unsigned long goal);
extern void *__alloc_bootmem_nopanic(unsigned long size,
unsigned long align,
- unsigned long goal);
+ unsigned long goal) __malloc;
extern void *__alloc_bootmem_node(pg_data_t *pgdat,
unsigned long size,
unsigned long align,
- unsigned long goal);
+ unsigned long goal) __malloc;
void *__alloc_bootmem_node_high(pg_data_t *pgdat,
unsigned long size,
unsigned long align,
- unsigned long goal);
+ unsigned long goal) __malloc;
extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
unsigned long size,
unsigned long align,
- unsigned long goal);
+ unsigned long goal) __malloc;
void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
unsigned long size,
unsigned long align,
unsigned long goal,
- unsigned long limit);
+ unsigned long limit) __malloc;
extern void *__alloc_bootmem_low(unsigned long size,
unsigned long align,
- unsigned long goal);
+ unsigned long goal) __malloc;
void *__alloc_bootmem_low_nopanic(unsigned long size,
unsigned long align,
- unsigned long goal);
+ unsigned long goal) __malloc;
extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
unsigned long size,
unsigned long align,
- unsigned long goal);
+ unsigned long goal) __malloc;
#ifdef CONFIG_NO_BOOTMEM
/* We are using top down, so it is safe to use 0 here */
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index d7c8de583a23..a58c852a268f 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -2,21 +2,46 @@
#define _LINUX_COMPACTION_H
/* Return values for compact_zone() and try_to_compact_pages() */
-/* compaction didn't start as it was deferred due to past failures */
-#define COMPACT_DEFERRED 0
-/* compaction didn't start as it was not possible or direct reclaim was more suitable */
-#define COMPACT_SKIPPED 1
-/* compaction should continue to another pageblock */
-#define COMPACT_CONTINUE 2
-/* direct compaction partially compacted a zone and there are suitable pages */
-#define COMPACT_PARTIAL 3
-/* The full zone was compacted */
-#define COMPACT_COMPLETE 4
-/* For more detailed tracepoint output */
-#define COMPACT_NO_SUITABLE_PAGE 5
-#define COMPACT_NOT_SUITABLE_ZONE 6
-#define COMPACT_CONTENDED 7
/* When adding new states, please adjust include/trace/events/compaction.h */
+enum compact_result {
+ /* For more detailed tracepoint output - internal to compaction */
+ COMPACT_NOT_SUITABLE_ZONE,
+ /*
+ * compaction didn't start as it was not possible or direct reclaim
+ * was more suitable
+ */
+ COMPACT_SKIPPED,
+ /* compaction didn't start as it was deferred due to past failures */
+ COMPACT_DEFERRED,
+
+ /* compaction not active last round */
+ COMPACT_INACTIVE = COMPACT_DEFERRED,
+
+ /* For more detailed tracepoint output - internal to compaction */
+ COMPACT_NO_SUITABLE_PAGE,
+ /* compaction should continue to another pageblock */
+ COMPACT_CONTINUE,
+
+ /*
+ * The full zone was scanned, but compaction was not successful in
+ * compacting suitable pages.
+ */
+ COMPACT_COMPLETE,
+ /*
+ * direct compaction has scanned part of the zone but was not successful
+ * in compacting suitable pages.
+ */
+ COMPACT_PARTIAL_SKIPPED,
+
+ /* compaction terminated prematurely due to lock contention */
+ COMPACT_CONTENDED,
+
+ /*
+ * direct compaction partially compacted a zone and there might be
+ * suitable pages
+ */
+ COMPACT_PARTIAL,
+};
/* Used to signal whether compaction detected need_sched() or lock contention */
/* No contention detected */
@@ -38,13 +63,14 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
extern int sysctl_compact_unevictable_allowed;
extern int fragmentation_index(struct zone *zone, unsigned int order);
-extern unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
- int alloc_flags, const struct alloc_context *ac,
- enum migrate_mode mode, int *contended);
+extern enum compact_result try_to_compact_pages(gfp_t gfp_mask,
+ unsigned int order,
+ unsigned int alloc_flags, const struct alloc_context *ac,
+ enum migrate_mode mode, int *contended);
extern void compact_pgdat(pg_data_t *pgdat, int order);
extern void reset_isolation_suitable(pg_data_t *pgdat);
-extern unsigned long compaction_suitable(struct zone *zone, int order,
- int alloc_flags, int classzone_idx);
+extern enum compact_result compaction_suitable(struct zone *zone, int order,
+ unsigned int alloc_flags, int classzone_idx);
extern void defer_compaction(struct zone *zone, int order);
extern bool compaction_deferred(struct zone *zone, int order);
@@ -52,12 +78,80 @@ extern void compaction_defer_reset(struct zone *zone, int order,
bool alloc_success);
extern bool compaction_restarting(struct zone *zone, int order);
+/* Compaction has made some progress and retrying makes sense */
+static inline bool compaction_made_progress(enum compact_result result)
+{
+ /*
+ * Even though this might sound confusing, this in fact tells us
+ * that the compaction successfully isolated and migrated some
+ * pageblocks.
+ */
+ if (result == COMPACT_PARTIAL)
+ return true;
+
+ return false;
+}
+
+/* Compaction has failed and it doesn't make much sense to keep retrying. */
+static inline bool compaction_failed(enum compact_result result)
+{
+ /* All zones were scanned completely and still no result. */
+ if (result == COMPACT_COMPLETE)
+ return true;
+
+ return false;
+}
+
+/*
+ * Compaction has backed off for some reason. It might be throttling or
+ * lock contention. Retrying is still worthwhile.
+ */
+static inline bool compaction_withdrawn(enum compact_result result)
+{
+ /*
+ * Compaction backed off due to watermark checks for order-0
+ * so the regular reclaim has to try harder and reclaim something.
+ */
+ if (result == COMPACT_SKIPPED)
+ return true;
+
+ /*
+ * If compaction is deferred for high-order allocations, it is
+ * because sync compaction recently failed. If this is the case
+ * and the caller requested a THP allocation, we do not want
+ * to heavily disrupt the system, so we fail the allocation
+ * instead of entering direct reclaim.
+ */
+ if (result == COMPACT_DEFERRED)
+ return true;
+
+ /*
+ * If compaction in async mode encounters contention or blocks a higher
+ * priority task, we back off early rather than cause stalls.
+ */
+ if (result == COMPACT_CONTENDED)
+ return true;
+
+ /*
+ * The page scanners have met, but the full zone was not scanned, so this
+ * is in fact a back off.
+ */
+ if (result == COMPACT_PARTIAL_SKIPPED)
+ return true;
+
+ return false;
+}
+
+
+bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
+ int alloc_flags);
+
extern int kcompactd_run(int nid);
extern void kcompactd_stop(int nid);
extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx);
#else
-static inline unsigned long try_to_compact_pages(gfp_t gfp_mask,
+static inline enum compact_result try_to_compact_pages(gfp_t gfp_mask,
unsigned int order, int alloc_flags,
const struct alloc_context *ac,
enum migrate_mode mode, int *contended)
@@ -73,7 +167,7 @@ static inline void reset_isolation_suitable(pg_data_t *pgdat)
{
}
-static inline unsigned long compaction_suitable(struct zone *zone, int order,
+static inline enum compact_result compaction_suitable(struct zone *zone, int order,
int alloc_flags, int classzone_idx)
{
return COMPACT_SKIPPED;
@@ -88,6 +182,21 @@ static inline bool compaction_deferred(struct zone *zone, int order)
return true;
}
+static inline bool compaction_made_progress(enum compact_result result)
+{
+ return false;
+}
+
+static inline bool compaction_failed(enum compact_result result)
+{
+ return false;
+}
+
+static inline bool compaction_withdrawn(enum compact_result result)
+{
+ return true;
+}
+
static inline int kcompactd_run(int nid)
{
return 0;
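A hedged caller-side sketch (not part of this patch; the retry cap is illustrative) of how the three predicates above partition enum compact_result when deciding whether another compaction attempt is worthwhile:

	static bool should_retry_compaction(enum compact_result result, int retries)
	{
		/* COMPACT_PARTIAL: pages were freed, the allocation may now succeed */
		if (compaction_made_progress(result))
			return true;

		/* COMPACT_COMPLETE: the whole zone was scanned without result */
		if (compaction_failed(result))
			return false;

		/* deferred, skipped, contended or partially skipped: back off, then retry */
		if (compaction_withdrawn(result))
			return retries < 3;

		return false;
	}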
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 3d5202eda22f..e2949397c19b 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -142,6 +142,7 @@
#if GCC_VERSION >= 30400
#define __must_check __attribute__((warn_unused_result))
+#define __malloc __attribute__((__malloc__))
#endif
#if GCC_VERSION >= 40000
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index b5ff9881bef8..793c0829e3a3 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -357,6 +357,10 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
#define __deprecated_for_modules
#endif
+#ifndef __malloc
+#define __malloc
+#endif
+
/*
* Allow us to avoid 'defined but not used' warnings on functions and data,
* as well as force them to be emitted to the assembly file.
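A hedged sketch of the attribute at work (the helper name is made up, not from this patch): marking an allocator-style function __malloc lets GCC assume the returned pointer aliases no other live pointer, which sharpens alias analysis at call sites, while the empty #define above keeps non-GCC or pre-3.4 compilers building.

	/* hypothetical allocator; assumes <linux/types.h> for size_t */
	extern void *scratch_buf_alloc(size_t len) __malloc;

	static void scratch_demo(char byte)
	{
		char *a = scratch_buf_alloc(16);
		char *b = scratch_buf_alloc(16);

		/* __malloc tells the optimizer that a and b cannot overlap */
		if (a && b) {
			a[0] = byte;
			b[0] = byte;
		}
	}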
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 85a868ccb493..bfc204e70338 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -16,26 +16,26 @@
#ifdef CONFIG_CPUSETS
-extern struct static_key cpusets_enabled_key;
+extern struct static_key_false cpusets_enabled_key;
static inline bool cpusets_enabled(void)
{
- return static_key_false(&cpusets_enabled_key);
+ return static_branch_unlikely(&cpusets_enabled_key);
}
static inline int nr_cpusets(void)
{
/* jump label reference count + the top-level cpuset */
- return static_key_count(&cpusets_enabled_key) + 1;
+ return static_key_count(&cpusets_enabled_key.key) + 1;
}
static inline void cpuset_inc(void)
{
- static_key_slow_inc(&cpusets_enabled_key);
+ static_branch_inc(&cpusets_enabled_key);
}
static inline void cpuset_dec(void)
{
- static_key_slow_dec(&cpusets_enabled_key);
+ static_branch_dec(&cpusets_enabled_key);
}
extern int cpuset_init(void);
@@ -48,16 +48,25 @@ extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
void cpuset_init_current_mems_allowed(void);
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);
-extern int __cpuset_node_allowed(int node, gfp_t gfp_mask);
+extern bool __cpuset_node_allowed(int node, gfp_t gfp_mask);
-static inline int cpuset_node_allowed(int node, gfp_t gfp_mask)
+static inline bool cpuset_node_allowed(int node, gfp_t gfp_mask)
{
- return nr_cpusets() <= 1 || __cpuset_node_allowed(node, gfp_mask);
+ if (cpusets_enabled())
+ return __cpuset_node_allowed(node, gfp_mask);
+ return true;
}
-static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
{
- return cpuset_node_allowed(zone_to_nid(z), gfp_mask);
+ return __cpuset_node_allowed(zone_to_nid(z), gfp_mask);
+}
+
+static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+{
+ if (cpusets_enabled())
+ return __cpuset_zone_allowed(z, gfp_mask);
+ return true;
}
extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
@@ -172,14 +181,19 @@ static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
return 1;
}
-static inline int cpuset_node_allowed(int node, gfp_t gfp_mask)
+static inline bool cpuset_node_allowed(int node, gfp_t gfp_mask)
{
- return 1;
+ return true;
}
-static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
{
- return 1;
+ return true;
+}
+
+static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+{
+ return true;
}
static inline int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
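A hedged note on the key itself: the definition this declaration implies is assumed to live in kernel/cpuset.c (not shown in this hunk) and would use the matching new-style initializer:

	/* assumed counterpart definition, outside this hunk */
	DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);

With static_branch_unlikely() the disabled case compiles to a patchable no-op jump, so kernels running with only the root cpuset pay no load-and-test on the allocator fast path; cpuset_inc()/cpuset_dec() flip the branch at run time.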
diff --git a/include/linux/crc64_ecma.h b/include/linux/crc64_ecma.h
new file mode 100644
index 000000000000..bba7a4d692b3
--- /dev/null
+++ b/include/linux/crc64_ecma.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2013 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __CRC64_ECMA_H_
+#define __CRC64_ECMA_H_
+
+#include <linux/types.h>
+
+
+#define CRC64_DEFAULT_INITVAL 0xFFFFFFFFFFFFFFFFULL
+
+
+/*
+ * crc64_ecma_seed - Initializes the CRC64 ECMA seed.
+ */
+u64 crc64_ecma_seed(void);
+
+/*
+ * crc64_ecma - Computes the 64 bit ECMA CRC.
+ *
+ * @pdata: pointer to the data to compute checksum for.
+ * @nbytes: number of bytes in data buffer.
+ * @seed: CRC seed.
+ */
+u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed);
+
+#endif /* __CRC64_ECMA_H_ */
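A minimal usage sketch for the two helpers declared above (the buffer and length names are illustrative):

	#include <linux/crc64_ecma.h>

	static u64 frame_checksum(const u8 *frame, u32 len)
	{
		/* seed with the default init value, then run the ECMA polynomial */
		return crc64_ecma(frame, len, crc64_ecma_seed());
	}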
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 7743e51f826c..c9fc89a42f11 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -16,6 +16,8 @@ int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
struct page *read_dax_sector(struct block_device *bdev, sector_t n);
int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
unsigned int offset, unsigned int length);
+unsigned long dax_get_unmapped_area(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff, unsigned long flags);
#else
static inline struct page *read_dax_sector(struct block_device *bdev,
sector_t n)
@@ -27,6 +29,7 @@ static inline int __dax_zero_page_range(struct block_device *bdev,
{
return -ENXIO;
}
+#define dax_get_unmapped_area NULL
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
diff --git a/include/linux/debugobjects.h b/include/linux/debugobjects.h
index 98ffcbd4888e..46056cb161fc 100644
--- a/include/linux/debugobjects.h
+++ b/include/linux/debugobjects.h
@@ -38,8 +38,10 @@ struct debug_obj {
* @name: name of the object type
* @debug_hint: function returning the address which has an associated
* kernel symbol, to allow identifying the object
+ * @is_static_object: return true if the object is static, otherwise return false
* @fixup_init: fixup function, which is called when the init check
- * fails
+ * fails. All fixup functions must return true if fixup
+ * was successful, otherwise return false
* @fixup_activate: fixup function, which is called when the activate check
* fails
* @fixup_destroy: fixup function, which is called when the destroy check
@@ -51,12 +53,13 @@ struct debug_obj {
*/
struct debug_obj_descr {
const char *name;
- void *(*debug_hint) (void *addr);
- int (*fixup_init) (void *addr, enum debug_obj_state state);
- int (*fixup_activate) (void *addr, enum debug_obj_state state);
- int (*fixup_destroy) (void *addr, enum debug_obj_state state);
- int (*fixup_free) (void *addr, enum debug_obj_state state);
- int (*fixup_assert_init)(void *addr, enum debug_obj_state state);
+ void *(*debug_hint)(void *addr);
+ bool (*is_static_object)(void *addr);
+ bool (*fixup_init)(void *addr, enum debug_obj_state state);
+ bool (*fixup_activate)(void *addr, enum debug_obj_state state);
+ bool (*fixup_destroy)(void *addr, enum debug_obj_state state);
+ bool (*fixup_free)(void *addr, enum debug_obj_state state);
+ bool (*fixup_assert_init)(void *addr, enum debug_obj_state state);
};
#ifdef CONFIG_DEBUG_OBJECTS
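A hedged sketch of a descriptor written against the updated hooks (the "demo" names are illustrative, not from this patch): each fixup callback now reports as a bool whether it actually fixed something, and is_static_object lets the core ask whether an address refers to a statically allocated object.

	static bool demo_obj_is_static(void *addr)
	{
		return false;			/* this demo never uses static objects */
	}

	static bool demo_obj_fixup_init(void *addr, enum debug_obj_state state)
	{
		return false;			/* nothing was fixed up */
	}

	static struct debug_obj_descr demo_obj_descr = {
		.name			= "demo_obj",
		.is_static_object	= demo_obj_is_static,
		.fixup_init		= demo_obj_fixup_init,
	};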
diff --git a/include/linux/device.h b/include/linux/device.h
index 11b5f17c6fa6..38f02814d53a 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -609,14 +609,14 @@ typedef int (*dr_match_t)(struct device *dev, void *res, void *match_data);
#ifdef CONFIG_DEBUG_DEVRES
extern void *__devres_alloc_node(dr_release_t release, size_t size, gfp_t gfp,
- int nid, const char *name);
+ int nid, const char *name) __malloc;
#define devres_alloc(release, size, gfp) \
__devres_alloc_node(release, size, gfp, NUMA_NO_NODE, #release)
#define devres_alloc_node(release, size, gfp, nid) \
__devres_alloc_node(release, size, gfp, nid, #release)
#else
extern void *devres_alloc_node(dr_release_t release, size_t size, gfp_t gfp,
- int nid);
+ int nid) __malloc;
static inline void *devres_alloc(dr_release_t release, size_t size, gfp_t gfp)
{
return devres_alloc_node(release, size, gfp, NUMA_NO_NODE);
@@ -648,12 +648,12 @@ extern void devres_remove_group(struct device *dev, void *id);
extern int devres_release_group(struct device *dev, void *id);
/* managed devm_k.alloc/kfree for device drivers */
-extern void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp);
+extern void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp) __malloc;
extern __printf(3, 0)
char *devm_kvasprintf(struct device *dev, gfp_t gfp, const char *fmt,
- va_list ap);
+ va_list ap) __malloc;
extern __printf(3, 4)
-char *devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...);
+char *devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...) __malloc;
static inline void *devm_kzalloc(struct device *dev, size_t size, gfp_t gfp)
{
return devm_kmalloc(dev, size, gfp | __GFP_ZERO);
@@ -671,7 +671,7 @@ static inline void *devm_kcalloc(struct device *dev,
return devm_kmalloc_array(dev, n, size, flags | __GFP_ZERO);
}
extern void devm_kfree(struct device *dev, void *p);
-extern char *devm_kstrdup(struct device *dev, const char *s, gfp_t gfp);
+extern char *devm_kstrdup(struct device *dev, const char *s, gfp_t gfp) __malloc;
extern void *devm_kmemdup(struct device *dev, const void *src, size_t len,
gfp_t gfp);
diff --git a/include/linux/efi.h b/include/linux/efi.h
index df7acb51f3cc..fae16e40128c 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -22,6 +22,7 @@
#include <linux/pstore.h>
#include <linux/reboot.h>
#include <linux/screen_info.h>
+#include <linux/uuid.h>
#include <asm/page.h>
@@ -44,17 +45,10 @@ typedef u16 efi_char16_t; /* UNICODE character */
typedef u64 efi_physical_addr_t;
typedef void *efi_handle_t;
-
-typedef struct {
- u8 b[16];
-} efi_guid_t;
+typedef uuid_le efi_guid_t;
#define EFI_GUID(a,b,c,d0,d1,d2,d3,d4,d5,d6,d7) \
-((efi_guid_t) \
-{{ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \
- (b) & 0xff, ((b) >> 8) & 0xff, \
- (c) & 0xff, ((c) >> 8) & 0xff, \
- (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }})
+ UUID_LE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7)
/*
* Generic EFI table header
@@ -1117,7 +1111,7 @@ extern int efi_status_to_err(efi_status_t status);
* Length of a GUID string (strlen("aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee"))
* not including trailing NUL
*/
-#define EFI_VARIABLE_GUID_LEN 36
+#define EFI_VARIABLE_GUID_LEN UUID_STRING_LEN
/*
* The type of search to perform when calling boottime->locate_handle
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 1259e53d9296..29f917517299 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -359,8 +359,6 @@ extern void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
extern void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group);
/* run all the marks in a group, and clear all of the marks where mark->flags & flags is true*/
extern void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, unsigned int flags);
-/* run all the marks in a group, and flag them to be freed */
-extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group);
extern void fsnotify_get_mark(struct fsnotify_mark *mark);
extern void fsnotify_put_mark(struct fsnotify_mark *mark);
extern void fsnotify_unmount_inodes(struct super_block *sb);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 5c706765404a..359a8e4bd44d 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -14,6 +14,7 @@
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/percpu-refcount.h>
+#include <linux/uuid.h>
#ifdef CONFIG_BLOCK
@@ -93,7 +94,7 @@ struct disk_stats {
* Enough for the string representation of any kind of UUID plus NULL.
* EFI UUID is 36 characters. MSDOS UUID is 11 characters.
*/
-#define PARTITION_META_INFO_UUIDLTH 37
+#define PARTITION_META_INFO_UUIDLTH (UUID_STRING_LEN + 1)
struct partition_meta_info {
char uuid[PARTITION_META_INFO_UUIDLTH];
@@ -228,27 +229,9 @@ static inline struct gendisk *part_to_disk(struct hd_struct *part)
return NULL;
}
-static inline void part_pack_uuid(const u8 *uuid_str, u8 *to)
-{
- int i;
- for (i = 0; i < 16; ++i) {
- *to++ = (hex_to_bin(*uuid_str) << 4) |
- (hex_to_bin(*(uuid_str + 1)));
- uuid_str += 2;
- switch (i) {
- case 3:
- case 5:
- case 7:
- case 9:
- uuid_str++;
- continue;
- }
- }
-}
-
static inline int blk_part_pack_uuid(const u8 *uuid_str, u8 *to)
{
- part_pack_uuid(uuid_str, to);
+ uuid_be_to_bin(uuid_str, (uuid_be *)to);
return 0;
}
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index dfd59d6bc6f0..c683996110b1 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -61,6 +61,7 @@ extern void irq_exit(void);
#define nmi_enter() \
do { \
+ printk_nmi_enter(); \
lockdep_off(); \
ftrace_nmi_enter(); \
BUG_ON(in_nmi()); \
@@ -77,6 +78,7 @@ extern void irq_exit(void);
preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \
ftrace_nmi_exit(); \
lockdep_on(); \
+ printk_nmi_exit(); \
} while (0)
#endif /* LINUX_HARDIRQ_H */
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index d7b9e5346fba..419fb9e03447 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -28,9 +28,7 @@ extern int zap_huge_pmd(struct mmu_gather *tlb,
extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
unsigned char *vec);
-extern bool move_huge_pmd(struct vm_area_struct *vma,
- struct vm_area_struct *new_vma,
- unsigned long old_addr,
+extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, unsigned long old_end,
pmd_t *old_pmd, pmd_t *new_pmd);
extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 7d953c2542a8..c26d4638f665 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -338,6 +338,7 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
/* arch callback */
int __init alloc_bootmem_huge_page(struct hstate *h);
+void __init hugetlb_bad_size(void);
void __init hugetlb_add_hstate(unsigned order);
struct hstate *size_to_hstate(unsigned long size);
@@ -352,9 +353,7 @@ extern unsigned int default_hstate_idx;
static inline struct hstate *hstate_inode(struct inode *i)
{
- struct hugetlbfs_sb_info *hsb;
- hsb = HUGETLBFS_SB(i->i_sb);
- return hsb->hstate;
+ return HUGETLBFS_SB(i->i_sb)->hstate;
}
static inline struct hstate *hstate_file(struct file *f)
@@ -453,12 +452,12 @@ static inline pgoff_t basepage_index(struct page *page)
extern void dissolve_free_huge_pages(unsigned long start_pfn,
unsigned long end_pfn);
-static inline int hugepage_migration_supported(struct hstate *h)
+static inline bool hugepage_migration_supported(struct hstate *h)
{
#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
return huge_page_shift(h) == PMD_SHIFT;
#else
- return 0;
+ return false;
#endif
}
@@ -520,7 +519,7 @@ static inline pgoff_t basepage_index(struct page *page)
return page->index;
}
#define dissolve_free_huge_pages(s, e) do {} while (0)
-#define hugepage_migration_supported(h) 0
+#define hugepage_migration_supported(h) false
static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
struct mm_struct *mm, pte_t *pte)
diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
index 24154c26d469..063962f6dfc6 100644
--- a/include/linux/hugetlb_cgroup.h
+++ b/include/linux/hugetlb_cgroup.h
@@ -93,20 +93,17 @@ hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
struct hugetlb_cgroup *h_cg,
struct page *page)
{
- return;
}
static inline void
hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, struct page *page)
{
- return;
}
static inline void
hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
struct hugetlb_cgroup *h_cg)
{
- return;
}
static inline void hugetlb_cgroup_file_init(void)
@@ -116,7 +113,6 @@ static inline void hugetlb_cgroup_file_init(void)
static inline void hugetlb_cgroup_migrate(struct page *oldhpage,
struct page *newhpage)
{
- return;
}
#endif /* CONFIG_MEM_RES_CTLR_HUGETLB */
diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h
index 2bb681fbeb35..a4e7ca0f3585 100644
--- a/include/linux/hugetlb_inline.h
+++ b/include/linux/hugetlb_inline.h
@@ -5,16 +5,16 @@
#include <linux/mm.h>
-static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
+static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
{
return !!(vma->vm_flags & VM_HUGETLB);
}
#else
-static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
+static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
{
- return 0;
+ return false;
}
#endif
diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h
new file mode 100644
index 000000000000..b7f8aced7870
--- /dev/null
+++ b/include/linux/kasan-checks.h
@@ -0,0 +1,12 @@
+#ifndef _LINUX_KASAN_CHECKS_H
+#define _LINUX_KASAN_CHECKS_H
+
+#ifdef CONFIG_KASAN
+void kasan_check_read(const void *p, unsigned int size);
+void kasan_check_write(const void *p, unsigned int size);
+#else
+static inline void kasan_check_read(const void *p, unsigned int size) { }
+static inline void kasan_check_write(const void *p, unsigned int size) { }
+#endif
+
+#endif
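A hedged sketch of how the new checks might annotate a raw access that bypasses the usual instrumented accessors (the helper is illustrative; u32 comes from <linux/types.h>):

	#include <linux/kasan-checks.h>
	#include <linux/types.h>

	static inline u32 raw_read_u32(const void *p)
	{
		/* tell KASAN we are about to read 4 bytes at p */
		kasan_check_read(p, sizeof(u32));
		return *(const u32 *)p;
	}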
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 737371b56044..611927f5870d 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -50,6 +50,8 @@ void kasan_free_pages(struct page *page, unsigned int order);
void kasan_cache_create(struct kmem_cache *cache, size_t *size,
unsigned long *flags);
+void kasan_cache_shrink(struct kmem_cache *cache);
+void kasan_cache_destroy(struct kmem_cache *cache);
void kasan_poison_slab(struct page *page);
void kasan_unpoison_object_data(struct kmem_cache *cache, void *object);
@@ -63,7 +65,8 @@ void kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size,
void kasan_krealloc(const void *object, size_t new_size, gfp_t flags);
void kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags);
-void kasan_slab_free(struct kmem_cache *s, void *object);
+bool kasan_slab_free(struct kmem_cache *s, void *object);
+void kasan_poison_slab_free(struct kmem_cache *s, void *object);
struct kasan_cache {
int alloc_meta_offset;
@@ -88,6 +91,8 @@ static inline void kasan_free_pages(struct page *page, unsigned int order) {}
static inline void kasan_cache_create(struct kmem_cache *cache,
size_t *size,
unsigned long *flags) {}
+static inline void kasan_cache_shrink(struct kmem_cache *cache) {}
+static inline void kasan_cache_destroy(struct kmem_cache *cache) {}
static inline void kasan_poison_slab(struct page *page) {}
static inline void kasan_unpoison_object_data(struct kmem_cache *cache,
@@ -105,7 +110,11 @@ static inline void kasan_krealloc(const void *object, size_t new_size,
static inline void kasan_slab_alloc(struct kmem_cache *s, void *object,
gfp_t flags) {}
-static inline void kasan_slab_free(struct kmem_cache *s, void *object) {}
+static inline bool kasan_slab_free(struct kmem_cache *s, void *object)
+{
+ return false;
+}
+static inline void kasan_poison_slab_free(struct kmem_cache *s, void *object) {}
static inline int kasan_module_alloc(void *addr, size_t size) { return 0; }
static inline void kasan_free_shadow(const struct vm_struct *vm) {}
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index f3e45cb0b6a2..94aa10ffe156 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -419,9 +419,9 @@ extern __printf(3, 4)
int scnprintf(char *buf, size_t size, const char *fmt, ...);
extern __printf(3, 0)
int vscnprintf(char *buf, size_t size, const char *fmt, va_list args);
-extern __printf(2, 3)
+extern __printf(2, 3) __malloc
char *kasprintf(gfp_t gfp, const char *fmt, ...);
-extern __printf(2, 0)
+extern __printf(2, 0) __malloc
char *kvasprintf(gfp_t gfp, const char *fmt, va_list args);
extern __printf(2, 0)
const char *kvasprintf_const(gfp_t gfp, const char *fmt, va_list args);
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 2cc643c6e870..c76641cfa0da 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -230,8 +230,6 @@ extern void crash_kexec(struct pt_regs *);
int kexec_should_crash(struct task_struct *);
void crash_save_cpu(struct pt_regs *regs, int cpu);
void crash_save_vmcoreinfo(void);
-void crash_map_reserved_pages(void);
-void crash_unmap_reserved_pages(void);
void arch_crash_save_vmcoreinfo(void);
__printf(1, 2)
void vmcoreinfo_append_str(const char *fmt, ...);
@@ -258,6 +256,8 @@ unsigned long paddr_vmcoreinfo_note(void);
vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name)
#define VMCOREINFO_CONFIG(name) \
vmcoreinfo_append_str("CONFIG_%s=y\n", #name)
+#define VMCOREINFO_PHYS_BASE(value) \
+ vmcoreinfo_append_str("PHYS_BASE=%lx\n", (unsigned long)value)
extern struct kimage *kexec_image;
extern struct kimage *kexec_crash_image;
@@ -317,6 +317,8 @@ int __weak arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr,
Elf_Shdr *sechdrs, unsigned int relsec);
int __weak arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
unsigned int relsec);
+void arch_kexec_protect_crashkres(void);
+void arch_kexec_unprotect_crashkres(void);
#else /* !CONFIG_KEXEC_CORE */
struct pt_regs;
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 1191d79aa495..a805474df4ab 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -415,25 +415,6 @@ unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
return mz->lru_size[lru];
}
-static inline bool mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
-{
- unsigned long inactive_ratio;
- unsigned long inactive;
- unsigned long active;
- unsigned long gb;
-
- inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
- active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
-
- gb = (inactive + active) >> (30 - PAGE_SHIFT);
- if (gb)
- inactive_ratio = int_sqrt(10 * gb);
- else
- inactive_ratio = 1;
-
- return inactive * inactive_ratio < active;
-}
-
void mem_cgroup_handle_over_high(void);
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
@@ -646,24 +627,12 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
return true;
}
-static inline bool
-mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
-{
- return true;
-}
-
static inline unsigned long
mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
return 0;
}
-static inline void
-mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
- int increment)
-{
-}
-
static inline unsigned long
mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
int nid, unsigned int lru_mask)
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index adbef586e696..a864d792c7b5 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -247,16 +247,16 @@ static inline void mem_hotplug_done(void) {}
#ifdef CONFIG_MEMORY_HOTREMOVE
-extern int is_mem_section_removable(unsigned long pfn, unsigned long nr_pages);
+extern bool is_mem_section_removable(unsigned long pfn, unsigned long nr_pages);
extern void try_offline_node(int nid);
extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
extern void remove_memory(int nid, u64 start, u64 size);
#else
-static inline int is_mem_section_removable(unsigned long pfn,
+static inline bool is_mem_section_removable(unsigned long pfn,
unsigned long nr_pages)
{
- return 0;
+ return false;
}
static inline void try_offline_node(int nid) {}
@@ -284,5 +284,7 @@ extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
unsigned long map_offset);
extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
unsigned long pnum);
+extern int zone_can_shift(unsigned long pfn, unsigned long nr_pages,
+ enum zone_type target);
#endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 2696c1f05ed1..4429d255c8ab 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -172,14 +172,14 @@ extern int mpol_parse_str(char *str, struct mempolicy **mpol);
extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol);
/* Check if a vma is migratable */
-static inline int vma_migratable(struct vm_area_struct *vma)
+static inline bool vma_migratable(struct vm_area_struct *vma)
{
if (vma->vm_flags & (VM_IO | VM_PFNMAP))
- return 0;
+ return false;
#ifndef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
if (vma->vm_flags & VM_HUGETLB)
- return 0;
+ return false;
#endif
/*
@@ -190,8 +190,8 @@ static inline int vma_migratable(struct vm_area_struct *vma)
if (vma->vm_file &&
gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
< policy_zone)
- return 0;
- return 1;
+ return false;
+ return true;
}
extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long);
@@ -228,6 +228,12 @@ static inline void mpol_free_shared_policy(struct shared_policy *p)
{
}
+static inline struct mempolicy *
+mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
+{
+ return NULL;
+}
+
#define vma_policy(vma) NULL
static inline int
diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index 69b6951e8fd2..b1086c936507 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -5,6 +5,7 @@
#define _LINUX_MEMPOOL_H
#include <linux/wait.h>
+#include <linux/compiler.h>
struct kmem_cache;
@@ -31,7 +32,7 @@ extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
extern int mempool_resize(mempool_t *pool, int new_min_nr);
extern void mempool_destroy(mempool_t *pool);
-extern void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask);
+extern void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) __malloc;
extern void mempool_free(void *element, mempool_t *pool);
/*
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 727f799757ab..65d18a45b8e8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -447,14 +447,14 @@ unsigned long vmalloc_to_pfn(const void *addr);
* On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there
* is no special casing required.
*/
-static inline int is_vmalloc_addr(const void *x)
+static inline bool is_vmalloc_addr(const void *x)
{
#ifdef CONFIG_MMU
unsigned long addr = (unsigned long)x;
return addr >= VMALLOC_START && addr < VMALLOC_END;
#else
- return 0;
+ return false;
#endif
}
#ifdef CONFIG_MMU
@@ -475,8 +475,7 @@ static inline atomic_t *compound_mapcount_ptr(struct page *page)
static inline int compound_mapcount(struct page *page)
{
- if (!PageCompound(page))
- return 0;
+ VM_BUG_ON_PAGE(!PageCompound(page), page);
page = compound_head(page);
return atomic_read(compound_mapcount_ptr(page)) + 1;
}
@@ -734,7 +733,7 @@ static inline void get_page(struct page *page)
page = compound_head(page);
/*
* Getting a normal page or the head of a compound page
- * requires to already have an elevated page->_count.
+ * requires an already elevated page->_refcount.
*/
VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page);
page_ref_inc(page);
@@ -850,10 +849,7 @@ extern int page_cpupid_xchg_last(struct page *page, int cpupid);
static inline void page_cpupid_reset_last(struct page *page)
{
- int cpupid = (1 << LAST_CPUPID_SHIFT) - 1;
-
- page->flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
- page->flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
+ page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT;
}
#endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */
#else /* !CONFIG_NUMA_BALANCING */
@@ -1032,26 +1028,7 @@ static inline pgoff_t page_file_index(struct page *page)
return page->index;
}
-/*
- * Return true if this page is mapped into pagetables.
- * For compound page it returns true if any subpage of compound page is mapped.
- */
-static inline bool page_mapped(struct page *page)
-{
- int i;
- if (likely(!PageCompound(page)))
- return atomic_read(&page->_mapcount) >= 0;
- page = compound_head(page);
- if (atomic_read(compound_mapcount_ptr(page)) >= 0)
- return true;
- if (PageHuge(page))
- return false;
- for (i = 0; i < hpage_nr_pages(page); i++) {
- if (atomic_read(&page[i]._mapcount) >= 0)
- return true;
- }
- return false;
-}
+bool page_mapped(struct page *page);
/*
* Return true only if the page has been allocated with
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 712e8c37a200..5bd29ba4f174 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -22,22 +22,34 @@ static inline int page_is_file_cache(struct page *page)
return !PageSwapBacked(page);
}
+static __always_inline void __update_lru_size(struct lruvec *lruvec,
+ enum lru_list lru, int nr_pages)
+{
+ __mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages);
+}
+
+static __always_inline void update_lru_size(struct lruvec *lruvec,
+ enum lru_list lru, int nr_pages)
+{
+#ifdef CONFIG_MEMCG
+ mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
+#else
+ __update_lru_size(lruvec, lru, nr_pages);
+#endif
+}
+
static __always_inline void add_page_to_lru_list(struct page *page,
struct lruvec *lruvec, enum lru_list lru)
{
- int nr_pages = hpage_nr_pages(page);
- mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
+ update_lru_size(lruvec, lru, hpage_nr_pages(page));
list_add(&page->lru, &lruvec->lists[lru]);
- __mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages);
}
static __always_inline void del_page_from_lru_list(struct page *page,
struct lruvec *lruvec, enum lru_list lru)
{
- int nr_pages = hpage_nr_pages(page);
- mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
list_del(&page->lru);
- __mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, -nr_pages);
+ update_lru_size(lruvec, lru, -hpage_nr_pages(page));
}
/**
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index c2d75b4fa86c..d553855503e6 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -12,6 +12,7 @@
#include <linux/cpumask.h>
#include <linux/uprobes.h>
#include <linux/page-flags-layout.h>
+#include <linux/workqueue.h>
#include <asm/page.h>
#include <asm/mmu.h>
@@ -73,9 +74,9 @@ struct page {
unsigned long counters;
#else
/*
- * Keep _count separate from slub cmpxchg_double data.
- * As the rest of the double word is protected by
- * slab_lock but _count is not.
+ * Keep _refcount separate from slub cmpxchg_double
+ * data. As the rest of the double word is protected by
+ * slab_lock but _refcount is not.
*/
unsigned counters;
#endif
@@ -97,7 +98,11 @@ struct page {
};
int units; /* SLOB */
};
- atomic_t _count; /* Usage count, see below. */
+ /*
+ * Usage count, *USE WRAPPER FUNCTION*
+ * when doing manual accounting. See page_ref.h
+ */
+ atomic_t _refcount;
};
unsigned int active; /* SLAB */
};
@@ -248,7 +253,7 @@ struct page_frag_cache {
__u32 offset;
#endif
/* we maintain a pagecount bias, so that we dont dirty cache line
- * containing page->_count every time we allocate a fragment.
+ * containing page->_refcount every time we allocate a fragment.
*/
unsigned int pagecnt_bias;
bool pfmemalloc;
@@ -509,6 +514,7 @@ struct mm_struct {
#ifdef CONFIG_HUGETLB_PAGE
atomic_long_t hugetlb_usage;
#endif
+ struct work_struct async_put_work;
};
static inline void mm_init_cpumask(struct mm_struct *mm)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c60df9257cc7..02069c23486d 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -85,13 +85,6 @@ extern int page_group_by_mobility_disabled;
get_pfnblock_flags_mask(page, page_to_pfn(page), \
PB_migrate_end, MIGRATETYPE_MASK)
-static inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
-{
- BUILD_BUG_ON(PB_migrate_end - PB_migrate != 2);
- return get_pfnblock_flags_mask(page, pfn, PB_migrate_end,
- MIGRATETYPE_MASK);
-}
-
struct free_area {
struct list_head free_list[MIGRATE_TYPES];
unsigned long nr_free;
@@ -746,8 +739,12 @@ static inline bool is_dev_zone(const struct zone *zone)
extern struct mutex zonelists_mutex;
void build_all_zonelists(pg_data_t *pgdat, struct zone *zone);
void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
+bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
+ int classzone_idx, unsigned int alloc_flags,
+ long free_pages);
bool zone_watermark_ok(struct zone *z, unsigned int order,
- unsigned long mark, int classzone_idx, int alloc_flags);
+ unsigned long mark, int classzone_idx,
+ unsigned int alloc_flags);
bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
unsigned long mark, int classzone_idx);
enum memmap_context {
@@ -828,10 +825,7 @@ static inline int is_highmem_idx(enum zone_type idx)
static inline int is_highmem(struct zone *zone)
{
#ifdef CONFIG_HIGHMEM
- int zone_off = (char *)zone - (char *)zone->zone_pgdat->node_zones;
- return zone_off == ZONE_HIGHMEM * sizeof(*zone) ||
- (zone_off == ZONE_MOVABLE * sizeof(*zone) &&
- zone_movable_is_highmem());
+ return is_highmem_idx(zone_idx(zone));
#else
return 0;
#endif
@@ -922,6 +916,10 @@ static inline int zonelist_node_idx(struct zoneref *zoneref)
#endif /* CONFIG_NUMA */
}
+struct zoneref *__next_zones_zonelist(struct zoneref *z,
+ enum zone_type highest_zoneidx,
+ nodemask_t *nodes);
+
/**
* next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point
* @z - The cursor used as a starting point for the search
@@ -934,9 +932,14 @@ static inline int zonelist_node_idx(struct zoneref *zoneref)
* being examined. It should be advanced by one before calling
* next_zones_zonelist again.
*/
-struct zoneref *next_zones_zonelist(struct zoneref *z,
+static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z,
enum zone_type highest_zoneidx,
- nodemask_t *nodes);
+ nodemask_t *nodes)
+{
+ if (likely(!nodes && zonelist_zone_idx(z) <= highest_zoneidx))
+ return z;
+ return __next_zones_zonelist(z, highest_zoneidx, nodes);
+}
/**
* first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist
@@ -952,13 +955,10 @@ struct zoneref *next_zones_zonelist(struct zoneref *z,
*/
static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
enum zone_type highest_zoneidx,
- nodemask_t *nodes,
- struct zone **zone)
+ nodemask_t *nodes)
{
- struct zoneref *z = next_zones_zonelist(zonelist->_zonerefs,
+ return next_zones_zonelist(zonelist->_zonerefs,
highest_zoneidx, nodes);
- *zone = zonelist_zone(z);
- return z;
}
/**
@@ -973,10 +973,17 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
* within a given nodemask
*/
#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
- for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone); \
+ for (z = first_zones_zonelist(zlist, highidx, nodemask), zone = zonelist_zone(z); \
+ zone; \
+ z = next_zones_zonelist(++z, highidx, nodemask), \
+ zone = zonelist_zone(z))
+
+#define for_next_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
+ for (zone = z->zone; \
zone; \
z = next_zones_zonelist(++z, highidx, nodemask), \
- zone = zonelist_zone(z)) \
+ zone = zonelist_zone(z))
+
/**
* for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
@@ -1056,7 +1063,7 @@ struct mem_section {
unsigned long *pageblock_flags;
#ifdef CONFIG_PAGE_EXTENSION
/*
- * If !SPARSEMEM, pgdat doesn't have page_ext pointer. We use
+ * If SPARSEMEM, pgdat doesn't have page_ext pointer. We use
* section. (see page_ext.h about this.)
*/
struct page_ext *page_ext;
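A hedged caller-side sketch (the helper below is illustrative): since first_zones_zonelist() no longer returns the zone through an output parameter, the iterator macro now derives it with zonelist_zone(), and a walk looks like this:

	static struct zone *first_populated_zone_in(struct zonelist *zonelist,
						    enum zone_type highidx,
						    nodemask_t *nodemask)
	{
		struct zoneref *z;
		struct zone *zone;

		/* visits every allowed zone at or below highidx, or none */
		for_each_zone_zonelist_nodemask(zone, z, zonelist, highidx, nodemask)
			if (populated_zone(zone))
				return zone;

		return NULL;
	}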
diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h
index e9fcf90b270d..5988dd57ba66 100644
--- a/include/linux/nilfs2_fs.h
+++ b/include/linux/nilfs2_fs.h
@@ -13,12 +13,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
- * You should have received a copy of the GNU Lesser General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Written by Koji Sato <koji@osrg.net>
- * Ryusuke Konishi <ryusuke@osrg.net>
+ * Written by Koji Sato and Ryusuke Konishi.
*/
/*
* linux/include/linux/ext2_fs.h
@@ -132,10 +127,14 @@ struct nilfs_super_root {
#define NILFS_MOUNT_ERRORS_RO 0x0020 /* Remount fs ro on errors */
#define NILFS_MOUNT_ERRORS_PANIC 0x0040 /* Panic on errors */
#define NILFS_MOUNT_BARRIER 0x1000 /* Use block barriers */
-#define NILFS_MOUNT_STRICT_ORDER 0x2000 /* Apply strict in-order
- semantics also for data */
-#define NILFS_MOUNT_NORECOVERY 0x4000 /* Disable write access during
- mount-time recovery */
+#define NILFS_MOUNT_STRICT_ORDER 0x2000 /*
+ * Apply strict in-order
+ * semantics also for data
+ */
+#define NILFS_MOUNT_NORECOVERY 0x4000 /*
+ * Disable write access during
+ * mount-time recovery
+ */
#define NILFS_MOUNT_DISCARD 0x8000 /* Issue DISCARD requests */
@@ -147,16 +146,20 @@ struct nilfs_super_block {
__le16 s_minor_rev_level; /* minor revision level */
__le16 s_magic; /* Magic signature */
- __le16 s_bytes; /* Bytes count of CRC calculation
- for this structure. s_reserved
- is excluded. */
+ __le16 s_bytes; /*
+ * Bytes count of CRC calculation
+ * for this structure. s_reserved
+ * is excluded.
+ */
__le16 s_flags; /* flags */
__le32 s_crc_seed; /* Seed value of CRC calculation */
/*10*/ __le32 s_sum; /* Check sum of super block */
- __le32 s_log_block_size; /* Block size represented as follows
- blocksize =
- 1 << (s_log_block_size + 10) */
+ __le32 s_log_block_size; /*
+ * Block size represented as follows
+ * blocksize =
+ * 1 << (s_log_block_size + 10)
+ */
__le64 s_nsegments; /* Number of segments in filesystem */
/*20*/ __le64 s_dev_size; /* block device size in bytes */
__le64 s_first_data_block; /* 1st seg disk block number */
@@ -168,8 +171,10 @@ struct nilfs_super_block {
__le64 s_last_seq; /* seq. number of seg written last */
/*50*/ __le64 s_free_blocks_count; /* Free blocks count */
- __le64 s_ctime; /* Creation time (execution time of
- newfs) */
+ __le64 s_ctime; /*
+ * Creation time (execution time of
+ * newfs)
+ */
/*60*/ __le64 s_mtime; /* Mount time */
__le64 s_wtime; /* Write time */
/*70*/ __le16 s_mnt_count; /* Mount count */
@@ -193,8 +198,10 @@ struct nilfs_super_block {
/*A8*/ char s_volume_name[80]; /* volume name */
/*F8*/ __le32 s_c_interval; /* Commit interval of segment */
- __le32 s_c_block_max; /* Threshold of data amount for
- the segment construction */
+ __le32 s_c_block_max; /*
+ * Threshold of data amount for
+ * the segment construction
+ */
/*100*/ __le64 s_feature_compat; /* Compatible feature set */
__le64 s_feature_compat_ro; /* Read-only compatible feature set */
__le64 s_feature_incompat; /* Incompatible feature set */
@@ -247,12 +254,18 @@ struct nilfs_super_block {
#define NILFS_SB_OFFSET_BYTES 1024 /* byte offset of nilfs superblock */
-#define NILFS_SEG_MIN_BLOCKS 16 /* Minimum number of blocks in
- a full segment */
-#define NILFS_PSEG_MIN_BLOCKS 2 /* Minimum number of blocks in
- a partial segment */
-#define NILFS_MIN_NRSVSEGS 8 /* Minimum number of reserved
- segments */
+#define NILFS_SEG_MIN_BLOCKS 16 /*
+ * Minimum number of blocks in
+ * a full segment
+ */
+#define NILFS_PSEG_MIN_BLOCKS 2 /*
+ * Minimum number of blocks in
+ * a partial segment
+ */
+#define NILFS_MIN_NRSVSEGS 8 /*
+ * Minimum number of reserved
+ * segments
+ */
/*
* We call DAT, cpfile, and sufile root metadata files. Inodes of
@@ -327,9 +340,9 @@ enum {
~NILFS_DIR_ROUND)
#define NILFS_MAX_REC_LEN ((1<<16)-1)
-static inline unsigned nilfs_rec_len_from_disk(__le16 dlen)
+static inline unsigned int nilfs_rec_len_from_disk(__le16 dlen)
{
- unsigned len = le16_to_cpu(dlen);
+ unsigned int len = le16_to_cpu(dlen);
#if !defined(__KERNEL__) || (PAGE_SIZE >= 65536)
if (len == NILFS_MAX_REC_LEN)
@@ -338,7 +351,7 @@ static inline unsigned nilfs_rec_len_from_disk(__le16 dlen)
return len;
}
-static inline __le16 nilfs_rec_len_to_disk(unsigned len)
+static inline __le16 nilfs_rec_len_to_disk(unsigned int len)
{
#if !defined(__KERNEL__) || (PAGE_SIZE >= 65536)
if (len == (1 << 16))
@@ -518,9 +531,11 @@ struct nilfs_checkpoint {
__le64 cp_inodes_count;
__le64 cp_blocks_count;
- /* Do not change the byte offset of ifile inode.
- To keep the compatibility of the disk format,
- additional fields should be added behind cp_ifile_inode. */
+ /*
+ * Do not change the byte offset of ifile inode.
+ * To keep the compatibility of the disk format,
+ * additional fields should be added behind cp_ifile_inode.
+ */
struct nilfs_inode cp_ifile_inode;
};
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 6e85889cf9ab..f746e44d4046 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -43,8 +43,10 @@
*
* int first_node(mask) Number lowest set bit, or MAX_NUMNODES
* int next_node(node, mask) Next node past 'node', or MAX_NUMNODES
+ * int next_node_in(node, mask) Next node past 'node', or wrap to first,
+ * or MAX_NUMNODES
* int first_unset_node(mask) First node not set in mask, or
- * MAX_NUMNODES.
+ * MAX_NUMNODES
*
* nodemask_t nodemask_of_node(node) Return nodemask with bit 'node' set
* NODE_MASK_ALL Initializer - all bits set
@@ -259,6 +261,13 @@ static inline int __next_node(int n, const nodemask_t *srcp)
return min_t(int,MAX_NUMNODES,find_next_bit(srcp->bits, MAX_NUMNODES, n+1));
}
+/*
+ * Find the next present node in src, starting after node n, wrapping around to
+ * the first node in src if needed. Returns MAX_NUMNODES if src is empty.
+ */
+#define next_node_in(n, src) __next_node_in((n), &(src))
+int __next_node_in(int node, const nodemask_t *srcp);
+
static inline void init_nodemask_of_node(nodemask_t *mask, int node)
{
nodes_clear(*mask);
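A hedged sketch of the typical wrapping use of the new helper (the function below is illustrative, not from this patch):

	/*
	 * Return the next allowed node after 'prev', wrapping back to the
	 * first set node; MAX_NUMNODES if 'allowed' is empty.
	 */
	static int pick_next_node(int prev, const nodemask_t *allowed)
	{
		return next_node_in(prev, *allowed);
	}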
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 628a43242a34..06de5886de99 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -50,28 +50,33 @@ enum oom_scan_t {
OOM_SCAN_SELECT, /* always select this thread first */
};
-/* Thread is the potential origin of an oom condition; kill first on oom */
-#define OOM_FLAG_ORIGIN ((__force oom_flags_t)0x1)
-
extern struct mutex oom_lock;
static inline void set_current_oom_origin(void)
{
- current->signal->oom_flags |= OOM_FLAG_ORIGIN;
+ current->signal->oom_flag_origin = true;
}
static inline void clear_current_oom_origin(void)
{
- current->signal->oom_flags &= ~OOM_FLAG_ORIGIN;
+ current->signal->oom_flag_origin = false;
}
static inline bool oom_task_origin(const struct task_struct *p)
{
- return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN);
+ return p->signal->oom_flag_origin;
}
extern void mark_oom_victim(struct task_struct *tsk);
+#ifdef CONFIG_MMU
+extern void try_oom_reaper(struct task_struct *tsk);
+#else
+static inline void try_oom_reaper(struct task_struct *tsk)
+{
+}
+#endif
+
extern unsigned long oom_badness(struct task_struct *p,
struct mem_cgroup *memcg, const nodemask_t *nodemask,
unsigned long totalpages);
diff --git a/include/linux/padata.h b/include/linux/padata.h
index 438694650471..113ee626a4dc 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -175,11 +175,6 @@ extern int padata_do_parallel(struct padata_instance *pinst,
extern void padata_do_serial(struct padata_priv *padata);
extern int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
cpumask_var_t cpumask);
-extern int padata_set_cpumasks(struct padata_instance *pinst,
- cpumask_var_t pcpumask,
- cpumask_var_t cbcpumask);
-extern int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask);
-extern int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask);
extern int padata_start(struct padata_instance *pinst);
extern void padata_stop(struct padata_instance *pinst);
extern int padata_register_cpumask_notifier(struct padata_instance *pinst,
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 6b052aa7b5b7..e5a32445f930 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -371,10 +371,15 @@ PAGEFLAG(Idle, idle, PF_ANY)
#define PAGE_MAPPING_KSM 2
#define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM)
+static __always_inline int PageAnonHead(struct page *page)
+{
+ return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
+}
+
static __always_inline int PageAnon(struct page *page)
{
page = compound_head(page);
- return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
+ return PageAnonHead(page);
}
#ifdef CONFIG_KSM
@@ -474,7 +479,7 @@ static inline void ClearPageCompound(struct page *page)
}
#endif
-#define PG_head_mask ((1L << PG_head))
+#define PG_head_mask ((1UL << PG_head))
#ifdef CONFIG_HUGETLB_PAGE
int PageHuge(struct page *page);
@@ -665,7 +670,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
}
#ifdef CONFIG_MMU
-#define __PG_MLOCKED (1 << PG_mlocked)
+#define __PG_MLOCKED (1UL << PG_mlocked)
#else
#define __PG_MLOCKED 0
#endif
@@ -675,11 +680,11 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
* these flags set. It they are, there is a problem.
*/
#define PAGE_FLAGS_CHECK_AT_FREE \
- (1 << PG_lru | 1 << PG_locked | \
- 1 << PG_private | 1 << PG_private_2 | \
- 1 << PG_writeback | 1 << PG_reserved | \
- 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \
- 1 << PG_unevictable | __PG_MLOCKED)
+ (1UL << PG_lru | 1UL << PG_locked | \
+ 1UL << PG_private | 1UL << PG_private_2 | \
+ 1UL << PG_writeback | 1UL << PG_reserved | \
+ 1UL << PG_slab | 1UL << PG_swapcache | 1UL << PG_active | \
+ 1UL << PG_unevictable | __PG_MLOCKED)
/*
* Flags checked when a page is prepped for return by the page allocator.
@@ -690,10 +695,10 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
* alloc-free cycle to prevent from reusing the page.
*/
#define PAGE_FLAGS_CHECK_AT_PREP \
- (((1 << NR_PAGEFLAGS) - 1) & ~__PG_HWPOISON)
+ (((1UL << NR_PAGEFLAGS) - 1) & ~__PG_HWPOISON)
#define PAGE_FLAGS_PRIVATE \
- (1 << PG_private | 1 << PG_private_2)
+ (1UL << PG_private | 1UL << PG_private_2)
/**
* page_has_private - Determine if page has private stuff
* @page: The page to be checked
diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h
index e596d5d9540e..8b5e0a9f2431 100644
--- a/include/linux/page_ref.h
+++ b/include/linux/page_ref.h
@@ -63,17 +63,17 @@ static inline void __page_ref_unfreeze(struct page *page, int v)
static inline int page_ref_count(struct page *page)
{
- return atomic_read(&page->_count);
+ return atomic_read(&page->_refcount);
}
static inline int page_count(struct page *page)
{
- return atomic_read(&compound_head(page)->_count);
+ return atomic_read(&compound_head(page)->_refcount);
}
static inline void set_page_count(struct page *page, int v)
{
- atomic_set(&page->_count, v);
+ atomic_set(&page->_refcount, v);
if (page_ref_tracepoint_active(__tracepoint_page_ref_set))
__page_ref_set(page, v);
}
@@ -89,35 +89,35 @@ static inline void init_page_count(struct page *page)
static inline void page_ref_add(struct page *page, int nr)
{
- atomic_add(nr, &page->_count);
+ atomic_add(nr, &page->_refcount);
if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
__page_ref_mod(page, nr);
}
static inline void page_ref_sub(struct page *page, int nr)
{
- atomic_sub(nr, &page->_count);
+ atomic_sub(nr, &page->_refcount);
if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
__page_ref_mod(page, -nr);
}
static inline void page_ref_inc(struct page *page)
{
- atomic_inc(&page->_count);
+ atomic_inc(&page->_refcount);
if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
__page_ref_mod(page, 1);
}
static inline void page_ref_dec(struct page *page)
{
- atomic_dec(&page->_count);
+ atomic_dec(&page->_refcount);
if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
__page_ref_mod(page, -1);
}
static inline int page_ref_sub_and_test(struct page *page, int nr)
{
- int ret = atomic_sub_and_test(nr, &page->_count);
+ int ret = atomic_sub_and_test(nr, &page->_refcount);
if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_test))
__page_ref_mod_and_test(page, -nr, ret);
@@ -126,7 +126,7 @@ static inline int page_ref_sub_and_test(struct page *page, int nr)
static inline int page_ref_dec_and_test(struct page *page)
{
- int ret = atomic_dec_and_test(&page->_count);
+ int ret = atomic_dec_and_test(&page->_refcount);
if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_test))
__page_ref_mod_and_test(page, -1, ret);
@@ -135,7 +135,7 @@ static inline int page_ref_dec_and_test(struct page *page)
static inline int page_ref_dec_return(struct page *page)
{
- int ret = atomic_dec_return(&page->_count);
+ int ret = atomic_dec_return(&page->_refcount);
if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_return))
__page_ref_mod_and_return(page, -1, ret);
@@ -144,7 +144,7 @@ static inline int page_ref_dec_return(struct page *page)
static inline int page_ref_add_unless(struct page *page, int nr, int u)
{
- int ret = atomic_add_unless(&page->_count, nr, u);
+ int ret = atomic_add_unless(&page->_refcount, nr, u);
if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_unless))
__page_ref_mod_unless(page, nr, ret);
@@ -153,7 +153,7 @@ static inline int page_ref_add_unless(struct page *page, int nr, int u)
static inline int page_ref_freeze(struct page *page, int count)
{
- int ret = likely(atomic_cmpxchg(&page->_count, count, 0) == count);
+ int ret = likely(atomic_cmpxchg(&page->_refcount, count, 0) == count);
if (page_ref_tracepoint_active(__tracepoint_page_ref_freeze))
__page_ref_freeze(page, count, ret);
@@ -165,7 +165,7 @@ static inline void page_ref_unfreeze(struct page *page, int count)
VM_BUG_ON_PAGE(page_count(page) != 0, page);
VM_BUG_ON(count == 0);
- atomic_set(&page->_count, count);
+ atomic_set(&page->_refcount, count);
if (page_ref_tracepoint_active(__tracepoint_page_ref_unfreeze))
__page_ref_unfreeze(page, count);
}
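
The hunk above renames page->_count to page->_refcount throughout the reference-counting helpers. As a reading aid only, a minimal hypothetical caller of the renamed API could look like the sketch below; release_it() is a stand-in for whatever the real owner does when the last reference drops, not a function from this patch.

static void page_ref_usage_sketch(struct page *page)
{
	page_ref_inc(page);			/* take an extra reference */
	/* ... use the page ... */
	if (page_ref_dec_and_test(page))	/* true when the count hits zero */
		release_it(page);		/* hypothetical release hook */
}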
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 7e1ab155c67c..97354102794d 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -90,12 +90,12 @@ void release_pages(struct page **pages, int nr, bool cold);
/*
* speculatively take a reference to a page.
- * If the page is free (_count == 0), then _count is untouched, and 0
- * is returned. Otherwise, _count is incremented by 1 and 1 is returned.
+ * If the page is free (_refcount == 0), then _refcount is untouched, and 0
+ * is returned. Otherwise, _refcount is incremented by 1 and 1 is returned.
*
* This function must be called inside the same rcu_read_lock() section as has
* been used to lookup the page in the pagecache radix-tree (or page table):
- * this allows allocators to use a synchronize_rcu() to stabilize _count.
+ * this allows allocators to use a synchronize_rcu() to stabilize _refcount.
*
* Unless an RCU grace period has passed, the count of all pages coming out
* of the allocator must be considered unstable. page_count may return higher
@@ -111,7 +111,7 @@ void release_pages(struct page **pages, int nr, bool cold);
* 2. conditionally increment refcount
* 3. check the page is still in pagecache (if no, goto 1)
*
- * Remove-side that cares about stability of _count (eg. reclaim) has the
+ * Remove-side that cares about stability of _refcount (eg. reclaim) has the
* following (with tree_lock held for write):
* A. atomically check refcount is correct and set it to 0 (atomic_cmpxchg)
* B. remove page from pagecache
@@ -518,33 +518,27 @@ void page_endio(struct page *page, int rw, int err);
extern void add_page_wait_queue(struct page *page, wait_queue_t *waiter);
/*
- * Fault a userspace page into pagetables. Return non-zero on a fault.
- *
- * This assumes that two userspace pages are always sufficient.
+ * Fault one or two userspace pages into pagetables.
+ * Return -EINVAL if more than two pages would be needed.
+ * Return non-zero on a fault.
*/
static inline int fault_in_pages_writeable(char __user *uaddr, int size)
{
- int ret;
+ int span, ret;
if (unlikely(size == 0))
return 0;
+ span = offset_in_page(uaddr) + size;
+ if (span > 2 * PAGE_SIZE)
+ return -EINVAL;
/*
* Writing zeroes into userspace here is OK, because we know that if
* the zero gets there, we'll be overwriting it.
*/
ret = __put_user(0, uaddr);
- if (ret == 0) {
- char __user *end = uaddr + size - 1;
-
- /*
- * If the page was already mapped, this will get a cache miss
- * for sure, so try to avoid doing it.
- */
- if (((unsigned long)uaddr & PAGE_MASK) !=
- ((unsigned long)end & PAGE_MASK))
- ret = __put_user(0, end);
- }
+ if (ret == 0 && span > PAGE_SIZE)
+ ret = __put_user(0, uaddr + size - 1);
return ret;
}
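
For clarity, the new early-exit condition can be read as a standalone predicate. This sketch only restates the arithmetic already in the hunk (offset_in_page() plus size compared against two pages); it adds nothing beyond the -EINVAL case above.

static inline bool needs_more_than_two_pages(const char __user *uaddr, int size)
{
	return offset_in_page(uaddr) + size > 2 * PAGE_SIZE;	/* the -EINVAL case above */
}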
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 4bc6dafb703e..56939d3f6e53 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -129,7 +129,4 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr);
(typeof(type) __percpu *)__alloc_percpu(sizeof(type), \
__alignof__(type))
-/* To avoid include hell, as printk can not declare this, we declare it here */
-DECLARE_PER_CPU(printk_func_t, printk_func);
-
#endif /* __LINUX_PERCPU_H */
diff --git a/include/linux/poll.h b/include/linux/poll.h
index 9fb4f40d9a26..37b057b63b46 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -96,7 +96,7 @@ extern void poll_initwait(struct poll_wqueues *pwq);
extern void poll_freewait(struct poll_wqueues *pwq);
extern int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
ktime_t *expires, unsigned long slack);
-extern u64 select_estimate_accuracy(struct timespec *tv);
+extern u64 select_estimate_accuracy(struct timespec64 *tv);
static inline int poll_schedule(struct poll_wqueues *pwq, int state)
@@ -153,12 +153,13 @@ void zero_fd_set(unsigned long nr, unsigned long *fdset)
#define MAX_INT64_SECONDS (((s64)(~((u64)0)>>1)/HZ)-1)
-extern int do_select(int n, fd_set_bits *fds, struct timespec *end_time);
+extern int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time);
extern int do_sys_poll(struct pollfd __user * ufds, unsigned int nfds,
- struct timespec *end_time);
+ struct timespec64 *end_time);
extern int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
- fd_set __user *exp, struct timespec *end_time);
+ fd_set __user *exp, struct timespec64 *end_time);
-extern int poll_select_set_timeout(struct timespec *to, long sec, long nsec);
+extern int poll_select_set_timeout(struct timespec64 *to, time64_t sec,
+ long nsec);
#endif /* _LINUX_POLL_H */
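
With the switch to timespec64, callers set up end times through the widened prototype. A rough sketch for a 2-second timeout, using only the signature declared above (behaviour otherwise assumed):

static int set_two_second_timeout(struct timespec64 *end_time)
{
	/* sec is now time64_t per the prototype change above */
	return poll_select_set_timeout(end_time, 2, 0);
}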
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 9ccbdf2c1453..f4da695fd615 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -122,7 +122,19 @@ static inline __printf(1, 2) __cold
void early_printk(const char *s, ...) { }
#endif
-typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args);
+#ifdef CONFIG_PRINTK_NMI
+extern void printk_nmi_init(void);
+extern void printk_nmi_enter(void);
+extern void printk_nmi_exit(void);
+extern void printk_nmi_flush(void);
+extern void printk_nmi_flush_on_panic(void);
+#else
+static inline void printk_nmi_init(void) { }
+static inline void printk_nmi_enter(void) { }
+static inline void printk_nmi_exit(void) { }
+static inline void printk_nmi_flush(void) { }
+static inline void printk_nmi_flush_on_panic(void) { }
+#endif /* PRINTK_NMI */
+#endif /* CONFIG_PRINTK_NMI */
#ifdef CONFIG_PRINTK
asmlinkage __printf(5, 0)
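
The declarations above only show the API surface. A hedged sketch of how an NMI path is presumably expected to bracket its printing (the real call sites live elsewhere in the series; the redirection to the per-CPU buffer is inferred from the Kconfig help text later in this patch):

static void example_nmi_path(void)	/* hypothetical caller */
{
	printk_nmi_enter();
	printk("example message from NMI context\n");	/* assumed to land in the per-CPU NMI buffer */
	printk_nmi_exit();
}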
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index d08d6ec3bf53..cb4b7e8cee81 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -29,42 +29,45 @@
#include <linux/rcupdate.h>
/*
- * An indirect pointer (root->rnode pointing to a radix_tree_node, rather
- * than a data item) is signalled by the low bit set in the root->rnode
- * pointer.
- *
- * In this case root->height is > 0, but the indirect pointer tests are
- * needed for RCU lookups (because root->height is unreliable). The only
- * time callers need worry about this is when doing a lookup_slot under
- * RCU.
- *
- * Indirect pointer in fact is also used to tag the last pointer of a node
- * when it is shrunk, before we rcu free the node. See shrink code for
- * details.
+ * The bottom two bits of the slot determine how the remaining bits in the
+ * slot are interpreted:
+ *
+ * 00 - data pointer
+ * 01 - internal entry
+ * 10 - exceptional entry
+ * 11 - locked exceptional entry
+ *
+ * The internal entry may be a pointer to the next level in the tree, a
+ * sibling entry, or an indicator that the entry in this slot has been moved
+ * to another location in the tree and the lookup should be restarted. While
+ * NULL fits the 'data pointer' pattern, it means that there is no entry in
+ * the tree for this index (no matter what level of the tree it is found at).
+ * This means that you cannot store NULL in the tree as a value for the index.
*/
-#define RADIX_TREE_INDIRECT_PTR 1
+#define RADIX_TREE_ENTRY_MASK 3UL
+#define RADIX_TREE_INTERNAL_NODE 1UL
+
/*
- * A common use of the radix tree is to store pointers to struct pages;
- * but shmem/tmpfs needs also to store swap entries in the same tree:
- * those are marked as exceptional entries to distinguish them.
+ * Most users of the radix tree store pointers but shmem/tmpfs stores swap
+ * entries in the same tree. They are marked as exceptional entries to
+ * distinguish them from pointers to struct page.
* EXCEPTIONAL_ENTRY tests the bit, EXCEPTIONAL_SHIFT shifts content past it.
*/
#define RADIX_TREE_EXCEPTIONAL_ENTRY 2
#define RADIX_TREE_EXCEPTIONAL_SHIFT 2
-static inline int radix_tree_is_indirect_ptr(void *ptr)
+static inline bool radix_tree_is_internal_node(void *ptr)
{
- return (int)((unsigned long)ptr & RADIX_TREE_INDIRECT_PTR);
+ return ((unsigned long)ptr & RADIX_TREE_ENTRY_MASK) ==
+ RADIX_TREE_INTERNAL_NODE;
}
/*** radix-tree API starts here ***/
#define RADIX_TREE_MAX_TAGS 3
-#ifdef __KERNEL__
+#ifndef RADIX_TREE_MAP_SHIFT
#define RADIX_TREE_MAP_SHIFT (CONFIG_BASE_SMALL ? 4 : 6)
-#else
-#define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */
#endif
#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT)
@@ -77,16 +80,13 @@ static inline int radix_tree_is_indirect_ptr(void *ptr)
#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
RADIX_TREE_MAP_SHIFT))
-/* Height component in node->path */
-#define RADIX_TREE_HEIGHT_SHIFT (RADIX_TREE_MAX_PATH + 1)
-#define RADIX_TREE_HEIGHT_MASK ((1UL << RADIX_TREE_HEIGHT_SHIFT) - 1)
-
/* Internally used bits of node->count */
#define RADIX_TREE_COUNT_SHIFT (RADIX_TREE_MAP_SHIFT + 1)
#define RADIX_TREE_COUNT_MASK ((1UL << RADIX_TREE_COUNT_SHIFT) - 1)
struct radix_tree_node {
- unsigned int path; /* Offset in parent & height from the bottom */
+ unsigned char shift; /* Bits remaining in each slot */
+ unsigned char offset; /* Slot offset in parent */
unsigned int count;
union {
struct {
@@ -106,13 +106,11 @@ struct radix_tree_node {
/* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */
struct radix_tree_root {
- unsigned int height;
gfp_t gfp_mask;
struct radix_tree_node __rcu *rnode;
};
#define RADIX_TREE_INIT(mask) { \
- .height = 0, \
.gfp_mask = (mask), \
.rnode = NULL, \
}
@@ -122,11 +120,15 @@ struct radix_tree_root {
#define INIT_RADIX_TREE(root, mask) \
do { \
- (root)->height = 0; \
(root)->gfp_mask = (mask); \
(root)->rnode = NULL; \
} while (0)
+static inline bool radix_tree_empty(struct radix_tree_root *root)
+{
+ return root->rnode == NULL;
+}
+
/**
* Radix-tree synchronization
*
@@ -222,7 +224,7 @@ static inline void *radix_tree_deref_slot_protected(void **pslot,
*/
static inline int radix_tree_deref_retry(void *arg)
{
- return unlikely((unsigned long)arg & RADIX_TREE_INDIRECT_PTR);
+ return unlikely(radix_tree_is_internal_node(arg));
}
/**
@@ -243,8 +245,7 @@ static inline int radix_tree_exceptional_entry(void *arg)
*/
static inline int radix_tree_exception(void *arg)
{
- return unlikely((unsigned long)arg &
- (RADIX_TREE_INDIRECT_PTR | RADIX_TREE_EXCEPTIONAL_ENTRY));
+ return unlikely((unsigned long)arg & RADIX_TREE_ENTRY_MASK);
}
/**
@@ -257,7 +258,7 @@ static inline int radix_tree_exception(void *arg)
*/
static inline void radix_tree_replace_slot(void **pslot, void *item)
{
- BUG_ON(radix_tree_is_indirect_ptr(item));
+ BUG_ON(radix_tree_is_internal_node(item));
rcu_assign_pointer(*pslot, item);
}
@@ -279,9 +280,12 @@ bool __radix_tree_delete_node(struct radix_tree_root *root,
struct radix_tree_node *node);
void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
void *radix_tree_delete(struct radix_tree_root *, unsigned long);
-unsigned int
-radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
- unsigned long first_index, unsigned int max_items);
+struct radix_tree_node *radix_tree_replace_clear_tags(
+ struct radix_tree_root *root,
+ unsigned long index, void *entry);
+unsigned int radix_tree_gang_lookup(struct radix_tree_root *root,
+ void **results, unsigned long first_index,
+ unsigned int max_items);
unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root,
void ***results, unsigned long *indices,
unsigned long first_index, unsigned int max_items);
@@ -318,8 +322,9 @@ static inline void radix_tree_preload_end(void)
* struct radix_tree_iter - radix tree iterator state
*
* @index: index of current slot
- * @next_index: next-to-last index for this chunk
+ * @next_index: one beyond the last index for this chunk
* @tags: bit-mask for tag-iterating
+ * @shift: shift for the node that holds our slots
*
* This radix tree iterator works in terms of "chunks" of slots. A chunk is a
* subinterval of slots contained within one radix tree leaf node. It is
@@ -332,8 +337,20 @@ struct radix_tree_iter {
unsigned long index;
unsigned long next_index;
unsigned long tags;
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
+ unsigned int shift;
+#endif
};
+static inline unsigned int iter_shift(struct radix_tree_iter *iter)
+{
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
+ return iter->shift;
+#else
+ return 0;
+#endif
+}
+
#define RADIX_TREE_ITER_TAG_MASK 0x00FF /* tag index in lower byte */
#define RADIX_TREE_ITER_TAGGED 0x0100 /* lookup tagged slots */
#define RADIX_TREE_ITER_CONTIG 0x0200 /* stop at first hole */
@@ -393,6 +410,12 @@ void **radix_tree_iter_retry(struct radix_tree_iter *iter)
return NULL;
}
+static inline unsigned long
+__radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots)
+{
+ return iter->index + (slots << iter_shift(iter));
+}
+
/**
* radix_tree_iter_next - resume iterating when the chunk may be invalid
* @iter: iterator state
@@ -404,7 +427,7 @@ void **radix_tree_iter_retry(struct radix_tree_iter *iter)
static inline __must_check
void **radix_tree_iter_next(struct radix_tree_iter *iter)
{
- iter->next_index = iter->index + 1;
+ iter->next_index = __radix_tree_iter_add(iter, 1);
iter->tags = 0;
return NULL;
}
@@ -418,7 +441,12 @@ void **radix_tree_iter_next(struct radix_tree_iter *iter)
static __always_inline long
radix_tree_chunk_size(struct radix_tree_iter *iter)
{
- return iter->next_index - iter->index;
+ return (iter->next_index - iter->index) >> iter_shift(iter);
+}
+
+static inline struct radix_tree_node *entry_to_node(void *ptr)
+{
+ return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE);
}
/**
@@ -436,24 +464,49 @@ static __always_inline void **
radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
{
if (flags & RADIX_TREE_ITER_TAGGED) {
+ void *canon = slot;
+
iter->tags >>= 1;
+ if (unlikely(!iter->tags))
+ return NULL;
+ while (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) &&
+ radix_tree_is_internal_node(slot[1])) {
+ if (entry_to_node(slot[1]) == canon) {
+ iter->tags >>= 1;
+ iter->index = __radix_tree_iter_add(iter, 1);
+ slot++;
+ continue;
+ }
+ iter->next_index = __radix_tree_iter_add(iter, 1);
+ return NULL;
+ }
if (likely(iter->tags & 1ul)) {
- iter->index++;
+ iter->index = __radix_tree_iter_add(iter, 1);
return slot + 1;
}
- if (!(flags & RADIX_TREE_ITER_CONTIG) && likely(iter->tags)) {
+ if (!(flags & RADIX_TREE_ITER_CONTIG)) {
unsigned offset = __ffs(iter->tags);
iter->tags >>= offset;
- iter->index += offset + 1;
+ iter->index = __radix_tree_iter_add(iter, offset + 1);
return slot + offset + 1;
}
} else {
- long size = radix_tree_chunk_size(iter);
+ long count = radix_tree_chunk_size(iter);
+ void *canon = slot;
- while (--size > 0) {
+ while (--count > 0) {
slot++;
- iter->index++;
+ iter->index = __radix_tree_iter_add(iter, 1);
+
+ if (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) &&
+ radix_tree_is_internal_node(*slot)) {
+ if (entry_to_node(*slot) == canon)
+ continue;
+ iter->next_index = iter->index;
+ break;
+ }
+
if (likely(*slot))
return slot;
if (flags & RADIX_TREE_ITER_CONTIG) {
@@ -467,34 +520,6 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
}
/**
- * radix_tree_for_each_chunk - iterate over chunks
- *
- * @slot: the void** variable for pointer to chunk first slot
- * @root: the struct radix_tree_root pointer
- * @iter: the struct radix_tree_iter pointer
- * @start: iteration starting index
- * @flags: RADIX_TREE_ITER_* and tag index
- *
- * Locks can be released and reacquired between iterations.
- */
-#define radix_tree_for_each_chunk(slot, root, iter, start, flags) \
- for (slot = radix_tree_iter_init(iter, start) ; \
- (slot = radix_tree_next_chunk(root, iter, flags)) ;)
-
-/**
- * radix_tree_for_each_chunk_slot - iterate over slots in one chunk
- *
- * @slot: the void** variable, at the beginning points to chunk first slot
- * @iter: the struct radix_tree_iter pointer
- * @flags: RADIX_TREE_ITER_*, should be constant
- *
- * This macro is designed to be nested inside radix_tree_for_each_chunk().
- * @slot points to the radix tree slot, @iter->index contains its index.
- */
-#define radix_tree_for_each_chunk_slot(slot, iter, flags) \
- for (; slot ; slot = radix_tree_next_slot(slot, iter, flags))
-
-/**
* radix_tree_for_each_slot - iterate over non-empty slots
*
* @slot: the void** variable for pointer to slot
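
To make the new encoding concrete, this throwaway helper classifies a slot purely from the constants introduced above (00 data, 01 internal, 10 exceptional, 11 locked exceptional). It is illustration, not part of the patch.

static const char *classify_slot(void *entry)
{
	switch ((unsigned long)entry & RADIX_TREE_ENTRY_MASK) {
	case 0:					return "data pointer";
	case RADIX_TREE_INTERNAL_NODE:		return "internal entry";
	case RADIX_TREE_EXCEPTIONAL_ENTRY:	return "exceptional entry";
	default:				return "locked exceptional entry";
	}
}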
diff --git a/include/linux/random.h b/include/linux/random.h
index 9c29122037f9..e47e533742b5 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -26,7 +26,6 @@ extern void get_random_bytes(void *buf, int nbytes);
extern int add_random_ready_callback(struct random_ready_callback *rdy);
extern void del_random_ready_callback(struct random_ready_callback *rdy);
extern void get_random_bytes_arch(void *buf, int nbytes);
-void generate_random_uuid(unsigned char uuid_out[16]);
extern int random_int_secret_init(void);
#ifndef MODULE
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c2a7c8ab224a..870a700ee54e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -522,6 +522,7 @@ static inline int get_dumpable(struct mm_struct *mm)
#define MMF_HAS_UPROBES 19 /* has uprobes */
#define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */
+#define MMF_OOM_REAPED 21 /* mm has been already reaped */
#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
@@ -793,7 +794,11 @@ struct signal_struct {
struct tty_audit_buf *tty_audit_buf;
#endif
- oom_flags_t oom_flags;
+ /*
+ * Thread is the potential origin of an oom condition; kill first on
+ * oom
+ */
+ bool oom_flag_origin;
short oom_score_adj; /* OOM kill score adjustment */
short oom_score_adj_min; /* OOM kill score adjustment min value.
* Only settable by CAP_SYS_RESOURCE. */
@@ -2735,6 +2740,11 @@ static inline void mmdrop(struct mm_struct * mm)
/* mmput gets rid of the mappings and all user-space */
extern void mmput(struct mm_struct *);
+/* same as above but performs the slow path from an async context. Can
+ * be called from the atomic context as well
+ */
+extern void mmput_async(struct mm_struct *);
+
/* Grab a reference to a task's mm, if it is not already going away */
extern struct mm_struct *get_task_mm(struct task_struct *task);
/*
@@ -2763,7 +2773,14 @@ static inline int copy_thread_tls(
}
#endif
extern void flush_thread(void);
-extern void exit_thread(void);
+
+#ifdef CONFIG_HAVE_EXIT_THREAD
+extern void exit_thread(struct task_struct *tsk);
+#else
+static inline void exit_thread(struct task_struct *tsk)
+{
+}
+#endif
extern void exit_files(struct task_struct *);
extern void __cleanup_sighand(struct sighand_struct *);
diff --git a/include/linux/sem.h b/include/linux/sem.h
index 976ce3a19f1b..d0efd6e6c20a 100644
--- a/include/linux/sem.h
+++ b/include/linux/sem.h
@@ -21,6 +21,7 @@ struct sem_array {
struct list_head list_id; /* undo requests on this array */
int sem_nsems; /* no. of semaphores in array */
int complex_count; /* pending complex operations */
+ bool complex_mode; /* no parallel simple ops */
};
#ifdef CONFIG_SYSVIPC
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 508bd827e6dc..aeb3e6d00a66 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -315,8 +315,8 @@ static __always_inline int kmalloc_index(size_t size)
}
#endif /* !CONFIG_SLOB */
-void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment;
-void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment;
+void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc;
+void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment __malloc;
void kmem_cache_free(struct kmem_cache *, void *);
/*
@@ -339,8 +339,8 @@ static __always_inline void kfree_bulk(size_t size, void **p)
}
#ifdef CONFIG_NUMA
-void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment;
-void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment;
+void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc;
+void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment __malloc;
#else
static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node)
{
@@ -354,12 +354,12 @@ static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t f
#endif
#ifdef CONFIG_TRACING
-extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t) __assume_slab_alignment;
+extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t) __assume_slab_alignment __malloc;
#ifdef CONFIG_NUMA
extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
gfp_t gfpflags,
- int node, size_t size) __assume_slab_alignment;
+ int node, size_t size) __assume_slab_alignment __malloc;
#else
static __always_inline void *
kmem_cache_alloc_node_trace(struct kmem_cache *s,
@@ -392,10 +392,10 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s,
}
#endif /* CONFIG_TRACING */
-extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment;
+extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc;
#ifdef CONFIG_TRACING
-extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment;
+extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc;
#else
static __always_inline void *
kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 9edbbf352340..8694f7a5d92b 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -80,6 +80,10 @@ struct kmem_cache {
struct kasan_cache kasan_info;
#endif
+#ifdef CONFIG_SLAB_FREELIST_RANDOM
+ void *random_seq;
+#endif
+
struct kmem_cache_node *node[MAX_NUMNODES];
};
diff --git a/include/linux/string.h b/include/linux/string.h
index d3993a79a325..26b6f6a66f83 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -119,7 +119,7 @@ char *strreplace(char *s, char old, char new);
extern void kfree_const(const void *x);
-extern char *kstrdup(const char *s, gfp_t gfp);
+extern char *kstrdup(const char *s, gfp_t gfp) __malloc;
extern const char *kstrdup_const(const char *s, gfp_t gfp);
extern char *kstrndup(const char *s, size_t len, gfp_t gfp);
extern void *kmemdup(const void *src, size_t len, gfp_t gfp);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index ad220359f1b0..0af2bb2028fd 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -316,6 +316,7 @@ extern void lru_cache_add_active_or_unevictable(struct page *page,
struct vm_area_struct *vma);
/* linux/mm/vmscan.c */
+extern unsigned long zone_reclaimable_pages(struct zone *zone);
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask, nodemask_t *mask);
extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index d795472c54d8..d02239022bd0 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -371,10 +371,10 @@ asmlinkage long sys_rt_sigtimedwait(const sigset_t __user *uthese,
size_t sigsetsize);
asmlinkage long sys_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig,
siginfo_t __user *uinfo);
-asmlinkage long sys_kill(int pid, int sig);
-asmlinkage long sys_tgkill(int tgid, int pid, int sig);
-asmlinkage long sys_tkill(int pid, int sig);
-asmlinkage long sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo);
+asmlinkage long sys_kill(pid_t pid, int sig);
+asmlinkage long sys_tgkill(pid_t tgid, pid_t pid, int sig);
+asmlinkage long sys_tkill(pid_t pid, int sig);
+asmlinkage long sys_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t __user *uinfo);
asmlinkage long sys_sgetmask(void);
asmlinkage long sys_ssetmask(int newmask);
asmlinkage long sys_signal(int sig, __sighandler_t handler);
diff --git a/include/linux/time64.h b/include/linux/time64.h
index 367d5af899e8..7e5d2fa9ac46 100644
--- a/include/linux/time64.h
+++ b/include/linux/time64.h
@@ -65,7 +65,6 @@ static inline struct itimerspec64 itimerspec_to_itimerspec64(struct itimerspec *
# define timespec64_equal timespec_equal
# define timespec64_compare timespec_compare
# define set_normalized_timespec64 set_normalized_timespec
-# define timespec64_add_safe timespec_add_safe
# define timespec64_add timespec_add
# define timespec64_sub timespec_sub
# define timespec64_valid timespec_valid
@@ -134,15 +133,6 @@ static inline int timespec64_compare(const struct timespec64 *lhs, const struct
extern void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec);
-/*
- * timespec64_add_safe assumes both values are positive and checks for
- * overflow. It will return TIME_T_MAX if the returned value would be
- * smaller then either of the arguments.
- */
-extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
- const struct timespec64 rhs);
-
-
static inline struct timespec64 timespec64_add(struct timespec64 lhs,
struct timespec64 rhs)
{
@@ -224,4 +214,11 @@ static __always_inline void timespec64_add_ns(struct timespec64 *a, u64 ns)
#endif
+/*
+ * timespec64_add_safe assumes both values are positive and checks for
+ * overflow. It will return TIME64_MAX in case of overflow.
+ */
+extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
+ const struct timespec64 rhs);
+
#endif /* _LINUX_TIME64_H */
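
The relocated comment states that timespec64_add_safe() returns TIME64_MAX on overflow. Assuming that means the seconds field is saturated (not verified here), a caller might detect overflow like so:

static bool ts64_add_overflowed(struct timespec64 a, struct timespec64 b)
{
	struct timespec64 sum = timespec64_add_safe(a, b);

	return sum.tv_sec == TIME64_MAX;	/* saturated value per the comment above */
}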
diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index 37dbacf84849..4c4390a5d71f 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -246,6 +246,7 @@ static inline u64 ktime_get_raw_ns(void)
extern u64 ktime_get_mono_fast_ns(void);
extern u64 ktime_get_raw_fast_ns(void);
+extern u64 ktime_get_log_ts(u64 *offset_real);
/*
* Timespec interfaces utilizing the ktime based ones
diff --git a/include/linux/types.h b/include/linux/types.h
index 70dd3dfde631..baf718324f4a 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -156,7 +156,6 @@ typedef u32 dma_addr_t;
typedef unsigned __bitwise__ gfp_t;
typedef unsigned __bitwise__ fmode_t;
-typedef unsigned __bitwise__ oom_flags_t;
#ifdef CONFIG_PHYS_ADDR_T_64BIT
typedef u64 phys_addr_t;
diff --git a/include/linux/uuid.h b/include/linux/uuid.h
index 6df2509033d7..2d095fc60204 100644
--- a/include/linux/uuid.h
+++ b/include/linux/uuid.h
@@ -1,7 +1,7 @@
/*
* UUID/GUID definition
*
- * Copyright (C) 2010, Intel Corp.
+ * Copyright (C) 2010, 2016 Intel Corp.
* Huang Ying <ying.huang@intel.com>
*
* This program is free software; you can redistribute it and/or
@@ -12,16 +12,17 @@
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#ifndef _LINUX_UUID_H_
#define _LINUX_UUID_H_
#include <uapi/linux/uuid.h>
+/*
+ * The length of a UUID string ("aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee")
+ * not including trailing NUL.
+ */
+#define UUID_STRING_LEN 36
static inline int uuid_le_cmp(const uuid_le u1, const uuid_le u2)
{
@@ -33,7 +34,17 @@ static inline int uuid_be_cmp(const uuid_be u1, const uuid_be u2)
return memcmp(&u1, &u2, sizeof(uuid_be));
}
+void generate_random_uuid(unsigned char uuid[16]);
+
extern void uuid_le_gen(uuid_le *u);
extern void uuid_be_gen(uuid_be *u);
+bool __must_check uuid_is_valid(const char *uuid);
+
+extern const u8 uuid_le_index[16];
+extern const u8 uuid_be_index[16];
+
+int uuid_le_to_bin(const char *uuid, uuid_le *u);
+int uuid_be_to_bin(const char *uuid, uuid_be *u);
+
#endif
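
A small usage sketch of the newly exported string helpers; the UUID literal is arbitrary and the exact return convention of uuid_le_to_bin() is assumed from the prototype rather than verified.

static int parse_example_uuid(uuid_le *out)
{
	const char *s = "12345678-1234-1234-1234-123456789abc";	/* UUID_STRING_LEN characters */

	if (!uuid_is_valid(s))
		return -EINVAL;
	return uuid_le_to_bin(s, out);
}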
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 8b51df3ab334..3d9d786a943c 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -4,6 +4,7 @@
#include <linux/spinlock.h>
#include <linux/init.h>
#include <linux/list.h>
+#include <linux/llist.h>
#include <asm/page.h> /* pgprot_t */
#include <linux/rbtree.h>
@@ -45,7 +46,7 @@ struct vmap_area {
unsigned long flags;
struct rb_node rb_node; /* address sorted rbtree */
struct list_head list; /* address sorted list */
- struct list_head purge_list; /* "lazy purge" list */
+ struct llist_node purge_list; /* "lazy purge" list */
struct vm_struct *vm;
struct rcu_head rcu_head;
};
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 73fae8c4a5fb..0aa613df463e 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -53,6 +53,8 @@ static inline void count_vm_events(enum vm_event_item item, long delta)
extern void all_vm_events(unsigned long *);
+extern unsigned long sum_vm_event(enum vm_event_item item);
+
extern void vm_events_fold_cpu(int cpu);
#else
@@ -73,6 +75,10 @@ static inline void __count_vm_events(enum vm_event_item item, long delta)
static inline void all_vm_events(unsigned long *ret)
{
}
+static inline unsigned long sum_vm_event(enum vm_event_item item)
+{
+ return 0;
+}
static inline void vm_events_fold_cpu(int cpu)
{
}
@@ -163,12 +169,10 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone,
#ifdef CONFIG_NUMA
extern unsigned long node_page_state(int node, enum zone_stat_item item);
-extern void zone_statistics(struct zone *, struct zone *, gfp_t gfp);
#else
#define node_page_state(node, item) global_page_state(item)
-#define zone_statistics(_zl, _z, gfp) do { } while (0)
#endif /* CONFIG_NUMA */
@@ -193,6 +197,10 @@ void quiet_vmstat(void);
void cpu_vm_stats_fold(int cpu);
void refresh_zone_stat_thresholds(void);
+struct ctl_table;
+int vmstat_refresh(struct ctl_table *, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
+
void drain_zonestat(struct zone *zone, struct per_cpu_pageset *);
int calculate_pressure_threshold(struct zone *zone);
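
sum_vm_event() returns a single event counter summed across CPUs, with a zero stub when VM event counting is disabled. A hedged example, assuming the PGFAULT item exists in this tree:

static unsigned long total_page_faults(void)
{
	return sum_vm_event(PGFAULT);	/* PGFAULT assumed from enum vm_event_item */
}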
diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
index 34eb16098a33..57a8e98f2708 100644
--- a/include/linux/zsmalloc.h
+++ b/include/linux/zsmalloc.h
@@ -41,10 +41,10 @@ struct zs_pool_stats {
struct zs_pool;
-struct zs_pool *zs_create_pool(const char *name, gfp_t flags);
+struct zs_pool *zs_create_pool(const char *name);
void zs_destroy_pool(struct zs_pool *pool);
-unsigned long zs_malloc(struct zs_pool *pool, size_t size);
+unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags);
void zs_free(struct zs_pool *pool, unsigned long obj);
void *zs_map_object(struct zs_pool *pool, unsigned long handle,
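
The gfp mask moves from pool creation time to allocation time. An illustrative sketch of the new calling convention (pool lifetime management deliberately omitted):

static unsigned long zsmalloc_api_sketch(size_t size)
{
	struct zs_pool *pool = zs_create_pool("example");	/* no gfp argument any more */

	if (!pool)
		return 0;
	return zs_malloc(pool, size, GFP_KERNEL);		/* gfp supplied per allocation */
}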
diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
index e215bf68f521..36e2d6fb1360 100644
--- a/include/trace/events/compaction.h
+++ b/include/trace/events/compaction.h
@@ -10,10 +10,11 @@
#include <trace/events/mmflags.h>
#define COMPACTION_STATUS \
- EM( COMPACT_DEFERRED, "deferred") \
EM( COMPACT_SKIPPED, "skipped") \
+ EM( COMPACT_DEFERRED, "deferred") \
EM( COMPACT_CONTINUE, "continue") \
EM( COMPACT_PARTIAL, "partial") \
+ EM( COMPACT_PARTIAL_SKIPPED, "partial_skipped") \
EM( COMPACT_COMPLETE, "complete") \
EM( COMPACT_NO_SUITABLE_PAGE, "no_suitable_page") \
EM( COMPACT_NOT_SUITABLE_ZONE, "not_suitable_zone") \
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index 551ba4acde4d..bda21183eb05 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -28,7 +28,8 @@
EM( SCAN_SWAP_CACHE_PAGE, "page_swap_cache") \
EM( SCAN_DEL_PAGE_LRU, "could_not_delete_page_from_lru")\
EM( SCAN_ALLOC_HUGE_PAGE_FAIL, "alloc_huge_page_failed") \
- EMe( SCAN_CGROUP_CHARGE_FAIL, "ccgroup_charge_failed")
+ EM( SCAN_CGROUP_CHARGE_FAIL, "ccgroup_charge_failed") \
+ EMe( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte")
#undef EM
#undef EMe
@@ -45,9 +46,9 @@ SCAN_STATUS
TRACE_EVENT(mm_khugepaged_scan_pmd,
TP_PROTO(struct mm_struct *mm, struct page *page, bool writable,
- bool referenced, int none_or_zero, int status),
+ bool referenced, int none_or_zero, int status, int unmapped),
- TP_ARGS(mm, page, writable, referenced, none_or_zero, status),
+ TP_ARGS(mm, page, writable, referenced, none_or_zero, status, unmapped),
TP_STRUCT__entry(
__field(struct mm_struct *, mm)
@@ -56,6 +57,7 @@ TRACE_EVENT(mm_khugepaged_scan_pmd,
__field(bool, referenced)
__field(int, none_or_zero)
__field(int, status)
+ __field(int, unmapped)
),
TP_fast_assign(
@@ -65,15 +67,17 @@ TRACE_EVENT(mm_khugepaged_scan_pmd,
__entry->referenced = referenced;
__entry->none_or_zero = none_or_zero;
__entry->status = status;
+ __entry->unmapped = unmapped;
),
- TP_printk("mm=%p, scan_pfn=0x%lx, writable=%d, referenced=%d, none_or_zero=%d, status=%s",
+ TP_printk("mm=%p, scan_pfn=0x%lx, writable=%d, referenced=%d, none_or_zero=%d, status=%s, unmapped=%d",
__entry->mm,
__entry->pfn,
__entry->writable,
__entry->referenced,
__entry->none_or_zero,
- __print_symbolic(__entry->status, SCAN_STATUS))
+ __print_symbolic(__entry->status, SCAN_STATUS),
+ __entry->unmapped)
);
TRACE_EVENT(mm_collapse_huge_page,
@@ -131,5 +135,29 @@ TRACE_EVENT(mm_collapse_huge_page_isolate,
__print_symbolic(__entry->status, SCAN_STATUS))
);
+TRACE_EVENT(mm_collapse_huge_page_swapin,
+
+ TP_PROTO(struct mm_struct *mm, int swapped_in, int ret),
+
+ TP_ARGS(mm, swapped_in, ret),
+
+ TP_STRUCT__entry(
+ __field(struct mm_struct *, mm)
+ __field(int, swapped_in)
+ __field(int, ret)
+ ),
+
+ TP_fast_assign(
+ __entry->mm = mm;
+ __entry->swapped_in = swapped_in;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("mm=%p, swapped_in=%d, ret=%d",
+ __entry->mm,
+ __entry->swapped_in,
+ __entry->ret)
+);
+
#endif /* __HUGE_MEMORY_H */
#include <trace/define_trace.h>
diff --git a/include/uapi/linux/uuid.h b/include/uapi/linux/uuid.h
index 786f0773cc33..3738e5fb6a4d 100644
--- a/include/uapi/linux/uuid.h
+++ b/include/uapi/linux/uuid.h
@@ -12,10 +12,6 @@
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#ifndef _UAPI_LINUX_UUID_H_
diff --git a/init/Kconfig b/init/Kconfig
index 48f66142f693..7b82f3f9f277 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -862,6 +862,28 @@ config LOG_CPU_MAX_BUF_SHIFT
13 => 8 KB for each CPU
12 => 4 KB for each CPU
+config NMI_LOG_BUF_SHIFT
+ int "Temporary per-CPU NMI log buffer size (12 => 4KB, 13 => 8KB)"
+ range 10 21
+ default 13
+ depends on PRINTK_NMI
+ help
+ Select the size of a per-CPU buffer where NMI messages are temporarily
+ stored. They are copied to the main log buffer in a safe context
+ to avoid a deadlock. The value defines the size as a power of 2.
+
+ NMI messages are rare and limited. The largest one is when
+ a backtrace is printed. It usually fits into 4KB. Select
+ 8KB if you want to be on the safe side.
+
+ Examples:
+ 17 => 128 KB for each CPU
+ 16 => 64 KB for each CPU
+ 15 => 32 KB for each CPU
+ 14 => 16 KB for each CPU
+ 13 => 8 KB for each CPU
+ 12 => 4 KB for each CPU
+
#
# Architectures with an unreliable sched_clock() should select this:
#
@@ -1467,6 +1489,11 @@ config PRINTK
very difficult to diagnose system problems, saying N here is
strongly discouraged.
+config PRINTK_NMI
+ def_bool y
+ depends on PRINTK
+ depends on HAVE_NMI
+
config BUG
bool "BUG() support" if EXPERT
default y
@@ -1755,6 +1782,15 @@ config SLOB
endchoice
+config SLAB_FREELIST_RANDOM
+ default n
+ depends on SLAB
+ bool "SLAB freelist randomization"
+ help
+ Randomizes the freelist order used when creating new SLABs. This
+ security feature reduces the predictability of the kernel slab
+ allocator against heap overflows.
+
config SLUB_CPU_PARTIAL
default y
depends on SLUB && SMP
diff --git a/init/main.c b/init/main.c
index b3c6e363ae18..4c17fda5c2ff 100644
--- a/init/main.c
+++ b/init/main.c
@@ -569,6 +569,7 @@ asmlinkage __visible void __init start_kernel(void)
timekeeping_init();
time_init();
sched_clock_postinit();
+ printk_nmi_init();
perf_event_init();
profile_init();
call_function_init();
@@ -706,21 +707,20 @@ static int __init initcall_blacklist(char *str)
static bool __init_or_module initcall_blacklisted(initcall_t fn)
{
struct blacklist_entry *entry;
- char *fn_name;
+ char fn_name[KSYM_SYMBOL_LEN];
- fn_name = kasprintf(GFP_KERNEL, "%pf", fn);
- if (!fn_name)
+ if (list_empty(&blacklisted_initcalls))
return false;
+ sprint_symbol_no_offset(fn_name, (unsigned long)fn);
+
list_for_each_entry(entry, &blacklisted_initcalls, next) {
if (!strcmp(fn_name, entry->buf)) {
pr_debug("initcall %s blacklisted\n", fn_name);
- kfree(fn_name);
return true;
}
}
- kfree(fn_name);
return false;
}
#else
diff --git a/ipc/msg.c b/ipc/msg.c
index 1471db9a7e61..59559a215401 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -37,6 +37,7 @@
#include <linux/rwsem.h>
#include <linux/nsproxy.h>
#include <linux/ipc_namespace.h>
+#include <linux/freezer.h>
#include <asm/current.h>
#include <linux/uaccess.h>
@@ -675,7 +676,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
ipc_unlock_object(&msq->q_perm);
rcu_read_unlock();
- schedule();
+ freezable_schedule();
rcu_read_lock();
ipc_lock_object(&msq->q_perm);
@@ -917,7 +918,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgfl
ipc_unlock_object(&msq->q_perm);
rcu_read_unlock();
- schedule();
+ freezable_schedule();
/* Lockless receive, part 1:
* Disable preemption. We don't hold a reference to the queue
diff --git a/ipc/sem.c b/ipc/sem.c
index b3757ea0694b..11d9e605a619 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -162,14 +162,21 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
/*
* Locking:
+ * a) global sem_lock() for read/write
* sem_undo.id_next,
* sem_array.complex_count,
- * sem_array.pending{_alter,_cont},
- * sem_array.sem_undo: global sem_lock() for read/write
- * sem_undo.proc_next: only "current" is allowed to read/write that field.
+ * sem_array.complex_mode
+ * sem_array.pending{_alter,_const},
+ * sem_array.sem_undo
*
+ * b) global or semaphore sem_lock() for read/write:
* sem_array.sem_base[i].pending_{const,alter}:
- * global or semaphore sem_lock() for read/write
+ * sem_array.complex_mode (for read)
+ *
+ * c) special:
+ * sem_undo_list.list_proc:
+ * * undo_list->lock for write
+ * * rcu for read
*/
#define sc_semmsl sem_ctls[0]
@@ -270,23 +277,25 @@ static void sem_rcu_free(struct rcu_head *head)
#define ipc_smp_acquire__after_spin_is_unlocked() smp_rmb()
/*
- * Wait until all currently ongoing simple ops have completed.
+ * Enter the mode suitable for non-simple operations:
* Caller must own sem_perm.lock.
- * New simple ops cannot start, because simple ops first check
- * that sem_perm.lock is free.
- * that a) sem_perm.lock is free and b) complex_count is 0.
*/
-static void sem_wait_array(struct sem_array *sma)
+static void complexmode_enter(struct sem_array *sma)
{
int i;
struct sem *sem;
- if (sma->complex_count) {
- /* The thread that increased sma->complex_count waited on
- * all sem->lock locks. Thus we don't need to wait again.
- */
+ if (sma->complex_mode) {
+ /* We are already in complex_mode. Nothing to do */
return;
}
+ WRITE_ONCE(sma->complex_mode, true);
+
+ /* We need a full barrier:
+ * The write to complex_mode must be visible
+ * before we read the first sem->lock spinlock state.
+ */
+ smp_mb();
for (i = 0; i < sma->sem_nsems; i++) {
sem = sma->sem_base + i;
@@ -296,6 +305,29 @@ static void sem_wait_array(struct sem_array *sma)
}
/*
+ * Try to leave the mode that disallows simple operations:
+ * Caller must own sem_perm.lock.
+ */
+static void complexmode_tryleave(struct sem_array *sma)
+{
+ if (sma->complex_count) {
+ /* Complex ops are sleeping.
+ * We must stay in complex mode
+ */
+ return;
+ }
+ /*
+ * Immediately after setting complex_mode to false,
+ * a simple op can start. Thus: all memory writes
+ * performed by the current operation must be visible
+ * before we set complex_mode to false.
+ */
+ smp_wmb();
+
+ WRITE_ONCE(sma->complex_mode, false);
+}
+
+/*
* If the request contains only one semaphore operation, and there are
* no complex transactions pending, lock only the semaphore involved.
* Otherwise, lock the entire semaphore array, since we either have
@@ -311,56 +343,38 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
/* Complex operation - acquire a full lock */
ipc_lock_object(&sma->sem_perm);
- /* And wait until all simple ops that are processed
- * right now have dropped their locks.
- */
- sem_wait_array(sma);
+ /* Prevent parallel simple ops */
+ complexmode_enter(sma);
return -1;
}
/*
* Only one semaphore affected - try to optimize locking.
- * The rules are:
- * - optimized locking is possible if no complex operation
- * is either enqueued or processed right now.
- * - The test for enqueued complex ops is simple:
- * sma->complex_count != 0
- * - Testing for complex ops that are processed right now is
- * a bit more difficult. Complex ops acquire the full lock
- * and first wait that the running simple ops have completed.
- * (see above)
- * Thus: If we own a simple lock and the global lock is free
- * and complex_count is now 0, then it will stay 0 and
- * thus just locking sem->lock is sufficient.
+ * Optimized locking is possible if no complex operation
+ * is either enqueued or processed right now.
+ *
+ * Both facts are tracked by complex_mode.
*/
sem = sma->sem_base + sops->sem_num;
- if (sma->complex_count == 0) {
+ /*
+ * Initial check for complex_mode. Just an optimization,
+ * no locking.
+ */
+ if (!READ_ONCE(sma->complex_mode)) {
/*
* It appears that no complex operation is around.
* Acquire the per-semaphore lock.
*/
spin_lock(&sem->lock);
- /* Then check that the global lock is free */
- if (!spin_is_locked(&sma->sem_perm.lock)) {
- /*
- * We need a memory barrier with acquire semantics,
- * otherwise we can race with another thread that does:
- * complex_count++;
- * spin_unlock(sem_perm.lock);
- */
- ipc_smp_acquire__after_spin_is_unlocked();
-
- /*
- * Now repeat the test of complex_count:
- * It can't change anymore until we drop sem->lock.
- * Thus: if is now 0, then it will stay 0.
- */
- if (sma->complex_count == 0) {
- /* fast path successful! */
- return sops->sem_num;
- }
+ /* Now repeat the test for complex_mode.
+ * A memory barrier is provided by the spin_lock()
+ * above.
+ */
+ if (!READ_ONCE(sma->complex_mode)) {
+ /* fast path successful! */
+ return sops->sem_num;
}
spin_unlock(&sem->lock);
}
@@ -380,7 +394,7 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
/* Not a false alarm, thus complete the sequence for a
* full lock.
*/
- sem_wait_array(sma);
+ complexmode_enter(sma);
return -1;
}
}
@@ -389,6 +403,7 @@ static inline void sem_unlock(struct sem_array *sma, int locknum)
{
if (locknum == -1) {
unmerge_queues(sma);
+ complexmode_tryleave(sma);
ipc_unlock_object(&sma->sem_perm);
} else {
struct sem *sem = sma->sem_base + locknum;
@@ -540,6 +555,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params)
}
sma->complex_count = 0;
+ sma->complex_mode = true; /* dropped by sem_unlock below */
INIT_LIST_HEAD(&sma->pending_alter);
INIT_LIST_HEAD(&sma->pending_const);
INIT_LIST_HEAD(&sma->list_id);
@@ -2195,10 +2211,10 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
/*
* The proc interface isn't aware of sem_lock(), it calls
* ipc_lock_object() directly (in sysvipc_find_ipc).
- * In order to stay compatible with sem_lock(), we must wait until
- * all simple semop() calls have left their critical regions.
+ * In order to stay compatible with sem_lock(), we must
+ * enter / leave complex_mode.
*/
- sem_wait_array(sma);
+ complexmode_enter(sma);
sem_otime = get_semotime(sma);
@@ -2215,6 +2231,8 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
sem_otime,
sma->sem_ctime);
+ complexmode_tryleave(sma);
+
return 0;
}
#endif
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1902956baba1..73e93e53884d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,7 +61,7 @@
#include <linux/cgroup.h>
#include <linux/wait.h>
-struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;
+DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
/* See "Frequency meter" comments, below. */
@@ -2528,27 +2528,27 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* GFP_KERNEL - any node in enclosing hardwalled cpuset ok
* GFP_USER - only nodes in current tasks mems allowed ok.
*/
-int __cpuset_node_allowed(int node, gfp_t gfp_mask)
+bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
{
struct cpuset *cs; /* current cpuset ancestors */
int allowed; /* is allocation in zone z allowed? */
unsigned long flags;
if (in_interrupt())
- return 1;
+ return true;
if (node_isset(node, current->mems_allowed))
- return 1;
+ return true;
/*
* Allow tasks that have access to memory reserves because they have
* been OOM killed to get memory anywhere.
*/
if (unlikely(test_thread_flag(TIF_MEMDIE)))
- return 1;
+ return true;
if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
- return 0;
+ return false;
if (current->flags & PF_EXITING) /* Let dying task have memory */
- return 1;
+ return true;
/* Not hardwall and node outside mems_allowed: scan up cpusets */
spin_lock_irqsave(&callback_lock, flags);
@@ -2591,13 +2591,7 @@ int __cpuset_node_allowed(int node, gfp_t gfp_mask)
static int cpuset_spread_node(int *rotor)
{
- int node;
-
- node = next_node(*rotor, current->mems_allowed);
- if (node == MAX_NUMNODES)
- node = first_node(current->mems_allowed);
- *rotor = node;
- return node;
+ return *rotor = next_node_in(*rotor, current->mems_allowed);
}
int cpuset_mem_spread_node(void)
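
The rotor update collapses into next_node_in(). Based on the code it replaces, that helper is assumed to behave like this wrap-around sketch:

static int next_node_in_sketch(int node, const nodemask_t *mask)
{
	node = next_node(node, *mask);		/* advance past the current node */
	if (node == MAX_NUMNODES)
		node = first_node(*mask);	/* wrap back to the first set node */
	return node;
}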
diff --git a/kernel/exit.c b/kernel/exit.c
index fd90195667e1..9e6e1356e6bb 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -746,7 +746,7 @@ void do_exit(long code)
disassociate_ctty(1);
exit_task_namespaces(tsk);
exit_task_work(tsk);
- exit_thread();
+ exit_thread(tsk);
/*
* Flush inherited counters to the parent - before the parent
@@ -918,17 +918,28 @@ static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
task_pid_type(p, wo->wo_type) == wo->wo_pid;
}
-static int eligible_child(struct wait_opts *wo, struct task_struct *p)
+static int
+eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p)
{
if (!eligible_pid(wo, p))
return 0;
- /* Wait for all children (clone and not) if __WALL is set;
- * otherwise, wait for clone children *only* if __WCLONE is
- * set; otherwise, wait for non-clone children *only*. (Note:
- * A "clone" child here is one that reports to its parent
- * using a signal other than SIGCHLD.) */
- if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
- && !(wo->wo_flags & __WALL))
+
+ /*
+ * Wait for all children (clone and not) if __WALL is set or
+ * if it is traced by us.
+ */
+ if (ptrace || (wo->wo_flags & __WALL))
+ return 1;
+
+ /*
+ * Otherwise, wait for clone children *only* if __WCLONE is set;
+ * otherwise, wait for non-clone children *only*.
+ *
+ * Note: a "clone" child here is one that reports to its parent
+ * using a signal other than SIGCHLD, or a non-leader thread which
+ * we can only see if it is traced by us.
+ */
+ if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
return 0;
return 1;
@@ -1300,7 +1311,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
if (unlikely(exit_state == EXIT_DEAD))
return 0;
- ret = eligible_child(wo, p);
+ ret = eligible_child(wo, ptrace, p);
if (!ret)
return ret;
@@ -1524,7 +1535,8 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
enum pid_type type;
long ret;
- if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED))
+ if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
+ __WNOTHREAD|__WCLONE|__WALL))
return -EINVAL;
if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
return -EINVAL;
diff --git a/kernel/fork.c b/kernel/fork.c
index cc584b6a7dcb..7220bca0bfa6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -699,6 +699,26 @@ void __mmdrop(struct mm_struct *mm)
}
EXPORT_SYMBOL_GPL(__mmdrop);
+static inline void __mmput(struct mm_struct *mm)
+{
+ VM_BUG_ON(atomic_read(&mm->mm_users));
+
+ uprobe_clear_state(mm);
+ exit_aio(mm);
+ ksm_exit(mm);
+ khugepaged_exit(mm); /* must run before exit_mmap */
+ exit_mmap(mm);
+ set_mm_exe_file(mm, NULL);
+ if (!list_empty(&mm->mmlist)) {
+ spin_lock(&mmlist_lock);
+ list_del(&mm->mmlist);
+ spin_unlock(&mmlist_lock);
+ }
+ if (mm->binfmt)
+ module_put(mm->binfmt->module);
+ mmdrop(mm);
+}
+
/*
* Decrement the use count and release all resources for an mm.
*/
@@ -706,24 +726,24 @@ void mmput(struct mm_struct *mm)
{
might_sleep();
+ if (atomic_dec_and_test(&mm->mm_users))
+ __mmput(mm);
+}
+EXPORT_SYMBOL_GPL(mmput);
+
+static void mmput_async_fn(struct work_struct *work)
+{
+ struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
+ __mmput(mm);
+}
+
+void mmput_async(struct mm_struct *mm)
+{
if (atomic_dec_and_test(&mm->mm_users)) {
- uprobe_clear_state(mm);
- exit_aio(mm);
- ksm_exit(mm);
- khugepaged_exit(mm); /* must run before exit_mmap */
- exit_mmap(mm);
- set_mm_exe_file(mm, NULL);
- if (!list_empty(&mm->mmlist)) {
- spin_lock(&mmlist_lock);
- list_del(&mm->mmlist);
- spin_unlock(&mmlist_lock);
- }
- if (mm->binfmt)
- module_put(mm->binfmt->module);
- mmdrop(mm);
+ INIT_WORK(&mm->async_put_work, mmput_async_fn);
+ schedule_work(&mm->async_put_work);
}
}
-EXPORT_SYMBOL_GPL(mmput);
/**
* set_mm_exe_file - change a reference to the mm's executable file
@@ -1476,7 +1496,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
pid = alloc_pid(p->nsproxy->pid_ns_for_children);
if (IS_ERR(pid)) {
retval = PTR_ERR(pid);
- goto bad_fork_cleanup_io;
+ goto bad_fork_cleanup_thread;
}
}
@@ -1639,6 +1659,8 @@ bad_fork_cancel_cgroup:
bad_fork_free_pid:
if (pid != &init_struct_pid)
free_pid(pid);
+bad_fork_cleanup_thread:
+ exit_thread(p);
bad_fork_cleanup_io:
if (p->io_context)
exit_io_context(p);
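
mmput_async() exists so that the potentially sleeping __mmput() teardown can be deferred to a workqueue; per the sched.h comment earlier in this patch it may be called from atomic context. A thin usage sketch:

static void drop_mm_ref_nosleep(struct mm_struct *mm)
{
	/* mmput() may sleep in __mmput(); the async variant defers that work */
	mmput_async(mm);
}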
diff --git a/kernel/futex.c b/kernel/futex.c
index 913f05cfe8b8..913b3e6e71f1 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -3666,8 +3666,10 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
int cmd = op & FUTEX_CMD_MASK;
unsigned int flags = 0;
+#ifdef CONFIG_MMU
if (!(op & FUTEX_PRIVATE_FLAG))
flags |= FLAGS_SHARED;
+#endif
if (op & FUTEX_CLOCK_REALTIME) {
flags |= FLAGS_CLOCKRT;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index d65f6f31a5b3..8798b6c9e945 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -139,12 +139,7 @@ void irq_domain_remove(struct irq_domain *domain)
{
mutex_lock(&irq_domain_mutex);
- /*
- * radix_tree_delete() takes care of destroying the root
- * node when all entries are removed. Shout if there are
- * any mappings left.
- */
- WARN_ON(domain->revmap_tree.height);
+ WARN_ON(!radix_tree_empty(&domain->revmap_tree));
list_del(&domain->link);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index ee70aef5cd81..4384672d3245 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -103,6 +103,65 @@ out_free_image:
return ret;
}
+static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
+ struct kexec_segment __user *segments, unsigned long flags)
+{
+ struct kimage **dest_image, *image;
+ unsigned long i;
+ int ret;
+
+ if (flags & KEXEC_ON_CRASH) {
+ dest_image = &kexec_crash_image;
+ if (kexec_crash_image)
+ arch_kexec_unprotect_crashkres();
+ } else {
+ dest_image = &kexec_image;
+ }
+
+ if (nr_segments == 0) {
+ /* Uninstall image */
+ kimage_free(xchg(dest_image, NULL));
+ return 0;
+ }
+ if (flags & KEXEC_ON_CRASH) {
+ /*
+ * Loading another kernel to switch to if this one
+ * crashes. Free any current crash dump kernel before
+ * we corrupt it.
+ */
+ kimage_free(xchg(&kexec_crash_image, NULL));
+ }
+
+ ret = kimage_alloc_init(&image, entry, nr_segments, segments, flags);
+ if (ret)
+ return ret;
+
+ if (flags & KEXEC_PRESERVE_CONTEXT)
+ image->preserve_context = 1;
+
+ ret = machine_kexec_prepare(image);
+ if (ret)
+ goto out;
+
+ for (i = 0; i < nr_segments; i++) {
+ ret = kimage_load_segment(image, &image->segment[i]);
+ if (ret)
+ goto out;
+ }
+
+ kimage_terminate(image);
+
+ /* Install the new kernel and uninstall the old */
+ image = xchg(dest_image, image);
+
+out:
+ if ((flags & KEXEC_ON_CRASH) && kexec_crash_image)
+ arch_kexec_protect_crashkres();
+
+ kimage_free(image);
+ return ret;
+}
+
/*
* Exec Kernel system call: for obvious reasons only root may call it.
*
@@ -127,7 +186,6 @@ out_free_image:
SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
struct kexec_segment __user *, segments, unsigned long, flags)
{
- struct kimage **dest_image, *image;
int result;
/* We only trust the superuser with rebooting the system. */
@@ -152,9 +210,6 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
if (nr_segments > KEXEC_SEGMENT_MAX)
return -EINVAL;
- image = NULL;
- result = 0;
-
/* Because we write directly to the reserved memory
* region when loading crash kernels we need a mutex here to
* prevent multiple crash kernels from attempting to load
@@ -166,53 +221,9 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
if (!mutex_trylock(&kexec_mutex))
return -EBUSY;
- dest_image = &kexec_image;
- if (flags & KEXEC_ON_CRASH)
- dest_image = &kexec_crash_image;
- if (nr_segments > 0) {
- unsigned long i;
-
- if (flags & KEXEC_ON_CRASH) {
- /*
- * Loading another kernel to switch to if this one
- * crashes. Free any current crash dump kernel before
- * we corrupt it.
- */
-
- kimage_free(xchg(&kexec_crash_image, NULL));
- result = kimage_alloc_init(&image, entry, nr_segments,
- segments, flags);
- crash_map_reserved_pages();
- } else {
- /* Loading another kernel to reboot into. */
-
- result = kimage_alloc_init(&image, entry, nr_segments,
- segments, flags);
- }
- if (result)
- goto out;
-
- if (flags & KEXEC_PRESERVE_CONTEXT)
- image->preserve_context = 1;
- result = machine_kexec_prepare(image);
- if (result)
- goto out;
-
- for (i = 0; i < nr_segments; i++) {
- result = kimage_load_segment(image, &image->segment[i]);
- if (result)
- goto out;
- }
- kimage_terminate(image);
- if (flags & KEXEC_ON_CRASH)
- crash_unmap_reserved_pages();
- }
- /* Install the new kernel, and Uninstall the old */
- image = xchg(dest_image, image);
+ result = do_kexec_load(entry, nr_segments, segments, flags);
-out:
mutex_unlock(&kexec_mutex);
- kimage_free(image);
return result;
}
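
do_kexec_load() now brackets crash-kernel loads with the new arch hooks instead of crash_{map,unmap}_reserved_pages(). A condensed, hypothetical view of that bracket (allocation and error paths omitted):

static void crashkres_bracket_sketch(void)
{
	if (kexec_crash_image)
		arch_kexec_unprotect_crashkres();	/* open the reserved region for writing */

	/* ... allocate the image and load its segments ... */

	if (kexec_crash_image)
		arch_kexec_protect_crashkres();		/* re-protect once loading is done */
}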
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 1391d3ee3b86..56b3ed0927b0 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -893,6 +893,7 @@ void crash_kexec(struct pt_regs *regs)
old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
if (old_cpu == PANIC_CPU_INVALID) {
/* This is the 1st CPU which comes here, so go ahead. */
+ printk_nmi_flush_on_panic();
__crash_kexec(regs);
/*
@@ -953,7 +954,6 @@ int crash_shrink_memory(unsigned long new_size)
start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
- crash_map_reserved_pages();
crash_free_reserved_phys_range(end, crashk_res.end);
if ((start == end) && (crashk_res.parent != NULL))
@@ -967,7 +967,6 @@ int crash_shrink_memory(unsigned long new_size)
crashk_res.end = end - 1;
insert_resource(&iomem_resource, ram_res);
- crash_unmap_reserved_pages();
unlock:
mutex_unlock(&kexec_mutex);
@@ -1410,7 +1409,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_STRUCT_SIZE(list_head);
VMCOREINFO_SIZE(nodemask_t);
VMCOREINFO_OFFSET(page, flags);
- VMCOREINFO_OFFSET(page, _count);
+ VMCOREINFO_OFFSET(page, _refcount);
VMCOREINFO_OFFSET(page, mapping);
VMCOREINFO_OFFSET(page, lru);
VMCOREINFO_OFFSET(page, _mapcount);
@@ -1552,13 +1551,14 @@ int kernel_kexec(void)
}
/*
- * Add and remove page tables for crashkernel memory
+ * Protection mechanism for crashkernel reserved memory after
+ * the kdump kernel is loaded.
*
* Provide an empty default implementation here -- architecture
* code may override this
*/
-void __weak crash_map_reserved_pages(void)
+void __weak arch_kexec_protect_crashkres(void)
{}
-void __weak crash_unmap_reserved_pages(void)
+void __weak arch_kexec_unprotect_crashkres(void)
{}
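
Illustrative sketch, not part of the patch: the renamed stubs above use the usual weak-symbol pattern, where an architecture's strong definition of arch_kexec_protect_crashkres()/arch_kexec_unprotect_crashkres() replaces the empty default at link time. A minimal userspace analogue (GCC/Clang attribute syntax assumed, function name made up):

#include <stdio.h>

/* Default no-op, analogous to the __weak stubs above. */
__attribute__((weak)) void arch_protect_sketch(void)
{
	printf("generic: nothing to protect\n");
}

/*
 * An "architecture" would provide a strong definition elsewhere in the
 * program, e.g.:
 *
 *	void arch_protect_sketch(void)
 *	{
 *		printf("arch: make the crashkernel region inaccessible\n");
 *	}
 *
 * and the linker would pick it over the weak default.
 */

int main(void)
{
	arch_protect_sketch();
	return 0;
}
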
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index c72d2ff5896e..503bc2d348e5 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -274,8 +274,11 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
return -EBUSY;
dest_image = &kexec_image;
- if (flags & KEXEC_FILE_ON_CRASH)
+ if (flags & KEXEC_FILE_ON_CRASH) {
dest_image = &kexec_crash_image;
+ if (kexec_crash_image)
+ arch_kexec_unprotect_crashkres();
+ }
if (flags & KEXEC_FILE_UNLOAD)
goto exchange;
@@ -324,6 +327,9 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
exchange:
image = xchg(dest_image, image);
out:
+ if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image)
+ arch_kexec_protect_crashkres();
+
mutex_unlock(&kexec_mutex);
kimage_free(image);
return ret;
diff --git a/kernel/padata.c b/kernel/padata.c
index b38bea9c466a..993278895ccc 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -607,33 +607,6 @@ out_replace:
}
/**
- * padata_set_cpumasks - Set both parallel and serial cpumasks. The first
- * one is used by parallel workers and the second one
- * by the wokers doing serialization.
- *
- * @pinst: padata instance
- * @pcpumask: the cpumask to use for parallel workers
- * @cbcpumask: the cpumsak to use for serial workers
- */
-int padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t pcpumask,
- cpumask_var_t cbcpumask)
-{
- int err;
-
- mutex_lock(&pinst->lock);
- get_online_cpus();
-
- err = __padata_set_cpumasks(pinst, pcpumask, cbcpumask);
-
- put_online_cpus();
- mutex_unlock(&pinst->lock);
-
- return err;
-
-}
-EXPORT_SYMBOL(padata_set_cpumasks);
-
-/**
* padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value
* equivalent to @cpumask.
*
@@ -674,6 +647,43 @@ out:
}
EXPORT_SYMBOL(padata_set_cpumask);
+/**
+ * padata_start - start the parallel processing
+ *
+ * @pinst: padata instance to start
+ */
+int padata_start(struct padata_instance *pinst)
+{
+ int err = 0;
+
+ mutex_lock(&pinst->lock);
+
+ if (pinst->flags & PADATA_INVALID)
+ err = -EINVAL;
+
+ __padata_start(pinst);
+
+ mutex_unlock(&pinst->lock);
+
+ return err;
+}
+EXPORT_SYMBOL(padata_start);
+
+/**
+ * padata_stop - stop the parallel processing
+ *
+ * @pinst: padata instance to stop
+ */
+void padata_stop(struct padata_instance *pinst)
+{
+ mutex_lock(&pinst->lock);
+ __padata_stop(pinst);
+ mutex_unlock(&pinst->lock);
+}
+EXPORT_SYMBOL(padata_stop);
+
+#ifdef CONFIG_HOTPLUG_CPU
+
static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
{
struct parallel_data *pd;
@@ -694,42 +704,6 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
return 0;
}
- /**
- * padata_add_cpu - add a cpu to one or both(parallel and serial)
- * padata cpumasks.
- *
- * @pinst: padata instance
- * @cpu: cpu to add
- * @mask: bitmask of flags specifying to which cpumask @cpu shuld be added.
- * The @mask may be any combination of the following flags:
- * PADATA_CPU_SERIAL - serial cpumask
- * PADATA_CPU_PARALLEL - parallel cpumask
- */
-
-int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask)
-{
- int err;
-
- if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL)))
- return -EINVAL;
-
- mutex_lock(&pinst->lock);
-
- get_online_cpus();
- if (mask & PADATA_CPU_SERIAL)
- cpumask_set_cpu(cpu, pinst->cpumask.cbcpu);
- if (mask & PADATA_CPU_PARALLEL)
- cpumask_set_cpu(cpu, pinst->cpumask.pcpu);
-
- err = __padata_add_cpu(pinst, cpu);
- put_online_cpus();
-
- mutex_unlock(&pinst->lock);
-
- return err;
-}
-EXPORT_SYMBOL(padata_add_cpu);
-
static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
{
struct parallel_data *pd = NULL;
@@ -789,43 +763,6 @@ int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask)
}
EXPORT_SYMBOL(padata_remove_cpu);
-/**
- * padata_start - start the parallel processing
- *
- * @pinst: padata instance to start
- */
-int padata_start(struct padata_instance *pinst)
-{
- int err = 0;
-
- mutex_lock(&pinst->lock);
-
- if (pinst->flags & PADATA_INVALID)
- err =-EINVAL;
-
- __padata_start(pinst);
-
- mutex_unlock(&pinst->lock);
-
- return err;
-}
-EXPORT_SYMBOL(padata_start);
-
-/**
- * padata_stop - stop the parallel processing
- *
- * @pinst: padata instance to stop
- */
-void padata_stop(struct padata_instance *pinst)
-{
- mutex_lock(&pinst->lock);
- __padata_stop(pinst);
- mutex_unlock(&pinst->lock);
-}
-EXPORT_SYMBOL(padata_stop);
-
-#ifdef CONFIG_HOTPLUG_CPU
-
static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu)
{
return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) ||
@@ -1091,7 +1028,6 @@ err_free_inst:
err:
return NULL;
}
-EXPORT_SYMBOL(padata_alloc);
/**
* padata_free - free a padata instance
diff --git a/kernel/panic.c b/kernel/panic.c
index 535c96510a44..8aa74497cc5a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -160,8 +160,10 @@ void panic(const char *fmt, ...)
*
* Bypass the panic_cpu check and call __crash_kexec directly.
*/
- if (!crash_kexec_post_notifiers)
+ if (!crash_kexec_post_notifiers) {
+ printk_nmi_flush_on_panic();
__crash_kexec(NULL);
+ }
/*
* Note smp_send_stop is the usual smp shutdown function, which
@@ -176,6 +178,8 @@ void panic(const char *fmt, ...)
*/
atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
+ /* Call flush even twice. It tries harder with a single online CPU */
+ printk_nmi_flush_on_panic();
kmsg_dump(KMSG_DUMP_PANIC);
/*
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index 85405bdcf2b3..abb0042a427b 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -1,2 +1,3 @@
obj-y = printk.o
+obj-$(CONFIG_PRINTK_NMI) += nmi.o
obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
new file mode 100644
index 000000000000..7fd2838fa417
--- /dev/null
+++ b/kernel/printk/internal.h
@@ -0,0 +1,57 @@
+/*
+ * internal.h - printk internal definitions
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/percpu.h>
+
+typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args);
+
+int __printf(1, 0) vprintk_default(const char *fmt, va_list args);
+
+#ifdef CONFIG_PRINTK_NMI
+
+extern raw_spinlock_t logbuf_lock;
+
+/*
+ * printk() cannot take logbuf_lock in NMI context. Instead,
+ * it temporarily stores the strings in a per-CPU buffer.
+ * The alternative implementation is chosen transparently
+ * via per-CPU variable.
+ */
+DECLARE_PER_CPU(printk_func_t, printk_func);
+static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
+{
+ return this_cpu_read(printk_func)(fmt, args);
+}
+
+extern atomic_t nmi_message_lost;
+static inline int get_nmi_message_lost(void)
+{
+ return atomic_xchg(&nmi_message_lost, 0);
+}
+
+#else /* CONFIG_PRINTK_NMI */
+
+static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
+{
+ return vprintk_default(fmt, args);
+}
+
+static inline int get_nmi_message_lost(void)
+{
+ return 0;
+}
+
+#endif /* CONFIG_PRINTK_NMI */
diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c
new file mode 100644
index 000000000000..b69eb8a2876f
--- /dev/null
+++ b/kernel/printk/nmi.c
@@ -0,0 +1,260 @@
+/*
+ * nmi.c - Safe printk in NMI context
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/preempt.h>
+#include <linux/spinlock.h>
+#include <linux/debug_locks.h>
+#include <linux/smp.h>
+#include <linux/cpumask.h>
+#include <linux/irq_work.h>
+#include <linux/printk.h>
+
+#include "internal.h"
+
+/*
+ * printk() cannot take logbuf_lock in NMI context. Instead,
+ * it uses an alternative implementation that temporarily stores
+ * the strings into a per-CPU buffer. The content of the buffer
+ * is later flushed into the main ring buffer via IRQ work.
+ *
+ * The alternative implementation is chosen transparently
+ * via @printk_func per-CPU variable.
+ *
+ * The implementation also allows the strings to be flushed from another CPU.
+ * There are situations when we want to make sure that all buffers
+ * were handled or when IRQs are blocked.
+ */
+DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
+static int printk_nmi_irq_ready;
+atomic_t nmi_message_lost;
+
+#define NMI_LOG_BUF_LEN ((1 << CONFIG_NMI_LOG_BUF_SHIFT) - \
+ sizeof(atomic_t) - sizeof(struct irq_work))
+
+struct nmi_seq_buf {
+ atomic_t len; /* length of written data */
+ struct irq_work work; /* IRQ work that flushes the buffer */
+ unsigned char buffer[NMI_LOG_BUF_LEN];
+};
+static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
+
+/*
+ * Safe printk() for NMI context. It uses a per-CPU buffer to
+ * store the message. NMIs are not nested, so there is always only
+ * one writer running. But the buffer might get flushed from another
+ * CPU, so we need to be careful.
+ */
+static int vprintk_nmi(const char *fmt, va_list args)
+{
+ struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
+ int add = 0;
+ size_t len;
+
+again:
+ len = atomic_read(&s->len);
+
+ if (len >= sizeof(s->buffer)) {
+ atomic_inc(&nmi_message_lost);
+ return 0;
+ }
+
+ /*
+ * Make sure that all old data have been read before the buffer was
+ * reset. This is not needed when we just append data.
+ */
+ if (!len)
+ smp_rmb();
+
+ add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args);
+
+ /*
+ * Do it once again if the buffer has been flushed in the meantime.
+ * Note that atomic_cmpxchg() is an implicit memory barrier that
+ * makes sure that the data were written before updating s->len.
+ */
+ if (atomic_cmpxchg(&s->len, len, len + add) != len)
+ goto again;
+
+ /* Get flushed in a more safe context. */
+ if (add && printk_nmi_irq_ready) {
+ /* Make sure that IRQ work is really initialized. */
+ smp_rmb();
+ irq_work_queue(&s->work);
+ }
+
+ return add;
+}
+
+/*
+ * printk one line from the temporary buffer from @start index until
+ * and including the @end index.
+ */
+static void print_nmi_seq_line(struct nmi_seq_buf *s, int start, int end)
+{
+ const char *buf = s->buffer + start;
+
+ /*
+ * The buffers are flushed in NMI only on panic. The messages must
+ * go only into the ring buffer at this stage. Consoles will get
+ * explicitly called later when a crashdump is not generated.
+ */
+ if (in_nmi())
+ printk_deferred("%.*s", (end - start) + 1, buf);
+ else
+ printk("%.*s", (end - start) + 1, buf);
+
+}
+
+/*
+ * Flush data from the associated per_CPU buffer. The function
+ * can be called either via IRQ work or independently.
+ */
+static void __printk_nmi_flush(struct irq_work *work)
+{
+ static raw_spinlock_t read_lock =
+ __RAW_SPIN_LOCK_INITIALIZER(read_lock);
+ struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work);
+ unsigned long flags;
+ size_t len, size;
+ int i, last_i;
+
+ /*
+ * The lock has two functions. First, one reader has to flush all
+ * available messages to make the lockless synchronization with
+ * writers easier. Second, we do not want to mix messages from
+ * different CPUs. This is especially important when printing
+ * a backtrace.
+ */
+ raw_spin_lock_irqsave(&read_lock, flags);
+
+ i = 0;
+more:
+ len = atomic_read(&s->len);
+
+ /*
+ * This is just a paranoid check that nobody has manipulated
+ * the buffer in an unexpected way. If we printed something then
+ * @len must only increase.
+ */
+ if (i && i >= len)
+ pr_err("printk_nmi_flush: internal error: i=%d >= len=%zu\n",
+ i, len);
+
+ if (!len)
+ goto out; /* Someone else has already flushed the buffer. */
+
+ /* Make sure that data has been written up to the @len */
+ smp_rmb();
+
+ size = min(len, sizeof(s->buffer));
+ last_i = i;
+
+ /* Print line by line. */
+ for (; i < size; i++) {
+ if (s->buffer[i] == '\n') {
+ print_nmi_seq_line(s, last_i, i);
+ last_i = i + 1;
+ }
+ }
+ /* Check if there was a partial line. */
+ if (last_i < size) {
+ print_nmi_seq_line(s, last_i, size - 1);
+ pr_cont("\n");
+ }
+
+ /*
+ * Check that nothing has been added in the meantime and truncate
+ * the buffer. Note that atomic_cmpxchg() is an implicit memory
+ * barrier that makes sure that the data were copied before
+ * updating s->len.
+ */
+ if (atomic_cmpxchg(&s->len, len, 0) != len)
+ goto more;
+
+out:
+ raw_spin_unlock_irqrestore(&read_lock, flags);
+}
+
+/**
+ * printk_nmi_flush - flush all per-cpu nmi buffers.
+ *
+ * The buffers are flushed automatically via IRQ work. This function
+ * is useful only when someone wants to be sure that all buffers have
+ * been flushed at some point.
+ */
+void printk_nmi_flush(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ __printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work);
+}
+
+/**
+ * printk_nmi_flush_on_panic - flush all per-cpu nmi buffers when the system
+ * goes down.
+ *
+ * Similar to printk_nmi_flush() but it can be called even in NMI context when
+ * the system goes down. It makes a best effort to get NMI messages into
+ * the main ring buffer.
+ *
+ * Note that it could try harder when there is only one CPU online.
+ */
+void printk_nmi_flush_on_panic(void)
+{
+ /*
+ * Make sure that we can access the main ring buffer.
+ * Do not risk a double release when more CPUs are up.
+ */
+ if (in_nmi() && raw_spin_is_locked(&logbuf_lock)) {
+ if (num_online_cpus() > 1)
+ return;
+
+ debug_locks_off();
+ raw_spin_lock_init(&logbuf_lock);
+ }
+
+ printk_nmi_flush();
+}
+
+void __init printk_nmi_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct nmi_seq_buf *s = &per_cpu(nmi_print_seq, cpu);
+
+ init_irq_work(&s->work, __printk_nmi_flush);
+ }
+
+ /* Make sure that IRQ works are initialized before enabling. */
+ smp_wmb();
+ printk_nmi_irq_ready = 1;
+
+ /* Flush pending messages that did not have scheduled IRQ works. */
+ printk_nmi_flush();
+}
+
+void printk_nmi_enter(void)
+{
+ this_cpu_write(printk_func, vprintk_nmi);
+}
+
+void printk_nmi_exit(void)
+{
+ this_cpu_write(printk_func, vprintk_default);
+}
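
Illustrative sketch, not part of the patch: the core of vprintk_nmi() is a reserve-then-publish append. The message is formatted into the per-CPU buffer at the current length, and the new length is only committed with a compare-and-swap, so if a concurrent flush reset the buffer in the meantime the write is redone. A condensed userspace analogue using C11 atomics (all names are made up, and the flush is simplified to a single exchange):

#include <stdatomic.h>
#include <stdio.h>

#define BUF_SZ 256

static char buf[BUF_SZ];
static atomic_int buf_len;

static int nmi_style_append(const char *msg)
{
	int len, add;

again:
	len = atomic_load(&buf_len);
	if (len >= BUF_SZ - 1)
		return 0;			/* buffer full: message is lost */

	add = snprintf(buf + len, BUF_SZ - len, "%s", msg);
	if (add >= BUF_SZ - len)
		add = BUF_SZ - len - 1;		/* count only what was stored */

	/* Publish the new length; retry if a concurrent flush reset it. */
	if (!atomic_compare_exchange_strong(&buf_len, &len, len + add))
		goto again;

	return add;
}

static void flush(void)
{
	int len = atomic_exchange(&buf_len, 0);	/* stands in for cmpxchg(len, 0) */

	printf("%.*s", len, buf);
}

int main(void)
{
	nmi_style_append("line from NMI #1\n");
	nmi_style_append("line from NMI #2\n");
	flush();
	return 0;
}
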
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index bfbf284e4218..0a0e789bba14 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -55,6 +55,7 @@
#include "console_cmdline.h"
#include "braille.h"
+#include "internal.h"
int console_printk[4] = {
CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */
@@ -244,7 +245,7 @@ __packed __aligned(4)
* within the scheduler's rq lock. It must be released before calling
* console_unlock() or anything else that might wake up a process.
*/
-static DEFINE_RAW_SPINLOCK(logbuf_lock);
+DEFINE_RAW_SPINLOCK(logbuf_lock);
#ifdef CONFIG_PRINTK
DECLARE_WAIT_QUEUE_HEAD(log_wait);
@@ -424,6 +425,8 @@ static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len,
return msg_used_size(*text_len + *trunc_msg_len, 0, pad_len);
}
+static u64 printk_get_ts(void);
+
/* insert record into the buffer, discard old ones, update heads */
static int log_store(int facility, int level,
enum log_flags flags, u64 ts_nsec,
@@ -472,7 +475,7 @@ static int log_store(int facility, int level,
if (ts_nsec > 0)
msg->ts_nsec = ts_nsec;
else
- msg->ts_nsec = local_clock();
+ msg->ts_nsec = printk_get_ts();
memset(log_dict(msg) + dict_len, 0, pad_len);
msg->len = size;
@@ -1040,8 +1043,74 @@ static inline void boot_delay_msec(int level)
}
#endif
-static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME);
-module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
+static int printk_time = CONFIG_PRINTK_TIME;
+
+/*
+ * Real clock & 32-bit systems: Selecting the real clock printk timestamp may
+ * lead to unlikely situations where a timestamp is wrong because the real time
+ * offset is read without the protection of a sequence lock in the call to
+ * ktime_get_log_ts() in printk_get_ts() below.
+ */
+static int printk_time_param_set(const char *val,
+ const struct kernel_param *kp)
+{
+ char *param = strstrip((char *)val);
+
+ if (strlen(param) != 1)
+ return -EINVAL;
+
+ switch (param[0]) {
+ /* 0/N/n = disabled */
+ case '0':
+ case 'N':
+ case 'n':
+ printk_time = 0;
+ break;
+ /* 1/Y/y = local clock */
+ case '1':
+ case 'Y':
+ case 'y':
+ printk_time = 1;
+ break;
+ /* 2 = monotonic clock */
+ case '2':
+ printk_time = 2;
+ break;
+ /* 3 = real clock */
+ case '3':
+ printk_time = 3;
+ break;
+ default:
+ pr_warn("printk: invalid timestamp value\n");
+ return -EINVAL;
+ break;
+ }
+
+ pr_info("printk: timestamp set to %d.\n", printk_time);
+ return 0;
+}
+
+static struct kernel_param_ops printk_time_param_ops = {
+ .set = printk_time_param_set,
+ .get = param_get_int,
+};
+
+module_param_cb(time, &printk_time_param_ops, &printk_time, S_IRUGO);
+
+static u64 printk_get_ts(void)
+{
+ u64 mono, offset_real;
+
+ if (printk_time <= 1)
+ return local_clock();
+
+ mono = ktime_get_log_ts(&offset_real);
+
+ if (printk_time == 2)
+ return mono;
+
+ return mono + offset_real;
+}
static size_t print_time(u64 ts, char *buf)
{
@@ -1561,7 +1630,7 @@ static bool cont_add(int facility, int level, const char *text, size_t len)
cont.facility = facility;
cont.level = level;
cont.owner = current;
- cont.ts_nsec = local_clock();
+ cont.ts_nsec = printk_get_ts();
cont.flags = 0;
cont.cons = 0;
cont.flushed = false;
@@ -1616,6 +1685,7 @@ asmlinkage int vprintk_emit(int facility, int level,
unsigned long flags;
int this_cpu;
int printed_len = 0;
+ int nmi_message_lost;
bool in_sched = false;
/* cpu currently holding logbuf_lock in this function */
static unsigned int logbuf_cpu = UINT_MAX;
@@ -1666,6 +1736,15 @@ asmlinkage int vprintk_emit(int facility, int level,
strlen(recursion_msg));
}
+ nmi_message_lost = get_nmi_message_lost();
+ if (unlikely(nmi_message_lost)) {
+ text_len = scnprintf(textbuf, sizeof(textbuf),
+ "BAD LUCK: lost %d message(s) from NMI context!",
+ nmi_message_lost);
+ printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
+ NULL, 0, textbuf, text_len);
+ }
+
/*
* The printf needs to come first; we need the syslog
* prefix which might be passed-in as a parameter.
@@ -1807,14 +1886,6 @@ int vprintk_default(const char *fmt, va_list args)
}
EXPORT_SYMBOL_GPL(vprintk_default);
-/*
- * This allows printk to be diverted to another function per cpu.
- * This is useful for calling printk functions from within NMI
- * without worrying about race conditions that can lock up the
- * box.
- */
-DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
-
/**
* printk - print a kernel message
* @fmt: format string
@@ -1838,21 +1909,11 @@ DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
*/
asmlinkage __visible int printk(const char *fmt, ...)
{
- printk_func_t vprintk_func;
va_list args;
int r;
va_start(args, fmt);
-
- /*
- * If a caller overrides the per_cpu printk_func, then it needs
- * to disable preemption when calling printk(). Otherwise
- * the printk_func should be set to the default. No need to
- * disable preemption here.
- */
- vprintk_func = this_cpu_read(printk_func);
r = vprintk_func(fmt, args);
-
va_end(args);
return r;
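
Illustrative sketch, not part of the patch: printk_get_ts() above selects between the local clock, the raw monotonic clock, and monotonic time plus the real-time offset. A rough userspace approximation of the mode 2/3 arithmetic, with clock_gettime() standing in for the kernel's fast accessors:

#include <stdio.h>
#include <stdint.h>
#include <time.h>

static uint64_t ts_ns(clockid_t id)
{
	struct timespec ts;

	clock_gettime(id, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
}

int main(void)
{
	uint64_t mono = ts_ns(CLOCK_MONOTONIC);
	uint64_t offset_real = ts_ns(CLOCK_REALTIME) - mono;	/* stands in for offs_real */

	printf("mode 2 (monotonic): %llu ns\n", (unsigned long long)mono);
	printf("mode 3 (real):      %llu ns\n",
	       (unsigned long long)(mono + offset_real));
	return 0;
}

With the parameter rework the source is chosen at boot, e.g. printk.time=2 on the kernel command line; since the parameter is now registered with S_IRUGO it can no longer be changed through sysfs at runtime.
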
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index d8a095fb8fac..f0d8322bc3ec 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -380,29 +380,9 @@ void destroy_rcu_head(struct rcu_head *head)
debug_object_free(head, &rcuhead_debug_descr);
}
-/*
- * fixup_activate is called when:
- * - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
- * Activation is performed internally by call_rcu().
- */
-static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
+static bool rcuhead_is_static_object(void *addr)
{
- struct rcu_head *head = addr;
-
- switch (state) {
-
- case ODEBUG_STATE_NOTAVAILABLE:
- /*
- * This is not really a fixup. We just make sure that it is
- * tracked in the object tracker.
- */
- debug_object_init(head, &rcuhead_debug_descr);
- debug_object_activate(head, &rcuhead_debug_descr);
- return 0;
- default:
- return 1;
- }
+ return true;
}
/**
@@ -440,7 +420,7 @@ EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack);
struct debug_obj_descr rcuhead_debug_descr = {
.name = "rcu_head",
- .fixup_activate = rcuhead_fixup_activate,
+ .is_static_object = rcuhead_is_static_object,
};
EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
diff --git a/kernel/signal.c b/kernel/signal.c
index ab122a2cee41..96e9bc40667f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -224,7 +224,7 @@ static inline void print_dropped_signal(int sig)
if (!__ratelimit(&ratelimit_state))
return;
- printk(KERN_INFO "%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n",
+ pr_info("%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n",
current->comm, current->pid, sig);
}
@@ -1089,10 +1089,10 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
static void print_fatal_signal(int signr)
{
struct pt_regs *regs = signal_pt_regs();
- printk(KERN_INFO "potentially unexpected fatal signal %d.\n", signr);
+ pr_info("potentially unexpected fatal signal %d.\n", signr);
#if defined(__i386__) && !defined(__arch_um__)
- printk(KERN_INFO "code at %08lx: ", regs->ip);
+ pr_info("code at %08lx: ", regs->ip);
{
int i;
for (i = 0; i < 16; i++) {
@@ -1100,10 +1100,10 @@ static void print_fatal_signal(int signr)
if (get_user(insn, (unsigned char *)(regs->ip + i)))
break;
- printk(KERN_CONT "%02x ", insn);
+ pr_cont("%02x ", insn);
}
}
- printk(KERN_CONT "\n");
+ pr_cont("\n");
#endif
preempt_disable();
show_regs(regs);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c8b318663525..2effd84d83e3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1521,6 +1521,13 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
+ {
+ .procname = "stat_refresh",
+ .data = NULL,
+ .maxlen = 0,
+ .mode = 0600,
+ .proc_handler = vmstat_refresh,
+ },
#endif
#ifdef CONFIG_MMU
{
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 10a1d7dc9313..6eb99c17dbd8 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -13,6 +13,7 @@
#include <linux/ctype.h>
#include <linux/netdevice.h>
#include <linux/kernel.h>
+#include <linux/uuid.h>
#include <linux/slab.h>
#include <linux/compat.h>
@@ -1117,9 +1118,8 @@ static ssize_t bin_uuid(struct file *file,
/* Only supports reads */
if (oldval && oldlen) {
- char buf[40], *str = buf;
- unsigned char uuid[16];
- int i;
+ char buf[UUID_STRING_LEN + 1];
+ uuid_be uuid;
result = kernel_read(file, 0, buf, sizeof(buf) - 1);
if (result < 0)
@@ -1127,24 +1127,15 @@ static ssize_t bin_uuid(struct file *file,
buf[result] = '\0';
- /* Convert the uuid to from a string to binary */
- for (i = 0; i < 16; i++) {
- result = -EIO;
- if (!isxdigit(str[0]) || !isxdigit(str[1]))
- goto out;
-
- uuid[i] = (hex_to_bin(str[0]) << 4) |
- hex_to_bin(str[1]);
- str += 2;
- if (*str == '-')
- str++;
- }
+ result = -EIO;
+ if (uuid_be_to_bin(buf, &uuid))
+ goto out;
if (oldlen > 16)
oldlen = 16;
result = -EFAULT;
- if (copy_to_user(oldval, uuid, oldlen))
+ if (copy_to_user(oldval, &uuid, oldlen))
goto out;
copied = oldlen;
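
Illustrative sketch, not part of the patch: bin_uuid() now delegates the string-to-binary conversion to uuid_be_to_bin(). The open-coded parsing it replaces boils down to reading sixteen hex byte pairs with dashes at fixed positions, roughly like this userspace version (helper name is made up):

#include <stdio.h>
#include <ctype.h>

/* Parse "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" into 16 bytes. */
static int uuid_str_to_bytes(const char *s, unsigned char out[16])
{
	int i, si = 0;

	for (i = 0; i < 16; i++) {
		if (si == 8 || si == 13 || si == 18 || si == 23) {
			if (s[si] != '-')
				return -1;
			si++;
		}
		if (!isxdigit((unsigned char)s[si]) ||
		    !isxdigit((unsigned char)s[si + 1]))
			return -1;
		sscanf(&s[si], "%2hhx", &out[i]);
		si += 2;
	}
	return 0;
}

int main(void)
{
	unsigned char u[16];
	int i;

	if (uuid_str_to_bytes("12345678-9abc-def0-1234-56789abcdef0", u))
		return 1;
	for (i = 0; i < 16; i++)
		printf("%02x", u[i]);
	printf("\n");
	return 0;
}
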
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index fa0b983290cf..8c7392c4fdbd 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -334,7 +334,7 @@ static void *hrtimer_debug_hint(void *addr)
* fixup_init is called when:
* - an active object is initialized
*/
-static int hrtimer_fixup_init(void *addr, enum debug_obj_state state)
+static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state)
{
struct hrtimer *timer = addr;
@@ -342,30 +342,25 @@ static int hrtimer_fixup_init(void *addr, enum debug_obj_state state)
case ODEBUG_STATE_ACTIVE:
hrtimer_cancel(timer);
debug_object_init(timer, &hrtimer_debug_descr);
- return 1;
+ return true;
default:
- return 0;
+ return false;
}
}
/*
* fixup_activate is called when:
* - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
+ * - an unknown non-static object is activated
*/
-static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
+static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
{
switch (state) {
-
- case ODEBUG_STATE_NOTAVAILABLE:
- WARN_ON_ONCE(1);
- return 0;
-
case ODEBUG_STATE_ACTIVE:
WARN_ON(1);
default:
- return 0;
+ return false;
}
}
@@ -373,7 +368,7 @@ static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
* fixup_free is called when:
* - an active object is freed
*/
-static int hrtimer_fixup_free(void *addr, enum debug_obj_state state)
+static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state)
{
struct hrtimer *timer = addr;
@@ -381,9 +376,9 @@ static int hrtimer_fixup_free(void *addr, enum debug_obj_state state)
case ODEBUG_STATE_ACTIVE:
hrtimer_cancel(timer);
debug_object_free(timer, &hrtimer_debug_descr);
- return 1;
+ return true;
default:
- return 0;
+ return false;
}
}
diff --git a/kernel/time/time.c b/kernel/time/time.c
index a4064b612066..667b9335f5d6 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -769,3 +769,24 @@ struct timespec timespec_add_safe(const struct timespec lhs,
return res;
}
+
+/*
+ * Add two timespec64 values and do a safety check for overflow.
+ * It's assumed that both values are valid (>= 0).
+ * And, each timespec64 is in normalized form.
+ */
+struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
+ const struct timespec64 rhs)
+{
+ struct timespec64 res;
+
+ set_normalized_timespec64(&res, lhs.tv_sec + rhs.tv_sec,
+ lhs.tv_nsec + rhs.tv_nsec);
+
+ if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) {
+ res.tv_sec = TIME64_MAX;
+ res.tv_nsec = 0;
+ }
+
+ return res;
+}
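
Illustrative sketch, not part of the patch: timespec64_add_safe() detects seconds overflow by checking whether the sum became smaller than either operand, then saturates at TIME64_MAX. A standalone approximation using its own 64-bit type (the addition is done in unsigned space so the wrap is well defined in plain C, unlike the kernel which builds with -fno-strict-overflow):

#include <stdio.h>
#include <stdint.h>

#define SKETCH_TIME64_MAX INT64_MAX
#define NSEC_PER_SEC 1000000000L

struct ts64 {
	int64_t sec;
	long nsec;
};

/* Mirrors the overflow handling in timespec64_add_safe(). */
static struct ts64 ts64_add_safe(struct ts64 lhs, struct ts64 rhs)
{
	struct ts64 res;

	res.sec = (int64_t)((uint64_t)lhs.sec + (uint64_t)rhs.sec);
	res.nsec = lhs.nsec + rhs.nsec;
	if (res.nsec >= NSEC_PER_SEC) {		/* normalize */
		res.sec++;
		res.nsec -= NSEC_PER_SEC;
	}

	/* A wrapped sum ends up smaller than one of the operands. */
	if (res.sec < lhs.sec || res.sec < rhs.sec) {
		res.sec = SKETCH_TIME64_MAX;
		res.nsec = 0;
	}
	return res;
}

int main(void)
{
	struct ts64 a = { INT64_MAX - 1, 999999999L };
	struct ts64 b = { 5, 1L };
	struct ts64 r = ts64_add_safe(a, b);

	printf("saturated: %lld.%09ld\n", (long long)r.sec, r.nsec);
	return 0;
}
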
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 479d25cd3d4f..feb1995f8ffd 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -43,6 +43,7 @@ static struct {
static DEFINE_RAW_SPINLOCK(timekeeper_lock);
static struct timekeeper shadow_timekeeper;
+static int timekeeping_active;
/**
* struct tk_fast - NMI safe timekeeper
@@ -419,6 +420,16 @@ u64 ktime_get_raw_fast_ns(void)
}
EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns);
+u64 ktime_get_log_ts(u64 *offset_real)
+{
+ *offset_real = ktime_to_ns(tk_core.timekeeper.offs_real);
+
+ if (timekeeping_active)
+ return ktime_get_mono_fast_ns();
+ else
+ return local_clock();
+}
+
/* Suspend-time cycles value for halted fast timekeeper. */
static cycle_t cycles_at_suspend;
@@ -1503,6 +1514,8 @@ void __init timekeeping_init(void)
write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+
+ timekeeping_active = 1;
}
/* time in seconds when suspend began for persistent clock */
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 73164c3aa56b..3a95f9728778 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -489,11 +489,19 @@ static void *timer_debug_hint(void *addr)
return ((struct timer_list *) addr)->function;
}
+static bool timer_is_static_object(void *addr)
+{
+ struct timer_list *timer = addr;
+
+ return (timer->entry.pprev == NULL &&
+ timer->entry.next == TIMER_ENTRY_STATIC);
+}
+
/*
* fixup_init is called when:
* - an active object is initialized
*/
-static int timer_fixup_init(void *addr, enum debug_obj_state state)
+static bool timer_fixup_init(void *addr, enum debug_obj_state state)
{
struct timer_list *timer = addr;
@@ -501,9 +509,9 @@ static int timer_fixup_init(void *addr, enum debug_obj_state state)
case ODEBUG_STATE_ACTIVE:
del_timer_sync(timer);
debug_object_init(timer, &timer_debug_descr);
- return 1;
+ return true;
default:
- return 0;
+ return false;
}
}
@@ -516,36 +524,22 @@ static void stub_timer(unsigned long data)
/*
* fixup_activate is called when:
* - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
+ * - an unknown non-static object is activated
*/
-static int timer_fixup_activate(void *addr, enum debug_obj_state state)
+static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
{
struct timer_list *timer = addr;
switch (state) {
-
case ODEBUG_STATE_NOTAVAILABLE:
- /*
- * This is not really a fixup. The timer was
- * statically initialized. We just make sure that it
- * is tracked in the object tracker.
- */
- if (timer->entry.pprev == NULL &&
- timer->entry.next == TIMER_ENTRY_STATIC) {
- debug_object_init(timer, &timer_debug_descr);
- debug_object_activate(timer, &timer_debug_descr);
- return 0;
- } else {
- setup_timer(timer, stub_timer, 0);
- return 1;
- }
- return 0;
+ setup_timer(timer, stub_timer, 0);
+ return true;
case ODEBUG_STATE_ACTIVE:
WARN_ON(1);
default:
- return 0;
+ return false;
}
}
@@ -553,7 +547,7 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state)
* fixup_free is called when:
* - an active object is freed
*/
-static int timer_fixup_free(void *addr, enum debug_obj_state state)
+static bool timer_fixup_free(void *addr, enum debug_obj_state state)
{
struct timer_list *timer = addr;
@@ -561,9 +555,9 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state)
case ODEBUG_STATE_ACTIVE:
del_timer_sync(timer);
debug_object_free(timer, &timer_debug_descr);
- return 1;
+ return true;
default:
- return 0;
+ return false;
}
}
@@ -571,32 +565,23 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state)
* fixup_assert_init is called when:
* - an untracked/uninit-ed object is found
*/
-static int timer_fixup_assert_init(void *addr, enum debug_obj_state state)
+static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
{
struct timer_list *timer = addr;
switch (state) {
case ODEBUG_STATE_NOTAVAILABLE:
- if (timer->entry.next == TIMER_ENTRY_STATIC) {
- /*
- * This is not really a fixup. The timer was
- * statically initialized. We just make sure that it
- * is tracked in the object tracker.
- */
- debug_object_init(timer, &timer_debug_descr);
- return 0;
- } else {
- setup_timer(timer, stub_timer, 0);
- return 1;
- }
+ setup_timer(timer, stub_timer, 0);
+ return true;
default:
- return 0;
+ return false;
}
}
static struct debug_obj_descr timer_debug_descr = {
.name = "timer_list",
.debug_hint = timer_debug_hint,
+ .is_static_object = timer_is_static_object,
.fixup_init = timer_fixup_init,
.fixup_activate = timer_fixup_activate,
.fixup_free = timer_fixup_free,
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5f5068e94003..e1c0e996b5ae 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -433,54 +433,28 @@ static void *work_debug_hint(void *addr)
return ((struct work_struct *) addr)->func;
}
-/*
- * fixup_init is called when:
- * - an active object is initialized
- */
-static int work_fixup_init(void *addr, enum debug_obj_state state)
+static bool work_is_static_object(void *addr)
{
struct work_struct *work = addr;
- switch (state) {
- case ODEBUG_STATE_ACTIVE:
- cancel_work_sync(work);
- debug_object_init(work, &work_debug_descr);
- return 1;
- default:
- return 0;
- }
+ return test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work));
}
/*
- * fixup_activate is called when:
- * - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
+ * fixup_init is called when:
+ * - an active object is initialized
*/
-static int work_fixup_activate(void *addr, enum debug_obj_state state)
+static bool work_fixup_init(void *addr, enum debug_obj_state state)
{
struct work_struct *work = addr;
switch (state) {
-
- case ODEBUG_STATE_NOTAVAILABLE:
- /*
- * This is not really a fixup. The work struct was
- * statically initialized. We just make sure that it
- * is tracked in the object tracker.
- */
- if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) {
- debug_object_init(work, &work_debug_descr);
- debug_object_activate(work, &work_debug_descr);
- return 0;
- }
- WARN_ON_ONCE(1);
- return 0;
-
case ODEBUG_STATE_ACTIVE:
- WARN_ON(1);
-
+ cancel_work_sync(work);
+ debug_object_init(work, &work_debug_descr);
+ return true;
default:
- return 0;
+ return false;
}
}
@@ -488,7 +462,7 @@ static int work_fixup_activate(void *addr, enum debug_obj_state state)
* fixup_free is called when:
* - an active object is freed
*/
-static int work_fixup_free(void *addr, enum debug_obj_state state)
+static bool work_fixup_free(void *addr, enum debug_obj_state state)
{
struct work_struct *work = addr;
@@ -496,17 +470,17 @@ static int work_fixup_free(void *addr, enum debug_obj_state state)
case ODEBUG_STATE_ACTIVE:
cancel_work_sync(work);
debug_object_free(work, &work_debug_descr);
- return 1;
+ return true;
default:
- return 0;
+ return false;
}
}
static struct debug_obj_descr work_debug_descr = {
.name = "work_struct",
.debug_hint = work_debug_hint,
+ .is_static_object = work_is_static_object,
.fixup_init = work_fixup_init,
- .fixup_activate = work_fixup_activate,
.fixup_free = work_fixup_free,
};
diff --git a/lib/Kconfig b/lib/Kconfig
index 61d55bd0ed89..9d188fbd856a 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -185,6 +185,13 @@ config CRC8
when they need to do cyclic redundancy check according CRC8
algorithm. Module will be called crc8.
+config CRC64_ECMA
+ tristate "CRC64 ECMA function"
+ help
+ This option provides CRC64 ECMA function. Drivers may select this
+ when they need to do cyclic redundancy check according to the CRC64
+ ECMA algorithm.
+
config AUDIT_GENERIC
bool
depends on AUDIT && !AUDIT_ARCH
@@ -362,6 +369,9 @@ config INTERVAL_TREE
for more information.
+config RADIX_TREE_MULTIORDER
+ bool
+
config ASSOCIATIVE_ARRAY
bool
help
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 4fff4b032af8..5d57177fd96b 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1,7 +1,9 @@
menu "printk and dmesg options"
config PRINTK_TIME
- bool "Show timing information on printks"
+ int "Show timing information on printks (0-3)"
+ range 0 3
+ default "0"
depends on PRINTK
help
Selecting this option causes time stamps of the printk()
@@ -10,7 +12,9 @@ config PRINTK_TIME
The timestamp is always recorded internally, and exported
to /dev/kmsg. This flag just specifies if the timestamp should
- be included, not that the timestamp is recorded.
+ be included, not that the timestamp is recorded. 0 disables the
+	  timestamp, 1 uses the local clock, 2 uses the monotonic clock, and
+	  3 uses the real clock.
The behavior is also controlled by the kernel command line
parameter printk.time=1. See Documentation/kernel-parameters.txt
diff --git a/lib/Makefile b/lib/Makefile
index 931396ada5eb..efb41dee8dd0 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -25,7 +25,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
sha1.o md5.o irq_regs.o argv_split.o \
flex_proportions.o ratelimit.o show_mem.o \
is_single_threaded.o plist.o decompress.o kobject_uevent.o \
- earlycpio.o seq_buf.o nmi_backtrace.o
+ earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o
obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o
lib-$(CONFIG_MMU) += ioremap.o
@@ -97,6 +97,7 @@ obj-$(CONFIG_CRC32) += crc32.o
obj-$(CONFIG_CRC7) += crc7.o
obj-$(CONFIG_LIBCRC32C) += libcrc32c.o
obj-$(CONFIG_CRC8) += crc8.o
+obj-$(CONFIG_CRC64_ECMA) += crc64_ecma.o
obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o
obj-$(CONFIG_842_COMPRESS) += 842/
diff --git a/lib/crc64_ecma.c b/lib/crc64_ecma.c
new file mode 100644
index 000000000000..41629ea5a60c
--- /dev/null
+++ b/lib/crc64_ecma.c
@@ -0,0 +1,341 @@
+/*
+ * Copyright 2013 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <linux/crc64_ecma.h>
+
+
+#define CRC64_BYTE_MASK 0xFF
+#define CRC64_TABLE_SIZE 256
+
+
+struct crc64_table {
+ u64 seed;
+ u64 table[CRC64_TABLE_SIZE];
+};
+
+
+static struct crc64_table CRC64_ECMA_182 = {
+ CRC64_DEFAULT_INITVAL,
+ {
+ 0x0000000000000000ULL,
+ 0xb32e4cbe03a75f6fULL,
+ 0xf4843657a840a05bULL,
+ 0x47aa7ae9abe7ff34ULL,
+ 0x7bd0c384ff8f5e33ULL,
+ 0xc8fe8f3afc28015cULL,
+ 0x8f54f5d357cffe68ULL,
+ 0x3c7ab96d5468a107ULL,
+ 0xf7a18709ff1ebc66ULL,
+ 0x448fcbb7fcb9e309ULL,
+ 0x0325b15e575e1c3dULL,
+ 0xb00bfde054f94352ULL,
+ 0x8c71448d0091e255ULL,
+ 0x3f5f08330336bd3aULL,
+ 0x78f572daa8d1420eULL,
+ 0xcbdb3e64ab761d61ULL,
+ 0x7d9ba13851336649ULL,
+ 0xceb5ed8652943926ULL,
+ 0x891f976ff973c612ULL,
+ 0x3a31dbd1fad4997dULL,
+ 0x064b62bcaebc387aULL,
+ 0xb5652e02ad1b6715ULL,
+ 0xf2cf54eb06fc9821ULL,
+ 0x41e11855055bc74eULL,
+ 0x8a3a2631ae2dda2fULL,
+ 0x39146a8fad8a8540ULL,
+ 0x7ebe1066066d7a74ULL,
+ 0xcd905cd805ca251bULL,
+ 0xf1eae5b551a2841cULL,
+ 0x42c4a90b5205db73ULL,
+ 0x056ed3e2f9e22447ULL,
+ 0xb6409f5cfa457b28ULL,
+ 0xfb374270a266cc92ULL,
+ 0x48190ecea1c193fdULL,
+ 0x0fb374270a266cc9ULL,
+ 0xbc9d3899098133a6ULL,
+ 0x80e781f45de992a1ULL,
+ 0x33c9cd4a5e4ecdceULL,
+ 0x7463b7a3f5a932faULL,
+ 0xc74dfb1df60e6d95ULL,
+ 0x0c96c5795d7870f4ULL,
+ 0xbfb889c75edf2f9bULL,
+ 0xf812f32ef538d0afULL,
+ 0x4b3cbf90f69f8fc0ULL,
+ 0x774606fda2f72ec7ULL,
+ 0xc4684a43a15071a8ULL,
+ 0x83c230aa0ab78e9cULL,
+ 0x30ec7c140910d1f3ULL,
+ 0x86ace348f355aadbULL,
+ 0x3582aff6f0f2f5b4ULL,
+ 0x7228d51f5b150a80ULL,
+ 0xc10699a158b255efULL,
+ 0xfd7c20cc0cdaf4e8ULL,
+ 0x4e526c720f7dab87ULL,
+ 0x09f8169ba49a54b3ULL,
+ 0xbad65a25a73d0bdcULL,
+ 0x710d64410c4b16bdULL,
+ 0xc22328ff0fec49d2ULL,
+ 0x85895216a40bb6e6ULL,
+ 0x36a71ea8a7ace989ULL,
+ 0x0adda7c5f3c4488eULL,
+ 0xb9f3eb7bf06317e1ULL,
+ 0xfe5991925b84e8d5ULL,
+ 0x4d77dd2c5823b7baULL,
+ 0x64b62bcaebc387a1ULL,
+ 0xd7986774e864d8ceULL,
+ 0x90321d9d438327faULL,
+ 0x231c512340247895ULL,
+ 0x1f66e84e144cd992ULL,
+ 0xac48a4f017eb86fdULL,
+ 0xebe2de19bc0c79c9ULL,
+ 0x58cc92a7bfab26a6ULL,
+ 0x9317acc314dd3bc7ULL,
+ 0x2039e07d177a64a8ULL,
+ 0x67939a94bc9d9b9cULL,
+ 0xd4bdd62abf3ac4f3ULL,
+ 0xe8c76f47eb5265f4ULL,
+ 0x5be923f9e8f53a9bULL,
+ 0x1c4359104312c5afULL,
+ 0xaf6d15ae40b59ac0ULL,
+ 0x192d8af2baf0e1e8ULL,
+ 0xaa03c64cb957be87ULL,
+ 0xeda9bca512b041b3ULL,
+ 0x5e87f01b11171edcULL,
+ 0x62fd4976457fbfdbULL,
+ 0xd1d305c846d8e0b4ULL,
+ 0x96797f21ed3f1f80ULL,
+ 0x2557339fee9840efULL,
+ 0xee8c0dfb45ee5d8eULL,
+ 0x5da24145464902e1ULL,
+ 0x1a083bacedaefdd5ULL,
+ 0xa9267712ee09a2baULL,
+ 0x955cce7fba6103bdULL,
+ 0x267282c1b9c65cd2ULL,
+ 0x61d8f8281221a3e6ULL,
+ 0xd2f6b4961186fc89ULL,
+ 0x9f8169ba49a54b33ULL,
+ 0x2caf25044a02145cULL,
+ 0x6b055fede1e5eb68ULL,
+ 0xd82b1353e242b407ULL,
+ 0xe451aa3eb62a1500ULL,
+ 0x577fe680b58d4a6fULL,
+ 0x10d59c691e6ab55bULL,
+ 0xa3fbd0d71dcdea34ULL,
+ 0x6820eeb3b6bbf755ULL,
+ 0xdb0ea20db51ca83aULL,
+ 0x9ca4d8e41efb570eULL,
+ 0x2f8a945a1d5c0861ULL,
+ 0x13f02d374934a966ULL,
+ 0xa0de61894a93f609ULL,
+ 0xe7741b60e174093dULL,
+ 0x545a57dee2d35652ULL,
+ 0xe21ac88218962d7aULL,
+ 0x5134843c1b317215ULL,
+ 0x169efed5b0d68d21ULL,
+ 0xa5b0b26bb371d24eULL,
+ 0x99ca0b06e7197349ULL,
+ 0x2ae447b8e4be2c26ULL,
+ 0x6d4e3d514f59d312ULL,
+ 0xde6071ef4cfe8c7dULL,
+ 0x15bb4f8be788911cULL,
+ 0xa6950335e42fce73ULL,
+ 0xe13f79dc4fc83147ULL,
+ 0x521135624c6f6e28ULL,
+ 0x6e6b8c0f1807cf2fULL,
+ 0xdd45c0b11ba09040ULL,
+ 0x9aefba58b0476f74ULL,
+ 0x29c1f6e6b3e0301bULL,
+ 0xc96c5795d7870f42ULL,
+ 0x7a421b2bd420502dULL,
+ 0x3de861c27fc7af19ULL,
+ 0x8ec62d7c7c60f076ULL,
+ 0xb2bc941128085171ULL,
+ 0x0192d8af2baf0e1eULL,
+ 0x4638a2468048f12aULL,
+ 0xf516eef883efae45ULL,
+ 0x3ecdd09c2899b324ULL,
+ 0x8de39c222b3eec4bULL,
+ 0xca49e6cb80d9137fULL,
+ 0x7967aa75837e4c10ULL,
+ 0x451d1318d716ed17ULL,
+ 0xf6335fa6d4b1b278ULL,
+ 0xb199254f7f564d4cULL,
+ 0x02b769f17cf11223ULL,
+ 0xb4f7f6ad86b4690bULL,
+ 0x07d9ba1385133664ULL,
+ 0x4073c0fa2ef4c950ULL,
+ 0xf35d8c442d53963fULL,
+ 0xcf273529793b3738ULL,
+ 0x7c0979977a9c6857ULL,
+ 0x3ba3037ed17b9763ULL,
+ 0x888d4fc0d2dcc80cULL,
+ 0x435671a479aad56dULL,
+ 0xf0783d1a7a0d8a02ULL,
+ 0xb7d247f3d1ea7536ULL,
+ 0x04fc0b4dd24d2a59ULL,
+ 0x3886b22086258b5eULL,
+ 0x8ba8fe9e8582d431ULL,
+ 0xcc0284772e652b05ULL,
+ 0x7f2cc8c92dc2746aULL,
+ 0x325b15e575e1c3d0ULL,
+ 0x8175595b76469cbfULL,
+ 0xc6df23b2dda1638bULL,
+ 0x75f16f0cde063ce4ULL,
+ 0x498bd6618a6e9de3ULL,
+ 0xfaa59adf89c9c28cULL,
+ 0xbd0fe036222e3db8ULL,
+ 0x0e21ac88218962d7ULL,
+ 0xc5fa92ec8aff7fb6ULL,
+ 0x76d4de52895820d9ULL,
+ 0x317ea4bb22bfdfedULL,
+ 0x8250e80521188082ULL,
+ 0xbe2a516875702185ULL,
+ 0x0d041dd676d77eeaULL,
+ 0x4aae673fdd3081deULL,
+ 0xf9802b81de97deb1ULL,
+ 0x4fc0b4dd24d2a599ULL,
+ 0xfceef8632775faf6ULL,
+ 0xbb44828a8c9205c2ULL,
+ 0x086ace348f355aadULL,
+ 0x34107759db5dfbaaULL,
+ 0x873e3be7d8faa4c5ULL,
+ 0xc094410e731d5bf1ULL,
+ 0x73ba0db070ba049eULL,
+ 0xb86133d4dbcc19ffULL,
+ 0x0b4f7f6ad86b4690ULL,
+ 0x4ce50583738cb9a4ULL,
+ 0xffcb493d702be6cbULL,
+ 0xc3b1f050244347ccULL,
+ 0x709fbcee27e418a3ULL,
+ 0x3735c6078c03e797ULL,
+ 0x841b8ab98fa4b8f8ULL,
+ 0xadda7c5f3c4488e3ULL,
+ 0x1ef430e13fe3d78cULL,
+ 0x595e4a08940428b8ULL,
+ 0xea7006b697a377d7ULL,
+ 0xd60abfdbc3cbd6d0ULL,
+ 0x6524f365c06c89bfULL,
+ 0x228e898c6b8b768bULL,
+ 0x91a0c532682c29e4ULL,
+ 0x5a7bfb56c35a3485ULL,
+ 0xe955b7e8c0fd6beaULL,
+ 0xaeffcd016b1a94deULL,
+ 0x1dd181bf68bdcbb1ULL,
+ 0x21ab38d23cd56ab6ULL,
+ 0x9285746c3f7235d9ULL,
+ 0xd52f0e859495caedULL,
+ 0x6601423b97329582ULL,
+ 0xd041dd676d77eeaaULL,
+ 0x636f91d96ed0b1c5ULL,
+ 0x24c5eb30c5374ef1ULL,
+ 0x97eba78ec690119eULL,
+ 0xab911ee392f8b099ULL,
+ 0x18bf525d915feff6ULL,
+ 0x5f1528b43ab810c2ULL,
+ 0xec3b640a391f4fadULL,
+ 0x27e05a6e926952ccULL,
+ 0x94ce16d091ce0da3ULL,
+ 0xd3646c393a29f297ULL,
+ 0x604a2087398eadf8ULL,
+ 0x5c3099ea6de60cffULL,
+ 0xef1ed5546e415390ULL,
+ 0xa8b4afbdc5a6aca4ULL,
+ 0x1b9ae303c601f3cbULL,
+ 0x56ed3e2f9e224471ULL,
+ 0xe5c372919d851b1eULL,
+ 0xa26908783662e42aULL,
+ 0x114744c635c5bb45ULL,
+ 0x2d3dfdab61ad1a42ULL,
+ 0x9e13b115620a452dULL,
+ 0xd9b9cbfcc9edba19ULL,
+ 0x6a978742ca4ae576ULL,
+ 0xa14cb926613cf817ULL,
+ 0x1262f598629ba778ULL,
+ 0x55c88f71c97c584cULL,
+ 0xe6e6c3cfcadb0723ULL,
+ 0xda9c7aa29eb3a624ULL,
+ 0x69b2361c9d14f94bULL,
+ 0x2e184cf536f3067fULL,
+ 0x9d36004b35545910ULL,
+ 0x2b769f17cf112238ULL,
+ 0x9858d3a9ccb67d57ULL,
+ 0xdff2a94067518263ULL,
+ 0x6cdce5fe64f6dd0cULL,
+ 0x50a65c93309e7c0bULL,
+ 0xe388102d33392364ULL,
+ 0xa4226ac498dedc50ULL,
+ 0x170c267a9b79833fULL,
+ 0xdcd7181e300f9e5eULL,
+ 0x6ff954a033a8c131ULL,
+ 0x28532e49984f3e05ULL,
+ 0x9b7d62f79be8616aULL,
+ 0xa707db9acf80c06dULL,
+ 0x14299724cc279f02ULL,
+ 0x5383edcd67c06036ULL,
+ 0xe0ada17364673f59ULL
+ }
+};
+
+
+/*
+ * crc64_ecma_seed - Return the default CRC64 ECMA seed.
+ */
+u64 crc64_ecma_seed(void)
+{
+ return CRC64_ECMA_182.seed;
+}
+EXPORT_SYMBOL(crc64_ecma_seed);
+
+/*
+ * crc64_ecma - Computes the 64 bit ECMA CRC.
+ *
+ * pdata: pointer to the data to compute checksum for.
+ * nbytes: number of bytes in data buffer.
+ * seed: CRC seed.
+ */
+u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed)
+{
+ unsigned int i;
+ u64 crc = seed;
+
+ for (i = 0; i < nbytes; i++)
+ crc = CRC64_ECMA_182.table[(crc ^ pdata[i]) & CRC64_BYTE_MASK] ^
+ (crc >> 8);
+
+ return crc;
+}
+EXPORT_SYMBOL(crc64_ecma);
+
+MODULE_DESCRIPTION("CRC64 ECMA function");
+MODULE_AUTHOR("Freescale Semiconductor Inc.");
+MODULE_LICENSE("GPL");
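
Illustrative sketch, not part of the patch: the lookup table above is the byte-wise, bit-reflected table for the ECMA-182 polynomial 0x42F0E1EBA9EA3693 (its reflected form, 0xC96C5795D7870F42, appears verbatim as table[128]). The userspace generator below reproduces the table, which also documents how further tables could be derived:

#include <stdio.h>
#include <stdint.h>

#define CRC64_ECMA_POLY_REFLECTED 0xC96C5795D7870F42ULL

int main(void)
{
	uint64_t table[256];
	int i, bit;

	for (i = 0; i < 256; i++) {
		uint64_t crc = (uint64_t)i;

		for (bit = 0; bit < 8; bit++)
			crc = (crc & 1) ? (crc >> 1) ^ CRC64_ECMA_POLY_REFLECTED
					: crc >> 1;
		table[i] = crc;
	}

	/* Spot-check a couple of entries against the table in crc64_ecma.c. */
	printf("table[1]   = 0x%016llx\n", (unsigned long long)table[1]);
	printf("table[128] = 0x%016llx\n", (unsigned long long)table[128]);
	return 0;
}
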
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index 519b5a10fd70..a8e12601eb37 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -269,16 +269,15 @@ static void debug_print_object(struct debug_obj *obj, char *msg)
* Try to repair the damage, so we have a better chance to get useful
* debug output.
*/
-static int
-debug_object_fixup(int (*fixup)(void *addr, enum debug_obj_state state),
+static bool
+debug_object_fixup(bool (*fixup)(void *addr, enum debug_obj_state state),
void * addr, enum debug_obj_state state)
{
- int fixed = 0;
-
- if (fixup)
- fixed = fixup(addr, state);
- debug_objects_fixups += fixed;
- return fixed;
+ if (fixup && fixup(addr, state)) {
+ debug_objects_fixups++;
+ return true;
+ }
+ return false;
}
static void debug_object_is_on_stack(void *addr, int onstack)
@@ -416,7 +415,7 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr)
state = obj->state;
raw_spin_unlock_irqrestore(&db->lock, flags);
ret = debug_object_fixup(descr->fixup_activate, addr, state);
- return ret ? -EINVAL : 0;
+ return ret ? 0 : -EINVAL;
case ODEBUG_STATE_DESTROYED:
debug_print_object(obj, "activate");
@@ -432,14 +431,21 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr)
raw_spin_unlock_irqrestore(&db->lock, flags);
/*
- * This happens when a static object is activated. We
- * let the type specific code decide whether this is
- * true or not.
+ * We are here when a static object is activated. We
+ * let the type specific code confirm whether this is
+ * true or not. If true, we just make sure that the
+ * static object is tracked in the object tracker. If
+ * not, this must be a bug, so we try to fix it up.
*/
- if (debug_object_fixup(descr->fixup_activate, addr,
- ODEBUG_STATE_NOTAVAILABLE)) {
+ if (descr->is_static_object && descr->is_static_object(addr)) {
+ /* track this static object */
+ debug_object_init(addr, descr);
+ debug_object_activate(addr, descr);
+ } else {
debug_print_object(&o, "activate");
- return -EINVAL;
+ ret = debug_object_fixup(descr->fixup_activate, addr,
+ ODEBUG_STATE_NOTAVAILABLE);
+ return ret ? 0 : -EINVAL;
}
return 0;
}
@@ -603,12 +609,18 @@ void debug_object_assert_init(void *addr, struct debug_obj_descr *descr)
raw_spin_unlock_irqrestore(&db->lock, flags);
/*
- * Maybe the object is static. Let the type specific
- * code decide what to do.
+ * Maybe the object is static, and we let the type specific
+ * code confirm. Track this static object if true, else invoke
+ * fixup.
*/
- if (debug_object_fixup(descr->fixup_assert_init, addr,
- ODEBUG_STATE_NOTAVAILABLE))
+ if (descr->is_static_object && descr->is_static_object(addr)) {
+ /* Track this static object */
+ debug_object_init(addr, descr);
+ } else {
debug_print_object(&o, "assert_init");
+ debug_object_fixup(descr->fixup_assert_init, addr,
+ ODEBUG_STATE_NOTAVAILABLE);
+ }
return;
}
@@ -793,11 +805,18 @@ struct self_test {
static __initdata struct debug_obj_descr descr_type_test;
+static bool __init is_static_object(void *addr)
+{
+ struct self_test *obj = addr;
+
+ return obj->static_init;
+}
+
/*
* fixup_init is called when:
* - an active object is initialized
*/
-static int __init fixup_init(void *addr, enum debug_obj_state state)
+static bool __init fixup_init(void *addr, enum debug_obj_state state)
{
struct self_test *obj = addr;
@@ -805,37 +824,31 @@ static int __init fixup_init(void *addr, enum debug_obj_state state)
case ODEBUG_STATE_ACTIVE:
debug_object_deactivate(obj, &descr_type_test);
debug_object_init(obj, &descr_type_test);
- return 1;
+ return true;
default:
- return 0;
+ return false;
}
}
/*
* fixup_activate is called when:
* - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
+ * - an unknown non-static object is activated
*/
-static int __init fixup_activate(void *addr, enum debug_obj_state state)
+static bool __init fixup_activate(void *addr, enum debug_obj_state state)
{
struct self_test *obj = addr;
switch (state) {
case ODEBUG_STATE_NOTAVAILABLE:
- if (obj->static_init == 1) {
- debug_object_init(obj, &descr_type_test);
- debug_object_activate(obj, &descr_type_test);
- return 0;
- }
- return 1;
-
+ return true;
case ODEBUG_STATE_ACTIVE:
debug_object_deactivate(obj, &descr_type_test);
debug_object_activate(obj, &descr_type_test);
- return 1;
+ return true;
default:
- return 0;
+ return false;
}
}
@@ -843,7 +856,7 @@ static int __init fixup_activate(void *addr, enum debug_obj_state state)
* fixup_destroy is called when:
* - an active object is destroyed
*/
-static int __init fixup_destroy(void *addr, enum debug_obj_state state)
+static bool __init fixup_destroy(void *addr, enum debug_obj_state state)
{
struct self_test *obj = addr;
@@ -851,9 +864,9 @@ static int __init fixup_destroy(void *addr, enum debug_obj_state state)
case ODEBUG_STATE_ACTIVE:
debug_object_deactivate(obj, &descr_type_test);
debug_object_destroy(obj, &descr_type_test);
- return 1;
+ return true;
default:
- return 0;
+ return false;
}
}
@@ -861,7 +874,7 @@ static int __init fixup_destroy(void *addr, enum debug_obj_state state)
* fixup_free is called when:
* - an active object is freed
*/
-static int __init fixup_free(void *addr, enum debug_obj_state state)
+static bool __init fixup_free(void *addr, enum debug_obj_state state)
{
struct self_test *obj = addr;
@@ -869,9 +882,9 @@ static int __init fixup_free(void *addr, enum debug_obj_state state)
case ODEBUG_STATE_ACTIVE:
debug_object_deactivate(obj, &descr_type_test);
debug_object_free(obj, &descr_type_test);
- return 1;
+ return true;
default:
- return 0;
+ return false;
}
}
@@ -917,6 +930,7 @@ out:
static __initdata struct debug_obj_descr descr_type_test = {
.name = "selftest",
+ .is_static_object = is_static_object,
.fixup_init = fixup_init,
.fixup_activate = fixup_activate,
.fixup_destroy = fixup_destroy,
diff --git a/lib/gcd.c b/lib/gcd.c
index 3657f129d7b8..135ee6407a5e 100644
--- a/lib/gcd.c
+++ b/lib/gcd.c
@@ -2,20 +2,77 @@
#include <linux/gcd.h>
#include <linux/export.h>
-/* Greatest common divisor */
+/*
+ * This implements the binary GCD algorithm. (Often attributed to Stein,
+ * but as Knuth has noted, appears in a first-century Chinese math text.)
+ *
+ * This is faster than the division-based algorithm even on x86, which
+ * has decent hardware division.
+ */
+
+#if !defined(CONFIG_CPU_NO_EFFICIENT_FFS) && !defined(CPU_NO_EFFICIENT_FFS)
+
+/* If __ffs is available, the even/odd algorithm benchmarks slower. */
unsigned long gcd(unsigned long a, unsigned long b)
{
- unsigned long r;
+ unsigned long r = a | b;
+
+ if (!a || !b)
+ return r;
- if (a < b)
- swap(a, b);
+ b >>= __ffs(b);
+ if (b == 1)
+ return r & -r;
- if (!b)
- return a;
- while ((r = a % b) != 0) {
- a = b;
- b = r;
+ for (;;) {
+ a >>= __ffs(a);
+ if (a == 1)
+ return r & -r;
+ if (a == b)
+ return a << __ffs(r);
+
+ if (a < b)
+ swap(a, b);
+ a -= b;
}
- return b;
}
+
+#else
+
+/* If normalization is done by loops, the even/odd algorithm is a win. */
+unsigned long gcd(unsigned long a, unsigned long b)
+{
+ unsigned long r = a | b;
+
+ if (!a || !b)
+ return r;
+
+ /* Isolate lsbit of r */
+ r &= -r;
+
+ while (!(b & r))
+ b >>= 1;
+ if (b == r)
+ return r;
+
+ for (;;) {
+ while (!(a & r))
+ a >>= 1;
+ if (a == r)
+ return r;
+ if (a == b)
+ return a;
+
+ if (a < b)
+ swap(a, b);
+ a -= b;
+ a >>= 1;
+ if (a & r)
+ a += b;
+ a >>= 1;
+ }
+}
+
+#endif
+
EXPORT_SYMBOL_GPL(gcd);
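
Illustrative sketch, not part of the patch: a quick userspace cross-check that the binary (Stein) GCD agrees with the classic Euclidean version. The reimplementation follows the textbook shift-and-subtract form rather than the __ffs-optimized variant added above:

#include <stdio.h>
#include <assert.h>

static unsigned long gcd_euclid(unsigned long a, unsigned long b)
{
	while (b) {
		unsigned long r = a % b;

		a = b;
		b = r;
	}
	return a;
}

static unsigned long gcd_binary(unsigned long a, unsigned long b)
{
	unsigned long shift;

	if (!a)
		return b;
	if (!b)
		return a;

	/* Factor out common powers of two, then subtract odd values. */
	for (shift = 0; !((a | b) & 1); shift++) {
		a >>= 1;
		b >>= 1;
	}
	while (!(a & 1))
		a >>= 1;
	do {
		while (!(b & 1))
			b >>= 1;
		if (a > b) {
			unsigned long t = a;

			a = b;
			b = t;
		}
		b -= a;
	} while (b);
	return a << shift;
}

int main(void)
{
	unsigned long a, b;

	for (a = 0; a < 200; a++)
		for (b = 0; b < 200; b++)
			assert(gcd_binary(a, b) == gcd_euclid(a, b));
	printf("binary and Euclidean GCD agree\n");
	return 0;
}
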
diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c
index 6019c53c669e..26caf51cc238 100644
--- a/lib/nmi_backtrace.c
+++ b/lib/nmi_backtrace.c
@@ -16,33 +16,14 @@
#include <linux/delay.h>
#include <linux/kprobes.h>
#include <linux/nmi.h>
-#include <linux/seq_buf.h>
#ifdef arch_trigger_all_cpu_backtrace
/* For reliability, we're prepared to waste bits here. */
static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
-static cpumask_t printtrace_mask;
-
-#define NMI_BUF_SIZE 4096
-
-struct nmi_seq_buf {
- unsigned char buffer[NMI_BUF_SIZE];
- struct seq_buf seq;
-};
-
-/* Safe printing in NMI context */
-static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
/* "in progress" flag of arch_trigger_all_cpu_backtrace */
static unsigned long backtrace_flag;
-static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
-{
- const char *buf = s->buffer + start;
-
- printk("%.*s", (end - start) + 1, buf);
-}
-
/*
* When raise() is called it will be passed a pointer to the
* backtrace_mask. Architectures that call nmi_cpu_backtrace()
@@ -52,8 +33,7 @@ static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
void nmi_trigger_all_cpu_backtrace(bool include_self,
void (*raise)(cpumask_t *mask))
{
- struct nmi_seq_buf *s;
- int i, cpu, this_cpu = get_cpu();
+ int i, this_cpu = get_cpu();
if (test_and_set_bit(0, &backtrace_flag)) {
/*
@@ -68,17 +48,6 @@ void nmi_trigger_all_cpu_backtrace(bool include_self,
if (!include_self)
cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask));
- cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask));
-
- /*
- * Set up per_cpu seq_buf buffers that the NMIs running on the other
- * CPUs will write to.
- */
- for_each_cpu(cpu, to_cpumask(backtrace_mask)) {
- s = &per_cpu(nmi_print_seq, cpu);
- seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE);
- }
-
if (!cpumask_empty(to_cpumask(backtrace_mask))) {
pr_info("Sending NMI to %s CPUs:\n",
(include_self ? "all" : "other"));
@@ -94,73 +63,25 @@ void nmi_trigger_all_cpu_backtrace(bool include_self,
}
/*
- * Now that all the NMIs have triggered, we can dump out their
- * back traces safely to the console.
+ * Force flush any remote buffers that might be stuck in IRQ context
+ * and therefore could not run their irq_work.
*/
- for_each_cpu(cpu, &printtrace_mask) {
- int len, last_i = 0;
+ printk_nmi_flush();
- s = &per_cpu(nmi_print_seq, cpu);
- len = seq_buf_used(&s->seq);
- if (!len)
- continue;
-
- /* Print line by line. */
- for (i = 0; i < len; i++) {
- if (s->buffer[i] == '\n') {
- print_seq_line(s, last_i, i);
- last_i = i + 1;
- }
- }
- /* Check if there was a partial line. */
- if (last_i < len) {
- print_seq_line(s, last_i, len - 1);
- pr_cont("\n");
- }
- }
-
- clear_bit(0, &backtrace_flag);
- smp_mb__after_atomic();
+ clear_bit_unlock(0, &backtrace_flag);
put_cpu();
}
-/*
- * It is not safe to call printk() directly from NMI handlers.
- * It may be fine if the NMI detected a lock up and we have no choice
- * but to do so, but doing a NMI on all other CPUs to get a back trace
- * can be done with a sysrq-l. We don't want that to lock up, which
- * can happen if the NMI interrupts a printk in progress.
- *
- * Instead, we redirect the vprintk() to this nmi_vprintk() that writes
- * the content into a per cpu seq_buf buffer. Then when the NMIs are
- * all done, we can safely dump the contents of the seq_buf to a printk()
- * from a non NMI context.
- */
-static int nmi_vprintk(const char *fmt, va_list args)
-{
- struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
- unsigned int len = seq_buf_used(&s->seq);
-
- seq_buf_vprintf(&s->seq, fmt, args);
- return seq_buf_used(&s->seq) - len;
-}
-
bool nmi_cpu_backtrace(struct pt_regs *regs)
{
int cpu = smp_processor_id();
if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
- printk_func_t printk_func_save = this_cpu_read(printk_func);
-
- /* Replace printk to write into the NMI seq */
- this_cpu_write(printk_func, nmi_vprintk);
pr_warn("NMI backtrace for cpu %d\n", cpu);
if (regs)
show_regs(regs);
else
dump_stack();
- this_cpu_write(printk_func, printk_func_save);
-
cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
return true;
}
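
The hunk above hands NMI output to the per-CPU printk NMI buffers (flushed via printk_nmi_flush()) and replaces the clear_bit()/smp_mb__after_atomic() pair with clear_bit_unlock(). A rough userspace analogue of that flag protocol, not kernel code and with all names invented here, where the "in progress" bit behaves like a try-lock taken with acquire semantics and dropped with a release:

/*
 * Userspace analogue (C11 atomics) of the "in progress" flag protocol:
 * test_and_set_bit() acts as a try-lock with acquire semantics, and
 * clear_bit_unlock() pairs with it as the release, which is why the
 * separate clear_bit() + smp_mb__after_atomic() pair could go away.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_flag backtrace_flag = ATOMIC_FLAG_INIT;

static bool trigger_backtrace(void)
{
    /* like test_and_set_bit(0, &backtrace_flag) */
    if (atomic_flag_test_and_set_explicit(&backtrace_flag,
                                          memory_order_acquire)) {
        puts("backtrace already in progress, skipping");
        return false;
    }

    puts("...send NMIs, wait for them, flush the NMI printk buffers...");

    /* like clear_bit_unlock(0, &backtrace_flag) */
    atomic_flag_clear_explicit(&backtrace_flag, memory_order_release);
    return true;
}

int main(void)
{
    trigger_backtrace();
    return 0;
}
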
diff --git a/lib/nodemask.c b/lib/nodemask.c
new file mode 100644
index 000000000000..e42a5bf44d33
--- /dev/null
+++ b/lib/nodemask.c
@@ -0,0 +1,30 @@
+#include <linux/nodemask.h>
+#include <linux/module.h>
+#include <linux/random.h>
+
+int __next_node_in(int node, const nodemask_t *srcp)
+{
+ int ret = __next_node(node, srcp);
+
+ if (ret == MAX_NUMNODES)
+ ret = __first_node(srcp);
+ return ret;
+}
+EXPORT_SYMBOL(__next_node_in);
+
+#ifdef CONFIG_NUMA
+/*
+ * Return the bit number of a random bit set in the nodemask.
+ * (returns NUMA_NO_NODE if nodemask is empty)
+ */
+int node_random(const nodemask_t *maskp)
+{
+ int w, bit = NUMA_NO_NODE;
+
+ w = nodes_weight(*maskp);
+ if (w)
+ bit = bitmap_ord_to_pos(maskp->bits,
+ get_random_int() % w, MAX_NUMNODES);
+ return bit;
+}
+#endif
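
For illustration only, a standalone model of the wrap-around lookup that the new lib/nodemask.c helper provides; the kernel version works on nodemask_t and MAX_NUMNODES, so the plain unsigned mask and MAX_NODES below are stand-ins:

/*
 * Standalone model (not kernel code) of __next_node_in(): find the next
 * set bit after 'node', wrapping to the first set bit when the end of
 * the mask is reached.
 */
#include <stdio.h>

#define MAX_NODES 8

static int first_node_set(unsigned int mask)
{
    for (int i = 0; i < MAX_NODES; i++)
        if (mask & (1u << i))
            return i;
    return MAX_NODES;
}

static int next_node_set(int node, unsigned int mask)
{
    for (int i = node + 1; i < MAX_NODES; i++)
        if (mask & (1u << i))
            return i;
    return MAX_NODES;
}

static int next_node_in(int node, unsigned int mask)
{
    int ret = next_node_set(node, mask);

    if (ret == MAX_NODES)          /* ran off the end: wrap around */
        ret = first_node_set(mask);
    return ret;
}

int main(void)
{
    unsigned int online = 0x15;    /* nodes 0, 2 and 4 */
    int node = 0;

    for (int i = 0; i < 6; i++) {
        node = next_node_in(node, online);
        printf("next node: %d\n", node);    /* 2 4 0 2 4 0 */
    }
    return 0;
}
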
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index f051d69f0910..72d36113ccaa 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -19,7 +19,7 @@ static DEFINE_SPINLOCK(percpu_counters_lock);
static struct debug_obj_descr percpu_counter_debug_descr;
-static int percpu_counter_fixup_free(void *addr, enum debug_obj_state state)
+static bool percpu_counter_fixup_free(void *addr, enum debug_obj_state state)
{
struct percpu_counter *fbc = addr;
@@ -27,9 +27,9 @@ static int percpu_counter_fixup_free(void *addr, enum debug_obj_state state)
case ODEBUG_STATE_ACTIVE:
percpu_counter_destroy(fbc);
debug_object_free(fbc, &percpu_counter_debug_descr);
- return 1;
+ return true;
default:
- return 0;
+ return false;
}
}
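
The fixup callback now reports "was a fixup performed?" as a bool rather than an int, matching the debugobjects API change elsewhere in this series. A minimal standalone sketch of that calling convention only; the two-value enum and the printf()s are stand-ins, not the real debugobjects interface:

/*
 * Sketch of the bool-returning fixup convention.
 */
#include <stdbool.h>
#include <stdio.h>

enum debug_obj_state { ODEBUG_STATE_NONE, ODEBUG_STATE_ACTIVE };

static bool fixup_free(void *addr, enum debug_obj_state state)
{
    switch (state) {
    case ODEBUG_STATE_ACTIVE:
        printf("object %p still active at free time, fixing up\n", addr);
        return true;    /* a fixup was performed */
    default:
        return false;   /* nothing to do */
    }
}

int main(void)
{
    int obj;

    printf("fixed up: %d\n", fixup_free(&obj, ODEBUG_STATE_ACTIVE));
    printf("fixed up: %d\n", fixup_free(&obj, ODEBUG_STATE_NONE));
    return 0;
}
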
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 1624c4117961..8b7d8459bb9d 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -4,6 +4,8 @@
* Copyright (C) 2005 SGI, Christoph Lameter
* Copyright (C) 2006 Nick Piggin
* Copyright (C) 2012 Konstantin Khlebnikov
+ * Copyright (C) 2016 Intel, Matthew Wilcox
+ * Copyright (C) 2016 Intel, Ross Zwisler
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
@@ -37,12 +39,6 @@
/*
- * The height_to_maxindex array needs to be one deeper than the maximum
- * path as height 0 holds only 1 entry.
- */
-static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1] __read_mostly;
-
-/*
* Radix tree node cache.
*/
static struct kmem_cache *radix_tree_node_cachep;
@@ -64,20 +60,58 @@ static struct kmem_cache *radix_tree_node_cachep;
* Per-cpu pool of preloaded nodes
*/
struct radix_tree_preload {
- int nr;
+ unsigned nr;
/* nodes->private_data points to next preallocated node */
struct radix_tree_node *nodes;
};
static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
-static inline void *ptr_to_indirect(void *ptr)
+static inline void *node_to_entry(void *ptr)
+{
+ return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE);
+}
+
+#define RADIX_TREE_RETRY node_to_entry(NULL)
+
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
+/* Sibling slots point directly to another slot in the same node */
+static inline bool is_sibling_entry(struct radix_tree_node *parent, void *node)
+{
+ void **ptr = node;
+ return (parent->slots <= ptr) &&
+ (ptr < parent->slots + RADIX_TREE_MAP_SIZE);
+}
+#else
+static inline bool is_sibling_entry(struct radix_tree_node *parent, void *node)
{
- return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR);
+ return false;
}
+#endif
-static inline void *indirect_to_ptr(void *ptr)
+static inline unsigned long get_slot_offset(struct radix_tree_node *parent,
+ void **slot)
{
- return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR);
+ return slot - parent->slots;
+}
+
+static unsigned int radix_tree_descend(struct radix_tree_node *parent,
+ struct radix_tree_node **nodep, unsigned long index)
+{
+ unsigned int offset = (index >> parent->shift) & RADIX_TREE_MAP_MASK;
+ void **entry = rcu_dereference_raw(parent->slots[offset]);
+
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
+ if (radix_tree_is_internal_node(entry)) {
+ unsigned long siboff = get_slot_offset(parent, entry);
+ if (siboff < RADIX_TREE_MAP_SIZE) {
+ offset = siboff;
+ entry = rcu_dereference_raw(parent->slots[offset]);
+ }
+ }
+#endif
+
+ *nodep = (void *)entry;
+ return offset;
}
static inline gfp_t root_gfp_mask(struct radix_tree_root *root)
@@ -108,7 +142,7 @@ static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag)
root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT));
}
-static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag)
+static inline void root_tag_clear(struct radix_tree_root *root, unsigned tag)
{
root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT));
}
@@ -120,7 +154,12 @@ static inline void root_tag_clear_all(struct radix_tree_root *root)
static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag)
{
- return (__force unsigned)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT));
+ return (__force int)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT));
+}
+
+static inline unsigned root_tags_get(struct radix_tree_root *root)
+{
+ return (__force unsigned)root->gfp_mask >> __GFP_BITS_SHIFT;
}
/*
@@ -129,7 +168,7 @@ static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag)
*/
static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
{
- int idx;
+ unsigned idx;
for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
if (node->tags[tag][idx])
return 1;
@@ -173,38 +212,45 @@ radix_tree_find_next_bit(const unsigned long *addr,
return size;
}
-#if 0
-static void dump_node(void *slot, int height, int offset)
+#ifndef __KERNEL__
+static void dump_node(struct radix_tree_node *node, unsigned long index)
{
- struct radix_tree_node *node;
- int i;
+ unsigned long i;
- if (!slot)
- return;
+ pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d parent %p\n",
+ node, node->offset,
+ node->tags[0][0], node->tags[1][0], node->tags[2][0],
+ node->shift, node->count, node->parent);
- if (height == 0) {
- pr_debug("radix entry %p offset %d\n", slot, offset);
- return;
+ for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
+ unsigned long first = index | (i << node->shift);
+ unsigned long last = first | ((1UL << node->shift) - 1);
+ void *entry = node->slots[i];
+ if (!entry)
+ continue;
+ if (is_sibling_entry(node, entry)) {
+ pr_debug("radix sblng %p offset %ld val %p indices %ld-%ld\n",
+ entry, i,
+ *(void **)entry_to_node(entry),
+ first, last);
+ } else if (!radix_tree_is_internal_node(entry)) {
+ pr_debug("radix entry %p offset %ld indices %ld-%ld\n",
+ entry, i, first, last);
+ } else {
+ dump_node(entry_to_node(entry), first);
+ }
}
-
- node = indirect_to_ptr(slot);
- pr_debug("radix node: %p offset %d tags %lx %lx %lx path %x count %d parent %p\n",
- slot, offset, node->tags[0][0], node->tags[1][0],
- node->tags[2][0], node->path, node->count, node->parent);
-
- for (i = 0; i < RADIX_TREE_MAP_SIZE; i++)
- dump_node(node->slots[i], height - 1, i);
}
/* For debug */
static void radix_tree_dump(struct radix_tree_root *root)
{
- pr_debug("radix root: %p height %d rnode %p tags %x\n",
- root, root->height, root->rnode,
+ pr_debug("radix root: %p rnode %p tags %x\n",
+ root, root->rnode,
root->gfp_mask >> __GFP_BITS_SHIFT);
- if (!radix_tree_is_indirect_ptr(root->rnode))
+ if (!radix_tree_is_internal_node(root->rnode))
return;
- dump_node(root->rnode, root->height, 0);
+ dump_node(entry_to_node(root->rnode), 0);
}
#endif
@@ -219,9 +265,9 @@ radix_tree_node_alloc(struct radix_tree_root *root)
gfp_t gfp_mask = root_gfp_mask(root);
/*
- * Preload code isn't irq safe and it doesn't make sence to use
- * preloading in the interrupt anyway as all the allocations have to
- * be atomic. So just do normal allocation when in interrupt.
+ * Preload code isn't irq safe and it doesn't make sense to use
+ * preloading during an interrupt anyway as all the allocations have
+ * to be atomic. So just do normal allocation when in interrupt.
*/
if (!gfpflags_allow_blocking(gfp_mask) && !in_interrupt()) {
struct radix_tree_preload *rtp;
@@ -257,7 +303,7 @@ radix_tree_node_alloc(struct radix_tree_root *root)
ret = kmem_cache_alloc(radix_tree_node_cachep,
gfp_mask | __GFP_ACCOUNT);
out:
- BUG_ON(radix_tree_is_indirect_ptr(ret));
+ BUG_ON(radix_tree_is_internal_node(ret));
return ret;
}
@@ -357,38 +403,58 @@ int radix_tree_maybe_preload(gfp_t gfp_mask)
EXPORT_SYMBOL(radix_tree_maybe_preload);
/*
- * Return the maximum key which can be store into a
- * radix tree with height HEIGHT.
+ * The maximum index which can be stored in a radix tree
*/
-static inline unsigned long radix_tree_maxindex(unsigned int height)
+static inline unsigned long shift_maxindex(unsigned int shift)
+{
+ return (RADIX_TREE_MAP_SIZE << shift) - 1;
+}
+
+static inline unsigned long node_maxindex(struct radix_tree_node *node)
+{
+ return shift_maxindex(node->shift);
+}
+
+static unsigned radix_tree_load_root(struct radix_tree_root *root,
+ struct radix_tree_node **nodep, unsigned long *maxindex)
{
- return height_to_maxindex[height];
+ struct radix_tree_node *node = rcu_dereference_raw(root->rnode);
+
+ *nodep = node;
+
+ if (likely(radix_tree_is_internal_node(node))) {
+ node = entry_to_node(node);
+ *maxindex = node_maxindex(node);
+ return node->shift + RADIX_TREE_MAP_SHIFT;
+ }
+
+ *maxindex = 0;
+ return 0;
}
/*
* Extend a radix tree so it can store key @index.
*/
static int radix_tree_extend(struct radix_tree_root *root,
- unsigned long index, unsigned order)
+ unsigned long index, unsigned int shift)
{
- struct radix_tree_node *node;
struct radix_tree_node *slot;
- unsigned int height;
+ unsigned int maxshift;
int tag;
- /* Figure out what the height should be. */
- height = root->height + 1;
- while (index > radix_tree_maxindex(height))
- height++;
+ /* Figure out what the shift should be. */
+ maxshift = shift;
+ while (index > shift_maxindex(maxshift))
+ maxshift += RADIX_TREE_MAP_SHIFT;
- if ((root->rnode == NULL) && (order == 0)) {
- root->height = height;
+ slot = root->rnode;
+ if (!slot)
goto out;
- }
do {
- unsigned int newheight;
- if (!(node = radix_tree_node_alloc(root)))
+ struct radix_tree_node *node = radix_tree_node_alloc(root);
+
+ if (!node)
return -ENOMEM;
/* Propagate the aggregated tag info into the new root */
@@ -397,25 +463,20 @@ static int radix_tree_extend(struct radix_tree_root *root,
tag_set(node, tag, 0);
}
- /* Increase the height. */
- newheight = root->height+1;
- BUG_ON(newheight & ~RADIX_TREE_HEIGHT_MASK);
- node->path = newheight;
+ BUG_ON(shift > BITS_PER_LONG);
+ node->shift = shift;
+ node->offset = 0;
node->count = 1;
node->parent = NULL;
- slot = root->rnode;
- if (radix_tree_is_indirect_ptr(slot) && newheight > 1) {
- slot = indirect_to_ptr(slot);
- slot->parent = node;
- slot = ptr_to_indirect(slot);
- }
+ if (radix_tree_is_internal_node(slot))
+ entry_to_node(slot)->parent = node;
node->slots[0] = slot;
- node = ptr_to_indirect(node);
- rcu_assign_pointer(root->rnode, node);
- root->height = newheight;
- } while (height > root->height);
+ slot = node_to_entry(node);
+ rcu_assign_pointer(root->rnode, slot);
+ shift += RADIX_TREE_MAP_SHIFT;
+ } while (shift <= maxshift);
out:
- return 0;
+ return maxshift + RADIX_TREE_MAP_SHIFT;
}
/**
@@ -439,71 +500,70 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
unsigned order, struct radix_tree_node **nodep,
void ***slotp)
{
- struct radix_tree_node *node = NULL, *slot;
- unsigned int height, shift, offset;
- int error;
+ struct radix_tree_node *node = NULL, *child;
+ void **slot = (void **)&root->rnode;
+ unsigned long maxindex;
+ unsigned int shift, offset = 0;
+ unsigned long max = index | ((1UL << order) - 1);
- BUG_ON((0 < order) && (order < RADIX_TREE_MAP_SHIFT));
+ shift = radix_tree_load_root(root, &child, &maxindex);
/* Make sure the tree is high enough. */
- if (index > radix_tree_maxindex(root->height)) {
- error = radix_tree_extend(root, index, order);
- if (error)
+ if (max > maxindex) {
+ int error = radix_tree_extend(root, max, shift);
+ if (error < 0)
return error;
+ shift = error;
+ child = root->rnode;
+ if (order == shift)
+ shift += RADIX_TREE_MAP_SHIFT;
}
- slot = root->rnode;
-
- height = root->height;
- shift = height * RADIX_TREE_MAP_SHIFT;
-
- offset = 0; /* uninitialised var warning */
while (shift > order) {
- if (slot == NULL) {
+ shift -= RADIX_TREE_MAP_SHIFT;
+ if (child == NULL) {
/* Have to add a child node. */
- if (!(slot = radix_tree_node_alloc(root)))
+ child = radix_tree_node_alloc(root);
+ if (!child)
return -ENOMEM;
- slot->path = height;
- slot->parent = node;
- if (node) {
- rcu_assign_pointer(node->slots[offset],
- ptr_to_indirect(slot));
+ child->shift = shift;
+ child->offset = offset;
+ child->parent = node;
+ rcu_assign_pointer(*slot, node_to_entry(child));
+ if (node)
node->count++;
- slot->path |= offset << RADIX_TREE_HEIGHT_SHIFT;
- } else
- rcu_assign_pointer(root->rnode,
- ptr_to_indirect(slot));
- } else if (!radix_tree_is_indirect_ptr(slot))
+ } else if (!radix_tree_is_internal_node(child))
break;
/* Go a level down */
- height--;
- shift -= RADIX_TREE_MAP_SHIFT;
- offset = (index >> shift) & RADIX_TREE_MAP_MASK;
- node = indirect_to_ptr(slot);
- slot = node->slots[offset];
+ node = entry_to_node(child);
+ offset = radix_tree_descend(node, &child, index);
+ slot = &node->slots[offset];
}
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
/* Insert pointers to the canonical entry */
- if ((shift - order) > 0) {
- int i, n = 1 << (shift - order);
+ if (order > shift) {
+ unsigned i, n = 1 << (order - shift);
offset = offset & ~(n - 1);
- slot = ptr_to_indirect(&node->slots[offset]);
+ slot = &node->slots[offset];
+ child = node_to_entry(slot);
for (i = 0; i < n; i++) {
- if (node->slots[offset + i])
+ if (slot[i])
return -EEXIST;
}
for (i = 1; i < n; i++) {
- rcu_assign_pointer(node->slots[offset + i], slot);
+ rcu_assign_pointer(slot[i], child);
node->count++;
}
}
+#endif
if (nodep)
*nodep = node;
if (slotp)
- *slotp = node ? node->slots + offset : (void **)&root->rnode;
+ *slotp = slot;
return 0;
}
@@ -523,7 +583,7 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index,
void **slot;
int error;
- BUG_ON(radix_tree_is_indirect_ptr(item));
+ BUG_ON(radix_tree_is_internal_node(item));
error = __radix_tree_create(root, index, order, &node, &slot);
if (error)
@@ -533,12 +593,13 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index,
rcu_assign_pointer(*slot, item);
if (node) {
+ unsigned offset = get_slot_offset(node, slot);
node->count++;
- BUG_ON(tag_get(node, 0, index & RADIX_TREE_MAP_MASK));
- BUG_ON(tag_get(node, 1, index & RADIX_TREE_MAP_MASK));
+ BUG_ON(tag_get(node, 0, offset));
+ BUG_ON(tag_get(node, 1, offset));
+ BUG_ON(tag_get(node, 2, offset));
} else {
- BUG_ON(root_tag_get(root, 0));
- BUG_ON(root_tag_get(root, 1));
+ BUG_ON(root_tags_get(root));
}
return 0;
@@ -563,44 +624,25 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
struct radix_tree_node **nodep, void ***slotp)
{
struct radix_tree_node *node, *parent;
- unsigned int height, shift;
+ unsigned long maxindex;
void **slot;
- node = rcu_dereference_raw(root->rnode);
- if (node == NULL)
+ restart:
+ parent = NULL;
+ slot = (void **)&root->rnode;
+ radix_tree_load_root(root, &node, &maxindex);
+ if (index > maxindex)
return NULL;
- if (!radix_tree_is_indirect_ptr(node)) {
- if (index > 0)
- return NULL;
+ while (radix_tree_is_internal_node(node)) {
+ unsigned offset;
- if (nodep)
- *nodep = NULL;
- if (slotp)
- *slotp = (void **)&root->rnode;
- return node;
+ if (node == RADIX_TREE_RETRY)
+ goto restart;
+ parent = entry_to_node(node);
+ offset = radix_tree_descend(parent, &node, index);
+ slot = parent->slots + offset;
}
- node = indirect_to_ptr(node);
-
- height = node->path & RADIX_TREE_HEIGHT_MASK;
- if (index > radix_tree_maxindex(height))
- return NULL;
-
- shift = (height-1) * RADIX_TREE_MAP_SHIFT;
-
- do {
- parent = node;
- slot = node->slots + ((index >> shift) & RADIX_TREE_MAP_MASK);
- node = rcu_dereference_raw(*slot);
- if (node == NULL)
- return NULL;
- if (!radix_tree_is_indirect_ptr(node))
- break;
- node = indirect_to_ptr(node);
-
- shift -= RADIX_TREE_MAP_SHIFT;
- height--;
- } while (height > 0);
if (nodep)
*nodep = parent;
@@ -654,59 +696,72 @@ EXPORT_SYMBOL(radix_tree_lookup);
* radix_tree_tag_set - set a tag on a radix tree node
* @root: radix tree root
* @index: index key
- * @tag: tag index
+ * @tag: tag index
*
* Set the search tag (which must be < RADIX_TREE_MAX_TAGS)
* corresponding to @index in the radix tree. From
* the root all the way down to the leaf node.
*
- * Returns the address of the tagged item. Setting a tag on a not-present
+ * Returns the address of the tagged item. Setting a tag on a not-present
* item is a bug.
*/
void *radix_tree_tag_set(struct radix_tree_root *root,
unsigned long index, unsigned int tag)
{
- unsigned int height, shift;
- struct radix_tree_node *slot;
+ struct radix_tree_node *node, *parent;
+ unsigned long maxindex;
- height = root->height;
- BUG_ON(index > radix_tree_maxindex(height));
+ radix_tree_load_root(root, &node, &maxindex);
+ BUG_ON(index > maxindex);
- slot = indirect_to_ptr(root->rnode);
- shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+ while (radix_tree_is_internal_node(node)) {
+ unsigned offset;
- while (height > 0) {
- int offset;
+ parent = entry_to_node(node);
+ offset = radix_tree_descend(parent, &node, index);
+ BUG_ON(!node);
- offset = (index >> shift) & RADIX_TREE_MAP_MASK;
- if (!tag_get(slot, tag, offset))
- tag_set(slot, tag, offset);
- slot = slot->slots[offset];
- BUG_ON(slot == NULL);
- if (!radix_tree_is_indirect_ptr(slot))
- break;
- slot = indirect_to_ptr(slot);
- shift -= RADIX_TREE_MAP_SHIFT;
- height--;
+ if (!tag_get(parent, tag, offset))
+ tag_set(parent, tag, offset);
}
/* set the root's tag bit */
- if (slot && !root_tag_get(root, tag))
+ if (!root_tag_get(root, tag))
root_tag_set(root, tag);
- return slot;
+ return node;
}
EXPORT_SYMBOL(radix_tree_tag_set);
+static void node_tag_clear(struct radix_tree_root *root,
+ struct radix_tree_node *node,
+ unsigned int tag, unsigned int offset)
+{
+ while (node) {
+ if (!tag_get(node, tag, offset))
+ return;
+ tag_clear(node, tag, offset);
+ if (any_tag_set(node, tag))
+ return;
+
+ offset = node->offset;
+ node = node->parent;
+ }
+
+ /* clear the root's tag bit */
+ if (root_tag_get(root, tag))
+ root_tag_clear(root, tag);
+}
+
/**
* radix_tree_tag_clear - clear a tag on a radix tree node
* @root: radix tree root
* @index: index key
- * @tag: tag index
+ * @tag: tag index
*
* Clear the search tag (which must be < RADIX_TREE_MAX_TAGS)
- * corresponding to @index in the radix tree. If
- * this causes the leaf node to have no tags set then clear the tag in the
+ * corresponding to @index in the radix tree. If this causes
+ * the leaf node to have no tags set then clear the tag in the
* next-to-leaf node, etc.
*
* Returns the address of the tagged item on success, else NULL. ie:
@@ -715,52 +770,25 @@ EXPORT_SYMBOL(radix_tree_tag_set);
void *radix_tree_tag_clear(struct radix_tree_root *root,
unsigned long index, unsigned int tag)
{
- struct radix_tree_node *node = NULL;
- struct radix_tree_node *slot = NULL;
- unsigned int height, shift;
+ struct radix_tree_node *node, *parent;
+ unsigned long maxindex;
int uninitialized_var(offset);
- height = root->height;
- if (index > radix_tree_maxindex(height))
- goto out;
-
- shift = height * RADIX_TREE_MAP_SHIFT;
- slot = root->rnode;
-
- while (shift) {
- if (slot == NULL)
- goto out;
- if (!radix_tree_is_indirect_ptr(slot))
- break;
- slot = indirect_to_ptr(slot);
-
- shift -= RADIX_TREE_MAP_SHIFT;
- offset = (index >> shift) & RADIX_TREE_MAP_MASK;
- node = slot;
- slot = slot->slots[offset];
- }
-
- if (slot == NULL)
- goto out;
+ radix_tree_load_root(root, &node, &maxindex);
+ if (index > maxindex)
+ return NULL;
- while (node) {
- if (!tag_get(node, tag, offset))
- goto out;
- tag_clear(node, tag, offset);
- if (any_tag_set(node, tag))
- goto out;
+ parent = NULL;
- index >>= RADIX_TREE_MAP_SHIFT;
- offset = index & RADIX_TREE_MAP_MASK;
- node = node->parent;
+ while (radix_tree_is_internal_node(node)) {
+ parent = entry_to_node(node);
+ offset = radix_tree_descend(parent, &node, index);
}
- /* clear the root's tag bit */
- if (root_tag_get(root, tag))
- root_tag_clear(root, tag);
+ if (node)
+ node_tag_clear(root, parent, tag, offset);
-out:
- return slot;
+ return node;
}
EXPORT_SYMBOL(radix_tree_tag_clear);
@@ -768,7 +796,7 @@ EXPORT_SYMBOL(radix_tree_tag_clear);
* radix_tree_tag_get - get a tag on a radix tree node
* @root: radix tree root
* @index: index key
- * @tag: tag index (< RADIX_TREE_MAX_TAGS)
+ * @tag: tag index (< RADIX_TREE_MAX_TAGS)
*
* Return values:
*
@@ -782,48 +810,44 @@ EXPORT_SYMBOL(radix_tree_tag_clear);
int radix_tree_tag_get(struct radix_tree_root *root,
unsigned long index, unsigned int tag)
{
- unsigned int height, shift;
- struct radix_tree_node *node;
+ struct radix_tree_node *node, *parent;
+ unsigned long maxindex;
- /* check the root's tag bit */
if (!root_tag_get(root, tag))
return 0;
- node = rcu_dereference_raw(root->rnode);
- if (node == NULL)
+ radix_tree_load_root(root, &node, &maxindex);
+ if (index > maxindex)
return 0;
-
- if (!radix_tree_is_indirect_ptr(node))
- return (index == 0);
- node = indirect_to_ptr(node);
-
- height = node->path & RADIX_TREE_HEIGHT_MASK;
- if (index > radix_tree_maxindex(height))
+ if (node == NULL)
return 0;
- shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+ while (radix_tree_is_internal_node(node)) {
+ unsigned offset;
- for ( ; ; ) {
- int offset;
+ parent = entry_to_node(node);
+ offset = radix_tree_descend(parent, &node, index);
- if (node == NULL)
+ if (!node)
return 0;
- node = indirect_to_ptr(node);
-
- offset = (index >> shift) & RADIX_TREE_MAP_MASK;
- if (!tag_get(node, tag, offset))
+ if (!tag_get(parent, tag, offset))
return 0;
- if (height == 1)
- return 1;
- node = rcu_dereference_raw(node->slots[offset]);
- if (!radix_tree_is_indirect_ptr(node))
- return 1;
- shift -= RADIX_TREE_MAP_SHIFT;
- height--;
+ if (node == RADIX_TREE_RETRY)
+ break;
}
+
+ return 1;
}
EXPORT_SYMBOL(radix_tree_tag_get);
+static inline void __set_iter_shift(struct radix_tree_iter *iter,
+ unsigned int shift)
+{
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
+ iter->shift = shift;
+#endif
+}
+
/**
* radix_tree_next_chunk - find next chunk of slots for iteration
*
@@ -835,9 +859,9 @@ EXPORT_SYMBOL(radix_tree_tag_get);
void **radix_tree_next_chunk(struct radix_tree_root *root,
struct radix_tree_iter *iter, unsigned flags)
{
- unsigned shift, tag = flags & RADIX_TREE_ITER_TAG_MASK;
- struct radix_tree_node *rnode, *node;
- unsigned long index, offset, height;
+ unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK;
+ struct radix_tree_node *node, *child;
+ unsigned long index, offset, maxindex;
if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag))
return NULL;
@@ -855,33 +879,28 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
if (!index && iter->index)
return NULL;
- rnode = rcu_dereference_raw(root->rnode);
- if (radix_tree_is_indirect_ptr(rnode)) {
- rnode = indirect_to_ptr(rnode);
- } else if (rnode && !index) {
+ restart:
+ radix_tree_load_root(root, &child, &maxindex);
+ if (index > maxindex)
+ return NULL;
+ if (!child)
+ return NULL;
+
+ if (!radix_tree_is_internal_node(child)) {
/* Single-slot tree */
- iter->index = 0;
- iter->next_index = 1;
+ iter->index = index;
+ iter->next_index = maxindex + 1;
iter->tags = 1;
+ __set_iter_shift(iter, 0);
return (void **)&root->rnode;
- } else
- return NULL;
-
-restart:
- height = rnode->path & RADIX_TREE_HEIGHT_MASK;
- shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
- offset = index >> shift;
+ }
- /* Index outside of the tree */
- if (offset >= RADIX_TREE_MAP_SIZE)
- return NULL;
+ do {
+ node = entry_to_node(child);
+ offset = radix_tree_descend(node, &child, index);
- node = rnode;
- while (1) {
- struct radix_tree_node *slot;
if ((flags & RADIX_TREE_ITER_TAGGED) ?
- !test_bit(offset, node->tags[tag]) :
- !node->slots[offset]) {
+ !tag_get(node, tag, offset) : !child) {
/* Hole detected */
if (flags & RADIX_TREE_ITER_CONTIG)
return NULL;
@@ -893,35 +912,30 @@ restart:
offset + 1);
else
while (++offset < RADIX_TREE_MAP_SIZE) {
- if (node->slots[offset])
+ void *slot = node->slots[offset];
+ if (is_sibling_entry(node, slot))
+ continue;
+ if (slot)
break;
}
- index &= ~((RADIX_TREE_MAP_SIZE << shift) - 1);
- index += offset << shift;
+ index &= ~node_maxindex(node);
+ index += offset << node->shift;
/* Overflow after ~0UL */
if (!index)
return NULL;
if (offset == RADIX_TREE_MAP_SIZE)
goto restart;
+ child = rcu_dereference_raw(node->slots[offset]);
}
- /* This is leaf-node */
- if (!shift)
- break;
-
- slot = rcu_dereference_raw(node->slots[offset]);
- if (slot == NULL)
+ if ((child == NULL) || (child == RADIX_TREE_RETRY))
goto restart;
- if (!radix_tree_is_indirect_ptr(slot))
- break;
- node = indirect_to_ptr(slot);
- shift -= RADIX_TREE_MAP_SHIFT;
- offset = (index >> shift) & RADIX_TREE_MAP_MASK;
- }
+ } while (radix_tree_is_internal_node(child));
/* Update the iterator state */
- iter->index = index;
- iter->next_index = (index | RADIX_TREE_MAP_MASK) + 1;
+ iter->index = (index &~ node_maxindex(node)) | (offset << node->shift);
+ iter->next_index = (index | node_maxindex(node)) + 1;
+ __set_iter_shift(iter, node->shift);
/* Construct iter->tags bit-mask from node->tags[tag] array */
if (flags & RADIX_TREE_ITER_TAGGED) {
@@ -967,7 +981,7 @@ EXPORT_SYMBOL(radix_tree_next_chunk);
* set is outside the range we are scanning. This results in dangling tags and
* can lead to problems with later tag operations (e.g. livelocks on lookups).
*
- * The function returns number of leaves where the tag was set and sets
+ * The function returns the number of leaves where the tag was set and sets
* *first_indexp to the first unscanned index.
* WARNING! *first_indexp can wrap if last_index is ULONG_MAX. Caller must
* be prepared to handle that.
@@ -977,14 +991,13 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
unsigned long nr_to_tag,
unsigned int iftag, unsigned int settag)
{
- unsigned int height = root->height;
- struct radix_tree_node *node = NULL;
- struct radix_tree_node *slot;
- unsigned int shift;
+ struct radix_tree_node *parent, *node, *child;
+ unsigned long maxindex;
unsigned long tagged = 0;
unsigned long index = *first_indexp;
- last_index = min(last_index, radix_tree_maxindex(height));
+ radix_tree_load_root(root, &child, &maxindex);
+ last_index = min(last_index, maxindex);
if (index > last_index)
return 0;
if (!nr_to_tag)
@@ -993,80 +1006,62 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
*first_indexp = last_index + 1;
return 0;
}
- if (height == 0) {
+ if (!radix_tree_is_internal_node(child)) {
*first_indexp = last_index + 1;
root_tag_set(root, settag);
return 1;
}
- shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
- slot = indirect_to_ptr(root->rnode);
+ node = entry_to_node(child);
for (;;) {
- unsigned long upindex;
- int offset;
-
- offset = (index >> shift) & RADIX_TREE_MAP_MASK;
- if (!slot->slots[offset])
+ unsigned offset = radix_tree_descend(node, &child, index);
+ if (!child)
goto next;
- if (!tag_get(slot, iftag, offset))
+ if (!tag_get(node, iftag, offset))
goto next;
- if (shift) {
- node = slot;
- slot = slot->slots[offset];
- if (radix_tree_is_indirect_ptr(slot)) {
- slot = indirect_to_ptr(slot);
- shift -= RADIX_TREE_MAP_SHIFT;
- continue;
- } else {
- slot = node;
- node = node->parent;
- }
+ /* Sibling slots never have tags set on them */
+ if (radix_tree_is_internal_node(child)) {
+ node = entry_to_node(child);
+ continue;
}
/* tag the leaf */
- tagged += 1 << shift;
- tag_set(slot, settag, offset);
+ tagged++;
+ tag_set(node, settag, offset);
/* walk back up the path tagging interior nodes */
- upindex = index;
- while (node) {
- upindex >>= RADIX_TREE_MAP_SHIFT;
- offset = upindex & RADIX_TREE_MAP_MASK;
-
+ parent = node;
+ for (;;) {
+ offset = parent->offset;
+ parent = parent->parent;
+ if (!parent)
+ break;
/* stop if we find a node with the tag already set */
- if (tag_get(node, settag, offset))
+ if (tag_get(parent, settag, offset))
break;
- tag_set(node, settag, offset);
- node = node->parent;
+ tag_set(parent, settag, offset);
}
-
- /*
- * Small optimization: now clear that node pointer.
- * Since all of this slot's ancestors now have the tag set
- * from setting it above, we have no further need to walk
- * back up the tree setting tags, until we update slot to
- * point to another radix_tree_node.
- */
- node = NULL;
-
-next:
- /* Go to next item at level determined by 'shift' */
- index = ((index >> shift) + 1) << shift;
+ next:
+ /* Go to next entry in node */
+ index = ((index >> node->shift) + 1) << node->shift;
/* Overflow can happen when last_index is ~0UL... */
if (index > last_index || !index)
break;
- if (tagged >= nr_to_tag)
- break;
- while (((index >> shift) & RADIX_TREE_MAP_MASK) == 0) {
+ offset = (index >> node->shift) & RADIX_TREE_MAP_MASK;
+ while (offset == 0) {
/*
* We've fully scanned this node. Go up. Because
* last_index is guaranteed to be in the tree, what
* we do below cannot wander astray.
*/
- slot = slot->parent;
- shift += RADIX_TREE_MAP_SHIFT;
+ node = node->parent;
+ offset = (index >> node->shift) & RADIX_TREE_MAP_MASK;
}
+ if (is_sibling_entry(node, node->slots[offset]))
+ goto next;
+ if (tagged >= nr_to_tag)
+ break;
}
/*
* We need not to tag the root tag if there is no tag which is set with
@@ -1095,9 +1090,10 @@ EXPORT_SYMBOL(radix_tree_range_tag_if_tagged);
*
* Like radix_tree_lookup, radix_tree_gang_lookup may be called under
* rcu_read_lock. In this case, rather than the returned results being
- * an atomic snapshot of the tree at a single point in time, the semantics
- * of an RCU protected gang lookup are as though multiple radix_tree_lookups
- * have been issued in individual locks, and results stored in 'results'.
+ * an atomic snapshot of the tree at a single point in time, the
+ * semantics of an RCU protected gang lookup are as though multiple
+ * radix_tree_lookups have been issued in individual locks, and results
+ * stored in 'results'.
*/
unsigned int
radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
@@ -1114,7 +1110,7 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
results[ret] = rcu_dereference_raw(*slot);
if (!results[ret])
continue;
- if (radix_tree_is_indirect_ptr(results[ret])) {
+ if (radix_tree_is_internal_node(results[ret])) {
slot = radix_tree_iter_retry(&iter);
continue;
}
@@ -1197,7 +1193,7 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
results[ret] = rcu_dereference_raw(*slot);
if (!results[ret])
continue;
- if (radix_tree_is_indirect_ptr(results[ret])) {
+ if (radix_tree_is_internal_node(results[ret])) {
slot = radix_tree_iter_retry(&iter);
continue;
}
@@ -1247,58 +1243,48 @@ EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot);
#if defined(CONFIG_SHMEM) && defined(CONFIG_SWAP)
#include <linux/sched.h> /* for cond_resched() */
+struct locate_info {
+ unsigned long found_index;
+ bool stop;
+};
+
/*
* This linear search is at present only useful to shmem_unuse_inode().
*/
static unsigned long __locate(struct radix_tree_node *slot, void *item,
- unsigned long index, unsigned long *found_index)
+ unsigned long index, struct locate_info *info)
{
- unsigned int shift, height;
unsigned long i;
- height = slot->path & RADIX_TREE_HEIGHT_MASK;
- shift = (height-1) * RADIX_TREE_MAP_SHIFT;
-
- for ( ; height > 1; height--) {
- i = (index >> shift) & RADIX_TREE_MAP_MASK;
- for (;;) {
- if (slot->slots[i] != NULL)
- break;
- index &= ~((1UL << shift) - 1);
- index += 1UL << shift;
- if (index == 0)
- goto out; /* 32-bit wraparound */
- i++;
- if (i == RADIX_TREE_MAP_SIZE)
+ do {
+ unsigned int shift = slot->shift;
+
+ for (i = (index >> shift) & RADIX_TREE_MAP_MASK;
+ i < RADIX_TREE_MAP_SIZE;
+ i++, index += (1UL << shift)) {
+ struct radix_tree_node *node =
+ rcu_dereference_raw(slot->slots[i]);
+ if (node == RADIX_TREE_RETRY)
goto out;
- }
-
- slot = rcu_dereference_raw(slot->slots[i]);
- if (slot == NULL)
- goto out;
- if (!radix_tree_is_indirect_ptr(slot)) {
- if (slot == item) {
- *found_index = index + i;
- index = 0;
- } else {
- index += shift;
+ if (!radix_tree_is_internal_node(node)) {
+ if (node == item) {
+ info->found_index = index;
+ info->stop = true;
+ goto out;
+ }
+ continue;
}
- goto out;
+ node = entry_to_node(node);
+ if (is_sibling_entry(slot, node))
+ continue;
+ slot = node;
+ break;
}
- slot = indirect_to_ptr(slot);
- shift -= RADIX_TREE_MAP_SHIFT;
- }
+ } while (i < RADIX_TREE_MAP_SIZE);
- /* Bottom level: check items */
- for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
- if (slot->slots[i] == item) {
- *found_index = index + i;
- index = 0;
- goto out;
- }
- }
- index += RADIX_TREE_MAP_SIZE;
out:
+ if ((index == 0) && (i == RADIX_TREE_MAP_SIZE))
+ info->stop = true;
return index;
}
@@ -1316,32 +1302,35 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
struct radix_tree_node *node;
unsigned long max_index;
unsigned long cur_index = 0;
- unsigned long found_index = -1;
+ struct locate_info info = {
+ .found_index = -1,
+ .stop = false,
+ };
do {
rcu_read_lock();
node = rcu_dereference_raw(root->rnode);
- if (!radix_tree_is_indirect_ptr(node)) {
+ if (!radix_tree_is_internal_node(node)) {
rcu_read_unlock();
if (node == item)
- found_index = 0;
+ info.found_index = 0;
break;
}
- node = indirect_to_ptr(node);
- max_index = radix_tree_maxindex(node->path &
- RADIX_TREE_HEIGHT_MASK);
+ node = entry_to_node(node);
+
+ max_index = node_maxindex(node);
if (cur_index > max_index) {
rcu_read_unlock();
break;
}
- cur_index = __locate(node, item, cur_index, &found_index);
+ cur_index = __locate(node, item, cur_index, &info);
rcu_read_unlock();
cond_resched();
- } while (cur_index != 0 && cur_index <= max_index);
+ } while (!info.stop && cur_index <= max_index);
- return found_index;
+ return info.found_index;
}
#else
unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
@@ -1351,47 +1340,45 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
#endif /* CONFIG_SHMEM && CONFIG_SWAP */
/**
- * radix_tree_shrink - shrink height of a radix tree to minimal
+ * radix_tree_shrink - shrink radix tree to minimum height
* @root radix tree root
*/
-static inline void radix_tree_shrink(struct radix_tree_root *root)
+static inline bool radix_tree_shrink(struct radix_tree_root *root)
{
- /* try to shrink tree height */
- while (root->height > 0) {
- struct radix_tree_node *to_free = root->rnode;
- struct radix_tree_node *slot;
+ bool shrunk = false;
+
+ for (;;) {
+ struct radix_tree_node *node = root->rnode;
+ struct radix_tree_node *child;
- BUG_ON(!radix_tree_is_indirect_ptr(to_free));
- to_free = indirect_to_ptr(to_free);
+ if (!radix_tree_is_internal_node(node))
+ break;
+ node = entry_to_node(node);
/*
* If the candidate node has more than one child, or its child
- * is not at the leftmost slot, or it is a multiorder entry,
- * we cannot shrink.
+ * is not at the leftmost slot, or the child is a multiorder
+ * entry, we cannot shrink.
*/
- if (to_free->count != 1)
+ if (node->count != 1)
break;
- slot = to_free->slots[0];
- if (!slot)
+ child = node->slots[0];
+ if (!child)
break;
+ if (!radix_tree_is_internal_node(child) && node->shift)
+ break;
+
+ if (radix_tree_is_internal_node(child))
+ entry_to_node(child)->parent = NULL;
/*
* We don't need rcu_assign_pointer(), since we are simply
* moving the node from one part of the tree to another: if it
* was safe to dereference the old pointer to it
- * (to_free->slots[0]), it will be safe to dereference the new
+ * (node->slots[0]), it will be safe to dereference the new
* one (root->rnode) as far as dependent read barriers go.
*/
- if (root->height > 1) {
- if (!radix_tree_is_indirect_ptr(slot))
- break;
-
- slot = indirect_to_ptr(slot);
- slot->parent = NULL;
- slot = ptr_to_indirect(slot);
- }
- root->rnode = slot;
- root->height--;
+ root->rnode = child;
/*
* We have a dilemma here. The node's slot[0] must not be
@@ -1403,7 +1390,7 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
* their slot to become empty sooner or later.
*
* For example, lockless pagecache will look up a slot, deref
- * the page pointer, and if the page is 0 refcount it means it
+ * the page pointer, and if the page has 0 refcount it means it
* was concurrently deleted from pagecache so try the deref
* again. Fortunately there is already a requirement for logic
* to retry the entire slot lookup -- the indirect pointer
@@ -1411,12 +1398,14 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
* also results in a stale slot). So tag the slot as indirect
* to force callers to retry.
*/
- if (root->height == 0)
- *((unsigned long *)&to_free->slots[0]) |=
- RADIX_TREE_INDIRECT_PTR;
+ if (!radix_tree_is_internal_node(child))
+ node->slots[0] = RADIX_TREE_RETRY;
- radix_tree_node_free(to_free);
+ radix_tree_node_free(node);
+ shrunk = true;
}
+
+ return shrunk;
}
/**
@@ -1439,24 +1428,17 @@ bool __radix_tree_delete_node(struct radix_tree_root *root,
struct radix_tree_node *parent;
if (node->count) {
- if (node == indirect_to_ptr(root->rnode)) {
- radix_tree_shrink(root);
- if (root->height == 0)
- deleted = true;
- }
+ if (node == entry_to_node(root->rnode))
+ deleted |= radix_tree_shrink(root);
return deleted;
}
parent = node->parent;
if (parent) {
- unsigned int offset;
-
- offset = node->path >> RADIX_TREE_HEIGHT_SHIFT;
- parent->slots[offset] = NULL;
+ parent->slots[node->offset] = NULL;
parent->count--;
} else {
root_tag_clear_all(root);
- root->height = 0;
root->rnode = NULL;
}
@@ -1469,6 +1451,20 @@ bool __radix_tree_delete_node(struct radix_tree_root *root,
return deleted;
}
+static inline void delete_sibling_entries(struct radix_tree_node *node,
+ void *ptr, unsigned offset)
+{
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
+ int i;
+ for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) {
+ if (node->slots[offset + i] != ptr)
+ break;
+ node->slots[offset + i] = NULL;
+ node->count--;
+ }
+#endif
+}
+
/**
* radix_tree_delete_item - delete an item from a radix tree
* @root: radix tree root
@@ -1484,7 +1480,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
unsigned long index, void *item)
{
struct radix_tree_node *node;
- unsigned int offset, i;
+ unsigned int offset;
void **slot;
void *entry;
int tag;
@@ -1502,24 +1498,13 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
return entry;
}
- offset = index & RADIX_TREE_MAP_MASK;
+ offset = get_slot_offset(node, slot);
- /*
- * Clear all tags associated with the item to be deleted.
- * This way of doing it would be inefficient, but seldom is any set.
- */
- for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
- if (tag_get(node, tag, offset))
- radix_tree_tag_clear(root, index, tag);
- }
+ /* Clear all tags associated with the item to be deleted. */
+ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+ node_tag_clear(root, node, tag, offset);
- /* Delete any sibling slots pointing to this slot */
- for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) {
- if (node->slots[offset + i] != ptr_to_indirect(slot))
- break;
- node->slots[offset + i] = NULL;
- node->count--;
- }
+ delete_sibling_entries(node, node_to_entry(slot), offset);
node->slots[offset] = NULL;
node->count--;
@@ -1544,6 +1529,28 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
}
EXPORT_SYMBOL(radix_tree_delete);
+struct radix_tree_node *radix_tree_replace_clear_tags(
+ struct radix_tree_root *root,
+ unsigned long index, void *entry)
+{
+ struct radix_tree_node *node;
+ void **slot;
+
+ __radix_tree_lookup(root, index, &node, &slot);
+
+ if (node) {
+ unsigned int tag, offset = get_slot_offset(node, slot);
+ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+ node_tag_clear(root, node, tag, offset);
+ } else {
+ /* Clear root node tags */
+ root->gfp_mask &= __GFP_BITS_MASK;
+ }
+
+ radix_tree_replace_slot(slot, entry);
+ return node;
+}
+
/**
* radix_tree_tagged - test whether any items in the tree are tagged
* @root: radix tree root
@@ -1564,45 +1571,24 @@ radix_tree_node_ctor(void *arg)
INIT_LIST_HEAD(&node->private_list);
}
-static __init unsigned long __maxindex(unsigned int height)
-{
- unsigned int width = height * RADIX_TREE_MAP_SHIFT;
- int shift = RADIX_TREE_INDEX_BITS - width;
-
- if (shift < 0)
- return ~0UL;
- if (shift >= BITS_PER_LONG)
- return 0UL;
- return ~0UL >> shift;
-}
-
-static __init void radix_tree_init_maxindex(void)
-{
- unsigned int i;
-
- for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++)
- height_to_maxindex[i] = __maxindex(i);
-}
-
static int radix_tree_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+ unsigned long action, void *hcpu)
{
- int cpu = (long)hcpu;
- struct radix_tree_preload *rtp;
- struct radix_tree_node *node;
-
- /* Free per-cpu pool of perloaded nodes */
- if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
- rtp = &per_cpu(radix_tree_preloads, cpu);
- while (rtp->nr) {
+ int cpu = (long)hcpu;
+ struct radix_tree_preload *rtp;
+ struct radix_tree_node *node;
+
+ /* Free per-cpu pool of preloaded nodes */
+ if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+ rtp = &per_cpu(radix_tree_preloads, cpu);
+ while (rtp->nr) {
node = rtp->nodes;
rtp->nodes = node->private_data;
kmem_cache_free(radix_tree_node_cachep, node);
rtp->nr--;
- }
- }
- return NOTIFY_OK;
+ }
+ }
+ return NOTIFY_OK;
}
void __init radix_tree_init(void)
@@ -1611,6 +1597,5 @@ void __init radix_tree_init(void)
sizeof(struct radix_tree_node), 0,
SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
radix_tree_node_ctor);
- radix_tree_init_maxindex();
hotcpu_notifier(radix_tree_callback, 0);
}
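
The radix-tree rework above drops the global height_to_maxindex[] table and the per-root height in favour of a per-node shift. A standalone model of the resulting index arithmetic, assuming a RADIX_TREE_MAP_SHIFT of 6 (the default), showing how radix_tree_extend() now sizes the tree and how radix_tree_descend() turns an index into a slot offset at each level:

/*
 * Standalone model (not kernel code) of the shift-based arithmetic.
 */
#include <stdio.h>

#define MAP_SHIFT   6
#define MAP_SIZE    (1UL << MAP_SHIFT)
#define MAP_MASK    (MAP_SIZE - 1)

/* mirrors shift_maxindex(): largest index reachable below this shift */
static unsigned long shift_maxindex(unsigned int shift)
{
    return (MAP_SIZE << shift) - 1;
}

int main(void)
{
    unsigned long index = 1000000;
    unsigned int shift = 0;

    /* grow until the index fits, as radix_tree_extend() does */
    while (index > shift_maxindex(shift))
        shift += MAP_SHIFT;
    printf("index %lu needs root shift %u (maxindex %lu)\n",
           index, shift, shift_maxindex(shift));

    /* walk down, one offset per level, as radix_tree_descend() does */
    for (int s = shift; s >= 0; s -= MAP_SHIFT)
        printf("shift %2d -> offset %lu\n", s, (index >> s) & MAP_MASK);

    return 0;
}
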
diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c
index 33840324138c..33f655ef48cd 100644
--- a/lib/strncpy_from_user.c
+++ b/lib/strncpy_from_user.c
@@ -1,5 +1,6 @@
#include <linux/compiler.h>
#include <linux/export.h>
+#include <linux/kasan-checks.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
#include <linux/errno.h>
@@ -109,6 +110,7 @@ long strncpy_from_user(char *dst, const char __user *src, long count)
unsigned long max = max_addr - src_addr;
long retval;
+ kasan_check_write(dst, count);
user_access_begin();
retval = do_strncpy_from_user(dst, src, count, max);
user_access_end();
diff --git a/lib/test_kasan.c b/lib/test_kasan.c
index 82169fbf2453..c640fdbf1fba 100644
--- a/lib/test_kasan.c
+++ b/lib/test_kasan.c
@@ -12,9 +12,12 @@
#define pr_fmt(fmt) "kasan test: %s " fmt, __func__
#include <linux/kernel.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/string.h>
+#include <linux/uaccess.h>
#include <linux/module.h>
static noinline void __init kmalloc_oob_right(void)
@@ -344,6 +347,96 @@ static noinline void __init kasan_stack_oob(void)
*(volatile char *)p;
}
+#ifdef CONFIG_SLAB
+static noinline void __init kasan_quarantine_cache(void)
+{
+ struct kmem_cache *cache = kmem_cache_create(
+ "test", 137, 8, GFP_KERNEL, NULL);
+ int i;
+
+ for (i = 0; i < 100; i++) {
+ void *p = kmem_cache_alloc(cache, GFP_KERNEL);
+
+ kmem_cache_free(cache, p);
+ p = kmalloc(sizeof(u64), GFP_KERNEL);
+ kfree(p);
+ }
+ kmem_cache_shrink(cache);
+ for (i = 0; i < 100; i++) {
+ u64 *p = kmem_cache_alloc(cache, GFP_KERNEL);
+
+ kmem_cache_free(cache, p);
+ p = kmalloc(sizeof(u64), GFP_KERNEL);
+ kfree(p);
+ }
+ kmem_cache_destroy(cache);
+}
+#endif
+
+static noinline void __init ksize_unpoisons_memory(void)
+{
+ char *ptr;
+ size_t size = 123, real_size = size;
+
+ pr_info("ksize() unpoisons the whole allocated chunk\n");
+ ptr = kmalloc(size, GFP_KERNEL);
+ if (!ptr) {
+ pr_err("Allocation failed\n");
+ return;
+ }
+ real_size = ksize(ptr);
+ /* This access doesn't trigger an error. */
+ ptr[size] = 'x';
+ /* This one does. */
+ ptr[real_size] = 'y';
+ kfree(ptr);
+}
+
+static noinline void __init copy_user_test(void)
+{
+ char *kmem;
+ char __user *usermem;
+ size_t size = 10;
+ int unused;
+
+ kmem = kmalloc(size, GFP_KERNEL);
+ if (!kmem)
+ return;
+
+ usermem = (char __user *)vm_mmap(NULL, 0, PAGE_SIZE,
+ PROT_READ | PROT_WRITE | PROT_EXEC,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0);
+ if (IS_ERR(usermem)) {
+ pr_err("Failed to allocate user memory\n");
+ kfree(kmem);
+ return;
+ }
+
+ pr_info("out-of-bounds in copy_from_user()\n");
+ unused = copy_from_user(kmem, usermem, size + 1);
+
+ pr_info("out-of-bounds in copy_to_user()\n");
+ unused = copy_to_user(usermem, kmem, size + 1);
+
+ pr_info("out-of-bounds in __copy_from_user()\n");
+ unused = __copy_from_user(kmem, usermem, size + 1);
+
+ pr_info("out-of-bounds in __copy_to_user()\n");
+ unused = __copy_to_user(usermem, kmem, size + 1);
+
+ pr_info("out-of-bounds in __copy_from_user_inatomic()\n");
+ unused = __copy_from_user_inatomic(kmem, usermem, size + 1);
+
+ pr_info("out-of-bounds in __copy_to_user_inatomic()\n");
+ unused = __copy_to_user_inatomic(usermem, kmem, size + 1);
+
+ pr_info("out-of-bounds in strncpy_from_user()\n");
+ unused = strncpy_from_user(kmem, usermem, size + 1);
+
+ vm_munmap((unsigned long)usermem, PAGE_SIZE);
+ kfree(kmem);
+}
+
static int __init kmalloc_tests_init(void)
{
kmalloc_oob_right();
@@ -367,6 +460,11 @@ static int __init kmalloc_tests_init(void)
kmem_cache_oob();
kasan_stack_oob();
kasan_global_oob();
+#ifdef CONFIG_SLAB
+ kasan_quarantine_cache();
+#endif
+ ksize_unpoisons_memory();
+ copy_user_test();
return -EAGAIN;
}
diff --git a/lib/uuid.c b/lib/uuid.c
index 398821e4dce1..e116ae5fa00f 100644
--- a/lib/uuid.c
+++ b/lib/uuid.c
@@ -1,7 +1,7 @@
/*
* Unified UUID/GUID definition
*
- * Copyright (C) 2009, Intel Corp.
+ * Copyright (C) 2009, 2016 Intel Corp.
* Huang Ying <ying.huang@intel.com>
*
* This program is free software; you can redistribute it and/or
@@ -12,17 +12,40 @@
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/errno.h>
#include <linux/export.h>
#include <linux/uuid.h>
#include <linux/random.h>
+const u8 uuid_le_index[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15};
+EXPORT_SYMBOL(uuid_le_index);
+const u8 uuid_be_index[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+EXPORT_SYMBOL(uuid_be_index);
+
+/***************************************************************
+ * Random UUID interface
+ *
+ * Used here for a Boot ID, but can be useful for other kernel
+ * drivers.
+ ***************************************************************/
+
+/*
+ * Generate random UUID
+ */
+void generate_random_uuid(unsigned char uuid[16])
+{
+ get_random_bytes(uuid, 16);
+ /* Set UUID version to 4 --- truly random generation */
+ uuid[6] = (uuid[6] & 0x0F) | 0x40;
+ /* Set the UUID variant to DCE */
+ uuid[8] = (uuid[8] & 0x3F) | 0x80;
+}
+EXPORT_SYMBOL(generate_random_uuid);
+
static void __uuid_gen_common(__u8 b[16])
{
prandom_bytes(b, 16);
@@ -45,3 +68,61 @@ void uuid_be_gen(uuid_be *bu)
bu->b[6] = (bu->b[6] & 0x0F) | 0x40;
}
EXPORT_SYMBOL_GPL(uuid_be_gen);
+
+/**
+ * uuid_is_valid - checks if a UUID string is valid
+ * @uuid: UUID string to check
+ *
+ * Description:
+ * It checks whether the UUID string follows the format:
+ * xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
+ * where x is a hex digit.
+ *
+ * Return: true if the input is a valid UUID string.
+ */
+bool uuid_is_valid(const char *uuid)
+{
+ unsigned int i;
+
+ for (i = 0; i < UUID_STRING_LEN; i++) {
+ if (i == 8 || i == 13 || i == 18 || i == 23) {
+ if (uuid[i] != '-')
+ return false;
+ } else if (!isxdigit(uuid[i])) {
+ return false;
+ }
+ }
+
+ return true;
+}
+EXPORT_SYMBOL(uuid_is_valid);
+
+static int __uuid_to_bin(const char *uuid, __u8 b[16], const u8 ei[16])
+{
+ static const u8 si[16] = {0,2,4,6,9,11,14,16,19,21,24,26,28,30,32,34};
+ unsigned int i;
+
+ if (!uuid_is_valid(uuid))
+ return -EINVAL;
+
+ for (i = 0; i < 16; i++) {
+ int hi = hex_to_bin(uuid[si[i] + 0]);
+ int lo = hex_to_bin(uuid[si[i] + 1]);
+
+ b[ei[i]] = (hi << 4) | lo;
+ }
+
+ return 0;
+}
+
+int uuid_le_to_bin(const char *uuid, uuid_le *u)
+{
+ return __uuid_to_bin(uuid, u->b, uuid_le_index);
+}
+EXPORT_SYMBOL(uuid_le_to_bin);
+
+int uuid_be_to_bin(const char *uuid, uuid_be *u)
+{
+ return __uuid_to_bin(uuid, u->b, uuid_be_index);
+}
+EXPORT_SYMBOL(uuid_be_to_bin);
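
For reference, a standalone model of the parser added above: si[] gives the string position of each byte's first hex digit (the +0/+1 offsets select that digit and the one after it, skipping the dashes), and ei[] gives the destination byte order (identity for the big-endian layout, field-wise byte swap for uuid_le). hex_to_bin() and the sample UUID below are local to this sketch:

/*
 * Standalone model of __uuid_to_bin() for the big-endian index table.
 */
#include <stdio.h>
#include <ctype.h>

static const unsigned char si[16] = {0,2,4,6,9,11,14,16,19,21,24,26,28,30,32,34};
static const unsigned char be_index[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};

static int hex_to_bin(char c)
{
    if (c >= '0' && c <= '9')
        return c - '0';
    c = (char)tolower((unsigned char)c);
    if (c >= 'a' && c <= 'f')
        return c - 'a' + 10;
    return -1;
}

static int uuid_string_to_bin(const char *uuid, unsigned char b[16],
                              const unsigned char ei[16])
{
    for (int i = 0; i < 16; i++) {
        int hi = hex_to_bin(uuid[si[i] + 0]);
        int lo = hex_to_bin(uuid[si[i] + 1]);

        if (hi < 0 || lo < 0)
            return -1;
        b[ei[i]] = (unsigned char)((hi << 4) | lo);
    }
    return 0;
}

int main(void)
{
    unsigned char b[16];

    if (!uuid_string_to_bin("c33f4995-3701-450e-9fbf-206a2e98e576", b, be_index)) {
        for (int i = 0; i < 16; i++)
            printf("%02x", b[i]);
        printf("\n");   /* c33f49953701450e9fbf206a2e98e576 */
    }
    return 0;
}
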
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index ccb664b54280..0967771d8f7f 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -30,6 +30,7 @@
#include <linux/ioport.h>
#include <linux/dcache.h>
#include <linux/cred.h>
+#include <linux/uuid.h>
#include <net/addrconf.h>
#ifdef CONFIG_BLOCK
#include <linux/blkdev.h>
@@ -1304,19 +1305,17 @@ static noinline_for_stack
char *uuid_string(char *buf, char *end, const u8 *addr,
struct printf_spec spec, const char *fmt)
{
- char uuid[sizeof("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")];
+ char uuid[UUID_STRING_LEN + 1];
char *p = uuid;
int i;
- static const u8 be[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
- static const u8 le[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15};
- const u8 *index = be;
+ const u8 *index = uuid_be_index;
bool uc = false;
switch (*(++fmt)) {
case 'L':
uc = true; /* fall-through */
case 'l':
- index = le;
+ index = uuid_le_index;
break;
case 'B':
uc = true;
@@ -1324,7 +1323,10 @@ char *uuid_string(char *buf, char *end, const u8 *addr,
}
for (i = 0; i < 16; i++) {
- p = hex_byte_pack(p, addr[index[i]]);
+ if (uc)
+ p = hex_byte_pack_upper(p, addr[index[i]]);
+ else
+ p = hex_byte_pack(p, addr[index[i]]);
switch (i) {
case 3:
case 5:
@@ -1337,13 +1339,6 @@ char *uuid_string(char *buf, char *end, const u8 *addr,
*p = 0;
- if (uc) {
- p = uuid;
- do {
- *p = toupper(*p);
- } while (*(++p));
- }
-
return string(buf, end, uuid, spec);
}
diff --git a/mm/Kconfig b/mm/Kconfig
index 989f8f3d77e0..3c75ec75560f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -192,6 +192,22 @@ config MEMORY_HOTPLUG_SPARSE
def_bool y
depends on SPARSEMEM && MEMORY_HOTPLUG
+config MEMORY_HOTPLUG_DEFAULT_ONLINE
+ bool "Online the newly added memory blocks by default"
+ default n
+ depends on MEMORY_HOTPLUG
+ help
+ This option sets the default value of the memory hotplug onlining
+ policy (/sys/devices/system/memory/auto_online_blocks), which
+ determines what happens to newly added memory regions. The policy
+ can always be changed at runtime.
+ See Documentation/memory-hotplug.txt for more information.
+
+ Say Y here if you want all hot-plugged memory blocks to appear in
+ 'online' state by default.
+ Say N here if you want the default policy to keep all hot-plugged
+ memory blocks in 'offline' state.
+
config MEMORY_HOTREMOVE
bool "Allow for memory hot remove"
select MEMORY_ISOLATION
@@ -268,11 +284,6 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION
config PHYS_ADDR_T_64BIT
def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
-config ZONE_DMA_FLAG
- int
- default "0" if !ZONE_DMA
- default "1"
-
config BOUNCE
bool "Enable bounce buffers"
default y
@@ -393,6 +404,7 @@ config TRANSPARENT_HUGEPAGE
bool "Transparent Hugepage Support"
depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
select COMPACTION
+ select RADIX_TREE_MULTIORDER
help
Transparent Hugepages allows the kernel to use huge pages and
huge tlb transparently to the applications whenever possible.
@@ -427,6 +439,38 @@ choice
benefit.
endchoice
+choice
+ prompt "Overcommit Mode"
+ default OVERCOMMIT_GUESS
+ depends on EXPERT
+ help
+ Selects the initial value for Overcommit mode.
+
+ NOTE: The overcommit mode can be changed dynamically through sysctl.
+
+ config OVERCOMMIT_GUESS
+ bool "Guess"
+ help
+ Selecting this option forces the initial value of overcommit mode to
+ "Guess" overcommits. This is the default value.
+ See Documentation/vm/overcommit-accounting for more information.
+
+ config OVERCOMMIT_ALWAYS
+ bool "Always"
+ help
+ Selecting this option forces the initial value of overcommit mode to
+ "Always" overcommit.
+ See Documentation/vm/overcommit-accounting for more information.
+
+ config OVERCOMMIT_NEVER
+ bool "Never"
+ help
+ Selecting this option forces the initial value of overcommit mode to
+ "Never" overcommit.
+ See Documentation/vm/overcommit-accounting for more information.
+
+endchoice
+
#
# UP and nommu archs use km based percpu allocator
#
@@ -556,7 +600,7 @@ config ZPOOL
zsmalloc.
config ZBUD
- tristate "Low density storage for compressed pages"
+ tristate "Low (Up to 2x) density storage for compressed pages"
default n
help
A special purpose allocator for storing compressed pages.
@@ -565,6 +609,16 @@ config ZBUD
deterministic reclaim properties that make it preferable to a higher
density approach when reclaim will be used.
+config Z3FOLD
+ tristate "Up to 3x density storage for compressed pages"
+ depends on ZPOOL
+ default n
+ help
+ A special purpose allocator for storing compressed pages.
+ It is designed to store up to three compressed pages per physical
+ page. It is a ZBUD derivative, so it retains ZBUD's simplicity and
+ determinism.
+
config ZSMALLOC
tristate "Memory allocator for compressed pages"
depends on MMU
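
The new OVERCOMMIT_* choice above only selects the boot-time default of the existing vm.overcommit_memory sysctl (0 = guess, 1 = always, 2 = never). A small userspace helper, not part of the patch, that reads the current mode back:

/*
 * Read /proc/sys/vm/overcommit_memory and name the mode.
 */
#include <stdio.h>

int main(void)
{
    static const char * const modes[] = { "guess", "always", "never" };
    FILE *f = fopen("/proc/sys/vm/overcommit_memory", "r");
    int mode = -1;

    if (!f) {
        perror("fopen");
        return 1;
    }
    if (fscanf(f, "%d", &mode) != 1)
        mode = -1;
    fclose(f);

    if (mode >= 0 && mode <= 2)
        printf("overcommit mode %d (%s)\n", mode, modes[mode]);
    else
        printf("unexpected overcommit mode %d\n", mode);
    return 0;
}
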
diff --git a/mm/Makefile b/mm/Makefile
index deb467edca2d..78c6f7dedb83 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -89,6 +89,7 @@ obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
obj-$(CONFIG_ZPOOL) += zpool.o
obj-$(CONFIG_ZBUD) += zbud.o
obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
+obj-$(CONFIG_Z3FOLD) += z3fold.o
obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
obj-$(CONFIG_CMA) += cma.o
obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 0c6317b7db38..ed173b8ae8f2 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -957,9 +957,8 @@ EXPORT_SYMBOL(congestion_wait);
* jiffies for either a BDI to exit congestion of the given @sync queue
* or a write to complete.
*
- * In the absence of zone congestion, a short sleep or a cond_resched is
- * performed to yield the processor and to allow other subsystems to make
- * a forward progress.
+ * In the absence of zone congestion, cond_resched() is called to yield
+ * the processor if necessary but otherwise does not sleep.
*
* The return value is 0 if the sleep is for the full timeout. Otherwise,
* it is the number of jiffies that were still remaining when the function
@@ -979,20 +978,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
*/
if (atomic_read(&nr_wb_congested[sync]) == 0 ||
!test_bit(ZONE_CONGESTED, &zone->flags)) {
-
- /*
- * Memory allocation/reclaim might be called from a WQ
- * context and the current implementation of the WQ
- * concurrency control doesn't recognize that a particular
- * WQ is congested if the worker thread is looping without
- * ever sleeping. Therefore we have to do a short sleep
- * here rather than calling cond_resched().
- */
- if (current->flags & PF_WQ_WORKER)
- schedule_timeout_uninterruptible(1);
- else
- cond_resched();
-
+ cond_resched();
/* In case we scheduled, work out time remaining */
ret = timeout - (jiffies - start);
if (ret < 0)
diff --git a/mm/compaction.c b/mm/compaction.c
index 8fa254043801..d8a20fcf8678 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -42,6 +42,11 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
#define CREATE_TRACE_POINTS
#include <trace/events/compaction.h>
+#define block_start_pfn(pfn, order) round_down(pfn, 1UL << (order))
+#define block_end_pfn(pfn, order) ALIGN((pfn) + 1, 1UL << (order))
+#define pageblock_start_pfn(pfn) block_start_pfn(pfn, pageblock_order)
+#define pageblock_end_pfn(pfn) block_end_pfn(pfn, pageblock_order)
+
static unsigned long release_freepages(struct list_head *freelist)
{
struct page *page, *next;
@@ -161,7 +166,7 @@ static void reset_cached_positions(struct zone *zone)
zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
zone->compact_cached_free_pfn =
- round_down(zone_end_pfn(zone) - 1, pageblock_nr_pages);
+ pageblock_start_pfn(zone_end_pfn(zone) - 1);
}
/*
@@ -519,10 +524,10 @@ isolate_freepages_range(struct compact_control *cc,
LIST_HEAD(freelist);
pfn = start_pfn;
- block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
+ block_start_pfn = pageblock_start_pfn(pfn);
if (block_start_pfn < cc->zone->zone_start_pfn)
block_start_pfn = cc->zone->zone_start_pfn;
- block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+ block_end_pfn = pageblock_end_pfn(pfn);
for (; pfn < end_pfn; pfn += isolated,
block_start_pfn = block_end_pfn,
@@ -538,8 +543,8 @@ isolate_freepages_range(struct compact_control *cc,
* scanning range to right one.
*/
if (pfn >= block_end_pfn) {
- block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
- block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+ block_start_pfn = pageblock_start_pfn(pfn);
+ block_end_pfn = pageblock_end_pfn(pfn);
block_end_pfn = min(block_end_pfn, end_pfn);
}
@@ -633,12 +638,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
{
struct zone *zone = cc->zone;
unsigned long nr_scanned = 0, nr_isolated = 0;
- struct list_head *migratelist = &cc->migratepages;
struct lruvec *lruvec;
unsigned long flags = 0;
bool locked = false;
struct page *page = NULL, *valid_page = NULL;
unsigned long start_pfn = low_pfn;
+ bool skip_on_failure = false;
+ unsigned long next_skip_pfn = 0;
/*
* Ensure that there are not too many pages isolated from the LRU
@@ -659,10 +665,37 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (compact_should_abort(cc))
return 0;
+ if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) {
+ skip_on_failure = true;
+ next_skip_pfn = block_end_pfn(low_pfn, cc->order);
+ }
+
/* Time to isolate some pages for migration */
for (; low_pfn < end_pfn; low_pfn++) {
bool is_lru;
+ if (skip_on_failure && low_pfn >= next_skip_pfn) {
+ /*
+ * We have isolated all migration candidates in the
+ * previous order-aligned block, and did not skip it due
+ * to failure. We should migrate the pages now and
+ * hopefully succeed compaction.
+ */
+ if (nr_isolated)
+ break;
+
+ /*
+ * We failed to isolate in the previous order-aligned
+ * block. Set the new boundary to the end of the
+ * current block. Note we can't simply increase
+ * next_skip_pfn by 1 << order, as low_pfn might have
+ * been incremented by a higher number due to skipping
+ * a compound or a high-order buddy page in the
+ * previous loop iteration.
+ */
+ next_skip_pfn = block_end_pfn(low_pfn, cc->order);
+ }
+
/*
* Periodically drop the lock (if held) regardless of its
* contention, to give chance to IRQs. Abort async compaction
@@ -674,7 +707,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
break;
if (!pfn_valid_within(low_pfn))
- continue;
+ goto isolate_fail;
nr_scanned++;
page = pfn_to_page(low_pfn);
@@ -729,11 +762,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (likely(comp_order < MAX_ORDER))
low_pfn += (1UL << comp_order) - 1;
- continue;
+ goto isolate_fail;
}
if (!is_lru)
- continue;
+ goto isolate_fail;
/*
* Migration will fail if an anonymous page is pinned in memory,
@@ -742,7 +775,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
*/
if (!page_mapping(page) &&
page_count(page) > page_mapcount(page))
- continue;
+ goto isolate_fail;
/* If we already hold the lock, we can skip some rechecking */
if (!locked) {
@@ -753,7 +786,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/* Recheck PageLRU and PageCompound under lock */
if (!PageLRU(page))
- continue;
+ goto isolate_fail;
/*
* Page become compound since the non-locked check,
@@ -762,7 +795,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
*/
if (unlikely(PageCompound(page))) {
low_pfn += (1UL << compound_order(page)) - 1;
- continue;
+ goto isolate_fail;
}
}
@@ -770,7 +803,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/* Try isolate the page */
if (__isolate_lru_page(page, isolate_mode) != 0)
- continue;
+ goto isolate_fail;
VM_BUG_ON_PAGE(PageCompound(page), page);
@@ -778,15 +811,55 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
del_page_from_lru_list(page, lruvec, page_lru(page));
isolate_success:
- list_add(&page->lru, migratelist);
+ list_add(&page->lru, &cc->migratepages);
cc->nr_migratepages++;
nr_isolated++;
+ /*
+ * Record where we could have freed pages by migration and not
+ * yet flushed them to the buddy allocator.
+ * - this is the lowest page that was isolated and will likely
+ * then be freed by migration.
+ */
+ if (!cc->last_migrated_pfn)
+ cc->last_migrated_pfn = low_pfn;
+
/* Avoid isolating too much */
if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
++low_pfn;
break;
}
+
+ continue;
+isolate_fail:
+ if (!skip_on_failure)
+ continue;
+
+ /*
+ * We have isolated some pages, but then failed. Release them
+ * instead of migrating, as we cannot form the cc->order buddy
+ * page anyway.
+ */
+ if (nr_isolated) {
+ if (locked) {
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+ locked = false;
+ }
+ acct_isolated(zone, cc);
+ putback_movable_pages(&cc->migratepages);
+ cc->nr_migratepages = 0;
+ cc->last_migrated_pfn = 0;
+ nr_isolated = 0;
+ }
+
+ if (low_pfn < next_skip_pfn) {
+ low_pfn = next_skip_pfn - 1;
+ /*
+ * The check near the loop beginning would have updated
+ * next_skip_pfn too, but this is a bit simpler.
+ */
+ next_skip_pfn += 1UL << cc->order;
+ }
}
/*
@@ -834,10 +907,10 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
/* Scan block by block. First and last block may be incomplete */
pfn = start_pfn;
- block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
+ block_start_pfn = pageblock_start_pfn(pfn);
if (block_start_pfn < cc->zone->zone_start_pfn)
block_start_pfn = cc->zone->zone_start_pfn;
- block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+ block_end_pfn = pageblock_end_pfn(pfn);
for (; pfn < end_pfn; pfn = block_end_pfn,
block_start_pfn = block_end_pfn,
@@ -924,10 +997,10 @@ static void isolate_freepages(struct compact_control *cc)
* is using.
*/
isolate_start_pfn = cc->free_pfn;
- block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
+ block_start_pfn = pageblock_start_pfn(cc->free_pfn);
block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
zone_end_pfn(zone));
- low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
+ low_pfn = pageblock_end_pfn(cc->migrate_pfn);
/*
* Isolate free pages until enough are available to migrate the
@@ -1070,7 +1143,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
unsigned long block_start_pfn;
unsigned long block_end_pfn;
unsigned long low_pfn;
- unsigned long isolate_start_pfn;
struct page *page;
const isolate_mode_t isolate_mode =
(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
@@ -1081,12 +1153,12 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
* initialized by compact_zone()
*/
low_pfn = cc->migrate_pfn;
- block_start_pfn = cc->migrate_pfn & ~(pageblock_nr_pages - 1);
+ block_start_pfn = pageblock_start_pfn(low_pfn);
if (block_start_pfn < zone->zone_start_pfn)
block_start_pfn = zone->zone_start_pfn;
/* Only scan within a pageblock boundary */
- block_end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
+ block_end_pfn = pageblock_end_pfn(low_pfn);
/*
* Iterate over whole pageblocks until we find the first suitable.
@@ -1125,7 +1197,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
continue;
/* Perform the isolation */
- isolate_start_pfn = low_pfn;
low_pfn = isolate_migratepages_block(cc, low_pfn,
block_end_pfn, isolate_mode);
@@ -1135,15 +1206,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
}
/*
- * Record where we could have freed pages by migration and not
- * yet flushed them to buddy allocator.
- * - this is the lowest page that could have been isolated and
- * then freed by migration.
- */
- if (cc->nr_migratepages && !cc->last_migrated_pfn)
- cc->last_migrated_pfn = isolate_start_pfn;
-
- /*
* Either we isolated something and proceed with migration. Or
* we failed and compact_zone should decide if we should
* continue or not.
@@ -1167,7 +1229,7 @@ static inline bool is_via_compact_memory(int order)
return order == -1;
}
-static int __compact_finished(struct zone *zone, struct compact_control *cc,
+static enum compact_result __compact_finished(struct zone *zone, struct compact_control *cc,
const int migratetype)
{
unsigned int order;
@@ -1190,7 +1252,10 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
if (cc->direct_compaction)
zone->compact_blockskip_flush = true;
- return COMPACT_COMPLETE;
+ if (cc->whole_zone)
+ return COMPACT_COMPLETE;
+ else
+ return COMPACT_PARTIAL_SKIPPED;
}
if (is_via_compact_memory(cc->order))
@@ -1230,8 +1295,9 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
return COMPACT_NO_SUITABLE_PAGE;
}
-static int compact_finished(struct zone *zone, struct compact_control *cc,
- const int migratetype)
+static enum compact_result compact_finished(struct zone *zone,
+ struct compact_control *cc,
+ const int migratetype)
{
int ret;
@@ -1250,8 +1316,10 @@ static int compact_finished(struct zone *zone, struct compact_control *cc,
* COMPACT_PARTIAL - If the allocation would succeed without compaction
* COMPACT_CONTINUE - If compaction should run now
*/
-static unsigned long __compaction_suitable(struct zone *zone, int order,
- int alloc_flags, int classzone_idx)
+static enum compact_result __compaction_suitable(struct zone *zone, int order,
+ unsigned int alloc_flags,
+ int classzone_idx,
+ unsigned long wmark_target)
{
int fragindex;
unsigned long watermark;
@@ -1274,7 +1342,8 @@ static unsigned long __compaction_suitable(struct zone *zone, int order,
* allocated and for a short time, the footprint is higher
*/
watermark += (2UL << order);
- if (!zone_watermark_ok(zone, 0, watermark, classzone_idx, alloc_flags))
+ if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx,
+ alloc_flags, wmark_target))
return COMPACT_SKIPPED;
/*
@@ -1295,12 +1364,14 @@ static unsigned long __compaction_suitable(struct zone *zone, int order,
return COMPACT_CONTINUE;
}
-unsigned long compaction_suitable(struct zone *zone, int order,
- int alloc_flags, int classzone_idx)
+enum compact_result compaction_suitable(struct zone *zone, int order,
+ unsigned int alloc_flags,
+ int classzone_idx)
{
- unsigned long ret;
+ enum compact_result ret;
- ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx);
+ ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx,
+ zone_page_state(zone, NR_FREE_PAGES));
trace_mm_compaction_suitable(zone, order, ret);
if (ret == COMPACT_NOT_SUITABLE_ZONE)
ret = COMPACT_SKIPPED;
@@ -1308,9 +1379,42 @@ unsigned long compaction_suitable(struct zone *zone, int order,
return ret;
}
-static int compact_zone(struct zone *zone, struct compact_control *cc)
+bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
+ int alloc_flags)
{
- int ret;
+ struct zone *zone;
+ struct zoneref *z;
+
+ /*
+ * Make sure at least one zone would pass __compaction_suitable if we continue
+ * retrying the reclaim.
+ */
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
+ ac->nodemask) {
+ unsigned long available;
+ enum compact_result compact_result;
+
+ /*
+ * Do not consider all the reclaimable memory because we do not
+ * want to thrash just for a single high order allocation which
+ * is not even guaranteed to appear even if __compaction_suitable
+ * is happy about the watermark check.
+ */
+ available = zone_reclaimable_pages(zone) / order;
+ available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
+ compact_result = __compaction_suitable(zone, order, alloc_flags,
+ ac_classzone_idx(ac), available);
+ if (compact_result != COMPACT_SKIPPED &&
+ compact_result != COMPACT_NOT_SUITABLE_ZONE)
+ return true;
+ }
+
+ return false;
+}
+
+static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc)
+{
+ enum compact_result ret;
unsigned long start_pfn = zone->zone_start_pfn;
unsigned long end_pfn = zone_end_pfn(zone);
const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
@@ -1318,15 +1422,12 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
cc->classzone_idx);
- switch (ret) {
- case COMPACT_PARTIAL:
- case COMPACT_SKIPPED:
- /* Compaction is likely to fail */
+ /* Compaction is likely to fail */
+ if (ret == COMPACT_PARTIAL || ret == COMPACT_SKIPPED)
return ret;
- case COMPACT_CONTINUE:
- /* Fall through to compaction */
- ;
- }
+
+ /* huh, compaction_suitable is returning something unexpected */
+ VM_BUG_ON(ret != COMPACT_CONTINUE);
/*
* Clear pageblock skip if there were failures recently and compaction
@@ -1343,7 +1444,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
cc->free_pfn = zone->compact_cached_free_pfn;
if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
- cc->free_pfn = round_down(end_pfn - 1, pageblock_nr_pages);
+ cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
zone->compact_cached_free_pfn = cc->free_pfn;
}
if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
@@ -1351,6 +1452,10 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
}
+
+ if (cc->migrate_pfn == start_pfn)
+ cc->whole_zone = true;
+
cc->last_migrated_pfn = 0;
trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
@@ -1398,6 +1503,18 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
ret = COMPACT_CONTENDED;
goto out;
}
+ /*
+ * We failed to migrate at least one page in the current
+ * order-aligned block, so skip the rest of it.
+ */
+ if (cc->direct_compaction &&
+ (cc->mode == MIGRATE_ASYNC)) {
+ cc->migrate_pfn = block_end_pfn(
+ cc->migrate_pfn - 1, cc->order);
+ /* Draining pcplists is useless in this case */
+ cc->last_migrated_pfn = 0;
+
+ }
}
check_drain:
@@ -1411,7 +1528,7 @@ check_drain:
if (cc->order > 0 && cc->last_migrated_pfn) {
int cpu;
unsigned long current_block_start =
- cc->migrate_pfn & ~((1UL << cc->order) - 1);
+ block_start_pfn(cc->migrate_pfn, cc->order);
if (cc->last_migrated_pfn < current_block_start) {
cpu = get_cpu();
@@ -1436,7 +1553,7 @@ out:
cc->nr_freepages = 0;
VM_BUG_ON(free_pfn == 0);
/* The cached pfn is always the first in a pageblock */
- free_pfn &= ~(pageblock_nr_pages-1);
+ free_pfn = pageblock_start_pfn(free_pfn);
/*
* Only go back, not forward. The cached pfn might have been
* already reset to zone end in compact_finished()
@@ -1454,11 +1571,11 @@ out:
return ret;
}
-static unsigned long compact_zone_order(struct zone *zone, int order,
+static enum compact_result compact_zone_order(struct zone *zone, int order,
gfp_t gfp_mask, enum migrate_mode mode, int *contended,
- int alloc_flags, int classzone_idx)
+ unsigned int alloc_flags, int classzone_idx)
{
- unsigned long ret;
+ enum compact_result ret;
struct compact_control cc = {
.nr_freepages = 0,
.nr_migratepages = 0,
@@ -1496,15 +1613,15 @@ int sysctl_extfrag_threshold = 500;
*
* This is the main entry point for direct page compaction.
*/
-unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
- int alloc_flags, const struct alloc_context *ac,
- enum migrate_mode mode, int *contended)
+enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
+ unsigned int alloc_flags, const struct alloc_context *ac,
+ enum migrate_mode mode, int *contended)
{
int may_enter_fs = gfp_mask & __GFP_FS;
int may_perform_io = gfp_mask & __GFP_IO;
struct zoneref *z;
struct zone *zone;
- int rc = COMPACT_DEFERRED;
+ enum compact_result rc = COMPACT_SKIPPED;
int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */
*contended = COMPACT_CONTENDED_NONE;
@@ -1518,15 +1635,17 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
/* Compact each zone in the list */
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
ac->nodemask) {
- int status;
+ enum compact_result status;
int zone_contended;
- if (compaction_deferred(zone, order))
+ if (compaction_deferred(zone, order)) {
+ rc = max_t(enum compact_result, COMPACT_DEFERRED, rc);
continue;
+ }
status = compact_zone_order(zone, order, gfp_mask, mode,
&zone_contended, alloc_flags,
- ac->classzone_idx);
+ ac_classzone_idx(ac));
rc = max(status, rc);
/*
* It takes at least one zone that wasn't lock contended
@@ -1536,7 +1655,7 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
/* If a normal allocation would succeed, stop compacting */
if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
- ac->classzone_idx, alloc_flags)) {
+ ac_classzone_idx(ac), alloc_flags)) {
/*
* We think the allocation will succeed in this zone,
* but it is not certain, hence the false. The caller
@@ -1558,7 +1677,8 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
goto break_loop;
}
- if (mode != MIGRATE_ASYNC && status == COMPACT_COMPLETE) {
+ if (mode != MIGRATE_ASYNC && (status == COMPACT_COMPLETE ||
+ status == COMPACT_PARTIAL_SKIPPED)) {
/*
* We think that allocation won't succeed in this zone
* so we defer compaction there. If it ends up
@@ -1593,7 +1713,7 @@ break_loop:
* If at least one zone wasn't deferred or skipped, we report if all
* zones that were tried were lock contended.
*/
- if (rc > COMPACT_SKIPPED && all_zones_contended)
+ if (rc > COMPACT_INACTIVE && all_zones_contended)
*contended = COMPACT_CONTENDED_LOCK;
return rc;
@@ -1805,7 +1925,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
cc.classzone_idx, 0)) {
success = true;
compaction_defer_reset(zone, cc.order, false);
- } else if (status == COMPACT_COMPLETE) {
+ } else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) {
/*
* We use sync migration mode here, so we defer like
* sync direct compaction does.
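
The block_start_pfn()/block_end_pfn() helpers added at the top of compaction.c are plain power-of-two rounding. A standalone userspace sketch of the arithmetic; the macros are re-created locally, and pageblock order 9 is an assumption (2 MB pageblocks with 4 KB pages).

#include <stdio.h>

#define round_down(x, y)		((x) & ~((y) - 1))
#define ALIGN(x, a)			(((x) + (a) - 1) & ~((a) - 1))
#define block_start_pfn(pfn, order)	round_down(pfn, 1UL << (order))
#define block_end_pfn(pfn, order)	ALIGN((pfn) + 1, 1UL << (order))

int main(void)
{
	unsigned long pfn = 1234567UL;
	unsigned long order = 9;	/* assumed pageblock_order */

	printf("start %lu end %lu\n",
	       block_start_pfn(pfn, order),	/* 1234432 */
	       block_end_pfn(pfn, order));	/* 1234944 */
	return 0;
}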
diff --git a/mm/filemap.c b/mm/filemap.c
index 182b21825255..001432a42cdd 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -114,14 +114,11 @@ static void page_cache_tree_delete(struct address_space *mapping,
struct page *page, void *shadow)
{
struct radix_tree_node *node;
- unsigned long index;
- unsigned int offset;
- unsigned int tag;
- void **slot;
VM_BUG_ON(!PageLocked(page));
- __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);
+ node = radix_tree_replace_clear_tags(&mapping->page_tree, page->index,
+ shadow);
if (shadow) {
mapping->nrexceptional++;
@@ -135,23 +132,9 @@ static void page_cache_tree_delete(struct address_space *mapping,
}
mapping->nrpages--;
- if (!node) {
- /* Clear direct pointer tags in root node */
- mapping->page_tree.gfp_mask &= __GFP_BITS_MASK;
- radix_tree_replace_slot(slot, shadow);
+ if (!node)
return;
- }
-
- /* Clear tree tags for the removed page */
- index = page->index;
- offset = index & RADIX_TREE_MAP_MASK;
- for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
- if (test_bit(offset, node->tags[tag]))
- radix_tree_tag_clear(&mapping->page_tree, index, tag);
- }
- /* Delete page, swap shadow entry */
- radix_tree_replace_slot(slot, shadow);
workingset_node_pages_dec(node);
if (shadow)
workingset_node_shadows_inc(node);
@@ -213,7 +196,7 @@ void __delete_from_page_cache(struct page *page, void *shadow)
* some other bad page check should catch it later.
*/
page_mapcount_reset(page);
- atomic_sub(mapcount, &page->_count);
+ page_ref_sub(page, mapcount);
}
}
@@ -713,8 +696,12 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
* The page might have been evicted from cache only
* recently, in which case it should be activated like
* any other repeatedly accessed page.
+ * The exception is pages getting rewritten; evicting other
+ * data from the working set, only to cache data that will
+ * get overwritten with something else, is a waste of memory.
*/
- if (shadow && workingset_refault(shadow)) {
+ if (!(gfp_mask & __GFP_WRITE) &&
+ shadow && workingset_refault(shadow)) {
SetPageActive(page);
workingset_activation(page);
} else
@@ -2574,7 +2561,7 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
pgoff_t index, unsigned flags)
{
struct page *page;
- int fgp_flags = FGP_LOCK|FGP_ACCESSED|FGP_WRITE|FGP_CREAT;
+ int fgp_flags = FGP_LOCK|FGP_WRITE|FGP_CREAT;
if (flags & AOP_FLAG_NOFS)
fgp_flags |= FGP_NOFS;
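
After the filemap.c change above, a refaulting shadow entry only re-activates the page when the allocation is not flagged __GFP_WRITE. A standalone sketch of that decision; the flag value and helper name are illustrative, not the kernel's.

#include <stdbool.h>
#include <stdio.h>

typedef unsigned int gfp_t;
#define __GFP_WRITE 0x1000000u		/* illustrative value, not the kernel's */

/* Mirrors the new condition in add_to_page_cache_lru(): a refault only
 * re-activates the page when the faulting allocation is not a write. */
static bool should_activate_refault(gfp_t gfp_mask, bool has_shadow,
				    bool refaulting)
{
	return !(gfp_mask & __GFP_WRITE) && has_shadow && refaulting;
}

int main(void)
{
	printf("%d %d\n",
	       should_activate_refault(0, true, true),		  /* 1: read fault */
	       should_activate_refault(__GFP_WRITE, true, true)); /* 0: overwrite */
	return 0;
}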
diff --git a/mm/highmem.c b/mm/highmem.c
index 123bcd3ed4f2..50b4ca6787f0 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -112,16 +112,12 @@ EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
unsigned int nr_free_highpages (void)
{
- pg_data_t *pgdat;
+ struct zone *zone;
unsigned int pages = 0;
- for_each_online_pgdat(pgdat) {
- pages += zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
- NR_FREE_PAGES);
- if (zone_movable_is_highmem())
- pages += zone_page_state(
- &pgdat->node_zones[ZONE_MOVABLE],
- NR_FREE_PAGES);
+ for_each_populated_zone(zone) {
+ if (is_highmem(zone))
+ pages += zone_page_state(zone, NR_FREE_PAGES);
}
return pages;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 827343f0da9c..ade829df419f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -30,6 +30,7 @@
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
+#include <linux/swapops.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
@@ -57,7 +58,8 @@ enum scan_result {
SCAN_SWAP_CACHE_PAGE,
SCAN_DEL_PAGE_LRU,
SCAN_ALLOC_HUGE_PAGE_FAIL,
- SCAN_CGROUP_CHARGE_FAIL
+ SCAN_CGROUP_CHARGE_FAIL,
+ SCAN_EXCEED_SWAP_PTE
};
#define CREATE_TRACE_POINTS
@@ -99,6 +101,8 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
* fault.
*/
static unsigned int khugepaged_max_ptes_none __read_mostly;
+static unsigned int khugepaged_max_ptes_swap __read_mostly = HPAGE_PMD_NR/8;
+static unsigned long allocstall;
static int khugepaged(void *none);
static int khugepaged_slab_init(void);
@@ -595,6 +599,33 @@ static struct kobj_attribute khugepaged_max_ptes_none_attr =
__ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
khugepaged_max_ptes_none_store);
+static ssize_t khugepaged_max_ptes_swap_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_max_ptes_swap);
+}
+
+static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ unsigned long max_ptes_swap;
+
+ err = kstrtoul(buf, 10, &max_ptes_swap);
+ if (err || max_ptes_swap > HPAGE_PMD_NR-1)
+ return -EINVAL;
+
+ khugepaged_max_ptes_swap = max_ptes_swap;
+
+ return count;
+}
+
+static struct kobj_attribute khugepaged_max_ptes_swap_attr =
+ __ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show,
+ khugepaged_max_ptes_swap_store);
+
static struct attribute *khugepaged_attr[] = {
&khugepaged_defrag_attr.attr,
&khugepaged_max_ptes_none_attr.attr,
@@ -603,6 +634,7 @@ static struct attribute *khugepaged_attr[] = {
&full_scans_attr.attr,
&scan_sleep_millisecs_attr.attr,
&alloc_sleep_millisecs_attr.attr,
+ &khugepaged_max_ptes_swap_attr.attr,
NULL,
};
@@ -764,10 +796,7 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
{
- pmd_t entry;
- entry = mk_pmd(page, prot);
- entry = pmd_mkhuge(entry);
- return entry;
+ return pmd_mkhuge(mk_pmd(page, prot));
}
static inline struct list_head *page_deferred_list(struct page *page)
@@ -1699,20 +1728,17 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
return 1;
}
-bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
- unsigned long old_addr,
+bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, unsigned long old_end,
pmd_t *old_pmd, pmd_t *new_pmd)
{
spinlock_t *old_ptl, *new_ptl;
pmd_t pmd;
-
struct mm_struct *mm = vma->vm_mm;
if ((old_addr & ~HPAGE_PMD_MASK) ||
(new_addr & ~HPAGE_PMD_MASK) ||
- old_end - old_addr < HPAGE_PMD_SIZE ||
- (new_vma->vm_flags & VM_NOHUGEPAGE))
+ old_end - old_addr < HPAGE_PMD_SIZE)
return false;
/*
@@ -2350,6 +2376,44 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
return !(vma->vm_flags & VM_NO_THP);
}
+/*
+ * Bring missing pages in from swap, to complete THP collapse.
+ * Only done if khugepaged_scan_pmd believes it is worthwhile.
+ *
+ * Called and returns without pte mapped or spinlocks held,
+ * but with mmap_sem held to protect against vma changes.
+ */
+
+static void __collapse_huge_page_swapin(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd)
+{
+ unsigned long _address;
+ pte_t *pte, pteval;
+ int swapped_in = 0, ret = 0;
+
+ pte = pte_offset_map(pmd, address);
+ for (_address = address; _address < address + HPAGE_PMD_NR*PAGE_SIZE;
+ pte++, _address += PAGE_SIZE) {
+ pteval = *pte;
+ if (!is_swap_pte(pteval))
+ continue;
+ swapped_in++;
+ ret = do_swap_page(mm, vma, _address, pte, pmd,
+ FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_RETRY_NOWAIT,
+ pteval);
+ if (ret & VM_FAULT_ERROR) {
+ trace_mm_collapse_huge_page_swapin(mm, swapped_in, 0);
+ return;
+ }
+ /* pte is unmapped now, we need to map it */
+ pte = pte_offset_map(pmd, _address);
+ }
+ pte--;
+ pte_unmap(pte);
+ trace_mm_collapse_huge_page_swapin(mm, swapped_in, 1);
+}
+
static void collapse_huge_page(struct mm_struct *mm,
unsigned long address,
struct page **hpage,
@@ -2362,7 +2426,7 @@ static void collapse_huge_page(struct mm_struct *mm,
struct page *new_page;
spinlock_t *pmd_ptl, *pte_ptl;
int isolated = 0, result = 0;
- unsigned long hstart, hend;
+ unsigned long hstart, hend, swap, curr_allocstall;
struct mem_cgroup *memcg;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
@@ -2417,6 +2481,15 @@ static void collapse_huge_page(struct mm_struct *mm,
goto out;
}
+ swap = get_mm_counter(mm, MM_SWAPENTS);
+ curr_allocstall = sum_vm_event(ALLOCSTALL);
+ /*
+ * Don't perform swapin readahead when the system is under pressure,
+ * to avoid unnecessary resource consumption.
+ */
+ if (allocstall == curr_allocstall && swap != 0)
+ __collapse_huge_page_swapin(mm, vma, address, pmd);
+
anon_vma_lock_write(vma->anon_vma);
pte = pte_offset_map(pmd, address);
@@ -2493,9 +2566,6 @@ static void collapse_huge_page(struct mm_struct *mm,
result = SCAN_SUCCEED;
out_up_write:
up_write(&mm->mmap_sem);
- trace_mm_collapse_huge_page(mm, isolated, result);
- return;
-
out_nolock:
trace_mm_collapse_huge_page(mm, isolated, result);
return;
@@ -2515,7 +2585,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
struct page *page = NULL;
unsigned long _address;
spinlock_t *ptl;
- int node = NUMA_NO_NODE;
+ int node = NUMA_NO_NODE, unmapped = 0;
bool writable = false, referenced = false;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
@@ -2531,6 +2601,14 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, _address += PAGE_SIZE) {
pte_t pteval = *_pte;
+ if (is_swap_pte(pteval)) {
+ if (++unmapped <= khugepaged_max_ptes_swap) {
+ continue;
+ } else {
+ result = SCAN_EXCEED_SWAP_PTE;
+ goto out_unmap;
+ }
+ }
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
if (!userfaultfd_armed(vma) &&
++none_or_zero <= khugepaged_max_ptes_none) {
@@ -2617,7 +2695,7 @@ out_unmap:
}
out:
trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
- none_or_zero, result);
+ none_or_zero, result, unmapped);
return ret;
}
@@ -2804,14 +2882,17 @@ static void khugepaged_wait_work(void)
if (!khugepaged_scan_sleep_millisecs)
return;
+ allocstall = sum_vm_event(ALLOCSTALL);
wait_event_freezable_timeout(khugepaged_wait,
kthread_should_stop(),
msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
return;
}
- if (khugepaged_enabled())
+ if (khugepaged_enabled()) {
+ allocstall = sum_vm_event(ALLOCSTALL);
wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
+ }
}
static int khugepaged(void *none)
@@ -2820,6 +2901,7 @@ static int khugepaged(void *none)
set_freezable();
set_user_nice(current, MAX_NICE);
+ allocstall = sum_vm_event(ALLOCSTALL);
while (!kthread_should_stop()) {
khugepaged_do_scan();
@@ -3030,8 +3112,10 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
return;
/*
- * Caller holds the mmap_sem write mode, so a huge pmd cannot
- * materialize from under us.
+ * Caller holds the mmap_sem write mode or the anon_vma lock,
+ * so a huge pmd cannot materialize from under us (khugepaged
+ * holds both the mmap_sem write mode and the anon_vma lock
+ * write mode).
*/
__split_huge_pmd(vma, pmd, address, freeze);
}
@@ -3114,7 +3198,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail);
/*
- * tail_page->_count is zero and not changing from under us. But
+ * tail_page->_refcount is zero and not changing from under us. But
* get_page_unless_zero() may be running from under us on the
* tail_page. If we used atomic_set() below instead of atomic_inc(), we
* would then run atomic_set() concurrently with
@@ -3341,7 +3425,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
if (mlocked)
lru_add_drain();
- /* Prevent deferred_split_scan() touching ->_count */
+ /* Prevent deferred_split_scan() touching ->_refcount */
spin_lock_irqsave(&pgdata->split_queue_lock, flags);
count = page_count(head);
mapcount = total_mapcount(head);
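
The new max_ptes_swap knob is published next to the other khugepaged attributes (default HPAGE_PMD_NR/8, i.e. 64 on x86-64). A small userspace sketch that reads it; the exact sysfs path is inferred from the existing khugepaged directory and should be treated as an assumption.

#include <stdio.h>

int main(void)
{
	const char *path =
		"/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_swap";
	char buf[32];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("max_ptes_swap = %s", buf);
	fclose(f);
	return 0;
}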
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b14e98129b07..d26162e81fea 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -51,6 +51,7 @@ __initdata LIST_HEAD(huge_boot_pages);
static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
static unsigned long __initdata default_hstate_size;
+static bool __initdata parsed_valid_hugepagesz = true;
/*
* Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
@@ -144,7 +145,8 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
}
}
- if (spool->min_hpages != -1) { /* minimum size accounting */
+ /* minimum size accounting */
+ if (spool->min_hpages != -1 && spool->rsv_hpages) {
if (delta > spool->rsv_hpages) {
/*
* Asking for more reserves than those already taken on
@@ -182,7 +184,8 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
if (spool->max_hpages != -1) /* maximum size accounting */
spool->used_hpages -= delta;
- if (spool->min_hpages != -1) { /* minimum size accounting */
+ /* minimum size accounting */
+ if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
if (spool->rsv_hpages + delta <= spool->min_hpages)
ret = 0;
else
@@ -938,9 +941,7 @@ err:
*/
static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
{
- nid = next_node(nid, *nodes_allowed);
- if (nid == MAX_NUMNODES)
- nid = first_node(*nodes_allowed);
+ nid = next_node_in(nid, *nodes_allowed);
VM_BUG_ON(nid >= MAX_NUMNODES);
return nid;
@@ -1031,8 +1032,8 @@ static int __alloc_gigantic_page(unsigned long start_pfn,
return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
}
-static bool pfn_range_valid_gigantic(unsigned long start_pfn,
- unsigned long nr_pages)
+static bool pfn_range_valid_gigantic(struct zone *z,
+ unsigned long start_pfn, unsigned long nr_pages)
{
unsigned long i, end_pfn = start_pfn + nr_pages;
struct page *page;
@@ -1043,6 +1044,9 @@ static bool pfn_range_valid_gigantic(unsigned long start_pfn,
page = pfn_to_page(i);
+ if (page_zone(page) != z)
+ return false;
+
if (PageReserved(page))
return false;
@@ -1075,7 +1079,7 @@ static struct page *alloc_gigantic_page(int nid, unsigned int order)
pfn = ALIGN(z->zone_start_pfn, nr_pages);
while (zone_spans_last_pfn(z, pfn, nr_pages)) {
- if (pfn_range_valid_gigantic(pfn, nr_pages)) {
+ if (pfn_range_valid_gigantic(z, pfn, nr_pages)) {
/*
* We release the zone lock here because
* alloc_contig_range() will also lock the zone
@@ -2660,6 +2664,11 @@ static int __init hugetlb_init(void)
subsys_initcall(hugetlb_init);
/* Should be called on processing a hugepagesz=... option */
+void __init hugetlb_bad_size(void)
+{
+ parsed_valid_hugepagesz = false;
+}
+
void __init hugetlb_add_hstate(unsigned int order)
{
struct hstate *h;
@@ -2679,8 +2688,8 @@ void __init hugetlb_add_hstate(unsigned int order)
for (i = 0; i < MAX_NUMNODES; ++i)
INIT_LIST_HEAD(&h->hugepage_freelists[i]);
INIT_LIST_HEAD(&h->hugepage_activelist);
- h->next_nid_to_alloc = first_node(node_states[N_MEMORY]);
- h->next_nid_to_free = first_node(node_states[N_MEMORY]);
+ h->next_nid_to_alloc = first_memory_node;
+ h->next_nid_to_free = first_memory_node;
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
huge_page_size(h)/1024);
@@ -2692,11 +2701,17 @@ static int __init hugetlb_nrpages_setup(char *s)
unsigned long *mhp;
static unsigned long *last_mhp;
+ if (!parsed_valid_hugepagesz) {
+ pr_warn("hugepages = %s preceded by "
+ "an unsupported hugepagesz, ignoring\n", s);
+ parsed_valid_hugepagesz = true;
+ return 1;
+ }
/*
* !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
* so this hugepages= parameter goes to the "default hstate".
*/
- if (!hugetlb_max_hstate)
+ else if (!hugetlb_max_hstate)
mhp = &default_hstate_max_huge_pages;
else
mhp = &parsed_hstate->max_huge_pages;
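
The hugetlb.c hunks above replace the open-coded "next_node(), wrap to first_node()" pattern with next_node_in(), introduced elsewhere in this series. A standalone userspace sketch of the wrap-around semantics, using a plain bitmask instead of nodemask_t.

#include <stdio.h>

#define MAX_NODES 8

static int next_bit(int n, unsigned int mask)
{
	for (n = n + 1; n < MAX_NODES; n++)
		if (mask & (1u << n))
			return n;
	return MAX_NODES;		/* like next_node() hitting MAX_NUMNODES */
}

static int next_bit_in(int n, unsigned int mask)
{
	int next = next_bit(n, mask);

	if (next == MAX_NODES)		/* wrap around, like next_node_in() */
		next = next_bit(-1, mask);
	return next;
}

int main(void)
{
	unsigned int online = 0x05;	/* nodes 0 and 2 */

	printf("%d %d\n", next_bit_in(0, online), next_bit_in(2, online));
	/* prints "2 0" */
	return 0;
}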
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index d8fb10de0f14..eec1150125b9 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -67,26 +67,42 @@ static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
return false;
}
+static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
+ struct hugetlb_cgroup *parent_h_cgroup)
+{
+ int idx;
+
+ for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
+ struct page_counter *counter = &h_cgroup->hugepage[idx];
+ struct page_counter *parent = NULL;
+ unsigned long limit;
+ int ret;
+
+ if (parent_h_cgroup)
+ parent = &parent_h_cgroup->hugepage[idx];
+ page_counter_init(counter, parent);
+
+ limit = round_down(PAGE_COUNTER_MAX,
+ 1 << huge_page_order(&hstates[idx]));
+ ret = page_counter_limit(counter, limit);
+ VM_BUG_ON(ret);
+ }
+}
+
static struct cgroup_subsys_state *
hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
struct hugetlb_cgroup *h_cgroup;
- int idx;
h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
if (!h_cgroup)
return ERR_PTR(-ENOMEM);
- if (parent_h_cgroup) {
- for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
- page_counter_init(&h_cgroup->hugepage[idx],
- &parent_h_cgroup->hugepage[idx]);
- } else {
+ if (!parent_h_cgroup)
root_h_cgroup = h_cgroup;
- for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
- page_counter_init(&h_cgroup->hugepage[idx], NULL);
- }
+
+ hugetlb_cgroup_init(h_cgroup, parent_h_cgroup);
return &h_cgroup->css;
}
@@ -285,6 +301,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
return ret;
idx = MEMFILE_IDX(of_cft(of)->private);
+ nr_pages = round_down(nr_pages, 1 << huge_page_order(&hstates[idx]));
switch (MEMFILE_ATTR(of_cft(of)->private)) {
case RES_LIMIT:
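
With the hugetlb_cgroup.c change, both the initial page_counter limit and user-written limits are rounded down to a whole number of huge pages. A quick standalone check of that rounding for an order-9 (2 MB) hstate; round_down() is re-created for the demo.

#include <stdio.h>

#define round_down(x, y)	((x) & ~((unsigned long)(y) - 1))

int main(void)
{
	unsigned long huge_page_order = 9;	/* 512 base pages per huge page */
	unsigned long nr_pages = 1000;		/* requested limit in base pages */

	printf("%lu\n", round_down(nr_pages, 1UL << huge_page_order));
	/* prints 512: only one full huge page fits within the request */
	return 0;
}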
diff --git a/mm/internal.h b/mm/internal.h
index b79abb6721cf..ffaee661007f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -35,6 +35,10 @@
/* Do not use these with a slab allocator */
#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
+extern int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ unsigned int flags, pte_t orig_pte);
+
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
@@ -58,7 +62,7 @@ static inline unsigned long ra_submit(struct file_ra_state *ra,
}
/*
- * Turn a non-refcounted page (->_count == 0) into refcounted with
+ * Turn a non-refcounted page (->_refcount == 0) into refcounted with
* a count of one.
*/
static inline void set_page_refcounted(struct page *page)
@@ -102,13 +106,14 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
struct alloc_context {
struct zonelist *zonelist;
nodemask_t *nodemask;
- struct zone *preferred_zone;
- int classzone_idx;
+ struct zoneref *preferred_zoneref;
int migratetype;
enum zone_type high_zoneidx;
bool spread_dirty_pages;
};
+#define ac_classzone_idx(ac) zonelist_zone_idx(ac->preferred_zoneref)
+
/*
* Locate the struct page for both the matching buddy in our
* pair (buddy1) and the combined O(n+1) page they form (page).
@@ -173,9 +178,10 @@ struct compact_control {
enum migrate_mode mode; /* Async or sync migration mode */
bool ignore_skip_hint; /* Scan blocks even if marked skip */
bool direct_compaction; /* False from kcompactd or /proc/... */
+ bool whole_zone; /* Whole zone has been scanned */
int order; /* order a direct compactor needs */
const gfp_t gfp_mask; /* gfp mask of a direct compactor */
- const int alloc_flags; /* alloc flags of a direct compactor */
+ const unsigned int alloc_flags; /* alloc flags of a direct compactor */
const int classzone_idx; /* zone index of a direct compactor */
struct zone *zone;
int contended; /* Signal need_sched() or lock
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 131daadf40e4..1548749a3d45 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -8,3 +8,4 @@ CFLAGS_REMOVE_kasan.o = -pg
CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
obj-y := kasan.o report.o kasan_init.o
+obj-$(CONFIG_SLAB) += quarantine.o
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 38f1dd79acdb..18b6a2b8d183 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -273,32 +273,48 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
return memory_is_poisoned_n(addr, size);
}
-
-static __always_inline void check_memory_region(unsigned long addr,
- size_t size, bool write)
+static __always_inline void check_memory_region_inline(unsigned long addr,
+ size_t size, bool write,
+ unsigned long ret_ip)
{
if (unlikely(size == 0))
return;
if (unlikely((void *)addr <
kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
- kasan_report(addr, size, write, _RET_IP_);
+ kasan_report(addr, size, write, ret_ip);
return;
}
if (likely(!memory_is_poisoned(addr, size)))
return;
- kasan_report(addr, size, write, _RET_IP_);
+ kasan_report(addr, size, write, ret_ip);
+}
+
+static void check_memory_region(unsigned long addr,
+ size_t size, bool write,
+ unsigned long ret_ip)
+{
+ check_memory_region_inline(addr, size, write, ret_ip);
+}
+
+void kasan_check_read(const void *p, unsigned int size)
+{
+ check_memory_region((unsigned long)p, size, false, _RET_IP_);
}
+EXPORT_SYMBOL(kasan_check_read);
-void __asan_loadN(unsigned long addr, size_t size);
-void __asan_storeN(unsigned long addr, size_t size);
+void kasan_check_write(const void *p, unsigned int size)
+{
+ check_memory_region((unsigned long)p, size, true, _RET_IP_);
+}
+EXPORT_SYMBOL(kasan_check_write);
#undef memset
void *memset(void *addr, int c, size_t len)
{
- __asan_storeN((unsigned long)addr, len);
+ check_memory_region((unsigned long)addr, len, true, _RET_IP_);
return __memset(addr, c, len);
}
@@ -306,8 +322,8 @@ void *memset(void *addr, int c, size_t len)
#undef memmove
void *memmove(void *dest, const void *src, size_t len)
{
- __asan_loadN((unsigned long)src, len);
- __asan_storeN((unsigned long)dest, len);
+ check_memory_region((unsigned long)src, len, false, _RET_IP_);
+ check_memory_region((unsigned long)dest, len, true, _RET_IP_);
return __memmove(dest, src, len);
}
@@ -315,8 +331,8 @@ void *memmove(void *dest, const void *src, size_t len)
#undef memcpy
void *memcpy(void *dest, const void *src, size_t len)
{
- __asan_loadN((unsigned long)src, len);
- __asan_storeN((unsigned long)dest, len);
+ check_memory_region((unsigned long)src, len, false, _RET_IP_);
+ check_memory_region((unsigned long)dest, len, true, _RET_IP_);
return __memcpy(dest, src, len);
}
@@ -388,6 +404,16 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size,
}
#endif
+void kasan_cache_shrink(struct kmem_cache *cache)
+{
+ quarantine_remove_cache(cache);
+}
+
+void kasan_cache_destroy(struct kmem_cache *cache)
+{
+ quarantine_remove_cache(cache);
+}
+
void kasan_poison_slab(struct page *page)
{
kasan_poison_shadow(page_address(page),
@@ -482,7 +508,7 @@ void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags)
kasan_kmalloc(cache, object, cache->object_size, flags);
}
-void kasan_slab_free(struct kmem_cache *cache, void *object)
+void kasan_poison_slab_free(struct kmem_cache *cache, void *object)
{
unsigned long size = cache->object_size;
unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE);
@@ -491,18 +517,43 @@ void kasan_slab_free(struct kmem_cache *cache, void *object)
if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU))
return;
+ kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE);
+}
+
+bool kasan_slab_free(struct kmem_cache *cache, void *object)
+{
#ifdef CONFIG_SLAB
- if (cache->flags & SLAB_KASAN) {
- struct kasan_free_meta *free_info =
- get_free_info(cache, object);
+ /* RCU slabs could be legally used after free within the RCU period */
+ if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU))
+ return false;
+
+ if (likely(cache->flags & SLAB_KASAN)) {
struct kasan_alloc_meta *alloc_info =
get_alloc_info(cache, object);
- alloc_info->state = KASAN_STATE_FREE;
- set_track(&free_info->track, GFP_NOWAIT);
+ struct kasan_free_meta *free_info =
+ get_free_info(cache, object);
+
+ switch (alloc_info->state) {
+ case KASAN_STATE_ALLOC:
+ alloc_info->state = KASAN_STATE_QUARANTINE;
+ quarantine_put(free_info, cache);
+ set_track(&free_info->track, GFP_NOWAIT);
+ kasan_poison_slab_free(cache, object);
+ return true;
+ case KASAN_STATE_QUARANTINE:
+ case KASAN_STATE_FREE:
+ pr_err("Double free");
+ dump_stack();
+ break;
+ default:
+ break;
+ }
}
+ return false;
+#else
+ kasan_poison_slab_free(cache, object);
+ return false;
#endif
-
- kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE);
}
void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size,
@@ -511,6 +562,9 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size,
unsigned long redzone_start;
unsigned long redzone_end;
+ if (flags & __GFP_RECLAIM)
+ quarantine_reduce();
+
if (unlikely(object == NULL))
return;
@@ -541,6 +595,9 @@ void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags)
unsigned long redzone_start;
unsigned long redzone_end;
+ if (flags & __GFP_RECLAIM)
+ quarantine_reduce();
+
if (unlikely(ptr == NULL))
return;
@@ -649,22 +706,22 @@ void __asan_unregister_globals(struct kasan_global *globals, size_t size)
}
EXPORT_SYMBOL(__asan_unregister_globals);
-#define DEFINE_ASAN_LOAD_STORE(size) \
- void __asan_load##size(unsigned long addr) \
- { \
- check_memory_region(addr, size, false); \
- } \
- EXPORT_SYMBOL(__asan_load##size); \
- __alias(__asan_load##size) \
- void __asan_load##size##_noabort(unsigned long); \
- EXPORT_SYMBOL(__asan_load##size##_noabort); \
- void __asan_store##size(unsigned long addr) \
- { \
- check_memory_region(addr, size, true); \
- } \
- EXPORT_SYMBOL(__asan_store##size); \
- __alias(__asan_store##size) \
- void __asan_store##size##_noabort(unsigned long); \
+#define DEFINE_ASAN_LOAD_STORE(size) \
+ void __asan_load##size(unsigned long addr) \
+ { \
+ check_memory_region_inline(addr, size, false, _RET_IP_);\
+ } \
+ EXPORT_SYMBOL(__asan_load##size); \
+ __alias(__asan_load##size) \
+ void __asan_load##size##_noabort(unsigned long); \
+ EXPORT_SYMBOL(__asan_load##size##_noabort); \
+ void __asan_store##size(unsigned long addr) \
+ { \
+ check_memory_region_inline(addr, size, true, _RET_IP_); \
+ } \
+ EXPORT_SYMBOL(__asan_store##size); \
+ __alias(__asan_store##size) \
+ void __asan_store##size##_noabort(unsigned long); \
EXPORT_SYMBOL(__asan_store##size##_noabort)
DEFINE_ASAN_LOAD_STORE(1);
@@ -675,7 +732,7 @@ DEFINE_ASAN_LOAD_STORE(16);
void __asan_loadN(unsigned long addr, size_t size)
{
- check_memory_region(addr, size, false);
+ check_memory_region(addr, size, false, _RET_IP_);
}
EXPORT_SYMBOL(__asan_loadN);
@@ -685,7 +742,7 @@ EXPORT_SYMBOL(__asan_loadN_noabort);
void __asan_storeN(unsigned long addr, size_t size)
{
- check_memory_region(addr, size, true);
+ check_memory_region(addr, size, true, _RET_IP_);
}
EXPORT_SYMBOL(__asan_storeN);
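
kasan_check_read()/kasan_check_write() give code that the compiler does not instrument an explicit way to have KASAN validate an access. A hedged kernel-side sketch of a caller; the helper and the <linux/kasan-checks.h> header name are assumptions, only the two check functions come from the hunk above.

#include <linux/kasan-checks.h>	/* header name is an assumption */
#include <linux/string.h>

/* Hypothetical helper: validate both buffers explicitly before a copy that
 * would not otherwise be instrumented. */
static void example_checked_copy(void *dst, const void *src, unsigned int len)
{
	kasan_check_read(src, len);	/* reports if [src, src+len) is poisoned */
	kasan_check_write(dst, len);	/* likewise for the destination */
	memcpy(dst, src, len);
}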
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 30a2f0ba0e09..7f7ac51d7faf 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -62,6 +62,7 @@ struct kasan_global {
enum kasan_state {
KASAN_STATE_INIT,
KASAN_STATE_ALLOC,
+ KASAN_STATE_QUARANTINE,
KASAN_STATE_FREE
};
@@ -79,9 +80,14 @@ struct kasan_alloc_meta {
u32 reserved;
};
+struct qlist_node {
+ struct qlist_node *next;
+};
struct kasan_free_meta {
- /* Allocator freelist pointer, unused by KASAN. */
- void **freelist;
+ /* This field is used while the object is in the quarantine.
+ * Otherwise it might be used for the allocator freelist.
+ */
+ struct qlist_node quarantine_link;
struct kasan_track track;
};
@@ -105,4 +111,15 @@ static inline bool kasan_report_enabled(void)
void kasan_report(unsigned long addr, size_t size,
bool is_write, unsigned long ip);
+#ifdef CONFIG_SLAB
+void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache);
+void quarantine_reduce(void);
+void quarantine_remove_cache(struct kmem_cache *cache);
+#else
+static inline void quarantine_put(struct kasan_free_meta *info,
+ struct kmem_cache *cache) { }
+static inline void quarantine_reduce(void) { }
+static inline void quarantine_remove_cache(struct kmem_cache *cache) { }
+#endif
+
#endif
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
new file mode 100644
index 000000000000..1e687d707a4a
--- /dev/null
+++ b/mm/kasan/quarantine.c
@@ -0,0 +1,291 @@
+/*
+ * KASAN quarantine.
+ *
+ * Author: Alexander Potapenko <glider@google.com>
+ * Copyright (C) 2016 Google, Inc.
+ *
+ * Based on code by Dmitry Chernenkov.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ */
+
+#include <linux/gfp.h>
+#include <linux/hash.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/percpu.h>
+#include <linux/printk.h>
+#include <linux/shrinker.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include "../slab.h"
+#include "kasan.h"
+
+/* Data structure and operations for quarantine queues. */
+
+/*
+ * Each queue is a singly linked list, which also stores the total size of
+ * the objects inside it.
+ */
+struct qlist_head {
+ struct qlist_node *head;
+ struct qlist_node *tail;
+ size_t bytes;
+};
+
+#define QLIST_INIT { NULL, NULL, 0 }
+
+static bool qlist_empty(struct qlist_head *q)
+{
+ return !q->head;
+}
+
+static void qlist_init(struct qlist_head *q)
+{
+ q->head = q->tail = NULL;
+ q->bytes = 0;
+}
+
+static void qlist_put(struct qlist_head *q, struct qlist_node *qlink,
+ size_t size)
+{
+ if (unlikely(qlist_empty(q)))
+ q->head = qlink;
+ else
+ q->tail->next = qlink;
+ q->tail = qlink;
+ qlink->next = NULL;
+ q->bytes += size;
+}
+
+static void qlist_move_all(struct qlist_head *from, struct qlist_head *to)
+{
+ if (unlikely(qlist_empty(from)))
+ return;
+
+ if (qlist_empty(to)) {
+ *to = *from;
+ qlist_init(from);
+ return;
+ }
+
+ to->tail->next = from->head;
+ to->tail = from->tail;
+ to->bytes += from->bytes;
+
+ qlist_init(from);
+}
+
+static void qlist_move(struct qlist_head *from, struct qlist_node *last,
+ struct qlist_head *to, size_t size)
+{
+ if (unlikely(last == from->tail)) {
+ qlist_move_all(from, to);
+ return;
+ }
+ if (qlist_empty(to))
+ to->head = from->head;
+ else
+ to->tail->next = from->head;
+ to->tail = last;
+ from->head = last->next;
+ last->next = NULL;
+ from->bytes -= size;
+ to->bytes += size;
+}
+
+
+/*
+ * The object quarantine consists of per-cpu queues and a global queue,
+ * guarded by quarantine_lock.
+ */
+static DEFINE_PER_CPU(struct qlist_head, cpu_quarantine);
+
+static struct qlist_head global_quarantine;
+static DEFINE_SPINLOCK(quarantine_lock);
+
+/* Maximum size of the global queue. */
+static unsigned long quarantine_size;
+
+/*
+ * The fraction of physical memory the quarantine is allowed to occupy.
+ * The quarantine does not support a memory shrinker with the SLAB allocator,
+ * so we keep the ratio low to avoid OOM.
+ */
+#define QUARANTINE_FRACTION 32
+
+/*
+ * smp_load_acquire() here pairs with smp_store_release() in
+ * quarantine_reduce().
+ */
+#define QUARANTINE_LOW_SIZE (smp_load_acquire(&quarantine_size) * 3 / 4)
+#define QUARANTINE_PERCPU_SIZE (1 << 20)
+
+static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink)
+{
+ return virt_to_head_page(qlink)->slab_cache;
+}
+
+static void *qlink_to_object(struct qlist_node *qlink, struct kmem_cache *cache)
+{
+ struct kasan_free_meta *free_info =
+ container_of(qlink, struct kasan_free_meta,
+ quarantine_link);
+
+ return ((void *)free_info) - cache->kasan_info.free_meta_offset;
+}
+
+static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache)
+{
+ void *object = qlink_to_object(qlink, cache);
+ struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
+ unsigned long flags;
+
+ local_irq_save(flags);
+ alloc_info->state = KASAN_STATE_FREE;
+ ___cache_free(cache, object, _THIS_IP_);
+ local_irq_restore(flags);
+}
+
+static void qlist_free_all(struct qlist_head *q, struct kmem_cache *cache)
+{
+ struct qlist_node *qlink;
+
+ if (unlikely(qlist_empty(q)))
+ return;
+
+ qlink = q->head;
+ while (qlink) {
+ struct kmem_cache *obj_cache =
+ cache ? cache : qlink_to_cache(qlink);
+ struct qlist_node *next = qlink->next;
+
+ qlink_free(qlink, obj_cache);
+ qlink = next;
+ }
+ qlist_init(q);
+}
+
+void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache)
+{
+ unsigned long flags;
+ struct qlist_head *q;
+ struct qlist_head temp = QLIST_INIT;
+
+ local_irq_save(flags);
+
+ q = this_cpu_ptr(&cpu_quarantine);
+ qlist_put(q, &info->quarantine_link, cache->size);
+ if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE))
+ qlist_move_all(q, &temp);
+
+ local_irq_restore(flags);
+
+ if (unlikely(!qlist_empty(&temp))) {
+ spin_lock_irqsave(&quarantine_lock, flags);
+ qlist_move_all(&temp, &global_quarantine);
+ spin_unlock_irqrestore(&quarantine_lock, flags);
+ }
+}
+
+void quarantine_reduce(void)
+{
+ size_t new_quarantine_size;
+ unsigned long flags;
+ struct qlist_head to_free = QLIST_INIT;
+ size_t size_to_free = 0;
+ struct qlist_node *last;
+
+ /* smp_load_acquire() here pairs with smp_store_release() below. */
+ if (likely(READ_ONCE(global_quarantine.bytes) <=
+ smp_load_acquire(&quarantine_size)))
+ return;
+
+ spin_lock_irqsave(&quarantine_lock, flags);
+
+ /*
+ * Update quarantine size in case of hotplug. Allocate a fraction of
+ * the installed memory to quarantine minus per-cpu queue limits.
+ */
+ new_quarantine_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) /
+ QUARANTINE_FRACTION;
+ new_quarantine_size -= QUARANTINE_PERCPU_SIZE * num_online_cpus();
+ /* Pairs with smp_load_acquire() above and in QUARANTINE_LOW_SIZE. */
+ smp_store_release(&quarantine_size, new_quarantine_size);
+
+ last = global_quarantine.head;
+ while (last) {
+ struct kmem_cache *cache = qlink_to_cache(last);
+
+ size_to_free += cache->size;
+ if (!last->next || size_to_free >
+ global_quarantine.bytes - QUARANTINE_LOW_SIZE)
+ break;
+ last = last->next;
+ }
+ qlist_move(&global_quarantine, last, &to_free, size_to_free);
+
+ spin_unlock_irqrestore(&quarantine_lock, flags);
+
+ qlist_free_all(&to_free, NULL);
+}
+
+static void qlist_move_cache(struct qlist_head *from,
+ struct qlist_head *to,
+ struct kmem_cache *cache)
+{
+ struct qlist_node *prev;
+
+ if (unlikely(qlist_empty(from)))
+ return;
+
+ prev = from->head;
+ while (prev) {
+ struct qlist_node *qlink = prev->next;
+ struct kmem_cache *obj_cache = qlink_to_cache(qlink);
+
+ if (obj_cache == cache) {
+ if (unlikely(from->tail == qlink))
+ from->tail = prev;
+ prev = qlink->next;
+ from->bytes -= cache->size;
+ qlist_put(to, qlink, cache->size);
+ } else
+ prev = prev->next;
+ }
+}
+
+static void per_cpu_remove_cache(void *arg)
+{
+ struct kmem_cache *cache = arg;
+ struct qlist_head to_free = QLIST_INIT;
+ struct qlist_head *q;
+
+ q = this_cpu_ptr(&cpu_quarantine);
+ qlist_move_cache(q, &to_free, cache);
+ qlist_free_all(&to_free, cache);
+}
+
+void quarantine_remove_cache(struct kmem_cache *cache)
+{
+ unsigned long flags;
+ struct qlist_head to_free = QLIST_INIT;
+
+ on_each_cpu(per_cpu_remove_cache, cache, 1);
+
+ spin_lock_irqsave(&quarantine_lock, flags);
+ qlist_move_cache(&global_quarantine, &to_free, cache);
+ spin_unlock_irqrestore(&quarantine_lock, flags);
+
+ qlist_free_all(&to_free, cache);
+}
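
The quarantine's qlist is a singly linked list that also tracks the byte total of the queued objects. A standalone userspace re-creation of qlist_put() and qlist_move_all() that can be compiled to sanity-check the semantics; it mirrors, rather than replaces, the kernel code above.

#include <assert.h>
#include <stddef.h>

struct qlist_node { struct qlist_node *next; };
struct qlist_head { struct qlist_node *head, *tail; size_t bytes; };

#define QLIST_INIT { NULL, NULL, 0 }

static void qlist_put(struct qlist_head *q, struct qlist_node *n, size_t size)
{
	if (!q->head)
		q->head = n;
	else
		q->tail->next = n;
	q->tail = n;
	n->next = NULL;
	q->bytes += size;
}

static void qlist_move_all(struct qlist_head *from, struct qlist_head *to)
{
	if (!from->head)
		return;
	if (!to->head) {
		*to = *from;
	} else {
		to->tail->next = from->head;
		to->tail = from->tail;
		to->bytes += from->bytes;
	}
	from->head = from->tail = NULL;
	from->bytes = 0;
}

int main(void)
{
	struct qlist_head a = QLIST_INIT, b = QLIST_INIT;
	struct qlist_node n1, n2;

	qlist_put(&a, &n1, 16);
	qlist_put(&a, &n2, 32);
	qlist_move_all(&a, &b);
	assert(b.bytes == 48 && b.head == &n1 && b.tail == &n2 && !a.head);
	return 0;
}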
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 60869a5a0124..b3c122ddd454 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -151,6 +151,7 @@ static void object_err(struct kmem_cache *cache, struct page *page,
print_track(&alloc_info->track);
break;
case KASAN_STATE_FREE:
+ case KASAN_STATE_QUARANTINE:
pr_err("Object freed, allocated with size %u bytes\n",
alloc_info->alloc_size);
free_info = get_free_info(cache, object);
diff --git a/mm/memblock.c b/mm/memblock.c
index b570dddb4cb9..ca099159b45a 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -584,6 +584,9 @@ repeat:
nid, flags);
}
+ if (!nr_new)
+ return 0;
+
/*
* If this was the first round, resize array and repeat for actual
* insertions; otherwise, merge and return.
@@ -606,22 +609,14 @@ int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
return memblock_add_range(&memblock.memory, base, size, nid, 0);
}
-static int __init_memblock memblock_add_region(phys_addr_t base,
- phys_addr_t size,
- int nid,
- unsigned long flags)
+int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
{
memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n",
(unsigned long long)base,
(unsigned long long)base + size - 1,
- flags, (void *)_RET_IP_);
+ 0UL, (void *)_RET_IP_);
- return memblock_add_range(&memblock.memory, base, size, nid, flags);
-}
-
-int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
-{
- return memblock_add_region(base, size, MAX_NUMNODES, 0);
+ return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0);
}
/**
@@ -732,22 +727,14 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
return memblock_remove_range(&memblock.reserved, base, size);
}
-static int __init_memblock memblock_reserve_region(phys_addr_t base,
- phys_addr_t size,
- int nid,
- unsigned long flags)
+int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
{
memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n",
(unsigned long long)base,
(unsigned long long)base + size - 1,
- flags, (void *)_RET_IP_);
-
- return memblock_add_range(&memblock.reserved, base, size, nid, flags);
-}
+ 0UL, (void *)_RET_IP_);
-int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
-{
- return memblock_reserve_region(base, size, MAX_NUMNODES, 0);
+ return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0);
}
/**
@@ -840,7 +827,7 @@ void __init_memblock __next_reserved_mem_region(u64 *idx,
{
struct memblock_type *type = &memblock.reserved;
- if (*idx >= 0 && *idx < type->cnt) {
+ if (*idx < type->cnt) {
struct memblock_region *r = &type->regions[*idx];
phys_addr_t base = r->base;
phys_addr_t size = r->size;
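
memblock_add() and memblock_reserve() now feed memblock_add_range() directly instead of going through per-node wrapper functions; their external behaviour is unchanged. A hedged sketch of typical early-boot usage, with placeholder addresses and a hypothetical caller.

#include <linux/init.h>
#include <linux/memblock.h>

/* Placeholder layout: 1 GiB of RAM at 2 GiB, with its first MiB reserved
 * for (hypothetical) firmware data. */
static void __init example_register_memory(void)
{
	memblock_add(0x80000000UL, 0x40000000UL);
	memblock_reserve(0x80000000UL, 0x00100000UL);
}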
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fe787f5c41bd..b3f16ab4b431 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1023,22 +1023,40 @@ out:
* @lru: index of lru list the page is sitting on
* @nr_pages: positive when adding or negative when removing
*
- * This function must be called when a page is added to or removed from an
- * lru list.
+ * This function must be called under lru_lock, just before a page is added
+ * to or just after a page is removed from an lru list (that ordering being
+ * so as to allow it to check that lru_size 0 is consistent with list_empty).
*/
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
int nr_pages)
{
struct mem_cgroup_per_zone *mz;
unsigned long *lru_size;
+ long size;
+ bool empty;
+
+ __update_lru_size(lruvec, lru, nr_pages);
if (mem_cgroup_disabled())
return;
mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
lru_size = mz->lru_size + lru;
- *lru_size += nr_pages;
- VM_BUG_ON((long)(*lru_size) < 0);
+ empty = list_empty(lruvec->lists + lru);
+
+ if (nr_pages < 0)
+ *lru_size += nr_pages;
+
+ size = *lru_size;
+ if (WARN_ONCE(size < 0 || empty != !size,
+ "%s(%p, %d, %d): lru_size %ld but %sempty\n",
+ __func__, lruvec, lru, nr_pages, size, empty ? "" : "not ")) {
+ VM_BUG_ON(1);
+ *lru_size = 0;
+ }
+
+ if (nr_pages > 0)
+ *lru_size += nr_pages;
}
bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
@@ -1257,6 +1275,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
*/
if (fatal_signal_pending(current) || task_will_free_mem(current)) {
mark_oom_victim(current);
+ try_oom_reaper(current);
goto unlock;
}
@@ -1389,14 +1408,11 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
mem_cgroup_may_update_nodemask(memcg);
node = memcg->last_scanned_node;
- node = next_node(node, memcg->scan_nodes);
- if (node == MAX_NUMNODES)
- node = first_node(memcg->scan_nodes);
+ node = next_node_in(node, memcg->scan_nodes);
/*
- * We call this when we hit limit, not when pages are added to LRU.
- * No LRU may hold pages because all pages are UNEVICTABLE or
- * memcg is too small and all pages are not on LRU. In that case,
- * we use curret node.
+ * mem_cgroup_may_update_nodemask might have seen no reclaimable pages
+ * last time it really checked all the LRUs due to rate limiting.
+ * Fallback to the current node in that case for simplicity.
*/
if (unlikely(node == MAX_NUMNODES))
node = numa_node_id();
@@ -2636,8 +2652,7 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)
}
/*
- * Reclaims as many pages from the given memcg as possible and moves
- * the rest to the parent.
+ * Reclaims as many pages from the given memcg as possible.
*
* Caller is responsible for holding css reference for memcg.
*/
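
The next_node_in() conversion above relies on its wrap-around behaviour. A rough sketch of what the helper (introduced elsewhere in this series as a nodemask macro) does, equivalent to the open-coded pattern it replaces:

/* Sketch only: next set node after 'node' in *mask, wrapping around to the
 * first set node; MAX_NUMNODES is returned only for an empty mask. */
static int next_node_in_sketch(int node, const nodemask_t *mask)
{
        int next = next_node(node, *mask);

        if (next >= MAX_NUMNODES)
                next = first_node(*mask);
        return next;
}
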
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ca5acee53b7a..2fcca6b0e005 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -184,8 +184,8 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
struct siginfo si;
int ret;
- pr_err("MCE %#lx: Killing %s:%d due to hardware memory corruption\n",
- pfn, t->comm, t->pid);
+ pr_err("Memory failure: %#lx: Killing %s:%d due to hardware memory corruption\n",
+ pfn, t->comm, t->pid);
si.si_signo = SIGBUS;
si.si_errno = 0;
si.si_addr = (void *)addr;
@@ -208,7 +208,7 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
}
if (ret < 0)
- pr_info("MCE: Error sending signal to %s:%d: %d\n",
+ pr_info("Memory failure: Error sending signal to %s:%d: %d\n",
t->comm, t->pid, ret);
return ret;
}
@@ -289,7 +289,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
} else {
tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
if (!tk) {
- pr_err("MCE: Out of memory while machine check handling\n");
+ pr_err("Memory failure: Out of memory while machine check handling\n");
return;
}
}
@@ -303,7 +303,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
* a SIGKILL because the error is not contained anymore.
*/
if (tk->addr == -EFAULT) {
- pr_info("MCE: Unable to find user space address %lx in %s\n",
+ pr_info("Memory failure: Unable to find user space address %lx in %s\n",
page_to_pfn(p), tsk->comm);
tk->addr_valid = 0;
}
@@ -334,7 +334,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
* signal and then access the memory. Just kill it.
*/
if (fail || tk->addr_valid == 0) {
- pr_err("MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
+ pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
pfn, tk->tsk->comm, tk->tsk->pid);
force_sig(SIGKILL, tk->tsk);
}
@@ -347,7 +347,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
*/
else if (kill_proc(tk->tsk, tk->addr, trapno,
pfn, page, flags) < 0)
- pr_err("MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
+ pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n",
pfn, tk->tsk->comm, tk->tsk->pid);
}
put_task_struct(tk->tsk);
@@ -559,7 +559,7 @@ static int me_kernel(struct page *p, unsigned long pfn)
*/
static int me_unknown(struct page *p, unsigned long pfn)
{
- pr_err("MCE %#lx: Unknown page state\n", pfn);
+ pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
return MF_FAILED;
}
@@ -604,11 +604,12 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
if (mapping->a_ops->error_remove_page) {
err = mapping->a_ops->error_remove_page(mapping, p);
if (err != 0) {
- pr_info("MCE %#lx: Failed to punch page: %d\n",
+ pr_info("Memory failure: %#lx: Failed to punch page: %d\n",
pfn, err);
} else if (page_has_private(p) &&
!try_to_release_page(p, GFP_NOIO)) {
- pr_info("MCE %#lx: failed to release buffers\n", pfn);
+ pr_info("Memory failure: %#lx: failed to release buffers\n",
+ pfn);
} else {
ret = MF_RECOVERED;
}
@@ -620,7 +621,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
if (invalidate_inode_page(p))
ret = MF_RECOVERED;
else
- pr_info("MCE %#lx: Failed to invalidate\n", pfn);
+ pr_info("Memory failure: %#lx: Failed to invalidate\n",
+ pfn);
}
return ret;
}
@@ -833,7 +835,7 @@ static void action_result(unsigned long pfn, enum mf_action_page_type type,
{
trace_memory_failure_event(pfn, type, result);
- pr_err("MCE %#lx: recovery action for %s: %s\n",
+ pr_err("Memory failure: %#lx: recovery action for %s: %s\n",
pfn, action_page_types[type], action_name[result]);
}
@@ -849,7 +851,7 @@ static int page_action(struct page_state *ps, struct page *p,
if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
count--;
if (count != 0) {
- pr_err("MCE %#lx: %s still referenced by %d users\n",
+ pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
pfn, action_page_types[ps->type], count);
result = MF_FAILED;
}
@@ -882,7 +884,7 @@ int get_hwpoison_page(struct page *page)
* tries to touch the "partially handled" page.
*/
if (!PageAnon(head)) {
- pr_err("MCE: %#lx: non anonymous thp\n",
+ pr_err("Memory failure: %#lx: non anonymous thp\n",
page_to_pfn(page));
return 0;
}
@@ -892,7 +894,8 @@ int get_hwpoison_page(struct page *page)
if (head == compound_head(page))
return 1;
- pr_info("MCE: %#lx cannot catch tail\n", page_to_pfn(page));
+ pr_info("Memory failure: %#lx cannot catch tail\n",
+ page_to_pfn(page));
put_page(head);
}
@@ -931,12 +934,13 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
return SWAP_SUCCESS;
if (PageKsm(p)) {
- pr_err("MCE %#lx: can't handle KSM pages.\n", pfn);
+ pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn);
return SWAP_FAIL;
}
if (PageSwapCache(p)) {
- pr_err("MCE %#lx: keeping poisoned page in swap cache\n", pfn);
+ pr_err("Memory failure: %#lx: keeping poisoned page in swap cache\n",
+ pfn);
ttu |= TTU_IGNORE_HWPOISON;
}
@@ -954,7 +958,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
} else {
kill = 0;
ttu |= TTU_IGNORE_HWPOISON;
- pr_info("MCE %#lx: corrupted page was clean: dropped without side effects\n",
+ pr_info("Memory failure: %#lx: corrupted page was clean: dropped without side effects\n",
pfn);
}
}
@@ -972,7 +976,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
ret = try_to_unmap(hpage, ttu);
if (ret != SWAP_SUCCESS)
- pr_err("MCE %#lx: failed to unmap page (mapcount=%d)\n",
+ pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
pfn, page_mapcount(hpage));
/*
@@ -1040,14 +1044,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
panic("Memory failure from trap %d on page %lx", trapno, pfn);
if (!pfn_valid(pfn)) {
- pr_err("MCE %#lx: memory outside kernel control\n", pfn);
+ pr_err("Memory failure: %#lx: memory outside kernel control\n",
+ pfn);
return -ENXIO;
}
p = pfn_to_page(pfn);
orig_head = hpage = compound_head(p);
if (TestSetPageHWPoison(p)) {
- pr_err("MCE %#lx: already hardware poisoned\n", pfn);
+ pr_err("Memory failure: %#lx: already hardware poisoned\n",
+ pfn);
return 0;
}
@@ -1112,9 +1118,11 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
unlock_page(hpage);
if (!PageAnon(hpage))
- pr_err("MCE: %#lx: non anonymous thp\n", pfn);
+ pr_err("Memory failure: %#lx: non anonymous thp\n",
+ pfn);
else
- pr_err("MCE: %#lx: thp split failed\n", pfn);
+ pr_err("Memory failure: %#lx: thp split failed\n",
+ pfn);
if (TestClearPageHWPoison(p))
num_poisoned_pages_sub(nr_pages);
put_hwpoison_page(p);
@@ -1178,7 +1186,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
* unpoison always clear PG_hwpoison inside page lock
*/
if (!PageHWPoison(p)) {
- pr_err("MCE %#lx: just unpoisoned\n", pfn);
+ pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
num_poisoned_pages_sub(nr_pages);
unlock_page(hpage);
put_hwpoison_page(hpage);
@@ -1395,25 +1403,25 @@ int unpoison_memory(unsigned long pfn)
page = compound_head(p);
if (!PageHWPoison(p)) {
- unpoison_pr_info("MCE: Page was already unpoisoned %#lx\n",
+ unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
pfn, &unpoison_rs);
return 0;
}
if (page_count(page) > 1) {
- unpoison_pr_info("MCE: Someone grabs the hwpoison page %#lx\n",
+ unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n",
pfn, &unpoison_rs);
return 0;
}
if (page_mapped(page)) {
- unpoison_pr_info("MCE: Someone maps the hwpoison page %#lx\n",
+ unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
pfn, &unpoison_rs);
return 0;
}
if (page_mapping(page)) {
- unpoison_pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n",
+ unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n",
pfn, &unpoison_rs);
return 0;
}
@@ -1424,7 +1432,7 @@ int unpoison_memory(unsigned long pfn)
* In such case, we yield to memory_failure() and make unpoison fail.
*/
if (!PageHuge(page) && PageTransHuge(page)) {
- unpoison_pr_info("MCE: Memory failure is now running on %#lx\n",
+ unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n",
pfn, &unpoison_rs);
return 0;
}
@@ -1439,13 +1447,13 @@ int unpoison_memory(unsigned long pfn)
* to the end.
*/
if (PageHuge(page)) {
- unpoison_pr_info("MCE: Memory failure is now running on free hugepage %#lx\n",
+ unpoison_pr_info("Unpoison: Memory failure is now running on free hugepage %#lx\n",
pfn, &unpoison_rs);
return 0;
}
if (TestClearPageHWPoison(p))
num_poisoned_pages_dec();
- unpoison_pr_info("MCE: Software-unpoisoned free page %#lx\n",
+ unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n",
pfn, &unpoison_rs);
return 0;
}
@@ -1458,7 +1466,7 @@ int unpoison_memory(unsigned long pfn)
* the free buddy page pool.
*/
if (TestClearPageHWPoison(page)) {
- unpoison_pr_info("MCE: Software-unpoisoned page %#lx\n",
+ unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
pfn, &unpoison_rs);
num_poisoned_pages_sub(nr_pages);
freeit = 1;
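
For context, the patch spells the "Memory failure:" prefix out at every call site. A common alternative in kernel code (not what is done here) is a single pr_fmt() definition near the top of the file, roughly:

/* Illustrative only; would prefix every pr_err()/pr_info() in this file. */
#define pr_fmt(fmt) "Memory failure: " fmt

With such a define placed before the includes, the individual format strings could stay unprefixed.
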
diff --git a/mm/memory.c b/mm/memory.c
index 07493e34ab7e..b762b17aa4c5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1744,6 +1744,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
unsigned long next;
unsigned long end = addr + PAGE_ALIGN(size);
struct mm_struct *mm = vma->vm_mm;
+ unsigned long remap_pfn = pfn;
int err;
/*
@@ -1770,7 +1771,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
vma->vm_pgoff = pfn;
}
- err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
+ err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size));
if (err)
return -EINVAL;
@@ -1789,7 +1790,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
} while (pgd++, addr = next, addr != end);
if (err)
- untrack_pfn(vma, pfn, PAGE_ALIGN(size));
+ untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size));
return err;
}
@@ -2508,7 +2509,7 @@ EXPORT_SYMBOL(unmap_mapping_range);
* We return with the mmap_sem locked or unlocked in the same cases
* as does filemap_fault().
*/
-static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
+int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags, pte_t orig_pte)
{
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index aa34431c3f31..522e3ef49d64 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -78,9 +78,24 @@ static struct {
#define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map)
#define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map)
+#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
bool memhp_auto_online;
+#else
+bool memhp_auto_online = true;
+#endif
EXPORT_SYMBOL_GPL(memhp_auto_online);
+static int __init setup_memhp_default_state(char *str)
+{
+ if (!strcmp(str, "online"))
+ memhp_auto_online = true;
+ else if (!strcmp(str, "offline"))
+ memhp_auto_online = false;
+
+ return 1;
+}
+__setup("memhp_default_state=", setup_memhp_default_state);
+
void get_online_mems(void)
{
might_sleep();
@@ -434,6 +449,25 @@ out_fail:
return -1;
}
+static struct zone * __meminit move_pfn_range(int zone_shift,
+ unsigned long start_pfn, unsigned long end_pfn)
+{
+ struct zone *zone = page_zone(pfn_to_page(start_pfn));
+ int ret = 0;
+
+ if (zone_shift < 0)
+ ret = move_pfn_range_left(zone + zone_shift, zone,
+ start_pfn, end_pfn);
+ else if (zone_shift)
+ ret = move_pfn_range_right(zone, zone + zone_shift,
+ start_pfn, end_pfn);
+
+ if (ret)
+ return NULL;
+
+ return zone + zone_shift;
+}
+
static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
unsigned long end_pfn)
{
@@ -1013,6 +1047,37 @@ static void node_states_set_node(int node, struct memory_notify *arg)
node_set_state(node, N_MEMORY);
}
+int zone_can_shift(unsigned long pfn, unsigned long nr_pages,
+ enum zone_type target)
+{
+ struct zone *zone = page_zone(pfn_to_page(pfn));
+ enum zone_type idx = zone_idx(zone);
+ int i;
+
+ if (idx < target) {
+ /* pages must be at end of current zone */
+ if (pfn + nr_pages != zone_end_pfn(zone))
+ return 0;
+
+ /* no zones in use between current zone and target */
+ for (i = idx + 1; i < target; i++)
+ if (zone_is_initialized(zone - idx + i))
+ return 0;
+ }
+
+ if (target < idx) {
+ /* pages must be at beginning of current zone */
+ if (pfn != zone->zone_start_pfn)
+ return 0;
+
+ /* no zones in use between current zone and target */
+ for (i = target + 1; i < idx; i++)
+ if (zone_is_initialized(zone - idx + i))
+ return 0;
+ }
+
+ return target - idx;
+}
/* Must be protected by mem_hotplug_begin() */
int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
@@ -1024,6 +1089,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
int nid;
int ret;
struct memory_notify arg;
+ int zone_shift = 0;
/*
* This doesn't need a lock to do pfn_to_page().
@@ -1037,19 +1103,14 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
!can_online_high_movable(zone))
return -EINVAL;
- if (online_type == MMOP_ONLINE_KERNEL &&
- zone_idx(zone) == ZONE_MOVABLE) {
- if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages))
- return -EINVAL;
- }
- if (online_type == MMOP_ONLINE_MOVABLE &&
- zone_idx(zone) == ZONE_MOVABLE - 1) {
- if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages))
- return -EINVAL;
- }
+ if (online_type == MMOP_ONLINE_KERNEL)
+ zone_shift = zone_can_shift(pfn, nr_pages, ZONE_NORMAL);
+ else if (online_type == MMOP_ONLINE_MOVABLE)
+ zone_shift = zone_can_shift(pfn, nr_pages, ZONE_MOVABLE);
- /* Previous code may changed the zone of the pfn range */
- zone = page_zone(pfn_to_page(pfn));
+ zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages);
+ if (!zone)
+ return -EINVAL;
arg.start_pfn = pfn;
arg.nr_pages = nr_pages;
@@ -1410,7 +1471,7 @@ static struct page *next_active_pageblock(struct page *page)
}
/* Checks if this range of memory is likely to be hot-removable. */
-int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
+bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
{
struct page *page = pfn_to_page(start_pfn);
struct page *end_page = page + nr_pages;
@@ -1418,12 +1479,12 @@ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
/* Check the starting page of each pageblock within the range */
for (; page < end_page; page = next_active_pageblock(page)) {
if (!is_pageblock_removable_nolock(page))
- return 0;
+ return false;
cond_resched();
}
/* All pageblocks in the memory block are likely to be hot-removable */
- return 1;
+ return true;
}
/*
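
The shift returned by zone_can_shift() is relative: target zone index minus current index, or 0 when the range cannot (or need not) move. A worked illustration of the online path above (zone indices are configuration-dependent and used here only as an example):

/* Example: range sits at the end of ZONE_NORMAL (idx 2) and is onlined
 * movable (ZONE_MOVABLE, idx 3 in this configuration):
 *   zone_shift = zone_can_shift(pfn, nr_pages, ZONE_MOVABLE);   -> +1
 *   zone = move_pfn_range(+1, pfn, pfn + nr_pages);  moves the range right
 * A zero shift leaves the range in its current zone; a NULL return from
 * move_pfn_range() makes online_pages() fail with -EINVAL. */
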
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 36cc01bc950a..297d6854f849 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -97,7 +97,6 @@
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
-#include <linux/random.h>
#include "internal.h"
@@ -347,9 +346,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
BUG();
if (!node_isset(current->il_next, tmp)) {
- current->il_next = next_node(current->il_next, tmp);
- if (current->il_next >= MAX_NUMNODES)
- current->il_next = first_node(tmp);
+ current->il_next = next_node_in(current->il_next, tmp);
if (current->il_next >= MAX_NUMNODES)
current->il_next = numa_node_id();
}
@@ -1709,9 +1706,7 @@ static unsigned interleave_nodes(struct mempolicy *policy)
struct task_struct *me = current;
nid = me->il_next;
- next = next_node(nid, policy->v.nodes);
- if (next >= MAX_NUMNODES)
- next = first_node(policy->v.nodes);
+ next = next_node_in(nid, policy->v.nodes);
if (next < MAX_NUMNODES)
me->il_next = next;
return nid;
@@ -1744,18 +1739,18 @@ unsigned int mempolicy_slab_node(void)
return interleave_nodes(policy);
case MPOL_BIND: {
+ struct zoneref *z;
+
/*
* Follow bind policy behavior and start allocation at the
* first node.
*/
struct zonelist *zonelist;
- struct zone *zone;
enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
zonelist = &NODE_DATA(node)->node_zonelists[0];
- (void)first_zones_zonelist(zonelist, highest_zoneidx,
- &policy->v.nodes,
- &zone);
- return zone ? zone->node : node;
+ z = first_zones_zonelist(zonelist, highest_zoneidx,
+ &policy->v.nodes);
+ return z->zone ? z->zone->node : node;
}
default:
@@ -1763,23 +1758,25 @@ unsigned int mempolicy_slab_node(void)
}
}
-/* Do static interleaving for a VMA with known offset. */
+/*
+ * Do static interleaving for a VMA with known offset @n. Returns the n'th
+ * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
+ * number of present nodes.
+ */
static unsigned offset_il_node(struct mempolicy *pol,
- struct vm_area_struct *vma, unsigned long off)
+ struct vm_area_struct *vma, unsigned long n)
{
unsigned nnodes = nodes_weight(pol->v.nodes);
unsigned target;
- int c;
- int nid = NUMA_NO_NODE;
+ int i;
+ int nid;
if (!nnodes)
return numa_node_id();
- target = (unsigned int)off % nnodes;
- c = 0;
- do {
+ target = (unsigned int)n % nnodes;
+ nid = first_node(pol->v.nodes);
+ for (i = 0; i < target; i++)
nid = next_node(nid, pol->v.nodes);
- c++;
- } while (c <= target);
return nid;
}
@@ -1805,21 +1802,6 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
return interleave_nodes(pol);
}
-/*
- * Return the bit number of a random bit set in the nodemask.
- * (returns NUMA_NO_NODE if nodemask is empty)
- */
-int node_random(const nodemask_t *maskp)
-{
- int w, bit = NUMA_NO_NODE;
-
- w = nodes_weight(*maskp);
- if (w)
- bit = bitmap_ord_to_pos(maskp->bits,
- get_random_int() % w, MAX_NUMNODES);
- return bit;
-}
-
#ifdef CONFIG_HUGETLBFS
/*
* huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
@@ -2284,7 +2266,7 @@ static void sp_free(struct sp_node *n)
int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
{
struct mempolicy *pol;
- struct zone *zone;
+ struct zoneref *z;
int curnid = page_to_nid(page);
unsigned long pgoff;
int thiscpu = raw_smp_processor_id();
@@ -2316,6 +2298,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
break;
case MPOL_BIND:
+
/*
* allows binding to multiple nodes.
* use current page if in policy nodemask,
@@ -2324,11 +2307,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
*/
if (node_isset(curnid, pol->v.nodes))
goto out;
- (void)first_zones_zonelist(
+ z = first_zones_zonelist(
node_zonelist(numa_node_id(), GFP_HIGHUSER),
gfp_zone(GFP_HIGHUSER),
- &pol->v.nodes, &zone);
- polnid = zone->node;
+ &pol->v.nodes);
+ polnid = z->zone->node;
break;
default:
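
The rewritten offset_il_node() walk is easiest to see with concrete numbers (nodemask and offset invented for illustration):

/* Worked example: pol->v.nodes = { 0, 2, 5 }, n = 7
 *   nnodes = 3, target = 7 % 3 = 1
 *   nid = first_node() = 0, then one next_node() step -> 2
 * so VMA offset 7 interleaves onto node 2; offsets 0,3,6,... land on
 * node 0 and offsets 2,5,8,... on node 5. */
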
diff --git a/mm/mempool.c b/mm/mempool.c
index 9b7a14a791cc..9e075f829d0d 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -105,7 +105,7 @@ static inline void poison_element(mempool_t *pool, void *element)
static void kasan_poison_element(mempool_t *pool, void *element)
{
if (pool->alloc == mempool_alloc_slab)
- kasan_slab_free(pool->pool_data, element);
+ kasan_poison_slab_free(pool->pool_data, element);
if (pool->alloc == mempool_kmalloc)
kasan_kfree(element);
if (pool->alloc == mempool_alloc_pages)
diff --git a/mm/migrate.c b/mm/migrate.c
index f9dfb18a4eba..53ab6398e7a2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -332,7 +332,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
newpage->index = page->index;
newpage->mapping = page->mapping;
if (PageSwapBacked(page))
- SetPageSwapBacked(newpage);
+ __SetPageSwapBacked(newpage);
return MIGRATEPAGE_SUCCESS;
}
@@ -378,7 +378,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
newpage->index = page->index;
newpage->mapping = page->mapping;
if (PageSwapBacked(page))
- SetPageSwapBacked(newpage);
+ __SetPageSwapBacked(newpage);
get_page(newpage); /* add cache reference */
if (PageSwapCache(page)) {
@@ -1791,7 +1791,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
/* Prepare a page as a migration target */
__SetPageLocked(new_page);
- SetPageSwapBacked(new_page);
+ __SetPageSwapBacked(new_page);
/* anon mapping, we can simply copy page->mapping to the new page: */
new_page->mapping = page->mapping;
diff --git a/mm/mmap.c b/mm/mmap.c
index bd2e1a533bc1..b9274a0c82c9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -55,10 +55,6 @@
#define arch_mmap_check(addr, len, flags) (0)
#endif
-#ifndef arch_rebalance_pgtables
-#define arch_rebalance_pgtables(addr, len) (addr)
-#endif
-
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN;
const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX;
@@ -70,7 +66,7 @@ const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
#endif
-static bool ignore_rlimit_data = true;
+static bool ignore_rlimit_data;
core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
static void unmap_region(struct mm_struct *mm,
@@ -1911,7 +1907,6 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
if (offset_in_page(addr))
return -EINVAL;
- addr = arch_rebalance_pgtables(addr, len);
error = security_mmap_addr(addr);
return error ? error : addr;
}
@@ -2891,13 +2886,17 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
if (is_data_mapping(flags) &&
mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) {
- if (ignore_rlimit_data)
- pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Will be forbidden soon.\n",
+ /* Workaround for Valgrind */
+ if (rlimit(RLIMIT_DATA) == 0 &&
+ mm->data_vm + npages <= rlimit_max(RLIMIT_DATA) >> PAGE_SHIFT)
+ return true;
+ if (!ignore_rlimit_data) {
+ pr_warn_once("%s (%d): VmData %lu exceeds data ulimit %lu. Update limits or use boot option ignore_rlimit_data.\n",
current->comm, current->pid,
(mm->data_vm + npages) << PAGE_SHIFT,
rlimit(RLIMIT_DATA));
- else
return false;
+ }
}
return true;
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 52687fb4de6f..5652be858e5e 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -52,7 +52,7 @@ static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes)
}
/* Returns the next zone at or below highest_zoneidx in a zonelist */
-struct zoneref *next_zones_zonelist(struct zoneref *z,
+struct zoneref *__next_zones_zonelist(struct zoneref *z,
enum zone_type highest_zoneidx,
nodemask_t *nodes)
{
diff --git a/mm/mremap.c b/mm/mremap.c
index 3fa0a467df66..9dc499977924 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -70,6 +70,22 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
return pmd;
}
+static void take_rmap_locks(struct vm_area_struct *vma)
+{
+ if (vma->vm_file)
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+ if (vma->anon_vma)
+ anon_vma_lock_write(vma->anon_vma);
+}
+
+static void drop_rmap_locks(struct vm_area_struct *vma)
+{
+ if (vma->anon_vma)
+ anon_vma_unlock_write(vma->anon_vma);
+ if (vma->vm_file)
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+}
+
static pte_t move_soft_dirty_pte(pte_t pte)
{
/*
@@ -90,8 +106,6 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
struct vm_area_struct *new_vma, pmd_t *new_pmd,
unsigned long new_addr, bool need_rmap_locks)
{
- struct address_space *mapping = NULL;
- struct anon_vma *anon_vma = NULL;
struct mm_struct *mm = vma->vm_mm;
pte_t *old_pte, *new_pte, pte;
spinlock_t *old_ptl, *new_ptl;
@@ -114,16 +128,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
* serialize access to individual ptes, but only rmap traversal
* order guarantees that we won't miss both the old and new ptes).
*/
- if (need_rmap_locks) {
- if (vma->vm_file) {
- mapping = vma->vm_file->f_mapping;
- i_mmap_lock_write(mapping);
- }
- if (vma->anon_vma) {
- anon_vma = vma->anon_vma;
- anon_vma_lock_write(anon_vma);
- }
- }
+ if (need_rmap_locks)
+ take_rmap_locks(vma);
/*
* We don't have to worry about the ordering of src and dst
@@ -151,10 +157,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
spin_unlock(new_ptl);
pte_unmap(new_pte - 1);
pte_unmap_unlock(old_pte - 1, old_ptl);
- if (anon_vma)
- anon_vma_unlock_write(anon_vma);
- if (mapping)
- i_mmap_unlock_write(mapping);
+ if (need_rmap_locks)
+ drop_rmap_locks(vma);
}
#define LATENCY_LIMIT (64 * PAGE_SIZE)
@@ -193,16 +197,13 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
if (pmd_trans_huge(*old_pmd)) {
if (extent == HPAGE_PMD_SIZE) {
bool moved;
- VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma,
- vma);
/* See comment in move_ptes() */
if (need_rmap_locks)
- anon_vma_lock_write(vma->anon_vma);
- moved = move_huge_pmd(vma, new_vma, old_addr,
- new_addr, old_end,
- old_pmd, new_pmd);
+ take_rmap_locks(vma);
+ moved = move_huge_pmd(vma, old_addr, new_addr,
+ old_end, old_pmd, new_pmd);
if (need_rmap_locks)
- anon_vma_unlock_write(vma->anon_vma);
+ drop_rmap_locks(vma);
if (moved) {
need_flush = true;
continue;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 86349586eacb..c0e37dd1422f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -174,8 +174,13 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
if (!p)
return 0;
+ /*
+ * Do not even consider tasks which are explicitly marked oom
+ * unkillable or have been already oom reaped.
+ */
adj = (long)p->signal->oom_score_adj;
- if (adj == OOM_SCORE_ADJ_MIN) {
+ if (adj == OOM_SCORE_ADJ_MIN ||
+ test_bit(MMF_OOM_REAPED, &p->mm->flags)) {
task_unlock(p);
return 0;
}
@@ -412,6 +417,25 @@ bool oom_killer_disabled __read_mostly;
#define K(x) ((x) << (PAGE_SHIFT-10))
+/*
+ * task->mm can be NULL if the task is the exited group leader. So to
+ * determine whether the task is using a particular mm, we examine all the
+ * task's threads: if one of those is using this mm then this task was also
+ * using it.
+ */
+static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
+{
+ struct task_struct *t;
+
+ for_each_thread(p, t) {
+ struct mm_struct *t_mm = READ_ONCE(t->mm);
+ if (t_mm)
+ return t_mm == mm;
+ }
+ return false;
+}
+
+
#ifdef CONFIG_MMU
/*
* OOM Reaper kernel thread which tries to reap the memory used by the OOM
@@ -422,7 +446,6 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
static struct task_struct *oom_reaper_list;
static DEFINE_SPINLOCK(oom_reaper_lock);
-
static bool __oom_reap_task(struct task_struct *tsk)
{
struct mmu_gather tlb;
@@ -491,16 +514,17 @@ static bool __oom_reap_task(struct task_struct *tsk)
up_read(&mm->mmap_sem);
/*
- * Clear TIF_MEMDIE because the task shouldn't be sitting on a
- * reasonably reclaimable memory anymore. OOM killer can continue
- * by selecting other victim if unmapping hasn't led to any
- * improvements. This also means that selecting this task doesn't
- * make any sense.
+ * This task can be safely ignored because we cannot do much more
+ * to release its memory.
*/
- tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN;
- exit_oom_victim(tsk);
+ set_bit(MMF_OOM_REAPED, &mm->flags);
out:
- mmput(mm);
+ /*
+ * Drop our reference, but make sure the mmput slow path is called from a
+ * different context: we must not risk getting stuck there and putting
+ * the oom_reaper out of the way.
+ */
+ mmput_async(mm);
return ret;
}
@@ -519,6 +543,15 @@ static void oom_reap_task(struct task_struct *tsk)
debug_show_all_locks();
}
+ /*
+ * Clear TIF_MEMDIE because the task should no longer be sitting on
+ * reasonably reclaimable memory, or it is not a good oom victim
+ * candidate right now because its memory can be released neither by
+ * the task itself nor by the oom reaper.
+ */
+ tsk->oom_reaper_list = NULL;
+ exit_oom_victim(tsk);
+
/* Drop a reference taken by wake_oom_reaper */
put_task_struct(tsk);
}
@@ -563,6 +596,53 @@ static void wake_oom_reaper(struct task_struct *tsk)
wake_up(&oom_reaper_wait);
}
+/* Check if we can reap the given task. This has to be called with a stable
+ * tsk->mm.
+ */
+void try_oom_reaper(struct task_struct *tsk)
+{
+ struct mm_struct *mm = tsk->mm;
+ struct task_struct *p;
+
+ if (!mm)
+ return;
+
+ /*
+ * There might be other threads/processes which are either not
+ * dying or even not killable.
+ */
+ if (atomic_read(&mm->mm_users) > 1) {
+ rcu_read_lock();
+ for_each_process(p) {
+ bool exiting;
+
+ if (!process_shares_mm(p, mm))
+ continue;
+ if (same_thread_group(p, tsk))
+ continue;
+ if (fatal_signal_pending(p))
+ continue;
+
+ /*
+ * If the task is exiting, make sure the whole thread group
+ * is exiting and cannot access mm anymore.
+ */
+ spin_lock_irq(&p->sighand->siglock);
+ exiting = signal_group_exit(p->signal);
+ spin_unlock_irq(&p->sighand->siglock);
+ if (exiting)
+ continue;
+
+ /* Give up */
+ rcu_read_unlock();
+ return;
+ }
+ rcu_read_unlock();
+ }
+
+ wake_oom_reaper(tsk);
+}
+
static int __init oom_init(void)
{
oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
@@ -653,24 +733,6 @@ void oom_killer_enable(void)
}
/*
- * task->mm can be NULL if the task is the exited group leader. So to
- * determine whether the task is using a particular mm, we examine all the
- * task's threads: if one of those is using this mm then this task was also
- * using it.
- */
-static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
-{
- struct task_struct *t;
-
- for_each_thread(p, t) {
- struct mm_struct *t_mm = READ_ONCE(t->mm);
- if (t_mm)
- return t_mm == mm;
- }
- return false;
-}
-
-/*
* Must be called while holding a reference to p, which will be released upon
* returning.
*/
@@ -694,6 +756,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
task_lock(p);
if (p->mm && task_will_free_mem(p)) {
mark_oom_victim(p);
+ try_oom_reaper(p);
task_unlock(p);
put_task_struct(p);
return;
@@ -873,10 +936,20 @@ bool out_of_memory(struct oom_control *oc)
if (current->mm &&
(fatal_signal_pending(current) || task_will_free_mem(current))) {
mark_oom_victim(current);
+ try_oom_reaper(current);
return true;
}
/*
+ * The OOM killer does not compensate for IO-less reclaim.
+ * pagefault_out_of_memory lost its gfp context so we have to
+ * make sure exclude 0 mask - all other users should have at least
+ * ___GFP_DIRECT_RECLAIM to get here.
+ */
+ if (oc->gfp_mask && !(oc->gfp_mask & (__GFP_FS|__GFP_NOFAIL)))
+ return true;
+
+ /*
* Check if there were limitations on the allocation (only relevant for
* NUMA) that may require different handling.
*/
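
The gfp-mask bailout added to out_of_memory() above reads densely; spelled out for a few common masks (a sketch, not an exhaustive list):

/* if (oc->gfp_mask && !(oc->gfp_mask & (__GFP_FS|__GFP_NOFAIL))) return true;
 *
 *   GFP_NOFS                 -> bail out early, no victim selection
 *   GFP_NOFS | __GFP_NOFAIL  -> proceed (the allocation cannot fail)
 *   GFP_KERNEL               -> proceed (__GFP_FS is set)
 *   gfp_mask == 0            -> proceed (pagefault_out_of_memory case)  */
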
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index bc5149d5ec38..3b88795ab46e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -296,11 +296,15 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
#ifdef CONFIG_HIGHMEM
int node;
unsigned long x = 0;
+ int i;
for_each_node_state(node, N_HIGH_MEMORY) {
- struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ struct zone *z = &NODE_DATA(node)->node_zones[i];
- x += zone_dirtyable_memory(z);
+ if (is_highmem(z))
+ x += zone_dirtyable_memory(z);
+ }
}
/*
* Unreclaimable memory (kernel memory or anonymous memory
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c1069efcc4d7..477d9382f70d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -352,6 +352,106 @@ static inline bool update_defer_init(pg_data_t *pgdat,
}
#endif
+/* Return a pointer to the bitmap storing bits affecting a block of pages */
+static inline unsigned long *get_pageblock_bitmap(struct page *page,
+ unsigned long pfn)
+{
+#ifdef CONFIG_SPARSEMEM
+ return __pfn_to_section(pfn)->pageblock_flags;
+#else
+ return page_zone(page)->pageblock_flags;
+#endif /* CONFIG_SPARSEMEM */
+}
+
+static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
+{
+#ifdef CONFIG_SPARSEMEM
+ pfn &= (PAGES_PER_SECTION-1);
+ return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
+#else
+ pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
+ return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
+#endif /* CONFIG_SPARSEMEM */
+}
+
+/**
+ * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
+ * @page: The page within the block of interest
+ * @pfn: The target page frame number
+ * @end_bitidx: The last bit of interest to retrieve
+ * @mask: mask of bits that the caller is interested in
+ *
+ * Return: pageblock_bits flags
+ */
+static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page,
+ unsigned long pfn,
+ unsigned long end_bitidx,
+ unsigned long mask)
+{
+ unsigned long *bitmap;
+ unsigned long bitidx, word_bitidx;
+ unsigned long word;
+
+ bitmap = get_pageblock_bitmap(page, pfn);
+ bitidx = pfn_to_bitidx(page, pfn);
+ word_bitidx = bitidx / BITS_PER_LONG;
+ bitidx &= (BITS_PER_LONG-1);
+
+ word = bitmap[word_bitidx];
+ bitidx += end_bitidx;
+ return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
+}
+
+unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
+ unsigned long end_bitidx,
+ unsigned long mask)
+{
+ return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask);
+}
+
+static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
+{
+ return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK);
+}
+
+/**
+ * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
+ * @page: The page within the block of interest
+ * @flags: The flags to set
+ * @pfn: The target page frame number
+ * @end_bitidx: The last bit of interest
+ * @mask: mask of bits that the caller is interested in
+ */
+void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
+ unsigned long pfn,
+ unsigned long end_bitidx,
+ unsigned long mask)
+{
+ unsigned long *bitmap;
+ unsigned long bitidx, word_bitidx;
+ unsigned long old_word, word;
+
+ BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
+
+ bitmap = get_pageblock_bitmap(page, pfn);
+ bitidx = pfn_to_bitidx(page, pfn);
+ word_bitidx = bitidx / BITS_PER_LONG;
+ bitidx &= (BITS_PER_LONG-1);
+
+ VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
+
+ bitidx += end_bitidx;
+ mask <<= (BITS_PER_LONG - bitidx - 1);
+ flags <<= (BITS_PER_LONG - bitidx - 1);
+
+ word = READ_ONCE(bitmap[word_bitidx]);
+ for (;;) {
+ old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
+ if (word == old_word)
+ break;
+ word = old_word;
+ }
+}
void set_pageblock_migratetype(struct page *page, int migratetype)
{
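
The bit arithmetic in __get_pfnblock_flags_mask() packs each pageblock's four bits from the most-significant end of a word. Plugging in example numbers (BITS_PER_LONG = 64; the migratetype read uses end_bitidx = 2 and a three-bit mask):

/* Example: a block whose bits start at in-word offset bitidx = 8.
 *   bitidx + end_bitidx = 10
 *   shift = BITS_PER_LONG - 10 - 1 = 53
 *   result = (word >> 53) & 0x7
 * so this block's three migrate bits live at word bits 55..53, and the
 * cmpxchg loop in set_pfnblock_flags_mask() rewrites only that field. */
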
@@ -513,14 +613,7 @@ static int __init early_debug_pagealloc(char *buf)
{
if (!buf)
return -EINVAL;
-
- if (strcmp(buf, "on") == 0)
- _debug_pagealloc_enabled = true;
-
- if (strcmp(buf, "off") == 0)
- _debug_pagealloc_enabled = false;
-
- return 0;
+ return kstrtobool(buf, &_debug_pagealloc_enabled);
}
early_param("debug_pagealloc", early_debug_pagealloc);
@@ -784,17 +877,42 @@ out:
zone->free_area[order].nr_free++;
}
-static inline int free_pages_check(struct page *page)
+/*
+ * A bad page could be due to a number of fields. Instead of multiple branches,
+ * try and check multiple fields with one check. The caller must do a detailed
+ * check if necessary.
+ */
+static inline bool page_expected_state(struct page *page,
+ unsigned long check_flags)
{
- const char *bad_reason = NULL;
- unsigned long bad_flags = 0;
+ if (unlikely(atomic_read(&page->_mapcount) != -1))
+ return false;
+
+ if (unlikely((unsigned long)page->mapping |
+ page_ref_count(page) |
+#ifdef CONFIG_MEMCG
+ (unsigned long)page->mem_cgroup |
+#endif
+ (page->flags & check_flags)))
+ return false;
+
+ return true;
+}
+
+static void free_pages_check_bad(struct page *page)
+{
+ const char *bad_reason;
+ unsigned long bad_flags;
+
+ bad_reason = NULL;
+ bad_flags = 0;
if (unlikely(atomic_read(&page->_mapcount) != -1))
bad_reason = "nonzero mapcount";
if (unlikely(page->mapping != NULL))
bad_reason = "non-NULL mapping";
if (unlikely(page_ref_count(page) != 0))
- bad_reason = "nonzero _count";
+ bad_reason = "nonzero _refcount";
if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
@@ -803,16 +921,146 @@ static inline int free_pages_check(struct page *page)
if (unlikely(page->mem_cgroup))
bad_reason = "page still charged to cgroup";
#endif
- if (unlikely(bad_reason)) {
- bad_page(page, bad_reason, bad_flags);
- return 1;
+ bad_page(page, bad_reason, bad_flags);
+}
+
+static inline int free_pages_check(struct page *page)
+{
+ if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
+ return 0;
+
+ /* Something has gone sideways, find it */
+ free_pages_check_bad(page);
+ return 1;
+}
+
+static int free_tail_pages_check(struct page *head_page, struct page *page)
+{
+ int ret = 1;
+
+ /*
+ * We rely on page->lru.next never having bit 0 set, unless the page
+ * is PageTail(). Let's make sure that's true even for poisoned ->lru.
+ */
+ BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
+
+ if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
+ ret = 0;
+ goto out;
+ }
+ switch (page - head_page) {
+ case 1:
+ /* the first tail page: ->mapping is compound_mapcount() */
+ if (unlikely(compound_mapcount(page))) {
+ bad_page(page, "nonzero compound_mapcount", 0);
+ goto out;
+ }
+ break;
+ case 2:
+ /*
+ * the second tail page: ->mapping is
+ * page_deferred_list().next -- ignore value.
+ */
+ break;
+ default:
+ if (page->mapping != TAIL_MAPPING) {
+ bad_page(page, "corrupted mapping in tail page", 0);
+ goto out;
+ }
+ break;
+ }
+ if (unlikely(!PageTail(page))) {
+ bad_page(page, "PageTail not set", 0);
+ goto out;
+ }
+ if (unlikely(compound_head(page) != head_page)) {
+ bad_page(page, "compound_head not consistent", 0);
+ goto out;
+ }
+ ret = 0;
+out:
+ page->mapping = NULL;
+ clear_compound_head(page);
+ return ret;
+}
+
+static __always_inline bool free_pages_prepare(struct page *page,
+ unsigned int order, bool check_free)
+{
+ int bad = 0;
+
+ VM_BUG_ON_PAGE(PageTail(page), page);
+
+ trace_mm_page_free(page, order);
+ kmemcheck_free_shadow(page, order);
+ kasan_free_pages(page, order);
+
+ /*
+ * Check tail pages before head page information is cleared to
+ * avoid checking PageCompound for order-0 pages.
+ */
+ if (unlikely(order)) {
+ bool compound = PageCompound(page);
+ int i;
+
+ VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
+
+ for (i = 1; i < (1 << order); i++) {
+ if (compound)
+ bad += free_tail_pages_check(page, page + i);
+ if (unlikely(free_pages_check(page + i))) {
+ bad++;
+ continue;
+ }
+ (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ }
}
+ if (PageAnonHead(page))
+ page->mapping = NULL;
+ if (check_free)
+ bad += free_pages_check(page);
+ if (bad)
+ return false;
+
page_cpupid_reset_last(page);
- if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
- page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
- return 0;
+ page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ reset_page_owner(page, order);
+
+ if (!PageHighMem(page)) {
+ debug_check_no_locks_freed(page_address(page),
+ PAGE_SIZE << order);
+ debug_check_no_obj_freed(page_address(page),
+ PAGE_SIZE << order);
+ }
+ arch_free_page(page, order);
+ kernel_poison_pages(page, 1 << order, 0);
+ kernel_map_pages(page, 1 << order, 0);
+
+ return true;
+}
+
+#ifdef CONFIG_DEBUG_VM
+static inline bool free_pcp_prepare(struct page *page)
+{
+ return free_pages_prepare(page, 0, true);
}
+static inline bool bulkfree_pcp_prepare(struct page *page)
+{
+ return false;
+}
+#else
+static bool free_pcp_prepare(struct page *page)
+{
+ return free_pages_prepare(page, 0, false);
+}
+
+static bool bulkfree_pcp_prepare(struct page *page)
+{
+ return free_pages_check(page);
+}
+#endif /* CONFIG_DEBUG_VM */
+
/*
* Frees a number of pages from the PCP lists
* Assumes all pages on list are in same zone, and of same order.
@@ -829,15 +1077,16 @@ static void free_pcppages_bulk(struct zone *zone, int count,
{
int migratetype = 0;
int batch_free = 0;
- int to_free = count;
unsigned long nr_scanned;
+ bool isolated_pageblocks;
spin_lock(&zone->lock);
+ isolated_pageblocks = has_isolate_pageblock(zone);
nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
if (nr_scanned)
__mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
- while (to_free) {
+ while (count) {
struct page *page;
struct list_head *list;
@@ -857,7 +1106,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
/* This is the only non-empty list. Free them all. */
if (batch_free == MIGRATE_PCPTYPES)
- batch_free = to_free;
+ batch_free = count;
do {
int mt; /* migratetype of the to-be-freed page */
@@ -870,12 +1119,15 @@ static void free_pcppages_bulk(struct zone *zone, int count,
/* MIGRATE_ISOLATE page should not go to pcplists */
VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
/* Pageblock could have been isolated meanwhile */
- if (unlikely(has_isolate_pageblock(zone)))
+ if (unlikely(isolated_pageblocks))
mt = get_pageblock_migratetype(page);
+ if (bulkfree_pcp_prepare(page))
+ continue;
+
__free_one_page(page, page_to_pfn(page), zone, 0, mt);
trace_mm_page_pcpu_drain(page, 0, mt);
- } while (--to_free && --batch_free && !list_empty(list));
+ } while (--count && --batch_free && !list_empty(list));
}
spin_unlock(&zone->lock);
}
@@ -899,56 +1151,6 @@ static void free_one_page(struct zone *zone,
spin_unlock(&zone->lock);
}
-static int free_tail_pages_check(struct page *head_page, struct page *page)
-{
- int ret = 1;
-
- /*
- * We rely page->lru.next never has bit 0 set, unless the page
- * is PageTail(). Let's make sure that's true even for poisoned ->lru.
- */
- BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
-
- if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
- ret = 0;
- goto out;
- }
- switch (page - head_page) {
- case 1:
- /* the first tail page: ->mapping is compound_mapcount() */
- if (unlikely(compound_mapcount(page))) {
- bad_page(page, "nonzero compound_mapcount", 0);
- goto out;
- }
- break;
- case 2:
- /*
- * the second tail page: ->mapping is
- * page_deferred_list().next -- ignore value.
- */
- break;
- default:
- if (page->mapping != TAIL_MAPPING) {
- bad_page(page, "corrupted mapping in tail page", 0);
- goto out;
- }
- break;
- }
- if (unlikely(!PageTail(page))) {
- bad_page(page, "PageTail not set", 0);
- goto out;
- }
- if (unlikely(compound_head(page) != head_page)) {
- bad_page(page, "compound_head not consistent", 0);
- goto out;
- }
- ret = 0;
-out:
- page->mapping = NULL;
- clear_compound_head(page);
- return ret;
-}
-
static void __meminit __init_single_page(struct page *page, unsigned long pfn,
unsigned long zone, int nid)
{
@@ -1022,51 +1224,13 @@ void __meminit reserve_bootmem_region(unsigned long start, unsigned long end)
}
}
-static bool free_pages_prepare(struct page *page, unsigned int order)
-{
- bool compound = PageCompound(page);
- int i, bad = 0;
-
- VM_BUG_ON_PAGE(PageTail(page), page);
- VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
-
- trace_mm_page_free(page, order);
- kmemcheck_free_shadow(page, order);
- kasan_free_pages(page, order);
-
- if (PageAnon(page))
- page->mapping = NULL;
- bad += free_pages_check(page);
- for (i = 1; i < (1 << order); i++) {
- if (compound)
- bad += free_tail_pages_check(page, page + i);
- bad += free_pages_check(page + i);
- }
- if (bad)
- return false;
-
- reset_page_owner(page, order);
-
- if (!PageHighMem(page)) {
- debug_check_no_locks_freed(page_address(page),
- PAGE_SIZE << order);
- debug_check_no_obj_freed(page_address(page),
- PAGE_SIZE << order);
- }
- arch_free_page(page, order);
- kernel_poison_pages(page, 1 << order, 0);
- kernel_map_pages(page, 1 << order, 0);
-
- return true;
-}
-
static void __free_pages_ok(struct page *page, unsigned int order)
{
unsigned long flags;
int migratetype;
unsigned long pfn = page_to_pfn(page);
- if (!free_pages_prepare(page, order))
+ if (!free_pages_prepare(page, order, true))
return;
migratetype = get_pfnblock_migratetype(page, pfn);
@@ -1076,8 +1240,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
local_irq_restore(flags);
}
-static void __init __free_pages_boot_core(struct page *page,
- unsigned long pfn, unsigned int order)
+static void __init __free_pages_boot_core(struct page *page, unsigned int order)
{
unsigned int nr_pages = 1 << order;
struct page *p = page;
@@ -1154,7 +1317,7 @@ void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
{
if (early_page_uninitialised(pfn))
return;
- return __free_pages_boot_core(page, pfn, order);
+ return __free_pages_boot_core(page, order);
}
/*
@@ -1239,12 +1402,12 @@ static void __init deferred_free_range(struct page *page,
if (nr_pages == MAX_ORDER_NR_PAGES &&
(pfn & (MAX_ORDER_NR_PAGES-1)) == 0) {
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- __free_pages_boot_core(page, pfn, MAX_ORDER-1);
+ __free_pages_boot_core(page, MAX_ORDER-1);
return;
}
- for (i = 0; i < nr_pages; i++, page++, pfn++)
- __free_pages_boot_core(page, pfn, 0);
+ for (i = 0; i < nr_pages; i++, page++)
+ __free_pages_boot_core(page, 0);
}
/* Completion tracking for deferred_init_memmap() threads */
@@ -1477,10 +1640,7 @@ static inline void expand(struct zone *zone, struct page *page,
}
}
-/*
- * This page is about to be returned from the page allocator
- */
-static inline int check_new_page(struct page *page)
+static void check_new_page_bad(struct page *page)
{
const char *bad_reason = NULL;
unsigned long bad_flags = 0;
@@ -1503,11 +1663,20 @@ static inline int check_new_page(struct page *page)
if (unlikely(page->mem_cgroup))
bad_reason = "page still charged to cgroup";
#endif
- if (unlikely(bad_reason)) {
- bad_page(page, bad_reason, bad_flags);
- return 1;
- }
- return 0;
+ bad_page(page, bad_reason, bad_flags);
+}
+
+/*
+ * This page is about to be returned from the page allocator
+ */
+static inline int check_new_page(struct page *page)
+{
+ if (likely(page_expected_state(page,
+ PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
+ return 0;
+
+ check_new_page_bad(page);
+ return 1;
}
static inline bool free_pages_prezeroed(bool poisoned)
@@ -1516,16 +1685,48 @@ static inline bool free_pages_prezeroed(bool poisoned)
page_poisoning_enabled() && poisoned;
}
-static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
- int alloc_flags)
+#ifdef CONFIG_DEBUG_VM
+static bool check_pcp_refill(struct page *page)
+{
+ return false;
+}
+
+static bool check_new_pcp(struct page *page)
+{
+ return check_new_page(page);
+}
+#else
+static bool check_pcp_refill(struct page *page)
+{
+ return check_new_page(page);
+}
+static bool check_new_pcp(struct page *page)
+{
+ return false;
+}
+#endif /* CONFIG_DEBUG_VM */
+
+static bool check_new_pages(struct page *page, unsigned int order)
+{
+ int i;
+ for (i = 0; i < (1 << order); i++) {
+ struct page *p = page + i;
+
+ if (unlikely(check_new_page(p)))
+ return true;
+ }
+
+ return false;
+}
+
+static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
+ unsigned int alloc_flags)
{
int i;
bool poisoned = true;
for (i = 0; i < (1 << order); i++) {
struct page *p = page + i;
- if (unlikely(check_new_page(p)))
- return 1;
if (poisoned)
poisoned &= page_is_poisoned(p);
}
@@ -1557,8 +1758,6 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
set_page_pfmemalloc(page);
else
clear_page_pfmemalloc(page);
-
- return 0;
}
/*
@@ -1980,6 +2179,9 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
if (unlikely(page == NULL))
break;
+ if (unlikely(check_pcp_refill(page)))
+ continue;
+
/*
* Split buddy pages returned by expand() are received here
* in physical page order. The page is added to the callers and
@@ -2157,6 +2359,10 @@ void mark_free_pages(struct zone *zone)
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
if (pfn_valid(pfn)) {
page = pfn_to_page(pfn);
+
+ if (page_zone(page) != zone)
+ continue;
+
if (!swsusp_page_is_forbidden(page))
swsusp_unset_page_free(page);
}
@@ -2187,7 +2393,7 @@ void free_hot_cold_page(struct page *page, bool cold)
unsigned long pfn = page_to_pfn(page);
int migratetype;
- if (!free_pages_prepare(page, 0))
+ if (!free_pcp_prepare(page))
return;
migratetype = get_pfnblock_migratetype(page, pfn);
@@ -2343,12 +2549,44 @@ int split_free_page(struct page *page)
}
/*
+ * Update NUMA hit/miss statistics
+ *
+ * Must be called with interrupts disabled.
+ *
+ * When __GFP_OTHER_NODE is set assume the node of the preferred
+ * zone is the local node. This is useful for daemons who allocate
+ * memory on behalf of other processes.
+ */
+static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
+ gfp_t flags)
+{
+#ifdef CONFIG_NUMA
+ int local_nid = numa_node_id();
+ enum zone_stat_item local_stat = NUMA_LOCAL;
+
+ if (unlikely(flags & __GFP_OTHER_NODE)) {
+ local_stat = NUMA_OTHER;
+ local_nid = preferred_zone->node;
+ }
+
+ if (z->node == local_nid) {
+ __inc_zone_state(z, NUMA_HIT);
+ __inc_zone_state(z, local_stat);
+ } else {
+ __inc_zone_state(z, NUMA_MISS);
+ __inc_zone_state(preferred_zone, NUMA_FOREIGN);
+ }
+#endif
+}
+
+/*
* Allocate a page from the given zone. Use pcplists for order-0 allocations.
*/
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
struct zone *zone, unsigned int order,
- gfp_t gfp_flags, int alloc_flags, int migratetype)
+ gfp_t gfp_flags, unsigned int alloc_flags,
+ int migratetype)
{
unsigned long flags;
struct page *page;
@@ -2359,21 +2597,24 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
struct list_head *list;
local_irq_save(flags);
- pcp = &this_cpu_ptr(zone->pageset)->pcp;
- list = &pcp->lists[migratetype];
- if (list_empty(list)) {
- pcp->count += rmqueue_bulk(zone, 0,
- pcp->batch, list,
- migratetype, cold);
- if (unlikely(list_empty(list)))
- goto failed;
- }
+ do {
+ pcp = &this_cpu_ptr(zone->pageset)->pcp;
+ list = &pcp->lists[migratetype];
+ if (list_empty(list)) {
+ pcp->count += rmqueue_bulk(zone, 0,
+ pcp->batch, list,
+ migratetype, cold);
+ if (unlikely(list_empty(list)))
+ goto failed;
+ }
- if (cold)
- page = list_last_entry(list, struct page, lru);
- else
- page = list_first_entry(list, struct page, lru);
+ if (cold)
+ page = list_last_entry(list, struct page, lru);
+ else
+ page = list_first_entry(list, struct page, lru);
+ } while (page && check_new_pcp(page));
+ __dec_zone_state(zone, NR_ALLOC_BATCH);
list_del(&page->lru);
pcp->count--;
} else {
@@ -2384,22 +2625,24 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
spin_lock_irqsave(&zone->lock, flags);
- page = NULL;
- if (alloc_flags & ALLOC_HARDER) {
- page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
- if (page)
- trace_mm_page_alloc_zone_locked(page, order, migratetype);
- }
- if (!page)
- page = __rmqueue(zone, order, migratetype);
+ do {
+ page = NULL;
+ if (alloc_flags & ALLOC_HARDER) {
+ page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
+ if (page)
+ trace_mm_page_alloc_zone_locked(page, order, migratetype);
+ }
+ if (!page)
+ page = __rmqueue(zone, order, migratetype);
+ } while (page && check_new_pages(page, order));
spin_unlock(&zone->lock);
if (!page)
goto failed;
+ __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
__mod_zone_freepage_state(zone, -(1 << order),
get_pcppage_migratetype(page));
}
- __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 &&
!test_bit(ZONE_FAIR_DEPLETED, &zone->flags))
set_bit(ZONE_FAIR_DEPLETED, &zone->flags);
@@ -2500,13 +2743,13 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
* one free page of a suitable size. Checking now avoids taking the zone lock
* to check in the allocation paths if no pages are free.
*/
-static bool __zone_watermark_ok(struct zone *z, unsigned int order,
- unsigned long mark, int classzone_idx, int alloc_flags,
- long free_pages)
+bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
+ int classzone_idx, unsigned int alloc_flags,
+ long free_pages)
{
long min = mark;
int o;
- const int alloc_harder = (alloc_flags & ALLOC_HARDER);
+ const bool alloc_harder = (alloc_flags & ALLOC_HARDER);
/* free_pages may go negative - that's OK */
free_pages -= (1 << order) - 1;
@@ -2569,12 +2812,38 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order,
}
bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
- int classzone_idx, int alloc_flags)
+ int classzone_idx, unsigned int alloc_flags)
{
return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
zone_page_state(z, NR_FREE_PAGES));
}
+static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
+ unsigned long mark, int classzone_idx, unsigned int alloc_flags)
+{
+ long free_pages = zone_page_state(z, NR_FREE_PAGES);
+ long cma_pages = 0;
+
+#ifdef CONFIG_CMA
+ /* If allocation can't use CMA areas don't use free CMA pages */
+ if (!(alloc_flags & ALLOC_CMA))
+ cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
+#endif
+
+ /*
+ * Fast check for order-0 only. If this fails then the reserves
+ * need to be calculated. There is a corner case where the check
+ * passes but only the high-order atomic reserves are free. If
+ * the caller is !atomic then it'll uselessly search the free
+ * list. That corner case is then slower but it is harmless.
+ */
+ if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
+ return true;
+
+ return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+ free_pages);
+}
+
bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
unsigned long mark, int classzone_idx)
{
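
zone_watermark_fast() only short-circuits order-0 requests, after discounting free CMA pages the caller cannot use; anything else still falls through to the full __zone_watermark_ok() check. A numeric illustration (all values invented):

/* free_pages = 10240, free CMA = 2048, allocation without ALLOC_CMA:
 *   usable = 10240 - 2048 = 8192
 *   mark = 4096, lowmem_reserve[classzone_idx] = 1024
 *   8192 > 4096 + 1024  -> order-0 fast path succeeds, no full check.
 * An order-3 request with the same numbers skips the fast path entirely. */
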
@@ -2630,27 +2899,24 @@ static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
const struct alloc_context *ac)
{
- struct zonelist *zonelist = ac->zonelist;
- struct zoneref *z;
- struct page *page = NULL;
+ struct zoneref *z = ac->preferred_zoneref;
struct zone *zone;
- int nr_fair_skipped = 0;
- bool zonelist_rescan;
+ bool fair_skipped = false;
+ bool apply_fair = (alloc_flags & ALLOC_FAIR);
zonelist_scan:
- zonelist_rescan = false;
-
/*
* Scan zonelist, looking for a zone with enough free.
* See also __cpuset_node_allowed() comment in kernel/cpuset.c.
*/
- for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
+ for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
ac->nodemask) {
+ struct page *page;
unsigned long mark;
if (cpusets_enabled() &&
(alloc_flags & ALLOC_CPUSET) &&
- !cpuset_zone_allowed(zone, gfp_mask))
+ !__cpuset_zone_allowed(zone, gfp_mask))
continue;
/*
* Distribute pages in proportion to the individual
@@ -2658,13 +2924,16 @@ zonelist_scan:
* page was allocated in should have no effect on the
* time the page has in memory before being reclaimed.
*/
- if (alloc_flags & ALLOC_FAIR) {
- if (!zone_local(ac->preferred_zone, zone))
- break;
+ if (apply_fair) {
if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
- nr_fair_skipped++;
+ fair_skipped = true;
continue;
}
+ if (!zone_local(ac->preferred_zoneref->zone, zone)) {
+ if (fair_skipped)
+ goto reset_fair;
+ apply_fair = false;
+ }
}
/*
* When allocating a page cache page for writing, we
@@ -2696,8 +2965,8 @@ zonelist_scan:
continue;
mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
- if (!zone_watermark_ok(zone, order, mark,
- ac->classzone_idx, alloc_flags)) {
+ if (!zone_watermark_fast(zone, order, mark,
+ ac_classzone_idx(ac), alloc_flags)) {
int ret;
/* Checked here to keep the fast path fast */
@@ -2706,7 +2975,7 @@ zonelist_scan:
goto try_this_zone;
if (zone_reclaim_mode == 0 ||
- !zone_allows_reclaim(ac->preferred_zone, zone))
+ !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
continue;
ret = zone_reclaim(zone, gfp_mask, order);
@@ -2720,7 +2989,7 @@ zonelist_scan:
default:
/* did we reclaim enough */
if (zone_watermark_ok(zone, order, mark,
- ac->classzone_idx, alloc_flags))
+ ac_classzone_idx(ac), alloc_flags))
goto try_this_zone;
continue;
@@ -2728,11 +2997,10 @@ zonelist_scan:
}
try_this_zone:
- page = buffered_rmqueue(ac->preferred_zone, zone, order,
+ page = buffered_rmqueue(ac->preferred_zoneref->zone, zone, order,
gfp_mask, alloc_flags, ac->migratetype);
if (page) {
- if (prep_new_page(page, order, gfp_mask, alloc_flags))
- goto try_this_zone;
+ prep_new_page(page, order, gfp_mask, alloc_flags);
/*
* If this is a high-order atomic allocation then check
@@ -2753,18 +3021,13 @@ try_this_zone:
* include remote zones now, before entering the slowpath and waking
* kswapd: prefer spilling to a remote zone over swapping locally.
*/
- if (alloc_flags & ALLOC_FAIR) {
- alloc_flags &= ~ALLOC_FAIR;
- if (nr_fair_skipped) {
- zonelist_rescan = true;
- reset_alloc_batches(ac->preferred_zone);
- }
- if (nr_online_nodes > 1)
- zonelist_rescan = true;
- }
-
- if (zonelist_rescan)
+ if (fair_skipped) {
+reset_fair:
+ apply_fair = false;
+ fair_skipped = false;
+ reset_alloc_batches(ac->preferred_zoneref->zone);
goto zonelist_scan;
+ }
return NULL;
}
@@ -2872,22 +3135,18 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
/* The OOM killer does not needlessly kill tasks for lowmem */
if (ac->high_zoneidx < ZONE_NORMAL)
goto out;
- /* The OOM killer does not compensate for IO-less reclaim */
- if (!(gfp_mask & __GFP_FS)) {
- /*
- * XXX: Page reclaim didn't yield anything,
- * and the OOM killer can't be invoked, but
- * keep looping as per tradition.
- *
- * But do not keep looping if oom_killer_disable()
- * was already called, for the system is trying to
- * enter a quiescent state during suspend.
- */
- *did_some_progress = !oom_killer_disabled;
- goto out;
- }
if (pm_suspended_storage())
goto out;
+ /*
+ * XXX: GFP_NOFS allocations should rather fail than rely on
+	 * other requests to make forward progress.
+ * We are in an unfortunate situation where out_of_memory cannot
+ * do much for this context but let's try it to at least get
+	 * access to memory reserves if the current task is killed (see
+ * out_of_memory). Once filesystems are ready to handle allocation
+ * failures more gracefully we should just bail out here.
+ */
+
/* The OOM killer may not free memory on a specific node */
if (gfp_mask & __GFP_THISNODE)
goto out;
@@ -2913,34 +3172,33 @@ out:
return page;
}
+
+/*
+ * Maximum number of compaction retries with progress before the OOM
+ * killer is considered the only way to move forward.
+ */
+#define MAX_COMPACT_RETRIES 16
+
#ifdef CONFIG_COMPACTION
/* Try memory compaction for high-order allocations before reclaim */
static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
- int alloc_flags, const struct alloc_context *ac,
- enum migrate_mode mode, int *contended_compaction,
- bool *deferred_compaction)
+ unsigned int alloc_flags, const struct alloc_context *ac,
+ enum migrate_mode mode, enum compact_result *compact_result)
{
- unsigned long compact_result;
struct page *page;
+ int contended_compaction;
if (!order)
return NULL;
current->flags |= PF_MEMALLOC;
- compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
- mode, contended_compaction);
+ *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
+ mode, &contended_compaction);
current->flags &= ~PF_MEMALLOC;
- switch (compact_result) {
- case COMPACT_DEFERRED:
- *deferred_compaction = true;
- /* fall-through */
- case COMPACT_SKIPPED:
+ if (*compact_result <= COMPACT_INACTIVE)
return NULL;
- default:
- break;
- }
/*
* At least in one zone compaction wasn't deferred or skipped, so let's
@@ -2966,19 +3224,93 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
*/
count_vm_event(COMPACTFAIL);
+ /*
+ * In all zones where compaction was attempted (and not
+ * deferred or skipped), lock contention has been detected.
+ * For THP allocation we do not want to disrupt the others
+	 * For THP allocation we do not want to disrupt the others
+	 * so we fall back to base pages instead.
+ if (contended_compaction == COMPACT_CONTENDED_LOCK)
+ *compact_result = COMPACT_CONTENDED;
+
+ /*
+ * If compaction was aborted due to need_resched(), we do not
+ * want to further increase allocation latency, unless it is
+ * khugepaged trying to collapse.
+ */
+ if (contended_compaction == COMPACT_CONTENDED_SCHED
+ && !(current->flags & PF_KTHREAD))
+ *compact_result = COMPACT_CONTENDED;
+
cond_resched();
return NULL;
}
+
+static inline bool
+should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
+ enum compact_result compact_result, enum migrate_mode *migrate_mode,
+ int compaction_retries)
+{
+ int max_retries = MAX_COMPACT_RETRIES;
+
+ if (!order)
+ return false;
+
+ /*
+	 * compaction considers all the zones as desperately out of memory
+ * so it doesn't really make much sense to retry except when the
+ * failure could be caused by weak migration mode.
+ */
+ if (compaction_failed(compact_result)) {
+ if (*migrate_mode == MIGRATE_ASYNC) {
+ *migrate_mode = MIGRATE_SYNC_LIGHT;
+ return true;
+ }
+ return false;
+ }
+
+ /*
+ * make sure the compaction wasn't deferred or didn't bail out early
+	 * due to lock contention before we declare that we should give up.
+ * But do not retry if the given zonelist is not suitable for
+ * compaction.
+ */
+ if (compaction_withdrawn(compact_result))
+ return compaction_zonelist_suitable(ac, order, alloc_flags);
+
+ /*
+	 * !costly requests are much more important than __GFP_REPEAT
+	 * costly ones because they are de facto nofail and invoke the OOM
+	 * killer to move on while costly requests can fail and their users
+	 * are ready to cope with that. 1/4 retries is rather arbitrary but we
+ * would need much more detailed feedback from compaction to
+ * make a better decision.
+ */
+ if (order > PAGE_ALLOC_COSTLY_ORDER)
+ max_retries /= 4;
+ if (compaction_retries <= max_retries)
+ return true;
+
+ return false;
+}
#else
static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
- int alloc_flags, const struct alloc_context *ac,
- enum migrate_mode mode, int *contended_compaction,
- bool *deferred_compaction)
+ unsigned int alloc_flags, const struct alloc_context *ac,
+ enum migrate_mode mode, enum compact_result *compact_result)
{
return NULL;
}
+
+static inline bool
+should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
+ enum compact_result compact_result,
+ enum migrate_mode *migrate_mode,
+ int compaction_retries)
+{
+ return false;
+}
#endif /* CONFIG_COMPACTION */
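
For reference, a standalone sketch, outside the patch, of the retry-budget arithmetic used by should_compact_retry(); PAGE_ALLOC_COSTLY_ORDER is assumed to be 3 as in include/linux/mmzone.h, and order-0 requests never reach this path at all:

#include <stdio.h>

#define MAX_COMPACT_RETRIES 16
#define PAGE_ALLOC_COSTLY_ORDER 3	/* assumed value from mmzone.h */

int main(void)
{
	for (int order = 1; order <= 4; order++) {
		int max_retries = MAX_COMPACT_RETRIES;

		/* costly orders only get a quarter of the retry budget */
		if (order > PAGE_ALLOC_COSTLY_ORDER)
			max_retries /= 4;

		printf("order %d: up to %d compaction retries\n", order, max_retries);
	}
	return 0;
}
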
/* Perform direct synchronous page reclaim */
@@ -3013,7 +3345,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
/* The really slow allocator path where we enter direct reclaim */
static inline struct page *
__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
- int alloc_flags, const struct alloc_context *ac,
+ unsigned int alloc_flags, const struct alloc_context *ac,
unsigned long *did_some_progress)
{
struct page *page = NULL;
@@ -3049,13 +3381,13 @@ static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
ac->high_zoneidx, ac->nodemask)
- wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone));
+ wakeup_kswapd(zone, order, ac_classzone_idx(ac));
}
-static inline int
+static inline unsigned int
gfp_to_alloc_flags(gfp_t gfp_mask)
{
- int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
+ unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
/* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
@@ -3110,18 +3442,113 @@ static inline bool is_thp_gfp_mask(gfp_t gfp_mask)
return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE;
}
+/*
+ * Maximum number of reclaim retries without any progress before the OOM
+ * killer is considered the only way to move forward.
+ */
+#define MAX_RECLAIM_RETRIES 16
+
+/*
+ * Checks whether it makes sense to retry the reclaim to make forward progress
+ * for the given allocation request.
+ * The reclaim feedback represented by did_some_progress (any progress during
+ * the last reclaim round) and no_progress_loops (number of reclaim rounds without
+ * any progress in a row) is considered as well as the reclaimable pages on the
+ * applicable zone list (with a backoff mechanism which is a function of
+ * no_progress_loops).
+ *
+ * Returns true if a retry is viable or false to enter the oom path.
+ */
+static inline bool
+should_reclaim_retry(gfp_t gfp_mask, unsigned order,
+ struct alloc_context *ac, int alloc_flags,
+ bool did_some_progress, int no_progress_loops)
+{
+ struct zone *zone;
+ struct zoneref *z;
+
+ /*
+ * Make sure we converge to OOM if we cannot make any progress
+	 * several times in a row.
+ */
+ if (no_progress_loops > MAX_RECLAIM_RETRIES)
+ return false;
+
+ /*
+ * Keep reclaiming pages while there is a chance this will lead somewhere.
+ * If none of the target zones can satisfy our allocation request even
+ * if all reclaimable pages are considered then we are screwed and have
+ * to go OOM.
+ */
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
+ ac->nodemask) {
+ unsigned long available;
+ unsigned long reclaimable;
+
+ available = reclaimable = zone_reclaimable_pages(zone);
+ available -= DIV_ROUND_UP(no_progress_loops * available,
+ MAX_RECLAIM_RETRIES);
+ available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
+
+ /*
+ * Would the allocation succeed if we reclaimed the whole
+ * available?
+ */
+ if (__zone_watermark_ok(zone, order, min_wmark_pages(zone),
+ ac->high_zoneidx, alloc_flags, available)) {
+ /*
+ * If we didn't make any progress and have a lot of
+ * dirty + writeback pages then we should wait for
+ * an IO to complete to slow down the reclaim and
+			 * prevent a premature OOM.
+ */
+ if (!did_some_progress) {
+ unsigned long writeback;
+ unsigned long dirty;
+
+ writeback = zone_page_state_snapshot(zone,
+ NR_WRITEBACK);
+ dirty = zone_page_state_snapshot(zone, NR_FILE_DIRTY);
+
+ if (2*(writeback + dirty) > reclaimable) {
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
+ return true;
+ }
+ }
+
+ /*
+ * Memory allocation/reclaim might be called from a WQ
+ * context and the current implementation of the WQ
+ * concurrency control doesn't recognize that
+ * a particular WQ is congested if the worker thread is
+ * looping without ever sleeping. Therefore we have to
+ * do a short sleep here rather than calling
+ * cond_resched().
+ */
+ if (current->flags & PF_WQ_WORKER)
+ schedule_timeout_uninterruptible(1);
+ else
+ cond_resched();
+
+ return true;
+ }
+ }
+
+ return false;
+}
+
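
A standalone illustration, not from the patch, of the reclaimable-pages backoff computed in should_reclaim_retry() above; the page counts are made up:

#include <stdio.h>

#define MAX_RECLAIM_RETRIES 16
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long reclaimable = 32000;	/* hypothetical zone_reclaimable_pages() */
	unsigned long nr_free = 1500;		/* hypothetical NR_FREE_PAGES snapshot */
	int no_progress_loops = 4;

	unsigned long available = reclaimable;
	/* each unsuccessful loop discounts another 1/16 of the reclaimable pages */
	available -= DIV_ROUND_UP(no_progress_loops * available, MAX_RECLAIM_RETRIES);
	available += nr_free;

	/* 32000 - 8000 + 1500 = 25500 pages fed into __zone_watermark_ok() */
	printf("available = %lu\n", available);
	return 0;
}
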
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac)
{
bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
struct page *page = NULL;
- int alloc_flags;
- unsigned long pages_reclaimed = 0;
+ unsigned int alloc_flags;
unsigned long did_some_progress;
enum migrate_mode migration_mode = MIGRATE_ASYNC;
- bool deferred_compaction = false;
- int contended_compaction = COMPACT_CONTENDED_NONE;
+ enum compact_result compact_result;
+ int compaction_retries = 0;
+ int no_progress_loops = 0;
/*
* In the slowpath, we sanity check order to avoid ever trying to
@@ -3153,17 +3580,6 @@ retry:
*/
alloc_flags = gfp_to_alloc_flags(gfp_mask);
- /*
- * Find the true preferred zone if the allocation is unconstrained by
- * cpusets.
- */
- if (!(alloc_flags & ALLOC_CPUSET) && !ac->nodemask) {
- struct zoneref *preferred_zoneref;
- preferred_zoneref = first_zones_zonelist(ac->zonelist,
- ac->high_zoneidx, NULL, &ac->preferred_zone);
- ac->classzone_idx = zonelist_zone_idx(preferred_zoneref);
- }
-
/* This is the last chance, in general, before the goto nopage. */
page = get_page_from_freelist(gfp_mask, order,
alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
@@ -3219,8 +3635,7 @@ retry:
*/
page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
migration_mode,
- &contended_compaction,
- &deferred_compaction);
+ &compact_result);
if (page)
goto got_pg;
@@ -3233,35 +3648,19 @@ retry:
* to heavily disrupt the system, so we fail the allocation
* instead of entering direct reclaim.
*/
- if (deferred_compaction)
- goto nopage;
-
- /*
- * In all zones where compaction was attempted (and not
- * deferred or skipped), lock contention has been detected.
- * For THP allocation we do not want to disrupt the others
- * so we fallback to base pages instead.
- */
- if (contended_compaction == COMPACT_CONTENDED_LOCK)
+ if (compact_result == COMPACT_DEFERRED)
goto nopage;
/*
- * If compaction was aborted due to need_resched(), we do not
- * want to further increase allocation latency, unless it is
- * khugepaged trying to collapse.
+		 * Compaction is contended so back off rather than cause
+		 * excessive stalls.
*/
- if (contended_compaction == COMPACT_CONTENDED_SCHED
- && !(current->flags & PF_KTHREAD))
+		if (compact_result == COMPACT_CONTENDED)
goto nopage;
}
- /*
- * It can become very expensive to allocate transparent hugepages at
- * fault, so use asynchronous memory compaction for THP unless it is
- * khugepaged trying to collapse.
- */
- if (!is_thp_gfp_mask(gfp_mask) || (current->flags & PF_KTHREAD))
- migration_mode = MIGRATE_SYNC_LIGHT;
+ if (order && compaction_made_progress(compact_result))
+ compaction_retries++;
/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
@@ -3273,14 +3672,38 @@ retry:
if (gfp_mask & __GFP_NORETRY)
goto noretry;
- /* Keep reclaiming pages as long as there is reasonable progress */
- pages_reclaimed += did_some_progress;
- if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) ||
- ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) {
- /* Wait for some write requests to complete then retry */
- wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50);
+ /*
+ * Do not retry costly high order allocations unless they are
+ * __GFP_REPEAT
+ */
+ if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
+ goto noretry;
+
+ /*
+	 * Costly allocations might have made progress but this doesn't mean
+	 * their order will become available due to high fragmentation so
+	 * always increment the no progress counter for them.
+ */
+ if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
+ no_progress_loops = 0;
+ else
+ no_progress_loops++;
+
+ if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
+ did_some_progress > 0, no_progress_loops))
+ goto retry;
+
+ /*
+	 * It doesn't make any sense to retry compaction if the order-0
+	 * reclaim is not able to make any progress because the current
+	 * implementation of compaction depends on a sufficient amount
+	 * of free memory (see __compaction_suitable).
+ */
+ if (did_some_progress > 0 &&
+ should_compact_retry(ac, order, alloc_flags,
+ compact_result, &migration_mode,
+ compaction_retries))
goto retry;
- }
/* Reclaim has failed us, start killing things */
page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
@@ -3288,19 +3711,28 @@ retry:
goto got_pg;
/* Retry as long as the OOM killer is making progress */
- if (did_some_progress)
+ if (did_some_progress) {
+ no_progress_loops = 0;
goto retry;
+ }
noretry:
/*
- * High-order allocations do not necessarily loop after
- * direct reclaim and reclaim/compaction depends on compaction
- * being called after reclaim so call directly if necessary
+ * High-order allocations do not necessarily loop after direct reclaim
+ * and reclaim/compaction depends on compaction being called after
+ * reclaim so call directly if necessary.
+ * It can become very expensive to allocate transparent hugepages at
+ * fault, so use asynchronous memory compaction for THP unless it is
+ * khugepaged trying to collapse. All other requests should tolerate
+ * at least light sync migration.
*/
+ if (is_thp_gfp_mask(gfp_mask) && !(current->flags & PF_KTHREAD))
+ migration_mode = MIGRATE_ASYNC;
+ else
+ migration_mode = MIGRATE_SYNC_LIGHT;
page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags,
ac, migration_mode,
- &contended_compaction,
- &deferred_compaction);
+ &compact_result);
if (page)
goto got_pg;
nopage:
@@ -3316,17 +3748,24 @@ struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, nodemask_t *nodemask)
{
- struct zoneref *preferred_zoneref;
- struct page *page = NULL;
+ struct page *page;
unsigned int cpuset_mems_cookie;
- int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
- gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
+ unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR;
+ gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
struct alloc_context ac = {
.high_zoneidx = gfp_zone(gfp_mask),
+ .zonelist = zonelist,
.nodemask = nodemask,
.migratetype = gfpflags_to_migratetype(gfp_mask),
};
+ if (cpusets_enabled()) {
+ alloc_mask |= __GFP_HARDWALL;
+ alloc_flags |= ALLOC_CPUSET;
+ if (!ac.nodemask)
+ ac.nodemask = &cpuset_current_mems_allowed;
+ }
+
gfp_mask &= gfp_allowed_mask;
lockdep_trace_alloc(gfp_mask);
@@ -3350,49 +3789,54 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
retry_cpuset:
cpuset_mems_cookie = read_mems_allowed_begin();
- /* We set it here, as __alloc_pages_slowpath might have changed it */
- ac.zonelist = zonelist;
-
/* Dirty zone balancing only done in the fast path */
ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);
/* The preferred zone is used for statistics later */
- preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
- ac.nodemask ? : &cpuset_current_mems_allowed,
- &ac.preferred_zone);
- if (!ac.preferred_zone)
- goto out;
- ac.classzone_idx = zonelist_zone_idx(preferred_zoneref);
+ ac.preferred_zoneref = first_zones_zonelist(ac.zonelist,
+ ac.high_zoneidx, ac.nodemask);
+ if (!ac.preferred_zoneref) {
+ page = NULL;
+ goto no_zone;
+ }
/* First allocation attempt */
- alloc_mask = gfp_mask|__GFP_HARDWALL;
page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
- if (unlikely(!page)) {
- /*
- * Runtime PM, block IO and its error handling path
- * can deadlock because I/O on the device might not
- * complete.
- */
- alloc_mask = memalloc_noio_flags(gfp_mask);
- ac.spread_dirty_pages = false;
-
- page = __alloc_pages_slowpath(alloc_mask, order, &ac);
- }
+ if (likely(page))
+ goto out;
- if (kmemcheck_enabled && page)
- kmemcheck_pagealloc_alloc(page, order, gfp_mask);
+ /*
+ * Runtime PM, block IO and its error handling path can deadlock
+ * because I/O on the device might not complete.
+ */
+ alloc_mask = memalloc_noio_flags(gfp_mask);
+ ac.spread_dirty_pages = false;
- trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
+ /*
+ * Restore the original nodemask if it was potentially replaced with
+ * &cpuset_current_mems_allowed to optimize the fast-path attempt.
+ */
+ if (cpusets_enabled())
+ ac.nodemask = nodemask;
+ page = __alloc_pages_slowpath(alloc_mask, order, &ac);
-out:
+no_zone:
/*
* When updating a task's mems_allowed, it is possible to race with
* parallel threads in such a way that an allocation can fail while
* the mask is being updated. If a page allocation is about to fail,
* check if the cpuset changed during allocation and if so, retry.
*/
- if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) {
+ alloc_mask = gfp_mask;
goto retry_cpuset;
+ }
+
+out:
+ if (kmemcheck_enabled && page)
+ kmemcheck_pagealloc_alloc(page, order, gfp_mask);
+
+ trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
return page;
}
@@ -3790,6 +4234,8 @@ void si_meminfo_node(struct sysinfo *val, int nid)
{
int zone_type; /* needs to be signed */
unsigned long managed_pages = 0;
+ unsigned long managed_highpages = 0;
+ unsigned long free_highpages = 0;
pg_data_t *pgdat = NODE_DATA(nid);
for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
@@ -3798,12 +4244,19 @@ void si_meminfo_node(struct sysinfo *val, int nid)
val->sharedram = node_page_state(nid, NR_SHMEM);
val->freeram = node_page_state(nid, NR_FREE_PAGES);
#ifdef CONFIG_HIGHMEM
- val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
- val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
- NR_FREE_PAGES);
+ for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
+ struct zone *zone = &pgdat->node_zones[zone_type];
+
+ if (is_highmem(zone)) {
+ managed_highpages += zone->managed_pages;
+ free_highpages += zone_page_state(zone, NR_FREE_PAGES);
+ }
+ }
+ val->totalhigh = managed_highpages;
+ val->freehigh = free_highpages;
#else
- val->totalhigh = 0;
- val->freehigh = 0;
+ val->totalhigh = managed_highpages;
+ val->freehigh = free_highpages;
#endif
val->mem_unit = PAGE_SIZE;
}
@@ -4390,13 +4843,12 @@ static void build_zonelists(pg_data_t *pgdat)
*/
int local_memory_node(int node)
{
- struct zone *zone;
+ struct zoneref *z;
- (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
+ z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
gfp_zone(GFP_KERNEL),
- NULL,
- &zone);
- return zone->node;
+ NULL);
+ return z->zone->node;
}
#endif
@@ -6395,49 +6847,6 @@ void setup_per_zone_wmarks(void)
}
/*
- * The inactive anon list should be small enough that the VM never has to
- * do too much work, but large enough that each inactive page has a chance
- * to be referenced again before it is swapped out.
- *
- * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
- * INACTIVE_ANON pages on this zone's LRU, maintained by the
- * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
- * the anonymous pages are kept on the inactive list.
- *
- * total target max
- * memory ratio inactive anon
- * -------------------------------------
- * 10MB 1 5MB
- * 100MB 1 50MB
- * 1GB 3 250MB
- * 10GB 10 0.9GB
- * 100GB 31 3GB
- * 1TB 101 10GB
- * 10TB 320 32GB
- */
-static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
-{
- unsigned int gb, ratio;
-
- /* Zone size in gigabytes */
- gb = zone->managed_pages >> (30 - PAGE_SHIFT);
- if (gb)
- ratio = int_sqrt(10 * gb);
- else
- ratio = 1;
-
- zone->inactive_ratio = ratio;
-}
-
-static void __meminit setup_per_zone_inactive_ratio(void)
-{
- struct zone *zone;
-
- for_each_zone(zone)
- calculate_zone_inactive_ratio(zone);
-}
-
-/*
* Initialise min_free_kbytes.
*
* For small machines we want it small (128k min). For large machines
@@ -6482,7 +6891,6 @@ int __meminit init_per_zone_wmark_min(void)
setup_per_zone_wmarks();
refresh_zone_stat_thresholds();
setup_per_zone_lowmem_reserve();
- setup_per_zone_inactive_ratio();
return 0;
}
core_initcall(init_per_zone_wmark_min)
@@ -6725,98 +7133,6 @@ void *__init alloc_large_system_hash(const char *tablename,
return table;
}
-/* Return a pointer to the bitmap storing bits affecting a block of pages */
-static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
- unsigned long pfn)
-{
-#ifdef CONFIG_SPARSEMEM
- return __pfn_to_section(pfn)->pageblock_flags;
-#else
- return zone->pageblock_flags;
-#endif /* CONFIG_SPARSEMEM */
-}
-
-static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
-{
-#ifdef CONFIG_SPARSEMEM
- pfn &= (PAGES_PER_SECTION-1);
- return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
-#else
- pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
- return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
-#endif /* CONFIG_SPARSEMEM */
-}
-
-/**
- * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
- * @page: The page within the block of interest
- * @pfn: The target page frame number
- * @end_bitidx: The last bit of interest to retrieve
- * @mask: mask of bits that the caller is interested in
- *
- * Return: pageblock_bits flags
- */
-unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
- unsigned long end_bitidx,
- unsigned long mask)
-{
- struct zone *zone;
- unsigned long *bitmap;
- unsigned long bitidx, word_bitidx;
- unsigned long word;
-
- zone = page_zone(page);
- bitmap = get_pageblock_bitmap(zone, pfn);
- bitidx = pfn_to_bitidx(zone, pfn);
- word_bitidx = bitidx / BITS_PER_LONG;
- bitidx &= (BITS_PER_LONG-1);
-
- word = bitmap[word_bitidx];
- bitidx += end_bitidx;
- return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
-}
-
-/**
- * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
- * @page: The page within the block of interest
- * @flags: The flags to set
- * @pfn: The target page frame number
- * @end_bitidx: The last bit of interest
- * @mask: mask of bits that the caller is interested in
- */
-void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
- unsigned long pfn,
- unsigned long end_bitidx,
- unsigned long mask)
-{
- struct zone *zone;
- unsigned long *bitmap;
- unsigned long bitidx, word_bitidx;
- unsigned long old_word, word;
-
- BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
-
- zone = page_zone(page);
- bitmap = get_pageblock_bitmap(zone, pfn);
- bitidx = pfn_to_bitidx(zone, pfn);
- word_bitidx = bitidx / BITS_PER_LONG;
- bitidx &= (BITS_PER_LONG-1);
-
- VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page);
-
- bitidx += end_bitidx;
- mask <<= (BITS_PER_LONG - bitidx - 1);
- flags <<= (BITS_PER_LONG - bitidx - 1);
-
- word = READ_ONCE(bitmap[word_bitidx]);
- for (;;) {
- old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
- if (word == old_word)
- break;
- word = old_word;
- }
-}
-
/*
* This function checks whether pageblock includes unmovable pages or not.
* If @count is not zero, it is okay to include less @count unmovable pages
@@ -6864,7 +7180,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
* We can't use page_count without pin a page
* because another CPU can free compound page.
* This check already skips compound tails of THP
- * because their page->_count is zero at all time.
+		 * because their page->_refcount is zero at all times.
*/
if (!page_ref_count(page)) {
if (PageBuddy(page))
@@ -7177,7 +7493,8 @@ void zone_pcp_reset(struct zone *zone)
#ifdef CONFIG_MEMORY_HOTREMOVE
/*
- * All pages in the range must be isolated before calling this.
+ * All pages in the range must be in a single zone and isolated
+ * before calling this.
*/
void
__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index c4f568206544..612122bf6a42 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -246,6 +246,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
return pfn;
}
+/* Caller should ensure that requested range is in a single zone */
int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
bool skip_hwpoisoned_pages)
{
@@ -288,13 +289,10 @@ struct page *alloc_migrate_target(struct page *page, unsigned long private,
* accordance with memory policy of the user process if possible. For
* now as a simple work-around, we use the next node for destination.
*/
- if (PageHuge(page)) {
- int node = next_online_node(page_to_nid(page));
- if (node == MAX_NUMNODES)
- node = first_online_node;
+ if (PageHuge(page))
return alloc_huge_page_node(page_hstate(compound_head(page)),
- node);
- }
+ next_node_in(page_to_nid(page),
+ node_online_map));
if (PageHighMem(page))
gfp_mask |= __GFP_HIGHMEM;
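
The conversion here, and the similar ones in mm/slab.c further down, relies on next_node_in() wrapping around the online map. A rough userspace analogue, using a plain bitmask instead of a nodemask_t, behaves like this:

#include <stdio.h>

/* Simplified stand-in for next_node_in(): the next bit set after 'node',
 * wrapping to the lowest set bit; at most 32 "nodes" are assumed. */
static int next_node_in_example(int node, unsigned int mask)
{
	for (int i = 1; i <= 32; i++) {
		int candidate = (node + i) % 32;

		if (mask & (1u << candidate))
			return candidate;
	}
	return -1;	/* empty mask */
}

int main(void)
{
	unsigned int online = (1u << 0) | (1u << 2);	/* nodes 0 and 2 online */

	printf("%d\n", next_node_in_example(0, online));	/* 2 */
	printf("%d\n", next_node_in_example(2, online));	/* wraps back to 0 */
	return 0;
}
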
diff --git a/mm/page_owner.c b/mm/page_owner.c
index ac3d8d129974..792b56da13d8 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -143,7 +143,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
goto err;
/* Print information relevant to grouping pages by mobility */
- pageblock_mt = get_pfnblock_migratetype(page, pfn);
+ pageblock_mt = get_pageblock_migratetype(page);
page_mt = gfpflags_to_migratetype(page_ext->gfp_mask);
ret += snprintf(kbuf + ret, count - ret,
"PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n",
@@ -301,6 +301,9 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
page = pfn_to_page(pfn);
+ if (page_zone(page) != zone)
+ continue;
+
/*
* We are safe to check buddy flag and order, because
* this is init stage and only single thread runs.
diff --git a/mm/page_poison.c b/mm/page_poison.c
index 479e7ea2bea6..1eae5fad2446 100644
--- a/mm/page_poison.c
+++ b/mm/page_poison.c
@@ -13,13 +13,7 @@ static int early_page_poison_param(char *buf)
{
if (!buf)
return -EINVAL;
-
- if (strcmp(buf, "on") == 0)
- want_page_poisoning = true;
- else if (strcmp(buf, "off") == 0)
- want_page_poisoning = false;
-
- return 0;
+ return strtobool(buf, &want_page_poisoning);
}
early_param("page_poison", early_page_poison_param);
diff --git a/mm/rmap.c b/mm/rmap.c
index 307b555024ef..8a839935b18c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -409,7 +409,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma = avc->anon_vma;
- BUG_ON(anon_vma->degree);
+ VM_WARN_ON(anon_vma->degree);
put_anon_vma(anon_vma);
list_del(&avc->same_vma);
@@ -1249,7 +1249,7 @@ void page_add_new_anon_rmap(struct page *page,
int nr = compound ? hpage_nr_pages(page) : 1;
VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
- SetPageSwapBacked(page);
+ __SetPageSwapBacked(page);
if (compound) {
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
/* increment count (starts at -1) */
diff --git a/mm/shmem.c b/mm/shmem.c
index e684a9140228..e418a995427d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -101,7 +101,6 @@ struct shmem_falloc {
enum sgp_type {
SGP_READ, /* don't exceed i_size, don't allocate page */
SGP_CACHE, /* don't exceed i_size, may allocate page */
- SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */
SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */
SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */
};
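
With SGP_DIRTY gone, the remaining values stay ordered so that the "sgp <= SGP_CACHE" tests added in shmem_getpage_gfp() below select exactly the two modes that must respect i_size. A tiny standalone check of that ordering, not part of the patch:

#include <stdio.h>

enum sgp_type { SGP_READ, SGP_CACHE, SGP_WRITE, SGP_FALLOC };

int main(void)
{
	/* only SGP_READ and SGP_CACHE are bounded by i_size */
	for (int sgp = SGP_READ; sgp <= SGP_FALLOC; sgp++)
		printf("sgp %d limited by i_size: %s\n",
		       sgp, sgp <= SGP_CACHE ? "yes" : "no");
	return 0;
}
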
@@ -122,13 +121,14 @@ static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
static int shmem_replace_page(struct page **pagep, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index);
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
- struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
+ struct page **pagep, enum sgp_type sgp,
+ gfp_t gfp, struct mm_struct *fault_mm, int *fault_type);
static inline int shmem_getpage(struct inode *inode, pgoff_t index,
- struct page **pagep, enum sgp_type sgp, int *fault_type)
+ struct page **pagep, enum sgp_type sgp)
{
return shmem_getpage_gfp(inode, index, pagep, sgp,
- mapping_gfp_mask(inode->i_mapping), fault_type);
+ mapping_gfp_mask(inode->i_mapping), NULL, NULL);
}
static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
@@ -169,7 +169,7 @@ static inline int shmem_reacct_size(unsigned long flags,
/*
* ... whereas tmpfs objects are accounted incrementally as
- * pages are allocated, in order to allow huge sparse files.
+ * pages are allocated, in order to allow large sparse files.
* shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
* so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
*/
@@ -528,7 +528,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
if (partial_start) {
struct page *page = NULL;
- shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
+ shmem_getpage(inode, start - 1, &page, SGP_READ);
if (page) {
unsigned int top = PAGE_SIZE;
if (start > end) {
@@ -543,7 +543,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
}
if (partial_end) {
struct page *page = NULL;
- shmem_getpage(inode, end, &page, SGP_READ, NULL);
+ shmem_getpage(inode, end, &page, SGP_READ);
if (page) {
zero_user_segment(page, 0, partial_end);
set_page_dirty(page);
@@ -947,8 +947,7 @@ redirty:
return 0;
}
-#ifdef CONFIG_NUMA
-#ifdef CONFIG_TMPFS
+#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
char buffer[64];
@@ -972,7 +971,18 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
}
return mpol;
}
-#endif /* CONFIG_TMPFS */
+#else /* !CONFIG_NUMA || !CONFIG_TMPFS */
+static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
+{
+}
+static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
+{
+ return NULL;
+}
+#endif /* CONFIG_NUMA && CONFIG_TMPFS */
+#ifndef CONFIG_NUMA
+#define vm_policy vm_private_data
+#endif
static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
@@ -1008,39 +1018,17 @@ static struct page *shmem_alloc_page(gfp_t gfp,
pvma.vm_ops = NULL;
pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
- page = alloc_page_vma(gfp, &pvma, 0);
+ page = alloc_pages_vma(gfp, 0, &pvma, 0, numa_node_id(), false);
+ if (page) {
+ __SetPageLocked(page);
+ __SetPageSwapBacked(page);
+ }
/* Drop reference taken by mpol_shared_policy_lookup() */
mpol_cond_put(pvma.vm_policy);
return page;
}
-#else /* !CONFIG_NUMA */
-#ifdef CONFIG_TMPFS
-static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
-{
-}
-#endif /* CONFIG_TMPFS */
-
-static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
- struct shmem_inode_info *info, pgoff_t index)
-{
- return swapin_readahead(swap, gfp, NULL, 0);
-}
-
-static inline struct page *shmem_alloc_page(gfp_t gfp,
- struct shmem_inode_info *info, pgoff_t index)
-{
- return alloc_page(gfp);
-}
-#endif /* CONFIG_NUMA */
-
-#if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS)
-static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
-{
- return NULL;
-}
-#endif
/*
* When a page is moved from swapcache to shmem filecache (either by the
@@ -1084,9 +1072,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
copy_highpage(newpage, oldpage);
flush_dcache_page(newpage);
- __SetPageLocked(newpage);
SetPageUptodate(newpage);
- SetPageSwapBacked(newpage);
set_page_private(newpage, swap_index);
SetPageSwapCache(newpage);
@@ -1130,14 +1116,19 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
*
* If we allocate a new one we do not mark it dirty. That's up to the
* vm. If we swap it in we mark it dirty since we also free the swap
- * entry since a page cannot live in both the swap and page cache
+ * entry since a page cannot live in both the swap and page cache.
+ *
+ * fault_mm and fault_type are only supplied by shmem_fault:
+ * otherwise they are NULL.
*/
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
- struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
+ struct page **pagep, enum sgp_type sgp, gfp_t gfp,
+ struct mm_struct *fault_mm, int *fault_type)
{
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info;
struct shmem_sb_info *sbinfo;
+ struct mm_struct *charge_mm;
struct mem_cgroup *memcg;
struct page *page;
swp_entry_t swap;
@@ -1155,7 +1146,7 @@ repeat:
page = NULL;
}
- if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
+ if (sgp <= SGP_CACHE &&
((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
error = -EINVAL;
goto unlock;
@@ -1183,14 +1174,19 @@ repeat:
*/
info = SHMEM_I(inode);
sbinfo = SHMEM_SB(inode->i_sb);
+ charge_mm = fault_mm ? : current->mm;
if (swap.val) {
/* Look it up and read it in.. */
page = lookup_swap_cache(swap);
if (!page) {
- /* here we actually do the io */
- if (fault_type)
+ /* Or update major stats only when swapin succeeds?? */
+ if (fault_type) {
*fault_type |= VM_FAULT_MAJOR;
+ count_vm_event(PGMAJFAULT);
+ mem_cgroup_count_vm_event(fault_mm, PGMAJFAULT);
+ }
+ /* Here we actually start the io */
page = shmem_swapin(swap, gfp, info, index);
if (!page) {
error = -ENOMEM;
@@ -1217,7 +1213,7 @@ repeat:
goto failed;
}
- error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg,
+ error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg,
false);
if (!error) {
error = shmem_add_to_page_cache(page, mapping, index,
@@ -1275,13 +1271,10 @@ repeat:
error = -ENOMEM;
goto decused;
}
-
- __SetPageSwapBacked(page);
- __SetPageLocked(page);
if (sgp == SGP_WRITE)
__SetPageReferenced(page);
- error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg,
+ error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg,
false);
if (error)
goto decused;
@@ -1321,12 +1314,10 @@ clear:
flush_dcache_page(page);
SetPageUptodate(page);
}
- if (sgp == SGP_DIRTY)
- set_page_dirty(page);
}
/* Perhaps the file has been truncated since we checked */
- if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
+ if (sgp <= SGP_CACHE &&
((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
if (alloced) {
ClearPageDirty(page);
@@ -1372,6 +1363,7 @@ unlock:
static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct inode *inode = file_inode(vma->vm_file);
+ gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
int error;
int ret = VM_FAULT_LOCKED;
@@ -1433,14 +1425,10 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
spin_unlock(&inode->i_lock);
}
- error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
+ error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, SGP_CACHE,
+ gfp, vma->vm_mm, &ret);
if (error)
return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
-
- if (ret & VM_FAULT_MAJOR) {
- count_vm_event(PGMAJFAULT);
- mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
- }
return ret;
}
@@ -1587,7 +1575,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
return -EPERM;
}
- return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
+ return shmem_getpage(inode, index, pagep, SGP_WRITE);
}
static int
@@ -1633,7 +1621,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
* and even mark them dirty, so it cannot exceed the max_blocks limit.
*/
if (!iter_is_iovec(to))
- sgp = SGP_DIRTY;
+ sgp = SGP_CACHE;
index = *ppos >> PAGE_SHIFT;
offset = *ppos & ~PAGE_MASK;
@@ -1653,14 +1641,17 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
break;
}
- error = shmem_getpage(inode, index, &page, sgp, NULL);
+ error = shmem_getpage(inode, index, &page, sgp);
if (error) {
if (error == -EINVAL)
error = 0;
break;
}
- if (page)
+ if (page) {
+ if (sgp == SGP_CACHE)
+ set_page_dirty(page);
unlock_page(page);
+ }
/*
* We must evaluate after, since reads (unlike writes)
@@ -1766,7 +1757,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
error = 0;
while (spd.nr_pages < nr_pages) {
- error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL);
+ error = shmem_getpage(inode, index, &page, SGP_CACHE);
if (error)
break;
unlock_page(page);
@@ -1788,8 +1779,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
page = spd.pages[page_nr];
if (!PageUptodate(page) || page->mapping != mapping) {
- error = shmem_getpage(inode, index, &page,
- SGP_CACHE, NULL);
+ error = shmem_getpage(inode, index, &page, SGP_CACHE);
if (error)
break;
unlock_page(page);
@@ -2232,8 +2222,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
error = -ENOMEM;
else
- error = shmem_getpage(inode, index, &page, SGP_FALLOC,
- NULL);
+ error = shmem_getpage(inode, index, &page, SGP_FALLOC);
if (error) {
/* Remove the !PageUptodate pages we added */
shmem_undo_range(inode,
@@ -2551,7 +2540,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
inode->i_op = &shmem_short_symlink_operations;
} else {
inode_nohighmem(inode);
- error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
+ error = shmem_getpage(inode, 0, &page, SGP_WRITE);
if (error) {
iput(inode);
return error;
@@ -2592,7 +2581,7 @@ static const char *shmem_get_link(struct dentry *dentry,
return ERR_PTR(-ECHILD);
}
} else {
- error = shmem_getpage(inode, 0, &page, SGP_READ, NULL);
+ error = shmem_getpage(inode, 0, &page, SGP_READ);
if (error)
return ERR_PTR(error);
unlock_page(page);
@@ -3496,7 +3485,8 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
int error;
BUG_ON(mapping->a_ops != &shmem_aops);
- error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL);
+ error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE,
+ gfp, NULL, NULL);
if (error)
page = ERR_PTR(error);
else
diff --git a/mm/slab.c b/mm/slab.c
index 17e2848979c5..cc8bbc1e6bc9 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -213,6 +213,11 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list);
static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
static void cache_reap(struct work_struct *unused);
+static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
+ void **list);
+static inline void fixup_slab_list(struct kmem_cache *cachep,
+ struct kmem_cache_node *n, struct page *page,
+ void **list);
static int slab_early_init = 1;
#define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node))
@@ -421,8 +426,6 @@ static struct kmem_cache kmem_cache_boot = {
.name = "kmem_cache",
};
-#define BAD_ALIEN_MAGIC 0x01020304ul
-
static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
@@ -519,22 +522,15 @@ static DEFINE_PER_CPU(unsigned long, slab_reap_node);
static void init_reap_node(int cpu)
{
- int node;
-
- node = next_node(cpu_to_mem(cpu), node_online_map);
- if (node == MAX_NUMNODES)
- node = first_node(node_online_map);
-
- per_cpu(slab_reap_node, cpu) = node;
+ per_cpu(slab_reap_node, cpu) = next_node_in(cpu_to_mem(cpu),
+ node_online_map);
}
static void next_reap_node(void)
{
int node = __this_cpu_read(slab_reap_node);
- node = next_node(node, node_online_map);
- if (unlikely(node >= MAX_NUMNODES))
- node = first_node(node_online_map);
+ node = next_node_in(node, node_online_map);
__this_cpu_write(slab_reap_node, node);
}
@@ -644,7 +640,7 @@ static int transfer_objects(struct array_cache *to,
static inline struct alien_cache **alloc_alien_cache(int node,
int limit, gfp_t gfp)
{
- return (struct alien_cache **)BAD_ALIEN_MAGIC;
+ return NULL;
}
static inline void free_alien_cache(struct alien_cache **ac_ptr)
@@ -850,6 +846,46 @@ static inline gfp_t gfp_exact_node(gfp_t flags)
}
#endif
+static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp)
+{
+ struct kmem_cache_node *n;
+
+ /*
+ * Set up the kmem_cache_node for cpu before we can
+ * begin anything. Make sure some other cpu on this
+ * node has not already allocated this
+ */
+ n = get_node(cachep, node);
+ if (n) {
+ spin_lock_irq(&n->list_lock);
+ n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount +
+ cachep->num;
+ spin_unlock_irq(&n->list_lock);
+
+ return 0;
+ }
+
+ n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node);
+ if (!n)
+ return -ENOMEM;
+
+ kmem_cache_node_init(n);
+ n->next_reap = jiffies + REAPTIMEOUT_NODE +
+ ((unsigned long)cachep) % REAPTIMEOUT_NODE;
+
+ n->free_limit =
+ (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num;
+
+ /*
+ * The kmem_cache_nodes don't come and go as CPUs
+ * come and go. slab_mutex is sufficient
+ * protection here.
+ */
+ cachep->node[node] = n;
+
+ return 0;
+}
+
/*
* Allocates and initializes node for a node on each slab cache, used for
* either memory or cpu hotplug. If memory is being hot-added, the kmem_cache_node
@@ -861,46 +897,82 @@ static inline gfp_t gfp_exact_node(gfp_t flags)
*/
static int init_cache_node_node(int node)
{
+ int ret;
struct kmem_cache *cachep;
- struct kmem_cache_node *n;
- const size_t memsize = sizeof(struct kmem_cache_node);
list_for_each_entry(cachep, &slab_caches, list) {
- /*
- * Set up the kmem_cache_node for cpu before we can
- * begin anything. Make sure some other cpu on this
- * node has not already allocated this
- */
- n = get_node(cachep, node);
- if (!n) {
- n = kmalloc_node(memsize, GFP_KERNEL, node);
- if (!n)
- return -ENOMEM;
- kmem_cache_node_init(n);
- n->next_reap = jiffies + REAPTIMEOUT_NODE +
- ((unsigned long)cachep) % REAPTIMEOUT_NODE;
-
- /*
- * The kmem_cache_nodes don't come and go as CPUs
- * come and go. slab_mutex is sufficient
- * protection here.
- */
- cachep->node[node] = n;
- }
-
- spin_lock_irq(&n->list_lock);
- n->free_limit =
- (1 + nr_cpus_node(node)) *
- cachep->batchcount + cachep->num;
- spin_unlock_irq(&n->list_lock);
+ ret = init_cache_node(cachep, node, GFP_KERNEL);
+ if (ret)
+ return ret;
}
+
return 0;
}
-static inline int slabs_tofree(struct kmem_cache *cachep,
- struct kmem_cache_node *n)
+static int setup_kmem_cache_node(struct kmem_cache *cachep,
+ int node, gfp_t gfp, bool force_change)
{
- return (n->free_objects + cachep->num - 1) / cachep->num;
+ int ret = -ENOMEM;
+ struct kmem_cache_node *n;
+ struct array_cache *old_shared = NULL;
+ struct array_cache *new_shared = NULL;
+ struct alien_cache **new_alien = NULL;
+ LIST_HEAD(list);
+
+ if (use_alien_caches) {
+ new_alien = alloc_alien_cache(node, cachep->limit, gfp);
+ if (!new_alien)
+ goto fail;
+ }
+
+ if (cachep->shared) {
+ new_shared = alloc_arraycache(node,
+ cachep->shared * cachep->batchcount, 0xbaadf00d, gfp);
+ if (!new_shared)
+ goto fail;
+ }
+
+ ret = init_cache_node(cachep, node, gfp);
+ if (ret)
+ goto fail;
+
+ n = get_node(cachep, node);
+ spin_lock_irq(&n->list_lock);
+ if (n->shared && force_change) {
+ free_block(cachep, n->shared->entry,
+ n->shared->avail, node, &list);
+ n->shared->avail = 0;
+ }
+
+ if (!n->shared || force_change) {
+ old_shared = n->shared;
+ n->shared = new_shared;
+ new_shared = NULL;
+ }
+
+ if (!n->alien) {
+ n->alien = new_alien;
+ new_alien = NULL;
+ }
+
+ spin_unlock_irq(&n->list_lock);
+ slabs_destroy(cachep, &list);
+
+ /*
+ * To protect lockless access to n->shared during irq disabled context.
+	 * If n->shared isn't NULL in irq disabled context, accessing it is
+ * guaranteed to be valid until irq is re-enabled, because it will be
+ * freed after synchronize_sched().
+ */
+ if (force_change)
+ synchronize_sched();
+
+fail:
+ kfree(old_shared);
+ kfree(new_shared);
+ free_alien_cache(new_alien);
+
+ return ret;
}
static void cpuup_canceled(long cpu)
@@ -967,14 +1039,13 @@ free_slab:
n = get_node(cachep, node);
if (!n)
continue;
- drain_freelist(cachep, n, slabs_tofree(cachep, n));
+ drain_freelist(cachep, n, INT_MAX);
}
}
static int cpuup_prepare(long cpu)
{
struct kmem_cache *cachep;
- struct kmem_cache_node *n = NULL;
int node = cpu_to_mem(cpu);
int err;
@@ -993,44 +1064,9 @@ static int cpuup_prepare(long cpu)
* array caches
*/
list_for_each_entry(cachep, &slab_caches, list) {
- struct array_cache *shared = NULL;
- struct alien_cache **alien = NULL;
-
- if (cachep->shared) {
- shared = alloc_arraycache(node,
- cachep->shared * cachep->batchcount,
- 0xbaadf00d, GFP_KERNEL);
- if (!shared)
- goto bad;
- }
- if (use_alien_caches) {
- alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
- if (!alien) {
- kfree(shared);
- goto bad;
- }
- }
- n = get_node(cachep, node);
- BUG_ON(!n);
-
- spin_lock_irq(&n->list_lock);
- if (!n->shared) {
- /*
- * We are serialised from CPU_DEAD or
- * CPU_UP_CANCELLED by the cpucontrol lock
- */
- n->shared = shared;
- shared = NULL;
- }
-#ifdef CONFIG_NUMA
- if (!n->alien) {
- n->alien = alien;
- alien = NULL;
- }
-#endif
- spin_unlock_irq(&n->list_lock);
- kfree(shared);
- free_alien_cache(alien);
+ err = setup_kmem_cache_node(cachep, node, GFP_KERNEL, false);
+ if (err)
+ goto bad;
}
return 0;
@@ -1119,7 +1155,7 @@ static int __meminit drain_cache_node_node(int node)
if (!n)
continue;
- drain_freelist(cachep, n, slabs_tofree(cachep, n));
+ drain_freelist(cachep, n, INT_MAX);
if (!list_empty(&n->slabs_full) ||
!list_empty(&n->slabs_partial)) {
@@ -1200,6 +1236,61 @@ static void __init set_up_node(struct kmem_cache *cachep, int index)
}
}
+#ifdef CONFIG_SLAB_FREELIST_RANDOM
+static void freelist_randomize(struct rnd_state *state, freelist_idx_t *list,
+ size_t count)
+{
+ size_t i;
+ unsigned int rand;
+
+ for (i = 0; i < count; i++)
+ list[i] = i;
+
+ /* Fisher-Yates shuffle */
+ for (i = count - 1; i > 0; i--) {
+ rand = prandom_u32_state(state);
+ rand %= (i + 1);
+ swap(list[i], list[rand]);
+ }
+}
+
+/* Create a random sequence per cache */
+static int cache_random_seq_create(struct kmem_cache *cachep, gfp_t gfp)
+{
+ unsigned int seed, count = cachep->num;
+ struct rnd_state state;
+
+ if (count < 2)
+ return 0;
+
+ /* If it fails, we will just use the global lists */
+ cachep->random_seq = kcalloc(count, sizeof(freelist_idx_t), gfp);
+ if (!cachep->random_seq)
+ return -ENOMEM;
+
+ /* Get best entropy at this stage */
+ get_random_bytes_arch(&seed, sizeof(seed));
+ prandom_seed_state(&state, seed);
+
+ freelist_randomize(&state, cachep->random_seq, count);
+ return 0;
+}
+
+/* Destroy the per-cache random freelist sequence */
+static void cache_random_seq_destroy(struct kmem_cache *cachep)
+{
+ kfree(cachep->random_seq);
+ cachep->random_seq = NULL;
+}
+#else
+static inline int cache_random_seq_create(struct kmem_cache *cachep, gfp_t gfp)
+{
+ return 0;
+}
+static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { }
+#endif /* CONFIG_SLAB_FREELIST_RANDOM */
+
+
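
freelist_randomize() above is a plain Fisher-Yates shuffle over the object indices. A self-contained sketch with rand() standing in for prandom_u32_state(), an illustration only and not the kernel code:

#include <stdio.h>
#include <stdlib.h>

static void shuffle(unsigned short *list, size_t count)
{
	/* start from the identity sequence 0..count-1 */
	for (size_t i = 0; i < count; i++)
		list[i] = i;

	/* Fisher-Yates: swap each slot with a random earlier-or-equal slot */
	for (size_t i = count - 1; i > 0; i--) {
		size_t j = (size_t)rand() % (i + 1);
		unsigned short tmp = list[i];

		list[i] = list[j];
		list[j] = tmp;
	}
}

int main(void)
{
	unsigned short seq[8];

	srand(42);	/* fixed seed, purely for a reproducible example */
	shuffle(seq, 8);
	for (int i = 0; i < 8; i++)
		printf("%u ", seq[i]);
	printf("\n");
	return 0;
}
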
/*
* Initialisation. Called after the page allocator have been initialised and
* before smp_init().
@@ -1212,7 +1303,7 @@ void __init kmem_cache_init(void)
sizeof(struct rcu_head));
kmem_cache = &kmem_cache_boot;
- if (num_possible_nodes() == 1)
+ if (!IS_ENABLED(CONFIG_NUMA) || num_possible_nodes() == 1)
use_alien_caches = 0;
for (i = 0; i < NUM_INIT_LISTS; i++)
@@ -1781,7 +1872,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
/*
* Needed to avoid possible looping condition
- * in cache_grow()
+ * in cache_grow_begin()
*/
if (OFF_SLAB(freelist_cache))
continue;
@@ -2138,7 +2229,7 @@ done:
cachep->freelist_size = cachep->num * sizeof(freelist_idx_t);
cachep->flags = flags;
cachep->allocflags = __GFP_COMP;
- if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
+ if (flags & SLAB_CACHE_DMA)
cachep->allocflags |= GFP_DMA;
cachep->size = size;
cachep->reciprocal_buffer_size = reciprocal_value(size);
@@ -2180,6 +2271,11 @@ static void check_irq_on(void)
BUG_ON(irqs_disabled());
}
+static void check_mutex_acquired(void)
+{
+ BUG_ON(!mutex_is_locked(&slab_mutex));
+}
+
static void check_spinlock_acquired(struct kmem_cache *cachep)
{
#ifdef CONFIG_SMP
@@ -2199,13 +2295,27 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
#else
#define check_irq_off() do { } while(0)
#define check_irq_on() do { } while(0)
+#define check_mutex_acquired() do { } while(0)
#define check_spinlock_acquired(x) do { } while(0)
#define check_spinlock_acquired_node(x, y) do { } while(0)
#endif
-static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
- struct array_cache *ac,
- int force, int node);
+static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac,
+ int node, bool free_all, struct list_head *list)
+{
+ int tofree;
+
+ if (!ac || !ac->avail)
+ return;
+
+ tofree = free_all ? ac->avail : (ac->limit + 4) / 5;
+ if (tofree > ac->avail)
+ tofree = (ac->avail + 1) / 2;
+
+ free_block(cachep, ac->entry, tofree, node, list);
+ ac->avail -= tofree;
+ memmove(ac->entry, &(ac->entry[tofree]), sizeof(void *) * ac->avail);
+}
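
The partial drain above frees roughly a fifth of the array limit per pass, capped at half of what is currently cached. An illustrative calculation with made-up numbers:

#include <stdio.h>

int main(void)
{
	int limit = 120, avail = 10;	/* hypothetical array_cache state */
	int tofree = (limit + 4) / 5;	/* 24 objects per pass by default */

	if (tofree > avail)
		tofree = (avail + 1) / 2;	/* only 5 when little is cached */

	printf("drain %d of %d cached objects\n", tofree, avail);
	return 0;
}
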
static void do_drain(void *arg)
{
@@ -2229,6 +2339,7 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
{
struct kmem_cache_node *n;
int node;
+ LIST_HEAD(list);
on_each_cpu(do_drain, cachep, 1);
check_irq_on();
@@ -2236,8 +2347,13 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
if (n->alien)
drain_alien_cache(cachep, n->alien);
- for_each_kmem_cache_node(cachep, node, n)
- drain_array(cachep, n, n->shared, 1, node);
+ for_each_kmem_cache_node(cachep, node, n) {
+ spin_lock_irq(&n->list_lock);
+ drain_array_locked(cachep, n->shared, node, true, &list);
+ spin_unlock_irq(&n->list_lock);
+
+ slabs_destroy(cachep, &list);
+ }
}
/*
@@ -2288,7 +2404,7 @@ int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate)
check_irq_on();
for_each_kmem_cache_node(cachep, node, n) {
- drain_freelist(cachep, n, slabs_tofree(cachep, n));
+ drain_freelist(cachep, n, INT_MAX);
ret += !list_empty(&n->slabs_full) ||
!list_empty(&n->slabs_partial);
@@ -2306,6 +2422,8 @@ void __kmem_cache_release(struct kmem_cache *cachep)
int i;
struct kmem_cache_node *n;
+ cache_random_seq_destroy(cachep);
+
free_percpu(cachep->cpu_cache);
/* NUMA: free the node structures */
@@ -2412,15 +2530,115 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page)
#endif
}
+#ifdef CONFIG_SLAB_FREELIST_RANDOM
+/* Hold information during a freelist initialization */
+union freelist_init_state {
+ struct {
+ unsigned int pos;
+ freelist_idx_t *list;
+ unsigned int count;
+ unsigned int rand;
+ };
+ struct rnd_state rnd_state;
+};
+
+/*
+ * Initialize the state based on the randomization method available.
+ * Return true if the pre-computed list is available, false otherwise.
+ */
+static bool freelist_state_initialize(union freelist_init_state *state,
+ struct kmem_cache *cachep,
+ unsigned int count)
+{
+ bool ret;
+ unsigned int rand;
+
+ /* Use best entropy available to define a random shift */
+ get_random_bytes_arch(&rand, sizeof(rand));
+
+ /* Use a random state if the pre-computed list is not available */
+ if (!cachep->random_seq) {
+ prandom_seed_state(&state->rnd_state, rand);
+ ret = false;
+ } else {
+ state->list = cachep->random_seq;
+ state->count = count;
+ state->pos = 0;
+ state->rand = rand;
+ ret = true;
+ }
+ return ret;
+}
+
+/* Get the next entry on the list and randomize it using a random shift */
+static freelist_idx_t next_random_slot(union freelist_init_state *state)
+{
+ return (state->list[state->pos++] + state->rand) % state->count;
+}
+
+/*
+ * Shuffle the freelist initialization state based on pre-computed lists.
+ * Return true if the list was successfully shuffled, false otherwise.
+ */
+static bool shuffle_freelist(struct kmem_cache *cachep, struct page *page)
+{
+ unsigned int objfreelist = 0, i, count = cachep->num;
+ union freelist_init_state state;
+ bool precomputed;
+
+ if (count < 2)
+ return false;
+
+ precomputed = freelist_state_initialize(&state, cachep, count);
+
+ /* Take a random entry as the objfreelist */
+ if (OBJFREELIST_SLAB(cachep)) {
+ if (!precomputed)
+ objfreelist = count - 1;
+ else
+ objfreelist = next_random_slot(&state);
+ page->freelist = index_to_obj(cachep, page, objfreelist) +
+ obj_offset(cachep);
+ count--;
+ }
+
+ /*
+ * On early boot, generate the list dynamically.
+ * Later use a pre-computed list for speed.
+ */
+ if (!precomputed) {
+ freelist_randomize(&state.rnd_state, page->freelist, count);
+ } else {
+ for (i = 0; i < count; i++)
+ set_free_obj(page, i, next_random_slot(&state));
+ }
+
+ if (OBJFREELIST_SLAB(cachep))
+ set_free_obj(page, cachep->num - 1, objfreelist);
+
+ return true;
+}
+#else
+static inline bool shuffle_freelist(struct kmem_cache *cachep,
+ struct page *page)
+{
+ return false;
+}
+#endif /* CONFIG_SLAB_FREELIST_RANDOM */
+
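
next_random_slot() above reuses one precomputed sequence for every slab and only offsets it by a per-slab random value. A standalone illustration with a hypothetical four-object cache:

#include <stdio.h>

int main(void)
{
	unsigned int list[] = { 3, 0, 2, 1 };	/* hypothetical cachep->random_seq */
	unsigned int shift = 5;			/* per-slab random value */
	unsigned int count = 4;

	/* (3+5)%4=0, (0+5)%4=1, (2+5)%4=3, (1+5)%4=2: each precomputed index
	 * is offset by shift % count, giving a different permutation per slab */
	for (unsigned int pos = 0; pos < count; pos++)
		printf("%u ", (list[pos] + shift) % count);
	printf("\n");
	return 0;
}
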
static void cache_init_objs(struct kmem_cache *cachep,
struct page *page)
{
int i;
void *objp;
+ bool shuffled;
cache_init_objs_debug(cachep, page);
- if (OBJFREELIST_SLAB(cachep)) {
+ /* Try to randomize the freelist if enabled */
+ shuffled = shuffle_freelist(cachep, page);
+
+ if (!shuffled && OBJFREELIST_SLAB(cachep)) {
page->freelist = index_to_obj(cachep, page, cachep->num - 1) +
obj_offset(cachep);
}
@@ -2434,17 +2652,8 @@ static void cache_init_objs(struct kmem_cache *cachep,
kasan_poison_object_data(cachep, objp);
}
- set_free_obj(page, i, i);
- }
-}
-
-static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
-{
- if (CONFIG_ZONE_DMA_FLAG) {
- if (flags & GFP_DMA)
- BUG_ON(!(cachep->allocflags & GFP_DMA));
- else
- BUG_ON(cachep->allocflags & GFP_DMA);
+ if (!shuffled)
+ set_free_obj(page, i, i);
}
}
@@ -2502,13 +2711,15 @@ static void slab_map_pages(struct kmem_cache *cache, struct page *page,
* Grow (by 1) the number of slabs within a cache. This is called by
* kmem_cache_alloc() when there are no active objs left in a cache.
*/
-static int cache_grow(struct kmem_cache *cachep,
- gfp_t flags, int nodeid, struct page *page)
+static struct page *cache_grow_begin(struct kmem_cache *cachep,
+ gfp_t flags, int nodeid)
{
void *freelist;
size_t offset;
gfp_t local_flags;
+ int page_node;
struct kmem_cache_node *n;
+ struct page *page;
/*
* Be lazy and only check for valid flags here, keeping it out of the
@@ -2520,43 +2731,35 @@ static int cache_grow(struct kmem_cache *cachep,
}
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
- /* Take the node list lock to change the colour_next on this node */
check_irq_off();
- n = get_node(cachep, nodeid);
- spin_lock(&n->list_lock);
-
- /* Get colour for the slab, and cal the next value. */
- offset = n->colour_next;
- n->colour_next++;
- if (n->colour_next >= cachep->colour)
- n->colour_next = 0;
- spin_unlock(&n->list_lock);
-
- offset *= cachep->colour_off;
-
if (gfpflags_allow_blocking(local_flags))
local_irq_enable();
/*
- * The test for missing atomic flag is performed here, rather than
- * the more obvious place, simply to reduce the critical path length
- * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
- * will eventually be caught here (where it matters).
- */
- kmem_flagcheck(cachep, flags);
-
- /*
* Get mem for the objs. Attempt to allocate a physical page from
* 'nodeid'.
*/
- if (!page)
- page = kmem_getpages(cachep, local_flags, nodeid);
+ page = kmem_getpages(cachep, local_flags, nodeid);
if (!page)
goto failed;
+ page_node = page_to_nid(page);
+ n = get_node(cachep, page_node);
+
+ /* Get colour for the slab, and calculate the next value. */
+ n->colour_next++;
+ if (n->colour_next >= cachep->colour)
+ n->colour_next = 0;
+
+ offset = n->colour_next;
+ if (offset >= cachep->colour)
+ offset = 0;
+
+ offset *= cachep->colour_off;
+
/* Get slab management. */
freelist = alloc_slabmgmt(cachep, page, offset,
- local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
+ local_flags & ~GFP_CONSTRAINT_MASK, page_node);
if (OFF_SLAB(cachep) && !freelist)
goto opps1;
@@ -2567,21 +2770,40 @@ static int cache_grow(struct kmem_cache *cachep,
if (gfpflags_allow_blocking(local_flags))
local_irq_disable();
- check_irq_off();
- spin_lock(&n->list_lock);
- /* Make slab active. */
- list_add_tail(&page->lru, &(n->slabs_free));
- STATS_INC_GROWN(cachep);
- n->free_objects += cachep->num;
- spin_unlock(&n->list_lock);
- return 1;
+ return page;
+
opps1:
kmem_freepages(cachep, page);
failed:
if (gfpflags_allow_blocking(local_flags))
local_irq_disable();
- return 0;
+ return NULL;
+}
+
+static void cache_grow_end(struct kmem_cache *cachep, struct page *page)
+{
+ struct kmem_cache_node *n;
+ void *list = NULL;
+
+ check_irq_off();
+
+ if (!page)
+ return;
+
+ INIT_LIST_HEAD(&page->lru);
+ n = get_node(cachep, page_to_nid(page));
+
+ spin_lock(&n->list_lock);
+ if (!page->active)
+ list_add_tail(&page->lru, &(n->slabs_free));
+ else
+ fixup_slab_list(cachep, n, page, &list);
+ STATS_INC_GROWN(cachep);
+ n->free_objects += cachep->num - page->active;
+ spin_unlock(&n->list_lock);
+
+ fixup_objfreelist_debug(cachep, &list);
}
#if DEBUG
@@ -2785,18 +3007,42 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep,
return obj;
}
+/*
+ * The slab list should be fixed up by fixup_slab_list() for an existing
+ * slab, or by cache_grow_end() for a new slab
+ */
+static __always_inline int alloc_block(struct kmem_cache *cachep,
+ struct array_cache *ac, struct page *page, int batchcount)
+{
+ /*
+ * There must be at least one object available for
+ * allocation.
+ */
+ BUG_ON(page->active >= cachep->num);
+
+ while (page->active < cachep->num && batchcount--) {
+ STATS_INC_ALLOCED(cachep);
+ STATS_INC_ACTIVE(cachep);
+ STATS_SET_HIGH(cachep);
+
+ ac->entry[ac->avail++] = slab_get_obj(cachep, page);
+ }
+
+ return batchcount;
+}
+
static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
{
int batchcount;
struct kmem_cache_node *n;
- struct array_cache *ac;
+ struct array_cache *ac, *shared;
int node;
void *list = NULL;
+ struct page *page;
check_irq_off();
node = numa_mem_id();
-retry:
ac = cpu_cache_get(cachep);
batchcount = ac->batchcount;
if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
@@ -2810,16 +3056,20 @@ retry:
n = get_node(cachep, node);
BUG_ON(ac->avail > 0 || !n);
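+ /*
+ * Nothing to refill from on this node (no free objects and an empty
+ * shared array)? Skip the node lock and grow the cache directly.
+ */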
+ shared = READ_ONCE(n->shared);
+ if (!n->free_objects && (!shared || !shared->avail))
+ goto direct_grow;
+
spin_lock(&n->list_lock);
+ shared = READ_ONCE(n->shared);
/* See if we can refill from the shared array */
- if (n->shared && transfer_objects(ac, n->shared, batchcount)) {
- n->shared->touched = 1;
+ if (shared && transfer_objects(ac, shared, batchcount)) {
+ shared->touched = 1;
goto alloc_done;
}
while (batchcount > 0) {
- struct page *page;
/* Get slab alloc is to come from. */
page = get_first_slab(n, false);
if (!page)
@@ -2827,21 +3077,7 @@ retry:
check_spinlock_acquired(cachep);
- /*
- * The slab was either on partial or free list so
- * there must be at least one object available for
- * allocation.
- */
- BUG_ON(page->active >= cachep->num);
-
- while (page->active < cachep->num && batchcount--) {
- STATS_INC_ALLOCED(cachep);
- STATS_INC_ACTIVE(cachep);
- STATS_SET_HIGH(cachep);
-
- ac->entry[ac->avail++] = slab_get_obj(cachep, page);
- }
-
+ batchcount = alloc_block(cachep, ac, page, batchcount);
fixup_slab_list(cachep, n, page, &list);
}
@@ -2851,9 +3087,8 @@ alloc_done:
spin_unlock(&n->list_lock);
fixup_objfreelist_debug(cachep, &list);
+direct_grow:
if (unlikely(!ac->avail)) {
- int x;
-
/* Check if we can use obj in pfmemalloc slab */
if (sk_memalloc_socks()) {
void *obj = cache_alloc_pfmemalloc(cachep, n, flags);
@@ -2862,18 +3097,19 @@ alloc_done:
return obj;
}
- x = cache_grow(cachep, gfp_exact_node(flags), node, NULL);
+ page = cache_grow_begin(cachep, gfp_exact_node(flags), node);
- /* cache_grow can reenable interrupts, then ac could change. */
+ /*
+ * cache_grow_begin() can reenable interrupts,
+ * then ac could change.
+ */
ac = cpu_cache_get(cachep);
- node = numa_mem_id();
+ if (!ac->avail && page)
+ alloc_block(cachep, ac, page, batchcount);
+ cache_grow_end(cachep, page);
- /* no objects in sight? abort */
- if (!x && ac->avail == 0)
+ if (!ac->avail)
return NULL;
-
- if (!ac->avail) /* objects refilled by interrupt? */
- goto retry;
}
ac->touched = 1;
@@ -2884,9 +3120,6 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
gfp_t flags)
{
might_sleep_if(gfpflags_allow_blocking(flags));
-#if DEBUG
- kmem_flagcheck(cachep, flags);
-#endif
}
#if DEBUG
@@ -2998,19 +3231,17 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
{
struct zonelist *zonelist;
- gfp_t local_flags;
struct zoneref *z;
struct zone *zone;
enum zone_type high_zoneidx = gfp_zone(flags);
void *obj = NULL;
+ struct page *page;
int nid;
unsigned int cpuset_mems_cookie;
if (flags & __GFP_THISNODE)
return NULL;
- local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
-
retry_cpuset:
cpuset_mems_cookie = read_mems_allowed_begin();
zonelist = node_zonelist(mempolicy_slab_node(), flags);
@@ -3040,33 +3271,19 @@ retry:
* We may trigger various forms of reclaim on the allowed
* set and go into memory reserves if necessary.
*/
- struct page *page;
-
- if (gfpflags_allow_blocking(local_flags))
- local_irq_enable();
- kmem_flagcheck(cache, flags);
- page = kmem_getpages(cache, local_flags, numa_mem_id());
- if (gfpflags_allow_blocking(local_flags))
- local_irq_disable();
+ page = cache_grow_begin(cache, flags, numa_mem_id());
+ cache_grow_end(cache, page);
if (page) {
+ nid = page_to_nid(page);
+ obj = ____cache_alloc_node(cache,
+ gfp_exact_node(flags), nid);
+
/*
- * Insert into the appropriate per node queues
+ * Another processor may allocate the objects in
+ * the slab since we are not holding any locks.
*/
- nid = page_to_nid(page);
- if (cache_grow(cache, flags, nid, page)) {
- obj = ____cache_alloc_node(cache,
- gfp_exact_node(flags), nid);
- if (!obj)
- /*
- * Another processor may allocate the
- * objects in the slab since we are
- * not holding any locks.
- */
- goto retry;
- } else {
- /* cache_grow already freed obj */
- obj = NULL;
- }
+ if (!obj)
+ goto retry;
}
}
@@ -3083,15 +3300,13 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
{
struct page *page;
struct kmem_cache_node *n;
- void *obj;
+ void *obj = NULL;
void *list = NULL;
- int x;
VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES);
n = get_node(cachep, nodeid);
BUG_ON(!n);
-retry:
check_irq_off();
spin_lock(&n->list_lock);
page = get_first_slab(n, false);
@@ -3113,18 +3328,18 @@ retry:
spin_unlock(&n->list_lock);
fixup_objfreelist_debug(cachep, &list);
- goto done;
+ return obj;
must_grow:
spin_unlock(&n->list_lock);
- x = cache_grow(cachep, gfp_exact_node(flags), nodeid, NULL);
- if (x)
- goto retry;
-
- return fallback_alloc(cachep, flags);
+ page = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid);
+ if (page) {
+ /* This slab isn't counted yet so don't update free_objects */
+ obj = slab_get_obj(cachep, page);
+ }
+ cache_grow_end(cachep, page);
-done:
- return obj;
+ return obj ? obj : fallback_alloc(cachep, flags);
}
static __always_inline void *
@@ -3242,6 +3457,9 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
{
int i;
struct kmem_cache_node *n = get_node(cachep, node);
+ struct page *page;
+
+ n->free_objects += nr_objects;
for (i = 0; i < nr_objects; i++) {
void *objp;
@@ -3254,17 +3472,11 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
check_spinlock_acquired_node(cachep, node);
slab_put_obj(cachep, page, objp);
STATS_DEC_ACTIVE(cachep);
- n->free_objects++;
/* fixup slab chains */
- if (page->active == 0) {
- if (n->free_objects > n->free_limit) {
- n->free_objects -= cachep->num;
- list_add_tail(&page->lru, list);
- } else {
- list_add(&page->lru, &n->slabs_free);
- }
- } else {
+ if (page->active == 0)
+ list_add(&page->lru, &n->slabs_free);
+ else {
/* Unconditionally move a slab to the end of the
* partial list on free - maximum time for the
* other objects to be freed, too.
@@ -3272,6 +3484,14 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
list_add_tail(&page->lru, &n->slabs_partial);
}
}
+
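+ /*
+ * Trim excess: completely free slabs above n->free_limit are moved to
+ * @list so the caller can destroy them after dropping the node lock.
+ */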
+ while (n->free_objects > n->free_limit && !list_empty(&n->slabs_free)) {
+ n->free_objects -= cachep->num;
+
+ page = list_last_entry(&n->slabs_free, struct page, lru);
+ list_del(&page->lru);
+ list_add(&page->lru, list);
+ }
}
static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
@@ -3327,9 +3547,17 @@ free_done:
static inline void __cache_free(struct kmem_cache *cachep, void *objp,
unsigned long caller)
{
- struct array_cache *ac = cpu_cache_get(cachep);
+ /* Put the object into the quarantine, don't touch it for now. */
+ if (kasan_slab_free(cachep, objp))
+ return;
+
+ ___cache_free(cachep, objp, caller);
+}
- kasan_slab_free(cachep, objp);
+void ___cache_free(struct kmem_cache *cachep, void *objp,
+ unsigned long caller)
+{
+ struct array_cache *ac = cpu_cache_get(cachep);
check_irq_off();
kmemleak_free_recursive(objp, cachep->flags);
@@ -3645,72 +3873,19 @@ EXPORT_SYMBOL(kfree);
/*
* This initializes kmem_cache_node or resizes various caches for all nodes.
*/
-static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp)
+static int setup_kmem_cache_nodes(struct kmem_cache *cachep, gfp_t gfp)
{
+ int ret;
int node;
struct kmem_cache_node *n;
- struct array_cache *new_shared;
- struct alien_cache **new_alien = NULL;
for_each_online_node(node) {
-
- if (use_alien_caches) {
- new_alien = alloc_alien_cache(node, cachep->limit, gfp);
- if (!new_alien)
- goto fail;
- }
-
- new_shared = NULL;
- if (cachep->shared) {
- new_shared = alloc_arraycache(node,
- cachep->shared*cachep->batchcount,
- 0xbaadf00d, gfp);
- if (!new_shared) {
- free_alien_cache(new_alien);
- goto fail;
- }
- }
-
- n = get_node(cachep, node);
- if (n) {
- struct array_cache *shared = n->shared;
- LIST_HEAD(list);
-
- spin_lock_irq(&n->list_lock);
-
- if (shared)
- free_block(cachep, shared->entry,
- shared->avail, node, &list);
-
- n->shared = new_shared;
- if (!n->alien) {
- n->alien = new_alien;
- new_alien = NULL;
- }
- n->free_limit = (1 + nr_cpus_node(node)) *
- cachep->batchcount + cachep->num;
- spin_unlock_irq(&n->list_lock);
- slabs_destroy(cachep, &list);
- kfree(shared);
- free_alien_cache(new_alien);
- continue;
- }
- n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node);
- if (!n) {
- free_alien_cache(new_alien);
- kfree(new_shared);
+ ret = setup_kmem_cache_node(cachep, node, gfp, true);
+ if (ret)
goto fail;
- }
- kmem_cache_node_init(n);
- n->next_reap = jiffies + REAPTIMEOUT_NODE +
- ((unsigned long)cachep) % REAPTIMEOUT_NODE;
- n->shared = new_shared;
- n->alien = new_alien;
- n->free_limit = (1 + nr_cpus_node(node)) *
- cachep->batchcount + cachep->num;
- cachep->node[node] = n;
}
+
return 0;
fail:
@@ -3752,7 +3927,7 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
cachep->shared = shared;
if (!prev)
- goto alloc_node;
+ goto setup_node;
for_each_online_cpu(cpu) {
LIST_HEAD(list);
@@ -3769,8 +3944,8 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
}
free_percpu(prev);
-alloc_node:
- return alloc_kmem_cache_node(cachep, gfp);
+setup_node:
+ return setup_kmem_cache_nodes(cachep, gfp);
}
static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
@@ -3804,6 +3979,10 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
int shared = 0;
int batchcount = 0;
+ err = cache_random_seq_create(cachep, gfp);
+ if (err)
+ goto end;
+
if (!is_root_cache(cachep)) {
struct kmem_cache *root = memcg_root_cache(cachep);
limit = root->limit;
@@ -3857,6 +4036,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
batchcount = (limit + 1) / 2;
skip_setup:
err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
+end:
if (err)
pr_err("enable_cpucache failed for %s, error %d\n",
cachep->name, -err);
@@ -3869,29 +4049,26 @@ skip_setup:
* if drain_array() is used on the shared array.
*/
static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
- struct array_cache *ac, int force, int node)
+ struct array_cache *ac, int node)
{
LIST_HEAD(list);
- int tofree;
+
+ /* ac from n->shared can be freed if we don't hold the slab_mutex. */
+ check_mutex_acquired();
if (!ac || !ac->avail)
return;
- if (ac->touched && !force) {
+
+ if (ac->touched) {
ac->touched = 0;
- } else {
- spin_lock_irq(&n->list_lock);
- if (ac->avail) {
- tofree = force ? ac->avail : (ac->limit + 4) / 5;
- if (tofree > ac->avail)
- tofree = (ac->avail + 1) / 2;
- free_block(cachep, ac->entry, tofree, node, &list);
- ac->avail -= tofree;
- memmove(ac->entry, &(ac->entry[tofree]),
- sizeof(void *) * ac->avail);
- }
- spin_unlock_irq(&n->list_lock);
- slabs_destroy(cachep, &list);
+ return;
}
+
+ spin_lock_irq(&n->list_lock);
+ drain_array_locked(cachep, ac, node, false, &list);
+ spin_unlock_irq(&n->list_lock);
+
+ slabs_destroy(cachep, &list);
}
/**
@@ -3929,7 +4106,7 @@ static void cache_reap(struct work_struct *w)
reap_alien(searchp, n);
- drain_array(searchp, n, cpu_cache_get(searchp), 0, node);
+ drain_array(searchp, n, cpu_cache_get(searchp), node);
/*
* These are racy checks but it does not matter
@@ -3940,7 +4117,7 @@ static void cache_reap(struct work_struct *w)
n->next_reap = jiffies + REAPTIMEOUT_NODE;
- drain_array(searchp, n, n->shared, 0, node);
+ drain_array(searchp, n, n->shared, node);
if (n->free_touched)
n->free_touched = 0;
@@ -4324,7 +4501,7 @@ size_t ksize(const void *objp)
/* We assume that ksize callers could use the whole allocated area,
* so we need to unpoison this area.
*/
- kasan_krealloc(objp, size, GFP_NOWAIT);
+ kasan_unpoison_shadow(objp, size);
return size;
}
diff --git a/mm/slab.h b/mm/slab.h
index 5969769fbee6..dedb1a920fb8 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -462,4 +462,6 @@ void *slab_next(struct seq_file *m, void *p, loff_t *pos);
void slab_stop(struct seq_file *m, void *p);
int memcg_slab_show(struct seq_file *m, void *p);
+void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr);
+
#endif /* MM_SLAB_H */
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 3239bfd758e6..a65dad7fdcd1 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -715,6 +715,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
get_online_cpus();
get_online_mems();
+ kasan_cache_destroy(s);
mutex_lock(&slab_mutex);
s->refcount--;
@@ -753,6 +754,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
get_online_cpus();
get_online_mems();
+ kasan_cache_shrink(cachep);
ret = __kmem_cache_shrink(cachep, false);
put_online_mems();
put_online_cpus();
diff --git a/mm/slub.c b/mm/slub.c
index 4dbb109eb8cd..538c8584ba8d 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -329,8 +329,8 @@ static inline void set_page_slub_counters(struct page *page, unsigned long count
tmp.counters = counters_new;
/*
* page->counters can cover frozen/inuse/objects as well
- * as page->_count. If we assign to ->counters directly
- * we run the risk of losing updates to page->_count, so
+ * as page->_refcount. If we assign to ->counters directly
+ * we run the risk of losing updates to page->_refcount, so
* be careful and only assign to the fields we need.
*/
page->frozen = tmp.frozen;
@@ -3635,8 +3635,9 @@ size_t ksize(const void *object)
{
size_t size = __ksize(object);
/* We assume that ksize callers could use whole allocated area,
- so we need unpoison this area. */
- kasan_krealloc(object, size, GFP_NOWAIT);
+ * so we need to unpoison this area.
+ */
+ kasan_unpoison_shadow(object, size);
return size;
}
EXPORT_SYMBOL(ksize);
@@ -3697,7 +3698,7 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
* s->cpu_partial is checked locklessly (see put_cpu_partial),
* so we have to make sure the change is visible.
*/
- kick_all_cpus_sync();
+ synchronize_sched();
}
flush_all(s);
diff --git a/mm/swap.c b/mm/swap.c
index 03aacbcb013f..95916142fc46 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -47,6 +47,9 @@ static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
+#ifdef CONFIG_SMP
+static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
+#endif
/*
* This path almost never happens for VM activity - pages are normally
@@ -274,8 +277,6 @@ static void __activate_page(struct page *page, struct lruvec *lruvec,
}
#ifdef CONFIG_SMP
-static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
-
static void activate_page_drain(int cpu)
{
struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 366ce3518703..0d457e7db8d6 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -358,7 +358,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
/* May fail (-ENOMEM) if radix-tree node allocation failed. */
__SetPageLocked(new_page);
- SetPageSwapBacked(new_page);
+ __SetPageSwapBacked(new_page);
err = __add_to_swap_cache(new_page, entry);
if (likely(!err)) {
radix_tree_preload_end();
@@ -370,7 +370,6 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
return new_page;
}
radix_tree_preload_end();
- ClearPageSwapBacked(new_page);
__ClearPageLocked(new_page);
/*
* add_to_swap_cache() doesn't return -EEXIST, so we can safely
diff --git a/mm/util.c b/mm/util.c
index 6cc81e7b8705..6682eb92ccca 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -346,6 +346,29 @@ void *page_rmapping(struct page *page)
return __page_rmapping(page);
}
+/*
+ * Return true if this page is mapped into pagetables.
+ * For a compound page, it returns true if any subpage of the compound page is mapped.
+ */
+bool page_mapped(struct page *page)
+{
+ int i;
+
+ if (likely(!PageCompound(page)))
+ return atomic_read(&page->_mapcount) >= 0;
+ page = compound_head(page);
+ if (atomic_read(compound_mapcount_ptr(page)) >= 0)
+ return true;
+ if (PageHuge(page))
+ return false;
+ for (i = 0; i < hpage_nr_pages(page); i++) {
+ if (atomic_read(&page[i]._mapcount) >= 0)
+ return true;
+ }
+ return false;
+}
+EXPORT_SYMBOL(page_mapped);
+
struct anon_vma *page_anon_vma(struct page *page)
{
unsigned long mapping;
@@ -394,7 +417,13 @@ int __page_mapcount(struct page *page)
}
EXPORT_SYMBOL_GPL(__page_mapcount);
+#if defined(CONFIG_OVERCOMMIT_NEVER)
+int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_NEVER;
+#elif defined(CONFIG_OVERCOMMIT_ALWAYS)
+int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_ALWAYS;
+#else
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
+#endif
int sysctl_overcommit_ratio __read_mostly = 50;
unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 293889d7f482..cf7ad1a53be0 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -275,13 +275,12 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
/*** Global kva allocator ***/
-#define VM_LAZY_FREE 0x01
-#define VM_LAZY_FREEING 0x02
#define VM_VM_AREA 0x04
static DEFINE_SPINLOCK(vmap_area_lock);
/* Export for kexec only */
LIST_HEAD(vmap_area_list);
+static LLIST_HEAD(vmap_purge_list);
static struct rb_root vmap_area_root = RB_ROOT;
/* The vmap cache globals are protected by vmap_area_lock */
@@ -628,7 +627,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
int sync, int force_flush)
{
static DEFINE_SPINLOCK(purge_lock);
- LIST_HEAD(valist);
+ struct llist_node *valist;
struct vmap_area *va;
struct vmap_area *n_va;
int nr = 0;
@@ -647,20 +646,14 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
if (sync)
purge_fragmented_blocks_allcpus();
- rcu_read_lock();
- list_for_each_entry_rcu(va, &vmap_area_list, list) {
- if (va->flags & VM_LAZY_FREE) {
- if (va->va_start < *start)
- *start = va->va_start;
- if (va->va_end > *end)
- *end = va->va_end;
- nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
- list_add_tail(&va->purge_list, &valist);
- va->flags |= VM_LAZY_FREEING;
- va->flags &= ~VM_LAZY_FREE;
- }
+ valist = llist_del_all(&vmap_purge_list);
+ llist_for_each_entry(va, valist, purge_list) {
+ if (va->va_start < *start)
+ *start = va->va_start;
+ if (va->va_end > *end)
+ *end = va->va_end;
+ nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
}
- rcu_read_unlock();
if (nr)
atomic_sub(nr, &vmap_lazy_nr);
@@ -670,7 +663,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
if (nr) {
spin_lock(&vmap_area_lock);
- list_for_each_entry_safe(va, n_va, &valist, purge_list)
+ llist_for_each_entry_safe(va, n_va, valist, purge_list)
__free_vmap_area(va);
spin_unlock(&vmap_area_lock);
}
@@ -705,9 +698,15 @@ static void purge_vmap_area_lazy(void)
*/
static void free_vmap_area_noflush(struct vmap_area *va)
{
- va->flags |= VM_LAZY_FREE;
- atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
- if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages()))
+ int nr_lazy;
+
+ nr_lazy = atomic_add_return((va->va_end - va->va_start) >> PAGE_SHIFT,
+ &vmap_lazy_nr);
+
+ /* After this point, we may free va at any time */
+ llist_add(&va->purge_list, &vmap_purge_list);
+
+ if (unlikely(nr_lazy > lazy_max_pages()))
try_purge_vmap_area_lazy();
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 142cb61f4822..c4a2f4512fca 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -191,7 +191,7 @@ static bool sane_reclaim(struct scan_control *sc)
}
#endif
-static unsigned long zone_reclaimable_pages(struct zone *zone)
+unsigned long zone_reclaimable_pages(struct zone *zone)
{
unsigned long nr;
@@ -633,7 +633,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
*
* Reversing the order of the tests ensures such a situation cannot
* escape unnoticed. The smp_rmb is needed to ensure the page->flags
- * load is not satisfied before that of page->_count.
+ * load is not satisfied before that of page->_refcount.
*
* Note that if SetPageDirty is always performed via set_page_dirty,
* and thus under tree_lock, then this ordering is not required.
@@ -1374,7 +1374,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
!list_empty(src); scan++) {
struct page *page;
- int nr_pages;
page = lru_to_page(src);
prefetchw_prev_lru_page(page, src, flags);
@@ -1383,10 +1382,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
switch (__isolate_lru_page(page, mode)) {
case 0:
- nr_pages = hpage_nr_pages(page);
- mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
+ nr_taken += hpage_nr_pages(page);
list_move(&page->lru, dst);
- nr_taken += nr_pages;
break;
case -EBUSY:
@@ -1602,8 +1599,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
&nr_scanned, sc, isolate_mode, lru);
- __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
+ update_lru_size(lruvec, lru, -nr_taken);
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
+ reclaim_stat->recent_scanned[file] += nr_taken;
if (global_reclaim(sc)) {
__mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
@@ -1624,8 +1622,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
spin_lock_irq(&zone->lru_lock);
- reclaim_stat->recent_scanned[file] += nr_taken;
-
if (global_reclaim(sc)) {
if (current_is_kswapd())
__count_zone_vm_events(PGSTEAL_KSWAPD, zone,
@@ -1720,7 +1716,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
* It is safe to rely on PG_active against the non-LRU pages in here because
* nobody will play with that bit on a non-LRU page.
*
- * The downside is that we have to touch page->_count against each page.
+ * The downside is that we have to touch page->_refcount against each page.
* But we had to alter page->flags anyway.
*/
@@ -1742,7 +1738,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
SetPageLRU(page);
nr_pages = hpage_nr_pages(page);
- mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
+ update_lru_size(lruvec, lru, nr_pages);
list_move(&page->lru, &lruvec->lists[lru]);
pgmoved += nr_pages;
@@ -1760,7 +1756,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
list_add(&page->lru, pages_to_free);
}
}
- __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
+
if (!is_active_lru(lru))
__count_vm_events(PGDEACTIVATE, pgmoved);
}
@@ -1794,14 +1790,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
&nr_scanned, sc, isolate_mode, lru);
- if (global_reclaim(sc))
- __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
+ update_lru_size(lruvec, lru, -nr_taken);
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
reclaim_stat->recent_scanned[file] += nr_taken;
+ if (global_reclaim(sc))
+ __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
__count_zone_vm_events(PGREFILL, zone, nr_scanned);
- __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
- __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
+
spin_unlock_irq(&zone->lru_lock);
while (!list_empty(&l_hold)) {
@@ -1865,83 +1862,63 @@ static void shrink_active_list(unsigned long nr_to_scan,
free_hot_cold_page_list(&l_hold, true);
}
-#ifdef CONFIG_SWAP
-static bool inactive_anon_is_low_global(struct zone *zone)
-{
- unsigned long active, inactive;
-
- active = zone_page_state(zone, NR_ACTIVE_ANON);
- inactive = zone_page_state(zone, NR_INACTIVE_ANON);
-
- return inactive * zone->inactive_ratio < active;
-}
-
-/**
- * inactive_anon_is_low - check if anonymous pages need to be deactivated
- * @lruvec: LRU vector to check
+/*
+ * The inactive anon list should be small enough that the VM never has
+ * to do too much work.
*
- * Returns true if the zone does not have enough inactive anon pages,
- * meaning some active anon pages need to be deactivated.
- */
-static bool inactive_anon_is_low(struct lruvec *lruvec)
-{
- /*
- * If we don't have swap space, anonymous page deactivation
- * is pointless.
- */
- if (!total_swap_pages)
- return false;
-
- if (!mem_cgroup_disabled())
- return mem_cgroup_inactive_anon_is_low(lruvec);
-
- return inactive_anon_is_low_global(lruvec_zone(lruvec));
-}
-#else
-static inline bool inactive_anon_is_low(struct lruvec *lruvec)
-{
- return false;
-}
-#endif
-
-/**
- * inactive_file_is_low - check if file pages need to be deactivated
- * @lruvec: LRU vector to check
+ * The inactive file list should be small enough to leave most memory
+ * to the established workingset on the scan-resistant active list,
+ * but large enough to avoid thrashing the aggregate readahead window.
*
- * When the system is doing streaming IO, memory pressure here
- * ensures that active file pages get deactivated, until more
- * than half of the file pages are on the inactive list.
+ * Both inactive lists should also be large enough that each inactive
+ * page has a chance to be referenced again before it is reclaimed.
*
- * Once we get to that situation, protect the system's working
- * set from being evicted by disabling active file page aging.
+ * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
+ * on this LRU, maintained by the pageout code. A zone->inactive_ratio
+ * of 3 means 3:1 or 25% of the pages are kept on the inactive list.
*
- * This uses a different ratio than the anonymous pages, because
- * the page cache uses a use-once replacement algorithm.
+ * total target max
+ * memory ratio inactive
+ * -------------------------------------
+ * 10MB 1 5MB
+ * 100MB 1 50MB
+ * 1GB 3 250MB
+ * 10GB 10 0.9GB
+ * 100GB 31 3GB
+ * 1TB 101 10GB
+ * 10TB 320 32GB
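+ *
+ * e.g. an LRU pair totalling 10GB gives gb == 10, so inactive_ratio is
+ * int_sqrt(10 * 10) == 10 and roughly 10GB/11 ~= 0.9GB sits on the
+ * inactive list, matching the table above.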
*/
-static bool inactive_file_is_low(struct lruvec *lruvec)
+static bool inactive_list_is_low(struct lruvec *lruvec, bool file)
{
+ unsigned long inactive_ratio;
unsigned long inactive;
unsigned long active;
+ unsigned long gb;
- inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
- active = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
+ /*
+ * If we don't have swap space, anonymous page deactivation
+ * is pointless.
+ */
+ if (!file && !total_swap_pages)
+ return false;
- return active > inactive;
-}
+ inactive = lruvec_lru_size(lruvec, file * LRU_FILE);
+ active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE);
-static bool inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
-{
- if (is_file_lru(lru))
- return inactive_file_is_low(lruvec);
+ gb = (inactive + active) >> (30 - PAGE_SHIFT);
+ if (gb)
+ inactive_ratio = int_sqrt(10 * gb);
else
- return inactive_anon_is_low(lruvec);
+ inactive_ratio = 1;
+
+ return inactive * inactive_ratio < active;
}
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
struct lruvec *lruvec, struct scan_control *sc)
{
if (is_active_lru(lru)) {
- if (inactive_list_is_low(lruvec, lru))
+ if (inactive_list_is_low(lruvec, is_file_lru(lru)))
shrink_active_list(nr_to_scan, lruvec, sc, lru);
return 0;
}
@@ -2062,7 +2039,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
* lruvec even if it has plenty of old anonymous pages unless the
* system is under heavy pressure.
*/
- if (!inactive_file_is_low(lruvec) &&
+ if (!inactive_list_is_low(lruvec, true) &&
lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
scan_balance = SCAN_FILE;
goto out;
@@ -2304,7 +2281,7 @@ static void shrink_zone_memcg(struct zone *zone, struct mem_cgroup *memcg,
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (inactive_anon_is_low(lruvec))
+ if (inactive_list_is_low(lruvec, false))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
@@ -2482,7 +2459,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
* Returns true if compaction should go ahead for a high-order request, or
* the high-order allocation would succeed without compaction.
*/
-static inline bool compaction_ready(struct zone *zone, int order)
+static inline bool compaction_ready(struct zone *zone, int order, int classzone_idx)
{
unsigned long balance_gap, watermark;
bool watermark_ok;
@@ -2496,7 +2473,7 @@ static inline bool compaction_ready(struct zone *zone, int order)
balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
watermark = high_wmark_pages(zone) + balance_gap + (2UL << order);
- watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0);
+ watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, classzone_idx);
/*
* If compaction is deferred, reclaim up to a point where
@@ -2509,7 +2486,7 @@ static inline bool compaction_ready(struct zone *zone, int order)
* If compaction is not ready to start and allocation is not likely
* to succeed without it, then keep reclaiming.
*/
- if (compaction_suitable(zone, order, 0, 0) == COMPACT_SKIPPED)
+ if (compaction_suitable(zone, order, 0, classzone_idx) == COMPACT_SKIPPED)
return false;
return watermark_ok;
@@ -2530,10 +2507,8 @@ static inline bool compaction_ready(struct zone *zone, int order)
*
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
- *
- * Returns true if a zone was reclaimable.
*/
-static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
+static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
{
struct zoneref *z;
struct zone *zone;
@@ -2541,7 +2516,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
unsigned long nr_soft_scanned;
gfp_t orig_mask;
enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
- bool reclaimable = false;
/*
* If the number of buffer_heads in the machine exceeds the maximum
@@ -2589,7 +2563,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
if (IS_ENABLED(CONFIG_COMPACTION) &&
sc->order > PAGE_ALLOC_COSTLY_ORDER &&
zonelist_zone_idx(z) <= requested_highidx &&
- compaction_ready(zone, sc->order)) {
+ compaction_ready(zone, sc->order, requested_highidx)) {
sc->compaction_ready = true;
continue;
}
@@ -2606,17 +2580,10 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
&nr_soft_scanned);
sc->nr_reclaimed += nr_soft_reclaimed;
sc->nr_scanned += nr_soft_scanned;
- if (nr_soft_reclaimed)
- reclaimable = true;
/* need some check for avoid more shrink_zone() */
}
- if (shrink_zone(zone, sc, zone_idx(zone) == classzone_idx))
- reclaimable = true;
-
- if (global_reclaim(sc) &&
- !reclaimable && zone_reclaimable(zone))
- reclaimable = true;
+ shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
}
/*
@@ -2624,8 +2591,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
* promoted it to __GFP_HIGHMEM.
*/
sc->gfp_mask = orig_mask;
-
- return reclaimable;
}
/*
@@ -2650,7 +2615,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
int initial_priority = sc->priority;
unsigned long total_scanned = 0;
unsigned long writeback_threshold;
- bool zones_reclaimable;
retry:
delayacct_freepages_start();
@@ -2661,7 +2625,7 @@ retry:
vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
sc->priority);
sc->nr_scanned = 0;
- zones_reclaimable = shrink_zones(zonelist, sc);
+ shrink_zones(zonelist, sc);
total_scanned += sc->nr_scanned;
if (sc->nr_reclaimed >= sc->nr_to_reclaim)
@@ -2708,10 +2672,6 @@ retry:
goto retry;
}
- /* Any of the zones still reclaimable? Don't OOM. */
- if (zones_reclaimable)
- return 1;
-
return 0;
}
@@ -2965,7 +2925,7 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
do {
struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
- if (inactive_anon_is_low(lruvec))
+ if (inactive_list_is_low(lruvec, false))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 5e4300482897..57a24e919907 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -34,6 +34,18 @@
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);
+unsigned long sum_vm_event(enum vm_event_item item)
+{
+ int cpu;
+ unsigned long ret = 0;
+
+ get_online_cpus();
+ for_each_online_cpu(cpu)
+ ret += per_cpu(vm_event_states, cpu).event[item];
+ put_online_cpus();
+ return ret;
+}
+
static void sum_vm_events(unsigned long *ret)
{
int cpu;
@@ -570,49 +582,18 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
#ifdef CONFIG_NUMA
/*
- * zonelist = the list of zones passed to the allocator
- * z = the zone from which the allocation occurred.
- *
- * Must be called with interrupts disabled.
- *
- * When __GFP_OTHER_NODE is set assume the node of the preferred
- * zone is the local node. This is useful for daemons who allocate
- * memory on behalf of other processes.
- */
-void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags)
-{
- if (z->zone_pgdat == preferred_zone->zone_pgdat) {
- __inc_zone_state(z, NUMA_HIT);
- } else {
- __inc_zone_state(z, NUMA_MISS);
- __inc_zone_state(preferred_zone, NUMA_FOREIGN);
- }
- if (z->node == ((flags & __GFP_OTHER_NODE) ?
- preferred_zone->node : numa_node_id()))
- __inc_zone_state(z, NUMA_LOCAL);
- else
- __inc_zone_state(z, NUMA_OTHER);
-}
-
-/*
* Determine the per node value of a stat item.
*/
unsigned long node_page_state(int node, enum zone_stat_item item)
{
struct zone *zones = NODE_DATA(node)->node_zones;
+ int i;
+ unsigned long count = 0;
- return
-#ifdef CONFIG_ZONE_DMA
- zone_page_state(&zones[ZONE_DMA], item) +
-#endif
-#ifdef CONFIG_ZONE_DMA32
- zone_page_state(&zones[ZONE_DMA32], item) +
-#endif
-#ifdef CONFIG_HIGHMEM
- zone_page_state(&zones[ZONE_HIGHMEM], item) +
-#endif
- zone_page_state(&zones[ZONE_NORMAL], item) +
- zone_page_state(&zones[ZONE_MOVABLE], item);
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ count += zone_page_state(zones + i, item);
+
+ return count;
}
#endif
@@ -1010,6 +991,9 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
if (!memmap_valid_within(pfn, page, zone))
continue;
+ if (page_zone(page) != zone)
+ continue;
+
mtype = get_pageblock_migratetype(page);
if (mtype < MIGRATE_TYPES)
@@ -1069,13 +1053,17 @@ static void pagetypeinfo_showmixedcount_print(struct seq_file *m,
block_end_pfn = min(block_end_pfn, end_pfn);
page = pfn_to_page(pfn);
- pageblock_mt = get_pfnblock_migratetype(page, pfn);
+ pageblock_mt = get_pageblock_migratetype(page);
for (; pfn < block_end_pfn; pfn++) {
if (!pfn_valid_within(pfn))
continue;
page = pfn_to_page(pfn);
+
+ if (page_zone(page) != zone)
+ continue;
+
if (PageBuddy(page)) {
pfn += (1UL << page_order(page)) - 1;
continue;
@@ -1376,7 +1364,64 @@ static const struct file_operations proc_vmstat_file_operations = {
static struct workqueue_struct *vmstat_wq;
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
int sysctl_stat_interval __read_mostly = HZ;
-static cpumask_var_t cpu_stat_off;
+
+static void refresh_vm_stats(struct work_struct *work)
+{
+ refresh_cpu_vm_stats(true);
+}
+
+int vmstat_refresh(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ long val;
+ int err;
+ int i;
+
+ /*
+ * The regular update, every sysctl_stat_interval, may come later
+ * than expected: leaving a significant amount in per_cpu buckets.
+ * This is particularly misleading when checking a quantity of HUGE
+ * pages, immediately after running a test. /proc/sys/vm/stat_refresh,
+ * which can equally be echo'ed to or cat'ted from (by root),
+ * can be used to update the stats just before reading them.
+ *
+ * Oh, and since global_page_state() etc. are so careful to hide
+ * transiently negative values, report an error here if any of
+ * the stats is negative, so we know to go looking for imbalance.
+ */
+ err = schedule_on_each_cpu(refresh_vm_stats);
+ if (err)
+ return err;
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
+ val = atomic_long_read(&vm_stat[i]);
+ if (val < 0) {
+ switch (i) {
+ case NR_ALLOC_BATCH:
+ case NR_PAGES_SCANNED:
+ /*
+ * These are often seen to go negative in
+ * recent kernels, but not to go permanently
+ * negative. Whilst it would be nicer not to
+ * have exceptions, rooting them out would be
+ * another task, of rather low priority.
+ */
+ break;
+ default:
+ pr_warn("%s: %s %ld\n",
+ __func__, vmstat_text[i], val);
+ err = -EINVAL;
+ break;
+ }
+ }
+ }
+ if (err)
+ return err;
+ if (write)
+ *ppos += *lenp;
+ else
+ *lenp = 0;
+ return 0;
+}
static void vmstat_update(struct work_struct *w)
{
@@ -1385,24 +1430,10 @@ static void vmstat_update(struct work_struct *w)
* Counters were updated so we expect more updates
* to occur in the future. Keep on running the
* update worker thread.
- * If we were marked on cpu_stat_off clear the flag
- * so that vmstat_shepherd doesn't schedule us again.
*/
- if (!cpumask_test_and_clear_cpu(smp_processor_id(),
- cpu_stat_off)) {
- queue_delayed_work_on(smp_processor_id(), vmstat_wq,
+ queue_delayed_work_on(smp_processor_id(), vmstat_wq,
this_cpu_ptr(&vmstat_work),
round_jiffies_relative(sysctl_stat_interval));
- }
- } else {
- /*
- * We did not update any counters so the app may be in
- * a mode where it does not cause counter updates.
- * We may be uselessly running vmstat_update.
- * Defer the checking for differentials to the
- * shepherd thread on a different processor.
- */
- cpumask_set_cpu(smp_processor_id(), cpu_stat_off);
}
}
@@ -1434,16 +1465,17 @@ static bool need_update(int cpu)
return false;
}
+/*
+ * Switch off vmstat processing and then fold all the remaining differentials
+ * until the diffs stay at zero. The function is used by NOHZ and can only be
+ * invoked when tick processing is not active.
+ */
void quiet_vmstat(void)
{
if (system_state != SYSTEM_RUNNING)
return;
- /*
- * If we are already in hands of the shepherd then there
- * is nothing for us to do here.
- */
- if (cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off))
+ if (!delayed_work_pending(this_cpu_ptr(&vmstat_work)))
return;
if (!need_update(smp_processor_id()))
@@ -1458,7 +1490,6 @@ void quiet_vmstat(void)
refresh_cpu_vm_stats(false);
}
-
/*
* Shepherd worker thread that checks the
* differentials of processors that have their worker
@@ -1475,20 +1506,11 @@ static void vmstat_shepherd(struct work_struct *w)
get_online_cpus();
/* Check processors whose vmstat worker threads have been disabled */
- for_each_cpu(cpu, cpu_stat_off) {
+ for_each_online_cpu(cpu) {
struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
- if (need_update(cpu)) {
- if (cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
+ if (!delayed_work_pending(dw) && need_update(cpu))
queue_delayed_work_on(cpu, vmstat_wq, dw, 0);
- } else {
- /*
- * Cancel the work if quiet_vmstat has put this
- * cpu on cpu_stat_off because the work item might
- * be still scheduled
- */
- cancel_delayed_work(dw);
- }
}
put_online_cpus();
@@ -1504,10 +1526,6 @@ static void __init start_shepherd_timer(void)
INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
vmstat_update);
- if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL))
- BUG();
- cpumask_copy(cpu_stat_off, cpu_online_mask);
-
vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
schedule_delayed_work(&shepherd,
round_jiffies_relative(sysctl_stat_interval));
@@ -1542,16 +1560,13 @@ static int vmstat_cpuup_callback(struct notifier_block *nfb,
case CPU_ONLINE_FROZEN:
refresh_zone_stat_thresholds();
node_set_state(cpu_to_node(cpu), N_CPU);
- cpumask_set_cpu(cpu, cpu_stat_off);
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
- cpumask_clear_cpu(cpu, cpu_stat_off);
break;
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
- cpumask_set_cpu(cpu, cpu_stat_off);
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
diff --git a/mm/z3fold.c b/mm/z3fold.c
new file mode 100644
index 000000000000..34917d55d311
--- /dev/null
+++ b/mm/z3fold.c
@@ -0,0 +1,792 @@
+/*
+ * z3fold.c
+ *
+ * Author: Vitaly Wool <vitaly.wool@konsulko.com>
+ * Copyright (C) 2016, Sony Mobile Communications Inc.
+ *
+ * This implementation is based on zbud written by Seth Jennings.
+ *
+ * z3fold is a special purpose allocator for storing compressed pages. It
+ * can store up to three compressed pages per page which improves the
+ * compression ratio of zbud while retaining its main concepts (e.g. always
+ * storing an integral number of objects per page) and simplicity.
+ * It still has simple and deterministic reclaim properties that make it
+ * preferable to a higher density approach (with no requirement on integral
+ * number of objects per page) when reclaim is used.
+ *
+ * As in zbud, pages are divided into "chunks". The size of the chunks is
+ * fixed at compile time and is determined by NCHUNKS_ORDER below.
+ *
+ * z3fold doesn't export any API and is meant to be used via zpool API.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/atomic.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/preempt.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/zpool.h>
+
+/*****************
+ * Structures
+*****************/
+/*
+ * NCHUNKS_ORDER determines the internal allocation granularity, effectively
+ * adjusting internal fragmentation. It also determines the number of
+ * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
+ * allocation granularity will be in chunks of size PAGE_SIZE/64. As one chunk
+ * in each allocated page is occupied by the z3fold header, NCHUNKS works out
+ * to 63, which is the maximum number of free chunks in a z3fold page; there
+ * will also be 63 freelists per pool.
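+ *
+ * e.g. with 4K pages: CHUNK_SHIFT is 6, CHUNK_SIZE is 64 bytes, the header
+ * occupies one 64-byte chunk and NCHUNKS is (4096 - 64) >> 6 == 63.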
+ */
+#define NCHUNKS_ORDER 6
+
+#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER)
+#define CHUNK_SIZE (1 << CHUNK_SHIFT)
+#define ZHDR_SIZE_ALIGNED CHUNK_SIZE
+#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
+
+#define BUDDY_MASK ((1 << NCHUNKS_ORDER) - 1)
+
+struct z3fold_pool;
+struct z3fold_ops {
+ int (*evict)(struct z3fold_pool *pool, unsigned long handle);
+};
+
+/**
+ * struct z3fold_pool - stores metadata for each z3fold pool
+ * @lock: protects all pool fields and first|last_chunk fields of any
+ * z3fold page in the pool
+ * @unbuddied: array of lists tracking z3fold pages that contain at most
+ * two buddies; the list a z3fold page is added to depends on
+ * the size of its free region.
+ * @buddied: list tracking the z3fold pages that contain 3 buddies;
+ * these z3fold pages are full
+ * @lru: list tracking the z3fold pages in LRU order by most recently
+ * added buddy.
+ * @pages_nr: number of z3fold pages in the pool.
+ * @ops: pointer to a structure of user defined operations specified at
+ * pool creation time.
+ *
+ * This structure is allocated at pool creation time and maintains metadata
+ * pertaining to a particular z3fold pool.
+ */
+struct z3fold_pool {
+ spinlock_t lock;
+ struct list_head unbuddied[NCHUNKS];
+ struct list_head buddied;
+ struct list_head lru;
+ u64 pages_nr;
+ const struct z3fold_ops *ops;
+ struct zpool *zpool;
+ const struct zpool_ops *zpool_ops;
+};
+
+enum buddy {
+ HEADLESS = 0,
+ FIRST,
+ MIDDLE,
+ LAST,
+ BUDDIES_MAX
+};
+
+/*
+ * struct z3fold_header - z3fold page metadata occupying the first chunk of each
+ * z3fold page, except for HEADLESS pages
+ * @buddy: links the z3fold page into the relevant list in the pool
+ * @first_chunks: the size of the first buddy in chunks, 0 if free
+ * @middle_chunks: the size of the middle buddy in chunks, 0 if free
+ * @last_chunks: the size of the last buddy in chunks, 0 if free
+ * @first_num: the starting number (for the first handle)
+ */
+struct z3fold_header {
+ struct list_head buddy;
+ unsigned short first_chunks;
+ unsigned short middle_chunks;
+ unsigned short last_chunks;
+ unsigned short start_middle;
+ unsigned short first_num:NCHUNKS_ORDER;
+};
+
+/*
+ * Internal z3fold page flags
+ */
+enum z3fold_page_flags {
+ UNDER_RECLAIM = 0,
+ PAGE_HEADLESS,
+ MIDDLE_CHUNK_MAPPED,
+};
+
+/*****************
+ * Helpers
+*****************/
+
+/* Converts an allocation size in bytes to size in z3fold chunks */
+static int size_to_chunks(size_t size)
+{
+ return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
+}
+
+#define for_each_unbuddied_list(_iter, _begin) \
+ for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
+
+/* Initializes the z3fold header of a newly allocated z3fold page */
+static struct z3fold_header *init_z3fold_page(struct page *page)
+{
+ struct z3fold_header *zhdr = page_address(page);
+
+ INIT_LIST_HEAD(&page->lru);
+ clear_bit(UNDER_RECLAIM, &page->private);
+ clear_bit(PAGE_HEADLESS, &page->private);
+ clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
+
+ zhdr->first_chunks = 0;
+ zhdr->middle_chunks = 0;
+ zhdr->last_chunks = 0;
+ zhdr->first_num = 0;
+ zhdr->start_middle = 0;
+ INIT_LIST_HEAD(&zhdr->buddy);
+ return zhdr;
+}
+
+/* Resets the struct page fields and frees the page */
+static void free_z3fold_page(struct z3fold_header *zhdr)
+{
+ __free_page(virt_to_page(zhdr));
+}
+
+/*
+ * Encodes the handle of a particular buddy within a z3fold page
+ * Pool lock should be held as this function accesses first_num
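+ *
+ * e.g. with first_num == 0, the LAST buddy of a (page-aligned) z3fold page is
+ * encoded as the zhdr address plus 3; handle_to_z3fold_header() recovers zhdr
+ * by masking with PAGE_MASK and handle_to_buddy() recovers 3, i.e. LAST.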
+ */
+static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
+{
+ unsigned long handle;
+
+ handle = (unsigned long)zhdr;
+ if (bud != HEADLESS)
+ handle += (bud + zhdr->first_num) & BUDDY_MASK;
+ return handle;
+}
+
+/* Returns the z3fold page where a given handle is stored */
+static struct z3fold_header *handle_to_z3fold_header(unsigned long handle)
+{
+ return (struct z3fold_header *)(handle & PAGE_MASK);
+}
+
+/* Returns buddy number */
+static enum buddy handle_to_buddy(unsigned long handle)
+{
+ struct z3fold_header *zhdr = handle_to_z3fold_header(handle);
+ return (handle - zhdr->first_num) & BUDDY_MASK;
+}
+
+/*
+ * Returns the number of free chunks in a z3fold page.
+ * NB: can't be used with HEADLESS pages.
+ */
+static int num_free_chunks(struct z3fold_header *zhdr)
+{
+ int nfree;
+ /*
+ * If there is a middle object, pick up the bigger free space
+ * either before or after it. Otherwise just subtract the number
+ * of chunks occupied by the first and the last objects.
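+ *
+ * e.g. a page holding only a middle buddy of 10 chunks at start_middle == 20
+ * has 19 free chunks before it and 63 - 20 - 10 == 33 after it, so 33 is
+ * returned.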
+ */
+ if (zhdr->middle_chunks != 0) {
+ int nfree_before = zhdr->first_chunks ?
+ 0 : zhdr->start_middle - 1;
+ int nfree_after = zhdr->last_chunks ?
+ 0 : NCHUNKS - zhdr->start_middle - zhdr->middle_chunks;
+ nfree = max(nfree_before, nfree_after);
+ } else
+ nfree = NCHUNKS - zhdr->first_chunks - zhdr->last_chunks;
+ return nfree;
+}
+
+/*****************
+ * API Functions
+*****************/
+/**
+ * z3fold_create_pool() - create a new z3fold pool
+ * @gfp: gfp flags when allocating the z3fold pool structure
+ * @ops: user-defined operations for the z3fold pool
+ *
+ * Return: pointer to the new z3fold pool or NULL if the metadata allocation
+ * failed.
+ */
+static struct z3fold_pool *z3fold_create_pool(gfp_t gfp,
+ const struct z3fold_ops *ops)
+{
+ struct z3fold_pool *pool;
+ int i;
+
+ pool = kzalloc(sizeof(struct z3fold_pool), gfp);
+ if (!pool)
+ return NULL;
+ spin_lock_init(&pool->lock);
+ for_each_unbuddied_list(i, 0)
+ INIT_LIST_HEAD(&pool->unbuddied[i]);
+ INIT_LIST_HEAD(&pool->buddied);
+ INIT_LIST_HEAD(&pool->lru);
+ pool->pages_nr = 0;
+ pool->ops = ops;
+ return pool;
+}
+
+/**
+ * z3fold_destroy_pool() - destroys an existing z3fold pool
+ * @pool: the z3fold pool to be destroyed
+ *
+ * The pool should be emptied before this function is called.
+ */
+static void z3fold_destroy_pool(struct z3fold_pool *pool)
+{
+ kfree(pool);
+}
+
+/* Has to be called with lock held */
+static int z3fold_compact_page(struct z3fold_header *zhdr)
+{
+ struct page *page = virt_to_page(zhdr);
+ void *beg = zhdr;
+
+ if (!test_bit(MIDDLE_CHUNK_MAPPED, &page->private) &&
+ zhdr->middle_chunks != 0 &&
+ zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
+ memmove(beg + ZHDR_SIZE_ALIGNED,
+ beg + (zhdr->start_middle << CHUNK_SHIFT),
+ zhdr->middle_chunks << CHUNK_SHIFT);
+ zhdr->first_chunks = zhdr->middle_chunks;
+ zhdr->middle_chunks = 0;
+ zhdr->start_middle = 0;
+ zhdr->first_num++;
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * z3fold_alloc() - allocates a region of a given size
+ * @pool: z3fold pool from which to allocate
+ * @size: size in bytes of the desired allocation
+ * @gfp: gfp flags used if the pool needs to grow
+ * @handle: handle of the new allocation
+ *
+ * This function will attempt to find a free region in the pool large enough to
+ * satisfy the allocation request. A search of the unbuddied lists is
+ * performed first. If no suitable free region is found, then a new page is
+ * allocated and added to the pool to satisfy the request.
+ *
+ * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
+ * as z3fold pool pages.
+ *
+ * Return: 0 if success and handle is set, otherwise -EINVAL if the size or
+ * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
+ * a new page.
+ */
+static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
+ unsigned long *handle)
+{
+ int chunks = 0, i, freechunks;
+ struct z3fold_header *zhdr = NULL;
+ enum buddy bud;
+ struct page *page;
+
+ if (!size || (gfp & __GFP_HIGHMEM))
+ return -EINVAL;
+
+ if (size > PAGE_SIZE)
+ return -ENOSPC;
+
+ if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
+ bud = HEADLESS;
+ else {
+ chunks = size_to_chunks(size);
+ spin_lock(&pool->lock);
+
+ /* First, try to find an unbuddied z3fold page. */
+ zhdr = NULL;
+ for_each_unbuddied_list(i, chunks) {
+ if (!list_empty(&pool->unbuddied[i])) {
+ zhdr = list_first_entry(&pool->unbuddied[i],
+ struct z3fold_header, buddy);
+ page = virt_to_page(zhdr);
+ if (zhdr->first_chunks == 0) {
+ if (zhdr->middle_chunks != 0 &&
+ chunks >= zhdr->start_middle)
+ bud = LAST;
+ else
+ bud = FIRST;
+ } else if (zhdr->last_chunks == 0)
+ bud = LAST;
+ else if (zhdr->middle_chunks == 0)
+ bud = MIDDLE;
+ else {
+ pr_err("No free chunks in unbuddied\n");
+ WARN_ON(1);
+ continue;
+ }
+ list_del(&zhdr->buddy);
+ goto found;
+ }
+ }
+ bud = FIRST;
+ spin_unlock(&pool->lock);
+ }
+
+ /* Couldn't find unbuddied z3fold page, create new one */
+ page = alloc_page(gfp);
+ if (!page)
+ return -ENOMEM;
+ spin_lock(&pool->lock);
+ pool->pages_nr++;
+ zhdr = init_z3fold_page(page);
+
+ if (bud == HEADLESS) {
+ set_bit(PAGE_HEADLESS, &page->private);
+ goto headless;
+ }
+
+found:
+ if (bud == FIRST)
+ zhdr->first_chunks = chunks;
+ else if (bud == LAST)
+ zhdr->last_chunks = chunks;
+ else {
+ zhdr->middle_chunks = chunks;
+ zhdr->start_middle = zhdr->first_chunks + 1;
+ }
+
+ if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
+ zhdr->middle_chunks == 0) {
+ /* Add to unbuddied list */
+ freechunks = num_free_chunks(zhdr);
+ list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
+ } else {
+ /* Add to buddied list */
+ list_add(&zhdr->buddy, &pool->buddied);
+ }
+
+headless:
+ /* Add/move z3fold page to beginning of LRU */
+ if (!list_empty(&page->lru))
+ list_del(&page->lru);
+
+ list_add(&page->lru, &pool->lru);
+
+ *handle = encode_handle(zhdr, bud);
+ spin_unlock(&pool->lock);
+
+ return 0;
+}
+
+/**
+ * z3fold_free() - frees the allocation associated with the given handle
+ * @pool: pool in which the allocation resided
+ * @handle: handle associated with the allocation returned by z3fold_alloc()
+ *
+ * In the case that the z3fold page in which the allocation resides is under
+ * reclaim, as indicated by the UNDER_RECLAIM flag being set, this function
+ * only sets the first|last_chunks to 0. The page is actually freed
+ * once both buddies are evicted (see z3fold_reclaim_page() below).
+ */
+static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
+{
+ struct z3fold_header *zhdr;
+ int freechunks;
+ struct page *page;
+ enum buddy bud;
+
+ spin_lock(&pool->lock);
+ zhdr = handle_to_z3fold_header(handle);
+ page = virt_to_page(zhdr);
+
+ if (test_bit(PAGE_HEADLESS, &page->private)) {
+ /* HEADLESS page stored */
+ bud = HEADLESS;
+ } else {
+ bud = (handle - zhdr->first_num) & BUDDY_MASK;
+
+ switch (bud) {
+ case FIRST:
+ zhdr->first_chunks = 0;
+ break;
+ case MIDDLE:
+ zhdr->middle_chunks = 0;
+ zhdr->start_middle = 0;
+ break;
+ case LAST:
+ zhdr->last_chunks = 0;
+ break;
+ default:
+ pr_err("%s: unknown bud %d\n", __func__, bud);
+ WARN_ON(1);
+ spin_unlock(&pool->lock);
+ return;
+ }
+ }
+
+ if (test_bit(UNDER_RECLAIM, &page->private)) {
+ /* z3fold page is under reclaim, reclaim will free */
+ spin_unlock(&pool->lock);
+ return;
+ }
+
+ if (bud != HEADLESS) {
+ /* Remove from existing buddy list */
+ list_del(&zhdr->buddy);
+ }
+
+ if (bud == HEADLESS ||
+ (zhdr->first_chunks == 0 && zhdr->middle_chunks == 0 &&
+ zhdr->last_chunks == 0)) {
+ /* z3fold page is empty, free */
+ list_del(&page->lru);
+ clear_bit(PAGE_HEADLESS, &page->private);
+ free_z3fold_page(zhdr);
+ pool->pages_nr--;
+ } else {
+ z3fold_compact_page(zhdr);
+ /* Add to the unbuddied list */
+ freechunks = num_free_chunks(zhdr);
+ list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
+ }
+
+ spin_unlock(&pool->lock);
+}
+
+/**
+ * z3fold_reclaim_page() - evicts allocations from a pool page and frees it
+ * @pool: pool from which a page will attempt to be evicted
+ * @retries: number of pages on the LRU list for which eviction will
+ * be attempted before failing
+ *
+ * z3fold reclaim is different from normal system reclaim in that it is done
+ * from the bottom, up. This is because only the bottom layer, z3fold, has
+ * information on how the allocations are organized within each z3fold page.
+ * This has the potential to create interesting locking situations between
+ * z3fold and the user, however.
+ *
+ * To avoid these, this is how z3fold_reclaim_page() should be called:
+ *
+ * The user detects a page should be reclaimed and calls z3fold_reclaim_page().
+ * z3fold_reclaim_page() will remove a z3fold page from the pool LRU list and
+ * call the user-defined eviction handler with the pool and handle as
+ * arguments.
+ *
+ * If the handle cannot be evicted, the eviction handler should return
+ * non-zero. z3fold_reclaim_page() will add the z3fold page back to the
+ * appropriate list and try the next z3fold page on the LRU, up to
+ * a user-defined number of retries.
+ *
+ * If the handle is successfully evicted, the eviction handler should
+ * return 0 _and_ should have called z3fold_free() on the handle. z3fold_free()
+ * contains logic to delay freeing the page if the page is under reclaim,
+ * as indicated by the UNDER_RECLAIM bit being set on the underlying page.
+ *
+ * If all buddies in the z3fold page are successfully evicted, then the
+ * z3fold page can be freed.
+ *
+ * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are
+ * no pages to evict or an eviction handler is not registered, or -EAGAIN
+ * if the retry limit was hit.
+ */
+static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
+{
+ int i, ret = 0, freechunks;
+ struct z3fold_header *zhdr;
+ struct page *page;
+ unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;
+
+ spin_lock(&pool->lock);
+ if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
+ retries == 0) {
+ spin_unlock(&pool->lock);
+ return -EINVAL;
+ }
+ for (i = 0; i < retries; i++) {
+ page = list_last_entry(&pool->lru, struct page, lru);
+ list_del(&page->lru);
+
+ /* Protect z3fold page against free */
+ set_bit(UNDER_RECLAIM, &page->private);
+ zhdr = page_address(page);
+ if (!test_bit(PAGE_HEADLESS, &page->private)) {
+ list_del(&zhdr->buddy);
+ /*
+ * We need to encode the handles before unlocking, since
+ * we can race with a free that will set
+ * (first|last)_chunks to 0.
+ */
+ first_handle = 0;
+ last_handle = 0;
+ middle_handle = 0;
+ if (zhdr->first_chunks)
+ first_handle = encode_handle(zhdr, FIRST);
+ if (zhdr->middle_chunks)
+ middle_handle = encode_handle(zhdr, MIDDLE);
+ if (zhdr->last_chunks)
+ last_handle = encode_handle(zhdr, LAST);
+ } else {
+ first_handle = encode_handle(zhdr, HEADLESS);
+ last_handle = middle_handle = 0;
+ }
+
+ spin_unlock(&pool->lock);
+
+ /* Issue the eviction callback(s) */
+ if (middle_handle) {
+ ret = pool->ops->evict(pool, middle_handle);
+ if (ret)
+ goto next;
+ }
+ if (first_handle) {
+ ret = pool->ops->evict(pool, first_handle);
+ if (ret)
+ goto next;
+ }
+ if (last_handle) {
+ ret = pool->ops->evict(pool, last_handle);
+ if (ret)
+ goto next;
+ }
+next:
+ spin_lock(&pool->lock);
+ clear_bit(UNDER_RECLAIM, &page->private);
+ if ((test_bit(PAGE_HEADLESS, &page->private) && ret == 0) ||
+ (zhdr->first_chunks == 0 && zhdr->last_chunks == 0 &&
+ zhdr->middle_chunks == 0)) {
+ /*
+ * All buddies are now free, free the z3fold page and
+ * return success.
+ */
+ clear_bit(PAGE_HEADLESS, &page->private);
+ free_z3fold_page(zhdr);
+ pool->pages_nr--;
+ spin_unlock(&pool->lock);
+ return 0;
+ } else if (zhdr->first_chunks != 0 &&
+ zhdr->last_chunks != 0 && zhdr->middle_chunks != 0) {
+ /* Full, add to buddied list */
+ list_add(&zhdr->buddy, &pool->buddied);
+ } else if (!test_bit(PAGE_HEADLESS, &page->private)) {
+ z3fold_compact_page(zhdr);
+ /* add to unbuddied list */
+ freechunks = num_free_chunks(zhdr);
+ list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
+ }
+
+ /* add to beginning of LRU */
+ list_add(&page->lru, &pool->lru);
+ }
+ spin_unlock(&pool->lock);
+ return -EAGAIN;
+}
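
The calling contract documented above is easiest to see from the handler's side. Below is a minimal sketch of an eviction callback, where my_write_back() stands in for a hypothetical backing store: on success the handler must free the handle and return 0, while a non-zero return makes z3fold_reclaim_page() re-list the page and try the next one.

/* Hypothetical eviction handler following the contract above. */
static int my_evict(struct z3fold_pool *pool, unsigned long handle)
{
	void *data = z3fold_map(pool, handle);
	bool written = my_write_back(data);	/* hypothetical backing store */

	z3fold_unmap(pool, handle);
	if (!written)
		return -EAGAIN;	/* can't evict now; reclaim tries the next page */

	/* z3fold_free() defers the actual page free while under reclaim. */
	z3fold_free(pool, handle);
	return 0;
}
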
+
+/**
+ * z3fold_map() - maps the allocation associated with the given handle
+ * @pool: pool in which the allocation resides
+ * @handle: handle associated with the allocation to be mapped
+ *
+ * Extracts the buddy number from handle and constructs the pointer to the
+ * correct starting chunk within the page.
+ *
+ * Returns: a pointer to the mapped allocation
+ */
+static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
+{
+ struct z3fold_header *zhdr;
+ struct page *page;
+ void *addr;
+ enum buddy buddy;
+
+ spin_lock(&pool->lock);
+ zhdr = handle_to_z3fold_header(handle);
+ addr = zhdr;
+ page = virt_to_page(zhdr);
+
+ if (test_bit(PAGE_HEADLESS, &page->private))
+ goto out;
+
+ buddy = handle_to_buddy(handle);
+ switch (buddy) {
+ case FIRST:
+ addr += ZHDR_SIZE_ALIGNED;
+ break;
+ case MIDDLE:
+ addr += zhdr->start_middle << CHUNK_SHIFT;
+ set_bit(MIDDLE_CHUNK_MAPPED, &page->private);
+ break;
+ case LAST:
+ addr += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);
+ break;
+ default:
+ pr_err("unknown buddy id %d\n", buddy);
+ WARN_ON(1);
+ addr = NULL;
+ break;
+ }
+out:
+ spin_unlock(&pool->lock);
+ return addr;
+}
+
+/**
+ * z3fold_unmap() - unmaps the allocation associated with the given handle
+ * @pool: pool in which the allocation resides
+ * @handle: handle associated with the allocation to be unmapped
+ */
+static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle)
+{
+ struct z3fold_header *zhdr;
+ struct page *page;
+ enum buddy buddy;
+
+ spin_lock(&pool->lock);
+ zhdr = handle_to_z3fold_header(handle);
+ page = virt_to_page(zhdr);
+
+ if (test_bit(PAGE_HEADLESS, &page->private)) {
+ spin_unlock(&pool->lock);
+ return;
+ }
+
+ buddy = handle_to_buddy(handle);
+ if (buddy == MIDDLE)
+ clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
+ spin_unlock(&pool->lock);
+}
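
Taken together with z3fold_alloc(), the map/unmap pair above gives the usual store path. A minimal sketch, using only the internal API shown in this file (store_buffer() itself is a made-up caller):

/* Illustrative store path: allocate, map, copy, unmap. */
static int store_buffer(struct z3fold_pool *pool, const void *src,
			size_t len, unsigned long *handle)
{
	void *dst;
	int err = z3fold_alloc(pool, len, GFP_KERNEL, handle);

	if (err)
		return err;

	dst = z3fold_map(pool, *handle);
	memcpy(dst, src, len);
	z3fold_unmap(pool, *handle);
	return 0;
}
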
+
+/**
+ * z3fold_get_pool_size() - gets the z3fold pool size in pages
+ * @pool: pool whose size is being queried
+ *
+ * Returns: size in pages of the given pool. The pool lock need not be
+ * taken to access pages_nr.
+ */
+static u64 z3fold_get_pool_size(struct z3fold_pool *pool)
+{
+ return pool->pages_nr;
+}
+
+/*****************
+ * zpool
+ ****************/
+
+static int z3fold_zpool_evict(struct z3fold_pool *pool, unsigned long handle)
+{
+ if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict)
+ return pool->zpool_ops->evict(pool->zpool, handle);
+ else
+ return -ENOENT;
+}
+
+static const struct z3fold_ops z3fold_zpool_ops = {
+ .evict = z3fold_zpool_evict
+};
+
+static void *z3fold_zpool_create(const char *name, gfp_t gfp,
+ const struct zpool_ops *zpool_ops,
+ struct zpool *zpool)
+{
+ struct z3fold_pool *pool;
+
+ pool = z3fold_create_pool(gfp, zpool_ops ? &z3fold_zpool_ops : NULL);
+ if (pool) {
+ pool->zpool = zpool;
+ pool->zpool_ops = zpool_ops;
+ }
+ return pool;
+}
+
+static void z3fold_zpool_destroy(void *pool)
+{
+ z3fold_destroy_pool(pool);
+}
+
+static int z3fold_zpool_malloc(void *pool, size_t size, gfp_t gfp,
+ unsigned long *handle)
+{
+ return z3fold_alloc(pool, size, gfp, handle);
+}
+static void z3fold_zpool_free(void *pool, unsigned long handle)
+{
+ z3fold_free(pool, handle);
+}
+
+static int z3fold_zpool_shrink(void *pool, unsigned int pages,
+ unsigned int *reclaimed)
+{
+ unsigned int total = 0;
+ int ret = -EINVAL;
+
+ while (total < pages) {
+ ret = z3fold_reclaim_page(pool, 8);
+ if (ret < 0)
+ break;
+ total++;
+ }
+
+ if (reclaimed)
+ *reclaimed = total;
+
+ return ret;
+}
+
+static void *z3fold_zpool_map(void *pool, unsigned long handle,
+ enum zpool_mapmode mm)
+{
+ return z3fold_map(pool, handle);
+}
+static void z3fold_zpool_unmap(void *pool, unsigned long handle)
+{
+ z3fold_unmap(pool, handle);
+}
+
+static u64 z3fold_zpool_total_size(void *pool)
+{
+ return z3fold_get_pool_size(pool) * PAGE_SIZE;
+}
+
+static struct zpool_driver z3fold_zpool_driver = {
+ .type = "z3fold",
+ .owner = THIS_MODULE,
+ .create = z3fold_zpool_create,
+ .destroy = z3fold_zpool_destroy,
+ .malloc = z3fold_zpool_malloc,
+ .free = z3fold_zpool_free,
+ .shrink = z3fold_zpool_shrink,
+ .map = z3fold_zpool_map,
+ .unmap = z3fold_zpool_unmap,
+ .total_size = z3fold_zpool_total_size,
+};
+
+MODULE_ALIAS("zpool-z3fold");
+
+static int __init init_z3fold(void)
+{
+ /* Make sure the z3fold header will fit in one chunk */
+ BUILD_BUG_ON(sizeof(struct z3fold_header) > ZHDR_SIZE_ALIGNED);
+ zpool_register_driver(&z3fold_zpool_driver);
+
+ return 0;
+}
+
+static void __exit exit_z3fold(void)
+{
+ zpool_unregister_driver(&z3fold_zpool_driver);
+}
+
+module_init(init_z3fold);
+module_exit(exit_z3fold);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Vitaly Wool <vitalywool@gmail.com>");
+MODULE_DESCRIPTION("3-Fold Allocator for Compressed Pages");
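
With z3fold_zpool_driver registered, users reach z3fold through the generic zpool layer rather than the static functions above. A rough sketch of a zpool consumer selecting the "z3fold" backend; the zpool_* signatures are assumed from this kernel's include/linux/zpool.h and may differ in other releases:

/* Sketch of a zpool user picking the z3fold backend. */
static int zpool_z3fold_demo(const void *src, size_t len)
{
	struct zpool *zp = zpool_create_pool("z3fold", "demo", GFP_KERNEL, NULL);
	unsigned long handle;
	void *dst;
	int err;

	if (!zp)
		return -ENOMEM;

	err = zpool_malloc(zp, len, GFP_KERNEL, &handle);
	if (!err) {
		dst = zpool_map_handle(zp, handle, ZPOOL_MM_WO);
		memcpy(dst, src, len);
		zpool_unmap_handle(zp, handle);
		zpool_free(zp, handle);
	}

	zpool_destroy_pool(zp);
	return err;
}
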
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index fe47fbba995a..72698db958e7 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -247,7 +247,6 @@ struct zs_pool {
struct size_class **size_class;
struct kmem_cache *handle_cachep;
- gfp_t flags; /* allocation flags used when growing pool */
atomic_long_t pages_allocated;
struct zs_pool_stats stats;
@@ -295,10 +294,10 @@ static void destroy_handle_cache(struct zs_pool *pool)
kmem_cache_destroy(pool->handle_cachep);
}
-static unsigned long alloc_handle(struct zs_pool *pool)
+static unsigned long alloc_handle(struct zs_pool *pool, gfp_t gfp)
{
return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
- pool->flags & ~__GFP_HIGHMEM);
+ gfp & ~__GFP_HIGHMEM);
}
static void free_handle(struct zs_pool *pool, unsigned long handle)
@@ -324,7 +323,12 @@ static void *zs_zpool_create(const char *name, gfp_t gfp,
const struct zpool_ops *zpool_ops,
struct zpool *zpool)
{
- return zs_create_pool(name, gfp);
+ /*
+ * Ignore global gfp flags: zs_malloc() may be invoked from
+ * different contexts and its caller must provide a valid
+ * gfp mask.
+ */
+ return zs_create_pool(name);
}
static void zs_zpool_destroy(void *pool)
@@ -335,7 +339,7 @@ static void zs_zpool_destroy(void *pool)
static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp,
unsigned long *handle)
{
- *handle = zs_malloc(pool, size);
+ *handle = zs_malloc(pool, size, gfp);
return *handle ? 0 : -1;
}
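
The hunks above are the heart of this zsmalloc change: a pool no longer carries a gfp mask, so every allocation supplies its own. A minimal sketch of the new calling convention (zs_demo_store() is a made-up caller):

/* New convention: the gfp mask travels with each allocation. */
static unsigned long zs_demo_store(struct zs_pool *pool, size_t size)
{
	/* the pool came from zs_create_pool("name") -- no gfp at create time */
	return zs_malloc(pool, size, GFP_KERNEL);	/* returns 0 on failure */
}
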
static void zs_zpool_free(void *pool, unsigned long handle)
@@ -413,26 +417,28 @@ static int is_last_page(struct page *page)
return PagePrivate2(page);
}
-static void get_zspage_mapping(struct page *page, unsigned int *class_idx,
+static void get_zspage_mapping(struct page *first_page,
+ unsigned int *class_idx,
enum fullness_group *fullness)
{
unsigned long m;
- BUG_ON(!is_first_page(page));
+ VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
- m = (unsigned long)page->mapping;
+ m = (unsigned long)first_page->mapping;
*fullness = m & FULLNESS_MASK;
*class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK;
}
-static void set_zspage_mapping(struct page *page, unsigned int class_idx,
+static void set_zspage_mapping(struct page *first_page,
+ unsigned int class_idx,
enum fullness_group fullness)
{
unsigned long m;
- BUG_ON(!is_first_page(page));
+ VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) |
(fullness & FULLNESS_MASK);
- page->mapping = (struct address_space *)m;
+ first_page->mapping = (struct address_space *)m;
}
/*
@@ -567,17 +573,17 @@ static const struct file_operations zs_stat_size_ops = {
.release = single_release,
};
-static int zs_pool_stat_create(const char *name, struct zs_pool *pool)
+static void zs_pool_stat_create(struct zs_pool *pool, const char *name)
{
struct dentry *entry;
if (!zs_stat_root)
- return -ENODEV;
+ return;
entry = debugfs_create_dir(name, zs_stat_root);
if (!entry) {
pr_warn("debugfs dir <%s> creation failed\n", name);
- return -ENOMEM;
+ return;
}
pool->stat_dentry = entry;
@@ -586,10 +592,8 @@ static int zs_pool_stat_create(const char *name, struct zs_pool *pool)
if (!entry) {
pr_warn("%s: debugfs file entry <%s> creation failed\n",
name, "classes");
- return -ENOMEM;
+ return;
}
-
- return 0;
}
static void zs_pool_stat_destroy(struct zs_pool *pool)
@@ -607,9 +611,8 @@ static void __exit zs_stat_exit(void)
{
}
-static inline int zs_pool_stat_create(const char *name, struct zs_pool *pool)
+static inline void zs_pool_stat_create(struct zs_pool *pool, const char *name)
{
- return 0;
}
static inline void zs_pool_stat_destroy(struct zs_pool *pool)
@@ -617,7 +620,6 @@ static inline void zs_pool_stat_destroy(struct zs_pool *pool)
}
#endif
-
/*
* For each size class, zspages are divided into different groups
* depending on how "full" they are. This was done so that we could
@@ -625,14 +627,15 @@ static inline void zs_pool_stat_destroy(struct zs_pool *pool)
* the pool (not yet implemented). This function returns fullness
* status of the given page.
*/
-static enum fullness_group get_fullness_group(struct page *page)
+static enum fullness_group get_fullness_group(struct page *first_page)
{
int inuse, max_objects;
enum fullness_group fg;
- BUG_ON(!is_first_page(page));
- inuse = page->inuse;
- max_objects = page->objects;
+ VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
+
+ inuse = first_page->inuse;
+ max_objects = first_page->objects;
if (inuse == 0)
fg = ZS_EMPTY;
@@ -652,12 +655,13 @@ static enum fullness_group get_fullness_group(struct page *page)
* have. This functions inserts the given zspage into the freelist
* identified by <class, fullness_group>.
*/
-static void insert_zspage(struct page *page, struct size_class *class,
- enum fullness_group fullness)
+static void insert_zspage(struct size_class *class,
+ enum fullness_group fullness,
+ struct page *first_page)
{
struct page **head;
- BUG_ON(!is_first_page(page));
+ VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
if (fullness >= _ZS_NR_FULLNESS_GROUPS)
return;
@@ -667,7 +671,7 @@ static void insert_zspage(struct page *page, struct size_class *class,
head = &class->fullness_list[fullness];
if (!*head) {
- *head = page;
+ *head = first_page;
return;
}
@@ -675,34 +679,35 @@ static void insert_zspage(struct page *page, struct size_class *class,
* We want to see more ZS_FULL pages and less almost
* empty/full. Put pages with higher ->inuse first.
*/
- list_add_tail(&page->lru, &(*head)->lru);
- if (page->inuse >= (*head)->inuse)
- *head = page;
+ list_add_tail(&first_page->lru, &(*head)->lru);
+ if (first_page->inuse >= (*head)->inuse)
+ *head = first_page;
}
/*
* This function removes the given zspage from the freelist identified
* by <class, fullness_group>.
*/
-static void remove_zspage(struct page *page, struct size_class *class,
- enum fullness_group fullness)
+static void remove_zspage(struct size_class *class,
+ enum fullness_group fullness,
+ struct page *first_page)
{
struct page **head;
- BUG_ON(!is_first_page(page));
+ VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
if (fullness >= _ZS_NR_FULLNESS_GROUPS)
return;
head = &class->fullness_list[fullness];
- BUG_ON(!*head);
+ VM_BUG_ON_PAGE(!*head, first_page);
if (list_empty(&(*head)->lru))
*head = NULL;
- else if (*head == page)
+ else if (*head == first_page)
*head = (struct page *)list_entry((*head)->lru.next,
struct page, lru);
- list_del_init(&page->lru);
+ list_del_init(&first_page->lru);
zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ?
CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
}
@@ -717,21 +722,19 @@ static void remove_zspage(struct page *page, struct size_class *class,
* fullness group.
*/
static enum fullness_group fix_fullness_group(struct size_class *class,
- struct page *page)
+ struct page *first_page)
{
int class_idx;
enum fullness_group currfg, newfg;
- BUG_ON(!is_first_page(page));
-
- get_zspage_mapping(page, &class_idx, &currfg);
- newfg = get_fullness_group(page);
+ get_zspage_mapping(first_page, &class_idx, &currfg);
+ newfg = get_fullness_group(first_page);
if (newfg == currfg)
goto out;
- remove_zspage(page, class, currfg);
- insert_zspage(page, class, newfg);
- set_zspage_mapping(page, class_idx, newfg);
+ remove_zspage(class, currfg, first_page);
+ insert_zspage(class, newfg, first_page);
+ set_zspage_mapping(first_page, class_idx, newfg);
out:
return newfg;
@@ -809,7 +812,7 @@ static void *location_to_obj(struct page *page, unsigned long obj_idx)
unsigned long obj;
if (!page) {
- BUG_ON(obj_idx);
+ VM_BUG_ON(obj_idx);
return NULL;
}
@@ -842,7 +845,7 @@ static unsigned long obj_to_head(struct size_class *class, struct page *page,
void *obj)
{
if (class->huge) {
- VM_BUG_ON(!is_first_page(page));
+ VM_BUG_ON_PAGE(!is_first_page(page), page);
return page_private(page);
} else
return *(unsigned long *)obj;
@@ -892,8 +895,8 @@ static void free_zspage(struct page *first_page)
{
struct page *nextp, *tmp, *head_extra;
- BUG_ON(!is_first_page(first_page));
- BUG_ON(first_page->inuse);
+ VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
+ VM_BUG_ON_PAGE(first_page->inuse, first_page);
head_extra = (struct page *)page_private(first_page);
@@ -914,12 +917,13 @@ static void free_zspage(struct page *first_page)
}
/* Initialize a newly allocated zspage */
-static void init_zspage(struct page *first_page, struct size_class *class)
+static void init_zspage(struct size_class *class, struct page *first_page)
{
unsigned long off = 0;
struct page *page = first_page;
- BUG_ON(!is_first_page(first_page));
+ VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
+
while (page) {
struct page *next_page;
struct link_free *link;
@@ -1001,7 +1005,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
prev_page = page;
}
- init_zspage(first_page, class);
+ init_zspage(class, first_page);
first_page->freelist = location_to_obj(first_page, 0);
/* Maximum number of objects we can store in this zspage */
@@ -1234,11 +1238,11 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
return true;
}
-static bool zspage_full(struct page *page)
+static bool zspage_full(struct page *first_page)
{
- BUG_ON(!is_first_page(page));
+ VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
- return page->inuse == page->objects;
+ return first_page->inuse == first_page->objects;
}
unsigned long zs_get_total_pages(struct zs_pool *pool)
@@ -1274,14 +1278,12 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
struct page *pages[2];
void *ret;
- BUG_ON(!handle);
-
/*
* Because we use per-cpu mapping areas shared among the
* pools/users, we can't allow mapping in interrupt context
* because it can corrupt another users mappings.
*/
- BUG_ON(in_interrupt());
+ WARN_ON_ONCE(in_interrupt());
/* From now on, migration cannot move the object */
pin_tag(handle);
@@ -1325,8 +1327,6 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
struct size_class *class;
struct mapping_area *area;
- BUG_ON(!handle);
-
obj = handle_to_obj(handle);
obj_to_location(obj, &page, &obj_idx);
get_zspage_mapping(get_first_page(page), &class_idx, &fg);
@@ -1350,8 +1350,8 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
}
EXPORT_SYMBOL_GPL(zs_unmap_object);
-static unsigned long obj_malloc(struct page *first_page,
- struct size_class *class, unsigned long handle)
+static unsigned long obj_malloc(struct size_class *class,
+ struct page *first_page, unsigned long handle)
{
unsigned long obj;
struct link_free *link;
@@ -1391,7 +1391,7 @@ static unsigned long obj_malloc(struct page *first_page,
* otherwise 0.
* Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
*/
-unsigned long zs_malloc(struct zs_pool *pool, size_t size)
+unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
{
unsigned long handle, obj;
struct size_class *class;
@@ -1400,7 +1400,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
return 0;
- handle = alloc_handle(pool);
+ handle = alloc_handle(pool, gfp);
if (!handle)
return 0;
@@ -1413,7 +1413,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
if (!first_page) {
spin_unlock(&class->lock);
- first_page = alloc_zspage(class, pool->flags);
+ first_page = alloc_zspage(class, gfp);
if (unlikely(!first_page)) {
free_handle(pool, handle);
return 0;
@@ -1428,7 +1428,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
class->size, class->pages_per_zspage));
}
- obj = obj_malloc(first_page, class, handle);
+ obj = obj_malloc(class, first_page, handle);
/* Now move the zspage to another fullness group, if required */
fix_fullness_group(class, first_page);
record_obj(handle, obj);
@@ -1438,16 +1438,13 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
}
EXPORT_SYMBOL_GPL(zs_malloc);
-static void obj_free(struct zs_pool *pool, struct size_class *class,
- unsigned long obj)
+static void obj_free(struct size_class *class, unsigned long obj)
{
struct link_free *link;
struct page *first_page, *f_page;
unsigned long f_objidx, f_offset;
void *vaddr;
- BUG_ON(!obj);
-
obj &= ~OBJ_ALLOCATED_TAG;
obj_to_location(obj, &f_page, &f_objidx);
first_page = get_first_page(f_page);
@@ -1487,7 +1484,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
class = pool->size_class[class_idx];
spin_lock(&class->lock);
- obj_free(pool, class, obj);
+ obj_free(class, obj);
fullness = fix_fullness_group(class, first_page);
if (fullness == ZS_EMPTY) {
zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
@@ -1503,8 +1500,8 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
}
EXPORT_SYMBOL_GPL(zs_free);
-static void zs_object_copy(unsigned long dst, unsigned long src,
- struct size_class *class)
+static void zs_object_copy(struct size_class *class, unsigned long dst,
+ unsigned long src)
{
struct page *s_page, *d_page;
unsigned long s_objidx, d_objidx;
@@ -1547,7 +1544,6 @@ static void zs_object_copy(unsigned long dst, unsigned long src,
kunmap_atomic(d_addr);
kunmap_atomic(s_addr);
s_page = get_next_page(s_page);
- BUG_ON(!s_page);
s_addr = kmap_atomic(s_page);
d_addr = kmap_atomic(d_page);
s_size = class->size - written;
@@ -1557,7 +1553,6 @@ static void zs_object_copy(unsigned long dst, unsigned long src,
if (d_off >= PAGE_SIZE) {
kunmap_atomic(d_addr);
d_page = get_next_page(d_page);
- BUG_ON(!d_page);
d_addr = kmap_atomic(d_page);
d_size = class->size - written;
d_off = 0;
@@ -1572,8 +1567,8 @@ static void zs_object_copy(unsigned long dst, unsigned long src,
* Find alloced object in zspage from index object and
* return handle.
*/
-static unsigned long find_alloced_obj(struct page *page, int index,
- struct size_class *class)
+static unsigned long find_alloced_obj(struct size_class *class,
+ struct page *page, int index)
{
unsigned long head;
int offset = 0;
@@ -1623,7 +1618,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
int ret = 0;
while (1) {
- handle = find_alloced_obj(s_page, index, class);
+ handle = find_alloced_obj(class, s_page, index);
if (!handle) {
s_page = get_next_page(s_page);
if (!s_page)
@@ -1640,8 +1635,8 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
}
used_obj = handle_to_obj(handle);
- free_obj = obj_malloc(d_page, class, handle);
- zs_object_copy(free_obj, used_obj, class);
+ free_obj = obj_malloc(class, d_page, handle);
+ zs_object_copy(class, free_obj, used_obj);
index++;
/*
* record_obj updates handle's value to free_obj and it will
@@ -1652,7 +1647,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
free_obj |= BIT(HANDLE_PIN_BIT);
record_obj(handle, free_obj);
unpin_tag(handle);
- obj_free(pool, class, used_obj);
+ obj_free(class, used_obj);
}
/* Remember last position in this iteration */
@@ -1670,7 +1665,7 @@ static struct page *isolate_target_page(struct size_class *class)
for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
page = class->fullness_list[i];
if (page) {
- remove_zspage(page, class, i);
+ remove_zspage(class, i, page);
break;
}
}
@@ -1692,10 +1687,8 @@ static enum fullness_group putback_zspage(struct zs_pool *pool,
{
enum fullness_group fullness;
- BUG_ON(!is_first_page(first_page));
-
fullness = get_fullness_group(first_page);
- insert_zspage(first_page, class, fullness);
+ insert_zspage(class, fullness, first_page);
set_zspage_mapping(first_page, class->index, fullness);
if (fullness == ZS_EMPTY) {
@@ -1720,7 +1713,7 @@ static struct page *isolate_source_page(struct size_class *class)
if (!page)
continue;
- remove_zspage(page, class, i);
+ remove_zspage(class, i, page);
break;
}
@@ -1757,8 +1750,6 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class)
spin_lock(&class->lock);
while ((src_page = isolate_source_page(class))) {
- BUG_ON(!is_first_page(src_page));
-
if (!zs_can_compact(class))
break;
@@ -1887,7 +1878,7 @@ static int zs_register_shrinker(struct zs_pool *pool)
* On success, a pointer to the newly created pool is returned,
* otherwise NULL.
*/
-struct zs_pool *zs_create_pool(const char *name, gfp_t flags)
+struct zs_pool *zs_create_pool(const char *name)
{
int i;
struct zs_pool *pool;
@@ -1957,10 +1948,8 @@ struct zs_pool *zs_create_pool(const char *name, gfp_t flags)
prev_class = class;
}
- pool->flags = flags;
-
- if (zs_pool_stat_create(name, pool))
- goto err;
+ /* debug only, don't abort if it fails */
+ zs_pool_stat_create(pool, name);
/*
* Not critical, we still can use the pool
diff --git a/mm/zswap.c b/mm/zswap.c
index de0f119b1780..275b22cc8df4 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -117,7 +117,7 @@ struct zswap_pool {
struct crypto_comp * __percpu *tfm;
struct kref kref;
struct list_head list;
- struct rcu_head rcu_head;
+ struct work_struct work;
struct notifier_block notifier;
char tfm_name[CRYPTO_MAX_ALG_NAME];
};
@@ -658,9 +658,11 @@ static int __must_check zswap_pool_get(struct zswap_pool *pool)
return kref_get_unless_zero(&pool->kref);
}
-static void __zswap_pool_release(struct rcu_head *head)
+static void __zswap_pool_release(struct work_struct *work)
{
- struct zswap_pool *pool = container_of(head, typeof(*pool), rcu_head);
+ struct zswap_pool *pool = container_of(work, typeof(*pool), work);
+
+ synchronize_rcu();
/* nobody should have been able to get a kref... */
WARN_ON(kref_get_unless_zero(&pool->kref));
@@ -680,7 +682,9 @@ static void __zswap_pool_empty(struct kref *kref)
WARN_ON(pool == zswap_pool_current());
list_del_rcu(&pool->list);
- call_rcu(&pool->rcu_head, __zswap_pool_release);
+
+ INIT_WORK(&pool->work, __zswap_pool_release);
+ schedule_work(&pool->work);
spin_unlock(&zswap_pools_lock);
}
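
The zswap hunks above replace call_rcu() with a work item because releasing a pool can sleep, which is not allowed in RCU callback context; the work function waits out the grace period itself before tearing down. A generic sketch of the same pattern, with a hypothetical my_obj structure:

/* Pattern sketch: sleepable teardown after an RCU grace period. */
struct my_obj {
	struct list_head list;
	struct work_struct work;
};

static void my_obj_release(struct work_struct *work)
{
	struct my_obj *obj = container_of(work, struct my_obj, work);

	synchronize_rcu();	/* wait for readers still walking the list */
	kfree(obj);		/* sleeping locks would also be safe here */
}

static void my_obj_put(struct my_obj *obj)
{
	list_del_rcu(&obj->list);
	INIT_WORK(&obj->work, my_obj_release);
	schedule_work(&obj->work);
}
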
diff --git a/net/socket.c b/net/socket.c
index e7793f5601ae..a1bd16106625 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2168,7 +2168,8 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
struct mmsghdr __user *entry;
struct compat_mmsghdr __user *compat_entry;
struct msghdr msg_sys;
- struct timespec end_time;
+ struct timespec64 end_time;
+ struct timespec64 timeout64;
if (timeout &&
poll_select_set_timeout(&end_time, timeout->tv_sec,
@@ -2220,8 +2221,9 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
flags |= MSG_DONTWAIT;
if (timeout) {
- ktime_get_ts(timeout);
- *timeout = timespec_sub(end_time, *timeout);
+ ktime_get_ts64(&timeout64);
+ *timeout = timespec64_to_timespec(
+ timespec64_sub(end_time, timeout64));
if (timeout->tv_sec < 0) {
timeout->tv_sec = timeout->tv_nsec = 0;
break;
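
The recvmmsg change above is a y2038 conversion: the deadline and the clock read both use timespec64, and only the value copied back to userspace is narrowed. A small sketch of the remaining-time computation it performs, assuming end_time already holds the absolute deadline:

/* Sketch: time left until an absolute timespec64 deadline, clamped at zero. */
static struct timespec64 time_remaining(struct timespec64 end_time)
{
	struct timespec64 now, left;

	ktime_get_ts64(&now);
	left = timespec64_sub(end_time, now);
	if (left.tv_sec < 0)
		left.tv_sec = left.tv_nsec = 0;
	return left;
}
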
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 219bd197039e..4e809e978b7d 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -651,7 +651,7 @@ __frame_add_frag(struct sk_buff *skb, struct page *page,
struct skb_shared_info *sh = skb_shinfo(skb);
int page_offset;
- atomic_inc(&page->_count);
+ page_ref_inc(page);
page_offset = ptr - page_address(page);
skb_add_rx_frag(skb, sh->nr_frags, page, page_offset, len, size);
}
diff --git a/samples/kprobes/jprobe_example.c b/samples/kprobes/jprobe_example.c
index c285a3b8a9f1..c3108bb15789 100644
--- a/samples/kprobes/jprobe_example.c
+++ b/samples/kprobes/jprobe_example.c
@@ -25,7 +25,7 @@
/* Proxy routine having the same arguments as actual _do_fork() routine */
static long j_do_fork(unsigned long clone_flags, unsigned long stack_start,
unsigned long stack_size, int __user *parent_tidptr,
- int __user *child_tidptr)
+ int __user *child_tidptr, unsigned long tls)
{
pr_info("jprobe: clone_flags = 0x%lx, stack_start = 0x%lx "
"stack_size = 0x%lx\n", clone_flags, stack_start, stack_size);
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index d574d13ba963..6750595bd7b8 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -27,12 +27,15 @@ my $emacs = 0;
my $terse = 0;
my $showfile = 0;
my $file = 0;
+my $git = 0;
+my %git_commits = ();
my $check = 0;
my $check_orig = 0;
my $summary = 1;
my $mailback = 0;
my $summary_file = 0;
my $show_types = 0;
+my $list_types = 0;
my $fix = 0;
my $fix_inplace = 0;
my $root;
@@ -68,13 +71,24 @@ Options:
--emacs emacs compile window format
--terse one line per report
--showfile emit diffed file position, not input file position
+ -g, --git treat FILE as a single commit or git revision range
+ single git commit with:
+ <rev>
+ <rev>^
+ <rev>~n
+ multiple git commits with:
+ <rev1>..<rev2>
+ <rev1>...<rev2>
+ <rev>-<count>
+ git merges are ignored
-f, --file treat FILE as regular source file
--subjective, --strict enable more subjective tests
+ --list-types list the possible message types
--types TYPE(,TYPE2...) show only these comma separated message types
--ignore TYPE(,TYPE2...) ignore various comma separated message types
+ --show-types show the specific message type in the output
--max-line-length=n set the maximum line length, if exceeded, warn
--min-conf-desc-length=n set the min description length, if shorter, warn
- --show-types show the message "types" in the output
--root=PATH PATH to the kernel tree root
--no-summary suppress the per-file summary
--mailback only produce a report in case of warnings/errors
@@ -106,6 +120,37 @@ EOM
exit($exitcode);
}
+sub uniq {
+ my %seen;
+ return grep { !$seen{$_}++ } @_;
+}
+
+sub list_types {
+ my ($exitcode) = @_;
+
+ my $count = 0;
+
+ local $/ = undef;
+
+ open(my $script, '<', abs_path($P)) or
+ die "$P: Can't read '$P' $!\n";
+
+ my $text = <$script>;
+ close($script);
+
+ my @types = ();
+ for ($text =~ /\b(?:(?:CHK|WARN|ERROR)\s*\(\s*"([^"]+)")/g) {
+ push (@types, $_);
+ }
+ @types = sort(uniq(@types));
+ print("#\tMessage type\n\n");
+ foreach my $type (@types) {
+ print(++$count . "\t" . $type . "\n");
+ }
+
+ exit($exitcode);
+}
+
my $conf = which_conf($configuration_file);
if (-f $conf) {
my @conf_args;
@@ -141,11 +186,13 @@ GetOptions(
'terse!' => \$terse,
'showfile!' => \$showfile,
'f|file!' => \$file,
+ 'g|git!' => \$git,
'subjective!' => \$check,
'strict!' => \$check,
'ignore=s' => \@ignore,
'types=s' => \@use,
'show-types!' => \$show_types,
+ 'list-types!' => \$list_types,
'max-line-length=i' => \$max_line_length,
'min-conf-desc-length=i' => \$min_conf_desc_length,
'root=s' => \$root,
@@ -166,6 +213,8 @@ GetOptions(
help(0) if ($help);
+list_types(0) if ($list_types);
+
$fix = 1 if ($fix_inplace);
$check_orig = $check;
@@ -752,10 +801,42 @@ my @fixed_inserted = ();
my @fixed_deleted = ();
my $fixlinenr = -1;
+# If input is git commits, extract all commits from the commit expressions.
+# For example, HEAD-3 means we need to check 'HEAD, HEAD~1, HEAD~2'.
+die "$P: No git repository found\n" if ($git && !-e ".git");
+
+if ($git) {
+ my @commits = ();
+ foreach my $commit_expr (@ARGV) {
+ my $git_range;
+ if ($commit_expr =~ m/^(.*)-(\d+)$/) {
+ $git_range = "-$2 $1";
+ } elsif ($commit_expr =~ m/\.\./) {
+ $git_range = "$commit_expr";
+ } else {
+ $git_range = "-1 $commit_expr";
+ }
+ my $lines = `git log --no-color --no-merges --pretty=format:'%H %s' $git_range`;
+ foreach my $line (split(/\n/, $lines)) {
+ $line =~ /^([0-9a-fA-F]{40,40}) (.*)$/;
+ next if (!defined($1) || !defined($2));
+ my $sha1 = $1;
+ my $subject = $2;
+ unshift(@commits, $sha1);
+ $git_commits{$sha1} = $subject;
+ }
+ }
+ die "$P: no git commits after extraction!\n" if (@commits == 0);
+ @ARGV = @commits;
+}
+
my $vname;
for my $filename (@ARGV) {
my $FILE;
- if ($file) {
+ if ($git) {
+ open($FILE, '-|', "git format-patch -M --stdout -1 $filename") ||
+ die "$P: $filename: git format-patch failed - $!\n";
+ } elsif ($file) {
open($FILE, '-|', "diff -u /dev/null $filename") ||
die "$P: $filename: diff failed - $!\n";
} elsif ($filename eq '-') {
@@ -766,6 +847,8 @@ for my $filename (@ARGV) {
}
if ($filename eq '-') {
$vname = 'Your patch';
+ } elsif ($git) {
+ $vname = "Commit " . substr($filename, 0, 12) . ' ("' . $git_commits{$filename} . '")';
} else {
$vname = $filename;
}
@@ -2755,6 +2838,19 @@ sub process {
"Logical continuations should be on the previous line\n" . $hereprev);
}
+# check indentation starts on a tab stop
+ if ($^V && $^V ge 5.10.0 &&
+ $sline =~ /^\+\t+( +)(?:$c90_Keywords\b|\{\s*$|\}\s*(?:else\b|while\b|\s*$))/) {
+ my $indent = length($1);
+ if ($indent % 8) {
+ if (WARN("TABSTOP",
+ "Statements should start on a tabstop\n" . $herecurr) &&
+ $fix) {
+ $fixed[$fixlinenr] =~ s@(^\+\t+) +@$1 . "\t" x ($indent/8)@e;
+ }
+ }
+ }
+
# check multi-line statement indentation matches previous line
if ($^V && $^V ge 5.10.0 &&
$prevline =~ /^\+([ \t]*)((?:$c90_Keywords(?:\s+if)\s*)|(?:$Declare\s*)?(?:$Ident|\(\s*\*\s*$Ident\s*\))\s*|$Ident\s*=\s*$Ident\s*)\(.*(\&\&|\|\||,)\s*$/) {
@@ -4291,7 +4387,7 @@ sub process {
my $comp = $3;
my $to = $4;
my $newcomp = $comp;
- if ($lead !~ /$Operators\s*$/ &&
+ if ($lead !~ /(?:$Operators|\.)\s*$/ &&
$to !~ /^(?:Constant|[A-Z_][A-Z0-9_]*)$/ &&
WARN("CONSTANT_COMPARISON",
"Comparisons should place the constant on the right side of the test\n" . $herecurr) &&
@@ -5637,6 +5733,16 @@ sub process {
}
}
+# check for #if defined CONFIG_<FOO> || defined CONFIG_<FOO>_MODULE
+ if ($line =~ /^\+\s*#\s*if\s+defined(?:\s*\(?\s*|\s+)(CONFIG_[A-Z_]+)\s*\)?\s*\|\|\s*defined(?:\s*\(?\s*|\s+)\1_MODULE\s*\)?\s*$/) {
+ my $config = $1;
+ if (WARN("PREFER_IS_ENABLED",
+ "Prefer IS_ENABLED(<FOO>) to CONFIG_<FOO> || CONFIG_<FOO>_MODULE\n" . $herecurr) &&
+ $fix) {
+ $fixed[$fixlinenr] = "\+#if IS_ENABLED($config)";
+ }
+ }
+
# check for case / default statements not preceded by break/fallthrough/switch
if ($line =~ /^.\s*(?:case\s+(?:$Ident|$Constant)\s*|default):/) {
my $has_break = 0;
@@ -5827,6 +5933,28 @@ sub process {
}
}
+# whine about ACCESS_ONCE
+ if ($^V && $^V ge 5.10.0 &&
+ $line =~ /\bACCESS_ONCE\s*$balanced_parens\s*(=(?!=))?\s*($FuncArg)?/) {
+ my $par = $1;
+ my $eq = $2;
+ my $fun = $3;
+ $par =~ s/^\(\s*(.*)\s*\)$/$1/;
+ if (defined($eq)) {
+ if (WARN("PREFER_WRITE_ONCE",
+ "Prefer WRITE_ONCE(<FOO>, <BAR>) over ACCESS_ONCE(<FOO>) = <BAR>\n" . $herecurr) &&
+ $fix) {
+ $fixed[$fixlinenr] =~ s/\bACCESS_ONCE\s*\(\s*\Q$par\E\s*\)\s*$eq\s*\Q$fun\E/WRITE_ONCE($par, $fun)/;
+ }
+ } else {
+ if (WARN("PREFER_READ_ONCE",
+ "Prefer READ_ONCE(<FOO>) over ACCESS_ONCE(<FOO>)\n" . $herecurr) &&
+ $fix) {
+ $fixed[$fixlinenr] =~ s/\bACCESS_ONCE\s*\(\s*\Q$par\E\s*\)/READ_ONCE($par)/;
+ }
+ }
+ }
+
# check for lockdep_set_novalidate_class
if ($line =~ /^.\s*lockdep_set_novalidate_class\s*\(/ ||
$line =~ /__lockdep_no_validate__\s*\)/ ) {
@@ -5930,6 +6058,14 @@ sub process {
}
if ($quiet == 0) {
+ # If there were any defects found and not already fixing them
+ if (!$clean and !$fix) {
+ print << "EOM"
+
+NOTE: For some of the reported defects, checkpatch may be able to
+ mechanically convert to the typical style using --fix or --fix-inplace.
+EOM
+ }
# If there were whitespace errors which cleanpatch can fix
# then suggest that.
if ($rpt_cleaners) {
diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh
index 00d6d53c2681..c332684e1b5a 100755
--- a/scripts/decode_stacktrace.sh
+++ b/scripts/decode_stacktrace.sh
@@ -2,15 +2,17 @@
# (c) 2014, Sasha Levin <sasha.levin@oracle.com>
#set -x
-if [[ $# != 2 ]]; then
+if [[ $# -lt 2 ]]; then
echo "Usage:"
- echo " $0 [vmlinux] [base path]"
+ echo " $0 [vmlinux] [base path] [modules path]"
exit 1
fi
vmlinux=$1
basepath=$2
+modpath=$3
declare -A cache
+declare -A modcache
parse_symbol() {
# The structure of symbol at this point is:
@@ -19,6 +21,17 @@ parse_symbol() {
# For example:
# do_basic_setup+0x9c/0xbf
+ if [[ $module == "" ]] ; then
+ local objfile=$vmlinux
+ elif [[ "${modcache[$module]+isset}" == "isset" ]]; then
+ local objfile=${modcache[$module]}
+ else
+ [[ $modpath == "" ]] && return
+ local objfile=$(find "$modpath" -name $module.ko -print -quit)
+ [[ $objfile == "" ]] && return
+ modcache[$module]=$objfile
+ fi
+
# Remove the englobing parenthesis
symbol=${symbol#\(}
symbol=${symbol%\)}
@@ -29,11 +42,11 @@ parse_symbol() {
# Use 'nm vmlinux' to figure out the base address of said symbol.
# It's actually faster to call it every time than to load it
# all into bash.
- if [[ "${cache[$name]+isset}" == "isset" ]]; then
- local base_addr=${cache[$name]}
+ if [[ "${cache[$module,$name]+isset}" == "isset" ]]; then
+ local base_addr=${cache[$module,$name]}
else
- local base_addr=$(nm "$vmlinux" | grep -i ' t ' | awk "/ $name\$/ {print \$1}" | head -n1)
- cache["$name"]="$base_addr"
+ local base_addr=$(nm "$objfile" | grep -i ' t ' | awk "/ $name\$/ {print \$1}" | head -n1)
+ cache[$module,$name]="$base_addr"
fi
# Let's start doing the math to get the exact address into the
# symbol. First, strip out the symbol total length.
@@ -48,12 +61,12 @@ parse_symbol() {
local address=$(printf "%x\n" "$expr")
# Pass it to addr2line to get filename and line number
- # Could get more than one result
- if [[ "${cache[$address]+isset}" == "isset" ]]; then
- local code=${cache[$address]}
+ # Could get more than one result
+ if [[ "${cache[$module,$address]+isset}" == "isset" ]]; then
+ local code=${cache[$module,$address]}
else
- local code=$(addr2line -i -e "$vmlinux" "$address")
- cache[$address]=$code
+ local code=$(addr2line -i -e "$objfile" "$address")
+ cache[$module,$address]=$code
fi
# addr2line doesn't return a proper error code if it fails, so
@@ -105,13 +118,23 @@ handle_line() {
fi
done
- # The symbol is the last element, process it
- symbol=${words[$last]}
+ if [[ ${words[$last]} =~ \[([^]]+)\] ]]; then
+ module=${words[$last]}
+ module=${module#\[}
+ module=${module%\]}
+ symbol=${words[$last-1]}
+ unset words[$last-1]
+ else
+ # The symbol is the last element, process it
+ symbol=${words[$last]}
+ module=
+ fi
+
unset words[$last]
parse_symbol # modifies $symbol
# Add up the line number to the symbol
- echo "${words[@]}" "$symbol"
+ echo "${words[@]}" "$symbol $module"
}
while read line; do
@@ -121,8 +144,8 @@ while read line; do
handle_line "$line"
# Is it a code line?
elif [[ $line == *Code:* ]]; then
- decode_code "$line"
- else
+ decode_code "$line"
+ else
# Nothing special in this line, show it as is
echo "$line"
fi
diff --git a/scripts/gdb/linux/Makefile b/scripts/gdb/linux/Makefile
index 6cf1ecf61057..cd129e65d1ff 100644
--- a/scripts/gdb/linux/Makefile
+++ b/scripts/gdb/linux/Makefile
@@ -8,4 +8,14 @@ ifneq ($(KBUILD_SRC),)
endif
@:
-clean-files := *.pyc *.pyo $(if $(KBUILD_SRC),*.py)
+quiet_cmd_gen_constants_py = GEN $@
+ cmd_gen_constants_py = \
+ $(CPP) -E -x c -P $(c_flags) $< > $@ ;\
+ sed -i '1,/<!-- end-c-headers -->/d;' $@
+
+$(obj)/constants.py: $(SRCTREE)/$(obj)/constants.py.in
+ $(call if_changed,gen_constants_py)
+
+build_constants_py: $(obj)/constants.py
+
+clean-files := *.pyc *.pyo $(if $(KBUILD_SRC),*.py) $(obj)/constants.py
diff --git a/scripts/gdb/linux/constants.py.in b/scripts/gdb/linux/constants.py.in
new file mode 100644
index 000000000000..07e6c2befe36
--- /dev/null
+++ b/scripts/gdb/linux/constants.py.in
@@ -0,0 +1,59 @@
+/*
+ * gdb helper commands and functions for Linux kernel debugging
+ *
+ * Kernel constants derived from include files.
+ *
+ * Copyright (c) 2016 Linaro Ltd
+ *
+ * Authors:
+ * Kieran Bingham <kieran.bingham@linaro.org>
+ *
+ * This work is licensed under the terms of the GNU GPL version 2.
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/radix-tree.h>
+
+/* We need to stringify expanded macros so that they can be parsed */
+
+#define STRING(x) #x
+#define XSTRING(x) STRING(x)
+
+#define LX_VALUE(x) LX_##x = x
+#define LX_GDBPARSED(x) LX_##x = gdb.parse_and_eval(XSTRING(x))
+
+/*
+ * IS_ENABLED() generates (a || b), which is not valid python.
+ * We can only switch on configuration items we know are available,
+ * therefore IS_BUILTIN() is more appropriate.
+ */
+#define LX_CONFIG(x) LX_##x = IS_BUILTIN(x)
+
+/* The build system will take care of deleting everything above this marker */
+<!-- end-c-headers -->
+
+import gdb
+
+/* linux/fs.h */
+LX_VALUE(MS_RDONLY)
+LX_VALUE(MS_SYNCHRONOUS)
+LX_VALUE(MS_MANDLOCK)
+LX_VALUE(MS_DIRSYNC)
+LX_VALUE(MS_NOATIME)
+LX_VALUE(MS_NODIRATIME)
+
+/* linux/mount.h */
+LX_VALUE(MNT_NOSUID)
+LX_VALUE(MNT_NODEV)
+LX_VALUE(MNT_NOEXEC)
+LX_VALUE(MNT_NOATIME)
+LX_VALUE(MNT_NODIRATIME)
+LX_VALUE(MNT_RELATIME)
+
+/* linux/radix-tree.h */
+LX_VALUE(RADIX_TREE_INDIRECT_PTR)
+LX_GDBPARSED(RADIX_TREE_HEIGHT_MASK)
+LX_GDBPARSED(RADIX_TREE_MAP_SHIFT)
+LX_GDBPARSED(RADIX_TREE_MAP_MASK)
diff --git a/scripts/gdb/linux/cpus.py b/scripts/gdb/linux/cpus.py
index 4297b83fedef..ca11e8df31b6 100644
--- a/scripts/gdb/linux/cpus.py
+++ b/scripts/gdb/linux/cpus.py
@@ -97,9 +97,47 @@ def cpu_list(mask_name):
bits >>= 1
bit += 1
+ yield int(cpu)
+
+
+def each_online_cpu():
+ for cpu in cpu_list("__cpu_online_mask"):
+ yield cpu
+
+
+def each_present_cpu():
+ for cpu in cpu_list("__cpu_present_mask"):
+ yield cpu
+
+
+def each_possible_cpu():
+ for cpu in cpu_list("__cpu_possible_mask"):
+ yield cpu
+
+
+def each_active_cpu():
+ for cpu in cpu_list("__cpu_active_mask"):
yield cpu
+class LxCpus(gdb.Command):
+ """List CPU status arrays
+
+Displays the known state of each CPU based on the kernel masks
+and can help identify the state of hotplugged CPUs"""
+
+ def __init__(self):
+ super(LxCpus, self).__init__("lx-cpus", gdb.COMMAND_DATA)
+
+ def invoke(self, arg, from_tty):
+ gdb.write("Possible CPUs : {}\n".format(list(each_possible_cpu())))
+ gdb.write("Present CPUs : {}\n".format(list(each_present_cpu())))
+ gdb.write("Online CPUs : {}\n".format(list(each_online_cpu())))
+ gdb.write("Active CPUs : {}\n".format(list(each_active_cpu())))
+
+LxCpus()
+
+
class PerCpu(gdb.Function):
"""Return per-cpu variable.
diff --git a/scripts/gdb/linux/dmesg.py b/scripts/gdb/linux/dmesg.py
index 927d0d2a3145..f9b92ece7834 100644
--- a/scripts/gdb/linux/dmesg.py
+++ b/scripts/gdb/linux/dmesg.py
@@ -33,11 +33,12 @@ class LxDmesg(gdb.Command):
if log_first_idx < log_next_idx:
log_buf_2nd_half = -1
length = log_next_idx - log_first_idx
- log_buf = inf.read_memory(start, length)
+ log_buf = utils.read_memoryview(inf, start, length).tobytes()
else:
log_buf_2nd_half = log_buf_len - log_first_idx
- log_buf = inf.read_memory(start, log_buf_2nd_half) + \
- inf.read_memory(log_buf_addr, log_next_idx)
+ a = utils.read_memoryview(inf, start, log_buf_2nd_half)
+ b = utils.read_memoryview(inf, log_buf_addr, log_next_idx)
+ log_buf = a.tobytes() + b.tobytes()
pos = 0
while pos < log_buf.__len__():
@@ -50,10 +51,10 @@ class LxDmesg(gdb.Command):
continue
text_len = utils.read_u16(log_buf[pos + 10:pos + 12])
- text = log_buf[pos + 16:pos + 16 + text_len]
+ text = log_buf[pos + 16:pos + 16 + text_len].decode()
time_stamp = utils.read_u64(log_buf[pos:pos + 8])
- for line in memoryview(text).tobytes().splitlines():
+ for line in text.splitlines():
gdb.write("[{time:12.6f}] {line}\n".format(
time=time_stamp / 1000000000.0,
line=line))
diff --git a/scripts/gdb/linux/lists.py b/scripts/gdb/linux/lists.py
index 3a3775bc162b..2f335fbd86fd 100644
--- a/scripts/gdb/linux/lists.py
+++ b/scripts/gdb/linux/lists.py
@@ -18,6 +18,27 @@ from linux import utils
list_head = utils.CachedType("struct list_head")
+def list_for_each(head):
+ if head.type == list_head.get_type().pointer():
+ head = head.dereference()
+ elif head.type != list_head.get_type():
+ raise gdb.GdbError("Must be struct list_head not {}"
+ .format(head.type))
+
+ node = head['next'].dereference()
+ while node.address != head.address:
+ yield node.address
+ node = node['next'].dereference()
+
+
+def list_for_each_entry(head, gdbtype, member):
+ for node in list_for_each(head):
+ if node.type != list_head.get_type().pointer():
+ raise TypeError("Type {} found. Expected struct list_head *."
+ .format(node.type))
+ yield utils.container_of(node, gdbtype, member)
+
+
def list_check(head):
nb = 0
if (head.type == list_head.get_type().pointer()):
diff --git a/scripts/gdb/linux/modules.py b/scripts/gdb/linux/modules.py
index 0a35d6dbfb80..441b23239896 100644
--- a/scripts/gdb/linux/modules.py
+++ b/scripts/gdb/linux/modules.py
@@ -13,7 +13,7 @@
import gdb
-from linux import cpus, utils
+from linux import cpus, utils, lists
module_type = utils.CachedType("struct module")
@@ -21,14 +21,14 @@ module_type = utils.CachedType("struct module")
def module_list():
global module_type
+ modules = utils.gdb_eval_or_none("modules")
+ if modules is None:
+ return
+
module_ptr_type = module_type.get_type().pointer()
- modules = gdb.parse_and_eval("modules")
- entry = modules['next']
- end_of_list = modules.address
- while entry != end_of_list:
- yield utils.container_of(entry, module_ptr_type, "list")
- entry = entry['next']
+ for module in lists.list_for_each_entry(modules, module_ptr_type, "list"):
+ yield module
def find_module_by_name(name):
@@ -78,19 +78,17 @@ class LxLsmod(gdb.Command):
address=str(layout['base']).split()[0],
name=module['name'].string(),
size=str(layout['size']),
- ref=str(module['refcnt']['counter'])))
+ ref=str(module['refcnt']['counter'] - 1)))
- source_list = module['source_list']
t = self._module_use_type.get_type().pointer()
- entry = source_list['next']
first = True
- while entry != source_list.address:
- use = utils.container_of(entry, t, "source_list")
+ sources = module['source_list']
+ for use in lists.list_for_each_entry(sources, t, "source_list"):
gdb.write("{separator}{name}".format(
separator=" " if first else ",",
name=use['source']['name'].string()))
first = False
- entry = entry['next']
+
gdb.write("\n")
diff --git a/scripts/gdb/linux/proc.py b/scripts/gdb/linux/proc.py
index 6e6709c1830c..38b1f09d1cd9 100644
--- a/scripts/gdb/linux/proc.py
+++ b/scripts/gdb/linux/proc.py
@@ -12,6 +12,10 @@
#
import gdb
+from linux import constants
+from linux import utils
+from linux import tasks
+from linux import lists
class LxCmdLine(gdb.Command):
@@ -39,3 +43,155 @@ class LxVersion(gdb.Command):
gdb.write(gdb.parse_and_eval("linux_banner").string())
LxVersion()
+
+
+# Resource Structure Printers
+# /proc/iomem
+# /proc/ioports
+
+def get_resources(resource, depth):
+ while resource:
+ yield resource, depth
+
+ child = resource['child']
+ if child:
+ for res, deep in get_resources(child, depth + 1):
+ yield res, deep
+
+ resource = resource['sibling']
+
+
+def show_lx_resources(resource_str):
+ resource = gdb.parse_and_eval(resource_str)
+ width = 4 if resource['end'] < 0x10000 else 8
+ # Iterate straight to the first child
+ for res, depth in get_resources(resource['child'], 0):
+ start = int(res['start'])
+ end = int(res['end'])
+ gdb.write(" " * depth * 2 +
+ "{0:0{1}x}-".format(start, width) +
+ "{0:0{1}x} : ".format(end, width) +
+ res['name'].string() + "\n")
+
+
+class LxIOMem(gdb.Command):
+ """Identify the IO memory resource locations defined by the kernel
+
+Equivalent to cat /proc/iomem on a running target"""
+
+ def __init__(self):
+ super(LxIOMem, self).__init__("lx-iomem", gdb.COMMAND_DATA)
+
+ def invoke(self, arg, from_tty):
+ return show_lx_resources("iomem_resource")
+
+LxIOMem()
+
+
+class LxIOPorts(gdb.Command):
+ """Identify the IO port resource locations defined by the kernel
+
+Equivalent to cat /proc/ioports on a running target"""
+
+ def __init__(self):
+ super(LxIOPorts, self).__init__("lx-ioports", gdb.COMMAND_DATA)
+
+ def invoke(self, arg, from_tty):
+ return show_lx_resources("ioport_resource")
+
+LxIOPorts()
+
+
+# Mount namespace viewer
+# /proc/mounts
+
+def info_opts(lst, opt):
+ opts = ""
+ for key, string in lst.items():
+ if opt & key:
+ opts += string
+ return opts
+
+
+FS_INFO = {constants.LX_MS_SYNCHRONOUS: ",sync",
+ constants.LX_MS_MANDLOCK: ",mand",
+ constants.LX_MS_DIRSYNC: ",dirsync",
+ constants.LX_MS_NOATIME: ",noatime",
+ constants.LX_MS_NODIRATIME: ",nodiratime"}
+
+MNT_INFO = {constants.LX_MNT_NOSUID: ",nosuid",
+ constants.LX_MNT_NODEV: ",nodev",
+ constants.LX_MNT_NOEXEC: ",noexec",
+ constants.LX_MNT_NOATIME: ",noatime",
+ constants.LX_MNT_NODIRATIME: ",nodiratime",
+ constants.LX_MNT_RELATIME: ",relatime"}
+
+mount_type = utils.CachedType("struct mount")
+mount_ptr_type = mount_type.get_type().pointer()
+
+
+class LxMounts(gdb.Command):
+ """Report the VFS mounts of the current process namespace.
+
+Equivalent to cat /proc/mounts on a running target
+An integer value can be supplied to display the mount
+values of that process namespace"""
+
+ def __init__(self):
+ super(LxMounts, self).__init__("lx-mounts", gdb.COMMAND_DATA)
+
+ # Equivalent to proc_namespace.c:show_vfsmnt
+ # However, that has the ability to call into s_op functions
+ # whereas we cannot and must make do with the information we can obtain.
+ def invoke(self, arg, from_tty):
+ argv = gdb.string_to_argv(arg)
+ if len(argv) >= 1:
+ try:
+ pid = int(argv[0])
+ except:
+ raise gdb.GdbError("Provide a PID as integer value")
+ else:
+ pid = 1
+
+ task = tasks.get_task_by_pid(pid)
+ if not task:
+ raise gdb.GdbError("Couldn't find a process with PID {}"
+ .format(pid))
+
+ namespace = task['nsproxy']['mnt_ns']
+ if not namespace:
+ raise gdb.GdbError("No namespace for current process")
+
+ for vfs in lists.list_for_each_entry(namespace['list'],
+ mount_ptr_type, "mnt_list"):
+ devname = vfs['mnt_devname'].string()
+ devname = devname if devname else "none"
+
+ pathname = ""
+ parent = vfs
+ while True:
+ mntpoint = parent['mnt_mountpoint']
+ pathname = utils.dentry_name(mntpoint) + pathname
+ if (parent == parent['mnt_parent']):
+ break
+ parent = parent['mnt_parent']
+
+ if (pathname == ""):
+ pathname = "/"
+
+ superblock = vfs['mnt']['mnt_sb']
+ fstype = superblock['s_type']['name'].string()
+ s_flags = int(superblock['s_flags'])
+ m_flags = int(vfs['mnt']['mnt_flags'])
+ rd = "ro" if (s_flags & constants.LX_MS_RDONLY) else "rw"
+
+ gdb.write(
+ "{} {} {} {}{}{} 0 0\n"
+ .format(devname,
+ pathname,
+ fstype,
+ rd,
+ info_opts(FS_INFO, s_flags),
+ info_opts(MNT_INFO, m_flags)))
+
+LxMounts()
diff --git a/scripts/gdb/linux/radixtree.py b/scripts/gdb/linux/radixtree.py
new file mode 100644
index 000000000000..0fdef4e2971a
--- /dev/null
+++ b/scripts/gdb/linux/radixtree.py
@@ -0,0 +1,97 @@
+#
+# gdb helper commands and functions for Linux kernel debugging
+#
+# Radix Tree Parser
+#
+# Copyright (c) 2016 Linaro Ltd
+#
+# Authors:
+# Kieran Bingham <kieran.bingham@linaro.org>
+#
+# This work is licensed under the terms of the GNU GPL version 2.
+#
+
+import gdb
+
+from linux import utils
+from linux import constants
+
+radix_tree_root_type = utils.CachedType("struct radix_tree_root")
+radix_tree_node_type = utils.CachedType("struct radix_tree_node")
+
+
+def is_indirect_ptr(node):
+ long_type = utils.get_long_type()
+ return (node.cast(long_type) & constants.LX_RADIX_TREE_INDIRECT_PTR)
+
+
+def indirect_to_ptr(node):
+ long_type = utils.get_long_type()
+ node_type = node.type
+ indirect_ptr = node.cast(long_type) & ~constants.LX_RADIX_TREE_INDIRECT_PTR
+ return indirect_ptr.cast(node_type)
+
+
+def maxindex(height):
+ height = height & constants.LX_RADIX_TREE_HEIGHT_MASK
+ return gdb.parse_and_eval("height_to_maxindex["+str(height)+"]")
+
+
+def lookup(root, index):
+ if root.type == radix_tree_root_type.get_type().pointer():
+ root = root.dereference()
+ elif root.type != radix_tree_root_type.get_type():
+ raise gdb.GdbError("Must be struct radix_tree_root not {}"
+ .format(root.type))
+
+ node = root['rnode']
+ if node == 0:
+ return None
+
+ if not (is_indirect_ptr(node)):
+ if (index > 0):
+ return None
+ return node
+
+ node = indirect_to_ptr(node)
+
+ height = node['path'] & constants.LX_RADIX_TREE_HEIGHT_MASK
+ if (index > maxindex(height)):
+ return None
+
+ shift = (height-1) * constants.LX_RADIX_TREE_MAP_SHIFT
+
+ while True:
+ new_index = (index >> shift) & constants.LX_RADIX_TREE_MAP_MASK
+ slot = node['slots'][new_index]
+
+ node = slot.cast(node.type.pointer()).dereference()
+ if node == 0:
+ return None
+
+ shift -= constants.LX_RADIX_TREE_MAP_SHIFT
+ height -= 1
+
+ if (height <= 0):
+ break
+
+ return node
+
+
+class LxRadixTree(gdb.Function):
+ """ Lookup and return a node from a RadixTree.
+
+$lx_radix_tree_lookup(root_node [, index]): Return the node at the given index.
+If index is omitted, the root node is dereferenced and returned."""
+
+ def __init__(self):
+ super(LxRadixTree, self).__init__("lx_radix_tree_lookup")
+
+ def invoke(self, root, index=0):
+ result = lookup(root, index)
+ if result is None:
+ raise gdb.GdbError("No entry in tree at index {}".format(index))
+
+ return result
+
+LxRadixTree()
diff --git a/scripts/gdb/linux/tasks.py b/scripts/gdb/linux/tasks.py
index 862a4ae24d49..1bf949c43b76 100644
--- a/scripts/gdb/linux/tasks.py
+++ b/scripts/gdb/linux/tasks.py
@@ -114,3 +114,22 @@ variable."""
LxThreadInfoFunc()
+
+
+class LxThreadInfoByPidFunc (gdb.Function):
+ """Calculate Linux thread_info from task variable found by pid
+
+$lx_thread_info_by_pid(PID): Given PID, return the corresponding thread_info
+variable."""
+
+ def __init__(self):
+ super(LxThreadInfoByPidFunc, self).__init__("lx_thread_info_by_pid")
+
+ def invoke(self, pid):
+ task = get_task_by_pid(pid)
+ if task:
+ return get_thread_info(task.dereference())
+ else:
+ raise gdb.GdbError("No task of PID " + str(pid))
+
+LxThreadInfoByPidFunc()
diff --git a/scripts/gdb/linux/utils.py b/scripts/gdb/linux/utils.py
index 0893b326a28b..50805874cfc3 100644
--- a/scripts/gdb/linux/utils.py
+++ b/scripts/gdb/linux/utils.py
@@ -87,11 +87,24 @@ def get_target_endianness():
return target_endianness
+def read_memoryview(inf, start, length):
+ return memoryview(inf.read_memory(start, length))
+
+
def read_u16(buffer):
+ value = [0, 0]
+
+ if type(buffer[0]) is str:
+ value[0] = ord(buffer[0])
+ value[1] = ord(buffer[1])
+ else:
+ value[0] = buffer[0]
+ value[1] = buffer[1]
+
if get_target_endianness() == LITTLE_ENDIAN:
- return ord(buffer[0]) + (ord(buffer[1]) << 8)
+ return value[0] + (value[1] << 8)
else:
- return ord(buffer[1]) + (ord(buffer[0]) << 8)
+ return value[1] + (value[0] << 8)
def read_u32(buffer):
@@ -154,3 +167,18 @@ def get_gdbserver_type():
if gdbserver_type is not None and hasattr(gdb, 'events'):
gdb.events.exited.connect(exit_handler)
return gdbserver_type
+
+
+def gdb_eval_or_none(expression):
+ try:
+ return gdb.parse_and_eval(expression)
+ except:
+ return None
+
+
+def dentry_name(d):
+ parent = d['d_parent']
+ if parent == d or parent == 0:
+ return ""
+ p = dentry_name(d['d_parent']) + "/"
+ return p + d['d_iname'].string()
diff --git a/scripts/gdb/vmlinux-gdb.py b/scripts/gdb/vmlinux-gdb.py
index d5943eca19cd..3a80ad6eecad 100644
--- a/scripts/gdb/vmlinux-gdb.py
+++ b/scripts/gdb/vmlinux-gdb.py
@@ -30,3 +30,5 @@ else:
import linux.cpus
import linux.lists
import linux.proc
+ import linux.constants
+ import linux.radixtree
diff --git a/scripts/spelling.txt b/scripts/spelling.txt
index 946caf3bd694..fa79c6d2a5b8 100644
--- a/scripts/spelling.txt
+++ b/scripts/spelling.txt
@@ -428,6 +428,7 @@ feautures||features
fetaure||feature
fetaures||features
fileystem||filesystem
+fimware||firmware
finanize||finalize
findn||find
finilizes||finalizes
diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c
index 3cd0a58672dd..0f887a564a29 100644
--- a/security/integrity/ima/ima_policy.c
+++ b/security/integrity/ima/ima_policy.c
@@ -972,7 +972,7 @@ static void policy_func_show(struct seq_file *m, enum ima_hooks func)
int ima_policy_show(struct seq_file *m, void *v)
{
struct ima_rule_entry *entry = v;
- int i = 0;
+ int i;
char tbuf[64] = {0,};
rcu_read_lock();
@@ -1012,17 +1012,7 @@ int ima_policy_show(struct seq_file *m, void *v)
}
if (entry->flags & IMA_FSUUID) {
- seq_puts(m, "fsuuid=");
- for (i = 0; i < ARRAY_SIZE(entry->fsuuid); ++i) {
- switch (i) {
- case 4:
- case 6:
- case 8:
- case 10:
- seq_puts(m, "-");
- }
- seq_printf(m, "%x", entry->fsuuid[i]);
- }
+ seq_printf(m, "fsuuid=%pU", entry->fsuuid);
seq_puts(m, " ");
}
diff --git a/tools/testing/radix-tree/Makefile b/tools/testing/radix-tree/Makefile
index 604212db9d4b..3b530467148e 100644
--- a/tools/testing/radix-tree/Makefile
+++ b/tools/testing/radix-tree/Makefile
@@ -3,7 +3,7 @@ CFLAGS += -I. -g -Wall -D_LGPL_SOURCE
LDFLAGS += -lpthread -lurcu
TARGETS = main
OFILES = main.o radix-tree.o linux.o test.o tag_check.o find_next_bit.o \
- regression1.o regression2.o regression3.o
+ regression1.o regression2.o regression3.o multiorder.o
targets: $(TARGETS)
@@ -13,7 +13,7 @@ main: $(OFILES)
clean:
$(RM) -f $(TARGETS) *.o radix-tree.c
-$(OFILES): *.h */*.h
+$(OFILES): *.h */*.h ../../../include/linux/radix-tree.h ../../include/linux/*.h
radix-tree.c: ../../../lib/radix-tree.c
sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@
diff --git a/tools/testing/radix-tree/generated/autoconf.h b/tools/testing/radix-tree/generated/autoconf.h
new file mode 100644
index 000000000000..ad18cf5a2a3a
--- /dev/null
+++ b/tools/testing/radix-tree/generated/autoconf.h
@@ -0,0 +1,3 @@
+#define CONFIG_RADIX_TREE_MULTIORDER 1
+#define CONFIG_SHMEM 1
+#define CONFIG_SWAP 1
diff --git a/tools/testing/radix-tree/linux/init.h b/tools/testing/radix-tree/linux/init.h
new file mode 100644
index 000000000000..360cabb3c4e7
--- /dev/null
+++ b/tools/testing/radix-tree/linux/init.h
@@ -0,0 +1 @@
+/* An empty file stub that allows radix-tree.c to compile. */
diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h
index ae013b0160ac..be98a47b4e1b 100644
--- a/tools/testing/radix-tree/linux/kernel.h
+++ b/tools/testing/radix-tree/linux/kernel.h
@@ -7,19 +7,28 @@
#include <stddef.h>
#include <limits.h>
+#include "../../include/linux/compiler.h"
+#include "../../../include/linux/kconfig.h"
+
+#define RADIX_TREE_MAP_SHIFT 3
+
#ifndef NULL
#define NULL 0
#endif
#define BUG_ON(expr) assert(!(expr))
+#define WARN_ON(expr) assert(!(expr))
#define __init
#define __must_check
#define panic(expr)
#define printk printf
#define __force
-#define likely(c) (c)
-#define unlikely(c) (c)
#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
+#define pr_debug printk
+
+#define smp_rmb() barrier()
+#define smp_wmb() barrier()
+#define cpu_relax() barrier()
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
@@ -28,6 +37,8 @@
(type *)( (char *)__mptr - offsetof(type, member) );})
#define min(a, b) ((a) < (b) ? (a) : (b))
+#define cond_resched() sched_yield()
+
static inline int in_interrupt(void)
{
return 0;
diff --git a/tools/testing/radix-tree/linux/slab.h b/tools/testing/radix-tree/linux/slab.h
index 57282506c21d..6d5a34770fd4 100644
--- a/tools/testing/radix-tree/linux/slab.h
+++ b/tools/testing/radix-tree/linux/slab.h
@@ -3,7 +3,6 @@
#include <linux/types.h>
-#define GFP_KERNEL 1
#define SLAB_HWCACHE_ALIGN 1
#define SLAB_PANIC 2
#define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */
diff --git a/tools/testing/radix-tree/linux/types.h b/tools/testing/radix-tree/linux/types.h
index 72a9d85f6c76..faa0b6ff9ca8 100644
--- a/tools/testing/radix-tree/linux/types.h
+++ b/tools/testing/radix-tree/linux/types.h
@@ -1,15 +1,13 @@
#ifndef _TYPES_H
#define _TYPES_H
+#include "../../include/linux/types.h"
+
#define __rcu
#define __read_mostly
#define BITS_PER_LONG (sizeof(long) * 8)
-struct list_head {
- struct list_head *next, *prev;
-};
-
static inline void INIT_LIST_HEAD(struct list_head *list)
{
list->next = list;
@@ -22,7 +20,6 @@ typedef struct {
#define uninitialized_var(x) x = x
-typedef unsigned gfp_t;
#include <linux/gfp.h>
#endif
diff --git a/tools/testing/radix-tree/main.c b/tools/testing/radix-tree/main.c
index 0e83cad27a9f..b7619ff3b552 100644
--- a/tools/testing/radix-tree/main.c
+++ b/tools/testing/radix-tree/main.c
@@ -61,11 +61,11 @@ void __big_gang_check(void)
} while (!wrapped);
}
-void big_gang_check(void)
+void big_gang_check(bool long_run)
{
int i;
- for (i = 0; i < 1000; i++) {
+ for (i = 0; i < (long_run ? 1000 : 3); i++) {
__big_gang_check();
srand(time(0));
printf("%d ", i);
@@ -232,10 +232,72 @@ void copy_tag_check(void)
item_kill_tree(&tree);
}
-static void single_thread_tests(void)
+static void __locate_check(struct radix_tree_root *tree, unsigned long index,
+ unsigned order)
+{
+ struct item *item;
+ unsigned long index2;
+
+ item_insert_order(tree, index, order);
+ item = item_lookup(tree, index);
+ index2 = radix_tree_locate_item(tree, item);
+ if (index != index2) {
+ printf("index %ld order %d inserted; found %ld\n",
+ index, order, index2);
+ abort();
+ }
+}
+
+static void __order_0_locate_check(void)
+{
+ RADIX_TREE(tree, GFP_KERNEL);
+ int i;
+
+ for (i = 0; i < 50; i++)
+ __locate_check(&tree, rand() % INT_MAX, 0);
+
+ item_kill_tree(&tree);
+}
+
+static void locate_check(void)
+{
+ RADIX_TREE(tree, GFP_KERNEL);
+ unsigned order;
+ unsigned long offset, index;
+
+ __order_0_locate_check();
+
+ for (order = 0; order < 20; order++) {
+ for (offset = 0; offset < (1 << (order + 3));
+ offset += (1UL << order)) {
+ for (index = 0; index < (1UL << (order + 5));
+ index += (1UL << order)) {
+ __locate_check(&tree, index + offset, order);
+ }
+ if (radix_tree_locate_item(&tree, &tree) != -1)
+ abort();
+
+ item_kill_tree(&tree);
+ }
+ }
+
+ if (radix_tree_locate_item(&tree, &tree) != -1)
+ abort();
+ __locate_check(&tree, -1, 0);
+ if (radix_tree_locate_item(&tree, &tree) != -1)
+ abort();
+ item_kill_tree(&tree);
+}
+
+static void single_thread_tests(bool long_run)
{
int i;
+ printf("starting single_thread_tests: %d allocated\n", nr_allocated);
+ multiorder_checks();
+ printf("after multiorder_check: %d allocated\n", nr_allocated);
+ locate_check();
+ printf("after locate_check: %d allocated\n", nr_allocated);
tag_check();
printf("after tag_check: %d allocated\n", nr_allocated);
gang_check();
@@ -244,9 +306,9 @@ static void single_thread_tests(void)
printf("after add_and_check: %d allocated\n", nr_allocated);
dynamic_height_check();
printf("after dynamic_height_check: %d allocated\n", nr_allocated);
- big_gang_check();
+ big_gang_check(long_run);
printf("after big_gang_check: %d allocated\n", nr_allocated);
- for (i = 0; i < 2000; i++) {
+ for (i = 0; i < (long_run ? 2000 : 3); i++) {
copy_tag_check();
printf("%d ", i);
fflush(stdout);
@@ -254,15 +316,23 @@ static void single_thread_tests(void)
printf("after copy_tag_check: %d allocated\n", nr_allocated);
}
-int main(void)
+int main(int argc, char **argv)
{
+ bool long_run = false;
+ int opt;
+
+ while ((opt = getopt(argc, argv, "l")) != -1) {
+ if (opt == 'l')
+ long_run = true;
+ }
+
rcu_register_thread();
radix_tree_init();
regression1_test();
regression2_test();
regression3_test();
- single_thread_tests();
+ single_thread_tests(long_run);
sleep(1);
printf("after sleep(1): %d allocated\n", nr_allocated);
diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c
new file mode 100644
index 000000000000..39d9b9568fe2
--- /dev/null
+++ b/tools/testing/radix-tree/multiorder.c
@@ -0,0 +1,337 @@
+/*
+ * multiorder.c: Multi-order radix tree entry testing
+ * Copyright (c) 2016 Intel Corporation
+ * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
+ * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+#include <linux/radix-tree.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+
+#include "test.h"
+
+#define for_each_index(i, base, order) \
+ for (i = base; i < base + (1 << order); i++)
+
+static void __multiorder_tag_test(int index, int order)
+{
+ RADIX_TREE(tree, GFP_KERNEL);
+ int base, err, i;
+ unsigned long first = 0;
+
+ /* our canonical entry */
+ base = index & ~((1 << order) - 1);
+
+ printf("Multiorder tag test with index %d, canonical entry %d\n",
+ index, base);
+
+ err = item_insert_order(&tree, index, order);
+ assert(!err);
+
+ /*
+ * Verify we get collisions for covered indices. We try and fail to
+ * insert an exceptional entry so we don't leak memory via
+ * item_insert_order().
+ */
+ for_each_index(i, base, order) {
+ err = __radix_tree_insert(&tree, i, order,
+ (void *)(0xA0 | RADIX_TREE_EXCEPTIONAL_ENTRY));
+ assert(err == -EEXIST);
+ }
+
+ for_each_index(i, base, order) {
+ assert(!radix_tree_tag_get(&tree, i, 0));
+ assert(!radix_tree_tag_get(&tree, i, 1));
+ }
+
+ assert(radix_tree_tag_set(&tree, index, 0));
+
+ for_each_index(i, base, order) {
+ assert(radix_tree_tag_get(&tree, i, 0));
+ assert(!radix_tree_tag_get(&tree, i, 1));
+ }
+
+ assert(radix_tree_range_tag_if_tagged(&tree, &first, ~0UL, 10, 0, 1) == 1);
+ assert(radix_tree_tag_clear(&tree, index, 0));
+
+ for_each_index(i, base, order) {
+ assert(!radix_tree_tag_get(&tree, i, 0));
+ assert(radix_tree_tag_get(&tree, i, 1));
+ }
+
+ assert(radix_tree_tag_clear(&tree, index, 1));
+
+ assert(!radix_tree_tagged(&tree, 0));
+ assert(!radix_tree_tagged(&tree, 1));
+
+ item_kill_tree(&tree);
+}
+
+static void multiorder_tag_tests(void)
+{
+ /* test multi-order entry for indices 0-7 with no sibling pointers */
+ __multiorder_tag_test(0, 3);
+ __multiorder_tag_test(5, 3);
+
+ /* test multi-order entry for indices 8-15 with no sibling pointers */
+ __multiorder_tag_test(8, 3);
+ __multiorder_tag_test(15, 3);
+
+ /*
+ * Our order 5 entry covers indices 0-31 in a tree with height=2.
+ * This is broken up as follows:
+ * 0-7: canonical entry
+ * 8-15: sibling 1
+ * 16-23: sibling 2
+ * 24-31: sibling 3
+ */
+ __multiorder_tag_test(0, 5);
+ __multiorder_tag_test(29, 5);
+
+ /* same test, but with indices 32-63 */
+ __multiorder_tag_test(32, 5);
+ __multiorder_tag_test(44, 5);
+
+ /*
+ * Our order 8 entry covers indices 0-255 in a tree with height=3.
+ * This is broken up as follows:
+ * 0-63: canonical entry
+ * 64-127: sibling 1
+ * 128-191: sibling 2
+ * 192-255: sibling 3
+ */
+ __multiorder_tag_test(0, 8);
+ __multiorder_tag_test(190, 8);
+
+ /* same test, but with indices 256-511 */
+ __multiorder_tag_test(256, 8);
+ __multiorder_tag_test(300, 8);
+
+ __multiorder_tag_test(0x12345678UL, 8);
+}
+
+static void multiorder_check(unsigned long index, int order)
+{
+ unsigned long i;
+ unsigned long min = index & ~((1UL << order) - 1);
+ unsigned long max = min + (1UL << order);
+ RADIX_TREE(tree, GFP_KERNEL);
+
+ printf("Multiorder index %ld, order %d\n", index, order);
+
+ assert(item_insert_order(&tree, index, order) == 0);
+
+ for (i = min; i < max; i++) {
+ struct item *item = item_lookup(&tree, i);
+ assert(item != 0);
+ assert(item->index == index);
+ }
+ for (i = 0; i < min; i++)
+ item_check_absent(&tree, i);
+ for (i = max; i < 2*max; i++)
+ item_check_absent(&tree, i);
+ for (i = min; i < max; i++) {
+ static void *entry = (void *)
+ (0xA0 | RADIX_TREE_EXCEPTIONAL_ENTRY);
+ assert(radix_tree_insert(&tree, i, entry) == -EEXIST);
+ }
+
+ assert(item_delete(&tree, index) != 0);
+
+ for (i = 0; i < 2*max; i++)
+ item_check_absent(&tree, i);
+}
+
+static void multiorder_shrink(unsigned long index, int order)
+{
+ unsigned long i;
+ unsigned long max = 1 << order;
+ RADIX_TREE(tree, GFP_KERNEL);
+ struct radix_tree_node *node;
+
+ printf("Multiorder shrink index %ld, order %d\n", index, order);
+
+ assert(item_insert_order(&tree, 0, order) == 0);
+
+ node = tree.rnode;
+
+ assert(item_insert(&tree, index) == 0);
+ assert(node != tree.rnode);
+
+ assert(item_delete(&tree, index) != 0);
+ assert(node == tree.rnode);
+
+ for (i = 0; i < max; i++) {
+ struct item *item = item_lookup(&tree, i);
+ assert(item != 0);
+ assert(item->index == 0);
+ }
+ for (i = max; i < 2*max; i++)
+ item_check_absent(&tree, i);
+
+ if (!item_delete(&tree, 0)) {
+ printf("failed to delete index %ld (order %d)\n", index, order); abort();
+ }
+
+ for (i = 0; i < 2*max; i++)
+ item_check_absent(&tree, i);
+}
+
+static void multiorder_insert_bug(void)
+{
+ RADIX_TREE(tree, GFP_KERNEL);
+
+ item_insert(&tree, 0);
+ radix_tree_tag_set(&tree, 0, 0);
+ item_insert_order(&tree, 3 << 6, 6);
+
+ item_kill_tree(&tree);
+}
+
+void multiorder_iteration(void)
+{
+ RADIX_TREE(tree, GFP_KERNEL);
+ struct radix_tree_iter iter;
+ void **slot;
+ int i, j, err;
+
+ printf("Multiorder iteration test\n");
+
+#define NUM_ENTRIES 11
+ int index[NUM_ENTRIES] = {0, 2, 4, 8, 16, 32, 34, 36, 64, 72, 128};
+ int order[NUM_ENTRIES] = {1, 1, 2, 3, 4, 1, 0, 1, 3, 0, 7};
+
+ for (i = 0; i < NUM_ENTRIES; i++) {
+ err = item_insert_order(&tree, index[i], order[i]);
+ assert(!err);
+ }
+
+ for (j = 0; j < 256; j++) {
+ for (i = 0; i < NUM_ENTRIES; i++)
+ if (j <= (index[i] | ((1 << order[i]) - 1)))
+ break;
+
+ radix_tree_for_each_slot(slot, &tree, &iter, j) {
+ int height = order[i] / RADIX_TREE_MAP_SHIFT;
+ int shift = height * RADIX_TREE_MAP_SHIFT;
+ int mask = (1 << order[i]) - 1;
+
+ assert(iter.index >= (index[i] &~ mask));
+ assert(iter.index <= (index[i] | mask));
+ assert(iter.shift == shift);
+ i++;
+ }
+ }
+
+ item_kill_tree(&tree);
+}
+
+void multiorder_tagged_iteration(void)
+{
+ RADIX_TREE(tree, GFP_KERNEL);
+ struct radix_tree_iter iter;
+ void **slot;
+ unsigned long first = 0;
+ int i, j;
+
+ printf("Multiorder tagged iteration test\n");
+
+#define MT_NUM_ENTRIES 9
+ int index[MT_NUM_ENTRIES] = {0, 2, 4, 16, 32, 40, 64, 72, 128};
+ int order[MT_NUM_ENTRIES] = {1, 0, 2, 4, 3, 1, 3, 0, 7};
+
+#define TAG_ENTRIES 7
+ int tag_index[TAG_ENTRIES] = {0, 4, 16, 40, 64, 72, 128};
+
+ for (i = 0; i < MT_NUM_ENTRIES; i++)
+ assert(!item_insert_order(&tree, index[i], order[i]));
+
+ assert(!radix_tree_tagged(&tree, 1));
+
+ for (i = 0; i < TAG_ENTRIES; i++)
+ assert(radix_tree_tag_set(&tree, tag_index[i], 1));
+
+ for (j = 0; j < 256; j++) {
+ int mask, k;
+
+ for (i = 0; i < TAG_ENTRIES; i++) {
+ for (k = i; index[k] < tag_index[i]; k++)
+ ;
+ if (j <= (index[k] | ((1 << order[k]) - 1)))
+ break;
+ }
+
+ radix_tree_for_each_tagged(slot, &tree, &iter, j, 1) {
+ for (k = i; index[k] < tag_index[i]; k++)
+ ;
+ mask = (1 << order[k]) - 1;
+
+ assert(iter.index >= (tag_index[i] &~ mask));
+ assert(iter.index <= (tag_index[i] | mask));
+ i++;
+ }
+ }
+
+ radix_tree_range_tag_if_tagged(&tree, &first, ~0UL,
+ MT_NUM_ENTRIES, 1, 2);
+
+ for (j = 0; j < 256; j++) {
+ int mask, k;
+
+ for (i = 0; i < TAG_ENTRIES; i++) {
+ for (k = i; index[k] < tag_index[i]; k++)
+ ;
+ if (j <= (index[k] | ((1 << order[k]) - 1)))
+ break;
+ }
+
+ radix_tree_for_each_tagged(slot, &tree, &iter, j, 2) {
+ for (k = i; index[k] < tag_index[i]; k++)
+ ;
+ mask = (1 << order[k]) - 1;
+
+ assert(iter.index >= (tag_index[i] &~ mask));
+ assert(iter.index <= (tag_index[i] | mask));
+ i++;
+ }
+ }
+
+ first = 1;
+ radix_tree_range_tag_if_tagged(&tree, &first, ~0UL,
+ MT_NUM_ENTRIES, 1, 0);
+ i = 0;
+ radix_tree_for_each_tagged(slot, &tree, &iter, 0, 0) {
+ assert(iter.index == tag_index[i]);
+ i++;
+ }
+
+ item_kill_tree(&tree);
+}
+
+void multiorder_checks(void)
+{
+ int i;
+
+ for (i = 0; i < 20; i++) {
+ multiorder_check(200, i);
+ multiorder_check(0, i);
+ multiorder_check((1UL << i) + 1, i);
+ }
+
+ for (i = 0; i < 15; i++)
+ multiorder_shrink((1UL << (i + RADIX_TREE_MAP_SHIFT)), i);
+
+ multiorder_insert_bug();
+ multiorder_tag_tests();
+ multiorder_iteration();
+ multiorder_tagged_iteration();
+}
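
A quick illustrative check (a Python sketch, outside the patch itself) of the
canonical-base arithmetic that the sibling-entry comments in
multiorder_tag_tests() above describe; it mirrors
base = index & ~((1 << order) - 1) from __multiorder_tag_test():

    def canonical_base(index, order):
        # lowest index covered by a multi-order entry of the given order
        return index & ~((1 << order) - 1)

    assert canonical_base(29, 5) == 0      # order-5 entry at 29 covers 0-31
    assert canonical_base(44, 5) == 32     # order-5 entry at 44 covers 32-63
    assert canonical_base(190, 8) == 0     # order-8 entry at 190 covers 0-255
    assert canonical_base(300, 8) == 256   # order-8 entry at 300 covers 256-511
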
diff --git a/tools/testing/radix-tree/regression2.c b/tools/testing/radix-tree/regression2.c
index 5d2fa28cdca3..63bf347aaf33 100644
--- a/tools/testing/radix-tree/regression2.c
+++ b/tools/testing/radix-tree/regression2.c
@@ -51,13 +51,6 @@
#include "regression.h"
-#ifdef __KERNEL__
-#define RADIX_TREE_MAP_SHIFT (CONFIG_BASE_SMALL ? 4 : 6)
-#else
-#define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */
-#endif
-
-#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT)
#define PAGECACHE_TAG_DIRTY 0
#define PAGECACHE_TAG_WRITEBACK 1
#define PAGECACHE_TAG_TOWRITE 2
diff --git a/tools/testing/radix-tree/tag_check.c b/tools/testing/radix-tree/tag_check.c
index 83136be552a0..b7447ceb75e9 100644
--- a/tools/testing/radix-tree/tag_check.c
+++ b/tools/testing/radix-tree/tag_check.c
@@ -12,6 +12,7 @@
static void
__simple_checks(struct radix_tree_root *tree, unsigned long index, int tag)
{
+ unsigned long first = 0;
int ret;
item_check_absent(tree, index);
@@ -22,6 +23,10 @@ __simple_checks(struct radix_tree_root *tree, unsigned long index, int tag)
item_tag_set(tree, index, tag);
ret = item_tag_get(tree, index, tag);
assert(ret != 0);
+ ret = radix_tree_range_tag_if_tagged(tree, &first, ~0UL, 10, tag, !tag);
+ assert(ret == 1);
+ ret = item_tag_get(tree, index, !tag);
+ assert(ret != 0);
ret = item_delete(tree, index);
assert(ret != 0);
item_insert(tree, index);
@@ -304,6 +309,7 @@ static void single_check(void)
struct item *items[BATCH];
RADIX_TREE(tree, GFP_KERNEL);
int ret;
+ unsigned long first = 0;
item_insert(&tree, 0);
item_tag_set(&tree, 0, 0);
@@ -313,6 +319,10 @@ static void single_check(void)
assert(ret == 0);
verify_tag_consistency(&tree, 0);
verify_tag_consistency(&tree, 1);
+ ret = radix_tree_range_tag_if_tagged(&tree, &first, 10, 10, 0, 1);
+ assert(ret == 1);
+ ret = radix_tree_gang_lookup_tag(&tree, (void **)items, 0, BATCH, 1);
+ assert(ret == 1);
item_kill_tree(&tree);
}
diff --git a/tools/testing/radix-tree/test.c b/tools/testing/radix-tree/test.c
index 2bebf34cdc27..a6e8099eaf4f 100644
--- a/tools/testing/radix-tree/test.c
+++ b/tools/testing/radix-tree/test.c
@@ -24,14 +24,21 @@ int item_tag_get(struct radix_tree_root *root, unsigned long index, int tag)
return radix_tree_tag_get(root, index, tag);
}
-int __item_insert(struct radix_tree_root *root, struct item *item)
+int __item_insert(struct radix_tree_root *root, struct item *item,
+ unsigned order)
{
- return radix_tree_insert(root, item->index, item);
+ return __radix_tree_insert(root, item->index, order, item);
}
int item_insert(struct radix_tree_root *root, unsigned long index)
{
- return __item_insert(root, item_create(index));
+ return __item_insert(root, item_create(index), 0);
+}
+
+int item_insert_order(struct radix_tree_root *root, unsigned long index,
+ unsigned order)
+{
+ return __item_insert(root, item_create(index), order);
}
int item_delete(struct radix_tree_root *root, unsigned long index)
@@ -136,13 +143,13 @@ void item_full_scan(struct radix_tree_root *root, unsigned long start,
}
static int verify_node(struct radix_tree_node *slot, unsigned int tag,
- unsigned int height, int tagged)
+ int tagged)
{
int anyset = 0;
int i;
int j;
- slot = indirect_to_ptr(slot);
+ slot = entry_to_node(slot);
/* Verify consistency at this level */
for (i = 0; i < RADIX_TREE_TAG_LONGS; i++) {
@@ -152,7 +159,8 @@ static int verify_node(struct radix_tree_node *slot, unsigned int tag,
}
}
if (tagged != anyset) {
- printf("tag: %u, height %u, tagged: %d, anyset: %d\n", tag, height, tagged, anyset);
+ printf("tag: %u, shift %u, tagged: %d, anyset: %d\n",
+ tag, slot->shift, tagged, anyset);
for (j = 0; j < RADIX_TREE_MAX_TAGS; j++) {
printf("tag %d: ", j);
for (i = 0; i < RADIX_TREE_TAG_LONGS; i++)
@@ -164,10 +172,10 @@ static int verify_node(struct radix_tree_node *slot, unsigned int tag,
assert(tagged == anyset);
/* Go for next level */
- if (height > 1) {
+ if (slot->shift > 0) {
for (i = 0; i < RADIX_TREE_MAP_SIZE; i++)
if (slot->slots[i])
- if (verify_node(slot->slots[i], tag, height - 1,
+ if (verify_node(slot->slots[i], tag,
!!test_bit(i, slot->tags[tag]))) {
printf("Failure at off %d\n", i);
for (j = 0; j < RADIX_TREE_MAX_TAGS; j++) {
@@ -184,9 +192,10 @@ static int verify_node(struct radix_tree_node *slot, unsigned int tag,
void verify_tag_consistency(struct radix_tree_root *root, unsigned int tag)
{
- if (!root->height)
+ struct radix_tree_node *node = root->rnode;
+ if (!radix_tree_is_internal_node(node))
return;
- verify_node(root->rnode, tag, root->height, !!root_tag_get(root, tag));
+ verify_node(node, tag, !!root_tag_get(root, tag));
}
void item_kill_tree(struct radix_tree_root *root)
@@ -211,9 +220,19 @@ void item_kill_tree(struct radix_tree_root *root)
void tree_verify_min_height(struct radix_tree_root *root, int maxindex)
{
- assert(radix_tree_maxindex(root->height) >= maxindex);
- if (root->height > 1)
- assert(radix_tree_maxindex(root->height-1) < maxindex);
- else if (root->height == 1)
- assert(radix_tree_maxindex(root->height-1) <= maxindex);
+ unsigned shift;
+ struct radix_tree_node *node = root->rnode;
+ if (!radix_tree_is_internal_node(node)) {
+ assert(maxindex == 0);
+ return;
+ }
+
+ node = entry_to_node(node);
+ assert(maxindex <= node_maxindex(node));
+
+ shift = node->shift;
+ if (shift > 0)
+ assert(maxindex > shift_maxindex(shift - RADIX_TREE_MAP_SHIFT));
+ else
+ assert(maxindex > 0);
}
diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h
index 4e1d95faaa94..e85131369723 100644
--- a/tools/testing/radix-tree/test.h
+++ b/tools/testing/radix-tree/test.h
@@ -8,8 +8,11 @@ struct item {
};
struct item *item_create(unsigned long index);
-int __item_insert(struct radix_tree_root *root, struct item *item);
+int __item_insert(struct radix_tree_root *root, struct item *item,
+ unsigned order);
int item_insert(struct radix_tree_root *root, unsigned long index);
+int item_insert_order(struct radix_tree_root *root, unsigned long index,
+ unsigned order);
int item_delete(struct radix_tree_root *root, unsigned long index);
struct item *item_lookup(struct radix_tree_root *root, unsigned long index);
@@ -23,6 +26,7 @@ void item_full_scan(struct radix_tree_root *root, unsigned long start,
void item_kill_tree(struct radix_tree_root *root);
void tag_check(void);
+void multiorder_checks(void);
struct item *
item_tag_set(struct radix_tree_root *root, unsigned long index, int tag);
@@ -35,6 +39,7 @@ void verify_tag_consistency(struct radix_tree_root *root, unsigned int tag);
extern int nr_allocated;
/* Normally private parts of lib/radix-tree.c */
-void *indirect_to_ptr(void *ptr);
+void radix_tree_dump(struct radix_tree_root *root);
int root_tag_get(struct radix_tree_root *root, unsigned int tag);
-unsigned long radix_tree_maxindex(unsigned int height);
+unsigned long node_maxindex(struct radix_tree_node *);
+unsigned long shift_maxindex(unsigned int shift);
diff --git a/tools/testing/selftests/rcutorture/configs/lock/CFcommon b/tools/testing/selftests/rcutorture/configs/lock/CFcommon
index e372dc269254..779f984fb25c 100644
--- a/tools/testing/selftests/rcutorture/configs/lock/CFcommon
+++ b/tools/testing/selftests/rcutorture/configs/lock/CFcommon
@@ -1,2 +1,2 @@
CONFIG_LOCK_TORTURE_TEST=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon
index f824b4c9d9d9..7461db3a2422 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon
+++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon
@@ -1,5 +1,5 @@
CONFIG_RCU_TORTURE_TEST=y
-CONFIG_PRINTK_TIME=y
+CONFIG_PRINTK_TIME=1
CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP=y
CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y
CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT=y