summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/blockdev/ramdisk.txt21
-rw-r--r--Documentation/devicetree/bindings/rtc/haoyu,hym8563.txt27
-rw-r--r--Documentation/devicetree/bindings/rtc/maxim,ds1742.txt12
-rw-r--r--Documentation/devicetree/bindings/vendor-prefixes.txt1
-rw-r--r--Documentation/dynamic-debug-howto.txt9
-rw-r--r--Documentation/filesystems/00-INDEX58
-rw-r--r--Documentation/filesystems/nilfs2.txt56
-rw-r--r--Documentation/filesystems/proc.txt9
-rw-r--r--Documentation/filesystems/sysfs.txt6
-rw-r--r--Documentation/filesystems/vfat.txt10
-rw-r--r--Documentation/kernel-parameters.txt11
-rw-r--r--Documentation/leds/leds-class.txt3
-rw-r--r--Documentation/printk-formats.txt17
-rw-r--r--Documentation/sysctl/kernel.txt15
-rw-r--r--Documentation/sysctl/vm.txt12
-rw-r--r--Documentation/trace/postprocess/trace-vmscan-postprocess.pl18
-rw-r--r--Documentation/vm/locking130
-rw-r--r--Documentation/vm/overcommit-accounting7
-rw-r--r--MAINTAINERS11
-rw-r--r--arch/alpha/Kconfig8
-rw-r--r--arch/arc/Kconfig4
-rw-r--r--arch/arm/Kconfig12
-rw-r--r--arch/arm/include/asm/dma.h4
-rw-r--r--arch/arm/include/asm/fixmap.h29
-rw-r--r--arch/arm/kernel/devtree.c2
-rw-r--r--arch/arm/kernel/setup.c2
-rw-r--r--arch/arm/mach-omap2/omap_hwmod.c8
-rw-r--r--arch/arm/mm/init.c7
-rw-r--r--arch/hexagon/include/asm/fixmap.h40
-rw-r--r--arch/ia64/Kconfig1
-rw-r--r--arch/ia64/include/asm/dmi.h8
-rw-r--r--arch/ia64/include/asm/processor.h1
-rw-r--r--arch/ia64/mm/contig.c68
-rw-r--r--arch/ia64/mm/discontig.c63
-rw-r--r--arch/ia64/mm/init.c48
-rw-r--r--arch/m32r/Kconfig8
-rw-r--r--arch/metag/include/asm/fixmap.h32
-rw-r--r--arch/metag/mm/init.c3
-rw-r--r--arch/metag/mm/numa.c3
-rw-r--r--arch/microblaze/Kconfig1
-rw-r--r--arch/microblaze/include/asm/fixmap.h44
-rw-r--r--arch/microblaze/mm/init.c3
-rw-r--r--arch/mips/Kconfig10
-rw-r--r--arch/mips/include/asm/fixmap.h33
-rw-r--r--arch/mn10300/Kconfig8
-rw-r--r--arch/parisc/Kconfig8
-rw-r--r--arch/parisc/mm/init.c59
-rw-r--r--arch/powerpc/Kconfig4
-rw-r--r--arch/powerpc/include/asm/fixmap.h44
-rw-r--r--arch/powerpc/mm/mem.c2
-rw-r--r--arch/powerpc/mm/numa.c8
-rw-r--r--arch/s390/Kconfig4
-rw-r--r--arch/sh/Kconfig8
-rw-r--r--arch/sh/include/asm/fixmap.h39
-rw-r--r--arch/sh/kernel/dwarf.c18
-rw-r--r--arch/sh/kernel/setup.c4
-rw-r--r--arch/sparc/Kconfig4
-rw-r--r--arch/sparc/mm/init_64.c5
-rw-r--r--arch/tile/include/asm/fixmap.h33
-rw-r--r--arch/um/include/asm/fixmap.h40
-rw-r--r--arch/unicore32/mm/init.c3
-rw-r--r--arch/x86/Kconfig9
-rw-r--r--arch/x86/include/asm/dmi.h6
-rw-r--r--arch/x86/include/asm/fixmap.h59
-rw-r--r--arch/x86/include/asm/page_types.h4
-rw-r--r--arch/x86/include/asm/tlbflush.h6
-rw-r--r--arch/x86/kernel/check.c2
-rw-r--r--arch/x86/kernel/cpu/amd.c5
-rw-r--r--arch/x86/kernel/cpu/intel.c10
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c4
-rw-r--r--arch/x86/kernel/e820.c2
-rw-r--r--arch/x86/kernel/setup.c2
-rw-r--r--arch/x86/mm/gup.c8
-rw-r--r--arch/x86/mm/init_32.c2
-rw-r--r--arch/x86/mm/init_64.c2
-rw-r--r--arch/x86/mm/memtest.c2
-rw-r--r--arch/x86/mm/numa.c52
-rw-r--r--arch/x86/mm/srat.c21
-rw-r--r--arch/x86/mm/tlb.c52
-rw-r--r--block/cmdline-parser.c18
-rw-r--r--block/genhd.c2
-rw-r--r--drivers/block/Kconfig3
-rw-r--r--drivers/block/cciss.c4
-rw-r--r--drivers/block/loop.c2
-rw-r--r--drivers/block/paride/pg.c2
-rw-r--r--drivers/block/sx8.c16
-rw-r--r--drivers/char/mem.c1
-rw-r--r--drivers/firmware/Kconfig3
-rw-r--r--drivers/firmware/dmi_scan.c20
-rw-r--r--drivers/firmware/memmap.c2
-rw-r--r--drivers/gpu/drm/cirrus/cirrus_mode.c4
-rw-r--r--drivers/gpu/drm/drm_fb_helper.c8
-rw-r--r--drivers/gpu/drm/gma500/backlight.c4
-rw-r--r--drivers/gpu/drm/nouveau/nouveau_acpi.c6
-rw-r--r--drivers/input/Kconfig9
-rw-r--r--drivers/input/Makefile3
-rw-r--r--drivers/input/input.c6
-rw-r--r--drivers/input/leds.c249
-rw-r--r--drivers/iommu/intel-iommu.c2
-rw-r--r--drivers/leds/Kconfig3
-rw-r--r--drivers/mailbox/omap-mbox.h2
-rw-r--r--drivers/memstick/host/rtsx_pci_ms.c30
-rw-r--r--drivers/mfd/max8998.c2
-rw-r--r--drivers/mfd/tps65217.c2
-rw-r--r--drivers/misc/ti-st/st_core.c2
-rw-r--r--drivers/net/irda/donauboe.c15
-rw-r--r--drivers/pps/pps.c57
-rw-r--r--drivers/rtc/Kconfig13
-rw-r--r--drivers/rtc/Makefile1
-rw-r--r--drivers/rtc/class.c24
-rw-r--r--drivers/rtc/rtc-as3722.c19
-rw-r--r--drivers/rtc/rtc-cmos.c8
-rw-r--r--drivers/rtc/rtc-ds1305.c1
-rw-r--r--drivers/rtc/rtc-ds1742.c10
-rw-r--r--drivers/rtc/rtc-hym8563.c606
-rw-r--r--drivers/rtc/rtc-max8907.c11
-rw-r--r--drivers/rtc/rtc-mxc.c10
-rw-r--r--drivers/rtc/rtc-pcf2127.c5
-rw-r--r--drivers/rtc/rtc-s5m.c2
-rw-r--r--drivers/rtc/rtc-twl.c38
-rw-r--r--drivers/rtc/rtc-vr41xx.c50
-rw-r--r--drivers/scsi/megaraid/megaraid_mm.c2
-rw-r--r--drivers/tty/Kconfig4
-rw-r--r--drivers/tty/vt/keyboard.c110
-rw-r--r--drivers/video/aty/aty128fb.c4
-rw-r--r--drivers/video/backlight/backlight.c2
-rw-r--r--drivers/video/backlight/hp680_bl.c6
-rw-r--r--drivers/video/backlight/jornada720_bl.c15
-rw-r--r--drivers/video/backlight/jornada720_lcd.c13
-rw-r--r--drivers/video/backlight/kb3886_bl.c2
-rw-r--r--drivers/video/backlight/l4f00242t03.c6
-rw-r--r--drivers/video/backlight/lp855x_bl.c2
-rw-r--r--drivers/video/backlight/lp8788_bl.c6
-rw-r--r--drivers/video/backlight/omap1_bl.c14
-rw-r--r--drivers/video/backlight/ot200_bl.c9
-rw-r--r--drivers/video/backlight/tosa_bl.c7
-rw-r--r--drivers/video/backlight/tosa_lcd.c6
-rw-r--r--drivers/vlynq/vlynq.c3
-rw-r--r--drivers/w1/masters/w1-gpio.c22
-rw-r--r--drivers/w1/w1_int.c12
-rw-r--r--fs/afs/proc.c4
-rw-r--r--fs/autofs4/autofs_i.h4
-rw-r--r--fs/autofs4/dev-ioctl.c16
-rw-r--r--fs/autofs4/expire.c14
-rw-r--r--fs/autofs4/inode.c49
-rw-r--r--fs/autofs4/root.c6
-rw-r--r--fs/autofs4/symlink.c4
-rw-r--r--fs/autofs4/waitq.c16
-rw-r--r--fs/binfmt_elf.c24
-rw-r--r--fs/compat_ioctl.c3
-rw-r--r--fs/coredump.c1
-rw-r--r--fs/coredump.h6
-rw-r--r--fs/exec.c120
-rw-r--r--fs/ext3/dir.c44
-rw-r--r--fs/ext4/block_validity.c33
-rw-r--r--fs/ext4/dir.c35
-rw-r--r--fs/fat/cache.c19
-rw-r--r--fs/fat/fat.h6
-rw-r--r--fs/fat/file.c78
-rw-r--r--fs/fat/inode.c57
-rw-r--r--fs/hfsplus/inode.c59
-rw-r--r--fs/jffs2/fs.c9
-rw-r--r--fs/jffs2/nodelist.c28
-rw-r--r--fs/jffs2/readinode.c26
-rw-r--r--fs/logfs/segment.c3
-rw-r--r--fs/nilfs2/ioctl.c371
-rw-r--r--fs/notify/dnotify/dnotify.c34
-rw-r--r--fs/notify/fanotify/fanotify.c224
-rw-r--r--fs/notify/fanotify/fanotify.h23
-rw-r--r--fs/notify/fanotify/fanotify_user.c41
-rw-r--r--fs/notify/fsnotify.c42
-rw-r--r--fs/notify/group.c1
-rw-r--r--fs/notify/inotify/inotify.h21
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c149
-rw-r--r--fs/notify/inotify/inotify_user.c119
-rw-r--r--fs/notify/notification.c334
-rw-r--r--fs/ocfs2/Makefile1
-rw-r--r--fs/ocfs2/alloc.c50
-rw-r--r--fs/ocfs2/cluster/Makefile2
-rw-r--r--fs/ocfs2/cluster/nodemanager.c4
-rw-r--r--fs/ocfs2/cluster/tcp.c35
-rw-r--r--fs/ocfs2/cluster/ver.c42
-rw-r--r--fs/ocfs2/cluster/ver.h31
-rw-r--r--fs/ocfs2/dlm/Makefile2
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c5
-rw-r--r--fs/ocfs2/dlm/dlmver.c42
-rw-r--r--fs/ocfs2/dlm/dlmver.h31
-rw-r--r--fs/ocfs2/dlmfs/Makefile2
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c4
-rw-r--r--fs/ocfs2/dlmfs/dlmfsver.c42
-rw-r--r--fs/ocfs2/dlmfs/dlmfsver.h31
-rw-r--r--fs/ocfs2/dlmglue.c4
-rw-r--r--fs/ocfs2/file.c66
-rw-r--r--fs/ocfs2/ioctl.c7
-rw-r--r--fs/ocfs2/localalloc.c40
-rw-r--r--fs/ocfs2/localalloc.h6
-rw-r--r--fs/ocfs2/move_extents.c77
-rw-r--r--fs/ocfs2/namei.c122
-rw-r--r--fs/ocfs2/ocfs2.h1
-rw-r--r--fs/ocfs2/stack_o2cb.c3
-rw-r--r--fs/ocfs2/stack_user.c308
-rw-r--r--fs/ocfs2/stackglue.c16
-rw-r--r--fs/ocfs2/stackglue.h15
-rw-r--r--fs/ocfs2/suballoc.c12
-rw-r--r--fs/ocfs2/suballoc.h12
-rw-r--r--fs/ocfs2/super.c20
-rw-r--r--fs/ocfs2/ver.c43
-rw-r--r--fs/ocfs2/ver.h31
-rw-r--r--fs/pipe.c3
-rw-r--r--fs/posix_acl.c94
-rw-r--r--fs/proc/array.c18
-rw-r--r--fs/proc/base.c69
-rw-r--r--fs/proc/cmdline.c2
-rw-r--r--fs/proc/consoles.c2
-rw-r--r--fs/proc/cpuinfo.c2
-rw-r--r--fs/proc/devices.c2
-rw-r--r--fs/proc/generic.c3
-rw-r--r--fs/proc/interrupts.c2
-rw-r--r--fs/proc/kcore.c2
-rw-r--r--fs/proc/kmsg.c2
-rw-r--r--fs/proc/loadavg.c2
-rw-r--r--fs/proc/meminfo.c39
-rw-r--r--fs/proc/nommu.c2
-rw-r--r--fs/proc/page.c10
-rw-r--r--fs/proc/proc_devtree.c5
-rw-r--r--fs/proc/softirqs.c2
-rw-r--r--fs/proc/stat.c2
-rw-r--r--fs/proc/uptime.c2
-rw-r--r--fs/proc/version.c2
-rw-r--r--fs/proc/vmcore.c2
-rw-r--r--fs/proc_namespace.c7
-rw-r--r--fs/ramfs/file-mmu.c7
-rw-r--r--fs/ramfs/file-nommu.c17
-rw-r--r--fs/ramfs/inode.c9
-rw-r--r--fs/ramfs/internal.h1
-rw-r--r--fs/read_write.c4
-rw-r--r--fs/reiserfs/reiserfs.h2
-rw-r--r--fs/romfs/super.c6
-rw-r--r--fs/super.c3
-rw-r--r--fs/ubifs/debug.c22
-rw-r--r--fs/ubifs/log.c21
-rw-r--r--fs/ubifs/orphan.c21
-rw-r--r--fs/ubifs/recovery.c21
-rw-r--r--fs/ubifs/super.c24
-rw-r--r--fs/ubifs/tnc.c22
-rw-r--r--fs/ufs/balloc.c30
-rw-r--r--fs/ufs/ialloc.c17
-rw-r--r--fs/ufs/super.c13
-rw-r--r--fs/ufs/ufs.h1
-rw-r--r--include/asm-generic/fixmap.h97
-rw-r--r--include/asm-generic/int-l64.h49
-rw-r--r--include/linux/bootmem.h153
-rw-r--r--include/linux/cache.h4
-rw-r--r--include/linux/ceph/decode.h17
-rw-r--r--include/linux/cmdline-parser.h8
-rw-r--r--include/linux/compaction.h16
-rw-r--r--include/linux/crc64_ecma.h56
-rw-r--r--include/linux/dma-debug.h6
-rw-r--r--include/linux/fsnotify_backend.h118
-rw-r--r--include/linux/genalloc.h2
-rw-r--r--include/linux/gfp.h1
-rw-r--r--include/linux/huge_mm.h23
-rw-r--r--include/linux/hugetlb.h8
-rw-r--r--include/linux/hugetlb_cgroup.h5
-rw-r--r--include/linux/init_task.h2
-rw-r--r--include/linux/input.h21
-rw-r--r--include/linux/ipc.h2
-rw-r--r--include/linux/ipc_namespace.h1
-rw-r--r--include/linux/kernel.h13
-rw-r--r--include/linux/kexec.h1
-rw-r--r--include/linux/ksm.h15
-rw-r--r--include/linux/kthread.h1
-rw-r--r--include/linux/memblock.h55
-rw-r--r--include/linux/memcontrol.h23
-rw-r--r--include/linux/mempolicy.h32
-rw-r--r--include/linux/migrate.h6
-rw-r--r--include/linux/mm.h80
-rw-r--r--include/linux/mman.h1
-rw-r--r--include/linux/mmdebug.h9
-rw-r--r--include/linux/mmzone.h11
-rw-r--r--include/linux/msg.h2
-rw-r--r--include/linux/of.h7
-rw-r--r--include/linux/page-flags.h14
-rw-r--r--include/linux/pagemap.h10
-rw-r--r--include/linux/parser.h1
-rw-r--r--include/linux/percpu.h1
-rw-r--r--include/linux/posix_acl.h78
-rw-r--r--include/linux/printk.h19
-rw-r--r--include/linux/ramfs.h7
-rw-r--r--include/linux/rmap.h27
-rw-r--r--include/linux/sched.h42
-rw-r--r--include/linux/sched/sysctl.h4
-rw-r--r--include/linux/shm.h2
-rw-r--r--include/linux/slab.h9
-rw-r--r--include/linux/splice.h3
-rw-r--r--include/linux/vm_event_item.h4
-rw-r--r--include/linux/vmstat.h8
-rw-r--r--include/linux/w1-gpio.h1
-rw-r--r--include/trace/events/compaction.h42
-rw-r--r--include/trace/events/migrate.h26
-rw-r--r--include/trace/events/sched.h87
-rw-r--r--include/uapi/asm-generic/types.h3
-rw-r--r--init/initramfs.c2
-rw-r--r--init/main.c21
-rw-r--r--ipc/compat.c28
-rw-r--r--ipc/compat_mq.c2
-rw-r--r--ipc/ipc_sysctl.c14
-rw-r--r--ipc/mqueue.c22
-rw-r--r--ipc/msg.c44
-rw-r--r--ipc/sem.c178
-rw-r--r--ipc/shm.c49
-rw-r--r--ipc/util.c350
-rw-r--r--ipc/util.h28
-rw-r--r--kernel/audit_tree.c20
-rw-r--r--kernel/audit_watch.c24
-rw-r--r--kernel/exit.c1
-rw-r--r--kernel/fork.c17
-rw-r--r--kernel/hung_task.c3
-rw-r--r--kernel/kexec.c7
-rw-r--r--kernel/kmod.c11
-rw-r--r--kernel/ksysfs.c2
-rw-r--r--kernel/kthread.c9
-rw-r--r--kernel/power/snapshot.c2
-rw-r--r--kernel/printk/printk.c19
-rw-r--r--kernel/profile.c2
-rw-r--r--kernel/sched/core.c26
-rw-r--r--kernel/sched/fair.c6
-rw-r--r--kernel/sched/stats.c2
-rw-r--r--kernel/signal.c7
-rw-r--r--kernel/sys.c8
-rw-r--r--kernel/sysctl.c33
-rw-r--r--kernel/time/sched_clock.c4
-rw-r--r--kernel/user.c3
-rw-r--r--kernel/user_namespace.c2
-rw-r--r--kernel/watchdog.c26
-rw-r--r--lib/Kconfig7
-rw-r--r--lib/Kconfig.debug39
-rw-r--r--lib/Makefile3
-rw-r--r--lib/assoc_array.c2
-rw-r--r--lib/cmdline.c14
-rw-r--r--lib/cpumask.c4
-rw-r--r--lib/crc64_ecma.c341
-rw-r--r--lib/decompress_unlz4.c1
-rw-r--r--lib/dma-debug.c193
-rw-r--r--lib/dynamic_debug.c15
-rw-r--r--lib/kstrtox.c1
-rw-r--r--lib/parser.c62
-rw-r--r--lib/rbtree_test.c13
-rw-r--r--lib/show_mem.c6
-rw-r--r--lib/swiotlb.c35
-rw-r--r--lib/test_module.c33
-rw-r--r--lib/test_user_copy.c110
-rw-r--r--lib/vsprintf.c53
-rw-r--r--mm/balloon_compaction.c4
-rw-r--r--mm/cleancache.c6
-rw-r--r--mm/compaction.c63
-rw-r--r--mm/filemap.c16
-rw-r--r--mm/huge_memory.c46
-rw-r--r--mm/hugetlb.c56
-rw-r--r--mm/hugetlb_cgroup.c2
-rw-r--r--mm/hwpoison-inject.c2
-rw-r--r--mm/internal.h15
-rw-r--r--mm/ksm.c135
-rw-r--r--mm/memblock.c390
-rw-r--r--mm/memcontrol.c482
-rw-r--r--mm/memory-failure.c31
-rw-r--r--mm/memory.c26
-rw-r--r--mm/memory_hotplug.c13
-rw-r--r--mm/mempolicy.c8
-rw-r--r--mm/migrate.c97
-rw-r--r--mm/mincore.c7
-rw-r--r--mm/mlock.c128
-rw-r--r--mm/mm_init.c3
-rw-r--r--mm/mmap.c52
-rw-r--r--mm/mmu_notifier.c3
-rw-r--r--mm/mprotect.c3
-rw-r--r--mm/nobootmem.c24
-rw-r--r--mm/nommu.c1
-rw-r--r--mm/oom_kill.c51
-rw-r--r--mm/page_alloc.c208
-rw-r--r--mm/page_cgroup.c5
-rw-r--r--mm/page_io.c4
-rw-r--r--mm/percpu.c38
-rw-r--r--mm/rmap.c590
-rw-r--r--mm/shmem.c8
-rw-r--r--mm/slab.h26
-rw-r--r--mm/slab_common.c90
-rw-r--r--mm/slub.c12
-rw-r--r--mm/sparse-vmemmap.c6
-rw-r--r--mm/sparse.c27
-rw-r--r--mm/swap.c294
-rw-r--r--mm/swap_state.c79
-rw-r--r--mm/swapfile.c19
-rw-r--r--mm/util.c36
-rw-r--r--mm/vmalloc.c20
-rw-r--r--mm/vmscan.c136
-rw-r--r--mm/zswap.c4
-rw-r--r--net/ipv4/tcp_illinois.c1
-rw-r--r--net/netfilter/ipset/ip_set_hash_netiface.c27
-rwxr-xr-xscripts/checkpatch.pl170
-rwxr-xr-xscripts/get_maintainer.pl91
-rw-r--r--scripts/sortextable.c5
-rw-r--r--tools/testing/selftests/Makefile1
-rw-r--r--tools/testing/selftests/user/Makefile13
404 files changed, 7969 insertions, 5323 deletions
diff --git a/Documentation/blockdev/ramdisk.txt b/Documentation/blockdev/ramdisk.txt
index fa72e97dd669..fe2ef978d85a 100644
--- a/Documentation/blockdev/ramdisk.txt
+++ b/Documentation/blockdev/ramdisk.txt
@@ -36,21 +36,30 @@ allowing one to squeeze more programs onto an average installation or
rescue floppy disk.
-2) Kernel Command Line Parameters
+2) Parameters
---------------------------------
+2a) Kernel Command Line Parameters
+
ramdisk_size=N
==============
This parameter tells the RAM disk driver to set up RAM disks of N k size. The
-default is 4096 (4 MB) (8192 (8 MB) on S390).
+default is 4096 (4 MB).
+
+2b) Module parameters
- ramdisk_blocksize=N
- ===================
+ rd_nr
+ =====
+ /dev/ramX devices created.
-This parameter tells the RAM disk driver how many bytes to use per block. The
-default is 1024 (BLOCK_SIZE).
+ max_part
+ ========
+ Maximum partition number.
+ rd_size
+ =======
+ See ramdisk_size.
3) Using "rdev -r"
------------------
diff --git a/Documentation/devicetree/bindings/rtc/haoyu,hym8563.txt b/Documentation/devicetree/bindings/rtc/haoyu,hym8563.txt
new file mode 100644
index 000000000000..31406fd4a43e
--- /dev/null
+++ b/Documentation/devicetree/bindings/rtc/haoyu,hym8563.txt
@@ -0,0 +1,27 @@
+Haoyu Microelectronics HYM8563 Real Time Clock
+
+The HYM8563 provides basic rtc and alarm functionality
+as well as a clock output of up to 32kHz.
+
+Required properties:
+- compatible: should be: "haoyu,hym8563"
+- reg: i2c address
+- interrupts: rtc alarm/event interrupt
+- #clock-cells: the value should be 0
+
+Example:
+
+hym8563: hym8563@51 {
+ compatible = "haoyu,hym8563";
+ reg = <0x51>;
+
+ interrupts = <13 IRQ_TYPE_EDGE_FALLING>;
+
+ #clock-cells = <0>;
+};
+
+device {
+...
+ clocks = <&hym8563>;
+...
+};
diff --git a/Documentation/devicetree/bindings/rtc/maxim,ds1742.txt b/Documentation/devicetree/bindings/rtc/maxim,ds1742.txt
new file mode 100644
index 000000000000..d0f937c355b5
--- /dev/null
+++ b/Documentation/devicetree/bindings/rtc/maxim,ds1742.txt
@@ -0,0 +1,12 @@
+* Maxim (Dallas) DS1742/DS1743 Real Time Clock
+
+Required properties:
+- compatible: Should contain "maxim,ds1742".
+- reg: Physical base address of the RTC and length of memory
+ mapped region.
+
+Example:
+ rtc: rtc@10000000 {
+ compatible = "maxim,ds1742";
+ reg = <0x10000000 0x800>;
+ };
diff --git a/Documentation/devicetree/bindings/vendor-prefixes.txt b/Documentation/devicetree/bindings/vendor-prefixes.txt
index 27b1a9eecb48..b458760691a0 100644
--- a/Documentation/devicetree/bindings/vendor-prefixes.txt
+++ b/Documentation/devicetree/bindings/vendor-prefixes.txt
@@ -37,6 +37,7 @@ fsl Freescale Semiconductor
GEFanuc GE Fanuc Intelligent Platforms Embedded Systems, Inc.
gef GE Fanuc Intelligent Platforms Embedded Systems, Inc.
gmt Global Mixed-mode Technology, Inc.
+haoyu Haoyu Microelectronic Co. Ltd.
hisilicon Hisilicon Limited.
hp Hewlett Packard
ibm International Business Machines (IBM)
diff --git a/Documentation/dynamic-debug-howto.txt b/Documentation/dynamic-debug-howto.txt
index 1bbdcfcf1f13..46325eb2ea76 100644
--- a/Documentation/dynamic-debug-howto.txt
+++ b/Documentation/dynamic-debug-howto.txt
@@ -108,6 +108,12 @@ If your query set is big, you can batch them too:
~# cat query-batch-file > <debugfs>/dynamic_debug/control
+A another way is to use wildcard. The match rule support '*' (matches
+zero or more characters) and '?' (matches exactly one character).For
+example, you can match all usb drivers:
+
+ ~# echo "file drivers/usb/* +p" > <debugfs>/dynamic_debug/control
+
At the syntactical level, a command comprises a sequence of match
specifications, followed by a flags change specification.
@@ -315,6 +321,9 @@ nullarbor:~ # echo -n 'func svc_process -p' >
nullarbor:~ # echo -n 'format "nfsd: READ" +p' >
<debugfs>/dynamic_debug/control
+// enable messages in files of which the pathes include string "usb"
+nullarbor:~ # echo -n '*usb* +p' > <debugfs>/dynamic_debug/control
+
// enable all messages
nullarbor:~ # echo -n '+p' > <debugfs>/dynamic_debug/control
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index 8042050eb265..632211cbdd56 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -10,24 +10,32 @@ afs.txt
- info and examples for the distributed AFS (Andrew File System) fs.
affs.txt
- info and mount options for the Amiga Fast File System.
+autofs4-mount-control.txt
+ - info on device control operations for autofs4 module.
automount-support.txt
- information about filesystem automount support.
befs.txt
- information about the BeOS filesystem for Linux.
bfs.txt
- info for the SCO UnixWare Boot Filesystem (BFS).
+btrfs.txt
+ - info for the BTRFS filesystem.
+caching/
+ - directory containing filesystem cache documentation.
ceph.txt
- - info for the Ceph Distributed File System
-cifs.txt
- - description of the CIFS filesystem.
+ - info for the Ceph Distributed File System.
+cifs/
+ - directory containing CIFS filesystem documentation and example code.
coda.txt
- description of the CODA filesystem.
configfs/
- directory containing configfs documentation and example code.
cramfs.txt
- info on the cram filesystem for small storage (ROMs etc).
-dentry-locking.txt
- - info on the RCU-based dcache locking model.
+debugfs.txt
+ - info on the debugfs filesystem.
+devpts.txt
+ - info on the devpts filesystem.
directory-locking
- info about the locking scheme used for directory operations.
dlmfs.txt
@@ -35,7 +43,7 @@ dlmfs.txt
dnotify.txt
- info about directory notification in Linux.
dnotify_test.c
- - example program for dnotify
+ - example program for dnotify.
ecryptfs.txt
- docs on eCryptfs: stacked cryptographic filesystem for Linux.
efivarfs.txt
@@ -48,12 +56,18 @@ ext3.txt
- info, mount options and specifications for the Ext3 filesystem.
ext4.txt
- info, mount options and specifications for the Ext4 filesystem.
-files.txt
- - info on file management in the Linux kernel.
f2fs.txt
- info and mount options for the F2FS filesystem.
+fiemap.txt
+ - info on fiemap ioctl.
+files.txt
+ - info on file management in the Linux kernel.
fuse.txt
- info on the Filesystem in User SpacE including mount options.
+gfs2-glocks.txt
+ - info on the Global File System 2 - Glock internal locking rules.
+gfs2-uevents.txt
+ - info on the Global File System 2 - uevents.
gfs2.txt
- info on the Global File System 2.
hfs.txt
@@ -84,40 +98,58 @@ ntfs.txt
- info and mount options for the NTFS filesystem (Windows NT).
ocfs2.txt
- info and mount options for the OCFS2 clustered filesystem.
+omfs.txt
+ - info on the Optimized MPEG FileSystem.
+path-lookup.txt
+ - info on path walking and name lookup locking.
+pohmelfs/
+ - directory containing pohmelfs filesystem documentation.
porting
- various information on filesystem porting.
proc.txt
- info on Linux's /proc filesystem.
+qnx6.txt
+ - info on the QNX6 filesystem.
+quota.txt
+ - info on Quota subsystem.
ramfs-rootfs-initramfs.txt
- info on the 'in memory' filesystems ramfs, rootfs and initramfs.
-reiser4.txt
- - info on the Reiser4 filesystem based on dancing tree algorithms.
relay.txt
- info on relay, for efficient streaming from kernel to user space.
romfs.txt
- description of the ROMFS filesystem.
seq_file.txt
- - how to use the seq_file API
+ - how to use the seq_file API.
sharedsubtree.txt
- a description of shared subtrees for namespaces.
spufs.txt
- info and mount options for the SPU filesystem used on Cell.
+squashfs.txt
+ - info on the squashfs filesystem.
sysfs-pci.txt
- info on accessing PCI device resources through sysfs.
+sysfs-tagging.txt
+ - info on sysfs tagging to avoid duplicates.
sysfs.txt
- info on sysfs, a ram-based filesystem for exporting kernel objects.
sysv-fs.txt
- info on the SystemV/V7/Xenix/Coherent filesystem.
tmpfs.txt
- info on tmpfs, a filesystem that holds all files in virtual memory.
+ubifs.txt
+ - info on the Unsorted Block Images FileSystem.
udf.txt
- info and mount options for the UDF filesystem.
ufs.txt
- info on the ufs filesystem.
vfat.txt
- - info on using the VFAT filesystem used in Windows NT and Windows 95
+ - info on using the VFAT filesystem used in Windows NT and Windows 95.
vfs.txt
- - overview of the Virtual File System
+ - overview of the Virtual File System.
+xfs-delayed-logging-design.txt
+ - info on the XFS Delayed Logging Design.
+xfs-self-describing-metadata.txt
+ - info on XFS Self Describing Metadata.
xfs.txt
- info and mount options for the XFS filesystem.
xip.txt
diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt
index 873a2ab2e9f8..06887d46ccf2 100644
--- a/Documentation/filesystems/nilfs2.txt
+++ b/Documentation/filesystems/nilfs2.txt
@@ -81,6 +81,62 @@ nodiscard(*) The discard/TRIM commands are sent to the underlying
block device when blocks are freed. This is useful
for SSD devices and sparse/thinly-provisioned LUNs.
+Ioctls
+======
+
+There is some NILFS2 specific functionality which can be accessed by applications
+through the system call interfaces. The list of all NILFS2 specific ioctls are
+shown in the table below.
+
+Table of NILFS2 specific ioctls
+..............................................................................
+ Ioctl Description
+ NILFS_IOCTL_CHANGE_CPMODE Change mode of given checkpoint between
+ checkpoint and snapshot state. This ioctl is
+ used in chcp and mkcp utilities.
+
+ NILFS_IOCTL_DELETE_CHECKPOINT Remove checkpoint from NILFS2 file system.
+ This ioctl is used in rmcp utility.
+
+ NILFS_IOCTL_GET_CPINFO Return info about requested checkpoints. This
+ ioctl is used in lscp utility and by
+ nilfs_cleanerd daemon.
+
+ NILFS_IOCTL_GET_CPSTAT Return checkpoints statistics. This ioctl is
+ used by lscp, rmcp utilities and by
+ nilfs_cleanerd daemon.
+
+ NILFS_IOCTL_GET_SUINFO Return segment usage info about requested
+ segments. This ioctl is used in lssu,
+ nilfs_resize utilities and by nilfs_cleanerd
+ daemon.
+
+ NILFS_IOCTL_GET_SUSTAT Return segment usage statistics. This ioctl
+ is used in lssu, nilfs_resize utilities and
+ by nilfs_cleanerd daemon.
+
+ NILFS_IOCTL_GET_VINFO Return information on virtual block addresses.
+ This ioctl is used by nilfs_cleanerd daemon.
+
+ NILFS_IOCTL_GET_BDESCS Return information about descriptors of disk
+ block numbers. This ioctl is used by
+ nilfs_cleanerd daemon.
+
+ NILFS_IOCTL_CLEAN_SEGMENTS Do garbage collection operation in the
+ environment of requested parameters from
+ userspace. This ioctl is used by
+ nilfs_cleanerd daemon.
+
+ NILFS_IOCTL_SYNC Make a checkpoint. This ioctl is used in
+ mkcp utility.
+
+ NILFS_IOCTL_RESIZE Resize NILFS2 volume. This ioctl is used
+ by nilfs_resize utility.
+
+ NILFS_IOCTL_SET_ALLOC_RANGE Define lower limit of segments in bytes and
+ upper limit of segments in bytes. This ioctl
+ is used by nilfs_resize utility.
+
NILFS2 usage
============
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 19e10ab3d569..31f76178c987 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -767,6 +767,7 @@ The "Locked" indicates whether the mapping is locked in memory or not.
MemTotal: 16344972 kB
MemFree: 13634064 kB
+MemAvailable: 14836172 kB
Buffers: 3656 kB
Cached: 1195708 kB
SwapCached: 0 kB
@@ -799,6 +800,14 @@ AnonHugePages: 49152 kB
MemTotal: Total usable ram (i.e. physical ram minus a few reserved
bits and the kernel binary code)
MemFree: The sum of LowFree+HighFree
+MemAvailable: An estimate of how much memory is available for starting new
+ applications, without swapping. Calculated from MemFree,
+ SReclaimable, the size of the file LRU lists, and the low
+ watermarks in each zone.
+ The estimate takes into account that the system needs some
+ page cache to function well, and that not all reclaimable
+ slab will be reclaimable, due to items being in use. The
+ impact of those factors will vary from system to system.
Buffers: Relatively temporary storage for raw disk blocks
shouldn't get tremendously large (20MB or so)
Cached: in-memory cache for files read from the disk (the
diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt
index a6619b7064b9..b35a64b82f9e 100644
--- a/Documentation/filesystems/sysfs.txt
+++ b/Documentation/filesystems/sysfs.txt
@@ -108,12 +108,12 @@ static DEVICE_ATTR(foo, S_IWUSR | S_IRUGO, show_foo, store_foo);
is equivalent to doing:
static struct device_attribute dev_attr_foo = {
- .attr = {
+ .attr = {
.name = "foo",
.mode = S_IWUSR | S_IRUGO,
- .show = show_foo,
- .store = store_foo,
},
+ .show = show_foo,
+ .store = store_foo,
};
diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt
index 4a93e98b290a..5cf57b368dc6 100644
--- a/Documentation/filesystems/vfat.txt
+++ b/Documentation/filesystems/vfat.txt
@@ -175,6 +175,16 @@ nfs=stale_rw|nostale_ro
<bool>: 0,1,yes,no,true,false
+LIMITATION
+---------------------------------------------------------------------
+* The fallocated region of file is discarded at umount/evict time
+ when using fallocate with FALLOC_FL_KEEP_SIZE.
+ So, User should assume that fallocated region can be discarded at
+ last close if there is memory pressure resulting in eviction of
+ the inode from the memory. As a result, for any dependency on
+ the fallocated region, user should make sure to recheck fallocate
+ after reopening the file.
+
TODO
----------------------------------------------------------------------
* Need to get rid of the raw scanning stuff. Instead, always use
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index af0b689a0163..62cc3d333c3c 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1050,7 +1050,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
debugfs files are removed at module unload time.
gpt [EFI] Forces disk with valid GPT signature but
- invalid Protective MBR to be treated as GPT.
+ invalid Protective MBR to be treated as GPT. If the
+ primary GPT is corrupted, it enables the backup/alternate
+ GPT to be used instead.
grcan.enable0= [HW] Configuration of physical interface 0. Determines
the "Enable 0" bit of the configuration register.
@@ -1452,6 +1454,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
Valid arguments: on, off
Default: on
+ kmemcheck= [KNL] Boot-time kmemcheck enable/disable/one-shot mode
+ Valid arguments: 0, 1, 2
+ kmemcheck=0 (disabled)
+ kmemcheck=1 (enabled)
+ kmemcheck=2 (one-shot mode)
+ Default: 2 (one-shot mode)
+
kstack=N [X86] Print N words from the kernel stack
in oops dumps.
diff --git a/Documentation/leds/leds-class.txt b/Documentation/leds/leds-class.txt
index 79699c200766..62261c04060a 100644
--- a/Documentation/leds/leds-class.txt
+++ b/Documentation/leds/leds-class.txt
@@ -2,9 +2,6 @@
LED handling under Linux
========================
-If you're reading this and thinking about keyboard leds, these are
-handled by the input subsystem and the led class is *not* needed.
-
In its simplest form, the LED class just allows control of LEDs from
userspace. LEDs appear in /sys/class/leds/. The maximum brightness of the
LED is defined in max_brightness file. The brightness file will set the brightness
diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt
index 445ad743ec81..94459b42e0ab 100644
--- a/Documentation/printk-formats.txt
+++ b/Documentation/printk-formats.txt
@@ -55,14 +55,21 @@ Struct Resources:
For printing struct resources. The 'R' and 'r' specifiers result in a
printed resource with ('R') or without ('r') a decoded flags member.
-Physical addresses:
+Physical addresses types phys_addr_t:
- %pa 0x01234567 or 0x0123456789abcdef
+ %pa[p] 0x01234567 or 0x0123456789abcdef
For printing a phys_addr_t type (and its derivatives, such as
resource_size_t) which can vary based on build options, regardless of
the width of the CPU data path. Passed by reference.
+DMA addresses types dma_addr_t:
+
+ %pad 0x01234567 or 0x0123456789abcdef
+
+ For printing a dma_addr_t type which can vary based on build options,
+ regardless of the width of the CPU data path. Passed by reference.
+
Raw buffer as a hex string:
%*ph 00 01 02 ... 3f
%*phC 00:01:02: ... :3f
@@ -177,6 +184,12 @@ dentry names:
equivalent of %s dentry->d_name.name we used to use, %pd<n> prints
n last components. %pD does the same thing for struct file.
+task_struct comm name:
+
+ %pT
+
+ For printing task_struct->comm.
+
struct va_format:
%pV
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 6d486404200e..ee9a2f983b99 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -33,6 +33,7 @@ show up in /proc/sys/kernel:
- domainname
- hostname
- hotplug
+- kexec_load_disabled
- kptr_restrict
- kstack_depth_to_print [ X86 only ]
- l2cr [ PPC only ]
@@ -287,6 +288,18 @@ Default value is "/sbin/hotplug".
==============================================================
+kexec_load_disabled:
+
+A toggle indicating if the kexec_load syscall has been disabled. This
+value defaults to 0 (false: kexec_load enabled), but can be set to 1
+(true: kexec_load disabled). Once true, kexec can no longer be used, and
+the toggle cannot be set back to false. This allows a kexec image to be
+loaded before disabling the syscall, allowing a system to set up (and
+later use) an image without it being altered. Generally used together
+with the "modules_disabled" sysctl.
+
+==============================================================
+
kptr_restrict:
This toggle indicates whether restrictions are placed on
@@ -331,7 +344,7 @@ A toggle value indicating if modules are allowed to be loaded
in an otherwise modular kernel. This toggle defaults to off
(0), but can be set true (1). Once true, modules can be
neither loaded nor unloaded, and the toggle cannot be set back
-to false.
+to false. Generally used with the "kexec_load_disabled" toggle.
==============================================================
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 1fbd4eb7b64a..9f5481bdc5a4 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -47,6 +47,7 @@ Currently, these files are in /proc/sys/vm:
- numa_zonelist_order
- oom_dump_tasks
- oom_kill_allocating_task
+- overcommit_kbytes
- overcommit_memory
- overcommit_ratio
- page-cluster
@@ -574,6 +575,17 @@ The default value is 0.
==============================================================
+overcommit_kbytes:
+
+When overcommit_memory is set to 2, the committed address space is not
+permitted to exceed swap plus this amount of physical RAM. See below.
+
+Note: overcommit_kbytes is the counterpart of overcommit_ratio. Only one
+of them may be specified at a time. Setting one disables the other (which
+then appears as 0 when read).
+
+==============================================================
+
overcommit_memory:
This value contains a flag that enables memory overcommitment.
diff --git a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
index 4a37c4759cd2..00e425faa2fd 100644
--- a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
+++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
@@ -123,7 +123,7 @@ my $regex_writepage;
# Static regex used. Specified like this for readability and for use with /o
# (process_pid) (cpus ) ( time ) (tpoint ) (details)
-my $regex_traceevent = '\s*([a-zA-Z0-9-]*)\s*(\[[0-9]*\])\s*([0-9.]*):\s*([a-zA-Z_]*):\s*(.*)';
+my $regex_traceevent = '\s*([a-zA-Z0-9-]*)\s*(\[[0-9]*\])(\s*[dX.][Nnp.][Hhs.][0-9a-fA-F.]*|)\s*([0-9.]*):\s*([a-zA-Z_]*):\s*(.*)';
my $regex_statname = '[-0-9]*\s\((.*)\).*';
my $regex_statppid = '[-0-9]*\s\(.*\)\s[A-Za-z]\s([0-9]*).*';
@@ -270,8 +270,8 @@ EVENT_PROCESS:
while ($traceevent = <STDIN>) {
if ($traceevent =~ /$regex_traceevent/o) {
$process_pid = $1;
- $timestamp = $3;
- $tracepoint = $4;
+ $timestamp = $4;
+ $tracepoint = $5;
$process_pid =~ /(.*)-([0-9]*)$/;
my $process = $1;
@@ -299,7 +299,7 @@ EVENT_PROCESS:
$perprocesspid{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN}++;
$perprocesspid{$process_pid}->{STATE_DIRECT_BEGIN} = $timestamp;
- $details = $5;
+ $details = $6;
if ($details !~ /$regex_direct_begin/o) {
print "WARNING: Failed to parse mm_vmscan_direct_reclaim_begin as expected\n";
print " $details\n";
@@ -322,7 +322,7 @@ EVENT_PROCESS:
$perprocesspid{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[$index] = "$order-$latency";
}
} elsif ($tracepoint eq "mm_vmscan_kswapd_wake") {
- $details = $5;
+ $details = $6;
if ($details !~ /$regex_kswapd_wake/o) {
print "WARNING: Failed to parse mm_vmscan_kswapd_wake as expected\n";
print " $details\n";
@@ -356,7 +356,7 @@ EVENT_PROCESS:
} elsif ($tracepoint eq "mm_vmscan_wakeup_kswapd") {
$perprocesspid{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD}++;
- $details = $5;
+ $details = $6;
if ($details !~ /$regex_wakeup_kswapd/o) {
print "WARNING: Failed to parse mm_vmscan_wakeup_kswapd as expected\n";
print " $details\n";
@@ -366,7 +366,7 @@ EVENT_PROCESS:
my $order = $3;
$perprocesspid{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD_PERORDER}[$order]++;
} elsif ($tracepoint eq "mm_vmscan_lru_isolate") {
- $details = $5;
+ $details = $6;
if ($details !~ /$regex_lru_isolate/o) {
print "WARNING: Failed to parse mm_vmscan_lru_isolate as expected\n";
print " $details\n";
@@ -387,7 +387,7 @@ EVENT_PROCESS:
}
$perprocesspid{$process_pid}->{HIGH_NR_CONTIG_DIRTY} += $nr_contig_dirty;
} elsif ($tracepoint eq "mm_vmscan_lru_shrink_inactive") {
- $details = $5;
+ $details = $6;
if ($details !~ /$regex_lru_shrink_inactive/o) {
print "WARNING: Failed to parse mm_vmscan_lru_shrink_inactive as expected\n";
print " $details\n";
@@ -397,7 +397,7 @@ EVENT_PROCESS:
my $nr_reclaimed = $4;
$perprocesspid{$process_pid}->{HIGH_NR_RECLAIMED} += $nr_reclaimed;
} elsif ($tracepoint eq "mm_vmscan_writepage") {
- $details = $5;
+ $details = $6;
if ($details !~ /$regex_writepage/o) {
print "WARNING: Failed to parse mm_vmscan_writepage as expected\n";
print " $details\n";
diff --git a/Documentation/vm/locking b/Documentation/vm/locking
deleted file mode 100644
index f61228bd6395..000000000000
--- a/Documentation/vm/locking
+++ /dev/null
@@ -1,130 +0,0 @@
-Started Oct 1999 by Kanoj Sarcar <kanojsarcar@yahoo.com>
-
-The intent of this file is to have an uptodate, running commentary
-from different people about how locking and synchronization is done
-in the Linux vm code.
-
-page_table_lock & mmap_sem
---------------------------------------
-
-Page stealers pick processes out of the process pool and scan for
-the best process to steal pages from. To guarantee the existence
-of the victim mm, a mm_count inc and a mmdrop are done in swap_out().
-Page stealers hold kernel_lock to protect against a bunch of races.
-The vma list of the victim mm is also scanned by the stealer,
-and the page_table_lock is used to preserve list sanity against the
-process adding/deleting to the list. This also guarantees existence
-of the vma. Vma existence is not guaranteed once try_to_swap_out()
-drops the page_table_lock. To guarantee the existence of the underlying
-file structure, a get_file is done before the swapout() method is
-invoked. The page passed into swapout() is guaranteed not to be reused
-for a different purpose because the page reference count due to being
-present in the user's pte is not released till after swapout() returns.
-
-Any code that modifies the vmlist, or the vm_start/vm_end/
-vm_flags:VM_LOCKED/vm_next of any vma *in the list* must prevent
-kswapd from looking at the chain.
-
-The rules are:
-1. To scan the vmlist (look but don't touch) you must hold the
- mmap_sem with read bias, i.e. down_read(&mm->mmap_sem)
-2. To modify the vmlist you need to hold the mmap_sem with
- read&write bias, i.e. down_write(&mm->mmap_sem) *AND*
- you need to take the page_table_lock.
-3. The swapper takes _just_ the page_table_lock, this is done
- because the mmap_sem can be an extremely long lived lock
- and the swapper just cannot sleep on that.
-4. The exception to this rule is expand_stack, which just
- takes the read lock and the page_table_lock, this is ok
- because it doesn't really modify fields anybody relies on.
-5. You must be able to guarantee that while holding page_table_lock
- or page_table_lock of mm A, you will not try to get either lock
- for mm B.
-
-The caveats are:
-1. find_vma() makes use of, and updates, the mmap_cache pointer hint.
-The update of mmap_cache is racy (page stealer can race with other code
-that invokes find_vma with mmap_sem held), but that is okay, since it
-is a hint. This can be fixed, if desired, by having find_vma grab the
-page_table_lock.
-
-
-Code that add/delete elements from the vmlist chain are
-1. callers of insert_vm_struct
-2. callers of merge_segments
-3. callers of avl_remove
-
-Code that changes vm_start/vm_end/vm_flags:VM_LOCKED of vma's on
-the list:
-1. expand_stack
-2. mprotect
-3. mlock
-4. mremap
-
-It is advisable that changes to vm_start/vm_end be protected, although
-in some cases it is not really needed. Eg, vm_start is modified by
-expand_stack(), it is hard to come up with a destructive scenario without
-having the vmlist protection in this case.
-
-The page_table_lock nests with the inode i_mmap_mutex and the kmem cache
-c_spinlock spinlocks. This is okay, since the kmem code asks for pages after
-dropping c_spinlock. The page_table_lock also nests with pagecache_lock and
-pagemap_lru_lock spinlocks, and no code asks for memory with these locks
-held.
-
-The page_table_lock is grabbed while holding the kernel_lock spinning monitor.
-
-The page_table_lock is a spin lock.
-
-Note: PTL can also be used to guarantee that no new clones using the
-mm start up ... this is a loose form of stability on mm_users. For
-example, it is used in copy_mm to protect against a racing tlb_gather_mmu
-single address space optimization, so that the zap_page_range (from
-truncate) does not lose sending ipi's to cloned threads that might
-be spawned underneath it and go to user mode to drag in pte's into tlbs.
-
-swap_lock
---------------
-The swap devices are chained in priority order from the "swap_list" header.
-The "swap_list" is used for the round-robin swaphandle allocation strategy.
-The #free swaphandles is maintained in "nr_swap_pages". These two together
-are protected by the swap_lock.
-
-The swap_lock also protects all the device reference counts on the
-corresponding swaphandles, maintained in the "swap_map" array, and the
-"highest_bit" and "lowest_bit" fields.
-
-The swap_lock is a spinlock, and is never acquired from intr level.
-
-To prevent races between swap space deletion or async readahead swapins
-deciding whether a swap handle is being used, ie worthy of being read in
-from disk, and an unmap -> swap_free making the handle unused, the swap
-delete and readahead code grabs a temp reference on the swaphandle to
-prevent warning messages from swap_duplicate <- read_swap_cache_async.
-
-Swap cache locking
-------------------
-Pages are added into the swap cache with kernel_lock held, to make sure
-that multiple pages are not being added (and hence lost) by associating
-all of them with the same swaphandle.
-
-Pages are guaranteed not to be removed from the scache if the page is
-"shared": ie, other processes hold reference on the page or the associated
-swap handle. The only code that does not follow this rule is shrink_mmap,
-which deletes pages from the swap cache if no process has a reference on
-the page (multiple processes might have references on the corresponding
-swap handle though). lookup_swap_cache() races with shrink_mmap, when
-establishing a reference on a scache page, so, it must check whether the
-page it located is still in the swapcache, or shrink_mmap deleted it.
-(This race is due to the fact that shrink_mmap looks at the page ref
-count with pagecache_lock, but then drops pagecache_lock before deleting
-the page from the scache).
-
-do_wp_page and do_swap_page have MP races in them while trying to figure
-out whether a page is "shared", by looking at the page_count + swap_count.
-To preserve the sum of the counts, the page lock _must_ be acquired before
-calling is_page_shared (else processes might switch their swap_count refs
-to the page count refs, after the page count ref has been snapshotted).
-
-Swap device deletion code currently breaks all the scache assumptions,
-since it grabs neither mmap_sem nor page_table_lock.
diff --git a/Documentation/vm/overcommit-accounting b/Documentation/vm/overcommit-accounting
index 8eaa2fc4b8fa..cbfaaa674118 100644
--- a/Documentation/vm/overcommit-accounting
+++ b/Documentation/vm/overcommit-accounting
@@ -14,8 +14,8 @@ The Linux kernel supports the following overcommit handling modes
2 - Don't overcommit. The total address space commit
for the system is not permitted to exceed swap + a
- configurable percentage (default is 50) of physical RAM.
- Depending on the percentage you use, in most situations
+ configurable amount (default is 50%) of physical RAM.
+ Depending on the amount you use, in most situations
this means a process will not be killed while accessing
pages but will receive errors on memory allocation as
appropriate.
@@ -26,7 +26,8 @@ The Linux kernel supports the following overcommit handling modes
The overcommit policy is set via the sysctl `vm.overcommit_memory'.
-The overcommit percentage is set via `vm.overcommit_ratio'.
+The overcommit amount can be set via `vm.overcommit_ratio' (percentage)
+or `vm.overcommit_kbytes' (absolute value).
The current overcommit limit and amount committed are viewable in
/proc/meminfo as CommitLimit and Committed_AS respectively.
diff --git a/MAINTAINERS b/MAINTAINERS
index 48e113cf951c..7798b077d7f9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -93,6 +93,11 @@ Descriptions of section entries:
N: Files and directories with regex patterns.
N: [^a-z]tegra all files whose path contains the word tegra
One pattern per line. Multiple N: lines acceptable.
+ scripts/get_maintainer.pl has different behavior for files that
+ match F: pattern and matches of N: patterns. By default,
+ get_maintainer will not look at git log history when an F: pattern
+ match occurs. When an N: match occurs, git log history is used
+ to also notify the people that have git commit signatures.
X: Files and directories that are NOT maintained, same rules as F:
Files exclusions are tested before file matches.
Can be useful for excluding a specific subdirectory, for instance:
@@ -3982,6 +3987,12 @@ S: Orphan
F: Documentation/filesystems/hfs.txt
F: fs/hfs/
+HFSPLUS FILESYSTEM
+L: linux-fsdevel@vger.kernel.org
+S: Orphan
+F: Documentation/filesystems/hfsplus.txt
+F: fs/hfsplus/
+
HGA FRAMEBUFFER DRIVER
M: Ferenc Bakonyi <fero@drama.obuda.kando.hu>
L: linux-nvidia@lists.surfsouth.com
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 7ce5ce586bd9..97a2d9a096b9 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -540,13 +540,13 @@ config SMP
depends on ALPHA_SABLE || ALPHA_LYNX || ALPHA_RAWHIDE || ALPHA_DP264 || ALPHA_WILDFIRE || ALPHA_TITAN || ALPHA_GENERIC || ALPHA_SHARK || ALPHA_MARVEL
---help---
This enables support for systems with more than one CPU. If you have
- a system with only one CPU, like most personal computers, say N. If
- you have a system with more than one CPU, say Y.
+ a system with only one CPU, say N. If you have a system with more
+ than one CPU, say Y.
- If you say N here, the kernel will run on single and multiprocessor
+ If you say N here, the kernel will run on uni- and multiprocessor
machines, but will use only one CPU of a multiprocessor machine. If
you say Y here, the kernel will run on many, but not all,
- singleprocessor machines. On a singleprocessor machine, the kernel
+ uniprocessor machines. On a uniprocessor machine, the kernel
will run faster if you say N here.
See also the SMP-HOWTO available at
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index 9063ae6553cc..5438cabbc45d 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -128,8 +128,8 @@ config SMP
default n
help
This enables support for systems with more than one CPU. If you have
- a system with only one CPU, like most personal computers, say N. If
- you have a system with more than one CPU, say Y.
+ a system with only one CPU, say N. If you have a system with more
+ than one CPU, say Y.
if SMP
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index f9b0fd387c6f..dc6ef9a2c649 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1470,14 +1470,14 @@ config SMP
depends on MMU || ARM_MPU
help
This enables support for systems with more than one CPU. If you have
- a system with only one CPU, like most personal computers, say N. If
- you have a system with more than one CPU, say Y.
+ a system with only one CPU, say N. If you have a system with more
+ than one CPU, say Y.
- If you say N here, the kernel will run on single and multiprocessor
+ If you say N here, the kernel will run on uni- and multiprocessor
machines, but will use only one CPU of a multiprocessor machine. If
- you say Y here, the kernel will run on many, but not all, single
- processor machines. On a single processor machine, the kernel will
- run faster if you say N here.
+ you say Y here, the kernel will run on many, but not all,
+ uniprocessor machines. On a uniprocessor machine, the kernel
+ will run faster if you say N here.
See also <file:Documentation/x86/i386/IO-APIC.txt>,
<file:Documentation/nmi_watchdog.txt> and the SMP-HOWTO available at
diff --git a/arch/arm/include/asm/dma.h b/arch/arm/include/asm/dma.h
index 58b8c6a0ab1f..99084431d6ae 100644
--- a/arch/arm/include/asm/dma.h
+++ b/arch/arm/include/asm/dma.h
@@ -8,8 +8,8 @@
#define MAX_DMA_ADDRESS 0xffffffffUL
#else
#define MAX_DMA_ADDRESS ({ \
- extern unsigned long arm_dma_zone_size; \
- arm_dma_zone_size ? \
+ extern phys_addr_t arm_dma_zone_size; \
+ arm_dma_zone_size && arm_dma_zone_size < (0x10000000 - PAGE_OFFSET) ? \
(PAGE_OFFSET + arm_dma_zone_size) : 0xffffffffUL; })
#endif
diff --git a/arch/arm/include/asm/fixmap.h b/arch/arm/include/asm/fixmap.h
index bbae919bceb4..68ea615c2a28 100644
--- a/arch/arm/include/asm/fixmap.h
+++ b/arch/arm/include/asm/fixmap.h
@@ -14,28 +14,15 @@
*/
#define FIXADDR_START 0xfff00000UL
-#define FIXADDR_TOP 0xfffe0000UL
-#define FIXADDR_SIZE (FIXADDR_TOP - FIXADDR_START)
+#define FIXADDR_END 0xfffe0000UL
+#define FIXADDR_TOP (FIXADDR_END - PAGE_SIZE)
-#define FIX_KMAP_BEGIN 0
-#define FIX_KMAP_END (FIXADDR_SIZE >> PAGE_SHIFT)
+enum fixed_addresses {
+ FIX_KMAP_BEGIN,
+ FIX_KMAP_END = (FIXADDR_TOP - FIXADDR_START) >> PAGE_SHIFT,
+ __end_of_fixed_addresses
+};
-#define __fix_to_virt(x) (FIXADDR_START + ((x) << PAGE_SHIFT))
-#define __virt_to_fix(x) (((x) - FIXADDR_START) >> PAGE_SHIFT)
-
-extern void __this_fixmap_does_not_exist(void);
-
-static inline unsigned long fix_to_virt(const unsigned int idx)
-{
- if (idx >= FIX_KMAP_END)
- __this_fixmap_does_not_exist();
- return __fix_to_virt(idx);
-}
-
-static inline unsigned int virt_to_fix(const unsigned long vaddr)
-{
- BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
- return __virt_to_fix(vaddr);
-}
+#include <asm-generic/fixmap.h>
#endif
diff --git a/arch/arm/kernel/devtree.c b/arch/arm/kernel/devtree.c
index 34d5fd585bbb..f751714d52c1 100644
--- a/arch/arm/kernel/devtree.c
+++ b/arch/arm/kernel/devtree.c
@@ -33,7 +33,7 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size)
void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align)
{
- return alloc_bootmem_align(size, align);
+ return memblock_virt_alloc(size, align);
}
void __init arm_dt_memblock_reserve(void)
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index a4729c6be25d..1e8b030dbefd 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -731,7 +731,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc)
kernel_data.end = virt_to_phys(_end - 1);
for_each_memblock(memory, region) {
- res = alloc_bootmem_low(sizeof(*res));
+ res = memblock_virt_alloc(sizeof(*res), 0);
res->name = "System RAM";
res->start = __pfn_to_phys(memblock_region_memory_base_pfn(region));
res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1;
diff --git a/arch/arm/mach-omap2/omap_hwmod.c b/arch/arm/mach-omap2/omap_hwmod.c
index 8a1b5e0bad40..f7a6fd35b1e4 100644
--- a/arch/arm/mach-omap2/omap_hwmod.c
+++ b/arch/arm/mach-omap2/omap_hwmod.c
@@ -2791,9 +2791,7 @@ static int __init _alloc_links(struct omap_hwmod_link **ml,
sz = sizeof(struct omap_hwmod_link) * LINKS_PER_OCP_IF;
*sl = NULL;
- *ml = alloc_bootmem(sz);
-
- memset(*ml, 0, sz);
+ *ml = memblock_virt_alloc(sz, 0);
*sl = (void *)(*ml) + sizeof(struct omap_hwmod_link);
@@ -2912,9 +2910,7 @@ static int __init _alloc_linkspace(struct omap_hwmod_ocp_if **ois)
pr_debug("omap_hwmod: %s: allocating %d byte linkspace (%d links)\n",
__func__, sz, max_ls);
- linkspace = alloc_bootmem(sz);
-
- memset(linkspace, 0, sz);
+ linkspace = memblock_virt_alloc(sz, 0);
return 0;
}
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 6dd66a999d9f..8a271ffce12a 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -92,9 +92,6 @@ void show_mem(unsigned int filter)
printk("Mem-info:\n");
show_free_areas(filter);
- if (filter & SHOW_MEM_FILTER_PAGE_COUNT)
- return;
-
for_each_bank (i, mi) {
struct membank *bank = &mi->bank[i];
unsigned int pfn1, pfn2;
@@ -407,7 +404,7 @@ free_memmap(unsigned long start_pfn, unsigned long end_pfn)
* free the section of the memmap array.
*/
if (pg < pgend)
- free_bootmem(pg, pgend - pg);
+ memblock_free_early(pg, pgend - pg);
}
/*
@@ -578,7 +575,7 @@ void __init mem_init(void)
MLK(DTCM_OFFSET, (unsigned long) dtcm_end),
MLK(ITCM_OFFSET, (unsigned long) itcm_end),
#endif
- MLK(FIXADDR_START, FIXADDR_TOP),
+ MLK(FIXADDR_START, FIXADDR_END),
MLM(VMALLOC_START, VMALLOC_END),
MLM(PAGE_OFFSET, (unsigned long)high_memory),
#ifdef CONFIG_HIGHMEM
diff --git a/arch/hexagon/include/asm/fixmap.h b/arch/hexagon/include/asm/fixmap.h
index b75b6bf4269c..1387f84b42b6 100644
--- a/arch/hexagon/include/asm/fixmap.h
+++ b/arch/hexagon/include/asm/fixmap.h
@@ -26,45 +26,7 @@
*/
#include <asm/mem-layout.h>
-/*
- * Full fixmap support involves set_fixmap() functions, but
- * these may not be needed if all we're after is an area for
- * highmem kernel mappings.
- */
-#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
-#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
-
-extern void __this_fixmap_does_not_exist(void);
-
-/**
- * fix_to_virt -- "index to address" translation.
- *
- * If anyone tries to use the idx directly without translation,
- * we catch the bug with a NULL-deference kernel oops. Illegal
- * ranges of incoming indices are caught too.
- */
-static inline unsigned long fix_to_virt(const unsigned int idx)
-{
- /*
- * This branch gets completely eliminated after inlining,
- * except when someone tries to use fixaddr indices in an
- * illegal way. (such as mixing up address types or using
- * out-of-range indices).
- *
- * If it doesn't get removed, the linker will complain
- * loudly with a reasonably clear error message..
- */
- if (idx >= __end_of_fixed_addresses)
- __this_fixmap_does_not_exist();
-
- return __fix_to_virt(idx);
-}
-
-static inline unsigned long virt_to_fix(const unsigned long vaddr)
-{
- BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
- return __virt_to_fix(vaddr);
-}
+#include <asm-generic/fixmap.h>
#define kmap_get_fixmap_pte(vaddr) \
pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), \
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 7cf2347eb899..0c8e553e0b9f 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -105,6 +105,7 @@ config HAVE_SETUP_PER_CPU_AREA
config DMI
bool
default y
+ select DMI_SCAN_MACHINE_NON_EFI_FALLBACK
config EFI
bool
diff --git a/arch/ia64/include/asm/dmi.h b/arch/ia64/include/asm/dmi.h
index 185d3d18d0ec..f365a61f5c71 100644
--- a/arch/ia64/include/asm/dmi.h
+++ b/arch/ia64/include/asm/dmi.h
@@ -5,8 +5,10 @@
#include <asm/io.h>
/* Use normal IO mappings for DMI */
-#define dmi_ioremap ioremap
-#define dmi_iounmap(x,l) iounmap(x)
-#define dmi_alloc(l) kzalloc(l, GFP_ATOMIC)
+#define dmi_early_remap ioremap
+#define dmi_early_unmap(x, l) iounmap(x)
+#define dmi_remap ioremap
+#define dmi_unmap iounmap
+#define dmi_alloc(l) kzalloc(l, GFP_ATOMIC)
#endif
diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h
index 5a84b3a50741..efd1b927ccb7 100644
--- a/arch/ia64/include/asm/processor.h
+++ b/arch/ia64/include/asm/processor.h
@@ -71,6 +71,7 @@
#include <linux/compiler.h>
#include <linux/threads.h>
#include <linux/types.h>
+#include <linux/bitops.h>
#include <asm/fpu.h>
#include <asm/page.h>
diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c
index da5237d636d6..52715a71aede 100644
--- a/arch/ia64/mm/contig.c
+++ b/arch/ia64/mm/contig.c
@@ -31,74 +31,6 @@
static unsigned long max_gap;
#endif
-/**
- * show_mem - give short summary of memory stats
- *
- * Shows a simple page count of reserved and used pages in the system.
- * For discontig machines, it does this on a per-pgdat basis.
- */
-void show_mem(unsigned int filter)
-{
- int i, total_reserved = 0;
- int total_shared = 0, total_cached = 0;
- unsigned long total_present = 0;
- pg_data_t *pgdat;
-
- printk(KERN_INFO "Mem-info:\n");
- show_free_areas(filter);
- printk(KERN_INFO "Node memory in pages:\n");
- if (filter & SHOW_MEM_FILTER_PAGE_COUNT)
- return;
- for_each_online_pgdat(pgdat) {
- unsigned long present;
- unsigned long flags;
- int shared = 0, cached = 0, reserved = 0;
- int nid = pgdat->node_id;
-
- if (skip_free_areas_node(filter, nid))
- continue;
- pgdat_resize_lock(pgdat, &flags);
- present = pgdat->node_present_pages;
- for(i = 0; i < pgdat->node_spanned_pages; i++) {
- struct page *page;
- if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
- touch_nmi_watchdog();
- if (pfn_valid(pgdat->node_start_pfn + i))
- page = pfn_to_page(pgdat->node_start_pfn + i);
- else {
-#ifdef CONFIG_VIRTUAL_MEM_MAP
- if (max_gap < LARGE_GAP)
- continue;
-#endif
- i = vmemmap_find_next_valid_pfn(nid, i) - 1;
- continue;
- }
- if (PageReserved(page))
- reserved++;
- else if (PageSwapCache(page))
- cached++;
- else if (page_count(page))
- shared += page_count(page)-1;
- }
- pgdat_resize_unlock(pgdat, &flags);
- total_present += present;
- total_reserved += reserved;
- total_cached += cached;
- total_shared += shared;
- printk(KERN_INFO "Node %4d: RAM: %11ld, rsvd: %8d, "
- "shrd: %10d, swpd: %10d\n", nid,
- present, reserved, shared, cached);
- }
- printk(KERN_INFO "%ld pages of RAM\n", total_present);
- printk(KERN_INFO "%d reserved pages\n", total_reserved);
- printk(KERN_INFO "%d pages shared\n", total_shared);
- printk(KERN_INFO "%d pages swap cached\n", total_cached);
- printk(KERN_INFO "Total of %ld pages in page table cache\n",
- quicklist_total_size());
- printk(KERN_INFO "%ld free buffer pages\n", nr_free_buffer_pages());
-}
-
-
/* physical address where the bootmem map is located */
unsigned long bootmap_start;
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index 2de08f4d9930..878626805369 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -608,69 +608,6 @@ void *per_cpu_init(void)
#endif /* CONFIG_SMP */
/**
- * show_mem - give short summary of memory stats
- *
- * Shows a simple page count of reserved and used pages in the system.
- * For discontig machines, it does this on a per-pgdat basis.
- */
-void show_mem(unsigned int filter)
-{
- int i, total_reserved = 0;
- int total_shared = 0, total_cached = 0;
- unsigned long total_present = 0;
- pg_data_t *pgdat;
-
- printk(KERN_INFO "Mem-info:\n");
- show_free_areas(filter);
- if (filter & SHOW_MEM_FILTER_PAGE_COUNT)
- return;
- printk(KERN_INFO "Node memory in pages:\n");
- for_each_online_pgdat(pgdat) {
- unsigned long present;
- unsigned long flags;
- int shared = 0, cached = 0, reserved = 0;
- int nid = pgdat->node_id;
-
- if (skip_free_areas_node(filter, nid))
- continue;
- pgdat_resize_lock(pgdat, &flags);
- present = pgdat->node_present_pages;
- for(i = 0; i < pgdat->node_spanned_pages; i++) {
- struct page *page;
- if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
- touch_nmi_watchdog();
- if (pfn_valid(pgdat->node_start_pfn + i))
- page = pfn_to_page(pgdat->node_start_pfn + i);
- else {
- i = vmemmap_find_next_valid_pfn(nid, i) - 1;
- continue;
- }
- if (PageReserved(page))
- reserved++;
- else if (PageSwapCache(page))
- cached++;
- else if (page_count(page))
- shared += page_count(page)-1;
- }
- pgdat_resize_unlock(pgdat, &flags);
- total_present += present;
- total_reserved += reserved;
- total_cached += cached;
- total_shared += shared;
- printk(KERN_INFO "Node %4d: RAM: %11ld, rsvd: %8d, "
- "shrd: %10d, swpd: %10d\n", nid,
- present, reserved, shared, cached);
- }
- printk(KERN_INFO "%ld pages of RAM\n", total_present);
- printk(KERN_INFO "%d reserved pages\n", total_reserved);
- printk(KERN_INFO "%d pages shared\n", total_shared);
- printk(KERN_INFO "%d pages swap cached\n", total_cached);
- printk(KERN_INFO "Total of %ld pages in page table cache\n",
- quicklist_total_size());
- printk(KERN_INFO "%ld free buffer pages\n", nr_free_buffer_pages());
-}
-
-/**
* call_pernode_memory - use SRAT to call callback functions with node info
* @start: physical start of range
* @len: length of range
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 88504abf5704..25c350264a41 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -684,3 +684,51 @@ per_linux32_init(void)
}
__initcall(per_linux32_init);
+
+/**
+ * show_mem - give short summary of memory stats
+ *
+ * Shows a simple page count of reserved and used pages in the system.
+ * For discontig machines, it does this on a per-pgdat basis.
+ */
+void show_mem(unsigned int filter)
+{
+ int total_reserved = 0;
+ unsigned long total_present = 0;
+ pg_data_t *pgdat;
+
+ printk(KERN_INFO "Mem-info:\n");
+ show_free_areas(filter);
+ printk(KERN_INFO "Node memory in pages:\n");
+ for_each_online_pgdat(pgdat) {
+ unsigned long present;
+ unsigned long flags;
+ int reserved = 0;
+ int nid = pgdat->node_id;
+ int zoneid;
+
+ if (skip_free_areas_node(filter, nid))
+ continue;
+ pgdat_resize_lock(pgdat, &flags);
+
+ for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
+ struct zone *zone = &pgdat->node_zones[zoneid];
+ if (!populated_zone(zone))
+ continue;
+
+ reserved += zone->present_pages - zone->managed_pages;
+ }
+ present = pgdat->node_present_pages;
+
+ pgdat_resize_unlock(pgdat, &flags);
+ total_present += present;
+ total_reserved += reserved;
+ printk(KERN_INFO "Node %4d: RAM: %11ld, rsvd: %8d, ",
+ nid, present, reserved);
+ }
+ printk(KERN_INFO "%ld pages of RAM\n", total_present);
+ printk(KERN_INFO "%d reserved pages\n", total_reserved);
+ printk(KERN_INFO "Total of %ld pages in page table cache\n",
+ quicklist_total_size());
+ printk(KERN_INFO "%ld free buffer pages\n", nr_free_buffer_pages());
+}
diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig
index 09ef94a8a7c3..ca4504424dae 100644
--- a/arch/m32r/Kconfig
+++ b/arch/m32r/Kconfig
@@ -277,13 +277,13 @@ config SMP
bool "Symmetric multi-processing support"
---help---
This enables support for systems with more than one CPU. If you have
- a system with only one CPU, like most personal computers, say N. If
- you have a system with more than one CPU, say Y.
+ a system with only one CPU, say N. If you have a system with more
+ than one CPU, say Y.
- If you say N here, the kernel will run on single and multiprocessor
+ If you say N here, the kernel will run on uni- and multiprocessor
machines, but will use only one CPU of a multiprocessor machine. If
you say Y here, the kernel will run on many, but not all,
- singleprocessor machines. On a singleprocessor machine, the kernel
+ uniprocessor machines. On a uniprocessor machine, the kernel
will run faster if you say N here.
People using multiprocessor machines who say Y here should also say
diff --git a/arch/metag/include/asm/fixmap.h b/arch/metag/include/asm/fixmap.h
index 33312751c92b..af621b041739 100644
--- a/arch/metag/include/asm/fixmap.h
+++ b/arch/metag/include/asm/fixmap.h
@@ -51,37 +51,7 @@ enum fixed_addresses {
#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
#define FIXADDR_START ((FIXADDR_TOP - FIXADDR_SIZE) & PMD_MASK)
-#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
-#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
-
-extern void __this_fixmap_does_not_exist(void);
-/*
- * 'index to address' translation. If anyone tries to use the idx
- * directly without tranlation, we catch the bug with a NULL-deference
- * kernel oops. Illegal ranges of incoming indices are caught too.
- */
-static inline unsigned long fix_to_virt(const unsigned int idx)
-{
- /*
- * this branch gets completely eliminated after inlining,
- * except when someone tries to use fixaddr indices in an
- * illegal way. (such as mixing up address types or using
- * out-of-range indices).
- *
- * If it doesn't get removed, the linker will complain
- * loudly with a reasonably clear error message..
- */
- if (idx >= __end_of_fixed_addresses)
- __this_fixmap_does_not_exist();
-
- return __fix_to_virt(idx);
-}
-
-static inline unsigned long virt_to_fix(const unsigned long vaddr)
-{
- BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
- return __virt_to_fix(vaddr);
-}
+#include <asm-generic/fixmap.h>
#define kmap_get_fixmap_pte(vaddr) \
pte_offset_kernel( \
diff --git a/arch/metag/mm/init.c b/arch/metag/mm/init.c
index 3cd6288f65c2..11fa51c89617 100644
--- a/arch/metag/mm/init.c
+++ b/arch/metag/mm/init.c
@@ -204,7 +204,8 @@ static void __init do_init_bootmem(void)
start_pfn = memblock_region_memory_base_pfn(reg);
end_pfn = memblock_region_memory_end_pfn(reg);
memblock_set_node(PFN_PHYS(start_pfn),
- PFN_PHYS(end_pfn - start_pfn), 0);
+ PFN_PHYS(end_pfn - start_pfn),
+ &memblock.memory, 0);
}
/* All of system RAM sits in node 0 for the non-NUMA case */
diff --git a/arch/metag/mm/numa.c b/arch/metag/mm/numa.c
index b172aa45fcf8..67b46c295072 100644
--- a/arch/metag/mm/numa.c
+++ b/arch/metag/mm/numa.c
@@ -42,7 +42,8 @@ void __init setup_bootmem_node(int nid, unsigned long start, unsigned long end)
memblock_add(start, end - start);
memblock_set_node(PFN_PHYS(start_pfn),
- PFN_PHYS(end_pfn - start_pfn), nid);
+ PFN_PHYS(end_pfn - start_pfn),
+ &memblock.memory, nid);
/* Node-local pgdat */
pgdat_paddr = memblock_alloc_base(sizeof(struct pglist_data),
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index e23cccde9c27..8d581ab06c5d 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -30,6 +30,7 @@ config MICROBLAZE
select MODULES_USE_ELF_RELA
select CLONE_BACKWARDS3
select CLKSRC_OF
+ select BUILDTIME_EXTABLE_SORT
config SWAP
def_bool n
diff --git a/arch/microblaze/include/asm/fixmap.h b/arch/microblaze/include/asm/fixmap.h
index f2b312e10b10..06c0e2b1883f 100644
--- a/arch/microblaze/include/asm/fixmap.h
+++ b/arch/microblaze/include/asm/fixmap.h
@@ -58,52 +58,12 @@ enum fixed_addresses {
extern void __set_fixmap(enum fixed_addresses idx,
phys_addr_t phys, pgprot_t flags);
-#define set_fixmap(idx, phys) \
- __set_fixmap(idx, phys, PAGE_KERNEL)
-/*
- * Some hardware wants to get fixmapped without caching.
- */
-#define set_fixmap_nocache(idx, phys) \
- __set_fixmap(idx, phys, PAGE_KERNEL_CI)
-
-#define clear_fixmap(idx) \
- __set_fixmap(idx, 0, __pgprot(0))
-
#define __FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
#define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE)
-#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
-#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
-
-extern void __this_fixmap_does_not_exist(void);
-
-/*
- * 'index to address' translation. If anyone tries to use the idx
- * directly without tranlation, we catch the bug with a NULL-deference
- * kernel oops. Illegal ranges of incoming indices are caught too.
- */
-static __always_inline unsigned long fix_to_virt(const unsigned int idx)
-{
- /*
- * this branch gets completely eliminated after inlining,
- * except when someone tries to use fixaddr indices in an
- * illegal way. (such as mixing up address types or using
- * out-of-range indices).
- *
- * If it doesn't get removed, the linker will complain
- * loudly with a reasonably clear error message..
- */
- if (idx >= __end_of_fixed_addresses)
- __this_fixmap_does_not_exist();
-
- return __fix_to_virt(idx);
-}
+#define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_CI
-static inline unsigned long virt_to_fix(const unsigned long vaddr)
-{
- BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
- return __virt_to_fix(vaddr);
-}
+#include <asm-generic/fixmap.h>
#endif /* !__ASSEMBLY__ */
#endif
diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c
index 74c7bcc1e82d..89077d346714 100644
--- a/arch/microblaze/mm/init.c
+++ b/arch/microblaze/mm/init.c
@@ -192,7 +192,8 @@ void __init setup_memory(void)
start_pfn = memblock_region_memory_base_pfn(reg);
end_pfn = memblock_region_memory_end_pfn(reg);
memblock_set_node(start_pfn << PAGE_SHIFT,
- (end_pfn - start_pfn) << PAGE_SHIFT, 0);
+ (end_pfn - start_pfn) << PAGE_SHIFT,
+ &memblock.memory, 0);
}
/* free bootmem is whole main memory */
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 35aca20721f5..dc37faabb6c4 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -2161,13 +2161,13 @@ config SMP
depends on SYS_SUPPORTS_SMP
help
This enables support for systems with more than one CPU. If you have
- a system with only one CPU, like most personal computers, say N. If
- you have a system with more than one CPU, say Y.
+ a system with only one CPU, say N. If you have a system with more
+ than one CPU, say Y.
- If you say N here, the kernel will run on single and multiprocessor
+ If you say N here, the kernel will run on uni- and multiprocessor
machines, but will use only one CPU of a multiprocessor machine. If
you say Y here, the kernel will run on many, but not all,
- singleprocessor machines. On a singleprocessor machine, the kernel
+ uniprocessor machines. On a uniprocessor machine, the kernel
will run faster if you say N here.
People using multiprocessor machines who say Y here should also say
@@ -2479,7 +2479,7 @@ source "drivers/pcmcia/Kconfig"
source "drivers/pci/hotplug/Kconfig"
config RAPIDIO
- bool "RapidIO support"
+ tristate "RapidIO support"
depends on PCI
default n
help
diff --git a/arch/mips/include/asm/fixmap.h b/arch/mips/include/asm/fixmap.h
index dfaaf493e9d4..8c012af2f451 100644
--- a/arch/mips/include/asm/fixmap.h
+++ b/arch/mips/include/asm/fixmap.h
@@ -71,38 +71,7 @@ enum fixed_addresses {
#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
-#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
-#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
-
-extern void __this_fixmap_does_not_exist(void);
-
-/*
- * 'index to address' translation. If anyone tries to use the idx
- * directly without tranlation, we catch the bug with a NULL-deference
- * kernel oops. Illegal ranges of incoming indices are caught too.
- */
-static inline unsigned long fix_to_virt(const unsigned int idx)
-{
- /*
- * this branch gets completely eliminated after inlining,
- * except when someone tries to use fixaddr indices in an
- * illegal way. (such as mixing up address types or using
- * out-of-range indices).
- *
- * If it doesn't get removed, the linker will complain
- * loudly with a reasonably clear error message..
- */
- if (idx >= __end_of_fixed_addresses)
- __this_fixmap_does_not_exist();
-
- return __fix_to_virt(idx);
-}
-
-static inline unsigned long virt_to_fix(const unsigned long vaddr)
-{
- BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
- return __virt_to_fix(vaddr);
-}
+#include <asm-generic/fixmap.h>
#define kmap_get_fixmap_pte(vaddr) \
pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr)), (vaddr))
diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig
index 8bde9237d13b..a648de1b1096 100644
--- a/arch/mn10300/Kconfig
+++ b/arch/mn10300/Kconfig
@@ -184,13 +184,13 @@ config SMP
depends on MN10300_PROC_MN2WS0038 || MN10300_PROC_MN2WS0050
---help---
This enables support for systems with more than one CPU. If you have
- a system with only one CPU, like most personal computers, say N. If
- you have a system with more than one CPU, say Y.
+ a system with only one CPU, say N. If you have a system with more
+ than one CPU, say Y.
- If you say N here, the kernel will run on single and multiprocessor
+ If you say N here, the kernel will run on uni- and multiprocessor
machines, but will use only one CPU of a multiprocessor machine. If
you say Y here, the kernel will run on many, but not all,
- singleprocessor machines. On a singleprocessor machine, the kernel
+ uniprocessor machines. On a uniprocessor machine, the kernel
will run faster if you say N here.
See also <file:Documentation/x86/i386/IO-APIC.txt>,
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index b5f1858baf33..bb2a8ec440e7 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -229,13 +229,13 @@ config SMP
bool "Symmetric multi-processing support"
---help---
This enables support for systems with more than one CPU. If you have
- a system with only one CPU, like most personal computers, say N. If
- you have a system with more than one CPU, say Y.
+ a system with only one CPU, say N. If you have a system with more
+ than one CPU, say Y.
- If you say N here, the kernel will run on single and multiprocessor
+ If you say N here, the kernel will run on uni- and multiprocessor
machines, but will use only one CPU of a multiprocessor machine. If
you say Y here, the kernel will run on many, but not all,
- singleprocessor machines. On a singleprocessor machine, the kernel
+ uniprocessor machines. On a uniprocessor machine, the kernel
will run faster if you say N here.
See also <file:Documentation/nmi_watchdog.txt> and the SMP-HOWTO
diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index 96f8168cf4ec..ae085ad0fba0 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -645,55 +645,30 @@ EXPORT_SYMBOL(empty_zero_page);
void show_mem(unsigned int filter)
{
- int i,free = 0,total = 0,reserved = 0;
- int shared = 0, cached = 0;
+ int total = 0,reserved = 0;
+ pg_data_t *pgdat;
printk(KERN_INFO "Mem-info:\n");
show_free_areas(filter);
- if (filter & SHOW_MEM_FILTER_PAGE_COUNT)
- return;
-#ifndef CONFIG_DISCONTIGMEM
- i = max_mapnr;
- while (i-- > 0) {
- total++;
- if (PageReserved(mem_map+i))
- reserved++;
- else if (PageSwapCache(mem_map+i))
- cached++;
- else if (!page_count(&mem_map[i]))
- free++;
- else
- shared += page_count(&mem_map[i]) - 1;
- }
-#else
- for (i = 0; i < npmem_ranges; i++) {
- int j;
- for (j = node_start_pfn(i); j < node_end_pfn(i); j++) {
- struct page *p;
- unsigned long flags;
-
- pgdat_resize_lock(NODE_DATA(i), &flags);
- p = nid_page_nr(i, j) - node_start_pfn(i);
-
- total++;
- if (PageReserved(p))
- reserved++;
- else if (PageSwapCache(p))
- cached++;
- else if (!page_count(p))
- free++;
- else
- shared += page_count(p) - 1;
- pgdat_resize_unlock(NODE_DATA(i), &flags);
- }
+ for_each_online_pgdat(pgdat) {
+ unsigned long flags;
+ int zoneid;
+
+ pgdat_resize_lock(pgdat, &flags);
+ for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
+ struct zone *zone = &pgdat->node_zones[zoneid];
+ if (!populated_zone(zone))
+ continue;
+
+ total += zone->present_pages;
+ reserved = zone->present_pages - zone->managed_pages;
+ }
+ pgdat_resize_unlock(pgdat, &flags);
}
-#endif
+
printk(KERN_INFO "%d pages of RAM\n", total);
printk(KERN_INFO "%d reserved pages\n", reserved);
- printk(KERN_INFO "%d pages shared\n", shared);
- printk(KERN_INFO "%d pages swap cached\n", cached);
-
#ifdef CONFIG_DISCONTIGMEM
{
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 7c3d97d8da70..f58aefe7b6e0 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -797,7 +797,7 @@ config HAS_RAPIDIO
default n
config RAPIDIO
- bool "RapidIO support"
+ tristate "RapidIO support"
depends on HAS_RAPIDIO || PCI
help
If you say Y here, the kernel will include drivers and
@@ -805,7 +805,7 @@ config RAPIDIO
config FSL_RIO
bool "Freescale Embedded SRIO Controller support"
- depends on RAPIDIO && HAS_RAPIDIO
+ depends on RAPIDIO = y && HAS_RAPIDIO
default "n"
---help---
Include support for RapidIO controller on Freescale embedded
diff --git a/arch/powerpc/include/asm/fixmap.h b/arch/powerpc/include/asm/fixmap.h
index 5c2c0233175e..90f604bbcd19 100644
--- a/arch/powerpc/include/asm/fixmap.h
+++ b/arch/powerpc/include/asm/fixmap.h
@@ -58,52 +58,12 @@ enum fixed_addresses {
extern void __set_fixmap (enum fixed_addresses idx,
phys_addr_t phys, pgprot_t flags);
-#define set_fixmap(idx, phys) \
- __set_fixmap(idx, phys, PAGE_KERNEL)
-/*
- * Some hardware wants to get fixmapped without caching.
- */
-#define set_fixmap_nocache(idx, phys) \
- __set_fixmap(idx, phys, PAGE_KERNEL_NCG)
-
-#define clear_fixmap(idx) \
- __set_fixmap(idx, 0, __pgprot(0))
-
#define __FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
#define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE)
-#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
-#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
-
-extern void __this_fixmap_does_not_exist(void);
-
-/*
- * 'index to address' translation. If anyone tries to use the idx
- * directly without tranlation, we catch the bug with a NULL-deference
- * kernel oops. Illegal ranges of incoming indices are caught too.
- */
-static __always_inline unsigned long fix_to_virt(const unsigned int idx)
-{
- /*
- * this branch gets completely eliminated after inlining,
- * except when someone tries to use fixaddr indices in an
- * illegal way. (such as mixing up address types or using
- * out-of-range indices).
- *
- * If it doesn't get removed, the linker will complain
- * loudly with a reasonably clear error message..
- */
- if (idx >= __end_of_fixed_addresses)
- __this_fixmap_does_not_exist();
-
- return __fix_to_virt(idx);
-}
+#define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_NCG
-static inline unsigned long virt_to_fix(const unsigned long vaddr)
-{
- BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
- return __virt_to_fix(vaddr);
-}
+#include <asm-generic/fixmap.h>
#endif /* !__ASSEMBLY__ */
#endif
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index b1c5734bc2ce..4b5cd5c2594d 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -209,7 +209,7 @@ void __init do_init_bootmem(void)
/* Place all memblock_regions in the same node and merge contiguous
* memblock_regions
*/
- memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
+ memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
/* Add all physical memory to the bootmem map, mark each area
* present.
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 4f50c6a9e68f..86a63de072c6 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -720,7 +720,8 @@ static void __init parse_drconf_memory(struct device_node *memory)
node_set_online(nid);
sz = numa_enforce_memory_limit(base, size);
if (sz)
- memblock_set_node(base, sz, nid);
+ memblock_set_node(base, sz,
+ &memblock.memory, nid);
} while (--ranges);
}
}
@@ -810,7 +811,7 @@ new_range:
continue;
}
- memblock_set_node(start, size, nid);
+ memblock_set_node(start, size, &memblock.memory, nid);
if (--ranges)
goto new_range;
@@ -847,7 +848,8 @@ static void __init setup_nonnuma(void)
fake_numa_create_new_node(end_pfn, &nid);
memblock_set_node(PFN_PHYS(start_pfn),
- PFN_PHYS(end_pfn - start_pfn), nid);
+ PFN_PHYS(end_pfn - start_pfn),
+ &memblock.memory, nid);
node_set_online(nid);
}
}
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index e9f312532526..4f858f77d870 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -334,10 +334,10 @@ config SMP
a system with only one CPU, like most personal computers, say N. If
you have a system with more than one CPU, say Y.
- If you say N here, the kernel will run on single and multiprocessor
+ If you say N here, the kernel will run on uni- and multiprocessor
machines, but will use only one CPU of a multiprocessor machine. If
you say Y here, the kernel will run on many, but not all,
- singleprocessor machines. On a singleprocessor machine, the kernel
+ uniprocessor machines. On a uniprocessor machine, the kernel
will run faster if you say N here.
See also the SMP-HOWTO available at
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index ce298317a73e..6357710753d5 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -701,13 +701,13 @@ config SMP
depends on SYS_SUPPORTS_SMP
---help---
This enables support for systems with more than one CPU. If you have
- a system with only one CPU, like most personal computers, say N. If
- you have a system with more than one CPU, say Y.
+ a system with only one CPU, say N. If you have a system with more
+ than one CPU, say Y.
- If you say N here, the kernel will run on single and multiprocessor
+ If you say N here, the kernel will run on uni- and multiprocessor
machines, but will use only one CPU of a multiprocessor machine. If
you say Y here, the kernel will run on many, but not all,
- singleprocessor machines. On a singleprocessor machine, the kernel
+ uniprocessor machines. On a uniprocessor machine, the kernel
will run faster if you say N here.
People using multiprocessor machines who say Y here should also say
diff --git a/arch/sh/include/asm/fixmap.h b/arch/sh/include/asm/fixmap.h
index cbe0186b6794..4daf91c3b725 100644
--- a/arch/sh/include/asm/fixmap.h
+++ b/arch/sh/include/asm/fixmap.h
@@ -79,13 +79,6 @@ extern void __set_fixmap(enum fixed_addresses idx,
unsigned long phys, pgprot_t flags);
extern void __clear_fixmap(enum fixed_addresses idx, pgprot_t flags);
-#define set_fixmap(idx, phys) \
- __set_fixmap(idx, phys, PAGE_KERNEL)
-/*
- * Some hardware wants to get fixmapped without caching.
- */
-#define set_fixmap_nocache(idx, phys) \
- __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
/*
* used by vmalloc.c.
*
@@ -101,36 +94,8 @@ extern void __clear_fixmap(enum fixed_addresses idx, pgprot_t flags);
#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
-#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
-#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
-
-extern void __this_fixmap_does_not_exist(void);
-
-/*
- * 'index to address' translation. If anyone tries to use the idx
- * directly without tranlation, we catch the bug with a NULL-deference
- * kernel oops. Illegal ranges of incoming indices are caught too.
- */
-static inline unsigned long fix_to_virt(const unsigned int idx)
-{
- /*
- * this branch gets completely eliminated after inlining,
- * except when someone tries to use fixaddr indices in an
- * illegal way. (such as mixing up address types or using
- * out-of-range indices).
- *
- * If it doesn't get removed, the linker will complain
- * loudly with a reasonably clear error message..
- */
- if (idx >= __end_of_fixed_addresses)
- __this_fixmap_does_not_exist();
+#define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_NOCACHE
- return __fix_to_virt(idx);
-}
+#include <asm-generic/fixmap.h>
-static inline unsigned long virt_to_fix(const unsigned long vaddr)
-{
- BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
- return __virt_to_fix(vaddr);
-}
#endif
diff --git a/arch/sh/kernel/dwarf.c b/arch/sh/kernel/dwarf.c
index 49c09c7d5b77..67a049e75ec1 100644
--- a/arch/sh/kernel/dwarf.c
+++ b/arch/sh/kernel/dwarf.c
@@ -995,29 +995,19 @@ static struct unwinder dwarf_unwinder = {
static void dwarf_unwinder_cleanup(void)
{
- struct rb_node **fde_rb_node = &fde_root.rb_node;
- struct rb_node **cie_rb_node = &cie_root.rb_node;
+ struct dwarf_fde *fde, *next_fde;
+ struct dwarf_cie *cie, *next_cie;
/*
* Deallocate all the memory allocated for the DWARF unwinder.
* Traverse all the FDE/CIE lists and remove and free all the
* memory associated with those data structures.
*/
- while (*fde_rb_node) {
- struct dwarf_fde *fde;
-
- fde = rb_entry(*fde_rb_node, struct dwarf_fde, node);
- rb_erase(*fde_rb_node, &fde_root);
+ rbtree_postorder_for_each_entry_safe(fde, next_fde, &fde_root, node)
kfree(fde);
- }
- while (*cie_rb_node) {
- struct dwarf_cie *cie;
-
- cie = rb_entry(*cie_rb_node, struct dwarf_cie, node);
- rb_erase(*cie_rb_node, &cie_root);
+ rbtree_postorder_for_each_entry_safe(cie, next_cie, &cie_root, node)
kfree(cie);
- }
kmem_cache_destroy(dwarf_reg_cachep);
kmem_cache_destroy(dwarf_frame_cachep);
diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c
index 1cf90e947dbf..de19cfa768f2 100644
--- a/arch/sh/kernel/setup.c
+++ b/arch/sh/kernel/setup.c
@@ -230,8 +230,8 @@ void __init __add_active_range(unsigned int nid, unsigned long start_pfn,
pmb_bolt_mapping((unsigned long)__va(start), start, end - start,
PAGE_KERNEL);
- memblock_set_node(PFN_PHYS(start_pfn),
- PFN_PHYS(end_pfn - start_pfn), nid);
+ memblock_set_node(PFN_PHYS(start_pfn), PFN_PHYS(end_pfn - start_pfn),
+ &memblock.memory, nid);
}
void __init __weak plat_early_device_setup(void)
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 034a680ffec2..c51efdcd07a2 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -153,10 +153,10 @@ config SMP
a system with only one CPU, say N. If you have a system with more
than one CPU, say Y.
- If you say N here, the kernel will run on single and multiprocessor
+ If you say N here, the kernel will run on uni- and multiprocessor
machines, but will use only one CPU of a multiprocessor machine. If
you say Y here, the kernel will run on many, but not all,
- singleprocessor machines. On a singleprocessor machine, the kernel
+ uniprocessor machines. On a uniprocessor machine, the kernel
will run faster if you say N here.
People using multiprocessor machines who say Y here should also say
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 5322e530d09c..eafbc65c9c47 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -1021,7 +1021,8 @@ static void __init add_node_ranges(void)
"start[%lx] end[%lx]\n",
nid, start, this_end);
- memblock_set_node(start, this_end - start, nid);
+ memblock_set_node(start, this_end - start,
+ &memblock.memory, nid);
start = this_end;
}
}
@@ -1325,7 +1326,7 @@ static void __init bootmem_init_nonnuma(void)
(top_of_ram - total_ram) >> 20);
init_node_masks_nonnuma();
- memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
+ memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
allocate_node_data(0);
node_set_online(0);
}
diff --git a/arch/tile/include/asm/fixmap.h b/arch/tile/include/asm/fixmap.h
index c6b9c1b38fd1..ffe2637aeb31 100644
--- a/arch/tile/include/asm/fixmap.h
+++ b/arch/tile/include/asm/fixmap.h
@@ -25,9 +25,6 @@
#include <asm/kmap_types.h>
#endif
-#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
-#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
-
/*
* Here we define all the compile-time 'special' virtual
* addresses. The point is to have a constant address at
@@ -83,35 +80,7 @@ enum fixed_addresses {
#define FIXADDR_START (FIXADDR_TOP + PAGE_SIZE - __FIXADDR_SIZE)
#define FIXADDR_BOOT_START (FIXADDR_TOP + PAGE_SIZE - __FIXADDR_BOOT_SIZE)
-extern void __this_fixmap_does_not_exist(void);
-
-/*
- * 'index to address' translation. If anyone tries to use the idx
- * directly without tranlation, we catch the bug with a NULL-deference
- * kernel oops. Illegal ranges of incoming indices are caught too.
- */
-static __always_inline unsigned long fix_to_virt(const unsigned int idx)
-{
- /*
- * this branch gets completely eliminated after inlining,
- * except when someone tries to use fixaddr indices in an
- * illegal way. (such as mixing up address types or using
- * out-of-range indices).
- *
- * If it doesn't get removed, the linker will complain
- * loudly with a reasonably clear error message..
- */
- if (idx >= __end_of_fixed_addresses)
- __this_fixmap_does_not_exist();
-
- return __fix_to_virt(idx);
-}
-
-static inline unsigned long virt_to_fix(const unsigned long vaddr)
-{
- BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
- return __virt_to_fix(vaddr);
-}
+#include <asm-generic/fixmap.h>
#endif /* !__ASSEMBLY__ */
diff --git a/arch/um/include/asm/fixmap.h b/arch/um/include/asm/fixmap.h
index 21a423bae5e8..3094ea3c73b0 100644
--- a/arch/um/include/asm/fixmap.h
+++ b/arch/um/include/asm/fixmap.h
@@ -43,13 +43,6 @@ enum fixed_addresses {
extern void __set_fixmap (enum fixed_addresses idx,
unsigned long phys, pgprot_t flags);
-#define set_fixmap(idx, phys) \
- __set_fixmap(idx, phys, PAGE_KERNEL)
-/*
- * Some hardware wants to get fixmapped without caching.
- */
-#define set_fixmap_nocache(idx, phys) \
- __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
/*
* used by vmalloc.c.
*
@@ -62,37 +55,6 @@ extern void __set_fixmap (enum fixed_addresses idx,
#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
-#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
-#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
-
-extern void __this_fixmap_does_not_exist(void);
-
-/*
- * 'index to address' translation. If anyone tries to use the idx
- * directly without tranlation, we catch the bug with a NULL-deference
- * kernel oops. Illegal ranges of incoming indices are caught too.
- */
-static inline unsigned long fix_to_virt(const unsigned int idx)
-{
- /*
- * this branch gets completely eliminated after inlining,
- * except when someone tries to use fixaddr indices in an
- * illegal way. (such as mixing up address types or using
- * out-of-range indices).
- *
- * If it doesn't get removed, the linker will complain
- * loudly with a reasonably clear error message..
- */
- if (idx >= __end_of_fixed_addresses)
- __this_fixmap_does_not_exist();
-
- return __fix_to_virt(idx);
-}
-
-static inline unsigned long virt_to_fix(const unsigned long vaddr)
-{
- BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
- return __virt_to_fix(vaddr);
-}
+#include <asm-generic/fixmap.h>
#endif
diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c
index ae6bc036db92..be2bde9b07cf 100644
--- a/arch/unicore32/mm/init.c
+++ b/arch/unicore32/mm/init.c
@@ -66,9 +66,6 @@ void show_mem(unsigned int filter)
printk(KERN_DEFAULT "Mem-info:\n");
show_free_areas(filter);
- if (filter & SHOW_MEM_FILTER_PAGE_COUNT)
- return;
-
for_each_bank(i, mi) {
struct membank *bank = &mi->bank[i];
unsigned int pfn1, pfn2;
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index beff90683a1b..9421881d52df 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -280,13 +280,13 @@ config SMP
bool "Symmetric multi-processing support"
---help---
This enables support for systems with more than one CPU. If you have
- a system with only one CPU, like most personal computers, say N. If
- you have a system with more than one CPU, say Y.
+ a system with only one CPU, say N. If you have a system with more
+ than one CPU, say Y.
- If you say N here, the kernel will run on single and multiprocessor
+ If you say N here, the kernel will run on uni- and multiprocessor
machines, but will use only one CPU of a multiprocessor machine. If
you say Y here, the kernel will run on many, but not all,
- singleprocessor machines. On a singleprocessor machine, the kernel
+ uniprocessor machines. On a uniprocessor machine, the kernel
will run faster if you say N here.
Note that if you say Y here and choose architecture "586" or
@@ -748,6 +748,7 @@ config APB_TIMER
# The code disables itself when not needed.
config DMI
default y
+ select DMI_SCAN_MACHINE_NON_EFI_FALLBACK
bool "Enable DMI scanning" if EXPERT
---help---
Enabled scanning of DMI to identify machine quirks. Say Y
diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h
index fd8f9e2ca35f..535192f6bfad 100644
--- a/arch/x86/include/asm/dmi.h
+++ b/arch/x86/include/asm/dmi.h
@@ -13,7 +13,9 @@ static __always_inline __init void *dmi_alloc(unsigned len)
}
/* Use early IO mappings for DMI because it's initialized early */
-#define dmi_ioremap early_ioremap
-#define dmi_iounmap early_iounmap
+#define dmi_early_remap early_ioremap
+#define dmi_early_unmap early_iounmap
+#define dmi_remap ioremap
+#define dmi_unmap iounmap
#endif /* _ASM_X86_DMI_H */
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index e846225265ed..7252cd339175 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -175,64 +175,7 @@ static inline void __set_fixmap(enum fixed_addresses idx,
}
#endif
-#define set_fixmap(idx, phys) \
- __set_fixmap(idx, phys, PAGE_KERNEL)
-
-/*
- * Some hardware wants to get fixmapped without caching.
- */
-#define set_fixmap_nocache(idx, phys) \
- __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
-
-#define clear_fixmap(idx) \
- __set_fixmap(idx, 0, __pgprot(0))
-
-#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
-#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
-
-extern void __this_fixmap_does_not_exist(void);
-
-/*
- * 'index to address' translation. If anyone tries to use the idx
- * directly without translation, we catch the bug with a NULL-deference
- * kernel oops. Illegal ranges of incoming indices are caught too.
- */
-static __always_inline unsigned long fix_to_virt(const unsigned int idx)
-{
- /*
- * this branch gets completely eliminated after inlining,
- * except when someone tries to use fixaddr indices in an
- * illegal way. (such as mixing up address types or using
- * out-of-range indices).
- *
- * If it doesn't get removed, the linker will complain
- * loudly with a reasonably clear error message..
- */
- if (idx >= __end_of_fixed_addresses)
- __this_fixmap_does_not_exist();
-
- return __fix_to_virt(idx);
-}
-
-static inline unsigned long virt_to_fix(const unsigned long vaddr)
-{
- BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
- return __virt_to_fix(vaddr);
-}
-
-/* Return an pointer with offset calculated */
-static __always_inline unsigned long
-__set_fixmap_offset(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags)
-{
- __set_fixmap(idx, phys, flags);
- return fix_to_virt(idx) + (phys & (PAGE_SIZE - 1));
-}
-
-#define set_fixmap_offset(idx, phys) \
- __set_fixmap_offset(idx, phys, PAGE_KERNEL)
-
-#define set_fixmap_offset_nocache(idx, phys) \
- __set_fixmap_offset(idx, phys, PAGE_KERNEL_NOCACHE)
+#include <asm-generic/fixmap.h>
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_FIXMAP_H */
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index f97fbe3abb67..2f59cce3b38a 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -51,9 +51,9 @@ extern int devmem_is_allowed(unsigned long pagenr);
extern unsigned long max_low_pfn_mapped;
extern unsigned long max_pfn_mapped;
-static inline phys_addr_t get_max_mapped(void)
+static inline phys_addr_t get_max_low_mapped(void)
{
- return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT;
+ return (phys_addr_t)max_low_pfn_mapped << PAGE_SHIFT;
}
bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn);
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index e6d90babc245..04905bfc508b 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -62,7 +62,7 @@ static inline void __flush_tlb_all(void)
static inline void __flush_tlb_one(unsigned long addr)
{
- count_vm_event(NR_TLB_LOCAL_FLUSH_ONE);
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
__flush_tlb_single(addr);
}
@@ -93,13 +93,13 @@ static inline void __flush_tlb_one(unsigned long addr)
*/
static inline void __flush_tlb_up(void)
{
- count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
__flush_tlb();
}
static inline void flush_tlb_all(void)
{
- count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
__flush_tlb_all();
}
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index e2dbcb7dabdd..83a7995625a6 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -91,7 +91,7 @@ void __init setup_bios_corruption_check(void)
corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
- for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) {
+ for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) {
start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE),
PAGE_SIZE, corruption_check_size);
end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE),
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 4a48e8bbd857..1ac871d53912 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -757,10 +757,7 @@ static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
static void cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c)
{
- tlb_flushall_shift = 5;
-
- if (c->x86 <= 0x11)
- tlb_flushall_shift = 4;
+ tlb_flushall_shift = 6;
}
static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 3db61c644e44..5cd9bfabd645 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -640,21 +640,17 @@ static void intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c)
case 0x61d: /* six-core 45 nm xeon "Dunnington" */
tlb_flushall_shift = -1;
break;
+ case 0x63a: /* Ivybridge */
+ tlb_flushall_shift = 2;
+ break;
case 0x61a: /* 45 nm nehalem, "Bloomfield" */
case 0x61e: /* 45 nm nehalem, "Lynnfield" */
case 0x625: /* 32 nm nehalem, "Clarkdale" */
case 0x62c: /* 32 nm nehalem, "Gulftown" */
case 0x62e: /* 45 nm nehalem-ex, "Beckton" */
case 0x62f: /* 32 nm Xeon E7 */
- tlb_flushall_shift = 6;
- break;
case 0x62a: /* SandyBridge */
case 0x62d: /* SandyBridge, "Romely-EP" */
- tlb_flushall_shift = 5;
- break;
- case 0x63a: /* Ivybridge */
- tlb_flushall_shift = 1;
- break;
default:
tlb_flushall_shift = 6;
}
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index ce2d0a2c3e4f..0e25a1bc5ab5 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -683,7 +683,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
}
/* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
- count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
__flush_tlb();
/* Save MTRR state */
@@ -697,7 +697,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
static void post_set(void) __releases(set_atomicity_lock)
{
/* Flush TLBs (no need to flush caches - they are disabled) */
- count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
__flush_tlb();
/* Intel (P6) standard MTRRs */
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 174da5fc5a7b..988c00a1f60d 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1120,7 +1120,7 @@ void __init memblock_find_dma_reserve(void)
nr_pages += end_pfn - start_pfn;
}
- for_each_free_mem_range(u, MAX_NUMNODES, &start, &end, NULL) {
+ for_each_free_mem_range(u, NUMA_NO_NODE, &start, &end, NULL) {
start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN);
end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN);
if (start_pfn < end_pfn)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 06853e670354..c9675594d7ca 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1119,7 +1119,7 @@ void __init setup_arch(char **cmdline_p)
setup_real_mode();
- memblock_set_current_limit(get_max_mapped());
+ memblock_set_current_limit(get_max_low_mapped());
dma_contiguous_reserve(0);
/*
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 0596e8e0cc19..207d9aef662d 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -108,8 +108,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
static inline void get_head_page_multiple(struct page *page, int nr)
{
- VM_BUG_ON(page != compound_head(page));
- VM_BUG_ON(page_count(page) == 0);
+ VM_BUG_ON_PAGE(page != compound_head(page), page);
+ VM_BUG_ON_PAGE(page_count(page) == 0, page);
atomic_add(nr, &page->_count);
SetPageReferenced(page);
}
@@ -135,7 +135,7 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
head = pte_page(pte);
page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
do {
- VM_BUG_ON(compound_head(page) != head);
+ VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
if (PageTail(page))
get_huge_page_tail(page);
@@ -212,7 +212,7 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
head = pte_page(pte);
page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
do {
- VM_BUG_ON(compound_head(page) != head);
+ VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
if (PageTail(page))
get_huge_page_tail(page);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 5bdc5430597c..e39504878aec 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -665,7 +665,7 @@ void __init initmem_init(void)
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
#endif
- memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
+ memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
sparse_memory_present_with_active_regions(0);
#ifdef CONFIG_FLATMEM
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 104d56a9245f..f35c66c5959a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -643,7 +643,7 @@ kernel_physical_mapping_init(unsigned long start,
#ifndef CONFIG_NUMA
void __init initmem_init(void)
{
- memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
+ memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
}
#endif
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 8dabbed409ee..1e9da795767a 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -74,7 +74,7 @@ static void __init do_one_pass(u64 pattern, u64 start, u64 end)
u64 i;
phys_addr_t this_start, this_end;
- for_each_free_mem_range(i, MAX_NUMNODES, &this_start, &this_end, NULL) {
+ for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) {
this_start = clamp_t(phys_addr_t, this_start, start, end);
this_end = clamp_t(phys_addr_t, this_end, start, end);
if (this_start < this_end) {
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index c85da7bb6b60..81b2750f3666 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -491,7 +491,16 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
for (i = 0; i < mi->nr_blks; i++) {
struct numa_memblk *mb = &mi->blk[i];
- memblock_set_node(mb->start, mb->end - mb->start, mb->nid);
+ memblock_set_node(mb->start, mb->end - mb->start,
+ &memblock.memory, mb->nid);
+
+ /*
+ * At this time, all memory regions reserved by memblock are
+ * used by the kernel. Set the nid in memblock.reserved will
+ * mark out all the nodes the kernel resides in.
+ */
+ memblock_set_node(mb->start, mb->end - mb->start,
+ &memblock.reserved, mb->nid);
}
/*
@@ -553,6 +562,30 @@ static void __init numa_init_array(void)
}
}
+static void __init numa_clear_kernel_node_hotplug(void)
+{
+ int i, nid;
+ nodemask_t numa_kernel_nodes;
+ unsigned long start, end;
+ struct memblock_type *type = &memblock.reserved;
+
+ /* Mark all kernel nodes. */
+ for (i = 0; i < type->cnt; i++)
+ node_set(type->regions[i].nid, numa_kernel_nodes);
+
+ /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */
+ for (i = 0; i < numa_meminfo.nr_blks; i++) {
+ nid = numa_meminfo.blk[i].nid;
+ if (!node_isset(nid, numa_kernel_nodes))
+ continue;
+
+ start = numa_meminfo.blk[i].start;
+ end = numa_meminfo.blk[i].end;
+
+ memblock_clear_hotplug(start, end - start);
+ }
+}
+
static int __init numa_init(int (*init_func)(void))
{
int i;
@@ -565,7 +598,12 @@ static int __init numa_init(int (*init_func)(void))
nodes_clear(node_possible_map);
nodes_clear(node_online_map);
memset(&numa_meminfo, 0, sizeof(numa_meminfo));
- WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES));
+ WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory,
+ MAX_NUMNODES));
+ WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved,
+ MAX_NUMNODES));
+ /* In case that parsing SRAT failed. */
+ WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX));
numa_reset_distance();
ret = init_func();
@@ -601,6 +639,16 @@ static int __init numa_init(int (*init_func)(void))
numa_clear_node(i);
}
numa_init_array();
+
+ /*
+ * At very early time, the kernel have to use some memory such as
+ * loading the kernel image. We cannot prevent this anyway. So any
+ * node the kernel resides in should be un-hotpluggable.
+ *
+ * And when we come here, numa_init() won't fail.
+ */
+ numa_clear_kernel_node_hotplug();
+
return 0;
}
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 266ca912f62e..2f78a38ac281 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -42,15 +42,25 @@ static __init inline int srat_disabled(void)
return acpi_numa < 0;
}
-/* Callback for SLIT parsing */
+/**
+ * Callback for SLIT parsing. pxm_to_node() returns NUMA_NO_NODE for
+ * I/O localities since SRAT does not list them. I/O localities are
+ * not supported at this point.
+ */
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
{
int i, j;
- for (i = 0; i < slit->locality_count; i++)
- for (j = 0; j < slit->locality_count; j++)
+ for (i = 0; i < slit->locality_count; i++) {
+ if (pxm_to_node(i) == NUMA_NO_NODE)
+ continue;
+ for (j = 0; j < slit->locality_count; j++) {
+ if (pxm_to_node(j) == NUMA_NO_NODE)
+ continue;
numa_set_distance(pxm_to_node(i), pxm_to_node(j),
slit->entry[slit->locality_count * i + j]);
+ }
+ }
}
/* Callback for Proximity Domain -> x2APIC mapping */
@@ -181,6 +191,11 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
(unsigned long long) start, (unsigned long long) end - 1,
hotpluggable ? " hotplug" : "");
+ /* Mark hotplug range in memblock. */
+ if (hotpluggable && memblock_mark_hotplug(start, ma->length))
+ pr_warn("SRAT: Failed to mark hotplug range [mem %#010Lx-%#010Lx] in memblock\n",
+ (unsigned long long)start, (unsigned long long)end - 1);
+
return 0;
out_err_bad_srat:
bad_srat();
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index ae699b3bbac8..dd8dda167a24 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -103,7 +103,7 @@ static void flush_tlb_func(void *info)
if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
return;
- count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
if (f->flush_end == TLB_FLUSH_ALL)
local_flush_tlb();
@@ -131,7 +131,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
info.flush_start = start;
info.flush_end = end;
- count_vm_event(NR_TLB_REMOTE_FLUSH);
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
if (is_uv_system()) {
unsigned int cpu;
@@ -151,44 +151,19 @@ void flush_tlb_current_task(void)
preempt_disable();
- count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
local_flush_tlb();
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
preempt_enable();
}
-/*
- * It can find out the THP large page, or
- * HUGETLB page in tlb_flush when THP disabled
- */
-static inline unsigned long has_large_page(struct mm_struct *mm,
- unsigned long start, unsigned long end)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- unsigned long addr = ALIGN(start, HPAGE_SIZE);
- for (; addr < end; addr += HPAGE_SIZE) {
- pgd = pgd_offset(mm, addr);
- if (likely(!pgd_none(*pgd))) {
- pud = pud_offset(pgd, addr);
- if (likely(!pud_none(*pud))) {
- pmd = pmd_offset(pud, addr);
- if (likely(!pmd_none(*pmd)))
- if (pmd_large(*pmd))
- return addr;
- }
- }
- }
- return 0;
-}
-
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long vmflag)
{
unsigned long addr;
unsigned act_entries, tlb_entries = 0;
+ unsigned long nr_base_pages;
preempt_disable();
if (current->active_mm != mm)
@@ -210,21 +185,20 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
tlb_entries = tlb_lli_4k[ENTRIES];
else
tlb_entries = tlb_lld_4k[ENTRIES];
+
/* Assume all of TLB entries was occupied by this task */
- act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm;
+ act_entries = tlb_entries >> tlb_flushall_shift;
+ act_entries = mm->total_vm > act_entries ? act_entries : mm->total_vm;
+ nr_base_pages = (end - start) >> PAGE_SHIFT;
/* tlb_flushall_shift is on balance point, details in commit log */
- if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) {
- count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+ if (nr_base_pages > act_entries) {
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
local_flush_tlb();
} else {
- if (has_large_page(mm, start, end)) {
- local_flush_tlb();
- goto flush_all;
- }
/* flush range by one by one 'invlpg' */
for (addr = start; addr < end; addr += PAGE_SIZE) {
- count_vm_event(NR_TLB_LOCAL_FLUSH_ONE);
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
__flush_tlb_single(addr);
}
@@ -262,7 +236,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
static void do_flush_tlb_all(void *info)
{
- count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
__flush_tlb_all();
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
leave_mm(smp_processor_id());
@@ -270,7 +244,7 @@ static void do_flush_tlb_all(void *info)
void flush_tlb_all(void)
{
- count_vm_event(NR_TLB_REMOTE_FLUSH);
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
on_each_cpu(do_flush_tlb_all, NULL, 1);
}
diff --git a/block/cmdline-parser.c b/block/cmdline-parser.c
index cc2637f8674e..9dbc67e42a99 100644
--- a/block/cmdline-parser.c
+++ b/block/cmdline-parser.c
@@ -4,8 +4,7 @@
* Written by Cai Zhiyong <caizhiyong@huawei.com>
*
*/
-#include <linux/buffer_head.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/cmdline-parser.h>
static int parse_subpart(struct cmdline_subpart **subpart, char *partdef)
@@ -159,6 +158,7 @@ void cmdline_parts_free(struct cmdline_parts **parts)
*parts = next_parts;
}
}
+EXPORT_SYMBOL(cmdline_parts_free);
int cmdline_parts_parse(struct cmdline_parts **parts, const char *cmdline)
{
@@ -206,6 +206,7 @@ fail:
cmdline_parts_free(parts);
goto done;
}
+EXPORT_SYMBOL(cmdline_parts_parse);
struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts,
const char *bdev)
@@ -214,17 +215,17 @@ struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts,
parts = parts->next_parts;
return parts;
}
+EXPORT_SYMBOL(cmdline_parts_find);
/*
* add_part()
* 0 success.
* 1 can not add so many partitions.
*/
-void cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size,
- int slot,
- int (*add_part)(int, struct cmdline_subpart *, void *),
- void *param)
-
+int cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size,
+ int slot,
+ int (*add_part)(int, struct cmdline_subpart *, void *),
+ void *param)
{
sector_t from = 0;
struct cmdline_subpart *subpart;
@@ -247,4 +248,7 @@ void cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size,
if (add_part(slot, subpart, param))
break;
}
+
+ return slot;
}
+EXPORT_SYMBOL(cmdline_parts_set);
diff --git a/block/genhd.c b/block/genhd.c
index 791f41943132..7bd4372e8b6f 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -849,7 +849,7 @@ static int show_partition(struct seq_file *seqf, void *v)
char buf[BDEVNAME_SIZE];
/* Don't show non-partitionable removeable devices or empty devices */
- if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
+ if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) &&
(sgp->flags & GENHD_FL_REMOVABLE)))
return 0;
if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 86b9f37d102e..9ffa90c6201c 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -368,7 +368,8 @@ config BLK_DEV_RAM
For details, read <file:Documentation/blockdev/ramdisk.txt>.
To compile this driver as a module, choose M here: the
- module will be called rd.
+ module will be called brd. An alias "rd" has been defined
+ for historical reasons.
Most normal users won't need the RAM disk functionality, and can
thus say N here.
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index b35fc4f5237c..036e8ab86c71 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -5004,7 +5004,7 @@ reinit_after_soft_reset:
i = alloc_cciss_hba(pdev);
if (i < 0)
- return -1;
+ return -ENOMEM;
h = hba[i];
h->pdev = pdev;
@@ -5205,7 +5205,7 @@ clean_no_release_regions:
*/
pci_set_drvdata(pdev, NULL);
free_hba(h);
- return -1;
+ return -ENODEV;
}
static void cciss_shutdown(struct pci_dev *pdev)
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 33fde3a39759..66e8c3b94ef3 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -799,7 +799,7 @@ static void loop_config_discard(struct loop_device *lo)
/*
* We use punch hole to reclaim the free space used by the
- * image a.k.a. discard. However we do support discard if
+ * image a.k.a. discard. However we do not support discard if
* encryption is enabled, because it may give an attacker
* useful information.
*/
diff --git a/drivers/block/paride/pg.c b/drivers/block/paride/pg.c
index 4a27b1de5fcb..2ce3dfd7e6b9 100644
--- a/drivers/block/paride/pg.c
+++ b/drivers/block/paride/pg.c
@@ -581,7 +581,7 @@ static ssize_t pg_write(struct file *filp, const char __user *buf, size_t count,
if (hdr.magic != PG_MAGIC)
return -EINVAL;
- if (hdr.dlen > PG_MAX_DATA)
+ if (hdr.dlen < 0 || hdr.dlen > PG_MAX_DATA)
return -EINVAL;
if ((count - hs) > PG_MAX_DATA)
return -EINVAL;
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index 3fb6ab4c8b4e..d5e2d12b9d9e 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -1744,20 +1744,6 @@ static void carm_remove_one (struct pci_dev *pdev)
kfree(host);
pci_release_regions(pdev);
pci_disable_device(pdev);
- pci_set_drvdata(pdev, NULL);
}
-static int __init carm_init(void)
-{
- return pci_register_driver(&carm_driver);
-}
-
-static void __exit carm_exit(void)
-{
- pci_unregister_driver(&carm_driver);
-}
-
-module_init(carm_init);
-module_exit(carm_exit);
-
-
+module_pci_driver(carm_driver);
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index f895a8c8a244..92c5937f80c3 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -22,7 +22,6 @@
#include <linux/device.h>
#include <linux/highmem.h>
#include <linux/backing-dev.h>
-#include <linux/bootmem.h>
#include <linux/splice.h>
#include <linux/pfn.h>
#include <linux/export.h>
diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig
index a6ef6acaa1c8..41983883cef4 100644
--- a/drivers/firmware/Kconfig
+++ b/drivers/firmware/Kconfig
@@ -108,6 +108,9 @@ config DMI_SYSFS
under /sys/firmware/dmi when this option is enabled and
loaded.
+config DMI_SCAN_MACHINE_NON_EFI_FALLBACK
+ bool
+
config ISCSI_IBFT_FIND
bool "iSCSI Boot Firmware Table Attributes"
depends on X86 && ACPI
diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c
index c7e81ff8f3ef..17afc51f3054 100644
--- a/drivers/firmware/dmi_scan.c
+++ b/drivers/firmware/dmi_scan.c
@@ -116,7 +116,7 @@ static int __init dmi_walk_early(void (*decode)(const struct dmi_header *,
{
u8 *buf;
- buf = dmi_ioremap(dmi_base, dmi_len);
+ buf = dmi_early_remap(dmi_base, dmi_len);
if (buf == NULL)
return -1;
@@ -124,7 +124,7 @@ static int __init dmi_walk_early(void (*decode)(const struct dmi_header *,
add_device_randomness(buf, dmi_len);
- dmi_iounmap(buf, dmi_len);
+ dmi_early_unmap(buf, dmi_len);
return 0;
}
@@ -527,18 +527,18 @@ void __init dmi_scan_machine(void)
* needed during early boot. This also means we can
* iounmap the space when we're done with it.
*/
- p = dmi_ioremap(efi.smbios, 32);
+ p = dmi_early_remap(efi.smbios, 32);
if (p == NULL)
goto error;
memcpy_fromio(buf, p, 32);
- dmi_iounmap(p, 32);
+ dmi_early_unmap(p, 32);
if (!dmi_present(buf)) {
dmi_available = 1;
goto out;
}
- } else {
- p = dmi_ioremap(0xF0000, 0x10000);
+ } else if (IS_ENABLED(CONFIG_DMI_SCAN_MACHINE_NON_EFI_FALLBACK)) {
+ p = dmi_early_remap(0xF0000, 0x10000);
if (p == NULL)
goto error;
@@ -554,12 +554,12 @@ void __init dmi_scan_machine(void)
memcpy_fromio(buf + 16, q, 16);
if (!dmi_present(buf)) {
dmi_available = 1;
- dmi_iounmap(p, 0x10000);
+ dmi_early_unmap(p, 0x10000);
goto out;
}
memcpy(buf, buf + 16, 16);
}
- dmi_iounmap(p, 0x10000);
+ dmi_early_unmap(p, 0x10000);
}
error:
pr_info("DMI not present or invalid.\n");
@@ -831,13 +831,13 @@ int dmi_walk(void (*decode)(const struct dmi_header *, void *),
if (!dmi_available)
return -1;
- buf = ioremap(dmi_base, dmi_len);
+ buf = dmi_remap(dmi_base, dmi_len);
if (buf == NULL)
return -1;
dmi_table(buf, dmi_len, dmi_num, decode, private_data);
- iounmap(buf);
+ dmi_unmap(buf);
return 0;
}
EXPORT_SYMBOL_GPL(dmi_walk);
diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c
index e2e04b007e15..17cf96c45f2b 100644
--- a/drivers/firmware/memmap.c
+++ b/drivers/firmware/memmap.c
@@ -324,7 +324,7 @@ int __init firmware_map_add_early(u64 start, u64 end, const char *type)
{
struct firmware_map_entry *entry;
- entry = alloc_bootmem(sizeof(struct firmware_map_entry));
+ entry = memblock_virt_alloc(sizeof(struct firmware_map_entry), 0);
if (WARN_ON(!entry))
return -ENOMEM;
diff --git a/drivers/gpu/drm/cirrus/cirrus_mode.c b/drivers/gpu/drm/cirrus/cirrus_mode.c
index 58dd900083b9..530f78f84dee 100644
--- a/drivers/gpu/drm/cirrus/cirrus_mode.c
+++ b/drivers/gpu/drm/cirrus/cirrus_mode.c
@@ -273,8 +273,8 @@ static int cirrus_crtc_mode_set(struct drm_crtc *crtc,
sr07 |= 0x11;
break;
case 16:
- sr07 |= 0xc1;
- hdr = 0xc0;
+ sr07 |= 0x17;
+ hdr = 0xc1;
break;
case 24:
sr07 |= 0x15;
diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c
index 98a03639b413..a0ba94e87d7c 100644
--- a/drivers/gpu/drm/drm_fb_helper.c
+++ b/drivers/gpu/drm/drm_fb_helper.c
@@ -418,6 +418,14 @@ static void drm_fb_helper_dpms(struct fb_info *info, int dpms_mode)
return;
/*
+ * fbdev->blank can be called from irq context in case of a panic.
+ * Since we already have our own special panic handler which will
+ * restore the fbdev console mode completely, just bail out early.
+ */
+ if (oops_in_progress)
+ return;
+
+ /*
* For each CRTC in this fb, turn the connectors on/off.
*/
drm_modeset_lock_all(dev);
diff --git a/drivers/gpu/drm/gma500/backlight.c b/drivers/gpu/drm/gma500/backlight.c
index 143eba3309c5..ea7dfc59d796 100644
--- a/drivers/gpu/drm/gma500/backlight.c
+++ b/drivers/gpu/drm/gma500/backlight.c
@@ -26,13 +26,13 @@
#include "intel_bios.h"
#include "power.h"
+#ifdef CONFIG_BACKLIGHT_CLASS_DEVICE
static void do_gma_backlight_set(struct drm_device *dev)
{
-#ifdef CONFIG_BACKLIGHT_CLASS_DEVICE
struct drm_psb_private *dev_priv = dev->dev_private;
backlight_update_status(dev_priv->backlight_device);
-#endif
}
+#endif
void gma_backlight_enable(struct drm_device *dev)
{
diff --git a/drivers/gpu/drm/nouveau/nouveau_acpi.c b/drivers/gpu/drm/nouveau/nouveau_acpi.c
index 3c149617cfcb..4ef83df2b246 100644
--- a/drivers/gpu/drm/nouveau/nouveau_acpi.c
+++ b/drivers/gpu/drm/nouveau/nouveau_acpi.c
@@ -61,6 +61,7 @@ bool nouveau_is_v1_dsm(void) {
#define NOUVEAU_DSM_HAS_MUX 0x1
#define NOUVEAU_DSM_HAS_OPT 0x2
+#ifdef CONFIG_VGA_SWITCHEROO
static const char nouveau_dsm_muid[] = {
0xA0, 0xA0, 0x95, 0x9D, 0x60, 0x00, 0x48, 0x4D,
0xB3, 0x4D, 0x7E, 0x5F, 0xEA, 0x12, 0x9F, 0xD4,
@@ -326,6 +327,11 @@ void nouveau_unregister_dsm_handler(void)
if (nouveau_dsm_priv.optimus_detected || nouveau_dsm_priv.dsm_detected)
vga_switcheroo_unregister_handler();
}
+#else
+void nouveau_register_dsm_handler(void) {}
+void nouveau_unregister_dsm_handler(void) {}
+void nouveau_switcheroo_optimus_dsm(void) {}
+#endif
/* retrieve the ROM in 4k blocks */
static int nouveau_rom_call(acpi_handle rom_handle, uint8_t *bios,
diff --git a/drivers/input/Kconfig b/drivers/input/Kconfig
index a11ff74a5127..9eac8de9e8b7 100644
--- a/drivers/input/Kconfig
+++ b/drivers/input/Kconfig
@@ -178,6 +178,15 @@ comment "Input Device Drivers"
source "drivers/input/keyboard/Kconfig"
+config INPUT_LEDS
+ bool "LED Support"
+ depends on LEDS_CLASS = INPUT || LEDS_CLASS = y
+ select LEDS_TRIGGERS
+ default y
+ help
+ This option enables support for LEDs on keyboards managed
+ by the input layer.
+
source "drivers/input/mouse/Kconfig"
source "drivers/input/joystick/Kconfig"
diff --git a/drivers/input/Makefile b/drivers/input/Makefile
index 5ca3f631497f..2ab5f3336da5 100644
--- a/drivers/input/Makefile
+++ b/drivers/input/Makefile
@@ -6,6 +6,9 @@
obj-$(CONFIG_INPUT) += input-core.o
input-core-y := input.o input-compat.o input-mt.o ff-core.o
+ifeq ($(CONFIG_INPUT_LEDS),y)
+input-core-y += leds.o
+endif
obj-$(CONFIG_INPUT_FF_MEMLESS) += ff-memless.o
obj-$(CONFIG_INPUT_POLLDEV) += input-polldev.o
diff --git a/drivers/input/input.c b/drivers/input/input.c
index 1c4c0db05550..3b9284b18e70 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -708,6 +708,9 @@ static void input_disconnect_device(struct input_dev *dev)
handle->open = 0;
spin_unlock_irq(&dev->event_lock);
+
+ if (is_event_supported(EV_LED, dev->evbit, EV_MAX))
+ input_led_disconnect(dev);
}
/**
@@ -2134,6 +2137,9 @@ int input_register_device(struct input_dev *dev)
list_add_tail(&dev->node, &input_dev_list);
+ if (is_event_supported(EV_LED, dev->evbit, EV_MAX))
+ input_led_connect(dev);
+
list_for_each_entry(handler, &input_handler_list, node)
input_attach_handler(dev, handler);
diff --git a/drivers/input/leds.c b/drivers/input/leds.c
new file mode 100644
index 000000000000..1d8a980719ab
--- /dev/null
+++ b/drivers/input/leds.c
@@ -0,0 +1,249 @@
+/*
+ * LED support for the input layer
+ *
+ * Copyright 2010-2013 Samuel Thibault <samuel.thibault@ens-lyon.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/leds.h>
+#include <linux/input.h>
+
+/*
+ * Keyboard LEDs are propagated by default like the following example:
+ *
+ * VT keyboard numlock trigger
+ * -> vt::numl VT LED
+ * -> vt-numl VT trigger
+ * -> per-device inputX::numl LED
+ *
+ * Userland can however choose the trigger for the vt::numl LED, or
+ * independently choose the trigger for any inputx::numl LED.
+ *
+ *
+ * VT LED classes and triggers are registered on-demand according to
+ * existing LED devices
+ */
+
+/* Handler for VT LEDs, just triggers the corresponding VT trigger. */
+static void vt_led_set(struct led_classdev *cdev,
+ enum led_brightness brightness);
+static struct led_classdev vt_leds[LED_CNT] = {
+#define DEFINE_INPUT_LED(vt_led, nam, deftrig) \
+ [vt_led] = { \
+ .name = "vt::"nam, \
+ .max_brightness = 1, \
+ .brightness_set = vt_led_set, \
+ .default_trigger = deftrig, \
+ }
+/* Default triggers for the VT LEDs just correspond to the legacy
+ * usage. */
+ DEFINE_INPUT_LED(LED_NUML, "numl", "kbd-numlock"),
+ DEFINE_INPUT_LED(LED_CAPSL, "capsl", "kbd-capslock"),
+ DEFINE_INPUT_LED(LED_SCROLLL, "scrolll", "kbd-scrollock"),
+ DEFINE_INPUT_LED(LED_COMPOSE, "compose", NULL),
+ DEFINE_INPUT_LED(LED_KANA, "kana", "kbd-kanalock"),
+ DEFINE_INPUT_LED(LED_SLEEP, "sleep", NULL),
+ DEFINE_INPUT_LED(LED_SUSPEND, "suspend", NULL),
+ DEFINE_INPUT_LED(LED_MUTE, "mute", NULL),
+ DEFINE_INPUT_LED(LED_MISC, "misc", NULL),
+ DEFINE_INPUT_LED(LED_MAIL, "mail", NULL),
+ DEFINE_INPUT_LED(LED_CHARGING, "charging", NULL),
+};
+static const char *const vt_led_names[LED_CNT] = {
+ [LED_NUML] = "numl",
+ [LED_CAPSL] = "capsl",
+ [LED_SCROLLL] = "scrolll",
+ [LED_COMPOSE] = "compose",
+ [LED_KANA] = "kana",
+ [LED_SLEEP] = "sleep",
+ [LED_SUSPEND] = "suspend",
+ [LED_MUTE] = "mute",
+ [LED_MISC] = "misc",
+ [LED_MAIL] = "mail",
+ [LED_CHARGING] = "charging",
+};
+/* Handler for hotplug initialization */
+static void vt_led_trigger_activate(struct led_classdev *cdev);
+/* VT triggers */
+static struct led_trigger vt_led_triggers[LED_CNT] = {
+#define DEFINE_INPUT_LED_TRIGGER(vt_led, nam) \
+ [vt_led] = { \
+ .name = "vt-"nam, \
+ .activate = vt_led_trigger_activate, \
+ }
+ DEFINE_INPUT_LED_TRIGGER(LED_NUML, "numl"),
+ DEFINE_INPUT_LED_TRIGGER(LED_CAPSL, "capsl"),
+ DEFINE_INPUT_LED_TRIGGER(LED_SCROLLL, "scrolll"),
+ DEFINE_INPUT_LED_TRIGGER(LED_COMPOSE, "compose"),
+ DEFINE_INPUT_LED_TRIGGER(LED_KANA, "kana"),
+ DEFINE_INPUT_LED_TRIGGER(LED_SLEEP, "sleep"),
+ DEFINE_INPUT_LED_TRIGGER(LED_SUSPEND, "suspend"),
+ DEFINE_INPUT_LED_TRIGGER(LED_MUTE, "mute"),
+ DEFINE_INPUT_LED_TRIGGER(LED_MISC, "misc"),
+ DEFINE_INPUT_LED_TRIGGER(LED_MAIL, "mail"),
+ DEFINE_INPUT_LED_TRIGGER(LED_CHARGING, "charging"),
+};
+
+/* Lock for registration coherency */
+static DEFINE_MUTEX(vt_led_registered_lock);
+
+/* Which VT LED classes and triggers are registered */
+static unsigned long vt_led_registered[BITS_TO_LONGS(LED_CNT)];
+
+/* Number of input devices having each LED */
+static int vt_led_references[LED_CNT];
+
+/* VT LED state change, tell the VT trigger. */
+static void vt_led_set(struct led_classdev *cdev,
+ enum led_brightness brightness)
+{
+ int led = cdev - vt_leds;
+
+ led_trigger_event(&vt_led_triggers[led], !!brightness);
+}
+
+/* LED state change for some keyboard, notify that keyboard. */
+static void perdevice_input_led_set(struct led_classdev *cdev,
+ enum led_brightness brightness)
+{
+ struct input_dev *dev;
+ struct led_classdev *leds;
+ int led;
+
+ dev = cdev->dev->platform_data;
+ if (!dev)
+ /* Still initializing */
+ return;
+ leds = dev->leds;
+ led = cdev - leds;
+
+ input_event(dev, EV_LED, led, !!brightness);
+ input_event(dev, EV_SYN, SYN_REPORT, 0);
+}
+
+/* Keyboard hotplug, initialize its LED status */
+static void vt_led_trigger_activate(struct led_classdev *cdev)
+{
+ struct led_trigger *trigger = cdev->trigger;
+ int led = trigger - vt_led_triggers;
+
+ if (cdev->brightness_set)
+ cdev->brightness_set(cdev, vt_leds[led].brightness);
+}
+
+/* Free led stuff from input device, used at abortion and disconnection. */
+static void input_led_delete(struct input_dev *dev)
+{
+ if (dev) {
+ struct led_classdev *leds = dev->leds;
+ if (leds) {
+ int i;
+ for (i = 0; i < LED_CNT; i++)
+ kfree(leds[i].name);
+ kfree(leds);
+ dev->leds = NULL;
+ }
+ }
+}
+
+/* A new input device with potential LEDs to connect. */
+int input_led_connect(struct input_dev *dev)
+{
+ int i, error = 0;
+ struct led_classdev *leds;
+
+ dev->leds = leds = kzalloc(sizeof(*leds) * LED_CNT, GFP_KERNEL);
+ if (!dev->leds)
+ return -ENOMEM;
+
+ /* lazily register missing VT LEDs */
+ mutex_lock(&vt_led_registered_lock);
+ for (i = 0; i < LED_CNT; i++)
+ if (vt_leds[i].name && test_bit(i, dev->ledbit)) {
+ if (!vt_led_references[i]) {
+ led_trigger_register(&vt_led_triggers[i]);
+ /* This keyboard is first to have led i,
+ * try to register it */
+ if (!led_classdev_register(NULL, &vt_leds[i]))
+ vt_led_references[i] = 1;
+ else
+ led_trigger_unregister(&vt_led_triggers[i]);
+ } else
+ vt_led_references[i]++;
+ }
+ mutex_unlock(&vt_led_registered_lock);
+
+ /* and register this device's LEDs */
+ for (i = 0; i < LED_CNT; i++)
+ if (vt_leds[i].name && test_bit(i, dev->ledbit)) {
+ leds[i].name = kasprintf(GFP_KERNEL, "%s::%s",
+ dev_name(&dev->dev),
+ vt_led_names[i]);
+ if (!leds[i].name) {
+ error = -ENOMEM;
+ goto err;
+ }
+ leds[i].max_brightness = 1;
+ leds[i].brightness_set = perdevice_input_led_set;
+ leds[i].default_trigger = vt_led_triggers[i].name;
+ }
+
+ /* No issue so far, we can register for real. */
+ for (i = 0; i < LED_CNT; i++)
+ if (leds[i].name) {
+ led_classdev_register(&dev->dev, &leds[i]);
+ leds[i].dev->platform_data = dev;
+ perdevice_input_led_set(&leds[i],
+ vt_leds[i].brightness);
+ }
+
+ return 0;
+
+err:
+ input_led_delete(dev);
+ return error;
+}
+
+/*
+ * Disconnected input device. Clean it, and deregister now-useless VT LEDs and
+ * triggers.
+ */
+void input_led_disconnect(struct input_dev *dev)
+{
+ int i;
+ struct led_classdev *leds = dev->leds;
+
+ for (i = 0; i < LED_CNT; i++)
+ if (leds[i].name)
+ led_classdev_unregister(&leds[i]);
+
+ input_led_delete(dev);
+
+ mutex_lock(&vt_led_registered_lock);
+ for (i = 0; i < LED_CNT; i++) {
+ if (!vt_leds[i].name || !test_bit(i, dev->ledbit))
+ continue;
+
+ vt_led_references[i]--;
+ if (vt_led_references[i]) {
+ /* Still some devices needing it */
+ continue;
+ }
+
+ led_classdev_unregister(&vt_leds[i]);
+ led_trigger_unregister(&vt_led_triggers[i]);
+ clear_bit(i, vt_led_registered);
+ }
+ mutex_unlock(&vt_led_registered_lock);
+}
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("User LED support for input layer");
+MODULE_AUTHOR("Samuel Thibault <samuel.thibault@ens-lyon.org>");
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 5ac7efc70ca9..a22c86c867fa 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -889,7 +889,7 @@ static void dma_pte_free_level(struct dmar_domain *domain, int level,
/* If range covers entire pagetable, free it */
if (!(start_pfn > level_pfn ||
- last_pfn < level_pfn + level_size(level))) {
+ last_pfn < level_pfn + level_size(level) - 1)) {
dma_clear_pte(pte);
domain_flush_cache(domain, pte, sizeof(*pte));
free_pgtable_page(level_pte);
diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig
index 72156c123033..6e3ca5940e98 100644
--- a/drivers/leds/Kconfig
+++ b/drivers/leds/Kconfig
@@ -11,9 +11,6 @@ menuconfig NEW_LEDS
Say Y to enable Linux LED support. This allows control of supported
LEDs from both userspace and optionally, by kernel events (triggers).
- This is not related to standard keyboard LEDs which are controlled
- via the input system.
-
if NEW_LEDS
config LEDS_CLASS
diff --git a/drivers/mailbox/omap-mbox.h b/drivers/mailbox/omap-mbox.h
index 6cd38fc68599..86d7518cd13b 100644
--- a/drivers/mailbox/omap-mbox.h
+++ b/drivers/mailbox/omap-mbox.h
@@ -52,7 +52,7 @@ struct omap_mbox_queue {
struct omap_mbox {
const char *name;
- unsigned int irq;
+ int irq;
struct omap_mbox_queue *txq, *rxq;
struct omap_mbox_ops *ops;
struct device *dev;
diff --git a/drivers/memstick/host/rtsx_pci_ms.c b/drivers/memstick/host/rtsx_pci_ms.c
index 25f8f93decb6..2a635b6fdaf7 100644
--- a/drivers/memstick/host/rtsx_pci_ms.c
+++ b/drivers/memstick/host/rtsx_pci_ms.c
@@ -145,6 +145,8 @@ static int ms_transfer_data(struct realtek_pci_ms *host, unsigned char data_dir,
unsigned int length = sg->length;
u16 sec_cnt = (u16)(length / 512);
u8 val, trans_mode, dma_dir;
+ struct memstick_dev *card = host->msh->card;
+ bool pro_card = card->id.type == MEMSTICK_TYPE_PRO;
dev_dbg(ms_dev(host), "%s: tpc = 0x%02x, data_dir = %s, length = %d\n",
__func__, tpc, (data_dir == READ) ? "READ" : "WRITE",
@@ -152,19 +154,21 @@ static int ms_transfer_data(struct realtek_pci_ms *host, unsigned char data_dir,
if (data_dir == READ) {
dma_dir = DMA_DIR_FROM_CARD;
- trans_mode = MS_TM_AUTO_READ;
+ trans_mode = pro_card ? MS_TM_AUTO_READ : MS_TM_NORMAL_READ;
} else {
dma_dir = DMA_DIR_TO_CARD;
- trans_mode = MS_TM_AUTO_WRITE;
+ trans_mode = pro_card ? MS_TM_AUTO_WRITE : MS_TM_NORMAL_WRITE;
}
rtsx_pci_init_cmd(pcr);
rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, MS_TPC, 0xFF, tpc);
- rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, MS_SECTOR_CNT_H,
- 0xFF, (u8)(sec_cnt >> 8));
- rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, MS_SECTOR_CNT_L,
- 0xFF, (u8)sec_cnt);
+ if (pro_card) {
+ rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, MS_SECTOR_CNT_H,
+ 0xFF, (u8)(sec_cnt >> 8));
+ rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, MS_SECTOR_CNT_L,
+ 0xFF, (u8)sec_cnt);
+ }
rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, MS_TRANS_CFG, 0xFF, cfg);
rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, IRQSTAT0,
@@ -192,8 +196,14 @@ static int ms_transfer_data(struct realtek_pci_ms *host, unsigned char data_dir,
}
rtsx_pci_read_register(pcr, MS_TRANS_CFG, &val);
- if (val & (MS_INT_CMDNK | MS_INT_ERR | MS_CRC16_ERR | MS_RDY_TIMEOUT))
- return -EIO;
+ if (pro_card) {
+ if (val & (MS_INT_CMDNK | MS_INT_ERR |
+ MS_CRC16_ERR | MS_RDY_TIMEOUT))
+ return -EIO;
+ } else {
+ if (val & (MS_CRC16_ERR | MS_RDY_TIMEOUT))
+ return -EIO;
+ }
return 0;
}
@@ -462,8 +472,8 @@ static int rtsx_pci_ms_set_param(struct memstick_host *msh,
clock = 19000000;
ssc_depth = RTSX_SSC_DEPTH_500K;
- err = rtsx_pci_write_register(pcr, MS_CFG,
- 0x18, MS_BUS_WIDTH_1);
+ err = rtsx_pci_write_register(pcr, MS_CFG, 0x58,
+ MS_BUS_WIDTH_1 | PUSH_TIME_DEFAULT);
if (err < 0)
return err;
} else if (value == MEMSTICK_PAR4) {
diff --git a/drivers/mfd/max8998.c b/drivers/mfd/max8998.c
index f47eaa70eae0..612ca404e150 100644
--- a/drivers/mfd/max8998.c
+++ b/drivers/mfd/max8998.c
@@ -175,7 +175,7 @@ static inline int max8998_i2c_get_driver_data(struct i2c_client *i2c,
if (IS_ENABLED(CONFIG_OF) && i2c->dev.of_node) {
const struct of_device_id *match;
match = of_match_node(max8998_dt_match, i2c->dev.of_node);
- return (int)match->data;
+ return (int)(long)match->data;
}
return (int)id->driver_data;
diff --git a/drivers/mfd/tps65217.c b/drivers/mfd/tps65217.c
index 6939ae56c2e1..966cf65c5c36 100644
--- a/drivers/mfd/tps65217.c
+++ b/drivers/mfd/tps65217.c
@@ -170,7 +170,7 @@ static int tps65217_probe(struct i2c_client *client,
"Failed to find matching dt id\n");
return -EINVAL;
}
- chip_id = (unsigned int)match->data;
+ chip_id = (unsigned int)(unsigned long)match->data;
status_off = of_property_read_bool(client->dev.of_node,
"ti,pmic-shutdown-controller");
}
diff --git a/drivers/misc/ti-st/st_core.c b/drivers/misc/ti-st/st_core.c
index 3aed525e55b4..78a998bfcd45 100644
--- a/drivers/misc/ti-st/st_core.c
+++ b/drivers/misc/ti-st/st_core.c
@@ -343,7 +343,7 @@ void st_int_recv(void *disc_data,
/* Unknow packet? */
default:
type = *ptr;
- if (st_gdata->list[type] == NULL) {
+ if (type >= ST_MAX_CHANNELS || st_gdata->list[type] == NULL) {
pr_err("chip/interface misbehavior dropping"
" frame starting with 0x%02x", type);
goto done;
diff --git a/drivers/net/irda/donauboe.c b/drivers/net/irda/donauboe.c
index 768dfe9a9315..6d3e2093bf7f 100644
--- a/drivers/net/irda/donauboe.c
+++ b/drivers/net/irda/donauboe.c
@@ -1755,17 +1755,4 @@ static struct pci_driver donauboe_pci_driver = {
.resume = toshoboe_wakeup
};
-static int __init
-donauboe_init (void)
-{
- return pci_register_driver(&donauboe_pci_driver);
-}
-
-static void __exit
-donauboe_cleanup (void)
-{
- pci_unregister_driver(&donauboe_pci_driver);
-}
-
-module_init(donauboe_init);
-module_exit(donauboe_cleanup);
+module_pci_driver(donauboe_pci_driver);
diff --git a/drivers/pps/pps.c b/drivers/pps/pps.c
index 2f07cd615665..983f50c4b7b4 100644
--- a/drivers/pps/pps.c
+++ b/drivers/pps/pps.c
@@ -152,35 +152,38 @@ static long pps_cdev_ioctl(struct file *file,
if (err)
return -EFAULT;
- ev = pps->last_ev;
-
- /* Manage the timeout */
- if (fdata.timeout.flags & PPS_TIME_INVALID)
- err = wait_event_interruptible(pps->queue,
- ev != pps->last_ev);
- else {
- unsigned long ticks;
-
- dev_dbg(pps->dev, "timeout %lld.%09d\n",
- (long long) fdata.timeout.sec,
- fdata.timeout.nsec);
- ticks = fdata.timeout.sec * HZ;
- ticks += fdata.timeout.nsec / (NSEC_PER_SEC / HZ);
-
- if (ticks != 0) {
- err = wait_event_interruptible_timeout(
- pps->queue,
- ev != pps->last_ev,
- ticks);
- if (err == 0)
- return -ETIMEDOUT;
+ if (!(file->f_flags & O_NONBLOCK)) {
+ ev = pps->last_ev;
+
+ /* Manage the timeout */
+ if (fdata.timeout.flags & PPS_TIME_INVALID)
+ err = wait_event_interruptible(pps->queue,
+ ev != pps->last_ev);
+ else {
+ unsigned long ticks;
+
+ dev_dbg(pps->dev, "timeout %lld.%09d\n",
+ (long long) fdata.timeout.sec,
+ fdata.timeout.nsec);
+ ticks = fdata.timeout.sec * HZ;
+ ticks += fdata.timeout.nsec /
+ (NSEC_PER_SEC / HZ);
+
+ if (ticks != 0) {
+ err = wait_event_interruptible_timeout(
+ pps->queue,
+ ev != pps->last_ev,
+ ticks);
+ if (err == 0)
+ return -ETIMEDOUT;
+ }
}
- }
- /* Check for pending signals */
- if (err == -ERESTARTSYS) {
- dev_dbg(pps->dev, "pending signal caught\n");
- return -EINTR;
+ /* Check for pending signals */
+ if (err == -ERESTARTSYS) {
+ dev_dbg(pps->dev, "pending signal caught\n");
+ return -EINTR;
+ }
}
/* Return the fetched timestamp */
diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index b1328a45b095..db933decc39c 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -212,6 +212,17 @@ config RTC_DRV_DS3232
This driver can also be built as a module. If so, the module
will be called rtc-ds3232.
+config RTC_DRV_HYM8563
+ tristate "Haoyu Microelectronics HYM8563"
+ depends on I2C && OF
+ help
+ Say Y to enable support for the HYM8563 I2C RTC chip. Apart
+ from the usual rtc functions it provides a clock output of
+ up to 32kHz.
+
+ This driver can also be built as a module. If so, the module
+ will be called rtc-hym8563.
+
config RTC_DRV_LP8788
tristate "TI LP8788 RTC driver"
depends on MFD_LP8788
@@ -637,7 +648,7 @@ comment "Platform RTC drivers"
config RTC_DRV_CMOS
tristate "PC-style 'CMOS'"
- depends on X86 || ARM || M32R || ATARI || PPC || MIPS || SPARC64
+ depends on X86 || ARM || M32R || PPC || MIPS || SPARC64
default y if X86
help
Say "yes" here to get direct support for the real time clock
diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile
index c00741a0bf10..b427bf7dd20d 100644
--- a/drivers/rtc/Makefile
+++ b/drivers/rtc/Makefile
@@ -55,6 +55,7 @@ obj-$(CONFIG_RTC_DRV_EP93XX) += rtc-ep93xx.o
obj-$(CONFIG_RTC_DRV_FM3130) += rtc-fm3130.o
obj-$(CONFIG_RTC_DRV_GENERIC) += rtc-generic.o
obj-$(CONFIG_RTC_DRV_HID_SENSOR_TIME) += rtc-hid-sensor-time.o
+obj-$(CONFIG_RTC_DRV_HYM8563) += rtc-hym8563.o
obj-$(CONFIG_RTC_DRV_IMXDI) += rtc-imxdi.o
obj-$(CONFIG_RTC_DRV_ISL1208) += rtc-isl1208.o
obj-$(CONFIG_RTC_DRV_ISL12022) += rtc-isl12022.o
diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c
index 02426812bebc..589351ef75d0 100644
--- a/drivers/rtc/class.c
+++ b/drivers/rtc/class.c
@@ -14,6 +14,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
+#include <linux/of.h>
#include <linux/rtc.h>
#include <linux/kdev_t.h>
#include <linux/idr.h>
@@ -157,12 +158,27 @@ struct rtc_device *rtc_device_register(const char *name, struct device *dev,
{
struct rtc_device *rtc;
struct rtc_wkalrm alrm;
- int id, err;
+ int of_id = -1, id = -1, err;
+
+ if (dev->of_node)
+ of_id = of_alias_get_id(dev->of_node, "rtc");
+ else if (dev->parent && dev->parent->of_node)
+ of_id = of_alias_get_id(dev->parent->of_node, "rtc");
+
+ if (of_id >= 0) {
+ id = ida_simple_get(&rtc_ida, of_id, of_id + 1,
+ GFP_KERNEL);
+ if (id < 0)
+ dev_warn(dev, "/aliases ID %d not available\n",
+ of_id);
+ }
- id = ida_simple_get(&rtc_ida, 0, 0, GFP_KERNEL);
if (id < 0) {
- err = id;
- goto exit;
+ id = ida_simple_get(&rtc_ida, 0, 0, GFP_KERNEL);
+ if (id < 0) {
+ err = id;
+ goto exit;
+ }
}
rtc = kzalloc(sizeof(struct rtc_device), GFP_KERNEL);
diff --git a/drivers/rtc/rtc-as3722.c b/drivers/rtc/rtc-as3722.c
index 9cfa8170a2d6..4af016985890 100644
--- a/drivers/rtc/rtc-as3722.c
+++ b/drivers/rtc/rtc-as3722.c
@@ -198,7 +198,7 @@ static int as3722_rtc_probe(struct platform_device *pdev)
device_init_wakeup(&pdev->dev, 1);
- as3722_rtc->rtc = rtc_device_register("as3722", &pdev->dev,
+ as3722_rtc->rtc = devm_rtc_device_register(&pdev->dev, "as3722-rtc",
&as3722_rtc_ops, THIS_MODULE);
if (IS_ERR(as3722_rtc->rtc)) {
ret = PTR_ERR(as3722_rtc->rtc);
@@ -209,28 +209,16 @@ static int as3722_rtc_probe(struct platform_device *pdev)
as3722_rtc->alarm_irq = platform_get_irq(pdev, 0);
dev_info(&pdev->dev, "RTC interrupt %d\n", as3722_rtc->alarm_irq);
- ret = request_threaded_irq(as3722_rtc->alarm_irq, NULL,
+ ret = devm_request_threaded_irq(&pdev->dev, as3722_rtc->alarm_irq, NULL,
as3722_alarm_irq, IRQF_ONESHOT | IRQF_EARLY_RESUME,
"rtc-alarm", as3722_rtc);
if (ret < 0) {
dev_err(&pdev->dev, "Failed to request alarm IRQ %d: %d\n",
as3722_rtc->alarm_irq, ret);
- goto scrub;
+ return ret;
}
disable_irq(as3722_rtc->alarm_irq);
return 0;
-scrub:
- rtc_device_unregister(as3722_rtc->rtc);
- return ret;
-}
-
-static int as3722_rtc_remove(struct platform_device *pdev)
-{
- struct as3722_rtc *as3722_rtc = platform_get_drvdata(pdev);
-
- free_irq(as3722_rtc->alarm_irq, as3722_rtc);
- rtc_device_unregister(as3722_rtc->rtc);
- return 0;
}
#ifdef CONFIG_PM_SLEEP
@@ -260,7 +248,6 @@ static const struct dev_pm_ops as3722_rtc_pm_ops = {
static struct platform_driver as3722_rtc_driver = {
.probe = as3722_rtc_probe,
- .remove = as3722_rtc_remove,
.driver = {
.name = "as3722-rtc",
.pm = &as3722_rtc_pm_ops,
diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c
index a2325bc5e497..cae212f30d65 100644
--- a/drivers/rtc/rtc-cmos.c
+++ b/drivers/rtc/rtc-cmos.c
@@ -756,11 +756,9 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq)
irq_handler_t rtc_cmos_int_handler;
if (is_hpet_enabled()) {
- int err;
-
rtc_cmos_int_handler = hpet_rtc_interrupt;
- err = hpet_register_irq_handler(cmos_interrupt);
- if (err != 0) {
+ retval = hpet_register_irq_handler(cmos_interrupt);
+ if (retval) {
dev_warn(dev, "hpet_register_irq_handler "
" failed in rtc_init().");
goto cleanup1;
@@ -1175,7 +1173,7 @@ static struct platform_driver cmos_platform_driver = {
.remove = __exit_p(cmos_platform_remove),
.shutdown = cmos_platform_shutdown,
.driver = {
- .name = (char *) driver_name,
+ .name = driver_name,
#ifdef CONFIG_PM
.pm = &cmos_pm_ops,
#endif
diff --git a/drivers/rtc/rtc-ds1305.c b/drivers/rtc/rtc-ds1305.c
index 80f323731ee2..2dd586a19b59 100644
--- a/drivers/rtc/rtc-ds1305.c
+++ b/drivers/rtc/rtc-ds1305.c
@@ -787,7 +787,6 @@ static int ds1305_remove(struct spi_device *spi)
cancel_work_sync(&ds1305->work);
}
- spi_set_drvdata(spi, NULL);
return 0;
}
diff --git a/drivers/rtc/rtc-ds1742.c b/drivers/rtc/rtc-ds1742.c
index 17b73fdc3b6e..5a1f3b2a8f1e 100644
--- a/drivers/rtc/rtc-ds1742.c
+++ b/drivers/rtc/rtc-ds1742.c
@@ -13,12 +13,13 @@
*/
#include <linux/bcd.h>
-#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/delay.h>
#include <linux/jiffies.h>
#include <linux/rtc.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
#include <linux/platform_device.h>
#include <linux/io.h>
#include <linux/module.h>
@@ -215,12 +216,19 @@ static int ds1742_rtc_remove(struct platform_device *pdev)
return 0;
}
+static struct of_device_id __maybe_unused ds1742_rtc_of_match[] = {
+ { .compatible = "maxim,ds1742", },
+ { }
+};
+MODULE_DEVICE_TABLE(of, ds1742_rtc_of_match);
+
static struct platform_driver ds1742_rtc_driver = {
.probe = ds1742_rtc_probe,
.remove = ds1742_rtc_remove,
.driver = {
.name = "rtc-ds1742",
.owner = THIS_MODULE,
+ .of_match_table = ds1742_rtc_of_match,
},
};
diff --git a/drivers/rtc/rtc-hym8563.c b/drivers/rtc/rtc-hym8563.c
new file mode 100644
index 000000000000..bd628a6f981d
--- /dev/null
+++ b/drivers/rtc/rtc-hym8563.c
@@ -0,0 +1,606 @@
+/*
+ * Haoyu HYM8563 RTC driver
+ *
+ * Copyright (C) 2013 MundoReader S.L.
+ * Author: Heiko Stuebner <heiko@sntech.de>
+ *
+ * based on rtc-HYM8563
+ * Copyright (C) 2010 ROCKCHIP, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/clk-provider.h>
+#include <linux/i2c.h>
+#include <linux/bcd.h>
+#include <linux/rtc.h>
+
+#define HYM8563_CTL1 0x00
+#define HYM8563_CTL1_TEST BIT(7)
+#define HYM8563_CTL1_STOP BIT(5)
+#define HYM8563_CTL1_TESTC BIT(3)
+
+#define HYM8563_CTL2 0x01
+#define HYM8563_CTL2_TI_TP BIT(4)
+#define HYM8563_CTL2_AF BIT(3)
+#define HYM8563_CTL2_TF BIT(2)
+#define HYM8563_CTL2_AIE BIT(1)
+#define HYM8563_CTL2_TIE BIT(0)
+
+#define HYM8563_SEC 0x02
+#define HYM8563_SEC_VL BIT(7)
+#define HYM8563_SEC_MASK 0x7f
+
+#define HYM8563_MIN 0x03
+#define HYM8563_MIN_MASK 0x7f
+
+#define HYM8563_HOUR 0x04
+#define HYM8563_HOUR_MASK 0x3f
+
+#define HYM8563_DAY 0x05
+#define HYM8563_DAY_MASK 0x3f
+
+#define HYM8563_WEEKDAY 0x06
+#define HYM8563_WEEKDAY_MASK 0x07
+
+#define HYM8563_MONTH 0x07
+#define HYM8563_MONTH_CENTURY BIT(7)
+#define HYM8563_MONTH_MASK 0x1f
+
+#define HYM8563_YEAR 0x08
+
+#define HYM8563_ALM_MIN 0x09
+#define HYM8563_ALM_HOUR 0x0a
+#define HYM8563_ALM_DAY 0x0b
+#define HYM8563_ALM_WEEK 0x0c
+
+/* Each alarm check can be disabled by setting this bit in the register */
+#define HYM8563_ALM_BIT_DISABLE BIT(7)
+
+#define HYM8563_CLKOUT 0x0d
+#define HYM8563_CLKOUT_DISABLE BIT(7)
+#define HYM8563_CLKOUT_32768 0
+#define HYM8563_CLKOUT_1024 1
+#define HYM8563_CLKOUT_32 2
+#define HYM8563_CLKOUT_1 3
+#define HYM8563_CLKOUT_MASK 3
+
+#define HYM8563_TMR_CTL 0x0e
+#define HYM8563_TMR_CTL_ENABLE BIT(7)
+#define HYM8563_TMR_CTL_4096 0
+#define HYM8563_TMR_CTL_64 1
+#define HYM8563_TMR_CTL_1 2
+#define HYM8563_TMR_CTL_1_60 3
+#define HYM8563_TMR_CTL_MASK 3
+
+#define HYM8563_TMR_CNT 0x0f
+
+struct hym8563 {
+ struct i2c_client *client;
+ struct rtc_device *rtc;
+ bool valid;
+#ifdef CONFIG_COMMON_CLK
+ struct clk_hw clkout_hw;
+#endif
+};
+
+/*
+ * RTC handling
+ */
+
+static int hym8563_rtc_read_time(struct device *dev, struct rtc_time *tm)
+{
+ struct i2c_client *client = to_i2c_client(dev);
+ struct hym8563 *hym8563 = i2c_get_clientdata(client);
+ u8 buf[7];
+ int ret;
+
+ if (!hym8563->valid) {
+ dev_warn(&client->dev, "no valid clock/calendar values available\n");
+ return -EPERM;
+ }
+
+ ret = i2c_smbus_read_i2c_block_data(client, HYM8563_SEC, 7, buf);
+
+ tm->tm_sec = bcd2bin(buf[0] & HYM8563_SEC_MASK);
+ tm->tm_min = bcd2bin(buf[1] & HYM8563_MIN_MASK);
+ tm->tm_hour = bcd2bin(buf[2] & HYM8563_HOUR_MASK);
+ tm->tm_mday = bcd2bin(buf[3] & HYM8563_DAY_MASK);
+ tm->tm_wday = bcd2bin(buf[4] & HYM8563_WEEKDAY_MASK); /* 0 = Sun */
+ tm->tm_mon = bcd2bin(buf[5] & HYM8563_MONTH_MASK) - 1; /* 0 = Jan */
+ tm->tm_year = bcd2bin(buf[6]) + 100;
+
+ return 0;
+}
+
+static int hym8563_rtc_set_time(struct device *dev, struct rtc_time *tm)
+{
+ struct i2c_client *client = to_i2c_client(dev);
+ struct hym8563 *hym8563 = i2c_get_clientdata(client);
+ u8 buf[7];
+ int ret;
+
+ /* Years >= 2100 are to far in the future, 19XX is to early */
+ if (tm->tm_year < 100 || tm->tm_year >= 200)
+ return -EINVAL;
+
+ buf[0] = bin2bcd(tm->tm_sec);
+ buf[1] = bin2bcd(tm->tm_min);
+ buf[2] = bin2bcd(tm->tm_hour);
+ buf[3] = bin2bcd(tm->tm_mday);
+ buf[4] = bin2bcd(tm->tm_wday);
+ buf[5] = bin2bcd(tm->tm_mon + 1);
+
+ /*
+ * While the HYM8563 has a century flag in the month register,
+ * it does not seem to carry it over a subsequent write/read.
+ * So we'll limit ourself to 100 years, starting at 2000 for now.
+ */
+ buf[6] = tm->tm_year - 100;
+
+ /*
+ * CTL1 only contains TEST-mode bits apart from stop,
+ * so no need to read the value first
+ */
+ ret = i2c_smbus_write_byte_data(client, HYM8563_CTL1,
+ HYM8563_CTL1_STOP);
+ if (ret < 0)
+ return ret;
+
+ ret = i2c_smbus_write_i2c_block_data(client, HYM8563_SEC, 7, buf);
+ if (ret < 0)
+ return ret;
+
+ ret = i2c_smbus_write_byte_data(client, HYM8563_CTL1, 0);
+ if (ret < 0)
+ return ret;
+
+ hym8563->valid = true;
+
+ return 0;
+}
+
+static int hym8563_rtc_alarm_irq_enable(struct device *dev,
+ unsigned int enabled)
+{
+ struct i2c_client *client = to_i2c_client(dev);
+ int data;
+
+ data = i2c_smbus_read_byte_data(client, HYM8563_CTL2);
+ if (data < 0)
+ return data;
+
+ if (enabled)
+ data |= HYM8563_CTL2_AIE;
+ else
+ data &= ~HYM8563_CTL2_AIE;
+
+ return i2c_smbus_write_byte_data(client, HYM8563_CTL2, data);
+};
+
+static int hym8563_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alm)
+{
+ struct i2c_client *client = to_i2c_client(dev);
+ struct rtc_time *alm_tm = &alm->time;
+ u8 buf[4];
+ int ret;
+
+ ret = i2c_smbus_read_i2c_block_data(client, HYM8563_ALM_MIN, 4, buf);
+ if (ret < 0)
+ return ret;
+
+ /* The alarm only has a minute accuracy */
+ alm_tm->tm_sec = -1;
+
+ alm_tm->tm_min = (buf[0] & HYM8563_ALM_BIT_DISABLE) ?
+ -1 :
+ bcd2bin(buf[0] & HYM8563_MIN_MASK);
+ alm_tm->tm_hour = (buf[1] & HYM8563_ALM_BIT_DISABLE) ?
+ -1 :
+ bcd2bin(buf[1] & HYM8563_HOUR_MASK);
+ alm_tm->tm_mday = (buf[2] & HYM8563_ALM_BIT_DISABLE) ?
+ -1 :
+ bcd2bin(buf[2] & HYM8563_DAY_MASK);
+ alm_tm->tm_wday = (buf[3] & HYM8563_ALM_BIT_DISABLE) ?
+ -1 :
+ bcd2bin(buf[3] & HYM8563_WEEKDAY_MASK);
+
+ alm_tm->tm_mon = -1;
+ alm_tm->tm_year = -1;
+
+ ret = i2c_smbus_read_byte_data(client, HYM8563_CTL2);
+ if (ret < 0)
+ return ret;
+
+ if (ret & HYM8563_CTL2_AIE)
+ alm->enabled = 1;
+
+ return 0;
+}
+
+static int hym8563_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm)
+{
+ struct i2c_client *client = to_i2c_client(dev);
+ struct rtc_time *alm_tm = &alm->time;
+ u8 buf[4];
+ int ret;
+
+ /*
+ * The alarm has no seconds so deal with it
+ */
+ if (alm_tm->tm_sec) {
+ alm_tm->tm_sec = 0;
+ alm_tm->tm_min++;
+ if (alm_tm->tm_min >= 60) {
+ alm_tm->tm_min = 0;
+ alm_tm->tm_hour++;
+ if (alm_tm->tm_hour >= 24) {
+ alm_tm->tm_hour = 0;
+ alm_tm->tm_mday++;
+ if (alm_tm->tm_mday > 31)
+ alm_tm->tm_mday = 0;
+ }
+ }
+ }
+
+ ret = i2c_smbus_read_byte_data(client, HYM8563_CTL2);
+ if (ret < 0)
+ return ret;
+
+ ret &= ~HYM8563_CTL2_AIE;
+
+ ret = i2c_smbus_write_byte_data(client, HYM8563_CTL2, ret);
+ if (ret < 0)
+ return ret;
+
+ buf[0] = (alm_tm->tm_min < 60 && alm_tm->tm_min >= 0) ?
+ bin2bcd(alm_tm->tm_min) : HYM8563_ALM_BIT_DISABLE;
+
+ buf[1] = (alm_tm->tm_hour < 24 && alm_tm->tm_hour >= 0) ?
+ bin2bcd(alm_tm->tm_hour) : HYM8563_ALM_BIT_DISABLE;
+
+ buf[2] = (alm_tm->tm_mday <= 31 && alm_tm->tm_mday >= 1) ?
+ bin2bcd(alm_tm->tm_mday) : HYM8563_ALM_BIT_DISABLE;
+
+ buf[3] = (alm_tm->tm_wday < 7 && alm_tm->tm_wday >= 0) ?
+ bin2bcd(alm_tm->tm_wday) : HYM8563_ALM_BIT_DISABLE;
+
+ ret = i2c_smbus_write_i2c_block_data(client, HYM8563_ALM_MIN, 4, buf);
+ if (ret < 0)
+ return ret;
+
+ return hym8563_rtc_alarm_irq_enable(dev, alm->enabled);
+}
+
+static const struct rtc_class_ops hym8563_rtc_ops = {
+ .read_time = hym8563_rtc_read_time,
+ .set_time = hym8563_rtc_set_time,
+ .alarm_irq_enable = hym8563_rtc_alarm_irq_enable,
+ .read_alarm = hym8563_rtc_read_alarm,
+ .set_alarm = hym8563_rtc_set_alarm,
+};
+
+/*
+ * Handling of the clkout
+ */
+
+#ifdef CONFIG_COMMON_CLK
+#define clkout_hw_to_hym8563(_hw) container_of(_hw, struct hym8563, clkout_hw)
+
+static int clkout_rates[] = {
+ 32768,
+ 1024,
+ 32,
+ 1,
+};
+
+static unsigned long hym8563_clkout_recalc_rate(struct clk_hw *hw,
+ unsigned long parent_rate)
+{
+ struct hym8563 *hym8563 = clkout_hw_to_hym8563(hw);
+ struct i2c_client *client = hym8563->client;
+ int ret = i2c_smbus_read_byte_data(client, HYM8563_CLKOUT);
+
+ if (ret < 0 || ret & HYM8563_CLKOUT_DISABLE)
+ return 0;
+
+ ret &= HYM8563_CLKOUT_MASK;
+ return clkout_rates[ret];
+}
+
+static long hym8563_clkout_round_rate(struct clk_hw *hw, unsigned long rate,
+ unsigned long *prate)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(clkout_rates); i++)
+ if (clkout_rates[i] <= rate)
+ return clkout_rates[i];
+
+ return 0;
+}
+
+static int hym8563_clkout_set_rate(struct clk_hw *hw, unsigned long rate,
+ unsigned long parent_rate)
+{
+ struct hym8563 *hym8563 = clkout_hw_to_hym8563(hw);
+ struct i2c_client *client = hym8563->client;
+ int ret = i2c_smbus_read_byte_data(client, HYM8563_CLKOUT);
+ int i;
+
+ if (ret < 0)
+ return ret;
+
+ for (i = 0; i < ARRAY_SIZE(clkout_rates); i++)
+ if (clkout_rates[i] == rate) {
+ ret &= ~HYM8563_CLKOUT_MASK;
+ ret |= i;
+ return i2c_smbus_write_byte_data(client,
+ HYM8563_CLKOUT, ret);
+ }
+
+ return -EINVAL;
+}
+
+static int hym8563_clkout_control(struct clk_hw *hw, bool enable)
+{
+ struct hym8563 *hym8563 = clkout_hw_to_hym8563(hw);
+ struct i2c_client *client = hym8563->client;
+ int ret = i2c_smbus_read_byte_data(client, HYM8563_CLKOUT);
+
+ if (ret < 0)
+ return ret;
+
+ if (enable)
+ ret &= ~HYM8563_CLKOUT_DISABLE;
+ else
+ ret |= HYM8563_CLKOUT_DISABLE;
+
+ return i2c_smbus_write_byte_data(client, HYM8563_CLKOUT, ret);
+}
+
+static int hym8563_clkout_prepare(struct clk_hw *hw)
+{
+ return hym8563_clkout_control(hw, 1);
+}
+
+static void hym8563_clkout_unprepare(struct clk_hw *hw)
+{
+ hym8563_clkout_control(hw, 0);
+}
+
+static int hym8563_clkout_is_prepared(struct clk_hw *hw)
+{
+ struct hym8563 *hym8563 = clkout_hw_to_hym8563(hw);
+ struct i2c_client *client = hym8563->client;
+ int ret = i2c_smbus_read_byte_data(client, HYM8563_CLKOUT);
+
+ if (ret < 0)
+ return ret;
+
+ return !(ret & HYM8563_CLKOUT_DISABLE);
+}
+
+static const struct clk_ops hym8563_clkout_ops = {
+ .prepare = hym8563_clkout_prepare,
+ .unprepare = hym8563_clkout_unprepare,
+ .is_prepared = hym8563_clkout_is_prepared,
+ .recalc_rate = hym8563_clkout_recalc_rate,
+ .round_rate = hym8563_clkout_round_rate,
+ .set_rate = hym8563_clkout_set_rate,
+};
+
+static struct clk *hym8563_clkout_register_clk(struct hym8563 *hym8563)
+{
+ struct i2c_client *client = hym8563->client;
+ struct device_node *node = client->dev.of_node;
+ struct clk *clk;
+ struct clk_init_data init;
+ int ret;
+
+ ret = i2c_smbus_write_byte_data(client, HYM8563_CLKOUT,
+ HYM8563_CLKOUT_DISABLE);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ init.name = "hym8563-clkout";
+ init.ops = &hym8563_clkout_ops;
+ init.flags = CLK_IS_ROOT;
+ init.parent_names = NULL;
+ init.num_parents = 0;
+ hym8563->clkout_hw.init = &init;
+
+ /* register the clock */
+ clk = clk_register(&client->dev, &hym8563->clkout_hw);
+
+ if (!IS_ERR(clk))
+ of_clk_add_provider(node, of_clk_src_simple_get, clk);
+
+ return clk;
+}
+#endif
+
+/*
+ * The alarm interrupt is implemented as a level-low interrupt in the
+ * hym8563, while the timer interrupt uses a falling edge.
+ * We don't use the timer at all, so the interrupt is requested to
+ * use the level-low trigger.
+ */
+static irqreturn_t hym8563_irq(int irq, void *dev_id)
+{
+ struct hym8563 *hym8563 = (struct hym8563 *)dev_id;
+ struct i2c_client *client = hym8563->client;
+ struct mutex *lock = &hym8563->rtc->ops_lock;
+ int data, ret;
+
+ mutex_lock(lock);
+
+ /* Clear the alarm flag */
+
+ data = i2c_smbus_read_byte_data(client, HYM8563_CTL2);
+ if (data < 0) {
+ dev_err(&client->dev, "%s: error reading i2c data %d\n",
+ __func__, data);
+ goto out;
+ }
+
+ data &= ~HYM8563_CTL2_AF;
+
+ ret = i2c_smbus_write_byte_data(client, HYM8563_CTL2, data);
+ if (ret < 0) {
+ dev_err(&client->dev, "%s: error writing i2c data %d\n",
+ __func__, ret);
+ }
+
+out:
+ mutex_unlock(lock);
+ return IRQ_HANDLED;
+}
+
+static int hym8563_init_device(struct i2c_client *client)
+{
+ int ret;
+
+ /* Clear stop flag if present */
+ ret = i2c_smbus_write_byte_data(client, HYM8563_CTL1, 0);
+ if (ret < 0)
+ return ret;
+
+ ret = i2c_smbus_read_byte_data(client, HYM8563_CTL2);
+ if (ret < 0)
+ return ret;
+
+ /* Disable alarm and timer interrupts */
+ ret &= ~HYM8563_CTL2_AIE;
+ ret &= ~HYM8563_CTL2_TIE;
+
+ /* Clear any pending alarm and timer flags */
+ if (ret & HYM8563_CTL2_AF)
+ ret &= ~HYM8563_CTL2_AF;
+
+ if (ret & HYM8563_CTL2_TF)
+ ret &= ~HYM8563_CTL2_TF;
+
+ ret &= ~HYM8563_CTL2_TI_TP;
+
+ return i2c_smbus_write_byte_data(client, HYM8563_CTL2, ret);
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int hym8563_suspend(struct device *dev)
+{
+ struct i2c_client *client = to_i2c_client(dev);
+ int ret;
+
+ if (device_may_wakeup(dev)) {
+ ret = enable_irq_wake(client->irq);
+ if (ret) {
+ dev_err(dev, "enable_irq_wake failed, %d\n", ret);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static int hym8563_resume(struct device *dev)
+{
+ struct i2c_client *client = to_i2c_client(dev);
+
+ if (device_may_wakeup(dev))
+ disable_irq_wake(client->irq);
+
+ return 0;
+}
+#endif
+
+static SIMPLE_DEV_PM_OPS(hym8563_pm_ops, hym8563_suspend, hym8563_resume);
+
+static int hym8563_probe(struct i2c_client *client,
+ const struct i2c_device_id *id)
+{
+ struct hym8563 *hym8563;
+ int ret;
+
+ hym8563 = devm_kzalloc(&client->dev, sizeof(*hym8563), GFP_KERNEL);
+ if (!hym8563)
+ return -ENOMEM;
+
+ hym8563->client = client;
+ i2c_set_clientdata(client, hym8563);
+
+ device_set_wakeup_capable(&client->dev, true);
+
+ ret = hym8563_init_device(client);
+ if (ret) {
+ dev_err(&client->dev, "could not init device, %d\n", ret);
+ return ret;
+ }
+
+ ret = devm_request_threaded_irq(&client->dev, client->irq,
+ NULL, hym8563_irq,
+ IRQF_TRIGGER_LOW | IRQF_ONESHOT,
+ client->name, hym8563);
+ if (ret < 0) {
+ dev_err(&client->dev, "irq %d request failed, %d\n",
+ client->irq, ret);
+ return ret;
+ }
+
+ /* check state of calendar information */
+ ret = i2c_smbus_read_byte_data(client, HYM8563_SEC);
+ if (ret < 0)
+ return ret;
+
+ hym8563->valid = !(ret & HYM8563_SEC_VL);
+ dev_dbg(&client->dev, "rtc information is %s\n",
+ hym8563->valid ? "valid" : "invalid");
+
+ hym8563->rtc = devm_rtc_device_register(&client->dev, client->name,
+ &hym8563_rtc_ops, THIS_MODULE);
+ if (IS_ERR(hym8563->rtc))
+ return PTR_ERR(hym8563->rtc);
+
+#ifdef CONFIG_COMMON_CLK
+ hym8563_clkout_register_clk(hym8563);
+#endif
+
+ return 0;
+}
+
+static const struct i2c_device_id hym8563_id[] = {
+ { "hym8563", 0 },
+ {},
+};
+MODULE_DEVICE_TABLE(i2c, hym8563_id);
+
+static struct of_device_id hym8563_dt_idtable[] = {
+ { .compatible = "haoyu,hym8563" },
+ {},
+};
+MODULE_DEVICE_TABLE(of, hym8563_dt_idtable);
+
+static struct i2c_driver hym8563_driver = {
+ .driver = {
+ .name = "rtc-hym8563",
+ .owner = THIS_MODULE,
+ .pm = &hym8563_pm_ops,
+ .of_match_table = hym8563_dt_idtable,
+ },
+ .probe = hym8563_probe,
+ .id_table = hym8563_id,
+};
+
+module_i2c_driver(hym8563_driver);
+
+MODULE_AUTHOR("Heiko Stuebner <heiko@sntech.de>");
+MODULE_DESCRIPTION("HYM8563 RTC driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/rtc/rtc-max8907.c b/drivers/rtc/rtc-max8907.c
index 8e45b3c4aa2f..3032178bd9e6 100644
--- a/drivers/rtc/rtc-max8907.c
+++ b/drivers/rtc/rtc-max8907.c
@@ -51,7 +51,7 @@ static irqreturn_t max8907_irq_handler(int irq, void *data)
{
struct max8907_rtc *rtc = data;
- regmap_update_bits(rtc->regmap, MAX8907_REG_ALARM0_CNTL, 0x7f, 0);
+ regmap_write(rtc->regmap, MAX8907_REG_ALARM0_CNTL, 0);
rtc_update_irq(rtc->rtc_dev, 1, RTC_IRQF | RTC_AF);
@@ -64,7 +64,7 @@ static void regs_to_tm(u8 *regs, struct rtc_time *tm)
bcd2bin(regs[RTC_YEAR1]) - 1900;
tm->tm_mon = bcd2bin(regs[RTC_MONTH] & 0x1f) - 1;
tm->tm_mday = bcd2bin(regs[RTC_DATE] & 0x3f);
- tm->tm_wday = (regs[RTC_WEEKDAY] & 0x07) - 1;
+ tm->tm_wday = (regs[RTC_WEEKDAY] & 0x07);
if (regs[RTC_HOUR] & HOUR_12) {
tm->tm_hour = bcd2bin(regs[RTC_HOUR] & 0x01f);
if (tm->tm_hour == 12)
@@ -88,7 +88,7 @@ static void tm_to_regs(struct rtc_time *tm, u8 *regs)
regs[RTC_YEAR1] = bin2bcd(low);
regs[RTC_MONTH] = bin2bcd(tm->tm_mon + 1);
regs[RTC_DATE] = bin2bcd(tm->tm_mday);
- regs[RTC_WEEKDAY] = tm->tm_wday + 1;
+ regs[RTC_WEEKDAY] = tm->tm_wday;
regs[RTC_HOUR] = bin2bcd(tm->tm_hour);
regs[RTC_MIN] = bin2bcd(tm->tm_min);
regs[RTC_SEC] = bin2bcd(tm->tm_sec);
@@ -153,7 +153,7 @@ static int max8907_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
tm_to_regs(&alrm->time, regs);
/* Disable alarm while we update the target time */
- ret = regmap_update_bits(rtc->regmap, MAX8907_REG_ALARM0_CNTL, 0x7f, 0);
+ ret = regmap_write(rtc->regmap, MAX8907_REG_ALARM0_CNTL, 0);
if (ret < 0)
return ret;
@@ -163,8 +163,7 @@ static int max8907_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
return ret;
if (alrm->enabled)
- ret = regmap_update_bits(rtc->regmap, MAX8907_REG_ALARM0_CNTL,
- 0x7f, 0x7f);
+ ret = regmap_write(rtc->regmap, MAX8907_REG_ALARM0_CNTL, 0x77);
return ret;
}
diff --git a/drivers/rtc/rtc-mxc.c b/drivers/rtc/rtc-mxc.c
index 50c572645546..419874fefa4b 100644
--- a/drivers/rtc/rtc-mxc.c
+++ b/drivers/rtc/rtc-mxc.c
@@ -391,11 +391,13 @@ static int mxc_rtc_probe(struct platform_device *pdev)
pdata->clk = devm_clk_get(&pdev->dev, NULL);
if (IS_ERR(pdata->clk)) {
dev_err(&pdev->dev, "unable to get clock!\n");
- ret = PTR_ERR(pdata->clk);
- goto exit_free_pdata;
+ return PTR_ERR(pdata->clk);
}
- clk_prepare_enable(pdata->clk);
+ ret = clk_prepare_enable(pdata->clk);
+ if (ret)
+ return ret;
+
rate = clk_get_rate(pdata->clk);
if (rate == 32768)
@@ -447,8 +449,6 @@ static int mxc_rtc_probe(struct platform_device *pdev)
exit_put_clk:
clk_disable_unprepare(pdata->clk);
-exit_free_pdata:
-
return ret;
}
diff --git a/drivers/rtc/rtc-pcf2127.c b/drivers/rtc/rtc-pcf2127.c
index 1ee514a3972c..9bd842e97749 100644
--- a/drivers/rtc/rtc-pcf2127.c
+++ b/drivers/rtc/rtc-pcf2127.c
@@ -197,10 +197,7 @@ static int pcf2127_probe(struct i2c_client *client,
pcf2127_driver.driver.name,
&pcf2127_rtc_ops, THIS_MODULE);
- if (IS_ERR(pcf2127->rtc))
- return PTR_ERR(pcf2127->rtc);
-
- return 0;
+ return PTR_ERR_OR_ZERO(pcf2127->rtc);
}
static const struct i2c_device_id pcf2127_id[] = {
diff --git a/drivers/rtc/rtc-s5m.c b/drivers/rtc/rtc-s5m.c
index ae8119dc2846..476af93543f6 100644
--- a/drivers/rtc/rtc-s5m.c
+++ b/drivers/rtc/rtc-s5m.c
@@ -639,6 +639,7 @@ static void s5m_rtc_shutdown(struct platform_device *pdev)
s5m_rtc_enable_smpl(info, false);
}
+#ifdef CONFIG_PM_SLEEP
static int s5m_rtc_resume(struct device *dev)
{
struct s5m_rtc_info *info = dev_get_drvdata(dev);
@@ -660,6 +661,7 @@ static int s5m_rtc_suspend(struct device *dev)
return ret;
}
+#endif /* CONFIG_PM_SLEEP */
static SIMPLE_DEV_PM_OPS(s5m_rtc_pm_ops, s5m_rtc_suspend, s5m_rtc_resume);
diff --git a/drivers/rtc/rtc-twl.c b/drivers/rtc/rtc-twl.c
index c2e80d7ca5e2..1915464e4cd6 100644
--- a/drivers/rtc/rtc-twl.c
+++ b/drivers/rtc/rtc-twl.c
@@ -479,7 +479,7 @@ static int twl_rtc_probe(struct platform_device *pdev)
u8 rd_reg;
if (irq <= 0)
- goto out1;
+ return ret;
/* Initialize the register map */
if (twl_class_is_4030())
@@ -489,7 +489,7 @@ static int twl_rtc_probe(struct platform_device *pdev)
ret = twl_rtc_read_u8(&rd_reg, REG_RTC_STATUS_REG);
if (ret < 0)
- goto out1;
+ return ret;
if (rd_reg & BIT_RTC_STATUS_REG_POWER_UP_M)
dev_warn(&pdev->dev, "Power up reset detected.\n");
@@ -500,7 +500,7 @@ static int twl_rtc_probe(struct platform_device *pdev)
/* Clear RTC Power up reset and pending alarm interrupts */
ret = twl_rtc_write_u8(rd_reg, REG_RTC_STATUS_REG);
if (ret < 0)
- goto out1;
+ return ret;
if (twl_class_is_6030()) {
twl6030_interrupt_unmask(TWL6030_RTC_INT_MASK,
@@ -512,7 +512,7 @@ static int twl_rtc_probe(struct platform_device *pdev)
dev_info(&pdev->dev, "Enabling TWL-RTC\n");
ret = twl_rtc_write_u8(BIT_RTC_CTRL_REG_STOP_RTC_M, REG_RTC_CTRL_REG);
if (ret < 0)
- goto out1;
+ return ret;
/* ensure interrupts are disabled, bootloaders can be strange */
ret = twl_rtc_write_u8(0, REG_RTC_INTERRUPTS_REG);
@@ -522,34 +522,29 @@ static int twl_rtc_probe(struct platform_device *pdev)
/* init cached IRQ enable bits */
ret = twl_rtc_read_u8(&rtc_irq_bits, REG_RTC_INTERRUPTS_REG);
if (ret < 0)
- goto out1;
+ return ret;
device_init_wakeup(&pdev->dev, 1);
- rtc = rtc_device_register(pdev->name,
- &pdev->dev, &twl_rtc_ops, THIS_MODULE);
+ rtc = devm_rtc_device_register(&pdev->dev, pdev->name,
+ &twl_rtc_ops, THIS_MODULE);
if (IS_ERR(rtc)) {
- ret = PTR_ERR(rtc);
dev_err(&pdev->dev, "can't register RTC device, err %ld\n",
PTR_ERR(rtc));
- goto out1;
+ return PTR_ERR(rtc);
}
- ret = request_threaded_irq(irq, NULL, twl_rtc_interrupt,
- IRQF_TRIGGER_RISING | IRQF_ONESHOT,
- dev_name(&rtc->dev), rtc);
+ ret = devm_request_threaded_irq(&pdev->dev, irq, NULL,
+ twl_rtc_interrupt,
+ IRQF_TRIGGER_RISING | IRQF_ONESHOT,
+ dev_name(&rtc->dev), rtc);
if (ret < 0) {
dev_err(&pdev->dev, "IRQ is not free.\n");
- goto out2;
+ return ret;
}
platform_set_drvdata(pdev, rtc);
return 0;
-
-out2:
- rtc_device_unregister(rtc);
-out1:
- return ret;
}
/*
@@ -559,9 +554,6 @@ out1:
static int twl_rtc_remove(struct platform_device *pdev)
{
/* leave rtc running, but disable irqs */
- struct rtc_device *rtc = platform_get_drvdata(pdev);
- int irq = platform_get_irq(pdev, 0);
-
mask_rtc_irq_bit(BIT_RTC_INTERRUPTS_REG_IT_ALARM_M);
mask_rtc_irq_bit(BIT_RTC_INTERRUPTS_REG_IT_TIMER_M);
if (twl_class_is_6030()) {
@@ -571,10 +563,6 @@ static int twl_rtc_remove(struct platform_device *pdev)
REG_INT_MSK_STS_A);
}
-
- free_irq(irq, rtc);
-
- rtc_device_unregister(rtc);
return 0;
}
diff --git a/drivers/rtc/rtc-vr41xx.c b/drivers/rtc/rtc-vr41xx.c
index aabc22c587fb..88c9c92e89fd 100644
--- a/drivers/rtc/rtc-vr41xx.c
+++ b/drivers/rtc/rtc-vr41xx.c
@@ -293,7 +293,7 @@ static int rtc_probe(struct platform_device *pdev)
if (!res)
return -EBUSY;
- rtc1_base = ioremap(res->start, resource_size(res));
+ rtc1_base = devm_ioremap(&pdev->dev, res->start, resource_size(res));
if (!rtc1_base)
return -EBUSY;
@@ -303,13 +303,14 @@ static int rtc_probe(struct platform_device *pdev)
goto err_rtc1_iounmap;
}
- rtc2_base = ioremap(res->start, resource_size(res));
+ rtc2_base = devm_ioremap(&pdev->dev, res->start, resource_size(res));
if (!rtc2_base) {
retval = -EBUSY;
goto err_rtc1_iounmap;
}
- rtc = rtc_device_register(rtc_name, &pdev->dev, &vr41xx_rtc_ops, THIS_MODULE);
+ rtc = devm_rtc_device_register(&pdev->dev, rtc_name, &vr41xx_rtc_ops,
+ THIS_MODULE);
if (IS_ERR(rtc)) {
retval = PTR_ERR(rtc);
goto err_iounmap_all;
@@ -330,24 +331,24 @@ static int rtc_probe(struct platform_device *pdev)
aie_irq = platform_get_irq(pdev, 0);
if (aie_irq <= 0) {
retval = -EBUSY;
- goto err_device_unregister;
+ goto err_iounmap_all;
}
- retval = request_irq(aie_irq, elapsedtime_interrupt, 0,
- "elapsed_time", pdev);
+ retval = devm_request_irq(&pdev->dev, aie_irq, elapsedtime_interrupt, 0,
+ "elapsed_time", pdev);
if (retval < 0)
- goto err_device_unregister;
+ goto err_iounmap_all;
pie_irq = platform_get_irq(pdev, 1);
if (pie_irq <= 0) {
retval = -EBUSY;
- goto err_free_irq;
+ goto err_iounmap_all;
}
- retval = request_irq(pie_irq, rtclong1_interrupt, 0,
- "rtclong1", pdev);
+ retval = devm_request_irq(&pdev->dev, pie_irq, rtclong1_interrupt, 0,
+ "rtclong1", pdev);
if (retval < 0)
- goto err_free_irq;
+ goto err_iounmap_all;
platform_set_drvdata(pdev, rtc);
@@ -358,47 +359,20 @@ static int rtc_probe(struct platform_device *pdev)
return 0;
-err_free_irq:
- free_irq(aie_irq, pdev);
-
-err_device_unregister:
- rtc_device_unregister(rtc);
-
err_iounmap_all:
- iounmap(rtc2_base);
rtc2_base = NULL;
err_rtc1_iounmap:
- iounmap(rtc1_base);
rtc1_base = NULL;
return retval;
}
-static int rtc_remove(struct platform_device *pdev)
-{
- struct rtc_device *rtc;
-
- rtc = platform_get_drvdata(pdev);
- if (rtc)
- rtc_device_unregister(rtc);
-
- free_irq(aie_irq, pdev);
- free_irq(pie_irq, pdev);
- if (rtc1_base)
- iounmap(rtc1_base);
- if (rtc2_base)
- iounmap(rtc2_base);
-
- return 0;
-}
-
/* work with hotplug and coldplug */
MODULE_ALIAS("platform:RTC");
static struct platform_driver rtc_platform_driver = {
.probe = rtc_probe,
- .remove = rtc_remove,
.driver = {
.name = rtc_name,
.owner = THIS_MODULE,
diff --git a/drivers/scsi/megaraid/megaraid_mm.c b/drivers/scsi/megaraid/megaraid_mm.c
index dfffd0f37916..a70692779a16 100644
--- a/drivers/scsi/megaraid/megaraid_mm.c
+++ b/drivers/scsi/megaraid/megaraid_mm.c
@@ -486,6 +486,8 @@ mimd_to_kioc(mimd_t __user *umimd, mraid_mmadp_t *adp, uioc_t *kioc)
pthru32->dataxferaddr = kioc->buf_paddr;
if (kioc->data_dir & UIOC_WR) {
+ if (pthru32->dataxferlen > kioc->xferlen)
+ return -EINVAL;
if (copy_from_user(kioc->buf_vaddr, kioc->user_data,
pthru32->dataxferlen)) {
return (-EFAULT);
diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig
index b24aa010f68c..65cd80bf9aec 100644
--- a/drivers/tty/Kconfig
+++ b/drivers/tty/Kconfig
@@ -13,6 +13,10 @@ config VT
bool "Virtual terminal" if EXPERT
depends on !S390 && !UML
select INPUT
+ select NEW_LEDS
+ select LEDS_CLASS
+ select LEDS_TRIGGERS
+ select INPUT_LEDS
default y
---help---
If you say Y here, you will get support for terminal devices with
diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c
index d0e3a4497707..d6ecfc9e734f 100644
--- a/drivers/tty/vt/keyboard.c
+++ b/drivers/tty/vt/keyboard.c
@@ -33,6 +33,7 @@
#include <linux/string.h>
#include <linux/init.h>
#include <linux/slab.h>
+#include <linux/leds.h>
#include <linux/kbd_kern.h>
#include <linux/kbd_diacr.h>
@@ -130,6 +131,7 @@ static char rep; /* flag telling character repeat */
static int shift_state = 0;
static unsigned char ledstate = 0xff; /* undefined */
+static unsigned char lockstate = 0xff; /* undefined */
static unsigned char ledioctl;
/*
@@ -961,6 +963,41 @@ static void k_brl(struct vc_data *vc, unsigned char value, char up_flag)
}
}
+/* We route VT keyboard "leds" through triggers */
+static void kbd_ledstate_trigger_activate(struct led_classdev *cdev);
+
+static struct led_trigger ledtrig_ledstate[] = {
+#define DEFINE_LEDSTATE_TRIGGER(kbd_led, nam) \
+ [kbd_led] = { \
+ .name = nam, \
+ .activate = kbd_ledstate_trigger_activate, \
+ }
+ DEFINE_LEDSTATE_TRIGGER(VC_SCROLLOCK, "kbd-scrollock"),
+ DEFINE_LEDSTATE_TRIGGER(VC_NUMLOCK, "kbd-numlock"),
+ DEFINE_LEDSTATE_TRIGGER(VC_CAPSLOCK, "kbd-capslock"),
+ DEFINE_LEDSTATE_TRIGGER(VC_KANALOCK, "kbd-kanalock"),
+#undef DEFINE_LEDSTATE_TRIGGER
+};
+
+static void kbd_lockstate_trigger_activate(struct led_classdev *cdev);
+
+static struct led_trigger ledtrig_lockstate[] = {
+#define DEFINE_LOCKSTATE_TRIGGER(kbd_led, nam) \
+ [kbd_led] = { \
+ .name = nam, \
+ .activate = kbd_lockstate_trigger_activate, \
+ }
+ DEFINE_LOCKSTATE_TRIGGER(VC_SHIFTLOCK, "kbd-shiftlock"),
+ DEFINE_LOCKSTATE_TRIGGER(VC_ALTGRLOCK, "kbd-altgrlock"),
+ DEFINE_LOCKSTATE_TRIGGER(VC_CTRLLOCK, "kbd-ctrllock"),
+ DEFINE_LOCKSTATE_TRIGGER(VC_ALTLOCK, "kbd-altlock"),
+ DEFINE_LOCKSTATE_TRIGGER(VC_SHIFTLLOCK, "kbd-shiftllock"),
+ DEFINE_LOCKSTATE_TRIGGER(VC_SHIFTRLOCK, "kbd-shiftrlock"),
+ DEFINE_LOCKSTATE_TRIGGER(VC_CTRLLLOCK, "kbd-ctrlllock"),
+ DEFINE_LOCKSTATE_TRIGGER(VC_CTRLRLOCK, "kbd-ctrlrlock"),
+#undef DEFINE_LOCKSTATE_TRIGGER
+};
+
/*
* The leds display either (i) the status of NumLock, CapsLock, ScrollLock,
* or (ii) whatever pattern of lights people want to show using KDSETLED,
@@ -995,18 +1032,25 @@ static inline unsigned char getleds(void)
return kbd->ledflagstate;
}
-static int kbd_update_leds_helper(struct input_handle *handle, void *data)
+/* Called on trigger connection, to set initial state */
+static void kbd_ledstate_trigger_activate(struct led_classdev *cdev)
{
- unsigned char leds = *(unsigned char *)data;
+ struct led_trigger *trigger = cdev->trigger;
+ int led = trigger - ledtrig_ledstate;
- if (test_bit(EV_LED, handle->dev->evbit)) {
- input_inject_event(handle, EV_LED, LED_SCROLLL, !!(leds & 0x01));
- input_inject_event(handle, EV_LED, LED_NUML, !!(leds & 0x02));
- input_inject_event(handle, EV_LED, LED_CAPSL, !!(leds & 0x04));
- input_inject_event(handle, EV_SYN, SYN_REPORT, 0);
- }
+ tasklet_disable(&keyboard_tasklet);
+ led_trigger_event(trigger, ledstate & (1 << led) ? LED_FULL : LED_OFF);
+ tasklet_enable(&keyboard_tasklet);
+}
- return 0;
+static void kbd_lockstate_trigger_activate(struct led_classdev *cdev)
+{
+ struct led_trigger *trigger = cdev->trigger;
+ int led = trigger - ledtrig_lockstate;
+
+ tasklet_disable(&keyboard_tasklet);
+ led_trigger_event(trigger, lockstate & (1 << led) ? LED_FULL : LED_OFF);
+ tasklet_enable(&keyboard_tasklet);
}
/**
@@ -1095,16 +1139,29 @@ static void kbd_bh(unsigned long dummy)
{
unsigned char leds;
unsigned long flags;
-
+ int i;
+
spin_lock_irqsave(&led_lock, flags);
leds = getleds();
spin_unlock_irqrestore(&led_lock, flags);
if (leds != ledstate) {
- input_handler_for_each_handle(&kbd_handler, &leds,
- kbd_update_leds_helper);
+ for (i = 0; i < ARRAY_SIZE(ledtrig_ledstate); i++)
+ if ((leds ^ ledstate) & (1 << i))
+ led_trigger_event(&ledtrig_ledstate[i],
+ leds & (1 << i)
+ ? LED_FULL : LED_OFF);
ledstate = leds;
}
+
+ if (kbd->lockstate != lockstate) {
+ for (i = 0; i < ARRAY_SIZE(ledtrig_lockstate); i++)
+ if ((kbd->lockstate ^ lockstate) & (1 << i))
+ led_trigger_event(&ledtrig_lockstate[i],
+ kbd->lockstate & (1 << i)
+ ? LED_FULL : LED_OFF);
+ lockstate = kbd->lockstate;
+ }
}
DECLARE_TASKLET_DISABLED(keyboard_tasklet, kbd_bh, 0);
@@ -1442,20 +1499,6 @@ static void kbd_disconnect(struct input_handle *handle)
kfree(handle);
}
-/*
- * Start keyboard handler on the new keyboard by refreshing LED state to
- * match the rest of the system.
- */
-static void kbd_start(struct input_handle *handle)
-{
- tasklet_disable(&keyboard_tasklet);
-
- if (ledstate != 0xff)
- kbd_update_leds_helper(handle, &ledstate);
-
- tasklet_enable(&keyboard_tasklet);
-}
-
static const struct input_device_id kbd_ids[] = {
{
.flags = INPUT_DEVICE_ID_MATCH_EVBIT,
@@ -1477,7 +1520,6 @@ static struct input_handler kbd_handler = {
.match = kbd_match,
.connect = kbd_connect,
.disconnect = kbd_disconnect,
- .start = kbd_start,
.name = "kbd",
.id_table = kbd_ids,
};
@@ -1501,6 +1543,20 @@ int __init kbd_init(void)
if (error)
return error;
+ for (i = 0; i < ARRAY_SIZE(ledtrig_ledstate); i++) {
+ error = led_trigger_register(&ledtrig_ledstate[i]);
+ if (error)
+ pr_err("error %d while registering trigger %s\n",
+ error, ledtrig_ledstate[i].name);
+ }
+
+ for (i = 0; i < ARRAY_SIZE(ledtrig_lockstate); i++) {
+ error = led_trigger_register(&ledtrig_lockstate[i]);
+ if (error)
+ pr_err("error %d while registering trigger %s\n",
+ error, ledtrig_lockstate[i].name);
+ }
+
tasklet_enable(&keyboard_tasklet);
tasklet_schedule(&keyboard_tasklet);
diff --git a/drivers/video/aty/aty128fb.c b/drivers/video/aty/aty128fb.c
index 12ca031877d4..52108be69e77 100644
--- a/drivers/video/aty/aty128fb.c
+++ b/drivers/video/aty/aty128fb.c
@@ -357,11 +357,13 @@ static int default_lcd_on = 1;
static bool mtrr = true;
#endif
+#ifdef CONFIG_FB_ATY128_BACKLIGHT
#ifdef CONFIG_PMAC_BACKLIGHT
static int backlight = 1;
#else
static int backlight = 0;
#endif
+#endif
/* PLL constants */
struct aty128_constants {
@@ -1671,7 +1673,9 @@ static int aty128fb_setup(char *options)
default_crt_on = simple_strtoul(this_opt+4, NULL, 0);
continue;
} else if (!strncmp(this_opt, "backlight:", 10)) {
+#ifdef CONFIG_FB_ATY128_BACKLIGHT
backlight = simple_strtoul(this_opt+10, NULL, 0);
+#endif
continue;
}
#ifdef CONFIG_MTRR
diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c
index 5d05555fe841..b83f00eaca9c 100644
--- a/drivers/video/backlight/backlight.c
+++ b/drivers/video/backlight/backlight.c
@@ -175,8 +175,6 @@ static ssize_t brightness_store(struct device *dev,
}
mutex_unlock(&bd->ops_lock);
- backlight_generate_event(bd, BACKLIGHT_UPDATE_SYSFS);
-
return rc;
}
static DEVICE_ATTR_RW(brightness);
diff --git a/drivers/video/backlight/hp680_bl.c b/drivers/video/backlight/hp680_bl.c
index 00076ecfe9b8..8ea42b8d9bc8 100644
--- a/drivers/video/backlight/hp680_bl.c
+++ b/drivers/video/backlight/hp680_bl.c
@@ -110,8 +110,8 @@ static int hp680bl_probe(struct platform_device *pdev)
memset(&props, 0, sizeof(struct backlight_properties));
props.type = BACKLIGHT_RAW;
props.max_brightness = HP680_MAX_INTENSITY;
- bd = backlight_device_register("hp680-bl", &pdev->dev, NULL,
- &hp680bl_ops, &props);
+ bd = devm_backlight_device_register(&pdev->dev, "hp680-bl", &pdev->dev,
+ NULL, &hp680bl_ops, &props);
if (IS_ERR(bd))
return PTR_ERR(bd);
@@ -131,8 +131,6 @@ static int hp680bl_remove(struct platform_device *pdev)
bd->props.power = 0;
hp680bl_send_intensity(bd);
- backlight_device_unregister(bd);
-
return 0;
}
diff --git a/drivers/video/backlight/jornada720_bl.c b/drivers/video/backlight/jornada720_bl.c
index 3ccb89340f22..6ce96b4a8796 100644
--- a/drivers/video/backlight/jornada720_bl.c
+++ b/drivers/video/backlight/jornada720_bl.c
@@ -115,9 +115,10 @@ static int jornada_bl_probe(struct platform_device *pdev)
memset(&props, 0, sizeof(struct backlight_properties));
props.type = BACKLIGHT_RAW;
props.max_brightness = BL_MAX_BRIGHT;
- bd = backlight_device_register(S1D_DEVICENAME, &pdev->dev, NULL,
- &jornada_bl_ops, &props);
+ bd = devm_backlight_device_register(&pdev->dev, S1D_DEVICENAME,
+ &pdev->dev, NULL, &jornada_bl_ops,
+ &props);
if (IS_ERR(bd)) {
ret = PTR_ERR(bd);
dev_err(&pdev->dev, "failed to register device, err=%x\n", ret);
@@ -139,18 +140,8 @@ static int jornada_bl_probe(struct platform_device *pdev)
return 0;
}
-static int jornada_bl_remove(struct platform_device *pdev)
-{
- struct backlight_device *bd = platform_get_drvdata(pdev);
-
- backlight_device_unregister(bd);
-
- return 0;
-}
-
static struct platform_driver jornada_bl_driver = {
.probe = jornada_bl_probe,
- .remove = jornada_bl_remove,
.driver = {
.name = "jornada_bl",
},
diff --git a/drivers/video/backlight/jornada720_lcd.c b/drivers/video/backlight/jornada720_lcd.c
index b061413f1a65..da3876c9b3ae 100644
--- a/drivers/video/backlight/jornada720_lcd.c
+++ b/drivers/video/backlight/jornada720_lcd.c
@@ -100,7 +100,8 @@ static int jornada_lcd_probe(struct platform_device *pdev)
struct lcd_device *lcd_device;
int ret;
- lcd_device = lcd_device_register(S1D_DEVICENAME, &pdev->dev, NULL, &jornada_lcd_props);
+ lcd_device = devm_lcd_device_register(&pdev->dev, S1D_DEVICENAME,
+ &pdev->dev, NULL, &jornada_lcd_props);
if (IS_ERR(lcd_device)) {
ret = PTR_ERR(lcd_device);
@@ -119,18 +120,8 @@ static int jornada_lcd_probe(struct platform_device *pdev)
return 0;
}
-static int jornada_lcd_remove(struct platform_device *pdev)
-{
- struct lcd_device *lcd_device = platform_get_drvdata(pdev);
-
- lcd_device_unregister(lcd_device);
-
- return 0;
-}
-
static struct platform_driver jornada_lcd_driver = {
.probe = jornada_lcd_probe,
- .remove = jornada_lcd_remove,
.driver = {
.name = "jornada_lcd",
},
diff --git a/drivers/video/backlight/kb3886_bl.c b/drivers/video/backlight/kb3886_bl.c
index 7592cc25c963..84a110a719cb 100644
--- a/drivers/video/backlight/kb3886_bl.c
+++ b/drivers/video/backlight/kb3886_bl.c
@@ -78,7 +78,7 @@ static struct kb3886bl_machinfo *bl_machinfo;
static unsigned long kb3886bl_flags;
#define KB3886BL_SUSPENDED 0x01
-static struct dmi_system_id __initdata kb3886bl_device_table[] = {
+static struct dmi_system_id kb3886bl_device_table[] __initdata = {
{
.ident = "Sahara Touch-iT",
.matches = {
diff --git a/drivers/video/backlight/l4f00242t03.c b/drivers/video/backlight/l4f00242t03.c
index b5fc13bc24e7..63e763828e0e 100644
--- a/drivers/video/backlight/l4f00242t03.c
+++ b/drivers/video/backlight/l4f00242t03.c
@@ -223,8 +223,8 @@ static int l4f00242t03_probe(struct spi_device *spi)
return PTR_ERR(priv->core_reg);
}
- priv->ld = lcd_device_register("l4f00242t03",
- &spi->dev, priv, &l4f_ops);
+ priv->ld = devm_lcd_device_register(&spi->dev, "l4f00242t03", &spi->dev,
+ priv, &l4f_ops);
if (IS_ERR(priv->ld))
return PTR_ERR(priv->ld);
@@ -243,8 +243,6 @@ static int l4f00242t03_remove(struct spi_device *spi)
struct l4f00242t03_priv *priv = spi_get_drvdata(spi);
l4f00242t03_lcd_power_set(priv->ld, FB_BLANK_POWERDOWN);
- lcd_device_unregister(priv->ld);
-
return 0;
}
diff --git a/drivers/video/backlight/lp855x_bl.c b/drivers/video/backlight/lp855x_bl.c
index cae80d555e84..2ca3a040007b 100644
--- a/drivers/video/backlight/lp855x_bl.c
+++ b/drivers/video/backlight/lp855x_bl.c
@@ -125,7 +125,7 @@ static bool lp855x_is_valid_rom_area(struct lp855x *lp, u8 addr)
return false;
}
- return (addr >= start && addr <= end);
+ return addr >= start && addr <= end;
}
static int lp8557_bl_off(struct lp855x *lp)
diff --git a/drivers/video/backlight/lp8788_bl.c b/drivers/video/backlight/lp8788_bl.c
index e49905d495dc..daba34dc46d4 100644
--- a/drivers/video/backlight/lp8788_bl.c
+++ b/drivers/video/backlight/lp8788_bl.c
@@ -63,13 +63,13 @@ static struct lp8788_bl_config default_bl_config = {
static inline bool is_brightness_ctrl_by_pwm(enum lp8788_bl_ctrl_mode mode)
{
- return (mode == LP8788_BL_COMB_PWM_BASED);
+ return mode == LP8788_BL_COMB_PWM_BASED;
}
static inline bool is_brightness_ctrl_by_register(enum lp8788_bl_ctrl_mode mode)
{
- return (mode == LP8788_BL_REGISTER_ONLY ||
- mode == LP8788_BL_COMB_REGISTER_BASED);
+ return mode == LP8788_BL_REGISTER_ONLY ||
+ mode == LP8788_BL_COMB_REGISTER_BASED;
}
static int lp8788_backlight_configure(struct lp8788_bl *bl)
diff --git a/drivers/video/backlight/omap1_bl.c b/drivers/video/backlight/omap1_bl.c
index ac11a4650c19..a0dcd88ac74f 100644
--- a/drivers/video/backlight/omap1_bl.c
+++ b/drivers/video/backlight/omap1_bl.c
@@ -146,8 +146,8 @@ static int omapbl_probe(struct platform_device *pdev)
memset(&props, 0, sizeof(struct backlight_properties));
props.type = BACKLIGHT_RAW;
props.max_brightness = OMAPBL_MAX_INTENSITY;
- dev = backlight_device_register("omap-bl", &pdev->dev, bl, &omapbl_ops,
- &props);
+ dev = devm_backlight_device_register(&pdev->dev, "omap-bl", &pdev->dev,
+ bl, &omapbl_ops, &props);
if (IS_ERR(dev))
return PTR_ERR(dev);
@@ -170,20 +170,10 @@ static int omapbl_probe(struct platform_device *pdev)
return 0;
}
-static int omapbl_remove(struct platform_device *pdev)
-{
- struct backlight_device *dev = platform_get_drvdata(pdev);
-
- backlight_device_unregister(dev);
-
- return 0;
-}
-
static SIMPLE_DEV_PM_OPS(omapbl_pm_ops, omapbl_suspend, omapbl_resume);
static struct platform_driver omapbl_driver = {
.probe = omapbl_probe,
- .remove = omapbl_remove,
.driver = {
.name = "omap-bl",
.pm = &omapbl_pm_ops,
diff --git a/drivers/video/backlight/ot200_bl.c b/drivers/video/backlight/ot200_bl.c
index fdbb6ee5027c..f5a5202dd79d 100644
--- a/drivers/video/backlight/ot200_bl.c
+++ b/drivers/video/backlight/ot200_bl.c
@@ -118,8 +118,9 @@ static int ot200_backlight_probe(struct platform_device *pdev)
props.brightness = 100;
props.type = BACKLIGHT_RAW;
- bl = backlight_device_register(dev_name(&pdev->dev), &pdev->dev, data,
- &ot200_backlight_ops, &props);
+ bl = devm_backlight_device_register(&pdev->dev, dev_name(&pdev->dev),
+ &pdev->dev, data, &ot200_backlight_ops,
+ &props);
if (IS_ERR(bl)) {
dev_err(&pdev->dev, "failed to register backlight\n");
retval = PTR_ERR(bl);
@@ -137,10 +138,6 @@ error_devm_kzalloc:
static int ot200_backlight_remove(struct platform_device *pdev)
{
- struct backlight_device *bl = platform_get_drvdata(pdev);
-
- backlight_device_unregister(bl);
-
/* on module unload set brightness to 100% */
cs5535_mfgpt_write(pwm_timer, MFGPT_REG_COUNTER, 0);
cs5535_mfgpt_write(pwm_timer, MFGPT_REG_SETUP, MFGPT_SETUP_CNTEN);
diff --git a/drivers/video/backlight/tosa_bl.c b/drivers/video/backlight/tosa_bl.c
index b8db9338cacd..3ad676558c80 100644
--- a/drivers/video/backlight/tosa_bl.c
+++ b/drivers/video/backlight/tosa_bl.c
@@ -105,8 +105,9 @@ static int tosa_bl_probe(struct i2c_client *client,
memset(&props, 0, sizeof(struct backlight_properties));
props.type = BACKLIGHT_RAW;
props.max_brightness = 512 - 1;
- data->bl = backlight_device_register("tosa-bl", &client->dev, data,
- &bl_ops, &props);
+ data->bl = devm_backlight_device_register(&client->dev, "tosa-bl",
+ &client->dev, data, &bl_ops,
+ &props);
if (IS_ERR(data->bl)) {
ret = PTR_ERR(data->bl);
goto err_reg;
@@ -128,9 +129,7 @@ static int tosa_bl_remove(struct i2c_client *client)
{
struct tosa_bl_data *data = i2c_get_clientdata(client);
- backlight_device_unregister(data->bl);
data->bl = NULL;
-
return 0;
}
diff --git a/drivers/video/backlight/tosa_lcd.c b/drivers/video/backlight/tosa_lcd.c
index be5d636764bf..f08d641ccd01 100644
--- a/drivers/video/backlight/tosa_lcd.c
+++ b/drivers/video/backlight/tosa_lcd.c
@@ -206,8 +206,8 @@ static int tosa_lcd_probe(struct spi_device *spi)
tosa_lcd_tg_on(data);
- data->lcd = lcd_device_register("tosa-lcd", &spi->dev, data,
- &tosa_lcd_ops);
+ data->lcd = devm_lcd_device_register(&spi->dev, "tosa-lcd", &spi->dev,
+ data, &tosa_lcd_ops);
if (IS_ERR(data->lcd)) {
ret = PTR_ERR(data->lcd);
@@ -226,8 +226,6 @@ static int tosa_lcd_remove(struct spi_device *spi)
{
struct tosa_lcd_data *data = spi_get_drvdata(spi);
- lcd_device_unregister(data->lcd);
-
if (data->i2c)
i2c_unregister_device(data->i2c);
diff --git a/drivers/vlynq/vlynq.c b/drivers/vlynq/vlynq.c
index 7b07135ab26e..c0227f9418eb 100644
--- a/drivers/vlynq/vlynq.c
+++ b/drivers/vlynq/vlynq.c
@@ -762,7 +762,8 @@ static int vlynq_remove(struct platform_device *pdev)
device_unregister(&dev->dev);
iounmap(dev->local);
- release_mem_region(dev->regs_start, dev->regs_end - dev->regs_start);
+ release_mem_region(dev->regs_start,
+ dev->regs_end - dev->regs_start + 1);
kfree(dev);
diff --git a/drivers/w1/masters/w1-gpio.c b/drivers/w1/masters/w1-gpio.c
index e36b18b2817b..9709b8b484ba 100644
--- a/drivers/w1/masters/w1-gpio.c
+++ b/drivers/w1/masters/w1-gpio.c
@@ -18,10 +18,31 @@
#include <linux/of_gpio.h>
#include <linux/err.h>
#include <linux/of.h>
+#include <linux/delay.h>
#include "../w1.h"
#include "../w1_int.h"
+static u8 w1_gpio_set_pullup(void *data, int delay)
+{
+ struct w1_gpio_platform_data *pdata = data;
+
+ if (delay) {
+ pdata->pullup_duration = delay;
+ } else {
+ if (pdata->pullup_duration) {
+ gpio_direction_output(pdata->pin, 1);
+
+ msleep(pdata->pullup_duration);
+
+ gpio_direction_input(pdata->pin);
+ }
+ pdata->pullup_duration = 0;
+ }
+
+ return 0;
+}
+
static void w1_gpio_write_bit_dir(void *data, u8 bit)
{
struct w1_gpio_platform_data *pdata = data;
@@ -132,6 +153,7 @@ static int w1_gpio_probe(struct platform_device *pdev)
} else {
gpio_direction_input(pdata->pin);
master->write_bit = w1_gpio_write_bit_dir;
+ master->set_pullup = w1_gpio_set_pullup;
}
err = w1_add_master_device(master);
diff --git a/drivers/w1/w1_int.c b/drivers/w1/w1_int.c
index 5a98649f6abc..590bd8a7cd1b 100644
--- a/drivers/w1/w1_int.c
+++ b/drivers/w1/w1_int.c
@@ -117,18 +117,6 @@ int w1_add_master_device(struct w1_bus_master *master)
printk(KERN_ERR "w1_add_master_device: invalid function set\n");
return(-EINVAL);
}
- /* While it would be electrically possible to make a device that
- * generated a strong pullup in bit bang mode, only hardware that
- * controls 1-wire time frames are even expected to support a strong
- * pullup. w1_io.c would need to support calling set_pullup before
- * the last write_bit operation of a w1_write_8 which it currently
- * doesn't.
- */
- if (!master->write_byte && !master->touch_bit && master->set_pullup) {
- printk(KERN_ERR "w1_add_master_device: set_pullup requires "
- "write_byte or touch_bit, disabling\n");
- master->set_pullup = NULL;
- }
/* Lock until the device is added (or not) to w1_masters. */
mutex_lock(&w1_mlock);
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 526e4bbbde59..276cb6ed0b93 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -147,11 +147,11 @@ int afs_proc_init(void)
if (!proc_afs)
goto error_dir;
- p = proc_create("cells", 0, proc_afs, &afs_proc_cells_fops);
+ p = proc_create("cells", S_IFREG | S_IRUGO | S_IWUSR, proc_afs, &afs_proc_cells_fops);
if (!p)
goto error_cells;
- p = proc_create("rootcell", 0, proc_afs, &afs_proc_rootcell_fops);
+ p = proc_create("rootcell", S_IFREG | S_IRUGO | S_IWUSR, proc_afs, &afs_proc_rootcell_fops);
if (!p)
goto error_rootcell;
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 4218e26df916..acf32054edd8 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -104,7 +104,7 @@ struct autofs_sb_info {
u32 magic;
int pipefd;
struct file *pipe;
- pid_t oz_pgrp;
+ struct pid *oz_pgrp;
int catatonic;
int version;
int sub_version;
@@ -140,7 +140,7 @@ static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry)
filesystem without "magic".) */
static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) {
- return sbi->catatonic || task_pgrp_nr(current) == sbi->oz_pgrp;
+ return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
}
/* Does a dentry have some pending activity? */
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 1818ce7f5a06..3182c0e68b42 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -346,6 +346,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
{
int pipefd;
int err = 0;
+ struct pid *new_pid = NULL;
if (param->setpipefd.pipefd == -1)
return -EINVAL;
@@ -357,7 +358,17 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
mutex_unlock(&sbi->wq_mutex);
return -EBUSY;
} else {
- struct file *pipe = fget(pipefd);
+ struct file *pipe;
+
+ new_pid = get_task_pid(current, PIDTYPE_PGID);
+
+ if (ns_of_pid(new_pid) != ns_of_pid(sbi->oz_pgrp)) {
+ AUTOFS_WARN("Not allowed to change PID namespace");
+ err = -EINVAL;
+ goto out;
+ }
+
+ pipe = fget(pipefd);
if (!pipe) {
err = -EBADF;
goto out;
@@ -367,12 +378,13 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
fput(pipe);
goto out;
}
- sbi->oz_pgrp = task_pgrp_nr(current);
+ swap(sbi->oz_pgrp, new_pid);
sbi->pipefd = pipefd;
sbi->pipe = pipe;
sbi->catatonic = 0;
}
out:
+ put_pid(new_pid);
mutex_unlock(&sbi->wq_mutex);
return err;
}
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 3d9d3f5d5dda..394e90b02c5e 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -402,6 +402,20 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
goto next;
}
+ if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) {
+ DPRINTK("checking symlink %p %.*s",
+ dentry, (int)dentry->d_name.len, dentry->d_name.name);
+ /*
+ * A symlink can't be "busy" in the usual sense so
+ * just check last used for expire timeout.
+ */
+ if (autofs4_can_expire(dentry, timeout, do_now)) {
+ expired = dentry;
+ goto found;
+ }
+ goto next;
+ }
+
if (simple_empty(dentry))
goto next;
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 3b9cc9b973c2..d7bd395ab586 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -56,8 +56,11 @@ void autofs4_kill_sb(struct super_block *sb)
* just call kill_anon_super when we are called from
* deactivate_super.
*/
- if (sbi) /* Free wait queues, close pipe */
+ if (sbi) {
+ /* Free wait queues, close pipe */
autofs4_catatonic_mode(sbi);
+ put_pid(sbi->oz_pgrp);
+ }
DPRINTK("shutting down");
kill_litter_super(sb);
@@ -80,7 +83,7 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root)
if (!gid_eq(root_inode->i_gid, GLOBAL_ROOT_GID))
seq_printf(m, ",gid=%u",
from_kgid_munged(&init_user_ns, root_inode->i_gid));
- seq_printf(m, ",pgrp=%d", sbi->oz_pgrp);
+ seq_printf(m, ",pgrp=%d", pid_vnr(sbi->oz_pgrp));
seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ);
seq_printf(m, ",minproto=%d", sbi->min_proto);
seq_printf(m, ",maxproto=%d", sbi->max_proto);
@@ -124,7 +127,8 @@ static const match_table_t tokens = {
};
static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
- pid_t *pgrp, unsigned int *type, int *minproto, int *maxproto)
+ int *pgrp, bool *pgrp_set, unsigned int *type,
+ int *minproto, int *maxproto)
{
char *p;
substring_t args[MAX_OPT_ARGS];
@@ -132,7 +136,6 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
*uid = current_uid();
*gid = current_gid();
- *pgrp = task_pgrp_nr(current);
*minproto = AUTOFS_MIN_PROTO_VERSION;
*maxproto = AUTOFS_MAX_PROTO_VERSION;
@@ -171,6 +174,7 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
if (match_int(args, &option))
return 1;
*pgrp = option;
+ *pgrp_set = true;
break;
case Opt_minproto:
if (match_int(args, &option))
@@ -206,10 +210,13 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
int pipefd;
struct autofs_sb_info *sbi;
struct autofs_info *ino;
+ int pgrp;
+ bool pgrp_set = false;
+ int ret = -EINVAL;
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
- goto fail_unlock;
+ return -ENOMEM;
DPRINTK("starting up, sbi = %p",sbi);
s->s_fs_info = sbi;
@@ -218,7 +225,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
sbi->pipe = NULL;
sbi->catatonic = 1;
sbi->exp_timeout = 0;
- sbi->oz_pgrp = task_pgrp_nr(current);
+ sbi->oz_pgrp = NULL;
sbi->sb = s;
sbi->version = 0;
sbi->sub_version = 0;
@@ -243,8 +250,10 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
* Get the root inode and dentry, but defer checking for errors.
*/
ino = autofs4_new_ino(sbi);
- if (!ino)
+ if (!ino) {
+ ret = -ENOMEM;
goto fail_free;
+ }
root_inode = autofs4_get_inode(s, S_IFDIR | 0755);
root = d_make_root(root_inode);
if (!root)
@@ -255,12 +264,23 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
/* Can this call block? */
if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid,
- &sbi->oz_pgrp, &sbi->type, &sbi->min_proto,
- &sbi->max_proto)) {
+ &pgrp, &pgrp_set, &sbi->type, &sbi->min_proto,
+ &sbi->max_proto)) {
printk("autofs: called with bogus options\n");
goto fail_dput;
}
+ if (pgrp_set) {
+ sbi->oz_pgrp = find_get_pid(pgrp);
+ if (!sbi->oz_pgrp) {
+ pr_warn("autofs: could not find process group %d\n",
+ pgrp);
+ goto fail_dput;
+ }
+ } else {
+ sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID);
+ }
+
if (autofs_type_trigger(sbi->type))
__managed_dentry_set_managed(root);
@@ -284,14 +304,15 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
sbi->version = sbi->max_proto;
sbi->sub_version = AUTOFS_PROTO_SUBVERSION;
- DPRINTK("pipe fd = %d, pgrp = %u", pipefd, sbi->oz_pgrp);
+ DPRINTK("pipe fd = %d, pgrp = %u", pipefd, pid_nr(sbi->oz_pgrp));
pipe = fget(pipefd);
-
+
if (!pipe) {
printk("autofs: could not open pipe file descriptor\n");
goto fail_dput;
}
- if (autofs_prepare_pipe(pipe) < 0)
+ ret = autofs_prepare_pipe(pipe);
+ if (ret < 0)
goto fail_fput;
sbi->pipe = pipe;
sbi->pipefd = pipefd;
@@ -316,10 +337,10 @@ fail_dput:
fail_ino:
kfree(ino);
fail_free:
+ put_pid(sbi->oz_pgrp);
kfree(sbi);
s->s_fs_info = NULL;
-fail_unlock:
- return -EINVAL;
+ return ret;
}
struct inode *autofs4_get_inode(struct super_block *sb, umode_t mode)
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 92ef341ba0cf..2caf36ac3e93 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -558,7 +558,7 @@ static int autofs4_dir_symlink(struct inode *dir,
dget(dentry);
atomic_inc(&ino->count);
p_ino = autofs4_dentry_ino(dentry->d_parent);
- if (p_ino && dentry->d_parent != dentry)
+ if (p_ino && !IS_ROOT(dentry))
atomic_inc(&p_ino->count);
dir->i_mtime = CURRENT_TIME;
@@ -593,7 +593,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
if (atomic_dec_and_test(&ino->count)) {
p_ino = autofs4_dentry_ino(dentry->d_parent);
- if (p_ino && dentry->d_parent != dentry)
+ if (p_ino && !IS_ROOT(dentry))
atomic_dec(&p_ino->count);
}
dput(ino->dentry);
@@ -732,7 +732,7 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t m
dget(dentry);
atomic_inc(&ino->count);
p_ino = autofs4_dentry_ino(dentry->d_parent);
- if (p_ino && dentry->d_parent != dentry)
+ if (p_ino && !IS_ROOT(dentry))
atomic_inc(&p_ino->count);
inc_nlink(dir);
dir->i_mtime = CURRENT_TIME;
diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c
index f27c094a1919..1e8ea192be2b 100644
--- a/fs/autofs4/symlink.c
+++ b/fs/autofs4/symlink.c
@@ -14,6 +14,10 @@
static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
{
+ struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+ struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ if (ino && !autofs4_oz_mode(sbi))
+ ino->last_used = jiffies;
nd_set_link(nd, dentry->d_inode->i_private);
return NULL;
}
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 689e40d983ad..116fd38ee472 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -347,11 +347,23 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
struct qstr qstr;
char *name;
int status, ret, type;
+ pid_t pid;
+ pid_t tgid;
/* In catatonic mode, we don't wait for nobody */
if (sbi->catatonic)
return -ENOENT;
+ /*
+ * Try translating pids to the namespace of the daemon.
+ *
+ * Zero means failure: we are in an unrelated pid namespace.
+ */
+ pid = task_pid_nr_ns(current, ns_of_pid(sbi->oz_pgrp));
+ tgid = task_tgid_nr_ns(current, ns_of_pid(sbi->oz_pgrp));
+ if (pid == 0 || tgid == 0)
+ return -ENOENT;
+
if (!dentry->d_inode) {
/*
* A wait for a negative dentry is invalid for certain
@@ -417,8 +429,8 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
wq->ino = autofs4_get_ino(sbi);
wq->uid = current_uid();
wq->gid = current_gid();
- wq->pid = current->pid;
- wq->tgid = current->tgid;
+ wq->pid = pid;
+ wq->tgid = tgid;
wq->status = -EINTR; /* Status return if interrupted */
wq->wait_ctr = 2;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 571a42326908..1a965e654f2e 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -140,6 +140,25 @@ static int padzero(unsigned long elf_bss)
#define ELF_BASE_PLATFORM NULL
#endif
+/*
+ * Use get_random_int() to implement AT_RANDOM while avoiding depletion
+ * of the entropy pool.
+ */
+static void get_atrandom_bytes(unsigned char *buf, size_t nbytes)
+{
+ unsigned char *p = buf;
+
+ while (nbytes) {
+ unsigned int random_variable;
+ size_t chunk = min(nbytes, sizeof(random_variable));
+
+ random_variable = get_random_int();
+ memcpy(p, &random_variable, chunk);
+ p += chunk;
+ nbytes -= chunk;
+ }
+}
+
static int
create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
unsigned long load_addr, unsigned long interp_load_addr)
@@ -201,7 +220,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
/*
* Generate 16 random bytes for userspace PRNG seeding.
*/
- get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes));
+ get_atrandom_bytes(k_rand_bytes, sizeof(k_rand_bytes));
u_rand_bytes = (elf_addr_t __user *)
STACK_ALLOC(p, sizeof(k_rand_bytes));
if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes)))
@@ -543,9 +562,6 @@ out:
* libraries. There is no binary dependent code anywhere else.
*/
-#define INTERPRETER_NONE 0
-#define INTERPRETER_ELF 2
-
#ifndef STACK_RND_MASK
#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */
#endif
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index dc52e13d58e0..3881610b6438 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -680,7 +680,8 @@ static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd,
struct i2c_msg __user *tmsgs;
struct i2c_msg32 __user *umsgs;
compat_caddr_t datap;
- int nmsgs, i;
+ u32 nmsgs;
+ int i;
if (get_user(nmsgs, &udata->nmsgs))
return -EFAULT;
diff --git a/fs/coredump.c b/fs/coredump.c
index bc3fbcd32558..e3ad709a4232 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -40,7 +40,6 @@
#include <trace/events/task.h>
#include "internal.h"
-#include "coredump.h"
#include <trace/events/sched.h>
diff --git a/fs/coredump.h b/fs/coredump.h
deleted file mode 100644
index e39ff072110d..000000000000
--- a/fs/coredump.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _FS_COREDUMP_H
-#define _FS_COREDUMP_H
-
-extern int __get_dumpable(unsigned long mm_flags);
-
-#endif
diff --git a/fs/exec.c b/fs/exec.c
index 7ea097f6b341..e1529b4c79b1 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -62,7 +62,6 @@
#include <trace/events/task.h>
#include "internal.h"
-#include "coredump.h"
#include <trace/events/sched.h>
@@ -843,7 +842,6 @@ static int exec_mmap(struct mm_struct *mm)
tsk->active_mm = mm;
activate_mm(active_mm, mm);
task_unlock(tsk);
- arch_pick_mmap_layout(mm);
if (old_mm) {
up_read(&old_mm->mmap_sem);
BUG_ON(active_mm != old_mm);
@@ -1088,8 +1086,8 @@ int flush_old_exec(struct linux_binprm * bprm)
bprm->mm = NULL; /* We're using it now */
set_fs(USER_DS);
- current->flags &=
- ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | PF_NOFREEZE);
+ current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
+ PF_NOFREEZE | PF_NO_SETAFFINITY);
flush_thread();
current->personality &= ~bprm->per_clear;
@@ -1139,9 +1137,7 @@ void setup_new_exec(struct linux_binprm * bprm)
/* An exec changes our domain. We are no longer part of the thread
group */
-
current->self_exec_id++;
-
flush_signal_handlers(current, 0);
do_close_on_exec(current->files);
}
@@ -1173,6 +1169,10 @@ void free_bprm(struct linux_binprm *bprm)
mutex_unlock(&current->signal->cred_guard_mutex);
abort_creds(bprm->cred);
}
+ if (bprm->file) {
+ allow_write_access(bprm->file);
+ fput(bprm->file);
+ }
/* If a binfmt changed the interp, free it. */
if (bprm->interp != bprm->filename)
kfree(bprm->interp);
@@ -1224,11 +1224,10 @@ EXPORT_SYMBOL(install_exec_creds);
* - the caller must hold ->cred_guard_mutex to protect against
* PTRACE_ATTACH
*/
-static int check_unsafe_exec(struct linux_binprm *bprm)
+static void check_unsafe_exec(struct linux_binprm *bprm)
{
struct task_struct *p = current, *t;
unsigned n_fs;
- int res = 0;
if (p->ptrace) {
if (p->ptrace & PT_PTRACE_CAP)
@@ -1244,31 +1243,25 @@ static int check_unsafe_exec(struct linux_binprm *bprm)
if (current->no_new_privs)
bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;
+ t = p;
n_fs = 1;
spin_lock(&p->fs->lock);
rcu_read_lock();
- for (t = next_thread(p); t != p; t = next_thread(t)) {
+ while_each_thread(p, t) {
if (t->fs == p->fs)
n_fs++;
}
rcu_read_unlock();
- if (p->fs->users > n_fs) {
+ if (p->fs->users > n_fs)
bprm->unsafe |= LSM_UNSAFE_SHARE;
- } else {
- res = -EAGAIN;
- if (!p->fs->in_exec) {
- p->fs->in_exec = 1;
- res = 1;
- }
- }
+ else
+ p->fs->in_exec = 1;
spin_unlock(&p->fs->lock);
-
- return res;
}
-/*
- * Fill the binprm structure from the inode.
+/*
+ * Fill the binprm structure from the inode.
* Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
*
* This may be called multiple times for binary chains (scripts for example).
@@ -1430,14 +1423,7 @@ static int exec_binprm(struct linux_binprm *bprm)
audit_bprm(bprm);
trace_sched_process_exec(current, old_pid, bprm);
ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
- current->did_exec = 1;
proc_exec_connector(current);
-
- if (bprm->file) {
- allow_write_access(bprm->file);
- fput(bprm->file);
- bprm->file = NULL; /* to catch use-after-free */
- }
}
return ret;
@@ -1453,7 +1439,6 @@ static int do_execve_common(const char *filename,
struct linux_binprm *bprm;
struct file *file;
struct files_struct *displaced;
- bool clear_in_exec;
int retval;
/*
@@ -1485,10 +1470,7 @@ static int do_execve_common(const char *filename,
if (retval)
goto out_free;
- retval = check_unsafe_exec(bprm);
- if (retval < 0)
- goto out_free;
- clear_in_exec = retval;
+ check_unsafe_exec(bprm);
current->in_execve = 1;
file = open_exec(filename);
@@ -1504,7 +1486,7 @@ static int do_execve_common(const char *filename,
retval = bprm_mm_init(bprm);
if (retval)
- goto out_file;
+ goto out_unmark;
bprm->argc = count(argv, MAX_ARG_STRINGS);
if ((retval = bprm->argc) < 0)
@@ -1551,15 +1533,8 @@ out:
mmput(bprm->mm);
}
-out_file:
- if (bprm->file) {
- allow_write_access(bprm->file);
- fput(bprm->file);
- }
-
out_unmark:
- if (clear_in_exec)
- current->fs->in_exec = 0;
+ current->fs->in_exec = 0;
current->in_execve = 0;
out_free:
@@ -1609,67 +1584,22 @@ void set_binfmt(struct linux_binfmt *new)
if (new)
__module_get(new->module);
}
-
EXPORT_SYMBOL(set_binfmt);
/*
- * set_dumpable converts traditional three-value dumpable to two flags and
- * stores them into mm->flags. It modifies lower two bits of mm->flags, but
- * these bits are not changed atomically. So get_dumpable can observe the
- * intermediate state. To avoid doing unexpected behavior, get get_dumpable
- * return either old dumpable or new one by paying attention to the order of
- * modifying the bits.
- *
- * dumpable | mm->flags (binary)
- * old new | initial interim final
- * ---------+-----------------------
- * 0 1 | 00 01 01
- * 0 2 | 00 10(*) 11
- * 1 0 | 01 00 00
- * 1 2 | 01 11 11
- * 2 0 | 11 10(*) 00
- * 2 1 | 11 11 01
- *
- * (*) get_dumpable regards interim value of 10 as 11.
+ * set_dumpable stores three-value SUID_DUMP_* into mm->flags.
*/
void set_dumpable(struct mm_struct *mm, int value)
{
- switch (value) {
- case SUID_DUMP_DISABLE:
- clear_bit(MMF_DUMPABLE, &mm->flags);
- smp_wmb();
- clear_bit(MMF_DUMP_SECURELY, &mm->flags);
- break;
- case SUID_DUMP_USER:
- set_bit(MMF_DUMPABLE, &mm->flags);
- smp_wmb();
- clear_bit(MMF_DUMP_SECURELY, &mm->flags);
- break;
- case SUID_DUMP_ROOT:
- set_bit(MMF_DUMP_SECURELY, &mm->flags);
- smp_wmb();
- set_bit(MMF_DUMPABLE, &mm->flags);
- break;
- }
-}
-
-int __get_dumpable(unsigned long mm_flags)
-{
- int ret;
+ unsigned long old, new;
- ret = mm_flags & MMF_DUMPABLE_MASK;
- return (ret > SUID_DUMP_USER) ? SUID_DUMP_ROOT : ret;
-}
+ if (WARN_ON((unsigned)value > SUID_DUMP_ROOT))
+ return;
-/*
- * This returns the actual value of the suid_dumpable flag. For things
- * that are using this for checking for privilege transitions, it must
- * test against SUID_DUMP_USER rather than treating it as a boolean
- * value.
- */
-int get_dumpable(struct mm_struct *mm)
-{
- return __get_dumpable(mm->flags);
+ do {
+ old = ACCESS_ONCE(mm->flags);
+ new = (old & ~MMF_DUMPABLE_MASK) | value;
+ } while (cmpxchg(&mm->flags, old, new) != old);
}
SYSCALL_DEFINE3(execve,
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index bafdd48eefde..e66e4808719f 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -309,43 +309,17 @@ struct fname {
*/
static void free_rb_tree_fname(struct rb_root *root)
{
- struct rb_node *n = root->rb_node;
- struct rb_node *parent;
- struct fname *fname;
-
- while (n) {
- /* Do the node's children first */
- if (n->rb_left) {
- n = n->rb_left;
- continue;
- }
- if (n->rb_right) {
- n = n->rb_right;
- continue;
- }
- /*
- * The node has no children; free it, and then zero
- * out parent's link to it. Finally go to the
- * beginning of the loop and try to free the parent
- * node.
- */
- parent = rb_parent(n);
- fname = rb_entry(n, struct fname, rb_hash);
- while (fname) {
- struct fname * old = fname;
+ struct fname *fname, *next;
+
+ rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash)
+ do {
+ struct fname *old = fname;
fname = fname->next;
- kfree (old);
- }
- if (!parent)
- *root = RB_ROOT;
- else if (parent->rb_left == n)
- parent->rb_left = NULL;
- else if (parent->rb_right == n)
- parent->rb_right = NULL;
- n = parent;
- }
-}
+ kfree(old);
+ } while (fname);
+ *root = RB_ROOT;
+}
static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp,
loff_t pos)
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 3f11656bd72e..41eb9dcfac7e 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -180,37 +180,12 @@ int ext4_setup_system_zone(struct super_block *sb)
/* Called when the filesystem is unmounted */
void ext4_release_system_zone(struct super_block *sb)
{
- struct rb_node *n = EXT4_SB(sb)->system_blks.rb_node;
- struct rb_node *parent;
- struct ext4_system_zone *entry;
+ struct ext4_system_zone *entry, *n;
- while (n) {
- /* Do the node's children first */
- if (n->rb_left) {
- n = n->rb_left;
- continue;
- }
- if (n->rb_right) {
- n = n->rb_right;
- continue;
- }
- /*
- * The node has no children; free it, and then zero
- * out parent's link to it. Finally go to the
- * beginning of the loop and try to free the parent
- * node.
- */
- parent = rb_parent(n);
- entry = rb_entry(n, struct ext4_system_zone, node);
+ rbtree_postorder_for_each_entry_safe(entry, n,
+ &EXT4_SB(sb)->system_blks, node)
kmem_cache_free(ext4_system_zone_cachep, entry);
- if (!parent)
- EXT4_SB(sb)->system_blks = RB_ROOT;
- else if (parent->rb_left == n)
- parent->rb_left = NULL;
- else if (parent->rb_right == n)
- parent->rb_right = NULL;
- n = parent;
- }
+
EXT4_SB(sb)->system_blks = RB_ROOT;
}
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 680bb3388919..d638c57e996e 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -353,41 +353,16 @@ struct fname {
*/
static void free_rb_tree_fname(struct rb_root *root)
{
- struct rb_node *n = root->rb_node;
- struct rb_node *parent;
- struct fname *fname;
-
- while (n) {
- /* Do the node's children first */
- if (n->rb_left) {
- n = n->rb_left;
- continue;
- }
- if (n->rb_right) {
- n = n->rb_right;
- continue;
- }
- /*
- * The node has no children; free it, and then zero
- * out parent's link to it. Finally go to the
- * beginning of the loop and try to free the parent
- * node.
- */
- parent = rb_parent(n);
- fname = rb_entry(n, struct fname, rb_hash);
+ struct fname *fname, *next;
+
+ rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash)
while (fname) {
struct fname *old = fname;
fname = fname->next;
kfree(old);
}
- if (!parent)
- *root = RB_ROOT;
- else if (parent->rb_left == n)
- parent->rb_left = NULL;
- else if (parent->rb_right == n)
- parent->rb_right = NULL;
- n = parent;
- }
+
+ *root = RB_ROOT;
}
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 91ad9e1c9441..d22c1a209808 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -325,19 +325,26 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits;
if (sector >= last_block) {
- if (!create)
- return 0;
-
/*
- * ->mmu_private can access on only allocation path.
- * (caller must hold ->i_mutex)
+ * Both ->mmu_private and ->i_disksize can access
+ * on only allocation path. (caller must hold ->i_mutex)
*/
- last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1))
+ last_block = (MSDOS_I(inode)->i_disksize + (blocksize - 1))
>> blocksize_bits;
+ if (!create) {
+ /* Map a block in fallocated region */
+ if (atomic_read(&MSDOS_I(inode)->beyond_isize))
+ if (sector < last_block)
+ goto out_map_cluster;
+
+ return 0;
+ }
+
if (sector >= last_block)
return 0;
}
+out_map_cluster:
cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits);
offset = sector & (sbi->sec_per_clus - 1);
cluster = fat_bmap_cluster(inode, cluster);
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 7c31f4bc74a9..b8842769b0c5 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -118,7 +118,8 @@ struct msdos_inode_info {
unsigned int cache_valid_id;
/* NOTE: mmu_private is 64bits, so must hold ->i_mutex to access */
- loff_t mmu_private; /* physically allocated size */
+ loff_t mmu_private; /* physically allocated size (initialized) */
+ loff_t i_disksize; /* physically allocated size (uninitialized) */
int i_start; /* first cluster or 0 */
int i_logstart; /* logical first cluster */
@@ -128,6 +129,9 @@ struct msdos_inode_info {
struct hlist_node i_dir_hash; /* hash by i_logstart */
struct rw_semaphore truncate_lock; /* protect bmap against truncate */
struct inode vfs_inode;
+
+ /* for getting block number beyond file size in case of fallocate */
+ atomic_t beyond_isize;
};
struct fat_slot_info {
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 9b104f543056..79db8b6ab347 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -17,8 +17,12 @@
#include <linux/blkdev.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
+#include <linux/falloc.h>
#include "fat.h"
+static long fat_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t len);
+
static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr)
{
u32 attr;
@@ -182,6 +186,7 @@ const struct file_operations fat_file_operations = {
#endif
.fsync = fat_file_fsync,
.splice_read = generic_file_splice_read,
+ .fallocate = fat_fallocate,
};
static int fat_cont_expand(struct inode *inode, loff_t size)
@@ -220,6 +225,75 @@ out:
return err;
}
+/*
+ * Preallocate space for a file. This implements fat's fallocate file
+ * operation, which gets called from sys_fallocate system call. User
+ * space requests len bytes at offset. If FALLOC_FL_KEEP_SIZE is set
+ * we just allocate clusters without zeroing them out. Otherwise we
+ * allocate and zero out clusters via an expanding truncate.
+ */
+static long fat_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t len)
+{
+ int cluster;
+ int nr_cluster; /* Number of clusters to be allocated */
+ loff_t mm_bytes; /* Number of bytes to be allocated for file */
+ struct inode *inode = file->f_mapping->host;
+ struct super_block *sb = inode->i_sb;
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ int err = 0;
+
+ /* No support for hole punch or other fallocate flags. */
+ if (mode & ~FALLOC_FL_KEEP_SIZE)
+ return -EOPNOTSUPP;
+
+ /* No support for dir */
+ if (!S_ISREG(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ mutex_lock(&inode->i_mutex);
+ if ((offset + len) <= MSDOS_I(inode)->i_disksize)
+ goto error;
+
+ err = inode_newsize_ok(inode, (len + offset));
+ if (err)
+ goto error;
+
+ if (mode & FALLOC_FL_KEEP_SIZE) {
+ /* First compute the number of clusters to be allocated */
+ mm_bytes = offset + len - round_up(MSDOS_I(inode)->mmu_private,
+ sbi->cluster_size);
+ nr_cluster = (mm_bytes + (sbi->cluster_size - 1)) >>
+ sbi->cluster_bits;
+
+ /* Start the allocation.We are not zeroing out the clusters */
+ while (nr_cluster-- > 0) {
+ err = fat_alloc_clusters(inode, &cluster, 1);
+ if (err) {
+ fat_msg(sb, KERN_ERR,
+ "fat_fallocate(): fat_alloc_clusters() error");
+ goto error;
+ }
+ err = fat_chain_add(inode, cluster, 1);
+ if (err) {
+ fat_free_clusters(inode, cluster);
+ goto error;
+ }
+ MSDOS_I(inode)->i_disksize += sbi->cluster_size;
+ }
+ } else {
+ /* This is just an expanding truncate */
+ err = fat_cont_expand(inode, (offset + len));
+ if (err)
+ fat_msg(sb, KERN_ERR,
+ "fat_fallocate(): fat_cont_expand() error");
+ }
+
+error:
+ mutex_unlock(&inode->i_mutex);
+ return err;
+}
+
/* Free all clusters after the skip'th cluster. */
static int fat_free(struct inode *inode, int skip)
{
@@ -300,8 +374,10 @@ void fat_truncate_blocks(struct inode *inode, loff_t offset)
* This protects against truncating a file bigger than it was then
* trying to write into the hole.
*/
- if (MSDOS_I(inode)->mmu_private > offset)
+ if (MSDOS_I(inode)->i_disksize > offset) {
MSDOS_I(inode)->mmu_private = offset;
+ MSDOS_I(inode)->i_disksize = offset;
+ }
nr_clusters = (offset + (cluster_size - 1)) >> sbi->cluster_bits;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 854b578f6695..ba9831d9f648 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -54,6 +54,25 @@ static int fat_add_cluster(struct inode *inode)
return err;
}
+static void check_fallocated_region(struct inode *inode, sector_t iblock,
+ unsigned long *max_blocks, struct buffer_head *bh_result)
+{
+ struct super_block *sb = inode->i_sb;
+ sector_t last_block, disk_block;
+ const unsigned long blocksize = sb->s_blocksize;
+ const unsigned char blocksize_bits = sb->s_blocksize_bits;
+
+ last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1))
+ >> blocksize_bits;
+ disk_block = (MSDOS_I(inode)->i_disksize + (blocksize - 1))
+ >> blocksize_bits;
+ if (iblock >= last_block && iblock <= disk_block) {
+ MSDOS_I(inode)->mmu_private += *max_blocks << blocksize_bits;
+ set_buffer_new(bh_result);
+ }
+
+}
+
static inline int __fat_get_block(struct inode *inode, sector_t iblock,
unsigned long *max_blocks,
struct buffer_head *bh_result, int create)
@@ -68,8 +87,11 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
if (err)
return err;
if (phys) {
- map_bh(bh_result, sb, phys);
*max_blocks = min(mapped_blocks, *max_blocks);
+ if (create)
+ check_fallocated_region(inode, iblock, max_blocks,
+ bh_result);
+ map_bh(bh_result, sb, phys);
return 0;
}
if (!create)
@@ -93,6 +115,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
*max_blocks = min(mapped_blocks, *max_blocks);
MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits;
+ MSDOS_I(inode)->i_disksize = MSDOS_I(inode)->mmu_private;
err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create);
if (err)
@@ -206,6 +229,13 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
loff_t size = offset + iov_length(iov, nr_segs);
if (MSDOS_I(inode)->mmu_private < size)
return 0;
+
+ /*
+ * In case of writing in fallocated region, return 0 and
+ * fallback to buffered write.
+ */
+ if (MSDOS_I(inode)->i_disksize > MSDOS_I(inode)->mmu_private)
+ return 0;
}
/*
@@ -226,7 +256,10 @@ static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
/* fat_get_cluster() assumes the requested blocknr isn't truncated. */
down_read(&MSDOS_I(mapping->host)->truncate_lock);
+ /* To get block number beyond file size in fallocated region */
+ atomic_set(&MSDOS_I(mapping->host)->beyond_isize, 1);
blocknr = generic_block_bmap(mapping, block, fat_get_block);
+ atomic_set(&MSDOS_I(mapping->host)->beyond_isize, 0);
up_read(&MSDOS_I(mapping->host)->truncate_lock);
return blocknr;
@@ -408,6 +441,7 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
if (error < 0)
return error;
MSDOS_I(inode)->mmu_private = inode->i_size;
+ MSDOS_I(inode)->i_disksize = inode->i_size;
set_nlink(inode, fat_subdirs(inode));
} else { /* not a directory */
@@ -423,6 +457,7 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
inode->i_fop = &fat_file_operations;
inode->i_mapping->a_ops = &fat_aops;
MSDOS_I(inode)->mmu_private = inode->i_size;
+ MSDOS_I(inode)->i_disksize = inode->i_size;
}
if (de->attr & ATTR_SYS) {
if (sbi->options.sys_immutable)
@@ -494,6 +529,25 @@ static void fat_evict_inode(struct inode *inode)
if (!inode->i_nlink) {
inode->i_size = 0;
fat_truncate_blocks(inode, 0);
+ } else {
+ /* Release unwritten fallocated blocks on inode eviction. */
+ if (MSDOS_I(inode)->mmu_private < MSDOS_I(inode)->i_disksize) {
+ int err;
+ fat_truncate_blocks(inode, MSDOS_I(inode)->mmu_private);
+ /* Fallocate results in updating the i_start/iogstart
+ * for the zero byte file. So, make it return to
+ * original state during evict and commit it
+ * synchrnously to avoid any corruption on the next
+ * access to the cluster chain for the file.
+ */
+ err = fat_sync_inode(inode);
+ if (err) {
+ fat_msg(inode->i_sb, KERN_WARNING, "Failed to "
+ "update on disk inode for unused fallocated "
+ "blocks, inode could be corrupted. Please run "
+ "fsck");
+ }
+ }
}
invalidate_inode_buffers(inode);
clear_inode(inode);
@@ -1223,6 +1277,7 @@ static int fat_read_root(struct inode *inode)
& ~((loff_t)sbi->cluster_size - 1)) >> 9;
MSDOS_I(inode)->i_logstart = 0;
MSDOS_I(inode)->mmu_private = inode->i_size;
+ MSDOS_I(inode)->i_disksize = inode->i_size;
fat_save_attrs(inode, ATTR_DIR);
inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = 0;
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 37213d075f3c..3ebda928229c 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -178,64 +178,6 @@ const struct dentry_operations hfsplus_dentry_operations = {
.d_compare = hfsplus_compare_dentry,
};
-static struct dentry *hfsplus_file_lookup(struct inode *dir,
- struct dentry *dentry, unsigned int flags)
-{
- struct hfs_find_data fd;
- struct super_block *sb = dir->i_sb;
- struct inode *inode = NULL;
- struct hfsplus_inode_info *hip;
- int err;
-
- if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc"))
- goto out;
-
- inode = HFSPLUS_I(dir)->rsrc_inode;
- if (inode)
- goto out;
-
- inode = new_inode(sb);
- if (!inode)
- return ERR_PTR(-ENOMEM);
-
- hip = HFSPLUS_I(inode);
- inode->i_ino = dir->i_ino;
- INIT_LIST_HEAD(&hip->open_dir_list);
- mutex_init(&hip->extents_lock);
- hip->extent_state = 0;
- hip->flags = 0;
- hip->userflags = 0;
- set_bit(HFSPLUS_I_RSRC, &hip->flags);
-
- err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
- if (!err) {
- err = hfsplus_find_cat(sb, dir->i_ino, &fd);
- if (!err)
- err = hfsplus_cat_read_inode(inode, &fd);
- hfs_find_exit(&fd);
- }
- if (err) {
- iput(inode);
- return ERR_PTR(err);
- }
- hip->rsrc_inode = dir;
- HFSPLUS_I(dir)->rsrc_inode = inode;
- igrab(dir);
-
- /*
- * __mark_inode_dirty expects inodes to be hashed. Since we don't
- * want resource fork inodes in the regular inode space, we make them
- * appear hashed, but do not put on any lists. hlist_del()
- * will work fine and require no locking.
- */
- hlist_add_fake(&inode->i_hash);
-
- mark_inode_dirty(inode);
-out:
- d_add(dentry, inode);
- return NULL;
-}
-
static void hfsplus_get_perms(struct inode *inode,
struct hfsplus_perm *perms, int dir)
{
@@ -385,7 +327,6 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
}
static const struct inode_operations hfsplus_file_inode_operations = {
- .lookup = hfsplus_file_lookup,
.setattr = hfsplus_setattr,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 09b3ed455724..2b91675ffcab 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -456,12 +456,14 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r
The umask is only applied if there's no default ACL */
ret = jffs2_init_acl_pre(dir_i, inode, &mode);
if (ret) {
- make_bad_inode(inode);
- iput(inode);
- return ERR_PTR(ret);
+ mutex_unlock(&f->sem);
+ make_bad_inode(inode);
+ iput(inode);
+ return ERR_PTR(ret);
}
ret = jffs2_do_new_inode (c, f, mode, ri);
if (ret) {
+ mutex_unlock(&f->sem);
make_bad_inode(inode);
iput(inode);
return ERR_PTR(ret);
@@ -478,6 +480,7 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r
inode->i_size = 0;
if (insert_inode_locked(inode) < 0) {
+ mutex_unlock(&f->sem);
make_bad_inode(inode);
iput(inode);
return ERR_PTR(-EINVAL);
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index 975a1f562c10..9a5449bc3afb 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -564,25 +564,10 @@ struct jffs2_node_frag *jffs2_lookup_node_frag(struct rb_root *fragtree, uint32_
they're killed. */
void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c)
{
- struct jffs2_node_frag *frag;
- struct jffs2_node_frag *parent;
-
- if (!root->rb_node)
- return;
+ struct jffs2_node_frag *frag, *next;
dbg_fragtree("killing\n");
-
- frag = (rb_entry(root->rb_node, struct jffs2_node_frag, rb));
- while(frag) {
- if (frag->rb.rb_left) {
- frag = frag_left(frag);
- continue;
- }
- if (frag->rb.rb_right) {
- frag = frag_right(frag);
- continue;
- }
-
+ rbtree_postorder_for_each_entry_safe(frag, next, root, rb) {
if (frag->node && !(--frag->node->frags)) {
/* Not a hole, and it's the final remaining frag
of this node. Free the node */
@@ -591,17 +576,8 @@ void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c)
jffs2_free_full_dnode(frag->node);
}
- parent = frag_parent(frag);
- if (parent) {
- if (frag_left(parent) == frag)
- parent->rb.rb_left = NULL;
- else
- parent->rb.rb_right = NULL;
- }
jffs2_free_node_frag(frag);
- frag = parent;
-
cond_resched();
}
}
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index ae81b01e6fd7..386303dca382 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -543,33 +543,13 @@ static int jffs2_build_inode_fragtree(struct jffs2_sb_info *c,
static void jffs2_free_tmp_dnode_info_list(struct rb_root *list)
{
- struct rb_node *this;
- struct jffs2_tmp_dnode_info *tn;
-
- this = list->rb_node;
+ struct jffs2_tmp_dnode_info *tn, *next;
- /* Now at bottom of tree */
- while (this) {
- if (this->rb_left)
- this = this->rb_left;
- else if (this->rb_right)
- this = this->rb_right;
- else {
- tn = rb_entry(this, struct jffs2_tmp_dnode_info, rb);
+ rbtree_postorder_for_each_entry_safe(tn, next, list, rb) {
jffs2_free_full_dnode(tn->fn);
jffs2_free_tmp_dnode_info(tn);
-
- this = rb_parent(this);
- if (!this)
- break;
-
- if (this->rb_left == &tn->rb)
- this->rb_left = NULL;
- else if (this->rb_right == &tn->rb)
- this->rb_right = NULL;
- else BUG();
- }
}
+
*list = RB_ROOT;
}
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index d448a777166b..7f9b096d8d57 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -62,7 +62,8 @@ static struct page *get_mapping_page(struct super_block *sb, pgoff_t index,
page = read_cache_page(mapping, index, filler, sb);
else {
page = find_or_create_page(mapping, index, GFP_NOFS);
- unlock_page(page);
+ if (page)
+ unlock_page(page);
}
return page;
}
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index b44bdb291b84..2b34021948e4 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -37,7 +37,26 @@
#include "sufile.h"
#include "dat.h"
-
+/**
+ * nilfs_ioctl_wrap_copy - wrapping function of get/set metadata info
+ * @nilfs: nilfs object
+ * @argv: vector of arguments from userspace
+ * @dir: set of direction flags
+ * @dofunc: concrete function of get/set metadata info
+ *
+ * Description: nilfs_ioctl_wrap_copy() gets/sets metadata info by means of
+ * calling dofunc() function on the basis of @argv argument.
+ *
+ * Return Value: On success, 0 is returned and requested metadata info
+ * is copied into userspace. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EINVAL - Invalid arguments from userspace.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-EFAULT - Failure during execution of requested operation.
+ */
static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
struct nilfs_argv *argv, int dir,
ssize_t (*dofunc)(struct the_nilfs *,
@@ -57,6 +76,14 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
if (argv->v_size > PAGE_SIZE)
return -EINVAL;
+ /*
+ * Reject pairs of a start item position (argv->v_index) and a
+ * total count (argv->v_nmembs) which leads position 'pos' to
+ * overflow by the increment at the end of the loop.
+ */
+ if (argv->v_index > ~(__u64)0 - argv->v_nmembs)
+ return -EINVAL;
+
buf = (void *)__get_free_pages(GFP_NOFS, 0);
if (unlikely(!buf))
return -ENOMEM;
@@ -99,6 +126,9 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
return ret;
}
+/**
+ * nilfs_ioctl_getflags - ioctl to support lsattr
+ */
static int nilfs_ioctl_getflags(struct inode *inode, void __user *argp)
{
unsigned int flags = NILFS_I(inode)->i_flags & FS_FL_USER_VISIBLE;
@@ -106,6 +136,9 @@ static int nilfs_ioctl_getflags(struct inode *inode, void __user *argp)
return put_user(flags, (int __user *)argp);
}
+/**
+ * nilfs_ioctl_setflags - ioctl to support chattr
+ */
static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp,
void __user *argp)
{
@@ -158,11 +191,33 @@ out:
return ret;
}
+/**
+ * nilfs_ioctl_getversion - get info about a file's version (generation number)
+ */
static int nilfs_ioctl_getversion(struct inode *inode, void __user *argp)
{
return put_user(inode->i_generation, (int __user *)argp);
}
+/**
+ * nilfs_ioctl_change_cpmode - change checkpoint mode (checkpoint/snapshot)
+ * @inode: inode object
+ * @filp: file object
+ * @cmd: ioctl's request code
+ * @argp: pointer on argument from userspace
+ *
+ * Description: nilfs_ioctl_change_cpmode() function changes mode of
+ * given checkpoint between checkpoint and snapshot state. This ioctl
+ * is used in chcp and mkcp utilities.
+ *
+ * Return Value: On success, 0 is returned and mode of a checkpoint is
+ * changed. On error, one of the following negative error codes
+ * is returned.
+ *
+ * %-EPERM - Operation not permitted.
+ *
+ * %-EFAULT - Failure during checkpoint mode changing.
+ */
static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
unsigned int cmd, void __user *argp)
{
@@ -198,6 +253,25 @@ out:
return ret;
}
+/**
+ * nilfs_ioctl_delete_checkpoint - remove checkpoint
+ * @inode: inode object
+ * @filp: file object
+ * @cmd: ioctl's request code
+ * @argp: pointer on argument from userspace
+ *
+ * Description: nilfs_ioctl_delete_checkpoint() function removes
+ * checkpoint from NILFS2 file system. This ioctl is used in rmcp
+ * utility.
+ *
+ * Return Value: On success, 0 is returned and a checkpoint is
+ * removed. On error, one of the following negative error codes
+ * is returned.
+ *
+ * %-EPERM - Operation not permitted.
+ *
+ * %-EFAULT - Failure during checkpoint removing.
+ */
static int
nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp,
unsigned int cmd, void __user *argp)
@@ -229,6 +303,21 @@ out:
return ret;
}
+/**
+ * nilfs_ioctl_do_get_cpinfo - callback method getting info about checkpoints
+ * @nilfs: nilfs object
+ * @posp: pointer on array of checkpoint's numbers
+ * @flags: checkpoint mode (checkpoint or snapshot)
+ * @buf: buffer for storing checkponts' info
+ * @size: size in bytes of one checkpoint info item in array
+ * @nmembs: number of checkpoints in array (numbers and infos)
+ *
+ * Description: nilfs_ioctl_do_get_cpinfo() function returns info about
+ * requested checkpoints. The NILFS_IOCTL_GET_CPINFO ioctl is used in
+ * lscp utility and by nilfs_cleanerd daemon.
+ *
+ * Return value: count of nilfs_cpinfo structures in output buffer.
+ */
static ssize_t
nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
void *buf, size_t size, size_t nmembs)
@@ -242,6 +331,27 @@ nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
return ret;
}
+/**
+ * nilfs_ioctl_get_cpstat - get checkpoints statistics
+ * @inode: inode object
+ * @filp: file object
+ * @cmd: ioctl's request code
+ * @argp: pointer on argument from userspace
+ *
+ * Description: nilfs_ioctl_get_cpstat() returns information about checkpoints.
+ * The NILFS_IOCTL_GET_CPSTAT ioctl is used by lscp, rmcp utilities
+ * and by nilfs_cleanerd daemon.
+ *
+ * Return Value: On success, 0 is returned, and checkpoints information is
+ * copied into userspace pointer @argp. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-EFAULT - Failure during getting checkpoints statistics.
+ */
static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp,
unsigned int cmd, void __user *argp)
{
@@ -260,6 +370,21 @@ static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp,
return ret;
}
+/**
+ * nilfs_ioctl_do_get_suinfo - callback method getting segment usage info
+ * @nilfs: nilfs object
+ * @posp: pointer on array of segment numbers
+ * @flags: *not used*
+ * @buf: buffer for storing suinfo array
+ * @size: size in bytes of one suinfo item in array
+ * @nmembs: count of segment numbers and suinfos in array
+ *
+ * Description: nilfs_ioctl_do_get_suinfo() function returns segment usage
+ * info about requested segments. The NILFS_IOCTL_GET_SUINFO ioctl is used
+ * in lssu, nilfs_resize utilities and by nilfs_cleanerd daemon.
+ *
+ * Return value: count of nilfs_suinfo structures in output buffer.
+ */
static ssize_t
nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
void *buf, size_t size, size_t nmembs)
@@ -273,6 +398,27 @@ nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
return ret;
}
+/**
+ * nilfs_ioctl_get_sustat - get segment usage statistics
+ * @inode: inode object
+ * @filp: file object
+ * @cmd: ioctl's request code
+ * @argp: pointer on argument from userspace
+ *
+ * Description: nilfs_ioctl_get_sustat() returns segment usage statistics.
+ * The NILFS_IOCTL_GET_SUSTAT ioctl is used in lssu, nilfs_resize utilities
+ * and by nilfs_cleanerd daemon.
+ *
+ * Return Value: On success, 0 is returned, and segment usage information is
+ * copied into userspace pointer @argp. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-EFAULT - Failure during getting segment usage statistics.
+ */
static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp,
unsigned int cmd, void __user *argp)
{
@@ -291,6 +437,21 @@ static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp,
return ret;
}
+/**
+ * nilfs_ioctl_do_get_vinfo - callback method getting virtual blocks info
+ * @nilfs: nilfs object
+ * @posp: *not used*
+ * @flags: *not used*
+ * @buf: buffer for storing array of nilfs_vinfo structures
+ * @size: size in bytes of one vinfo item in array
+ * @nmembs: count of vinfos in array
+ *
+ * Description: nilfs_ioctl_do_get_vinfo() function returns information
+ * on virtual block addresses. The NILFS_IOCTL_GET_VINFO ioctl is used
+ * by nilfs_cleanerd daemon.
+ *
+ * Return value: count of nilfs_vinfo structures in output buffer.
+ */
static ssize_t
nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
void *buf, size_t size, size_t nmembs)
@@ -303,6 +464,21 @@ nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
return ret;
}
+/**
+ * nilfs_ioctl_do_get_bdescs - callback method getting disk block descriptors
+ * @nilfs: nilfs object
+ * @posp: *not used*
+ * @flags: *not used*
+ * @buf: buffer for storing array of nilfs_bdesc structures
+ * @size: size in bytes of one bdesc item in array
+ * @nmembs: count of bdescs in array
+ *
+ * Description: nilfs_ioctl_do_get_bdescs() function returns information
+ * about descriptors of disk block numbers. The NILFS_IOCTL_GET_BDESCS ioctl
+ * is used by nilfs_cleanerd daemon.
+ *
+ * Return value: count of nilfs_bdescs structures in output buffer.
+ */
static ssize_t
nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags,
void *buf, size_t size, size_t nmembs)
@@ -329,6 +505,29 @@ nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags,
return nmembs;
}
+/**
+ * nilfs_ioctl_get_bdescs - get disk block descriptors
+ * @inode: inode object
+ * @filp: file object
+ * @cmd: ioctl's request code
+ * @argp: pointer on argument from userspace
+ *
+ * Description: nilfs_ioctl_do_get_bdescs() function returns information
+ * about descriptors of disk block numbers. The NILFS_IOCTL_GET_BDESCS ioctl
+ * is used by nilfs_cleanerd daemon.
+ *
+ * Return Value: On success, 0 is returned, and disk block descriptors are
+ * copied into userspace pointer @argp. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EINVAL - Invalid arguments from userspace.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-EFAULT - Failure during getting disk block descriptors.
+ */
static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp,
unsigned int cmd, void __user *argp)
{
@@ -352,6 +551,26 @@ static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp,
return ret;
}
+/**
+ * nilfs_ioctl_move_inode_block - prepare data/node block for moving by GC
+ * @inode: inode object
+ * @vdesc: descriptor of virtual block number
+ * @buffers: list of moving buffers
+ *
+ * Description: nilfs_ioctl_move_inode_block() function registers data/node
+ * buffer in the GC pagecache and submit read request.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - Requested block doesn't exist.
+ *
+ * %-EEXIST - Blocks conflict is detected.
+ */
static int nilfs_ioctl_move_inode_block(struct inode *inode,
struct nilfs_vdesc *vdesc,
struct list_head *buffers)
@@ -397,6 +616,19 @@ static int nilfs_ioctl_move_inode_block(struct inode *inode,
return 0;
}
+/**
+ * nilfs_ioctl_move_blocks - move valid inode's blocks during garbage collection
+ * @sb: superblock object
+ * @argv: vector of arguments from userspace
+ * @buf: array of nilfs_vdesc structures
+ *
+ * Description: nilfs_ioctl_move_blocks() function reads valid data/node
+ * blocks that garbage collector specified with the array of nilfs_vdesc
+ * structures and stores them into page caches of GC inodes.
+ *
+ * Return Value: Number of processed nilfs_vdesc structures or
+ * error code, otherwise.
+ */
static int nilfs_ioctl_move_blocks(struct super_block *sb,
struct nilfs_argv *argv, void *buf)
{
@@ -462,6 +694,25 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb,
return ret;
}
+/**
+ * nilfs_ioctl_delete_checkpoints - delete checkpoints
+ * @nilfs: nilfs object
+ * @argv: vector of arguments from userspace
+ * @buf: array of periods of checkpoints numbers
+ *
+ * Description: nilfs_ioctl_delete_checkpoints() function deletes checkpoints
+ * in the period from p_start to p_end, excluding p_end itself. The checkpoints
+ * which have been already deleted are ignored.
+ *
+ * Return Value: Number of processed nilfs_period structures or
+ * error code, otherwise.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-EINVAL - invalid checkpoints.
+ */
static int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs,
struct nilfs_argv *argv, void *buf)
{
@@ -479,6 +730,24 @@ static int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs,
return nmembs;
}
+/**
+ * nilfs_ioctl_free_vblocknrs - free virtual block numbers
+ * @nilfs: nilfs object
+ * @argv: vector of arguments from userspace
+ * @buf: array of virtual block numbers
+ *
+ * Description: nilfs_ioctl_free_vblocknrs() function frees
+ * the virtual block numbers specified by @buf and @argv->v_nmembs.
+ *
+ * Return Value: Number of processed virtual block numbers or
+ * error code, otherwise.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - The virtual block number have not been allocated.
+ */
static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs,
struct nilfs_argv *argv, void *buf)
{
@@ -490,6 +759,24 @@ static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs,
return (ret < 0) ? ret : nmembs;
}
+/**
+ * nilfs_ioctl_mark_blocks_dirty - mark blocks dirty
+ * @nilfs: nilfs object
+ * @argv: vector of arguments from userspace
+ * @buf: array of block descriptors
+ *
+ * Description: nilfs_ioctl_mark_blocks_dirty() function marks
+ * metadata file or data blocks as dirty.
+ *
+ * Return Value: Number of processed block descriptors or
+ * error code, otherwise.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ *
+ * %-EIO - I/O error
+ *
+ * %-ENOENT - the specified block does not exist (hole block)
+ */
static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
struct nilfs_argv *argv, void *buf)
{
@@ -571,6 +858,20 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
return ret;
}
+/**
+ * nilfs_ioctl_clean_segments - clean segments
+ * @inode: inode object
+ * @filp: file object
+ * @cmd: ioctl's request code
+ * @argp: pointer on argument from userspace
+ *
+ * Description: nilfs_ioctl_clean_segments() function makes garbage
+ * collection operation in the environment of requested parameters
+ * from userspace. The NILFS_IOCTL_CLEAN_SEGMENTS ioctl is used by
+ * nilfs_cleanerd daemon.
+ *
+ * Return Value: On success, 0 is returned or error code, otherwise.
+ */
static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
unsigned int cmd, void __user *argp)
{
@@ -682,6 +983,33 @@ out:
return ret;
}
+/**
+ * nilfs_ioctl_sync - make a checkpoint
+ * @inode: inode object
+ * @filp: file object
+ * @cmd: ioctl's request code
+ * @argp: pointer on argument from userspace
+ *
+ * Description: nilfs_ioctl_sync() function constructs a logical segment
+ * for checkpointing. This function guarantees that all modified data
+ * and metadata are written out to the device when it successfully
+ * returned.
+ *
+ * Return Value: On success, 0 is retured. On errors, one of the following
+ * negative error code is returned.
+ *
+ * %-EROFS - Read only filesystem.
+ *
+ * %-EIO - I/O error
+ *
+ * %-ENOSPC - No space left on device (only in a panic state).
+ *
+ * %-ERESTARTSYS - Interrupted.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ *
+ * %-EFAULT - Failure during execution of requested operation.
+ */
static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
unsigned int cmd, void __user *argp)
{
@@ -710,6 +1038,14 @@ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
return 0;
}
+/**
+ * nilfs_ioctl_resize - resize NILFS2 volume
+ * @inode: inode object
+ * @filp: file object
+ * @argp: pointer on argument from userspace
+ *
+ * Return Value: On success, 0 is returned or error code, otherwise.
+ */
static int nilfs_ioctl_resize(struct inode *inode, struct file *filp,
void __user *argp)
{
@@ -735,6 +1071,17 @@ out:
return ret;
}
+/**
+ * nilfs_ioctl_set_alloc_range - limit range of segments to be allocated
+ * @inode: inode object
+ * @argp: pointer on argument from userspace
+ *
+ * Decription: nilfs_ioctl_set_alloc_range() function defines lower limit
+ * of segments in bytes and upper limit of segments in bytes.
+ * The NILFS_IOCTL_SET_ALLOC_RANGE is used by nilfs_resize utility.
+ *
+ * Return Value: On success, 0 is returned or error code, otherwise.
+ */
static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp)
{
struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
@@ -767,6 +1114,28 @@ out:
return ret;
}
+/**
+ * nilfs_ioctl_get_info - wrapping function of get metadata info
+ * @inode: inode object
+ * @filp: file object
+ * @cmd: ioctl's request code
+ * @argp: pointer on argument from userspace
+ * @membsz: size of an item in bytes
+ * @dofunc: concrete function of getting metadata info
+ *
+ * Description: nilfs_ioctl_get_info() gets metadata info by means of
+ * calling dofunc() function.
+ *
+ * Return Value: On success, 0 is returned and requested metadata info
+ * is copied into userspace. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EINVAL - Invalid arguments from userspace.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-EFAULT - Failure during execution of requested operation.
+ */
static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
unsigned int cmd, void __user *argp,
size_t membsz,
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 1fedd5f7ccc4..0b9ff4395e6a 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -82,20 +82,23 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark)
* events.
*/
static int dnotify_handle_event(struct fsnotify_group *group,
+ struct inode *inode,
struct fsnotify_mark *inode_mark,
struct fsnotify_mark *vfsmount_mark,
- struct fsnotify_event *event)
+ u32 mask, void *data, int data_type,
+ const unsigned char *file_name)
{
struct dnotify_mark *dn_mark;
- struct inode *to_tell;
struct dnotify_struct *dn;
struct dnotify_struct **prev;
struct fown_struct *fown;
- __u32 test_mask = event->mask & ~FS_EVENT_ON_CHILD;
+ __u32 test_mask = mask & ~FS_EVENT_ON_CHILD;
- BUG_ON(vfsmount_mark);
+ /* not a dir, dnotify doesn't care */
+ if (!S_ISDIR(inode->i_mode))
+ return 0;
- to_tell = event->to_tell;
+ BUG_ON(vfsmount_mark);
dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark);
@@ -122,23 +125,6 @@ static int dnotify_handle_event(struct fsnotify_group *group,
return 0;
}
-/*
- * Given an inode and mask determine if dnotify would be interested in sending
- * userspace notification for that pair.
- */
-static bool dnotify_should_send_event(struct fsnotify_group *group,
- struct inode *inode,
- struct fsnotify_mark *inode_mark,
- struct fsnotify_mark *vfsmount_mark,
- __u32 mask, void *data, int data_type)
-{
- /* not a dir, dnotify doesn't care */
- if (!S_ISDIR(inode->i_mode))
- return false;
-
- return true;
-}
-
static void dnotify_free_mark(struct fsnotify_mark *fsn_mark)
{
struct dnotify_mark *dn_mark = container_of(fsn_mark,
@@ -152,10 +138,6 @@ static void dnotify_free_mark(struct fsnotify_mark *fsn_mark)
static struct fsnotify_ops dnotify_fsnotify_ops = {
.handle_event = dnotify_handle_event,
- .should_send_event = dnotify_should_send_event,
- .free_group_priv = NULL,
- .freeing_mark = NULL,
- .free_event_priv = NULL,
};
/*
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 0c2f9122b262..58772623f02a 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -9,31 +9,27 @@
#include <linux/types.h>
#include <linux/wait.h>
-static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new)
+#include "fanotify.h"
+
+static bool should_merge(struct fsnotify_event *old_fsn,
+ struct fsnotify_event *new_fsn)
{
- pr_debug("%s: old=%p new=%p\n", __func__, old, new);
+ struct fanotify_event_info *old, *new;
- if (old->to_tell == new->to_tell &&
- old->data_type == new->data_type &&
- old->tgid == new->tgid) {
- switch (old->data_type) {
- case (FSNOTIFY_EVENT_PATH):
#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
- /* dont merge two permission events */
- if ((old->mask & FAN_ALL_PERM_EVENTS) &&
- (new->mask & FAN_ALL_PERM_EVENTS))
- return false;
+ /* dont merge two permission events */
+ if ((old_fsn->mask & FAN_ALL_PERM_EVENTS) &&
+ (new_fsn->mask & FAN_ALL_PERM_EVENTS))
+ return false;
#endif
- if ((old->path.mnt == new->path.mnt) &&
- (old->path.dentry == new->path.dentry))
- return true;
- break;
- case (FSNOTIFY_EVENT_NONE):
- return true;
- default:
- BUG();
- };
- }
+ pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn);
+ old = FANOTIFY_E(old_fsn);
+ new = FANOTIFY_E(new_fsn);
+
+ if (old_fsn->inode == new_fsn->inode && old->tgid == new->tgid &&
+ old->path.mnt == new->path.mnt &&
+ old->path.dentry == new->path.dentry)
+ return true;
return false;
}
@@ -41,59 +37,28 @@ static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new)
static struct fsnotify_event *fanotify_merge(struct list_head *list,
struct fsnotify_event *event)
{
- struct fsnotify_event_holder *test_holder;
- struct fsnotify_event *test_event = NULL;
- struct fsnotify_event *new_event;
+ struct fsnotify_event *test_event;
+ bool do_merge = false;
pr_debug("%s: list=%p event=%p\n", __func__, list, event);
-
- list_for_each_entry_reverse(test_holder, list, event_list) {
- if (should_merge(test_holder->event, event)) {
- test_event = test_holder->event;
+ list_for_each_entry_reverse(test_event, list, list) {
+ if (should_merge(test_event, event)) {
+ do_merge = true;
break;
}
}
- if (!test_event)
+ if (!do_merge)
return NULL;
- fsnotify_get_event(test_event);
-
- /* if they are exactly the same we are done */
- if (test_event->mask == event->mask)
- return test_event;
-
- /*
- * if the refcnt == 2 this is the only queue
- * for this event and so we can update the mask
- * in place.
- */
- if (atomic_read(&test_event->refcnt) == 2) {
- test_event->mask |= event->mask;
- return test_event;
- }
-
- new_event = fsnotify_clone_event(test_event);
-
- /* done with test_event */
- fsnotify_put_event(test_event);
-
- /* couldn't allocate memory, merge was not possible */
- if (unlikely(!new_event))
- return ERR_PTR(-ENOMEM);
-
- /* build new event and replace it on the list */
- new_event->mask = (test_event->mask | event->mask);
- fsnotify_replace_event(test_holder, new_event);
-
- /* we hold a reference on new_event from clone_event */
- return new_event;
+ test_event->mask |= event->mask;
+ return test_event;
}
#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
static int fanotify_get_response_from_access(struct fsnotify_group *group,
- struct fsnotify_event *event)
+ struct fanotify_event_info *event)
{
int ret;
@@ -106,7 +71,6 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group,
return 0;
/* userspace responded, convert to something usable */
- spin_lock(&event->lock);
switch (event->response) {
case FAN_ALLOW:
ret = 0;
@@ -116,7 +80,6 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group,
ret = -EPERM;
}
event->response = 0;
- spin_unlock(&event->lock);
pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__,
group, event, ret);
@@ -125,58 +88,17 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group,
}
#endif
-static int fanotify_handle_event(struct fsnotify_group *group,
- struct fsnotify_mark *inode_mark,
- struct fsnotify_mark *fanotify_mark,
- struct fsnotify_event *event)
-{
- int ret = 0;
- struct fsnotify_event *notify_event = NULL;
-
- BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
- BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
- BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
- BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE);
- BUILD_BUG_ON(FAN_OPEN != FS_OPEN);
- BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD);
- BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
- BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
- BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM);
- BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR);
-
- pr_debug("%s: group=%p event=%p\n", __func__, group, event);
-
- notify_event = fsnotify_add_notify_event(group, event, NULL, fanotify_merge);
- if (IS_ERR(notify_event))
- return PTR_ERR(notify_event);
-
-#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
- if (event->mask & FAN_ALL_PERM_EVENTS) {
- /* if we merged we need to wait on the new event */
- if (notify_event)
- event = notify_event;
- ret = fanotify_get_response_from_access(group, event);
- }
-#endif
-
- if (notify_event)
- fsnotify_put_event(notify_event);
-
- return ret;
-}
-
-static bool fanotify_should_send_event(struct fsnotify_group *group,
- struct inode *to_tell,
- struct fsnotify_mark *inode_mark,
+static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark,
struct fsnotify_mark *vfsmnt_mark,
- __u32 event_mask, void *data, int data_type)
+ u32 event_mask,
+ void *data, int data_type)
{
__u32 marks_mask, marks_ignored_mask;
struct path *path = data;
- pr_debug("%s: group=%p to_tell=%p inode_mark=%p vfsmnt_mark=%p "
- "mask=%x data=%p data_type=%d\n", __func__, group, to_tell,
- inode_mark, vfsmnt_mark, event_mask, data, data_type);
+ pr_debug("%s: inode_mark=%p vfsmnt_mark=%p mask=%x data=%p"
+ " data_type=%d\n", __func__, inode_mark, vfsmnt_mark,
+ event_mask, data, data_type);
/* if we don't have enough info to send an event to userspace say no */
if (data_type != FSNOTIFY_EVENT_PATH)
@@ -217,6 +139,74 @@ static bool fanotify_should_send_event(struct fsnotify_group *group,
return false;
}
+static int fanotify_handle_event(struct fsnotify_group *group,
+ struct inode *inode,
+ struct fsnotify_mark *inode_mark,
+ struct fsnotify_mark *fanotify_mark,
+ u32 mask, void *data, int data_type,
+ const unsigned char *file_name)
+{
+ int ret = 0;
+ struct fanotify_event_info *event;
+ struct fsnotify_event *fsn_event;
+ struct fsnotify_event *notify_fsn_event;
+
+ BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
+ BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
+ BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
+ BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE);
+ BUILD_BUG_ON(FAN_OPEN != FS_OPEN);
+ BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD);
+ BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
+ BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
+ BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM);
+ BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR);
+
+ if (!fanotify_should_send_event(inode_mark, fanotify_mark, mask, data,
+ data_type))
+ return 0;
+
+ pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode,
+ mask);
+
+ event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL);
+ if (unlikely(!event))
+ return -ENOMEM;
+
+ fsn_event = &event->fse;
+ fsnotify_init_event(fsn_event, inode, mask);
+ event->tgid = get_pid(task_tgid(current));
+ if (data_type == FSNOTIFY_EVENT_PATH) {
+ struct path *path = data;
+ event->path = *path;
+ path_get(&event->path);
+ } else {
+ event->path.mnt = NULL;
+ event->path.dentry = NULL;
+ }
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+ event->response = 0;
+#endif
+
+ notify_fsn_event = fsnotify_add_notify_event(group, fsn_event,
+ fanotify_merge);
+ if (notify_fsn_event) {
+ /* Our event wasn't used in the end. Free it. */
+ fsnotify_destroy_event(group, fsn_event);
+ if (IS_ERR(notify_fsn_event))
+ return PTR_ERR(notify_fsn_event);
+ /* We need to ask about a different events after a merge... */
+ event = FANOTIFY_E(notify_fsn_event);
+ fsn_event = notify_fsn_event;
+ }
+
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+ if (fsn_event->mask & FAN_ALL_PERM_EVENTS)
+ ret = fanotify_get_response_from_access(group, event);
+#endif
+ return ret;
+}
+
static void fanotify_free_group_priv(struct fsnotify_group *group)
{
struct user_struct *user;
@@ -226,10 +216,18 @@ static void fanotify_free_group_priv(struct fsnotify_group *group)
free_uid(user);
}
+static void fanotify_free_event(struct fsnotify_event *fsn_event)
+{
+ struct fanotify_event_info *event;
+
+ event = FANOTIFY_E(fsn_event);
+ path_put(&event->path);
+ put_pid(event->tgid);
+ kmem_cache_free(fanotify_event_cachep, event);
+}
+
const struct fsnotify_ops fanotify_fsnotify_ops = {
.handle_event = fanotify_handle_event,
- .should_send_event = fanotify_should_send_event,
.free_group_priv = fanotify_free_group_priv,
- .free_event_priv = NULL,
- .freeing_mark = NULL,
+ .free_event = fanotify_free_event,
};
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
new file mode 100644
index 000000000000..0e90174a116a
--- /dev/null
+++ b/fs/notify/fanotify/fanotify.h
@@ -0,0 +1,23 @@
+#include <linux/fsnotify_backend.h>
+#include <linux/path.h>
+#include <linux/slab.h>
+
+extern struct kmem_cache *fanotify_event_cachep;
+
+struct fanotify_event_info {
+ struct fsnotify_event fse;
+ /*
+ * We hold ref to this path so it may be dereferenced at any point
+ * during this object's lifetime
+ */
+ struct path path;
+ struct pid *tgid;
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+ u32 response; /* userspace answer to question */
+#endif
+};
+
+static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse)
+{
+ return container_of(fse, struct fanotify_event_info, fse);
+}
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index e44cb6427df3..57d7c083cb4b 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -19,6 +19,7 @@
#include "../../mount.h"
#include "../fdinfo.h"
+#include "fanotify.h"
#define FANOTIFY_DEFAULT_MAX_EVENTS 16384
#define FANOTIFY_DEFAULT_MAX_MARKS 8192
@@ -28,11 +29,12 @@ extern const struct fsnotify_ops fanotify_fsnotify_ops;
static struct kmem_cache *fanotify_mark_cache __read_mostly;
static struct kmem_cache *fanotify_response_event_cache __read_mostly;
+struct kmem_cache *fanotify_event_cachep __read_mostly;
struct fanotify_response_event {
struct list_head list;
__s32 fd;
- struct fsnotify_event *event;
+ struct fanotify_event_info *event;
};
/*
@@ -61,8 +63,8 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
}
static int create_fd(struct fsnotify_group *group,
- struct fsnotify_event *event,
- struct file **file)
+ struct fanotify_event_info *event,
+ struct file **file)
{
int client_fd;
struct file *new_file;
@@ -73,12 +75,6 @@ static int create_fd(struct fsnotify_group *group,
if (client_fd < 0)
return client_fd;
- if (event->data_type != FSNOTIFY_EVENT_PATH) {
- WARN_ON(1);
- put_unused_fd(client_fd);
- return -EINVAL;
- }
-
/*
* we need a new file handle for the userspace program so it can read even if it was
* originally opened O_WRONLY.
@@ -109,23 +105,25 @@ static int create_fd(struct fsnotify_group *group,
}
static int fill_event_metadata(struct fsnotify_group *group,
- struct fanotify_event_metadata *metadata,
- struct fsnotify_event *event,
- struct file **file)
+ struct fanotify_event_metadata *metadata,
+ struct fsnotify_event *fsn_event,
+ struct file **file)
{
int ret = 0;
+ struct fanotify_event_info *event;
pr_debug("%s: group=%p metadata=%p event=%p\n", __func__,
- group, metadata, event);
+ group, metadata, fsn_event);
*file = NULL;
+ event = container_of(fsn_event, struct fanotify_event_info, fse);
metadata->event_len = FAN_EVENT_METADATA_LEN;
metadata->metadata_len = FAN_EVENT_METADATA_LEN;
metadata->vers = FANOTIFY_METADATA_VERSION;
metadata->reserved = 0;
- metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS;
+ metadata->mask = fsn_event->mask & FAN_ALL_OUTGOING_EVENTS;
metadata->pid = pid_vnr(event->tgid);
- if (unlikely(event->mask & FAN_Q_OVERFLOW))
+ if (unlikely(fsn_event->mask & FAN_Q_OVERFLOW))
metadata->fd = FAN_NOFD;
else {
metadata->fd = create_fd(group, event, file);
@@ -209,7 +207,7 @@ static int prepare_for_access_response(struct fsnotify_group *group,
if (!re)
return -ENOMEM;
- re->event = event;
+ re->event = FANOTIFY_E(event);
re->fd = fd;
mutex_lock(&group->fanotify_data.access_mutex);
@@ -217,7 +215,7 @@ static int prepare_for_access_response(struct fsnotify_group *group,
if (atomic_read(&group->fanotify_data.bypass_perm)) {
mutex_unlock(&group->fanotify_data.access_mutex);
kmem_cache_free(fanotify_response_event_cache, re);
- event->response = FAN_ALLOW;
+ FANOTIFY_E(event)->response = FAN_ALLOW;
return 0;
}
@@ -273,7 +271,7 @@ out_close_fd:
out:
#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
if (event->mask & FAN_ALL_PERM_EVENTS) {
- event->response = FAN_DENY;
+ FANOTIFY_E(event)->response = FAN_DENY;
wake_up(&group->fanotify_data.access_waitq);
}
#endif
@@ -321,7 +319,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
if (IS_ERR(kevent))
break;
ret = copy_event_to_user(group, kevent, buf);
- fsnotify_put_event(kevent);
+ fsnotify_destroy_event(group, kevent);
if (ret < 0)
break;
buf += ret;
@@ -409,7 +407,7 @@ static int fanotify_release(struct inode *ignored, struct file *file)
static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct fsnotify_group *group;
- struct fsnotify_event_holder *holder;
+ struct fsnotify_event *fsn_event;
void __user *p;
int ret = -ENOTTY;
size_t send_len = 0;
@@ -421,7 +419,7 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar
switch (cmd) {
case FIONREAD:
mutex_lock(&group->notification_mutex);
- list_for_each_entry(holder, &group->notification_list, event_list)
+ list_for_each_entry(fsn_event, &group->notification_list, list)
send_len += FAN_EVENT_METADATA_LEN;
mutex_unlock(&group->notification_mutex);
ret = put_user(send_len, (int __user *) p);
@@ -906,6 +904,7 @@ static int __init fanotify_user_setup(void)
fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC);
fanotify_response_event_cache = KMEM_CACHE(fanotify_response_event,
SLAB_PANIC);
+ fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC);
return 0;
}
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 4bb21d67d9b1..1d4e1ea2f37c 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -128,8 +128,7 @@ static int send_to_group(struct inode *to_tell,
struct fsnotify_mark *vfsmount_mark,
__u32 mask, void *data,
int data_is, u32 cookie,
- const unsigned char *file_name,
- struct fsnotify_event **event)
+ const unsigned char *file_name)
{
struct fsnotify_group *group = NULL;
__u32 inode_test_mask = 0;
@@ -170,27 +169,17 @@ static int send_to_group(struct inode *to_tell,
pr_debug("%s: group=%p to_tell=%p mask=%x inode_mark=%p"
" inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x"
- " data=%p data_is=%d cookie=%d event=%p\n",
+ " data=%p data_is=%d cookie=%d\n",
__func__, group, to_tell, mask, inode_mark,
inode_test_mask, vfsmount_mark, vfsmount_test_mask, data,
- data_is, cookie, *event);
+ data_is, cookie);
if (!inode_test_mask && !vfsmount_test_mask)
return 0;
- if (group->ops->should_send_event(group, to_tell, inode_mark,
- vfsmount_mark, mask, data,
- data_is) == false)
- return 0;
-
- if (!*event) {
- *event = fsnotify_create_event(to_tell, mask, data,
- data_is, file_name,
- cookie, GFP_KERNEL);
- if (!*event)
- return -ENOMEM;
- }
- return group->ops->handle_event(group, inode_mark, vfsmount_mark, *event);
+ return group->ops->handle_event(group, to_tell, inode_mark,
+ vfsmount_mark, mask, data, data_is,
+ file_name);
}
/*
@@ -205,7 +194,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
struct hlist_node *inode_node = NULL, *vfsmount_node = NULL;
struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL;
struct fsnotify_group *inode_group, *vfsmount_group;
- struct fsnotify_event *event = NULL;
struct mount *mnt;
int idx, ret = 0;
/* global tests shouldn't care about events on child only the specific event */
@@ -258,18 +246,18 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
if (inode_group > vfsmount_group) {
/* handle inode */
- ret = send_to_group(to_tell, inode_mark, NULL, mask, data,
- data_is, cookie, file_name, &event);
+ ret = send_to_group(to_tell, inode_mark, NULL, mask,
+ data, data_is, cookie, file_name);
/* we didn't use the vfsmount_mark */
vfsmount_group = NULL;
} else if (vfsmount_group > inode_group) {
- ret = send_to_group(to_tell, NULL, vfsmount_mark, mask, data,
- data_is, cookie, file_name, &event);
+ ret = send_to_group(to_tell, NULL, vfsmount_mark, mask,
+ data, data_is, cookie, file_name);
inode_group = NULL;
} else {
ret = send_to_group(to_tell, inode_mark, vfsmount_mark,
- mask, data, data_is, cookie, file_name,
- &event);
+ mask, data, data_is, cookie,
+ file_name);
}
if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS))
@@ -285,12 +273,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
ret = 0;
out:
srcu_read_unlock(&fsnotify_mark_srcu, idx);
- /*
- * fsnotify_create_event() took a reference so the event can't be cleaned
- * up while we are still trying to add it to lists, drop that one.
- */
- if (event)
- fsnotify_put_event(event);
return ret;
}
diff --git a/fs/notify/group.c b/fs/notify/group.c
index bd2625bd88b4..ee674fe2cec7 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -99,6 +99,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
INIT_LIST_HEAD(&group->marks_list);
group->ops = ops;
+ fsnotify_init_event(&group->overflow_event, NULL, FS_Q_OVERFLOW);
return group;
}
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index b6642e4de4bf..485eef3f4407 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -2,11 +2,12 @@
#include <linux/inotify.h>
#include <linux/slab.h> /* struct kmem_cache */
-extern struct kmem_cache *event_priv_cachep;
-
-struct inotify_event_private_data {
- struct fsnotify_event_private_data fsnotify_event_priv_data;
+struct inotify_event_info {
+ struct fsnotify_event fse;
int wd;
+ u32 sync_cookie;
+ int name_len;
+ char name[];
};
struct inotify_inode_mark {
@@ -14,8 +15,18 @@ struct inotify_inode_mark {
int wd;
};
+static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse)
+{
+ return container_of(fse, struct inotify_event_info, fse);
+}
+
extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
struct fsnotify_group *group);
-extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv);
+extern int inotify_handle_event(struct fsnotify_group *group,
+ struct inode *inode,
+ struct fsnotify_mark *inode_mark,
+ struct fsnotify_mark *vfsmount_mark,
+ u32 mask, void *data, int data_type,
+ const unsigned char *file_name);
extern const struct fsnotify_ops inotify_fsnotify_ops;
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 4216308b81b4..aad1a35e9af1 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -34,100 +34,87 @@
#include "inotify.h"
/*
- * Check if 2 events contain the same information. We do not compare private data
- * but at this moment that isn't a problem for any know fsnotify listeners.
+ * Check if 2 events contain the same information.
*/
-static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new)
+static bool event_compare(struct fsnotify_event *old_fsn,
+ struct fsnotify_event *new_fsn)
{
- if ((old->mask == new->mask) &&
- (old->to_tell == new->to_tell) &&
- (old->data_type == new->data_type) &&
- (old->name_len == new->name_len)) {
- switch (old->data_type) {
- case (FSNOTIFY_EVENT_INODE):
- /* remember, after old was put on the wait_q we aren't
- * allowed to look at the inode any more, only thing
- * left to check was if the file_name is the same */
- if (!old->name_len ||
- !strcmp(old->file_name, new->file_name))
- return true;
- break;
- case (FSNOTIFY_EVENT_PATH):
- if ((old->path.mnt == new->path.mnt) &&
- (old->path.dentry == new->path.dentry))
- return true;
- break;
- case (FSNOTIFY_EVENT_NONE):
- if (old->mask & FS_Q_OVERFLOW)
- return true;
- else if (old->mask & FS_IN_IGNORED)
- return false;
- return true;
- };
- }
+ struct inotify_event_info *old, *new;
+
+ if (old_fsn->mask & FS_IN_IGNORED)
+ return false;
+ old = INOTIFY_E(old_fsn);
+ new = INOTIFY_E(new_fsn);
+ if ((old_fsn->mask == new_fsn->mask) &&
+ (old_fsn->inode == new_fsn->inode) &&
+ (old->name_len == new->name_len) &&
+ (!old->name_len || !strcmp(old->name, new->name)))
+ return true;
return false;
}
static struct fsnotify_event *inotify_merge(struct list_head *list,
struct fsnotify_event *event)
{
- struct fsnotify_event_holder *last_holder;
struct fsnotify_event *last_event;
- /* and the list better be locked by something too */
- spin_lock(&event->lock);
-
- last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list);
- last_event = last_holder->event;
- if (event_compare(last_event, event))
- fsnotify_get_event(last_event);
- else
- last_event = NULL;
-
- spin_unlock(&event->lock);
-
+ last_event = list_entry(list->prev, struct fsnotify_event, list);
+ if (!event_compare(last_event, event))
+ return NULL;
return last_event;
}
-static int inotify_handle_event(struct fsnotify_group *group,
- struct fsnotify_mark *inode_mark,
- struct fsnotify_mark *vfsmount_mark,
- struct fsnotify_event *event)
+int inotify_handle_event(struct fsnotify_group *group,
+ struct inode *inode,
+ struct fsnotify_mark *inode_mark,
+ struct fsnotify_mark *vfsmount_mark,
+ u32 mask, void *data, int data_type,
+ const unsigned char *file_name)
{
struct inotify_inode_mark *i_mark;
- struct inode *to_tell;
- struct inotify_event_private_data *event_priv;
- struct fsnotify_event_private_data *fsn_event_priv;
+ struct inotify_event_info *event;
struct fsnotify_event *added_event;
- int wd, ret = 0;
+ struct fsnotify_event *fsn_event;
+ int ret = 0;
+ int len = 0;
+ int alloc_len = sizeof(struct inotify_event_info);
BUG_ON(vfsmount_mark);
- pr_debug("%s: group=%p event=%p to_tell=%p mask=%x\n", __func__, group,
- event, event->to_tell, event->mask);
+ if ((inode_mark->mask & FS_EXCL_UNLINK) &&
+ (data_type == FSNOTIFY_EVENT_PATH)) {
+ struct path *path = data;
- to_tell = event->to_tell;
+ if (d_unlinked(path->dentry))
+ return 0;
+ }
+ if (file_name) {
+ len = strlen(file_name);
+ alloc_len += len + 1;
+ }
+
+ pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode,
+ mask);
i_mark = container_of(inode_mark, struct inotify_inode_mark,
fsn_mark);
- wd = i_mark->wd;
- event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL);
- if (unlikely(!event_priv))
+ event = kmalloc(alloc_len, GFP_KERNEL);
+ if (unlikely(!event))
return -ENOMEM;
- fsn_event_priv = &event_priv->fsnotify_event_priv_data;
-
- fsnotify_get_group(group);
- fsn_event_priv->group = group;
- event_priv->wd = wd;
+ fsn_event = &event->fse;
+ fsnotify_init_event(fsn_event, inode, mask);
+ event->wd = i_mark->wd;
+ event->name_len = len;
+ if (len)
+ strcpy(event->name, file_name);
- added_event = fsnotify_add_notify_event(group, event, fsn_event_priv, inotify_merge);
+ added_event = fsnotify_add_notify_event(group, fsn_event, inotify_merge);
if (added_event) {
- inotify_free_event_priv(fsn_event_priv);
- if (!IS_ERR(added_event))
- fsnotify_put_event(added_event);
- else
+ /* Our event wasn't used in the end. Free it. */
+ fsnotify_destroy_event(group, fsn_event);
+ if (IS_ERR(added_event))
ret = PTR_ERR(added_event);
}
@@ -142,22 +129,6 @@ static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify
inotify_ignored_and_remove_idr(fsn_mark, group);
}
-static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode,
- struct fsnotify_mark *inode_mark,
- struct fsnotify_mark *vfsmount_mark,
- __u32 mask, void *data, int data_type)
-{
- if ((inode_mark->mask & FS_EXCL_UNLINK) &&
- (data_type == FSNOTIFY_EVENT_PATH)) {
- struct path *path = data;
-
- if (d_unlinked(path->dentry))
- return false;
- }
-
- return true;
-}
-
/*
* This is NEVER supposed to be called. Inotify marks should either have been
* removed from the idr when the watch was removed or in the
@@ -202,22 +173,14 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
free_uid(group->inotify_data.user);
}
-void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv)
+static void inotify_free_event(struct fsnotify_event *fsn_event)
{
- struct inotify_event_private_data *event_priv;
-
-
- event_priv = container_of(fsn_event_priv, struct inotify_event_private_data,
- fsnotify_event_priv_data);
-
- fsnotify_put_group(fsn_event_priv->group);
- kmem_cache_free(event_priv_cachep, event_priv);
+ kfree(INOTIFY_E(fsn_event));
}
const struct fsnotify_ops inotify_fsnotify_ops = {
.handle_event = inotify_handle_event,
- .should_send_event = inotify_should_send_event,
.free_group_priv = inotify_free_group_priv,
- .free_event_priv = inotify_free_event_priv,
+ .free_event = inotify_free_event,
.freeing_mark = inotify_freeing_mark,
};
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 60f954a891ab..497395c8274b 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -50,7 +50,6 @@ static int inotify_max_queued_events __read_mostly;
static int inotify_max_user_watches __read_mostly;
static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
-struct kmem_cache *event_priv_cachep __read_mostly;
#ifdef CONFIG_SYSCTL
@@ -124,6 +123,16 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait)
return ret;
}
+static int round_event_name_len(struct fsnotify_event *fsn_event)
+{
+ struct inotify_event_info *event;
+
+ event = INOTIFY_E(fsn_event);
+ if (!event->name_len)
+ return 0;
+ return roundup(event->name_len + 1, sizeof(struct inotify_event));
+}
+
/*
* Get an inotify_kernel_event if one exists and is small
* enough to fit in "count". Return an error pointer if
@@ -144,9 +153,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
pr_debug("%s: group=%p event=%p\n", __func__, group, event);
- if (event->name_len)
- event_size += roundup(event->name_len + 1, event_size);
-
+ event_size += round_event_name_len(event);
if (event_size > count)
return ERR_PTR(-EINVAL);
@@ -164,40 +171,27 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
* buffer we had in "get_one_event()" above.
*/
static ssize_t copy_event_to_user(struct fsnotify_group *group,
- struct fsnotify_event *event,
+ struct fsnotify_event *fsn_event,
char __user *buf)
{
struct inotify_event inotify_event;
- struct fsnotify_event_private_data *fsn_priv;
- struct inotify_event_private_data *priv;
+ struct inotify_event_info *event;
size_t event_size = sizeof(struct inotify_event);
- size_t name_len = 0;
-
- pr_debug("%s: group=%p event=%p\n", __func__, group, event);
+ size_t name_len;
+ size_t pad_name_len;
- /* we get the inotify watch descriptor from the event private data */
- spin_lock(&event->lock);
- fsn_priv = fsnotify_remove_priv_from_event(group, event);
- spin_unlock(&event->lock);
-
- if (!fsn_priv)
- inotify_event.wd = -1;
- else {
- priv = container_of(fsn_priv, struct inotify_event_private_data,
- fsnotify_event_priv_data);
- inotify_event.wd = priv->wd;
- inotify_free_event_priv(fsn_priv);
- }
+ pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event);
+ event = INOTIFY_E(fsn_event);
+ name_len = event->name_len;
/*
- * round up event->name_len so it is a multiple of event_size
+ * round up name length so it is a multiple of event_size
* plus an extra byte for the terminating '\0'.
*/
- if (event->name_len)
- name_len = roundup(event->name_len + 1, event_size);
- inotify_event.len = name_len;
-
- inotify_event.mask = inotify_mask_to_arg(event->mask);
+ pad_name_len = round_event_name_len(fsn_event);
+ inotify_event.len = pad_name_len;
+ inotify_event.mask = inotify_mask_to_arg(fsn_event->mask);
+ inotify_event.wd = event->wd;
inotify_event.cookie = event->sync_cookie;
/* send the main event */
@@ -209,20 +203,18 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
/*
* fsnotify only stores the pathname, so here we have to send the pathname
* and then pad that pathname out to a multiple of sizeof(inotify_event)
- * with zeros. I get my zeros from the nul_inotify_event.
+ * with zeros.
*/
- if (name_len) {
- unsigned int len_to_zero = name_len - event->name_len;
+ if (pad_name_len) {
/* copy the path name */
- if (copy_to_user(buf, event->file_name, event->name_len))
+ if (copy_to_user(buf, event->name, name_len))
return -EFAULT;
- buf += event->name_len;
+ buf += name_len;
/* fill userspace with 0's */
- if (clear_user(buf, len_to_zero))
+ if (clear_user(buf, pad_name_len - name_len))
return -EFAULT;
- buf += len_to_zero;
- event_size += name_len;
+ event_size += pad_name_len;
}
return event_size;
@@ -254,7 +246,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
if (IS_ERR(kevent))
break;
ret = copy_event_to_user(group, kevent, buf);
- fsnotify_put_event(kevent);
+ fsnotify_destroy_event(group, kevent);
if (ret < 0)
break;
buf += ret;
@@ -297,8 +289,7 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
struct fsnotify_group *group;
- struct fsnotify_event_holder *holder;
- struct fsnotify_event *event;
+ struct fsnotify_event *fsn_event;
void __user *p;
int ret = -ENOTTY;
size_t send_len = 0;
@@ -311,12 +302,10 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
switch (cmd) {
case FIONREAD:
mutex_lock(&group->notification_mutex);
- list_for_each_entry(holder, &group->notification_list, event_list) {
- event = holder->event;
+ list_for_each_entry(fsn_event, &group->notification_list,
+ list) {
send_len += sizeof(struct inotify_event);
- if (event->name_len)
- send_len += roundup(event->name_len + 1,
- sizeof(struct inotify_event));
+ send_len += round_event_name_len(fsn_event);
}
mutex_unlock(&group->notification_mutex);
ret = put_user(send_len, (int __user *) p);
@@ -503,43 +492,12 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
struct fsnotify_group *group)
{
struct inotify_inode_mark *i_mark;
- struct fsnotify_event *ignored_event, *notify_event;
- struct inotify_event_private_data *event_priv;
- struct fsnotify_event_private_data *fsn_event_priv;
- int ret;
-
- i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
-
- ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL,
- FSNOTIFY_EVENT_NONE, NULL, 0,
- GFP_NOFS);
- if (!ignored_event)
- goto skip_send_ignore;
-
- event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS);
- if (unlikely(!event_priv))
- goto skip_send_ignore;
-
- fsn_event_priv = &event_priv->fsnotify_event_priv_data;
-
- fsnotify_get_group(group);
- fsn_event_priv->group = group;
- event_priv->wd = i_mark->wd;
-
- notify_event = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv, NULL);
- if (notify_event) {
- if (IS_ERR(notify_event))
- ret = PTR_ERR(notify_event);
- else
- fsnotify_put_event(notify_event);
- inotify_free_event_priv(fsn_event_priv);
- }
-skip_send_ignore:
- /* matches the reference taken when the event was created */
- if (ignored_event)
- fsnotify_put_event(ignored_event);
+ /* Queue ignore event for the watch */
+ inotify_handle_event(group, NULL, fsn_mark, NULL, FS_IN_IGNORED,
+ NULL, FSNOTIFY_EVENT_NONE, NULL);
+ i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
/* remove this mark from the idr */
inotify_remove_from_idr(group, i_mark);
@@ -836,7 +794,6 @@ static int __init inotify_user_setup(void)
BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21);
inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC);
- event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC);
inotify_max_queued_events = 16384;
inotify_max_user_instances = 128;
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 7b51b05f160c..952237b8e2d2 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -48,15 +48,6 @@
#include <linux/fsnotify_backend.h>
#include "fsnotify.h"
-static struct kmem_cache *fsnotify_event_cachep;
-static struct kmem_cache *fsnotify_event_holder_cachep;
-/*
- * This is a magic event we send when the q is too full. Since it doesn't
- * hold real event information we just keep one system wide and use it any time
- * it is needed. It's refcnt is set 1 at kernel init time and will never
- * get set to 0 so it will never get 'freed'
- */
-static struct fsnotify_event *q_overflow_event;
static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0);
/**
@@ -76,60 +67,14 @@ bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
return list_empty(&group->notification_list) ? true : false;
}
-void fsnotify_get_event(struct fsnotify_event *event)
+void fsnotify_destroy_event(struct fsnotify_group *group,
+ struct fsnotify_event *event)
{
- atomic_inc(&event->refcnt);
-}
-
-void fsnotify_put_event(struct fsnotify_event *event)
-{
- if (!event)
+ /* Overflow events are per-group and we don't want to free them */
+ if (!event || event->mask == FS_Q_OVERFLOW)
return;
- if (atomic_dec_and_test(&event->refcnt)) {
- pr_debug("%s: event=%p\n", __func__, event);
-
- if (event->data_type == FSNOTIFY_EVENT_PATH)
- path_put(&event->path);
-
- BUG_ON(!list_empty(&event->private_data_list));
-
- kfree(event->file_name);
- put_pid(event->tgid);
- kmem_cache_free(fsnotify_event_cachep, event);
- }
-}
-
-struct fsnotify_event_holder *fsnotify_alloc_event_holder(void)
-{
- return kmem_cache_alloc(fsnotify_event_holder_cachep, GFP_KERNEL);
-}
-
-void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder)
-{
- if (holder)
- kmem_cache_free(fsnotify_event_holder_cachep, holder);
-}
-
-/*
- * Find the private data that the group previously attached to this event when
- * the group added the event to the notification queue (fsnotify_add_notify_event)
- */
-struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, struct fsnotify_event *event)
-{
- struct fsnotify_event_private_data *lpriv;
- struct fsnotify_event_private_data *priv = NULL;
-
- assert_spin_locked(&event->lock);
-
- list_for_each_entry(lpriv, &event->private_data_list, event_list) {
- if (lpriv->group == group) {
- priv = lpriv;
- list_del(&priv->event_list);
- break;
- }
- }
- return priv;
+ group->ops->free_event(event);
}
/*
@@ -137,91 +82,35 @@ struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnot
* event off the queue to deal with. If the event is successfully added to the
* group's notification queue, a reference is taken on event.
*/
-struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event,
- struct fsnotify_event_private_data *priv,
+struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group,
+ struct fsnotify_event *event,
struct fsnotify_event *(*merge)(struct list_head *,
struct fsnotify_event *))
{
struct fsnotify_event *return_event = NULL;
- struct fsnotify_event_holder *holder = NULL;
struct list_head *list = &group->notification_list;
- pr_debug("%s: group=%p event=%p priv=%p\n", __func__, group, event, priv);
-
- /*
- * There is one fsnotify_event_holder embedded inside each fsnotify_event.
- * Check if we expect to be able to use that holder. If not alloc a new
- * holder.
- * For the overflow event it's possible that something will use the in
- * event holder before we get the lock so we may need to jump back and
- * alloc a new holder, this can't happen for most events...
- */
- if (!list_empty(&event->holder.event_list)) {
-alloc_holder:
- holder = fsnotify_alloc_event_holder();
- if (!holder)
- return ERR_PTR(-ENOMEM);
- }
+ pr_debug("%s: group=%p event=%p\n", __func__, group, event);
mutex_lock(&group->notification_mutex);
if (group->q_len >= group->max_events) {
- event = q_overflow_event;
-
- /*
- * we need to return the overflow event
- * which means we need a ref
- */
- fsnotify_get_event(event);
+ /* Queue overflow event only if it isn't already queued */
+ if (list_empty(&group->overflow_event.list))
+ event = &group->overflow_event;
return_event = event;
-
- /* sorry, no private data on the overflow event */
- priv = NULL;
}
if (!list_empty(list) && merge) {
- struct fsnotify_event *tmp;
-
- tmp = merge(list, event);
- if (tmp) {
- mutex_unlock(&group->notification_mutex);
-
- if (return_event)
- fsnotify_put_event(return_event);
- if (holder != &event->holder)
- fsnotify_destroy_event_holder(holder);
- return tmp;
- }
- }
-
- spin_lock(&event->lock);
-
- if (list_empty(&event->holder.event_list)) {
- if (unlikely(holder))
- fsnotify_destroy_event_holder(holder);
- holder = &event->holder;
- } else if (unlikely(!holder)) {
- /* between the time we checked above and got the lock the in
- * event holder was used, go back and get a new one */
- spin_unlock(&event->lock);
- mutex_unlock(&group->notification_mutex);
-
+ return_event = merge(list, event);
if (return_event) {
- fsnotify_put_event(return_event);
- return_event = NULL;
+ mutex_unlock(&group->notification_mutex);
+ return return_event;
}
-
- goto alloc_holder;
}
group->q_len++;
- holder->event = event;
-
- fsnotify_get_event(event);
- list_add_tail(&holder->event_list, list);
- if (priv)
- list_add_tail(&priv->event_list, &event->private_data_list);
- spin_unlock(&event->lock);
+ list_add_tail(&event->list, list);
mutex_unlock(&group->notification_mutex);
wake_up(&group->notification_waitq);
@@ -230,32 +119,20 @@ alloc_holder:
}
/*
- * Remove and return the first event from the notification list. There is a
- * reference held on this event since it was on the list. It is the responsibility
- * of the caller to drop this reference.
+ * Remove and return the first event from the notification list. It is the
+ * responsibility of the caller to destroy the obtained event
*/
struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group)
{
struct fsnotify_event *event;
- struct fsnotify_event_holder *holder;
BUG_ON(!mutex_is_locked(&group->notification_mutex));
pr_debug("%s: group=%p\n", __func__, group);
- holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list);
-
- event = holder->event;
-
- spin_lock(&event->lock);
- holder->event = NULL;
- list_del_init(&holder->event_list);
- spin_unlock(&event->lock);
-
- /* event == holder means we are referenced through the in event holder */
- if (holder != &event->holder)
- fsnotify_destroy_event_holder(holder);
-
+ event = list_first_entry(&group->notification_list,
+ struct fsnotify_event, list);
+ list_del(&event->list);
group->q_len--;
return event;
@@ -266,15 +143,10 @@ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group
*/
struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group)
{
- struct fsnotify_event *event;
- struct fsnotify_event_holder *holder;
-
BUG_ON(!mutex_is_locked(&group->notification_mutex));
- holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list);
- event = holder->event;
-
- return event;
+ return list_first_entry(&group->notification_list,
+ struct fsnotify_event, list);
}
/*
@@ -284,181 +156,31 @@ struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group)
void fsnotify_flush_notify(struct fsnotify_group *group)
{
struct fsnotify_event *event;
- struct fsnotify_event_private_data *priv;
mutex_lock(&group->notification_mutex);
while (!fsnotify_notify_queue_is_empty(group)) {
event = fsnotify_remove_notify_event(group);
- /* if they don't implement free_event_priv they better not have attached any */
- if (group->ops->free_event_priv) {
- spin_lock(&event->lock);
- priv = fsnotify_remove_priv_from_event(group, event);
- spin_unlock(&event->lock);
- if (priv)
- group->ops->free_event_priv(priv);
- }
- fsnotify_put_event(event); /* matches fsnotify_add_notify_event */
+ fsnotify_destroy_event(group, event);
}
mutex_unlock(&group->notification_mutex);
}
-static void initialize_event(struct fsnotify_event *event)
-{
- INIT_LIST_HEAD(&event->holder.event_list);
- atomic_set(&event->refcnt, 1);
-
- spin_lock_init(&event->lock);
-
- INIT_LIST_HEAD(&event->private_data_list);
-}
-
-/*
- * Caller damn well better be holding whatever mutex is protecting the
- * old_holder->event_list and the new_event must be a clean event which
- * cannot be found anywhere else in the kernel.
- */
-int fsnotify_replace_event(struct fsnotify_event_holder *old_holder,
- struct fsnotify_event *new_event)
-{
- struct fsnotify_event *old_event = old_holder->event;
- struct fsnotify_event_holder *new_holder = &new_event->holder;
-
- enum event_spinlock_class {
- SPINLOCK_OLD,
- SPINLOCK_NEW,
- };
-
- pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, new_event);
-
- /*
- * if the new_event's embedded holder is in use someone
- * screwed up and didn't give us a clean new event.
- */
- BUG_ON(!list_empty(&new_holder->event_list));
-
- spin_lock_nested(&old_event->lock, SPINLOCK_OLD);
- spin_lock_nested(&new_event->lock, SPINLOCK_NEW);
-
- new_holder->event = new_event;
- list_replace_init(&old_holder->event_list, &new_holder->event_list);
-
- spin_unlock(&new_event->lock);
- spin_unlock(&old_event->lock);
-
- /* event == holder means we are referenced through the in event holder */
- if (old_holder != &old_event->holder)
- fsnotify_destroy_event_holder(old_holder);
-
- fsnotify_get_event(new_event); /* on the list take reference */
- fsnotify_put_event(old_event); /* off the list, drop reference */
-
- return 0;
-}
-
-struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event)
-{
- struct fsnotify_event *event;
-
- event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL);
- if (!event)
- return NULL;
-
- pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, event);
-
- memcpy(event, old_event, sizeof(*event));
- initialize_event(event);
-
- if (event->name_len) {
- event->file_name = kstrdup(old_event->file_name, GFP_KERNEL);
- if (!event->file_name) {
- kmem_cache_free(fsnotify_event_cachep, event);
- return NULL;
- }
- }
- event->tgid = get_pid(old_event->tgid);
- if (event->data_type == FSNOTIFY_EVENT_PATH)
- path_get(&event->path);
-
- return event;
-}
-
/*
* fsnotify_create_event - Allocate a new event which will be sent to each
* group's handle_event function if the group was interested in this
* particular event.
*
- * @to_tell the inode which is supposed to receive the event (sometimes a
+ * @inode the inode which is supposed to receive the event (sometimes a
* parent of the inode to which the event happened.
* @mask what actually happened.
* @data pointer to the object which was actually affected
* @data_type flag indication if the data is a file, path, inode, nothing...
* @name the filename, if available
*/
-struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data,
- int data_type, const unsigned char *name,
- u32 cookie, gfp_t gfp)
+void fsnotify_init_event(struct fsnotify_event *event, struct inode *inode,
+ u32 mask)
{
- struct fsnotify_event *event;
-
- event = kmem_cache_zalloc(fsnotify_event_cachep, gfp);
- if (!event)
- return NULL;
-
- pr_debug("%s: event=%p to_tell=%p mask=%x data=%p data_type=%d\n",
- __func__, event, to_tell, mask, data, data_type);
-
- initialize_event(event);
-
- if (name) {
- event->file_name = kstrdup(name, gfp);
- if (!event->file_name) {
- kmem_cache_free(fsnotify_event_cachep, event);
- return NULL;
- }
- event->name_len = strlen(event->file_name);
- }
-
- event->tgid = get_pid(task_tgid(current));
- event->sync_cookie = cookie;
- event->to_tell = to_tell;
- event->data_type = data_type;
-
- switch (data_type) {
- case FSNOTIFY_EVENT_PATH: {
- struct path *path = data;
- event->path.dentry = path->dentry;
- event->path.mnt = path->mnt;
- path_get(&event->path);
- break;
- }
- case FSNOTIFY_EVENT_INODE:
- event->inode = data;
- break;
- case FSNOTIFY_EVENT_NONE:
- event->inode = NULL;
- event->path.dentry = NULL;
- event->path.mnt = NULL;
- break;
- default:
- BUG();
- }
-
+ INIT_LIST_HEAD(&event->list);
+ event->inode = inode;
event->mask = mask;
-
- return event;
-}
-
-static __init int fsnotify_notification_init(void)
-{
- fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC);
- fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC);
-
- q_overflow_event = fsnotify_create_event(NULL, FS_Q_OVERFLOW, NULL,
- FSNOTIFY_EVENT_NONE, NULL, 0,
- GFP_KERNEL);
- if (!q_overflow_event)
- panic("unable to allocate fsnotify q_overflow_event\n");
-
- return 0;
}
-subsys_initcall(fsnotify_notification_init);
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index f17e58b32989..ce210d4951a1 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -38,7 +38,6 @@ ocfs2-objs := \
symlink.o \
sysfile.o \
uptodate.o \
- ver.o \
quota_local.o \
quota_global.o \
xattr.o \
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index dc7411fe185d..e2edff38be52 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -4742,6 +4742,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
enum ocfs2_alloc_restarted *reason_ret)
{
int status = 0, err = 0;
+ int need_free = 0;
int free_extents;
enum ocfs2_alloc_restarted reason = RESTART_NONE;
u32 bit_off, num_bits;
@@ -4796,7 +4797,8 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ need_free = 1;
+ goto bail;
}
block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
@@ -4807,7 +4809,8 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
num_bits, flags, meta_ac);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ need_free = 1;
+ goto bail;
}
ocfs2_journal_dirty(handle, et->et_root_bh);
@@ -4821,6 +4824,19 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
reason = RESTART_TRANS;
}
+bail:
+ if (need_free) {
+ if (data_ac->ac_which == OCFS2_AC_USE_LOCAL)
+ ocfs2_free_local_alloc_bits(osb, handle, data_ac,
+ bit_off, num_bits);
+ else
+ ocfs2_free_clusters(handle,
+ data_ac->ac_inode,
+ data_ac->ac_bh,
+ ocfs2_clusters_to_blocks(osb->sb, bit_off),
+ num_bits);
+ }
+
leave:
if (reason_ret)
*reason_ret = reason;
@@ -6805,6 +6821,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
struct buffer_head *di_bh)
{
int ret, i, has_data, num_pages = 0;
+ int need_free = 0;
+ u32 bit_off, num;
handle_t *handle;
u64 uninitialized_var(block);
struct ocfs2_inode_info *oi = OCFS2_I(inode);
@@ -6850,7 +6868,6 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
}
if (has_data) {
- u32 bit_off, num;
unsigned int page_end;
u64 phys;
@@ -6886,6 +6903,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
if (ret) {
mlog_errno(ret);
+ need_free = 1;
goto out_commit;
}
@@ -6896,6 +6914,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
if (ret) {
mlog_errno(ret);
+ need_free = 1;
goto out_commit;
}
@@ -6927,6 +6946,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL);
if (ret) {
mlog_errno(ret);
+ need_free = 1;
goto out_commit;
}
@@ -6938,6 +6958,18 @@ out_commit:
dquot_free_space_nodirty(inode,
ocfs2_clusters_to_bytes(osb->sb, 1));
+ if (need_free) {
+ if (data_ac->ac_which == OCFS2_AC_USE_LOCAL)
+ ocfs2_free_local_alloc_bits(osb, handle, data_ac,
+ bit_off, num);
+ else
+ ocfs2_free_clusters(handle,
+ data_ac->ac_inode,
+ data_ac->ac_bh,
+ ocfs2_clusters_to_blocks(osb->sb, bit_off),
+ num);
+ }
+
ocfs2_commit_trans(osb, handle);
out_unlock:
@@ -7126,7 +7158,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
if (end > i_size_read(inode))
end = i_size_read(inode);
- BUG_ON(start >= end);
+ BUG_ON(start > end);
if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
@@ -7260,14 +7292,8 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
start = range->start >> osb->s_clustersize_bits;
len = range->len >> osb->s_clustersize_bits;
minlen = range->minlen >> osb->s_clustersize_bits;
- trimmed = 0;
- if (!len) {
- range->len = 0;
- return 0;
- }
-
- if (minlen >= osb->bitmap_cpg)
+ if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
return -EINVAL;
main_bm_inode = ocfs2_get_system_file_inode(osb,
@@ -7293,6 +7319,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
goto out_unlock;
}
+ len = range->len >> osb->s_clustersize_bits;
if (start + len > le32_to_cpu(main_bm->i_clusters))
len = le32_to_cpu(main_bm->i_clusters) - start;
@@ -7307,6 +7334,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
last_bit = osb->bitmap_cpg;
+ trimmed = 0;
for (group = first_group; group <= last_group;) {
if (first_bit + len >= osb->bitmap_cpg)
last_bit = osb->bitmap_cpg;
diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile
index bc8c5e7d8608..1aefc0350ec3 100644
--- a/fs/ocfs2/cluster/Makefile
+++ b/fs/ocfs2/cluster/Makefile
@@ -1,4 +1,4 @@
obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o
ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \
- quorum.o tcp.o netdebug.o ver.o
+ quorum.o tcp.o netdebug.o
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index bb240647ca5f..441c84e169e6 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -29,7 +29,6 @@
#include "heartbeat.h"
#include "masklog.h"
#include "sys.h"
-#include "ver.h"
/* for now we operate under the assertion that there can be only one
* cluster active at a time. Changing this will require trickling
@@ -945,8 +944,6 @@ static int __init init_o2nm(void)
{
int ret = -1;
- cluster_print_version();
-
ret = o2hb_init();
if (ret)
goto out;
@@ -984,6 +981,7 @@ out:
MODULE_AUTHOR("Oracle");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("OCFS2 cluster management");
module_init(init_o2nm)
module_exit(exit_o2nm)
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 2cd2406b4140..1828201bc901 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1826,7 +1826,7 @@ int o2net_register_hb_callbacks(void)
/* ------------------------------------------------------------ */
-static int o2net_accept_one(struct socket *sock)
+static int o2net_accept_one(struct socket *sock, int *more)
{
int ret, slen;
struct sockaddr_in sin;
@@ -1837,6 +1837,7 @@ static int o2net_accept_one(struct socket *sock)
struct o2net_node *nn;
BUG_ON(sock == NULL);
+ *more = 0;
ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
sock->sk->sk_protocol, &new_sock);
if (ret)
@@ -1848,6 +1849,7 @@ static int o2net_accept_one(struct socket *sock)
if (ret < 0)
goto out;
+ *more = 1;
new_sock->sk->sk_allocation = GFP_ATOMIC;
ret = o2net_set_nodelay(new_sock);
@@ -1949,8 +1951,15 @@ out:
static void o2net_accept_many(struct work_struct *work)
{
struct socket *sock = o2net_listen_sock;
- while (o2net_accept_one(sock) == 0)
+ int more;
+ int err;
+
+ for (;;) {
+ err = o2net_accept_one(sock, &more);
+ if (!more)
+ break;
cond_resched();
+ }
}
static void o2net_listen_data_ready(struct sock *sk, int bytes)
@@ -1964,18 +1973,30 @@ static void o2net_listen_data_ready(struct sock *sk, int bytes)
goto out;
}
- /* ->sk_data_ready is also called for a newly established child socket
- * before it has been accepted and the acceptor has set up their
- * data_ready.. we only want to queue listen work for our listening
- * socket */
+ /* This callback may called twice when a new connection
+ * is being established as a child socket inherits everything
+ * from a parent LISTEN socket, including the data_ready cb of
+ * the parent. This leads to a hazard. In o2net_accept_one()
+ * we are still initializing the child socket but have not
+ * changed the inherited data_ready callback yet when
+ * data starts arriving.
+ * We avoid this hazard by checking the state.
+ * For the listening socket, the state will be TCP_LISTEN; for the new
+ * socket, will be TCP_ESTABLISHED. Also, in this case,
+ * sk->sk_user_data is not a valid function pointer.
+ */
+
if (sk->sk_state == TCP_LISTEN) {
mlog(ML_TCP, "bytes: %d\n", bytes);
queue_work(o2net_wq, &o2net_listen_work);
+ } else {
+ ready = NULL;
}
out:
read_unlock(&sk->sk_callback_lock);
- ready(sk, bytes);
+ if (ready != NULL)
+ ready(sk, bytes);
}
static int o2net_open_listening_sock(__be32 addr, __be16 port)
diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c
deleted file mode 100644
index a56eee6abad3..000000000000
--- a/fs/ocfs2/cluster/ver.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ver.c
- *
- * version string
- *
- * Copyright (C) 2002, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include "ver.h"
-
-#define CLUSTER_BUILD_VERSION "1.5.0"
-
-#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION
-
-void cluster_print_version(void)
-{
- printk(KERN_INFO "%s\n", VERSION_STR);
-}
-
-MODULE_DESCRIPTION(VERSION_STR);
-
-MODULE_VERSION(CLUSTER_BUILD_VERSION);
diff --git a/fs/ocfs2/cluster/ver.h b/fs/ocfs2/cluster/ver.h
deleted file mode 100644
index 32554c3382c2..000000000000
--- a/fs/ocfs2/cluster/ver.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ver.h
- *
- * Function prototypes
- *
- * Copyright (C) 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef O2CLUSTER_VER_H
-#define O2CLUSTER_VER_H
-
-void cluster_print_version(void);
-
-#endif /* O2CLUSTER_VER_H */
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index c8a044efbb15..bd1aab1f49a4 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -3,5 +3,5 @@ ccflags-y := -Ifs/ocfs2
obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o
ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
- dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o
+ dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 8b3382abf840..33660a4a52fa 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -43,8 +43,6 @@
#include "dlmdomain.h"
#include "dlmdebug.h"
-#include "dlmver.h"
-
#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN)
#include "cluster/masklog.h"
@@ -2328,8 +2326,6 @@ static int __init dlm_init(void)
{
int status;
- dlm_print_version();
-
status = dlm_init_mle_cache();
if (status) {
mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n");
@@ -2379,6 +2375,7 @@ static void __exit dlm_exit (void)
MODULE_AUTHOR("Oracle");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("OCFS2 Distributed Lock Management");
module_init(dlm_init);
module_exit(dlm_exit);
diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c
deleted file mode 100644
index dfc0da4d158d..000000000000
--- a/fs/ocfs2/dlm/dlmver.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmver.c
- *
- * version string
- *
- * Copyright (C) 2002, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include "dlmver.h"
-
-#define DLM_BUILD_VERSION "1.5.0"
-
-#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION
-
-void dlm_print_version(void)
-{
- printk(KERN_INFO "%s\n", VERSION_STR);
-}
-
-MODULE_DESCRIPTION(VERSION_STR);
-
-MODULE_VERSION(DLM_BUILD_VERSION);
diff --git a/fs/ocfs2/dlm/dlmver.h b/fs/ocfs2/dlm/dlmver.h
deleted file mode 100644
index f674aee77a16..000000000000
--- a/fs/ocfs2/dlm/dlmver.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmfsver.h
- *
- * Function prototypes
- *
- * Copyright (C) 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef DLM_VER_H
-#define DLM_VER_H
-
-void dlm_print_version(void);
-
-#endif /* DLM_VER_H */
diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile
index f14be89a6701..eed3db8c5b49 100644
--- a/fs/ocfs2/dlmfs/Makefile
+++ b/fs/ocfs2/dlmfs/Makefile
@@ -2,4 +2,4 @@ ccflags-y := -Ifs/ocfs2
obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o
-ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o
+ocfs2_dlmfs-objs := userdlm.o dlmfs.o
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index efa2b3d339e3..09b7d9dac71d 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -49,7 +49,6 @@
#include "stackglue.h"
#include "userdlm.h"
-#include "dlmfsver.h"
#define MLOG_MASK_PREFIX ML_DLMFS
#include "cluster/masklog.h"
@@ -644,8 +643,6 @@ static int __init init_dlmfs_fs(void)
int status;
int cleanup_inode = 0, cleanup_worker = 0;
- dlmfs_print_version();
-
status = bdi_init(&dlmfs_backing_dev_info);
if (status)
return status;
@@ -701,6 +698,7 @@ static void __exit exit_dlmfs_fs(void)
MODULE_AUTHOR("Oracle");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("OCFS2 DLM-Filesystem");
module_init(init_dlmfs_fs)
module_exit(exit_dlmfs_fs)
diff --git a/fs/ocfs2/dlmfs/dlmfsver.c b/fs/ocfs2/dlmfs/dlmfsver.c
deleted file mode 100644
index a733b3321f83..000000000000
--- a/fs/ocfs2/dlmfs/dlmfsver.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmfsver.c
- *
- * version string
- *
- * Copyright (C) 2002, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include "dlmfsver.h"
-
-#define DLM_BUILD_VERSION "1.5.0"
-
-#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION
-
-void dlmfs_print_version(void)
-{
- printk(KERN_INFO "%s\n", VERSION_STR);
-}
-
-MODULE_DESCRIPTION(VERSION_STR);
-
-MODULE_VERSION(DLM_BUILD_VERSION);
diff --git a/fs/ocfs2/dlmfs/dlmfsver.h b/fs/ocfs2/dlmfs/dlmfsver.h
deleted file mode 100644
index f35eadbed25c..000000000000
--- a/fs/ocfs2/dlmfs/dlmfsver.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmver.h
- *
- * Function prototypes
- *
- * Copyright (C) 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef DLMFS_VER_H
-#define DLMFS_VER_H
-
-void dlmfs_print_version(void);
-
-#endif /* DLMFS_VER_H */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 3407b2c62b21..19986959d149 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2996,6 +2996,8 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
/* for now, uuid == domain */
status = ocfs2_cluster_connect(osb->osb_cluster_stack,
+ osb->osb_cluster_name,
+ strlen(osb->osb_cluster_name),
osb->uuid_str,
strlen(osb->uuid_str),
&lproto, ocfs2_do_node_down, osb,
@@ -3005,7 +3007,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
goto bail;
}
- status = ocfs2_cluster_this_node(&osb->node_num);
+ status = ocfs2_cluster_this_node(conn, &osb->node_num);
if (status < 0) {
mlog_errno(status);
mlog(ML_ERROR,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 6fff128cad16..a2d20c58ef07 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -185,6 +185,9 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
file->f_path.dentry->d_name.name,
(unsigned long long)datasync);
+ if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+ return 0;
+
err = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (err)
return err;
@@ -474,11 +477,6 @@ static int ocfs2_truncate_file(struct inode *inode,
goto bail;
}
- /* lets handle the simple truncate cases before doing any more
- * cluster locking. */
- if (new_i_size == le64_to_cpu(fe->i_size))
- goto bail;
-
down_write(&OCFS2_I(inode)->ip_alloc_sem);
ocfs2_resv_discard(&osb->osb_la_resmap,
@@ -718,7 +716,8 @@ leave:
* While a write will already be ordering the data, a truncate will not.
* Thus, we need to explicitly order the zeroed pages.
*/
-static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)
+static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode,
+ struct buffer_head *di_bh)
{
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
handle_t *handle = NULL;
@@ -735,7 +734,14 @@ static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)
}
ret = ocfs2_jbd2_file_inode(handle, inode);
- if (ret < 0)
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret)
mlog_errno(ret);
out:
@@ -751,7 +757,7 @@ out:
* to be too fragile to do exactly what we need without us having to
* worry about recursive locking in ->write_begin() and ->write_end(). */
static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
- u64 abs_to)
+ u64 abs_to, struct buffer_head *di_bh)
{
struct address_space *mapping = inode->i_mapping;
struct page *page;
@@ -759,6 +765,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
handle_t *handle = NULL;
int ret = 0;
unsigned zero_from, zero_to, block_start, block_end;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
BUG_ON(abs_from >= abs_to);
BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
@@ -801,7 +808,8 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
}
if (!handle) {
- handle = ocfs2_zero_start_ordered_transaction(inode);
+ handle = ocfs2_zero_start_ordered_transaction(inode,
+ di_bh);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
handle = NULL;
@@ -818,8 +826,22 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
ret = 0;
}
- if (handle)
+ if (handle) {
+ /*
+ * fs-writeback will release the dirty pages without page lock
+ * whose offset are over inode size, the release happens at
+ * block_write_full_page_endio().
+ */
+ i_size_write(inode, abs_to);
+ inode->i_blocks = ocfs2_inode_sector_count(inode);
+ di->i_size = cpu_to_le64((u64)i_size_read(inode));
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
+ di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ di->i_mtime_nsec = di->i_ctime_nsec;
+ ocfs2_journal_dirty(handle, di_bh);
ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+ }
out_unlock:
unlock_page(page);
@@ -915,7 +937,7 @@ out:
* has made sure that the entire range needs zeroing.
*/
static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
- u64 range_end)
+ u64 range_end, struct buffer_head *di_bh)
{
int rc = 0;
u64 next_pos;
@@ -931,7 +953,7 @@ static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
if (next_pos > range_end)
next_pos = range_end;
- rc = ocfs2_write_zero_page(inode, zero_pos, next_pos);
+ rc = ocfs2_write_zero_page(inode, zero_pos, next_pos, di_bh);
if (rc < 0) {
mlog_errno(rc);
break;
@@ -977,7 +999,7 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
range_end = zero_to_size;
ret = ocfs2_zero_extend_range(inode, range_start,
- range_end);
+ range_end, di_bh);
if (ret) {
mlog_errno(ret);
break;
@@ -1145,14 +1167,14 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
goto bail_unlock_rw;
}
- if (size_change && attr->ia_size != i_size_read(inode)) {
+ if (size_change) {
status = inode_newsize_ok(inode, attr->ia_size);
if (status)
goto bail_unlock;
inode_dio_wait(inode);
- if (i_size_read(inode) > attr->ia_size) {
+ if (i_size_read(inode) >= attr->ia_size) {
if (ocfs2_should_order_data(inode)) {
status = ocfs2_begin_ordered_truncate(inode,
attr->ia_size);
@@ -1869,7 +1891,8 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
}
size = sr->l_start + sr->l_len;
- if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
+ if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64 ||
+ cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) {
if (sr->l_len <= 0) {
ret = -EINVAL;
goto out_inode_unlock;
@@ -2622,7 +2645,16 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
case SEEK_SET:
break;
case SEEK_END:
- offset += inode->i_size;
+ /* SEEK_END requires the OCFS2 inode lock for the file
+ * because it references the file's size.
+ */
+ ret = ocfs2_inode_lock(inode, NULL, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ offset += i_size_read(inode);
+ ocfs2_inode_unlock(inode, 0);
break;
case SEEK_CUR:
if (offset == 0) {
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index fa32ce9b455d..8ca3c29accbf 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -7,6 +7,7 @@
#include <linux/fs.h>
#include <linux/mount.h>
+#include <linux/blkdev.h>
#include <linux/compat.h>
#include <cluster/masklog.h>
@@ -966,15 +967,21 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case FITRIM:
{
struct super_block *sb = inode->i_sb;
+ struct request_queue *q = bdev_get_queue(sb->s_bdev);
struct fstrim_range range;
int ret = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (!blk_queue_discard(q))
+ return -EOPNOTSUPP;
+
if (copy_from_user(&range, argp, sizeof(range)))
return -EFAULT;
+ range.minlen = max_t(u64, q->limits.discard_granularity,
+ range.minlen);
ret = ocfs2_trim_fs(sb, &range);
if (ret < 0)
return ret;
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index cd5496b7a0a3..25ec3b712d5f 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -781,6 +781,46 @@ bail:
return status;
}
+int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb,
+ handle_t *handle,
+ struct ocfs2_alloc_context *ac,
+ u32 bit_off,
+ u32 num_bits)
+{
+ int status, start;
+ struct inode *local_alloc_inode;
+ void *bitmap;
+ struct ocfs2_dinode *alloc;
+ struct ocfs2_local_alloc *la;
+
+ BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL);
+
+ local_alloc_inode = ac->ac_inode;
+ alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+ la = OCFS2_LOCAL_ALLOC(alloc);
+
+ bitmap = la->la_bitmap;
+ start = bit_off - le32_to_cpu(la->la_bm_off);
+
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(local_alloc_inode),
+ osb->local_alloc_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ while (num_bits--)
+ ocfs2_clear_bit(start++, bitmap);
+
+ le32_add_cpu(&alloc->id1.bitmap1.i_used, -num_bits);
+ ocfs2_journal_dirty(handle, osb->local_alloc_bh);
+
+bail:
+ return status;
+}
+
static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
{
u32 count;
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
index 1be9b5864460..44a7d1fb2dec 100644
--- a/fs/ocfs2/localalloc.h
+++ b/fs/ocfs2/localalloc.h
@@ -55,6 +55,12 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
u32 *bit_off,
u32 *num_bits);
+int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb,
+ handle_t *handle,
+ struct ocfs2_alloc_context *ac,
+ u32 bit_off,
+ u32 num_bits);
+
void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,
unsigned int num_clusters);
void ocfs2_la_enable_worker(struct work_struct *work);
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 631a98213474..64c304d668f0 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -561,83 +561,6 @@ static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos);
}
-static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
- handle_t *handle,
- struct buffer_head *di_bh,
- u32 num_bits,
- u16 chain)
-{
- int ret;
- u32 tmp_used;
- struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
- struct ocfs2_chain_list *cl =
- (struct ocfs2_chain_list *) &di->id2.i_chain;
-
- ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
-
- tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
- di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
- le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
- ocfs2_journal_dirty(handle, di_bh);
-
-out:
- return ret;
-}
-
-static inline int ocfs2_block_group_set_bits(handle_t *handle,
- struct inode *alloc_inode,
- struct ocfs2_group_desc *bg,
- struct buffer_head *group_bh,
- unsigned int bit_off,
- unsigned int num_bits)
-{
- int status;
- void *bitmap = bg->bg_bitmap;
- int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
-
- /* All callers get the descriptor via
- * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
- BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
- BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
-
- mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
- num_bits);
-
- if (ocfs2_is_cluster_bitmap(alloc_inode))
- journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
-
- status = ocfs2_journal_access_gd(handle,
- INODE_CACHE(alloc_inode),
- group_bh,
- journal_type);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
- le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
- if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
- ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
- " count %u but claims %u are freed. num_bits %d",
- (unsigned long long)le64_to_cpu(bg->bg_blkno),
- le16_to_cpu(bg->bg_bits),
- le16_to_cpu(bg->bg_free_bits_count), num_bits);
- return -EROFS;
- }
- while (num_bits--)
- ocfs2_set_bit(bit_off++, bitmap);
-
- ocfs2_journal_dirty(handle, group_bh);
-
-bail:
- return status;
-}
-
static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,
u32 len, int ext_flags)
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 4f791f6d27d0..179661a21b61 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -644,6 +644,7 @@ static int ocfs2_link(struct dentry *old_dentry,
struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
struct ocfs2_dir_lookup_result lookup = { NULL, };
sigset_t oldset;
+ u64 old_de_ino;
trace_ocfs2_link((unsigned long long)OCFS2_I(inode)->ip_blkno,
old_dentry->d_name.len, old_dentry->d_name.name,
@@ -666,6 +667,18 @@ static int ocfs2_link(struct dentry *old_dentry,
goto out;
}
+ err = ocfs2_lookup_ino_from_name(dir, old_dentry->d_name.name,
+ old_dentry->d_name.len, &old_de_ino);
+ if (err) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ if (old_de_ino != OCFS2_I(inode)->ip_blkno) {
+ err = -ENOENT;
+ goto out;
+ }
+
err = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
dentry->d_name.len);
if (err)
@@ -954,6 +967,65 @@ leave:
return status;
}
+static int ocfs2_check_if_ancestor(struct ocfs2_super *osb,
+ u64 src_inode_no, u64 dest_inode_no)
+{
+ int ret = 0, i = 0;
+ u64 parent_inode_no = 0;
+ u64 child_inode_no = src_inode_no;
+ struct inode *child_inode;
+
+#define MAX_LOOKUP_TIMES 32
+ while (1) {
+ child_inode = ocfs2_iget(osb, child_inode_no, 0, 0);
+ if (IS_ERR(child_inode)) {
+ ret = PTR_ERR(child_inode);
+ break;
+ }
+
+ ret = ocfs2_inode_lock(child_inode, NULL, 0);
+ if (ret < 0) {
+ iput(child_inode);
+ if (ret != -ENOENT)
+ mlog_errno(ret);
+ break;
+ }
+
+ ret = ocfs2_lookup_ino_from_name(child_inode, "..", 2,
+ &parent_inode_no);
+ ocfs2_inode_unlock(child_inode, 0);
+ iput(child_inode);
+ if (ret < 0) {
+ ret = -ENOENT;
+ break;
+ }
+
+ if (parent_inode_no == dest_inode_no) {
+ ret = 1;
+ break;
+ }
+
+ if (parent_inode_no == osb->root_inode->i_ino) {
+ ret = 0;
+ break;
+ }
+
+ child_inode_no = parent_inode_no;
+
+ if (++i >= MAX_LOOKUP_TIMES) {
+ mlog(ML_NOTICE, "max lookup times reached, filesystem "
+ "may have nested directories, "
+ "src inode: %llu, dest inode: %llu.\n",
+ (unsigned long long)src_inode_no,
+ (unsigned long long)dest_inode_no);
+ ret = 0;
+ break;
+ }
+ }
+
+ return ret;
+}
+
/*
* The only place this should be used is rename!
* if they have the same id, then the 1st one is the only one locked.
@@ -965,6 +1037,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
struct inode *inode2)
{
int status;
+ int inode1_is_ancestor, inode2_is_ancestor;
struct ocfs2_inode_info *oi1 = OCFS2_I(inode1);
struct ocfs2_inode_info *oi2 = OCFS2_I(inode2);
struct buffer_head **tmpbh;
@@ -978,9 +1051,26 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
if (*bh2)
*bh2 = NULL;
- /* we always want to lock the one with the lower lockid first. */
+ /* we always want to lock the one with the lower lockid first.
+ * and if they are nested, we lock ancestor first */
if (oi1->ip_blkno != oi2->ip_blkno) {
- if (oi1->ip_blkno < oi2->ip_blkno) {
+ inode1_is_ancestor = ocfs2_check_if_ancestor(osb, oi2->ip_blkno,
+ oi1->ip_blkno);
+ if (inode1_is_ancestor < 0) {
+ status = inode1_is_ancestor;
+ goto bail;
+ }
+
+ inode2_is_ancestor = ocfs2_check_if_ancestor(osb, oi1->ip_blkno,
+ oi2->ip_blkno);
+ if (inode2_is_ancestor < 0) {
+ status = inode2_is_ancestor;
+ goto bail;
+ }
+
+ if ((inode1_is_ancestor == 1) ||
+ (oi1->ip_blkno < oi2->ip_blkno &&
+ inode2_is_ancestor == 0)) {
/* switch id1 and id2 around */
tmpbh = bh2;
bh2 = bh1;
@@ -1097,6 +1187,22 @@ static int ocfs2_rename(struct inode *old_dir,
goto bail;
}
rename_lock = 1;
+
+ /* here we cannot guarantee the inodes haven't just been
+ * changed, so check if they are nested again */
+ status = ocfs2_check_if_ancestor(osb, new_dir->i_ino,
+ old_inode->i_ino);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ } else if (status == 1) {
+ status = -EPERM;
+ mlog(ML_ERROR, "src inode %llu should not be ancestor "
+ "of new dir inode %llu\n",
+ (unsigned long long)old_inode->i_ino,
+ (unsigned long long)new_dir->i_ino);
+ goto bail;
+ }
}
/* if old and new are the same, this'll just do one lock. */
@@ -2101,17 +2207,17 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
goto leave;
}
- /* remove it from the orphan directory */
- status = ocfs2_delete_entry(handle, orphan_dir_inode, &lookup);
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(orphan_dir_inode),
+ orphan_dir_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto leave;
}
- status = ocfs2_journal_access_di(handle,
- INODE_CACHE(orphan_dir_inode),
- orphan_dir_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ /* remove it from the orphan directory */
+ status = ocfs2_delete_entry(handle, orphan_dir_inode, &lookup);
if (status < 0) {
mlog_errno(status);
goto leave;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 3a903470c794..553f53cc73ae 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -387,6 +387,7 @@ struct ocfs2_super
u8 osb_stackflags;
char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
+ char osb_cluster_name[OCFS2_CLUSTER_NAME_LEN + 1];
struct ocfs2_cluster_connection *cconn;
struct ocfs2_lock_res osb_super_lockres;
struct ocfs2_lock_res osb_rename_lockres;
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index bf1f8930456f..1724d43d3da1 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -398,7 +398,8 @@ static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn)
return 0;
}
-static int o2cb_cluster_this_node(unsigned int *node)
+static int o2cb_cluster_this_node(struct ocfs2_cluster_connection *conn,
+ unsigned int *node)
{
int node_num;
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 286edf1e231f..13a8537d8e8b 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -23,6 +23,7 @@
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/reboot.h>
+#include <linux/sched.h>
#include <asm/uaccess.h>
#include "stackglue.h"
@@ -102,6 +103,12 @@
#define OCFS2_TEXT_UUID_LEN 32
#define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2
#define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8
+#define VERSION_LOCK "version_lock"
+
+enum ocfs2_connection_type {
+ WITH_CONTROLD,
+ NO_CONTROLD
+};
/*
* ocfs2_live_connection is refcounted because the filesystem and
@@ -110,6 +117,13 @@
struct ocfs2_live_connection {
struct list_head oc_list;
struct ocfs2_cluster_connection *oc_conn;
+ enum ocfs2_connection_type oc_type;
+ atomic_t oc_this_node;
+ int oc_our_slot;
+ struct dlm_lksb oc_version_lksb;
+ char oc_lvb[DLM_LVB_LEN];
+ struct completion oc_sync_wait;
+ wait_queue_head_t oc_wait;
};
struct ocfs2_control_private {
@@ -198,20 +212,15 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name)
* mount path. Since the VFS prevents multiple calls to
* fill_super(), we can't get dupes here.
*/
-static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn,
- struct ocfs2_live_connection **c_ret)
+static int ocfs2_live_connection_attach(struct ocfs2_cluster_connection *conn,
+ struct ocfs2_live_connection *c)
{
int rc = 0;
- struct ocfs2_live_connection *c;
-
- c = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
- if (!c)
- return -ENOMEM;
mutex_lock(&ocfs2_control_lock);
c->oc_conn = conn;
- if (atomic_read(&ocfs2_control_opened))
+ if ((c->oc_type == NO_CONTROLD) || atomic_read(&ocfs2_control_opened))
list_add(&c->oc_list, &ocfs2_live_connection_list);
else {
printk(KERN_ERR
@@ -220,12 +229,6 @@ static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn,
}
mutex_unlock(&ocfs2_control_lock);
-
- if (!rc)
- *c_ret = c;
- else
- kfree(c);
-
return rc;
}
@@ -799,18 +802,251 @@ static int fs_protocol_compare(struct ocfs2_protocol_version *existing,
return 0;
}
+static void lvb_to_version(char *lvb, struct ocfs2_protocol_version *ver)
+{
+ struct ocfs2_protocol_version *pv =
+ (struct ocfs2_protocol_version *)lvb;
+ /*
+ * ocfs2_protocol_version has two u8 variables, so we don't
+ * need any endian conversion.
+ */
+ ver->pv_major = pv->pv_major;
+ ver->pv_minor = pv->pv_minor;
+}
+
+static void version_to_lvb(struct ocfs2_protocol_version *ver, char *lvb)
+{
+ struct ocfs2_protocol_version *pv =
+ (struct ocfs2_protocol_version *)lvb;
+ /*
+ * ocfs2_protocol_version has two u8 variables, so we don't
+ * need any endian conversion.
+ */
+ pv->pv_major = ver->pv_major;
+ pv->pv_minor = ver->pv_minor;
+}
+
+static void sync_wait_cb(void *arg)
+{
+ struct ocfs2_cluster_connection *conn = arg;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+ complete(&lc->oc_sync_wait);
+}
+
+static int sync_unlock(struct ocfs2_cluster_connection *conn,
+ struct dlm_lksb *lksb, char *name)
+{
+ int error;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+
+ error = dlm_unlock(conn->cc_lockspace, lksb->sb_lkid, 0, lksb, conn);
+ if (error) {
+ printk(KERN_ERR "%s lkid %x error %d\n",
+ name, lksb->sb_lkid, error);
+ return error;
+ }
+
+ wait_for_completion(&lc->oc_sync_wait);
+
+ if (lksb->sb_status != -DLM_EUNLOCK) {
+ printk(KERN_ERR "%s lkid %x status %d\n",
+ name, lksb->sb_lkid, lksb->sb_status);
+ return -1;
+ }
+ return 0;
+}
+
+static int sync_lock(struct ocfs2_cluster_connection *conn,
+ int mode, uint32_t flags,
+ struct dlm_lksb *lksb, char *name)
+{
+ int error, status;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+
+ error = dlm_lock(conn->cc_lockspace, mode, lksb, flags,
+ name, strlen(name),
+ 0, sync_wait_cb, conn, NULL);
+ if (error) {
+ printk(KERN_ERR "%s lkid %x flags %x mode %d error %d\n",
+ name, lksb->sb_lkid, flags, mode, error);
+ return error;
+ }
+
+ wait_for_completion(&lc->oc_sync_wait);
+
+ status = lksb->sb_status;
+
+ if (status && status != -EAGAIN) {
+ printk(KERN_ERR "%s lkid %x flags %x mode %d status %d\n",
+ name, lksb->sb_lkid, flags, mode, status);
+ }
+
+ return status;
+}
+
+
+static int version_lock(struct ocfs2_cluster_connection *conn, int mode,
+ int flags)
+{
+ struct ocfs2_live_connection *lc = conn->cc_private;
+ return sync_lock(conn, mode, flags,
+ &lc->oc_version_lksb, VERSION_LOCK);
+}
+
+static int version_unlock(struct ocfs2_cluster_connection *conn)
+{
+ struct ocfs2_live_connection *lc = conn->cc_private;
+ return sync_unlock(conn, &lc->oc_version_lksb, VERSION_LOCK);
+}
+
+/* get_protocol_version()
+ *
+ * To exchange ocfs2 versioning, we use the LVB of the version dlm lock.
+ * The algorithm is:
+ * 1. Attempt to take the lock in EX mode (non-blocking).
+ * 2. If successful (which means it is the first mount), write the
+ * version number and downconvert to PR lock.
+ * 3. If unsuccessful (returns -EAGAIN), read the version from the LVB after
+ * taking the PR lock.
+ */
+
+static int get_protocol_version(struct ocfs2_cluster_connection *conn)
+{
+ int ret;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+ struct ocfs2_protocol_version pv;
+
+ running_proto.pv_major =
+ ocfs2_user_plugin.sp_max_proto.pv_major;
+ running_proto.pv_minor =
+ ocfs2_user_plugin.sp_max_proto.pv_minor;
+
+ lc->oc_version_lksb.sb_lvbptr = lc->oc_lvb;
+ ret = version_lock(conn, DLM_LOCK_EX,
+ DLM_LKF_VALBLK|DLM_LKF_NOQUEUE);
+ if (!ret) {
+ conn->cc_version.pv_major = running_proto.pv_major;
+ conn->cc_version.pv_minor = running_proto.pv_minor;
+ version_to_lvb(&running_proto, lc->oc_lvb);
+ version_lock(conn, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_VALBLK);
+ } else if (ret == -EAGAIN) {
+ ret = version_lock(conn, DLM_LOCK_PR, DLM_LKF_VALBLK);
+ if (ret)
+ goto out;
+ lvb_to_version(lc->oc_lvb, &pv);
+
+ if ((pv.pv_major != running_proto.pv_major) ||
+ (pv.pv_minor > running_proto.pv_minor)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ conn->cc_version.pv_major = pv.pv_major;
+ conn->cc_version.pv_minor = pv.pv_minor;
+ }
+out:
+ return ret;
+}
+
+static void user_recover_prep(void *arg)
+{
+}
+
+static void user_recover_slot(void *arg, struct dlm_slot *slot)
+{
+ struct ocfs2_cluster_connection *conn = arg;
+ printk(KERN_INFO "ocfs2: Node %d/%d down. Initiating recovery.\n",
+ slot->nodeid, slot->slot);
+ conn->cc_recovery_handler(slot->nodeid, conn->cc_recovery_data);
+
+}
+
+static void user_recover_done(void *arg, struct dlm_slot *slots,
+ int num_slots, int our_slot,
+ uint32_t generation)
+{
+ struct ocfs2_cluster_connection *conn = arg;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+ int i;
+
+ for (i = 0; i < num_slots; i++)
+ if (slots[i].slot == our_slot) {
+ atomic_set(&lc->oc_this_node, slots[i].nodeid);
+ break;
+ }
+
+ lc->oc_our_slot = our_slot;
+ wake_up(&lc->oc_wait);
+}
+
+static const struct dlm_lockspace_ops ocfs2_ls_ops = {
+ .recover_prep = user_recover_prep,
+ .recover_slot = user_recover_slot,
+ .recover_done = user_recover_done,
+};
+
+static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn)
+{
+ version_unlock(conn);
+ dlm_release_lockspace(conn->cc_lockspace, 2);
+ conn->cc_lockspace = NULL;
+ ocfs2_live_connection_drop(conn->cc_private);
+ conn->cc_private = NULL;
+ return 0;
+}
+
static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
{
dlm_lockspace_t *fsdlm;
- struct ocfs2_live_connection *uninitialized_var(control);
- int rc = 0;
+ struct ocfs2_live_connection *lc;
+ int rc, ops_rv;
BUG_ON(conn == NULL);
- rc = ocfs2_live_connection_new(conn, &control);
+ lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
+ if (!lc) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ init_waitqueue_head(&lc->oc_wait);
+ init_completion(&lc->oc_sync_wait);
+ atomic_set(&lc->oc_this_node, 0);
+ conn->cc_private = lc;
+ lc->oc_type = NO_CONTROLD;
+
+ rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name,
+ DLM_LSFL_FS, DLM_LVB_LEN,
+ &ocfs2_ls_ops, conn, &ops_rv, &fsdlm);
+ if (rc)
+ goto out;
+
+ if (ops_rv == -EOPNOTSUPP) {
+ lc->oc_type = WITH_CONTROLD;
+ printk(KERN_NOTICE "ocfs2: You seem to be using an older "
+ "version of dlm_controld and/or ocfs2-tools."
+ " Please consider upgrading.\n");
+ } else if (ops_rv) {
+ rc = ops_rv;
+ goto out;
+ }
+ conn->cc_lockspace = fsdlm;
+
+ rc = ocfs2_live_connection_attach(conn, lc);
if (rc)
goto out;
+ if (lc->oc_type == NO_CONTROLD) {
+ rc = get_protocol_version(conn);
+ if (rc) {
+ printk(KERN_ERR "ocfs2: Could not determine"
+ " locking version\n");
+ user_cluster_disconnect(conn);
+ goto out;
+ }
+ wait_event(lc->oc_wait, (atomic_read(&lc->oc_this_node) > 0));
+ }
+
/*
* running_proto must have been set before we allowed any mounts
* to proceed.
@@ -818,42 +1054,34 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
if (fs_protocol_compare(&running_proto, &conn->cc_version)) {
printk(KERN_ERR
"Unable to mount with fs locking protocol version "
- "%u.%u because the userspace control daemon has "
- "negotiated %u.%u\n",
+ "%u.%u because negotiated protocol is %u.%u\n",
conn->cc_version.pv_major, conn->cc_version.pv_minor,
running_proto.pv_major, running_proto.pv_minor);
rc = -EPROTO;
- ocfs2_live_connection_drop(control);
- goto out;
- }
-
- rc = dlm_new_lockspace(conn->cc_name, NULL, DLM_LSFL_FS, DLM_LVB_LEN,
- NULL, NULL, NULL, &fsdlm);
- if (rc) {
- ocfs2_live_connection_drop(control);
- goto out;
+ ocfs2_live_connection_drop(lc);
+ lc = NULL;
}
- conn->cc_private = control;
- conn->cc_lockspace = fsdlm;
out:
+ if (rc && lc)
+ kfree(lc);
return rc;
}
-static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn)
-{
- dlm_release_lockspace(conn->cc_lockspace, 2);
- conn->cc_lockspace = NULL;
- ocfs2_live_connection_drop(conn->cc_private);
- conn->cc_private = NULL;
- return 0;
-}
-static int user_cluster_this_node(unsigned int *this_node)
+static int user_cluster_this_node(struct ocfs2_cluster_connection *conn,
+ unsigned int *this_node)
{
int rc;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+
+ if (lc->oc_type == WITH_CONTROLD)
+ rc = ocfs2_control_get_this_node();
+ else if (lc->oc_type == NO_CONTROLD)
+ rc = atomic_read(&lc->oc_this_node);
+ else
+ rc = -EINVAL;
- rc = ocfs2_control_get_this_node();
if (rc < 0)
return rc;
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index cb7ec0b63ddc..1324e6600e57 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -309,6 +309,8 @@ int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
EXPORT_SYMBOL_GPL(ocfs2_plock);
int ocfs2_cluster_connect(const char *stack_name,
+ const char *cluster_name,
+ int cluster_name_len,
const char *group,
int grouplen,
struct ocfs2_locking_protocol *lproto,
@@ -342,8 +344,10 @@ int ocfs2_cluster_connect(const char *stack_name,
goto out;
}
- memcpy(new_conn->cc_name, group, grouplen);
+ strlcpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1);
new_conn->cc_namelen = grouplen;
+ strlcpy(new_conn->cc_cluster_name, cluster_name, CLUSTER_NAME_MAX + 1);
+ new_conn->cc_cluster_name_len = cluster_name_len;
new_conn->cc_recovery_handler = recovery_handler;
new_conn->cc_recovery_data = recovery_data;
@@ -386,8 +390,9 @@ int ocfs2_cluster_connect_agnostic(const char *group,
if (cluster_stack_name[0])
stack_name = cluster_stack_name;
- return ocfs2_cluster_connect(stack_name, group, grouplen, lproto,
- recovery_handler, recovery_data, conn);
+ return ocfs2_cluster_connect(stack_name, NULL, 0, group, grouplen,
+ lproto, recovery_handler, recovery_data,
+ conn);
}
EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic);
@@ -460,9 +465,10 @@ void ocfs2_cluster_hangup(const char *group, int grouplen)
}
EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup);
-int ocfs2_cluster_this_node(unsigned int *node)
+int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn,
+ unsigned int *node)
{
- return active_stack->sp_ops->this_node(node);
+ return active_stack->sp_ops->this_node(conn, node);
}
EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node);
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 1ec56fdb8d0d..66334a30cea8 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -45,6 +45,9 @@ struct file_lock;
*/
#define GROUP_NAME_MAX 64
+/* This shadows OCFS2_CLUSTER_NAME_LEN */
+#define CLUSTER_NAME_MAX 16
+
/*
* ocfs2_protocol_version changes when ocfs2 does something different in
@@ -97,8 +100,10 @@ struct ocfs2_locking_protocol {
* locking compatibility.
*/
struct ocfs2_cluster_connection {
- char cc_name[GROUP_NAME_MAX];
+ char cc_name[GROUP_NAME_MAX + 1];
int cc_namelen;
+ char cc_cluster_name[CLUSTER_NAME_MAX + 1];
+ int cc_cluster_name_len;
struct ocfs2_protocol_version cc_version;
struct ocfs2_locking_protocol *cc_proto;
void (*cc_recovery_handler)(int node_num, void *recovery_data);
@@ -152,7 +157,8 @@ struct ocfs2_stack_operations {
* ->this_node() returns the cluster's unique identifier for the
* local node.
*/
- int (*this_node)(unsigned int *node);
+ int (*this_node)(struct ocfs2_cluster_connection *conn,
+ unsigned int *node);
/*
* Call the underlying dlm lock function. The ->dlm_lock()
@@ -239,6 +245,8 @@ struct ocfs2_stack_plugin {
/* Used by the filesystem */
int ocfs2_cluster_connect(const char *stack_name,
+ const char *cluster_name,
+ int cluster_name_len,
const char *group,
int grouplen,
struct ocfs2_locking_protocol *lproto,
@@ -260,7 +268,8 @@ int ocfs2_cluster_connect_agnostic(const char *group,
int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
int hangup_pending);
void ocfs2_cluster_hangup(const char *group, int grouplen);
-int ocfs2_cluster_this_node(unsigned int *node);
+int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn,
+ unsigned int *node);
struct ocfs2_lock_res;
int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 2c91452c4047..47ae2663a6f5 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -113,12 +113,6 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
struct ocfs2_suballoc_result *res);
static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
int nr);
-static inline int ocfs2_block_group_set_bits(handle_t *handle,
- struct inode *alloc_inode,
- struct ocfs2_group_desc *bg,
- struct buffer_head *group_bh,
- unsigned int bit_off,
- unsigned int num_bits);
static int ocfs2_relink_block_group(handle_t *handle,
struct inode *alloc_inode,
struct buffer_head *fe_bh,
@@ -1343,7 +1337,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
return status;
}
-static inline int ocfs2_block_group_set_bits(handle_t *handle,
+int ocfs2_block_group_set_bits(handle_t *handle,
struct inode *alloc_inode,
struct ocfs2_group_desc *bg,
struct buffer_head *group_bh,
@@ -1388,8 +1382,6 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
ocfs2_journal_dirty(handle, group_bh);
bail:
- if (status)
- mlog_errno(status);
return status;
}
@@ -1588,7 +1580,7 @@ static int ocfs2_block_group_search(struct inode *inode,
return ret;
}
-static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
+int ocfs2_alloc_dinode_update_counts(struct inode *inode,
handle_t *handle,
struct buffer_head *di_bh,
u32 num_bits,
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index a36d0aa50911..218d8036b3e7 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -86,6 +86,18 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
u32 bits_wanted,
struct ocfs2_alloc_context **ac);
+int ocfs2_alloc_dinode_update_counts(struct inode *inode,
+ handle_t *handle,
+ struct buffer_head *di_bh,
+ u32 num_bits,
+ u16 chain);
+int ocfs2_block_group_set_bits(handle_t *handle,
+ struct inode *alloc_inode,
+ struct ocfs2_group_desc *bg,
+ struct buffer_head *group_bh,
+ unsigned int bit_off,
+ unsigned int num_bits);
+
int ocfs2_claim_metadata(handle_t *handle,
struct ocfs2_alloc_context *ac,
u32 bits_wanted,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index c41492957aa5..49d84f80f36c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -68,7 +68,6 @@
#include "super.h"
#include "sysfile.h"
#include "uptodate.h"
-#include "ver.h"
#include "xattr.h"
#include "quota.h"
#include "refcounttree.h"
@@ -90,6 +89,7 @@ static struct dentry *ocfs2_debugfs_root = NULL;
MODULE_AUTHOR("Oracle");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("OCFS2 cluster file system");
struct mount_options
{
@@ -1618,8 +1618,6 @@ static int __init ocfs2_init(void)
{
int status, i;
- ocfs2_print_version();
-
for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++)
init_waitqueue_head(&ocfs2__ioend_wq[i]);
@@ -1947,11 +1945,15 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
ocfs2_shutdown_local_alloc(osb);
- ocfs2_truncate_log_shutdown(osb);
-
/* This will disable recovery and flush any recovery work. */
ocfs2_recovery_exit(osb);
+ /*
+ * During dismount, when it recovers another node it will call
+ * ocfs2_recover_orphans and queue delayed work osb_truncate_log_wq.
+ */
+ ocfs2_truncate_log_shutdown(osb);
+
ocfs2_journal_shutdown(osb);
ocfs2_sync_blockdev(sb);
@@ -2225,10 +2227,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
if (ocfs2_clusterinfo_valid(osb)) {
osb->osb_stackflags =
OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags;
- memcpy(osb->osb_cluster_stack,
+ strlcpy(osb->osb_cluster_stack,
OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
- OCFS2_STACK_LABEL_LEN);
- osb->osb_cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
+ OCFS2_STACK_LABEL_LEN + 1);
if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) {
mlog(ML_ERROR,
"couldn't mount because of an invalid "
@@ -2237,6 +2238,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
status = -EINVAL;
goto bail;
}
+ strlcpy(osb->osb_cluster_name,
+ OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster,
+ OCFS2_CLUSTER_NAME_LEN + 1);
} else {
/* The empty string is identical with classic tools that
* don't know about s_cluster_info. */
diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c
deleted file mode 100644
index e2488f4128a2..000000000000
--- a/fs/ocfs2/ver.c
+++ /dev/null
@@ -1,43 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ver.c
- *
- * version string
- *
- * Copyright (C) 2002, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-
-#include "ver.h"
-
-#define OCFS2_BUILD_VERSION "1.5.0"
-
-#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION
-
-void ocfs2_print_version(void)
-{
- printk(KERN_INFO "%s\n", VERSION_STR);
-}
-
-MODULE_DESCRIPTION(VERSION_STR);
-
-MODULE_VERSION(OCFS2_BUILD_VERSION);
diff --git a/fs/ocfs2/ver.h b/fs/ocfs2/ver.h
deleted file mode 100644
index d7395cb91d2f..000000000000
--- a/fs/ocfs2/ver.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ver.h
- *
- * Function prototypes
- *
- * Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef OCFS2_VER_H
-#define OCFS2_VER_H
-
-void ocfs2_print_version(void);
-
-#endif /* OCFS2_VER_H */
diff --git a/fs/pipe.c b/fs/pipe.c
index 0e0752ef2715..78fd0d0788db 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -663,10 +663,11 @@ out:
wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}
- if (ret > 0) {
+ if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
int err = file_update_time(filp);
if (err)
ret = err;
+ sb_end_write(file_inode(filp)->i_sb);
}
return ret;
}
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 8bd2135b7f82..551e61ba15b6 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -22,11 +22,80 @@
#include <linux/errno.h>
-EXPORT_SYMBOL(posix_acl_init);
-EXPORT_SYMBOL(posix_acl_alloc);
-EXPORT_SYMBOL(posix_acl_valid);
-EXPORT_SYMBOL(posix_acl_equiv_mode);
-EXPORT_SYMBOL(posix_acl_from_mode);
+struct posix_acl **acl_by_type(struct inode *inode, int type)
+{
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ return &inode->i_acl;
+ case ACL_TYPE_DEFAULT:
+ return &inode->i_default_acl;
+ default:
+ BUG();
+ }
+}
+EXPORT_SYMBOL(acl_by_type);
+
+struct posix_acl *get_cached_acl(struct inode *inode, int type)
+{
+ struct posix_acl **p = acl_by_type(inode, type);
+ struct posix_acl *acl = ACCESS_ONCE(*p);
+ if (acl) {
+ spin_lock(&inode->i_lock);
+ acl = *p;
+ if (acl != ACL_NOT_CACHED)
+ acl = posix_acl_dup(acl);
+ spin_unlock(&inode->i_lock);
+ }
+ return acl;
+}
+EXPORT_SYMBOL(get_cached_acl);
+
+struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type)
+{
+ return rcu_dereference(*acl_by_type(inode, type));
+}
+EXPORT_SYMBOL(get_cached_acl_rcu);
+
+void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl)
+{
+ struct posix_acl **p = acl_by_type(inode, type);
+ struct posix_acl *old;
+ spin_lock(&inode->i_lock);
+ old = *p;
+ rcu_assign_pointer(*p, posix_acl_dup(acl));
+ spin_unlock(&inode->i_lock);
+ if (old != ACL_NOT_CACHED)
+ posix_acl_release(old);
+}
+EXPORT_SYMBOL(set_cached_acl);
+
+void forget_cached_acl(struct inode *inode, int type)
+{
+ struct posix_acl **p = acl_by_type(inode, type);
+ struct posix_acl *old;
+ spin_lock(&inode->i_lock);
+ old = *p;
+ *p = ACL_NOT_CACHED;
+ spin_unlock(&inode->i_lock);
+ if (old != ACL_NOT_CACHED)
+ posix_acl_release(old);
+}
+EXPORT_SYMBOL(forget_cached_acl);
+
+void forget_all_cached_acls(struct inode *inode)
+{
+ struct posix_acl *old_access, *old_default;
+ spin_lock(&inode->i_lock);
+ old_access = inode->i_acl;
+ old_default = inode->i_default_acl;
+ inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
+ spin_unlock(&inode->i_lock);
+ if (old_access != ACL_NOT_CACHED)
+ posix_acl_release(old_access);
+ if (old_default != ACL_NOT_CACHED)
+ posix_acl_release(old_default);
+}
+EXPORT_SYMBOL(forget_all_cached_acls);
/*
* Init a fresh posix_acl
@@ -37,6 +106,7 @@ posix_acl_init(struct posix_acl *acl, int count)
atomic_set(&acl->a_refcount, 1);
acl->a_count = count;
}
+EXPORT_SYMBOL(posix_acl_init);
/*
* Allocate a new ACL with the specified number of entries.
@@ -51,6 +121,7 @@ posix_acl_alloc(int count, gfp_t flags)
posix_acl_init(acl, count);
return acl;
}
+EXPORT_SYMBOL(posix_acl_alloc);
/*
* Clone an ACL.
@@ -78,8 +149,6 @@ posix_acl_valid(const struct posix_acl *acl)
{
const struct posix_acl_entry *pa, *pe;
int state = ACL_USER_OBJ;
- kuid_t prev_uid = INVALID_UID;
- kgid_t prev_gid = INVALID_GID;
int needs_mask = 0;
FOREACH_ACL_ENTRY(pa, acl, pe) {
@@ -98,10 +167,6 @@ posix_acl_valid(const struct posix_acl *acl)
return -EINVAL;
if (!uid_valid(pa->e_uid))
return -EINVAL;
- if (uid_valid(prev_uid) &&
- uid_lte(pa->e_uid, prev_uid))
- return -EINVAL;
- prev_uid = pa->e_uid;
needs_mask = 1;
break;
@@ -117,10 +182,6 @@ posix_acl_valid(const struct posix_acl *acl)
return -EINVAL;
if (!gid_valid(pa->e_gid))
return -EINVAL;
- if (gid_valid(prev_gid) &&
- gid_lte(pa->e_gid, prev_gid))
- return -EINVAL;
- prev_gid = pa->e_gid;
needs_mask = 1;
break;
@@ -146,6 +207,7 @@ posix_acl_valid(const struct posix_acl *acl)
return 0;
return -EINVAL;
}
+EXPORT_SYMBOL(posix_acl_valid);
/*
* Returns 0 if the acl can be exactly represented in the traditional
@@ -186,6 +248,7 @@ posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p)
*mode_p = (*mode_p & ~S_IRWXUGO) | mode;
return not_equiv;
}
+EXPORT_SYMBOL(posix_acl_equiv_mode);
/*
* Create an ACL representing the file mode permission bits of an inode.
@@ -207,6 +270,7 @@ posix_acl_from_mode(umode_t mode, gfp_t flags)
acl->a_entries[2].e_perm = (mode & S_IRWXO);
return acl;
}
+EXPORT_SYMBOL(posix_acl_from_mode);
/*
* Return 0 if current is granted want access to the inode
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 1bd2077187fd..656e401794de 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -140,24 +140,15 @@ static const char * const task_state_array[] = {
"t (tracing stop)", /* 8 */
"Z (zombie)", /* 16 */
"X (dead)", /* 32 */
- "x (dead)", /* 64 */
- "K (wakekill)", /* 128 */
- "W (waking)", /* 256 */
- "P (parked)", /* 512 */
};
static inline const char *get_task_state(struct task_struct *tsk)
{
- unsigned int state = (tsk->state & TASK_REPORT) | tsk->exit_state;
- const char * const *p = &task_state_array[0];
+ unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT;
- BUILD_BUG_ON(1 + ilog2(TASK_STATE_MAX) != ARRAY_SIZE(task_state_array));
+ BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1);
- while (state) {
- p++;
- state >>= 1;
- }
- return *p;
+ return task_state_array[fls(state)];
}
static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
@@ -453,8 +444,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
min_flt += t->min_flt;
maj_flt += t->maj_flt;
gtime += task_gtime(t);
- t = next_thread(t);
- } while (t != task);
+ } while_each_thread(task, t);
min_flt += sig->min_flt;
maj_flt += sig->maj_flt;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 03c8d747be48..51507065263b 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1658,13 +1658,18 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags)
return 0;
}
+static inline bool proc_inode_is_dead(struct inode *inode)
+{
+ return !proc_pid(inode)->tasks[PIDTYPE_PID].first;
+}
+
int pid_delete_dentry(const struct dentry *dentry)
{
/* Is the task we represent dead?
* If so, then don't put the dentry on the lru list,
* kill it immediately.
*/
- return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
+ return proc_inode_is_dead(dentry->d_inode);
}
const struct dentry_operations pid_dentry_operations =
@@ -3092,34 +3097,42 @@ out_no_task:
* In the case of a seek we start with the leader and walk nr
* threads past it.
*/
-static struct task_struct *first_tid(struct task_struct *leader,
- int tid, int nr, struct pid_namespace *ns)
+static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos,
+ struct pid_namespace *ns)
{
- struct task_struct *pos;
+ struct task_struct *pos, *task;
+ unsigned long nr = f_pos;
+
+ if (nr != f_pos) /* 32bit overflow? */
+ return NULL;
rcu_read_lock();
- /* Attempt to start with the pid of a thread */
- if (tid && (nr > 0)) {
+ task = pid_task(pid, PIDTYPE_PID);
+ if (!task)
+ goto fail;
+
+ /* Attempt to start with the tid of a thread */
+ if (tid && nr) {
pos = find_task_by_pid_ns(tid, ns);
- if (pos && (pos->group_leader == leader))
+ if (pos && same_thread_group(pos, task))
goto found;
}
/* If nr exceeds the number of threads there is nothing todo */
- pos = NULL;
- if (nr && nr >= get_nr_threads(leader))
- goto out;
+ if (nr >= get_nr_threads(task))
+ goto fail;
/* If we haven't found our starting place yet start
* with the leader and walk nr threads forward.
*/
- for (pos = leader; nr > 0; --nr) {
- pos = next_thread(pos);
- if (pos == leader) {
- pos = NULL;
- goto out;
- }
- }
+ pos = task = task->group_leader;
+ do {
+ if (!nr--)
+ goto found;
+ } while_each_thread(task, pos);
+fail:
+ pos = NULL;
+ goto out;
found:
get_task_struct(pos);
out:
@@ -3152,25 +3165,16 @@ static struct task_struct *next_tid(struct task_struct *start)
/* for the /proc/TGID/task/ directories */
static int proc_task_readdir(struct file *file, struct dir_context *ctx)
{
- struct task_struct *leader = NULL;
- struct task_struct *task = get_proc_task(file_inode(file));
+ struct inode *inode = file_inode(file);
+ struct task_struct *task;
struct pid_namespace *ns;
int tid;
- if (!task)
- return -ENOENT;
- rcu_read_lock();
- if (pid_alive(task)) {
- leader = task->group_leader;
- get_task_struct(leader);
- }
- rcu_read_unlock();
- put_task_struct(task);
- if (!leader)
+ if (proc_inode_is_dead(inode))
return -ENOENT;
if (!dir_emit_dots(file, ctx))
- goto out;
+ return 0;
/* f_version caches the tgid value that the last readdir call couldn't
* return. lseek aka telldir automagically resets f_version to 0.
@@ -3178,7 +3182,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
ns = file->f_dentry->d_sb->s_fs_info;
tid = (int)file->f_version;
file->f_version = 0;
- for (task = first_tid(leader, tid, ctx->pos - 2, ns);
+ for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
task;
task = next_tid(task), ctx->pos++) {
char name[PROC_NUMBUF];
@@ -3194,8 +3198,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
break;
}
}
-out:
- put_task_struct(leader);
+
return 0;
}
diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c
index 82676e3fcd1d..cbd82dff7e81 100644
--- a/fs/proc/cmdline.c
+++ b/fs/proc/cmdline.c
@@ -26,4 +26,4 @@ static int __init proc_cmdline_init(void)
proc_create("cmdline", 0, NULL, &cmdline_proc_fops);
return 0;
}
-module_init(proc_cmdline_init);
+fs_initcall(proc_cmdline_init);
diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c
index 51942d5abcec..290ba85cb900 100644
--- a/fs/proc/consoles.c
+++ b/fs/proc/consoles.c
@@ -109,4 +109,4 @@ static int __init proc_consoles_init(void)
proc_create("consoles", 0, NULL, &proc_consoles_operations);
return 0;
}
-module_init(proc_consoles_init);
+fs_initcall(proc_consoles_init);
diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c
index 5a1e539a234b..06f4d31e0396 100644
--- a/fs/proc/cpuinfo.c
+++ b/fs/proc/cpuinfo.c
@@ -21,4 +21,4 @@ static int __init proc_cpuinfo_init(void)
proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations);
return 0;
}
-module_init(proc_cpuinfo_init);
+fs_initcall(proc_cpuinfo_init);
diff --git a/fs/proc/devices.c b/fs/proc/devices.c
index b14347167c35..50493edc30e5 100644
--- a/fs/proc/devices.c
+++ b/fs/proc/devices.c
@@ -67,4 +67,4 @@ static int __init proc_devices_init(void)
proc_create("devices", 0, NULL, &proc_devinfo_operations);
return 0;
}
-module_init(proc_devices_init);
+fs_initcall(proc_devices_init);
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index cca93b6fb9a9..b7f268eb5f45 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -49,8 +49,7 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
setattr_copy(inode, iattr);
mark_inode_dirty(inode);
- de->uid = inode->i_uid;
- de->gid = inode->i_gid;
+ proc_set_user(de, inode->i_uid, inode->i_gid);
de->mode = inode->i_mode;
return 0;
}
diff --git a/fs/proc/interrupts.c b/fs/proc/interrupts.c
index 05029c0e2f24..a352d5703b41 100644
--- a/fs/proc/interrupts.c
+++ b/fs/proc/interrupts.c
@@ -50,4 +50,4 @@ static int __init proc_interrupts_init(void)
proc_create("interrupts", 0, NULL, &proc_interrupts_operations);
return 0;
}
-module_init(proc_interrupts_init);
+fs_initcall(proc_interrupts_init);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 5ed0e52d6aa0..39e6ef32f0bd 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -639,4 +639,4 @@ static int __init proc_kcore_init(void)
return 0;
}
-module_init(proc_kcore_init);
+fs_initcall(proc_kcore_init);
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index bdfabdaefdce..05f8dcdb086e 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -61,4 +61,4 @@ static int __init proc_kmsg_init(void)
proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations);
return 0;
}
-module_init(proc_kmsg_init);
+fs_initcall(proc_kmsg_init);
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index 1afa4dd4cae2..aec66e6c2060 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -42,4 +42,4 @@ static int __init proc_loadavg_init(void)
proc_create("loadavg", 0, NULL, &loadavg_proc_fops);
return 0;
}
-module_init(proc_loadavg_init);
+fs_initcall(proc_loadavg_init);
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index a77d2b299199..136e548d9567 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -26,7 +26,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
unsigned long committed;
struct vmalloc_info vmi;
long cached;
+ long available;
+ unsigned long pagecache;
+ unsigned long wmark_low = 0;
unsigned long pages[NR_LRU_LISTS];
+ struct zone *zone;
int lru;
/*
@@ -47,12 +51,44 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
pages[lru] = global_page_state(NR_LRU_BASE + lru);
+ for_each_zone(zone)
+ wmark_low += zone->watermark[WMARK_LOW];
+
+ /*
+ * Estimate the amount of memory available for userspace allocations,
+ * without causing swapping.
+ *
+ * Free memory cannot be taken below the low watermark, before the
+ * system starts swapping.
+ */
+ available = i.freeram - wmark_low;
+
+ /*
+ * Not all the page cache can be freed, otherwise the system will
+ * start swapping. Assume at least half of the page cache, or the
+ * low watermark worth of cache, needs to stay.
+ */
+ pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
+ pagecache -= min(pagecache / 2, wmark_low);
+ available += pagecache;
+
+ /*
+ * Part of the reclaimable swap consists of items that are in use,
+ * and cannot be freed. Cap this estimate at the low watermark.
+ */
+ available += global_page_state(NR_SLAB_RECLAIMABLE) -
+ min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
+
+ if (available < 0)
+ available = 0;
+
/*
* Tagged format, for easy grepping and expansion.
*/
seq_printf(m,
"MemTotal: %8lu kB\n"
"MemFree: %8lu kB\n"
+ "MemAvailable: %8lu kB\n"
"Buffers: %8lu kB\n"
"Cached: %8lu kB\n"
"SwapCached: %8lu kB\n"
@@ -105,6 +141,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
,
K(i.totalram),
K(i.freeram),
+ K(available),
K(i.bufferram),
K(cached),
K(total_swapcache_pages()),
@@ -183,4 +220,4 @@ static int __init proc_meminfo_init(void)
proc_create("meminfo", 0, NULL, &meminfo_proc_fops);
return 0;
}
-module_init(proc_meminfo_init);
+fs_initcall(proc_meminfo_init);
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 5f9bc8a746c9..d4a35746cab9 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -131,4 +131,4 @@ static int __init proc_nommu_init(void)
return 0;
}
-module_init(proc_nommu_init);
+fs_initcall(proc_nommu_init);
diff --git a/fs/proc/page.c b/fs/proc/page.c
index b8730d9ebaee..02174a610315 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -118,10 +118,12 @@ u64 stable_page_flags(struct page *page)
/*
* PageTransCompound can be true for non-huge compound pages (slab
* pages or pages allocated by drivers with __GFP_COMP) because it
- * just checks PG_head/PG_tail, so we need to check PageLRU to make
- * sure a given page is a thp, not a non-huge compound page.
+ * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon
+ * to make sure a given page is a thp, not a non-huge compound page.
*/
- else if (PageTransCompound(page) && PageLRU(compound_trans_head(page)))
+ else if (PageTransCompound(page) &&
+ (PageLRU(compound_trans_head(page)) ||
+ PageAnon(compound_trans_head(page))))
u |= 1 << KPF_THP;
/*
@@ -217,4 +219,4 @@ static int __init proc_page_init(void)
proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations);
return 0;
}
-module_init(proc_page_init);
+fs_initcall(proc_page_init);
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index 70779b2fc209..c82dd5147845 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -74,9 +74,9 @@ __proc_device_tree_add_prop(struct proc_dir_entry *de, struct property *pp,
return NULL;
if (!strncmp(name, "security-", 9))
- ent->size = 0; /* don't leak number of password chars */
+ proc_set_size(ent, 0); /* don't leak number of password chars */
else
- ent->size = pp->length;
+ proc_set_size(ent, pp->length);
return ent;
}
@@ -232,6 +232,7 @@ void __init proc_device_tree_init(void)
return;
root = of_find_node_by_path("/");
if (root == NULL) {
+ remove_proc_entry("device-tree", NULL);
pr_debug("/proc/device-tree: can't find root\n");
return;
}
diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
index 62604be9f58d..ad8a77f94beb 100644
--- a/fs/proc/softirqs.c
+++ b/fs/proc/softirqs.c
@@ -41,4 +41,4 @@ static int __init proc_softirqs_init(void)
proc_create("softirqs", 0, NULL, &proc_softirqs_operations);
return 0;
}
-module_init(proc_softirqs_init);
+fs_initcall(proc_softirqs_init);
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 1cf86c0e8689..6f599c62f0cc 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -221,4 +221,4 @@ static int __init proc_stat_init(void)
proc_create("stat", 0, NULL, &proc_stat_operations);
return 0;
}
-module_init(proc_stat_init);
+fs_initcall(proc_stat_init);
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 061894625903..7141b8d0ca9e 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -49,4 +49,4 @@ static int __init proc_uptime_init(void)
proc_create("uptime", 0, NULL, &uptime_proc_fops);
return 0;
}
-module_init(proc_uptime_init);
+fs_initcall(proc_uptime_init);
diff --git a/fs/proc/version.c b/fs/proc/version.c
index 76817a60678c..d2154eb6d78f 100644
--- a/fs/proc/version.c
+++ b/fs/proc/version.c
@@ -31,4 +31,4 @@ static int __init proc_version_init(void)
proc_create("version", 0, NULL, &version_proc_fops);
return 0;
}
-module_init(proc_version_init);
+fs_initcall(proc_version_init);
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 9100d6959886..2ca7ba047f04 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -1082,7 +1082,7 @@ static int __init vmcore_init(void)
proc_vmcore->size = vmcore_size;
return 0;
}
-module_init(vmcore_init)
+fs_initcall(vmcore_init);
/* Cleanup function for vmcore module. */
void vmcore_cleanup(void)
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 439406e081af..7be26f03a3f5 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -234,17 +234,12 @@ static int mounts_open_common(struct inode *inode, struct file *file,
rcu_read_lock();
nsp = task_nsproxy(task);
- if (!nsp) {
+ if (!nsp || !nsp->mnt_ns) {
rcu_read_unlock();
put_task_struct(task);
goto err;
}
ns = nsp->mnt_ns;
- if (!ns) {
- rcu_read_unlock();
- put_task_struct(task);
- goto err;
- }
get_mnt_ns(ns);
rcu_read_unlock();
task_lock(task);
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 4884ac5ae9be..1e56a4e8cf7c 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -30,13 +30,6 @@
#include "internal.h"
-const struct address_space_operations ramfs_aops = {
- .readpage = simple_readpage,
- .write_begin = simple_write_begin,
- .write_end = simple_write_end,
- .set_page_dirty = __set_page_dirty_no_writeback,
-};
-
const struct file_operations ramfs_file_operations = {
.read = do_sync_read,
.aio_read = generic_file_aio_read,
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 8d5b438cc188..0b3d8e4cb2fa 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -27,13 +27,12 @@
#include "internal.h"
static int ramfs_nommu_setattr(struct dentry *, struct iattr *);
-
-const struct address_space_operations ramfs_aops = {
- .readpage = simple_readpage,
- .write_begin = simple_write_begin,
- .write_end = simple_write_end,
- .set_page_dirty = __set_page_dirty_no_writeback,
-};
+static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
+ unsigned long addr,
+ unsigned long len,
+ unsigned long pgoff,
+ unsigned long flags);
+static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma);
const struct file_operations ramfs_file_operations = {
.mmap = ramfs_nommu_mmap,
@@ -197,7 +196,7 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
* - the pages to be mapped must exist
* - the pages be physically contiguous in sequence
*/
-unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
+static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
@@ -256,7 +255,7 @@ out:
/*
* set up a mapping for shared memory segments
*/
-int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma)
+static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma)
{
if (!(vma->vm_flags & VM_SHARED))
return -ENOSYS;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 39d14659a8d3..d365b1c4eb3c 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -43,6 +43,13 @@
static const struct super_operations ramfs_ops;
static const struct inode_operations ramfs_dir_inode_operations;
+static const struct address_space_operations ramfs_aops = {
+ .readpage = simple_readpage,
+ .write_begin = simple_write_begin,
+ .write_end = simple_write_end,
+ .set_page_dirty = __set_page_dirty_no_writeback,
+};
+
static struct backing_dev_info ramfs_backing_dev_info = {
.name = "ramfs",
.ra_pages = 0, /* No readahead */
@@ -275,4 +282,4 @@ int __init init_ramfs_fs(void)
return err;
}
-module_init(init_ramfs_fs)
+fs_initcall(init_ramfs_fs);
diff --git a/fs/ramfs/internal.h b/fs/ramfs/internal.h
index 6b330639b51d..a9d8ae88fa15 100644
--- a/fs/ramfs/internal.h
+++ b/fs/ramfs/internal.h
@@ -10,5 +10,4 @@
*/
-extern const struct address_space_operations ramfs_aops;
extern const struct inode_operations ramfs_file_inode_operations;
diff --git a/fs/read_write.c b/fs/read_write.c
index 58e440df1bc6..1193ffd03565 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -901,10 +901,6 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
io_fn_t fn;
iov_fn_t fnv;
- ret = -EFAULT;
- if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
- goto out;
-
ret = compat_rw_copy_check_uvector(type, uvector, nr_segs,
UIO_FASTIOV, iovstack, &iov);
if (ret <= 0)
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index f8adaee537c2..dfb617b2bad2 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -1958,8 +1958,6 @@ struct treepath var = {.path_length = ILLEGAL_PATH_ELEMENT_OFFSET, .reada = 0,}
#define MAX_US_INT 0xffff
// reiserfs version 2 has max offset 60 bits. Version 1 - 32 bit offset
-#define U32_MAX (~(__u32)0)
-
static inline loff_t max_reiserfs_offset(struct inode *inode)
{
if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5)
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index ff1d3d42e72a..d8418782862b 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -533,16 +533,14 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent)
root = romfs_iget(sb, pos);
if (IS_ERR(root))
- goto error;
+ return PTR_ERR(root);
sb->s_root = d_make_root(root);
if (!sb->s_root)
- goto error;
+ return -ENOMEM;
return 0;
-error:
- return -EINVAL;
error_rsb_inval:
ret = -EINVAL;
error_rsb:
diff --git a/fs/super.c b/fs/super.c
index e5f6c2cfac38..cecd780e0f44 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -166,6 +166,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
if (!s)
return NULL;
+ INIT_LIST_HEAD(&s->s_mounts);
+
if (security_sb_alloc(s))
goto fail;
@@ -188,7 +190,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
if (list_lru_init(&s->s_inode_lru))
goto fail;
- INIT_LIST_HEAD(&s->s_mounts);
init_rwsem(&s->s_umount);
lockdep_set_class(&s->s_umount, &type->s_umount_key);
/*
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index cc1febd8fadf..5157b866a853 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2118,26 +2118,10 @@ out_free:
*/
static void free_inodes(struct fsck_data *fsckd)
{
- struct rb_node *this = fsckd->inodes.rb_node;
- struct fsck_inode *fscki;
+ struct fsck_inode *fscki, *n;
- while (this) {
- if (this->rb_left)
- this = this->rb_left;
- else if (this->rb_right)
- this = this->rb_right;
- else {
- fscki = rb_entry(this, struct fsck_inode, rb);
- this = rb_parent(this);
- if (this) {
- if (this->rb_left == &fscki->rb)
- this->rb_left = NULL;
- else
- this->rb_right = NULL;
- }
- kfree(fscki);
- }
- }
+ rbtree_postorder_for_each_entry_safe(fscki, n, &fsckd->inodes, rb)
+ kfree(fscki);
}
/**
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 36bd4efd0819..a902c5919e42 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -574,27 +574,10 @@ static int done_already(struct rb_root *done_tree, int lnum)
*/
static void destroy_done_tree(struct rb_root *done_tree)
{
- struct rb_node *this = done_tree->rb_node;
- struct done_ref *dr;
+ struct done_ref *dr, *n;
- while (this) {
- if (this->rb_left) {
- this = this->rb_left;
- continue;
- } else if (this->rb_right) {
- this = this->rb_right;
- continue;
- }
- dr = rb_entry(this, struct done_ref, rb);
- this = rb_parent(this);
- if (this) {
- if (this->rb_left == &dr->rb)
- this->rb_left = NULL;
- else
- this->rb_right = NULL;
- }
+ rbtree_postorder_for_each_entry_safe(dr, n, done_tree, rb)
kfree(dr);
- }
}
/**
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index ba32da3fe08a..f1c3e5a1b315 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -815,27 +815,10 @@ static int dbg_find_check_orphan(struct rb_root *root, ino_t inum)
static void dbg_free_check_tree(struct rb_root *root)
{
- struct rb_node *this = root->rb_node;
- struct check_orphan *o;
+ struct check_orphan *o, *n;
- while (this) {
- if (this->rb_left) {
- this = this->rb_left;
- continue;
- } else if (this->rb_right) {
- this = this->rb_right;
- continue;
- }
- o = rb_entry(this, struct check_orphan, rb);
- this = rb_parent(this);
- if (this) {
- if (this->rb_left == &o->rb)
- this->rb_left = NULL;
- else
- this->rb_right = NULL;
- }
+ rbtree_postorder_for_each_entry_safe(o, n, root, rb)
kfree(o);
- }
}
static int dbg_orphan_check(struct ubifs_info *c, struct ubifs_zbranch *zbr,
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 065096e36ed9..c14adb2f420c 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -1335,29 +1335,14 @@ static void remove_ino(struct ubifs_info *c, ino_t inum)
*/
void ubifs_destroy_size_tree(struct ubifs_info *c)
{
- struct rb_node *this = c->size_tree.rb_node;
- struct size_entry *e;
+ struct size_entry *e, *n;
- while (this) {
- if (this->rb_left) {
- this = this->rb_left;
- continue;
- } else if (this->rb_right) {
- this = this->rb_right;
- continue;
- }
- e = rb_entry(this, struct size_entry, rb);
+ rbtree_postorder_for_each_entry_safe(e, n, &c->size_tree, rb) {
if (e->inode)
iput(e->inode);
- this = rb_parent(this);
- if (this) {
- if (this->rb_left == &e->rb)
- this->rb_left = NULL;
- else
- this->rb_right = NULL;
- }
kfree(e);
}
+
c->size_tree = RB_ROOT;
}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index f69daa514a57..5ded8490c0c6 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -873,26 +873,10 @@ static void free_orphans(struct ubifs_info *c)
*/
static void free_buds(struct ubifs_info *c)
{
- struct rb_node *this = c->buds.rb_node;
- struct ubifs_bud *bud;
-
- while (this) {
- if (this->rb_left)
- this = this->rb_left;
- else if (this->rb_right)
- this = this->rb_right;
- else {
- bud = rb_entry(this, struct ubifs_bud, rb);
- this = rb_parent(this);
- if (this) {
- if (this->rb_left == &bud->rb)
- this->rb_left = NULL;
- else
- this->rb_right = NULL;
- }
- kfree(bud);
- }
- }
+ struct ubifs_bud *bud, *n;
+
+ rbtree_postorder_for_each_entry_safe(bud, n, &c->buds, rb)
+ kfree(bud);
}
/**
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 349f31a30f40..9083bc7ed4ae 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -178,27 +178,11 @@ static int ins_clr_old_idx_znode(struct ubifs_info *c,
*/
void destroy_old_idx(struct ubifs_info *c)
{
- struct rb_node *this = c->old_idx.rb_node;
- struct ubifs_old_idx *old_idx;
+ struct ubifs_old_idx *old_idx, *n;
- while (this) {
- if (this->rb_left) {
- this = this->rb_left;
- continue;
- } else if (this->rb_right) {
- this = this->rb_right;
- continue;
- }
- old_idx = rb_entry(this, struct ubifs_old_idx, rb);
- this = rb_parent(this);
- if (this) {
- if (this->rb_left == &old_idx->rb)
- this->rb_left = NULL;
- else
- this->rb_right = NULL;
- }
+ rbtree_postorder_for_each_entry_safe(old_idx, n, &c->old_idx, rb)
kfree(old_idx);
- }
+
c->old_idx = RB_ROOT;
}
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index a7ea492ae660..59aa24dc0cdd 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -54,7 +54,7 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
if (ufs_fragnum(fragment) + count > uspi->s_fpg)
ufs_error (sb, "ufs_free_fragments", "internal error");
- mutex_lock(&UFS_SB(sb)->s_lock);
+ lock_ufs(sb);
cgno = ufs_dtog(uspi, fragment);
bit = ufs_dtogd(uspi, fragment);
@@ -118,12 +118,12 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
ubh_sync_block(UCPI_UBH(ucpi));
ufs_mark_sb_dirty(sb);
- mutex_unlock(&UFS_SB(sb)->s_lock);
+ unlock_ufs(sb);
UFSD("EXIT\n");
return;
failed:
- mutex_unlock(&UFS_SB(sb)->s_lock);
+ unlock_ufs(sb);
UFSD("EXIT (FAILED)\n");
return;
}
@@ -155,7 +155,7 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count)
goto failed;
}
- mutex_lock(&UFS_SB(sb)->s_lock);
+ lock_ufs(sb);
do_more:
overflow = 0;
@@ -215,12 +215,12 @@ do_more:
}
ufs_mark_sb_dirty(sb);
- mutex_unlock(&UFS_SB(sb)->s_lock);
+ unlock_ufs(sb);
UFSD("EXIT\n");
return;
failed_unlock:
- mutex_unlock(&UFS_SB(sb)->s_lock);
+ unlock_ufs(sb);
failed:
UFSD("EXIT (FAILED)\n");
return;
@@ -361,7 +361,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
usb1 = ubh_get_usb_first(uspi);
*err = -ENOSPC;
- mutex_lock(&UFS_SB(sb)->s_lock);
+ lock_ufs(sb);
tmp = ufs_data_ptr_to_cpu(sb, p);
if (count + ufs_fragnum(fragment) > uspi->s_fpb) {
@@ -382,19 +382,19 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
"fragment %llu, tmp %llu\n",
(unsigned long long)fragment,
(unsigned long long)tmp);
- mutex_unlock(&UFS_SB(sb)->s_lock);
+ unlock_ufs(sb);
return INVBLOCK;
}
if (fragment < UFS_I(inode)->i_lastfrag) {
UFSD("EXIT (ALREADY ALLOCATED)\n");
- mutex_unlock(&UFS_SB(sb)->s_lock);
+ unlock_ufs(sb);
return 0;
}
}
else {
if (tmp) {
UFSD("EXIT (ALREADY ALLOCATED)\n");
- mutex_unlock(&UFS_SB(sb)->s_lock);
+ unlock_ufs(sb);
return 0;
}
}
@@ -403,7 +403,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
* There is not enough space for user on the device
*/
if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(uspi, UFS_MINFREE) <= 0) {
- mutex_unlock(&UFS_SB(sb)->s_lock);
+ unlock_ufs(sb);
UFSD("EXIT (FAILED)\n");
return 0;
}
@@ -428,7 +428,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
ufs_clear_frags(inode, result + oldcount,
newcount - oldcount, locked_page != NULL);
}
- mutex_unlock(&UFS_SB(sb)->s_lock);
+ unlock_ufs(sb);
UFSD("EXIT, result %llu\n", (unsigned long long)result);
return result;
}
@@ -443,7 +443,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
fragment + count);
ufs_clear_frags(inode, result + oldcount, newcount - oldcount,
locked_page != NULL);
- mutex_unlock(&UFS_SB(sb)->s_lock);
+ unlock_ufs(sb);
UFSD("EXIT, result %llu\n", (unsigned long long)result);
return result;
}
@@ -481,7 +481,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
*err = 0;
UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag,
fragment + count);
- mutex_unlock(&UFS_SB(sb)->s_lock);
+ unlock_ufs(sb);
if (newcount < request)
ufs_free_fragments (inode, result + newcount, request - newcount);
ufs_free_fragments (inode, tmp, oldcount);
@@ -489,7 +489,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
return result;
}
- mutex_unlock(&UFS_SB(sb)->s_lock);
+ unlock_ufs(sb);
UFSD("EXIT (FAILED)\n");
return 0;
}
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index d0426d74817b..da5e52551850 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -71,11 +71,11 @@ void ufs_free_inode (struct inode * inode)
ino = inode->i_ino;
- mutex_lock(&UFS_SB(sb)->s_lock);
+ lock_ufs(sb);
if (!((ino > 1) && (ino < (uspi->s_ncg * uspi->s_ipg )))) {
ufs_warning(sb, "ufs_free_inode", "reserved inode or nonexistent inode %u\n", ino);
- mutex_unlock(&UFS_SB(sb)->s_lock);
+ unlock_ufs(sb);
return;
}
@@ -83,7 +83,7 @@ void ufs_free_inode (struct inode * inode)
bit = ufs_inotocgoff (ino);
ucpi = ufs_load_cylinder (sb, cg);
if (!ucpi) {
- mutex_unlock(&UFS_SB(sb)->s_lock);
+ unlock_ufs(sb);
return;
}
ucg = ubh_get_ucg(UCPI_UBH(ucpi));
@@ -117,7 +117,7 @@ void ufs_free_inode (struct inode * inode)
ubh_sync_block(UCPI_UBH(ucpi));
ufs_mark_sb_dirty(sb);
- mutex_unlock(&UFS_SB(sb)->s_lock);
+ unlock_ufs(sb);
UFSD("EXIT\n");
}
@@ -197,7 +197,7 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode)
uspi = sbi->s_uspi;
usb1 = ubh_get_usb_first(uspi);
- mutex_lock(&sbi->s_lock);
+ lock_ufs(sb);
/*
* Try to place the inode in its parent directory
@@ -332,21 +332,20 @@ cg_found:
sync_dirty_buffer(bh);
brelse(bh);
}
-
- mutex_unlock(&sbi->s_lock);
+ unlock_ufs(sb);
UFSD("allocating inode %lu\n", inode->i_ino);
UFSD("EXIT\n");
return inode;
fail_remove_inode:
- mutex_unlock(&sbi->s_lock);
+ unlock_ufs(sb);
clear_nlink(inode);
iput(inode);
UFSD("EXIT (FAILED): err %d\n", err);
return ERR_PTR(err);
failed:
- mutex_unlock(&sbi->s_lock);
+ unlock_ufs(sb);
make_bad_inode(inode);
iput (inode);
UFSD("EXIT (FAILED): err %d\n", err);
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 329f2f53b7ed..e5a993416fec 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -699,7 +699,6 @@ static int ufs_sync_fs(struct super_block *sb, int wait)
unsigned flags;
lock_ufs(sb);
- mutex_lock(&UFS_SB(sb)->s_lock);
UFSD("ENTER\n");
@@ -717,7 +716,6 @@ static int ufs_sync_fs(struct super_block *sb, int wait)
ufs_put_cstotal(sb);
UFSD("EXIT\n");
- mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return 0;
@@ -762,6 +760,7 @@ static void ufs_put_super(struct super_block *sb)
ubh_brelse_uspi (sbi->s_uspi);
kfree (sbi->s_uspi);
+ mutex_destroy(&sbi->mutex);
kfree (sbi);
sb->s_fs_info = NULL;
UFSD("EXIT\n");
@@ -805,7 +804,6 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
}
#endif
mutex_init(&sbi->mutex);
- mutex_init(&sbi->s_lock);
spin_lock_init(&sbi->work_lock);
INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs);
/*
@@ -1262,6 +1260,7 @@ failed:
if (ubh)
ubh_brelse_uspi (uspi);
kfree (uspi);
+ mutex_destroy(&sbi->mutex);
kfree(sbi);
sb->s_fs_info = NULL;
UFSD("EXIT (FAILED)\n");
@@ -1281,7 +1280,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
unsigned flags;
lock_ufs(sb);
- mutex_lock(&UFS_SB(sb)->s_lock);
uspi = UFS_SB(sb)->s_uspi;
flags = UFS_SB(sb)->s_flags;
usb1 = ubh_get_usb_first(uspi);
@@ -1295,7 +1293,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
new_mount_opt = 0;
ufs_set_opt (new_mount_opt, ONERROR_LOCK);
if (!ufs_parse_options (data, &new_mount_opt)) {
- mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return -EINVAL;
}
@@ -1303,14 +1300,12 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
new_mount_opt |= ufstype;
} else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
printk("ufstype can't be changed during remount\n");
- mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return -EINVAL;
}
if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
UFS_SB(sb)->s_mount_opt = new_mount_opt;
- mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return 0;
}
@@ -1335,7 +1330,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
#ifndef CONFIG_UFS_FS_WRITE
printk("ufs was compiled with read-only support, "
"can't be mounted as read-write\n");
- mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return -EINVAL;
#else
@@ -1345,13 +1339,11 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
ufstype != UFS_MOUNT_UFSTYPE_SUNx86 &&
ufstype != UFS_MOUNT_UFSTYPE_UFS2) {
printk("this ufstype is read-only supported\n");
- mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return -EINVAL;
}
if (!ufs_read_cylinder_structures(sb)) {
printk("failed during remounting\n");
- mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return -EPERM;
}
@@ -1359,7 +1351,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
#endif
}
UFS_SB(sb)->s_mount_opt = new_mount_opt;
- mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return 0;
}
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index ff2c15ab81aa..343e6fc571e5 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -24,7 +24,6 @@ struct ufs_sb_info {
int work_queued; /* non-zero if the delayed work is queued */
struct delayed_work sync_work; /* FS sync delayed work */
spinlock_t work_lock; /* protects sync_work and work_queued */
- struct mutex s_lock;
};
struct ufs_inode_info {
diff --git a/include/asm-generic/fixmap.h b/include/asm-generic/fixmap.h
new file mode 100644
index 000000000000..5a64ca4621f3
--- /dev/null
+++ b/include/asm-generic/fixmap.h
@@ -0,0 +1,97 @@
+/*
+ * fixmap.h: compile-time virtual memory allocation
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1998 Ingo Molnar
+ *
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ * x86_32 and x86_64 integration by Gustavo F. Padovan, February 2009
+ * Break out common bits to asm-generic by Mark Salter, November 2013
+ */
+
+#ifndef __ASM_GENERIC_FIXMAP_H
+#define __ASM_GENERIC_FIXMAP_H
+
+#include <linux/bug.h>
+
+#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
+#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
+
+#ifndef __ASSEMBLY__
+/*
+ * 'index to address' translation. If anyone tries to use the idx
+ * directly without translation, we catch the bug with a NULL-deference
+ * kernel oops. Illegal ranges of incoming indices are caught too.
+ */
+static __always_inline unsigned long fix_to_virt(const unsigned int idx)
+{
+ BUILD_BUG_ON(idx >= __end_of_fixed_addresses);
+ return __fix_to_virt(idx);
+}
+
+static inline unsigned long virt_to_fix(const unsigned long vaddr)
+{
+ BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
+ return __virt_to_fix(vaddr);
+}
+
+/*
+ * Provide some reasonable defaults for page flags.
+ * Not all architectures use all of these different types and some
+ * architectures use different names.
+ */
+#ifndef FIXMAP_PAGE_NORMAL
+#define FIXMAP_PAGE_NORMAL PAGE_KERNEL
+#endif
+#ifndef FIXMAP_PAGE_NOCACHE
+#define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_NOCACHE
+#endif
+#ifndef FIXMAP_PAGE_IO
+#define FIXMAP_PAGE_IO PAGE_KERNEL_IO
+#endif
+#ifndef FIXMAP_PAGE_CLEAR
+#define FIXMAP_PAGE_CLEAR __pgprot(0)
+#endif
+
+#ifndef set_fixmap
+#define set_fixmap(idx, phys) \
+ __set_fixmap(idx, phys, FIXMAP_PAGE_NORMAL)
+#endif
+
+#ifndef clear_fixmap
+#define clear_fixmap(idx) \
+ __set_fixmap(idx, 0, FIXMAP_PAGE_CLEAR)
+#endif
+
+/* Return a pointer with offset calculated */
+#define __set_fixmap_offset(idx, phys, flags) \
+({ \
+ unsigned long addr; \
+ __set_fixmap(idx, phys, flags); \
+ addr = fix_to_virt(idx) + ((phys) & (PAGE_SIZE - 1)); \
+ addr; \
+})
+
+#define set_fixmap_offset(idx, phys) \
+ __set_fixmap_offset(idx, phys, FIXMAP_PAGE_NORMAL)
+
+/*
+ * Some hardware wants to get fixmapped without caching.
+ */
+#define set_fixmap_nocache(idx, phys) \
+ __set_fixmap(idx, phys, FIXMAP_PAGE_NOCACHE)
+
+#define set_fixmap_offset_nocache(idx, phys) \
+ __set_fixmap_offset(idx, phys, FIXMAP_PAGE_NOCACHE)
+
+/*
+ * Some fixmaps are for IO
+ */
+#define set_fixmap_io(idx, phys) \
+ __set_fixmap(idx, phys, FIXMAP_PAGE_IO)
+
+#endif /* __ASSEMBLY__ */
+#endif /* __ASM_GENERIC_FIXMAP_H */
diff --git a/include/asm-generic/int-l64.h b/include/asm-generic/int-l64.h
deleted file mode 100644
index 27d4ec0dfce0..000000000000
--- a/include/asm-generic/int-l64.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * asm-generic/int-l64.h
- *
- * Integer declarations for architectures which use "long"
- * for 64-bit types.
- */
-#ifndef _ASM_GENERIC_INT_L64_H
-#define _ASM_GENERIC_INT_L64_H
-
-#include <uapi/asm-generic/int-l64.h>
-
-
-#ifndef __ASSEMBLY__
-
-typedef signed char s8;
-typedef unsigned char u8;
-
-typedef signed short s16;
-typedef unsigned short u16;
-
-typedef signed int s32;
-typedef unsigned int u32;
-
-typedef signed long s64;
-typedef unsigned long u64;
-
-#define S8_C(x) x
-#define U8_C(x) x ## U
-#define S16_C(x) x
-#define U16_C(x) x ## U
-#define S32_C(x) x
-#define U32_C(x) x ## U
-#define S64_C(x) x ## L
-#define U64_C(x) x ## UL
-
-#else /* __ASSEMBLY__ */
-
-#define S8_C(x) x
-#define U8_C(x) x
-#define S16_C(x) x
-#define U16_C(x) x
-#define S32_C(x) x
-#define U32_C(x) x
-#define S64_C(x) x
-#define U64_C(x) x
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* _ASM_GENERIC_INT_L64_H */
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index f1f07d31a3af..2fae55def608 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -5,6 +5,7 @@
#define _LINUX_BOOTMEM_H
#include <linux/mmzone.h>
+#include <linux/mm_types.h>
#include <asm/dma.h>
/*
@@ -52,7 +53,6 @@ extern void free_bootmem_node(pg_data_t *pgdat,
unsigned long size);
extern void free_bootmem(unsigned long physaddr, unsigned long size);
extern void free_bootmem_late(unsigned long physaddr, unsigned long size);
-extern void __free_pages_bootmem(struct page *page, unsigned int order);
/*
* Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE,
@@ -142,6 +142,157 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
#define alloc_bootmem_low_pages_node(pgdat, x) \
__alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0)
+
+#if defined(CONFIG_HAVE_MEMBLOCK) && defined(CONFIG_NO_BOOTMEM)
+
+/* FIXME: use MEMBLOCK_ALLOC_* variants here */
+#define BOOTMEM_ALLOC_ACCESSIBLE 0
+#define BOOTMEM_ALLOC_ANYWHERE (~(phys_addr_t)0)
+
+/* FIXME: Move to memblock.h at a point where we remove nobootmem.c */
+void *memblock_virt_alloc_try_nid_nopanic(phys_addr_t size,
+ phys_addr_t align, phys_addr_t min_addr,
+ phys_addr_t max_addr, int nid);
+void *memblock_virt_alloc_try_nid(phys_addr_t size, phys_addr_t align,
+ phys_addr_t min_addr, phys_addr_t max_addr, int nid);
+void __memblock_free_early(phys_addr_t base, phys_addr_t size);
+void __memblock_free_late(phys_addr_t base, phys_addr_t size);
+
+static inline void * __init memblock_virt_alloc(
+ phys_addr_t size, phys_addr_t align)
+{
+ return memblock_virt_alloc_try_nid(size, align, BOOTMEM_LOW_LIMIT,
+ BOOTMEM_ALLOC_ACCESSIBLE,
+ NUMA_NO_NODE);
+}
+
+static inline void * __init memblock_virt_alloc_nopanic(
+ phys_addr_t size, phys_addr_t align)
+{
+ return memblock_virt_alloc_try_nid_nopanic(size, align,
+ BOOTMEM_LOW_LIMIT,
+ BOOTMEM_ALLOC_ACCESSIBLE,
+ NUMA_NO_NODE);
+}
+
+static inline void * __init memblock_virt_alloc_from_nopanic(
+ phys_addr_t size, phys_addr_t align, phys_addr_t min_addr)
+{
+ return memblock_virt_alloc_try_nid_nopanic(size, align, min_addr,
+ BOOTMEM_ALLOC_ACCESSIBLE,
+ NUMA_NO_NODE);
+}
+
+static inline void * __init memblock_virt_alloc_node(
+ phys_addr_t size, int nid)
+{
+ return memblock_virt_alloc_try_nid(size, 0, BOOTMEM_LOW_LIMIT,
+ BOOTMEM_ALLOC_ACCESSIBLE, nid);
+}
+
+static inline void * __init memblock_virt_alloc_node_nopanic(
+ phys_addr_t size, int nid)
+{
+ return memblock_virt_alloc_try_nid_nopanic(size, 0, BOOTMEM_LOW_LIMIT,
+ BOOTMEM_ALLOC_ACCESSIBLE,
+ nid);
+}
+
+static inline void __init memblock_free_early(
+ phys_addr_t base, phys_addr_t size)
+{
+ __memblock_free_early(base, size);
+}
+
+static inline void __init memblock_free_early_nid(
+ phys_addr_t base, phys_addr_t size, int nid)
+{
+ __memblock_free_early(base, size);
+}
+
+static inline void __init memblock_free_late(
+ phys_addr_t base, phys_addr_t size)
+{
+ __memblock_free_late(base, size);
+}
+
+#else
+
+#define BOOTMEM_ALLOC_ACCESSIBLE 0
+
+
+/* Fall back to all the existing bootmem APIs */
+static inline void * __init memblock_virt_alloc(
+ phys_addr_t size, phys_addr_t align)
+{
+ if (!align)
+ align = SMP_CACHE_BYTES;
+ return __alloc_bootmem(size, align, BOOTMEM_LOW_LIMIT);
+}
+
+static inline void * __init memblock_virt_alloc_nopanic(
+ phys_addr_t size, phys_addr_t align)
+{
+ if (!align)
+ align = SMP_CACHE_BYTES;
+ return __alloc_bootmem_nopanic(size, align, BOOTMEM_LOW_LIMIT);
+}
+
+static inline void * __init memblock_virt_alloc_from_nopanic(
+ phys_addr_t size, phys_addr_t align, phys_addr_t min_addr)
+{
+ return __alloc_bootmem_nopanic(size, align, min_addr);
+}
+
+static inline void * __init memblock_virt_alloc_node(
+ phys_addr_t size, int nid)
+{
+ return __alloc_bootmem_node(NODE_DATA(nid), size, SMP_CACHE_BYTES,
+ BOOTMEM_LOW_LIMIT);
+}
+
+static inline void * __init memblock_virt_alloc_node_nopanic(
+ phys_addr_t size, int nid)
+{
+ return __alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
+ SMP_CACHE_BYTES,
+ BOOTMEM_LOW_LIMIT);
+}
+
+static inline void * __init memblock_virt_alloc_try_nid(phys_addr_t size,
+ phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid)
+{
+ return __alloc_bootmem_node_high(NODE_DATA(nid), size, align,
+ min_addr);
+}
+
+static inline void * __init memblock_virt_alloc_try_nid_nopanic(
+ phys_addr_t size, phys_addr_t align,
+ phys_addr_t min_addr, phys_addr_t max_addr, int nid)
+{
+ return ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, align,
+ min_addr, max_addr);
+}
+
+static inline void __init memblock_free_early(
+ phys_addr_t base, phys_addr_t size)
+{
+ free_bootmem(base, size);
+}
+
+static inline void __init memblock_free_early_nid(
+ phys_addr_t base, phys_addr_t size, int nid)
+{
+ free_bootmem_node(NODE_DATA(nid), base, size);
+}
+
+static inline void __init memblock_free_late(
+ phys_addr_t base, phys_addr_t size)
+{
+ free_bootmem_late(base, size);
+}
+#endif /* defined(CONFIG_HAVE_MEMBLOCK) && defined(CONFIG_NO_BOOTMEM) */
+
#ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP
extern void *alloc_remap(int nid, unsigned long size);
#else
diff --git a/include/linux/cache.h b/include/linux/cache.h
index 4c570653ab84..17e7e82d2aa7 100644
--- a/include/linux/cache.h
+++ b/include/linux/cache.h
@@ -1,11 +1,11 @@
#ifndef __LINUX_CACHE_H
#define __LINUX_CACHE_H
-#include <linux/kernel.h>
+#include <uapi/linux/kernel.h>
#include <asm/cache.h>
#ifndef L1_CACHE_ALIGN
-#define L1_CACHE_ALIGN(x) ALIGN(x, L1_CACHE_BYTES)
+#define L1_CACHE_ALIGN(x) __ALIGN_KERNEL(x, L1_CACHE_BYTES)
#endif
#ifndef SMP_CACHE_BYTES
diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
index 0442c3d800f0..a6ef9cc267ec 100644
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -8,23 +8,6 @@
#include <linux/ceph/types.h>
-/* This seemed to be the easiest place to define these */
-
-#define U8_MAX ((u8)(~0U))
-#define U16_MAX ((u16)(~0U))
-#define U32_MAX ((u32)(~0U))
-#define U64_MAX ((u64)(~0ULL))
-
-#define S8_MAX ((s8)(U8_MAX >> 1))
-#define S16_MAX ((s16)(U16_MAX >> 1))
-#define S32_MAX ((s32)(U32_MAX >> 1))
-#define S64_MAX ((s64)(U64_MAX >> 1LL))
-
-#define S8_MIN ((s8)(-S8_MAX - 1))
-#define S16_MIN ((s16)(-S16_MAX - 1))
-#define S32_MIN ((s32)(-S32_MAX - 1))
-#define S64_MIN ((s64)(-S64_MAX - 1LL))
-
/*
* in all cases,
* void **p pointer to position pointer
diff --git a/include/linux/cmdline-parser.h b/include/linux/cmdline-parser.h
index a0f9280421ec..2e6dce6e5c2a 100644
--- a/include/linux/cmdline-parser.h
+++ b/include/linux/cmdline-parser.h
@@ -37,9 +37,9 @@ int cmdline_parts_parse(struct cmdline_parts **parts, const char *cmdline);
struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts,
const char *bdev);
-void cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size,
- int slot,
- int (*add_part)(int, struct cmdline_subpart *, void *),
- void *param);
+int cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size,
+ int slot,
+ int (*add_part)(int, struct cmdline_subpart *, void *),
+ void *param);
#endif /* CMDLINEPARSEH */
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 091d72e70d8a..7e1c76e3cd68 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -62,6 +62,22 @@ static inline bool compaction_deferred(struct zone *zone, int order)
return zone->compact_considered < defer_limit;
}
+/*
+ * Update defer tracking counters after successful compaction of given order,
+ * which means an allocation either succeeded (alloc_success == true) or is
+ * expected to succeed.
+ */
+static inline void compaction_defer_reset(struct zone *zone, int order,
+ bool alloc_success)
+{
+ if (alloc_success) {
+ zone->compact_considered = 0;
+ zone->compact_defer_shift = 0;
+ }
+ if (order >= zone->compact_order_failed)
+ zone->compact_order_failed = order + 1;
+}
+
/* Returns true if restarting compaction after many failures */
static inline bool compaction_restarting(struct zone *zone, int order)
{
diff --git a/include/linux/crc64_ecma.h b/include/linux/crc64_ecma.h
new file mode 100644
index 000000000000..bba7a4d692b3
--- /dev/null
+++ b/include/linux/crc64_ecma.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2013 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __CRC64_ECMA_H_
+#define __CRC64_ECMA_H_
+
+#include <linux/types.h>
+
+
+#define CRC64_DEFAULT_INITVAL 0xFFFFFFFFFFFFFFFFULL
+
+
+/*
+ * crc64_ecma_seed - Initializes the CRC64 ECMA seed.
+ */
+u64 crc64_ecma_seed(void);
+
+/*
+ * crc64_ecma - Computes the 64 bit ECMA CRC.
+ *
+ * @pdata: pointer to the data to compute checksum for.
+ * @nbytes: number of bytes in data buffer.
+ * @seed: CRC seed.
+ */
+u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed);
+
+#endif /* __CRC64_ECMA_H_ */
diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h
index fc0e34ce038f..fe8cb610deac 100644
--- a/include/linux/dma-debug.h
+++ b/include/linux/dma-debug.h
@@ -85,6 +85,8 @@ extern void debug_dma_sync_sg_for_device(struct device *dev,
extern void debug_dma_dump_mappings(struct device *dev);
+extern void debug_dma_assert_idle(struct page *page);
+
#else /* CONFIG_DMA_API_DEBUG */
static inline void dma_debug_add_bus(struct bus_type *bus)
@@ -183,6 +185,10 @@ static inline void debug_dma_dump_mappings(struct device *dev)
{
}
+static inline void debug_dma_assert_idle(struct page *page)
+{
+}
+
#endif /* CONFIG_DMA_API_DEBUG */
#endif /* __DMA_DEBUG_H */
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 4b2ee8d12f5e..7d8d5e608594 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -15,7 +15,6 @@
#include <linux/path.h> /* struct path */
#include <linux/spinlock.h>
#include <linux/types.h>
-
#include <linux/atomic.h>
/*
@@ -79,6 +78,7 @@ struct fsnotify_group;
struct fsnotify_event;
struct fsnotify_mark;
struct fsnotify_event_private_data;
+struct fsnotify_fname;
/*
* Each group much define these ops. The fsnotify infrastructure will call
@@ -94,17 +94,27 @@ struct fsnotify_event_private_data;
* userspace messages that marks have been removed.
*/
struct fsnotify_ops {
- bool (*should_send_event)(struct fsnotify_group *group, struct inode *inode,
- struct fsnotify_mark *inode_mark,
- struct fsnotify_mark *vfsmount_mark,
- __u32 mask, void *data, int data_type);
int (*handle_event)(struct fsnotify_group *group,
+ struct inode *inode,
struct fsnotify_mark *inode_mark,
struct fsnotify_mark *vfsmount_mark,
- struct fsnotify_event *event);
+ u32 mask, void *data, int data_type,
+ const unsigned char *file_name);
void (*free_group_priv)(struct fsnotify_group *group);
void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group);
- void (*free_event_priv)(struct fsnotify_event_private_data *priv);
+ void (*free_event)(struct fsnotify_event *event);
+};
+
+/*
+ * all of the information about the original object we want to now send to
+ * a group. If you want to carry more info from the accessing task to the
+ * listener this structure is where you need to be adding fields.
+ */
+struct fsnotify_event {
+ struct list_head list;
+ /* inode may ONLY be dereferenced during handle_event(). */
+ struct inode *inode; /* either the inode the event happened to or its parent */
+ u32 mask; /* the type of access, bitwise OR for FS_* event types */
};
/*
@@ -148,7 +158,11 @@ struct fsnotify_group {
* a group */
struct list_head marks_list; /* all inode marks for this group */
- struct fasync_struct *fsn_fa; /* async notification */
+ struct fasync_struct *fsn_fa; /* async notification */
+
+ struct fsnotify_event overflow_event; /* Event we queue when the
+ * notification list is too
+ * full */
/* groups can define private fields here or use the void *private */
union {
@@ -177,76 +191,10 @@ struct fsnotify_group {
};
};
-/*
- * A single event can be queued in multiple group->notification_lists.
- *
- * each group->notification_list will point to an event_holder which in turns points
- * to the actual event that needs to be sent to userspace.
- *
- * Seemed cheaper to create a refcnt'd event and a small holder for every group
- * than create a different event for every group
- *
- */
-struct fsnotify_event_holder {
- struct fsnotify_event *event;
- struct list_head event_list;
-};
-
-/*
- * Inotify needs to tack data onto an event. This struct lets us later find the
- * correct private data of the correct group.
- */
-struct fsnotify_event_private_data {
- struct fsnotify_group *group;
- struct list_head event_list;
-};
-
-/*
- * all of the information about the original object we want to now send to
- * a group. If you want to carry more info from the accessing task to the
- * listener this structure is where you need to be adding fields.
- */
-struct fsnotify_event {
- /*
- * If we create an event we are also likely going to need a holder
- * to link to a group. So embed one holder in the event. Means only
- * one allocation for the common case where we only have one group
- */
- struct fsnotify_event_holder holder;
- spinlock_t lock; /* protection for the associated event_holder and private_list */
- /* to_tell may ONLY be dereferenced during handle_event(). */
- struct inode *to_tell; /* either the inode the event happened to or its parent */
- /*
- * depending on the event type we should have either a path or inode
- * We hold a reference on path, but NOT on inode. Since we have the ref on
- * the path, it may be dereferenced at any point during this object's
- * lifetime. That reference is dropped when this object's refcnt hits
- * 0. If this event contains an inode instead of a path, the inode may
- * ONLY be used during handle_event().
- */
- union {
- struct path path;
- struct inode *inode;
- };
/* when calling fsnotify tell it if the data is a path or inode */
#define FSNOTIFY_EVENT_NONE 0
#define FSNOTIFY_EVENT_PATH 1
#define FSNOTIFY_EVENT_INODE 2
- int data_type; /* which of the above union we have */
- atomic_t refcnt; /* how many groups still are using/need to send this event */
- __u32 mask; /* the type of access, bitwise OR for FS_* event types */
-
- u32 sync_cookie; /* used to corrolate events, namely inotify mv events */
- const unsigned char *file_name;
- size_t name_len;
- struct pid *tgid;
-
-#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
- __u32 response; /* userspace answer to question */
-#endif /* CONFIG_FANOTIFY_ACCESS_PERMISSIONS */
-
- struct list_head private_data_list; /* groups can store private data here */
-};
/*
* Inode specific fields in an fsnotify_mark
@@ -370,17 +318,12 @@ extern void fsnotify_put_group(struct fsnotify_group *group);
extern void fsnotify_destroy_group(struct fsnotify_group *group);
/* fasync handler function */
extern int fsnotify_fasync(int fd, struct file *file, int on);
-/* take a reference to an event */
-extern void fsnotify_get_event(struct fsnotify_event *event);
-extern void fsnotify_put_event(struct fsnotify_event *event);
-/* find private data previously attached to an event and unlink it */
-extern struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group,
- struct fsnotify_event *event);
-
+/* Free event from memory */
+extern void fsnotify_destroy_event(struct fsnotify_group *group,
+ struct fsnotify_event *event);
/* attach the event to the group notification queue */
extern struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group,
struct fsnotify_event *event,
- struct fsnotify_event_private_data *priv,
struct fsnotify_event *(*merge)(struct list_head *,
struct fsnotify_event *));
/* true if the group notification queue is empty */
@@ -430,15 +373,8 @@ extern void fsnotify_put_mark(struct fsnotify_mark *mark);
extern void fsnotify_unmount_inodes(struct list_head *list);
/* put here because inotify does some weird stuff when destroying watches */
-extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
- void *data, int data_is,
- const unsigned char *name,
- u32 cookie, gfp_t gfp);
-
-/* fanotify likes to change events after they are on lists... */
-extern struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event);
-extern int fsnotify_replace_event(struct fsnotify_event_holder *old_holder,
- struct fsnotify_event *new_event);
+extern void fsnotify_init_event(struct fsnotify_event *event,
+ struct inode *to_tell, u32 mask);
#else
diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h
index 1eda33d7cb10..1c2fdaa2ffc3 100644
--- a/include/linux/genalloc.h
+++ b/include/linux/genalloc.h
@@ -30,6 +30,8 @@
#ifndef __GENALLOC_H__
#define __GENALLOC_H__
+#include <linux/spinlock_types.h>
+
struct device;
struct device_node;
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 9b4dd491f7e8..0437439bc047 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -1,6 +1,7 @@
#ifndef __LINUX_GFP_H
#define __LINUX_GFP_H
+#include <linux/mmdebug.h>
#include <linux/mmzone.h>
#include <linux/stddef.h>
#include <linux/linkage.h>
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 91672e2deec3..db512014e061 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -157,6 +157,26 @@ static inline int hpage_nr_pages(struct page *page)
return HPAGE_PMD_NR;
return 1;
}
+/*
+ * compound_trans_head() should be used instead of compound_head(),
+ * whenever the "page" passed as parameter could be the tail of a
+ * transparent hugepage that could be undergoing a
+ * __split_huge_page_refcount(). The page structure layout often
+ * changes across releases and it makes extensive use of unions. So if
+ * the page structure layout will change in a way that
+ * page->first_page gets clobbered by __split_huge_page_refcount, the
+ * implementation making use of smp_rmb() will be required.
+ *
+ * Currently we define compound_trans_head as compound_head, because
+ * page->private is in the same union with page->first_page, and
+ * page->private isn't clobbered. However this also means we're
+ * currently leaving dirt into the page->private field of anonymous
+ * pages resulting from a THP split, instead of setting page->private
+ * to zero like for every other page that has PG_private not set. But
+ * anonymous pages don't use page->private so this is not a problem.
+ */
+#if 0
+/* This will be needed if page->private will be clobbered in split_huge_page */
static inline struct page *compound_trans_head(struct page *page)
{
if (PageTail(page)) {
@@ -174,6 +194,9 @@ static inline struct page *compound_trans_head(struct page *page)
}
return page;
}
+#else
+#define compound_trans_head(page) compound_head(page)
+#endif
extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd, pmd_t *pmdp);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index bd7e98752222..8c43cc469d78 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -2,6 +2,7 @@
#define _LINUX_HUGETLB_H
#include <linux/mm_types.h>
+#include <linux/mmdebug.h>
#include <linux/fs.h>
#include <linux/hugetlb_inline.h>
#include <linux/cgroup.h>
@@ -31,7 +32,6 @@ struct hugepage_subpool *hugepage_new_subpool(long nr_blocks);
void hugepage_put_subpool(struct hugepage_subpool *spool);
int PageHuge(struct page *page);
-int PageHeadHuge(struct page *page_head);
void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
@@ -104,11 +104,6 @@ static inline int PageHuge(struct page *page)
return 0;
}
-static inline int PageHeadHuge(struct page *page_head)
-{
- return 0;
-}
-
static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
{
}
@@ -360,6 +355,7 @@ static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
static inline struct hstate *page_hstate(struct page *page)
{
+ VM_BUG_ON_PAGE(!PageHuge(page), page);
return size_to_hstate(PAGE_SIZE << compound_order(page));
}
diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
index ce8217f7b5c2..787bba3bf552 100644
--- a/include/linux/hugetlb_cgroup.h
+++ b/include/linux/hugetlb_cgroup.h
@@ -15,6 +15,7 @@
#ifndef _LINUX_HUGETLB_CGROUP_H
#define _LINUX_HUGETLB_CGROUP_H
+#include <linux/mmdebug.h>
#include <linux/res_counter.h>
struct hugetlb_cgroup;
@@ -28,7 +29,7 @@ struct hugetlb_cgroup;
static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page)
{
- VM_BUG_ON(!PageHuge(page));
+ VM_BUG_ON_PAGE(!PageHuge(page), page);
if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
return NULL;
@@ -38,7 +39,7 @@ static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page)
static inline
int set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg)
{
- VM_BUG_ON(!PageHuge(page));
+ VM_BUG_ON_PAGE(!PageHuge(page), page);
if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
return -1;
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 4dfdb893362a..6df7f9fe0d01 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -41,6 +41,7 @@ extern struct fs_struct init_fs;
#define INIT_SIGNALS(sig) { \
.nr_threads = 1, \
+ .thread_head = LIST_HEAD_INIT(init_task.thread_node), \
.wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\
.shared_pending = { \
.list = LIST_HEAD_INIT(sig.shared_pending.list), \
@@ -222,6 +223,7 @@ extern struct task_group root_task_group;
[PIDTYPE_SID] = INIT_PID_LINK(PIDTYPE_SID), \
}, \
.thread_group = LIST_HEAD_INIT(tsk.thread_group), \
+ .thread_node = LIST_HEAD_INIT(init_signals.thread_head), \
INIT_IDS \
INIT_PERF_EVENTS(tsk) \
INIT_TRACE_IRQFLAGS \
diff --git a/include/linux/input.h b/include/linux/input.h
index 82ce323b9986..6453b22372ac 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -79,6 +79,7 @@ struct input_value {
* @led: reflects current state of device's LEDs
* @snd: reflects current state of sound effects
* @sw: reflects current state of device's switches
+ * @leds: leds objects for the device's LEDs
* @open: this method is called when the very first user calls
* input_open_device(). The driver must prepare the device
* to start generating events (start polling thread,
@@ -164,6 +165,8 @@ struct input_dev {
unsigned long snd[BITS_TO_LONGS(SND_CNT)];
unsigned long sw[BITS_TO_LONGS(SW_CNT)];
+ struct led_classdev *leds;
+
int (*open)(struct input_dev *dev);
void (*close)(struct input_dev *dev);
int (*flush)(struct input_dev *dev, struct file *file);
@@ -531,4 +534,22 @@ int input_ff_erase(struct input_dev *dev, int effect_id, struct file *file);
int input_ff_create_memless(struct input_dev *dev, void *data,
int (*play_effect)(struct input_dev *, void *, struct ff_effect *));
+#ifdef CONFIG_INPUT_LEDS
+
+int input_led_connect(struct input_dev *dev);
+void input_led_disconnect(struct input_dev *dev);
+
+#else
+
+static inline int input_led_connect(struct input_dev *dev)
+{
+ return 0;
+}
+
+static inline void input_led_disconnect(struct input_dev *dev)
+{
+}
+
+#endif
+
#endif
diff --git a/include/linux/ipc.h b/include/linux/ipc.h
index 8d861b2651f7..9d84942ae2e5 100644
--- a/include/linux/ipc.h
+++ b/include/linux/ipc.h
@@ -11,7 +11,7 @@
struct kern_ipc_perm
{
spinlock_t lock;
- int deleted;
+ bool deleted;
int id;
key_t key;
kuid_t uid;
diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index f6c82de12541..e7831d203737 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -21,7 +21,6 @@ struct user_namespace;
struct ipc_ids {
int in_use;
unsigned short seq;
- unsigned short seq_max;
struct rw_semaphore rwsem;
struct idr ipcs_idr;
int next_id;
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 2aa3d4b000e6..f74bb581ab64 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -29,6 +29,19 @@
#define ULLONG_MAX (~0ULL)
#define SIZE_MAX (~(size_t)0)
+#define U8_MAX ((u8)~0U)
+#define S8_MAX ((s8)(U8_MAX>>1))
+#define S8_MIN ((s8)(-S8_MAX - 1))
+#define U16_MAX ((u16)~0U)
+#define S16_MAX ((s16)(U16_MAX>>1))
+#define S16_MIN ((s16)(-S16_MAX - 1))
+#define U32_MAX ((u32)~0U)
+#define S32_MAX ((s32)(U32_MAX>>1))
+#define S32_MIN ((s32)(-S32_MAX - 1))
+#define U64_MAX ((u64)~0ULL)
+#define S64_MAX ((s64)(U64_MAX>>1))
+#define S64_MIN ((s64)(-S64_MAX - 1))
+
#define STACK_MAGIC 0xdeadbeef
#define REPEAT_BYTE(x) ((~0ul / 0xff) * (x))
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 5fd33dc1fe3a..6d4066cdb5b5 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -170,6 +170,7 @@ unsigned long paddr_vmcoreinfo_note(void);
extern struct kimage *kexec_image;
extern struct kimage *kexec_crash_image;
+extern int kexec_load_disabled;
#ifndef kexec_flush_icache_page
#define kexec_flush_icache_page(page)
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 45c9b6a17bcb..3be6bb18562d 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -73,11 +73,7 @@ static inline void set_page_stable_node(struct page *page,
struct page *ksm_might_need_to_copy(struct page *page,
struct vm_area_struct *vma, unsigned long address);
-int page_referenced_ksm(struct page *page,
- struct mem_cgroup *memcg, unsigned long *vm_flags);
-int try_to_unmap_ksm(struct page *page, enum ttu_flags flags);
-int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
- struct vm_area_struct *, unsigned long, void *), void *arg);
+int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc);
void ksm_migrate_page(struct page *newpage, struct page *oldpage);
#else /* !CONFIG_KSM */
@@ -115,13 +111,8 @@ static inline int page_referenced_ksm(struct page *page,
return 0;
}
-static inline int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
-{
- return 0;
-}
-
-static inline int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page*,
- struct vm_area_struct *, unsigned long, void *), void *arg)
+static inline int rmap_walk_ksm(struct page *page,
+ struct rmap_walk_control *rwc)
{
return 0;
}
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 7dcef3317689..50050ae17b00 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -51,6 +51,7 @@ void kthread_parkme(void);
int kthreadd(void *unused);
extern struct task_struct *kthreadd_task;
extern int tsk_fork_get_node(struct task_struct *tsk);
+void set_kthreadd_affinity(void);
/*
* Simple work processor based on kthread.
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 77c60e52939d..1ef66360f0b0 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -19,9 +19,13 @@
#define INIT_MEMBLOCK_REGIONS 128
+/* Definition of memblock flags. */
+#define MEMBLOCK_HOTPLUG 0x1 /* hotpluggable region */
+
struct memblock_region {
phys_addr_t base;
phys_addr_t size;
+ unsigned long flags;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
int nid;
#endif
@@ -43,15 +47,21 @@ struct memblock {
extern struct memblock memblock;
extern int memblock_debug;
+#ifdef CONFIG_MOVABLE_NODE
+/* If movable_node boot option specified */
+extern bool movable_node_enabled;
+#endif /* CONFIG_MOVABLE_NODE */
#define memblock_dbg(fmt, ...) \
if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
-phys_addr_t memblock_find_in_range_node(phys_addr_t start, phys_addr_t end,
- phys_addr_t size, phys_addr_t align, int nid);
+phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align,
+ phys_addr_t start, phys_addr_t end,
+ int nid);
phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
phys_addr_t size, phys_addr_t align);
phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr);
+phys_addr_t get_allocated_memblock_memory_regions_info(phys_addr_t *addr);
void memblock_allow_resize(void);
int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid);
int memblock_add(phys_addr_t base, phys_addr_t size);
@@ -59,6 +69,28 @@ int memblock_remove(phys_addr_t base, phys_addr_t size);
int memblock_free(phys_addr_t base, phys_addr_t size);
int memblock_reserve(phys_addr_t base, phys_addr_t size);
void memblock_trim_memory(phys_addr_t align);
+int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size);
+int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
+#ifdef CONFIG_MOVABLE_NODE
+static inline bool memblock_is_hotpluggable(struct memblock_region *m)
+{
+ return m->flags & MEMBLOCK_HOTPLUG;
+}
+
+static inline bool movable_node_is_enabled(void)
+{
+ return movable_node_enabled;
+}
+#else
+static inline bool memblock_is_hotpluggable(struct memblock_region *m)
+{
+ return false;
+}
+static inline bool movable_node_is_enabled(void)
+{
+ return false;
+}
+#endif
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn,
@@ -87,7 +119,7 @@ void __next_free_mem_range(u64 *idx, int nid, phys_addr_t *out_start,
/**
* for_each_free_mem_range - iterate through free memblock areas
* @i: u64 used as loop variable
- * @nid: node selector, %MAX_NUMNODES for all nodes
+ * @nid: node selector, %NUMA_NO_NODE for all nodes
* @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @p_nid: ptr to int for nid of the range, can be %NULL
@@ -107,7 +139,7 @@ void __next_free_mem_range_rev(u64 *idx, int nid, phys_addr_t *out_start,
/**
* for_each_free_mem_range_reverse - rev-iterate through free memblock areas
* @i: u64 used as loop variable
- * @nid: node selector, %MAX_NUMNODES for all nodes
+ * @nid: node selector, %NUMA_NO_NODE for all nodes
* @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @p_nid: ptr to int for nid of the range, can be %NULL
@@ -121,8 +153,21 @@ void __next_free_mem_range_rev(u64 *idx, int nid, phys_addr_t *out_start,
i != (u64)ULLONG_MAX; \
__next_free_mem_range_rev(&i, nid, p_start, p_end, p_nid))
+static inline void memblock_set_region_flags(struct memblock_region *r,
+ unsigned long flags)
+{
+ r->flags |= flags;
+}
+
+static inline void memblock_clear_region_flags(struct memblock_region *r,
+ unsigned long flags)
+{
+ r->flags &= ~flags;
+}
+
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-int memblock_set_node(phys_addr_t base, phys_addr_t size, int nid);
+int memblock_set_node(phys_addr_t base, phys_addr_t size,
+ struct memblock_type *type, int nid);
static inline void memblock_set_region_node(struct memblock_region *r, int nid)
{
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b3e7a667e03c..abd0113b6620 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -497,10 +497,11 @@ void __memcg_kmem_commit_charge(struct page *page,
void __memcg_kmem_uncharge_pages(struct page *page, int order);
int memcg_cache_id(struct mem_cgroup *memcg);
-int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
- struct kmem_cache *root_cache);
-void memcg_release_cache(struct kmem_cache *cachep);
-void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep);
+int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
+ struct kmem_cache *root_cache);
+void memcg_free_cache_params(struct kmem_cache *s);
+void memcg_register_cache(struct kmem_cache *s);
+void memcg_unregister_cache(struct kmem_cache *s);
int memcg_update_cache_size(struct kmem_cache *s, int num_groups);
void memcg_update_array_size(int num_groups);
@@ -640,19 +641,21 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg)
return -1;
}
-static inline int
-memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
- struct kmem_cache *root_cache)
+static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg,
+ struct kmem_cache *s, struct kmem_cache *root_cache)
{
return 0;
}
-static inline void memcg_release_cache(struct kmem_cache *cachep)
+static inline void memcg_free_cache_params(struct kmem_cache *s)
+{
+}
+
+static inline void memcg_register_cache(struct kmem_cache *s)
{
}
-static inline void memcg_cache_list_add(struct mem_cgroup *memcg,
- struct kmem_cache *s)
+static inline void memcg_unregister_cache(struct kmem_cache *s)
{
}
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 9fe426b30a41..5f1ea756aace 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -211,20 +211,8 @@ static inline void mpol_get(struct mempolicy *pol)
{
}
-static inline struct mempolicy *mpol_dup(struct mempolicy *old)
-{
- return NULL;
-}
-
struct shared_policy {};
-static inline int mpol_set_shared_policy(struct shared_policy *info,
- struct vm_area_struct *vma,
- struct mempolicy *new)
-{
- return -EINVAL;
-}
-
static inline void mpol_shared_policy_init(struct shared_policy *sp,
struct mempolicy *mpol)
{
@@ -234,12 +222,6 @@ static inline void mpol_free_shared_policy(struct shared_policy *p)
{
}
-static inline struct mempolicy *
-mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
-{
- return NULL;
-}
-
#define vma_policy(vma) NULL
static inline int
@@ -266,10 +248,6 @@ static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
}
-static inline void mpol_fix_fork_child_flag(struct task_struct *p)
-{
-}
-
static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
unsigned long addr, gfp_t gfp_flags,
struct mempolicy **mpol, nodemask_t **nodemask)
@@ -284,12 +262,6 @@ static inline bool init_nodemask_of_mempolicy(nodemask_t *m)
return false;
}
-static inline bool mempolicy_nodemask_intersects(struct task_struct *tsk,
- const nodemask_t *mask)
-{
- return false;
-}
-
static inline int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to, int flags)
{
@@ -307,10 +279,6 @@ static inline int mpol_parse_str(char *str, struct mempolicy **mpol)
}
#endif
-static inline void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
-{
-}
-
static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
unsigned long address)
{
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index f015c059e159..84a31ad0b791 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -35,16 +35,12 @@ enum migrate_reason {
#ifdef CONFIG_MIGRATION
-extern void putback_lru_pages(struct list_head *l);
extern void putback_movable_pages(struct list_head *l);
extern int migrate_page(struct address_space *,
struct page *, struct page *, enum migrate_mode);
extern int migrate_pages(struct list_head *l, new_page_t x,
unsigned long private, enum migrate_mode mode, int reason);
-extern int fail_migrate_page(struct address_space *,
- struct page *, struct page *);
-
extern int migrate_prep(void);
extern int migrate_prep_local(void);
extern int migrate_vmas(struct mm_struct *mm,
@@ -59,7 +55,6 @@ extern int migrate_page_move_mapping(struct address_space *mapping,
int extra_count);
#else
-static inline void putback_lru_pages(struct list_head *l) {}
static inline void putback_movable_pages(struct list_head *l) {}
static inline int migrate_pages(struct list_head *l, new_page_t x,
unsigned long private, enum migrate_mode mode, int reason)
@@ -86,7 +81,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
/* Possible settings for the migrate_page() method in address_operations */
#define migrate_page NULL
-#define fail_migrate_page NULL
#endif /* CONFIG_MIGRATION */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bf362d053ce1..28b6daade067 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -5,6 +5,7 @@
#ifdef __KERNEL__
+#include <linux/mmdebug.h>
#include <linux/gfp.h>
#include <linux/bug.h>
#include <linux/list.h>
@@ -57,6 +58,15 @@ extern int sysctl_legacy_va_layout;
extern unsigned long sysctl_user_reserve_kbytes;
extern unsigned long sysctl_admin_reserve_kbytes;
+extern int sysctl_overcommit_memory;
+extern int sysctl_overcommit_ratio;
+extern unsigned long sysctl_overcommit_kbytes;
+
+extern int overcommit_ratio_handler(struct ctl_table *, int, void __user *,
+ size_t *, loff_t *);
+extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *,
+ size_t *, loff_t *);
+
#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
/* to align the pointer to the (next) page boundary */
@@ -294,7 +304,7 @@ static inline int get_freepage_migratetype(struct page *page)
*/
static inline int put_page_testzero(struct page *page)
{
- VM_BUG_ON(atomic_read(&page->_count) == 0);
+ VM_BUG_ON_PAGE(atomic_read(&page->_count) == 0, page);
return atomic_dec_and_test(&page->_count);
}
@@ -355,7 +365,7 @@ static inline int is_vmalloc_or_module_addr(const void *x)
static inline void compound_lock(struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- VM_BUG_ON(PageSlab(page));
+ VM_BUG_ON_PAGE(PageSlab(page), page);
bit_spin_lock(PG_compound_lock, &page->flags);
#endif
}
@@ -363,7 +373,7 @@ static inline void compound_lock(struct page *page)
static inline void compound_unlock(struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- VM_BUG_ON(PageSlab(page));
+ VM_BUG_ON_PAGE(PageSlab(page), page);
bit_spin_unlock(PG_compound_lock, &page->flags);
#endif
}
@@ -414,15 +424,44 @@ static inline int page_count(struct page *page)
return atomic_read(&compound_head(page)->_count);
}
+#ifdef CONFIG_HUGETLB_PAGE
+extern int PageHeadHuge(struct page *page_head);
+#else /* CONFIG_HUGETLB_PAGE */
+static inline int PageHeadHuge(struct page *page_head)
+{
+ return 0;
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
+static inline bool __compound_tail_refcounted(struct page *page)
+{
+ return !PageSlab(page) && !PageHeadHuge(page);
+}
+
+/*
+ * This takes a head page as parameter and tells if the
+ * tail page reference counting can be skipped.
+ *
+ * For this to be safe, PageSlab and PageHeadHuge must remain true on
+ * any given page where they return true here, until all tail pins
+ * have been released.
+ */
+static inline bool compound_tail_refcounted(struct page *page)
+{
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+ return __compound_tail_refcounted(page);
+}
+
static inline void get_huge_page_tail(struct page *page)
{
/*
- * __split_huge_page_refcount() cannot run
- * from under us.
+ * __split_huge_page_refcount() cannot run from under us.
*/
- VM_BUG_ON(page_mapcount(page) < 0);
- VM_BUG_ON(atomic_read(&page->_count) != 0);
- atomic_inc(&page->_mapcount);
+ VM_BUG_ON_PAGE(!PageTail(page), page);
+ VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
+ VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page);
+ if (compound_tail_refcounted(page->first_page))
+ atomic_inc(&page->_mapcount);
}
extern bool __get_page_tail(struct page *page);
@@ -436,7 +475,7 @@ static inline void get_page(struct page *page)
* Getting a normal page or the head of a compound page
* requires to already have an elevated page->_count.
*/
- VM_BUG_ON(atomic_read(&page->_count) <= 0);
+ VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
atomic_inc(&page->_count);
}
@@ -473,13 +512,13 @@ static inline int PageBuddy(struct page *page)
static inline void __SetPageBuddy(struct page *page)
{
- VM_BUG_ON(atomic_read(&page->_mapcount) != -1);
+ VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page);
atomic_set(&page->_mapcount, PAGE_BUDDY_MAPCOUNT_VALUE);
}
static inline void __ClearPageBuddy(struct page *page)
{
- VM_BUG_ON(!PageBuddy(page));
+ VM_BUG_ON_PAGE(!PageBuddy(page), page);
atomic_set(&page->_mapcount, -1);
}
@@ -984,7 +1023,6 @@ extern void pagefault_out_of_memory(void);
* various contexts.
*/
#define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */
-#define SHOW_MEM_FILTER_PAGE_COUNT (0x0002u) /* page type count */
extern void show_free_areas(unsigned int flags);
extern bool skip_free_areas_node(unsigned int flags, int nid);
@@ -1318,6 +1356,7 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
#if USE_SPLIT_PTE_PTLOCKS
#if ALLOC_SPLIT_PTLOCKS
+void __init ptlock_cache_init(void);
extern bool ptlock_alloc(struct page *page);
extern void ptlock_free(struct page *page);
@@ -1326,6 +1365,10 @@ static inline spinlock_t *ptlock_ptr(struct page *page)
return page->ptl;
}
#else /* ALLOC_SPLIT_PTLOCKS */
+static inline void ptlock_cache_init(void)
+{
+}
+
static inline bool ptlock_alloc(struct page *page)
{
return true;
@@ -1356,7 +1399,7 @@ static inline bool ptlock_init(struct page *page)
* slab code uses page->slab_cache and page->first_page (for tail
* pages), which share storage with page->ptl.
*/
- VM_BUG_ON(*(unsigned long *)&page->ptl);
+ VM_BUG_ON_PAGE(*(unsigned long *)&page->ptl, page);
if (!ptlock_alloc(page))
return false;
spin_lock_init(ptlock_ptr(page));
@@ -1378,10 +1421,17 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
return &mm->page_table_lock;
}
+static inline void ptlock_cache_init(void) {}
static inline bool ptlock_init(struct page *page) { return true; }
static inline void pte_lock_deinit(struct page *page) {}
#endif /* USE_SPLIT_PTE_PTLOCKS */
+static inline void pgtable_init(void)
+{
+ ptlock_cache_init();
+ pgtable_cache_init();
+}
+
static inline bool pgtable_page_ctor(struct page *page)
{
inc_zone_page_state(page, NR_PAGETABLE);
@@ -1440,7 +1490,7 @@ static inline bool pgtable_pmd_page_ctor(struct page *page)
static inline void pgtable_pmd_page_dtor(struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- VM_BUG_ON(page->pmd_huge_pte);
+ VM_BUG_ON_PAGE(page->pmd_huge_pte, page);
#endif
ptlock_free(page);
}
@@ -1977,8 +2027,6 @@ extern void shake_page(struct page *p, int access);
extern atomic_long_t num_poisoned_pages;
extern int soft_offline_page(struct page *page, int flags);
-extern void dump_page(struct page *page);
-
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
extern void clear_huge_page(struct page *page,
unsigned long addr,
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 7f7f8dae4b1d..16373c8f5f57 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -9,6 +9,7 @@
extern int sysctl_overcommit_memory;
extern int sysctl_overcommit_ratio;
+extern unsigned long sysctl_overcommit_kbytes;
extern struct percpu_counter vm_committed_as;
#ifdef CONFIG_SMP
diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h
index 580bd587d916..5042c036dda9 100644
--- a/include/linux/mmdebug.h
+++ b/include/linux/mmdebug.h
@@ -1,10 +1,19 @@
#ifndef LINUX_MM_DEBUG_H
#define LINUX_MM_DEBUG_H 1
+struct page;
+
+extern void dump_page(struct page *page, char *reason);
+extern void dump_page_badflags(struct page *page, char *reason,
+ unsigned long badflags);
+
#ifdef CONFIG_DEBUG_VM
#define VM_BUG_ON(cond) BUG_ON(cond)
+#define VM_BUG_ON_PAGE(cond, page) \
+ do { if (unlikely(cond)) { dump_page(page, NULL); BUG(); } } while (0)
#else
#define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond)
+#define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond)
#endif
#ifdef CONFIG_DEBUG_VIRTUAL
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index bd791e452ad7..5f2052c83154 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -490,6 +490,12 @@ struct zone {
unsigned long managed_pages;
/*
+ * Number of MIGRATE_RESEVE page block. To maintain for just
+ * optimization. Protected by zone->lock.
+ */
+ int nr_migrate_reserve_block;
+
+ /*
* rarely used fields:
*/
const char *name;
@@ -758,10 +764,7 @@ typedef struct pglist_data {
int kswapd_max_order;
enum zone_type classzone_idx;
#ifdef CONFIG_NUMA_BALANCING
- /*
- * Lock serializing the per destination node AutoNUMA memory
- * migration rate limiting data.
- */
+ /* Lock serializing the migrate rate limiting window */
spinlock_t numabalancing_migrate_lock;
/* Rate limiting time interval */
diff --git a/include/linux/msg.h b/include/linux/msg.h
index e21f9d44307f..f3f302f9c197 100644
--- a/include/linux/msg.h
+++ b/include/linux/msg.h
@@ -9,7 +9,7 @@ struct msg_msg {
struct list_head m_list;
long m_type;
size_t m_ts; /* message text size */
- struct msg_msgseg* next;
+ struct msg_msgseg *next;
void *security;
/* the actual message follows immediately */
};
diff --git a/include/linux/of.h b/include/linux/of.h
index 276c546980d8..70c64ba17fa5 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -377,8 +377,13 @@ static inline bool of_have_populated_dt(void)
return false;
}
+/* Kill an unused variable warning on a device_node pointer */
+static inline void __of_use_dn(const struct device_node *np)
+{
+}
+
#define for_each_child_of_node(parent, child) \
- while (0)
+ while (__of_use_dn(parent), __of_use_dn(child), 0)
#define for_each_available_child_of_node(parent, child) \
while (0)
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 98ada58f9942..d1fe1a761047 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -228,9 +228,9 @@ PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1)
TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback)
PAGEFLAG(MappedToDisk, mappedtodisk)
-/* PG_readahead is only used for file reads; PG_reclaim is only for writes */
+/* PG_readahead is only used for reads; PG_reclaim is only for writes */
PAGEFLAG(Reclaim, reclaim) TESTCLEARFLAG(Reclaim, reclaim)
-PAGEFLAG(Readahead, reclaim) /* Reminder to do async read-ahead */
+PAGEFLAG(Readahead, reclaim) TESTCLEARFLAG(Readahead, reclaim)
#ifdef CONFIG_HIGHMEM
/*
@@ -412,7 +412,7 @@ static inline void ClearPageCompound(struct page *page)
*/
static inline int PageTransHuge(struct page *page)
{
- VM_BUG_ON(PageTail(page));
+ VM_BUG_ON_PAGE(PageTail(page), page);
return PageHead(page);
}
@@ -460,25 +460,25 @@ static inline int PageTransTail(struct page *page)
*/
static inline int PageSlabPfmemalloc(struct page *page)
{
- VM_BUG_ON(!PageSlab(page));
+ VM_BUG_ON_PAGE(!PageSlab(page), page);
return PageActive(page);
}
static inline void SetPageSlabPfmemalloc(struct page *page)
{
- VM_BUG_ON(!PageSlab(page));
+ VM_BUG_ON_PAGE(!PageSlab(page), page);
SetPageActive(page);
}
static inline void __ClearPageSlabPfmemalloc(struct page *page)
{
- VM_BUG_ON(!PageSlab(page));
+ VM_BUG_ON_PAGE(!PageSlab(page), page);
__ClearPageActive(page);
}
static inline void ClearPageSlabPfmemalloc(struct page *page)
{
- VM_BUG_ON(!PageSlab(page));
+ VM_BUG_ON_PAGE(!PageSlab(page), page);
ClearPageActive(page);
}
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index e3dea75a078b..1710d1b060ba 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -162,7 +162,7 @@ static inline int page_cache_get_speculative(struct page *page)
* disabling preempt, and hence no need for the "speculative get" that
* SMP requires.
*/
- VM_BUG_ON(page_count(page) == 0);
+ VM_BUG_ON_PAGE(page_count(page) == 0, page);
atomic_inc(&page->_count);
#else
@@ -175,7 +175,7 @@ static inline int page_cache_get_speculative(struct page *page)
return 0;
}
#endif
- VM_BUG_ON(PageTail(page));
+ VM_BUG_ON_PAGE(PageTail(page), page);
return 1;
}
@@ -191,14 +191,14 @@ static inline int page_cache_add_speculative(struct page *page, int count)
# ifdef CONFIG_PREEMPT_COUNT
VM_BUG_ON(!in_atomic());
# endif
- VM_BUG_ON(page_count(page) == 0);
+ VM_BUG_ON_PAGE(page_count(page) == 0, page);
atomic_add(count, &page->_count);
#else
if (unlikely(!atomic_add_unless(&page->_count, count, 0)))
return 0;
#endif
- VM_BUG_ON(PageCompound(page) && page != compound_head(page));
+ VM_BUG_ON_PAGE(PageCompound(page) && page != compound_head(page), page);
return 1;
}
@@ -210,7 +210,7 @@ static inline int page_freeze_refs(struct page *page, int count)
static inline void page_unfreeze_refs(struct page *page, int count)
{
- VM_BUG_ON(page_count(page) != 0);
+ VM_BUG_ON_PAGE(page_count(page) != 0, page);
VM_BUG_ON(count == 0);
atomic_set(&page->_count, count);
diff --git a/include/linux/parser.h b/include/linux/parser.h
index ea2281e726f6..39d5b7955b23 100644
--- a/include/linux/parser.h
+++ b/include/linux/parser.h
@@ -29,5 +29,6 @@ int match_token(char *, const match_table_t table, substring_t args[]);
int match_int(substring_t *, int *result);
int match_octal(substring_t *, int *result);
int match_hex(substring_t *, int *result);
+bool match_wildcard(const char *pattern, const char *str);
size_t match_strlcpy(char *, const substring_t *, size_t);
char *match_strdup(const substring_t *);
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 9e4761caa80c..e3817d2441b6 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -1,6 +1,7 @@
#ifndef __LINUX_PERCPU_H
#define __LINUX_PERCPU_H
+#include <linux/mmdebug.h>
#include <linux/preempt.h>
#include <linux/smp.h>
#include <linux/cpumask.h>
diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
index 7931efe71175..fb616942e4c7 100644
--- a/include/linux/posix_acl.h
+++ b/include/linux/posix_acl.h
@@ -94,78 +94,12 @@ extern int posix_acl_chmod(struct posix_acl **, gfp_t, umode_t);
extern struct posix_acl *get_posix_acl(struct inode *, int);
extern int set_posix_acl(struct inode *, int, struct posix_acl *);
-#ifdef CONFIG_FS_POSIX_ACL
-static inline struct posix_acl **acl_by_type(struct inode *inode, int type)
-{
- switch (type) {
- case ACL_TYPE_ACCESS:
- return &inode->i_acl;
- case ACL_TYPE_DEFAULT:
- return &inode->i_default_acl;
- default:
- BUG();
- }
-}
-
-static inline struct posix_acl *get_cached_acl(struct inode *inode, int type)
-{
- struct posix_acl **p = acl_by_type(inode, type);
- struct posix_acl *acl = ACCESS_ONCE(*p);
- if (acl) {
- spin_lock(&inode->i_lock);
- acl = *p;
- if (acl != ACL_NOT_CACHED)
- acl = posix_acl_dup(acl);
- spin_unlock(&inode->i_lock);
- }
- return acl;
-}
-
-static inline struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type)
-{
- return rcu_dereference(*acl_by_type(inode, type));
-}
-
-static inline void set_cached_acl(struct inode *inode,
- int type,
- struct posix_acl *acl)
-{
- struct posix_acl **p = acl_by_type(inode, type);
- struct posix_acl *old;
- spin_lock(&inode->i_lock);
- old = *p;
- rcu_assign_pointer(*p, posix_acl_dup(acl));
- spin_unlock(&inode->i_lock);
- if (old != ACL_NOT_CACHED)
- posix_acl_release(old);
-}
-
-static inline void forget_cached_acl(struct inode *inode, int type)
-{
- struct posix_acl **p = acl_by_type(inode, type);
- struct posix_acl *old;
- spin_lock(&inode->i_lock);
- old = *p;
- *p = ACL_NOT_CACHED;
- spin_unlock(&inode->i_lock);
- if (old != ACL_NOT_CACHED)
- posix_acl_release(old);
-}
-
-static inline void forget_all_cached_acls(struct inode *inode)
-{
- struct posix_acl *old_access, *old_default;
- spin_lock(&inode->i_lock);
- old_access = inode->i_acl;
- old_default = inode->i_default_acl;
- inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
- spin_unlock(&inode->i_lock);
- if (old_access != ACL_NOT_CACHED)
- posix_acl_release(old_access);
- if (old_default != ACL_NOT_CACHED)
- posix_acl_release(old_default);
-}
-#endif
+struct posix_acl **acl_by_type(struct inode *inode, int type);
+struct posix_acl *get_cached_acl(struct inode *inode, int type);
+struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type);
+void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl);
+void forget_cached_acl(struct inode *inode, int type);
+void forget_all_cached_acls(struct inode *inode);
static inline void cache_no_acl(struct inode *inode)
{
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 26fb95ce5080..fa47e2708c01 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -5,6 +5,7 @@
#include <linux/init.h>
#include <linux/kern_levels.h>
#include <linux/linkage.h>
+#include <linux/cache.h>
extern const char linux_banner[];
extern const char linux_proc_banner[];
@@ -260,17 +261,17 @@ extern asmlinkage void dump_stack(void) __cold;
*/
#ifdef CONFIG_PRINTK
-#define printk_once(fmt, ...) \
-({ \
- static bool __print_once; \
- \
- if (!__print_once) { \
- __print_once = true; \
- printk(fmt, ##__VA_ARGS__); \
- } \
+#define printk_once(fmt, ...) \
+({ \
+ static bool __print_once __read_mostly; \
+ \
+ if (!__print_once) { \
+ __print_once = true; \
+ printk(fmt, ##__VA_ARGS__); \
+ } \
})
#else
-#define printk_once(fmt, ...) \
+#define printk_once(fmt, ...) \
no_printk(fmt, ##__VA_ARGS__)
#endif
diff --git a/include/linux/ramfs.h b/include/linux/ramfs.h
index 753207c8ce20..ecc730977a5a 100644
--- a/include/linux/ramfs.h
+++ b/include/linux/ramfs.h
@@ -14,13 +14,6 @@ ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
}
#else
extern int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize);
-extern unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
- unsigned long addr,
- unsigned long len,
- unsigned long pgoff,
- unsigned long flags);
-
-extern int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma);
#endif
extern const struct file_operations ramfs_file_operations;
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 6dacb93a6d94..1da693d51255 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -184,13 +184,13 @@ static inline void page_dup_rmap(struct page *page)
int page_referenced(struct page *, int is_locked,
struct mem_cgroup *memcg, unsigned long *vm_flags);
int page_referenced_one(struct page *, struct vm_area_struct *,
- unsigned long address, unsigned int *mapcount, unsigned long *vm_flags);
+ unsigned long address, void *arg);
#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)
int try_to_unmap(struct page *, enum ttu_flags flags);
int try_to_unmap_one(struct page *, struct vm_area_struct *,
- unsigned long address, enum ttu_flags flags);
+ unsigned long address, void *arg);
/*
* Called from mm/filemap_xip.c to unmap empty zero page
@@ -236,10 +236,27 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma);
int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
/*
- * Called by migrate.c to remove migration ptes, but might be used more later.
+ * rmap_walk_control: To control rmap traversing for specific needs
+ *
+ * arg: passed to rmap_one() and invalid_vma()
+ * rmap_one: executed on each vma where page is mapped
+ * done: for checking traversing termination condition
+ * file_nonlinear: for handling file nonlinear mapping
+ * anon_lock: for getting anon_lock by optimized way rather than default
+ * invalid_vma: for skipping uninterested vma
*/
-int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
- struct vm_area_struct *, unsigned long, void *), void *arg);
+struct rmap_walk_control {
+ void *arg;
+ int (*rmap_one)(struct page *page, struct vm_area_struct *vma,
+ unsigned long addr, void *arg);
+ int (*done)(struct page *page);
+ int (*file_nonlinear)(struct page *, struct address_space *,
+ struct vm_area_struct *vma);
+ struct anon_vma *(*anon_lock)(struct page *page);
+ bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
+};
+
+int rmap_walk(struct page *page, struct rmap_walk_control *rwc);
#else /* !CONFIG_MMU */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ffccdad050b5..68a0e84463a0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -229,7 +229,7 @@ extern char ___assert_task_state[1 - 2*!!(
/* get_task_state() */
#define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \
TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
- __TASK_TRACED)
+ __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD)
#define task_is_traced(task) ((task->state & __TASK_TRACED) != 0)
#define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0)
@@ -391,22 +391,33 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}
#endif
-
-extern void set_dumpable(struct mm_struct *mm, int value);
-extern int get_dumpable(struct mm_struct *mm);
-
#define SUID_DUMP_DISABLE 0 /* No setuid dumping */
#define SUID_DUMP_USER 1 /* Dump as user of process */
#define SUID_DUMP_ROOT 2 /* Dump as root */
/* mm flags */
-/* dumpable bits */
-#define MMF_DUMPABLE 0 /* core dump is permitted */
-#define MMF_DUMP_SECURELY 1 /* core file is readable only by root */
+/* for SUID_DUMP_* above */
#define MMF_DUMPABLE_BITS 2
#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1)
+extern void set_dumpable(struct mm_struct *mm, int value);
+/*
+ * This returns the actual value of the suid_dumpable flag. For things
+ * that are using this for checking for privilege transitions, it must
+ * test against SUID_DUMP_USER rather than treating it as a boolean
+ * value.
+ */
+static inline int __get_dumpable(unsigned long mm_flags)
+{
+ return mm_flags & MMF_DUMPABLE_MASK;
+}
+
+static inline int get_dumpable(struct mm_struct *mm)
+{
+ return __get_dumpable(mm->flags);
+}
+
/* coredump filter bits */
#define MMF_DUMP_ANON_PRIVATE 2
#define MMF_DUMP_ANON_SHARED 3
@@ -549,6 +560,7 @@ struct signal_struct {
atomic_t sigcnt;
atomic_t live;
int nr_threads;
+ struct list_head thread_head;
wait_queue_head_t wait_chldexit; /* for wait4() */
@@ -1227,7 +1239,6 @@ struct task_struct {
/* Used for emulating ABI behavior of previous Linux versions */
unsigned int personality;
- unsigned did_exec:1;
unsigned in_execve:1; /* Tell the LSMs that the process is doing an
* execve */
unsigned in_iowait:1;
@@ -1271,6 +1282,7 @@ struct task_struct {
/* PID/PID hash table linkage. */
struct pid_link pids[PIDTYPE_MAX];
struct list_head thread_group;
+ struct list_head thread_node;
struct completion *vfork_done; /* for vfork() */
int __user *set_child_tid; /* CLONE_CHILD_SETTID */
@@ -2282,8 +2294,6 @@ extern struct mm_struct *get_task_mm(struct task_struct *task);
extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode);
/* Remove the current tasks stale references to the old mm_struct */
extern void mm_release(struct task_struct *, struct mm_struct *);
-/* Allocate a new mm structure and copy contents from tsk->mm */
-extern struct mm_struct *dup_mm(struct task_struct *tsk);
extern int copy_thread(unsigned long, unsigned long, unsigned long,
struct task_struct *);
@@ -2341,6 +2351,16 @@ extern bool current_is_single_threaded(void);
#define while_each_thread(g, t) \
while ((t = next_thread(t)) != g)
+#define __for_each_thread(signal, t) \
+ list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node)
+
+#define for_each_thread(p, t) \
+ __for_each_thread((p)->signal, t)
+
+/* Careful: this is a double loop, 'break' won't work as expected. */
+#define for_each_process_thread(p, t) \
+ for_each_process(p) for_each_thread(p, t)
+
static inline int get_nr_threads(struct task_struct *tsk)
{
return tsk->signal->nr_threads;
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 31e0193cb0c5..b13cf430764f 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -99,4 +99,8 @@ extern int sched_rt_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos);
+extern int sysctl_numa_balancing(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos);
+
#endif /* _SCHED_SYSCTL_H */
diff --git a/include/linux/shm.h b/include/linux/shm.h
index 429c1995d756..1e2cd2e6b540 100644
--- a/include/linux/shm.h
+++ b/include/linux/shm.h
@@ -9,7 +9,7 @@
struct shmid_kernel /* private to the kernel */
{
struct kern_ipc_perm shm_perm;
- struct file * shm_file;
+ struct file *shm_file;
unsigned long shm_nattch;
unsigned long shm_segsz;
time_t shm_atim;
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 1e2f4fe12773..a060142aa5f5 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -513,7 +513,9 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
*
* Both the root cache and the child caches will have it. For the root cache,
* this will hold a dynamically allocated array large enough to hold
- * information about the currently limited memcgs in the system.
+ * information about the currently limited memcgs in the system. To allow the
+ * array to be accessed without taking any locks, on relocation we free the old
+ * version only after a grace period.
*
* Child caches will hold extra metadata needed for its operation. Fields are:
*
@@ -528,7 +530,10 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
struct memcg_cache_params {
bool is_root_cache;
union {
- struct kmem_cache *memcg_caches[0];
+ struct {
+ struct rcu_head rcu_head;
+ struct kmem_cache *memcg_caches[0];
+ };
struct {
struct mem_cgroup *memcg;
struct list_head list;
diff --git a/include/linux/splice.h b/include/linux/splice.h
index 74575cbf2d6f..0e43906d2fda 100644
--- a/include/linux/splice.h
+++ b/include/linux/splice.h
@@ -24,7 +24,8 @@
* Passed to the actors
*/
struct splice_desc {
- unsigned int len, total_len; /* current and remaining length */
+ size_t total_len; /* remaining length */
+ unsigned int len; /* current length */
unsigned int flags; /* splice flags */
/*
* actor() private data
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index c557c6d096de..070de3de8856 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -71,12 +71,12 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
THP_ZERO_PAGE_ALLOC,
THP_ZERO_PAGE_ALLOC_FAILED,
#endif
-#ifdef CONFIG_SMP
+#ifdef CONFIG_DEBUG_TLBFLUSH
NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */
NR_TLB_REMOTE_FLUSH_RECEIVED,/* cpu received ipi for flush */
-#endif
NR_TLB_LOCAL_FLUSH_ALL,
NR_TLB_LOCAL_FLUSH_ONE,
+#endif
NR_VM_EVENT_ITEMS
};
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index e4b948080d20..80ebba9c2e87 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -83,6 +83,14 @@ static inline void vm_events_fold_cpu(int cpu)
#define count_vm_numa_events(x, y) do { (void)(y); } while (0)
#endif /* CONFIG_NUMA_BALANCING */
+#ifdef CONFIG_DEBUG_TLBFLUSH
+#define count_vm_tlb_event(x) count_vm_event(x)
+#define count_vm_tlb_events(x, y) count_vm_events(x, y)
+#else
+#define count_vm_tlb_event(x) do {} while (0)
+#define count_vm_tlb_events(x, y) do { (void)(y); } while (0)
+#endif
+
#define __count_zone_vm_events(item, zone, delta) \
__count_vm_events(item##_NORMAL - ZONE_NORMAL + \
zone_idx(zone), delta)
diff --git a/include/linux/w1-gpio.h b/include/linux/w1-gpio.h
index 065e3ae79ab0..d58594a32324 100644
--- a/include/linux/w1-gpio.h
+++ b/include/linux/w1-gpio.h
@@ -20,6 +20,7 @@ struct w1_gpio_platform_data {
unsigned int is_open_drain:1;
void (*enable_external_pullup)(int enable);
unsigned int ext_pullup_enable_pin;
+ unsigned int pullup_duration;
};
#endif /* _LINUX_W1_GPIO_H */
diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
index fde1b3e94c7d..06f544ef2f6f 100644
--- a/include/trace/events/compaction.h
+++ b/include/trace/events/compaction.h
@@ -67,6 +67,48 @@ TRACE_EVENT(mm_compaction_migratepages,
__entry->nr_failed)
);
+TRACE_EVENT(mm_compaction_begin,
+ TP_PROTO(unsigned long zone_start, unsigned long migrate_start,
+ unsigned long free_start, unsigned long zone_end),
+
+ TP_ARGS(zone_start, migrate_start, free_start, zone_end),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, zone_start)
+ __field(unsigned long, migrate_start)
+ __field(unsigned long, free_start)
+ __field(unsigned long, zone_end)
+ ),
+
+ TP_fast_assign(
+ __entry->zone_start = zone_start;
+ __entry->migrate_start = migrate_start;
+ __entry->free_start = free_start;
+ __entry->zone_end = zone_end;
+ ),
+
+ TP_printk("zone_start=%lu migrate_start=%lu free_start=%lu zone_end=%lu",
+ __entry->zone_start,
+ __entry->migrate_start,
+ __entry->free_start,
+ __entry->zone_end)
+);
+
+TRACE_EVENT(mm_compaction_end,
+ TP_PROTO(int status),
+
+ TP_ARGS(status),
+
+ TP_STRUCT__entry(
+ __field(int, status)
+ ),
+
+ TP_fast_assign(
+ __entry->status = status;
+ ),
+
+ TP_printk("status=%d", __entry->status)
+);
#endif /* _TRACE_COMPACTION_H */
diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
index ec2a6ccfd7e5..3075ffbb9a83 100644
--- a/include/trace/events/migrate.h
+++ b/include/trace/events/migrate.h
@@ -45,6 +45,32 @@ TRACE_EVENT(mm_migrate_pages,
__print_symbolic(__entry->reason, MIGRATE_REASON))
);
+TRACE_EVENT(mm_numa_migrate_ratelimit,
+
+ TP_PROTO(struct task_struct *p, int dst_nid, unsigned long nr_pages),
+
+ TP_ARGS(p, dst_nid, nr_pages),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN)
+ __field( pid_t, pid)
+ __field( int, dst_nid)
+ __field( unsigned long, nr_pages)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+ __entry->pid = p->pid;
+ __entry->dst_nid = dst_nid;
+ __entry->nr_pages = nr_pages;
+ ),
+
+ TP_printk("comm=%s pid=%d dst_nid=%d nr_pages=%lu",
+ __entry->comm,
+ __entry->pid,
+ __entry->dst_nid,
+ __entry->nr_pages)
+);
#endif /* _TRACE_MIGRATE_H */
/* This part must be outside protection */
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 04c308413a5d..67e1bbf83695 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -443,6 +443,93 @@ TRACE_EVENT(sched_process_hang,
);
#endif /* CONFIG_DETECT_HUNG_TASK */
+DECLARE_EVENT_CLASS(sched_move_task_template,
+
+ TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu),
+
+ TP_ARGS(tsk, src_cpu, dst_cpu),
+
+ TP_STRUCT__entry(
+ __field( pid_t, pid )
+ __field( pid_t, tgid )
+ __field( pid_t, ngid )
+ __field( int, src_cpu )
+ __field( int, src_nid )
+ __field( int, dst_cpu )
+ __field( int, dst_nid )
+ ),
+
+ TP_fast_assign(
+ __entry->pid = task_pid_nr(tsk);
+ __entry->tgid = task_tgid_nr(tsk);
+ __entry->ngid = task_numa_group_id(tsk);
+ __entry->src_cpu = src_cpu;
+ __entry->src_nid = cpu_to_node(src_cpu);
+ __entry->dst_cpu = dst_cpu;
+ __entry->dst_nid = cpu_to_node(dst_cpu);
+ ),
+
+ TP_printk("pid=%d tgid=%d ngid=%d src_cpu=%d src_nid=%d dst_cpu=%d dst_nid=%d",
+ __entry->pid, __entry->tgid, __entry->ngid,
+ __entry->src_cpu, __entry->src_nid,
+ __entry->dst_cpu, __entry->dst_nid)
+);
+
+/*
+ * Tracks migration of tasks from one runqueue to another. Can be used to
+ * detect if automatic NUMA balancing is bouncing between nodes
+ */
+DEFINE_EVENT(sched_move_task_template, sched_move_numa,
+ TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu),
+
+ TP_ARGS(tsk, src_cpu, dst_cpu)
+);
+
+DEFINE_EVENT(sched_move_task_template, sched_stick_numa,
+ TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu),
+
+ TP_ARGS(tsk, src_cpu, dst_cpu)
+);
+
+TRACE_EVENT(sched_swap_numa,
+
+ TP_PROTO(struct task_struct *src_tsk, int src_cpu,
+ struct task_struct *dst_tsk, int dst_cpu),
+
+ TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu),
+
+ TP_STRUCT__entry(
+ __field( pid_t, src_pid )
+ __field( pid_t, src_tgid )
+ __field( pid_t, src_ngid )
+ __field( int, src_cpu )
+ __field( int, src_nid )
+ __field( pid_t, dst_pid )
+ __field( pid_t, dst_tgid )
+ __field( pid_t, dst_ngid )
+ __field( int, dst_cpu )
+ __field( int, dst_nid )
+ ),
+
+ TP_fast_assign(
+ __entry->src_pid = task_pid_nr(src_tsk);
+ __entry->src_tgid = task_tgid_nr(src_tsk);
+ __entry->src_ngid = task_numa_group_id(src_tsk);
+ __entry->src_cpu = src_cpu;
+ __entry->src_nid = cpu_to_node(src_cpu);
+ __entry->dst_pid = task_pid_nr(dst_tsk);
+ __entry->dst_tgid = task_tgid_nr(dst_tsk);
+ __entry->dst_ngid = task_numa_group_id(dst_tsk);
+ __entry->dst_cpu = dst_cpu;
+ __entry->dst_nid = cpu_to_node(dst_cpu);
+ ),
+
+ TP_printk("src_pid=%d src_tgid=%d src_ngid=%d src_cpu=%d src_nid=%d dst_pid=%d dst_tgid=%d dst_ngid=%d dst_cpu=%d dst_nid=%d",
+ __entry->src_pid, __entry->src_tgid, __entry->src_ngid,
+ __entry->src_cpu, __entry->src_nid,
+ __entry->dst_pid, __entry->dst_tgid, __entry->dst_ngid,
+ __entry->dst_cpu, __entry->dst_nid)
+);
#endif /* _TRACE_SCHED_H */
/* This part must be outside protection */
diff --git a/include/uapi/asm-generic/types.h b/include/uapi/asm-generic/types.h
index bd39806013b5..a3877926b0d4 100644
--- a/include/uapi/asm-generic/types.h
+++ b/include/uapi/asm-generic/types.h
@@ -1,8 +1,7 @@
#ifndef _ASM_GENERIC_TYPES_H
#define _ASM_GENERIC_TYPES_H
/*
- * int-ll64 is used practically everywhere now,
- * so use it as a reasonable default.
+ * int-ll64 is used everywhere now.
*/
#include <asm-generic/int-ll64.h>
diff --git a/init/initramfs.c b/init/initramfs.c
index a67ef9dbda9d..93b61396756b 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -583,7 +583,7 @@ static int __init populate_rootfs(void)
{
char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size);
if (err)
- panic(err); /* Failed to decompress INTERNAL initramfs */
+ panic("%s", err); /* Failed to decompress INTERNAL initramfs */
if (initrd_start) {
#ifdef CONFIG_BLK_DEV_RAM
int fd;
diff --git a/init/main.c b/init/main.c
index febc511e078a..f333385d9a4f 100644
--- a/init/main.c
+++ b/init/main.c
@@ -99,10 +99,6 @@ extern void radix_tree_init(void);
static inline void mark_rodata_ro(void) { }
#endif
-#ifdef CONFIG_TC
-extern void tc_init(void);
-#endif
-
/*
* Debug helper: via this flag we know that we are in 'early bootup code'
* where only the boot processor is running with IRQ disabled. This means
@@ -282,7 +278,7 @@ static int __init unknown_bootoption(char *param, char *val, const char *unused)
unsigned int i;
for (i = 0; envp_init[i]; i++) {
if (i == MAX_INIT_ENVS) {
- panic_later = "Too many boot env vars at `%s'";
+ panic_later = "env";
panic_param = param;
}
if (!strncmp(param, envp_init[i], val - param))
@@ -294,7 +290,7 @@ static int __init unknown_bootoption(char *param, char *val, const char *unused)
unsigned int i;
for (i = 0; argv_init[i]; i++) {
if (i == MAX_INIT_ARGS) {
- panic_later = "Too many boot init vars at `%s'";
+ panic_later = "init";
panic_param = param;
}
}
@@ -355,9 +351,11 @@ static inline void smp_prepare_cpus(unsigned int maxcpus) { }
*/
static void __init setup_command_line(char *command_line)
{
- saved_command_line = alloc_bootmem(strlen (boot_command_line)+1);
- initcall_command_line = alloc_bootmem(strlen (boot_command_line)+1);
- static_command_line = alloc_bootmem(strlen (command_line)+1);
+ saved_command_line =
+ memblock_virt_alloc(strlen(boot_command_line) + 1, 0);
+ initcall_command_line =
+ memblock_virt_alloc(strlen(boot_command_line) + 1, 0);
+ static_command_line = memblock_virt_alloc(strlen(command_line) + 1, 0);
strcpy (saved_command_line, boot_command_line);
strcpy (static_command_line, command_line);
}
@@ -476,7 +474,7 @@ static void __init mm_init(void)
mem_init();
kmem_cache_init();
percpu_init_late();
- pgtable_cache_init();
+ pgtable_init();
vmalloc_init();
}
@@ -584,7 +582,8 @@ asmlinkage void __init start_kernel(void)
*/
console_init();
if (panic_later)
- panic(panic_later, panic_param);
+ panic("Too many boot %s vars at `%s'", panic_later,
+ panic_param);
lockdep_info();
diff --git a/ipc/compat.c b/ipc/compat.c
index 892f6585dd60..f71e962756d8 100644
--- a/ipc/compat.c
+++ b/ipc/compat.c
@@ -197,7 +197,7 @@ static inline int __put_compat_ipc_perm(struct ipc64_perm *p,
static inline int get_compat_semid64_ds(struct semid64_ds *s64,
struct compat_semid64_ds __user *up64)
{
- if (!access_ok (VERIFY_READ, up64, sizeof(*up64)))
+ if (!access_ok(VERIFY_READ, up64, sizeof(*up64)))
return -EFAULT;
return __get_compat_ipc64_perm(&s64->sem_perm, &up64->sem_perm);
}
@@ -205,7 +205,7 @@ static inline int get_compat_semid64_ds(struct semid64_ds *s64,
static inline int get_compat_semid_ds(struct semid64_ds *s,
struct compat_semid_ds __user *up)
{
- if (!access_ok (VERIFY_READ, up, sizeof(*up)))
+ if (!access_ok(VERIFY_READ, up, sizeof(*up)))
return -EFAULT;
return __get_compat_ipc_perm(&s->sem_perm, &up->sem_perm);
}
@@ -215,7 +215,7 @@ static inline int put_compat_semid64_ds(struct semid64_ds *s64,
{
int err;
- if (!access_ok (VERIFY_WRITE, up64, sizeof(*up64)))
+ if (!access_ok(VERIFY_WRITE, up64, sizeof(*up64)))
return -EFAULT;
err = __put_compat_ipc64_perm(&s64->sem_perm, &up64->sem_perm);
err |= __put_user(s64->sem_otime, &up64->sem_otime);
@@ -229,7 +229,7 @@ static inline int put_compat_semid_ds(struct semid64_ds *s,
{
int err;
- if (!access_ok (VERIFY_WRITE, up, sizeof(*up)))
+ if (!access_ok(VERIFY_WRITE, up, sizeof(*up)))
return -EFAULT;
err = __put_compat_ipc_perm(&s->sem_perm, &up->sem_perm);
err |= __put_user(s->sem_otime, &up->sem_otime);
@@ -288,11 +288,11 @@ static long do_compat_semctl(int first, int second, int third, u32 pad)
break;
case IPC_SET:
- if (version == IPC_64) {
+ if (version == IPC_64)
err = get_compat_semid64_ds(&s64, compat_ptr(pad));
- } else {
+ else
err = get_compat_semid_ds(&s64, compat_ptr(pad));
- }
+
up64 = compat_alloc_user_space(sizeof(s64));
if (copy_to_user(up64, &s64, sizeof(s64)))
err = -EFAULT;
@@ -376,7 +376,7 @@ COMPAT_SYSCALL_DEFINE6(ipc, u32, call, int, first, int, second,
struct compat_ipc_kludge ipck;
if (!uptr)
return -EINVAL;
- if (copy_from_user (&ipck, uptr, sizeof(ipck)))
+ if (copy_from_user(&ipck, uptr, sizeof(ipck)))
return -EFAULT;
uptr = compat_ptr(ipck.msgp);
fifth = ipck.msgtyp;
@@ -515,11 +515,11 @@ long compat_sys_msgctl(int first, int second, void __user *uptr)
break;
case IPC_SET:
- if (version == IPC_64) {
+ if (version == IPC_64)
err = get_compat_msqid64(&m64, uptr);
- } else {
+ else
err = get_compat_msqid(&m64, uptr);
- }
+
if (err)
break;
p = compat_alloc_user_space(sizeof(m64));
@@ -702,11 +702,11 @@ long compat_sys_shmctl(int first, int second, void __user *uptr)
case IPC_SET:
- if (version == IPC_64) {
+ if (version == IPC_64)
err = get_compat_shmid64_ds(&s64, uptr);
- } else {
+ else
err = get_compat_shmid_ds(&s64, uptr);
- }
+
if (err)
break;
p = compat_alloc_user_space(sizeof(s64));
diff --git a/ipc/compat_mq.c b/ipc/compat_mq.c
index 380ea4fe08e7..63d7c6de335b 100644
--- a/ipc/compat_mq.c
+++ b/ipc/compat_mq.c
@@ -64,7 +64,7 @@ asmlinkage long compat_sys_mq_open(const char __user *u_name,
return sys_mq_open(u_name, oflag, mode, p);
}
-static int compat_prepare_timeout(struct timespec __user * *p,
+static int compat_prepare_timeout(struct timespec __user **p,
const struct compat_timespec __user *u)
{
struct timespec ts;
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index b0e99deb6d05..17028648cfeb 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -164,21 +164,21 @@ static struct ctl_table ipc_kern_table[] = {
{
.procname = "shmmax",
.data = &init_ipc_ns.shm_ctlmax,
- .maxlen = sizeof (init_ipc_ns.shm_ctlmax),
+ .maxlen = sizeof(init_ipc_ns.shm_ctlmax),
.mode = 0644,
.proc_handler = proc_ipc_doulongvec_minmax,
},
{
.procname = "shmall",
.data = &init_ipc_ns.shm_ctlall,
- .maxlen = sizeof (init_ipc_ns.shm_ctlall),
+ .maxlen = sizeof(init_ipc_ns.shm_ctlall),
.mode = 0644,
.proc_handler = proc_ipc_doulongvec_minmax,
},
{
.procname = "shmmni",
.data = &init_ipc_ns.shm_ctlmni,
- .maxlen = sizeof (init_ipc_ns.shm_ctlmni),
+ .maxlen = sizeof(init_ipc_ns.shm_ctlmni),
.mode = 0644,
.proc_handler = proc_ipc_dointvec,
},
@@ -194,7 +194,7 @@ static struct ctl_table ipc_kern_table[] = {
{
.procname = "msgmax",
.data = &init_ipc_ns.msg_ctlmax,
- .maxlen = sizeof (init_ipc_ns.msg_ctlmax),
+ .maxlen = sizeof(init_ipc_ns.msg_ctlmax),
.mode = 0644,
.proc_handler = proc_ipc_dointvec_minmax,
.extra1 = &zero,
@@ -203,7 +203,7 @@ static struct ctl_table ipc_kern_table[] = {
{
.procname = "msgmni",
.data = &init_ipc_ns.msg_ctlmni,
- .maxlen = sizeof (init_ipc_ns.msg_ctlmni),
+ .maxlen = sizeof(init_ipc_ns.msg_ctlmni),
.mode = 0644,
.proc_handler = proc_ipc_callback_dointvec_minmax,
.extra1 = &zero,
@@ -212,7 +212,7 @@ static struct ctl_table ipc_kern_table[] = {
{
.procname = "msgmnb",
.data = &init_ipc_ns.msg_ctlmnb,
- .maxlen = sizeof (init_ipc_ns.msg_ctlmnb),
+ .maxlen = sizeof(init_ipc_ns.msg_ctlmnb),
.mode = 0644,
.proc_handler = proc_ipc_dointvec_minmax,
.extra1 = &zero,
@@ -221,7 +221,7 @@ static struct ctl_table ipc_kern_table[] = {
{
.procname = "sem",
.data = &init_ipc_ns.sem_ctls,
- .maxlen = 4*sizeof (int),
+ .maxlen = 4*sizeof(int),
.mode = 0644,
.proc_handler = proc_ipc_dointvec,
},
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 95827ce2f3c7..ccf1f9fd263a 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -6,7 +6,7 @@
*
* Spinlocks: Mohamed Abbas (abbas.mohamed@intel.com)
* Lockless receive & send, fd based notify:
- * Manfred Spraul (manfred@colorfullife.com)
+ * Manfred Spraul (manfred@colorfullife.com)
*
* Audit: George Wilson (ltcgcw@us.ibm.com)
*
@@ -73,7 +73,7 @@ struct mqueue_inode_info {
struct mq_attr attr;
struct sigevent notify;
- struct pid* notify_owner;
+ struct pid *notify_owner;
struct user_namespace *notify_user_ns;
struct user_struct *user; /* user who created, for accounting */
struct sock *notify_sock;
@@ -92,7 +92,7 @@ static void remove_notification(struct mqueue_inode_info *info);
static struct kmem_cache *mqueue_inode_cachep;
-static struct ctl_table_header * mq_sysctl_table;
+static struct ctl_table_header *mq_sysctl_table;
static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode)
{
@@ -466,13 +466,13 @@ out_unlock:
static int mqueue_unlink(struct inode *dir, struct dentry *dentry)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = dentry->d_inode;
dir->i_ctime = dir->i_mtime = dir->i_atime = CURRENT_TIME;
dir->i_size -= DIRENT_SIZE;
- drop_nlink(inode);
- dput(dentry);
- return 0;
+ drop_nlink(inode);
+ dput(dentry);
+ return 0;
}
/*
@@ -622,7 +622,7 @@ static struct ext_wait_queue *wq_get_first_waiter(
static inline void set_cookie(struct sk_buff *skb, char code)
{
- ((char*)skb->data)[NOTIFY_COOKIE_LEN-1] = code;
+ ((char *)skb->data)[NOTIFY_COOKIE_LEN-1] = code;
}
/*
@@ -1303,11 +1303,11 @@ retry:
out_fput:
fdput(f);
out:
- if (sock) {
+ if (sock)
netlink_detachskb(sock, nc);
- } else if (nc) {
+ else if (nc)
dev_kfree_skb(nc);
- }
+
return ret;
}
diff --git a/ipc/msg.c b/ipc/msg.c
index 558aa91186b6..245db1140ad6 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -253,8 +253,14 @@ static void expunge_all(struct msg_queue *msq, int res)
struct msg_receiver *msr, *t;
list_for_each_entry_safe(msr, t, &msq->q_receivers, r_list) {
- msr->r_msg = NULL;
+ msr->r_msg = NULL; /* initialize expunge ordering */
wake_up_process(msr->r_tsk);
+ /*
+ * Ensure that the wakeup is visible before setting r_msg as
+ * the receiving end depends on it: either spinning on a nil,
+ * or dealing with -EAGAIN cases. See lockless receive part 1
+ * and 2 in do_msgrcv().
+ */
smp_mb();
msr->r_msg = ERR_PTR(res);
}
@@ -318,7 +324,7 @@ SYSCALL_DEFINE2(msgget, key_t, key, int, msgflg)
static inline unsigned long
copy_msqid_to_user(void __user *buf, struct msqid64_ds *in, int version)
{
- switch(version) {
+ switch (version) {
case IPC_64:
return copy_to_user(buf, in, sizeof(*in));
case IPC_OLD:
@@ -363,7 +369,7 @@ copy_msqid_to_user(void __user *buf, struct msqid64_ds *in, int version)
static inline unsigned long
copy_msqid_from_user(struct msqid64_ds *out, void __user *buf, int version)
{
- switch(version) {
+ switch (version) {
case IPC_64:
if (copy_from_user(out, buf, sizeof(*out)))
return -EFAULT;
@@ -375,9 +381,9 @@ copy_msqid_from_user(struct msqid64_ds *out, void __user *buf, int version)
if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old)))
return -EFAULT;
- out->msg_perm.uid = tbuf_old.msg_perm.uid;
- out->msg_perm.gid = tbuf_old.msg_perm.gid;
- out->msg_perm.mode = tbuf_old.msg_perm.mode;
+ out->msg_perm.uid = tbuf_old.msg_perm.uid;
+ out->msg_perm.gid = tbuf_old.msg_perm.gid;
+ out->msg_perm.mode = tbuf_old.msg_perm.mode;
if (tbuf_old.msg_qbytes == 0)
out->msg_qbytes = tbuf_old.msg_lqbytes;
@@ -606,13 +612,13 @@ SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf)
static int testmsg(struct msg_msg *msg, long type, int mode)
{
- switch(mode)
+ switch (mode)
{
case SEARCH_ANY:
case SEARCH_NUMBER:
return 1;
case SEARCH_LESSEQUAL:
- if (msg->m_type <=type)
+ if (msg->m_type <= type)
return 1;
break;
case SEARCH_EQUAL:
@@ -638,15 +644,22 @@ static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg)
list_del(&msr->r_list);
if (msr->r_maxsize < msg->m_ts) {
+ /* initialize pipelined send ordering */
msr->r_msg = NULL;
wake_up_process(msr->r_tsk);
- smp_mb();
+ smp_mb(); /* see barrier comment below */
msr->r_msg = ERR_PTR(-E2BIG);
} else {
msr->r_msg = NULL;
msq->q_lrpid = task_pid_vnr(msr->r_tsk);
msq->q_rtime = get_seconds();
wake_up_process(msr->r_tsk);
+ /*
+ * Ensure that the wakeup is visible before
+ * setting r_msg, as the receiving end depends
+ * on it. See lockless receive part 1 and 2 in
+ * do_msgrcv().
+ */
smp_mb();
msr->r_msg = msg;
@@ -654,6 +667,7 @@ static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg)
}
}
}
+
return 0;
}
@@ -696,7 +710,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
goto out_unlock0;
/* raced with RMID? */
- if (msq->q_perm.deleted) {
+ if (!ipc_valid_object(&msq->q_perm)) {
err = -EIDRM;
goto out_unlock0;
}
@@ -716,6 +730,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
goto out_unlock0;
}
+ /* enqueue the sender and prepare to block */
ss_add(msq, &s);
if (!ipc_rcu_getref(msq)) {
@@ -731,7 +746,8 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
ipc_lock_object(&msq->q_perm);
ipc_rcu_putref(msq, ipc_rcu_free);
- if (msq->q_perm.deleted) {
+ /* raced with RMID? */
+ if (!ipc_valid_object(&msq->q_perm)) {
err = -EIDRM;
goto out_unlock0;
}
@@ -909,7 +925,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgfl
ipc_lock_object(&msq->q_perm);
/* raced with RMID? */
- if (msq->q_perm.deleted) {
+ if (!ipc_valid_object(&msq->q_perm)) {
msg = ERR_PTR(-EIDRM);
goto out_unlock0;
}
@@ -983,7 +999,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgfl
* wake_up_process(). There is a race with exit(), see
* ipc/mqueue.c for the details.
*/
- msg = (struct msg_msg*)msr_d.r_msg;
+ msg = (struct msg_msg *)msr_d.r_msg;
while (msg == NULL) {
cpu_relax();
msg = (struct msg_msg *)msr_d.r_msg;
@@ -1004,7 +1020,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgfl
/* Lockless receive, part 4:
* Repeat test after acquiring the spinlock.
*/
- msg = (struct msg_msg*)msr_d.r_msg;
+ msg = (struct msg_msg *)msr_d.r_msg;
if (msg != ERR_PTR(-EAGAIN))
goto out_unlock0;
diff --git a/ipc/sem.c b/ipc/sem.c
index db9d241af133..bee555417312 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -188,7 +188,7 @@ void sem_exit_ns(struct ipc_namespace *ns)
}
#endif
-void __init sem_init (void)
+void __init sem_init(void)
{
sem_init_ns(&init_ipc_ns);
ipc_init_proc_interface("sysvipc/sem",
@@ -225,7 +225,7 @@ static void unmerge_queues(struct sem_array *sma)
}
/**
- * merge_queues - Merge single semop queues into global queue
+ * merge_queues - merge single semop queues into global queue
* @sma: semaphore array
*
* This function merges all per-semaphore queues into the global queue.
@@ -394,7 +394,7 @@ static inline struct sem_array *sem_obtain_lock(struct ipc_namespace *ns,
/* ipc_rmid() may have already freed the ID while sem_lock
* was spinning: verify that the structure is still valid
*/
- if (!ipcp->deleted)
+ if (ipc_valid_object(ipcp))
return container_of(ipcp, struct sem_array, sem_perm);
sem_unlock(sma, *locknum);
@@ -445,11 +445,11 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s)
* * call wake_up_process
* * set queue.status to the final value.
* - the previously blocked thread checks queue.status:
- * * if it's IN_WAKEUP, then it must wait until the value changes
- * * if it's not -EINTR, then the operation was completed by
- * update_queue. semtimedop can return queue.status without
- * performing any operation on the sem array.
- * * otherwise it must acquire the spinlock and check what's up.
+ * * if it's IN_WAKEUP, then it must wait until the value changes
+ * * if it's not -EINTR, then the operation was completed by
+ * update_queue. semtimedop can return queue.status without
+ * performing any operation on the sem array.
+ * * otherwise it must acquire the spinlock and check what's up.
*
* The two-stage algorithm is necessary to protect against the following
* races:
@@ -474,7 +474,6 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s)
*
* Called with sem_ids.rwsem held (as a writer)
*/
-
static int newary(struct ipc_namespace *ns, struct ipc_params *params)
{
int id;
@@ -491,12 +490,12 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params)
if (ns->used_sems + nsems > ns->sc_semmns)
return -ENOSPC;
- size = sizeof (*sma) + nsems * sizeof (struct sem);
+ size = sizeof(*sma) + nsems * sizeof(struct sem);
sma = ipc_rcu_alloc(size);
- if (!sma) {
+ if (!sma)
return -ENOMEM;
- }
- memset (sma, 0, size);
+
+ memset(sma, 0, size);
sma->sem_perm.mode = (semflg & S_IRWXUGO);
sma->sem_perm.key = key;
@@ -584,10 +583,11 @@ SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg)
return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params);
}
-/** perform_atomic_semop - Perform (if possible) a semaphore operation
+/**
+ * perform_atomic_semop - Perform (if possible) a semaphore operation
* @sma: semaphore array
* @sops: array with operations that should be checked
- * @nsems: number of sops
+ * @nsops: number of operations
* @un: undo array
* @pid: pid that did the change
*
@@ -595,19 +595,18 @@ SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg)
* Returns 1 if the operation is impossible, the caller must sleep.
* Negative values are error codes.
*/
-
static int perform_atomic_semop(struct sem_array *sma, struct sembuf *sops,
int nsops, struct sem_undo *un, int pid)
{
int result, sem_op;
struct sembuf *sop;
- struct sem * curr;
+ struct sem *curr;
for (sop = sops; sop < sops + nsops; sop++) {
curr = sma->sem_base + sop->sem_num;
sem_op = sop->sem_op;
result = curr->semval;
-
+
if (!sem_op && result)
goto would_block;
@@ -616,25 +615,24 @@ static int perform_atomic_semop(struct sem_array *sma, struct sembuf *sops,
goto would_block;
if (result > SEMVMX)
goto out_of_range;
+
if (sop->sem_flg & SEM_UNDO) {
int undo = un->semadj[sop->sem_num] - sem_op;
- /*
- * Exceeding the undo range is an error.
- */
+ /* Exceeding the undo range is an error. */
if (undo < (-SEMAEM - 1) || undo > SEMAEM)
goto out_of_range;
+ un->semadj[sop->sem_num] = undo;
}
+
curr->semval = result;
}
sop--;
while (sop >= sops) {
sma->sem_base[sop->sem_num].sempid = pid;
- if (sop->sem_flg & SEM_UNDO)
- un->semadj[sop->sem_num] -= sop->sem_op;
sop--;
}
-
+
return 0;
out_of_range:
@@ -650,7 +648,10 @@ would_block:
undo:
sop--;
while (sop >= sops) {
- sma->sem_base[sop->sem_num].semval -= sop->sem_op;
+ sem_op = sop->sem_op;
+ sma->sem_base[sop->sem_num].semval -= sem_op;
+ if (sop->sem_flg & SEM_UNDO)
+ un->semadj[sop->sem_num] += sem_op;
sop--;
}
@@ -680,7 +681,7 @@ static void wake_up_sem_queue_prepare(struct list_head *pt,
}
/**
- * wake_up_sem_queue_do(pt) - do the actual wake-up
+ * wake_up_sem_queue_do - do the actual wake-up
* @pt: list of tasks to be woken up
*
* Do the actual wake-up.
@@ -746,7 +747,7 @@ static int check_restart(struct sem_array *sma, struct sem_queue *q)
}
/**
- * wake_const_ops(sma, semnum, pt) - Wake up non-alter tasks
+ * wake_const_ops - wake up non-alter tasks
* @sma: semaphore array.
* @semnum: semaphore that was modified.
* @pt: list head for the tasks that must be woken up.
@@ -796,15 +797,14 @@ static int wake_const_ops(struct sem_array *sma, int semnum,
}
/**
- * do_smart_wakeup_zero(sma, sops, nsops, pt) - wakeup all wait for zero tasks
+ * do_smart_wakeup_zero - wakeup all wait for zero tasks
* @sma: semaphore array
* @sops: operations that were performed
* @nsops: number of operations
* @pt: list head of the tasks that must be woken up.
*
- * do_smart_wakeup_zero() checks all required queue for wait-for-zero
- * operations, based on the actual changes that were performed on the
- * semaphore array.
+ * Checks all required queue for wait-for-zero operations, based
+ * on the actual changes that were performed on the semaphore array.
* The function returns 1 if at least one operation was completed successfully.
*/
static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
@@ -848,7 +848,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
/**
- * update_queue(sma, semnum): Look for tasks that can be completed.
+ * update_queue - look for tasks that can be completed.
* @sma: semaphore array.
* @semnum: semaphore that was modified.
* @pt: list head for the tasks that must be woken up.
@@ -918,7 +918,7 @@ again:
}
/**
- * set_semotime(sma, sops) - set sem_otime
+ * set_semotime - set sem_otime
* @sma: semaphore array
* @sops: operations that modified the array, may be NULL
*
@@ -936,7 +936,7 @@ static void set_semotime(struct sem_array *sma, struct sembuf *sops)
}
/**
- * do_smart_update(sma, sops, nsops, otime, pt) - optimized update_queue
+ * do_smart_update - optimized update_queue
* @sma: semaphore array
* @sops: operations that were performed
* @nsops: number of operations
@@ -998,21 +998,21 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop
* The counts we return here are a rough approximation, but still
* warrant that semncnt+semzcnt>0 if the task is on the pending queue.
*/
-static int count_semncnt (struct sem_array * sma, ushort semnum)
+static int count_semncnt(struct sem_array *sma, ushort semnum)
{
int semncnt;
- struct sem_queue * q;
+ struct sem_queue *q;
semncnt = 0;
list_for_each_entry(q, &sma->sem_base[semnum].pending_alter, list) {
- struct sembuf * sops = q->sops;
+ struct sembuf *sops = q->sops;
BUG_ON(sops->sem_num != semnum);
if ((sops->sem_op < 0) && !(sops->sem_flg & IPC_NOWAIT))
semncnt++;
}
list_for_each_entry(q, &sma->pending_alter, list) {
- struct sembuf * sops = q->sops;
+ struct sembuf *sops = q->sops;
int nsops = q->nsops;
int i;
for (i = 0; i < nsops; i++)
@@ -1024,21 +1024,21 @@ static int count_semncnt (struct sem_array * sma, ushort semnum)
return semncnt;
}
-static int count_semzcnt (struct sem_array * sma, ushort semnum)
+static int count_semzcnt(struct sem_array *sma, ushort semnum)
{
int semzcnt;
- struct sem_queue * q;
+ struct sem_queue *q;
semzcnt = 0;
list_for_each_entry(q, &sma->sem_base[semnum].pending_const, list) {
- struct sembuf * sops = q->sops;
+ struct sembuf *sops = q->sops;
BUG_ON(sops->sem_num != semnum);
if ((sops->sem_op == 0) && !(sops->sem_flg & IPC_NOWAIT))
semzcnt++;
}
list_for_each_entry(q, &sma->pending_const, list) {
- struct sembuf * sops = q->sops;
+ struct sembuf *sops = q->sops;
int nsops = q->nsops;
int i;
for (i = 0; i < nsops; i++)
@@ -1108,7 +1108,7 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
static unsigned long copy_semid_to_user(void __user *buf, struct semid64_ds *in, int version)
{
- switch(version) {
+ switch (version) {
case IPC_64:
return copy_to_user(buf, in, sizeof(*in));
case IPC_OLD:
@@ -1151,7 +1151,7 @@ static int semctl_nolock(struct ipc_namespace *ns, int semid,
int err;
struct sem_array *sma;
- switch(cmd) {
+ switch (cmd) {
case IPC_INFO:
case SEM_INFO:
{
@@ -1162,7 +1162,7 @@ static int semctl_nolock(struct ipc_namespace *ns, int semid,
if (err)
return err;
- memset(&seminfo,0,sizeof(seminfo));
+ memset(&seminfo, 0, sizeof(seminfo));
seminfo.semmni = ns->sc_semmni;
seminfo.semmns = ns->sc_semmns;
seminfo.semmsl = ns->sc_semmsl;
@@ -1183,7 +1183,7 @@ static int semctl_nolock(struct ipc_namespace *ns, int semid,
up_read(&sem_ids(ns).rwsem);
if (copy_to_user(p, &seminfo, sizeof(struct seminfo)))
return -EFAULT;
- return (max_id < 0) ? 0: max_id;
+ return (max_id < 0) ? 0 : max_id;
}
case IPC_STAT:
case SEM_STAT:
@@ -1239,7 +1239,7 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
{
struct sem_undo *un;
struct sem_array *sma;
- struct sem* curr;
+ struct sem *curr;
int err;
struct list_head tasks;
int val;
@@ -1282,7 +1282,7 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
sem_lock(sma, NULL, -1);
- if (sma->sem_perm.deleted) {
+ if (!ipc_valid_object(&sma->sem_perm)) {
sem_unlock(sma, -1);
rcu_read_unlock();
return -EIDRM;
@@ -1309,10 +1309,10 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
int cmd, void __user *p)
{
struct sem_array *sma;
- struct sem* curr;
+ struct sem *curr;
int err, nsems;
ushort fast_sem_io[SEMMSL_FAST];
- ushort* sem_io = fast_sem_io;
+ ushort *sem_io = fast_sem_io;
struct list_head tasks;
INIT_LIST_HEAD(&tasks);
@@ -1342,11 +1342,11 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
int i;
sem_lock(sma, NULL, -1);
- if (sma->sem_perm.deleted) {
+ if (!ipc_valid_object(&sma->sem_perm)) {
err = -EIDRM;
goto out_unlock;
}
- if(nsems > SEMMSL_FAST) {
+ if (nsems > SEMMSL_FAST) {
if (!ipc_rcu_getref(sma)) {
err = -EIDRM;
goto out_unlock;
@@ -1354,14 +1354,14 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
sem_unlock(sma, -1);
rcu_read_unlock();
sem_io = ipc_alloc(sizeof(ushort)*nsems);
- if(sem_io == NULL) {
+ if (sem_io == NULL) {
ipc_rcu_putref(sma, ipc_rcu_free);
return -ENOMEM;
}
rcu_read_lock();
sem_lock_and_putref(sma);
- if (sma->sem_perm.deleted) {
+ if (!ipc_valid_object(&sma->sem_perm)) {
err = -EIDRM;
goto out_unlock;
}
@@ -1371,7 +1371,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
sem_unlock(sma, -1);
rcu_read_unlock();
err = 0;
- if(copy_to_user(array, sem_io, nsems*sizeof(ushort)))
+ if (copy_to_user(array, sem_io, nsems*sizeof(ushort)))
err = -EFAULT;
goto out_free;
}
@@ -1386,15 +1386,15 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
}
rcu_read_unlock();
- if(nsems > SEMMSL_FAST) {
+ if (nsems > SEMMSL_FAST) {
sem_io = ipc_alloc(sizeof(ushort)*nsems);
- if(sem_io == NULL) {
+ if (sem_io == NULL) {
ipc_rcu_putref(sma, ipc_rcu_free);
return -ENOMEM;
}
}
- if (copy_from_user (sem_io, p, nsems*sizeof(ushort))) {
+ if (copy_from_user(sem_io, p, nsems*sizeof(ushort))) {
ipc_rcu_putref(sma, ipc_rcu_free);
err = -EFAULT;
goto out_free;
@@ -1409,7 +1409,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
}
rcu_read_lock();
sem_lock_and_putref(sma);
- if (sma->sem_perm.deleted) {
+ if (!ipc_valid_object(&sma->sem_perm)) {
err = -EIDRM;
goto out_unlock;
}
@@ -1435,7 +1435,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
goto out_rcu_wakeup;
sem_lock(sma, NULL, -1);
- if (sma->sem_perm.deleted) {
+ if (!ipc_valid_object(&sma->sem_perm)) {
err = -EIDRM;
goto out_unlock;
}
@@ -1449,10 +1449,10 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
err = curr->sempid;
goto out_unlock;
case GETNCNT:
- err = count_semncnt(sma,semnum);
+ err = count_semncnt(sma, semnum);
goto out_unlock;
case GETZCNT:
- err = count_semzcnt(sma,semnum);
+ err = count_semzcnt(sma, semnum);
goto out_unlock;
}
@@ -1462,7 +1462,7 @@ out_rcu_wakeup:
rcu_read_unlock();
wake_up_sem_queue_do(&tasks);
out_free:
- if(sem_io != fast_sem_io)
+ if (sem_io != fast_sem_io)
ipc_free(sem_io, sizeof(ushort)*nsems);
return err;
}
@@ -1470,7 +1470,7 @@ out_free:
static inline unsigned long
copy_semid_from_user(struct semid64_ds *out, void __user *buf, int version)
{
- switch(version) {
+ switch (version) {
case IPC_64:
if (copy_from_user(out, buf, sizeof(*out)))
return -EFAULT;
@@ -1479,7 +1479,7 @@ copy_semid_from_user(struct semid64_ds *out, void __user *buf, int version)
{
struct semid_ds tbuf_old;
- if(copy_from_user(&tbuf_old, buf, sizeof(tbuf_old)))
+ if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old)))
return -EFAULT;
out->sem_perm.uid = tbuf_old.sem_perm.uid;
@@ -1506,7 +1506,7 @@ static int semctl_down(struct ipc_namespace *ns, int semid,
struct semid64_ds semid64;
struct kern_ipc_perm *ipcp;
- if(cmd == IPC_SET) {
+ if (cmd == IPC_SET) {
if (copy_semid_from_user(&semid64, p, version))
return -EFAULT;
}
@@ -1566,7 +1566,7 @@ SYSCALL_DEFINE4(semctl, int, semid, int, semnum, int, cmd, unsigned long, arg)
version = ipc_parse_version(&cmd);
ns = current->nsproxy->ipc_ns;
- switch(cmd) {
+ switch (cmd) {
case IPC_INFO:
case SEM_INFO:
case IPC_STAT:
@@ -1634,7 +1634,7 @@ static struct sem_undo *lookup_undo(struct sem_undo_list *ulp, int semid)
{
struct sem_undo *un;
- assert_spin_locked(&ulp->lock);
+ assert_spin_locked(&ulp->lock);
un = __lookup_undo(ulp, semid);
if (un) {
@@ -1645,7 +1645,7 @@ static struct sem_undo *lookup_undo(struct sem_undo_list *ulp, int semid)
}
/**
- * find_alloc_undo - Lookup (and if not present create) undo array
+ * find_alloc_undo - lookup (and if not present create) undo array
* @ns: namespace
* @semid: semaphore array id
*
@@ -1670,7 +1670,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)
spin_lock(&ulp->lock);
un = lookup_undo(ulp, semid);
spin_unlock(&ulp->lock);
- if (likely(un!=NULL))
+ if (likely(un != NULL))
goto out;
/* no undo structure around - allocate one. */
@@ -1699,7 +1699,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)
/* step 3: Acquire the lock on semaphore array */
rcu_read_lock();
sem_lock_and_putref(sma);
- if (sma->sem_perm.deleted) {
+ if (!ipc_valid_object(&sma->sem_perm)) {
sem_unlock(sma, -1);
rcu_read_unlock();
kfree(new);
@@ -1735,7 +1735,7 @@ out:
/**
- * get_queue_result - Retrieve the result code from sem_queue
+ * get_queue_result - retrieve the result code from sem_queue
* @q: Pointer to queue structure
*
* Retrieve the return code from the pending queue. If IN_WAKEUP is found in
@@ -1765,7 +1765,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
int error = -EINVAL;
struct sem_array *sma;
struct sembuf fast_sops[SEMOPM_FAST];
- struct sembuf* sops = fast_sops, *sop;
+ struct sembuf *sops = fast_sops, *sop;
struct sem_undo *un;
int undos = 0, alter = 0, max, locknum;
struct sem_queue queue;
@@ -1779,13 +1779,13 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
return -EINVAL;
if (nsops > ns->sc_semopm)
return -E2BIG;
- if(nsops > SEMOPM_FAST) {
- sops = kmalloc(sizeof(*sops)*nsops,GFP_KERNEL);
- if(sops==NULL)
+ if (nsops > SEMOPM_FAST) {
+ sops = kmalloc(sizeof(*sops)*nsops, GFP_KERNEL);
+ if (sops == NULL)
return -ENOMEM;
}
- if (copy_from_user (sops, tsops, nsops * sizeof(*tsops))) {
- error=-EFAULT;
+ if (copy_from_user(sops, tsops, nsops * sizeof(*tsops))) {
+ error = -EFAULT;
goto out_free;
}
if (timeout) {
@@ -1846,7 +1846,15 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
error = -EIDRM;
locknum = sem_lock(sma, sops, nsops);
- if (sma->sem_perm.deleted)
+ /*
+ * We eventually might perform the following check in a lockless
+ * fashion, considering ipc_valid_object() locking constraints.
+ * If nsops == 1 and there is no contention for sem_perm.lock, then
+ * only a per-semaphore lock is held and it's OK to proceed with the
+ * check below. More details on the fine grained locking scheme
+ * entangled here and why it's RMID race safe on comments at sem_lock()
+ */
+ if (!ipc_valid_object(&sma->sem_perm))
goto out_unlock_free;
/*
* semid identifiers are not unique - find_alloc_undo may have
@@ -1959,10 +1967,8 @@ sleep_again:
* If queue.status != -EINTR we are woken up by another process.
* Leave without unlink_queue(), but with sem_unlock().
*/
-
- if (error != -EINTR) {
+ if (error != -EINTR)
goto out_unlock_free;
- }
/*
* If an interrupt occurred we have to clean up the queue
@@ -1984,7 +1990,7 @@ out_rcu_wakeup:
rcu_read_unlock();
wake_up_sem_queue_do(&tasks);
out_free:
- if(sops != fast_sops)
+ if (sops != fast_sops)
kfree(sops);
return error;
}
@@ -2068,7 +2074,7 @@ void exit_sem(struct task_struct *tsk)
sem_lock(sma, NULL, -1);
/* exit_sem raced with IPC_RMID, nothing to do */
- if (sma->sem_perm.deleted) {
+ if (!ipc_valid_object(&sma->sem_perm)) {
sem_unlock(sma, -1);
rcu_read_unlock();
continue;
@@ -2093,7 +2099,7 @@ void exit_sem(struct task_struct *tsk)
/* perform adjustments registered in un */
for (i = 0; i < sma->sem_nsems; i++) {
- struct sem * semaphore = &sma->sem_base[i];
+ struct sem *semaphore = &sma->sem_base[i];
if (un->semadj[i]) {
semaphore->semval += un->semadj[i];
/*
@@ -2107,7 +2113,7 @@ void exit_sem(struct task_struct *tsk)
* Linux caps the semaphore value, both at 0
* and at SEMVMX.
*
- * Manfred <manfred@colorfullife.com>
+ * Manfred <manfred@colorfullife.com>
*/
if (semaphore->semval < 0)
semaphore->semval = 0;
diff --git a/ipc/shm.c b/ipc/shm.c
index 7a51443a51d6..76459616a7fa 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -67,7 +67,7 @@ static const struct vm_operations_struct shm_vm_ops;
static int newseg(struct ipc_namespace *, struct ipc_params *);
static void shm_open(struct vm_area_struct *vma);
static void shm_close(struct vm_area_struct *vma);
-static void shm_destroy (struct ipc_namespace *ns, struct shmid_kernel *shp);
+static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp);
#ifdef CONFIG_PROC_FS
static int sysvipc_shm_proc_show(struct seq_file *s, void *it);
#endif
@@ -91,7 +91,7 @@ static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
struct shmid_kernel *shp;
shp = container_of(ipcp, struct shmid_kernel, shm_perm);
- if (shp->shm_nattch){
+ if (shp->shm_nattch) {
shp->shm_perm.mode |= SHM_DEST;
/* Do not find it any more */
shp->shm_perm.key = IPC_PRIVATE;
@@ -116,7 +116,7 @@ static int __init ipc_ns_init(void)
pure_initcall(ipc_ns_init);
-void __init shm_init (void)
+void __init shm_init(void)
{
ipc_init_proc_interface("sysvipc/shm",
#if BITS_PER_LONG <= 32
@@ -248,7 +248,7 @@ static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
*/
static void shm_close(struct vm_area_struct *vma)
{
- struct file * file = vma->vm_file;
+ struct file *file = vma->vm_file;
struct shm_file_data *sfd = shm_file_data(file);
struct shmid_kernel *shp;
struct ipc_namespace *ns = sfd->ns;
@@ -379,7 +379,7 @@ static struct mempolicy *shm_get_policy(struct vm_area_struct *vma,
}
#endif
-static int shm_mmap(struct file * file, struct vm_area_struct * vma)
+static int shm_mmap(struct file *file, struct vm_area_struct *vma)
{
struct shm_file_data *sfd = shm_file_data(file);
int ret;
@@ -477,7 +477,6 @@ static const struct vm_operations_struct shm_vm_ops = {
*
* Called with shm_ids.rwsem held as a writer.
*/
-
static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
{
key_t key = params->key;
@@ -486,7 +485,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
int error;
struct shmid_kernel *shp;
size_t numpages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
- struct file * file;
+ struct file *file;
char name[13];
int id;
vm_flags_t acctflag = 0;
@@ -512,7 +511,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
return error;
}
- sprintf (name, "SYSV%08x", key);
+ sprintf(name, "SYSV%08x", key);
if (shmflg & SHM_HUGETLB) {
struct hstate *hs;
size_t hugesize;
@@ -533,7 +532,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
} else {
/*
* Do not allow no accounting for OVERCOMMIT_NEVER, even
- * if it's asked for.
+ * if it's asked for.
*/
if ((shmflg & SHM_NORESERVE) &&
sysctl_overcommit_memory != OVERCOMMIT_NEVER)
@@ -628,7 +627,7 @@ SYSCALL_DEFINE3(shmget, key_t, key, size_t, size, int, shmflg)
static inline unsigned long copy_shmid_to_user(void __user *buf, struct shmid64_ds *in, int version)
{
- switch(version) {
+ switch (version) {
case IPC_64:
return copy_to_user(buf, in, sizeof(*in));
case IPC_OLD:
@@ -655,7 +654,7 @@ static inline unsigned long copy_shmid_to_user(void __user *buf, struct shmid64_
static inline unsigned long
copy_shmid_from_user(struct shmid64_ds *out, void __user *buf, int version)
{
- switch(version) {
+ switch (version) {
case IPC_64:
if (copy_from_user(out, buf, sizeof(*out)))
return -EFAULT;
@@ -680,14 +679,14 @@ copy_shmid_from_user(struct shmid64_ds *out, void __user *buf, int version)
static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminfo64 *in, int version)
{
- switch(version) {
+ switch (version) {
case IPC_64:
return copy_to_user(buf, in, sizeof(*in));
case IPC_OLD:
{
struct shminfo out;
- if(in->shmmax > INT_MAX)
+ if (in->shmmax > INT_MAX)
out.shmmax = INT_MAX;
else
out.shmmax = (int)in->shmmax;
@@ -846,14 +845,14 @@ static int shmctl_nolock(struct ipc_namespace *ns, int shmid,
shminfo.shmall = ns->shm_ctlall;
shminfo.shmmin = SHMMIN;
- if(copy_shminfo_to_user (buf, &shminfo, version))
+ if (copy_shminfo_to_user(buf, &shminfo, version))
return -EFAULT;
down_read(&shm_ids(ns).rwsem);
err = ipc_get_maxid(&shm_ids(ns));
up_read(&shm_ids(ns).rwsem);
- if(err<0)
+ if (err < 0)
err = 0;
goto out;
}
@@ -864,7 +863,7 @@ static int shmctl_nolock(struct ipc_namespace *ns, int shmid,
memset(&shm_info, 0, sizeof(shm_info));
down_read(&shm_ids(ns).rwsem);
shm_info.used_ids = shm_ids(ns).in_use;
- shm_get_stat (ns, &shm_info.shm_rss, &shm_info.shm_swp);
+ shm_get_stat(ns, &shm_info.shm_rss, &shm_info.shm_swp);
shm_info.shm_tot = ns->shm_tot;
shm_info.swap_attempts = 0;
shm_info.swap_successes = 0;
@@ -975,6 +974,13 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf)
goto out_unlock1;
ipc_lock_object(&shp->shm_perm);
+
+ /* check if shm_destroy() is tearing down shp */
+ if (!ipc_valid_object(&shp->shm_perm)) {
+ err = -EIDRM;
+ goto out_unlock0;
+ }
+
if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) {
kuid_t euid = current_euid();
if (!uid_eq(euid, shp->shm_perm.uid) &&
@@ -989,13 +995,6 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf)
}
shm_file = shp->shm_file;
-
- /* check if shm_destroy() is tearing down shp */
- if (shm_file == NULL) {
- err = -EIDRM;
- goto out_unlock0;
- }
-
if (is_file_hugepages(shm_file))
goto out_unlock0;
@@ -1047,7 +1046,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
struct shmid_kernel *shp;
unsigned long addr;
unsigned long size;
- struct file * file;
+ struct file *file;
int err;
unsigned long flags;
unsigned long prot;
@@ -1116,7 +1115,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
ipc_lock_object(&shp->shm_perm);
/* check if shm_destroy() is tearing down shp */
- if (shp->shm_file == NULL) {
+ if (!ipc_valid_object(&shp->shm_perm)) {
ipc_unlock_object(&shp->shm_perm);
err = -EIDRM;
goto out_unlock;
diff --git a/ipc/util.c b/ipc/util.c
index 3ae17a4ace5b..fc5c655ff507 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -110,15 +110,15 @@ static struct notifier_block ipc_memory_nb = {
};
/**
- * ipc_init - initialise IPC subsystem
+ * ipc_init - initialise ipc subsystem
*
- * The various system5 IPC resources (semaphores, messages and shared
- * memory) are initialised
- * A callback routine is registered into the memory hotplug notifier
- * chain: since msgmni scales to lowmem this callback routine will be
- * called upon successful memory add / remove to recompute msmgni.
+ * The various sysv ipc resources (semaphores, messages and shared
+ * memory) are initialised.
+ *
+ * A callback routine is registered into the memory hotplug notifier
+ * chain: since msgmni scales to lowmem this callback routine will be
+ * called upon successful memory add / remove to recompute msmgni.
*/
-
static int __init ipc_init(void)
{
sem_init();
@@ -131,39 +131,29 @@ static int __init ipc_init(void)
__initcall(ipc_init);
/**
- * ipc_init_ids - initialise IPC identifiers
- * @ids: Identifier set
+ * ipc_init_ids - initialise ipc identifiers
+ * @ids: ipc identifier set
*
- * Set up the sequence range to use for the ipc identifier range (limited
- * below IPCMNI) then initialise the ids idr.
+ * Set up the sequence range to use for the ipc identifier range (limited
+ * below IPCMNI) then initialise the ids idr.
*/
-
void ipc_init_ids(struct ipc_ids *ids)
{
- init_rwsem(&ids->rwsem);
-
ids->in_use = 0;
ids->seq = 0;
ids->next_id = -1;
- {
- int seq_limit = INT_MAX/SEQ_MULTIPLIER;
- if (seq_limit > USHRT_MAX)
- ids->seq_max = USHRT_MAX;
- else
- ids->seq_max = seq_limit;
- }
-
+ init_rwsem(&ids->rwsem);
idr_init(&ids->ipcs_idr);
}
#ifdef CONFIG_PROC_FS
static const struct file_operations sysvipc_proc_fops;
/**
- * ipc_init_proc_interface - Create a proc interface for sysipc types using a seq_file interface.
- * @path: Path in procfs
- * @header: Banner to be printed at the beginning of the file.
- * @ids: ipc id table to iterate.
- * @show: show routine.
+ * ipc_init_proc_interface - create a proc interface for sysipc types using a seq_file interface.
+ * @path: Path in procfs
+ * @header: Banner to be printed at the beginning of the file.
+ * @ids: ipc id table to iterate.
+ * @show: show routine.
*/
void __init ipc_init_proc_interface(const char *path, const char *header,
int ids, int (*show)(struct seq_file *, void *))
@@ -184,23 +174,21 @@ void __init ipc_init_proc_interface(const char *path, const char *header,
NULL, /* parent dir */
&sysvipc_proc_fops,
iface);
- if (!pde) {
+ if (!pde)
kfree(iface);
- }
}
#endif
/**
- * ipc_findkey - find a key in an ipc identifier set
- * @ids: Identifier set
- * @key: The key to find
- *
- * Requires ipc_ids.rwsem locked.
- * Returns the LOCKED pointer to the ipc structure if found or NULL
- * if not.
- * If key is found ipc points to the owning ipc structure
+ * ipc_findkey - find a key in an ipc identifier set
+ * @ids: ipc identifier set
+ * @key: key to find
+ *
+ * Returns the locked pointer to the ipc structure if found or NULL
+ * otherwise. If key is found ipc points to the owning ipc structure
+ *
+ * Called with ipc_ids.rwsem held.
*/
-
static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key)
{
struct kern_ipc_perm *ipc;
@@ -227,12 +215,11 @@ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key)
}
/**
- * ipc_get_maxid - get the last assigned id
- * @ids: IPC identifier set
+ * ipc_get_maxid - get the last assigned id
+ * @ids: ipc identifier set
*
- * Called with ipc_ids.rwsem held.
+ * Called with ipc_ids.rwsem held.
*/
-
int ipc_get_maxid(struct ipc_ids *ids)
{
struct kern_ipc_perm *ipc;
@@ -258,19 +245,19 @@ int ipc_get_maxid(struct ipc_ids *ids)
}
/**
- * ipc_addid - add an IPC identifier
- * @ids: IPC identifier set
- * @new: new IPC permission set
- * @size: limit for the number of used ids
+ * ipc_addid - add an ipc identifier
+ * @ids: ipc identifier set
+ * @new: new ipc permission set
+ * @size: limit for the number of used ids
*
- * Add an entry 'new' to the IPC ids idr. The permissions object is
- * initialised and the first free entry is set up and the id assigned
- * is returned. The 'new' entry is returned in a locked state on success.
- * On failure the entry is not locked and a negative err-code is returned.
+ * Add an entry 'new' to the ipc ids idr. The permissions object is
+ * initialised and the first free entry is set up and the id assigned
+ * is returned. The 'new' entry is returned in a locked state on success.
+ * On failure the entry is not locked and a negative err-code is returned.
*
- * Called with writer ipc_ids.rwsem held.
+ * Called with writer ipc_ids.rwsem held.
*/
-int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size)
+int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int size)
{
kuid_t euid;
kgid_t egid;
@@ -286,7 +273,7 @@ int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size)
idr_preload(GFP_KERNEL);
spin_lock_init(&new->lock);
- new->deleted = 0;
+ new->deleted = false;
rcu_read_lock();
spin_lock(&new->lock);
@@ -308,7 +295,7 @@ int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size)
if (next_id < 0) {
new->seq = ids->seq++;
- if (ids->seq > ids->seq_max)
+ if (ids->seq > IPCID_SEQ_MAX)
ids->seq = 0;
} else {
new->seq = ipcid_to_seqx(next_id);
@@ -320,14 +307,14 @@ int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size)
}
/**
- * ipcget_new - create a new ipc object
- * @ns: namespace
- * @ids: IPC identifer set
- * @ops: the actual creation routine to call
- * @params: its parameters
- *
- * This routine is called by sys_msgget, sys_semget() and sys_shmget()
- * when the key is IPC_PRIVATE.
+ * ipcget_new - create a new ipc object
+ * @ns: ipc namespace
+ * @ids: ipc identifer set
+ * @ops: the actual creation routine to call
+ * @params: its parameters
+ *
+ * This routine is called by sys_msgget, sys_semget() and sys_shmget()
+ * when the key is IPC_PRIVATE.
*/
static int ipcget_new(struct ipc_namespace *ns, struct ipc_ids *ids,
struct ipc_ops *ops, struct ipc_params *params)
@@ -341,19 +328,19 @@ static int ipcget_new(struct ipc_namespace *ns, struct ipc_ids *ids,
}
/**
- * ipc_check_perms - check security and permissions for an IPC
- * @ns: IPC namespace
- * @ipcp: ipc permission set
- * @ops: the actual security routine to call
- * @params: its parameters
+ * ipc_check_perms - check security and permissions for an ipc object
+ * @ns: ipc namespace
+ * @ipcp: ipc permission set
+ * @ops: the actual security routine to call
+ * @params: its parameters
*
- * This routine is called by sys_msgget(), sys_semget() and sys_shmget()
- * when the key is not IPC_PRIVATE and that key already exists in the
- * ids IDR.
+ * This routine is called by sys_msgget(), sys_semget() and sys_shmget()
+ * when the key is not IPC_PRIVATE and that key already exists in the
+ * ds IDR.
*
- * On success, the IPC id is returned.
+ * On success, the ipc id is returned.
*
- * It is called with ipc_ids.rwsem and ipcp->lock held.
+ * It is called with ipc_ids.rwsem and ipcp->lock held.
*/
static int ipc_check_perms(struct ipc_namespace *ns,
struct kern_ipc_perm *ipcp,
@@ -374,96 +361,96 @@ static int ipc_check_perms(struct ipc_namespace *ns,
}
/**
- * ipcget_public - get an ipc object or create a new one
- * @ns: namespace
- * @ids: IPC identifer set
- * @ops: the actual creation routine to call
- * @params: its parameters
- *
- * This routine is called by sys_msgget, sys_semget() and sys_shmget()
- * when the key is not IPC_PRIVATE.
- * It adds a new entry if the key is not found and does some permission
- * / security checkings if the key is found.
- *
- * On success, the ipc id is returned.
+ * ipcget_public - get an ipc object or create a new one
+ * @ns: ipc namespace
+ * @ids: ipc identifer set
+ * @ops: the actual creation routine to call
+ * @params: its parameters
+ *
+ * This routine is called by sys_msgget, sys_semget() and sys_shmget()
+ * when the key is not IPC_PRIVATE.
+ * It adds a new entry if the key is not found and does some permission
+ * / security checkings if the key is found.
+ *
+ * On success, the ipc id is returned.
*/
static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids,
- struct ipc_ops *ops, struct ipc_params *params)
+ struct ipc_ops *ops, struct ipc_params *params)
{
struct kern_ipc_perm *ipcp;
int flg = params->flg;
- int err;
+ int err = 0;
- /*
- * Take the lock as a writer since we are potentially going to add
- * a new entry + read locks are not "upgradable"
- */
down_write(&ids->rwsem);
ipcp = ipc_findkey(ids, params->key);
- if (ipcp == NULL) {
+
+ if (!ipcp) {
/* key not used */
if (!(flg & IPC_CREAT))
err = -ENOENT;
- else
+ else /* create new ipc object */
err = ops->getnew(ns, params);
- } else {
- /* ipc object has been locked by ipc_findkey() */
-
- if (flg & IPC_CREAT && flg & IPC_EXCL)
- err = -EEXIST;
- else {
- err = 0;
- if (ops->more_checks)
- err = ops->more_checks(ipcp, params);
- if (!err)
- /*
- * ipc_check_perms returns the IPC id on
- * success
- */
- err = ipc_check_perms(ns, ipcp, ops, params);
- }
+
+ goto done_write;
+ }
+
+ if ((flg & IPC_CREAT) && (flg & IPC_EXCL)) {
+ /* ipc object was locked by successful ipc_findkey() lookup */
ipc_unlock(ipcp);
+ err = -ENOENT;
+
+ goto done_write;
}
- up_write(&ids->rwsem);
+ /*
+ * The key was found, so we will just perform routinary checks on
+ * ipc the object. Share the lock among other readers.
+ */
+ downgrade_write(&ids->rwsem);
+
+ if (ops->more_checks)
+ err = ops->more_checks(ipcp, params);
+ if (!err)
+ /* returns the IPC id on success */
+ err = ipc_check_perms(ns, ipcp, ops, params);
+
+ ipc_unlock(ipcp);
+
+ up_read(&ids->rwsem);
+ return err;
+done_write:
+ up_write(&ids->rwsem);
return err;
}
-
/**
- * ipc_rmid - remove an IPC identifier
- * @ids: IPC identifier set
- * @ipcp: ipc perm structure containing the identifier to remove
+ * ipc_rmid - remove an ipc identifier
+ * @ids: ipc identifier set
+ * @ipcp: ipc perm structure containing the identifier to remove
*
- * ipc_ids.rwsem (as a writer) and the spinlock for this ID are held
- * before this function is called, and remain locked on the exit.
+ * ipc_ids.rwsem (as a writer) and the spinlock for this ID are held
+ * before this function is called, and remain locked on the exit.
*/
-
void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
{
int lid = ipcid_to_idx(ipcp->id);
idr_remove(&ids->ipcs_idr, lid);
-
ids->in_use--;
-
- ipcp->deleted = 1;
-
- return;
+ ipcp->deleted = true;
}
/**
- * ipc_alloc - allocate ipc space
- * @size: size desired
+ * ipc_alloc - allocate ipc space
+ * @size: size desired
*
- * Allocate memory from the appropriate pools and return a pointer to it.
- * NULL is returned if the allocation fails
+ * Allocate memory from the appropriate pools and return a pointer to it.
+ * NULL is returned if the allocation fails
*/
-
void *ipc_alloc(int size)
{
void *out;
- if(size > PAGE_SIZE)
+ if (size > PAGE_SIZE)
out = vmalloc(size);
else
out = kmalloc(size, GFP_KERNEL);
@@ -471,28 +458,27 @@ void *ipc_alloc(int size)
}
/**
- * ipc_free - free ipc space
- * @ptr: pointer returned by ipc_alloc
- * @size: size of block
+ * ipc_free - free ipc space
+ * @ptr: pointer returned by ipc_alloc
+ * @size: size of block
*
- * Free a block created with ipc_alloc(). The caller must know the size
- * used in the allocation call.
+ * Free a block created with ipc_alloc(). The caller must know the size
+ * used in the allocation call.
*/
-
-void ipc_free(void* ptr, int size)
+void ipc_free(void *ptr, int size)
{
- if(size > PAGE_SIZE)
+ if (size > PAGE_SIZE)
vfree(ptr);
else
kfree(ptr);
}
/**
- * ipc_rcu_alloc - allocate ipc and rcu space
- * @size: size desired
+ * ipc_rcu_alloc - allocate ipc and rcu space
+ * @size: size desired
*
- * Allocate memory for the rcu header structure + the object.
- * Returns the pointer to the object or NULL upon failure.
+ * Allocate memory for the rcu header structure + the object.
+ * Returns the pointer to the object or NULL upon failure.
*/
void *ipc_rcu_alloc(int size)
{
@@ -534,17 +520,16 @@ void ipc_rcu_free(struct rcu_head *head)
}
/**
- * ipcperms - check IPC permissions
- * @ns: IPC namespace
- * @ipcp: IPC permission set
- * @flag: desired permission set.
+ * ipcperms - check ipc permissions
+ * @ns: ipc namespace
+ * @ipcp: ipc permission set
+ * @flag: desired permission set
*
- * Check user, group, other permissions for access
- * to ipc resources. return 0 if allowed
+ * Check user, group, other permissions for access
+ * to ipc resources. return 0 if allowed
*
- * @flag will most probably be 0 or S_...UGO from <linux/stat.h>
+ * @flag will most probably be 0 or S_...UGO from <linux/stat.h>
*/
-
int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flag)
{
kuid_t euid = current_euid();
@@ -572,16 +557,14 @@ int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flag)
*/
/**
- * kernel_to_ipc64_perm - convert kernel ipc permissions to user
- * @in: kernel permissions
- * @out: new style IPC permissions
+ * kernel_to_ipc64_perm - convert kernel ipc permissions to user
+ * @in: kernel permissions
+ * @out: new style ipc permissions
*
- * Turn the kernel object @in into a set of permissions descriptions
- * for returning to userspace (@out).
+ * Turn the kernel object @in into a set of permissions descriptions
+ * for returning to userspace (@out).
*/
-
-
-void kernel_to_ipc64_perm (struct kern_ipc_perm *in, struct ipc64_perm *out)
+void kernel_to_ipc64_perm(struct kern_ipc_perm *in, struct ipc64_perm *out)
{
out->key = in->key;
out->uid = from_kuid_munged(current_user_ns(), in->uid);
@@ -593,15 +576,14 @@ void kernel_to_ipc64_perm (struct kern_ipc_perm *in, struct ipc64_perm *out)
}
/**
- * ipc64_perm_to_ipc_perm - convert new ipc permissions to old
- * @in: new style IPC permissions
- * @out: old style IPC permissions
+ * ipc64_perm_to_ipc_perm - convert new ipc permissions to old
+ * @in: new style ipc permissions
+ * @out: old style ipc permissions
*
- * Turn the new style permissions object @in into a compatibility
- * object and store it into the @out pointer.
+ * Turn the new style permissions object @in into a compatibility
+ * object and store it into the @out pointer.
*/
-
-void ipc64_perm_to_ipc_perm (struct ipc64_perm *in, struct ipc_perm *out)
+void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out)
{
out->key = in->key;
SET_UID(out->uid, in->uid);
@@ -635,8 +617,8 @@ struct kern_ipc_perm *ipc_obtain_object(struct ipc_ids *ids, int id)
}
/**
- * ipc_lock - Lock an ipc structure without rwsem held
- * @ids: IPC identifier set
+ * ipc_lock - lock an ipc structure without rwsem held
+ * @ids: ipc identifier set
* @id: ipc id to look for
*
* Look for an id in the ipc ids idr and lock the associated ipc object.
@@ -657,7 +639,7 @@ struct kern_ipc_perm *ipc_lock(struct ipc_ids *ids, int id)
/* ipc_rmid() may have already freed the ID while ipc_lock
* was spinning: here verify that the structure is still valid
*/
- if (!out->deleted)
+ if (ipc_valid_object(out))
return out;
spin_unlock(&out->lock);
@@ -693,11 +675,11 @@ out:
/**
* ipcget - Common sys_*get() code
- * @ns : namsepace
- * @ids : IPC identifier set
- * @ops : operations to be called on ipc object creation, permission checks
- * and further checks
- * @params : the parameters needed by the previous operations.
+ * @ns: namsepace
+ * @ids: ipc identifier set
+ * @ops: operations to be called on ipc object creation, permission checks
+ * and further checks
+ * @params: the parameters needed by the previous operations.
*
* Common routine called by sys_msgget(), sys_semget() and sys_shmget().
*/
@@ -711,7 +693,7 @@ int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids,
}
/**
- * ipc_update_perm - update the permissions of an IPC.
+ * ipc_update_perm - update the permissions of an ipc object
* @in: the permission given as input.
* @out: the permission of the ipc to set.
*/
@@ -732,7 +714,7 @@ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out)
/**
* ipcctl_pre_down_nolock - retrieve an ipc and check permissions for some IPC_XXX cmd
- * @ns: the ipc namespace
+ * @ns: ipc namespace
* @ids: the table of ids where to look for the ipc
* @id: the id of the ipc to retrieve
* @cmd: the cmd to check
@@ -779,15 +761,14 @@ err:
/**
- * ipc_parse_version - IPC call version
- * @cmd: pointer to command
+ * ipc_parse_version - ipc call version
+ * @cmd: pointer to command
*
- * Return IPC_64 for new style IPC and IPC_OLD for old style IPC.
- * The @cmd value is turned from an encoding command and version into
- * just the command code.
+ * Return IPC_64 for new style IPC and IPC_OLD for old style IPC.
+ * The @cmd value is turned from an encoding command and version into
+ * just the command code.
*/
-
-int ipc_parse_version (int *cmd)
+int ipc_parse_version(int *cmd)
{
if (*cmd & IPC_64) {
*cmd ^= IPC_64;
@@ -824,7 +805,7 @@ static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos,
if (total >= ids->in_use)
return NULL;
- for ( ; pos < IPCMNI; pos++) {
+ for (; pos < IPCMNI; pos++) {
ipc = idr_find(&ids->ipcs_idr, pos);
if (ipc != NULL) {
*new_pos = pos + 1;
@@ -927,8 +908,10 @@ static int sysvipc_proc_open(struct inode *inode, struct file *file)
goto out;
ret = seq_open(file, &sysvipc_proc_seqops);
- if (ret)
- goto out_kfree;
+ if (ret) {
+ kfree(iter);
+ goto out;
+ }
seq = file->private_data;
seq->private = iter;
@@ -937,9 +920,6 @@ static int sysvipc_proc_open(struct inode *inode, struct file *file)
iter->ns = get_ipc_ns(current->nsproxy->ipc_ns);
out:
return ret;
-out_kfree:
- kfree(iter);
- goto out;
}
static int sysvipc_proc_release(struct inode *inode, struct file *file)
diff --git a/ipc/util.h b/ipc/util.h
index 59d78aa94987..9c47d6f6c7b4 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -15,9 +15,9 @@
#define SEQ_MULTIPLIER (IPCMNI)
-void sem_init (void);
-void msg_init (void);
-void shm_init (void);
+void sem_init(void);
+void msg_init(void);
+void shm_init(void);
struct ipc_namespace;
@@ -100,6 +100,7 @@ void __init ipc_init_proc_interface(const char *path, const char *header,
#define ipcid_to_idx(id) ((id) % SEQ_MULTIPLIER)
#define ipcid_to_seqx(id) ((id) / SEQ_MULTIPLIER)
+#define IPCID_SEQ_MAX min_t(int, INT_MAX/SEQ_MULTIPLIER, USHRT_MAX)
/* must be called with ids->rwsem acquired for writing */
int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int);
@@ -116,8 +117,8 @@ int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flg);
/* for rare, potentially huge allocations.
* both function can sleep
*/
-void* ipc_alloc(int size);
-void ipc_free(void* ptr, int size);
+void *ipc_alloc(int size);
+void ipc_free(void *ptr, int size);
/*
* For allocation that need to be freed by RCU.
@@ -125,7 +126,7 @@ void ipc_free(void* ptr, int size);
* getref increases the refcount, the putref call that reduces the recount
* to 0 schedules the rcu destruction. Caller must guarantee locking.
*/
-void* ipc_rcu_alloc(int size);
+void *ipc_rcu_alloc(int size);
int ipc_rcu_getref(void *ptr);
void ipc_rcu_putref(void *ptr, void (*func)(struct rcu_head *head));
void ipc_rcu_free(struct rcu_head *head);
@@ -144,7 +145,7 @@ struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns,
/* On IA-64, we always use the "64-bit version" of the IPC structures. */
# define ipc_parse_version(cmd) IPC_64
#else
-int ipc_parse_version (int *cmd);
+int ipc_parse_version(int *cmd);
#endif
extern void free_msg(struct msg_msg *msg);
@@ -185,6 +186,19 @@ static inline void ipc_unlock(struct kern_ipc_perm *perm)
rcu_read_unlock();
}
+/*
+ * ipc_valid_object() - helper to sort out IPC_RMID races for codepaths
+ * where the respective ipc_ids.rwsem is not being held down.
+ * Checks whether the ipc object is still around or if it's gone already, as
+ * ipc_rmid() may have already freed the ID while the ipc lock was spinning.
+ * Needs to be called with kern_ipc_perm.lock held -- exception made for one
+ * checkpoint case at sys_semtimedop() as noted in code commentary.
+ */
+static inline bool ipc_valid_object(struct kern_ipc_perm *perm)
+{
+ return !perm->deleted;
+}
+
struct kern_ipc_perm *ipc_obtain_object_check(struct ipc_ids *ids, int id);
int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids,
struct ipc_ops *ops, struct ipc_params *params);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 43c307dc9453..67ccf0e7cca9 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -912,12 +912,13 @@ static void evict_chunk(struct audit_chunk *chunk)
}
static int audit_tree_handle_event(struct fsnotify_group *group,
+ struct inode *to_tell,
struct fsnotify_mark *inode_mark,
- struct fsnotify_mark *vfsmonut_mark,
- struct fsnotify_event *event)
+ struct fsnotify_mark *vfsmount_mark,
+ u32 mask, void *data, int data_type,
+ const unsigned char *file_name)
{
- BUG();
- return -EOPNOTSUPP;
+ return 0;
}
static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group)
@@ -933,19 +934,8 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify
BUG_ON(atomic_read(&entry->refcnt) < 1);
}
-static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode,
- struct fsnotify_mark *inode_mark,
- struct fsnotify_mark *vfsmount_mark,
- __u32 mask, void *data, int data_type)
-{
- return false;
-}
-
static const struct fsnotify_ops audit_tree_ops = {
.handle_event = audit_tree_handle_event,
- .should_send_event = audit_tree_send_event,
- .free_group_priv = NULL,
- .free_event_priv = NULL,
.freeing_mark = audit_tree_freeing_mark,
};
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 22831c4d369c..2596fac5dcb4 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -465,35 +465,27 @@ void audit_remove_watch_rule(struct audit_krule *krule)
}
}
-static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode,
- struct fsnotify_mark *inode_mark,
- struct fsnotify_mark *vfsmount_mark,
- __u32 mask, void *data, int data_type)
-{
- return true;
-}
-
/* Update watch data in audit rules based on fsnotify events. */
static int audit_watch_handle_event(struct fsnotify_group *group,
+ struct inode *to_tell,
struct fsnotify_mark *inode_mark,
struct fsnotify_mark *vfsmount_mark,
- struct fsnotify_event *event)
+ u32 mask, void *data, int data_type,
+ const unsigned char *dname)
{
struct inode *inode;
- __u32 mask = event->mask;
- const char *dname = event->file_name;
struct audit_parent *parent;
parent = container_of(inode_mark, struct audit_parent, mark);
BUG_ON(group != audit_watch_group);
- switch (event->data_type) {
+ switch (data_type) {
case (FSNOTIFY_EVENT_PATH):
- inode = event->path.dentry->d_inode;
+ inode = ((struct path *)data)->dentry->d_inode;
break;
case (FSNOTIFY_EVENT_INODE):
- inode = event->inode;
+ inode = (struct inode *)data;
break;
default:
BUG();
@@ -512,11 +504,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
}
static const struct fsnotify_ops audit_watch_fsnotify_ops = {
- .should_send_event = audit_watch_should_send_event,
.handle_event = audit_watch_handle_event,
- .free_group_priv = NULL,
- .freeing_mark = NULL,
- .free_event_priv = NULL,
};
static int __init audit_watch_init(void)
diff --git a/kernel/exit.c b/kernel/exit.c
index a949819055d5..1e77fc645317 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -74,6 +74,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
__this_cpu_dec(process_counts);
}
list_del_rcu(&p->thread_group);
+ list_del_rcu(&p->thread_node);
}
/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 294189fc7ac8..a17621c6cd42 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -800,14 +800,11 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
* Allocate a new mm structure and copy contents from the
* mm structure of the passed in task structure.
*/
-struct mm_struct *dup_mm(struct task_struct *tsk)
+static struct mm_struct *dup_mm(struct task_struct *tsk)
{
struct mm_struct *mm, *oldmm = current->mm;
int err;
- if (!oldmm)
- return NULL;
-
mm = allocate_mm();
if (!mm)
goto fail_nomem;
@@ -1035,6 +1032,11 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
sig->nr_threads = 1;
atomic_set(&sig->live, 1);
atomic_set(&sig->sigcnt, 1);
+
+ /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
+ sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
+ tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head);
+
init_waitqueue_head(&sig->wait_chldexit);
sig->curr_target = tsk;
init_sigpending(&sig->shared_pending);
@@ -1224,7 +1226,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (!try_module_get(task_thread_info(p)->exec_domain->module))
goto bad_fork_cleanup_count;
- p->did_exec = 0;
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
copy_flags(clone_flags, p);
INIT_LIST_HEAD(&p->children);
@@ -1474,6 +1475,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
atomic_inc(&current->signal->sigcnt);
list_add_tail_rcu(&p->thread_group,
&p->group_leader->thread_group);
+ list_add_tail_rcu(&p->thread_node,
+ &p->signal->thread_head);
}
attach_pid(p, PIDTYPE_PID);
nr_threads++;
@@ -1647,7 +1650,7 @@ SYSCALL_DEFINE0(fork)
return do_fork(SIGCHLD, 0, 0, NULL, NULL);
#else
/* can not support in nommu mode */
- return(-EINVAL);
+ return -EINVAL;
#endif
}
#endif
@@ -1655,7 +1658,7 @@ SYSCALL_DEFINE0(fork)
#ifdef __ARCH_WANT_SYS_VFORK
SYSCALL_DEFINE0(vfork)
{
- return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
+ return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
0, NULL, NULL);
}
#endif
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 9328b80eaf14..7899ee9dd212 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -244,5 +244,4 @@ static int __init hung_task_init(void)
return 0;
}
-
-module_init(hung_task_init);
+subsys_initcall(hung_task_init);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 9c970167e402..00232e288f61 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -932,6 +932,7 @@ static int kimage_load_segment(struct kimage *image,
*/
struct kimage *kexec_image;
struct kimage *kexec_crash_image;
+int kexec_load_disabled;
static DEFINE_MUTEX(kexec_mutex);
@@ -942,7 +943,7 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
int result;
/* We only trust the superuser with rebooting the system. */
- if (!capable(CAP_SYS_BOOT))
+ if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
return -EPERM;
/*
@@ -1234,7 +1235,7 @@ static int __init crash_notes_memory_init(void)
}
return 0;
}
-module_init(crash_notes_memory_init)
+subsys_initcall(crash_notes_memory_init);
/*
@@ -1628,7 +1629,7 @@ static int __init crash_save_vmcoreinfo_init(void)
return 0;
}
-module_init(crash_save_vmcoreinfo_init)
+subsys_initcall(crash_save_vmcoreinfo_init);
/*
* Move into place and start executing a preloaded standalone
diff --git a/kernel/kmod.c b/kernel/kmod.c
index b086006c59e7..ed8e7bd35074 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -40,6 +40,7 @@
#include <linux/ptrace.h>
#include <linux/async.h>
#include <asm/uaccess.h>
+#include <linux/kthread.h>
#include <trace/events/module.h>
@@ -209,8 +210,14 @@ static int ____call_usermodehelper(void *data)
flush_signal_handlers(current, 1);
spin_unlock_irq(&current->sighand->siglock);
- /* We can run anywhere, unlike our parent keventd(). */
- set_cpus_allowed_ptr(current, cpu_all_mask);
+ /*
+ * Kthreadd can be restricted to a set of processors if the user wants
+ * to protect other processors from OS latencies. If that has happened
+ * then we do not want to disturb the other processors here either so we
+ * start the usermode helper threads only on the processors allowed for
+ * kthreadd.
+ */
+ set_kthreadd_affinity();
/*
* Our parent is keventd, which runs with elevated scheduling priority.
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 9659d38e008f..d945a949760f 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -126,7 +126,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
{
return sprintf(buf, "%lx %x\n",
paddr_vmcoreinfo_note(),
- (unsigned int)vmcoreinfo_max_size);
+ (unsigned int)sizeof(vmcoreinfo_note));
}
KERNEL_ATTR_RO(vmcoreinfo);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index b5ae3ee860a9..232f06c4a537 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -136,6 +136,15 @@ void *kthread_data(struct task_struct *task)
return to_kthread(task)->data;
}
+/*
+ * Set the affinity of the calling task to be the same
+ * as the kthreadd affinities.
+ */
+void set_kthreadd_affinity(void)
+{
+ set_cpus_allowed_ptr(current, &kthreadd_task->cpus_allowed);
+}
+
/**
* probe_kthread_data - speculative version of kthread_data()
* @task: possible kthread task in question
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index b38109e204af..d9f61a145802 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -637,7 +637,7 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
BUG_ON(!region);
} else
/* This allocation cannot fail */
- region = alloc_bootmem(sizeof(struct nosave_region));
+ region = memblock_virt_alloc(sizeof(struct nosave_region), 0);
region->start_pfn = start_pfn;
region->end_pfn = end_pfn;
list_add_tail(&region->list, &nosave_regions);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index be7c86bae576..b1d255f04135 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -757,14 +757,10 @@ void __init setup_log_buf(int early)
return;
if (early) {
- unsigned long mem;
-
- mem = memblock_alloc(new_log_buf_len, PAGE_SIZE);
- if (!mem)
- return;
- new_log_buf = __va(mem);
+ new_log_buf =
+ memblock_virt_alloc(new_log_buf_len, PAGE_SIZE);
} else {
- new_log_buf = alloc_bootmem_nopanic(new_log_buf_len);
+ new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, 0);
}
if (unlikely(!new_log_buf)) {
@@ -1599,10 +1595,13 @@ asmlinkage int vprintk_emit(int facility, int level,
* either merge it with the current buffer and flush, or if
* there was a race with interrupts (prefix == true) then just
* flush it out and store this line separately.
+ * If the preceding printk was from a different task and missed
+ * a newline, flush and append the newline.
*/
- if (cont.len && cont.owner == current) {
- if (!(lflags & LOG_PREFIX))
- stored = cont_add(facility, level, text, text_len);
+ if (cont.len) {
+ if (cont.owner == current && !(lflags & LOG_PREFIX))
+ stored = cont_add(facility, level, text,
+ text_len);
cont_flush(LOG_NEWLINE);
}
diff --git a/kernel/profile.c b/kernel/profile.c
index 6631e1ef55ab..b37576b22acc 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -604,5 +604,5 @@ int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */
hotcpu_notifier(profile_cpu_callback, 0);
return 0;
}
-module_init(create_proc_profile);
+subsys_initcall(create_proc_profile);
#endif /* CONFIG_PROC_FS */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index de46c2098e36..7f45fd52bc9a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1119,6 +1119,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
goto out;
+ trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
out:
@@ -1780,7 +1781,29 @@ void set_numabalancing_state(bool enabled)
numabalancing_enabled = enabled;
}
#endif /* CONFIG_SCHED_DEBUG */
-#endif /* CONFIG_NUMA_BALANCING */
+
+#ifdef CONFIG_PROC_SYSCTL
+int sysctl_numa_balancing(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ int err;
+ int state = numabalancing_enabled;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ t = *table;
+ t.data = &state;
+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+ if (write)
+ set_numabalancing_state(state);
+ return err;
+}
+#endif
+#endif
/*
* fork()/clone()-time setup:
@@ -4590,6 +4613,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
/* TODO: This is not properly updating schedstats */
+ trace_sched_move_numa(p, curr_cpu, target_cpu);
return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b24b6cfde9aa..867b0a4b0893 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1250,11 +1250,15 @@ static int task_numa_migrate(struct task_struct *p)
p->numa_scan_period = task_scan_min(p);
if (env.best_task == NULL) {
- int ret = migrate_task_to(p, env.best_cpu);
+ ret = migrate_task_to(p, env.best_cpu);
+ if (ret != 0)
+ trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
return ret;
}
ret = migrate_swap(p, env.best_task);
+ if (ret != 0)
+ trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
put_task_struct(env.best_task);
return ret;
}
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index da98af347e8b..a476bea17fbc 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -142,4 +142,4 @@ static int __init proc_schedstat_init(void)
proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
return 0;
}
-module_init(proc_schedstat_init);
+subsys_initcall(proc_schedstat_init);
diff --git a/kernel/signal.c b/kernel/signal.c
index 940b30ee9a30..52f881db1ca0 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2047,8 +2047,8 @@ static bool do_signal_stop(int signr)
if (task_set_jobctl_pending(current, signr | gstop))
sig->group_stop_count++;
- for (t = next_thread(current); t != current;
- t = next_thread(t)) {
+ t = current;
+ while_each_thread(current, t) {
/*
* Setting state to TASK_STOPPED for a group
* stop is always done with the siglock held,
@@ -3125,8 +3125,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
rm_from_queue_full(&mask, &t->signal->shared_pending);
do {
rm_from_queue_full(&mask, &t->pending);
- t = next_thread(t);
- } while (t != current);
+ } while_each_thread(current, t);
}
}
diff --git a/kernel/sys.c b/kernel/sys.c
index c72311324ea7..c0a58be780a4 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -895,8 +895,7 @@ SYSCALL_DEFINE1(times, struct tms __user *, tbuf)
* only important on a multi-user system anyway, to make sure one user
* can't send a signal to a process owned by another. -TYT, 12/12/91
*
- * Auch. Had to add the 'did_exec' flag to conform completely to POSIX.
- * LBT 04.03.94
+ * !PF_FORKNOEXEC check to conform completely to POSIX.
*/
SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
{
@@ -932,7 +931,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
if (task_session(p) != task_session(group_leader))
goto out;
err = -EACCES;
- if (p->did_exec)
+ if (!(p->flags & PF_FORKNOEXEC))
goto out;
} else {
err = -ESRCH;
@@ -1572,8 +1571,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
t = p;
do {
accumulate_thread_rusage(t, r);
- t = next_thread(t);
- } while (t != p);
+ } while_each_thread(p, t);
break;
default:
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c8da99f905cf..096db7452cbd 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -62,6 +62,7 @@
#include <linux/capability.h>
#include <linux/binfmts.h>
#include <linux/sched/sysctl.h>
+#include <linux/kexec.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
@@ -95,8 +96,6 @@
#if defined(CONFIG_SYSCTL)
/* External variables not in a header file. */
-extern int sysctl_overcommit_memory;
-extern int sysctl_overcommit_ratio;
extern int max_threads;
extern int suid_dumpable;
#ifdef CONFIG_COREDUMP
@@ -391,6 +390,15 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "numa_balancing",
+ .data = NULL, /* filled in by handler */
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_numa_balancing,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_SCHED_DEBUG */
{
@@ -607,6 +615,18 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
+#ifdef CONFIG_KEXEC
+ {
+ .procname = "kexec_load_disabled",
+ .data = &kexec_load_disabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ /* only handle a transition from default "0" to "1" */
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ .extra2 = &one,
+ },
+#endif
#ifdef CONFIG_MODULES
{
.procname = "modprobe",
@@ -1121,7 +1141,14 @@ static struct ctl_table vm_table[] = {
.data = &sysctl_overcommit_ratio,
.maxlen = sizeof(sysctl_overcommit_ratio),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = overcommit_ratio_handler,
+ },
+ {
+ .procname = "overcommit_kbytes",
+ .data = &sysctl_overcommit_kbytes,
+ .maxlen = sizeof(sysctl_overcommit_kbytes),
+ .mode = 0644,
+ .proc_handler = overcommit_kbytes_handler,
},
{
.procname = "page-cluster",
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 0abb36464281..e5387a069060 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -132,6 +132,10 @@ void __init sched_clock_register(u64 (*read)(void), int bits,
clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 3600);
r = rate;
+ /*
+ * Use 4MHz instead of 1MHz so that things like 1.832Mhz show as
+ * 1832Khz
+ */
if (r >= 4000000) {
r /= 1000000;
r_unit = 'M';
diff --git a/kernel/user.c b/kernel/user.c
index c006131beb77..294fc6a94168 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -222,5 +222,4 @@ static int __init uid_cache_init(void)
return 0;
}
-
-module_init(uid_cache_init);
+subsys_initcall(uid_cache_init);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 240fb62cf394..4f211868e6a2 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -902,4 +902,4 @@ static __init int user_namespaces_init(void)
user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
return 0;
}
-module_init(user_namespaces_init);
+subsys_initcall(user_namespaces_init);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 4431610f049a..9cbaef929c2f 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -158,14 +158,14 @@ void touch_all_softlockup_watchdogs(void)
#ifdef CONFIG_HARDLOCKUP_DETECTOR
void touch_nmi_watchdog(void)
{
- if (watchdog_user_enabled) {
- unsigned cpu;
-
- for_each_present_cpu(cpu) {
- if (per_cpu(watchdog_nmi_touch, cpu) != true)
- per_cpu(watchdog_nmi_touch, cpu) = true;
- }
- }
+ /*
+ * Using __raw here because some code paths have
+ * preemption enabled. If preemption is enabled
+ * then interrupts should be enabled too, in which
+ * case we shouldn't have to worry about the watchdog
+ * going off.
+ */
+ __raw_get_cpu_var(watchdog_nmi_touch) = true;
touch_softlockup_watchdog();
}
EXPORT_SYMBOL(touch_nmi_watchdog);
@@ -239,10 +239,12 @@ static void watchdog_overflow_callback(struct perf_event *event,
if (__this_cpu_read(hard_watchdog_warn) == true)
return;
- if (hardlockup_panic)
+ if (hardlockup_panic) {
+ trigger_all_cpu_backtrace();
panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
- else
+ } else {
WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
+ }
__this_cpu_write(hard_watchdog_warn, true);
return;
@@ -323,8 +325,10 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
else
dump_stack();
- if (softlockup_panic)
+ if (softlockup_panic) {
+ trigger_all_cpu_backtrace();
panic("softlockup: hung tasks");
+ }
__this_cpu_write(soft_watchdog_warn, true);
} else
__this_cpu_write(soft_watchdog_warn, false);
diff --git a/lib/Kconfig b/lib/Kconfig
index 991c98bc4a3f..d2cbd6f749f5 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -177,6 +177,13 @@ config CRC8
when they need to do cyclic redundancy check according CRC8
algorithm. Module will be called crc8.
+config CRC64_ECMA
+ tristate "CRC64 ECMA function"
+ help
+ This option provides CRC64 ECMA function. Drivers may select this
+ when they need to do cyclic redundancy check according to the CRC64
+ ECMA algorithm.
+
config AUDIT_GENERIC
bool
depends on AUDIT && !AUDIT_ARCH
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 501ec4deaddb..dbf94a7d25a8 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1573,8 +1573,43 @@ config DMA_API_DEBUG
With this option you will be able to detect common bugs in device
drivers like double-freeing of DMA mappings or freeing mappings that
were never allocated.
- This option causes a performance degredation. Use only if you want
- to debug device drivers. If unsure, say N.
+
+ This also attempts to catch cases where a page owned by DMA is
+ accessed by the cpu in a way that could cause data corruption. For
+ example, this enables cow_user_page() to check that the source page is
+ not undergoing DMA.
+
+ This option causes a performance degradation. Use only if you want to
+ debug device drivers and dma interactions.
+
+ If unsure, say N.
+
+config TEST_MODULE
+ tristate "Test module loading with 'hello world' module"
+ default n
+ depends on m
+ help
+ This builds the "test_module" module that emits "Hello, world"
+ on printk when loaded. It is designed to be used for basic
+ evaluation of the module loading subsystem (for example when
+ validating module verification). It lacks any extra dependencies,
+ and will not normally be loaded by the system unless explicitly
+ requested by name.
+
+ If unsure, say N.
+
+config TEST_USER_COPY
+ tristate "Test user/kernel boundary protections"
+ default n
+ depends on m
+ help
+ This builds the "test_user_copy" module that runs sanity checks
+ on the copy_to/from_user infrastructure, making sure basic
+ user/kernel boundary testing is working. If it fails to load,
+ a regression has been detected in the user/kernel memory boundary
+ protections.
+
+ If unsure, say N.
source "samples/Kconfig"
diff --git a/lib/Makefile b/lib/Makefile
index d0f79c547d97..7a42a946cab2 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -31,6 +31,8 @@ obj-y += string_helpers.o
obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
obj-y += kstrtox.o
obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o
+obj-$(CONFIG_TEST_MODULE) += test_module.o
+obj-$(CONFIG_TEST_USER_COPY) += test_user_copy.o
ifeq ($(CONFIG_DEBUG_KOBJECT),y)
CFLAGS_kobject.o += -DDEBUG
@@ -66,6 +68,7 @@ obj-$(CONFIG_CRC32) += crc32.o
obj-$(CONFIG_CRC7) += crc7.o
obj-$(CONFIG_LIBCRC32C) += libcrc32c.o
obj-$(CONFIG_CRC8) += crc8.o
+obj-$(CONFIG_CRC64_ECMA) += crc64_ecma.o
obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o
obj-$(CONFIG_ZLIB_INFLATE) += zlib_inflate/
diff --git a/lib/assoc_array.c b/lib/assoc_array.c
index 1b6a44f1ec3e..c0b1007011e1 100644
--- a/lib/assoc_array.c
+++ b/lib/assoc_array.c
@@ -157,7 +157,7 @@ enum assoc_array_walk_status {
assoc_array_walk_tree_empty,
assoc_array_walk_found_terminal_node,
assoc_array_walk_found_wrong_shortcut,
-} status;
+};
struct assoc_array_walk_result {
struct {
diff --git a/lib/cmdline.c b/lib/cmdline.c
index eb6791188cf5..d4932f745e92 100644
--- a/lib/cmdline.c
+++ b/lib/cmdline.c
@@ -49,13 +49,13 @@ static int get_range(char **str, int *pint)
* 3 - hyphen found to denote a range
*/
-int get_option (char **str, int *pint)
+int get_option(char **str, int *pint)
{
char *cur = *str;
if (!cur || !(*cur))
return 0;
- *pint = simple_strtol (cur, str, 0);
+ *pint = simple_strtol(cur, str, 0);
if (cur == *str)
return 0;
if (**str == ',') {
@@ -67,6 +67,7 @@ int get_option (char **str, int *pint)
return 1;
}
+EXPORT_SYMBOL(get_option);
/**
* get_options - Parse a string into a list of integers
@@ -84,13 +85,13 @@ int get_option (char **str, int *pint)
* the parse to end (typically a null terminator, if @str is
* completely parseable).
*/
-
+
char *get_options(const char *str, int nints, int *ints)
{
int res, i = 1;
while (i < nints) {
- res = get_option ((char **)&str, ints + i);
+ res = get_option((char **)&str, ints + i);
if (res == 0)
break;
if (res == 3) {
@@ -112,6 +113,7 @@ char *get_options(const char *str, int nints, int *ints)
ints[0] = i - 1;
return (char *)str;
}
+EXPORT_SYMBOL(get_options);
/**
* memparse - parse a string with mem suffixes into a number
@@ -152,8 +154,4 @@ unsigned long long memparse(const char *ptr, char **retptr)
return ret;
}
-
-
EXPORT_SYMBOL(memparse);
-EXPORT_SYMBOL(get_option);
-EXPORT_SYMBOL(get_options);
diff --git a/lib/cpumask.c b/lib/cpumask.c
index d327b87c99b7..b810b753c607 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -140,7 +140,7 @@ EXPORT_SYMBOL(zalloc_cpumask_var);
*/
void __init alloc_bootmem_cpumask_var(cpumask_var_t *mask)
{
- *mask = alloc_bootmem(cpumask_size());
+ *mask = memblock_virt_alloc(cpumask_size(), 0);
}
/**
@@ -161,6 +161,6 @@ EXPORT_SYMBOL(free_cpumask_var);
*/
void __init free_bootmem_cpumask_var(cpumask_var_t mask)
{
- free_bootmem(__pa(mask), cpumask_size());
+ memblock_free_early(__pa(mask), cpumask_size());
}
#endif
diff --git a/lib/crc64_ecma.c b/lib/crc64_ecma.c
new file mode 100644
index 000000000000..41629ea5a60c
--- /dev/null
+++ b/lib/crc64_ecma.c
@@ -0,0 +1,341 @@
+/*
+ * Copyright 2013 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <linux/crc64_ecma.h>
+
+
+#define CRC64_BYTE_MASK 0xFF
+#define CRC64_TABLE_SIZE 256
+
+
+struct crc64_table {
+ u64 seed;
+ u64 table[CRC64_TABLE_SIZE];
+};
+
+
+static struct crc64_table CRC64_ECMA_182 = {
+ CRC64_DEFAULT_INITVAL,
+ {
+ 0x0000000000000000ULL,
+ 0xb32e4cbe03a75f6fULL,
+ 0xf4843657a840a05bULL,
+ 0x47aa7ae9abe7ff34ULL,
+ 0x7bd0c384ff8f5e33ULL,
+ 0xc8fe8f3afc28015cULL,
+ 0x8f54f5d357cffe68ULL,
+ 0x3c7ab96d5468a107ULL,
+ 0xf7a18709ff1ebc66ULL,
+ 0x448fcbb7fcb9e309ULL,
+ 0x0325b15e575e1c3dULL,
+ 0xb00bfde054f94352ULL,
+ 0x8c71448d0091e255ULL,
+ 0x3f5f08330336bd3aULL,
+ 0x78f572daa8d1420eULL,
+ 0xcbdb3e64ab761d61ULL,
+ 0x7d9ba13851336649ULL,
+ 0xceb5ed8652943926ULL,
+ 0x891f976ff973c612ULL,
+ 0x3a31dbd1fad4997dULL,
+ 0x064b62bcaebc387aULL,
+ 0xb5652e02ad1b6715ULL,
+ 0xf2cf54eb06fc9821ULL,
+ 0x41e11855055bc74eULL,
+ 0x8a3a2631ae2dda2fULL,
+ 0x39146a8fad8a8540ULL,
+ 0x7ebe1066066d7a74ULL,
+ 0xcd905cd805ca251bULL,
+ 0xf1eae5b551a2841cULL,
+ 0x42c4a90b5205db73ULL,
+ 0x056ed3e2f9e22447ULL,
+ 0xb6409f5cfa457b28ULL,
+ 0xfb374270a266cc92ULL,
+ 0x48190ecea1c193fdULL,
+ 0x0fb374270a266cc9ULL,
+ 0xbc9d3899098133a6ULL,
+ 0x80e781f45de992a1ULL,
+ 0x33c9cd4a5e4ecdceULL,
+ 0x7463b7a3f5a932faULL,
+ 0xc74dfb1df60e6d95ULL,
+ 0x0c96c5795d7870f4ULL,
+ 0xbfb889c75edf2f9bULL,
+ 0xf812f32ef538d0afULL,
+ 0x4b3cbf90f69f8fc0ULL,
+ 0x774606fda2f72ec7ULL,
+ 0xc4684a43a15071a8ULL,
+ 0x83c230aa0ab78e9cULL,
+ 0x30ec7c140910d1f3ULL,
+ 0x86ace348f355aadbULL,
+ 0x3582aff6f0f2f5b4ULL,
+ 0x7228d51f5b150a80ULL,
+ 0xc10699a158b255efULL,
+ 0xfd7c20cc0cdaf4e8ULL,
+ 0x4e526c720f7dab87ULL,
+ 0x09f8169ba49a54b3ULL,
+ 0xbad65a25a73d0bdcULL,
+ 0x710d64410c4b16bdULL,
+ 0xc22328ff0fec49d2ULL,
+ 0x85895216a40bb6e6ULL,
+ 0x36a71ea8a7ace989ULL,
+ 0x0adda7c5f3c4488eULL,
+ 0xb9f3eb7bf06317e1ULL,
+ 0xfe5991925b84e8d5ULL,
+ 0x4d77dd2c5823b7baULL,
+ 0x64b62bcaebc387a1ULL,
+ 0xd7986774e864d8ceULL,
+ 0x90321d9d438327faULL,
+ 0x231c512340247895ULL,
+ 0x1f66e84e144cd992ULL,
+ 0xac48a4f017eb86fdULL,
+ 0xebe2de19bc0c79c9ULL,
+ 0x58cc92a7bfab26a6ULL,
+ 0x9317acc314dd3bc7ULL,
+ 0x2039e07d177a64a8ULL,
+ 0x67939a94bc9d9b9cULL,
+ 0xd4bdd62abf3ac4f3ULL,
+ 0xe8c76f47eb5265f4ULL,
+ 0x5be923f9e8f53a9bULL,
+ 0x1c4359104312c5afULL,
+ 0xaf6d15ae40b59ac0ULL,
+ 0x192d8af2baf0e1e8ULL,
+ 0xaa03c64cb957be87ULL,
+ 0xeda9bca512b041b3ULL,
+ 0x5e87f01b11171edcULL,
+ 0x62fd4976457fbfdbULL,
+ 0xd1d305c846d8e0b4ULL,
+ 0x96797f21ed3f1f80ULL,
+ 0x2557339fee9840efULL,
+ 0xee8c0dfb45ee5d8eULL,
+ 0x5da24145464902e1ULL,
+ 0x1a083bacedaefdd5ULL,
+ 0xa9267712ee09a2baULL,
+ 0x955cce7fba6103bdULL,
+ 0x267282c1b9c65cd2ULL,
+ 0x61d8f8281221a3e6ULL,
+ 0xd2f6b4961186fc89ULL,
+ 0x9f8169ba49a54b33ULL,
+ 0x2caf25044a02145cULL,
+ 0x6b055fede1e5eb68ULL,
+ 0xd82b1353e242b407ULL,
+ 0xe451aa3eb62a1500ULL,
+ 0x577fe680b58d4a6fULL,
+ 0x10d59c691e6ab55bULL,
+ 0xa3fbd0d71dcdea34ULL,
+ 0x6820eeb3b6bbf755ULL,
+ 0xdb0ea20db51ca83aULL,
+ 0x9ca4d8e41efb570eULL,
+ 0x2f8a945a1d5c0861ULL,
+ 0x13f02d374934a966ULL,
+ 0xa0de61894a93f609ULL,
+ 0xe7741b60e174093dULL,
+ 0x545a57dee2d35652ULL,
+ 0xe21ac88218962d7aULL,
+ 0x5134843c1b317215ULL,
+ 0x169efed5b0d68d21ULL,
+ 0xa5b0b26bb371d24eULL,
+ 0x99ca0b06e7197349ULL,
+ 0x2ae447b8e4be2c26ULL,
+ 0x6d4e3d514f59d312ULL,
+ 0xde6071ef4cfe8c7dULL,
+ 0x15bb4f8be788911cULL,
+ 0xa6950335e42fce73ULL,
+ 0xe13f79dc4fc83147ULL,
+ 0x521135624c6f6e28ULL,
+ 0x6e6b8c0f1807cf2fULL,
+ 0xdd45c0b11ba09040ULL,
+ 0x9aefba58b0476f74ULL,
+ 0x29c1f6e6b3e0301bULL,
+ 0xc96c5795d7870f42ULL,
+ 0x7a421b2bd420502dULL,
+ 0x3de861c27fc7af19ULL,
+ 0x8ec62d7c7c60f076ULL,
+ 0xb2bc941128085171ULL,
+ 0x0192d8af2baf0e1eULL,
+ 0x4638a2468048f12aULL,
+ 0xf516eef883efae45ULL,
+ 0x3ecdd09c2899b324ULL,
+ 0x8de39c222b3eec4bULL,
+ 0xca49e6cb80d9137fULL,
+ 0x7967aa75837e4c10ULL,
+ 0x451d1318d716ed17ULL,
+ 0xf6335fa6d4b1b278ULL,
+ 0xb199254f7f564d4cULL,
+ 0x02b769f17cf11223ULL,
+ 0xb4f7f6ad86b4690bULL,
+ 0x07d9ba1385133664ULL,
+ 0x4073c0fa2ef4c950ULL,
+ 0xf35d8c442d53963fULL,
+ 0xcf273529793b3738ULL,
+ 0x7c0979977a9c6857ULL,
+ 0x3ba3037ed17b9763ULL,
+ 0x888d4fc0d2dcc80cULL,
+ 0x435671a479aad56dULL,
+ 0xf0783d1a7a0d8a02ULL,
+ 0xb7d247f3d1ea7536ULL,
+ 0x04fc0b4dd24d2a59ULL,
+ 0x3886b22086258b5eULL,
+ 0x8ba8fe9e8582d431ULL,
+ 0xcc0284772e652b05ULL,
+ 0x7f2cc8c92dc2746aULL,
+ 0x325b15e575e1c3d0ULL,
+ 0x8175595b76469cbfULL,
+ 0xc6df23b2dda1638bULL,
+ 0x75f16f0cde063ce4ULL,
+ 0x498bd6618a6e9de3ULL,
+ 0xfaa59adf89c9c28cULL,
+ 0xbd0fe036222e3db8ULL,
+ 0x0e21ac88218962d7ULL,
+ 0xc5fa92ec8aff7fb6ULL,
+ 0x76d4de52895820d9ULL,
+ 0x317ea4bb22bfdfedULL,
+ 0x8250e80521188082ULL,
+ 0xbe2a516875702185ULL,
+ 0x0d041dd676d77eeaULL,
+ 0x4aae673fdd3081deULL,
+ 0xf9802b81de97deb1ULL,
+ 0x4fc0b4dd24d2a599ULL,
+ 0xfceef8632775faf6ULL,
+ 0xbb44828a8c9205c2ULL,
+ 0x086ace348f355aadULL,
+ 0x34107759db5dfbaaULL,
+ 0x873e3be7d8faa4c5ULL,
+ 0xc094410e731d5bf1ULL,
+ 0x73ba0db070ba049eULL,
+ 0xb86133d4dbcc19ffULL,
+ 0x0b4f7f6ad86b4690ULL,
+ 0x4ce50583738cb9a4ULL,
+ 0xffcb493d702be6cbULL,
+ 0xc3b1f050244347ccULL,
+ 0x709fbcee27e418a3ULL,
+ 0x3735c6078c03e797ULL,
+ 0x841b8ab98fa4b8f8ULL,
+ 0xadda7c5f3c4488e3ULL,
+ 0x1ef430e13fe3d78cULL,
+ 0x595e4a08940428b8ULL,
+ 0xea7006b697a377d7ULL,
+ 0xd60abfdbc3cbd6d0ULL,
+ 0x6524f365c06c89bfULL,
+ 0x228e898c6b8b768bULL,
+ 0x91a0c532682c29e4ULL,
+ 0x5a7bfb56c35a3485ULL,
+ 0xe955b7e8c0fd6beaULL,
+ 0xaeffcd016b1a94deULL,
+ 0x1dd181bf68bdcbb1ULL,
+ 0x21ab38d23cd56ab6ULL,
+ 0x9285746c3f7235d9ULL,
+ 0xd52f0e859495caedULL,
+ 0x6601423b97329582ULL,
+ 0xd041dd676d77eeaaULL,
+ 0x636f91d96ed0b1c5ULL,
+ 0x24c5eb30c5374ef1ULL,
+ 0x97eba78ec690119eULL,
+ 0xab911ee392f8b099ULL,
+ 0x18bf525d915feff6ULL,
+ 0x5f1528b43ab810c2ULL,
+ 0xec3b640a391f4fadULL,
+ 0x27e05a6e926952ccULL,
+ 0x94ce16d091ce0da3ULL,
+ 0xd3646c393a29f297ULL,
+ 0x604a2087398eadf8ULL,
+ 0x5c3099ea6de60cffULL,
+ 0xef1ed5546e415390ULL,
+ 0xa8b4afbdc5a6aca4ULL,
+ 0x1b9ae303c601f3cbULL,
+ 0x56ed3e2f9e224471ULL,
+ 0xe5c372919d851b1eULL,
+ 0xa26908783662e42aULL,
+ 0x114744c635c5bb45ULL,
+ 0x2d3dfdab61ad1a42ULL,
+ 0x9e13b115620a452dULL,
+ 0xd9b9cbfcc9edba19ULL,
+ 0x6a978742ca4ae576ULL,
+ 0xa14cb926613cf817ULL,
+ 0x1262f598629ba778ULL,
+ 0x55c88f71c97c584cULL,
+ 0xe6e6c3cfcadb0723ULL,
+ 0xda9c7aa29eb3a624ULL,
+ 0x69b2361c9d14f94bULL,
+ 0x2e184cf536f3067fULL,
+ 0x9d36004b35545910ULL,
+ 0x2b769f17cf112238ULL,
+ 0x9858d3a9ccb67d57ULL,
+ 0xdff2a94067518263ULL,
+ 0x6cdce5fe64f6dd0cULL,
+ 0x50a65c93309e7c0bULL,
+ 0xe388102d33392364ULL,
+ 0xa4226ac498dedc50ULL,
+ 0x170c267a9b79833fULL,
+ 0xdcd7181e300f9e5eULL,
+ 0x6ff954a033a8c131ULL,
+ 0x28532e49984f3e05ULL,
+ 0x9b7d62f79be8616aULL,
+ 0xa707db9acf80c06dULL,
+ 0x14299724cc279f02ULL,
+ 0x5383edcd67c06036ULL,
+ 0xe0ada17364673f59ULL
+ }
+};
+
+
+/*
+ * crc64_ecma_seed - Initializes the CRC64 ECMA seed.
+ */
+u64 crc64_ecma_seed(void)
+{
+ return CRC64_ECMA_182.seed;
+}
+EXPORT_SYMBOL(crc64_ecma_seed);
+
+/*
+ * crc64_ecma - Computes the 64 bit ECMA CRC.
+ *
+ * pdata: pointer to the data to compute checksum for.
+ * nbytes: number of bytes in data buffer.
+ * seed: CRC seed.
+ */
+u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed)
+{
+ unsigned int i;
+ u64 crc = seed;
+
+ for (i = 0; i < nbytes; i++)
+ crc = CRC64_ECMA_182.table[(crc ^ pdata[i]) & CRC64_BYTE_MASK] ^
+ (crc >> 8);
+
+ return crc;
+}
+EXPORT_SYMBOL(crc64_ecma);
+
+MODULE_DESCRIPTION("CRC64 ECMA function");
+MODULE_AUTHOR("Freescale Semiconductor Inc.");
+MODULE_LICENSE("GPL");
diff --git a/lib/decompress_unlz4.c b/lib/decompress_unlz4.c
index 3e67cfad16ad..7d1e83caf8ad 100644
--- a/lib/decompress_unlz4.c
+++ b/lib/decompress_unlz4.c
@@ -141,6 +141,7 @@ STATIC inline int INIT unlz4(u8 *input, int in_len,
goto exit_2;
}
+ ret = -1;
if (flush && flush(outp, dest_len) != dest_len)
goto exit_2;
if (output)
diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index d87a17a819d0..c38083871f11 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -53,11 +53,26 @@ enum map_err_types {
#define DMA_DEBUG_STACKTRACE_ENTRIES 5
+/**
+ * struct dma_debug_entry - track a dma_map* or dma_alloc_coherent mapping
+ * @list: node on pre-allocated free_entries list
+ * @dev: 'dev' argument to dma_map_{page|single|sg} or dma_alloc_coherent
+ * @type: single, page, sg, coherent
+ * @pfn: page frame of the start address
+ * @offset: offset of mapping relative to pfn
+ * @size: length of the mapping
+ * @direction: enum dma_data_direction
+ * @sg_call_ents: 'nents' from dma_map_sg
+ * @sg_mapped_ents: 'mapped_ents' from dma_map_sg
+ * @map_err_type: track whether dma_mapping_error() was checked
+ * @stacktrace: support backtraces when a violation is detected
+ */
struct dma_debug_entry {
struct list_head list;
struct device *dev;
int type;
- phys_addr_t paddr;
+ unsigned long pfn;
+ size_t offset;
u64 dev_addr;
u64 size;
int direction;
@@ -372,6 +387,11 @@ static void hash_bucket_del(struct dma_debug_entry *entry)
list_del(&entry->list);
}
+static unsigned long long phys_addr(struct dma_debug_entry *entry)
+{
+ return page_to_phys(pfn_to_page(entry->pfn)) + entry->offset;
+}
+
/*
* Dump mapping entries for debugging purposes
*/
@@ -389,9 +409,9 @@ void debug_dma_dump_mappings(struct device *dev)
list_for_each_entry(entry, &bucket->list, list) {
if (!dev || dev == entry->dev) {
dev_info(entry->dev,
- "%s idx %d P=%Lx D=%Lx L=%Lx %s %s\n",
+ "%s idx %d P=%Lx N=%lx D=%Lx L=%Lx %s %s\n",
type2name[entry->type], idx,
- (unsigned long long)entry->paddr,
+ phys_addr(entry), entry->pfn,
entry->dev_addr, entry->size,
dir2name[entry->direction],
maperr2str[entry->map_err_type]);
@@ -404,6 +424,133 @@ void debug_dma_dump_mappings(struct device *dev)
EXPORT_SYMBOL(debug_dma_dump_mappings);
/*
+ * For each page mapped (initial page in the case of
+ * dma_alloc_coherent/dma_map_{single|page}, or each page in a
+ * scatterlist) insert into this tree using the pfn as the key. At
+ * dma_unmap_{single|sg|page} or dma_free_coherent delete the entry. If
+ * the pfn already exists at insertion time add a tag as a reference
+ * count for the overlapping mappings. For now, the overlap tracking
+ * just ensures that 'unmaps' balance 'maps' before marking the pfn
+ * idle, but we should also be flagging overlaps as an API violation.
+ *
+ * Memory usage is mostly constrained by the maximum number of available
+ * dma-debug entries in that we need a free dma_debug_entry before
+ * inserting into the tree. In the case of dma_map_{single|page} and
+ * dma_alloc_coherent there is only one dma_debug_entry and one pfn to
+ * track per event. dma_map_sg(), on the other hand,
+ * consumes a single dma_debug_entry, but inserts 'nents' entries into
+ * the tree.
+ *
+ * At any time debug_dma_assert_idle() can be called to trigger a
+ * warning if the given page is in the active set.
+ */
+static RADIX_TREE(dma_active_pfn, GFP_NOWAIT);
+static DEFINE_SPINLOCK(radix_lock);
+#define ACTIVE_PFN_MAX_OVERLAP ((1 << RADIX_TREE_MAX_TAGS) - 1)
+
+static int active_pfn_read_overlap(unsigned long pfn)
+{
+ int overlap = 0, i;
+
+ for (i = RADIX_TREE_MAX_TAGS - 1; i >= 0; i--)
+ if (radix_tree_tag_get(&dma_active_pfn, pfn, i))
+ overlap |= 1 << i;
+ return overlap;
+}
+
+static int active_pfn_set_overlap(unsigned long pfn, int overlap)
+{
+ int i;
+
+ if (overlap > ACTIVE_PFN_MAX_OVERLAP || overlap < 0)
+ return 0;
+
+ for (i = RADIX_TREE_MAX_TAGS - 1; i >= 0; i--)
+ if (overlap & 1 << i)
+ radix_tree_tag_set(&dma_active_pfn, pfn, i);
+ else
+ radix_tree_tag_clear(&dma_active_pfn, pfn, i);
+
+ return overlap;
+}
+
+static void active_pfn_inc_overlap(unsigned long pfn)
+{
+ int overlap = active_pfn_read_overlap(pfn);
+
+ overlap = active_pfn_set_overlap(pfn, ++overlap);
+
+ /* If we overflowed the overlap counter then we're potentially
+ * leaking dma-mappings. Otherwise, if maps and unmaps are
+ * balanced then this overflow may cause false negatives in
+ * debug_dma_assert_idle() as the pfn may be marked idle
+ * prematurely.
+ */
+ WARN_ONCE(overlap == 0,
+ "DMA-API: exceeded %d overlapping mappings of pfn %lx\n",
+ ACTIVE_PFN_MAX_OVERLAP, pfn);
+}
+
+static int active_pfn_dec_overlap(unsigned long pfn)
+{
+ int overlap = active_pfn_read_overlap(pfn);
+
+ return active_pfn_set_overlap(pfn, --overlap);
+}
+
+static int active_pfn_insert(struct dma_debug_entry *entry)
+{
+ unsigned long flags;
+ int rc;
+
+ spin_lock_irqsave(&radix_lock, flags);
+ rc = radix_tree_insert(&dma_active_pfn, entry->pfn, entry);
+ if (rc == -EEXIST)
+ active_pfn_inc_overlap(entry->pfn);
+ spin_unlock_irqrestore(&radix_lock, flags);
+
+ return rc;
+}
+
+static void active_pfn_remove(struct dma_debug_entry *entry)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&radix_lock, flags);
+ if (active_pfn_dec_overlap(entry->pfn) == 0)
+ radix_tree_delete(&dma_active_pfn, entry->pfn);
+ spin_unlock_irqrestore(&radix_lock, flags);
+}
+
+/**
+ * debug_dma_assert_idle() - assert that a page is not undergoing dma
+ * @page: page to lookup in the dma_active_pfn tree
+ *
+ * Place a call to this routine in cases where the cpu touching the page
+ * before the dma completes (page is dma_unmapped) will lead to data
+ * corruption.
+ */
+void debug_dma_assert_idle(struct page *page)
+{
+ unsigned long flags;
+ struct dma_debug_entry *entry;
+
+ if (!page)
+ return;
+
+ spin_lock_irqsave(&radix_lock, flags);
+ entry = radix_tree_lookup(&dma_active_pfn, page_to_pfn(page));
+ spin_unlock_irqrestore(&radix_lock, flags);
+
+ if (!entry)
+ return;
+
+ err_printk(entry->dev, entry,
+ "DMA-API: cpu touching an active dma mapped page "
+ "[pfn=0x%lx]\n", entry->pfn);
+}
+
+/*
* Wrapper function for adding an entry to the hash.
* This function takes care of locking itself.
*/
@@ -411,10 +558,21 @@ static void add_dma_entry(struct dma_debug_entry *entry)
{
struct hash_bucket *bucket;
unsigned long flags;
+ int rc;
bucket = get_hash_bucket(entry, &flags);
hash_bucket_add(bucket, entry);
put_hash_bucket(bucket, &flags);
+
+ rc = active_pfn_insert(entry);
+ if (rc == -ENOMEM) {
+ pr_err("DMA-API: pfn tracking ENOMEM, dma-debug disabled\n");
+ global_disable = true;
+ }
+
+ /* TODO: report -EEXIST errors here as overlapping mappings are
+ * not supported by the DMA API
+ */
}
static struct dma_debug_entry *__dma_entry_alloc(void)
@@ -469,6 +627,8 @@ static void dma_entry_free(struct dma_debug_entry *entry)
{
unsigned long flags;
+ active_pfn_remove(entry);
+
/*
* add to beginning of the list - this way the entries are
* more likely cache hot when they are reallocated.
@@ -895,15 +1055,15 @@ static void check_unmap(struct dma_debug_entry *ref)
ref->dev_addr, ref->size,
type2name[entry->type], type2name[ref->type]);
} else if ((entry->type == dma_debug_coherent) &&
- (ref->paddr != entry->paddr)) {
+ (phys_addr(ref) != phys_addr(entry))) {
err_printk(ref->dev, entry, "DMA-API: device driver frees "
"DMA memory with different CPU address "
"[device address=0x%016llx] [size=%llu bytes] "
"[cpu alloc address=0x%016llx] "
"[cpu free address=0x%016llx]",
ref->dev_addr, ref->size,
- (unsigned long long)entry->paddr,
- (unsigned long long)ref->paddr);
+ phys_addr(entry),
+ phys_addr(ref));
}
if (ref->sg_call_ents && ref->type == dma_debug_sg &&
@@ -1052,7 +1212,8 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
entry->dev = dev;
entry->type = dma_debug_page;
- entry->paddr = page_to_phys(page) + offset;
+ entry->pfn = page_to_pfn(page);
+ entry->offset = offset,
entry->dev_addr = dma_addr;
entry->size = size;
entry->direction = direction;
@@ -1148,7 +1309,8 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
entry->type = dma_debug_sg;
entry->dev = dev;
- entry->paddr = sg_phys(s);
+ entry->pfn = page_to_pfn(sg_page(s));
+ entry->offset = s->offset,
entry->size = sg_dma_len(s);
entry->dev_addr = sg_dma_address(s);
entry->direction = direction;
@@ -1198,7 +1360,8 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
struct dma_debug_entry ref = {
.type = dma_debug_sg,
.dev = dev,
- .paddr = sg_phys(s),
+ .pfn = page_to_pfn(sg_page(s)),
+ .offset = s->offset,
.dev_addr = sg_dma_address(s),
.size = sg_dma_len(s),
.direction = dir,
@@ -1233,7 +1396,8 @@ void debug_dma_alloc_coherent(struct device *dev, size_t size,
entry->type = dma_debug_coherent;
entry->dev = dev;
- entry->paddr = virt_to_phys(virt);
+ entry->pfn = page_to_pfn(virt_to_page(virt));
+ entry->offset = (size_t) virt & PAGE_MASK;
entry->size = size;
entry->dev_addr = dma_addr;
entry->direction = DMA_BIDIRECTIONAL;
@@ -1248,7 +1412,8 @@ void debug_dma_free_coherent(struct device *dev, size_t size,
struct dma_debug_entry ref = {
.type = dma_debug_coherent,
.dev = dev,
- .paddr = virt_to_phys(virt),
+ .pfn = page_to_pfn(virt_to_page(virt)),
+ .offset = (size_t) virt & PAGE_MASK,
.dev_addr = addr,
.size = size,
.direction = DMA_BIDIRECTIONAL,
@@ -1356,7 +1521,8 @@ void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
struct dma_debug_entry ref = {
.type = dma_debug_sg,
.dev = dev,
- .paddr = sg_phys(s),
+ .pfn = page_to_pfn(sg_page(s)),
+ .offset = s->offset,
.dev_addr = sg_dma_address(s),
.size = sg_dma_len(s),
.direction = direction,
@@ -1388,7 +1554,8 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
struct dma_debug_entry ref = {
.type = dma_debug_sg,
.dev = dev,
- .paddr = sg_phys(s),
+ .pfn = page_to_pfn(sg_page(s)),
+ .offset = s->offset,
.dev_addr = sg_dma_address(s),
.size = sg_dma_len(s),
.direction = direction,
diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c
index c37aeacd7651..600ac57e2777 100644
--- a/lib/dynamic_debug.c
+++ b/lib/dynamic_debug.c
@@ -8,6 +8,7 @@
* By Greg Banks <gnb@melbourne.sgi.com>
* Copyright (c) 2008 Silicon Graphics Inc. All Rights Reserved.
* Copyright (C) 2011 Bart Van Assche. All Rights Reserved.
+ * Copyright (C) 2013 Du, Changbin <changbin.du@gmail.com>
*/
#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
@@ -24,6 +25,7 @@
#include <linux/sysctl.h>
#include <linux/ctype.h>
#include <linux/string.h>
+#include <linux/parser.h>
#include <linux/string_helpers.h>
#include <linux/uaccess.h>
#include <linux/dynamic_debug.h>
@@ -147,7 +149,8 @@ static int ddebug_change(const struct ddebug_query *query,
list_for_each_entry(dt, &ddebug_tables, link) {
/* match against the module name */
- if (query->module && strcmp(query->module, dt->mod_name))
+ if (query->module &&
+ !match_wildcard(query->module, dt->mod_name))
continue;
for (i = 0; i < dt->num_ddebugs; i++) {
@@ -155,14 +158,16 @@ static int ddebug_change(const struct ddebug_query *query,
/* match against the source filename */
if (query->filename &&
- strcmp(query->filename, dp->filename) &&
- strcmp(query->filename, kbasename(dp->filename)) &&
- strcmp(query->filename, trim_prefix(dp->filename)))
+ !match_wildcard(query->filename, dp->filename) &&
+ !match_wildcard(query->filename,
+ kbasename(dp->filename)) &&
+ !match_wildcard(query->filename,
+ trim_prefix(dp->filename)))
continue;
/* match against the function */
if (query->function &&
- strcmp(query->function, dp->function))
+ !match_wildcard(query->function, dp->function))
continue;
/* match against the format */
diff --git a/lib/kstrtox.c b/lib/kstrtox.c
index f78ae0c0c4e2..ec8da78df9be 100644
--- a/lib/kstrtox.c
+++ b/lib/kstrtox.c
@@ -92,7 +92,6 @@ static int _kstrtoull(const char *s, unsigned int base, unsigned long long *res)
rv = _parse_integer(s, base, &_res);
if (rv & KSTRTOX_OVERFLOW)
return -ERANGE;
- rv &= ~KSTRTOX_OVERFLOW;
if (rv == 0)
return -EINVAL;
s += rv;
diff --git a/lib/parser.c b/lib/parser.c
index 807b2aaa33fa..b6d11631231b 100644
--- a/lib/parser.c
+++ b/lib/parser.c
@@ -113,6 +113,7 @@ int match_token(char *s, const match_table_t table, substring_t args[])
return p->token;
}
+EXPORT_SYMBOL(match_token);
/**
* match_number: scan a number in the given base from a substring_t
@@ -163,6 +164,7 @@ int match_int(substring_t *s, int *result)
{
return match_number(s, result, 0);
}
+EXPORT_SYMBOL(match_int);
/**
* match_octal: - scan an octal representation of an integer from a substring_t
@@ -177,6 +179,7 @@ int match_octal(substring_t *s, int *result)
{
return match_number(s, result, 8);
}
+EXPORT_SYMBOL(match_octal);
/**
* match_hex: - scan a hex representation of an integer from a substring_t
@@ -191,6 +194,58 @@ int match_hex(substring_t *s, int *result)
{
return match_number(s, result, 16);
}
+EXPORT_SYMBOL(match_hex);
+
+/**
+ * match_wildcard: - parse if a string matches given wildcard pattern
+ * @pattern: wildcard pattern
+ * @str: the string to be parsed
+ *
+ * Description: Parse the string @str to check if matches wildcard
+ * pattern @pattern. The pattern may contain two type wildcardes:
+ * '*' - matches zero or more characters
+ * '?' - matches one character
+ * If it's matched, return true, else return false.
+ */
+bool match_wildcard(const char *pattern, const char *str)
+{
+ const char *s = str;
+ const char *p = pattern;
+ bool star = false;
+
+ while (*s) {
+ switch (*p) {
+ case '?':
+ s++;
+ p++;
+ break;
+ case '*':
+ star = true;
+ str = s;
+ if (!*++p)
+ return true;
+ pattern = p;
+ break;
+ default:
+ if (*s == *p) {
+ s++;
+ p++;
+ } else {
+ if (!star)
+ return false;
+ str++;
+ s = str;
+ p = pattern;
+ }
+ break;
+ }
+ }
+
+ if (*p == '*')
+ ++p;
+ return !*p;
+}
+EXPORT_SYMBOL(match_wildcard);
/**
* match_strlcpy: - Copy the characters from a substring_t to a sized buffer
@@ -213,6 +268,7 @@ size_t match_strlcpy(char *dest, const substring_t *src, size_t size)
}
return ret;
}
+EXPORT_SYMBOL(match_strlcpy);
/**
* match_strdup: - allocate a new string with the contents of a substring_t
@@ -230,10 +286,4 @@ char *match_strdup(const substring_t *s)
match_strlcpy(p, s, sz);
return p;
}
-
-EXPORT_SYMBOL(match_token);
-EXPORT_SYMBOL(match_int);
-EXPORT_SYMBOL(match_octal);
-EXPORT_SYMBOL(match_hex);
-EXPORT_SYMBOL(match_strlcpy);
EXPORT_SYMBOL(match_strdup);
diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c
index 31dd4ccd3baa..8b3c9dc88262 100644
--- a/lib/rbtree_test.c
+++ b/lib/rbtree_test.c
@@ -8,8 +8,8 @@
#define CHECK_LOOPS 100
struct test_node {
- struct rb_node rb;
u32 key;
+ struct rb_node rb;
/* following fields used for testing augmented rbtree functionality */
u32 val;
@@ -114,6 +114,16 @@ static int black_path_count(struct rb_node *rb)
return count;
}
+static void check_postorder_foreach(int nr_nodes)
+{
+ struct test_node *cur, *n;
+ int count = 0;
+ rbtree_postorder_for_each_entry_safe(cur, n, &root, rb)
+ count++;
+
+ WARN_ON_ONCE(count != nr_nodes);
+}
+
static void check_postorder(int nr_nodes)
{
struct rb_node *rb;
@@ -148,6 +158,7 @@ static void check(int nr_nodes)
WARN_ON_ONCE(count < (1 << black_path_count(rb_last(&root))) - 1);
check_postorder(nr_nodes);
+ check_postorder_foreach(nr_nodes);
}
static void check_augmented(int nr_nodes)
diff --git a/lib/show_mem.c b/lib/show_mem.c
index 5847a4921b8e..09225796991a 100644
--- a/lib/show_mem.c
+++ b/lib/show_mem.c
@@ -17,9 +17,6 @@ void show_mem(unsigned int filter)
printk("Mem-Info:\n");
show_free_areas(filter);
- if (filter & SHOW_MEM_FILTER_PAGE_COUNT)
- return;
-
for_each_online_pgdat(pgdat) {
unsigned long flags;
int zoneid;
@@ -46,4 +43,7 @@ void show_mem(unsigned int filter)
printk("%lu pages in pagetable cache\n",
quicklist_total_size());
#endif
+#ifdef CONFIG_MEMORY_FAILURE
+ printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
+#endif
}
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 4634ac9cdb38..2e1c102759ce 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -172,8 +172,9 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
/*
* Get the overflow emergency buffer
*/
- v_overflow_buffer = alloc_bootmem_low_pages_nopanic(
- PAGE_ALIGN(io_tlb_overflow));
+ v_overflow_buffer = memblock_virt_alloc_nopanic(
+ PAGE_ALIGN(io_tlb_overflow),
+ PAGE_SIZE);
if (!v_overflow_buffer)
return -ENOMEM;
@@ -184,11 +185,15 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
* to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
* between io_tlb_start and io_tlb_end.
*/
- io_tlb_list = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_nslabs * sizeof(int)));
+ io_tlb_list = memblock_virt_alloc(
+ PAGE_ALIGN(io_tlb_nslabs * sizeof(int)),
+ PAGE_SIZE);
for (i = 0; i < io_tlb_nslabs; i++)
io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
io_tlb_index = 0;
- io_tlb_orig_addr = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)));
+ io_tlb_orig_addr = memblock_virt_alloc(
+ PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)),
+ PAGE_SIZE);
if (verbose)
swiotlb_print_info();
@@ -215,13 +220,13 @@ swiotlb_init(int verbose)
bytes = io_tlb_nslabs << IO_TLB_SHIFT;
/* Get IO TLB memory from the low pages */
- vstart = alloc_bootmem_low_pages_nopanic(PAGE_ALIGN(bytes));
+ vstart = memblock_virt_alloc_nopanic(PAGE_ALIGN(bytes), PAGE_SIZE);
if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose))
return;
if (io_tlb_start)
- free_bootmem(io_tlb_start,
- PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
+ memblock_free_early(io_tlb_start,
+ PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
pr_warn("Cannot allocate SWIOTLB buffer");
no_iotlb_memory = true;
}
@@ -357,14 +362,14 @@ void __init swiotlb_free(void)
free_pages((unsigned long)phys_to_virt(io_tlb_start),
get_order(io_tlb_nslabs << IO_TLB_SHIFT));
} else {
- free_bootmem_late(io_tlb_overflow_buffer,
- PAGE_ALIGN(io_tlb_overflow));
- free_bootmem_late(__pa(io_tlb_orig_addr),
- PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)));
- free_bootmem_late(__pa(io_tlb_list),
- PAGE_ALIGN(io_tlb_nslabs * sizeof(int)));
- free_bootmem_late(io_tlb_start,
- PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
+ memblock_free_late(io_tlb_overflow_buffer,
+ PAGE_ALIGN(io_tlb_overflow));
+ memblock_free_late(__pa(io_tlb_orig_addr),
+ PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)));
+ memblock_free_late(__pa(io_tlb_list),
+ PAGE_ALIGN(io_tlb_nslabs * sizeof(int)));
+ memblock_free_late(io_tlb_start,
+ PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
}
io_tlb_nslabs = 0;
}
diff --git a/lib/test_module.c b/lib/test_module.c
new file mode 100644
index 000000000000..319b66f1ff61
--- /dev/null
+++ b/lib/test_module.c
@@ -0,0 +1,33 @@
+/*
+ * This module emits "Hello, world" on printk when loaded.
+ *
+ * It is designed to be used for basic evaluation of the module loading
+ * subsystem (for example when validating module signing/verification). It
+ * lacks any extra dependencies, and will not normally be loaded by the
+ * system unless explicitly requested by name.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+
+static int __init test_module_init(void)
+{
+ pr_warn("Hello, world\n");
+
+ return 0;
+}
+
+module_init(test_module_init);
+
+static void __exit test_module_exit(void)
+{
+ pr_warn("Goodbye\n");
+}
+
+module_exit(test_module_exit);
+
+MODULE_AUTHOR("Kees Cook <keescook@chromium.org>");
+MODULE_LICENSE("GPL");
diff --git a/lib/test_user_copy.c b/lib/test_user_copy.c
new file mode 100644
index 000000000000..0ecef3e4690e
--- /dev/null
+++ b/lib/test_user_copy.c
@@ -0,0 +1,110 @@
+/*
+ * Kernel module for testing copy_to/from_user infrastructure.
+ *
+ * Copyright 2013 Google Inc. All Rights Reserved
+ *
+ * Authors:
+ * Kees Cook <keescook@chromium.org>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/mman.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+
+#define test(condition, msg) \
+({ \
+ int cond = (condition); \
+ if (cond) \
+ pr_warn("%s\n", msg); \
+ cond; \
+})
+
+static int __init test_user_copy_init(void)
+{
+ int ret = 0;
+ char *kmem;
+ char __user *usermem;
+ char *bad_usermem;
+ unsigned long user_addr;
+ unsigned long value = 0x5A;
+
+ kmem = kmalloc(PAGE_SIZE * 2, GFP_KERNEL);
+ if (!kmem)
+ return -ENOMEM;
+
+ user_addr = vm_mmap(NULL, 0, PAGE_SIZE * 2,
+ PROT_READ | PROT_WRITE | PROT_EXEC,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0);
+ if (user_addr >= (unsigned long)(TASK_SIZE)) {
+ pr_warn("Failed to allocate user memory\n");
+ kfree(kmem);
+ return -ENOMEM;
+ }
+
+ usermem = (char __user *)user_addr;
+ bad_usermem = (char *)user_addr;
+
+ /* Legitimate usage: none of these should fail. */
+ ret |= test(copy_from_user(kmem, usermem, PAGE_SIZE),
+ "legitimate copy_from_user failed");
+ ret |= test(copy_to_user(usermem, kmem, PAGE_SIZE),
+ "legitimate copy_to_user failed");
+ ret |= test(get_user(value, (unsigned long __user *)usermem),
+ "legitimate get_user failed");
+ ret |= test(put_user(value, (unsigned long __user *)usermem),
+ "legitimate put_user failed");
+
+ /* Invalid usage: none of these should succeed. */
+ ret |= test(!copy_from_user(kmem, (char __user *)(kmem + PAGE_SIZE),
+ PAGE_SIZE),
+ "illegal all-kernel copy_from_user passed");
+ ret |= test(!copy_from_user(bad_usermem, (char __user *)kmem,
+ PAGE_SIZE),
+ "illegal reversed copy_from_user passed");
+ ret |= test(!copy_to_user((char __user *)kmem, kmem + PAGE_SIZE,
+ PAGE_SIZE),
+ "illegal all-kernel copy_to_user passed");
+ ret |= test(!copy_to_user((char __user *)kmem, bad_usermem,
+ PAGE_SIZE),
+ "illegal reversed copy_to_user passed");
+ ret |= test(!get_user(value, (unsigned long __user *)kmem),
+ "illegal get_user passed");
+ ret |= test(!put_user(value, (unsigned long __user *)kmem),
+ "illegal put_user passed");
+
+ vm_munmap(user_addr, PAGE_SIZE * 2);
+ kfree(kmem);
+
+ if (ret == 0) {
+ pr_info("tests passed.\n");
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+module_init(test_user_copy_init);
+
+static void __exit test_user_copy_exit(void)
+{
+ pr_info("unloaded.\n");
+}
+
+module_exit(test_user_copy_exit);
+
+MODULE_AUTHOR("Kees Cook <keescook@chromium.org>");
+MODULE_LICENSE("GPL");
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 10909c571494..a97f18be9070 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -1155,6 +1155,45 @@ char *netdev_feature_string(char *buf, char *end, const u8 *addr,
return number(buf, end, *(const netdev_features_t *)addr, spec);
}
+static noinline_for_stack
+char *address_val(char *buf, char *end, const void *addr,
+ struct printf_spec spec, const char *fmt)
+{
+ unsigned long long num;
+
+ spec.flags |= SPECIAL | SMALL | ZEROPAD;
+ spec.base = 16;
+
+ switch (fmt[1]) {
+ case 'd':
+ num = *(const dma_addr_t *)addr;
+ spec.field_width = sizeof(dma_addr_t) * 2 + 2;
+ break;
+ case 'p':
+ default:
+ num = *(const phys_addr_t *)addr;
+ spec.field_width = sizeof(phys_addr_t) * 2 + 2;
+ break;
+ }
+
+ return number(buf, end, num, spec);
+}
+
+static noinline_for_stack
+char *comm_name(char *buf, char *end, struct task_struct *tsk,
+ struct printf_spec spec, const char *fmt)
+{
+ char name[TASK_COMM_LEN];
+
+ /* Caller can pass NULL instead of current. */
+ if (!tsk)
+ tsk = current;
+ /* Not using get_task_comm() in case I'm in IRQ context. */
+ memcpy(name, tsk->comm, TASK_COMM_LEN);
+ name[sizeof(name) - 1] = '\0';
+ return string(buf, end, name, spec);
+}
+
int kptr_restrict __read_mostly;
/*
@@ -1218,9 +1257,11 @@ int kptr_restrict __read_mostly;
* N no separator
* The maximum supported length is 64 bytes of the input. Consider
* to use print_hex_dump() for the larger input.
- * - 'a' For a phys_addr_t type and its derivative types (passed by reference)
+ * - 'a[pd]' For address types [p] phys_addr_t, [d] dma_addr_t and derivatives
+ * (default assumed to be phys_addr_t, passed by reference)
* - 'd[234]' For a dentry name (optionally 2-4 last components)
* - 'D[234]' Same as 'd' but for a struct file
+ * - 'T' task_struct->comm
*
* Note: The difference between 'S' and 'F' is that on ia64 and ppc64
* function pointers are really function descriptors, which contain a
@@ -1232,7 +1273,7 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
{
int default_width = 2 * sizeof(void *) + (spec.flags & SPECIAL ? 2 : 0);
- if (!ptr && *fmt != 'K') {
+ if (!ptr && *fmt != 'K' && *fmt != 'T') {
/*
* Print (null) with the same width as a pointer so it makes
* tabular output look nice.
@@ -1353,17 +1394,15 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
}
break;
case 'a':
- spec.flags |= SPECIAL | SMALL | ZEROPAD;
- spec.field_width = sizeof(phys_addr_t) * 2 + 2;
- spec.base = 16;
- return number(buf, end,
- (unsigned long long) *((phys_addr_t *)ptr), spec);
+ return address_val(buf, end, ptr, spec, fmt);
case 'd':
return dentry_name(buf, end, ptr, spec, fmt);
case 'D':
return dentry_name(buf, end,
((const struct file *)ptr)->f_path.dentry,
spec, fmt);
+ case 'T':
+ return comm_name(buf, end, ptr, spec, fmt);
}
spec.flags |= SMALL;
if (spec.field_width == -1) {
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 07dbc8ec46cf..6e45a5074bf0 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -267,7 +267,7 @@ void balloon_page_putback(struct page *page)
put_page(page);
} else {
WARN_ON(1);
- dump_page(page);
+ dump_page(page, "not movable balloon page");
}
unlock_page(page);
}
@@ -287,7 +287,7 @@ int balloon_page_migrate(struct page *newpage,
BUG_ON(!trylock_page(newpage));
if (WARN_ON(!__is_movable_balloon_page(page))) {
- dump_page(page);
+ dump_page(page, "not movable balloon page");
unlock_page(newpage);
return rc;
}
diff --git a/mm/cleancache.c b/mm/cleancache.c
index 5875f48ce279..d0eac4350403 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -237,7 +237,7 @@ int __cleancache_get_page(struct page *page)
goto out;
}
- VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
fake_pool_id = page->mapping->host->i_sb->cleancache_poolid;
if (fake_pool_id < 0)
goto out;
@@ -279,7 +279,7 @@ void __cleancache_put_page(struct page *page)
return;
}
- VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
fake_pool_id = page->mapping->host->i_sb->cleancache_poolid;
if (fake_pool_id < 0)
return;
@@ -318,7 +318,7 @@ void __cleancache_invalidate_page(struct address_space *mapping,
if (pool_id < 0)
return;
- VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
if (cleancache_get_key(mapping->host, &key) >= 0) {
cleancache_ops->invalidate_page(pool_id,
key, page->index);
diff --git a/mm/compaction.c b/mm/compaction.c
index f58bcd016f43..e0ab02d70f13 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -459,6 +459,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
unsigned long flags;
bool locked = false;
struct page *page = NULL, *valid_page = NULL;
+ bool skipped_async_unsuitable = false;
/*
* Ensure that there are not too many pages isolated from the LRU
@@ -534,6 +535,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
if (!cc->sync && last_pageblock_nr != pageblock_nr &&
!migrate_async_suitable(get_pageblock_migratetype(page))) {
cc->finished_update_migrate = true;
+ skipped_async_unsuitable = true;
goto next_pageblock;
}
@@ -599,7 +601,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
if (__isolate_lru_page(page, mode) != 0)
continue;
- VM_BUG_ON(PageTransCompound(page));
+ VM_BUG_ON_PAGE(PageTransCompound(page), page);
/* Successfully isolated */
cc->finished_update_migrate = true;
@@ -627,8 +629,13 @@ next_pageblock:
if (locked)
spin_unlock_irqrestore(&zone->lru_lock, flags);
- /* Update the pageblock-skip if the whole pageblock was scanned */
- if (low_pfn == end_pfn)
+ /*
+ * Update the pageblock-skip information and cached scanner pfn,
+ * if the whole pageblock was scanned without isolating any page.
+ * This is not done when pageblock was skipped due to being unsuitable
+ * for async compaction, so that eventual sync compaction can try.
+ */
+ if (low_pfn == end_pfn && !skipped_async_unsuitable)
update_pageblock_skip(cc, valid_page, nr_isolated, true);
trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
@@ -660,7 +667,7 @@ static void isolate_freepages(struct zone *zone,
* is the end of the pageblock the migration scanner is using.
*/
pfn = cc->free_pfn;
- low_pfn = cc->migrate_pfn + pageblock_nr_pages;
+ low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
/*
* Take care that if the migration scanner is at the end of the zone
@@ -676,7 +683,7 @@ static void isolate_freepages(struct zone *zone,
* pages on cc->migratepages. We stop searching if the migrate
* and free page scanners meet or enough free pages are isolated.
*/
- for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
+ for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages;
pfn -= pageblock_nr_pages) {
unsigned long isolated;
@@ -738,7 +745,14 @@ static void isolate_freepages(struct zone *zone,
/* split_free_page does not map the pages */
map_pages(freelist);
- cc->free_pfn = high_pfn;
+ /*
+ * If we crossed the migrate scanner, we want to keep it that way
+ * so that compact_finished() may detect this
+ */
+ if (pfn < low_pfn)
+ cc->free_pfn = max(pfn, zone->zone_start_pfn);
+ else
+ cc->free_pfn = high_pfn;
cc->nr_freepages = nr_freepages;
}
@@ -837,6 +851,10 @@ static int compact_finished(struct zone *zone,
/* Compaction run completes if the migrate and free scanner meet */
if (cc->free_pfn <= cc->migrate_pfn) {
+ /* Let the next compaction start anew. */
+ zone->compact_cached_migrate_pfn = zone->zone_start_pfn;
+ zone->compact_cached_free_pfn = zone_end_pfn(zone);
+
/*
* Mark that the PG_migrate_skip information should be cleared
* by kswapd when it goes to sleep. kswapd does not set the
@@ -947,6 +965,14 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
}
/*
+ * Clear pageblock skip if there were failures recently and compaction
+ * is about to be retried after being deferred. kswapd does not do
+ * this reset as it'll reset the cached information when going to sleep.
+ */
+ if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
+ __reset_isolation_suitable(zone);
+
+ /*
* Setup to move all movable pages to the end of the zone. Used cached
* information on where the scanners should start but check that it
* is initialised by ensuring the values are within zone boundaries.
@@ -962,13 +988,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
zone->compact_cached_migrate_pfn = cc->migrate_pfn;
}
- /*
- * Clear pageblock skip if there were failures recently and compaction
- * is about to be retried after being deferred. kswapd does not do
- * this reset as it'll reset the cached information when going to sleep.
- */
- if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
- __reset_isolation_suitable(zone);
+ trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn);
migrate_prep_local();
@@ -1003,7 +1023,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
if (err) {
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
- if (err == -ENOMEM) {
+ /*
+ * migrate_pages() may return -ENOMEM when scanners meet
+ * and we want compact_finished() to detect it
+ */
+ if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) {
ret = COMPACT_PARTIAL;
goto out;
}
@@ -1015,6 +1039,8 @@ out:
cc->nr_freepages -= release_freepages(&cc->freepages);
VM_BUG_ON(cc->nr_freepages != 0);
+ trace_mm_compaction_end(ret);
+
return ret;
}
@@ -1120,12 +1146,11 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
compact_zone(zone, cc);
if (cc->order > 0) {
- int ok = zone_watermark_ok(zone, cc->order,
- low_wmark_pages(zone), 0, 0);
- if (ok && cc->order >= zone->compact_order_failed)
- zone->compact_order_failed = cc->order + 1;
+ if (zone_watermark_ok(zone, cc->order,
+ low_wmark_pages(zone), 0, 0))
+ compaction_defer_reset(zone, cc->order, false);
/* Currently async compaction is never deferred. */
- else if (!ok && cc->sync)
+ else if (cc->sync)
defer_compaction(zone, cc->order);
}
diff --git a/mm/filemap.c b/mm/filemap.c
index b7749a92021c..7a7f3e0db738 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -409,9 +409,9 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
{
int error;
- VM_BUG_ON(!PageLocked(old));
- VM_BUG_ON(!PageLocked(new));
- VM_BUG_ON(new->mapping);
+ VM_BUG_ON_PAGE(!PageLocked(old), old);
+ VM_BUG_ON_PAGE(!PageLocked(new), new);
+ VM_BUG_ON_PAGE(new->mapping, new);
error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
if (!error) {
@@ -461,8 +461,8 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
{
int error;
- VM_BUG_ON(!PageLocked(page));
- VM_BUG_ON(PageSwapBacked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(PageSwapBacked(page), page);
error = mem_cgroup_cache_charge(page, current->mm,
gfp_mask & GFP_RECLAIM_MASK);
@@ -607,7 +607,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue);
*/
void unlock_page(struct page *page)
{
- VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
clear_bit_unlock(PG_locked, &page->flags);
smp_mb__after_clear_bit();
wake_up_page(page, PG_locked);
@@ -760,7 +760,7 @@ repeat:
page_cache_release(page);
goto repeat;
}
- VM_BUG_ON(page->index != offset);
+ VM_BUG_ON_PAGE(page->index != offset, page);
}
return page;
}
@@ -1656,7 +1656,7 @@ retry_find:
put_page(page);
goto retry_find;
}
- VM_BUG_ON(page->index != offset);
+ VM_BUG_ON_PAGE(page->index != offset, page);
/*
* We have a locked page in the page cache, now we need to check
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 5d80c53b87cb..82166bf974e1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -130,8 +130,14 @@ static int set_recommended_min_free_kbytes(void)
(unsigned long) nr_free_buffer_pages() / 20);
recommended_min <<= (PAGE_SHIFT-10);
- if (recommended_min > min_free_kbytes)
+ if (recommended_min > min_free_kbytes) {
+ if (user_min_free_kbytes >= 0)
+ pr_info("raising min_free_kbytes from %d to %lu "
+ "to help transparent hugepage allocations\n",
+ min_free_kbytes, recommended_min);
+
min_free_kbytes = recommended_min;
+ }
setup_per_zone_wmarks();
return 0;
}
@@ -655,7 +661,7 @@ out:
hugepage_exit_sysfs(hugepage_kobj);
return err;
}
-module_init(hugepage_init)
+subsys_initcall(hugepage_init);
static int __init setup_transparent_hugepage(char *str)
{
@@ -712,7 +718,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
pgtable_t pgtable;
spinlock_t *ptl;
- VM_BUG_ON(!PageCompound(page));
+ VM_BUG_ON_PAGE(!PageCompound(page), page);
pgtable = pte_alloc_one(mm, haddr);
if (unlikely(!pgtable))
return VM_FAULT_OOM;
@@ -893,7 +899,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
goto out;
}
src_page = pmd_page(pmd);
- VM_BUG_ON(!PageHead(src_page));
+ VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
get_page(src_page);
page_dup_rmap(src_page);
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
@@ -1067,7 +1073,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
ptl = pmd_lock(mm, pmd);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_free_pages;
- VM_BUG_ON(!PageHead(page));
+ VM_BUG_ON_PAGE(!PageHead(page), page);
pmdp_clear_flush(vma, haddr, pmd);
/* leave pmd empty until pte is filled */
@@ -1133,7 +1139,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto out_unlock;
page = pmd_page(orig_pmd);
- VM_BUG_ON(!PageCompound(page) || !PageHead(page));
+ VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
if (page_mapcount(page) == 1) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
@@ -1211,7 +1217,7 @@ alloc:
add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
put_huge_zero_page();
} else {
- VM_BUG_ON(!PageHead(page));
+ VM_BUG_ON_PAGE(!PageHead(page), page);
page_remove_rmap(page);
put_page(page);
}
@@ -1249,7 +1255,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
goto out;
page = pmd_page(*pmd);
- VM_BUG_ON(!PageHead(page));
+ VM_BUG_ON_PAGE(!PageHead(page), page);
if (flags & FOLL_TOUCH) {
pmd_t _pmd;
/*
@@ -1274,7 +1280,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
}
}
page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
- VM_BUG_ON(!PageCompound(page));
+ VM_BUG_ON_PAGE(!PageCompound(page), page);
if (flags & FOLL_GET)
get_page_foll(page);
@@ -1432,9 +1438,9 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
} else {
page = pmd_page(orig_pmd);
page_remove_rmap(page);
- VM_BUG_ON(page_mapcount(page) < 0);
+ VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
- VM_BUG_ON(!PageHead(page));
+ VM_BUG_ON_PAGE(!PageHead(page), page);
atomic_long_dec(&tlb->mm->nr_ptes);
spin_unlock(ptl);
tlb_remove_page(tlb, page);
@@ -2172,9 +2178,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
if (unlikely(!page))
goto out;
- VM_BUG_ON(PageCompound(page));
- BUG_ON(!PageAnon(page));
- VM_BUG_ON(!PageSwapBacked(page));
+ VM_BUG_ON_PAGE(PageCompound(page), page);
+ VM_BUG_ON_PAGE(!PageAnon(page), page);
+ VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
/* cannot use mapcount: can't collapse if there's a gup pin */
if (page_count(page) != 1)
@@ -2197,8 +2203,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
}
/* 0 stands for page_is_file_cache(page) == false */
inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
- VM_BUG_ON(!PageLocked(page));
- VM_BUG_ON(PageLRU(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(PageLRU(page), page);
/* If there is no mapped pte young don't collapse the page */
if (pte_young(pteval) || PageReferenced(page) ||
@@ -2228,7 +2234,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
} else {
src_page = pte_page(pteval);
copy_user_highpage(page, src_page, address, vma);
- VM_BUG_ON(page_mapcount(src_page) != 1);
+ VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
release_pte_page(src_page);
/*
* ptl mostly unnecessary, but preempt has to
@@ -2307,7 +2313,7 @@ static struct page
struct vm_area_struct *vma, unsigned long address,
int node)
{
- VM_BUG_ON(*hpage);
+ VM_BUG_ON_PAGE(*hpage, *hpage);
/*
* Allocate the page while the vma is still valid and under
* the mmap_sem read mode so there is no memory allocation
@@ -2576,7 +2582,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
*/
node = page_to_nid(page);
khugepaged_node_load[node]++;
- VM_BUG_ON(PageCompound(page));
+ VM_BUG_ON_PAGE(PageCompound(page), page);
if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
goto out_unmap;
/* cannot use mapcount: can't collapse if there's a gup pin */
@@ -2872,7 +2878,7 @@ again:
return;
}
page = pmd_page(*pmd);
- VM_BUG_ON(!page_count(page));
+ VM_BUG_ON_PAGE(!page_count(page), page);
get_page(page);
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index dee6cf4e6d34..c01cb9fedb18 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -584,7 +584,7 @@ static void update_and_free_page(struct hstate *h, struct page *page)
1 << PG_active | 1 << PG_reserved |
1 << PG_private | 1 << PG_writeback);
}
- VM_BUG_ON(hugetlb_cgroup_from_page(page));
+ VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
set_compound_page_dtor(page, NULL);
set_page_refcounted(page);
arch_release_hugepage(page);
@@ -690,15 +690,11 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
*/
int PageHuge(struct page *page)
{
- compound_page_dtor *dtor;
-
if (!PageCompound(page))
return 0;
page = compound_head(page);
- dtor = get_compound_page_dtor(page);
-
- return dtor == free_huge_page;
+ return get_compound_page_dtor(page) == free_huge_page;
}
EXPORT_SYMBOL_GPL(PageHuge);
@@ -708,16 +704,11 @@ EXPORT_SYMBOL_GPL(PageHuge);
*/
int PageHeadHuge(struct page *page_head)
{
- compound_page_dtor *dtor;
-
if (!PageHead(page_head))
return 0;
- dtor = get_compound_page_dtor(page_head);
-
- return dtor == free_huge_page;
+ return get_compound_page_dtor(page_head) == free_huge_page;
}
-EXPORT_SYMBOL_GPL(PageHeadHuge);
pgoff_t __basepage_index(struct page *page)
{
@@ -1098,7 +1089,7 @@ retry:
* no users -- drop the buddy allocator's reference.
*/
put_page_testzero(page);
- VM_BUG_ON(page_count(page));
+ VM_BUG_ON_PAGE(page_count(page), page);
enqueue_huge_page(h, page);
}
free:
@@ -1280,9 +1271,9 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
void *addr;
- addr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
- huge_page_size(h), huge_page_size(h), 0);
-
+ addr = memblock_virt_alloc_try_nid_nopanic(
+ huge_page_size(h), huge_page_size(h),
+ 0, BOOTMEM_ALLOC_ACCESSIBLE, node);
if (addr) {
/*
* Use the beginning of the huge page to store the
@@ -1322,8 +1313,8 @@ static void __init gather_bootmem_prealloc(void)
#ifdef CONFIG_HIGHMEM
page = pfn_to_page(m->phys >> PAGE_SHIFT);
- free_bootmem_late((unsigned long)m,
- sizeof(struct huge_bootmem_page));
+ memblock_free_late(__pa(m),
+ sizeof(struct huge_bootmem_page));
#else
page = virt_to_page(m);
#endif
@@ -2355,17 +2346,27 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
int cow;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
+ int ret = 0;
cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
+ mmun_start = vma->vm_start;
+ mmun_end = vma->vm_end;
+ if (cow)
+ mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
+
for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
spinlock_t *src_ptl, *dst_ptl;
src_pte = huge_pte_offset(src, addr);
if (!src_pte)
continue;
dst_pte = huge_pte_alloc(dst, addr, sz);
- if (!dst_pte)
- goto nomem;
+ if (!dst_pte) {
+ ret = -ENOMEM;
+ break;
+ }
/* If the pagetables are shared don't copy or take references */
if (dst_pte == src_pte)
@@ -2386,10 +2387,11 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
}
- return 0;
-nomem:
- return -ENOMEM;
+ if (cow)
+ mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
+
+ return ret;
}
static int is_hugetlb_entry_migration(pte_t pte)
@@ -3079,7 +3081,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
same_page:
if (pages) {
pages[i] = mem_map_offset(page, pfn_offset);
- get_page(pages[i]);
+ get_page_foll(pages[i]);
}
if (vmas)
@@ -3501,7 +3503,7 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
bool isolate_huge_page(struct page *page, struct list_head *list)
{
- VM_BUG_ON(!PageHead(page));
+ VM_BUG_ON_PAGE(!PageHead(page), page);
if (!get_page_unless_zero(page))
return false;
spin_lock(&hugetlb_lock);
@@ -3512,7 +3514,7 @@ bool isolate_huge_page(struct page *page, struct list_head *list)
void putback_active_hugepage(struct page *page)
{
- VM_BUG_ON(!PageHead(page));
+ VM_BUG_ON_PAGE(!PageHead(page), page);
spin_lock(&hugetlb_lock);
list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
spin_unlock(&hugetlb_lock);
@@ -3521,7 +3523,7 @@ void putback_active_hugepage(struct page *page)
bool is_hugepage_active(struct page *page)
{
- VM_BUG_ON(!PageHuge(page));
+ VM_BUG_ON_PAGE(!PageHuge(page), page);
/*
* This function can be called for a tail page because the caller,
* scan_movable_pages, scans through a given pfn-range which typically
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index d747a84e09b0..cb00829bb466 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -390,7 +390,7 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
if (hugetlb_cgroup_disabled())
return;
- VM_BUG_ON(!PageHuge(oldhpage));
+ VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage);
spin_lock(&hugetlb_lock);
h_cg = hugetlb_cgroup_from_page(oldhpage);
set_hugetlb_cgroup(oldhpage, NULL);
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 4c84678371eb..95487c71cad5 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -55,7 +55,7 @@ static int hwpoison_inject(void *data, u64 val)
return 0;
inject:
- printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn);
+ pr_info("Injecting memory failure at pfn %#lx\n", pfn);
return memory_failure(pfn, 18, MF_COUNT_INCREASED);
}
diff --git a/mm/internal.h b/mm/internal.h
index 684f7aa9692a..7e145e8cd1e6 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -27,8 +27,8 @@ static inline void set_page_count(struct page *page, int v)
*/
static inline void set_page_refcounted(struct page *page)
{
- VM_BUG_ON(PageTail(page));
- VM_BUG_ON(atomic_read(&page->_count));
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ VM_BUG_ON_PAGE(atomic_read(&page->_count), page);
set_page_count(page, 1);
}
@@ -46,12 +46,10 @@ static inline void __get_page_tail_foll(struct page *page,
* speculative page access (like in
* page_cache_get_speculative()) on tail pages.
*/
- VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
- VM_BUG_ON(atomic_read(&page->_count) != 0);
- VM_BUG_ON(page_mapcount(page) < 0);
+ VM_BUG_ON_PAGE(atomic_read(&page->first_page->_count) <= 0, page);
if (get_page_head)
atomic_inc(&page->first_page->_count);
- atomic_inc(&page->_mapcount);
+ get_huge_page_tail(page);
}
/*
@@ -73,7 +71,7 @@ static inline void get_page_foll(struct page *page)
* Getting a normal page or the head of a compound page
* requires to already have an elevated page->_count.
*/
- VM_BUG_ON(atomic_read(&page->_count) <= 0);
+ VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
atomic_inc(&page->_count);
}
}
@@ -101,6 +99,7 @@ extern void prep_compound_page(struct page *page, unsigned long order);
#ifdef CONFIG_MEMORY_FAILURE
extern bool is_free_buddy_page(struct page *page);
#endif
+extern int user_min_free_kbytes;
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
@@ -175,7 +174,7 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
struct page *page)
{
- VM_BUG_ON(PageLRU(page));
+ VM_BUG_ON_PAGE(PageLRU(page), page);
if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
return 0;
diff --git a/mm/ksm.c b/mm/ksm.c
index 175fff79dc95..aa4c7c7250c1 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1891,21 +1891,24 @@ struct page *ksm_might_need_to_copy(struct page *page,
return new_page;
}
-int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
- unsigned long *vm_flags)
+int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
{
struct stable_node *stable_node;
struct rmap_item *rmap_item;
- unsigned int mapcount = page_mapcount(page);
- int referenced = 0;
+ int ret = SWAP_AGAIN;
int search_new_forks = 0;
- VM_BUG_ON(!PageKsm(page));
- VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageKsm(page), page);
+
+ /*
+ * Rely on the page lock to protect against concurrent modifications
+ * to that page's node of the stable tree.
+ */
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
stable_node = page_stable_node(page);
if (!stable_node)
- return 0;
+ return ret;
again:
hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
struct anon_vma *anon_vma = rmap_item->anon_vma;
@@ -1928,113 +1931,16 @@ again:
if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
continue;
- if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
- continue;
-
- referenced += page_referenced_one(page, vma,
- rmap_item->address, &mapcount, vm_flags);
- if (!search_new_forks || !mapcount)
- break;
- }
- anon_vma_unlock_read(anon_vma);
- if (!mapcount)
- goto out;
- }
- if (!search_new_forks++)
- goto again;
-out:
- return referenced;
-}
-
-int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
-{
- struct stable_node *stable_node;
- struct rmap_item *rmap_item;
- int ret = SWAP_AGAIN;
- int search_new_forks = 0;
-
- VM_BUG_ON(!PageKsm(page));
- VM_BUG_ON(!PageLocked(page));
-
- stable_node = page_stable_node(page);
- if (!stable_node)
- return SWAP_FAIL;
-again:
- hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
- struct anon_vma *anon_vma = rmap_item->anon_vma;
- struct anon_vma_chain *vmac;
- struct vm_area_struct *vma;
-
- anon_vma_lock_read(anon_vma);
- anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
- 0, ULONG_MAX) {
- vma = vmac->vma;
- if (rmap_item->address < vma->vm_start ||
- rmap_item->address >= vma->vm_end)
- continue;
- /*
- * Initially we examine only the vma which covers this
- * rmap_item; but later, if there is still work to do,
- * we examine covering vmas in other mms: in case they
- * were forked from the original since ksmd passed.
- */
- if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
+ if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
continue;
- ret = try_to_unmap_one(page, vma,
- rmap_item->address, flags);
- if (ret != SWAP_AGAIN || !page_mapped(page)) {
+ ret = rwc->rmap_one(page, vma,
+ rmap_item->address, rwc->arg);
+ if (ret != SWAP_AGAIN) {
anon_vma_unlock_read(anon_vma);
goto out;
}
- }
- anon_vma_unlock_read(anon_vma);
- }
- if (!search_new_forks++)
- goto again;
-out:
- return ret;
-}
-
-#ifdef CONFIG_MIGRATION
-int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
- struct vm_area_struct *, unsigned long, void *), void *arg)
-{
- struct stable_node *stable_node;
- struct rmap_item *rmap_item;
- int ret = SWAP_AGAIN;
- int search_new_forks = 0;
-
- VM_BUG_ON(!PageKsm(page));
- VM_BUG_ON(!PageLocked(page));
-
- stable_node = page_stable_node(page);
- if (!stable_node)
- return ret;
-again:
- hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
- struct anon_vma *anon_vma = rmap_item->anon_vma;
- struct anon_vma_chain *vmac;
- struct vm_area_struct *vma;
-
- anon_vma_lock_read(anon_vma);
- anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
- 0, ULONG_MAX) {
- vma = vmac->vma;
- if (rmap_item->address < vma->vm_start ||
- rmap_item->address >= vma->vm_end)
- continue;
- /*
- * Initially we examine only the vma which covers this
- * rmap_item; but later, if there is still work to do,
- * we examine covering vmas in other mms: in case they
- * were forked from the original since ksmd passed.
- */
- if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
- continue;
-
- ret = rmap_one(page, vma, rmap_item->address, arg);
- if (ret != SWAP_AGAIN) {
+ if (rwc->done && rwc->done(page)) {
anon_vma_unlock_read(anon_vma);
goto out;
}
@@ -2047,17 +1953,18 @@ out:
return ret;
}
+#ifdef CONFIG_MIGRATION
void ksm_migrate_page(struct page *newpage, struct page *oldpage)
{
struct stable_node *stable_node;
- VM_BUG_ON(!PageLocked(oldpage));
- VM_BUG_ON(!PageLocked(newpage));
- VM_BUG_ON(newpage->mapping != oldpage->mapping);
+ VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
+ VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
+ VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage);
stable_node = page_stable_node(newpage);
if (stable_node) {
- VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
+ VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage);
stable_node->kpfn = page_to_pfn(newpage);
/*
* newpage->mapping was set in advance; now we need smp_wmb()
@@ -2438,4 +2345,4 @@ out_free:
out:
return err;
}
-module_init(ksm_init)
+subsys_initcall(ksm_init);
diff --git a/mm/memblock.c b/mm/memblock.c
index 53e477bb5558..9c0aeef19440 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -21,6 +21,9 @@
#include <linux/memblock.h>
#include <asm-generic/sections.h>
+#include <linux/io.h>
+
+#include "internal.h"
static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
@@ -39,6 +42,9 @@ struct memblock memblock __initdata_memblock = {
};
int memblock_debug __initdata_memblock;
+#ifdef CONFIG_MOVABLE_NODE
+bool movable_node_enabled __initdata_memblock = false;
+#endif
static int memblock_can_resize __initdata_memblock;
static int memblock_memory_in_slab __initdata_memblock = 0;
static int memblock_reserved_in_slab __initdata_memblock = 0;
@@ -91,7 +97,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
* @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
* @size: size of free area to find
* @align: alignment of free area to find
- * @nid: nid of the free area to find, %MAX_NUMNODES for any node
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
*
* Utility called from memblock_find_in_range_node(), find free area bottom-up.
*
@@ -123,7 +129,7 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
* @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
* @size: size of free area to find
* @align: alignment of free area to find
- * @nid: nid of the free area to find, %MAX_NUMNODES for any node
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
*
* Utility called from memblock_find_in_range_node(), find free area top-down.
*
@@ -154,11 +160,11 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
/**
* memblock_find_in_range_node - find free area in given range and node
- * @start: start of candidate range
- * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
* @size: size of free area to find
* @align: alignment of free area to find
- * @nid: nid of the free area to find, %MAX_NUMNODES for any node
+ * @start: start of candidate range
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
*
* Find @size free area aligned to @align in the specified range and node.
*
@@ -173,9 +179,9 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
* RETURNS:
* Found address on success, 0 on failure.
*/
-phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
- phys_addr_t end, phys_addr_t size,
- phys_addr_t align, int nid)
+phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
+ phys_addr_t align, phys_addr_t start,
+ phys_addr_t end, int nid)
{
int ret;
phys_addr_t kernel_end;
@@ -238,8 +244,8 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
phys_addr_t end, phys_addr_t size,
phys_addr_t align)
{
- return memblock_find_in_range_node(start, end, size, align,
- MAX_NUMNODES);
+ return memblock_find_in_range_node(size, align, start, end,
+ NUMA_NO_NODE);
}
static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
@@ -255,10 +261,13 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
type->cnt = 1;
type->regions[0].base = 0;
type->regions[0].size = 0;
+ type->regions[0].flags = 0;
memblock_set_region_node(&type->regions[0], MAX_NUMNODES);
}
}
+#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+
phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
phys_addr_t *addr)
{
@@ -271,6 +280,20 @@ phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
memblock.reserved.max);
}
+phys_addr_t __init_memblock get_allocated_memblock_memory_regions_info(
+ phys_addr_t *addr)
+{
+ if (memblock.memory.regions == memblock_memory_init_regions)
+ return 0;
+
+ *addr = __pa(memblock.memory.regions);
+
+ return PAGE_ALIGN(sizeof(struct memblock_region) *
+ memblock.memory.max);
+}
+
+#endif
+
/**
* memblock_double_array - double the size of the memblock regions array
* @type: memblock type of the regions array being doubled
@@ -405,7 +428,8 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
if (this->base + this->size != next->base ||
memblock_get_region_node(this) !=
- memblock_get_region_node(next)) {
+ memblock_get_region_node(next) ||
+ this->flags != next->flags) {
BUG_ON(this->base + this->size > next->base);
i++;
continue;
@@ -425,13 +449,15 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
* @base: base address of the new region
* @size: size of the new region
* @nid: node id of the new region
+ * @flags: flags of the new region
*
* Insert new memblock region [@base,@base+@size) into @type at @idx.
* @type must already have extra room to accomodate the new region.
*/
static void __init_memblock memblock_insert_region(struct memblock_type *type,
int idx, phys_addr_t base,
- phys_addr_t size, int nid)
+ phys_addr_t size,
+ int nid, unsigned long flags)
{
struct memblock_region *rgn = &type->regions[idx];
@@ -439,6 +465,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn));
rgn->base = base;
rgn->size = size;
+ rgn->flags = flags;
memblock_set_region_node(rgn, nid);
type->cnt++;
type->total_size += size;
@@ -450,6 +477,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
* @base: base address of the new region
* @size: size of the new region
* @nid: nid of the new region
+ * @flags: flags of the new region
*
* Add new memblock region [@base,@base+@size) into @type. The new region
* is allowed to overlap with existing ones - overlaps don't affect already
@@ -460,7 +488,8 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
* 0 on success, -errno on failure.
*/
static int __init_memblock memblock_add_region(struct memblock_type *type,
- phys_addr_t base, phys_addr_t size, int nid)
+ phys_addr_t base, phys_addr_t size,
+ int nid, unsigned long flags)
{
bool insert = false;
phys_addr_t obase = base;
@@ -475,6 +504,7 @@ static int __init_memblock memblock_add_region(struct memblock_type *type,
WARN_ON(type->cnt != 1 || type->total_size);
type->regions[0].base = base;
type->regions[0].size = size;
+ type->regions[0].flags = flags;
memblock_set_region_node(&type->regions[0], nid);
type->total_size = size;
return 0;
@@ -505,7 +535,8 @@ repeat:
nr_new++;
if (insert)
memblock_insert_region(type, i++, base,
- rbase - base, nid);
+ rbase - base, nid,
+ flags);
}
/* area below @rend is dealt with, forget about it */
base = min(rend, end);
@@ -515,7 +546,8 @@ repeat:
if (base < end) {
nr_new++;
if (insert)
- memblock_insert_region(type, i, base, end - base, nid);
+ memblock_insert_region(type, i, base, end - base,
+ nid, flags);
}
/*
@@ -537,12 +569,13 @@ repeat:
int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
int nid)
{
- return memblock_add_region(&memblock.memory, base, size, nid);
+ return memblock_add_region(&memblock.memory, base, size, nid, 0);
}
int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
{
- return memblock_add_region(&memblock.memory, base, size, MAX_NUMNODES);
+ return memblock_add_region(&memblock.memory, base, size,
+ MAX_NUMNODES, 0);
}
/**
@@ -597,7 +630,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
rgn->size -= base - rbase;
type->total_size -= base - rbase;
memblock_insert_region(type, i, rbase, base - rbase,
- memblock_get_region_node(rgn));
+ memblock_get_region_node(rgn),
+ rgn->flags);
} else if (rend > end) {
/*
* @rgn intersects from above. Split and redo the
@@ -607,7 +641,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
rgn->size -= end - rbase;
type->total_size -= end - rbase;
memblock_insert_region(type, i--, rbase, end - rbase,
- memblock_get_region_node(rgn));
+ memblock_get_region_node(rgn),
+ rgn->flags);
} else {
/* @rgn is fully contained, record it */
if (!*end_rgn)
@@ -643,28 +678,89 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
{
memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n",
(unsigned long long)base,
- (unsigned long long)base + size,
+ (unsigned long long)base + size - 1,
(void *)_RET_IP_);
return __memblock_remove(&memblock.reserved, base, size);
}
-int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
+static int __init_memblock memblock_reserve_region(phys_addr_t base,
+ phys_addr_t size,
+ int nid,
+ unsigned long flags)
{
struct memblock_type *_rgn = &memblock.reserved;
- memblock_dbg("memblock_reserve: [%#016llx-%#016llx] %pF\n",
+ memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n",
(unsigned long long)base,
- (unsigned long long)base + size,
- (void *)_RET_IP_);
+ (unsigned long long)base + size - 1,
+ flags, (void *)_RET_IP_);
+
+ return memblock_add_region(_rgn, base, size, nid, flags);
+}
+
+int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
+{
+ return memblock_reserve_region(base, size, MAX_NUMNODES, 0);
+}
+
+/**
+ * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * This function isolates region [@base, @base + @size), and mark it with flag
+ * MEMBLOCK_HOTPLUG.
+ *
+ * Return 0 on succees, -errno on failure.
+ */
+int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
+{
+ struct memblock_type *type = &memblock.memory;
+ int i, ret, start_rgn, end_rgn;
+
+ ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
+ if (ret)
+ return ret;
+
+ for (i = start_rgn; i < end_rgn; i++)
+ memblock_set_region_flags(&type->regions[i], MEMBLOCK_HOTPLUG);
+
+ memblock_merge_regions(type);
+ return 0;
+}
+
+/**
+ * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * This function isolates region [@base, @base + @size), and clear flag
+ * MEMBLOCK_HOTPLUG for the isolated regions.
+ *
+ * Return 0 on succees, -errno on failure.
+ */
+int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
+{
+ struct memblock_type *type = &memblock.memory;
+ int i, ret, start_rgn, end_rgn;
+
+ ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
+ if (ret)
+ return ret;
+
+ for (i = start_rgn; i < end_rgn; i++)
+ memblock_clear_region_flags(&type->regions[i],
+ MEMBLOCK_HOTPLUG);
- return memblock_add_region(_rgn, base, size, MAX_NUMNODES);
+ memblock_merge_regions(type);
+ return 0;
}
/**
* __next_free_mem_range - next function for for_each_free_mem_range()
* @idx: pointer to u64 loop variable
- * @nid: node selector, %MAX_NUMNODES for all nodes
+ * @nid: node selector, %NUMA_NO_NODE for all nodes
* @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @out_nid: ptr to int for nid of the range, can be %NULL
@@ -693,13 +789,16 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid,
int mi = *idx & 0xffffffff;
int ri = *idx >> 32;
+ if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
+ nid = NUMA_NO_NODE;
+
for ( ; mi < mem->cnt; mi++) {
struct memblock_region *m = &mem->regions[mi];
phys_addr_t m_start = m->base;
phys_addr_t m_end = m->base + m->size;
/* only memory regions are associated with nodes, check it */
- if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m))
+ if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m))
continue;
/* scan areas before each reservation for intersection */
@@ -740,12 +839,17 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid,
/**
* __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse()
* @idx: pointer to u64 loop variable
- * @nid: nid: node selector, %MAX_NUMNODES for all nodes
+ * @nid: nid: node selector, %NUMA_NO_NODE for all nodes
* @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @out_nid: ptr to int for nid of the range, can be %NULL
*
* Reverse of __next_free_mem_range().
+ *
+ * Linux kernel cannot migrate pages used by itself. Memory hotplug users won't
+ * be able to hot-remove hotpluggable memory used by the kernel. So this
+ * function skip hotpluggable regions if needed when allocating memory for the
+ * kernel.
*/
void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid,
phys_addr_t *out_start,
@@ -756,6 +860,9 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid,
int mi = *idx & 0xffffffff;
int ri = *idx >> 32;
+ if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
+ nid = NUMA_NO_NODE;
+
if (*idx == (u64)ULLONG_MAX) {
mi = mem->cnt - 1;
ri = rsv->cnt;
@@ -767,7 +874,11 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid,
phys_addr_t m_end = m->base + m->size;
/* only memory regions are associated with nodes, check it */
- if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m))
+ if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m))
+ continue;
+
+ /* skip hotpluggable memory regions if needed */
+ if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
continue;
/* scan areas before each reservation for intersection */
@@ -837,18 +948,18 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
* memblock_set_node - set node ID on memblock regions
* @base: base of area to set node ID for
* @size: size of area to set node ID for
+ * @type: memblock type to set node ID for
* @nid: node ID to set
*
- * Set the nid of memblock memory regions in [@base,@base+@size) to @nid.
+ * Set the nid of memblock @type regions in [@base,@base+@size) to @nid.
* Regions which cross the area boundaries are split as necessary.
*
* RETURNS:
* 0 on success, -errno on failure.
*/
int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
- int nid)
+ struct memblock_type *type, int nid)
{
- struct memblock_type *type = &memblock.memory;
int start_rgn, end_rgn;
int i, ret;
@@ -870,13 +981,13 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
{
phys_addr_t found;
- if (WARN_ON(!align))
- align = __alignof__(long long);
+ if (!align)
+ align = SMP_CACHE_BYTES;
/* align @size to avoid excessive fragmentation on reserved array */
size = round_up(size, align);
- found = memblock_find_in_range_node(0, max_addr, size, align, nid);
+ found = memblock_find_in_range_node(size, align, 0, max_addr, nid);
if (found && !memblock_reserve(found, size))
return found;
@@ -890,7 +1001,7 @@ phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int n
phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
{
- return memblock_alloc_base_nid(size, align, max_addr, MAX_NUMNODES);
+ return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE);
}
phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
@@ -920,6 +1031,207 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i
return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
}
+/**
+ * memblock_virt_alloc_internal - allocate boot memory block
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @min_addr: the lower bound of the memory region to allocate (phys address)
+ * @max_addr: the upper bound of the memory region to allocate (phys address)
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ *
+ * The @min_addr limit is dropped if it can not be satisfied and the allocation
+ * will fall back to memory below @min_addr. Also, allocation may fall back
+ * to any node in the system if the specified node can not
+ * hold the requested memory.
+ *
+ * The allocation is performed from memory region limited by
+ * memblock.current_limit if @max_addr == %BOOTMEM_ALLOC_ACCESSIBLE.
+ *
+ * The memory block is aligned on SMP_CACHE_BYTES if @align == 0.
+ *
+ * The phys address of allocated boot memory block is converted to virtual and
+ * allocated memory is reset to 0.
+ *
+ * In addition, function sets the min_count to 0 using kmemleak_alloc for
+ * allocated boot memory block, so that it is never reported as leaks.
+ *
+ * RETURNS:
+ * Virtual address of allocated memory block on success, NULL on failure.
+ */
+static void * __init memblock_virt_alloc_internal(
+ phys_addr_t size, phys_addr_t align,
+ phys_addr_t min_addr, phys_addr_t max_addr,
+ int nid)
+{
+ phys_addr_t alloc;
+ void *ptr;
+
+ if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
+ nid = NUMA_NO_NODE;
+
+ /*
+ * Detect any accidental use of these APIs after slab is ready, as at
+ * this moment memblock may be deinitialized already and its
+ * internal data may be destroyed (after execution of free_all_bootmem)
+ */
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, nid);
+
+ if (!align)
+ align = SMP_CACHE_BYTES;
+
+ /* align @size to avoid excessive fragmentation on reserved array */
+ size = round_up(size, align);
+
+again:
+ alloc = memblock_find_in_range_node(size, align, min_addr, max_addr,
+ nid);
+ if (alloc)
+ goto done;
+
+ if (nid != NUMA_NO_NODE) {
+ alloc = memblock_find_in_range_node(size, align, min_addr,
+ max_addr, NUMA_NO_NODE);
+ if (alloc)
+ goto done;
+ }
+
+ if (min_addr) {
+ min_addr = 0;
+ goto again;
+ } else {
+ goto error;
+ }
+
+done:
+ memblock_reserve(alloc, size);
+ ptr = phys_to_virt(alloc);
+ memset(ptr, 0, size);
+
+ /*
+ * The min_count is set to 0 so that bootmem allocated blocks
+ * are never reported as leaks. This is because many of these blocks
+ * are only referred via the physical address which is not
+ * looked up by kmemleak.
+ */
+ kmemleak_alloc(ptr, size, 0, 0);
+
+ return ptr;
+
+error:
+ return NULL;
+}
+
+/**
+ * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @min_addr: the lower bound of the memory region from where the allocation
+ * is preferred (phys address)
+ * @max_addr: the upper bound of the memory region from where the allocation
+ * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
+ * allocate only from memory limited by memblock.current_limit value
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ *
+ * Public version of _memblock_virt_alloc_try_nid_nopanic() which provides
+ * additional debug information (including caller info), if enabled.
+ *
+ * RETURNS:
+ * Virtual address of allocated memory block on success, NULL on failure.
+ */
+void * __init memblock_virt_alloc_try_nid_nopanic(
+ phys_addr_t size, phys_addr_t align,
+ phys_addr_t min_addr, phys_addr_t max_addr,
+ int nid)
+{
+ memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
+ __func__, (u64)size, (u64)align, nid, (u64)min_addr,
+ (u64)max_addr, (void *)_RET_IP_);
+ return memblock_virt_alloc_internal(size, align, min_addr,
+ max_addr, nid);
+}
+
+/**
+ * memblock_virt_alloc_try_nid - allocate boot memory block with panicking
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @min_addr: the lower bound of the memory region from where the allocation
+ * is preferred (phys address)
+ * @max_addr: the upper bound of the memory region from where the allocation
+ * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
+ * allocate only from memory limited by memblock.current_limit value
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ *
+ * Public panicking version of _memblock_virt_alloc_try_nid_nopanic()
+ * which provides debug information (including caller info), if enabled,
+ * and panics if the request can not be satisfied.
+ *
+ * RETURNS:
+ * Virtual address of allocated memory block on success, NULL on failure.
+ */
+void * __init memblock_virt_alloc_try_nid(
+ phys_addr_t size, phys_addr_t align,
+ phys_addr_t min_addr, phys_addr_t max_addr,
+ int nid)
+{
+ void *ptr;
+
+ memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
+ __func__, (u64)size, (u64)align, nid, (u64)min_addr,
+ (u64)max_addr, (void *)_RET_IP_);
+ ptr = memblock_virt_alloc_internal(size, align,
+ min_addr, max_addr, nid);
+ if (ptr)
+ return ptr;
+
+ panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n",
+ __func__, (u64)size, (u64)align, nid, (u64)min_addr,
+ (u64)max_addr);
+ return NULL;
+}
+
+/**
+ * __memblock_free_early - free boot memory block
+ * @base: phys starting address of the boot memory block
+ * @size: size of the boot memory block in bytes
+ *
+ * Free boot memory block previously allocated by memblock_virt_alloc_xx() API.
+ * The freeing memory will not be released to the buddy allocator.
+ */
+void __init __memblock_free_early(phys_addr_t base, phys_addr_t size)
+{
+ memblock_dbg("%s: [%#016llx-%#016llx] %pF\n",
+ __func__, (u64)base, (u64)base + size - 1,
+ (void *)_RET_IP_);
+ kmemleak_free_part(__va(base), size);
+ __memblock_remove(&memblock.reserved, base, size);
+}
+
+/*
+ * __memblock_free_late - free bootmem block pages directly to buddy allocator
+ * @addr: phys starting address of the boot memory block
+ * @size: size of the boot memory block in bytes
+ *
+ * This is only useful when the bootmem allocator has already been torn
+ * down, but we are still initializing the system. Pages are released directly
+ * to the buddy allocator, no bootmem metadata is updated because it is gone.
+ */
+void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
+{
+ u64 cursor, end;
+
+ memblock_dbg("%s: [%#016llx-%#016llx] %pF\n",
+ __func__, (u64)base, (u64)base + size - 1,
+ (void *)_RET_IP_);
+ kmemleak_free_part(__va(base), size);
+ cursor = PFN_UP(base);
+ end = PFN_DOWN(base + size);
+
+ for (; cursor < end; cursor++) {
+ __free_pages_bootmem(pfn_to_page(cursor), 0);
+ totalram_pages++;
+ }
+}
/*
* Remaining API functions
@@ -1101,6 +1413,7 @@ void __init_memblock memblock_set_current_limit(phys_addr_t limit)
static void __init_memblock memblock_dump(struct memblock_type *type, char *name)
{
unsigned long long base, size;
+ unsigned long flags;
int i;
pr_info(" %s.cnt = 0x%lx\n", name, type->cnt);
@@ -1111,13 +1424,14 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name
base = rgn->base;
size = rgn->size;
+ flags = rgn->flags;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
if (memblock_get_region_node(rgn) != MAX_NUMNODES)
snprintf(nid_buf, sizeof(nid_buf), " on node %d",
memblock_get_region_node(rgn));
#endif
- pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s\n",
- name, i, base, base + size - 1, size, nid_buf);
+ pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s flags: %#lx\n",
+ name, i, base, base + size - 1, size, nid_buf, flags);
}
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7caff36180cd..a815686b7f0a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -49,7 +49,6 @@
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
-#include <linux/vmalloc.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
@@ -150,7 +149,7 @@ struct mem_cgroup_reclaim_iter {
* matches memcg->dead_count of the hierarchy root group.
*/
struct mem_cgroup *last_visited;
- unsigned long last_dead_count;
+ int last_dead_count;
/* scan generation, increased every round-trip */
unsigned int generation;
@@ -381,23 +380,12 @@ struct mem_cgroup {
/* WARNING: nodeinfo must be the last member here */
};
-static size_t memcg_size(void)
-{
- return sizeof(struct mem_cgroup) +
- nr_node_ids * sizeof(struct mem_cgroup_per_node *);
-}
-
/* internal only representation about the status of kmem accounting. */
enum {
- KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
- KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
+ KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
};
-/* We account when limit is on, but only after call sites are patched */
-#define KMEM_ACCOUNTED_MASK \
- ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))
-
#ifdef CONFIG_MEMCG_KMEM
static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
{
@@ -409,16 +397,6 @@ static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
}
-static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
-{
- set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
-}
-
-static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
-{
- clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
-}
-
static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
{
/*
@@ -1141,10 +1119,8 @@ skip_node:
* protected by css_get and the tree walk is rcu safe.
*/
if (next_css) {
- struct mem_cgroup *mem = mem_cgroup_from_css(next_css);
-
- if (css_tryget(&mem->css))
- return mem;
+ if ((next_css->flags & CSS_ONLINE) && css_tryget(next_css))
+ return mem_cgroup_from_css(next_css);
else {
prev_css = next_css;
goto skip_node;
@@ -1688,13 +1664,13 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
*/
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
- struct cgroup *task_cgrp;
- struct cgroup *mem_cgrp;
/*
- * Need a buffer in BSS, can't rely on allocations. The code relies
- * on the assumption that OOM is serialized for memory controller.
- * If this assumption is broken, revisit this code.
+ * protects memcg_name and makes sure that parallel ooms do not
+ * interleave
*/
+ static DEFINE_SPINLOCK(oom_info_lock);
+ struct cgroup *task_cgrp;
+ struct cgroup *mem_cgrp;
static char memcg_name[PATH_MAX];
int ret;
struct mem_cgroup *iter;
@@ -1703,6 +1679,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
if (!p)
return;
+ spin_lock(&oom_info_lock);
rcu_read_lock();
mem_cgrp = memcg->css.cgroup;
@@ -1771,6 +1748,7 @@ done:
pr_cont("\n");
}
+ spin_unlock(&oom_info_lock);
}
/*
@@ -2731,7 +2709,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
* MEMDIE process.
*/
if (unlikely(test_thread_flag(TIF_MEMDIE)
- || fatal_signal_pending(current)))
+ || fatal_signal_pending(current))
+ || current->flags & PF_EXITING)
goto bypass;
if (unlikely(task_in_memcg_oom(current)))
@@ -2902,7 +2881,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
unsigned short id;
swp_entry_t ent;
- VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc);
@@ -2936,7 +2915,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
bool anon;
lock_page_cgroup(pc);
- VM_BUG_ON(PageCgroupUsed(pc));
+ VM_BUG_ON_PAGE(PageCgroupUsed(pc), page);
/*
* we don't need page_cgroup_lock about tail pages, becase they are not
* accessed by any other context at this point.
@@ -2971,7 +2950,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
if (lrucare) {
if (was_on_lru) {
lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
- VM_BUG_ON(PageLRU(page));
+ VM_BUG_ON_PAGE(PageLRU(page), page);
SetPageLRU(page);
add_page_to_lru_list(page, lruvec, page_lru(page));
}
@@ -2997,10 +2976,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
static DEFINE_MUTEX(set_limit_mutex);
#ifdef CONFIG_MEMCG_KMEM
+static DEFINE_MUTEX(activate_kmem_mutex);
+
static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
{
return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
- (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK);
+ memcg_kmem_is_active(memcg);
}
/*
@@ -3099,16 +3080,6 @@ static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
css_put(&memcg->css);
}
-void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
-{
- if (!memcg)
- return;
-
- mutex_lock(&memcg->slab_caches_mutex);
- list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
- mutex_unlock(&memcg->slab_caches_mutex);
-}
-
/*
* helper for acessing a memcg's index. It will be used as an index in the
* child cache array in kmem_cache, and also to derive its name. This function
@@ -3119,43 +3090,6 @@ int memcg_cache_id(struct mem_cgroup *memcg)
return memcg ? memcg->kmemcg_id : -1;
}
-/*
- * This ends up being protected by the set_limit mutex, during normal
- * operation, because that is its main call site.
- *
- * But when we create a new cache, we can call this as well if its parent
- * is kmem-limited. That will have to hold set_limit_mutex as well.
- */
-int memcg_update_cache_sizes(struct mem_cgroup *memcg)
-{
- int num, ret;
-
- num = ida_simple_get(&kmem_limited_groups,
- 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
- if (num < 0)
- return num;
- /*
- * After this point, kmem_accounted (that we test atomically in
- * the beginning of this conditional), is no longer 0. This
- * guarantees only one process will set the following boolean
- * to true. We don't need test_and_set because we're protected
- * by the set_limit_mutex anyway.
- */
- memcg_kmem_set_activated(memcg);
-
- ret = memcg_update_all_caches(num+1);
- if (ret) {
- ida_simple_remove(&kmem_limited_groups, num);
- memcg_kmem_clear_activated(memcg);
- return ret;
- }
-
- memcg->kmemcg_id = num;
- INIT_LIST_HEAD(&memcg->memcg_slab_caches);
- mutex_init(&memcg->slab_caches_mutex);
- return 0;
-}
-
static size_t memcg_caches_array_size(int num_groups)
{
ssize_t size;
@@ -3192,18 +3126,17 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
if (num_groups > memcg_limited_groups_array_size) {
int i;
+ struct memcg_cache_params *new_params;
ssize_t size = memcg_caches_array_size(num_groups);
size *= sizeof(void *);
size += offsetof(struct memcg_cache_params, memcg_caches);
- s->memcg_params = kzalloc(size, GFP_KERNEL);
- if (!s->memcg_params) {
- s->memcg_params = cur_params;
+ new_params = kzalloc(size, GFP_KERNEL);
+ if (!new_params)
return -ENOMEM;
- }
- s->memcg_params->is_root_cache = true;
+ new_params->is_root_cache = true;
/*
* There is the chance it will be bigger than
@@ -3217,7 +3150,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
for (i = 0; i < memcg_limited_groups_array_size; i++) {
if (!cur_params->memcg_caches[i])
continue;
- s->memcg_params->memcg_caches[i] =
+ new_params->memcg_caches[i] =
cur_params->memcg_caches[i];
}
@@ -3230,13 +3163,15 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
* bigger than the others. And all updates will reset this
* anyway.
*/
- kfree(cur_params);
+ rcu_assign_pointer(s->memcg_params, new_params);
+ if (cur_params)
+ kfree_rcu(cur_params, rcu_head);
}
return 0;
}
-int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
- struct kmem_cache *root_cache)
+int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
+ struct kmem_cache *root_cache)
{
size_t size;
@@ -3264,35 +3199,85 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
return 0;
}
-void memcg_release_cache(struct kmem_cache *s)
+void memcg_free_cache_params(struct kmem_cache *s)
+{
+ kfree(s->memcg_params);
+}
+
+void memcg_register_cache(struct kmem_cache *s)
{
struct kmem_cache *root;
struct mem_cgroup *memcg;
int id;
- /*
- * This happens, for instance, when a root cache goes away before we
- * add any memcg.
- */
- if (!s->memcg_params)
+ if (is_root_cache(s))
return;
- if (s->memcg_params->is_root_cache)
- goto out;
+ /*
+ * Holding the slab_mutex assures nobody will touch the memcg_caches
+ * array while we are modifying it.
+ */
+ lockdep_assert_held(&slab_mutex);
+ root = s->memcg_params->root_cache;
memcg = s->memcg_params->memcg;
- id = memcg_cache_id(memcg);
+ id = memcg_cache_id(memcg);
+
+ css_get(&memcg->css);
+
+
+ /*
+ * Since readers won't lock (see cache_from_memcg_idx()), we need a
+ * barrier here to ensure nobody will see the kmem_cache partially
+ * initialized.
+ */
+ smp_wmb();
+
+ /*
+ * Initialize the pointer to this cache in its parent's memcg_params
+ * before adding it to the memcg_slab_caches list, otherwise we can
+ * fail to convert memcg_params_to_cache() while traversing the list.
+ */
+ VM_BUG_ON(root->memcg_params->memcg_caches[id]);
+ root->memcg_params->memcg_caches[id] = s;
+
+ mutex_lock(&memcg->slab_caches_mutex);
+ list_add(&s->memcg_params->list, &memcg->memcg_slab_caches);
+ mutex_unlock(&memcg->slab_caches_mutex);
+}
+
+void memcg_unregister_cache(struct kmem_cache *s)
+{
+ struct kmem_cache *root;
+ struct mem_cgroup *memcg;
+ int id;
+
+ if (is_root_cache(s))
+ return;
+
+ /*
+ * Holding the slab_mutex assures nobody will touch the memcg_caches
+ * array while we are modifying it.
+ */
+ lockdep_assert_held(&slab_mutex);
root = s->memcg_params->root_cache;
- root->memcg_params->memcg_caches[id] = NULL;
+ memcg = s->memcg_params->memcg;
+ id = memcg_cache_id(memcg);
mutex_lock(&memcg->slab_caches_mutex);
list_del(&s->memcg_params->list);
mutex_unlock(&memcg->slab_caches_mutex);
+ /*
+ * Clear the pointer to this cache in its parent's memcg_params only
+ * after removing it from the memcg_slab_caches list, otherwise we can
+ * fail to convert memcg_params_to_cache() while traversing the list.
+ */
+ VM_BUG_ON(!root->memcg_params->memcg_caches[id]);
+ root->memcg_params->memcg_caches[id] = NULL;
+
css_put(&memcg->css);
-out:
- kfree(s->memcg_params);
}
/*
@@ -3391,27 +3376,16 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
schedule_work(&cachep->memcg_params->destroy);
}
-/*
- * This lock protects updaters, not readers. We want readers to be as fast as
- * they can, and they will either see NULL or a valid cache value. Our model
- * allow them to see NULL, in which case the root memcg will be selected.
- *
- * We need this lock because multiple allocations to the same cache from a non
- * will span more than one worker. Only one of them can create the cache.
- */
-static DEFINE_MUTEX(memcg_cache_mutex);
-
-/*
- * Called with memcg_cache_mutex held
- */
-static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
- struct kmem_cache *s)
+static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
+ struct kmem_cache *s)
{
struct kmem_cache *new;
static char *tmp_name = NULL;
+ static DEFINE_MUTEX(mutex); /* protects tmp_name */
- lockdep_assert_held(&memcg_cache_mutex);
+ BUG_ON(!memcg_can_account_kmem(memcg));
+ mutex_lock(&mutex);
/*
* kmem_cache_create_memcg duplicates the given name and
* cgroup_name for this name requires RCU context.
@@ -3434,47 +3408,13 @@ static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
if (new)
new->allocflags |= __GFP_KMEMCG;
+ else
+ new = s;
+ mutex_unlock(&mutex);
return new;
}
-static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
- struct kmem_cache *cachep)
-{
- struct kmem_cache *new_cachep;
- int idx;
-
- BUG_ON(!memcg_can_account_kmem(memcg));
-
- idx = memcg_cache_id(memcg);
-
- mutex_lock(&memcg_cache_mutex);
- new_cachep = cache_from_memcg_idx(cachep, idx);
- if (new_cachep) {
- css_put(&memcg->css);
- goto out;
- }
-
- new_cachep = kmem_cache_dup(memcg, cachep);
- if (new_cachep == NULL) {
- new_cachep = cachep;
- css_put(&memcg->css);
- goto out;
- }
-
- atomic_set(&new_cachep->memcg_params->nr_pages , 0);
-
- cachep->memcg_params->memcg_caches[idx] = new_cachep;
- /*
- * the readers won't lock, make sure everybody sees the updated value,
- * so they won't put stuff in the queue again for no reason
- */
- wmb();
-out:
- mutex_unlock(&memcg_cache_mutex);
- return new_cachep;
-}
-
void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
{
struct kmem_cache *c;
@@ -3492,9 +3432,10 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
*
* Still, we don't want anyone else freeing memcg_caches under our
* noses, which can happen if a new memcg comes to life. As usual,
- * we'll take the set_limit_mutex to protect ourselves against this.
+ * we'll take the activate_kmem_mutex to protect ourselves against
+ * this.
*/
- mutex_lock(&set_limit_mutex);
+ mutex_lock(&activate_kmem_mutex);
for_each_memcg_cache_index(i) {
c = cache_from_memcg_idx(s, i);
if (!c)
@@ -3517,7 +3458,7 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
cancel_work_sync(&c->memcg_params->destroy);
kmem_cache_destroy(c);
}
- mutex_unlock(&set_limit_mutex);
+ mutex_unlock(&activate_kmem_mutex);
}
struct create_work {
@@ -3549,6 +3490,7 @@ static void memcg_create_cache_work_func(struct work_struct *w)
cw = container_of(w, struct create_work, work);
memcg_create_kmem_cache(cw->memcg, cw->cachep);
+ css_put(&cw->memcg->css);
kfree(cw);
}
@@ -3608,7 +3550,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
gfp_t gfp)
{
struct mem_cgroup *memcg;
- int idx;
+ struct kmem_cache *memcg_cachep;
VM_BUG_ON(!cachep->memcg_params);
VM_BUG_ON(!cachep->memcg_params->is_root_cache);
@@ -3622,15 +3564,9 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
if (!memcg_can_account_kmem(memcg))
goto out;
- idx = memcg_cache_id(memcg);
-
- /*
- * barrier to mare sure we're always seeing the up to date value. The
- * code updating memcg_caches will issue a write barrier to match this.
- */
- read_barrier_depends();
- if (likely(cache_from_memcg_idx(cachep, idx))) {
- cachep = cache_from_memcg_idx(cachep, idx);
+ memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
+ if (likely(memcg_cachep)) {
+ cachep = memcg_cachep;
goto out;
}
@@ -3784,7 +3720,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
if (!memcg)
return;
- VM_BUG_ON(mem_cgroup_is_root(memcg));
+ VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
}
#else
@@ -3863,7 +3799,7 @@ static int mem_cgroup_move_account(struct page *page,
bool anon = PageAnon(page);
VM_BUG_ON(from == to);
- VM_BUG_ON(PageLRU(page));
+ VM_BUG_ON_PAGE(PageLRU(page), page);
/*
* The page is isolated from LRU. So, collapse function
* will not handle this page. But page splitting can happen.
@@ -3956,7 +3892,7 @@ static int mem_cgroup_move_parent(struct page *page,
parent = root_mem_cgroup;
if (nr_pages > 1) {
- VM_BUG_ON(!PageTransHuge(page));
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
flags = compound_lock_irqsave(page);
}
@@ -3990,7 +3926,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
if (PageTransHuge(page)) {
nr_pages <<= compound_order(page);
- VM_BUG_ON(!PageTransHuge(page));
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
/*
* Never OOM-kill a process for a huge page. The
* fault handler will fall back to regular pages.
@@ -4010,8 +3946,8 @@ int mem_cgroup_newpage_charge(struct page *page,
{
if (mem_cgroup_disabled())
return 0;
- VM_BUG_ON(page_mapped(page));
- VM_BUG_ON(page->mapping && !PageAnon(page));
+ VM_BUG_ON_PAGE(page_mapped(page), page);
+ VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
VM_BUG_ON(!mm);
return mem_cgroup_charge_common(page, mm, gfp_mask,
MEM_CGROUP_CHARGE_TYPE_ANON);
@@ -4215,7 +4151,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
if (PageTransHuge(page)) {
nr_pages <<= compound_order(page);
- VM_BUG_ON(!PageTransHuge(page));
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
}
/*
* Check if our page_cgroup is valid
@@ -4307,7 +4243,7 @@ void mem_cgroup_uncharge_page(struct page *page)
/* early check. */
if (page_mapped(page))
return;
- VM_BUG_ON(page->mapping && !PageAnon(page));
+ VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
/*
* If the page is in swap cache, uncharge should be deferred
* to the swap path, which also properly accounts swap usage
@@ -4327,8 +4263,8 @@ void mem_cgroup_uncharge_page(struct page *page)
void mem_cgroup_uncharge_cache_page(struct page *page)
{
- VM_BUG_ON(page_mapped(page));
- VM_BUG_ON(page->mapping);
+ VM_BUG_ON_PAGE(page_mapped(page), page);
+ VM_BUG_ON_PAGE(page->mapping, page);
__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
}
@@ -5186,11 +5122,23 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
return val;
}
-static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
-{
- int ret = -EINVAL;
#ifdef CONFIG_MEMCG_KMEM
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+/* should be called with activate_kmem_mutex held */
+static int __memcg_activate_kmem(struct mem_cgroup *memcg,
+ unsigned long long limit)
+{
+ int err = 0;
+ int memcg_id;
+
+ if (memcg_kmem_is_active(memcg))
+ return 0;
+
+ /*
+ * We are going to allocate memory for data shared by all memory
+ * cgroups so let's stop accounting here.
+ */
+ memcg_stop_kmem_account();
+
/*
* For simplicity, we won't allow this to be disabled. It also can't
* be changed if the cgroup has children already, or if tasks had
@@ -5204,72 +5152,101 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
* of course permitted.
*/
mutex_lock(&memcg_create_mutex);
- mutex_lock(&set_limit_mutex);
- if (!memcg->kmem_account_flags && val != RES_COUNTER_MAX) {
- if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) {
- ret = -EBUSY;
- goto out;
- }
- ret = res_counter_set_limit(&memcg->kmem, val);
- VM_BUG_ON(ret);
+ if (cgroup_task_count(memcg->css.cgroup) || memcg_has_children(memcg))
+ err = -EBUSY;
+ mutex_unlock(&memcg_create_mutex);
+ if (err)
+ goto out;
- ret = memcg_update_cache_sizes(memcg);
- if (ret) {
- res_counter_set_limit(&memcg->kmem, RES_COUNTER_MAX);
- goto out;
- }
- static_key_slow_inc(&memcg_kmem_enabled_key);
- /*
- * setting the active bit after the inc will guarantee no one
- * starts accounting before all call sites are patched
- */
- memcg_kmem_set_active(memcg);
- } else
- ret = res_counter_set_limit(&memcg->kmem, val);
+ memcg_id = ida_simple_get(&kmem_limited_groups,
+ 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
+ if (memcg_id < 0) {
+ err = memcg_id;
+ goto out;
+ }
+
+ /*
+ * Make sure we have enough space for this cgroup in each root cache's
+ * memcg_params.
+ */
+ err = memcg_update_all_caches(memcg_id + 1);
+ if (err)
+ goto out_rmid;
+
+ memcg->kmemcg_id = memcg_id;
+ INIT_LIST_HEAD(&memcg->memcg_slab_caches);
+ mutex_init(&memcg->slab_caches_mutex);
+
+ /*
+ * We couldn't have accounted to this cgroup, because it hasn't got the
+ * active bit set yet, so this should succeed.
+ */
+ err = res_counter_set_limit(&memcg->kmem, limit);
+ VM_BUG_ON(err);
+
+ static_key_slow_inc(&memcg_kmem_enabled_key);
+ /*
+ * Setting the active bit after enabling static branching will
+ * guarantee no one starts accounting before all call sites are
+ * patched.
+ */
+ memcg_kmem_set_active(memcg);
out:
- mutex_unlock(&set_limit_mutex);
- mutex_unlock(&memcg_create_mutex);
-#endif
+ memcg_resume_kmem_account();
+ return err;
+
+out_rmid:
+ ida_simple_remove(&kmem_limited_groups, memcg_id);
+ goto out;
+}
+
+static int memcg_activate_kmem(struct mem_cgroup *memcg,
+ unsigned long long limit)
+{
+ int ret;
+
+ mutex_lock(&activate_kmem_mutex);
+ ret = __memcg_activate_kmem(memcg, limit);
+ mutex_unlock(&activate_kmem_mutex);
+ return ret;
+}
+
+static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
+ unsigned long long val)
+{
+ int ret;
+
+ if (!memcg_kmem_is_active(memcg))
+ ret = memcg_activate_kmem(memcg, val);
+ else
+ ret = res_counter_set_limit(&memcg->kmem, val);
return ret;
}
-#ifdef CONFIG_MEMCG_KMEM
static int memcg_propagate_kmem(struct mem_cgroup *memcg)
{
int ret = 0;
struct mem_cgroup *parent = parent_mem_cgroup(memcg);
- if (!parent)
- goto out;
- memcg->kmem_account_flags = parent->kmem_account_flags;
- /*
- * When that happen, we need to disable the static branch only on those
- * memcgs that enabled it. To achieve this, we would be forced to
- * complicate the code by keeping track of which memcgs were the ones
- * that actually enabled limits, and which ones got it from its
- * parents.
- *
- * It is a lot simpler just to do static_key_slow_inc() on every child
- * that is accounted.
- */
- if (!memcg_kmem_is_active(memcg))
- goto out;
+ if (!parent)
+ return 0;
+ mutex_lock(&activate_kmem_mutex);
/*
- * __mem_cgroup_free() will issue static_key_slow_dec() because this
- * memcg is active already. If the later initialization fails then the
- * cgroup core triggers the cleanup so we do not have to do it here.
+ * If the parent cgroup is not kmem-active now, it cannot be activated
+ * after this point, because it has at least one child already.
*/
- static_key_slow_inc(&memcg_kmem_enabled_key);
-
- mutex_lock(&set_limit_mutex);
- memcg_stop_kmem_account();
- ret = memcg_update_cache_sizes(memcg);
- memcg_resume_kmem_account();
- mutex_unlock(&set_limit_mutex);
-out:
+ if (memcg_kmem_is_active(parent))
+ ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX);
+ mutex_unlock(&activate_kmem_mutex);
return ret;
}
+#else
+static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
+ unsigned long long val)
+{
+ return -EINVAL;
+}
#endif /* CONFIG_MEMCG_KMEM */
/*
@@ -5303,7 +5280,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
else if (type == _MEMSWAP)
ret = mem_cgroup_resize_memsw_limit(memcg, val);
else if (type == _KMEM)
- ret = memcg_update_kmem_limit(css, val);
+ ret = memcg_update_kmem_limit(memcg, val);
else
return -EINVAL;
break;
@@ -6402,14 +6379,12 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
static struct mem_cgroup *mem_cgroup_alloc(void)
{
struct mem_cgroup *memcg;
- size_t size = memcg_size();
+ size_t size;
- /* Can be very big if nr_node_ids is very big */
- if (size < PAGE_SIZE)
- memcg = kzalloc(size, GFP_KERNEL);
- else
- memcg = vzalloc(size);
+ size = sizeof(struct mem_cgroup);
+ size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
+ memcg = kzalloc(size, GFP_KERNEL);
if (!memcg)
return NULL;
@@ -6420,10 +6395,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
return memcg;
out_free:
- if (size < PAGE_SIZE)
- kfree(memcg);
- else
- vfree(memcg);
+ kfree(memcg);
return NULL;
}
@@ -6441,7 +6413,6 @@ out_free:
static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
int node;
- size_t size = memcg_size();
mem_cgroup_remove_from_trees(memcg);
@@ -6462,10 +6433,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
* the cgroup_lock.
*/
disarm_static_keys(memcg);
- if (size < PAGE_SIZE)
- kfree(memcg);
- else
- vfree(memcg);
+ kfree(memcg);
}
/*
@@ -6546,7 +6514,6 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css));
- int error = 0;
if (css->cgroup->id > MEM_CGROUP_ID_MAX)
return -ENOSPC;
@@ -6581,10 +6548,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
if (parent != root_mem_cgroup)
mem_cgroup_subsys.broken_hierarchy = true;
}
-
- error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
mutex_unlock(&memcg_create_mutex);
- return error;
+
+ return memcg_init_kmem(memcg, &mem_cgroup_subsys);
}
/*
@@ -6893,7 +6859,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
enum mc_target_type ret = MC_TARGET_NONE;
page = pmd_page(pmd);
- VM_BUG_ON(!page || !PageHead(page));
+ VM_BUG_ON_PAGE(!page || !PageHead(page), page);
if (!move_anon())
return ret;
pc = lookup_page_cgroup(page);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index fabe55046c1d..4f08a2d61487 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -611,7 +611,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
}
/*
- * Dirty cache page page
+ * Dirty pagecache page
* Issues: when the error hit a hole page the error is not properly
* propagated.
*/
@@ -856,14 +856,14 @@ static int page_action(struct page_state *ps, struct page *p,
* the pages and send SIGBUS to the processes if the data was dirty.
*/
static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
- int trapno, int flags)
+ int trapno, int flags, struct page **hpagep)
{
enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
struct address_space *mapping;
LIST_HEAD(tokill);
int ret;
int kill = 1, forcekill;
- struct page *hpage = compound_head(p);
+ struct page *hpage = *hpagep;
struct page *ppage;
if (PageReserved(p) || PageSlab(p))
@@ -942,11 +942,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* We pinned the head page for hwpoison handling,
* now we split the thp and we are interested in
* the hwpoisoned raw page, so move the refcount
- * to it.
+ * to it. Similarly, page lock is shifted.
*/
if (hpage != p) {
put_page(hpage);
get_page(p);
+ lock_page(p);
+ unlock_page(hpage);
+ *hpagep = p;
}
/* THP is split, so ppage should be the real poisoned page. */
ppage = p;
@@ -964,17 +967,11 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
if (kill)
collect_procs(ppage, &tokill);
- if (hpage != ppage)
- lock_page(ppage);
-
ret = try_to_unmap(ppage, ttu);
if (ret != SWAP_SUCCESS)
printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
pfn, page_mapcount(ppage));
- if (hpage != ppage)
- unlock_page(ppage);
-
/*
* Now that the dirty bit has been propagated to the
* struct page and all unmaps done we can decide if
@@ -1193,8 +1190,12 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
/*
* Now take care of user space mappings.
* Abort on fail: __delete_from_page_cache() assumes unmapped page.
+ *
+ * When the raw error page is thp tail page, hpage points to the raw
+ * page after thp split.
*/
- if (hwpoison_user_mappings(p, pfn, trapno, flags) != SWAP_SUCCESS) {
+ if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
+ != SWAP_SUCCESS) {
printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
res = -EBUSY;
goto out;
@@ -1585,7 +1586,13 @@ static int __soft_offline_page(struct page *page, int flags)
ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
MIGRATE_SYNC, MR_MEMORY_FAILURE);
if (ret) {
- putback_lru_pages(&pagelist);
+ if (!list_empty(&pagelist)) {
+ list_del(&page->lru);
+ dec_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
+ putback_lru_page(page);
+ }
+
pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
pfn, ret, page->flags);
if (ret > 0)
diff --git a/mm/memory.c b/mm/memory.c
index 6768ce9e57d2..be6a0c0d4ae0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -59,6 +59,7 @@
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
+#include <linux/dma-debug.h>
#include <asm/io.h>
#include <asm/pgalloc.h>
@@ -288,7 +289,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
return 0;
batch = tlb->active;
}
- VM_BUG_ON(batch->nr > batch->max);
+ VM_BUG_ON_PAGE(batch->nr > batch->max, page);
return batch->max - batch->nr;
}
@@ -670,7 +671,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
current->comm,
(long long)pte_val(pte), (long long)pmd_val(*pmd));
if (page)
- dump_page(page);
+ dump_page(page, "bad pte");
printk(KERN_ALERT
"addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
@@ -2559,6 +2560,8 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
{
+ debug_dma_assert_idle(src);
+
/*
* If the source page was a PFN mapping, we don't have
* a "struct page" for it. We do a best-effort copy by
@@ -2699,7 +2702,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto unwritable_page;
}
} else
- VM_BUG_ON(!PageLocked(old_page));
+ VM_BUG_ON_PAGE(!PageLocked(old_page), old_page);
/*
* Since we dropped the lock we need to revalidate
@@ -3355,7 +3358,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (unlikely(!(ret & VM_FAULT_LOCKED)))
lock_page(vmf.page);
else
- VM_BUG_ON(!PageLocked(vmf.page));
+ VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
/*
* Should we do an early C-O-W break?
@@ -3392,7 +3395,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
goto unwritable_page;
}
} else
- VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
page_mkwrite = 1;
}
}
@@ -4272,11 +4275,20 @@ void copy_user_huge_page(struct page *dst, struct page *src,
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
+
+static struct kmem_cache *page_ptl_cachep;
+
+void __init ptlock_cache_init(void)
+{
+ page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
+ SLAB_PANIC, NULL);
+}
+
bool ptlock_alloc(struct page *page)
{
spinlock_t *ptl;
- ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL);
+ ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
if (!ptl)
return false;
page->ptl = ptl;
@@ -4285,6 +4297,6 @@ bool ptlock_alloc(struct page *page)
void ptlock_free(struct page *page)
{
- kfree(page->ptl);
+ kmem_cache_free(page_ptl_cachep, page->ptl);
}
#endif
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 489f235502db..a650db29606f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -9,7 +9,6 @@
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
-#include <linux/bootmem.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/pagevec.h>
@@ -269,7 +268,7 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
}
/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or
- * alloc_bootmem_node_nopanic() */
+ * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */
static int __ref ensure_zone_is_initialized(struct zone *zone,
unsigned long start_pfn, unsigned long num_pages)
{
@@ -1108,17 +1107,18 @@ int __ref add_memory(int nid, u64 start, u64 size)
if (ret)
return ret;
- lock_memory_hotplug();
-
res = register_memory_resource(start, size);
ret = -EEXIST;
if (!res)
- goto out;
+ return ret;
{ /* Stupid hack to suppress address-never-null warning */
void *p = NODE_DATA(nid);
new_pgdat = !p;
}
+
+ lock_memory_hotplug();
+
new_node = !node_online(nid);
if (new_node) {
pgdat = hotadd_new_pgdat(nid, start);
@@ -1310,7 +1310,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
#ifdef CONFIG_DEBUG_VM
printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
pfn);
- dump_page(page);
+ dump_page(page, "failed to remove from LRU");
#endif
put_page(page);
/* Because we don't have big zone->lock. we should
@@ -1446,6 +1446,7 @@ static int __init cmdline_parse_movable_node(char *p)
* the kernel away from hotpluggable memory.
*/
memblock_set_bottom_up(true);
+ movable_node_enabled = true;
#else
pr_warn("movable_node option not supported\n");
#endif
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 431fd768fbf3..36cb46cddf61 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1198,10 +1198,8 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *
}
if (PageHuge(page)) {
- if (vma)
- return alloc_huge_page_noerr(vma, address, 1);
- else
- return NULL;
+ BUG_ON(!vma);
+ return alloc_huge_page_noerr(vma, address, 1);
}
/*
* if !vma, alloc_page_vma() will use task or system default policy
@@ -2667,7 +2665,7 @@ static void __init check_numabalancing_enable(void)
if (nr_node_ids > 1 && !numabalancing_override) {
printk(KERN_INFO "Enabling automatic NUMA balancing. "
- "Configure with numa_balancing= or sysctl");
+ "Configure with numa_balancing= or the kernel.numa_balancing sysctl");
set_numabalancing_state(numabalancing_default);
}
}
diff --git a/mm/migrate.c b/mm/migrate.c
index 9194375b2307..734704f6f29b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -72,28 +72,12 @@ int migrate_prep_local(void)
}
/*
- * Add isolated pages on the list back to the LRU under page lock
- * to avoid leaking evictable pages back onto unevictable list.
- */
-void putback_lru_pages(struct list_head *l)
-{
- struct page *page;
- struct page *page2;
-
- list_for_each_entry_safe(page, page2, l, lru) {
- list_del(&page->lru);
- dec_zone_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
- putback_lru_page(page);
- }
-}
-
-/*
* Put previously isolated pages back onto the appropriate lists
* from where they were once taken off for compaction/migration.
*
- * This function shall be used instead of putback_lru_pages(),
- * whenever the isolated pageset has been built by isolate_migratepages_range()
+ * This function shall be used whenever the isolated pageset has been
+ * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range()
+ * and isolate_huge_page().
*/
void putback_movable_pages(struct list_head *l)
{
@@ -199,7 +183,12 @@ out:
*/
static void remove_migration_ptes(struct page *old, struct page *new)
{
- rmap_walk(new, remove_migration_pte, old);
+ struct rmap_walk_control rwc = {
+ .rmap_one = remove_migration_pte,
+ .arg = old,
+ };
+
+ rmap_walk(new, &rwc);
}
/*
@@ -510,7 +499,7 @@ void migrate_page_copy(struct page *newpage, struct page *page)
if (PageUptodate(page))
SetPageUptodate(newpage);
if (TestClearPageActive(page)) {
- VM_BUG_ON(PageUnevictable(page));
+ VM_BUG_ON_PAGE(PageUnevictable(page), page);
SetPageActive(newpage);
} else if (TestClearPageUnevictable(page))
SetPageUnevictable(newpage);
@@ -563,14 +552,6 @@ void migrate_page_copy(struct page *newpage, struct page *page)
* Migration functions
***********************************************************/
-/* Always fail migration. Used for mappings that are not movable */
-int fail_migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page)
-{
- return -EIO;
-}
-EXPORT_SYMBOL(fail_migrate_page);
-
/*
* Common logic to directly migrate a single page suitable for
* pages that do not use PagePrivate/PagePrivate2.
@@ -890,7 +871,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
* free the metadata, so the page can be freed.
*/
if (!page->mapping) {
- VM_BUG_ON(PageAnon(page));
+ VM_BUG_ON_PAGE(PageAnon(page), page);
if (page_has_private(page)) {
try_to_free_buffers(page);
goto uncharge;
@@ -1008,7 +989,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
{
int rc = 0;
int *result = NULL;
- struct page *new_hpage = get_new_page(hpage, private, &result);
+ struct page *new_hpage;
struct anon_vma *anon_vma = NULL;
/*
@@ -1018,9 +999,12 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
* tables or check whether the hugepage is pmd-based or not before
* kicking migration.
*/
- if (!hugepage_migration_support(page_hstate(hpage)))
+ if (!hugepage_migration_support(page_hstate(hpage))) {
+ putback_active_hugepage(hpage);
return -ENOSYS;
+ }
+ new_hpage = get_new_page(hpage, private, &result);
if (!new_hpage)
return -ENOMEM;
@@ -1120,7 +1104,12 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
nr_succeeded++;
break;
default:
- /* Permanent failure */
+ /*
+ * Permanent failure (-EBUSY, -ENOSYS, etc.):
+ * unlike -EAGAIN case, the failed page is
+ * removed from migration page list and not
+ * retried in the next outer loop.
+ */
nr_failed++;
break;
}
@@ -1594,35 +1583,42 @@ bool migrate_ratelimited(int node)
}
/* Returns true if the node is migrate rate-limited after the update */
-bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages)
+static bool numamigrate_update_ratelimit(pg_data_t *pgdat,
+ unsigned long nr_pages)
{
- bool rate_limited = false;
-
/*
* Rate-limit the amount of data that is being migrated to a node.
* Optimal placement is no good if the memory bus is saturated and
* all the time is being spent migrating!
*/
- spin_lock(&pgdat->numabalancing_migrate_lock);
if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
+ spin_lock(&pgdat->numabalancing_migrate_lock);
pgdat->numabalancing_migrate_nr_pages = 0;
pgdat->numabalancing_migrate_next_window = jiffies +
msecs_to_jiffies(migrate_interval_millisecs);
+ spin_unlock(&pgdat->numabalancing_migrate_lock);
}
- if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages)
- rate_limited = true;
- else
- pgdat->numabalancing_migrate_nr_pages += nr_pages;
- spin_unlock(&pgdat->numabalancing_migrate_lock);
-
- return rate_limited;
+ if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) {
+ trace_mm_numa_migrate_ratelimit(current, pgdat->node_id,
+ nr_pages);
+ return true;
+ }
+
+ /*
+ * This is an unlocked non-atomic update so errors are possible.
+ * The consequences are failing to migrate when we potentiall should
+ * have which is not severe enough to warrant locking. If it is ever
+ * a problem, it can be converted to a per-cpu counter.
+ */
+ pgdat->numabalancing_migrate_nr_pages += nr_pages;
+ return false;
}
-int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
+static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
{
int page_lru;
- VM_BUG_ON(compound_order(page) && !PageTransHuge(page));
+ VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
/* Avoid migrating to a node that is nearly full */
if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
@@ -1705,7 +1701,12 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
node, MIGRATE_ASYNC, MR_NUMA_MISPLACED);
if (nr_remaining) {
- putback_lru_pages(&migratepages);
+ if (!list_empty(&migratepages)) {
+ list_del(&page->lru);
+ dec_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
+ putback_lru_page(page);
+ }
isolated = 0;
} else
count_vm_numa_event(NUMA_PAGE_MIGRATE);
@@ -1752,8 +1753,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
if (!new_page)
goto out_fail;
- page_cpupid_xchg_last(new_page, page_cpupid_last(page));
-
isolated = numamigrate_isolate_page(pgdat, page);
if (!isolated) {
put_page(new_page);
diff --git a/mm/mincore.c b/mm/mincore.c
index da2be56a7b8f..101623378fbf 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -225,13 +225,6 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
- if (is_vm_hugetlb_page(vma)) {
- mincore_hugetlb_page_range(vma, addr, end, vec);
- return (end - addr) >> PAGE_SHIFT;
- }
-
- end = pmd_addr_end(addr, end);
-
if (is_vm_hugetlb_page(vma))
mincore_hugetlb_page_range(vma, addr, end, vec);
else
diff --git a/mm/mlock.c b/mm/mlock.c
index 192e6eebe4f2..ffadd08b83dd 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -79,8 +79,6 @@ void clear_page_mlock(struct page *page)
*/
void mlock_vma_page(struct page *page)
{
- BUG_ON(!PageLocked(page));
-
if (!TestSetPageMlocked(page)) {
mod_zone_page_state(page_zone(page), NR_MLOCK,
hpage_nr_pages(page));
@@ -91,6 +89,26 @@ void mlock_vma_page(struct page *page)
}
/*
+ * Isolate a page from LRU with optional get_page() pin.
+ * Assumes lru_lock already held and page already pinned.
+ */
+static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
+{
+ if (PageLRU(page)) {
+ struct lruvec *lruvec;
+
+ lruvec = mem_cgroup_page_lruvec(page, page_zone(page));
+ if (getpage)
+ get_page(page);
+ ClearPageLRU(page);
+ del_page_from_lru_list(page, lruvec, page_lru(page));
+ return true;
+ }
+
+ return false;
+}
+
+/*
* Finish munlock after successful page isolation
*
* Page must be locked. This is a wrapper for try_to_munlock()
@@ -126,9 +144,9 @@ static void __munlock_isolated_page(struct page *page)
static void __munlock_isolation_failed(struct page *page)
{
if (PageUnevictable(page))
- count_vm_event(UNEVICTABLE_PGSTRANDED);
+ __count_vm_event(UNEVICTABLE_PGSTRANDED);
else
- count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+ __count_vm_event(UNEVICTABLE_PGMUNLOCKED);
}
/**
@@ -152,28 +170,34 @@ static void __munlock_isolation_failed(struct page *page)
unsigned int munlock_vma_page(struct page *page)
{
unsigned int nr_pages;
+ struct zone *zone = page_zone(page);
BUG_ON(!PageLocked(page));
- if (TestClearPageMlocked(page)) {
- nr_pages = hpage_nr_pages(page);
- mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
- if (!isolate_lru_page(page))
- __munlock_isolated_page(page);
- else
- __munlock_isolation_failed(page);
- } else {
- nr_pages = hpage_nr_pages(page);
- }
-
/*
- * Regardless of the original PageMlocked flag, we determine nr_pages
- * after touching the flag. This leaves a possible race with a THP page
- * split, such that a whole THP page was munlocked, but nr_pages == 1.
- * Returning a smaller mask due to that is OK, the worst that can
- * happen is subsequent useless scanning of the former tail pages.
- * The NR_MLOCK accounting can however become broken.
+ * Serialize with any parallel __split_huge_page_refcount() which
+ * might otherwise copy PageMlocked to part of the tail pages before
+ * we clear it in the head page. It also stabilizes hpage_nr_pages().
*/
+ spin_lock_irq(&zone->lru_lock);
+
+ nr_pages = hpage_nr_pages(page);
+ if (!TestClearPageMlocked(page))
+ goto unlock_out;
+
+ __mod_zone_page_state(zone, NR_MLOCK, -nr_pages);
+
+ if (__munlock_isolate_lru_page(page, true)) {
+ spin_unlock_irq(&zone->lru_lock);
+ __munlock_isolated_page(page);
+ goto out;
+ }
+ __munlock_isolation_failed(page);
+
+unlock_out:
+ spin_unlock_irq(&zone->lru_lock);
+
+out:
return nr_pages - 1;
}
@@ -253,8 +277,8 @@ static int __mlock_posix_error_return(long retval)
static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec,
int *pgrescued)
{
- VM_BUG_ON(PageLRU(page));
- VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
if (page_mapcount(page) <= 1 && page_evictable(page)) {
pagevec_add(pvec, page);
@@ -310,34 +334,24 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
struct page *page = pvec->pages[i];
if (TestClearPageMlocked(page)) {
- struct lruvec *lruvec;
- int lru;
-
- if (PageLRU(page)) {
- lruvec = mem_cgroup_page_lruvec(page, zone);
- lru = page_lru(page);
- /*
- * We already have pin from follow_page_mask()
- * so we can spare the get_page() here.
- */
- ClearPageLRU(page);
- del_page_from_lru_list(page, lruvec, lru);
- } else {
- __munlock_isolation_failed(page);
- goto skip_munlock;
- }
-
- } else {
-skip_munlock:
/*
- * We won't be munlocking this page in the next phase
- * but we still need to release the follow_page_mask()
- * pin. We cannot do it under lru_lock however. If it's
- * the last pin, __page_cache_release would deadlock.
+ * We already have pin from follow_page_mask()
+ * so we can spare the get_page() here.
*/
- pagevec_add(&pvec_putback, pvec->pages[i]);
- pvec->pages[i] = NULL;
+ if (__munlock_isolate_lru_page(page, false))
+ continue;
+ else
+ __munlock_isolation_failed(page);
}
+
+ /*
+ * We won't be munlocking this page in the next phase
+ * but we still need to release the follow_page_mask()
+ * pin. We cannot do it under lru_lock however. If it's
+ * the last pin, __page_cache_release() would deadlock.
+ */
+ pagevec_add(&pvec_putback, pvec->pages[i]);
+ pvec->pages[i] = NULL;
}
delta_munlocked = -nr + pagevec_count(&pvec_putback);
__mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
@@ -709,19 +723,21 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
lru_add_drain_all(); /* flush pagevec */
- down_write(&current->mm->mmap_sem);
len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
start &= PAGE_MASK;
- locked = len >> PAGE_SHIFT;
- locked += current->mm->locked_vm;
-
lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
+ locked = len >> PAGE_SHIFT;
+
+ down_write(&current->mm->mmap_sem);
+
+ locked += current->mm->locked_vm;
/* check against resource limits */
if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
error = do_mlock(start, len, 1);
+
up_write(&current->mm->mmap_sem);
if (!error)
error = __mm_populate(start, len, 0);
@@ -732,11 +748,13 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
{
int ret;
- down_write(&current->mm->mmap_sem);
len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
start &= PAGE_MASK;
+
+ down_write(&current->mm->mmap_sem);
ret = do_mlock(start, len, 0);
up_write(&current->mm->mmap_sem);
+
return ret;
}
@@ -781,12 +799,12 @@ SYSCALL_DEFINE1(mlockall, int, flags)
if (flags & MCL_CURRENT)
lru_add_drain_all(); /* flush pagevec */
- down_write(&current->mm->mmap_sem);
-
lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
ret = -ENOMEM;
+ down_write(&current->mm->mmap_sem);
+
if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
capable(CAP_IPC_LOCK))
ret = do_mlockall(flags);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 68562e92d50c..857a6434e3a5 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -202,5 +202,4 @@ static int __init mm_sysfs_init(void)
return 0;
}
-
-__initcall(mm_sysfs_init);
+pure_initcall(mm_sysfs_init);
diff --git a/mm/mmap.c b/mm/mmap.c
index 834b2d785f1e..126d8b976bfd 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -86,6 +86,7 @@ EXPORT_SYMBOL(vm_get_page_prot);
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */
int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
+unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
@@ -1190,6 +1191,24 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
return hint;
}
+static inline int mlock_future_check(struct mm_struct *mm,
+ unsigned long flags,
+ unsigned long len)
+{
+ unsigned long locked, lock_limit;
+
+ /* mlock MCL_FUTURE? */
+ if (flags & VM_LOCKED) {
+ locked = len >> PAGE_SHIFT;
+ locked += mm->locked_vm;
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
+ lock_limit >>= PAGE_SHIFT;
+ if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+ return -EAGAIN;
+ }
+ return 0;
+}
+
/*
* The caller must hold down_write(&current->mm->mmap_sem).
*/
@@ -1251,16 +1270,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
if (!can_do_mlock())
return -EPERM;
- /* mlock MCL_FUTURE? */
- if (vm_flags & VM_LOCKED) {
- unsigned long locked, lock_limit;
- locked = len >> PAGE_SHIFT;
- locked += mm->locked_vm;
- lock_limit = rlimit(RLIMIT_MEMLOCK);
- lock_limit >>= PAGE_SHIFT;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK))
- return -EAGAIN;
- }
+ if (mlock_future_check(mm, vm_flags, len))
+ return -EAGAIN;
if (file) {
struct inode *inode = file_inode(file);
@@ -2591,18 +2602,9 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
if (error & ~PAGE_MASK)
return error;
- /*
- * mlock MCL_FUTURE?
- */
- if (mm->def_flags & VM_LOCKED) {
- unsigned long locked, lock_limit;
- locked = len >> PAGE_SHIFT;
- locked += mm->locked_vm;
- lock_limit = rlimit(RLIMIT_MEMLOCK);
- lock_limit >>= PAGE_SHIFT;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK))
- return -EAGAIN;
- }
+ error = mlock_future_check(mm, mm->def_flags, len);
+ if (error)
+ return error;
/*
* mm->mmap_sem is required to protect against another thread
@@ -3140,7 +3142,7 @@ static int init_user_reserve(void)
sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
return 0;
}
-module_init(init_user_reserve)
+subsys_initcall(init_user_reserve);
/*
* Initialise sysctl_admin_reserve_kbytes.
@@ -3161,7 +3163,7 @@ static int init_admin_reserve(void)
sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
return 0;
}
-module_init(init_admin_reserve)
+subsys_initcall(init_admin_reserve);
/*
* Reinititalise user and admin reserves if memory is added or removed.
@@ -3231,4 +3233,4 @@ static int __meminit init_reserve_notifier(void)
return 0;
}
-module_init(init_reserve_notifier)
+subsys_initcall(init_reserve_notifier);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 93e6089cb456..41cefdf0aadd 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -329,5 +329,4 @@ static int __init mmu_notifier_init(void)
{
return init_srcu_struct(&srcu);
}
-
-module_init(mmu_notifier_init);
+subsys_initcall(mmu_notifier_init);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index bb53a6591aea..7332c1785744 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -23,6 +23,7 @@
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/perf_event.h>
+#include <linux/ksm.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
@@ -63,7 +64,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
ptent = *pte;
page = vm_normal_page(vma, addr, oldpte);
- if (page) {
+ if (page && !PageKsm(page)) {
if (!pte_numa(oldpte)) {
ptent = pte_mknuma(ptent);
set_pte_at(mm, addr, pte, ptent);
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 2c254d374655..e2906a5428fd 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -41,11 +41,13 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
if (limit > memblock.current_limit)
limit = memblock.current_limit;
- addr = memblock_find_in_range_node(goal, limit, size, align, nid);
+ addr = memblock_find_in_range_node(size, align, goal, limit, nid);
if (!addr)
return NULL;
- memblock_reserve(addr, size);
+ if (memblock_reserve(addr, size))
+ return NULL;
+
ptr = phys_to_virt(addr);
memset(ptr, 0, size);
/*
@@ -117,14 +119,22 @@ static unsigned long __init free_low_memory_core_early(void)
phys_addr_t start, end, size;
u64 i;
- for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL)
+ for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL)
count += __free_memory_core(start, end);
- /* free range that is used for reserved array if we allocate it */
+#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+
+ /* Free memblock.reserved array if it was allocated */
size = get_allocated_memblock_reserved_regions_info(&start);
if (size)
count += __free_memory_core(start, start + size);
+ /* Free memblock.memory array if it was allocated */
+ size = get_allocated_memblock_memory_regions_info(&start);
+ if (size)
+ count += __free_memory_core(start, start + size);
+#endif
+
return count;
}
@@ -161,7 +171,7 @@ unsigned long __init free_all_bootmem(void)
reset_all_zones_managed_pages();
/*
- * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
+ * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
* because in some case like Node0 doesn't have RAM installed
* low ram will be on Node1
*/
@@ -215,7 +225,7 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size,
restart:
- ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
+ ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, goal, limit);
if (ptr)
return ptr;
@@ -299,7 +309,7 @@ again:
if (ptr)
return ptr;
- ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
+ ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align,
goal, limit);
if (ptr)
return ptr;
diff --git a/mm/nommu.c b/mm/nommu.c
index fec093adad9c..8740213b1647 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -60,6 +60,7 @@ unsigned long highest_memmap_pfn;
struct percpu_counter vm_committed_as;
int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
int sysctl_overcommit_ratio = 50; /* default is 50% */
+unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 1e4a600a6163..054ff47c4478 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -47,19 +47,21 @@ static DEFINE_SPINLOCK(zone_scan_lock);
#ifdef CONFIG_NUMA
/**
* has_intersects_mems_allowed() - check task eligiblity for kill
- * @tsk: task struct of which task to consider
+ * @start: task struct of which task to consider
* @mask: nodemask passed to page allocator for mempolicy ooms
*
* Task eligibility is determined by whether or not a candidate task, @tsk,
* shares the same mempolicy nodes as current if it is bound by such a policy
* and whether or not it has the same set of allowed cpuset nodes.
*/
-static bool has_intersects_mems_allowed(struct task_struct *tsk,
+static bool has_intersects_mems_allowed(struct task_struct *start,
const nodemask_t *mask)
{
- struct task_struct *start = tsk;
+ struct task_struct *tsk;
+ bool ret = false;
- do {
+ rcu_read_lock();
+ for_each_thread(start, tsk) {
if (mask) {
/*
* If this is a mempolicy constrained oom, tsk's
@@ -67,19 +69,20 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk,
* mempolicy intersects current, otherwise it may be
* needlessly killed.
*/
- if (mempolicy_nodemask_intersects(tsk, mask))
- return true;
+ ret = mempolicy_nodemask_intersects(tsk, mask);
} else {
/*
* This is not a mempolicy constrained oom, so only
* check the mems of tsk's cpuset.
*/
- if (cpuset_mems_allowed_intersects(current, tsk))
- return true;
+ ret = cpuset_mems_allowed_intersects(current, tsk);
}
- } while_each_thread(start, tsk);
+ if (ret)
+ break;
+ }
+ rcu_read_unlock();
- return false;
+ return ret;
}
#else
static bool has_intersects_mems_allowed(struct task_struct *tsk,
@@ -97,16 +100,21 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk,
*/
struct task_struct *find_lock_task_mm(struct task_struct *p)
{
- struct task_struct *t = p;
+ struct task_struct *t;
- do {
+ rcu_read_lock();
+
+ for_each_thread(p, t) {
task_lock(t);
if (likely(t->mm))
- return t;
+ goto found;
task_unlock(t);
- } while_each_thread(p, t);
+ }
+ t = NULL;
+found:
+ rcu_read_unlock();
- return NULL;
+ return t;
}
/* return true if the task is not adequate as candidate victim task. */
@@ -301,7 +309,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
unsigned long chosen_points = 0;
rcu_read_lock();
- do_each_thread(g, p) {
+ for_each_process_thread(g, p) {
unsigned int points;
switch (oom_scan_process_thread(p, totalpages, nodemask,
@@ -323,7 +331,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
chosen = p;
chosen_points = points;
}
- } while_each_thread(g, p);
+ }
if (chosen)
get_task_struct(chosen);
rcu_read_unlock();
@@ -406,7 +414,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
{
struct task_struct *victim = p;
struct task_struct *child;
- struct task_struct *t = p;
+ struct task_struct *t;
struct mm_struct *mm;
unsigned int victim_points = 0;
static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
@@ -437,7 +445,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
* still freeing memory.
*/
read_lock(&tasklist_lock);
- do {
+ for_each_thread(p, t) {
list_for_each_entry(child, &t->children, sibling) {
unsigned int child_points;
@@ -455,13 +463,11 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
get_task_struct(victim);
}
}
- } while_each_thread(p, t);
+ }
read_unlock(&tasklist_lock);
- rcu_read_lock();
p = find_lock_task_mm(victim);
if (!p) {
- rcu_read_unlock();
put_task_struct(victim);
return;
} else if (victim != p) {
@@ -487,6 +493,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
* That thread will now get access to memory reserves since it has a
* pending fatal signal.
*/
+ rcu_read_lock();
for_each_process(p)
if (p->mm == mm && !same_thread_group(p, victim) &&
!(p->flags & PF_KTHREAD)) {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5248fe070aa4..60cbf9b0860a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -205,7 +205,7 @@ static char * const zone_names[MAX_NR_ZONES] = {
};
int min_free_kbytes = 1024;
-int user_min_free_kbytes;
+int user_min_free_kbytes = -1;
static unsigned long __meminitdata nr_kernel_pages;
static unsigned long __meminitdata nr_all_pages;
@@ -295,7 +295,7 @@ static inline int bad_range(struct zone *zone, struct page *page)
}
#endif
-static void bad_page(struct page *page)
+static void bad_page(struct page *page, char *reason, unsigned long bad_flags)
{
static unsigned long resume;
static unsigned long nr_shown;
@@ -329,7 +329,7 @@ static void bad_page(struct page *page)
printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
current->comm, page_to_pfn(page));
- dump_page(page);
+ dump_page_badflags(page, reason, bad_flags);
print_modules();
dump_stack();
@@ -383,7 +383,7 @@ static int destroy_compound_page(struct page *page, unsigned long order)
int bad = 0;
if (unlikely(compound_order(page) != order)) {
- bad_page(page);
+ bad_page(page, "wrong compound order", 0);
bad++;
}
@@ -392,8 +392,11 @@ static int destroy_compound_page(struct page *page, unsigned long order)
for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
- if (unlikely(!PageTail(p) || (p->first_page != page))) {
- bad_page(page);
+ if (unlikely(!PageTail(p))) {
+ bad_page(page, "PageTail not set", 0);
+ bad++;
+ } else if (unlikely(p->first_page != page)) {
+ bad_page(page, "first_page not consistent", 0);
bad++;
}
__ClearPageTail(p);
@@ -506,12 +509,12 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
return 0;
if (page_is_guard(buddy) && page_order(buddy) == order) {
- VM_BUG_ON(page_count(buddy) != 0);
+ VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
return 1;
}
if (PageBuddy(buddy) && page_order(buddy) == order) {
- VM_BUG_ON(page_count(buddy) != 0);
+ VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
return 1;
}
return 0;
@@ -561,8 +564,8 @@ static inline void __free_one_page(struct page *page,
page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
- VM_BUG_ON(page_idx & ((1 << order) - 1));
- VM_BUG_ON(bad_range(zone, page));
+ VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page);
+ VM_BUG_ON_PAGE(bad_range(zone, page), page);
while (order < MAX_ORDER-1) {
buddy_idx = __find_buddy_index(page_idx, order);
@@ -618,12 +621,23 @@ out:
static inline int free_pages_check(struct page *page)
{
- if (unlikely(page_mapcount(page) |
- (page->mapping != NULL) |
- (atomic_read(&page->_count) != 0) |
- (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
- (mem_cgroup_bad_page_check(page)))) {
- bad_page(page);
+ char *bad_reason = NULL;
+ unsigned long bad_flags = 0;
+
+ if (unlikely(page_mapcount(page)))
+ bad_reason = "nonzero mapcount";
+ if (unlikely(page->mapping != NULL))
+ bad_reason = "non-NULL mapping";
+ if (unlikely(atomic_read(&page->_count) != 0))
+ bad_reason = "nonzero _count";
+ if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
+ bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
+ bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
+ }
+ if (unlikely(mem_cgroup_bad_page_check(page)))
+ bad_reason = "cgroup check failed";
+ if (unlikely(bad_reason)) {
+ bad_page(page, bad_reason, bad_flags);
return 1;
}
page_cpupid_reset_last(page);
@@ -813,7 +827,7 @@ static inline void expand(struct zone *zone, struct page *page,
area--;
high--;
size >>= 1;
- VM_BUG_ON(bad_range(zone, &page[size]));
+ VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
#ifdef CONFIG_DEBUG_PAGEALLOC
if (high < debug_guardpage_minorder()) {
@@ -843,12 +857,23 @@ static inline void expand(struct zone *zone, struct page *page,
*/
static inline int check_new_page(struct page *page)
{
- if (unlikely(page_mapcount(page) |
- (page->mapping != NULL) |
- (atomic_read(&page->_count) != 0) |
- (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
- (mem_cgroup_bad_page_check(page)))) {
- bad_page(page);
+ char *bad_reason = NULL;
+ unsigned long bad_flags = 0;
+
+ if (unlikely(page_mapcount(page)))
+ bad_reason = "nonzero mapcount";
+ if (unlikely(page->mapping != NULL))
+ bad_reason = "non-NULL mapping";
+ if (unlikely(atomic_read(&page->_count) != 0))
+ bad_reason = "nonzero _count";
+ if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
+ bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
+ bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
+ }
+ if (unlikely(mem_cgroup_bad_page_check(page)))
+ bad_reason = "cgroup check failed";
+ if (unlikely(bad_reason)) {
+ bad_page(page, bad_reason, bad_flags);
return 1;
}
return 0;
@@ -865,8 +890,6 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
}
set_page_private(page, 0);
- set_page_refcounted(page);
-
arch_alloc_page(page, order);
kernel_map_pages(page, 1 << order, 1);
@@ -876,6 +899,16 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
if (order && (gfp_flags & __GFP_COMP))
prep_compound_page(page, order);
+ /*
+ * Make sure the caller of get_page_unless_zero() will see the
+ * fully initialized page. Say, to ensure that compound_lock()
+ * can't race with the non-atomic __SetPage*() above.
+ */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ smp_wmb();
+#endif
+ set_page_refcounted(page);
+
return 0;
}
@@ -955,7 +988,7 @@ int move_freepages(struct zone *zone,
for (page = start_page; page <= end_page;) {
/* Make sure we are not inadvertently changing nodes */
- VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
+ VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
if (!pfn_valid_within(page_to_pfn(page))) {
page++;
@@ -1404,8 +1437,8 @@ void split_page(struct page *page, unsigned int order)
{
int i;
- VM_BUG_ON(PageCompound(page));
- VM_BUG_ON(!page_count(page));
+ VM_BUG_ON_PAGE(PageCompound(page), page);
+ VM_BUG_ON_PAGE(!page_count(page), page);
#ifdef CONFIG_KMEMCHECK
/*
@@ -1552,7 +1585,7 @@ again:
zone_statistics(preferred_zone, zone, gfp_flags);
local_irq_restore(flags);
- VM_BUG_ON(bad_range(zone, page));
+ VM_BUG_ON_PAGE(bad_range(zone, page), page);
if (prep_new_page(page, order, gfp_flags))
goto again;
return page;
@@ -2072,13 +2105,6 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
return;
/*
- * Walking all memory to count page types is very expensive and should
- * be inhibited in non-blockable contexts.
- */
- if (!(gfp_mask & __GFP_WAIT))
- filter |= SHOW_MEM_FILTER_PAGE_COUNT;
-
- /*
* This documents exceptions given to allocations in certain
* contexts that are allowed to allocate outside current's set
* of allowed nodes.
@@ -2242,10 +2268,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
preferred_zone, migratetype);
if (page) {
preferred_zone->compact_blockskip_flush = false;
- preferred_zone->compact_considered = 0;
- preferred_zone->compact_defer_shift = 0;
- if (order >= preferred_zone->compact_order_failed)
- preferred_zone->compact_order_failed = order + 1;
+ compaction_defer_reset(preferred_zone, order, true);
count_vm_event(COMPACTSUCCESS);
return page;
}
@@ -2535,8 +2558,15 @@ rebalance:
}
/* Atomic allocations - we can't balance anything */
- if (!wait)
+ if (!wait) {
+ /*
+ * All existing users of the deprecated __GFP_NOFAIL are
+ * blockable, so warn of any new users that actually allow this
+ * type of allocation to fail.
+ */
+ WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
goto nopage;
+ }
/* Avoid recursion of direct reclaim */
if (current->flags & PF_MEMALLOC)
@@ -2628,6 +2658,11 @@ rebalance:
pages_reclaimed)) {
/* Wait for some write requests to complete then retry */
wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
+
+ /* Allocations that cannot fail must allocate from somewhere */
+ if (gfp_mask & __GFP_NOFAIL)
+ alloc_flags |= ALLOC_HARDER;
+
goto rebalance;
} else {
/*
@@ -3901,6 +3936,7 @@ static void setup_zone_migrate_reserve(struct zone *zone)
struct page *page;
unsigned long block_migratetype;
int reserve;
+ int old_reserve;
/*
* Get the start pfn, end pfn and the number of blocks to reserve
@@ -3922,6 +3958,12 @@ static void setup_zone_migrate_reserve(struct zone *zone)
* future allocation of hugepages at runtime.
*/
reserve = min(2, reserve);
+ old_reserve = zone->nr_migrate_reserve_block;
+
+ /* When memory hot-add, we almost always need to do nothing */
+ if (reserve == old_reserve)
+ return;
+ zone->nr_migrate_reserve_block = reserve;
for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
if (!pfn_valid(pfn))
@@ -3959,6 +4001,12 @@ static void setup_zone_migrate_reserve(struct zone *zone)
reserve--;
continue;
}
+ } else if (!old_reserve) {
+ /*
+ * At boot time we don't need to scan the whole zone
+ * for turning off MIGRATE_RESERVE.
+ */
+ break;
}
/*
@@ -4209,7 +4257,6 @@ static noinline __init_refok
int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
{
int i;
- struct pglist_data *pgdat = zone->zone_pgdat;
size_t alloc_size;
/*
@@ -4225,7 +4272,8 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
if (!slab_is_available()) {
zone->wait_table = (wait_queue_head_t *)
- alloc_bootmem_node_nopanic(pgdat, alloc_size);
+ memblock_virt_alloc_node_nopanic(
+ alloc_size, zone->zone_pgdat->node_id);
} else {
/*
* This case means that a zone whose size was 0 gets new memory
@@ -4345,13 +4393,14 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
#endif
/**
- * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
+ * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
* @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
- * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
+ * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
*
* If an architecture guarantees that all ranges registered with
* add_active_ranges() contain no holes and may be freed, this
- * this function may be used instead of calling free_bootmem() manually.
+ * this function may be used instead of calling memblock_free_early_nid()
+ * manually.
*/
void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
{
@@ -4363,9 +4412,9 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
end_pfn = min(end_pfn, max_low_pfn);
if (start_pfn < end_pfn)
- free_bootmem_node(NODE_DATA(this_nid),
- PFN_PHYS(start_pfn),
- (end_pfn - start_pfn) << PAGE_SHIFT);
+ memblock_free_early_nid(PFN_PHYS(start_pfn),
+ (end_pfn - start_pfn) << PAGE_SHIFT,
+ this_nid);
}
}
@@ -4636,8 +4685,9 @@ static void __init setup_usemap(struct pglist_data *pgdat,
unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
zone->pageblock_flags = NULL;
if (usemapsize)
- zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
- usemapsize);
+ zone->pageblock_flags =
+ memblock_virt_alloc_node_nopanic(usemapsize,
+ pgdat->node_id);
}
#else
static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
@@ -4831,7 +4881,8 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
size = (end - start) * sizeof(struct page);
map = alloc_remap(pgdat->node_id, size);
if (!map)
- map = alloc_bootmem_node_nopanic(pgdat, size);
+ map = memblock_virt_alloc_node_nopanic(size,
+ pgdat->node_id);
pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
}
#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -5012,9 +5063,33 @@ static void __init find_zone_movable_pfns_for_nodes(void)
nodemask_t saved_node_state = node_states[N_MEMORY];
unsigned long totalpages = early_calculate_totalpages();
int usable_nodes = nodes_weight(node_states[N_MEMORY]);
+ struct memblock_type *type = &memblock.memory;
+
+ /* Need to find movable_zone earlier when movable_node is specified. */
+ find_usable_zone_for_movable();
/*
- * If movablecore was specified, calculate what size of
+ * If movable_node is specified, ignore kernelcore and movablecore
+ * options.
+ */
+ if (movable_node_is_enabled()) {
+ for (i = 0; i < type->cnt; i++) {
+ if (!memblock_is_hotpluggable(&type->regions[i]))
+ continue;
+
+ nid = type->regions[i].nid;
+
+ usable_startpfn = PFN_DOWN(type->regions[i].base);
+ zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
+ min(usable_startpfn, zone_movable_pfn[nid]) :
+ usable_startpfn;
+ }
+
+ goto out2;
+ }
+
+ /*
+ * If movablecore=nn[KMG] was specified, calculate what size of
* kernelcore that corresponds so that memory usable for
* any allocation type is evenly spread. If both kernelcore
* and movablecore are specified, then the value of kernelcore
@@ -5040,7 +5115,6 @@ static void __init find_zone_movable_pfns_for_nodes(void)
goto out;
/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
- find_usable_zone_for_movable();
usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
restart:
@@ -5131,6 +5205,7 @@ restart:
if (usable_nodes && required_kernelcore > usable_nodes)
goto restart;
+out2:
/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
for (nid = 0; nid < MAX_NUMNODES; nid++)
zone_movable_pfn[nid] =
@@ -5692,7 +5767,12 @@ module_init(init_per_zone_wmark_min)
int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
- proc_dointvec(table, write, buffer, length, ppos);
+ int rc;
+
+ rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+ if (rc)
+ return rc;
+
if (write) {
user_min_free_kbytes = min_free_kbytes;
setup_per_zone_wmarks();
@@ -5857,7 +5937,7 @@ void *__init alloc_large_system_hash(const char *tablename,
do {
size = bucketsize << log2qty;
if (flags & HASH_EARLY)
- table = alloc_bootmem_nopanic(size);
+ table = memblock_virt_alloc_nopanic(size, 0);
else if (hashdist)
table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
else {
@@ -5959,7 +6039,7 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
pfn = page_to_pfn(page);
bitmap = get_pageblock_bitmap(zone, pfn);
bitidx = pfn_to_bitidx(zone, pfn);
- VM_BUG_ON(!zone_spans_pfn(zone, pfn));
+ VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page);
for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
if (flags & value)
@@ -6457,12 +6537,24 @@ static void dump_page_flags(unsigned long flags)
printk(")\n");
}
-void dump_page(struct page *page)
+void dump_page_badflags(struct page *page, char *reason, unsigned long badflags)
{
printk(KERN_ALERT
"page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
page, atomic_read(&page->_count), page_mapcount(page),
page->mapping, page->index);
dump_page_flags(page->flags);
+ if (reason)
+ pr_alert("page dumped because: %s\n", reason);
+ if (page->flags & badflags) {
+ pr_alert("bad because of flags:\n");
+ dump_page_flags(page->flags & badflags);
+ }
mem_cgroup_print_bad_page(page);
}
+
+void dump_page(struct page *page, char *reason)
+{
+ dump_page_badflags(page, reason, 0);
+}
+EXPORT_SYMBOL_GPL(dump_page);
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 3bd0b8e6ab12..cfd162882c00 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -54,8 +54,9 @@ static int __init alloc_node_page_cgroup(int nid)
table_size = sizeof(struct page_cgroup) * nr_pages;
- base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
- table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+ base = memblock_virt_alloc_try_nid_nopanic(
+ table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
+ BOOTMEM_ALLOC_ACCESSIBLE, nid);
if (!base)
return -ENOMEM;
NODE_DATA(nid)->node_page_cgroup = base;
diff --git a/mm/page_io.c b/mm/page_io.c
index f14eded987fa..7c59ef681381 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -320,8 +320,8 @@ int swap_readpage(struct page *page)
int ret = 0;
struct swap_info_struct *sis = page_swap_info(page);
- VM_BUG_ON(!PageLocked(page));
- VM_BUG_ON(PageUptodate(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(PageUptodate(page), page);
if (frontswap_load(page) == 0) {
SetPageUptodate(page);
unlock_page(page);
diff --git a/mm/percpu.c b/mm/percpu.c
index afbf352ae580..036cfe07050f 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1063,7 +1063,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
__alignof__(ai->groups[0].cpu_map[0]));
ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
- ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size));
+ ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), 0);
if (!ptr)
return NULL;
ai = ptr;
@@ -1088,7 +1088,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
*/
void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
{
- free_bootmem(__pa(ai), ai->__ai_size);
+ memblock_free_early(__pa(ai), ai->__ai_size);
}
/**
@@ -1246,10 +1246,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
/* process group information and build config tables accordingly */
- group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0]));
- group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0]));
- unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0]));
- unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0]));
+ group_offsets = memblock_virt_alloc(ai->nr_groups *
+ sizeof(group_offsets[0]), 0);
+ group_sizes = memblock_virt_alloc(ai->nr_groups *
+ sizeof(group_sizes[0]), 0);
+ unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0);
+ unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0);
for (cpu = 0; cpu < nr_cpu_ids; cpu++)
unit_map[cpu] = UINT_MAX;
@@ -1311,7 +1313,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
* empty chunks.
*/
pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
- pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
+ pcpu_slot = memblock_virt_alloc(
+ pcpu_nr_slots * sizeof(pcpu_slot[0]), 0);
for (i = 0; i < pcpu_nr_slots; i++)
INIT_LIST_HEAD(&pcpu_slot[i]);
@@ -1322,7 +1325,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
* covers static area + reserved area (mostly used for module
* static percpu allocation).
*/
- schunk = alloc_bootmem(pcpu_chunk_struct_size);
+ schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
INIT_LIST_HEAD(&schunk->list);
schunk->base_addr = base_addr;
schunk->map = smap;
@@ -1346,7 +1349,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
/* init dynamic chunk if necessary */
if (dyn_size) {
- dchunk = alloc_bootmem(pcpu_chunk_struct_size);
+ dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
INIT_LIST_HEAD(&dchunk->list);
dchunk->base_addr = base_addr;
dchunk->map = dmap;
@@ -1626,7 +1629,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
- areas = alloc_bootmem_nopanic(areas_size);
+ areas = memblock_virt_alloc_nopanic(areas_size, 0);
if (!areas) {
rc = -ENOMEM;
goto out_free;
@@ -1712,7 +1715,7 @@ out_free_areas:
out_free:
pcpu_free_alloc_info(ai);
if (areas)
- free_bootmem(__pa(areas), areas_size);
+ memblock_free_early(__pa(areas), areas_size);
return rc;
}
#endif /* BUILD_EMBED_FIRST_CHUNK */
@@ -1760,7 +1763,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
/* unaligned allocations can't be freed, round up to page size */
pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
sizeof(pages[0]));
- pages = alloc_bootmem(pages_size);
+ pages = memblock_virt_alloc(pages_size, 0);
/* allocate pages */
j = 0;
@@ -1823,7 +1826,7 @@ enomem:
free_fn(page_address(pages[j]), PAGE_SIZE);
rc = -ENOMEM;
out_free_ar:
- free_bootmem(__pa(pages), pages_size);
+ memblock_free_early(__pa(pages), pages_size);
pcpu_free_alloc_info(ai);
return rc;
}
@@ -1848,12 +1851,13 @@ EXPORT_SYMBOL(__per_cpu_offset);
static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
size_t align)
{
- return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
+ return memblock_virt_alloc_from_nopanic(
+ size, align, __pa(MAX_DMA_ADDRESS));
}
static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
{
- free_bootmem(__pa(ptr), size);
+ memblock_free_early(__pa(ptr), size);
}
void __init setup_per_cpu_areas(void)
@@ -1896,7 +1900,9 @@ void __init setup_per_cpu_areas(void)
void *fc;
ai = pcpu_alloc_alloc_info(1, 1);
- fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+ fc = memblock_virt_alloc_from_nopanic(unit_size,
+ PAGE_SIZE,
+ __pa(MAX_DMA_ADDRESS));
if (!ai || !fc)
panic("Failed to allocate memory for percpu areas.");
/* kmemleak tracks the percpu allocations separately */
diff --git a/mm/rmap.c b/mm/rmap.c
index 068522d8502a..2dcd3353c3f6 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -660,17 +660,22 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
return 1;
}
+struct page_referenced_arg {
+ int mapcount;
+ int referenced;
+ unsigned long vm_flags;
+ struct mem_cgroup *memcg;
+};
/*
- * Subfunctions of page_referenced: page_referenced_one called
- * repeatedly from either page_referenced_anon or page_referenced_file.
+ * arg: page_referenced_arg will be passed
*/
int page_referenced_one(struct page *page, struct vm_area_struct *vma,
- unsigned long address, unsigned int *mapcount,
- unsigned long *vm_flags)
+ unsigned long address, void *arg)
{
struct mm_struct *mm = vma->vm_mm;
spinlock_t *ptl;
int referenced = 0;
+ struct page_referenced_arg *pra = arg;
if (unlikely(PageTransHuge(page))) {
pmd_t *pmd;
@@ -682,13 +687,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
pmd = page_check_address_pmd(page, mm, address,
PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
if (!pmd)
- goto out;
+ return SWAP_AGAIN;
if (vma->vm_flags & VM_LOCKED) {
spin_unlock(ptl);
- *mapcount = 0; /* break early from loop */
- *vm_flags |= VM_LOCKED;
- goto out;
+ pra->vm_flags |= VM_LOCKED;
+ return SWAP_FAIL; /* To break the loop */
}
/* go ahead even if the pmd is pmd_trans_splitting() */
@@ -704,13 +708,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
*/
pte = page_check_address(page, mm, address, &ptl, 0);
if (!pte)
- goto out;
+ return SWAP_AGAIN;
if (vma->vm_flags & VM_LOCKED) {
pte_unmap_unlock(pte, ptl);
- *mapcount = 0; /* break early from loop */
- *vm_flags |= VM_LOCKED;
- goto out;
+ pra->vm_flags |= VM_LOCKED;
+ return SWAP_FAIL; /* To break the loop */
}
if (ptep_clear_flush_young_notify(vma, address, pte)) {
@@ -727,113 +730,27 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
pte_unmap_unlock(pte, ptl);
}
- (*mapcount)--;
-
- if (referenced)
- *vm_flags |= vma->vm_flags;
-out:
- return referenced;
-}
-
-static int page_referenced_anon(struct page *page,
- struct mem_cgroup *memcg,
- unsigned long *vm_flags)
-{
- unsigned int mapcount;
- struct anon_vma *anon_vma;
- pgoff_t pgoff;
- struct anon_vma_chain *avc;
- int referenced = 0;
-
- anon_vma = page_lock_anon_vma_read(page);
- if (!anon_vma)
- return referenced;
-
- mapcount = page_mapcount(page);
- pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
- struct vm_area_struct *vma = avc->vma;
- unsigned long address = vma_address(page, vma);
- /*
- * If we are reclaiming on behalf of a cgroup, skip
- * counting on behalf of references from different
- * cgroups
- */
- if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
- continue;
- referenced += page_referenced_one(page, vma, address,
- &mapcount, vm_flags);
- if (!mapcount)
- break;
+ if (referenced) {
+ pra->referenced++;
+ pra->vm_flags |= vma->vm_flags;
}
- page_unlock_anon_vma_read(anon_vma);
- return referenced;
+ pra->mapcount--;
+ if (!pra->mapcount)
+ return SWAP_SUCCESS; /* To break the loop */
+
+ return SWAP_AGAIN;
}
-/**
- * page_referenced_file - referenced check for object-based rmap
- * @page: the page we're checking references on.
- * @memcg: target memory control group
- * @vm_flags: collect encountered vma->vm_flags who actually referenced the page
- *
- * For an object-based mapped page, find all the places it is mapped and
- * check/clear the referenced flag. This is done by following the page->mapping
- * pointer, then walking the chain of vmas it holds. It returns the number
- * of references it found.
- *
- * This function is only called from page_referenced for object-based pages.
- */
-static int page_referenced_file(struct page *page,
- struct mem_cgroup *memcg,
- unsigned long *vm_flags)
+static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
{
- unsigned int mapcount;
- struct address_space *mapping = page->mapping;
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- struct vm_area_struct *vma;
- int referenced = 0;
+ struct page_referenced_arg *pra = arg;
+ struct mem_cgroup *memcg = pra->memcg;
- /*
- * The caller's checks on page->mapping and !PageAnon have made
- * sure that this is a file page: the check for page->mapping
- * excludes the case just before it gets set on an anon page.
- */
- BUG_ON(PageAnon(page));
-
- /*
- * The page lock not only makes sure that page->mapping cannot
- * suddenly be NULLified by truncation, it makes sure that the
- * structure at mapping cannot be freed and reused yet,
- * so we can safely take mapping->i_mmap_mutex.
- */
- BUG_ON(!PageLocked(page));
-
- mutex_lock(&mapping->i_mmap_mutex);
-
- /*
- * i_mmap_mutex does not stabilize mapcount at all, but mapcount
- * is more likely to be accurate if we note it after spinning.
- */
- mapcount = page_mapcount(page);
-
- vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
- unsigned long address = vma_address(page, vma);
- /*
- * If we are reclaiming on behalf of a cgroup, skip
- * counting on behalf of references from different
- * cgroups
- */
- if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
- continue;
- referenced += page_referenced_one(page, vma, address,
- &mapcount, vm_flags);
- if (!mapcount)
- break;
- }
+ if (!mm_match_cgroup(vma->vm_mm, memcg))
+ return true;
- mutex_unlock(&mapping->i_mmap_mutex);
- return referenced;
+ return false;
}
/**
@@ -851,41 +768,57 @@ int page_referenced(struct page *page,
struct mem_cgroup *memcg,
unsigned long *vm_flags)
{
- int referenced = 0;
+ int ret;
int we_locked = 0;
+ struct page_referenced_arg pra = {
+ .mapcount = page_mapcount(page),
+ .memcg = memcg,
+ };
+ struct rmap_walk_control rwc = {
+ .rmap_one = page_referenced_one,
+ .arg = (void *)&pra,
+ .anon_lock = page_lock_anon_vma_read,
+ };
*vm_flags = 0;
- if (page_mapped(page) && page_rmapping(page)) {
- if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
- we_locked = trylock_page(page);
- if (!we_locked) {
- referenced++;
- goto out;
- }
- }
- if (unlikely(PageKsm(page)))
- referenced += page_referenced_ksm(page, memcg,
- vm_flags);
- else if (PageAnon(page))
- referenced += page_referenced_anon(page, memcg,
- vm_flags);
- else if (page->mapping)
- referenced += page_referenced_file(page, memcg,
- vm_flags);
- if (we_locked)
- unlock_page(page);
+ if (!page_mapped(page))
+ return 0;
+
+ if (!page_rmapping(page))
+ return 0;
+
+ if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
+ we_locked = trylock_page(page);
+ if (!we_locked)
+ return 1;
}
-out:
- return referenced;
+
+ /*
+ * If we are reclaiming on behalf of a cgroup, skip
+ * counting on behalf of references from different
+ * cgroups
+ */
+ if (memcg) {
+ rwc.invalid_vma = invalid_page_referenced_vma;
+ }
+
+ ret = rmap_walk(page, &rwc);
+ *vm_flags = pra.vm_flags;
+
+ if (we_locked)
+ unlock_page(page);
+
+ return pra.referenced;
}
static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
- unsigned long address)
+ unsigned long address, void *arg)
{
struct mm_struct *mm = vma->vm_mm;
pte_t *pte;
spinlock_t *ptl;
int ret = 0;
+ int *cleaned = arg;
pte = page_check_address(page, mm, address, &ptl, 1);
if (!pte)
@@ -904,44 +837,44 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
pte_unmap_unlock(pte, ptl);
- if (ret)
+ if (ret) {
mmu_notifier_invalidate_page(mm, address);
+ (*cleaned)++;
+ }
out:
- return ret;
+ return SWAP_AGAIN;
}
-static int page_mkclean_file(struct address_space *mapping, struct page *page)
+static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
{
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- struct vm_area_struct *vma;
- int ret = 0;
-
- BUG_ON(PageAnon(page));
+ if (vma->vm_flags & VM_SHARED)
+ return 0;
- mutex_lock(&mapping->i_mmap_mutex);
- vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
- if (vma->vm_flags & VM_SHARED) {
- unsigned long address = vma_address(page, vma);
- ret += page_mkclean_one(page, vma, address);
- }
- }
- mutex_unlock(&mapping->i_mmap_mutex);
- return ret;
+ return 1;
}
int page_mkclean(struct page *page)
{
- int ret = 0;
+ int cleaned = 0;
+ struct address_space *mapping;
+ struct rmap_walk_control rwc = {
+ .arg = (void *)&cleaned,
+ .rmap_one = page_mkclean_one,
+ .invalid_vma = invalid_mkclean_vma,
+ };
BUG_ON(!PageLocked(page));
- if (page_mapped(page)) {
- struct address_space *mapping = page_mapping(page);
- if (mapping)
- ret = page_mkclean_file(mapping, page);
- }
+ if (!page_mapped(page))
+ return 0;
- return ret;
+ mapping = page_mapping(page);
+ if (!mapping)
+ return 0;
+
+ rmap_walk(page, &rwc);
+
+ return cleaned;
}
EXPORT_SYMBOL_GPL(page_mkclean);
@@ -961,9 +894,9 @@ void page_move_anon_rmap(struct page *page,
{
struct anon_vma *anon_vma = vma->anon_vma;
- VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON(!anon_vma);
- VM_BUG_ON(page->index != linear_page_index(vma, address));
+ VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page);
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
page->mapping = (struct address_space *) anon_vma;
@@ -1062,7 +995,7 @@ void do_page_add_anon_rmap(struct page *page,
if (unlikely(PageKsm(page)))
return;
- VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
/* address might be in next vma when migration races vma_adjust */
if (first)
__page_set_anon_rmap(page, vma, address, exclusive);
@@ -1177,17 +1110,17 @@ out:
}
/*
- * Subfunctions of try_to_unmap: try_to_unmap_one called
- * repeatedly from try_to_unmap_ksm, try_to_unmap_anon or try_to_unmap_file.
+ * @arg: enum ttu_flags will be passed to this argument
*/
int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
- unsigned long address, enum ttu_flags flags)
+ unsigned long address, void *arg)
{
struct mm_struct *mm = vma->vm_mm;
pte_t *pte;
pte_t pteval;
spinlock_t *ptl;
int ret = SWAP_AGAIN;
+ enum ttu_flags flags = (enum ttu_flags)arg;
pte = page_check_address(page, mm, address, &ptl, 0);
if (!pte)
@@ -1426,124 +1359,18 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
return ret;
}
-bool is_vma_temporary_stack(struct vm_area_struct *vma)
-{
- int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
-
- if (!maybe_stack)
- return false;
-
- if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
- VM_STACK_INCOMPLETE_SETUP)
- return true;
-
- return false;
-}
-
-/**
- * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
- * rmap method
- * @page: the page to unmap/unlock
- * @flags: action and flags
- *
- * Find all the mappings of a page using the mapping pointer and the vma chains
- * contained in the anon_vma struct it points to.
- *
- * This function is only called from try_to_unmap/try_to_munlock for
- * anonymous pages.
- * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
- * where the page was found will be held for write. So, we won't recheck
- * vm_flags for that VMA. That should be OK, because that vma shouldn't be
- * 'LOCKED.
- */
-static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
-{
- struct anon_vma *anon_vma;
- pgoff_t pgoff;
- struct anon_vma_chain *avc;
- int ret = SWAP_AGAIN;
-
- anon_vma = page_lock_anon_vma_read(page);
- if (!anon_vma)
- return ret;
-
- pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
- struct vm_area_struct *vma = avc->vma;
- unsigned long address;
-
- /*
- * During exec, a temporary VMA is setup and later moved.
- * The VMA is moved under the anon_vma lock but not the
- * page tables leading to a race where migration cannot
- * find the migration ptes. Rather than increasing the
- * locking requirements of exec(), migration skips
- * temporary VMAs until after exec() completes.
- */
- if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
- is_vma_temporary_stack(vma))
- continue;
-
- address = vma_address(page, vma);
- ret = try_to_unmap_one(page, vma, address, flags);
- if (ret != SWAP_AGAIN || !page_mapped(page))
- break;
- }
-
- page_unlock_anon_vma_read(anon_vma);
- return ret;
-}
-
-/**
- * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
- * @page: the page to unmap/unlock
- * @flags: action and flags
- *
- * Find all the mappings of a page using the mapping pointer and the vma chains
- * contained in the address_space struct it points to.
- *
- * This function is only called from try_to_unmap/try_to_munlock for
- * object-based pages.
- * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
- * where the page was found will be held for write. So, we won't recheck
- * vm_flags for that VMA. That should be OK, because that vma shouldn't be
- * 'LOCKED.
- */
-static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
+static int try_to_unmap_nonlinear(struct page *page,
+ struct address_space *mapping, struct vm_area_struct *vma)
{
- struct address_space *mapping = page->mapping;
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- struct vm_area_struct *vma;
int ret = SWAP_AGAIN;
unsigned long cursor;
unsigned long max_nl_cursor = 0;
unsigned long max_nl_size = 0;
unsigned int mapcount;
- if (PageHuge(page))
- pgoff = page->index << compound_order(page);
-
- mutex_lock(&mapping->i_mmap_mutex);
- vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
- unsigned long address = vma_address(page, vma);
- ret = try_to_unmap_one(page, vma, address, flags);
- if (ret != SWAP_AGAIN || !page_mapped(page))
- goto out;
- }
-
- if (list_empty(&mapping->i_mmap_nonlinear))
- goto out;
-
- /*
- * We don't bother to try to find the munlocked page in nonlinears.
- * It's costly. Instead, later, page reclaim logic may call
- * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily.
- */
- if (TTU_ACTION(flags) == TTU_MUNLOCK)
- goto out;
+ list_for_each_entry(vma,
+ &mapping->i_mmap_nonlinear, shared.nonlinear) {
- list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
- shared.nonlinear) {
cursor = (unsigned long) vma->vm_private_data;
if (cursor > max_nl_cursor)
max_nl_cursor = cursor;
@@ -1553,8 +1380,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
}
if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */
- ret = SWAP_FAIL;
- goto out;
+ return SWAP_FAIL;
}
/*
@@ -1566,7 +1392,8 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
*/
mapcount = page_mapcount(page);
if (!mapcount)
- goto out;
+ return ret;
+
cond_resched();
max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
@@ -1574,10 +1401,11 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
max_nl_cursor = CLUSTER_SIZE;
do {
- list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
- shared.nonlinear) {
+ list_for_each_entry(vma,
+ &mapping->i_mmap_nonlinear, shared.nonlinear) {
+
cursor = (unsigned long) vma->vm_private_data;
- while ( cursor < max_nl_cursor &&
+ while (cursor < max_nl_cursor &&
cursor < vma->vm_end - vma->vm_start) {
if (try_to_unmap_cluster(cursor, &mapcount,
vma, page) == SWAP_MLOCK)
@@ -1585,7 +1413,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
cursor += CLUSTER_SIZE;
vma->vm_private_data = (void *) cursor;
if ((int)mapcount <= 0)
- goto out;
+ return ret;
}
vma->vm_private_data = (void *) max_nl_cursor;
}
@@ -1600,11 +1428,34 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
*/
list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear)
vma->vm_private_data = NULL;
-out:
- mutex_unlock(&mapping->i_mmap_mutex);
+
return ret;
}
+bool is_vma_temporary_stack(struct vm_area_struct *vma)
+{
+ int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
+
+ if (!maybe_stack)
+ return false;
+
+ if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
+ VM_STACK_INCOMPLETE_SETUP)
+ return true;
+
+ return false;
+}
+
+static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
+{
+ return is_vma_temporary_stack(vma);
+}
+
+static int page_not_mapped(struct page *page)
+{
+ return !page_mapped(page);
+};
+
/**
* try_to_unmap - try to remove all page table mappings to a page
* @page: the page to get unmapped
@@ -1622,16 +1473,29 @@ out:
int try_to_unmap(struct page *page, enum ttu_flags flags)
{
int ret;
+ struct rmap_walk_control rwc = {
+ .rmap_one = try_to_unmap_one,
+ .arg = (void *)flags,
+ .done = page_not_mapped,
+ .file_nonlinear = try_to_unmap_nonlinear,
+ .anon_lock = page_lock_anon_vma_read,
+ };
- BUG_ON(!PageLocked(page));
- VM_BUG_ON(!PageHuge(page) && PageTransHuge(page));
+ VM_BUG_ON_PAGE(!PageHuge(page) && PageTransHuge(page), page);
+
+ /*
+ * During exec, a temporary VMA is setup and later moved.
+ * The VMA is moved under the anon_vma lock but not the
+ * page tables leading to a race where migration cannot
+ * find the migration ptes. Rather than increasing the
+ * locking requirements of exec(), migration skips
+ * temporary VMAs until after exec() completes.
+ */
+ if (flags & TTU_MIGRATION && !PageKsm(page) && PageAnon(page))
+ rwc.invalid_vma = invalid_migration_vma;
+
+ ret = rmap_walk(page, &rwc);
- if (unlikely(PageKsm(page)))
- ret = try_to_unmap_ksm(page, flags);
- else if (PageAnon(page))
- ret = try_to_unmap_anon(page, flags);
- else
- ret = try_to_unmap_file(page, flags);
if (ret != SWAP_MLOCK && !page_mapped(page))
ret = SWAP_SUCCESS;
return ret;
@@ -1654,14 +1518,25 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
*/
int try_to_munlock(struct page *page)
{
- VM_BUG_ON(!PageLocked(page) || PageLRU(page));
+ int ret;
+ struct rmap_walk_control rwc = {
+ .rmap_one = try_to_unmap_one,
+ .arg = (void *)TTU_MUNLOCK,
+ .done = page_not_mapped,
+ /*
+ * We don't bother to try to find the munlocked page in
+ * nonlinears. It's costly. Instead, later, page reclaim logic
+ * may call try_to_unmap() and recover PG_mlocked lazily.
+ */
+ .file_nonlinear = NULL,
+ .anon_lock = page_lock_anon_vma_read,
- if (unlikely(PageKsm(page)))
- return try_to_unmap_ksm(page, TTU_MUNLOCK);
- else if (PageAnon(page))
- return try_to_unmap_anon(page, TTU_MUNLOCK);
- else
- return try_to_unmap_file(page, TTU_MUNLOCK);
+ };
+
+ VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);
+
+ ret = rmap_walk(page, &rwc);
+ return ret;
}
void __put_anon_vma(struct anon_vma *anon_vma)
@@ -1674,18 +1549,13 @@ void __put_anon_vma(struct anon_vma *anon_vma)
anon_vma_free(anon_vma);
}
-#ifdef CONFIG_MIGRATION
-/*
- * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file():
- * Called by migrate.c to remove migration ptes, but might be used more later.
- */
-static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
- struct vm_area_struct *, unsigned long, void *), void *arg)
+static struct anon_vma *rmap_walk_anon_lock(struct page *page,
+ struct rmap_walk_control *rwc)
{
struct anon_vma *anon_vma;
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- struct anon_vma_chain *avc;
- int ret = SWAP_AGAIN;
+
+ if (rwc->anon_lock)
+ return rwc->anon_lock(page);
/*
* Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
@@ -1695,58 +1565,120 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
*/
anon_vma = page_anon_vma(page);
if (!anon_vma)
- return ret;
+ return NULL;
+
anon_vma_lock_read(anon_vma);
+ return anon_vma;
+}
+
+/*
+ * rmap_walk_anon - do something to anonymous page using the object-based
+ * rmap method
+ * @page: the page to be handled
+ * @rwc: control variable according to each walk type
+ *
+ * Find all the mappings of a page using the mapping pointer and the vma chains
+ * contained in the anon_vma struct it points to.
+ *
+ * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
+ * where the page was found will be held for write. So, we won't recheck
+ * vm_flags for that VMA. That should be OK, because that vma shouldn't be
+ * LOCKED.
+ */
+static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
+{
+ struct anon_vma *anon_vma;
+ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ struct anon_vma_chain *avc;
+ int ret = SWAP_AGAIN;
+
+ anon_vma = rmap_walk_anon_lock(page, rwc);
+ if (!anon_vma)
+ return ret;
+
anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
- ret = rmap_one(page, vma, address, arg);
+
+ if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
+ continue;
+
+ ret = rwc->rmap_one(page, vma, address, rwc->arg);
if (ret != SWAP_AGAIN)
break;
+ if (rwc->done && rwc->done(page))
+ break;
}
anon_vma_unlock_read(anon_vma);
return ret;
}
-static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
- struct vm_area_struct *, unsigned long, void *), void *arg)
+/*
+ * rmap_walk_file - do something to file page using the object-based rmap method
+ * @page: the page to be handled
+ * @rwc: control variable according to each walk type
+ *
+ * Find all the mappings of a page using the mapping pointer and the vma chains
+ * contained in the address_space struct it points to.
+ *
+ * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
+ * where the page was found will be held for write. So, we won't recheck
+ * vm_flags for that VMA. That should be OK, because that vma shouldn't be
+ * LOCKED.
+ */
+static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
{
struct address_space *mapping = page->mapping;
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ pgoff_t pgoff = page->index << compound_order(page);
struct vm_area_struct *vma;
int ret = SWAP_AGAIN;
+ /*
+ * The page lock not only makes sure that page->mapping cannot
+ * suddenly be NULLified by truncation, it makes sure that the
+ * structure at mapping cannot be freed and reused yet,
+ * so we can safely take mapping->i_mmap_mutex.
+ */
+ VM_BUG_ON(!PageLocked(page));
+
if (!mapping)
return ret;
mutex_lock(&mapping->i_mmap_mutex);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
unsigned long address = vma_address(page, vma);
- ret = rmap_one(page, vma, address, arg);
+
+ if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
+ continue;
+
+ ret = rwc->rmap_one(page, vma, address, rwc->arg);
if (ret != SWAP_AGAIN)
- break;
+ goto done;
+ if (rwc->done && rwc->done(page))
+ goto done;
}
- /*
- * No nonlinear handling: being always shared, nonlinear vmas
- * never contain migration ptes. Decide what to do about this
- * limitation to linear when we need rmap_walk() on nonlinear.
- */
+
+ if (!rwc->file_nonlinear)
+ goto done;
+
+ if (list_empty(&mapping->i_mmap_nonlinear))
+ goto done;
+
+ ret = rwc->file_nonlinear(page, mapping, vma);
+
+done:
mutex_unlock(&mapping->i_mmap_mutex);
return ret;
}
-int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
- struct vm_area_struct *, unsigned long, void *), void *arg)
+int rmap_walk(struct page *page, struct rmap_walk_control *rwc)
{
- VM_BUG_ON(!PageLocked(page));
-
if (unlikely(PageKsm(page)))
- return rmap_walk_ksm(page, rmap_one, arg);
+ return rmap_walk_ksm(page, rwc);
else if (PageAnon(page))
- return rmap_walk_anon(page, rmap_one, arg);
+ return rmap_walk_anon(page, rwc);
else
- return rmap_walk_file(page, rmap_one, arg);
+ return rmap_walk_file(page, rwc);
}
-#endif /* CONFIG_MIGRATION */
#ifdef CONFIG_HUGETLB_PAGE
/*
diff --git a/mm/shmem.c b/mm/shmem.c
index 902a14842b74..8156f95ec0cf 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -285,8 +285,8 @@ static int shmem_add_to_page_cache(struct page *page,
{
int error;
- VM_BUG_ON(!PageLocked(page));
- VM_BUG_ON(!PageSwapBacked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
page_cache_get(page);
page->mapping = mapping;
@@ -491,7 +491,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
continue;
if (!unfalloc || !PageUptodate(page)) {
if (page->mapping == mapping) {
- VM_BUG_ON(PageWriteback(page));
+ VM_BUG_ON_PAGE(PageWriteback(page), page);
truncate_inode_page(mapping, page);
}
}
@@ -568,7 +568,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
lock_page(page);
if (!unfalloc || !PageUptodate(page)) {
if (page->mapping == mapping) {
- VM_BUG_ON(PageWriteback(page));
+ VM_BUG_ON_PAGE(PageWriteback(page), page);
truncate_inode_page(mapping, page);
}
}
diff --git a/mm/slab.h b/mm/slab.h
index 0859c4241ba1..8184a7cde272 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -160,12 +160,36 @@ static inline const char *cache_name(struct kmem_cache *s)
return s->name;
}
+/*
+ * Note, we protect with RCU only the memcg_caches array, not per-memcg caches.
+ * That said the caller must assure the memcg's cache won't go away. Since once
+ * created a memcg's cache is destroyed only along with the root cache, it is
+ * true if we are going to allocate from the cache or hold a reference to the
+ * root cache by other means. Otherwise, we should hold either the slab_mutex
+ * or the memcg's slab_caches_mutex while calling this function and accessing
+ * the returned value.
+ */
static inline struct kmem_cache *
cache_from_memcg_idx(struct kmem_cache *s, int idx)
{
+ struct kmem_cache *cachep;
+ struct memcg_cache_params *params;
+
if (!s->memcg_params)
return NULL;
- return s->memcg_params->memcg_caches[idx];
+
+ rcu_read_lock();
+ params = rcu_dereference(s->memcg_params);
+ cachep = params->memcg_caches[idx];
+ rcu_read_unlock();
+
+ /*
+ * Make sure we will access the up-to-date value. The code updating
+ * memcg_caches issues a write barrier to match this (see
+ * memcg_register_cache()).
+ */
+ smp_read_barrier_depends();
+ return cachep;
}
static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 0b7bb399b0e4..8e40321da091 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -171,13 +171,26 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
struct kmem_cache *parent_cache)
{
struct kmem_cache *s = NULL;
- int err = 0;
+ int err;
get_online_cpus();
mutex_lock(&slab_mutex);
- if (!kmem_cache_sanity_check(memcg, name, size) == 0)
- goto out_locked;
+ err = kmem_cache_sanity_check(memcg, name, size);
+ if (err)
+ goto out_unlock;
+
+ if (memcg) {
+ /*
+ * Since per-memcg caches are created asynchronously on first
+ * allocation (see memcg_kmem_get_cache()), several threads can
+ * try to create the same cache, but only one of them may
+ * succeed. Therefore if we get here and see the cache has
+ * already been created, we silently return NULL.
+ */
+ if (cache_from_memcg_idx(parent_cache, memcg_cache_id(memcg)))
+ goto out_unlock;
+ }
/*
* Some allocators will constraint the set of valid flags to a subset
@@ -189,45 +202,45 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
s = __kmem_cache_alias(memcg, name, size, align, flags, ctor);
if (s)
- goto out_locked;
+ goto out_unlock;
+ err = -ENOMEM;
s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
- if (s) {
- s->object_size = s->size = size;
- s->align = calculate_alignment(flags, align, size);
- s->ctor = ctor;
+ if (!s)
+ goto out_unlock;
- if (memcg_register_cache(memcg, s, parent_cache)) {
- kmem_cache_free(kmem_cache, s);
- err = -ENOMEM;
- goto out_locked;
- }
+ s->object_size = s->size = size;
+ s->align = calculate_alignment(flags, align, size);
+ s->ctor = ctor;
- s->name = kstrdup(name, GFP_KERNEL);
- if (!s->name) {
- kmem_cache_free(kmem_cache, s);
- err = -ENOMEM;
- goto out_locked;
- }
+ s->name = kstrdup(name, GFP_KERNEL);
+ if (!s->name)
+ goto out_free_cache;
- err = __kmem_cache_create(s, flags);
- if (!err) {
- s->refcount = 1;
- list_add(&s->list, &slab_caches);
- memcg_cache_list_add(memcg, s);
- } else {
- kfree(s->name);
- kmem_cache_free(kmem_cache, s);
- }
- } else
- err = -ENOMEM;
+ err = memcg_alloc_cache_params(memcg, s, parent_cache);
+ if (err)
+ goto out_free_cache;
+
+ err = __kmem_cache_create(s, flags);
+ if (err)
+ goto out_free_cache;
-out_locked:
+ s->refcount = 1;
+ list_add(&s->list, &slab_caches);
+ memcg_register_cache(s);
+
+out_unlock:
mutex_unlock(&slab_mutex);
put_online_cpus();
- if (err) {
-
+ /*
+ * There is no point in flooding logs with warnings or especially
+ * crashing the system if we fail to create a cache for a memcg. In
+ * this case we will be accounting the memcg allocation to the root
+ * cgroup until we succeed to create its own cache, but it isn't that
+ * critical.
+ */
+ if (err && !memcg) {
if (flags & SLAB_PANIC)
panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
name, err);
@@ -236,11 +249,15 @@ out_locked:
name, err);
dump_stack();
}
-
return NULL;
}
-
return s;
+
+out_free_cache:
+ memcg_free_cache_params(s);
+ kfree(s->name);
+ kmem_cache_free(kmem_cache, s);
+ goto out_unlock;
}
struct kmem_cache *
@@ -263,11 +280,12 @@ void kmem_cache_destroy(struct kmem_cache *s)
list_del(&s->list);
if (!__kmem_cache_shutdown(s)) {
+ memcg_unregister_cache(s);
mutex_unlock(&slab_mutex);
if (s->flags & SLAB_DESTROY_BY_RCU)
rcu_barrier();
- memcg_release_cache(s);
+ memcg_free_cache_params(s);
kfree(s->name);
kmem_cache_free(kmem_cache, s);
} else {
diff --git a/mm/slub.c b/mm/slub.c
index a99e9e67c60e..d2388c850b65 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1558,7 +1558,7 @@ static inline void *acquire_slab(struct kmem_cache *s,
new.freelist = freelist;
}
- VM_BUG_ON(new.frozen);
+ VM_BUG_ON_PAGE(new.frozen, &new);
new.frozen = 1;
if (!__cmpxchg_double_slab(s, page,
@@ -1811,7 +1811,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
set_freepointer(s, freelist, prior);
new.counters = counters;
new.inuse--;
- VM_BUG_ON(!new.frozen);
+ VM_BUG_ON_PAGE(!new.frozen, &new);
} while (!__cmpxchg_double_slab(s, page,
prior, counters,
@@ -1839,7 +1839,7 @@ redo:
old.freelist = page->freelist;
old.counters = page->counters;
- VM_BUG_ON(!old.frozen);
+ VM_BUG_ON_PAGE(!old.frozen, &old);
/* Determine target state of the slab */
new.counters = old.counters;
@@ -1951,7 +1951,7 @@ static void unfreeze_partials(struct kmem_cache *s,
old.freelist = page->freelist;
old.counters = page->counters;
- VM_BUG_ON(!old.frozen);
+ VM_BUG_ON_PAGE(!old.frozen, &old);
new.counters = old.counters;
new.freelist = old.freelist;
@@ -2224,7 +2224,7 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
counters = page->counters;
new.counters = counters;
- VM_BUG_ON(!new.frozen);
+ VM_BUG_ON_PAGE(!new.frozen, &new);
new.inuse = page->objects;
new.frozen = freelist != NULL;
@@ -2318,7 +2318,7 @@ load_freelist:
* page is pointing to the page from which the objects are obtained.
* That page must be frozen for per cpu allocations to work.
*/
- VM_BUG_ON(!c->page->frozen);
+ VM_BUG_ON_PAGE(!c->page->frozen, c->page);
c->freelist = get_freepointer(s, freelist);
c->tid = next_tid(c->tid);
local_irq_restore(flags);
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 27eeab3be757..4cba9c2783a1 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -40,7 +40,8 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node,
unsigned long align,
unsigned long goal)
{
- return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal);
+ return memblock_virt_alloc_try_nid(size, align, goal,
+ BOOTMEM_ALLOC_ACCESSIBLE, node);
}
static void *vmemmap_buf;
@@ -226,7 +227,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
if (vmemmap_buf_start) {
/* need to free left buf */
- free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf);
+ memblock_free_early(__pa(vmemmap_buf),
+ vmemmap_buf_end - vmemmap_buf);
vmemmap_buf = NULL;
vmemmap_buf_end = NULL;
}
diff --git a/mm/sparse.c b/mm/sparse.c
index 8cc7be0e9590..63c3ea5c119c 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -69,7 +69,7 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid)
else
section = kzalloc(array_size, GFP_KERNEL);
} else {
- section = alloc_bootmem_node(NODE_DATA(nid), array_size);
+ section = memblock_virt_alloc_node(array_size, nid);
}
return section;
@@ -279,8 +279,9 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
limit = goal + (1UL << PA_SECTION_SHIFT);
nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
again:
- p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
- SMP_CACHE_BYTES, goal, limit);
+ p = memblock_virt_alloc_try_nid_nopanic(size,
+ SMP_CACHE_BYTES, goal, limit,
+ nid);
if (!p && limit) {
limit = 0;
goto again;
@@ -331,7 +332,7 @@ static unsigned long * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
unsigned long size)
{
- return alloc_bootmem_node_nopanic(pgdat, size);
+ return memblock_virt_alloc_node_nopanic(size, pgdat->node_id);
}
static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -376,8 +377,9 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
return map;
size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
- map = __alloc_bootmem_node_high(NODE_DATA(nid), size,
- PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+ map = memblock_virt_alloc_try_nid(size,
+ PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
+ BOOTMEM_ALLOC_ACCESSIBLE, nid);
return map;
}
void __init sparse_mem_maps_populate_node(struct page **map_map,
@@ -401,8 +403,9 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
}
size = PAGE_ALIGN(size);
- map = __alloc_bootmem_node_high(NODE_DATA(nodeid), size * map_count,
- PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+ map = memblock_virt_alloc_try_nid(size * map_count,
+ PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
+ BOOTMEM_ALLOC_ACCESSIBLE, nodeid);
if (map) {
for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
if (!present_section_nr(pnum))
@@ -545,7 +548,7 @@ void __init sparse_init(void)
* sparse_early_mem_map_alloc, so allocate usemap_map at first.
*/
size = sizeof(unsigned long *) * NR_MEM_SECTIONS;
- usemap_map = alloc_bootmem(size);
+ usemap_map = memblock_virt_alloc(size, 0);
if (!usemap_map)
panic("can not allocate usemap_map\n");
alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node,
@@ -553,7 +556,7 @@ void __init sparse_init(void)
#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
size2 = sizeof(struct page *) * NR_MEM_SECTIONS;
- map_map = alloc_bootmem(size2);
+ map_map = memblock_virt_alloc(size2, 0);
if (!map_map)
panic("can not allocate map_map\n");
alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node,
@@ -583,9 +586,9 @@ void __init sparse_init(void)
vmemmap_populate_print_last();
#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
- free_bootmem(__pa(map_map), size2);
+ memblock_free_early(__pa(map_map), size2);
#endif
- free_bootmem(__pa(usemap_map), size);
+ memblock_free_early(__pa(usemap_map), size);
}
#ifdef CONFIG_MEMORY_HOTPLUG
diff --git a/mm/swap.c b/mm/swap.c
index 84b26aaabd03..b31ba67d440a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -31,7 +31,6 @@
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
-#include <linux/hugetlb.h>
#include "internal.h"
@@ -58,7 +57,7 @@ static void __page_cache_release(struct page *page)
spin_lock_irqsave(&zone->lru_lock, flags);
lruvec = mem_cgroup_page_lruvec(page, zone);
- VM_BUG_ON(!PageLRU(page));
+ VM_BUG_ON_PAGE(!PageLRU(page), page);
__ClearPageLRU(page);
del_page_from_lru_list(page, lruvec, page_off_lru(page));
spin_unlock_irqrestore(&zone->lru_lock, flags);
@@ -82,118 +81,150 @@ static void __put_compound_page(struct page *page)
static void put_compound_page(struct page *page)
{
- if (unlikely(PageTail(page))) {
- /* __split_huge_page_refcount can run under us */
- struct page *page_head = compound_trans_head(page);
-
- if (likely(page != page_head &&
- get_page_unless_zero(page_head))) {
- unsigned long flags;
+ struct page *page_head;
+ if (likely(!PageTail(page))) {
+ if (put_page_testzero(page)) {
/*
- * THP can not break up slab pages so avoid taking
- * compound_lock(). Slab performs non-atomic bit ops
- * on page->flags for better performance. In particular
- * slab_unlock() in slub used to be a hot path. It is
- * still hot on arches that do not support
- * this_cpu_cmpxchg_double().
+ * By the time all refcounts have been released
+ * split_huge_page cannot run anymore from under us.
*/
- if (PageSlab(page_head) || PageHeadHuge(page_head)) {
- if (likely(PageTail(page))) {
- /*
- * __split_huge_page_refcount
- * cannot race here.
- */
- VM_BUG_ON(!PageHead(page_head));
- atomic_dec(&page->_mapcount);
- if (put_page_testzero(page_head))
- VM_BUG_ON(1);
- if (put_page_testzero(page_head))
- __put_compound_page(page_head);
- return;
- } else
- /*
- * __split_huge_page_refcount
- * run before us, "page" was a
- * THP tail. The split
- * page_head has been freed
- * and reallocated as slab or
- * hugetlbfs page of smaller
- * order (only possible if
- * reallocated as slab on
- * x86).
- */
- goto skip_lock;
- }
+ if (PageHead(page))
+ __put_compound_page(page);
+ else
+ __put_single_page(page);
+ }
+ return;
+ }
+
+ /* __split_huge_page_refcount can run under us */
+ page_head = compound_trans_head(page);
+
+ /*
+ * THP can not break up slab pages so avoid taking
+ * compound_lock() and skip the tail page refcounting (in
+ * _mapcount) too. Slab performs non-atomic bit ops on
+ * page->flags for better performance. In particular
+ * slab_unlock() in slub used to be a hot path. It is still
+ * hot on arches that do not support
+ * this_cpu_cmpxchg_double().
+ *
+ * If "page" is part of a slab or hugetlbfs page it cannot be
+ * splitted and the head page cannot change from under us. And
+ * if "page" is part of a THP page under splitting, if the
+ * head page pointed by the THP tail isn't a THP head anymore,
+ * we'll find PageTail clear after smp_rmb() and we'll treat
+ * it as a single page.
+ */
+ if (!__compound_tail_refcounted(page_head)) {
+ /*
+ * If "page" is a THP tail, we must read the tail page
+ * flags after the head page flags. The
+ * split_huge_page side enforces write memory barriers
+ * between clearing PageTail and before the head page
+ * can be freed and reallocated.
+ */
+ smp_rmb();
+ if (likely(PageTail(page))) {
/*
- * page_head wasn't a dangling pointer but it
- * may not be a head page anymore by the time
- * we obtain the lock. That is ok as long as it
- * can't be freed from under us.
+ * __split_huge_page_refcount cannot race
+ * here.
*/
- flags = compound_lock_irqsave(page_head);
- if (unlikely(!PageTail(page))) {
- /* __split_huge_page_refcount run before us */
- compound_unlock_irqrestore(page_head, flags);
-skip_lock:
- if (put_page_testzero(page_head)) {
- /*
- * The head page may have been
- * freed and reallocated as a
- * compound page of smaller
- * order and then freed again.
- * All we know is that it
- * cannot have become: a THP
- * page, a compound page of
- * higher order, a tail page.
- * That is because we still
- * hold the refcount of the
- * split THP tail and
- * page_head was the THP head
- * before the split.
- */
- if (PageHead(page_head))
- __put_compound_page(page_head);
- else
- __put_single_page(page_head);
- }
-out_put_single:
- if (put_page_testzero(page))
- __put_single_page(page);
- return;
+ VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
+ VM_BUG_ON_PAGE(page_mapcount(page) != 0, page);
+ if (put_page_testzero(page_head)) {
+ /*
+ * If this is the tail of a slab
+ * compound page, the tail pin must
+ * not be the last reference held on
+ * the page, because the PG_slab
+ * cannot be cleared before all tail
+ * pins (which skips the _mapcount
+ * tail refcounting) have been
+ * released. For hugetlbfs the tail
+ * pin may be the last reference on
+ * the page instead, because
+ * PageHeadHuge will not go away until
+ * the compound page enters the buddy
+ * allocator.
+ */
+ VM_BUG_ON_PAGE(PageSlab(page_head), page_head);
+ __put_compound_page(page_head);
}
- VM_BUG_ON(page_head != page->first_page);
+ return;
+ } else
/*
- * We can release the refcount taken by
- * get_page_unless_zero() now that
- * __split_huge_page_refcount() is blocked on
- * the compound_lock.
+ * __split_huge_page_refcount run before us,
+ * "page" was a THP tail. The split page_head
+ * has been freed and reallocated as slab or
+ * hugetlbfs page of smaller order (only
+ * possible if reallocated as slab on x86).
*/
- if (put_page_testzero(page_head))
- VM_BUG_ON(1);
- /* __split_huge_page_refcount will wait now */
- VM_BUG_ON(page_mapcount(page) <= 0);
- atomic_dec(&page->_mapcount);
- VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
- VM_BUG_ON(atomic_read(&page->_count) != 0);
- compound_unlock_irqrestore(page_head, flags);
+ goto out_put_single;
+ }
+ if (likely(page != page_head && get_page_unless_zero(page_head))) {
+ unsigned long flags;
+
+ /*
+ * page_head wasn't a dangling pointer but it may not
+ * be a head page anymore by the time we obtain the
+ * lock. That is ok as long as it can't be freed from
+ * under us.
+ */
+ flags = compound_lock_irqsave(page_head);
+ if (unlikely(!PageTail(page))) {
+ /* __split_huge_page_refcount run before us */
+ compound_unlock_irqrestore(page_head, flags);
if (put_page_testzero(page_head)) {
+ /*
+ * The head page may have been freed
+ * and reallocated as a compound page
+ * of smaller order and then freed
+ * again. All we know is that it
+ * cannot have become: a THP page, a
+ * compound page of higher order, a
+ * tail page. That is because we
+ * still hold the refcount of the
+ * split THP tail and page_head was
+ * the THP head before the split.
+ */
if (PageHead(page_head))
__put_compound_page(page_head);
else
__put_single_page(page_head);
}
- } else {
- /* page_head is a dangling pointer */
- VM_BUG_ON(PageTail(page));
- goto out_put_single;
+out_put_single:
+ if (put_page_testzero(page))
+ __put_single_page(page);
+ return;
}
- } else if (put_page_testzero(page)) {
- if (PageHead(page))
- __put_compound_page(page);
- else
- __put_single_page(page);
+ VM_BUG_ON_PAGE(page_head != page->first_page, page);
+ /*
+ * We can release the refcount taken by
+ * get_page_unless_zero() now that
+ * __split_huge_page_refcount() is blocked on the
+ * compound_lock.
+ */
+ if (put_page_testzero(page_head))
+ VM_BUG_ON_PAGE(1, page_head);
+ /* __split_huge_page_refcount will wait now */
+ VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page);
+ atomic_dec(&page->_mapcount);
+ VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head);
+ VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page);
+ compound_unlock_irqrestore(page_head, flags);
+
+ if (put_page_testzero(page_head)) {
+ if (PageHead(page_head))
+ __put_compound_page(page_head);
+ else
+ __put_single_page(page_head);
+ }
+ } else {
+ /* page_head is a dangling pointer */
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ goto out_put_single;
}
}
@@ -221,36 +252,37 @@ bool __get_page_tail(struct page *page)
* split_huge_page().
*/
unsigned long flags;
- bool got = false;
+ bool got;
struct page *page_head = compound_trans_head(page);
- if (likely(page != page_head && get_page_unless_zero(page_head))) {
- /* Ref to put_compound_page() comment. */
- if (PageSlab(page_head) || PageHeadHuge(page_head)) {
- if (likely(PageTail(page))) {
- /*
- * This is a hugetlbfs page or a slab
- * page. __split_huge_page_refcount
- * cannot race here.
- */
- VM_BUG_ON(!PageHead(page_head));
- __get_page_tail_foll(page, false);
- return true;
- } else {
- /*
- * __split_huge_page_refcount run
- * before us, "page" was a THP
- * tail. The split page_head has been
- * freed and reallocated as slab or
- * hugetlbfs page of smaller order
- * (only possible if reallocated as
- * slab on x86).
- */
- put_page(page_head);
- return false;
- }
+ /* Ref to put_compound_page() comment. */
+ if (!__compound_tail_refcounted(page_head)) {
+ smp_rmb();
+ if (likely(PageTail(page))) {
+ /*
+ * This is a hugetlbfs page or a slab
+ * page. __split_huge_page_refcount
+ * cannot race here.
+ */
+ VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
+ __get_page_tail_foll(page, true);
+ return true;
+ } else {
+ /*
+ * __split_huge_page_refcount run
+ * before us, "page" was a THP
+ * tail. The split page_head has been
+ * freed and reallocated as slab or
+ * hugetlbfs page of smaller order
+ * (only possible if reallocated as
+ * slab on x86).
+ */
+ return false;
}
+ }
+ got = false;
+ if (likely(page != page_head && get_page_unless_zero(page_head))) {
/*
* page_head wasn't a dangling pointer but it
* may not be a head page anymore by the time
@@ -572,8 +604,8 @@ EXPORT_SYMBOL(__lru_cache_add);
*/
void lru_cache_add(struct page *page)
{
- VM_BUG_ON(PageActive(page) && PageUnevictable(page));
- VM_BUG_ON(PageLRU(page));
+ VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
+ VM_BUG_ON_PAGE(PageLRU(page), page);
__lru_cache_add(page);
}
@@ -814,7 +846,7 @@ void release_pages(struct page **pages, int nr, int cold)
}
lruvec = mem_cgroup_page_lruvec(page, zone);
- VM_BUG_ON(!PageLRU(page));
+ VM_BUG_ON_PAGE(!PageLRU(page), page);
__ClearPageLRU(page);
del_page_from_lru_list(page, lruvec, page_off_lru(page));
}
@@ -856,9 +888,9 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
{
const int file = 0;
- VM_BUG_ON(!PageHead(page));
- VM_BUG_ON(PageCompound(page_tail));
- VM_BUG_ON(PageLRU(page_tail));
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+ VM_BUG_ON_PAGE(PageCompound(page_tail), page);
+ VM_BUG_ON_PAGE(PageLRU(page_tail), page);
VM_BUG_ON(NR_CPUS != 1 &&
!spin_is_locked(&lruvec_zone(lruvec)->lru_lock));
@@ -897,7 +929,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
int active = PageActive(page);
enum lru_list lru = page_lru(page);
- VM_BUG_ON(PageLRU(page));
+ VM_BUG_ON_PAGE(PageLRU(page), page);
SetPageLRU(page);
add_page_to_lru_list(page, lruvec, lru);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e6f15f8ca2af..e76ace30d436 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -63,6 +63,8 @@ unsigned long total_swapcache_pages(void)
return ret;
}
+static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);
+
void show_swap_cache_info(void)
{
printk("%lu pages in swap cache\n", total_swapcache_pages());
@@ -83,9 +85,9 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry)
int error;
struct address_space *address_space;
- VM_BUG_ON(!PageLocked(page));
- VM_BUG_ON(PageSwapCache(page));
- VM_BUG_ON(!PageSwapBacked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(PageSwapCache(page), page);
+ VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
page_cache_get(page);
SetPageSwapCache(page);
@@ -139,9 +141,9 @@ void __delete_from_swap_cache(struct page *page)
swp_entry_t entry;
struct address_space *address_space;
- VM_BUG_ON(!PageLocked(page));
- VM_BUG_ON(!PageSwapCache(page));
- VM_BUG_ON(PageWriteback(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageSwapCache(page), page);
+ VM_BUG_ON_PAGE(PageWriteback(page), page);
entry.val = page_private(page);
address_space = swap_address_space(entry);
@@ -165,8 +167,8 @@ int add_to_swap(struct page *page, struct list_head *list)
swp_entry_t entry;
int err;
- VM_BUG_ON(!PageLocked(page));
- VM_BUG_ON(!PageUptodate(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageUptodate(page), page);
entry = get_swap_page();
if (!entry.val)
@@ -286,8 +288,11 @@ struct page * lookup_swap_cache(swp_entry_t entry)
page = find_get_page(swap_address_space(entry), entry.val);
- if (page)
+ if (page) {
INC_CACHE_INFO(find_success);
+ if (TestClearPageReadahead(page))
+ atomic_inc(&swapin_readahead_hits);
+ }
INC_CACHE_INFO(find_total);
return page;
@@ -389,6 +394,50 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
return found_page;
}
+static unsigned long swapin_nr_pages(unsigned long offset)
+{
+ static unsigned long prev_offset;
+ unsigned int pages, max_pages, last_ra;
+ static atomic_t last_readahead_pages;
+
+ max_pages = 1 << ACCESS_ONCE(page_cluster);
+ if (max_pages <= 1)
+ return 1;
+
+ /*
+ * This heuristic has been found to work well on both sequential and
+ * random loads, swapping to hard disk or to SSD: please don't ask
+ * what the "+ 2" means, it just happens to work well, that's all.
+ */
+ pages = atomic_xchg(&swapin_readahead_hits, 0) + 2;
+ if (pages == 2) {
+ /*
+ * We can have no readahead hits to judge by: but must not get
+ * stuck here forever, so check for an adjacent offset instead
+ * (and don't even bother to check whether swap type is same).
+ */
+ if (offset != prev_offset + 1 && offset != prev_offset - 1)
+ pages = 1;
+ prev_offset = offset;
+ } else {
+ unsigned int roundup = 4;
+ while (roundup < pages)
+ roundup <<= 1;
+ pages = roundup;
+ }
+
+ if (pages > max_pages)
+ pages = max_pages;
+
+ /* Don't shrink readahead too fast */
+ last_ra = atomic_read(&last_readahead_pages) / 2;
+ if (pages < last_ra)
+ pages = last_ra;
+ atomic_set(&last_readahead_pages, pages);
+
+ return pages;
+}
+
/**
* swapin_readahead - swap in pages in hope we need them soon
* @entry: swap entry of this memory
@@ -412,11 +461,16 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma, unsigned long addr)
{
struct page *page;
- unsigned long offset = swp_offset(entry);
+ unsigned long entry_offset = swp_offset(entry);
+ unsigned long offset = entry_offset;
unsigned long start_offset, end_offset;
- unsigned long mask = (1UL << page_cluster) - 1;
+ unsigned long mask;
struct blk_plug plug;
+ mask = swapin_nr_pages(offset) - 1;
+ if (!mask)
+ goto skip;
+
/* Read a page_cluster sized and aligned cluster around offset. */
start_offset = offset & ~mask;
end_offset = offset | mask;
@@ -430,10 +484,13 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
gfp_mask, vma, addr);
if (!page)
continue;
+ if (offset != entry_offset)
+ SetPageReadahead(page);
page_cache_release(page);
}
blk_finish_plug(&plug);
lru_add_drain(); /* Push any new pages onto the LRU now */
+skip:
return read_swap_cache_async(entry, gfp_mask, vma, addr);
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 612a7c9795f6..b5719e1d06cd 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -906,7 +906,7 @@ int reuse_swap_page(struct page *page)
{
int count;
- VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
if (unlikely(PageKsm(page)))
return 0;
count = page_mapcount(page);
@@ -926,7 +926,7 @@ int reuse_swap_page(struct page *page)
*/
int try_to_free_swap(struct page *page)
{
- VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
if (!PageSwapCache(page))
return 0;
@@ -1922,7 +1922,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
p->swap_map = NULL;
cluster_info = p->cluster_info;
p->cluster_info = NULL;
- p->flags = 0;
frontswap_map = frontswap_map_get(p);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
@@ -1948,6 +1947,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
mutex_unlock(&inode->i_mutex);
}
filp_close(swap_file, NULL);
+
+ /*
+ * Clear the SWP_USED flag after all resources are freed so that swapon
+ * can reuse this swap_info in alloc_swap_info() safely. It is ok to
+ * not hold p->lock after we cleared its SWP_WRITEOK.
+ */
+ spin_lock(&swap_lock);
+ p->flags = 0;
+ spin_unlock(&swap_lock);
+
err = 0;
atomic_inc(&proc_poll_event);
wake_up_interruptible(&proc_poll_wait);
@@ -2714,7 +2723,7 @@ struct swap_info_struct *page_swap_info(struct page *page)
*/
struct address_space *__page_file_mapping(struct page *page)
{
- VM_BUG_ON(!PageSwapCache(page));
+ VM_BUG_ON_PAGE(!PageSwapCache(page), page);
return page_swap_info(page)->swap_file->f_mapping;
}
EXPORT_SYMBOL_GPL(__page_file_mapping);
@@ -2722,7 +2731,7 @@ EXPORT_SYMBOL_GPL(__page_file_mapping);
pgoff_t __page_file_index(struct page *page)
{
swp_entry_t swap = { .val = page_private(page) };
- VM_BUG_ON(!PageSwapCache(page));
+ VM_BUG_ON_PAGE(!PageSwapCache(page), page);
return swp_offset(swap);
}
EXPORT_SYMBOL_GPL(__page_file_index);
diff --git a/mm/util.c b/mm/util.c
index 808f375648e7..a24aa22f2473 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -404,13 +404,45 @@ struct address_space *page_mapping(struct page *page)
return mapping;
}
+int overcommit_ratio_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (ret == 0 && write)
+ sysctl_overcommit_kbytes = 0;
+ return ret;
+}
+
+int overcommit_kbytes_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
+ if (ret == 0 && write)
+ sysctl_overcommit_ratio = 0;
+ return ret;
+}
+
/*
* Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
*/
unsigned long vm_commit_limit(void)
{
- return ((totalram_pages - hugetlb_total_pages())
- * sysctl_overcommit_ratio / 100) + total_swap_pages;
+ unsigned long allowed;
+
+ if (sysctl_overcommit_kbytes)
+ allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
+ else
+ allowed = ((totalram_pages - hugetlb_total_pages())
+ * sysctl_overcommit_ratio / 100);
+ allowed += total_swap_pages;
+
+ return allowed;
}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 0fdf96803c5b..e4f0db2a3eae 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -220,12 +220,12 @@ int is_vmalloc_or_module_addr(const void *x)
}
/*
- * Walk a vmap address to the struct page it maps.
+ * Walk a vmap address to the physical pfn it maps to.
*/
-struct page *vmalloc_to_page(const void *vmalloc_addr)
+unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
{
unsigned long addr = (unsigned long) vmalloc_addr;
- struct page *page = NULL;
+ unsigned long pfn = 0;
pgd_t *pgd = pgd_offset_k(addr);
/*
@@ -244,23 +244,23 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
ptep = pte_offset_map(pmd, addr);
pte = *ptep;
if (pte_present(pte))
- page = pte_page(pte);
+ pfn = pte_pfn(pte);
pte_unmap(ptep);
}
}
}
- return page;
+ return pfn;
}
-EXPORT_SYMBOL(vmalloc_to_page);
+EXPORT_SYMBOL(vmalloc_to_pfn);
/*
- * Map a vmalloc()-space virtual address to the physical page frame number.
+ * Map a vmalloc()-space virtual address to the struct page.
*/
-unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
+struct page *vmalloc_to_page(const void *vmalloc_addr)
{
- return page_to_pfn(vmalloc_to_page(vmalloc_addr));
+ return pfn_to_page(vmalloc_to_pfn(vmalloc_addr));
}
-EXPORT_SYMBOL(vmalloc_to_pfn);
+EXPORT_SYMBOL(vmalloc_to_page);
/*** Global kva allocator ***/
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eea668d9cff6..89dd0f742507 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -281,17 +281,34 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
nr_pages_scanned, lru_pages,
max_pass, delta, total_scan);
- while (total_scan >= batch_size) {
+ /*
+ * Normally, we should not scan less than batch_size objects in one
+ * pass to avoid too frequent shrinker calls, but if the slab has less
+ * than batch_size objects in total and we are really tight on memory,
+ * we will try to reclaim all available objects, otherwise we can end
+ * up failing allocations although there are plenty of reclaimable
+ * objects spread over several slabs with usage less than the
+ * batch_size.
+ *
+ * We detect the "tight on memory" situations by looking at the total
+ * number of objects we want to scan (total_scan). If it is greater
+ * than the total number of objects on slab (max_pass), we must be
+ * scanning at high prio and therefore should try to reclaim as much as
+ * possible.
+ */
+ while (total_scan >= batch_size ||
+ total_scan >= max_pass) {
unsigned long ret;
+ unsigned long nr_to_scan = min(batch_size, total_scan);
- shrinkctl->nr_to_scan = batch_size;
+ shrinkctl->nr_to_scan = nr_to_scan;
ret = shrinker->scan_objects(shrinker, shrinkctl);
if (ret == SHRINK_STOP)
break;
freed += ret;
- count_vm_events(SLABS_SCANNED, batch_size);
- total_scan -= batch_size;
+ count_vm_events(SLABS_SCANNED, nr_to_scan);
+ total_scan -= nr_to_scan;
cond_resched();
}
@@ -352,16 +369,17 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl,
}
list_for_each_entry(shrinker, &shrinker_list, list) {
- for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
- if (!node_online(shrinkctl->nid))
- continue;
-
- if (!(shrinker->flags & SHRINKER_NUMA_AWARE) &&
- (shrinkctl->nid != 0))
- break;
-
+ if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) {
+ shrinkctl->nid = 0;
freed += shrink_slab_node(shrinkctl, shrinker,
- nr_pages_scanned, lru_pages);
+ nr_pages_scanned, lru_pages);
+ continue;
+ }
+
+ for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
+ if (node_online(shrinkctl->nid))
+ freed += shrink_slab_node(shrinkctl, shrinker,
+ nr_pages_scanned, lru_pages);
}
}
@@ -603,7 +621,7 @@ void putback_lru_page(struct page *page)
bool is_unevictable;
int was_unevictable = PageUnevictable(page);
- VM_BUG_ON(PageLRU(page));
+ VM_BUG_ON_PAGE(PageLRU(page), page);
redo:
ClearPageUnevictable(page);
@@ -794,8 +812,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
if (!trylock_page(page))
goto keep;
- VM_BUG_ON(PageActive(page));
- VM_BUG_ON(page_zone(page) != zone);
+ VM_BUG_ON_PAGE(PageActive(page), page);
+ VM_BUG_ON_PAGE(page_zone(page) != zone, page);
sc->nr_scanned++;
@@ -1079,14 +1097,14 @@ activate_locked:
/* Not a candidate for swapping, so reclaim swap space. */
if (PageSwapCache(page) && vm_swap_full())
try_to_free_swap(page);
- VM_BUG_ON(PageActive(page));
+ VM_BUG_ON_PAGE(PageActive(page), page);
SetPageActive(page);
pgactivate++;
keep_locked:
unlock_page(page);
keep:
list_add(&page->lru, &ret_pages);
- VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
+ VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
}
free_hot_cold_page_list(&free_pages, 1);
@@ -1240,7 +1258,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
page = lru_to_page(src);
prefetchw_prev_lru_page(page, src, flags);
- VM_BUG_ON(!PageLRU(page));
+ VM_BUG_ON_PAGE(!PageLRU(page), page);
switch (__isolate_lru_page(page, mode)) {
case 0:
@@ -1295,7 +1313,7 @@ int isolate_lru_page(struct page *page)
{
int ret = -EBUSY;
- VM_BUG_ON(!page_count(page));
+ VM_BUG_ON_PAGE(!page_count(page), page);
if (PageLRU(page)) {
struct zone *zone = page_zone(page);
@@ -1366,7 +1384,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
struct page *page = lru_to_page(page_list);
int lru;
- VM_BUG_ON(PageLRU(page));
+ VM_BUG_ON_PAGE(PageLRU(page), page);
list_del(&page->lru);
if (unlikely(!page_evictable(page))) {
spin_unlock_irq(&zone->lru_lock);
@@ -1586,7 +1604,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
page = lru_to_page(list);
lruvec = mem_cgroup_page_lruvec(page, zone);
- VM_BUG_ON(PageLRU(page));
+ VM_BUG_ON_PAGE(PageLRU(page), page);
SetPageLRU(page);
nr_pages = hpage_nr_pages(page);
@@ -2279,7 +2297,12 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
struct zone *zone;
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
+ unsigned long lru_pages = 0;
bool aborted_reclaim = false;
+ struct reclaim_state *reclaim_state = current->reclaim_state;
+ struct shrink_control shrink = {
+ .gfp_mask = sc->gfp_mask,
+ };
/*
* If the number of buffer_heads in the machine exceeds the maximum
@@ -2289,6 +2312,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
if (buffer_heads_over_limit)
sc->gfp_mask |= __GFP_HIGHMEM;
+ nodes_clear(shrink.nodes_to_scan);
+
for_each_zone_zonelist_nodemask(zone, z, zonelist,
gfp_zone(sc->gfp_mask), sc->nodemask) {
if (!populated_zone(zone))
@@ -2300,6 +2325,10 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
if (global_reclaim(sc)) {
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
+
+ lru_pages += zone_reclaimable_pages(zone);
+ node_set(zone_to_nid(zone), shrink.nodes_to_scan);
+
if (sc->priority != DEF_PRIORITY &&
!zone_reclaimable(zone))
continue; /* Let kswapd poll it */
@@ -2336,6 +2365,20 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
shrink_zone(zone, sc);
}
+ /*
+ * Don't shrink slabs when reclaiming memory from over limit cgroups
+ * but do shrink slab at least once when aborting reclaim for
+ * compaction to avoid unevenly scanning file/anon LRU pages over slab
+ * pages.
+ */
+ if (global_reclaim(sc)) {
+ shrink_slab(&shrink, sc->nr_scanned, lru_pages);
+ if (reclaim_state) {
+ sc->nr_reclaimed += reclaim_state->reclaimed_slab;
+ reclaim_state->reclaimed_slab = 0;
+ }
+ }
+
return aborted_reclaim;
}
@@ -2376,13 +2419,9 @@ static bool all_unreclaimable(struct zonelist *zonelist,
* else, the number of pages reclaimed
*/
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
- struct scan_control *sc,
- struct shrink_control *shrink)
+ struct scan_control *sc)
{
unsigned long total_scanned = 0;
- struct reclaim_state *reclaim_state = current->reclaim_state;
- struct zoneref *z;
- struct zone *zone;
unsigned long writeback_threshold;
bool aborted_reclaim;
@@ -2397,32 +2436,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
sc->nr_scanned = 0;
aborted_reclaim = shrink_zones(zonelist, sc);
- /*
- * Don't shrink slabs when reclaiming memory from over limit
- * cgroups but do shrink slab at least once when aborting
- * reclaim for compaction to avoid unevenly scanning file/anon
- * LRU pages over slab pages.
- */
- if (global_reclaim(sc)) {
- unsigned long lru_pages = 0;
-
- nodes_clear(shrink->nodes_to_scan);
- for_each_zone_zonelist(zone, z, zonelist,
- gfp_zone(sc->gfp_mask)) {
- if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
- continue;
-
- lru_pages += zone_reclaimable_pages(zone);
- node_set(zone_to_nid(zone),
- shrink->nodes_to_scan);
- }
-
- shrink_slab(shrink, sc->nr_scanned, lru_pages);
- if (reclaim_state) {
- sc->nr_reclaimed += reclaim_state->reclaimed_slab;
- reclaim_state->reclaimed_slab = 0;
- }
- }
total_scanned += sc->nr_scanned;
if (sc->nr_reclaimed >= sc->nr_to_reclaim)
goto out;
@@ -2584,9 +2597,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
.target_mem_cgroup = NULL,
.nodemask = nodemask,
};
- struct shrink_control shrink = {
- .gfp_mask = sc.gfp_mask,
- };
/*
* Do not enter reclaim if fatal signal was delivered while throttled.
@@ -2600,7 +2610,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
sc.may_writepage,
gfp_mask);
- nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
@@ -2667,9 +2677,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
};
- struct shrink_control shrink = {
- .gfp_mask = sc.gfp_mask,
- };
/*
* Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
@@ -2684,7 +2691,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
sc.may_writepage,
sc.gfp_mask);
- nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
@@ -3340,9 +3347,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
.order = 0,
.priority = DEF_PRIORITY,
};
- struct shrink_control shrink = {
- .gfp_mask = sc.gfp_mask,
- };
struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
struct task_struct *p = current;
unsigned long nr_reclaimed;
@@ -3352,7 +3356,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
- nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
p->reclaim_state = NULL;
lockdep_clear_current_reclaim_state();
@@ -3701,7 +3705,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
if (page_evictable(page)) {
enum lru_list lru = page_lru_base_type(page);
- VM_BUG_ON(PageActive(page));
+ VM_BUG_ON_PAGE(PageActive(page), page);
ClearPageUnevictable(page);
del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
add_page_to_lru_list(page, lruvec, lru);
diff --git a/mm/zswap.c b/mm/zswap.c
index 5a63f78a5601..e55bab9dc41f 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -77,12 +77,12 @@ static u64 zswap_duplicate_entry;
**********************************/
/* Enable/disable zswap (disabled by default, fixed at boot for now) */
static bool zswap_enabled __read_mostly;
-module_param_named(enabled, zswap_enabled, bool, 0);
+module_param_named(enabled, zswap_enabled, bool, 0444);
/* Compressor to be used by zswap (fixed at boot for now) */
#define ZSWAP_COMPRESSOR_DEFAULT "lzo"
static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
-module_param_named(compressor, zswap_compressor, charp, 0);
+module_param_named(compressor, zswap_compressor, charp, 0444);
/* The maximum percentage of memory that the compressed pool can occupy */
static unsigned int zswap_max_pool_percent = 20;
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 8a520996f3d2..e498a62b8f97 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -23,7 +23,6 @@
#define ALPHA_MIN ((3*ALPHA_SCALE)/10) /* ~0.3 */
#define ALPHA_MAX (10*ALPHA_SCALE) /* 10.0 */
#define ALPHA_BASE ALPHA_SCALE /* 1.0 */
-#define U32_MAX ((u32)~0U)
#define RTT_MAX (U32_MAX / ALPHA_MAX) /* 3.3 secs */
#define BETA_SHIFT 6
diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
index 3f64a66bf5d9..b827a0f1f351 100644
--- a/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -46,31 +46,12 @@ struct iface_node {
static void
rbtree_destroy(struct rb_root *root)
{
- struct rb_node *p, *n = root->rb_node;
- struct iface_node *node;
-
- /* Non-recursive destroy, like in ext3 */
- while (n) {
- if (n->rb_left) {
- n = n->rb_left;
- continue;
- }
- if (n->rb_right) {
- n = n->rb_right;
- continue;
- }
- p = rb_parent(n);
- node = rb_entry(n, struct iface_node, node);
- if (!p)
- *root = RB_ROOT;
- else if (p->rb_left == n)
- p->rb_left = NULL;
- else if (p->rb_right == n)
- p->rb_right = NULL;
+ struct iface_node *node, *next;
+ rbtree_postorder_for_each_entry_safe(node, next, root, node)
kfree(node);
- n = p;
- }
+
+ *root = RB_ROOT;
}
static int
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 9fb30b15c9dc..05c99c0b7e6c 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -29,6 +29,7 @@ my $mailback = 0;
my $summary_file = 0;
my $show_types = 0;
my $fix = 0;
+my $fix_inplace = 0;
my $root;
my %debug;
my %camelcase = ();
@@ -76,6 +77,9 @@ Options:
"<inputfile>.EXPERIMENTAL-checkpatch-fixes"
with potential errors corrected to the preferred
checkpatch style
+ --fix-inplace EXPERIMENTAL - may create horrible results
+ Is the same as --fix, but overwrites the input
+ file. It's your fault if there's no backup or git
--ignore-perl-version override checking of perl version. expect
runtime errors.
-h, --help, --version display this help and exit
@@ -131,6 +135,7 @@ GetOptions(
'mailback!' => \$mailback,
'summary-file!' => \$summary_file,
'fix!' => \$fix,
+ 'fix-inplace!' => \$fix_inplace,
'ignore-perl-version!' => \$ignore_perl_version,
'debug=s' => \%debug,
'test-only=s' => \$tst_only,
@@ -140,6 +145,8 @@ GetOptions(
help(0) if ($help);
+$fix = 1 if ($fix_inplace);
+
my $exit = 0;
if ($^V && $^V lt $minimum_perl_version) {
@@ -1963,15 +1970,14 @@ sub process {
}
# Check for FSF mailing addresses.
- if ($rawline =~ /You should have received a copy/ ||
- $rawline =~ /write to the Free Software/ ||
- $rawline =~ /59 Temple Place/ ||
- $rawline =~ /51 Franklin Street/) {
+ if ($rawline =~ /\bwrite to the Free/i ||
+ $rawline =~ /\b59\s+Temple\s+Pl/i ||
+ $rawline =~ /\b51\s+Franklin\s+St/i) {
my $herevet = "$here\n" . cat_vet($rawline) . "\n";
my $msg_type = \&ERROR;
$msg_type = \&CHK if ($file);
&{$msg_type}("FSF_MAILING_ADDRESS",
- "Do not include the paragraph about writing to the Free Software Foundation's mailing address from the sample GPL notice. The FSF has changed addresses in the past, and may do so again. Linux already includes a copy of the GPL.\n" . $herevet)
+ "Do not include the paragraph about writing to the Free Software Foundation's mailing address from the sample GPL notice. The FSF has changed addresses in the past, and may do so again. Linux already includes a copy of the GPL.\n" . $herevet)
}
# check for Kconfig help text having a real description
@@ -2034,6 +2040,33 @@ sub process {
"Use of $flag is deprecated, please use \`$replacement->{$flag} instead.\n" . $herecurr) if ($replacement->{$flag});
}
+# check for DT compatible documentation
+ if (defined $root && $realfile =~ /\.dts/ &&
+ $rawline =~ /^\+\s*compatible\s*=/) {
+ my @compats = $rawline =~ /\"([a-zA-Z0-9\-\,\.\+_]+)\"/g;
+
+ foreach my $compat (@compats) {
+ my $compat2 = $compat;
+ my $dt_path = $root . "/Documentation/devicetree/bindings/";
+ $compat2 =~ s/\,[a-z]*\-/\,<\.\*>\-/;
+ `grep -Erq "$compat|$compat2" $dt_path`;
+ if ( $? >> 8 ) {
+ WARN("UNDOCUMENTED_DT_STRING",
+ "DT compatible string \"$compat\" appears un-documented -- check $dt_path\n" . $herecurr);
+ }
+
+ my $vendor = $compat;
+ my $vendor_path = $dt_path . "vendor-prefixes.txt";
+ next if (! -f $vendor_path);
+ $vendor =~ s/^([a-zA-Z0-9]+)\,.*/$1/;
+ `grep -Eq "$vendor" $vendor_path`;
+ if ( $? >> 8 ) {
+ WARN("UNDOCUMENTED_DT_STRING",
+ "DT compatible string vendor \"$vendor\" appears un-documented -- check $vendor_path\n" . $herecurr);
+ }
+ }
+ }
+
# check we are in a valid source file if not then ignore this hunk
next if ($realfile !~ /\.(h|c|s|S|pl|sh)$/);
@@ -2049,16 +2082,12 @@ sub process {
}
# Check for user-visible strings broken across lines, which breaks the ability
-# to grep for the string. Limited to strings used as parameters (those
-# following an open parenthesis), which almost completely eliminates false
-# positives, as well as warning only once per parameter rather than once per
-# line of the string. Make an exception when the previous string ends in a
-# newline (multiple lines in one string constant) or \n\t (common in inline
-# assembly to indent the instruction on the following line).
+# to grep for the string. Make exceptions when the previous string ends in a
+# newline (multiple lines in one string constant) or '\t', '\r', ';', or '{'
+# (common in inline assembly) or is a octal \123 or hexadecimal \xaf value
if ($line =~ /^\+\s*"/ &&
$prevline =~ /"\s*$/ &&
- $prevline =~ /\(/ &&
- $prevrawline !~ /\\n(?:\\t)*"\s*$/) {
+ $prevrawline !~ /(?:\\(?:[ntr]|[0-7]{1,3}|x[0-9a-fA-F]{1,2})|;\s*|\{\s*)"\s*$/) {
WARN("SPLIT_STRING",
"quoted string split across lines\n" . $hereprev);
}
@@ -2115,8 +2144,10 @@ sub process {
if (WARN("SPACE_BEFORE_TAB",
"please, no space before tabs\n" . $herevet) &&
$fix) {
- $fixed[$linenr - 1] =~
- s/(^\+.*) +\t/$1\t/;
+ while ($fixed[$linenr - 1] =~
+ s/(^\+.*) {8,8}+\t/$1\t\t/) {}
+ while ($fixed[$linenr - 1] =~
+ s/(^\+.*) +\t/$1\t/) {}
}
}
@@ -2805,6 +2836,65 @@ sub process {
}
}
+# Function pointer declarations
+# check spacing between type, funcptr, and args
+# canonical declaration is "type (*funcptr)(args...)"
+#
+# the $Declare variable will capture all spaces after the type
+# so check it for trailing missing spaces or multiple spaces
+ if ($line =~ /^.\s*($Declare)\((\s*)\*(\s*)$Ident(\s*)\)(\s*)\(/) {
+ my $declare = $1;
+ my $pre_pointer_space = $2;
+ my $post_pointer_space = $3;
+ my $funcname = $4;
+ my $post_funcname_space = $5;
+ my $pre_args_space = $6;
+
+ if ($declare !~ /\s$/) {
+ WARN("SPACING",
+ "missing space after return type\n" . $herecurr);
+ }
+
+# unnecessary space "type (*funcptr)(args...)"
+ elsif ($declare =~ /\s{2,}$/) {
+ WARN("SPACING",
+ "Multiple spaces after return type\n" . $herecurr);
+ }
+
+# unnecessary space "type ( *funcptr)(args...)"
+ if (defined $pre_pointer_space &&
+ $pre_pointer_space =~ /^\s/) {
+ WARN("SPACING",
+ "Unnecessary space after function pointer open parenthesis\n" . $herecurr);
+ }
+
+# unnecessary space "type (* funcptr)(args...)"
+ if (defined $post_pointer_space &&
+ $post_pointer_space =~ /^\s/) {
+ WARN("SPACING",
+ "Unnecessary space before function pointer name\n" . $herecurr);
+ }
+
+# unnecessary space "type (*funcptr )(args...)"
+ if (defined $post_funcname_space &&
+ $post_funcname_space =~ /^\s/) {
+ WARN("SPACING",
+ "Unnecessary space after function pointer name\n" . $herecurr);
+ }
+
+# unnecessary space "type (*funcptr) (args...)"
+ if (defined $pre_args_space &&
+ $pre_args_space =~ /^\s/) {
+ WARN("SPACING",
+ "Unnecessary space before function pointer arguments\n" . $herecurr);
+ }
+
+ if (show_type("SPACING") && $fix) {
+ $fixed[$linenr - 1] =~
+ s/^(.\s*$Declare)\(\s*\*\s*($Ident)\s*\)\s*\(/rtrim($1) . " " . "\(\*$2\)\("/ex;
+ }
+ }
+
# check for spacing round square brackets; allowed:
# 1. with a type on the left -- int [] a;
# 2. at the beginning of a line for slice initialisers -- [0...10] = 5,
@@ -3125,7 +3215,7 @@ sub process {
}
# check for whitespace before a non-naked semicolon
- if ($line =~ /^\+.*\S\s+;/) {
+ if ($line =~ /^\+.*\S\s+;\s*$/) {
if (WARN("SPACING",
"space prohibited before semicolon\n" . $herecurr) &&
$fix) {
@@ -3249,6 +3339,20 @@ sub process {
}
}
+# if statements using unnecessary parentheses - ie: if ((foo == bar))
+ if ($^V && $^V ge 5.10.0 &&
+ $line =~ /\bif\s*((?:\(\s*){2,})/) {
+ my $openparens = $1;
+ my $count = $openparens =~ tr@\(@\(@;
+ my $msg = "";
+ if ($line =~ /\bif\s*(?:\(\s*){$count,$count}$LvalOrFunc\s*($Compare)\s*$LvalOrFunc(?:\s*\)){$count,$count}/) {
+ my $comp = $4; #Not $1 because of $LvalOrFunc
+ $msg = " - maybe == should be = ?" if ($comp eq "==");
+ WARN("UNNECESSARY_PARENTHESES",
+ "Unnecessary parentheses$msg\n" . $herecurr);
+ }
+ }
+
# Return of what appears to be an errno should normally be -'ve
if ($line =~ /^.\s*return\s*(E[A-Z]*)\s*;/) {
my $name = $1;
@@ -4117,6 +4221,12 @@ sub process {
"$1 uses number as first arg, sizeof is generally wrong\n" . $herecurr);
}
+# check for GFP_NOWAIT use
+ if ($line =~ /\b__GFP_NOFAIL\b/) {
+ WARN("__GFP_NOFAIL",
+ "Use of __GFP_NOFAIL is deprecated, no new users should be added\n" . $herecurr);
+ }
+
# check for multiple semicolons
if ($line =~ /;\s*;\s*$/) {
if (WARN("ONE_SEMICOLON",
@@ -4126,6 +4236,31 @@ sub process {
}
}
+# check for case / default statements not preceeded by break/fallthrough/switch
+ if ($line =~ /^.\s*(?:case\s+(?:$Ident|$Constant)\s*|default):/) {
+ my $has_break = 0;
+ my $has_statement = 0;
+ my $count = 0;
+ my $prevline = $linenr;
+ while ($prevline > 1 && $count < 3 && !$has_break) {
+ $prevline--;
+ my $rline = $rawlines[$prevline - 1];
+ my $fline = $lines[$prevline - 1];
+ last if ($fline =~ /^\@\@/);
+ next if ($fline =~ /^\-/);
+ next if ($fline =~ /^.(?:\s*(?:case\s+(?:$Ident|$Constant)[\s$;]*|default):[\s$;]*)*$/);
+ $has_break = 1 if ($rline =~ /fall[\s_-]*(through|thru)/i);
+ next if ($fline =~ /^.[\s$;]*$/);
+ $has_statement = 1;
+ $count++;
+ $has_break = 1 if ($fline =~ /\bswitch\b|\b(?:break\s*;[\s$;]*$|return\b|goto\b|continue\b)/);
+ }
+ if (!$has_break && $has_statement) {
+ WARN("MISSING_BREAK",
+ "Possible switch case/default not preceeded by break or fallthrough comment\n" . $herecurr);
+ }
+ }
+
# check for switch/default statements without a break;
if ($^V && $^V ge 5.10.0 &&
defined $stat &&
@@ -4361,7 +4496,8 @@ sub process {
hash_show_words(\%ignore_type, "Ignored");
if ($clean == 0 && $fix && "@rawlines" ne "@fixed") {
- my $newfile = $filename . ".EXPERIMENTAL-checkpatch-fixes";
+ my $newfile = $filename;
+ $newfile .= ".EXPERIMENTAL-checkpatch-fixes" if (!$fix_inplace);
my $linecount = 0;
my $f;
diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl
index 5e4fb144a04f..9c3986f4140c 100755
--- a/scripts/get_maintainer.pl
+++ b/scripts/get_maintainer.pl
@@ -98,6 +98,7 @@ my %VCS_cmds_git = (
"available" => '(which("git") ne "") && (-d ".git")',
"find_signers_cmd" =>
"git log --no-color --follow --since=\$email_git_since " .
+ '--numstat --no-merges ' .
'--format="GitCommit: %H%n' .
'GitAuthor: %an <%ae>%n' .
'GitDate: %aD%n' .
@@ -106,6 +107,7 @@ my %VCS_cmds_git = (
" -- \$file",
"find_commit_signers_cmd" =>
"git log --no-color " .
+ '--numstat ' .
'--format="GitCommit: %H%n' .
'GitAuthor: %an <%ae>%n' .
'GitDate: %aD%n' .
@@ -114,6 +116,7 @@ my %VCS_cmds_git = (
" -1 \$commit",
"find_commit_author_cmd" =>
"git log --no-color " .
+ '--numstat ' .
'--format="GitCommit: %H%n' .
'GitAuthor: %an <%ae>%n' .
'GitDate: %aD%n' .
@@ -125,6 +128,7 @@ my %VCS_cmds_git = (
"blame_commit_pattern" => "^([0-9a-f]+) ",
"author_pattern" => "^GitAuthor: (.*)",
"subject_pattern" => "^GitSubject: (.*)",
+ "stat_pattern" => "^(\\d+)\\t(\\d+)\\t\$file\$",
);
my %VCS_cmds_hg = (
@@ -152,6 +156,7 @@ my %VCS_cmds_hg = (
"blame_commit_pattern" => "^([ 0-9a-f]+):",
"author_pattern" => "^HgAuthor: (.*)",
"subject_pattern" => "^HgSubject: (.*)",
+ "stat_pattern" => "^(\\d+)\t(\\d+)\t\$file\$",
);
my $conf = which_conf(".get_maintainer.conf");
@@ -1269,20 +1274,30 @@ sub extract_formatted_signatures {
}
sub vcs_find_signers {
- my ($cmd) = @_;
+ my ($cmd, $file) = @_;
my $commits;
my @lines = ();
my @signatures = ();
+ my @authors = ();
+ my @stats = ();
@lines = &{$VCS_cmds{"execute_cmd"}}($cmd);
my $pattern = $VCS_cmds{"commit_pattern"};
+ my $author_pattern = $VCS_cmds{"author_pattern"};
+ my $stat_pattern = $VCS_cmds{"stat_pattern"};
+
+ $stat_pattern =~ s/(\$\w+)/$1/eeg; #interpolate $stat_pattern
$commits = grep(/$pattern/, @lines); # of commits
+ @authors = grep(/$author_pattern/, @lines);
@signatures = grep(/^[ \t]*${signature_pattern}.*\@.*$/, @lines);
+ @stats = grep(/$stat_pattern/, @lines);
- return (0, @signatures) if !@signatures;
+# print("stats: <@stats>\n");
+
+ return (0, \@signatures, \@authors, \@stats) if !@signatures;
save_commits_by_author(@lines) if ($interactive);
save_commits_by_signer(@lines) if ($interactive);
@@ -1291,9 +1306,10 @@ sub vcs_find_signers {
@signatures = grep(!/${penguin_chiefs}/i, @signatures);
}
+ my ($author_ref, $authors_ref) = extract_formatted_signatures(@authors);
my ($types_ref, $signers_ref) = extract_formatted_signatures(@signatures);
- return ($commits, @$signers_ref);
+ return ($commits, $signers_ref, $authors_ref, \@stats);
}
sub vcs_find_author {
@@ -1849,7 +1865,12 @@ sub vcs_assign {
sub vcs_file_signoffs {
my ($file) = @_;
+ my $authors_ref;
+ my $signers_ref;
+ my $stats_ref;
+ my @authors = ();
my @signers = ();
+ my @stats = ();
my $commits;
$vcs_used = vcs_exists();
@@ -1858,13 +1879,59 @@ sub vcs_file_signoffs {
my $cmd = $VCS_cmds{"find_signers_cmd"};
$cmd =~ s/(\$\w+)/$1/eeg; # interpolate $cmd
- ($commits, @signers) = vcs_find_signers($cmd);
+ ($commits, $signers_ref, $authors_ref, $stats_ref) = vcs_find_signers($cmd, $file);
+
+ @signers = @{$signers_ref} if defined $signers_ref;
+ @authors = @{$authors_ref} if defined $authors_ref;
+ @stats = @{$stats_ref} if defined $stats_ref;
+
+# print("commits: <$commits>\nsigners:<@signers>\nauthors: <@authors>\nstats: <@stats>\n");
foreach my $signer (@signers) {
$signer = deduplicate_email($signer);
}
vcs_assign("commit_signer", $commits, @signers);
+ vcs_assign("authored", $commits, @authors);
+ if ($#authors == $#stats) {
+ my $stat_pattern = $VCS_cmds{"stat_pattern"};
+ $stat_pattern =~ s/(\$\w+)/$1/eeg; #interpolate $stat_pattern
+
+ my $added = 0;
+ my $deleted = 0;
+ for (my $i = 0; $i <= $#stats; $i++) {
+ if ($stats[$i] =~ /$stat_pattern/) {
+ $added += $1;
+ $deleted += $2;
+ }
+ }
+ my @tmp_authors = uniq(@authors);
+ foreach my $author (@tmp_authors) {
+ $author = deduplicate_email($author);
+ }
+ @tmp_authors = uniq(@tmp_authors);
+ my @list_added = ();
+ my @list_deleted = ();
+ foreach my $author (@tmp_authors) {
+ my $auth_added = 0;
+ my $auth_deleted = 0;
+ for (my $i = 0; $i <= $#stats; $i++) {
+ if ($author eq deduplicate_email($authors[$i]) &&
+ $stats[$i] =~ /$stat_pattern/) {
+ $auth_added += $1;
+ $auth_deleted += $2;
+ }
+ }
+ for (my $i = 0; $i < $auth_added; $i++) {
+ push(@list_added, $author);
+ }
+ for (my $i = 0; $i < $auth_deleted; $i++) {
+ push(@list_deleted, $author);
+ }
+ }
+ vcs_assign("added_lines", $added, @list_added);
+ vcs_assign("removed_lines", $deleted, @list_deleted);
+ }
}
sub vcs_file_blame {
@@ -1887,6 +1954,10 @@ sub vcs_file_blame {
if ($email_git_blame_signatures) {
if (vcs_is_hg()) {
my $commit_count;
+ my $commit_authors_ref;
+ my $commit_signers_ref;
+ my $stats_ref;
+ my @commit_authors = ();
my @commit_signers = ();
my $commit = join(" -r ", @commits);
my $cmd;
@@ -1894,19 +1965,27 @@ sub vcs_file_blame {
$cmd = $VCS_cmds{"find_commit_signers_cmd"};
$cmd =~ s/(\$\w+)/$1/eeg; #substitute variables in $cmd
- ($commit_count, @commit_signers) = vcs_find_signers($cmd);
+ ($commit_count, $commit_signers_ref, $commit_authors_ref, $stats_ref) = vcs_find_signers($cmd, $file);
+ @commit_authors = @{$commit_authors_ref} if defined $commit_authors_ref;
+ @commit_signers = @{$commit_signers_ref} if defined $commit_signers_ref;
push(@signers, @commit_signers);
} else {
foreach my $commit (@commits) {
my $commit_count;
+ my $commit_authors_ref;
+ my $commit_signers_ref;
+ my $stats_ref;
+ my @commit_authors = ();
my @commit_signers = ();
my $cmd;
$cmd = $VCS_cmds{"find_commit_signers_cmd"};
$cmd =~ s/(\$\w+)/$1/eeg; #substitute variables in $cmd
- ($commit_count, @commit_signers) = vcs_find_signers($cmd);
+ ($commit_count, $commit_signers_ref, $commit_authors_ref, $stats_ref) = vcs_find_signers($cmd, $file);
+ @commit_authors = @{$commit_authors_ref} if defined $commit_authors_ref;
+ @commit_signers = @{$commit_signers_ref} if defined $commit_signers_ref;
push(@signers, @commit_signers);
}
diff --git a/scripts/sortextable.c b/scripts/sortextable.c
index 7941fbdfb050..cc49062acdee 100644
--- a/scripts/sortextable.c
+++ b/scripts/sortextable.c
@@ -39,6 +39,10 @@
#define EM_AARCH64 183
#endif
+#ifndef EM_MICROBLAZE
+#define EM_MICROBLAZE 189
+#endif
+
static int fd_map; /* File descriptor for file being modified. */
static int mmap_failed; /* Boolean flag. */
static void *ehdr_curr; /* current ElfXX_Ehdr * for resource cleanup */
@@ -275,6 +279,7 @@ do_file(char const *const fname)
case EM_ARCOMPACT:
case EM_ARM:
case EM_AARCH64:
+ case EM_MICROBLAZE:
case EM_MIPS:
break;
} /* end switch */
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 9f3eae290900..32487ed18354 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -9,6 +9,7 @@ TARGETS += ptrace
TARGETS += timers
TARGETS += vm
TARGETS += powerpc
+TARGETS += user
all:
for TARGET in $(TARGETS); do \
diff --git a/tools/testing/selftests/user/Makefile b/tools/testing/selftests/user/Makefile
new file mode 100644
index 000000000000..396255bd720e
--- /dev/null
+++ b/tools/testing/selftests/user/Makefile
@@ -0,0 +1,13 @@
+# Makefile for user memory selftests
+
+# No binaries, but make sure arg-less "make" doesn't trigger "run_tests"
+all:
+
+run_tests: all
+ @if /sbin/modprobe test_user_copy ; then \
+ rmmod test_user_copy; \
+ echo "user_copy: ok"; \
+ else \
+ echo "user_copy: [FAIL]"; \
+ exit 1; \
+ fi