summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2014-04-23 13:48:20 +1000
committerStephen Rothwell <sfr@canb.auug.org.au>2014-04-23 13:48:20 +1000
commitec2e3208e69f09bb993b106e7a0698997254d299 (patch)
treeee3a109dfa2065e72b116a679ade30d1578410fc
parentd9cf7ff88de666072bbfc7c18ce979be8114cc82 (diff)
parent7f6706aa1af1675f13e964b48d98ba69b41b7d73 (diff)
Merge branch 'akpm-current/current'
-rw-r--r--CREDITS7
-rw-r--r--Documentation/SubmittingPatches22
-rw-r--r--Documentation/cgroups/memory.txt16
-rw-r--r--Documentation/devicetree/bindings/rtc/xgene-rtc.txt28
-rw-r--r--Documentation/filesystems/vfat.txt15
-rw-r--r--Documentation/kernel-parameters.txt19
-rw-r--r--Documentation/leds/leds-class.txt3
-rw-r--r--Documentation/memory-hotplug.txt125
-rw-r--r--Documentation/printk-formats.txt6
-rw-r--r--Documentation/sysctl/vm.txt17
-rw-r--r--MAINTAINERS5
-rw-r--r--arch/alpha/include/asm/Kbuild1
-rw-r--r--arch/alpha/include/asm/scatterlist.h6
-rw-r--r--arch/arc/kernel/troubleshoot.c10
-rw-r--r--arch/arm/Kconfig1
-rw-r--r--arch/arm/include/asm/Kbuild1
-rw-r--r--arch/arm/include/asm/fixmap.h29
-rw-r--r--arch/arm/include/asm/scatterlist.h12
-rw-r--r--arch/arm/mm/init.c2
-rw-r--r--arch/arm64/Kconfig1
-rw-r--r--arch/arm64/boot/dts/apm-storm.dtsi21
-rw-r--r--arch/blackfin/include/asm/unistd.h1
-rw-r--r--arch/cris/include/asm/Kbuild1
-rw-r--r--arch/cris/include/asm/scatterlist.h6
-rw-r--r--arch/cris/include/asm/unistd.h1
-rw-r--r--arch/frv/include/asm/Kbuild1
-rw-r--r--arch/frv/include/asm/scatterlist.h6
-rw-r--r--arch/frv/include/asm/unistd.h1
-rw-r--r--arch/ia64/Kconfig1
-rw-r--r--arch/ia64/include/asm/Kbuild1
-rw-r--r--arch/ia64/include/asm/scatterlist.h7
-rw-r--r--arch/ia64/include/asm/topology.h3
-rw-r--r--arch/m32r/include/asm/Kbuild1
-rw-r--r--arch/m32r/include/asm/scatterlist.h6
-rw-r--r--arch/m68k/include/asm/signal.h9
-rw-r--r--arch/m68k/include/asm/unistd.h1
-rw-r--r--arch/m68k/kernel/sys_m68k.c18
-rw-r--r--arch/microblaze/include/asm/Kbuild1
-rw-r--r--arch/microblaze/include/asm/scatterlist.h1
-rw-r--r--arch/microblaze/include/asm/unistd.h1
-rw-r--r--arch/mips/dec/Makefile2
-rw-r--r--arch/mips/dec/platform.c44
-rw-r--r--arch/mips/include/asm/unistd.h1
-rw-r--r--arch/mips/kernel/traps.c2
-rw-r--r--arch/mips/mm/c-octeon.c2
-rw-r--r--arch/mn10300/include/asm/Kbuild1
-rw-r--r--arch/mn10300/include/asm/scatterlist.h16
-rw-r--r--arch/mn10300/include/asm/unistd.h1
-rw-r--r--arch/parisc/include/asm/unistd.h1
-rw-r--r--arch/powerpc/Kconfig1
-rw-r--r--arch/powerpc/include/asm/Kbuild1
-rw-r--r--arch/powerpc/include/asm/pgtable.h6
-rw-r--r--arch/powerpc/include/asm/scatterlist.h17
-rw-r--r--arch/powerpc/include/asm/topology.h8
-rw-r--r--arch/powerpc/include/asm/unistd.h1
-rw-r--r--arch/powerpc/mm/dma-noncoherent.c1
-rw-r--r--arch/powerpc/mm/subpage-prot.c6
-rw-r--r--arch/powerpc/platforms/44x/warp.c1
-rw-r--r--arch/powerpc/platforms/52xx/efika.c1
-rw-r--r--arch/powerpc/platforms/amigaone/setup.c1
-rw-r--r--arch/s390/Kconfig1
-rw-r--r--arch/s390/include/asm/Kbuild1
-rw-r--r--arch/s390/include/asm/scatterlist.h3
-rw-r--r--arch/score/include/asm/Kbuild1
-rw-r--r--arch/score/include/asm/scatterlist.h6
-rw-r--r--arch/sh/include/asm/unistd.h1
-rw-r--r--arch/sparc/Kconfig1
-rw-r--r--arch/sparc/include/asm/Kbuild1
-rw-r--r--arch/sparc/include/asm/scatterlist.h8
-rw-r--r--arch/sparc/include/asm/unistd.h1
-rw-r--r--arch/um/include/asm/Kbuild1
-rw-r--r--arch/x86/Kconfig5
-rw-r--r--arch/x86/include/asm/Kbuild3
-rw-r--r--arch/x86/include/asm/pgtable.h15
-rw-r--r--arch/x86/include/asm/pgtable_64.h8
-rw-r--r--arch/x86/include/asm/pgtable_types.h66
-rw-r--r--arch/x86/include/asm/scatterlist.h8
-rw-r--r--arch/x86/include/asm/signal.h6
-rw-r--r--arch/x86/include/asm/swiotlb.h7
-rw-r--r--arch/x86/include/asm/unistd.h1
-rw-r--r--arch/x86/kernel/amd_gart_64.c2
-rw-r--r--arch/x86/kernel/pci-dma.c11
-rw-r--r--arch/x86/kernel/pci-swiotlb.c9
-rw-r--r--arch/x86/kernel/setup.c2
-rw-r--r--arch/x86/mm/init_64.c34
-rw-r--r--arch/x86/mm/pageattr-test.c2
-rw-r--r--arch/x86/pci/sta2x11-fixup.c6
-rw-r--r--block/genhd.c2
-rw-r--r--drivers/base/dma-contiguous.c42
-rw-r--r--drivers/base/memory.c12
-rw-r--r--drivers/block/zram/zram_drv.c4
-rw-r--r--drivers/gpu/drm/exynos/exynos_drm_g2d.c6
-rw-r--r--drivers/input/Kconfig9
-rw-r--r--drivers/input/Makefile3
-rw-r--r--drivers/input/input.c6
-rw-r--r--drivers/input/leds.c249
-rw-r--r--drivers/iommu/intel-iommu.c32
-rw-r--r--drivers/leds/Kconfig3
-rw-r--r--drivers/misc/ti-st/st_core.c2
-rw-r--r--drivers/net/irda/donauboe.c15
-rw-r--r--drivers/rtc/Kconfig13
-rw-r--r--drivers/rtc/Makefile1
-rw-r--r--drivers/rtc/interface.c15
-rw-r--r--drivers/rtc/rtc-cmos.c85
-rw-r--r--drivers/rtc/rtc-efi.c2
-rw-r--r--drivers/rtc/rtc-m41t80.c104
-rw-r--r--drivers/rtc/rtc-pcf8523.c4
-rw-r--r--drivers/rtc/rtc-xgene.c278
-rw-r--r--drivers/tty/Kconfig4
-rw-r--r--drivers/tty/vt/keyboard.c110
-rw-r--r--drivers/video/backlight/backlight.c2
-rw-r--r--fs/affs/affs.h16
-rw-r--r--fs/affs/amigaffs.c15
-rw-r--r--fs/affs/bitmap.c21
-rw-r--r--fs/affs/dir.c8
-rw-r--r--fs/affs/file.c46
-rw-r--r--fs/affs/inode.c14
-rw-r--r--fs/affs/namei.c39
-rw-r--r--fs/affs/super.c43
-rw-r--r--fs/affs/symlink.c2
-rw-r--r--fs/befs/btree.c7
-rw-r--r--fs/befs/linuxvfs.c5
-rw-r--r--fs/binfmt_elf.c25
-rw-r--r--fs/binfmt_flat.c2
-rw-r--r--fs/fat/cache.c70
-rw-r--r--fs/fat/fat.h9
-rw-r--r--fs/fat/file.c78
-rw-r--r--fs/fat/inode.c429
-rw-r--r--fs/hfsplus/attributes.c36
-rw-r--r--fs/hfsplus/catalog.c89
-rw-r--r--fs/hfsplus/dir.c22
-rw-r--r--fs/hfsplus/hfsplus_fs.h7
-rw-r--r--fs/hfsplus/super.c4
-rw-r--r--fs/hfsplus/xattr.c51
-rw-r--r--fs/hfsplus/xattr_security.c49
-rw-r--r--fs/hfsplus/xattr_trusted.c32
-rw-r--r--fs/hfsplus/xattr_user.c32
-rw-r--r--fs/hugetlbfs/inode.c21
-rw-r--r--fs/jffs2/background.c12
-rw-r--r--fs/mpage.c23
-rw-r--r--fs/notify/fanotify/fanotify_user.c17
-rw-r--r--fs/notify/mark.c2
-rw-r--r--fs/ntfs/compress.c2
-rw-r--r--fs/ntfs/super.c4
-rw-r--r--fs/ntfs/sysctl.c2
-rw-r--r--fs/ocfs2/cluster/tcp.c33
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c2
-rw-r--r--fs/ocfs2/dlm/dlmlock.c2
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c20
-rw-r--r--fs/ocfs2/journal.c17
-rw-r--r--fs/ocfs2/namei.c145
-rw-r--r--fs/ocfs2/stackglue.c2
-rw-r--r--fs/ocfs2/super.c8
-rw-r--r--fs/ocfs2/uptodate.c2
-rw-r--r--fs/proc/task_mmu.c287
-rw-r--r--fs/readdir.c2
-rw-r--r--fs/reiserfs/bitmap.c13
-rw-r--r--include/asm-generic/pgtable.h8
-rw-r--r--include/linux/crc64_ecma.h56
-rw-r--r--include/linux/dma-contiguous.h9
-rw-r--r--include/linux/gfp.h10
-rw-r--r--include/linux/hugetlb.h22
-rw-r--r--include/linux/hugetlb_inline.h7
-rw-r--r--include/linux/idr.h13
-rw-r--r--include/linux/input.h21
-rw-r--r--include/linux/mc146818rtc.h4
-rw-r--r--include/linux/memblock.h2
-rw-r--r--include/linux/memcontrol.h17
-rw-r--r--include/linux/memory_hotplug.h14
-rw-r--r--include/linux/mm.h43
-rw-r--r--include/linux/mm_types.h2
-rw-r--r--include/linux/mmdebug.h15
-rw-r--r--include/linux/mmzone.h9
-rw-r--r--include/linux/page-flags.h3
-rw-r--r--include/linux/pagemap.h28
-rw-r--r--include/linux/scatterlist.h2
-rw-r--r--include/linux/sched.h13
-rw-r--r--include/linux/signal.h21
-rw-r--r--include/linux/slab.h11
-rw-r--r--include/linux/string.h1
-rw-r--r--include/linux/swap.h19
-rw-r--r--include/linux/swapops.h2
-rw-r--r--include/linux/swiotlb.h2
-rw-r--r--include/linux/thread_info.h2
-rw-r--r--include/linux/topology.h3
-rw-r--r--include/linux/vm_event_item.h4
-rw-r--r--include/linux/vmstat.h6
-rw-r--r--include/scsi/scsi.h2
-rw-r--r--include/trace/events/gfpflags.h1
-rw-r--r--init/Kconfig23
-rw-r--r--init/main.c74
-rw-r--r--ipc/compat.c2
-rw-r--r--ipc/compat_mq.c2
-rw-r--r--ipc/msg.c42
-rw-r--r--ipc/sem.c22
-rw-r--r--ipc/shm.c14
-rw-r--r--ipc/util.c12
-rw-r--r--ipc/util.h10
-rw-r--r--kernel/acct.c6
-rw-r--r--kernel/audit.c2
-rw-r--r--kernel/exit.c43
-rw-r--r--kernel/fork.c10
-rw-r--r--kernel/kexec.c1
-rw-r--r--kernel/kmod.c5
-rw-r--r--kernel/panic.c23
-rw-r--r--kernel/printk/printk.c144
-rw-r--r--kernel/signal.c94
-rw-r--r--kernel/sys_ni.c2
-rw-r--r--kernel/time/sched_clock.c4
-rw-r--r--kernel/watchdog.c10
-rw-r--r--lib/Kconfig14
-rw-r--r--lib/Kconfig.debug10
-rw-r--r--lib/Makefile1
-rw-r--r--lib/crc64_ecma.c341
-rw-r--r--lib/idr.c40
-rw-r--r--lib/scatterlist.c4
-rw-r--r--lib/string.c8
-rw-r--r--lib/swiotlb.c2
-rw-r--r--lib/vsprintf.c20
-rw-r--r--lib/xz/Kconfig24
-rw-r--r--lib/xz/xz_dec_lzma2.c4
-rw-r--r--mm/Makefile2
-rw-r--r--mm/bounce.c7
-rw-r--r--mm/compaction.c67
-rw-r--r--mm/filemap.c49
-rw-r--r--mm/fremap.c7
-rw-r--r--mm/gup.c640
-rw-r--r--mm/huge_memory.c34
-rw-r--r--mm/hugetlb.c355
-rw-r--r--mm/kmemleak.c4
-rw-r--r--mm/memblock.c21
-rw-r--r--mm/memcontrol.c151
-rw-r--r--mm/memory-failure.c14
-rw-r--r--mm/memory.c650
-rw-r--r--mm/memory_hotplug.c142
-rw-r--r--mm/mempolicy.c264
-rw-r--r--mm/mempool.c2
-rw-r--r--mm/mmap.c30
-rw-r--r--mm/nommu.c5
-rw-r--r--mm/page_alloc.c106
-rw-r--r--mm/pagewalk.c375
-rw-r--r--mm/rmap.c14
-rw-r--r--mm/slab.c33
-rw-r--r--mm/slab.h30
-rw-r--r--mm/slab_common.c59
-rw-r--r--mm/slob.c3
-rw-r--r--mm/slub.c50
-rw-r--r--mm/swap.c31
-rw-r--r--mm/truncate.c8
-rw-r--r--mm/util.c30
-rw-r--r--mm/vmacache.c19
-rw-r--r--mm/vmscan.c106
-rw-r--r--mm/vmstat.c4
-rwxr-xr-xscripts/checkpatch.pl46
-rw-r--r--usr/Kconfig77
255 files changed, 5391 insertions, 2937 deletions
diff --git a/CREDITS b/CREDITS
index c322dcfb926d..ce275b6f4873 100644
--- a/CREDITS
+++ b/CREDITS
@@ -1377,6 +1377,9 @@ S: 17 rue Danton
S: F - 94270 Le Kremlin-Bicêtre
S: France
+N: Jack Hammer
+D: IBM ServeRAID RAID (ips) driver maintenance
+
N: Greg Hankins
E: gregh@cc.gatech.edu
D: fixed keyboard driver to separate LED and locking status
@@ -1687,6 +1690,10 @@ S: Reading
S: RG6 2NU
S: United Kingdom
+N: Dave Jeffery
+E: dhjeffery@gmail.com
+D: SCSI hacks and IBM ServeRAID RAID driver maintenance
+
N: Jakub Jelinek
E: jakub@redhat.com
W: http://sunsite.mff.cuni.cz/~jj
diff --git a/Documentation/SubmittingPatches b/Documentation/SubmittingPatches
index 2a8e89e13e45..7e9abb8a276b 100644
--- a/Documentation/SubmittingPatches
+++ b/Documentation/SubmittingPatches
@@ -132,6 +132,20 @@ Example:
platform_set_drvdata(), but left the variable "dev" unused,
delete it.
+If your patch fixes a bug in a specific commit, e.g. you found an issue using
+git-bisect, please use the 'Fixes:' tag with the first 12 characters of the
+SHA-1 ID, and the one line summary.
+Example:
+
+ Fixes: e21d2170f366 ("video: remove unnecessary platform_set_drvdata()")
+
+The following git-config settings can be used to add a pretty format for
+outputting the above style in the git log or git show commands
+
+ [core]
+ abbrev = 12
+ [pretty]
+ fixes = Fixes: %h (\"%s\")
3) Separate your changes.
@@ -443,7 +457,7 @@ person it names. This tag documents that potentially interested parties
have been included in the discussion
-14) Using Reported-by:, Tested-by:, Reviewed-by: and Suggested-by:
+14) Using Reported-by:, Tested-by:, Reviewed-by:, Suggested-by: and Fixes:
If this patch fixes a problem reported by somebody else, consider adding a
Reported-by: tag to credit the reporter for their contribution. Please
@@ -498,6 +512,12 @@ idea was not posted in a public forum. That said, if we diligently credit our
idea reporters, they will, hopefully, be inspired to help us again in the
future.
+A Fixes: tag indicates that the patch fixes an issue in a previous commit. It
+is used to make it easy to determine where a bug originated, which can help
+review a bug fix. This tag also assists the stable kernel team in determining
+which stable kernel versions should receive your fix. This is the preferred
+method for indicating a bug fixed by the patch. See #2 above for more details.
+
15) The canonical patch format
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 2622115276aa..4937e6fff9b4 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -270,6 +270,11 @@ When oom event notifier is registered, event will be delivered.
2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM)
+WARNING: Current implementation lacks reclaim support. That means allocation
+ attempts will fail when close to the limit even if there are plenty of
+ kmem available for reclaim. That makes this option unusable in real
+ life so DO NOT SELECT IT unless for development purposes.
+
With the Kernel memory extension, the Memory Controller is able to limit
the amount of kernel memory used by the system. Kernel memory is fundamentally
different than user memory, since it can't be swapped out, which makes it
@@ -535,17 +540,15 @@ Note:
5.3 swappiness
-Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only.
+Similar to /proc/sys/vm/swappiness, but only affecting reclaim that is
+triggered by this cgroup's hard limit. The tunable in the root cgroup
+corresponds to the global swappiness setting.
+
Please note that unlike the global swappiness, memcg knob set to 0
really prevents from any swapping even if there is a swap storage
available. This might lead to memcg OOM killer if there are no file
pages to reclaim.
-Following cgroups' swappiness can't be changed.
-- root cgroup (uses /proc/sys/vm/swappiness).
-- a cgroup which uses hierarchy and it has other cgroup(s) below it.
-- a cgroup which uses hierarchy and not the root of hierarchy.
-
5.4 failcnt
A memory cgroup provides memory.failcnt and memory.memsw.failcnt files.
@@ -754,7 +757,6 @@ You can disable the OOM-killer by writing "1" to memory.oom_control file, as:
#echo 1 > memory.oom_control
-This operation is only allowed to the top cgroup of a sub-hierarchy.
If OOM-killer is disabled, tasks under cgroup will hang/sleep
in memory cgroup's OOM-waitqueue when they request accountable memory.
diff --git a/Documentation/devicetree/bindings/rtc/xgene-rtc.txt b/Documentation/devicetree/bindings/rtc/xgene-rtc.txt
new file mode 100644
index 000000000000..fd195c358446
--- /dev/null
+++ b/Documentation/devicetree/bindings/rtc/xgene-rtc.txt
@@ -0,0 +1,28 @@
+* APM X-Gene Real Time Clock
+
+RTC controller for the APM X-Gene Real Time Clock
+
+Required properties:
+- compatible : Should be "apm,xgene-rtc"
+- reg: physical base address of the controller and length of memory mapped
+ region.
+- interrupts: IRQ line for the RTC.
+- #clock-cells: Should be 1.
+- clocks: Reference to the clock entry.
+
+Example:
+
+rtcclk: rtcclk {
+ compatible = "fixed-clock";
+ #clock-cells = <1>;
+ clock-frequency = <100000000>;
+ clock-output-names = "rtcclk";
+};
+
+rtc: rtc@10510000 {
+ compatible = "apm,xgene-rtc";
+ reg = <0x0 0x10510000 0x0 0x400>;
+ interrupts = <0x0 0x46 0x4>;
+ #clock-cells = <1>;
+ clocks = <&rtcclk 0>;
+};
diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt
index 4a93e98b290a..223c32171dcc 100644
--- a/Documentation/filesystems/vfat.txt
+++ b/Documentation/filesystems/vfat.txt
@@ -172,9 +172,24 @@ nfs=stale_rw|nostale_ro
To maintain backward compatibility, '-o nfs' is also accepted,
defaulting to stale_rw
+dos1xfloppy -- If set, use a fallback default BIOS Parameter Block
+ configuration, determined by backing device size. These static
+ parameters match defaults assumed by DOS 1.x for 160 kiB,
+ 180 kiB, 320 kiB, and 360 kiB floppies and floppy images.
+
<bool>: 0,1,yes,no,true,false
+LIMITATION
+---------------------------------------------------------------------
+* The fallocated region of file is discarded at umount/evict time
+ when using fallocate with FALLOC_FL_KEEP_SIZE.
+ So, User should assume that fallocated region can be discarded at
+ last close if there is memory pressure resulting in eviction of
+ the inode from the memory. As a result, for any dependency on
+ the fallocated region, user should make sure to recheck fallocate
+ after reopening the file.
+
TODO
----------------------------------------------------------------------
* Need to get rid of the raw scanning stuff. Instead, always use
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 60a278948652..296fb10ba877 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -625,8 +625,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
Also note the kernel might malfunction if you disable
some critical bits.
- cma=nn[MG] [ARM,KNL]
- Sets the size of kernel global memory area for contiguous
+ cma=nn[MG]@[start[MG][-end[MG]]]
+ [ARM,X86,KNL]
+ Sets the size of kernel global memory area for
+ contiguous memory allocations and optionally the
+ placement constraint by the physical address range of
memory allocations. For more information, see
include/linux/dma-contiguous.h
@@ -1295,6 +1298,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
for working out where the kernel is dying during
startup.
+ initcall_blacklist= [KNL] Do not execute a comma-separated list of
+ initcall functions. Useful for debugging built-in
+ modules and initcalls.
+
initrd= [BOOT] Specify the location of the initial ramdisk
inport.irq= [HW] Inport (ATI XL and Microsoft) busmouse driver
@@ -2340,6 +2347,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
timeout < 0: reboot immediately
Format: <timeout>
+ crash_kexec_post_notifiers
+ Run kdump after running panic-notifiers and dumping
+ kmsg. This only for the users who doubt kdump always
+ succeeds in any situation.
+ Note that this also increases risks of kdump failure,
+ because some panic notifiers can make the crashed
+ kernel more unstable.
+
parkbd.port= [HW] Parallel port number the keyboard adapter is
connected to, default is 0.
Format: <parport#>
diff --git a/Documentation/leds/leds-class.txt b/Documentation/leds/leds-class.txt
index 79699c200766..62261c04060a 100644
--- a/Documentation/leds/leds-class.txt
+++ b/Documentation/leds/leds-class.txt
@@ -2,9 +2,6 @@
LED handling under Linux
========================
-If you're reading this and thinking about keyboard leds, these are
-handled by the input subsystem and the led class is *not* needed.
-
In its simplest form, the LED class just allows control of LEDs from
userspace. LEDs appear in /sys/class/leds/. The maximum brightness of the
LED is defined in max_brightness file. The brightness file will set the brightness
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
index 58340d50f8a6..f304edb8fbe7 100644
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -88,16 +88,21 @@ phase by hand.
1.3. Unit of Memory online/offline operation
------------
-Memory hotplug uses SPARSEMEM memory model. SPARSEMEM divides the whole memory
-into chunks of the same size. The chunk is called a "section". The size of
-a section is architecture dependent. For example, power uses 16MiB, ia64 uses
-1GiB. The unit of online/offline operation is "one section". (see Section 3.)
+Memory hotplug uses SPARSEMEM memory model which allows memory to be divided
+into chunks of the same size. These chunks are called "sections". The size of
+a memory section is architecture dependent. For example, power uses 16MiB, ia64
+uses 1GiB.
-To determine the size of sections, please read this file:
+Memory sections are combined into chunks referred to as "memory blocks". The
+size of a memory block is architecture dependent and represents the logical
+unit upon which memory online/offline operations are to be performed. The
+default size of a memory block is the same as memory section size unless an
+architecture specifies otherwise. (see Section 3.)
+
+To determine the size (in bytes) of a memory block please read this file:
/sys/devices/system/memory/block_size_bytes
-This file shows the size of sections in byte.
-----------------------
2. Kernel Configuration
@@ -123,42 +128,35 @@ config options.
(CONFIG_ACPI_CONTAINER).
This option can be kernel module too.
+
--------------------------------
-4 sysfs files for memory hotplug
+3 sysfs files for memory hotplug
--------------------------------
-All sections have their device information in sysfs. Each section is part of
-a memory block under /sys/devices/system/memory as
+All memory blocks have their device information in sysfs. Each memory block
+is described under /sys/devices/system/memory as
/sys/devices/system/memory/memoryXXX
-(XXX is the section id.)
+(XXX is the memory block id.)
-Now, XXX is defined as (start_address_of_section / section_size) of the first
-section contained in the memory block. The files 'phys_index' and
-'end_phys_index' under each directory report the beginning and end section id's
-for the memory block covered by the sysfs directory. It is expected that all
+For the memory block covered by the sysfs directory. It is expected that all
memory sections in this range are present and no memory holes exist in the
range. Currently there is no way to determine if there is a memory hole, but
the existence of one should not affect the hotplug capabilities of the memory
block.
-For example, assume 1GiB section size. A device for a memory starting at
+For example, assume 1GiB memory block size. A device for a memory starting at
0x100000000 is /sys/device/system/memory/memory4
(0x100000000 / 1Gib = 4)
This device covers address range [0x100000000 ... 0x140000000)
-Under each section, you can see 4 or 5 files, the end_phys_index file being
-a recent addition and not present on older kernels.
+Under each memory block, you can see 4 files:
-/sys/devices/system/memory/memoryXXX/start_phys_index
-/sys/devices/system/memory/memoryXXX/end_phys_index
+/sys/devices/system/memory/memoryXXX/phys_index
/sys/devices/system/memory/memoryXXX/phys_device
/sys/devices/system/memory/memoryXXX/state
/sys/devices/system/memory/memoryXXX/removable
-'phys_index' : read-only and contains section id of the first section
- in the memory block, same as XXX.
-'end_phys_index' : read-only and contains section id of the last section
- in the memory block.
+'phys_index' : read-only and contains memory block id, same as XXX.
'state' : read-write
at read: contains online/offline state of memory.
at write: user can specify "online_kernel",
@@ -185,6 +183,7 @@ For example:
A backlink will also be created:
/sys/devices/system/memory/memory9/node0 -> ../../node/node0
+
--------------------------------
4. Physical memory hot-add phase
--------------------------------
@@ -227,11 +226,10 @@ You can tell the physical address of new memory to the kernel by
% echo start_address_of_new_memory > /sys/devices/system/memory/probe
-Then, [start_address_of_new_memory, start_address_of_new_memory + section_size)
-memory range is hot-added. In this case, hotplug script is not called (in
-current implementation). You'll have to online memory by yourself.
-Please see "How to online memory" in this text.
-
+Then, [start_address_of_new_memory, start_address_of_new_memory +
+memory_block_size] memory range is hot-added. In this case, hotplug script is
+not called (in current implementation). You'll have to online memory by
+yourself. Please see "How to online memory" in this text.
------------------------------
@@ -240,36 +238,36 @@ Please see "How to online memory" in this text.
5.1. State of memory
------------
-To see (online/offline) state of memory section, read 'state' file.
+To see (online/offline) state of a memory block, read 'state' file.
% cat /sys/device/system/memory/memoryXXX/state
-If the memory section is online, you'll read "online".
-If the memory section is offline, you'll read "offline".
+If the memory block is online, you'll read "online".
+If the memory block is offline, you'll read "offline".
5.2. How to online memory
------------
Even if the memory is hot-added, it is not at ready-to-use state.
-For using newly added memory, you have to "online" the memory section.
+For using newly added memory, you have to "online" the memory block.
-For onlining, you have to write "online" to the section's state file as:
+For onlining, you have to write "online" to the memory block's state file as:
% echo online > /sys/devices/system/memory/memoryXXX/state
-This onlining will not change the ZONE type of the target memory section,
-If the memory section is in ZONE_NORMAL, you can change it to ZONE_MOVABLE:
+This onlining will not change the ZONE type of the target memory block,
+If the memory block is in ZONE_NORMAL, you can change it to ZONE_MOVABLE:
% echo online_movable > /sys/devices/system/memory/memoryXXX/state
-(NOTE: current limit: this memory section must be adjacent to ZONE_MOVABLE)
+(NOTE: current limit: this memory block must be adjacent to ZONE_MOVABLE)
-And if the memory section is in ZONE_MOVABLE, you can change it to ZONE_NORMAL:
+And if the memory block is in ZONE_MOVABLE, you can change it to ZONE_NORMAL:
% echo online_kernel > /sys/devices/system/memory/memoryXXX/state
-(NOTE: current limit: this memory section must be adjacent to ZONE_NORMAL)
+(NOTE: current limit: this memory block must be adjacent to ZONE_NORMAL)
-After this, section memoryXXX's state will be 'online' and the amount of
+After this, memory block XXX's state will be 'online' and the amount of
available memory will be increased.
Currently, newly added memory is added as ZONE_NORMAL (for powerpc, ZONE_DMA).
@@ -284,22 +282,22 @@ This may be changed in future.
6.1 Memory offline and ZONE_MOVABLE
------------
Memory offlining is more complicated than memory online. Because memory offline
-has to make the whole memory section be unused, memory offline can fail if
-the section includes memory which cannot be freed.
+has to make the whole memory block be unused, memory offline can fail if
+the memory block includes memory which cannot be freed.
In general, memory offline can use 2 techniques.
-(1) reclaim and free all memory in the section.
-(2) migrate all pages in the section.
+(1) reclaim and free all memory in the memory block.
+(2) migrate all pages in the memory block.
In the current implementation, Linux's memory offline uses method (2), freeing
-all pages in the section by page migration. But not all pages are
+all pages in the memory block by page migration. But not all pages are
migratable. Under current Linux, migratable pages are anonymous pages and
-page caches. For offlining a section by migration, the kernel has to guarantee
-that the section contains only migratable pages.
+page caches. For offlining a memory block by migration, the kernel has to
+guarantee that the memory block contains only migratable pages.
-Now, a boot option for making a section which consists of migratable pages is
-supported. By specifying "kernelcore=" or "movablecore=" boot option, you can
+Now, a boot option for making a memory block which consists of migratable pages
+is supported. By specifying "kernelcore=" or "movablecore=" boot option, you can
create ZONE_MOVABLE...a zone which is just used for movable pages.
(See also Documentation/kernel-parameters.txt)
@@ -315,28 +313,27 @@ creates ZONE_MOVABLE as following.
Size of memory for movable pages (for offline) is ZZZZ.
-Note) Unfortunately, there is no information to show which section belongs
+Note: Unfortunately, there is no information to show which memory block belongs
to ZONE_MOVABLE. This is TBD.
6.2. How to offline memory
------------
-You can offline a section by using the same sysfs interface that was used in
-memory onlining.
+You can offline a memory block by using the same sysfs interface that was used
+in memory onlining.
% echo offline > /sys/devices/system/memory/memoryXXX/state
-If offline succeeds, the state of the memory section is changed to be "offline".
+If offline succeeds, the state of the memory block is changed to be "offline".
If it fails, some error core (like -EBUSY) will be returned by the kernel.
-Even if a section does not belong to ZONE_MOVABLE, you can try to offline it.
-If it doesn't contain 'unmovable' memory, you'll get success.
+Even if a memory block does not belong to ZONE_MOVABLE, you can try to offline
+it. If it doesn't contain 'unmovable' memory, you'll get success.
-A section under ZONE_MOVABLE is considered to be able to be offlined easily.
-But under some busy state, it may return -EBUSY. Even if a memory section
-cannot be offlined due to -EBUSY, you can retry offlining it and may be able to
-offline it (or not).
-(For example, a page is referred to by some kernel internal call and released
- soon.)
+A memory block under ZONE_MOVABLE is considered to be able to be offlined
+easily. But under some busy state, it may return -EBUSY. Even if a memory
+block cannot be offlined due to -EBUSY, you can retry offlining it and may be
+able to offline it (or not). (For example, a page is referred to by some kernel
+internal call and released soon.)
Consideration:
Memory hotplug's design direction is to make the possibility of memory offlining
@@ -373,11 +370,11 @@ MEMORY_GOING_OFFLINE
Generated to begin the process of offlining memory. Allocations are no
longer possible from the memory but some of the memory to be offlined
is still in use. The callback can be used to free memory known to a
- subsystem from the indicated memory section.
+ subsystem from the indicated memory block.
MEMORY_CANCEL_OFFLINE
Generated if MEMORY_GOING_OFFLINE fails. Memory is available again from
- the section that we attempted to offline.
+ the memory block that we attempted to offline.
MEMORY_OFFLINE
Generated after offlining memory is complete.
@@ -413,8 +410,8 @@ node if necessary.
--------------
- allowing memory hot-add to ZONE_MOVABLE. maybe we need some switch like
sysctl or new control file.
- - showing memory section and physical device relationship.
- - showing memory section is under ZONE_MOVABLE or not
+ - showing memory block and physical device relationship.
+ - showing memory block is under ZONE_MOVABLE or not
- test and make it better memory offlining.
- support HugeTLB page migration and offlining.
- memmap removing at memory offline.
diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt
index 6f4eb322ffaf..94459b42e0ab 100644
--- a/Documentation/printk-formats.txt
+++ b/Documentation/printk-formats.txt
@@ -184,6 +184,12 @@ dentry names:
equivalent of %s dentry->d_name.name we used to use, %pd<n> prints
n last components. %pD does the same thing for struct file.
+task_struct comm name:
+
+ %pT
+
+ For printing task_struct->comm.
+
struct va_format:
%pV
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index dd9d0e33b443..5b6da0fb5fbf 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -772,16 +772,17 @@ This is value ORed together of
2 = Zone reclaim writes dirty pages out
4 = Zone reclaim swaps pages
-zone_reclaim_mode is set during bootup to 1 if it is determined that pages
-from remote zones will cause a measurable performance reduction. The
-page allocator will then reclaim easily reusable pages (those page
-cache pages that are currently not used) before allocating off node pages.
-
-It may be beneficial to switch off zone reclaim if the system is
-used for a file server and all of memory should be used for caching files
-from disk. In that case the caching effect is more important than
+zone_reclaim_mode is disabled by default. For file servers or workloads
+that benefit from having their data cached, zone_reclaim_mode should be
+left disabled as the caching effect is likely to be more important than
data locality.
+zone_reclaim may be enabled if it's known that the workload is partitioned
+such that each partition fits within a NUMA node and that accessing remote
+memory would cause a measurable performance reduction. The page allocator
+will then reclaim easily reusable pages (those page cache pages that are
+currently not used) before allocating off node pages.
+
Allowing zone reclaim to write out pages stops processes that are
writing large amounts of data from dirtying pages on other nodes. Zone
reclaim will write out dirty pages if a zone fills up and so effectively
diff --git a/MAINTAINERS b/MAINTAINERS
index 04de51f4fcdd..e38b193b29fc 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4384,10 +4384,7 @@ F: drivers/scsi/ibmvscsi/
X: drivers/scsi/ibmvscsi/ibmvstgt.c
IBM ServeRAID RAID DRIVER
-P: Jack Hammer
-M: Dave Jeffery <ipslinux@adaptec.com>
-W: http://www.developer.ibm.com/welcome/netfinity/serveraid.html
-S: Supported
+S: Orphan
F: drivers/scsi/ips.*
ICH LPC AND GPIO DRIVER
diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild
index 96e54bed5088..e858aa0ad8af 100644
--- a/arch/alpha/include/asm/Kbuild
+++ b/arch/alpha/include/asm/Kbuild
@@ -6,4 +6,5 @@ generic-y += exec.h
generic-y += hash.h
generic-y += mcs_spinlock.h
generic-y += preempt.h
+generic-y += scatterlist.h
generic-y += trace_clock.h
diff --git a/arch/alpha/include/asm/scatterlist.h b/arch/alpha/include/asm/scatterlist.h
deleted file mode 100644
index 017d7471c3c4..000000000000
--- a/arch/alpha/include/asm/scatterlist.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _ALPHA_SCATTERLIST_H
-#define _ALPHA_SCATTERLIST_H
-
-#include <asm-generic/scatterlist.h>
-
-#endif /* !(_ALPHA_SCATTERLIST_H) */
diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c
index 73a7450ee622..1badf9b84b51 100644
--- a/arch/arc/kernel/troubleshoot.c
+++ b/arch/arc/kernel/troubleshoot.c
@@ -86,12 +86,13 @@ static void show_faulting_vma(unsigned long address, char *buf)
unsigned long ino = 0;
dev_t dev = 0;
char *nm = buf;
+ struct mm_struct *active_mm = current->active_mm;
/* can't use print_vma_addr() yet as it doesn't check for
* non-inclusive vma
*/
-
- vma = find_vma(current->active_mm, address);
+ down_read(&active_mm->mmap_sem);
+ vma = find_vma(active_mm, address);
/* check against the find_vma( ) behaviour which returns the next VMA
* if the container VMA is not found
@@ -110,9 +111,10 @@ static void show_faulting_vma(unsigned long address, char *buf)
vma->vm_start < TASK_UNMAPPED_BASE ?
address : address - vma->vm_start,
nm, vma->vm_start, vma->vm_end);
- } else {
+ } else
pr_info(" @No matching VMA found\n");
- }
+
+ up_read(&active_mm->mmap_sem);
}
static void show_ecr_verbose(struct pt_regs *regs)
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 291d40bb5cc2..4ec58c5dc3b3 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -83,6 +83,7 @@ config ARM
<http://www.arm.linux.org.uk/>.
config ARM_HAS_SG_CHAIN
+ select ARCH_HAS_SG_CHAIN
bool
config NEED_SG_DMA_LENGTH
diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild
index 23e728ecf8ab..2d95820276fd 100644
--- a/arch/arm/include/asm/Kbuild
+++ b/arch/arm/include/asm/Kbuild
@@ -21,6 +21,7 @@ generic-y += parport.h
generic-y += poll.h
generic-y += preempt.h
generic-y += resource.h
+generic-y += scatterlist.h
generic-y += sections.h
generic-y += segment.h
generic-y += sembuf.h
diff --git a/arch/arm/include/asm/fixmap.h b/arch/arm/include/asm/fixmap.h
index bbae919bceb4..68ea615c2a28 100644
--- a/arch/arm/include/asm/fixmap.h
+++ b/arch/arm/include/asm/fixmap.h
@@ -14,28 +14,15 @@
*/
#define FIXADDR_START 0xfff00000UL
-#define FIXADDR_TOP 0xfffe0000UL
-#define FIXADDR_SIZE (FIXADDR_TOP - FIXADDR_START)
+#define FIXADDR_END 0xfffe0000UL
+#define FIXADDR_TOP (FIXADDR_END - PAGE_SIZE)
-#define FIX_KMAP_BEGIN 0
-#define FIX_KMAP_END (FIXADDR_SIZE >> PAGE_SHIFT)
+enum fixed_addresses {
+ FIX_KMAP_BEGIN,
+ FIX_KMAP_END = (FIXADDR_TOP - FIXADDR_START) >> PAGE_SHIFT,
+ __end_of_fixed_addresses
+};
-#define __fix_to_virt(x) (FIXADDR_START + ((x) << PAGE_SHIFT))
-#define __virt_to_fix(x) (((x) - FIXADDR_START) >> PAGE_SHIFT)
-
-extern void __this_fixmap_does_not_exist(void);
-
-static inline unsigned long fix_to_virt(const unsigned int idx)
-{
- if (idx >= FIX_KMAP_END)
- __this_fixmap_does_not_exist();
- return __fix_to_virt(idx);
-}
-
-static inline unsigned int virt_to_fix(const unsigned long vaddr)
-{
- BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
- return __virt_to_fix(vaddr);
-}
+#include <asm-generic/fixmap.h>
#endif
diff --git a/arch/arm/include/asm/scatterlist.h b/arch/arm/include/asm/scatterlist.h
deleted file mode 100644
index cefdb8f898a1..000000000000
--- a/arch/arm/include/asm/scatterlist.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef _ASMARM_SCATTERLIST_H
-#define _ASMARM_SCATTERLIST_H
-
-#ifdef CONFIG_ARM_HAS_SG_CHAIN
-#define ARCH_HAS_SG_CHAIN
-#endif
-
-#include <asm/memory.h>
-#include <asm/types.h>
-#include <asm-generic/scatterlist.h>
-
-#endif /* _ASMARM_SCATTERLIST_H */
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 2a77ba8796ae..91a468225853 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -578,7 +578,7 @@ void __init mem_init(void)
MLK(DTCM_OFFSET, (unsigned long) dtcm_end),
MLK(ITCM_OFFSET, (unsigned long) itcm_end),
#endif
- MLK(FIXADDR_START, FIXADDR_TOP),
+ MLK(FIXADDR_START, FIXADDR_END),
MLM(VMALLOC_START, VMALLOC_END),
MLM(PAGE_OFFSET, (unsigned long)high_memory),
#ifdef CONFIG_HIGHMEM
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index e6e4d3749a6e..df24b9323ac6 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -2,6 +2,7 @@ config ARM64
def_bool y
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select ARCH_USE_CMPXCHG_LOCKREF
+ select ARCH_HAS_SG_CHAIN
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_WANT_OPTIONAL_GPIOLIB
select ARCH_WANT_COMPAT_IPC_PARSE_VERSION
diff --git a/arch/arm64/boot/dts/apm-storm.dtsi b/arch/arm64/boot/dts/apm-storm.dtsi
index 93f4b2dd9248..4917f3b81a44 100644
--- a/arch/arm64/boot/dts/apm-storm.dtsi
+++ b/arch/arm64/boot/dts/apm-storm.dtsi
@@ -257,6 +257,19 @@
enable-offset = <0x0>;
enable-mask = <0x39>;
};
+
+ rtcclk: rtcclk@17000000 {
+ compatible = "apm,xgene-device-clock";
+ #clock-cells = <1>;
+ clocks = <&socplldiv2 0>;
+ reg = <0x0 0x17000000 0x0 0x2000>;
+ reg-names = "csr-reg";
+ csr-offset = <0xc>;
+ csr-mask = <0x2>;
+ enable-offset = <0x10>;
+ enable-mask = <0x2>;
+ clock-output-names = "rtcclk";
+ };
};
serial0: serial@1c020000 {
@@ -339,5 +352,13 @@
phys = <&phy3 0>;
phy-names = "sata-phy";
};
+
+ rtc: rtc@10510000 {
+ compatible = "apm,xgene-rtc";
+ reg = <0x0 0x10510000 0x0 0x400>;
+ interrupts = <0x0 0x46 0x4>;
+ #clock-cells = <1>;
+ clocks = <&rtcclk 0>;
+ };
};
};
diff --git a/arch/blackfin/include/asm/unistd.h b/arch/blackfin/include/asm/unistd.h
index c35414bdf7bd..c8c8ff9eff61 100644
--- a/arch/blackfin/include/asm/unistd.h
+++ b/arch/blackfin/include/asm/unistd.h
@@ -12,7 +12,6 @@
#define __ARCH_WANT_SYS_ALARM
#define __ARCH_WANT_SYS_GETHOSTNAME
#define __ARCH_WANT_SYS_PAUSE
-#define __ARCH_WANT_SYS_SGETMASK
#define __ARCH_WANT_SYS_TIME
#define __ARCH_WANT_SYS_FADVISE64
#define __ARCH_WANT_SYS_GETPGRP
diff --git a/arch/cris/include/asm/Kbuild b/arch/cris/include/asm/Kbuild
index afff5105909d..31742dfadff9 100644
--- a/arch/cris/include/asm/Kbuild
+++ b/arch/cris/include/asm/Kbuild
@@ -13,6 +13,7 @@ generic-y += linkage.h
generic-y += mcs_spinlock.h
generic-y += module.h
generic-y += preempt.h
+generic-y += scatterlist.h
generic-y += trace_clock.h
generic-y += vga.h
generic-y += xor.h
diff --git a/arch/cris/include/asm/scatterlist.h b/arch/cris/include/asm/scatterlist.h
deleted file mode 100644
index f11f8f40ec4a..000000000000
--- a/arch/cris/include/asm/scatterlist.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __ASM_CRIS_SCATTERLIST_H
-#define __ASM_CRIS_SCATTERLIST_H
-
-#include <asm-generic/scatterlist.h>
-
-#endif /* !(__ASM_CRIS_SCATTERLIST_H) */
diff --git a/arch/cris/include/asm/unistd.h b/arch/cris/include/asm/unistd.h
index 5cc7d1991e48..0f40fed1ba25 100644
--- a/arch/cris/include/asm/unistd.h
+++ b/arch/cris/include/asm/unistd.h
@@ -15,7 +15,6 @@
#define __ARCH_WANT_SYS_GETHOSTNAME
#define __ARCH_WANT_SYS_IPC
#define __ARCH_WANT_SYS_PAUSE
-#define __ARCH_WANT_SYS_SGETMASK
#define __ARCH_WANT_SYS_SIGNAL
#define __ARCH_WANT_SYS_TIME
#define __ARCH_WANT_SYS_UTIME
diff --git a/arch/frv/include/asm/Kbuild b/arch/frv/include/asm/Kbuild
index 87b95eb8aee5..5b73921b6e9d 100644
--- a/arch/frv/include/asm/Kbuild
+++ b/arch/frv/include/asm/Kbuild
@@ -5,4 +5,5 @@ generic-y += exec.h
generic-y += hash.h
generic-y += mcs_spinlock.h
generic-y += preempt.h
+generic-y += scatterlist.h
generic-y += trace_clock.h
diff --git a/arch/frv/include/asm/scatterlist.h b/arch/frv/include/asm/scatterlist.h
deleted file mode 100644
index 0e5eb3018468..000000000000
--- a/arch/frv/include/asm/scatterlist.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _ASM_SCATTERLIST_H
-#define _ASM_SCATTERLIST_H
-
-#include <asm-generic/scatterlist.h>
-
-#endif /* !_ASM_SCATTERLIST_H */
diff --git a/arch/frv/include/asm/unistd.h b/arch/frv/include/asm/unistd.h
index 70ec7293dce7..17b5df8fc28a 100644
--- a/arch/frv/include/asm/unistd.h
+++ b/arch/frv/include/asm/unistd.h
@@ -13,7 +13,6 @@
/* #define __ARCH_WANT_SYS_GETHOSTNAME */
#define __ARCH_WANT_SYS_IPC
#define __ARCH_WANT_SYS_PAUSE
-/* #define __ARCH_WANT_SYS_SGETMASK */
/* #define __ARCH_WANT_SYS_SIGNAL */
#define __ARCH_WANT_SYS_TIME
#define __ARCH_WANT_SYS_UTIME
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 12c3afee0f6f..43e7290fbccf 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -27,6 +27,7 @@ config IA64
select HAVE_MEMBLOCK
select HAVE_MEMBLOCK_NODE_MAP
select HAVE_VIRT_CPU_ACCOUNTING
+ select ARCH_HAS_SG_CHAIN
select VIRT_TO_BUS
select ARCH_DISCARD_MEMBLOCK
select GENERIC_IRQ_PROBE
diff --git a/arch/ia64/include/asm/Kbuild b/arch/ia64/include/asm/Kbuild
index 0da4aa2602ae..e8317d2d6c8d 100644
--- a/arch/ia64/include/asm/Kbuild
+++ b/arch/ia64/include/asm/Kbuild
@@ -5,5 +5,6 @@ generic-y += hash.h
generic-y += kvm_para.h
generic-y += mcs_spinlock.h
generic-y += preempt.h
+generic-y += scatterlist.h
generic-y += trace_clock.h
generic-y += vtime.h
diff --git a/arch/ia64/include/asm/scatterlist.h b/arch/ia64/include/asm/scatterlist.h
deleted file mode 100644
index 08fd93bff1db..000000000000
--- a/arch/ia64/include/asm/scatterlist.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef _ASM_IA64_SCATTERLIST_H
-#define _ASM_IA64_SCATTERLIST_H
-
-#include <asm-generic/scatterlist.h>
-#define ARCH_HAS_SG_CHAIN
-
-#endif /* _ASM_IA64_SCATTERLIST_H */
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index 5cb55a1e606b..3555fddc1cf5 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -21,7 +21,8 @@
#define PENALTY_FOR_NODE_WITH_CPUS 255
/*
- * Distance above which we begin to use zone reclaim
+ * Nodes within this distance are eligible for reclaim by zone_reclaim() when
+ * zone_reclaim_mode is enabled.
*/
#define RECLAIM_DISTANCE 15
diff --git a/arch/m32r/include/asm/Kbuild b/arch/m32r/include/asm/Kbuild
index 67779a74b62d..accc10a3dc78 100644
--- a/arch/m32r/include/asm/Kbuild
+++ b/arch/m32r/include/asm/Kbuild
@@ -6,4 +6,5 @@ generic-y += hash.h
generic-y += mcs_spinlock.h
generic-y += module.h
generic-y += preempt.h
+generic-y += scatterlist.h
generic-y += trace_clock.h
diff --git a/arch/m32r/include/asm/scatterlist.h b/arch/m32r/include/asm/scatterlist.h
deleted file mode 100644
index 7370b8b6243e..000000000000
--- a/arch/m32r/include/asm/scatterlist.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _ASM_M32R_SCATTERLIST_H
-#define _ASM_M32R_SCATTERLIST_H
-
-#include <asm-generic/scatterlist.h>
-
-#endif /* _ASM_M32R_SCATTERLIST_H */
diff --git a/arch/m68k/include/asm/signal.h b/arch/m68k/include/asm/signal.h
index 214320b50384..8c8ce5e1ee0e 100644
--- a/arch/m68k/include/asm/signal.h
+++ b/arch/m68k/include/asm/signal.h
@@ -60,15 +60,6 @@ static inline int __gen_sigismember(sigset_t *set, int _sig)
__const_sigismember(set,sig) : \
__gen_sigismember(set,sig))
-static inline int sigfindinword(unsigned long word)
-{
- asm ("bfffo %1{#0,#0},%0"
- : "=d" (word)
- : "d" (word & -word)
- : "cc");
- return word ^ 31;
-}
-
#endif /* !CONFIG_CPU_HAS_NO_BITFIELDS */
#ifndef __uClinux__
diff --git a/arch/m68k/include/asm/unistd.h b/arch/m68k/include/asm/unistd.h
index 9d38b73989eb..6a43c160f50e 100644
--- a/arch/m68k/include/asm/unistd.h
+++ b/arch/m68k/include/asm/unistd.h
@@ -13,7 +13,6 @@
#define __ARCH_WANT_SYS_GETHOSTNAME
#define __ARCH_WANT_SYS_IPC
#define __ARCH_WANT_SYS_PAUSE
-#define __ARCH_WANT_SYS_SGETMASK
#define __ARCH_WANT_SYS_SIGNAL
#define __ARCH_WANT_SYS_TIME
#define __ARCH_WANT_SYS_UTIME
diff --git a/arch/m68k/kernel/sys_m68k.c b/arch/m68k/kernel/sys_m68k.c
index 3a480b3df0d6..d2263a0bdc3d 100644
--- a/arch/m68k/kernel/sys_m68k.c
+++ b/arch/m68k/kernel/sys_m68k.c
@@ -376,7 +376,6 @@ cache_flush_060 (unsigned long addr, int scope, int cache, unsigned long len)
asmlinkage int
sys_cacheflush (unsigned long addr, int scope, int cache, unsigned long len)
{
- struct vm_area_struct *vma;
int ret = -EINVAL;
if (scope < FLUSH_SCOPE_LINE || scope > FLUSH_SCOPE_ALL ||
@@ -389,16 +388,23 @@ sys_cacheflush (unsigned long addr, int scope, int cache, unsigned long len)
if (!capable(CAP_SYS_ADMIN))
goto out;
} else {
+ struct vm_area_struct *vma;
+ bool invalid;
+
+ /* Check for overflow. */
+ if (addr + len < addr)
+ goto out;
+
/*
* Verify that the specified address region actually belongs
* to this process.
*/
- vma = find_vma (current->mm, addr);
ret = -EINVAL;
- /* Check for overflow. */
- if (addr + len < addr)
- goto out;
- if (vma == NULL || addr < vma->vm_start || addr + len > vma->vm_end)
+ down_read(&current->mm->mmap_sem);
+ vma = find_vma(current->mm, addr);
+ invalid = !vma || addr < vma->vm_start || addr + len > vma->vm_end;
+ up_read(&current->mm->mmap_sem);
+ if (invalid)
goto out;
}
diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild
index c98ed95c0541..2ea655be809d 100644
--- a/arch/microblaze/include/asm/Kbuild
+++ b/arch/microblaze/include/asm/Kbuild
@@ -6,5 +6,6 @@ generic-y += exec.h
generic-y += hash.h
generic-y += mcs_spinlock.h
generic-y += preempt.h
+generic-y += scatterlist.h
generic-y += syscalls.h
generic-y += trace_clock.h
diff --git a/arch/microblaze/include/asm/scatterlist.h b/arch/microblaze/include/asm/scatterlist.h
deleted file mode 100644
index 35d786fe93ae..000000000000
--- a/arch/microblaze/include/asm/scatterlist.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/scatterlist.h>
diff --git a/arch/microblaze/include/asm/unistd.h b/arch/microblaze/include/asm/unistd.h
index b14232b6878f..fd56a8f66489 100644
--- a/arch/microblaze/include/asm/unistd.h
+++ b/arch/microblaze/include/asm/unistd.h
@@ -19,7 +19,6 @@
#define __ARCH_WANT_SYS_ALARM
#define __ARCH_WANT_SYS_GETHOSTNAME
#define __ARCH_WANT_SYS_PAUSE
-#define __ARCH_WANT_SYS_SGETMASK
#define __ARCH_WANT_SYS_SIGNAL
#define __ARCH_WANT_SYS_TIME
#define __ARCH_WANT_SYS_UTIME
diff --git a/arch/mips/dec/Makefile b/arch/mips/dec/Makefile
index 3d5d2c56de8d..bd74e05c90b0 100644
--- a/arch/mips/dec/Makefile
+++ b/arch/mips/dec/Makefile
@@ -3,7 +3,7 @@
#
obj-y := ecc-berr.o int-handler.o ioasic-irq.o kn01-berr.o \
- kn02-irq.o kn02xa-berr.o reset.o setup.o time.o
+ kn02-irq.o kn02xa-berr.o platform.o reset.o setup.o time.o
obj-$(CONFIG_TC) += tc.o
obj-$(CONFIG_CPU_HAS_WB) += wbflush.o
diff --git a/arch/mips/dec/platform.c b/arch/mips/dec/platform.c
new file mode 100644
index 000000000000..c7ac86af847a
--- /dev/null
+++ b/arch/mips/dec/platform.c
@@ -0,0 +1,44 @@
+/*
+ * DEC platform devices.
+ *
+ * Copyright (c) 2014 Maciej W. Rozycki
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/ioport.h>
+#include <linux/kernel.h>
+#include <linux/mc146818rtc.h>
+#include <linux/platform_device.h>
+
+static struct resource dec_rtc_resources[] = {
+ {
+ .name = "rtc",
+ .flags = IORESOURCE_MEM,
+ },
+};
+
+static struct cmos_rtc_board_info dec_rtc_info = {
+ .flags = CMOS_RTC_FLAGS_NOFREQ,
+ .address_space = 64,
+};
+
+static struct platform_device dec_rtc_device = {
+ .name = "rtc_cmos",
+ .id = PLATFORM_DEVID_NONE,
+ .dev.platform_data = &dec_rtc_info,
+ .resource = dec_rtc_resources,
+ .num_resources = ARRAY_SIZE(dec_rtc_resources),
+};
+
+static int __init dec_add_devices(void)
+{
+ dec_rtc_resources[0].start = RTC_PORT(0);
+ dec_rtc_resources[0].end = RTC_PORT(0) + dec_kn_slot_size - 1;
+ return platform_device_register(&dec_rtc_device);
+}
+
+device_initcall(dec_add_devices);
diff --git a/arch/mips/include/asm/unistd.h b/arch/mips/include/asm/unistd.h
index 413d6c612bec..e55813029d5a 100644
--- a/arch/mips/include/asm/unistd.h
+++ b/arch/mips/include/asm/unistd.h
@@ -29,7 +29,6 @@
#define __ARCH_WANT_SYS_GETHOSTNAME
#define __ARCH_WANT_SYS_IPC
#define __ARCH_WANT_SYS_PAUSE
-#define __ARCH_WANT_SYS_SGETMASK
#define __ARCH_WANT_SYS_UTIME
#define __ARCH_WANT_SYS_WAITPID
#define __ARCH_WANT_SYS_SOCKETCALL
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index 074e857ced28..c51bd20cd081 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -712,10 +712,12 @@ int process_fpemu_return(int sig, void __user *fault_addr)
si.si_addr = fault_addr;
si.si_signo = sig;
if (sig == SIGSEGV) {
+ down_read(&current->mm->mmap_sem);
if (find_vma(current->mm, (unsigned long)fault_addr))
si.si_code = SEGV_ACCERR;
else
si.si_code = SEGV_MAPERR;
+ up_read(&current->mm->mmap_sem);
} else {
si.si_code = BUS_ADRERR;
}
diff --git a/arch/mips/mm/c-octeon.c b/arch/mips/mm/c-octeon.c
index f41a5c5b0865..05b1d7cf9514 100644
--- a/arch/mips/mm/c-octeon.c
+++ b/arch/mips/mm/c-octeon.c
@@ -137,8 +137,10 @@ static void octeon_flush_cache_sigtramp(unsigned long addr)
{
struct vm_area_struct *vma;
+ down_read(&current->mm->mmap_sem);
vma = find_vma(current->mm, addr);
octeon_flush_icache_all_cores(vma);
+ up_read(&current->mm->mmap_sem);
}
diff --git a/arch/mn10300/include/asm/Kbuild b/arch/mn10300/include/asm/Kbuild
index 654d5ba6e310..ecbd6676bd33 100644
--- a/arch/mn10300/include/asm/Kbuild
+++ b/arch/mn10300/include/asm/Kbuild
@@ -6,4 +6,5 @@ generic-y += exec.h
generic-y += hash.h
generic-y += mcs_spinlock.h
generic-y += preempt.h
+generic-y += scatterlist.h
generic-y += trace_clock.h
diff --git a/arch/mn10300/include/asm/scatterlist.h b/arch/mn10300/include/asm/scatterlist.h
deleted file mode 100644
index 7baa4006008a..000000000000
--- a/arch/mn10300/include/asm/scatterlist.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* MN10300 Scatterlist definitions
- *
- * Copyright (C) 2007 Matsushita Electric Industrial Co., Ltd.
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-#ifndef _ASM_SCATTERLIST_H
-#define _ASM_SCATTERLIST_H
-
-#include <asm-generic/scatterlist.h>
-
-#endif /* _ASM_SCATTERLIST_H */
diff --git a/arch/mn10300/include/asm/unistd.h b/arch/mn10300/include/asm/unistd.h
index 9d4e2d1ef90e..0522468f488b 100644
--- a/arch/mn10300/include/asm/unistd.h
+++ b/arch/mn10300/include/asm/unistd.h
@@ -26,7 +26,6 @@
#define __ARCH_WANT_SYS_GETHOSTNAME
#define __ARCH_WANT_SYS_IPC
#define __ARCH_WANT_SYS_PAUSE
-#define __ARCH_WANT_SYS_SGETMASK
#define __ARCH_WANT_SYS_SIGNAL
#define __ARCH_WANT_SYS_TIME
#define __ARCH_WANT_SYS_UTIME
diff --git a/arch/parisc/include/asm/unistd.h b/arch/parisc/include/asm/unistd.h
index 74d835820ee7..5f4c68daa261 100644
--- a/arch/parisc/include/asm/unistd.h
+++ b/arch/parisc/include/asm/unistd.h
@@ -145,7 +145,6 @@ type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5) \
#define __ARCH_WANT_SYS_ALARM
#define __ARCH_WANT_SYS_GETHOSTNAME
#define __ARCH_WANT_SYS_PAUSE
-#define __ARCH_WANT_SYS_SGETMASK
#define __ARCH_WANT_SYS_SIGNAL
#define __ARCH_WANT_SYS_TIME
#define __ARCH_WANT_COMPAT_SYS_TIME
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index e0998997943b..caece570d0b4 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -111,6 +111,7 @@ config PPC
select HAVE_DMA_API_DEBUG
select HAVE_OPROFILE
select HAVE_DEBUG_KMEMLEAK
+ select ARCH_HAS_SG_CHAIN
select GENERIC_ATOMIC64 if PPC32
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select HAVE_PERF_EVENTS
diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
index 3fb1bc432f4f..7f23f162ce9c 100644
--- a/arch/powerpc/include/asm/Kbuild
+++ b/arch/powerpc/include/asm/Kbuild
@@ -4,5 +4,6 @@ generic-y += hash.h
generic-y += mcs_spinlock.h
generic-y += preempt.h
generic-y += rwsem.h
+generic-y += scatterlist.h
generic-y += trace_clock.h
generic-y += vtime.h
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 3ebb188c3ff5..d98c1ecc3266 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -44,6 +44,12 @@ static inline int pte_present(pte_t pte)
return pte_val(pte) & (_PAGE_PRESENT | _PAGE_NUMA);
}
+#define pte_present_nonuma pte_present_nonuma
+static inline int pte_present_nonuma(pte_t pte)
+{
+ return pte_val(pte) & (_PAGE_PRESENT);
+}
+
#define pte_numa pte_numa
static inline int pte_numa(pte_t pte)
{
diff --git a/arch/powerpc/include/asm/scatterlist.h b/arch/powerpc/include/asm/scatterlist.h
deleted file mode 100644
index de1f620bd5c9..000000000000
--- a/arch/powerpc/include/asm/scatterlist.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef _ASM_POWERPC_SCATTERLIST_H
-#define _ASM_POWERPC_SCATTERLIST_H
-/*
- * Copyright (C) 2001 PPC64 Team, IBM Corp
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <asm/dma.h>
-#include <asm-generic/scatterlist.h>
-
-#define ARCH_HAS_SG_CHAIN
-
-#endif /* _ASM_POWERPC_SCATTERLIST_H */
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index c9202151079f..6c8a8c5a37a1 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -9,12 +9,8 @@ struct device_node;
#ifdef CONFIG_NUMA
/*
- * Before going off node we want the VM to try and reclaim from the local
- * node. It does this if the remote distance is larger than RECLAIM_DISTANCE.
- * With the default REMOTE_DISTANCE of 20 and the default RECLAIM_DISTANCE of
- * 20, we never reclaim and go off node straight away.
- *
- * To fix this we choose a smaller value of RECLAIM_DISTANCE.
+ * If zone_reclaim_mode is enabled, a RECLAIM_DISTANCE of 10 will mean that
+ * all zones on all nodes will be eligible for zone_reclaim().
*/
#define RECLAIM_DISTANCE 10
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h
index 4494f029b632..cadd550898f3 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -29,7 +29,6 @@
#define __ARCH_WANT_SYS_GETHOSTNAME
#define __ARCH_WANT_SYS_IPC
#define __ARCH_WANT_SYS_PAUSE
-#define __ARCH_WANT_SYS_SGETMASK
#define __ARCH_WANT_SYS_SIGNAL
#define __ARCH_WANT_SYS_TIME
#define __ARCH_WANT_SYS_UTIME
diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c
index 7b6c10750179..d85e86aac7fb 100644
--- a/arch/powerpc/mm/dma-noncoherent.c
+++ b/arch/powerpc/mm/dma-noncoherent.c
@@ -33,6 +33,7 @@
#include <linux/export.h>
#include <asm/tlbflush.h>
+#include <asm/dma.h>
#include "mmu_decl.h"
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
index 6c0b1f5f8d2c..fa9fb5b4c66c 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -134,7 +134,7 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len)
static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
- struct vm_area_struct *vma = walk->private;
+ struct vm_area_struct *vma = walk->vma;
split_huge_page_pmd(vma, addr, pmd);
return 0;
}
@@ -163,9 +163,7 @@ static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
if (vma->vm_start >= (addr + len))
break;
vma->vm_flags |= VM_NOHUGEPAGE;
- subpage_proto_walk.private = vma;
- walk_page_range(vma->vm_start, vma->vm_end,
- &subpage_proto_walk);
+ walk_page_vma(vma, &subpage_proto_walk);
vma = vma->vm_next;
}
}
diff --git a/arch/powerpc/platforms/44x/warp.c b/arch/powerpc/platforms/44x/warp.c
index 534574a97ec9..3a104284b338 100644
--- a/arch/powerpc/platforms/44x/warp.c
+++ b/arch/powerpc/platforms/44x/warp.c
@@ -25,6 +25,7 @@
#include <asm/time.h>
#include <asm/uic.h>
#include <asm/ppc4xx.h>
+#include <asm/dma.h>
static __initdata struct of_device_id warp_of_bus[] = {
diff --git a/arch/powerpc/platforms/52xx/efika.c b/arch/powerpc/platforms/52xx/efika.c
index 18c104820198..47d66794cf3e 100644
--- a/arch/powerpc/platforms/52xx/efika.c
+++ b/arch/powerpc/platforms/52xx/efika.c
@@ -13,6 +13,7 @@
#include <generated/utsrelease.h>
#include <linux/pci.h>
#include <linux/of.h>
+#include <asm/dma.h>
#include <asm/prom.h>
#include <asm/time.h>
#include <asm/machdep.h>
diff --git a/arch/powerpc/platforms/amigaone/setup.c b/arch/powerpc/platforms/amigaone/setup.c
index 03aabc0e16ac..2fe12046279e 100644
--- a/arch/powerpc/platforms/amigaone/setup.c
+++ b/arch/powerpc/platforms/amigaone/setup.c
@@ -24,6 +24,7 @@
#include <asm/i8259.h>
#include <asm/time.h>
#include <asm/udbg.h>
+#include <asm/dma.h>
extern void __flush_disable_L1(void);
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 009c58b575c0..f357d8e7ddce 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -146,6 +146,7 @@ config S390
select TTY
select VIRT_CPU_ACCOUNTING
select VIRT_TO_BUS
+ select ARCH_HAS_SG_CHAIN
config SCHED_OMIT_FRAME_POINTER
def_bool y
diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild
index 57892a8a9055..b3fea0722ff1 100644
--- a/arch/s390/include/asm/Kbuild
+++ b/arch/s390/include/asm/Kbuild
@@ -4,4 +4,5 @@ generic-y += clkdev.h
generic-y += hash.h
generic-y += mcs_spinlock.h
generic-y += preempt.h
+generic-y += scatterlist.h
generic-y += trace_clock.h
diff --git a/arch/s390/include/asm/scatterlist.h b/arch/s390/include/asm/scatterlist.h
deleted file mode 100644
index 6d45ef6c12a7..000000000000
--- a/arch/s390/include/asm/scatterlist.h
+++ /dev/null
@@ -1,3 +0,0 @@
-#include <asm-generic/scatterlist.h>
-
-#define ARCH_HAS_SG_CHAIN
diff --git a/arch/score/include/asm/Kbuild b/arch/score/include/asm/Kbuild
index 2f947aba4bd4..aad209199f7e 100644
--- a/arch/score/include/asm/Kbuild
+++ b/arch/score/include/asm/Kbuild
@@ -8,5 +8,6 @@ generic-y += cputime.h
generic-y += hash.h
generic-y += mcs_spinlock.h
generic-y += preempt.h
+generic-y += scatterlist.h
generic-y += trace_clock.h
generic-y += xor.h
diff --git a/arch/score/include/asm/scatterlist.h b/arch/score/include/asm/scatterlist.h
deleted file mode 100644
index 9f533b8362c7..000000000000
--- a/arch/score/include/asm/scatterlist.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _ASM_SCORE_SCATTERLIST_H
-#define _ASM_SCORE_SCATTERLIST_H
-
-#include <asm-generic/scatterlist.h>
-
-#endif /* _ASM_SCORE_SCATTERLIST_H */
diff --git a/arch/sh/include/asm/unistd.h b/arch/sh/include/asm/unistd.h
index e77816c4b9bc..126fe8340b22 100644
--- a/arch/sh/include/asm/unistd.h
+++ b/arch/sh/include/asm/unistd.h
@@ -11,7 +11,6 @@
# define __ARCH_WANT_SYS_GETHOSTNAME
# define __ARCH_WANT_SYS_IPC
# define __ARCH_WANT_SYS_PAUSE
-# define __ARCH_WANT_SYS_SGETMASK
# define __ARCH_WANT_SYS_SIGNAL
# define __ARCH_WANT_SYS_TIME
# define __ARCH_WANT_SYS_UTIME
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 29f2e988c56a..e1ea0ff154d7 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -42,6 +42,7 @@ config SPARC
select MODULES_USE_ELF_RELA
select ODD_RT_SIGACTION
select OLD_SIGSUSPEND
+ select ARCH_HAS_SG_CHAIN
config SPARC32
def_bool !64BIT
diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild
index a45821818003..cdd1b447bb6c 100644
--- a/arch/sparc/include/asm/Kbuild
+++ b/arch/sparc/include/asm/Kbuild
@@ -15,6 +15,7 @@ generic-y += mcs_spinlock.h
generic-y += module.h
generic-y += mutex.h
generic-y += preempt.h
+generic-y += scatterlist.h
generic-y += serial.h
generic-y += trace_clock.h
generic-y += types.h
diff --git a/arch/sparc/include/asm/scatterlist.h b/arch/sparc/include/asm/scatterlist.h
deleted file mode 100644
index 92bb638313f8..000000000000
--- a/arch/sparc/include/asm/scatterlist.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef _SPARC_SCATTERLIST_H
-#define _SPARC_SCATTERLIST_H
-
-#include <asm-generic/scatterlist.h>
-
-#define ARCH_HAS_SG_CHAIN
-
-#endif /* !(_SPARC_SCATTERLIST_H) */
diff --git a/arch/sparc/include/asm/unistd.h b/arch/sparc/include/asm/unistd.h
index dfa53fdd5cbc..0aac1e8f2968 100644
--- a/arch/sparc/include/asm/unistd.h
+++ b/arch/sparc/include/asm/unistd.h
@@ -25,7 +25,6 @@
#define __ARCH_WANT_SYS_ALARM
#define __ARCH_WANT_SYS_GETHOSTNAME
#define __ARCH_WANT_SYS_PAUSE
-#define __ARCH_WANT_SYS_SGETMASK
#define __ARCH_WANT_SYS_SIGNAL
#define __ARCH_WANT_SYS_TIME
#define __ARCH_WANT_SYS_UTIME
diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
index a5e4b6068213..7bd64aa2e94a 100644
--- a/arch/um/include/asm/Kbuild
+++ b/arch/um/include/asm/Kbuild
@@ -21,6 +21,7 @@ generic-y += param.h
generic-y += pci.h
generic-y += percpu.h
generic-y += preempt.h
+generic-y += scatterlist.h
generic-y += sections.h
generic-y += switch_to.h
generic-y += topology.h
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 895560b1fd24..48514158a6ba 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -26,7 +26,7 @@ config X86
select ARCH_MIGHT_HAVE_PC_SERIO
select HAVE_AOUT if X86_32
select HAVE_UNSTABLE_SCHED_CLOCK
- select ARCH_SUPPORTS_NUMA_BALANCING
+ select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
select ARCH_SUPPORTS_INT128 if X86_64
select ARCH_WANTS_PROT_NUMA_PROT_NONE
select HAVE_IDE
@@ -41,7 +41,7 @@ config X86
select ARCH_WANT_OPTIONAL_GPIOLIB
select ARCH_WANT_FRAME_POINTERS
select HAVE_DMA_ATTRS
- select HAVE_DMA_CONTIGUOUS if !SWIOTLB
+ select HAVE_DMA_CONTIGUOUS
select HAVE_KRETPROBES
select GENERIC_EARLY_IOREMAP
select HAVE_OPTPROBES
@@ -96,6 +96,7 @@ config X86
select IRQ_FORCED_THREADING
select HAVE_BPF_JIT if X86_64
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
+ select ARCH_HAS_SG_CHAIN
select CLKEVT_I8253
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select GENERIC_IOMAP
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 3ca9762e1649..3bf000fab0ae 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -5,6 +5,7 @@ genhdr-y += unistd_64.h
genhdr-y += unistd_x32.h
generic-y += clkdev.h
-generic-y += early_ioremap.h
generic-y += cputime.h
+generic-y += early_ioremap.h
generic-y += mcs_spinlock.h
+generic-y += scatterlist.h
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index b459ddf27d64..66276c1d23bb 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -131,7 +131,8 @@ static inline int pte_exec(pte_t pte)
static inline int pte_special(pte_t pte)
{
- return pte_flags(pte) & _PAGE_SPECIAL;
+ return (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_SPECIAL)) ==
+ (_PAGE_PRESENT|_PAGE_SPECIAL);
}
static inline unsigned long pte_pfn(pte_t pte)
@@ -452,6 +453,12 @@ static inline int pte_present(pte_t a)
_PAGE_NUMA);
}
+#define pte_present_nonuma pte_present_nonuma
+static inline int pte_present_nonuma(pte_t a)
+{
+ return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
+}
+
#define pte_accessible pte_accessible
static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
{
@@ -860,19 +867,19 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
{
- VM_BUG_ON(pte_present(pte));
+ VM_BUG_ON(pte_present_nonuma(pte));
return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY);
}
static inline int pte_swp_soft_dirty(pte_t pte)
{
- VM_BUG_ON(pte_present(pte));
+ VM_BUG_ON(pte_present_nonuma(pte));
return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY;
}
static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
{
- VM_BUG_ON(pte_present(pte));
+ VM_BUG_ON(pte_present_nonuma(pte));
return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);
}
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index e22c1dbf7feb..6d6ecd09883c 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -145,8 +145,16 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
/* Encode and de-code a swap entry */
#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
+#ifdef CONFIG_NUMA_BALANCING
+/* Automatic NUMA balancing needs to be distinguishable from swap entries */
+#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 2)
+#else
#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
+#endif
#else
+#ifdef CONFIG_NUMA_BALANCING
+#error Incompatible format for automatic NUMA balancing
+#endif
#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1)
#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1)
#endif
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index eb3d44945133..f216963760e5 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -16,15 +16,26 @@
#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
#define _PAGE_BIT_PAT 7 /* on 4KB pages */
#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
-#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
-#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */
-#define _PAGE_BIT_HIDDEN 11 /* hidden by kmemcheck */
+#define _PAGE_BIT_SOFTW1 9 /* available for programmer */
+#define _PAGE_BIT_SOFTW2 10 /* " */
+#define _PAGE_BIT_SOFTW3 11 /* " */
#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
-#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
-#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1
-#define _PAGE_BIT_SPLITTING _PAGE_BIT_UNUSED1 /* only valid on a PSE pmd */
+#define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1
+#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
+#define _PAGE_BIT_SPLITTING _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */
+#define _PAGE_BIT_IOMAP _PAGE_BIT_SOFTW2 /* flag used to indicate IO mapping */
+#define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */
+#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
+/*
+ * Swap offsets on configurations that allow automatic NUMA balancing use the
+ * bits after _PAGE_BIT_GLOBAL. To uniquely distinguish NUMA hinting PTEs from
+ * swap entries, we use the first bit after _PAGE_BIT_GLOBAL and shrink the
+ * maximum possible swap space from 16TB to 8TB.
+ */
+#define _PAGE_BIT_NUMA (_PAGE_BIT_GLOBAL+1)
+
/* If _PAGE_BIT_PRESENT is clear, we use these: */
/* - if the user mapped it with PROT_NONE; pte_present gives true */
#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
@@ -40,7 +51,7 @@
#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
-#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
+#define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
@@ -61,8 +72,6 @@
* they do not conflict with each other.
*/
-#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_HIDDEN
-
#ifdef CONFIG_MEM_SOFT_DIRTY
#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY)
#else
@@ -70,6 +79,21 @@
#endif
/*
+ * _PAGE_NUMA distinguishes between a numa hinting minor fault and a page
+ * that is not present. The hinting fault gathers numa placement statistics
+ * (see pte_numa()). The bit is always zero when the PTE is not present.
+ *
+ * The bit picked must be always zero when the pmd is present and not
+ * present, so that we don't lose information when we set it while
+ * atomically clearing the present bit.
+ */
+#ifdef CONFIG_NUMA_BALANCING
+#define _PAGE_NUMA (_AT(pteval_t, 1) << _PAGE_BIT_NUMA)
+#else
+#define _PAGE_NUMA (_AT(pteval_t, 0))
+#endif
+
+/*
* Tracking soft dirty bit when a page goes to a swap is tricky.
* We need a bit which can be stored in pte _and_ not conflict
* with swap entry format. On x86 bits 6 and 7 are *not* involved
@@ -94,26 +118,6 @@
#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
-/*
- * _PAGE_NUMA indicates that this page will trigger a numa hinting
- * minor page fault to gather numa placement statistics (see
- * pte_numa()). The bit picked (8) is within the range between
- * _PAGE_FILE (6) and _PAGE_PROTNONE (8) bits. Therefore, it doesn't
- * require changes to the swp entry format because that bit is always
- * zero when the pte is not present.
- *
- * The bit picked must be always zero when the pmd is present and not
- * present, so that we don't lose information when we set it while
- * atomically clearing the present bit.
- *
- * Because we shared the same bit (8) with _PAGE_PROTNONE this can be
- * interpreted as _PAGE_NUMA only in places that _PAGE_PROTNONE
- * couldn't reach, like handle_mm_fault() (see access_error in
- * arch/x86/mm/fault.c, the vma protection must not be PROT_NONE for
- * handle_mm_fault() to be invoked).
- */
-#define _PAGE_NUMA _PAGE_PROTNONE
-
#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
_PAGE_ACCESSED | _PAGE_DIRTY)
#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
@@ -122,8 +126,8 @@
/* Set of bits not changed in pte_modify */
#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
_PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \
- _PAGE_SOFT_DIRTY)
-#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
+ _PAGE_SOFT_DIRTY | _PAGE_NUMA)
+#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA)
#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT)
#define _PAGE_CACHE_WB (0)
diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h
deleted file mode 100644
index 4240878b9d76..000000000000
--- a/arch/x86/include/asm/scatterlist.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef _ASM_X86_SCATTERLIST_H
-#define _ASM_X86_SCATTERLIST_H
-
-#include <asm-generic/scatterlist.h>
-
-#define ARCH_HAS_SG_CHAIN
-
-#endif /* _ASM_X86_SCATTERLIST_H */
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
index 35e67a457182..31eab867e6d3 100644
--- a/arch/x86/include/asm/signal.h
+++ b/arch/x86/include/asm/signal.h
@@ -92,12 +92,6 @@ static inline int __gen_sigismember(sigset_t *set, int _sig)
? __const_sigismember((set), (sig)) \
: __gen_sigismember((set), (sig)))
-static inline int sigfindinword(unsigned long word)
-{
- asm("bsfl %1,%0" : "=r"(word) : "rm"(word) : "cc");
- return word;
-}
-
struct pt_regs;
#else /* __i386__ */
diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h
index 977f1761a25d..ab05d73e2bb7 100644
--- a/arch/x86/include/asm/swiotlb.h
+++ b/arch/x86/include/asm/swiotlb.h
@@ -29,4 +29,11 @@ static inline void pci_swiotlb_late_init(void)
static inline void dma_mark_clean(void *addr, size_t size) {}
+extern void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
+ dma_addr_t *dma_handle, gfp_t flags,
+ struct dma_attrs *attrs);
+extern void x86_swiotlb_free_coherent(struct device *dev, size_t size,
+ void *vaddr, dma_addr_t dma_addr,
+ struct dma_attrs *attrs);
+
#endif /* _ASM_X86_SWIOTLB_H */
diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h
index 3f556c6a0157..2b19caa4081c 100644
--- a/arch/x86/include/asm/unistd.h
+++ b/arch/x86/include/asm/unistd.h
@@ -41,7 +41,6 @@
# define __ARCH_WANT_SYS_OLD_GETRLIMIT
# define __ARCH_WANT_SYS_OLD_UNAME
# define __ARCH_WANT_SYS_PAUSE
-# define __ARCH_WANT_SYS_SGETMASK
# define __ARCH_WANT_SYS_SIGNAL
# define __ARCH_WANT_SYS_SIGPENDING
# define __ARCH_WANT_SYS_SIGPROCMASK
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index b574b295a2f9..8e3842fc8bea 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -512,7 +512,7 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr,
dma_addr_t dma_addr, struct dma_attrs *attrs)
{
gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL);
- free_pages((unsigned long)vaddr, get_order(size));
+ dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs);
}
static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr)
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index f7d0672481fd..a25e202bb319 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -97,12 +97,17 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size,
dma_mask = dma_alloc_coherent_mask(dev, flag);
- flag |= __GFP_ZERO;
+ flag &= ~__GFP_ZERO;
again:
page = NULL;
/* CMA can be used only in the context which permits sleeping */
- if (flag & __GFP_WAIT)
+ if (flag & __GFP_WAIT) {
page = dma_alloc_from_contiguous(dev, count, get_order(size));
+ if (page && page_to_phys(page) + size > dma_mask) {
+ dma_release_from_contiguous(dev, page, count);
+ page = NULL;
+ }
+ }
/* fallback */
if (!page)
page = alloc_pages_node(dev_to_node(dev), flag, get_order(size));
@@ -120,7 +125,7 @@ again:
return NULL;
}
-
+ memset(page_address(page), 0, size);
*dma_addr = addr;
return page_address(page);
}
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 6c483ba98b9c..77dd0ad58be4 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -14,7 +14,7 @@
#include <asm/iommu_table.h>
int swiotlb __read_mostly;
-static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
+void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
dma_addr_t *dma_handle, gfp_t flags,
struct dma_attrs *attrs)
{
@@ -28,11 +28,14 @@ static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags);
}
-static void x86_swiotlb_free_coherent(struct device *dev, size_t size,
+void x86_swiotlb_free_coherent(struct device *dev, size_t size,
void *vaddr, dma_addr_t dma_addr,
struct dma_attrs *attrs)
{
- swiotlb_free_coherent(dev, size, vaddr, dma_addr);
+ if (is_swiotlb_buffer(dma_to_phys(dev, dma_addr)))
+ swiotlb_free_coherent(dev, size, vaddr, dma_addr);
+ else
+ dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs);
}
static struct dma_map_ops swiotlb_dma_ops = {
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 09c76d265550..78a0e6298922 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1119,7 +1119,7 @@ void __init setup_arch(char **cmdline_p)
setup_real_mode();
memblock_set_current_limit(get_max_mapped());
- dma_contiguous_reserve(0);
+ dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT);
/*
* NOTE: On x86-32, only from this point on, fixmaps are ready for use.
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index f35c66c5959a..b92591fa8970 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1230,17 +1230,43 @@ const char *arch_vma_name(struct vm_area_struct *vma)
return NULL;
}
-#ifdef CONFIG_X86_UV
-unsigned long memory_block_size_bytes(void)
+static unsigned long probe_memory_block_size(void)
{
+ /* start from 2g */
+ unsigned long bz = 1UL<<31;
+
+#ifdef CONFIG_X86_UV
if (is_uv_system()) {
printk(KERN_INFO "UV: memory block size 2GB\n");
return 2UL * 1024 * 1024 * 1024;
}
- return MIN_MEMORY_BLOCK_SIZE;
-}
#endif
+ /* less than 64g installed */
+ if ((max_pfn << PAGE_SHIFT) < (16UL << 32))
+ return MIN_MEMORY_BLOCK_SIZE;
+
+ /* get the tail size */
+ while (bz > MIN_MEMORY_BLOCK_SIZE) {
+ if (!((max_pfn << PAGE_SHIFT) & (bz - 1)))
+ break;
+ bz >>= 1;
+ }
+
+ printk(KERN_DEBUG "memory block size : %ldMB\n", bz >> 20);
+
+ return bz;
+}
+
+static unsigned long memory_block_size_probed;
+unsigned long memory_block_size_bytes(void)
+{
+ if (!memory_block_size_probed)
+ memory_block_size_probed = probe_memory_block_size();
+
+ return memory_block_size_probed;
+}
+
#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
* Initialise the sparsemem vmemmap using huge-pages at the PMD level.
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 461bc8289024..6629f397b467 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -35,7 +35,7 @@ enum {
static int pte_testbit(pte_t pte)
{
- return pte_flags(pte) & _PAGE_UNUSED1;
+ return pte_flags(pte) & _PAGE_SOFTW1;
}
struct split_state {
diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c
index 9d8a509c9730..5ceda85b8687 100644
--- a/arch/x86/pci/sta2x11-fixup.c
+++ b/arch/x86/pci/sta2x11-fixup.c
@@ -173,9 +173,7 @@ static void *sta2x11_swiotlb_alloc_coherent(struct device *dev,
{
void *vaddr;
- vaddr = dma_generic_alloc_coherent(dev, size, dma_handle, flags, attrs);
- if (!vaddr)
- vaddr = swiotlb_alloc_coherent(dev, size, dma_handle, flags);
+ vaddr = x86_swiotlb_alloc_coherent(dev, size, dma_handle, flags, attrs);
*dma_handle = p2a(*dma_handle, to_pci_dev(dev));
return vaddr;
}
@@ -183,7 +181,7 @@ static void *sta2x11_swiotlb_alloc_coherent(struct device *dev,
/* We have our own dma_ops: the same as swiotlb but from alloc (above) */
static struct dma_map_ops sta2x11_dma_ops = {
.alloc = sta2x11_swiotlb_alloc_coherent,
- .free = swiotlb_free_coherent,
+ .free = x86_swiotlb_free_coherent,
.map_page = swiotlb_map_page,
.unmap_page = swiotlb_unmap_page,
.map_sg = swiotlb_map_sg_attrs,
diff --git a/block/genhd.c b/block/genhd.c
index 791f41943132..7bd4372e8b6f 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -849,7 +849,7 @@ static int show_partition(struct seq_file *seqf, void *v)
char buf[BDEVNAME_SIZE];
/* Don't show non-partitionable removeable devices or empty devices */
- if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
+ if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) &&
(sgp->flags & GENHD_FL_REMOVABLE)))
return 0;
if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c
index 165c2c299e57..b056661b77ab 100644
--- a/drivers/base/dma-contiguous.c
+++ b/drivers/base/dma-contiguous.c
@@ -59,11 +59,22 @@ struct cma *dma_contiguous_default_area;
*/
static const phys_addr_t size_bytes = CMA_SIZE_MBYTES * SZ_1M;
static phys_addr_t size_cmdline = -1;
+static phys_addr_t base_cmdline;
+static phys_addr_t limit_cmdline;
static int __init early_cma(char *p)
{
pr_debug("%s(%s)\n", __func__, p);
size_cmdline = memparse(p, &p);
+ if (*p != '@')
+ return 0;
+ base_cmdline = memparse(p + 1, &p);
+ if (*p != '-') {
+ limit_cmdline = base_cmdline + size_cmdline;
+ return 0;
+ }
+ limit_cmdline = memparse(p + 1, &p);
+
return 0;
}
early_param("cma", early_cma);
@@ -107,11 +118,18 @@ static inline __maybe_unused phys_addr_t cma_early_percent_memory(void)
void __init dma_contiguous_reserve(phys_addr_t limit)
{
phys_addr_t selected_size = 0;
+ phys_addr_t selected_base = 0;
+ phys_addr_t selected_limit = limit;
+ bool fixed = false;
pr_debug("%s(limit %08lx)\n", __func__, (unsigned long)limit);
if (size_cmdline != -1) {
selected_size = size_cmdline;
+ selected_base = base_cmdline;
+ selected_limit = min_not_zero(limit_cmdline, limit);
+ if (base_cmdline + size_cmdline == limit_cmdline)
+ fixed = true;
} else {
#ifdef CONFIG_CMA_SIZE_SEL_MBYTES
selected_size = size_bytes;
@@ -128,10 +146,12 @@ void __init dma_contiguous_reserve(phys_addr_t limit)
pr_debug("%s: reserving %ld MiB for global area\n", __func__,
(unsigned long)selected_size / SZ_1M);
- dma_contiguous_reserve_area(selected_size, 0, limit,
- &dma_contiguous_default_area);
+ dma_contiguous_reserve_area(selected_size, selected_base,
+ selected_limit,
+ &dma_contiguous_default_area,
+ fixed);
}
-};
+}
static DEFINE_MUTEX(cma_mutex);
@@ -187,15 +207,20 @@ core_initcall(cma_init_reserved_areas);
* @base: Base address of the reserved area optional, use 0 for any
* @limit: End address of the reserved memory (optional, 0 for any).
* @res_cma: Pointer to store the created cma region.
+ * @fixed: hint about where to place the reserved area
*
* This function reserves memory from early allocator. It should be
* called by arch specific code once the early allocator (memblock or bootmem)
* has been activated and all other subsystems have already allocated/reserved
* memory. This function allows to create custom reserved areas for specific
* devices.
+ *
+ * If @fixed is true, reserve contiguous area at exactly @base. If false,
+ * reserve in range from @base to @limit.
*/
int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base,
- phys_addr_t limit, struct cma **res_cma)
+ phys_addr_t limit, struct cma **res_cma,
+ bool fixed)
{
struct cma *cma = &cma_areas[cma_area_count];
phys_addr_t alignment;
@@ -221,18 +246,15 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base,
limit &= ~(alignment - 1);
/* Reserve memory */
- if (base) {
+ if (base && fixed) {
if (memblock_is_region_reserved(base, size) ||
memblock_reserve(base, size) < 0) {
ret = -EBUSY;
goto err;
}
} else {
- /*
- * Use __memblock_alloc_base() since
- * memblock_alloc_base() panic()s.
- */
- phys_addr_t addr = __memblock_alloc_base(size, alignment, limit);
+ phys_addr_t addr = memblock_alloc_range(size, alignment, base,
+ limit);
if (!addr) {
ret = -ENOMEM;
goto err;
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index bece691cb5d9..89f752dd8465 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -118,16 +118,6 @@ static ssize_t show_mem_start_phys_index(struct device *dev,
return sprintf(buf, "%08lx\n", phys_index);
}
-static ssize_t show_mem_end_phys_index(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct memory_block *mem = to_memory_block(dev);
- unsigned long phys_index;
-
- phys_index = mem->end_section_nr / sections_per_block;
- return sprintf(buf, "%08lx\n", phys_index);
-}
-
/*
* Show whether the section of memory is likely to be hot-removable
*/
@@ -384,7 +374,6 @@ static ssize_t show_phys_device(struct device *dev,
}
static DEVICE_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL);
-static DEVICE_ATTR(end_phys_index, 0444, show_mem_end_phys_index, NULL);
static DEVICE_ATTR(state, 0644, show_mem_state, store_mem_state);
static DEVICE_ATTR(phys_device, 0444, show_phys_device, NULL);
static DEVICE_ATTR(removable, 0444, show_mem_removable, NULL);
@@ -529,7 +518,6 @@ struct memory_block *find_memory_block(struct mem_section *section)
static struct attribute *memory_memblk_attrs[] = {
&dev_attr_phys_index.attr,
- &dev_attr_end_phys_index.attr,
&dev_attr_state.attr,
&dev_attr_phys_device.attr,
&dev_attr_removable.attr,
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 9849b5233bf4..48eccb350180 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -572,10 +572,10 @@ static void zram_bio_discard(struct zram *zram, u32 index,
* skipping this logical block is appropriate here.
*/
if (offset) {
- if (n < offset)
+ if (n <= (PAGE_SIZE - offset))
return;
- n -= offset;
+ n -= (PAGE_SIZE - offset);
index++;
}
diff --git a/drivers/gpu/drm/exynos/exynos_drm_g2d.c b/drivers/gpu/drm/exynos/exynos_drm_g2d.c
index 6c1885eedfdf..800158714473 100644
--- a/drivers/gpu/drm/exynos/exynos_drm_g2d.c
+++ b/drivers/gpu/drm/exynos/exynos_drm_g2d.c
@@ -467,14 +467,17 @@ static dma_addr_t *g2d_userptr_get_dma_addr(struct drm_device *drm_dev,
goto err_free;
}
+ down_read(&current->mm->mmap_sem);
vma = find_vma(current->mm, userptr);
if (!vma) {
+ up_read(&current->mm->mmap_sem);
DRM_ERROR("failed to get vm region.\n");
ret = -EFAULT;
goto err_free_pages;
}
if (vma->vm_end < userptr + size) {
+ up_read(&current->mm->mmap_sem);
DRM_ERROR("vma is too small.\n");
ret = -EFAULT;
goto err_free_pages;
@@ -482,6 +485,7 @@ static dma_addr_t *g2d_userptr_get_dma_addr(struct drm_device *drm_dev,
g2d_userptr->vma = exynos_gem_get_vma(vma);
if (!g2d_userptr->vma) {
+ up_read(&current->mm->mmap_sem);
DRM_ERROR("failed to copy vma.\n");
ret = -ENOMEM;
goto err_free_pages;
@@ -492,10 +496,12 @@ static dma_addr_t *g2d_userptr_get_dma_addr(struct drm_device *drm_dev,
ret = exynos_gem_get_pages_from_userptr(start & PAGE_MASK,
npages, pages, vma);
if (ret < 0) {
+ up_read(&current->mm->mmap_sem);
DRM_ERROR("failed to get user pages from userptr.\n");
goto err_put_vma;
}
+ up_read(&current->mm->mmap_sem);
g2d_userptr->pages = pages;
sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
diff --git a/drivers/input/Kconfig b/drivers/input/Kconfig
index a11ff74a5127..9eac8de9e8b7 100644
--- a/drivers/input/Kconfig
+++ b/drivers/input/Kconfig
@@ -178,6 +178,15 @@ comment "Input Device Drivers"
source "drivers/input/keyboard/Kconfig"
+config INPUT_LEDS
+ bool "LED Support"
+ depends on LEDS_CLASS = INPUT || LEDS_CLASS = y
+ select LEDS_TRIGGERS
+ default y
+ help
+ This option enables support for LEDs on keyboards managed
+ by the input layer.
+
source "drivers/input/mouse/Kconfig"
source "drivers/input/joystick/Kconfig"
diff --git a/drivers/input/Makefile b/drivers/input/Makefile
index 5ca3f631497f..2ab5f3336da5 100644
--- a/drivers/input/Makefile
+++ b/drivers/input/Makefile
@@ -6,6 +6,9 @@
obj-$(CONFIG_INPUT) += input-core.o
input-core-y := input.o input-compat.o input-mt.o ff-core.o
+ifeq ($(CONFIG_INPUT_LEDS),y)
+input-core-y += leds.o
+endif
obj-$(CONFIG_INPUT_FF_MEMLESS) += ff-memless.o
obj-$(CONFIG_INPUT_POLLDEV) += input-polldev.o
diff --git a/drivers/input/input.c b/drivers/input/input.c
index 1c4c0db05550..3b9284b18e70 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -708,6 +708,9 @@ static void input_disconnect_device(struct input_dev *dev)
handle->open = 0;
spin_unlock_irq(&dev->event_lock);
+
+ if (is_event_supported(EV_LED, dev->evbit, EV_MAX))
+ input_led_disconnect(dev);
}
/**
@@ -2134,6 +2137,9 @@ int input_register_device(struct input_dev *dev)
list_add_tail(&dev->node, &input_dev_list);
+ if (is_event_supported(EV_LED, dev->evbit, EV_MAX))
+ input_led_connect(dev);
+
list_for_each_entry(handler, &input_handler_list, node)
input_attach_handler(dev, handler);
diff --git a/drivers/input/leds.c b/drivers/input/leds.c
new file mode 100644
index 000000000000..985fa7ebeec7
--- /dev/null
+++ b/drivers/input/leds.c
@@ -0,0 +1,249 @@
+/*
+ * LED support for the input layer
+ *
+ * Copyright 2010-2014 Samuel Thibault <samuel.thibault@ens-lyon.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/leds.h>
+#include <linux/input.h>
+
+/*
+ * Keyboard LEDs are propagated by default like the following example:
+ *
+ * VT keyboard numlock trigger
+ * -> vt::numl VT LED
+ * -> vt-numl VT trigger
+ * -> per-device inputX::numl LED
+ *
+ * Userland can however choose the trigger for the vt::numl LED, or
+ * independently choose the trigger for any inputx::numl LED.
+ *
+ *
+ * VT LED classes and triggers are registered on-demand according to
+ * existing LED devices
+ */
+
+/* Handler for VT LEDs, just triggers the corresponding VT trigger. */
+static void vt_led_set(struct led_classdev *cdev,
+ enum led_brightness brightness);
+static struct led_classdev vt_leds[LED_CNT] = {
+#define DEFINE_INPUT_LED(vt_led, nam, deftrig) \
+ [vt_led] = { \
+ .name = "vt::"nam, \
+ .max_brightness = 1, \
+ .brightness_set = vt_led_set, \
+ .default_trigger = deftrig, \
+ }
+/* Default triggers for the VT LEDs just correspond to the legacy
+ * usage. */
+ DEFINE_INPUT_LED(LED_NUML, "numl", "kbd-numlock"),
+ DEFINE_INPUT_LED(LED_CAPSL, "capsl", "kbd-capslock"),
+ DEFINE_INPUT_LED(LED_SCROLLL, "scrolll", "kbd-scrollock"),
+ DEFINE_INPUT_LED(LED_COMPOSE, "compose", NULL),
+ DEFINE_INPUT_LED(LED_KANA, "kana", "kbd-kanalock"),
+ DEFINE_INPUT_LED(LED_SLEEP, "sleep", NULL),
+ DEFINE_INPUT_LED(LED_SUSPEND, "suspend", NULL),
+ DEFINE_INPUT_LED(LED_MUTE, "mute", NULL),
+ DEFINE_INPUT_LED(LED_MISC, "misc", NULL),
+ DEFINE_INPUT_LED(LED_MAIL, "mail", NULL),
+ DEFINE_INPUT_LED(LED_CHARGING, "charging", NULL),
+};
+static const char *const vt_led_names[LED_CNT] = {
+ [LED_NUML] = "numl",
+ [LED_CAPSL] = "capsl",
+ [LED_SCROLLL] = "scrolll",
+ [LED_COMPOSE] = "compose",
+ [LED_KANA] = "kana",
+ [LED_SLEEP] = "sleep",
+ [LED_SUSPEND] = "suspend",
+ [LED_MUTE] = "mute",
+ [LED_MISC] = "misc",
+ [LED_MAIL] = "mail",
+ [LED_CHARGING] = "charging",
+};
+/* Handler for hotplug initialization */
+static void vt_led_trigger_activate(struct led_classdev *cdev);
+/* VT triggers */
+static struct led_trigger vt_led_triggers[LED_CNT] = {
+#define DEFINE_INPUT_LED_TRIGGER(vt_led, nam) \
+ [vt_led] = { \
+ .name = "vt-"nam, \
+ .activate = vt_led_trigger_activate, \
+ }
+ DEFINE_INPUT_LED_TRIGGER(LED_NUML, "numl"),
+ DEFINE_INPUT_LED_TRIGGER(LED_CAPSL, "capsl"),
+ DEFINE_INPUT_LED_TRIGGER(LED_SCROLLL, "scrolll"),
+ DEFINE_INPUT_LED_TRIGGER(LED_COMPOSE, "compose"),
+ DEFINE_INPUT_LED_TRIGGER(LED_KANA, "kana"),
+ DEFINE_INPUT_LED_TRIGGER(LED_SLEEP, "sleep"),
+ DEFINE_INPUT_LED_TRIGGER(LED_SUSPEND, "suspend"),
+ DEFINE_INPUT_LED_TRIGGER(LED_MUTE, "mute"),
+ DEFINE_INPUT_LED_TRIGGER(LED_MISC, "misc"),
+ DEFINE_INPUT_LED_TRIGGER(LED_MAIL, "mail"),
+ DEFINE_INPUT_LED_TRIGGER(LED_CHARGING, "charging"),
+};
+
+/* Lock for registration coherency */
+static DEFINE_MUTEX(vt_led_registered_lock);
+
+/* Which VT LED classes and triggers are registered */
+static unsigned long vt_led_registered[BITS_TO_LONGS(LED_CNT)];
+
+/* Number of input devices having each LED */
+static int vt_led_references[LED_CNT];
+
+/* VT LED state change, tell the VT trigger. */
+static void vt_led_set(struct led_classdev *cdev,
+ enum led_brightness brightness)
+{
+ int led = cdev - vt_leds;
+
+ led_trigger_event(&vt_led_triggers[led], !!brightness);
+}
+
+/* LED state change for some keyboard, notify that keyboard. */
+static void perdevice_input_led_set(struct led_classdev *cdev,
+ enum led_brightness brightness)
+{
+ struct input_dev *dev;
+ struct led_classdev *leds;
+ int led;
+
+ dev = cdev->dev->platform_data;
+ if (!dev)
+ /* Still initializing */
+ return;
+ leds = dev->leds;
+ led = cdev - leds;
+
+ input_event(dev, EV_LED, led, !!brightness);
+ input_event(dev, EV_SYN, SYN_REPORT, 0);
+}
+
+/* Keyboard hotplug, initialize its LED status */
+static void vt_led_trigger_activate(struct led_classdev *cdev)
+{
+ struct led_trigger *trigger = cdev->trigger;
+ int led = trigger - vt_led_triggers;
+
+ if (cdev->brightness_set)
+ cdev->brightness_set(cdev, vt_leds[led].brightness);
+}
+
+/* Free led stuff from input device, used at abortion and disconnection. */
+static void input_led_delete(struct input_dev *dev)
+{
+ if (dev) {
+ struct led_classdev *leds = dev->leds;
+ if (leds) {
+ int i;
+ for (i = 0; i < LED_CNT; i++)
+ kfree(leds[i].name);
+ kfree(leds);
+ dev->leds = NULL;
+ }
+ }
+}
+
+/* A new input device with potential LEDs to connect. */
+int input_led_connect(struct input_dev *dev)
+{
+ int i, error = 0;
+ struct led_classdev *leds;
+
+ dev->leds = leds = kcalloc(LED_CNT, sizeof(*leds), GFP_KERNEL);
+ if (!dev->leds)
+ return -ENOMEM;
+
+ /* lazily register missing VT LEDs */
+ mutex_lock(&vt_led_registered_lock);
+ for (i = 0; i < LED_CNT; i++)
+ if (vt_leds[i].name && test_bit(i, dev->ledbit)) {
+ if (!vt_led_references[i]) {
+ led_trigger_register(&vt_led_triggers[i]);
+ /* This keyboard is first to have led i,
+ * try to register it */
+ if (!led_classdev_register(NULL, &vt_leds[i]))
+ vt_led_references[i] = 1;
+ else
+ led_trigger_unregister(&vt_led_triggers[i]);
+ } else
+ vt_led_references[i]++;
+ }
+ mutex_unlock(&vt_led_registered_lock);
+
+ /* and register this device's LEDs */
+ for (i = 0; i < LED_CNT; i++)
+ if (vt_leds[i].name && test_bit(i, dev->ledbit)) {
+ leds[i].name = kasprintf(GFP_KERNEL, "%s::%s",
+ dev_name(&dev->dev),
+ vt_led_names[i]);
+ if (!leds[i].name) {
+ error = -ENOMEM;
+ goto err;
+ }
+ leds[i].max_brightness = 1;
+ leds[i].brightness_set = perdevice_input_led_set;
+ leds[i].default_trigger = vt_led_triggers[i].name;
+ }
+
+ /* No issue so far, we can register for real. */
+ for (i = 0; i < LED_CNT; i++)
+ if (leds[i].name) {
+ led_classdev_register(&dev->dev, &leds[i]);
+ leds[i].dev->platform_data = dev;
+ perdevice_input_led_set(&leds[i],
+ vt_leds[i].brightness);
+ }
+
+ return 0;
+
+err:
+ input_led_delete(dev);
+ return error;
+}
+
+/*
+ * Disconnected input device. Clean it, and deregister now-useless VT LEDs
+ * and triggers.
+ */
+void input_led_disconnect(struct input_dev *dev)
+{
+ int i;
+ struct led_classdev *leds = dev->leds;
+
+ for (i = 0; i < LED_CNT; i++)
+ if (leds[i].name)
+ led_classdev_unregister(&leds[i]);
+
+ input_led_delete(dev);
+
+ mutex_lock(&vt_led_registered_lock);
+ for (i = 0; i < LED_CNT; i++) {
+ if (!vt_leds[i].name || !test_bit(i, dev->ledbit))
+ continue;
+
+ vt_led_references[i]--;
+ if (vt_led_references[i]) {
+ /* Still some devices needing it */
+ continue;
+ }
+
+ led_classdev_unregister(&vt_leds[i]);
+ led_trigger_unregister(&vt_led_triggers[i]);
+ clear_bit(i, vt_led_registered);
+ }
+ mutex_unlock(&vt_led_registered_lock);
+}
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("User LED support for input layer");
+MODULE_AUTHOR("Samuel Thibault <samuel.thibault@ens-lyon.org>");
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index f256ffc02e29..dbd55f0f2737 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -3193,7 +3193,7 @@ static void *intel_alloc_coherent(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t flags,
struct dma_attrs *attrs)
{
- void *vaddr;
+ struct page *page = NULL;
int order;
size = PAGE_ALIGN(size);
@@ -3208,17 +3208,31 @@ static void *intel_alloc_coherent(struct device *dev, size_t size,
flags |= GFP_DMA32;
}
- vaddr = (void *)__get_free_pages(flags, order);
- if (!vaddr)
+ if (flags & __GFP_WAIT) {
+ unsigned int count = size >> PAGE_SHIFT;
+
+ page = dma_alloc_from_contiguous(dev, count, order);
+ if (page && iommu_no_mapping(dev) &&
+ page_to_phys(page) + size > dev->coherent_dma_mask) {
+ dma_release_from_contiguous(dev, page, count);
+ page = NULL;
+ }
+ }
+
+ if (!page)
+ page = alloc_pages(flags, order);
+ if (!page)
return NULL;
- memset(vaddr, 0, size);
+ memset(page_address(page), 0, size);
- *dma_handle = __intel_map_single(dev, virt_to_bus(vaddr), size,
+ *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
DMA_BIDIRECTIONAL,
dev->coherent_dma_mask);
if (*dma_handle)
- return vaddr;
- free_pages((unsigned long)vaddr, order);
+ return page_address(page);
+ if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
+ __free_pages(page, order);
+
return NULL;
}
@@ -3226,12 +3240,14 @@ static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
dma_addr_t dma_handle, struct dma_attrs *attrs)
{
int order;
+ struct page *page = virt_to_page(vaddr);
size = PAGE_ALIGN(size);
order = get_order(size);
intel_unmap_page(dev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
- free_pages((unsigned long)vaddr, order);
+ if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
+ __free_pages(page, order);
}
static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig
index 03bc098136bc..7e55a85c303a 100644
--- a/drivers/leds/Kconfig
+++ b/drivers/leds/Kconfig
@@ -11,9 +11,6 @@ menuconfig NEW_LEDS
Say Y to enable Linux LED support. This allows control of supported
LEDs from both userspace and optionally, by kernel events (triggers).
- This is not related to standard keyboard LEDs which are controlled
- via the input system.
-
if NEW_LEDS
config LEDS_CLASS
diff --git a/drivers/misc/ti-st/st_core.c b/drivers/misc/ti-st/st_core.c
index 1972d57aadb3..e7fbc08a0627 100644
--- a/drivers/misc/ti-st/st_core.c
+++ b/drivers/misc/ti-st/st_core.c
@@ -342,7 +342,7 @@ void st_int_recv(void *disc_data,
/* Unknow packet? */
default:
type = *ptr;
- if (st_gdata->list[type] == NULL) {
+ if (type >= ST_MAX_CHANNELS || st_gdata->list[type] == NULL) {
pr_err("chip/interface misbehavior dropping"
" frame starting with 0x%02x", type);
goto done;
diff --git a/drivers/net/irda/donauboe.c b/drivers/net/irda/donauboe.c
index 768dfe9a9315..6d3e2093bf7f 100644
--- a/drivers/net/irda/donauboe.c
+++ b/drivers/net/irda/donauboe.c
@@ -1755,17 +1755,4 @@ static struct pci_driver donauboe_pci_driver = {
.resume = toshoboe_wakeup
};
-static int __init
-donauboe_init (void)
-{
- return pci_register_driver(&donauboe_pci_driver);
-}
-
-static void __exit
-donauboe_cleanup (void)
-{
- pci_unregister_driver(&donauboe_pci_driver);
-}
-
-module_init(donauboe_init);
-module_exit(donauboe_cleanup);
+module_pci_driver(donauboe_pci_driver);
diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index 2e565f8e5165..f4e113874e7a 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -386,12 +386,12 @@ config RTC_DRV_PCF8583
will be called rtc-pcf8583.
config RTC_DRV_M41T80
- tristate "ST M41T62/65/M41T80/81/82/83/84/85/87"
+ tristate "ST M41T62/65/M41T80/81/82/83/84/85/87 and compatible"
help
If you say Y here you will get support for the ST M41T60
and M41T80 RTC chips series. Currently, the following chips are
supported: M41T62, M41T65, M41T80, M41T81, M41T82, M41T83, M41ST84,
- M41ST85, and M41ST87.
+ M41ST85, M41ST87, and MicroCrystal RV4162.
This driver can also be built as a module. If so, the module
will be called rtc-m41t80.
@@ -1327,6 +1327,15 @@ config RTC_DRV_MOXART
This driver can also be built as a module. If so, the module
will be called rtc-moxart
+config RTC_DRV_XGENE
+ tristate "APM X-Gene RTC"
+ help
+ If you say yes here you get support for the APM X-Gene SoC real time
+ clock.
+
+ This driver can also be built as a module, if so, the module
+ will be called "rtc-xgene".
+
comment "HID Sensor RTC drivers"
config RTC_DRV_HID_SENSOR_TIME
diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile
index 40a09915c8f6..4545b4a88f30 100644
--- a/drivers/rtc/Makefile
+++ b/drivers/rtc/Makefile
@@ -135,5 +135,6 @@ obj-$(CONFIG_RTC_DRV_VT8500) += rtc-vt8500.o
obj-$(CONFIG_RTC_DRV_WM831X) += rtc-wm831x.o
obj-$(CONFIG_RTC_DRV_WM8350) += rtc-wm8350.o
obj-$(CONFIG_RTC_DRV_X1205) += rtc-x1205.o
+obj-$(CONFIG_RTC_DRV_XGENE) += rtc-xgene.o
obj-$(CONFIG_RTC_DRV_SIRFSOC) += rtc-sirfsoc.o
obj-$(CONFIG_RTC_DRV_MOXART) += rtc-moxart.o
diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c
index c2eff6082363..13f93fa7fa80 100644
--- a/drivers/rtc/interface.c
+++ b/drivers/rtc/interface.c
@@ -292,7 +292,9 @@ int __rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm)
dev_dbg(&rtc->dev, "alarm rollover: %s\n", "year");
do {
alarm->time.tm_year++;
- } while (rtc_valid_tm(&alarm->time) != 0);
+ } while (alarm->time.tm_mon == 1
+ && is_leap_year(alarm->time.tm_year + 1900)
+ && rtc_valid_tm(&alarm->time) != 0);
break;
default:
@@ -300,7 +302,16 @@ int __rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm)
}
done:
- return 0;
+ err = rtc_valid_tm(&alarm->time);
+
+ if (err) {
+ dev_warn(&rtc->dev, "invalid alarm value: %d-%d-%d %d:%d:%d\n",
+ alarm->time.tm_year + 1900, alarm->time.tm_mon + 1,
+ alarm->time.tm_mday, alarm->time.tm_hour, alarm->time.tm_min,
+ alarm->time.tm_sec);
+ }
+
+ return err;
}
int rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm)
diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c
index 0963c9309c74..b0e4a3eb33c7 100644
--- a/drivers/rtc/rtc-cmos.c
+++ b/drivers/rtc/rtc-cmos.c
@@ -647,6 +647,7 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq)
int retval = 0;
unsigned char rtc_control;
unsigned address_space;
+ u32 flags = 0;
/* there can be only one ... */
if (cmos_rtc.dev)
@@ -660,9 +661,12 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq)
* REVISIT non-x86 systems may instead use memory space resources
* (needing ioremap etc), not i/o space resources like this ...
*/
- ports = request_region(ports->start,
- resource_size(ports),
- driver_name);
+ if (RTC_IOMAPPED)
+ ports = request_region(ports->start, resource_size(ports),
+ driver_name);
+ else
+ ports = request_mem_region(ports->start, resource_size(ports),
+ driver_name);
if (!ports) {
dev_dbg(dev, "i/o registers already in use\n");
return -EBUSY;
@@ -699,6 +703,11 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq)
* expect CMOS_READ and friends to handle.
*/
if (info) {
+ if (info->flags)
+ flags = info->flags;
+ if (info->address_space)
+ address_space = info->address_space;
+
if (info->rtc_day_alarm && info->rtc_day_alarm < 128)
cmos_rtc.day_alrm = info->rtc_day_alarm;
if (info->rtc_mon_alarm && info->rtc_mon_alarm < 128)
@@ -726,18 +735,21 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq)
spin_lock_irq(&rtc_lock);
- /* force periodic irq to CMOS reset default of 1024Hz;
- *
- * REVISIT it's been reported that at least one x86_64 ALI mobo
- * doesn't use 32KHz here ... for portability we might need to
- * do something about other clock frequencies.
- */
- cmos_rtc.rtc->irq_freq = 1024;
- hpet_set_periodic_freq(cmos_rtc.rtc->irq_freq);
- CMOS_WRITE(RTC_REF_CLCK_32KHZ | 0x06, RTC_FREQ_SELECT);
+ if (!(flags & CMOS_RTC_FLAGS_NOFREQ)) {
+ /* force periodic irq to CMOS reset default of 1024Hz;
+ *
+ * REVISIT it's been reported that at least one x86_64 ALI
+ * mobo doesn't use 32KHz here ... for portability we might
+ * need to do something about other clock frequencies.
+ */
+ cmos_rtc.rtc->irq_freq = 1024;
+ hpet_set_periodic_freq(cmos_rtc.rtc->irq_freq);
+ CMOS_WRITE(RTC_REF_CLCK_32KHZ | 0x06, RTC_FREQ_SELECT);
+ }
/* disable irqs */
- cmos_irq_disable(&cmos_rtc, RTC_PIE | RTC_AIE | RTC_UIE);
+ if (is_valid_irq(rtc_irq))
+ cmos_irq_disable(&cmos_rtc, RTC_PIE | RTC_AIE | RTC_UIE);
rtc_control = CMOS_READ(RTC_CONTROL);
@@ -802,14 +814,18 @@ cleanup1:
cmos_rtc.dev = NULL;
rtc_device_unregister(cmos_rtc.rtc);
cleanup0:
- release_region(ports->start, resource_size(ports));
+ if (RTC_IOMAPPED)
+ release_region(ports->start, resource_size(ports));
+ else
+ release_mem_region(ports->start, resource_size(ports));
return retval;
}
-static void cmos_do_shutdown(void)
+static void cmos_do_shutdown(int rtc_irq)
{
spin_lock_irq(&rtc_lock);
- cmos_irq_disable(&cmos_rtc, RTC_IRQMASK);
+ if (is_valid_irq(rtc_irq))
+ cmos_irq_disable(&cmos_rtc, RTC_IRQMASK);
spin_unlock_irq(&rtc_lock);
}
@@ -818,7 +834,7 @@ static void __exit cmos_do_remove(struct device *dev)
struct cmos_rtc *cmos = dev_get_drvdata(dev);
struct resource *ports;
- cmos_do_shutdown();
+ cmos_do_shutdown(cmos->irq);
sysfs_remove_bin_file(&dev->kobj, &nvram);
@@ -831,7 +847,10 @@ static void __exit cmos_do_remove(struct device *dev)
cmos->rtc = NULL;
ports = cmos->iomem;
- release_region(ports->start, resource_size(ports));
+ if (RTC_IOMAPPED)
+ release_region(ports->start, resource_size(ports));
+ else
+ release_mem_region(ports->start, resource_size(ports));
cmos->iomem = NULL;
cmos->dev = NULL;
@@ -1065,10 +1084,13 @@ static void __exit cmos_pnp_remove(struct pnp_dev *pnp)
static void cmos_pnp_shutdown(struct pnp_dev *pnp)
{
- if (system_state == SYSTEM_POWER_OFF && !cmos_poweroff(&pnp->dev))
+ struct device *dev = &pnp->dev;
+ struct cmos_rtc *cmos = dev_get_drvdata(dev);
+
+ if (system_state == SYSTEM_POWER_OFF && !cmos_poweroff(dev))
return;
- cmos_do_shutdown();
+ cmos_do_shutdown(cmos->irq);
}
static const struct pnp_device_id rtc_ids[] = {
@@ -1143,11 +1165,21 @@ static inline void cmos_of_init(struct platform_device *pdev) {}
static int __init cmos_platform_probe(struct platform_device *pdev)
{
+ struct resource *resource;
+ int irq;
+
cmos_of_init(pdev);
cmos_wake_setup(&pdev->dev);
- return cmos_do_probe(&pdev->dev,
- platform_get_resource(pdev, IORESOURCE_IO, 0),
- platform_get_irq(pdev, 0));
+
+ if (RTC_IOMAPPED)
+ resource = platform_get_resource(pdev, IORESOURCE_IO, 0);
+ else
+ resource = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ irq = platform_get_irq(pdev, 0);
+ if (irq < 0)
+ irq = -1;
+
+ return cmos_do_probe(&pdev->dev, resource, irq);
}
static int __exit cmos_platform_remove(struct platform_device *pdev)
@@ -1158,10 +1190,13 @@ static int __exit cmos_platform_remove(struct platform_device *pdev)
static void cmos_platform_shutdown(struct platform_device *pdev)
{
- if (system_state == SYSTEM_POWER_OFF && !cmos_poweroff(&pdev->dev))
+ struct device *dev = &pdev->dev;
+ struct cmos_rtc *cmos = dev_get_drvdata(dev);
+
+ if (system_state == SYSTEM_POWER_OFF && !cmos_poweroff(dev))
return;
- cmos_do_shutdown();
+ cmos_do_shutdown(cmos->irq);
}
/* work with hotplug and coldplug */
diff --git a/drivers/rtc/rtc-efi.c b/drivers/rtc/rtc-efi.c
index 797aa0252ba9..c4c38431012e 100644
--- a/drivers/rtc/rtc-efi.c
+++ b/drivers/rtc/rtc-efi.c
@@ -35,7 +35,7 @@ static inline int
compute_yday(efi_time_t *eft)
{
/* efi_time_t.month is in the [1-12] so, we need -1 */
- return rtc_year_days(eft->day - 1, eft->month - 1, eft->year);
+ return rtc_year_days(eft->day, eft->month - 1, eft->year);
}
/*
* returns day of the week [0-6] 0=Sunday
diff --git a/drivers/rtc/rtc-m41t80.c b/drivers/rtc/rtc-m41t80.c
index a5248aa1abf1..7ff7427c2e6a 100644
--- a/drivers/rtc/rtc-m41t80.c
+++ b/drivers/rtc/rtc-m41t80.c
@@ -66,8 +66,6 @@
#define M41T80_FEATURE_WD (1 << 3) /* Extra watchdog resolution */
#define M41T80_FEATURE_SQ_ALT (1 << 4) /* RSx bits are in reg 4 */
-#define DRV_VERSION "0.05"
-
static DEFINE_MUTEX(m41t80_rtc_mutex);
static const struct i2c_device_id m41t80_id[] = {
{ "m41t62", M41T80_FEATURE_SQ | M41T80_FEATURE_SQ_ALT },
@@ -80,6 +78,7 @@ static const struct i2c_device_id m41t80_id[] = {
{ "m41st84", M41T80_FEATURE_HT | M41T80_FEATURE_BL | M41T80_FEATURE_SQ },
{ "m41st85", M41T80_FEATURE_HT | M41T80_FEATURE_BL | M41T80_FEATURE_SQ },
{ "m41st87", M41T80_FEATURE_HT | M41T80_FEATURE_BL | M41T80_FEATURE_SQ },
+ { "rv4162", M41T80_FEATURE_SQ | M41T80_FEATURE_WD | M41T80_FEATURE_SQ_ALT },
{ }
};
MODULE_DEVICE_TABLE(i2c, m41t80_id);
@@ -232,7 +231,7 @@ static ssize_t m41t80_sysfs_show_flags(struct device *dev,
val = i2c_smbus_read_byte_data(client, M41T80_REG_FLAGS);
if (val < 0)
- return -EIO;
+ return val;
return sprintf(buf, "%#x\n", val);
}
static DEVICE_ATTR(flags, S_IRUGO, m41t80_sysfs_show_flags, NULL);
@@ -252,7 +251,7 @@ static ssize_t m41t80_sysfs_show_sqwfreq(struct device *dev,
reg_sqw = M41T80_REG_WDAY;
val = i2c_smbus_read_byte_data(client, reg_sqw);
if (val < 0)
- return -EIO;
+ return val;
val = (val >> 4) & 0xf;
switch (val) {
case 0:
@@ -271,7 +270,7 @@ static ssize_t m41t80_sysfs_set_sqwfreq(struct device *dev,
{
struct i2c_client *client = to_i2c_client(dev);
struct m41t80_data *clientdata = i2c_get_clientdata(client);
- int almon, sqw, reg_sqw;
+ int almon, sqw, reg_sqw, rc;
int val = simple_strtoul(buf, NULL, 0);
if (!(clientdata->features & M41T80_FEATURE_SQ))
@@ -291,21 +290,30 @@ static ssize_t m41t80_sysfs_set_sqwfreq(struct device *dev,
/* disable SQW, set SQW frequency & re-enable */
almon = i2c_smbus_read_byte_data(client, M41T80_REG_ALARM_MON);
if (almon < 0)
- return -EIO;
+ return almon;
reg_sqw = M41T80_REG_SQW;
if (clientdata->features & M41T80_FEATURE_SQ_ALT)
reg_sqw = M41T80_REG_WDAY;
sqw = i2c_smbus_read_byte_data(client, reg_sqw);
if (sqw < 0)
- return -EIO;
+ return sqw;
sqw = (sqw & 0x0f) | (val << 4);
- if (i2c_smbus_write_byte_data(client, M41T80_REG_ALARM_MON,
- almon & ~M41T80_ALMON_SQWE) < 0 ||
- i2c_smbus_write_byte_data(client, reg_sqw, sqw) < 0)
- return -EIO;
- if (val && i2c_smbus_write_byte_data(client, M41T80_REG_ALARM_MON,
- almon | M41T80_ALMON_SQWE) < 0)
- return -EIO;
+
+ rc = i2c_smbus_write_byte_data(client, M41T80_REG_ALARM_MON,
+ almon & ~M41T80_ALMON_SQWE);
+ if (rc < 0)
+ return rc;
+
+ if (val) {
+ rc = i2c_smbus_write_byte_data(client, reg_sqw, sqw);
+ if (rc < 0)
+ return rc;
+
+ rc = i2c_smbus_write_byte_data(client, M41T80_REG_ALARM_MON,
+ almon | M41T80_ALMON_SQWE);
+ if (rc <0)
+ return rc;
+ }
return count;
}
static DEVICE_ATTR(sqwfreq, S_IRUGO | S_IWUSR,
@@ -629,40 +637,28 @@ static int m41t80_probe(struct i2c_client *client,
struct m41t80_data *clientdata = NULL;
if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C
- | I2C_FUNC_SMBUS_BYTE_DATA)) {
- rc = -ENODEV;
- goto exit;
- }
-
- dev_info(&client->dev,
- "chip found, driver version " DRV_VERSION "\n");
+ | I2C_FUNC_SMBUS_BYTE_DATA))
+ return -ENODEV;
clientdata = devm_kzalloc(&client->dev, sizeof(*clientdata),
GFP_KERNEL);
- if (!clientdata) {
- rc = -ENOMEM;
- goto exit;
- }
+ if (!clientdata)
+ return -ENOMEM;
clientdata->features = id->driver_data;
i2c_set_clientdata(client, clientdata);
rtc = devm_rtc_device_register(&client->dev, client->name,
&m41t80_rtc_ops, THIS_MODULE);
- if (IS_ERR(rtc)) {
- rc = PTR_ERR(rtc);
- rtc = NULL;
- goto exit;
- }
+ if (IS_ERR(rtc))
+ return PTR_ERR(rtc);
clientdata->rtc = rtc;
/* Make sure HT (Halt Update) bit is cleared */
rc = i2c_smbus_read_byte_data(client, M41T80_REG_ALARM_HOUR);
- if (rc < 0)
- goto ht_err;
- if (rc & M41T80_ALHOUR_HT) {
+ if (rc >= 0 && rc & M41T80_ALHOUR_HT) {
if (clientdata->features & M41T80_FEATURE_HT) {
m41t80_get_datetime(client, &tm);
dev_info(&client->dev, "HT bit was set!\n");
@@ -673,53 +669,44 @@ static int m41t80_probe(struct i2c_client *client,
tm.tm_mon + 1, tm.tm_mday, tm.tm_hour,
tm.tm_min, tm.tm_sec);
}
- if (i2c_smbus_write_byte_data(client,
- M41T80_REG_ALARM_HOUR,
- rc & ~M41T80_ALHOUR_HT) < 0)
- goto ht_err;
+ rc = i2c_smbus_write_byte_data(client, M41T80_REG_ALARM_HOUR,
+ rc & ~M41T80_ALHOUR_HT);
+ }
+
+ if (rc < 0) {
+ dev_err(&client->dev, "Can't clear HT bit\n");
+ return rc;
}
/* Make sure ST (stop) bit is cleared */
rc = i2c_smbus_read_byte_data(client, M41T80_REG_SEC);
- if (rc < 0)
- goto st_err;
- if (rc & M41T80_SEC_ST) {
- if (i2c_smbus_write_byte_data(client, M41T80_REG_SEC,
- rc & ~M41T80_SEC_ST) < 0)
- goto st_err;
+ if (rc >= 0 && rc & M41T80_SEC_ST)
+ rc = i2c_smbus_write_byte_data(client, M41T80_REG_SEC,
+ rc & ~M41T80_SEC_ST);
+ if (rc < 0) {
+ dev_err(&client->dev, "Can't clear ST bit\n");
+ return rc;
}
rc = m41t80_sysfs_register(&client->dev);
if (rc)
- goto exit;
+ return rc;
#ifdef CONFIG_RTC_DRV_M41T80_WDT
if (clientdata->features & M41T80_FEATURE_HT) {
save_client = client;
rc = misc_register(&wdt_dev);
if (rc)
- goto exit;
+ return rc;
rc = register_reboot_notifier(&wdt_notifier);
if (rc) {
misc_deregister(&wdt_dev);
- goto exit;
+ return rc;
}
}
#endif
return 0;
-
-st_err:
- rc = -EIO;
- dev_err(&client->dev, "Can't clear ST bit\n");
- goto exit;
-ht_err:
- rc = -EIO;
- dev_err(&client->dev, "Can't clear HT bit\n");
- goto exit;
-
-exit:
- return rc;
}
static int m41t80_remove(struct i2c_client *client)
@@ -750,4 +737,3 @@ module_i2c_driver(m41t80_driver);
MODULE_AUTHOR("Alexander Bigga <ab@mycable.de>");
MODULE_DESCRIPTION("ST Microelectronics M41T80 series RTC I2C Client Driver");
MODULE_LICENSE("GPL");
-MODULE_VERSION(DRV_VERSION);
diff --git a/drivers/rtc/rtc-pcf8523.c b/drivers/rtc/rtc-pcf8523.c
index 5c8f8226c848..4cdb64be061b 100644
--- a/drivers/rtc/rtc-pcf8523.c
+++ b/drivers/rtc/rtc-pcf8523.c
@@ -206,7 +206,7 @@ static int pcf8523_rtc_read_time(struct device *dev, struct rtc_time *tm)
tm->tm_hour = bcd2bin(regs[2] & 0x3f);
tm->tm_mday = bcd2bin(regs[3] & 0x3f);
tm->tm_wday = regs[4] & 0x7;
- tm->tm_mon = bcd2bin(regs[5] & 0x1f);
+ tm->tm_mon = bcd2bin(regs[5] & 0x1f) - 1;
tm->tm_year = bcd2bin(regs[6]) + 100;
return rtc_valid_tm(tm);
@@ -229,7 +229,7 @@ static int pcf8523_rtc_set_time(struct device *dev, struct rtc_time *tm)
regs[3] = bin2bcd(tm->tm_hour);
regs[4] = bin2bcd(tm->tm_mday);
regs[5] = tm->tm_wday;
- regs[6] = bin2bcd(tm->tm_mon);
+ regs[6] = bin2bcd(tm->tm_mon + 1);
regs[7] = bin2bcd(tm->tm_year - 100);
msg.addr = client->addr;
diff --git a/drivers/rtc/rtc-xgene.c b/drivers/rtc/rtc-xgene.c
new file mode 100644
index 000000000000..14129cc85bdb
--- /dev/null
+++ b/drivers/rtc/rtc-xgene.c
@@ -0,0 +1,278 @@
+/*
+ * APM X-Gene SoC Real Time Clock Driver
+ *
+ * Copyright (c) 2014, Applied Micro Circuits Corporation
+ * Author: Rameshwar Prasad Sahu <rsahu@apm.com>
+ * Loc Ho <lho@apm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/clk.h>
+#include <linux/delay.h>
+#include <linux/rtc.h>
+
+/* RTC CSR Registers */
+#define RTC_CCVR 0x00
+#define RTC_CMR 0x04
+#define RTC_CLR 0x08
+#define RTC_CCR 0x0C
+#define RTC_CCR_IE BIT(0)
+#define RTC_CCR_MASK BIT(1)
+#define RTC_CCR_EN BIT(2)
+#define RTC_CCR_WEN BIT(3)
+#define RTC_STAT 0x10
+#define RTC_STAT_BIT BIT(0)
+#define RTC_RSTAT 0x14
+#define RTC_EOI 0x18
+#define RTC_VER 0x1C
+
+struct xgene_rtc_dev {
+ struct rtc_device *rtc;
+ struct device *dev;
+ unsigned long alarm_time;
+ void __iomem *csr_base;
+ struct clk *clk;
+ unsigned int irq_wake;
+};
+
+static int xgene_rtc_read_time(struct device *dev, struct rtc_time *tm)
+{
+ struct xgene_rtc_dev *pdata = dev_get_drvdata(dev);
+
+ rtc_time_to_tm(readl(pdata->csr_base + RTC_CCVR), tm);
+ return rtc_valid_tm(tm);
+}
+
+static int xgene_rtc_set_mmss(struct device *dev, unsigned long secs)
+{
+ struct xgene_rtc_dev *pdata = dev_get_drvdata(dev);
+
+ /*
+ * NOTE: After the following write, the RTC_CCVR is only reflected
+ * after the update cycle of 1 seconds.
+ */
+ writel((u32) secs, pdata->csr_base + RTC_CLR);
+ readl(pdata->csr_base + RTC_CLR); /* Force a barrier */
+
+ return 0;
+}
+
+static int xgene_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm)
+{
+ struct xgene_rtc_dev *pdata = dev_get_drvdata(dev);
+
+ rtc_time_to_tm(pdata->alarm_time, &alrm->time);
+ alrm->enabled = readl(pdata->csr_base + RTC_CCR) & RTC_CCR_IE;
+
+ return 0;
+}
+
+static int xgene_rtc_alarm_irq_enable(struct device *dev, u32 enabled)
+{
+ struct xgene_rtc_dev *pdata = dev_get_drvdata(dev);
+ u32 ccr;
+
+ ccr = readl(pdata->csr_base + RTC_CCR);
+ if (enabled) {
+ ccr &= ~RTC_CCR_MASK;
+ ccr |= RTC_CCR_IE;
+ } else {
+ ccr &= ~RTC_CCR_IE;
+ ccr |= RTC_CCR_MASK;
+ }
+ writel(ccr, pdata->csr_base + RTC_CCR);
+
+ return 0;
+}
+
+static int xgene_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
+{
+ struct xgene_rtc_dev *pdata = dev_get_drvdata(dev);
+ unsigned long rtc_time;
+ unsigned long alarm_time;
+
+ rtc_time = readl(pdata->csr_base + RTC_CCVR);
+ rtc_tm_to_time(&alrm->time, &alarm_time);
+
+ pdata->alarm_time = alarm_time;
+ writel((u32) pdata->alarm_time, pdata->csr_base + RTC_CMR);
+
+ xgene_rtc_alarm_irq_enable(dev, alrm->enabled);
+
+ return 0;
+}
+
+static const struct rtc_class_ops xgene_rtc_ops = {
+ .read_time = xgene_rtc_read_time,
+ .set_mmss = xgene_rtc_set_mmss,
+ .read_alarm = xgene_rtc_read_alarm,
+ .set_alarm = xgene_rtc_set_alarm,
+ .alarm_irq_enable = xgene_rtc_alarm_irq_enable,
+};
+
+static irqreturn_t xgene_rtc_interrupt(int irq, void *id)
+{
+ struct xgene_rtc_dev *pdata = (struct xgene_rtc_dev *) id;
+
+ /* Check if interrupt asserted */
+ if (!(readl(pdata->csr_base + RTC_STAT) & RTC_STAT_BIT))
+ return IRQ_NONE;
+
+ /* Clear interrupt */
+ readl(pdata->csr_base + RTC_EOI);
+
+ rtc_update_irq(pdata->rtc, 1, RTC_IRQF | RTC_AF);
+
+ return IRQ_HANDLED;
+}
+
+static int xgene_rtc_probe(struct platform_device *pdev)
+{
+ struct xgene_rtc_dev *pdata;
+ struct resource *res;
+ int ret;
+ int irq;
+
+ pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL);
+ if (!pdata)
+ return -ENOMEM;
+ platform_set_drvdata(pdev, pdata);
+ pdata->dev = &pdev->dev;
+
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ pdata->csr_base = devm_ioremap_resource(&pdev->dev, res);
+ if (IS_ERR(pdata->csr_base))
+ return PTR_ERR(pdata->csr_base);
+
+ irq = platform_get_irq(pdev, 0);
+ if (irq < 0) {
+ dev_err(&pdev->dev, "No IRQ resource\n");
+ return irq;
+ }
+ ret = devm_request_irq(&pdev->dev, irq, xgene_rtc_interrupt, 0,
+ dev_name(&pdev->dev), pdata);
+ if (ret) {
+ dev_err(&pdev->dev, "Could not request IRQ\n");
+ return ret;
+ }
+
+ pdata->clk = devm_clk_get(&pdev->dev, NULL);
+ if (IS_ERR(pdata->clk)) {
+ dev_err(&pdev->dev, "Couldn't get the clock for RTC\n");
+ return -ENODEV;
+ }
+ clk_prepare_enable(pdata->clk);
+
+ /* Turn on the clock and the crystal */
+ writel(RTC_CCR_EN, pdata->csr_base + RTC_CCR);
+
+ device_init_wakeup(&pdev->dev, 1);
+
+ pdata->rtc = devm_rtc_device_register(&pdev->dev, pdev->name,
+ &xgene_rtc_ops, THIS_MODULE);
+ if (IS_ERR(pdata->rtc)) {
+ clk_disable_unprepare(pdata->clk);
+ return PTR_ERR(pdata->rtc);
+ }
+
+ /* HW does not support update faster than 1 seconds */
+ pdata->rtc->uie_unsupported = 1;
+
+ return 0;
+}
+
+static int xgene_rtc_remove(struct platform_device *pdev)
+{
+ struct xgene_rtc_dev *pdata = platform_get_drvdata(pdev);
+
+ xgene_rtc_alarm_irq_enable(&pdev->dev, 0);
+ device_init_wakeup(&pdev->dev, 0);
+ clk_disable_unprepare(pdata->clk);
+ return 0;
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int xgene_rtc_suspend(struct device *dev)
+{
+ struct platform_device *pdev = to_platform_device(dev);
+ struct xgene_rtc_dev *pdata = platform_get_drvdata(pdev);
+ int irq;
+
+ irq = platform_get_irq(pdev, 0);
+ if (device_may_wakeup(&pdev->dev)) {
+ if (!enable_irq_wake(irq))
+ pdata->irq_wake = 1;
+ } else {
+ xgene_rtc_alarm_irq_enable(dev, 0);
+ clk_disable(pdata->clk);
+ }
+
+ return 0;
+}
+
+static int xgene_rtc_resume(struct device *dev)
+{
+ struct platform_device *pdev = to_platform_device(dev);
+ struct xgene_rtc_dev *pdata = platform_get_drvdata(pdev);
+ int irq;
+
+ irq = platform_get_irq(pdev, 0);
+ if (device_may_wakeup(&pdev->dev)) {
+ if (pdata->irq_wake) {
+ disable_irq_wake(irq);
+ pdata->irq_wake = 0;
+ }
+ } else {
+ clk_enable(pdata->clk);
+ xgene_rtc_alarm_irq_enable(dev, 1);
+ }
+
+ return 0;
+}
+#endif
+
+static SIMPLE_DEV_PM_OPS(xgene_rtc_pm_ops, xgene_rtc_suspend, xgene_rtc_resume);
+
+#ifdef CONFIG_OF
+static const struct of_device_id xgene_rtc_of_match[] = {
+ {.compatible = "apm,xgene-rtc" },
+ { }
+};
+MODULE_DEVICE_TABLE(of, xgene_rtc_of_match);
+#endif
+
+static struct platform_driver xgene_rtc_driver = {
+ .probe = xgene_rtc_probe,
+ .remove = xgene_rtc_remove,
+ .driver = {
+ .owner = THIS_MODULE,
+ .name = "xgene-rtc",
+ .pm = &xgene_rtc_pm_ops,
+ .of_match_table = of_match_ptr(xgene_rtc_of_match),
+ },
+};
+
+module_platform_driver(xgene_rtc_driver);
+
+MODULE_DESCRIPTION("APM X-Gene SoC RTC driver");
+MODULE_AUTHOR("Rameshwar Sahu <rsahu@apm.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig
index b24aa010f68c..65cd80bf9aec 100644
--- a/drivers/tty/Kconfig
+++ b/drivers/tty/Kconfig
@@ -13,6 +13,10 @@ config VT
bool "Virtual terminal" if EXPERT
depends on !S390 && !UML
select INPUT
+ select NEW_LEDS
+ select LEDS_CLASS
+ select LEDS_TRIGGERS
+ select INPUT_LEDS
default y
---help---
If you say Y here, you will get support for terminal devices with
diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c
index d0e3a4497707..d6ecfc9e734f 100644
--- a/drivers/tty/vt/keyboard.c
+++ b/drivers/tty/vt/keyboard.c
@@ -33,6 +33,7 @@
#include <linux/string.h>
#include <linux/init.h>
#include <linux/slab.h>
+#include <linux/leds.h>
#include <linux/kbd_kern.h>
#include <linux/kbd_diacr.h>
@@ -130,6 +131,7 @@ static char rep; /* flag telling character repeat */
static int shift_state = 0;
static unsigned char ledstate = 0xff; /* undefined */
+static unsigned char lockstate = 0xff; /* undefined */
static unsigned char ledioctl;
/*
@@ -961,6 +963,41 @@ static void k_brl(struct vc_data *vc, unsigned char value, char up_flag)
}
}
+/* We route VT keyboard "leds" through triggers */
+static void kbd_ledstate_trigger_activate(struct led_classdev *cdev);
+
+static struct led_trigger ledtrig_ledstate[] = {
+#define DEFINE_LEDSTATE_TRIGGER(kbd_led, nam) \
+ [kbd_led] = { \
+ .name = nam, \
+ .activate = kbd_ledstate_trigger_activate, \
+ }
+ DEFINE_LEDSTATE_TRIGGER(VC_SCROLLOCK, "kbd-scrollock"),
+ DEFINE_LEDSTATE_TRIGGER(VC_NUMLOCK, "kbd-numlock"),
+ DEFINE_LEDSTATE_TRIGGER(VC_CAPSLOCK, "kbd-capslock"),
+ DEFINE_LEDSTATE_TRIGGER(VC_KANALOCK, "kbd-kanalock"),
+#undef DEFINE_LEDSTATE_TRIGGER
+};
+
+static void kbd_lockstate_trigger_activate(struct led_classdev *cdev);
+
+static struct led_trigger ledtrig_lockstate[] = {
+#define DEFINE_LOCKSTATE_TRIGGER(kbd_led, nam) \
+ [kbd_led] = { \
+ .name = nam, \
+ .activate = kbd_lockstate_trigger_activate, \
+ }
+ DEFINE_LOCKSTATE_TRIGGER(VC_SHIFTLOCK, "kbd-shiftlock"),
+ DEFINE_LOCKSTATE_TRIGGER(VC_ALTGRLOCK, "kbd-altgrlock"),
+ DEFINE_LOCKSTATE_TRIGGER(VC_CTRLLOCK, "kbd-ctrllock"),
+ DEFINE_LOCKSTATE_TRIGGER(VC_ALTLOCK, "kbd-altlock"),
+ DEFINE_LOCKSTATE_TRIGGER(VC_SHIFTLLOCK, "kbd-shiftllock"),
+ DEFINE_LOCKSTATE_TRIGGER(VC_SHIFTRLOCK, "kbd-shiftrlock"),
+ DEFINE_LOCKSTATE_TRIGGER(VC_CTRLLLOCK, "kbd-ctrlllock"),
+ DEFINE_LOCKSTATE_TRIGGER(VC_CTRLRLOCK, "kbd-ctrlrlock"),
+#undef DEFINE_LOCKSTATE_TRIGGER
+};
+
/*
* The leds display either (i) the status of NumLock, CapsLock, ScrollLock,
* or (ii) whatever pattern of lights people want to show using KDSETLED,
@@ -995,18 +1032,25 @@ static inline unsigned char getleds(void)
return kbd->ledflagstate;
}
-static int kbd_update_leds_helper(struct input_handle *handle, void *data)
+/* Called on trigger connection, to set initial state */
+static void kbd_ledstate_trigger_activate(struct led_classdev *cdev)
{
- unsigned char leds = *(unsigned char *)data;
+ struct led_trigger *trigger = cdev->trigger;
+ int led = trigger - ledtrig_ledstate;
- if (test_bit(EV_LED, handle->dev->evbit)) {
- input_inject_event(handle, EV_LED, LED_SCROLLL, !!(leds & 0x01));
- input_inject_event(handle, EV_LED, LED_NUML, !!(leds & 0x02));
- input_inject_event(handle, EV_LED, LED_CAPSL, !!(leds & 0x04));
- input_inject_event(handle, EV_SYN, SYN_REPORT, 0);
- }
+ tasklet_disable(&keyboard_tasklet);
+ led_trigger_event(trigger, ledstate & (1 << led) ? LED_FULL : LED_OFF);
+ tasklet_enable(&keyboard_tasklet);
+}
- return 0;
+static void kbd_lockstate_trigger_activate(struct led_classdev *cdev)
+{
+ struct led_trigger *trigger = cdev->trigger;
+ int led = trigger - ledtrig_lockstate;
+
+ tasklet_disable(&keyboard_tasklet);
+ led_trigger_event(trigger, lockstate & (1 << led) ? LED_FULL : LED_OFF);
+ tasklet_enable(&keyboard_tasklet);
}
/**
@@ -1095,16 +1139,29 @@ static void kbd_bh(unsigned long dummy)
{
unsigned char leds;
unsigned long flags;
-
+ int i;
+
spin_lock_irqsave(&led_lock, flags);
leds = getleds();
spin_unlock_irqrestore(&led_lock, flags);
if (leds != ledstate) {
- input_handler_for_each_handle(&kbd_handler, &leds,
- kbd_update_leds_helper);
+ for (i = 0; i < ARRAY_SIZE(ledtrig_ledstate); i++)
+ if ((leds ^ ledstate) & (1 << i))
+ led_trigger_event(&ledtrig_ledstate[i],
+ leds & (1 << i)
+ ? LED_FULL : LED_OFF);
ledstate = leds;
}
+
+ if (kbd->lockstate != lockstate) {
+ for (i = 0; i < ARRAY_SIZE(ledtrig_lockstate); i++)
+ if ((kbd->lockstate ^ lockstate) & (1 << i))
+ led_trigger_event(&ledtrig_lockstate[i],
+ kbd->lockstate & (1 << i)
+ ? LED_FULL : LED_OFF);
+ lockstate = kbd->lockstate;
+ }
}
DECLARE_TASKLET_DISABLED(keyboard_tasklet, kbd_bh, 0);
@@ -1442,20 +1499,6 @@ static void kbd_disconnect(struct input_handle *handle)
kfree(handle);
}
-/*
- * Start keyboard handler on the new keyboard by refreshing LED state to
- * match the rest of the system.
- */
-static void kbd_start(struct input_handle *handle)
-{
- tasklet_disable(&keyboard_tasklet);
-
- if (ledstate != 0xff)
- kbd_update_leds_helper(handle, &ledstate);
-
- tasklet_enable(&keyboard_tasklet);
-}
-
static const struct input_device_id kbd_ids[] = {
{
.flags = INPUT_DEVICE_ID_MATCH_EVBIT,
@@ -1477,7 +1520,6 @@ static struct input_handler kbd_handler = {
.match = kbd_match,
.connect = kbd_connect,
.disconnect = kbd_disconnect,
- .start = kbd_start,
.name = "kbd",
.id_table = kbd_ids,
};
@@ -1501,6 +1543,20 @@ int __init kbd_init(void)
if (error)
return error;
+ for (i = 0; i < ARRAY_SIZE(ledtrig_ledstate); i++) {
+ error = led_trigger_register(&ledtrig_ledstate[i]);
+ if (error)
+ pr_err("error %d while registering trigger %s\n",
+ error, ledtrig_ledstate[i].name);
+ }
+
+ for (i = 0; i < ARRAY_SIZE(ledtrig_lockstate); i++) {
+ error = led_trigger_register(&ledtrig_lockstate[i]);
+ if (error)
+ pr_err("error %d while registering trigger %s\n",
+ error, ledtrig_lockstate[i].name);
+ }
+
tasklet_enable(&keyboard_tasklet);
tasklet_schedule(&keyboard_tasklet);
diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c
index bd2172c2d650..31672740fe28 100644
--- a/drivers/video/backlight/backlight.c
+++ b/drivers/video/backlight/backlight.c
@@ -189,8 +189,6 @@ static ssize_t brightness_store(struct device *dev,
}
mutex_unlock(&bd->ops_lock);
- backlight_generate_event(bd, BACKLIGHT_UPDATE_SYSFS);
-
return rc;
}
static DEVICE_ATTR_RW(brightness);
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 25b23b1e7f22..9bca88159725 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -1,3 +1,9 @@
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/buffer_head.h>
@@ -206,7 +212,7 @@ affs_set_blocksize(struct super_block *sb, int size)
static inline struct buffer_head *
affs_bread(struct super_block *sb, int block)
{
- pr_debug("affs_bread: %d\n", block);
+ pr_debug("%s: %d\n", __func__, block);
if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size)
return sb_bread(sb, block);
return NULL;
@@ -214,7 +220,7 @@ affs_bread(struct super_block *sb, int block)
static inline struct buffer_head *
affs_getblk(struct super_block *sb, int block)
{
- pr_debug("affs_getblk: %d\n", block);
+ pr_debug("%s: %d\n", __func__, block);
if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size)
return sb_getblk(sb, block);
return NULL;
@@ -223,7 +229,7 @@ static inline struct buffer_head *
affs_getzeroblk(struct super_block *sb, int block)
{
struct buffer_head *bh;
- pr_debug("affs_getzeroblk: %d\n", block);
+ pr_debug("%s: %d\n", __func__, block);
if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size) {
bh = sb_getblk(sb, block);
lock_buffer(bh);
@@ -238,7 +244,7 @@ static inline struct buffer_head *
affs_getemptyblk(struct super_block *sb, int block)
{
struct buffer_head *bh;
- pr_debug("affs_getemptyblk: %d\n", block);
+ pr_debug("%s: %d\n", __func__, block);
if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size) {
bh = sb_getblk(sb, block);
wait_on_buffer(bh);
@@ -251,7 +257,7 @@ static inline void
affs_brelse(struct buffer_head *bh)
{
if (bh)
- pr_debug("affs_brelse: %lld\n", (long long) bh->b_blocknr);
+ pr_debug("%s: %lld\n", __func__, (long long) bh->b_blocknr);
brelse(bh);
}
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 533a322c41c0..406b29836b19 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -34,7 +34,7 @@ affs_insert_hash(struct inode *dir, struct buffer_head *bh)
ino = bh->b_blocknr;
offset = affs_hash_name(sb, AFFS_TAIL(sb, bh)->name + 1, AFFS_TAIL(sb, bh)->name[0]);
- pr_debug("AFFS: insert_hash(dir=%u, ino=%d)\n", (u32)dir->i_ino, ino);
+ pr_debug("%s(dir=%u, ino=%d)\n", __func__, (u32)dir->i_ino, ino);
dir_bh = affs_bread(sb, dir->i_ino);
if (!dir_bh)
@@ -84,7 +84,8 @@ affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh)
sb = dir->i_sb;
rem_ino = rem_bh->b_blocknr;
offset = affs_hash_name(sb, AFFS_TAIL(sb, rem_bh)->name+1, AFFS_TAIL(sb, rem_bh)->name[0]);
- pr_debug("AFFS: remove_hash(dir=%d, ino=%d, hashval=%d)\n", (u32)dir->i_ino, rem_ino, offset);
+ pr_debug("%s(dir=%d, ino=%d, hashval=%d)\n",
+ __func__, (u32)dir->i_ino, rem_ino, offset);
bh = affs_bread(sb, dir->i_ino);
if (!bh)
@@ -147,7 +148,7 @@ affs_remove_link(struct dentry *dentry)
u32 link_ino, ino;
int retval;
- pr_debug("AFFS: remove_link(key=%ld)\n", inode->i_ino);
+ pr_debug("%s(key=%ld)\n", __func__, inode->i_ino);
retval = -EIO;
bh = affs_bread(sb, inode->i_ino);
if (!bh)
@@ -279,7 +280,7 @@ affs_remove_header(struct dentry *dentry)
if (!inode)
goto done;
- pr_debug("AFFS: remove_header(key=%ld)\n", inode->i_ino);
+ pr_debug("%s(key=%ld)\n", __func__, inode->i_ino);
retval = -EIO;
bh = affs_bread(sb, (u32)(long)dentry->d_fsdata);
if (!bh)
@@ -451,10 +452,10 @@ affs_error(struct super_block *sb, const char *function, const char *fmt, ...)
vsnprintf(ErrorBuffer,sizeof(ErrorBuffer),fmt,args);
va_end(args);
- printk(KERN_CRIT "AFFS error (device %s): %s(): %s\n", sb->s_id,
+ pr_crit("error (device %s): %s(): %s\n", sb->s_id,
function,ErrorBuffer);
if (!(sb->s_flags & MS_RDONLY))
- printk(KERN_WARNING "AFFS: Remounting filesystem read-only\n");
+ pr_warn("Remounting filesystem read-only\n");
sb->s_flags |= MS_RDONLY;
}
@@ -467,7 +468,7 @@ affs_warning(struct super_block *sb, const char *function, const char *fmt, ...)
vsnprintf(ErrorBuffer,sizeof(ErrorBuffer),fmt,args);
va_end(args);
- printk(KERN_WARNING "AFFS warning (device %s): %s(): %s\n", sb->s_id,
+ pr_warn("(device %s): %s(): %s\n", sb->s_id,
function,ErrorBuffer);
}
diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c
index a32246b8359e..c8de51185c23 100644
--- a/fs/affs/bitmap.c
+++ b/fs/affs/bitmap.c
@@ -17,7 +17,7 @@ affs_count_free_blocks(struct super_block *sb)
u32 free;
int i;
- pr_debug("AFFS: count_free_blocks()\n");
+ pr_debug("%s()\n", __func__);
if (sb->s_flags & MS_RDONLY)
return 0;
@@ -43,7 +43,7 @@ affs_free_block(struct super_block *sb, u32 block)
u32 blk, bmap, bit, mask, tmp;
__be32 *data;
- pr_debug("AFFS: free_block(%u)\n", block);
+ pr_debug("%s(%u)\n", __func__, block);
if (block > sbi->s_partition_size)
goto err_range;
@@ -125,7 +125,7 @@ affs_alloc_block(struct inode *inode, u32 goal)
sb = inode->i_sb;
sbi = AFFS_SB(sb);
- pr_debug("AFFS: balloc(inode=%lu,goal=%u): ", inode->i_ino, goal);
+ pr_debug("balloc(inode=%lu,goal=%u): ", inode->i_ino, goal);
if (AFFS_I(inode)->i_pa_cnt) {
pr_debug("%d\n", AFFS_I(inode)->i_lastalloc+1);
@@ -254,8 +254,7 @@ int affs_init_bitmap(struct super_block *sb, int *flags)
return 0;
if (!AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag) {
- printk(KERN_NOTICE "AFFS: Bitmap invalid - mounting %s read only\n",
- sb->s_id);
+ pr_notice("Bitmap invalid - mounting %s read only\n", sb->s_id);
*flags |= MS_RDONLY;
return 0;
}
@@ -268,7 +267,7 @@ int affs_init_bitmap(struct super_block *sb, int *flags)
size = sbi->s_bmap_count * sizeof(*bm);
bm = sbi->s_bitmap = kzalloc(size, GFP_KERNEL);
if (!sbi->s_bitmap) {
- printk(KERN_ERR "AFFS: Bitmap allocation failed\n");
+ pr_err("Bitmap allocation failed\n");
return -ENOMEM;
}
@@ -282,17 +281,17 @@ int affs_init_bitmap(struct super_block *sb, int *flags)
bm->bm_key = be32_to_cpu(bmap_blk[blk]);
bh = affs_bread(sb, bm->bm_key);
if (!bh) {
- printk(KERN_ERR "AFFS: Cannot read bitmap\n");
+ pr_err("Cannot read bitmap\n");
res = -EIO;
goto out;
}
if (affs_checksum_block(sb, bh)) {
- printk(KERN_WARNING "AFFS: Bitmap %u invalid - mounting %s read only.\n",
- bm->bm_key, sb->s_id);
+ pr_warn("Bitmap %u invalid - mounting %s read only.\n",
+ bm->bm_key, sb->s_id);
*flags |= MS_RDONLY;
goto out;
}
- pr_debug("AFFS: read bitmap block %d: %d\n", blk, bm->bm_key);
+ pr_debug("read bitmap block %d: %d\n", blk, bm->bm_key);
bm->bm_free = memweight(bh->b_data + 4, sb->s_blocksize - 4);
/* Don't try read the extension if this is the last block,
@@ -304,7 +303,7 @@ int affs_init_bitmap(struct super_block *sb, int *flags)
affs_brelse(bmap_bh);
bmap_bh = affs_bread(sb, be32_to_cpu(bmap_blk[blk]));
if (!bmap_bh) {
- printk(KERN_ERR "AFFS: Cannot read bitmap extension\n");
+ pr_err("Cannot read bitmap extension\n");
res = -EIO;
goto out;
}
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index cbbda476a805..59f07bec92a6 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -54,8 +54,8 @@ affs_readdir(struct file *file, struct dir_context *ctx)
u32 ino;
int error = 0;
- pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",
- inode->i_ino, (unsigned long)ctx->pos);
+ pr_debug("%s(ino=%lu,f_pos=%lx)\n",
+ __func__, inode->i_ino, (unsigned long)ctx->pos);
if (ctx->pos < 2) {
file->private_data = (void *)0;
@@ -81,7 +81,7 @@ affs_readdir(struct file *file, struct dir_context *ctx)
*/
ino = (u32)(long)file->private_data;
if (ino && file->f_version == inode->i_version) {
- pr_debug("AFFS: readdir() left off=%d\n", ino);
+ pr_debug("readdir() left off=%d\n", ino);
goto inside;
}
@@ -117,7 +117,7 @@ inside:
namelen = min(AFFS_TAIL(sb, fh_bh)->name[0], (u8)30);
name = AFFS_TAIL(sb, fh_bh)->name + 1;
- pr_debug("AFFS: readdir(): dir_emit(\"%.*s\", "
+ pr_debug("readdir(): dir_emit(\"%.*s\", "
"ino=%u), hash=%d, f_pos=%x\n",
namelen, name, ino, hash_pos, (u32)ctx->pos);
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 9df23175e28b..a7fe57d2cd9a 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -45,7 +45,7 @@ const struct inode_operations affs_file_inode_operations = {
static int
affs_file_open(struct inode *inode, struct file *filp)
{
- pr_debug("AFFS: open(%lu,%d)\n",
+ pr_debug("open(%lu,%d)\n",
inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt));
atomic_inc(&AFFS_I(inode)->i_opencnt);
return 0;
@@ -54,7 +54,7 @@ affs_file_open(struct inode *inode, struct file *filp)
static int
affs_file_release(struct inode *inode, struct file *filp)
{
- pr_debug("AFFS: release(%lu, %d)\n",
+ pr_debug("release(%lu, %d)\n",
inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt));
if (atomic_dec_and_test(&AFFS_I(inode)->i_opencnt)) {
@@ -324,7 +324,8 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul
struct buffer_head *ext_bh;
u32 ext;
- pr_debug("AFFS: get_block(%u, %lu)\n", (u32)inode->i_ino, (unsigned long)block);
+ pr_debug("%s(%u, %lu)\n",
+ __func__, (u32)inode->i_ino, (unsigned long)block);
BUG_ON(block > (sector_t)0x7fffffffUL);
@@ -498,34 +499,36 @@ affs_getemptyblk_ino(struct inode *inode, int block)
}
static int
-affs_do_readpage_ofs(struct file *file, struct page *page, unsigned from, unsigned to)
+affs_do_readpage_ofs(struct page *page, unsigned to)
{
struct inode *inode = page->mapping->host;
struct super_block *sb = inode->i_sb;
struct buffer_head *bh;
char *data;
+ unsigned pos = 0;
u32 bidx, boff, bsize;
u32 tmp;
- pr_debug("AFFS: read_page(%u, %ld, %d, %d)\n", (u32)inode->i_ino, page->index, from, to);
- BUG_ON(from > to || to > PAGE_CACHE_SIZE);
+ pr_debug("%s(%u, %ld, 0, %d)\n", __func__, (u32)inode->i_ino,
+ page->index, to);
+ BUG_ON(to > PAGE_CACHE_SIZE);
kmap(page);
data = page_address(page);
bsize = AFFS_SB(sb)->s_data_blksize;
- tmp = (page->index << PAGE_CACHE_SHIFT) + from;
+ tmp = page->index << PAGE_CACHE_SHIFT;
bidx = tmp / bsize;
boff = tmp % bsize;
- while (from < to) {
+ while (pos < to) {
bh = affs_bread_ino(inode, bidx, 0);
if (IS_ERR(bh))
return PTR_ERR(bh);
- tmp = min(bsize - boff, to - from);
- BUG_ON(from + tmp > to || tmp > bsize);
- memcpy(data + from, AFFS_DATA(bh) + boff, tmp);
+ tmp = min(bsize - boff, to - pos);
+ BUG_ON(pos + tmp > to || tmp > bsize);
+ memcpy(data + pos, AFFS_DATA(bh) + boff, tmp);
affs_brelse(bh);
bidx++;
- from += tmp;
+ pos += tmp;
boff = 0;
}
flush_dcache_page(page);
@@ -542,7 +545,7 @@ affs_extent_file_ofs(struct inode *inode, u32 newsize)
u32 size, bsize;
u32 tmp;
- pr_debug("AFFS: extent_file(%u, %d)\n", (u32)inode->i_ino, newsize);
+ pr_debug("%s(%u, %d)\n", __func__, (u32)inode->i_ino, newsize);
bsize = AFFS_SB(sb)->s_data_blksize;
bh = NULL;
size = AFFS_I(inode)->mmu_private;
@@ -608,14 +611,14 @@ affs_readpage_ofs(struct file *file, struct page *page)
u32 to;
int err;
- pr_debug("AFFS: read_page(%u, %ld)\n", (u32)inode->i_ino, page->index);
+ pr_debug("%s(%u, %ld)\n", __func__, (u32)inode->i_ino, page->index);
to = PAGE_CACHE_SIZE;
if (((page->index + 1) << PAGE_CACHE_SHIFT) > inode->i_size) {
to = inode->i_size & ~PAGE_CACHE_MASK;
memset(page_address(page) + to, 0, PAGE_CACHE_SIZE - to);
}
- err = affs_do_readpage_ofs(file, page, 0, to);
+ err = affs_do_readpage_ofs(page, to);
if (!err)
SetPageUptodate(page);
unlock_page(page);
@@ -631,7 +634,8 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
pgoff_t index;
int err = 0;
- pr_debug("AFFS: write_begin(%u, %llu, %llu)\n", (u32)inode->i_ino, (unsigned long long)pos, (unsigned long long)pos + len);
+ pr_debug("%s(%u, %llu, %llu)\n", __func__, (u32)inode->i_ino,
+ (unsigned long long)pos, (unsigned long long)pos + len);
if (pos > AFFS_I(inode)->mmu_private) {
/* XXX: this probably leaves a too-big i_size in case of
* failure. Should really be updating i_size at write_end time
@@ -651,7 +655,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
return 0;
/* XXX: inefficient but safe in the face of short writes */
- err = affs_do_readpage_ofs(file, page, 0, PAGE_CACHE_SIZE);
+ err = affs_do_readpage_ofs(page, PAGE_CACHE_SIZE);
if (err) {
unlock_page(page);
page_cache_release(page);
@@ -680,7 +684,9 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
* due to write_begin.
*/
- pr_debug("AFFS: write_begin(%u, %llu, %llu)\n", (u32)inode->i_ino, (unsigned long long)pos, (unsigned long long)pos + len);
+ pr_debug("%s(%u, %llu, %llu)\n",
+ __func__, (u32)inode->i_ino, (unsigned long long)pos,
+ (unsigned long long)pos + len);
bsize = AFFS_SB(sb)->s_data_blksize;
data = page_address(page);
@@ -802,7 +808,7 @@ affs_free_prealloc(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
- pr_debug("AFFS: free_prealloc(ino=%lu)\n", inode->i_ino);
+ pr_debug("free_prealloc(ino=%lu)\n", inode->i_ino);
while (AFFS_I(inode)->i_pa_cnt) {
AFFS_I(inode)->i_pa_cnt--;
@@ -822,7 +828,7 @@ affs_truncate(struct inode *inode)
struct buffer_head *ext_bh;
int i;
- pr_debug("AFFS: truncate(inode=%d, oldsize=%u, newsize=%u)\n",
+ pr_debug("truncate(inode=%d, oldsize=%u, newsize=%u)\n",
(u32)inode->i_ino, (u32)AFFS_I(inode)->mmu_private, (u32)inode->i_size);
last_blk = 0;
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 96df91e8c334..bec2d1a0c91c 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -34,7 +34,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
if (!(inode->i_state & I_NEW))
return inode;
- pr_debug("AFFS: affs_iget(%lu)\n", inode->i_ino);
+ pr_debug("affs_iget(%lu)\n", inode->i_ino);
block = inode->i_ino;
bh = affs_bread(sb, block);
@@ -175,7 +175,7 @@ affs_write_inode(struct inode *inode, struct writeback_control *wbc)
uid_t uid;
gid_t gid;
- pr_debug("AFFS: write_inode(%lu)\n",inode->i_ino);
+ pr_debug("write_inode(%lu)\n", inode->i_ino);
if (!inode->i_nlink)
// possibly free block
@@ -220,7 +220,7 @@ affs_notify_change(struct dentry *dentry, struct iattr *attr)
struct inode *inode = dentry->d_inode;
int error;
- pr_debug("AFFS: notify_change(%lu,0x%x)\n",inode->i_ino,attr->ia_valid);
+ pr_debug("notify_change(%lu,0x%x)\n", inode->i_ino, attr->ia_valid);
error = inode_change_ok(inode,attr);
if (error)
@@ -258,7 +258,8 @@ void
affs_evict_inode(struct inode *inode)
{
unsigned long cache_page;
- pr_debug("AFFS: evict_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink);
+ pr_debug("evict_inode(ino=%lu, nlink=%u)\n",
+ inode->i_ino, inode->i_nlink);
truncate_inode_pages_final(&inode->i_data);
if (!inode->i_nlink) {
@@ -271,7 +272,7 @@ affs_evict_inode(struct inode *inode)
affs_free_prealloc(inode);
cache_page = (unsigned long)AFFS_I(inode)->i_lc;
if (cache_page) {
- pr_debug("AFFS: freeing ext cache\n");
+ pr_debug("freeing ext cache\n");
AFFS_I(inode)->i_lc = NULL;
AFFS_I(inode)->i_ac = NULL;
free_page(cache_page);
@@ -350,7 +351,8 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3
u32 block = 0;
int retval;
- pr_debug("AFFS: add_entry(dir=%u, inode=%u, \"%*s\", type=%d)\n", (u32)dir->i_ino,
+ pr_debug("%s(dir=%u, inode=%u, \"%*s\", type=%d)\n",
+ __func__, (u32)dir->i_ino,
(u32)inode->i_ino, (int)dentry->d_name.len, dentry->d_name.name, type);
retval = -EIO;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 6dae1ccd176d..035bd31556fc 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -190,7 +190,8 @@ affs_find_entry(struct inode *dir, struct dentry *dentry)
toupper_t toupper = affs_get_toupper(sb);
u32 key;
- pr_debug("AFFS: find_entry(\"%.*s\")\n", (int)dentry->d_name.len, dentry->d_name.name);
+ pr_debug("%s(\"%.*s\")\n",
+ __func__, (int)dentry->d_name.len, dentry->d_name.name);
bh = affs_bread(sb, dir->i_ino);
if (!bh)
@@ -218,7 +219,8 @@ affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
struct buffer_head *bh;
struct inode *inode = NULL;
- pr_debug("AFFS: lookup(\"%.*s\")\n",(int)dentry->d_name.len,dentry->d_name.name);
+ pr_debug("%s(\"%.*s\")\n",
+ __func__, (int)dentry->d_name.len, dentry->d_name.name);
affs_lock_dir(dir);
bh = affs_find_entry(dir, dentry);
@@ -248,9 +250,9 @@ affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
int
affs_unlink(struct inode *dir, struct dentry *dentry)
{
- pr_debug("AFFS: unlink(dir=%d, %lu \"%.*s\")\n", (u32)dir->i_ino,
- dentry->d_inode->i_ino,
- (int)dentry->d_name.len, dentry->d_name.name);
+ pr_debug("%s(dir=%d, %lu \"%.*s\")\n",
+ __func__, (u32)dir->i_ino, dentry->d_inode->i_ino,
+ (int)dentry->d_name.len, dentry->d_name.name);
return affs_remove_header(dentry);
}
@@ -262,7 +264,8 @@ affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
struct inode *inode;
int error;
- pr_debug("AFFS: create(%lu,\"%.*s\",0%ho)\n",dir->i_ino,(int)dentry->d_name.len,
+ pr_debug("%s(%lu,\"%.*s\",0%ho)\n",
+ __func__, dir->i_ino, (int)dentry->d_name.len,
dentry->d_name.name,mode);
inode = affs_new_inode(dir);
@@ -291,8 +294,9 @@ affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
struct inode *inode;
int error;
- pr_debug("AFFS: mkdir(%lu,\"%.*s\",0%ho)\n",dir->i_ino,
- (int)dentry->d_name.len,dentry->d_name.name,mode);
+ pr_debug("%s(%lu,\"%.*s\",0%ho)\n",
+ __func__, dir->i_ino, (int)dentry->d_name.len,
+ dentry->d_name.name, mode);
inode = affs_new_inode(dir);
if (!inode)
@@ -317,8 +321,8 @@ affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
int
affs_rmdir(struct inode *dir, struct dentry *dentry)
{
- pr_debug("AFFS: rmdir(dir=%u, %lu \"%.*s\")\n", (u32)dir->i_ino,
- dentry->d_inode->i_ino,
+ pr_debug("%s(dir=%u, %lu \"%.*s\")\n",
+ __func__, (u32)dir->i_ino, dentry->d_inode->i_ino,
(int)dentry->d_name.len, dentry->d_name.name);
return affs_remove_header(dentry);
@@ -334,8 +338,9 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
int i, maxlen, error;
char c, lc;
- pr_debug("AFFS: symlink(%lu,\"%.*s\" -> \"%s\")\n",dir->i_ino,
- (int)dentry->d_name.len,dentry->d_name.name,symname);
+ pr_debug("%s(%lu,\"%.*s\" -> \"%s\")\n",
+ __func__, dir->i_ino, (int)dentry->d_name.len,
+ dentry->d_name.name, symname);
maxlen = AFFS_SB(sb)->s_hashsize * sizeof(u32) - 1;
inode = affs_new_inode(dir);
@@ -404,7 +409,8 @@ affs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
{
struct inode *inode = old_dentry->d_inode;
- pr_debug("AFFS: link(%u, %u, \"%.*s\")\n", (u32)inode->i_ino, (u32)dir->i_ino,
+ pr_debug("%s(%u, %u, \"%.*s\")\n",
+ __func__, (u32)inode->i_ino, (u32)dir->i_ino,
(int)dentry->d_name.len,dentry->d_name.name);
return affs_add_entry(dir, inode, dentry, ST_LINKFILE);
@@ -418,9 +424,10 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct buffer_head *bh = NULL;
int retval;
- pr_debug("AFFS: rename(old=%u,\"%*s\" to new=%u,\"%*s\")\n",
- (u32)old_dir->i_ino, (int)old_dentry->d_name.len, old_dentry->d_name.name,
- (u32)new_dir->i_ino, (int)new_dentry->d_name.len, new_dentry->d_name.name);
+ pr_debug("%s(old=%u,\"%*s\" to new=%u,\"%*s\")\n",
+ __func__, (u32)old_dir->i_ino, (int)old_dentry->d_name.len,
+ old_dentry->d_name.name, (u32)new_dir->i_ino,
+ (int)new_dentry->d_name.len, new_dentry->d_name.name);
retval = affs_check_name(new_dentry->d_name.name,
new_dentry->d_name.len,
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 6d589f28bf9b..36f7b446cb22 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -46,7 +46,7 @@ static void
affs_put_super(struct super_block *sb)
{
struct affs_sb_info *sbi = AFFS_SB(sb);
- pr_debug("AFFS: put_super()\n");
+ pr_debug("%s()\n", __func__);
cancel_delayed_work_sync(&sbi->sb_work);
}
@@ -220,7 +220,7 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved,
return 0;
if (n != 512 && n != 1024 && n != 2048
&& n != 4096) {
- printk ("AFFS: Invalid blocksize (512, 1024, 2048, 4096 allowed)\n");
+ pr_warn("Invalid blocksize (512, 1024, 2048, 4096 allowed)\n");
return 0;
}
*blocksize = n;
@@ -285,8 +285,8 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved,
/* Silently ignore the quota options */
break;
default:
- printk("AFFS: Unrecognized mount option \"%s\" "
- "or missing value\n", p);
+ pr_warn("Unrecognized mount option \"%s\" or missing value\n",
+ p);
return 0;
}
}
@@ -319,7 +319,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
save_mount_options(sb, data);
- pr_debug("AFFS: read_super(%s)\n",data ? (const char *)data : "no options");
+ pr_debug("read_super(%s)\n", data ? (const char *)data : "no options");
sb->s_magic = AFFS_SUPER_MAGIC;
sb->s_op = &affs_sops;
@@ -339,7 +339,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block,
&blocksize,&sbi->s_prefix,
sbi->s_volume, &mount_flags)) {
- printk(KERN_ERR "AFFS: Error parsing options\n");
+ pr_err("Error parsing options\n");
kfree(sbi->s_prefix);
kfree(sbi);
return -EINVAL;
@@ -358,7 +358,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
*/
size = sb->s_bdev->bd_inode->i_size >> 9;
- pr_debug("AFFS: initial blocksize=%d, #blocks=%d\n", 512, size);
+ pr_debug("initial blocksize=%d, #blocks=%d\n", 512, size);
affs_set_blocksize(sb, PAGE_SIZE);
/* Try to find root block. Its location depends on the block size. */
@@ -373,7 +373,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_root_block = root_block;
if (root_block < 0)
sbi->s_root_block = (reserved + size - 1) / 2;
- pr_debug("AFFS: setting blocksize to %d\n", blocksize);
+ pr_debug("setting blocksize to %d\n", blocksize);
affs_set_blocksize(sb, blocksize);
sbi->s_partition_size = size;
@@ -388,7 +388,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
* block behind the calculated one. So we check this one, too.
*/
for (num_bm = 0; num_bm < 2; num_bm++) {
- pr_debug("AFFS: Dev %s, trying root=%u, bs=%d, "
+ pr_debug("Dev %s, trying root=%u, bs=%d, "
"size=%d, reserved=%d\n",
sb->s_id,
sbi->s_root_block + num_bm,
@@ -409,8 +409,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
}
}
if (!silent)
- printk(KERN_ERR "AFFS: No valid root block on device %s\n",
- sb->s_id);
+ pr_err("No valid root block on device %s\n", sb->s_id);
return -EINVAL;
/* N.B. after this point bh must be released */
@@ -422,7 +421,7 @@ got_root:
/* Find out which kind of FS we have */
boot_bh = sb_bread(sb, 0);
if (!boot_bh) {
- printk(KERN_ERR "AFFS: Cannot read boot block\n");
+ pr_err("Cannot read boot block\n");
return -EINVAL;
}
memcpy(sig, boot_bh->b_data, 4);
@@ -435,8 +434,7 @@ got_root:
*/
if ((chksum == FS_DCFFS || chksum == MUFS_DCFFS || chksum == FS_DCOFS
|| chksum == MUFS_DCOFS) && !(sb->s_flags & MS_RDONLY)) {
- printk(KERN_NOTICE "AFFS: Dircache FS - mounting %s read only\n",
- sb->s_id);
+ pr_notice("Dircache FS - mounting %s read only\n", sb->s_id);
sb->s_flags |= MS_RDONLY;
}
switch (chksum) {
@@ -470,14 +468,14 @@ got_root:
sb->s_flags |= MS_NOEXEC;
break;
default:
- printk(KERN_ERR "AFFS: Unknown filesystem on device %s: %08X\n",
- sb->s_id, chksum);
+ pr_err("Unknown filesystem on device %s: %08X\n",
+ sb->s_id, chksum);
return -EINVAL;
}
if (mount_flags & SF_VERBOSE) {
u8 len = AFFS_ROOT_TAIL(sb, root_bh)->disk_name[0];
- printk(KERN_NOTICE "AFFS: Mounting volume \"%.*s\": Type=%.3s\\%c, Blocksize=%d\n",
+ pr_notice("Mounting volume \"%.*s\": Type=%.3s\\%c, Blocksize=%d\n",
len > 31 ? 31 : len,
AFFS_ROOT_TAIL(sb, root_bh)->disk_name + 1,
sig, sig[3] + '0', blocksize);
@@ -508,11 +506,11 @@ got_root:
sb->s_root = d_make_root(root_inode);
if (!sb->s_root) {
- printk(KERN_ERR "AFFS: Get root inode failed\n");
+ pr_err("AFFS: Get root inode failed\n");
return -ENOMEM;
}
- pr_debug("AFFS: s_flags=%lX\n",sb->s_flags);
+ pr_debug("s_flags=%lX\n", sb->s_flags);
return 0;
}
@@ -532,7 +530,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
char volume[32];
char *prefix = NULL;
- pr_debug("AFFS: remount(flags=0x%x,opts=\"%s\")\n",*flags,data);
+ pr_debug("%s(flags=0x%x,opts=\"%s\")\n", __func__, *flags, data);
sync_filesystem(sb);
*flags |= MS_NODIRATIME;
@@ -580,8 +578,9 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
int free;
u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
- pr_debug("AFFS: statfs() partsize=%d, reserved=%d\n",AFFS_SB(sb)->s_partition_size,
- AFFS_SB(sb)->s_reserved);
+ pr_debug("%s() partsize=%d, reserved=%d\n",
+ __func__, AFFS_SB(sb)->s_partition_size,
+ AFFS_SB(sb)->s_reserved);
free = affs_count_free_blocks(sb);
buf->f_type = AFFS_SUPER_MAGIC;
diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c
index ee00f08c4f53..f39b71c3981e 100644
--- a/fs/affs/symlink.c
+++ b/fs/affs/symlink.c
@@ -21,7 +21,7 @@ static int affs_symlink_readpage(struct file *file, struct page *page)
char c;
char lc;
- pr_debug("AFFS: follow_link(ino=%lu)\n",inode->i_ino);
+ pr_debug("follow_link(ino=%lu)\n", inode->i_ino);
err = -EIO;
bh = affs_bread(inode->i_sb, inode->i_ino);
diff --git a/fs/befs/btree.c b/fs/befs/btree.c
index a2cd305a993a..9d7d00e04858 100644
--- a/fs/befs/btree.c
+++ b/fs/befs/btree.c
@@ -405,7 +405,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node,
* Heres how it works: Key_no is the index of the key/value pair to
* return in keybuf/value.
* Bufsize is the size of keybuf (BEFS_NAME_LEN+1 is a good size). Keysize is
- * the number of charecters in the key (just a convenience).
+ * the number of characters in the key (just a convenience).
*
* Algorithm:
* Get the first leafnode of the tree. See if the requested key is in that
@@ -502,12 +502,11 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
"for key of size %d", __func__, bufsize, keylen);
brelse(this_node->bh);
goto error_alloc;
- };
+ }
- strncpy(keybuf, keystart, keylen);
+ strlcpy(keybuf, keystart, keylen + 1);
*value = fs64_to_cpu(sb, valarray[cur_key]);
*keysize = keylen;
- keybuf[keylen] = '\0';
befs_debug(sb, "Read [%llu,%d]: Key \"%.*s\", Value %llu", node_off,
cur_key, keylen, keybuf, *value);
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index d626756ff721..1e27cd33f7f2 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -396,9 +396,8 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
if (S_ISLNK(inode->i_mode) && !(befs_ino->i_flags & BEFS_LONG_SYMLINK)){
inode->i_size = 0;
inode->i_blocks = befs_sb->block_size / VFS_BLOCK_SIZE;
- strncpy(befs_ino->i_data.symlink, raw_inode->data.symlink,
- BEFS_SYMLINK_LEN - 1);
- befs_ino->i_data.symlink[BEFS_SYMLINK_LEN - 1] = '\0';
+ strlcpy(befs_ino->i_data.symlink, raw_inode->data.symlink,
+ BEFS_SYMLINK_LEN);
} else {
int num_blks;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index aa3cb626671e..ef242033a120 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -145,6 +145,25 @@ static int padzero(unsigned long elf_bss)
#define ELF_BASE_PLATFORM NULL
#endif
+/*
+ * Use get_random_int() to implement AT_RANDOM while avoiding depletion
+ * of the entropy pool.
+ */
+static void get_atrandom_bytes(unsigned char *buf, size_t nbytes)
+{
+ unsigned char *p = buf;
+
+ while (nbytes) {
+ unsigned int random_variable;
+ size_t chunk = min(nbytes, sizeof(random_variable));
+
+ random_variable = get_random_int();
+ memcpy(p, &random_variable, chunk);
+ p += chunk;
+ nbytes -= chunk;
+ }
+}
+
static int
create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
unsigned long load_addr, unsigned long interp_load_addr)
@@ -206,7 +225,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
/*
* Generate 16 random bytes for userspace PRNG seeding.
*/
- get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes));
+ get_atrandom_bytes(k_rand_bytes, sizeof(k_rand_bytes));
u_rand_bytes = (elf_addr_t __user *)
STACK_ALLOC(p, sizeof(k_rand_bytes));
if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes)))
@@ -1686,7 +1705,7 @@ static size_t get_note_info_size(struct elf_note_info *info)
static int write_note_info(struct elf_note_info *info,
struct coredump_params *cprm)
{
- bool first = 1;
+ bool first = true;
struct elf_thread_core_info *t = info->thread;
do {
@@ -1710,7 +1729,7 @@ static int write_note_info(struct elf_note_info *info,
!writenote(&t->notes[i], cprm))
return 0;
- first = 0;
+ first = false;
t = t->next;
} while (t);
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index d50bbe59da1e..f723cd3a455c 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -380,7 +380,7 @@ failed:
/****************************************************************************/
-void old_reloc(unsigned long rl)
+static void old_reloc(unsigned long rl)
{
#ifdef DEBUG
char *segment[] = { "TEXT", "DATA", "BSS", "*UNKNOWN*" };
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 91ad9e1c9441..e26bc9a22ac9 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -303,6 +303,31 @@ static int fat_bmap_cluster(struct inode *inode, int cluster)
return dclus;
}
+static int fat_get_mapped_cluster(struct inode *inode, sector_t sector,
+ sector_t last_block,
+ unsigned long *mapped_blocks, sector_t *bmap)
+{
+ struct super_block *sb = inode->i_sb;
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ int cluster, offset;
+
+ cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits);
+ offset = sector & (sbi->sec_per_clus - 1);
+ cluster = fat_bmap_cluster(inode, cluster);
+
+ if (cluster < 0)
+ return cluster;
+
+ else if (cluster) {
+ *bmap = fat_clus_to_blknr(sbi, cluster) + offset;
+ *mapped_blocks = sbi->sec_per_clus - offset;
+ if (*mapped_blocks > last_block - sector)
+ *mapped_blocks = last_block - sector;
+ }
+
+ return 0;
+}
+
int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
unsigned long *mapped_blocks, int create)
{
@@ -311,7 +336,6 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
const unsigned long blocksize = sb->s_blocksize;
const unsigned char blocksize_bits = sb->s_blocksize_bits;
sector_t last_block;
- int cluster, offset;
*phys = 0;
*mapped_blocks = 0;
@@ -329,25 +353,39 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
return 0;
/*
- * ->mmu_private can access on only allocation path.
- * (caller must hold ->i_mutex)
+ * Both ->mmu_private and ->i_disksize can access
+ * on only allocation path. (caller must hold ->i_mutex)
*/
- last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1))
+ last_block = (MSDOS_I(inode)->i_disksize + (blocksize - 1))
>> blocksize_bits;
if (sector >= last_block)
return 0;
}
- cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits);
- offset = sector & (sbi->sec_per_clus - 1);
- cluster = fat_bmap_cluster(inode, cluster);
- if (cluster < 0)
- return cluster;
- else if (cluster) {
- *phys = fat_clus_to_blknr(sbi, cluster) + offset;
- *mapped_blocks = sbi->sec_per_clus - offset;
- if (*mapped_blocks > last_block - sector)
- *mapped_blocks = last_block - sector;
- }
- return 0;
+ return fat_get_mapped_cluster(inode, sector, last_block, mapped_blocks,
+ phys);
+}
+
+int fat_bmap2(struct inode *inode, sector_t sector,
+ unsigned long *mapped_blocks, struct buffer_head *bh_result,
+ int create, sector_t *bmap)
+{
+ struct super_block *sb = inode->i_sb;
+ sector_t last_block;
+ const unsigned long blocksize = sb->s_blocksize;
+ const unsigned char blocksize_bits = sb->s_blocksize_bits;
+
+ BUG_ON(create != 0);
+
+ *bmap = 0;
+ *mapped_blocks = 0;
+
+ last_block = (MSDOS_I(inode)->i_disksize + (blocksize - 1))
+ >> blocksize_bits;
+
+ if (sector >= last_block)
+ return 0;
+
+ return fat_get_mapped_cluster(inode, sector, last_block, mapped_blocks,
+ bmap);
}
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 7c31f4bc74a9..13b7202bd651 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -52,7 +52,8 @@ struct fat_mount_options {
usefree:1, /* Use free_clusters for FAT32 */
tz_set:1, /* Filesystem timestamps' offset set */
rodir:1, /* allow ATTR_RO for directory */
- discard:1; /* Issue discard requests on deletions */
+ discard:1, /* Issue discard requests on deletions */
+ dos1xfloppy:1; /* Assume default BPB for DOS 1.x floppies */
};
#define FAT_HASH_BITS 8
@@ -118,7 +119,8 @@ struct msdos_inode_info {
unsigned int cache_valid_id;
/* NOTE: mmu_private is 64bits, so must hold ->i_mutex to access */
- loff_t mmu_private; /* physically allocated size */
+ loff_t mmu_private; /* physically allocated size (initialized) */
+ loff_t i_disksize; /* physically allocated size (uninitialized) */
int i_start; /* first cluster or 0 */
int i_logstart; /* logical first cluster */
@@ -289,6 +291,9 @@ extern int fat_get_cluster(struct inode *inode, int cluster,
int *fclus, int *dclus);
extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
unsigned long *mapped_blocks, int create);
+extern int fat_bmap2(struct inode *inode, sector_t sector,
+ unsigned long *mapped_blocks,
+ struct buffer_head *bh_result, int create, sector_t *bmap);
/* fat/dir.c */
extern const struct file_operations fat_dir_operations;
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 85f79a89e747..92e9e753b554 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -17,8 +17,12 @@
#include <linux/blkdev.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
+#include <linux/falloc.h>
#include "fat.h"
+static long fat_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t len);
+
static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr)
{
u32 attr;
@@ -182,6 +186,7 @@ const struct file_operations fat_file_operations = {
#endif
.fsync = fat_file_fsync,
.splice_read = generic_file_splice_read,
+ .fallocate = fat_fallocate,
};
static int fat_cont_expand(struct inode *inode, loff_t size)
@@ -220,6 +225,75 @@ out:
return err;
}
+/*
+ * Preallocate space for a file. This implements fat's fallocate file
+ * operation, which gets called from sys_fallocate system call. User
+ * space requests len bytes at offset. If FALLOC_FL_KEEP_SIZE is set
+ * we just allocate clusters without zeroing them out. Otherwise we
+ * allocate and zero out clusters via an expanding truncate.
+ */
+static long fat_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t len)
+{
+ int cluster;
+ int nr_cluster; /* Number of clusters to be allocated */
+ loff_t mm_bytes; /* Number of bytes to be allocated for file */
+ struct inode *inode = file->f_mapping->host;
+ struct super_block *sb = inode->i_sb;
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ int err = 0;
+
+ /* No support for hole punch or other fallocate flags. */
+ if (mode & ~FALLOC_FL_KEEP_SIZE)
+ return -EOPNOTSUPP;
+
+ /* No support for dir */
+ if (!S_ISREG(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ mutex_lock(&inode->i_mutex);
+ if ((offset + len) <= MSDOS_I(inode)->i_disksize)
+ goto error;
+
+ err = inode_newsize_ok(inode, (len + offset));
+ if (err)
+ goto error;
+
+ if (mode & FALLOC_FL_KEEP_SIZE) {
+ /* First compute the number of clusters to be allocated */
+ mm_bytes = offset + len - round_up(MSDOS_I(inode)->i_disksize,
+ sbi->cluster_size);
+ nr_cluster = (mm_bytes + (sbi->cluster_size - 1)) >>
+ sbi->cluster_bits;
+
+ /* Start the allocation.We are not zeroing out the clusters */
+ while (nr_cluster-- > 0) {
+ err = fat_alloc_clusters(inode, &cluster, 1);
+ if (err) {
+ fat_msg(sb, KERN_ERR,
+ "fat_fallocate(): fat_alloc_clusters() error");
+ goto error;
+ }
+ err = fat_chain_add(inode, cluster, 1);
+ if (err) {
+ fat_free_clusters(inode, cluster);
+ goto error;
+ }
+ MSDOS_I(inode)->i_disksize += sbi->cluster_size;
+ }
+ } else {
+ /* This is just an expanding truncate */
+ err = fat_cont_expand(inode, (offset + len));
+ if (err)
+ fat_msg(sb, KERN_ERR,
+ "fat_fallocate(): fat_cont_expand() error");
+ }
+
+error:
+ mutex_unlock(&inode->i_mutex);
+ return err;
+}
+
/* Free all clusters after the skip'th cluster. */
static int fat_free(struct inode *inode, int skip)
{
@@ -300,8 +374,10 @@ void fat_truncate_blocks(struct inode *inode, loff_t offset)
* This protects against truncating a file bigger than it was then
* trying to write into the hole.
*/
- if (MSDOS_I(inode)->mmu_private > offset)
+ if (MSDOS_I(inode)->i_disksize > offset) {
MSDOS_I(inode)->mmu_private = offset;
+ MSDOS_I(inode)->i_disksize = offset;
+ }
nr_clusters = (offset + (cluster_size - 1)) >> sbi->cluster_bits;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 385cce464e82..e72afce295f3 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -35,9 +35,71 @@
#define CONFIG_FAT_DEFAULT_IOCHARSET ""
#endif
+#define KB_IN_SECTORS 2
+
+/*
+ * A deserialized copy of the on-disk structure laid out in struct
+ * fat_boot_sector.
+ */
+struct fat_bios_param_block {
+ u16 fat_sector_size;
+ u8 fat_sec_per_clus;
+ u16 fat_reserved;
+ u8 fat_fats;
+ u16 fat_dir_entries;
+ u16 fat_sectors;
+ u16 fat_fat_length;
+ u32 fat_total_sect;
+
+ u8 fat16_state;
+ u32 fat16_vol_id;
+
+ u32 fat32_length;
+ u32 fat32_root_cluster;
+ u16 fat32_info_sector;
+ u8 fat32_state;
+ u32 fat32_vol_id;
+};
+
static int fat_default_codepage = CONFIG_FAT_DEFAULT_CODEPAGE;
static char fat_default_iocharset[] = CONFIG_FAT_DEFAULT_IOCHARSET;
+static struct fat_floppy_defaults {
+ unsigned nr_sectors;
+ unsigned sec_per_clus;
+ unsigned dir_entries;
+ unsigned media;
+ unsigned fat_length;
+} floppy_defaults[] = {
+{
+ .nr_sectors = 160 * KB_IN_SECTORS,
+ .sec_per_clus = 1,
+ .dir_entries = 64,
+ .media = 0xFE,
+ .fat_length = 1,
+},
+{
+ .nr_sectors = 180 * KB_IN_SECTORS,
+ .sec_per_clus = 1,
+ .dir_entries = 64,
+ .media = 0xFC,
+ .fat_length = 2,
+},
+{
+ .nr_sectors = 320 * KB_IN_SECTORS,
+ .sec_per_clus = 2,
+ .dir_entries = 112,
+ .media = 0xFF,
+ .fat_length = 1,
+},
+{
+ .nr_sectors = 360 * KB_IN_SECTORS,
+ .sec_per_clus = 2,
+ .dir_entries = 112,
+ .media = 0xFD,
+ .fat_length = 2,
+},
+};
static int fat_add_cluster(struct inode *inode)
{
@@ -54,6 +116,25 @@ static int fat_add_cluster(struct inode *inode)
return err;
}
+static void check_fallocated_region(struct inode *inode, sector_t iblock,
+ unsigned long *max_blocks, struct buffer_head *bh_result)
+{
+ struct super_block *sb = inode->i_sb;
+ sector_t last_block, disk_block;
+ const unsigned long blocksize = sb->s_blocksize;
+ const unsigned char blocksize_bits = sb->s_blocksize_bits;
+
+ last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1))
+ >> blocksize_bits;
+ disk_block = (MSDOS_I(inode)->i_disksize + (blocksize - 1))
+ >> blocksize_bits;
+ if (iblock >= last_block && iblock <= disk_block) {
+ MSDOS_I(inode)->mmu_private += *max_blocks << blocksize_bits;
+ set_buffer_new(bh_result);
+ }
+
+}
+
static inline int __fat_get_block(struct inode *inode, sector_t iblock,
unsigned long *max_blocks,
struct buffer_head *bh_result, int create)
@@ -68,8 +149,11 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
if (err)
return err;
if (phys) {
- map_bh(bh_result, sb, phys);
*max_blocks = min(mapped_blocks, *max_blocks);
+ if (create)
+ check_fallocated_region(inode, iblock, max_blocks,
+ bh_result);
+ map_bh(bh_result, sb, phys);
return 0;
}
if (!create)
@@ -93,6 +177,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
*max_blocks = min(mapped_blocks, *max_blocks);
MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits;
+ MSDOS_I(inode)->i_disksize = MSDOS_I(inode)->mmu_private;
err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create);
if (err)
@@ -207,6 +292,13 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
loff_t size = offset + count;
if (MSDOS_I(inode)->mmu_private < size)
return 0;
+
+ /*
+ * In case of writing in fallocated region, return 0 and
+ * fallback to buffered write.
+ */
+ if (MSDOS_I(inode)->i_disksize > MSDOS_I(inode)->mmu_private)
+ return 0;
}
/*
@@ -220,13 +312,36 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
return ret;
}
+static int fat_get_block_bmap(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ struct super_block *sb = inode->i_sb;
+ unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
+ int err;
+ sector_t bmap;
+ unsigned long mapped_blocks;
+
+ err = fat_bmap2(inode, iblock, &mapped_blocks, bh_result, create,
+ &bmap);
+ if (err)
+ return err;
+
+ if (bmap) {
+ map_bh(bh_result, sb, bmap);
+ max_blocks = min(mapped_blocks, max_blocks);
+ }
+
+ bh_result->b_size = max_blocks << sb->s_blocksize_bits;
+ return 0;
+}
+
static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
{
sector_t blocknr;
/* fat_get_cluster() assumes the requested blocknr isn't truncated. */
down_read(&MSDOS_I(mapping->host)->truncate_lock);
- blocknr = generic_block_bmap(mapping, block, fat_get_block);
+ blocknr = generic_block_bmap(mapping, block, fat_get_block_bmap);
up_read(&MSDOS_I(mapping->host)->truncate_lock);
return blocknr;
@@ -407,7 +522,6 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
error = fat_calc_dir_size(inode);
if (error < 0)
return error;
- MSDOS_I(inode)->mmu_private = inode->i_size;
set_nlink(inode, fat_subdirs(inode));
} else { /* not a directory */
@@ -422,8 +536,12 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
inode->i_op = &fat_file_inode_operations;
inode->i_fop = &fat_file_operations;
inode->i_mapping->a_ops = &fat_aops;
- MSDOS_I(inode)->mmu_private = inode->i_size;
}
+
+ MSDOS_I(inode)->mmu_private = inode->i_size;
+ MSDOS_I(inode)->i_disksize = round_up(inode->i_size,
+ inode->i_sb->s_blocksize);
+
if (de->attr & ATTR_SYS) {
if (sbi->options.sys_immutable)
inode->i_flags |= S_IMMUTABLE;
@@ -488,12 +606,34 @@ out:
EXPORT_SYMBOL_GPL(fat_build_inode);
+static int __fat_write_inode(struct inode *inode, int wait);
static void fat_evict_inode(struct inode *inode)
{
truncate_inode_pages_final(&inode->i_data);
if (!inode->i_nlink) {
inode->i_size = 0;
fat_truncate_blocks(inode, 0);
+ } else {
+ /* Release unwritten fallocated blocks on inode eviction. */
+ if (MSDOS_I(inode)->i_disksize >
+ round_up(MSDOS_I(inode)->mmu_private,
+ inode->i_sb->s_blocksize)) {
+ int err;
+ fat_truncate_blocks(inode, MSDOS_I(inode)->mmu_private);
+ /* Fallocate results in updating the i_start/iogstart
+ * for the zero byte file. So, make it return to
+ * original state during evict and commit it to avoid
+ * any corruption on the next access to the cluster
+ * chain for the file.
+ */
+ err = __fat_write_inode(inode, inode_needs_sync(inode));
+ if (err) {
+ fat_msg(inode->i_sb, KERN_WARNING, "Failed to "
+ "update on disk inode for unused fallocated "
+ "blocks, inode could be corrupted. Please run "
+ "fsck");
+ }
+ }
}
invalidate_inode_buffers(inode);
clear_inode(inode);
@@ -853,6 +993,8 @@ static int fat_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",nfs=stale_rw");
if (opts->discard)
seq_puts(m, ",discard");
+ if (opts->dos1xfloppy)
+ seq_puts(m, ",dos1xfloppy");
return 0;
}
@@ -867,7 +1009,7 @@ enum {
Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont,
Opt_err_panic, Opt_err_ro, Opt_discard, Opt_nfs, Opt_time_offset,
- Opt_nfs_stale_rw, Opt_nfs_nostale_ro, Opt_err,
+ Opt_nfs_stale_rw, Opt_nfs_nostale_ro, Opt_err, Opt_dos1xfloppy,
};
static const match_table_t fat_tokens = {
@@ -900,6 +1042,7 @@ static const match_table_t fat_tokens = {
{Opt_nfs_stale_rw, "nfs"},
{Opt_nfs_stale_rw, "nfs=stale_rw"},
{Opt_nfs_nostale_ro, "nfs=nostale_ro"},
+ {Opt_dos1xfloppy, "dos1xfloppy"},
{Opt_obsolete, "conv=binary"},
{Opt_obsolete, "conv=text"},
{Opt_obsolete, "conv=auto"},
@@ -1102,6 +1245,9 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
case Opt_nfs_nostale_ro:
opts->nfs = FAT_NFS_NOSTALE_RO;
break;
+ case Opt_dos1xfloppy:
+ opts->dos1xfloppy = 1;
+ break;
/* msdos specific */
case Opt_dots:
@@ -1225,6 +1371,7 @@ static int fat_read_root(struct inode *inode)
& ~((loff_t)sbi->cluster_size - 1)) >> 9;
MSDOS_I(inode)->i_logstart = 0;
MSDOS_I(inode)->mmu_private = inode->i_size;
+ MSDOS_I(inode)->i_disksize = inode->i_size;
fat_save_attrs(inode, ATTR_DIR);
inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = 0;
@@ -1247,6 +1394,169 @@ static unsigned long calc_fat_clusters(struct super_block *sb)
return sbi->fat_length * sb->s_blocksize * 8 / sbi->fat_bits;
}
+static bool fat_bpb_is_zero(struct fat_boot_sector *b)
+{
+ if (get_unaligned_le16(&b->sector_size))
+ return false;
+ if (b->sec_per_clus)
+ return false;
+ if (b->reserved)
+ return false;
+ if (b->fats)
+ return false;
+ if (get_unaligned_le16(&b->dir_entries))
+ return false;
+ if (get_unaligned_le16(&b->sectors))
+ return false;
+ if (b->media)
+ return false;
+ if (b->fat_length)
+ return false;
+ if (b->secs_track)
+ return false;
+ if (b->heads)
+ return false;
+ return true;
+}
+
+static int fat_read_bpb(struct super_block *sb, struct fat_boot_sector *b,
+ int silent, struct fat_bios_param_block *bpb)
+{
+ int error = -EINVAL;
+
+ /* Read in BPB ... */
+ memset(bpb, 0, sizeof(*bpb));
+ bpb->fat_sector_size = get_unaligned_le16(&b->sector_size);
+ bpb->fat_sec_per_clus = b->sec_per_clus;
+ bpb->fat_reserved = le16_to_cpu(b->reserved);
+ bpb->fat_fats = b->fats;
+ bpb->fat_dir_entries = get_unaligned_le16(&b->dir_entries);
+ bpb->fat_sectors = get_unaligned_le16(&b->sectors);
+ bpb->fat_fat_length = le16_to_cpu(b->fat_length);
+ bpb->fat_total_sect = le32_to_cpu(b->total_sect);
+
+ bpb->fat16_state = b->fat16.state;
+ bpb->fat16_vol_id = get_unaligned_le32(b->fat16.vol_id);
+
+ bpb->fat32_length = le32_to_cpu(b->fat32.length);
+ bpb->fat32_root_cluster = le32_to_cpu(b->fat32.root_cluster);
+ bpb->fat32_info_sector = le16_to_cpu(b->fat32.info_sector);
+ bpb->fat32_state = b->fat32.state;
+ bpb->fat32_vol_id = get_unaligned_le32(b->fat32.vol_id);
+
+ /* Validate this looks like a FAT filesystem BPB */
+ if (!bpb->fat_reserved) {
+ if (!silent)
+ fat_msg(sb, KERN_ERR,
+ "bogus number of reserved sectors");
+ goto out;
+ }
+ if (!bpb->fat_fats) {
+ if (!silent)
+ fat_msg(sb, KERN_ERR, "bogus number of FAT structure");
+ goto out;
+ }
+
+ /*
+ * Earlier we checked here that b->secs_track and b->head are nonzero,
+ * but it turns out valid FAT filesystems can have zero there.
+ */
+
+ if (!fat_valid_media(b->media)) {
+ if (!silent)
+ fat_msg(sb, KERN_ERR, "invalid media value (0x%02x)",
+ (unsigned)b->media);
+ goto out;
+ }
+
+ if (!is_power_of_2(bpb->fat_sector_size)
+ || (bpb->fat_sector_size < 512)
+ || (bpb->fat_sector_size > 4096)) {
+ if (!silent)
+ fat_msg(sb, KERN_ERR, "bogus logical sector size %u",
+ (unsigned)bpb->fat_sector_size);
+ goto out;
+ }
+
+ if (!is_power_of_2(bpb->fat_sec_per_clus)) {
+ if (!silent)
+ fat_msg(sb, KERN_ERR, "bogus sectors per cluster %u",
+ (unsigned)bpb->fat_sec_per_clus);
+ goto out;
+ }
+
+ error = 0;
+
+out:
+ return error;
+}
+
+static int fat_read_static_bpb(struct super_block *sb,
+ struct fat_boot_sector *b, int silent,
+ struct fat_bios_param_block *bpb)
+{
+ static const char *notdos1x = "This doesn't look like a DOS 1.x volume";
+
+ struct fat_floppy_defaults *fdefaults = NULL;
+ int error = -EINVAL;
+ sector_t bd_sects;
+ unsigned i;
+
+ bd_sects = i_size_read(sb->s_bdev->bd_inode) / SECTOR_SIZE;
+
+ /* 16-bit DOS 1.x reliably wrote bootstrap short-jmp code */
+ if (b->ignored[0] != 0xeb || b->ignored[2] != 0x90) {
+ if (!silent)
+ fat_msg(sb, KERN_ERR,
+ "%s; no bootstrapping code", notdos1x);
+ goto out;
+ }
+
+ /*
+ * If any value in this region is non-zero, it isn't archaic
+ * DOS.
+ */
+ if (!fat_bpb_is_zero(b)) {
+ if (!silent)
+ fat_msg(sb, KERN_ERR,
+ "%s; DOS 2.x BPB is non-zero", notdos1x);
+ goto out;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(floppy_defaults); i++) {
+ if (floppy_defaults[i].nr_sectors == bd_sects) {
+ fdefaults = &floppy_defaults[i];
+ break;
+ }
+ }
+
+ if (fdefaults == NULL) {
+ if (!silent)
+ fat_msg(sb, KERN_WARNING,
+ "This looks like a DOS 1.x volume, but isn't a recognized floppy size (%llu sectors)",
+ (u64)bd_sects);
+ goto out;
+ }
+
+ if (!silent)
+ fat_msg(sb, KERN_INFO,
+ "This looks like a DOS 1.x volume; assuming default BPB values");
+
+ memset(bpb, 0, sizeof(*bpb));
+ bpb->fat_sector_size = SECTOR_SIZE;
+ bpb->fat_sec_per_clus = fdefaults->sec_per_clus;
+ bpb->fat_reserved = 1;
+ bpb->fat_fats = 2;
+ bpb->fat_dir_entries = fdefaults->dir_entries;
+ bpb->fat_sectors = fdefaults->nr_sectors;
+ bpb->fat_fat_length = fdefaults->fat_length;
+
+ error = 0;
+
+out:
+ return error;
+}
+
/*
* Read the super block of an MS-DOS FS.
*/
@@ -1256,12 +1566,11 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
struct inode *root_inode = NULL, *fat_inode = NULL;
struct inode *fsinfo_inode = NULL;
struct buffer_head *bh;
- struct fat_boot_sector *b;
+ struct fat_bios_param_block bpb;
struct msdos_sb_info *sbi;
u16 logical_sector_size;
u32 total_sectors, total_clusters, fat_clusters, rootdir_sectors;
int debug;
- unsigned int media;
long error;
char buf[50];
@@ -1298,100 +1607,71 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
goto out_fail;
}
- b = (struct fat_boot_sector *) bh->b_data;
- if (!b->reserved) {
- if (!silent)
- fat_msg(sb, KERN_ERR, "bogus number of reserved sectors");
- brelse(bh);
- goto out_invalid;
- }
- if (!b->fats) {
- if (!silent)
- fat_msg(sb, KERN_ERR, "bogus number of FAT structure");
- brelse(bh);
- goto out_invalid;
- }
-
- /*
- * Earlier we checked here that b->secs_track and b->head are nonzero,
- * but it turns out valid FAT filesystems can have zero there.
- */
+ error = fat_read_bpb(sb, (struct fat_boot_sector *)bh->b_data, silent,
+ &bpb);
+ if (error == -EINVAL && sbi->options.dos1xfloppy)
+ error = fat_read_static_bpb(sb,
+ (struct fat_boot_sector *)bh->b_data, silent, &bpb);
+ brelse(bh);
- media = b->media;
- if (!fat_valid_media(media)) {
- if (!silent)
- fat_msg(sb, KERN_ERR, "invalid media value (0x%02x)",
- media);
- brelse(bh);
- goto out_invalid;
- }
- logical_sector_size = get_unaligned_le16(&b->sector_size);
- if (!is_power_of_2(logical_sector_size)
- || (logical_sector_size < 512)
- || (logical_sector_size > 4096)) {
- if (!silent)
- fat_msg(sb, KERN_ERR, "bogus logical sector size %u",
- logical_sector_size);
- brelse(bh);
- goto out_invalid;
- }
- sbi->sec_per_clus = b->sec_per_clus;
- if (!is_power_of_2(sbi->sec_per_clus)) {
- if (!silent)
- fat_msg(sb, KERN_ERR, "bogus sectors per cluster %u",
- sbi->sec_per_clus);
- brelse(bh);
+ if (error == -EINVAL)
goto out_invalid;
- }
+ else if (error)
+ goto out_fail;
+
+ logical_sector_size = bpb.fat_sector_size;
+ sbi->sec_per_clus = bpb.fat_sec_per_clus;
if (logical_sector_size < sb->s_blocksize) {
fat_msg(sb, KERN_ERR, "logical sector size too small for device"
" (logical sector size = %u)", logical_sector_size);
- brelse(bh);
goto out_fail;
}
+
if (logical_sector_size > sb->s_blocksize) {
- brelse(bh);
+ struct buffer_head *bh_resize;
if (!sb_set_blocksize(sb, logical_sector_size)) {
fat_msg(sb, KERN_ERR, "unable to set blocksize %u",
logical_sector_size);
goto out_fail;
}
- bh = sb_bread(sb, 0);
- if (bh == NULL) {
+
+ /* Verify that the larger boot sector is fully readable */
+ bh_resize = sb_bread(sb, 0);
+ if (bh_resize == NULL) {
fat_msg(sb, KERN_ERR, "unable to read boot sector"
" (logical sector size = %lu)",
sb->s_blocksize);
goto out_fail;
}
- b = (struct fat_boot_sector *) bh->b_data;
+ brelse(bh_resize);
}
mutex_init(&sbi->s_lock);
sbi->cluster_size = sb->s_blocksize * sbi->sec_per_clus;
sbi->cluster_bits = ffs(sbi->cluster_size) - 1;
- sbi->fats = b->fats;
+ sbi->fats = bpb.fat_fats;
sbi->fat_bits = 0; /* Don't know yet */
- sbi->fat_start = le16_to_cpu(b->reserved);
- sbi->fat_length = le16_to_cpu(b->fat_length);
+ sbi->fat_start = bpb.fat_reserved;
+ sbi->fat_length = bpb.fat_fat_length;
sbi->root_cluster = 0;
sbi->free_clusters = -1; /* Don't know yet */
sbi->free_clus_valid = 0;
sbi->prev_free = FAT_START_ENT;
sb->s_maxbytes = 0xffffffff;
- if (!sbi->fat_length && b->fat32.length) {
+ if (!sbi->fat_length && bpb.fat32_length) {
struct fat_boot_fsinfo *fsinfo;
struct buffer_head *fsinfo_bh;
/* Must be FAT32 */
sbi->fat_bits = 32;
- sbi->fat_length = le32_to_cpu(b->fat32.length);
- sbi->root_cluster = le32_to_cpu(b->fat32.root_cluster);
+ sbi->fat_length = bpb.fat32_length;
+ sbi->root_cluster = bpb.fat32_root_cluster;
/* MC - if info_sector is 0, don't multiply by 0 */
- sbi->fsinfo_sector = le16_to_cpu(b->fat32.info_sector);
+ sbi->fsinfo_sector = bpb.fat32_info_sector;
if (sbi->fsinfo_sector == 0)
sbi->fsinfo_sector = 1;
@@ -1399,7 +1679,6 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
if (fsinfo_bh == NULL) {
fat_msg(sb, KERN_ERR, "bread failed, FSINFO block"
" (sector = %lu)", sbi->fsinfo_sector);
- brelse(bh);
goto out_fail;
}
@@ -1422,35 +1701,28 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
/* interpret volume ID as a little endian 32 bit integer */
if (sbi->fat_bits == 32)
- sbi->vol_id = (((u32)b->fat32.vol_id[0]) |
- ((u32)b->fat32.vol_id[1] << 8) |
- ((u32)b->fat32.vol_id[2] << 16) |
- ((u32)b->fat32.vol_id[3] << 24));
+ sbi->vol_id = bpb.fat32_vol_id;
else /* fat 16 or 12 */
- sbi->vol_id = (((u32)b->fat16.vol_id[0]) |
- ((u32)b->fat16.vol_id[1] << 8) |
- ((u32)b->fat16.vol_id[2] << 16) |
- ((u32)b->fat16.vol_id[3] << 24));
+ sbi->vol_id = bpb.fat16_vol_id;
sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry);
sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1;
sbi->dir_start = sbi->fat_start + sbi->fats * sbi->fat_length;
- sbi->dir_entries = get_unaligned_le16(&b->dir_entries);
+ sbi->dir_entries = bpb.fat_dir_entries;
if (sbi->dir_entries & (sbi->dir_per_block - 1)) {
if (!silent)
fat_msg(sb, KERN_ERR, "bogus directory-entries per block"
" (%u)", sbi->dir_entries);
- brelse(bh);
goto out_invalid;
}
rootdir_sectors = sbi->dir_entries
* sizeof(struct msdos_dir_entry) / sb->s_blocksize;
sbi->data_start = sbi->dir_start + rootdir_sectors;
- total_sectors = get_unaligned_le16(&b->sectors);
+ total_sectors = bpb.fat_sectors;
if (total_sectors == 0)
- total_sectors = le32_to_cpu(b->total_sect);
+ total_sectors = bpb.fat_total_sect;
total_clusters = (total_sectors - sbi->data_start) / sbi->sec_per_clus;
@@ -1459,9 +1731,9 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
/* some OSes set FAT_STATE_DIRTY and clean it on unmount. */
if (sbi->fat_bits == 32)
- sbi->dirty = b->fat32.state & FAT_STATE_DIRTY;
+ sbi->dirty = bpb.fat32_state & FAT_STATE_DIRTY;
else /* fat 16 or 12 */
- sbi->dirty = b->fat16.state & FAT_STATE_DIRTY;
+ sbi->dirty = bpb.fat16_state & FAT_STATE_DIRTY;
/* check that FAT table does not overflow */
fat_clusters = calc_fat_clusters(sb);
@@ -1470,7 +1742,6 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
if (!silent)
fat_msg(sb, KERN_ERR, "count of clusters too big (%u)",
total_clusters);
- brelse(bh);
goto out_invalid;
}
@@ -1483,8 +1754,6 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
if (sbi->prev_free < FAT_START_ENT)
sbi->prev_free = FAT_START_ENT;
- brelse(bh);
-
/* set up enough so that it can read an inode */
fat_hash_init(sb);
dir_hash_init(sb);
diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c
index caf89a7be0a1..e5b221de7de6 100644
--- a/fs/hfsplus/attributes.c
+++ b/fs/hfsplus/attributes.c
@@ -54,14 +54,11 @@ int hfsplus_attr_build_key(struct super_block *sb, hfsplus_btree_key *key,
memset(key, 0, sizeof(struct hfsplus_attr_key));
key->attr.cnid = cpu_to_be32(cnid);
if (name) {
- len = strlen(name);
- if (len > HFSPLUS_ATTR_MAX_STRLEN) {
- pr_err("invalid xattr name's length\n");
- return -EINVAL;
- }
- hfsplus_asc2uni(sb,
+ int res = hfsplus_asc2uni(sb,
(struct hfsplus_unistr *)&key->attr.key_name,
- HFSPLUS_ATTR_MAX_STRLEN, name, len);
+ HFSPLUS_ATTR_MAX_STRLEN, name, strlen(name));
+ if (res)
+ return res;
len = be16_to_cpu(key->attr.key_name.length);
} else {
key->attr.key_name.length = 0;
@@ -82,31 +79,6 @@ int hfsplus_attr_build_key(struct super_block *sb, hfsplus_btree_key *key,
return 0;
}
-void hfsplus_attr_build_key_uni(hfsplus_btree_key *key,
- u32 cnid,
- struct hfsplus_attr_unistr *name)
-{
- int ustrlen;
-
- memset(key, 0, sizeof(struct hfsplus_attr_key));
- ustrlen = be16_to_cpu(name->length);
- key->attr.cnid = cpu_to_be32(cnid);
- key->attr.key_name.length = cpu_to_be16(ustrlen);
- ustrlen *= 2;
- memcpy(key->attr.key_name.unicode, name->unicode, ustrlen);
-
- /* The length of the key, as stored in key_len field, does not include
- * the size of the key_len field itself.
- * So, offsetof(hfsplus_attr_key, key_name) is a trick because
- * it takes into consideration key_len field (__be16) of
- * hfsplus_attr_key structure instead of length field (__be16) of
- * hfsplus_attr_unistr structure.
- */
- key->key_len =
- cpu_to_be16(offsetof(struct hfsplus_attr_key, key_name) +
- ustrlen);
-}
-
hfsplus_attr_entry *hfsplus_alloc_attr_entry(void)
{
return kmem_cache_alloc(hfsplus_attr_tree_cachep, GFP_KERNEL);
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index 32602c667b4a..7892e6fddb66 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -38,21 +38,30 @@ int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *k1,
return hfsplus_strcmp(&k1->cat.name, &k2->cat.name);
}
-void hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *key,
- u32 parent, struct qstr *str)
+/* Generates key for catalog file/folders record. */
+int hfsplus_cat_build_key(struct super_block *sb,
+ hfsplus_btree_key *key, u32 parent, struct qstr *str)
{
- int len;
+ int len, err;
key->cat.parent = cpu_to_be32(parent);
- if (str) {
- hfsplus_asc2uni(sb, &key->cat.name, HFSPLUS_MAX_STRLEN,
- str->name, str->len);
- len = be16_to_cpu(key->cat.name.length);
- } else {
- key->cat.name.length = 0;
- len = 0;
- }
+ err = hfsplus_asc2uni(sb, &key->cat.name, HFSPLUS_MAX_STRLEN,
+ str->name, str->len);
+ if (unlikely(err < 0))
+ return err;
+
+ len = be16_to_cpu(key->cat.name.length);
key->key_len = cpu_to_be16(6 + 2 * len);
+ return 0;
+}
+
+/* Generates key for catalog thread record. */
+void hfsplus_cat_build_key_with_cnid(struct super_block *sb,
+ hfsplus_btree_key *key, u32 parent)
+{
+ key->cat.parent = cpu_to_be32(parent);
+ key->cat.name.length = 0;
+ key->key_len = cpu_to_be16(6);
}
static void hfsplus_cat_build_key_uni(hfsplus_btree_key *key, u32 parent,
@@ -167,11 +176,16 @@ static int hfsplus_fill_cat_thread(struct super_block *sb,
hfsplus_cat_entry *entry, int type,
u32 parentid, struct qstr *str)
{
+ int err;
+
entry->type = cpu_to_be16(type);
entry->thread.reserved = 0;
entry->thread.parentID = cpu_to_be32(parentid);
- hfsplus_asc2uni(sb, &entry->thread.nodeName, HFSPLUS_MAX_STRLEN,
+ err = hfsplus_asc2uni(sb, &entry->thread.nodeName, HFSPLUS_MAX_STRLEN,
str->name, str->len);
+ if (unlikely(err < 0))
+ return err;
+
return 10 + be16_to_cpu(entry->thread.nodeName.length) * 2;
}
@@ -183,7 +197,7 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid,
int err;
u16 type;
- hfsplus_cat_build_key(sb, fd->search_key, cnid, NULL);
+ hfsplus_cat_build_key_with_cnid(sb, fd->search_key, cnid);
err = hfs_brec_read(fd, &tmp, sizeof(hfsplus_cat_entry));
if (err)
return err;
@@ -250,11 +264,16 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,
if (err)
return err;
- hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
+ hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid);
entry_size = hfsplus_fill_cat_thread(sb, &entry,
S_ISDIR(inode->i_mode) ?
HFSPLUS_FOLDER_THREAD : HFSPLUS_FILE_THREAD,
dir->i_ino, str);
+ if (unlikely(entry_size < 0)) {
+ err = entry_size;
+ goto err2;
+ }
+
err = hfs_brec_find(&fd, hfs_find_rec_by_key);
if (err != -ENOENT) {
if (!err)
@@ -265,7 +284,10 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,
if (err)
goto err2;
- hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str);
+ err = hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str);
+ if (unlikely(err))
+ goto err1;
+
entry_size = hfsplus_cat_build_record(&entry, cnid, inode);
err = hfs_brec_find(&fd, hfs_find_rec_by_key);
if (err != -ENOENT) {
@@ -288,7 +310,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,
return 0;
err1:
- hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
+ hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid);
if (!hfs_brec_find(&fd, hfs_find_rec_by_key))
hfs_brec_remove(&fd);
err2:
@@ -313,7 +335,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
if (!str) {
int len;
- hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
+ hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid);
err = hfs_brec_find(&fd, hfs_find_rec_by_key);
if (err)
goto out;
@@ -329,7 +351,9 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
off + 2, len);
fd.search_key->key_len = cpu_to_be16(6 + len);
} else
- hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str);
+ err = hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str);
+ if (unlikely(err))
+ goto out;
err = hfs_brec_find(&fd, hfs_find_rec_by_key);
if (err)
@@ -360,7 +384,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
if (err)
goto out;
- hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
+ hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid);
err = hfs_brec_find(&fd, hfs_find_rec_by_key);
if (err)
goto out;
@@ -405,7 +429,11 @@ int hfsplus_rename_cat(u32 cnid,
dst_fd = src_fd;
/* find the old dir entry and read the data */
- hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name);
+ err = hfsplus_cat_build_key(sb, src_fd.search_key,
+ src_dir->i_ino, src_name);
+ if (unlikely(err))
+ goto out;
+
err = hfs_brec_find(&src_fd, hfs_find_rec_by_key);
if (err)
goto out;
@@ -419,7 +447,11 @@ int hfsplus_rename_cat(u32 cnid,
type = be16_to_cpu(entry.type);
/* create new dir entry with the data from the old entry */
- hfsplus_cat_build_key(sb, dst_fd.search_key, dst_dir->i_ino, dst_name);
+ err = hfsplus_cat_build_key(sb, dst_fd.search_key,
+ dst_dir->i_ino, dst_name);
+ if (unlikely(err))
+ goto out;
+
err = hfs_brec_find(&dst_fd, hfs_find_rec_by_key);
if (err != -ENOENT) {
if (!err)
@@ -436,7 +468,11 @@ int hfsplus_rename_cat(u32 cnid,
dst_dir->i_mtime = dst_dir->i_ctime = CURRENT_TIME_SEC;
/* finally remove the old entry */
- hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name);
+ err = hfsplus_cat_build_key(sb, src_fd.search_key,
+ src_dir->i_ino, src_name);
+ if (unlikely(err))
+ goto out;
+
err = hfs_brec_find(&src_fd, hfs_find_rec_by_key);
if (err)
goto out;
@@ -449,7 +485,7 @@ int hfsplus_rename_cat(u32 cnid,
src_dir->i_mtime = src_dir->i_ctime = CURRENT_TIME_SEC;
/* remove old thread entry */
- hfsplus_cat_build_key(sb, src_fd.search_key, cnid, NULL);
+ hfsplus_cat_build_key_with_cnid(sb, src_fd.search_key, cnid);
err = hfs_brec_find(&src_fd, hfs_find_rec_by_key);
if (err)
goto out;
@@ -459,9 +495,14 @@ int hfsplus_rename_cat(u32 cnid,
goto out;
/* create new thread entry */
- hfsplus_cat_build_key(sb, dst_fd.search_key, cnid, NULL);
+ hfsplus_cat_build_key_with_cnid(sb, dst_fd.search_key, cnid);
entry_size = hfsplus_fill_cat_thread(sb, &entry, type,
dst_dir->i_ino, dst_name);
+ if (unlikely(entry_size < 0)) {
+ err = entry_size;
+ goto out;
+ }
+
err = hfs_brec_find(&dst_fd, hfs_find_rec_by_key);
if (err != -ENOENT) {
if (!err)
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index bdec66522de3..18a9cdc273f5 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -12,6 +12,7 @@
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/random.h>
+#include <linux/nls.h>
#include "hfsplus_fs.h"
#include "hfsplus_raw.h"
@@ -43,7 +44,10 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry,
err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
if (err)
return ERR_PTR(err);
- hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name);
+ err = hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino,
+ &dentry->d_name);
+ if (unlikely(err < 0))
+ goto fail;
again:
err = hfs_brec_read(&fd, &entry, sizeof(entry));
if (err) {
@@ -96,9 +100,11 @@ again:
be32_to_cpu(entry.file.permissions.dev);
str.len = sprintf(name, "iNode%d", linkid);
str.name = name;
- hfsplus_cat_build_key(sb, fd.search_key,
+ err = hfsplus_cat_build_key(sb, fd.search_key,
HFSPLUS_SB(sb)->hidden_dir->i_ino,
&str);
+ if (unlikely(err < 0))
+ goto fail;
goto again;
}
} else if (!dentry->d_fsdata)
@@ -127,7 +133,7 @@ static int hfsplus_readdir(struct file *file, struct dir_context *ctx)
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
int len, err;
- char strbuf[HFSPLUS_MAX_STRLEN + 1];
+ char *strbuf;
hfsplus_cat_entry entry;
struct hfs_find_data fd;
struct hfsplus_readdir_data *rd;
@@ -139,7 +145,12 @@ static int hfsplus_readdir(struct file *file, struct dir_context *ctx)
err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
if (err)
return err;
- hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL);
+ strbuf = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_MAX_STRLEN + 1, GFP_KERNEL);
+ if (!strbuf) {
+ err = -ENOMEM;
+ goto out;
+ }
+ hfsplus_cat_build_key_with_cnid(sb, fd.search_key, inode->i_ino);
err = hfs_brec_find(&fd, hfs_find_rec_by_key);
if (err)
goto out;
@@ -193,7 +204,7 @@ static int hfsplus_readdir(struct file *file, struct dir_context *ctx)
hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
fd.entrylength);
type = be16_to_cpu(entry.type);
- len = HFSPLUS_MAX_STRLEN;
+ len = NLS_MAX_CHARSET_SIZE * HFSPLUS_MAX_STRLEN;
err = hfsplus_uni2asc(sb, &fd.key->cat.name, strbuf, &len);
if (err)
goto out;
@@ -246,6 +257,7 @@ next:
}
memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key));
out:
+ kfree(strbuf);
hfs_find_exit(&fd);
return err;
}
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 83dc29286b10..cf10da368ee6 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -375,9 +375,6 @@ int hfsplus_attr_bin_cmp_key(const hfsplus_btree_key *,
const hfsplus_btree_key *);
int hfsplus_attr_build_key(struct super_block *, hfsplus_btree_key *,
u32, const char *);
-void hfsplus_attr_build_key_uni(hfsplus_btree_key *key,
- u32 cnid,
- struct hfsplus_attr_unistr *name);
int hfsplus_find_attr(struct super_block *, u32,
const char *, struct hfs_find_data *);
int hfsplus_attr_exists(struct inode *inode, const char *name);
@@ -444,8 +441,10 @@ int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *,
const hfsplus_btree_key *);
int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *,
const hfsplus_btree_key *);
-void hfsplus_cat_build_key(struct super_block *sb,
+int hfsplus_cat_build_key(struct super_block *sb,
hfsplus_btree_key *, u32, struct qstr *);
+void hfsplus_cat_build_key_with_cnid(struct super_block *sb,
+ hfsplus_btree_key *, u32);
int hfsplus_find_cat(struct super_block *, u32, struct hfs_find_data *);
int hfsplus_create_cat(u32, struct inode *, struct qstr *, struct inode *);
int hfsplus_delete_cat(u32, struct inode *, struct qstr *);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index a513d2d36be9..dcb474129d5c 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -514,7 +514,9 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
err = hfs_find_init(sbi->cat_tree, &fd);
if (err)
goto out_put_root;
- hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str);
+ err = hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str);
+ if (unlikely(err < 0))
+ goto out_put_root;
if (!hfs_brec_read(&fd, &entry, sizeof(entry))) {
hfs_find_exit(&fd);
if (entry.type != cpu_to_be16(HFSPLUS_FOLDER))
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index 4e27edc082a4..c03c94611cce 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -8,6 +8,7 @@
#include "hfsplus_fs.h"
#include <linux/posix_acl_xattr.h>
+#include <linux/nls.h>
#include "xattr.h"
#include "acl.h"
@@ -645,8 +646,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size)
struct hfs_find_data fd;
u16 key_len = 0;
struct hfsplus_attr_key attr_key;
- char strbuf[HFSPLUS_ATTR_MAX_STRLEN +
- XATTR_MAC_OSX_PREFIX_LEN + 1] = {0};
+ char *strbuf;
int xattr_name_len;
if ((!S_ISREG(inode->i_mode) &&
@@ -666,6 +666,13 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size)
return err;
}
+ strbuf = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN +
+ XATTR_MAC_OSX_PREFIX_LEN + 1, GFP_KERNEL);
+ if (!strbuf) {
+ res = -ENOMEM;
+ goto out;
+ }
+
err = hfsplus_find_attr(inode->i_sb, inode->i_ino, NULL, &fd);
if (err) {
if (err == -ENOENT) {
@@ -692,7 +699,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size)
if (be32_to_cpu(attr_key.cnid) != inode->i_ino)
goto end_listxattr;
- xattr_name_len = HFSPLUS_ATTR_MAX_STRLEN;
+ xattr_name_len = NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN;
if (hfsplus_uni2asc(inode->i_sb,
(const struct hfsplus_unistr *)&fd.key->attr.key_name,
strbuf, &xattr_name_len)) {
@@ -718,6 +725,8 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size)
}
end_listxattr:
+ kfree(strbuf);
+out:
hfs_find_exit(&fd);
return res;
}
@@ -797,47 +806,55 @@ end_removexattr:
static int hfsplus_osx_getxattr(struct dentry *dentry, const char *name,
void *buffer, size_t size, int type)
{
- char xattr_name[HFSPLUS_ATTR_MAX_STRLEN +
- XATTR_MAC_OSX_PREFIX_LEN + 1] = {0};
- size_t len = strlen(name);
+ char *xattr_name;
+ int res;
if (!strcmp(name, ""))
return -EINVAL;
- if (len > HFSPLUS_ATTR_MAX_STRLEN)
- return -EOPNOTSUPP;
-
/*
* Don't allow retrieving properly prefixed attributes
* by prepending them with "osx."
*/
if (is_known_namespace(name))
return -EOPNOTSUPP;
+ xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN
+ + XATTR_MAC_OSX_PREFIX_LEN + 1, GFP_KERNEL);
+ if (!xattr_name)
+ return -ENOMEM;
+ strcpy(xattr_name, XATTR_MAC_OSX_PREFIX);
+ strcpy(xattr_name + XATTR_MAC_OSX_PREFIX_LEN, name);
- return hfsplus_getxattr(dentry, xattr_name, buffer, size);
+ res = hfsplus_getxattr(dentry, xattr_name, buffer, size);
+ kfree(xattr_name);
+ return res;
}
static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name,
const void *buffer, size_t size, int flags, int type)
{
- char xattr_name[HFSPLUS_ATTR_MAX_STRLEN +
- XATTR_MAC_OSX_PREFIX_LEN + 1] = {0};
- size_t len = strlen(name);
+ char *xattr_name;
+ int res;
if (!strcmp(name, ""))
return -EINVAL;
- if (len > HFSPLUS_ATTR_MAX_STRLEN)
- return -EOPNOTSUPP;
-
/*
* Don't allow setting properly prefixed attributes
* by prepending them with "osx."
*/
if (is_known_namespace(name))
return -EOPNOTSUPP;
+ xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN
+ + XATTR_MAC_OSX_PREFIX_LEN + 1, GFP_KERNEL);
+ if (!xattr_name)
+ return -ENOMEM;
+ strcpy(xattr_name, XATTR_MAC_OSX_PREFIX);
+ strcpy(xattr_name + XATTR_MAC_OSX_PREFIX_LEN, name);
- return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags);
+ res = hfsplus_setxattr(dentry, xattr_name, buffer, size, flags);
+ kfree(xattr_name);
+ return res;
}
static size_t hfsplus_osx_listxattr(struct dentry *dentry, char *list,
diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c
index 00722765ea79..6ec5e107691f 100644
--- a/fs/hfsplus/xattr_security.c
+++ b/fs/hfsplus/xattr_security.c
@@ -7,6 +7,8 @@
*/
#include <linux/security.h>
+#include <linux/nls.h>
+
#include "hfsplus_fs.h"
#include "xattr.h"
#include "acl.h"
@@ -14,37 +16,43 @@
static int hfsplus_security_getxattr(struct dentry *dentry, const char *name,
void *buffer, size_t size, int type)
{
- char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0};
- size_t len = strlen(name);
+ char *xattr_name;
+ int res;
if (!strcmp(name, ""))
return -EINVAL;
- if (len + XATTR_SECURITY_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN)
- return -EOPNOTSUPP;
-
+ xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
+ GFP_KERNEL);
+ if (!xattr_name)
+ return -ENOMEM;
strcpy(xattr_name, XATTR_SECURITY_PREFIX);
strcpy(xattr_name + XATTR_SECURITY_PREFIX_LEN, name);
- return hfsplus_getxattr(dentry, xattr_name, buffer, size);
+ res = hfsplus_getxattr(dentry, xattr_name, buffer, size);
+ kfree(xattr_name);
+ return res;
}
static int hfsplus_security_setxattr(struct dentry *dentry, const char *name,
const void *buffer, size_t size, int flags, int type)
{
- char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0};
- size_t len = strlen(name);
+ char *xattr_name;
+ int res;
if (!strcmp(name, ""))
return -EINVAL;
- if (len + XATTR_SECURITY_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN)
- return -EOPNOTSUPP;
-
+ xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
+ GFP_KERNEL);
+ if (!xattr_name)
+ return -ENOMEM;
strcpy(xattr_name, XATTR_SECURITY_PREFIX);
strcpy(xattr_name + XATTR_SECURITY_PREFIX_LEN, name);
- return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags);
+ res = hfsplus_setxattr(dentry, xattr_name, buffer, size, flags);
+ kfree(xattr_name);
+ return res;
}
static size_t hfsplus_security_listxattr(struct dentry *dentry, char *list,
@@ -62,31 +70,30 @@ static int hfsplus_initxattrs(struct inode *inode,
void *fs_info)
{
const struct xattr *xattr;
- char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0};
- size_t xattr_name_len;
+ char *xattr_name;
int err = 0;
+ xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
+ GFP_KERNEL);
+ if (!xattr_name)
+ return -ENOMEM;
for (xattr = xattr_array; xattr->name != NULL; xattr++) {
- xattr_name_len = strlen(xattr->name);
- if (xattr_name_len == 0)
+ if (!strcmp(xattr->name, ""))
continue;
- if (xattr_name_len + XATTR_SECURITY_PREFIX_LEN >
- HFSPLUS_ATTR_MAX_STRLEN)
- return -EOPNOTSUPP;
-
strcpy(xattr_name, XATTR_SECURITY_PREFIX);
strcpy(xattr_name +
XATTR_SECURITY_PREFIX_LEN, xattr->name);
memset(xattr_name +
- XATTR_SECURITY_PREFIX_LEN + xattr_name_len, 0, 1);
+ XATTR_SECURITY_PREFIX_LEN + strlen(xattr->name), 0, 1);
err = __hfsplus_setxattr(inode, xattr_name,
xattr->value, xattr->value_len, 0);
if (err)
break;
}
+ kfree(xattr_name);
return err;
}
diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c
index 426cee277542..3c5f27e4746a 100644
--- a/fs/hfsplus/xattr_trusted.c
+++ b/fs/hfsplus/xattr_trusted.c
@@ -6,43 +6,51 @@
* Handler for trusted extended attributes.
*/
+#include <linux/nls.h>
+
#include "hfsplus_fs.h"
#include "xattr.h"
static int hfsplus_trusted_getxattr(struct dentry *dentry, const char *name,
void *buffer, size_t size, int type)
{
- char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0};
- size_t len = strlen(name);
+ char *xattr_name;
+ int res;
if (!strcmp(name, ""))
return -EINVAL;
- if (len + XATTR_TRUSTED_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN)
- return -EOPNOTSUPP;
-
+ xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
+ GFP_KERNEL);
+ if (!xattr_name)
+ return -ENOMEM;
strcpy(xattr_name, XATTR_TRUSTED_PREFIX);
strcpy(xattr_name + XATTR_TRUSTED_PREFIX_LEN, name);
- return hfsplus_getxattr(dentry, xattr_name, buffer, size);
+ res = hfsplus_getxattr(dentry, xattr_name, buffer, size);
+ kfree(xattr_name);
+ return res;
}
static int hfsplus_trusted_setxattr(struct dentry *dentry, const char *name,
const void *buffer, size_t size, int flags, int type)
{
- char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0};
- size_t len = strlen(name);
+ char *xattr_name;
+ int res;
if (!strcmp(name, ""))
return -EINVAL;
- if (len + XATTR_TRUSTED_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN)
- return -EOPNOTSUPP;
-
+ xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
+ GFP_KERNEL);
+ if (!xattr_name)
+ return -ENOMEM;
strcpy(xattr_name, XATTR_TRUSTED_PREFIX);
strcpy(xattr_name + XATTR_TRUSTED_PREFIX_LEN, name);
- return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags);
+ res = hfsplus_setxattr(dentry, xattr_name, buffer, size, flags);
+ kfree(xattr_name);
+ return res;
}
static size_t hfsplus_trusted_listxattr(struct dentry *dentry, char *list,
diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c
index e34016561ae0..2b625a538b64 100644
--- a/fs/hfsplus/xattr_user.c
+++ b/fs/hfsplus/xattr_user.c
@@ -6,43 +6,51 @@
* Handler for user extended attributes.
*/
+#include <linux/nls.h>
+
#include "hfsplus_fs.h"
#include "xattr.h"
static int hfsplus_user_getxattr(struct dentry *dentry, const char *name,
void *buffer, size_t size, int type)
{
- char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0};
- size_t len = strlen(name);
+ char *xattr_name;
+ int res;
if (!strcmp(name, ""))
return -EINVAL;
- if (len + XATTR_USER_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN)
- return -EOPNOTSUPP;
-
+ xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
+ GFP_KERNEL);
+ if (!xattr_name)
+ return -ENOMEM;
strcpy(xattr_name, XATTR_USER_PREFIX);
strcpy(xattr_name + XATTR_USER_PREFIX_LEN, name);
- return hfsplus_getxattr(dentry, xattr_name, buffer, size);
+ res = hfsplus_getxattr(dentry, xattr_name, buffer, size);
+ kfree(xattr_name);
+ return res;
}
static int hfsplus_user_setxattr(struct dentry *dentry, const char *name,
const void *buffer, size_t size, int flags, int type)
{
- char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0};
- size_t len = strlen(name);
+ char *xattr_name;
+ int res;
if (!strcmp(name, ""))
return -EINVAL;
- if (len + XATTR_USER_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN)
- return -EOPNOTSUPP;
-
+ xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
+ GFP_KERNEL);
+ if (!xattr_name)
+ return -ENOMEM;
strcpy(xattr_name, XATTR_USER_PREFIX);
strcpy(xattr_name + XATTR_USER_PREFIX_LEN, name);
- return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags);
+ res = hfsplus_setxattr(dentry, xattr_name, buffer, size, flags);
+ kfree(xattr_name);
+ return res;
}
static size_t hfsplus_user_listxattr(struct dentry *dentry, char *list,
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 204027520937..a20021fec74b 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -6,6 +6,8 @@
* Copyright (C) 2002 Linus Torvalds.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/thread_info.h>
#include <asm/current.h>
@@ -823,8 +825,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
ps = memparse(args[0].from, &rest);
pconfig->hstate = size_to_hstate(ps);
if (!pconfig->hstate) {
- printk(KERN_ERR
- "hugetlbfs: Unsupported page size %lu MB\n",
+ pr_err("Unsupported page size %lu MB\n",
ps >> 20);
return -EINVAL;
}
@@ -832,8 +833,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
}
default:
- printk(KERN_ERR "hugetlbfs: Bad mount option: \"%s\"\n",
- p);
+ pr_err("Bad mount option: \"%s\"\n", p);
return -EINVAL;
break;
}
@@ -853,8 +853,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
return 0;
bad_val:
- printk(KERN_ERR "hugetlbfs: Bad value '%s' for mount option '%s'\n",
- args[0].from, p);
+ pr_err("Bad value '%s' for mount option '%s'\n", args[0].from, p);
return -EINVAL;
}
@@ -970,8 +969,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
*user = current_user();
if (user_shm_lock(size, *user)) {
task_lock(current);
- printk_once(KERN_WARNING
- "%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
+ pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
current->comm, current->pid);
task_unlock(current);
} else {
@@ -1030,6 +1028,11 @@ static int __init init_hugetlbfs_fs(void)
int error;
int i;
+ if (!hugepages_supported()) {
+ pr_info("disabling because there are no supported hugepage sizes\n");
+ return -ENOTSUPP;
+ }
+
error = bdi_init(&hugetlbfs_backing_dev_info);
if (error)
return error;
@@ -1055,7 +1058,7 @@ static int __init init_hugetlbfs_fs(void)
buf);
if (IS_ERR(hugetlbfs_vfsmount[i])) {
- pr_err("hugetlb: Cannot mount internal hugetlbfs for "
+ pr_err("Cannot mount internal hugetlbfs for "
"page size %uK", ps_kb);
error = PTR_ERR(hugetlbfs_vfsmount[i]);
hugetlbfs_vfsmount[i] = NULL;
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 2b60ce1996aa..bb9cebc9ca8a 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -75,10 +75,13 @@ void jffs2_stop_garbage_collect_thread(struct jffs2_sb_info *c)
static int jffs2_garbage_collect_thread(void *_c)
{
struct jffs2_sb_info *c = _c;
+ sigset_t hupmask;
+ siginitset(&hupmask, sigmask(SIGHUP));
allow_signal(SIGKILL);
allow_signal(SIGSTOP);
allow_signal(SIGCONT);
+ allow_signal(SIGHUP);
c->gc_task = current;
complete(&c->gc_thread_start);
@@ -87,7 +90,7 @@ static int jffs2_garbage_collect_thread(void *_c)
set_freezable();
for (;;) {
- allow_signal(SIGHUP);
+ sigprocmask(SIG_UNBLOCK, &hupmask, NULL);
again:
spin_lock(&c->erase_completion_lock);
if (!jffs2_thread_should_wake(c)) {
@@ -95,10 +98,9 @@ static int jffs2_garbage_collect_thread(void *_c)
spin_unlock(&c->erase_completion_lock);
jffs2_dbg(1, "%s(): sleeping...\n", __func__);
schedule();
- } else
+ } else {
spin_unlock(&c->erase_completion_lock);
-
-
+ }
/* Problem - immediately after bootup, the GCD spends a lot
* of time in places like jffs2_kill_fragtree(); so much so
* that userspace processes (like gdm and X) are starved
@@ -150,7 +152,7 @@ static int jffs2_garbage_collect_thread(void *_c)
}
}
/* We don't want SIGHUP to interrupt us. STOP and KILL are OK though. */
- disallow_signal(SIGHUP);
+ sigprocmask(SIG_BLOCK, &hupmask, NULL);
jffs2_dbg(1, "%s(): pass\n", __func__);
if (jffs2_garbage_collect_pass(c) == -ENOSPC) {
diff --git a/fs/mpage.c b/fs/mpage.c
index 4979ffa60aaa..4e0af5ae34fa 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -462,6 +462,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
struct buffer_head map_bh;
loff_t i_size = i_size_read(inode);
int ret = 0;
+ int wr = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
if (page_has_buffers(page)) {
struct buffer_head *head = page_buffers(page);
@@ -570,7 +571,7 @@ page_is_mapped:
* This page will go to BIO. Do we need to send this BIO off first?
*/
if (bio && mpd->last_block_in_bio != blocks[0] - 1)
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
alloc_new:
if (bio == NULL) {
@@ -587,7 +588,7 @@ alloc_new:
*/
length = first_unmapped << blkbits;
if (bio_add_page(bio, page, length, 0) < length) {
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
goto alloc_new;
}
@@ -620,7 +621,7 @@ alloc_new:
set_page_writeback(page);
unlock_page(page);
if (boundary || (first_unmapped != blocks_per_page)) {
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
if (boundary_block) {
write_boundary_block(boundary_bdev,
boundary_block, 1 << blkbits);
@@ -632,7 +633,7 @@ alloc_new:
confused:
if (bio)
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
if (mpd->use_writepage) {
ret = mapping->a_ops->writepage(page, wbc);
@@ -688,8 +689,11 @@ mpage_writepages(struct address_space *mapping,
};
ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
- if (mpd.bio)
- mpage_bio_submit(WRITE, mpd.bio);
+ if (mpd.bio) {
+ int wr = (wbc->sync_mode == WB_SYNC_ALL ?
+ WRITE_SYNC : WRITE);
+ mpage_bio_submit(wr, mpd.bio);
+ }
}
blk_finish_plug(&plug);
return ret;
@@ -706,8 +710,11 @@ int mpage_writepage(struct page *page, get_block_t get_block,
.use_writepage = 0,
};
int ret = __mpage_writepage(page, wbc, &mpd);
- if (mpd.bio)
- mpage_bio_submit(WRITE, mpd.bio);
+ if (mpd.bio) {
+ int wr = (wbc->sync_mode == WB_SYNC_ALL ?
+ WRITE_SYNC : WRITE);
+ mpage_bio_submit(wr, mpd.bio);
+ }
return ret;
}
EXPORT_SYMBOL(mpage_writepage);
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 4e565c814309..d42220f7d739 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -811,6 +811,15 @@ SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
group->priority == FS_PRIO_0)
goto fput_and_out;
+ if (flags & FAN_MARK_FLUSH) {
+ ret = 0;
+ if (flags & FAN_MARK_MOUNT)
+ fsnotify_clear_vfsmount_marks_by_group(group);
+ else
+ fsnotify_clear_inode_marks_by_group(group);
+ goto fput_and_out;
+ }
+
ret = fanotify_find_path(dfd, pathname, &path, flags);
if (ret)
goto fput_and_out;
@@ -822,7 +831,7 @@ SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
mnt = path.mnt;
/* create/update an inode mark */
- switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
+ switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) {
case FAN_MARK_ADD:
if (flags & FAN_MARK_MOUNT)
ret = fanotify_add_vfsmount_mark(group, mnt, mask, flags);
@@ -835,12 +844,6 @@ SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
else
ret = fanotify_remove_inode_mark(group, inode, mask, flags);
break;
- case FAN_MARK_FLUSH:
- if (flags & FAN_MARK_MOUNT)
- fsnotify_clear_vfsmount_marks_by_group(group);
- else
- fsnotify_clear_inode_marks_by_group(group);
- break;
default:
ret = -EINVAL;
}
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 923fe4a5f503..d90deaa08e78 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -340,7 +340,7 @@ void fsnotify_init_mark(struct fsnotify_mark *mark,
static int fsnotify_mark_destroy(void *ignored)
{
struct fsnotify_mark *mark, *next;
- LIST_HEAD(private_destroy_list);
+ struct list_head private_destroy_list;
for (;;) {
spin_lock(&destroy_lock);
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index ee4144ce5d7c..f82498c35e78 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -58,7 +58,7 @@ typedef enum {
/**
* ntfs_compression_buffer - one buffer for the decompression engine
*/
-static u8 *ntfs_compression_buffer = NULL;
+static u8 *ntfs_compression_buffer;
/**
* ntfs_cb_lock - spinlock which protects ntfs_compression_buffer
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 9de2491f2926..6c3296e546c3 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -50,8 +50,8 @@
static unsigned long ntfs_nr_compression_users;
/* A global default upcase table and a corresponding reference count. */
-static ntfschar *default_upcase = NULL;
-static unsigned long ntfs_nr_upcase_users = 0;
+static ntfschar *default_upcase;
+static unsigned long ntfs_nr_upcase_users;
/* Error constants/strings used in inode.c::ntfs_show_options(). */
typedef enum {
diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c
index 79a89184cb5e..1927170a35ce 100644
--- a/fs/ntfs/sysctl.c
+++ b/fs/ntfs/sysctl.c
@@ -56,7 +56,7 @@ static ctl_table sysctls_root[] = {
};
/* Storage for the sysctls header. */
-static struct ctl_table_header *sysctls_root_table = NULL;
+static struct ctl_table_header *sysctls_root_table;
/**
* ntfs_sysctl - add or remove the debug sysctl
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index c6b90e670389..681691bc233a 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -108,7 +108,7 @@ static struct rb_root o2net_handler_tree = RB_ROOT;
static struct o2net_node o2net_nodes[O2NM_MAX_NODES];
/* XXX someday we'll need better accounting */
-static struct socket *o2net_listen_sock = NULL;
+static struct socket *o2net_listen_sock;
/*
* listen work is only queued by the listening socket callbacks on the
@@ -1799,7 +1799,7 @@ int o2net_register_hb_callbacks(void)
/* ------------------------------------------------------------ */
-static int o2net_accept_one(struct socket *sock)
+static int o2net_accept_one(struct socket *sock, int *more)
{
int ret, slen;
struct sockaddr_in sin;
@@ -1810,6 +1810,7 @@ static int o2net_accept_one(struct socket *sock)
struct o2net_node *nn;
BUG_ON(sock == NULL);
+ *more = 0;
ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
sock->sk->sk_protocol, &new_sock);
if (ret)
@@ -1821,6 +1822,7 @@ static int o2net_accept_one(struct socket *sock)
if (ret < 0)
goto out;
+ *more = 1;
new_sock->sk->sk_allocation = GFP_ATOMIC;
ret = o2net_set_nodelay(new_sock);
@@ -1919,11 +1921,36 @@ out:
return ret;
}
+/*
+ * This function is invoked in response to one or more
+ * pending accepts at softIRQ level. We must drain the
+ * entire que before returning.
+ */
+
static void o2net_accept_many(struct work_struct *work)
{
struct socket *sock = o2net_listen_sock;
- while (o2net_accept_one(sock) == 0)
+ int more;
+ int err;
+
+ /*
+ * It is critical to note that due to interrupt moderation
+ * at the network driver level, we can't assume to get a
+ * softIRQ for every single conn since tcp SYN packets
+ * can arrive back-to-back, and therefore many pending
+ * accepts may result in just 1 softIRQ. If we terminate
+ * the o2net_accept_one() loop upon seeing an err, what happens
+ * to the rest of the conns in the queue? If no new SYN
+ * arrives for hours, no softIRQ will be delivered,
+ * and the connections will just sit in the queue.
+ */
+
+ for (;;) {
+ err = o2net_accept_one(sock, &more);
+ if (!more)
+ break;
cond_resched();
+ }
}
static void o2net_listen_data_ready(struct sock *sk)
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index e33cd7a3c582..18f13c2e4a10 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -338,7 +338,7 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle)
#ifdef CONFIG_DEBUG_FS
-static struct dentry *dlm_debugfs_root = NULL;
+static struct dentry *dlm_debugfs_root;
#define DLM_DEBUGFS_DIR "o2dlm"
#define DLM_DEBUGFS_DLM_STATE "dlm_state"
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 5d32f7511f74..66c2a491f68d 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -52,7 +52,7 @@
#define MLOG_MASK_PREFIX ML_DLM
#include "cluster/masklog.h"
-static struct kmem_cache *dlm_lock_cache = NULL;
+static struct kmem_cache *dlm_lock_cache;
static DEFINE_SPINLOCK(dlm_cookie_lock);
static u64 dlm_next_cookie = 1;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index af3f7aa73e13..1256dc49f83f 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -82,9 +82,9 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
return 1;
}
-static struct kmem_cache *dlm_lockres_cache = NULL;
-static struct kmem_cache *dlm_lockname_cache = NULL;
-static struct kmem_cache *dlm_mle_cache = NULL;
+static struct kmem_cache *dlm_lockres_cache;
+static struct kmem_cache *dlm_lockname_cache;
+static struct kmem_cache *dlm_mle_cache;
static void dlm_mle_release(struct kref *kref);
static void dlm_init_mle(struct dlm_master_list_entry *mle,
@@ -3084,11 +3084,15 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
/* remove it so that only one mle will be found */
__dlm_unlink_mle(dlm, tmp);
__dlm_mle_detach_hb_events(dlm, tmp);
- ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
- mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
- "telling master to get ref for cleared out mle "
- "during migration\n", dlm->name, namelen, name,
- master, new_master);
+ if (tmp->type == DLM_MLE_MASTER) {
+ ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
+ mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
+ "telling master to get ref "
+ "for cleared out mle during "
+ "migration\n", dlm->name,
+ namelen, name, master,
+ new_master);
+ }
}
spin_unlock(&tmp->spinlock);
}
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 03ea9314fecd..4b0c68849b36 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -30,6 +30,7 @@
#include <linux/kthread.h>
#include <linux/time.h>
#include <linux/random.h>
+#include <linux/delay.h>
#include <cluster/masklog.h>
@@ -2185,8 +2186,20 @@ static int ocfs2_commit_thread(void *arg)
|| kthread_should_stop());
status = ocfs2_commit_cache(osb);
- if (status < 0)
- mlog_errno(status);
+ if (status < 0) {
+ static unsigned long abort_warn_time;
+
+ /* Warn about this once per minute */
+ if (printk_timed_ratelimit(&abort_warn_time, 60*HZ))
+ mlog(ML_ERROR, "status = %d, journal is "
+ "already aborted.\n", status);
+ /*
+ * After ocfs2_commit_cache() fails, j_num_trans has a
+ * non-zero value. Sleep here to avoid a busy-wait
+ * loop.
+ */
+ msleep_interruptible(1000);
+ }
if (kthread_should_stop() && atomic_read(&journal->j_num_trans)){
mlog(ML_KTHREAD,
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 2060fc398445..ad8022431e09 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -231,6 +231,7 @@ static int ocfs2_mknod(struct inode *dir,
sigset_t oldset;
int did_block_signals = 0;
struct posix_acl *default_acl = NULL, *acl = NULL;
+ struct ocfs2_dentry_lock *dl = NULL;
trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name,
(unsigned long long)OCFS2_I(dir)->ip_blkno,
@@ -423,6 +424,8 @@ static int ocfs2_mknod(struct inode *dir,
goto leave;
}
+ dl = dentry->d_fsdata;
+
status = ocfs2_add_entry(handle, dentry, inode,
OCFS2_I(inode)->ip_blkno, parent_fe_bh,
&lookup);
@@ -469,6 +472,16 @@ leave:
* ocfs2_delete_inode will mutex_lock again.
*/
if ((status < 0) && inode) {
+ if (dl) {
+ ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
+ ocfs2_lock_res_free(&dl->dl_lockres);
+ BUG_ON(dl->dl_count != 1);
+ spin_lock(&dentry_attach_lock);
+ dentry->d_fsdata = NULL;
+ spin_unlock(&dentry_attach_lock);
+ kfree(dl);
+ iput(inode);
+ }
OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
clear_nlink(inode);
iput(inode);
@@ -991,6 +1004,65 @@ leave:
return status;
}
+static int ocfs2_check_if_ancestor(struct ocfs2_super *osb,
+ u64 src_inode_no, u64 dest_inode_no)
+{
+ int ret = 0, i = 0;
+ u64 parent_inode_no = 0;
+ u64 child_inode_no = src_inode_no;
+ struct inode *child_inode;
+
+#define MAX_LOOKUP_TIMES 32
+ while (1) {
+ child_inode = ocfs2_iget(osb, child_inode_no, 0, 0);
+ if (IS_ERR(child_inode)) {
+ ret = PTR_ERR(child_inode);
+ break;
+ }
+
+ ret = ocfs2_inode_lock(child_inode, NULL, 0);
+ if (ret < 0) {
+ iput(child_inode);
+ if (ret != -ENOENT)
+ mlog_errno(ret);
+ break;
+ }
+
+ ret = ocfs2_lookup_ino_from_name(child_inode, "..", 2,
+ &parent_inode_no);
+ ocfs2_inode_unlock(child_inode, 0);
+ iput(child_inode);
+ if (ret < 0) {
+ ret = -ENOENT;
+ break;
+ }
+
+ if (parent_inode_no == dest_inode_no) {
+ ret = 1;
+ break;
+ }
+
+ if (parent_inode_no == osb->root_inode->i_ino) {
+ ret = 0;
+ break;
+ }
+
+ child_inode_no = parent_inode_no;
+
+ if (++i >= MAX_LOOKUP_TIMES) {
+ mlog(ML_NOTICE, "max lookup times reached, filesystem "
+ "may have nested directories, "
+ "src inode: %llu, dest inode: %llu.\n",
+ (unsigned long long)src_inode_no,
+ (unsigned long long)dest_inode_no);
+ ret = 0;
+ break;
+ }
+ }
+
+ return ret;
+}
+
/*
* The only place this should be used is rename!
* if they have the same id, then the 1st one is the only one locked.
@@ -1002,6 +1074,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
struct inode *inode2)
{
int status;
+ int inode1_is_ancestor, inode2_is_ancestor;
struct ocfs2_inode_info *oi1 = OCFS2_I(inode1);
struct ocfs2_inode_info *oi2 = OCFS2_I(inode2);
struct buffer_head **tmpbh;
@@ -1015,9 +1088,26 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
if (*bh2)
*bh2 = NULL;
- /* we always want to lock the one with the lower lockid first. */
+ /* we always want to lock the one with the lower lockid first.
+ * and if they are nested, we lock ancestor first */
if (oi1->ip_blkno != oi2->ip_blkno) {
- if (oi1->ip_blkno < oi2->ip_blkno) {
+ inode1_is_ancestor = ocfs2_check_if_ancestor(osb, oi2->ip_blkno,
+ oi1->ip_blkno);
+ if (inode1_is_ancestor < 0) {
+ status = inode1_is_ancestor;
+ goto bail;
+ }
+
+ inode2_is_ancestor = ocfs2_check_if_ancestor(osb, oi1->ip_blkno,
+ oi2->ip_blkno);
+ if (inode2_is_ancestor < 0) {
+ status = inode2_is_ancestor;
+ goto bail;
+ }
+
+ if ((inode1_is_ancestor == 1) ||
+ (oi1->ip_blkno < oi2->ip_blkno &&
+ inode2_is_ancestor == 0)) {
/* switch id1 and id2 around */
tmpbh = bh2;
bh2 = bh1;
@@ -1098,6 +1188,7 @@ static int ocfs2_rename(struct inode *old_dir,
struct ocfs2_dir_lookup_result old_entry_lookup = { NULL, };
struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
struct ocfs2_dir_lookup_result target_insert = { NULL, };
+ bool should_add_orphan = false;
/* At some point it might be nice to break this function up a
* bit. */
@@ -1134,6 +1225,22 @@ static int ocfs2_rename(struct inode *old_dir,
goto bail;
}
rename_lock = 1;
+
+ /* here we cannot guarantee the inodes haven't just been
+ * changed, so check if they are nested again */
+ status = ocfs2_check_if_ancestor(osb, new_dir->i_ino,
+ old_inode->i_ino);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ } else if (status == 1) {
+ status = -EPERM;
+ mlog(ML_ERROR, "src inode %llu should not be ancestor "
+ "of new dir inode %llu\n",
+ (unsigned long long)old_inode->i_ino,
+ (unsigned long long)new_dir->i_ino);
+ goto bail;
+ }
}
/* if old and new are the same, this'll just do one lock. */
@@ -1304,6 +1411,7 @@ static int ocfs2_rename(struct inode *old_dir,
mlog_errno(status);
goto bail;
}
+ should_add_orphan = true;
}
} else {
BUG_ON(new_dentry->d_parent->d_inode != new_dir);
@@ -1348,17 +1456,6 @@ static int ocfs2_rename(struct inode *old_dir,
goto bail;
}
- if (S_ISDIR(new_inode->i_mode) ||
- (ocfs2_read_links_count(newfe) == 1)) {
- status = ocfs2_orphan_add(osb, handle, new_inode,
- newfe_bh, orphan_name,
- &orphan_insert, orphan_dir);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- }
-
/* change the dirent to point to the correct inode */
status = ocfs2_update_entry(new_dir, handle, &target_lookup_res,
old_inode);
@@ -1373,6 +1470,15 @@ static int ocfs2_rename(struct inode *old_dir,
else
ocfs2_add_links_count(newfe, -1);
ocfs2_journal_dirty(handle, newfe_bh);
+ if (should_add_orphan) {
+ status = ocfs2_orphan_add(osb, handle, new_inode,
+ newfe_bh, orphan_name,
+ &orphan_insert, orphan_dir);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
} else {
/* if the name was not found in new_dir, add it now */
status = ocfs2_add_entry(handle, new_dentry, old_inode,
@@ -1642,6 +1748,7 @@ static int ocfs2_symlink(struct inode *dir,
struct ocfs2_dir_lookup_result lookup = { NULL, };
sigset_t oldset;
int did_block_signals = 0;
+ struct ocfs2_dentry_lock *dl = NULL;
trace_ocfs2_symlink_begin(dir, dentry, symname,
dentry->d_name.len, dentry->d_name.name);
@@ -1830,6 +1937,8 @@ static int ocfs2_symlink(struct inode *dir,
goto bail;
}
+ dl = dentry->d_fsdata;
+
status = ocfs2_add_entry(handle, dentry, inode,
le64_to_cpu(fe->i_blkno), parent_fe_bh,
&lookup);
@@ -1864,6 +1973,16 @@ bail:
if (xattr_ac)
ocfs2_free_alloc_context(xattr_ac);
if ((status < 0) && inode) {
+ if (dl) {
+ ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
+ ocfs2_lock_res_free(&dl->dl_lockres);
+ BUG_ON(dl->dl_count != 1);
+ spin_lock(&dentry_attach_lock);
+ dentry->d_fsdata = NULL;
+ spin_unlock(&dentry_attach_lock);
+ kfree(dl);
+ iput(inode);
+ }
OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
clear_nlink(inode);
iput(inode);
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 83f1a665ae97..5d965e83bd43 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -709,7 +709,7 @@ static struct ctl_table ocfs2_root_table[] = {
{ }
};
-static struct ctl_table_header *ocfs2_table_header = NULL;
+static struct ctl_table_header *ocfs2_table_header;
/*
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index a7cdd56f4c79..c7a89cea5c5d 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -75,7 +75,7 @@
#include "buffer_head_io.h"
-static struct kmem_cache *ocfs2_inode_cachep = NULL;
+static struct kmem_cache *ocfs2_inode_cachep;
struct kmem_cache *ocfs2_dquot_cachep;
struct kmem_cache *ocfs2_qf_chunk_cachep;
@@ -85,7 +85,7 @@ struct kmem_cache *ocfs2_qf_chunk_cachep;
* workqueue and schedule on our own. */
struct workqueue_struct *ocfs2_wq = NULL;
-static struct dentry *ocfs2_debugfs_root = NULL;
+static struct dentry *ocfs2_debugfs_root;
MODULE_AUTHOR("Oracle");
MODULE_LICENSE("GPL");
@@ -2292,8 +2292,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
goto bail;
}
- strncpy(osb->vol_label, di->id2.i_super.s_label, 63);
- osb->vol_label[63] = '\0';
+ strlcpy(osb->vol_label, di->id2.i_super.s_label,
+ OCFS2_MAX_VOL_LABEL_LEN);
osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno);
osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno);
osb->first_cluster_group_blkno =
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index 52eaf33d346f..82e17b076ce7 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -67,7 +67,7 @@ struct ocfs2_meta_cache_item {
sector_t c_block;
};
-static struct kmem_cache *ocfs2_uptodate_cachep = NULL;
+static struct kmem_cache *ocfs2_uptodate_cachep;
u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci)
{
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 442177b1119a..fa6d6a4e85b3 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -424,7 +424,6 @@ const struct file_operations proc_tid_maps_operations = {
#ifdef CONFIG_PROC_PAGE_MONITOR
struct mem_size_stats {
- struct vm_area_struct *vma;
unsigned long resident;
unsigned long shared_clean;
unsigned long shared_dirty;
@@ -438,15 +437,16 @@ struct mem_size_stats {
u64 pss;
};
-
-static void smaps_pte_entry(pte_t ptent, unsigned long addr,
- unsigned long ptent_size, struct mm_walk *walk)
+static int smaps_pte(pte_t *pte, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
{
struct mem_size_stats *mss = walk->private;
- struct vm_area_struct *vma = mss->vma;
+ struct vm_area_struct *vma = walk->vma;
pgoff_t pgoff = linear_page_index(vma, addr);
struct page *page = NULL;
int mapcount;
+ pte_t ptent = *pte;
+ unsigned long ptent_size = end - addr;
if (pte_present(ptent)) {
page = vm_normal_page(vma, addr, ptent);
@@ -463,7 +463,7 @@ static void smaps_pte_entry(pte_t ptent, unsigned long addr,
}
if (!page)
- return;
+ return 0;
if (PageAnon(page))
mss->anonymous += ptent_size;
@@ -489,35 +489,22 @@ static void smaps_pte_entry(pte_t ptent, unsigned long addr,
mss->private_clean += ptent_size;
mss->pss += (ptent_size << PSS_SHIFT);
}
+ return 0;
}
-static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
- struct mm_walk *walk)
+static int smaps_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
{
struct mem_size_stats *mss = walk->private;
- struct vm_area_struct *vma = mss->vma;
- pte_t *pte;
spinlock_t *ptl;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
- smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk);
+ if (pmd_trans_huge_lock(pmd, walk->vma, &ptl) == 1) {
+ smaps_pte((pte_t *)pmd, addr, addr + HPAGE_PMD_SIZE, walk);
spin_unlock(ptl);
mss->anonymous_thp += HPAGE_PMD_SIZE;
- return 0;
+ /* don't call smaps_pte() */
+ walk->skip = 1;
}
-
- if (pmd_trans_unstable(pmd))
- return 0;
- /*
- * The mmap_sem held all the way back in m_start() is what
- * keeps khugepaged out of here and from collapsing things
- * in here.
- */
- pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
- for (; addr != end; pte++, addr += PAGE_SIZE)
- smaps_pte_entry(*pte, addr, PAGE_SIZE, walk);
- pte_unmap_unlock(pte - 1, ptl);
- cond_resched();
return 0;
}
@@ -582,16 +569,16 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
struct vm_area_struct *vma = v;
struct mem_size_stats mss;
struct mm_walk smaps_walk = {
- .pmd_entry = smaps_pte_range,
+ .pmd_entry = smaps_pmd,
+ .pte_entry = smaps_pte,
.mm = vma->vm_mm,
+ .vma = vma,
.private = &mss,
};
memset(&mss, 0, sizeof mss);
- mss.vma = vma;
/* mmap_sem is held in m_start */
- if (vma->vm_mm && !is_vm_hugetlb_page(vma))
- walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
+ walk_page_vma(vma, &smaps_walk);
show_map_vma(m, vma, is_pid);
@@ -712,7 +699,6 @@ enum clear_refs_types {
};
struct clear_refs_private {
- struct vm_area_struct *vma;
enum clear_refs_types type;
};
@@ -737,48 +723,52 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
ptent = pte_file_clear_soft_dirty(ptent);
}
- if (vma->vm_flags & VM_SOFTDIRTY)
- vma->vm_flags &= ~VM_SOFTDIRTY;
-
set_pte_at(vma->vm_mm, addr, pte, ptent);
#endif
}
-static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
+static int clear_refs_pte(pte_t *pte, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
struct clear_refs_private *cp = walk->private;
- struct vm_area_struct *vma = cp->vma;
- pte_t *pte, ptent;
- spinlock_t *ptl;
+ struct vm_area_struct *vma = walk->vma;
struct page *page;
- split_huge_page_pmd(vma, addr, pmd);
- if (pmd_trans_unstable(pmd))
+ if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
+ clear_soft_dirty(vma, addr, pte);
return 0;
+ }
+ if (!pte_present(*pte))
+ return 0;
+ page = vm_normal_page(vma, addr, *pte);
+ if (!page)
+ return 0;
+ /* Clear accessed and referenced bits. */
+ ptep_test_and_clear_young(vma, addr, pte);
+ ClearPageReferenced(page);
+ return 0;
+}
- pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
- for (; addr != end; pte++, addr += PAGE_SIZE) {
- ptent = *pte;
-
- if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
- clear_soft_dirty(vma, addr, pte);
- continue;
- }
-
- if (!pte_present(ptent))
- continue;
-
- page = vm_normal_page(vma, addr, ptent);
- if (!page)
- continue;
+static int clear_refs_test_walk(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct clear_refs_private *cp = walk->private;
+ struct vm_area_struct *vma = walk->vma;
- /* Clear accessed and referenced bits. */
- ptep_test_and_clear_young(vma, addr, pte);
- ClearPageReferenced(page);
+ /*
+ * Writing 1 to /proc/pid/clear_refs affects all pages.
+ * Writing 2 to /proc/pid/clear_refs only affects anonymous pages.
+ * Writing 3 to /proc/pid/clear_refs only affects file mapped pages.
+ * Writing 4 to /proc/pid/clear_refs affects all pages.
+ */
+ if (cp->type == CLEAR_REFS_ANON && vma->vm_file)
+ walk->skip = 1;
+ if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file)
+ walk->skip = 1;
+ if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
+ if (vma->vm_flags & VM_SOFTDIRTY)
+ vma->vm_flags &= ~VM_SOFTDIRTY;
}
- pte_unmap_unlock(pte - 1, ptl);
- cond_resched();
return 0;
}
@@ -807,8 +797,9 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
if (type == CLEAR_REFS_SOFT_DIRTY) {
soft_dirty_cleared = true;
- pr_warn_once("The pagemap bits 55-60 has changed their meaning! "
- "See the linux/Documentation/vm/pagemap.txt for details.\n");
+ pr_warn_once("The pagemap bits 55-60 has changed their meaning!"
+ " See the linux/Documentation/vm/pagemap.txt for "
+ "details.\n");
}
task = get_proc_task(file_inode(file));
@@ -820,33 +811,16 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
.type = type,
};
struct mm_walk clear_refs_walk = {
- .pmd_entry = clear_refs_pte_range,
+ .pte_entry = clear_refs_pte,
+ .test_walk = clear_refs_test_walk,
.mm = mm,
.private = &cp,
};
down_read(&mm->mmap_sem);
if (type == CLEAR_REFS_SOFT_DIRTY)
mmu_notifier_invalidate_range_start(mm, 0, -1);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- cp.vma = vma;
- if (is_vm_hugetlb_page(vma))
- continue;
- /*
- * Writing 1 to /proc/pid/clear_refs affects all pages.
- *
- * Writing 2 to /proc/pid/clear_refs only affects
- * Anonymous pages.
- *
- * Writing 3 to /proc/pid/clear_refs only affects file
- * mapped pages.
- */
- if (type == CLEAR_REFS_ANON && vma->vm_file)
- continue;
- if (type == CLEAR_REFS_MAPPED && !vma->vm_file)
- continue;
- walk_page_range(vma->vm_start, vma->vm_end,
- &clear_refs_walk);
- }
+ for (vma = mm->mmap; vma; vma = vma->vm_next)
+ walk_page_vma(vma, &clear_refs_walk);
if (type == CLEAR_REFS_SOFT_DIRTY)
mmu_notifier_invalidate_range_end(mm, 0, -1);
flush_tlb_mm(mm);
@@ -987,19 +961,33 @@ static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemap
}
#endif
-static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+static int pagemap_pte(pte_t *pte, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
- struct vm_area_struct *vma;
+ struct vm_area_struct *vma = walk->vma;
struct pagemapread *pm = walk->private;
- spinlock_t *ptl;
- pte_t *pte;
+ pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
+
+ if (vma && vma->vm_start <= addr && end <= vma->vm_end) {
+ pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
+ /* unmap before userspace copy */
+ pte_unmap(pte);
+ }
+ return add_to_pagemap(addr, &pme, pm);
+}
+
+static int pagemap_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
int err = 0;
+ struct vm_area_struct *vma = walk->vma;
+ struct pagemapread *pm = walk->private;
pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
+ spinlock_t *ptl;
- /* find the first VMA at or above 'addr' */
- vma = find_vma(walk->mm, addr);
- if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (!vma)
+ return err;
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
int pmd_flags2;
if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
@@ -1018,41 +1006,9 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
break;
}
spin_unlock(ptl);
- return err;
+ /* don't call pagemap_pte() */
+ walk->skip = 1;
}
-
- if (pmd_trans_unstable(pmd))
- return 0;
- for (; addr != end; addr += PAGE_SIZE) {
- int flags2;
-
- /* check to see if we've left 'vma' behind
- * and need a new, higher one */
- if (vma && (addr >= vma->vm_end)) {
- vma = find_vma(walk->mm, addr);
- if (vma && (vma->vm_flags & VM_SOFTDIRTY))
- flags2 = __PM_SOFT_DIRTY;
- else
- flags2 = 0;
- pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
- }
-
- /* check that 'vma' actually covers this address,
- * and that it isn't a huge page vma */
- if (vma && (vma->vm_start <= addr) &&
- !is_vm_hugetlb_page(vma)) {
- pte = pte_offset_map(pmd, addr);
- pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
- /* unmap before userspace copy */
- pte_unmap(pte);
- }
- err = add_to_pagemap(addr, &pme, pm);
- if (err)
- return err;
- }
-
- cond_resched();
-
return err;
}
@@ -1070,24 +1026,22 @@ static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *
}
/* This function walks within one hugetlb entry in the single call */
-static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
- unsigned long addr, unsigned long end,
- struct mm_walk *walk)
+static int pagemap_hugetlb(pte_t *pte, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
{
struct pagemapread *pm = walk->private;
- struct vm_area_struct *vma;
+ struct vm_area_struct *vma = walk->vma;
int err = 0;
int flags2;
pagemap_entry_t pme;
+ unsigned long hmask;
- vma = find_vma(walk->mm, addr);
- WARN_ON_ONCE(!vma);
-
- if (vma && (vma->vm_flags & VM_SOFTDIRTY))
+ if (vma->vm_flags & VM_SOFTDIRTY)
flags2 = __PM_SOFT_DIRTY;
else
flags2 = 0;
+ hmask = huge_page_mask(hstate_vma(vma));
for (; addr != end; addr += PAGE_SIZE) {
int offset = (addr & ~hmask) >> PAGE_SHIFT;
huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2);
@@ -1095,9 +1049,6 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
if (err)
return err;
}
-
- cond_resched();
-
return err;
}
#endif /* HUGETLB_PAGE */
@@ -1164,10 +1115,11 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
if (!mm || IS_ERR(mm))
goto out_free;
- pagemap_walk.pmd_entry = pagemap_pte_range;
+ pagemap_walk.pte_entry = pagemap_pte;
+ pagemap_walk.pmd_entry = pagemap_pmd;
pagemap_walk.pte_hole = pagemap_pte_hole;
#ifdef CONFIG_HUGETLB_PAGE
- pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
+ pagemap_walk.hugetlb_entry = pagemap_hugetlb;
#endif
pagemap_walk.mm = mm;
pagemap_walk.private = &pm;
@@ -1243,7 +1195,6 @@ const struct file_operations proc_pagemap_operations = {
#ifdef CONFIG_NUMA
struct numa_maps {
- struct vm_area_struct *vma;
unsigned long pages;
unsigned long anon;
unsigned long active;
@@ -1309,44 +1260,42 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
return page;
}
-static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
+static int gather_pte_stats(pte_t *pte, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
- struct numa_maps *md;
- spinlock_t *ptl;
- pte_t *orig_pte;
- pte_t *pte;
+ struct numa_maps *md = walk->private;
- md = walk->private;
+ struct page *page = can_gather_numa_stats(*pte, walk->vma, addr);
+ if (!page)
+ return 0;
+ gather_stats(page, md, pte_dirty(*pte), 1);
+ return 0;
+}
- if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) {
+static int gather_pmd_stats(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ struct numa_maps *md = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ spinlock_t *ptl;
+
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
pte_t huge_pte = *(pte_t *)pmd;
struct page *page;
- page = can_gather_numa_stats(huge_pte, md->vma, addr);
+ page = can_gather_numa_stats(huge_pte, vma, addr);
if (page)
gather_stats(page, md, pte_dirty(huge_pte),
HPAGE_PMD_SIZE/PAGE_SIZE);
spin_unlock(ptl);
- return 0;
+ /* don't call gather_pte_stats() */
+ walk->skip = 1;
}
-
- if (pmd_trans_unstable(pmd))
- return 0;
- orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
- do {
- struct page *page = can_gather_numa_stats(*pte, md->vma, addr);
- if (!page)
- continue;
- gather_stats(page, md, pte_dirty(*pte), 1);
-
- } while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap_unlock(orig_pte, ptl);
return 0;
}
#ifdef CONFIG_HUGETLB_PAGE
-static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
- unsigned long addr, unsigned long end, struct mm_walk *walk)
+static int gather_hugetlb_stats(pte_t *pte, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
{
struct numa_maps *md;
struct page *page;
@@ -1354,6 +1303,9 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
if (pte_none(*pte))
return 0;
+ if (!pte_present(*pte))
+ return 0;
+
page = pte_page(*pte);
if (!page)
return 0;
@@ -1364,8 +1316,8 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
}
#else
-static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
- unsigned long addr, unsigned long end, struct mm_walk *walk)
+static int gather_hugetlb_stats(pte_t *pte, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
{
return 0;
}
@@ -1394,12 +1346,12 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
/* Ensure we start with an empty set of numa_maps statistics. */
memset(md, 0, sizeof(*md));
- md->vma = vma;
-
- walk.hugetlb_entry = gather_hugetbl_stats;
- walk.pmd_entry = gather_pte_stats;
+ walk.hugetlb_entry = gather_hugetlb_stats;
+ walk.pmd_entry = gather_pmd_stats;
+ walk.pte_entry = gather_pte_stats;
walk.private = md;
walk.mm = mm;
+ walk.vma = vma;
pol = get_vma_policy(task, vma, vma->vm_start);
mpol_to_str(buffer, sizeof(buffer), pol);
@@ -1430,6 +1382,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
if (is_vm_hugetlb_page(vma))
seq_printf(m, " huge");
+ /* mmap_sem is held by m_start */
walk_page_range(vma->vm_start, vma->vm_end, &walk);
if (!md->pages)
diff --git a/fs/readdir.c b/fs/readdir.c
index 5b53d995cae6..33fd92208cb7 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -13,6 +13,7 @@
#include <linux/stat.h>
#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/fsnotify.h>
#include <linux/dirent.h>
#include <linux/security.h>
#include <linux/syscalls.h>
@@ -40,6 +41,7 @@ int iterate_dir(struct file *file, struct dir_context *ctx)
ctx->pos = file->f_pos;
res = file->f_op->iterate(file, ctx);
file->f_pos = ctx->pos;
+ fsnotify_access(file);
file_accessed(file);
}
mutex_unlock(&inode->i_mutex);
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index dc9a6829f7c6..1bcffeab713c 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -142,7 +142,6 @@ static int scan_bitmap_block(struct reiserfs_transaction_handle *th,
int org = *beg;
BUG_ON(!th->t_trans_id);
-
RFALSE(bmap_n >= reiserfs_bmap_count(s), "Bitmap %u is out of "
"range (0..%u)", bmap_n, reiserfs_bmap_count(s) - 1);
PROC_INFO_INC(s, scan_bitmap.bmap);
@@ -321,7 +320,6 @@ static int scan_bitmap(struct reiserfs_transaction_handle *th,
unsigned int off_max = s->s_blocksize << 3;
BUG_ON(!th->t_trans_id);
-
PROC_INFO_INC(s, scan_bitmap.call);
if (SB_FREE_BLOCKS(s) <= 0)
return 0; // No point in looking for more free blocks
@@ -388,9 +386,7 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th,
unsigned int nr, offset;
BUG_ON(!th->t_trans_id);
-
PROC_INFO_INC(s, free_block);
-
rs = SB_DISK_SUPER_BLOCK(s);
sbh = SB_BUFFER_WITH_SB(s);
apbi = SB_AP_BITMAP(s);
@@ -435,8 +431,8 @@ void reiserfs_free_block(struct reiserfs_transaction_handle *th,
int for_unformatted)
{
struct super_block *s = th->t_super;
- BUG_ON(!th->t_trans_id);
+ BUG_ON(!th->t_trans_id);
RFALSE(!s, "vs-4061: trying to free block on nonexistent device");
if (!is_reusable(s, block, 1))
return;
@@ -471,6 +467,7 @@ static void __discard_prealloc(struct reiserfs_transaction_handle *th,
unsigned long save = ei->i_prealloc_block;
int dirty = 0;
struct inode *inode = &ei->vfs_inode;
+
BUG_ON(!th->t_trans_id);
#ifdef CONFIG_REISERFS_CHECK
if (ei->i_prealloc_count < 0)
@@ -494,6 +491,7 @@ void reiserfs_discard_prealloc(struct reiserfs_transaction_handle *th,
struct inode *inode)
{
struct reiserfs_inode_info *ei = REISERFS_I(inode);
+
BUG_ON(!th->t_trans_id);
if (ei->i_prealloc_count)
__discard_prealloc(th, ei);
@@ -504,7 +502,6 @@ void reiserfs_discard_all_prealloc(struct reiserfs_transaction_handle *th)
struct list_head *plist = &SB_JOURNAL(th->t_super)->j_prealloc_list;
BUG_ON(!th->t_trans_id);
-
while (!list_empty(plist)) {
struct reiserfs_inode_info *ei;
ei = list_entry(plist->next, struct reiserfs_inode_info,
@@ -562,7 +559,7 @@ int reiserfs_parse_alloc_options(struct super_block *s, char *options)
if (!strcmp(this_char, "displacing_new_packing_localities")) {
SET_OPTION(displacing_new_packing_localities);
continue;
- };
+ }
if (!strcmp(this_char, "old_hashed_relocation")) {
SET_OPTION(old_hashed_relocation);
@@ -729,6 +726,7 @@ void show_alloc_options(struct seq_file *seq, struct super_block *s)
static inline void new_hashed_relocation(reiserfs_blocknr_hint_t * hint)
{
char *hash_in;
+
if (hint->formatted_node) {
hash_in = (char *)&hint->key.k_dir_id;
} else {
@@ -757,6 +755,7 @@ static void dirid_groups(reiserfs_blocknr_hint_t * hint)
__u32 dirid = 0;
int bm = 0;
struct super_block *sb = hint->th->t_super;
+
if (hint->inode)
dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id);
else if (hint->formatted_node)
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index a8015a7a55bb..53b2acc38213 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -233,6 +233,10 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
# define pte_accessible(mm, pte) ((void)(pte), 1)
#endif
+#ifndef pte_present_nonuma
+#define pte_present_nonuma(pte) pte_present(pte)
+#endif
+
#ifndef flush_tlb_fix_spurious_fault
#define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address)
#endif
@@ -670,7 +674,7 @@ static inline int pmd_trans_unstable(pmd_t *pmd)
static inline int pte_numa(pte_t pte)
{
return (pte_flags(pte) &
- (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA;
+ (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)) == _PAGE_NUMA;
}
#endif
@@ -678,7 +682,7 @@ static inline int pte_numa(pte_t pte)
static inline int pmd_numa(pmd_t pmd)
{
return (pmd_flags(pmd) &
- (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA;
+ (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)) == _PAGE_NUMA;
}
#endif
diff --git a/include/linux/crc64_ecma.h b/include/linux/crc64_ecma.h
new file mode 100644
index 000000000000..bba7a4d692b3
--- /dev/null
+++ b/include/linux/crc64_ecma.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2013 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __CRC64_ECMA_H_
+#define __CRC64_ECMA_H_
+
+#include <linux/types.h>
+
+
+#define CRC64_DEFAULT_INITVAL 0xFFFFFFFFFFFFFFFFULL
+
+
+/*
+ * crc64_ecma_seed - Initializes the CRC64 ECMA seed.
+ */
+u64 crc64_ecma_seed(void);
+
+/*
+ * crc64_ecma - Computes the 64 bit ECMA CRC.
+ *
+ * @pdata: pointer to the data to compute checksum for.
+ * @nbytes: number of bytes in data buffer.
+ * @seed: CRC seed.
+ */
+u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed);
+
+#endif /* __CRC64_ECMA_H_ */
diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h
index 3b28f937d959..772eab5d524a 100644
--- a/include/linux/dma-contiguous.h
+++ b/include/linux/dma-contiguous.h
@@ -88,7 +88,8 @@ static inline void dma_contiguous_set_default(struct cma *cma)
void dma_contiguous_reserve(phys_addr_t addr_limit);
int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base,
- phys_addr_t limit, struct cma **res_cma);
+ phys_addr_t limit, struct cma **res_cma,
+ bool fixed);
/**
* dma_declare_contiguous() - reserve area for contiguous memory handling
@@ -108,7 +109,7 @@ static inline int dma_declare_contiguous(struct device *dev, phys_addr_t size,
{
struct cma *cma;
int ret;
- ret = dma_contiguous_reserve_area(size, base, limit, &cma);
+ ret = dma_contiguous_reserve_area(size, base, limit, &cma, true);
if (ret == 0)
dev_set_cma_area(dev, cma);
@@ -136,7 +137,9 @@ static inline void dma_contiguous_set_default(struct cma *cma) { }
static inline void dma_contiguous_reserve(phys_addr_t limit) { }
static inline int dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base,
- phys_addr_t limit, struct cma **res_cma) {
+ phys_addr_t limit, struct cma **res_cma,
+ bool fixed)
+{
return -ENOSYS;
}
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 39b81dc7d01a..d382db71e300 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -31,7 +31,6 @@ struct vm_area_struct;
#define ___GFP_HARDWALL 0x20000u
#define ___GFP_THISNODE 0x40000u
#define ___GFP_RECLAIMABLE 0x80000u
-#define ___GFP_KMEMCG 0x100000u
#define ___GFP_NOTRACK 0x200000u
#define ___GFP_NO_KSWAPD 0x400000u
#define ___GFP_OTHER_NODE 0x800000u
@@ -91,7 +90,6 @@ struct vm_area_struct;
#define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD)
#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
-#define __GFP_KMEMCG ((__force gfp_t)___GFP_KMEMCG) /* Allocation comes from a memcg-accounted resource */
#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */
/*
@@ -353,6 +351,10 @@ extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
#define alloc_page_vma_node(gfp_mask, vma, addr, node) \
alloc_pages_vma(gfp_mask, 0, vma, addr, node)
+extern struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order);
+extern struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask,
+ unsigned int order);
+
extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
extern unsigned long get_zeroed_page(gfp_t gfp_mask);
@@ -372,8 +374,8 @@ extern void free_pages(unsigned long addr, unsigned int order);
extern void free_hot_cold_page(struct page *page, int cold);
extern void free_hot_cold_page_list(struct list_head *list, int cold);
-extern void __free_memcg_kmem_pages(struct page *page, unsigned int order);
-extern void free_memcg_kmem_pages(unsigned long addr, unsigned int order);
+extern void __free_kmem_pages(struct page *page, unsigned int order);
+extern void free_kmem_pages(unsigned long addr, unsigned int order);
#define __free_page(page) __free_pages((page), 0)
#define free_page(addr) free_pages((addr), 0)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 5b337cf8fb86..4eace5e94117 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -41,8 +41,6 @@ extern int hugetlb_max_hstate __read_mostly;
struct hugepage_subpool *hugepage_new_subpool(long nr_blocks);
void hugepage_put_subpool(struct hugepage_subpool *spool);
-int PageHuge(struct page *page);
-
void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
@@ -109,11 +107,6 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
#else /* !CONFIG_HUGETLB_PAGE */
-static inline int PageHuge(struct page *page)
-{
- return 0;
-}
-
static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
{
}
@@ -343,6 +336,11 @@ static inline unsigned huge_page_shift(struct hstate *h)
return h->order + PAGE_SHIFT;
}
+static inline bool hstate_is_gigantic(struct hstate *h)
+{
+ return huge_page_order(h) >= MAX_ORDER;
+}
+
static inline unsigned int pages_per_huge_page(struct hstate *h)
{
return 1 << h->order;
@@ -460,4 +458,14 @@ static inline spinlock_t *huge_pte_lock(struct hstate *h,
return ptl;
}
+static inline bool hugepages_supported(void)
+{
+ /*
+ * Some platform decide whether they support huge pages at boot
+ * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
+ * there is no such support
+ */
+ return HPAGE_SHIFT != 0;
+}
+
#endif /* _LINUX_HUGETLB_H */
diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h
index 2bb681fbeb35..4d60c82e9fda 100644
--- a/include/linux/hugetlb_inline.h
+++ b/include/linux/hugetlb_inline.h
@@ -10,6 +10,8 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
return !!(vma->vm_flags & VM_HUGETLB);
}
+int PageHuge(struct page *page);
+
#else
static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
@@ -17,6 +19,11 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
return 0;
}
+static inline int PageHuge(struct page *page)
+{
+ return 0;
+}
+
#endif
#endif
diff --git a/include/linux/idr.h b/include/linux/idr.h
index 6af3400b9b2f..013fd9bc4cb6 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -29,21 +29,24 @@
struct idr_layer {
int prefix; /* the ID prefix of this idr_layer */
- DECLARE_BITMAP(bitmap, IDR_SIZE); /* A zero bit means "space here" */
+ int layer; /* distance from leaf */
struct idr_layer __rcu *ary[1<<IDR_BITS];
int count; /* When zero, we can release it */
- int layer; /* distance from leaf */
- struct rcu_head rcu_head;
+ union {
+ /* A zero bit means "space here" */
+ DECLARE_BITMAP(bitmap, IDR_SIZE);
+ struct rcu_head rcu_head;
+ };
};
struct idr {
struct idr_layer __rcu *hint; /* the last layer allocated from */
struct idr_layer __rcu *top;
- struct idr_layer *id_free;
int layers; /* only valid w/o concurrent changes */
- int id_free_cnt;
int cur; /* current pos for cyclic allocation */
spinlock_t lock;
+ int id_free_cnt;
+ struct idr_layer *id_free;
};
#define IDR_INIT(name) \
diff --git a/include/linux/input.h b/include/linux/input.h
index 82ce323b9986..6453b22372ac 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -79,6 +79,7 @@ struct input_value {
* @led: reflects current state of device's LEDs
* @snd: reflects current state of sound effects
* @sw: reflects current state of device's switches
+ * @leds: leds objects for the device's LEDs
* @open: this method is called when the very first user calls
* input_open_device(). The driver must prepare the device
* to start generating events (start polling thread,
@@ -164,6 +165,8 @@ struct input_dev {
unsigned long snd[BITS_TO_LONGS(SND_CNT)];
unsigned long sw[BITS_TO_LONGS(SW_CNT)];
+ struct led_classdev *leds;
+
int (*open)(struct input_dev *dev);
void (*close)(struct input_dev *dev);
int (*flush)(struct input_dev *dev, struct file *file);
@@ -531,4 +534,22 @@ int input_ff_erase(struct input_dev *dev, int effect_id, struct file *file);
int input_ff_create_memless(struct input_dev *dev, void *data,
int (*play_effect)(struct input_dev *, void *, struct ff_effect *));
+#ifdef CONFIG_INPUT_LEDS
+
+int input_led_connect(struct input_dev *dev);
+void input_led_disconnect(struct input_dev *dev);
+
+#else
+
+static inline int input_led_connect(struct input_dev *dev)
+{
+ return 0;
+}
+
+static inline void input_led_disconnect(struct input_dev *dev)
+{
+}
+
+#endif
+
#endif
diff --git a/include/linux/mc146818rtc.h b/include/linux/mc146818rtc.h
index 2f4e957af656..433e0c74d643 100644
--- a/include/linux/mc146818rtc.h
+++ b/include/linux/mc146818rtc.h
@@ -31,6 +31,10 @@ struct cmos_rtc_board_info {
void (*wake_on)(struct device *dev);
void (*wake_off)(struct device *dev);
+ u32 flags;
+#define CMOS_RTC_FLAGS_NOFREQ (1 << 0)
+ int address_space;
+
u8 rtc_day_alarm; /* zero, or register index */
u8 rtc_mon_alarm; /* zero, or register index */
u8 rtc_century; /* zero, or register index */
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 73dc382e72d8..b660e05b63d4 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -272,6 +272,8 @@ static inline bool memblock_bottom_up(void) { return false; }
#define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0)
#define MEMBLOCK_ALLOC_ACCESSIBLE 0
+phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
+ phys_addr_t start, phys_addr_t end);
phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align,
phys_addr_t max_addr);
phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align,
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b569b8be5c5a..5155d09e749d 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -506,6 +506,9 @@ void memcg_update_array_size(int num_groups);
struct kmem_cache *
__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp);
+int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size);
+void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size);
+
void mem_cgroup_destroy_cache(struct kmem_cache *cachep);
int __kmem_cache_destroy_memcg_children(struct kmem_cache *s);
@@ -534,7 +537,7 @@ memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
* res_counter_charge_nofail, but we hope those allocations are rare,
* and won't be worth the trouble.
*/
- if (!(gfp & __GFP_KMEMCG) || (gfp & __GFP_NOFAIL))
+ if (gfp & __GFP_NOFAIL)
return true;
if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
return true;
@@ -583,17 +586,7 @@ memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
* @cachep: the original global kmem cache
* @gfp: allocation flags.
*
- * This function assumes that the task allocating, which determines the memcg
- * in the page allocator, belongs to the same cgroup throughout the whole
- * process. Misacounting can happen if the task calls memcg_kmem_get_cache()
- * while belonging to a cgroup, and later on changes. This is considered
- * acceptable, and should only happen upon task migration.
- *
- * Before the cache is created by the memcg core, there is also a possible
- * imbalance: the task belongs to a memcg, but the cache being allocated from
- * is the global cache, since the child cache is not yet guaranteed to be
- * ready. This case is also fine, since in this case the GFP_KMEMCG will not be
- * passed and the page allocator will not attempt any cgroup accounting.
+ * All memory allocated from a per-memcg cache is charged to the owner memcg.
*/
static __always_inline struct kmem_cache *
memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 4ca3d951fe91..010d125bffbf 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -187,14 +187,8 @@ extern void put_page_bootmem(struct page *page);
extern void get_page_bootmem(unsigned long ingo, struct page *page,
unsigned long type);
-/*
- * Lock for memory hotplug guarantees 1) all callbacks for memory hotplug
- * notifier will be called under this. 2) offline/online/add/remove memory
- * will not run simultaneously.
- */
-
-void lock_memory_hotplug(void);
-void unlock_memory_hotplug(void);
+void get_online_mems(void);
+void put_online_mems(void);
#else /* ! CONFIG_MEMORY_HOTPLUG */
/*
@@ -232,8 +226,8 @@ static inline int try_online_node(int nid)
return 0;
}
-static inline void lock_memory_hotplug(void) {}
-static inline void unlock_memory_hotplug(void) {}
+static inline void get_online_mems(void) {}
+static inline void put_online_mems(void) {}
#endif /* ! CONFIG_MEMORY_HOTPLUG */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bf9811e1321a..50c957f755f8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -288,6 +288,25 @@ static inline int get_freepage_migratetype(struct page *page)
}
/*
+ * Check that a freepage cannot end up on a wrong free_list for "sensitive"
+ * migratetypes. Return false if it could. Useful for VM_BUG_ON checks.
+ */
+static inline bool check_freepage_migratetype(struct page *page)
+{
+ int pageblock_mt = get_pageblock_migratetype(page);
+ int freepage_mt = get_freepage_migratetype(page);
+
+ /*
+ * For RESERVE and CMA pageblocks, the freepage_migratetype must
+ * match their migratetype. For other pageblocks, we don't care.
+ */
+ if (pageblock_mt != MIGRATE_RESERVE && !is_migrate_cma(pageblock_mt))
+ return true;
+
+ return (freepage_mt == pageblock_mt);
+}
+
+/*
* FIXME: take this include out, include page-flags.h in
* files which need it (119 of them)
*/
@@ -1096,10 +1115,18 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
* @pte_entry: if set, called for each non-empty PTE (4th-level) entry
* @pte_hole: if set, called for each hole at all levels
* @hugetlb_entry: if set, called for each hugetlb entry
- * *Caution*: The caller must hold mmap_sem() if @hugetlb_entry
- * is used.
+ * @test_walk: caller specific callback function to determine whether
+ * we walk over the current vma or not. A positive returned
+ * value means "do page table walk over the current vma,"
+ * and a negative one means "abort current page table walk
+ * right now." 0 means "skip the current vma."
+ * @mm: mm_struct representing the target process of page table walk
+ * @vma: vma currently walked
+ * @skip: internal control flag which is set when we skip the lower
+ * level entries.
+ * @private: private data for callbacks' use
*
- * (see walk_page_range for more details)
+ * (see the comment on walk_page_range() for more details)
*/
struct mm_walk {
int (*pgd_entry)(pgd_t *pgd, unsigned long addr,
@@ -1112,15 +1139,19 @@ struct mm_walk {
unsigned long next, struct mm_walk *walk);
int (*pte_hole)(unsigned long addr, unsigned long next,
struct mm_walk *walk);
- int (*hugetlb_entry)(pte_t *pte, unsigned long hmask,
- unsigned long addr, unsigned long next,
- struct mm_walk *walk);
+ int (*hugetlb_entry)(pte_t *pte, unsigned long addr,
+ unsigned long next, struct mm_walk *walk);
+ int (*test_walk)(unsigned long addr, unsigned long next,
+ struct mm_walk *walk);
struct mm_struct *mm;
+ struct vm_area_struct *vma;
+ int skip;
void *private;
};
int walk_page_range(unsigned long addr, unsigned long end,
struct mm_walk *walk);
+int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk);
void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
unsigned long end, unsigned long floor, unsigned long ceiling);
int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 8967e20cbe57..de1627232af0 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -406,7 +406,7 @@ struct mm_struct {
spinlock_t ioctx_lock;
struct kioctx_table __rcu *ioctx_table;
#endif
-#ifdef CONFIG_MM_OWNER
+#ifdef CONFIG_MEMCG
/*
* "owner" points to a task that is regarded as the canonical
* user/owner of this mm. All of the following must be true in
diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h
index 2d57efa64cc1..edd82a105220 100644
--- a/include/linux/mmdebug.h
+++ b/include/linux/mmdebug.h
@@ -1,6 +1,8 @@
#ifndef LINUX_MM_DEBUG_H
#define LINUX_MM_DEBUG_H 1
+#include <linux/stringify.h>
+
struct page;
extern void dump_page(struct page *page, const char *reason);
@@ -9,11 +11,20 @@ extern void dump_page_badflags(struct page *page, const char *reason,
#ifdef CONFIG_DEBUG_VM
#define VM_BUG_ON(cond) BUG_ON(cond)
-#define VM_BUG_ON_PAGE(cond, page) \
- do { if (unlikely(cond)) { dump_page(page, NULL); BUG(); } } while (0)
+#define VM_BUG_ON_PAGE(cond, page) \
+ do { \
+ if (unlikely(cond)) { \
+ dump_page(page, "VM_BUG_ON_PAGE(" __stringify(cond)")");\
+ BUG(); \
+ } \
+ } while (0)
+#define VM_WARN_ON(cond) WARN_ON(cond)
+#define VM_WARN_ON_ONCE(cond) WARN_ON_ONCE(cond)
#else
#define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond)
#define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond)
+#define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond)
+#define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond)
#endif
#ifdef CONFIG_DEBUG_VIRTUAL
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index fac5509c18f0..ae693e1ad0f9 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -481,9 +481,8 @@ struct zone {
* give them a chance of being in the same cacheline.
*
* Write access to present_pages at runtime should be protected by
- * lock_memory_hotplug()/unlock_memory_hotplug(). Any reader who can't
- * tolerant drift of present_pages should hold memory hotplug lock to
- * get a stable value.
+ * mem_hotplug_begin/end(). Any reader who can't tolerant drift of
+ * present_pages should get_online_mems() to get a stable value.
*
* Read access to managed_pages should be safe because it's unsigned
* long. Write access to zone->managed_pages and totalram_pages are
@@ -763,10 +762,10 @@ typedef struct pglist_data {
unsigned long node_spanned_pages; /* total size of physical page
range, including holes */
int node_id;
- nodemask_t reclaim_nodes; /* Nodes allowed to reclaim from */
wait_queue_head_t kswapd_wait;
wait_queue_head_t pfmemalloc_wait;
- struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */
+ struct task_struct *kswapd; /* Protected by
+ mem_hotplug_begin/end() */
int kswapd_max_order;
enum zone_type classzone_idx;
#ifdef CONFIG_NUMA_BALANCING
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index d1fe1a761047..bc2007ea3577 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -348,6 +348,9 @@ static inline void ClearPageCompound(struct page *page)
ClearPageHead(page);
}
#endif
+
+#define PG_head_mask ((1L << PG_head))
+
#else
/*
* Reduce page flag use as much as possible by overlapping
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 45598f1e9aa3..d4acafc51949 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -316,6 +316,34 @@ static inline loff_t page_file_offset(struct page *page)
return ((loff_t)page_file_index(page)) << PAGE_CACHE_SHIFT;
}
+/*
+ * Get the order of a given page in the context of the pagecache which it
+ * belongs to.
+ *
+ * Pagecache unit size is not a fixed value (hugetlbfs is an example), but the
+ * vma_interval_tree and anon_vma_interval_tree APIs assume that indices are in
+ * PAGE_SIZE units. So this function helps us to get normalized indices.
+ *
+ * page_size_order() should be called only for pagecache pages/hugepages and
+ * anonymous pages/hugepages, because pagecache unit size is irrelevant except
+ * for those pages.
+ */
+static inline unsigned int page_size_order(struct page *page)
+{
+ return unlikely(PageHuge(page)) ?
+ compound_order(compound_head(page)) :
+ (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+}
+
+/*
+ * page->index stores pagecache index whose unit is not always PAGE_SIZE.
+ * This function converts it into PAGE_SIZE offset.
+ */
+static inline pgoff_t page_pgoff(struct page *page)
+{
+ return page->index << page_size_order(page);
+}
+
extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
unsigned long address);
diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index a964f7285600..4b152c81c5fa 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -136,7 +136,7 @@ static inline void sg_set_buf(struct scatterlist *sg, const void *buf,
static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents,
struct scatterlist *sgl)
{
-#ifndef ARCH_HAS_SG_CHAIN
+#ifndef CONFIG_ARCH_HAS_SG_CHAIN
BUG();
#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 010cde3b44cb..d8a47da72426 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -137,12 +137,6 @@ struct filename;
#define VMACACHE_MASK (VMACACHE_SIZE - 1)
/*
- * List of flags we want to share for kernel threads,
- * if only because they are not used by them anyway.
- */
-#define CLONE_KERNEL (CLONE_FS | CLONE_FILES | CLONE_SIGHAND)
-
-/*
* These are the constant used to fake the fixed-point load-average
* counting. Some notes:
* - 11 bit fractions expand to 22 bits by the multiplies: this gives
@@ -2366,9 +2360,6 @@ extern void flush_itimer_signals(void);
extern void do_group_exit(int);
-extern int allow_signal(int);
-extern int disallow_signal(int);
-
extern int do_execve(struct filename *,
const char __user * const __user *,
const char __user * const __user *);
@@ -2954,7 +2945,7 @@ static inline void inc_syscw(struct task_struct *tsk)
#define TASK_SIZE_OF(tsk) TASK_SIZE
#endif
-#ifdef CONFIG_MM_OWNER
+#ifdef CONFIG_MEMCG
extern void mm_update_next_owner(struct mm_struct *mm);
extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
#else
@@ -2965,7 +2956,7 @@ static inline void mm_update_next_owner(struct mm_struct *mm)
static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
{
}
-#endif /* CONFIG_MM_OWNER */
+#endif /* CONFIG_MEMCG */
static inline unsigned long task_rlimit(const struct task_struct *tsk,
unsigned int limit)
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 2ac423bdb676..c9e65360c49a 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -63,11 +63,6 @@ static inline int sigismember(sigset_t *set, int _sig)
return 1 & (set->sig[sig / _NSIG_BPW] >> (sig % _NSIG_BPW));
}
-static inline int sigfindinword(unsigned long word)
-{
- return ffz(~word);
-}
-
#endif /* __HAVE_ARCH_SIG_BITOPS */
static inline int sigisemptyset(sigset_t *set)
@@ -289,6 +284,22 @@ extern int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
extern void signal_setup_done(int failed, struct ksignal *ksig, int stepping);
extern void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs *regs, int stepping);
extern void exit_signals(struct task_struct *tsk);
+extern void kernel_sigaction(int, __sighandler_t);
+
+static inline void allow_signal(int sig)
+{
+ /*
+ * Kernel threads handle their own signals. Let the signal code
+ * know it'll be handled, so that they don't get converted to
+ * SIGKILL or just silently dropped.
+ */
+ kernel_sigaction(sig, (__force __sighandler_t)2);
+}
+
+static inline void disallow_signal(int sig)
+{
+ kernel_sigaction(sig, SIG_IGN);
+}
/*
* Eventually that'll replace get_signal_to_deliver(); macro for now,
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 307bfbe62387..a6aab2c0dfc5 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -369,16 +369,7 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s,
#include <linux/slub_def.h>
#endif
-static __always_inline void *
-kmalloc_order(size_t size, gfp_t flags, unsigned int order)
-{
- void *ret;
-
- flags |= (__GFP_COMP | __GFP_KMEMCG);
- ret = (void *) __get_free_pages(flags, order);
- kmemleak_alloc(ret, size, 1, flags);
- return ret;
-}
+extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order);
#ifdef CONFIG_TRACING
extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order);
diff --git a/include/linux/string.h b/include/linux/string.h
index ac889c5ea11b..f29f9a0b7265 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -114,6 +114,7 @@ void *memchr_inv(const void *s, int c, size_t n);
extern char *kstrdup(const char *s, gfp_t gfp);
extern char *kstrndup(const char *s, size_t len, gfp_t gfp);
+extern char *kstrimdup(const char *s, gfp_t gfp);
extern void *kmemdup(const void *src, size_t len, gfp_t gfp);
extern char **argv_split(gfp_t gfp, const char *str, int *argcp);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 350711560753..5a14b928164e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -308,8 +308,9 @@ extern unsigned long nr_free_pagecache_pages(void);
/* linux/mm/swap.c */
-extern void __lru_cache_add(struct page *);
extern void lru_cache_add(struct page *);
+extern void lru_cache_add_anon(struct page *page);
+extern void lru_cache_add_file(struct page *page);
extern void lru_add_page_tail(struct page *page, struct page *page_tail,
struct lruvec *lruvec, struct list_head *head);
extern void activate_page(struct page *);
@@ -323,22 +324,6 @@ extern void swap_setup(void);
extern void add_page_to_unevictable_list(struct page *page);
-/**
- * lru_cache_add: add a page to the page lists
- * @page: the page to add
- */
-static inline void lru_cache_add_anon(struct page *page)
-{
- ClearPageActive(page);
- __lru_cache_add(page);
-}
-
-static inline void lru_cache_add_file(struct page *page)
-{
- ClearPageActive(page);
- __lru_cache_add(page);
-}
-
/* linux/mm/vmscan.c */
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask, nodemask_t *mask);
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index c0f75261a728..6adfb7bfbf44 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -54,7 +54,7 @@ static inline pgoff_t swp_offset(swp_entry_t entry)
/* check whether a pte points to a swap entry */
static inline int is_swap_pte(pte_t pte)
{
- return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
+ return !pte_none(pte) && !pte_present_nonuma(pte) && !pte_file(pte);
}
#endif
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index a5ffd32642fd..e7a018eaf3a2 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -116,4 +116,6 @@ static inline void swiotlb_free(void) { }
#endif
extern void swiotlb_print_info(void);
+extern int is_swiotlb_buffer(phys_addr_t paddr);
+
#endif /* __LINUX_SWIOTLB_H */
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index cb0cec94fda3..ff307b548ed3 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -61,8 +61,6 @@ extern long do_no_restart_syscall(struct restart_block *parm);
# define THREADINFO_GFP (GFP_KERNEL | __GFP_NOTRACK)
#endif
-#define THREADINFO_GFP_ACCOUNTED (THREADINFO_GFP | __GFP_KMEMCG)
-
/*
* flag set/clear/test wrappers
* - pass TIF_xxxx constants to these functions
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 7062330a1329..53261e285889 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -58,7 +58,8 @@ int arch_update_cpu_topology(void);
/*
* If the distance between nodes in a system is larger than RECLAIM_DISTANCE
* (in whatever arch specific measurement units returned by node_distance())
- * then switch on zone reclaim on boot.
+ * and zone_reclaim_mode is enabled then the VM will only call zone_reclaim()
+ * on nodes within this distance.
*/
#define RECLAIM_DISTANCE 30
#endif
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 486c3972c0be..ced92345c963 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -80,6 +80,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
NR_TLB_LOCAL_FLUSH_ALL,
NR_TLB_LOCAL_FLUSH_ONE,
#endif /* CONFIG_DEBUG_TLBFLUSH */
+#ifdef CONFIG_DEBUG_VM_VMACACHE
+ VMACACHE_FIND_CALLS,
+ VMACACHE_FIND_HITS,
+#endif
NR_VM_EVENT_ITEMS
};
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 45c9cd1daf7a..82e7db7f7100 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -95,6 +95,12 @@ static inline void vm_events_fold_cpu(int cpu)
#define count_vm_tlb_events(x, y) do { (void)(y); } while (0)
#endif
+#ifdef CONFIG_DEBUG_VM_VMACACHE
+#define count_vm_vmacache_event(x) count_vm_event(x)
+#else
+#define count_vm_vmacache_event(x) do {} while (0)
+#endif
+
#define __count_zone_vm_events(item, zone, delta) \
__count_vm_events(item##_NORMAL - ZONE_NORMAL + \
zone_idx(zone), delta)
diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h
index 0a4edfe8af51..d34cf2df093b 100644
--- a/include/scsi/scsi.h
+++ b/include/scsi/scsi.h
@@ -31,7 +31,7 @@ enum scsi_timeouts {
* Like SCSI_MAX_SG_SEGMENTS, but for archs that have sg chaining. This limit
* is totally arbitrary, a setting of 2048 will get you at least 8mb ios.
*/
-#ifdef ARCH_HAS_SG_CHAIN
+#ifdef CONFIG_ARCH_HAS_SG_CHAIN
#define SCSI_MAX_SG_CHAIN_SEGMENTS 2048
#else
#define SCSI_MAX_SG_CHAIN_SEGMENTS SCSI_MAX_SG_SEGMENTS
diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h
index 1eddbf1557f2..d6fd8e5b14b7 100644
--- a/include/trace/events/gfpflags.h
+++ b/include/trace/events/gfpflags.h
@@ -34,7 +34,6 @@
{(unsigned long)__GFP_HARDWALL, "GFP_HARDWALL"}, \
{(unsigned long)__GFP_THISNODE, "GFP_THISNODE"}, \
{(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \
- {(unsigned long)__GFP_KMEMCG, "GFP_KMEMCG"}, \
{(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \
{(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \
{(unsigned long)__GFP_NO_KSWAPD, "GFP_NO_KSWAPD"}, \
diff --git a/init/Kconfig b/init/Kconfig
index 0f0f351029eb..66aa1802449d 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -933,7 +933,6 @@ config RESOURCE_COUNTERS
config MEMCG
bool "Memory Resource Controller for Control Groups"
depends on RESOURCE_COUNTERS
- select MM_OWNER
select EVENTFD
help
Provides a memory resource controller that manages both anonymous
@@ -951,9 +950,6 @@ config MEMCG
disable memory resource controller and you can avoid overheads.
(and lose benefits of memory resource controller)
- This config option also selects MM_OWNER config option, which
- could in turn add some fork/exit overhead.
-
config MEMCG_SWAP
bool "Memory Resource Controller Swap Extension"
depends on MEMCG && SWAP
@@ -996,6 +992,12 @@ config MEMCG_KMEM
the kmem extension can use it to guarantee that no group of processes
will ever exhaust kernel resources alone.
+ WARNING: Current implementation lacks reclaim support. That means
+ allocation attempts will fail when close to the limit even if there
+ are plenty of kmem available for reclaim. That makes this option
+ unusable in real life so DO NOT SELECT IT unless for development
+ purposes.
+
config CGROUP_HUGETLB
bool "HugeTLB Resource Controller for Control Groups"
depends on RESOURCE_COUNTERS && HUGETLB_PAGE
@@ -1173,9 +1175,6 @@ config SCHED_AUTOGROUP
desktop applications. Task group autogeneration is currently based
upon task session.
-config MM_OWNER
- bool
-
config SYSFS_DEPRECATED
bool "Enable deprecated sysfs features to support old userspace tools"
depends on SYSFS
@@ -1375,6 +1374,16 @@ config UID16
help
This enables the legacy 16-bit UID syscall wrappers.
+config SGETMASK_SYSCALL
+ bool "sgetmask/ssetmask syscalls support" if EXPERT
+ def_bool PARISC || MN10300 || BLACKFIN || M68K || PPC || MIPS || X86 || SPARC || CRIS || MICROBLAZE || SUPERH
+ ---help---
+ sys_sgetmask and sys_ssetmask are obsolete system calls
+ no longer supported in libc but still enabled by default in some
+ architectures.
+
+ If unsure, leave the default option here.
+
config SYSFS_SYSCALL
bool "Sysfs syscall support" if EXPERT
default y
diff --git a/init/main.c b/init/main.c
index 9c7fd4c9249f..1a14531b61f3 100644
--- a/init/main.c
+++ b/init/main.c
@@ -77,6 +77,7 @@
#include <linux/sched_clock.h>
#include <linux/context_tracking.h>
#include <linux/random.h>
+#include <linux/list.h>
#include <asm/io.h>
#include <asm/bugs.h>
@@ -379,7 +380,7 @@ static noinline void __init_refok rest_init(void)
* the init task will end up wanting to create kthreads, which, if
* we schedule it before we create kthreadd, will OOPS.
*/
- kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND);
+ kernel_thread(kernel_init, NULL, CLONE_FS);
numa_default_policy();
pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
rcu_read_lock();
@@ -666,19 +667,83 @@ static void __init do_ctors(void)
bool initcall_debug;
core_param(initcall_debug, initcall_debug, bool, 0644);
+#ifdef CONFIG_KALLSYMS
+struct blacklist_entry {
+ struct list_head next;
+ char *buf;
+};
+
+static __initdata_or_module LIST_HEAD(blacklisted_initcalls);
+
+static int __init initcall_blacklist(char *str)
+{
+ char *str_entry;
+ struct blacklist_entry *entry;
+
+ /* str argument is a comma-separated list of functions */
+ do {
+ str_entry = strsep(&str, ",");
+ if (str_entry) {
+ pr_debug("blacklisting initcall %s\n", str_entry);
+ entry = alloc_bootmem(sizeof(*entry));
+ entry->buf = alloc_bootmem(strlen(str_entry) + 1);
+ strcpy(entry->buf, str_entry);
+ list_add(&entry->next, &blacklisted_initcalls);
+ }
+ } while (str_entry);
+
+ return 0;
+}
+
+static bool __init_or_module initcall_blacklisted(initcall_t fn)
+{
+ struct list_head *tmp;
+ struct blacklist_entry *entry;
+ char *fn_name;
+
+ fn_name = kasprintf(GFP_KERNEL, "%pf", fn);
+ if (!fn_name)
+ return false;
+
+ list_for_each(tmp, &blacklisted_initcalls) {
+ entry = list_entry(tmp, struct blacklist_entry, next);
+ if (!strcmp(fn_name, entry->buf)) {
+ pr_debug("initcall %s blacklisted\n", fn_name);
+ kfree(fn_name);
+ return true;
+ }
+ }
+
+ kfree(fn_name);
+ return false;
+}
+#else
+static int __init initcall_blacklist(char *str)
+{
+ pr_warn("initcall_blacklist requires CONFIG_KALLSYMS\n");
+ return 0;
+}
+
+static bool __init_or_module initcall_blacklisted(initcall_t fn)
+{
+ return false;
+}
+#endif
+__setup("initcall_blacklist=", initcall_blacklist);
+
static int __init_or_module do_one_initcall_debug(initcall_t fn)
{
ktime_t calltime, delta, rettime;
unsigned long long duration;
int ret;
- pr_debug("calling %pF @ %i\n", fn, task_pid_nr(current));
+ printk(KERN_DEBUG "calling %pF @ %i\n", fn, task_pid_nr(current));
calltime = ktime_get();
ret = fn();
rettime = ktime_get();
delta = ktime_sub(rettime, calltime);
duration = (unsigned long long) ktime_to_ns(delta) >> 10;
- pr_debug("initcall %pF returned %d after %lld usecs\n",
+ printk(KERN_DEBUG "initcall %pF returned %d after %lld usecs\n",
fn, ret, duration);
return ret;
@@ -690,6 +755,9 @@ int __init_or_module do_one_initcall(initcall_t fn)
int ret;
char msgbuf[64];
+ if (initcall_blacklisted(fn))
+ return -EPERM;
+
if (initcall_debug)
ret = do_one_initcall_debug(fn);
else
diff --git a/ipc/compat.c b/ipc/compat.c
index 45d035d4cedc..b5ef4f7946dc 100644
--- a/ipc/compat.c
+++ b/ipc/compat.c
@@ -30,7 +30,7 @@
#include <linux/ptrace.h>
#include <linux/mutex.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "util.h"
diff --git a/ipc/compat_mq.c b/ipc/compat_mq.c
index 90d29f59cac6..ef6f91cc4490 100644
--- a/ipc/compat_mq.c
+++ b/ipc/compat_mq.c
@@ -12,7 +12,7 @@
#include <linux/mqueue.h>
#include <linux/syscalls.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
struct compat_mq_attr {
compat_long_t mq_flags; /* message queue flags */
diff --git a/ipc/msg.c b/ipc/msg.c
index 649853105a5d..7ed1ef338e76 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -39,7 +39,7 @@
#include <linux/ipc_namespace.h>
#include <asm/current.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "util.h"
/*
@@ -306,15 +306,14 @@ static inline int msg_security(struct kern_ipc_perm *ipcp, int msgflg)
SYSCALL_DEFINE2(msgget, key_t, key, int, msgflg)
{
struct ipc_namespace *ns;
- struct ipc_ops msg_ops;
+ static const struct ipc_ops msg_ops = {
+ .getnew = newque,
+ .associate = msg_security,
+ };
struct ipc_params msg_params;
ns = current->nsproxy->ipc_ns;
- msg_ops.getnew = newque;
- msg_ops.associate = msg_security;
- msg_ops.more_checks = NULL;
-
msg_params.key = key;
msg_params.flg = msgflg;
@@ -612,23 +611,22 @@ SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf)
static int testmsg(struct msg_msg *msg, long type, int mode)
{
- switch (mode)
- {
- case SEARCH_ANY:
- case SEARCH_NUMBER:
+ switch (mode) {
+ case SEARCH_ANY:
+ case SEARCH_NUMBER:
+ return 1;
+ case SEARCH_LESSEQUAL:
+ if (msg->m_type <= type)
return 1;
- case SEARCH_LESSEQUAL:
- if (msg->m_type <= type)
- return 1;
- break;
- case SEARCH_EQUAL:
- if (msg->m_type == type)
- return 1;
- break;
- case SEARCH_NOTEQUAL:
- if (msg->m_type != type)
- return 1;
- break;
+ break;
+ case SEARCH_EQUAL:
+ if (msg->m_type == type)
+ return 1;
+ break;
+ case SEARCH_NOTEQUAL:
+ if (msg->m_type != type)
+ return 1;
+ break;
}
return 0;
}
diff --git a/ipc/sem.c b/ipc/sem.c
index bee555417312..fe0928a3d08b 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -87,7 +87,7 @@
#include <linux/nsproxy.h>
#include <linux/ipc_namespace.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "util.h"
/* One semaphore structure for each semaphore in the system. */
@@ -160,7 +160,7 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
* sem_array.pending{_alter,_cont},
* sem_array.sem_undo: global sem_lock() for read/write
* sem_undo.proc_next: only "current" is allowed to read/write that field.
- *
+ *
* sem_array.sem_base[i].pending_{const,alter}:
* global or semaphore sem_lock() for read/write
*/
@@ -564,7 +564,11 @@ static inline int sem_more_checks(struct kern_ipc_perm *ipcp,
SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg)
{
struct ipc_namespace *ns;
- struct ipc_ops sem_ops;
+ static const struct ipc_ops sem_ops = {
+ .getnew = newary,
+ .associate = sem_security,
+ .more_checks = sem_more_checks,
+ };
struct ipc_params sem_params;
ns = current->nsproxy->ipc_ns;
@@ -572,10 +576,6 @@ SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg)
if (nsems < 0 || nsems > ns->sc_semmsl)
return -EINVAL;
- sem_ops.getnew = newary;
- sem_ops.associate = sem_security;
- sem_ops.more_checks = sem_more_checks;
-
sem_params.key = key;
sem_params.flg = semflg;
sem_params.u.nsems = nsems;
@@ -1161,7 +1161,7 @@ static int semctl_nolock(struct ipc_namespace *ns, int semid,
err = security_sem_semctl(NULL, cmd);
if (err)
return err;
-
+
memset(&seminfo, 0, sizeof(seminfo));
seminfo.semmni = ns->sc_semmni;
seminfo.semmns = ns->sc_semmns;
@@ -1181,7 +1181,7 @@ static int semctl_nolock(struct ipc_namespace *ns, int semid,
}
max_id = ipc_get_maxid(&sem_ids(ns));
up_read(&sem_ids(ns).rwsem);
- if (copy_to_user(p, &seminfo, sizeof(struct seminfo)))
+ if (copy_to_user(p, &seminfo, sizeof(struct seminfo)))
return -EFAULT;
return (max_id < 0) ? 0 : max_id;
}
@@ -1883,7 +1883,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
/* We need to sleep on this operation, so we put the current
* task into the pending queue and go to sleep.
*/
-
+
queue.sops = sops;
queue.nsops = nsops;
queue.undo = un;
@@ -2016,7 +2016,7 @@ int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
return error;
atomic_inc(&undo_list->refcnt);
tsk->sysvsem.undo_list = undo_list;
- } else
+ } else
tsk->sysvsem.undo_list = NULL;
return 0;
diff --git a/ipc/shm.c b/ipc/shm.c
index 76459616a7fa..2b64b0d25bba 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -43,7 +43,7 @@
#include <linux/mount.h>
#include <linux/ipc_namespace.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "util.h"
@@ -609,15 +609,15 @@ static inline int shm_more_checks(struct kern_ipc_perm *ipcp,
SYSCALL_DEFINE3(shmget, key_t, key, size_t, size, int, shmflg)
{
struct ipc_namespace *ns;
- struct ipc_ops shm_ops;
+ static const struct ipc_ops shm_ops = {
+ .getnew = newseg,
+ .associate = shm_security,
+ .more_checks = shm_more_checks,
+ };
struct ipc_params shm_params;
ns = current->nsproxy->ipc_ns;
- shm_ops.getnew = newseg;
- shm_ops.associate = shm_security;
- shm_ops.more_checks = shm_more_checks;
-
shm_params.key = key;
shm_params.flg = shmflg;
shm_params.u.size = size;
@@ -694,7 +694,7 @@ static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminf
out.shmmin = in->shmmin;
out.shmmni = in->shmmni;
out.shmseg = in->shmseg;
- out.shmall = in->shmall;
+ out.shmall = in->shmall;
return copy_to_user(buf, &out, sizeof(out));
}
diff --git a/ipc/util.c b/ipc/util.c
index 2eb0d1eaa312..27d74e69fd57 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -183,7 +183,7 @@ void __init ipc_init_proc_interface(const char *path, const char *header,
* ipc_findkey - find a key in an ipc identifier set
* @ids: ipc identifier set
* @key: key to find
- *
+ *
* Returns the locked pointer to the ipc structure if found or NULL
* otherwise. If key is found ipc points to the owning ipc structure
*
@@ -317,7 +317,7 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int size)
* when the key is IPC_PRIVATE.
*/
static int ipcget_new(struct ipc_namespace *ns, struct ipc_ids *ids,
- struct ipc_ops *ops, struct ipc_params *params)
+ const struct ipc_ops *ops, struct ipc_params *params)
{
int err;
@@ -344,7 +344,7 @@ static int ipcget_new(struct ipc_namespace *ns, struct ipc_ids *ids,
*/
static int ipc_check_perms(struct ipc_namespace *ns,
struct kern_ipc_perm *ipcp,
- struct ipc_ops *ops,
+ const struct ipc_ops *ops,
struct ipc_params *params)
{
int err;
@@ -375,7 +375,7 @@ static int ipc_check_perms(struct ipc_namespace *ns,
* On success, the ipc id is returned.
*/
static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids,
- struct ipc_ops *ops, struct ipc_params *params)
+ const struct ipc_ops *ops, struct ipc_params *params)
{
struct kern_ipc_perm *ipcp;
int flg = params->flg;
@@ -538,7 +538,7 @@ int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flag)
else if (in_group_p(ipcp->cgid) || in_group_p(ipcp->gid))
granted_mode >>= 3;
/* is there some bit set in requested_mode but not in granted_mode? */
- if ((requested_mode & ~granted_mode & 0007) &&
+ if ((requested_mode & ~granted_mode & 0007) &&
!ns_capable(ns->user_ns, CAP_IPC_OWNER))
return -1;
@@ -678,7 +678,7 @@ out:
* Common routine called by sys_msgget(), sys_semget() and sys_shmget().
*/
int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids,
- struct ipc_ops *ops, struct ipc_params *params)
+ const struct ipc_ops *ops, struct ipc_params *params)
{
if (params->key == IPC_PRIVATE)
return ipcget_new(ns, ids, ops, params);
diff --git a/ipc/util.h b/ipc/util.h
index 9c47d6f6c7b4..1a5a0fcd099c 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -78,9 +78,9 @@ struct ipc_params {
* . routine to call for an extra check if needed
*/
struct ipc_ops {
- int (*getnew) (struct ipc_namespace *, struct ipc_params *);
- int (*associate) (struct kern_ipc_perm *, int);
- int (*more_checks) (struct kern_ipc_perm *, struct ipc_params *);
+ int (*getnew)(struct ipc_namespace *, struct ipc_params *);
+ int (*associate)(struct kern_ipc_perm *, int);
+ int (*more_checks)(struct kern_ipc_perm *, struct ipc_params *);
};
struct seq_file;
@@ -142,7 +142,7 @@ struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns,
struct ipc64_perm *perm, int extra_perm);
#ifndef CONFIG_ARCH_WANT_IPC_PARSE_VERSION
- /* On IA-64, we always use the "64-bit version" of the IPC structures. */
+/* On IA-64, we always use the "64-bit version" of the IPC structures. */
# define ipc_parse_version(cmd) IPC_64
#else
int ipc_parse_version(int *cmd);
@@ -201,7 +201,7 @@ static inline bool ipc_valid_object(struct kern_ipc_perm *perm)
struct kern_ipc_perm *ipc_obtain_object_check(struct ipc_ids *ids, int id);
int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids,
- struct ipc_ops *ops, struct ipc_params *params);
+ const struct ipc_ops *ops, struct ipc_params *params);
void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
void (*free)(struct ipc_namespace *, struct kern_ipc_perm *));
#endif
diff --git a/kernel/acct.c b/kernel/acct.c
index 1853dd4a1d01..d17810f056cf 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -55,7 +55,7 @@
#include <linux/times.h>
#include <linux/syscalls.h>
#include <linux/mount.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/div64.h>
#include <linux/blkdev.h> /* sector_div */
#include <linux/pid_namespace.h>
@@ -135,7 +135,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
spin_lock(&acct_lock);
if (file != acct->file) {
if (act)
- res = act>0;
+ res = act > 0;
goto out;
}
@@ -263,7 +263,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
if (name) {
struct filename *tmp = getname(name);
if (IS_ERR(tmp))
- return (PTR_ERR(tmp));
+ return PTR_ERR(tmp);
error = acct_on(tmp);
putname(tmp);
} else {
diff --git a/kernel/audit.c b/kernel/audit.c
index 7c2893602d06..dfef723024d6 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -44,7 +44,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/init.h>
-#include <asm/types.h>
+#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/mm.h>
#include <linux/export.h>
diff --git a/kernel/exit.c b/kernel/exit.c
index 6ed6a1d552b5..3b447b9c9f60 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -313,46 +313,7 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
}
}
-/*
- * Let kernel threads use this to say that they allow a certain signal.
- * Must not be used if kthread was cloned with CLONE_SIGHAND.
- */
-int allow_signal(int sig)
-{
- if (!valid_signal(sig) || sig < 1)
- return -EINVAL;
-
- spin_lock_irq(&current->sighand->siglock);
- /* This is only needed for daemonize()'ed kthreads */
- sigdelset(&current->blocked, sig);
- /*
- * Kernel threads handle their own signals. Let the signal code
- * know it'll be handled, so that they don't get converted to
- * SIGKILL or just silently dropped.
- */
- current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
- return 0;
-}
-
-EXPORT_SYMBOL(allow_signal);
-
-int disallow_signal(int sig)
-{
- if (!valid_signal(sig) || sig < 1)
- return -EINVAL;
-
- spin_lock_irq(&current->sighand->siglock);
- current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
- return 0;
-}
-
-EXPORT_SYMBOL(disallow_signal);
-
-#ifdef CONFIG_MM_OWNER
+#ifdef CONFIG_MEMCG
/*
* A task is exiting. If it owned this mm, find a new owner for the mm.
*/
@@ -434,7 +395,7 @@ assign_new_owner:
task_unlock(c);
put_task_struct(c);
}
-#endif /* CONFIG_MM_OWNER */
+#endif /* CONFIG_MEMCG */
/*
* Turn us into a lazy TLB process if we
diff --git a/kernel/fork.c b/kernel/fork.c
index 54a8d26f612f..0d53eb0dfb6f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -150,15 +150,15 @@ void __weak arch_release_thread_info(struct thread_info *ti)
static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
int node)
{
- struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED,
- THREAD_SIZE_ORDER);
+ struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
+ THREAD_SIZE_ORDER);
return page ? page_address(page) : NULL;
}
static inline void free_thread_info(struct thread_info *ti)
{
- free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
+ free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
}
# else
static struct kmem_cache *thread_info_cache;
@@ -1099,12 +1099,12 @@ static void rt_mutex_init_task(struct task_struct *p)
#endif
}
-#ifdef CONFIG_MM_OWNER
+#ifdef CONFIG_MEMCG
void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
{
mm->owner = p;
}
-#endif /* CONFIG_MM_OWNER */
+#endif /* CONFIG_MEMCG */
/*
* Initialize POSIX timer handling for a single task.
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c8380ad203bc..e6f3aecde543 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1622,6 +1622,7 @@ static int __init crash_save_vmcoreinfo_init(void)
#ifdef CONFIG_MEMORY_FAILURE
VMCOREINFO_NUMBER(PG_hwpoison);
#endif
+ VMCOREINFO_NUMBER(PG_head_mask);
VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
arch_crash_save_vmcoreinfo();
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 0ac67a5861c5..8637e041a247 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -285,10 +285,7 @@ static int wait_for_helper(void *data)
pid_t pid;
/* If SIGCLD is ignored sys_wait4 won't populate the status. */
- spin_lock_irq(&current->sighand->siglock);
- current->sighand->action[SIGCHLD-1].sa.sa_handler = SIG_DFL;
- spin_unlock_irq(&current->sighand->siglock);
-
+ kernel_sigaction(SIGCHLD, SIG_DFL);
pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
if (pid < 0) {
sub_info->retval = pid;
diff --git a/kernel/panic.c b/kernel/panic.c
index d02fa9fef46a..62e16cef9cc2 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -32,6 +32,7 @@ static unsigned long tainted_mask;
static int pause_on_oops;
static int pause_on_oops_flag;
static DEFINE_SPINLOCK(pause_on_oops_lock);
+static bool crash_kexec_post_notifiers;
int panic_timeout = CONFIG_PANIC_TIMEOUT;
EXPORT_SYMBOL_GPL(panic_timeout);
@@ -112,9 +113,11 @@ void panic(const char *fmt, ...)
/*
* If we have crashed and we have a crash kernel loaded let it handle
* everything else.
- * Do we want to call this before we try to display a message?
+ * If we want to run this after calling panic_notifiers, pass
+ * the "crash_kexec_post_notifiers" option to the kernel.
*/
- crash_kexec(NULL);
+ if (!crash_kexec_post_notifiers)
+ crash_kexec(NULL);
/*
* Note smp_send_stop is the usual smp shutdown function, which
@@ -131,6 +134,15 @@ void panic(const char *fmt, ...)
kmsg_dump(KMSG_DUMP_PANIC);
+ /*
+ * If you doubt kdump always works fine in any situation,
+ * "crash_kexec_post_notifiers" offers you a chance to run
+ * panic_notifiers and dumping kmsg before kdump.
+ * Note: since some panic_notifiers can make crashed kernel
+ * more unstable, it can increase risks of the kdump failure too.
+ */
+ crash_kexec(NULL);
+
bust_spinlocks(0);
if (!panic_blink)
@@ -472,6 +484,13 @@ EXPORT_SYMBOL(__stack_chk_fail);
core_param(panic, panic_timeout, int, 0644);
core_param(pause_on_oops, pause_on_oops, int, 0644);
+static int __init setup_crash_kexec_post_notifiers(char *s)
+{
+ crash_kexec_post_notifiers = true;
+ return 0;
+}
+early_param("crash_kexec_post_notifiers", setup_crash_kexec_post_notifiers);
+
static int __init oops_setup(char *s)
{
if (!s)
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index a45b50962295..d3aa0c92b28b 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -297,34 +297,106 @@ static u32 log_next(u32 idx)
return idx + msg->len;
}
-/* insert record into the buffer, discard old ones, update heads */
-static void log_store(int facility, int level,
- enum log_flags flags, u64 ts_nsec,
- const char *dict, u16 dict_len,
- const char *text, u16 text_len)
+/*
+ * Check whether there is enough free space for the given message.
+ *
+ * The same values of first_idx and next_idx mean that the buffer
+ * is either empty or full.
+ *
+ * If the buffer is empty, we must respect the position of the indexes.
+ * They cannot be reset to the beginning of the buffer.
+ */
+static int logbuf_has_space(u32 msg_size, bool empty)
{
- struct printk_log *msg;
- u32 size, pad_len;
+ u32 free;
- /* number of '\0' padding bytes to next message */
- size = sizeof(struct printk_log) + text_len + dict_len;
- pad_len = (-size) & (LOG_ALIGN - 1);
- size += pad_len;
+ if (log_next_idx > log_first_idx || empty)
+ free = max(log_buf_len - log_next_idx, log_first_idx);
+ else
+ free = log_first_idx - log_next_idx;
+ /*
+ * We need space also for an empty header that signalizes wrapping
+ * of the buffer.
+ */
+ return free >= msg_size + sizeof(struct printk_log);
+}
+
+static int log_make_free_space(u32 msg_size)
+{
while (log_first_seq < log_next_seq) {
- u32 free;
+ if (logbuf_has_space(msg_size, false))
+ return 0;
+ /* drop old messages until we have enough continuous space */
+ log_first_idx = log_next(log_first_idx);
+ log_first_seq++;
+ }
- if (log_next_idx > log_first_idx)
- free = max(log_buf_len - log_next_idx, log_first_idx);
- else
- free = log_first_idx - log_next_idx;
+ /* sequence numbers are equal, so the log buffer is empty */
+ if (logbuf_has_space(msg_size, true))
+ return 0;
- if (free >= size + sizeof(struct printk_log))
- break;
+ return -ENOMEM;
+}
- /* drop old messages until we have enough contiuous space */
- log_first_idx = log_next(log_first_idx);
- log_first_seq++;
+/* compute the message size including the padding bytes */
+static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len)
+{
+ u32 size;
+
+ size = sizeof(struct printk_log) + text_len + dict_len;
+ *pad_len = (-size) & (LOG_ALIGN - 1);
+ size += *pad_len;
+
+ return size;
+}
+
+/*
+ * Define how much of the log buffer we could take at maximum. The value
+ * must be greater than two. Note that only half of the buffer is available
+ * when the index points to the middle.
+ */
+#define MAX_LOG_TAKE_PART 4
+static const char trunc_msg[] = "<truncated>";
+
+static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len,
+ u16 *dict_len, u32 *pad_len)
+{
+ /*
+ * The message should not take the whole buffer. Otherwise, it might
+ * get removed too soon.
+ */
+ u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART;
+ if (*text_len > max_text_len)
+ *text_len = max_text_len;
+ /* enable the warning message */
+ *trunc_msg_len = strlen(trunc_msg);
+ /* disable the "dict" completely */
+ *dict_len = 0;
+ /* compute the size again, count also the warning message */
+ return msg_used_size(*text_len + *trunc_msg_len, 0, pad_len);
+}
+
+/* insert record into the buffer, discard old ones, update heads */
+static int log_store(int facility, int level,
+ enum log_flags flags, u64 ts_nsec,
+ const char *dict, u16 dict_len,
+ const char *text, u16 text_len)
+{
+ struct printk_log *msg;
+ u32 size, pad_len;
+ u16 trunc_msg_len = 0;
+
+ /* number of '\0' padding bytes to next message */
+ size = msg_used_size(text_len, dict_len, &pad_len);
+
+ if (log_make_free_space(size)) {
+ /* truncate the message if it is too long for empty buffer */
+ size = truncate_msg(&text_len, &trunc_msg_len,
+ &dict_len, &pad_len);
+ /* survive when the log buffer is too small for trunc_msg */
+ if (log_make_free_space(size))
+ return 0;
}
if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) {
@@ -341,6 +413,10 @@ static void log_store(int facility, int level,
msg = (struct printk_log *)(log_buf + log_next_idx);
memcpy(log_text(msg), text, text_len);
msg->text_len = text_len;
+ if (trunc_msg_len) {
+ memcpy(log_text(msg) + text_len, trunc_msg, trunc_msg_len);
+ msg->text_len += trunc_msg_len;
+ }
memcpy(log_dict(msg), dict, dict_len);
msg->dict_len = dict_len;
msg->facility = facility;
@@ -356,6 +432,8 @@ static void log_store(int facility, int level,
/* insert message */
log_next_idx += msg->len;
log_next_seq++;
+
+ return msg->text_len;
}
#ifdef CONFIG_SECURITY_DMESG_RESTRICT
@@ -1530,10 +1608,10 @@ asmlinkage int vprintk_emit(int facility, int level,
"BUG: recent printk recursion!";
recursion_bug = 0;
- printed_len += strlen(recursion_msg);
+ text_len = strlen(recursion_msg);
/* emit KERN_CRIT message */
- log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
- NULL, 0, recursion_msg, printed_len);
+ printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
+ NULL, 0, recursion_msg, text_len);
}
/*
@@ -1586,9 +1664,12 @@ asmlinkage int vprintk_emit(int facility, int level,
cont_flush(LOG_NEWLINE);
/* buffer line if possible, otherwise store it right away */
- if (!cont_add(facility, level, text, text_len))
- log_store(facility, level, lflags | LOG_CONT, 0,
- dict, dictlen, text, text_len);
+ if (cont_add(facility, level, text, text_len))
+ printed_len += text_len;
+ else
+ printed_len += log_store(facility, level,
+ lflags | LOG_CONT, 0,
+ dict, dictlen, text, text_len);
} else {
bool stored = false;
@@ -1607,11 +1688,12 @@ asmlinkage int vprintk_emit(int facility, int level,
cont_flush(LOG_NEWLINE);
}
- if (!stored)
- log_store(facility, level, lflags, 0,
- dict, dictlen, text, text_len);
+ if (stored)
+ printed_len += text_len;
+ else
+ printed_len += log_store(facility, level, lflags, 0,
+ dict, dictlen, text, text_len);
}
- printed_len += text_len;
/*
* Try to acquire and then immediately release the console semaphore.
diff --git a/kernel/signal.c b/kernel/signal.c
index 6ea13c09ae56..cc3b1938b7dd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -705,11 +705,8 @@ void signal_wake_up_state(struct task_struct *t, unsigned int state)
* Returns 1 if any signals were found.
*
* All callers must be holding the siglock.
- *
- * This version takes a sigset mask and looks at all signals,
- * not just those in the first mask word.
*/
-static int rm_from_queue_full(sigset_t *mask, struct sigpending *s)
+static int flush_sigqueue_mask(sigset_t *mask, struct sigpending *s)
{
struct sigqueue *q, *n;
sigset_t m;
@@ -727,29 +724,6 @@ static int rm_from_queue_full(sigset_t *mask, struct sigpending *s)
}
return 1;
}
-/*
- * Remove signals in mask from the pending set and queue.
- * Returns 1 if any signals were found.
- *
- * All callers must be holding the siglock.
- */
-static int rm_from_queue(unsigned long mask, struct sigpending *s)
-{
- struct sigqueue *q, *n;
-
- if (!sigtestsetmask(&s->signal, mask))
- return 0;
-
- sigdelsetmask(&s->signal, mask);
- list_for_each_entry_safe(q, n, &s->list, list) {
- if (q->info.si_signo < SIGRTMIN &&
- (mask & sigmask(q->info.si_signo))) {
- list_del_init(&q->list);
- __sigqueue_free(q);
- }
- }
- return 1;
-}
static inline int is_si_special(const struct siginfo *info)
{
@@ -861,6 +835,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
{
struct signal_struct *signal = p->signal;
struct task_struct *t;
+ sigset_t flush;
if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) {
if (signal->flags & SIGNAL_GROUP_COREDUMP)
@@ -872,26 +847,25 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
/*
* This is a stop signal. Remove SIGCONT from all queues.
*/
- rm_from_queue(sigmask(SIGCONT), &signal->shared_pending);
- t = p;
- do {
- rm_from_queue(sigmask(SIGCONT), &t->pending);
- } while_each_thread(p, t);
+ siginitset(&flush, sigmask(SIGCONT));
+ flush_sigqueue_mask(&flush, &signal->shared_pending);
+ for_each_thread(p, t)
+ flush_sigqueue_mask(&flush, &t->pending);
} else if (sig == SIGCONT) {
unsigned int why;
/*
* Remove all stop signals from all queues, wake all threads.
*/
- rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending);
- t = p;
- do {
+ siginitset(&flush, SIG_KERNEL_STOP_MASK);
+ flush_sigqueue_mask(&flush, &signal->shared_pending);
+ for_each_thread(p, t) {
+ flush_sigqueue_mask(&flush, &t->pending);
task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
- rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
if (likely(!(t->ptrace & PT_SEIZED)))
wake_up_state(t, __TASK_STOPPED);
else
ptrace_trap_notify(t);
- } while_each_thread(p, t);
+ }
/*
* Notify the parent with CLD_CONTINUED if we were stopped.
@@ -2854,7 +2828,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
spin_lock_irq(&tsk->sighand->siglock);
__set_task_blocked(tsk, &tsk->real_blocked);
- siginitset(&tsk->real_blocked, 0);
+ sigemptyset(&tsk->real_blocked);
sig = dequeue_signal(tsk, &mask, info);
}
spin_unlock_irq(&tsk->sighand->siglock);
@@ -3091,18 +3065,39 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo,
}
#endif
+/*
+ * For kthreads only, must not be used if cloned with CLONE_SIGHAND
+ */
+void kernel_sigaction(int sig, __sighandler_t action)
+{
+ spin_lock_irq(&current->sighand->siglock);
+ current->sighand->action[sig - 1].sa.sa_handler = action;
+ if (action == SIG_IGN) {
+ sigset_t mask;
+
+ sigemptyset(&mask);
+ sigaddset(&mask, sig);
+
+ flush_sigqueue_mask(&mask, &current->signal->shared_pending);
+ flush_sigqueue_mask(&mask, &current->pending);
+ recalc_sigpending();
+ }
+ spin_unlock_irq(&current->sighand->siglock);
+}
+EXPORT_SYMBOL(kernel_sigaction);
+
int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
{
- struct task_struct *t = current;
+ struct task_struct *p = current, *t;
struct k_sigaction *k;
sigset_t mask;
if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
return -EINVAL;
- k = &t->sighand->action[sig-1];
+ k = &p->sighand->action[sig-1];
- spin_lock_irq(&current->sighand->siglock);
+ spin_lock_irq(&p->sighand->siglock);
if (oact)
*oact = *k;
@@ -3121,21 +3116,20 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
* (for example, SIGCHLD), shall cause the pending signal to
* be discarded, whether or not it is blocked"
*/
- if (sig_handler_ignored(sig_handler(t, sig), sig)) {
+ if (sig_handler_ignored(sig_handler(p, sig), sig)) {
sigemptyset(&mask);
sigaddset(&mask, sig);
- rm_from_queue_full(&mask, &t->signal->shared_pending);
- do {
- rm_from_queue_full(&mask, &t->pending);
- } while_each_thread(current, t);
+ flush_sigqueue_mask(&mask, &p->signal->shared_pending);
+ for_each_thread(p, t)
+ flush_sigqueue_mask(&mask, &t->pending);
}
}
- spin_unlock_irq(&current->sighand->siglock);
+ spin_unlock_irq(&p->sighand->siglock);
return 0;
}
-static int
+static int
do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp)
{
stack_t oss;
@@ -3496,7 +3490,7 @@ COMPAT_SYSCALL_DEFINE3(sigaction, int, sig,
}
#endif
-#ifdef __ARCH_WANT_SYS_SGETMASK
+#ifdef CONFIG_SGETMASK_SYSCALL
/*
* For backwards compatibility. Functionality superseded by sigprocmask.
@@ -3517,7 +3511,7 @@ SYSCALL_DEFINE1(ssetmask, int, newmask)
return old;
}
-#endif /* __ARCH_WANT_SGETMASK */
+#endif /* CONFIG_SGETMASK_SYSCALL */
#ifdef __ARCH_WANT_SYS_SIGNAL
/*
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index bc8d1b74a6b9..36441b51b5df 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -135,6 +135,8 @@ cond_syscall(sys_setresgid16);
cond_syscall(sys_setresuid16);
cond_syscall(sys_setreuid16);
cond_syscall(sys_setuid16);
+cond_syscall(sys_sgetmask);
+cond_syscall(sys_ssetmask);
cond_syscall(sys_vm86old);
cond_syscall(sys_vm86);
cond_syscall(sys_ipc);
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 4d23dc4d8139..5038b4d3b76d 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -154,6 +154,10 @@ void __init sched_clock_register(u64 (*read)(void), int bits,
raw_write_seqcount_end(&cd.seq);
r = rate;
+ /*
+ * Use 4MHz instead of 1MHz so that things like 1.832Mhz show as
+ * 1832Khz
+ */
if (r >= 4000000) {
r /= 1000000;
r_unit = 'M';
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 516203e665fc..89c5939700cc 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -243,10 +243,12 @@ static void watchdog_overflow_callback(struct perf_event *event,
if (__this_cpu_read(hard_watchdog_warn) == true)
return;
- if (hardlockup_panic)
+ if (hardlockup_panic) {
+ trigger_all_cpu_backtrace();
panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
- else
+ } else {
WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
+ }
__this_cpu_write(hard_watchdog_warn, true);
return;
@@ -327,8 +329,10 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
else
dump_stack();
- if (softlockup_panic)
+ if (softlockup_panic) {
+ trigger_all_cpu_backtrace();
panic("softlockup: hung tasks");
+ }
__this_cpu_write(soft_watchdog_warn, true);
} else
__this_cpu_write(soft_watchdog_warn, false);
diff --git a/lib/Kconfig b/lib/Kconfig
index 4771fb3f4da4..e5ae36dcbf01 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -177,6 +177,13 @@ config CRC8
when they need to do cyclic redundancy check according CRC8
algorithm. Module will be called crc8.
+config CRC64_ECMA
+ tristate "CRC64 ECMA function"
+ help
+ This option provides CRC64 ECMA function. Drivers may select this
+ when they need to do cyclic redundancy check according to the CRC64
+ ECMA algorithm.
+
config AUDIT_GENERIC
bool
depends on AUDIT && !AUDIT_ARCH
@@ -460,4 +467,11 @@ config UCS2_STRING
source "lib/fonts/Kconfig"
+#
+# sg chaining option
+#
+
+config ARCH_HAS_SG_CHAIN
+ def_bool n
+
endmenu
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index e548aa00dd5d..cd5c82988d68 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -501,6 +501,16 @@ config DEBUG_VM
If unsure, say N.
+config DEBUG_VM_VMACACHE
+ bool "Debug VMA caching"
+ depends on DEBUG_VM
+ help
+ Enable this to turn on VMA caching debug information. Doing so
+ can cause significant overhead, so only enable it in non-production
+ environments.
+
+ If unsure, say N.
+
config DEBUG_VM_RB
bool "Debug VM red-black trees"
depends on DEBUG_VM
diff --git a/lib/Makefile b/lib/Makefile
index 0cd7b68e1382..02da5b614a80 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -69,6 +69,7 @@ obj-$(CONFIG_CRC32) += crc32.o
obj-$(CONFIG_CRC7) += crc7.o
obj-$(CONFIG_LIBCRC32C) += libcrc32c.o
obj-$(CONFIG_CRC8) += crc8.o
+obj-$(CONFIG_CRC64_ECMA) += crc64_ecma.o
obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o
obj-$(CONFIG_ZLIB_INFLATE) += zlib_inflate/
diff --git a/lib/crc64_ecma.c b/lib/crc64_ecma.c
new file mode 100644
index 000000000000..41629ea5a60c
--- /dev/null
+++ b/lib/crc64_ecma.c
@@ -0,0 +1,341 @@
+/*
+ * Copyright 2013 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <linux/crc64_ecma.h>
+
+
+#define CRC64_BYTE_MASK 0xFF
+#define CRC64_TABLE_SIZE 256
+
+
+struct crc64_table {
+ u64 seed;
+ u64 table[CRC64_TABLE_SIZE];
+};
+
+
+static struct crc64_table CRC64_ECMA_182 = {
+ CRC64_DEFAULT_INITVAL,
+ {
+ 0x0000000000000000ULL,
+ 0xb32e4cbe03a75f6fULL,
+ 0xf4843657a840a05bULL,
+ 0x47aa7ae9abe7ff34ULL,
+ 0x7bd0c384ff8f5e33ULL,
+ 0xc8fe8f3afc28015cULL,
+ 0x8f54f5d357cffe68ULL,
+ 0x3c7ab96d5468a107ULL,
+ 0xf7a18709ff1ebc66ULL,
+ 0x448fcbb7fcb9e309ULL,
+ 0x0325b15e575e1c3dULL,
+ 0xb00bfde054f94352ULL,
+ 0x8c71448d0091e255ULL,
+ 0x3f5f08330336bd3aULL,
+ 0x78f572daa8d1420eULL,
+ 0xcbdb3e64ab761d61ULL,
+ 0x7d9ba13851336649ULL,
+ 0xceb5ed8652943926ULL,
+ 0x891f976ff973c612ULL,
+ 0x3a31dbd1fad4997dULL,
+ 0x064b62bcaebc387aULL,
+ 0xb5652e02ad1b6715ULL,
+ 0xf2cf54eb06fc9821ULL,
+ 0x41e11855055bc74eULL,
+ 0x8a3a2631ae2dda2fULL,
+ 0x39146a8fad8a8540ULL,
+ 0x7ebe1066066d7a74ULL,
+ 0xcd905cd805ca251bULL,
+ 0xf1eae5b551a2841cULL,
+ 0x42c4a90b5205db73ULL,
+ 0x056ed3e2f9e22447ULL,
+ 0xb6409f5cfa457b28ULL,
+ 0xfb374270a266cc92ULL,
+ 0x48190ecea1c193fdULL,
+ 0x0fb374270a266cc9ULL,
+ 0xbc9d3899098133a6ULL,
+ 0x80e781f45de992a1ULL,
+ 0x33c9cd4a5e4ecdceULL,
+ 0x7463b7a3f5a932faULL,
+ 0xc74dfb1df60e6d95ULL,
+ 0x0c96c5795d7870f4ULL,
+ 0xbfb889c75edf2f9bULL,
+ 0xf812f32ef538d0afULL,
+ 0x4b3cbf90f69f8fc0ULL,
+ 0x774606fda2f72ec7ULL,
+ 0xc4684a43a15071a8ULL,
+ 0x83c230aa0ab78e9cULL,
+ 0x30ec7c140910d1f3ULL,
+ 0x86ace348f355aadbULL,
+ 0x3582aff6f0f2f5b4ULL,
+ 0x7228d51f5b150a80ULL,
+ 0xc10699a158b255efULL,
+ 0xfd7c20cc0cdaf4e8ULL,
+ 0x4e526c720f7dab87ULL,
+ 0x09f8169ba49a54b3ULL,
+ 0xbad65a25a73d0bdcULL,
+ 0x710d64410c4b16bdULL,
+ 0xc22328ff0fec49d2ULL,
+ 0x85895216a40bb6e6ULL,
+ 0x36a71ea8a7ace989ULL,
+ 0x0adda7c5f3c4488eULL,
+ 0xb9f3eb7bf06317e1ULL,
+ 0xfe5991925b84e8d5ULL,
+ 0x4d77dd2c5823b7baULL,
+ 0x64b62bcaebc387a1ULL,
+ 0xd7986774e864d8ceULL,
+ 0x90321d9d438327faULL,
+ 0x231c512340247895ULL,
+ 0x1f66e84e144cd992ULL,
+ 0xac48a4f017eb86fdULL,
+ 0xebe2de19bc0c79c9ULL,
+ 0x58cc92a7bfab26a6ULL,
+ 0x9317acc314dd3bc7ULL,
+ 0x2039e07d177a64a8ULL,
+ 0x67939a94bc9d9b9cULL,
+ 0xd4bdd62abf3ac4f3ULL,
+ 0xe8c76f47eb5265f4ULL,
+ 0x5be923f9e8f53a9bULL,
+ 0x1c4359104312c5afULL,
+ 0xaf6d15ae40b59ac0ULL,
+ 0x192d8af2baf0e1e8ULL,
+ 0xaa03c64cb957be87ULL,
+ 0xeda9bca512b041b3ULL,
+ 0x5e87f01b11171edcULL,
+ 0x62fd4976457fbfdbULL,
+ 0xd1d305c846d8e0b4ULL,
+ 0x96797f21ed3f1f80ULL,
+ 0x2557339fee9840efULL,
+ 0xee8c0dfb45ee5d8eULL,
+ 0x5da24145464902e1ULL,
+ 0x1a083bacedaefdd5ULL,
+ 0xa9267712ee09a2baULL,
+ 0x955cce7fba6103bdULL,
+ 0x267282c1b9c65cd2ULL,
+ 0x61d8f8281221a3e6ULL,
+ 0xd2f6b4961186fc89ULL,
+ 0x9f8169ba49a54b33ULL,
+ 0x2caf25044a02145cULL,
+ 0x6b055fede1e5eb68ULL,
+ 0xd82b1353e242b407ULL,
+ 0xe451aa3eb62a1500ULL,
+ 0x577fe680b58d4a6fULL,
+ 0x10d59c691e6ab55bULL,
+ 0xa3fbd0d71dcdea34ULL,
+ 0x6820eeb3b6bbf755ULL,
+ 0xdb0ea20db51ca83aULL,
+ 0x9ca4d8e41efb570eULL,
+ 0x2f8a945a1d5c0861ULL,
+ 0x13f02d374934a966ULL,
+ 0xa0de61894a93f609ULL,
+ 0xe7741b60e174093dULL,
+ 0x545a57dee2d35652ULL,
+ 0xe21ac88218962d7aULL,
+ 0x5134843c1b317215ULL,
+ 0x169efed5b0d68d21ULL,
+ 0xa5b0b26bb371d24eULL,
+ 0x99ca0b06e7197349ULL,
+ 0x2ae447b8e4be2c26ULL,
+ 0x6d4e3d514f59d312ULL,
+ 0xde6071ef4cfe8c7dULL,
+ 0x15bb4f8be788911cULL,
+ 0xa6950335e42fce73ULL,
+ 0xe13f79dc4fc83147ULL,
+ 0x521135624c6f6e28ULL,
+ 0x6e6b8c0f1807cf2fULL,
+ 0xdd45c0b11ba09040ULL,
+ 0x9aefba58b0476f74ULL,
+ 0x29c1f6e6b3e0301bULL,
+ 0xc96c5795d7870f42ULL,
+ 0x7a421b2bd420502dULL,
+ 0x3de861c27fc7af19ULL,
+ 0x8ec62d7c7c60f076ULL,
+ 0xb2bc941128085171ULL,
+ 0x0192d8af2baf0e1eULL,
+ 0x4638a2468048f12aULL,
+ 0xf516eef883efae45ULL,
+ 0x3ecdd09c2899b324ULL,
+ 0x8de39c222b3eec4bULL,
+ 0xca49e6cb80d9137fULL,
+ 0x7967aa75837e4c10ULL,
+ 0x451d1318d716ed17ULL,
+ 0xf6335fa6d4b1b278ULL,
+ 0xb199254f7f564d4cULL,
+ 0x02b769f17cf11223ULL,
+ 0xb4f7f6ad86b4690bULL,
+ 0x07d9ba1385133664ULL,
+ 0x4073c0fa2ef4c950ULL,
+ 0xf35d8c442d53963fULL,
+ 0xcf273529793b3738ULL,
+ 0x7c0979977a9c6857ULL,
+ 0x3ba3037ed17b9763ULL,
+ 0x888d4fc0d2dcc80cULL,
+ 0x435671a479aad56dULL,
+ 0xf0783d1a7a0d8a02ULL,
+ 0xb7d247f3d1ea7536ULL,
+ 0x04fc0b4dd24d2a59ULL,
+ 0x3886b22086258b5eULL,
+ 0x8ba8fe9e8582d431ULL,
+ 0xcc0284772e652b05ULL,
+ 0x7f2cc8c92dc2746aULL,
+ 0x325b15e575e1c3d0ULL,
+ 0x8175595b76469cbfULL,
+ 0xc6df23b2dda1638bULL,
+ 0x75f16f0cde063ce4ULL,
+ 0x498bd6618a6e9de3ULL,
+ 0xfaa59adf89c9c28cULL,
+ 0xbd0fe036222e3db8ULL,
+ 0x0e21ac88218962d7ULL,
+ 0xc5fa92ec8aff7fb6ULL,
+ 0x76d4de52895820d9ULL,
+ 0x317ea4bb22bfdfedULL,
+ 0x8250e80521188082ULL,
+ 0xbe2a516875702185ULL,
+ 0x0d041dd676d77eeaULL,
+ 0x4aae673fdd3081deULL,
+ 0xf9802b81de97deb1ULL,
+ 0x4fc0b4dd24d2a599ULL,
+ 0xfceef8632775faf6ULL,
+ 0xbb44828a8c9205c2ULL,
+ 0x086ace348f355aadULL,
+ 0x34107759db5dfbaaULL,
+ 0x873e3be7d8faa4c5ULL,
+ 0xc094410e731d5bf1ULL,
+ 0x73ba0db070ba049eULL,
+ 0xb86133d4dbcc19ffULL,
+ 0x0b4f7f6ad86b4690ULL,
+ 0x4ce50583738cb9a4ULL,
+ 0xffcb493d702be6cbULL,
+ 0xc3b1f050244347ccULL,
+ 0x709fbcee27e418a3ULL,
+ 0x3735c6078c03e797ULL,
+ 0x841b8ab98fa4b8f8ULL,
+ 0xadda7c5f3c4488e3ULL,
+ 0x1ef430e13fe3d78cULL,
+ 0x595e4a08940428b8ULL,
+ 0xea7006b697a377d7ULL,
+ 0xd60abfdbc3cbd6d0ULL,
+ 0x6524f365c06c89bfULL,
+ 0x228e898c6b8b768bULL,
+ 0x91a0c532682c29e4ULL,
+ 0x5a7bfb56c35a3485ULL,
+ 0xe955b7e8c0fd6beaULL,
+ 0xaeffcd016b1a94deULL,
+ 0x1dd181bf68bdcbb1ULL,
+ 0x21ab38d23cd56ab6ULL,
+ 0x9285746c3f7235d9ULL,
+ 0xd52f0e859495caedULL,
+ 0x6601423b97329582ULL,
+ 0xd041dd676d77eeaaULL,
+ 0x636f91d96ed0b1c5ULL,
+ 0x24c5eb30c5374ef1ULL,
+ 0x97eba78ec690119eULL,
+ 0xab911ee392f8b099ULL,
+ 0x18bf525d915feff6ULL,
+ 0x5f1528b43ab810c2ULL,
+ 0xec3b640a391f4fadULL,
+ 0x27e05a6e926952ccULL,
+ 0x94ce16d091ce0da3ULL,
+ 0xd3646c393a29f297ULL,
+ 0x604a2087398eadf8ULL,
+ 0x5c3099ea6de60cffULL,
+ 0xef1ed5546e415390ULL,
+ 0xa8b4afbdc5a6aca4ULL,
+ 0x1b9ae303c601f3cbULL,
+ 0x56ed3e2f9e224471ULL,
+ 0xe5c372919d851b1eULL,
+ 0xa26908783662e42aULL,
+ 0x114744c635c5bb45ULL,
+ 0x2d3dfdab61ad1a42ULL,
+ 0x9e13b115620a452dULL,
+ 0xd9b9cbfcc9edba19ULL,
+ 0x6a978742ca4ae576ULL,
+ 0xa14cb926613cf817ULL,
+ 0x1262f598629ba778ULL,
+ 0x55c88f71c97c584cULL,
+ 0xe6e6c3cfcadb0723ULL,
+ 0xda9c7aa29eb3a624ULL,
+ 0x69b2361c9d14f94bULL,
+ 0x2e184cf536f3067fULL,
+ 0x9d36004b35545910ULL,
+ 0x2b769f17cf112238ULL,
+ 0x9858d3a9ccb67d57ULL,
+ 0xdff2a94067518263ULL,
+ 0x6cdce5fe64f6dd0cULL,
+ 0x50a65c93309e7c0bULL,
+ 0xe388102d33392364ULL,
+ 0xa4226ac498dedc50ULL,
+ 0x170c267a9b79833fULL,
+ 0xdcd7181e300f9e5eULL,
+ 0x6ff954a033a8c131ULL,
+ 0x28532e49984f3e05ULL,
+ 0x9b7d62f79be8616aULL,
+ 0xa707db9acf80c06dULL,
+ 0x14299724cc279f02ULL,
+ 0x5383edcd67c06036ULL,
+ 0xe0ada17364673f59ULL
+ }
+};
+
+
+/*
+ * crc64_ecma_seed - Initializes the CRC64 ECMA seed.
+ */
+u64 crc64_ecma_seed(void)
+{
+ return CRC64_ECMA_182.seed;
+}
+EXPORT_SYMBOL(crc64_ecma_seed);
+
+/*
+ * crc64_ecma - Computes the 64 bit ECMA CRC.
+ *
+ * pdata: pointer to the data to compute checksum for.
+ * nbytes: number of bytes in data buffer.
+ * seed: CRC seed.
+ */
+u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed)
+{
+ unsigned int i;
+ u64 crc = seed;
+
+ for (i = 0; i < nbytes; i++)
+ crc = CRC64_ECMA_182.table[(crc ^ pdata[i]) & CRC64_BYTE_MASK] ^
+ (crc >> 8);
+
+ return crc;
+}
+EXPORT_SYMBOL(crc64_ecma);
+
+MODULE_DESCRIPTION("CRC64 ECMA function");
+MODULE_AUTHOR("Freescale Semiconductor Inc.");
+MODULE_LICENSE("GPL");
diff --git a/lib/idr.c b/lib/idr.c
index 2642fa8e424d..39158abebad1 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -18,12 +18,6 @@
* pointer or what ever, we treat it as a (void *). You can pass this
* id to a user for him to pass back at a later time. You then pass
* that id to this code and it returns your pointer.
-
- * You can release ids at any time. When all ids are released, most of
- * the memory is returned (we keep MAX_IDR_FREE) in a local pool so we
- * don't need to go to the memory "store" during an id allocate, just
- * so you don't need to be too concerned about locking and conflicts
- * with the slab allocator.
*/
#ifndef TEST // to test in user space...
@@ -151,7 +145,7 @@ static void idr_layer_rcu_free(struct rcu_head *head)
static inline void free_layer(struct idr *idr, struct idr_layer *p)
{
- if (idr->hint && idr->hint == p)
+ if (idr->hint == p)
RCU_INIT_POINTER(idr->hint, NULL);
call_rcu(&p->rcu_head, idr_layer_rcu_free);
}
@@ -249,7 +243,7 @@ static int sub_alloc(struct idr *idp, int *starting_id, struct idr_layer **pa,
id = (id | ((1 << (IDR_BITS * l)) - 1)) + 1;
/* if already at the top layer, we need to grow */
- if (id >= 1 << (idp->layers * IDR_BITS)) {
+ if (id > idr_max(idp->layers)) {
*starting_id = id;
return -EAGAIN;
}
@@ -562,6 +556,11 @@ void idr_remove(struct idr *idp, int id)
if (id < 0)
return;
+ if (id > idr_max(idp->layers)) {
+ idr_remove_warning(id);
+ return;
+ }
+
sub_remove(idp, (idp->layers - 1) * IDR_BITS, id);
if (idp->top && idp->top->count == 1 && (idp->layers > 1) &&
idp->top->ary[0]) {
@@ -579,16 +578,6 @@ void idr_remove(struct idr *idp, int id)
bitmap_clear(to_free->bitmap, 0, IDR_SIZE);
free_layer(idp, to_free);
}
- while (idp->id_free_cnt >= MAX_IDR_FREE) {
- p = get_from_free_list(idp);
- /*
- * Note: we don't call the rcu callback here, since the only
- * layers that fall into the freelist are those that have been
- * preallocated.
- */
- kmem_cache_free(idr_layer_cache, p);
- }
- return;
}
EXPORT_SYMBOL(idr_remove);
@@ -809,14 +798,12 @@ void *idr_replace(struct idr *idp, void *ptr, int id)
p = idp->top;
if (!p)
- return ERR_PTR(-EINVAL);
-
- n = (p->layer+1) * IDR_BITS;
+ return ERR_PTR(-ENOENT);
- if (id >= (1 << n))
- return ERR_PTR(-EINVAL);
+ if (id > idr_max(p->layer + 1))
+ return ERR_PTR(-ENOENT);
- n -= IDR_BITS;
+ n = p->layer * IDR_BITS;
while ((n > 0) && p) {
p = p->ary[(id >> n) & IDR_MASK];
n -= IDR_BITS;
@@ -1027,6 +1014,9 @@ void ida_remove(struct ida *ida, int id)
int n;
struct ida_bitmap *bitmap;
+ if (idr_id > idr_max(ida->idr.layers))
+ goto err;
+
/* clear full bits while looking up the leaf idr_layer */
while ((shift > 0) && p) {
n = (idr_id >> shift) & IDR_MASK;
@@ -1042,7 +1032,7 @@ void ida_remove(struct ida *ida, int id)
__clear_bit(n, p->bitmap);
bitmap = (void *)p->ary[n];
- if (!test_bit(offset, bitmap->bitmap))
+ if (!bitmap || !test_bit(offset, bitmap->bitmap))
goto err;
/* update bitmap and remove it if empty */
diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index 3a8e8e8fb2a5..4251cbd5becb 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -73,7 +73,7 @@ EXPORT_SYMBOL(sg_nents);
**/
struct scatterlist *sg_last(struct scatterlist *sgl, unsigned int nents)
{
-#ifndef ARCH_HAS_SG_CHAIN
+#ifndef CONFIG_ARCH_HAS_SG_CHAIN
struct scatterlist *ret = &sgl[nents - 1];
#else
struct scatterlist *sg, *ret = NULL;
@@ -251,7 +251,7 @@ int __sg_alloc_table(struct sg_table *table, unsigned int nents,
if (nents == 0)
return -EINVAL;
-#ifndef ARCH_HAS_SG_CHAIN
+#ifndef CONFIG_ARCH_HAS_SG_CHAIN
if (WARN_ON_ONCE(nents > max_ents))
return -EINVAL;
#endif
diff --git a/lib/string.c b/lib/string.c
index 9b1f9062a202..89ad0f035f48 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -107,7 +107,7 @@ EXPORT_SYMBOL(strcpy);
#ifndef __HAVE_ARCH_STRNCPY
/**
- * strncpy - Copy a length-limited, %NUL-terminated string
+ * strncpy - Copy a length-limited, C-string
* @dest: Where to copy the string to
* @src: Where to copy the string from
* @count: The maximum number of bytes to copy
@@ -136,7 +136,7 @@ EXPORT_SYMBOL(strncpy);
#ifndef __HAVE_ARCH_STRLCPY
/**
- * strlcpy - Copy a %NUL terminated string into a sized buffer
+ * strlcpy - Copy a C-string into a sized buffer
* @dest: Where to copy the string to
* @src: Where to copy the string from
* @size: size of destination buffer
@@ -182,7 +182,7 @@ EXPORT_SYMBOL(strcat);
#ifndef __HAVE_ARCH_STRNCAT
/**
- * strncat - Append a length-limited, %NUL-terminated string to another
+ * strncat - Append a length-limited, C-string to another
* @dest: The string to be appended to
* @src: The string to append to it
* @count: The maximum numbers of bytes to copy
@@ -211,7 +211,7 @@ EXPORT_SYMBOL(strncat);
#ifndef __HAVE_ARCH_STRLCAT
/**
- * strlcat - Append a length-limited, %NUL-terminated string to another
+ * strlcat - Append a length-limited, C-string to another
* @dest: The string to be appended to
* @src: The string to append to it
* @count: The size of the destination buffer.
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index b604b831f4d1..649d097853a1 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -374,7 +374,7 @@ void __init swiotlb_free(void)
io_tlb_nslabs = 0;
}
-static int is_swiotlb_buffer(phys_addr_t paddr)
+int is_swiotlb_buffer(phys_addr_t paddr)
{
return paddr >= io_tlb_start && paddr < io_tlb_end;
}
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 0648291cdafe..e30d885d9631 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -1183,6 +1183,21 @@ char *address_val(char *buf, char *end, const void *addr,
return number(buf, end, num, spec);
}
+static noinline_for_stack
+char *comm_name(char *buf, char *end, struct task_struct *tsk,
+ struct printf_spec spec, const char *fmt)
+{
+ char name[TASK_COMM_LEN];
+
+ /* Caller can pass NULL instead of current. */
+ if (!tsk)
+ tsk = current;
+ /* Not using get_task_comm() in case I'm in IRQ context. */
+ memcpy(name, tsk->comm, TASK_COMM_LEN);
+ name[sizeof(name) - 1] = '\0';
+ return string(buf, end, name, spec);
+}
+
int kptr_restrict __read_mostly;
/*
@@ -1250,6 +1265,7 @@ int kptr_restrict __read_mostly;
* (default assumed to be phys_addr_t, passed by reference)
* - 'd[234]' For a dentry name (optionally 2-4 last components)
* - 'D[234]' Same as 'd' but for a struct file
+ * - 'T' task_struct->comm
*
* Note: The difference between 'S' and 'F' is that on ia64 and ppc64
* function pointers are really function descriptors, which contain a
@@ -1261,7 +1277,7 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
{
int default_width = 2 * sizeof(void *) + (spec.flags & SPECIAL ? 2 : 0);
- if (!ptr && *fmt != 'K') {
+ if (!ptr && *fmt != 'K' && *fmt != 'T') {
/*
* Print (null) with the same width as a pointer so it makes
* tabular output look nice.
@@ -1389,6 +1405,8 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
return dentry_name(buf, end,
((const struct file *)ptr)->f_path.dentry,
spec, fmt);
+ case 'T':
+ return comm_name(buf, end, ptr, spec, fmt);
}
spec.flags |= SMALL;
if (spec.field_width == -1) {
diff --git a/lib/xz/Kconfig b/lib/xz/Kconfig
index 08837db52d94..12d2d777f36b 100644
--- a/lib/xz/Kconfig
+++ b/lib/xz/Kconfig
@@ -9,33 +9,33 @@ config XZ_DEC
if XZ_DEC
config XZ_DEC_X86
- bool "x86 BCJ filter decoder"
- default y if X86
+ bool "x86 BCJ filter decoder" if EXPERT
+ default y
select XZ_DEC_BCJ
config XZ_DEC_POWERPC
- bool "PowerPC BCJ filter decoder"
- default y if PPC
+ bool "PowerPC BCJ filter decoder" if EXPERT
+ default y
select XZ_DEC_BCJ
config XZ_DEC_IA64
- bool "IA-64 BCJ filter decoder"
- default y if IA64
+ bool "IA-64 BCJ filter decoder" if EXPERT
+ default y
select XZ_DEC_BCJ
config XZ_DEC_ARM
- bool "ARM BCJ filter decoder"
- default y if ARM
+ bool "ARM BCJ filter decoder" if EXPERT
+ default y
select XZ_DEC_BCJ
config XZ_DEC_ARMTHUMB
- bool "ARM-Thumb BCJ filter decoder"
- default y if (ARM && ARM_THUMB)
+ bool "ARM-Thumb BCJ filter decoder" if EXPERT
+ default y
select XZ_DEC_BCJ
config XZ_DEC_SPARC
- bool "SPARC BCJ filter decoder"
- default y if SPARC
+ bool "SPARC BCJ filter decoder" if EXPERT
+ default y
select XZ_DEC_BCJ
endif
diff --git a/lib/xz/xz_dec_lzma2.c b/lib/xz/xz_dec_lzma2.c
index a6cdc969ea42..08c3c8049998 100644
--- a/lib/xz/xz_dec_lzma2.c
+++ b/lib/xz/xz_dec_lzma2.c
@@ -1043,6 +1043,8 @@ XZ_EXTERN enum xz_ret xz_dec_lzma2_run(struct xz_dec_lzma2 *s,
s->lzma2.sequence = SEQ_LZMA_PREPARE;
+ /* Fall through */
+
case SEQ_LZMA_PREPARE:
if (s->lzma2.compressed < RC_INIT_BYTES)
return XZ_DATA_ERROR;
@@ -1053,6 +1055,8 @@ XZ_EXTERN enum xz_ret xz_dec_lzma2_run(struct xz_dec_lzma2 *s,
s->lzma2.compressed -= RC_INIT_BYTES;
s->lzma2.sequence = SEQ_LZMA_RUN;
+ /* Fall through */
+
case SEQ_LZMA_RUN:
/*
* Set dictionary limit to indicate how much we want
diff --git a/mm/Makefile b/mm/Makefile
index b484452dac57..9bc26154557c 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -3,7 +3,7 @@
#
mmu-y := nommu.o
-mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
+mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o madvise.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
vmalloc.o pagewalk.o pgtable-generic.o
diff --git a/mm/bounce.c b/mm/bounce.c
index 523918b8c6dc..ab21ba203d5c 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -3,6 +3,8 @@
* - Split from highmem.c
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/swap.h>
@@ -15,6 +17,7 @@
#include <linux/hash.h>
#include <linux/highmem.h>
#include <linux/bootmem.h>
+#include <linux/printk.h>
#include <asm/tlbflush.h>
#include <trace/events/block.h>
@@ -34,7 +37,7 @@ static __init int init_emergency_pool(void)
page_pool = mempool_create_page_pool(POOL_SIZE, 0);
BUG_ON(!page_pool);
- printk("bounce pool size: %d pages\n", POOL_SIZE);
+ pr_info("pool size: %d pages\n", POOL_SIZE);
return 0;
}
@@ -86,7 +89,7 @@ int init_emergency_isa_pool(void)
mempool_free_pages, (void *) 0);
BUG_ON(!isa_page_pool);
- printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE);
+ pr_info("isa pool size: %d pages\n", ISA_POOL_SIZE);
return 0;
}
diff --git a/mm/compaction.c b/mm/compaction.c
index 37f976287068..6a42ee508e6c 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -208,12 +208,6 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
return true;
}
-static inline bool compact_trylock_irqsave(spinlock_t *lock,
- unsigned long *flags, struct compact_control *cc)
-{
- return compact_checklock_irqsave(lock, flags, false, cc);
-}
-
/* Returns true if the page is within a block suitable for migration to */
static bool suitable_migration_target(struct page *page)
{
@@ -293,14 +287,14 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
/* Found a free page, break it into order-0 pages */
isolated = split_free_page(page);
- total_isolated += isolated;
- for (i = 0; i < isolated; i++) {
- list_add(&page->lru, freelist);
- page++;
- }
-
- /* If a page was split, advance to the end of it */
if (isolated) {
+ total_isolated += isolated;
+ for (i = 0; i < isolated; i++) {
+ list_add(&page->lru, freelist);
+ page++;
+ }
+
+ /* If a page was split, advance to the end of it */
blockpfn += isolated - 1;
cursor += isolated - 1;
continue;
@@ -309,9 +303,6 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
isolate_fail:
if (strict)
break;
- else
- continue;
-
}
trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
@@ -671,24 +662,30 @@ static void isolate_freepages(struct zone *zone,
struct compact_control *cc)
{
struct page *page;
- unsigned long high_pfn, low_pfn, pfn, z_end_pfn, end_pfn;
+ unsigned long pfn; /* scanning cursor */
+ unsigned long low_pfn; /* lowest pfn scanner is able to scan */
+ unsigned long next_free_pfn; /* start pfn for scaning at next round */
+ unsigned long z_end_pfn; /* zone's end pfn */
int nr_freepages = cc->nr_freepages;
struct list_head *freelist = &cc->freepages;
/*
* Initialise the free scanner. The starting point is where we last
- * scanned from (or the end of the zone if starting). The low point
- * is the end of the pageblock the migration scanner is using.
+ * successfully isolated from, zone-cached value, or the end of the
+ * zone when isolating for the first time. We need this aligned to
+ * the pageblock boundary, because we do pfn -= pageblock_nr_pages
+ * in the for loop.
+ * The low boundary is the end of the pageblock the migration scanner
+ * is using.
*/
- pfn = cc->free_pfn;
+ pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
/*
- * Take care that if the migration scanner is at the end of the zone
- * that the free scanner does not accidentally move to the next zone
- * in the next isolation cycle.
+ * Seed the value for max(next_free_pfn, pfn) updates. If no pages are
+ * isolated, the pfn < low_pfn check will kick in.
*/
- high_pfn = min(low_pfn, pfn);
+ next_free_pfn = 0;
z_end_pfn = zone_end_pfn(zone);
@@ -700,6 +697,7 @@ static void isolate_freepages(struct zone *zone,
for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages;
pfn -= pageblock_nr_pages) {
unsigned long isolated;
+ unsigned long end_pfn;
/*
* This can iterate a massively long zone without finding any
@@ -731,16 +729,12 @@ static void isolate_freepages(struct zone *zone,
continue;
/* Found a block suitable for isolating free pages from */
- isolated = 0;
/*
- * As pfn may not start aligned, pfn+pageblock_nr_page
- * may cross a MAX_ORDER_NR_PAGES boundary and miss
- * a pfn_valid check. Ensure isolate_freepages_block()
- * only scans within a pageblock
+ * Take care when isolating in last pageblock of a zone which
+ * ends in the middle of a pageblock.
*/
- end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
- end_pfn = min(end_pfn, z_end_pfn);
+ end_pfn = min(pfn + pageblock_nr_pages, z_end_pfn);
isolated = isolate_freepages_block(cc, pfn, end_pfn,
freelist, false);
nr_freepages += isolated;
@@ -752,7 +746,7 @@ static void isolate_freepages(struct zone *zone,
*/
if (isolated) {
cc->finished_update_free = true;
- high_pfn = max(high_pfn, pfn);
+ next_free_pfn = max(next_free_pfn, pfn);
}
}
@@ -764,9 +758,9 @@ static void isolate_freepages(struct zone *zone,
* so that compact_finished() may detect this
*/
if (pfn < low_pfn)
- cc->free_pfn = max(pfn, zone->zone_start_pfn);
- else
- cc->free_pfn = high_pfn;
+ next_free_pfn = cc->migrate_pfn;
+
+ cc->free_pfn = next_free_pfn;
cc->nr_freepages = nr_freepages;
}
@@ -1163,9 +1157,6 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
if (zone_watermark_ok(zone, cc->order,
low_wmark_pages(zone), 0, 0))
compaction_defer_reset(zone, cc->order, false);
- /* Currently async compaction is never deferred. */
- else if (cc->sync)
- defer_compaction(zone, cc->order);
}
VM_BUG_ON(!list_empty(&cc->freepages));
diff --git a/mm/filemap.c b/mm/filemap.c
index 79ea25b12cad..8fb66b2a78ec 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -906,8 +906,8 @@ EXPORT_SYMBOL(page_cache_prev_hole);
* Looks up the page cache slot at @mapping & @offset. If there is a
* page cache page, it is returned with an increased refcount.
*
- * If the slot holds a shadow entry of a previously evicted page, it
- * is returned.
+ * If the slot holds a shadow entry of a previously evicted page, or a
+ * swap entry from shmem/tmpfs, it is returned.
*
* Otherwise, %NULL is returned.
*/
@@ -928,9 +928,9 @@ repeat:
if (radix_tree_deref_retry(page))
goto repeat;
/*
- * Otherwise, shmem/tmpfs must be storing a swap entry
- * here as an exceptional entry: so return it without
- * attempting to raise page count.
+ * A shadow entry of a recently evicted page,
+ * or a swap entry from shmem/tmpfs. Return
+ * it without attempting to raise page count.
*/
goto out;
}
@@ -983,8 +983,8 @@ EXPORT_SYMBOL(find_get_page);
* page cache page, it is returned locked and with an increased
* refcount.
*
- * If the slot holds a shadow entry of a previously evicted page, it
- * is returned.
+ * If the slot holds a shadow entry of a previously evicted page, or a
+ * swap entry from shmem/tmpfs, it is returned.
*
* Otherwise, %NULL is returned.
*
@@ -1099,8 +1099,8 @@ EXPORT_SYMBOL(find_or_create_page);
* with ascending indexes. There may be holes in the indices due to
* not-present pages.
*
- * Any shadow entries of evicted pages are included in the returned
- * array.
+ * Any shadow entries of evicted pages, or swap entries from
+ * shmem/tmpfs, are included in the returned array.
*
* find_get_entries() returns the number of pages and shadow entries
* which were found.
@@ -1128,9 +1128,9 @@ repeat:
if (radix_tree_deref_retry(page))
goto restart;
/*
- * Otherwise, we must be storing a swap entry
- * here as an exceptional entry: so return it
- * without attempting to raise page count.
+ * A shadow entry of a recently evicted page,
+ * or a swap entry from shmem/tmpfs. Return
+ * it without attempting to raise page count.
*/
goto export;
}
@@ -1198,9 +1198,9 @@ repeat:
goto restart;
}
/*
- * Otherwise, shmem/tmpfs must be storing a swap entry
- * here as an exceptional entry: so skip over it -
- * we only reach this from invalidate_mapping_pages().
+ * A shadow entry of a recently evicted page,
+ * or a swap entry from shmem/tmpfs. Skip
+ * over it.
*/
continue;
}
@@ -1265,9 +1265,9 @@ repeat:
goto restart;
}
/*
- * Otherwise, shmem/tmpfs must be storing a swap entry
- * here as an exceptional entry: so stop looking for
- * contiguous pages.
+ * A shadow entry of a recently evicted page,
+ * or a swap entry from shmem/tmpfs. Stop
+ * looking for contiguous pages.
*/
break;
}
@@ -1341,10 +1341,17 @@ repeat:
goto restart;
}
/*
- * This function is never used on a shmem/tmpfs
- * mapping, so a swap entry won't be found here.
+ * A shadow entry of a recently evicted page.
+ *
+ * Those entries should never be tagged, but
+ * this tree walk is lockless and the tags are
+ * looked up in bulk, one radix tree node at a
+ * time, so there is a sizable window for page
+ * reclaim to evict a page we saw tagged.
+ *
+ * Skip over it.
*/
- BUG();
+ continue;
}
if (!page_cache_get_speculative(page))
diff --git a/mm/fremap.c b/mm/fremap.c
index 34feba60a17e..2c5646f11f41 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -82,13 +82,10 @@ static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
ptfile = pgoff_to_pte(pgoff);
- if (!pte_none(*pte)) {
- if (pte_present(*pte) && pte_soft_dirty(*pte))
- pte_file_mksoft_dirty(ptfile);
+ if (!pte_none(*pte))
zap_pte(mm, vma, addr, pte);
- }
- set_pte_at(mm, addr, pte, ptfile);
+ set_pte_at(mm, addr, pte, pte_file_mksoft_dirty(ptfile));
/*
* We don't need to run update_mmu_cache() here because the "file pte"
* being installed by install_file_pte() is not a real pte - it's a
diff --git a/mm/gup.c b/mm/gup.c
new file mode 100644
index 000000000000..5238000726fa
--- /dev/null
+++ b/mm/gup.c
@@ -0,0 +1,640 @@
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/spinlock.h>
+
+#include <linux/hugetlb.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+
+#include "internal.h"
+
+static struct page *no_page_table(struct vm_area_struct *vma,
+ unsigned int flags)
+{
+ /*
+ * When core dumping an enormous anonymous area that nobody
+ * has touched so far, we don't want to allocate unnecessary pages or
+ * page tables. Return error instead of NULL to skip handle_mm_fault,
+ * then get_dump_page() will return NULL to leave a hole in the dump.
+ * But we can only make this optimization where a hole would surely
+ * be zero-filled if handle_mm_fault() actually did handle it.
+ */
+ if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault))
+ return ERR_PTR(-EFAULT);
+ return NULL;
+}
+
+static struct page *follow_page_pte(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd, unsigned int flags)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct page *page;
+ spinlock_t *ptl;
+ pte_t *ptep, pte;
+
+retry:
+ if (unlikely(pmd_bad(*pmd)))
+ return no_page_table(vma, flags);
+
+ ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+ pte = *ptep;
+ if (!pte_present(pte)) {
+ swp_entry_t entry;
+ /*
+ * KSM's break_ksm() relies upon recognizing a ksm page
+ * even while it is being migrated, so for that case we
+ * need migration_entry_wait().
+ */
+ if (likely(!(flags & FOLL_MIGRATION)))
+ goto no_page;
+ if (pte_none(pte) || pte_file(pte))
+ goto no_page;
+ entry = pte_to_swp_entry(pte);
+ if (!is_migration_entry(entry))
+ goto no_page;
+ pte_unmap_unlock(ptep, ptl);
+ migration_entry_wait(mm, pmd, address);
+ goto retry;
+ }
+ if ((flags & FOLL_NUMA) && pte_numa(pte))
+ goto no_page;
+ if ((flags & FOLL_WRITE) && !pte_write(pte)) {
+ pte_unmap_unlock(ptep, ptl);
+ return NULL;
+ }
+
+ page = vm_normal_page(vma, address, pte);
+ if (unlikely(!page)) {
+ if ((flags & FOLL_DUMP) ||
+ !is_zero_pfn(pte_pfn(pte)))
+ goto bad_page;
+ page = pte_page(pte);
+ }
+
+ if (flags & FOLL_GET)
+ get_page_foll(page);
+ if (flags & FOLL_TOUCH) {
+ if ((flags & FOLL_WRITE) &&
+ !pte_dirty(pte) && !PageDirty(page))
+ set_page_dirty(page);
+ /*
+ * pte_mkyoung() would be more correct here, but atomic care
+ * is needed to avoid losing the dirty bit: it is easier to use
+ * mark_page_accessed().
+ */
+ mark_page_accessed(page);
+ }
+ if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+ /*
+ * The preliminary mapping check is mainly to avoid the
+ * pointless overhead of lock_page on the ZERO_PAGE
+ * which might bounce very badly if there is contention.
+ *
+ * If the page is already locked, we don't need to
+ * handle it now - vmscan will handle it later if and
+ * when it attempts to reclaim the page.
+ */
+ if (page->mapping && trylock_page(page)) {
+ lru_add_drain(); /* push cached pages to LRU */
+ /*
+ * Because we lock page here, and migration is
+ * blocked by the pte's page reference, and we
+ * know the page is still mapped, we don't even
+ * need to check for file-cache page truncation.
+ */
+ mlock_vma_page(page);
+ unlock_page(page);
+ }
+ }
+ pte_unmap_unlock(ptep, ptl);
+ return page;
+bad_page:
+ pte_unmap_unlock(ptep, ptl);
+ return ERR_PTR(-EFAULT);
+
+no_page:
+ pte_unmap_unlock(ptep, ptl);
+ if (!pte_none(pte))
+ return NULL;
+ return no_page_table(vma, flags);
+}
+
+/**
+ * follow_page_mask - look up a page descriptor from a user-virtual address
+ * @vma: vm_area_struct mapping @address
+ * @address: virtual address to look up
+ * @flags: flags modifying lookup behaviour
+ * @page_mask: on output, *page_mask is set according to the size of the page
+ *
+ * @flags can have FOLL_ flags set, defined in <linux/mm.h>
+ *
+ * Returns the mapped (struct page *), %NULL if no mapping exists, or
+ * an error pointer if there is a mapping to something not represented
+ * by a page descriptor (see also vm_normal_page()).
+ */
+struct page *follow_page_mask(struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags,
+ unsigned int *page_mask)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ spinlock_t *ptl;
+ struct page *page;
+ struct mm_struct *mm = vma->vm_mm;
+
+ *page_mask = 0;
+
+ page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
+ if (!IS_ERR(page)) {
+ BUG_ON(flags & FOLL_GET);
+ return page;
+ }
+
+ pgd = pgd_offset(mm, address);
+ if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+ return no_page_table(vma, flags);
+
+ pud = pud_offset(pgd, address);
+ if (pud_none(*pud))
+ return no_page_table(vma, flags);
+ if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
+ if (flags & FOLL_GET)
+ return NULL;
+ page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
+ return page;
+ }
+ if (unlikely(pud_bad(*pud)))
+ return no_page_table(vma, flags);
+
+ pmd = pmd_offset(pud, address);
+ if (pmd_none(*pmd))
+ return no_page_table(vma, flags);
+ if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
+ page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
+ if (flags & FOLL_GET) {
+ /*
+ * Refcount on tail pages are not well-defined and
+ * shouldn't be taken. The caller should handle a NULL
+ * return when trying to follow tail pages.
+ */
+ if (PageHead(page))
+ get_page(page);
+ else
+ page = NULL;
+ }
+ return page;
+ }
+ if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
+ return no_page_table(vma, flags);
+ if (pmd_trans_huge(*pmd)) {
+ if (flags & FOLL_SPLIT) {
+ split_huge_page_pmd(vma, address, pmd);
+ return follow_page_pte(vma, address, pmd, flags);
+ }
+ ptl = pmd_lock(mm, pmd);
+ if (likely(pmd_trans_huge(*pmd))) {
+ if (unlikely(pmd_trans_splitting(*pmd))) {
+ spin_unlock(ptl);
+ wait_split_huge_page(vma->anon_vma, pmd);
+ } else {
+ page = follow_trans_huge_pmd(vma, address,
+ pmd, flags);
+ spin_unlock(ptl);
+ *page_mask = HPAGE_PMD_NR - 1;
+ return page;
+ }
+ } else
+ spin_unlock(ptl);
+ }
+ return follow_page_pte(vma, address, pmd, flags);
+}
+
+static int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
+{
+ return stack_guard_page_start(vma, addr) ||
+ stack_guard_page_end(vma, addr+PAGE_SIZE);
+}
+
+static int get_gate_page(struct mm_struct *mm, unsigned long address,
+ unsigned int gup_flags, struct vm_area_struct **vma,
+ struct page **page)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ int ret = 0;
+
+ /* user gate pages are read-only */
+ if (gup_flags & FOLL_WRITE)
+ return -EFAULT;
+ if (address > TASK_SIZE)
+ pgd = pgd_offset_k(address);
+ else
+ pgd = pgd_offset_gate(mm, address);
+ BUG_ON(pgd_none(*pgd));
+ pud = pud_offset(pgd, address);
+ BUG_ON(pud_none(*pud));
+ pmd = pmd_offset(pud, address);
+ if (pmd_none(*pmd))
+ return -EFAULT;
+ VM_BUG_ON(pmd_trans_huge(*pmd));
+ pte = pte_offset_map(pmd, address);
+ if (pte_none(*pte)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ *vma = get_gate_vma(mm);
+ if (!page)
+ goto out;
+ *page = vm_normal_page(*vma, address, *pte);
+ if (!*page)
+ goto out;
+ if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte))) {
+ *page = NULL;
+ ret = -EFAULT;
+ goto out;
+ }
+ *page = pte_page(*pte);
+ get_page(*page);
+out:
+ pte_unmap(pte);
+ return 0;
+}
+
+static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
+ unsigned long address, unsigned int *flags, int *nonblocking)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned int fault_flags = 0;
+ int ret;
+
+ /* For mlock, just skip the stack guard page. */
+ if ((*flags & FOLL_MLOCK) && stack_guard_page(vma, address))
+ return -ENOENT;
+ if (*flags & FOLL_WRITE)
+ fault_flags |= FAULT_FLAG_WRITE;
+ if (nonblocking)
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY;
+ if (*flags & FOLL_NOWAIT)
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
+
+ ret = handle_mm_fault(mm, vma, address, fault_flags);
+ if (ret & VM_FAULT_ERROR) {
+ if (ret & VM_FAULT_OOM)
+ return -ENOMEM;
+ if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
+ return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT;
+ if (ret & VM_FAULT_SIGBUS)
+ return -EFAULT;
+ BUG();
+ }
+
+ if (tsk) {
+ if (ret & VM_FAULT_MAJOR)
+ tsk->maj_flt++;
+ else
+ tsk->min_flt++;
+ }
+
+ if (ret & VM_FAULT_RETRY) {
+ if (nonblocking)
+ *nonblocking = 0;
+ return -EBUSY;
+ }
+
+ /*
+ * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
+ * necessary, even if maybe_mkwrite decided not to set pte_write. We
+ * can thus safely do subsequent page lookups as if they were reads.
+ * But only do so when looping for pte_write is futile: in some cases
+ * userspace may also be wanting to write to the gotten user page,
+ * which a read fault here might prevent (a readonly page might get
+ * reCOWed by userspace write).
+ */
+ if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
+ *flags &= ~FOLL_WRITE;
+ return 0;
+}
+
+/**
+ * __get_user_pages() - pin user pages in memory
+ * @tsk: task_struct of target task
+ * @mm: mm_struct of target mm
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying pin behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long. Or NULL, if caller
+ * only intends to ensure the pages are faulted in.
+ * @vmas: array of pointers to vmas corresponding to each page.
+ * Or NULL if the caller does not require them.
+ * @nonblocking: whether waiting for disk IO or mmap_sem contention
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If nr_pages is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno. Each page returned must be released
+ * with a put_page() call when it is finished with. vmas will only
+ * remain valid while mmap_sem is held.
+ *
+ * Must be called with mmap_sem held for read or write.
+ *
+ * __get_user_pages walks a process's page tables and takes a reference to
+ * each struct page that each user address corresponds to at a given
+ * instant. That is, it takes the page that would be accessed if a user
+ * thread accesses the given user virtual address at that instant.
+ *
+ * This does not guarantee that the page exists in the user mappings when
+ * __get_user_pages returns, and there may even be a completely different
+ * page there in some cases (eg. if mmapped pagecache has been invalidated
+ * and subsequently re faulted). However it does guarantee that the page
+ * won't be freed completely. And mostly callers simply care that the page
+ * contains data that was valid *at some point in time*. Typically, an IO
+ * or similar operation cannot guarantee anything stronger anyway because
+ * locks can't be held over the syscall boundary.
+ *
+ * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
+ * the page is written to, set_page_dirty (or set_page_dirty_lock, as
+ * appropriate) must be called after the page is finished with, and
+ * before put_page is called.
+ *
+ * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
+ * or mmap_sem contention, and if waiting is needed to pin all pages,
+ * *@nonblocking will be set to 0.
+ *
+ * In most cases, get_user_pages or get_user_pages_fast should be used
+ * instead of __get_user_pages. __get_user_pages should be used only if
+ * you need some special @gup_flags.
+ */
+long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas, int *nonblocking)
+{
+ long i = 0;
+ unsigned long vm_flags;
+ unsigned int page_mask;
+ struct vm_area_struct *vma = NULL;
+
+ if (!nr_pages)
+ return 0;
+
+ VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
+
+ /*
+ * Require read or write permissions.
+ * If FOLL_FORCE is set, we only require the "MAY" flags.
+ */
+ vm_flags = (gup_flags & FOLL_WRITE) ?
+ (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
+ vm_flags &= (gup_flags & FOLL_FORCE) ?
+ (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+
+ /*
+ * If FOLL_FORCE is set then do not force a full fault as the hinting
+ * fault information is unrelated to the reference behaviour of a task
+ * using the address space
+ */
+ if (!(gup_flags & FOLL_FORCE))
+ gup_flags |= FOLL_NUMA;
+
+ do {
+ struct page *page;
+ unsigned int foll_flags = gup_flags;
+ unsigned int page_increm;
+
+ /* first iteration or cross vma bound */
+ if (!vma || start >= vma->vm_end) {
+ vma = find_extend_vma(mm, start);
+ if (!vma && in_gate_area(mm, start)) {
+ int ret;
+ ret = get_gate_page(mm, start & PAGE_MASK,
+ gup_flags, &vma,
+ pages ? &pages[i] : NULL);
+ if (ret)
+ return i ? : ret;
+ page_mask = 0;
+ goto next_page;
+ }
+
+ if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
+ !(vm_flags & vma->vm_flags))
+ return i ? : -EFAULT;
+
+ if (is_vm_hugetlb_page(vma)) {
+ i = follow_hugetlb_page(mm, vma, pages, vmas,
+ &start, &nr_pages, i,
+ gup_flags);
+ continue;
+ }
+ }
+
+ /*
+ * If we have a pending SIGKILL, don't keep faulting pages and
+ * potentially allocating memory.
+ */
+ if (unlikely(fatal_signal_pending(current)))
+ return i ? i : -ERESTARTSYS;
+retry:
+ cond_resched();
+ page = follow_page_mask(vma, start, foll_flags, &page_mask);
+ if (!page) {
+ int ret;
+ ret = faultin_page(tsk, vma, start, &foll_flags,
+ nonblocking);
+ switch (ret) {
+ case 0:
+ goto retry;
+ case -EFAULT:
+ case -ENOMEM:
+ case -EHWPOISON:
+ return i ? i : ret;
+ case -EBUSY:
+ return i;
+ case -ENOENT:
+ goto next_page;
+ }
+ BUG();
+ }
+ if (IS_ERR(page))
+ return i ? i : PTR_ERR(page);
+ if (pages) {
+ pages[i] = page;
+ flush_anon_page(vma, page, start);
+ flush_dcache_page(page);
+ page_mask = 0;
+ }
+next_page:
+ if (vmas) {
+ vmas[i] = vma;
+ page_mask = 0;
+ }
+ page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
+ if (page_increm > nr_pages)
+ page_increm = nr_pages;
+ i += page_increm;
+ start += page_increm * PAGE_SIZE;
+ nr_pages -= page_increm;
+ } while (nr_pages);
+ return i;
+}
+EXPORT_SYMBOL(__get_user_pages);
+
+/*
+ * fixup_user_fault() - manually resolve a user page fault
+ * @tsk: the task_struct to use for page fault accounting, or
+ * NULL if faults are not to be recorded.
+ * @mm: mm_struct of target mm
+ * @address: user address
+ * @fault_flags:flags to pass down to handle_mm_fault()
+ *
+ * This is meant to be called in the specific scenario where for locking reasons
+ * we try to access user memory in atomic context (within a pagefault_disable()
+ * section), this returns -EFAULT, and we want to resolve the user fault before
+ * trying again.
+ *
+ * Typically this is meant to be used by the futex code.
+ *
+ * The main difference with get_user_pages() is that this function will
+ * unconditionally call handle_mm_fault() which will in turn perform all the
+ * necessary SW fixup of the dirty and young bits in the PTE, while
+ * handle_mm_fault() only guarantees to update these in the struct page.
+ *
+ * This is important for some architectures where those bits also gate the
+ * access permission to the page because they are maintained in software. On
+ * such architectures, gup() will not be enough to make a subsequent access
+ * succeed.
+ *
+ * This should be called with the mm_sem held for read.
+ */
+int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long address, unsigned int fault_flags)
+{
+ struct vm_area_struct *vma;
+ int ret;
+
+ vma = find_extend_vma(mm, address);
+ if (!vma || address < vma->vm_start)
+ return -EFAULT;
+
+ ret = handle_mm_fault(mm, vma, address, fault_flags);
+ if (ret & VM_FAULT_ERROR) {
+ if (ret & VM_FAULT_OOM)
+ return -ENOMEM;
+ if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
+ return -EHWPOISON;
+ if (ret & VM_FAULT_SIGBUS)
+ return -EFAULT;
+ BUG();
+ }
+ if (tsk) {
+ if (ret & VM_FAULT_MAJOR)
+ tsk->maj_flt++;
+ else
+ tsk->min_flt++;
+ }
+ return 0;
+}
+
+/*
+ * get_user_pages() - pin user pages in memory
+ * @tsk: the task_struct to use for page fault accounting, or
+ * NULL if faults are not to be recorded.
+ * @mm: mm_struct of target mm
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @write: whether pages will be written to by the caller
+ * @force: whether to force write access even if user mapping is
+ * readonly. This will result in the page being COWed even
+ * in MAP_SHARED mappings. You do not want this.
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long. Or NULL, if caller
+ * only intends to ensure the pages are faulted in.
+ * @vmas: array of pointers to vmas corresponding to each page.
+ * Or NULL if the caller does not require them.
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If nr_pages is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno. Each page returned must be released
+ * with a put_page() call when it is finished with. vmas will only
+ * remain valid while mmap_sem is held.
+ *
+ * Must be called with mmap_sem held for read or write.
+ *
+ * get_user_pages walks a process's page tables and takes a reference to
+ * each struct page that each user address corresponds to at a given
+ * instant. That is, it takes the page that would be accessed if a user
+ * thread accesses the given user virtual address at that instant.
+ *
+ * This does not guarantee that the page exists in the user mappings when
+ * get_user_pages returns, and there may even be a completely different
+ * page there in some cases (eg. if mmapped pagecache has been invalidated
+ * and subsequently re faulted). However it does guarantee that the page
+ * won't be freed completely. And mostly callers simply care that the page
+ * contains data that was valid *at some point in time*. Typically, an IO
+ * or similar operation cannot guarantee anything stronger anyway because
+ * locks can't be held over the syscall boundary.
+ *
+ * If write=0, the page must not be written to. If the page is written to,
+ * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called
+ * after the page is finished with, and before put_page is called.
+ *
+ * get_user_pages is typically used for fewer-copy IO operations, to get a
+ * handle on the memory by some means other than accesses via the user virtual
+ * addresses. The pages may be submitted for DMA to devices or accessed via
+ * their kernel linear mapping (via the kmap APIs). Care should be taken to
+ * use the correct cache flushing APIs.
+ *
+ * See also get_user_pages_fast, for performance critical applications.
+ */
+long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages, int write,
+ int force, struct page **pages, struct vm_area_struct **vmas)
+{
+ int flags = FOLL_TOUCH;
+
+ if (pages)
+ flags |= FOLL_GET;
+ if (write)
+ flags |= FOLL_WRITE;
+ if (force)
+ flags |= FOLL_FORCE;
+
+ return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
+ NULL);
+}
+EXPORT_SYMBOL(get_user_pages);
+
+/**
+ * get_dump_page() - pin user page in memory while writing it to core dump
+ * @addr: user address
+ *
+ * Returns struct page pointer of user page pinned for dump,
+ * to be freed afterwards by page_cache_release() or put_page().
+ *
+ * Returns NULL on any kind of failure - a hole must then be inserted into
+ * the corefile, to preserve alignment with its headers; and also returns
+ * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
+ * allowing a hole to be left in the corefile to save diskspace.
+ *
+ * Called without mmap_sem, but after all other threads have been killed.
+ */
+#ifdef CONFIG_ELF_CORE
+struct page *get_dump_page(unsigned long addr)
+{
+ struct vm_area_struct *vma;
+ struct page *page;
+
+ if (__get_user_pages(current, current->mm, addr, 1,
+ FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
+ NULL) < 1)
+ return NULL;
+ flush_cache_page(vma, addr, page_to_pfn(page));
+ return page;
+}
+#endif /* CONFIG_ELF_CORE */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d199d2d91946..c5ff461e0253 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -5,6 +5,8 @@
* the COPYING file in the top-level directory.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/highmem.h>
@@ -151,8 +153,7 @@ static int start_khugepaged(void)
khugepaged_thread = kthread_run(khugepaged, NULL,
"khugepaged");
if (unlikely(IS_ERR(khugepaged_thread))) {
- printk(KERN_ERR
- "khugepaged: kthread_run(khugepaged) failed\n");
+ pr_err("khugepaged: kthread_run(khugepaged) failed\n");
err = PTR_ERR(khugepaged_thread);
khugepaged_thread = NULL;
}
@@ -584,19 +585,19 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
if (unlikely(!*hugepage_kobj)) {
- printk(KERN_ERR "hugepage: failed to create transparent hugepage kobject\n");
+ pr_err("failed to create transparent hugepage kobject\n");
return -ENOMEM;
}
err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
if (err) {
- printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n");
+ pr_err("failed to register transparent hugepage group\n");
goto delete_obj;
}
err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
if (err) {
- printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n");
+ pr_err("failed to register transparent hugepage group\n");
goto remove_hp_group;
}
@@ -689,8 +690,7 @@ static int __init setup_transparent_hugepage(char *str)
}
out:
if (!ret)
- printk(KERN_WARNING
- "transparent_hugepage= cannot parse, ignored\n");
+ pr_warn("transparent_hugepage= cannot parse, ignored\n");
return ret;
}
__setup("transparent_hugepage=", setup_transparent_hugepage);
@@ -1807,7 +1807,7 @@ static void __split_huge_page(struct page *page,
struct list_head *list)
{
int mapcount, mapcount2;
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ pgoff_t pgoff = page_pgoff(page);
struct anon_vma_chain *avc;
BUG_ON(!PageHead(page));
@@ -1830,10 +1830,11 @@ static void __split_huge_page(struct page *page,
* the newly established pmd of the child later during the
* walk, to be able to set it as pmd_trans_splitting too.
*/
- if (mapcount != page_mapcount(page))
- printk(KERN_ERR "mapcount %d page_mapcount %d\n",
- mapcount, page_mapcount(page));
- BUG_ON(mapcount != page_mapcount(page));
+ if (mapcount != page_mapcount(page)) {
+ pr_err("mapcount %d page_mapcount %d\n",
+ mapcount, page_mapcount(page));
+ BUG();
+ }
__split_huge_page_refcount(page, list);
@@ -1844,10 +1845,11 @@ static void __split_huge_page(struct page *page,
BUG_ON(is_vma_temporary_stack(vma));
mapcount2 += __split_huge_page_map(page, vma, addr);
}
- if (mapcount != mapcount2)
- printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n",
- mapcount, mapcount2, page_mapcount(page));
- BUG_ON(mapcount != mapcount2);
+ if (mapcount != mapcount2) {
+ pr_err("mapcount %d mapcount2 %d page_mapcount %d\n",
+ mapcount, mapcount2, page_mapcount(page));
+ BUG();
+ }
}
/*
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 246192929a2d..e73f7bccd10c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -31,6 +31,7 @@
#include <linux/io.h>
#include <linux/hugetlb.h>
+#include <linux/hugetlb_inline.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
#include "internal.h"
@@ -607,25 +608,242 @@ err:
return NULL;
}
+/*
+ * common helper functions for hstate_next_node_to_{alloc|free}.
+ * We may have allocated or freed a huge page based on a different
+ * nodes_allowed previously, so h->next_node_to_{alloc|free} might
+ * be outside of *nodes_allowed. Ensure that we use an allowed
+ * node for alloc or free.
+ */
+static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
+{
+ nid = next_node(nid, *nodes_allowed);
+ if (nid == MAX_NUMNODES)
+ nid = first_node(*nodes_allowed);
+ VM_BUG_ON(nid >= MAX_NUMNODES);
+
+ return nid;
+}
+
+static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
+{
+ if (!node_isset(nid, *nodes_allowed))
+ nid = next_node_allowed(nid, nodes_allowed);
+ return nid;
+}
+
+/*
+ * returns the previously saved node ["this node"] from which to
+ * allocate a persistent huge page for the pool and advance the
+ * next node from which to allocate, handling wrap at end of node
+ * mask.
+ */
+static int hstate_next_node_to_alloc(struct hstate *h,
+ nodemask_t *nodes_allowed)
+{
+ int nid;
+
+ VM_BUG_ON(!nodes_allowed);
+
+ nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
+ h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
+
+ return nid;
+}
+
+/*
+ * helper for free_pool_huge_page() - return the previously saved
+ * node ["this node"] from which to free a huge page. Advance the
+ * next node id whether or not we find a free huge page to free so
+ * that the next attempt to free addresses the next node.
+ */
+static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
+{
+ int nid;
+
+ VM_BUG_ON(!nodes_allowed);
+
+ nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
+ h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
+
+ return nid;
+}
+
+#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \
+ for (nr_nodes = nodes_weight(*mask); \
+ nr_nodes > 0 && \
+ ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \
+ nr_nodes--)
+
+#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
+ for (nr_nodes = nodes_weight(*mask); \
+ nr_nodes > 0 && \
+ ((node = hstate_next_node_to_free(hs, mask)) || 1); \
+ nr_nodes--)
+
+#if defined(CONFIG_CMA) && defined(CONFIG_X86_64)
+static void destroy_compound_gigantic_page(struct page *page,
+ unsigned long order)
+{
+ int i;
+ int nr_pages = 1 << order;
+ struct page *p = page + 1;
+
+ for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
+ __ClearPageTail(p);
+ set_page_refcounted(p);
+ p->first_page = NULL;
+ }
+
+ set_compound_order(page, 0);
+ __ClearPageHead(page);
+}
+
+static void free_gigantic_page(struct page *page, unsigned order)
+{
+ free_contig_range(page_to_pfn(page), 1 << order);
+}
+
+static int __alloc_gigantic_page(unsigned long start_pfn,
+ unsigned long nr_pages)
+{
+ unsigned long end_pfn = start_pfn + nr_pages;
+ return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
+}
+
+static bool pfn_range_valid_gigantic(unsigned long start_pfn,
+ unsigned long nr_pages)
+{
+ unsigned long i, end_pfn = start_pfn + nr_pages;
+ struct page *page;
+
+ for (i = start_pfn; i < end_pfn; i++) {
+ if (!pfn_valid(i))
+ return false;
+
+ page = pfn_to_page(i);
+
+ if (PageReserved(page))
+ return false;
+
+ if (page_count(page) > 0)
+ return false;
+
+ if (PageHuge(page))
+ return false;
+ }
+
+ return true;
+}
+
+static bool zone_spans_last_pfn(const struct zone *zone,
+ unsigned long start_pfn, unsigned long nr_pages)
+{
+ unsigned long last_pfn = start_pfn + nr_pages - 1;
+ return zone_spans_pfn(zone, last_pfn);
+}
+
+static struct page *alloc_gigantic_page(int nid, unsigned order)
+{
+ unsigned long nr_pages = 1 << order;
+ unsigned long ret, pfn, flags;
+ struct zone *z;
+
+ z = NODE_DATA(nid)->node_zones;
+ for (; z - NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) {
+ spin_lock_irqsave(&z->lock, flags);
+
+ pfn = ALIGN(z->zone_start_pfn, nr_pages);
+ while (zone_spans_last_pfn(z, pfn, nr_pages)) {
+ if (pfn_range_valid_gigantic(pfn, nr_pages)) {
+ /*
+ * We release the zone lock here because
+ * alloc_contig_range() will also lock the zone
+ * at some point. If there's an allocation
+ * spinning on this lock, it may win the race
+ * and cause alloc_contig_range() to fail...
+ */
+ spin_unlock_irqrestore(&z->lock, flags);
+ ret = __alloc_gigantic_page(pfn, nr_pages);
+ if (!ret)
+ return pfn_to_page(pfn);
+ spin_lock_irqsave(&z->lock, flags);
+ }
+ pfn += nr_pages;
+ }
+
+ spin_unlock_irqrestore(&z->lock, flags);
+ }
+
+ return NULL;
+}
+
+static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
+static void prep_compound_gigantic_page(struct page *page, unsigned long order);
+
+static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
+{
+ struct page *page;
+
+ page = alloc_gigantic_page(nid, huge_page_order(h));
+ if (page) {
+ prep_compound_gigantic_page(page, huge_page_order(h));
+ prep_new_huge_page(h, page, nid);
+ }
+
+ return page;
+}
+
+static int alloc_fresh_gigantic_page(struct hstate *h,
+ nodemask_t *nodes_allowed)
+{
+ struct page *page = NULL;
+ int nr_nodes, node;
+
+ for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
+ page = alloc_fresh_gigantic_page_node(h, node);
+ if (page)
+ return 1;
+ }
+
+ return 0;
+}
+
+static inline bool gigantic_page_supported(void) { return true; }
+#else
+static inline bool gigantic_page_supported(void) { return false; }
+static inline void free_gigantic_page(struct page *page, unsigned order) { }
+static inline void destroy_compound_gigantic_page(struct page *page,
+ unsigned long order) { }
+static inline int alloc_fresh_gigantic_page(struct hstate *h,
+ nodemask_t *nodes_allowed) { return 0; }
+#endif
+
static void update_and_free_page(struct hstate *h, struct page *page)
{
int i;
- VM_BUG_ON(h->order >= MAX_ORDER);
+ if (hstate_is_gigantic(h) && !gigantic_page_supported())
+ return;
h->nr_huge_pages--;
h->nr_huge_pages_node[page_to_nid(page)]--;
for (i = 0; i < pages_per_huge_page(h); i++) {
page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
1 << PG_referenced | 1 << PG_dirty |
- 1 << PG_active | 1 << PG_reserved |
- 1 << PG_private | 1 << PG_writeback);
+ 1 << PG_active | 1 << PG_private |
+ 1 << PG_writeback);
}
VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
set_compound_page_dtor(page, NULL);
set_page_refcounted(page);
- arch_release_hugepage(page);
- __free_pages(page, huge_page_order(h));
+ if (hstate_is_gigantic(h)) {
+ destroy_compound_gigantic_page(page, huge_page_order(h));
+ free_gigantic_page(page, huge_page_order(h));
+ } else {
+ arch_release_hugepage(page);
+ __free_pages(page, huge_page_order(h));
+ }
}
struct hstate *size_to_hstate(unsigned long size)
@@ -664,7 +882,7 @@ static void free_huge_page(struct page *page)
if (restore_reserve)
h->resv_huge_pages++;
- if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
+ if (h->surplus_huge_pages_node[nid]) {
/* remove the page from active list */
list_del(&page->lru);
update_and_free_page(h, page);
@@ -690,8 +908,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
put_page(page); /* free it into the hugepage allocator */
}
-static void __init prep_compound_gigantic_page(struct page *page,
- unsigned long order)
+static void prep_compound_gigantic_page(struct page *page, unsigned long order)
{
int i;
int nr_pages = 1 << order;
@@ -769,9 +986,6 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
{
struct page *page;
- if (h->order >= MAX_ORDER)
- return NULL;
-
page = alloc_pages_exact_node(nid,
htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
__GFP_REPEAT|__GFP_NOWARN,
@@ -787,79 +1001,6 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
return page;
}
-/*
- * common helper functions for hstate_next_node_to_{alloc|free}.
- * We may have allocated or freed a huge page based on a different
- * nodes_allowed previously, so h->next_node_to_{alloc|free} might
- * be outside of *nodes_allowed. Ensure that we use an allowed
- * node for alloc or free.
- */
-static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
-{
- nid = next_node(nid, *nodes_allowed);
- if (nid == MAX_NUMNODES)
- nid = first_node(*nodes_allowed);
- VM_BUG_ON(nid >= MAX_NUMNODES);
-
- return nid;
-}
-
-static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
-{
- if (!node_isset(nid, *nodes_allowed))
- nid = next_node_allowed(nid, nodes_allowed);
- return nid;
-}
-
-/*
- * returns the previously saved node ["this node"] from which to
- * allocate a persistent huge page for the pool and advance the
- * next node from which to allocate, handling wrap at end of node
- * mask.
- */
-static int hstate_next_node_to_alloc(struct hstate *h,
- nodemask_t *nodes_allowed)
-{
- int nid;
-
- VM_BUG_ON(!nodes_allowed);
-
- nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
- h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
-
- return nid;
-}
-
-/*
- * helper for free_pool_huge_page() - return the previously saved
- * node ["this node"] from which to free a huge page. Advance the
- * next node id whether or not we find a free huge page to free so
- * that the next attempt to free addresses the next node.
- */
-static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
-{
- int nid;
-
- VM_BUG_ON(!nodes_allowed);
-
- nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
- h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
-
- return nid;
-}
-
-#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \
- for (nr_nodes = nodes_weight(*mask); \
- nr_nodes > 0 && \
- ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \
- nr_nodes--)
-
-#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
- for (nr_nodes = nodes_weight(*mask); \
- nr_nodes > 0 && \
- ((node = hstate_next_node_to_free(hs, mask)) || 1); \
- nr_nodes--)
-
static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
{
struct page *page;
@@ -963,7 +1104,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
struct page *page;
unsigned int r_nid;
- if (h->order >= MAX_ORDER)
+ if (hstate_is_gigantic(h))
return NULL;
/*
@@ -1156,7 +1297,7 @@ static void return_unused_surplus_pages(struct hstate *h,
h->resv_huge_pages -= unused_resv_pages;
/* Cannot return gigantic pages currently */
- if (h->order >= MAX_ORDER)
+ if (hstate_is_gigantic(h))
return;
nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
@@ -1356,7 +1497,7 @@ static void __init gather_bootmem_prealloc(void)
* fix confusing memory reports from free(1) and another
* side-effects, like CommitLimit going negative.
*/
- if (h->order > (MAX_ORDER - 1))
+ if (hstate_is_gigantic(h))
adjust_managed_page_count(page, 1 << h->order);
}
}
@@ -1366,7 +1507,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
unsigned long i;
for (i = 0; i < h->max_huge_pages; ++i) {
- if (h->order >= MAX_ORDER) {
+ if (hstate_is_gigantic(h)) {
if (!alloc_bootmem_huge_page(h))
break;
} else if (!alloc_fresh_huge_page(h,
@@ -1382,7 +1523,7 @@ static void __init hugetlb_init_hstates(void)
for_each_hstate(h) {
/* oversize hugepages were init'ed in early boot */
- if (h->order < MAX_ORDER)
+ if (!hstate_is_gigantic(h))
hugetlb_hstate_alloc_pages(h);
}
}
@@ -1416,7 +1557,7 @@ static void try_to_free_low(struct hstate *h, unsigned long count,
{
int i;
- if (h->order >= MAX_ORDER)
+ if (hstate_is_gigantic(h))
return;
for_each_node_mask(i, *nodes_allowed) {
@@ -1479,7 +1620,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
{
unsigned long min_count, ret;
- if (h->order >= MAX_ORDER)
+ if (hstate_is_gigantic(h) && !gigantic_page_supported())
return h->max_huge_pages;
/*
@@ -1506,7 +1647,10 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
* and reducing the surplus.
*/
spin_unlock(&hugetlb_lock);
- ret = alloc_fresh_huge_page(h, nodes_allowed);
+ if (hstate_is_gigantic(h))
+ ret = alloc_fresh_gigantic_page(h, nodes_allowed);
+ else
+ ret = alloc_fresh_huge_page(h, nodes_allowed);
spin_lock(&hugetlb_lock);
if (!ret)
goto out;
@@ -1606,7 +1750,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
goto out;
h = kobj_to_hstate(kobj, &nid);
- if (h->order >= MAX_ORDER) {
+ if (hstate_is_gigantic(h) && !gigantic_page_supported()) {
err = -EINVAL;
goto out;
}
@@ -1689,7 +1833,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
unsigned long input;
struct hstate *h = kobj_to_hstate(kobj, NULL);
- if (h->order >= MAX_ORDER)
+ if (hstate_is_gigantic(h))
return -EINVAL;
err = kstrtoul(buf, 10, &input);
@@ -1981,11 +2125,7 @@ static int __init hugetlb_init(void)
{
int i;
- /* Some platform decide whether they support huge pages at boot
- * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
- * there is no such support
- */
- if (HPAGE_SHIFT == 0)
+ if (!hugepages_supported())
return 0;
if (!size_to_hstate(default_hstate_size)) {
@@ -2112,9 +2252,12 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
unsigned long tmp;
int ret;
+ if (!hugepages_supported())
+ return -ENOTSUPP;
+
tmp = h->max_huge_pages;
- if (write && h->order >= MAX_ORDER)
+ if (write && hstate_is_gigantic(h) && !gigantic_page_supported())
return -EINVAL;
table->data = &tmp;
@@ -2165,9 +2308,12 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
unsigned long tmp;
int ret;
+ if (!hugepages_supported())
+ return -ENOTSUPP;
+
tmp = h->nr_overcommit_huge_pages;
- if (write && h->order >= MAX_ORDER)
+ if (write && hstate_is_gigantic(h))
return -EINVAL;
table->data = &tmp;
@@ -2190,6 +2336,8 @@ out:
void hugetlb_report_meminfo(struct seq_file *m)
{
struct hstate *h = &default_hstate;
+ if (!hugepages_supported())
+ return;
seq_printf(m,
"HugePages_Total: %5lu\n"
"HugePages_Free: %5lu\n"
@@ -2206,6 +2354,8 @@ void hugetlb_report_meminfo(struct seq_file *m)
int hugetlb_report_node_meminfo(int nid, char *buf)
{
struct hstate *h = &default_hstate;
+ if (!hugepages_supported())
+ return 0;
return sprintf(buf,
"Node %d HugePages_Total: %5u\n"
"Node %d HugePages_Free: %5u\n"
@@ -2220,6 +2370,9 @@ void hugetlb_show_meminfo(void)
struct hstate *h;
int nid;
+ if (!hugepages_supported())
+ return;
+
for_each_node_state(nid, N_MEMORY)
for_each_hstate(h)
pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 91d67eaee050..71661aa488e9 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1300,7 +1300,7 @@ static void kmemleak_scan(void)
/*
* Struct page scanning for each node.
*/
- lock_memory_hotplug();
+ get_online_mems();
for_each_online_node(i) {
unsigned long start_pfn = node_start_pfn(i);
unsigned long end_pfn = node_end_pfn(i);
@@ -1318,7 +1318,7 @@ static void kmemleak_scan(void)
scan_block(page, page + 1, NULL, 1);
}
}
- unlock_memory_hotplug();
+ put_online_mems();
/*
* Scanning the task stacks (may introduce false negatives).
diff --git a/mm/memblock.c b/mm/memblock.c
index a810ba923cdd..146736411318 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1033,22 +1033,35 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
}
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
-static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
- phys_addr_t align, phys_addr_t max_addr,
- int nid)
+static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
+ phys_addr_t align, phys_addr_t start,
+ phys_addr_t end, int nid)
{
phys_addr_t found;
if (!align)
align = SMP_CACHE_BYTES;
- found = memblock_find_in_range_node(size, align, 0, max_addr, nid);
+ found = memblock_find_in_range_node(size, align, start, end, nid);
if (found && !memblock_reserve(found, size))
return found;
return 0;
}
+phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
+ phys_addr_t start, phys_addr_t end)
+{
+ return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE);
+}
+
+static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
+ phys_addr_t align, phys_addr_t max_addr,
+ int nid)
+{
+ return memblock_alloc_range_nid(size, align, 0, max_addr, nid);
+}
+
phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
{
return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 29501f040568..efc233f8b529 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -683,6 +683,15 @@ mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
return &memcg->nodeinfo[nid]->zoneinfo[zid];
}
+static struct mem_cgroup_per_zone *
+mem_cgroup_zoneinfo_zone(struct mem_cgroup *memcg, struct zone *zone)
+{
+ int nid = zone_to_nid(zone);
+ int zid = zone_idx(zone);
+
+ return mem_cgroup_zoneinfo(memcg, nid, zid);
+}
+
struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
{
return &memcg->css;
@@ -1234,11 +1243,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
int uninitialized_var(seq);
if (reclaim) {
- int nid = zone_to_nid(reclaim->zone);
- int zid = zone_idx(reclaim->zone);
struct mem_cgroup_per_zone *mz;
- mz = mem_cgroup_zoneinfo(root, nid, zid);
+ mz = mem_cgroup_zoneinfo_zone(root, reclaim->zone);
iter = &mz->reclaim_iter[reclaim->priority];
if (prev && reclaim->generation != iter->generation) {
iter->last_visited = NULL;
@@ -1345,7 +1352,7 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
goto out;
}
- mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
+ mz = mem_cgroup_zoneinfo_zone(memcg, zone);
lruvec = &mz->lruvec;
out:
/*
@@ -2944,7 +2951,7 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
}
#endif
-static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
+int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
{
struct res_counter *fail_res;
int ret = 0;
@@ -2982,7 +2989,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
return ret;
}
-static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
+void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
{
res_counter_uncharge(&memcg->res, size);
if (do_swap_account)
@@ -3504,7 +3511,6 @@ out:
rcu_read_unlock();
return cachep;
}
-EXPORT_SYMBOL(__memcg_kmem_get_cache);
/*
* We need to verify if the allocation against current->mm->owner's memcg is
@@ -3531,11 +3537,12 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
/*
* Disabling accounting is only relevant for some specific memcg
* internal allocations. Therefore we would initially not have such
- * check here, since direct calls to the page allocator that are marked
- * with GFP_KMEMCG only happen outside memcg core. We are mostly
- * concerned with cache allocations, and by having this test at
- * memcg_kmem_get_cache, we are already able to relay the allocation to
- * the root cache and bypass the memcg cache altogether.
+ * check here, since direct calls to the page allocator that are
+ * accounted to kmemcg (alloc_kmem_pages and friends) only happen
+ * outside memcg core. We are mostly concerned with cache allocations,
+ * and by having this test at memcg_kmem_get_cache, we are already able
+ * to relay the allocation to the root cache and bypass the memcg cache
+ * altogether.
*
* There is one exception, though: the SLUB allocator does not create
* large order caches, but rather service large kmallocs directly from
@@ -5442,22 +5449,14 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css));
- if (val > 100 || !parent)
+ if (val > 100)
return -EINVAL;
- mutex_lock(&memcg_create_mutex);
-
- /* If under hierarchy, only empty-root can set this value */
- if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
- mutex_unlock(&memcg_create_mutex);
- return -EINVAL;
- }
-
- memcg->swappiness = val;
-
- mutex_unlock(&memcg_create_mutex);
+ if (css_parent(css))
+ memcg->swappiness = val;
+ else
+ vm_swappiness = val;
return 0;
}
@@ -5789,22 +5788,15 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css));
/* cannot set to root cgroup and only 0 and 1 are allowed */
- if (!parent || !((val == 0) || (val == 1)))
+ if (!css_parent(css) || !((val == 0) || (val == 1)))
return -EINVAL;
- mutex_lock(&memcg_create_mutex);
- /* oom-kill-disable is a flag for subhierarchy. */
- if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
- mutex_unlock(&memcg_create_mutex);
- return -EINVAL;
- }
memcg->oom_kill_disable = val;
if (!val)
memcg_oom_recover(memcg);
- mutex_unlock(&memcg_create_mutex);
+
return 0;
}
@@ -6686,16 +6678,20 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
pgoff = pte_to_pgoff(ptent);
/* page is moved even if it's not RSS of this task(page-faulted). */
- page = find_get_page(mapping, pgoff);
-
#ifdef CONFIG_SWAP
/* shmem/tmpfs may report page out on swap: account for that too. */
- if (radix_tree_exceptional_entry(page)) {
- swp_entry_t swap = radix_to_swp_entry(page);
- if (do_swap_account)
- *entry = swap;
- page = find_get_page(swap_address_space(swap), swap.val);
- }
+ if (shmem_mapping(mapping)) {
+ page = find_get_entry(mapping, pgoff);
+ if (radix_tree_exceptional_entry(page)) {
+ swp_entry_t swp = radix_to_swp_entry(page);
+ if (do_swap_account)
+ *entry = swp;
+ page = find_get_page(swap_address_space(swp), swp.val);
+ }
+ } else
+ page = find_get_page(mapping, pgoff);
+#else
+ page = find_get_page(mapping, pgoff);
#endif
return page;
}
@@ -6777,30 +6773,29 @@ static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
}
#endif
-static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
+static int mem_cgroup_count_precharge_pte(pte_t *pte,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
- struct vm_area_struct *vma = walk->private;
- pte_t *pte;
+ if (get_mctgt_type(walk->vma, addr, *pte, NULL))
+ mc.precharge++; /* increment precharge temporarily */
+ return 0;
+}
+
+static int mem_cgroup_count_precharge_pmd(pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct vm_area_struct *vma = walk->vma;
spinlock_t *ptl;
if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
mc.precharge += HPAGE_PMD_NR;
spin_unlock(ptl);
- return 0;
+ /* don't call mem_cgroup_count_precharge_pte() */
+ walk->skip = 1;
}
-
- if (pmd_trans_unstable(pmd))
- return 0;
- pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
- for (; addr != end; pte++, addr += PAGE_SIZE)
- if (get_mctgt_type(vma, addr, *pte, NULL))
- mc.precharge++; /* increment precharge temporarily */
- pte_unmap_unlock(pte - 1, ptl);
- cond_resched();
-
return 0;
}
@@ -6809,18 +6804,14 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
unsigned long precharge;
struct vm_area_struct *vma;
+ struct mm_walk mem_cgroup_count_precharge_walk = {
+ .pmd_entry = mem_cgroup_count_precharge_pmd,
+ .pte_entry = mem_cgroup_count_precharge_pte,
+ .mm = mm,
+ };
down_read(&mm->mmap_sem);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- struct mm_walk mem_cgroup_count_precharge_walk = {
- .pmd_entry = mem_cgroup_count_precharge_pte_range,
- .mm = mm,
- .private = vma,
- };
- if (is_vm_hugetlb_page(vma))
- continue;
- walk_page_range(vma->vm_start, vma->vm_end,
- &mem_cgroup_count_precharge_walk);
- }
+ for (vma = mm->mmap; vma; vma = vma->vm_next)
+ walk_page_vma(vma, &mem_cgroup_count_precharge_walk);
up_read(&mm->mmap_sem);
precharge = mc.precharge;
@@ -6959,7 +6950,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
struct mm_walk *walk)
{
int ret = 0;
- struct vm_area_struct *vma = walk->private;
+ struct vm_area_struct *vma = walk->vma;
pte_t *pte;
spinlock_t *ptl;
enum mc_target_type target_type;
@@ -7060,6 +7051,10 @@ put: /* get_mctgt_type() gets the page */
static void mem_cgroup_move_charge(struct mm_struct *mm)
{
struct vm_area_struct *vma;
+ struct mm_walk mem_cgroup_move_charge_walk = {
+ .pmd_entry = mem_cgroup_move_charge_pte_range,
+ .mm = mm,
+ };
lru_add_drain_all();
retry:
@@ -7075,24 +7070,8 @@ retry:
cond_resched();
goto retry;
}
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- int ret;
- struct mm_walk mem_cgroup_move_charge_walk = {
- .pmd_entry = mem_cgroup_move_charge_pte_range,
- .mm = mm,
- .private = vma,
- };
- if (is_vm_hugetlb_page(vma))
- continue;
- ret = walk_page_range(vma->vm_start, vma->vm_end,
- &mem_cgroup_move_charge_walk);
- if (ret)
- /*
- * means we have consumed all precharges and failed in
- * doing additional charge. Just abandon here.
- */
- break;
- }
+ for (vma = mm->mmap; vma; vma = vma->vm_next)
+ walk_page_vma(vma, &mem_cgroup_move_charge_walk);
up_read(&mm->mmap_sem);
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 35ef28acf137..efb55b364ac1 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -202,7 +202,7 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
#ifdef __ARCH_SI_TRAPNO
si.si_trapno = trapno;
#endif
- si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
+ si.si_addr_lsb = page_size_order(page) + PAGE_SHIFT;
if ((flags & MF_ACTION_REQUIRED) && t == current) {
si.si_code = BUS_MCEERR_AR;
@@ -404,7 +404,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
if (av == NULL) /* Not actually mapped anymore */
return;
- pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ pgoff = page_pgoff(page);
read_lock(&tasklist_lock);
for_each_process (tsk) {
struct anon_vma_chain *vmac;
@@ -437,7 +437,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
mutex_lock(&mapping->i_mmap_mutex);
read_lock(&tasklist_lock);
for_each_process(tsk) {
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ pgoff_t pgoff = page_pgoff(page);
if (!task_early_kill(tsk))
continue;
@@ -1661,11 +1661,7 @@ int soft_offline_page(struct page *page, int flags)
}
}
- /*
- * The lock_memory_hotplug prevents a race with memory hotplug.
- * This is a big hammer, a better would be nicer.
- */
- lock_memory_hotplug();
+ get_online_mems();
/*
* Isolate the page, so that it doesn't get reallocated if it
@@ -1676,7 +1672,7 @@ int soft_offline_page(struct page *page, int flags)
set_migratetype_isolate(page, true);
ret = get_any_page(page, pfn, flags);
- unlock_memory_hotplug();
+ put_online_mems();
if (ret > 0) { /* for in-use pages */
if (PageHuge(page))
ret = soft_offline_huge_page(page, flags);
diff --git a/mm/memory.c b/mm/memory.c
index d0f0bef3be48..7e6a74f57639 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -747,7 +747,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn = pte_pfn(pte);
if (HAVE_PTE_SPECIAL) {
- if (likely(!pte_special(pte)))
+ if (likely(!pte_special(pte) || pte_numa(pte)))
goto check_pfn;
if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
return NULL;
@@ -773,14 +773,15 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
}
}
- if (is_zero_pfn(pfn))
- return NULL;
check_pfn:
if (unlikely(pfn > highest_memmap_pfn)) {
print_bad_pte(vma, addr, pte, NULL);
return NULL;
}
+ if (is_zero_pfn(pfn))
+ return NULL;
+
/*
* NOTE! We still have PageReserved() pages in the page tables.
* eg. VDSO mappings can cause them to exist.
@@ -1442,641 +1443,6 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);
-/**
- * follow_page_mask - look up a page descriptor from a user-virtual address
- * @vma: vm_area_struct mapping @address
- * @address: virtual address to look up
- * @flags: flags modifying lookup behaviour
- * @page_mask: on output, *page_mask is set according to the size of the page
- *
- * @flags can have FOLL_ flags set, defined in <linux/mm.h>
- *
- * Returns the mapped (struct page *), %NULL if no mapping exists, or
- * an error pointer if there is a mapping to something not represented
- * by a page descriptor (see also vm_normal_page()).
- */
-struct page *follow_page_mask(struct vm_area_struct *vma,
- unsigned long address, unsigned int flags,
- unsigned int *page_mask)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *ptep, pte;
- spinlock_t *ptl;
- struct page *page;
- struct mm_struct *mm = vma->vm_mm;
-
- *page_mask = 0;
-
- page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
- if (!IS_ERR(page)) {
- BUG_ON(flags & FOLL_GET);
- goto out;
- }
-
- page = NULL;
- pgd = pgd_offset(mm, address);
- if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
- goto no_page_table;
-
- pud = pud_offset(pgd, address);
- if (pud_none(*pud))
- goto no_page_table;
- if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
- if (flags & FOLL_GET)
- goto out;
- page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
- goto out;
- }
- if (unlikely(pud_bad(*pud)))
- goto no_page_table;
-
- pmd = pmd_offset(pud, address);
- if (pmd_none(*pmd))
- goto no_page_table;
- if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
- page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
- if (flags & FOLL_GET) {
- /*
- * Refcount on tail pages are not well-defined and
- * shouldn't be taken. The caller should handle a NULL
- * return when trying to follow tail pages.
- */
- if (PageHead(page))
- get_page(page);
- else {
- page = NULL;
- goto out;
- }
- }
- goto out;
- }
- if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
- goto no_page_table;
- if (pmd_trans_huge(*pmd)) {
- if (flags & FOLL_SPLIT) {
- split_huge_page_pmd(vma, address, pmd);
- goto split_fallthrough;
- }
- ptl = pmd_lock(mm, pmd);
- if (likely(pmd_trans_huge(*pmd))) {
- if (unlikely(pmd_trans_splitting(*pmd))) {
- spin_unlock(ptl);
- wait_split_huge_page(vma->anon_vma, pmd);
- } else {
- page = follow_trans_huge_pmd(vma, address,
- pmd, flags);
- spin_unlock(ptl);
- *page_mask = HPAGE_PMD_NR - 1;
- goto out;
- }
- } else
- spin_unlock(ptl);
- /* fall through */
- }
-split_fallthrough:
- if (unlikely(pmd_bad(*pmd)))
- goto no_page_table;
-
- ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
-
- pte = *ptep;
- if (!pte_present(pte)) {
- swp_entry_t entry;
- /*
- * KSM's break_ksm() relies upon recognizing a ksm page
- * even while it is being migrated, so for that case we
- * need migration_entry_wait().
- */
- if (likely(!(flags & FOLL_MIGRATION)))
- goto no_page;
- if (pte_none(pte) || pte_file(pte))
- goto no_page;
- entry = pte_to_swp_entry(pte);
- if (!is_migration_entry(entry))
- goto no_page;
- pte_unmap_unlock(ptep, ptl);
- migration_entry_wait(mm, pmd, address);
- goto split_fallthrough;
- }
- if ((flags & FOLL_NUMA) && pte_numa(pte))
- goto no_page;
- if ((flags & FOLL_WRITE) && !pte_write(pte))
- goto unlock;
-
- page = vm_normal_page(vma, address, pte);
- if (unlikely(!page)) {
- if ((flags & FOLL_DUMP) ||
- !is_zero_pfn(pte_pfn(pte)))
- goto bad_page;
- page = pte_page(pte);
- }
-
- if (flags & FOLL_GET)
- get_page_foll(page);
- if (flags & FOLL_TOUCH) {
- if ((flags & FOLL_WRITE) &&
- !pte_dirty(pte) && !PageDirty(page))
- set_page_dirty(page);
- /*
- * pte_mkyoung() would be more correct here, but atomic care
- * is needed to avoid losing the dirty bit: it is easier to use
- * mark_page_accessed().
- */
- mark_page_accessed(page);
- }
- if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
- /*
- * The preliminary mapping check is mainly to avoid the
- * pointless overhead of lock_page on the ZERO_PAGE
- * which might bounce very badly if there is contention.
- *
- * If the page is already locked, we don't need to
- * handle it now - vmscan will handle it later if and
- * when it attempts to reclaim the page.
- */
- if (page->mapping && trylock_page(page)) {
- lru_add_drain(); /* push cached pages to LRU */
- /*
- * Because we lock page here, and migration is
- * blocked by the pte's page reference, and we
- * know the page is still mapped, we don't even
- * need to check for file-cache page truncation.
- */
- mlock_vma_page(page);
- unlock_page(page);
- }
- }
-unlock:
- pte_unmap_unlock(ptep, ptl);
-out:
- return page;
-
-bad_page:
- pte_unmap_unlock(ptep, ptl);
- return ERR_PTR(-EFAULT);
-
-no_page:
- pte_unmap_unlock(ptep, ptl);
- if (!pte_none(pte))
- return page;
-
-no_page_table:
- /*
- * When core dumping an enormous anonymous area that nobody
- * has touched so far, we don't want to allocate unnecessary pages or
- * page tables. Return error instead of NULL to skip handle_mm_fault,
- * then get_dump_page() will return NULL to leave a hole in the dump.
- * But we can only make this optimization where a hole would surely
- * be zero-filled if handle_mm_fault() actually did handle it.
- */
- if ((flags & FOLL_DUMP) &&
- (!vma->vm_ops || !vma->vm_ops->fault))
- return ERR_PTR(-EFAULT);
- return page;
-}
-
-static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
-{
- return stack_guard_page_start(vma, addr) ||
- stack_guard_page_end(vma, addr+PAGE_SIZE);
-}
-
-/**
- * __get_user_pages() - pin user pages in memory
- * @tsk: task_struct of target task
- * @mm: mm_struct of target mm
- * @start: starting user address
- * @nr_pages: number of pages from start to pin
- * @gup_flags: flags modifying pin behaviour
- * @pages: array that receives pointers to the pages pinned.
- * Should be at least nr_pages long. Or NULL, if caller
- * only intends to ensure the pages are faulted in.
- * @vmas: array of pointers to vmas corresponding to each page.
- * Or NULL if the caller does not require them.
- * @nonblocking: whether waiting for disk IO or mmap_sem contention
- *
- * Returns number of pages pinned. This may be fewer than the number
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
- * were pinned, returns -errno. Each page returned must be released
- * with a put_page() call when it is finished with. vmas will only
- * remain valid while mmap_sem is held.
- *
- * Must be called with mmap_sem held for read or write.
- *
- * __get_user_pages walks a process's page tables and takes a reference to
- * each struct page that each user address corresponds to at a given
- * instant. That is, it takes the page that would be accessed if a user
- * thread accesses the given user virtual address at that instant.
- *
- * This does not guarantee that the page exists in the user mappings when
- * __get_user_pages returns, and there may even be a completely different
- * page there in some cases (eg. if mmapped pagecache has been invalidated
- * and subsequently re faulted). However it does guarantee that the page
- * won't be freed completely. And mostly callers simply care that the page
- * contains data that was valid *at some point in time*. Typically, an IO
- * or similar operation cannot guarantee anything stronger anyway because
- * locks can't be held over the syscall boundary.
- *
- * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
- * the page is written to, set_page_dirty (or set_page_dirty_lock, as
- * appropriate) must be called after the page is finished with, and
- * before put_page is called.
- *
- * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
- * or mmap_sem contention, and if waiting is needed to pin all pages,
- * *@nonblocking will be set to 0.
- *
- * In most cases, get_user_pages or get_user_pages_fast should be used
- * instead of __get_user_pages. __get_user_pages should be used only if
- * you need some special @gup_flags.
- */
-long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas, int *nonblocking)
-{
- long i;
- unsigned long vm_flags;
- unsigned int page_mask;
-
- if (!nr_pages)
- return 0;
-
- VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
-
- /*
- * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault
- * would be called on PROT_NONE ranges. We must never invoke
- * handle_mm_fault on PROT_NONE ranges or the NUMA hinting
- * page faults would unprotect the PROT_NONE ranges if
- * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd
- * bitflag. So to avoid that, don't set FOLL_NUMA if
- * FOLL_FORCE is set.
- */
- if (!(gup_flags & FOLL_FORCE))
- gup_flags |= FOLL_NUMA;
-
- i = 0;
-
- do {
- struct vm_area_struct *vma;
-
- vma = find_extend_vma(mm, start);
- if (!vma && in_gate_area(mm, start)) {
- unsigned long pg = start & PAGE_MASK;
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
- /* user gate pages are read-only */
- if (gup_flags & FOLL_WRITE)
- goto efault;
- if (pg > TASK_SIZE)
- pgd = pgd_offset_k(pg);
- else
- pgd = pgd_offset_gate(mm, pg);
- BUG_ON(pgd_none(*pgd));
- pud = pud_offset(pgd, pg);
- BUG_ON(pud_none(*pud));
- pmd = pmd_offset(pud, pg);
- if (pmd_none(*pmd))
- goto efault;
- VM_BUG_ON(pmd_trans_huge(*pmd));
- pte = pte_offset_map(pmd, pg);
- if (pte_none(*pte)) {
- pte_unmap(pte);
- goto efault;
- }
- vma = get_gate_vma(mm);
- if (pages) {
- struct page *page;
-
- page = vm_normal_page(vma, start, *pte);
- if (!page) {
- if (!(gup_flags & FOLL_DUMP) &&
- is_zero_pfn(pte_pfn(*pte)))
- page = pte_page(*pte);
- else {
- pte_unmap(pte);
- goto efault;
- }
- }
- pages[i] = page;
- get_page(page);
- }
- pte_unmap(pte);
- page_mask = 0;
- goto next_page;
- }
-
- if (!vma)
- goto efault;
- vm_flags = vma->vm_flags;
- if (vm_flags & (VM_IO | VM_PFNMAP))
- goto efault;
-
- if (gup_flags & FOLL_WRITE) {
- if (!(vm_flags & VM_WRITE)) {
- if (!(gup_flags & FOLL_FORCE))
- goto efault;
- /*
- * We used to let the write,force case do COW
- * in a VM_MAYWRITE VM_SHARED !VM_WRITE vma, so
- * ptrace could set a breakpoint in a read-only
- * mapping of an executable, without corrupting
- * the file (yet only when that file had been
- * opened for writing!). Anon pages in shared
- * mappings are surprising: now just reject it.
- */
- if (!is_cow_mapping(vm_flags)) {
- WARN_ON_ONCE(vm_flags & VM_MAYWRITE);
- goto efault;
- }
- }
- } else {
- if (!(vm_flags & VM_READ)) {
- if (!(gup_flags & FOLL_FORCE))
- goto efault;
- /*
- * Is there actually any vma we can reach here
- * which does not have VM_MAYREAD set?
- */
- if (!(vm_flags & VM_MAYREAD))
- goto efault;
- }
- }
-
- if (is_vm_hugetlb_page(vma)) {
- i = follow_hugetlb_page(mm, vma, pages, vmas,
- &start, &nr_pages, i, gup_flags);
- continue;
- }
-
- do {
- struct page *page;
- unsigned int foll_flags = gup_flags;
- unsigned int page_increm;
-
- /*
- * If we have a pending SIGKILL, don't keep faulting
- * pages and potentially allocating memory.
- */
- if (unlikely(fatal_signal_pending(current)))
- return i ? i : -ERESTARTSYS;
-
- cond_resched();
- while (!(page = follow_page_mask(vma, start,
- foll_flags, &page_mask))) {
- int ret;
- unsigned int fault_flags = 0;
-
- /* For mlock, just skip the stack guard page. */
- if (foll_flags & FOLL_MLOCK) {
- if (stack_guard_page(vma, start))
- goto next_page;
- }
- if (foll_flags & FOLL_WRITE)
- fault_flags |= FAULT_FLAG_WRITE;
- if (nonblocking)
- fault_flags |= FAULT_FLAG_ALLOW_RETRY;
- if (foll_flags & FOLL_NOWAIT)
- fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT);
-
- ret = handle_mm_fault(mm, vma, start,
- fault_flags);
-
- if (ret & VM_FAULT_ERROR) {
- if (ret & VM_FAULT_OOM)
- return i ? i : -ENOMEM;
- if (ret & (VM_FAULT_HWPOISON |
- VM_FAULT_HWPOISON_LARGE)) {
- if (i)
- return i;
- else if (gup_flags & FOLL_HWPOISON)
- return -EHWPOISON;
- else
- return -EFAULT;
- }
- if (ret & VM_FAULT_SIGBUS)
- goto efault;
- BUG();
- }
-
- if (tsk) {
- if (ret & VM_FAULT_MAJOR)
- tsk->maj_flt++;
- else
- tsk->min_flt++;
- }
-
- if (ret & VM_FAULT_RETRY) {
- if (nonblocking)
- *nonblocking = 0;
- return i;
- }
-
- /*
- * The VM_FAULT_WRITE bit tells us that
- * do_wp_page has broken COW when necessary,
- * even if maybe_mkwrite decided not to set
- * pte_write. We can thus safely do subsequent
- * page lookups as if they were reads. But only
- * do so when looping for pte_write is futile:
- * in some cases userspace may also be wanting
- * to write to the gotten user page, which a
- * read fault here might prevent (a readonly
- * page might get reCOWed by userspace write).
- */
- if ((ret & VM_FAULT_WRITE) &&
- !(vma->vm_flags & VM_WRITE))
- foll_flags &= ~FOLL_WRITE;
-
- cond_resched();
- }
- if (IS_ERR(page))
- return i ? i : PTR_ERR(page);
- if (pages) {
- pages[i] = page;
-
- flush_anon_page(vma, page, start);
- flush_dcache_page(page);
- page_mask = 0;
- }
-next_page:
- if (vmas) {
- vmas[i] = vma;
- page_mask = 0;
- }
- page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
- if (page_increm > nr_pages)
- page_increm = nr_pages;
- i += page_increm;
- start += page_increm * PAGE_SIZE;
- nr_pages -= page_increm;
- } while (nr_pages && start < vma->vm_end);
- } while (nr_pages);
- return i;
-efault:
- return i ? : -EFAULT;
-}
-EXPORT_SYMBOL(__get_user_pages);
-
-/*
- * fixup_user_fault() - manually resolve a user page fault
- * @tsk: the task_struct to use for page fault accounting, or
- * NULL if faults are not to be recorded.
- * @mm: mm_struct of target mm
- * @address: user address
- * @fault_flags:flags to pass down to handle_mm_fault()
- *
- * This is meant to be called in the specific scenario where for locking reasons
- * we try to access user memory in atomic context (within a pagefault_disable()
- * section), this returns -EFAULT, and we want to resolve the user fault before
- * trying again.
- *
- * Typically this is meant to be used by the futex code.
- *
- * The main difference with get_user_pages() is that this function will
- * unconditionally call handle_mm_fault() which will in turn perform all the
- * necessary SW fixup of the dirty and young bits in the PTE, while
- * handle_mm_fault() only guarantees to update these in the struct page.
- *
- * This is important for some architectures where those bits also gate the
- * access permission to the page because they are maintained in software. On
- * such architectures, gup() will not be enough to make a subsequent access
- * succeed.
- *
- * This should be called with the mm_sem held for read.
- */
-int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long address, unsigned int fault_flags)
-{
- struct vm_area_struct *vma;
- int ret;
-
- vma = find_extend_vma(mm, address);
- if (!vma || address < vma->vm_start)
- return -EFAULT;
-
- ret = handle_mm_fault(mm, vma, address, fault_flags);
- if (ret & VM_FAULT_ERROR) {
- if (ret & VM_FAULT_OOM)
- return -ENOMEM;
- if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
- return -EHWPOISON;
- if (ret & VM_FAULT_SIGBUS)
- return -EFAULT;
- BUG();
- }
- if (tsk) {
- if (ret & VM_FAULT_MAJOR)
- tsk->maj_flt++;
- else
- tsk->min_flt++;
- }
- return 0;
-}
-
-/*
- * get_user_pages() - pin user pages in memory
- * @tsk: the task_struct to use for page fault accounting, or
- * NULL if faults are not to be recorded.
- * @mm: mm_struct of target mm
- * @start: starting user address
- * @nr_pages: number of pages from start to pin
- * @write: whether pages will be written to by the caller
- * @force: whether to force access even when user mapping is currently
- * protected (but never forces write access to shared mapping).
- * @pages: array that receives pointers to the pages pinned.
- * Should be at least nr_pages long. Or NULL, if caller
- * only intends to ensure the pages are faulted in.
- * @vmas: array of pointers to vmas corresponding to each page.
- * Or NULL if the caller does not require them.
- *
- * Returns number of pages pinned. This may be fewer than the number
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
- * were pinned, returns -errno. Each page returned must be released
- * with a put_page() call when it is finished with. vmas will only
- * remain valid while mmap_sem is held.
- *
- * Must be called with mmap_sem held for read or write.
- *
- * get_user_pages walks a process's page tables and takes a reference to
- * each struct page that each user address corresponds to at a given
- * instant. That is, it takes the page that would be accessed if a user
- * thread accesses the given user virtual address at that instant.
- *
- * This does not guarantee that the page exists in the user mappings when
- * get_user_pages returns, and there may even be a completely different
- * page there in some cases (eg. if mmapped pagecache has been invalidated
- * and subsequently re faulted). However it does guarantee that the page
- * won't be freed completely. And mostly callers simply care that the page
- * contains data that was valid *at some point in time*. Typically, an IO
- * or similar operation cannot guarantee anything stronger anyway because
- * locks can't be held over the syscall boundary.
- *
- * If write=0, the page must not be written to. If the page is written to,
- * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called
- * after the page is finished with, and before put_page is called.
- *
- * get_user_pages is typically used for fewer-copy IO operations, to get a
- * handle on the memory by some means other than accesses via the user virtual
- * addresses. The pages may be submitted for DMA to devices or accessed via
- * their kernel linear mapping (via the kmap APIs). Care should be taken to
- * use the correct cache flushing APIs.
- *
- * See also get_user_pages_fast, for performance critical applications.
- */
-long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages, int write,
- int force, struct page **pages, struct vm_area_struct **vmas)
-{
- int flags = FOLL_TOUCH;
-
- if (pages)
- flags |= FOLL_GET;
- if (write)
- flags |= FOLL_WRITE;
- if (force)
- flags |= FOLL_FORCE;
-
- return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
- NULL);
-}
-EXPORT_SYMBOL(get_user_pages);
-
-/**
- * get_dump_page() - pin user page in memory while writing it to core dump
- * @addr: user address
- *
- * Returns struct page pointer of user page pinned for dump,
- * to be freed afterwards by page_cache_release() or put_page().
- *
- * Returns NULL on any kind of failure - a hole must then be inserted into
- * the corefile, to preserve alignment with its headers; and also returns
- * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
- * allowing a hole to be left in the corefile to save diskspace.
- *
- * Called without mmap_sem, but after all other threads have been killed.
- */
-#ifdef CONFIG_ELF_CORE
-struct page *get_dump_page(unsigned long addr)
-{
- struct vm_area_struct *vma;
- struct page *page;
-
- if (__get_user_pages(current, current->mm, addr, 1,
- FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
- NULL) < 1)
- return NULL;
- flush_cache_page(vma, addr, page_to_pfn(page));
- return page;
-}
-#endif /* CONFIG_ELF_CORE */
-
pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
spinlock_t **ptl)
{
@@ -3578,6 +2944,8 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
int dirtied = 0;
int ret, tmp;
+ WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
+
ret = __do_fault(vma, address, pgoff, flags, &fault_page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
@@ -3608,6 +2976,12 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (set_page_dirty(fault_page))
dirtied = 1;
+ /*
+ * Take a local copy of the address_space - page.mapping may be zeroed
+ * by truncate after unlock_page(). The address_space itself remains
+ * pinned by vma->vm_file's reference. We rely on unlock_page()'s
+ * release semantics to prevent the compiler from undoing this copying.
+ */
mapping = fault_page->mapping;
unlock_page(fault_page);
if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a650db29606f..2906873a1502 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -46,19 +46,84 @@
static void generic_online_page(struct page *page);
static online_page_callback_t online_page_callback = generic_online_page;
+static DEFINE_MUTEX(online_page_callback_lock);
-DEFINE_MUTEX(mem_hotplug_mutex);
+/* The same as the cpu_hotplug lock, but for memory hotplug. */
+static struct {
+ struct task_struct *active_writer;
+ struct mutex lock; /* Synchronizes accesses to refcount, */
+ /*
+ * Also blocks the new readers during
+ * an ongoing mem hotplug operation.
+ */
+ int refcount;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
+} mem_hotplug = {
+ .active_writer = NULL,
+ .lock = __MUTEX_INITIALIZER(mem_hotplug.lock),
+ .refcount = 0,
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ .dep_map = {.name = "mem_hotplug.lock" },
+#endif
+};
+
+/* Lockdep annotations for get/put_online_mems() and mem_hotplug_begin/end() */
+#define memhp_lock_acquire_read() lock_map_acquire_read(&mem_hotplug.dep_map)
+#define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map)
+#define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map)
+
+void get_online_mems(void)
+{
+ might_sleep();
+ if (mem_hotplug.active_writer == current)
+ return;
+ memhp_lock_acquire_read();
+ mutex_lock(&mem_hotplug.lock);
+ mem_hotplug.refcount++;
+ mutex_unlock(&mem_hotplug.lock);
+
+}
-void lock_memory_hotplug(void)
+void put_online_mems(void)
{
- mutex_lock(&mem_hotplug_mutex);
+ if (mem_hotplug.active_writer == current)
+ return;
+ mutex_lock(&mem_hotplug.lock);
+
+ if (WARN_ON(!mem_hotplug.refcount))
+ mem_hotplug.refcount++; /* try to fix things up */
+
+ if (!--mem_hotplug.refcount && unlikely(mem_hotplug.active_writer))
+ wake_up_process(mem_hotplug.active_writer);
+ mutex_unlock(&mem_hotplug.lock);
+ memhp_lock_release();
+
}
-void unlock_memory_hotplug(void)
+static void mem_hotplug_begin(void)
{
- mutex_unlock(&mem_hotplug_mutex);
+ mem_hotplug.active_writer = current;
+
+ memhp_lock_acquire();
+ for (;;) {
+ mutex_lock(&mem_hotplug.lock);
+ if (likely(!mem_hotplug.refcount))
+ break;
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ mutex_unlock(&mem_hotplug.lock);
+ schedule();
+ }
}
+static void mem_hotplug_done(void)
+{
+ mem_hotplug.active_writer = NULL;
+ mutex_unlock(&mem_hotplug.lock);
+ memhp_lock_release();
+}
/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size)
@@ -727,14 +792,16 @@ int set_online_page_callback(online_page_callback_t callback)
{
int rc = -EINVAL;
- lock_memory_hotplug();
+ get_online_mems();
+ mutex_lock(&online_page_callback_lock);
if (online_page_callback == generic_online_page) {
online_page_callback = callback;
rc = 0;
}
- unlock_memory_hotplug();
+ mutex_unlock(&online_page_callback_lock);
+ put_online_mems();
return rc;
}
@@ -744,14 +811,16 @@ int restore_online_page_callback(online_page_callback_t callback)
{
int rc = -EINVAL;
- lock_memory_hotplug();
+ get_online_mems();
+ mutex_lock(&online_page_callback_lock);
if (online_page_callback == callback) {
online_page_callback = generic_online_page;
rc = 0;
}
- unlock_memory_hotplug();
+ mutex_unlock(&online_page_callback_lock);
+ put_online_mems();
return rc;
}
@@ -899,7 +968,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
int ret;
struct memory_notify arg;
- lock_memory_hotplug();
+ mem_hotplug_begin();
/*
* This doesn't need a lock to do pfn_to_page().
* The section can't be removed here because of the
@@ -907,23 +976,18 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
*/
zone = page_zone(pfn_to_page(pfn));
+ ret = -EINVAL;
if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) &&
- !can_online_high_movable(zone)) {
- unlock_memory_hotplug();
- return -EINVAL;
- }
+ !can_online_high_movable(zone))
+ goto out;
if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) {
- if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) {
- unlock_memory_hotplug();
- return -EINVAL;
- }
+ if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages))
+ goto out;
}
if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) {
- if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) {
- unlock_memory_hotplug();
- return -EINVAL;
- }
+ if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages))
+ goto out;
}
/* Previous code may changed the zone of the pfn range */
@@ -939,8 +1003,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
ret = notifier_to_errno(ret);
if (ret) {
memory_notify(MEM_CANCEL_ONLINE, &arg);
- unlock_memory_hotplug();
- return ret;
+ goto out;
}
/*
* If this zone is not populated, then it is not in zonelist.
@@ -964,8 +1027,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
(((unsigned long long) pfn + nr_pages)
<< PAGE_SHIFT) - 1);
memory_notify(MEM_CANCEL_ONLINE, &arg);
- unlock_memory_hotplug();
- return ret;
+ goto out;
}
zone->present_pages += onlined_pages;
@@ -995,9 +1057,9 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
if (onlined_pages)
memory_notify(MEM_ONLINE, &arg);
- unlock_memory_hotplug();
-
- return 0;
+out:
+ mem_hotplug_done();
+ return ret;
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
@@ -1055,7 +1117,7 @@ int try_online_node(int nid)
if (node_online(nid))
return 0;
- lock_memory_hotplug();
+ mem_hotplug_begin();
pgdat = hotadd_new_pgdat(nid, 0);
if (!pgdat) {
pr_err("Cannot online node %d due to NULL pgdat\n", nid);
@@ -1073,7 +1135,7 @@ int try_online_node(int nid)
}
out:
- unlock_memory_hotplug();
+ mem_hotplug_done();
return ret;
}
@@ -1117,7 +1179,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
new_pgdat = !p;
}
- lock_memory_hotplug();
+ mem_hotplug_begin();
new_node = !node_online(nid);
if (new_node) {
@@ -1158,7 +1220,7 @@ error:
release_memory_resource(res);
out:
- unlock_memory_hotplug();
+ mem_hotplug_done();
return ret;
}
EXPORT_SYMBOL_GPL(add_memory);
@@ -1565,7 +1627,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
if (!test_pages_in_a_zone(start_pfn, end_pfn))
return -EINVAL;
- lock_memory_hotplug();
+ mem_hotplug_begin();
zone = page_zone(pfn_to_page(start_pfn));
node = zone_to_nid(zone);
@@ -1672,7 +1734,7 @@ repeat:
writeback_set_ratelimit();
memory_notify(MEM_OFFLINE, &arg);
- unlock_memory_hotplug();
+ mem_hotplug_done();
return 0;
failed_removal:
@@ -1684,7 +1746,7 @@ failed_removal:
undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
out:
- unlock_memory_hotplug();
+ mem_hotplug_done();
return ret;
}
@@ -1888,7 +1950,7 @@ void __ref remove_memory(int nid, u64 start, u64 size)
BUG_ON(check_hotplug_memory_range(start, size));
- lock_memory_hotplug();
+ mem_hotplug_begin();
/*
* All memory blocks must be offlined before removing memory. Check
@@ -1897,10 +1959,8 @@ void __ref remove_memory(int nid, u64 start, u64 size)
*/
ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
check_memblock_offlined_cb);
- if (ret) {
- unlock_memory_hotplug();
+ if (ret)
BUG();
- }
/* remove memmap entry */
firmware_map_remove(start, start + size, "System RAM");
@@ -1909,7 +1969,7 @@ void __ref remove_memory(int nid, u64 start, u64 size)
try_offline_node(nid);
- unlock_memory_hotplug();
+ mem_hotplug_done();
}
EXPORT_SYMBOL_GPL(remove_memory);
#endif /* CONFIG_MEMORY_HOTREMOVE */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 78e1472933ea..ac621fa9dd5d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -65,6 +65,8 @@
kernel is not always grateful with that.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
@@ -91,6 +93,7 @@
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
+#include <linux/printk.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
@@ -476,140 +479,70 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
static void migrate_page_add(struct page *page, struct list_head *pagelist,
unsigned long flags);
+struct queue_pages {
+ struct list_head *pagelist;
+ unsigned long flags;
+ nodemask_t *nmask;
+ struct vm_area_struct *prev;
+};
+
/*
* Scan through pages checking if pages follow certain conditions,
* and move them to the pagelist if they do.
*/
-static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, unsigned long end,
- const nodemask_t *nodes, unsigned long flags,
- void *private)
+static int queue_pages_pte(pte_t *pte, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
{
- pte_t *orig_pte;
- pte_t *pte;
- spinlock_t *ptl;
-
- orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
- do {
- struct page *page;
- int nid;
+ struct vm_area_struct *vma = walk->vma;
+ struct page *page;
+ struct queue_pages *qp = walk->private;
+ unsigned long flags = qp->flags;
+ int nid;
- if (!pte_present(*pte))
- continue;
- page = vm_normal_page(vma, addr, *pte);
- if (!page)
- continue;
- /*
- * vm_normal_page() filters out zero pages, but there might
- * still be PageReserved pages to skip, perhaps in a VDSO.
- */
- if (PageReserved(page))
- continue;
- nid = page_to_nid(page);
- if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
- continue;
+ if (!pte_present(*pte))
+ return 0;
+ page = vm_normal_page(vma, addr, *pte);
+ if (!page)
+ return 0;
+ /*
+ * vm_normal_page() filters out zero pages, but there might
+ * still be PageReserved pages to skip, perhaps in a VDSO.
+ */
+ if (PageReserved(page))
+ return 0;
+ nid = page_to_nid(page);
+ if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
+ return 0;
- if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
- migrate_page_add(page, private, flags);
- else
- break;
- } while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap_unlock(orig_pte, ptl);
- return addr != end;
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ migrate_page_add(page, qp->pagelist, flags);
+ return 0;
}
-static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
- pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
- void *private)
+static int queue_pages_hugetlb(pte_t *pte, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
+ struct queue_pages *qp = walk->private;
+ unsigned long flags = qp->flags;
int nid;
struct page *page;
- spinlock_t *ptl;
+ pte_t entry;
- ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
- page = pte_page(huge_ptep_get((pte_t *)pmd));
+ entry = huge_ptep_get(pte);
+ if (!pte_present(entry))
+ return 0;
+ page = pte_page(entry);
nid = page_to_nid(page);
- if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
- goto unlock;
+ if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
+ return 0;
/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
if (flags & (MPOL_MF_MOVE_ALL) ||
(flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
- isolate_huge_page(page, private);
-unlock:
- spin_unlock(ptl);
+ isolate_huge_page(page, qp->pagelist);
#else
BUG();
#endif
-}
-
-static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
- unsigned long addr, unsigned long end,
- const nodemask_t *nodes, unsigned long flags,
- void *private)
-{
- pmd_t *pmd;
- unsigned long next;
-
- pmd = pmd_offset(pud, addr);
- do {
- next = pmd_addr_end(addr, end);
- if (!pmd_present(*pmd))
- continue;
- if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
- queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
- flags, private);
- continue;
- }
- split_huge_page_pmd(vma, addr, pmd);
- if (pmd_none_or_trans_huge_or_clear_bad(pmd))
- continue;
- if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
- flags, private))
- return -EIO;
- } while (pmd++, addr = next, addr != end);
- return 0;
-}
-
-static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
- unsigned long addr, unsigned long end,
- const nodemask_t *nodes, unsigned long flags,
- void *private)
-{
- pud_t *pud;
- unsigned long next;
-
- pud = pud_offset(pgd, addr);
- do {
- next = pud_addr_end(addr, end);
- if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
- continue;
- if (pud_none_or_clear_bad(pud))
- continue;
- if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
- flags, private))
- return -EIO;
- } while (pud++, addr = next, addr != end);
- return 0;
-}
-
-static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end,
- const nodemask_t *nodes, unsigned long flags,
- void *private)
-{
- pgd_t *pgd;
- unsigned long next;
-
- pgd = pgd_offset(vma->vm_mm, addr);
- do {
- next = pgd_addr_end(addr, end);
- if (pgd_none_or_clear_bad(pgd))
- continue;
- if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
- flags, private))
- return -EIO;
- } while (pgd++, addr = next, addr != end);
return 0;
}
@@ -642,6 +575,45 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
}
#endif /* CONFIG_NUMA_BALANCING */
+static int queue_pages_test_walk(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct vm_area_struct *vma = walk->vma;
+ struct queue_pages *qp = walk->private;
+ unsigned long endvma = vma->vm_end;
+ unsigned long flags = qp->flags;
+
+ if (endvma > end)
+ endvma = end;
+ if (vma->vm_start > start)
+ start = vma->vm_start;
+
+ if (!(flags & MPOL_MF_DISCONTIG_OK)) {
+ if (!vma->vm_next && vma->vm_end < end)
+ return -EFAULT;
+ if (qp->prev && qp->prev->vm_end < vma->vm_start)
+ return -EFAULT;
+ }
+
+ qp->prev = vma;
+ walk->skip = 1;
+
+ if (vma->vm_flags & VM_PFNMAP)
+ return 0;
+
+ if (flags & MPOL_MF_LAZY) {
+ change_prot_numa(vma, start, endvma);
+ return 0;
+ }
+
+ if ((flags & MPOL_MF_STRICT) ||
+ ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
+ vma_migratable(vma)))
+ /* queue pages from current vma */
+ walk->skip = 0;
+ return 0;
+}
+
/*
* Walk through page tables and collect pages to be migrated.
*
@@ -651,51 +623,29 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
*/
static struct vm_area_struct *
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
- const nodemask_t *nodes, unsigned long flags, void *private)
+ nodemask_t *nodes, unsigned long flags,
+ struct list_head *pagelist)
{
int err;
- struct vm_area_struct *first, *vma, *prev;
-
-
- first = find_vma(mm, start);
- if (!first)
- return ERR_PTR(-EFAULT);
- prev = NULL;
- for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
- unsigned long endvma = vma->vm_end;
-
- if (endvma > end)
- endvma = end;
- if (vma->vm_start > start)
- start = vma->vm_start;
-
- if (!(flags & MPOL_MF_DISCONTIG_OK)) {
- if (!vma->vm_next && vma->vm_end < end)
- return ERR_PTR(-EFAULT);
- if (prev && prev->vm_end < vma->vm_start)
- return ERR_PTR(-EFAULT);
- }
-
- if (flags & MPOL_MF_LAZY) {
- change_prot_numa(vma, start, endvma);
- goto next;
- }
-
- if ((flags & MPOL_MF_STRICT) ||
- ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
- vma_migratable(vma))) {
-
- err = queue_pages_pgd_range(vma, start, endvma, nodes,
- flags, private);
- if (err) {
- first = ERR_PTR(err);
- break;
- }
- }
-next:
- prev = vma;
- }
- return first;
+ struct queue_pages qp = {
+ .pagelist = pagelist,
+ .flags = flags,
+ .nmask = nodes,
+ .prev = NULL,
+ };
+ struct mm_walk queue_pages_walk = {
+ .hugetlb_entry = queue_pages_hugetlb,
+ .pte_entry = queue_pages_pte,
+ .test_walk = queue_pages_test_walk,
+ .mm = mm,
+ .private = &qp,
+ };
+
+ err = walk_page_range(start, end, &queue_pages_walk);
+ if (err < 0)
+ return ERR_PTR(err);
+ else
+ return find_vma(mm, start);
}
/*
@@ -2645,7 +2595,7 @@ void __init numa_policy_init(void)
node_set(prefer, interleave_nodes);
if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
- printk("numa_policy_init: interleaving failed\n");
+ pr_err("%s: interleaving failed\n", __func__);
check_numabalancing_enable();
}
diff --git a/mm/mempool.c b/mm/mempool.c
index 905434f18c97..455d468c3a5d 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -192,6 +192,7 @@ EXPORT_SYMBOL(mempool_resize);
* returns NULL. Note that due to preallocation, this function
* *never* fails when called from process contexts. (it might
* fail if called from an IRQ context.)
+ * Note: using __GFP_ZERO is not supported.
*/
void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
{
@@ -200,6 +201,7 @@ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
wait_queue_t wait;
gfp_t gfp_temp;
+ VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
might_sleep_if(gfp_mask & __GFP_WAIT);
gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */
diff --git a/mm/mmap.c b/mm/mmap.c
index b1202cf81f4b..9db71234e00f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -6,6 +6,8 @@
* Address space accounting code <alan@lxorguk.ukuu.org.uk>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
@@ -37,6 +39,7 @@
#include <linux/sched/sysctl.h>
#include <linux/notifier.h>
#include <linux/memory.h>
+#include <linux/printk.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -361,20 +364,20 @@ static int browse_rb(struct rb_root *root)
struct vm_area_struct *vma;
vma = rb_entry(nd, struct vm_area_struct, vm_rb);
if (vma->vm_start < prev) {
- printk("vm_start %lx prev %lx\n", vma->vm_start, prev);
+ pr_info("vm_start %lx prev %lx\n", vma->vm_start, prev);
bug = 1;
}
if (vma->vm_start < pend) {
- printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
+ pr_info("vm_start %lx pend %lx\n", vma->vm_start, pend);
bug = 1;
}
if (vma->vm_start > vma->vm_end) {
- printk("vm_end %lx < vm_start %lx\n",
+ pr_info("vm_end %lx < vm_start %lx\n",
vma->vm_end, vma->vm_start);
bug = 1;
}
if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
- printk("free gap %lx, correct %lx\n",
+ pr_info("free gap %lx, correct %lx\n",
vma->rb_subtree_gap,
vma_compute_subtree_gap(vma));
bug = 1;
@@ -388,7 +391,7 @@ static int browse_rb(struct rb_root *root)
for (nd = pn; nd; nd = rb_prev(nd))
j++;
if (i != j) {
- printk("backwards %d, forwards %d\n", j, i);
+ pr_info("backwards %d, forwards %d\n", j, i);
bug = 1;
}
return bug ? -1 : i;
@@ -423,17 +426,17 @@ static void validate_mm(struct mm_struct *mm)
i++;
}
if (i != mm->map_count) {
- printk("map_count %d vm_next %d\n", mm->map_count, i);
+ pr_info("map_count %d vm_next %d\n", mm->map_count, i);
bug = 1;
}
if (highest_address != mm->highest_vm_end) {
- printk("mm->highest_vm_end %lx, found %lx\n",
+ pr_info("mm->highest_vm_end %lx, found %lx\n",
mm->highest_vm_end, highest_address);
bug = 1;
}
i = browse_rb(&mm->mm_rb);
if (i != mm->map_count) {
- printk("map_count %d rb %d\n", mm->map_count, i);
+ pr_info("map_count %d rb %d\n", mm->map_count, i);
bug = 1;
}
BUG_ON(bug);
@@ -640,11 +643,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct address_space *mapping = NULL;
- if (vma->vm_file)
+ if (vma->vm_file) {
mapping = vma->vm_file->f_mapping;
-
- if (mapping)
mutex_lock(&mapping->i_mmap_mutex);
+ }
__vma_link(mm, vma, prev, rb_link, rb_parent);
__vma_link_file(vma);
@@ -2965,9 +2967,7 @@ int install_special_mapping(struct mm_struct *mm,
struct vm_area_struct *vma = _install_special_mapping(mm,
addr, len, vm_flags, pages);
- if (IS_ERR(vma))
- return PTR_ERR(vma);
- return 0;
+ return PTR_ERR_OR_ZERO(vma);
}
static DEFINE_MUTEX(mm_all_locks_mutex);
@@ -3252,7 +3252,7 @@ static struct notifier_block reserve_mem_nb = {
static int __meminit init_reserve_notifier(void)
{
if (register_hotmemory_notifier(&reserve_mem_nb))
- printk("Failed registering memory add/remove notifier for admin reserve");
+ pr_err("Failed registering memory add/remove notifier for admin reserve\n");
return 0;
}
diff --git a/mm/nommu.c b/mm/nommu.c
index 85f8d6698d48..b78e3a8f5ee7 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -13,6 +13,8 @@
* Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
@@ -32,6 +34,7 @@
#include <linux/syscalls.h>
#include <linux/audit.h>
#include <linux/sched/sysctl.h>
+#include <linux/printk.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
@@ -1246,7 +1249,7 @@ error_free:
return ret;
enomem:
- printk("Allocation of length %lu from process %d (%s) failed\n",
+ pr_err("Allocation of length %lu from process %d (%s) failed\n",
len, current->pid, current->comm);
show_free_areas(0);
return -ENOMEM;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5dba2933c9c0..921ef3435968 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -261,8 +261,9 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
} while (zone_span_seqretry(zone, seq));
if (ret)
- pr_err("page %lu outside zone [ %lu - %lu ]\n",
- pfn, start_pfn, start_pfn + sp);
+ pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
+ pfn, zone_to_nid(zone), zone->name,
+ start_pfn, start_pfn + sp);
return ret;
}
@@ -698,6 +699,8 @@ static void free_pcppages_bulk(struct zone *zone, int count,
page = list_entry(list->prev, struct page, lru);
/* must delete as __free_one_page list manipulates */
list_del(&page->lru);
+
+ VM_BUG_ON(!check_freepage_migratetype(page));
mt = get_freepage_migratetype(page);
/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
__free_one_page(page, zone, 0, mt);
@@ -931,6 +934,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
rmv_page_order(page);
area->nr_free--;
expand(zone, page, order, current_order, area, migratetype);
+ set_freepage_migratetype(page, migratetype);
return page;
}
@@ -1057,7 +1061,9 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page,
/*
* When borrowing from MIGRATE_CMA, we need to release the excess
- * buddy pages to CMA itself.
+ * buddy pages to CMA itself. We also ensure the freepage_migratetype
+ * is set to CMA so it is returned to the correct freelist in case
+ * the page ends up being not actually allocated from the pcp lists.
*/
if (is_migrate_cma(fallback_type))
return fallback_type;
@@ -1125,6 +1131,12 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
expand(zone, page, order, current_order, area,
new_type);
+ /* The freepage_migratetype may differ from pageblock's
+ * migratetype depending on the decisions in
+ * try_to_steal_freepages. This is OK as long as it does
+ * not differ for MIGRATE_CMA type.
+ */
+ set_freepage_migratetype(page, new_type);
trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, migratetype, new_type);
@@ -1175,13 +1187,14 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, int cold)
{
- int mt = migratetype, i;
+ int i;
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype);
if (unlikely(page == NULL))
break;
+ VM_BUG_ON(!check_freepage_migratetype(page));
/*
* Split buddy pages returned by expand() are received here
@@ -1196,14 +1209,8 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
list_add(&page->lru, list);
else
list_add_tail(&page->lru, list);
- if (IS_ENABLED(CONFIG_CMA)) {
- mt = get_pageblock_migratetype(page);
- if (!is_migrate_cma(mt) && !is_migrate_isolate(mt))
- mt = migratetype;
- }
- set_freepage_migratetype(page, mt);
list = &page->lru;
- if (is_migrate_cma(mt))
+ if (is_migrate_cma(get_freepage_migratetype(page)))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
-(1 << order));
}
@@ -1572,7 +1579,7 @@ again:
if (!page)
goto failed;
__mod_zone_freepage_state(zone, -(1 << order),
- get_pageblock_migratetype(page));
+ get_freepage_migratetype(page));
}
__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
@@ -1850,18 +1857,8 @@ static bool zone_local(struct zone *local_zone, struct zone *zone)
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
- return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
-}
-
-static void __paginginit init_zone_allows_reclaim(int nid)
-{
- int i;
-
- for_each_node_state(i, N_MEMORY)
- if (node_distance(nid, i) <= RECLAIM_DISTANCE)
- node_set(i, NODE_DATA(nid)->reclaim_nodes);
- else
- zone_reclaim_mode = 1;
+ return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
+ RECLAIM_DISTANCE;
}
#else /* CONFIG_NUMA */
@@ -1895,9 +1892,6 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
return true;
}
-static inline void init_zone_allows_reclaim(int nid)
-{
-}
#endif /* CONFIG_NUMA */
/*
@@ -2697,7 +2691,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
int migratetype = allocflags_to_migratetype(gfp_mask);
unsigned int cpuset_mems_cookie;
int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
- struct mem_cgroup *memcg = NULL;
gfp_mask &= gfp_allowed_mask;
@@ -2716,13 +2709,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
if (unlikely(!zonelist->_zonerefs->zone))
return NULL;
- /*
- * Will only have any effect when __GFP_KMEMCG is set. This is
- * verified in the (always inline) callee
- */
- if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
- return NULL;
-
retry_cpuset:
cpuset_mems_cookie = read_mems_allowed_begin();
@@ -2782,8 +2768,6 @@ out:
if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
goto retry_cpuset;
- memcg_kmem_commit_charge(page, memcg, order);
-
return page;
}
EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -2837,27 +2821,51 @@ void free_pages(unsigned long addr, unsigned int order)
EXPORT_SYMBOL(free_pages);
/*
- * __free_memcg_kmem_pages and free_memcg_kmem_pages will free
- * pages allocated with __GFP_KMEMCG.
+ * alloc_kmem_pages charges newly allocated pages to the kmem resource counter
+ * of the current memory cgroup.
*
- * Those pages are accounted to a particular memcg, embedded in the
- * corresponding page_cgroup. To avoid adding a hit in the allocator to search
- * for that information only to find out that it is NULL for users who have no
- * interest in that whatsoever, we provide these functions.
- *
- * The caller knows better which flags it relies on.
+ * It should be used when the caller would like to use kmalloc, but since the
+ * allocation is large, it has to fall back to the page allocator.
+ */
+struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order)
+{
+ struct page *page;
+ struct mem_cgroup *memcg = NULL;
+
+ if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
+ return NULL;
+ page = alloc_pages(gfp_mask, order);
+ memcg_kmem_commit_charge(page, memcg, order);
+ return page;
+}
+
+struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
+{
+ struct page *page;
+ struct mem_cgroup *memcg = NULL;
+
+ if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
+ return NULL;
+ page = alloc_pages_node(nid, gfp_mask, order);
+ memcg_kmem_commit_charge(page, memcg, order);
+ return page;
+}
+
+/*
+ * __free_kmem_pages and free_kmem_pages will free pages allocated with
+ * alloc_kmem_pages.
*/
-void __free_memcg_kmem_pages(struct page *page, unsigned int order)
+void __free_kmem_pages(struct page *page, unsigned int order)
{
memcg_kmem_uncharge_pages(page, order);
__free_pages(page, order);
}
-void free_memcg_kmem_pages(unsigned long addr, unsigned int order)
+void free_kmem_pages(unsigned long addr, unsigned int order)
{
if (addr != 0) {
VM_BUG_ON(!virt_addr_valid((void *)addr));
- __free_memcg_kmem_pages(virt_to_page((void *)addr), order);
+ __free_kmem_pages(virt_to_page((void *)addr), order);
}
}
@@ -4921,8 +4929,6 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
pgdat->node_id = nid;
pgdat->node_start_pfn = node_start_pfn;
- if (node_state(nid, N_MEMORY))
- init_zone_allows_reclaim(nid);
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
#endif
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 2beeabf502c5..b2a075ffb96e 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -3,29 +3,58 @@
#include <linux/sched.h>
#include <linux/hugetlb.h>
-static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
- struct mm_walk *walk)
+/*
+ * Check the current skip status of page table walker.
+ *
+ * Here what I mean by skip is to skip lower level walking, and that was
+ * determined for each entry independently. For example, when walk_pmd_range
+ * handles a pmd_trans_huge we don't have to walk over ptes under that pmd,
+ * and the skipping does not affect the walking over ptes under other pmds.
+ * That's why we reset @walk->skip after tested.
+ */
+static bool skip_lower_level_walking(struct mm_walk *walk)
{
+ if (walk->skip) {
+ walk->skip = 0;
+ return true;
+ }
+ return false;
+}
+
+static int walk_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ struct mm_struct *mm = walk->mm;
pte_t *pte;
+ pte_t *orig_pte;
+ spinlock_t *ptl;
int err = 0;
- pte = pte_offset_map(pmd, addr);
- for (;;) {
+ orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ do {
+ if (pte_none(*pte)) {
+ if (walk->pte_hole)
+ err = walk->pte_hole(addr, addr + PAGE_SIZE,
+ walk);
+ if (err)
+ break;
+ continue;
+ }
+ /*
+ * Callers should have their own way to handle swap entries
+ * in walk->pte_entry().
+ */
err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
if (err)
break;
- addr += PAGE_SIZE;
- if (addr == end)
- break;
- pte++;
- }
-
- pte_unmap(pte);
- return err;
+ } while (pte++, addr += PAGE_SIZE, addr < end);
+ pte_unmap_unlock(orig_pte, ptl);
+ cond_resched();
+ return addr == end ? 0 : err;
}
-static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
- struct mm_walk *walk)
+static int walk_pmd_range(pud_t *pud, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
{
pmd_t *pmd;
unsigned long next;
@@ -35,6 +64,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
do {
again:
next = pmd_addr_end(addr, end);
+
if (pmd_none(*pmd)) {
if (walk->pte_hole)
err = walk->pte_hole(addr, next, walk);
@@ -42,35 +72,32 @@ again:
break;
continue;
}
- /*
- * This implies that each ->pmd_entry() handler
- * needs to know about pmd_trans_huge() pmds
- */
- if (walk->pmd_entry)
- err = walk->pmd_entry(pmd, addr, next, walk);
- if (err)
- break;
- /*
- * Check this here so we only break down trans_huge
- * pages when we _need_ to
- */
- if (!walk->pte_entry)
- continue;
+ if (walk->pmd_entry) {
+ err = walk->pmd_entry(pmd, addr, next, walk);
+ if (skip_lower_level_walking(walk))
+ continue;
+ if (err)
+ break;
+ }
- split_huge_page_pmd_mm(walk->mm, addr, pmd);
- if (pmd_none_or_trans_huge_or_clear_bad(pmd))
- goto again;
- err = walk_pte_range(pmd, addr, next, walk);
- if (err)
- break;
- } while (pmd++, addr = next, addr != end);
+ if (walk->pte_entry) {
+ if (walk->vma) {
+ split_huge_page_pmd(walk->vma, addr, pmd);
+ if (pmd_trans_unstable(pmd))
+ goto again;
+ }
+ err = walk_pte_range(pmd, addr, next, walk);
+ if (err)
+ break;
+ }
+ } while (pmd++, addr = next, addr < end);
return err;
}
-static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
- struct mm_walk *walk)
+static int walk_pud_range(pgd_t *pgd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
{
pud_t *pud;
unsigned long next;
@@ -79,6 +106,7 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
+
if (pud_none_or_clear_bad(pud)) {
if (walk->pte_hole)
err = walk->pte_hole(addr, next, walk);
@@ -86,13 +114,58 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
break;
continue;
}
- if (walk->pud_entry)
+
+ if (walk->pud_entry) {
err = walk->pud_entry(pud, addr, next, walk);
- if (!err && (walk->pmd_entry || walk->pte_entry))
+ if (skip_lower_level_walking(walk))
+ continue;
+ if (err)
+ break;
+ }
+
+ if (walk->pmd_entry || walk->pte_entry) {
err = walk_pmd_range(pud, addr, next, walk);
- if (err)
- break;
- } while (pud++, addr = next, addr != end);
+ if (err)
+ break;
+ }
+ } while (pud++, addr = next, addr < end);
+
+ return err;
+}
+
+static int walk_pgd_range(unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ pgd_t *pgd;
+ unsigned long next;
+ int err = 0;
+
+ pgd = pgd_offset(walk->mm, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+
+ if (pgd_none_or_clear_bad(pgd)) {
+ if (walk->pte_hole)
+ err = walk->pte_hole(addr, next, walk);
+ if (err)
+ break;
+ continue;
+ }
+
+ if (walk->pgd_entry) {
+ err = walk->pgd_entry(pgd, addr, next, walk);
+ if (skip_lower_level_walking(walk))
+ continue;
+ if (err)
+ break;
+ }
+
+ if (walk->pud_entry || walk->pmd_entry || walk->pte_entry) {
+ err = walk_pud_range(pgd, addr, next, walk);
+ if (err)
+ break;
+ }
+ } while (pgd++, addr = next, addr < end);
return err;
}
@@ -105,144 +178,180 @@ static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
return boundary < end ? boundary : end;
}
-static int walk_hugetlb_range(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end,
- struct mm_walk *walk)
+static int walk_hugetlb_range(unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
{
+ struct mm_struct *mm = walk->mm;
+ struct vm_area_struct *vma = walk->vma;
struct hstate *h = hstate_vma(vma);
unsigned long next;
unsigned long hmask = huge_page_mask(h);
pte_t *pte;
int err = 0;
+ spinlock_t *ptl;
do {
next = hugetlb_entry_end(h, addr, end);
pte = huge_pte_offset(walk->mm, addr & hmask);
- if (pte && walk->hugetlb_entry)
- err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
+ if (!pte)
+ continue;
+ ptl = huge_pte_lock(h, mm, pte);
+ /*
+ * Callers should have their own way to handle swap entries
+ * in walk->hugetlb_entry().
+ */
+ if (walk->hugetlb_entry)
+ err = walk->hugetlb_entry(pte, addr, next, walk);
+ spin_unlock(ptl);
if (err)
- return err;
+ break;
} while (addr = next, addr != end);
-
- return 0;
+ cond_resched();
+ return err;
}
#else /* CONFIG_HUGETLB_PAGE */
-static int walk_hugetlb_range(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end,
- struct mm_walk *walk)
+static inline int walk_hugetlb_range(unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
{
return 0;
}
#endif /* CONFIG_HUGETLB_PAGE */
+/*
+ * Decide whether we really walk over the current vma on [@start, @end)
+ * or skip it. When we skip it, we set @walk->skip to 1.
+ * The return value is used to control the page table walking to
+ * continue (for zero) or not (for non-zero).
+ *
+ * Default check (only VM_PFNMAP check for now) is used when the caller
+ * doesn't define test_walk() callback.
+ */
+static int walk_page_test(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct vm_area_struct *vma = walk->vma;
+
+ if (walk->test_walk)
+ return walk->test_walk(start, end, walk);
+ /*
+ * Do not walk over vma(VM_PFNMAP), because we have no valid struct
+ * page backing a VM_PFNMAP range. See also commit a9ff785e4437.
+ */
+ if (vma->vm_flags & VM_PFNMAP)
+ walk->skip = 1;
+ return 0;
+}
+
+static int __walk_page_range(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ int err = 0;
+ struct vm_area_struct *vma = walk->vma;
+
+ if (vma && is_vm_hugetlb_page(vma)) {
+ if (walk->hugetlb_entry)
+ err = walk_hugetlb_range(start, end, walk);
+ } else
+ err = walk_pgd_range(start, end, walk);
+
+ return err;
+}
/**
- * walk_page_range - walk a memory map's page tables with a callback
- * @addr: starting address
- * @end: ending address
- * @walk: set of callbacks to invoke for each level of the tree
+ * walk_page_range - walk page table with caller specific callbacks
+ *
+ * Recursively walk the page table tree of the process represented by
+ * @walk->mm within the virtual address range [@start, @end). In walking,
+ * we can call caller-specific callback functions against each entry.
*
- * Recursively walk the page table for the memory area in a VMA,
- * calling supplied callbacks. Callbacks are called in-order (first
- * PGD, first PUD, first PMD, first PTE, second PTE... second PMD,
- * etc.). If lower-level callbacks are omitted, walking depth is reduced.
+ * Before starting to walk page table, some callers want to check whether
+ * they really want to walk over the vma (for example by checking vm_flags.)
+ * walk_page_test() and @walk->test_walk() do that check.
*
- * Each callback receives an entry pointer and the start and end of the
- * associated range, and a copy of the original mm_walk for access to
- * the ->private or ->mm fields.
+ * If any callback returns a non-zero value, the page table walk is aborted
+ * immediately and the return value is propagated back to the caller.
+ * Note that the meaning of the positive returned value can be defined
+ * by the caller for its own purpose.
*
- * Usually no locks are taken, but splitting transparent huge page may
- * take page table lock. And the bottom level iterator will map PTE
- * directories from highmem if necessary.
+ * If the caller defines multiple callbacks in different levels, the
+ * callbacks are called in depth-first manner. It could happen that
+ * multiple callbacks are called on a address. For example if some caller
+ * defines test_walk(), pmd_entry(), and pte_entry(), then callbacks are
+ * called in the order of test_walk(), pmd_entry(), and pte_entry().
+ * If you don't want to go down to lower level at some point and move to
+ * the next entry in the same level, you set @walk->skip to 1.
+ * For example if you succeed to handle some pmd entry as trans_huge entry,
+ * you need not call walk_pte_range() any more, so set it to avoid that.
+ * We can't determine whether to go down to lower level with the return
+ * value of the callback, because the whole range of return values (0, >0,
+ * and <0) are used up for other meanings.
*
- * If any callback returns a non-zero value, the walk is aborted and
- * the return value is propagated back to the caller. Otherwise 0 is returned.
+ * Each callback can access to the vma over which it is doing page table
+ * walk right now via @walk->vma. @walk->vma is set to NULL in walking
+ * outside a vma. If you want to access to some caller-specific data from
+ * callbacks, @walk->private should be helpful.
*
- * walk->mm->mmap_sem must be held for at least read if walk->hugetlb_entry
- * is !NULL.
+ * The callers should hold @walk->mm->mmap_sem. Note that the lower level
+ * iterators can take page table lock in lowest level iteration and/or
+ * in split_huge_page_pmd().
*/
-int walk_page_range(unsigned long addr, unsigned long end,
+int walk_page_range(unsigned long start, unsigned long end,
struct mm_walk *walk)
{
- pgd_t *pgd;
- unsigned long next;
int err = 0;
+ struct vm_area_struct *vma;
+ unsigned long next;
- if (addr >= end)
- return err;
+ if (start >= end)
+ return -EINVAL;
if (!walk->mm)
return -EINVAL;
VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
- pgd = pgd_offset(walk->mm, addr);
do {
- struct vm_area_struct *vma = NULL;
+ vma = find_vma(walk->mm, start);
+ if (!vma) { /* after the last vma */
+ walk->vma = NULL;
+ next = end;
+ } else if (start < vma->vm_start) { /* outside the found vma */
+ walk->vma = NULL;
+ next = vma->vm_start;
+ } else { /* inside the found vma */
+ walk->vma = vma;
+ next = min(end, vma->vm_end);
- next = pgd_addr_end(addr, end);
-
- /*
- * This function was not intended to be vma based.
- * But there are vma special cases to be handled:
- * - hugetlb vma's
- * - VM_PFNMAP vma's
- */
- vma = find_vma(walk->mm, addr);
- if (vma) {
- /*
- * There are no page structures backing a VM_PFNMAP
- * range, so do not allow split_huge_page_pmd().
- */
- if ((vma->vm_start <= addr) &&
- (vma->vm_flags & VM_PFNMAP)) {
- next = vma->vm_end;
- pgd = pgd_offset(walk->mm, next);
- continue;
- }
- /*
- * Handle hugetlb vma individually because pagetable
- * walk for the hugetlb page is dependent on the
- * architecture and we can't handled it in the same
- * manner as non-huge pages.
- */
- if (walk->hugetlb_entry && (vma->vm_start <= addr) &&
- is_vm_hugetlb_page(vma)) {
- if (vma->vm_end < next)
- next = vma->vm_end;
- /*
- * Hugepage is very tightly coupled with vma,
- * so walk through hugetlb entries within a
- * given vma.
- */
- err = walk_hugetlb_range(vma, addr, next, walk);
- if (err)
- break;
- pgd = pgd_offset(walk->mm, next);
+ err = walk_page_test(start, next, walk);
+ if (skip_lower_level_walking(walk))
continue;
- }
- }
-
- if (pgd_none_or_clear_bad(pgd)) {
- if (walk->pte_hole)
- err = walk->pte_hole(addr, next, walk);
if (err)
break;
- pgd++;
- continue;
}
- if (walk->pgd_entry)
- err = walk->pgd_entry(pgd, addr, next, walk);
- if (!err &&
- (walk->pud_entry || walk->pmd_entry || walk->pte_entry))
- err = walk_pud_range(pgd, addr, next, walk);
+ err = __walk_page_range(start, next, walk);
if (err)
break;
- pgd++;
- } while (addr = next, addr < end);
-
+ } while (start = next, start < end);
return err;
}
+
+int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk)
+{
+ int err;
+
+ if (!walk->mm)
+ return -EINVAL;
+
+ VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
+ VM_BUG_ON(!vma);
+ walk->vma = vma;
+ err = walk_page_test(vma->vm_start, vma->vm_end, walk);
+ if (skip_lower_level_walking(walk))
+ return 0;
+ if (err)
+ return err;
+ return __walk_page_range(vma->vm_start, vma->vm_end, walk);
+}
diff --git a/mm/rmap.c b/mm/rmap.c
index 9c3e77396d1a..e065ba798fde 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -515,11 +515,7 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
static inline unsigned long
__vma_address(struct page *page, struct vm_area_struct *vma)
{
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-
- if (unlikely(is_vm_hugetlb_page(vma)))
- pgoff = page->index << huge_page_order(page_hstate(page));
-
+ pgoff_t pgoff = page_pgoff(page);
return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
}
@@ -1024,7 +1020,7 @@ void page_add_new_anon_rmap(struct page *page,
__mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
hpage_nr_pages(page));
__page_set_anon_rmap(page, vma, address, 1);
- if (!mlocked_vma_newpage(vma, page)) {
+ if (!mlocked_vma_newpage(vma, page) && !PageUnevictable(page)) {
SetPageActive(page);
lru_cache_add(page);
} else
@@ -1359,7 +1355,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
if (page->index != linear_page_index(vma, address)) {
pte_t ptfile = pgoff_to_pte(page->index);
if (pte_soft_dirty(pteval))
- pte_file_mksoft_dirty(ptfile);
+ ptfile = pte_file_mksoft_dirty(ptfile);
set_pte_at(mm, address, pte, ptfile);
}
@@ -1609,7 +1605,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page,
static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
{
struct anon_vma *anon_vma;
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ pgoff_t pgoff = page_pgoff(page);
struct anon_vma_chain *avc;
int ret = SWAP_AGAIN;
@@ -1650,7 +1646,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
{
struct address_space *mapping = page->mapping;
- pgoff_t pgoff = page->index << compound_order(page);
+ pgoff_t pgoff = page_pgoff(page);
struct vm_area_struct *vma;
int ret = SWAP_AGAIN;
diff --git a/mm/slab.c b/mm/slab.c
index 388cb1ae6fbc..bae9c32d4c8e 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1681,8 +1681,12 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
flags |= __GFP_RECLAIMABLE;
+ if (memcg_charge_slab(cachep, flags, cachep->gfporder))
+ return NULL;
+
page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
if (!page) {
+ memcg_uncharge_slab(cachep, cachep->gfporder);
if (!(flags & __GFP_NOWARN) && printk_ratelimit())
slab_out_of_memory(cachep, flags, nodeid);
return NULL;
@@ -1741,7 +1745,8 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
memcg_release_pages(cachep, cachep->gfporder);
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += nr_freed;
- __free_memcg_kmem_pages(page, cachep->gfporder);
+ __free_pages(page, cachep->gfporder);
+ memcg_uncharge_slab(cachep, cachep->gfporder);
}
static void kmem_rcu_free(struct rcu_head *head)
@@ -2469,8 +2474,7 @@ out:
return nr_freed;
}
-/* Called with slab_mutex held to protect against cpu hotplug */
-static int __cache_shrink(struct kmem_cache *cachep)
+int __kmem_cache_shrink(struct kmem_cache *cachep)
{
int ret = 0, i = 0;
struct kmem_cache_node *n;
@@ -2491,32 +2495,11 @@ static int __cache_shrink(struct kmem_cache *cachep)
return (ret ? 1 : 0);
}
-/**
- * kmem_cache_shrink - Shrink a cache.
- * @cachep: The cache to shrink.
- *
- * Releases as many slabs as possible for a cache.
- * To help debugging, a zero exit status indicates all slabs were released.
- */
-int kmem_cache_shrink(struct kmem_cache *cachep)
-{
- int ret;
- BUG_ON(!cachep || in_interrupt());
-
- get_online_cpus();
- mutex_lock(&slab_mutex);
- ret = __cache_shrink(cachep);
- mutex_unlock(&slab_mutex);
- put_online_cpus();
- return ret;
-}
-EXPORT_SYMBOL(kmem_cache_shrink);
-
int __kmem_cache_shutdown(struct kmem_cache *cachep)
{
int i;
struct kmem_cache_node *n;
- int rc = __cache_shrink(cachep);
+ int rc = __kmem_cache_shrink(cachep);
if (rc)
return rc;
diff --git a/mm/slab.h b/mm/slab.h
index 3045316b7c9d..6fc2f0050db6 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -91,6 +91,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
int __kmem_cache_shutdown(struct kmem_cache *);
+int __kmem_cache_shrink(struct kmem_cache *);
struct seq_file;
struct file;
@@ -191,6 +192,26 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
return s;
return s->memcg_params->root_cache;
}
+
+static __always_inline int memcg_charge_slab(struct kmem_cache *s,
+ gfp_t gfp, int order)
+{
+ if (!memcg_kmem_enabled())
+ return 0;
+ if (is_root_cache(s))
+ return 0;
+ return memcg_charge_kmem(s->memcg_params->memcg, gfp,
+ PAGE_SIZE << order);
+}
+
+static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
+{
+ if (!memcg_kmem_enabled())
+ return;
+ if (is_root_cache(s))
+ return;
+ memcg_uncharge_kmem(s->memcg_params->memcg, PAGE_SIZE << order);
+}
#else
static inline bool is_root_cache(struct kmem_cache *s)
{
@@ -226,6 +247,15 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
{
return s;
}
+
+static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order)
+{
+ return 0;
+}
+
+static inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
+{
+}
#endif
static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
diff --git a/mm/slab_common.c b/mm/slab_common.c
index f3cfccf76dda..94db71602bfb 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -205,6 +205,8 @@ kmem_cache_create(const char *name, size_t size, size_t align,
int err;
get_online_cpus();
+ get_online_mems();
+
mutex_lock(&slab_mutex);
err = kmem_cache_sanity_check(name, size);
@@ -239,6 +241,8 @@ kmem_cache_create(const char *name, size_t size, size_t align,
out_unlock:
mutex_unlock(&slab_mutex);
+
+ put_online_mems();
put_online_cpus();
if (err) {
@@ -272,6 +276,8 @@ void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_c
char *cache_name;
get_online_cpus();
+ get_online_mems();
+
mutex_lock(&slab_mutex);
/*
@@ -290,15 +296,13 @@ void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_c
root_cache->size, root_cache->align,
root_cache->flags, root_cache->ctor,
memcg, root_cache);
- if (IS_ERR(s)) {
+ if (IS_ERR(s))
kfree(cache_name);
- goto out_unlock;
- }
-
- s->allocflags |= __GFP_KMEMCG;
out_unlock:
mutex_unlock(&slab_mutex);
+
+ put_online_mems();
put_online_cpus();
}
@@ -326,6 +330,8 @@ static int kmem_cache_destroy_memcg_children(struct kmem_cache *s)
void kmem_cache_destroy(struct kmem_cache *s)
{
get_online_cpus();
+ get_online_mems();
+
mutex_lock(&slab_mutex);
s->refcount--;
@@ -354,15 +360,36 @@ void kmem_cache_destroy(struct kmem_cache *s)
memcg_free_cache_params(s);
kfree(s->name);
kmem_cache_free(kmem_cache, s);
- goto out_put_cpus;
+ goto out;
out_unlock:
mutex_unlock(&slab_mutex);
-out_put_cpus:
+out:
+ put_online_mems();
put_online_cpus();
}
EXPORT_SYMBOL(kmem_cache_destroy);
+/**
+ * kmem_cache_shrink - Shrink a cache.
+ * @cachep: The cache to shrink.
+ *
+ * Releases as many slabs as possible for a cache.
+ * To help debugging, a zero exit status indicates all slabs were released.
+ */
+int kmem_cache_shrink(struct kmem_cache *cachep)
+{
+ int ret;
+
+ get_online_cpus();
+ get_online_mems();
+ ret = __kmem_cache_shrink(cachep);
+ put_online_mems();
+ put_online_cpus();
+ return ret;
+}
+EXPORT_SYMBOL(kmem_cache_shrink);
+
int slab_is_available(void)
{
return slab_state >= UP;
@@ -577,6 +604,24 @@ void __init create_kmalloc_caches(unsigned long flags)
}
#endif /* !CONFIG_SLOB */
+/*
+ * To avoid unnecessary overhead, we pass through large allocation requests
+ * directly to the page allocator. We use __GFP_COMP, because we will need to
+ * know the allocation order to free the pages properly in kfree.
+ */
+void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
+{
+ void *ret;
+ struct page *page;
+
+ flags |= __GFP_COMP;
+ page = alloc_kmem_pages(flags, order);
+ ret = page ? page_address(page) : NULL;
+ kmemleak_alloc(ret, size, 1, flags);
+ return ret;
+}
+EXPORT_SYMBOL(kmalloc_order);
+
#ifdef CONFIG_TRACING
void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
{
diff --git a/mm/slob.c b/mm/slob.c
index 730cad45d4be..21980e0f39a8 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -620,11 +620,10 @@ int __kmem_cache_shutdown(struct kmem_cache *c)
return 0;
}
-int kmem_cache_shrink(struct kmem_cache *d)
+int __kmem_cache_shrink(struct kmem_cache *d)
{
return 0;
}
-EXPORT_SYMBOL(kmem_cache_shrink);
struct kmem_cache kmem_cache_boot = {
.name = "kmem_cache",
diff --git a/mm/slub.c b/mm/slub.c
index 5e234f1f8853..521167b1a96a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1317,17 +1317,26 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
/*
* Slab allocation and freeing
*/
-static inline struct page *alloc_slab_page(gfp_t flags, int node,
- struct kmem_cache_order_objects oo)
+static inline struct page *alloc_slab_page(struct kmem_cache *s,
+ gfp_t flags, int node, struct kmem_cache_order_objects oo)
{
+ struct page *page;
int order = oo_order(oo);
flags |= __GFP_NOTRACK;
+ if (memcg_charge_slab(s, flags, order))
+ return NULL;
+
if (node == NUMA_NO_NODE)
- return alloc_pages(flags, order);
+ page = alloc_pages(flags, order);
else
- return alloc_pages_exact_node(node, flags, order);
+ page = alloc_pages_exact_node(node, flags, order);
+
+ if (!page)
+ memcg_uncharge_slab(s, order);
+
+ return page;
}
static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -1349,7 +1358,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
*/
alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
- page = alloc_slab_page(alloc_gfp, node, oo);
+ page = alloc_slab_page(s, alloc_gfp, node, oo);
if (unlikely(!page)) {
oo = s->min;
alloc_gfp = flags;
@@ -1357,7 +1366,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
* Allocation may have failed due to fragmentation.
* Try a lower order alloc if possible
*/
- page = alloc_slab_page(alloc_gfp, node, oo);
+ page = alloc_slab_page(s, alloc_gfp, node, oo);
if (page)
stat(s, ORDER_FALLBACK);
@@ -1473,7 +1482,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
page_mapcount_reset(page);
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += pages;
- __free_memcg_kmem_pages(page, order);
+ __free_pages(page, order);
+ memcg_uncharge_slab(s, order);
}
#define need_reserve_slab_rcu \
@@ -3325,8 +3335,8 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
struct page *page;
void *ptr = NULL;
- flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG;
- page = alloc_pages_node(node, flags, get_order(size));
+ flags |= __GFP_COMP | __GFP_NOTRACK;
+ page = alloc_kmem_pages_node(node, flags, get_order(size));
if (page)
ptr = page_address(page);
@@ -3395,7 +3405,7 @@ void kfree(const void *x)
if (unlikely(!PageSlab(page))) {
BUG_ON(!PageCompound(page));
kfree_hook(x);
- __free_memcg_kmem_pages(page, compound_order(page));
+ __free_kmem_pages(page, compound_order(page));
return;
}
slab_free(page->slab_cache, page, object, _RET_IP_);
@@ -3412,7 +3422,7 @@ EXPORT_SYMBOL(kfree);
* being allocated from last increasing the chance that the last objects
* are freed in them.
*/
-int kmem_cache_shrink(struct kmem_cache *s)
+int __kmem_cache_shrink(struct kmem_cache *s)
{
int node;
int i;
@@ -3468,7 +3478,6 @@ int kmem_cache_shrink(struct kmem_cache *s)
kfree(slabs_by_inuse);
return 0;
}
-EXPORT_SYMBOL(kmem_cache_shrink);
static int slab_mem_going_offline_callback(void *arg)
{
@@ -3476,7 +3485,7 @@ static int slab_mem_going_offline_callback(void *arg)
mutex_lock(&slab_mutex);
list_for_each_entry(s, &slab_caches, list)
- kmem_cache_shrink(s);
+ __kmem_cache_shrink(s);
mutex_unlock(&slab_mutex);
return 0;
@@ -4352,7 +4361,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
}
}
- lock_memory_hotplug();
+ get_online_mems();
#ifdef CONFIG_SLUB_DEBUG
if (flags & SO_ALL) {
for_each_node_state(node, N_NORMAL_MEMORY) {
@@ -4392,7 +4401,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
x += sprintf(buf + x, " N%d=%lu",
node, nodes[node]);
#endif
- unlock_memory_hotplug();
+ put_online_mems();
kfree(nodes);
return x + sprintf(buf + x, "\n");
}
@@ -5071,15 +5080,18 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
#ifdef CONFIG_MEMCG_KMEM
int i;
char *buffer = NULL;
+ struct kmem_cache *root_cache;
- if (!is_root_cache(s))
+ if (is_root_cache(s))
return;
+ root_cache = s->memcg_params->root_cache;
+
/*
* This mean this cache had no attribute written. Therefore, no point
* in copying default values around
*/
- if (!s->max_attr_size)
+ if (!root_cache->max_attr_size)
return;
for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) {
@@ -5101,7 +5113,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
*/
if (buffer)
buf = buffer;
- else if (s->max_attr_size < ARRAY_SIZE(mbuf))
+ else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf))
buf = mbuf;
else {
buffer = (char *) get_zeroed_page(GFP_KERNEL);
@@ -5110,7 +5122,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
buf = buffer;
}
- attr->show(s->memcg_params->root_cache, buf);
+ attr->show(root_cache, buf);
attr->store(s, buf, strlen(buf));
}
diff --git a/mm/swap.c b/mm/swap.c
index 9ce43ba4498b..c0ed4d65438f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -582,13 +582,7 @@ void mark_page_accessed(struct page *page)
}
EXPORT_SYMBOL(mark_page_accessed);
-/*
- * Queue the page for addition to the LRU via pagevec. The decision on whether
- * to add the page to the [in]active [file|anon] list is deferred until the
- * pagevec is drained. This gives a chance for the caller of __lru_cache_add()
- * have the page added to the active list using mark_page_accessed().
- */
-void __lru_cache_add(struct page *page)
+static void __lru_cache_add(struct page *page)
{
struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
@@ -598,11 +592,32 @@ void __lru_cache_add(struct page *page)
pagevec_add(pvec, page);
put_cpu_var(lru_add_pvec);
}
-EXPORT_SYMBOL(__lru_cache_add);
+
+/**
+ * lru_cache_add: add a page to the page lists
+ * @page: the page to add
+ */
+void lru_cache_add_anon(struct page *page)
+{
+ ClearPageActive(page);
+ __lru_cache_add(page);
+}
+
+void lru_cache_add_file(struct page *page)
+{
+ ClearPageActive(page);
+ __lru_cache_add(page);
+}
+EXPORT_SYMBOL(lru_cache_add_file);
/**
* lru_cache_add - add a page to a page list
* @page: the page to be added to the LRU.
+ *
+ * Queue the page for addition to the LRU via pagevec. The decision on whether
+ * to add the page to the [in]active [file|anon] list is deferred until the
+ * pagevec is drained. This gives a chance for the caller of lru_cache_add()
+ * have the page added to the active list using mark_page_accessed().
*/
void lru_cache_add(struct page *page)
{
diff --git a/mm/truncate.c b/mm/truncate.c
index e5cc39ab0751..6a78c814bebf 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -484,14 +484,6 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
unsigned long count = 0;
int i;
- /*
- * Note: this function may get called on a shmem/tmpfs mapping:
- * pagevec_lookup() might then return 0 prematurely (because it
- * got a gangful of swap entries); but it's hardly worth worrying
- * about - it can rarely have anything to free from such a mapping
- * (most pages are dirty), and already skips over any difficulties.
- */
-
pagevec_init(&pvec, 0);
while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
diff --git a/mm/util.c b/mm/util.c
index f380af7ea779..efadeaaef81e 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -3,6 +3,7 @@
#include <linux/string.h>
#include <linux/compiler.h>
#include <linux/export.h>
+#include <linux/ctype.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/security.h>
@@ -64,6 +65,35 @@ char *kstrndup(const char *s, size_t max, gfp_t gfp)
EXPORT_SYMBOL(kstrndup);
/**
+ * kstrimdup - Trim and copy a %NUL terminated string.
+ * @s: the string to trim and duplicate
+ * @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * Returns an address, which the caller must kfree, containing
+ * a duplicate of the passed string with leading and/or trailing
+ * whitespace (as defined by isspace) removed.
+ */
+char *kstrimdup(const char *s, gfp_t gfp)
+{
+ char *buf;
+ char *begin = skip_spaces(s);
+ size_t len = strlen(begin);
+
+ while (len && isspace(begin[len - 1]))
+ len--;
+
+ buf = kmalloc_track_caller(len + 1, gfp);
+ if (!buf)
+ return NULL;
+
+ memcpy(buf, begin, len);
+ buf[len] = '\0';
+
+ return buf;
+}
+EXPORT_SYMBOL(kstrimdup);
+
+/**
* kmemdup - duplicate region of memory
*
* @src: memory region to duplicate
diff --git a/mm/vmacache.c b/mm/vmacache.c
index d4224b397c0e..61c38ae9f54b 100644
--- a/mm/vmacache.c
+++ b/mm/vmacache.c
@@ -17,6 +17,16 @@ void vmacache_flush_all(struct mm_struct *mm)
{
struct task_struct *g, *p;
+ /*
+ * Single threaded tasks need not iterate the entire
+ * list of process. We can avoid the flushing as well
+ * since the mm's seqnum was increased and don't have
+ * to worry about other threads' seqnum. Current's
+ * flush will occur upon the next lookup.
+ */
+ if (atomic_read(&mm->mm_users) == 1)
+ return;
+
rcu_read_lock();
for_each_process_thread(g, p) {
/*
@@ -78,11 +88,14 @@ struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
if (!vmacache_valid(mm))
return NULL;
+ count_vm_vmacache_event(VMACACHE_FIND_CALLS);
+
for (i = 0; i < VMACACHE_SIZE; i++) {
struct vm_area_struct *vma = current->vmacache[i];
if (vma && vma->vm_start <= addr && vma->vm_end > addr) {
BUG_ON(vma->vm_mm != mm);
+ count_vm_vmacache_event(VMACACHE_FIND_HITS);
return vma;
}
}
@@ -100,11 +113,15 @@ struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
if (!vmacache_valid(mm))
return NULL;
+ count_vm_vmacache_event(VMACACHE_FIND_CALLS);
+
for (i = 0; i < VMACACHE_SIZE; i++) {
struct vm_area_struct *vma = current->vmacache[i];
- if (vma && vma->vm_start == start && vma->vm_end == end)
+ if (vma && vma->vm_start == start && vma->vm_end == end) {
+ count_vm_vmacache_event(VMACACHE_FIND_HITS);
return vma;
+ }
}
return NULL;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 11062a64a010..b7908f7118f0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -11,6 +11,8 @@
* Multiqueue VM started 5.8.00, Rik van Riel.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/gfp.h>
@@ -43,6 +45,7 @@
#include <linux/sysctl.h>
#include <linux/oom.h>
#include <linux/prefetch.h>
+#include <linux/printk.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -477,7 +480,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
if (page_has_private(page)) {
if (try_to_free_buffers(page)) {
ClearPageDirty(page);
- printk("%s: orphaned page\n", __func__);
+ pr_info("%s: orphaned page\n", __func__);
return PAGE_CLEAN;
}
}
@@ -1866,6 +1869,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
bool force_scan = false;
unsigned long ap, fp;
enum lru_list lru;
+ bool some_scanned;
+ int pass;
/*
* If the zone or memcg is small, nr[l] can be 0. This
@@ -1971,39 +1976,49 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
fraction[1] = fp;
denominator = ap + fp + 1;
out:
- for_each_evictable_lru(lru) {
- int file = is_file_lru(lru);
- unsigned long size;
- unsigned long scan;
+ some_scanned = false;
+ /* Only use force_scan on second pass. */
+ for (pass = 0; !some_scanned && pass < 2; pass++) {
+ for_each_evictable_lru(lru) {
+ int file = is_file_lru(lru);
+ unsigned long size;
+ unsigned long scan;
- size = get_lru_size(lruvec, lru);
- scan = size >> sc->priority;
+ size = get_lru_size(lruvec, lru);
+ scan = size >> sc->priority;
- if (!scan && force_scan)
- scan = min(size, SWAP_CLUSTER_MAX);
+ if (!scan && pass && force_scan)
+ scan = min(size, SWAP_CLUSTER_MAX);
- switch (scan_balance) {
- case SCAN_EQUAL:
- /* Scan lists relative to size */
- break;
- case SCAN_FRACT:
+ switch (scan_balance) {
+ case SCAN_EQUAL:
+ /* Scan lists relative to size */
+ break;
+ case SCAN_FRACT:
+ /*
+ * Scan types proportional to swappiness and
+ * their relative recent reclaim efficiency.
+ */
+ scan = div64_u64(scan * fraction[file],
+ denominator);
+ break;
+ case SCAN_FILE:
+ case SCAN_ANON:
+ /* Scan one type exclusively */
+ if ((scan_balance == SCAN_FILE) != file)
+ scan = 0;
+ break;
+ default:
+ /* Look ma, no brain */
+ BUG();
+ }
+ nr[lru] = scan;
/*
- * Scan types proportional to swappiness and
- * their relative recent reclaim efficiency.
+ * Skip the second pass and don't force_scan,
+ * if we found something to scan.
*/
- scan = div64_u64(scan * fraction[file], denominator);
- break;
- case SCAN_FILE:
- case SCAN_ANON:
- /* Scan one type exclusively */
- if ((scan_balance == SCAN_FILE) != file)
- scan = 0;
- break;
- default:
- /* Look ma, no brain */
- BUG();
+ some_scanned |= !!scan;
}
- nr[lru] = scan;
}
}
@@ -2507,10 +2522,17 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
for (i = 0; i <= ZONE_NORMAL; i++) {
zone = &pgdat->node_zones[i];
+ if (!populated_zone(zone))
+ continue;
+
pfmemalloc_reserve += min_wmark_pages(zone);
free_pages += zone_page_state(zone, NR_FREE_PAGES);
}
+ /* If there are no reserves (unexpected config) then do not throttle */
+ if (!pfmemalloc_reserve)
+ return true;
+
wmark_ok = free_pages > pfmemalloc_reserve / 2;
/* kswapd must be awake if processes are being throttled */
@@ -2535,9 +2557,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
nodemask_t *nodemask)
{
+ struct zoneref *z;
struct zone *zone;
- int high_zoneidx = gfp_zone(gfp_mask);
- pg_data_t *pgdat;
+ pg_data_t *pgdat = NULL;
/*
* Kernel threads should not be throttled as they may be indirectly
@@ -2556,10 +2578,24 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
if (fatal_signal_pending(current))
goto out;
- /* Check if the pfmemalloc reserves are ok */
- first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
- pgdat = zone->zone_pgdat;
- if (pfmemalloc_watermark_ok(pgdat))
+ /*
+ * Check if the pfmemalloc reserves are ok by finding the first node
+ * with a usable ZONE_NORMAL or lower zone
+ */
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ gfp_mask, nodemask) {
+ if (zone_idx(zone) > ZONE_NORMAL)
+ continue;
+
+ /* Throttle based on the first usable node */
+ pgdat = zone->zone_pgdat;
+ if (pfmemalloc_watermark_ok(pgdat))
+ goto out;
+ break;
+ }
+
+ /* If no zone was usable by the allocation flags then do not throttle */
+ if (!pgdat)
goto out;
/* Account for the throttling */
@@ -3404,7 +3440,7 @@ int kswapd_run(int nid)
/*
* Called by memory hotplug when all memory in a node is offlined. Caller must
- * hold lock_memory_hotplug().
+ * hold mem_hotplug_begin/end().
*/
void kswapd_stop(int nid)
{
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 302dd076b8bf..82ce17ce58c4 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -866,6 +866,10 @@ const char * const vmstat_text[] = {
"nr_tlb_local_flush_one",
#endif /* CONFIG_DEBUG_TLBFLUSH */
+#ifdef CONFIG_DEBUG_VM_VMACACHE
+ "vmacache_find_calls",
+ "vmacache_find_hits",
+#endif
#endif /* CONFIG_VM_EVENTS_COUNTERS */
};
#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 34eb2160489d..89f44f171eaf 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -397,6 +397,11 @@ foreach my $entry (@mode_permission_funcs) {
$mode_perms_search .= $entry->[0];
}
+our $declaration_macros = qr{(?x:
+ (?:$Storage\s+)?(?:DECLARE|DEFINE)_[A-Z]+\s*\(|
+ (?:$Storage\s+)?LIST_HEAD\s*\(
+)};
+
our $allowed_asm_includes = qr{(?x:
irq|
memory
@@ -2093,8 +2098,10 @@ sub process {
foreach my $compat (@compats) {
my $compat2 = $compat;
- $compat2 =~ s/\,[a-z]*\-/\,<\.\*>\-/;
- `grep -Erq "$compat|$compat2" $dt_path`;
+ $compat2 =~ s/\,[a-zA-Z0-9]*\-/\,<\.\*>\-/;
+ my $compat3 = $compat;
+ $compat3 =~ s/\,([a-z]*)[0-9]*\-/\,$1<\.\*>\-/;
+ `grep -Erq "$compat|$compat2|$compat3" $dt_path`;
if ( $? >> 8 ) {
WARN("UNDOCUMENTED_DT_STRING",
"DT compatible string \"$compat\" appears un-documented -- check $dt_path\n" . $herecurr);
@@ -2266,18 +2273,35 @@ sub process {
}
# check for missing blank lines after declarations
- if ($realfile =~ m@^(drivers/net/|net/)@ &&
- $prevline =~ /^\+\s+$Declare\s+$Ident/ &&
- !($prevline =~ /(?:$Compare|$Assignment|$Operators)\s*$/ ||
- $prevline =~ /(?:\{\s*|\\)$/) && #extended lines
- $sline =~ /^\+\s+/ && #Not at char 1
- !($sline =~ /^\+\s+$Declare/ ||
- $sline =~ /^\+\s+$Ident\s+$Ident/ || #eg: typedef foo
+ if ($sline =~ /^\+\s+\S/ && #Not at char 1
+ # actual declarations
+ ($prevline =~ /^\+\s+$Declare\s*$Ident\s*[=,;\[]/ ||
+ # foo bar; where foo is some local typedef or #define
+ $prevline =~ /^\+\s+$Ident(?:\s+|\s*\*\s*)$Ident\s*[=,;\[]/ ||
+ # known declaration macros
+ $prevline =~ /^\+\s+$declaration_macros/) &&
+ # for "else if" which can look like "$Ident $Ident"
+ !($prevline =~ /^\+\s+$c90_Keywords\b/ ||
+ # other possible extensions of declaration lines
+ $prevline =~ /(?:$Compare|$Assignment|$Operators)\s*$/ ||
+ # not starting a section or a macro "\" extended line
+ $prevline =~ /(?:\{\s*|\\)$/) &&
+ # looks like a declaration
+ !($sline =~ /^\+\s+$Declare\s*$Ident\s*[=,;\[]/ ||
+ # foo bar; where foo is some local typedef or #define
+ $sline =~ /^\+\s+$Ident(?:\s+|\s*\*\s*)$Ident\s*[=,;\[]/ ||
+ # known declaration macros
+ $sline =~ /^\+\s+$declaration_macros/ ||
+ # start of struct or union or enum
$sline =~ /^\+\s+(?:union|struct|enum|typedef)\b/ ||
- $sline =~ /^\+\s+(?:$|[\{\}\.\#\"\?\:\(])/ ||
+ # start or end of block or continuation of declaration
+ $sline =~ /^\+\s+(?:$|[\{\}\.\#\"\?\:\(\[])/ ||
+ # bitfield continuation
+ $sline =~ /^\+\s+$Ident\s*:\s*\d+\s*[,;]/ ||
+ # other possible extensions of declaration lines
$sline =~ /^\+\s+\(?\s*(?:$Compare|$Assignment|$Operators)/)) {
WARN("SPACING",
- "networking uses a blank line after declarations\n" . $hereprev);
+ "Missing a blank line after declarations\n" . $hereprev);
}
# check for spaces at the beginning of a line.
diff --git a/usr/Kconfig b/usr/Kconfig
index 642f503d3e9f..2d4c77eecf2e 100644
--- a/usr/Kconfig
+++ b/usr/Kconfig
@@ -98,80 +98,3 @@ config RD_LZ4
help
Support loading of a LZ4 encoded initial ramdisk or cpio buffer
If unsure, say N.
-
-choice
- prompt "Built-in initramfs compression mode" if INITRAMFS_SOURCE!=""
- help
- This option decides by which algorithm the builtin initramfs
- will be compressed. Several compression algorithms are
- available, which differ in efficiency, compression and
- decompression speed. Compression speed is only relevant
- when building a kernel. Decompression speed is relevant at
- each boot.
-
- If you have any problems with bzip2 or LZMA compressed
- initramfs, mail me (Alain Knaff) <alain@knaff.lu>.
-
- High compression options are mostly useful for users who are
- low on RAM, since it reduces the memory consumption during
- boot.
-
- If in doubt, select 'gzip'
-
-config INITRAMFS_COMPRESSION_NONE
- bool "None"
- help
- Do not compress the built-in initramfs at all. This may
- sound wasteful in space, but, you should be aware that the
- built-in initramfs will be compressed at a later stage
- anyways along with the rest of the kernel, on those
- architectures that support this.
- However, not compressing the initramfs may lead to slightly
- higher memory consumption during a short time at boot, while
- both the cpio image and the unpacked filesystem image will
- be present in memory simultaneously
-
-config INITRAMFS_COMPRESSION_GZIP
- bool "Gzip"
- depends on RD_GZIP
- help
- The old and tried gzip compression. It provides a good balance
- between compression ratio and decompression speed.
-
-config INITRAMFS_COMPRESSION_BZIP2
- bool "Bzip2"
- depends on RD_BZIP2
- help
- Its compression ratio and speed is intermediate.
- Decompression speed is slowest among the choices. The initramfs
- size is about 10% smaller with bzip2, in comparison to gzip.
- Bzip2 uses a large amount of memory. For modern kernels you
- will need at least 8MB RAM or more for booting.
-
-config INITRAMFS_COMPRESSION_LZMA
- bool "LZMA"
- depends on RD_LZMA
- help
- This algorithm's compression ratio is best.
- Decompression speed is between the other choices.
- Compression is slowest. The initramfs size is about 33%
- smaller with LZMA in comparison to gzip.
-
-config INITRAMFS_COMPRESSION_XZ
- bool "XZ"
- depends on RD_XZ
- help
- XZ uses the LZMA2 algorithm. The initramfs size is about 30%
- smaller with XZ in comparison to gzip. Decompression speed
- is better than that of bzip2 but worse than gzip and LZO.
- Compression is slow.
-
-config INITRAMFS_COMPRESSION_LZO
- bool "LZO"
- depends on RD_LZO
- help
- Its compression ratio is the poorest among the choices. The kernel
- size is about 10% bigger than gzip; however its speed
- (both compression and decompression) is the fastest.
-
-endchoice