-rw-r--r--Documentation/ABI/stable/sysfs-devices-node8
-rw-r--r--Documentation/ABI/testing/sysfs-block-zram32
-rw-r--r--Documentation/ABI/testing/sysfs-devices-memory8
-rw-r--r--Documentation/binfmt_misc.txt50
-rw-r--r--Documentation/blockdev/zram.txt25
-rw-r--r--Documentation/devicetree/bindings/rtc/s3c-rtc.txt3
-rw-r--r--Documentation/filesystems/autofs4.txt520
-rw-r--r--Documentation/filesystems/proc.txt2
-rw-r--r--Documentation/leds/leds-class.txt3
-rw-r--r--Documentation/memory-hotplug.txt11
-rw-r--r--Documentation/printk-formats.txt6
-rw-r--r--MAINTAINERS18
-rw-r--r--arch/alpha/include/asm/Kbuild1
-rw-r--r--arch/alpha/include/asm/sections.h7
-rw-r--r--arch/arm/Kconfig1
-rw-r--r--arch/arm/boot/dts/exynos3250.dtsi2
-rw-r--r--arch/arm/mm/dma-mapping.c210
-rw-r--r--arch/arm/mm/init.c2
-rw-r--r--arch/arm64/Kconfig1
-rw-r--r--arch/arm64/mm/dma-mapping.c164
-rw-r--r--arch/cris/include/asm/Kbuild1
-rw-r--r--arch/cris/include/asm/sections.h7
-rw-r--r--arch/m32r/include/asm/Kbuild1
-rw-r--r--arch/m32r/include/asm/sections.h7
-rw-r--r--arch/m68k/kernel/sys_m68k.c21
-rw-r--r--arch/mn10300/include/asm/Kbuild1
-rw-r--r--arch/mn10300/include/asm/sections.h1
-rw-r--r--arch/powerpc/include/asm/pgtable.h57
-rw-r--r--arch/powerpc/include/asm/pte-common.h5
-rw-r--r--arch/score/include/asm/Kbuild1
-rw-r--r--arch/score/include/asm/sections.h6
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/include/asm/numa.h1
-rw-r--r--arch/x86/include/asm/pgtable_64.h3
-rw-r--r--arch/x86/include/asm/pgtable_types.h14
-rw-r--r--arch/x86/mm/fault.c2
-rw-r--r--arch/x86/mm/init_64.c36
-rw-r--r--arch/x86/mm/ioremap.c20
-rw-r--r--arch/x86/mm/numa.c123
-rw-r--r--arch/x86/platform/efi/efi-bgrt.c36
-rw-r--r--arch/x86/purgatory/Makefile3
-rw-r--r--block/bio-integrity.c2
-rw-r--r--block/genhd.c2
-rw-r--r--drivers/base/dma-mapping.c70
-rw-r--r--drivers/base/memory.c42
-rw-r--r--drivers/base/node.c19
-rw-r--r--drivers/block/zram/zram_drv.c107
-rw-r--r--drivers/block/zram/zram_drv.h6
-rw-r--r--drivers/input/Kconfig9
-rw-r--r--drivers/input/Makefile3
-rw-r--r--drivers/input/input.c6
-rw-r--r--drivers/input/leds.c249
-rw-r--r--drivers/leds/Kconfig3
-rw-r--r--drivers/misc/ti-st/st_core.c2
-rw-r--r--drivers/rtc/rtc-pcf8583.c18
-rw-r--r--drivers/rtc/rtc-s3c.c851
-rw-r--r--drivers/tty/Kconfig4
-rw-r--r--drivers/tty/vt/keyboard.c110
-rw-r--r--drivers/video/backlight/backlight.c2
-rw-r--r--drivers/virtio/Kconfig1
-rw-r--r--drivers/virtio/virtio_balloon.c77
-rw-r--r--fs/autofs4/autofs_i.h6
-rw-r--r--fs/autofs4/dev-ioctl.c2
-rw-r--r--fs/autofs4/expire.c210
-rw-r--r--fs/autofs4/root.c62
-rw-r--r--fs/befs/btree.c53
-rw-r--r--fs/binfmt_misc.c19
-rw-r--r--fs/block_dev.c7
-rw-r--r--fs/buffer.c26
-rw-r--r--fs/cifs/cifsacl.c2
-rw-r--r--fs/cifs/cifssmb.c20
-rw-r--r--fs/cifs/file.c4
-rw-r--r--fs/cifs/sess.c2
-rw-r--r--fs/cifs/smb2file.c4
-rw-r--r--fs/cifs/smb2misc.c38
-rw-r--r--fs/cifs/smb2ops.c2
-rw-r--r--fs/cifs/smb2pdu.c2
-rw-r--r--fs/cifs/smb2pdu.h28
-rw-r--r--fs/ext4/fsync.c5
-rw-r--r--fs/hfsplus/catalog.c89
-rw-r--r--fs/hfsplus/dir.c11
-rw-r--r--fs/hfsplus/hfsplus_fs.h4
-rw-r--r--fs/hfsplus/super.c4
-rw-r--r--fs/internal.h5
-rw-r--r--fs/mpage.c25
-rw-r--r--fs/ntfs/file.c5
-rw-r--r--fs/ntfs/super.c2
-rw-r--r--fs/ocfs2/alloc.c28
-rw-r--r--fs/ocfs2/alloc.h2
-rw-r--r--fs/ocfs2/aops.c37
-rw-r--r--fs/ocfs2/cluster/heartbeat.c19
-rw-r--r--fs/ocfs2/cluster/heartbeat.h1
-rw-r--r--fs/ocfs2/dir.c2
-rw-r--r--fs/ocfs2/dlm/dlmast.c6
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c44
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c25
-rw-r--r--fs/ocfs2/dlm/dlmthread.c10
-rw-r--r--fs/ocfs2/file.c2
-rw-r--r--fs/ocfs2/inode.c10
-rw-r--r--fs/ocfs2/inode.h2
-rw-r--r--fs/ocfs2/move_extents.c2
-rw-r--r--fs/ocfs2/stack_user.c2
-rw-r--r--fs/proc/base.c36
-rw-r--r--fs/proc/internal.h5
-rw-r--r--fs/proc/kcore.c4
-rw-r--r--fs/proc/meminfo.c6
-rw-r--r--fs/proc/page.c3
-rw-r--r--fs/proc/task_mmu.c272
-rw-r--r--fs/proc/task_nommu.c89
-rw-r--r--include/asm-generic/dma-mapping-common.h9
-rw-r--r--include/asm-generic/pgtable.h27
-rw-r--r--include/linux/balloon_compaction.h189
-rw-r--r--include/linux/compaction.h24
-rw-r--r--include/linux/compiler-gcc5.h66
-rw-r--r--include/linux/crc64_ecma.h56
-rw-r--r--include/linux/genalloc.h7
-rw-r--r--include/linux/gfp.h2
-rw-r--r--include/linux/huge_mm.h2
-rw-r--r--include/linux/input.h21
-rw-r--r--include/linux/kernel.h32
-rw-r--r--include/linux/list.h1
-rw-r--r--include/linux/memory_hotplug.h1
-rw-r--r--include/linux/mempolicy.h7
-rw-r--r--include/linux/migrate.h14
-rw-r--r--include/linux/mm.h30
-rw-r--r--include/linux/mmdebug.h10
-rw-r--r--include/linux/mmzone.h54
-rw-r--r--include/linux/pagemap.h18
-rw-r--r--include/linux/rmap.h2
-rw-r--r--include/linux/slab.h64
-rw-r--r--include/linux/string.h1
-rw-r--r--include/linux/swap.h16
-rw-r--r--include/linux/vmstat.h2
-rw-r--r--include/linux/zsmalloc.h2
-rw-r--r--include/uapi/linux/kernel-page-flags.h1
-rw-r--r--include/uapi/linux/prctl.h27
-rw-r--r--init/Kconfig12
-rw-r--r--init/initramfs.c34
-rw-r--r--ipc/ipc_sysctl.c3
-rw-r--r--ipc/shm.c7
-rw-r--r--ipc/util.c20
-rw-r--r--kernel/acct.c14
-rw-r--r--kernel/async.c8
-rw-r--r--kernel/debug/debug_core.c9
-rw-r--r--kernel/debug/kdb/kdb_debugger.c4
-rw-r--r--kernel/debug/kdb/kdb_main.c4
-rw-r--r--kernel/printk/printk.c7
-rw-r--r--kernel/resource.c36
-rw-r--r--kernel/sched/fair.c2
-rw-r--r--kernel/sys.c489
-rw-r--r--kernel/sysctl.c7
-rw-r--r--kernel/time/posix-timers.c57
-rw-r--r--kernel/watchdog.c18
-rw-r--r--lib/Kconfig7
-rw-r--r--lib/Makefile1
-rw-r--r--lib/crc64_ecma.c341
-rw-r--r--lib/dynamic_debug.c17
-rw-r--r--lib/genalloc.c49
-rw-r--r--lib/vsprintf.c20
-rw-r--r--mm/Kconfig7
-rw-r--r--mm/Makefile3
-rw-r--r--mm/backing-dev.c2
-rw-r--r--mm/balloon_compaction.c227
-rw-r--r--mm/cma.c21
-rw-r--r--mm/compaction.c660
-rw-r--r--mm/huge_memory.c26
-rw-r--r--mm/hugetlb.c14
-rw-r--r--mm/internal.h26
-rw-r--r--mm/interval_tree.c2
-rw-r--r--mm/kmemcheck.c1
-rw-r--r--mm/memory.c8
-rw-r--r--mm/memory_hotplug.c2
-rw-r--r--mm/mempolicy.c130
-rw-r--r--mm/migrate.c29
-rw-r--r--mm/mlock.c4
-rw-r--r--mm/mmap.c67
-rw-r--r--mm/mremap.c3
-rw-r--r--mm/oom_kill.c6
-rw-r--r--mm/page-writeback.c8
-rw-r--r--mm/page_alloc.c262
-rw-r--r--mm/rmap.c8
-rw-r--r--mm/shmem.c2
-rw-r--r--mm/slab.c84
-rw-r--r--mm/slab.h35
-rw-r--r--mm/slab_common.c13
-rw-r--r--mm/slob.c2
-rw-r--r--mm/slub.c11
-rw-r--r--mm/swap_state.c2
-rw-r--r--mm/util.c53
-rw-r--r--mm/vmalloc.c20
-rw-r--r--mm/vmscan.c107
-rw-r--r--mm/vmstat.c149
-rw-r--r--mm/zbud.c13
-rw-r--r--mm/zsmalloc.c30
-rwxr-xr-xscripts/checkpatch.pl28
-rwxr-xr-xscripts/headers_install.sh4
-rw-r--r--tools/testing/selftests/vm/Makefile1
-rw-r--r--tools/testing/selftests/vm/transhuge-stress.c144
-rw-r--r--tools/vm/page-types.c1
199 files changed, 5510 insertions, 2804 deletions
diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node
index ce259c13c36a..5b2d0f08867c 100644
--- a/Documentation/ABI/stable/sysfs-devices-node
+++ b/Documentation/ABI/stable/sysfs-devices-node
@@ -85,14 +85,6 @@ Description:
will be compacted. When it completes, memory will be freed
into blocks which have as many contiguous pages as possible
-What: /sys/devices/system/node/nodeX/scan_unevictable_pages
-Date: October 2008
-Contact: Lee Schermerhorn <lee.schermerhorn@hp.com>
-Description:
- When set, it triggers scanning the node's unevictable lists
- and move any pages that have become evictable onto the respective
- zone's inactive list. See mm/vmscan.c
-
What: /sys/devices/system/node/nodeX/hugepages/hugepages-<size>/
Date: December 2009
Contact: Lee Schermerhorn <lee.schermerhorn@hp.com>
diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram
index 70ec992514d0..a6148eaf91e5 100644
--- a/Documentation/ABI/testing/sysfs-block-zram
+++ b/Documentation/ABI/testing/sysfs-block-zram
@@ -77,11 +77,14 @@ What: /sys/block/zram<id>/notify_free
Date: August 2010
Contact: Nitin Gupta <ngupta@vflare.org>
Description:
- The notify_free file is read-only and specifies the number of
- swap slot free notifications received by this device. These
- notifications are sent to a swap block device when a swap slot
- is freed. This statistic is applicable only when this disk is
- being used as a swap disk.
+ The notify_free file is read-only. Depending on device usage
+ scenario it may account a) the number of pages freed because
+ of swap slot free notifications or b) the number of pages freed
+ because of REQ_DISCARD requests sent by bio. The former ones
+ are sent to a swap block device when a swap slot is freed, which
+ implies that this disk is being used as a swap disk. The latter
+	ones are sent by a filesystem mounted with the discard option,
+	whenever some data blocks are discarded.
What: /sys/block/zram<id>/zero_pages
Date: August 2010
@@ -119,3 +122,22 @@ Description:
efficiency can be calculated using compr_data_size and this
statistic.
Unit: bytes
+
+What: /sys/block/zram<id>/mem_used_max
+Date: August 2014
+Contact: Minchan Kim <minchan@kernel.org>
+Description:
+	The mem_used_max file is read/write and reports the maximum
+	amount of memory zram has consumed to store compressed data.
+	Writing "0" resets the value; writing any other value returns
+	-EINVAL.
+ Unit: bytes
+
+What: /sys/block/zram<id>/mem_limit
+Date: August 2014
+Contact: Minchan Kim <minchan@kernel.org>
+Description:
+ The mem_limit file is read/write and specifies the maximum
+ amount of memory ZRAM can use to store the compressed data. The
+	limit can be changed at run time, and writing "0" disables it.
+	The initial state is no limit.
+	Unit: bytes
diff --git a/Documentation/ABI/testing/sysfs-devices-memory b/Documentation/ABI/testing/sysfs-devices-memory
index 7405de26ee60..deef3b5723cf 100644
--- a/Documentation/ABI/testing/sysfs-devices-memory
+++ b/Documentation/ABI/testing/sysfs-devices-memory
@@ -61,6 +61,14 @@ Users: hotplug memory remove tools
http://www.ibm.com/developerworks/wikis/display/LinuxP/powerpc-utils
+What: /sys/devices/system/memory/memoryX/valid_zones
+Date: July 2014
+Contact: Zhang Zhen <zhenzhang.zhang@huawei.com>
+Description:
+ The file /sys/devices/system/memory/memoryX/valid_zones is
+ read-only and is designed to show which zone this memory
+ block can be onlined to.
+
What: /sys/devices/system/memoryX/nodeY
Date: October 2009
Contact: Linux Memory Management list <linux-mm@kvack.org>
diff --git a/Documentation/binfmt_misc.txt b/Documentation/binfmt_misc.txt
index c1ed6948ba80..6b1de7058371 100644
--- a/Documentation/binfmt_misc.txt
+++ b/Documentation/binfmt_misc.txt
@@ -15,39 +15,50 @@ First you must mount binfmt_misc:
mount binfmt_misc -t binfmt_misc /proc/sys/fs/binfmt_misc
To actually register a new binary type, you have to set up a string looking like
-:name:type:offset:magic:mask:interpreter:flags (where you can choose the ':' upon
-your needs) and echo it to /proc/sys/fs/binfmt_misc/register.
+:name:type:offset:magic:mask:interpreter:flags (where you can choose the ':'
+to suit your needs) and echo it to /proc/sys/fs/binfmt_misc/register.
+
Here is what the fields mean:
- 'name' is an identifier string. A new /proc file will be created with this
- name below /proc/sys/fs/binfmt_misc
+ name below /proc/sys/fs/binfmt_misc; cannot contain slashes '/' for obvious
+ reasons.
- 'type' is the type of recognition. Give 'M' for magic and 'E' for extension.
- 'offset' is the offset of the magic/mask in the file, counted in bytes. This
- defaults to 0 if you omit it (i.e. you write ':name:type::magic...')
+ defaults to 0 if you omit it (i.e. you write ':name:type::magic...'). Ignored
+ when using filename extension matching.
- 'magic' is the byte sequence binfmt_misc is matching for. The magic string
- may contain hex-encoded characters like \x0a or \xA4. In a shell environment
- you will have to write \\x0a to prevent the shell from eating your \.
+ may contain hex-encoded characters like \x0a or \xA4. Note that you must
+ escape any NUL bytes; parsing halts at the first one. In a shell environment
+ you might have to write \\x0a to prevent the shell from eating your \.
If you chose filename extension matching, this is the extension to be
recognised (without the '.', the \x0a specials are not allowed). Extension
- matching is case sensitive!
+ matching is case sensitive, and slashes '/' are not allowed!
- 'mask' is an (optional, defaults to all 0xff) mask. You can mask out some
bits from matching by supplying a string like magic and as long as magic.
- The mask is anded with the byte sequence of the file.
+ The mask is anded with the byte sequence of the file. Note that you must
+ escape any NUL bytes; parsing halts at the first one. Ignored when using
+ filename extension matching.
- 'interpreter' is the program that should be invoked with the binary as first
argument (specify the full path)
- 'flags' is an optional field that controls several aspects of the invocation
- of the interpreter. It is a string of capital letters, each controls a certain
- aspect. The following flags are supported -
- 'P' - preserve-argv[0]. Legacy behavior of binfmt_misc is to overwrite the
- original argv[0] with the full path to the binary. When this flag is
- included, binfmt_misc will add an argument to the argument vector for
- this purpose, thus preserving the original argv[0].
+ of the interpreter. It is a string of capital letters, each controls a
+ certain aspect. The following flags are supported -
+ 'P' - preserve-argv[0]. Legacy behavior of binfmt_misc is to overwrite
+ the original argv[0] with the full path to the binary. When this
+ flag is included, binfmt_misc will add an argument to the argument
+ vector for this purpose, thus preserving the original argv[0].
+ e.g. If your interp is set to /bin/foo and you run `blah` (which is
+ in /usr/local/bin), then the kernel will execute /bin/foo with
+ argv[] set to ["/bin/foo", "/usr/local/bin/blah", "blah"]. The
+ interp has to be aware of this so it can execute /usr/local/bin/blah
+ with argv[] set to ["blah"].
'O' - open-binary. Legacy behavior of binfmt_misc is to pass the full path
of the binary to the interpreter as an argument. When this flag is
included, binfmt_misc will open the file for reading and pass its
descriptor as an argument, instead of the full path, thus allowing
- the interpreter to execute non-readable binaries. This feature should
- be used with care - the interpreter has to be trusted not to emit
- the contents of the non-readable binary.
+ the interpreter to execute non-readable binaries. This feature
+ should be used with care - the interpreter has to be trusted not to
+ emit the contents of the non-readable binary.
'C' - credentials. Currently, the behavior of binfmt_misc is to calculate
the credentials and security token of the new process according to
the interpreter. When this flag is included, these attributes are
@@ -58,7 +69,7 @@ Here is what the fields mean:
There are some restrictions:
- - the whole register string may not exceed 255 characters
+ - the whole register string may not exceed 1920 characters
- the magic must reside in the first 128 bytes of the file, i.e.
offset+size(magic) has to be less than 128
- the interpreter string may not exceed 127 characters
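+
+For example, a minimal C sketch that registers a handler for LLVM
+bitcode files (the rule string, magic bytes and interpreter path are
+illustrative; echoing the same string from a shell works just as well):
+
+    #include <fcntl.h>
+    #include <string.h>
+    #include <unistd.h>
+
+    int main(void)
+    {
+        /* name=llvm, type=M(agic), default offset, magic "BC\xc0\xde",
+         * no mask, interpreter /usr/bin/lli, no flags */
+        static const char rule[] = ":llvm:M::BC\\xc0\\xde::/usr/bin/lli:";
+        int fd = open("/proc/sys/fs/binfmt_misc/register", O_WRONLY);
+
+        if (fd < 0)
+            return 1;
+        if (write(fd, rule, strlen(rule)) < 0)
+            return 1;
+        return close(fd);
+    }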
@@ -110,7 +121,4 @@ passes it the full filename (or the file descriptor) to use. Using $PATH can
cause unexpected behaviour and can be a security hazard.
-There is a web page about binfmt_misc at
-http://www.tat.physik.uni-tuebingen.de
-
Richard Günther <rguenth@tat.physik.uni-tuebingen.de>
diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
index 0595c3f56ccf..7fcf9c6592ec 100644
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
@@ -74,14 +74,30 @@ There is little point creating a zram of greater than twice the size of memory
since we expect a 2:1 compression ratio. Note that zram uses about 0.1% of the
size of the disk when not in use so a huge zram is wasteful.
-5) Activate:
+5) Set memory limit: Optional
+	Set the memory limit by writing a value to the sysfs node
+	'mem_limit'. The value can be given in bytes or with memory
+	suffixes (K/M/G). The limit can also be changed at runtime.
+ Examples:
+ # limit /dev/zram0 with 50MB memory
+ echo $((50*1024*1024)) > /sys/block/zram0/mem_limit
+
+ # Using mem suffixes
+ echo 256K > /sys/block/zram0/mem_limit
+ echo 512M > /sys/block/zram0/mem_limit
+ echo 1G > /sys/block/zram0/mem_limit
+
+ # To disable memory limit
+ echo 0 > /sys/block/zram0/mem_limit
+
+6) Activate:
mkswap /dev/zram0
swapon /dev/zram0
mkfs.ext4 /dev/zram1
mount /dev/zram1 /tmp
-6) Stats:
+7) Stats:
Per-device statistics are exported as various nodes under
/sys/block/zram<id>/
disksize
@@ -95,12 +111,13 @@ size of the disk when not in use so a huge zram is wasteful.
orig_data_size
compr_data_size
mem_used_total
+ mem_used_max
-7) Deactivate:
+8) Deactivate:
swapoff /dev/zram0
umount /dev/zram1
-8) Reset:
+9) Reset:
Write any positive value to 'reset' sysfs node
echo 1 > /sys/block/zram0/reset
echo 1 > /sys/block/zram1/reset
diff --git a/Documentation/devicetree/bindings/rtc/s3c-rtc.txt b/Documentation/devicetree/bindings/rtc/s3c-rtc.txt
index 7ac7259fe9ea..ab757b84daa7 100644
--- a/Documentation/devicetree/bindings/rtc/s3c-rtc.txt
+++ b/Documentation/devicetree/bindings/rtc/s3c-rtc.txt
@@ -3,7 +3,10 @@
Required properties:
- compatible: should be one of the following.
* "samsung,s3c2410-rtc" - for controllers compatible with s3c2410 rtc.
+ * "samsung,s3c2416-rtc" - for controllers compatible with s3c2416 rtc.
+ * "samsung,s3c2443-rtc" - for controllers compatible with s3c2443 rtc.
* "samsung,s3c6410-rtc" - for controllers compatible with s3c6410 rtc.
+ * "samsung,exynos3250-rtc" - for controllers compatible with exynos3250 rtc.
- reg: physical base address of the controller and length of memory mapped
region.
- interrupts: Two interrupt numbers to the cpu should be specified. First
diff --git a/Documentation/filesystems/autofs4.txt b/Documentation/filesystems/autofs4.txt
new file mode 100644
index 000000000000..39d02e19fb62
--- /dev/null
+++ b/Documentation/filesystems/autofs4.txt
@@ -0,0 +1,520 @@
+autofs - how it works
+=====================
+
+Purpose
+-------
+
+The goal of autofs is to provide on-demand mounting and race free
+automatic unmounting of various other filesystems. This provides two
+key advantages:
+
+1. There is no need to delay boot until all filesystems that
+ might be needed are mounted. Processes that try to access those
+ slow filesystems might be delayed but other processes can
+ continue freely. This is particularly important for
+ network filesystems (e.g. NFS) or filesystems stored on
+ media with a media-changing robot.
+
+2. The names and locations of filesystems can be stored in
+ a remote database and can change at any time. The content
+   in that database at the time of access will be used to provide
+ a target for the access. The interpretation of names in the
+ filesystem can even be programmatic rather than database-backed,
+ allowing wildcards for example, and can vary based on the user who
+ first accessed a name.
+
+Context
+-------
+
+The "autofs4" filesystem module is only one part of an autofs system.
+There also needs to be a user-space program which looks up names
+and mounts filesystems. This will often be the "automount" program,
+though other tools including "systemd" can make use of "autofs4".
+This document describes only the kernel module and the interactions
+required with any user-space program. Subsequent text refers to this
+as the "automount daemon" or simply "the daemon".
+
+"autofs4" is a Linux kernel module with provides the "autofs"
+filesystem type. Several "autofs" filesystems can be mounted and they
+can each be managed separately, or all managed by the same daemon.
+
+Content
+-------
+
+An autofs filesystem can contain 3 sorts of objects: directories,
+symbolic links and mount traps. Mount traps are directories with
+extra properties as described in the next section.
+
+Objects can only be created by the automount daemon: symlinks are
+created with a regular `symlink` system call, while directories and
+mount traps are created with `mkdir`. The determination of whether a
+directory should be a mount trap or not is quite _ad hoc_, largely for
+historical reasons, and is determined in part by the
+*direct*/*indirect*/*offset* mount options, and the *maxproto* mount option.
+
+If neither the *direct* nor *offset* mount options are given (so the
+mount is considered to be *indirect*), then the root directory is
+always a regular directory, otherwise it is a mount trap when it is
+empty and a regular directory when not empty. Note that *direct* and
+*offset* are treated identically so a concise summary is that the root
+directory is a mount trap only if the filesystem is mounted *direct*
+and the root is empty.
+
+Directories created in the root directory are mount traps only if the
+filesystem is mounted *indirect* and they are empty.
+
+Directories further down the tree depend on the *maxproto* mount
+option and particularly whether it is less than five or not.
+When *maxproto* is five, no directories further down the
+tree are ever mount traps, they are always regular directories. When
+the *maxproto* is four (or three), these directories are mount traps
+precisely when they are empty.
+
+So: non-empty (i.e. non-leaf) directories are never mount traps. Empty
+directories are sometimes mount traps, and sometimes not depending on
+where in the tree they are (root, top level, or lower), the *maxproto*,
+and whether the mount was *indirect* or not.
+
+Mount Traps
+---------------
+
+A core element of the implementation of autofs is the Mount Traps
+which are provided by the Linux VFS. Any directory provided by a
+filesystem can be designated as a trap. This involves two separate
+features that work together to allow autofs to do its job.
+
+**DCACHE_NEED_AUTOMOUNT**
+
+If a dentry has the DCACHE_NEED_AUTOMOUNT flag set (which gets set if
+the inode has S_AUTOMOUNT set, or can be set directly) then it is
+(potentially) a mount trap. Any access to this directory beyond a
+"`stat`" will (normally) cause the `d_op->d_automount()` dentry operation
+to be called. The task of this method is to find the filesystem that
+should be mounted on the directory and to return it. The VFS is
+responsible for actually mounting the root of this filesystem on the
+directory.
+
+autofs doesn't find the filesystem itself but sends a message to the
+automount daemon asking it to find and mount the filesystem. The
+autofs `d_automount` method then waits for the daemon to report that
+everything is ready. It will then return "`NULL`" indicating that the
+mount has already happened. The VFS doesn't try to mount anything but
+follows down the mount that is already there.
+
+This functionality is sufficient for some users of mount traps such
+as NFS which creates traps so that mountpoints on the server can be
+reflected on the client. However it is not sufficient for autofs. As
+mounting onto a directory is considered to be "beyond a `stat`", the
+automount daemon would not be able to mount a filesystem on the 'trap'
+directory without some way to avoid getting caught in the trap. For
+that purpose there is another flag.
+
+**DCACHE_MANAGE_TRANSIT**
+
+If a dentry has DCACHE_MANAGE_TRANSIT set then two very different but
+related behaviors are invoked, both using the `d_op->d_manage()`
+dentry operation.
+
+Firstly, before checking to see if any filesystem is mounted on the
+directory, d_manage() will be called with the `rcu_walk` parameter set
+to `false`. It may return one of three things:
+
+- A return value of zero indicates that there is nothing special
+ about this dentry and normal checks for mounts and automounts
+ should proceed.
+
+ autofs normally returns zero, but first waits for any
+ expiry (automatic unmounting of the mounted filesystem) to
+ complete. This avoids races.
+
+- A return value of `-EISDIR` tells the VFS to ignore any mounts
+ on the directory and to not consider calling `->d_automount()`.
+  This effectively disables the **DCACHE_NEED_AUTOMOUNT** flag,
+  causing the directory not to be a mount trap after all.
+
+ autofs returns this if it detects that the process performing the
+ lookup is the automount daemon and that the mount has been
+ requested but has not yet completed. How it determines this is
+ discussed later. This allows the automount daemon not to get
+ caught in the mount trap.
+
+ There is a subtlety here. It is possible that a second autofs
+ filesystem can be mounted below the first and for both of them to
+ be managed by the same daemon. For the daemon to be able to mount
+ something on the second it must be able to "walk" down past the
+ first. This means that d_manage cannot *always* return -EISDIR for
+ the automount daemon. It must only return it when a mount has
+ been requested, but has not yet completed.
+
+ `d_manage` also returns `-EISDIR` if the dentry shouldn't be a
+ mount trap, either because it is a symbolic link or because it is
+ not empty.
+
+- Any other negative value is treated as an error and returned
+ to the caller.
+
+ autofs can return
+
+ - -ENOENT if the automount daemon failed to mount anything,
+ - -ENOMEM if it ran out of memory,
+ - -EINTR if a signal arrived while waiting for expiry to
+ complete
+ - or any other error sent down by the automount daemon.
+
+
+The second use case only occurs during an "RCU-walk" and so `rcu_walk`
+will be set.
+
+An RCU-walk is a fast and lightweight process for walking down a
+filename path (i.e. it is like running on tip-toes). RCU-walk cannot
+cope with all situations so when it finds a difficulty it falls back
+to "REF-walk", which is slower but more robust.
+
+RCU-walk will never call `->d_automount`; the filesystems must already
+be mounted or RCU-walk cannot handle the path.
+To determine if a mount-trap is safe for RCU-walk mode it calls
+`->d_manage()` with `rcu_walk` set to `true`.
+
+In this case `d_manage()` must avoid blocking and should avoid taking
+spinlocks if at all possible. Its sole purpose is to determine if it
+would be safe to follow down into any mounted directory and the only
+reason that it might not be is if an expiry of the mount is
+underway.
+
+In the `rcu_walk` case, `d_manage()` cannot return -EISDIR to tell the
+VFS that this is a directory that doesn't require d_automount. If
+`rcu_walk` sees a dentry with DCACHE_NEED_AUTOMOUNT set but nothing
+mounted, it *will* fall back to REF-walk. `d_manage()` cannot make the
+VFS remain in RCU-walk mode, but can only tell it to get out of
+RCU-walk mode by returning `-ECHILD`.
+
+So `d_manage()`, when called with `rcu_walk` set, should return
+-ECHILD if there is any reason to believe it is unsafe to enter the
+mounted filesystem, and otherwise should return 0.
+
+autofs will return `-ECHILD` if an expiry of the filesystem has been
+initiated or is being considered, otherwise it returns 0.
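+
+The following skeleton (illustrative only, not the actual autofs
+code) shows the shape of a `d_manage` method that honours these
+rules; `expiry_in_progress()`, `mount_pending_for_daemon()` and
+`wait_for_expiry()` are hypothetical helpers:
+
+    static int example_d_manage(struct dentry *dentry, bool rcu_walk)
+    {
+        if (rcu_walk)
+            /* no blocking, no spinlocks: just report whether it is
+             * safe to follow down into a mounted filesystem */
+            return expiry_in_progress(dentry) ? -ECHILD : 0;
+
+        if (mount_pending_for_daemon(dentry))
+            /* let the daemon walk past the trap */
+            return -EISDIR;
+
+        wait_for_expiry(dentry);    /* may block */
+        return 0;    /* proceed with normal mount/automount checks */
+    }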
+
+
+Mountpoint expiry
+-----------------
+
+The VFS has a mechanism for automatically expiring unused mounts,
+much as it can expire any unused dentry information from the dcache.
+This is guided by the MNT_SHRINKABLE flag. This only applies to
+mounts that were created by `d_automount()` returning a filesystem to be
+mounted. As autofs doesn't return such a filesystem but leaves the
+mounting to the automount daemon, it must involve the automount daemon
+in unmounting as well. This also means that autofs has more control
+of expiry.
+
+The VFS also supports "expiry" of mounts using the MNT_EXPIRE flag to
+the `umount` system call. Unmounting with MNT_EXPIRE will fail unless
+a previous attempt had been made, and the filesystem has been inactive
+and untouched since that previous attempt. autofs4 does not depend on
+this but has its own internal tracking of whether filesystems were
+recently used. This allows individual names in the autofs directory
+to expire separately.
+
+With version 4 of the protocol, the automount daemon can try to
+unmount any filesystems mounted on the autofs filesystem or remove any
+symbolic links or empty directories any time it likes. If the unmount
+or removal is successful the filesystem will be returned to the state
+it was before the mount or creation, so that any access of the name
+will trigger normal auto-mount processing. In particular, `rmdir` and
+`unlink` do not leave negative entries in the dcache as a normal
+filesystem would, so an attempt to access a recently-removed object is
+passed to autofs for handling.
+
+With version 5, this is not safe except for unmounting from top-level
+directories. As lower-level directories are never mount traps, other
+processes will see an empty directory as soon as the filesystem is
+unmounted. So it is generally safest to use the autofs expiry
+protocol described below.
+
+Normally the daemon only wants to remove entries which haven't been
+used for a while. For this purpose autofs maintains a "`last_used`"
+time stamp on each directory or symlink. For symlinks it genuinely
+does record the last time the symlink was "used" or followed to find
+out where it points to. For directories the field is a slight
+misnomer. It actually records the last time that autofs checked if
+the directory or one of its descendants was busy and found that it
+was. This is just as useful and doesn't require updating the field so
+often.
+
+The daemon is able to ask autofs if anything is due to be expired,
+using an `ioctl` as discussed later. For a *direct* mount, autofs
+considers if the entire mount-tree can be unmounted or not. For an
+*indirect* mount, autofs considers each of the names in the top level
+directory to determine if any of those can be unmounted and cleaned
+up.
+
+There is an option with indirect mounts to consider each of the leaves
+that have been mounted on instead of considering the top-level names.
+This is intended for compatibility with version 4 of autofs and should
+be considered as deprecated.
+
+When autofs considers a directory it checks the `last_used` time and
+compares it with the "timeout" value set when the filesystem was
+mounted, though this check is ignored in some cases. It also checks if
+the directory or anything below it is in use. For symbolic links,
+only the `last_used` time is ever considered.
+
+If both appear to support expiring the directory or symlink, an action
+is taken.
+
+There are two ways to ask autofs to consider expiry. The first is to
+use the **AUTOFS_IOC_EXPIRE** ioctl. This only works for indirect
+mounts. If it finds something in the root directory to expire it will
+return the name of that thing. Once a name has been returned the
+automount daemon needs to unmount any filesystems mounted below the
+name normally. As described above, this is unsafe for non-toplevel
+mounts in a version-5 autofs. For this reason the current `automountd`
+does not use this ioctl.
+
+The second mechanism uses either the **AUTOFS_DEV_IOCTL_EXPIRE_CMD** or
+the **AUTOFS_IOC_EXPIRE_MULTI** ioctl. This will work for both direct and
+indirect mounts. If it selects an object to expire, it will notify
+the daemon using the notification mechanism described below. This
+will block until the daemon acknowledges the expiry notification.
+This implies that the "`EXPIRE`" ioctl must be sent from a different
+thread than the one which handles notification.
+
+While the ioctl is blocking, the entry is marked as "expiring" and
+`d_manage` will block until the daemon affirms that the unmount has
+completed (together with removing any directories that might have been
+necessary), or has been aborted.
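+
+As a sketch, the expiry thread of the daemon might loop like this
+(error handling elided; `root_fd` is a file descriptor open on the
+autofs root, and a second thread must be servicing the pipe):
+
+    int how = AUTOFS_EXP_IMMEDIATE;    /* 0 or AUTOFS_EXP_LEAVES are
+                                        * also possible */
+
+    /* each successful call expires one entry and blocks until the
+     * notification has been acknowledged by the other thread */
+    while (ioctl(root_fd, AUTOFS_IOC_EXPIRE_MULTI, &how) == 0)
+        ;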
+
+Communicating with autofs: detecting the daemon
+-----------------------------------------------
+
+There are several forms of communication between the automount daemon
+and the filesystem. As we have already seen, the daemon can create and
+remove directories and symlinks using normal filesystem operations.
+autofs knows whether a process requesting some operation is the daemon
+or not based on its process-group id number (see getpgid(1)).
+
+When an autofs filesystem is mounted, the pgid of the mounting
+process is recorded, unless the "pgrp=" option is given, in which
+case that number is recorded instead. Any request arriving from a
+process in that process group is considered to come from the daemon.
+If the daemon ever has to be stopped and restarted a new pgid can be
+provided through an ioctl as will be described below.
+
+Communicating with autofs: the event pipe
+-----------------------------------------
+
+When an autofs filesystem is mounted, the 'write' end of a pipe must
+be passed using the 'fd=' mount option. autofs will write
+notification messages to this pipe for the daemon to respond to.
+For version 5, the format of the message is:
+
+ struct autofs_v5_packet {
+ int proto_version; /* Protocol version */
+ int type; /* Type of packet */
+ autofs_wqt_t wait_queue_token;
+ __u32 dev;
+ __u64 ino;
+ __u32 uid;
+ __u32 gid;
+ __u32 pid;
+ __u32 tgid;
+ __u32 len;
+ char name[NAME_MAX+1];
+ };
+
+where the type is one of
+
+ autofs_ptype_missing_indirect
+ autofs_ptype_expire_indirect
+ autofs_ptype_missing_direct
+ autofs_ptype_expire_direct
+
+so messages can indicate that a name is missing (something tried to
+access it but it isn't there) or that it has been selected for expiry.
+
+The pipe will be set to "packet mode" (equivalent to passing
+`O_DIRECT` to _pipe2(2)_) so that a read from the pipe will return at
+most one packet, and any unread portion of a packet will be discarded.
+
+The `wait_queue_token` is a unique number which can identify a
+particular request to be acknowledged. When a message is sent over
+the pipe the affected dentry is marked as either "active" or
+"expiring" and other accesses to it block until the message is
+acknowledged using one of the ioctls below and the relevant
+`wait_queue_token`.
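+
+As an illustration, the daemon's read loop might look like this
+sketch (assuming the definitions from `linux/auto_fs4.h`; `pipe_fd`
+and `handle_missing()` are hypothetical):
+
+    union autofs_v5_packet_union pkt;
+
+    while (read(pipe_fd, &pkt, sizeof(pkt)) > 0) {
+        const struct autofs_v5_packet *p = &pkt.v5_packet;
+
+        if (p->type == autofs_ptype_missing_indirect ||
+            p->type == autofs_ptype_missing_direct)
+            /* mount p->name, then acknowledge with the
+             * AUTOFS_IOC_READY or AUTOFS_IOC_FAIL ioctl using
+             * p->wait_queue_token */
+            handle_missing(p->name, p->wait_queue_token);
+    }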
+
+Communicating with autofs: root directory ioctls
+------------------------------------------------
+
+The root directory of an autofs filesystem will respond to a number of
+ioctls. The process issuing the ioctl must have the CAP_SYS_ADMIN
+capability, or must be the automount daemon.
+
+The available ioctl commands are:
+
+- **AUTOFS_IOC_READY**: a notification has been handled. The argument
+ to the ioctl command is the "wait_queue_token" number
+ corresponding to the notification being acknowledged.
+- **AUTOFS_IOC_FAIL**: similar to above, but indicates failure with
+ the error code `ENOENT`.
+- **AUTOFS_IOC_CATATONIC**: Causes the autofs to enter "catatonic"
+ mode meaning that it stops sending notifications to the daemon.
+ This mode is also entered if a write to the pipe fails.
+- **AUTOFS_IOC_PROTOVER**: This returns the protocol version in use.
+- **AUTOFS_IOC_PROTOSUBVER**: Returns the protocol sub-version which
+ is really a version number for the implementation. It is
+ currently 2.
+- **AUTOFS_IOC_SETTIMEOUT**: This passes a pointer to an unsigned
+ long. The value is used to set the timeout for expiry, and
+  the current timeout value is stored back through the pointer
+  (see the sketch after this list).
+- **AUTOFS_IOC_ASKUMOUNT**: Returns, in the pointed-to `int`, 1 if
+ the filesystem could be unmounted. This is only a hint as
+ the situation could change at any instant. This call can be
+  used to avoid a more expensive full unmount attempt.
+- **AUTOFS_IOC_EXPIRE**: as described above, this asks if there is
+ anything suitable to expire. A pointer to a packet:
+
+ struct autofs_packet_expire_multi {
+ int proto_version; /* Protocol version */
+ int type; /* Type of packet */
+ autofs_wqt_t wait_queue_token;
+ int len;
+ char name[NAME_MAX+1];
+ };
+
+ is required. This is filled in with the name of something
+ that can be unmounted or removed. If nothing can be expired,
+ `errno` is set to `EAGAIN`. Even though a `wait_queue_token`
+ is present in the structure, no "wait queue" is established
+ and no acknowledgment is needed.
+- **AUTOFS_IOC_EXPIRE_MULTI**: This is similar to
+ **AUTOFS_IOC_EXPIRE** except that it causes notification to be
+ sent to the daemon, and it blocks until the daemon acknowledges.
+ The argument is an integer which can contain two different flags.
+
+ **AUTOFS_EXP_IMMEDIATE** causes `last_used` time to be ignored
+  and objects are expired if they are not in use.
+
+ **AUTOFS_EXP_LEAVES** will select a leaf rather than a top-level
+ name to expire. This is only safe when *maxproto* is 4.
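+
+As an example of the calling convention, here is a sketch of the
+**AUTOFS_IOC_SETTIMEOUT** call described in the list above (`root_fd`
+is a file descriptor open on the autofs root; 300 seconds is
+illustrative):
+
+    unsigned long timeout = 300;    /* new expiry timeout in seconds */
+
+    if (ioctl(root_fd, AUTOFS_IOC_SETTIMEOUT, &timeout) == 0)
+        /* timeout now holds the previously configured value */;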
+
+Communicating with autofs: char-device ioctls
+---------------------------------------------
+
+It is not always possible to open the root of an autofs filesystem,
+particularly a *direct* mounted filesystem. If the automount daemon
+is restarted there is no way for it to regain control of existing
+mounts using any of the above communication channels. To address this
+need there is a "miscellaneous" character device (major 10, minor 235)
+which can be used to communicate directly with the autofs filesystem.
+It requires CAP_SYS_ADMIN for access.
+
+The `ioctl`s that can be used on this device are described in a separate
+document `autofs4-mount-control.txt`, and are summarized briefly here.
+Each ioctl is passed a pointer to an `autofs_dev_ioctl` structure:
+
+ struct autofs_dev_ioctl {
+ __u32 ver_major;
+ __u32 ver_minor;
+ __u32 size; /* total size of data passed in
+ * including this struct */
+ __s32 ioctlfd; /* automount command fd */
+
+ __u32 arg1; /* Command parameters */
+ __u32 arg2;
+
+ char path[0];
+ };
+
+For the **OPEN_MOUNT** and **IS_MOUNTPOINT** commands, the target
+filesystem is identified by the `path`. All other commands identify
+the filesystem by the `ioctlfd` which is a file descriptor open on the
+root, and which can be returned by **OPEN_MOUNT**.
+
+The `ver_major` and `ver_minor` are in/out parameters which check that
+the requested version is supported, and report the maximum version
+that the kernel module can support.
+
+Commands are:
+
+- **AUTOFS_DEV_IOCTL_VERSION_CMD**: does nothing, except validate and
+ set version numbers.
+- **AUTOFS_DEV_IOCTL_OPENMOUNT_CMD**: return an open file descriptor
+  on the root of an autofs filesystem. The filesystem is identified
+  by its `path` and by the device number stored in `arg1`. Device
+  numbers for existing filesystems can be found in
+  `/proc/self/mountinfo` (see the sketch after this list).
+- **AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD**: same as `close(ioctlfd)`.
+- **AUTOFS_DEV_IOCTL_SETPIPEFD_CMD**: if the filesystem is in
+ catatonic mode, this can provide the write end of a new pipe
+ in `arg1` to re-establish communication with a daemon. The
+ process group of the calling process is used to identify the
+ daemon.
+- **AUTOFS_DEV_IOCTL_REQUESTER_CMD**: `path` should be a
+ name within the filesystem that has been auto-mounted on.
+ arg1 is the dev number of the underlying autofs. On successful
+ return, `arg1` and `arg2` will be the UID and GID of the process
+ which triggered that mount.
+
+- **AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD**: Check if path is a
+ mountpoint of a particular type - see separate documentation for
+ details.
+
+- **AUTOFS_DEV_IOCTL_PROTOVER_CMD**:
+- **AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD**:
+- **AUTOFS_DEV_IOCTL_READY_CMD**:
+- **AUTOFS_DEV_IOCTL_FAIL_CMD**:
+- **AUTOFS_DEV_IOCTL_CATATONIC_CMD**:
+- **AUTOFS_DEV_IOCTL_TIMEOUT_CMD**:
+- **AUTOFS_DEV_IOCTL_EXPIRE_CMD**:
+- **AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD**: These all have the same
+ function as the similarly named **AUTOFS_IOC** ioctls, except
+ that **FAIL** can be given an explicit error number in `arg1`
+ instead of assuming `ENOENT`, and this **EXPIRE** command
+ corresponds to **AUTOFS_IOC_EXPIRE_MULTI**.
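+
+As a sketch, regaining an `ioctlfd` for an existing mount might look
+like this (assuming the definitions from `linux/auto_dev-ioctl.h`;
+`devfd` is open on `/dev/autofs`, and `path` and `devid` were taken
+from `/proc/self/mountinfo`):
+
+    struct autofs_dev_ioctl *param;
+    size_t sz = sizeof(*param) + strlen(path) + 1;
+    int ioctlfd = -1;
+
+    param = calloc(1, sz);
+    init_autofs_dev_ioctl(param);    /* sets ver_major/ver_minor/size */
+    param->size = sz;                /* account for the trailing path */
+    param->arg1 = devid;             /* device number of the mount */
+    strcpy(param->path, path);
+
+    if (ioctl(devfd, AUTOFS_DEV_IOCTL_OPENMOUNT, param) == 0)
+        ioctlfd = param->ioctlfd;    /* now open on the autofs root */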
+
+Catatonic mode
+--------------
+
+As mentioned, an autofs mount can enter "catatonic" mode. This
+happens if a write to the notification pipe fails, or if it is
+explicitly requested by an `ioctl`.
+
+When entering catatonic mode, the pipe is closed and any pending
+notifications are acknowledged with the error `ENOENT`.
+
+Once in catatonic mode, attempts to access non-existing names will
+result in `ENOENT` while attempts to access existing directories will
+be treated in the same way as if they came from the daemon, so mount
+traps will not fire.
+
+When the filesystem is mounted a _uid_ and _gid_ can be given which
+set the ownership of directories and symbolic links. When the
+filesystem is in catatonic mode, any process with a matching UID can
+create directories or symlinks in the root directory, but not in other
+directories.
+
+Catatonic mode can only be left via the
+**AUTOFS_DEV_IOCTL_OPENMOUNT_CMD** ioctl on `/dev/autofs`.
+
+autofs, name spaces, and shared mounts
+--------------------------------------
+
+With bind mounts and name spaces it is possible for an autofs
+filesystem to appear at multiple places in one or more filesystem
+name spaces. For this to work sensibly, the autofs filesystem should
+always be mounted "shared". e.g.
+
+> `mount --make-shared /autofs/mount/point`
+
+The automount daemon is only able to manage a single mount location for
+an autofs filesystem and if mounts on that are not 'shared', other
+locations will not behave as expected. In particular, access to those
+other locations will likely result in the `ELOOP` error
+
+> Too many levels of symbolic links
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index eb8a10e22f7c..154a345307b0 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -796,6 +796,7 @@ VmallocTotal: 112216 kB
VmallocUsed: 428 kB
VmallocChunk: 111088 kB
AnonHugePages: 49152 kB
+BalloonPages: 0 kB
MemTotal: Total usable ram (i.e. physical ram minus a few reserved
bits and the kernel binary code)
@@ -838,6 +839,7 @@ MemAvailable: An estimate of how much memory is available for starting new
Writeback: Memory which is actively being written back to the disk
AnonPages: Non-file backed pages mapped into userspace page tables
AnonHugePages: Non-file backed huge pages mapped into userspace page tables
+BalloonPages: Memory which was ballooned, not included in MemTotal
Mapped: files which have been mmaped, such as libraries
Slab: in-kernel data structures cache
SReclaimable: Part of Slab, that might be reclaimed, such as caches
diff --git a/Documentation/leds/leds-class.txt b/Documentation/leds/leds-class.txt
index 79699c200766..62261c04060a 100644
--- a/Documentation/leds/leds-class.txt
+++ b/Documentation/leds/leds-class.txt
@@ -2,9 +2,6 @@
LED handling under Linux
========================
-If you're reading this and thinking about keyboard leds, these are
-handled by the input subsystem and the led class is *not* needed.
-
In its simplest form, the LED class just allows control of LEDs from
userspace. LEDs appear in /sys/class/leds/. The maximum brightness of the
LED is defined in max_brightness file. The brightness file will set the brightness
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
index 45134dc23854..ea03abfc97e9 100644
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -155,6 +155,7 @@ Under each memory block, you can see 4 files:
/sys/devices/system/memory/memoryXXX/phys_device
/sys/devices/system/memory/memoryXXX/state
/sys/devices/system/memory/memoryXXX/removable
+/sys/devices/system/memory/memoryXXX/valid_zones
'phys_index' : read-only and contains memory block id, same as XXX.
'state' : read-write
@@ -170,6 +171,15 @@ Under each memory block, you can see 4 files:
block is removable and a value of 0 indicates that
it is not removable. A memory block is removable only if
every section in the block is removable.
+'valid_zones' : read-only: designed to show which zones this memory block
+ can be onlined to.
+		The first column shows its default zone.
+		"memory6/valid_zones: Normal Movable" shows this memory block
+		can be onlined to ZONE_NORMAL by default and to ZONE_MOVABLE
+		by online_movable.
+		"memory7/valid_zones: Movable Normal" shows this memory block
+ can be onlined to ZONE_MOVABLE by default and to ZONE_NORMAL
+ by online_kernel.
NOTE:
These directories/files appear after physical memory hotplug phase.
@@ -408,7 +418,6 @@ node if necessary.
- allowing memory hot-add to ZONE_MOVABLE. maybe we need some switch like
sysctl or new control file.
- showing memory block and physical device relationship.
- - showing memory block is under ZONE_MOVABLE or not
- test and make it better memory offlining.
- support HugeTLB page migration and offlining.
- memmap removing at memory offline.
diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt
index b4498218c474..3b56a991f52e 100644
--- a/Documentation/printk-formats.txt
+++ b/Documentation/printk-formats.txt
@@ -184,6 +184,12 @@ dentry names:
equivalent of %s dentry->d_name.name we used to use, %pd<n> prints
n last components. %pD does the same thing for struct file.
+task_struct comm name:
+
+ %pT
+
+ For printing task_struct->comm.
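+
+	A minimal sketch (assuming the specifier consumes a pointer to
+	the task_struct; the message text is illustrative):
+
+	pr_info("killing %pT (pid %d)\n", p, task_pid_nr(p));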
+
struct va_format:
%pV
diff --git a/MAINTAINERS b/MAINTAINERS
index 18baa053b0ae..c6c1798fd35e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1328,8 +1328,7 @@ ARM/SAMSUNG MOBILE MACHINE SUPPORT
M: Kyungmin Park <kyungmin.park@samsung.com>
L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
S: Maintained
-F: arch/arm/mach-s5pv210/mach-aquila.c
-F: arch/arm/mach-s5pv210/mach-goni.c
+F: arch/arm/mach-s5pv210/
ARM/SAMSUNG S5P SERIES 2D GRAPHICS ACCELERATION (G2D) SUPPORT
M: Kyungmin Park <kyungmin.park@samsung.com>
@@ -1536,6 +1535,7 @@ T: git git://git.xilinx.com/linux-xlnx.git
S: Supported
F: arch/arm/mach-zynq/
F: drivers/cpuidle/cpuidle-zynq.c
+F: drivers/block/xsysace.c
N: zynq
N: xilinx
F: drivers/clocksource/cadence_ttc_timer.c
@@ -2998,7 +2998,7 @@ M: Sumit Semwal <sumit.semwal@linaro.org>
S: Maintained
L: linux-media@vger.kernel.org
L: dri-devel@lists.freedesktop.org
-L: linaro-mm-sig@lists.linaro.org
+L: linaro-mm-sig@lists.linaro.org (moderated for non-subscribers)
F: drivers/dma-buf/
F: include/linux/dma-buf*
F: include/linux/reservation.h
@@ -5238,6 +5238,13 @@ F: include/linux/lockd/
F: include/linux/sunrpc/
F: include/uapi/linux/sunrpc/
+KERNEL SELFTEST FRAMEWORK
+M: Shuah Khan <shuahkh@osg.samsung.com>
+L: linux-api@vger.kernel.org
+T: git git://git.kernel.org/pub/scm/shuah/linux-kselftest
+S: Maintained
+F: tools/testing/selftests
+
KERNEL VIRTUAL MACHINE (KVM)
M: Gleb Natapov <gleb@kernel.org>
M: Paolo Bonzini <pbonzini@redhat.com>
@@ -7905,7 +7912,6 @@ S: Supported
F: drivers/mfd/sec*.c
F: drivers/regulator/s2m*.c
F: drivers/regulator/s5m*.c
-F: drivers/rtc/rtc-sec.c
F: include/linux/mfd/samsung/
SAMSUNG S5P/EXYNOS4 SOC SERIES CAMERA SUBSYSTEM DRIVERS
@@ -10245,10 +10251,6 @@ M: John Linn <John.Linn@xilinx.com>
S: Maintained
F: drivers/net/ethernet/xilinx/xilinx_axienet*
-XILINX SYSTEMACE DRIVER
-S: Orphan
-F: drivers/block/xsysace.c
-
XILINX UARTLITE SERIAL DRIVER
M: Peter Korsgaard <jacmet@sunsite.dk>
L: linux-serial@vger.kernel.org
diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild
index e858aa0ad8af..f53010234ff8 100644
--- a/arch/alpha/include/asm/Kbuild
+++ b/arch/alpha/include/asm/Kbuild
@@ -7,4 +7,5 @@ generic-y += hash.h
generic-y += mcs_spinlock.h
generic-y += preempt.h
generic-y += scatterlist.h
+generic-y += sections.h
generic-y += trace_clock.h
diff --git a/arch/alpha/include/asm/sections.h b/arch/alpha/include/asm/sections.h
deleted file mode 100644
index 43b40edd6e44..000000000000
--- a/arch/alpha/include/asm/sections.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef _ALPHA_SECTIONS_H
-#define _ALPHA_SECTIONS_H
-
-/* nothing to see, move along */
-#include <asm-generic/sections.h>
-
-#endif
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 9e3298da8dd3..d63e50615e9c 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -14,6 +14,7 @@ config ARM
select CLONE_BACKWARDS
select CPU_PM if (SUSPEND || CPU_IDLE)
select DCACHE_WORD_ACCESS if HAVE_EFFICIENT_UNALIGNED_ACCESS
+ select GENERIC_ALLOCATOR
select GENERIC_ATOMIC64 if (CPU_V7M || CPU_V6 || !CPU_32v6K || !AEABI)
select GENERIC_CLOCKEVENTS_BROADCAST if SMP
select GENERIC_IDLE_POLL_SETUP
diff --git a/arch/arm/boot/dts/exynos3250.dtsi b/arch/arm/boot/dts/exynos3250.dtsi
index 1d52de6370d5..429a6c6cfcf9 100644
--- a/arch/arm/boot/dts/exynos3250.dtsi
+++ b/arch/arm/boot/dts/exynos3250.dtsi
@@ -164,7 +164,7 @@
};
rtc: rtc@10070000 {
- compatible = "samsung,s3c6410-rtc";
+ compatible = "samsung,exynos3250-rtc";
reg = <0x10070000 0x100>;
interrupts = <0 73 0>, <0 74 0>;
status = "disabled";
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 7a996aaa061e..c245d903927f 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -12,6 +12,7 @@
#include <linux/bootmem.h>
#include <linux/module.h>
#include <linux/mm.h>
+#include <linux/genalloc.h>
#include <linux/gfp.h>
#include <linux/errno.h>
#include <linux/list.h>
@@ -298,57 +299,29 @@ static void *
__dma_alloc_remap(struct page *page, size_t size, gfp_t gfp, pgprot_t prot,
const void *caller)
{
- struct vm_struct *area;
- unsigned long addr;
-
/*
* DMA allocation can be mapped to user space, so lets
* set VM_USERMAP flags too.
*/
- area = get_vm_area_caller(size, VM_ARM_DMA_CONSISTENT | VM_USERMAP,
- caller);
- if (!area)
- return NULL;
- addr = (unsigned long)area->addr;
- area->phys_addr = __pfn_to_phys(page_to_pfn(page));
-
- if (ioremap_page_range(addr, addr + size, area->phys_addr, prot)) {
- vunmap((void *)addr);
- return NULL;
- }
- return (void *)addr;
+ return dma_common_contiguous_remap(page, size,
+ VM_ARM_DMA_CONSISTENT | VM_USERMAP,
+ prot, caller);
}
static void __dma_free_remap(void *cpu_addr, size_t size)
{
- unsigned int flags = VM_ARM_DMA_CONSISTENT | VM_USERMAP;
- struct vm_struct *area = find_vm_area(cpu_addr);
- if (!area || (area->flags & flags) != flags) {
- WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr);
- return;
- }
- unmap_kernel_range((unsigned long)cpu_addr, size);
- vunmap(cpu_addr);
+ dma_common_free_remap(cpu_addr, size,
+ VM_ARM_DMA_CONSISTENT | VM_USERMAP);
}
#define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K
+static struct gen_pool *atomic_pool;
-struct dma_pool {
- size_t size;
- spinlock_t lock;
- unsigned long *bitmap;
- unsigned long nr_pages;
- void *vaddr;
- struct page **pages;
-};
-
-static struct dma_pool atomic_pool = {
- .size = DEFAULT_DMA_COHERENT_POOL_SIZE,
-};
+static size_t atomic_pool_size = DEFAULT_DMA_COHERENT_POOL_SIZE;
static int __init early_coherent_pool(char *p)
{
- atomic_pool.size = memparse(p, &p);
+ atomic_pool_size = memparse(p, &p);
return 0;
}
early_param("coherent_pool", early_coherent_pool);
@@ -358,14 +331,14 @@ void __init init_dma_coherent_pool_size(unsigned long size)
/*
* Catch any attempt to set the pool size too late.
*/
- BUG_ON(atomic_pool.vaddr);
+ BUG_ON(atomic_pool);
/*
* Set architecture specific coherent pool size only if
* it has not been changed by kernel command line parameter.
*/
- if (atomic_pool.size == DEFAULT_DMA_COHERENT_POOL_SIZE)
- atomic_pool.size = size;
+ if (atomic_pool_size == DEFAULT_DMA_COHERENT_POOL_SIZE)
+ atomic_pool_size = size;
}
/*
@@ -373,52 +346,44 @@ void __init init_dma_coherent_pool_size(unsigned long size)
*/
static int __init atomic_pool_init(void)
{
- struct dma_pool *pool = &atomic_pool;
pgprot_t prot = pgprot_dmacoherent(PAGE_KERNEL);
gfp_t gfp = GFP_KERNEL | GFP_DMA;
- unsigned long nr_pages = pool->size >> PAGE_SHIFT;
- unsigned long *bitmap;
struct page *page;
- struct page **pages;
void *ptr;
- int bitmap_size = BITS_TO_LONGS(nr_pages) * sizeof(long);
- bitmap = kzalloc(bitmap_size, GFP_KERNEL);
- if (!bitmap)
- goto no_bitmap;
-
- pages = kzalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
- if (!pages)
- goto no_pages;
+ atomic_pool = gen_pool_create(PAGE_SHIFT, -1);
+ if (!atomic_pool)
+ goto out;
if (dev_get_cma_area(NULL))
- ptr = __alloc_from_contiguous(NULL, pool->size, prot, &page,
- atomic_pool_init);
+ ptr = __alloc_from_contiguous(NULL, atomic_pool_size, prot,
+ &page, atomic_pool_init);
else
- ptr = __alloc_remap_buffer(NULL, pool->size, gfp, prot, &page,
- atomic_pool_init);
+ ptr = __alloc_remap_buffer(NULL, atomic_pool_size, gfp, prot,
+ &page, atomic_pool_init);
if (ptr) {
- int i;
-
- for (i = 0; i < nr_pages; i++)
- pages[i] = page + i;
-
- spin_lock_init(&pool->lock);
- pool->vaddr = ptr;
- pool->pages = pages;
- pool->bitmap = bitmap;
- pool->nr_pages = nr_pages;
- pr_info("DMA: preallocated %u KiB pool for atomic coherent allocations\n",
- (unsigned)pool->size / 1024);
+ int ret;
+
+ ret = gen_pool_add_virt(atomic_pool, (unsigned long)ptr,
+ page_to_phys(page),
+ atomic_pool_size, -1);
+ if (ret)
+ goto destroy_genpool;
+
+ gen_pool_set_algo(atomic_pool,
+ gen_pool_first_fit_order_align,
+ (void *)PAGE_SHIFT);
+ pr_info("DMA: preallocated %zd KiB pool for atomic coherent allocations\n",
+ atomic_pool_size / 1024);
return 0;
}
- kfree(pages);
-no_pages:
- kfree(bitmap);
-no_bitmap:
- pr_err("DMA: failed to allocate %u KiB pool for atomic coherent allocation\n",
- (unsigned)pool->size / 1024);
+destroy_genpool:
+ gen_pool_destroy(atomic_pool);
+ atomic_pool = NULL;
+out:
+ pr_err("DMA: failed to allocate %zx KiB pool for atomic coherent allocation\n",
+ atomic_pool_size / 1024);
return -ENOMEM;
}
/*
@@ -522,76 +487,36 @@ static void *__alloc_remap_buffer(struct device *dev, size_t size, gfp_t gfp,
static void *__alloc_from_pool(size_t size, struct page **ret_page)
{
- struct dma_pool *pool = &atomic_pool;
- unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
- unsigned int pageno;
- unsigned long flags;
+ unsigned long val;
void *ptr = NULL;
- unsigned long align_mask;
- if (!pool->vaddr) {
+ if (!atomic_pool) {
WARN(1, "coherent pool not initialised!\n");
return NULL;
}
- /*
- * Align the region allocation - allocations from pool are rather
- * small, so align them to their order in pages, minimum is a page
- * size. This helps reduce fragmentation of the DMA space.
- */
- align_mask = (1 << get_order(size)) - 1;
-
- spin_lock_irqsave(&pool->lock, flags);
- pageno = bitmap_find_next_zero_area(pool->bitmap, pool->nr_pages,
- 0, count, align_mask);
- if (pageno < pool->nr_pages) {
- bitmap_set(pool->bitmap, pageno, count);
- ptr = pool->vaddr + PAGE_SIZE * pageno;
- *ret_page = pool->pages[pageno];
- } else {
- pr_err_once("ERROR: %u KiB atomic DMA coherent pool is too small!\n"
- "Please increase it with coherent_pool= kernel parameter!\n",
- (unsigned)pool->size / 1024);
+ val = gen_pool_alloc(atomic_pool, size);
+ if (val) {
+ phys_addr_t phys = gen_pool_virt_to_phys(atomic_pool, val);
+
+ *ret_page = phys_to_page(phys);
+ ptr = (void *)val;
}
- spin_unlock_irqrestore(&pool->lock, flags);
return ptr;
}
static bool __in_atomic_pool(void *start, size_t size)
{
- struct dma_pool *pool = &atomic_pool;
- void *end = start + size;
- void *pool_start = pool->vaddr;
- void *pool_end = pool->vaddr + pool->size;
-
- if (start < pool_start || start >= pool_end)
- return false;
-
- if (end <= pool_end)
- return true;
-
- WARN(1, "Wrong coherent size(%p-%p) from atomic pool(%p-%p)\n",
- start, end - 1, pool_start, pool_end - 1);
-
- return false;
+ return addr_in_gen_pool(atomic_pool, (unsigned long)start, size);
}
static int __free_from_pool(void *start, size_t size)
{
- struct dma_pool *pool = &atomic_pool;
- unsigned long pageno, count;
- unsigned long flags;
-
if (!__in_atomic_pool(start, size))
return 0;
- pageno = (start - pool->vaddr) >> PAGE_SHIFT;
- count = size >> PAGE_SHIFT;
-
- spin_lock_irqsave(&pool->lock, flags);
- bitmap_clear(pool->bitmap, pageno, count);
- spin_unlock_irqrestore(&pool->lock, flags);
+ gen_pool_free(atomic_pool, (unsigned long)start, size);
return 1;
}
@@ -1271,29 +1196,8 @@ static void *
__iommu_alloc_remap(struct page **pages, size_t size, gfp_t gfp, pgprot_t prot,
const void *caller)
{
- unsigned int i, nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
- struct vm_struct *area;
- unsigned long p;
-
- area = get_vm_area_caller(size, VM_ARM_DMA_CONSISTENT | VM_USERMAP,
- caller);
- if (!area)
- return NULL;
-
- area->pages = pages;
- area->nr_pages = nr_pages;
- p = (unsigned long)area->addr;
-
- for (i = 0; i < nr_pages; i++) {
- phys_addr_t phys = __pfn_to_phys(page_to_pfn(pages[i]));
- if (ioremap_page_range(p, p + PAGE_SIZE, phys, prot))
- goto err;
- p += PAGE_SIZE;
- }
- return area->addr;
-err:
- unmap_kernel_range((unsigned long)area->addr, size);
- vunmap(area->addr);
+ return dma_common_pages_remap(pages, size,
+ VM_ARM_DMA_CONSISTENT | VM_USERMAP, prot, caller);
return NULL;
}
@@ -1355,11 +1259,13 @@ static int __iommu_remove_mapping(struct device *dev, dma_addr_t iova, size_t si
static struct page **__atomic_get_pages(void *addr)
{
- struct dma_pool *pool = &atomic_pool;
- struct page **pages = pool->pages;
- int offs = (addr - pool->vaddr) >> PAGE_SHIFT;
+ struct page *page;
+ phys_addr_t phys;
+
+ phys = gen_pool_virt_to_phys(atomic_pool, (unsigned long)addr);
+ page = phys_to_page(phys);
- return pages + offs;
+ return (struct page **)page;
}
static struct page **__iommu_get_pages(void *cpu_addr, struct dma_attrs *attrs)
@@ -1501,8 +1407,8 @@ void arm_iommu_free_attrs(struct device *dev, size_t size, void *cpu_addr,
}
if (!dma_get_attr(DMA_ATTR_NO_KERNEL_MAPPING, attrs)) {
- unmap_kernel_range((unsigned long)cpu_addr, size);
- vunmap(cpu_addr);
+ dma_common_free_remap(cpu_addr, size,
+ VM_ARM_DMA_CONSISTENT | VM_USERMAP);
}
__iommu_remove_mapping(dev, handle, size);
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 659c75d808dc..c1b513555786 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -322,7 +322,7 @@ void __init arm_memblock_init(const struct machine_desc *mdesc)
* reserve memory for DMA contiguous allocations,
* must come from DMA area inside low memory
*/
- dma_contiguous_reserve(min(arm_dma_limit, arm_lowmem_limit));
+ dma_contiguous_reserve(arm_dma_limit);
arm_memblock_steal_permitted = false;
memblock_dump_all();
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 466ba3251aff..2ca2ebd2e038 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -18,6 +18,7 @@ config ARM64
select COMMON_CLK
select CPU_PM if (SUSPEND || CPU_IDLE)
select DCACHE_WORD_ACCESS
+ select GENERIC_ALLOCATOR
select GENERIC_CLOCKEVENTS
select GENERIC_CLOCKEVENTS_BROADCAST if SMP
select GENERIC_CPU_AUTOPROBE
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 4164c5ace9f8..90bb7b34d058 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -27,6 +27,7 @@
#include <linux/vmalloc.h>
#include <linux/swiotlb.h>
#include <linux/amba/bus.h>
+#include <linux/genalloc.h>
#include <asm/cacheflush.h>
@@ -41,6 +42,54 @@ static pgprot_t __get_dma_pgprot(struct dma_attrs *attrs, pgprot_t prot,
return prot;
}
+static struct gen_pool *atomic_pool;
+
+#define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K
+static size_t atomic_pool_size = DEFAULT_DMA_COHERENT_POOL_SIZE;
+
+static int __init early_coherent_pool(char *p)
+{
+ atomic_pool_size = memparse(p, &p);
+ return 0;
+}
+early_param("coherent_pool", early_coherent_pool);
+
+static void *__alloc_from_pool(size_t size, struct page **ret_page)
+{
+ unsigned long val;
+ void *ptr = NULL;
+
+ if (!atomic_pool) {
+ WARN(1, "coherent pool not initialised!\n");
+ return NULL;
+ }
+
+ val = gen_pool_alloc(atomic_pool, size);
+ if (val) {
+ phys_addr_t phys = gen_pool_virt_to_phys(atomic_pool, val);
+
+ *ret_page = phys_to_page(phys);
+ ptr = (void *)val;
+ }
+
+ return ptr;
+}
+
+static bool __in_atomic_pool(void *start, size_t size)
+{
+ return addr_in_gen_pool(atomic_pool, (unsigned long)start, size);
+}
+
+static int __free_from_pool(void *start, size_t size)
+{
+ if (!__in_atomic_pool(start, size))
+ return 0;
+
+ gen_pool_free(atomic_pool, (unsigned long)start, size);
+
+ return 1;
+}
+
static void *__dma_alloc_coherent(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t flags,
struct dma_attrs *attrs)
@@ -53,7 +102,7 @@ static void *__dma_alloc_coherent(struct device *dev, size_t size,
if (IS_ENABLED(CONFIG_ZONE_DMA) &&
dev->coherent_dma_mask <= DMA_BIT_MASK(32))
flags |= GFP_DMA;
- if (IS_ENABLED(CONFIG_DMA_CMA)) {
+ if (IS_ENABLED(CONFIG_DMA_CMA) && (flags & __GFP_WAIT)) {
struct page *page;
size = PAGE_ALIGN(size);
@@ -73,50 +122,54 @@ static void __dma_free_coherent(struct device *dev, size_t size,
void *vaddr, dma_addr_t dma_handle,
struct dma_attrs *attrs)
{
+ bool freed;
+ phys_addr_t paddr = dma_to_phys(dev, dma_handle);
+
if (dev == NULL) {
WARN_ONCE(1, "Use an actual device structure for DMA allocation\n");
return;
}
- if (IS_ENABLED(CONFIG_DMA_CMA)) {
- phys_addr_t paddr = dma_to_phys(dev, dma_handle);
-
- dma_release_from_contiguous(dev,
+ freed = dma_release_from_contiguous(dev,
phys_to_page(paddr),
size >> PAGE_SHIFT);
- } else {
+ if (!freed)
swiotlb_free_coherent(dev, size, vaddr, dma_handle);
- }
}
static void *__dma_alloc_noncoherent(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t flags,
struct dma_attrs *attrs)
{
- struct page *page, **map;
+ struct page *page;
void *ptr, *coherent_ptr;
- int order, i;
size = PAGE_ALIGN(size);
- order = get_order(size);
+
+ if (!(flags & __GFP_WAIT)) {
+ struct page *page = NULL;
+ void *addr = __alloc_from_pool(size, &page);
+
+ if (addr)
+ *dma_handle = phys_to_dma(dev, page_to_phys(page));
+
+ return addr;
+ }
ptr = __dma_alloc_coherent(dev, size, dma_handle, flags, attrs);
if (!ptr)
goto no_mem;
- map = kmalloc(sizeof(struct page *) << order, flags & ~GFP_DMA);
- if (!map)
- goto no_map;
/* remove any dirty cache lines on the kernel alias */
__dma_flush_range(ptr, ptr + size);
/* create a coherent mapping */
page = virt_to_page(ptr);
- for (i = 0; i < (size >> PAGE_SHIFT); i++)
- map[i] = page + i;
- coherent_ptr = vmap(map, size >> PAGE_SHIFT, VM_MAP,
- __get_dma_pgprot(attrs, __pgprot(PROT_NORMAL_NC), false));
- kfree(map);
+ coherent_ptr = dma_common_contiguous_remap(page, size, VM_USERMAP,
+ __get_dma_pgprot(attrs,
+ __pgprot(PROT_NORMAL_NC), false),
+ NULL);
if (!coherent_ptr)
goto no_map;
@@ -135,6 +188,8 @@ static void __dma_free_noncoherent(struct device *dev, size_t size,
{
void *swiotlb_addr = phys_to_virt(dma_to_phys(dev, dma_handle));
+ if (__free_from_pool(vaddr, size))
+ return;
vunmap(vaddr);
__dma_free_coherent(dev, size, swiotlb_addr, dma_handle, attrs);
}
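
Taken together, the hunks above give arm64 the shape arm already has: callers that cannot sleep are served from the preallocated atomic pool, and the free path first asks the pool whether it owns the pointer before falling back to the full teardown. A userspace sketch of that pairing, with a toy bump allocator standing in for the genalloc pool (names and sizes are illustrative):

/*
 * Userspace sketch: non-blocking callers allocate from a preallocated
 * pool; the free path checks pool ownership first. The bump allocator
 * is an illustrative stand-in for the genalloc pool.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define POOL_SIZE 4096

static char pool[POOL_SIZE];
static size_t pool_off;

static void *alloc_from_pool(size_t size)
{
	void *p;

	if (pool_off + size > POOL_SIZE)
		return NULL;	/* pool exhausted */
	p = pool + pool_off;
	pool_off += size;
	return p;
}

static bool in_pool(const void *p)
{
	return (const char *)p >= pool && (const char *)p < pool + POOL_SIZE;
}

static void *dma_alloc(size_t size, bool can_sleep)
{
	if (!can_sleep)
		return alloc_from_pool(size);	/* atomic context */
	return malloc(size);			/* full, sleeping path */
}

static void dma_free(void *p, size_t size)
{
	(void)size;
	if (in_pool(p))
		return;		/* a real pool would recycle the range here */
	free(p);
}

int main(void)
{
	void *a = dma_alloc(64, false);
	void *b = dma_alloc(64, true);

	printf("a from pool: %d, b from pool: %d\n", in_pool(a), in_pool(b));
	dma_free(a, 64);
	dma_free(b, 64);
	return 0;
}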
@@ -332,6 +387,67 @@ static struct notifier_block amba_bus_nb = {
extern int swiotlb_late_init_with_default_size(size_t default_size);
+static int __init atomic_pool_init(void)
+{
+ pgprot_t prot = __pgprot(PROT_NORMAL_NC);
+ unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT;
+ struct page *page;
+ void *addr;
+ unsigned int pool_size_order = get_order(atomic_pool_size);
+
+ if (dev_get_cma_area(NULL))
+ page = dma_alloc_from_contiguous(NULL, nr_pages,
+ pool_size_order);
+ else
+ page = alloc_pages(GFP_DMA, pool_size_order);
+
+ if (page) {
+ int ret;
+ void *page_addr = page_address(page);
+
+ memset(page_addr, 0, atomic_pool_size);
+ __dma_flush_range(page_addr, page_addr + atomic_pool_size);
+
+ atomic_pool = gen_pool_create(PAGE_SHIFT, -1);
+ if (!atomic_pool)
+ goto free_page;
+
+ addr = dma_common_contiguous_remap(page, atomic_pool_size,
+ VM_USERMAP, prot, atomic_pool_init);
+
+ if (!addr)
+ goto destroy_genpool;
+
+ ret = gen_pool_add_virt(atomic_pool, (unsigned long)addr,
+ page_to_phys(page),
+ atomic_pool_size, -1);
+ if (ret)
+ goto remove_mapping;
+
+ gen_pool_set_algo(atomic_pool,
+ gen_pool_first_fit_order_align,
+ (void *)PAGE_SHIFT);
+
+ pr_info("DMA: preallocated %zu KiB pool for atomic allocations\n",
+ atomic_pool_size / 1024);
+ return 0;
+ }
+ goto out;
+
+remove_mapping:
+ dma_common_free_remap(addr, atomic_pool_size, VM_USERMAP);
+destroy_genpool:
+ gen_pool_destroy(atomic_pool);
+ atomic_pool = NULL;
+free_page:
+ if (!dma_release_from_contiguous(NULL, page, nr_pages))
+ __free_pages(page, pool_size_order);
+out:
+ pr_err("DMA: failed to allocate %zu KiB pool for atomic coherent allocation\n",
+ atomic_pool_size / 1024);
+ return -ENOMEM;
+}
+
static int __init swiotlb_late_init(void)
{
size_t swiotlb_size = min(SZ_64M, MAX_ORDER_NR_PAGES << PAGE_SHIFT);
@@ -346,7 +462,17 @@ static int __init swiotlb_late_init(void)
return swiotlb_late_init_with_default_size(swiotlb_size);
}
-arch_initcall(swiotlb_late_init);
+
+static int __init arm64_dma_init(void)
+{
+ int ret = 0;
+
+ ret |= swiotlb_late_init();
+ ret |= atomic_pool_init();
+
+ return ret;
+}
+arch_initcall(arm64_dma_init);
#define PREALLOC_DMA_DEBUG_ENTRIES 4096
diff --git a/arch/cris/include/asm/Kbuild b/arch/cris/include/asm/Kbuild
index 31742dfadff9..3b273dba65fa 100644
--- a/arch/cris/include/asm/Kbuild
+++ b/arch/cris/include/asm/Kbuild
@@ -14,6 +14,7 @@ generic-y += mcs_spinlock.h
generic-y += module.h
generic-y += preempt.h
generic-y += scatterlist.h
+generic-y += sections.h
generic-y += trace_clock.h
generic-y += vga.h
generic-y += xor.h
diff --git a/arch/cris/include/asm/sections.h b/arch/cris/include/asm/sections.h
deleted file mode 100644
index 2c998ce8967b..000000000000
--- a/arch/cris/include/asm/sections.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef _CRIS_SECTIONS_H
-#define _CRIS_SECTIONS_H
-
-/* nothing to see, move along */
-#include <asm-generic/sections.h>
-
-#endif
diff --git a/arch/m32r/include/asm/Kbuild b/arch/m32r/include/asm/Kbuild
index accc10a3dc78..761eb79eff4d 100644
--- a/arch/m32r/include/asm/Kbuild
+++ b/arch/m32r/include/asm/Kbuild
@@ -7,4 +7,5 @@ generic-y += mcs_spinlock.h
generic-y += module.h
generic-y += preempt.h
generic-y += scatterlist.h
+generic-y += sections.h
generic-y += trace_clock.h
diff --git a/arch/m32r/include/asm/sections.h b/arch/m32r/include/asm/sections.h
deleted file mode 100644
index 5e5d21c4908a..000000000000
--- a/arch/m32r/include/asm/sections.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef _M32R_SECTIONS_H
-#define _M32R_SECTIONS_H
-
-/* nothing to see, move along */
-#include <asm-generic/sections.h>
-
-#endif /* _M32R_SECTIONS_H */
diff --git a/arch/m68k/kernel/sys_m68k.c b/arch/m68k/kernel/sys_m68k.c
index 3a480b3df0d6..9aa01adb407f 100644
--- a/arch/m68k/kernel/sys_m68k.c
+++ b/arch/m68k/kernel/sys_m68k.c
@@ -376,7 +376,6 @@ cache_flush_060 (unsigned long addr, int scope, int cache, unsigned long len)
asmlinkage int
sys_cacheflush (unsigned long addr, int scope, int cache, unsigned long len)
{
- struct vm_area_struct *vma;
int ret = -EINVAL;
if (scope < FLUSH_SCOPE_LINE || scope > FLUSH_SCOPE_ALL ||
@@ -389,17 +388,21 @@ sys_cacheflush (unsigned long addr, int scope, int cache, unsigned long len)
if (!capable(CAP_SYS_ADMIN))
goto out;
} else {
+ struct vm_area_struct *vma;
+
+ /* Check for overflow. */
+ if (addr + len < addr)
+ goto out;
+
/*
* Verify that the specified address region actually belongs
* to this process.
*/
- vma = find_vma (current->mm, addr);
ret = -EINVAL;
- /* Check for overflow. */
- if (addr + len < addr)
- goto out;
- if (vma == NULL || addr < vma->vm_start || addr + len > vma->vm_end)
- goto out;
+ down_read(&current->mm->mmap_sem);
+ vma = find_vma(current->mm, addr);
+ if (!vma || addr < vma->vm_start || addr + len > vma->vm_end)
+ goto out_unlock;
}
if (CPU_IS_020_OR_030) {
@@ -429,7 +432,7 @@ sys_cacheflush (unsigned long addr, int scope, int cache, unsigned long len)
__asm__ __volatile__ ("movec %0, %%cacr" : : "r" (cacr));
}
ret = 0;
- goto out;
+ goto out_unlock;
} else {
/*
* 040 or 060: don't blindly trust 'scope', someone could
@@ -446,6 +449,8 @@ sys_cacheflush (unsigned long addr, int scope, int cache, unsigned long len)
ret = cache_flush_060 (addr, scope, cache, len);
}
}
+out_unlock:
+ up_read(&current->mm->mmap_sem);
out:
return ret;
}
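
The hunk above fixes two independent bugs: the overflow check now runs before the range is used, and find_vma() is finally called with mmap_sem held for reading. A userspace sketch of the resulting pattern, using a pthread rwlock in place of mmap_sem (compile with -pthread; the names are illustrative):

/*
 * Userspace sketch: reject wrapping ranges up front, and only consult
 * the VMA-like lookup under a read lock.
 */
#include <pthread.h>
#include <stdio.h>

struct vma { unsigned long start, end; };

static pthread_rwlock_t mmap_lock = PTHREAD_RWLOCK_INITIALIZER;
static struct vma the_vma = { 0x1000, 0x9000 };

static int range_ok(unsigned long addr, unsigned long len)
{
	int ok;

	if (addr + len < addr)		/* unsigned wraparound */
		return 0;

	pthread_rwlock_rdlock(&mmap_lock);
	ok = addr >= the_vma.start && addr + len <= the_vma.end;
	pthread_rwlock_unlock(&mmap_lock);
	return ok;
}

int main(void)
{
	printf("%d\n", range_ok(0x2000, 0x1000));	/* 1: inside */
	printf("%d\n", range_ok(~0ul, 0x10));		/* 0: wraps */
	return 0;
}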
diff --git a/arch/mn10300/include/asm/Kbuild b/arch/mn10300/include/asm/Kbuild
index ecbd6676bd33..8504ac52357b 100644
--- a/arch/mn10300/include/asm/Kbuild
+++ b/arch/mn10300/include/asm/Kbuild
@@ -7,4 +7,5 @@ generic-y += hash.h
generic-y += mcs_spinlock.h
generic-y += preempt.h
generic-y += scatterlist.h
+generic-y += sections.h
generic-y += trace_clock.h
diff --git a/arch/mn10300/include/asm/sections.h b/arch/mn10300/include/asm/sections.h
deleted file mode 100644
index 2b8c5160388f..000000000000
--- a/arch/mn10300/include/asm/sections.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/sections.h>
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index d98c1ecc3266..f60d4ea8b50c 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -38,10 +38,9 @@ static inline int pte_none(pte_t pte) { return (pte_val(pte) & ~_PTE_NONE_MASK)
static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); }
#ifdef CONFIG_NUMA_BALANCING
-
static inline int pte_present(pte_t pte)
{
- return pte_val(pte) & (_PAGE_PRESENT | _PAGE_NUMA);
+ return pte_val(pte) & _PAGE_NUMA_MASK;
}
#define pte_present_nonuma pte_present_nonuma
@@ -50,37 +49,6 @@ static inline int pte_present_nonuma(pte_t pte)
return pte_val(pte) & (_PAGE_PRESENT);
}
-#define pte_numa pte_numa
-static inline int pte_numa(pte_t pte)
-{
- return (pte_val(pte) &
- (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA;
-}
-
-#define pte_mknonnuma pte_mknonnuma
-static inline pte_t pte_mknonnuma(pte_t pte)
-{
- pte_val(pte) &= ~_PAGE_NUMA;
- pte_val(pte) |= _PAGE_PRESENT | _PAGE_ACCESSED;
- return pte;
-}
-
-#define pte_mknuma pte_mknuma
-static inline pte_t pte_mknuma(pte_t pte)
-{
- /*
- * We should not set _PAGE_NUMA on non present ptes. Also clear the
- * present bit so that hash_page will return 1 and we collect this
- * as numa fault.
- */
- if (pte_present(pte)) {
- pte_val(pte) |= _PAGE_NUMA;
- pte_val(pte) &= ~_PAGE_PRESENT;
- } else
- VM_BUG_ON(1);
- return pte;
-}
-
#define ptep_set_numa ptep_set_numa
static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
@@ -92,12 +60,6 @@ static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr,
return;
}
-#define pmd_numa pmd_numa
-static inline int pmd_numa(pmd_t pmd)
-{
- return pte_numa(pmd_pte(pmd));
-}
-
#define pmdp_set_numa pmdp_set_numa
static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp)
@@ -109,16 +71,21 @@ static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
return;
}
-#define pmd_mknonnuma pmd_mknonnuma
-static inline pmd_t pmd_mknonnuma(pmd_t pmd)
+/*
+ * Generic NUMA pte helpers expect pteval_t and pmdval_t types to exist
+ * which were inherited from x86. For the purposes of powerpc, pte_basic_t
+ * and pmd_t are equivalent.
+ */
+#define pteval_t pte_basic_t
+#define pmdval_t pmd_t
+static inline pteval_t ptenuma_flags(pte_t pte)
{
- return pte_pmd(pte_mknonnuma(pmd_pte(pmd)));
+ return pte_val(pte) & _PAGE_NUMA_MASK;
}
-#define pmd_mknuma pmd_mknuma
-static inline pmd_t pmd_mknuma(pmd_t pmd)
+static inline pmdval_t pmdnuma_flags(pmd_t pmd)
{
- return pte_pmd(pte_mknuma(pmd_pte(pmd)));
+ return pmd_val(pmd) & _PAGE_NUMA_MASK;
}
# else
diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h
index 8d1569c29042..e040c3595129 100644
--- a/arch/powerpc/include/asm/pte-common.h
+++ b/arch/powerpc/include/asm/pte-common.h
@@ -98,6 +98,11 @@ extern unsigned long bad_call_to_PMD_PAGE_SIZE(void);
_PAGE_USER | _PAGE_ACCESSED | \
_PAGE_RW | _PAGE_HWWRITE | _PAGE_DIRTY | _PAGE_EXEC)
+#ifdef CONFIG_NUMA_BALANCING
+/* Mask of bits that distinguish present and numa ptes */
+#define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PRESENT)
+#endif
+
/*
* We define 2 sets of base prot bits, one for basic pages (ie,
* cacheable kernel and user pages) and one for non cacheable
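
With the per-arch pte_numa()/pte_mknuma() family deleted, the architecture only has to expose ptenuma_flags()/pmdnuma_flags(); the generic helpers can then classify an entry by comparing the masked flags against _PAGE_NUMA. A userspace sketch of that classification, assuming the asm-generic comparison form, with illustrative bit values rather than the real powerpc or x86 encodings:

/*
 * Userspace sketch of classifying a pte by its NUMA mask; the bit
 * values are illustrative, not the real powerpc or x86 encodings.
 */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t pteval_t;

#define _PAGE_PRESENT	0x001
#define _PAGE_NUMA	0x200
#define _PAGE_NUMA_MASK	(_PAGE_NUMA | _PAGE_PRESENT)

/* A pte is a NUMA hinting entry iff only _PAGE_NUMA is set in the mask. */
static int pte_numa(pteval_t flags)
{
	return (flags & _PAGE_NUMA_MASK) == _PAGE_NUMA;
}

int main(void)
{
	printf("%d\n", pte_numa(_PAGE_PRESENT));		/* 0 */
	printf("%d\n", pte_numa(_PAGE_NUMA));			/* 1 */
	printf("%d\n", pte_numa(_PAGE_NUMA | _PAGE_PRESENT));	/* 0 */
	return 0;
}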
diff --git a/arch/score/include/asm/Kbuild b/arch/score/include/asm/Kbuild
index aad209199f7e..6fc350390063 100644
--- a/arch/score/include/asm/Kbuild
+++ b/arch/score/include/asm/Kbuild
@@ -9,5 +9,6 @@ generic-y += hash.h
generic-y += mcs_spinlock.h
generic-y += preempt.h
generic-y += scatterlist.h
+generic-y += sections.h
generic-y += trace_clock.h
generic-y += xor.h
diff --git a/arch/score/include/asm/sections.h b/arch/score/include/asm/sections.h
deleted file mode 100644
index 9441d23af005..000000000000
--- a/arch/score/include/asm/sections.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _ASM_SCORE_SECTIONS_H
-#define _ASM_SCORE_SECTIONS_H
-
-#include <asm-generic/sections.h>
-
-#endif /* _ASM_SCORE_SECTIONS_H */
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 51e0ae3b2108..5b1b180905e5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -29,7 +29,6 @@ config X86
select HAVE_UNSTABLE_SCHED_CLOCK
select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
select ARCH_SUPPORTS_INT128 if X86_64
- select ARCH_WANTS_PROT_NUMA_PROT_NONE
select HAVE_IDE
select HAVE_OPROFILE
select HAVE_PCSPKR_PLATFORM
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 4064acae625d..01b493e5a99b 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -9,7 +9,6 @@
#ifdef CONFIG_NUMA
#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
-#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
/*
* Too small node sizes may confuse the VM badly. Usually they
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 5be9063545d2..809abb335627 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -115,7 +115,8 @@ static inline void native_pgd_clear(pgd_t *pgd)
native_set_pgd(pgd, native_make_pgd(0));
}
-extern void sync_global_pgds(unsigned long start, unsigned long end);
+extern void sync_global_pgds(unsigned long start, unsigned long end,
+ int removed);
/*
* Conversion functions: convert a page and protection to a page entry,
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index cee83b26916f..07789647bf33 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -324,6 +324,20 @@ static inline pteval_t pte_flags(pte_t pte)
return native_pte_val(pte) & PTE_FLAGS_MASK;
}
+#ifdef CONFIG_NUMA_BALANCING
+/* Set of bits that distinguishes present, prot_none and numa ptes */
+#define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)
+static inline pteval_t ptenuma_flags(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_NUMA_MASK;
+}
+
+static inline pmdval_t pmdnuma_flags(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_NUMA_MASK;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
#define pgprot_val(x) ((x).pgprot)
#define __pgprot(x) ((pgprot_t) { (x) } )
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 83bb03bfa259..15d7cc4a67af 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -350,7 +350,7 @@ out:
void vmalloc_sync_all(void)
{
- sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
+ sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END, 0);
}
/*
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 5d984769cbd8..4cb8763868fc 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -178,7 +178,7 @@ __setup("noexec32=", nonx32_setup);
* When memory was added/removed make sure all the processes MM have
* suitable PGD entries in the local PGD level page.
*/
-void sync_global_pgds(unsigned long start, unsigned long end)
+void sync_global_pgds(unsigned long start, unsigned long end, int removed)
{
unsigned long address;
@@ -186,7 +186,12 @@ void sync_global_pgds(unsigned long start, unsigned long end)
const pgd_t *pgd_ref = pgd_offset_k(address);
struct page *page;
- if (pgd_none(*pgd_ref))
+ /*
+ * When called after memory hot-remove, pgd_none() returns
+ * true for the removed reference entry. In that case
+ * (removed == 1), we must clear the corresponding PGD
+ * entries in the local PGD level page.
+ */
+ if (pgd_none(*pgd_ref) && !removed)
continue;
spin_lock(&pgd_lock);
@@ -199,12 +204,18 @@ void sync_global_pgds(unsigned long start, unsigned long end)
pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
spin_lock(pgt_lock);
- if (pgd_none(*pgd))
- set_pgd(pgd, *pgd_ref);
- else
+ if (!pgd_none(*pgd_ref) && !pgd_none(*pgd))
BUG_ON(pgd_page_vaddr(*pgd)
!= pgd_page_vaddr(*pgd_ref));
+ if (removed) {
+ if (pgd_none(*pgd_ref) && !pgd_none(*pgd))
+ pgd_clear(pgd);
+ } else {
+ if (pgd_none(*pgd))
+ set_pgd(pgd, *pgd_ref);
+ }
+
spin_unlock(pgt_lock);
}
spin_unlock(&pgd_lock);
@@ -633,7 +644,7 @@ kernel_physical_mapping_init(unsigned long start,
}
if (pgd_changed)
- sync_global_pgds(addr, end - 1);
+ sync_global_pgds(addr, end - 1, 0);
__flush_tlb_all();
@@ -976,25 +987,26 @@ static void __meminit
remove_pagetable(unsigned long start, unsigned long end, bool direct)
{
unsigned long next;
+ unsigned long addr;
pgd_t *pgd;
pud_t *pud;
bool pgd_changed = false;
- for (; start < end; start = next) {
- next = pgd_addr_end(start, end);
+ for (addr = start; addr < end; addr = next) {
+ next = pgd_addr_end(addr, end);
- pgd = pgd_offset_k(start);
+ pgd = pgd_offset_k(addr);
if (!pgd_present(*pgd))
continue;
pud = (pud_t *)pgd_page_vaddr(*pgd);
- remove_pud_table(pud, start, next, direct);
+ remove_pud_table(pud, addr, next, direct);
if (free_pud_table(pud, pgd))
pgd_changed = true;
}
if (pgd_changed)
- sync_global_pgds(start, end - 1);
+ sync_global_pgds(start, end - 1, 1);
flush_tlb_all();
}
@@ -1341,7 +1353,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
else
err = vmemmap_populate_basepages(start, end, node);
if (!err)
- sync_global_pgds(start, end - 1);
+ sync_global_pgds(start, end - 1, 0);
return err;
}
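
The new int argument makes sync_global_pgds() symmetric: on hot-add a missing local entry is filled in from the reference table, while on hot-remove (removed == 1) a local entry whose reference has gone pgd_none() is cleared. A userspace sketch of that reconciliation, with plain pointers standing in for pgd entries:

/*
 * Userspace sketch of the two-way sync: copy new entries from the
 * reference table on add, clear stale ones on remove. Plain pointers
 * stand in for pgd entries; NPGD is illustrative.
 */
#include <stdio.h>

#define NPGD 4

static void sync_pgds(void **pgd, void **pgd_ref, int removed)
{
	int i;

	for (i = 0; i < NPGD; i++) {
		if (!pgd_ref[i] && !removed)
			continue;
		if (removed) {
			if (!pgd_ref[i] && pgd[i])
				pgd[i] = NULL;		/* clear stale entry */
		} else {
			if (!pgd[i])
				pgd[i] = pgd_ref[i];	/* propagate new entry */
		}
	}
}

int main(void)
{
	int a, b;
	void *ref[NPGD] = { &a, NULL, NULL, NULL };
	void *mm[NPGD]  = { NULL, &b, NULL, NULL };

	sync_pgds(mm, ref, 0);	/* hot-add: mm[0] gains ref[0] */
	sync_pgds(mm, ref, 1);	/* hot-remove: mm[1] is cleared */
	printf("%p %p\n", mm[0], mm[1]);
	return 0;
}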
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index baff1da354e0..af78e50ca6ce 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -86,6 +86,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
pgprot_t prot;
int retval;
void __iomem *ret_addr;
+ int ram_region;
/* Don't allow wraparound or zero size */
last_addr = phys_addr + size - 1;
@@ -108,12 +109,23 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
/*
* Don't allow anybody to remap normal RAM that we're using..
*/
- pfn = phys_addr >> PAGE_SHIFT;
- last_pfn = last_addr >> PAGE_SHIFT;
- if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL,
- __ioremap_check_ram) == 1)
+ /* First check whether the whole region can be identified as RAM or not */
+ ram_region = region_is_ram(phys_addr, size);
+ if (ram_region > 0) {
+ WARN_ONCE(1, "ioremap on RAM at 0x%lx - 0x%lx\n",
+ (unsigned long int)phys_addr,
+ (unsigned long int)last_addr);
return NULL;
+ }
+ /* If it could not be identified (-1), check page by page */
+ if (ram_region < 0) {
+ pfn = phys_addr >> PAGE_SHIFT;
+ last_pfn = last_addr >> PAGE_SHIFT;
+ if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL,
+ __ioremap_check_ram) == 1)
+ return NULL;
+ }
/*
* Mappings have to be page-aligned
*/
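
The rewritten check is a tri-state dispatch: region_is_ram() returns 1 when the whole range is RAM, 0 when none of it is, and -1 when it cannot tell, and only the -1 case pays for the page-by-page walk. A userspace sketch of that convention, with one hard-coded RAM range as a stand-in for the kernel's resource tree:

/*
 * Userspace sketch of the tri-state check: 1 = all RAM, 0 = no RAM,
 * -1 = cannot tell, with only -1 paying for the slow walk. The single
 * RAM range and the page-walk stand-in are illustrative.
 */
#include <stdio.h>

static int region_is_ram(unsigned long start, unsigned long size)
{
	unsigned long ram_s = 0x100000, ram_e = 0x200000;
	unsigned long end = start + size;

	if (start >= ram_s && end <= ram_e)
		return 1;		/* entirely RAM */
	if (end <= ram_s || start >= ram_e)
		return 0;		/* entirely not RAM */
	return -1;			/* straddles a boundary: unknown */
}

static int walk_pages_for_ram(unsigned long start, unsigned long size)
{
	return region_is_ram(start, size) != 0;	/* stand-in for the walk */
}

int main(void)
{
	unsigned long s = 0x0f0000, sz = 0x20000;	/* straddles the edge */
	int ram = region_is_ram(s, sz);

	if (ram > 0 || (ram < 0 && walk_pages_for_ram(s, sz)))
		printf("refusing to remap RAM\n");
	return 0;
}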
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index a32b706c401a..1a883705a12a 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -185,8 +185,8 @@ int __init numa_add_memblk(int nid, u64 start, u64 end)
return numa_add_memblk_to(nid, start, end, &numa_meminfo);
}
-/* Initialize NODE_DATA for a node on the local memory */
-static void __init setup_node_data(int nid, u64 start, u64 end)
+/* Allocate NODE_DATA for a node on the local memory */
+static void __init alloc_node_data(int nid)
{
const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
u64 nd_pa;
@@ -194,18 +194,6 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
int tnid;
/*
- * Don't confuse VM with a node that doesn't have the
- * minimum amount of memory:
- */
- if (end && (end - start) < NODE_MIN_SIZE)
- return;
-
- start = roundup(start, ZONE_ALIGN);
-
- printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n",
- nid, start, end - 1);
-
- /*
* Allocate node data. Try node-local memory and then any node.
* Never allocate in DMA zone.
*/
@@ -222,7 +210,7 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
nd = __va(nd_pa);
/* report and initialize */
- printk(KERN_INFO " NODE_DATA [mem %#010Lx-%#010Lx]\n",
+ printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
nd_pa, nd_pa + nd_size - 1);
tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
if (tnid != nid)
@@ -230,9 +218,6 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
node_data[nid] = nd;
memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
- NODE_DATA(nid)->node_id = nid;
- NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT;
- NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT;
node_set_online(nid);
}
@@ -478,6 +463,42 @@ static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
return true;
}
+static void __init numa_clear_kernel_node_hotplug(void)
+{
+ int i, nid;
+ nodemask_t numa_kernel_nodes = NODE_MASK_NONE;
+ unsigned long start, end;
+ struct memblock_region *r;
+
+ /*
+ * At this time, all memory regions reserved by memblock are
+ * used by the kernel. Setting the nid in memblock.reserved
+ * will mark out all the nodes the kernel resides in.
+ */
+ for (i = 0; i < numa_meminfo.nr_blks; i++) {
+ struct numa_memblk *mb = &numa_meminfo.blk[i];
+
+ memblock_set_node(mb->start, mb->end - mb->start,
+ &memblock.reserved, mb->nid);
+ }
+
+ /* Mark all kernel nodes. */
+ for_each_memblock(reserved, r)
+ node_set(r->nid, numa_kernel_nodes);
+
+ /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */
+ for (i = 0; i < numa_meminfo.nr_blks; i++) {
+ nid = numa_meminfo.blk[i].nid;
+ if (!node_isset(nid, numa_kernel_nodes))
+ continue;
+
+ start = numa_meminfo.blk[i].start;
+ end = numa_meminfo.blk[i].end;
+
+ memblock_clear_hotplug(start, end - start);
+ }
+}
+
static int __init numa_register_memblks(struct numa_meminfo *mi)
{
unsigned long uninitialized_var(pfn_align);
@@ -496,6 +517,15 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
}
/*
+ * Very early on, the kernel has to use some memory, e.g. for
+ * loading the kernel image. We cannot prevent this anyway. So any
+ * node the kernel resides in should be un-hotpluggable.
+ *
+ * And when we come here, alloc node data won't fail.
+ */
+ numa_clear_kernel_node_hotplug();
+
+ /*
* If sections array is gonna be used for pfn -> nid mapping, check
* whether its granularity is fine enough.
*/
@@ -523,8 +553,17 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
end = max(mi->blk[i].end, end);
}
- if (start < end)
- setup_node_data(nid, start, end);
+ if (start >= end)
+ continue;
+
+ /*
+ * Don't confuse VM with a node that doesn't have the
+ * minimum amount of memory:
+ */
+ if (end && (end - start) < NODE_MIN_SIZE)
+ continue;
+
+ alloc_node_data(nid);
}
/* Dump memblock with node info and return. */
@@ -554,41 +593,6 @@ static void __init numa_init_array(void)
}
}
-static void __init numa_clear_kernel_node_hotplug(void)
-{
- int i, nid;
- nodemask_t numa_kernel_nodes = NODE_MASK_NONE;
- unsigned long start, end;
- struct memblock_region *r;
-
- /*
- * At this time, all memory regions reserved by memblock are
- * used by the kernel. Set the nid in memblock.reserved will
- * mark out all the nodes the kernel resides in.
- */
- for (i = 0; i < numa_meminfo.nr_blks; i++) {
- struct numa_memblk *mb = &numa_meminfo.blk[i];
- memblock_set_node(mb->start, mb->end - mb->start,
- &memblock.reserved, mb->nid);
- }
-
- /* Mark all kernel nodes. */
- for_each_memblock(reserved, r)
- node_set(r->nid, numa_kernel_nodes);
-
- /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */
- for (i = 0; i < numa_meminfo.nr_blks; i++) {
- nid = numa_meminfo.blk[i].nid;
- if (!node_isset(nid, numa_kernel_nodes))
- continue;
-
- start = numa_meminfo.blk[i].start;
- end = numa_meminfo.blk[i].end;
-
- memblock_clear_hotplug(start, end - start);
- }
-}
-
static int __init numa_init(int (*init_func)(void))
{
int i;
@@ -643,15 +647,6 @@ static int __init numa_init(int (*init_func)(void))
}
numa_init_array();
- /*
- * At very early time, the kernel have to use some memory such as
- * loading the kernel image. We cannot prevent this anyway. So any
- * node the kernel resides in should be un-hotpluggable.
- *
- * And when we come here, numa_init() won't fail.
- */
- numa_clear_kernel_node_hotplug();
-
return 0;
}
diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c
index f15103dff4b4..d143d216d52b 100644
--- a/arch/x86/platform/efi/efi-bgrt.c
+++ b/arch/x86/platform/efi/efi-bgrt.c
@@ -40,20 +40,40 @@ void __init efi_bgrt_init(void)
if (ACPI_FAILURE(status))
return;
- if (bgrt_tab->header.length < sizeof(*bgrt_tab))
+ if (bgrt_tab->header.length < sizeof(*bgrt_tab)) {
+ pr_err("Ignoring BGRT: invalid length %u (expected %zu)\n",
+ bgrt_tab->header.length, sizeof(*bgrt_tab));
return;
- if (bgrt_tab->version != 1 || bgrt_tab->status != 1)
+ }
+ if (bgrt_tab->version != 1) {
+ pr_err("Ignoring BGRT: invalid version %u (expected 1)\n",
+ bgrt_tab->version);
+ return;
+ }
+ if (bgrt_tab->status != 1) {
+ pr_err("Ignoring BGRT: invalid status %u (expected 1)\n",
+ bgrt_tab->status);
+ return;
+ }
+ if (bgrt_tab->image_type != 0) {
+ pr_err("Ignoring BGRT: invalid image type %u (expected 0)\n",
+ bgrt_tab->image_type);
return;
- if (bgrt_tab->image_type != 0 || !bgrt_tab->image_address)
+ }
+ if (!bgrt_tab->image_address) {
+ pr_err("Ignoring BGRT: null image address\n");
return;
+ }
image = efi_lookup_mapped_addr(bgrt_tab->image_address);
if (!image) {
image = early_memremap(bgrt_tab->image_address,
sizeof(bmp_header));
ioremapped = true;
- if (!image)
+ if (!image) {
+ pr_err("Ignoring BGRT: failed to map image header memory\n");
return;
+ }
}
memcpy_fromio(&bmp_header, image, sizeof(bmp_header));
@@ -61,14 +81,18 @@ void __init efi_bgrt_init(void)
early_iounmap(image, sizeof(bmp_header));
bgrt_image_size = bmp_header.size;
- bgrt_image = kmalloc(bgrt_image_size, GFP_KERNEL);
- if (!bgrt_image)
+ bgrt_image = kmalloc(bgrt_image_size, GFP_KERNEL | __GFP_NOWARN);
+ if (!bgrt_image) {
+ pr_err("Ignoring BGRT: failed to allocate memory for image (wanted %zu bytes)\n",
+ bgrt_image_size);
return;
+ }
if (ioremapped) {
image = early_memremap(bgrt_tab->image_address,
bmp_header.size);
if (!image) {
+ pr_err("Ignoring BGRT: failed to map image memory\n");
kfree(bgrt_image);
bgrt_image = NULL;
return;
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
index 899dd2454256..f52e033557c9 100644
--- a/arch/x86/purgatory/Makefile
+++ b/arch/x86/purgatory/Makefile
@@ -18,8 +18,9 @@ $(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE
targets += kexec-purgatory.c
+CMD_BIN2C = $(objtree)/scripts/basic/bin2c
quiet_cmd_bin2c = BIN2C $@
- cmd_bin2c = cat $(obj)/purgatory.ro | $(objtree)/scripts/basic/bin2c kexec_purgatory > $(obj)/kexec-purgatory.c
+ cmd_bin2c = $(CMD_BIN2C) kexec_purgatory < $< > $@
$(obj)/kexec-purgatory.c: $(obj)/purgatory.ro FORCE
$(call if_changed,bin2c)
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index f14b4abbebd8..fa6f4eadc623 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -646,6 +646,4 @@ void __init bio_integrity_init(void)
sizeof(struct bio_integrity_payload) +
sizeof(struct bio_vec) * BIP_INLINE_VECS,
0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
- if (!bip_slab)
- panic("Failed to create slab\n");
}
diff --git a/block/genhd.c b/block/genhd.c
index ce179854c181..d714fafaa30d 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -852,7 +852,7 @@ static int show_partition(struct seq_file *seqf, void *v)
char buf[BDEVNAME_SIZE];
/* Don't show non-partitionable removable devices or empty devices */
- if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
+ if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) &&
(sgp->flags & GENHD_FL_REMOVABLE)))
return 0;
if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
diff --git a/drivers/base/dma-mapping.c b/drivers/base/dma-mapping.c
index 6cd08e145bfa..3a6af66be1e4 100644
--- a/drivers/base/dma-mapping.c
+++ b/drivers/base/dma-mapping.c
@@ -10,6 +10,8 @@
#include <linux/dma-mapping.h>
#include <linux/export.h>
#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
#include <asm-generic/dma-coherent.h>
/*
@@ -267,3 +269,71 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
return ret;
}
EXPORT_SYMBOL(dma_common_mmap);
+
+#ifdef CONFIG_MMU
+/*
+ * remaps an array of PAGE_SIZE pages into another vm_area
+ * Cannot be used in non-sleeping contexts
+ */
+void *dma_common_pages_remap(struct page **pages, size_t size,
+ unsigned long vm_flags, pgprot_t prot,
+ const void *caller)
+{
+ struct vm_struct *area;
+
+ area = get_vm_area_caller(size, vm_flags, caller);
+ if (!area)
+ return NULL;
+
+ if (map_vm_area(area, prot, pages)) {
+ vunmap(area->addr);
+ return NULL;
+ }
+
+ return area->addr;
+}
+
+/*
+ * remaps an allocated contiguous region into another vm_area.
+ * Cannot be used in non-sleeping contexts
+ */
+
+void *dma_common_contiguous_remap(struct page *page, size_t size,
+ unsigned long vm_flags,
+ pgprot_t prot, const void *caller)
+{
+ int i;
+ struct page **pages;
+ void *ptr;
+ unsigned long pfn;
+
+ pages = kmalloc(sizeof(struct page *) << get_order(size), GFP_KERNEL);
+ if (!pages)
+ return NULL;
+
+ for (i = 0, pfn = page_to_pfn(page); i < (size >> PAGE_SHIFT); i++)
+ pages[i] = pfn_to_page(pfn + i);
+
+ ptr = dma_common_pages_remap(pages, size, vm_flags, prot, caller);
+
+ kfree(pages);
+
+ return ptr;
+}
+
+/*
+ * unmaps a range previously mapped by dma_common_*_remap
+ */
+void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags)
+{
+ struct vm_struct *area = find_vm_area(cpu_addr);
+
+ if (!area || (area->flags & vm_flags) != vm_flags) {
+ WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr);
+ return;
+ }
+
+ unmap_kernel_range((unsigned long)cpu_addr, size);
+ vunmap(cpu_addr);
+}
+#endif
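
dma_common_contiguous_remap() feeds a physically contiguous region to the page-array remapper by synthesizing the page list from the first page, which is why only the kmalloc of the temporary array can fail. A userspace sketch of that construction, with pfn values standing in for struct page pointers:

/*
 * Userspace sketch of the contiguous-to-page-array trampoline: the page
 * list is synthesized from the first page, so only the temporary array
 * allocation can fail. pfn numbers stand in for struct page pointers.
 */
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT 12

static void remap_pages(const unsigned long *pfns, size_t n)
{
	size_t i;

	for (i = 0; i < n; i++)
		printf("map pfn %lu\n", pfns[i]);
}

static int remap_contiguous(unsigned long first_pfn, size_t size)
{
	size_t i, n = size >> PAGE_SHIFT;
	unsigned long *pfns = malloc(n * sizeof(*pfns));

	if (!pfns)
		return -1;
	for (i = 0; i < n; i++)
		pfns[i] = first_pfn + i;	/* contiguous by construction */
	remap_pages(pfns, n);
	free(pfns);
	return 0;
}

int main(void)
{
	return remap_contiguous(256, 4ul << PAGE_SHIFT);	/* four pages */
}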
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index a2e13e250bba..7c5d87191b28 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -373,6 +373,45 @@ static ssize_t show_phys_device(struct device *dev,
return sprintf(buf, "%d\n", mem->phys_device);
}
+#ifdef CONFIG_MEMORY_HOTREMOVE
+static ssize_t show_valid_zones(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct memory_block *mem = to_memory_block(dev);
+ unsigned long start_pfn, end_pfn;
+ unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
+ struct page *first_page;
+ struct zone *zone;
+
+ start_pfn = section_nr_to_pfn(mem->start_section_nr);
+ end_pfn = start_pfn + nr_pages;
+ first_page = pfn_to_page(start_pfn);
+
+ /* A block that contains more than one zone cannot be offlined. */
+ if (!test_pages_in_a_zone(start_pfn, end_pfn))
+ return sprintf(buf, "none\n");
+
+ zone = page_zone(first_page);
+
+ if (zone_idx(zone) == ZONE_MOVABLE - 1) {
+ /* The mem block is the last memory block of this zone. */
+ if (end_pfn == zone_end_pfn(zone))
+ return sprintf(buf, "%s %s\n",
+ zone->name, (zone + 1)->name);
+ }
+
+ if (zone_idx(zone) == ZONE_MOVABLE) {
+ /* The mem block is the first memory block of ZONE_MOVABLE. */
+ if (start_pfn == zone->zone_start_pfn)
+ return sprintf(buf, "%s %s\n",
+ zone->name, (zone - 1)->name);
+ }
+
+ return sprintf(buf, "%s\n", zone->name);
+}
+static DEVICE_ATTR(valid_zones, 0444, show_valid_zones, NULL);
+#endif
+
static DEVICE_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL);
static DEVICE_ATTR(state, 0644, show_mem_state, store_mem_state);
static DEVICE_ATTR(phys_device, 0444, show_phys_device, NULL);
@@ -523,6 +562,9 @@ static struct attribute *memory_memblk_attrs[] = {
&dev_attr_state.attr,
&dev_attr_phys_device.attr,
&dev_attr_removable.attr,
+#ifdef CONFIG_MEMORY_HOTREMOVE
+ &dev_attr_valid_zones.attr,
+#endif
NULL
};
diff --git a/drivers/base/node.c b/drivers/base/node.c
index c6d3ae05f1ca..1f8d8b116482 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -120,6 +120,9 @@ static ssize_t node_read_meminfo(struct device *dev,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
"Node %d AnonHugePages: %8lu kB\n"
#endif
+#ifdef CONFIG_MEMORY_BALLOON
+ "Node %d BalloonPages: %8lu kB\n"
+#endif
,
nid, K(node_page_state(nid, NR_FILE_DIRTY)),
nid, K(node_page_state(nid, NR_WRITEBACK)),
@@ -136,14 +139,15 @@ static ssize_t node_read_meminfo(struct device *dev,
nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE) +
node_page_state(nid, NR_SLAB_UNRECLAIMABLE)),
nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE)),
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE))
- , nid,
- K(node_page_state(nid, NR_ANON_TRANSPARENT_HUGEPAGES) *
- HPAGE_PMD_NR));
-#else
- nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE)));
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ ,nid, K(node_page_state(nid,
+ NR_ANON_TRANSPARENT_HUGEPAGES) * HPAGE_PMD_NR)
+#endif
+#ifdef CONFIG_MEMORY_BALLOON
+ ,nid, K(node_page_state(nid, NR_BALLOON_PAGES))
#endif
+ );
n += hugetlb_report_node_meminfo(nid, buf + n);
return n;
}
@@ -289,8 +293,6 @@ static int register_node(struct node *node, int num, struct node *parent)
device_create_file(&node->dev, &dev_attr_distance);
device_create_file(&node->dev, &dev_attr_vmstat);
- scan_unevictable_register_node(node);
-
hugetlb_register_node(node);
compaction_register_node(node);
@@ -314,7 +316,6 @@ void unregister_node(struct node *node)
device_remove_file(&node->dev, &dev_attr_distance);
device_remove_file(&node->dev, &dev_attr_vmstat);
- scan_unevictable_unregister_node(node);
hugetlb_unregister_node(node); /* no-op, if memoryless node */
device_unregister(&node->dev);
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index d00831c3d731..bc20fe10f6c2 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -103,10 +103,10 @@ static ssize_t mem_used_total_show(struct device *dev,
down_read(&zram->init_lock);
if (init_done(zram))
- val = zs_get_total_size_bytes(meta->mem_pool);
+ val = zs_get_total_pages(meta->mem_pool);
up_read(&zram->init_lock);
- return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
+ return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT);
}
static ssize_t max_comp_streams_show(struct device *dev,
@@ -122,6 +122,72 @@ static ssize_t max_comp_streams_show(struct device *dev,
return scnprintf(buf, PAGE_SIZE, "%d\n", val);
}
+static ssize_t mem_limit_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ u64 val;
+ struct zram *zram = dev_to_zram(dev);
+
+ down_read(&zram->init_lock);
+ val = zram->limit_pages;
+ up_read(&zram->init_lock);
+
+ return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT);
+}
+
+static ssize_t mem_limit_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ u64 limit;
+ char *tmp;
+ struct zram *zram = dev_to_zram(dev);
+
+ limit = memparse(buf, &tmp);
+ if (buf == tmp) /* no chars parsed, invalid input */
+ return -EINVAL;
+
+ down_write(&zram->init_lock);
+ zram->limit_pages = PAGE_ALIGN(limit) >> PAGE_SHIFT;
+ up_write(&zram->init_lock);
+
+ return len;
+}
+
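mem_limit_store() relies on memparse()'s convention that the end pointer equals the input when nothing was consumed, which is what the buf == tmp test rejects. A userspace sketch of the same validation using strtoull(), with a hand-rolled K/M/G suffix in place of memparse():

/*
 * Userspace sketch of memparse()-style validation: "no characters
 * consumed" (end == buf) is the invalid-input signal, exactly like
 * the buf == tmp check above. The suffix handling is simplified.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static long long parse_size(const char *buf)
{
	char *end;
	unsigned long long v = strtoull(buf, &end, 0);

	if (end == buf)			/* no chars parsed, invalid input */
		return -EINVAL;
	switch (*end) {
	case 'G': v <<= 10;	/* fall through */
	case 'M': v <<= 10;	/* fall through */
	case 'K': v <<= 10;	break;
	}
	return (long long)v;
}

int main(void)
{
	printf("%lld\n", parse_size("64K"));	/* 65536 */
	printf("%lld\n", parse_size("nope"));	/* -22 (-EINVAL) */
	return 0;
}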
+static ssize_t mem_used_max_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ u64 val = 0;
+ struct zram *zram = dev_to_zram(dev);
+
+ down_read(&zram->init_lock);
+ if (init_done(zram))
+ val = atomic_long_read(&zram->stats.max_used_pages);
+ up_read(&zram->init_lock);
+
+ return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT);
+}
+
+static ssize_t mem_used_max_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ int err;
+ unsigned long val;
+ struct zram *zram = dev_to_zram(dev);
+ struct zram_meta *meta = zram->meta;
+
+ err = kstrtoul(buf, 10, &val);
+ if (err || val != 0)
+ return -EINVAL;
+
+ down_read(&zram->init_lock);
+ if (init_done(zram))
+ atomic_long_set(&zram->stats.max_used_pages,
+ zs_get_total_pages(meta->mem_pool));
+ up_read(&zram->init_lock);
+
+ return len;
+}
+
static ssize_t max_comp_streams_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len)
{
@@ -344,6 +410,7 @@ static void zram_free_page(struct zram *zram, size_t index)
atomic64_sub(zram_get_obj_size(meta, index),
&zram->stats.compr_data_size);
atomic64_dec(&zram->stats.pages_stored);
+ atomic64_inc(&zram->stats.notify_free);
meta->table[index].handle = 0;
zram_set_obj_size(meta, index, 0);
@@ -434,6 +501,21 @@ out_cleanup:
return ret;
}
+static inline void update_used_max(struct zram *zram,
+ const unsigned long pages)
+{
+ int old_max, cur_max;
+
+ old_max = atomic_long_read(&zram->stats.max_used_pages);
+
+ do {
+ cur_max = old_max;
+ if (pages > cur_max)
+ old_max = atomic_long_cmpxchg(
+ &zram->stats.max_used_pages, cur_max, pages);
+ } while (old_max != cur_max);
+}
+
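update_used_max() maintains a lock-free high-water mark: it retries a compare-and-swap until either the stored maximum is already at least as large as ours or our value has been installed. A userspace sketch of the same loop, with C11 atomics standing in for atomic_long_cmpxchg():

/*
 * Userspace sketch of the lock-free high-water mark: retry the CAS
 * until the stored maximum is >= ours or our value is installed.
 */
#include <stdatomic.h>
#include <stdio.h>

static _Atomic long max_used;

static void update_used_max(long pages)
{
	long cur = atomic_load(&max_used);

	while (pages > cur) {
		/* On failure, cur is reloaded with the current value. */
		if (atomic_compare_exchange_weak(&max_used, &cur, pages))
			break;
	}
}

int main(void)
{
	update_used_max(10);
	update_used_max(7);	/* no effect: 7 < current max */
	update_used_max(12);
	printf("%ld\n", atomic_load(&max_used));	/* 12 */
	return 0;
}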
static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
int offset)
{
@@ -445,6 +527,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
struct zram_meta *meta = zram->meta;
struct zcomp_strm *zstrm;
bool locked = false;
+ unsigned long alloced_pages;
page = bvec->bv_page;
if (is_partial_io(bvec)) {
@@ -513,6 +596,16 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
ret = -ENOMEM;
goto out;
}
+
+ alloced_pages = zs_get_total_pages(meta->mem_pool);
+ if (zram->limit_pages && alloced_pages > zram->limit_pages) {
+ zs_free(meta->mem_pool, handle);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ update_used_max(zram, alloced_pages);
+
cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO);
if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) {
@@ -617,6 +710,9 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
struct zram_meta *meta;
down_write(&zram->init_lock);
+
+ zram->limit_pages = 0;
+
if (!init_done(zram)) {
up_write(&zram->init_lock);
return;
@@ -843,7 +939,6 @@ static void zram_slot_free_notify(struct block_device *bdev,
bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
zram_free_page(zram, index);
bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
- atomic64_inc(&zram->stats.notify_free);
}
static const struct block_device_operations zram_devops = {
@@ -857,6 +952,10 @@ static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL);
static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store);
static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL);
static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL);
+static DEVICE_ATTR(mem_limit, S_IRUGO | S_IWUSR, mem_limit_show,
+ mem_limit_store);
+static DEVICE_ATTR(mem_used_max, S_IRUGO | S_IWUSR, mem_used_max_show,
+ mem_used_max_store);
static DEVICE_ATTR(max_comp_streams, S_IRUGO | S_IWUSR,
max_comp_streams_show, max_comp_streams_store);
static DEVICE_ATTR(comp_algorithm, S_IRUGO | S_IWUSR,
@@ -885,6 +984,8 @@ static struct attribute *zram_disk_attrs[] = {
&dev_attr_orig_data_size.attr,
&dev_attr_compr_data_size.attr,
&dev_attr_mem_used_total.attr,
+ &dev_attr_mem_limit.attr,
+ &dev_attr_mem_used_max.attr,
&dev_attr_max_comp_streams.attr,
&dev_attr_comp_algorithm.attr,
NULL,
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index e0f725c87cc6..c6ee271317f5 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -90,6 +90,7 @@ struct zram_stats {
atomic64_t notify_free; /* no. of swap slot free notifications */
atomic64_t zero_pages; /* no. of zero filled pages */
atomic64_t pages_stored; /* no. of pages currently stored */
+ atomic_long_t max_used_pages; /* maximum no. of pages stored */
};
struct zram_meta {
@@ -112,6 +113,11 @@ struct zram {
u64 disksize; /* bytes */
int max_comp_streams;
struct zram_stats stats;
+ /*
+ * the number of pages zram can consume for storing compressed data
+ */
+ unsigned long limit_pages;
+
char compressor[10];
};
#endif
diff --git a/drivers/input/Kconfig b/drivers/input/Kconfig
index a11ff74a5127..9eac8de9e8b7 100644
--- a/drivers/input/Kconfig
+++ b/drivers/input/Kconfig
@@ -178,6 +178,15 @@ comment "Input Device Drivers"
source "drivers/input/keyboard/Kconfig"
+config INPUT_LEDS
+ bool "LED Support"
+ depends on LEDS_CLASS = INPUT || LEDS_CLASS = y
+ select LEDS_TRIGGERS
+ default y
+ help
+ This option enables support for LEDs on keyboards managed
+ by the input layer.
+
source "drivers/input/mouse/Kconfig"
source "drivers/input/joystick/Kconfig"
diff --git a/drivers/input/Makefile b/drivers/input/Makefile
index 5ca3f631497f..2ab5f3336da5 100644
--- a/drivers/input/Makefile
+++ b/drivers/input/Makefile
@@ -6,6 +6,9 @@
obj-$(CONFIG_INPUT) += input-core.o
input-core-y := input.o input-compat.o input-mt.o ff-core.o
+ifeq ($(CONFIG_INPUT_LEDS),y)
+input-core-y += leds.o
+endif
obj-$(CONFIG_INPUT_FF_MEMLESS) += ff-memless.o
obj-$(CONFIG_INPUT_POLLDEV) += input-polldev.o
diff --git a/drivers/input/input.c b/drivers/input/input.c
index 29ca0bb4f561..236bc56e6a81 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -710,6 +710,9 @@ static void input_disconnect_device(struct input_dev *dev)
handle->open = 0;
spin_unlock_irq(&dev->event_lock);
+
+ if (is_event_supported(EV_LED, dev->evbit, EV_MAX))
+ input_led_disconnect(dev);
}
/**
@@ -2136,6 +2139,9 @@ int input_register_device(struct input_dev *dev)
list_add_tail(&dev->node, &input_dev_list);
+ if (is_event_supported(EV_LED, dev->evbit, EV_MAX))
+ input_led_connect(dev);
+
list_for_each_entry(handler, &input_handler_list, node)
input_attach_handler(dev, handler);
diff --git a/drivers/input/leds.c b/drivers/input/leds.c
new file mode 100644
index 000000000000..985fa7ebeec7
--- /dev/null
+++ b/drivers/input/leds.c
@@ -0,0 +1,249 @@
+/*
+ * LED support for the input layer
+ *
+ * Copyright 2010-2014 Samuel Thibault <samuel.thibault@ens-lyon.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/leds.h>
+#include <linux/input.h>
+
+/*
+ * Keyboard LEDs are propagated by default like the following example:
+ *
+ * VT keyboard numlock trigger
+ * -> vt::numl VT LED
+ * -> vt-numl VT trigger
+ * -> per-device inputX::numl LED
+ *
+ * Userland can, however, choose the trigger for the vt::numl LED, or
+ * independently choose the trigger for any inputX::numl LED.
+ *
+ * VT LED classes and triggers are registered on-demand according to
+ * existing LED devices.
+ */
+
+/* Handler for VT LEDs, just triggers the corresponding VT trigger. */
+static void vt_led_set(struct led_classdev *cdev,
+ enum led_brightness brightness);
+static struct led_classdev vt_leds[LED_CNT] = {
+#define DEFINE_INPUT_LED(vt_led, nam, deftrig) \
+ [vt_led] = { \
+ .name = "vt::"nam, \
+ .max_brightness = 1, \
+ .brightness_set = vt_led_set, \
+ .default_trigger = deftrig, \
+ }
+/* Default triggers for the VT LEDs just correspond to the legacy
+ * usage. */
+ DEFINE_INPUT_LED(LED_NUML, "numl", "kbd-numlock"),
+ DEFINE_INPUT_LED(LED_CAPSL, "capsl", "kbd-capslock"),
+ DEFINE_INPUT_LED(LED_SCROLLL, "scrolll", "kbd-scrollock"),
+ DEFINE_INPUT_LED(LED_COMPOSE, "compose", NULL),
+ DEFINE_INPUT_LED(LED_KANA, "kana", "kbd-kanalock"),
+ DEFINE_INPUT_LED(LED_SLEEP, "sleep", NULL),
+ DEFINE_INPUT_LED(LED_SUSPEND, "suspend", NULL),
+ DEFINE_INPUT_LED(LED_MUTE, "mute", NULL),
+ DEFINE_INPUT_LED(LED_MISC, "misc", NULL),
+ DEFINE_INPUT_LED(LED_MAIL, "mail", NULL),
+ DEFINE_INPUT_LED(LED_CHARGING, "charging", NULL),
+};
+static const char *const vt_led_names[LED_CNT] = {
+ [LED_NUML] = "numl",
+ [LED_CAPSL] = "capsl",
+ [LED_SCROLLL] = "scrolll",
+ [LED_COMPOSE] = "compose",
+ [LED_KANA] = "kana",
+ [LED_SLEEP] = "sleep",
+ [LED_SUSPEND] = "suspend",
+ [LED_MUTE] = "mute",
+ [LED_MISC] = "misc",
+ [LED_MAIL] = "mail",
+ [LED_CHARGING] = "charging",
+};
+/* Handler for hotplug initialization */
+static void vt_led_trigger_activate(struct led_classdev *cdev);
+/* VT triggers */
+static struct led_trigger vt_led_triggers[LED_CNT] = {
+#define DEFINE_INPUT_LED_TRIGGER(vt_led, nam) \
+ [vt_led] = { \
+ .name = "vt-"nam, \
+ .activate = vt_led_trigger_activate, \
+ }
+ DEFINE_INPUT_LED_TRIGGER(LED_NUML, "numl"),
+ DEFINE_INPUT_LED_TRIGGER(LED_CAPSL, "capsl"),
+ DEFINE_INPUT_LED_TRIGGER(LED_SCROLLL, "scrolll"),
+ DEFINE_INPUT_LED_TRIGGER(LED_COMPOSE, "compose"),
+ DEFINE_INPUT_LED_TRIGGER(LED_KANA, "kana"),
+ DEFINE_INPUT_LED_TRIGGER(LED_SLEEP, "sleep"),
+ DEFINE_INPUT_LED_TRIGGER(LED_SUSPEND, "suspend"),
+ DEFINE_INPUT_LED_TRIGGER(LED_MUTE, "mute"),
+ DEFINE_INPUT_LED_TRIGGER(LED_MISC, "misc"),
+ DEFINE_INPUT_LED_TRIGGER(LED_MAIL, "mail"),
+ DEFINE_INPUT_LED_TRIGGER(LED_CHARGING, "charging"),
+};
+
+/* Lock for registration coherency */
+static DEFINE_MUTEX(vt_led_registered_lock);
+
+/* Which VT LED classes and triggers are registered */
+static unsigned long vt_led_registered[BITS_TO_LONGS(LED_CNT)];
+
+/* Number of input devices having each LED */
+static int vt_led_references[LED_CNT];
+
+/* VT LED state change, tell the VT trigger. */
+static void vt_led_set(struct led_classdev *cdev,
+ enum led_brightness brightness)
+{
+ int led = cdev - vt_leds;
+
+ led_trigger_event(&vt_led_triggers[led], !!brightness);
+}
+
+/* LED state change for some keyboard, notify that keyboard. */
+static void perdevice_input_led_set(struct led_classdev *cdev,
+ enum led_brightness brightness)
+{
+ struct input_dev *dev;
+ struct led_classdev *leds;
+ int led;
+
+ dev = cdev->dev->platform_data;
+ if (!dev)
+ /* Still initializing */
+ return;
+ leds = dev->leds;
+ led = cdev - leds;
+
+ input_event(dev, EV_LED, led, !!brightness);
+ input_event(dev, EV_SYN, SYN_REPORT, 0);
+}
+
+/* Keyboard hotplug, initialize its LED status */
+static void vt_led_trigger_activate(struct led_classdev *cdev)
+{
+ struct led_trigger *trigger = cdev->trigger;
+ int led = trigger - vt_led_triggers;
+
+ if (cdev->brightness_set)
+ cdev->brightness_set(cdev, vt_leds[led].brightness);
+}
+
+/* Free LED data from an input device; used on error and at disconnection. */
+static void input_led_delete(struct input_dev *dev)
+{
+ if (dev) {
+ struct led_classdev *leds = dev->leds;
+ if (leds) {
+ int i;
+ for (i = 0; i < LED_CNT; i++)
+ kfree(leds[i].name);
+ kfree(leds);
+ dev->leds = NULL;
+ }
+ }
+}
+
+/* A new input device with potential LEDs to connect. */
+int input_led_connect(struct input_dev *dev)
+{
+ int i, error = 0;
+ struct led_classdev *leds;
+
+ dev->leds = leds = kcalloc(LED_CNT, sizeof(*leds), GFP_KERNEL);
+ if (!dev->leds)
+ return -ENOMEM;
+
+ /* lazily register missing VT LEDs */
+ mutex_lock(&vt_led_registered_lock);
+ for (i = 0; i < LED_CNT; i++)
+ if (vt_leds[i].name && test_bit(i, dev->ledbit)) {
+ if (!vt_led_references[i]) {
+ led_trigger_register(&vt_led_triggers[i]);
+ /* This keyboard is the first to have LED i,
+ * try to register it */
+ if (!led_classdev_register(NULL, &vt_leds[i]))
+ vt_led_references[i] = 1;
+ else
+ led_trigger_unregister(&vt_led_triggers[i]);
+ } else
+ vt_led_references[i]++;
+ }
+ mutex_unlock(&vt_led_registered_lock);
+
+ /* and register this device's LEDs */
+ for (i = 0; i < LED_CNT; i++)
+ if (vt_leds[i].name && test_bit(i, dev->ledbit)) {
+ leds[i].name = kasprintf(GFP_KERNEL, "%s::%s",
+ dev_name(&dev->dev),
+ vt_led_names[i]);
+ if (!leds[i].name) {
+ error = -ENOMEM;
+ goto err;
+ }
+ leds[i].max_brightness = 1;
+ leds[i].brightness_set = perdevice_input_led_set;
+ leds[i].default_trigger = vt_led_triggers[i].name;
+ }
+
+ /* No issue so far, we can register for real. */
+ for (i = 0; i < LED_CNT; i++)
+ if (leds[i].name) {
+ led_classdev_register(&dev->dev, &leds[i]);
+ leds[i].dev->platform_data = dev;
+ perdevice_input_led_set(&leds[i],
+ vt_leds[i].brightness);
+ }
+
+ return 0;
+
+err:
+ input_led_delete(dev);
+ return error;
+}
+
+/*
+ * Disconnected input device. Clean it, and deregister now-useless VT LEDs
+ * and triggers.
+ */
+void input_led_disconnect(struct input_dev *dev)
+{
+ int i;
+ struct led_classdev *leds = dev->leds;
+
+ for (i = 0; i < LED_CNT; i++)
+ if (leds[i].name)
+ led_classdev_unregister(&leds[i]);
+
+ input_led_delete(dev);
+
+ mutex_lock(&vt_led_registered_lock);
+ for (i = 0; i < LED_CNT; i++) {
+ if (!vt_leds[i].name || !test_bit(i, dev->ledbit))
+ continue;
+
+ vt_led_references[i]--;
+ if (vt_led_references[i]) {
+ /* Still some devices needing it */
+ continue;
+ }
+
+ led_classdev_unregister(&vt_leds[i]);
+ led_trigger_unregister(&vt_led_triggers[i]);
+ clear_bit(i, vt_led_registered);
+ }
+ mutex_unlock(&vt_led_registered_lock);
+}
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("User LED support for input layer");
+MODULE_AUTHOR("Samuel Thibault <samuel.thibault@ens-lyon.org>");
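
The shared vt::* LEDs and triggers are reference counted: the first keyboard that advertises a LED registers the class device and trigger, and the last one to disconnect tears them down (the real code additionally copes with a failed registration by leaving the count at zero). A userspace sketch of that lazy register/unregister pattern (compile with -pthread):

/*
 * Userspace sketch of refcounted lazy registration: the first user
 * registers the shared resource, the last user unregisters it. The
 * failure handling of the real code is omitted here.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int refs;

static void get_shared_led(void)
{
	pthread_mutex_lock(&lock);
	if (refs++ == 0)
		printf("register shared LED and trigger\n");
	pthread_mutex_unlock(&lock);
}

static void put_shared_led(void)
{
	pthread_mutex_lock(&lock);
	if (--refs == 0)
		printf("unregister shared LED and trigger\n");
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	get_shared_led();	/* first keyboard: registers */
	get_shared_led();	/* second keyboard: just counts */
	put_shared_led();
	put_shared_led();	/* last keyboard: unregisters */
	return 0;
}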
diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig
index f6ef7bb2dc11..af6a4f537838 100644
--- a/drivers/leds/Kconfig
+++ b/drivers/leds/Kconfig
@@ -11,9 +11,6 @@ menuconfig NEW_LEDS
Say Y to enable Linux LED support. This allows control of supported
LEDs from both userspace and optionally, by kernel events (triggers).
- This is not related to standard keyboard LEDs which are controlled
- via the input system.
-
if NEW_LEDS
config LEDS_CLASS
diff --git a/drivers/misc/ti-st/st_core.c b/drivers/misc/ti-st/st_core.c
index 54be83d3efdd..374480cb69bf 100644
--- a/drivers/misc/ti-st/st_core.c
+++ b/drivers/misc/ti-st/st_core.c
@@ -343,7 +343,7 @@ void st_int_recv(void *disc_data,
/* Unknown packet? */
default:
type = *ptr;
- if (st_gdata->list[type] == NULL) {
+ if (type >= ST_MAX_CHANNELS || st_gdata->list[type] == NULL) {
pr_err("chip/interface misbehavior dropping"
" frame starting with 0x%02x", type);
goto done;
diff --git a/drivers/rtc/rtc-pcf8583.c b/drivers/rtc/rtc-pcf8583.c
index c2639845186b..5911a6dca291 100644
--- a/drivers/rtc/rtc-pcf8583.c
+++ b/drivers/rtc/rtc-pcf8583.c
@@ -176,7 +176,11 @@ static int pcf8583_rtc_read_time(struct device *dev, struct rtc_time *tm)
{
struct i2c_client *client = to_i2c_client(dev);
unsigned char ctrl, year[2];
- struct rtc_mem mem = { CMOS_YEAR, sizeof(year), year };
+ struct rtc_mem mem = {
+ .loc = CMOS_YEAR,
+ .nr = sizeof(year),
+ .data = year
+ };
int real_year, year_offset, err;
/*
@@ -222,8 +226,16 @@ static int pcf8583_rtc_set_time(struct device *dev, struct rtc_time *tm)
{
struct i2c_client *client = to_i2c_client(dev);
unsigned char year[2], chk;
- struct rtc_mem cmos_year = { CMOS_YEAR, sizeof(year), year };
- struct rtc_mem cmos_check = { CMOS_CHECKSUM, 1, &chk };
+ struct rtc_mem cmos_year = {
+ .loc = CMOS_YEAR,
+ .nr = sizeof(year),
+ .data = year
+ };
+ struct rtc_mem cmos_check = {
+ .loc = CMOS_CHECKSUM,
+ .nr = 1,
+ .data = &chk
+ };
unsigned int proper_year = tm->tm_year + 1900;
int ret;
diff --git a/drivers/rtc/rtc-s3c.c b/drivers/rtc/rtc-s3c.c
index 4958a363b2c7..a6b1252c9941 100644
--- a/drivers/rtc/rtc-s3c.c
+++ b/drivers/rtc/rtc-s3c.c
@@ -32,155 +32,150 @@
#include <asm/irq.h>
#include "rtc-s3c.h"
-enum s3c_cpu_type {
- TYPE_S3C2410,
- TYPE_S3C2416,
- TYPE_S3C2443,
- TYPE_S3C64XX,
-};
+struct s3c_rtc {
+ struct device *dev;
+ struct rtc_device *rtc;
-struct s3c_rtc_drv_data {
- int cpu_type;
-};
+ void __iomem *base;
+ struct clk *rtc_clk;
+ struct clk *rtc_src_clk;
+ bool enabled;
+
+ struct s3c_rtc_data *data;
-/* I have yet to find an S3C implementation with more than one
- * of these rtc blocks in */
+ int irq_alarm;
+ int irq_tick;
-static struct clk *rtc_clk;
-static void __iomem *s3c_rtc_base;
-static int s3c_rtc_alarmno;
-static int s3c_rtc_tickno;
-static enum s3c_cpu_type s3c_rtc_cpu_type;
+ spinlock_t pie_lock;
+ spinlock_t alarm_clk_lock;
-static DEFINE_SPINLOCK(s3c_rtc_pie_lock);
+ int ticnt_save, ticnt_en_save;
+ bool wake_en;
+};
+
+struct s3c_rtc_data {
+ int max_user_freq;
+ bool needs_src_clk;
+
+ void (*irq_handler) (struct s3c_rtc *info, int mask);
+ void (*set_freq) (struct s3c_rtc *info, int freq);
+ void (*enable_tick) (struct s3c_rtc *info, struct seq_file *seq);
+ void (*select_tick_clk) (struct s3c_rtc *info);
+ void (*save_tick_cnt) (struct s3c_rtc *info);
+ void (*restore_tick_cnt) (struct s3c_rtc *info);
+ void (*enable) (struct s3c_rtc *info);
+ void (*disable) (struct s3c_rtc *info);
+};
-static void s3c_rtc_alarm_clk_enable(bool enable)
+static void s3c_rtc_alarm_clk_enable(struct s3c_rtc *info, bool enable)
{
- static DEFINE_SPINLOCK(s3c_rtc_alarm_clk_lock);
- static bool alarm_clk_enabled;
unsigned long irq_flags;
- spin_lock_irqsave(&s3c_rtc_alarm_clk_lock, irq_flags);
+ spin_lock_irqsave(&info->alarm_clk_lock, irq_flags);
if (enable) {
- if (!alarm_clk_enabled) {
- clk_enable(rtc_clk);
- alarm_clk_enabled = true;
+ if (!info->enabled) {
+ clk_enable(info->rtc_clk);
+ if (info->data->needs_src_clk)
+ clk_enable(info->rtc_src_clk);
+ info->enabled = true;
}
} else {
- if (alarm_clk_enabled) {
- clk_disable(rtc_clk);
- alarm_clk_enabled = false;
+ if (info->enabled) {
+ if (info->data->needs_src_clk)
+ clk_disable(info->rtc_src_clk);
+ clk_disable(info->rtc_clk);
+ info->enabled = false;
}
}
- spin_unlock_irqrestore(&s3c_rtc_alarm_clk_lock, irq_flags);
+ spin_unlock_irqrestore(&info->alarm_clk_lock, irq_flags);
}
/* IRQ Handlers */
-
-static irqreturn_t s3c_rtc_alarmirq(int irq, void *id)
+static irqreturn_t s3c_rtc_tickirq(int irq, void *id)
{
- struct rtc_device *rdev = id;
-
- clk_enable(rtc_clk);
- rtc_update_irq(rdev, 1, RTC_AF | RTC_IRQF);
-
- if (s3c_rtc_cpu_type == TYPE_S3C64XX)
- writeb(S3C2410_INTP_ALM, s3c_rtc_base + S3C2410_INTP);
-
- clk_disable(rtc_clk);
+ struct s3c_rtc *info = (struct s3c_rtc *)id;
- s3c_rtc_alarm_clk_enable(false);
+ if (info->data->irq_handler)
+ info->data->irq_handler(info, S3C2410_INTP_TIC);
return IRQ_HANDLED;
}
-static irqreturn_t s3c_rtc_tickirq(int irq, void *id)
+static irqreturn_t s3c_rtc_alarmirq(int irq, void *id)
{
- struct rtc_device *rdev = id;
+ struct s3c_rtc *info = (struct s3c_rtc *)id;
- clk_enable(rtc_clk);
- rtc_update_irq(rdev, 1, RTC_PF | RTC_IRQF);
+ if (info->data->irq_handler)
+ info->data->irq_handler(info, S3C2410_INTP_ALM);
- if (s3c_rtc_cpu_type == TYPE_S3C64XX)
- writeb(S3C2410_INTP_TIC, s3c_rtc_base + S3C2410_INTP);
-
- clk_disable(rtc_clk);
return IRQ_HANDLED;
}
/* Update control registers */
static int s3c_rtc_setaie(struct device *dev, unsigned int enabled)
{
+ struct s3c_rtc *info = dev_get_drvdata(dev);
unsigned int tmp;
- dev_dbg(dev, "%s: aie=%d\n", __func__, enabled);
+ dev_dbg(info->dev, "%s: aie=%d\n", __func__, enabled);
- clk_enable(rtc_clk);
- tmp = readb(s3c_rtc_base + S3C2410_RTCALM) & ~S3C2410_RTCALM_ALMEN;
+ clk_enable(info->rtc_clk);
+ if (info->data->needs_src_clk)
+ clk_enable(info->rtc_src_clk);
+ tmp = readb(info->base + S3C2410_RTCALM) & ~S3C2410_RTCALM_ALMEN;
if (enabled)
tmp |= S3C2410_RTCALM_ALMEN;
- writeb(tmp, s3c_rtc_base + S3C2410_RTCALM);
- clk_disable(rtc_clk);
+ writeb(tmp, info->base + S3C2410_RTCALM);
+ if (info->data->needs_src_clk)
+ clk_disable(info->rtc_src_clk);
+ clk_disable(info->rtc_clk);
- s3c_rtc_alarm_clk_enable(enabled);
+ s3c_rtc_alarm_clk_enable(info, enabled);
return 0;
}
-static int s3c_rtc_setfreq(struct device *dev, int freq)
+/* Set RTC frequency */
+static int s3c_rtc_setfreq(struct s3c_rtc *info, int freq)
{
- struct platform_device *pdev = to_platform_device(dev);
- struct rtc_device *rtc_dev = platform_get_drvdata(pdev);
- unsigned int tmp = 0;
- int val;
-
if (!is_power_of_2(freq))
return -EINVAL;
- clk_enable(rtc_clk);
- spin_lock_irq(&s3c_rtc_pie_lock);
+ clk_enable(info->rtc_clk);
+ if (info->data->needs_src_clk)
+ clk_enable(info->rtc_src_clk);
+ spin_lock_irq(&info->pie_lock);
- if (s3c_rtc_cpu_type != TYPE_S3C64XX) {
- tmp = readb(s3c_rtc_base + S3C2410_TICNT);
- tmp &= S3C2410_TICNT_ENABLE;
- }
+ if (info->data->set_freq)
+ info->data->set_freq(info, freq);
- val = (rtc_dev->max_user_freq / freq) - 1;
-
- if (s3c_rtc_cpu_type == TYPE_S3C2416 || s3c_rtc_cpu_type == TYPE_S3C2443) {
- tmp |= S3C2443_TICNT_PART(val);
- writel(S3C2443_TICNT1_PART(val), s3c_rtc_base + S3C2443_TICNT1);
-
- if (s3c_rtc_cpu_type == TYPE_S3C2416)
- writel(S3C2416_TICNT2_PART(val), s3c_rtc_base + S3C2416_TICNT2);
- } else {
- tmp |= val;
- }
-
- writel(tmp, s3c_rtc_base + S3C2410_TICNT);
- spin_unlock_irq(&s3c_rtc_pie_lock);
- clk_disable(rtc_clk);
+ spin_unlock_irq(&info->pie_lock);
+ if (info->data->needs_src_clk)
+ clk_disable(info->rtc_src_clk);
+ clk_disable(info->rtc_clk);
return 0;
}
/* Time read/write */
-
static int s3c_rtc_gettime(struct device *dev, struct rtc_time *rtc_tm)
{
+ struct s3c_rtc *info = dev_get_drvdata(dev);
unsigned int have_retried = 0;
- void __iomem *base = s3c_rtc_base;
- clk_enable(rtc_clk);
+ clk_enable(info->rtc_clk);
+ if (info->data->needs_src_clk)
+ clk_enable(info->rtc_src_clk);
+
retry_get_time:
- rtc_tm->tm_min = readb(base + S3C2410_RTCMIN);
- rtc_tm->tm_hour = readb(base + S3C2410_RTCHOUR);
- rtc_tm->tm_mday = readb(base + S3C2410_RTCDATE);
- rtc_tm->tm_mon = readb(base + S3C2410_RTCMON);
- rtc_tm->tm_year = readb(base + S3C2410_RTCYEAR);
- rtc_tm->tm_sec = readb(base + S3C2410_RTCSEC);
+ rtc_tm->tm_min = readb(info->base + S3C2410_RTCMIN);
+ rtc_tm->tm_hour = readb(info->base + S3C2410_RTCHOUR);
+ rtc_tm->tm_mday = readb(info->base + S3C2410_RTCDATE);
+ rtc_tm->tm_mon = readb(info->base + S3C2410_RTCMON);
+ rtc_tm->tm_year = readb(info->base + S3C2410_RTCYEAR);
+ rtc_tm->tm_sec = readb(info->base + S3C2410_RTCSEC);
/* the only way to work out whether the system was mid-update
* when we read it is to check the second counter, and if it
@@ -207,13 +202,16 @@ static int s3c_rtc_gettime(struct device *dev, struct rtc_time *rtc_tm)
rtc_tm->tm_mon -= 1;
- clk_disable(rtc_clk);
+ if (info->data->needs_src_clk)
+ clk_disable(info->rtc_src_clk);
+ clk_disable(info->rtc_clk);
+
return rtc_valid_tm(rtc_tm);
}
static int s3c_rtc_settime(struct device *dev, struct rtc_time *tm)
{
- void __iomem *base = s3c_rtc_base;
+ struct s3c_rtc *info = dev_get_drvdata(dev);
int year = tm->tm_year - 100;
dev_dbg(dev, "set time %04d.%02d.%02d %02d:%02d:%02d\n",
@@ -227,33 +225,42 @@ static int s3c_rtc_settime(struct device *dev, struct rtc_time *tm)
return -EINVAL;
}
- clk_enable(rtc_clk);
- writeb(bin2bcd(tm->tm_sec), base + S3C2410_RTCSEC);
- writeb(bin2bcd(tm->tm_min), base + S3C2410_RTCMIN);
- writeb(bin2bcd(tm->tm_hour), base + S3C2410_RTCHOUR);
- writeb(bin2bcd(tm->tm_mday), base + S3C2410_RTCDATE);
- writeb(bin2bcd(tm->tm_mon + 1), base + S3C2410_RTCMON);
- writeb(bin2bcd(year), base + S3C2410_RTCYEAR);
- clk_disable(rtc_clk);
+ clk_enable(info->rtc_clk);
+ if (info->data->needs_src_clk)
+ clk_enable(info->rtc_src_clk);
+
+ writeb(bin2bcd(tm->tm_sec), info->base + S3C2410_RTCSEC);
+ writeb(bin2bcd(tm->tm_min), info->base + S3C2410_RTCMIN);
+ writeb(bin2bcd(tm->tm_hour), info->base + S3C2410_RTCHOUR);
+ writeb(bin2bcd(tm->tm_mday), info->base + S3C2410_RTCDATE);
+ writeb(bin2bcd(tm->tm_mon + 1), info->base + S3C2410_RTCMON);
+ writeb(bin2bcd(year), info->base + S3C2410_RTCYEAR);
+
+ if (info->data->needs_src_clk)
+ clk_disable(info->rtc_src_clk);
+ clk_disable(info->rtc_clk);
return 0;
}
static int s3c_rtc_getalarm(struct device *dev, struct rtc_wkalrm *alrm)
{
+ struct s3c_rtc *info = dev_get_drvdata(dev);
struct rtc_time *alm_tm = &alrm->time;
- void __iomem *base = s3c_rtc_base;
unsigned int alm_en;
- clk_enable(rtc_clk);
- alm_tm->tm_sec = readb(base + S3C2410_ALMSEC);
- alm_tm->tm_min = readb(base + S3C2410_ALMMIN);
- alm_tm->tm_hour = readb(base + S3C2410_ALMHOUR);
- alm_tm->tm_mon = readb(base + S3C2410_ALMMON);
- alm_tm->tm_mday = readb(base + S3C2410_ALMDATE);
- alm_tm->tm_year = readb(base + S3C2410_ALMYEAR);
+ clk_enable(info->rtc_clk);
+ if (info->data->needs_src_clk)
+ clk_enable(info->rtc_src_clk);
- alm_en = readb(base + S3C2410_RTCALM);
+ alm_tm->tm_sec = readb(info->base + S3C2410_ALMSEC);
+ alm_tm->tm_min = readb(info->base + S3C2410_ALMMIN);
+ alm_tm->tm_hour = readb(info->base + S3C2410_ALMHOUR);
+ alm_tm->tm_mon = readb(info->base + S3C2410_ALMMON);
+ alm_tm->tm_mday = readb(info->base + S3C2410_ALMDATE);
+ alm_tm->tm_year = readb(info->base + S3C2410_ALMYEAR);
+
+ alm_en = readb(info->base + S3C2410_RTCALM);
alrm->enabled = (alm_en & S3C2410_RTCALM_ALMEN) ? 1 : 0;
@@ -297,65 +304,74 @@ static int s3c_rtc_getalarm(struct device *dev, struct rtc_wkalrm *alrm)
else
alm_tm->tm_year = -1;
- clk_disable(rtc_clk);
+ if (info->data->needs_src_clk)
+ clk_disable(info->rtc_src_clk);
+ clk_disable(info->rtc_clk);
+
return 0;
}
static int s3c_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm)
{
+ struct s3c_rtc *info = dev_get_drvdata(dev);
struct rtc_time *tm = &alrm->time;
- void __iomem *base = s3c_rtc_base;
unsigned int alrm_en;
- clk_enable(rtc_clk);
+ clk_enable(info->rtc_clk);
+ if (info->data->needs_src_clk)
+ clk_enable(info->rtc_src_clk);
+
dev_dbg(dev, "s3c_rtc_setalarm: %d, %04d.%02d.%02d %02d:%02d:%02d\n",
alrm->enabled,
1900 + tm->tm_year, tm->tm_mon + 1, tm->tm_mday,
tm->tm_hour, tm->tm_min, tm->tm_sec);
- alrm_en = readb(base + S3C2410_RTCALM) & S3C2410_RTCALM_ALMEN;
- writeb(0x00, base + S3C2410_RTCALM);
+ alrm_en = readb(info->base + S3C2410_RTCALM) & S3C2410_RTCALM_ALMEN;
+ writeb(0x00, info->base + S3C2410_RTCALM);
if (tm->tm_sec < 60 && tm->tm_sec >= 0) {
alrm_en |= S3C2410_RTCALM_SECEN;
- writeb(bin2bcd(tm->tm_sec), base + S3C2410_ALMSEC);
+ writeb(bin2bcd(tm->tm_sec), info->base + S3C2410_ALMSEC);
}
if (tm->tm_min < 60 && tm->tm_min >= 0) {
alrm_en |= S3C2410_RTCALM_MINEN;
- writeb(bin2bcd(tm->tm_min), base + S3C2410_ALMMIN);
+ writeb(bin2bcd(tm->tm_min), info->base + S3C2410_ALMMIN);
}
if (tm->tm_hour < 24 && tm->tm_hour >= 0) {
alrm_en |= S3C2410_RTCALM_HOUREN;
- writeb(bin2bcd(tm->tm_hour), base + S3C2410_ALMHOUR);
+ writeb(bin2bcd(tm->tm_hour), info->base + S3C2410_ALMHOUR);
}
dev_dbg(dev, "setting S3C2410_RTCALM to %08x\n", alrm_en);
- writeb(alrm_en, base + S3C2410_RTCALM);
+ writeb(alrm_en, info->base + S3C2410_RTCALM);
s3c_rtc_setaie(dev, alrm->enabled);
- clk_disable(rtc_clk);
+ if (info->data->needs_src_clk)
+ clk_disable(info->rtc_src_clk);
+ clk_disable(info->rtc_clk);
+
return 0;
}
static int s3c_rtc_proc(struct device *dev, struct seq_file *seq)
{
- unsigned int ticnt;
+ struct s3c_rtc *info = dev_get_drvdata(dev);
- clk_enable(rtc_clk);
- if (s3c_rtc_cpu_type == TYPE_S3C64XX) {
- ticnt = readw(s3c_rtc_base + S3C2410_RTCCON);
- ticnt &= S3C64XX_RTCCON_TICEN;
- } else {
- ticnt = readb(s3c_rtc_base + S3C2410_TICNT);
- ticnt &= S3C2410_TICNT_ENABLE;
- }
+ clk_enable(info->rtc_clk);
+ if (info->data->needs_src_clk)
+ clk_enable(info->rtc_src_clk);
+
+ if (info->data->enable_tick)
+ info->data->enable_tick(info, seq);
+
+ if (info->data->needs_src_clk)
+ clk_disable(info->rtc_src_clk);
+ clk_disable(info->rtc_clk);
- seq_printf(seq, "periodic_IRQ\t: %s\n", ticnt ? "yes" : "no");
- clk_disable(rtc_clk);
return 0;
}
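
Note how every operation in the rewritten driver brackets register access
with the same four-step clock sequence: enable rtc_clk, optionally enable
rtc_src_clk, do the work, then release in reverse order. A hedged sketch
of how that pairing could be factored into a helper pair; these two
functions are illustrative only, not part of this patch, and assume the
struct s3c_rtc defined above:

static void s3c_rtc_clk_get(struct s3c_rtc *info)
{
	clk_enable(info->rtc_clk);
	if (info->data->needs_src_clk)	/* e.g. exynos3250 gates the source clock too */
		clk_enable(info->rtc_src_clk);
}

static void s3c_rtc_clk_put(struct s3c_rtc *info)
{
	/* release in reverse acquisition order */
	if (info->data->needs_src_clk)
		clk_disable(info->rtc_src_clk);
	clk_disable(info->rtc_clk);
}
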
@@ -368,152 +384,199 @@ static const struct rtc_class_ops s3c_rtcops = {
.alarm_irq_enable = s3c_rtc_setaie,
};
-static void s3c_rtc_enable(struct platform_device *pdev, int en)
+static void s3c24xx_rtc_enable(struct s3c_rtc *info)
{
- void __iomem *base = s3c_rtc_base;
- unsigned int tmp;
+ unsigned int con, tmp;
- if (s3c_rtc_base == NULL)
- return;
-
- clk_enable(rtc_clk);
- if (!en) {
- tmp = readw(base + S3C2410_RTCCON);
- if (s3c_rtc_cpu_type == TYPE_S3C64XX)
- tmp &= ~S3C64XX_RTCCON_TICEN;
- tmp &= ~S3C2410_RTCCON_RTCEN;
- writew(tmp, base + S3C2410_RTCCON);
-
- if (s3c_rtc_cpu_type != TYPE_S3C64XX) {
- tmp = readb(base + S3C2410_TICNT);
- tmp &= ~S3C2410_TICNT_ENABLE;
- writeb(tmp, base + S3C2410_TICNT);
- }
- } else {
- /* re-enable the device, and check it is ok */
+ clk_enable(info->rtc_clk);
+ if (info->data->needs_src_clk)
+ clk_enable(info->rtc_src_clk);
- if ((readw(base+S3C2410_RTCCON) & S3C2410_RTCCON_RTCEN) == 0) {
- dev_info(&pdev->dev, "rtc disabled, re-enabling\n");
+ con = readw(info->base + S3C2410_RTCCON);
+ /* re-enable the device, and check it is ok */
+ if ((con & S3C2410_RTCCON_RTCEN) == 0) {
+ dev_info(info->dev, "rtc disabled, re-enabling\n");
- tmp = readw(base + S3C2410_RTCCON);
- writew(tmp | S3C2410_RTCCON_RTCEN,
- base + S3C2410_RTCCON);
- }
+ tmp = readw(info->base + S3C2410_RTCCON);
+ writew(tmp | S3C2410_RTCCON_RTCEN,
+ info->base + S3C2410_RTCCON);
+ }
- if ((readw(base + S3C2410_RTCCON) & S3C2410_RTCCON_CNTSEL)) {
- dev_info(&pdev->dev, "removing RTCCON_CNTSEL\n");
+ if (con & S3C2410_RTCCON_CNTSEL) {
+ dev_info(info->dev, "removing RTCCON_CNTSEL\n");
- tmp = readw(base + S3C2410_RTCCON);
- writew(tmp & ~S3C2410_RTCCON_CNTSEL,
- base + S3C2410_RTCCON);
- }
+ tmp = readw(info->base + S3C2410_RTCCON);
+ writew(tmp & ~S3C2410_RTCCON_CNTSEL,
+ info->base + S3C2410_RTCCON);
+ }
- if ((readw(base + S3C2410_RTCCON) & S3C2410_RTCCON_CLKRST)) {
- dev_info(&pdev->dev, "removing RTCCON_CLKRST\n");
+ if (con & S3C2410_RTCCON_CLKRST) {
+ dev_info(info->dev, "removing RTCCON_CLKRST\n");
- tmp = readw(base + S3C2410_RTCCON);
- writew(tmp & ~S3C2410_RTCCON_CLKRST,
- base + S3C2410_RTCCON);
- }
+ tmp = readw(info->base + S3C2410_RTCCON);
+ writew(tmp & ~S3C2410_RTCCON_CLKRST,
+ info->base + S3C2410_RTCCON);
}
- clk_disable(rtc_clk);
+
+ if (info->data->needs_src_clk)
+ clk_disable(info->rtc_src_clk);
+ clk_disable(info->rtc_clk);
}
-static int s3c_rtc_remove(struct platform_device *dev)
+static void s3c24xx_rtc_disable(struct s3c_rtc *info)
{
- s3c_rtc_setaie(&dev->dev, 0);
+ unsigned int con;
+
+ clk_enable(info->rtc_clk);
+ if (info->data->needs_src_clk)
+ clk_enable(info->rtc_src_clk);
+
+ con = readw(info->base + S3C2410_RTCCON);
+ con &= ~S3C2410_RTCCON_RTCEN;
+ writew(con, info->base + S3C2410_RTCCON);
- clk_unprepare(rtc_clk);
- rtc_clk = NULL;
+ con = readb(info->base + S3C2410_TICNT);
+ con &= ~S3C2410_TICNT_ENABLE;
+ writeb(con, info->base + S3C2410_TICNT);
+
+ if (info->data->needs_src_clk)
+ clk_disable(info->rtc_src_clk);
+ clk_disable(info->rtc_clk);
+}
+
+static void s3c6410_rtc_disable(struct s3c_rtc *info)
+{
+ unsigned int con;
+
+ clk_enable(info->rtc_clk);
+ if (info->data->needs_src_clk)
+ clk_enable(info->rtc_src_clk);
+
+ con = readw(info->base + S3C2410_RTCCON);
+ con &= ~S3C64XX_RTCCON_TICEN;
+ con &= ~S3C2410_RTCCON_RTCEN;
+ writew(con, info->base + S3C2410_RTCCON);
+
+ if (info->data->needs_src_clk)
+ clk_disable(info->rtc_src_clk);
+ clk_disable(info->rtc_clk);
+}
+
+static int s3c_rtc_remove(struct platform_device *pdev)
+{
+ struct s3c_rtc *info = platform_get_drvdata(pdev);
+
+ s3c_rtc_setaie(info->dev, 0);
+
+ clk_unprepare(info->rtc_clk);
+ info->rtc_clk = NULL;
return 0;
}
static const struct of_device_id s3c_rtc_dt_match[];
-static inline int s3c_rtc_get_driver_data(struct platform_device *pdev)
+static struct s3c_rtc_data *s3c_rtc_get_data(struct platform_device *pdev)
{
-#ifdef CONFIG_OF
- struct s3c_rtc_drv_data *data;
- if (pdev->dev.of_node) {
- const struct of_device_id *match;
- match = of_match_node(s3c_rtc_dt_match, pdev->dev.of_node);
- data = (struct s3c_rtc_drv_data *) match->data;
- return data->cpu_type;
- }
-#endif
- return platform_get_device_id(pdev)->driver_data;
+ const struct of_device_id *match;
+
+ match = of_match_node(s3c_rtc_dt_match, pdev->dev.of_node);
+ return (struct s3c_rtc_data *)match->data;
}
static int s3c_rtc_probe(struct platform_device *pdev)
{
- struct rtc_device *rtc;
+ struct s3c_rtc *info = NULL;
struct rtc_time rtc_tm;
struct resource *res;
int ret;
- int tmp;
- dev_dbg(&pdev->dev, "%s: probe=%p\n", __func__, pdev);
+ info = devm_kzalloc(&pdev->dev, sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return -ENOMEM;
/* find the IRQs */
-
- s3c_rtc_tickno = platform_get_irq(pdev, 1);
- if (s3c_rtc_tickno < 0) {
+ info->irq_tick = platform_get_irq(pdev, 1);
+ if (info->irq_tick < 0) {
dev_err(&pdev->dev, "no irq for rtc tick\n");
- return s3c_rtc_tickno;
+ return info->irq_tick;
+ }
+
+ info->dev = &pdev->dev;
+ info->data = s3c_rtc_get_data(pdev);
+ if (!info->data) {
+ dev_err(&pdev->dev, "failed getting s3c_rtc_data\n");
+ return -EINVAL;
}
+ spin_lock_init(&info->pie_lock);
+ spin_lock_init(&info->alarm_clk_lock);
+
+ platform_set_drvdata(pdev, info);
- s3c_rtc_alarmno = platform_get_irq(pdev, 0);
- if (s3c_rtc_alarmno < 0) {
+ info->irq_alarm = platform_get_irq(pdev, 0);
+ if (info->irq_alarm < 0) {
dev_err(&pdev->dev, "no irq for alarm\n");
- return s3c_rtc_alarmno;
+ return info->irq_alarm;
}
dev_dbg(&pdev->dev, "s3c2410_rtc: tick irq %d, alarm irq %d\n",
- s3c_rtc_tickno, s3c_rtc_alarmno);
+ info->irq_tick, info->irq_alarm);
/* get the memory region */
-
res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
- s3c_rtc_base = devm_ioremap_resource(&pdev->dev, res);
- if (IS_ERR(s3c_rtc_base))
- return PTR_ERR(s3c_rtc_base);
-
- rtc_clk = devm_clk_get(&pdev->dev, "rtc");
- if (IS_ERR(rtc_clk)) {
- dev_err(&pdev->dev, "failed to find rtc clock source\n");
- ret = PTR_ERR(rtc_clk);
- rtc_clk = NULL;
- return ret;
+ info->base = devm_ioremap_resource(&pdev->dev, res);
+ if (IS_ERR(info->base))
+ return PTR_ERR(info->base);
+
+ info->rtc_clk = devm_clk_get(&pdev->dev, "rtc");
+ if (IS_ERR(info->rtc_clk)) {
+ dev_err(&pdev->dev, "failed to find rtc clock\n");
+ return PTR_ERR(info->rtc_clk);
}
+ clk_prepare_enable(info->rtc_clk);
- clk_prepare_enable(rtc_clk);
+ info->rtc_src_clk = devm_clk_get(&pdev->dev, "rtc_src");
+ if (IS_ERR(info->rtc_src_clk)) {
+ dev_err(&pdev->dev, "failed to find rtc source clock\n");
+ return PTR_ERR(info->rtc_src_clk);
+ }
+ clk_prepare_enable(info->rtc_src_clk);
- /* check to see if everything is setup correctly */
- s3c_rtc_enable(pdev, 1);
+ /* check to see if everything is setup correctly */
+ if (info->data->enable)
+ info->data->enable(info);
dev_dbg(&pdev->dev, "s3c2410_rtc: RTCCON=%02x\n",
- readw(s3c_rtc_base + S3C2410_RTCCON));
+ readw(info->base + S3C2410_RTCCON));
device_init_wakeup(&pdev->dev, 1);
/* register RTC and exit */
-
- rtc = devm_rtc_device_register(&pdev->dev, "s3c", &s3c_rtcops,
+ info->rtc = devm_rtc_device_register(&pdev->dev, "s3c", &s3c_rtcops,
THIS_MODULE);
-
- if (IS_ERR(rtc)) {
+ if (IS_ERR(info->rtc)) {
dev_err(&pdev->dev, "cannot attach rtc\n");
- ret = PTR_ERR(rtc);
+ ret = PTR_ERR(info->rtc);
goto err_nortc;
}
- s3c_rtc_cpu_type = s3c_rtc_get_driver_data(pdev);
+ ret = devm_request_irq(&pdev->dev, info->irq_alarm, s3c_rtc_alarmirq,
+ 0, "s3c2410-rtc alarm", info);
+ if (ret) {
+ dev_err(&pdev->dev, "IRQ%d error %d\n", info->irq_alarm, ret);
+ goto err_nortc;
+ }
- /* Check RTC Time */
+ ret = devm_request_irq(&pdev->dev, info->irq_tick, s3c_rtc_tickirq,
+ 0, "s3c2410-rtc tick", info);
+ if (ret) {
+ dev_err(&pdev->dev, "IRQ%d error %d\n", info->irq_tick, ret);
+ goto err_nortc;
+ }
- s3c_rtc_gettime(NULL, &rtc_tm);
+ /* Check RTC Time */
+ s3c_rtc_gettime(&pdev->dev, &rtc_tm);
if (rtc_valid_tm(&rtc_tm)) {
rtc_tm.tm_year = 100;
@@ -523,163 +586,312 @@ static int s3c_rtc_probe(struct platform_device *pdev)
rtc_tm.tm_min = 0;
rtc_tm.tm_sec = 0;
- s3c_rtc_settime(NULL, &rtc_tm);
+ s3c_rtc_settime(&pdev->dev, &rtc_tm);
dev_warn(&pdev->dev, "warning: invalid RTC value so initializing it\n");
}
- if (s3c_rtc_cpu_type != TYPE_S3C2410)
- rtc->max_user_freq = 32768;
- else
- rtc->max_user_freq = 128;
-
- if (s3c_rtc_cpu_type == TYPE_S3C2416 || s3c_rtc_cpu_type == TYPE_S3C2443) {
- tmp = readw(s3c_rtc_base + S3C2410_RTCCON);
- tmp |= S3C2443_RTCCON_TICSEL;
- writew(tmp, s3c_rtc_base + S3C2410_RTCCON);
- }
+ if (info->data->select_tick_clk)
+ info->data->select_tick_clk(info);
- platform_set_drvdata(pdev, rtc);
+ s3c_rtc_setfreq(info, 1);
- s3c_rtc_setfreq(&pdev->dev, 1);
-
- ret = devm_request_irq(&pdev->dev, s3c_rtc_alarmno, s3c_rtc_alarmirq,
- 0, "s3c2410-rtc alarm", rtc);
- if (ret) {
- dev_err(&pdev->dev, "IRQ%d error %d\n", s3c_rtc_alarmno, ret);
- goto err_nortc;
- }
-
- ret = devm_request_irq(&pdev->dev, s3c_rtc_tickno, s3c_rtc_tickirq,
- 0, "s3c2410-rtc tick", rtc);
- if (ret) {
- dev_err(&pdev->dev, "IRQ%d error %d\n", s3c_rtc_tickno, ret);
- goto err_nortc;
- }
-
- clk_disable(rtc_clk);
+ if (info->data->needs_src_clk)
+ clk_disable(info->rtc_src_clk);
+ clk_disable(info->rtc_clk);
return 0;
err_nortc:
- s3c_rtc_enable(pdev, 0);
- clk_disable_unprepare(rtc_clk);
+ if (info->data->disable)
+ info->data->disable(info);
+ clk_disable_unprepare(info->rtc_clk);
return ret;
}
#ifdef CONFIG_PM_SLEEP
-/* RTC Power management control */
-
-static int ticnt_save, ticnt_en_save;
-static bool wake_en;
static int s3c_rtc_suspend(struct device *dev)
{
- struct platform_device *pdev = to_platform_device(dev);
+ struct s3c_rtc *info = dev_get_drvdata(dev);
+
+ clk_enable(info->rtc_clk);
+ if (info->data->needs_src_clk)
+ clk_enable(info->rtc_src_clk);
- clk_enable(rtc_clk);
/* save TICNT for anyone using periodic interrupts */
- if (s3c_rtc_cpu_type == TYPE_S3C64XX) {
- ticnt_en_save = readw(s3c_rtc_base + S3C2410_RTCCON);
- ticnt_en_save &= S3C64XX_RTCCON_TICEN;
- ticnt_save = readl(s3c_rtc_base + S3C2410_TICNT);
- } else {
- ticnt_save = readb(s3c_rtc_base + S3C2410_TICNT);
- }
- s3c_rtc_enable(pdev, 0);
+ if (info->data->save_tick_cnt)
+ info->data->save_tick_cnt(info);
+
+ if (info->data->disable)
+ info->data->disable(info);
- if (device_may_wakeup(dev) && !wake_en) {
- if (enable_irq_wake(s3c_rtc_alarmno) == 0)
- wake_en = true;
+ if (device_may_wakeup(dev) && !info->wake_en) {
+ if (enable_irq_wake(info->irq_alarm) == 0)
+ info->wake_en = true;
else
dev_err(dev, "enable_irq_wake failed\n");
}
- clk_disable(rtc_clk);
+
+ if (info->data->needs_src_clk)
+ clk_disable(info->rtc_src_clk);
+ clk_disable(info->rtc_clk);
return 0;
}
static int s3c_rtc_resume(struct device *dev)
{
- struct platform_device *pdev = to_platform_device(dev);
- unsigned int tmp;
+ struct s3c_rtc *info = dev_get_drvdata(dev);
- clk_enable(rtc_clk);
- s3c_rtc_enable(pdev, 1);
- if (s3c_rtc_cpu_type == TYPE_S3C64XX) {
- writel(ticnt_save, s3c_rtc_base + S3C2410_TICNT);
- if (ticnt_en_save) {
- tmp = readw(s3c_rtc_base + S3C2410_RTCCON);
- writew(tmp | ticnt_en_save,
- s3c_rtc_base + S3C2410_RTCCON);
- }
- } else {
- writeb(ticnt_save, s3c_rtc_base + S3C2410_TICNT);
- }
+ clk_enable(info->rtc_clk);
+ if (info->data->needs_src_clk)
+ clk_enable(info->rtc_src_clk);
- if (device_may_wakeup(dev) && wake_en) {
- disable_irq_wake(s3c_rtc_alarmno);
- wake_en = false;
+ if (info->data->enable)
+ info->data->enable(info);
+
+ if (info->data->restore_tick_cnt)
+ info->data->restore_tick_cnt(info);
+
+ if (device_may_wakeup(dev) && info->wake_en) {
+ disable_irq_wake(info->irq_alarm);
+ info->wake_en = false;
}
- clk_disable(rtc_clk);
+
+ if (info->data->needs_src_clk)
+ clk_disable(info->rtc_src_clk);
+ clk_disable(info->rtc_clk);
return 0;
}
#endif
-
static SIMPLE_DEV_PM_OPS(s3c_rtc_pm_ops, s3c_rtc_suspend, s3c_rtc_resume);
-#ifdef CONFIG_OF
-static struct s3c_rtc_drv_data s3c_rtc_drv_data_array[] = {
- [TYPE_S3C2410] = { TYPE_S3C2410 },
- [TYPE_S3C2416] = { TYPE_S3C2416 },
- [TYPE_S3C2443] = { TYPE_S3C2443 },
- [TYPE_S3C64XX] = { TYPE_S3C64XX },
+static void s3c24xx_rtc_irq(struct s3c_rtc *info, int mask)
+{
+ clk_enable(info->rtc_clk);
+ if (info->data->needs_src_clk)
+ clk_enable(info->rtc_src_clk);
+ rtc_update_irq(info->rtc, 1, RTC_AF | RTC_IRQF);
+ if (info->data->needs_src_clk)
+ clk_disable(info->rtc_src_clk);
+ clk_disable(info->rtc_clk);
+
+ s3c_rtc_alarm_clk_enable(info, false);
+}
+
+static void s3c6410_rtc_irq(struct s3c_rtc *info, int mask)
+{
+ clk_enable(info->rtc_clk);
+ if (info->data->needs_src_clk)
+ clk_enable(info->rtc_src_clk);
+ rtc_update_irq(info->rtc, 1, RTC_AF | RTC_IRQF);
+ writeb(mask, info->base + S3C2410_INTP);
+ if (info->data->needs_src_clk)
+ clk_disable(info->rtc_src_clk);
+ clk_disable(info->rtc_clk);
+
+ s3c_rtc_alarm_clk_enable(info, false);
+}
+
+static void s3c2410_rtc_setfreq(struct s3c_rtc *info, int freq)
+{
+ unsigned int tmp = 0;
+ int val;
+
+ tmp = readb(info->base + S3C2410_TICNT);
+ tmp &= S3C2410_TICNT_ENABLE;
+
+ val = (info->rtc->max_user_freq / freq) - 1;
+ tmp |= val;
+
+ writel(tmp, info->base + S3C2410_TICNT);
+}
+
+static void s3c2416_rtc_setfreq(struct s3c_rtc *info, int freq)
+{
+ unsigned int tmp = 0;
+ int val;
+
+ tmp = readb(info->base + S3C2410_TICNT);
+ tmp &= S3C2410_TICNT_ENABLE;
+
+ val = (info->rtc->max_user_freq / freq) - 1;
+
+ tmp |= S3C2443_TICNT_PART(val);
+ writel(S3C2443_TICNT1_PART(val), info->base + S3C2443_TICNT1);
+
+ writel(S3C2416_TICNT2_PART(val), info->base + S3C2416_TICNT2);
+
+ writel(tmp, info->base + S3C2410_TICNT);
+}
+
+static void s3c2443_rtc_setfreq(struct s3c_rtc *info, int freq)
+{
+ unsigned int tmp = 0;
+ int val;
+
+ tmp = readb(info->base + S3C2410_TICNT);
+ tmp &= S3C2410_TICNT_ENABLE;
+
+ val = (info->rtc->max_user_freq / freq) - 1;
+
+ tmp |= S3C2443_TICNT_PART(val);
+ writel(S3C2443_TICNT1_PART(val), info->base + S3C2443_TICNT1);
+
+ writel(tmp, info->base + S3C2410_TICNT);
+}
+
+static void s3c6410_rtc_setfreq(struct s3c_rtc *info, int freq)
+{
+ int val;
+
+ val = (info->rtc->max_user_freq / freq) - 1;
+ writel(val, info->base + S3C2410_TICNT);
+}
+
+static void s3c24xx_rtc_enable_tick(struct s3c_rtc *info, struct seq_file *seq)
+{
+ unsigned int ticnt;
+
+ ticnt = readb(info->base + S3C2410_TICNT);
+ ticnt &= S3C2410_TICNT_ENABLE;
+
+ seq_printf(seq, "periodic_IRQ\t: %s\n", ticnt ? "yes" : "no");
+}
+
+static void s3c2416_rtc_select_tick_clk(struct s3c_rtc *info)
+{
+ unsigned int con;
+
+ con = readw(info->base + S3C2410_RTCCON);
+ con |= S3C2443_RTCCON_TICSEL;
+ writew(con, info->base + S3C2410_RTCCON);
+}
+
+static void s3c6410_rtc_enable_tick(struct s3c_rtc *info, struct seq_file *seq)
+{
+ unsigned int ticnt;
+
+ ticnt = readw(info->base + S3C2410_RTCCON);
+ ticnt &= S3C64XX_RTCCON_TICEN;
+
+ seq_printf(seq, "periodic_IRQ\t: %s\n", ticnt ? "yes" : "no");
+}
+
+static void s3c24xx_rtc_save_tick_cnt(struct s3c_rtc *info)
+{
+ info->ticnt_save = readb(info->base + S3C2410_TICNT);
+}
+
+static void s3c24xx_rtc_restore_tick_cnt(struct s3c_rtc *info)
+{
+ writeb(info->ticnt_save, info->base + S3C2410_TICNT);
+}
+
+static void s3c6410_rtc_save_tick_cnt(struct s3c_rtc *info)
+{
+ info->ticnt_en_save = readw(info->base + S3C2410_RTCCON);
+ info->ticnt_en_save &= S3C64XX_RTCCON_TICEN;
+ info->ticnt_save = readl(info->base + S3C2410_TICNT);
+}
+
+static void s3c6410_rtc_restore_tick_cnt(struct s3c_rtc *info)
+{
+ unsigned int con;
+
+ writel(info->ticnt_save, info->base + S3C2410_TICNT);
+ if (info->ticnt_en_save) {
+ con = readw(info->base + S3C2410_RTCCON);
+ writew(con | info->ticnt_en_save,
+ info->base + S3C2410_RTCCON);
+ }
+}
+
+static struct s3c_rtc_data const s3c2410_rtc_data = {
+ .max_user_freq = 128,
+ .irq_handler = s3c24xx_rtc_irq,
+ .set_freq = s3c2410_rtc_setfreq,
+ .enable_tick = s3c24xx_rtc_enable_tick,
+ .save_tick_cnt = s3c24xx_rtc_save_tick_cnt,
+ .restore_tick_cnt = s3c24xx_rtc_restore_tick_cnt,
+ .enable = s3c24xx_rtc_enable,
+ .disable = s3c24xx_rtc_disable,
+};
+
+static struct s3c_rtc_data const s3c2416_rtc_data = {
+ .max_user_freq = 32768,
+ .irq_handler = s3c24xx_rtc_irq,
+ .set_freq = s3c2416_rtc_setfreq,
+ .enable_tick = s3c24xx_rtc_enable_tick,
+ .select_tick_clk = s3c2416_rtc_select_tick_clk,
+ .save_tick_cnt = s3c24xx_rtc_save_tick_cnt,
+ .restore_tick_cnt = s3c24xx_rtc_restore_tick_cnt,
+ .enable = s3c24xx_rtc_enable,
+ .disable = s3c24xx_rtc_disable,
+};
+
+static struct s3c_rtc_data const s3c2443_rtc_data = {
+ .max_user_freq = 32768,
+ .irq_handler = s3c24xx_rtc_irq,
+ .set_freq = s3c2443_rtc_setfreq,
+ .enable_tick = s3c24xx_rtc_enable_tick,
+ .select_tick_clk = s3c2416_rtc_select_tick_clk,
+ .save_tick_cnt = s3c24xx_rtc_save_tick_cnt,
+ .restore_tick_cnt = s3c24xx_rtc_restore_tick_cnt,
+ .enable = s3c24xx_rtc_enable,
+ .disable = s3c24xx_rtc_disable,
+};
+
+static struct s3c_rtc_data const s3c6410_rtc_data = {
+ .max_user_freq = 32768,
+ .irq_handler = s3c6410_rtc_irq,
+ .set_freq = s3c6410_rtc_setfreq,
+ .enable_tick = s3c6410_rtc_enable_tick,
+ .save_tick_cnt = s3c6410_rtc_save_tick_cnt,
+ .restore_tick_cnt = s3c6410_rtc_restore_tick_cnt,
+ .enable = s3c24xx_rtc_enable,
+ .disable = s3c6410_rtc_disable,
+};
+
+static struct s3c_rtc_data const exynos3250_rtc_data = {
+ .max_user_freq = 32768,
+ .needs_src_clk = true,
+ .irq_handler = s3c6410_rtc_irq,
+ .set_freq = s3c6410_rtc_setfreq,
+ .enable_tick = s3c6410_rtc_enable_tick,
+ .save_tick_cnt = s3c6410_rtc_save_tick_cnt,
+ .restore_tick_cnt = s3c6410_rtc_restore_tick_cnt,
+ .enable = s3c24xx_rtc_enable,
+ .disable = s3c6410_rtc_disable,
};
static const struct of_device_id s3c_rtc_dt_match[] = {
{
.compatible = "samsung,s3c2410-rtc",
- .data = &s3c_rtc_drv_data_array[TYPE_S3C2410],
+ .data = (void *)&s3c2410_rtc_data,
}, {
.compatible = "samsung,s3c2416-rtc",
- .data = &s3c_rtc_drv_data_array[TYPE_S3C2416],
+ .data = (void *)&s3c2416_rtc_data,
}, {
.compatible = "samsung,s3c2443-rtc",
- .data = &s3c_rtc_drv_data_array[TYPE_S3C2443],
+ .data = (void *)&s3c2443_rtc_data,
}, {
.compatible = "samsung,s3c6410-rtc",
- .data = &s3c_rtc_drv_data_array[TYPE_S3C64XX],
- },
- {},
-};
-MODULE_DEVICE_TABLE(of, s3c_rtc_dt_match);
-#endif
-
-static struct platform_device_id s3c_rtc_driver_ids[] = {
- {
- .name = "s3c2410-rtc",
- .driver_data = TYPE_S3C2410,
- }, {
- .name = "s3c2416-rtc",
- .driver_data = TYPE_S3C2416,
- }, {
- .name = "s3c2443-rtc",
- .driver_data = TYPE_S3C2443,
+ .data = (void *)&s3c6410_rtc_data,
}, {
- .name = "s3c64xx-rtc",
- .driver_data = TYPE_S3C64XX,
+ .compatible = "samsung,exynos3250-rtc",
+ .data = (void *)&exynos3250_rtc_data,
},
- { }
+ { /* sentinel */ },
};
-
-MODULE_DEVICE_TABLE(platform, s3c_rtc_driver_ids);
+MODULE_DEVICE_TABLE(of, s3c_rtc_dt_match);
static struct platform_driver s3c_rtc_driver = {
.probe = s3c_rtc_probe,
.remove = s3c_rtc_remove,
- .id_table = s3c_rtc_driver_ids,
.driver = {
.name = "s3c-rtc",
.owner = THIS_MODULE,
@@ -687,7 +899,6 @@ static struct platform_driver s3c_rtc_driver = {
.of_match_table = of_match_ptr(s3c_rtc_dt_match),
},
};
-
module_platform_driver(s3c_rtc_driver);
MODULE_DESCRIPTION("Samsung S3C RTC Driver");
diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig
index b24aa010f68c..65cd80bf9aec 100644
--- a/drivers/tty/Kconfig
+++ b/drivers/tty/Kconfig
@@ -13,6 +13,10 @@ config VT
bool "Virtual terminal" if EXPERT
depends on !S390 && !UML
select INPUT
+ select NEW_LEDS
+ select LEDS_CLASS
+ select LEDS_TRIGGERS
+ select INPUT_LEDS
default y
---help---
If you say Y here, you will get support for terminal devices with
diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c
index c039cfea5b11..9d00088d3ecf 100644
--- a/drivers/tty/vt/keyboard.c
+++ b/drivers/tty/vt/keyboard.c
@@ -33,6 +33,7 @@
#include <linux/string.h>
#include <linux/init.h>
#include <linux/slab.h>
+#include <linux/leds.h>
#include <linux/kbd_kern.h>
#include <linux/kbd_diacr.h>
@@ -130,6 +131,7 @@ static char rep; /* flag telling character repeat */
static int shift_state = 0;
static unsigned char ledstate = 0xff; /* undefined */
+static unsigned char lockstate = 0xff; /* undefined */
static unsigned char ledioctl;
/*
@@ -961,6 +963,41 @@ static void k_brl(struct vc_data *vc, unsigned char value, char up_flag)
}
}
+/* We route VT keyboard "leds" through triggers */
+static void kbd_ledstate_trigger_activate(struct led_classdev *cdev);
+
+static struct led_trigger ledtrig_ledstate[] = {
+#define DEFINE_LEDSTATE_TRIGGER(kbd_led, nam) \
+ [kbd_led] = { \
+ .name = nam, \
+ .activate = kbd_ledstate_trigger_activate, \
+ }
+ DEFINE_LEDSTATE_TRIGGER(VC_SCROLLOCK, "kbd-scrollock"),
+ DEFINE_LEDSTATE_TRIGGER(VC_NUMLOCK, "kbd-numlock"),
+ DEFINE_LEDSTATE_TRIGGER(VC_CAPSLOCK, "kbd-capslock"),
+ DEFINE_LEDSTATE_TRIGGER(VC_KANALOCK, "kbd-kanalock"),
+#undef DEFINE_LEDSTATE_TRIGGER
+};
+
+static void kbd_lockstate_trigger_activate(struct led_classdev *cdev);
+
+static struct led_trigger ledtrig_lockstate[] = {
+#define DEFINE_LOCKSTATE_TRIGGER(kbd_led, nam) \
+ [kbd_led] = { \
+ .name = nam, \
+ .activate = kbd_lockstate_trigger_activate, \
+ }
+ DEFINE_LOCKSTATE_TRIGGER(VC_SHIFTLOCK, "kbd-shiftlock"),
+ DEFINE_LOCKSTATE_TRIGGER(VC_ALTGRLOCK, "kbd-altgrlock"),
+ DEFINE_LOCKSTATE_TRIGGER(VC_CTRLLOCK, "kbd-ctrllock"),
+ DEFINE_LOCKSTATE_TRIGGER(VC_ALTLOCK, "kbd-altlock"),
+ DEFINE_LOCKSTATE_TRIGGER(VC_SHIFTLLOCK, "kbd-shiftllock"),
+ DEFINE_LOCKSTATE_TRIGGER(VC_SHIFTRLOCK, "kbd-shiftrlock"),
+ DEFINE_LOCKSTATE_TRIGGER(VC_CTRLLLOCK, "kbd-ctrlllock"),
+ DEFINE_LOCKSTATE_TRIGGER(VC_CTRLRLOCK, "kbd-ctrlrlock"),
+#undef DEFINE_LOCKSTATE_TRIGGER
+};
+
/*
* The leds display either (i) the status of NumLock, CapsLock, ScrollLock,
* or (ii) whatever pattern of lights people want to show using KDSETLED,
@@ -995,18 +1032,25 @@ static inline unsigned char getleds(void)
return kb->ledflagstate;
}
-static int kbd_update_leds_helper(struct input_handle *handle, void *data)
+/* Called on trigger connection, to set initial state */
+static void kbd_ledstate_trigger_activate(struct led_classdev *cdev)
{
- unsigned char leds = *(unsigned char *)data;
+ struct led_trigger *trigger = cdev->trigger;
+ int led = trigger - ledtrig_ledstate;
- if (test_bit(EV_LED, handle->dev->evbit)) {
- input_inject_event(handle, EV_LED, LED_SCROLLL, !!(leds & 0x01));
- input_inject_event(handle, EV_LED, LED_NUML, !!(leds & 0x02));
- input_inject_event(handle, EV_LED, LED_CAPSL, !!(leds & 0x04));
- input_inject_event(handle, EV_SYN, SYN_REPORT, 0);
- }
+ tasklet_disable(&keyboard_tasklet);
+ led_trigger_event(trigger, ledstate & (1 << led) ? LED_FULL : LED_OFF);
+ tasklet_enable(&keyboard_tasklet);
+}
- return 0;
+static void kbd_lockstate_trigger_activate(struct led_classdev *cdev)
+{
+ struct led_trigger *trigger = cdev->trigger;
+ int led = trigger - ledtrig_lockstate;
+
+ tasklet_disable(&keyboard_tasklet);
+ led_trigger_event(trigger, lockstate & (1 << led) ? LED_FULL : LED_OFF);
+ tasklet_enable(&keyboard_tasklet);
}
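
The activate hooks above recover which LED they serve by subtracting the
array base from the trigger pointer, which works because every trigger is
an element of a static array. A runnable miniature of the trick (the
array contents echo the trigger names above, everything else is invented):

#include <stdio.h>

struct trig {
	const char *name;
};

static struct trig trigs[] = {
	{ "kbd-scrollock" },
	{ "kbd-numlock" },
	{ "kbd-capslock" },
};

static void activate(struct trig *t)
{
	/* element address minus array base yields the element's index */
	int led = t - trigs;

	printf("%s maps to bit %d\n", t->name, led);
}

int main(void)
{
	activate(&trigs[2]);	/* prints: kbd-capslock maps to bit 2 */
	return 0;
}
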
/**
@@ -1095,16 +1139,29 @@ static void kbd_bh(unsigned long dummy)
{
unsigned char leds;
unsigned long flags;
-
+ int i;
+
spin_lock_irqsave(&led_lock, flags);
leds = getleds();
spin_unlock_irqrestore(&led_lock, flags);
if (leds != ledstate) {
- input_handler_for_each_handle(&kbd_handler, &leds,
- kbd_update_leds_helper);
+ for (i = 0; i < ARRAY_SIZE(ledtrig_ledstate); i++)
+ if ((leds ^ ledstate) & (1 << i))
+ led_trigger_event(&ledtrig_ledstate[i],
+ leds & (1 << i)
+ ? LED_FULL : LED_OFF);
ledstate = leds;
}
+
+ if (kbd->lockstate != lockstate) {
+ for (i = 0; i < ARRAY_SIZE(ledtrig_lockstate); i++)
+ if ((kbd->lockstate ^ lockstate) & (1 << i))
+ led_trigger_event(&ledtrig_lockstate[i],
+ kbd->lockstate & (1 << i)
+ ? LED_FULL : LED_OFF);
+ lockstate = kbd->lockstate;
+ }
}
DECLARE_TASKLET_DISABLED(keyboard_tasklet, kbd_bh, 0);
@@ -1442,20 +1499,6 @@ static void kbd_disconnect(struct input_handle *handle)
kfree(handle);
}
-/*
- * Start keyboard handler on the new keyboard by refreshing LED state to
- * match the rest of the system.
- */
-static void kbd_start(struct input_handle *handle)
-{
- tasklet_disable(&keyboard_tasklet);
-
- if (ledstate != 0xff)
- kbd_update_leds_helper(handle, &ledstate);
-
- tasklet_enable(&keyboard_tasklet);
-}
-
static const struct input_device_id kbd_ids[] = {
{
.flags = INPUT_DEVICE_ID_MATCH_EVBIT,
@@ -1477,7 +1520,6 @@ static struct input_handler kbd_handler = {
.match = kbd_match,
.connect = kbd_connect,
.disconnect = kbd_disconnect,
- .start = kbd_start,
.name = "kbd",
.id_table = kbd_ids,
};
@@ -1501,6 +1543,20 @@ int __init kbd_init(void)
if (error)
return error;
+ for (i = 0; i < ARRAY_SIZE(ledtrig_ledstate); i++) {
+ error = led_trigger_register(&ledtrig_ledstate[i]);
+ if (error)
+ pr_err("error %d while registering trigger %s\n",
+ error, ledtrig_ledstate[i].name);
+ }
+
+ for (i = 0; i < ARRAY_SIZE(ledtrig_lockstate); i++) {
+ error = led_trigger_register(&ledtrig_lockstate[i]);
+ if (error)
+ pr_err("error %d while registering trigger %s\n",
+ error, ledtrig_lockstate[i].name);
+ }
+
tasklet_enable(&keyboard_tasklet);
tasklet_schedule(&keyboard_tasklet);
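
With VT keyboard state exported as LED triggers, any LED class device can
follow it just by naming a trigger as its default. A hedged kernel-side
sketch; the device and callback are hypothetical, and only the trigger
name "kbd-capslock" comes from this patch:

#include <linux/leds.h>

static void example_capslock_set(struct led_classdev *cdev,
				 enum led_brightness value)
{
	/* hypothetical: drive the board's capslock LED here */
}

static struct led_classdev example_capslock_led = {
	.name		 = "example::capslock",	/* invented name */
	.max_brightness	 = 1,
	.brightness_set	 = example_capslock_set,
	.default_trigger = "kbd-capslock",	/* registered by kbd_init() above */
};

/* a probe path would then call:
 *	led_classdev_register(dev, &example_capslock_led);
 */
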
diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c
index bddc8b17a4d8..0ce8823952a8 100644
--- a/drivers/video/backlight/backlight.c
+++ b/drivers/video/backlight/backlight.c
@@ -190,8 +190,6 @@ static ssize_t brightness_store(struct device *dev,
}
mutex_unlock(&bd->ops_lock);
- backlight_generate_event(bd, BACKLIGHT_UPDATE_SYSFS);
-
return rc;
}
static DEVICE_ATTR_RW(brightness);
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index c6683f2e396c..00b228638274 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -25,6 +25,7 @@ config VIRTIO_PCI
config VIRTIO_BALLOON
tristate "Virtio balloon driver"
depends on VIRTIO
+ select MEMORY_BALLOON
---help---
This driver supports increasing and decreasing the amount
of memory within a KVM guest.
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 25ebe8eecdb7..bdc6a7ea4aad 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -59,7 +59,7 @@ struct virtio_balloon
* Each page on this list adds VIRTIO_BALLOON_PAGES_PER_PAGE
* to num_pages above.
*/
- struct balloon_dev_info *vb_dev_info;
+ struct balloon_dev_info vb_dev_info;
/* Synchronize access/update to this struct virtio_balloon elements */
struct mutex balloon_lock;
@@ -127,7 +127,7 @@ static void set_page_pfns(u32 pfns[], struct page *page)
static void fill_balloon(struct virtio_balloon *vb, size_t num)
{
- struct balloon_dev_info *vb_dev_info = vb->vb_dev_info;
+ struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info;
/* We can only do one array worth at a time. */
num = min(num, ARRAY_SIZE(vb->pfns));
@@ -163,15 +163,15 @@ static void release_pages_by_pfn(const u32 pfns[], unsigned int num)
/* Find pfns pointing at start of each page, get pages and free them. */
for (i = 0; i < num; i += VIRTIO_BALLOON_PAGES_PER_PAGE) {
struct page *page = balloon_pfn_to_page(pfns[i]);
- balloon_page_free(page);
adjust_managed_page_count(page, 1);
+ put_page(page);
}
}
static void leak_balloon(struct virtio_balloon *vb, size_t num)
{
struct page *page;
- struct balloon_dev_info *vb_dev_info = vb->vb_dev_info;
+ struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info;
/* We can only do one array worth at a time. */
num = min(num, ARRAY_SIZE(vb->pfns));
@@ -353,12 +353,11 @@ static int init_vqs(struct virtio_balloon *vb)
return 0;
}
-static const struct address_space_operations virtio_balloon_aops;
#ifdef CONFIG_BALLOON_COMPACTION
/*
* virtballoon_migratepage - perform the balloon page migration on behalf of
 * a compaction thread. (called under page lock)
- * @mapping: the page->mapping which will be assigned to the new migrated page.
+ * @vb_dev_info: the balloon device
* @newpage: page that will replace the isolated page after migration finishes.
* @page : the isolated (old) page that is about to be migrated to newpage.
* @mode : compaction mode -- not used for balloon page migration.
@@ -373,17 +372,13 @@ static const struct address_space_operations virtio_balloon_aops;
 * This function performs the balloon page migration task.
* Called through balloon_mapping->a_ops->migratepage
*/
-static int virtballoon_migratepage(struct address_space *mapping,
+static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info,
struct page *newpage, struct page *page, enum migrate_mode mode)
{
- struct balloon_dev_info *vb_dev_info = balloon_page_device(page);
- struct virtio_balloon *vb;
+ struct virtio_balloon *vb = container_of(vb_dev_info,
+ struct virtio_balloon, vb_dev_info);
unsigned long flags;
- BUG_ON(!vb_dev_info);
-
- vb = vb_dev_info->balloon_device;
-
/*
* In order to avoid lock contention while migrating pages concurrently
* to leak_balloon() or fill_balloon() we just give up the balloon_lock
@@ -395,42 +390,34 @@ static int virtballoon_migratepage(struct address_space *mapping,
if (!mutex_trylock(&vb->balloon_lock))
return -EAGAIN;
+ get_page(newpage); /* balloon reference */
+
/* balloon's page migration 1st step -- inflate "newpage" */
spin_lock_irqsave(&vb_dev_info->pages_lock, flags);
- balloon_page_insert(newpage, mapping, &vb_dev_info->pages);
+ balloon_page_insert(vb_dev_info, newpage);
vb_dev_info->isolated_pages--;
spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags);
vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE;
set_page_pfns(vb->pfns, newpage);
tell_host(vb, vb->inflate_vq);
- /*
- * balloon's page migration 2nd step -- deflate "page"
- *
- * It's safe to delete page->lru here because this page is at
- * an isolated migration list, and this step is expected to happen here
- */
- balloon_page_delete(page);
+ /* balloon's page migration 2nd step -- deflate "page" */
+ balloon_page_delete(page, true);
vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE;
set_page_pfns(vb->pfns, page);
tell_host(vb, vb->deflate_vq);
mutex_unlock(&vb->balloon_lock);
- return MIGRATEPAGE_BALLOON_SUCCESS;
-}
+ put_page(page); /* balloon reference */
-/* define the balloon_mapping->a_ops callback to allow balloon page migration */
-static const struct address_space_operations virtio_balloon_aops = {
- .migratepage = virtballoon_migratepage,
-};
+ return MIGRATEPAGE_SUCCESS;
+}
#endif /* CONFIG_BALLOON_COMPACTION */
static int virtballoon_probe(struct virtio_device *vdev)
{
struct virtio_balloon *vb;
- struct address_space *vb_mapping;
- struct balloon_dev_info *vb_devinfo;
int err;
vdev->priv = vb = kmalloc(sizeof(*vb), GFP_KERNEL);
@@ -446,30 +433,14 @@ static int virtballoon_probe(struct virtio_device *vdev)
vb->vdev = vdev;
vb->need_stats_update = 0;
- vb_devinfo = balloon_devinfo_alloc(vb);
- if (IS_ERR(vb_devinfo)) {
- err = PTR_ERR(vb_devinfo);
- goto out_free_vb;
- }
-
- vb_mapping = balloon_mapping_alloc(vb_devinfo,
- (balloon_compaction_check()) ?
- &virtio_balloon_aops : NULL);
- if (IS_ERR(vb_mapping)) {
- /*
- * IS_ERR(vb_mapping) && PTR_ERR(vb_mapping) == -EOPNOTSUPP
- * This means !CONFIG_BALLOON_COMPACTION, otherwise we get off.
- */
- err = PTR_ERR(vb_mapping);
- if (err != -EOPNOTSUPP)
- goto out_free_vb_devinfo;
- }
-
- vb->vb_dev_info = vb_devinfo;
+ balloon_devinfo_init(&vb->vb_dev_info);
+#ifdef CONFIG_BALLOON_COMPACTION
+ vb->vb_dev_info.migrate_page = virtballoon_migratepage;
+#endif
err = init_vqs(vb);
if (err)
- goto out_free_vb_mapping;
+ goto out_free_vb;
vb->thread = kthread_run(balloon, vb, "vballoon");
if (IS_ERR(vb->thread)) {
@@ -481,10 +452,6 @@ static int virtballoon_probe(struct virtio_device *vdev)
out_del_vqs:
vdev->config->del_vqs(vdev);
-out_free_vb_mapping:
- balloon_mapping_free(vb_mapping);
-out_free_vb_devinfo:
- balloon_devinfo_free(vb_devinfo);
out_free_vb:
kfree(vb);
out:
@@ -510,8 +477,6 @@ static void virtballoon_remove(struct virtio_device *vdev)
kthread_stop(vb->thread);
remove_common(vb);
- balloon_mapping_free(vb->vb_dev_info->mapping);
- balloon_devinfo_free(vb->vb_dev_info);
kfree(vb);
}
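
The balloon rework replaces a separately-allocated balloon_dev_info (with
its back-pointer and fake address_space) by an embedded member, so the
migration callback recovers its owner via container_of. A runnable
userspace miniature of that shape; the types are invented and the
container_of macro is inlined for the sketch:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct dev_info {			/* stands in for balloon_dev_info */
	int isolated_pages;
};

struct balloon {			/* stands in for struct virtio_balloon */
	int num_pages;
	struct dev_info vb_dev_info;	/* embedded: no back-pointer needed */
};

static void migrate_cb(struct dev_info *di)
{
	/* recover the enclosing object from the embedded member */
	struct balloon *vb = container_of(di, struct balloon, vb_dev_info);

	printf("owner holds %d pages\n", vb->num_pages);
}

int main(void)
{
	struct balloon vb = { .num_pages = 256 };

	migrate_cb(&vb.vb_dev_info);
	return 0;
}
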
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 9e359fb20c0a..8e98cf954bab 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -79,6 +79,10 @@ struct autofs_info {
};
#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */
+#define AUTOFS_INF_NO_RCU (1<<1) /* the dentry is being considered
+ * for expiry, so RCU_walk is
+ * not permitted
+ */
#define AUTOFS_INF_PENDING (1<<2) /* dentry pending mount */
struct autofs_wait_queue {
@@ -148,7 +152,7 @@ void autofs4_free_ino(struct autofs_info *);
/* Expiration */
int is_autofs4_dentry(struct dentry *);
-int autofs4_expire_wait(struct dentry *dentry);
+int autofs4_expire_wait(struct dentry *dentry, int rcu_walk);
int autofs4_expire_run(struct super_block *, struct vfsmount *,
struct autofs_sb_info *,
struct autofs_packet_expire __user *);
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 5b570b6efa28..aaf96cb25452 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -450,7 +450,7 @@ static int autofs_dev_ioctl_requester(struct file *fp,
ino = autofs4_dentry_ino(path.dentry);
if (ino) {
err = 0;
- autofs4_expire_wait(path.dentry);
+ autofs4_expire_wait(path.dentry, 0);
spin_lock(&sbi->fs_lock);
param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid);
param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid);
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index a7be57e39be7..4d52272952ec 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -30,12 +30,6 @@ static inline int autofs4_can_expire(struct dentry *dentry,
/* Too young to die */
if (!timeout || time_after(ino->last_used + timeout, now))
return 0;
-
- /* update last_used here :-
- - obviously makes sense if it is in use now
- - less obviously, prevents rapid-fire expire
- attempts if expire fails the first time */
- ino->last_used = now;
}
return 1;
}
@@ -333,10 +327,19 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
if (ino->flags & AUTOFS_INF_PENDING)
goto out;
if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
- ino->flags |= AUTOFS_INF_EXPIRING;
- init_completion(&ino->expire_complete);
+ ino->flags |= AUTOFS_INF_NO_RCU;
spin_unlock(&sbi->fs_lock);
- return root;
+ synchronize_rcu();
+ spin_lock(&sbi->fs_lock);
+ if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
+ ino->flags |= AUTOFS_INF_EXPIRING;
+ smp_mb();
+ ino->flags &= ~AUTOFS_INF_NO_RCU;
+ init_completion(&ino->expire_complete);
+ spin_unlock(&sbi->fs_lock);
+ return root;
+ }
+ ino->flags &= ~AUTOFS_INF_NO_RCU;
}
out:
spin_unlock(&sbi->fs_lock);
@@ -345,6 +348,89 @@ out:
return NULL;
}
+/* Check if 'dentry' should expire, or return a nearby
+ * dentry that is suitable.
+ * If returned dentry is different from arg dentry,
+ * then a dget() reference was taken, else not.
+ */
+static struct dentry *should_expire(struct dentry *dentry,
+ struct vfsmount *mnt,
+ unsigned long timeout,
+ int how)
+{
+ int do_now = how & AUTOFS_EXP_IMMEDIATE;
+ int exp_leaves = how & AUTOFS_EXP_LEAVES;
+ struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ unsigned int ino_count;
+
+ /* No point expiring a pending mount */
+ if (ino->flags & AUTOFS_INF_PENDING)
+ return NULL;
+
+ /*
+ * Case 1: (i) indirect mount or top level pseudo direct mount
+ * (autofs-4.1).
+ * (ii) indirect mount with offset mount, check the "/"
+ * offset (autofs-5.0+).
+ */
+ if (d_mountpoint(dentry)) {
+ DPRINTK("checking mountpoint %p %.*s",
+ dentry, (int)dentry->d_name.len, dentry->d_name.name);
+
+ /* Can we umount this guy */
+ if (autofs4_mount_busy(mnt, dentry))
+ return NULL;
+
+ /* Can we expire this guy */
+ if (autofs4_can_expire(dentry, timeout, do_now))
+ return dentry;
+ return NULL;
+ }
+
+ if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) {
+ DPRINTK("checking symlink %p %.*s",
+ dentry, (int)dentry->d_name.len, dentry->d_name.name);
+ /*
+ * A symlink can't be "busy" in the usual sense so
+ * just check last used for expire timeout.
+ */
+ if (autofs4_can_expire(dentry, timeout, do_now))
+ return dentry;
+ return NULL;
+ }
+
+ if (simple_empty(dentry))
+ return NULL;
+
+ /* Case 2: tree mount, expire iff entire tree is not busy */
+ if (!exp_leaves) {
+ /* Path walk currently on this dentry? */
+ ino_count = atomic_read(&ino->count) + 1;
+ if (d_count(dentry) > ino_count)
+ return NULL;
+
+ if (!autofs4_tree_busy(mnt, dentry, timeout, do_now))
+ return dentry;
+ /*
+ * Case 3: pseudo direct mount, expire individual leaves
+ * (autofs-4.1).
+ */
+ } else {
+ /* Path walk currently on this dentry? */
+ struct dentry *expired;
+ ino_count = atomic_read(&ino->count) + 1;
+ if (d_count(dentry) > ino_count)
+ return NULL;
+
+ expired = autofs4_check_leaves(mnt, dentry, timeout, do_now);
+ if (expired) {
+ if (expired == dentry)
+ dput(dentry);
+ return expired;
+ }
+ }
+ return NULL;
+}
/*
* Find an eligible tree to time-out
* A tree is eligible if :-
@@ -359,11 +445,8 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
unsigned long timeout;
struct dentry *root = sb->s_root;
struct dentry *dentry;
- struct dentry *expired = NULL;
- int do_now = how & AUTOFS_EXP_IMMEDIATE;
- int exp_leaves = how & AUTOFS_EXP_LEAVES;
+ struct dentry *expired;
struct autofs_info *ino;
- unsigned int ino_count;
if (!root)
return NULL;
@@ -375,77 +458,28 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
while ((dentry = get_next_positive_subdir(dentry, root))) {
spin_lock(&sbi->fs_lock);
ino = autofs4_dentry_ino(dentry);
- /* No point expiring a pending mount */
- if (ino->flags & AUTOFS_INF_PENDING)
- goto next;
-
- /*
- * Case 1: (i) indirect mount or top level pseudo direct mount
- * (autofs-4.1).
- * (ii) indirect mount with offset mount, check the "/"
- * offset (autofs-5.0+).
- */
- if (d_mountpoint(dentry)) {
- DPRINTK("checking mountpoint %p %.*s",
- dentry, (int)dentry->d_name.len, dentry->d_name.name);
-
- /* Can we umount this guy */
- if (autofs4_mount_busy(mnt, dentry))
- goto next;
-
- /* Can we expire this guy */
- if (autofs4_can_expire(dentry, timeout, do_now)) {
- expired = dentry;
- goto found;
- }
- goto next;
- }
-
- if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) {
- DPRINTK("checking symlink %p %.*s",
- dentry, (int)dentry->d_name.len, dentry->d_name.name);
- /*
- * A symlink can't be "busy" in the usual sense so
- * just check last used for expire timeout.
- */
- if (autofs4_can_expire(dentry, timeout, do_now)) {
- expired = dentry;
- goto found;
- }
- goto next;
+ if (ino->flags & AUTOFS_INF_NO_RCU)
+ expired = NULL;
+ else
+ expired = should_expire(dentry, mnt, timeout, how);
+ if (!expired) {
+ spin_unlock(&sbi->fs_lock);
+ continue;
}
-
- if (simple_empty(dentry))
- goto next;
-
- /* Case 2: tree mount, expire iff entire tree is not busy */
- if (!exp_leaves) {
- /* Path walk currently on this dentry? */
- ino_count = atomic_read(&ino->count) + 1;
- if (d_count(dentry) > ino_count)
- goto next;
-
- if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) {
- expired = dentry;
- goto found;
- }
- /*
- * Case 3: pseudo direct mount, expire individual leaves
- * (autofs-4.1).
- */
- } else {
- /* Path walk currently on this dentry? */
- ino_count = atomic_read(&ino->count) + 1;
- if (d_count(dentry) > ino_count)
- goto next;
-
- expired = autofs4_check_leaves(mnt, dentry, timeout, do_now);
- if (expired) {
+ ino = autofs4_dentry_ino(expired);
+ ino->flags |= AUTOFS_INF_NO_RCU;
+ spin_unlock(&sbi->fs_lock);
+ synchronize_rcu();
+ spin_lock(&sbi->fs_lock);
+ if (should_expire(expired, mnt, timeout, how)) {
+ if (expired != dentry)
dput(dentry);
- goto found;
- }
+ goto found;
}
-next:
+
+ ino->flags &= ~AUTOFS_INF_NO_RCU;
+ if (expired != dentry)
+ dput(expired);
spin_unlock(&sbi->fs_lock);
}
return NULL;
@@ -453,8 +487,9 @@ next:
found:
DPRINTK("returning %p %.*s",
expired, (int)expired->d_name.len, expired->d_name.name);
- ino = autofs4_dentry_ino(expired);
ino->flags |= AUTOFS_INF_EXPIRING;
+ smp_mb();
+ ino->flags &= ~AUTOFS_INF_NO_RCU;
init_completion(&ino->expire_complete);
spin_unlock(&sbi->fs_lock);
spin_lock(&sbi->lookup_lock);
@@ -467,13 +502,18 @@ found:
return expired;
}
-int autofs4_expire_wait(struct dentry *dentry)
+int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
int status;
/* Block on any pending expire */
+ if (!(ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU)))
+ return 0;
+ if (rcu_walk)
+ return -ECHILD;
+
spin_lock(&sbi->fs_lock);
if (ino->flags & AUTOFS_INF_EXPIRING) {
spin_unlock(&sbi->fs_lock);
@@ -525,6 +565,8 @@ int autofs4_expire_run(struct super_block *sb,
spin_lock(&sbi->fs_lock);
ino = autofs4_dentry_ino(dentry);
+ /* avoid rapid-fire expire attempts if expiry fails */
+ ino->last_used = now;
ino->flags &= ~AUTOFS_INF_EXPIRING;
complete_all(&ino->expire_complete);
spin_unlock(&sbi->fs_lock);
@@ -551,6 +593,8 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
ret = autofs4_wait(sbi, dentry, NFY_EXPIRE);
spin_lock(&sbi->fs_lock);
+ /* avoid rapid-fire expire attempts if expiry fails */
+ ino->last_used = now;
ino->flags &= ~AUTOFS_INF_EXPIRING;
complete_all(&ino->expire_complete);
spin_unlock(&sbi->fs_lock);
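
The heart of the expire changes is a two-phase claim: set
AUTOFS_INF_NO_RCU, wait one RCU grace period so no rcu-walker can still
be inspecting the dentry, re-check busyness under fs_lock, and only then
commit to AUTOFS_INF_EXPIRING. A condensed, hedged distillation of that
sequence; the busyness re-check is abstracted into a caller-supplied
predicate here, which the real code performs with
autofs4_direct_busy()/should_expire():

/* illustrative condensation, not a literal copy of the patch */
static bool example_claim_for_expire(struct autofs_info *ino,
				     spinlock_t *fs_lock,
				     bool (*still_idle)(void *), void *arg)
{
	spin_lock(fs_lock);
	ino->flags |= AUTOFS_INF_NO_RCU;	/* push new walkers off the rcu path */
	spin_unlock(fs_lock);

	synchronize_rcu();			/* wait out walkers already past the check */

	spin_lock(fs_lock);
	if (still_idle(arg)) {			/* re-checked after the grace period */
		ino->flags |= AUTOFS_INF_EXPIRING;
		smp_mb();			/* EXPIRING visible before NO_RCU clears */
		ino->flags &= ~AUTOFS_INF_NO_RCU;
		spin_unlock(fs_lock);
		return true;			/* caller may expire the dentry */
	}
	ino->flags &= ~AUTOFS_INF_NO_RCU;	/* raced with new activity: back off */
	spin_unlock(fs_lock);
	return false;
}
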
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index cdb25ebccc4c..d76d083f2f06 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -210,7 +210,8 @@ next:
return NULL;
}
-static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
+static struct dentry *autofs4_lookup_expiring(struct dentry *dentry,
+ bool rcu_walk)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct dentry *parent = dentry->d_parent;
@@ -229,6 +230,11 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
struct dentry *expiring;
struct qstr *qstr;
+ if (rcu_walk) {
+ spin_unlock(&sbi->lookup_lock);
+ return ERR_PTR(-ECHILD);
+ }
+
ino = list_entry(p, struct autofs_info, expiring);
expiring = ino->dentry;
@@ -264,13 +270,15 @@ next:
return NULL;
}
-static int autofs4_mount_wait(struct dentry *dentry)
+static int autofs4_mount_wait(struct dentry *dentry, bool rcu_walk)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
int status = 0;
if (ino->flags & AUTOFS_INF_PENDING) {
+ if (rcu_walk)
+ return -ECHILD;
DPRINTK("waiting for mount name=%.*s",
dentry->d_name.len, dentry->d_name.name);
status = autofs4_wait(sbi, dentry, NFY_MOUNT);
@@ -280,20 +288,22 @@ static int autofs4_mount_wait(struct dentry *dentry)
return status;
}
-static int do_expire_wait(struct dentry *dentry)
+static int do_expire_wait(struct dentry *dentry, bool rcu_walk)
{
struct dentry *expiring;
- expiring = autofs4_lookup_expiring(dentry);
+ expiring = autofs4_lookup_expiring(dentry, rcu_walk);
+ if (IS_ERR(expiring))
+ return PTR_ERR(expiring);
if (!expiring)
- return autofs4_expire_wait(dentry);
+ return autofs4_expire_wait(dentry, rcu_walk);
else {
/*
* If we are racing with expire the request might not
* be quite complete, but the directory has been removed
* so it must have been successful, just wait for it.
*/
- autofs4_expire_wait(expiring);
+ autofs4_expire_wait(expiring, 0);
autofs4_del_expiring(expiring);
dput(expiring);
}
@@ -345,7 +355,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
* and the directory was removed, so just go ahead and try
* the mount.
*/
- status = do_expire_wait(dentry);
+ status = do_expire_wait(dentry, 0);
if (status && status != -EAGAIN)
return NULL;
@@ -353,7 +363,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
spin_lock(&sbi->fs_lock);
if (ino->flags & AUTOFS_INF_PENDING) {
spin_unlock(&sbi->fs_lock);
- status = autofs4_mount_wait(dentry);
+ status = autofs4_mount_wait(dentry, 0);
if (status)
return ERR_PTR(status);
goto done;
@@ -394,7 +404,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
}
ino->flags |= AUTOFS_INF_PENDING;
spin_unlock(&sbi->fs_lock);
- status = autofs4_mount_wait(dentry);
+ status = autofs4_mount_wait(dentry, 0);
spin_lock(&sbi->fs_lock);
ino->flags &= ~AUTOFS_INF_PENDING;
if (status) {
@@ -423,28 +433,46 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
/* The daemon never waits. */
if (autofs4_oz_mode(sbi)) {
- if (rcu_walk)
- return 0;
if (!d_mountpoint(dentry))
return -EISDIR;
return 0;
}
- /* We need to sleep, so we need pathwalk to be in ref-mode */
- if (rcu_walk)
- return -ECHILD;
-
/* Wait for pending expires */
- do_expire_wait(dentry);
+ if (do_expire_wait(dentry, rcu_walk) == -ECHILD)
+ return -ECHILD;
/*
* This dentry may be under construction so wait on mount
* completion.
*/
- status = autofs4_mount_wait(dentry);
+ status = autofs4_mount_wait(dentry, rcu_walk);
if (status)
return status;
+ if (rcu_walk) {
+ /* We don't need fs_lock in rcu_walk mode,
+	 * just testing 'AUTOFS_INF_NO_RCU' is enough.
+ * simple_empty() takes a spinlock, so leave it
+ * to last.
+ * We only return -EISDIR when certain this isn't
+ * a mount-trap.
+ */
+ struct inode *inode;
+ if (ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU))
+ return 0;
+ if (d_mountpoint(dentry))
+ return 0;
+ inode = ACCESS_ONCE(dentry->d_inode);
+ if (inode && S_ISLNK(inode->i_mode))
+ return -EISDIR;
+ if (list_empty(&dentry->d_subdirs))
+ return 0;
+ if (!simple_empty(dentry))
+ return -EISDIR;
+ return 0;
+ }
+
spin_lock(&sbi->fs_lock);
/*
* If the dentry has been selected for expire while we slept
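
A pattern worth noting in the root.c hunks above: helpers that might need
to sleep now take an rcu_walk flag and return -ECHILD instead of
blocking, and callers propagate that code so the VFS retries the walk in
ref-walk mode. A runnable miniature of the propagation; every function
here is invented:

#include <errno.h>
#include <stdio.h>

static int lookup_expiring(int rcu_walk)
{
	if (rcu_walk)
		return -ECHILD;	/* would need a lock and possibly a sleep */
	return 0;		/* ref-walk mode: safe to block */
}

static int do_expire_wait_sketch(int rcu_walk)
{
	int err = lookup_expiring(rcu_walk);

	if (err)
		return err;	/* propagate -ECHILD up toward the VFS */
	return 0;		/* ...then wait for the expire to finish */
}

int main(void)
{
	printf("rcu-walk: %d, ref-walk: %d\n",
	       do_expire_wait_sketch(1), do_expire_wait_sketch(0));
	return 0;
}
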
diff --git a/fs/befs/btree.c b/fs/befs/btree.c
index 9c7faa8a9288..0826e91dacda 100644
--- a/fs/befs/btree.c
+++ b/fs/befs/btree.c
@@ -78,11 +78,11 @@
/*
* In memory structure of each btree node
*/
-typedef struct {
+struct befs_btree_node {
befs_host_btree_nodehead head; /* head of node converted to cpu byteorder */
struct buffer_head *bh;
befs_btree_nodehead *od_node; /* on disk node */
-} befs_btree_node;
+};
/* local constants */
static const befs_off_t befs_bt_inval = 0xffffffffffffffffULL;
@@ -90,27 +90,30 @@ static const befs_off_t befs_bt_inval = 0xffffffffffffffffULL;
/* local functions */
static int befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds,
befs_btree_super * bt_super,
- befs_btree_node * this_node,
+ struct befs_btree_node *this_node,
befs_off_t * node_off);
static int befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
befs_btree_super * sup);
static int befs_bt_read_node(struct super_block *sb, befs_data_stream * ds,
- befs_btree_node * node, befs_off_t node_off);
+ struct befs_btree_node *node,
+ befs_off_t node_off);
-static int befs_leafnode(befs_btree_node * node);
+static int befs_leafnode(struct befs_btree_node *node);
-static fs16 *befs_bt_keylen_index(befs_btree_node * node);
+static fs16 *befs_bt_keylen_index(struct befs_btree_node *node);
-static fs64 *befs_bt_valarray(befs_btree_node * node);
+static fs64 *befs_bt_valarray(struct befs_btree_node *node);
-static char *befs_bt_keydata(befs_btree_node * node);
+static char *befs_bt_keydata(struct befs_btree_node *node);
-static int befs_find_key(struct super_block *sb, befs_btree_node * node,
+static int befs_find_key(struct super_block *sb,
+ struct befs_btree_node *node,
const char *findkey, befs_off_t * value);
-static char *befs_bt_get_key(struct super_block *sb, befs_btree_node * node,
+static char *befs_bt_get_key(struct super_block *sb,
+ struct befs_btree_node *node,
int index, u16 * keylen);
static int befs_compare_strings(const void *key1, int keylen1,
@@ -191,7 +194,7 @@ befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
static int
befs_bt_read_node(struct super_block *sb, befs_data_stream * ds,
- befs_btree_node * node, befs_off_t node_off)
+ struct befs_btree_node *node, befs_off_t node_off)
{
uint off = 0;
@@ -247,7 +250,7 @@ int
befs_btree_find(struct super_block *sb, befs_data_stream * ds,
const char *key, befs_off_t * value)
{
- befs_btree_node *this_node = NULL;
+ struct befs_btree_node *this_node = NULL;
befs_btree_super bt_super;
befs_off_t node_off;
int res;
@@ -260,11 +263,11 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
goto error;
}
- this_node = kmalloc(sizeof (befs_btree_node),
+ this_node = kmalloc(sizeof(struct befs_btree_node),
GFP_NOFS);
if (!this_node) {
befs_error(sb, "befs_btree_find() failed to allocate %zu "
- "bytes of memory", sizeof (befs_btree_node));
+ "bytes of memory", sizeof(struct befs_btree_node));
goto error;
}
@@ -333,7 +336,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
* Use binary search instead of a linear.
*/
static int
-befs_find_key(struct super_block *sb, befs_btree_node * node,
+befs_find_key(struct super_block *sb, struct befs_btree_node *node,
const char *findkey, befs_off_t * value)
{
int first, last, mid;
@@ -417,7 +420,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
loff_t key_no, size_t bufsize, char *keybuf, size_t * keysize,
befs_off_t * value)
{
- befs_btree_node *this_node;
+ struct befs_btree_node *this_node;
befs_btree_super bt_super;
befs_off_t node_off = 0;
int cur_key;
@@ -436,9 +439,10 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
goto error;
}
- if ((this_node = kmalloc(sizeof (befs_btree_node), GFP_NOFS)) == NULL) {
+ this_node = kmalloc(sizeof(struct befs_btree_node), GFP_NOFS);
+ if (this_node == NULL) {
befs_error(sb, "befs_btree_read() failed to allocate %zu "
- "bytes of memory", sizeof (befs_btree_node));
+ "bytes of memory", sizeof(struct befs_btree_node));
goto error;
}
@@ -545,7 +549,8 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
*/
static int
befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds,
- befs_btree_super * bt_super, befs_btree_node * this_node,
+ befs_btree_super *bt_super,
+ struct befs_btree_node *this_node,
befs_off_t * node_off)
{
@@ -600,7 +605,7 @@ befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds,
* Return 1 if leaf, 0 if interior
*/
static int
-befs_leafnode(befs_btree_node * node)
+befs_leafnode(struct befs_btree_node *node)
{
/* all interior nodes (and only interior nodes) have an overflow node */
if (node->head.overflow == befs_bt_inval)
@@ -623,7 +628,7 @@ befs_leafnode(befs_btree_node * node)
* Except that rounding up to 8 works, and rounding up to 4 doesn't.
*/
static fs16 *
-befs_bt_keylen_index(befs_btree_node * node)
+befs_bt_keylen_index(struct befs_btree_node *node)
{
const int keylen_align = 8;
unsigned long int off =
@@ -644,7 +649,7 @@ befs_bt_keylen_index(befs_btree_node * node)
* of the node pointed to by the node header
*/
static fs64 *
-befs_bt_valarray(befs_btree_node * node)
+befs_bt_valarray(struct befs_btree_node *node)
{
void *keylen_index_start = (void *) befs_bt_keylen_index(node);
size_t keylen_index_size = node->head.all_key_count * sizeof (fs16);
@@ -660,7 +665,7 @@ befs_bt_valarray(befs_btree_node * node)
* of the node pointed to by the node header
*/
static char *
-befs_bt_keydata(befs_btree_node * node)
+befs_bt_keydata(struct befs_btree_node *node)
{
return (char *) ((void *) node->od_node + sizeof (befs_btree_nodehead));
}
@@ -676,7 +681,7 @@ befs_bt_keydata(befs_btree_node * node)
* Returns NULL on failure (bad input) and sets *@keylen = 0
*/
static char *
-befs_bt_get_key(struct super_block *sb, befs_btree_node * node,
+befs_bt_get_key(struct super_block *sb, struct befs_btree_node *node,
int index, u16 * keylen)
{
int prev_key_end;
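The comment above befs_find_key() notes that lookup within a node uses binary search rather than a linear scan. A minimal, self-contained sketch of that search over sorted string keys (illustrative only; the on-disk befs keys are length-prefixed and compared with befs_compare_strings()):

#include <stdio.h>
#include <string.h>

/* Keys within a node are kept sorted, so lookup is a binary search. */
static int find_key(const char *keys[], int nkeys,
		    const char *wanted, int *index)
{
	int first = 0, last = nkeys - 1;

	while (first <= last) {
		int mid = first + (last - first) / 2;
		int cmp = strcmp(keys[mid], wanted);

		if (cmp == 0) {
			*index = mid;
			return 1;	/* found */
		}
		if (cmp < 0)
			first = mid + 1;
		else
			last = mid - 1;
	}
	*index = first;			/* would-be insertion point */
	return 0;
}

int main(void)
{
	const char *keys[] = { "alpha", "bravo", "delta", "golf" };
	int idx;

	if (find_key(keys, 4, "delta", &idx))
		printf("found at %d\n", idx);	/* found at 2 */
	return 0;
}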
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index b60500300dd7..c96b855d2a5e 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -62,7 +62,22 @@ static struct file_system_type bm_fs_type;
static struct vfsmount *bm_mnt;
static int entry_count;
-/*
+/*
+ * Max length of the register string. Determined by:
+ * - 7 delimiters
+ * - name: ~50 bytes
+ * - type: 1 byte
+ * - offset: 3 bytes (has to be smaller than BINPRM_BUF_SIZE)
+ * - magic: 128 bytes (512 in escaped form)
+ * - mask: 128 bytes (512 in escaped form)
+ * - interp: ~50 bytes
+ * - flags: 5 bytes
+ * Round that up a bit, and then back off to hold the internal data
+ * (like struct Node).
+ */
+#define MAX_REGISTER_LENGTH 1920
+
+/*
* Check if we support the binfmt
* if we do, return the node, else NULL
* locking is done in load_misc_binary
@@ -279,7 +294,7 @@ static Node *create_entry(const char __user *buffer, size_t count)
/* some sanity checks */
err = -EINVAL;
- if ((count < 11) || (count > 256))
+ if ((count < 11) || (count > MAX_REGISTER_LENGTH))
goto out;
err = -ENOMEM;
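For reference, the worst-case arithmetic behind MAX_REGISTER_LENGTH, using the estimates from the comment above. The sum comes to 1140 bytes; reading the 1920 figure as 2048 rounded up, minus headroom for the internal data, is one plausible interpretation rather than something the patch states:

#include <stdio.h>

int main(void)
{
	int worst = 7	/* delimiters */
		  + 50	/* name (estimate) */
		  + 1	/* type */
		  + 3	/* offset */
		  + 512	/* magic, escaped */
		  + 512	/* mask, escaped */
		  + 50	/* interp (estimate) */
		  + 5;	/* flags */

	printf("worst case: %d bytes, limit: %d\n", worst, 1920);
	return 0;
}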
diff --git a/fs/block_dev.c b/fs/block_dev.c
index cc8d68ac29aa..cc9d4114cda0 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -294,6 +294,12 @@ static int blkdev_readpage(struct file * file, struct page * page)
return block_read_full_page(page, blkdev_get_block);
}
+static int blkdev_readpages(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block);
+}
+
static int blkdev_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -1610,6 +1616,7 @@ static int blkdev_releasepage(struct page *page, gfp_t wait)
static const struct address_space_operations def_blk_aops = {
.readpage = blkdev_readpage,
+ .readpages = blkdev_readpages,
.writepage = blkdev_writepage,
.write_begin = blkdev_write_begin,
.write_end = blkdev_write_end,
diff --git a/fs/buffer.c b/fs/buffer.c
index 14537f3bf977..46eb2bcb082d 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2966,7 +2966,7 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
/*
* This allows us to do IO even on the odd last sectors
- * of a device, even if the bh block size is some multiple
+ * of a device, even if the block size is some multiple
* of the physical sector size.
*
* We'll just truncate the bio to the size of the device,
@@ -2976,10 +2976,11 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
* errors, this only handles the "we need to be able to
* do IO at the final sector" case.
*/
-static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
+void guard_bio_eod(int rw, struct bio *bio)
{
sector_t maxsector;
- unsigned bytes;
+ struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
+ unsigned truncated_bytes;
maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
if (!maxsector)
@@ -2994,23 +2995,20 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
return;
maxsector -= bio->bi_iter.bi_sector;
- bytes = bio->bi_iter.bi_size;
- if (likely((bytes >> 9) <= maxsector))
+ if (likely((bio->bi_iter.bi_size >> 9) <= maxsector))
return;
- /* Uhhuh. We've got a bh that straddles the device size! */
- bytes = maxsector << 9;
+ /* Uhhuh. We've got a bio that straddles the device size! */
+ truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9);
/* Truncate the bio.. */
- bio->bi_iter.bi_size = bytes;
- bio->bi_io_vec[0].bv_len = bytes;
+ bio->bi_iter.bi_size -= truncated_bytes;
+ bvec->bv_len -= truncated_bytes;
/* ..and clear the end of the buffer for reads */
if ((rw & RW_MASK) == READ) {
- void *kaddr = kmap_atomic(bh->b_page);
- memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes);
- kunmap_atomic(kaddr);
- flush_dcache_page(bh->b_page);
+ zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
+ truncated_bytes);
}
}
@@ -3051,7 +3049,7 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
bio->bi_flags |= bio_flags;
/* Take care of bh's that straddle the end of the device */
- guard_bh_eod(rw, bio, bh);
+ guard_bio_eod(rw, bio);
if (buffer_meta(bh))
rw |= REQ_META;
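The truncation math above is easy to check in isolation: instead of remapping the whole buffer_head, the rewrite computes how much of the last bvec hangs past the device and clips (and, for reads, zeroes) exactly that much. A userspace model of the clamp, assuming 512-byte sectors as in the kernel code (names are illustrative):

#include <stdio.h>

/* Given a bio of 'size' bytes starting at 512-byte sector 'sector' on a
 * device of 'dev_sectors' sectors, how many bytes must be chopped off
 * the final bvec? */
static unsigned int guard_eod(unsigned long long dev_sectors,
			      unsigned long long sector, unsigned int size)
{
	unsigned long long remaining = dev_sectors - sector;

	if ((size >> 9) <= remaining)
		return 0;				/* fits entirely */
	return size - (unsigned int)(remaining << 9);	/* truncated_bytes */
}

int main(void)
{
	/* 100-sector device, 8-sector I/O starting at sector 96: only
	 * 4 sectors fit, so 4 * 512 bytes get truncated (and zeroed,
	 * for reads). */
	printf("%u\n", guard_eod(100, 96, 8 << 9));	/* 2048 */
	return 0;
}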
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 7ff866dbb89e..54ac0e8ad96c 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -38,7 +38,7 @@ static const struct cifs_sid sid_everyone = {
1, 1, {0, 0, 0, 0, 0, 1}, {0} };
/* security id for Authenticated Users system group */
static const struct cifs_sid sid_authusers = {
- 1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(11)} };
+ 1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(11)} };
/* group users */
static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} };
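The __constant_cpu_to_le32() -> cpu_to_le32() conversions here and throughout the following cifs patches are pure cleanup: the modern macro constant-folds in initializers, so the __constant_ spelling is redundant. A userspace stand-in showing the observable behavior of the conversion (the kernel implementation differs):

#include <stdint.h>
#include <stdio.h>

/* Byte-swap on big-endian hosts, identity on little-endian ones.  The
 * kernel macro additionally folds to a constant in initializers, which
 * is why the __constant_ spellings are no longer needed. */
static uint32_t my_cpu_to_le32(uint32_t x)
{
	union { uint32_t v; uint8_t b[4]; } probe = { .v = 1 };

	if (probe.b[0])
		return x;	/* little-endian: nothing to do */
	return ((x & 0x000000ffu) << 24) | ((x & 0x0000ff00u) << 8) |
	       ((x & 0x00ff0000u) >> 8)  | ((x & 0xff000000u) >> 24);
}

int main(void)
{
	/* 11 is the Authenticated Users RID from sid_authusers above. */
	printf("0x%08x\n", my_cpu_to_le32(11));
	return 0;
}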
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 66f65001a6d8..86a2aa57560c 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -2477,14 +2477,14 @@ CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon,
}
parm_data = (struct cifs_posix_lock *)
((char *)&pSMBr->hdr.Protocol + data_offset);
- if (parm_data->lock_type == __constant_cpu_to_le16(CIFS_UNLCK))
+ if (parm_data->lock_type == cpu_to_le16(CIFS_UNLCK))
pLockData->fl_type = F_UNLCK;
else {
if (parm_data->lock_type ==
- __constant_cpu_to_le16(CIFS_RDLCK))
+ cpu_to_le16(CIFS_RDLCK))
pLockData->fl_type = F_RDLCK;
else if (parm_data->lock_type ==
- __constant_cpu_to_le16(CIFS_WRLCK))
+ cpu_to_le16(CIFS_WRLCK))
pLockData->fl_type = F_WRLCK;
pLockData->fl_start = le64_to_cpu(parm_data->start);
@@ -3276,25 +3276,25 @@ CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
pSMB->compression_state = cpu_to_le16(COMPRESSION_FORMAT_DEFAULT);
pSMB->TotalParameterCount = 0;
- pSMB->TotalDataCount = __constant_cpu_to_le32(2);
+ pSMB->TotalDataCount = cpu_to_le32(2);
pSMB->MaxParameterCount = 0;
pSMB->MaxDataCount = 0;
pSMB->MaxSetupCount = 4;
pSMB->Reserved = 0;
pSMB->ParameterOffset = 0;
- pSMB->DataCount = __constant_cpu_to_le32(2);
+ pSMB->DataCount = cpu_to_le32(2);
pSMB->DataOffset =
cpu_to_le32(offsetof(struct smb_com_transaction_compr_ioctl_req,
compression_state) - 4); /* 84 */
pSMB->SetupCount = 4;
- pSMB->SubCommand = __constant_cpu_to_le16(NT_TRANSACT_IOCTL);
+ pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_IOCTL);
pSMB->ParameterCount = 0;
- pSMB->FunctionCode = __constant_cpu_to_le32(FSCTL_SET_COMPRESSION);
+ pSMB->FunctionCode = cpu_to_le32(FSCTL_SET_COMPRESSION);
pSMB->IsFsctl = 1; /* FSCTL */
pSMB->IsRootFlag = 0;
pSMB->Fid = fid; /* file handle always le */
/* 3 byte pad, followed by 2 byte compress state */
- pSMB->ByteCount = __constant_cpu_to_le16(5);
+ pSMB->ByteCount = cpu_to_le16(5);
inc_rfc1001_len(pSMB, 5);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -3430,10 +3430,10 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
cifs_acl->version = cpu_to_le16(1);
if (acl_type == ACL_TYPE_ACCESS) {
cifs_acl->access_entry_count = cpu_to_le16(count);
- cifs_acl->default_entry_count = __constant_cpu_to_le16(0xFFFF);
+ cifs_acl->default_entry_count = cpu_to_le16(0xFFFF);
} else if (acl_type == ACL_TYPE_DEFAULT) {
cifs_acl->default_entry_count = cpu_to_le16(count);
- cifs_acl->access_entry_count = __constant_cpu_to_le16(0xFFFF);
+ cifs_acl->access_entry_count = cpu_to_le16(0xFFFF);
} else {
cifs_dbg(FYI, "unknown ACL type %d\n", acl_type);
return 0;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index dc3c7e6aff1f..0666df4a6790 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1066,7 +1066,7 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
max_num = (max_buf - sizeof(struct smb_hdr)) /
sizeof(LOCKING_ANDX_RANGE);
- buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
+ buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
if (!buf) {
free_xid(xid);
return -ENOMEM;
@@ -1401,7 +1401,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
max_num = (max_buf - sizeof(struct smb_hdr)) /
sizeof(LOCKING_ANDX_RANGE);
- buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
+ buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
if (!buf)
return -ENOMEM;
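The kzalloc(n * size) -> kcalloc(n, size) switches in these hunks matter because the open-coded multiplication can overflow silently. A userspace model of the overflow-checked variant, using the GCC/Clang __builtin_mul_overflow (the kernel's internal check is implemented differently):

#include <stdio.h>
#include <stdlib.h>

/* kcalloc(n, size, ...) vs kzalloc(n * size, ...): the former refuses
 * a multiplication that wraps instead of allocating a too-small buffer. */
static void *zalloc_array(size_t n, size_t size)
{
	size_t bytes;

	if (__builtin_mul_overflow(n, size, &bytes))
		return NULL;		/* n * size overflowed */
	return calloc(1, bytes);	/* zeroed, like kzalloc */
}

int main(void)
{
	void *p = zalloc_array((size_t)-1 / 8, 32);	/* would wrap */

	printf("%s\n", p ? "allocated" : "refused");	/* refused */
	free(p);
	return 0;
}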
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 3a5e83317683..07fe97ac6b0c 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -46,7 +46,7 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB)
CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4,
USHRT_MAX));
pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
- pSMB->req.VcNumber = __constant_cpu_to_le16(1);
+ pSMB->req.VcNumber = cpu_to_le16(1);
/* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 45992944e238..7198eac5dddd 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -111,7 +111,7 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
return -EINVAL;
max_num = max_buf / sizeof(struct smb2_lock_element);
- buf = kzalloc(max_num * sizeof(struct smb2_lock_element), GFP_KERNEL);
+ buf = kcalloc(max_num, sizeof(struct smb2_lock_element), GFP_KERNEL);
if (!buf)
return -ENOMEM;
@@ -247,7 +247,7 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile)
}
max_num = max_buf / sizeof(struct smb2_lock_element);
- buf = kzalloc(max_num * sizeof(struct smb2_lock_element), GFP_KERNEL);
+ buf = kcalloc(max_num, sizeof(struct smb2_lock_element), GFP_KERNEL);
if (!buf) {
free_xid(xid);
return -ENOMEM;
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 4aa7a0f07d6e..18af47c56345 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -67,27 +67,27 @@ check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid)
* indexed by command in host byte order
*/
static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = {
- /* SMB2_NEGOTIATE */ __constant_cpu_to_le16(65),
- /* SMB2_SESSION_SETUP */ __constant_cpu_to_le16(9),
- /* SMB2_LOGOFF */ __constant_cpu_to_le16(4),
- /* SMB2_TREE_CONNECT */ __constant_cpu_to_le16(16),
- /* SMB2_TREE_DISCONNECT */ __constant_cpu_to_le16(4),
- /* SMB2_CREATE */ __constant_cpu_to_le16(89),
- /* SMB2_CLOSE */ __constant_cpu_to_le16(60),
- /* SMB2_FLUSH */ __constant_cpu_to_le16(4),
- /* SMB2_READ */ __constant_cpu_to_le16(17),
- /* SMB2_WRITE */ __constant_cpu_to_le16(17),
- /* SMB2_LOCK */ __constant_cpu_to_le16(4),
- /* SMB2_IOCTL */ __constant_cpu_to_le16(49),
+ /* SMB2_NEGOTIATE */ cpu_to_le16(65),
+ /* SMB2_SESSION_SETUP */ cpu_to_le16(9),
+ /* SMB2_LOGOFF */ cpu_to_le16(4),
+ /* SMB2_TREE_CONNECT */ cpu_to_le16(16),
+ /* SMB2_TREE_DISCONNECT */ cpu_to_le16(4),
+ /* SMB2_CREATE */ cpu_to_le16(89),
+ /* SMB2_CLOSE */ cpu_to_le16(60),
+ /* SMB2_FLUSH */ cpu_to_le16(4),
+ /* SMB2_READ */ cpu_to_le16(17),
+ /* SMB2_WRITE */ cpu_to_le16(17),
+ /* SMB2_LOCK */ cpu_to_le16(4),
+ /* SMB2_IOCTL */ cpu_to_le16(49),
/* BB CHECK this ... not listed in documentation */
- /* SMB2_CANCEL */ __constant_cpu_to_le16(0),
- /* SMB2_ECHO */ __constant_cpu_to_le16(4),
- /* SMB2_QUERY_DIRECTORY */ __constant_cpu_to_le16(9),
- /* SMB2_CHANGE_NOTIFY */ __constant_cpu_to_le16(9),
- /* SMB2_QUERY_INFO */ __constant_cpu_to_le16(9),
- /* SMB2_SET_INFO */ __constant_cpu_to_le16(2),
+ /* SMB2_CANCEL */ cpu_to_le16(0),
+ /* SMB2_ECHO */ cpu_to_le16(4),
+ /* SMB2_QUERY_DIRECTORY */ cpu_to_le16(9),
+ /* SMB2_CHANGE_NOTIFY */ cpu_to_le16(9),
+ /* SMB2_QUERY_INFO */ cpu_to_le16(9),
+ /* SMB2_SET_INFO */ cpu_to_le16(2),
/* BB FIXME can also be 44 for lease break */
- /* SMB2_OPLOCK_BREAK */ __constant_cpu_to_le16(24)
+ /* SMB2_OPLOCK_BREAK */ cpu_to_le16(24)
};
int
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index f522193b7184..96101266c5e5 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -597,7 +597,7 @@ smb2_clone_range(const unsigned int xid,
goto cchunk_out;
/* For now array only one chunk long, will make more flexible later */
- pcchunk->ChunkCount = __constant_cpu_to_le32(1);
+ pcchunk->ChunkCount = cpu_to_le32(1);
pcchunk->Reserved = 0;
pcchunk->Reserved2 = 0;
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 74b3a6684383..40f8918fe6d1 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1357,7 +1357,7 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
char *ret_data = NULL;
fsctl_input.CompressionState =
- __constant_cpu_to_le16(COMPRESSION_FORMAT_DEFAULT);
+ cpu_to_le16(COMPRESSION_FORMAT_DEFAULT);
rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid,
FSCTL_SET_COMPRESSION, true /* is_fsctl */,
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index fbe486c285a9..6baa6cd430e0 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -85,7 +85,7 @@
/* BB FIXME - analyze following length BB */
#define MAX_SMB2_HDR_SIZE 0x78 /* 4 len + 64 hdr + (2*24 wct) + 2 bct + 2 pad */
-#define SMB2_PROTO_NUMBER __constant_cpu_to_le32(0x424d53fe)
+#define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe)
/*
* SMB2 Header Definition
@@ -96,7 +96,7 @@
*
*/
-#define SMB2_HEADER_STRUCTURE_SIZE __constant_cpu_to_le16(64)
+#define SMB2_HEADER_STRUCTURE_SIZE cpu_to_le16(64)
struct smb2_hdr {
__be32 smb2_buf_length; /* big endian on wire */
@@ -137,16 +137,16 @@ struct smb2_transform_hdr {
} __packed;
/* Encryption Algorithms */
-#define SMB2_ENCRYPTION_AES128_CCM __constant_cpu_to_le16(0x0001)
+#define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001)
/*
* SMB2 flag definitions
*/
-#define SMB2_FLAGS_SERVER_TO_REDIR __constant_cpu_to_le32(0x00000001)
-#define SMB2_FLAGS_ASYNC_COMMAND __constant_cpu_to_le32(0x00000002)
-#define SMB2_FLAGS_RELATED_OPERATIONS __constant_cpu_to_le32(0x00000004)
-#define SMB2_FLAGS_SIGNED __constant_cpu_to_le32(0x00000008)
-#define SMB2_FLAGS_DFS_OPERATIONS __constant_cpu_to_le32(0x10000000)
+#define SMB2_FLAGS_SERVER_TO_REDIR cpu_to_le32(0x00000001)
+#define SMB2_FLAGS_ASYNC_COMMAND cpu_to_le32(0x00000002)
+#define SMB2_FLAGS_RELATED_OPERATIONS cpu_to_le32(0x00000004)
+#define SMB2_FLAGS_SIGNED cpu_to_le32(0x00000008)
+#define SMB2_FLAGS_DFS_OPERATIONS cpu_to_le32(0x10000000)
/*
* Definitions for SMB2 Protocol Data Units (network frames)
@@ -157,7 +157,7 @@ struct smb2_transform_hdr {
*
*/
-#define SMB2_ERROR_STRUCTURE_SIZE2 __constant_cpu_to_le16(9)
+#define SMB2_ERROR_STRUCTURE_SIZE2 cpu_to_le16(9)
struct smb2_err_rsp {
struct smb2_hdr hdr;
@@ -500,12 +500,12 @@ struct create_context {
#define SMB2_LEASE_HANDLE_CACHING_HE 0x02
#define SMB2_LEASE_WRITE_CACHING_HE 0x04
-#define SMB2_LEASE_NONE __constant_cpu_to_le32(0x00)
-#define SMB2_LEASE_READ_CACHING __constant_cpu_to_le32(0x01)
-#define SMB2_LEASE_HANDLE_CACHING __constant_cpu_to_le32(0x02)
-#define SMB2_LEASE_WRITE_CACHING __constant_cpu_to_le32(0x04)
+#define SMB2_LEASE_NONE cpu_to_le32(0x00)
+#define SMB2_LEASE_READ_CACHING cpu_to_le32(0x01)
+#define SMB2_LEASE_HANDLE_CACHING cpu_to_le32(0x02)
+#define SMB2_LEASE_WRITE_CACHING cpu_to_le32(0x04)
-#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS __constant_cpu_to_le32(0x02)
+#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS cpu_to_le32(0x02)
#define SMB2_LEASE_KEY_SIZE 16
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index a8bc47f75fa0..5b6e9f246233 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -107,7 +107,10 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
}
if (!journal) {
- ret = generic_file_fsync(file, start, end, datasync);
+ if (test_opt(inode->i_sb, BARRIER))
+ ret = generic_file_fsync(file, start, end, datasync);
+ else
+ ret = __generic_file_fsync(file, start, end, datasync);
if (!ret && !hlist_empty(&inode->i_dentry))
ret = ext4_sync_parent(inode);
goto out;
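With no journal present, the change honors the nobarrier mount option: __generic_file_fsync() skips the device cache flush that generic_file_fsync() issues. A stubbed sketch of the dispatch (the helper bodies are placeholders, not the kernel implementations):

#include <stdio.h>

/* Stubs: the only difference between the two generic helpers is the
 * trailing device cache flush. */
static int sync_with_flush(void)    { puts("write + FLUSH");   return 0; }
static int sync_without_flush(void) { puts("write, no FLUSH"); return 0; }

static int fsync_nojournal(int barriers_enabled)
{
	return barriers_enabled ? sync_with_flush()
				: sync_without_flush();
}

int main(void)
{
	fsync_nojournal(1);	/* default mount: barriers on */
	fsync_nojournal(0);	/* mounted with -o nobarrier */
	return 0;
}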
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index 32602c667b4a..7892e6fddb66 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -38,21 +38,30 @@ int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *k1,
return hfsplus_strcmp(&k1->cat.name, &k2->cat.name);
}
-void hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *key,
- u32 parent, struct qstr *str)
+/* Generates key for catalog file/folders record. */
+int hfsplus_cat_build_key(struct super_block *sb,
+ hfsplus_btree_key *key, u32 parent, struct qstr *str)
{
- int len;
+ int len, err;
key->cat.parent = cpu_to_be32(parent);
- if (str) {
- hfsplus_asc2uni(sb, &key->cat.name, HFSPLUS_MAX_STRLEN,
- str->name, str->len);
- len = be16_to_cpu(key->cat.name.length);
- } else {
- key->cat.name.length = 0;
- len = 0;
- }
+ err = hfsplus_asc2uni(sb, &key->cat.name, HFSPLUS_MAX_STRLEN,
+ str->name, str->len);
+ if (unlikely(err < 0))
+ return err;
+
+ len = be16_to_cpu(key->cat.name.length);
key->key_len = cpu_to_be16(6 + 2 * len);
+ return 0;
+}
+
+/* Generates key for catalog thread record. */
+void hfsplus_cat_build_key_with_cnid(struct super_block *sb,
+ hfsplus_btree_key *key, u32 parent)
+{
+ key->cat.parent = cpu_to_be32(parent);
+ key->cat.name.length = 0;
+ key->key_len = cpu_to_be16(6);
}
static void hfsplus_cat_build_key_uni(hfsplus_btree_key *key, u32 parent,
@@ -167,11 +176,16 @@ static int hfsplus_fill_cat_thread(struct super_block *sb,
hfsplus_cat_entry *entry, int type,
u32 parentid, struct qstr *str)
{
+ int err;
+
entry->type = cpu_to_be16(type);
entry->thread.reserved = 0;
entry->thread.parentID = cpu_to_be32(parentid);
- hfsplus_asc2uni(sb, &entry->thread.nodeName, HFSPLUS_MAX_STRLEN,
+ err = hfsplus_asc2uni(sb, &entry->thread.nodeName, HFSPLUS_MAX_STRLEN,
str->name, str->len);
+ if (unlikely(err < 0))
+ return err;
+
return 10 + be16_to_cpu(entry->thread.nodeName.length) * 2;
}
@@ -183,7 +197,7 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid,
int err;
u16 type;
- hfsplus_cat_build_key(sb, fd->search_key, cnid, NULL);
+ hfsplus_cat_build_key_with_cnid(sb, fd->search_key, cnid);
err = hfs_brec_read(fd, &tmp, sizeof(hfsplus_cat_entry));
if (err)
return err;
@@ -250,11 +264,16 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,
if (err)
return err;
- hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
+ hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid);
entry_size = hfsplus_fill_cat_thread(sb, &entry,
S_ISDIR(inode->i_mode) ?
HFSPLUS_FOLDER_THREAD : HFSPLUS_FILE_THREAD,
dir->i_ino, str);
+ if (unlikely(entry_size < 0)) {
+ err = entry_size;
+ goto err2;
+ }
+
err = hfs_brec_find(&fd, hfs_find_rec_by_key);
if (err != -ENOENT) {
if (!err)
@@ -265,7 +284,10 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,
if (err)
goto err2;
- hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str);
+ err = hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str);
+ if (unlikely(err))
+ goto err1;
+
entry_size = hfsplus_cat_build_record(&entry, cnid, inode);
err = hfs_brec_find(&fd, hfs_find_rec_by_key);
if (err != -ENOENT) {
@@ -288,7 +310,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,
return 0;
err1:
- hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
+ hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid);
if (!hfs_brec_find(&fd, hfs_find_rec_by_key))
hfs_brec_remove(&fd);
err2:
@@ -313,7 +335,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
if (!str) {
int len;
- hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
+ hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid);
err = hfs_brec_find(&fd, hfs_find_rec_by_key);
if (err)
goto out;
@@ -329,7 +351,9 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
off + 2, len);
fd.search_key->key_len = cpu_to_be16(6 + len);
} else
- hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str);
+ err = hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str);
+ if (unlikely(err))
+ goto out;
err = hfs_brec_find(&fd, hfs_find_rec_by_key);
if (err)
@@ -360,7 +384,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
if (err)
goto out;
- hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
+ hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid);
err = hfs_brec_find(&fd, hfs_find_rec_by_key);
if (err)
goto out;
@@ -405,7 +429,11 @@ int hfsplus_rename_cat(u32 cnid,
dst_fd = src_fd;
/* find the old dir entry and read the data */
- hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name);
+ err = hfsplus_cat_build_key(sb, src_fd.search_key,
+ src_dir->i_ino, src_name);
+ if (unlikely(err))
+ goto out;
+
err = hfs_brec_find(&src_fd, hfs_find_rec_by_key);
if (err)
goto out;
@@ -419,7 +447,11 @@ int hfsplus_rename_cat(u32 cnid,
type = be16_to_cpu(entry.type);
/* create new dir entry with the data from the old entry */
- hfsplus_cat_build_key(sb, dst_fd.search_key, dst_dir->i_ino, dst_name);
+ err = hfsplus_cat_build_key(sb, dst_fd.search_key,
+ dst_dir->i_ino, dst_name);
+ if (unlikely(err))
+ goto out;
+
err = hfs_brec_find(&dst_fd, hfs_find_rec_by_key);
if (err != -ENOENT) {
if (!err)
@@ -436,7 +468,11 @@ int hfsplus_rename_cat(u32 cnid,
dst_dir->i_mtime = dst_dir->i_ctime = CURRENT_TIME_SEC;
/* finally remove the old entry */
- hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name);
+ err = hfsplus_cat_build_key(sb, src_fd.search_key,
+ src_dir->i_ino, src_name);
+ if (unlikely(err))
+ goto out;
+
err = hfs_brec_find(&src_fd, hfs_find_rec_by_key);
if (err)
goto out;
@@ -449,7 +485,7 @@ int hfsplus_rename_cat(u32 cnid,
src_dir->i_mtime = src_dir->i_ctime = CURRENT_TIME_SEC;
/* remove old thread entry */
- hfsplus_cat_build_key(sb, src_fd.search_key, cnid, NULL);
+ hfsplus_cat_build_key_with_cnid(sb, src_fd.search_key, cnid);
err = hfs_brec_find(&src_fd, hfs_find_rec_by_key);
if (err)
goto out;
@@ -459,9 +495,14 @@ int hfsplus_rename_cat(u32 cnid,
goto out;
/* create new thread entry */
- hfsplus_cat_build_key(sb, dst_fd.search_key, cnid, NULL);
+ hfsplus_cat_build_key_with_cnid(sb, dst_fd.search_key, cnid);
entry_size = hfsplus_fill_cat_thread(sb, &entry, type,
dst_dir->i_ino, dst_name);
+ if (unlikely(entry_size < 0)) {
+ err = entry_size;
+ goto out;
+ }
+
err = hfs_brec_find(&dst_fd, hfs_find_rec_by_key);
if (err != -ENOENT) {
if (!err)
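The catalog changes split the old key builder in two: hfsplus_cat_build_key() now returns an error when hfsplus_asc2uni() fails, while the new hfsplus_cat_build_key_with_cnid() covers the string-less case and cannot fail. A toy model of the resulting calling convention (the bodies are stand-ins):

#include <stdio.h>

/* The string-converting builder can fail; the cnid-only one cannot. */
static int build_key(const char *name)
{
	if (!name || !name[0])
		return -22;	/* stand-in for the asc2uni error */
	printf("key for \"%s\"\n", name);
	return 0;
}

static void build_key_with_cnid(unsigned int cnid)
{
	printf("thread key for cnid %u\n", cnid);
}

static int lookup(const char *name)
{
	int err = build_key(name);

	if (err)		/* propagate instead of searching */
		return err;	/* with a silently-empty key */
	build_key_with_cnid(16);
	return 0;
}

int main(void)
{
	int ok = lookup("file");
	int bad = lookup("");

	printf("ok=%d bad=%d\n", ok, bad);
	return 0;
}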
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 610a3260bef1..435bea231cc6 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -44,7 +44,10 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry,
err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
if (err)
return ERR_PTR(err);
- hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name);
+ err = hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino,
+ &dentry->d_name);
+ if (unlikely(err < 0))
+ goto fail;
again:
err = hfs_brec_read(&fd, &entry, sizeof(entry));
if (err) {
@@ -97,9 +100,11 @@ again:
be32_to_cpu(entry.file.permissions.dev);
str.len = sprintf(name, "iNode%d", linkid);
str.name = name;
- hfsplus_cat_build_key(sb, fd.search_key,
+ err = hfsplus_cat_build_key(sb, fd.search_key,
HFSPLUS_SB(sb)->hidden_dir->i_ino,
&str);
+ if (unlikely(err < 0))
+ goto fail;
goto again;
}
} else if (!dentry->d_fsdata)
@@ -145,7 +150,7 @@ static int hfsplus_readdir(struct file *file, struct dir_context *ctx)
err = -ENOMEM;
goto out;
}
- hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL);
+ hfsplus_cat_build_key_with_cnid(sb, fd.search_key, inode->i_ino);
err = hfs_brec_find(&fd, hfs_find_rec_by_key);
if (err)
goto out;
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index eb5e059f481a..b0441d65fa54 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -443,8 +443,10 @@ int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *k1,
const hfsplus_btree_key *k2);
int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *k1,
const hfsplus_btree_key *k2);
-void hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *key,
+int hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *key,
u32 parent, struct qstr *str);
+void hfsplus_cat_build_key_with_cnid(struct super_block *sb,
+ hfsplus_btree_key *key, u32 parent);
void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms);
int hfsplus_find_cat(struct super_block *sb, u32 cnid,
struct hfs_find_data *fd);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 4cf2024b87da..593af2fdcc2d 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -515,7 +515,9 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
err = hfs_find_init(sbi->cat_tree, &fd);
if (err)
goto out_put_root;
- hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str);
+ err = hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str);
+ if (unlikely(err < 0))
+ goto out_put_root;
if (!hfs_brec_read(&fd, &entry, sizeof(entry))) {
hfs_find_exit(&fd);
if (entry.type != cpu_to_be16(HFSPLUS_FOLDER))
diff --git a/fs/internal.h b/fs/internal.h
index bd4ac19f4d8f..9477f8f6aefc 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -35,6 +35,11 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait)
#endif
/*
+ * buffer.c
+ */
+extern void guard_bio_eod(int rw, struct bio *bio);
+
+/*
* char_dev.c
*/
extern void __init chrdev_init(void);
diff --git a/fs/mpage.c b/fs/mpage.c
index 5f9ed622274f..587c7ed4185d 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -28,6 +28,7 @@
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/cleancache.h>
+#include "internal.h"
/*
* I/O completion handler for multipage BIOs.
@@ -57,6 +58,7 @@ static void mpage_end_io(struct bio *bio, int err)
static struct bio *mpage_bio_submit(int rw, struct bio *bio)
{
bio->bi_end_io = mpage_end_io;
+ guard_bio_eod(rw, bio);
submit_bio(rw, bio);
return NULL;
}
@@ -480,6 +482,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
struct buffer_head map_bh;
loff_t i_size = i_size_read(inode);
int ret = 0;
+ int wr = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
if (page_has_buffers(page)) {
struct buffer_head *head = page_buffers(page);
@@ -588,7 +591,7 @@ page_is_mapped:
* This page will go to BIO. Do we need to send this BIO off first?
*/
if (bio && mpd->last_block_in_bio != blocks[0] - 1)
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
alloc_new:
if (bio == NULL) {
@@ -612,7 +615,7 @@ alloc_new:
*/
length = first_unmapped << blkbits;
if (bio_add_page(bio, page, length, 0) < length) {
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
goto alloc_new;
}
@@ -622,7 +625,7 @@ alloc_new:
set_page_writeback(page);
unlock_page(page);
if (boundary || (first_unmapped != blocks_per_page)) {
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
if (boundary_block) {
write_boundary_block(boundary_bdev,
boundary_block, 1 << blkbits);
@@ -634,7 +637,7 @@ alloc_new:
confused:
if (bio)
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
if (mpd->use_writepage) {
ret = mapping->a_ops->writepage(page, wbc);
@@ -690,8 +693,11 @@ mpage_writepages(struct address_space *mapping,
};
ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
- if (mpd.bio)
- mpage_bio_submit(WRITE, mpd.bio);
+ if (mpd.bio) {
+ int wr = (wbc->sync_mode == WB_SYNC_ALL ?
+ WRITE_SYNC : WRITE);
+ mpage_bio_submit(wr, mpd.bio);
+ }
}
blk_finish_plug(&plug);
return ret;
@@ -708,8 +714,11 @@ int mpage_writepage(struct page *page, get_block_t get_block,
.use_writepage = 0,
};
int ret = __mpage_writepage(page, wbc, &mpd);
- if (mpd.bio)
- mpage_bio_submit(WRITE, mpd.bio);
+ if (mpd.bio) {
+ int wr = (wbc->sync_mode == WB_SYNC_ALL ?
+ WRITE_SYNC : WRITE);
+ mpage_bio_submit(wr, mpd.bio);
+ }
return ret;
}
EXPORT_SYMBOL(mpage_writepage);
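All the submission sites above now derive the request type from wbc->sync_mode, so WB_SYNC_ALL writeback is tagged WRITE_SYNC and the block layer can prioritize it. The selection itself is a one-liner (the flag values below are illustrative, not the kernel's):

#include <stdio.h>

enum sync_mode { WB_SYNC_NONE, WB_SYNC_ALL };
enum rw_flags  { WR_NORMAL = 1, WR_SYNC = 3 };	/* illustrative values */

static int pick_rw(enum sync_mode mode)
{
	return mode == WB_SYNC_ALL ? WR_SYNC : WR_NORMAL;
}

int main(void)
{
	printf("background: %d, fsync: %d\n",
	       pick_rw(WB_SYNC_NONE), pick_rw(WB_SYNC_ALL));
	return 0;
}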
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index f5ec1ce7a532..643faa44f22b 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1,7 +1,7 @@
/*
* file.c - NTFS kernel file operations. Part of the Linux-NTFS project.
*
- * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
+ * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
*
* This program/include file is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published
@@ -410,7 +410,8 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
BUG_ON(!nr_pages);
err = nr = 0;
do {
- pages[nr] = find_lock_page(mapping, index);
+ pages[nr] = find_get_page_flags(mapping, index, FGP_LOCK |
+ FGP_ACCESSED);
if (!pages[nr]) {
if (!*cached_page) {
*cached_page = page_cache_alloc(mapping);
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 6c3296e546c3..9e1e112074fb 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -3208,7 +3208,7 @@ static void __exit exit_ntfs_fs(void)
}
MODULE_AUTHOR("Anton Altaparmakov <anton@tuxera.com>");
-MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.");
+MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.");
MODULE_VERSION(NTFS_VERSION);
MODULE_LICENSE("GPL");
#ifdef DEBUG
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index a93bf9892256..fcae9ef1a328 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5662,7 +5662,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
struct ocfs2_extent_tree *et,
u32 cpos, u32 phys_cpos, u32 len, int flags,
struct ocfs2_cached_dealloc_ctxt *dealloc,
- u64 refcount_loc)
+ u64 refcount_loc, bool refcount_tree_locked)
{
int ret, credits = 0, extra_blocks = 0;
u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
@@ -5676,11 +5676,13 @@ int ocfs2_remove_btree_range(struct inode *inode,
BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
OCFS2_HAS_REFCOUNT_FL));
- ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
- &ref_tree, NULL);
- if (ret) {
- mlog_errno(ret);
- goto bail;
+ if (!refcount_tree_locked) {
+ ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
+ &ref_tree, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto bail;
+ }
}
ret = ocfs2_prepare_refcount_change_for_del(inode,
@@ -7021,6 +7023,7 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
u64 refcount_loc = le64_to_cpu(di->i_refcount_loc);
struct ocfs2_extent_tree et;
struct ocfs2_cached_dealloc_ctxt dealloc;
+ struct ocfs2_refcount_tree *ref_tree = NULL;
ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
ocfs2_init_dealloc_ctxt(&dealloc);
@@ -7130,9 +7133,18 @@ start:
phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
+ if ((flags & OCFS2_EXT_REFCOUNTED) && trunc_len && !ref_tree) {
+ status = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
+ &ref_tree, NULL);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
status = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
phys_cpos, trunc_len, flags, &dealloc,
- refcount_loc);
+ refcount_loc, true);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -7147,6 +7159,8 @@ start:
goto start;
bail:
+ if (ref_tree)
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
ocfs2_schedule_truncate_log_flush(osb, 1);
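ocfs2_remove_btree_range() gains a refcount_tree_locked flag so that ocfs2_commit_truncate(), which now takes the refcount tree lock once around its retry loop, is not locked a second time inside the callee. A generic sketch of the caller-may-hold-the-lock pattern, with pthread mutexes standing in for the cluster lock (compile with -pthread):

#include <stdio.h>
#include <pthread.h>

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

/* The callee takes and drops the lock only when the caller did not. */
static int remove_range(int already_locked)
{
	if (!already_locked)
		pthread_mutex_lock(&tree_lock);

	puts("removing extents");

	if (!already_locked)
		pthread_mutex_unlock(&tree_lock);
	return 0;
}

int main(void)
{
	remove_range(0);		/* simple caller */

	pthread_mutex_lock(&tree_lock);	/* truncate-style caller: lock once, */
	remove_range(1);		/* call repeatedly, unlock at the end */
	remove_range(1);
	pthread_mutex_unlock(&tree_lock);
	return 0;
}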
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index ca381c584127..fb09b97db162 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -142,7 +142,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
struct ocfs2_extent_tree *et,
u32 cpos, u32 phys_cpos, u32 len, int flags,
struct ocfs2_cached_dealloc_ctxt *dealloc,
- u64 refcount_loc);
+ u64 refcount_loc, bool refcount_tree_locked);
int ocfs2_num_free_extents(struct ocfs2_super *osb,
struct ocfs2_extent_tree *et);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 4a231a166cf8..2e636217a360 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -894,7 +894,7 @@ void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
}
}
-static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
+static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc)
{
int i;
@@ -915,7 +915,11 @@ static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
page_cache_release(wc->w_target_page);
}
ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
+}
+static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
+{
+ ocfs2_unlock_pages(wc);
brelse(wc->w_di_bh);
kfree(wc);
}
@@ -1817,16 +1821,6 @@ try_again:
if (ret)
goto out_commit;
}
- /*
- * We don't want this to fail in ocfs2_write_end(), so do it
- * here.
- */
- ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret) {
- mlog_errno(ret);
- goto out_quota;
- }
/*
* Fill our page array first. That way we've grabbed enough so
@@ -1977,7 +1971,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
- int i;
+ int i, ret;
unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
struct inode *inode = mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -2027,6 +2021,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
}
}
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ copied = ret;
+ mlog_errno(ret);
+ goto out;
+ }
+
out_write_size:
pos += copied;
if (pos > i_size_read(inode)) {
@@ -2041,11 +2043,20 @@ out_write_size:
ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_journal_dirty(handle, wc->w_di_bh);
+out:
+ /* Unlock pages before dealloc, since dealloc needs to acquire the
+ * j_trans_barrier lock; otherwise we can deadlock, because the journal
+ * commit thread holds that lock and asks for the page lock when
+ * flushing data. Keep the unlock here to preserve the unlock order.
+ */
+ ocfs2_unlock_pages(wc);
+
ocfs2_commit_trans(osb, handle);
ocfs2_run_deallocs(osb, &wc->w_dealloc);
- ocfs2_free_write_ctxt(wc);
+ brelse(wc->w_di_bh);
+ kfree(wc);
return copied;
}
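The point of splitting ocfs2_unlock_pages() out of ocfs2_free_write_ctxt() is the lock ordering spelled out in the new comment: the journal commit thread holds j_trans_barrier and then wants the page lock, so the write path must drop its page locks before running deallocs. A userspace sketch of the safe ordering, with mutexes standing in for the real locks (compile with -pthread):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t j_trans_barrier = PTHREAD_MUTEX_INITIALIZER;

/* The committing thread takes j_trans_barrier, then the page lock. */
static void *journal_commit(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&j_trans_barrier);
	pthread_mutex_lock(&page_lock);		/* flush data pages */
	puts("journal: flushed");
	pthread_mutex_unlock(&page_lock);
	pthread_mutex_unlock(&j_trans_barrier);
	return NULL;
}

int main(void)
{
	pthread_t t;

	/* Writer path, after the fix: drop the page lock *before*
	 * touching anything that needs j_trans_barrier.  Holding
	 * page_lock while taking j_trans_barrier here would be the
	 * classic ABBA deadlock against journal_commit(). */
	pthread_mutex_lock(&page_lock);
	puts("writer: copied data");
	pthread_mutex_unlock(&page_lock);	/* ocfs2_unlock_pages() */

	pthread_mutex_lock(&j_trans_barrier);	/* dealloc side */
	puts("writer: dealloc");
	pthread_mutex_unlock(&j_trans_barrier);

	pthread_create(&t, NULL, journal_commit, NULL);
	pthread_join(t, NULL);
	return 0;
}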
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 73039295d0d1..d13385448168 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -2572,6 +2572,25 @@ int o2hb_check_node_heartbeating(u8 node_num)
}
EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating);
+int o2hb_check_node_heartbeating_no_sem(u8 node_num)
+{
+ unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ unsigned long flags;
+
+ spin_lock_irqsave(&o2hb_live_lock, flags);
+ o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map));
+ spin_unlock_irqrestore(&o2hb_live_lock, flags);
+ if (!test_bit(node_num, testing_map)) {
+ mlog(ML_HEARTBEAT,
+ "node (%u) does not have heartbeating enabled.\n",
+ node_num);
+ return 0;
+ }
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_no_sem);
+
int o2hb_check_node_heartbeating_from_callback(u8 node_num)
{
unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index 00ad8e8fea51..3ef5137dc362 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -80,6 +80,7 @@ void o2hb_fill_node_map(unsigned long *map,
void o2hb_exit(void);
int o2hb_init(void);
int o2hb_check_node_heartbeating(u8 node_num);
+int o2hb_check_node_heartbeating_no_sem(u8 node_num);
int o2hb_check_node_heartbeating_from_callback(u8 node_num);
int o2hb_check_local_node_heartbeating(void);
void o2hb_stop_all_regions(void);
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 0717662b4aef..e737a4ce7d7a 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -4477,7 +4477,7 @@ int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno);
ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 0,
- &dealloc, 0);
+ &dealloc, 0, false);
if (ret) {
mlog_errno(ret);
goto out;
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index b46278f9ae44..fd6bbbbd7d78 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -385,8 +385,12 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
head = &res->granted;
list_for_each_entry(lock, head, list) {
- if (lock->ml.cookie == cookie)
+ /* If the lock is found but an unlock is pending, ignore the BAST. */
+ if (lock->ml.cookie == cookie) {
+ if (lock->unlock_pending)
+ break;
goto do_ast;
+ }
}
mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, "
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 3fcf205ee900..02d315fef432 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -839,7 +839,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
* to back off and try again. This gives heartbeat a chance
* to catch up.
*/
- if (!o2hb_check_node_heartbeating(query->node_idx)) {
+ if (!o2hb_check_node_heartbeating_no_sem(query->node_idx)) {
mlog(0, "node %u is not in our live map yet\n",
query->node_idx);
@@ -1975,24 +1975,22 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
dlm = kzalloc(sizeof(*dlm), GFP_KERNEL);
if (!dlm) {
- mlog_errno(-ENOMEM);
+ ret = -ENOMEM;
+ mlog_errno(ret);
goto leave;
}
dlm->name = kstrdup(domain, GFP_KERNEL);
if (dlm->name == NULL) {
- mlog_errno(-ENOMEM);
- kfree(dlm);
- dlm = NULL;
+ ret = -ENOMEM;
+ mlog_errno(ret);
goto leave;
}
dlm->lockres_hash = (struct hlist_head **)dlm_alloc_pagevec(DLM_HASH_PAGES);
if (!dlm->lockres_hash) {
- mlog_errno(-ENOMEM);
- kfree(dlm->name);
- kfree(dlm);
- dlm = NULL;
+ ret = -ENOMEM;
+ mlog_errno(ret);
goto leave;
}
@@ -2002,11 +2000,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
dlm->master_hash = (struct hlist_head **)
dlm_alloc_pagevec(DLM_HASH_PAGES);
if (!dlm->master_hash) {
- mlog_errno(-ENOMEM);
- dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
- kfree(dlm->name);
- kfree(dlm);
- dlm = NULL;
+ ret = -ENOMEM;
+ mlog_errno(ret);
goto leave;
}
@@ -2017,14 +2012,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
dlm->node_num = o2nm_this_node();
ret = dlm_create_debugfs_subroot(dlm);
- if (ret < 0) {
- dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
- dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
- kfree(dlm->name);
- kfree(dlm);
- dlm = NULL;
+ if (ret < 0)
goto leave;
- }
spin_lock_init(&dlm->spinlock);
spin_lock_init(&dlm->master_lock);
@@ -2085,6 +2074,19 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
atomic_read(&dlm->dlm_refs.refcount));
leave:
+ if (ret < 0 && dlm) {
+ if (dlm->master_hash)
+ dlm_free_pagevec((void **)dlm->master_hash,
+ DLM_HASH_PAGES);
+
+ if (dlm->lockres_hash)
+ dlm_free_pagevec((void **)dlm->lockres_hash,
+ DLM_HASH_PAGES);
+
+ kfree(dlm->name);
+ kfree(dlm);
+ dlm = NULL;
+ }
return dlm;
}
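The dlm_alloc_ctxt() rework replaces per-site unwind code with a single cleanup block at the leave: label, freeing only what was actually allocated. A self-contained model of that centralized-unwind pattern:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct ctxt {
	char *name;
	void *hash;
};

/* Every failure jumps to one label that frees whatever was allocated
 * so far, instead of each site repeating a growing list of frees. */
static struct ctxt *alloc_ctxt(const char *domain)
{
	struct ctxt *c;
	int ret = 0;

	c = calloc(1, sizeof(*c));
	if (!c) {
		ret = -12;		/* -ENOMEM */
		goto leave;
	}
	c->name = strdup(domain);
	if (!c->name) {
		ret = -12;
		goto leave;
	}
	c->hash = malloc(4096);
	if (!c->hash) {
		ret = -12;
		goto leave;
	}
leave:
	if (ret < 0 && c) {		/* one unwind path for all sites */
		free(c->hash);
		free(c->name);
		free(c);
		c = NULL;
	}
	return c;
}

int main(void)
{
	struct ctxt *c = alloc_ctxt("example");

	printf("%s\n", c ? c->name : "(failed)");
	if (c) { free(c->hash); free(c->name); free(c); }
	return 0;
}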
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 3ec906ef5d9a..3c92053a60a8 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -498,16 +498,6 @@ static void dlm_lockres_release(struct kref *kref)
mlog(0, "destroying lockres %.*s\n", res->lockname.len,
res->lockname.name);
- spin_lock(&dlm->track_lock);
- if (!list_empty(&res->tracking))
- list_del_init(&res->tracking);
- else {
- mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n",
- res->lockname.len, res->lockname.name);
- dlm_print_one_lock_resource(res);
- }
- spin_unlock(&dlm->track_lock);
-
atomic_dec(&dlm->res_cur_count);
if (!hlist_unhashed(&res->hash_node) ||
@@ -625,9 +615,6 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
return res;
error:
- if (res && res->lockname.name)
- kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);
-
if (res)
kmem_cache_free(dlm_lockres_cache, res);
return NULL;
@@ -694,14 +681,6 @@ void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
res->inflight_assert_workers);
}
-static void dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res)
-{
- spin_lock(&res->spinlock);
- __dlm_lockres_grab_inflight_worker(dlm, res);
- spin_unlock(&res->spinlock);
-}
-
static void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
@@ -1635,6 +1614,7 @@ send_response:
}
mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
dlm->node_num, res->lockname.len, res->lockname.name);
+ spin_lock(&res->spinlock);
ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx,
DLM_ASSERT_MASTER_MLE_CLEANUP);
if (ret < 0) {
@@ -1642,7 +1622,8 @@ send_response:
response = DLM_MASTER_RESP_ERROR;
dlm_lockres_put(res);
} else
- dlm_lockres_grab_inflight_worker(dlm, res);
+ __dlm_lockres_grab_inflight_worker(dlm, res);
+ spin_unlock(&res->spinlock);
} else {
if (res)
dlm_lockres_put(res);
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 69aac6f088ad..2e5e6d5fffe8 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -211,6 +211,16 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
__dlm_unhash_lockres(dlm, res);
+ spin_lock(&dlm->track_lock);
+ if (!list_empty(&res->tracking))
+ list_del_init(&res->tracking);
+ else {
+ mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n",
+ res->lockname.len, res->lockname.name);
+ __dlm_print_one_lock_resource(res);
+ }
+ spin_unlock(&dlm->track_lock);
+
/* lockres is not in the hash now. drop the flag and wake up
* any processes waiting in dlm_get_lock_resource. */
if (!master) {
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 2930e231f3f9..319c858e70bf 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1804,7 +1804,7 @@ static int ocfs2_remove_inode_range(struct inode *inode,
ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
phys_cpos, trunc_len, flags,
- &dealloc, refcount_loc);
+ &dealloc, refcount_loc, false);
if (ret < 0) {
mlog_errno(ret);
goto out;
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 437de7f768c6..21708cda4b8a 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -1192,17 +1192,9 @@ void ocfs2_evict_inode(struct inode *inode)
int ocfs2_drop_inode(struct inode *inode)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- int res;
-
trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno,
inode->i_nlink, oi->ip_flags);
-
- if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
- res = 1;
- else
- res = generic_drop_inode(inode);
-
- return res;
+ return 1;
}
/*
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index a6c991c0fc98..a9b76de46047 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -162,7 +162,7 @@ static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode)
{
int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9;
- return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits);
+ return (blkcnt_t)OCFS2_I(inode)->ip_clusters << c_to_s_bits;
}
/* Validate that a bh contains a valid inode */
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 6219aaadeb08..74caffeeee1d 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -404,7 +404,7 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode,
* 'vict_blkno' was out of the valid range.
*/
if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
- (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) <<
+ (vict_blkno >= ((u64)le32_to_cpu(ac_dinode->id1.bitmap1.i_total) <<
bits_per_unit))) {
ret = -EINVAL;
goto out;
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 13a8537d8e8b..720aa389e0ea 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -591,7 +591,7 @@ static int ocfs2_control_release(struct inode *inode, struct file *file)
*/
ocfs2_control_this_node = -1;
running_proto.pv_major = 0;
- running_proto.pv_major = 0;
+ running_proto.pv_minor = 0;
}
out:
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 00cd85f18bcc..1133418f0f54 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -632,29 +632,35 @@ static const struct file_operations proc_single_file_operations = {
.release = single_release,
};
-static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
+
+struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode)
{
- struct task_struct *task = get_proc_task(file_inode(file));
- struct mm_struct *mm;
+ struct task_struct *task = get_proc_task(inode);
+ struct mm_struct *mm = ERR_PTR(-ESRCH);
- if (!task)
- return -ESRCH;
+ if (task) {
+ mm = mm_access(task, mode);
+ put_task_struct(task);
- mm = mm_access(task, mode);
- put_task_struct(task);
+ if (!IS_ERR_OR_NULL(mm)) {
+ /* ensure this mm_struct can't be freed */
+ atomic_inc(&mm->mm_count);
+ /* but do not pin its memory */
+ mmput(mm);
+ }
+ }
+
+ return mm;
+}
+
+static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
+{
+ struct mm_struct *mm = proc_mem_open(inode, mode);
if (IS_ERR(mm))
return PTR_ERR(mm);
- if (mm) {
- /* ensure this mm_struct can't be freed */
- atomic_inc(&mm->mm_count);
- /* but do not pin its memory */
- mmput(mm);
- }
-
file->private_data = mm;
-
return 0;
}
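proc_mem_open() pins the mm_struct itself (mm_count) while releasing the address-space reference (mm_users), so a long-lived /proc file does not keep a dead task's memory around; readers later revive a user reference with atomic_inc_not_zero(), as the task_mmu.c changes below show. A two-counter toy model of that split (plain ints here; the kernel uses atomics):

#include <stdio.h>

/* 'users' pins the address space, 'count' pins only the struct.
 * proc_mem_open() keeps 'count' across the file's lifetime; each read
 * re-acquires 'users' only if it is still nonzero. */
struct mm {
	int users;	/* mm_users: address space torn down at 0 */
	int count;	/* mm_count: struct freed when this hits 0 */
};

static int get_user_ref(struct mm *mm)
{
	if (mm->users == 0)
		return 0;	/* task exited: inc_not_zero fails */
	mm->users++;
	return 1;
}

int main(void)
{
	struct mm mm = { .users = 1, .count = 1 };

	mm.count++; mm.users++;	/* mm_access() semantics, roughly */
	mm.users--;		/* mmput(): don't pin the memory... */
	/* ...but mm.count still holds the struct for the open file. */

	printf("read while alive: %d\n", get_user_ref(&mm));	/* 1 */
	mm.users -= 2;		/* task exits and our reader drops */
	printf("read after exit: %d\n", get_user_ref(&mm));	/* 0 */
	return 0;
}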
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 7da13e49128a..aa7a0ee182e1 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -268,8 +268,9 @@ extern int proc_remount(struct super_block *, int *, char *);
* task_[no]mmu.c
*/
struct proc_maps_private {
- struct pid *pid;
+ struct inode *inode;
struct task_struct *task;
+ struct mm_struct *mm;
#ifdef CONFIG_MMU
struct vm_area_struct *tail_vma;
#endif
@@ -278,6 +279,8 @@ struct proc_maps_private {
#endif
};
+struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode);
+
extern const struct file_operations proc_pid_maps_operations;
extern const struct file_operations proc_tid_maps_operations;
extern const struct file_operations proc_pid_numa_maps_operations;
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 6df8d0722c97..91a4e6426321 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -610,8 +610,10 @@ static void __init proc_kcore_text_init(void)
struct kcore_list kcore_modules;
static void __init add_modules_range(void)
{
- kclist_add(&kcore_modules, (void *)MODULES_VADDR,
+ if (MODULES_VADDR != VMALLOC_START && MODULES_END != VMALLOC_END) {
+ kclist_add(&kcore_modules, (void *)MODULES_VADDR,
MODULES_END - MODULES_VADDR, KCORE_VMALLOC);
+ }
}
#else
static void __init add_modules_range(void)
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index aa1eee06420f..f897fbf278c6 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -138,6 +138,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
"AnonHugePages: %8lu kB\n"
#endif
+#ifdef CONFIG_MEMORY_BALLOON
+ "BalloonPages: %8lu kB\n"
+#endif
,
K(i.totalram),
K(i.freeram),
@@ -193,6 +196,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
HPAGE_PMD_NR)
#endif
+#ifdef CONFIG_MEMORY_BALLOON
+ ,K(global_page_state(NR_BALLOON_PAGES))
+#endif
);
hugetlb_report_meminfo(m);
diff --git a/fs/proc/page.c b/fs/proc/page.c
index e647c55275d9..1e3187da1fed 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -133,6 +133,9 @@ u64 stable_page_flags(struct page *page)
if (PageBuddy(page))
u |= 1 << KPF_BUDDY;
+ if (PageBalloon(page))
+ u |= 1 << KPF_BALLOON;
+
u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index dfc791c42d64..7810aba88c46 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -87,32 +87,14 @@ unsigned long task_statm(struct mm_struct *mm,
#ifdef CONFIG_NUMA
/*
- * These functions are for numa_maps but called in generic **maps seq_file
- * ->start(), ->stop() ops.
- *
- * numa_maps scans all vmas under mmap_sem and checks their mempolicy.
- * Each mempolicy object is controlled by reference counting. The problem here
- * is how to avoid accessing dead mempolicy object.
- *
- * Because we're holding mmap_sem while reading seq_file, it's safe to access
- * each vma's mempolicy, no vma objects will never drop refs to mempolicy.
- *
- * A task's mempolicy (task->mempolicy) has different behavior. task->mempolicy
- * is set and replaced under mmap_sem but unrefed and cleared under task_lock().
- * So, without task_lock(), we cannot trust get_vma_policy() because we cannot
- * gurantee the task never exits under us. But taking task_lock() around
- * get_vma_plicy() causes lock order problem.
- *
- * To access task->mempolicy without lock, we hold a reference count of an
- * object pointed by task->mempolicy and remember it. This will guarantee
- * that task->mempolicy points to an alive object or NULL in numa_maps accesses.
+ * Save get_task_policy() for show_numa_map().
*/
static void hold_task_mempolicy(struct proc_maps_private *priv)
{
struct task_struct *task = priv->task;
task_lock(task);
- priv->task_mempolicy = task->mempolicy;
+ priv->task_mempolicy = get_task_policy(task);
mpol_get(priv->task_mempolicy);
task_unlock(task);
}
@@ -129,124 +111,155 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
}
#endif
-static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma)
+static void vma_stop(struct proc_maps_private *priv)
{
- if (vma && vma != priv->tail_vma) {
- struct mm_struct *mm = vma->vm_mm;
- release_task_mempolicy(priv);
- up_read(&mm->mmap_sem);
- mmput(mm);
- }
+ struct mm_struct *mm = priv->mm;
+
+ release_task_mempolicy(priv);
+ up_read(&mm->mmap_sem);
+ mmput(mm);
}
-static void *m_start(struct seq_file *m, loff_t *pos)
+static struct vm_area_struct *
+m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma)
+{
+ if (vma == priv->tail_vma)
+ return NULL;
+ return vma->vm_next ?: priv->tail_vma;
+}
+
+static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma)
+{
+ if (m->count < m->size) /* vma is copied successfully */
+ m->version = m_next_vma(m->private, vma) ? vma->vm_start : -1UL;
+}
+
+static void *m_start(struct seq_file *m, loff_t *ppos)
{
struct proc_maps_private *priv = m->private;
unsigned long last_addr = m->version;
struct mm_struct *mm;
- struct vm_area_struct *vma, *tail_vma = NULL;
- loff_t l = *pos;
-
- /* Clear the per syscall fields in priv */
- priv->task = NULL;
- priv->tail_vma = NULL;
-
- /*
- * We remember last_addr rather than next_addr to hit with
- * vmacache most of the time. We have zero last_addr at
- * the beginning and also after lseek. We will have -1 last_addr
- * after the end of the vmas.
- */
+ struct vm_area_struct *vma;
+ unsigned int pos = *ppos;
+ /* See m_cache_vma(). Zero at the start or after lseek. */
if (last_addr == -1UL)
return NULL;
- priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
+ priv->task = get_proc_task(priv->inode);
if (!priv->task)
return ERR_PTR(-ESRCH);
- mm = mm_access(priv->task, PTRACE_MODE_READ);
- if (!mm || IS_ERR(mm))
- return mm;
- down_read(&mm->mmap_sem);
+ mm = priv->mm;
+ if (!mm || !atomic_inc_not_zero(&mm->mm_users))
+ return NULL;
- tail_vma = get_gate_vma(priv->task->mm);
- priv->tail_vma = tail_vma;
+ down_read(&mm->mmap_sem);
hold_task_mempolicy(priv);
- /* Start with last addr hint */
- vma = find_vma(mm, last_addr);
- if (last_addr && vma) {
- vma = vma->vm_next;
- goto out;
+ priv->tail_vma = get_gate_vma(mm);
+
+ if (last_addr) {
+ vma = find_vma(mm, last_addr);
+ if (vma && (vma = m_next_vma(priv, vma)))
+ return vma;
}
- /*
- * Check the vma index is within the range and do
- * sequential scan until m_index.
- */
- vma = NULL;
- if ((unsigned long)l < mm->map_count) {
- vma = mm->mmap;
- while (l-- && vma)
+ m->version = 0;
+ if (pos < mm->map_count) {
+ for (vma = mm->mmap; pos; pos--) {
+ m->version = vma->vm_start;
vma = vma->vm_next;
- goto out;
+ }
+ return vma;
}
- if (l != mm->map_count)
- tail_vma = NULL; /* After gate vma */
-
-out:
- if (vma)
- return vma;
+ /* we do not bother to update m->version in this case */
+ if (pos == mm->map_count && priv->tail_vma)
+ return priv->tail_vma;
- release_task_mempolicy(priv);
- /* End of vmas has been reached */
- m->version = (tail_vma != NULL)? 0: -1UL;
- up_read(&mm->mmap_sem);
- mmput(mm);
- return tail_vma;
+ vma_stop(priv);
+ return NULL;
}
static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
struct proc_maps_private *priv = m->private;
- struct vm_area_struct *vma = v;
- struct vm_area_struct *tail_vma = priv->tail_vma;
+ struct vm_area_struct *next;
(*pos)++;
- if (vma && (vma != tail_vma) && vma->vm_next)
- return vma->vm_next;
- vma_stop(priv, vma);
- return (vma != tail_vma)? tail_vma: NULL;
+ next = m_next_vma(priv, v);
+ if (!next)
+ vma_stop(priv);
+ return next;
}
static void m_stop(struct seq_file *m, void *v)
{
struct proc_maps_private *priv = m->private;
- struct vm_area_struct *vma = v;
- if (!IS_ERR(vma))
- vma_stop(priv, vma);
- if (priv->task)
+ if (!IS_ERR_OR_NULL(v))
+ vma_stop(priv);
+ if (priv->task) {
put_task_struct(priv->task);
+ priv->task = NULL;
+ }
+}
+
+static int proc_maps_open(struct inode *inode, struct file *file,
+ const struct seq_operations *ops, int psize)
+{
+ struct proc_maps_private *priv = __seq_open_private(file, ops, psize);
+
+ if (!priv)
+ return -ENOMEM;
+
+ priv->inode = inode;
+ priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
+ if (IS_ERR(priv->mm)) {
+ int err = PTR_ERR(priv->mm);
+
+ seq_release_private(inode, file);
+ return err;
+ }
+
+ return 0;
}
+static int proc_map_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+ struct proc_maps_private *priv = seq->private;
+
+ if (priv->mm)
+ mmdrop(priv->mm);
+
+ return seq_release_private(inode, file);
+}
+
+
static int do_maps_open(struct inode *inode, struct file *file,
const struct seq_operations *ops)
{
- struct proc_maps_private *priv;
- int ret = -ENOMEM;
- priv = kzalloc(sizeof(*priv), GFP_KERNEL);
- if (priv) {
- priv->pid = proc_pid(inode);
- ret = seq_open(file, ops);
- if (!ret) {
- struct seq_file *m = file->private_data;
- m->private = priv;
- } else {
- kfree(priv);
- }
+ return proc_maps_open(inode, file, ops,
+ sizeof(struct proc_maps_private));
+}
+
+static pid_t pid_of_stack(struct proc_maps_private *priv,
+ struct vm_area_struct *vma, bool is_pid)
+{
+ struct inode *inode = priv->inode;
+ struct task_struct *task;
+ pid_t ret = 0;
+
+ rcu_read_lock();
+ task = pid_task(proc_pid(inode), PIDTYPE_PID);
+ if (task) {
+ task = task_of_stack(task, vma, is_pid);
+ if (task)
+ ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info);
}
+ rcu_read_unlock();
+
return ret;
}
@@ -256,7 +269,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
struct mm_struct *mm = vma->vm_mm;
struct file *file = vma->vm_file;
struct proc_maps_private *priv = m->private;
- struct task_struct *task = priv->task;
vm_flags_t flags = vma->vm_flags;
unsigned long ino = 0;
unsigned long long pgoff = 0;
@@ -321,8 +333,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
goto done;
}
- tid = vm_is_stack(task, vma, is_pid);
-
+ tid = pid_of_stack(priv, vma, is_pid);
if (tid != 0) {
/*
* Thread stack in /proc/PID/task/TID/maps or
@@ -349,15 +360,8 @@ done:
static int show_map(struct seq_file *m, void *v, int is_pid)
{
- struct vm_area_struct *vma = v;
- struct proc_maps_private *priv = m->private;
- struct task_struct *task = priv->task;
-
- show_map_vma(m, vma, is_pid);
-
- if (m->count < m->size) /* vma is copied successfully */
- m->version = (vma != get_gate_vma(task->mm))
- ? vma->vm_start : 0;
+ show_map_vma(m, v, is_pid);
+ m_cache_vma(m, v);
return 0;
}
@@ -399,14 +403,14 @@ const struct file_operations proc_pid_maps_operations = {
.open = pid_maps_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = proc_map_release,
};
const struct file_operations proc_tid_maps_operations = {
.open = tid_maps_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = proc_map_release,
};
/*
@@ -583,8 +587,6 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
static int show_smap(struct seq_file *m, void *v, int is_pid)
{
- struct proc_maps_private *priv = m->private;
- struct task_struct *task = priv->task;
struct vm_area_struct *vma = v;
struct mem_size_stats mss;
struct mm_walk smaps_walk = {
@@ -637,10 +639,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
mss.nonlinear >> 10);
show_smap_vma_flags(m, vma);
-
- if (m->count < m->size) /* vma is copied successfully */
- m->version = (vma != get_gate_vma(task->mm))
- ? vma->vm_start : 0;
+ m_cache_vma(m, vma);
return 0;
}
@@ -682,14 +681,14 @@ const struct file_operations proc_pid_smaps_operations = {
.open = pid_smaps_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = proc_map_release,
};
const struct file_operations proc_tid_smaps_operations = {
.open = tid_smaps_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = proc_map_release,
};
/*
@@ -1406,7 +1405,6 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
struct vm_area_struct *vma = v;
struct numa_maps *md = &numa_priv->md;
struct file *file = vma->vm_file;
- struct task_struct *task = proc_priv->task;
struct mm_struct *mm = vma->vm_mm;
struct mm_walk walk = {};
struct mempolicy *pol;
@@ -1426,9 +1424,13 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
walk.private = md;
walk.mm = mm;
- pol = get_vma_policy(task, vma, vma->vm_start);
- mpol_to_str(buffer, sizeof(buffer), pol);
- mpol_cond_put(pol);
+ pol = __get_vma_policy(vma, vma->vm_start);
+ if (pol) {
+ mpol_to_str(buffer, sizeof(buffer), pol);
+ mpol_cond_put(pol);
+ } else {
+ mpol_to_str(buffer, sizeof(buffer), proc_priv->task_mempolicy);
+ }
seq_printf(m, "%08lx %s", vma->vm_start, buffer);
@@ -1438,7 +1440,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
seq_puts(m, " heap");
} else {
- pid_t tid = vm_is_stack(task, vma, is_pid);
+ pid_t tid = pid_of_stack(proc_priv, vma, is_pid);
if (tid != 0) {
/*
* Thread stack in /proc/PID/task/TID/maps or
@@ -1486,9 +1488,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
seq_printf(m, " N%d=%lu", nid, md->node[nid]);
out:
seq_putc(m, '\n');
-
- if (m->count < m->size)
- m->version = (vma != proc_priv->tail_vma) ? vma->vm_start : 0;
+ m_cache_vma(m, vma);
return 0;
}
@@ -1519,20 +1519,8 @@ static const struct seq_operations proc_tid_numa_maps_op = {
static int numa_maps_open(struct inode *inode, struct file *file,
const struct seq_operations *ops)
{
- struct numa_maps_private *priv;
- int ret = -ENOMEM;
- priv = kzalloc(sizeof(*priv), GFP_KERNEL);
- if (priv) {
- priv->proc_maps.pid = proc_pid(inode);
- ret = seq_open(file, ops);
- if (!ret) {
- struct seq_file *m = file->private_data;
- m->private = priv;
- } else {
- kfree(priv);
- }
- }
- return ret;
+ return proc_maps_open(inode, file, ops,
+ sizeof(struct numa_maps_private));
}
static int pid_numa_maps_open(struct inode *inode, struct file *file)
@@ -1549,13 +1537,13 @@ const struct file_operations proc_pid_numa_maps_operations = {
.open = pid_numa_maps_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = proc_map_release,
};
const struct file_operations proc_tid_numa_maps_operations = {
.open = tid_numa_maps_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = proc_map_release,
};
#endif /* CONFIG_NUMA */
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 678455d2d683..0283e5c561f9 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -123,6 +123,25 @@ unsigned long task_statm(struct mm_struct *mm,
return size;
}
+static pid_t pid_of_stack(struct proc_maps_private *priv,
+ struct vm_area_struct *vma, bool is_pid)
+{
+ struct inode *inode = priv->inode;
+ struct task_struct *task;
+ pid_t ret = 0;
+
+ rcu_read_lock();
+ task = pid_task(proc_pid(inode), PIDTYPE_PID);
+ if (task) {
+ task = task_of_stack(task, vma, is_pid);
+ if (task)
+ ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info);
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
/*
* display a single VMA to a sequenced file
*/
@@ -163,7 +182,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
seq_pad(m, ' ');
seq_path(m, &file->f_path, "");
} else if (mm) {
- pid_t tid = vm_is_stack(priv->task, vma, is_pid);
+ pid_t tid = pid_of_stack(priv, vma, is_pid);
if (tid != 0) {
seq_pad(m, ' ');
@@ -212,22 +231,22 @@ static void *m_start(struct seq_file *m, loff_t *pos)
loff_t n = *pos;
/* pin the task and mm whilst we play with them */
- priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
+ priv->task = get_proc_task(priv->inode);
if (!priv->task)
return ERR_PTR(-ESRCH);
- mm = mm_access(priv->task, PTRACE_MODE_READ);
- if (!mm || IS_ERR(mm)) {
- put_task_struct(priv->task);
- priv->task = NULL;
- return mm;
- }
- down_read(&mm->mmap_sem);
+ mm = priv->mm;
+ if (!mm || !atomic_inc_not_zero(&mm->mm_users))
+ return NULL;
+ down_read(&mm->mmap_sem);
/* start from the Nth VMA */
for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
if (n-- == 0)
return p;
+
+ up_read(&mm->mmap_sem);
+ mmput(mm);
return NULL;
}
@@ -235,11 +254,13 @@ static void m_stop(struct seq_file *m, void *_vml)
{
struct proc_maps_private *priv = m->private;
+ if (!IS_ERR_OR_NULL(_vml)) {
+ up_read(&priv->mm->mmap_sem);
+ mmput(priv->mm);
+ }
if (priv->task) {
- struct mm_struct *mm = priv->task->mm;
- up_read(&mm->mmap_sem);
- mmput(mm);
put_task_struct(priv->task);
+ priv->task = NULL;
}
}
@@ -268,21 +289,33 @@ static const struct seq_operations proc_tid_maps_ops = {
static int maps_open(struct inode *inode, struct file *file,
const struct seq_operations *ops)
{
- struct proc_maps_private *priv;
- int ret = -ENOMEM;
-
- priv = kzalloc(sizeof(*priv), GFP_KERNEL);
- if (priv) {
- priv->pid = proc_pid(inode);
- ret = seq_open(file, ops);
- if (!ret) {
- struct seq_file *m = file->private_data;
- m->private = priv;
- } else {
- kfree(priv);
- }
+ struct proc_maps_private *priv = __seq_open_private(file, ops,
+ sizeof(struct proc_maps_private));
+ if (!priv)
+ return -ENOMEM;
+
+ priv->inode = inode;
+ priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
+ if (IS_ERR(priv->mm)) {
+ int err = PTR_ERR(priv->mm);
+
+ seq_release_private(inode, file);
+ return err;
}
- return ret;
+
+ return 0;
+}
+
+
+static int map_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+ struct proc_maps_private *priv = seq->private;
+
+ if (priv->mm)
+ mmdrop(priv->mm);
+
+ return seq_release_private(inode, file);
}
static int pid_maps_open(struct inode *inode, struct file *file)
@@ -299,13 +332,13 @@ const struct file_operations proc_pid_maps_operations = {
.open = pid_maps_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = map_release,
};
const struct file_operations proc_tid_maps_operations = {
.open = tid_maps_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = map_release,
};
diff --git a/include/asm-generic/dma-mapping-common.h b/include/asm-generic/dma-mapping-common.h
index d137431bf26f..3378dcf4c31e 100644
--- a/include/asm-generic/dma-mapping-common.h
+++ b/include/asm-generic/dma-mapping-common.h
@@ -179,6 +179,15 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
void *cpu_addr, dma_addr_t dma_addr, size_t size);
+void *dma_common_contiguous_remap(struct page *page, size_t size,
+ unsigned long vm_flags,
+ pgprot_t prot, const void *caller);
+
+void *dma_common_pages_remap(struct page **pages, size_t size,
+ unsigned long vm_flags, pgprot_t prot,
+ const void *caller);
+void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags);
+
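These helpers factor out the vmalloc-area remapping that the arm and arm64 dma-mapping changes earlier in this diff now share. A hedged sketch of how an architecture allocator might call them; the pgprot choice and function names are illustrative, not taken from the patch:

/* Illustrative: remap a contiguous allocation as write-combined. */
static void *sketch_remap_coherent(struct page *page, size_t size)
{
	/* VM_USERMAP permits a later remap of the area to userspace */
	return dma_common_contiguous_remap(page, size, VM_USERMAP,
					   pgprot_writecombine(PAGE_KERNEL),
					   __builtin_return_address(0));
}

static void sketch_unmap_coherent(void *cpu_addr, size_t size)
{
	dma_common_free_remap(cpu_addr, size, VM_USERMAP);
}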
/**
* dma_mmap_attrs - map a coherent DMA allocation into user space
* @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 53b2acc38213..281870f56450 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -660,11 +660,12 @@ static inline int pmd_trans_unstable(pmd_t *pmd)
}
#ifdef CONFIG_NUMA_BALANCING
-#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
/*
- * _PAGE_NUMA works identical to _PAGE_PROTNONE (it's actually the
- * same bit too). It's set only when _PAGE_PRESET is not set and it's
- * never set if _PAGE_PRESENT is set.
+ * _PAGE_NUMA distinguishes between an unmapped page table entry, an entry that
+ * is protected for PROT_NONE and a NUMA hinting fault entry. If the
+ * architecture defines __PAGE_PROTNONE then it should take that into account
+ * but those that do not can rely on the fact that the NUMA hinting scanner
+ * skips inaccessible VMAs.
*
* pte/pmd_present() returns true if pte/pmd_numa returns true. Page
* fault triggers on those regions if pte/pmd_numa returns true
@@ -673,16 +674,14 @@ static inline int pmd_trans_unstable(pmd_t *pmd)
#ifndef pte_numa
static inline int pte_numa(pte_t pte)
{
- return (pte_flags(pte) &
- (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)) == _PAGE_NUMA;
+ return ptenuma_flags(pte) == _PAGE_NUMA;
}
#endif
#ifndef pmd_numa
static inline int pmd_numa(pmd_t pmd)
{
- return (pmd_flags(pmd) &
- (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)) == _PAGE_NUMA;
+ return pmdnuma_flags(pmd) == _PAGE_NUMA;
}
#endif
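The generic code now expects each architecture to supply ptenuma_flags()/pmdnuma_flags(). Only the generic side is visible here, but judging from the x86 pgtable_types.h hunk in this series' diffstat, a plausible x86-flavoured shape is (an assumption, not quoted from the patch):

static inline pteval_t ptenuma_flags(pte_t pte)
{
	/* mask down to the three bits the NUMA test cares about */
	return pte_flags(pte) & (_PAGE_NUMA | _PAGE_PROTNONE | _PAGE_PRESENT);
}

Architectures without a _PAGE_PROTNONE simply omit that bit from the mask.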
@@ -722,6 +721,8 @@ static inline pte_t pte_mknuma(pte_t pte)
{
pteval_t val = pte_val(pte);
+ VM_BUG_ON(!(val & _PAGE_PRESENT));
+
val &= ~_PAGE_PRESENT;
val |= _PAGE_NUMA;
@@ -765,16 +766,6 @@ static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
}
#endif
#else
-extern int pte_numa(pte_t pte);
-extern int pmd_numa(pmd_t pmd);
-extern pte_t pte_mknonnuma(pte_t pte);
-extern pmd_t pmd_mknonnuma(pmd_t pmd);
-extern pte_t pte_mknuma(pte_t pte);
-extern pmd_t pmd_mknuma(pmd_t pmd);
-extern void ptep_set_numa(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
-extern void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp);
-#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
-#else
static inline int pmd_numa(pmd_t pmd)
{
return 0;
diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
index 089743ade734..6b9b8cbb68e0 100644
--- a/include/linux/balloon_compaction.h
+++ b/include/linux/balloon_compaction.h
@@ -54,129 +54,28 @@
* balloon driver as a page book-keeper for its registered balloon devices.
*/
struct balloon_dev_info {
- void *balloon_device; /* balloon device descriptor */
- struct address_space *mapping; /* balloon special page->mapping */
unsigned long isolated_pages; /* # of isolated pages for migration */
spinlock_t pages_lock; /* Protection to pages list */
struct list_head pages; /* Pages enqueued & handled to Host */
+ int (*migrate_page)(struct balloon_dev_info *b_dev_info,
+ struct page *newpage, struct page *page,
+ enum migrate_mode mode);
};
-extern struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info);
-extern struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info);
-extern struct balloon_dev_info *balloon_devinfo_alloc(
- void *balloon_dev_descriptor);
-
-static inline void balloon_devinfo_free(struct balloon_dev_info *b_dev_info)
+static inline void balloon_devinfo_init(struct balloon_dev_info *b_dev_info)
{
- kfree(b_dev_info);
+ b_dev_info->isolated_pages = 0;
+ spin_lock_init(&b_dev_info->pages_lock);
+ INIT_LIST_HEAD(&b_dev_info->pages);
+ b_dev_info->migrate_page = NULL;
}
-/*
- * balloon_page_free - release a balloon page back to the page free lists
- * @page: ballooned page to be set free
- *
- * This function must be used to properly set free an isolated/dequeued balloon
- * page at the end of a sucessful page migration, or at the balloon driver's
- * page release procedure.
- */
-static inline void balloon_page_free(struct page *page)
-{
- /*
- * Balloon pages always get an extra refcount before being isolated
- * and before being dequeued to help on sorting out fortuite colisions
- * between a thread attempting to isolate and another thread attempting
- * to release the very same balloon page.
- *
- * Before we handle the page back to Buddy, lets drop its extra refcnt.
- */
- put_page(page);
- __free_page(page);
-}
+extern struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info);
+extern struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info);
#ifdef CONFIG_BALLOON_COMPACTION
extern bool balloon_page_isolate(struct page *page);
extern void balloon_page_putback(struct page *page);
-extern int balloon_page_migrate(struct page *newpage,
- struct page *page, enum migrate_mode mode);
-extern struct address_space
-*balloon_mapping_alloc(struct balloon_dev_info *b_dev_info,
- const struct address_space_operations *a_ops);
-
-static inline void balloon_mapping_free(struct address_space *balloon_mapping)
-{
- kfree(balloon_mapping);
-}
-
-/*
- * page_flags_cleared - helper to perform balloon @page ->flags tests.
- *
- * As balloon pages are obtained from buddy and we do not play with page->flags
- * at driver level (exception made when we get the page lock for compaction),
- * we can safely identify a ballooned page by checking if the
- * PAGE_FLAGS_CHECK_AT_PREP page->flags are all cleared. This approach also
- * helps us skip ballooned pages that are locked for compaction or release, thus
- * mitigating their racy check at balloon_page_movable()
- */
-static inline bool page_flags_cleared(struct page *page)
-{
- return !(page->flags & PAGE_FLAGS_CHECK_AT_PREP);
-}
-
-/*
- * __is_movable_balloon_page - helper to perform @page mapping->flags tests
- */
-static inline bool __is_movable_balloon_page(struct page *page)
-{
- struct address_space *mapping = page->mapping;
- return mapping_balloon(mapping);
-}
-
-/*
- * balloon_page_movable - test page->mapping->flags to identify balloon pages
- * that can be moved by compaction/migration.
- *
- * This function is used at core compaction's page isolation scheme, therefore
- * most pages exposed to it are not enlisted as balloon pages and so, to avoid
- * undesired side effects like racing against __free_pages(), we cannot afford
- * holding the page locked while testing page->mapping->flags here.
- *
- * As we might return false positives in the case of a balloon page being just
- * released under us, the page->mapping->flags need to be re-tested later,
- * under the proper page lock, at the functions that will be coping with the
- * balloon page case.
- */
-static inline bool balloon_page_movable(struct page *page)
-{
- /*
- * Before dereferencing and testing mapping->flags, let's make sure
- * this is not a page that uses ->mapping in a different way
- */
- if (page_flags_cleared(page) && !page_mapped(page) &&
- page_count(page) == 1)
- return __is_movable_balloon_page(page);
-
- return false;
-}
-
-/*
- * isolated_balloon_page - identify an isolated balloon page on private
- * compaction/migration page lists.
- *
- * After a compaction thread isolates a balloon page for migration, it raises
- * the page refcount to prevent concurrent compaction threads from re-isolating
- * the same page. For that reason putback_movable_pages(), or other routines
- * that need to identify isolated balloon pages on private pagelists, cannot
- * rely on balloon_page_movable() to accomplish the task.
- */
-static inline bool isolated_balloon_page(struct page *page)
-{
- /* Already isolated balloon pages, by default, have a raised refcount */
- if (page_flags_cleared(page) && !page_mapped(page) &&
- page_count(page) >= 2)
- return __is_movable_balloon_page(page);
-
- return false;
-}
/*
* balloon_page_insert - insert a page into the balloon's page list and make
@@ -188,12 +87,12 @@ static inline bool isolated_balloon_page(struct page *page)
* Caller must ensure the page is locked and the spin_lock protecting balloon
* pages list is held before inserting a page into the balloon device.
*/
-static inline void balloon_page_insert(struct page *page,
- struct address_space *mapping,
- struct list_head *head)
+static inline void
+balloon_page_insert(struct balloon_dev_info *balloon, struct page *page)
{
- page->mapping = mapping;
- list_add(&page->lru, head);
+ __SetPageBalloon(page);
+ set_page_private(page, (unsigned long)balloon);
+ list_add(&page->lru, &balloon->pages);
}
/*
@@ -204,23 +103,25 @@ static inline void balloon_page_insert(struct page *page,
* Caller must ensure the page is locked and the spin_lock protecting balloon
* pages list is held before deleting a page from the balloon device.
*/
-static inline void balloon_page_delete(struct page *page)
+static inline void balloon_page_delete(struct page *page, bool isolated)
{
- page->mapping = NULL;
- list_del(&page->lru);
+ __ClearPageBalloon(page);
+ set_page_private(page, 0);
+ if (!isolated)
+ list_del(&page->lru);
}
+int balloon_page_migrate(new_page_t get_new_page, free_page_t put_new_page,
+ unsigned long private, struct page *page,
+ int force, enum migrate_mode mode);
+
/*
* balloon_page_device - get the b_dev_info descriptor for the balloon device
* that enqueues the given page.
*/
static inline struct balloon_dev_info *balloon_page_device(struct page *page)
{
- struct address_space *mapping = page->mapping;
- if (likely(mapping))
- return mapping->private_data;
-
- return NULL;
+ return (struct balloon_dev_info *)page_private(page);
}
static inline gfp_t balloon_mapping_gfp_mask(void)
@@ -228,11 +129,6 @@ static inline gfp_t balloon_mapping_gfp_mask(void)
return GFP_HIGHUSER_MOVABLE;
}
-static inline bool balloon_compaction_check(void)
-{
- return true;
-}
-
#else /* !CONFIG_BALLOON_COMPACTION */
static inline void *balloon_mapping_alloc(void *balloon_device,
@@ -246,26 +142,25 @@ static inline void balloon_mapping_free(struct address_space *balloon_mapping)
return;
}
-static inline void balloon_page_insert(struct page *page,
- struct address_space *mapping,
- struct list_head *head)
+static inline void
+balloon_page_insert(struct balloon_dev_info *balloon, struct page *page)
{
- list_add(&page->lru, head);
+ __SetPageBalloon(page);
+ list_add(&page->lru, &balloon->pages);
}
-static inline void balloon_page_delete(struct page *page)
+static inline void balloon_page_delete(struct page *page, bool isolated)
{
- list_del(&page->lru);
-}
-
-static inline bool balloon_page_movable(struct page *page)
-{
- return false;
+ __ClearPageBalloon(page);
+ if (!isolated)
+ list_del(&page->lru);
}
-static inline bool isolated_balloon_page(struct page *page)
+static inline int balloon_page_migrate(new_page_t get_new_page,
+ free_page_t put_new_page, unsigned long private,
+ struct page *page, int force, enum migrate_mode mode)
{
- return false;
+ return -EAGAIN;
}
static inline bool balloon_page_isolate(struct page *page)
@@ -278,20 +173,10 @@ static inline void balloon_page_putback(struct page *page)
return;
}
-static inline int balloon_page_migrate(struct page *newpage,
- struct page *page, enum migrate_mode mode)
-{
- return 0;
-}
-
static inline gfp_t balloon_mapping_gfp_mask(void)
{
return GFP_HIGHUSER;
}
-static inline bool balloon_compaction_check(void)
-{
- return false;
-}
#endif /* CONFIG_BALLOON_COMPACTION */
#endif /* _LINUX_BALLOON_COMPACTION_H */
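With the address_space bookkeeping gone, a balloon driver embeds a balloon_dev_info directly, initializes it with balloon_devinfo_init(), and optionally supplies a migrate_page callback. A sketch of the driver-side setup under the new API; the virtio_balloon.c change in this diffstat follows this pattern, but the names below are illustrative:

struct sketch_balloon {
	struct balloon_dev_info vb_dev_info;
	/* ...driver-private state... */
};

static int sketch_migratepage(struct balloon_dev_info *info,
			      struct page *newpage, struct page *page,
			      enum migrate_mode mode)
{
	/* tell the host about newpage, release the old page, ... */
	return MIGRATEPAGE_SUCCESS;
}

static void sketch_balloon_init(struct sketch_balloon *vb)
{
	balloon_devinfo_init(&vb->vb_dev_info);
#ifdef CONFIG_BALLOON_COMPACTION
	vb->vb_dev_info.migrate_page = sketch_migratepage;
#endif
}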
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 01e3132820da..60bdf8dc02a3 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -2,14 +2,24 @@
#define _LINUX_COMPACTION_H
/* Return values for compact_zone() and try_to_compact_pages() */
+/* compaction didn't start as it was deferred due to past failures */
+#define COMPACT_DEFERRED 0
/* compaction didn't start as it was not possible or direct reclaim was more suitable */
-#define COMPACT_SKIPPED 0
+#define COMPACT_SKIPPED 1
/* compaction should continue to another pageblock */
-#define COMPACT_CONTINUE 1
+#define COMPACT_CONTINUE 2
/* direct compaction partially compacted a zone and there are suitable pages */
-#define COMPACT_PARTIAL 2
+#define COMPACT_PARTIAL 3
/* The full zone was compacted */
-#define COMPACT_COMPLETE 3
+#define COMPACT_COMPLETE 4
+
+/* Used to signal whether compaction detected need_sched() or lock contention */
+/* No contention detected */
+#define COMPACT_CONTENDED_NONE 0
+/* Either need_sched() was true or fatal signal pending */
+#define COMPACT_CONTENDED_SCHED 1
+/* Zone lock or lru_lock was contended in async compaction */
+#define COMPACT_CONTENDED_LOCK 2
#ifdef CONFIG_COMPACTION
extern int sysctl_compact_memory;
@@ -22,7 +32,8 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
extern int fragmentation_index(struct zone *zone, unsigned int order);
extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
int order, gfp_t gfp_mask, nodemask_t *mask,
- enum migrate_mode mode, bool *contended);
+ enum migrate_mode mode, int *contended,
+ struct zone **candidate_zone);
extern void compact_pgdat(pg_data_t *pgdat, int order);
extern void reset_isolation_suitable(pg_data_t *pgdat);
extern unsigned long compaction_suitable(struct zone *zone, int order);
@@ -91,7 +102,8 @@ static inline bool compaction_restarting(struct zone *zone, int order)
#else
static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
int order, gfp_t gfp_mask, nodemask_t *nodemask,
- enum migrate_mode mode, bool *contended)
+ enum migrate_mode mode, int *contended,
+ struct zone **candidate_zone)
{
return COMPACT_CONTINUE;
}
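Callers can now tell a deferral apart from a skip and learn why async compaction backed off. A hedged sketch of a direct-compaction caller consuming the widened signature (the real integration lives in mm/page_alloc.c; sketch_should_retry() is an illustrative name):

static bool sketch_should_retry(struct zonelist *zl, int order,
				gfp_t gfp, nodemask_t *nodes)
{
	int contended = COMPACT_CONTENDED_NONE;
	struct zone *candidate_zone = NULL;
	unsigned long rc;

	rc = try_to_compact_pages(zl, order, gfp, nodes, MIGRATE_ASYNC,
				  &contended, &candidate_zone);
	if (rc == COMPACT_DEFERRED)
		return false;		/* not worth retrying yet */
	if (contended == COMPACT_CONTENDED_SCHED)
		return false;		/* need_resched() or fatal signal */
	/* on success, candidate_zone hints where to retry the allocation */
	return rc >= COMPACT_PARTIAL;
}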
diff --git a/include/linux/compiler-gcc5.h b/include/linux/compiler-gcc5.h
new file mode 100644
index 000000000000..cdd1cc202d51
--- /dev/null
+++ b/include/linux/compiler-gcc5.h
@@ -0,0 +1,66 @@
+#ifndef __LINUX_COMPILER_H
+#error "Please don't include <linux/compiler-gcc5.h> directly, include <linux/compiler.h> instead."
+#endif
+
+#define __used __attribute__((__used__))
+#define __must_check __attribute__((warn_unused_result))
+#define __compiler_offsetof(a, b) __builtin_offsetof(a, b)
+
+/* Mark functions as cold. gcc will assume any path leading to a call
+ to them will be unlikely. This means a lot of manual unlikely()s
+ are unnecessary now for any paths leading to the usual suspects
+ like BUG(), printk(), panic() etc. [but let's keep them for now for
+ older compilers]
+
+ Early snapshots of gcc 4.3 don't support this and we can't detect this
+ in the preprocessor, but we can live with this because they're unreleased.
+ Maketime probing would be overkill here.
+
+ gcc also has a __attribute__((__hot__)) to move hot functions into
+ a special section, but I don't see any sense in this right now in
+ the kernel context */
+#define __cold __attribute__((__cold__))
+
+#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__)
+
+#ifndef __CHECKER__
+# define __compiletime_warning(message) __attribute__((warning(message)))
+# define __compiletime_error(message) __attribute__((error(message)))
+#endif /* __CHECKER__ */
+
+/*
+ * Mark a position in code as unreachable. This can be used to
+ * suppress control flow warnings after asm blocks that transfer
+ * control elsewhere.
+ *
+ * Early snapshots of gcc 4.5 don't support this and we can't detect
+ * this in the preprocessor, but we can live with this because they're
+ * unreleased. Really, we need to have autoconf for the kernel.
+ */
+#define unreachable() __builtin_unreachable()
+
+/* Mark a function definition as prohibited from being cloned. */
+#define __noclone __attribute__((__noclone__))
+
+/*
+ * Tell the optimizer that something else uses this function or variable.
+ */
+#define __visible __attribute__((externally_visible))
+
+/*
+ * GCC 'asm goto' miscompiles certain code sequences:
+ *
+ * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670
+ *
+ * Work it around via a compiler barrier quirk suggested by Jakub Jelinek.
+ * Fixed in GCC 4.8.2 and later versions.
+ *
+ * (asm goto is automatically volatile - the naming reflects this.)
+ */
+#define asm_volatile_goto(x...) do { asm goto(x); asm (""); } while (0)
+
+#ifdef CONFIG_ARCH_USE_BUILTIN_BSWAP
+#define __HAVE_BUILTIN_BSWAP32__
+#define __HAVE_BUILTIN_BSWAP64__
+#define __HAVE_BUILTIN_BSWAP16__
+#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */
diff --git a/include/linux/crc64_ecma.h b/include/linux/crc64_ecma.h
new file mode 100644
index 000000000000..bba7a4d692b3
--- /dev/null
+++ b/include/linux/crc64_ecma.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2013 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __CRC64_ECMA_H_
+#define __CRC64_ECMA_H_
+
+#include <linux/types.h>
+
+
+#define CRC64_DEFAULT_INITVAL 0xFFFFFFFFFFFFFFFFULL
+
+
+/*
+ * crc64_ecma_seed - Initializes the CRC64 ECMA seed.
+ */
+u64 crc64_ecma_seed(void);
+
+/*
+ * crc64_ecma - Computes the 64 bit ECMA CRC.
+ *
+ * @pdata: pointer to the data to compute checksum for.
+ * @nbytes: number of bytes in data buffer.
+ * @seed: CRC seed.
+ */
+u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed);
+
+#endif /* __CRC64_ECMA_H_ */
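A short usage sketch of the interface declared above; sketch_checksum() is an illustrative wrapper, and the seed parameter allows checksums to be chained across partial buffers:

#include <linux/crc64_ecma.h>

static u64 sketch_checksum(const u8 *buf, u32 len)
{
	u64 seed = crc64_ecma_seed();	/* CRC64_DEFAULT_INITVAL */

	return crc64_ecma(buf, len, seed);
}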
diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h
index 1c2fdaa2ffc3..1ccaab44abcc 100644
--- a/include/linux/genalloc.h
+++ b/include/linux/genalloc.h
@@ -110,6 +110,10 @@ extern void gen_pool_set_algo(struct gen_pool *pool, genpool_algo_t algo,
extern unsigned long gen_pool_first_fit(unsigned long *map, unsigned long size,
unsigned long start, unsigned int nr, void *data);
+extern unsigned long gen_pool_first_fit_order_align(unsigned long *map,
+ unsigned long size, unsigned long start, unsigned int nr,
+ void *data);
+
extern unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size,
unsigned long start, unsigned int nr, void *data);
@@ -117,6 +121,9 @@ extern struct gen_pool *devm_gen_pool_create(struct device *dev,
int min_alloc_order, int nid);
extern struct gen_pool *dev_get_gen_pool(struct device *dev);
+bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start,
+ size_t size);
+
#ifdef CONFIG_OF
extern struct gen_pool *of_get_named_gen_pool(struct device_node *np,
const char *propname, int index);
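The two additions let a pool align allocations to their own size order and let callers ask whether an address range came from a pool; both serve the dma-mapping atomic-pool rework in this series. A hedged usage sketch with illustrative names:

static struct gen_pool *sketch_pool;

static int sketch_pool_init(unsigned long vaddr, size_t size)
{
	sketch_pool = gen_pool_create(PAGE_SHIFT, -1);
	if (!sketch_pool)
		return -ENOMEM;

	/* allocations become naturally aligned to their size order */
	gen_pool_set_algo(sketch_pool, gen_pool_first_fit_order_align, NULL);
	return gen_pool_add(sketch_pool, vaddr, size, -1);
}

static bool sketch_from_pool(void *addr, size_t size)
{
	return addr_in_gen_pool(sketch_pool, (unsigned long)addr, size);
}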
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 5e7219dc0fae..41b30fd4d041 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -156,7 +156,7 @@ struct vm_area_struct;
#define GFP_DMA32 __GFP_DMA32
/* Convert GFP flags to their corresponding migrate type */
-static inline int allocflags_to_migratetype(gfp_t gfp_flags)
+static inline int gfpflags_to_migratetype(const gfp_t gfp_flags)
{
WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 63579cb8d3dc..ad9051bab267 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -132,7 +132,7 @@ extern int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
spinlock_t **ptl)
{
- VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem));
+ VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma);
if (pmd_trans_huge(*pmd))
return __pmd_trans_huge_lock(pmd, vma, ptl);
else
diff --git a/include/linux/input.h b/include/linux/input.h
index 82ce323b9986..6453b22372ac 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -79,6 +79,7 @@ struct input_value {
* @led: reflects current state of device's LEDs
* @snd: reflects current state of sound effects
* @sw: reflects current state of device's switches
+ * @leds: leds objects for the device's LEDs
* @open: this method is called when the very first user calls
* input_open_device(). The driver must prepare the device
* to start generating events (start polling thread,
@@ -164,6 +165,8 @@ struct input_dev {
unsigned long snd[BITS_TO_LONGS(SND_CNT)];
unsigned long sw[BITS_TO_LONGS(SW_CNT)];
+ struct led_classdev *leds;
+
int (*open)(struct input_dev *dev);
void (*close)(struct input_dev *dev);
int (*flush)(struct input_dev *dev, struct file *file);
@@ -531,4 +534,22 @@ int input_ff_erase(struct input_dev *dev, int effect_id, struct file *file);
int input_ff_create_memless(struct input_dev *dev, void *data,
int (*play_effect)(struct input_dev *, void *, struct ff_effect *));
+#ifdef CONFIG_INPUT_LEDS
+
+int input_led_connect(struct input_dev *dev);
+void input_led_disconnect(struct input_dev *dev);
+
+#else
+
+static inline int input_led_connect(struct input_dev *dev)
+{
+ return 0;
+}
+
+static inline void input_led_disconnect(struct input_dev *dev)
+{
+}
+
+#endif
+
#endif
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 95624bed87ef..aa2a0cb57f50 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -715,23 +715,8 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
(void) (&_max1 == &_max2); \
_max1 > _max2 ? _max1 : _max2; })
-#define min3(x, y, z) ({ \
- typeof(x) _min1 = (x); \
- typeof(y) _min2 = (y); \
- typeof(z) _min3 = (z); \
- (void) (&_min1 == &_min2); \
- (void) (&_min1 == &_min3); \
- _min1 < _min2 ? (_min1 < _min3 ? _min1 : _min3) : \
- (_min2 < _min3 ? _min2 : _min3); })
-
-#define max3(x, y, z) ({ \
- typeof(x) _max1 = (x); \
- typeof(y) _max2 = (y); \
- typeof(z) _max3 = (z); \
- (void) (&_max1 == &_max2); \
- (void) (&_max1 == &_max3); \
- _max1 > _max2 ? (_max1 > _max3 ? _max1 : _max3) : \
- (_max2 > _max3 ? _max2 : _max3); })
+#define min3(x, y, z) min((typeof(x))min(x, y), z)
+#define max3(x, y, z) max((typeof(x))max(x, y), z)
/**
* min_not_zero - return the minimum that is _not_ zero, unless both are zero
@@ -746,20 +731,13 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
/**
* clamp - return a value clamped to a given range with strict typechecking
* @val: current value
- * @min: minimum allowable value
- * @max: maximum allowable value
+ * @lo: lowest allowable value
+ * @hi: highest allowable value
*
* This macro does strict typechecking of min/max to make sure they are of the
* same type as val. See the unnecessary pointer comparisons.
*/
-#define clamp(val, min, max) ({ \
- typeof(val) __val = (val); \
- typeof(min) __min = (min); \
- typeof(max) __max = (max); \
- (void) (&__val == &__min); \
- (void) (&__val == &__max); \
- __val = __val < __min ? __min: __val; \
- __val > __max ? __max: __val; })
+#define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi)
/*
* ..and if you can't take the strict
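The rewritten min3()/max3()/clamp() nest the two-argument forms, so the strict type checking of min()/max() still fires while the expansion shrinks to a fraction of its old size; the typeof cast keeps the intermediate result in the first argument's type. A small worked example (illustrative caller):

static int sketch_clamp_brightness(int user_value)
{
	/* expands to: min((int)max(user_value, 0), 255) */
	return clamp(user_value, 0, 255);
}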
diff --git a/include/linux/list.h b/include/linux/list.h
index cbbb96fcead9..f33f831eb3c8 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -5,6 +5,7 @@
#include <linux/stddef.h>
#include <linux/poison.h>
#include <linux/const.h>
+#include <linux/kernel.h>
/*
* Simple doubly linked list implementation.
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index d9524c49d767..8f1a41951df9 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -84,6 +84,7 @@ extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages);
extern int add_one_highpage(struct page *page, int pfn, int bad_ppro);
/* VM interface that may be used by firmware interface */
extern int online_pages(unsigned long, unsigned long, int);
+extern int test_pages_in_a_zone(unsigned long, unsigned long);
extern void __offline_isolated_pages(unsigned long, unsigned long);
typedef void (*online_page_callback_t)(struct page *page);
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index f230a978e6ba..3d385c81c153 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -134,9 +134,10 @@ void mpol_free_shared_policy(struct shared_policy *p);
struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
unsigned long idx);
-struct mempolicy *get_vma_policy(struct task_struct *tsk,
- struct vm_area_struct *vma, unsigned long addr);
-bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma);
+struct mempolicy *get_task_policy(struct task_struct *p);
+struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
+ unsigned long addr);
+bool vma_policy_mof(struct vm_area_struct *vma);
extern void numa_default_policy(void);
extern void numa_policy_init(void);
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index a2901c414664..01aad3ed89ec 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -13,18 +13,9 @@ typedef void free_page_t(struct page *page, unsigned long private);
* Return values from addresss_space_operations.migratepage():
* - negative errno on page migration failure;
* - zero on page migration success;
- *
- * The balloon page migration introduces this special case where a 'distinct'
- * return code is used to flag a successful page migration to unmap_and_move().
- * This approach is necessary because page migration can race against balloon
- * deflation procedure, and for such case we could introduce a nasty page leak
- * if a successfully migrated balloon page gets released concurrently with
- * migration's unmap_and_move() wrap-up steps.
*/
#define MIGRATEPAGE_SUCCESS 0
-#define MIGRATEPAGE_BALLOON_SUCCESS 1 /* special ret code for balloon page
- * sucessful migration case.
- */
+
enum migrate_reason {
MR_COMPACTION,
MR_MEMORY_FAILURE,
@@ -82,9 +73,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
return -ENOSYS;
}
-/* Possible settings for the migrate_page() method in address_operations */
-#define migrate_page NULL
-
#endif /* CONFIG_MIGRATION */
#ifdef CONFIG_NUMA_BALANCING
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8981cc882ed2..ebc5f90531bb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -18,6 +18,7 @@
#include <linux/pfn.h>
#include <linux/bit_spinlock.h>
#include <linux/shrinker.h>
+#include <linux/resource.h>
struct mempolicy;
struct anon_vma;
@@ -346,6 +347,7 @@ static inline int put_page_unless_one(struct page *page)
}
extern int page_is_ram(unsigned long pfn);
+extern int region_is_ram(resource_size_t phys_addr, unsigned long size);
/* Support for virtually mapped pages */
struct page *vmalloc_to_page(const void *addr);
@@ -553,6 +555,16 @@ static inline void __ClearPageBuddy(struct page *page)
atomic_set(&page->_mapcount, -1);
}
+#define PAGE_BALLOON_MAPCOUNT_VALUE (-256)
+
+static inline int PageBalloon(struct page *page)
+{
+ return IS_ENABLED(CONFIG_MEMORY_BALLOON) &&
+ atomic_read(&page->_mapcount) == PAGE_BALLOON_MAPCOUNT_VALUE;
+}
+void __SetPageBalloon(struct page *page);
+void __ClearPageBalloon(struct page *page);
+
void put_page(struct page *page);
void put_pages_list(struct list_head *pages);
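Balloon pages are now recognized by a magic _mapcount value instead of a special page->mapping. The bodies of the two helpers declared here live in mm/balloon_compaction.c; a plausible sketch of what they do, assuming the PAGE_BALLOON_MAPCOUNT_VALUE encoding above:

void __SetPageBalloon(struct page *page)
{
	VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page);
	atomic_set(&page->_mapcount, PAGE_BALLOON_MAPCOUNT_VALUE);
}

void __ClearPageBalloon(struct page *page)
{
	VM_BUG_ON_PAGE(!PageBalloon(page), page);
	atomic_set(&page->_mapcount, -1);
}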
@@ -1247,8 +1259,8 @@ static inline int stack_guard_page_end(struct vm_area_struct *vma,
!vma_growsup(vma->vm_next, addr);
}
-extern pid_t
-vm_is_stack(struct task_struct *task, struct vm_area_struct *vma, int in_group);
+extern struct task_struct *task_of_stack(struct task_struct *task,
+ struct vm_area_struct *vma, bool in_group);
extern unsigned long move_page_tables(struct vm_area_struct *vma,
unsigned long old_addr, struct vm_area_struct *new_vma,
@@ -1780,6 +1792,20 @@ extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
bool *need_rmap_locks);
extern void exit_mmap(struct mm_struct *);
+static inline int check_data_rlimit(unsigned long rlim,
+ unsigned long new,
+ unsigned long start,
+ unsigned long end_data,
+ unsigned long start_data)
+{
+ if (rlim < RLIM_INFINITY) {
+ if (((new - start) + (end_data - start_data)) > rlim)
+ return -ENOSPC;
+ }
+
+ return 0;
+}
+
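check_data_rlimit() centralizes the RLIMIT_DATA test that brk() and the new PR_SET_MM_MAP path both need: the span from start to the proposed break, plus the static data segment, must fit under the limit. A hedged sketch of a brk()-style caller (illustrative name):

static int sketch_validate_brk(struct mm_struct *mm, unsigned long newbrk)
{
	return check_data_rlimit(rlimit(RLIMIT_DATA), newbrk, mm->start_brk,
				 mm->end_data, mm->start_data);
}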
extern int mm_take_all_locks(struct mm_struct *mm);
extern void mm_drop_all_locks(struct mm_struct *mm);
diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h
index 2f348d02f640..569e4c8d0ebb 100644
--- a/include/linux/mmdebug.h
+++ b/include/linux/mmdebug.h
@@ -4,10 +4,12 @@
#include <linux/stringify.h>
struct page;
+struct vm_area_struct;
extern void dump_page(struct page *page, const char *reason);
extern void dump_page_badflags(struct page *page, const char *reason,
unsigned long badflags);
+void dump_vma(const struct vm_area_struct *vma);
#ifdef CONFIG_DEBUG_VM
#define VM_BUG_ON(cond) BUG_ON(cond)
@@ -18,12 +20,20 @@ extern void dump_page_badflags(struct page *page, const char *reason,
BUG(); \
} \
} while (0)
+#define VM_BUG_ON_VMA(cond, vma) \
+ do { \
+ if (unlikely(cond)) { \
+ dump_vma(vma); \
+ BUG(); \
+ } \
+ } while (0)
#define VM_WARN_ON(cond) WARN_ON(cond)
#define VM_WARN_ON_ONCE(cond) WARN_ON_ONCE(cond)
#define VM_WARN_ONCE(cond, format...) WARN_ONCE(cond, format)
#else
#define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond)
#define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond)
+#define VM_BUG_ON_VMA(cond, vma) VM_BUG_ON(cond)
#define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond)
#define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond)
#define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 318df7051850..fbbff5cc6af6 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -157,6 +157,9 @@ enum zone_stat_item {
WORKINGSET_NODERECLAIM,
NR_ANON_TRANSPARENT_HUGEPAGES,
NR_FREE_CMA_PAGES,
+#ifdef CONFIG_MEMORY_BALLOON
+ NR_BALLOON_PAGES,
+#endif
NR_VM_ZONE_STAT_ITEMS };
/*
@@ -521,13 +524,13 @@ struct zone {
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
} ____cacheline_internodealigned_in_smp;
-typedef enum {
+enum zone_flags {
ZONE_RECLAIM_LOCKED, /* prevents concurrent reclaim */
ZONE_OOM_LOCKED, /* zone is in OOM killer zonelist */
ZONE_CONGESTED, /* zone has many dirty pages backed by
* a congested BDI
*/
- ZONE_TAIL_LRU_DIRTY, /* reclaim scanning has recently found
+ ZONE_DIRTY, /* reclaim scanning has recently found
* many dirty file pages at the tail
* of the LRU.
*/
@@ -535,52 +538,7 @@ typedef enum {
* many pages under writeback
*/
ZONE_FAIR_DEPLETED, /* fair zone policy batch depleted */
-} zone_flags_t;
-
-static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
-{
- set_bit(flag, &zone->flags);
-}
-
-static inline int zone_test_and_set_flag(struct zone *zone, zone_flags_t flag)
-{
- return test_and_set_bit(flag, &zone->flags);
-}
-
-static inline void zone_clear_flag(struct zone *zone, zone_flags_t flag)
-{
- clear_bit(flag, &zone->flags);
-}
-
-static inline int zone_is_reclaim_congested(const struct zone *zone)
-{
- return test_bit(ZONE_CONGESTED, &zone->flags);
-}
-
-static inline int zone_is_reclaim_dirty(const struct zone *zone)
-{
- return test_bit(ZONE_TAIL_LRU_DIRTY, &zone->flags);
-}
-
-static inline int zone_is_reclaim_writeback(const struct zone *zone)
-{
- return test_bit(ZONE_WRITEBACK, &zone->flags);
-}
-
-static inline int zone_is_reclaim_locked(const struct zone *zone)
-{
- return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
-}
-
-static inline int zone_is_fair_depleted(const struct zone *zone)
-{
- return test_bit(ZONE_FAIR_DEPLETED, &zone->flags);
-}
-
-static inline int zone_is_oom_locked(const struct zone *zone)
-{
- return test_bit(ZONE_OOM_LOCKED, &zone->flags);
-}
+};
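With the typedef and its per-flag wrappers gone, callers use the generic bitops on zone->flags directly, and the ZONE_TAIL_LRU_DIRTY flag is renamed ZONE_DIRTY in the process. An illustrative before/after at a reclaim call site:

static void sketch_throttle_if_dirty(struct zone *zone)
{
	/* previously: if (zone_is_reclaim_dirty(zone)) ... */
	if (test_bit(ZONE_DIRTY, &zone->flags))
		congestion_wait(BLK_RW_ASYNC, HZ/10);
}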
static inline unsigned long zone_end_pfn(const struct zone *zone)
{
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 5ba1813337df..210b46b31499 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -24,8 +24,7 @@ enum mapping_flags {
AS_ENOSPC = __GFP_BITS_SHIFT + 1, /* ENOSPC on async write */
AS_MM_ALL_LOCKS = __GFP_BITS_SHIFT + 2, /* under mm_take_all_locks() */
AS_UNEVICTABLE = __GFP_BITS_SHIFT + 3, /* e.g., ramdisk, SHM_LOCK */
- AS_BALLOON_MAP = __GFP_BITS_SHIFT + 4, /* balloon page special map */
- AS_EXITING = __GFP_BITS_SHIFT + 5, /* final truncate in progress */
+ AS_EXITING = __GFP_BITS_SHIFT + 4, /* final truncate in progress */
};
static inline void mapping_set_error(struct address_space *mapping, int error)
@@ -55,21 +54,6 @@ static inline int mapping_unevictable(struct address_space *mapping)
return !!mapping;
}
-static inline void mapping_set_balloon(struct address_space *mapping)
-{
- set_bit(AS_BALLOON_MAP, &mapping->flags);
-}
-
-static inline void mapping_clear_balloon(struct address_space *mapping)
-{
- clear_bit(AS_BALLOON_MAP, &mapping->flags);
-}
-
-static inline int mapping_balloon(struct address_space *mapping)
-{
- return mapping && test_bit(AS_BALLOON_MAP, &mapping->flags);
-}
-
static inline void mapping_set_exiting(struct address_space *mapping)
{
set_bit(AS_EXITING, &mapping->flags);
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index be574506e6a9..c0c2bce6b0b7 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -150,7 +150,7 @@ int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);
static inline void anon_vma_merge(struct vm_area_struct *vma,
struct vm_area_struct *next)
{
- VM_BUG_ON(vma->anon_vma != next->anon_vma);
+ VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma);
unlink_anon_vmas(next);
}
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 1d9abb7d22a0..c265bec6a57d 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -158,31 +158,6 @@ size_t ksize(const void *);
#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
#endif
-#ifdef CONFIG_SLOB
-/*
- * Common fields provided in kmem_cache by all slab allocators
- * This struct is either used directly by the allocator (SLOB)
- * or the allocator must include definitions for all fields
- * provided in kmem_cache_common in their definition of kmem_cache.
- *
- * Once we can do anonymous structs (C11 standard) we could put a
- * anonymous struct definition in these allocators so that the
- * separate allocations in the kmem_cache structure of SLAB and
- * SLUB is no longer needed.
- */
-struct kmem_cache {
- unsigned int object_size;/* The original size of the object */
- unsigned int size; /* The aligned/padded/added on size */
- unsigned int align; /* Alignment as calculated */
- unsigned long flags; /* Active flags on the slab */
- const char *name; /* Slab name for sysfs */
- int refcount; /* Use counter */
- void (*ctor)(void *); /* Called on object slot creation */
- struct list_head list; /* List of all slab caches on the system */
-};
-
-#endif /* CONFIG_SLOB */
-
/*
* Kmalloc array related definitions
*/
@@ -363,14 +338,6 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s,
}
#endif /* CONFIG_TRACING */
-#ifdef CONFIG_SLAB
-#include <linux/slab_def.h>
-#endif
-
-#ifdef CONFIG_SLUB
-#include <linux/slub_def.h>
-#endif
-
extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order);
#ifdef CONFIG_TRACING
@@ -582,37 +549,15 @@ static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
* allocator where we care about the real place the memory allocation
* request comes from.
*/
-#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) || \
- (defined(CONFIG_SLAB) && defined(CONFIG_TRACING)) || \
- (defined(CONFIG_SLOB) && defined(CONFIG_TRACING))
extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long);
#define kmalloc_track_caller(size, flags) \
__kmalloc_track_caller(size, flags, _RET_IP_)
-#else
-#define kmalloc_track_caller(size, flags) \
- __kmalloc(size, flags)
-#endif /* DEBUG_SLAB */
#ifdef CONFIG_NUMA
-/*
- * kmalloc_node_track_caller is a special version of kmalloc_node that
- * records the calling function of the routine calling it for slab leak
- * tracking instead of just the calling function (confusing, eh?).
- * It's useful when the call to kmalloc_node comes from a widely-used
- * standard allocator where we care about the real place the memory
- * allocation request comes from.
- */
-#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) || \
- (defined(CONFIG_SLAB) && defined(CONFIG_TRACING)) || \
- (defined(CONFIG_SLOB) && defined(CONFIG_TRACING))
extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long);
#define kmalloc_node_track_caller(size, flags, node) \
__kmalloc_node_track_caller(size, flags, node, \
_RET_IP_)
-#else
-#define kmalloc_node_track_caller(size, flags, node) \
- __kmalloc_node(size, flags, node)
-#endif
#else /* CONFIG_NUMA */
@@ -650,14 +595,7 @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
return kmalloc_node(size, flags | __GFP_ZERO, node);
}
-/*
- * Determine the size of a slab object
- */
-static inline unsigned int kmem_cache_size(struct kmem_cache *s)
-{
- return s->object_size;
-}
-
+unsigned int kmem_cache_size(struct kmem_cache *s);
void __init kmem_cache_init_late(void);
#endif /* _LINUX_SLAB_H */
diff --git a/include/linux/string.h b/include/linux/string.h
index 3b42b3732da6..416b544b59cd 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -117,6 +117,7 @@ void *memchr_inv(const void *s, int c, size_t n);
extern char *kstrdup(const char *s, gfp_t gfp);
extern char *kstrndup(const char *s, size_t len, gfp_t gfp);
+extern char *kstrimdup(const char *s, gfp_t gfp);
extern void *kmemdup(const void *src, size_t len, gfp_t gfp);
extern char **argv_split(gfp_t gfp, const char *str, int *argcp);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 1b72060f093a..ea4f926e6b9b 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -354,22 +354,6 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
extern int page_evictable(struct page *page);
extern void check_move_unevictable_pages(struct page **, int nr_pages);
-extern unsigned long scan_unevictable_pages;
-extern int scan_unevictable_handler(struct ctl_table *, int,
- void __user *, size_t *, loff_t *);
-#ifdef CONFIG_NUMA
-extern int scan_unevictable_register_node(struct node *node);
-extern void scan_unevictable_unregister_node(struct node *node);
-#else
-static inline int scan_unevictable_register_node(struct node *node)
-{
- return 0;
-}
-static inline void scan_unevictable_unregister_node(struct node *node)
-{
-}
-#endif
-
extern int kswapd_run(int nid);
extern void kswapd_stop(int nid);
#ifdef CONFIG_MEMCG
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 82e7db7f7100..cece0f0d0279 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -131,10 +131,8 @@ static inline unsigned long zone_page_state(struct zone *zone,
enum zone_stat_item item)
{
long x = atomic_long_read(&zone->vm_stat[item]);
-#ifdef CONFIG_SMP
if (x < 0)
x = 0;
-#endif
return x;
}
diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
index e44d634e7fb7..05c214760977 100644
--- a/include/linux/zsmalloc.h
+++ b/include/linux/zsmalloc.h
@@ -46,6 +46,6 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
enum zs_mapmode mm);
void zs_unmap_object(struct zs_pool *pool, unsigned long handle);
-u64 zs_get_total_size_bytes(struct zs_pool *pool);
+unsigned long zs_get_total_pages(struct zs_pool *pool);
#endif
diff --git a/include/uapi/linux/kernel-page-flags.h b/include/uapi/linux/kernel-page-flags.h
index 5116a0e48172..2f96d233c980 100644
--- a/include/uapi/linux/kernel-page-flags.h
+++ b/include/uapi/linux/kernel-page-flags.h
@@ -31,6 +31,7 @@
#define KPF_KSM 21
#define KPF_THP 22
+#define KPF_BALLOON 23
#endif /* _UAPILINUX_KERNEL_PAGE_FLAGS_H */
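The new bit is visible to userspace through /proc/kpageflags, which stores one u64 of flags per pfn. A hedged userspace sketch (error handling trimmed, helper name illustrative):

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

static int sketch_pfn_is_balloon(int kpageflags_fd, uint64_t pfn)
{
	uint64_t flags;

	if (pread(kpageflags_fd, &flags, sizeof(flags),
		  pfn * sizeof(flags)) != sizeof(flags))
		return -1;
	return (flags >> 23) & 1;	/* KPF_BALLOON */
}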
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 58afc04c107e..513df75d0fc9 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -1,6 +1,8 @@
#ifndef _LINUX_PRCTL_H
#define _LINUX_PRCTL_H
+#include <linux/types.h>
+
/* Values to pass as first argument to prctl() */
#define PR_SET_PDEATHSIG 1 /* Second arg is a signal */
@@ -119,6 +121,31 @@
# define PR_SET_MM_ENV_END 11
# define PR_SET_MM_AUXV 12
# define PR_SET_MM_EXE_FILE 13
+# define PR_SET_MM_MAP 14
+# define PR_SET_MM_MAP_SIZE 15
+
+/*
+ * This structure provides a new memory descriptor
+ * map which mostly modifies /proc/pid/stat[m]
+ * output for a task. This is mostly done for the
+ * sake of checkpoint/restore functionality.
+ */
+struct prctl_mm_map {
+ __u64 start_code; /* code section bounds */
+ __u64 end_code;
+ __u64 start_data; /* data section bounds */
+ __u64 end_data;
+ __u64 start_brk; /* heap for brk() syscall */
+ __u64 brk;
+ __u64 start_stack; /* stack starts at */
+ __u64 arg_start; /* command line arguments bounds */
+ __u64 arg_end;
+ __u64 env_start; /* environment variables bounds */
+ __u64 env_end;
+ __u64 *auxv; /* auxiliary vector */
+ __u32 auxv_size; /* vector size */
+ __u32 exe_fd; /* /proc/$pid/exe link file */
+};
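A checkpoint/restore tool fills the whole map and installs it with one call, replacing the thirteen separate PR_SET_MM_* operations and letting the kernel validate the snapshot atomically. A hedged userspace sketch (field contents and the helper name are illustrative):

#include <string.h>
#include <sys/prctl.h>

static int sketch_restore_mm(const struct prctl_mm_map *saved)
{
	struct prctl_mm_map map;

	memcpy(&map, saved, sizeof(map));
	return prctl(PR_SET_MM, PR_SET_MM_MAP,
		     (unsigned long)&map, sizeof(map), 0);
}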
/*
* Set specific pid that is allowed to ptrace the current task.
diff --git a/init/Kconfig b/init/Kconfig
index 321ff3e5637c..0471be99ec38 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -835,6 +835,7 @@ config LOG_BUF_SHIFT
config LOG_CPU_MAX_BUF_SHIFT
int "CPU kernel log buffer size contribution (13 => 8 KB, 17 => 128KB)"
+ depends on SMP
range 0 21
default 12 if !BASE_SMALL
default 0 if BASE_SMALL
@@ -895,17 +896,6 @@ config ARCH_SUPPORTS_INT128
config ARCH_WANT_NUMA_VARIABLE_LOCALITY
bool
-#
-# For architectures that are willing to define _PAGE_NUMA as _PAGE_PROTNONE
-config ARCH_WANTS_PROT_NUMA_PROT_NONE
- bool
-
-config ARCH_USES_NUMA_PROT_NONE
- bool
- default y
- depends on ARCH_WANTS_PROT_NUMA_PROT_NONE
- depends on NUMA_BALANCING
-
config NUMA_BALANCING_DEFAULT_ENABLED
bool "Automatically enable NUMA aware memory/task placement"
default y
diff --git a/init/initramfs.c b/init/initramfs.c
index bece48c3461e..ad1bd7787bbb 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -197,14 +197,14 @@ static __initdata enum state {
} state, next_state;
static __initdata char *victim;
-static unsigned long count __initdata;
+static unsigned long byte_count __initdata;
static __initdata loff_t this_header, next_header;
static inline void __init eat(unsigned n)
{
victim += n;
this_header += n;
- count -= n;
+ byte_count -= n;
}
static __initdata char *vcollected;
@@ -214,7 +214,7 @@ static __initdata char *collect;
static void __init read_into(char *buf, unsigned size, enum state next)
{
- if (count >= size) {
+ if (byte_count >= size) {
collected = victim;
eat(size);
state = next;
@@ -237,8 +237,8 @@ static int __init do_start(void)
static int __init do_collect(void)
{
unsigned long n = remains;
- if (count < n)
- n = count;
+ if (byte_count < n)
+ n = byte_count;
memcpy(collect, victim, n);
eat(n);
collect += n;
@@ -280,8 +280,8 @@ static int __init do_header(void)
static int __init do_skip(void)
{
- if (this_header + count < next_header) {
- eat(count);
+ if (this_header + byte_count < next_header) {
+ eat(byte_count);
return 1;
} else {
eat(next_header - this_header);
@@ -292,9 +292,9 @@ static int __init do_skip(void)
static int __init do_reset(void)
{
- while(count && *victim == '\0')
+ while (byte_count && *victim == '\0')
eat(1);
- if (count && (this_header & 3))
+ if (byte_count && (this_header & 3))
error("broken padding");
return 1;
}
@@ -309,11 +309,11 @@ static int __init maybe_link(void)
return 0;
}
-static void __init clean_path(char *path, umode_t mode)
+static void __init clean_path(char *path, umode_t fmode)
{
struct stat st;
- if (!sys_newlstat(path, &st) && (st.st_mode^mode) & S_IFMT) {
+ if (!sys_newlstat(path, &st) && (st.st_mode ^ fmode) & S_IFMT) {
if (S_ISDIR(st.st_mode))
sys_rmdir(path);
else
@@ -368,7 +368,7 @@ static int __init do_name(void)
static int __init do_copy(void)
{
- if (count >= body_len) {
+ if (byte_count >= body_len) {
if (xwrite(wfd, victim, body_len) != body_len)
error("write error");
sys_close(wfd);
@@ -378,10 +378,10 @@ static int __init do_copy(void)
state = SkipIt;
return 0;
} else {
- if (xwrite(wfd, victim, count) != count)
+ if (xwrite(wfd, victim, byte_count) != byte_count)
error("write error");
- body_len -= count;
- eat(count);
+ body_len -= byte_count;
+ eat(byte_count);
return 1;
}
}
@@ -411,12 +411,12 @@ static __initdata int (*actions[])(void) = {
static long __init write_buffer(char *buf, unsigned long len)
{
- count = len;
+ byte_count = len;
victim = buf;
while (!actions[state]())
;
- return len - count;
+ return len - byte_count;
}
static long __init flush_buffer(void *bufv, unsigned long len)
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index c3f0326e98db..e8075b247497 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -123,7 +123,6 @@ static int proc_ipcauto_dointvec_minmax(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table ipc_table;
- size_t lenp_bef = *lenp;
int oldval;
int rc;
@@ -133,7 +132,7 @@ static int proc_ipcauto_dointvec_minmax(struct ctl_table *table, int write,
rc = proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos);
- if (write && !rc && lenp_bef == *lenp) {
+ if (write && !rc) {
int newval = *((int *)(ipc_table.data));
/*
* The file "auto_msgmni" has correctly been set.
diff --git a/ipc/shm.c b/ipc/shm.c
index 7fc9f9f3a26b..01454796ba3c 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -1172,13 +1172,6 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
if (find_vma_intersection(current->mm, addr, addr + size))
goto invalid;
- /*
- * If shm segment goes below stack, make sure there is some
- * space left for the stack to grow (at least 4 pages).
- */
- if (addr < current->mm->start_stack &&
- addr > current->mm->start_stack - size - PAGE_SIZE * 5)
- goto invalid;
}
addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate);
diff --git a/ipc/util.c b/ipc/util.c
index d73b7af581e2..88adc329888c 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -892,28 +892,16 @@ static const struct seq_operations sysvipc_proc_seqops = {
static int sysvipc_proc_open(struct inode *inode, struct file *file)
{
- int ret;
- struct seq_file *seq;
struct ipc_proc_iter *iter;
- ret = -ENOMEM;
- iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ iter = __seq_open_private(file, &sysvipc_proc_seqops, sizeof(*iter));
if (!iter)
- goto out;
-
- ret = seq_open(file, &sysvipc_proc_seqops);
- if (ret) {
- kfree(iter);
- goto out;
- }
-
- seq = file->private_data;
- seq->private = iter;
+ return -ENOMEM;
iter->iface = PDE_DATA(inode);
iter->ns = get_ipc_ns(current->nsproxy->ipc_ns);
-out:
- return ret;
+
+ return 0;
}
static int sysvipc_proc_release(struct inode *inode, struct file *file)
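The rewrite above is the standard __seq_open_private() idiom: the helper allocates a zeroed private struct of the requested size, attaches it to the seq_file behind file->private_data, and hands it back for any remaining setup. A hypothetical open handler using the same pattern (my_iter and my_seqops are illustrative names):

    static int example_proc_open(struct inode *inode, struct file *file)
    {
        struct my_iter *iter;

        /* allocate + zero the iterator and wire it into the seq_file */
        iter = __seq_open_private(file, &my_seqops, sizeof(*iter));
        if (!iter)
            return -ENOMEM;

        iter->pos = 0;  /* any extra per-open initialization goes here */
        return 0;
    }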
diff --git a/kernel/acct.c b/kernel/acct.c
index b4c667d22e79..33738ef972f3 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -472,7 +472,6 @@ static void do_acct_process(struct bsd_acct_struct *acct)
acct_t ac;
unsigned long flim;
const struct cred *orig_cred;
- struct pid_namespace *ns = acct->ns;
struct file *file = acct->file;
/*
@@ -500,10 +499,15 @@ static void do_acct_process(struct bsd_acct_struct *acct)
ac.ac_gid16 = ac.ac_gid;
#endif
#if ACCT_VERSION == 3
- ac.ac_pid = task_tgid_nr_ns(current, ns);
- rcu_read_lock();
- ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
- rcu_read_unlock();
+ {
+ struct pid_namespace *ns = acct->ns;
+
+ ac.ac_pid = task_tgid_nr_ns(current, ns);
+ rcu_read_lock();
+ ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent),
+ ns);
+ rcu_read_unlock();
+ }
#endif
/*
* Get freeze protection. If the fs is frozen, just skip the write
diff --git a/kernel/async.c b/kernel/async.c
index 61f023ce0228..4c3773c0bf63 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -115,7 +115,7 @@ static void async_run_entry_fn(struct work_struct *work)
/* 1) run (and print duration) */
if (initcall_debug && system_state == SYSTEM_BOOTING) {
- printk(KERN_DEBUG "calling %lli_%pF @ %i\n",
+ pr_debug("calling %lli_%pF @ %i\n",
(long long)entry->cookie,
entry->func, task_pid_nr(current));
calltime = ktime_get();
@@ -124,7 +124,7 @@ static void async_run_entry_fn(struct work_struct *work)
if (initcall_debug && system_state == SYSTEM_BOOTING) {
rettime = ktime_get();
delta = ktime_sub(rettime, calltime);
- printk(KERN_DEBUG "initcall %lli_%pF returned 0 after %lld usecs\n",
+ pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n",
(long long)entry->cookie,
entry->func,
(long long)ktime_to_ns(delta) >> 10);
@@ -285,7 +285,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain
ktime_t uninitialized_var(starttime), delta, endtime;
if (initcall_debug && system_state == SYSTEM_BOOTING) {
- printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
+ pr_debug("async_waiting @ %i\n", task_pid_nr(current));
starttime = ktime_get();
}
@@ -295,7 +295,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain
endtime = ktime_get();
delta = ktime_sub(endtime, starttime);
- printk(KERN_DEBUG "async_continuing @ %i after %lli usec\n",
+ pr_debug("async_continuing @ %i after %lli usec\n",
task_pid_nr(current),
(long long)ktime_to_ns(delta) >> 10);
}
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index a23bcba7b594..07ce18ca71e0 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -473,6 +473,7 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs,
int cpu;
int trace_on = 0;
int online_cpus = num_online_cpus();
+ u64 time_left;
kgdb_info[ks->cpu].enter_kgdb++;
kgdb_info[ks->cpu].exception_state |= exception_state;
@@ -597,9 +598,13 @@ return_normal:
/*
* Wait for the other CPUs to be notified and be waiting for us:
*/
- while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) +
- atomic_read(&slaves_in_kgdb)) != online_cpus)
+ time_left = loops_per_jiffy * HZ;
+ while (kgdb_do_roundup && --time_left &&
+ (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) !=
+ online_cpus)
cpu_relax();
+ if (!time_left)
+ pr_crit("KGDB: Timed out waiting for secondary CPUs.\n");
/*
* At this point the primary processor is completely
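The new roundup wait is a bounded busy-wait: loops_per_jiffy * HZ iterations is roughly one second of spinning (only roughly, since cpu_relax() is not calibrated), after which the master gives up instead of hanging forever on an unresponsive CPU. The same idiom in isolation, with condition() as a hypothetical stand-in for the CPU-count check:

    u64 time_left = loops_per_jiffy * HZ;  /* ~1s worth of cpu_relax() spins */

    while (!condition() && --time_left)
        cpu_relax();
    if (!time_left)
        pr_crit("timed out waiting for condition\n");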
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 8859ca34dcfe..15e1a7af5dd0 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -129,6 +129,10 @@ int kdb_stub(struct kgdb_state *ks)
ks->pass_exception = 1;
KDB_FLAG_SET(CATASTROPHIC);
}
+ /* set CATASTROPHIC if the system contains unresponsive processors */
+ for_each_online_cpu(i)
+ if (!kgdb_info[i].enter_kgdb)
+ KDB_FLAG_SET(CATASTROPHIC);
if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) {
KDB_STATE_CLEAR(SSBPT);
KDB_STATE_CLEAR(DOING_SS);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 379650b984f8..0633e784f8cd 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2157,6 +2157,8 @@ static void kdb_cpu_status(void)
for (start_cpu = -1, i = 0; i < NR_CPUS; i++) {
if (!cpu_online(i)) {
state = 'F'; /* cpu is offline */
+ } else if (!kgdb_info[i].enter_kgdb) {
+ state = 'D'; /* cpu is online but unresponsive */
} else {
state = ' '; /* cpu is responding to kdb */
if (kdb_task_state_char(KDB_TSK(i)) == 'I')
@@ -2210,7 +2212,7 @@ static int kdb_cpu(int argc, const char **argv)
/*
* Validate cpunum
*/
- if ((cpunum > NR_CPUS) || !cpu_online(cpunum))
+ if ((cpunum > NR_CPUS) || !kgdb_info[cpunum].enter_kgdb)
return KDB_BADCPUNUM;
dbg_switch_cpu = cpunum;
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index d7edf7d5bf5c..4e4ab5dd104a 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -267,7 +267,6 @@ static u32 clear_idx;
#define LOG_ALIGN __alignof__(struct printk_log)
#endif
#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
-#define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT)
static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
static char *log_buf = __log_buf;
static u32 log_buf_len = __LOG_BUF_LEN;
@@ -852,6 +851,9 @@ static int __init log_buf_len_setup(char *str)
}
early_param("log_buf_len", log_buf_len_setup);
+#ifdef CONFIG_SMP
+#define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT)
+
static void __init log_buf_add_cpu(void)
{
unsigned int cpu_extra;
@@ -878,6 +880,9 @@ static void __init log_buf_add_cpu(void)
log_buf_len_update(cpu_extra + __LOG_BUF_LEN);
}
+#else /* !CONFIG_SMP */
+static inline void log_buf_add_cpu(void) {}
+#endif /* CONFIG_SMP */
void __init setup_log_buf(int early)
{
diff --git a/kernel/resource.c b/kernel/resource.c
index 46322019ab7d..0bcebffc4e77 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -491,6 +491,42 @@ int __weak page_is_ram(unsigned long pfn)
}
EXPORT_SYMBOL_GPL(page_is_ram);
+/*
+ * Search for a resource entry that fully contains the specified region.
+ * If found, return 1 if it is RAM, 0 if not.
+ * If not found, or if the region is not fully contained, return -1.
+ *
+ * Used by the ioremap functions to ensure the user is not remapping RAM and is
+ * a vast speedup over walking through the resource table page by page.
+ */
+int region_is_ram(resource_size_t start, unsigned long size)
+{
+ struct resource *p;
+ resource_size_t end = start + size - 1;
+ int flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ const char *name = "System RAM";
+ int ret = -1;
+
+ read_lock(&resource_lock);
+ for (p = iomem_resource.child; p ; p = p->sibling) {
+ if (end < p->start)
+ continue;
+
+ if (p->start <= start && end <= p->end) {
+ /* resource fully contains region */
+ if ((p->flags != flags) || strcmp(p->name, name))
+ ret = 0;
+ else
+ ret = 1;
+ break;
+ }
+ if (p->end < start)
+ break; /* not found */
+ }
+ read_unlock(&resource_lock);
+ return ret;
+}
+
void __weak arch_remove_reservations(struct resource *avail)
{
}
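A hypothetical ioremap-style caller of region_is_ram(): a definitive 1/0 answer short-circuits the check, while -1 (no single resource fully contains the region) falls back to the old page-by-page walk (range_is_ram_slow() is an illustrative name for that slow path):

    /* inside a hypothetical __ioremap_caller()-style function */
    int ram = region_is_ram(phys_addr, size);

    if (ram == 1)
        return NULL;  /* refuse to remap System RAM */
    if (ram == -1)    /* inconclusive: fall back to the per-page walk */
        ram = range_is_ram_slow(phys_addr, size);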
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8dab8d162367..7ea600683db2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1951,7 +1951,7 @@ void task_numa_work(struct callback_head *work)
vma = mm->mmap;
}
for (; vma; vma = vma->vm_next) {
- if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
+ if (!vma_migratable(vma) || !vma_policy_mof(vma))
continue;
/*
diff --git a/kernel/sys.c b/kernel/sys.c
index ce8129192a26..e0950d9d9c13 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -62,28 +62,28 @@
#include <asm/unistd.h>
#ifndef SET_UNALIGN_CTL
-# define SET_UNALIGN_CTL(a,b) (-EINVAL)
+# define SET_UNALIGN_CTL(a, b) (-EINVAL)
#endif
#ifndef GET_UNALIGN_CTL
-# define GET_UNALIGN_CTL(a,b) (-EINVAL)
+# define GET_UNALIGN_CTL(a, b) (-EINVAL)
#endif
#ifndef SET_FPEMU_CTL
-# define SET_FPEMU_CTL(a,b) (-EINVAL)
+# define SET_FPEMU_CTL(a, b) (-EINVAL)
#endif
#ifndef GET_FPEMU_CTL
-# define GET_FPEMU_CTL(a,b) (-EINVAL)
+# define GET_FPEMU_CTL(a, b) (-EINVAL)
#endif
#ifndef SET_FPEXC_CTL
-# define SET_FPEXC_CTL(a,b) (-EINVAL)
+# define SET_FPEXC_CTL(a, b) (-EINVAL)
#endif
#ifndef GET_FPEXC_CTL
-# define GET_FPEXC_CTL(a,b) (-EINVAL)
+# define GET_FPEXC_CTL(a, b) (-EINVAL)
#endif
#ifndef GET_ENDIAN
-# define GET_ENDIAN(a,b) (-EINVAL)
+# define GET_ENDIAN(a, b) (-EINVAL)
#endif
#ifndef SET_ENDIAN
-# define SET_ENDIAN(a,b) (-EINVAL)
+# define SET_ENDIAN(a, b) (-EINVAL)
#endif
#ifndef GET_TSC_CTL
# define GET_TSC_CTL(a) (-EINVAL)
@@ -182,39 +182,40 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
rcu_read_lock();
read_lock(&tasklist_lock);
switch (which) {
- case PRIO_PROCESS:
- if (who)
- p = find_task_by_vpid(who);
- else
- p = current;
- if (p)
- error = set_one_prio(p, niceval, error);
- break;
- case PRIO_PGRP:
- if (who)
- pgrp = find_vpid(who);
- else
- pgrp = task_pgrp(current);
- do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
- error = set_one_prio(p, niceval, error);
- } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
- break;
- case PRIO_USER:
- uid = make_kuid(cred->user_ns, who);
- user = cred->user;
- if (!who)
- uid = cred->uid;
- else if (!uid_eq(uid, cred->uid) &&
- !(user = find_user(uid)))
+ case PRIO_PROCESS:
+ if (who)
+ p = find_task_by_vpid(who);
+ else
+ p = current;
+ if (p)
+ error = set_one_prio(p, niceval, error);
+ break;
+ case PRIO_PGRP:
+ if (who)
+ pgrp = find_vpid(who);
+ else
+ pgrp = task_pgrp(current);
+ do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
+ error = set_one_prio(p, niceval, error);
+ } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
+ break;
+ case PRIO_USER:
+ uid = make_kuid(cred->user_ns, who);
+ user = cred->user;
+ if (!who)
+ uid = cred->uid;
+ else if (!uid_eq(uid, cred->uid)) {
+ user = find_user(uid);
+ if (!user)
goto out_unlock; /* No processes for this user */
-
- do_each_thread(g, p) {
- if (uid_eq(task_uid(p), uid))
- error = set_one_prio(p, niceval, error);
- } while_each_thread(g, p);
- if (!uid_eq(uid, cred->uid))
- free_uid(user); /* For find_user() */
- break;
+ }
+ do_each_thread(g, p) {
+ if (uid_eq(task_uid(p), uid))
+ error = set_one_prio(p, niceval, error);
+ } while_each_thread(g, p);
+ if (!uid_eq(uid, cred->uid))
+ free_uid(user); /* For find_user() */
+ break;
}
out_unlock:
read_unlock(&tasklist_lock);
@@ -244,47 +245,48 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
rcu_read_lock();
read_lock(&tasklist_lock);
switch (which) {
- case PRIO_PROCESS:
- if (who)
- p = find_task_by_vpid(who);
- else
- p = current;
- if (p) {
+ case PRIO_PROCESS:
+ if (who)
+ p = find_task_by_vpid(who);
+ else
+ p = current;
+ if (p) {
+ niceval = nice_to_rlimit(task_nice(p));
+ if (niceval > retval)
+ retval = niceval;
+ }
+ break;
+ case PRIO_PGRP:
+ if (who)
+ pgrp = find_vpid(who);
+ else
+ pgrp = task_pgrp(current);
+ do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
+ niceval = nice_to_rlimit(task_nice(p));
+ if (niceval > retval)
+ retval = niceval;
+ } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
+ break;
+ case PRIO_USER:
+ uid = make_kuid(cred->user_ns, who);
+ user = cred->user;
+ if (!who)
+ uid = cred->uid;
+ else if (!uid_eq(uid, cred->uid)) {
+ user = find_user(uid);
+ if (!user)
+ goto out_unlock; /* No processes for this user */
+ }
+ do_each_thread(g, p) {
+ if (uid_eq(task_uid(p), uid)) {
niceval = nice_to_rlimit(task_nice(p));
if (niceval > retval)
retval = niceval;
}
- break;
- case PRIO_PGRP:
- if (who)
- pgrp = find_vpid(who);
- else
- pgrp = task_pgrp(current);
- do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
- niceval = nice_to_rlimit(task_nice(p));
- if (niceval > retval)
- retval = niceval;
- } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
- break;
- case PRIO_USER:
- uid = make_kuid(cred->user_ns, who);
- user = cred->user;
- if (!who)
- uid = cred->uid;
- else if (!uid_eq(uid, cred->uid) &&
- !(user = find_user(uid)))
- goto out_unlock; /* No processes for this user */
-
- do_each_thread(g, p) {
- if (uid_eq(task_uid(p), uid)) {
- niceval = nice_to_rlimit(task_nice(p));
- if (niceval > retval)
- retval = niceval;
- }
- } while_each_thread(g, p);
- if (!uid_eq(uid, cred->uid))
- free_uid(user); /* for find_user() */
- break;
+ } while_each_thread(g, p);
+ if (!uid_eq(uid, cred->uid))
+ free_uid(user); /* for find_user() */
+ break;
}
out_unlock:
read_unlock(&tasklist_lock);
@@ -306,7 +308,7 @@ out_unlock:
*
* The general idea is that a program which uses just setregid() will be
* 100% compatible with BSD. A program which uses just setgid() will be
- * 100% compatible with POSIX with saved IDs.
+ * 100% compatible with POSIX with saved IDs.
*
* SMP: There are not races, the GIDs are checked only by filesystem
* operations (as far as semantic preservation is concerned).
@@ -364,7 +366,7 @@ error:
}
/*
- * setgid() is implemented like SysV w/ SAVED_IDS
+ * setgid() is implemented like SysV w/ SAVED_IDS
*
* SMP: Same implicit races as above.
*/
@@ -442,7 +444,7 @@ static int set_user(struct cred *new)
*
* The general idea is that a program which uses just setreuid() will be
* 100% compatible with BSD. A program which uses just setuid() will be
- * 100% compatible with POSIX with saved IDs.
+ * 100% compatible with POSIX with saved IDs.
*/
SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
{
@@ -503,17 +505,17 @@ error:
abort_creds(new);
return retval;
}
-
+
/*
- * setuid() is implemented like SysV with SAVED_IDS
- *
+ * setuid() is implemented like SysV with SAVED_IDS
+ *
* Note that SAVED_ID's is deficient in that a setuid root program
- * like sendmail, for example, cannot set its uid to be a normal
+ * like sendmail, for example, cannot set its uid to be a normal
* user and then switch back, because if you're root, setuid() sets
* the saved uid too. If you don't like this, blame the bright people
* in the POSIX committee and/or USG. Note that the BSD-style setreuid()
* will allow a root program to temporarily drop privileges and be able to
- * regain them by swapping the real and effective uid.
+ * regain them by swapping the real and effective uid.
*/
SYSCALL_DEFINE1(setuid, uid_t, uid)
{
@@ -637,10 +639,12 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t _
euid = from_kuid_munged(cred->user_ns, cred->euid);
suid = from_kuid_munged(cred->user_ns, cred->suid);
- if (!(retval = put_user(ruid, ruidp)) &&
- !(retval = put_user(euid, euidp)))
- retval = put_user(suid, suidp);
-
+ retval = put_user(ruid, ruidp);
+ if (!retval) {
+ retval = put_user(euid, euidp);
+ if (!retval)
+ return put_user(suid, suidp);
+ }
return retval;
}
@@ -709,9 +713,12 @@ SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t _
egid = from_kgid_munged(cred->user_ns, cred->egid);
sgid = from_kgid_munged(cred->user_ns, cred->sgid);
- if (!(retval = put_user(rgid, rgidp)) &&
- !(retval = put_user(egid, egidp)))
- retval = put_user(sgid, sgidp);
+ retval = put_user(rgid, rgidp);
+ if (!retval) {
+ retval = put_user(egid, egidp);
+ if (!retval)
+ retval = put_user(sgid, sgidp);
+ }
return retval;
}
@@ -1284,7 +1291,6 @@ SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
/*
* Back compatibility for getrlimit. Needed for some apps.
*/
-
SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
struct rlimit __user *, rlim)
{
@@ -1299,7 +1305,7 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
x.rlim_cur = 0x7FFFFFFF;
if (x.rlim_max > 0x7FFFFFFF)
x.rlim_max = 0x7FFFFFFF;
- return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0;
+ return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0;
}
#endif
@@ -1527,7 +1533,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
cputime_t tgutime, tgstime, utime, stime;
unsigned long maxrss = 0;
- memset((char *) r, 0, sizeof *r);
+ memset((char *)r, 0, sizeof(*r));
utime = stime = 0;
if (who == RUSAGE_THREAD) {
@@ -1541,41 +1547,41 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
return;
switch (who) {
- case RUSAGE_BOTH:
- case RUSAGE_CHILDREN:
- utime = p->signal->cutime;
- stime = p->signal->cstime;
- r->ru_nvcsw = p->signal->cnvcsw;
- r->ru_nivcsw = p->signal->cnivcsw;
- r->ru_minflt = p->signal->cmin_flt;
- r->ru_majflt = p->signal->cmaj_flt;
- r->ru_inblock = p->signal->cinblock;
- r->ru_oublock = p->signal->coublock;
- maxrss = p->signal->cmaxrss;
-
- if (who == RUSAGE_CHILDREN)
- break;
-
- case RUSAGE_SELF:
- thread_group_cputime_adjusted(p, &tgutime, &tgstime);
- utime += tgutime;
- stime += tgstime;
- r->ru_nvcsw += p->signal->nvcsw;
- r->ru_nivcsw += p->signal->nivcsw;
- r->ru_minflt += p->signal->min_flt;
- r->ru_majflt += p->signal->maj_flt;
- r->ru_inblock += p->signal->inblock;
- r->ru_oublock += p->signal->oublock;
- if (maxrss < p->signal->maxrss)
- maxrss = p->signal->maxrss;
- t = p;
- do {
- accumulate_thread_rusage(t, r);
- } while_each_thread(p, t);
+ case RUSAGE_BOTH:
+ case RUSAGE_CHILDREN:
+ utime = p->signal->cutime;
+ stime = p->signal->cstime;
+ r->ru_nvcsw = p->signal->cnvcsw;
+ r->ru_nivcsw = p->signal->cnivcsw;
+ r->ru_minflt = p->signal->cmin_flt;
+ r->ru_majflt = p->signal->cmaj_flt;
+ r->ru_inblock = p->signal->cinblock;
+ r->ru_oublock = p->signal->coublock;
+ maxrss = p->signal->cmaxrss;
+
+ if (who == RUSAGE_CHILDREN)
break;
- default:
- BUG();
+ case RUSAGE_SELF:
+ thread_group_cputime_adjusted(p, &tgutime, &tgstime);
+ utime += tgutime;
+ stime += tgstime;
+ r->ru_nvcsw += p->signal->nvcsw;
+ r->ru_nivcsw += p->signal->nivcsw;
+ r->ru_minflt += p->signal->min_flt;
+ r->ru_majflt += p->signal->maj_flt;
+ r->ru_inblock += p->signal->inblock;
+ r->ru_oublock += p->signal->oublock;
+ if (maxrss < p->signal->maxrss)
+ maxrss = p->signal->maxrss;
+ t = p;
+ do {
+ accumulate_thread_rusage(t, r);
+ } while_each_thread(p, t);
+ break;
+
+ default:
+ BUG();
}
unlock_task_sighand(p, &flags);
@@ -1585,6 +1591,7 @@ out:
if (who != RUSAGE_CHILDREN) {
struct mm_struct *mm = get_task_mm(p);
+
if (mm) {
setmax_mm_hiwater_rss(&maxrss, mm);
mmput(mm);
@@ -1596,6 +1603,7 @@ out:
int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
{
struct rusage r;
+
k_getrusage(p, who, &r);
return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
}
@@ -1628,12 +1636,14 @@ SYSCALL_DEFINE1(umask, int, mask)
return mask;
}
-static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
+static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
{
struct fd exe;
struct inode *inode;
int err;
+ VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
+
exe = fdget(fd);
if (!exe.file)
return -EBADF;
@@ -1654,8 +1664,6 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
if (err)
goto exit;
- down_write(&mm->mmap_sem);
-
/*
* Forbid mm->exe_file change if old file still mapped.
*/
@@ -1667,7 +1675,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
if (vma->vm_file &&
path_equal(&vma->vm_file->f_path,
&mm->exe_file->f_path))
- goto exit_unlock;
+ goto exit;
}
/*
@@ -1678,34 +1686,222 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
*/
err = -EPERM;
if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
- goto exit_unlock;
+ goto exit;
err = 0;
set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */
-exit_unlock:
- up_write(&mm->mmap_sem);
-
exit:
fdput(exe);
return err;
}
+#ifdef CONFIG_CHECKPOINT_RESTORE
+/*
+ * WARNING: we don't require any capability here, so be very careful
+ * about what userspace is allowed to modify.
+ */
+static int validate_prctl_map(struct prctl_mm_map *prctl_map)
+{
+ unsigned long mmap_max_addr = TASK_SIZE;
+ struct mm_struct *mm = current->mm;
+ int error = -EINVAL, i;
+
+ static const unsigned char offsets[] = {
+ offsetof(struct prctl_mm_map, start_code),
+ offsetof(struct prctl_mm_map, end_code),
+ offsetof(struct prctl_mm_map, start_data),
+ offsetof(struct prctl_mm_map, end_data),
+ offsetof(struct prctl_mm_map, start_brk),
+ offsetof(struct prctl_mm_map, brk),
+ offsetof(struct prctl_mm_map, start_stack),
+ offsetof(struct prctl_mm_map, arg_start),
+ offsetof(struct prctl_mm_map, arg_end),
+ offsetof(struct prctl_mm_map, env_start),
+ offsetof(struct prctl_mm_map, env_end),
+ };
+
+ /*
+ * Make sure the members do not point somewhere outside
+ * of the allowed address space.
+ */
+ for (i = 0; i < ARRAY_SIZE(offsets); i++) {
+ u64 val = *(u64 *)((char *)prctl_map + offsets[i]);
+
+ if ((unsigned long)val >= mmap_max_addr ||
+ (unsigned long)val < mmap_min_addr)
+ goto out;
+ }
+
+ /*
+ * Make sure the pairs are ordered.
+ */
+#define __prctl_check_order(__m1, __op, __m2) \
+ ((unsigned long)prctl_map->__m1 __op \
+ (unsigned long)prctl_map->__m2) ? 0 : -EINVAL
+ error = __prctl_check_order(start_code, <, end_code);
+ error |= __prctl_check_order(start_data, <, end_data);
+ error |= __prctl_check_order(start_brk, <=, brk);
+ error |= __prctl_check_order(arg_start, <=, arg_end);
+ error |= __prctl_check_order(env_start, <=, env_end);
+ if (error)
+ goto out;
+#undef __prctl_check_order
+
+ error = -EINVAL;
+
+ /*
+ * @brk should be after @end_data in traditional maps.
+ */
+ if (prctl_map->start_brk <= prctl_map->end_data ||
+ prctl_map->brk <= prctl_map->end_data)
+ goto out;
+
+ /*
+ * Nor should we allow overriding the limits if they are set.
+ */
+ if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk,
+ prctl_map->start_brk, prctl_map->end_data,
+ prctl_map->start_data))
+ goto out;
+
+ /*
+ * Someone is trying to cheat the auxv vector.
+ */
+ if (prctl_map->auxv_size) {
+ if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv))
+ goto out;
+ }
+
+ /*
+ * Finally, make sure the caller has the right to change
+ * the /proc/pid/exe link: only local root should be
+ * allowed to.
+ */
+ if (prctl_map->exe_fd != (u32)-1) {
+ struct user_namespace *ns = current_user_ns();
+ const struct cred *cred = current_cred();
+
+ if (!uid_eq(cred->uid, make_kuid(ns, 0)) ||
+ !gid_eq(cred->gid, make_kgid(ns, 0)))
+ goto out;
+ }
+
+ error = 0;
+out:
+ return error;
+}
+
+static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size)
+{
+ struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, };
+ unsigned long user_auxv[AT_VECTOR_SIZE];
+ struct mm_struct *mm = current->mm;
+ int error;
+
+ BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
+ BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256);
+
+ if (opt == PR_SET_MM_MAP_SIZE)
+ return put_user((unsigned int)sizeof(prctl_map),
+ (unsigned int __user *)addr);
+
+ if (data_size != sizeof(prctl_map))
+ return -EINVAL;
+
+ if (copy_from_user(&prctl_map, addr, sizeof(prctl_map)))
+ return -EFAULT;
+
+ error = validate_prctl_map(&prctl_map);
+ if (error)
+ return error;
+
+ if (prctl_map.auxv_size) {
+ memset(user_auxv, 0, sizeof(user_auxv));
+ if (copy_from_user(user_auxv,
+ (const void __user *)prctl_map.auxv,
+ prctl_map.auxv_size))
+ return -EFAULT;
+
+ /* Last entry must be AT_NULL as specification requires */
+ user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL;
+ user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
+ }
+
+ down_write(&mm->mmap_sem);
+ if (prctl_map.exe_fd != (u32)-1)
+ error = prctl_set_mm_exe_file_locked(mm, prctl_map.exe_fd);
+ downgrade_write(&mm->mmap_sem);
+ if (error)
+ goto out;
+
+ /*
+ * We don't validate that these members point to real, present
+ * VMAs because the application may already have unmapped the
+ * corresponding VMAs, and the kernel uses these members mostly
+ * for statistics output in procfs, except for
+ *
+ * - @start_brk/@brk, which are used in do_brk; the kernel looks up
+ * VMAs when updating these members, so anything wrong written here
+ * makes the kernel swear at the userspace program but won't lead
+ * to any problem in the kernel itself
+ */
+
+ mm->start_code = prctl_map.start_code;
+ mm->end_code = prctl_map.end_code;
+ mm->start_data = prctl_map.start_data;
+ mm->end_data = prctl_map.end_data;
+ mm->start_brk = prctl_map.start_brk;
+ mm->brk = prctl_map.brk;
+ mm->start_stack = prctl_map.start_stack;
+ mm->arg_start = prctl_map.arg_start;
+ mm->arg_end = prctl_map.arg_end;
+ mm->env_start = prctl_map.env_start;
+ mm->env_end = prctl_map.env_end;
+
+ /*
+ * Note this update of @saved_auxv is lockless, so if someone
+ * reads this member in procfs while we're updating, it may see
+ * partly updated results. This is a known and acceptable
+ * trade-off: we leave it as is rather than introduce additional
+ * locks here and make the kernel more complex.
+ */
+ if (prctl_map.auxv_size)
+ memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv));
+
+ error = 0;
+out:
+ up_read(&mm->mmap_sem);
+ return error;
+}
+#endif /* CONFIG_CHECKPOINT_RESTORE */
+
static int prctl_set_mm(int opt, unsigned long addr,
unsigned long arg4, unsigned long arg5)
{
- unsigned long rlim = rlimit(RLIMIT_DATA);
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
int error;
- if (arg5 || (arg4 && opt != PR_SET_MM_AUXV))
+ if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV &&
+ opt != PR_SET_MM_MAP &&
+ opt != PR_SET_MM_MAP_SIZE)))
return -EINVAL;
+#ifdef CONFIG_CHECKPOINT_RESTORE
+ if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE)
+ return prctl_set_mm_map(opt, (const void __user *)addr, arg4);
+#endif
+
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- if (opt == PR_SET_MM_EXE_FILE)
- return prctl_set_mm_exe_file(mm, (unsigned int)addr);
+ if (opt == PR_SET_MM_EXE_FILE) {
+ down_write(&mm->mmap_sem);
+ error = prctl_set_mm_exe_file_locked(mm, (unsigned int)addr);
+ up_write(&mm->mmap_sem);
+ return error;
+ }
if (addr >= TASK_SIZE || addr < mmap_min_addr)
return -EINVAL;
@@ -1733,9 +1929,8 @@ static int prctl_set_mm(int opt, unsigned long addr,
if (addr <= mm->end_data)
goto out;
- if (rlim < RLIM_INFINITY &&
- (mm->brk - addr) +
- (mm->end_data - mm->start_data) > rlim)
+ if (check_data_rlimit(rlimit(RLIMIT_DATA), mm->brk, addr,
+ mm->end_data, mm->start_data))
goto out;
mm->start_brk = addr;
@@ -1745,9 +1940,8 @@ static int prctl_set_mm(int opt, unsigned long addr,
if (addr <= mm->end_data)
goto out;
- if (rlim < RLIM_INFINITY &&
- (addr - mm->start_brk) +
- (mm->end_data - mm->start_data) > rlim)
+ if (check_data_rlimit(rlimit(RLIMIT_DATA), addr, mm->start_brk,
+ mm->end_data, mm->start_data))
goto out;
mm->brk = addr;
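check_data_rlimit(), which replaces the open-coded arithmetic in both hunks above and is also used by validate_prctl_map() earlier, centralizes the RLIMIT_DATA test. Roughly (a sketch of the include/linux/mm.h helper, not quoted verbatim):

    static inline int check_data_rlimit(unsigned long rlim, unsigned long new,
                                        unsigned long start,
                                        unsigned long end_data,
                                        unsigned long start_data)
    {
        /* reject if the new span plus the data segment exceeds RLIMIT_DATA */
        if (rlim < RLIM_INFINITY &&
            (new - start) + (end_data - start_data) > rlim)
            return -ENOSPC;
        return 0;
    }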
@@ -2023,6 +2217,7 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
{
int err = 0;
int cpu = raw_smp_processor_id();
+
if (cpup)
err |= put_user(cpu, cpup);
if (nodep)
@@ -2135,7 +2330,7 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
/* Check to see if any memory value is too large for 32-bit and scale
* down if needed
*/
- if ((s.totalram >> 32) || (s.totalswap >> 32)) {
+ if (upper_32_bits(s.totalram) || upper_32_bits(s.totalswap)) {
int bitcount = 0;
while (s.mem_unit < PAGE_SIZE) {
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 75875a741b5e..91180987e40e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1460,13 +1460,6 @@ static struct ctl_table vm_table[] = {
.extra2 = &one,
},
#endif
- {
- .procname = "scan_unevictable_pages",
- .data = &scan_unevictable_pages,
- .maxlen = sizeof(scan_unevictable_pages),
- .mode = 0644,
- .proc_handler = scan_unevictable_handler,
- },
#ifdef CONFIG_MEMORY_FAILURE
{
.procname = "memory_failure_early_kill",
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 42b463ad90f2..a81083797e97 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -73,7 +73,7 @@ static DEFINE_SPINLOCK(hash_lock);
* SIGEV values. Here we put out an error if this assumption fails.
*/
#if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \
- ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD))
+ ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD))
#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!"
#endif
@@ -254,7 +254,8 @@ static int posix_get_monotonic_coarse(clockid_t which_clock,
return 0;
}
-static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
+static int posix_get_coarse_res(const clockid_t which_clock,
+ struct timespec *tp)
{
*tp = ktime_to_timespec(KTIME_LOW_RES);
return 0;
@@ -335,14 +336,16 @@ static __init int init_posix_timers(void)
posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime);
posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic);
posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
- posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
- posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
+ posix_timers_register_clock(CLOCK_REALTIME_COARSE,
+ &clock_realtime_coarse);
+ posix_timers_register_clock(CLOCK_MONOTONIC_COARSE,
+ &clock_monotonic_coarse);
posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime);
posix_timers_register_clock(CLOCK_TAI, &clock_tai);
posix_timers_cache = kmem_cache_create("posix_timers_cache",
- sizeof (struct k_itimer), 0, SLAB_PANIC,
- NULL);
+ sizeof(struct k_itimer), 0,
+ SLAB_PANIC, NULL);
return 0;
}
@@ -496,11 +499,11 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
return ret;
}
-static struct pid *good_sigevent(sigevent_t * event)
+static struct pid *good_sigevent(sigevent_t *event)
{
struct task_struct *rtn = current->group_leader;
- if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
+ if ((event->sigev_notify & SIGEV_THREAD_ID) &&
(!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) ||
!same_thread_group(rtn, current) ||
(event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL))
@@ -517,18 +520,18 @@ void posix_timers_register_clock(const clockid_t clock_id,
struct k_clock *new_clock)
{
if ((unsigned) clock_id >= MAX_CLOCKS) {
- printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n",
+ pr_warn("POSIX clock register failed for clock_id %d\n",
clock_id);
return;
}
if (!new_clock->clock_get) {
- printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n",
+ pr_warn("POSIX clock id %d lacks clock_get()\n",
clock_id);
return;
}
if (!new_clock->clock_getres) {
- printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n",
+ pr_warn("POSIX clock id %d lacks clock_getres()\n",
clock_id);
return;
}
@@ -537,7 +540,7 @@ void posix_timers_register_clock(const clockid_t clock_id,
}
EXPORT_SYMBOL_GPL(posix_timers_register_clock);
-static struct k_itimer * alloc_posix_timer(void)
+static struct k_itimer *alloc_posix_timer(void)
{
struct k_itimer *tmr;
tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL);
@@ -624,7 +627,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
new_timer->it_overrun = -1;
if (timer_event_spec) {
- if (copy_from_user(&event, timer_event_spec, sizeof (event))) {
+ if (copy_from_user(&event, timer_event_spec, sizeof(event))) {
error = -EFAULT;
goto out;
}
@@ -649,7 +652,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
new_timer->sigq->info.si_code = SI_TIMER;
if (copy_to_user(created_timer_id,
- &new_timer_id, sizeof (new_timer_id))) {
+ &new_timer_id, sizeof(new_timer_id))) {
error = -EFAULT;
goto out;
}
@@ -750,7 +753,8 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
*/
if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING ||
(timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
- timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv);
+ timr->it_overrun += (unsigned int) hrtimer_forward(timer, now,
+ iv);
remaining = ktime_sub(hrtimer_get_expires(timer), now);
/* Return 0 only, when the timer is expired and not pending */
@@ -787,7 +791,7 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
unlock_timer(timr, flags);
- if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
+ if (!ret && copy_to_user(setting, &cur_setting, sizeof(cur_setting)))
return -EFAULT;
return ret;
@@ -839,7 +843,7 @@ common_timer_set(struct k_itimer *timr, int flags,
if (hrtimer_try_to_cancel(timer) < 0)
return TIMER_RETRY;
- timr->it_requeue_pending = (timr->it_requeue_pending + 2) &
+ timr->it_requeue_pending = (timr->it_requeue_pending + 2) &
~REQUEUE_PENDING;
timr->it_overrun_last = 0;
@@ -859,9 +863,8 @@ common_timer_set(struct k_itimer *timr, int flags,
/* SIGEV_NONE timers are not queued ! See common_timer_get */
if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
/* Setup correct expiry time for relative timers */
- if (mode == HRTIMER_MODE_REL) {
+ if (mode == HRTIMER_MODE_REL)
hrtimer_add_expires(timer, timer->base->get_time());
- }
return 0;
}
@@ -884,7 +887,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
if (!new_setting)
return -EINVAL;
- if (copy_from_user(&new_spec, new_setting, sizeof (new_spec)))
+ if (copy_from_user(&new_spec, new_setting, sizeof(new_spec)))
return -EFAULT;
if (!timespec_valid(&new_spec.it_interval) ||
@@ -903,12 +906,12 @@ retry:
unlock_timer(timr, flag);
if (error == TIMER_RETRY) {
- rtn = NULL; // We already got the old time...
+ rtn = NULL; /* We already got the old time... */
goto retry;
}
if (old_setting && !error &&
- copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
+ copy_to_user(old_setting, &old_spec, sizeof(old_spec)))
error = -EFAULT;
return error;
@@ -1010,14 +1013,14 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
if (!kc || !kc->clock_set)
return -EINVAL;
- if (copy_from_user(&new_tp, tp, sizeof (*tp)))
+ if (copy_from_user(&new_tp, tp, sizeof(*tp)))
return -EFAULT;
return kc->clock_set(which_clock, &new_tp);
}
SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
- struct timespec __user *,tp)
+ struct timespec __user *, tp)
{
struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec kernel_tp;
@@ -1028,7 +1031,7 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
error = kc->clock_get(which_clock, &kernel_tp);
- if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
+ if (!error && copy_to_user(tp, &kernel_tp, sizeof(kernel_tp)))
error = -EFAULT;
return error;
@@ -1069,7 +1072,7 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
error = kc->clock_getres(which_clock, &rtn_tp);
- if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp)))
+ if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof(rtn_tp)))
error = -EFAULT;
return error;
@@ -1098,7 +1101,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
if (!kc->nsleep)
return -ENANOSLEEP_NOTSUP;
- if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
+ if (copy_from_user(&t, rqtp, sizeof(struct timespec)))
return -EFAULT;
if (!timespec_valid(&t))
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index ac9e04d452fb..ebfb4697c9bd 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -42,6 +42,7 @@ static DEFINE_PER_CPU(bool, softlockup_touch_sync);
static DEFINE_PER_CPU(bool, soft_watchdog_warn);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
+static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
#ifdef CONFIG_HARDLOCKUP_DETECTOR
static DEFINE_PER_CPU(bool, hard_watchdog_warn);
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
@@ -328,8 +329,22 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
return HRTIMER_RESTART;
/* only warn once */
- if (__this_cpu_read(soft_watchdog_warn) == true)
+ if (__this_cpu_read(soft_watchdog_warn) == true) {
+ /*
+ * When multiple processes are causing softlockups the
+ * softlockup detector only warns on the first one
+ * because the code relies on a full quiet cycle to
+ * re-arm. The second process prevents the quiet cycle
+ * and never gets reported. Use task pointers to detect
+ * this.
+ */
+ if (__this_cpu_read(softlockup_task_ptr_saved) !=
+ current) {
+ __this_cpu_write(soft_watchdog_warn, false);
+ __touch_watchdog();
+ }
return HRTIMER_RESTART;
+ }
if (softlockup_all_cpu_backtrace) {
/* Prevent multiple soft-lockup reports if one cpu is already
@@ -345,6 +360,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
smp_processor_id(), duration,
current->comm, task_pid_nr(current));
+ __this_cpu_write(softlockup_task_ptr_saved, current);
print_modules();
print_irqtrace_events(current);
if (regs)
diff --git a/lib/Kconfig b/lib/Kconfig
index a5ce0c7f6c30..2accc79fdb6f 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -177,6 +177,13 @@ config CRC8
when they need to do a cyclic redundancy check according to the CRC8
algorithm. The module will be called crc8.
+config CRC64_ECMA
+ tristate "CRC64 ECMA function"
+ help
+ This option provides the CRC64 ECMA function. Drivers may select this
+ when they need to do a cyclic redundancy check according to the CRC64
+ ECMA algorithm.
+
config AUDIT_GENERIC
bool
depends on AUDIT && !AUDIT_ARCH
diff --git a/lib/Makefile b/lib/Makefile
index d6b4bc496408..b73c3c33047e 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -72,6 +72,7 @@ obj-$(CONFIG_CRC32) += crc32.o
obj-$(CONFIG_CRC7) += crc7.o
obj-$(CONFIG_LIBCRC32C) += libcrc32c.o
obj-$(CONFIG_CRC8) += crc8.o
+obj-$(CONFIG_CRC64_ECMA) += crc64_ecma.o
obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o
obj-$(CONFIG_ZLIB_INFLATE) += zlib_inflate/
diff --git a/lib/crc64_ecma.c b/lib/crc64_ecma.c
new file mode 100644
index 000000000000..41629ea5a60c
--- /dev/null
+++ b/lib/crc64_ecma.c
@@ -0,0 +1,341 @@
+/*
+ * Copyright 2013 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <linux/crc64_ecma.h>
+
+
+#define CRC64_BYTE_MASK 0xFF
+#define CRC64_TABLE_SIZE 256
+
+
+struct crc64_table {
+ u64 seed;
+ u64 table[CRC64_TABLE_SIZE];
+};
+
+
+static struct crc64_table CRC64_ECMA_182 = {
+ CRC64_DEFAULT_INITVAL,
+ {
+ 0x0000000000000000ULL,
+ 0xb32e4cbe03a75f6fULL,
+ 0xf4843657a840a05bULL,
+ 0x47aa7ae9abe7ff34ULL,
+ 0x7bd0c384ff8f5e33ULL,
+ 0xc8fe8f3afc28015cULL,
+ 0x8f54f5d357cffe68ULL,
+ 0x3c7ab96d5468a107ULL,
+ 0xf7a18709ff1ebc66ULL,
+ 0x448fcbb7fcb9e309ULL,
+ 0x0325b15e575e1c3dULL,
+ 0xb00bfde054f94352ULL,
+ 0x8c71448d0091e255ULL,
+ 0x3f5f08330336bd3aULL,
+ 0x78f572daa8d1420eULL,
+ 0xcbdb3e64ab761d61ULL,
+ 0x7d9ba13851336649ULL,
+ 0xceb5ed8652943926ULL,
+ 0x891f976ff973c612ULL,
+ 0x3a31dbd1fad4997dULL,
+ 0x064b62bcaebc387aULL,
+ 0xb5652e02ad1b6715ULL,
+ 0xf2cf54eb06fc9821ULL,
+ 0x41e11855055bc74eULL,
+ 0x8a3a2631ae2dda2fULL,
+ 0x39146a8fad8a8540ULL,
+ 0x7ebe1066066d7a74ULL,
+ 0xcd905cd805ca251bULL,
+ 0xf1eae5b551a2841cULL,
+ 0x42c4a90b5205db73ULL,
+ 0x056ed3e2f9e22447ULL,
+ 0xb6409f5cfa457b28ULL,
+ 0xfb374270a266cc92ULL,
+ 0x48190ecea1c193fdULL,
+ 0x0fb374270a266cc9ULL,
+ 0xbc9d3899098133a6ULL,
+ 0x80e781f45de992a1ULL,
+ 0x33c9cd4a5e4ecdceULL,
+ 0x7463b7a3f5a932faULL,
+ 0xc74dfb1df60e6d95ULL,
+ 0x0c96c5795d7870f4ULL,
+ 0xbfb889c75edf2f9bULL,
+ 0xf812f32ef538d0afULL,
+ 0x4b3cbf90f69f8fc0ULL,
+ 0x774606fda2f72ec7ULL,
+ 0xc4684a43a15071a8ULL,
+ 0x83c230aa0ab78e9cULL,
+ 0x30ec7c140910d1f3ULL,
+ 0x86ace348f355aadbULL,
+ 0x3582aff6f0f2f5b4ULL,
+ 0x7228d51f5b150a80ULL,
+ 0xc10699a158b255efULL,
+ 0xfd7c20cc0cdaf4e8ULL,
+ 0x4e526c720f7dab87ULL,
+ 0x09f8169ba49a54b3ULL,
+ 0xbad65a25a73d0bdcULL,
+ 0x710d64410c4b16bdULL,
+ 0xc22328ff0fec49d2ULL,
+ 0x85895216a40bb6e6ULL,
+ 0x36a71ea8a7ace989ULL,
+ 0x0adda7c5f3c4488eULL,
+ 0xb9f3eb7bf06317e1ULL,
+ 0xfe5991925b84e8d5ULL,
+ 0x4d77dd2c5823b7baULL,
+ 0x64b62bcaebc387a1ULL,
+ 0xd7986774e864d8ceULL,
+ 0x90321d9d438327faULL,
+ 0x231c512340247895ULL,
+ 0x1f66e84e144cd992ULL,
+ 0xac48a4f017eb86fdULL,
+ 0xebe2de19bc0c79c9ULL,
+ 0x58cc92a7bfab26a6ULL,
+ 0x9317acc314dd3bc7ULL,
+ 0x2039e07d177a64a8ULL,
+ 0x67939a94bc9d9b9cULL,
+ 0xd4bdd62abf3ac4f3ULL,
+ 0xe8c76f47eb5265f4ULL,
+ 0x5be923f9e8f53a9bULL,
+ 0x1c4359104312c5afULL,
+ 0xaf6d15ae40b59ac0ULL,
+ 0x192d8af2baf0e1e8ULL,
+ 0xaa03c64cb957be87ULL,
+ 0xeda9bca512b041b3ULL,
+ 0x5e87f01b11171edcULL,
+ 0x62fd4976457fbfdbULL,
+ 0xd1d305c846d8e0b4ULL,
+ 0x96797f21ed3f1f80ULL,
+ 0x2557339fee9840efULL,
+ 0xee8c0dfb45ee5d8eULL,
+ 0x5da24145464902e1ULL,
+ 0x1a083bacedaefdd5ULL,
+ 0xa9267712ee09a2baULL,
+ 0x955cce7fba6103bdULL,
+ 0x267282c1b9c65cd2ULL,
+ 0x61d8f8281221a3e6ULL,
+ 0xd2f6b4961186fc89ULL,
+ 0x9f8169ba49a54b33ULL,
+ 0x2caf25044a02145cULL,
+ 0x6b055fede1e5eb68ULL,
+ 0xd82b1353e242b407ULL,
+ 0xe451aa3eb62a1500ULL,
+ 0x577fe680b58d4a6fULL,
+ 0x10d59c691e6ab55bULL,
+ 0xa3fbd0d71dcdea34ULL,
+ 0x6820eeb3b6bbf755ULL,
+ 0xdb0ea20db51ca83aULL,
+ 0x9ca4d8e41efb570eULL,
+ 0x2f8a945a1d5c0861ULL,
+ 0x13f02d374934a966ULL,
+ 0xa0de61894a93f609ULL,
+ 0xe7741b60e174093dULL,
+ 0x545a57dee2d35652ULL,
+ 0xe21ac88218962d7aULL,
+ 0x5134843c1b317215ULL,
+ 0x169efed5b0d68d21ULL,
+ 0xa5b0b26bb371d24eULL,
+ 0x99ca0b06e7197349ULL,
+ 0x2ae447b8e4be2c26ULL,
+ 0x6d4e3d514f59d312ULL,
+ 0xde6071ef4cfe8c7dULL,
+ 0x15bb4f8be788911cULL,
+ 0xa6950335e42fce73ULL,
+ 0xe13f79dc4fc83147ULL,
+ 0x521135624c6f6e28ULL,
+ 0x6e6b8c0f1807cf2fULL,
+ 0xdd45c0b11ba09040ULL,
+ 0x9aefba58b0476f74ULL,
+ 0x29c1f6e6b3e0301bULL,
+ 0xc96c5795d7870f42ULL,
+ 0x7a421b2bd420502dULL,
+ 0x3de861c27fc7af19ULL,
+ 0x8ec62d7c7c60f076ULL,
+ 0xb2bc941128085171ULL,
+ 0x0192d8af2baf0e1eULL,
+ 0x4638a2468048f12aULL,
+ 0xf516eef883efae45ULL,
+ 0x3ecdd09c2899b324ULL,
+ 0x8de39c222b3eec4bULL,
+ 0xca49e6cb80d9137fULL,
+ 0x7967aa75837e4c10ULL,
+ 0x451d1318d716ed17ULL,
+ 0xf6335fa6d4b1b278ULL,
+ 0xb199254f7f564d4cULL,
+ 0x02b769f17cf11223ULL,
+ 0xb4f7f6ad86b4690bULL,
+ 0x07d9ba1385133664ULL,
+ 0x4073c0fa2ef4c950ULL,
+ 0xf35d8c442d53963fULL,
+ 0xcf273529793b3738ULL,
+ 0x7c0979977a9c6857ULL,
+ 0x3ba3037ed17b9763ULL,
+ 0x888d4fc0d2dcc80cULL,
+ 0x435671a479aad56dULL,
+ 0xf0783d1a7a0d8a02ULL,
+ 0xb7d247f3d1ea7536ULL,
+ 0x04fc0b4dd24d2a59ULL,
+ 0x3886b22086258b5eULL,
+ 0x8ba8fe9e8582d431ULL,
+ 0xcc0284772e652b05ULL,
+ 0x7f2cc8c92dc2746aULL,
+ 0x325b15e575e1c3d0ULL,
+ 0x8175595b76469cbfULL,
+ 0xc6df23b2dda1638bULL,
+ 0x75f16f0cde063ce4ULL,
+ 0x498bd6618a6e9de3ULL,
+ 0xfaa59adf89c9c28cULL,
+ 0xbd0fe036222e3db8ULL,
+ 0x0e21ac88218962d7ULL,
+ 0xc5fa92ec8aff7fb6ULL,
+ 0x76d4de52895820d9ULL,
+ 0x317ea4bb22bfdfedULL,
+ 0x8250e80521188082ULL,
+ 0xbe2a516875702185ULL,
+ 0x0d041dd676d77eeaULL,
+ 0x4aae673fdd3081deULL,
+ 0xf9802b81de97deb1ULL,
+ 0x4fc0b4dd24d2a599ULL,
+ 0xfceef8632775faf6ULL,
+ 0xbb44828a8c9205c2ULL,
+ 0x086ace348f355aadULL,
+ 0x34107759db5dfbaaULL,
+ 0x873e3be7d8faa4c5ULL,
+ 0xc094410e731d5bf1ULL,
+ 0x73ba0db070ba049eULL,
+ 0xb86133d4dbcc19ffULL,
+ 0x0b4f7f6ad86b4690ULL,
+ 0x4ce50583738cb9a4ULL,
+ 0xffcb493d702be6cbULL,
+ 0xc3b1f050244347ccULL,
+ 0x709fbcee27e418a3ULL,
+ 0x3735c6078c03e797ULL,
+ 0x841b8ab98fa4b8f8ULL,
+ 0xadda7c5f3c4488e3ULL,
+ 0x1ef430e13fe3d78cULL,
+ 0x595e4a08940428b8ULL,
+ 0xea7006b697a377d7ULL,
+ 0xd60abfdbc3cbd6d0ULL,
+ 0x6524f365c06c89bfULL,
+ 0x228e898c6b8b768bULL,
+ 0x91a0c532682c29e4ULL,
+ 0x5a7bfb56c35a3485ULL,
+ 0xe955b7e8c0fd6beaULL,
+ 0xaeffcd016b1a94deULL,
+ 0x1dd181bf68bdcbb1ULL,
+ 0x21ab38d23cd56ab6ULL,
+ 0x9285746c3f7235d9ULL,
+ 0xd52f0e859495caedULL,
+ 0x6601423b97329582ULL,
+ 0xd041dd676d77eeaaULL,
+ 0x636f91d96ed0b1c5ULL,
+ 0x24c5eb30c5374ef1ULL,
+ 0x97eba78ec690119eULL,
+ 0xab911ee392f8b099ULL,
+ 0x18bf525d915feff6ULL,
+ 0x5f1528b43ab810c2ULL,
+ 0xec3b640a391f4fadULL,
+ 0x27e05a6e926952ccULL,
+ 0x94ce16d091ce0da3ULL,
+ 0xd3646c393a29f297ULL,
+ 0x604a2087398eadf8ULL,
+ 0x5c3099ea6de60cffULL,
+ 0xef1ed5546e415390ULL,
+ 0xa8b4afbdc5a6aca4ULL,
+ 0x1b9ae303c601f3cbULL,
+ 0x56ed3e2f9e224471ULL,
+ 0xe5c372919d851b1eULL,
+ 0xa26908783662e42aULL,
+ 0x114744c635c5bb45ULL,
+ 0x2d3dfdab61ad1a42ULL,
+ 0x9e13b115620a452dULL,
+ 0xd9b9cbfcc9edba19ULL,
+ 0x6a978742ca4ae576ULL,
+ 0xa14cb926613cf817ULL,
+ 0x1262f598629ba778ULL,
+ 0x55c88f71c97c584cULL,
+ 0xe6e6c3cfcadb0723ULL,
+ 0xda9c7aa29eb3a624ULL,
+ 0x69b2361c9d14f94bULL,
+ 0x2e184cf536f3067fULL,
+ 0x9d36004b35545910ULL,
+ 0x2b769f17cf112238ULL,
+ 0x9858d3a9ccb67d57ULL,
+ 0xdff2a94067518263ULL,
+ 0x6cdce5fe64f6dd0cULL,
+ 0x50a65c93309e7c0bULL,
+ 0xe388102d33392364ULL,
+ 0xa4226ac498dedc50ULL,
+ 0x170c267a9b79833fULL,
+ 0xdcd7181e300f9e5eULL,
+ 0x6ff954a033a8c131ULL,
+ 0x28532e49984f3e05ULL,
+ 0x9b7d62f79be8616aULL,
+ 0xa707db9acf80c06dULL,
+ 0x14299724cc279f02ULL,
+ 0x5383edcd67c06036ULL,
+ 0xe0ada17364673f59ULL
+ }
+};
+
+
+/*
+ * crc64_ecma_seed - Returns the default CRC64 ECMA seed.
+ */
+u64 crc64_ecma_seed(void)
+{
+ return CRC64_ECMA_182.seed;
+}
+EXPORT_SYMBOL(crc64_ecma_seed);
+
+/*
+ * crc64_ecma - Computes the 64 bit ECMA CRC.
+ *
+ * pdata: pointer to the data to compute checksum for.
+ * nbytes: number of bytes in data buffer.
+ * seed: CRC seed.
+ */
+u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed)
+{
+ unsigned int i;
+ u64 crc = seed;
+
+ for (i = 0; i < nbytes; i++)
+ crc = CRC64_ECMA_182.table[(crc ^ pdata[i]) & CRC64_BYTE_MASK] ^
+ (crc >> 8);
+
+ return crc;
+}
+EXPORT_SYMBOL(crc64_ecma);
+
+MODULE_DESCRIPTION("CRC64 ECMA function");
+MODULE_AUTHOR("Freescale Semiconductor Inc.");
+MODULE_LICENSE("GPL");
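Usage follows the usual table-driven CRC pattern: obtain the seed once, then chain calls over successive chunks by feeding each result back in as the next seed. A hypothetical caller:

    u64 crc = crc64_ecma_seed();

    crc = crc64_ecma(hdr, hdr_len, crc);          /* first chunk */
    crc = crc64_ecma(payload, payload_len, crc);  /* continue the checksum */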
diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c
index c9afbe2c445a..2750436cc0ae 100644
--- a/lib/dynamic_debug.c
+++ b/lib/dynamic_debug.c
@@ -829,22 +829,9 @@ static const struct seq_operations ddebug_proc_seqops = {
*/
static int ddebug_proc_open(struct inode *inode, struct file *file)
{
- struct ddebug_iter *iter;
- int err;
-
vpr_info("called\n");
-
- iter = kzalloc(sizeof(*iter), GFP_KERNEL);
- if (iter == NULL)
- return -ENOMEM;
-
- err = seq_open(file, &ddebug_proc_seqops);
- if (err) {
- kfree(iter);
- return err;
- }
- ((struct seq_file *)file->private_data)->private = iter;
- return 0;
+ return seq_open_private(file, &ddebug_proc_seqops,
+ sizeof(struct ddebug_iter));
}
static const struct file_operations ddebug_proc_fops = {
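Here the plain seq_open_private() variant suffices because the iterator needs nothing beyond zero-initialization. The relationship between the two helpers, roughly (a sketch, not the fs/seq_file.c source):

    /* returns the zeroed private struct, or NULL on allocation failure */
    void *__seq_open_private(struct file *f, const struct seq_operations *ops,
                             int psize);

    /* convenience wrapper for when no further setup of the struct is needed */
    int seq_open_private(struct file *f, const struct seq_operations *ops,
                         int psize)
    {
        return __seq_open_private(f, ops, psize) ? 0 : -ENOMEM;
    }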
diff --git a/lib/genalloc.c b/lib/genalloc.c
index bdb9a456bcbb..c7a91cf02fd5 100644
--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@@ -403,6 +403,35 @@ void gen_pool_for_each_chunk(struct gen_pool *pool,
EXPORT_SYMBOL(gen_pool_for_each_chunk);
/**
+ * addr_in_gen_pool - checks if an address falls within the range of a pool
+ * @pool: the generic memory pool
+ * @start: start address
+ * @size: size of the region
+ *
+ * Check if the range of addresses falls within the specified pool. Returns
+ * true if the entire range is contained in the pool and false otherwise.
+ */
+bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start,
+ size_t size)
+{
+ bool found = false;
+ unsigned long end = start + size;
+ struct gen_pool_chunk *chunk;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(chunk, &(pool)->chunks, next_chunk) {
+ if (start >= chunk->start_addr && start <= chunk->end_addr) {
+ if (end <= chunk->end_addr) {
+ found = true;
+ break;
+ }
+ }
+ }
+ rcu_read_unlock();
+ return found;
+}
+
+/**
* gen_pool_avail - get available free space of the pool
* @pool: pool to get available free space
*
@@ -481,6 +510,26 @@ unsigned long gen_pool_first_fit(unsigned long *map, unsigned long size,
EXPORT_SYMBOL(gen_pool_first_fit);
/**
+ * gen_pool_first_fit_order_align - find the first available region
+ * of memory matching the size requirement. The region will be aligned
+ * to the order of the size specified.
+ * @map: The address to base the search on
+ * @size: The bitmap size in bits
+ * @start: The bitnumber to start searching at
+ * @nr: The number of zeroed bits we're looking for
+ * @data: additional data - unused
+ */
+unsigned long gen_pool_first_fit_order_align(unsigned long *map,
+ unsigned long size, unsigned long start,
+ unsigned int nr, void *data)
+{
+ unsigned long align_mask = roundup_pow_of_two(nr) - 1;
+
+ return bitmap_find_next_zero_area(map, size, start, nr, align_mask);
+}
+EXPORT_SYMBOL(gen_pool_first_fit_order_align);
+
+/**
* gen_pool_best_fit - find the best fitting region of memory
 * matching the size requirement (no alignment constraint)
* @map: The address to base the search on
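Both genalloc additions are small building blocks: gen_pool_first_fit_order_align() is a drop-in allocation algorithm, and addr_in_gen_pool() answers containment queries such as whether a buffer really came from a given pool. A hypothetical pairing of the two:

    /* make allocations come back aligned to the order of their size */
    gen_pool_set_algo(pool, gen_pool_first_fit_order_align, NULL);

    addr = gen_pool_alloc(pool, size);

    /* later: only hand the buffer back if it really belongs to this pool */
    if (addr_in_gen_pool(pool, addr, size))
        gen_pool_free(pool, addr, size);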
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index ba3cd0a35640..c80daef2b202 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -1183,6 +1183,21 @@ char *address_val(char *buf, char *end, const void *addr,
return number(buf, end, num, spec);
}
+static noinline_for_stack
+char *comm_name(char *buf, char *end, struct task_struct *tsk,
+ struct printf_spec spec, const char *fmt)
+{
+ char name[TASK_COMM_LEN];
+
+ /* Caller can pass NULL instead of current. */
+ if (!tsk)
+ tsk = current;
+ /* Not using get_task_comm() in case I'm in IRQ context. */
+ memcpy(name, tsk->comm, TASK_COMM_LEN);
+ name[sizeof(name) - 1] = '\0';
+ return string(buf, end, name, spec);
+}
+
int kptr_restrict __read_mostly;
/*
@@ -1250,6 +1265,7 @@ int kptr_restrict __read_mostly;
* (default assumed to be phys_addr_t, passed by reference)
* - 'd[234]' For a dentry name (optionally 2-4 last components)
* - 'D[234]' Same as 'd' but for a struct file
+ * - 'T' task_struct->comm
*
* Note: The difference between 'S' and 'F' is that on ia64 and ppc64
* function pointers are really function descriptors, which contain a
@@ -1261,7 +1277,7 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
{
int default_width = 2 * sizeof(void *) + (spec.flags & SPECIAL ? 2 : 0);
- if (!ptr && *fmt != 'K') {
+ if (!ptr && *fmt != 'K' && *fmt != 'T') {
/*
* Print (null) with the same width as a pointer so it makes
* tabular output look nice.
@@ -1389,6 +1405,8 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
return dentry_name(buf, end,
((const struct file *)ptr)->f_path.dentry,
spec, fmt);
+ case 'T':
+ return comm_name(buf, end, ptr, spec, fmt);
}
spec.flags |= SMALL;
if (spec.field_width == -1) {
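With the 'T' case wired into pointer(), the new specifier works from any printk-family call; as comm_name() above shows, NULL is explicitly allowed and means current. Hypothetical call sites:

    pr_info("%pT (pid %d) hit the watchdog\n", current, task_pid_nr(current));
    pr_info("%pT resumed\n", NULL);  /* NULL prints current->comm too */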
diff --git a/mm/Kconfig b/mm/Kconfig
index 886db2158538..e09cf0a8fa0f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -228,11 +228,16 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK
boolean
#
+# support for memory ballooning
+config MEMORY_BALLOON
+ boolean
+
+#
# support for memory balloon compaction
config BALLOON_COMPACTION
bool "Allow for balloon memory compaction/migration"
def_bool y
- depends on COMPACTION && VIRTIO_BALLOON
+ depends on COMPACTION && MEMORY_BALLOON
help
Memory fragmentation introduced by ballooning might reduce
significantly the number of 2MB contiguous memory blocks that can be
diff --git a/mm/Makefile b/mm/Makefile
index 632ae77e6070..2d33d7f64937 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -16,7 +16,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o mmu_context.o percpu.o slab_common.o \
- compaction.o balloon_compaction.o vmacache.o \
+ compaction.o vmacache.o \
interval_tree.o list_lru.o workingset.o \
iov_iter.o $(mmu-y)
@@ -64,3 +64,4 @@ obj-$(CONFIG_ZBUD) += zbud.o
obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
obj-$(CONFIG_CMA) += cma.o
+obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 6d9cdb0f01d8..0ae0df55000b 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -623,7 +623,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
* of sleeping on the congestion queue
*/
if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
- !zone_is_reclaim_congested(zone)) {
+ !test_bit(ZONE_CONGESTED, &zone->flags)) {
cond_resched();
/* In case we scheduled, work out time remaining */
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 6e45a5074bf0..a799802b0167 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -10,31 +10,21 @@
#include <linux/export.h>
#include <linux/balloon_compaction.h>
-/*
- * balloon_devinfo_alloc - allocates a balloon device information descriptor.
- * @balloon_dev_descriptor: pointer to reference the balloon device which
- * this struct balloon_dev_info will be servicing.
- *
- * Driver must call it to properly allocate and initialize an instance of
- * struct balloon_dev_info which will be used to reference a balloon device
- * as well as to keep track of the balloon device page list.
- */
-struct balloon_dev_info *balloon_devinfo_alloc(void *balloon_dev_descriptor)
+void __SetPageBalloon(struct page *page)
{
- struct balloon_dev_info *b_dev_info;
- b_dev_info = kmalloc(sizeof(*b_dev_info), GFP_KERNEL);
- if (!b_dev_info)
- return ERR_PTR(-ENOMEM);
-
- b_dev_info->balloon_device = balloon_dev_descriptor;
- b_dev_info->mapping = NULL;
- b_dev_info->isolated_pages = 0;
- spin_lock_init(&b_dev_info->pages_lock);
- INIT_LIST_HEAD(&b_dev_info->pages);
+ VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page);
+ atomic_set(&page->_mapcount, PAGE_BALLOON_MAPCOUNT_VALUE);
+ inc_zone_page_state(page, NR_BALLOON_PAGES);
+}
+EXPORT_SYMBOL(__SetPageBalloon);
- return b_dev_info;
+void __ClearPageBalloon(struct page *page)
+{
+ VM_BUG_ON_PAGE(!PageBalloon(page), page);
+ atomic_set(&page->_mapcount, -1);
+ dec_zone_page_state(page, NR_BALLOON_PAGES);
}
-EXPORT_SYMBOL_GPL(balloon_devinfo_alloc);
+EXPORT_SYMBOL(__ClearPageBalloon);
/*
* balloon_page_enqueue - allocates a new page and inserts it into the balloon
@@ -61,7 +51,7 @@ struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info)
*/
BUG_ON(!trylock_page(page));
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
- balloon_page_insert(page, b_dev_info->mapping, &b_dev_info->pages);
+ balloon_page_insert(b_dev_info, page);
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
unlock_page(page);
return page;
@@ -81,12 +71,10 @@ EXPORT_SYMBOL_GPL(balloon_page_enqueue);
*/
struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
{
- struct page *page, *tmp;
+ struct page *page;
unsigned long flags;
- bool dequeued_page;
- dequeued_page = false;
- list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) {
+ list_for_each_entry(page, &b_dev_info->pages, lru) {
/*
* Block others from accessing the 'page' while we get around
* establishing additional references and preparing the 'page'
@@ -94,98 +82,32 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
*/
if (trylock_page(page)) {
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
- /*
- * Raise the page refcount here to prevent any wrong
- * attempt to isolate this page, in case of coliding
- * with balloon_page_isolate() just after we release
- * the page lock.
- *
- * balloon_page_free() will take care of dropping
- * this extra refcount later.
- */
- get_page(page);
- balloon_page_delete(page);
+ balloon_page_delete(page, false);
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
unlock_page(page);
- dequeued_page = true;
- break;
+ return page;
}
}
- if (!dequeued_page) {
- /*
- * If we are unable to dequeue a balloon page because the page
- * list is empty and there is no isolated pages, then something
- * went out of track and some balloon pages are lost.
- * BUG() here, otherwise the balloon driver may get stuck into
- * an infinite loop while attempting to release all its pages.
- */
- spin_lock_irqsave(&b_dev_info->pages_lock, flags);
- if (unlikely(list_empty(&b_dev_info->pages) &&
- !b_dev_info->isolated_pages))
- BUG();
- spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
- page = NULL;
- }
- return page;
+ /*
+ * If we are unable to dequeue a balloon page because the page
+	 * list is empty and there are no isolated pages, then something
+	 * went off track and some balloon pages are lost.
+	 * BUG() here, otherwise the balloon driver may get stuck in an
+	 * infinite loop while attempting to release all its pages.
+ */
+ spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ BUG_ON(list_empty(&b_dev_info->pages) && !b_dev_info->isolated_pages);
+ spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+ return NULL;
}
EXPORT_SYMBOL_GPL(balloon_page_dequeue);
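With the reworked contract above, balloon_page_dequeue() no longer takes an extra page reference and returns NULL once only isolated pages remain. A driver-side sketch of draining the balloon under those assumptions; example_release_to_host() is a hypothetical driver hook, not part of this series:

/*
 * Driver-side sketch (hypothetical helper names): drain the balloon.
 * A NULL return means every remaining page is isolated by compaction.
 */
static void example_drain_balloon(struct balloon_dev_info *b_dev_info)
{
	struct page *page;

	while ((page = balloon_page_dequeue(b_dev_info)) != NULL)
		example_release_to_host(page);	/* hand the page back */
}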
#ifdef CONFIG_BALLOON_COMPACTION
-/*
- * balloon_mapping_alloc - allocates a special ->mapping for ballooned pages.
- * @b_dev_info: holds the balloon device information descriptor.
- * @a_ops: balloon_mapping address_space_operations descriptor.
- *
- * Driver must call it to properly allocate and initialize an instance of
- * struct address_space which will be used as the special page->mapping for
- * balloon device enlisted page instances.
- */
-struct address_space *balloon_mapping_alloc(struct balloon_dev_info *b_dev_info,
- const struct address_space_operations *a_ops)
-{
- struct address_space *mapping;
-
- mapping = kmalloc(sizeof(*mapping), GFP_KERNEL);
- if (!mapping)
- return ERR_PTR(-ENOMEM);
-
- /*
- * Give a clean 'zeroed' status to all elements of this special
- * balloon page->mapping struct address_space instance.
- */
- address_space_init_once(mapping);
-
- /*
- * Set mapping->flags appropriately, to allow balloon pages
- * ->mapping identification.
- */
- mapping_set_balloon(mapping);
- mapping_set_gfp_mask(mapping, balloon_mapping_gfp_mask());
-
- /* balloon's page->mapping->a_ops callback descriptor */
- mapping->a_ops = a_ops;
-
- /*
- * Establish a pointer reference back to the balloon device descriptor
- * this particular page->mapping will be servicing.
- * This is used by compaction / migration procedures to identify and
- * access the balloon device pageset while isolating / migrating pages.
- *
- * As some balloon drivers can register multiple balloon devices
- * for a single guest, this also helps compaction / migration to
- * properly deal with multiple balloon pagesets, when required.
- */
- mapping->private_data = b_dev_info;
- b_dev_info->mapping = mapping;
-
- return mapping;
-}
-EXPORT_SYMBOL_GPL(balloon_mapping_alloc);
static inline void __isolate_balloon_page(struct page *page)
{
- struct balloon_dev_info *b_dev_info = page->mapping->private_data;
+ struct balloon_dev_info *b_dev_info = balloon_page_device(page);
unsigned long flags;
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
list_del(&page->lru);
@@ -195,7 +117,7 @@ static inline void __isolate_balloon_page(struct page *page)
static inline void __putback_balloon_page(struct page *page)
{
- struct balloon_dev_info *b_dev_info = page->mapping->private_data;
+ struct balloon_dev_info *b_dev_info = balloon_page_device(page);
unsigned long flags;
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
list_add(&page->lru, &b_dev_info->pages);
@@ -203,12 +125,6 @@ static inline void __putback_balloon_page(struct page *page)
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
}
-static inline int __migrate_balloon_page(struct address_space *mapping,
- struct page *newpage, struct page *page, enum migrate_mode mode)
-{
- return page->mapping->a_ops->migratepage(mapping, newpage, page, mode);
-}
-
/* __isolate_lru_page() counterpart for a ballooned page */
bool balloon_page_isolate(struct page *page)
{
@@ -239,8 +155,7 @@ bool balloon_page_isolate(struct page *page)
* Prevent concurrent compaction threads from isolating
* an already isolated balloon page by refcount check.
*/
- if (__is_movable_balloon_page(page) &&
- page_count(page) == 2) {
+ if (PageBalloon(page) && page_count(page) == 2) {
__isolate_balloon_page(page);
unlock_page(page);
return true;
@@ -252,6 +167,57 @@ bool balloon_page_isolate(struct page *page)
return false;
}
+int balloon_page_migrate(new_page_t get_new_page, free_page_t put_new_page,
+ unsigned long private, struct page *page,
+ int force, enum migrate_mode mode)
+{
+ struct balloon_dev_info *balloon = balloon_page_device(page);
+ struct page *newpage;
+ int *result = NULL;
+ int rc = -EAGAIN;
+
+ if (!balloon || !balloon->migrate_page)
+ return -EAGAIN;
+
+ newpage = get_new_page(page, private, &result);
+ if (!newpage)
+ return -ENOMEM;
+
+ if (!trylock_page(newpage))
+ BUG();
+
+ if (!trylock_page(page)) {
+ if (!force || mode != MIGRATE_SYNC)
+ goto out;
+ lock_page(page);
+ }
+
+ rc = balloon->migrate_page(balloon, newpage, page, mode);
+
+ unlock_page(page);
+out:
+ unlock_page(newpage);
+
+ if (rc != -EAGAIN) {
+ dec_zone_page_state(page, NR_ISOLATED_FILE);
+ list_del(&page->lru);
+ put_page(page);
+ }
+
+ if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
+ put_new_page(newpage, private);
+ else
+ put_page(newpage);
+
+ if (result) {
+ if (rc)
+ *result = rc;
+ else
+ *result = page_to_nid(newpage);
+ }
+ return rc;
+}
+
/* putback_lru_page() counterpart for a ballooned page */
void balloon_page_putback(struct page *page)
{
@@ -261,7 +227,7 @@ void balloon_page_putback(struct page *page)
*/
lock_page(page);
- if (__is_movable_balloon_page(page)) {
+ if (PageBalloon(page)) {
__putback_balloon_page(page);
/* drop the extra ref count taken for page isolation */
put_page(page);
@@ -272,31 +238,4 @@ void balloon_page_putback(struct page *page)
unlock_page(page);
}
-/* move_to_new_page() counterpart for a ballooned page */
-int balloon_page_migrate(struct page *newpage,
- struct page *page, enum migrate_mode mode)
-{
- struct address_space *mapping;
- int rc = -EAGAIN;
-
- /*
- * Block others from accessing the 'newpage' when we get around to
- * establishing additional references. We should be the only one
- * holding a reference to the 'newpage' at this point.
- */
- BUG_ON(!trylock_page(newpage));
-
- if (WARN_ON(!__is_movable_balloon_page(page))) {
- dump_page(page, "not movable balloon page");
- unlock_page(newpage);
- return rc;
- }
-
- mapping = page->mapping;
- if (mapping)
- rc = __migrate_balloon_page(mapping, newpage, page, mode);
-
- unlock_page(newpage);
- return rc;
-}
#endif /* CONFIG_BALLOON_COMPACTION */
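Taken together, the balloon_compaction.c hunks replace the special page->mapping marking with a PageBalloon() flag kept in page->_mapcount, plus a balloon_page_device() back-pointer. A minimal wiring sketch for a driver under the struct balloon_dev_info layout this series uses (pages, pages_lock, isolated_pages, migrate_page); all example_* names are hypothetical:

/* Minimal driver wiring sketch for the reworked API. */
static int example_migratepage(struct balloon_dev_info *balloon,
			       struct page *newpage, struct page *page,
			       enum migrate_mode mode)
{
	/* transfer balloon-specific state from page to newpage here */
	return MIGRATEPAGE_SUCCESS;
}

static void example_balloon_setup(struct balloon_dev_info *b_dev_info)
{
	b_dev_info->isolated_pages = 0;
	spin_lock_init(&b_dev_info->pages_lock);
	INIT_LIST_HEAD(&b_dev_info->pages);
	b_dev_info->migrate_page = example_migratepage;	/* enables compaction */
}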
diff --git a/mm/cma.c b/mm/cma.c
index c17751c0dcaf..474c644a0dc6 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -32,6 +32,7 @@
#include <linux/slab.h>
#include <linux/log2.h>
#include <linux/cma.h>
+#include <linux/highmem.h>
struct cma {
unsigned long base_pfn;
@@ -163,6 +164,8 @@ int __init cma_declare_contiguous(phys_addr_t base,
bool fixed, struct cma **res_cma)
{
struct cma *cma;
+ phys_addr_t memblock_end = memblock_end_of_DRAM();
+ phys_addr_t highmem_start = __pa(high_memory);
int ret = 0;
pr_debug("%s(size %lx, base %08lx, limit %08lx alignment %08lx)\n",
@@ -196,6 +199,24 @@ int __init cma_declare_contiguous(phys_addr_t base,
if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit))
return -EINVAL;
+ /*
+ * adjust limit to avoid crossing low/high memory boundary for
+ * automatically allocated regions
+ */
+ if (((limit == 0 || limit > memblock_end) &&
+ (memblock_end - size < highmem_start &&
+ memblock_end > highmem_start)) ||
+ (!fixed && limit > highmem_start && limit - size < highmem_start)) {
+ limit = highmem_start;
+ }
+
+ if (fixed && base < highmem_start && base+size > highmem_start) {
+ ret = -EINVAL;
+ pr_err("Region at %08lx defined on low/high memory boundary (%08lx)\n",
+ (unsigned long)base, (unsigned long)highmem_start);
+ goto err;
+ }
+
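The conditions above clamp automatically placed regions to the low-memory side and reject fixed regions that straddle the boundary; as a pure function, the clamping decision reads (illustration only, not part of the patch):

/* Illustration only: the limit-clamping decision as a pure function. */
static phys_addr_t example_clamp_cma_limit(phys_addr_t limit, phys_addr_t size,
					   phys_addr_t memblock_end,
					   phys_addr_t highmem_start, bool fixed)
{
	if (((limit == 0 || limit > memblock_end) &&
	     (memblock_end - size < highmem_start &&
	      memblock_end > highmem_start)) ||
	    (!fixed && limit > highmem_start && limit - size < highmem_start))
		return highmem_start;	/* keep the region below high memory */
	return limit;
}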
/* Reserve memory */
if (base && fixed) {
if (memblock_is_region_reserved(base, size) ||
diff --git a/mm/compaction.c b/mm/compaction.c
index 21bf292b642a..7d9d92e63297 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -67,6 +67,49 @@ static inline bool migrate_async_suitable(int migratetype)
return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
}
+/*
+ * Check that the whole (or subset of) a pageblock given by the interval of
+ * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
+ * with the migration or free compaction scanner. The scanners then need to
+ * use only pfn_valid_within() check for arches that allow holes within
+ * pageblocks.
+ *
+ * Return struct page pointer of start_pfn, or NULL if checks were not passed.
+ *
+ * It's possible on some configurations to have a setup like node0 node1 node0
+ * i.e. it's possible that all pages within a zone's range of pages do not
+ * belong to a single zone. We assume that a border between node0 and node1
+ * can occur within a single pageblock, but not a node0 node1 node0
+ * interleaving within a single pageblock. It is therefore sufficient to check
+ * the first and last page of a pageblock and avoid checking each individual
+ * page in a pageblock.
+ */
+static struct page *pageblock_pfn_to_page(unsigned long start_pfn,
+ unsigned long end_pfn, struct zone *zone)
+{
+ struct page *start_page;
+ struct page *end_page;
+
+ /* end_pfn is one past the range we are checking */
+ end_pfn--;
+
+ if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
+ return NULL;
+
+ start_page = pfn_to_page(start_pfn);
+
+ if (page_zone(start_page) != zone)
+ return NULL;
+
+ end_page = pfn_to_page(end_pfn);
+
+ /* This gives shorter code than deriving page_zone(end_page) */
+ if (page_zone_id(start_page) != page_zone_id(end_page))
+ return NULL;
+
+ return start_page;
+}
+
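Both scanners now validate each pageblock with this helper before scanning it, as the later hunks show. The intended calling pattern, sketched:

/* Usage sketch: skip pageblocks that contain holes or cross a zone. */
for (; pfn < end_pfn; pfn = block_end_pfn,
			block_end_pfn += pageblock_nr_pages) {
	block_end_pfn = min(block_end_pfn, end_pfn);

	page = pageblock_pfn_to_page(pfn, block_end_pfn, zone);
	if (!page)
		continue;	/* hole or zone boundary: skip this block */

	/* ... scan [pfn, block_end_pfn) using only pfn_valid_within() ... */
}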
#ifdef CONFIG_COMPACTION
/* Returns true if the pageblock should be scanned for pages to isolate. */
static inline bool isolation_suitable(struct compact_control *cc,
@@ -132,7 +175,7 @@ void reset_isolation_suitable(pg_data_t *pgdat)
*/
static void update_pageblock_skip(struct compact_control *cc,
struct page *page, unsigned long nr_isolated,
- bool set_unsuitable, bool migrate_scanner)
+ bool migrate_scanner)
{
struct zone *zone = cc->zone;
unsigned long pfn;
@@ -146,12 +189,7 @@ static void update_pageblock_skip(struct compact_control *cc,
if (nr_isolated)
return;
- /*
- * Only skip pageblocks when all forms of compaction will be known to
- * fail in the near future.
- */
- if (set_unsuitable)
- set_pageblock_skip(page);
+ set_pageblock_skip(page);
pfn = page_to_pfn(page);
@@ -180,52 +218,77 @@ static inline bool isolation_suitable(struct compact_control *cc,
static void update_pageblock_skip(struct compact_control *cc,
struct page *page, unsigned long nr_isolated,
- bool set_unsuitable, bool migrate_scanner)
+ bool migrate_scanner)
{
}
#endif /* CONFIG_COMPACTION */
-static inline bool should_release_lock(spinlock_t *lock)
+/*
+ * Compaction requires the taking of some coarse locks that are potentially
+ * very heavily contended. For async compaction, back out if the lock cannot
+ * be taken immediately. For sync compaction, spin on the lock if needed.
+ *
+ * Returns true if the lock is held
+ * Returns false if the lock is not held and compaction should abort
+ */
+static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags,
+ struct compact_control *cc)
{
- return need_resched() || spin_is_contended(lock);
+ if (cc->mode == MIGRATE_ASYNC) {
+ if (!spin_trylock_irqsave(lock, *flags)) {
+ cc->contended = COMPACT_CONTENDED_LOCK;
+ return false;
+ }
+ } else {
+ spin_lock_irqsave(lock, *flags);
+ }
+
+ return true;
}
/*
* Compaction requires the taking of some coarse locks that are potentially
- * very heavily contended. Check if the process needs to be scheduled or
- * if the lock is contended. For async compaction, back out in the event
- * if contention is severe. For sync compaction, schedule.
+ * very heavily contended. The lock should be periodically unlocked to avoid
+ * having disabled IRQs for a long time, even when there is nobody waiting on
+ * the lock. It might also be that allowing the IRQs will result in
+ * need_resched() becoming true. If scheduling is needed, async compaction
+ * aborts. Sync compaction schedules.
+ * Either compaction type will also abort if a fatal signal is pending.
+ * In either case if the lock was locked, it is dropped and not regained.
*
- * Returns true if the lock is held.
- * Returns false if the lock is released and compaction should abort
+ * Returns true if compaction should abort due to a pending fatal signal, or
+ * async compaction due to need_resched()
+ * Returns false when compaction can continue (sync compaction might have
+ * scheduled)
*/
-static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
- bool locked, struct compact_control *cc)
+static bool compact_unlock_should_abort(spinlock_t *lock,
+ unsigned long flags, bool *locked, struct compact_control *cc)
{
- if (should_release_lock(lock)) {
- if (locked) {
- spin_unlock_irqrestore(lock, *flags);
- locked = false;
- }
+ if (*locked) {
+ spin_unlock_irqrestore(lock, flags);
+ *locked = false;
+ }
+
+ if (fatal_signal_pending(current)) {
+ cc->contended = COMPACT_CONTENDED_SCHED;
+ return true;
+ }
- /* async aborts if taking too long or contended */
+ if (need_resched()) {
if (cc->mode == MIGRATE_ASYNC) {
- cc->contended = true;
- return false;
+ cc->contended = COMPACT_CONTENDED_SCHED;
+ return true;
}
-
cond_resched();
}
- if (!locked)
- spin_lock_irqsave(lock, *flags);
- return true;
+ return false;
}
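These two helpers split the old compact_checklock_irqsave() into a pure trylock and a periodic unlock-and-check. The scan-loop pattern they are built for, as used by the isolation functions below, sketched:

/* Scan-loop sketch combining the two new locking helpers. */
for (; pfn < end_pfn; pfn++) {
	/* Periodically drop the lock so IRQs get a chance to run. */
	if (!(pfn % SWAP_CLUSTER_MAX)
	    && compact_unlock_should_abort(&zone->lock, flags, &locked, cc))
		break;	/* fatal signal, or async compaction must back off */

	/* ... lockless checks ... */

	if (!locked) {
		locked = compact_trylock_irqsave(&zone->lock, &flags, cc);
		if (!locked)
			break;	/* async compaction: lock was contended */
		/* ... recheck under the lock ... */
	}
}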
/*
* Aside from avoiding lock contention, compaction also periodically checks
* need_resched() and either schedules in sync compaction or aborts async
- * compaction. This is similar to what compact_checklock_irqsave() does, but
+ * compaction. This is similar to what compact_unlock_should_abort() does, but
* is used where no lock is concerned.
*
* Returns false when no scheduling was needed, or sync compaction scheduled.
@@ -236,7 +299,7 @@ static inline bool compact_should_abort(struct compact_control *cc)
/* async compaction aborts if contended */
if (need_resched()) {
if (cc->mode == MIGRATE_ASYNC) {
- cc->contended = true;
+ cc->contended = COMPACT_CONTENDED_SCHED;
return true;
}
@@ -250,8 +313,15 @@ static inline bool compact_should_abort(struct compact_control *cc)
static bool suitable_migration_target(struct page *page)
{
/* If the page is a large free page, then disallow migration */
- if (PageBuddy(page) && page_order(page) >= pageblock_order)
- return false;
+ if (PageBuddy(page)) {
+ /*
+ * We are checking page_order without zone->lock taken. But
+ * the only small danger is that we skip a potentially suitable
+ * pageblock, so it's not worth checking the order for a valid range.
+ */
+ if (page_order_unsafe(page) >= pageblock_order)
+ return false;
+ }
/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
if (migrate_async_suitable(get_pageblock_migratetype(page)))
@@ -267,7 +337,7 @@ static bool suitable_migration_target(struct page *page)
* (even though it may still end up isolating some pages).
*/
static unsigned long isolate_freepages_block(struct compact_control *cc,
- unsigned long blockpfn,
+ unsigned long *start_pfn,
unsigned long end_pfn,
struct list_head *freelist,
bool strict)
@@ -276,7 +346,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
struct page *cursor, *valid_page = NULL;
unsigned long flags;
bool locked = false;
- bool checked_pageblock = false;
+ unsigned long blockpfn = *start_pfn;
cursor = pfn_to_page(blockpfn);
@@ -285,6 +355,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
int isolated, i;
struct page *page = cursor;
+ /*
+ * Periodically drop the lock (if held) regardless of its
+	 * contention, to give IRQs a chance. Abort if a fatal signal is
+	 * pending or async compaction detects need_resched()
+ */
+ if (!(blockpfn % SWAP_CLUSTER_MAX)
+ && compact_unlock_should_abort(&cc->zone->lock, flags,
+ &locked, cc))
+ break;
+
nr_scanned++;
if (!pfn_valid_within(blockpfn))
goto isolate_fail;
@@ -295,33 +375,30 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
goto isolate_fail;
/*
- * The zone lock must be held to isolate freepages.
- * Unfortunately this is a very coarse lock and can be
- * heavily contended if there are parallel allocations
- * or parallel compactions. For async compaction do not
- * spin on the lock and we acquire the lock as late as
- * possible.
+ * If we already hold the lock, we can skip some rechecking.
+ * Note that if we hold the lock now, checked_pageblock was
+ * already set in some previous iteration (or strict is true),
+ * so it is correct to skip the suitable migration target
+ * recheck as well.
*/
- locked = compact_checklock_irqsave(&cc->zone->lock, &flags,
- locked, cc);
- if (!locked)
- break;
-
- /* Recheck this is a suitable migration target under lock */
- if (!strict && !checked_pageblock) {
+ if (!locked) {
/*
- * We need to check suitability of pageblock only once
- * and this isolate_freepages_block() is called with
- * pageblock range, so just check once is sufficient.
+ * The zone lock must be held to isolate freepages.
+ * Unfortunately this is a very coarse lock and can be
+ * heavily contended if there are parallel allocations
+ * or parallel compactions. For async compaction do not
+ * spin on the lock and we acquire the lock as late as
+ * possible.
*/
- checked_pageblock = true;
- if (!suitable_migration_target(page))
+ locked = compact_trylock_irqsave(&cc->zone->lock,
+ &flags, cc);
+ if (!locked)
break;
- }
- /* Recheck this is a buddy page under lock */
- if (!PageBuddy(page))
- goto isolate_fail;
+ /* Recheck this is a buddy page under lock */
+ if (!PageBuddy(page))
+ goto isolate_fail;
+ }
/* Found a free page, break it into order-0 pages */
isolated = split_free_page(page);
@@ -346,6 +423,9 @@ isolate_fail:
}
+ /* Record how far we have got within the block */
+ *start_pfn = blockpfn;
+
trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
/*
@@ -361,8 +441,7 @@ isolate_fail:
/* Update the pageblock-skip if the whole pageblock was scanned */
if (blockpfn == end_pfn)
- update_pageblock_skip(cc, valid_page, total_isolated, true,
- false);
+ update_pageblock_skip(cc, valid_page, total_isolated, false);
count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
if (total_isolated)
@@ -390,19 +469,21 @@ isolate_freepages_range(struct compact_control *cc,
unsigned long isolated, pfn, block_end_pfn;
LIST_HEAD(freelist);
- for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) {
- if (!pfn_valid(pfn) || cc->zone != page_zone(pfn_to_page(pfn)))
- break;
+ pfn = start_pfn;
+ block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+
+ for (; pfn < end_pfn; pfn += isolated,
+ block_end_pfn += pageblock_nr_pages) {
+ /* Protect pfn from changing by isolate_freepages_block */
+ unsigned long isolate_start_pfn = pfn;
- /*
- * On subsequent iterations ALIGN() is actually not needed,
- * but we keep it that we not to complicate the code.
- */
- block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
block_end_pfn = min(block_end_pfn, end_pfn);
- isolated = isolate_freepages_block(cc, pfn, block_end_pfn,
- &freelist, true);
+ if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
+ break;
+
+ isolated = isolate_freepages_block(cc, &isolate_start_pfn,
+ block_end_pfn, &freelist, true);
/*
* In strict mode, isolate_freepages_block() returns 0 if
@@ -467,40 +548,34 @@ static bool too_many_isolated(struct zone *zone)
}
/**
- * isolate_migratepages_range() - isolate all migrate-able pages in range.
- * @zone: Zone pages are in.
+ * isolate_migratepages_block() - isolate all migrate-able pages within
+ * a single pageblock
* @cc: Compaction control structure.
- * @low_pfn: The first PFN of the range.
- * @end_pfn: The one-past-the-last PFN of the range.
- * @unevictable: true if it allows to isolate unevictable pages
+ * @low_pfn: The first PFN to isolate
+ * @end_pfn: The one-past-the-last PFN to isolate, within same pageblock
+ * @isolate_mode: Isolation mode to be used.
*
* Isolate all pages that can be migrated from the range specified by
- * [low_pfn, end_pfn). Returns zero if there is a fatal signal
- * pending), otherwise PFN of the first page that was not scanned
- * (which may be both less, equal to or more then end_pfn).
- *
- * Assumes that cc->migratepages is empty and cc->nr_migratepages is
- * zero.
+ * [low_pfn, end_pfn). The range is expected to be within same pageblock.
+ * Returns zero if there is a fatal signal pending, otherwise PFN of the
+ * first page that was not scanned (which may be less than, equal to,
+ * or greater than end_pfn).
*
- * Apart from cc->migratepages and cc->nr_migratetypes this function
- * does not modify any cc's fields, in particular it does not modify
- * (or read for that matter) cc->migrate_pfn.
+ * The pages are isolated on cc->migratepages list (not required to be empty),
+ * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field
+ * is neither read nor updated.
*/
-unsigned long
-isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
- unsigned long low_pfn, unsigned long end_pfn, bool unevictable)
+static unsigned long
+isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
+ unsigned long end_pfn, isolate_mode_t isolate_mode)
{
- unsigned long last_pageblock_nr = 0, pageblock_nr;
+ struct zone *zone = cc->zone;
unsigned long nr_scanned = 0, nr_isolated = 0;
struct list_head *migratelist = &cc->migratepages;
struct lruvec *lruvec;
unsigned long flags;
bool locked = false;
struct page *page = NULL, *valid_page = NULL;
- bool set_unsuitable = true;
- const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ?
- ISOLATE_ASYNC_MIGRATE : 0) |
- (unevictable ? ISOLATE_UNEVICTABLE : 0);
/*
* Ensure that there are not too many pages isolated from the LRU
@@ -523,72 +598,43 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
/* Time to isolate some pages for migration */
for (; low_pfn < end_pfn; low_pfn++) {
- /* give a chance to irqs before checking need_resched() */
- if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {
- if (should_release_lock(&zone->lru_lock)) {
- spin_unlock_irqrestore(&zone->lru_lock, flags);
- locked = false;
- }
- }
-
/*
- * migrate_pfn does not necessarily start aligned to a
- * pageblock. Ensure that pfn_valid is called when moving
- * into a new MAX_ORDER_NR_PAGES range in case of large
- * memory holes within the zone
+ * Periodically drop the lock (if held) regardless of its
+	 * contention, to give IRQs a chance. Abort async compaction
+ * if contended.
*/
- if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
- if (!pfn_valid(low_pfn)) {
- low_pfn += MAX_ORDER_NR_PAGES - 1;
- continue;
- }
- }
+ if (!(low_pfn % SWAP_CLUSTER_MAX)
+ && compact_unlock_should_abort(&zone->lru_lock, flags,
+ &locked, cc))
+ break;
if (!pfn_valid_within(low_pfn))
continue;
nr_scanned++;
- /*
- * Get the page and ensure the page is within the same zone.
- * See the comment in isolate_freepages about overlapping
- * nodes. It is deliberate that the new zone lock is not taken
- * as memory compaction should not move pages between nodes.
- */
page = pfn_to_page(low_pfn);
- if (page_zone(page) != zone)
- continue;
if (!valid_page)
valid_page = page;
- /* If isolation recently failed, do not retry */
- pageblock_nr = low_pfn >> pageblock_order;
- if (last_pageblock_nr != pageblock_nr) {
- int mt;
-
- last_pageblock_nr = pageblock_nr;
- if (!isolation_suitable(cc, page))
- goto next_pageblock;
+ /*
+ * Skip if free. We read page order here without zone lock
+ * which is generally unsafe, but the race window is small and
+ * the worst thing that can happen is that we skip some
+ * potential isolation targets.
+ */
+ if (PageBuddy(page)) {
+ unsigned long freepage_order = page_order_unsafe(page);
/*
- * For async migration, also only scan in MOVABLE
- * blocks. Async migration is optimistic to see if
- * the minimum amount of work satisfies the allocation
+ * Without lock, we cannot be sure that what we got is
+ * a valid page order. Consider only values in the
+ * valid order range to prevent low_pfn overflow.
*/
- mt = get_pageblock_migratetype(page);
- if (cc->mode == MIGRATE_ASYNC &&
- !migrate_async_suitable(mt)) {
- set_unsuitable = false;
- goto next_pageblock;
- }
- }
-
- /*
- * Skip if free. page_order cannot be used without zone->lock
- * as nothing prevents parallel allocations or buddy merging.
- */
- if (PageBuddy(page))
+ if (freepage_order > 0 && freepage_order < MAX_ORDER)
+ low_pfn += (1UL << freepage_order) - 1;
continue;
+ }
/*
* Check may be lockless but that's ok as we recheck later.
@@ -596,11 +642,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
* Skip any other type of page
*/
if (!PageLRU(page)) {
- if (unlikely(balloon_page_movable(page))) {
- if (locked && balloon_page_isolate(page)) {
- /* Successfully isolated */
- goto isolate_success;
- }
+ if (unlikely(PageBalloon(page)) &&
+ balloon_page_isolate(page)) {
+ /* Successfully isolated */
+ goto isolate_success;
}
continue;
}
@@ -617,8 +662,11 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
*/
if (PageTransHuge(page)) {
if (!locked)
- goto next_pageblock;
- low_pfn += (1 << compound_order(page)) - 1;
+ low_pfn = ALIGN(low_pfn + 1,
+ pageblock_nr_pages) - 1;
+ else
+ low_pfn += (1 << compound_order(page)) - 1;
+
continue;
}
@@ -631,24 +679,26 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
page_count(page) > page_mapcount(page))
continue;
- /* Check if it is ok to still hold the lock */
- locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
- locked, cc);
- if (!locked || fatal_signal_pending(current))
- break;
+ /* If we already hold the lock, we can skip some rechecking */
+ if (!locked) {
+ locked = compact_trylock_irqsave(&zone->lru_lock,
+ &flags, cc);
+ if (!locked)
+ break;
- /* Recheck PageLRU and PageTransHuge under lock */
- if (!PageLRU(page))
- continue;
- if (PageTransHuge(page)) {
- low_pfn += (1 << compound_order(page)) - 1;
- continue;
+ /* Recheck PageLRU and PageTransHuge under lock */
+ if (!PageLRU(page))
+ continue;
+ if (PageTransHuge(page)) {
+ low_pfn += (1 << compound_order(page)) - 1;
+ continue;
+ }
}
lruvec = mem_cgroup_page_lruvec(page, zone);
/* Try isolate the page */
- if (__isolate_lru_page(page, mode) != 0)
+ if (__isolate_lru_page(page, isolate_mode) != 0)
continue;
VM_BUG_ON_PAGE(PageTransCompound(page), page);
@@ -667,13 +717,15 @@ isolate_success:
++low_pfn;
break;
}
-
- continue;
-
-next_pageblock:
- low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
}
+ /*
+ * The PageBuddy() check could have potentially brought us outside
+ * the range to be scanned.
+ */
+ if (unlikely(low_pfn > end_pfn))
+ low_pfn = end_pfn;
+
acct_isolated(zone, locked, cc);
if (locked)
@@ -684,8 +736,7 @@ next_pageblock:
* if the whole pageblock was scanned without isolating any page.
*/
if (low_pfn == end_pfn)
- update_pageblock_skip(cc, valid_page, nr_isolated,
- set_unsuitable, true);
+ update_pageblock_skip(cc, valid_page, nr_isolated, true);
trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
@@ -696,17 +747,64 @@ next_pageblock:
return low_pfn;
}
+/**
+ * isolate_migratepages_range() - isolate migrate-able pages in a PFN range
+ * @cc: Compaction control structure.
+ * @start_pfn: The first PFN to start isolating.
+ * @end_pfn: The one-past-last PFN.
+ *
+ * Returns zero if isolation fails fatally due to e.g. pending signal.
+ * Otherwise, function returns one-past-the-last PFN of isolated page
+ * (which may be greater than end_pfn if end fell in the middle of a THP page).
+ */
+unsigned long
+isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ unsigned long pfn, block_end_pfn;
+
+ /* Scan block by block. First and last block may be incomplete */
+ pfn = start_pfn;
+ block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+
+ for (; pfn < end_pfn; pfn = block_end_pfn,
+ block_end_pfn += pageblock_nr_pages) {
+
+ block_end_pfn = min(block_end_pfn, end_pfn);
+
+ if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
+ continue;
+
+ pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
+ ISOLATE_UNEVICTABLE);
+
+ /*
+ * In case of fatal failure, release everything that might
+ * have been isolated in the previous iteration, and signal
+ * the failure back to caller.
+ */
+ if (!pfn) {
+ putback_movable_pages(&cc->migratepages);
+ cc->nr_migratepages = 0;
+ break;
+ }
+ }
+
+ return pfn;
+}
+
#endif /* CONFIG_COMPACTION || CONFIG_CMA */
#ifdef CONFIG_COMPACTION
/*
* Based on information in the current compact_control, find blocks
* suitable for isolating free pages from and then isolate them.
*/
-static void isolate_freepages(struct zone *zone,
- struct compact_control *cc)
+static void isolate_freepages(struct compact_control *cc)
{
+ struct zone *zone = cc->zone;
struct page *page;
unsigned long block_start_pfn; /* start of current pageblock */
+ unsigned long isolate_start_pfn; /* exact pfn we start at */
unsigned long block_end_pfn; /* end of current pageblock */
unsigned long low_pfn; /* lowest pfn scanner is able to scan */
int nr_freepages = cc->nr_freepages;
@@ -715,14 +813,15 @@ static void isolate_freepages(struct zone *zone,
/*
* Initialise the free scanner. The starting point is where we last
* successfully isolated from, zone-cached value, or the end of the
- * zone when isolating for the first time. We need this aligned to
- * the pageblock boundary, because we do
+ * zone when isolating for the first time. For looping we also need
+ * this pfn aligned down to the pageblock boundary, because we do
* block_start_pfn -= pageblock_nr_pages in the for loop.
* For ending point, take care when isolating in the last pageblock of
* a zone which ends in the middle of a pageblock.
* The low boundary is the end of the pageblock the migration scanner
* is using.
*/
+ isolate_start_pfn = cc->free_pfn;
block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
zone_end_pfn(zone));
@@ -735,7 +834,8 @@ static void isolate_freepages(struct zone *zone,
*/
for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages;
block_end_pfn = block_start_pfn,
- block_start_pfn -= pageblock_nr_pages) {
+ block_start_pfn -= pageblock_nr_pages,
+ isolate_start_pfn = block_start_pfn) {
unsigned long isolated;
/*
@@ -747,18 +847,9 @@ static void isolate_freepages(struct zone *zone,
&& compact_should_abort(cc))
break;
- if (!pfn_valid(block_start_pfn))
- continue;
-
- /*
- * Check for overlapping nodes/zones. It's possible on some
- * configurations to have a setup like
- * node0 node1 node0
- * i.e. it's possible that all pages within a zones range of
- * pages do not belong to a single zone.
- */
- page = pfn_to_page(block_start_pfn);
- if (page_zone(page) != zone)
+ page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
+ zone);
+ if (!page)
continue;
/* Check the block is suitable for migration */
@@ -769,13 +860,25 @@ static void isolate_freepages(struct zone *zone,
if (!isolation_suitable(cc, page))
continue;
- /* Found a block suitable for isolating free pages from */
- cc->free_pfn = block_start_pfn;
- isolated = isolate_freepages_block(cc, block_start_pfn,
+ /* Found a block suitable for isolating free pages from. */
+ isolated = isolate_freepages_block(cc, &isolate_start_pfn,
block_end_pfn, freelist, false);
nr_freepages += isolated;
/*
+ * Remember where the free scanner should restart next time,
+ * which is where isolate_freepages_block() left off.
+ * But if it scanned the whole pageblock, isolate_start_pfn
+ * now points at block_end_pfn, which is the start of the next
+ * pageblock.
+ * In that case we will however want to restart at the start
+ * of the previous pageblock.
+ */
+ cc->free_pfn = (isolate_start_pfn < block_end_pfn) ?
+ isolate_start_pfn :
+ block_start_pfn - pageblock_nr_pages;
+
+ /*
* Set a flag that we successfully isolated in this pageblock.
* In the next loop iteration, zone->compact_cached_free_pfn
* will not be updated and thus it will effectively contain the
@@ -822,7 +925,7 @@ static struct page *compaction_alloc(struct page *migratepage,
*/
if (list_empty(&cc->freepages)) {
if (!cc->contended)
- isolate_freepages(cc->zone, cc);
+ isolate_freepages(cc);
if (list_empty(&cc->freepages))
return NULL;
@@ -856,38 +959,83 @@ typedef enum {
} isolate_migrate_t;
/*
- * Isolate all pages that can be migrated from the block pointed to by
- * the migrate scanner within compact_control.
+ * Isolate all pages that can be migrated from the first suitable block,
+ * starting at the block pointed to by the migrate scanner pfn within
+ * compact_control.
*/
static isolate_migrate_t isolate_migratepages(struct zone *zone,
struct compact_control *cc)
{
unsigned long low_pfn, end_pfn;
+ struct page *page;
+ const isolate_mode_t isolate_mode =
+ (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0);
- /* Do not scan outside zone boundaries */
- low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
+ /*
+	 * Start where we last stopped, or at the beginning of the zone as
+ * initialized by compact_zone()
+ */
+ low_pfn = cc->migrate_pfn;
/* Only scan within a pageblock boundary */
end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
- /* Do not cross the free scanner or scan within a memory hole */
- if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
- cc->migrate_pfn = end_pfn;
- return ISOLATE_NONE;
- }
+ /*
+	 * Iterate over whole pageblocks until we find the first suitable one.
+ * Do not cross the free scanner.
+ */
+ for (; end_pfn <= cc->free_pfn;
+ low_pfn = end_pfn, end_pfn += pageblock_nr_pages) {
+
+ /*
+ * This can potentially iterate a massively long zone with
+ * many pageblocks unsuitable, so periodically check if we
+ * need to schedule, or even abort async compaction.
+ */
+ if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
+ && compact_should_abort(cc))
+ break;
+
+ page = pageblock_pfn_to_page(low_pfn, end_pfn, zone);
+ if (!page)
+ continue;
+
+ /* If isolation recently failed, do not retry */
+ if (!isolation_suitable(cc, page))
+ continue;
+
+ /*
+ * For async compaction, also only scan in MOVABLE blocks.
+ * Async compaction is optimistic to see if the minimum amount
+ * of work satisfies the allocation.
+ */
+ if (cc->mode == MIGRATE_ASYNC &&
+ !migrate_async_suitable(get_pageblock_migratetype(page)))
+ continue;
- /* Perform the isolation */
- low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn, false);
- if (!low_pfn || cc->contended)
- return ISOLATE_ABORT;
+ /* Perform the isolation */
+ low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
+ isolate_mode);
+ if (!low_pfn || cc->contended)
+ return ISOLATE_ABORT;
+
+ /*
+ * Either we isolated something and proceed with migration. Or
+ * we failed and compact_zone should decide if we should
+ * continue or not.
+ */
+ break;
+ }
+
+ /* Record where migration scanner will be restarted */
cc->migrate_pfn = low_pfn;
- return ISOLATE_SUCCESS;
+ return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
}
-static int compact_finished(struct zone *zone,
- struct compact_control *cc)
+static int compact_finished(struct zone *zone, struct compact_control *cc,
+ const int migratetype)
{
unsigned int order;
unsigned long watermark;
@@ -933,7 +1081,7 @@ static int compact_finished(struct zone *zone,
struct free_area *area = &zone->free_area[order];
/* Job done if page is free of the right migratetype */
- if (!list_empty(&area->free_list[cc->migratetype]))
+ if (!list_empty(&area->free_list[migratetype]))
return COMPACT_PARTIAL;
/* Job done if allocation would set block type */
@@ -999,6 +1147,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
int ret;
unsigned long start_pfn = zone->zone_start_pfn;
unsigned long end_pfn = zone_end_pfn(zone);
+ const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
const bool sync = cc->mode != MIGRATE_ASYNC;
ret = compaction_suitable(zone, cc->order);
@@ -1041,7 +1190,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
migrate_prep_local();
- while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
+ while ((ret = compact_finished(zone, cc, migratetype)) ==
+ COMPACT_CONTINUE) {
int err;
switch (isolate_migratepages(zone, cc)) {
@@ -1056,9 +1206,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
;
}
- if (!cc->nr_migratepages)
- continue;
-
err = migrate_pages(&cc->migratepages, compaction_alloc,
compaction_free, (unsigned long)cc, cc->mode,
MR_COMPACTION);
@@ -1092,14 +1239,14 @@ out:
}
static unsigned long compact_zone_order(struct zone *zone, int order,
- gfp_t gfp_mask, enum migrate_mode mode, bool *contended)
+ gfp_t gfp_mask, enum migrate_mode mode, int *contended)
{
unsigned long ret;
struct compact_control cc = {
.nr_freepages = 0,
.nr_migratepages = 0,
.order = order,
- .migratetype = allocflags_to_migratetype(gfp_mask),
+ .gfp_mask = gfp_mask,
.zone = zone,
.mode = mode,
};
@@ -1124,48 +1271,117 @@ int sysctl_extfrag_threshold = 500;
* @gfp_mask: The GFP mask of the current allocation
* @nodemask: The allowed nodes to allocate from
* @mode: The migration mode for async, sync light, or sync migration
- * @contended: Return value that is true if compaction was aborted due to lock contention
- * @page: Optionally capture a free page of the requested order during compaction
+ * @contended: Return value that determines if compaction was aborted due to
+ * need_resched() or lock contention
+ * @candidate_zone: Return the zone where we think allocation should succeed
*
* This is the main entry point for direct page compaction.
*/
unsigned long try_to_compact_pages(struct zonelist *zonelist,
int order, gfp_t gfp_mask, nodemask_t *nodemask,
- enum migrate_mode mode, bool *contended)
+ enum migrate_mode mode, int *contended,
+ struct zone **candidate_zone)
{
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
int may_enter_fs = gfp_mask & __GFP_FS;
int may_perform_io = gfp_mask & __GFP_IO;
struct zoneref *z;
struct zone *zone;
- int rc = COMPACT_SKIPPED;
+ int rc = COMPACT_DEFERRED;
int alloc_flags = 0;
+ int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */
+
+ *contended = COMPACT_CONTENDED_NONE;
/* Check if the GFP flags allow compaction */
if (!order || !may_enter_fs || !may_perform_io)
- return rc;
-
- count_compact_event(COMPACTSTALL);
+ return COMPACT_SKIPPED;
#ifdef CONFIG_CMA
- if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+ if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA;
#endif
/* Compact each zone in the list */
for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
nodemask) {
int status;
+ int zone_contended;
+
+ if (compaction_deferred(zone, order))
+ continue;
status = compact_zone_order(zone, order, gfp_mask, mode,
- contended);
+ &zone_contended);
rc = max(status, rc);
+ /*
+ * It takes at least one zone that wasn't lock contended
+ * to clear all_zones_contended.
+ */
+ all_zones_contended &= zone_contended;
/* If a normal allocation would succeed, stop compacting */
if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0,
- alloc_flags))
- break;
+ alloc_flags)) {
+ *candidate_zone = zone;
+ /*
+ * We think the allocation will succeed in this zone,
+ * but it is not certain, hence the false. The caller
+ * will repeat this with true if allocation indeed
+ * succeeds in this zone.
+ */
+ compaction_defer_reset(zone, order, false);
+ /*
+ * It is possible that async compaction aborted due to
+ * need_resched() and the watermarks were ok thanks to
+ * somebody else freeing memory. The allocation can
+ * however still fail so we better signal the
+ * need_resched() contention anyway (this will not
+ * prevent the allocation attempt).
+ */
+ if (zone_contended == COMPACT_CONTENDED_SCHED)
+ *contended = COMPACT_CONTENDED_SCHED;
+
+ goto break_loop;
+ }
+
+ if (mode != MIGRATE_ASYNC) {
+ /*
+ * We think that allocation won't succeed in this zone
+ * so we defer compaction there. If it ends up
+ * succeeding after all, it will be reset.
+ */
+ defer_compaction(zone, order);
+ }
+
+ /*
+ * We might have stopped compacting due to need_resched() in
+ * async compaction, or due to a fatal signal detected. In that
+ * case do not try further zones and signal need_resched()
+ * contention.
+ */
+ if ((zone_contended == COMPACT_CONTENDED_SCHED)
+ || fatal_signal_pending(current)) {
+ *contended = COMPACT_CONTENDED_SCHED;
+ goto break_loop;
+ }
+
+ continue;
+break_loop:
+ /*
+ * We might not have tried all the zones, so be conservative
+ * and assume they are not all lock contended.
+ */
+ all_zones_contended = 0;
+ break;
}
+ /*
+ * If at least one zone wasn't deferred or skipped, we report if all
+ * zones that were tried were lock contended.
+ */
+ if (rc > COMPACT_SKIPPED && all_zones_contended)
+ *contended = COMPACT_CONTENDED_LOCK;
+
return rc;
}
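With *contended now a tri-state (COMPACT_CONTENDED_NONE, _LOCK, _SCHED), callers can tell rescheduling pressure apart from lock contention. A caller-side sketch, with the surrounding allocation logic elided:

/* Caller-side sketch: interpreting the tri-state contention signal. */
int contended;
struct zone *candidate_zone;
unsigned long rc;

rc = try_to_compact_pages(zonelist, order, gfp_mask, nodemask,
			  MIGRATE_ASYNC, &contended, &candidate_zone);

if (contended == COMPACT_CONTENDED_SCHED) {
	/* need_resched() or a fatal signal: do not retry compaction now */
} else if (contended == COMPACT_CONTENDED_LOCK) {
	/* every zone tried was lock contended: treat like deferral */
}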
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d9a21d06b862..d81f8ba88c0c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1096,7 +1096,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long mmun_end; /* For mmu_notifiers */
ptl = pmd_lockptr(mm, pmd);
- VM_BUG_ON(!vma->anon_vma);
+ VM_BUG_ON_VMA(!vma->anon_vma, vma);
haddr = address & HPAGE_PMD_MASK;
if (is_huge_zero_pmd(orig_pmd))
goto alloc;
@@ -2080,7 +2080,7 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
if (vma->vm_ops)
/* khugepaged not yet working on file or special mappings */
return 0;
- VM_BUG_ON(vma->vm_flags & VM_NO_THP);
+ VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma);
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
if (hstart < hend)
@@ -2319,23 +2319,17 @@ static struct page
int node)
{
VM_BUG_ON_PAGE(*hpage, *hpage);
+
/*
- * Allocate the page while the vma is still valid and under
- * the mmap_sem read mode so there is no memory allocation
- * later when we take the mmap_sem in write mode. This is more
- * friendly behavior (OTOH it may actually hide bugs) to
- * filesystems in userland with daemons allocating memory in
- * the userland I/O paths. Allocating memory with the
- * mmap_sem in read mode is good idea also to allow greater
- * scalability.
+ * Before allocating the hugepage, release the mmap_sem read lock.
+ * The allocation can take potentially a long time if it involves
+ * sync compaction, and we do not need to hold the mmap_sem during
+ * that. We will recheck the vma after taking it again in write mode.
*/
+ up_read(&mm->mmap_sem);
+
*hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask(
khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
- /*
- * After allocating the hugepage, release the mmap_sem read lock in
- * preparation for taking it in write mode.
- */
- up_read(&mm->mmap_sem);
if (unlikely(!*hpage)) {
count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
*hpage = ERR_PTR(-ENOMEM);
@@ -2409,7 +2403,7 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
return false;
if (is_vma_temporary_stack(vma))
return false;
- VM_BUG_ON(vma->vm_flags & VM_NO_THP);
+ VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma);
return true;
}
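These conversions rely on VM_BUG_ON_VMA(), which dumps the offending vma before crashing; its definition is presumably along these lines (a sketch reconstructed from the call sites, not shown in this diff):

/* Sketch of the assertion pattern these conversions rely on. */
#ifdef CONFIG_DEBUG_VM
#define VM_BUG_ON_VMA(cond, vma)					\
	do {								\
		if (unlikely(cond)) {					\
			dump_vma(vma);	/* print vma state first */	\
			BUG();						\
		}							\
	} while (0)
#else
#define VM_BUG_ON_VMA(cond, vma)	BUILD_BUG_ON_INVALID(cond)
#endif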
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index eeceeeb09019..9fd722769927 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -434,7 +434,7 @@ static inline struct resv_map *inode_resv_map(struct inode *inode)
static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
{
- VM_BUG_ON(!is_vm_hugetlb_page(vma));
+ VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
if (vma->vm_flags & VM_MAYSHARE) {
struct address_space *mapping = vma->vm_file->f_mapping;
struct inode *inode = mapping->host;
@@ -449,8 +449,8 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
{
- VM_BUG_ON(!is_vm_hugetlb_page(vma));
- VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);
+ VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
+ VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
set_vma_private_data(vma, (get_vma_private_data(vma) &
HPAGE_RESV_MASK) | (unsigned long)map);
@@ -458,15 +458,15 @@ static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
{
- VM_BUG_ON(!is_vm_hugetlb_page(vma));
- VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);
+ VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
+ VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
set_vma_private_data(vma, get_vma_private_data(vma) | flags);
}
static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
{
- VM_BUG_ON(!is_vm_hugetlb_page(vma));
+ VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
return (get_vma_private_data(vma) & flag) != 0;
}
@@ -474,7 +474,7 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
{
- VM_BUG_ON(!is_vm_hugetlb_page(vma));
+ VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
if (!(vma->vm_flags & VM_MAYSHARE))
vma->vm_private_data = (void *)0;
}
diff --git a/mm/internal.h b/mm/internal.h
index a1b651b11c5f..829304090b90 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -142,10 +142,10 @@ struct compact_control {
bool finished_update_migrate;
int order; /* order a direct compactor needs */
- int migratetype; /* MOVABLE, RECLAIMABLE etc */
+ const gfp_t gfp_mask; /* gfp mask of a direct compactor */
struct zone *zone;
- bool contended; /* True if a lock was contended, or
- * need_resched() true during async
+	int contended;			/* Signal need_resched() or lock
+ * contention detected during
* compaction
*/
};
@@ -154,8 +154,8 @@ unsigned long
isolate_freepages_range(struct compact_control *cc,
unsigned long start_pfn, unsigned long end_pfn);
unsigned long
-isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
- unsigned long low_pfn, unsigned long end_pfn, bool unevictable);
+isolate_migratepages_range(struct compact_control *cc,
+ unsigned long low_pfn, unsigned long end_pfn);
#endif
@@ -164,7 +164,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
* general, page_zone(page)->lock must be held by the caller to prevent the
* page from being allocated in parallel and returning garbage as the order.
* If a caller does not hold page_zone(page)->lock, it must guarantee that the
- * page cannot be allocated or merged in parallel.
+ * page cannot be allocated or merged in parallel. Alternatively, it must
+ * handle invalid values gracefully, and use page_order_unsafe() below.
*/
static inline unsigned long page_order(struct page *page)
{
@@ -172,6 +173,19 @@ static inline unsigned long page_order(struct page *page)
return page_private(page);
}
+/*
+ * Like page_order(), but for callers who cannot afford to hold the zone lock.
+ * PageBuddy() should be checked first by the caller to minimize race window,
+ * and invalid values must be handled gracefully.
+ *
+ * ACCESS_ONCE is used so that if the caller assigns the result into a local
+ * variable and e.g. tests it for valid range before using, the compiler cannot
+ * decide to remove the variable and inline the page_private(page) multiple
+ * times, potentially observing different values in the tests and the actual
+ * use of the result.
+ */
+#define page_order_unsafe(page) ACCESS_ONCE(page_private(page))
+
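The compaction hunks above use it exactly this way: check PageBuddy() first, read the order once, and trust only values inside the valid range. Sketched:

/* Usage sketch: lockless order read with range validation. */
if (PageBuddy(page)) {
	unsigned long freepage_order = page_order_unsafe(page);

	/* A racing allocation may leave garbage; validate before use. */
	if (freepage_order > 0 && freepage_order < MAX_ORDER)
		low_pfn += (1UL << freepage_order) - 1;
}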
static inline bool is_cow_mapping(vm_flags_t flags)
{
return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
diff --git a/mm/interval_tree.c b/mm/interval_tree.c
index 4a5822a586e6..8da581fa9060 100644
--- a/mm/interval_tree.c
+++ b/mm/interval_tree.c
@@ -34,7 +34,7 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node,
struct vm_area_struct *parent;
unsigned long last = vma_last_pgoff(node);
- VM_BUG_ON(vma_start_pgoff(node) != vma_start_pgoff(prev));
+ VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node);
if (!prev->shared.linear.rb.rb_right) {
parent = prev;
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
index fd814fd61319..cab58bb592d8 100644
--- a/mm/kmemcheck.c
+++ b/mm/kmemcheck.c
@@ -2,6 +2,7 @@
#include <linux/mm_types.h>
#include <linux/mm.h>
#include <linux/slab.h>
+#include "slab.h"
#include <linux/kmemcheck.h>
void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
diff --git a/mm/memory.c b/mm/memory.c
index adeac306610f..838f929c6aff 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2975,6 +2975,8 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
int dirtied = 0;
int ret, tmp;
+ WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
+
ret = __do_fault(vma, address, pgoff, flags, &fault_page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
@@ -3005,6 +3007,12 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (set_page_dirty(fault_page))
dirtied = 1;
+ /*
+ * Take a local copy of the address_space - page.mapping may be zeroed
+ * by truncate after unlock_page(). The address_space itself remains
+ * pinned by vma->vm_file's reference. We rely on unlock_page()'s
+ * release semantics to prevent the compiler from undoing this copying.
+ */
mapping = fault_page->mapping;
unlock_page(fault_page);
if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 2ff8c2325e96..29d8693d0c61 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1307,7 +1307,7 @@ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
/*
* Confirm all pages in a range [start, end) belong to the same zone.
*/
-static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
+int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long pfn;
struct zone *zone = NULL;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8f5330d74f47..008fb32936eb 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -123,25 +123,23 @@ static struct mempolicy default_policy = {
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
-static struct mempolicy *get_task_policy(struct task_struct *p)
+struct mempolicy *get_task_policy(struct task_struct *p)
{
struct mempolicy *pol = p->mempolicy;
+ int node;
- if (!pol) {
- int node = numa_node_id();
+ if (pol)
+ return pol;
- if (node != NUMA_NO_NODE) {
- pol = &preferred_node_policy[node];
- /*
- * preferred_node_policy is not initialised early in
- * boot
- */
- if (!pol->mode)
- pol = NULL;
- }
+ node = numa_node_id();
+ if (node != NUMA_NO_NODE) {
+ pol = &preferred_node_policy[node];
+ /* preferred_node_policy is not initialised early in boot */
+ if (pol->mode)
+ return pol;
}
- return pol;
+ return &default_policy;
}
static const struct mempolicy_operations {
@@ -804,7 +802,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
nodemask_t *nodes)
{
struct mempolicy *new, *old;
- struct mm_struct *mm = current->mm;
NODEMASK_SCRATCH(scratch);
int ret;
@@ -816,20 +813,11 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
ret = PTR_ERR(new);
goto out;
}
- /*
- * prevent changing our mempolicy while show_numa_maps()
- * is using it.
- * Note: do_set_mempolicy() can be called at init time
- * with no 'mm'.
- */
- if (mm)
- down_write(&mm->mmap_sem);
+
task_lock(current);
ret = mpol_set_nodemask(new, nodes, scratch);
if (ret) {
task_unlock(current);
- if (mm)
- up_write(&mm->mmap_sem);
mpol_put(new);
goto out;
}
@@ -839,9 +827,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
nodes_weight(new->v.nodes))
current->il_next = first_node(new->v.nodes);
task_unlock(current);
- if (mm)
- up_write(&mm->mmap_sem);
-
mpol_put(old);
ret = 0;
out:
@@ -1605,32 +1590,14 @@ COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
#endif
-/*
- * get_vma_policy(@task, @vma, @addr)
- * @task: task for fallback if vma policy == default
- * @vma: virtual memory area whose policy is sought
- * @addr: address in @vma for shared policy lookup
- *
- * Returns effective policy for a VMA at specified address.
- * Falls back to @task or system default policy, as necessary.
- * Current or other task's task mempolicy and non-shared vma policies must be
- * protected by task_lock(task) by the caller.
- * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
- * count--added by the get_policy() vm_op, as appropriate--to protect against
- * freeing by another task. It is the caller's responsibility to free the
- * extra reference for shared policies.
- */
-struct mempolicy *get_vma_policy(struct task_struct *task,
- struct vm_area_struct *vma, unsigned long addr)
+struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
+ unsigned long addr)
{
- struct mempolicy *pol = get_task_policy(task);
+ struct mempolicy *pol = NULL;
if (vma) {
if (vma->vm_ops && vma->vm_ops->get_policy) {
- struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
- addr);
- if (vpol)
- pol = vpol;
+ pol = vma->vm_ops->get_policy(vma, addr);
} else if (vma->vm_policy) {
pol = vma->vm_policy;
@@ -1644,31 +1611,51 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
mpol_get(pol);
}
}
+
+ return pol;
+}
+
+/*
+ * get_vma_policy(@vma, @addr)
+ * @vma: virtual memory area whose policy is sought
+ * @addr: address in @vma for shared policy lookup
+ *
+ * Returns effective policy for a VMA at specified address.
+ * Falls back to current->mempolicy or system default policy, as necessary.
+ * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
+ * count--added by the get_policy() vm_op, as appropriate--to protect against
+ * freeing by another task. It is the caller's responsibility to free the
+ * extra reference for shared policies.
+ */
+static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ struct mempolicy *pol = __get_vma_policy(vma, addr);
+
if (!pol)
- pol = &default_policy;
+ pol = get_task_policy(current);
+
return pol;
}
-bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma)
+bool vma_policy_mof(struct vm_area_struct *vma)
{
- struct mempolicy *pol = get_task_policy(task);
- if (vma) {
- if (vma->vm_ops && vma->vm_ops->get_policy) {
- bool ret = false;
+ struct mempolicy *pol;
- pol = vma->vm_ops->get_policy(vma, vma->vm_start);
- if (pol && (pol->flags & MPOL_F_MOF))
- ret = true;
- mpol_cond_put(pol);
+ if (vma->vm_ops && vma->vm_ops->get_policy) {
+ bool ret = false;
- return ret;
- } else if (vma->vm_policy) {
- pol = vma->vm_policy;
- }
+ pol = vma->vm_ops->get_policy(vma, vma->vm_start);
+ if (pol && (pol->flags & MPOL_F_MOF))
+ ret = true;
+ mpol_cond_put(pol);
+
+ return ret;
}
+ pol = vma->vm_policy;
if (!pol)
- return default_policy.flags & MPOL_F_MOF;
+ pol = get_task_policy(current);
return pol->flags & MPOL_F_MOF;
}
@@ -1874,7 +1861,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
{
struct zonelist *zl;
- *mpol = get_vma_policy(current, vma, addr);
+ *mpol = get_vma_policy(vma, addr);
*nodemask = NULL; /* assume !MPOL_BIND */
if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
@@ -2029,7 +2016,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
unsigned int cpuset_mems_cookie;
retry_cpuset:
- pol = get_vma_policy(current, vma, addr);
+ pol = get_vma_policy(vma, addr);
cpuset_mems_cookie = read_mems_allowed_begin();
if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
@@ -2046,8 +2033,7 @@ retry_cpuset:
page = __alloc_pages_nodemask(gfp, order,
policy_zonelist(gfp, pol, node),
policy_nodemask(gfp, pol));
- if (unlikely(mpol_needs_cond_ref(pol)))
- __mpol_put(pol);
+ mpol_cond_put(pol);
if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
goto retry_cpuset;
return page;
@@ -2074,12 +2060,12 @@ retry_cpuset:
*/
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
- struct mempolicy *pol = get_task_policy(current);
+ struct mempolicy *pol = &default_policy;
struct page *page;
unsigned int cpuset_mems_cookie;
- if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
- pol = &default_policy;
+ if (!in_interrupt() && !(gfp & __GFP_THISNODE))
+ pol = get_task_policy(current);
retry_cpuset:
cpuset_mems_cookie = read_mems_allowed_begin();
@@ -2296,7 +2282,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
BUG_ON(!vma);
- pol = get_vma_policy(current, vma, addr);
+ pol = get_vma_policy(vma, addr);
if (!(pol->flags & MPOL_F_MOF))
goto out;
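After this refactor the lookup inside mm/mempolicy.c drops the task argument and falls back to current's policy internally, while __get_vma_policy() exposes the raw vma-only result. A usage sketch of the new convention:

/* Usage sketch: the post-refactor lookup no longer takes a task. */
struct mempolicy *pol = get_vma_policy(vma, addr);	/* never NULL now */

/* ... use pol for the allocation decision ... */

mpol_cond_put(pol);	/* drops the ref only for MPOL_F_SHARED policies */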
diff --git a/mm/migrate.c b/mm/migrate.c
index f78ec9bd454d..09d489c55c21 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -92,7 +92,7 @@ void putback_movable_pages(struct list_head *l)
list_del(&page->lru);
dec_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
- if (unlikely(isolated_balloon_page(page)))
+ if (unlikely(PageBalloon(page)))
balloon_page_putback(page);
else
putback_lru_page(page);
@@ -873,18 +873,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
}
}
- if (unlikely(balloon_page_movable(page))) {
- /*
- * A ballooned page does not need any special attention from
- * physical to virtual reverse mapping procedures.
- * Skip any attempt to unmap PTEs or to remap swap cache,
- * in order to avoid burning cycles at rmap level, and perform
- * the page migration right away (protected by page lock).
- */
- rc = balloon_page_migrate(newpage, page, mode);
- goto out_unlock;
- }
-
/*
* Corner case handling:
* 1. When a new swap-cache page is read into, it is added to the LRU
@@ -952,17 +940,6 @@ static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page,
rc = __unmap_and_move(page, newpage, force, mode);
- if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) {
- /*
- * A ballooned page has been migrated already.
- * Now, it's the time to wrap-up counters,
- * handle the page back to Buddy and return.
- */
- dec_zone_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
- balloon_page_free(page);
- return MIGRATEPAGE_SUCCESS;
- }
out:
if (rc != -EAGAIN) {
/*
@@ -1137,6 +1114,10 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
rc = unmap_and_move_huge_page(get_new_page,
put_new_page, private, page,
pass > 2, mode);
+ else if (PageBalloon(page))
+ rc = balloon_page_migrate(get_new_page,
+ put_new_page, private,
+ page, pass > 2, mode);
else
rc = unmap_and_move(get_new_page, put_new_page,
private, page, pass > 2, mode);
diff --git a/mm/mlock.c b/mm/mlock.c
index ab3150c26711..8328af634143 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -233,8 +233,8 @@ long __mlock_vma_pages_range(struct vm_area_struct *vma,
VM_BUG_ON(start & ~PAGE_MASK);
VM_BUG_ON(end & ~PAGE_MASK);
- VM_BUG_ON(start < vma->vm_start);
- VM_BUG_ON(end > vma->vm_end);
+ VM_BUG_ON_VMA(start < vma->vm_start, vma);
+ VM_BUG_ON_VMA(end > vma->vm_end, vma);
VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
gup_flags = FOLL_TOUCH | FOLL_MLOCK;
diff --git a/mm/mmap.c b/mm/mmap.c
index 650a1f14a945..e31c1d102c94 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -70,7 +70,7 @@ static void unmap_region(struct mm_struct *mm,
* MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes
* w: (no) no w: (no) no w: (yes) yes w: (no) no
* x: (no) no x: (no) yes x: (no) yes x: (yes) yes
- *
+ *
* MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes
* w: (no) no w: (no) no w: (copy) copy w: (no) no
* x: (no) no x: (no) yes x: (no) yes x: (yes) yes
@@ -268,7 +268,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len);
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
- unsigned long rlim, retval;
+ unsigned long retval;
unsigned long newbrk, oldbrk;
struct mm_struct *mm = current->mm;
unsigned long min_brk;
@@ -298,9 +298,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
 * segment grow beyond its set limit in the case where the limit is
* not page aligned -Ram Gupta
*/
- rlim = rlimit(RLIMIT_DATA);
- if (rlim < RLIM_INFINITY && (brk - mm->start_brk) +
- (mm->end_data - mm->start_data) > rlim)
+ if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk,
+ mm->end_data, mm->start_data))
goto out;
newbrk = PAGE_ALIGN(brk);
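check_data_rlimit() is introduced elsewhere in this series; judging from the open-coded test it replaces, it presumably reads along these lines (a sketch: the exact error value is assumed, and the caller above only tests for non-zero):

static inline int check_data_rlimit(unsigned long rlim, unsigned long new,
				    unsigned long start,
				    unsigned long end_data,
				    unsigned long start_data)
{
	/* same comparison as the removed code, just centralized */
	if (rlim < RLIM_INFINITY &&
	    (new - start) + (end_data - start_data) > rlim)
		return -ENOSPC;
	return 0;
}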
@@ -369,16 +368,18 @@ static int browse_rb(struct rb_root *root)
struct vm_area_struct *vma;
vma = rb_entry(nd, struct vm_area_struct, vm_rb);
if (vma->vm_start < prev) {
- pr_emerg("vm_start %lx prev %lx\n", vma->vm_start, prev);
+ pr_emerg("vm_start %lx < prev %lx\n",
+ vma->vm_start, prev);
bug = 1;
}
if (vma->vm_start < pend) {
- pr_emerg("vm_start %lx pend %lx\n", vma->vm_start, pend);
+ pr_emerg("vm_start %lx < pend %lx\n",
+ vma->vm_start, pend);
bug = 1;
}
if (vma->vm_start > vma->vm_end) {
- pr_emerg("vm_end %lx < vm_start %lx\n",
- vma->vm_end, vma->vm_start);
+ pr_emerg("vm_start %lx > vm_end %lx\n",
+ vma->vm_start, vma->vm_end);
bug = 1;
}
if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
@@ -420,8 +421,10 @@ static void validate_mm(struct mm_struct *mm)
int i = 0;
unsigned long highest_address = 0;
struct vm_area_struct *vma = mm->mmap;
+
while (vma) {
struct anon_vma_chain *avc;
+
vma_lock_anon_vma(vma);
list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
anon_vma_interval_tree_verify(avc);
@@ -436,12 +439,13 @@ static void validate_mm(struct mm_struct *mm)
}
if (highest_address != mm->highest_vm_end) {
pr_emerg("mm->highest_vm_end %lx, found %lx\n",
- mm->highest_vm_end, highest_address);
+ mm->highest_vm_end, highest_address);
bug = 1;
}
i = browse_rb(&mm->mm_rb);
if (i != mm->map_count) {
- pr_emerg("map_count %d rb %d\n", mm->map_count, i);
+ if (i != -1)
+ pr_emerg("map_count %d rb %d\n", mm->map_count, i);
bug = 1;
}
BUG_ON(bug);
@@ -741,7 +745,7 @@ again: remove_next = 1 + (end > next->vm_end);
* split_vma inserting another: so it must be
* mprotect case 4 shifting the boundary down.
*/
- adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT);
+ adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT);
exporter = vma;
importer = next;
}
@@ -787,8 +791,8 @@ again: remove_next = 1 + (end > next->vm_end);
if (!anon_vma && adjust_next)
anon_vma = next->anon_vma;
if (anon_vma) {
- VM_BUG_ON(adjust_next && next->anon_vma &&
- anon_vma != next->anon_vma);
+ VM_BUG_ON_VMA(adjust_next && next->anon_vma &&
+ anon_vma != next->anon_vma, next);
anon_vma_lock_write(anon_vma);
anon_vma_interval_tree_pre_update_vma(vma);
if (adjust_next)
@@ -1010,7 +1014,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
struct vm_area_struct *vma_merge(struct mm_struct *mm,
struct vm_area_struct *prev, unsigned long addr,
unsigned long end, unsigned long vm_flags,
- struct anon_vma *anon_vma, struct file *file,
+ struct anon_vma *anon_vma, struct file *file,
pgoff_t pgoff, struct mempolicy *policy)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
@@ -1036,7 +1040,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
* Can it merge with the predecessor?
*/
if (prev && prev->vm_end == addr &&
- mpol_equal(vma_policy(prev), policy) &&
+ mpol_equal(vma_policy(prev), policy) &&
can_vma_merge_after(prev, vm_flags,
anon_vma, file, pgoff)) {
/*
@@ -1064,7 +1068,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
* Can this new request be merged in front of next?
*/
if (next && end == next->vm_start &&
- mpol_equal(policy, vma_policy(next)) &&
+ mpol_equal(policy, vma_policy(next)) &&
can_vma_merge_before(next, vm_flags,
anon_vma, file, pgoff+pglen)) {
if (prev && addr < prev->vm_end) /* case 4 */
@@ -1235,7 +1239,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long flags, unsigned long pgoff,
unsigned long *populate)
{
- struct mm_struct * mm = current->mm;
+ struct mm_struct *mm = current->mm;
vm_flags_t vm_flags;
*populate = 0;
@@ -1263,7 +1267,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
/* offset overflow? */
if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
- return -EOVERFLOW;
+ return -EOVERFLOW;
/* Too many mappings? */
if (mm->map_count > sysctl_max_map_count)
@@ -1921,7 +1925,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
info.align_mask = 0;
return vm_unmapped_area(&info);
}
-#endif
+#endif
/*
* This mmap-allocator allocates new areas top-down from below the
@@ -2321,13 +2325,13 @@ int expand_stack(struct vm_area_struct *vma, unsigned long address)
}
struct vm_area_struct *
-find_extend_vma(struct mm_struct * mm, unsigned long addr)
+find_extend_vma(struct mm_struct *mm, unsigned long addr)
{
- struct vm_area_struct * vma;
+ struct vm_area_struct *vma;
unsigned long start;
addr &= PAGE_MASK;
- vma = find_vma(mm,addr);
+ vma = find_vma(mm, addr);
if (!vma)
return NULL;
if (vma->vm_start <= addr)
@@ -2376,7 +2380,7 @@ static void unmap_region(struct mm_struct *mm,
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end)
{
- struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
+ struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap;
struct mmu_gather tlb;
lru_add_drain();
@@ -2423,7 +2427,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
* __split_vma() bypasses sysctl_max_map_count checking. We use this on the
* munmap path where it doesn't make sense to fail.
*/
-static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
+static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, int new_below)
{
struct vm_area_struct *new;
@@ -2512,7 +2516,8 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
return -EINVAL;
- if ((len = PAGE_ALIGN(len)) == 0)
+ len = PAGE_ALIGN(len);
+ if (len == 0)
return -EINVAL;
/* Find the first overlapping VMA */
@@ -2558,7 +2563,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
if (error)
return error;
}
- vma = prev? prev->vm_next: mm->mmap;
+ vma = prev ? prev->vm_next : mm->mmap;
/*
* unlock any mlock()ed ranges before detaching vmas
@@ -2621,10 +2626,10 @@ static inline void verify_mm_writelocked(struct mm_struct *mm)
*/
static unsigned long do_brk(unsigned long addr, unsigned long len)
{
- struct mm_struct * mm = current->mm;
- struct vm_area_struct * vma, * prev;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma, *prev;
unsigned long flags;
- struct rb_node ** rb_link, * rb_parent;
+ struct rb_node **rb_link, *rb_parent;
pgoff_t pgoff = addr >> PAGE_SHIFT;
int error;
@@ -2848,7 +2853,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
* safe. It is only safe to keep the vm_pgoff
* linear if there are no pages mapped yet.
*/
- VM_BUG_ON(faulted_in_anon_vma);
+ VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
*vmap = vma = new_vma;
}
*need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
diff --git a/mm/mremap.c b/mm/mremap.c
index 05f1180e9f21..89e45d8a983a 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -195,7 +195,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
if (pmd_trans_huge(*old_pmd)) {
int err = 0;
if (extent == HPAGE_PMD_SIZE) {
- VM_BUG_ON(vma->vm_file || !vma->anon_vma);
+ VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma,
+ vma);
/* See comment in move_ptes() */
if (need_rmap_locks)
anon_vma_lock_write(vma->anon_vma);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 1e11df8fa7ec..bbf405a3a18f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -565,7 +565,7 @@ bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask)
spin_lock(&zone_scan_lock);
for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
- if (zone_is_oom_locked(zone)) {
+ if (test_bit(ZONE_OOM_LOCKED, &zone->flags)) {
ret = false;
goto out;
}
@@ -575,7 +575,7 @@ bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask)
* call to oom_zonelist_trylock() doesn't succeed when it shouldn't.
*/
for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
- zone_set_flag(zone, ZONE_OOM_LOCKED);
+ set_bit(ZONE_OOM_LOCKED, &zone->flags);
out:
spin_unlock(&zone_scan_lock);
@@ -594,7 +594,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
spin_lock(&zone_scan_lock);
for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
- zone_clear_flag(zone, ZONE_OOM_LOCKED);
+ clear_bit(ZONE_OOM_LOCKED, &zone->flags);
spin_unlock(&zone_scan_lock);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 508599403721..ff24c9d83112 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1075,13 +1075,13 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
}
if (dirty < setpoint) {
- x = min(bdi->balanced_dirty_ratelimit,
- min(balanced_dirty_ratelimit, task_ratelimit));
+ x = min3(bdi->balanced_dirty_ratelimit,
+ balanced_dirty_ratelimit, task_ratelimit);
if (dirty_ratelimit < x)
step = x - dirty_ratelimit;
} else {
- x = max(bdi->balanced_dirty_ratelimit,
- max(balanced_dirty_ratelimit, task_ratelimit));
+ x = max3(bdi->balanced_dirty_ratelimit,
+ balanced_dirty_ratelimit, task_ratelimit);
if (dirty_ratelimit > x)
step = dirty_ratelimit - x;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 18cee0d4c8a2..3935c9a17075 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1612,9 +1612,9 @@ again:
}
__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
- if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 &&
- !zone_is_fair_depleted(zone))
- zone_set_flag(zone, ZONE_FAIR_DEPLETED);
+ if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 &&
+ !test_bit(ZONE_FAIR_DEPLETED, &zone->flags))
+ set_bit(ZONE_FAIR_DEPLETED, &zone->flags);
__count_zone_vm_events(PGALLOC, zone, 1 << order);
zone_statistics(preferred_zone, zone, gfp_flags);
@@ -1934,7 +1934,7 @@ static void reset_alloc_batches(struct zone *preferred_zone)
mod_zone_page_state(zone, NR_ALLOC_BATCH,
high_wmark_pages(zone) - low_wmark_pages(zone) -
atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
- zone_clear_flag(zone, ZONE_FAIR_DEPLETED);
+ clear_bit(ZONE_FAIR_DEPLETED, &zone->flags);
} while (zone++ != preferred_zone);
}
@@ -1985,7 +1985,7 @@ zonelist_scan:
if (alloc_flags & ALLOC_FAIR) {
if (!zone_local(preferred_zone, zone))
break;
- if (zone_is_fair_depleted(zone)) {
+ if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
nr_fair_skipped++;
continue;
}
@@ -2296,58 +2296,72 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
int classzone_idx, int migratetype, enum migrate_mode mode,
- bool *contended_compaction, bool *deferred_compaction,
- unsigned long *did_some_progress)
+ int *contended_compaction, bool *deferred_compaction)
{
+ struct zone *last_compact_zone = NULL;
+ unsigned long compact_result;
+ struct page *page;
+
if (!order)
return NULL;
- if (compaction_deferred(preferred_zone, order)) {
- *deferred_compaction = true;
- return NULL;
- }
-
current->flags |= PF_MEMALLOC;
- *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
+ compact_result = try_to_compact_pages(zonelist, order, gfp_mask,
nodemask, mode,
- contended_compaction);
+ contended_compaction,
+ &last_compact_zone);
current->flags &= ~PF_MEMALLOC;
- if (*did_some_progress != COMPACT_SKIPPED) {
- struct page *page;
+ switch (compact_result) {
+ case COMPACT_DEFERRED:
+ *deferred_compaction = true;
+ /* fall-through */
+ case COMPACT_SKIPPED:
+ return NULL;
+ default:
+ break;
+ }
- /* Page migration frees to the PCP lists but we want merging */
- drain_pages(get_cpu());
- put_cpu();
+ /*
+ * In at least one zone compaction wasn't deferred or skipped, so
+ * let's count a compaction stall.
+ */
+ count_vm_event(COMPACTSTALL);
- page = get_page_from_freelist(gfp_mask, nodemask,
- order, zonelist, high_zoneidx,
- alloc_flags & ~ALLOC_NO_WATERMARKS,
- preferred_zone, classzone_idx, migratetype);
- if (page) {
- preferred_zone->compact_blockskip_flush = false;
- compaction_defer_reset(preferred_zone, order, true);
- count_vm_event(COMPACTSUCCESS);
- return page;
- }
+ /* Page migration frees to the PCP lists but we want merging */
+ drain_pages(get_cpu());
+ put_cpu();
- /*
- * It's bad if compaction run occurs and fails.
- * The most likely reason is that pages exist,
- * but not enough to satisfy watermarks.
- */
- count_vm_event(COMPACTFAIL);
+ page = get_page_from_freelist(gfp_mask, nodemask,
+ order, zonelist, high_zoneidx,
+ alloc_flags & ~ALLOC_NO_WATERMARKS,
+ preferred_zone, classzone_idx, migratetype);
- /*
- * As async compaction considers a subset of pageblocks, only
- * defer if the failure was a sync compaction failure.
- */
- if (mode != MIGRATE_ASYNC)
- defer_compaction(preferred_zone, order);
+ if (page) {
+ struct zone *zone = page_zone(page);
- cond_resched();
+ zone->compact_blockskip_flush = false;
+ compaction_defer_reset(zone, order, true);
+ count_vm_event(COMPACTSUCCESS);
+ return page;
}
+ /*
+ * last_compact_zone is where try_to_compact_pages thought allocation
+ * should succeed, so it did not defer compaction. But here we know
+ * that it didn't succeed, so we do the defer.
+ */
+ if (last_compact_zone && mode != MIGRATE_ASYNC)
+ defer_compaction(last_compact_zone, order);
+
+ /*
+ * It's bad if a compaction run occurs and fails. The most likely reason
+ * is that pages exist, but not enough to satisfy watermarks.
+ */
+ count_vm_event(COMPACTFAIL);
+
+ cond_resched();
+
return NULL;
}
#else
@@ -2355,9 +2369,8 @@ static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int classzone_idx, int migratetype,
- enum migrate_mode mode, bool *contended_compaction,
- bool *deferred_compaction, unsigned long *did_some_progress)
+ int classzone_idx, int migratetype, enum migrate_mode mode,
+ int *contended_compaction, bool *deferred_compaction)
{
return NULL;
}
@@ -2457,12 +2470,14 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
static void wake_all_kswapds(unsigned int order,
struct zonelist *zonelist,
enum zone_type high_zoneidx,
- struct zone *preferred_zone)
+ struct zone *preferred_zone,
+ nodemask_t *nodemask)
{
struct zoneref *z;
struct zone *zone;
- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ high_zoneidx, nodemask)
wakeup_kswapd(zone, order, zone_idx(preferred_zone));
}
@@ -2509,7 +2524,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
alloc_flags |= ALLOC_NO_WATERMARKS;
}
#ifdef CONFIG_CMA
- if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+ if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA;
#endif
return alloc_flags;
@@ -2533,7 +2548,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
unsigned long did_some_progress;
enum migrate_mode migration_mode = MIGRATE_ASYNC;
bool deferred_compaction = false;
- bool contended_compaction = false;
+ int contended_compaction = COMPACT_CONTENDED_NONE;
/*
* In the slowpath, we sanity check order to avoid ever trying to
@@ -2560,7 +2575,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
restart:
if (!(gfp_mask & __GFP_NO_KSWAPD))
- wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
+ wake_all_kswapds(order, zonelist, high_zoneidx,
+ preferred_zone, nodemask);
/*
* OK, we're below the kswapd watermark and have kicked background
@@ -2633,20 +2649,40 @@ rebalance:
preferred_zone,
classzone_idx, migratetype,
migration_mode, &contended_compaction,
- &deferred_compaction,
- &did_some_progress);
+ &deferred_compaction);
if (page)
goto got_pg;
- /*
- * If compaction is deferred for high-order allocations, it is because
- * sync compaction recently failed. If this is the case and the caller
- * requested a movable allocation that does not heavily disrupt the
- * system then fail the allocation instead of entering direct reclaim.
- */
- if ((deferred_compaction || contended_compaction) &&
- (gfp_mask & __GFP_NO_KSWAPD))
- goto nopage;
+ /* Checks for THP-specific high-order allocations */
+ if ((gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE) {
+ /*
+ * If compaction is deferred for high-order allocations, it is
+ * because sync compaction recently failed. If this is the case
+ * and the caller requested a THP allocation, we do not want
+ * to heavily disrupt the system, so we fail the allocation
+ * instead of entering direct reclaim.
+ */
+ if (deferred_compaction)
+ goto nopage;
+
+ /*
+ * In all zones where compaction was attempted (and not
+ * deferred or skipped), lock contention has been detected.
+ * For THP allocation we do not want to disrupt the others
+ * so we fallback to base pages instead.
+ */
+ if (contended_compaction == COMPACT_CONTENDED_LOCK)
+ goto nopage;
+
+ /*
+ * If compaction was aborted due to need_resched(), we do not
+ * want to further increase allocation latency, unless it is
+ * khugepaged trying to collapse.
+ */
+ if (contended_compaction == COMPACT_CONTENDED_SCHED
+ && !(current->flags & PF_KTHREAD))
+ goto nopage;
+ }
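The int-valued contended_compaction channel replaces the old boolean; the values tested above come from the compaction side of this series, presumably declared as:

enum compact_contended {
	COMPACT_CONTENDED_NONE,		/* no contention detected */
	COMPACT_CONTENDED_SCHED,	/* compaction aborted: need_resched() */
	COMPACT_CONTENDED_LOCK,		/* compaction aborted: lock contention */
};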
/*
* It can become very expensive to allocate transparent hugepages at
@@ -2726,8 +2762,7 @@ rebalance:
preferred_zone,
classzone_idx, migratetype,
migration_mode, &contended_compaction,
- &deferred_compaction,
- &did_some_progress);
+ &deferred_compaction);
if (page)
goto got_pg;
}
@@ -2753,7 +2788,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
struct zone *preferred_zone;
struct zoneref *preferred_zoneref;
struct page *page = NULL;
- int migratetype = allocflags_to_migratetype(gfp_mask);
+ int migratetype = gfpflags_to_migratetype(gfp_mask);
unsigned int cpuset_mems_cookie;
int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
int classzone_idx;
@@ -2775,6 +2810,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
if (unlikely(!zonelist->_zonerefs->zone))
return NULL;
+ if (IS_ENABLED(CONFIG_CMA) && migratetype == MIGRATE_MOVABLE)
+ alloc_flags |= ALLOC_CMA;
+
retry_cpuset:
cpuset_mems_cookie = read_mems_allowed_begin();
@@ -2786,10 +2824,6 @@ retry_cpuset:
goto out;
classzone_idx = zonelist_zone_idx(preferred_zoneref);
-#ifdef CONFIG_CMA
- if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
- alloc_flags |= ALLOC_CMA;
-#endif
/* First allocation attempt */
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
zonelist, high_zoneidx, alloc_flags,
@@ -4976,6 +5010,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
pgdat->node_start_pfn = node_start_pfn;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+ printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid,
+		(u64) start_pfn << PAGE_SHIFT, ((u64) end_pfn << PAGE_SHIFT) - 1);
#endif
calculate_node_totalpages(pgdat, start_pfn, end_pfn,
zones_size, zholes_size);
@@ -5701,9 +5737,8 @@ static void __setup_per_zone_wmarks(void)
zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
__mod_zone_page_state(zone, NR_ALLOC_BATCH,
- high_wmark_pages(zone) -
- low_wmark_pages(zone) -
- zone_page_state(zone, NR_ALLOC_BATCH));
+ high_wmark_pages(zone) - low_wmark_pages(zone) -
+ atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
setup_zone_migrate_reserve(zone);
spin_unlock_irqrestore(&zone->lock, flags);
@@ -6278,8 +6313,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
if (list_empty(&cc->migratepages)) {
cc->nr_migratepages = 0;
- pfn = isolate_migratepages_range(cc->zone, cc,
- pfn, end, true);
+ pfn = isolate_migratepages_range(cc, pfn, end);
if (!pfn) {
ret = -EINTR;
break;
@@ -6596,27 +6630,26 @@ static const struct trace_print_flags pageflag_names[] = {
#endif
};
-static void dump_page_flags(unsigned long flags)
+static void dump_flags(unsigned long flags,
+ const struct trace_print_flags *names, int count)
{
const char *delim = "";
unsigned long mask;
int i;
- BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
-
- printk(KERN_ALERT "page flags: %#lx(", flags);
+ printk(KERN_ALERT "flags: %#lx(", flags);
/* remove zone id */
flags &= (1UL << NR_PAGEFLAGS) - 1;
- for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
+ for (i = 0; i < count && flags; i++) {
- mask = pageflag_names[i].mask;
+ mask = names[i].mask;
if ((flags & mask) != mask)
continue;
flags &= ~mask;
- printk("%s%s", delim, pageflag_names[i].name);
+ printk("%s%s", delim, names[i].name);
delim = "|";
}
@@ -6634,12 +6667,14 @@ void dump_page_badflags(struct page *page, const char *reason,
"page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
page, atomic_read(&page->_count), page_mapcount(page),
page->mapping, page->index);
- dump_page_flags(page->flags);
+ BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
+ dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names));
if (reason)
pr_alert("page dumped because: %s\n", reason);
if (page->flags & badflags) {
pr_alert("bad because of flags:\n");
- dump_page_flags(page->flags & badflags);
+ dump_flags(page->flags & badflags,
+ pageflag_names, ARRAY_SIZE(pageflag_names));
}
mem_cgroup_print_bad_page(page);
}
@@ -6649,3 +6684,66 @@ void dump_page(struct page *page, const char *reason)
dump_page_badflags(page, reason, 0);
}
EXPORT_SYMBOL(dump_page);
+
+#ifdef CONFIG_DEBUG_VM
+
+static const struct trace_print_flags vmaflags_names[] = {
+ {VM_READ, "read" },
+ {VM_WRITE, "write" },
+ {VM_EXEC, "exec" },
+ {VM_SHARED, "shared" },
+ {VM_MAYREAD, "mayread" },
+ {VM_MAYWRITE, "maywrite" },
+ {VM_MAYEXEC, "mayexec" },
+ {VM_MAYSHARE, "mayshare" },
+ {VM_GROWSDOWN, "growsdown" },
+ {VM_PFNMAP, "pfnmap" },
+ {VM_DENYWRITE, "denywrite" },
+ {VM_LOCKED, "locked" },
+ {VM_IO, "io" },
+ {VM_SEQ_READ, "seqread" },
+ {VM_RAND_READ, "randread" },
+ {VM_DONTCOPY, "dontcopy" },
+ {VM_DONTEXPAND, "dontexpand" },
+ {VM_ACCOUNT, "account" },
+ {VM_NORESERVE, "noreserve" },
+ {VM_HUGETLB, "hugetlb" },
+ {VM_NONLINEAR, "nonlinear" },
+#if defined(CONFIG_X86)
+ {VM_PAT, "pat" },
+#elif defined(CONFIG_PPC)
+ {VM_SAO, "sao" },
+#elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64)
+ {VM_GROWSUP, "growsup" },
+#elif !defined(CONFIG_MMU)
+ {VM_MAPPED_COPY, "mappedcopy" },
+#else
+ {VM_ARCH_1, "arch_1" },
+#endif
+ {VM_DONTDUMP, "dontdump" },
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ {VM_SOFTDIRTY, "softdirty" },
+#endif
+ {VM_MIXEDMAP, "mixedmap" },
+ {VM_HUGEPAGE, "hugepage" },
+ {VM_NOHUGEPAGE, "nohugepage" },
+ {VM_MERGEABLE, "mergeable" },
+};
+
+void dump_vma(const struct vm_area_struct *vma)
+{
+ printk(KERN_ALERT
+ "vma %p start %p end %p\n"
+ "next %p prev %p mm %p\n"
+ "prot %lx anon_vma %p vm_ops %p\n"
+ "pgoff %lx file %p private_data %p\n",
+ vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next,
+ vma->vm_prev, vma->vm_mm,
+ (unsigned long)pgprot_val(vma->vm_page_prot),
+ vma->anon_vma, vma->vm_ops, vma->vm_pgoff,
+ vma->vm_file, vma->vm_private_data);
+ dump_flags(vma->vm_flags, vmaflags_names, ARRAY_SIZE(vmaflags_names));
+}
+EXPORT_SYMBOL(dump_vma);
+
+#endif /* CONFIG_DEBUG_VM */
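dump_vma() is the backend for the VM_BUG_ON_VMA() conversions scattered through this patch; the macro itself is added elsewhere in the series, presumably along these lines (a sketch, not the exact definition):

#ifdef CONFIG_DEBUG_VM
#define VM_BUG_ON_VMA(cond, vma)					\
	do {								\
		if (unlikely(cond)) {					\
			dump_vma(vma);	/* report vma state before dying */ \
			VM_BUG_ON(cond);				\
		}							\
	} while (0)
#else
#define VM_BUG_ON_VMA(cond, vma)	BUILD_BUG_ON_INVALID(cond)
#endif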
diff --git a/mm/rmap.c b/mm/rmap.c
index 3e8491c504f8..5fbd0fe8f933 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -527,7 +527,7 @@ vma_address(struct page *page, struct vm_area_struct *vma)
unsigned long address = __vma_address(page, vma);
/* page should be within @vma mapping range */
- VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+ VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
return address;
}
@@ -897,7 +897,7 @@ void page_move_anon_rmap(struct page *page,
struct anon_vma *anon_vma = vma->anon_vma;
VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON(!anon_vma);
+ VM_BUG_ON_VMA(!anon_vma, vma);
VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page);
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
@@ -1024,7 +1024,7 @@ void do_page_add_anon_rmap(struct page *page,
void page_add_new_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
- VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+ VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
SetPageSwapBacked(page);
atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
if (PageTransHuge(page))
@@ -1666,7 +1666,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
* structure at mapping cannot be freed and reused yet,
* so we can safely take mapping->i_mmap_mutex.
*/
- VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
if (!mapping)
return ret;
diff --git a/mm/shmem.c b/mm/shmem.c
index d4bc55d3f107..d547345eb2b5 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3075,7 +3075,9 @@ static const struct address_space_operations shmem_aops = {
.write_begin = shmem_write_begin,
.write_end = shmem_write_end,
#endif
+#ifdef CONFIG_MIGRATION
.migratepage = migrate_page,
+#endif
.error_remove_page = generic_error_remove_page,
};
diff --git a/mm/slab.c b/mm/slab.c
index a467b308c682..56116acedacf 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -785,8 +785,8 @@ static inline void *ac_get_obj(struct kmem_cache *cachep,
return objp;
}
-static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
- void *objp)
+static noinline void *__ac_put_obj(struct kmem_cache *cachep,
+ struct array_cache *ac, void *objp)
{
if (unlikely(pfmemalloc_active)) {
/* Some pfmemalloc slabs exist, check if this is one */
@@ -984,46 +984,50 @@ static void drain_alien_cache(struct kmem_cache *cachep,
}
}
-static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
+static int __cache_free_alien(struct kmem_cache *cachep, void *objp,
+ int node, int page_node)
{
- int nodeid = page_to_nid(virt_to_page(objp));
struct kmem_cache_node *n;
struct alien_cache *alien = NULL;
struct array_cache *ac;
- int node;
LIST_HEAD(list);
- node = numa_mem_id();
-
- /*
- * Make sure we are not freeing an object from another node to the array
- * cache on this cpu.
- */
- if (likely(nodeid == node))
- return 0;
-
n = get_node(cachep, node);
STATS_INC_NODEFREES(cachep);
- if (n->alien && n->alien[nodeid]) {
- alien = n->alien[nodeid];
+ if (n->alien && n->alien[page_node]) {
+ alien = n->alien[page_node];
ac = &alien->ac;
spin_lock(&alien->lock);
if (unlikely(ac->avail == ac->limit)) {
STATS_INC_ACOVERFLOW(cachep);
- __drain_alien_cache(cachep, ac, nodeid, &list);
+ __drain_alien_cache(cachep, ac, page_node, &list);
}
ac_put_obj(cachep, ac, objp);
spin_unlock(&alien->lock);
slabs_destroy(cachep, &list);
} else {
- n = get_node(cachep, nodeid);
+ n = get_node(cachep, page_node);
spin_lock(&n->list_lock);
- free_block(cachep, &objp, 1, nodeid, &list);
+ free_block(cachep, &objp, 1, page_node, &list);
spin_unlock(&n->list_lock);
slabs_destroy(cachep, &list);
}
return 1;
}
+
+static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
+{
+ int page_node = page_to_nid(virt_to_page(objp));
+ int node = numa_mem_id();
+ /*
+ * Make sure we are not freeing an object from another node to the array
+ * cache on this cpu.
+ */
+ if (likely(node == page_node))
+ return 0;
+
+ return __cache_free_alien(cachep, objp, node, page_node);
+}
#endif
/*
@@ -3406,7 +3410,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
return;
- if (likely(ac->avail < ac->limit)) {
+ if (ac->avail < ac->limit) {
STATS_INC_FREEHIT(cachep);
} else {
STATS_INC_FREEMISS(cachep);
@@ -3503,7 +3507,6 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
return kmem_cache_alloc_node_trace(cachep, flags, node, size);
}
-#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
void *__kmalloc_node(size_t size, gfp_t flags, int node)
{
return __do_kmalloc_node(size, flags, node, _RET_IP_);
@@ -3516,13 +3519,6 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
return __do_kmalloc_node(size, flags, node, caller);
}
EXPORT_SYMBOL(__kmalloc_node_track_caller);
-#else
-void *__kmalloc_node(size_t size, gfp_t flags, int node)
-{
- return __do_kmalloc_node(size, flags, node, 0);
-}
-EXPORT_SYMBOL(__kmalloc_node);
-#endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */
#endif /* CONFIG_NUMA */
/**
@@ -3548,8 +3544,6 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
return ret;
}
-
-#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
void *__kmalloc(size_t size, gfp_t flags)
{
return __do_kmalloc(size, flags, _RET_IP_);
@@ -3562,14 +3556,6 @@ void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
}
EXPORT_SYMBOL(__kmalloc_track_caller);
-#else
-void *__kmalloc(size_t size, gfp_t flags)
-{
- return __do_kmalloc(size, flags, 0);
-}
-EXPORT_SYMBOL(__kmalloc);
-#endif
-
/**
* kmem_cache_free - Deallocate an object
* @cachep: The cache the allocation was from.
@@ -4262,19 +4248,15 @@ static const struct seq_operations slabstats_op = {
static int slabstats_open(struct inode *inode, struct file *file)
{
- unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL);
- int ret = -ENOMEM;
- if (n) {
- ret = seq_open(file, &slabstats_op);
- if (!ret) {
- struct seq_file *m = file->private_data;
- *n = PAGE_SIZE / (2 * sizeof(unsigned long));
- m->private = n;
- n = NULL;
- }
- kfree(n);
- }
- return ret;
+ unsigned long *n;
+
+ n = __seq_open_private(file, &slabstats_op, PAGE_SIZE);
+ if (!n)
+ return -ENOMEM;
+
+ *n = PAGE_SIZE / (2 * sizeof(unsigned long));
+
+ return 0;
}
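This and the vmalloc_open() cleanup below both lean on the seq_file helpers that allocate the private buffer and open the file in one call; their assumed behavior, for reference:

/*
 * Sketch of the helper's contract (see fs/seq_file.c for the real thing):
 * allocate a zeroed buffer of psize bytes, seq_open the file, store the
 * buffer in ((struct seq_file *)file->private_data)->private, and return
 * the buffer, or NULL on failure. seq_open_private() is the int-returning
 * variant wrapped around it.
 */
void *__seq_open_private(struct file *file, const struct seq_operations *ops,
			 int psize);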
static const struct file_operations proc_slabstats_operations = {
diff --git a/mm/slab.h b/mm/slab.h
index 0e0fdd365840..026e7c393f0b 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -4,6 +4,41 @@
* Internal slab definitions
*/
+#ifdef CONFIG_SLOB
+/*
+ * Common fields provided in kmem_cache by all slab allocators
+ * This struct is either used directly by the allocator (SLOB)
+ * or the allocator must include definitions for all fields
+ * provided in kmem_cache_common in their definition of kmem_cache.
+ *
+ * Once we can do anonymous structs (C11 standard) we could put an
+ * anonymous struct definition in these allocators so that the
+ * separate allocations in the kmem_cache structure of SLAB and
+ * SLUB are no longer needed.
+ */
+struct kmem_cache {
+ unsigned int object_size;/* The original size of the object */
+ unsigned int size; /* The aligned/padded/added on size */
+ unsigned int align; /* Alignment as calculated */
+ unsigned long flags; /* Active flags on the slab */
+ const char *name; /* Slab name for sysfs */
+ int refcount; /* Use counter */
+ void (*ctor)(void *); /* Called on object slot creation */
+ struct list_head list; /* List of all slab caches on the system */
+};
+
+#endif /* CONFIG_SLOB */
+
+#ifdef CONFIG_SLAB
+#include <linux/slab_def.h>
+#endif
+
+#ifdef CONFIG_SLUB
+#include <linux/slub_def.h>
+#endif
+
+#include <linux/memcontrol.h>
+
/*
* State of the slab allocator.
*
diff --git a/mm/slab_common.c b/mm/slab_common.c
index d319502b2403..d7d8ffd0c306 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -30,6 +30,15 @@ LIST_HEAD(slab_caches);
DEFINE_MUTEX(slab_mutex);
struct kmem_cache *kmem_cache;
+/*
+ * Determine the size of a slab object
+ */
+unsigned int kmem_cache_size(struct kmem_cache *s)
+{
+ return s->object_size;
+}
+EXPORT_SYMBOL(kmem_cache_size);
+
#ifdef CONFIG_DEBUG_VM
static int kmem_cache_sanity_check(const char *name, size_t size)
{
@@ -211,8 +220,10 @@ kmem_cache_create(const char *name, size_t size, size_t align,
mutex_lock(&slab_mutex);
err = kmem_cache_sanity_check(name, size);
- if (err)
+ if (err) {
+ s = NULL; /* suppress uninit var warning */
goto out_unlock;
+ }
/*
* Some allocators will constraint the set of valid flags to a subset
diff --git a/mm/slob.c b/mm/slob.c
index 21980e0f39a8..96a86206a26b 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -468,7 +468,6 @@ void *__kmalloc(size_t size, gfp_t gfp)
}
EXPORT_SYMBOL(__kmalloc);
-#ifdef CONFIG_TRACING
void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller)
{
return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller);
@@ -481,7 +480,6 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfp,
return __do_kmalloc_node(size, gfp, node, caller);
}
#endif
-#endif
void kfree(const void *block)
{
diff --git a/mm/slub.c b/mm/slub.c
index 3e8afcc07a76..fa86e5845093 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4604,6 +4604,14 @@ static ssize_t trace_show(struct kmem_cache *s, char *buf)
static ssize_t trace_store(struct kmem_cache *s, const char *buf,
size_t length)
{
+ /*
+ * Tracing a merged cache is going to give confusing results
+ * as well as cause other issues like converting a mergeable
+ * cache into an unmergeable one.
+ */
+ if (s->refcount > 1)
+ return -EINVAL;
+
s->flags &= ~SLAB_TRACE;
if (buf[0] == '1') {
s->flags &= ~__CMPXCHG_DOUBLE;
@@ -4721,6 +4729,9 @@ static ssize_t failslab_show(struct kmem_cache *s, char *buf)
static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
size_t length)
{
+ if (s->refcount > 1)
+ return -EINVAL;
+
s->flags &= ~SLAB_FAILSLAB;
if (buf[0] == '1')
s->flags |= SLAB_FAILSLAB;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 3e0ec83d000c..ef1f39139b71 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -28,7 +28,9 @@
static const struct address_space_operations swap_aops = {
.writepage = swap_writepage,
.set_page_dirty = swap_set_page_dirty,
+#ifdef CONFIG_MIGRATION
.migratepage = migrate_page,
+#endif
};
static struct backing_dev_info swap_backing_dev_info = {
diff --git a/mm/util.c b/mm/util.c
index 093c973f1697..d25558ba661c 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -3,6 +3,7 @@
#include <linux/string.h>
#include <linux/compiler.h>
#include <linux/export.h>
+#include <linux/ctype.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/security.h>
@@ -62,6 +63,35 @@ char *kstrndup(const char *s, size_t max, gfp_t gfp)
EXPORT_SYMBOL(kstrndup);
/**
+ * kstrimdup - Trim and copy a %NUL terminated string.
+ * @s: the string to trim and duplicate
+ * @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * Returns an address, which the caller must kfree, containing
+ * a duplicate of the passed string with leading and/or trailing
+ * whitespace (as defined by isspace) removed.
+ */
+char *kstrimdup(const char *s, gfp_t gfp)
+{
+ char *buf;
+ char *begin = skip_spaces(s);
+ size_t len = strlen(begin);
+
+ while (len && isspace(begin[len - 1]))
+ len--;
+
+ buf = kmalloc_track_caller(len + 1, gfp);
+ if (!buf)
+ return NULL;
+
+ memcpy(buf, begin, len);
+ buf[len] = '\0';
+
+ return buf;
+}
+EXPORT_SYMBOL(kstrimdup);
+
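A quick usage sketch (the input string is illustrative):

	char *cmd = kstrimdup("  mode=strict\t\n", GFP_KERNEL);

	if (cmd) {
		/* cmd is now "mode=strict" with no surrounding whitespace */
		kfree(cmd);
	}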
+/**
* kmemdup - duplicate region of memory
*
* @src: memory region to duplicate
@@ -170,32 +200,25 @@ static int vm_is_stack_for_task(struct task_struct *t,
/*
* Check if the vma is being used as a stack.
* If is_group is non-zero, check in the entire thread group or else
- * just check in the current task. Returns the pid of the task that
- * the vma is stack for.
+ * just check in the current task. Returns the task_struct of the task
+ * that the vma is stack for. Must be called under rcu_read_lock().
*/
-pid_t vm_is_stack(struct task_struct *task,
- struct vm_area_struct *vma, int in_group)
+struct task_struct *task_of_stack(struct task_struct *task,
+ struct vm_area_struct *vma, bool in_group)
{
- pid_t ret = 0;
-
if (vm_is_stack_for_task(task, vma))
- return task->pid;
+ return task;
if (in_group) {
struct task_struct *t;
- rcu_read_lock();
for_each_thread(task, t) {
- if (vm_is_stack_for_task(t, vma)) {
- ret = t->pid;
- goto done;
- }
+ if (vm_is_stack_for_task(t, vma))
+ return t;
}
-done:
- rcu_read_unlock();
}
- return ret;
+ return NULL;
}
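Since the function now returns a task_struct protected only by RCU, a caller that still wants the old pid behavior would look roughly like this (sketch):

	struct task_struct *t;
	pid_t pid = 0;

	rcu_read_lock();
	t = task_of_stack(task, vma, true);
	if (t)
		pid = t->pid;	/* t is stable only inside the RCU section */
	rcu_read_unlock();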
#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 2b0aa5486092..90520af7f186 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2646,21 +2646,11 @@ static const struct seq_operations vmalloc_op = {
static int vmalloc_open(struct inode *inode, struct file *file)
{
- unsigned int *ptr = NULL;
- int ret;
-
- if (IS_ENABLED(CONFIG_NUMA)) {
- ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
- if (ptr == NULL)
- return -ENOMEM;
- }
- ret = seq_open(file, &vmalloc_op);
- if (!ret) {
- struct seq_file *m = file->private_data;
- m->private = ptr;
- } else
- kfree(ptr);
- return ret;
+ if (IS_ENABLED(CONFIG_NUMA))
+ return seq_open_private(file, &vmalloc_op,
+ nr_node_ids * sizeof(unsigned int));
+ else
+ return seq_open(file, &vmalloc_op);
}
static const struct file_operations proc_vmalloc_operations = {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2836b5373b2e..b672e2c6becc 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -920,7 +920,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
/* Case 1 above */
if (current_is_kswapd() &&
PageReclaim(page) &&
- zone_is_reclaim_writeback(zone)) {
+ test_bit(ZONE_WRITEBACK, &zone->flags)) {
nr_immediate++;
goto keep_locked;
@@ -1002,7 +1002,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
*/
if (page_is_file_cache(page) &&
(!current_is_kswapd() ||
- !zone_is_reclaim_dirty(zone))) {
+ !test_bit(ZONE_DIRTY, &zone->flags))) {
/*
* Immediately reclaim when written back.
* Similar in principal to deactivate_page()
@@ -1160,7 +1160,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
list_for_each_entry_safe(page, next, page_list, lru) {
if (page_is_file_cache(page) && !PageDirty(page) &&
- !isolated_balloon_page(page)) {
+ !PageBalloon(page)) {
ClearPageActive(page);
list_move(&page->lru, &clean_pages);
}
@@ -1563,7 +1563,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
* are encountered in the nr_immediate check below.
*/
if (nr_writeback && nr_writeback == nr_taken)
- zone_set_flag(zone, ZONE_WRITEBACK);
+ set_bit(ZONE_WRITEBACK, &zone->flags);
/*
* memcg will stall in page writeback so only consider forcibly
@@ -1575,16 +1575,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
* backed by a congested BDI and wait_iff_congested will stall.
*/
if (nr_dirty && nr_dirty == nr_congested)
- zone_set_flag(zone, ZONE_CONGESTED);
+ set_bit(ZONE_CONGESTED, &zone->flags);
/*
* If dirty pages are scanned that are not queued for IO, it
* implies that flushers are not keeping up. In this case, flag
- * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing
- * pages from reclaim context.
+ * the zone ZONE_DIRTY and kswapd will start writing pages from
+ * reclaim context.
*/
if (nr_unqueued_dirty == nr_taken)
- zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
+ set_bit(ZONE_DIRTY, &zone->flags);
/*
 * If kswapd scans pages marked for immediate
@@ -2315,7 +2315,10 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc)
return reclaimable;
}
-/* Returns true if compaction should go ahead for a high-order request */
+/*
+ * Returns true if compaction should go ahead for a high-order request, or
+ * if the high-order allocation would already succeed without compaction.
+ */
static inline bool compaction_ready(struct zone *zone, int order)
{
unsigned long balance_gap, watermark;
@@ -2339,8 +2342,11 @@ static inline bool compaction_ready(struct zone *zone, int order)
if (compaction_deferred(zone, order))
return watermark_ok;
- /* If compaction is not ready to start, keep reclaiming */
- if (!compaction_suitable(zone, order))
+ /*
+ * If compaction is not ready to start and allocation is not likely
+ * to succeed without it, then keep reclaiming.
+ */
+ if (compaction_suitable(zone, order) == COMPACT_SKIPPED)
return false;
return watermark_ok;
@@ -2818,7 +2824,7 @@ static bool zone_balanced(struct zone *zone, int order,
return false;
if (IS_ENABLED(CONFIG_COMPACTION) && order &&
- !compaction_suitable(zone, order))
+ compaction_suitable(zone, order) == COMPACT_SKIPPED)
return false;
return true;
@@ -2978,7 +2984,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
/* Account for the number of pages attempted to reclaim */
*nr_attempted += sc->nr_to_reclaim;
- zone_clear_flag(zone, ZONE_WRITEBACK);
+ clear_bit(ZONE_WRITEBACK, &zone->flags);
/*
* If a zone reaches its high watermark, consider it to be no longer
@@ -2988,8 +2994,8 @@ static bool kswapd_shrink_zone(struct zone *zone,
*/
if (zone_reclaimable(zone) &&
zone_balanced(zone, testorder, 0, classzone_idx)) {
- zone_clear_flag(zone, ZONE_CONGESTED);
- zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
+ clear_bit(ZONE_CONGESTED, &zone->flags);
+ clear_bit(ZONE_DIRTY, &zone->flags);
}
return sc->nr_scanned >= sc->nr_to_reclaim;
@@ -3080,8 +3086,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
* If balanced, clear the dirty and congested
* flags
*/
- zone_clear_flag(zone, ZONE_CONGESTED);
- zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
+ clear_bit(ZONE_CONGESTED, &zone->flags);
+ clear_bit(ZONE_DIRTY, &zone->flags);
}
}
@@ -3708,11 +3714,11 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
if (node_state(node_id, N_CPU) && node_id != numa_node_id())
return ZONE_RECLAIM_NOSCAN;
- if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
+ if (test_and_set_bit(ZONE_RECLAIM_LOCKED, &zone->flags))
return ZONE_RECLAIM_NOSCAN;
ret = __zone_reclaim(zone, gfp_mask, order);
- zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
+ clear_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
if (!ret)
count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
@@ -3791,66 +3797,3 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
}
}
#endif /* CONFIG_SHMEM */
-
-static void warn_scan_unevictable_pages(void)
-{
- printk_once(KERN_WARNING
- "%s: The scan_unevictable_pages sysctl/node-interface has been "
- "disabled for lack of a legitimate use case. If you have "
- "one, please send an email to linux-mm@kvack.org.\n",
- current->comm);
-}
-
-/*
- * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of
- * all nodes' unevictable lists for evictable pages
- */
-unsigned long scan_unevictable_pages;
-
-int scan_unevictable_handler(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *length, loff_t *ppos)
-{
- warn_scan_unevictable_pages();
- proc_doulongvec_minmax(table, write, buffer, length, ppos);
- scan_unevictable_pages = 0;
- return 0;
-}
-
-#ifdef CONFIG_NUMA
-/*
- * per node 'scan_unevictable_pages' attribute. On demand re-scan of
- * a specified node's per zone unevictable lists for evictable pages.
- */
-
-static ssize_t read_scan_unevictable_node(struct device *dev,
- struct device_attribute *attr,
- char *buf)
-{
- warn_scan_unevictable_pages();
- return sprintf(buf, "0\n"); /* always zero; should fit... */
-}
-
-static ssize_t write_scan_unevictable_node(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
-{
- warn_scan_unevictable_pages();
- return 1;
-}
-
-
-static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
- read_scan_unevictable_node,
- write_scan_unevictable_node);
-
-int scan_unevictable_register_node(struct node *node)
-{
- return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages);
-}
-
-void scan_unevictable_unregister_node(struct node *node)
-{
- device_remove_file(&node->dev, &dev_attr_scan_unevictable_pages);
-}
-#endif
diff --git a/mm/vmstat.c b/mm/vmstat.c
index e9ab104b956f..ff7bd291dee3 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -7,6 +7,7 @@
* zoned VM statistics
* Copyright (C) 2006 Silicon Graphics, Inc.,
* Christoph Lameter <christoph@lameter.com>
+ * Copyright (C) 2008-2014 Christoph Lameter
*/
#include <linux/fs.h>
#include <linux/mm.h>
@@ -14,6 +15,7 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/cpu.h>
+#include <linux/cpumask.h>
#include <linux/vmstat.h>
#include <linux/sched.h>
#include <linux/math64.h>
@@ -419,13 +421,22 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item)
EXPORT_SYMBOL(dec_zone_page_state);
#endif
-static inline void fold_diff(int *diff)
+
+/*
+ * Fold a differential into the global counters.
+ * Returns the number of counters updated.
+ */
+static int fold_diff(int *diff)
{
int i;
+ int changes = 0;
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
- if (diff[i])
+ if (diff[i]) {
atomic_long_add(diff[i], &vm_stat[i]);
+ changes++;
+ }
+ return changes;
}
/*
@@ -441,12 +452,15 @@ static inline void fold_diff(int *diff)
* statistics in the remote zone struct as well as the global cachelines
* with the global counters. These could cause remote node cache line
* bouncing and will have to be only done when necessary.
+ *
+ * The function returns the number of global counters updated.
*/
-static void refresh_cpu_vm_stats(void)
+static int refresh_cpu_vm_stats(void)
{
struct zone *zone;
int i;
int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+ int changes = 0;
for_each_populated_zone(zone) {
struct per_cpu_pageset __percpu *p = zone->pageset;
@@ -486,15 +500,17 @@ static void refresh_cpu_vm_stats(void)
continue;
}
-
if (__this_cpu_dec_return(p->expire))
continue;
- if (__this_cpu_read(p->pcp.count))
+ if (__this_cpu_read(p->pcp.count)) {
drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
+ changes++;
+ }
#endif
}
- fold_diff(global_diff);
+ changes += fold_diff(global_diff);
+ return changes;
}
/*
@@ -735,7 +751,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
TEXT_FOR_HIGHMEM(xx) xx "_movable",
const char * const vmstat_text[] = {
- /* Zoned VM counters */
+	/* enum zone_stat_item counters */
"nr_free_pages",
"nr_alloc_batch",
"nr_inactive_anon",
@@ -778,10 +794,16 @@ const char * const vmstat_text[] = {
"workingset_nodereclaim",
"nr_anon_transparent_hugepages",
"nr_free_cma",
+#ifdef CONFIG_MEMORY_BALLOON
+ "nr_balloon_pages",
+#endif
+
+ /* enum writeback_stat_item counters */
"nr_dirty_threshold",
"nr_dirty_background_threshold",
#ifdef CONFIG_VM_EVENT_COUNTERS
+ /* enum vm_event_item counters */
"pgpgin",
"pgpgout",
"pswpin",
@@ -1229,20 +1251,108 @@ static const struct file_operations proc_vmstat_file_operations = {
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
int sysctl_stat_interval __read_mostly = HZ;
+static cpumask_var_t cpu_stat_off;
static void vmstat_update(struct work_struct *w)
{
- refresh_cpu_vm_stats();
- schedule_delayed_work(this_cpu_ptr(&vmstat_work),
+ if (refresh_cpu_vm_stats())
+ /*
+ * Counters were updated so we expect more updates
+ * to occur in the future. Keep on running the
+ * update worker thread.
+ */
+ schedule_delayed_work(this_cpu_ptr(&vmstat_work),
+ round_jiffies_relative(sysctl_stat_interval));
+ else {
+ /*
+ * We did not update any counters so the app may be in
+ * a mode where it does not cause counter updates.
+ * We may be uselessly running vmstat_update.
+ * Defer the checking for differentials to the
+ * shepherd thread on a different processor.
+ */
+ int r;
+ /*
+ * The shepherd work thread does not race with us since
+ * it never changes the bit if it's zero, but the cpu
+ * online/offline code may race if worker threads are
+ * still allowed during shutdown/startup.
+ */
+ r = cpumask_test_and_set_cpu(smp_processor_id(),
+ cpu_stat_off);
+ VM_BUG_ON(r);
+ }
+}
+
+/*
+ * Check if the diffs for a certain cpu indicate that
+ * an update is needed.
+ */
+static bool need_update(int cpu)
+{
+ struct zone *zone;
+
+ for_each_populated_zone(zone) {
+ struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu);
+
+ BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1);
+ /*
+ * The fast way of checking if there are any vmstat diffs.
+ * This works because the diffs are byte sized items.
+ */
+ if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS))
+ return true;
+
+ }
+ return false;
+}
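memchr_inv() is what makes this a single pass; its contract, for reference:

/*
 * memchr_inv(start, c, bytes) returns the address of the first byte in
 * the region that is NOT equal to c, or NULL if every byte matches.
 * With c == 0 it is an efficient "is this buffer all zeroes?" test,
 * which is why it matters above that the diffs are byte-sized items.
 */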
+
+
+/*
+ * Shepherd worker thread that checks the differentials of processors
+ * that have had their vm statistics update worker threads disabled
+ * because of inactivity.
+ */
+static void vmstat_shepherd(struct work_struct *w);
+
+static DECLARE_DELAYED_WORK(shepherd, vmstat_shepherd);
+
+static void vmstat_shepherd(struct work_struct *w)
+{
+ int cpu;
+
+ get_online_cpus();
+ /* Check processors whose vmstat worker threads have been disabled */
+ for_each_cpu(cpu, cpu_stat_off)
+ if (need_update(cpu) &&
+ cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
+
+ schedule_delayed_work_on(cpu, &per_cpu(vmstat_work, cpu),
+ __round_jiffies_relative(sysctl_stat_interval, cpu));
+
+ put_online_cpus();
+
+ schedule_delayed_work(&shepherd,
round_jiffies_relative(sysctl_stat_interval));
+
}
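The resulting lifecycle, summarized (an informal sketch of the state machine the two workers implement):

/*
 * per-cpu vmstat_update:  folded some diffs -> rearm itself
 *                         folded nothing    -> park: set own bit in
 *                                              cpu_stat_off and stop
 * vmstat_shepherd:        for each parked cpu with pending diffs,
 *                         clear its bit and rearm its worker; then
 *                         rearm itself every sysctl_stat_interval.
 */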
-static void start_cpu_timer(int cpu)
+static void __init start_shepherd_timer(void)
{
- struct delayed_work *work = &per_cpu(vmstat_work, cpu);
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
+ vmstat_update);
+
+ if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL))
+ BUG();
+ cpumask_copy(cpu_stat_off, cpu_online_mask);
- INIT_DEFERRABLE_WORK(work, vmstat_update);
- schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
+ schedule_delayed_work(&shepherd,
+ round_jiffies_relative(sysctl_stat_interval));
}
static void vmstat_cpu_dead(int node)
@@ -1273,17 +1383,17 @@ static int vmstat_cpuup_callback(struct notifier_block *nfb,
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
refresh_zone_stat_thresholds();
- start_cpu_timer(cpu);
node_set_state(cpu_to_node(cpu), N_CPU);
+ cpumask_set_cpu(cpu, cpu_stat_off);
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
- per_cpu(vmstat_work, cpu).work.func = NULL;
+ cpumask_clear_cpu(cpu, cpu_stat_off);
break;
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
- start_cpu_timer(cpu);
+ cpumask_set_cpu(cpu, cpu_stat_off);
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
@@ -1303,15 +1413,10 @@ static struct notifier_block vmstat_notifier =
static int __init setup_vmstat(void)
{
#ifdef CONFIG_SMP
- int cpu;
-
cpu_notifier_register_begin();
__register_cpu_notifier(&vmstat_notifier);
- for_each_online_cpu(cpu) {
- start_cpu_timer(cpu);
- node_set_state(cpu_to_node(cpu), N_CPU);
- }
+ start_shepherd_timer();
cpu_notifier_register_done();
#endif
#ifdef CONFIG_PROC_FS
diff --git a/mm/zbud.c b/mm/zbud.c
index f26e7fcc7fa2..ecf1dbef6983 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -60,15 +60,17 @@
* NCHUNKS_ORDER determines the internal allocation granularity, effectively
* adjusting internal fragmentation. It also determines the number of
* freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
- * allocation granularity will be in chunks of size PAGE_SIZE/64, and there
- * will be 64 freelists per pool.
+ * allocation granularity will be in chunks of size PAGE_SIZE/64. As one chunk
+ * in each allocated page is occupied by the zbud header, NCHUNKS works out to
+ * 63, the maximum number of free chunks in a zbud page, and there will
+ * likewise be 63 freelists per pool.
*/
#define NCHUNKS_ORDER 6
#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER)
#define CHUNK_SIZE (1 << CHUNK_SHIFT)
-#define NCHUNKS (PAGE_SIZE >> CHUNK_SHIFT)
#define ZHDR_SIZE_ALIGNED CHUNK_SIZE
+#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
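With the usual 4 KiB page the arithmetic works out as follows:

/*
 * PAGE_SHIFT = 12, NCHUNKS_ORDER = 6:
 *   CHUNK_SHIFT       = 12 - 6      = 6
 *   CHUNK_SIZE        = 1 << 6      = 64 bytes
 *   ZHDR_SIZE_ALIGNED = CHUNK_SIZE  = 64 (header occupies one chunk)
 *   NCHUNKS           = (4096 - 64) >> 6 = 63 usable chunks per page
 */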
/**
* struct zbud_pool - stores metadata for each zbud pool
@@ -268,10 +270,9 @@ static int num_free_chunks(struct zbud_header *zhdr)
{
/*
* Rather than branch for different situations, just use the fact that
- * free buddies have a length of zero to simplify everything. -1 at the
- * end for the zbud header.
+ * free buddies have a length of zero to simplify everything.
*/
- return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks - 1;
+ return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks;
}
/*****************
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 94f38fac5e81..c4a91578dc96 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -199,9 +199,6 @@ struct size_class {
spinlock_t lock;
- /* stats */
- u64 pages_allocated;
-
struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
};
@@ -220,6 +217,7 @@ struct zs_pool {
struct size_class size_class[ZS_SIZE_CLASSES];
gfp_t flags; /* allocation flags used when growing pool */
+ atomic_long_t pages_allocated;
};
/*
@@ -299,7 +297,7 @@ static void zs_zpool_unmap(void *pool, unsigned long handle)
static u64 zs_zpool_total_size(void *pool)
{
- return zs_get_total_size_bytes(pool);
+ return zs_get_total_pages(pool) << PAGE_SHIFT;
}
static struct zpool_driver zs_zpool_driver = {
@@ -1028,8 +1026,9 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
return 0;
set_zspage_mapping(first_page, class->index, ZS_EMPTY);
+ atomic_long_add(class->pages_per_zspage,
+ &pool->pages_allocated);
spin_lock(&class->lock);
- class->pages_allocated += class->pages_per_zspage;
}
obj = (unsigned long)first_page->freelist;
@@ -1082,14 +1081,13 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
first_page->inuse--;
fullness = fix_fullness_group(pool, first_page);
-
- if (fullness == ZS_EMPTY)
- class->pages_allocated -= class->pages_per_zspage;
-
spin_unlock(&class->lock);
- if (fullness == ZS_EMPTY)
+ if (fullness == ZS_EMPTY) {
+ atomic_long_sub(class->pages_per_zspage,
+ &pool->pages_allocated);
free_zspage(first_page);
+ }
}
EXPORT_SYMBOL_GPL(zs_free);
@@ -1183,17 +1181,11 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
}
EXPORT_SYMBOL_GPL(zs_unmap_object);
-u64 zs_get_total_size_bytes(struct zs_pool *pool)
+unsigned long zs_get_total_pages(struct zs_pool *pool)
{
- int i;
- u64 npages = 0;
-
- for (i = 0; i < ZS_SIZE_CLASSES; i++)
- npages += pool->size_class[i].pages_allocated;
-
- return npages << PAGE_SHIFT;
+ return atomic_long_read(&pool->pages_allocated);
}
-EXPORT_SYMBOL_GPL(zs_get_total_size_bytes);
+EXPORT_SYMBOL_GPL(zs_get_total_pages);
module_init(zs_init);
module_exit(zs_exit);
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 4d08b398411f..0c520f7bf095 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -43,6 +43,7 @@ my $configuration_file = ".checkpatch.conf";
my $max_line_length = 80;
my $ignore_perl_version = 0;
my $minimum_perl_version = 5.10.0;
+my $min_conf_desc_length = 4;
sub help {
my ($exitcode) = @_;
@@ -63,6 +64,7 @@ Options:
--types TYPE(,TYPE2...) show only these comma separated message types
--ignore TYPE(,TYPE2...) ignore various comma separated message types
--max-line-length=n set the maximum line length, if exceeded, warn
+ --min-conf-desc-length=n set the min description length, if shorter, warn
--show-types show the message "types" in the output
--root=PATH PATH to the kernel tree root
--no-summary suppress the per-file summary
@@ -131,6 +133,7 @@ GetOptions(
'types=s' => \@use,
'show-types!' => \$show_types,
'max-line-length=i' => \$max_line_length,
+ 'min-conf-desc-length=i' => \$min_conf_desc_length,
'root=s' => \$root,
'summary!' => \$summary,
'mailback!' => \$mailback,
@@ -425,7 +428,9 @@ foreach my $entry (@mode_permission_funcs) {
our $allowed_asm_includes = qr{(?x:
irq|
- memory
+ memory|
+ time|
+ reboot
)};
# memory.h: ARM has a custom one
@@ -2283,8 +2288,10 @@ sub process {
}
$length++;
}
- WARN("CONFIG_DESCRIPTION",
- "please write a paragraph that describes the config symbol fully\n" . $herecurr) if ($is_start && $is_end && $length < 4);
+ if ($is_start && $is_end && $length < $min_conf_desc_length) {
+ WARN("CONFIG_DESCRIPTION",
+ "please write a paragraph that describes the config symbol fully\n" . $herecurr);
+ }
#print "is_start<$is_start> is_end<$is_end> length<$length>\n";
}
@@ -2341,7 +2348,7 @@ sub process {
}
# check we are in a valid source file if not then ignore this hunk
- next if ($realfile !~ /\.(h|c|s|S|pl|sh)$/);
+ next if ($realfile !~ /\.(h|c|s|S|pl|sh|dtsi|dts)$/);
#line length limit
if ($line =~ /^\+/ && $prevrawline !~ /\/\*\*/ &&
@@ -2402,7 +2409,7 @@ sub process {
}
# check we are in a valid source file C or perl if not then ignore this hunk
- next if ($realfile !~ /\.(h|c|pl)$/);
+ next if ($realfile !~ /\.(h|c|pl|dtsi|dts)$/);
# at the beginning of a line any tabs must come first and anything
# more than 8 must use tabs.
@@ -2424,7 +2431,7 @@ sub process {
"please, no space before tabs\n" . $herevet) &&
$fix) {
while ($fixed[$fixlinenr] =~
- s/(^\+.*) {8,8}+\t/$1\t\t/) {}
+ s/(^\+.*) {8,8}\t/$1\t\t/) {}
while ($fixed[$fixlinenr] =~
s/(^\+.*) +\t/$1\t/) {}
}
@@ -3752,7 +3759,6 @@ sub process {
if (ERROR("SPACING",
"space prohibited before that close parenthesis ')'\n" . $herecurr) &&
$fix) {
- print("fixlinenr: <$fixlinenr> fixed[fixlinenr]: <$fixed[$fixlinenr]>\n");
$fixed[$fixlinenr] =~
s/\s+\)/\)/;
}
@@ -4126,7 +4132,7 @@ sub process {
"Macros with multiple statements should be enclosed in a do - while loop\n" . "$herectx");
} else {
ERROR("COMPLEX_MACRO",
- "Macros with complex values should be enclosed in parenthesis\n" . "$herectx");
+ "Macros with complex values should be enclosed in parentheses\n" . "$herectx");
}
}
@@ -4338,6 +4344,12 @@ sub process {
"Use of volatile is usually wrong: see Documentation/volatile-considered-harmful.txt\n" . $herecurr);
}
+# concatenated string without spaces between elements
+ if ($line =~ /"X+"[A-Z_]+/ || $line =~ /[A-Z_]+"X+"/) {
+ CHK("CONCATENATED_STRING",
+ "Concatenated strings should use spaces between elements\n" . $herecurr);
+ }
+
# warn about #if 0
if ($line =~ /^.\s*\#\s*if\s+0\b/) {
CHK("REDUNDANT_CODE",
diff --git a/scripts/headers_install.sh b/scripts/headers_install.sh
index 5de5660cb708..fdebd66f8fc1 100755
--- a/scripts/headers_install.sh
+++ b/scripts/headers_install.sh
@@ -1,8 +1,8 @@
#!/bin/sh
-if [ $# -lt 1 ]
+if [ $# -lt 2 ]
then
- echo "Usage: headers_install.sh OUTDIR SRCDIR [FILES...]
+ echo "Usage: headers_install.sh OUTDIR SRCDIR [FILES...]"
echo
echo "Prepares kernel header files for use by user space, by removing"
echo "all compiler.h definitions and #includes, removing any"
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index 3f94e1afd6cf..4c4b1f631ecf 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -3,6 +3,7 @@
CC = $(CROSS_COMPILE)gcc
CFLAGS = -Wall
BINARIES = hugepage-mmap hugepage-shm map_hugetlb thuge-gen hugetlbfstest
+BINARIES += transhuge-stress
all: $(BINARIES)
%: %.c
diff --git a/tools/testing/selftests/vm/transhuge-stress.c b/tools/testing/selftests/vm/transhuge-stress.c
new file mode 100644
index 000000000000..fd7f1b4a96f9
--- /dev/null
+++ b/tools/testing/selftests/vm/transhuge-stress.c
@@ -0,0 +1,144 @@
+/*
+ * Stress test for transparent huge pages, memory compaction and migration.
+ *
+ * Authors: Konstantin Khlebnikov <koct9i@gmail.com>
+ *
+ * This is free and unencumbered software released into the public domain.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <err.h>
+#include <time.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/mman.h>
+
+#define PAGE_SHIFT 12
+#define HPAGE_SHIFT 21
+
+#define PAGE_SIZE (1 << PAGE_SHIFT)
+#define HPAGE_SIZE (1 << HPAGE_SHIFT)
+
+#define PAGEMAP_PRESENT(ent) (((ent) & (1ull << 63)) != 0)
+#define PAGEMAP_PFN(ent) ((ent) & ((1ull << 55) - 1))
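+/*
+ * Per Documentation/vm/pagemap.txt, bit 63 of a pagemap entry is the
+ * page-present flag and bits 0-54 hold the PFN, hence the two masks above.
+ */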
+
+int pagemap_fd;
+
+int64_t allocate_transhuge(void *ptr)
+{
+ uint64_t ent[2];
+
+ /* drop pmd */
+ if (mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE,
+ MAP_FIXED | MAP_ANONYMOUS |
+ MAP_NORESERVE | MAP_PRIVATE, -1, 0) != ptr)
+ errx(2, "mmap transhuge");
+
+ if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE))
+ err(2, "MADV_HUGEPAGE");
+
+ /* allocate transparent huge page */
+ *(volatile void **)ptr = ptr;
+
+ if (pread(pagemap_fd, ent, sizeof(ent),
+ (uintptr_t)ptr >> (PAGE_SHIFT - 3)) != sizeof(ent))
+ err(2, "read pagemap");
+
+ if (PAGEMAP_PRESENT(ent[0]) && PAGEMAP_PRESENT(ent[1]) &&
+ PAGEMAP_PFN(ent[0]) + 1 == PAGEMAP_PFN(ent[1]) &&
+ !(PAGEMAP_PFN(ent[0]) & ((1 << (HPAGE_SHIFT - PAGE_SHIFT)) - 1)))
+ return PAGEMAP_PFN(ent[0]);
+
+ return -1;
+}
+
+int main(int argc, char **argv)
+{
+ size_t ram, len;
+ void *ptr, *p;
+ struct timespec a, b;
+ double s;
+ uint8_t *map;
+ size_t map_len;
+
+ ram = sysconf(_SC_PHYS_PAGES);
+ if (ram > SIZE_MAX / sysconf(_SC_PAGESIZE) / 4)
+ ram = SIZE_MAX / 4;
+ else
+ ram *= sysconf(_SC_PAGESIZE);
+
+ if (argc == 1)
+ len = ram;
+ else if (!strcmp(argv[1], "-h"))
+ errx(1, "usage: %s [size in MiB]", argv[0]);
+ else
+ len = atoll(argv[1]) << 20;
+
+ warnx("allocate %zd transhuge pages, using %zd MiB virtual memory"
+ " and %zd MiB of ram", len >> HPAGE_SHIFT, len >> 20,
+ len >> (20 + HPAGE_SHIFT - PAGE_SHIFT - 1));
+
+ pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
+ if (pagemap_fd < 0)
+ err(2, "open pagemap");
+
+ len -= len % HPAGE_SIZE;
+ ptr = mmap(NULL, len + HPAGE_SIZE, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0);
+ if (ptr == MAP_FAILED)
+ err(2, "initial mmap");
+ ptr += HPAGE_SIZE - (uintptr_t)ptr % HPAGE_SIZE;
+
+ if (madvise(ptr, len, MADV_HUGEPAGE))
+ err(2, "MADV_HUGEPAGE");
+
+ map_len = ram >> (HPAGE_SHIFT - 1);
+ map = malloc(map_len);
+ if (!map)
+ errx(2, "map malloc");
+
+ while (1) {
+ int nr_succeed = 0, nr_failed = 0, nr_pages = 0;
+
+ memset(map, 0, map_len);
+
+ clock_gettime(CLOCK_MONOTONIC, &a);
+ for (p = ptr; p < ptr + len; p += HPAGE_SIZE) {
+ int64_t pfn;
+
+ pfn = allocate_transhuge(p);
+
+ if (pfn < 0) {
+ nr_failed++;
+ } else {
+ size_t idx = pfn >> (HPAGE_SHIFT - PAGE_SHIFT);
+
+ nr_succeed++;
+ if (idx >= map_len) {
+ map = realloc(map, idx + 1);
+ if (!map)
+ errx(2, "map realloc");
+ memset(map + map_len, 0, idx + 1 - map_len);
+ map_len = idx + 1;
+ }
+ if (!map[idx])
+ nr_pages++;
+ map[idx] = 1;
+ }
+
+ /* split transhuge page, keep last page */
+ if (madvise(p, HPAGE_SIZE - PAGE_SIZE, MADV_DONTNEED))
+ err(2, "MADV_DONTNEED");
+ }
+ clock_gettime(CLOCK_MONOTONIC, &b);
+ s = b.tv_sec - a.tv_sec + (b.tv_nsec - a.tv_nsec) / 1000000000.;
+
+ warnx("%.3f s/loop, %.3f ms/page, %10.3f MiB/s\t"
+ "%4d succeed, %4d failed, %4d different pages",
+ s, s * 1000 / (len >> HPAGE_SHIFT), len / s / (1 << 20),
+ nr_succeed, nr_failed, nr_pages);
+ }
+}
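The pread() offset in allocate_transhuge() exploits the pagemap layout:
each virtual page has one 8-byte entry, so an address's entry lives at
(vaddr / PAGE_SIZE) * 8, i.e. vaddr >> (PAGE_SHIFT - 3). A standalone
sketch of the same lookup for a single address (hypothetical helper,
assuming 4 KiB pages and with error handling trimmed):

#include <stdint.h>
#include <unistd.h>

static uint64_t pagemap_entry(int fd, void *addr)
{
	uint64_t ent = 0;
	/* one 8-byte entry per page: offset = vaddr >> (12 - 3) */
	off_t off = (off_t)((uintptr_t)addr >> (12 - 3));

	if (pread(fd, &ent, sizeof(ent), off) != sizeof(ent))
		return 0;	/* treat a failed read as "not present" */
	return ent;
}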
diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c
index c4d6d2e20e0d..264fbc297e0b 100644
--- a/tools/vm/page-types.c
+++ b/tools/vm/page-types.c
@@ -132,6 +132,7 @@ static const char * const page_flag_names[] = {
[KPF_NOPAGE] = "n:nopage",
[KPF_KSM] = "x:ksm",
[KPF_THP] = "t:thp",
+ [KPF_BALLOON] = "o:balloon",
[KPF_RESERVED] = "r:reserved",
[KPF_MLOCKED] = "m:mlocked",