summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2013-05-03 13:46:06 +1000
committerStephen Rothwell <sfr@canb.auug.org.au>2013-05-03 13:46:06 +1000
commit6f127d4fcee90eefdabfc98c37e6760672f05e26 (patch)
tree8767332c5614134d34aea07a8f99fdb101a44ee1
parent2aee6a1f2a88f732bca9ce4d926a5cc688e67b75 (diff)
parent5e032f67ae9d946cbbe5f1087fca0b652b2e3892 (diff)
Merge branch 'akpm/master'
-rw-r--r--Documentation/cgroups/memory.txt16
-rw-r--r--Documentation/devicetree/bindings/video/simple-framebuffer.txt25
-rw-r--r--Documentation/filesystems/proc.txt9
-rw-r--r--Documentation/rapidio/rapidio.txt137
-rw-r--r--Documentation/rapidio/sysfs.txt17
-rw-r--r--Documentation/sysctl/vm.txt33
-rw-r--r--Documentation/vm/pagemap.txt5
-rw-r--r--Documentation/vm/soft-dirty.txt36
-rw-r--r--arch/Kconfig17
-rw-r--r--arch/arm/Kconfig2
-rw-r--r--arch/arm/boot/compressed/.gitignore1
-rw-r--r--arch/arm/boot/compressed/Makefile6
-rw-r--r--arch/arm/boot/compressed/decompress.c4
-rw-r--r--arch/arm/boot/compressed/piggy.lz4.S6
-rw-r--r--arch/arm/mm/mmap.c2
-rw-r--r--arch/arm/net/bpf_jit_32.c91
-rw-r--r--arch/arm64/mm/mmap.c2
-rw-r--r--arch/mips/mm/mmap.c2
-rw-r--r--arch/powerpc/mm/mmap_64.c2
-rw-r--r--arch/powerpc/net/bpf_jit_comp.c4
-rw-r--r--arch/s390/hypfs/inode.c1
-rw-r--r--arch/s390/mm/mmap.c4
-rw-r--r--arch/s390/net/bpf_jit_comp.c4
-rw-r--r--arch/sparc/kernel/leon_smp.c15
-rw-r--r--arch/sparc/kernel/sys_sparc_64.c2
-rw-r--r--arch/sparc/mm/init_32.c37
-rw-r--r--arch/sparc/mm/init_64.c28
-rw-r--r--arch/sparc/net/bpf_jit_comp.c4
-rw-r--r--arch/tile/mm/mmap.c2
-rw-r--r--arch/x86/Kconfig2
-rw-r--r--arch/x86/boot/compressed/Makefile6
-rw-r--r--arch/x86/boot/compressed/misc.c4
-rw-r--r--arch/x86/ia32/ia32_aout.c2
-rw-r--r--arch/x86/include/asm/e820.h2
-rw-r--r--arch/x86/include/asm/pgtable.h24
-rw-r--r--arch/x86/include/asm/pgtable_types.h12
-rw-r--r--arch/x86/kernel/e820.c72
-rw-r--r--arch/x86/mm/mmap.c2
-rw-r--r--arch/x86/net/bpf_jit_comp.c4
-rw-r--r--arch/x86/platform/efi/efi.c15
-rw-r--r--block/blk-core.c35
-rw-r--r--block/blk-flush.c5
-rw-r--r--block/blk-lib.c3
-rw-r--r--block/blk.h3
-rw-r--r--block/genhd.c2
-rw-r--r--block/scsi_ioctl.c1
-rw-r--r--crypto/Kconfig16
-rw-r--r--crypto/Makefile2
-rw-r--r--crypto/lz4.c106
-rw-r--r--crypto/lz4hc.c106
-rw-r--r--drivers/block/blockconsole.c5
-rw-r--r--drivers/block/drbd/drbd_bitmap.c3
-rw-r--r--drivers/block/drbd/drbd_worker.c9
-rw-r--r--drivers/block/drbd/drbd_wrappers.h9
-rw-r--r--drivers/block/floppy.c3
-rw-r--r--drivers/block/mtip32xx/mtip32xx.c86
-rw-r--r--drivers/block/mtip32xx/mtip32xx.h8
-rw-r--r--drivers/block/pktcdvd.c9
-rw-r--r--drivers/block/swim3.c2
-rw-r--r--drivers/block/virtio_blk.c31
-rw-r--r--drivers/block/xen-blkback/blkback.c3
-rw-r--r--drivers/char/mem.c36
-rw-r--r--drivers/char/mwave/tp3780i.c1
-rw-r--r--drivers/char/random.c54
-rw-r--r--drivers/gpu/drm/drm_fb_helper.c8
-rw-r--r--drivers/infiniband/hw/ipath/ipath_file_ops.c1
-rw-r--r--drivers/infiniband/hw/qib/qib_file_ops.c2
-rw-r--r--drivers/leds/leds-ot200.c14
-rw-r--r--drivers/md/bcache/alloc.c3
-rw-r--r--drivers/md/bcache/btree.c3
-rw-r--r--drivers/md/bcache/debug.c3
-rw-r--r--drivers/md/bcache/io.c6
-rw-r--r--drivers/md/bcache/journal.c9
-rw-r--r--drivers/md/bcache/movinggc.c3
-rw-r--r--drivers/md/bcache/request.c9
-rw-r--r--drivers/md/bcache/request.h3
-rw-r--r--drivers/md/bcache/super.c11
-rw-r--r--drivers/md/bcache/writeback.c8
-rw-r--r--drivers/md/dm-bufio.c9
-rw-r--r--drivers/md/dm-cache-target.c3
-rw-r--r--drivers/md/dm-crypt.c3
-rw-r--r--drivers/md/dm-io.c2
-rw-r--r--drivers/md/dm-snap.c3
-rw-r--r--drivers/md/dm-thin.c3
-rw-r--r--drivers/md/dm-verity.c3
-rw-r--r--drivers/md/dm.c8
-rw-r--r--drivers/md/faulty.c3
-rw-r--r--drivers/md/md.c9
-rw-r--r--drivers/md/multipath.c3
-rw-r--r--drivers/md/raid1.c12
-rw-r--r--drivers/md/raid10.c18
-rw-r--r--drivers/md/raid5.c15
-rw-r--r--drivers/net/ethernet/broadcom/cnic.c4
-rw-r--r--drivers/net/hamradio/baycom_epp.c2
-rw-r--r--drivers/net/hamradio/hdlcdrv.c2
-rw-r--r--drivers/net/hamradio/yam.c2
-rw-r--r--drivers/net/team/team_mode_random.c2
-rw-r--r--drivers/net/wireless/brcm80211/brcmfmac/p2p.c2
-rw-r--r--drivers/net/wireless/mwifiex/cfg80211.c4
-rw-r--r--drivers/rapidio/Kconfig28
-rw-r--r--drivers/rapidio/Makefile3
-rw-r--r--drivers/rapidio/rio-driver.c8
-rw-r--r--drivers/rapidio/rio-scan.c195
-rw-r--r--drivers/rapidio/rio-sysfs.c45
-rw-r--r--drivers/rapidio/rio.c234
-rw-r--r--drivers/rapidio/rio.h12
-rw-r--r--drivers/rtc/rtc-pxa.c99
-rw-r--r--drivers/rtc/rtc-v3020.c37
-rw-r--r--drivers/scsi/sg.c1
-rw-r--r--drivers/staging/android/logger.c1
-rw-r--r--drivers/target/target_core_iblock.c6
-rw-r--r--drivers/target/target_core_pscsi.c3
-rw-r--r--drivers/usb/gadget/amd5536udc.c4
-rw-r--r--drivers/usb/gadget/inode.c42
-rw-r--r--drivers/video/Kconfig17
-rw-r--r--drivers/video/Makefile1
-rw-r--r--drivers/video/cyber2000fb.c3
-rw-r--r--drivers/video/simplefb.c234
-rw-r--r--fs/9p/vfs_addr.c1
-rw-r--r--fs/afs/write.c1
-rw-r--r--fs/aio.c1800
-rw-r--r--fs/binfmt_aout.c2
-rw-r--r--fs/binfmt_elf.c23
-rw-r--r--fs/bio-integrity.c3
-rw-r--r--fs/bio.c64
-rw-r--r--fs/block_dev.c1
-rw-r--r--fs/btrfs/check-integrity.c14
-rw-r--r--fs/btrfs/compression.c6
-rw-r--r--fs/btrfs/disk-io.c6
-rw-r--r--fs/btrfs/extent_io.c12
-rw-r--r--fs/btrfs/file.c1
-rw-r--r--fs/btrfs/inode.c14
-rw-r--r--fs/btrfs/raid56.c9
-rw-r--r--fs/btrfs/scrub.c18
-rw-r--r--fs/btrfs/volumes.c5
-rw-r--r--fs/buffer.c3
-rw-r--r--fs/ceph/file.c1
-rw-r--r--fs/compat.c1
-rw-r--r--fs/direct-io.c22
-rw-r--r--fs/drop_caches.c2
-rw-r--r--fs/ecryptfs/file.c1
-rw-r--r--fs/ext2/inode.c1
-rw-r--r--fs/ext3/inode.c1
-rw-r--r--fs/ext4/file.c1
-rw-r--r--fs/ext4/indirect.c1
-rw-r--r--fs/ext4/inode.c1
-rw-r--r--fs/ext4/page-io.c4
-rw-r--r--fs/f2fs/data.c3
-rw-r--r--fs/f2fs/segment.c3
-rw-r--r--fs/fat/file.c108
-rw-r--r--fs/fat/inode.c55
-rw-r--r--fs/fuse/cuse.c1
-rw-r--r--fs/fuse/dev.c1
-rw-r--r--fs/fuse/file.c1
-rw-r--r--fs/gfs2/aops.c1
-rw-r--r--fs/gfs2/file.c1
-rw-r--r--fs/gfs2/lops.c3
-rw-r--r--fs/gfs2/ops_fstype.c3
-rw-r--r--fs/hfs/inode.c1
-rw-r--r--fs/hfsplus/inode.c1
-rw-r--r--fs/hfsplus/wrapper.c3
-rw-r--r--fs/jfs/inode.c1
-rw-r--r--fs/jfs/jfs_logmgr.c4
-rw-r--r--fs/jfs/jfs_metapage.c6
-rw-r--r--fs/logfs/dev_bdev.c8
-rw-r--r--fs/mpage.c2
-rw-r--r--fs/nfs/blocklayout/blocklayout.c17
-rw-r--r--fs/nilfs2/inode.c2
-rw-r--r--fs/nilfs2/segbuf.c3
-rw-r--r--fs/ntfs/file.c1
-rw-r--r--fs/ntfs/inode.c1
-rw-r--r--fs/ocfs2/aops.h2
-rw-r--r--fs/ocfs2/cluster/heartbeat.c4
-rw-r--r--fs/ocfs2/dlmglue.c2
-rw-r--r--fs/ocfs2/inode.h2
-rw-r--r--fs/pipe.c1
-rw-r--r--fs/proc/base.c2
-rw-r--r--fs/proc/internal.h1
-rw-r--r--fs/proc/task_mmu.c128
-rw-r--r--fs/read_write.c35
-rw-r--r--fs/reiserfs/inode.c1
-rw-r--r--fs/ubifs/file.c1
-rw-r--r--fs/udf/inode.c1
-rw-r--r--fs/xfs/xfs_aops.c4
-rw-r--r--fs/xfs/xfs_buf.c3
-rw-r--r--fs/xfs/xfs_file.c1
-rw-r--r--include/asm-generic/pgtable.h22
-rw-r--r--include/linux/aio.h199
-rw-r--r--include/linux/balloon_compaction.h7
-rw-r--r--include/linux/batch_complete.h23
-rw-r--r--include/linux/bio.h38
-rw-r--r--include/linux/blk_types.h4
-rw-r--r--include/linux/blkdev.h12
-rw-r--r--include/linux/cgroup.h1
-rw-r--r--include/linux/decompress/unlz4.h10
-rw-r--r--include/linux/errno.h1
-rw-r--r--include/linux/fs.h2
-rw-r--r--include/linux/hardirq.h5
-rw-r--r--include/linux/lockdep.h92
-rw-r--r--include/linux/lz4.h87
-rw-r--r--include/linux/mm_types.h3
-rw-r--r--include/linux/mmzone.h20
-rw-r--r--include/linux/pagemap.h2
-rw-r--r--include/linux/percpu-refcount.h114
-rw-r--r--include/linux/pid_namespace.h1
-rw-r--r--include/linux/posix-timers.h16
-rw-r--r--include/linux/random.h7
-rw-r--r--include/linux/rio.h17
-rw-r--r--include/linux/rio_drv.h1
-rw-r--r--include/linux/rtc-pxa.h18
-rw-r--r--include/linux/sched.h4
-rw-r--r--include/linux/seccomp.h18
-rw-r--r--include/linux/swap.h9
-rw-r--r--include/linux/vm_event_item.h7
-rw-r--r--include/linux/wait.h86
-rw-r--r--include/linux/writeback.h1
-rw-r--r--include/net/ip_vs.h8
-rw-r--r--init/Kconfig34
-rw-r--r--ipc/msg.c5
-rw-r--r--ipc/sem.c250
-rw-r--r--kernel/audit_tree.c1
-rw-r--r--kernel/auditfilter.c4
-rw-r--r--kernel/fork.c5
-rw-r--r--kernel/lglock.c12
-rw-r--r--kernel/posix-cpu-timers.c395
-rw-r--r--kernel/printk.c92
-rw-r--r--kernel/ptrace.c1
-rw-r--r--kernel/relay.c14
-rw-r--r--kernel/seccomp.c33
-rw-r--r--kernel/smp.c11
-rw-r--r--kernel/watchdog.c10
-rw-r--r--lib/Kconfig13
-rw-r--r--lib/Makefile6
-rw-r--r--lib/decompress.c5
-rw-r--r--lib/decompress_unlz4.c187
-rw-r--r--lib/lz4/Makefile3
-rw-r--r--lib/lz4/lz4_compress.c443
-rw-r--r--lib/lz4/lz4_decompress.c326
-rw-r--r--lib/lz4/lz4defs.h156
-rw-r--r--lib/lz4/lz4hc_compress.c539
-rw-r--r--lib/percpu-refcount.c251
-rw-r--r--mm/Kconfig12
-rw-r--r--mm/balloon_compaction.c2
-rw-r--r--mm/bounce.c12
-rw-r--r--mm/dmapool.c5
-rw-r--r--mm/huge_memory.c2
-rw-r--r--mm/memcontrol.c192
-rw-r--r--mm/migrate.c1
-rw-r--r--mm/mmap.c28
-rw-r--r--mm/mmu_context.c3
-rw-r--r--mm/mremap.c2
-rw-r--r--mm/nommu.c4
-rw-r--r--mm/page_alloc.c5
-rw-r--r--mm/page_io.c9
-rw-r--r--mm/shmem.c1
-rw-r--r--mm/swap.c1
-rw-r--r--mm/util.c1
-rw-r--r--mm/vmstat.c9
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c4
-rw-r--r--scripts/Makefile.lib5
-rw-r--r--security/keys/internal.h2
-rw-r--r--security/keys/keyctl.c1
-rw-r--r--sound/core/pcm_native.c2
-rw-r--r--sound/soc/codecs/si476x.c6
-rw-r--r--tools/testing/selftests/Makefile1
-rw-r--r--tools/testing/selftests/timers/Makefile8
-rw-r--r--tools/testing/selftests/timers/posix_timers.c221
-rw-r--r--usr/Kconfig9
268 files changed, 6496 insertions, 2485 deletions
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 09027a9fece5..1178e2357acb 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -72,6 +72,7 @@ Brief summary of control files.
memory.move_charge_at_immigrate # set/show controls of moving charges
memory.oom_control # set/show oom controls.
memory.numa_stat # show the number of memory usage per numa node
+ memory.dangling_memcgs # show debugging information about dangling groups
memory.kmem.limit_in_bytes # set/show hard limit for kernel memory
memory.kmem.usage_in_bytes # show current kernel memory allocation
@@ -579,6 +580,21 @@ unevictable=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
And we have total = file + anon + unevictable.
+5.7 dangling_memcgs
+
+This file will only be ever present in the root cgroup, if the option
+CONFIG_MEMCG_DEBUG_ASYNC_DESTROY is set. When a memcg is destroyed, the memory
+consumed by it may not be immediately freed. This is because when some
+extensions are used, such as swap or kernel memory, objects can outlive the
+group and hold a reference to it.
+
+If this is the case, the dangling_memcgs file will show information about what
+are the memcgs still alive, and which references are still preventing it to be
+freed. There is nothing wrong with that, but it is very useful when debugging,
+to know where this memory is being held. This is a developer-oriented debugging
+facility only, and no guarantees of interface stability will be given. The file
+is read-only, and has the sole purpose of displaying information.
+
6. Hierarchy support
The memory controller supports a deep hierarchy and hierarchical accounting.
diff --git a/Documentation/devicetree/bindings/video/simple-framebuffer.txt b/Documentation/devicetree/bindings/video/simple-framebuffer.txt
new file mode 100644
index 000000000000..3ea460583111
--- /dev/null
+++ b/Documentation/devicetree/bindings/video/simple-framebuffer.txt
@@ -0,0 +1,25 @@
+Simple Framebuffer
+
+A simple frame-buffer describes a raw memory region that may be rendered to,
+with the assumption that the display hardware has already been set up to scan
+out from that buffer.
+
+Required properties:
+- compatible: "simple-framebuffer"
+- reg: Should contain the location and size of the framebuffer memory.
+- width: The width of the framebuffer in pixels.
+- height: The height of the framebuffer in pixels.
+- stride: The number of bytes in each line of the framebuffer.
+- format: The format of the framebuffer surface. Valid values are:
+ - r5g6b5 (16-bit pixels, d[15:11]=r, d[10:5]=g, d[4:0]=b).
+
+Example:
+
+ framebuffer {
+ compatible = "simple-framebuffer";
+ reg = <0x1d385000 (1600 * 1200 * 2)>;
+ width = <1600>;
+ height = <1200>;
+ stride = <(1600 * 2)>;
+ format = "r5g6b5";
+ };
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index fd8d0d594fc7..488c0940f3d8 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -473,7 +473,8 @@ This file is only present if the CONFIG_MMU kernel configuration option is
enabled.
The /proc/PID/clear_refs is used to reset the PG_Referenced and ACCESSED/YOUNG
-bits on both physical and virtual pages associated with a process.
+bits on both physical and virtual pages associated with a process, and the
+soft-dirty bit on pte (see Documentation/vm/soft-dirty.txt for details).
To clear the bits for all the pages associated with the process
> echo 1 > /proc/PID/clear_refs
@@ -482,11 +483,17 @@ To clear the bits for the anonymous pages associated with the process
To clear the bits for the file mapped pages associated with the process
> echo 3 > /proc/PID/clear_refs
+
+To clear the soft-dirty bit
+ > echo 4 > /proc/PID/clear_refs
+
Any other value written to /proc/PID/clear_refs will have no effect.
The /proc/pid/pagemap gives the PFN, which can be used to find the pageflags
using /proc/kpageflags and number of times a page is mapped using
/proc/kpagecount. For detailed explanation, see Documentation/vm/pagemap.txt.
+(There's also a /proc/pid/pagemap2 file which is the 2nd version of the
+ pagemap one).
1.2 Kernel data
---------------
diff --git a/Documentation/rapidio/rapidio.txt b/Documentation/rapidio/rapidio.txt
index c75694b35d08..d97576f5b082 100644
--- a/Documentation/rapidio/rapidio.txt
+++ b/Documentation/rapidio/rapidio.txt
@@ -79,20 +79,64 @@ master port that is used to communicate with devices within the network.
In order to initialize the RapidIO subsystem, a platform must initialize and
register at least one master port within the RapidIO network. To register mport
within the subsystem controller driver initialization code calls function
-rio_register_mport() for each available master port. After all active master
-ports are registered with a RapidIO subsystem, the rio_init_mports() routine
-is called to perform enumeration and discovery.
+rio_register_mport() for each available master port.
-In the current PowerPC-based implementation a subsys_initcall() is specified to
-perform controller initialization and mport registration. At the end it directly
-calls rio_init_mports() to execute RapidIO enumeration and discovery.
+RapidIO subsystem uses subsys_initcall() or device_initcall() to perform
+controller initialization (depending on controller device type).
+
+After all active master ports are registered with a RapidIO subsystem,
+an enumeration and/or discovery routine may be called automatically or
+by user-space command.
4. Enumeration and Discovery
----------------------------
-When rio_init_mports() is called it scans a list of registered master ports and
-calls an enumeration or discovery routine depending on the configured role of a
-master port: host or agent.
+4.1 Overview
+------------
+
+RapidIO subsystem configuration options allow users to specify enumeration and
+discovery methods as built-in components or loadable modules.
+For built-in options only one method can be selected for all mports in a system.
+Selecting a modular build option for enumeration/discovery allows to use
+different enumerators attached to available mport device individually.
+
+Depending on selected enumeration/discovery build configuration, there are
+several methods to initiate enumeration and/or discovery process:
+
+ (a) Built-in enumeration and discovery process can be started automatically
+ during kernel initialization time. This is the original method used since
+ introduction of RapidIO subsystem. This method relies on kernel initcall that
+ calls rio_init_mports() routine after all available mports have been
+ registered. When automatic start of enumeration/discovery is used a user has
+ to ensure that all discovering endpoints are started before the enumerating
+ endpoint and are waiting for enumeration to be completed.
+ Configuration option CONFIG_RAPIDIO_DISC_TIMEOUT defines time that discovering
+ endpoint waits for enumeration to be completed. If the specified timeout
+ expires the discovery process is terminated without obtaining RapidIO network
+ information. NOTE: a timed out discovery process may be restarted later using
+ a user-space command as it is described later if the given endpoint was
+ enumerated successfully.
+
+ (b) Built-in enumeration and discovery process can be started by a command
+ from user space. This initiation method provides more freedom for system
+ startup compared to the option (a) above. After all participating endpoints
+ have been successfully booted, an enumeration process shall be started first
+ by issuing a user-space command, as soon as enumeration is completed
+ a discovery process can be started on all remaining endpoints.
+ Configuration option CONFIG_RAPIDIO_ENUM_AUTO defines which method will be
+ used to start a built-in enumeration and discovery process.
+
+ (c) Modular enumeration and discovery process can be started by a command from
+ user space. After an enumeration/discovery module is loaded, a network scan
+ process can be started by issuing a user-space command.
+ Similar to the option (b) above, an enumerator has to be started first.
+
+ (d) Modular enumeration and discovery process can be started by a module
+ initialization routine. As in the cases above an enumerating module shall be
+ loaded first.
+
+When a network scan process is started it calls an enumeration or discovery
+routine depending on the configured role of a master port: host or agent.
Enumeration is performed by a master port if it is configured as a host port by
assigning a host device ID greater than or equal to zero. A host device ID is
@@ -104,8 +148,57 @@ for it.
The enumeration and discovery routines use RapidIO maintenance transactions
to access the configuration space of devices.
-The enumeration process is implemented according to the enumeration algorithm
-outlined in the RapidIO Interconnect Specification: Annex I [1].
+4.2 Automatic Start of Enumeration and Discovery
+------------------------------------------------
+
+Automatic enumeration/discovery start method is applicable only to built-in
+enumeration/discovery RapidIO configuration selection. To enable automatic
+enumeration/discovery start set CONFIG_RAPIDIO_ENUM_AUTO=y.
+
+This configuration requires synchronized start of all RapidIO endpoints that
+form a network which will be enumerated/discovered. Discovering endpoints have
+to be started before an enumeration starts to ensure that all RapidIO
+controllers have been initialized and are ready to be discovered. Configuration
+parameter CONFIG_RAPIDIO_DISC_TIMEOUT defines time (in seconds) which
+a discovering endpoint will wait for enumeration to be completed.
+
+When automatic enumeration/discovery start method is selected RapidIO subsystem
+core uses device_initcall_sync() to invoke rio_init_mports() routine to perform
+enumeration or discovery for all known mport devices.
+
+Depending on RapidIO network size and configuration this automatic
+enumeration/discovery start method may be difficult to use due to the
+requirement for synchronized start of all endpoints.
+
+4.3 User-space Start of Enumeration and Discovery
+-------------------------------------------------
+
+User-space start of enumeration and discovery can be used with built-in and
+modular build configurations. For user-space controlled start RapidIO subsystem
+creates the sysfs write-only attribute file '/sys/bus/rapidio/scan'. To initiate
+an enumeration or discovery process on specific mport device, a user needs to
+write mport_ID (not RapidIO destination ID) into that file. The mport_ID is a
+sequential number (0 ... RIO_MAX_MPORTS) assigned during mport device
+registration. For example for machine with single RapidIO controller, mport_ID
+for that controller always will be 0.
+
+To initiate RapidIO enumeration/discovery on all available mports a user may
+write '-1' (or RIO_MPORT_ANY) into the scan attribute file.
+
+4.4 Basic Enumeration Method
+----------------------------
+
+This is an original enumeration/discovery method which is available since
+first release of RapidIO subsystem code. The enumeration process is
+implemented according to the enumeration algorithm outlined in the RapidIO
+Interconnect Specification: Annex I [1].
+
+This method can be configured as built-in or loadable module. When used as a
+kernel module this method offers parameter "scan" witch allows to trigger the
+enumeration/discovery process from module initialization routine.
+
+This enumeration/discovery method can be started only once and does not support
+unloading if it is built as a module.
The enumeration process traverses the network using a recursive depth-first
algorithm. When a new device is found, the enumerator takes ownership of that
@@ -160,6 +253,28 @@ time period. If this wait time period expires before enumeration is completed,
an agent skips RapidIO discovery and continues with remaining kernel
initialization.
+4.5 Adding New Enumeration/Discovery Method
+-------------------------------------------
+
+RapidIO subsystem code organization allows addition of new enumeration/discovery
+methods as new configuration options without significant impact to to the core
+RapidIO code.
+
+RapidIO developers can add new enumeration/discovery methods as built-in code or
+loadable kernel modules. In case of built-in methods only one method may be
+configured in, having multiple built-in enumeration/discovery methods is not
+supported by mport role assignment mechanism.
+
+To be registered with RapidIO subsystem a new built-in method must provide
+global data structure named "rio_scan_ops" (see definition of 'struct rio_scan'
+in include/linux/rio.h file). Example of this structure is available in the
+existing implementation of basic RapidIO enumeration method (see rio-scan.c).
+
+If a new enumeration/discovery method is implemented as a module, its module
+initialization routine has to call rio_register_scan() routine to attach
+a loadable enumerator to a specified mport device. The basic enumerator
+implementation demonstrates this process as well.
+
5. References
-------------
diff --git a/Documentation/rapidio/sysfs.txt b/Documentation/rapidio/sysfs.txt
index 97f71ce575d6..c3b7d18ebdff 100644
--- a/Documentation/rapidio/sysfs.txt
+++ b/Documentation/rapidio/sysfs.txt
@@ -88,3 +88,20 @@ that exports additional attributes.
IDT_GEN2:
errlog - reads contents of device error log until it is empty.
+
+
+5. RapidIO Bus Attributes
+-------------------------
+
+RapidIO bus subdirectory /sys/bus/rapidio implements the following bus-specific
+attribute:
+
+ scan - allows to trigger enumeration discovery process from user space. This
+ is write-only attribute. To initiate an enumeration or discovery
+ process on specific mport device, a user needs to write mport_ID (not
+ RapidIO destination ID) into this file. The mport_ID is a sequential
+ number (0 ... RIO_MAX_MPORTS) assigned to mport device. For example for
+ machine with single RapidIO controller, mport_ID for that controller
+ always will be 0. To initiate RapidIO enumeration/discovery on all
+ available mports a user must write '-1' (or RIO_MPORT_ANY) into this
+ attribute file.
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index dcc75a9ed919..a5717c38834a 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -169,18 +169,39 @@ Setting this to zero disables periodic writeback altogether.
drop_caches
-Writing to this will cause the kernel to drop clean caches, dentries and
-inodes from memory, causing that memory to become free.
+Writing to this will cause the kernel to drop clean caches, as well as
+reclaimable slab objects like dentries and inodes. Once dropped, their
+memory becomes free.
To free pagecache:
echo 1 > /proc/sys/vm/drop_caches
-To free dentries and inodes:
+To free reclaimable slab objects (includes dentries and inodes):
echo 2 > /proc/sys/vm/drop_caches
-To free pagecache, dentries and inodes:
+To free slab objects and pagecache:
echo 3 > /proc/sys/vm/drop_caches
-As this is a non-destructive operation and dirty objects are not freeable, the
-user should run `sync' first.
+This is a non-destructive operation and will not free any dirty objects.
+To increase the number of objects freed by this operation, the user may run
+`sync' prior to writing to /proc/sys/vm/drop_caches. This will minimize the
+number of dirty objects on the system and create more candidates to be
+dropped.
+
+This file is not a means to control the growth of the various kernel caches
+(inodes, dentries, pagecache, etc...) These objects are automatically
+reclaimed by the kernel when memory is needed elsewhere on the system.
+
+Use of this file can cause performance problems. Since it discards cached
+objects, it may cost a significant amount of I/O and CPU to recreate the
+dropped objects, especially if they were under heavy use. Because of this,
+use outside of a testing or debugging environment is not recommended.
+
+You may see informational messages in your kernel log when this file is
+used:
+
+ cat (1234): dropped kernel caches: 3
+
+These are informational only. They do not mean that anything is wrong
+with your system.
==============================================================
diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt
index 7587493c67f1..394cc03c8df6 100644
--- a/Documentation/vm/pagemap.txt
+++ b/Documentation/vm/pagemap.txt
@@ -30,6 +30,11 @@ There are three components to pagemap:
determine which areas of memory are actually mapped and llseek to
skip over unmapped regions.
+ * /proc/pid/pagemap2. This file provides the same info as the pagemap
+ does, but bits 56-60 are reserved for future use and thus zero
+
+ Bit 55 means pte is soft-dirty (see Documentation/vm/soft-dirty.txt)
+
* /proc/kpagecount. This file contains a 64-bit count of the number of
times each page is mapped, indexed by PFN.
diff --git a/Documentation/vm/soft-dirty.txt b/Documentation/vm/soft-dirty.txt
new file mode 100644
index 000000000000..9a12a5956bc0
--- /dev/null
+++ b/Documentation/vm/soft-dirty.txt
@@ -0,0 +1,36 @@
+ SOFT-DIRTY PTEs
+
+ The soft-dirty is a bit on a PTE which helps to track which pages a task
+writes to. In order to do this tracking one should
+
+ 1. Clear soft-dirty bits from the task's PTEs.
+
+ This is done by writing "4" into the /proc/PID/clear_refs file of the
+ task in question.
+
+ 2. Wait some time.
+
+ 3. Read soft-dirty bits from the PTEs.
+
+ This is done by reading from the /proc/PID/pagemap. The bit 55 of the
+ 64-bit qword is the soft-dirty one. If set, the respective PTE was
+ written to since step 1.
+
+
+ Internally, to do this tracking, the writable bit is cleared from PTEs
+when the soft-dirty bit is cleared. So, after this, when the task tries to
+modify a page at some virtual address the #PF occurs and the kernel sets
+the soft-dirty bit on the respective PTE.
+
+ Note, that although all the task's address space is marked as r/o after the
+soft-dirty bits clear, the #PF-s that occur after that are processed fast.
+This is so, since the pages are still mapped to physical memory, and thus all
+the kernel does is finds this fact out and puts both writable and soft-dirty
+bits on the PTE.
+
+
+ This feature is actively used by the checkpoint-restore project. You
+can find more details about it on http://criu.org
+
+
+-- Pavel Emelyanov, Apr 9, 2013
diff --git a/arch/Kconfig b/arch/Kconfig
index dd0e8eb8042f..a6b105618136 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -329,6 +329,10 @@ config HAVE_ARCH_SECCOMP_FILTER
- secure_computing return value is checked and a return value of -1
results in the system call being skipped immediately.
+# Used by archs to tell that they support SECCOMP_FILTER_JIT
+config HAVE_SECCOMP_FILTER_JIT
+ bool
+
config SECCOMP_FILTER
def_bool y
depends on HAVE_ARCH_SECCOMP_FILTER && SECCOMP && NET
@@ -339,6 +343,16 @@ config SECCOMP_FILTER
See Documentation/prctl/seccomp_filter.txt for details.
+config SECCOMP_FILTER_JIT
+ bool "enable Seccomp filter Just In Time compiler"
+ depends on HAVE_SECCOMP_FILTER_JIT && BPF_JIT && SECCOMP_FILTER
+ help
+ Seccomp syscall filtering capabilities are normally handled
+ by an interpreter. This option allows kernel to generate a native
+ code when filter is loaded in memory. This should speedup
+ syscall filtering. Note : Admin should enable this feature
+ changing /proc/sys/net/core/bpf_jit_enable
+
config HAVE_CONTEXT_TRACKING
bool
help
@@ -362,6 +376,9 @@ config HAVE_IRQ_TIME_ACCOUNTING
config HAVE_ARCH_TRANSPARENT_HUGEPAGE
bool
+config HAVE_ARCH_SOFT_DIRTY
+ bool
+
config HAVE_MOD_ARCH_SPECIFIC
bool
help
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index d00a7336061c..f95674a76af0 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -25,6 +25,7 @@ config ARM
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_TRACEHOOK
select HAVE_BPF_JIT
+ select HAVE_SECCOMP_FILTER_JIT
select HAVE_C_RECORDMCOUNT
select HAVE_DEBUG_KMEMLEAK
select HAVE_DMA_API_DEBUG
@@ -39,6 +40,7 @@ config ARM
select HAVE_HW_BREAKPOINT if (PERF_EVENTS && (CPU_V6 || CPU_V6K || CPU_V7))
select HAVE_IDE if PCI || ISA || PCMCIA
select HAVE_KERNEL_GZIP
+ select HAVE_KERNEL_LZ4
select HAVE_KERNEL_LZMA
select HAVE_KERNEL_LZO
select HAVE_KERNEL_XZ
diff --git a/arch/arm/boot/compressed/.gitignore b/arch/arm/boot/compressed/.gitignore
index f79a08efe000..47279aa96a6a 100644
--- a/arch/arm/boot/compressed/.gitignore
+++ b/arch/arm/boot/compressed/.gitignore
@@ -6,6 +6,7 @@ piggy.gzip
piggy.lzo
piggy.lzma
piggy.xzkern
+piggy.lz4
vmlinux
vmlinux.lds
diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile
index 3580d57ea218..001a13a0cf6f 100644
--- a/arch/arm/boot/compressed/Makefile
+++ b/arch/arm/boot/compressed/Makefile
@@ -27,6 +27,9 @@ OBJS += misc.o decompress.o
ifeq ($(CONFIG_DEBUG_UNCOMPRESS),y)
OBJS += debug.o
endif
+ifeq ($(CONFIG_KERNEL_LZ4),y)
+CFLAGS_decompress.o := -Os
+endif
FONTC = $(srctree)/drivers/video/console/font_acorn_8x8.c
# string library code (-Os is enforced to keep it much smaller)
@@ -91,6 +94,7 @@ suffix_$(CONFIG_KERNEL_GZIP) = gzip
suffix_$(CONFIG_KERNEL_LZO) = lzo
suffix_$(CONFIG_KERNEL_LZMA) = lzma
suffix_$(CONFIG_KERNEL_XZ) = xzkern
+suffix_$(CONFIG_KERNEL_LZ4) = lz4
# Borrowed libfdt files for the ATAG compatibility mode
@@ -115,7 +119,7 @@ targets := vmlinux vmlinux.lds \
font.o font.c head.o misc.o $(OBJS)
# Make sure files are removed during clean
-extra-y += piggy.gzip piggy.lzo piggy.lzma piggy.xzkern \
+extra-y += piggy.gzip piggy.lzo piggy.lzma piggy.xzkern piggy.lz4 \
lib1funcs.S ashldi3.S $(libfdt) $(libfdt_hdrs)
ifeq ($(CONFIG_FUNCTION_TRACER),y)
diff --git a/arch/arm/boot/compressed/decompress.c b/arch/arm/boot/compressed/decompress.c
index 24b0475cb8bf..bd245d34952d 100644
--- a/arch/arm/boot/compressed/decompress.c
+++ b/arch/arm/boot/compressed/decompress.c
@@ -51,6 +51,10 @@ extern char * strstr(const char * s1, const char *s2);
#include "../../../../lib/decompress_unxz.c"
#endif
+#ifdef CONFIG_KERNEL_LZ4
+#include "../../../../lib/decompress_unlz4.c"
+#endif
+
int do_decompress(u8 *input, int len, u8 *output, void (*error)(char *x))
{
return decompress(input, len, NULL, NULL, output, NULL, error);
diff --git a/arch/arm/boot/compressed/piggy.lz4.S b/arch/arm/boot/compressed/piggy.lz4.S
new file mode 100644
index 000000000000..3d9a575618a3
--- /dev/null
+++ b/arch/arm/boot/compressed/piggy.lz4.S
@@ -0,0 +1,6 @@
+ .section .piggydata,#alloc
+ .globl input_data
+input_data:
+ .incbin "arch/arm/boot/compressed/piggy.lz4"
+ .globl input_data_end
+input_data_end:
diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c
index 10062ceadd1c..0c6356255fe3 100644
--- a/arch/arm/mm/mmap.c
+++ b/arch/arm/mm/mmap.c
@@ -181,11 +181,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
if (mmap_is_legacy()) {
mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
mm->get_unmapped_area = arch_get_unmapped_area;
- mm->unmap_area = arch_unmap_area;
} else {
mm->mmap_base = mmap_base(random_factor);
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
- mm->unmap_area = arch_unmap_area_topdown;
}
}
diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
index 1a643ee8e082..c5ef845ff89e 100644
--- a/arch/arm/net/bpf_jit_32.c
+++ b/arch/arm/net/bpf_jit_32.c
@@ -55,7 +55,8 @@
#define FLAG_NEED_X_RESET (1 << 0)
struct jit_ctx {
- const struct sk_filter *skf;
+ unsigned short prog_len;
+ struct sock_filter *prog_insns;
unsigned idx;
unsigned prologue_bytes;
int ret0_fp_idx;
@@ -131,8 +132,8 @@ static u16 saved_regs(struct jit_ctx *ctx)
{
u16 ret = 0;
- if ((ctx->skf->len > 1) ||
- (ctx->skf->insns[0].code == BPF_S_RET_A))
+ if ((ctx->prog_len > 1) ||
+ (ctx->prog_insns[0].code == BPF_S_RET_A))
ret |= 1 << r_A;
#ifdef CONFIG_FRAME_POINTER
@@ -181,7 +182,7 @@ static inline bool is_load_to_a(u16 inst)
static void build_prologue(struct jit_ctx *ctx)
{
u16 reg_set = saved_regs(ctx);
- u16 first_inst = ctx->skf->insns[0].code;
+ u16 first_inst = ctx->prog_insns[0].code;
u16 off;
#ifdef CONFIG_FRAME_POINTER
@@ -279,7 +280,7 @@ static u16 imm_offset(u32 k, struct jit_ctx *ctx)
ctx->imms[i] = k;
/* constants go just after the epilogue */
- offset = ctx->offsets[ctx->skf->len];
+ offset = ctx->offsets[ctx->prog_len];
offset += ctx->prologue_bytes;
offset += ctx->epilogue_bytes;
offset += i * 4;
@@ -419,7 +420,7 @@ static inline void emit_err_ret(u8 cond, struct jit_ctx *ctx)
emit(ARM_MOV_R(ARM_R0, ARM_R0), ctx);
} else {
_emit(cond, ARM_MOV_I(ARM_R0, 0), ctx);
- _emit(cond, ARM_B(b_imm(ctx->skf->len, ctx)), ctx);
+ _emit(cond, ARM_B(b_imm(ctx->prog_len, ctx)), ctx);
}
}
@@ -469,14 +470,13 @@ static inline void update_on_xread(struct jit_ctx *ctx)
static int build_body(struct jit_ctx *ctx)
{
void *load_func[] = {jit_get_skb_b, jit_get_skb_h, jit_get_skb_w};
- const struct sk_filter *prog = ctx->skf;
const struct sock_filter *inst;
unsigned i, load_order, off, condt;
int imm12;
u32 k;
- for (i = 0; i < prog->len; i++) {
- inst = &(prog->insns[i]);
+ for (i = 0; i < ctx->prog_len; i++) {
+ inst = &(ctx->prog_insns[i]);
/* K as an immediate value operand */
k = inst->k;
@@ -548,6 +548,15 @@ load_common:
emit_err_ret(ARM_COND_NE, ctx);
emit(ARM_MOV_R(r_A, ARM_R0), ctx);
break;
+#ifdef CONFIG_SECCOMP_FILTER_JIT
+ case BPF_S_ANC_SECCOMP_LD_W:
+ ctx->seen |= SEEN_CALL;
+ emit_mov_i(ARM_R3, (u32)seccomp_bpf_load, ctx);
+ emit_mov_i(ARM_R0, k, ctx);
+ emit_blx_r(ARM_R3, ctx);
+ emit(ARM_MOV_R(r_A, ARM_R0), ctx);
+ break;
+#endif
case BPF_S_LD_W_IND:
load_order = 2;
goto load_ind;
@@ -769,8 +778,8 @@ cmp_x:
ctx->ret0_fp_idx = i;
emit_mov_i(ARM_R0, k, ctx);
b_epilogue:
- if (i != ctx->skf->len - 1)
- emit(ARM_B(b_imm(prog->len, ctx)), ctx);
+ if (i != ctx->prog_len - 1)
+ emit(ARM_B(b_imm(ctx->prog_len, ctx)), ctx);
break;
case BPF_S_MISC_TAX:
/* X = A */
@@ -858,7 +867,7 @@ b_epilogue:
}
-void bpf_jit_compile(struct sk_filter *fp)
+static void __bpf_jit_compile(struct jit_ctx *out_ctx)
{
struct jit_ctx ctx;
unsigned tmp_idx;
@@ -867,11 +876,10 @@ void bpf_jit_compile(struct sk_filter *fp)
if (!bpf_jit_enable)
return;
- memset(&ctx, 0, sizeof(ctx));
- ctx.skf = fp;
+ ctx = *out_ctx;
ctx.ret0_fp_idx = -1;
- ctx.offsets = kzalloc(4 * (ctx.skf->len + 1), GFP_KERNEL);
+ ctx.offsets = kzalloc(4 * (ctx.prog_len + 1), GFP_KERNEL);
if (ctx.offsets == NULL)
return;
@@ -920,13 +928,26 @@ void bpf_jit_compile(struct sk_filter *fp)
if (bpf_jit_enable > 1)
/* there are 2 passes here */
bpf_jit_dump(fp->len, alloc_size, 2, ctx.target);
-
- fp->bpf_func = (void *)ctx.target;
out:
kfree(ctx.offsets);
+
+ *out_ctx = ctx;
return;
}
+void bpf_jit_compile(struct sk_filter *fp)
+{
+ struct jit_ctx ctx;
+
+ memset(&ctx, 0, sizeof(ctx));
+ ctx.prog_len = fp->len;
+ ctx.prog_insns = fp->insns;
+
+ __bpf_jit_compile(&ctx);
+ if (ctx.target)
+ fp->bpf_func = (void *)ctx.target;
+}
+
static void bpf_jit_free_worker(struct work_struct *work)
{
module_free(NULL, work);
@@ -937,9 +958,45 @@ void bpf_jit_free(struct sk_filter *fp)
struct work_struct *work;
if (fp->bpf_func != sk_run_filter) {
+ /*
+ * bpf_jit_free() can be called from softirq; module_free()
+ * requires process context.
+ */
work = (struct work_struct *)fp->bpf_func;
INIT_WORK(work, bpf_jit_free_worker);
schedule_work(work);
}
}
+
+#ifdef CONFIG_SECCOMP_FILTER_JIT
+void seccomp_jit_compile(struct seccomp_filter *fp)
+{
+ struct jit_ctx ctx;
+
+ memset(&ctx, 0, sizeof(ctx));
+ ctx.prog_len = seccomp_filter_get_len(fp);
+ ctx.prog_insns = seccomp_filter_get_insns(fp);
+
+ __bpf_jit_compile(&ctx);
+ if (ctx.target)
+ seccomp_filter_set_bpf_func(fp, (void *)ctx.target);
+}
+
+void seccomp_jit_free(struct seccomp_filter *fp)
+{
+ struct work_struct *work;
+ void *bpf_func = seccomp_filter_get_bpf_func(fp);
+
+ if (bpf_func != sk_run_filter) {
+ /*
+ * seccomp_jit_free() can be called from softirq; module_free()
+ * requires process context.
+ */
+ work = (struct work_struct *)bpf_func;
+
+ INIT_WORK(work, bpf_jit_free_worker);
+ schedule_work(work);
+ }
+}
+#endif
diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index 7c7be7855638..8ed6cb1a900f 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -90,11 +90,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
if (mmap_is_legacy()) {
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = arch_get_unmapped_area;
- mm->unmap_area = arch_unmap_area;
} else {
mm->mmap_base = mmap_base();
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
- mm->unmap_area = arch_unmap_area_topdown;
}
}
EXPORT_SYMBOL_GPL(arch_pick_mmap_layout);
diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c
index 7e5fe2790d8a..f1baadd56e82 100644
--- a/arch/mips/mm/mmap.c
+++ b/arch/mips/mm/mmap.c
@@ -158,11 +158,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
if (mmap_is_legacy()) {
mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
mm->get_unmapped_area = arch_get_unmapped_area;
- mm->unmap_area = arch_unmap_area;
} else {
mm->mmap_base = mmap_base(random_factor);
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
- mm->unmap_area = arch_unmap_area_topdown;
}
}
diff --git a/arch/powerpc/mm/mmap_64.c b/arch/powerpc/mm/mmap_64.c
index 67a42ed0d2fc..cb8bdbe4972f 100644
--- a/arch/powerpc/mm/mmap_64.c
+++ b/arch/powerpc/mm/mmap_64.c
@@ -92,10 +92,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
if (mmap_is_legacy()) {
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = arch_get_unmapped_area;
- mm->unmap_area = arch_unmap_area;
} else {
mm->mmap_base = mmap_base();
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
- mm->unmap_area = arch_unmap_area_topdown;
}
}
diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index c427ae36374a..7ab1a6655d89 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -699,6 +699,10 @@ static void jit_free_defer(struct work_struct *arg)
void bpf_jit_free(struct sk_filter *fp)
{
if (fp->bpf_func != sk_run_filter) {
+ /*
+ * bpf_jit_free() can be called from softirq; module_free()
+ * requires process context.
+ */
struct work_struct *work = (struct work_struct *)fp->bpf_func;
INIT_WORK(work, jit_free_defer);
diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c
index 5f7d7ba2874c..7a539f4f5e30 100644
--- a/arch/s390/hypfs/inode.c
+++ b/arch/s390/hypfs/inode.c
@@ -21,6 +21,7 @@
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/mount.h>
+#include <linux/aio.h>
#include <asm/ebcdic.h>
#include "hypfs.h"
diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
index 06bafec00278..40023290ee5b 100644
--- a/arch/s390/mm/mmap.c
+++ b/arch/s390/mm/mmap.c
@@ -91,11 +91,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
if (mmap_is_legacy()) {
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = arch_get_unmapped_area;
- mm->unmap_area = arch_unmap_area;
} else {
mm->mmap_base = mmap_base();
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
- mm->unmap_area = arch_unmap_area_topdown;
}
}
@@ -176,11 +174,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
if (mmap_is_legacy()) {
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = s390_get_unmapped_area;
- mm->unmap_area = arch_unmap_area;
} else {
mm->mmap_base = mmap_base();
mm->get_unmapped_area = s390_get_unmapped_area_topdown;
- mm->unmap_area = arch_unmap_area_topdown;
}
}
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 82f165f8078c..e199efb50f0a 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -818,6 +818,10 @@ void bpf_jit_free(struct sk_filter *fp)
if (fp->bpf_func == sk_run_filter)
return;
+ /*
+ * bpf_jit_free() can be called from softirq; module_free() requires
+ * process context.
+ */
work = (struct work_struct *)fp->bpf_func;
INIT_WORK(work, jit_free_defer);
schedule_work(work);
diff --git a/arch/sparc/kernel/leon_smp.c b/arch/sparc/kernel/leon_smp.c
index 9b40c9c12a0c..6cfc1b09ec25 100644
--- a/arch/sparc/kernel/leon_smp.c
+++ b/arch/sparc/kernel/leon_smp.c
@@ -253,24 +253,15 @@ void __init leon_smp_done(void)
/* Free unneeded trap tables */
if (!cpu_present(1)) {
- ClearPageReserved(virt_to_page(&trapbase_cpu1));
- init_page_count(virt_to_page(&trapbase_cpu1));
- free_page((unsigned long)&trapbase_cpu1);
- totalram_pages++;
+ free_reserved_page(virt_to_page(&trapbase_cpu1));
num_physpages++;
}
if (!cpu_present(2)) {
- ClearPageReserved(virt_to_page(&trapbase_cpu2));
- init_page_count(virt_to_page(&trapbase_cpu2));
- free_page((unsigned long)&trapbase_cpu2);
- totalram_pages++;
+ free_reserved_page(virt_to_page(&trapbase_cpu2));
num_physpages++;
}
if (!cpu_present(3)) {
- ClearPageReserved(virt_to_page(&trapbase_cpu3));
- init_page_count(virt_to_page(&trapbase_cpu3));
- free_page((unsigned long)&trapbase_cpu3);
- totalram_pages++;
+ free_reserved_page(virt_to_page(&trapbase_cpu3));
num_physpages++;
}
/* Ok, they are spinning and ready to go. */
diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c
index 2daaaa6eda23..51561b8b15ba 100644
--- a/arch/sparc/kernel/sys_sparc_64.c
+++ b/arch/sparc/kernel/sys_sparc_64.c
@@ -290,7 +290,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
sysctl_legacy_va_layout) {
mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
mm->get_unmapped_area = arch_get_unmapped_area;
- mm->unmap_area = arch_unmap_area;
} else {
/* We know it's 32-bit */
unsigned long task_size = STACK_TOP32;
@@ -302,7 +301,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
mm->mmap_base = PAGE_ALIGN(task_size - gap - random_factor);
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
- mm->unmap_area = arch_unmap_area_topdown;
}
}
diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c
index 4490c397bb5b..af472cf7c69a 100644
--- a/arch/sparc/mm/init_32.c
+++ b/arch/sparc/mm/init_32.c
@@ -366,45 +366,14 @@ void __init mem_init(void)
void free_initmem (void)
{
- unsigned long addr;
- unsigned long freed;
-
- addr = (unsigned long)(&__init_begin);
- freed = (unsigned long)(&__init_end) - addr;
- for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
- struct page *p;
-
- memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
- p = virt_to_page(addr);
-
- ClearPageReserved(p);
- init_page_count(p);
- __free_page(p);
- totalram_pages++;
- num_physpages++;
- }
- printk(KERN_INFO "Freeing unused kernel memory: %ldk freed\n",
- freed >> 10);
+ num_physpages += free_initmem_default(POISON_FREE_INITMEM);
}
#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
- if (start < end)
- printk(KERN_INFO "Freeing initrd memory: %ldk freed\n",
- (end - start) >> 10);
- for (; start < end; start += PAGE_SIZE) {
- struct page *p;
-
- memset((void *)start, POISON_FREE_INITMEM, PAGE_SIZE);
- p = virt_to_page(start);
-
- ClearPageReserved(p);
- init_page_count(p);
- __free_page(p);
- totalram_pages++;
- num_physpages++;
- }
+ num_physpages += free_reserved_area(start, end, POISON_FREE_INITMEM,
+ "initrd");
}
#endif
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index cf72a8a5b3aa..a7171997adfd 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2059,8 +2059,7 @@ void __init mem_init(void)
/* We subtract one to account for the mem_map_zero page
* allocated below.
*/
- totalram_pages -= 1;
- num_physpages = totalram_pages;
+ num_physpages = totalram_pages - 1;
/*
* Set up the zero page, mark it reserved, so that page count
@@ -2071,7 +2070,7 @@ void __init mem_init(void)
prom_printf("paging_init: Cannot alloc zero page.\n");
prom_halt();
}
- SetPageReserved(mem_map_zero);
+ mark_page_reserved(mem_map_zero);
codepages = (((unsigned long) _etext) - ((unsigned long) _start));
codepages = PAGE_ALIGN(codepages) >> PAGE_SHIFT;
@@ -2111,37 +2110,22 @@ void free_initmem(void)
initend = (unsigned long)(__init_end) & PAGE_MASK;
for (; addr < initend; addr += PAGE_SIZE) {
unsigned long page;
- struct page *p;
page = (addr +
((unsigned long) __va(kern_base)) -
((unsigned long) KERNBASE));
memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
- if (do_free) {
- p = virt_to_page(page);
-
- ClearPageReserved(p);
- init_page_count(p);
- __free_page(p);
- totalram_pages++;
- }
+ if (do_free)
+ free_reserved_page(virt_to_page(page));
}
}
#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
- if (start < end)
- printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
- for (; start < end; start += PAGE_SIZE) {
- struct page *p = virt_to_page(start);
-
- ClearPageReserved(p);
- init_page_count(p);
- __free_page(p);
- totalram_pages++;
- }
+ num_physpages += free_reserved_area(start, end, POISON_FREE_INITMEM,
+ "initrd");
}
#endif
diff --git a/arch/sparc/net/bpf_jit_comp.c b/arch/sparc/net/bpf_jit_comp.c
index d36a85ebb5e0..4bd140a8536c 100644
--- a/arch/sparc/net/bpf_jit_comp.c
+++ b/arch/sparc/net/bpf_jit_comp.c
@@ -817,6 +817,10 @@ static void jit_free_defer(struct work_struct *arg)
void bpf_jit_free(struct sk_filter *fp)
{
if (fp->bpf_func != sk_run_filter) {
+ /*
+ * bpf_jit_free() can be called from softirq; module_free()
+ * requires process context.
+ */
struct work_struct *work = (struct work_struct *)fp->bpf_func;
INIT_WORK(work, jit_free_defer);
diff --git a/arch/tile/mm/mmap.c b/arch/tile/mm/mmap.c
index f96f4cec602a..d67d91ebf63e 100644
--- a/arch/tile/mm/mmap.c
+++ b/arch/tile/mm/mmap.c
@@ -66,10 +66,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
if (!is_32bit || rlimit(RLIMIT_STACK) == RLIM_INFINITY) {
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = arch_get_unmapped_area;
- mm->unmap_area = arch_unmap_area;
} else {
mm->mmap_base = mmap_base(mm);
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
- mm->unmap_area = arch_unmap_area_topdown;
}
}
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b60a1924fe35..e8fff2f4ecb7 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -65,6 +65,7 @@ config X86
select HAVE_KERNEL_LZMA
select HAVE_KERNEL_XZ
select HAVE_KERNEL_LZO
+ select HAVE_KERNEL_LZ4
select HAVE_HW_BREAKPOINT
select HAVE_MIXED_BREAKPOINTS_REGS
select PERF_EVENTS
@@ -102,6 +103,7 @@ config X86
select HAVE_ARCH_SECCOMP_FILTER
select BUILDTIME_EXTABLE_SORT
select GENERIC_CMOS_UPDATE
+ select HAVE_ARCH_SOFT_DIRTY
select CLOCKSOURCE_WATCHDOG
select GENERIC_CLOCKEVENTS
select ARCH_CLOCKSOURCE_DATA if X86_64
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 5ef205c5f37b..dcd90df10ab4 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -4,7 +4,8 @@
# create a compressed vmlinux image from the original vmlinux
#
-targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.xz vmlinux.bin.lzo
+targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \
+ vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4
KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2
KBUILD_CFLAGS += -fno-strict-aliasing -fPIC
@@ -63,12 +64,15 @@ $(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE
$(call if_changed,xzkern)
$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE
$(call if_changed,lzo)
+$(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE
+ $(call if_changed,lz4)
suffix-$(CONFIG_KERNEL_GZIP) := gz
suffix-$(CONFIG_KERNEL_BZIP2) := bz2
suffix-$(CONFIG_KERNEL_LZMA) := lzma
suffix-$(CONFIG_KERNEL_XZ) := xz
suffix-$(CONFIG_KERNEL_LZO) := lzo
+suffix-$(CONFIG_KERNEL_LZ4) := lz4
quiet_cmd_mkpiggy = MKPIGGY $@
cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false )
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 7cb56c6ca351..0319c88290a5 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -145,6 +145,10 @@ static int lines, cols;
#include "../../../../lib/decompress_unlzo.c"
#endif
+#ifdef CONFIG_KERNEL_LZ4
+#include "../../../../lib/decompress_unlz4.c"
+#endif
+
static void scroll(void)
{
int i;
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 805078e08013..fbeba82c703b 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -308,8 +308,6 @@ static int load_aout_binary(struct linux_binprm *bprm)
(current->mm->start_data = N_DATADDR(ex));
current->mm->brk = ex.a_bss +
(current->mm->start_brk = N_BSSADDR(ex));
- current->mm->free_area_cache = TASK_UNMAPPED_BASE;
- current->mm->cached_hole_size = 0;
retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT);
if (retval < 0) {
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index cccd07fa5e3a..b8e9224f0b45 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -17,6 +17,8 @@ extern unsigned long pci_mem_start;
extern int e820_any_mapped(u64 start, u64 end, unsigned type);
extern int e820_all_mapped(u64 start, u64 end, unsigned type);
extern void e820_add_region(u64 start, u64 size, int type);
+extern void e820_add_limit_region(u64 start, u64 size, int type);
+extern void e820_adjust_region(u64 *start, u64 *size);
extern void e820_print_map(char *who);
extern int
sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, u32 *pnr_map);
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 1e672234c4ff..ebf937362479 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -207,7 +207,7 @@ static inline pte_t pte_mkexec(pte_t pte)
static inline pte_t pte_mkdirty(pte_t pte)
{
- return pte_set_flags(pte, _PAGE_DIRTY);
+ return pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
}
static inline pte_t pte_mkyoung(pte_t pte)
@@ -271,7 +271,7 @@ static inline pmd_t pmd_wrprotect(pmd_t pmd)
static inline pmd_t pmd_mkdirty(pmd_t pmd)
{
- return pmd_set_flags(pmd, _PAGE_DIRTY);
+ return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
}
static inline pmd_t pmd_mkhuge(pmd_t pmd)
@@ -294,6 +294,26 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd)
return pmd_clear_flags(pmd, _PAGE_PRESENT);
}
+static inline int pte_soft_dirty(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_SOFT_DIRTY;
+}
+
+static inline int pmd_soft_dirty(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_SOFT_DIRTY;
+}
+
+static inline pte_t pte_mksoft_dirty(pte_t pte)
+{
+ return pte_set_flags(pte, _PAGE_SOFT_DIRTY);
+}
+
+static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY);
+}
+
/*
* Mask out unsupported bits in a present pgprot. Non-present pgprots
* can use those bits for other purposes, so leave them be.
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index e6423002c10b..c98ac63aae48 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -55,6 +55,18 @@
#define _PAGE_HIDDEN (_AT(pteval_t, 0))
#endif
+/*
+ * The same hidden bit is used by kmemcheck, but since kmemcheck
+ * works on kernel pages while soft-dirty engine on user space,
+ * they do not conflict with each other.
+ */
+
+#ifdef CONFIG_MEM_SOFT_DIRTY
+#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN)
+#else
+#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0))
+#endif
+
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
#else
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index d32abeabbda5..0d5bb689649a 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -47,6 +47,7 @@ unsigned long pci_mem_start = 0xaeedbabe;
#ifdef CONFIG_PCI
EXPORT_SYMBOL(pci_mem_start);
#endif
+static u64 mem_limit = ~0ULL;
/*
* This function checks if any part of the range <start,end> is mapped
@@ -108,7 +109,7 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type)
* Add a memory region to the kernel e820 map.
*/
static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
- int type)
+ int type, bool limited)
{
int x = e820x->nr_map;
@@ -119,6 +120,22 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
return;
}
+ if (limited) {
+ if (start >= mem_limit) {
+ printk(KERN_ERR "e820: ignoring [mem %#010llx-%#010llx]\n",
+ (unsigned long long)start,
+ (unsigned long long)(start + size - 1));
+ return;
+ }
+
+ if (mem_limit - start < size) {
+ printk(KERN_ERR "e820: ignoring [mem %#010llx-%#010llx]\n",
+ (unsigned long long)mem_limit,
+ (unsigned long long)(start + size - 1));
+ size = mem_limit - start;
+ }
+ }
+
e820x->map[x].addr = start;
e820x->map[x].size = size;
e820x->map[x].type = type;
@@ -127,7 +144,37 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
void __init e820_add_region(u64 start, u64 size, int type)
{
- __e820_add_region(&e820, start, size, type);
+ __e820_add_region(&e820, start, size, type, false);
+}
+
+/*
+ * do_add_efi_memmap() calls this function().
+ *
+ * Note: BOOT_SERVICES_{CODE,DATA} regions on some efi machines are marked
+ * as E820_RAM, and they are needed to be mapped. Please use e820_add_region()
+ * to add BOOT_SERVICES_{CODE,DATA} regions.
+ */
+void __init e820_add_limit_region(u64 start, u64 size, int type)
+{
+ /*
+ * efi_init() is called after finish_e820_parsing(), so we should
+ * check whether [start, start + size) contains address above
+ * mem_limit if the type is E820_RAM.
+ */
+ __e820_add_region(&e820, start, size, type, type == E820_RAM);
+}
+
+void __init e820_adjust_region(u64 *start, u64 *size)
+{
+ if (*start >= mem_limit) {
+ *size = 0;
+ return;
+ }
+
+ if (mem_limit - *start < *size)
+ *size = mem_limit - *start;
+
+ return;
}
static void __init e820_print_type(u32 type)
@@ -455,8 +502,9 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
/* new range is totally covered? */
if (ei->addr < start && ei_end > end) {
- __e820_add_region(e820x, start, size, new_type);
- __e820_add_region(e820x, end, ei_end - end, ei->type);
+ __e820_add_region(e820x, start, size, new_type, false);
+ __e820_add_region(e820x, end, ei_end - end, ei->type,
+ false);
ei->size = start - ei->addr;
real_updated_size += size;
continue;
@@ -469,7 +517,7 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
continue;
__e820_add_region(e820x, final_start, final_end - final_start,
- new_type);
+ new_type, false);
real_updated_size += final_end - final_start;
@@ -809,7 +857,7 @@ static int userdef __initdata;
/* "mem=nopentium" disables the 4MB page tables. */
static int __init parse_memopt(char *p)
{
- u64 mem_size;
+ char *oldp;
if (!p)
return -EINVAL;
@@ -825,11 +873,11 @@ static int __init parse_memopt(char *p)
}
userdef = 1;
- mem_size = memparse(p, &p);
+ oldp = p;
+ mem_limit = memparse(p, &p);
/* don't remove all of memory when handling "mem={invalid}" param */
- if (mem_size == 0)
+ if (mem_limit == 0 || p == oldp)
return -EINVAL;
- e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
return 0;
}
@@ -895,6 +943,12 @@ early_param("memmap", parse_memmap_opt);
void __init finish_e820_parsing(void)
{
+ if (mem_limit != ~0ULL) {
+ userdef = 1;
+ e820_remove_range(mem_limit, ULLONG_MAX - mem_limit,
+ E820_RAM, 1);
+ }
+
if (userdef) {
u32 nr = e820.nr_map;
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 845df6835f9f..62c29a5bfe26 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -115,10 +115,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
if (mmap_is_legacy()) {
mm->mmap_base = mmap_legacy_base();
mm->get_unmapped_area = arch_get_unmapped_area;
- mm->unmap_area = arch_unmap_area;
} else {
mm->mmap_base = mmap_base();
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
- mm->unmap_area = arch_unmap_area_topdown;
}
}
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index f66b54086ce5..96598170c074 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -749,6 +749,10 @@ static void jit_free_defer(struct work_struct *arg)
void bpf_jit_free(struct sk_filter *fp)
{
if (fp->bpf_func != sk_run_filter) {
+ /*
+ * bpf_jit_free() can be called from softirq; module_free()
+ * requires process context.
+ */
struct work_struct *work = (struct work_struct *)fp->bpf_func;
INIT_WORK(work, jit_free_defer);
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 55856b2310d3..83e5cc472752 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -419,10 +419,17 @@ static void __init do_add_efi_memmap(void)
int e820_type;
switch (md->type) {
- case EFI_LOADER_CODE:
- case EFI_LOADER_DATA:
case EFI_BOOT_SERVICES_CODE:
case EFI_BOOT_SERVICES_DATA:
+ /* EFI_BOOT_SERVICES_{CODE,DATA} needs to be mapped */
+ if (md->attribute & EFI_MEMORY_WB)
+ e820_type = E820_RAM;
+ else
+ e820_type = E820_RESERVED;
+ e820_add_region(start, size, e820_type);
+ continue;
+ case EFI_LOADER_CODE:
+ case EFI_LOADER_DATA:
case EFI_CONVENTIONAL_MEMORY:
if (md->attribute & EFI_MEMORY_WB)
e820_type = E820_RAM;
@@ -447,7 +454,7 @@ static void __init do_add_efi_memmap(void)
e820_type = E820_RESERVED;
break;
}
- e820_add_region(start, size, e820_type);
+ e820_add_limit_region(start, size, e820_type);
}
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
}
@@ -555,6 +562,8 @@ void __init efi_free_boot_services(void)
md->type != EFI_BOOT_SERVICES_DATA)
continue;
+ e820_adjust_region(&start, &size);
+
/* Could not reserve boot area */
if (!size)
continue;
diff --git a/block/blk-core.c b/block/blk-core.c
index 33c33bc99ddd..94aa4e75d626 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -153,7 +153,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
EXPORT_SYMBOL(blk_rq_init);
static void req_bio_endio(struct request *rq, struct bio *bio,
- unsigned int nbytes, int error)
+ unsigned int nbytes, int error,
+ struct batch_complete *batch)
{
if (error)
clear_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -167,7 +168,7 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
/* don't actually finish bio if it's part of flush sequence */
if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
- bio_endio(bio, error);
+ bio_endio_batch(bio, error, batch);
}
void blk_dump_rq_flags(struct request *rq, char *msg)
@@ -2281,7 +2282,8 @@ EXPORT_SYMBOL(blk_fetch_request);
* %false - this request doesn't have any more data
* %true - this request has more data
**/
-bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
+bool blk_update_request(struct request *req, int error, unsigned int nr_bytes,
+ struct batch_complete *batch)
{
int total_bytes;
@@ -2337,7 +2339,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
if (bio_bytes == bio->bi_size)
req->bio = bio->bi_next;
- req_bio_endio(req, bio, bio_bytes, error);
+ req_bio_endio(req, bio, bio_bytes, error, batch);
total_bytes += bio_bytes;
nr_bytes -= bio_bytes;
@@ -2390,14 +2392,15 @@ EXPORT_SYMBOL_GPL(blk_update_request);
static bool blk_update_bidi_request(struct request *rq, int error,
unsigned int nr_bytes,
- unsigned int bidi_bytes)
+ unsigned int bidi_bytes,
+ struct batch_complete *batch)
{
- if (blk_update_request(rq, error, nr_bytes))
+ if (blk_update_request(rq, error, nr_bytes, batch))
return true;
/* Bidi request must be completed as a whole */
if (unlikely(blk_bidi_rq(rq)) &&
- blk_update_request(rq->next_rq, error, bidi_bytes))
+ blk_update_request(rq->next_rq, error, bidi_bytes, batch))
return true;
if (blk_queue_add_random(rq->q))
@@ -2480,7 +2483,7 @@ static bool blk_end_bidi_request(struct request *rq, int error,
struct request_queue *q = rq->q;
unsigned long flags;
- if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
+ if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes, NULL))
return true;
spin_lock_irqsave(q->queue_lock, flags);
@@ -2506,9 +2509,11 @@ static bool blk_end_bidi_request(struct request *rq, int error,
* %true - still buffers pending for this request
**/
bool __blk_end_bidi_request(struct request *rq, int error,
- unsigned int nr_bytes, unsigned int bidi_bytes)
+ unsigned int nr_bytes,
+ unsigned int bidi_bytes,
+ struct batch_complete *batch)
{
- if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
+ if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes, batch))
return true;
blk_finish_request(rq, error);
@@ -2609,7 +2614,7 @@ EXPORT_SYMBOL_GPL(blk_end_request_err);
**/
bool __blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
{
- return __blk_end_bidi_request(rq, error, nr_bytes, 0);
+ return __blk_end_bidi_request(rq, error, nr_bytes, 0, NULL);
}
EXPORT_SYMBOL(__blk_end_request);
@@ -2621,7 +2626,8 @@ EXPORT_SYMBOL(__blk_end_request);
* Description:
* Completely finish @rq. Must be called with queue lock held.
*/
-void __blk_end_request_all(struct request *rq, int error)
+void blk_end_request_all_batch(struct request *rq, int error,
+ struct batch_complete *batch)
{
bool pending;
unsigned int bidi_bytes = 0;
@@ -2629,10 +2635,11 @@ void __blk_end_request_all(struct request *rq, int error)
if (unlikely(blk_bidi_rq(rq)))
bidi_bytes = blk_rq_bytes(rq->next_rq);
- pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
+ pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq),
+ bidi_bytes, batch);
BUG_ON(pending);
}
-EXPORT_SYMBOL(__blk_end_request_all);
+EXPORT_SYMBOL(blk_end_request_all_batch);
/**
* __blk_end_request_cur - Helper function to finish the current request chunk.
diff --git a/block/blk-flush.c b/block/blk-flush.c
index cc2b827a853c..ab0ed2358947 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -316,7 +316,7 @@ void blk_insert_flush(struct request *rq)
* complete the request.
*/
if (!policy) {
- __blk_end_bidi_request(rq, 0, 0, 0);
+ __blk_end_bidi_request(rq, 0, 0, 0, NULL);
return;
}
@@ -384,7 +384,8 @@ void blk_abort_flushes(struct request_queue *q)
}
}
-static void bio_end_flush(struct bio *bio, int err)
+static void bio_end_flush(struct bio *bio, int err,
+ struct batch_complete *batch)
{
if (err)
clear_bit(BIO_UPTODATE, &bio->bi_flags);
diff --git a/block/blk-lib.c b/block/blk-lib.c
index d6f50d572565..279f9de415be 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -15,7 +15,8 @@ struct bio_batch {
struct completion *wait;
};
-static void bio_batch_end_io(struct bio *bio, int err)
+static void bio_batch_end_io(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct bio_batch *bb = bio->bi_private;
diff --git a/block/blk.h b/block/blk.h
index e837b8f619b7..dc8fee6d41d6 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -31,7 +31,8 @@ void blk_queue_bypass_end(struct request_queue *q);
void blk_dequeue_request(struct request *rq);
void __blk_queue_free_tags(struct request_queue *q);
bool __blk_end_bidi_request(struct request *rq, int error,
- unsigned int nr_bytes, unsigned int bidi_bytes);
+ unsigned int nr_bytes, unsigned int bidi_bytes,
+ struct batch_complete *batch);
void blk_rq_timed_out_timer(unsigned long data);
void blk_delete_timer(struct request *);
diff --git a/block/genhd.c b/block/genhd.c
index 20625eed5511..5a9f8931b0e6 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -849,7 +849,7 @@ static int show_partition(struct seq_file *seqf, void *v)
char buf[BDEVNAME_SIZE];
/* Don't show non-partitionable removeable devices or empty devices */
- if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
+ if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) &&
(sgp->flags & GENHD_FL_REMOVABLE)))
return 0;
if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 9a87daa6f4fb..a5ffcc988f0b 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -27,6 +27,7 @@
#include <linux/ratelimit.h>
#include <linux/slab.h>
#include <linux/times.h>
+#include <linux/uio.h>
#include <asm/uaccess.h>
#include <scsi/scsi.h>
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 622d8a48cbe9..aa3a34907fd0 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1359,6 +1359,22 @@ config CRYPTO_842
help
This is the 842 algorithm.
+config CRYPTO_LZ4
+ tristate "LZ4 compression algorithm"
+ select CRYPTO_ALGAPI
+ select LZ4_COMPRESS
+ select LZ4_DECOMPRESS
+ help
+ This is the LZ4 algorithm.
+
+config CRYPTO_LZ4HC
+ tristate "LZ4HC compression algorithm"
+ select CRYPTO_ALGAPI
+ select LZ4HC_COMPRESS
+ select LZ4_DECOMPRESS
+ help
+ This is the LZ4 high compression mode algorithm.
+
comment "Random Number Generation"
config CRYPTO_ANSI_CPRNG
diff --git a/crypto/Makefile b/crypto/Makefile
index a8e9b0fefbe9..2ba0df2f908f 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -85,6 +85,8 @@ obj-$(CONFIG_CRYPTO_CRC32C) += crc32c.o
obj-$(CONFIG_CRYPTO_CRC32) += crc32.o
obj-$(CONFIG_CRYPTO_AUTHENC) += authenc.o authencesn.o
obj-$(CONFIG_CRYPTO_LZO) += lzo.o
+obj-$(CONFIG_CRYPTO_LZ4) += lz4.o
+obj-$(CONFIG_CRYPTO_LZ4HC) += lz4hc.o
obj-$(CONFIG_CRYPTO_842) += 842.o
obj-$(CONFIG_CRYPTO_RNG2) += rng.o
obj-$(CONFIG_CRYPTO_RNG2) += krng.o
diff --git a/crypto/lz4.c b/crypto/lz4.c
new file mode 100644
index 000000000000..4586dd15b0d8
--- /dev/null
+++ b/crypto/lz4.c
@@ -0,0 +1,106 @@
+/*
+ * Cryptographic API.
+ *
+ * Copyright (c) 2013 Chanho Min <chanho.min@lge.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/crypto.h>
+#include <linux/vmalloc.h>
+#include <linux/lz4.h>
+
+struct lz4_ctx {
+ void *lz4_comp_mem;
+};
+
+static int lz4_init(struct crypto_tfm *tfm)
+{
+ struct lz4_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ ctx->lz4_comp_mem = vmalloc(LZ4_MEM_COMPRESS);
+ if (!ctx->lz4_comp_mem)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void lz4_exit(struct crypto_tfm *tfm)
+{
+ struct lz4_ctx *ctx = crypto_tfm_ctx(tfm);
+ vfree(ctx->lz4_comp_mem);
+}
+
+static int lz4_compress_crypto(struct crypto_tfm *tfm, const u8 *src,
+ unsigned int slen, u8 *dst, unsigned int *dlen)
+{
+ struct lz4_ctx *ctx = crypto_tfm_ctx(tfm);
+ size_t tmp_len = *dlen;
+ int err;
+
+ err = lz4_compress(src, slen, dst, &tmp_len, ctx->lz4_comp_mem);
+
+ if (err < 0)
+ return -EINVAL;
+
+ *dlen = tmp_len;
+ return 0;
+}
+
+static int lz4_decompress_crypto(struct crypto_tfm *tfm, const u8 *src,
+ unsigned int slen, u8 *dst, unsigned int *dlen)
+{
+ int err;
+ size_t tmp_len = *dlen;
+ size_t __slen = slen;
+
+ err = lz4_decompress(src, &__slen, dst, tmp_len);
+ if (err < 0)
+ return -EINVAL;
+
+ *dlen = tmp_len;
+ return err;
+}
+
+static struct crypto_alg alg_lz4 = {
+ .cra_name = "lz4",
+ .cra_flags = CRYPTO_ALG_TYPE_COMPRESS,
+ .cra_ctxsize = sizeof(struct lz4_ctx),
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(alg_lz4.cra_list),
+ .cra_init = lz4_init,
+ .cra_exit = lz4_exit,
+ .cra_u = { .compress = {
+ .coa_compress = lz4_compress_crypto,
+ .coa_decompress = lz4_decompress_crypto } }
+};
+
+static int __init lz4_mod_init(void)
+{
+ return crypto_register_alg(&alg_lz4);
+}
+
+static void __exit lz4_mod_fini(void)
+{
+ crypto_unregister_alg(&alg_lz4);
+}
+
+module_init(lz4_mod_init);
+module_exit(lz4_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("LZ4 Compression Algorithm");
diff --git a/crypto/lz4hc.c b/crypto/lz4hc.c
new file mode 100644
index 000000000000..151ba31d34e3
--- /dev/null
+++ b/crypto/lz4hc.c
@@ -0,0 +1,106 @@
+/*
+ * Cryptographic API.
+ *
+ * Copyright (c) 2013 Chanho Min <chanho.min@lge.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/crypto.h>
+#include <linux/vmalloc.h>
+#include <linux/lz4.h>
+
+struct lz4hc_ctx {
+ void *lz4hc_comp_mem;
+};
+
+static int lz4hc_init(struct crypto_tfm *tfm)
+{
+ struct lz4hc_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ ctx->lz4hc_comp_mem = vmalloc(LZ4HC_MEM_COMPRESS);
+ if (!ctx->lz4hc_comp_mem)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void lz4hc_exit(struct crypto_tfm *tfm)
+{
+ struct lz4hc_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ vfree(ctx->lz4hc_comp_mem);
+}
+
+static int lz4hc_compress_crypto(struct crypto_tfm *tfm, const u8 *src,
+ unsigned int slen, u8 *dst, unsigned int *dlen)
+{
+ struct lz4hc_ctx *ctx = crypto_tfm_ctx(tfm);
+ size_t tmp_len = *dlen;
+ int err;
+
+ err = lz4hc_compress(src, slen, dst, &tmp_len, ctx->lz4hc_comp_mem);
+
+ if (err < 0)
+ return -EINVAL;
+
+ *dlen = tmp_len;
+ return 0;
+}
+
+static int lz4hc_decompress_crypto(struct crypto_tfm *tfm, const u8 *src,
+ unsigned int slen, u8 *dst, unsigned int *dlen)
+{
+ int err;
+ size_t tmp_len = *dlen;
+ size_t __slen = slen;
+
+ err = lz4_decompress(src, &__slen, dst, tmp_len);
+ if (err < 0)
+ return -EINVAL;
+
+ *dlen = tmp_len;
+ return err;
+}
+
+static struct crypto_alg alg_lz4hc = {
+ .cra_name = "lz4hc",
+ .cra_flags = CRYPTO_ALG_TYPE_COMPRESS,
+ .cra_ctxsize = sizeof(struct lz4hc_ctx),
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(alg_lz4hc.cra_list),
+ .cra_init = lz4hc_init,
+ .cra_exit = lz4hc_exit,
+ .cra_u = { .compress = {
+ .coa_compress = lz4hc_compress_crypto,
+ .coa_decompress = lz4hc_decompress_crypto } }
+};
+
+static int __init lz4hc_mod_init(void)
+{
+ return crypto_register_alg(&alg_lz4hc);
+}
+
+static void __exit lz4hc_mod_fini(void)
+{
+ crypto_unregister_alg(&alg_lz4hc);
+}
+
+module_init(lz4hc_mod_init);
+module_exit(lz4hc_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("LZ4HC Compression Algorithm");
diff --git a/drivers/block/blockconsole.c b/drivers/block/blockconsole.c
index 65f8ace6dc02..656c5a6c1f4e 100644
--- a/drivers/block/blockconsole.c
+++ b/drivers/block/blockconsole.c
@@ -164,7 +164,8 @@ static void bcon_advance_console_bytes(struct blockconsole *bc, int bytes)
} while (cmpxchg64(&bc->console_bytes, old, new) != old);
}
-static void request_complete(struct bio *bio, int err)
+static void request_complete(struct bio *bio, int err,
+ struct batch_complete *batch)
{
complete((struct completion *)bio->bi_private);
}
@@ -289,7 +290,7 @@ static void bcon_unregister(struct work_struct *work)
}
#define BCON_MAX_ERRORS 10
-static void bcon_end_io(struct bio *bio, int err)
+static void bcon_end_io(struct bio *bio, int err, struct batch_complete *batch)
{
struct bcon_bio *bcon_bio = container_of(bio, struct bcon_bio, bio);
struct blockconsole *bc = bio->bi_private;
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 64fbb8385cdc..046aa1793514 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -948,7 +948,8 @@ static void bm_aio_ctx_destroy(struct kref *kref)
}
/* bv_page may be a copy, or may be the original */
-static void bm_async_io_complete(struct bio *bio, int error)
+static void bm_async_io_complete(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct bm_aio_ctx *ctx = bio->bi_private;
struct drbd_conf *mdev = ctx->mdev;
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 891c0ecaa292..04a80af8fddb 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -64,7 +64,8 @@ rwlock_t global_state_lock;
/* used for synchronous meta data and bitmap IO
* submitted by drbd_md_sync_page_io()
*/
-void drbd_md_io_complete(struct bio *bio, int error)
+void drbd_md_io_complete(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct drbd_md_io *md_io;
struct drbd_conf *mdev;
@@ -167,7 +168,8 @@ static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __rel
/* writes on behalf of the partner, or resync writes,
* "submitted" by the receiver.
*/
-void drbd_peer_request_endio(struct bio *bio, int error)
+void drbd_peer_request_endio(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct drbd_peer_request *peer_req = bio->bi_private;
struct drbd_conf *mdev = peer_req->w.mdev;
@@ -203,7 +205,8 @@ void drbd_peer_request_endio(struct bio *bio, int error)
/* read, readA or write requests on R_PRIMARY coming from drbd_make_request
*/
-void drbd_request_endio(struct bio *bio, int error)
+void drbd_request_endio(struct bio *bio, int error,
+ struct batch_complete *batch)
{
unsigned long flags;
struct drbd_request *req = bio->bi_private;
diff --git a/drivers/block/drbd/drbd_wrappers.h b/drivers/block/drbd/drbd_wrappers.h
index 328f18e4b4ee..d443dc06b854 100644
--- a/drivers/block/drbd/drbd_wrappers.h
+++ b/drivers/block/drbd/drbd_wrappers.h
@@ -20,9 +20,12 @@ static inline void drbd_set_my_capacity(struct drbd_conf *mdev,
#define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE)
/* bi_end_io handlers */
-extern void drbd_md_io_complete(struct bio *bio, int error);
-extern void drbd_peer_request_endio(struct bio *bio, int error);
-extern void drbd_request_endio(struct bio *bio, int error);
+extern void drbd_md_io_complete(struct bio *bio, int error,
+ struct batch_complete *batch);
+extern void drbd_peer_request_endio(struct bio *bio, int error,
+ struct batch_complete *batch);
+extern void drbd_request_endio(struct bio *bio, int error,
+ struct batch_complete *batch);
/*
* used to submit our private bio
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 83232639034e..629b6d506cd0 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -3748,7 +3748,8 @@ static unsigned int floppy_check_events(struct gendisk *disk,
* a disk in the drive, and whether that disk is writable.
*/
-static void floppy_rb0_complete(struct bio *bio, int err)
+static void floppy_rb0_complete(struct bio *bio, int err,
+ struct batch_complete *batch)
{
complete((struct completion *)bio->bi_private);
}
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 847107ef0cce..126232104173 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -151,6 +151,9 @@ static void mtip_command_cleanup(struct driver_data *dd)
struct mtip_cmd *command;
struct mtip_port *port = dd->port;
static int in_progress;
+ struct batch_complete batch;
+
+ batch_complete_init(&batch);
if (in_progress)
return;
@@ -166,11 +169,9 @@ static void mtip_command_cleanup(struct driver_data *dd)
command = &port->commands[commandindex];
if (atomic_read(&command->active)
- && (command->async_callback)) {
- command->async_callback(command->async_data,
- -ENODEV);
- command->async_callback = NULL;
- command->async_data = NULL;
+ && (command->bio)) {
+ bio_endio_batch(command->bio, -ENODEV, &batch);
+ command->bio = NULL;
}
dma_unmap_sg(&port->dd->pdev->dev,
@@ -178,9 +179,10 @@ static void mtip_command_cleanup(struct driver_data *dd)
command->scatter_ents,
command->direction);
}
+ up(&port->cmd_slot);
}
- up(&port->cmd_slot);
+ batch_complete(&batch);
set_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag);
in_progress = 0;
@@ -580,6 +582,9 @@ static void mtip_timeout_function(unsigned long int data)
unsigned int bit, group;
unsigned int num_command_slots;
unsigned long to, tagaccum[SLOTBITS_IN_LONGS];
+ struct batch_complete batch;
+
+ batch_complete_init(&batch);
if (unlikely(!port))
return;
@@ -622,11 +627,9 @@ static void mtip_timeout_function(unsigned long int data)
writel(1 << bit, port->completed[group]);
/* Call the async completion callback. */
- if (likely(command->async_callback))
- command->async_callback(command->async_data,
- -EIO);
- command->async_callback = NULL;
- command->comp_func = NULL;
+ if (likely(command->bio))
+ bio_endio_batch(command->bio, -EIO, &batch);
+ command->bio = NULL;
/* Unmap the DMA scatter list entries */
dma_unmap_sg(&port->dd->pdev->dev,
@@ -645,6 +648,8 @@ static void mtip_timeout_function(unsigned long int data)
}
}
+ batch_complete(&batch);
+
if (cmdto_cnt) {
print_tags(port->dd, "timed out", tagaccum, cmdto_cnt);
if (!test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) {
@@ -695,7 +700,8 @@ static void mtip_timeout_function(unsigned long int data)
static void mtip_async_complete(struct mtip_port *port,
int tag,
void *data,
- int status)
+ int status,
+ struct batch_complete *batch)
{
struct mtip_cmd *command;
struct driver_data *dd = data;
@@ -712,11 +718,10 @@ static void mtip_async_complete(struct mtip_port *port,
}
/* Upper layer callback */
- if (likely(command->async_callback))
- command->async_callback(command->async_data, cb_status);
+ if (likely(command->bio))
+ bio_endio_batch(command->bio, cb_status, batch);
- command->async_callback = NULL;
- command->comp_func = NULL;
+ command->bio = NULL;
/* Unmap the DMA scatter list entries */
dma_unmap_sg(&dd->pdev->dev,
@@ -752,24 +757,22 @@ static void mtip_async_complete(struct mtip_port *port,
static void mtip_completion(struct mtip_port *port,
int tag,
void *data,
- int status)
+ int status,
+ struct batch_complete *batch)
{
- struct mtip_cmd *command = &port->commands[tag];
struct completion *waiting = data;
if (unlikely(status == PORT_IRQ_TF_ERR))
dev_warn(&port->dd->pdev->dev,
"Internal command %d completed with TFE\n", tag);
- command->async_callback = NULL;
- command->comp_func = NULL;
-
complete(waiting);
}
static void mtip_null_completion(struct mtip_port *port,
int tag,
void *data,
- int status)
+ int status,
+ struct batch_complete *batch)
{
return;
}
@@ -798,6 +801,7 @@ static void mtip_handle_tfe(struct driver_data *dd)
unsigned char *buf;
char *fail_reason = NULL;
int fail_all_ncq_write = 0, fail_all_ncq_cmds = 0;
+ struct batch_complete batch;
dev_warn(&dd->pdev->dev, "Taskfile error\n");
@@ -815,13 +819,14 @@ static void mtip_handle_tfe(struct driver_data *dd)
atomic_inc(&cmd->active); /* active > 1 indicates error */
if (cmd->comp_data && cmd->comp_func) {
cmd->comp_func(port, MTIP_TAG_INTERNAL,
- cmd->comp_data, PORT_IRQ_TF_ERR);
+ cmd->comp_data, PORT_IRQ_TF_ERR, NULL);
}
goto handle_tfe_exit;
}
/* clear the tag accumulator */
memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long));
+ batch_complete_init(&batch);
/* Loop through all the groups */
for (group = 0; group < dd->slot_groups; group++) {
@@ -848,7 +853,7 @@ static void mtip_handle_tfe(struct driver_data *dd)
cmd->comp_func(port,
tag,
cmd->comp_data,
- 0);
+ 0, &batch);
} else {
dev_err(&port->dd->pdev->dev,
"Missing completion func for tag %d",
@@ -861,6 +866,7 @@ static void mtip_handle_tfe(struct driver_data *dd)
}
}
}
+ batch_complete(&batch);
print_tags(dd, "completed (TFE)", tagaccum, cmd_cnt);
@@ -902,6 +908,7 @@ static void mtip_handle_tfe(struct driver_data *dd)
/* clear the tag accumulator */
memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long));
+ batch_complete_init(&batch);
/* Loop through all the groups */
for (group = 0; group < dd->slot_groups; group++) {
@@ -935,7 +942,7 @@ static void mtip_handle_tfe(struct driver_data *dd)
if (cmd->comp_func) {
cmd->comp_func(port, tag,
cmd->comp_data,
- -ENODATA);
+ -ENODATA, &batch);
}
continue;
}
@@ -965,13 +972,15 @@ static void mtip_handle_tfe(struct driver_data *dd)
port,
tag,
cmd->comp_data,
- PORT_IRQ_TF_ERR);
+ PORT_IRQ_TF_ERR, &batch);
else
dev_warn(&port->dd->pdev->dev,
"Bad completion for tag %d\n",
tag);
}
}
+
+ batch_complete(&batch);
print_tags(dd, "reissued (TFE)", tagaccum, cmd_cnt);
handle_tfe_exit:
@@ -992,6 +1001,9 @@ static inline void mtip_workq_sdbfx(struct mtip_port *port, int group,
struct driver_data *dd = port->dd;
int tag, bit;
struct mtip_cmd *command;
+ struct batch_complete batch;
+
+ batch_complete_init(&batch);
if (!completed) {
WARN_ON_ONCE(!completed);
@@ -1016,7 +1028,8 @@ static inline void mtip_workq_sdbfx(struct mtip_port *port, int group,
port,
tag,
command->comp_data,
- 0);
+ 0,
+ &batch);
} else {
dev_warn(&dd->pdev->dev,
"Null completion "
@@ -1026,13 +1039,16 @@ static inline void mtip_workq_sdbfx(struct mtip_port *port, int group,
if (mtip_check_surprise_removal(
dd->pdev)) {
mtip_command_cleanup(dd);
- return;
+ goto out;
}
}
}
completed >>= 1;
}
+out:
+ batch_complete(&batch);
+
/* If last, re-enable interrupts */
if (atomic_dec_return(&dd->irq_workers_active) == 0)
writel(0xffffffff, dd->mmio + HOST_IRQ_STAT);
@@ -1053,7 +1069,7 @@ static inline void mtip_process_legacy(struct driver_data *dd, u32 port_stat)
cmd->comp_func(port,
MTIP_TAG_INTERNAL,
cmd->comp_data,
- 0);
+ 0, NULL);
return;
}
}
@@ -2561,8 +2577,8 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd,
* None
*/
static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector,
- int nsect, int nents, int tag, void *callback,
- void *data, int dir, int unaligned)
+ int nsect, int nents, int tag,
+ struct bio *bio, int dir, int unaligned)
{
struct host_to_dev_fis *fis;
struct mtip_port *port = dd->port;
@@ -2621,12 +2637,7 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector,
command->comp_func = mtip_async_complete;
command->direction = dma_dir;
- /*
- * Set the completion function and data for the command passed
- * from the upper layer.
- */
- command->async_data = data;
- command->async_callback = callback;
+ command->bio = bio;
/*
* To prevent this command from being issued
@@ -3934,7 +3945,6 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio)
bio_sectors(bio),
nents,
tag,
- bio_endio,
bio,
bio_data_dir(bio),
unaligned);
diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h
index 3bb8a295fbe4..7a2ddfdde9f4 100644
--- a/drivers/block/mtip32xx/mtip32xx.h
+++ b/drivers/block/mtip32xx/mtip32xx.h
@@ -328,11 +328,9 @@ struct mtip_cmd {
void (*comp_func)(struct mtip_port *port,
int tag,
void *data,
- int status);
- /* Additional callback function that may be called by comp_func() */
- void (*async_callback)(void *data, int status);
-
- void *async_data; /* Addl. data passed to async_callback() */
+ int status,
+ struct batch_complete *batch);
+ struct bio *bio;
int scatter_ents; /* Number of scatter list entries used */
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index dcb18a3d3314..ce42c14808ac 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -980,7 +980,8 @@ static void pkt_make_local_copy(struct packet_data *pkt, struct bio_vec *bvec)
}
}
-static void pkt_end_io_read(struct bio *bio, int err)
+static void pkt_end_io_read(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct packet_data *pkt = bio->bi_private;
struct pktcdvd_device *pd = pkt->pd;
@@ -998,7 +999,8 @@ static void pkt_end_io_read(struct bio *bio, int err)
pkt_bio_finished(pd);
}
-static void pkt_end_io_packet_write(struct bio *bio, int err)
+static void pkt_end_io_packet_write(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct packet_data *pkt = bio->bi_private;
struct pktcdvd_device *pd = pkt->pd;
@@ -2339,7 +2341,8 @@ static int pkt_close(struct gendisk *disk, fmode_t mode)
}
-static void pkt_end_io_read_cloned(struct bio *bio, int err)
+static void pkt_end_io_read_cloned(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct packet_stacked_data *psd = bio->bi_private;
struct pktcdvd_device *pd = psd->pd;
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index 758f2ac878cf..deb722d63d68 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -775,7 +775,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
if (intr & ERROR_INTR) {
n = fs->scount - 1 - resid / 512;
if (n > 0) {
- blk_update_request(req, 0, n << 9);
+ blk_update_request(req, 0, n << 9, NULL);
fs->req_sector += n;
}
if (fs->retries < 5) {
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 64723953e1c9..49d0ec2472c5 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -217,7 +217,8 @@ static void virtblk_bio_send_flush_work(struct work_struct *work)
virtblk_bio_send_flush(vbr);
}
-static inline void virtblk_request_done(struct virtblk_req *vbr)
+static inline void virtblk_request_done(struct virtblk_req *vbr,
+ struct batch_complete *batch)
{
struct virtio_blk *vblk = vbr->vblk;
struct request *req = vbr->req;
@@ -231,11 +232,12 @@ static inline void virtblk_request_done(struct virtblk_req *vbr)
req->errors = (error != 0);
}
- __blk_end_request_all(req, error);
+ blk_end_request_all_batch(req, error, batch);
mempool_free(vbr, vblk->pool);
}
-static inline void virtblk_bio_flush_done(struct virtblk_req *vbr)
+static inline void virtblk_bio_flush_done(struct virtblk_req *vbr,
+ struct batch_complete *batch)
{
struct virtio_blk *vblk = vbr->vblk;
@@ -244,12 +246,13 @@ static inline void virtblk_bio_flush_done(struct virtblk_req *vbr)
INIT_WORK(&vbr->work, virtblk_bio_send_data_work);
queue_work(virtblk_wq, &vbr->work);
} else {
- bio_endio(vbr->bio, virtblk_result(vbr));
+ bio_endio_batch(vbr->bio, virtblk_result(vbr), batch);
mempool_free(vbr, vblk->pool);
}
}
-static inline void virtblk_bio_data_done(struct virtblk_req *vbr)
+static inline void virtblk_bio_data_done(struct virtblk_req *vbr,
+ struct batch_complete *batch)
{
struct virtio_blk *vblk = vbr->vblk;
@@ -259,17 +262,18 @@ static inline void virtblk_bio_data_done(struct virtblk_req *vbr)
INIT_WORK(&vbr->work, virtblk_bio_send_flush_work);
queue_work(virtblk_wq, &vbr->work);
} else {
- bio_endio(vbr->bio, virtblk_result(vbr));
+ bio_endio_batch(vbr->bio, virtblk_result(vbr), batch);
mempool_free(vbr, vblk->pool);
}
}
-static inline void virtblk_bio_done(struct virtblk_req *vbr)
+static inline void virtblk_bio_done(struct virtblk_req *vbr,
+ struct batch_complete *batch)
{
if (unlikely(vbr->flags & VBLK_IS_FLUSH))
- virtblk_bio_flush_done(vbr);
+ virtblk_bio_flush_done(vbr, batch);
else
- virtblk_bio_data_done(vbr);
+ virtblk_bio_data_done(vbr, batch);
}
static void virtblk_done(struct virtqueue *vq)
@@ -279,16 +283,19 @@ static void virtblk_done(struct virtqueue *vq)
struct virtblk_req *vbr;
unsigned long flags;
unsigned int len;
+ struct batch_complete batch;
+
+ batch_complete_init(&batch);
spin_lock_irqsave(vblk->disk->queue->queue_lock, flags);
do {
virtqueue_disable_cb(vq);
while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) {
if (vbr->bio) {
- virtblk_bio_done(vbr);
+ virtblk_bio_done(vbr, &batch);
bio_done = true;
} else {
- virtblk_request_done(vbr);
+ virtblk_request_done(vbr, &batch);
req_done = true;
}
}
@@ -298,6 +305,8 @@ static void virtblk_done(struct virtqueue *vq)
blk_start_queue(vblk->disk->queue);
spin_unlock_irqrestore(vblk->disk->queue->queue_lock, flags);
+ batch_complete(&batch);
+
if (bio_done)
wake_up(&vblk->queue_wait);
}
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index dd5b2fed97e9..990c1d81849f 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -741,7 +741,8 @@ static void __end_block_io_op(struct pending_req *pending_req, int error)
/*
* bio callback.
*/
-static void end_block_io_op(struct bio *bio, int error)
+static void end_block_io_op(struct bio *bio, int error,
+ struct batch_complete *batch)
{
__end_block_io_op(bio->bi_private, error);
bio_put(bio);
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 2c644afbcdd4..1ccbe9482faa 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -28,6 +28,7 @@
#include <linux/pfn.h>
#include <linux/export.h>
#include <linux/io.h>
+#include <linux/aio.h>
#include <asm/uaccess.h>
@@ -627,6 +628,18 @@ static ssize_t write_null(struct file *file, const char __user *buf,
return count;
}
+static ssize_t aio_read_null(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ return 0;
+}
+
+static ssize_t aio_write_null(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ return iov_length(iov, nr_segs);
+}
+
static int pipe_to_null(struct pipe_inode_info *info, struct pipe_buffer *buf,
struct splice_desc *sd)
{
@@ -670,6 +683,24 @@ static ssize_t read_zero(struct file *file, char __user *buf,
return written ? written : -EFAULT;
}
+static ssize_t aio_read_zero(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ size_t written = 0;
+ unsigned long i;
+ ssize_t ret;
+
+ for (i = 0; i < nr_segs; i++) {
+ ret = read_zero(iocb->ki_filp, iov[i].iov_base, iov[i].iov_len,
+ &pos);
+ if (ret < 0)
+ break;
+ written += ret;
+ }
+
+ return written ? written : -EFAULT;
+}
+
static int mmap_zero(struct file *file, struct vm_area_struct *vma)
{
#ifndef CONFIG_MMU
@@ -738,6 +769,7 @@ static int open_port(struct inode *inode, struct file *filp)
#define full_lseek null_lseek
#define write_zero write_null
#define read_full read_zero
+#define aio_write_zero aio_write_null
#define open_mem open_port
#define open_kmem open_mem
#define open_oldmem open_mem
@@ -766,6 +798,8 @@ static const struct file_operations null_fops = {
.llseek = null_lseek,
.read = read_null,
.write = write_null,
+ .aio_read = aio_read_null,
+ .aio_write = aio_write_null,
.splice_write = splice_write_null,
};
@@ -782,6 +816,8 @@ static const struct file_operations zero_fops = {
.llseek = zero_lseek,
.read = read_zero,
.write = write_zero,
+ .aio_read = aio_read_zero,
+ .aio_write = aio_write_zero,
.mmap = mmap_zero,
};
diff --git a/drivers/char/mwave/tp3780i.c b/drivers/char/mwave/tp3780i.c
index c68969708068..04e6d6a27994 100644
--- a/drivers/char/mwave/tp3780i.c
+++ b/drivers/char/mwave/tp3780i.c
@@ -479,6 +479,7 @@ int tp3780I_QueryAbilities(THINKPAD_BD_DATA * pBDData, MW_ABILITIES * pAbilities
PRINTK_2(TRACE_TP3780I,
"tp3780i::tp3780I_QueryAbilities entry pBDData %p\n", pBDData);
+ memset(pAbilities, 0, sizeof(*pAbilities));
/* fill out standard constant fields */
pAbilities->instr_per_sec = pBDData->rDspSettings.uIps;
pAbilities->data_size = pBDData->rDspSettings.uDStoreSize;
diff --git a/drivers/char/random.c b/drivers/char/random.c
index cd9a6211dcad..35487e8ded59 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -865,16 +865,24 @@ static size_t account(struct entropy_store *r, size_t nbytes, int min,
if (r->entropy_count / 8 < min + reserved) {
nbytes = 0;
} else {
+ int entropy_count, orig;
+retry:
+ entropy_count = orig = ACCESS_ONCE(r->entropy_count);
/* If limited, never pull more than available */
- if (r->limit && nbytes + reserved >= r->entropy_count / 8)
- nbytes = r->entropy_count/8 - reserved;
-
- if (r->entropy_count / 8 >= nbytes + reserved)
- r->entropy_count -= nbytes*8;
- else
- r->entropy_count = reserved;
+ if (r->limit && nbytes + reserved >= entropy_count / 8)
+ nbytes = entropy_count/8 - reserved;
+
+ if (entropy_count / 8 >= nbytes + reserved) {
+ entropy_count -= nbytes*8;
+ if (cmpxchg(&r->entropy_count, orig, entropy_count) != orig)
+ goto retry;
+ } else {
+ entropy_count = reserved;
+ if (cmpxchg(&r->entropy_count, orig, entropy_count) != orig)
+ goto retry;
+ }
- if (r->entropy_count < random_write_wakeup_thresh)
+ if (entropy_count < random_write_wakeup_thresh)
wakeup_write = 1;
}
@@ -957,10 +965,23 @@ static ssize_t extract_entropy(struct entropy_store *r, void *buf,
{
ssize_t ret = 0, i;
__u8 tmp[EXTRACT_SIZE];
+ unsigned long flags;
/* if last_data isn't primed, we need EXTRACT_SIZE extra bytes */
- if (fips_enabled && !r->last_data_init)
- nbytes += EXTRACT_SIZE;
+ if (fips_enabled) {
+ spin_lock_irqsave(&r->lock, flags);
+ if (!r->last_data_init) {
+ r->last_data_init = true;
+ spin_unlock_irqrestore(&r->lock, flags);
+ trace_extract_entropy(r->name, EXTRACT_SIZE,
+ r->entropy_count, _RET_IP_);
+ xfer_secondary_pool(r, EXTRACT_SIZE);
+ extract_buf(r, tmp);
+ spin_lock_irqsave(&r->lock, flags);
+ memcpy(r->last_data, tmp, EXTRACT_SIZE);
+ }
+ spin_unlock_irqrestore(&r->lock, flags);
+ }
trace_extract_entropy(r->name, nbytes, r->entropy_count, _RET_IP_);
xfer_secondary_pool(r, nbytes);
@@ -970,19 +991,6 @@ static ssize_t extract_entropy(struct entropy_store *r, void *buf,
extract_buf(r, tmp);
if (fips_enabled) {
- unsigned long flags;
-
-
- /* prime last_data value if need be, per fips 140-2 */
- if (!r->last_data_init) {
- spin_lock_irqsave(&r->lock, flags);
- memcpy(r->last_data, tmp, EXTRACT_SIZE);
- r->last_data_init = true;
- nbytes -= EXTRACT_SIZE;
- spin_unlock_irqrestore(&r->lock, flags);
- extract_buf(r, tmp);
- }
-
spin_lock_irqsave(&r->lock, flags);
if (!memcmp(tmp, r->last_data, EXTRACT_SIZE))
panic("Hardware RNG duplicated output!\n");
diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c
index b78cbe74dadf..442a15443fa5 100644
--- a/drivers/gpu/drm/drm_fb_helper.c
+++ b/drivers/gpu/drm/drm_fb_helper.c
@@ -399,6 +399,14 @@ static void drm_fb_helper_dpms(struct fb_info *info, int dpms_mode)
return;
/*
+ * fbdev->blank can be called from irq context in case of a panic.
+ * Since we already have our own special panic handler which will
+ * restore the fbdev console mode completely, just bail out early.
+ */
+ if (oops_in_progress)
+ return;
+
+ /*
* For each CRTC in this fb, turn the connectors on/off.
*/
drm_modeset_lock_all(dev);
diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c
index aed8afee56da..6d7f453b4d05 100644
--- a/drivers/infiniband/hw/ipath/ipath_file_ops.c
+++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c
@@ -40,6 +40,7 @@
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/io.h>
+#include <linux/aio.h>
#include <linux/jiffies.h>
#include <linux/cpu.h>
#include <asm/pgtable.h>
diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c
index 4f7aa301b3b1..b56c9428f3c5 100644
--- a/drivers/infiniband/hw/qib/qib_file_ops.c
+++ b/drivers/infiniband/hw/qib/qib_file_ops.c
@@ -39,7 +39,7 @@
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/io.h>
-#include <linux/uio.h>
+#include <linux/aio.h>
#include <linux/jiffies.h>
#include <asm/pgtable.h>
#include <linux/delay.h>
diff --git a/drivers/leds/leds-ot200.c b/drivers/leds/leds-ot200.c
index ee14662ed5ce..98cae529373f 100644
--- a/drivers/leds/leds-ot200.c
+++ b/drivers/leds/leds-ot200.c
@@ -47,37 +47,37 @@ static struct ot200_led leds[] = {
{
.name = "led_1",
.port = 0x49,
- .mask = BIT(7),
+ .mask = BIT(6),
},
{
.name = "led_2",
.port = 0x49,
- .mask = BIT(6),
+ .mask = BIT(5),
},
{
.name = "led_3",
.port = 0x49,
- .mask = BIT(5),
+ .mask = BIT(4),
},
{
.name = "led_4",
.port = 0x49,
- .mask = BIT(4),
+ .mask = BIT(3),
},
{
.name = "led_5",
.port = 0x49,
- .mask = BIT(3),
+ .mask = BIT(2),
},
{
.name = "led_6",
.port = 0x49,
- .mask = BIT(2),
+ .mask = BIT(1),
},
{
.name = "led_7",
.port = 0x49,
- .mask = BIT(1),
+ .mask = BIT(0),
}
};
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 048f2947e08b..1f75edd7f329 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -156,7 +156,8 @@ static void discard_finish(struct work_struct *w)
closure_put(&ca->set->cl);
}
-static void discard_endio(struct bio *bio, int error)
+static void discard_endio(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct discard *d = container_of(bio, struct discard, bio);
schedule_work(&d->work);
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 7a5658f04e62..36688d6bb061 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -134,7 +134,8 @@ static uint64_t btree_csum_set(struct btree *b, struct bset *i)
return crc ^ 0xffffffffffffffffULL;
}
-static void btree_bio_endio(struct bio *bio, int error)
+static void btree_bio_endio(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct closure *cl = bio->bi_private;
struct btree *b = container_of(cl, struct btree, io.cl);
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 89fd5204924e..3a32b063040b 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -177,7 +177,8 @@ void bch_btree_verify(struct btree *b, struct bset *new)
mutex_unlock(&b->c->verify_lock);
}
-static void data_verify_endio(struct bio *bio, int error)
+static void data_verify_endio(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct closure *cl = bio->bi_private;
closure_put(cl);
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index 48efd4dea645..29f344b9e408 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -9,7 +9,8 @@
#include "bset.h"
#include "debug.h"
-static void bch_bi_idx_hack_endio(struct bio *bio, int error)
+static void bch_bi_idx_hack_endio(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct bio *p = bio->bi_private;
@@ -206,7 +207,8 @@ static void bch_bio_submit_split_done(struct closure *cl)
mempool_free(s, s->p->bio_split_hook);
}
-static void bch_bio_submit_split_endio(struct bio *bio, int error)
+static void bch_bio_submit_split_endio(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct closure *cl = bio->bi_private;
struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 8c8dfdcd9d4c..bff194bb3e08 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -22,7 +22,8 @@
* bit.
*/
-static void journal_read_endio(struct bio *bio, int error)
+static void journal_read_endio(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct closure *cl = bio->bi_private;
closure_put(cl);
@@ -390,7 +391,8 @@ found:
#define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1)
-static void journal_discard_endio(struct bio *bio, int error)
+static void journal_discard_endio(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct journal_device *ja =
container_of(bio, struct journal_device, discard_bio);
@@ -535,7 +537,8 @@ void bch_journal_next(struct journal *j)
pr_debug("journal_pin full (%zu)", fifo_used(&j->pin));
}
-static void journal_write_endio(struct bio *bio, int error)
+static void journal_write_endio(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct journal_write *w = bio->bi_private;
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 8589512c972e..8bf7ae12d4b0 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -61,7 +61,8 @@ static void write_moving_finish(struct closure *cl)
closure_return_with_destructor(cl, moving_io_destructor);
}
-static void read_moving_endio(struct bio *bio, int error)
+static void read_moving_endio(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct moving_io *io = container_of(bio->bi_private,
struct moving_io, s.cl);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index e5ff12e52d5b..bc837edd52ea 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -456,7 +456,8 @@ static void bch_insert_data_error(struct closure *cl)
bch_journal(cl);
}
-static void bch_insert_data_endio(struct bio *bio, int error)
+static void bch_insert_data_endio(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct closure *cl = bio->bi_private;
struct btree_op *op = container_of(cl, struct btree_op, cl);
@@ -621,7 +622,8 @@ void bch_btree_insert_async(struct closure *cl)
/* Common code for the make_request functions */
-static void request_endio(struct bio *bio, int error)
+static void request_endio(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct closure *cl = bio->bi_private;
@@ -636,7 +638,8 @@ static void request_endio(struct bio *bio, int error)
closure_put(cl);
}
-void bch_cache_read_endio(struct bio *bio, int error)
+void bch_cache_read_endio(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct bbio *b = container_of(bio, struct bbio, bio);
struct closure *cl = bio->bi_private;
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 254d9ab5707c..3b794625c4c1 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -29,11 +29,10 @@ struct search {
struct btree_op op;
};
-void bch_cache_read_endio(struct bio *, int);
+void bch_cache_read_endio(struct bio *, int, struct batch_complete *batch);
int bch_get_congested(struct cache_set *);
void bch_insert_data(struct closure *cl);
void bch_btree_insert_async(struct closure *);
-void bch_cache_read_endio(struct bio *, int);
void bch_open_buckets_free(struct cache_set *);
int bch_open_buckets_alloc(struct cache_set *);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index c8046bc4aa57..76c7f6cc5c56 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -224,7 +224,8 @@ err:
return err;
}
-static void write_bdev_super_endio(struct bio *bio, int error)
+static void write_bdev_super_endio(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct cached_dev *dc = bio->bi_private;
/* XXX: error checking */
@@ -285,7 +286,8 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
closure_return(cl);
}
-static void write_super_endio(struct bio *bio, int error)
+static void write_super_endio(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct cache *ca = bio->bi_private;
@@ -326,7 +328,7 @@ void bcache_write_super(struct cache_set *c)
/* UUID io */
-static void uuid_endio(struct bio *bio, int error)
+static void uuid_endio(struct bio *bio, int error, struct batch_complete *batch)
{
struct closure *cl = bio->bi_private;
struct cache_set *c = container_of(cl, struct cache_set, uuid_write.cl);
@@ -490,7 +492,8 @@ static struct uuid_entry *uuid_find_empty(struct cache_set *c)
* disk.
*/
-static void prio_endio(struct bio *bio, int error)
+static void prio_endio(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct cache *ca = bio->bi_private;
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 93e7e31a4bd3..daf9347833be 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -253,7 +253,8 @@ static void write_dirty_finish(struct closure *cl)
closure_return_with_destructor(cl, dirty_io_destructor);
}
-static void dirty_endio(struct bio *bio, int error)
+static void dirty_endio(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct keybuf_key *w = bio->bi_private;
struct dirty_io *io = w->private;
@@ -281,7 +282,8 @@ static void write_dirty(struct closure *cl)
continue_at(cl, write_dirty_finish, dirty_wq);
}
-static void read_dirty_endio(struct bio *bio, int error)
+static void read_dirty_endio(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct keybuf_key *w = bio->bi_private;
struct dirty_io *io = w->private;
@@ -289,7 +291,7 @@ static void read_dirty_endio(struct bio *bio, int error)
bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
error, "reading dirty data from cache");
- dirty_endio(bio, error);
+ dirty_endio(bio, error, NULL);
}
static void read_dirty_submit(struct closure *cl)
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 0387e05cdb98..d489dfd306e6 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -494,7 +494,7 @@ static void dmio_complete(unsigned long error, void *context)
{
struct dm_buffer *b = context;
- b->bio.bi_end_io(&b->bio, error ? -EIO : 0);
+ b->bio.bi_end_io(&b->bio, error ? -EIO : 0, NULL);
}
static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
@@ -525,7 +525,7 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
r = dm_io(&io_req, 1, &region, NULL);
if (r)
- end_io(&b->bio, r);
+ end_io(&b->bio, r, NULL);
}
static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
@@ -592,7 +592,8 @@ static void submit_io(struct dm_buffer *b, int rw, sector_t block,
* Set the error, clear B_WRITING bit and wake anyone who was waiting on
* it.
*/
-static void write_endio(struct bio *bio, int error)
+static void write_endio(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
@@ -965,7 +966,7 @@ found_buffer:
* The endio routine for reading: set the error, clear the bit and wake up
* anyone waiting on the buffer.
*/
-static void read_endio(struct bio *bio, int error)
+static void read_endio(struct bio *bio, int error, struct batch_complete *batch)
{
struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 6feaba24fcac..c44f5e54f12b 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -652,7 +652,8 @@ static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
wake_worker(cache);
}
-static void writethrough_endio(struct bio *bio, int err)
+static void writethrough_endio(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
bio->bi_end_io = pb->saved_bi_end_io;
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 6d2d41ae9e32..ec0e3c0883b5 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -929,7 +929,8 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
* The work is done per CPU global for all dm-crypt instances.
* They should not depend on each other and do not block.
*/
-static void crypt_endio(struct bio *clone, int error)
+static void crypt_endio(struct bio *clone, int error,
+ struct batch_complete *batch)
{
struct dm_crypt_io *io = clone->bi_private;
struct crypt_config *cc = io->cc;
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index ea49834377c8..a727b267f86d 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -136,7 +136,7 @@ static void dec_count(struct io *io, unsigned int region, int error)
}
}
-static void endio(struct bio *bio, int error)
+static void endio(struct bio *bio, int error, struct batch_complete *batch)
{
struct io *io;
unsigned region;
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index c434e5aab2df..fb3ea3c63fd1 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1486,7 +1486,8 @@ static void start_copy(struct dm_snap_pending_exception *pe)
dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe);
}
-static void full_bio_end_io(struct bio *bio, int error)
+static void full_bio_end_io(struct bio *bio, int error,
+ struct batch_complete *batch)
{
void *callback_data = bio->bi_private;
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 004ad1652b73..905b75f60cc7 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -553,7 +553,8 @@ static void copy_complete(int read_err, unsigned long write_err, void *context)
spin_unlock_irqrestore(&pool->lock, flags);
}
-static void overwrite_endio(struct bio *bio, int err)
+static void overwrite_endio(struct bio *bio, int err,
+ struct batch_complete *batch)
{
unsigned long flags;
struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index b948fd864d45..b373bb7d1c2d 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -413,7 +413,8 @@ static void verity_work(struct work_struct *w)
verity_finish_io(io, verity_verify_io(io));
}
-static void verity_end_io(struct bio *bio, int error)
+static void verity_end_io(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct dm_verity_io *io = bio->bi_private;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 9a0bdad9ad8f..0baa5e75683e 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -617,7 +617,8 @@ static void dec_pending(struct dm_io *io, int error)
}
}
-static void clone_endio(struct bio *bio, int error)
+static void clone_endio(struct bio *bio, int error,
+ struct batch_complete *batch)
{
int r = 0;
struct dm_target_io *tio = bio->bi_private;
@@ -652,7 +653,8 @@ static void clone_endio(struct bio *bio, int error)
/*
* Partial completion handling for request-based dm
*/
-static void end_clone_bio(struct bio *clone, int error)
+static void end_clone_bio(struct bio *clone, int error,
+ struct batch_complete *batch)
{
struct dm_rq_clone_bio_info *info = clone->bi_private;
struct dm_rq_target_io *tio = info->tio;
@@ -696,7 +698,7 @@ static void end_clone_bio(struct bio *clone, int error)
* Do not use blk_end_request() here, because it may complete
* the original request before the clone, and break the ordering.
*/
- blk_update_request(tio->orig, 0, nr_bytes);
+ blk_update_request(tio->orig, 0, nr_bytes, NULL);
}
/*
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 3193aefe982b..ac8af5253fb6 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -70,7 +70,8 @@
#include <linux/seq_file.h>
-static void faulty_fail(struct bio *bio, int error)
+static void faulty_fail(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct bio *b = bio->bi_private;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9a11e3e49a9b..e2fc596504ec 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -379,7 +379,8 @@ EXPORT_SYMBOL(mddev_congested);
* Generic flush handling for md
*/
-static void md_end_flush(struct bio *bio, int err)
+static void md_end_flush(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct md_rdev *rdev = bio->bi_private;
struct mddev *mddev = rdev->mddev;
@@ -756,7 +757,8 @@ void md_rdev_clear(struct md_rdev *rdev)
}
EXPORT_SYMBOL_GPL(md_rdev_clear);
-static void super_written(struct bio *bio, int error)
+static void super_written(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct md_rdev *rdev = bio->bi_private;
struct mddev *mddev = rdev->mddev;
@@ -807,7 +809,8 @@ void md_super_wait(struct mddev *mddev)
finish_wait(&mddev->sb_wait, &wq);
}
-static void bi_complete(struct bio *bio, int error)
+static void bi_complete(struct bio *bio, int error,
+ struct batch_complete *batch)
{
complete((struct completion*)bio->bi_private);
}
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 1642eae75a33..fecad70f53f6 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -83,7 +83,8 @@ static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err)
mempool_free(mp_bh, conf->pool);
}
-static void multipath_end_request(struct bio *bio, int error)
+static void multipath_end_request(struct bio *bio, int error,
+ struct batch_complete *batch)
{
int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct multipath_bh *mp_bh = bio->bi_private;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 55951182af73..d55d0d9ab360 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -294,7 +294,8 @@ static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio)
return mirror;
}
-static void raid1_end_read_request(struct bio *bio, int error)
+static void raid1_end_read_request(struct bio *bio, int error,
+ struct batch_complete *batch)
{
int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct r1bio *r1_bio = bio->bi_private;
@@ -379,7 +380,8 @@ static void r1_bio_write_done(struct r1bio *r1_bio)
}
}
-static void raid1_end_write_request(struct bio *bio, int error)
+static void raid1_end_write_request(struct bio *bio, int error,
+ struct batch_complete *batch)
{
int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct r1bio *r1_bio = bio->bi_private;
@@ -1612,7 +1614,8 @@ abort:
}
-static void end_sync_read(struct bio *bio, int error)
+static void end_sync_read(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct r1bio *r1_bio = bio->bi_private;
@@ -1630,7 +1633,8 @@ static void end_sync_read(struct bio *bio, int error)
reschedule_retry(r1_bio);
}
-static void end_sync_write(struct bio *bio, int error)
+static void end_sync_write(struct bio *bio, int error,
+ struct batch_complete *batch)
{
int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct r1bio *r1_bio = bio->bi_private;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 59d4daa5f4c7..6c634061ec71 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -101,7 +101,8 @@ static int enough(struct r10conf *conf, int ignore);
static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
int *skipped);
static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
-static void end_reshape_write(struct bio *bio, int error);
+static void end_reshape_write(struct bio *bio, int error,
+ struct batch_complete *batch);
static void end_reshape(struct r10conf *conf);
static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
@@ -358,7 +359,8 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
return r10_bio->devs[slot].devnum;
}
-static void raid10_end_read_request(struct bio *bio, int error)
+static void raid10_end_read_request(struct bio *bio, int error,
+ struct batch_complete *batch)
{
int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct r10bio *r10_bio = bio->bi_private;
@@ -441,7 +443,8 @@ static void one_write_done(struct r10bio *r10_bio)
}
}
-static void raid10_end_write_request(struct bio *bio, int error)
+static void raid10_end_write_request(struct bio *bio, int error,
+ struct batch_complete *batch)
{
int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct r10bio *r10_bio = bio->bi_private;
@@ -1912,7 +1915,8 @@ abort:
}
-static void end_sync_read(struct bio *bio, int error)
+static void end_sync_read(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct r10bio *r10_bio = bio->bi_private;
struct r10conf *conf = r10_bio->mddev->private;
@@ -1973,7 +1977,8 @@ static void end_sync_request(struct r10bio *r10_bio)
}
}
-static void end_sync_write(struct bio *bio, int error)
+static void end_sync_write(struct bio *bio, int error,
+ struct batch_complete *batch)
{
int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct r10bio *r10_bio = bio->bi_private;
@@ -4598,7 +4603,8 @@ static int handle_reshape_read_error(struct mddev *mddev,
return 0;
}
-static void end_reshape_write(struct bio *bio, int error)
+static void end_reshape_write(struct bio *bio, int error,
+ struct batch_complete *batch)
{
int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct r10bio *r10_bio = bio->bi_private;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 0e8c9cedf7f0..f745a1b86e1d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -593,9 +593,11 @@ static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
}
static void
-raid5_end_read_request(struct bio *bi, int error);
+raid5_end_read_request(struct bio *bi, int error,
+ struct batch_complete *batch);
static void
-raid5_end_write_request(struct bio *bi, int error);
+raid5_end_write_request(struct bio *bi, int error,
+ struct batch_complete *batch);
static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
{
@@ -1774,7 +1776,8 @@ static void shrink_stripes(struct r5conf *conf)
conf->slab_cache = NULL;
}
-static void raid5_end_read_request(struct bio * bi, int error)
+static void raid5_end_read_request(struct bio *bi, int error,
+ struct batch_complete *batch)
{
struct stripe_head *sh = bi->bi_private;
struct r5conf *conf = sh->raid_conf;
@@ -1894,7 +1897,8 @@ static void raid5_end_read_request(struct bio * bi, int error)
release_stripe(sh);
}
-static void raid5_end_write_request(struct bio *bi, int error)
+static void raid5_end_write_request(struct bio *bi, int error,
+ struct batch_complete *batch)
{
struct stripe_head *sh = bi->bi_private;
struct r5conf *conf = sh->raid_conf;
@@ -3972,7 +3976,8 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf)
* first).
* If the read failed..
*/
-static void raid5_align_endio(struct bio *bi, int error)
+static void raid5_align_endio(struct bio *bi, int error,
+ struct batch_complete *batch)
{
struct bio* raid_bi = bi->bi_private;
struct mddev *mddev;
diff --git a/drivers/net/ethernet/broadcom/cnic.c b/drivers/net/ethernet/broadcom/cnic.c
index 40649a8bf390..6b0dc131b20e 100644
--- a/drivers/net/ethernet/broadcom/cnic.c
+++ b/drivers/net/ethernet/broadcom/cnic.c
@@ -4085,7 +4085,7 @@ static int cnic_cm_alloc_mem(struct cnic_dev *dev)
if (!cp->csk_tbl)
return -ENOMEM;
- port_id = random32();
+ port_id = prandom_u32();
port_id %= CNIC_LOCAL_PORT_RANGE;
if (cnic_init_id_tbl(&cp->csk_port_tbl, CNIC_LOCAL_PORT_RANGE,
CNIC_LOCAL_PORT_MIN, port_id)) {
@@ -4145,7 +4145,7 @@ static int cnic_cm_init_bnx2_hw(struct cnic_dev *dev)
{
u32 seed;
- seed = random32();
+ seed = prandom_u32();
cnic_ctx_wr(dev, 45, 0, seed);
return 0;
}
diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c
index 49b8b58fc5c6..484f77ec2ce1 100644
--- a/drivers/net/hamradio/baycom_epp.c
+++ b/drivers/net/hamradio/baycom_epp.c
@@ -449,7 +449,7 @@ static int transmit(struct baycom_state *bc, int cnt, unsigned char stat)
if ((--bc->hdlctx.slotcnt) > 0)
return 0;
bc->hdlctx.slotcnt = bc->ch_params.slottime;
- if ((random32() % 256) > bc->ch_params.ppersist)
+ if ((prandom_u32() % 256) > bc->ch_params.ppersist)
return 0;
}
}
diff --git a/drivers/net/hamradio/hdlcdrv.c b/drivers/net/hamradio/hdlcdrv.c
index a4a3516b6bbf..3169252613fa 100644
--- a/drivers/net/hamradio/hdlcdrv.c
+++ b/drivers/net/hamradio/hdlcdrv.c
@@ -389,7 +389,7 @@ void hdlcdrv_arbitrate(struct net_device *dev, struct hdlcdrv_state *s)
if ((--s->hdlctx.slotcnt) > 0)
return;
s->hdlctx.slotcnt = s->ch_params.slottime;
- if ((random32() % 256) > s->ch_params.ppersist)
+ if ((prandom_u32() % 256) > s->ch_params.ppersist)
return;
start_tx(dev, s);
}
diff --git a/drivers/net/hamradio/yam.c b/drivers/net/hamradio/yam.c
index b2d863f2ea42..0721e72f9299 100644
--- a/drivers/net/hamradio/yam.c
+++ b/drivers/net/hamradio/yam.c
@@ -638,7 +638,7 @@ static void yam_arbitrate(struct net_device *dev)
yp->slotcnt = yp->slot / 10;
/* is random > persist ? */
- if ((random32() % 256) > yp->pers)
+ if ((prandom_u32() % 256) > yp->pers)
return;
yam_start_tx(dev, yp);
diff --git a/drivers/net/team/team_mode_random.c b/drivers/net/team/team_mode_random.c
index 9eabfaa22f3e..5ca14d463ba7 100644
--- a/drivers/net/team/team_mode_random.c
+++ b/drivers/net/team/team_mode_random.c
@@ -18,7 +18,7 @@
static u32 random_N(unsigned int N)
{
- return reciprocal_divide(random32(), N);
+ return reciprocal_divide(prandom_u32(), N);
}
static bool rnd_transmit(struct team *team, struct sk_buff *skb)
diff --git a/drivers/net/wireless/brcm80211/brcmfmac/p2p.c b/drivers/net/wireless/brcm80211/brcmfmac/p2p.c
index 2b90da0d85f3..e7a1a4770996 100644
--- a/drivers/net/wireless/brcm80211/brcmfmac/p2p.c
+++ b/drivers/net/wireless/brcm80211/brcmfmac/p2p.c
@@ -1117,7 +1117,7 @@ static void brcmf_p2p_afx_handler(struct work_struct *work)
if (afx_hdl->is_listen && afx_hdl->my_listen_chan)
/* 100ms ~ 300ms */
err = brcmf_p2p_discover_listen(p2p, afx_hdl->my_listen_chan,
- 100 * (1 + (random32() % 3)));
+ 100 * (1 + prandom_u32() % 3));
else
err = brcmf_p2p_act_frm_search(p2p, afx_hdl->peer_listen_chan);
diff --git a/drivers/net/wireless/mwifiex/cfg80211.c b/drivers/net/wireless/mwifiex/cfg80211.c
index a0cb0770d319..d3c8ece980d8 100644
--- a/drivers/net/wireless/mwifiex/cfg80211.c
+++ b/drivers/net/wireless/mwifiex/cfg80211.c
@@ -216,7 +216,7 @@ mwifiex_cfg80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
mwifiex_form_mgmt_frame(skb, buf, len);
mwifiex_queue_tx_pkt(priv, skb);
- *cookie = random32() | 1;
+ *cookie = prandom_u32() | 1;
cfg80211_mgmt_tx_status(wdev, *cookie, buf, len, true, GFP_ATOMIC);
wiphy_dbg(wiphy, "info: management frame transmitted\n");
@@ -271,7 +271,7 @@ mwifiex_cfg80211_remain_on_channel(struct wiphy *wiphy,
duration);
if (!ret) {
- *cookie = random32() | 1;
+ *cookie = prandom_u32() | 1;
priv->roc_cfg.cookie = *cookie;
priv->roc_cfg.chan = *chan;
diff --git a/drivers/rapidio/Kconfig b/drivers/rapidio/Kconfig
index 6194d35ebb97..8b7f92d5e14c 100644
--- a/drivers/rapidio/Kconfig
+++ b/drivers/rapidio/Kconfig
@@ -47,4 +47,32 @@ config RAPIDIO_DEBUG
If you are unsure about this, say N here.
+choice
+ prompt "Enumeration method"
+ depends on RAPIDIO
+ help
+ There are different enumeration and discovery mechanisms offered
+ for RapidIO subsystem. You may select single built-in method or
+ or any number of methods to be built as modules.
+ Selecting a built-in method disables use of loadable methods.
+
+ If unsure, select Basic built-in.
+
+config RAPIDIO_ENUM_BASIC
+ tristate "Basic"
+ help
+ This option includes basic RapidIO fabric enumeration and discovery
+ mechanism similar to one described in RapidIO specification Annex 1.
+
+endchoice
+
+config RAPIDIO_ENUM_AUTO
+ bool "Automatic RapidIO enumeration/discovery start"
+ depends on RAPIDIO && RAPIDIO_ENUM_BASIC = y
+ help
+ Say Y if you want RapidIO subsystem to start fabric enumeration and
+ discovery automatically during kernel initialization time.
+ If you are unsure, say N here. You will be able to start RapidIO
+ fabric enumeration or discovery later by a user request.
+
source "drivers/rapidio/switches/Kconfig"
diff --git a/drivers/rapidio/Makefile b/drivers/rapidio/Makefile
index ec3fb8121004..3036702ffe8b 100644
--- a/drivers/rapidio/Makefile
+++ b/drivers/rapidio/Makefile
@@ -1,7 +1,8 @@
#
# Makefile for RapidIO interconnect services
#
-obj-y += rio.o rio-access.o rio-driver.o rio-scan.o rio-sysfs.o
+obj-y += rio.o rio-access.o rio-driver.o rio-sysfs.o
+obj-$(CONFIG_RAPIDIO_ENUM_BASIC) += rio-scan.o
obj-$(CONFIG_RAPIDIO) += switches/
obj-$(CONFIG_RAPIDIO) += devices/
diff --git a/drivers/rapidio/rio-driver.c b/drivers/rapidio/rio-driver.c
index 0f4a53bdaa3c..a0c875563d76 100644
--- a/drivers/rapidio/rio-driver.c
+++ b/drivers/rapidio/rio-driver.c
@@ -164,6 +164,13 @@ void rio_unregister_driver(struct rio_driver *rdrv)
driver_unregister(&rdrv->driver);
}
+void rio_attach_device(struct rio_dev *rdev)
+{
+ rdev->dev.bus = &rio_bus_type;
+ rdev->dev.parent = &rio_bus;
+}
+EXPORT_SYMBOL_GPL(rio_attach_device);
+
/**
* rio_match_bus - Tell if a RIO device structure has a matching RIO driver device id structure
* @dev: the standard device structure to match against
@@ -200,6 +207,7 @@ struct bus_type rio_bus_type = {
.name = "rapidio",
.match = rio_match_bus,
.dev_attrs = rio_dev_attrs,
+ .bus_attrs = rio_bus_attrs,
.probe = rio_device_probe,
.remove = rio_device_remove,
};
diff --git a/drivers/rapidio/rio-scan.c b/drivers/rapidio/rio-scan.c
index a965acd3c0e4..3e371b1d0aab 100644
--- a/drivers/rapidio/rio-scan.c
+++ b/drivers/rapidio/rio-scan.c
@@ -37,12 +37,8 @@
#include "rio.h"
-LIST_HEAD(rio_devices);
-
static void rio_init_em(struct rio_dev *rdev);
-DEFINE_SPINLOCK(rio_global_list_lock);
-
static int next_destid = 0;
static int next_comptag = 1;
@@ -327,127 +323,6 @@ static int rio_is_switch(struct rio_dev *rdev)
}
/**
- * rio_switch_init - Sets switch operations for a particular vendor switch
- * @rdev: RIO device
- * @do_enum: Enumeration/Discovery mode flag
- *
- * Searches the RIO switch ops table for known switch types. If the vid
- * and did match a switch table entry, then call switch initialization
- * routine to setup switch-specific routines.
- */
-static void rio_switch_init(struct rio_dev *rdev, int do_enum)
-{
- struct rio_switch_ops *cur = __start_rio_switch_ops;
- struct rio_switch_ops *end = __end_rio_switch_ops;
-
- while (cur < end) {
- if ((cur->vid == rdev->vid) && (cur->did == rdev->did)) {
- pr_debug("RIO: calling init routine for %s\n",
- rio_name(rdev));
- cur->init_hook(rdev, do_enum);
- break;
- }
- cur++;
- }
-
- if ((cur >= end) && (rdev->pef & RIO_PEF_STD_RT)) {
- pr_debug("RIO: adding STD routing ops for %s\n",
- rio_name(rdev));
- rdev->rswitch->add_entry = rio_std_route_add_entry;
- rdev->rswitch->get_entry = rio_std_route_get_entry;
- rdev->rswitch->clr_table = rio_std_route_clr_table;
- }
-
- if (!rdev->rswitch->add_entry || !rdev->rswitch->get_entry)
- printk(KERN_ERR "RIO: missing routing ops for %s\n",
- rio_name(rdev));
-}
-
-/**
- * rio_add_device- Adds a RIO device to the device model
- * @rdev: RIO device
- *
- * Adds the RIO device to the global device list and adds the RIO
- * device to the RIO device list. Creates the generic sysfs nodes
- * for an RIO device.
- */
-static int rio_add_device(struct rio_dev *rdev)
-{
- int err;
-
- err = device_add(&rdev->dev);
- if (err)
- return err;
-
- spin_lock(&rio_global_list_lock);
- list_add_tail(&rdev->global_list, &rio_devices);
- spin_unlock(&rio_global_list_lock);
-
- rio_create_sysfs_dev_files(rdev);
-
- return 0;
-}
-
-/**
- * rio_enable_rx_tx_port - enable input receiver and output transmitter of
- * given port
- * @port: Master port associated with the RIO network
- * @local: local=1 select local port otherwise a far device is reached
- * @destid: Destination ID of the device to check host bit
- * @hopcount: Number of hops to reach the target
- * @port_num: Port (-number on switch) to enable on a far end device
- *
- * Returns 0 or 1 from on General Control Command and Status Register
- * (EXT_PTR+0x3C)
- */
-inline int rio_enable_rx_tx_port(struct rio_mport *port,
- int local, u16 destid,
- u8 hopcount, u8 port_num) {
-#ifdef CONFIG_RAPIDIO_ENABLE_RX_TX_PORTS
- u32 regval;
- u32 ext_ftr_ptr;
-
- /*
- * enable rx input tx output port
- */
- pr_debug("rio_enable_rx_tx_port(local = %d, destid = %d, hopcount = "
- "%d, port_num = %d)\n", local, destid, hopcount, port_num);
-
- ext_ftr_ptr = rio_mport_get_physefb(port, local, destid, hopcount);
-
- if (local) {
- rio_local_read_config_32(port, ext_ftr_ptr +
- RIO_PORT_N_CTL_CSR(0),
- &regval);
- } else {
- if (rio_mport_read_config_32(port, destid, hopcount,
- ext_ftr_ptr + RIO_PORT_N_CTL_CSR(port_num), &regval) < 0)
- return -EIO;
- }
-
- if (regval & RIO_PORT_N_CTL_P_TYP_SER) {
- /* serial */
- regval = regval | RIO_PORT_N_CTL_EN_RX_SER
- | RIO_PORT_N_CTL_EN_TX_SER;
- } else {
- /* parallel */
- regval = regval | RIO_PORT_N_CTL_EN_RX_PAR
- | RIO_PORT_N_CTL_EN_TX_PAR;
- }
-
- if (local) {
- rio_local_write_config_32(port, ext_ftr_ptr +
- RIO_PORT_N_CTL_CSR(0), regval);
- } else {
- if (rio_mport_write_config_32(port, destid, hopcount,
- ext_ftr_ptr + RIO_PORT_N_CTL_CSR(port_num), regval) < 0)
- return -EIO;
- }
-#endif
- return 0;
-}
-
-/**
* rio_setup_device- Allocates and sets up a RIO device
* @net: RIO network
* @port: Master port to send transactions
@@ -587,8 +462,7 @@ static struct rio_dev *rio_setup_device(struct rio_net *net,
rdev->destid);
}
- rdev->dev.bus = &rio_bus_type;
- rdev->dev.parent = &rio_bus;
+ rio_attach_device(rdev);
device_initialize(&rdev->dev);
rdev->dev.release = rio_release_dev;
@@ -1260,19 +1134,30 @@ static void rio_pw_enable(struct rio_mport *port, int enable)
/**
* rio_enum_mport- Start enumeration through a master port
* @mport: Master port to send transactions
+ * @flags: Enumeration control flags
*
* Starts the enumeration process. If somebody has enumerated our
* master port device, then give up. If not and we have an active
* link, then start recursive peer enumeration. Returns %0 if
* enumeration succeeds or %-EBUSY if enumeration fails.
*/
-int rio_enum_mport(struct rio_mport *mport)
+int rio_enum_mport(struct rio_mport *mport, u32 flags)
{
struct rio_net *net = NULL;
int rc = 0;
printk(KERN_INFO "RIO: enumerate master port %d, %s\n", mport->id,
mport->name);
+
+ /*
+ * To avoid multiple start requests (repeat enumeration is not supported
+ * by this method) check if enumeration/discovery was performed for this
+ * mport: if mport was added into the list of mports for a net exit
+ * with error.
+ */
+ if (mport->nnode.next || mport->nnode.prev)
+ return -EBUSY;
+
/* If somebody else enumerated our master port device, bail. */
if (rio_enum_host(mport) < 0) {
printk(KERN_INFO
@@ -1362,14 +1247,16 @@ static void rio_build_route_tables(struct rio_net *net)
/**
* rio_disc_mport- Start discovery through a master port
* @mport: Master port to send transactions
+ * @flags: discovery control flags
*
* Starts the discovery process. If we have an active link,
- * then wait for the signal that enumeration is complete.
+ * then wait for the signal that enumeration is complete (if wait
+ * is allowed).
* When enumeration completion is signaled, start recursive
* peer discovery. Returns %0 if discovery succeeds or %-EBUSY
* on failure.
*/
-int rio_disc_mport(struct rio_mport *mport)
+int rio_disc_mport(struct rio_mport *mport, u32 flags)
{
struct rio_net *net = NULL;
unsigned long to_end;
@@ -1379,6 +1266,11 @@ int rio_disc_mport(struct rio_mport *mport)
/* If master port has an active link, allocate net and discover peers */
if (rio_mport_is_active(mport)) {
+ if (rio_enum_complete(mport))
+ goto enum_done;
+ else if (flags & RIO_SCAN_ENUM_NO_WAIT)
+ return -EAGAIN;
+
pr_debug("RIO: wait for enumeration to complete...\n");
to_end = jiffies + CONFIG_RAPIDIO_DISC_TIMEOUT * HZ;
@@ -1421,3 +1313,46 @@ enum_done:
bail:
return -EBUSY;
}
+
+struct rio_scan rio_scan_ops = {
+ .enumerate = rio_enum_mport,
+ .discover = rio_disc_mport,
+};
+
+
+#ifdef MODULE
+
+static bool scan;
+module_param(scan, bool, 0);
+MODULE_PARM_DESC(scan, "Start RapidIO network enumeration/discovery "
+ "(default = 1)");
+
+/**
+ * rio_basic_attach:
+ *
+ * When this enumeration/discovery method is loaded as a module this function
+ * registers its specific enumeration and discover routines for all available
+ * RapidIO mport devices. The "scan" command line parameter controls ability of
+ * the module to start RapidIO enumeration/discovery automatically.
+ *
+ * Returns 0 for success or -EIO if unable to register itself.
+ *
+ * This enumeration/discovery method cannot be unloaded and therefore does not
+ * provide a matching cleanup_module routine.
+ */
+
+int __init rio_basic_attach(void)
+{
+ if (rio_register_scan(RIO_MPORT_ANY, &rio_scan_ops))
+ return -EIO;
+ if (scan)
+ rio_init_mports();
+ return 0;
+}
+
+module_init(rio_basic_attach);
+
+MODULE_DESCRIPTION("Basic RapidIO enumeration/discovery");
+MODULE_LICENSE("GPL");
+
+#endif /* MODULE */
diff --git a/drivers/rapidio/rio-sysfs.c b/drivers/rapidio/rio-sysfs.c
index 4dbe360989be..66d4acd5e18f 100644
--- a/drivers/rapidio/rio-sysfs.c
+++ b/drivers/rapidio/rio-sysfs.c
@@ -285,3 +285,48 @@ void rio_remove_sysfs_dev_files(struct rio_dev *rdev)
rdev->rswitch->sw_sysfs(rdev, RIO_SW_SYSFS_REMOVE);
}
}
+
+static ssize_t bus_scan_store(struct bus_type *bus, const char *buf,
+ size_t count)
+{
+ long val;
+ struct rio_mport *port = NULL;
+ int rc;
+
+ if (kstrtol(buf, 0, &val) < 0)
+ return -EINVAL;
+
+ if (val == RIO_MPORT_ANY) {
+ rc = rio_init_mports();
+ goto exit;
+ }
+
+ if (val < 0 || val >= RIO_MAX_MPORTS)
+ return -EINVAL;
+
+ port = rio_find_mport((int)val);
+
+ if (!port) {
+ pr_debug("RIO: %s: mport_%d not available\n",
+ __func__, (int)val);
+ return -EINVAL;
+ }
+
+ if (!port->nscan)
+ return -EINVAL;
+
+ if (port->host_deviceid >= 0)
+ rc = port->nscan->enumerate(port, 0);
+ else
+ rc = port->nscan->discover(port, RIO_SCAN_ENUM_NO_WAIT);
+exit:
+ if (!rc)
+ rc = count;
+
+ return rc;
+}
+
+struct bus_attribute rio_bus_attrs[] = {
+ __ATTR(scan, (S_IWUSR|S_IWGRP), NULL, bus_scan_store),
+ __ATTR_NULL
+};
diff --git a/drivers/rapidio/rio.c b/drivers/rapidio/rio.c
index d553b5d13722..3bfbcecaae0b 100644
--- a/drivers/rapidio/rio.c
+++ b/drivers/rapidio/rio.c
@@ -31,6 +31,9 @@
#include "rio.h"
+LIST_HEAD(rio_devices);
+DEFINE_SPINLOCK(rio_global_list_lock);
+
static LIST_HEAD(rio_mports);
static unsigned char next_portid;
static DEFINE_SPINLOCK(rio_mmap_lock);
@@ -53,6 +56,32 @@ u16 rio_local_get_device_id(struct rio_mport *port)
}
/**
+ * rio_add_device- Adds a RIO device to the device model
+ * @rdev: RIO device
+ *
+ * Adds the RIO device to the global device list and adds the RIO
+ * device to the RIO device list. Creates the generic sysfs nodes
+ * for an RIO device.
+ */
+int rio_add_device(struct rio_dev *rdev)
+{
+ int err;
+
+ err = device_add(&rdev->dev);
+ if (err)
+ return err;
+
+ spin_lock(&rio_global_list_lock);
+ list_add_tail(&rdev->global_list, &rio_devices);
+ spin_unlock(&rio_global_list_lock);
+
+ rio_create_sysfs_dev_files(rdev);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(rio_add_device);
+
+/**
* rio_request_inb_mbox - request inbound mailbox service
* @mport: RIO master port from which to allocate the mailbox resource
* @dev_id: Device specific pointer to pass on event
@@ -489,6 +518,7 @@ rio_mport_get_physefb(struct rio_mport *port, int local,
return ext_ftr_ptr;
}
+EXPORT_SYMBOL_GPL(rio_mport_get_physefb);
/**
* rio_get_comptag - Begin or continue searching for a RIO device by component tag
@@ -521,6 +551,7 @@ exit:
spin_unlock(&rio_global_list_lock);
return rdev;
}
+EXPORT_SYMBOL_GPL(rio_get_comptag);
/**
* rio_set_port_lockout - Sets/clears LOCKOUT bit (RIO EM 1.3) for a switch port.
@@ -545,6 +576,107 @@ int rio_set_port_lockout(struct rio_dev *rdev, u32 pnum, int lock)
regval);
return 0;
}
+EXPORT_SYMBOL_GPL(rio_set_port_lockout);
+
+/**
+ * rio_switch_init - Sets switch operations for a particular vendor switch
+ * @rdev: RIO device
+ * @do_enum: Enumeration/Discovery mode flag
+ *
+ * Searches the RIO switch ops table for known switch types. If the vid
+ * and did match a switch table entry, then call switch initialization
+ * routine to setup switch-specific routines.
+ */
+void rio_switch_init(struct rio_dev *rdev, int do_enum)
+{
+ struct rio_switch_ops *cur = __start_rio_switch_ops;
+ struct rio_switch_ops *end = __end_rio_switch_ops;
+
+ while (cur < end) {
+ if ((cur->vid == rdev->vid) && (cur->did == rdev->did)) {
+ pr_debug("RIO: calling init routine for %s\n",
+ rio_name(rdev));
+ cur->init_hook(rdev, do_enum);
+ break;
+ }
+ cur++;
+ }
+
+ if ((cur >= end) && (rdev->pef & RIO_PEF_STD_RT)) {
+ pr_debug("RIO: adding STD routing ops for %s\n",
+ rio_name(rdev));
+ rdev->rswitch->add_entry = rio_std_route_add_entry;
+ rdev->rswitch->get_entry = rio_std_route_get_entry;
+ rdev->rswitch->clr_table = rio_std_route_clr_table;
+ }
+
+ if (!rdev->rswitch->add_entry || !rdev->rswitch->get_entry)
+ printk(KERN_ERR "RIO: missing routing ops for %s\n",
+ rio_name(rdev));
+}
+EXPORT_SYMBOL_GPL(rio_switch_init);
+
+/**
+ * rio_enable_rx_tx_port - enable input receiver and output transmitter of
+ * given port
+ * @port: Master port associated with the RIO network
+ * @local: local=1 select local port otherwise a far device is reached
+ * @destid: Destination ID of the device to check host bit
+ * @hopcount: Number of hops to reach the target
+ * @port_num: Port (-number on switch) to enable on a far end device
+ *
+ * Returns 0 or 1 from on General Control Command and Status Register
+ * (EXT_PTR+0x3C)
+ */
+int rio_enable_rx_tx_port(struct rio_mport *port,
+ int local, u16 destid,
+ u8 hopcount, u8 port_num)
+{
+#ifdef CONFIG_RAPIDIO_ENABLE_RX_TX_PORTS
+ u32 regval;
+ u32 ext_ftr_ptr;
+
+ /*
+ * enable rx input tx output port
+ */
+ pr_debug("rio_enable_rx_tx_port(local = %d, destid = %d, hopcount = "
+ "%d, port_num = %d)\n", local, destid, hopcount, port_num);
+
+ ext_ftr_ptr = rio_mport_get_physefb(port, local, destid, hopcount);
+
+ if (local) {
+ rio_local_read_config_32(port, ext_ftr_ptr +
+ RIO_PORT_N_CTL_CSR(0),
+ &regval);
+ } else {
+ if (rio_mport_read_config_32(port, destid, hopcount,
+ ext_ftr_ptr + RIO_PORT_N_CTL_CSR(port_num), &regval) < 0)
+ return -EIO;
+ }
+
+ if (regval & RIO_PORT_N_CTL_P_TYP_SER) {
+ /* serial */
+ regval = regval | RIO_PORT_N_CTL_EN_RX_SER
+ | RIO_PORT_N_CTL_EN_TX_SER;
+ } else {
+ /* parallel */
+ regval = regval | RIO_PORT_N_CTL_EN_RX_PAR
+ | RIO_PORT_N_CTL_EN_TX_PAR;
+ }
+
+ if (local) {
+ rio_local_write_config_32(port, ext_ftr_ptr +
+ RIO_PORT_N_CTL_CSR(0), regval);
+ } else {
+ if (rio_mport_write_config_32(port, destid, hopcount,
+ ext_ftr_ptr + RIO_PORT_N_CTL_CSR(port_num), regval) < 0)
+ return -EIO;
+ }
+#endif
+ return 0;
+}
+EXPORT_SYMBOL_GPL(rio_enable_rx_tx_port);
+
/**
* rio_chk_dev_route - Validate route to the specified device.
@@ -610,6 +742,7 @@ rio_mport_chk_dev_access(struct rio_mport *mport, u16 destid, u8 hopcount)
return 0;
}
+EXPORT_SYMBOL_GPL(rio_mport_chk_dev_access);
/**
* rio_chk_dev_access - Validate access to the specified device.
@@ -941,6 +1074,7 @@ rio_mport_get_efb(struct rio_mport *port, int local, u16 destid,
return RIO_GET_BLOCK_ID(reg_val);
}
}
+EXPORT_SYMBOL_GPL(rio_mport_get_efb);
/**
* rio_mport_get_feature - query for devices' extended features
@@ -997,6 +1131,7 @@ rio_mport_get_feature(struct rio_mport * port, int local, u16 destid,
return 0;
}
+EXPORT_SYMBOL_GPL(rio_mport_get_feature);
/**
* rio_get_asm - Begin or continue searching for a RIO device by vid/did/asm_vid/asm_did
@@ -1246,6 +1381,87 @@ EXPORT_SYMBOL_GPL(rio_dma_prep_slave_sg);
#endif /* CONFIG_RAPIDIO_DMA_ENGINE */
+/**
+ * rio_find_mport - find RIO mport by its ID
+ * @mport_id: number (ID) of mport device
+ *
+ * Given a RIO mport number, the desired mport is located
+ * in the global list of mports. If the mport is found, a pointer to its
+ * data structure is returned. If no mport is found, %NULL is returned.
+ */
+struct rio_mport *rio_find_mport(int mport_id)
+{
+ struct rio_mport *port = NULL;
+
+ list_for_each_entry(port, &rio_mports, node) {
+ if (port->id == mport_id)
+ return port;
+ }
+
+ return NULL;
+}
+
+/**
+ * rio_register_scan - enumeration/discovery method registration interface
+ * @mport_id: mport device ID for which fabric scan routine has to be set
+ * (RIO_MPORT_ANY = set for all available mports)
+ * @scan_ops: enumeration/discovery control structure
+ *
+ * Assigns enumeration or discovery method to the specified mport device (or all
+ * available mports if RIO_MPORT_ANY is specified).
+ * Returns error if the mport already has an enumerator attached to it.
+ * In case of RIO_MPORT_ANY ignores ports with valid scan routines and returns
+ * an error if was unable to find at least one available mport.
+ */
+int rio_register_scan(int mport_id, struct rio_scan *scan_ops)
+{
+ struct rio_mport *port;
+ int rc = -EBUSY;
+
+ list_for_each_entry(port, &rio_mports, node) {
+ if (port->id == mport_id || mport_id == RIO_MPORT_ANY) {
+ if (port->nscan && mport_id == RIO_MPORT_ANY)
+ continue;
+ else if (port->nscan)
+ break;
+
+ port->nscan = scan_ops;
+ rc = 0;
+
+ if (mport_id != RIO_MPORT_ANY)
+ break;
+ }
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(rio_register_scan);
+
+/**
+ * rio_unregister_scan - removes enumeration/discovery method from mport
+ * @mport_id: mport device ID for which fabric scan routine has to be
+ * unregistered (RIO_MPORT_ANY = set for all available mports)
+ *
+ * Removes enumeration or discovery method assigned to the specified mport
+ * device (or all available mports if RIO_MPORT_ANY is specified).
+ */
+int rio_unregister_scan(int mport_id)
+{
+ struct rio_mport *port;
+
+ list_for_each_entry(port, &rio_mports, node) {
+ if (port->id == mport_id || mport_id == RIO_MPORT_ANY) {
+ if (port->nscan)
+ port->nscan = NULL;
+ if (mport_id != RIO_MPORT_ANY)
+ break;
+ }
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(rio_unregister_scan);
+
static void rio_fixup_device(struct rio_dev *dev)
{
}
@@ -1274,7 +1490,7 @@ static void disc_work_handler(struct work_struct *_work)
work = container_of(_work, struct rio_disc_work, work);
pr_debug("RIO: discovery work for mport %d %s\n",
work->mport->id, work->mport->name);
- rio_disc_mport(work->mport);
+ work->mport->nscan->discover(work->mport, 0);
}
int rio_init_mports(void)
@@ -1291,9 +1507,10 @@ int rio_init_mports(void)
* on any of the registered mports.
*/
list_for_each_entry(port, &rio_mports, node) {
- if (port->host_deviceid >= 0)
- rio_enum_mport(port);
- else
+ if (port->host_deviceid >= 0) {
+ if (port->nscan)
+ port->nscan->enumerate(port, 0);
+ } else
n++;
}
@@ -1323,7 +1540,7 @@ int rio_init_mports(void)
n = 0;
list_for_each_entry(port, &rio_mports, node) {
- if (port->host_deviceid < 0) {
+ if (port->host_deviceid < 0 && port->nscan) {
work[n].mport = port;
INIT_WORK(&work[n].work, disc_work_handler);
queue_work(rio_wq, &work[n].work);
@@ -1342,7 +1559,9 @@ no_disc:
return 0;
}
+#ifdef CONFIG_RAPIDIO_ENUM_AUTO
device_initcall_sync(rio_init_mports);
+#endif
static int hdids[RIO_MAX_MPORTS + 1];
@@ -1362,6 +1581,9 @@ static int rio_hdid_setup(char *str)
__setup("riohdid=", rio_hdid_setup);
+/* Enumerator structure that links built-in enumerator */
+extern struct rio_scan __weak rio_scan_ops;
+
int rio_register_mport(struct rio_mport *port)
{
if (next_portid >= RIO_MAX_MPORTS) {
@@ -1371,6 +1593,7 @@ int rio_register_mport(struct rio_mport *port)
port->id = next_portid++;
port->host_deviceid = rio_get_hdid(port->id);
+ port->nscan = &rio_scan_ops;
list_add_tail(&port->node, &rio_mports);
return 0;
}
@@ -1386,3 +1609,4 @@ EXPORT_SYMBOL_GPL(rio_request_inb_mbox);
EXPORT_SYMBOL_GPL(rio_release_inb_mbox);
EXPORT_SYMBOL_GPL(rio_request_outb_mbox);
EXPORT_SYMBOL_GPL(rio_release_outb_mbox);
+EXPORT_SYMBOL_GPL(rio_init_mports);
diff --git a/drivers/rapidio/rio.h b/drivers/rapidio/rio.h
index b1af414f15e6..bdd339e884b8 100644
--- a/drivers/rapidio/rio.h
+++ b/drivers/rapidio/rio.h
@@ -15,6 +15,7 @@
#include <linux/rio.h>
#define RIO_MAX_CHK_RETRY 3
+#define RIO_MPORT_ANY (-1)
/* Functions internal to the RIO core code */
@@ -27,8 +28,6 @@ extern u32 rio_mport_get_efb(struct rio_mport *port, int local, u16 destid,
extern int rio_mport_chk_dev_access(struct rio_mport *mport, u16 destid,
u8 hopcount);
extern int rio_create_sysfs_dev_files(struct rio_dev *rdev);
-extern int rio_enum_mport(struct rio_mport *mport);
-extern int rio_disc_mport(struct rio_mport *mport);
extern int rio_std_route_add_entry(struct rio_mport *mport, u16 destid,
u8 hopcount, u16 table, u16 route_destid,
u8 route_port);
@@ -39,9 +38,18 @@ extern int rio_std_route_clr_table(struct rio_mport *mport, u16 destid,
u8 hopcount, u16 table);
extern int rio_set_port_lockout(struct rio_dev *rdev, u32 pnum, int lock);
extern struct rio_dev *rio_get_comptag(u32 comp_tag, struct rio_dev *from);
+extern int rio_add_device(struct rio_dev *rdev);
+extern void rio_switch_init(struct rio_dev *rdev, int do_enum);
+extern int rio_enable_rx_tx_port(struct rio_mport *port, int local, u16 destid,
+ u8 hopcount, u8 port_num);
+extern int rio_register_scan(int mport_id, struct rio_scan *scan_ops);
+extern int rio_unregister_scan(int mport_id);
+extern void rio_attach_device(struct rio_dev *rdev);
+extern struct rio_mport *rio_find_mport(int mport_id);
/* Structures internal to the RIO core code */
extern struct device_attribute rio_dev_attrs[];
+extern struct bus_attribute rio_bus_attrs[];
extern spinlock_t rio_global_list_lock;
extern struct rio_switch_ops __start_rio_switch_ops[];
diff --git a/drivers/rtc/rtc-pxa.c b/drivers/rtc/rtc-pxa.c
index ed037ae91c5f..a2073eb968a2 100644
--- a/drivers/rtc/rtc-pxa.c
+++ b/drivers/rtc/rtc-pxa.c
@@ -19,6 +19,7 @@
*
*/
+#include <linux/delay.h>
#include <linux/init.h>
#include <linux/platform_device.h>
#include <linux/module.h>
@@ -80,22 +81,29 @@
#define RYAR1 0x1c
#define RTCPICR 0x34
#define PIAR 0x38
+#define PSBR_RTC 0x00
#define rtc_readl(pxa_rtc, reg) \
__raw_readl((pxa_rtc)->base + (reg))
#define rtc_writel(pxa_rtc, reg, value) \
__raw_writel((value), (pxa_rtc)->base + (reg))
+#define rtc_readl_psbr(pxa_rtc, reg) \
+ __raw_readl((pxa_rtc)->base_psbr + (reg))
+#define rtc_writel_psbr(pxa_rtc, reg, value) \
+ __raw_writel((value), (pxa_rtc)->base_psbr + (reg))
struct pxa_rtc {
struct resource *ress;
+ struct resource *ress_psbr;
void __iomem *base;
+ void __iomem *base_psbr;
int irq_1Hz;
int irq_Alrm;
struct rtc_device *rtc;
spinlock_t lock; /* Protects this structure */
};
-
+static struct pxa_rtc *rtc_info;
static u32 ryxr_calc(struct rtc_time *tm)
{
return ((tm->tm_year + 1900) << RYxR_YEAR_S)
@@ -117,7 +125,7 @@ static void tm_calc(u32 rycr, u32 rdcr, struct rtc_time *tm)
tm->tm_year = ((rycr & RYxR_YEAR_MASK) >> RYxR_YEAR_S) - 1900;
tm->tm_mon = (((rycr & RYxR_MONTH_MASK) >> RYxR_MONTH_S)) - 1;
tm->tm_mday = (rycr & RYxR_DAY_MASK);
- tm->tm_wday = ((rycr & RDxR_DOW_MASK) >> RDxR_DOW_S) - 1;
+ tm->tm_wday = ((rdcr & RDxR_DOW_MASK) >> RDxR_DOW_S) - 1;
tm->tm_hour = (rdcr & RDxR_HOUR_MASK) >> RDxR_HOUR_S;
tm->tm_min = (rdcr & RDxR_MIN_MASK) >> RDxR_MIN_S;
tm->tm_sec = rdcr & RDxR_SEC_MASK;
@@ -175,7 +183,6 @@ static irqreturn_t pxa_rtc_irq(int irq, void *dev_id)
/* enable back rtc interrupts */
rtc_writel(pxa_rtc, RTSR, rtsr & ~RTSR_TRIG_MASK);
-
spin_unlock(&pxa_rtc->lock);
return IRQ_HANDLED;
}
@@ -250,12 +257,45 @@ static int pxa_rtc_read_time(struct device *dev, struct rtc_time *tm)
static int pxa_rtc_set_time(struct device *dev, struct rtc_time *tm)
{
struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev);
-
+ /* sequence to wirte pxa rtc register RCNR RDCR RYCR is
+ *1. set PSBR[RWE] bit, take 2x32-khz to complete
+ *2. write to RTC register,take 2x32-khz to complete
+ *3. clear PSBR[RWE] bit,take 2x32-khz to complete
+ */
+ if ((tm->tm_year < 70) || (tm->tm_year > 138))
+ return -EINVAL;
+ rtc_writel_psbr(rtc_info, PSBR_RTC, 0x01);
+ udelay(100);
rtc_writel(pxa_rtc, RYCR, ryxr_calc(tm));
rtc_writel(pxa_rtc, RDCR, rdxr_calc(tm));
+ udelay(100);
+ rtc_writel_psbr(rtc_info, PSBR_RTC, 0x00);
+ udelay(100);
+ pxa_rtc_read_time(dev, tm);
+ dev_info(dev, "tm.year = %d, tm.month = %d, tm.day = %d\n",
+ tm->tm_year + 1900, tm->tm_mon, tm->tm_mday);
+ return 0;
+}
+int pxa_rtc_sync_time(unsigned int ticks)
+{
+ /* sequence to wirte pxa rtc register RCNR RDCR RYCR is
+ *1. set PSBR[RWE] bit, take 2x32-khz to complete
+ *2. write to RTC register,take 2x32-khz to complete
+ *3. clear PSBR[RWE] bit,take 2x32-khz to complete
+ */
+ struct rtc_time tm;
+ rtc_time_to_tm(ticks, &tm);
+ rtc_writel_psbr(rtc_info, PSBR_RTC, 0x01);
+ udelay(100);
+ rtc_writel(rtc_info, RYCR, ryxr_calc(&tm));
+ rtc_writel(rtc_info, RDCR, rdxr_calc(&tm));
+ udelay(100);
+ rtc_writel_psbr(rtc_info, PSBR_RTC, 0x00);
+ udelay(100);
return 0;
}
+EXPORT_SYMBOL(pxa_rtc_sync_time);
static int pxa_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm)
{
@@ -324,10 +364,10 @@ static int __init pxa_rtc_probe(struct platform_device *pdev)
int ret;
u32 rttr;
- pxa_rtc = kzalloc(sizeof(struct pxa_rtc), GFP_KERNEL);
+ pxa_rtc = devm_kzalloc(dev, sizeof(struct pxa_rtc), GFP_KERNEL);
if (!pxa_rtc)
return -ENOMEM;
-
+ rtc_info = pxa_rtc;
spin_lock_init(&pxa_rtc->lock);
platform_set_drvdata(pdev, pxa_rtc);
@@ -335,28 +375,38 @@ static int __init pxa_rtc_probe(struct platform_device *pdev)
pxa_rtc->ress = platform_get_resource(pdev, IORESOURCE_MEM, 0);
if (!pxa_rtc->ress) {
dev_err(dev, "No I/O memory resource defined\n");
- goto err_ress;
+ return ret;
+ }
+ pxa_rtc->ress_psbr = platform_get_resource(pdev, IORESOURCE_MEM, 1);
+ if (!pxa_rtc->ress_psbr) {
+ dev_err(dev, "No I/O memory resource defined\n");
+ return ret;
}
pxa_rtc->irq_1Hz = platform_get_irq(pdev, 0);
if (pxa_rtc->irq_1Hz < 0) {
dev_err(dev, "No 1Hz IRQ resource defined\n");
- goto err_ress;
+ return ret;
}
pxa_rtc->irq_Alrm = platform_get_irq(pdev, 1);
if (pxa_rtc->irq_Alrm < 0) {
dev_err(dev, "No alarm IRQ resource defined\n");
- goto err_ress;
+ return ret;
}
- pxa_rtc_open(dev);
ret = -ENOMEM;
- pxa_rtc->base = ioremap(pxa_rtc->ress->start,
+ pxa_rtc->base = devm_ioremap(&pdev->dev, pxa_rtc->ress->start,
resource_size(pxa_rtc->ress));
if (!pxa_rtc->base) {
dev_err(&pdev->dev, "Unable to map pxa RTC I/O memory\n");
- goto err_map;
+ return ret;
}
+ pxa_rtc->base_psbr = devm_ioremap(&pdev->dev, pxa_rtc->ress_psbr->start,
+ resource_size(pxa_rtc->ress_psbr));
+ if (!pxa_rtc->base_psbr) {
+ dev_err(&pdev->dev, "Unable to map pxa RTC PSBR I/O memory\n");
+ return ret;
+ }
/*
* If the clock divider is uninitialized then reset it to the
* default value to get the 1Hz clock.
@@ -370,41 +420,24 @@ static int __init pxa_rtc_probe(struct platform_device *pdev)
rtsr_clear_bits(pxa_rtc, RTSR_PIALE | RTSR_RDALE1 | RTSR_HZE);
- pxa_rtc->rtc = rtc_device_register("pxa-rtc", &pdev->dev, &pxa_rtc_ops,
- THIS_MODULE);
+ pxa_rtc->rtc = devm_rtc_device_register(&pdev->dev, "pxa-rtc",
+ &pxa_rtc_ops, THIS_MODULE);
ret = PTR_ERR(pxa_rtc->rtc);
if (IS_ERR(pxa_rtc->rtc)) {
dev_err(dev, "Failed to register RTC device -> %d\n", ret);
- goto err_rtc_reg;
+ return ret;
}
device_init_wakeup(dev, 1);
-
+ pxa_rtc_open(dev);
return 0;
-
-err_rtc_reg:
- iounmap(pxa_rtc->base);
-err_ress:
-err_map:
- kfree(pxa_rtc);
- return ret;
}
static int __exit pxa_rtc_remove(struct platform_device *pdev)
{
- struct pxa_rtc *pxa_rtc = platform_get_drvdata(pdev);
-
struct device *dev = &pdev->dev;
pxa_rtc_release(dev);
- rtc_device_unregister(pxa_rtc->rtc);
-
- spin_lock_irq(&pxa_rtc->lock);
- iounmap(pxa_rtc->base);
- spin_unlock_irq(&pxa_rtc->lock);
-
- kfree(pxa_rtc);
-
return 0;
}
diff --git a/drivers/rtc/rtc-v3020.c b/drivers/rtc/rtc-v3020.c
index 6e0cba8f47d5..d87878eee8c1 100644
--- a/drivers/rtc/rtc-v3020.c
+++ b/drivers/rtc/rtc-v3020.c
@@ -49,18 +49,13 @@ struct v3020_chip_ops {
#define V3020_RD 2
#define V3020_IO 3
-struct v3020_gpio {
- const char *name;
- unsigned int gpio;
-};
-
struct v3020 {
/* MMIO access */
void __iomem *ioaddress;
int leftshift;
/* GPIO access */
- struct v3020_gpio *gpio;
+ struct gpio *gpio;
struct v3020_chip_ops *ops;
@@ -107,48 +102,40 @@ static struct v3020_chip_ops v3020_mmio_ops = {
.write_bit = v3020_mmio_write_bit,
};
-static struct v3020_gpio v3020_gpio[] = {
- { "RTC CS", 0 },
- { "RTC WR", 0 },
- { "RTC RD", 0 },
- { "RTC IO", 0 },
+static struct gpio v3020_gpio[] = {
+ { 0, GPIOF_OUT_INIT_HIGH, "RTC CS"},
+ { 0, GPIOF_OUT_INIT_HIGH, "RTC WR"},
+ { 0, GPIOF_OUT_INIT_HIGH, "RTC RD"},
+ { 0, GPIOF_OUT_INIT_HIGH, "RTC IO"},
};
static int v3020_gpio_map(struct v3020 *chip, struct platform_device *pdev,
struct v3020_platform_data *pdata)
{
- int i, err;
+ int err;
v3020_gpio[V3020_CS].gpio = pdata->gpio_cs;
v3020_gpio[V3020_WR].gpio = pdata->gpio_wr;
v3020_gpio[V3020_RD].gpio = pdata->gpio_rd;
v3020_gpio[V3020_IO].gpio = pdata->gpio_io;
- for (i = 0; i < ARRAY_SIZE(v3020_gpio); i++) {
- err = gpio_request(v3020_gpio[i].gpio, v3020_gpio[i].name);
- if (err)
- goto err_request;
-
- gpio_direction_output(v3020_gpio[i].gpio, 1);
- }
+ err = gpio_request_array(v3020_gpio, ARRAY_SIZE(v3020_gpio));
+ if (err)
+ goto err_request;
chip->gpio = v3020_gpio;
return 0;
err_request:
- while (--i >= 0)
- gpio_free(v3020_gpio[i].gpio);
+ gpio_free_array(v3020_gpio, ARRAY_SIZE(v3020_gpio));
return err;
}
static void v3020_gpio_unmap(struct v3020 *chip)
{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(v3020_gpio); i++)
- gpio_free(v3020_gpio[i].gpio);
+ gpio_free_array(v3020_gpio, ARRAY_SIZE(v3020_gpio));
}
static void v3020_gpio_write_bit(struct v3020 *chip, unsigned char bit)
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 9f0c46547459..df5e961484e1 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -35,6 +35,7 @@ static int sg_version_num = 30534; /* 2 digits for each component */
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
+#include <linux/aio.h>
#include <linux/errno.h>
#include <linux/mtio.h>
#include <linux/ioctl.h>
diff --git a/drivers/staging/android/logger.c b/drivers/staging/android/logger.c
index b14a55742559..b040200a5a55 100644
--- a/drivers/staging/android/logger.c
+++ b/drivers/staging/android/logger.c
@@ -28,6 +28,7 @@
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/vmalloc.h>
+#include <linux/aio.h>
#include "logger.h"
#include <asm/ioctls.h>
diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c
index 07f5f94634bb..4e842cb161c6 100644
--- a/drivers/target/target_core_iblock.c
+++ b/drivers/target/target_core_iblock.c
@@ -271,7 +271,8 @@ static void iblock_complete_cmd(struct se_cmd *cmd)
kfree(ibr);
}
-static void iblock_bio_done(struct bio *bio, int err)
+static void iblock_bio_done(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct se_cmd *cmd = bio->bi_private;
struct iblock_req *ibr = cmd->priv;
@@ -335,7 +336,8 @@ static void iblock_submit_bios(struct bio_list *list, int rw)
blk_finish_plug(&plug);
}
-static void iblock_end_io_flush(struct bio *bio, int err)
+static void iblock_end_io_flush(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct se_cmd *cmd = bio->bi_private;
diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
index e992b27aa090..1e9873179860 100644
--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -835,7 +835,8 @@ static ssize_t pscsi_show_configfs_dev_params(struct se_device *dev, char *b)
return bl;
}
-static void pscsi_bi_endio(struct bio *bio, int error)
+static void pscsi_bi_endio(struct bio *bio, int error,
+ struct batch_complete *batch)
{
bio_put(bio);
}
diff --git a/drivers/usb/gadget/amd5536udc.c b/drivers/usb/gadget/amd5536udc.c
index f52dcfe8f545..24bd363ca351 100644
--- a/drivers/usb/gadget/amd5536udc.c
+++ b/drivers/usb/gadget/amd5536udc.c
@@ -3099,7 +3099,7 @@ static int init_dma_pools(struct udc *dev)
}
/* DMA setup */
- dev->data_requests = dma_pool_create("data_requests", NULL,
+ dev->data_requests = dma_pool_create("data_requests", &dev->pdev->dev,
sizeof(struct udc_data_dma), 0, 0);
if (!dev->data_requests) {
DBG(dev, "can't get request data pool\n");
@@ -3111,7 +3111,7 @@ static int init_dma_pools(struct udc *dev)
dev->ep[UDC_EP0IN_IX].dma = &dev->regs->ctl;
/* dma desc for setup data */
- dev->stp_requests = dma_pool_create("setup requests", NULL,
+ dev->stp_requests = dma_pool_create("setup requests", &dev->pdev->dev,
sizeof(struct udc_stp_dma), 0, 0);
if (!dev->stp_requests) {
DBG(dev, "can't get stp request pool\n");
diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c
index dda0dc4a5567..570c005062ab 100644
--- a/drivers/usb/gadget/inode.c
+++ b/drivers/usb/gadget/inode.c
@@ -24,6 +24,8 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/poll.h>
+#include <linux/mmu_context.h>
+#include <linux/aio.h>
#include <linux/device.h>
#include <linux/moduleparam.h>
@@ -513,6 +515,9 @@ static long ep_ioctl(struct file *fd, unsigned code, unsigned long value)
struct kiocb_priv {
struct usb_request *req;
struct ep_data *epdata;
+ struct kiocb *iocb;
+ struct mm_struct *mm;
+ struct work_struct work;
void *buf;
const struct iovec *iv;
unsigned long nr_segs;
@@ -528,7 +533,6 @@ static int ep_aio_cancel(struct kiocb *iocb, struct io_event *e)
local_irq_disable();
epdata = priv->epdata;
// spin_lock(&epdata->dev->lock);
- kiocbSetCancelled(iocb);
if (likely(epdata && epdata->ep && priv->req))
value = usb_ep_dequeue (epdata->ep, priv->req);
else
@@ -540,15 +544,12 @@ static int ep_aio_cancel(struct kiocb *iocb, struct io_event *e)
return value;
}
-static ssize_t ep_aio_read_retry(struct kiocb *iocb)
+static ssize_t ep_copy_to_user(struct kiocb_priv *priv)
{
- struct kiocb_priv *priv = iocb->private;
ssize_t len, total;
void *to_copy;
int i;
- /* we "retry" to get the right mm context for this: */
-
/* copy stuff into user buffers */
total = priv->actual;
len = 0;
@@ -568,9 +569,26 @@ static ssize_t ep_aio_read_retry(struct kiocb *iocb)
if (total == 0)
break;
}
+
+ return len;
+}
+
+static void ep_user_copy_worker(struct work_struct *work)
+{
+ struct kiocb_priv *priv = container_of(work, struct kiocb_priv, work);
+ struct mm_struct *mm = priv->mm;
+ struct kiocb *iocb = priv->iocb;
+ size_t ret;
+
+ use_mm(mm);
+ ret = ep_copy_to_user(priv);
+ unuse_mm(mm);
+
+ /* completing the iocb can drop the ctx and mm, don't touch mm after */
+ aio_complete(iocb, ret, ret);
+
kfree(priv->buf);
kfree(priv);
- return len;
}
static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req)
@@ -596,14 +614,14 @@ static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req)
aio_complete(iocb, req->actual ? req->actual : req->status,
req->status);
} else {
- /* retry() won't report both; so we hide some faults */
+ /* ep_copy_to_user() won't report both; we hide some faults */
if (unlikely(0 != req->status))
DBG(epdata->dev, "%s fault %d len %d\n",
ep->name, req->status, req->actual);
priv->buf = req->buf;
priv->actual = req->actual;
- kick_iocb(iocb);
+ schedule_work(&priv->work);
}
spin_unlock(&epdata->dev->lock);
@@ -633,8 +651,10 @@ fail:
return value;
}
iocb->private = priv;
+ priv->iocb = iocb;
priv->iv = iv;
priv->nr_segs = nr_segs;
+ INIT_WORK(&priv->work, ep_user_copy_worker);
value = get_ready_ep(iocb->ki_filp->f_flags, epdata);
if (unlikely(value < 0)) {
@@ -642,10 +662,11 @@ fail:
goto fail;
}
- iocb->ki_cancel = ep_aio_cancel;
+ kiocb_set_cancel_fn(iocb, ep_aio_cancel);
get_ep(epdata);
priv->epdata = epdata;
priv->actual = 0;
+ priv->mm = current->mm; /* mm teardown waits for iocbs in exit_aio() */
/* each kiocb is coupled to one usb_request, but we can't
* allocate or submit those if the host disconnected.
@@ -674,7 +695,7 @@ fail:
kfree(priv);
put_ep(epdata);
} else
- value = (iv ? -EIOCBRETRY : -EIOCBQUEUED);
+ value = -EIOCBQUEUED;
return value;
}
@@ -692,7 +713,6 @@ ep_aio_read(struct kiocb *iocb, const struct iovec *iov,
if (unlikely(!buf))
return -ENOMEM;
- iocb->ki_retry = ep_aio_read_retry;
return ep_aio_rwtail(iocb, buf, iocb->ki_left, epdata, iov, nr_segs);
}
diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig
index 1534cf3c1423..d8ecc5242cd3 100644
--- a/drivers/video/Kconfig
+++ b/drivers/video/Kconfig
@@ -2453,6 +2453,23 @@ config FB_HYPERV
help
This framebuffer driver supports Microsoft Hyper-V Synthetic Video.
+config FB_SIMPLE
+ bool "Simple framebuffer support"
+ depends on (FB = y) && OF
+ select FB_CFB_FILLRECT
+ select FB_CFB_COPYAREA
+ select FB_CFB_IMAGEBLIT
+ help
+ Say Y if you want support for a simple frame-buffer.
+
+ This driver assumes that the display hardware has been initialized
+ before the kernel boots, and the kernel will simply render to the
+ pre-allocated frame buffer surface.
+
+ Configuration re: surface address, size, and format must be provided
+ through device tree, or potentially plain old platform data in the
+ future.
+
source "drivers/video/omap/Kconfig"
source "drivers/video/omap2/Kconfig"
source "drivers/video/exynos/Kconfig"
diff --git a/drivers/video/Makefile b/drivers/video/Makefile
index 7234e4a959e8..e8bae8dd4804 100644
--- a/drivers/video/Makefile
+++ b/drivers/video/Makefile
@@ -166,6 +166,7 @@ obj-$(CONFIG_FB_MX3) += mx3fb.o
obj-$(CONFIG_FB_DA8XX) += da8xx-fb.o
obj-$(CONFIG_FB_MXS) += mxsfb.o
obj-$(CONFIG_FB_SSD1307) += ssd1307fb.o
+obj-$(CONFIG_FB_SIMPLE) += simplefb.o
# the test framebuffer is last
obj-$(CONFIG_FB_VIRTUAL) += vfb.o
diff --git a/drivers/video/cyber2000fb.c b/drivers/video/cyber2000fb.c
index 57886787ead0..e78d9f2233b8 100644
--- a/drivers/video/cyber2000fb.c
+++ b/drivers/video/cyber2000fb.c
@@ -518,6 +518,9 @@ static void cyber2000fb_set_timing(struct cfb_info *cfb, struct par_info *hw)
cyber2000_grphw(0xb9, 0x00, cfb);
spin_unlock(&cfb->reg_b0_lock);
+ /* wait (for the PLL?) to avoid palette corruption at higher clocks */
+ msleep(1000);
+
cfb->ramdac_ctrl = hw->ramdac;
cyber2000fb_write_ramdac_ctrl(cfb);
diff --git a/drivers/video/simplefb.c b/drivers/video/simplefb.c
new file mode 100644
index 000000000000..e2e9e3e61b72
--- /dev/null
+++ b/drivers/video/simplefb.c
@@ -0,0 +1,234 @@
+/*
+ * Simplest possible simple frame-buffer driver, as a platform device
+ *
+ * Copyright (c) 2013, Stephen Warren
+ *
+ * Based on q40fb.c, which was:
+ * Copyright (C) 2001 Richard Zidlicky <rz@linux-m68k.org>
+ *
+ * Also based on offb.c, which was:
+ * Copyright (C) 1997 Geert Uytterhoeven
+ * Copyright (C) 1996 Paul Mackerras
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/errno.h>
+#include <linux/fb.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+
+static struct fb_fix_screeninfo simplefb_fix = {
+ .id = "simple",
+ .type = FB_TYPE_PACKED_PIXELS,
+ .visual = FB_VISUAL_TRUECOLOR,
+ .accel = FB_ACCEL_NONE,
+};
+
+static struct fb_var_screeninfo simplefb_var = {
+ .height = -1,
+ .width = -1,
+ .activate = FB_ACTIVATE_NOW,
+ .vmode = FB_VMODE_NONINTERLACED,
+};
+
+static int simplefb_setcolreg(u_int regno, u_int red, u_int green, u_int blue,
+ u_int transp, struct fb_info *info)
+{
+ u32 *pal = info->pseudo_palette;
+ u32 cr = red >> (16 - info->var.red.length);
+ u32 cg = green >> (16 - info->var.green.length);
+ u32 cb = blue >> (16 - info->var.blue.length);
+ u32 value;
+
+ if (regno >= 16)
+ return -EINVAL;
+
+ value = (cr << info->var.red.offset) |
+ (cg << info->var.green.offset) |
+ (cb << info->var.blue.offset);
+ if (info->var.transp.length > 0) {
+ u32 mask = (1 << info->var.transp.length) - 1;
+ mask <<= info->var.transp.offset;
+ value |= mask;
+ }
+ pal[regno] = value;
+
+ return 0;
+}
+
+static struct fb_ops simplefb_ops = {
+ .owner = THIS_MODULE,
+ .fb_setcolreg = simplefb_setcolreg,
+ .fb_fillrect = cfb_fillrect,
+ .fb_copyarea = cfb_copyarea,
+ .fb_imageblit = cfb_imageblit,
+};
+
+struct simplefb_format {
+ const char *name;
+ u32 bits_per_pixel;
+ struct fb_bitfield red;
+ struct fb_bitfield green;
+ struct fb_bitfield blue;
+ struct fb_bitfield transp;
+};
+
+static struct simplefb_format simplefb_formats[] = {
+ { "r5g6b5", 16, {11, 5}, {5, 6}, {0, 5}, {0, 0} },
+};
+
+struct simplefb_params {
+ u32 width;
+ u32 height;
+ u32 stride;
+ struct simplefb_format *format;
+};
+
+static int simplefb_parse_dt(struct platform_device *pdev,
+ struct simplefb_params *params)
+{
+ struct device_node *np = pdev->dev.of_node;
+ int ret;
+ const char *format;
+ int i;
+
+ ret = of_property_read_u32(np, "width", &params->width);
+ if (ret) {
+ dev_err(&pdev->dev, "Can't parse width property\n");
+ return ret;
+ }
+
+ ret = of_property_read_u32(np, "height", &params->height);
+ if (ret) {
+ dev_err(&pdev->dev, "Can't parse height property\n");
+ return ret;
+ }
+
+ ret = of_property_read_u32(np, "stride", &params->stride);
+ if (ret) {
+ dev_err(&pdev->dev, "Can't parse stride property\n");
+ return ret;
+ }
+
+ ret = of_property_read_string(np, "format", &format);
+ if (ret) {
+ dev_err(&pdev->dev, "Can't parse format property\n");
+ return ret;
+ }
+ params->format = NULL;
+ for (i = 0; i < ARRAY_SIZE(simplefb_formats); i++) {
+ if (strcmp(format, simplefb_formats[i].name))
+ continue;
+ params->format = &simplefb_formats[i];
+ break;
+ }
+ if (!params->format) {
+ dev_err(&pdev->dev, "Invalid format value\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int simplefb_probe(struct platform_device *pdev)
+{
+ int ret;
+ struct simplefb_params params;
+ struct fb_info *info;
+ struct resource *mem;
+
+ if (fb_get_options("simplefb", NULL))
+ return -ENODEV;
+
+ ret = simplefb_parse_dt(pdev, &params);
+ if (ret)
+ return ret;
+
+ mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ if (!mem) {
+ dev_err(&pdev->dev, "No memory resource\n");
+ return -EINVAL;
+ }
+
+ info = framebuffer_alloc(sizeof(u32) * 16, &pdev->dev);
+ if (!info)
+ return -ENOMEM;
+ platform_set_drvdata(pdev, info);
+
+ info->fix = simplefb_fix;
+ info->fix.smem_start = mem->start;
+ info->fix.smem_len = resource_size(mem);
+ info->fix.line_length = params.stride;
+
+ info->var = simplefb_var;
+ info->var.xres = params.width;
+ info->var.yres = params.height;
+ info->var.xres_virtual = params.width;
+ info->var.yres_virtual = params.height;
+ info->var.bits_per_pixel = params.format->bits_per_pixel;
+ info->var.red = params.format->red;
+ info->var.green = params.format->green;
+ info->var.blue = params.format->blue;
+ info->var.transp = params.format->transp;
+
+ info->fbops = &simplefb_ops;
+ info->flags = FBINFO_DEFAULT;
+ info->screen_base = devm_ioremap(&pdev->dev, info->fix.smem_start,
+ info->fix.smem_len);
+ if (!info->screen_base) {
+ framebuffer_release(info);
+ return -ENODEV;
+ }
+ info->pseudo_palette = (void *)(info + 1);
+
+ ret = register_framebuffer(info);
+ if (ret < 0) {
+ dev_err(&pdev->dev, "Unable to register simplefb: %d\n", ret);
+ framebuffer_release(info);
+ return ret;
+ }
+
+ dev_info(&pdev->dev, "fb%d: simplefb registered!\n", info->node);
+
+ return 0;
+}
+
+static int simplefb_remove(struct platform_device *pdev)
+{
+ struct fb_info *info = platform_get_drvdata(pdev);
+
+ unregister_framebuffer(info);
+ framebuffer_release(info);
+
+ return 0;
+}
+
+static const struct of_device_id simplefb_of_match[] = {
+ { .compatible = "simple-framebuffer", },
+ { },
+};
+MODULE_DEVICE_TABLE(of, simplefb_of_match);
+
+static struct platform_driver simplefb_driver = {
+ .driver = {
+ .name = "simple-framebuffer",
+ .owner = THIS_MODULE,
+ .of_match_table = simplefb_of_match,
+ },
+ .probe = simplefb_probe,
+ .remove = simplefb_remove,
+};
+module_platform_driver(simplefb_driver);
+
+MODULE_AUTHOR("Stephen Warren <swarren@wwwdotorg.org>");
+MODULE_DESCRIPTION("Simple framebuffer driver");
+MODULE_LICENSE("GPL v2");
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 0ad61c6a65a5..055562c580b4 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -33,6 +33,7 @@
#include <linux/pagemap.h>
#include <linux/idr.h>
#include <linux/sched.h>
+#include <linux/aio.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 7e03eadb40c0..a890db4b9898 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -14,6 +14,7 @@
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
+#include <linux/aio.h>
#include "internal.h"
static int afs_write_back_from_locked_page(struct afs_writeback *wb,
diff --git a/fs/aio.c b/fs/aio.c
index 351afe7ac78e..5b7ed7880129 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -8,6 +8,8 @@
*
* See ../COPYING for licensing terms.
*/
+#define pr_fmt(fmt) "%s: " fmt, __func__
+
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
@@ -18,14 +20,14 @@
#include <linux/backing-dev.h>
#include <linux/uio.h>
-#define DEBUG 0
-
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/mman.h>
+#include <linux/bio.h>
#include <linux/mmu_context.h>
+#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/timer.h>
#include <linux/aio.h>
@@ -35,15 +37,111 @@
#include <linux/eventfd.h>
#include <linux/blkdev.h>
#include <linux/compat.h>
+#include <linux/percpu-refcount.h>
#include <asm/kmap_types.h>
#include <asm/uaccess.h>
-#if DEBUG > 1
-#define dprintk printk
-#else
-#define dprintk(x...) do { ; } while (0)
-#endif
+#define AIO_RING_MAGIC 0xa10a10a1
+#define AIO_RING_COMPAT_FEATURES 1
+#define AIO_RING_INCOMPAT_FEATURES 0
+struct aio_ring {
+ unsigned id; /* kernel internal index number */
+ unsigned nr; /* number of io_events */
+ unsigned head;
+ unsigned tail;
+
+ unsigned magic;
+ unsigned compat_features;
+ unsigned incompat_features;
+ unsigned header_length; /* size of aio_ring */
+
+
+ struct io_event io_events[0];
+}; /* 128 bytes + ring size */
+
+#define AIO_RING_PAGES 8
+
+struct kioctx_cpu {
+ unsigned reqs_available;
+};
+
+struct kioctx {
+ struct percpu_ref users;
+
+ /* This needs improving */
+ unsigned long user_id;
+ struct hlist_node list;
+
+ struct __percpu kioctx_cpu *cpu;
+
+ /*
+ * For percpu reqs_available, number of slots we move to/from global
+ * counter at a time:
+ */
+ unsigned req_batch;
+ /*
+ * This is what userspace passed to io_setup(), it's not used for
+ * anything but counting against the global max_reqs quota.
+ *
+ * The real limit is nr_events - 1, which will be larger (see
+ * aio_setup_ring())
+ */
+ unsigned max_reqs;
+
+ /* Size of ringbuffer, in units of struct io_event */
+ unsigned nr_events;
+
+ unsigned long mmap_base;
+ unsigned long mmap_size;
+
+ struct page **ring_pages;
+ long nr_pages;
+
+ struct rcu_head rcu_head;
+ struct work_struct rcu_work;
+
+ struct {
+ /*
+ * This counts the number of available slots in the ringbuffer,
+ * so we avoid overflowing it: it's decremented (if positive)
+ * when allocating a kiocb and incremented when the resulting
+ * io_event is pulled off the ringbuffer.
+ *
+ * We batch accesses to it with a percpu version.
+ */
+ atomic_t reqs_available;
+ } ____cacheline_aligned_in_smp;
+
+ struct {
+ spinlock_t ctx_lock;
+ struct list_head active_reqs; /* used for cancellation */
+ } ____cacheline_aligned_in_smp;
+
+ struct {
+ struct mutex ring_lock;
+ wait_queue_head_t wait;
+
+ /*
+ * Copy of the real tail - to reduce cacheline bouncing. Updated
+ * by aio_complete() whenever it updates the real tail.
+ */
+ unsigned shadow_tail;
+ } ____cacheline_aligned_in_smp;
+
+ struct {
+ /*
+ * This is the canonical copy of the tail pointer, updated by
+ * aio_complete(). But aio_complete() also uses it as a lock, so
+ * other code can't use it; aio_complete() keeps shadow_tail in
+ * sync with the real value of the tail pointer for other code
+ * to use.
+ */
+ unsigned tail;
+ } ____cacheline_aligned_in_smp;
+
+ struct page *internal_pages[AIO_RING_PAGES];
+};
/*------ sysctl variables----*/
static DEFINE_SPINLOCK(aio_nr_lock);
@@ -54,11 +152,6 @@ unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio request
static struct kmem_cache *kiocb_cachep;
static struct kmem_cache *kioctx_cachep;
-static struct workqueue_struct *aio_wq;
-
-static void aio_kick_handler(struct work_struct *);
-static void aio_queue_work(struct kioctx *);
-
/* aio_setup
* Creates the slab caches used by the aio routines, panic on
* failure as this is done early during the boot sequence.
@@ -68,10 +161,7 @@ static int __init aio_setup(void)
kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
- aio_wq = alloc_workqueue("aio", 0, 1); /* used to limit concurrency */
- BUG_ON(!aio_wq);
-
- pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
+ pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page));
return 0;
}
@@ -79,28 +169,23 @@ __initcall(aio_setup);
static void aio_free_ring(struct kioctx *ctx)
{
- struct aio_ring_info *info = &ctx->ring_info;
long i;
- for (i=0; i<info->nr_pages; i++)
- put_page(info->ring_pages[i]);
+ for (i = 0; i < ctx->nr_pages; i++)
+ put_page(ctx->ring_pages[i]);
- if (info->mmap_size) {
- BUG_ON(ctx->mm != current->mm);
- vm_munmap(info->mmap_base, info->mmap_size);
- }
+ if (ctx->mmap_size)
+ vm_munmap(ctx->mmap_base, ctx->mmap_size);
- if (info->ring_pages && info->ring_pages != info->internal_pages)
- kfree(info->ring_pages);
- info->ring_pages = NULL;
- info->nr = 0;
+ if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages)
+ kfree(ctx->ring_pages);
}
static int aio_setup_ring(struct kioctx *ctx)
{
struct aio_ring *ring;
- struct aio_ring_info *info = &ctx->ring_info;
unsigned nr_events = ctx->max_reqs;
+ struct mm_struct *mm = current->mm;
unsigned long size, populate;
int nr_pages;
@@ -116,46 +201,44 @@ static int aio_setup_ring(struct kioctx *ctx)
nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event);
- info->nr = 0;
- info->ring_pages = info->internal_pages;
+ ctx->nr_events = 0;
+ ctx->ring_pages = ctx->internal_pages;
if (nr_pages > AIO_RING_PAGES) {
- info->ring_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
- if (!info->ring_pages)
+ ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
+ GFP_KERNEL);
+ if (!ctx->ring_pages)
return -ENOMEM;
}
- info->mmap_size = nr_pages * PAGE_SIZE;
- dprintk("attempting mmap of %lu bytes\n", info->mmap_size);
- down_write(&ctx->mm->mmap_sem);
- info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size,
- PROT_READ|PROT_WRITE,
- MAP_ANONYMOUS|MAP_PRIVATE, 0,
- &populate);
- if (IS_ERR((void *)info->mmap_base)) {
- up_write(&ctx->mm->mmap_sem);
- info->mmap_size = 0;
+ ctx->mmap_size = nr_pages * PAGE_SIZE;
+ pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size);
+ down_write(&mm->mmap_sem);
+ ctx->mmap_base = do_mmap_pgoff(NULL, 0, ctx->mmap_size,
+ PROT_READ|PROT_WRITE,
+ MAP_ANONYMOUS|MAP_PRIVATE, 0, &populate);
+ if (IS_ERR((void *)ctx->mmap_base)) {
+ up_write(&mm->mmap_sem);
+ ctx->mmap_size = 0;
aio_free_ring(ctx);
return -EAGAIN;
}
- dprintk("mmap address: 0x%08lx\n", info->mmap_base);
- info->nr_pages = get_user_pages(current, ctx->mm,
- info->mmap_base, nr_pages,
- 1, 0, info->ring_pages, NULL);
- up_write(&ctx->mm->mmap_sem);
+ pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
+ ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages,
+ 1, 0, ctx->ring_pages, NULL);
+ up_write(&mm->mmap_sem);
- if (unlikely(info->nr_pages != nr_pages)) {
+ if (unlikely(ctx->nr_pages != nr_pages)) {
aio_free_ring(ctx);
return -EAGAIN;
}
if (populate)
- mm_populate(info->mmap_base, populate);
+ mm_populate(ctx->mmap_base, populate);
- ctx->user_id = info->mmap_base;
+ ctx->user_id = ctx->mmap_base;
+ ctx->nr_events = nr_events; /* trusted copy */
- info->nr = nr_events; /* trusted copy */
-
- ring = kmap_atomic(info->ring_pages[0]);
+ ring = kmap_atomic(ctx->ring_pages[0]);
ring->nr = nr_events; /* user copy */
ring->id = ctx->user_id;
ring->head = ring->tail = 0;
@@ -164,72 +247,145 @@ static int aio_setup_ring(struct kioctx *ctx)
ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;
ring->header_length = sizeof(struct aio_ring);
kunmap_atomic(ring);
+ flush_dcache_page(ctx->ring_pages[0]);
return 0;
}
-
-/* aio_ring_event: returns a pointer to the event at the given index from
- * kmap_atomic(). Release the pointer with put_aio_ring_event();
- */
#define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event))
#define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
#define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
-#define aio_ring_event(info, nr) ({ \
- unsigned pos = (nr) + AIO_EVENTS_OFFSET; \
- struct io_event *__event; \
- __event = kmap_atomic( \
- (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE]); \
- __event += pos % AIO_EVENTS_PER_PAGE; \
- __event; \
-})
-
-#define put_aio_ring_event(event) do { \
- struct io_event *__event = (event); \
- (void)__event; \
- kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK)); \
-} while(0)
-
-static void ctx_rcu_free(struct rcu_head *head)
+void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel)
+{
+ struct kioctx *ctx = req->ki_ctx;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->ctx_lock, flags);
+
+ if (!req->ki_list.next)
+ list_add(&req->ki_list, &ctx->active_reqs);
+
+ req->ki_cancel = cancel;
+
+ spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+}
+EXPORT_SYMBOL(kiocb_set_cancel_fn);
+
+static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb,
+ struct io_event *res)
+{
+ kiocb_cancel_fn *old, *cancel;
+ int ret = -EINVAL;
+
+ /*
+ * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it
+ * actually has a cancel function, hence the cmpxchg()
+ */
+
+ cancel = ACCESS_ONCE(kiocb->ki_cancel);
+ do {
+ if (!cancel || cancel == KIOCB_CANCELLED)
+ return ret;
+
+ old = cancel;
+ cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED);
+ } while (cancel != old);
+
+ atomic_inc(&kiocb->ki_users);
+ spin_unlock_irq(&ctx->ctx_lock);
+
+ memset(res, 0, sizeof(*res));
+ res->obj = (u64)(unsigned long)kiocb->ki_obj.user;
+ res->data = kiocb->ki_user_data;
+ ret = cancel(kiocb, res);
+
+ spin_lock_irq(&ctx->ctx_lock);
+
+ return ret;
+}
+
+static void free_ioctx_rcu(struct rcu_head *head)
{
struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
+
+ free_percpu(ctx->cpu);
kmem_cache_free(kioctx_cachep, ctx);
}
-/* __put_ioctx
- * Called when the last user of an aio context has gone away,
- * and the struct needs to be freed.
+/*
+ * When this function runs, the kioctx has been removed from the "hash table"
+ * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
+ * now it's safe to cancel any that need to be.
*/
-static void __put_ioctx(struct kioctx *ctx)
+static void free_ioctx(struct kioctx *ctx)
{
- unsigned nr_events = ctx->max_reqs;
- BUG_ON(ctx->reqs_active);
+ struct aio_ring *ring;
+ struct io_event res;
+ struct kiocb *req;
+ unsigned cpu, head, avail;
- cancel_delayed_work_sync(&ctx->wq);
- aio_free_ring(ctx);
- mmdrop(ctx->mm);
- ctx->mm = NULL;
- if (nr_events) {
- spin_lock(&aio_nr_lock);
- BUG_ON(aio_nr - nr_events > aio_nr);
- aio_nr -= nr_events;
- spin_unlock(&aio_nr_lock);
+ spin_lock_irq(&ctx->ctx_lock);
+
+ while (!list_empty(&ctx->active_reqs)) {
+ req = list_first_entry(&ctx->active_reqs,
+ struct kiocb, ki_list);
+
+ list_del_init(&req->ki_list);
+ kiocb_cancel(ctx, req, &res);
}
- pr_debug("__put_ioctx: freeing %p\n", ctx);
- call_rcu(&ctx->rcu_head, ctx_rcu_free);
-}
-static inline int try_get_ioctx(struct kioctx *kioctx)
-{
- return atomic_inc_not_zero(&kioctx->users);
+ spin_unlock_irq(&ctx->ctx_lock);
+
+ for_each_possible_cpu(cpu) {
+ struct kioctx_cpu *kcpu = per_cpu_ptr(ctx->cpu, cpu);
+
+ atomic_add(kcpu->reqs_available, &ctx->reqs_available);
+ kcpu->reqs_available = 0;
+ }
+
+ ring = kmap_atomic(ctx->ring_pages[0]);
+ head = ring->head;
+ kunmap_atomic(ring);
+
+ while (atomic_read(&ctx->reqs_available) < ctx->nr_events - 1) {
+ wait_event(ctx->wait,
+ (head != ctx->shadow_tail) ||
+ (atomic_read(&ctx->reqs_available) >= ctx->nr_events - 1));
+
+ avail = (head <= ctx->shadow_tail
+ ? ctx->shadow_tail : ctx->nr_events) - head;
+
+ atomic_add(avail, &ctx->reqs_available);
+ head += avail;
+ head %= ctx->nr_events;
+ }
+
+ WARN_ON(atomic_read(&ctx->reqs_available) > ctx->nr_events - 1);
+
+ aio_free_ring(ctx);
+
+ spin_lock(&aio_nr_lock);
+ BUG_ON(aio_nr - ctx->max_reqs > aio_nr);
+ aio_nr -= ctx->max_reqs;
+ spin_unlock(&aio_nr_lock);
+
+ pr_debug("freeing %p\n", ctx);
+
+ /*
+ * Here the call_rcu() is between the wait_event() for reqs_active to
+ * hit 0, and freeing the ioctx.
+ *
+ * aio_complete() decrements reqs_active, but it has to touch the ioctx
+ * after to issue a wakeup so we use rcu.
+ */
+ call_rcu(&ctx->rcu_head, free_ioctx_rcu);
}
-static inline void put_ioctx(struct kioctx *kioctx)
+static void put_ioctx(struct kioctx *ctx)
{
- BUG_ON(atomic_read(&kioctx->users) <= 0);
- if (unlikely(atomic_dec_and_test(&kioctx->users)))
- __put_ioctx(kioctx);
+ if (percpu_ref_put(&ctx->users))
+ free_ioctx(ctx);
}
/* ioctx_alloc
@@ -237,10 +393,22 @@ static inline void put_ioctx(struct kioctx *kioctx)
*/
static struct kioctx *ioctx_alloc(unsigned nr_events)
{
- struct mm_struct *mm;
+ struct mm_struct *mm = current->mm;
struct kioctx *ctx;
int err = -ENOMEM;
+ /*
+ * We keep track of the number of available ringbuffer slots, to prevent
+ * overflow (reqs_available), and we also use percpu counters for this.
+ *
+ * So since up to half the slots might be on other cpu's percpu counters
+ * and unavailable, double nr_events so userspace sees what they
+ * expected: additionally, we move req_batch slots to/from percpu
+ * counters at a time, so make sure that isn't 0:
+ */
+ nr_events = max(nr_events, num_possible_cpus() * 4);
+ nr_events *= 2;
+
/* Prevent overflows */
if ((nr_events > (0x10000000U / sizeof(struct io_event))) ||
(nr_events > (0x10000000U / sizeof(struct kiocb)))) {
@@ -256,21 +424,29 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
return ERR_PTR(-ENOMEM);
ctx->max_reqs = nr_events;
- mm = ctx->mm = current->mm;
- atomic_inc(&mm->mm_count);
- atomic_set(&ctx->users, 2);
+ percpu_ref_init(&ctx->users);
+ rcu_read_lock();
+ percpu_ref_get(&ctx->users);
+ rcu_read_unlock();
+
spin_lock_init(&ctx->ctx_lock);
- spin_lock_init(&ctx->ring_info.ring_lock);
+ mutex_init(&ctx->ring_lock);
init_waitqueue_head(&ctx->wait);
INIT_LIST_HEAD(&ctx->active_reqs);
- INIT_LIST_HEAD(&ctx->run_list);
- INIT_DELAYED_WORK(&ctx->wq, aio_kick_handler);
- if (aio_setup_ring(ctx) < 0)
+ ctx->cpu = alloc_percpu(struct kioctx_cpu);
+ if (!ctx->cpu)
goto out_freectx;
+ if (aio_setup_ring(ctx) < 0)
+ goto out_freepcpu;
+
+ atomic_set(&ctx->reqs_available, ctx->nr_events - 1);
+ ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4);
+ BUG_ON(!ctx->req_batch);
+
/* limit the number of system wide aios */
spin_lock(&aio_nr_lock);
if (aio_nr + nr_events > aio_max_nr ||
@@ -286,64 +462,58 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
hlist_add_head_rcu(&ctx->list, &mm->ioctx_list);
spin_unlock(&mm->ioctx_lock);
- dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
- ctx, ctx->user_id, current->mm, ctx->ring_info.nr);
+ pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
+ ctx, ctx->user_id, mm, ctx->nr_events);
return ctx;
out_cleanup:
err = -EAGAIN;
aio_free_ring(ctx);
+out_freepcpu:
+ free_percpu(ctx->cpu);
out_freectx:
- mmdrop(mm);
kmem_cache_free(kioctx_cachep, ctx);
- dprintk("aio: error allocating ioctx %d\n", err);
+ pr_debug("error allocating ioctx %d\n", err);
return ERR_PTR(err);
}
-/* kill_ctx
- * Cancels all outstanding aio requests on an aio context. Used
- * when the processes owning a context have all exited to encourage
- * the rapid destruction of the kioctx.
- */
-static void kill_ctx(struct kioctx *ctx)
+static void kill_ioctx_work(struct work_struct *work)
{
- int (*cancel)(struct kiocb *, struct io_event *);
- struct task_struct *tsk = current;
- DECLARE_WAITQUEUE(wait, tsk);
- struct io_event res;
+ struct kioctx *ctx = container_of(work, struct kioctx, rcu_work);
- spin_lock_irq(&ctx->ctx_lock);
- ctx->dead = 1;
- while (!list_empty(&ctx->active_reqs)) {
- struct list_head *pos = ctx->active_reqs.next;
- struct kiocb *iocb = list_kiocb(pos);
- list_del_init(&iocb->ki_list);
- cancel = iocb->ki_cancel;
- kiocbSetCancelled(iocb);
- if (cancel) {
- iocb->ki_users++;
- spin_unlock_irq(&ctx->ctx_lock);
- cancel(iocb, &res);
- spin_lock_irq(&ctx->ctx_lock);
- }
- }
+ wake_up_all(&ctx->wait);
+ put_ioctx(ctx);
+}
- if (!ctx->reqs_active)
- goto out;
+static void kill_ioctx_rcu(struct rcu_head *head)
+{
+ struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
- add_wait_queue(&ctx->wait, &wait);
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
- while (ctx->reqs_active) {
- spin_unlock_irq(&ctx->ctx_lock);
- io_schedule();
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
- spin_lock_irq(&ctx->ctx_lock);
- }
- __set_task_state(tsk, TASK_RUNNING);
- remove_wait_queue(&ctx->wait, &wait);
+ INIT_WORK(&ctx->rcu_work, kill_ioctx_work);
+ schedule_work(&ctx->rcu_work);
+}
-out:
- spin_unlock_irq(&ctx->ctx_lock);
+/* kill_ioctx
+ * Cancels all outstanding aio requests on an aio context. Used
+ * when the processes owning a context have all exited to encourage
+ * the rapid destruction of the kioctx.
+ */
+static void kill_ioctx(struct kioctx *ctx)
+{
+ if (percpu_ref_kill(&ctx->users)) {
+ hlist_del_rcu(&ctx->list);
+ /* Between hlist_del_rcu() and dropping the initial ref */
+ synchronize_rcu();
+
+ /*
+ * We can't punt to workqueue here because put_ioctx() ->
+ * free_ioctx() will unmap the ringbuffer, and that has to be
+ * done in the original process's context. kill_ioctx_rcu/work()
+ * exist for exit_aio(), as in that path free_ioctx() won't do
+ * the unmap.
+ */
+ kill_ioctx_work(&ctx->rcu_work);
+ }
}
/* wait_on_sync_kiocb:
@@ -351,9 +521,9 @@ out:
*/
ssize_t wait_on_sync_kiocb(struct kiocb *iocb)
{
- while (iocb->ki_users) {
+ while (atomic_read(&iocb->ki_users)) {
set_current_state(TASK_UNINTERRUPTIBLE);
- if (!iocb->ki_users)
+ if (!atomic_read(&iocb->ki_users))
break;
io_schedule();
}
@@ -362,28 +532,20 @@ ssize_t wait_on_sync_kiocb(struct kiocb *iocb)
}
EXPORT_SYMBOL(wait_on_sync_kiocb);
-/* exit_aio: called when the last user of mm goes away. At this point,
- * there is no way for any new requests to be submited or any of the
- * io_* syscalls to be called on the context. However, there may be
- * outstanding requests which hold references to the context; as they
- * go away, they will call put_ioctx and release any pinned memory
- * associated with the request (held via struct page * references).
+/*
+ * exit_aio: called when the last user of mm goes away. At this point, there is
+ * no way for any new requests to be submited or any of the io_* syscalls to be
+ * called on the context.
+ *
+ * There may be outstanding kiocbs, but free_ioctx() will explicitly wait on
+ * them.
*/
void exit_aio(struct mm_struct *mm)
{
struct kioctx *ctx;
+ struct hlist_node *n;
- while (!hlist_empty(&mm->ioctx_list)) {
- ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list);
- hlist_del_rcu(&ctx->list);
-
- kill_ctx(ctx);
-
- if (1 != atomic_read(&ctx->users))
- printk(KERN_DEBUG
- "exit_aio:ioctx still alive: %d %d %d\n",
- atomic_read(&ctx->users), ctx->dead,
- ctx->reqs_active);
+ hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) {
/*
* We don't need to bother with munmap() here -
* exit_mmap(mm) is coming and it'll unmap everything.
@@ -391,150 +553,95 @@ void exit_aio(struct mm_struct *mm)
* as indicator that it needs to unmap the area,
* just set it to 0; aio_free_ring() is the only
* place that uses ->mmap_size, so it's safe.
- * That way we get all munmap done to current->mm -
- * all other callers have ctx->mm == current->mm.
*/
- ctx->ring_info.mmap_size = 0;
- put_ioctx(ctx);
+ ctx->mmap_size = 0;
+
+ if (percpu_ref_kill(&ctx->users)) {
+ hlist_del_rcu(&ctx->list);
+ call_rcu(&ctx->rcu_head, kill_ioctx_rcu);
+ }
}
}
-/* aio_get_req
- * Allocate a slot for an aio request. Increments the users count
- * of the kioctx so that the kioctx stays around until all requests are
- * complete. Returns NULL if no requests are free.
- *
- * Returns with kiocb->users set to 2. The io submit code path holds
- * an extra reference while submitting the i/o.
- * This prevents races between the aio code path referencing the
- * req (after submitting it) and aio_complete() freeing the req.
- */
-static struct kiocb *__aio_get_req(struct kioctx *ctx)
+static void put_reqs_available(struct kioctx *ctx, unsigned nr)
{
- struct kiocb *req = NULL;
+ struct kioctx_cpu *kcpu;
- req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
- if (unlikely(!req))
- return NULL;
+ preempt_disable();
+ kcpu = this_cpu_ptr(ctx->cpu);
- req->ki_flags = 0;
- req->ki_users = 2;
- req->ki_key = 0;
- req->ki_ctx = ctx;
- req->ki_cancel = NULL;
- req->ki_retry = NULL;
- req->ki_dtor = NULL;
- req->private = NULL;
- req->ki_iovec = NULL;
- INIT_LIST_HEAD(&req->ki_run_list);
- req->ki_eventfd = NULL;
+ kcpu->reqs_available += nr;
+ while (kcpu->reqs_available >= ctx->req_batch * 2) {
+ kcpu->reqs_available -= ctx->req_batch;
+ atomic_add(ctx->req_batch, &ctx->reqs_available);
+ }
- return req;
+ preempt_enable();
}
-/*
- * struct kiocb's are allocated in batches to reduce the number of
- * times the ctx lock is acquired and released.
- */
-#define KIOCB_BATCH_SIZE 32L
-struct kiocb_batch {
- struct list_head head;
- long count; /* number of requests left to allocate */
-};
-
-static void kiocb_batch_init(struct kiocb_batch *batch, long total)
+static bool get_reqs_available(struct kioctx *ctx)
{
- INIT_LIST_HEAD(&batch->head);
- batch->count = total;
-}
+ struct kioctx_cpu *kcpu;
+ bool ret = false;
-static void kiocb_batch_free(struct kioctx *ctx, struct kiocb_batch *batch)
-{
- struct kiocb *req, *n;
+ preempt_disable();
+ kcpu = this_cpu_ptr(ctx->cpu);
- if (list_empty(&batch->head))
- return;
+ if (!kcpu->reqs_available) {
+ int old, avail = atomic_read(&ctx->reqs_available);
- spin_lock_irq(&ctx->ctx_lock);
- list_for_each_entry_safe(req, n, &batch->head, ki_batch) {
- list_del(&req->ki_batch);
- list_del(&req->ki_list);
- kmem_cache_free(kiocb_cachep, req);
- ctx->reqs_active--;
- }
- if (unlikely(!ctx->reqs_active && ctx->dead))
- wake_up_all(&ctx->wait);
- spin_unlock_irq(&ctx->ctx_lock);
-}
-
-/*
- * Allocate a batch of kiocbs. This avoids taking and dropping the
- * context lock a lot during setup.
- */
-static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch)
-{
- unsigned short allocated, to_alloc;
- long avail;
- struct kiocb *req, *n;
- struct aio_ring *ring;
-
- to_alloc = min(batch->count, KIOCB_BATCH_SIZE);
- for (allocated = 0; allocated < to_alloc; allocated++) {
- req = __aio_get_req(ctx);
- if (!req)
- /* allocation failed, go with what we've got */
- break;
- list_add(&req->ki_batch, &batch->head);
- }
-
- if (allocated == 0)
- goto out;
+ do {
+ if (avail < ctx->req_batch)
+ goto out;
- spin_lock_irq(&ctx->ctx_lock);
- ring = kmap_atomic(ctx->ring_info.ring_pages[0]);
-
- avail = aio_ring_avail(&ctx->ring_info, ring) - ctx->reqs_active;
- BUG_ON(avail < 0);
- if (avail < allocated) {
- /* Trim back the number of requests. */
- list_for_each_entry_safe(req, n, &batch->head, ki_batch) {
- list_del(&req->ki_batch);
- kmem_cache_free(kiocb_cachep, req);
- if (--allocated <= avail)
- break;
- }
- }
+ old = avail;
+ avail = atomic_cmpxchg(&ctx->reqs_available,
+ avail, avail - ctx->req_batch);
+ } while (avail != old);
- batch->count -= allocated;
- list_for_each_entry(req, &batch->head, ki_batch) {
- list_add(&req->ki_list, &ctx->active_reqs);
- ctx->reqs_active++;
+ kcpu->reqs_available += ctx->req_batch;
}
- kunmap_atomic(ring);
- spin_unlock_irq(&ctx->ctx_lock);
-
+ ret = true;
+ kcpu->reqs_available--;
out:
- return allocated;
+ preempt_enable();
+ return ret;
}
-static inline struct kiocb *aio_get_req(struct kioctx *ctx,
- struct kiocb_batch *batch)
+/* aio_get_req
+ * Allocate a slot for an aio request. Increments the ki_users count
+ * of the kioctx so that the kioctx stays around until all requests are
+ * complete. Returns NULL if no requests are free.
+ *
+ * Returns with kiocb->ki_users set to 2. The io submit code path holds
+ * an extra reference while submitting the i/o.
+ * This prevents races between the aio code path referencing the
+ * req (after submitting it) and aio_complete() freeing the req.
+ */
+static inline struct kiocb *aio_get_req(struct kioctx *ctx)
{
struct kiocb *req;
- if (list_empty(&batch->head))
- if (kiocb_batch_refill(ctx, batch) == 0)
- return NULL;
- req = list_first_entry(&batch->head, struct kiocb, ki_batch);
- list_del(&req->ki_batch);
+ if (!get_reqs_available(ctx))
+ return NULL;
+
+ req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
+ if (unlikely(!req))
+ goto out_put;
+
+ atomic_set(&req->ki_users, 2);
+ req->ki_ctx = ctx;
return req;
+out_put:
+ put_reqs_available(ctx, 1);
+ return NULL;
}
-static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
+static void kiocb_free(struct kiocb *req)
{
- assert_spin_locked(&ctx->ctx_lock);
-
+ if (req->ki_filp)
+ fput(req->ki_filp);
if (req->ki_eventfd != NULL)
eventfd_ctx_put(req->ki_eventfd);
if (req->ki_dtor)
@@ -542,48 +649,12 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
if (req->ki_iovec != &req->ki_inline_vec)
kfree(req->ki_iovec);
kmem_cache_free(kiocb_cachep, req);
- ctx->reqs_active--;
-
- if (unlikely(!ctx->reqs_active && ctx->dead))
- wake_up_all(&ctx->wait);
}
-/* __aio_put_req
- * Returns true if this put was the last user of the request.
- */
-static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
+void aio_put_req(struct kiocb *req)
{
- dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n",
- req, atomic_long_read(&req->ki_filp->f_count));
-
- assert_spin_locked(&ctx->ctx_lock);
-
- req->ki_users--;
- BUG_ON(req->ki_users < 0);
- if (likely(req->ki_users))
- return 0;
- list_del(&req->ki_list); /* remove from active_reqs */
- req->ki_cancel = NULL;
- req->ki_retry = NULL;
-
- fput(req->ki_filp);
- req->ki_filp = NULL;
- really_put_req(ctx, req);
- return 1;
-}
-
-/* aio_put_req
- * Returns true if this put was the last user of the kiocb,
- * false if the request is still in use.
- */
-int aio_put_req(struct kiocb *req)
-{
- struct kioctx *ctx = req->ki_ctx;
- int ret;
- spin_lock_irq(&ctx->ctx_lock);
- ret = __aio_put_req(ctx, req);
- spin_unlock_irq(&ctx->ctx_lock);
- return ret;
+ if (atomic_dec_and_test(&req->ki_users))
+ kiocb_free(req);
}
EXPORT_SYMBOL(aio_put_req);
@@ -595,13 +666,8 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
rcu_read_lock();
hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) {
- /*
- * RCU protects us against accessing freed memory but
- * we have to be careful not to get a reference when the
- * reference count already dropped to 0 (ctx->dead test
- * is unreliable because of races).
- */
- if (ctx->user_id == ctx_id && !ctx->dead && try_get_ioctx(ctx)){
+ if (ctx->user_id == ctx_id) {
+ percpu_ref_get(&ctx->users);
ret = ctx;
break;
}
@@ -611,610 +677,332 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
return ret;
}
-/*
- * Queue up a kiocb to be retried. Assumes that the kiocb
- * has already been marked as kicked, and places it on
- * the retry run list for the corresponding ioctx, if it
- * isn't already queued. Returns 1 if it actually queued
- * the kiocb (to tell the caller to activate the work
- * queue to process it), or 0, if it found that it was
- * already queued.
- */
-static inline int __queue_kicked_iocb(struct kiocb *iocb)
+static inline unsigned kioctx_ring_put(struct kioctx *ctx, struct kiocb *req,
+ unsigned tail)
{
- struct kioctx *ctx = iocb->ki_ctx;
-
- assert_spin_locked(&ctx->ctx_lock);
+ struct io_event *ev_page, *event;
+ unsigned pos = tail + AIO_EVENTS_OFFSET;
- if (list_empty(&iocb->ki_run_list)) {
- list_add_tail(&iocb->ki_run_list,
- &ctx->run_list);
- return 1;
- }
- return 0;
-}
+ if (++tail >= ctx->nr_events)
+ tail = 0;
-/* aio_run_iocb
- * This is the core aio execution routine. It is
- * invoked both for initial i/o submission and
- * subsequent retries via the aio_kick_handler.
- * Expects to be invoked with iocb->ki_ctx->lock
- * already held. The lock is released and reacquired
- * as needed during processing.
- *
- * Calls the iocb retry method (already setup for the
- * iocb on initial submission) for operation specific
- * handling, but takes care of most of common retry
- * execution details for a given iocb. The retry method
- * needs to be non-blocking as far as possible, to avoid
- * holding up other iocbs waiting to be serviced by the
- * retry kernel thread.
- *
- * The trickier parts in this code have to do with
- * ensuring that only one retry instance is in progress
- * for a given iocb at any time. Providing that guarantee
- * simplifies the coding of individual aio operations as
- * it avoids various potential races.
- */
-static ssize_t aio_run_iocb(struct kiocb *iocb)
-{
- struct kioctx *ctx = iocb->ki_ctx;
- ssize_t (*retry)(struct kiocb *);
- ssize_t ret;
+ ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
+ event = ev_page + pos % AIO_EVENTS_PER_PAGE;
- if (!(retry = iocb->ki_retry)) {
- printk("aio_run_iocb: iocb->ki_retry = NULL\n");
- return 0;
- }
+ event->obj = (u64)(unsigned long)req->ki_obj.user;
+ event->data = req->ki_user_data;
+ event->res = req->ki_res;
+ event->res2 = req->ki_res2;
- /*
- * We don't want the next retry iteration for this
- * operation to start until this one has returned and
- * updated the iocb state. However, wait_queue functions
- * can trigger a kick_iocb from interrupt context in the
- * meantime, indicating that data is available for the next
- * iteration. We want to remember that and enable the
- * next retry iteration _after_ we are through with
- * this one.
- *
- * So, in order to be able to register a "kick", but
- * prevent it from being queued now, we clear the kick
- * flag, but make the kick code *think* that the iocb is
- * still on the run list until we are actually done.
- * When we are done with this iteration, we check if
- * the iocb was kicked in the meantime and if so, queue
- * it up afresh.
- */
+ kunmap_atomic(ev_page);
+ flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
- kiocbClearKicked(iocb);
+ pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n",
+ ctx, tail, req, req->ki_obj.user, req->ki_user_data,
+ req->ki_res, req->ki_res2);
- /*
- * This is so that aio_complete knows it doesn't need to
- * pull the iocb off the run list (We can't just call
- * INIT_LIST_HEAD because we don't want a kick_iocb to
- * queue this on the run list yet)
- */
- iocb->ki_run_list.next = iocb->ki_run_list.prev = NULL;
- spin_unlock_irq(&ctx->ctx_lock);
+ return tail;
+}
- /* Quit retrying if the i/o has been cancelled */
- if (kiocbIsCancelled(iocb)) {
- ret = -EINTR;
- aio_complete(iocb, ret, 0);
- /* must not access the iocb after this */
- goto out;
- }
+static inline unsigned kioctx_ring_lock(struct kioctx *ctx)
+{
+ unsigned tail;
/*
- * Now we are all set to call the retry method in async
- * context.
+ * ctx->tail is both our lock and the canonical version of the tail
+ * pointer.
*/
- ret = retry(iocb);
-
- if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) {
- /*
- * There's no easy way to restart the syscall since other AIO's
- * may be already running. Just fail this IO with EINTR.
- */
- if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
- ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK))
- ret = -EINTR;
- aio_complete(iocb, ret, 0);
- }
-out:
- spin_lock_irq(&ctx->ctx_lock);
-
- if (-EIOCBRETRY == ret) {
- /*
- * OK, now that we are done with this iteration
- * and know that there is more left to go,
- * this is where we let go so that a subsequent
- * "kick" can start the next iteration
- */
+ while ((tail = xchg(&ctx->tail, UINT_MAX)) == UINT_MAX)
+ cpu_relax();
- /* will make __queue_kicked_iocb succeed from here on */
- INIT_LIST_HEAD(&iocb->ki_run_list);
- /* we must queue the next iteration ourselves, if it
- * has already been kicked */
- if (kiocbIsKicked(iocb)) {
- __queue_kicked_iocb(iocb);
-
- /*
- * __queue_kicked_iocb will always return 1 here, because
- * iocb->ki_run_list is empty at this point so it should
- * be safe to unconditionally queue the context into the
- * work queue.
- */
- aio_queue_work(ctx);
- }
- }
- return ret;
+ return tail;
}
-/*
- * __aio_run_iocbs:
- * Process all pending retries queued on the ioctx
- * run list.
- * Assumes it is operating within the aio issuer's mm
- * context.
- */
-static int __aio_run_iocbs(struct kioctx *ctx)
+static inline void kioctx_ring_unlock(struct kioctx *ctx, unsigned tail)
{
- struct kiocb *iocb;
- struct list_head run_list;
+ struct aio_ring *ring;
- assert_spin_locked(&ctx->ctx_lock);
+ if (!ctx)
+ return;
- list_replace_init(&ctx->run_list, &run_list);
- while (!list_empty(&run_list)) {
- iocb = list_entry(run_list.next, struct kiocb,
- ki_run_list);
- list_del(&iocb->ki_run_list);
- /*
- * Hold an extra reference while retrying i/o.
- */
- iocb->ki_users++; /* grab extra reference */
- aio_run_iocb(iocb);
- __aio_put_req(ctx, iocb);
- }
- if (!list_empty(&ctx->run_list))
- return 1;
- return 0;
-}
+ smp_wmb();
+ /* make event visible before updating tail */
-static void aio_queue_work(struct kioctx * ctx)
-{
- unsigned long timeout;
- /*
- * if someone is waiting, get the work started right
- * away, otherwise, use a longer delay
- */
- smp_mb();
- if (waitqueue_active(&ctx->wait))
- timeout = 1;
- else
- timeout = HZ/10;
- queue_delayed_work(aio_wq, &ctx->wq, timeout);
-}
+ ctx->shadow_tail = tail;
-/*
- * aio_run_all_iocbs:
- * Process all pending retries queued on the ioctx
- * run list, and keep running them until the list
- * stays empty.
- * Assumes it is operating within the aio issuer's mm context.
- */
-static inline void aio_run_all_iocbs(struct kioctx *ctx)
-{
- spin_lock_irq(&ctx->ctx_lock);
- while (__aio_run_iocbs(ctx))
- ;
- spin_unlock_irq(&ctx->ctx_lock);
-}
+ ring = kmap_atomic(ctx->ring_pages[0]);
+ ring->tail = tail;
+ kunmap_atomic(ring);
+ flush_dcache_page(ctx->ring_pages[0]);
-/*
- * aio_kick_handler:
- * Work queue handler triggered to process pending
- * retries on an ioctx. Takes on the aio issuer's
- * mm context before running the iocbs, so that
- * copy_xxx_user operates on the issuer's address
- * space.
- * Run on aiod's context.
- */
-static void aio_kick_handler(struct work_struct *work)
-{
- struct kioctx *ctx = container_of(work, struct kioctx, wq.work);
- mm_segment_t oldfs = get_fs();
- struct mm_struct *mm;
- int requeue;
+ /* unlock, make new tail visible before checking waitlist */
+ smp_mb();
- set_fs(USER_DS);
- use_mm(ctx->mm);
- spin_lock_irq(&ctx->ctx_lock);
- requeue =__aio_run_iocbs(ctx);
- mm = ctx->mm;
- spin_unlock_irq(&ctx->ctx_lock);
- unuse_mm(mm);
- set_fs(oldfs);
- /*
- * we're in a worker thread already; no point using non-zero delay
- */
- if (requeue)
- queue_delayed_work(aio_wq, &ctx->wq, 0);
-}
+ ctx->tail = tail;
+ if (waitqueue_active(&ctx->wait))
+ wake_up(&ctx->wait);
+}
-/*
- * Called by kick_iocb to queue the kiocb for retry
- * and if required activate the aio work queue to process
- * it
- */
-static void try_queue_kicked_iocb(struct kiocb *iocb)
+void batch_complete_aio(struct batch_complete *batch)
{
- struct kioctx *ctx = iocb->ki_ctx;
+ struct kioctx *ctx = NULL;
+ struct eventfd_ctx *eventfd = NULL;
+ struct rb_node *n;
unsigned long flags;
- int run = 0;
+ unsigned tail = 0;
- spin_lock_irqsave(&ctx->ctx_lock, flags);
- /* set this inside the lock so that we can't race with aio_run_iocb()
- * testing it and putting the iocb on the run list under the lock */
- if (!kiocbTryKick(iocb))
- run = __queue_kicked_iocb(iocb);
- spin_unlock_irqrestore(&ctx->ctx_lock, flags);
- if (run)
- aio_queue_work(ctx);
-}
-
-/*
- * kick_iocb:
- * Called typically from a wait queue callback context
- * to trigger a retry of the iocb.
- * The retry is usually executed by aio workqueue
- * threads (See aio_kick_handler).
- */
-void kick_iocb(struct kiocb *iocb)
-{
- /* sync iocbs are easy: they can only ever be executing from a
- * single context. */
- if (is_sync_kiocb(iocb)) {
- kiocbSetKicked(iocb);
- wake_up_process(iocb->ki_obj.tsk);
+ if (RB_EMPTY_ROOT(&batch->kiocb))
return;
- }
-
- try_queue_kicked_iocb(iocb);
-}
-EXPORT_SYMBOL(kick_iocb);
-
-/* aio_complete
- * Called when the io request on the given iocb is complete.
- * Returns true if this is the last user of the request. The
- * only other user of the request can be the cancellation code.
- */
-int aio_complete(struct kiocb *iocb, long res, long res2)
-{
- struct kioctx *ctx = iocb->ki_ctx;
- struct aio_ring_info *info;
- struct aio_ring *ring;
- struct io_event *event;
- unsigned long flags;
- unsigned long tail;
- int ret;
/*
- * Special case handling for sync iocbs:
- * - events go directly into the iocb for fast handling
- * - the sync task with the iocb in its stack holds the single iocb
- * ref, no other paths have a way to get another ref
- * - the sync task helpfully left a reference to itself in the iocb
+ * Take rcu_read_lock() in case the kioctx is being destroyed, as we
+ * need to issue a wakeup after incrementing reqs_available.
*/
- if (is_sync_kiocb(iocb)) {
- BUG_ON(iocb->ki_users != 1);
- iocb->ki_user_data = res;
- iocb->ki_users = 0;
- wake_up_process(iocb->ki_obj.tsk);
- return 1;
- }
-
- info = &ctx->ring_info;
+ rcu_read_lock();
+ local_irq_save(flags);
- /* add a completion event to the ring buffer.
- * must be done holding ctx->ctx_lock to prevent
- * other code from messing with the tail
- * pointer since we might be called from irq
- * context.
- */
- spin_lock_irqsave(&ctx->ctx_lock, flags);
+ n = rb_first(&batch->kiocb);
+ while (n) {
+ struct kiocb *req = container_of(n, struct kiocb, ki_node);
- if (iocb->ki_run_list.prev && !list_empty(&iocb->ki_run_list))
- list_del_init(&iocb->ki_run_list);
+ if (n->rb_right) {
+ n->rb_right->__rb_parent_color = n->__rb_parent_color;
+ n = n->rb_right;
- /*
- * cancelled requests don't get events, userland was given one
- * when the event got cancelled.
- */
- if (kiocbIsCancelled(iocb))
- goto put_rq;
+ while (n->rb_left)
+ n = n->rb_left;
+ } else {
+ n = rb_parent(n);
+ }
- ring = kmap_atomic(info->ring_pages[0]);
+ if (unlikely(req->ki_eventfd != eventfd)) {
+ if (eventfd) {
+ /* Make event visible */
+ kioctx_ring_unlock(ctx, tail);
+ ctx = NULL;
- tail = info->tail;
- event = aio_ring_event(info, tail);
- if (++tail >= info->nr)
- tail = 0;
-
- event->obj = (u64)(unsigned long)iocb->ki_obj.user;
- event->data = iocb->ki_user_data;
- event->res = res;
- event->res2 = res2;
+ eventfd_signal(eventfd, 1);
+ eventfd_ctx_put(eventfd);
+ }
- dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n",
- ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
- res, res2);
+ eventfd = req->ki_eventfd;
+ req->ki_eventfd = NULL;
+ }
- /* after flagging the request as done, we
- * must never even look at it again
- */
- smp_wmb(); /* make event visible before updating tail */
+ if (unlikely(req->ki_ctx != ctx)) {
+ kioctx_ring_unlock(ctx, tail);
- info->tail = tail;
- ring->tail = tail;
+ ctx = req->ki_ctx;
+ tail = kioctx_ring_lock(ctx);
+ }
- put_aio_ring_event(event);
- kunmap_atomic(ring);
+ tail = kioctx_ring_put(ctx, req, tail);
+ aio_put_req(req);
+ }
- pr_debug("added to ring %p at [%lu]\n", iocb, tail);
+ kioctx_ring_unlock(ctx, tail);
+ local_irq_restore(flags);
+ rcu_read_unlock();
/*
* Check if the user asked us to deliver the result through an
* eventfd. The eventfd_signal() function is safe to be called
* from IRQ context.
*/
- if (iocb->ki_eventfd != NULL)
- eventfd_signal(iocb->ki_eventfd, 1);
+ if (eventfd) {
+ eventfd_signal(eventfd, 1);
+ eventfd_ctx_put(eventfd);
+ }
+}
+EXPORT_SYMBOL(batch_complete_aio);
-put_rq:
- /* everything turned out well, dispose of the aiocb. */
- ret = __aio_put_req(ctx, iocb);
+/* aio_complete_batch
+ * Called when the io request on the given iocb is complete; @batch may be
+ * NULL.
+ */
+void aio_complete_batch(struct kiocb *req, long res, long res2,
+ struct batch_complete *batch)
+{
+ req->ki_res = res;
+ req->ki_res2 = res2;
+
+ if (req->ki_list.next) {
+ struct kioctx *ctx = req->ki_ctx;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->ctx_lock, flags);
+ list_del(&req->ki_list);
+ spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+ }
/*
- * We have to order our ring_info tail store above and test
- * of the wait list below outside the wait lock. This is
- * like in wake_up_bit() where clearing a bit has to be
- * ordered with the unlocked test.
+ * Special case handling for sync iocbs:
+ * - events go directly into the iocb for fast handling
+ * - the sync task with the iocb in its stack holds the single iocb
+ * ref, no other paths have a way to get another ref
+ * - the sync task helpfully left a reference to itself in the iocb
*/
- smp_mb();
+ if (is_sync_kiocb(req)) {
+ BUG_ON(atomic_read(&req->ki_users) != 1);
+ req->ki_user_data = req->ki_res;
+ atomic_set(&req->ki_users, 0);
+ wake_up_process(req->ki_obj.tsk);
+ } else if (batch) {
+ int res;
+ struct kiocb *t;
+ struct rb_node **n = &batch->kiocb.rb_node, *parent = NULL;
+
+ while (*n) {
+ parent = *n;
+ t = container_of(*n, struct kiocb, ki_node);
+
+ res = req->ki_ctx != t->ki_ctx
+ ? req->ki_ctx < t->ki_ctx
+ : req->ki_eventfd != t->ki_eventfd
+ ? req->ki_eventfd < t->ki_eventfd
+ : req < t;
+
+ n = res ? &(*n)->rb_left : &(*n)->rb_right;
+ }
- if (waitqueue_active(&ctx->wait))
- wake_up(&ctx->wait);
+ rb_link_node(&req->ki_node, parent, n);
+ rb_insert_color(&req->ki_node, &batch->kiocb);
+ } else {
+ struct batch_complete batch_stack;
- spin_unlock_irqrestore(&ctx->ctx_lock, flags);
- return ret;
+ memset(&req->ki_node, 0, sizeof(req->ki_node));
+ batch_stack.kiocb.rb_node = &req->ki_node;
+
+ batch_complete_aio(&batch_stack);
+ }
}
-EXPORT_SYMBOL(aio_complete);
+EXPORT_SYMBOL(aio_complete_batch);
-/* aio_read_evt
- * Pull an event off of the ioctx's event ring. Returns the number of
- * events fetched (0 or 1 ;-)
- * FIXME: make this use cmpxchg.
- * TODO: make the ringbuffer user mmap()able (requires FIXME).
+/* aio_read_events
+ * Pull an event off of the ioctx's event ring. Returns the number of
+ * events fetched
*/
-static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent)
+static long aio_read_events_ring(struct kioctx *ctx,
+ struct io_event __user *event, long nr)
{
- struct aio_ring_info *info = &ioctx->ring_info;
struct aio_ring *ring;
- unsigned long head;
- int ret = 0;
-
- ring = kmap_atomic(info->ring_pages[0]);
- dprintk("in aio_read_evt h%lu t%lu m%lu\n",
- (unsigned long)ring->head, (unsigned long)ring->tail,
- (unsigned long)ring->nr);
-
- if (ring->head == ring->tail)
- goto out;
+ unsigned head, pos;
+ long ret = 0;
+ int copy_ret;
- spin_lock(&info->ring_lock);
-
- head = ring->head % info->nr;
- if (head != ring->tail) {
- struct io_event *evp = aio_ring_event(info, head);
- *ent = *evp;
- head = (head + 1) % info->nr;
- smp_mb(); /* finish reading the event before updatng the head */
- ring->head = head;
- ret = 1;
- put_aio_ring_event(evp);
- }
- spin_unlock(&info->ring_lock);
+ mutex_lock(&ctx->ring_lock);
-out:
- dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret,
- (unsigned long)ring->head, (unsigned long)ring->tail);
+ ring = kmap_atomic(ctx->ring_pages[0]);
+ head = ring->head;
kunmap_atomic(ring);
- return ret;
-}
-
-struct aio_timeout {
- struct timer_list timer;
- int timed_out;
- struct task_struct *p;
-};
-static void timeout_func(unsigned long data)
-{
- struct aio_timeout *to = (struct aio_timeout *)data;
+ pr_debug("h%u t%u m%u\n", head, ctx->shadow_tail, ctx->nr_events);
- to->timed_out = 1;
- wake_up_process(to->p);
-}
+ if (head == ctx->shadow_tail)
+ goto out;
-static inline void init_timeout(struct aio_timeout *to)
-{
- setup_timer_on_stack(&to->timer, timeout_func, (unsigned long) to);
- to->timed_out = 0;
- to->p = current;
-}
+ while (ret < nr) {
+ long avail;
+ struct io_event *ev;
+ struct page *page;
-static inline void set_timeout(long start_jiffies, struct aio_timeout *to,
- const struct timespec *ts)
-{
- to->timer.expires = start_jiffies + timespec_to_jiffies(ts);
- if (time_after(to->timer.expires, jiffies))
- add_timer(&to->timer);
- else
- to->timed_out = 1;
-}
+ avail = (head <= ctx->shadow_tail ?
+ ctx->shadow_tail : ctx->nr_events) - head;
+ if (head == ctx->shadow_tail)
+ break;
-static inline void clear_timeout(struct aio_timeout *to)
-{
- del_singleshot_timer_sync(&to->timer);
-}
+ avail = min(avail, nr - ret);
+ avail = min_t(long, avail, AIO_EVENTS_PER_PAGE -
+ ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE));
-static int read_events(struct kioctx *ctx,
- long min_nr, long nr,
- struct io_event __user *event,
- struct timespec __user *timeout)
-{
- long start_jiffies = jiffies;
- struct task_struct *tsk = current;
- DECLARE_WAITQUEUE(wait, tsk);
- int ret;
- int i = 0;
- struct io_event ent;
- struct aio_timeout to;
- int retry = 0;
-
- /* needed to zero any padding within an entry (there shouldn't be
- * any, but C is fun!
- */
- memset(&ent, 0, sizeof(ent));
-retry:
- ret = 0;
- while (likely(i < nr)) {
- ret = aio_read_evt(ctx, &ent);
- if (unlikely(ret <= 0))
- break;
+ pos = head + AIO_EVENTS_OFFSET;
+ page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];
+ pos %= AIO_EVENTS_PER_PAGE;
- dprintk("read event: %Lx %Lx %Lx %Lx\n",
- ent.data, ent.obj, ent.res, ent.res2);
+ ev = kmap(page);
+ copy_ret = copy_to_user(event + ret, ev + pos,
+ sizeof(*ev) * avail);
+ kunmap(page);
- /* Could we split the check in two? */
- ret = -EFAULT;
- if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
- dprintk("aio: lost an event due to EFAULT.\n");
- break;
+ if (unlikely(copy_ret)) {
+ ret = -EFAULT;
+ goto out;
}
- ret = 0;
- /* Good, event copied to userland, update counts. */
- event ++;
- i ++;
+ ret += avail;
+ head += avail;
+ head %= ctx->nr_events;
}
- if (min_nr <= i)
- return i;
- if (ret)
- return ret;
-
- /* End fast path */
+ ring = kmap_atomic(ctx->ring_pages[0]);
+ ring->head = head;
+ kunmap_atomic(ring);
+ flush_dcache_page(ctx->ring_pages[0]);
- /* racey check, but it gets redone */
- if (!retry && unlikely(!list_empty(&ctx->run_list))) {
- retry = 1;
- aio_run_all_iocbs(ctx);
- goto retry;
- }
+ pr_debug("%li h%u t%u\n", ret, head, ctx->shadow_tail);
- init_timeout(&to);
- if (timeout) {
- struct timespec ts;
- ret = -EFAULT;
- if (unlikely(copy_from_user(&ts, timeout, sizeof(ts))))
- goto out;
-
- set_timeout(start_jiffies, &to, &ts);
- }
+ put_reqs_available(ctx, ret);
+out:
+ mutex_unlock(&ctx->ring_lock);
- while (likely(i < nr)) {
- add_wait_queue_exclusive(&ctx->wait, &wait);
- do {
- set_task_state(tsk, TASK_INTERRUPTIBLE);
- ret = aio_read_evt(ctx, &ent);
- if (ret)
- break;
- if (min_nr <= i)
- break;
- if (unlikely(ctx->dead)) {
- ret = -EINVAL;
- break;
- }
- if (to.timed_out) /* Only check after read evt */
- break;
- /* Try to only show up in io wait if there are ops
- * in flight */
- if (ctx->reqs_active)
- io_schedule();
- else
- schedule();
- if (signal_pending(tsk)) {
- ret = -EINTR;
- break;
- }
- /*ret = aio_read_evt(ctx, &ent);*/
- } while (1) ;
+ return ret;
+}
- set_task_state(tsk, TASK_RUNNING);
- remove_wait_queue(&ctx->wait, &wait);
+static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
+ struct io_event __user *event, long *i)
+{
+ long ret = aio_read_events_ring(ctx, event + *i, nr - *i);
- if (unlikely(ret <= 0))
- break;
+ if (ret > 0)
+ *i += ret;
- ret = -EFAULT;
- if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
- dprintk("aio: lost an event due to EFAULT.\n");
- break;
- }
+ if (unlikely(percpu_ref_dead(&ctx->users)))
+ ret = -EINVAL;
- /* Good, event copied to userland, update counts. */
- event ++;
- i ++;
- }
+ if (!*i)
+ *i = ret;
- if (timeout)
- clear_timeout(&to);
-out:
- destroy_timer_on_stack(&to.timer);
- return i ? i : ret;
+ return ret < 0 || *i >= min_nr;
}
-/* Take an ioctx and remove it from the list of ioctx's. Protects
- * against races with itself via ->dead.
- */
-static void io_destroy(struct kioctx *ioctx)
+static long read_events(struct kioctx *ctx, long min_nr, long nr,
+ struct io_event __user *event,
+ struct timespec __user *timeout)
{
- struct mm_struct *mm = current->mm;
- int was_dead;
+ ktime_t until = { .tv64 = KTIME_MAX };
+ long ret = 0;
- /* delete the entry from the list is someone else hasn't already */
- spin_lock(&mm->ioctx_lock);
- was_dead = ioctx->dead;
- ioctx->dead = 1;
- hlist_del_rcu(&ioctx->list);
- spin_unlock(&mm->ioctx_lock);
+ if (timeout) {
+ struct timespec ts;
- dprintk("aio_release(%p)\n", ioctx);
- if (likely(!was_dead))
- put_ioctx(ioctx); /* twice for the list */
+ if (unlikely(copy_from_user(&ts, timeout, sizeof(ts))))
+ return -EFAULT;
- kill_ctx(ioctx);
+ until = timespec_to_ktime(ts);
+ }
/*
- * Wake up any waiters. The setting of ctx->dead must be seen
- * by other CPUs at this point. Right now, we rely on the
- * locking done by the above calls to ensure this consistency.
+ * Note that aio_read_events() is being called as the conditional - i.e.
+ * we're calling it after prepare_to_wait() has set task state to
+ * TASK_INTERRUPTIBLE.
+ *
+ * But aio_read_events() can block, and if it blocks it's going to flip
+ * the task state back to TASK_RUNNING.
+ *
+ * This should be ok, provided it doesn't flip the state back to
+ * TASK_RUNNING and return 0 too much - that causes us to spin. That
+ * will only happen if the mutex_lock() call blocks, and we then find
+ * the ringbuffer empty. So in practice we should be ok, but it's
+ * something to be aware of when touching this code.
*/
- wake_up_all(&ioctx->wait);
+ wait_event_interruptible_hrtimeout(ctx->wait,
+ aio_read_events(ctx, min_nr, nr, event, &ret), until);
+
+ if (!ret && signal_pending(current))
+ ret = -EINTR;
+
+ return ret;
}
/* sys_io_setup:
@@ -1252,7 +1040,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
if (!IS_ERR(ioctx)) {
ret = put_user(ioctx->user_id, ctxp);
if (ret)
- io_destroy(ioctx);
+ kill_ioctx(ioctx);
put_ioctx(ioctx);
}
@@ -1270,7 +1058,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
{
struct kioctx *ioctx = lookup_ioctx(ctx);
if (likely(NULL != ioctx)) {
- io_destroy(ioctx);
+ kill_ioctx(ioctx);
put_ioctx(ioctx);
return 0;
}
@@ -1301,30 +1089,21 @@ static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret)
BUG_ON(ret > 0 && iocb->ki_left == 0);
}
-static ssize_t aio_rw_vect_retry(struct kiocb *iocb)
+typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *,
+ unsigned long, loff_t);
+
+static ssize_t aio_rw_vect_retry(struct kiocb *iocb, int rw, aio_rw_op *rw_op)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
- ssize_t (*rw_op)(struct kiocb *, const struct iovec *,
- unsigned long, loff_t);
ssize_t ret = 0;
- unsigned short opcode;
-
- if ((iocb->ki_opcode == IOCB_CMD_PREADV) ||
- (iocb->ki_opcode == IOCB_CMD_PREAD)) {
- rw_op = file->f_op->aio_read;
- opcode = IOCB_CMD_PREADV;
- } else {
- rw_op = file->f_op->aio_write;
- opcode = IOCB_CMD_PWRITEV;
- }
/* This matches the pread()/pwrite() logic */
if (iocb->ki_pos < 0)
return -EINVAL;
- if (opcode == IOCB_CMD_PWRITEV)
+ if (rw == WRITE)
file_start_write(file);
do {
ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg],
@@ -1336,9 +1115,9 @@ static ssize_t aio_rw_vect_retry(struct kiocb *iocb)
/* retry all partial writes. retry partial reads as long as its a
* regular file. */
} while (ret > 0 && iocb->ki_left > 0 &&
- (opcode == IOCB_CMD_PWRITEV ||
+ (rw == WRITE ||
(!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode))));
- if (opcode == IOCB_CMD_PWRITEV)
+ if (rw == WRITE)
file_end_write(file);
/* This means we must have transferred all that we could */
@@ -1348,81 +1127,49 @@ static ssize_t aio_rw_vect_retry(struct kiocb *iocb)
/* If we managed to write some out we return that, rather than
* the eventual error. */
- if (opcode == IOCB_CMD_PWRITEV
- && ret < 0 && ret != -EIOCBQUEUED && ret != -EIOCBRETRY
+ if (rw == WRITE
+ && ret < 0 && ret != -EIOCBQUEUED
&& iocb->ki_nbytes - iocb->ki_left)
ret = iocb->ki_nbytes - iocb->ki_left;
return ret;
}
-static ssize_t aio_fdsync(struct kiocb *iocb)
-{
- struct file *file = iocb->ki_filp;
- ssize_t ret = -EINVAL;
-
- if (file->f_op->aio_fsync)
- ret = file->f_op->aio_fsync(iocb, 1);
- return ret;
-}
-
-static ssize_t aio_fsync(struct kiocb *iocb)
-{
- struct file *file = iocb->ki_filp;
- ssize_t ret = -EINVAL;
-
- if (file->f_op->aio_fsync)
- ret = file->f_op->aio_fsync(iocb, 0);
- return ret;
-}
-
-static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat)
+static ssize_t aio_setup_vectored_rw(int rw, struct kiocb *kiocb, bool compat)
{
ssize_t ret;
+ kiocb->ki_nr_segs = kiocb->ki_nbytes;
+
#ifdef CONFIG_COMPAT
if (compat)
- ret = compat_rw_copy_check_uvector(type,
+ ret = compat_rw_copy_check_uvector(rw,
(struct compat_iovec __user *)kiocb->ki_buf,
- kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
+ kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec,
&kiocb->ki_iovec);
else
#endif
- ret = rw_copy_check_uvector(type,
+ ret = rw_copy_check_uvector(rw,
(struct iovec __user *)kiocb->ki_buf,
- kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
+ kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec,
&kiocb->ki_iovec);
if (ret < 0)
- goto out;
-
- ret = rw_verify_area(type, kiocb->ki_filp, &kiocb->ki_pos, ret);
- if (ret < 0)
- goto out;
+ return ret;
- kiocb->ki_nr_segs = kiocb->ki_nbytes;
- kiocb->ki_cur_seg = 0;
- /* ki_nbytes/left now reflect bytes instead of segs */
+ /* ki_nbytes now reflect bytes instead of segs */
kiocb->ki_nbytes = ret;
- kiocb->ki_left = ret;
-
- ret = 0;
-out:
- return ret;
+ return 0;
}
-static ssize_t aio_setup_single_vector(int type, struct file * file, struct kiocb *kiocb)
+static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb)
{
- int bytes;
-
- bytes = rw_verify_area(type, file, &kiocb->ki_pos, kiocb->ki_left);
- if (bytes < 0)
- return bytes;
+ if (unlikely(!access_ok(!rw, kiocb->ki_buf, kiocb->ki_nbytes)))
+ return -EFAULT;
kiocb->ki_iovec = &kiocb->ki_inline_vec;
kiocb->ki_iovec->iov_base = kiocb->ki_buf;
- kiocb->ki_iovec->iov_len = bytes;
+ kiocb->ki_iovec->iov_len = kiocb->ki_nbytes;
kiocb->ki_nr_segs = 1;
- kiocb->ki_cur_seg = 0;
return 0;
}
@@ -1431,96 +1178,95 @@ static ssize_t aio_setup_single_vector(int type, struct file * file, struct kioc
* Performs the initial checks and aio retry method
* setup for the kiocb at the time of io submission.
*/
-static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
+static ssize_t aio_run_iocb(struct kiocb *req, bool compat)
{
- struct file *file = kiocb->ki_filp;
- ssize_t ret = 0;
+ struct file *file = req->ki_filp;
+ ssize_t ret;
+ int rw;
+ fmode_t mode;
+ aio_rw_op *rw_op;
- switch (kiocb->ki_opcode) {
+ switch (req->ki_opcode) {
case IOCB_CMD_PREAD:
- ret = -EBADF;
- if (unlikely(!(file->f_mode & FMODE_READ)))
- break;
- ret = -EFAULT;
- if (unlikely(!access_ok(VERIFY_WRITE, kiocb->ki_buf,
- kiocb->ki_left)))
- break;
- ret = aio_setup_single_vector(READ, file, kiocb);
- if (ret)
- break;
- ret = -EINVAL;
- if (file->f_op->aio_read)
- kiocb->ki_retry = aio_rw_vect_retry;
- break;
- case IOCB_CMD_PWRITE:
- ret = -EBADF;
- if (unlikely(!(file->f_mode & FMODE_WRITE)))
- break;
- ret = -EFAULT;
- if (unlikely(!access_ok(VERIFY_READ, kiocb->ki_buf,
- kiocb->ki_left)))
- break;
- ret = aio_setup_single_vector(WRITE, file, kiocb);
- if (ret)
- break;
- ret = -EINVAL;
- if (file->f_op->aio_write)
- kiocb->ki_retry = aio_rw_vect_retry;
- break;
case IOCB_CMD_PREADV:
- ret = -EBADF;
- if (unlikely(!(file->f_mode & FMODE_READ)))
- break;
- ret = aio_setup_vectored_rw(READ, kiocb, compat);
- if (ret)
- break;
- ret = -EINVAL;
- if (file->f_op->aio_read)
- kiocb->ki_retry = aio_rw_vect_retry;
- break;
+ mode = FMODE_READ;
+ rw = READ;
+ rw_op = file->f_op->aio_read;
+ goto rw_common;
+
+ case IOCB_CMD_PWRITE:
case IOCB_CMD_PWRITEV:
- ret = -EBADF;
- if (unlikely(!(file->f_mode & FMODE_WRITE)))
- break;
- ret = aio_setup_vectored_rw(WRITE, kiocb, compat);
+ mode = FMODE_WRITE;
+ rw = WRITE;
+ rw_op = file->f_op->aio_write;
+ goto rw_common;
+rw_common:
+ if (unlikely(!(file->f_mode & mode)))
+ return -EBADF;
+
+ if (!rw_op)
+ return -EINVAL;
+
+ ret = (req->ki_opcode == IOCB_CMD_PREADV ||
+ req->ki_opcode == IOCB_CMD_PWRITEV)
+ ? aio_setup_vectored_rw(rw, req, compat)
+ : aio_setup_single_vector(rw, req);
if (ret)
- break;
- ret = -EINVAL;
- if (file->f_op->aio_write)
- kiocb->ki_retry = aio_rw_vect_retry;
+ return ret;
+
+ ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
+ if (ret < 0)
+ return ret;
+
+ req->ki_nbytes = ret;
+ req->ki_left = ret;
+
+ ret = aio_rw_vect_retry(req, rw, rw_op);
break;
+
case IOCB_CMD_FDSYNC:
- ret = -EINVAL;
- if (file->f_op->aio_fsync)
- kiocb->ki_retry = aio_fdsync;
+ if (!file->f_op->aio_fsync)
+ return -EINVAL;
+
+ ret = file->f_op->aio_fsync(req, 1);
break;
+
case IOCB_CMD_FSYNC:
- ret = -EINVAL;
- if (file->f_op->aio_fsync)
- kiocb->ki_retry = aio_fsync;
+ if (!file->f_op->aio_fsync)
+ return -EINVAL;
+
+ ret = file->f_op->aio_fsync(req, 0);
break;
+
default:
- dprintk("EINVAL: io_submit: no operation provided\n");
- ret = -EINVAL;
+ pr_debug("EINVAL: no operation provided\n");
+ return -EINVAL;
}
- if (!kiocb->ki_retry)
- return ret;
+ if (ret != -EIOCBQUEUED) {
+ /*
+ * There's no easy way to restart the syscall since other AIO's
+ * may be already running. Just fail this IO with EINTR.
+ */
+ if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
+ ret == -ERESTARTNOHAND ||
+ ret == -ERESTART_RESTARTBLOCK))
+ ret = -EINTR;
+ aio_complete(req, ret, 0);
+ }
return 0;
}
static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
- struct iocb *iocb, struct kiocb_batch *batch,
- bool compat)
+ struct iocb *iocb, bool compat)
{
struct kiocb *req;
- struct file *file;
ssize_t ret;
/* enforce forwards compatibility on users */
if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) {
- pr_debug("EINVAL: io_submit: reserve field set\n");
+ pr_debug("EINVAL: reserve field set\n");
return -EINVAL;
}
@@ -1534,16 +1280,16 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
return -EINVAL;
}
- file = fget(iocb->aio_fildes);
- if (unlikely(!file))
- return -EBADF;
-
- req = aio_get_req(ctx, batch); /* returns with 2 references to req */
- if (unlikely(!req)) {
- fput(file);
+ req = aio_get_req(ctx);
+ if (unlikely(!req))
return -EAGAIN;
+
+ req->ki_filp = fget(iocb->aio_fildes);
+ if (unlikely(!req->ki_filp)) {
+ ret = -EBADF;
+ goto out_put_req;
}
- req->ki_filp = file;
+
if (iocb->aio_flags & IOCB_FLAG_RESFD) {
/*
* If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
@@ -1559,9 +1305,9 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
}
}
- ret = put_user(req->ki_key, &user_iocb->aio_key);
+ ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
if (unlikely(ret)) {
- dprintk("EFAULT: aio_key\n");
+ pr_debug("EFAULT: aio_key\n");
goto out_put_req;
}
@@ -1573,41 +1319,14 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
req->ki_left = req->ki_nbytes = iocb->aio_nbytes;
req->ki_opcode = iocb->aio_lio_opcode;
- ret = aio_setup_iocb(req, compat);
-
+ ret = aio_run_iocb(req, compat);
if (ret)
goto out_put_req;
- spin_lock_irq(&ctx->ctx_lock);
- /*
- * We could have raced with io_destroy() and are currently holding a
- * reference to ctx which should be destroyed. We cannot submit IO
- * since ctx gets freed as soon as io_submit() puts its reference. The
- * check here is reliable: io_destroy() sets ctx->dead before waiting
- * for outstanding IO and the barrier between these two is realized by
- * unlock of mm->ioctx_lock and lock of ctx->ctx_lock. Analogously we
- * increment ctx->reqs_active before checking for ctx->dead and the
- * barrier is realized by unlock and lock of ctx->ctx_lock. Thus if we
- * don't see ctx->dead set here, io_destroy() waits for our IO to
- * finish.
- */
- if (ctx->dead) {
- spin_unlock_irq(&ctx->ctx_lock);
- ret = -EINVAL;
- goto out_put_req;
- }
- aio_run_iocb(req);
- if (!list_empty(&ctx->run_list)) {
- /* drain the run list */
- while (__aio_run_iocbs(ctx))
- ;
- }
- spin_unlock_irq(&ctx->ctx_lock);
-
aio_put_req(req); /* drop extra ref to req */
return 0;
-
out_put_req:
+ put_reqs_available(ctx, 1);
aio_put_req(req); /* drop extra ref to req */
aio_put_req(req); /* drop i/o ref to req */
return ret;
@@ -1620,7 +1339,6 @@ long do_io_submit(aio_context_t ctx_id, long nr,
long ret = 0;
int i = 0;
struct blk_plug plug;
- struct kiocb_batch batch;
if (unlikely(nr < 0))
return -EINVAL;
@@ -1633,12 +1351,10 @@ long do_io_submit(aio_context_t ctx_id, long nr,
ctx = lookup_ioctx(ctx_id);
if (unlikely(!ctx)) {
- pr_debug("EINVAL: io_submit: invalid context id\n");
+ pr_debug("EINVAL: invalid context id\n");
return -EINVAL;
}
- kiocb_batch_init(&batch, nr);
-
blk_start_plug(&plug);
/*
@@ -1659,13 +1375,12 @@ long do_io_submit(aio_context_t ctx_id, long nr,
break;
}
- ret = io_submit_one(ctx, user_iocb, &tmp, &batch, compat);
+ ret = io_submit_one(ctx, user_iocb, &tmp, compat);
if (ret)
break;
}
blk_finish_plug(&plug);
- kiocb_batch_free(ctx, &batch);
put_ioctx(ctx);
return i ? i : ret;
}
@@ -1698,10 +1413,13 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
assert_spin_locked(&ctx->ctx_lock);
+ if (key != KIOCB_KEY)
+ return NULL;
+
/* TODO: use a hash or array, this sucks. */
list_for_each(pos, &ctx->active_reqs) {
struct kiocb *kiocb = list_kiocb(pos);
- if (kiocb->ki_obj.user == iocb && kiocb->ki_key == key)
+ if (kiocb->ki_obj.user == iocb)
return kiocb;
}
return NULL;
@@ -1720,7 +1438,7 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
struct io_event __user *, result)
{
- int (*cancel)(struct kiocb *iocb, struct io_event *res);
+ struct io_event res;
struct kioctx *ctx;
struct kiocb *kiocb;
u32 key;
@@ -1735,32 +1453,22 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
return -EINVAL;
spin_lock_irq(&ctx->ctx_lock);
- ret = -EAGAIN;
+
kiocb = lookup_kiocb(ctx, iocb, key);
- if (kiocb && kiocb->ki_cancel) {
- cancel = kiocb->ki_cancel;
- kiocb->ki_users ++;
- kiocbSetCancelled(kiocb);
- } else
- cancel = NULL;
+ if (kiocb)
+ ret = kiocb_cancel(ctx, kiocb, &res);
+ else
+ ret = -EINVAL;
+
spin_unlock_irq(&ctx->ctx_lock);
- if (NULL != cancel) {
- struct io_event tmp;
- pr_debug("calling cancel\n");
- memset(&tmp, 0, sizeof(tmp));
- tmp.obj = (u64)(unsigned long)kiocb->ki_obj.user;
- tmp.data = kiocb->ki_user_data;
- ret = cancel(kiocb, &tmp);
- if (!ret) {
- /* Cancellation succeeded -- copy the result
- * into the user's buffer.
- */
- if (copy_to_user(result, &tmp, sizeof(tmp)))
- ret = -EFAULT;
- }
- } else
- ret = -EINVAL;
+ if (!ret) {
+ /* Cancellation succeeded -- copy the result
+ * into the user's buffer.
+ */
+ if (copy_to_user(result, &res, sizeof(res)))
+ ret = -EFAULT;
+ }
put_ioctx(ctx);
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index bce87694f7b0..89dec7f789a4 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -255,8 +255,6 @@ static int load_aout_binary(struct linux_binprm * bprm)
(current->mm->start_data = N_DATADDR(ex));
current->mm->brk = ex.a_bss +
(current->mm->start_brk = N_BSSADDR(ex));
- current->mm->free_area_cache = current->mm->mmap_base;
- current->mm->cached_hole_size = 0;
retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
if (retval < 0) {
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index f8a0b0efda44..53b6d624c7a9 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -140,6 +140,25 @@ static int padzero(unsigned long elf_bss)
#define ELF_BASE_PLATFORM NULL
#endif
+/*
+ * Use get_random_int() to implement AT_RANDOM while avoiding depletion
+ * of the entropy pool.
+ */
+static void get_atrandom_bytes(unsigned char *buf, size_t nbytes)
+{
+ unsigned char *p = buf;
+
+ while (nbytes) {
+ unsigned int random_variable;
+ size_t chunk = min(nbytes, sizeof(random_variable));
+
+ random_variable = get_random_int();
+ memcpy(p, &random_variable, chunk);
+ p += chunk;
+ nbytes -= chunk;
+ }
+}
+
static int
create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
unsigned long load_addr, unsigned long interp_load_addr)
@@ -201,7 +220,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
/*
* Generate 16 random bytes for userspace PRNG seeding.
*/
- get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes));
+ get_atrandom_bytes(k_rand_bytes, sizeof(k_rand_bytes));
u_rand_bytes = (elf_addr_t __user *)
STACK_ALLOC(p, sizeof(k_rand_bytes));
if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes)))
@@ -738,8 +757,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
/* Do this so that we can load the interpreter, if need be. We will
change some of these later */
- current->mm->free_area_cache = current->mm->mmap_base;
- current->mm->cached_hole_size = 0;
retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
executable_stack);
if (retval < 0) {
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 8fb42916d8a2..69f6f802b09e 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -510,7 +510,8 @@ static void bio_integrity_verify_fn(struct work_struct *work)
* in process context. This function postpones completion
* accordingly.
*/
-void bio_integrity_endio(struct bio *bio, int error)
+void bio_integrity_endio(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct bio_integrity_payload *bip = bio->bi_integrity;
diff --git a/fs/bio.c b/fs/bio.c
index afe2db6d4412..89cf9776f7ab 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -19,6 +19,7 @@
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
+#include <linux/uio.h>
#include <linux/iocontext.h>
#include <linux/slab.h>
#include <linux/init.h>
@@ -27,6 +28,7 @@
#include <linux/mempool.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>
+#include <linux/aio.h>
#include <scsi/sg.h> /* for struct sg_iovec */
#include <trace/events/block.h>
@@ -759,7 +761,8 @@ struct submit_bio_ret {
int error;
};
-static void submit_bio_wait_endio(struct bio *bio, int error)
+static void submit_bio_wait_endio(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct submit_bio_ret *ret = bio->bi_private;
@@ -1413,7 +1416,8 @@ void bio_unmap_user(struct bio *bio)
}
EXPORT_SYMBOL(bio_unmap_user);
-static void bio_map_kern_endio(struct bio *bio, int err)
+static void bio_map_kern_endio(struct bio *bio, int err,
+ struct batch_complete *batch)
{
bio_put(bio);
}
@@ -1485,7 +1489,8 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
}
EXPORT_SYMBOL(bio_map_kern);
-static void bio_copy_kern_endio(struct bio *bio, int err)
+static void bio_copy_kern_endio(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct bio_vec *bvec;
const int read = bio_data_dir(bio) == READ;
@@ -1684,31 +1689,40 @@ void bio_flush_dcache_pages(struct bio *bi)
EXPORT_SYMBOL(bio_flush_dcache_pages);
#endif
-/**
- * bio_endio - end I/O on a bio
- * @bio: bio
- * @error: error, if any
- *
- * Description:
- * bio_endio() will end I/O on the whole bio. bio_endio() is the
- * preferred way to end I/O on a bio, it takes care of clearing
- * BIO_UPTODATE on error. @error is 0 on success, and and one of the
- * established -Exxxx (-EIO, for instance) error values in case
- * something went wrong. No one should call bi_end_io() directly on a
- * bio unless they own it and thus know that it has an end_io
- * function.
- **/
-void bio_endio(struct bio *bio, int error)
+static inline void __bio_endio(struct bio *bio, struct batch_complete *batch)
{
- if (error)
+ if (bio->bi_error)
clear_bit(BIO_UPTODATE, &bio->bi_flags);
else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
- error = -EIO;
+ bio->bi_error = -EIO;
if (bio->bi_end_io)
- bio->bi_end_io(bio, error);
+ bio->bi_end_io(bio, bio->bi_error, batch);
+}
+
+void bio_endio_batch(struct bio *bio, int error, struct batch_complete *batch)
+{
+ if (error)
+ bio->bi_error = error;
+
+ if (batch)
+ bio_list_add(&batch->bio, bio);
+ else
+ __bio_endio(bio, batch);
+
+}
+EXPORT_SYMBOL(bio_endio_batch);
+
+void batch_complete(struct batch_complete *batch)
+{
+ struct bio *bio;
+
+ while ((bio = bio_list_pop(&batch->bio)))
+ __bio_endio(bio, batch);
+
+ batch_complete_aio(batch);
}
-EXPORT_SYMBOL(bio_endio);
+EXPORT_SYMBOL(batch_complete);
void bio_pair_release(struct bio_pair *bp)
{
@@ -1721,7 +1735,8 @@ void bio_pair_release(struct bio_pair *bp)
}
EXPORT_SYMBOL(bio_pair_release);
-static void bio_pair_end_1(struct bio *bi, int err)
+static void bio_pair_end_1(struct bio *bi, int err,
+ struct batch_complete *batch)
{
struct bio_pair *bp = container_of(bi, struct bio_pair, bio1);
@@ -1731,7 +1746,8 @@ static void bio_pair_end_1(struct bio *bi, int err)
bio_pair_release(bp);
}
-static void bio_pair_end_2(struct bio *bi, int err)
+static void bio_pair_end_2(struct bio *bi, int err,
+ struct batch_complete *batch)
{
struct bio_pair *bp = container_of(bi, struct bio_pair, bio2);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index e5571ea15673..61675d7b9828 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -27,6 +27,7 @@
#include <linux/namei.h>
#include <linux/log2.h>
#include <linux/cleancache.h>
+#include <linux/aio.h>
#include <asm/uaccess.h>
#include "internal.h"
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 18af6f48781a..3c617b3244c0 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -323,7 +323,8 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
static int btrfsic_read_block(struct btrfsic_state *state,
struct btrfsic_block_data_ctx *block_ctx);
static void btrfsic_dump_database(struct btrfsic_state *state);
-static void btrfsic_complete_bio_end_io(struct bio *bio, int err);
+static void btrfsic_complete_bio_end_io(struct bio *bio, int err,
+ struct batch_complete *batch);
static int btrfsic_test_for_metadata(struct btrfsic_state *state,
char **datav, unsigned int num_pages);
static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
@@ -336,7 +337,8 @@ static int btrfsic_process_written_superblock(
struct btrfsic_state *state,
struct btrfsic_block *const block,
struct btrfs_super_block *const super_hdr);
-static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status);
+static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status,
+ struct batch_complete *batch);
static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate);
static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state,
const struct btrfsic_block *block,
@@ -1751,7 +1753,8 @@ static int btrfsic_read_block(struct btrfsic_state *state,
return block_ctx->len;
}
-static void btrfsic_complete_bio_end_io(struct bio *bio, int err)
+static void btrfsic_complete_bio_end_io(struct bio *bio, int err,
+ struct batch_complete *batch)
{
complete((struct completion *)bio->bi_private);
}
@@ -2294,7 +2297,8 @@ continue_loop:
goto again;
}
-static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
+static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status,
+ struct batch_complete *batch)
{
struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private;
int iodone_w_error;
@@ -2342,7 +2346,7 @@ static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
block = next_block;
} while (NULL != block);
- bp->bi_end_io(bp, bio_error_status);
+ bp->bi_end_io(bp, bio_error_status, batch);
}
static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate)
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 15b94089abc4..74ae115edba0 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -153,7 +153,8 @@ fail:
* The compressed pages are freed here, and it must be run
* in process context
*/
-static void end_compressed_bio_read(struct bio *bio, int err)
+static void end_compressed_bio_read(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct compressed_bio *cb = bio->bi_private;
struct inode *inode;
@@ -263,7 +264,8 @@ static noinline void end_compressed_writeback(struct inode *inode, u64 start,
* This also calls the writeback end hooks for the file pages so that
* metadata and checksums can be updated in the file.
*/
-static void end_compressed_bio_write(struct bio *bio, int err)
+static void end_compressed_bio_write(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct extent_io_tree *tree;
struct compressed_bio *cb = bio->bi_private;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6d19a0a554aa..8e7250029f64 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -669,7 +669,8 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
return -EIO; /* we fixed nothing */
}
-static void end_workqueue_bio(struct bio *bio, int err)
+static void end_workqueue_bio(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct end_io_wq *end_io_wq = bio->bi_private;
struct btrfs_fs_info *fs_info;
@@ -2957,7 +2958,8 @@ static int write_dev_supers(struct btrfs_device *device,
* endio for the write_dev_flush, this will wake anyone waiting
* for the barrier when it is done
*/
-static void btrfs_end_empty_barrier(struct bio *bio, int err)
+static void btrfs_end_empty_barrier(struct bio *bio, int err,
+ struct batch_complete *batch)
{
if (err) {
if (err == -EOPNOTSUPP)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 73f2bfe3ac93..fbf0d44851e4 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1937,7 +1937,8 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
return err;
}
-static void repair_io_failure_callback(struct bio *bio, int err)
+static void repair_io_failure_callback(struct bio *bio, int err,
+ struct batch_complete *batch)
{
complete(bio->bi_private);
}
@@ -2317,7 +2318,8 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
* Scheduling is not allowed, so the extent state tree is expected
* to have one and only one object corresponding to this IO.
*/
-static void end_bio_extent_writepage(struct bio *bio, int err)
+static void end_bio_extent_writepage(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
struct extent_io_tree *tree;
@@ -2363,7 +2365,8 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
* Scheduling is not allowed, so the extent state tree is expected
* to have one and only one object corresponding to this IO.
*/
-static void end_bio_extent_readpage(struct bio *bio, int err)
+static void end_bio_extent_readpage(struct bio *bio, int err,
+ struct batch_complete *batch)
{
int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -3186,7 +3189,8 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb)
wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
}
-static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
+static void end_bio_extent_buffer_writepage(struct bio *bio, int err,
+ struct batch_complete *batch)
{
int uptodate = err == 0;
struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index bb8b7a0e28a6..bc4d54c465a0 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -24,6 +24,7 @@
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
+#include <linux/aio.h>
#include <linux/falloc.h>
#include <linux/swap.h>
#include <linux/writeback.h>
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 09c58a35b429..00313057e444 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -32,6 +32,7 @@
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
+#include <linux/aio.h>
#include <linux/bit_spinlock.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
@@ -6913,7 +6914,8 @@ struct btrfs_dio_private {
struct bio *orig_bio;
};
-static void btrfs_endio_direct_read(struct bio *bio, int err)
+static void btrfs_endio_direct_read(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct btrfs_dio_private *dip = bio->bi_private;
struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -6967,10 +6969,11 @@ failed:
/* If we had a csum failure make sure to clear the uptodate flag */
if (err)
clear_bit(BIO_UPTODATE, &bio->bi_flags);
- dio_end_io(bio, err);
+ dio_end_io(bio, err, batch);
}
-static void btrfs_endio_direct_write(struct bio *bio, int err)
+static void btrfs_endio_direct_write(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct btrfs_dio_private *dip = bio->bi_private;
struct inode *inode = dip->inode;
@@ -7012,7 +7015,7 @@ out_done:
/* If we had an error make sure to clear the uptodate flag */
if (err)
clear_bit(BIO_UPTODATE, &bio->bi_flags);
- dio_end_io(bio, err);
+ dio_end_io(bio, err, batch);
}
static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
@@ -7026,7 +7029,8 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
return 0;
}
-static void btrfs_end_dio_bio(struct bio *bio, int err)
+static void btrfs_end_dio_bio(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct btrfs_dio_private *dip = bio->bi_private;
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 9a79fb790adb..6df1ac8d0adf 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -850,7 +850,8 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
* end io function used by finish_rmw. When we finally
* get here, we've written a full stripe
*/
-static void raid_write_end_io(struct bio *bio, int err)
+static void raid_write_end_io(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
@@ -1384,7 +1385,8 @@ static void set_bio_pages_uptodate(struct bio *bio)
* This will usually kick off finish_rmw once all the bios are read in, but it
* may trigger parity reconstruction if we had any errors along the way
*/
-static void raid_rmw_end_io(struct bio *bio, int err)
+static void raid_rmw_end_io(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
@@ -1905,7 +1907,8 @@ cleanup_io:
* This is called only for stripes we've read from disk to
* reconstruct the parity.
*/
-static void raid_recover_end_io(struct bio *bio, int err)
+static void raid_recover_end_io(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 85e072b956d5..fc29a119436a 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -200,7 +200,8 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
int is_metadata, int have_csum,
const u8 *csum, u64 generation,
u16 csum_size);
-static void scrub_complete_bio_end_io(struct bio *bio, int err);
+static void scrub_complete_bio_end_io(struct bio *bio, int err,
+ struct batch_complete *batch);
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good,
int force_write);
@@ -223,7 +224,8 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
u64 physical, struct btrfs_device *dev, u64 flags,
u64 gen, int mirror_num, u8 *csum, int force,
u64 physical_for_dev_replace);
-static void scrub_bio_end_io(struct bio *bio, int err);
+static void scrub_bio_end_io(struct bio *bio, int err,
+ struct batch_complete *batch);
static void scrub_bio_end_io_worker(struct btrfs_work *work);
static void scrub_block_complete(struct scrub_block *sblock);
static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
@@ -240,7 +242,8 @@ static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
struct scrub_page *spage);
static void scrub_wr_submit(struct scrub_ctx *sctx);
-static void scrub_wr_bio_end_io(struct bio *bio, int err);
+static void scrub_wr_bio_end_io(struct bio *bio, int err,
+ struct batch_complete *batch);
static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
static int write_page_nocow(struct scrub_ctx *sctx,
u64 physical_for_dev_replace, struct page *page);
@@ -1386,7 +1389,8 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
sblock->checksum_error = 1;
}
-static void scrub_complete_bio_end_io(struct bio *bio, int err)
+static void scrub_complete_bio_end_io(struct bio *bio, int err,
+ struct batch_complete *batch)
{
complete((struct completion *)bio->bi_private);
}
@@ -1586,7 +1590,8 @@ static void scrub_wr_submit(struct scrub_ctx *sctx)
btrfsic_submit_bio(WRITE, sbio->bio);
}
-static void scrub_wr_bio_end_io(struct bio *bio, int err)
+static void scrub_wr_bio_end_io(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct scrub_bio *sbio = bio->bi_private;
struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
@@ -2056,7 +2061,8 @@ leave_nomem:
return 0;
}
-static void scrub_bio_end_io(struct bio *bio, int err)
+static void scrub_bio_end_io(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct scrub_bio *sbio = bio->bi_private;
struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 678977226570..0182898a6c88 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5029,7 +5029,8 @@ static unsigned int extract_stripe_index_from_bio_private(void *bi_private)
return (unsigned int)((uintptr_t)bi_private) & 3;
}
-static void btrfs_end_bio(struct bio *bio, int err)
+static void btrfs_end_bio(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct btrfs_bio *bbio = extract_bbio_from_bio_private(bio->bi_private);
int is_orig_bio = 0;
@@ -5086,7 +5087,7 @@ static void btrfs_end_bio(struct bio *bio, int err)
}
kfree(bbio);
- bio_endio(bio, err);
+ bio_endio_batch(bio, err, batch);
} else if (!is_orig_bio) {
bio_put(bio);
}
diff --git a/fs/buffer.c b/fs/buffer.c
index d2a4d1bb2d57..c41042274ac7 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2882,7 +2882,8 @@ sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
}
EXPORT_SYMBOL(generic_block_bmap);
-static void end_bio_bh_io_sync(struct bio *bio, int err)
+static void end_bio_bh_io_sync(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct buffer_head *bh = bio->bi_private;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index d70830c66833..656e16907430 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -7,6 +7,7 @@
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/writeback.h>
+#include <linux/aio.h>
#include "super.h"
#include "mds_client.h"
diff --git a/fs/compat.c b/fs/compat.c
index d0560c93973d..d172b71b83ef 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -47,6 +47,7 @@
#include <linux/fs_struct.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
+#include <linux/aio.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 4925c1b51abb..b4dd97c8cea3 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -37,6 +37,7 @@
#include <linux/uio.h>
#include <linux/atomic.h>
#include <linux/prefetch.h>
+#include <linux/aio.h>
/*
* How many user pages to map in one call to get_user_pages(). This determines
@@ -229,7 +230,8 @@ static inline struct page *dio_get_page(struct dio *dio,
* filesystems can use it to hold additional state between get_block calls and
* dio_complete.
*/
-static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async)
+static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret,
+ bool is_async, struct batch_complete *batch)
{
ssize_t transferred = 0;
@@ -263,7 +265,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is
} else {
inode_dio_done(dio->inode);
if (is_async)
- aio_complete(dio->iocb, ret, 0);
+ aio_complete_batch(dio->iocb, ret, 0, batch);
}
return ret;
@@ -273,7 +275,8 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio);
/*
* Asynchronous IO callback.
*/
-static void dio_bio_end_aio(struct bio *bio, int error)
+static void dio_bio_end_aio(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct dio *dio = bio->bi_private;
unsigned long remaining;
@@ -289,7 +292,7 @@ static void dio_bio_end_aio(struct bio *bio, int error)
spin_unlock_irqrestore(&dio->bio_lock, flags);
if (remaining == 0) {
- dio_complete(dio, dio->iocb->ki_pos, 0, true);
+ dio_complete(dio, dio->iocb->ki_pos, 0, true, batch);
kmem_cache_free(dio_cache, dio);
}
}
@@ -323,12 +326,12 @@ static void dio_bio_end_io(struct bio *bio, int error)
* so that the DIO specific endio actions are dealt with after the filesystem
* has done it's completion work.
*/
-void dio_end_io(struct bio *bio, int error)
+void dio_end_io(struct bio *bio, int error, struct batch_complete *batch)
{
struct dio *dio = bio->bi_private;
if (dio->is_async)
- dio_bio_end_aio(bio, error);
+ dio_bio_end_aio(bio, error, batch);
else
dio_bio_end_io(bio, error);
}
@@ -349,10 +352,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
bio->bi_bdev = bdev;
bio->bi_sector = first_sector;
- if (dio->is_async)
- bio->bi_end_io = dio_bio_end_aio;
- else
- bio->bi_end_io = dio_bio_end_io;
+ bio->bi_end_io = dio_end_io;
sdio->bio = bio;
sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
@@ -1267,7 +1267,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
dio_await_completion(dio);
if (drop_refcount(dio) == 0) {
- retval = dio_complete(dio, offset, retval, false);
+ retval = dio_complete(dio, offset, retval, false, NULL);
kmem_cache_free(dio_cache, dio);
} else
BUG_ON(retval != -EIOCBQUEUED);
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index c00e055b6282..f23d2a7ed438 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -58,6 +58,8 @@ int drop_caches_sysctl_handler(ctl_table *table, int write,
if (ret)
return ret;
if (write) {
+ printk(KERN_NOTICE "%s (%d): dropped kernel caches: %d\n",
+ current->comm, task_pid_nr(current), sysctl_drop_caches);
if (sysctl_drop_caches & 1)
iterate_supers(drop_pagecache_sb, NULL);
if (sysctl_drop_caches & 2)
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 63b1f54b6a1f..201f0a0d6b0a 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -31,6 +31,7 @@
#include <linux/security.h>
#include <linux/compat.h>
#include <linux/fs_stack.h>
+#include <linux/aio.h>
#include "ecryptfs_kernel.h"
/**
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index fe60cc1117d8..0a87bb10998d 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -31,6 +31,7 @@
#include <linux/mpage.h>
#include <linux/fiemap.h>
#include <linux/namei.h>
+#include <linux/aio.h>
#include "ext2.h"
#include "acl.h"
#include "xip.h"
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index d706dbfa6220..23c712825640 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -27,6 +27,7 @@
#include <linux/writeback.h>
#include <linux/mpage.h>
#include <linux/namei.h>
+#include <linux/aio.h>
#include "ext3.h"
#include "xattr.h"
#include "acl.h"
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 64848b595b24..4959e29573b6 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -23,6 +23,7 @@
#include <linux/jbd2.h>
#include <linux/mount.h>
#include <linux/path.h>
+#include <linux/aio.h>
#include <linux/quotaops.h>
#include <linux/pagevec.h>
#include "ext4.h"
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 98be6f697463..b8d5d351e24f 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -20,6 +20,7 @@
* (sct@redhat.com), 1993, 1998
*/
+#include <linux/aio.h>
#include "ext4_jbd2.h"
#include "truncate.h"
#include "ext4_extents.h" /* Needed for EXT_MAX_BLOCKS */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 793d44b84d7f..0723774bdfb5 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,6 +37,7 @@
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
+#include <linux/aio.h>
#include "ext4_jbd2.h"
#include "xattr.h"
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 5929cd0baa20..0f5670970915 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -18,6 +18,7 @@
#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/namei.h>
+#include <linux/aio.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
@@ -257,7 +258,8 @@ static void buffer_io_error(struct buffer_head *bh)
(unsigned long long)bh->b_blocknr);
}
-static void ext4_end_bio(struct bio *bio, int error)
+static void ext4_end_bio(struct bio *bio, int error,
+ struct batch_complete *batch)
{
ext4_io_end_t *io_end = bio->bi_private;
struct inode *inode;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 2db9380f5dda..d5038e5688f6 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -12,6 +12,7 @@
#include <linux/f2fs_fs.h>
#include <linux/buffer_head.h>
#include <linux/mpage.h>
+#include <linux/aio.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
@@ -329,7 +330,7 @@ repeat:
return page;
}
-static void read_end_io(struct bio *bio, int err)
+static void read_end_io(struct bio *bio, int err, struct batch_complete *batch)
{
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index d8e84e49a5c3..e36793c00caf 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -633,7 +633,8 @@ static const struct segment_allocation default_salloc_ops = {
.allocate_segment = allocate_segment_by_default,
};
-static void f2fs_end_io_write(struct bio *bio, int err)
+static void f2fs_end_io_write(struct bio *bio, int err,
+ struct batch_complete *batch)
{
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
diff --git a/fs/fat/file.c b/fs/fat/file.c
index b0b632e50ddb..73264395f705 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -17,8 +17,11 @@
#include <linux/blkdev.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
+#include <linux/falloc.h>
#include "fat.h"
+static long fat_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t len);
static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr)
{
u32 attr;
@@ -140,6 +143,22 @@ static long fat_generic_compat_ioctl(struct file *filp, unsigned int cmd,
static int fat_file_release(struct inode *inode, struct file *filp)
{
+
+ struct super_block *sb = inode->i_sb;
+ loff_t mmu_private_ideal;
+
+ /*
+ * Release unwritten fallocated blocks on file release.
+ * Do this only when the last open file descriptor is closed.
+ */
+ mutex_lock(&inode->i_mutex);
+ mmu_private_ideal = round_up(inode->i_size, sb->s_blocksize);
+
+ if (mmu_private_ideal < MSDOS_I(inode)->mmu_private &&
+ filp->f_dentry->d_count == 1)
+ fat_truncate_blocks(inode, inode->i_size);
+ mutex_unlock(&inode->i_mutex);
+
if ((filp->f_mode & FMODE_WRITE) &&
MSDOS_SB(inode->i_sb)->options.flush) {
fat_flush_inodes(inode->i_sb, inode, NULL);
@@ -174,6 +193,7 @@ const struct file_operations fat_file_operations = {
#endif
.fsync = fat_file_fsync,
.splice_read = generic_file_splice_read,
+ .fallocate = fat_fallocate,
};
static int fat_cont_expand(struct inode *inode, loff_t size)
@@ -212,6 +232,88 @@ out:
return err;
}
+/*
+ * Preallocate space for a file. This implements fat's fallocate file
+ * operation, which gets called from sys_fallocate system call. User
+ * space requests len bytes at offset. If FALLOC_FL_KEEP_SIZE is set
+ * we just allocate clusters without zeroing them out. Otherwise we
+ * allocate and zero out clusters via an expanding truncate. The
+ * allocated clusters are freed in fat_file_release().
+ */
+static long fat_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t len)
+{
+ int cluster, fclus, dclus;
+ int nr_cluster; /* Number of clusters to be allocated */
+ loff_t nr_bytes; /* Number of bytes to be allocated*/
+ loff_t free_bytes; /* Unused bytes in the last cluster of file*/
+ struct inode *inode = file->f_mapping->host;
+ struct super_block *sb = inode->i_sb;
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ int err = 0;
+
+ /* No support for hole punch or other fallocate flags. */
+ if (mode & ~FALLOC_FL_KEEP_SIZE)
+ return -EOPNOTSUPP;
+
+ mutex_lock(&inode->i_mutex);
+ if ((offset + len) <= MSDOS_I(inode)->mmu_private) {
+ fat_msg(sb, KERN_ERR,
+ "fat_fallocate(): Blocks already allocated");
+ err = -EINVAL;
+ goto error;
+ }
+
+ if (mode & FALLOC_FL_KEEP_SIZE) {
+ /* First compute the number of clusters to be allocated */
+ if (inode->i_size > 0) {
+ err = fat_get_cluster(inode, FAT_ENT_EOF,
+ &fclus, &dclus);
+ if (err < 0) {
+ fat_msg(sb, KERN_ERR,
+ "fat_fallocate(): fat_get_cluster() error");
+ goto error;
+ }
+ free_bytes = ((fclus + 1) << sbi->cluster_bits) -
+ inode->i_size;
+ nr_bytes = offset + len - inode->i_size - free_bytes;
+ MSDOS_I(inode)->mmu_private = (fclus + 1) <<
+ sbi->cluster_bits;
+ } else
+ nr_bytes = offset + len - inode->i_size;
+
+ nr_cluster = (nr_bytes + (sbi->cluster_size - 1)) >>
+ sbi->cluster_bits;
+
+ /* Start the allocation.We are not zeroing out the clusters */
+ while (nr_cluster-- > 0) {
+ err = fat_alloc_clusters(inode, &cluster, 1);
+ if (err) {
+ fat_msg(sb, KERN_ERR,
+ "fat_fallocate(): fat_alloc_clusters() error");
+ goto error;
+ }
+ err = fat_chain_add(inode, cluster, 1);
+ if (err) {
+ fat_free_clusters(inode, cluster);
+ goto error;
+ }
+ MSDOS_I(inode)->mmu_private += sbi->cluster_size;
+ }
+ } else {
+ /* This is just an expanding truncate */
+ err = fat_cont_expand(inode, (offset + len));
+ if (err) {
+ fat_msg(sb, KERN_ERR,
+ "fat_fallocate(): fat_cont_expand() error");
+ }
+ }
+
+error:
+ mutex_unlock(&inode->i_mutex);
+ return err;
+}
+
/* Free all clusters after the skip'th cluster. */
static int fat_free(struct inode *inode, int skip)
{
@@ -378,6 +480,9 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
struct inode *inode = dentry->d_inode;
unsigned int ia_valid;
int error;
+ loff_t mmu_private_ideal;
+
+ mmu_private_ideal = round_up(inode->i_size, dentry->d_sb->s_blocksize);
/* Check for setting the inode time. */
ia_valid = attr->ia_valid;
@@ -403,7 +508,8 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
if (attr->ia_valid & ATTR_SIZE) {
inode_dio_wait(inode);
- if (attr->ia_size > inode->i_size) {
+ if (attr->ia_size > inode->i_size &&
+ MSDOS_I(inode)->mmu_private <= mmu_private_ideal) {
error = fat_cont_expand(inode, attr->ia_size);
if (error || attr->ia_valid == ATTR_SIZE)
goto out;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 4ff901632b26..3300da0ce0e5 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -19,6 +19,7 @@
#include <linux/mpage.h>
#include <linux/buffer_head.h>
#include <linux/mount.h>
+#include <linux/aio.h>
#include <linux/vfs.h>
#include <linux/parser.h>
#include <linux/uio.h>
@@ -151,11 +152,65 @@ static void fat_write_failed(struct address_space *mapping, loff_t to)
}
}
+static int fat_zero_falloc_area(struct file *file,
+ struct address_space *mapping, loff_t pos)
+{
+ struct page *page;
+ struct inode *inode = mapping->host;
+ loff_t curpos = i_size_read(inode);
+ size_t count = pos - curpos;
+ int err;
+
+ do {
+ unsigned offset;
+ size_t bytes;
+ void *fsdata;
+
+ offset = (curpos & (PAGE_CACHE_SIZE - 1));
+ bytes = PAGE_CACHE_SIZE - offset;
+ bytes = min(bytes, count);
+
+ err = pagecache_write_begin(NULL, mapping, curpos, bytes,
+ AOP_FLAG_UNINTERRUPTIBLE,
+ &page, &fsdata);
+ if (err)
+ break;
+
+ zero_user(page, offset, bytes);
+
+ err = pagecache_write_end(NULL, mapping, curpos, bytes, bytes,
+ page, fsdata);
+ if (err < 0)
+ break;
+ curpos += bytes;
+ count -= bytes;
+ err = 0;
+ } while (count);
+
+ return err;
+}
+
static int fat_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
int err;
+ loff_t mmu_private_ideal, mmu_private_actual;
+ loff_t size;
+ struct inode *inode = mapping->host;
+ struct super_block *sb = inode->i_sb;
+
+ size = i_size_read(inode);
+ mmu_private_actual = MSDOS_I(inode)->mmu_private;
+ mmu_private_ideal = round_up(size, sb->s_blocksize);
+ if ((mmu_private_actual > mmu_private_ideal) && (pos > size)) {
+ err = fat_zero_falloc_area(file, mapping, pos);
+ if (err) {
+ fat_msg(sb, KERN_ERR,
+ "Error (%d) zeroing fallocated area", err);
+ return err;
+ }
+ }
*pagep = NULL;
err = cont_write_begin(file, mapping, pos, len, flags,
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index b3aaf7b3578b..aef34b1e635e 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -38,6 +38,7 @@
#include <linux/device.h>
#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/aio.h>
#include <linux/kdev_t.h>
#include <linux/kthread.h>
#include <linux/list.h>
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index a6c1664e330b..1d55f9465400 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -19,6 +19,7 @@
#include <linux/pipe_fs_i.h>
#include <linux/swap.h>
#include <linux/splice.h>
+#include <linux/aio.h>
MODULE_ALIAS_MISCDEV(FUSE_MINOR);
MODULE_ALIAS("devname:fuse");
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 4655e59d545b..d1c9b85b3f58 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -15,6 +15,7 @@
#include <linux/module.h>
#include <linux/compat.h>
#include <linux/swap.h>
+#include <linux/aio.h>
static const struct file_operations fuse_direct_io_file_operations;
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 9883694f1e7c..0bad69ed6336 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -20,6 +20,7 @@
#include <linux/swap.h>
#include <linux/gfs2_ondisk.h>
#include <linux/backing-dev.h>
+#include <linux/aio.h>
#include "gfs2.h"
#include "incore.h"
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index d79c2dadc536..acd16764b133 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -25,6 +25,7 @@
#include <asm/uaccess.h>
#include <linux/dlm.h>
#include <linux/dlm_plock.h>
+#include <linux/aio.h>
#include "gfs2.h"
#include "incore.h"
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index c5fa758fd844..91a5ebb614ca 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -200,7 +200,8 @@ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec,
*
*/
-static void gfs2_end_log_write(struct bio *bio, int error)
+static void gfs2_end_log_write(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct gfs2_sbd *sdp = bio->bi_private;
struct bio_vec *bvec;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 60ede2a0f43f..86eb657aeaca 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -155,7 +155,8 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent)
return -EINVAL;
}
-static void end_bio_io_page(struct bio *bio, int error)
+static void end_bio_io_page(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct page *page = bio->bi_private;
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 716e1aafb2e2..f9299d8a64e3 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -14,6 +14,7 @@
#include <linux/pagemap.h>
#include <linux/mpage.h>
#include <linux/sched.h>
+#include <linux/aio.h>
#include "hfs_fs.h"
#include "btree.h"
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 7faaa964968e..f833d35630ab 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -14,6 +14,7 @@
#include <linux/pagemap.h>
#include <linux/mpage.h>
#include <linux/sched.h>
+#include <linux/aio.h>
#include "hfsplus_fs.h"
#include "hfsplus_raw.h"
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index b51a6079108d..96375a5124b2 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -24,7 +24,8 @@ struct hfsplus_wd {
u16 embed_count;
};
-static void hfsplus_end_io_sync(struct bio *bio, int err)
+static void hfsplus_end_io_sync(struct bio *bio, int err,
+ struct batch_complete *batch)
{
if (err)
clear_bit(BIO_UPTODATE, &bio->bi_flags);
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 77554b61d124..730f24e282a6 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -23,6 +23,7 @@
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/writeback.h>
+#include <linux/aio.h>
#include "jfs_incore.h"
#include "jfs_inode.h"
#include "jfs_filsys.h"
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index c57499dca89c..b02926c80b84 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -2153,7 +2153,7 @@ static void lbmStartIO(struct lbuf * bp)
/* check if journaling to disk has been disabled */
if (log->no_integrity) {
bio->bi_size = 0;
- lbmIODone(bio, 0);
+ lbmIODone(bio, 0, NULL);
} else {
submit_bio(WRITE_SYNC, bio);
INCREMENT(lmStat.submitted);
@@ -2191,7 +2191,7 @@ static int lbmIOWait(struct lbuf * bp, int flag)
*
* executed at INTIODONE level
*/
-static void lbmIODone(struct bio *bio, int error)
+static void lbmIODone(struct bio *bio, int error, struct batch_complete *batch)
{
struct lbuf *bp = bio->bi_private;
struct lbuf *nextbp, *tail;
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 6740d34cd82b..6ba675782e9f 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -283,7 +283,8 @@ static void last_read_complete(struct page *page)
unlock_page(page);
}
-static void metapage_read_end_io(struct bio *bio, int err)
+static void metapage_read_end_io(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct page *page = bio->bi_private;
@@ -338,7 +339,8 @@ static void last_write_complete(struct page *page)
end_page_writeback(page);
}
-static void metapage_write_end_io(struct bio *bio, int err)
+static void metapage_write_end_io(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct page *page = bio->bi_private;
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 550475ca6a0e..0ae2254f74bf 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -14,7 +14,8 @@
#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
-static void request_complete(struct bio *bio, int err)
+static void request_complete(struct bio *bio, int err,
+ struct batch_complete *batch)
{
complete((struct completion *)bio->bi_private);
}
@@ -64,7 +65,8 @@ static int bdev_readpage(void *_sb, struct page *page)
static DECLARE_WAIT_QUEUE_HEAD(wq);
-static void writeseg_end_io(struct bio *bio, int err)
+static void writeseg_end_io(struct bio *bio, int err,
+ struct batch_complete *batch)
{
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -168,7 +170,7 @@ static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len)
}
-static void erase_end_io(struct bio *bio, int err)
+static void erase_end_io(struct bio *bio, int err, struct batch_complete *batch)
{
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct super_block *sb = bio->bi_private;
diff --git a/fs/mpage.c b/fs/mpage.c
index 0face1c4d4c6..a4089bbfee0a 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -41,7 +41,7 @@
* status of that page is hard. See end_buffer_async_read() for the details.
* There is no point in duplicating all that complexity.
*/
-static void mpage_end_io(struct bio *bio, int err)
+static void mpage_end_io(struct bio *bio, int err, struct batch_complete *batch)
{
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 434b93ec0970..76cf69557051 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -143,7 +143,7 @@ bl_submit_bio(int rw, struct bio *bio)
static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
struct pnfs_block_extent *be,
- void (*end_io)(struct bio *, int err),
+ bio_end_io_t *end_io,
struct parallel_io *par)
{
struct bio *bio;
@@ -167,7 +167,7 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw,
sector_t isect, struct page *page,
struct pnfs_block_extent *be,
- void (*end_io)(struct bio *, int err),
+ bio_end_io_t *end_io,
struct parallel_io *par,
unsigned int offset, int len)
{
@@ -190,7 +190,7 @@ retry:
static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
sector_t isect, struct page *page,
struct pnfs_block_extent *be,
- void (*end_io)(struct bio *, int err),
+ bio_end_io_t *end_io,
struct parallel_io *par)
{
return do_add_page_to_bio(bio, npg, rw, isect, page, be,
@@ -198,7 +198,8 @@ static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
}
/* This is basically copied from mpage_end_io_read */
-static void bl_end_io_read(struct bio *bio, int err)
+static void bl_end_io_read(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct parallel_io *par = bio->bi_private;
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -380,7 +381,8 @@ static void mark_extents_written(struct pnfs_block_layout *bl,
}
}
-static void bl_end_io_write_zero(struct bio *bio, int err)
+static void bl_end_io_write_zero(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct parallel_io *par = bio->bi_private;
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -408,7 +410,8 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
put_parallel(par);
}
-static void bl_end_io_write(struct bio *bio, int err)
+static void bl_end_io_write(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct parallel_io *par = bio->bi_private;
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -487,7 +490,7 @@ map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
}
static void
-bl_read_single_end_io(struct bio *bio, int error)
+bl_read_single_end_io(struct bio *bio, int error, struct batch_complete *batch)
{
struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
struct page *page = bvec->bv_page;
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index cf02f5530713..689fb608648e 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -25,7 +25,7 @@
#include <linux/gfp.h>
#include <linux/mpage.h>
#include <linux/writeback.h>
-#include <linux/uio.h>
+#include <linux/aio.h>
#include "nilfs.h"
#include "btnode.h"
#include "segment.h"
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index dc9a913784ab..680b65b8a74d 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -338,7 +338,8 @@ void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed)
/*
* BIO operations
*/
-static void nilfs_end_bio_write(struct bio *bio, int err)
+static void nilfs_end_bio_write(struct bio *bio, int err,
+ struct batch_complete *batch)
{
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct nilfs_segment_buffer *segbuf = bio->bi_private;
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 1da4b81e6f76..c5670b8d198c 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -27,6 +27,7 @@
#include <linux/swap.h>
#include <linux/uio.h>
#include <linux/writeback.h>
+#include <linux/aio.h>
#include <asm/page.h>
#include <asm/uaccess.h>
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index d3e118cc6ffa..2778b0255dc6 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -28,6 +28,7 @@
#include <linux/quotaops.h>
#include <linux/slab.h>
#include <linux/log2.h>
+#include <linux/aio.h>
#include "aops.h"
#include "attrib.h"
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index ffb2da370a99..f671e49beb34 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -22,6 +22,8 @@
#ifndef OCFS2_AOPS_H
#define OCFS2_AOPS_H
+#include <linux/aio.h>
+
handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
struct page *page,
unsigned from,
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 8c3318bf2252..0cc19d0417bf 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -372,8 +372,8 @@ static void o2hb_wait_on_io(struct o2hb_region *reg,
wait_for_completion(&wc->wc_io_complete);
}
-static void o2hb_bio_end_io(struct bio *bio,
- int error)
+static void o2hb_bio_end_io(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct o2hb_bio_wait_ctxt *wc = bio->bi_private;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 12ae194ac943..3a44a648dae7 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2322,7 +2322,7 @@ int ocfs2_inode_lock_full_nested(struct inode *inode,
status = __ocfs2_cluster_lock(osb, lockres, level, dlm_flags,
arg_flags, subclass, _RET_IP_);
if (status < 0) {
- if (status != -EAGAIN && status != -EIOCBRETRY)
+ if (status != -EAGAIN)
mlog_errno(status);
goto bail;
}
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 88924a3133fa..c765bdf6d60e 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -28,6 +28,8 @@
#include "extent_map.h"
+struct iocb;
+
/* OCFS2 Inode Private Data */
struct ocfs2_inode_info
{
diff --git a/fs/pipe.c b/fs/pipe.c
index a029a14bacf1..d2c45e14e6d8 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -21,6 +21,7 @@
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
+#include <linux/aio.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
diff --git a/fs/proc/base.c b/fs/proc/base.c
index dd51e50001fe..4d3ebd6fd710 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2673,6 +2673,7 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
REG("smaps", S_IRUGO, proc_pid_smaps_operations),
REG("pagemap", S_IRUGO, proc_pagemap_operations),
+ REG("pagemap2", S_IRUGO, proc_pagemap2_operations),
#endif
#ifdef CONFIG_SECURITY
DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
@@ -3029,6 +3030,7 @@ static const struct pid_entry tid_base_stuff[] = {
REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
REG("smaps", S_IRUGO, proc_tid_smaps_operations),
REG("pagemap", S_IRUGO, proc_pagemap_operations),
+ REG("pagemap2", S_IRUGO, proc_pagemap2_operations),
#endif
#ifdef CONFIG_SECURITY
DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index d600fb098b6a..f417d4398b68 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -290,6 +290,7 @@ extern const struct file_operations proc_pid_smaps_operations;
extern const struct file_operations proc_tid_smaps_operations;
extern const struct file_operations proc_clear_refs_operations;
extern const struct file_operations proc_pagemap_operations;
+extern const struct file_operations proc_pagemap2_operations;
extern unsigned long task_vsize(struct mm_struct *);
extern unsigned long task_statm(struct mm_struct *,
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3e636d864d56..3240a497926c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -688,10 +688,41 @@ const struct file_operations proc_tid_smaps_operations = {
.release = seq_release_private,
};
+enum clear_refs_types {
+ CLEAR_REFS_ALL = 1,
+ CLEAR_REFS_ANON,
+ CLEAR_REFS_MAPPED,
+ CLEAR_REFS_SOFT_DIRTY,
+ CLEAR_REFS_LAST,
+};
+
+struct clear_refs_private {
+ struct vm_area_struct *vma;
+ enum clear_refs_types type;
+};
+
+static inline void clear_soft_dirty(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *pte)
+{
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ /*
+ * The soft-dirty tracker uses #PF-s to catch writes
+ * to pages, so write-protect the pte as well. See the
+ * Documentation/vm/soft-dirty.txt for full description
+ * of how soft-dirty works.
+ */
+ pte_t ptent = *pte;
+ ptent = pte_wrprotect(ptent);
+ ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
+ set_pte_at(vma->vm_mm, addr, pte, ptent);
+#endif
+}
+
static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
- struct vm_area_struct *vma = walk->private;
+ struct clear_refs_private *cp = walk->private;
+ struct vm_area_struct *vma = cp->vma;
pte_t *pte, ptent;
spinlock_t *ptl;
struct page *page;
@@ -706,6 +737,11 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
if (!pte_present(ptent))
continue;
+ if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
+ clear_soft_dirty(vma, addr, pte);
+ continue;
+ }
+
page = vm_normal_page(vma, addr, ptent);
if (!page)
continue;
@@ -719,10 +755,6 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
return 0;
}
-#define CLEAR_REFS_ALL 1
-#define CLEAR_REFS_ANON 2
-#define CLEAR_REFS_MAPPED 3
-
static ssize_t clear_refs_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
@@ -730,7 +762,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
char buffer[PROC_NUMBUF];
struct mm_struct *mm;
struct vm_area_struct *vma;
- int type;
+ enum clear_refs_types type;
+ int itype;
int rv;
memset(buffer, 0, sizeof(buffer));
@@ -738,23 +771,28 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
return -EFAULT;
- rv = kstrtoint(strstrip(buffer), 10, &type);
+ rv = kstrtoint(strstrip(buffer), 10, &itype);
if (rv < 0)
return rv;
- if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED)
+ type = (enum clear_refs_types)itype;
+ if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
return -EINVAL;
task = get_proc_task(file_inode(file));
if (!task)
return -ESRCH;
mm = get_task_mm(task);
if (mm) {
+ struct clear_refs_private cp = {
+ .type = type,
+ };
struct mm_walk clear_refs_walk = {
.pmd_entry = clear_refs_pte_range,
.mm = mm,
+ .private = &cp,
};
down_read(&mm->mmap_sem);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
- clear_refs_walk.private = vma;
+ cp.vma = vma;
if (is_vm_hugetlb_page(vma))
continue;
/*
@@ -794,6 +832,7 @@ typedef struct {
struct pagemapread {
int pos, len;
pagemap_entry_t *buffer;
+ bool v2;
};
#define PAGEMAP_WALK_SIZE (PMD_SIZE)
@@ -807,14 +846,17 @@ struct pagemapread {
#define PM_PSHIFT_BITS 6
#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
-#define PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
+#define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1)
#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
+/* in pagemap2 pshift bits are occupied with more status bits */
+#define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT))
+#define __PM_SOFT_DIRTY (1LL)
#define PM_PRESENT PM_STATUS(4LL)
#define PM_SWAP PM_STATUS(2LL)
#define PM_FILE PM_STATUS(1LL)
-#define PM_NOT_PRESENT PM_PSHIFT(PAGE_SHIFT)
+#define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0)
#define PM_END_OF_BUFFER 1
static inline pagemap_entry_t make_pme(u64 val)
@@ -837,7 +879,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
struct pagemapread *pm = walk->private;
unsigned long addr;
int err = 0;
- pagemap_entry_t pme = make_pme(PM_NOT_PRESENT);
+ pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
for (addr = start; addr < end; addr += PAGE_SIZE) {
err = add_to_pagemap(addr, &pme, pm);
@@ -847,11 +889,12 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
return err;
}
-static void pte_to_pagemap_entry(pagemap_entry_t *pme,
+static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
struct vm_area_struct *vma, unsigned long addr, pte_t pte)
{
u64 frame, flags;
struct page *page = NULL;
+ int flags2 = 0;
if (pte_present(pte)) {
frame = pte_pfn(pte);
@@ -866,19 +909,21 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme,
if (is_migration_entry(entry))
page = migration_entry_to_page(entry);
} else {
- *pme = make_pme(PM_NOT_PRESENT);
+ *pme = make_pme(PM_NOT_PRESENT(pm->v2));
return;
}
if (page && !PageAnon(page))
flags |= PM_FILE;
+ if (pte_soft_dirty(pte))
+ flags2 |= __PM_SOFT_DIRTY;
- *pme = make_pme(PM_PFRAME(frame) | PM_PSHIFT(PAGE_SHIFT) | flags);
+ *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
- pmd_t pmd, int offset)
+static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
+ pmd_t pmd, int offset, int pmd_flags2)
{
/*
* Currently pmd for thp is always present because thp can not be
@@ -887,13 +932,13 @@ static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
*/
if (pmd_present(pmd))
*pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
- | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);
+ | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
else
- *pme = make_pme(PM_NOT_PRESENT);
+ *pme = make_pme(PM_NOT_PRESENT(pm->v2));
}
#else
-static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
- pmd_t pmd, int offset)
+static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
+ pmd_t pmd, int offset, int pmd_flags2)
{
}
#endif
@@ -905,17 +950,20 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
struct pagemapread *pm = walk->private;
pte_t *pte;
int err = 0;
- pagemap_entry_t pme = make_pme(PM_NOT_PRESENT);
+ pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
/* find the first VMA at or above 'addr' */
vma = find_vma(walk->mm, addr);
if (vma && pmd_trans_huge_lock(pmd, vma) == 1) {
+ int pmd_flags2;
+
+ pmd_flags2 = (pmd_soft_dirty(*pmd) ? __PM_SOFT_DIRTY : 0);
for (; addr != end; addr += PAGE_SIZE) {
unsigned long offset;
offset = (addr & ~PAGEMAP_WALK_MASK) >>
PAGE_SHIFT;
- thp_pmd_to_pagemap_entry(&pme, *pmd, offset);
+ thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2);
err = add_to_pagemap(addr, &pme, pm);
if (err)
break;
@@ -932,7 +980,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
* and need a new, higher one */
if (vma && (addr >= vma->vm_end)) {
vma = find_vma(walk->mm, addr);
- pme = make_pme(PM_NOT_PRESENT);
+ pme = make_pme(PM_NOT_PRESENT(pm->v2));
}
/* check that 'vma' actually covers this address,
@@ -940,7 +988,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
if (vma && (vma->vm_start <= addr) &&
!is_vm_hugetlb_page(vma)) {
pte = pte_offset_map(pmd, addr);
- pte_to_pagemap_entry(&pme, vma, addr, *pte);
+ pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
/* unmap before userspace copy */
pte_unmap(pte);
}
@@ -955,14 +1003,14 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
}
#ifdef CONFIG_HUGETLB_PAGE
-static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme,
+static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
pte_t pte, int offset)
{
if (pte_present(pte))
*pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset)
- | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);
+ | PM_STATUS2(pm->v2, 0) | PM_PRESENT);
else
- *pme = make_pme(PM_NOT_PRESENT);
+ *pme = make_pme(PM_NOT_PRESENT(pm->v2));
}
/* This function walks within one hugetlb entry in the single call */
@@ -976,7 +1024,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
for (; addr != end; addr += PAGE_SIZE) {
int offset = (addr & ~hmask) >> PAGE_SHIFT;
- huge_pte_to_pagemap_entry(&pme, *pte, offset);
+ huge_pte_to_pagemap_entry(&pme, pm, *pte, offset);
err = add_to_pagemap(addr, &pme, pm);
if (err)
return err;
@@ -1012,8 +1060,8 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
* determine which areas of memory are actually mapped and llseek to
* skip over unmapped regions.
*/
-static ssize_t pagemap_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
+static ssize_t do_pagemap_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos, bool v2)
{
struct task_struct *task = get_proc_task(file_inode(file));
struct mm_struct *mm;
@@ -1038,6 +1086,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
if (!count)
goto out_task;
+ pm.v2 = v2;
pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
pm.buffer = kmalloc(pm.len, GFP_TEMPORARY);
ret = -ENOMEM;
@@ -1110,10 +1159,27 @@ out:
return ret;
}
+static ssize_t pagemap_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return do_pagemap_read(file, buf, count, ppos, false);
+}
+
const struct file_operations proc_pagemap_operations = {
.llseek = mem_lseek, /* borrow this */
.read = pagemap_read,
};
+
+static ssize_t pagemap2_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return do_pagemap_read(file, buf, count, ppos, true);
+}
+
+const struct file_operations proc_pagemap2_operations = {
+ .llseek = mem_lseek, /* borrow this */
+ .read = pagemap2_read,
+};
#endif /* CONFIG_PROC_PAGE_MONITOR */
#ifdef CONFIG_NUMA
diff --git a/fs/read_write.c b/fs/read_write.c
index 605dbbcb1973..03c47562d43c 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -9,6 +9,7 @@
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/uio.h>
+#include <linux/aio.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
#include <linux/export.h>
@@ -326,16 +327,6 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count
return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
}
-static void wait_on_retry_sync_kiocb(struct kiocb *iocb)
-{
- set_current_state(TASK_UNINTERRUPTIBLE);
- if (!kiocbIsKicked(iocb))
- schedule();
- else
- kiocbClearKicked(iocb);
- __set_current_state(TASK_RUNNING);
-}
-
ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
struct iovec iov = { .iov_base = buf, .iov_len = len };
@@ -347,13 +338,7 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp
kiocb.ki_left = len;
kiocb.ki_nbytes = len;
- for (;;) {
- ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
- if (ret != -EIOCBRETRY)
- break;
- wait_on_retry_sync_kiocb(&kiocb);
- }
-
+ ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
if (-EIOCBQUEUED == ret)
ret = wait_on_sync_kiocb(&kiocb);
*ppos = kiocb.ki_pos;
@@ -403,13 +388,7 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof
kiocb.ki_left = len;
kiocb.ki_nbytes = len;
- for (;;) {
- ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
- if (ret != -EIOCBRETRY)
- break;
- wait_on_retry_sync_kiocb(&kiocb);
- }
-
+ ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
if (-EIOCBQUEUED == ret)
ret = wait_on_sync_kiocb(&kiocb);
*ppos = kiocb.ki_pos;
@@ -589,13 +568,7 @@ static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
kiocb.ki_left = len;
kiocb.ki_nbytes = len;
- for (;;) {
- ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
- if (ret != -EIOCBRETRY)
- break;
- wait_on_retry_sync_kiocb(&kiocb);
- }
-
+ ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
if (ret == -EIOCBQUEUED)
ret = wait_on_sync_kiocb(&kiocb);
*ppos = kiocb.ki_pos;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index ea5061fd4f3e..77d6d47abc83 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -18,6 +18,7 @@
#include <linux/writeback.h>
#include <linux/quotaops.h>
#include <linux/swap.h>
+#include <linux/aio.h>
int reiserfs_commit_write(struct file *f, struct page *page,
unsigned from, unsigned to);
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index f12189d2db1d..14374530784c 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -50,6 +50,7 @@
*/
#include "ubifs.h"
+#include <linux/aio.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/slab.h>
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 7a12e48ad819..b6d15d349810 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -38,6 +38,7 @@
#include <linux/slab.h>
#include <linux/crc-itu-t.h>
#include <linux/mpage.h>
+#include <linux/aio.h>
#include "udf_i.h"
#include "udf_sb.h"
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 3244c988d379..f64ee7130509 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -31,6 +31,7 @@
#include "xfs_vnodeops.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
+#include <linux/aio.h>
#include <linux/gfp.h>
#include <linux/mpage.h>
#include <linux/pagevec.h>
@@ -379,7 +380,8 @@ xfs_imap_valid(
STATIC void
xfs_end_bio(
struct bio *bio,
- int error)
+ int error,
+ struct batch_complete *batch)
{
xfs_ioend_t *ioend = bio->bi_private;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 82b70bda9f47..cee0e42b5389 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1224,7 +1224,8 @@ _xfs_buf_ioend(
STATIC void
xfs_buf_bio_end_io(
struct bio *bio,
- int error)
+ int error,
+ struct batch_complete *batch)
{
xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 054d60c0ac57..a5f2042aec8b 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -36,6 +36,7 @@
#include "xfs_ioctl.h"
#include "xfs_trace.h"
+#include <linux/aio.h>
#include <linux/dcache.h>
#include <linux/falloc.h>
#include <linux/pagevec.h>
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index a59ff51b0166..347fb7bfcbe2 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -396,6 +396,28 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm,
#define arch_start_context_switch(prev) do {} while (0)
#endif
+#ifndef CONFIG_HAVE_ARCH_SOFT_DIRTY
+static inline int pte_soft_dirty(pte_t pte)
+{
+ return 0;
+}
+
+static inline int pmd_soft_dirty(pmd_t pmd)
+{
+ return 0;
+}
+
+static inline pte_t pte_mksoft_dirty(pte_t pte)
+{
+ return pte;
+}
+
+static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
+{
+ return pmd;
+}
+#endif
+
#ifndef __HAVE_PFNMAP_TRACKING
/*
* Interfaces that can be used by architecture code to keep track of
diff --git a/include/linux/aio.h b/include/linux/aio.h
index 31ff6dba4872..a7e4c595825e 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -6,94 +6,38 @@
#include <linux/aio_abi.h>
#include <linux/uio.h>
#include <linux/rcupdate.h>
-
#include <linux/atomic.h>
-
-#define AIO_MAXSEGS 4
-#define AIO_KIOGRP_NR_ATOMIC 8
+#include <linux/batch_complete.h>
struct kioctx;
+struct kiocb;
+struct batch_complete;
-/* Notes on cancelling a kiocb:
- * If a kiocb is cancelled, aio_complete may return 0 to indicate
- * that cancel has not yet disposed of the kiocb. All cancel
- * operations *must* call aio_put_req to dispose of the kiocb
- * to guard against races with the completion code.
- */
-#define KIOCB_C_CANCELLED 0x01
-#define KIOCB_C_COMPLETE 0x02
-
-#define KIOCB_SYNC_KEY (~0U)
+#define KIOCB_KEY 0
-/* ki_flags bits */
/*
- * This may be used for cancel/retry serialization in the future, but
- * for now it's unused and we probably don't want modules to even
- * think they can use it.
+ * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either
+ * cancelled or completed (this makes a certain amount of sense because
+ * successful cancellation - io_cancel() - does deliver the completion to
+ * userspace).
+ *
+ * And since most things don't implement kiocb cancellation and we'd really like
+ * kiocb completion to be lockless when possible, we use ki_cancel to
+ * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED
+ * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel().
*/
-/* #define KIF_LOCKED 0 */
-#define KIF_KICKED 1
-#define KIF_CANCELLED 2
-
-#define kiocbTryLock(iocb) test_and_set_bit(KIF_LOCKED, &(iocb)->ki_flags)
-#define kiocbTryKick(iocb) test_and_set_bit(KIF_KICKED, &(iocb)->ki_flags)
+#define KIOCB_CANCELLED ((void *) (~0ULL))
-#define kiocbSetLocked(iocb) set_bit(KIF_LOCKED, &(iocb)->ki_flags)
-#define kiocbSetKicked(iocb) set_bit(KIF_KICKED, &(iocb)->ki_flags)
-#define kiocbSetCancelled(iocb) set_bit(KIF_CANCELLED, &(iocb)->ki_flags)
+typedef int (kiocb_cancel_fn)(struct kiocb *, struct io_event *);
-#define kiocbClearLocked(iocb) clear_bit(KIF_LOCKED, &(iocb)->ki_flags)
-#define kiocbClearKicked(iocb) clear_bit(KIF_KICKED, &(iocb)->ki_flags)
-#define kiocbClearCancelled(iocb) clear_bit(KIF_CANCELLED, &(iocb)->ki_flags)
-
-#define kiocbIsLocked(iocb) test_bit(KIF_LOCKED, &(iocb)->ki_flags)
-#define kiocbIsKicked(iocb) test_bit(KIF_KICKED, &(iocb)->ki_flags)
-#define kiocbIsCancelled(iocb) test_bit(KIF_CANCELLED, &(iocb)->ki_flags)
-
-/* is there a better place to document function pointer methods? */
-/**
- * ki_retry - iocb forward progress callback
- * @kiocb: The kiocb struct to advance by performing an operation.
- *
- * This callback is called when the AIO core wants a given AIO operation
- * to make forward progress. The kiocb argument describes the operation
- * that is to be performed. As the operation proceeds, perhaps partially,
- * ki_retry is expected to update the kiocb with progress made. Typically
- * ki_retry is set in the AIO core and it itself calls file_operations
- * helpers.
- *
- * ki_retry's return value determines when the AIO operation is completed
- * and an event is generated in the AIO event ring. Except the special
- * return values described below, the value that is returned from ki_retry
- * is transferred directly into the completion ring as the operation's
- * resulting status. Once this has happened ki_retry *MUST NOT* reference
- * the kiocb pointer again.
- *
- * If ki_retry returns -EIOCBQUEUED it has made a promise that aio_complete()
- * will be called on the kiocb pointer in the future. The AIO core will
- * not ask the method again -- ki_retry must ensure forward progress.
- * aio_complete() must be called once and only once in the future, multiple
- * calls may result in undefined behaviour.
- *
- * If ki_retry returns -EIOCBRETRY it has made a promise that kick_iocb()
- * will be called on the kiocb pointer in the future. This may happen
- * through generic helpers that associate kiocb->ki_wait with a wait
- * queue head that ki_retry uses via current->io_wait. It can also happen
- * with custom tracking and manual calls to kick_iocb(), though that is
- * discouraged. In either case, kick_iocb() must be called once and only
- * once. ki_retry must ensure forward progress, the AIO core will wait
- * indefinitely for kick_iocb() to be called.
- */
struct kiocb {
- struct list_head ki_run_list;
- unsigned long ki_flags;
- int ki_users;
- unsigned ki_key; /* id of this request */
+ struct rb_node ki_node;
+
+ atomic_t ki_users;
struct file *ki_filp;
- struct kioctx *ki_ctx; /* may be NULL for sync ops */
- int (*ki_cancel)(struct kiocb *, struct io_event *);
- ssize_t (*ki_retry)(struct kiocb *);
+ struct kioctx *ki_ctx; /* NULL for sync ops */
+ kiocb_cancel_fn *ki_cancel;
void (*ki_dtor)(struct kiocb *);
union {
@@ -102,6 +46,9 @@ struct kiocb {
} ki_obj;
__u64 ki_user_data; /* user's data for completion */
+ long ki_res;
+ long ki_res2;
+
loff_t ki_pos;
void *private;
@@ -117,7 +64,6 @@ struct kiocb {
struct list_head ki_list; /* the aio core uses this
* for cancellation */
- struct list_head ki_batch; /* batch allocation */
/*
* If the aio_resfd field of the userspace iocb is not zero,
@@ -128,108 +74,55 @@ struct kiocb {
static inline bool is_sync_kiocb(struct kiocb *kiocb)
{
- return kiocb->ki_key == KIOCB_SYNC_KEY;
+ return kiocb->ki_ctx == NULL;
}
static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
{
*kiocb = (struct kiocb) {
- .ki_users = 1,
- .ki_key = KIOCB_SYNC_KEY,
+ .ki_users = ATOMIC_INIT(1),
+ .ki_ctx = NULL,
.ki_filp = filp,
.ki_obj.tsk = current,
};
}
-#define AIO_RING_MAGIC 0xa10a10a1
-#define AIO_RING_COMPAT_FEATURES 1
-#define AIO_RING_INCOMPAT_FEATURES 0
-struct aio_ring {
- unsigned id; /* kernel internal index number */
- unsigned nr; /* number of io_events */
- unsigned head;
- unsigned tail;
-
- unsigned magic;
- unsigned compat_features;
- unsigned incompat_features;
- unsigned header_length; /* size of aio_ring */
-
-
- struct io_event io_events[0];
-}; /* 128 bytes + ring size */
-
-#define AIO_RING_PAGES 8
-struct aio_ring_info {
- unsigned long mmap_base;
- unsigned long mmap_size;
-
- struct page **ring_pages;
- spinlock_t ring_lock;
- long nr_pages;
-
- unsigned nr, tail;
-
- struct page *internal_pages[AIO_RING_PAGES];
-};
-
-static inline unsigned aio_ring_avail(struct aio_ring_info *info,
- struct aio_ring *ring)
-{
- return (ring->head + info->nr - 1 - ring->tail) % info->nr;
-}
-
-struct kioctx {
- atomic_t users;
- int dead;
- struct mm_struct *mm;
-
- /* This needs improving */
- unsigned long user_id;
- struct hlist_node list;
-
- wait_queue_head_t wait;
-
- spinlock_t ctx_lock;
-
- int reqs_active;
- struct list_head active_reqs; /* used for cancellation */
- struct list_head run_list; /* used for kicked reqs */
-
- /* sys_io_setup currently limits this to an unsigned int */
- unsigned max_reqs;
-
- struct aio_ring_info ring_info;
-
- struct delayed_work wq;
-
- struct rcu_head rcu_head;
-};
-
/* prototypes */
-extern unsigned aio_max_size;
-
#ifdef CONFIG_AIO
extern ssize_t wait_on_sync_kiocb(struct kiocb *iocb);
-extern int aio_put_req(struct kiocb *iocb);
-extern void kick_iocb(struct kiocb *iocb);
-extern int aio_complete(struct kiocb *iocb, long res, long res2);
+extern void aio_put_req(struct kiocb *iocb);
+extern void batch_complete_aio(struct batch_complete *batch);
+extern void aio_complete_batch(struct kiocb *iocb, long res, long res2,
+ struct batch_complete *batch);
struct mm_struct;
extern void exit_aio(struct mm_struct *mm);
extern long do_io_submit(aio_context_t ctx_id, long nr,
struct iocb __user *__user *iocbpp, bool compat);
+void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel);
#else
static inline ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { return 0; }
-static inline int aio_put_req(struct kiocb *iocb) { return 0; }
-static inline void kick_iocb(struct kiocb *iocb) { }
-static inline int aio_complete(struct kiocb *iocb, long res, long res2) { return 0; }
+static inline void aio_put_req(struct kiocb *iocb) { }
+
+static inline void batch_complete_aio(struct batch_complete *batch) { }
+static inline void aio_complete_batch(struct kiocb *iocb, long res, long res2,
+ struct batch_complete *batch)
+{
+ return;
+}
struct mm_struct;
static inline void exit_aio(struct mm_struct *mm) { }
static inline long do_io_submit(aio_context_t ctx_id, long nr,
struct iocb __user * __user *iocbpp,
bool compat) { return 0; }
+static inline void kiocb_set_cancel_fn(struct kiocb *req,
+ kiocb_cancel_fn *cancel) { }
#endif /* CONFIG_AIO */
+static inline void aio_complete(struct kiocb *iocb, long res, long res2)
+{
+ aio_complete_batch(iocb, res, res2, NULL);
+}
+
static inline struct kiocb *list_kiocb(struct list_head *h)
{
return list_entry(h, struct kiocb, ki_list);
diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
index f7f1d7169b11..6fd5cc80f62f 100644
--- a/include/linux/balloon_compaction.h
+++ b/include/linux/balloon_compaction.h
@@ -213,8 +213,15 @@ static inline bool balloon_compaction_check(void)
return true;
}
+static inline void balloon_event_count(enum vm_event_item item)
+{
+ count_vm_event(item);
+}
#else /* !CONFIG_BALLOON_COMPACTION */
+/* A macro, to avoid generating references to the undefined COMPACTBALLOON* */
+#define balloon_event_count(item) do { } while (0)
+
static inline void *balloon_mapping_alloc(void *balloon_device,
const struct address_space_operations *a_ops)
{
diff --git a/include/linux/batch_complete.h b/include/linux/batch_complete.h
new file mode 100644
index 000000000000..8167a9d306fb
--- /dev/null
+++ b/include/linux/batch_complete.h
@@ -0,0 +1,23 @@
+#ifndef _LINUX_BATCH_COMPLETE_H
+#define _LINUX_BATCH_COMPLETE_H
+
+#include <linux/rbtree.h>
+
+/*
+ * Common stuff to the aio and block code for batch completion. Everything
+ * important is elsewhere:
+ */
+
+struct bio;
+
+struct bio_list {
+ struct bio *head;
+ struct bio *tail;
+};
+
+struct batch_complete {
+ struct bio_list bio;
+ struct rb_root kiocb;
+};
+
+#endif
diff --git a/include/linux/bio.h b/include/linux/bio.h
index ef24466d8f82..5db8a51eebb1 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -24,6 +24,7 @@
#include <linux/mempool.h>
#include <linux/ioprio.h>
#include <linux/bug.h>
+#include <linux/batch_complete.h>
#ifdef CONFIG_BLOCK
@@ -69,6 +70,8 @@
#define bio_sectors(bio) ((bio)->bi_size >> 9)
#define bio_end_sector(bio) ((bio)->bi_sector + bio_sectors((bio)))
+void bio_endio_batch(struct bio *bio, int error, struct batch_complete *batch);
+
static inline unsigned int bio_cur_bytes(struct bio *bio)
{
if (bio->bi_vcnt)
@@ -252,7 +255,25 @@ static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask)
}
-extern void bio_endio(struct bio *, int);
+/**
+ * bio_endio - end I/O on a bio
+ * @bio: bio
+ * @error: error, if any
+ *
+ * Description:
+ * bio_endio() will end I/O on the whole bio. bio_endio() is the
+ * preferred way to end I/O on a bio, it takes care of clearing
+ * BIO_UPTODATE on error. @error is 0 on success, and and one of the
+ * established -Exxxx (-EIO, for instance) error values in case
+ * something went wrong. No one should call bi_end_io() directly on a
+ * bio unless they own it and thus know that it has an end_io
+ * function.
+ **/
+static inline void bio_endio(struct bio *bio, int error)
+{
+ bio_endio_batch(bio, error, NULL);
+}
+
struct request_queue;
extern int bio_phys_segments(struct request_queue *, struct bio *);
@@ -404,10 +425,6 @@ static inline bool bio_mergeable(struct bio *bio)
* member of the bio. The bio_list also caches the last list member to allow
* fast access to the tail.
*/
-struct bio_list {
- struct bio *head;
- struct bio *tail;
-};
static inline int bio_list_empty(const struct bio_list *bl)
{
@@ -554,6 +571,15 @@ struct biovec_slab {
*/
#define BIO_SPLIT_ENTRIES 2
+static inline void batch_complete_init(struct batch_complete *batch)
+{
+ bio_list_init(&batch->bio);
+ batch->kiocb = RB_ROOT;
+}
+
+void batch_complete(struct batch_complete *batch);
+
+
#if defined(CONFIG_BLK_DEV_INTEGRITY)
#define bip_vec_idx(bip, idx) (&(bip->bip_vec[(idx)]))
@@ -580,7 +606,7 @@ extern int bio_integrity_enabled(struct bio *bio);
extern int bio_integrity_set_tag(struct bio *, void *, unsigned int);
extern int bio_integrity_get_tag(struct bio *, void *, unsigned int);
extern int bio_integrity_prep(struct bio *);
-extern void bio_integrity_endio(struct bio *, int);
+extern void bio_integrity_endio(struct bio *, int, struct batch_complete *);
extern void bio_integrity_advance(struct bio *, unsigned int);
extern void bio_integrity_trim(struct bio *, unsigned int, unsigned int);
extern void bio_integrity_split(struct bio *, struct bio_pair *, int);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index fa1abeb45b76..9d3cafa6bbcd 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -16,7 +16,8 @@ struct page;
struct block_device;
struct io_context;
struct cgroup_subsys_state;
-typedef void (bio_end_io_t) (struct bio *, int);
+struct batch_complete;
+typedef void (bio_end_io_t) (struct bio *, int, struct batch_complete *);
typedef void (bio_destructor_t) (struct bio *);
/*
@@ -42,6 +43,7 @@ struct bio {
* top bits priority
*/
+ short bi_error;
unsigned short bi_vcnt; /* how many bio_vec's */
unsigned short bi_idx; /* current index into bvl_vec */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6189bf26b53d..ff636bd8fde1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -883,7 +883,8 @@ extern struct request *blk_fetch_request(struct request_queue *q);
* This prevents code duplication in drivers.
*/
extern bool blk_update_request(struct request *rq, int error,
- unsigned int nr_bytes);
+ unsigned int nr_bytes,
+ struct batch_complete *batch);
extern bool blk_end_request(struct request *rq, int error,
unsigned int nr_bytes);
extern void blk_end_request_all(struct request *rq, int error);
@@ -891,10 +892,17 @@ extern bool blk_end_request_cur(struct request *rq, int error);
extern bool blk_end_request_err(struct request *rq, int error);
extern bool __blk_end_request(struct request *rq, int error,
unsigned int nr_bytes);
-extern void __blk_end_request_all(struct request *rq, int error);
extern bool __blk_end_request_cur(struct request *rq, int error);
extern bool __blk_end_request_err(struct request *rq, int error);
+extern void blk_end_request_all_batch(struct request *rq, int error,
+ struct batch_complete *batch);
+
+static inline void __blk_end_request_all(struct request *rq, int error)
+{
+ blk_end_request_all_batch(rq, error, NULL);
+}
+
extern void blk_complete_request(struct request *);
extern void __blk_complete_request(struct request *);
extern void blk_abort_request(struct request *);
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 3bff9ce09cf7..5047355b9a0f 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -28,6 +28,7 @@ struct cgroup_subsys;
struct inode;
struct cgroup;
struct css_id;
+struct eventfd_ctx;
extern int cgroup_init_early(void);
extern int cgroup_init(void);
diff --git a/include/linux/decompress/unlz4.h b/include/linux/decompress/unlz4.h
new file mode 100644
index 000000000000..d5b68bf3ec92
--- /dev/null
+++ b/include/linux/decompress/unlz4.h
@@ -0,0 +1,10 @@
+#ifndef DECOMPRESS_UNLZ4_H
+#define DECOMPRESS_UNLZ4_H
+
+int unlz4(unsigned char *inbuf, int len,
+ int(*fill)(void*, unsigned int),
+ int(*flush)(void*, unsigned int),
+ unsigned char *output,
+ int *pos,
+ void(*error)(char *x));
+#endif
diff --git a/include/linux/errno.h b/include/linux/errno.h
index f6bf082d4d4f..89627b9187f9 100644
--- a/include/linux/errno.h
+++ b/include/linux/errno.h
@@ -28,6 +28,5 @@
#define EBADTYPE 527 /* Type not supported by server */
#define EJUKEBOX 528 /* Request initiated, but will not complete before timeout */
#define EIOCBQUEUED 529 /* iocb queued, will get completion event */
-#define EIOCBRETRY 530 /* iocb queued, will trigger a retry */
#endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e8cd6b839675..8d47c9afa2d8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2446,7 +2446,7 @@ enum {
DIO_SKIP_HOLES = 0x02,
};
-void dio_end_io(struct bio *bio, int error);
+void dio_end_io(struct bio *bio, int error, struct batch_complete *batch);
ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
struct block_device *bdev, const struct iovec *iov, loff_t offset,
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index c1d6555d2567..f3cec6856a4b 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -94,6 +94,11 @@
*/
#define in_nmi() (preempt_count() & NMI_MASK)
+/*
+ * Are we in nmi,irq context, or softirq context?
+ */
+#define in_serving_irq() (in_nmi() || in_irq() || in_serving_softirq())
+
#if defined(CONFIG_PREEMPT_COUNT)
# define PREEMPT_CHECK_OFFSET 1
#else
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index f1e877b79ed8..cfc2f119779a 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -365,7 +365,7 @@ extern void lockdep_trace_alloc(gfp_t mask);
#define lockdep_recursing(tsk) ((tsk)->lockdep_recursion)
-#else /* !LOCKDEP */
+#else /* !CONFIG_LOCKDEP */
static inline void lockdep_off(void)
{
@@ -479,82 +479,36 @@ static inline void print_irqtrace_events(struct task_struct *curr)
* on the per lock-class debug mode:
*/
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-# ifdef CONFIG_PROVE_LOCKING
-# define spin_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, NULL, i)
-# define spin_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 2, n, i)
-# else
-# define spin_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, NULL, i)
-# define spin_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, NULL, i)
-# endif
-# define spin_release(l, n, i) lock_release(l, n, i)
+#ifdef CONFIG_PROVE_LOCKING
+ #define lock_acquire_exclusive(l, s, t, n, i) lock_acquire(l, s, t, 0, 2, n, i)
+ #define lock_acquire_shared(l, s, t, n, i) lock_acquire(l, s, t, 1, 2, n, i)
+ #define lock_acquire_shared_recursive(l, s, t, n, i) lock_acquire(l, s, t, 2, 2, n, i)
#else
-# define spin_acquire(l, s, t, i) do { } while (0)
-# define spin_release(l, n, i) do { } while (0)
+ #define lock_acquire_exclusive(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i)
+ #define lock_acquire_shared(l, s, t, n, i) lock_acquire(l, s, t, 1, 1, n, i)
+ #define lock_acquire_shared_recursive(l, s, t, n, i) lock_acquire(l, s, t, 2, 1, n, i)
#endif
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-# ifdef CONFIG_PROVE_LOCKING
-# define rwlock_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, NULL, i)
-# define rwlock_acquire_read(l, s, t, i) lock_acquire(l, s, t, 2, 2, NULL, i)
-# else
-# define rwlock_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, NULL, i)
-# define rwlock_acquire_read(l, s, t, i) lock_acquire(l, s, t, 2, 1, NULL, i)
-# endif
-# define rwlock_release(l, n, i) lock_release(l, n, i)
-#else
-# define rwlock_acquire(l, s, t, i) do { } while (0)
-# define rwlock_acquire_read(l, s, t, i) do { } while (0)
-# define rwlock_release(l, n, i) do { } while (0)
-#endif
+#define spin_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i)
+#define spin_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i)
+#define spin_release(l, n, i) lock_release(l, n, i)
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-# ifdef CONFIG_PROVE_LOCKING
-# define mutex_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, NULL, i)
-# define mutex_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 2, n, i)
-# else
-# define mutex_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, NULL, i)
-# define mutex_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i)
-# endif
-# define mutex_release(l, n, i) lock_release(l, n, i)
-#else
-# define mutex_acquire(l, s, t, i) do { } while (0)
-# define mutex_acquire_nest(l, s, t, n, i) do { } while (0)
-# define mutex_release(l, n, i) do { } while (0)
-#endif
+#define rwlock_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i)
+#define rwlock_acquire_read(l, s, t, i) lock_acquire_shared_recursive(l, s, t, NULL, i)
+#define rwlock_release(l, n, i) lock_release(l, n, i)
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-# ifdef CONFIG_PROVE_LOCKING
-# define rwsem_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, NULL, i)
-# define rwsem_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 2, n, i)
-# define rwsem_acquire_read(l, s, t, i) lock_acquire(l, s, t, 1, 2, NULL, i)
-# else
-# define rwsem_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, NULL, i)
-# define rwsem_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i)
-# define rwsem_acquire_read(l, s, t, i) lock_acquire(l, s, t, 1, 1, NULL, i)
-# endif
+#define mutex_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i)
+#define mutex_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i)
+#define mutex_release(l, n, i) lock_release(l, n, i)
+
+#define rwsem_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i)
+#define rwsem_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i)
+#define rwsem_acquire_read(l, s, t, i) lock_acquire_shared(l, s, t, NULL, i)
# define rwsem_release(l, n, i) lock_release(l, n, i)
-#else
-# define rwsem_acquire(l, s, t, i) do { } while (0)
-# define rwsem_acquire_nest(l, s, t, n, i) do { } while (0)
-# define rwsem_acquire_read(l, s, t, i) do { } while (0)
-# define rwsem_release(l, n, i) do { } while (0)
-#endif
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-# ifdef CONFIG_PROVE_LOCKING
-# define lock_map_acquire(l) lock_acquire(l, 0, 0, 0, 2, NULL, _THIS_IP_)
-# define lock_map_acquire_read(l) lock_acquire(l, 0, 0, 2, 2, NULL, _THIS_IP_)
-# else
-# define lock_map_acquire(l) lock_acquire(l, 0, 0, 0, 1, NULL, _THIS_IP_)
-# define lock_map_acquire_read(l) lock_acquire(l, 0, 0, 2, 1, NULL, _THIS_IP_)
-# endif
+#define lock_map_acquire(l) lock_acquire_exclusive(l, 0, 0, NULL, _THIS_IP_)
+#define lock_map_acquire_read(l) lock_acquire_shared_recursive(l, 0, 0, NULL, _THIS_IP_)
# define lock_map_release(l) lock_release(l, 1, _THIS_IP_)
-#else
-# define lock_map_acquire(l) do { } while (0)
-# define lock_map_acquire_read(l) do { } while (0)
-# define lock_map_release(l) do { } while (0)
-#endif
#ifdef CONFIG_PROVE_LOCKING
# define might_lock(lock) \
diff --git a/include/linux/lz4.h b/include/linux/lz4.h
new file mode 100644
index 000000000000..d21c13f10a64
--- /dev/null
+++ b/include/linux/lz4.h
@@ -0,0 +1,87 @@
+#ifndef __LZ4_H__
+#define __LZ4_H__
+/*
+ * LZ4 Kernel Interface
+ *
+ * Copyright (C) 2013, LG Electronics, Kyungsik Lee <kyungsik.lee@lge.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define LZ4_MEM_COMPRESS (4096 * sizeof(unsigned char *))
+#define LZ4HC_MEM_COMPRESS (65538 * sizeof(unsigned char *))
+
+/*
+ * lz4_compressbound()
+ * Provides the maximum size that LZ4 may output in a "worst case" scenario
+ * (input data not compressible)
+ */
+static inline size_t lz4_compressbound(size_t isize)
+{
+ return isize + (isize / 255) + 16;
+}
+
+/*
+ * lz4_compress()
+ * src : source address of the original data
+ * src_len : size of the original data
+ * dst : output buffer address of the compressed data
+ * This requires 'dst' of size LZ4_COMPRESSBOUND.
+ * dst_len : is the output size, which is returned after compress done
+ * workmem : address of the working memory.
+ * This requires 'workmem' of size LZ4_MEM_COMPRESS.
+ * return : Success if return 0
+ * Error if return (< 0)
+ * note : Destination buffer and workmem must be already allocated with
+ * the defined size.
+ */
+int lz4_compress(const unsigned char *src, size_t src_len,
+ unsigned char *dst, size_t *dst_len, void *wrkmem);
+
+ /*
+ * lz4hc_compress()
+ * src : source address of the original data
+ * src_len : size of the original data
+ * dst : output buffer address of the compressed data
+ * This requires 'dst' of size LZ4_COMPRESSBOUND.
+ * dst_len : is the output size, which is returned after compress done
+ * workmem : address of the working memory.
+ * This requires 'workmem' of size LZ4HC_MEM_COMPRESS.
+ * return : Success if return 0
+ * Error if return (< 0)
+ * note : Destination buffer and workmem must be already allocated with
+ * the defined size.
+ */
+int lz4hc_compress(const unsigned char *src, size_t src_len,
+ unsigned char *dst, size_t *dst_len, void *wrkmem);
+
+/*
+ * lz4_decompress()
+ * src : source address of the compressed data
+ * src_len : is the input size, whcih is returned after decompress done
+ * dest : output buffer address of the decompressed data
+ * actual_dest_len: is the size of uncompressed data, supposing it's known
+ * return : Success if return 0
+ * Error if return (< 0)
+ * note : Destination buffer must be already allocated.
+ * slightly faster than lz4_decompress_unknownoutputsize()
+ */
+int lz4_decompress(const char *src, size_t *src_len, char *dest,
+ size_t actual_dest_len);
+
+/*
+ * lz4_decompress_unknownoutputsize()
+ * src : source address of the compressed data
+ * src_len : is the input size, therefore the compressed size
+ * dest : output buffer address of the decompressed data
+ * dest_len: is the max size of the destination buffer, which is
+ * returned with actual size of decompressed data after
+ * decompress done
+ * return : Success if return 0
+ * Error if return (< 0)
+ * note : Destination buffer must be already allocated.
+ */
+int lz4_decompress_unknownoutputsize(const char *src, size_t src_len,
+ char *dest, size_t *dest_len);
+#endif
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index ace9a5f01c64..fb425aa16c01 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -330,12 +330,9 @@ struct mm_struct {
unsigned long (*get_unmapped_area) (struct file *filp,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags);
- void (*unmap_area) (struct mm_struct *mm, unsigned long addr);
#endif
unsigned long mmap_base; /* base of mmap area */
unsigned long task_size; /* size of task vm space */
- unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */
- unsigned long free_area_cache; /* first hole of size cached_hole_size or larger */
unsigned long highest_vm_end; /* highest vma end address */
pgd_t * pgd;
atomic_t mm_users; /* How many users with user space? */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 5c76737d836b..8c9f859ab558 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -815,7 +815,10 @@ unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long);
/*
* zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc.
*/
-#define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones)
+static inline enum zone_type zone_idx(struct zone *zone)
+{
+ return zone - zone->zone_pgdat->node_zones;
+}
static inline int populated_zone(struct zone *zone)
{
@@ -856,25 +859,18 @@ static inline int is_normal_idx(enum zone_type idx)
*/
static inline int is_highmem(struct zone *zone)
{
-#ifdef CONFIG_HIGHMEM
- int zone_off = (char *)zone - (char *)zone->zone_pgdat->node_zones;
- return zone_off == ZONE_HIGHMEM * sizeof(*zone) ||
- (zone_off == ZONE_MOVABLE * sizeof(*zone) &&
- zone_movable_is_highmem());
-#else
- return 0;
-#endif
+ return is_highmem_idx(zone_idx(zone));
}
static inline int is_normal(struct zone *zone)
{
- return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL;
+ return zone_idx(zone) == ZONE_NORMAL;
}
static inline int is_dma32(struct zone *zone)
{
#ifdef CONFIG_ZONE_DMA32
- return zone == zone->zone_pgdat->node_zones + ZONE_DMA32;
+ return zone_idx(zone) == ZONE_DMA32;
#else
return 0;
#endif
@@ -883,7 +879,7 @@ static inline int is_dma32(struct zone *zone)
static inline int is_dma(struct zone *zone)
{
#ifdef CONFIG_ZONE_DMA
- return zone == zone->zone_pgdat->node_zones + ZONE_DMA;
+ return zone_idx(zone) == ZONE_DMA;
#else
return 0;
#endif
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index e3dea75a078b..8352e0300001 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -187,7 +187,7 @@ static inline int page_cache_add_speculative(struct page *page, int count)
{
VM_BUG_ON(in_interrupt());
-#if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU)
+#ifdef CONFIG_TINY_RCU
# ifdef CONFIG_PREEMPT_COUNT
VM_BUG_ON(!in_atomic());
# endif
diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
new file mode 100644
index 000000000000..d0cf8872dc43
--- /dev/null
+++ b/include/linux/percpu-refcount.h
@@ -0,0 +1,114 @@
+/*
+ * Dynamic percpu refcounts:
+ * (C) 2012 Google, Inc.
+ * Author: Kent Overstreet <koverstreet@google.com>
+ *
+ * This implements a refcount with similar semantics to atomic_t - atomic_inc(),
+ * atomic_dec_and_test() - but potentially percpu.
+ *
+ * There's one important difference between percpu refs and normal atomic_t
+ * refcounts; you have to keep track of your initial refcount, and then when you
+ * start shutting down you call percpu_ref_kill() _before_ dropping the initial
+ * refcount.
+ *
+ * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the
+ * refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill()
+ * puts the ref back in single atomic_t mode, collecting the per cpu refs and
+ * issuing the appropriate barriers, and then marks the ref as shutting down so
+ * that percpu_ref_put() will check for the ref hitting 0. After it returns,
+ * it's safe to drop the initial ref.
+ *
+ * BACKGROUND:
+ *
+ * Percpu refcounts are quite useful for performance, but if we blindly
+ * converted all refcounts to percpu counters we'd waste quite a bit of memory.
+ *
+ * Think about all the refcounts embedded in kobjects, files, etc. most of which
+ * aren't used much. These start out as simple atomic counters - a little bigger
+ * than a bare atomic_t, 16 bytes instead of 4 - but if we exceed some arbitrary
+ * number of gets in one second, we then switch to percpu counters.
+ *
+ * This heuristic isn't perfect because it'll fire if the refcount was only
+ * being used on one cpu; ideally we'd be able to count the number of cache
+ * misses on percpu_ref_get() or something similar, but that'd make the non
+ * percpu path significantly heavier/more complex. We can count the number of
+ * gets() without any extra atomic instructions on arches that support
+ * atomic64_t - simply by changing the atomic_inc() to atomic_add_return().
+ *
+ * USAGE:
+ *
+ * See fs/aio.c for some example usage; it's used there for struct kioctx, which
+ * is created when userspaces calls io_setup(), and destroyed when userspace
+ * calls io_destroy() or the process exits.
+ *
+ * In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it
+ * calls percpu_ref_kill(), then hlist_del_rcu() and sychronize_rcu() to remove
+ * the kioctx from the proccess's list of kioctxs - after that, there can't be
+ * any new users of the kioctx (from lookup_ioctx()) and it's then safe to drop
+ * the initial ref with percpu_ref_put().
+ *
+ * Code that does a two stage shutdown like this often needs some kind of
+ * explicit synchronization to ensure the initial refcount can only be dropped
+ * once - percpu_ref_kill() does this for you, it returns true once and false if
+ * someone else already called it. The aio code uses it this way, but it's not
+ * necessary if the code has some other mechanism to synchronize teardown.
+ *
+ * As mentioned previously, we decide when to convert a ref to percpu counters
+ * in percpu_ref_get(). However, since percpu_ref_get() will often be called
+ * with rcu_read_lock() held, it's not done there - percpu_ref_get() returns
+ * true if the ref should be converted to percpu counters.
+ *
+ * The caller should then call percpu_ref_alloc() after dropping
+ * rcu_read_lock(); if there is an uncommonly used codepath where it's
+ * inconvenient to call percpu_ref_alloc() after get(), it may be safely skipped
+ * and percpu_ref_get() will return true again the next time the counter wraps
+ * around.
+ */
+
+#ifndef _LINUX_PERCPU_REFCOUNT_H
+#define _LINUX_PERCPU_REFCOUNT_H
+
+#include <linux/atomic.h>
+#include <linux/percpu.h>
+
+struct percpu_ref {
+ atomic64_t count;
+ unsigned long pcpu_count;
+};
+
+void percpu_ref_init(struct percpu_ref *ref);
+void __percpu_ref_get(struct percpu_ref *ref, bool alloc);
+int percpu_ref_put(struct percpu_ref *ref);
+
+int percpu_ref_kill(struct percpu_ref *ref);
+int percpu_ref_dead(struct percpu_ref *ref);
+
+/**
+ * percpu_ref_get - increment a dynamic percpu refcount
+ *
+ * Increments @ref and possibly converts it to percpu counters. Must be called
+ * with rcu_read_lock() held, and may potentially drop/reacquire rcu_read_lock()
+ * to allocate percpu counters - if sleeping/allocation isn't safe for some
+ * other reason (e.g. a spinlock), see percpu_ref_get_noalloc().
+ *
+ * Analagous to atomic_inc().
+ */
+static inline void percpu_ref_get(struct percpu_ref *ref)
+{
+ __percpu_ref_get(ref, true);
+}
+
+/**
+ * percpu_ref_get_noalloc - increment a dynamic percpu refcount
+ *
+ * Increments @ref, to be used when it's not safe to allocate percpu counters.
+ * Must be called with rcu_read_lock() held.
+ *
+ * Analagous to atomic_inc().
+ */
+static inline void percpu_ref_get_noalloc(struct percpu_ref *ref)
+{
+ __percpu_ref_get(ref, false);
+}
+
+#endif
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 731e4ecee3bd..e2772666f004 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -4,6 +4,7 @@
#include <linux/sched.h>
#include <linux/bug.h>
#include <linux/mm.h>
+#include <linux/workqueue.h>
#include <linux/threads.h>
#include <linux/nsproxy.h>
#include <linux/kref.h>
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 7794d75ed155..907f3fd191ac 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -7,14 +7,20 @@
#include <linux/timex.h>
#include <linux/alarmtimer.h>
-union cpu_time_count {
- cputime_t cpu;
- unsigned long long sched;
-};
+
+static inline unsigned long long cputime_to_expires(cputime_t expires)
+{
+ return (__force unsigned long long)expires;
+}
+
+static inline cputime_t expires_to_cputime(unsigned long long expires)
+{
+ return (__force cputime_t)expires;
+}
struct cpu_timer_list {
struct list_head entry;
- union cpu_time_count expires, incr;
+ unsigned long long expires, incr;
struct task_struct *task;
int firing;
};
diff --git a/include/linux/random.h b/include/linux/random.h
index 347ce553a306..3b9377d6b7a5 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -29,13 +29,6 @@ u32 prandom_u32(void);
void prandom_bytes(void *buf, int nbytes);
void prandom_seed(u32 seed);
-/*
- * These macros are preserved for backward compatibility and should be
- * removed as soon as a transition is finished.
- */
-#define random32() prandom_u32()
-#define srandom32(seed) prandom_seed(seed)
-
u32 prandom_u32_state(struct rnd_state *);
void prandom_bytes_state(struct rnd_state *state, void *buf, int nbytes);
diff --git a/include/linux/rio.h b/include/linux/rio.h
index a3e784278667..a29a35bb649d 100644
--- a/include/linux/rio.h
+++ b/include/linux/rio.h
@@ -237,6 +237,7 @@ enum rio_phy_type {
* @name: Port name string
* @priv: Master port private data
* @dma: DMA device associated with mport
+ * @nscan: RapidIO network enumeration/discovery operations
*/
struct rio_mport {
struct list_head dbells; /* list of doorbell events */
@@ -262,8 +263,14 @@ struct rio_mport {
#ifdef CONFIG_RAPIDIO_DMA_ENGINE
struct dma_device dma;
#endif
+ struct rio_scan *nscan;
};
+/*
+ * Enumeration/discovery control flags
+ */
+#define RIO_SCAN_ENUM_NO_WAIT 0x00000001 /* Do not wait for enum completed */
+
struct rio_id_table {
u16 start; /* logical minimal id */
u32 max; /* max number of IDs in table */
@@ -460,6 +467,16 @@ static inline struct rio_mport *dma_to_mport(struct dma_device *ddev)
}
#endif /* CONFIG_RAPIDIO_DMA_ENGINE */
+/**
+ * struct rio_scan - RIO enumeration and discovery operations
+ * @enumerate: Callback to perform RapidIO fabric enumeration.
+ * @discover: Callback to perform RapidIO fabric discovery.
+ */
+struct rio_scan {
+ int (*enumerate)(struct rio_mport *mport, u32 flags);
+ int (*discover)(struct rio_mport *mport, u32 flags);
+};
+
/* Architecture and hardware-specific functions */
extern int rio_register_mport(struct rio_mport *);
extern int rio_open_inb_mbox(struct rio_mport *, void *, int, int);
diff --git a/include/linux/rio_drv.h b/include/linux/rio_drv.h
index b75c05920ab5..5059994fe297 100644
--- a/include/linux/rio_drv.h
+++ b/include/linux/rio_drv.h
@@ -433,5 +433,6 @@ extern u16 rio_local_get_device_id(struct rio_mport *port);
extern struct rio_dev *rio_get_device(u16 vid, u16 did, struct rio_dev *from);
extern struct rio_dev *rio_get_asm(u16 vid, u16 did, u16 asm_vid, u16 asm_did,
struct rio_dev *from);
+extern int rio_init_mports(void);
#endif /* LINUX_RIO_DRV_H */
diff --git a/include/linux/rtc-pxa.h b/include/linux/rtc-pxa.h
new file mode 100644
index 000000000000..71bc45f060fc
--- /dev/null
+++ b/include/linux/rtc-pxa.h
@@ -0,0 +1,18 @@
+/*
+ * include/linux/rtc-pxa.h
+ *
+ * RTC PXA Header file
+ *
+ * Copyright (C) 2010 Marvell International Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __LINUX_RTC_PXA_H
+#define __LINUX_RTC_PXA_H
+
+extern int pxa_rtc_sync_time(unsigned int ticks);
+
+#endif /* __LINUX_RTC_PXA_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 535d498958d9..4c72250d1b63 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -313,8 +313,6 @@ extern void schedule_preempt_disabled(void);
struct nsproxy;
struct user_namespace;
-#include <linux/aio.h>
-
#ifdef CONFIG_MMU
extern void arch_pick_mmap_layout(struct mm_struct *mm);
extern unsigned long
@@ -324,8 +322,6 @@ extern unsigned long
arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff,
unsigned long flags);
-extern void arch_unmap_area(struct mm_struct *, unsigned long);
-extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
#else
static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}
#endif
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 6f19cfd1840e..2793607bf6b8 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -47,6 +47,24 @@ static inline int seccomp_mode(struct seccomp *s)
return s->mode;
}
+#ifdef CONFIG_SECCOMP_FILTER_JIT
+extern void seccomp_jit_compile(struct seccomp_filter *fp);
+extern void seccomp_jit_free(struct seccomp_filter *fp);
+
+/*
+ * accessors used by seccomp JIT code to avoid having to declare the
+ * seccomp_filter struct here.
+ */
+unsigned short seccomp_filter_get_len(struct seccomp_filter *fp);
+struct sock_filter *seccomp_filter_get_insns(struct seccomp_filter *fp);
+void seccomp_filter_set_bpf_func(struct seccomp_filter *fp, void *func);
+void *seccomp_filter_get_bpf_func(struct seccomp_filter *fp);
+
+#else
+static inline void seccomp_jit_compile(struct seccomp_filter *fp) { }
+static inline void seccomp_jit_free(struct seccomp_filter *fp) { }
+#endif
+
#else /* CONFIG_SECCOMP */
#include <linux/errno.h>
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 1701ce4be746..ca031f7abee4 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -330,11 +330,14 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t ent)
/* linux/mm/page_io.c */
extern int swap_readpage(struct page *);
extern int swap_writepage(struct page *page, struct writeback_control *wbc);
-extern void end_swap_bio_write(struct bio *bio, int err);
+extern void end_swap_bio_write(struct bio *bio, int err,
+ struct batch_complete *batch);
extern int __swap_writepage(struct page *page, struct writeback_control *wbc,
- void (*end_write_func)(struct bio *, int));
+ void (*end_write_func)(struct bio *bio, int err,
+ struct batch_complete *batch));
extern int swap_set_page_dirty(struct page *page);
-extern void end_swap_bio_read(struct bio *bio, int err);
+extern void end_swap_bio_read(struct bio *bio, int err,
+ struct batch_complete *batch);
int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
unsigned long nr_pages, sector_t start_block);
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index bd6cf61142be..d4b7a184f08c 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -50,7 +50,12 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
COMPACTMIGRATE_SCANNED, COMPACTFREE_SCANNED,
COMPACTISOLATED,
COMPACTSTALL, COMPACTFAIL, COMPACTSUCCESS,
-#endif
+#ifdef CONFIG_BALLOON_COMPACTION
+ COMPACTBALLOONISOLATED, /* isolated from balloon pagelist */
+ COMPACTBALLOONMIGRATED, /* balloon page sucessfully migrated */
+ COMPACTBALLOONRETURNED, /* putback to pagelist, not-migrated */
+#endif /* CONFIG_BALLOON_COMPACTION */
+#endif /* CONFIG_COMPACTION */
#ifdef CONFIG_HUGETLB_PAGE
HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL,
#endif
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 7cb64d4b499d..ac38be2692d8 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -330,6 +330,92 @@ do { \
__ret; \
})
+#define __wait_event_hrtimeout(wq, condition, timeout, state) \
+({ \
+ int __ret = 0; \
+ DEFINE_WAIT(__wait); \
+ struct hrtimer_sleeper __t; \
+ \
+ hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC, \
+ HRTIMER_MODE_REL); \
+ hrtimer_init_sleeper(&__t, current); \
+ if ((timeout).tv64 != KTIME_MAX) \
+ hrtimer_start_range_ns(&__t.timer, timeout, \
+ current->timer_slack_ns, \
+ HRTIMER_MODE_REL); \
+ \
+ for (;;) { \
+ prepare_to_wait(&wq, &__wait, state); \
+ if (condition) \
+ break; \
+ if (state == TASK_INTERRUPTIBLE && \
+ signal_pending(current)) { \
+ __ret = -ERESTARTSYS; \
+ break; \
+ } \
+ if (!__t.task) { \
+ __ret = -ETIME; \
+ break; \
+ } \
+ schedule(); \
+ } \
+ \
+ hrtimer_cancel(&__t.timer); \
+ destroy_hrtimer_on_stack(&__t.timer); \
+ finish_wait(&wq, &__wait); \
+ __ret; \
+})
+
+/**
+ * wait_event_hrtimeout - sleep until a condition gets true or a timeout elapses
+ * @wq: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ * @timeout: timeout, as a ktime_t
+ *
+ * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
+ * @condition evaluates to true or a signal is received.
+ * The @condition is checked each time the waitqueue @wq is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * The function returns 0 if @condition became true, or -ETIME if the timeout
+ * elapsed.
+ */
+#define wait_event_hrtimeout(wq, condition, timeout) \
+({ \
+ int __ret = 0; \
+ if (!(condition)) \
+ __ret = __wait_event_hrtimeout(wq, condition, timeout, \
+ TASK_UNINTERRUPTIBLE); \
+ __ret; \
+})
+
+/**
+ * wait_event_interruptible_hrtimeout - sleep until a condition gets true or a timeout elapses
+ * @wq: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ * @timeout: timeout, as a ktime_t
+ *
+ * The process is put to sleep (TASK_INTERRUPTIBLE) until the
+ * @condition evaluates to true or a signal is received.
+ * The @condition is checked each time the waitqueue @wq is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * The function returns 0 if @condition became true, -ERESTARTSYS if it was
+ * interrupted by a signal, or -ETIME if the timeout elapsed.
+ */
+#define wait_event_interruptible_hrtimeout(wq, condition, timeout) \
+({ \
+ long __ret = 0; \
+ if (!(condition)) \
+ __ret = __wait_event_hrtimeout(wq, condition, timeout, \
+ TASK_INTERRUPTIBLE); \
+ __ret; \
+})
+
#define __wait_event_interruptible_exclusive(wq, condition, ret) \
do { \
DEFINE_WAIT(__wait); \
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 9a9367c0c076..579a5007c696 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -5,6 +5,7 @@
#define WRITEBACK_H
#include <linux/sched.h>
+#include <linux/workqueue.h>
#include <linux/fs.h>
DECLARE_PER_CPU(int, dirty_throttle_leaks);
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 4c062ccff9aa..4405886980c7 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -905,7 +905,7 @@ struct ip_vs_app {
struct ipvs_master_sync_state {
struct list_head sync_queue;
struct ip_vs_sync_buff *sync_buff;
- int sync_queue_len;
+ unsigned long sync_queue_len;
unsigned int sync_queue_delay;
struct task_struct *master_thread;
struct delayed_work master_wakeup_work;
@@ -998,7 +998,7 @@ struct netns_ipvs {
int sysctl_snat_reroute;
int sysctl_sync_ver;
int sysctl_sync_ports;
- int sysctl_sync_qlen_max;
+ unsigned long sysctl_sync_qlen_max;
int sysctl_sync_sock_size;
int sysctl_cache_bypass;
int sysctl_expire_nodest_conn;
@@ -1085,7 +1085,7 @@ static inline int sysctl_sync_ports(struct netns_ipvs *ipvs)
return ACCESS_ONCE(ipvs->sysctl_sync_ports);
}
-static inline int sysctl_sync_qlen_max(struct netns_ipvs *ipvs)
+static inline unsigned long sysctl_sync_qlen_max(struct netns_ipvs *ipvs)
{
return ipvs->sysctl_sync_qlen_max;
}
@@ -1138,7 +1138,7 @@ static inline int sysctl_sync_ports(struct netns_ipvs *ipvs)
return 1;
}
-static inline int sysctl_sync_qlen_max(struct netns_ipvs *ipvs)
+static inline unsigned long sysctl_sync_qlen_max(struct netns_ipvs *ipvs)
{
return IPVS_SYNC_QLEN_MAX;
}
diff --git a/init/Kconfig b/init/Kconfig
index 7597bb9da33a..a28ee42ed2fc 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -98,10 +98,13 @@ config HAVE_KERNEL_XZ
config HAVE_KERNEL_LZO
bool
+config HAVE_KERNEL_LZ4
+ bool
+
choice
prompt "Kernel compression mode"
default KERNEL_GZIP
- depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA || HAVE_KERNEL_XZ || HAVE_KERNEL_LZO
+ depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA || HAVE_KERNEL_XZ || HAVE_KERNEL_LZO || HAVE_KERNEL_LZ4
help
The linux kernel is a kind of self-extracting executable.
Several compression algorithms are available, which differ
@@ -168,6 +171,18 @@ config KERNEL_LZO
size is about 10% bigger than gzip; however its speed
(both compression and decompression) is the fastest.
+config KERNEL_LZ4
+ bool "LZ4"
+ depends on HAVE_KERNEL_LZ4
+ help
+ LZ4 is an LZ77-type compressor with a fixed, byte-oriented encoding.
+ A preliminary version of LZ4 de/compression tool is available at
+ <https://code.google.com/p/lz4/>.
+
+ Its compression ratio is worse than LZO. The size of the kernel
+ is about 8% bigger than LZO. But the decompression speed is
+ faster than LZO.
+
endchoice
config DEFAULT_HOSTNAME
@@ -931,6 +946,23 @@ config MEMCG_KMEM
the kmem extension can use it to guarantee that no group of processes
will ever exhaust kernel resources alone.
+config MEMCG_DEBUG_ASYNC_DESTROY
+ bool "Memory Resource Controller Debug asynchronous object destruction"
+ depends on MEMCG_KMEM || MEMCG_SWAP
+ default n
+ help
+ When a memcg is destroyed, the memory consumed by it may not be
+ immediately freed. This is because when some extensions are used, such
+ as swap or kernel memory, objects can outlive the group and hold a
+ reference to it.
+
+ If this is the case, the dangling_memcgs file will show information
+ about what are the memcgs still alive, and which references are still
+ preventing it to be freed. There is nothing wrong with that, but it is
+ very useful when debugging, to know where this memory is being held.
+ This is a developer-oriented debugging facility only, and no
+ guarantees of interface stability will be given.
+
config CGROUP_HUGETLB
bool "HugeTLB Resource Controller for Control Groups"
depends on RESOURCE_COUNTERS && HUGETLB_PAGE
diff --git a/ipc/msg.c b/ipc/msg.c
index d0c6d967b390..29787212d097 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -827,16 +827,15 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp,
struct ipc_namespace *ns;
struct msg_msg *copy = NULL;
- ns = current->nsproxy->ipc_ns;
-
if (msqid < 0 || (long) bufsz < 0)
return -EINVAL;
if (msgflg & MSG_COPY) {
- copy = prepare_copy(buf, min_t(size_t, bufsz, ns->msg_ctlmax));
+ copy = prepare_copy(buf, bufsz);
if (IS_ERR(copy))
return PTR_ERR(copy);
}
mode = convert_mode(&msgtyp, msgflg);
+ ns = current->nsproxy->ipc_ns;
msq = msg_lock_check(ns, msqid);
if (IS_ERR(msq)) {
diff --git a/ipc/sem.c b/ipc/sem.c
index e78ee3186d1f..9a71b5a7152f 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -61,8 +61,8 @@
* - A woken up task may not even touch the semaphore array anymore, it may
* have been destroyed already by a semctl(RMID).
* - The synchronizations between wake-ups due to a timeout/signal and a
- * wake-up due to a completed semaphore operation is achieved by using an
- * intermediate state (IN_WAKEUP).
+ * wake-up due to a completed semaphore operation is achieved by using a
+ * special wakeup scheme (queuewakeup_wait and support functions)
* - UNDO values are stored in an array (one per process and per
* semaphore array, lazily allocated). For backwards compatibility, multiple
* modes for the UNDO variables are supported (per process, per thread)
@@ -90,6 +90,135 @@
#include <asm/uaccess.h>
#include "util.h"
+
+#ifdef CONFIG_PREEMPT_RT_BASE
+ #define SYSVSEM_COMPLETION 1
+#else
+ #define SYSVSEM_CUSTOM 1
+#endif
+
+#ifdef SYSVSEM_COMPLETION
+ /* Using a completion causes some overhead, but avoids a busy loop
+ * that increases the worst case latency.
+ */
+ struct queue_done {
+ struct completion done;
+ };
+
+ static void queuewakeup_prepare(void)
+ {
+ /* no preparation necessary */
+ }
+
+ static void queuewakeup_completed(void)
+ {
+ /* empty */
+ }
+
+ static void queuewakeup_block(struct queue_done *qd)
+ {
+ /* empty */
+ }
+
+ static void queuewakeup_handsoff(struct queue_done *qd)
+ {
+ complete_all(&qd->done);
+ }
+
+ static void queuewakeup_init(struct queue_done *qd)
+ {
+ init_completion(&qd->done);
+ }
+
+ static void queuewakeup_wait(struct queue_done *qd)
+ {
+ wait_for_completion(&qd->done);
+ }
+
+#elif defined(SYSVSEM_SPINLOCK)
+ /* Note: Spinlocks do not work because:
+ * - lockdep complains [could be fixed]
+ * - only 255 concurrent spin_lock() calls are permitted, then the
+ * preempt-counter overflows
+ */
+#error SYSVSEM_SPINLOCK is a prove of concept, does not work.
+ struct queue_done {
+ spinlock_t done;
+ };
+
+ static void queuewakeup_prepare(void)
+ {
+ /* empty */
+ }
+
+ static void queuewakeup_completed(void)
+ {
+ /* empty */
+ }
+
+ static void queuewakeup_block(struct queue_done *qd)
+ {
+ BUG_ON(spin_is_locked(&qd->done));
+ spin_lock(&qd->done);
+ }
+
+ static void queuewakeup_handsoff(struct queue_done *qd)
+ {
+ spin_unlock(&qd->done);
+ }
+
+ static void queuewakeup_init(struct queue_done *qd)
+ {
+ spin_lock_init(&qd->done);
+ }
+
+ static void queuewakeup_wait(struct queue_done *qd)
+ {
+ spin_unlock_wait(&qd->done);
+ }
+#else
+ struct queue_done {
+ atomic_t done;
+ };
+
+ static void queuewakeup_prepare(void)
+ {
+ preempt_disable();
+ }
+
+ static void queuewakeup_completed(void)
+ {
+ preempt_enable();
+ }
+
+ static void queuewakeup_block(struct queue_done *qd)
+ {
+ BUG_ON(atomic_read(&qd->done) != 1);
+ atomic_set(&qd->done, 2);
+ }
+
+ static void queuewakeup_handsoff(struct queue_done *qd)
+ {
+ BUG_ON(atomic_read(&qd->done) != 2);
+ smp_mb();
+ atomic_set(&qd->done, 1);
+ }
+
+ static void queuewakeup_init(struct queue_done *qd)
+ {
+ atomic_set(&qd->done, 1);
+ }
+
+ static void queuewakeup_wait(struct queue_done *qd)
+ {
+ while (atomic_read(&qd->done) != 1)
+ cpu_relax();
+
+ smp_mb();
+ }
+#endif
+
+
/* One semaphore structure for each semaphore in the system. */
struct sem {
int semval; /* current value */
@@ -108,6 +237,7 @@ struct sem_queue {
struct sembuf *sops; /* array of pending operations */
int nsops; /* number of operations */
int alter; /* does *sops alter the array? */
+ struct queue_done done; /* completion synchronization */
};
/* Each task has a list of undo requests. They are executed automatically
@@ -361,23 +491,27 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s)
* - queue.status is initialized to -EINTR before blocking.
* - wakeup is performed by
* * unlinking the queue entry from sma->sem_pending
- * * setting queue.status to IN_WAKEUP
- * This is the notification for the blocked thread that a
- * result value is imminent.
+ * * setting queue.status to the actual result code
+ * This is the notification for the blocked thread that someone
+ * (usually: update_queue()) completed the semtimedop() operation.
* * call wake_up_process
- * * set queue.status to the final value.
+ * * queuewakeup_handsoff(&q->done);
* - the previously blocked thread checks queue.status:
- * * if it's IN_WAKEUP, then it must wait until the value changes
- * * if it's not -EINTR, then the operation was completed by
- * update_queue. semtimedop can return queue.status without
- * performing any operation on the sem array.
- * * otherwise it must acquire the spinlock and check what's up.
+ * * if it's not -EINTR, then someone completed the operation.
+ * First, queuewakeup_wait() must be called. Afterwards,
+ * semtimedop must return queue.status without performing any
+ * operation on the sem array.
+ * - otherwise it must acquire the spinlock and repeat the test
+ * - If it is still -EINTR, then no update_queue() completed the
+ * operation, thus semtimedop() can proceed normally.
*
- * The two-stage algorithm is necessary to protect against the following
+ * queuewakeup_wait() is necessary to protect against the following
* races:
* - if queue.status is set after wake_up_process, then the woken up idle
* thread could race forward and try (and fail) to acquire sma->lock
- * before update_queue had a chance to set queue.status
+ * before update_queue had a chance to set queue.status.
+ * More importantly, it would mean that wake_up_process must be done
+ * while holding sma->lock, i.e. this would reduce the scalability.
* - if queue.status is written before wake_up_process and if the
* blocked process is woken up by a signal between writing
* queue.status and the wake_up_process, then the woken up
@@ -387,7 +521,6 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s)
* (yes, this happened on s390 with sysv msg).
*
*/
-#define IN_WAKEUP 1
/**
* newary - Create a new semaphore set
@@ -579,15 +712,11 @@ undo:
static void wake_up_sem_queue_prepare(struct list_head *pt,
struct sem_queue *q, int error)
{
- if (list_empty(pt)) {
- /*
- * Hold preempt off so that we don't get preempted and have the
- * wakee busy-wait until we're scheduled back on.
- */
- preempt_disable();
- }
- q->status = IN_WAKEUP;
- q->pid = error;
+ if (list_empty(pt))
+ queuewakeup_prepare();
+
+ queuewakeup_block(&q->done);
+ q->status = error;
list_add_tail(&q->list, pt);
}
@@ -598,8 +727,8 @@ static void wake_up_sem_queue_prepare(struct list_head *pt,
*
* Do the actual wake-up.
* The function is called without any locks held, thus the semaphore array
- * could be destroyed already and the tasks can disappear as soon as the
- * status is set to the actual return code.
+ * could be destroyed already and the tasks can disappear as soon as
+ * queuewakeup_handsoff() is called.
*/
static void wake_up_sem_queue_do(struct list_head *pt)
{
@@ -609,12 +738,11 @@ static void wake_up_sem_queue_do(struct list_head *pt)
did_something = !list_empty(pt);
list_for_each_entry_safe(q, t, pt, list) {
wake_up_process(q->sleeper);
- /* q can disappear immediately after writing q->status. */
- smp_wmb();
- q->status = q->pid;
+ /* q can disappear immediately after completing q->done */
+ queuewakeup_handsoff(&q->done);
}
if (did_something)
- preempt_enable();
+ queuewakeup_completed();
}
static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
@@ -1491,33 +1619,6 @@ out:
return un;
}
-
-/**
- * get_queue_result - Retrieve the result code from sem_queue
- * @q: Pointer to queue structure
- *
- * Retrieve the return code from the pending queue. If IN_WAKEUP is found in
- * q->status, then we must loop until the value is replaced with the final
- * value: This may happen if a task is woken up by an unrelated event (e.g.
- * signal) and in parallel the task is woken up by another task because it got
- * the requested semaphores.
- *
- * The function can be called with or without holding the semaphore spinlock.
- */
-static int get_queue_result(struct sem_queue *q)
-{
- int error;
-
- error = q->status;
- while (unlikely(error == IN_WAKEUP)) {
- cpu_relax();
- error = q->status;
- }
-
- return error;
-}
-
-
SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
unsigned, nsops, const struct timespec __user *, timeout)
{
@@ -1657,6 +1758,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
queue.status = -EINTR;
queue.sleeper = current;
+ queuewakeup_init(&queue.done);
sleep_again:
current->state = TASK_INTERRUPTIBLE;
@@ -1667,17 +1769,14 @@ sleep_again:
else
schedule();
- error = get_queue_result(&queue);
+ error = queue.status;
if (error != -EINTR) {
/* fast path: update_queue already obtained all requested
- * resources.
- * Perform a smp_mb(): User space could assume that semop()
- * is a memory barrier: Without the mb(), the cpu could
- * speculatively read in user space stale data that was
- * overwritten by the previous owner of the semaphore.
+ * resources. Just ensure that update_queue completed
+ * it's access to &queue.
*/
- smp_mb();
+ queuewakeup_wait(&queue.done);
goto out_free;
}
@@ -1687,23 +1786,16 @@ sleep_again:
/*
* Wait until it's guaranteed that no wakeup_sem_queue_do() is ongoing.
*/
- error = get_queue_result(&queue);
-
- /*
- * Array removed? If yes, leave without sem_unlock().
- */
- if (IS_ERR(sma)) {
- goto out_free;
- }
-
-
- /*
- * If queue.status != -EINTR we are woken up by another process.
- * Leave without unlink_queue(), but with sem_unlock().
- */
-
+ error = queue.status;
if (error != -EINTR) {
- goto out_unlock_free;
+ /* If there is a return code, then we can leave immediately. */
+ if (!IS_ERR(sma)) {
+ /* sem_lock() succeeded - then unlock */
+ sem_unlock(sma, locknum);
+ }
+ /* Except that we must wait for the hands-off */
+ queuewakeup_wait(&queue.done);
+ goto out_free;
}
/*
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index a291aa23fb3f..9fd17f057ce5 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -621,6 +621,7 @@ void audit_trim_trees(void)
skip_it:
put_tree(tree);
mutex_lock(&audit_filter_mutex);
+ put_tree(tree);
}
list_del(&cursor);
mutex_unlock(&audit_filter_mutex);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 267436826c3b..b93206c29243 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -423,6 +423,10 @@ exit_nofree:
return entry;
exit_free:
+ if (entry->rule.watch)
+ audit_put_watch(entry->rule.watch); /* matches initial get */
+ if (entry->rule.tree)
+ audit_put_tree(entry->rule.tree); /* that's the temporary one */
audit_free_rule(entry);
return ERR_PTR(err);
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 0519c9b3261c..6fcc2e24561e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -70,6 +70,7 @@
#include <linux/khugepaged.h>
#include <linux/signalfd.h>
#include <linux/uprobes.h>
+#include <linux/aio.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -364,8 +365,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
mm->locked_vm = 0;
mm->mmap = NULL;
mm->mmap_cache = NULL;
- mm->free_area_cache = oldmm->mmap_base;
- mm->cached_hole_size = ~0UL;
mm->map_count = 0;
cpumask_clear(mm_cpumask(mm));
mm->mm_rb = RB_ROOT;
@@ -539,8 +538,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
mm->nr_ptes = 0;
memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
spin_lock_init(&mm->page_table_lock);
- mm->free_area_cache = TASK_UNMAPPED_BASE;
- mm->cached_hole_size = ~0UL;
mm_init_aio(mm);
mm_init_owner(mm, p);
diff --git a/kernel/lglock.c b/kernel/lglock.c
index 6535a667a5a7..86ae2aebf004 100644
--- a/kernel/lglock.c
+++ b/kernel/lglock.c
@@ -21,7 +21,7 @@ void lg_local_lock(struct lglock *lg)
arch_spinlock_t *lock;
preempt_disable();
- rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_);
+ lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
lock = this_cpu_ptr(lg->lock);
arch_spin_lock(lock);
}
@@ -31,7 +31,7 @@ void lg_local_unlock(struct lglock *lg)
{
arch_spinlock_t *lock;
- rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
+ lock_release(&lg->lock_dep_map, 1, _RET_IP_);
lock = this_cpu_ptr(lg->lock);
arch_spin_unlock(lock);
preempt_enable();
@@ -43,7 +43,7 @@ void lg_local_lock_cpu(struct lglock *lg, int cpu)
arch_spinlock_t *lock;
preempt_disable();
- rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_);
+ lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
lock = per_cpu_ptr(lg->lock, cpu);
arch_spin_lock(lock);
}
@@ -53,7 +53,7 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu)
{
arch_spinlock_t *lock;
- rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
+ lock_release(&lg->lock_dep_map, 1, _RET_IP_);
lock = per_cpu_ptr(lg->lock, cpu);
arch_spin_unlock(lock);
preempt_enable();
@@ -65,7 +65,7 @@ void lg_global_lock(struct lglock *lg)
int i;
preempt_disable();
- rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_);
+ lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
for_each_possible_cpu(i) {
arch_spinlock_t *lock;
lock = per_cpu_ptr(lg->lock, i);
@@ -78,7 +78,7 @@ void lg_global_unlock(struct lglock *lg)
{
int i;
- rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
+ lock_release(&lg->lock_dep_map, 1, _RET_IP_);
for_each_possible_cpu(i) {
arch_spinlock_t *lock;
lock = per_cpu_ptr(lg->lock, i);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 42670e9b44e0..c7f31aa272f7 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -51,59 +51,28 @@ static int check_clock(const clockid_t which_clock)
return error;
}
-static inline union cpu_time_count
+static inline unsigned long long
timespec_to_sample(const clockid_t which_clock, const struct timespec *tp)
{
- union cpu_time_count ret;
- ret.sched = 0; /* high half always zero when .cpu used */
+ unsigned long long ret;
+
+ ret = 0; /* high half always zero when .cpu used */
if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
- ret.sched = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;
+ ret = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;
} else {
- ret.cpu = timespec_to_cputime(tp);
+ ret = cputime_to_expires(timespec_to_cputime(tp));
}
return ret;
}
static void sample_to_timespec(const clockid_t which_clock,
- union cpu_time_count cpu,
+ unsigned long long expires,
struct timespec *tp)
{
if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED)
- *tp = ns_to_timespec(cpu.sched);
+ *tp = ns_to_timespec(expires);
else
- cputime_to_timespec(cpu.cpu, tp);
-}
-
-static inline int cpu_time_before(const clockid_t which_clock,
- union cpu_time_count now,
- union cpu_time_count then)
-{
- if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
- return now.sched < then.sched;
- } else {
- return now.cpu < then.cpu;
- }
-}
-static inline void cpu_time_add(const clockid_t which_clock,
- union cpu_time_count *acc,
- union cpu_time_count val)
-{
- if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
- acc->sched += val.sched;
- } else {
- acc->cpu += val.cpu;
- }
-}
-static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
- union cpu_time_count a,
- union cpu_time_count b)
-{
- if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
- a.sched -= b.sched;
- } else {
- a.cpu -= b.cpu;
- }
- return a;
+ cputime_to_timespec((__force cputime_t)expires, tp);
}
/*
@@ -111,47 +80,31 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
* given the current clock sample.
*/
static void bump_cpu_timer(struct k_itimer *timer,
- union cpu_time_count now)
+ unsigned long long now)
{
int i;
+ unsigned long long delta, incr;
- if (timer->it.cpu.incr.sched == 0)
+ if (timer->it.cpu.incr == 0)
return;
- if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) {
- unsigned long long delta, incr;
+ if (now < timer->it.cpu.expires)
+ return;
- if (now.sched < timer->it.cpu.expires.sched)
- return;
- incr = timer->it.cpu.incr.sched;
- delta = now.sched + incr - timer->it.cpu.expires.sched;
- /* Don't use (incr*2 < delta), incr*2 might overflow. */
- for (i = 0; incr < delta - incr; i++)
- incr = incr << 1;
- for (; i >= 0; incr >>= 1, i--) {
- if (delta < incr)
- continue;
- timer->it.cpu.expires.sched += incr;
- timer->it_overrun += 1 << i;
- delta -= incr;
- }
- } else {
- cputime_t delta, incr;
+ incr = timer->it.cpu.incr;
+ delta = now + incr - timer->it.cpu.expires;
- if (now.cpu < timer->it.cpu.expires.cpu)
- return;
- incr = timer->it.cpu.incr.cpu;
- delta = now.cpu + incr - timer->it.cpu.expires.cpu;
- /* Don't use (incr*2 < delta), incr*2 might overflow. */
- for (i = 0; incr < delta - incr; i++)
- incr += incr;
- for (; i >= 0; incr = incr >> 1, i--) {
- if (delta < incr)
- continue;
- timer->it.cpu.expires.cpu += incr;
- timer->it_overrun += 1 << i;
- delta -= incr;
- }
+ /* Don't use (incr*2 < delta), incr*2 might overflow. */
+ for (i = 0; incr < delta - incr; i++)
+ incr = incr << 1;
+
+ for (; i >= 0; incr >>= 1, i--) {
+ if (delta < incr)
+ continue;
+
+ timer->it.cpu.expires += incr;
+ timer->it_overrun += 1 << i;
+ delta -= incr;
}
}
@@ -170,21 +123,21 @@ static inline int task_cputime_zero(const struct task_cputime *cputime)
return 0;
}
-static inline cputime_t prof_ticks(struct task_struct *p)
+static inline unsigned long long prof_ticks(struct task_struct *p)
{
cputime_t utime, stime;
task_cputime(p, &utime, &stime);
- return utime + stime;
+ return cputime_to_expires(utime + stime);
}
-static inline cputime_t virt_ticks(struct task_struct *p)
+static inline unsigned long long virt_ticks(struct task_struct *p)
{
cputime_t utime;
task_cputime(p, &utime, NULL);
- return utime;
+ return cputime_to_expires(utime);
}
static int
@@ -225,19 +178,19 @@ posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
* Sample a per-thread clock for the given task.
*/
static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
- union cpu_time_count *cpu)
+ unsigned long long *sample)
{
switch (CPUCLOCK_WHICH(which_clock)) {
default:
return -EINVAL;
case CPUCLOCK_PROF:
- cpu->cpu = prof_ticks(p);
+ *sample = prof_ticks(p);
break;
case CPUCLOCK_VIRT:
- cpu->cpu = virt_ticks(p);
+ *sample = virt_ticks(p);
break;
case CPUCLOCK_SCHED:
- cpu->sched = task_sched_runtime(p);
+ *sample = task_sched_runtime(p);
break;
}
return 0;
@@ -284,7 +237,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
*/
static int cpu_clock_sample_group(const clockid_t which_clock,
struct task_struct *p,
- union cpu_time_count *cpu)
+ unsigned long long *sample)
{
struct task_cputime cputime;
@@ -293,15 +246,15 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
return -EINVAL;
case CPUCLOCK_PROF:
thread_group_cputime(p, &cputime);
- cpu->cpu = cputime.utime + cputime.stime;
+ *sample = cputime_to_expires(cputime.utime + cputime.stime);
break;
case CPUCLOCK_VIRT:
thread_group_cputime(p, &cputime);
- cpu->cpu = cputime.utime;
+ *sample = cputime_to_expires(cputime.utime);
break;
case CPUCLOCK_SCHED:
thread_group_cputime(p, &cputime);
- cpu->sched = cputime.sum_exec_runtime;
+ *sample = cputime.sum_exec_runtime;
break;
}
return 0;
@@ -312,7 +265,7 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
{
const pid_t pid = CPUCLOCK_PID(which_clock);
int error = -EINVAL;
- union cpu_time_count rtn;
+ unsigned long long rtn;
if (pid == 0) {
/*
@@ -446,6 +399,15 @@ static int posix_cpu_timer_del(struct k_itimer *timer)
return ret;
}
+static void cleanup_timers_list(struct list_head *head,
+ unsigned long long curr)
+{
+ struct cpu_timer_list *timer, *next;
+
+ list_for_each_entry_safe(timer, next, head, entry)
+ list_del_init(&timer->entry);
+}
+
/*
* Clean out CPU timers still ticking when a thread exited. The task
* pointer is cleared, and the expiry time is replaced with the residual
@@ -456,37 +418,12 @@ static void cleanup_timers(struct list_head *head,
cputime_t utime, cputime_t stime,
unsigned long long sum_exec_runtime)
{
- struct cpu_timer_list *timer, *next;
- cputime_t ptime = utime + stime;
-
- list_for_each_entry_safe(timer, next, head, entry) {
- list_del_init(&timer->entry);
- if (timer->expires.cpu < ptime) {
- timer->expires.cpu = 0;
- } else {
- timer->expires.cpu -= ptime;
- }
- }
- ++head;
- list_for_each_entry_safe(timer, next, head, entry) {
- list_del_init(&timer->entry);
- if (timer->expires.cpu < utime) {
- timer->expires.cpu = 0;
- } else {
- timer->expires.cpu -= utime;
- }
- }
+ cputime_t ptime = utime + stime;
- ++head;
- list_for_each_entry_safe(timer, next, head, entry) {
- list_del_init(&timer->entry);
- if (timer->expires.sched < sum_exec_runtime) {
- timer->expires.sched = 0;
- } else {
- timer->expires.sched -= sum_exec_runtime;
- }
- }
+ cleanup_timers_list(head, cputime_to_expires(ptime));
+ cleanup_timers_list(++head, cputime_to_expires(utime));
+ cleanup_timers_list(++head, sum_exec_runtime);
}
/*
@@ -516,17 +453,21 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
}
-static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
+static void clear_dead_task(struct k_itimer *itimer, unsigned long long now)
{
+ struct cpu_timer_list *timer = &itimer->it.cpu;
+
/*
* That's all for this thread or process.
* We leave our residual in expires to be reported.
*/
- put_task_struct(timer->it.cpu.task);
- timer->it.cpu.task = NULL;
- timer->it.cpu.expires = cpu_time_sub(timer->it_clock,
- timer->it.cpu.expires,
- now);
+ put_task_struct(timer->task);
+ timer->task = NULL;
+ if (timer->expires < now) {
+ timer->expires = 0;
+ } else {
+ timer->expires -= now;
+ }
}
static inline int expires_gt(cputime_t expires, cputime_t new_exp)
@@ -558,14 +499,14 @@ static void arm_timer(struct k_itimer *timer)
listpos = head;
list_for_each_entry(next, head, entry) {
- if (cpu_time_before(timer->it_clock, nt->expires, next->expires))
+ if (nt->expires < next->expires)
break;
listpos = &next->entry;
}
list_add(&nt->entry, listpos);
if (listpos == head) {
- union cpu_time_count *exp = &nt->expires;
+ unsigned long long exp = nt->expires;
/*
* We are the new earliest-expiring POSIX 1.b timer, hence
@@ -576,17 +517,17 @@ static void arm_timer(struct k_itimer *timer)
switch (CPUCLOCK_WHICH(timer->it_clock)) {
case CPUCLOCK_PROF:
- if (expires_gt(cputime_expires->prof_exp, exp->cpu))
- cputime_expires->prof_exp = exp->cpu;
+ if (expires_gt(cputime_expires->prof_exp, expires_to_cputime(exp)))
+ cputime_expires->prof_exp = expires_to_cputime(exp);
break;
case CPUCLOCK_VIRT:
- if (expires_gt(cputime_expires->virt_exp, exp->cpu))
- cputime_expires->virt_exp = exp->cpu;
+ if (expires_gt(cputime_expires->virt_exp, expires_to_cputime(exp)))
+ cputime_expires->virt_exp = expires_to_cputime(exp);
break;
case CPUCLOCK_SCHED:
if (cputime_expires->sched_exp == 0 ||
- cputime_expires->sched_exp > exp->sched)
- cputime_expires->sched_exp = exp->sched;
+ cputime_expires->sched_exp > exp)
+ cputime_expires->sched_exp = exp;
break;
}
}
@@ -601,20 +542,20 @@ static void cpu_timer_fire(struct k_itimer *timer)
/*
* User don't want any signal.
*/
- timer->it.cpu.expires.sched = 0;
+ timer->it.cpu.expires = 0;
} else if (unlikely(timer->sigq == NULL)) {
/*
* This a special case for clock_nanosleep,
* not a normal timer from sys_timer_create.
*/
wake_up_process(timer->it_process);
- timer->it.cpu.expires.sched = 0;
- } else if (timer->it.cpu.incr.sched == 0) {
+ timer->it.cpu.expires = 0;
+ } else if (timer->it.cpu.incr == 0) {
/*
* One-shot timer. Clear it as soon as it's fired.
*/
posix_timer_event(timer, 0);
- timer->it.cpu.expires.sched = 0;
+ timer->it.cpu.expires = 0;
} else if (posix_timer_event(timer, ++timer->it_requeue_pending)) {
/*
* The signal did not get queued because the signal
@@ -632,7 +573,7 @@ static void cpu_timer_fire(struct k_itimer *timer)
*/
static int cpu_timer_sample_group(const clockid_t which_clock,
struct task_struct *p,
- union cpu_time_count *cpu)
+ unsigned long long *sample)
{
struct task_cputime cputime;
@@ -641,13 +582,13 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
default:
return -EINVAL;
case CPUCLOCK_PROF:
- cpu->cpu = cputime.utime + cputime.stime;
+ *sample = cputime_to_expires(cputime.utime + cputime.stime);
break;
case CPUCLOCK_VIRT:
- cpu->cpu = cputime.utime;
+ *sample = cputime_to_expires(cputime.utime);
break;
case CPUCLOCK_SCHED:
- cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
+ *sample = cputime.sum_exec_runtime + task_delta_exec(p);
break;
}
return 0;
@@ -694,7 +635,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
struct itimerspec *new, struct itimerspec *old)
{
struct task_struct *p = timer->it.cpu.task;
- union cpu_time_count old_expires, new_expires, old_incr, val;
+ unsigned long long old_expires, new_expires, old_incr, val;
int ret;
if (unlikely(p == NULL)) {
@@ -749,7 +690,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
}
if (old) {
- if (old_expires.sched == 0) {
+ if (old_expires == 0) {
old->it_value.tv_sec = 0;
old->it_value.tv_nsec = 0;
} else {
@@ -764,11 +705,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
* new setting.
*/
bump_cpu_timer(timer, val);
- if (cpu_time_before(timer->it_clock, val,
- timer->it.cpu.expires)) {
- old_expires = cpu_time_sub(
- timer->it_clock,
- timer->it.cpu.expires, val);
+ if (val < timer->it.cpu.expires) {
+ old_expires = timer->it.cpu.expires - val;
sample_to_timespec(timer->it_clock,
old_expires,
&old->it_value);
@@ -791,8 +729,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
goto out;
}
- if (new_expires.sched != 0 && !(flags & TIMER_ABSTIME)) {
- cpu_time_add(timer->it_clock, &new_expires, val);
+ if (new_expires != 0 && !(flags & TIMER_ABSTIME)) {
+ new_expires += val;
}
/*
@@ -801,8 +739,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
* arm the timer (we'll just fake it for timer_gettime).
*/
timer->it.cpu.expires = new_expires;
- if (new_expires.sched != 0 &&
- cpu_time_before(timer->it_clock, val, new_expires)) {
+ if (new_expires != 0 && val < new_expires) {
arm_timer(timer);
}
@@ -826,8 +763,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
timer->it_overrun_last = 0;
timer->it_overrun = -1;
- if (new_expires.sched != 0 &&
- !cpu_time_before(timer->it_clock, val, new_expires)) {
+ if (new_expires != 0 && !(val < new_expires)) {
/*
* The designated time already passed, so we notify
* immediately, even if the thread never runs to
@@ -849,7 +785,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
{
- union cpu_time_count now;
+ unsigned long long now;
struct task_struct *p = timer->it.cpu.task;
int clear_dead;
@@ -859,7 +795,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
sample_to_timespec(timer->it_clock,
timer->it.cpu.incr, &itp->it_interval);
- if (timer->it.cpu.expires.sched == 0) { /* Timer not armed at all. */
+ if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */
itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
return;
}
@@ -891,7 +827,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
*/
put_task_struct(p);
timer->it.cpu.task = NULL;
- timer->it.cpu.expires.sched = 0;
+ timer->it.cpu.expires = 0;
read_unlock(&tasklist_lock);
goto dead;
} else {
@@ -912,10 +848,9 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
goto dead;
}
- if (cpu_time_before(timer->it_clock, now, timer->it.cpu.expires)) {
+ if (now < timer->it.cpu.expires) {
sample_to_timespec(timer->it_clock,
- cpu_time_sub(timer->it_clock,
- timer->it.cpu.expires, now),
+ timer->it.cpu.expires - now,
&itp->it_value);
} else {
/*
@@ -927,6 +862,28 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
}
}
+static unsigned long long
+check_timers_list(struct list_head *timers,
+ struct list_head *firing,
+ unsigned long long curr)
+{
+ int maxfire = 20;
+
+ while (!list_empty(timers)) {
+ struct cpu_timer_list *t;
+
+ t = list_first_entry(timers, struct cpu_timer_list, entry);
+
+ if (!--maxfire || curr < t->expires)
+ return t->expires;
+
+ t->firing = 1;
+ list_move_tail(&t->entry, firing);
+ }
+
+ return 0;
+}
+
/*
* Check for any per-thread CPU timers that have fired and move them off
* the tsk->cpu_timers[N] list onto the firing list. Here we update the
@@ -935,54 +892,20 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
static void check_thread_timers(struct task_struct *tsk,
struct list_head *firing)
{
- int maxfire;
struct list_head *timers = tsk->cpu_timers;
struct signal_struct *const sig = tsk->signal;
+ struct task_cputime *tsk_expires = &tsk->cputime_expires;
+ unsigned long long expires;
unsigned long soft;
- maxfire = 20;
- tsk->cputime_expires.prof_exp = 0;
- while (!list_empty(timers)) {
- struct cpu_timer_list *t = list_first_entry(timers,
- struct cpu_timer_list,
- entry);
- if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) {
- tsk->cputime_expires.prof_exp = t->expires.cpu;
- break;
- }
- t->firing = 1;
- list_move_tail(&t->entry, firing);
- }
+ expires = check_timers_list(timers, firing, prof_ticks(tsk));
+ tsk_expires->prof_exp = expires_to_cputime(expires);
- ++timers;
- maxfire = 20;
- tsk->cputime_expires.virt_exp = 0;
- while (!list_empty(timers)) {
- struct cpu_timer_list *t = list_first_entry(timers,
- struct cpu_timer_list,
- entry);
- if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) {
- tsk->cputime_expires.virt_exp = t->expires.cpu;
- break;
- }
- t->firing = 1;
- list_move_tail(&t->entry, firing);
- }
+ expires = check_timers_list(++timers, firing, virt_ticks(tsk));
+ tsk_expires->virt_exp = expires_to_cputime(expires);
- ++timers;
- maxfire = 20;
- tsk->cputime_expires.sched_exp = 0;
- while (!list_empty(timers)) {
- struct cpu_timer_list *t = list_first_entry(timers,
- struct cpu_timer_list,
- entry);
- if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
- tsk->cputime_expires.sched_exp = t->expires.sched;
- break;
- }
- t->firing = 1;
- list_move_tail(&t->entry, firing);
- }
+ tsk_expires->sched_exp = check_timers_list(++timers, firing,
+ tsk->se.sum_exec_runtime);
/*
* Check for the special case thread timers.
@@ -1030,7 +953,8 @@ static void stop_process_timers(struct signal_struct *sig)
static u32 onecputick;
static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
- cputime_t *expires, cputime_t cur_time, int signo)
+ unsigned long long *expires,
+ unsigned long long cur_time, int signo)
{
if (!it->expires)
return;
@@ -1066,9 +990,8 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
static void check_process_timers(struct task_struct *tsk,
struct list_head *firing)
{
- int maxfire;
struct signal_struct *const sig = tsk->signal;
- cputime_t utime, ptime, virt_expires, prof_expires;
+ unsigned long long utime, ptime, virt_expires, prof_expires;
unsigned long long sum_sched_runtime, sched_expires;
struct list_head *timers = sig->cpu_timers;
struct task_cputime cputime;
@@ -1078,52 +1001,13 @@ static void check_process_timers(struct task_struct *tsk,
* Collect the current process totals.
*/
thread_group_cputimer(tsk, &cputime);
- utime = cputime.utime;
- ptime = utime + cputime.stime;
+ utime = cputime_to_expires(cputime.utime);
+ ptime = utime + cputime_to_expires(cputime.stime);
sum_sched_runtime = cputime.sum_exec_runtime;
- maxfire = 20;
- prof_expires = 0;
- while (!list_empty(timers)) {
- struct cpu_timer_list *tl = list_first_entry(timers,
- struct cpu_timer_list,
- entry);
- if (!--maxfire || ptime < tl->expires.cpu) {
- prof_expires = tl->expires.cpu;
- break;
- }
- tl->firing = 1;
- list_move_tail(&tl->entry, firing);
- }
- ++timers;
- maxfire = 20;
- virt_expires = 0;
- while (!list_empty(timers)) {
- struct cpu_timer_list *tl = list_first_entry(timers,
- struct cpu_timer_list,
- entry);
- if (!--maxfire || utime < tl->expires.cpu) {
- virt_expires = tl->expires.cpu;
- break;
- }
- tl->firing = 1;
- list_move_tail(&tl->entry, firing);
- }
-
- ++timers;
- maxfire = 20;
- sched_expires = 0;
- while (!list_empty(timers)) {
- struct cpu_timer_list *tl = list_first_entry(timers,
- struct cpu_timer_list,
- entry);
- if (!--maxfire || sum_sched_runtime < tl->expires.sched) {
- sched_expires = tl->expires.sched;
- break;
- }
- tl->firing = 1;
- list_move_tail(&tl->entry, firing);
- }
+ prof_expires = check_timers_list(timers, firing, ptime);
+ virt_expires = check_timers_list(++timers, firing, utime);
+ sched_expires = check_timers_list(++timers, firing, sum_sched_runtime);
/*
* Check for the special case process timers.
@@ -1162,8 +1046,8 @@ static void check_process_timers(struct task_struct *tsk,
}
}
- sig->cputime_expires.prof_exp = prof_expires;
- sig->cputime_expires.virt_exp = virt_expires;
+ sig->cputime_expires.prof_exp = expires_to_cputime(prof_expires);
+ sig->cputime_expires.virt_exp = expires_to_cputime(virt_expires);
sig->cputime_expires.sched_exp = sched_expires;
if (task_cputime_zero(&sig->cputime_expires))
stop_process_timers(sig);
@@ -1176,7 +1060,7 @@ static void check_process_timers(struct task_struct *tsk,
void posix_cpu_timer_schedule(struct k_itimer *timer)
{
struct task_struct *p = timer->it.cpu.task;
- union cpu_time_count now;
+ unsigned long long now;
if (unlikely(p == NULL))
/*
@@ -1205,7 +1089,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
*/
put_task_struct(p);
timer->it.cpu.task = p = NULL;
- timer->it.cpu.expires.sched = 0;
+ timer->it.cpu.expires = 0;
goto out_unlock;
} else if (unlikely(p->exit_state) && thread_group_empty(p)) {
/*
@@ -1213,6 +1097,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
* not yet reaped. Take this opportunity to
* drop our task ref.
*/
+ cpu_timer_sample_group(timer->it_clock, p, &now);
clear_dead_task(timer, now);
goto out_unlock;
}
@@ -1387,7 +1272,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
cputime_t *newval, cputime_t *oldval)
{
- union cpu_time_count now;
+ unsigned long long now;
BUG_ON(clock_idx == CPUCLOCK_SCHED);
cpu_timer_sample_group(clock_idx, tsk, &now);
@@ -1399,17 +1284,17 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
* it to be absolute.
*/
if (*oldval) {
- if (*oldval <= now.cpu) {
+ if (*oldval <= now) {
/* Just about to fire. */
*oldval = cputime_one_jiffy;
} else {
- *oldval -= now.cpu;
+ *oldval -= now;
}
}
if (!*newval)
goto out;
- *newval += now.cpu;
+ *newval += now;
}
/*
@@ -1459,7 +1344,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
}
while (!signal_pending(current)) {
- if (timer.it.cpu.expires.sched == 0) {
+ if (timer.it.cpu.expires == 0) {
/*
* Our timer fired and was reset, below
* deletion can not fail.
diff --git a/kernel/printk.c b/kernel/printk.c
index 59d354e95763..a1bdf61d38ba 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -32,6 +32,7 @@
#include <linux/security.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
+#include <linux/aio.h>
#include <linux/syscalls.h>
#include <linux/kexec.h>
#include <linux/kdb.h>
@@ -362,6 +363,46 @@ static void log_store(int facility, int level,
log_next_seq++;
}
+#ifdef CONFIG_SECURITY_DMESG_RESTRICT
+int dmesg_restrict = 1;
+#else
+int dmesg_restrict;
+#endif
+
+static int syslog_action_restricted(int type)
+{
+ if (dmesg_restrict)
+ return 1;
+ /* Unless restricted, we allow "read all" and "get buffer size" for everybody */
+ return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER;
+}
+
+static int check_syslog_permissions(int type, bool from_file)
+{
+ /*
+ * If this is from /proc/kmsg and we've already opened it, then we've
+ * already done the capabilities checks at open time.
+ */
+ if (from_file && type != SYSLOG_ACTION_OPEN)
+ goto ok;
+
+ if (syslog_action_restricted(type)) {
+ if (capable(CAP_SYSLOG))
+ goto ok;
+ /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */
+ if (capable(CAP_SYS_ADMIN)) {
+ printk_once(KERN_WARNING "%s (%d): "
+ "Attempt to access syslog with CAP_SYS_ADMIN "
+ "but no CAP_SYSLOG (deprecated).\n",
+ current->comm, task_pid_nr(current));
+ goto ok;
+ }
+ return -EPERM;
+ }
+ok:
+ return security_syslog(type);
+}
+
/* /dev/kmsg - userspace message inject/listen interface */
struct devkmsg_user {
u64 seq;
@@ -437,10 +478,16 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
char cont = '-';
size_t len;
ssize_t ret;
+ int err;
if (!user)
return -EBADF;
+ err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
+ SYSLOG_FROM_FILE);
+ if (err)
+ return err;
+
ret = mutex_lock_interruptible(&user->lock);
if (ret)
return ret;
@@ -619,7 +666,7 @@ static int devkmsg_open(struct inode *inode, struct file *file)
if ((file->f_flags & O_ACCMODE) == O_WRONLY)
return 0;
- err = security_syslog(SYSLOG_ACTION_READ_ALL);
+ err = check_syslog_permissions(SYSLOG_ACTION_OPEN, SYSLOG_FROM_FILE);
if (err)
return err;
@@ -812,45 +859,6 @@ static inline void boot_delay_msec(int level)
}
#endif
-#ifdef CONFIG_SECURITY_DMESG_RESTRICT
-int dmesg_restrict = 1;
-#else
-int dmesg_restrict;
-#endif
-
-static int syslog_action_restricted(int type)
-{
- if (dmesg_restrict)
- return 1;
- /* Unless restricted, we allow "read all" and "get buffer size" for everybody */
- return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER;
-}
-
-static int check_syslog_permissions(int type, bool from_file)
-{
- /*
- * If this is from /proc/kmsg and we've already opened it, then we've
- * already done the capabilities checks at open time.
- */
- if (from_file && type != SYSLOG_ACTION_OPEN)
- return 0;
-
- if (syslog_action_restricted(type)) {
- if (capable(CAP_SYSLOG))
- return 0;
- /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */
- if (capable(CAP_SYS_ADMIN)) {
- printk_once(KERN_WARNING "%s (%d): "
- "Attempt to access syslog with CAP_SYS_ADMIN "
- "but no CAP_SYSLOG (deprecated).\n",
- current->comm, task_pid_nr(current));
- return 0;
- }
- return -EPERM;
- }
- return 0;
-}
-
#if defined(CONFIG_PRINTK_TIME)
static bool printk_time = 1;
#else
@@ -1126,10 +1134,6 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
if (error)
goto out;
- error = security_syslog(type);
- if (error)
- return error;
-
switch (type) {
case SYSLOG_ACTION_CLOSE: /* Close log */
break;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 17ae54da0ec2..aed981a3f69c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -17,6 +17,7 @@
#include <linux/ptrace.h>
#include <linux/security.h>
#include <linux/signal.h>
+#include <linux/uio.h>
#include <linux/audit.h>
#include <linux/pid_namespace.h>
#include <linux/syscalls.h>
diff --git a/kernel/relay.c b/kernel/relay.c
index b91488ba2e5a..e03440ba0f20 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -339,6 +339,10 @@ static void wakeup_readers(unsigned long data)
{
struct rchan_buf *buf = (struct rchan_buf *)data;
wake_up_interruptible(&buf->read_wait);
+ /*
+ * Stupid polling for now:
+ */
+ mod_timer(&buf->timer, jiffies + 1);
}
/**
@@ -356,6 +360,7 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
init_waitqueue_head(&buf->read_wait);
kref_init(&buf->kref);
setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf);
+ mod_timer(&buf->timer, jiffies + 1);
} else
del_timer_sync(&buf->timer);
@@ -739,15 +744,6 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
else
buf->early_bytes += buf->chan->subbuf_size -
buf->padding[old_subbuf];
- smp_mb();
- if (waitqueue_active(&buf->read_wait))
- /*
- * Calling wake_up_interruptible() from here
- * will deadlock if we happen to be logging
- * from the scheduler (trying to re-grab
- * rq->lock), so defer it.
- */
- mod_timer(&buf->timer, jiffies + 1);
}
old = buf->data;
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index b7a10048a32c..237741439ae9 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -39,6 +39,8 @@
* is only needed for handling filters shared across tasks.
* @prev: points to a previously installed, or inherited, filter
* @len: the number of instructions in the program
+ * @bpf_func: points to either sk_run_filter or the code generated
+ * by the BPF JIT.
* @insns: the BPF program instructions to evaluate
*
* seccomp_filter objects are organized in a tree linked via the @prev
@@ -55,6 +57,8 @@ struct seccomp_filter {
atomic_t usage;
struct seccomp_filter *prev;
unsigned short len; /* Instruction count */
+ unsigned int (*bpf_func)(const struct sk_buff *skb,
+ const struct sock_filter *filter);
struct sock_filter insns[];
};
@@ -213,7 +217,7 @@ static u32 seccomp_run_filters(int syscall)
* value always takes priority (ignoring the DATA).
*/
for (f = current->seccomp.filter; f; f = f->prev) {
- u32 cur_ret = sk_run_filter(NULL, f->insns);
+ u32 cur_ret = f->bpf_func(NULL, f->insns);
if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
ret = cur_ret;
}
@@ -275,6 +279,9 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
if (ret)
goto fail;
+ filter->bpf_func = sk_run_filter;
+ seccomp_jit_compile(filter);
+
/*
* If there is an existing filter, make it the prev and don't drop its
* task reference.
@@ -332,10 +339,34 @@ void put_seccomp_filter(struct task_struct *tsk)
while (orig && atomic_dec_and_test(&orig->usage)) {
struct seccomp_filter *freeme = orig;
orig = orig->prev;
+ seccomp_jit_free(freeme);
kfree(freeme);
}
}
+/*
+ * seccomp filter accessors for JIT code.
+ */
+unsigned short seccomp_filter_get_len(struct seccomp_filter *fp)
+{
+ return fp->len;
+}
+
+struct sock_filter *seccomp_filter_get_insns(struct seccomp_filter *fp)
+{
+ return fp->insns;
+}
+
+void seccomp_filter_set_bpf_func(struct seccomp_filter *fp, void *func)
+{
+ fp->bpf_func = func;
+}
+
+void *seccomp_filter_get_bpf_func(struct seccomp_filter *fp)
+{
+ return fp->bpf_func;
+}
+
/**
* seccomp_send_sigsys - signals the task to allow in-process syscall emulation
* @syscall: syscall number to send to userland
diff --git a/kernel/smp.c b/kernel/smp.c
index 4dba0f7b72ad..97084cfc61ca 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -12,6 +12,7 @@
#include <linux/gfp.h>
#include <linux/smp.h>
#include <linux/cpu.h>
+#include <linux/hardirq.h>
#include "smpboot.h"
@@ -240,8 +241,9 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
* send smp call function interrupt to this cpu and as such deadlocks
* can't happen.
*/
- WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
- && !oops_in_progress);
+ WARN_ON_ONCE(cpu_online(this_cpu)
+ && (irqs_disabled() || in_serving_irq())
+ && !oops_in_progress);
if (cpu == this_cpu) {
local_irq_save(flags);
@@ -378,8 +380,9 @@ void smp_call_function_many(const struct cpumask *mask,
* send smp call function interrupt to this cpu and as such deadlocks
* can't happen.
*/
- WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
- && !oops_in_progress && !early_boot_irqs_disabled);
+ WARN_ON_ONCE(cpu_online(this_cpu)
+ && (irqs_disabled() || in_serving_irq())
+ && !oops_in_progress && !early_boot_irqs_disabled);
/* Try to fastpath. So, what's a CPU they want? Ignoring this one. */
cpu = cpumask_first_and(mask, cpu_online_mask);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 05039e348f07..ea741c32d596 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -239,10 +239,12 @@ static void watchdog_overflow_callback(struct perf_event *event,
if (__this_cpu_read(hard_watchdog_warn) == true)
return;
- if (hardlockup_panic)
+ if (hardlockup_panic) {
+ trigger_all_cpu_backtrace();
panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
- else
+ } else {
WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
+ }
__this_cpu_write(hard_watchdog_warn, true);
return;
@@ -323,8 +325,10 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
else
dump_stack();
- if (softlockup_panic)
+ if (softlockup_panic) {
+ trigger_all_cpu_backtrace();
panic("softlockup: hung tasks");
+ }
__this_cpu_write(soft_watchdog_warn, true);
} else
__this_cpu_write(soft_watchdog_warn, false);
diff --git a/lib/Kconfig b/lib/Kconfig
index fe01d418b09a..d526e109b147 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -189,6 +189,15 @@ config LZO_COMPRESS
config LZO_DECOMPRESS
tristate
+config LZ4_COMPRESS
+ tristate
+
+config LZ4HC_COMPRESS
+ tristate
+
+config LZ4_DECOMPRESS
+ tristate
+
source "lib/xz/Kconfig"
#
@@ -213,6 +222,10 @@ config DECOMPRESS_LZO
select LZO_DECOMPRESS
tristate
+config DECOMPRESS_LZ4
+ select LZ4_DECOMPRESS
+ tristate
+
#
# Generic allocator support is selected if needed
#
diff --git a/lib/Makefile b/lib/Makefile
index e9c52e1b853a..af79e8c25c1b 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -13,7 +13,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \
proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \
is_single_threaded.o plist.o decompress.o kobject_uevent.o \
- earlycpio.o
+ earlycpio.o percpu-refcount.o
obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o
lib-$(CONFIG_MMU) += ioremap.o
@@ -75,6 +75,9 @@ obj-$(CONFIG_REED_SOLOMON) += reed_solomon/
obj-$(CONFIG_BCH) += bch.o
obj-$(CONFIG_LZO_COMPRESS) += lzo/
obj-$(CONFIG_LZO_DECOMPRESS) += lzo/
+obj-$(CONFIG_LZ4_COMPRESS) += lz4/
+obj-$(CONFIG_LZ4HC_COMPRESS) += lz4/
+obj-$(CONFIG_LZ4_DECOMPRESS) += lz4/
obj-$(CONFIG_XZ_DEC) += xz/
obj-$(CONFIG_RAID6_PQ) += raid6/
@@ -83,6 +86,7 @@ lib-$(CONFIG_DECOMPRESS_BZIP2) += decompress_bunzip2.o
lib-$(CONFIG_DECOMPRESS_LZMA) += decompress_unlzma.o
lib-$(CONFIG_DECOMPRESS_XZ) += decompress_unxz.o
lib-$(CONFIG_DECOMPRESS_LZO) += decompress_unlzo.o
+lib-$(CONFIG_DECOMPRESS_LZ4) += decompress_unlz4.o
obj-$(CONFIG_TEXTSEARCH) += textsearch.o
obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o
diff --git a/lib/decompress.c b/lib/decompress.c
index f8fdedaf7b3d..4d1cd0397aab 100644
--- a/lib/decompress.c
+++ b/lib/decompress.c
@@ -11,6 +11,7 @@
#include <linux/decompress/unxz.h>
#include <linux/decompress/inflate.h>
#include <linux/decompress/unlzo.h>
+#include <linux/decompress/unlz4.h>
#include <linux/types.h>
#include <linux/string.h>
@@ -31,6 +32,9 @@
#ifndef CONFIG_DECOMPRESS_LZO
# define unlzo NULL
#endif
+#ifndef CONFIG_DECOMPRESS_LZ4
+# define unlz4 NULL
+#endif
struct compress_format {
unsigned char magic[2];
@@ -45,6 +49,7 @@ static const struct compress_format compressed_formats[] __initconst = {
{ {0x5d, 0x00}, "lzma", unlzma },
{ {0xfd, 0x37}, "xz", unxz },
{ {0x89, 0x4c}, "lzo", unlzo },
+ { {0x02, 0x21}, "lz4", unlz4 },
{ {0, 0}, NULL, NULL }
};
diff --git a/lib/decompress_unlz4.c b/lib/decompress_unlz4.c
new file mode 100644
index 000000000000..3e67cfad16ad
--- /dev/null
+++ b/lib/decompress_unlz4.c
@@ -0,0 +1,187 @@
+/*
+ * Wrapper for decompressing LZ4-compressed kernel, initramfs, and initrd
+ *
+ * Copyright (C) 2013, LG Electronics, Kyungsik Lee <kyungsik.lee@lge.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifdef STATIC
+#define PREBOOT
+#include "lz4/lz4_decompress.c"
+#else
+#include <linux/decompress/unlz4.h>
+#endif
+#include <linux/types.h>
+#include <linux/lz4.h>
+#include <linux/decompress/mm.h>
+#include <linux/compiler.h>
+
+#include <asm/unaligned.h>
+
+/*
+ * Note: Uncompressed chunk size is used in the compressor side
+ * (userspace side for compression).
+ * It is hardcoded because there is not proper way to extract it
+ * from the binary stream which is generated by the preliminary
+ * version of LZ4 tool so far.
+ */
+#define LZ4_DEFAULT_UNCOMPRESSED_CHUNK_SIZE (8 << 20)
+#define ARCHIVE_MAGICNUMBER 0x184C2102
+
+STATIC inline int INIT unlz4(u8 *input, int in_len,
+ int (*fill) (void *, unsigned int),
+ int (*flush) (void *, unsigned int),
+ u8 *output, int *posp,
+ void (*error) (char *x))
+{
+ int ret = -1;
+ size_t chunksize = 0;
+ size_t uncomp_chunksize = LZ4_DEFAULT_UNCOMPRESSED_CHUNK_SIZE;
+ u8 *inp;
+ u8 *inp_start;
+ u8 *outp;
+ int size = in_len;
+#ifdef PREBOOT
+ size_t out_len = get_unaligned_le32(input + in_len);
+#endif
+ size_t dest_len;
+
+
+ if (output) {
+ outp = output;
+ } else if (!flush) {
+ error("NULL output pointer and no flush function provided");
+ goto exit_0;
+ } else {
+ outp = large_malloc(uncomp_chunksize);
+ if (!outp) {
+ error("Could not allocate output buffer");
+ goto exit_0;
+ }
+ }
+
+ if (input && fill) {
+ error("Both input pointer and fill function provided,");
+ goto exit_1;
+ } else if (input) {
+ inp = input;
+ } else if (!fill) {
+ error("NULL input pointer and missing fill function");
+ goto exit_1;
+ } else {
+ inp = large_malloc(lz4_compressbound(uncomp_chunksize));
+ if (!inp) {
+ error("Could not allocate input buffer");
+ goto exit_1;
+ }
+ }
+ inp_start = inp;
+
+ if (posp)
+ *posp = 0;
+
+ if (fill)
+ fill(inp, 4);
+
+ chunksize = get_unaligned_le32(inp);
+ if (chunksize == ARCHIVE_MAGICNUMBER) {
+ inp += 4;
+ size -= 4;
+ } else {
+ error("invalid header");
+ goto exit_2;
+ }
+
+ if (posp)
+ *posp += 4;
+
+ for (;;) {
+
+ if (fill)
+ fill(inp, 4);
+
+ chunksize = get_unaligned_le32(inp);
+ if (chunksize == ARCHIVE_MAGICNUMBER) {
+ inp += 4;
+ size -= 4;
+ if (posp)
+ *posp += 4;
+ continue;
+ }
+ inp += 4;
+ size -= 4;
+
+ if (posp)
+ *posp += 4;
+
+ if (fill) {
+ if (chunksize > lz4_compressbound(uncomp_chunksize)) {
+ error("chunk length is longer than allocated");
+ goto exit_2;
+ }
+ fill(inp, chunksize);
+ }
+#ifdef PREBOOT
+ if (out_len >= uncomp_chunksize) {
+ dest_len = uncomp_chunksize;
+ out_len -= dest_len;
+ } else
+ dest_len = out_len;
+ ret = lz4_decompress(inp, &chunksize, outp, dest_len);
+#else
+ dest_len = uncomp_chunksize;
+ ret = lz4_decompress_unknownoutputsize(inp, chunksize, outp,
+ &dest_len);
+#endif
+ if (ret < 0) {
+ error("Decoding failed");
+ goto exit_2;
+ }
+
+ if (flush && flush(outp, dest_len) != dest_len)
+ goto exit_2;
+ if (output)
+ outp += dest_len;
+ if (posp)
+ *posp += chunksize;
+
+ size -= chunksize;
+
+ if (size == 0)
+ break;
+ else if (size < 0) {
+ error("data corrupted");
+ goto exit_2;
+ }
+
+ inp += chunksize;
+ if (fill)
+ inp = inp_start;
+ }
+
+ ret = 0;
+exit_2:
+ if (!input)
+ large_free(inp_start);
+exit_1:
+ if (!output)
+ large_free(outp);
+exit_0:
+ return ret;
+}
+
+#ifdef PREBOOT
+STATIC int INIT decompress(unsigned char *buf, int in_len,
+ int(*fill)(void*, unsigned int),
+ int(*flush)(void*, unsigned int),
+ unsigned char *output,
+ int *posp,
+ void(*error)(char *x)
+ )
+{
+ return unlz4(buf, in_len - 4, fill, flush, output, posp, error);
+}
+#endif
diff --git a/lib/lz4/Makefile b/lib/lz4/Makefile
new file mode 100644
index 000000000000..8085d04e9309
--- /dev/null
+++ b/lib/lz4/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_LZ4_COMPRESS) += lz4_compress.o
+obj-$(CONFIG_LZ4HC_COMPRESS) += lz4hc_compress.o
+obj-$(CONFIG_LZ4_DECOMPRESS) += lz4_decompress.o
diff --git a/lib/lz4/lz4_compress.c b/lib/lz4/lz4_compress.c
new file mode 100644
index 000000000000..fd94058bd7f9
--- /dev/null
+++ b/lib/lz4/lz4_compress.c
@@ -0,0 +1,443 @@
+/*
+ * LZ4 - Fast LZ compression algorithm
+ * Copyright (C) 2011-2012, Yann Collet.
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at :
+ * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
+ * - LZ4 source repository : http://code.google.com/p/lz4/
+ *
+ * Changed for kernel use by:
+ * Chanho Min <chanho.min@lge.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/lz4.h>
+#include <asm/unaligned.h>
+#include "lz4defs.h"
+
+/*
+ * LZ4_compressCtx :
+ * -----------------
+ * Compress 'isize' bytes from 'source' into an output buffer 'dest' of
+ * maximum size 'maxOutputSize'. * If it cannot achieve it, compression
+ * will stop, and result of the function will be zero.
+ * return : the number of bytes written in buffer 'dest', or 0 if the
+ * compression fails
+ */
+static inline int lz4_compressctx(void *ctx,
+ const char *source,
+ char *dest,
+ int isize,
+ int maxoutputsize)
+{
+ HTYPE *hashtable = (HTYPE *)ctx;
+ const u8 *ip = (u8 *)source;
+#if LZ4_ARCH64
+ const BYTE * const base = ip;
+#else
+ const int base = 0;
+#endif
+ const u8 *anchor = ip;
+ const u8 *const iend = ip + isize;
+ const u8 *const mflimit = iend - MFLIMIT;
+ #define MATCHLIMIT (iend - LASTLITERALS)
+
+ u8 *op = (u8 *) dest;
+ u8 *const oend = op + maxoutputsize;
+ int length;
+ const int skipstrength = SKIPSTRENGTH;
+ u32 forwardh;
+ int lastrun;
+
+ /* Init */
+ if (isize < MINLENGTH)
+ goto _last_literals;
+
+ memset((void *)hashtable, 0, LZ4_MEM_COMPRESS);
+
+ /* First Byte */
+ hashtable[LZ4_HASH_VALUE(ip)] = ip - base;
+ ip++;
+ forwardh = LZ4_HASH_VALUE(ip);
+
+ /* Main Loop */
+ for (;;) {
+ int findmatchattempts = (1U << skipstrength) + 3;
+ const u8 *forwardip = ip;
+ const u8 *ref;
+ u8 *token;
+
+ /* Find a match */
+ do {
+ u32 h = forwardh;
+ int step = findmatchattempts++ >> skipstrength;
+ ip = forwardip;
+ forwardip = ip + step;
+
+ if (unlikely(forwardip > mflimit))
+ goto _last_literals;
+
+ forwardh = LZ4_HASH_VALUE(forwardip);
+ ref = base + hashtable[h];
+ hashtable[h] = ip - base;
+ } while ((ref < ip - MAX_DISTANCE) || (A32(ref) != A32(ip)));
+
+ /* Catch up */
+ while ((ip > anchor) && (ref > (u8 *)source) &&
+ unlikely(ip[-1] == ref[-1])) {
+ ip--;
+ ref--;
+ }
+
+ /* Encode Literal length */
+ length = (int)(ip - anchor);
+ token = op++;
+ /* check output limit */
+ if (unlikely(op + length + (2 + 1 + LASTLITERALS) +
+ (length >> 8) > oend))
+ return 0;
+
+ if (length >= (int)RUN_MASK) {
+ int len;
+ *token = (RUN_MASK << ML_BITS);
+ len = length - RUN_MASK;
+ for (; len > 254 ; len -= 255)
+ *op++ = 255;
+ *op++ = (u8)len;
+ } else
+ *token = (length << ML_BITS);
+
+ /* Copy Literals */
+ LZ4_BLINDCOPY(anchor, op, length);
+_next_match:
+ /* Encode Offset */
+ LZ4_WRITE_LITTLEENDIAN_16(op, (u16)(ip - ref));
+
+ /* Start Counting */
+ ip += MINMATCH;
+ /* MinMatch verified */
+ ref += MINMATCH;
+ anchor = ip;
+ while (likely(ip < MATCHLIMIT - (STEPSIZE - 1))) {
+ #if LZ4_ARCH64
+ u64 diff = A64(ref) ^ A64(ip);
+ #else
+ u32 diff = A32(ref) ^ A32(ip);
+ #endif
+ if (!diff) {
+ ip += STEPSIZE;
+ ref += STEPSIZE;
+ continue;
+ }
+ ip += LZ4_NBCOMMONBYTES(diff);
+ goto _endcount;
+ }
+ #if LZ4_ARCH64
+ if ((ip < (MATCHLIMIT - 3)) && (A32(ref) == A32(ip))) {
+ ip += 4;
+ ref += 4;
+ }
+ #endif
+ if ((ip < (MATCHLIMIT - 1)) && (A16(ref) == A16(ip))) {
+ ip += 2;
+ ref += 2;
+ }
+ if ((ip < MATCHLIMIT) && (*ref == *ip))
+ ip++;
+_endcount:
+ /* Encode MatchLength */
+ length = (int)(ip - anchor);
+ /* Check output limit */
+ if (unlikely(op + (1 + LASTLITERALS) + (length >> 8) > oend))
+ return 0;
+ if (length >= (int)ML_MASK) {
+ *token += ML_MASK;
+ length -= ML_MASK;
+ for (; length > 509 ; length -= 510) {
+ *op++ = 255;
+ *op++ = 255;
+ }
+ if (length > 254) {
+ length -= 255;
+ *op++ = 255;
+ }
+ *op++ = (u8)length;
+ } else
+ *token += length;
+
+ /* Test end of chunk */
+ if (ip > mflimit) {
+ anchor = ip;
+ break;
+ }
+
+ /* Fill table */
+ hashtable[LZ4_HASH_VALUE(ip-2)] = ip - 2 - base;
+
+ /* Test next position */
+ ref = base + hashtable[LZ4_HASH_VALUE(ip)];
+ hashtable[LZ4_HASH_VALUE(ip)] = ip - base;
+ if ((ref > ip - (MAX_DISTANCE + 1)) && (A32(ref) == A32(ip))) {
+ token = op++;
+ *token = 0;
+ goto _next_match;
+ }
+
+ /* Prepare next loop */
+ anchor = ip++;
+ forwardh = LZ4_HASH_VALUE(ip);
+ }
+
+_last_literals:
+ /* Encode Last Literals */
+ lastrun = (int)(iend - anchor);
+ if (((char *)op - dest) + lastrun + 1
+ + ((lastrun + 255 - RUN_MASK) / 255) > (u32)maxoutputsize)
+ return 0;
+
+ if (lastrun >= (int)RUN_MASK) {
+ *op++ = (RUN_MASK << ML_BITS);
+ lastrun -= RUN_MASK;
+ for (; lastrun > 254 ; lastrun -= 255)
+ *op++ = 255;
+ *op++ = (u8)lastrun;
+ } else
+ *op++ = (lastrun << ML_BITS);
+ memcpy(op, anchor, iend - anchor);
+ op += iend - anchor;
+
+ /* End */
+ return (int)(((char *)op) - dest);
+}
+
+static inline int lz4_compress64kctx(void *ctx,
+ const char *source,
+ char *dest,
+ int isize,
+ int maxoutputsize)
+{
+ u16 *hashtable = (u16 *)ctx;
+ const u8 *ip = (u8 *) source;
+ const u8 *anchor = ip;
+ const u8 *const base = ip;
+ const u8 *const iend = ip + isize;
+ const u8 *const mflimit = iend - MFLIMIT;
+ #define MATCHLIMIT (iend - LASTLITERALS)
+
+ u8 *op = (u8 *) dest;
+ u8 *const oend = op + maxoutputsize;
+ int len, length;
+ const int skipstrength = SKIPSTRENGTH;
+ u32 forwardh;
+ int lastrun;
+
+ /* Init */
+ if (isize < MINLENGTH)
+ goto _last_literals;
+
+ memset((void *)hashtable, 0, LZ4_MEM_COMPRESS);
+
+ /* First Byte */
+ ip++;
+ forwardh = LZ4_HASH64K_VALUE(ip);
+
+ /* Main Loop */
+ for (;;) {
+ int findmatchattempts = (1U << skipstrength) + 3;
+ const u8 *forwardip = ip;
+ const u8 *ref;
+ u8 *token;
+
+ /* Find a match */
+ do {
+ u32 h = forwardh;
+ int step = findmatchattempts++ >> skipstrength;
+ ip = forwardip;
+ forwardip = ip + step;
+
+ if (forwardip > mflimit)
+ goto _last_literals;
+
+ forwardh = LZ4_HASH64K_VALUE(forwardip);
+ ref = base + hashtable[h];
+ hashtable[h] = (u16)(ip - base);
+ } while (A32(ref) != A32(ip));
+
+ /* Catch up */
+ while ((ip > anchor) && (ref > (u8 *)source)
+ && (ip[-1] == ref[-1])) {
+ ip--;
+ ref--;
+ }
+
+ /* Encode Literal length */
+ length = (int)(ip - anchor);
+ token = op++;
+ /* Check output limit */
+ if (unlikely(op + length + (2 + 1 + LASTLITERALS)
+ + (length >> 8) > oend))
+ return 0;
+ if (length >= (int)RUN_MASK) {
+ *token = (RUN_MASK << ML_BITS);
+ len = length - RUN_MASK;
+ for (; len > 254 ; len -= 255)
+ *op++ = 255;
+ *op++ = (u8)len;
+ } else
+ *token = (length << ML_BITS);
+
+ /* Copy Literals */
+ LZ4_BLINDCOPY(anchor, op, length);
+
+_next_match:
+ /* Encode Offset */
+ LZ4_WRITE_LITTLEENDIAN_16(op, (u16)(ip - ref));
+
+ /* Start Counting */
+ ip += MINMATCH;
+ /* MinMatch verified */
+ ref += MINMATCH;
+ anchor = ip;
+
+ while (ip < MATCHLIMIT - (STEPSIZE - 1)) {
+ #if LZ4_ARCH64
+ u64 diff = A64(ref) ^ A64(ip);
+ #else
+ u32 diff = A32(ref) ^ A32(ip);
+ #endif
+
+ if (!diff) {
+ ip += STEPSIZE;
+ ref += STEPSIZE;
+ continue;
+ }
+ ip += LZ4_NBCOMMONBYTES(diff);
+ goto _endcount;
+ }
+ #if LZ4_ARCH64
+ if ((ip < (MATCHLIMIT - 3)) && (A32(ref) == A32(ip))) {
+ ip += 4;
+ ref += 4;
+ }
+ #endif
+ if ((ip < (MATCHLIMIT - 1)) && (A16(ref) == A16(ip))) {
+ ip += 2;
+ ref += 2;
+ }
+ if ((ip < MATCHLIMIT) && (*ref == *ip))
+ ip++;
+_endcount:
+
+ /* Encode MatchLength */
+ len = (int)(ip - anchor);
+ /* Check output limit */
+ if (unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend))
+ return 0;
+ if (len >= (int)ML_MASK) {
+ *token += ML_MASK;
+ len -= ML_MASK;
+ for (; len > 509 ; len -= 510) {
+ *op++ = 255;
+ *op++ = 255;
+ }
+ if (len > 254) {
+ len -= 255;
+ *op++ = 255;
+ }
+ *op++ = (u8)len;
+ } else
+ *token += len;
+
+ /* Test end of chunk */
+ if (ip > mflimit) {
+ anchor = ip;
+ break;
+ }
+
+ /* Fill table */
+ hashtable[LZ4_HASH64K_VALUE(ip-2)] = (u16)(ip - 2 - base);
+
+ /* Test next position */
+ ref = base + hashtable[LZ4_HASH64K_VALUE(ip)];
+ hashtable[LZ4_HASH64K_VALUE(ip)] = (u16)(ip - base);
+ if (A32(ref) == A32(ip)) {
+ token = op++;
+ *token = 0;
+ goto _next_match;
+ }
+
+ /* Prepare next loop */
+ anchor = ip++;
+ forwardh = LZ4_HASH64K_VALUE(ip);
+ }
+
+_last_literals:
+ /* Encode Last Literals */
+ lastrun = (int)(iend - anchor);
+ if (op + lastrun + 1 + (lastrun - RUN_MASK + 255) / 255 > oend)
+ return 0;
+ if (lastrun >= (int)RUN_MASK) {
+ *op++ = (RUN_MASK << ML_BITS);
+ lastrun -= RUN_MASK;
+ for (; lastrun > 254 ; lastrun -= 255)
+ *op++ = 255;
+ *op++ = (u8)lastrun;
+ } else
+ *op++ = (lastrun << ML_BITS);
+ memcpy(op, anchor, iend - anchor);
+ op += iend - anchor;
+ /* End */
+ return (int)(((char *)op) - dest);
+}
+
+int lz4_compress(const unsigned char *src, size_t src_len,
+ unsigned char *dst, size_t *dst_len, void *wrkmem)
+{
+ int ret = -1;
+ int out_len = 0;
+
+ if (src_len < LZ4_64KLIMIT)
+ out_len = lz4_compress64kctx(wrkmem, src, dst, src_len,
+ lz4_compressbound(src_len));
+ else
+ out_len = lz4_compressctx(wrkmem, src, dst, src_len,
+ lz4_compressbound(src_len));
+
+ if (out_len < 0)
+ goto exit;
+
+ *dst_len = out_len;
+
+ return 0;
+exit:
+ return ret;
+}
+EXPORT_SYMBOL_GPL(lz4_compress);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("LZ4 compressor");
diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c
new file mode 100644
index 000000000000..d3414eae73a1
--- /dev/null
+++ b/lib/lz4/lz4_decompress.c
@@ -0,0 +1,326 @@
+/*
+ * LZ4 Decompressor for Linux kernel
+ *
+ * Copyright (C) 2013, LG Electronics, Kyungsik Lee <kyungsik.lee@lge.com>
+ *
+ * Based on LZ4 implementation by Yann Collet.
+ *
+ * LZ4 - Fast LZ compression algorithm
+ * Copyright (C) 2011-2012, Yann Collet.
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at :
+ * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
+ * - LZ4 source repository : http://code.google.com/p/lz4/
+ */
+
+#ifndef STATIC
+#include <linux/module.h>
+#include <linux/kernel.h>
+#endif
+#include <linux/lz4.h>
+
+#include <asm/unaligned.h>
+
+#include "lz4defs.h"
+
+static int lz4_uncompress(const char *source, char *dest, int osize)
+{
+ const BYTE *ip = (const BYTE *) source;
+ const BYTE *ref;
+ BYTE *op = (BYTE *) dest;
+ BYTE * const oend = op + osize;
+ BYTE *cpy;
+ unsigned token;
+ size_t length;
+ size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0};
+#if LZ4_ARCH64
+ size_t dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3};
+#endif
+
+ while (1) {
+
+ /* get runlength */
+ token = *ip++;
+ length = (token >> ML_BITS);
+ if (length == RUN_MASK) {
+ size_t len;
+
+ len = *ip++;
+ for (; len == 255; length += 255)
+ len = *ip++;
+ length += len;
+ }
+
+ /* copy literals */
+ cpy = op + length;
+ if (unlikely(cpy > oend - COPYLENGTH)) {
+ /*
+ * Error: not enough place for another match
+ * (min 4) + 5 literals
+ */
+ if (cpy != oend)
+ goto _output_error;
+
+ memcpy(op, ip, length);
+ ip += length;
+ break; /* EOF */
+ }
+ LZ4_WILDCOPY(ip, op, cpy);
+ ip -= (op - cpy);
+ op = cpy;
+
+ /* get offset */
+ LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip);
+ ip += 2;
+
+ /* Error: offset create reference outside destination buffer */
+ if (unlikely(ref < (BYTE *const) dest))
+ goto _output_error;
+
+ /* get matchlength */
+ length = token & ML_MASK;
+ if (length == ML_MASK) {
+ for (; *ip == 255; length += 255)
+ ip++;
+ length += *ip++;
+ }
+
+ /* copy repeated sequence */
+ if (unlikely((op - ref) < STEPSIZE)) {
+#if LZ4_ARCH64
+ size_t dec64 = dec64table[op - ref];
+#else
+ const int dec64 = 0;
+#endif
+ op[0] = ref[0];
+ op[1] = ref[1];
+ op[2] = ref[2];
+ op[3] = ref[3];
+ op += 4;
+ ref += 4;
+ ref -= dec32table[op-ref];
+ PUT4(ref, op);
+ op += STEPSIZE - 4;
+ ref -= dec64;
+ } else {
+ LZ4_COPYSTEP(ref, op);
+ }
+ cpy = op + length - (STEPSIZE - 4);
+ if (cpy > (oend - COPYLENGTH)) {
+
+ /* Error: request to write beyond destination buffer */
+ if (cpy > oend)
+ goto _output_error;
+ LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH));
+ while (op < cpy)
+ *op++ = *ref++;
+ op = cpy;
+ /*
+ * Check EOF (should never happen, since last 5 bytes
+ * are supposed to be literals)
+ */
+ if (op == oend)
+ goto _output_error;
+ continue;
+ }
+ LZ4_SECURECOPY(ref, op, cpy);
+ op = cpy; /* correction */
+ }
+ /* end of decoding */
+ return (int) (((char *)ip) - source);
+
+ /* write overflow error detected */
+_output_error:
+ return (int) (-(((char *)ip) - source));
+}
+
+static int lz4_uncompress_unknownoutputsize(const char *source, char *dest,
+ int isize, size_t maxoutputsize)
+{
+ const BYTE *ip = (const BYTE *) source;
+ const BYTE *const iend = ip + isize;
+ const BYTE *ref;
+
+
+ BYTE *op = (BYTE *) dest;
+ BYTE * const oend = op + maxoutputsize;
+ BYTE *cpy;
+
+ size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0};
+#if LZ4_ARCH64
+ size_t dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3};
+#endif
+
+ /* Main Loop */
+ while (ip < iend) {
+
+ unsigned token;
+ size_t length;
+
+ /* get runlength */
+ token = *ip++;
+ length = (token >> ML_BITS);
+ if (length == RUN_MASK) {
+ int s = 255;
+ while ((ip < iend) && (s == 255)) {
+ s = *ip++;
+ length += s;
+ }
+ }
+ /* copy literals */
+ cpy = op + length;
+ if ((cpy > oend - COPYLENGTH) ||
+ (ip + length > iend - COPYLENGTH)) {
+
+ if (cpy > oend)
+ goto _output_error;/* writes beyond buffer */
+
+ if (ip + length != iend)
+ goto _output_error;/*
+ * Error: LZ4 format requires
+ * to consume all input
+ * at this stage
+ */
+ memcpy(op, ip, length);
+ op += length;
+ break;/* Necessarily EOF, due to parsing restrictions */
+ }
+ LZ4_WILDCOPY(ip, op, cpy);
+ ip -= (op - cpy);
+ op = cpy;
+
+ /* get offset */
+ LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip);
+ ip += 2;
+ if (ref < (BYTE * const) dest)
+ goto _output_error;
+ /*
+ * Error : offset creates reference
+ * outside of destination buffer
+ */
+
+ /* get matchlength */
+ length = (token & ML_MASK);
+ if (length == ML_MASK) {
+ while (ip < iend) {
+ int s = *ip++;
+ length += s;
+ if (s == 255)
+ continue;
+ break;
+ }
+ }
+
+ /* copy repeated sequence */
+ if (unlikely((op - ref) < STEPSIZE)) {
+#if LZ4_ARCH64
+ size_t dec64 = dec64table[op - ref];
+#else
+ const int dec64 = 0;
+#endif
+ op[0] = ref[0];
+ op[1] = ref[1];
+ op[2] = ref[2];
+ op[3] = ref[3];
+ op += 4;
+ ref += 4;
+ ref -= dec32table[op - ref];
+ PUT4(ref, op);
+ op += STEPSIZE - 4;
+ ref -= dec64;
+ } else {
+ LZ4_COPYSTEP(ref, op);
+ }
+ cpy = op + length - (STEPSIZE-4);
+ if (cpy > oend - COPYLENGTH) {
+ if (cpy > oend)
+ goto _output_error; /* write outside of buf */
+
+ LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH));
+ while (op < cpy)
+ *op++ = *ref++;
+ op = cpy;
+ /*
+ * Check EOF (should never happen, since last 5 bytes
+ * are supposed to be literals)
+ */
+ if (op == oend)
+ goto _output_error;
+ continue;
+ }
+ LZ4_SECURECOPY(ref, op, cpy);
+ op = cpy; /* correction */
+ }
+ /* end of decoding */
+ return (int) (((char *) op) - dest);
+
+ /* write overflow error detected */
+_output_error:
+ return (int) (-(((char *) ip) - source));
+}
+
+int lz4_decompress(const char *src, size_t *src_len, char *dest,
+ size_t actual_dest_len)
+{
+ int ret = -1;
+ int input_len = 0;
+
+ input_len = lz4_uncompress(src, dest, actual_dest_len);
+ if (input_len < 0)
+ goto exit_0;
+ *src_len = input_len;
+
+ return 0;
+exit_0:
+ return ret;
+}
+#ifndef STATIC
+EXPORT_SYMBOL_GPL(lz4_decompress);
+#endif
+
+int lz4_decompress_unknownoutputsize(const char *src, size_t src_len,
+ char *dest, size_t *dest_len)
+{
+ int ret = -1;
+ int out_len = 0;
+
+ out_len = lz4_uncompress_unknownoutputsize(src, dest, src_len,
+ *dest_len);
+ if (out_len < 0)
+ goto exit_0;
+ *dest_len = out_len;
+
+ return 0;
+exit_0:
+ return ret;
+}
+#ifndef STATIC
+EXPORT_SYMBOL_GPL(lz4_decompress_unknownoutputsize);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("LZ4 Decompressor");
+#endif
diff --git a/lib/lz4/lz4defs.h b/lib/lz4/lz4defs.h
new file mode 100644
index 000000000000..abcecdc2d0f2
--- /dev/null
+++ b/lib/lz4/lz4defs.h
@@ -0,0 +1,156 @@
+/*
+ * lz4defs.h -- architecture specific defines
+ *
+ * Copyright (C) 2013, LG Electronics, Kyungsik Lee <kyungsik.lee@lge.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * Detects 64 bits mode
+ */
+#if (defined(__x86_64__) || defined(__x86_64) || defined(__amd64__) \
+ || defined(__ppc64__) || defined(__LP64__))
+#define LZ4_ARCH64 1
+#else
+#define LZ4_ARCH64 0
+#endif
+
+/*
+ * Architecture-specific macros
+ */
+#define BYTE u8
+typedef struct _U16_S { u16 v; } U16_S;
+typedef struct _U32_S { u32 v; } U32_S;
+typedef struct _U64_S { u64 v; } U64_S;
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) \
+ || defined(CONFIG_ARM) && __LINUX_ARM_ARCH__ >= 6 \
+ && defined(ARM_EFFICIENT_UNALIGNED_ACCESS)
+
+#define A16(x) (((U16_S *)(x))->v)
+#define A32(x) (((U32_S *)(x))->v)
+#define A64(x) (((U64_S *)(x))->v)
+
+#define PUT4(s, d) (A32(d) = A32(s))
+#define PUT8(s, d) (A64(d) = A64(s))
+#define LZ4_WRITE_LITTLEENDIAN_16(p, v) \
+ do { \
+ A16(p) = v; \
+ p += 2; \
+ } while (0)
+#else /* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */
+
+#define A64(x) get_unaligned((u64 *)&(((U16_S *)(x))->v))
+#define A32(x) get_unaligned((u32 *)&(((U16_S *)(x))->v))
+#define A16(x) get_unaligned((u16 *)&(((U16_S *)(x))->v))
+
+#define PUT4(s, d) \
+ put_unaligned(get_unaligned((const u32 *) s), (u32 *) d)
+#define PUT8(s, d) \
+ put_unaligned(get_unaligned((const u64 *) s), (u64 *) d)
+
+#define LZ4_WRITE_LITTLEENDIAN_16(p, v) \
+ do { \
+ put_unaligned(v, (u16 *)(p)); \
+ p += 2; \
+ } while (0)
+#endif
+
+#define COPYLENGTH 8
+#define ML_BITS 4
+#define ML_MASK ((1U << ML_BITS) - 1)
+#define RUN_BITS (8 - ML_BITS)
+#define RUN_MASK ((1U << RUN_BITS) - 1)
+#define MEMORY_USAGE 14
+#define MINMATCH 4
+#define SKIPSTRENGTH 6
+#define LASTLITERALS 5
+#define MFLIMIT (COPYLENGTH + MINMATCH)
+#define MINLENGTH (MFLIMIT + 1)
+#define MAXD_LOG 16
+#define MAXD (1 << MAXD_LOG)
+#define MAXD_MASK (u32)(MAXD - 1)
+#define MAX_DISTANCE (MAXD - 1)
+#define HASH_LOG (MAXD_LOG - 1)
+#define HASHTABLESIZE (1 << HASH_LOG)
+#define MAX_NB_ATTEMPTS 256
+#define OPTIMAL_ML (int)((ML_MASK-1)+MINMATCH)
+#define LZ4_64KLIMIT ((1<<16) + (MFLIMIT - 1))
+#define HASHLOG64K ((MEMORY_USAGE - 2) + 1)
+#define HASH64KTABLESIZE (1U << HASHLOG64K)
+#define LZ4_HASH_VALUE(p) (((A32(p)) * 2654435761U) >> \
+ ((MINMATCH * 8) - (MEMORY_USAGE-2)))
+#define LZ4_HASH64K_VALUE(p) (((A32(p)) * 2654435761U) >> \
+ ((MINMATCH * 8) - HASHLOG64K))
+#define HASH_VALUE(p) (((A32(p)) * 2654435761U) >> \
+ ((MINMATCH * 8) - HASH_LOG))
+
+#if LZ4_ARCH64/* 64-bit */
+#define STEPSIZE 8
+
+#define LZ4_COPYSTEP(s, d) \
+ do { \
+ PUT8(s, d); \
+ d += 8; \
+ s += 8; \
+ } while (0)
+
+#define LZ4_COPYPACKET(s, d) LZ4_COPYSTEP(s, d)
+
+#define LZ4_SECURECOPY(s, d, e) \
+ do { \
+ if (d < e) { \
+ LZ4_WILDCOPY(s, d, e); \
+ } \
+ } while (0)
+#define HTYPE u32
+
+#ifdef __BIG_ENDIAN
+#define LZ4_NBCOMMONBYTES(val) (__builtin_clzll(val) >> 3)
+#else
+#define LZ4_NBCOMMONBYTES(val) (__builtin_ctzll(val) >> 3)
+#endif
+
+#else /* 32-bit */
+#define STEPSIZE 4
+
+#define LZ4_COPYSTEP(s, d) \
+ do { \
+ PUT4(s, d); \
+ d += 4; \
+ s += 4; \
+ } while (0)
+
+#define LZ4_COPYPACKET(s, d) \
+ do { \
+ LZ4_COPYSTEP(s, d); \
+ LZ4_COPYSTEP(s, d); \
+ } while (0)
+
+#define LZ4_SECURECOPY LZ4_WILDCOPY
+#define HTYPE const u8*
+
+#ifdef __BIG_ENDIAN
+#define LZ4_NBCOMMONBYTES(val) (__builtin_clz(val) >> 3)
+#else
+#define LZ4_NBCOMMONBYTES(val) (__builtin_ctz(val) >> 3)
+#endif
+
+#endif
+
+#define LZ4_READ_LITTLEENDIAN_16(d, s, p) \
+ (d = s - get_unaligned_le16(p))
+
+#define LZ4_WILDCOPY(s, d, e) \
+ do { \
+ LZ4_COPYPACKET(s, d); \
+ } while (d < e)
+
+#define LZ4_BLINDCOPY(s, d, l) \
+ do { \
+ u8 *e = (d) + l; \
+ LZ4_WILDCOPY(s, d, e); \
+ d = e; \
+ } while (0)
diff --git a/lib/lz4/lz4hc_compress.c b/lib/lz4/lz4hc_compress.c
new file mode 100644
index 000000000000..eb1a74f5e368
--- /dev/null
+++ b/lib/lz4/lz4hc_compress.c
@@ -0,0 +1,539 @@
+/*
+ * LZ4 HC - High Compression Mode of LZ4
+ * Copyright (C) 2011-2012, Yann Collet.
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at :
+ * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
+ * - LZ4 source repository : http://code.google.com/p/lz4/
+ *
+ * Changed for kernel use by:
+ * Chanho Min <chanho.min@lge.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/lz4.h>
+#include <asm/unaligned.h>
+#include "lz4defs.h"
+
+struct lz4hc_data {
+ const u8 *base;
+ HTYPE hashtable[HASHTABLESIZE];
+ u16 chaintable[MAXD];
+ const u8 *nexttoupdate;
+} __attribute__((__packed__));
+
+static inline int lz4hc_init(struct lz4hc_data *hc4, const u8 *base)
+{
+ memset((void *)hc4->hashtable, 0, sizeof(hc4->hashtable));
+ memset(hc4->chaintable, 0xFF, sizeof(hc4->chaintable));
+
+#if LZ4_ARCH64
+ hc4->nexttoupdate = base + 1;
+#else
+ hc4->nexttoupdate = base;
+#endif
+ hc4->base = base;
+ return 1;
+}
+
+/* Update chains up to ip (excluded) */
+static inline void lz4hc_insert(struct lz4hc_data *hc4, const u8 *ip)
+{
+ u16 *chaintable = hc4->chaintable;
+ HTYPE *hashtable = hc4->hashtable;
+#if LZ4_ARCH64
+ const BYTE * const base = hc4->base;
+#else
+ const int base = 0;
+#endif
+
+ while (hc4->nexttoupdate < ip) {
+ const u8 *p = hc4->nexttoupdate;
+ size_t delta = p - (hashtable[HASH_VALUE(p)] + base);
+ if (delta > MAX_DISTANCE)
+ delta = MAX_DISTANCE;
+ chaintable[(size_t)(p) & MAXD_MASK] = (u16)delta;
+ hashtable[HASH_VALUE(p)] = (p) - base;
+ hc4->nexttoupdate++;
+ }
+}
+
+static inline size_t lz4hc_commonlength(const u8 *p1, const u8 *p2,
+ const u8 *const matchlimit)
+{
+ const u8 *p1t = p1;
+
+ while (p1t < matchlimit - (STEPSIZE - 1)) {
+#if LZ4_ARCH64
+ u64 diff = A64(p2) ^ A64(p1t);
+#else
+ u32 diff = A32(p2) ^ A32(p1t);
+#endif
+ if (!diff) {
+ p1t += STEPSIZE;
+ p2 += STEPSIZE;
+ continue;
+ }
+ p1t += LZ4_NBCOMMONBYTES(diff);
+ return p1t - p1;
+ }
+#if LZ4_ARCH64
+ if ((p1t < (matchlimit-3)) && (A32(p2) == A32(p1t))) {
+ p1t += 4;
+ p2 += 4;
+ }
+#endif
+
+ if ((p1t < (matchlimit - 1)) && (A16(p2) == A16(p1t))) {
+ p1t += 2;
+ p2 += 2;
+ }
+ if ((p1t < matchlimit) && (*p2 == *p1t))
+ p1t++;
+ return p1t - p1;
+}
+
+static inline int lz4hc_insertandfindbestmatch(struct lz4hc_data *hc4,
+ const u8 *ip, const u8 *const matchlimit, const u8 **matchpos)
+{
+ u16 *const chaintable = hc4->chaintable;
+ HTYPE *const hashtable = hc4->hashtable;
+ const u8 *ref;
+#if LZ4_ARCH64
+ const BYTE * const base = hc4->base;
+#else
+ const int base = 0;
+#endif
+ int nbattempts = MAX_NB_ATTEMPTS;
+ size_t repl = 0, ml = 0;
+ u16 delta;
+
+ /* HC4 match finder */
+ lz4hc_insert(hc4, ip);
+ ref = hashtable[HASH_VALUE(ip)] + base;
+
+ /* potential repetition */
+ if (ref >= ip-4) {
+ /* confirmed */
+ if (A32(ref) == A32(ip)) {
+ delta = (u16)(ip-ref);
+ repl = ml = lz4hc_commonlength(ip + MINMATCH,
+ ref + MINMATCH, matchlimit) + MINMATCH;
+ *matchpos = ref;
+ }
+ ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK];
+ }
+
+ while ((ref >= ip - MAX_DISTANCE) && nbattempts) {
+ nbattempts--;
+ if (*(ref + ml) == *(ip + ml)) {
+ if (A32(ref) == A32(ip)) {
+ size_t mlt =
+ lz4hc_commonlength(ip + MINMATCH,
+ ref + MINMATCH, matchlimit) + MINMATCH;
+ if (mlt > ml) {
+ ml = mlt;
+ *matchpos = ref;
+ }
+ }
+ }
+ ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK];
+ }
+
+ /* Complete table */
+ if (repl) {
+ const BYTE *ptr = ip;
+ const BYTE *end;
+ end = ip + repl - (MINMATCH-1);
+ /* Pre-Load */
+ while (ptr < end - delta) {
+ chaintable[(size_t)(ptr) & MAXD_MASK] = delta;
+ ptr++;
+ }
+ do {
+ chaintable[(size_t)(ptr) & MAXD_MASK] = delta;
+ /* Head of chain */
+ hashtable[HASH_VALUE(ptr)] = (ptr) - base;
+ ptr++;
+ } while (ptr < end);
+ hc4->nexttoupdate = end;
+ }
+
+ return (int)ml;
+}
+
+static inline int lz4hc_insertandgetwidermatch(struct lz4hc_data *hc4,
+ const u8 *ip, const u8 *startlimit, const u8 *matchlimit, int longest,
+ const u8 **matchpos, const u8 **startpos)
+{
+ u16 *const chaintable = hc4->chaintable;
+ HTYPE *const hashtable = hc4->hashtable;
+#if LZ4_ARCH64
+ const BYTE * const base = hc4->base;
+#else
+ const int base = 0;
+#endif
+ const u8 *ref;
+ int nbattempts = MAX_NB_ATTEMPTS;
+ int delta = (int)(ip - startlimit);
+
+ /* First Match */
+ lz4hc_insert(hc4, ip);
+ ref = hashtable[HASH_VALUE(ip)] + base;
+
+ while ((ref >= ip - MAX_DISTANCE) && (ref >= hc4->base)
+ && (nbattempts)) {
+ nbattempts--;
+ if (*(startlimit + longest) == *(ref - delta + longest)) {
+ if (A32(ref) == A32(ip)) {
+ const u8 *reft = ref + MINMATCH;
+ const u8 *ipt = ip + MINMATCH;
+ const u8 *startt = ip;
+
+ while (ipt < matchlimit-(STEPSIZE - 1)) {
+ #if LZ4_ARCH64
+ u64 diff = A64(reft) ^ A64(ipt);
+ #else
+ u32 diff = A32(reft) ^ A32(ipt);
+ #endif
+
+ if (!diff) {
+ ipt += STEPSIZE;
+ reft += STEPSIZE;
+ continue;
+ }
+ ipt += LZ4_NBCOMMONBYTES(diff);
+ goto _endcount;
+ }
+ #if LZ4_ARCH64
+ if ((ipt < (matchlimit - 3))
+ && (A32(reft) == A32(ipt))) {
+ ipt += 4;
+ reft += 4;
+ }
+ ipt += 2;
+ #endif
+ if ((ipt < (matchlimit - 1))
+ && (A16(reft) == A16(ipt))) {
+ reft += 2;
+ }
+ if ((ipt < matchlimit) && (*reft == *ipt))
+ ipt++;
+_endcount:
+ reft = ref;
+
+ while ((startt > startlimit)
+ && (reft > hc4->base)
+ && (startt[-1] == reft[-1])) {
+ startt--;
+ reft--;
+ }
+
+ if ((ipt - startt) > longest) {
+ longest = (int)(ipt - startt);
+ *matchpos = reft;
+ *startpos = startt;
+ }
+ }
+ }
+ ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK];
+ }
+ return longest;
+}
+
+static inline int lz4_encodesequence(const u8 **ip, u8 **op, const u8 **anchor,
+ int ml, const u8 *ref)
+{
+ int length, len;
+ u8 *token;
+
+ /* Encode Literal length */
+ length = (int)(*ip - *anchor);
+ token = (*op)++;
+ if (length >= (int)RUN_MASK) {
+ *token = (RUN_MASK << ML_BITS);
+ len = length - RUN_MASK;
+ for (; len > 254 ; len -= 255)
+ *(*op)++ = 255;
+ *(*op)++ = (u8)len;
+ } else
+ *token = (length << ML_BITS);
+
+ /* Copy Literals */
+ LZ4_BLINDCOPY(*anchor, *op, length);
+
+ /* Encode Offset */
+ LZ4_WRITE_LITTLEENDIAN_16(*op, (u16)(*ip - ref));
+
+ /* Encode MatchLength */
+ len = (int)(ml - MINMATCH);
+ if (len >= (int)ML_MASK) {
+ *token += ML_MASK;
+ len -= ML_MASK;
+ for (; len > 509 ; len -= 510) {
+ *(*op)++ = 255;
+ *(*op)++ = 255;
+ }
+ if (len > 254) {
+ len -= 255;
+ *(*op)++ = 255;
+ }
+ *(*op)++ = (u8)len;
+ } else
+ *token += len;
+
+ /* Prepare next loop */
+ *ip += ml;
+ *anchor = *ip;
+
+ return 0;
+}
+
+static int lz4_compresshcctx(struct lz4hc_data *ctx,
+ const char *source,
+ char *dest,
+ int isize)
+{
+ const u8 *ip = (const u8 *)source;
+ const u8 *anchor = ip;
+ const u8 *const iend = ip + isize;
+ const u8 *const mflimit = iend - MFLIMIT;
+ const u8 *const matchlimit = (iend - LASTLITERALS);
+
+ u8 *op = (u8 *)dest;
+
+ int ml, ml2, ml3, ml0;
+ const u8 *ref = NULL;
+ const u8 *start2 = NULL;
+ const u8 *ref2 = NULL;
+ const u8 *start3 = NULL;
+ const u8 *ref3 = NULL;
+ const u8 *start0;
+ const u8 *ref0;
+ int lastrun;
+
+ ip++;
+
+ /* Main Loop */
+ while (ip < mflimit) {
+ ml = lz4hc_insertandfindbestmatch(ctx, ip, matchlimit, (&ref));
+ if (!ml) {
+ ip++;
+ continue;
+ }
+
+ /* saved, in case we would skip too much */
+ start0 = ip;
+ ref0 = ref;
+ ml0 = ml;
+_search2:
+ if (ip+ml < mflimit)
+ ml2 = lz4hc_insertandgetwidermatch(ctx, ip + ml - 2,
+ ip + 1, matchlimit, ml, &ref2, &start2);
+ else
+ ml2 = ml;
+ /* No better match */
+ if (ml2 == ml) {
+ lz4_encodesequence(&ip, &op, &anchor, ml, ref);
+ continue;
+ }
+
+ if (start0 < ip) {
+ /* empirical */
+ if (start2 < ip + ml0) {
+ ip = start0;
+ ref = ref0;
+ ml = ml0;
+ }
+ }
+ /*
+ * Here, start0==ip
+ * First Match too small : removed
+ */
+ if ((start2 - ip) < 3) {
+ ml = ml2;
+ ip = start2;
+ ref = ref2;
+ goto _search2;
+ }
+
+_search3:
+ /*
+ * Currently we have :
+ * ml2 > ml1, and
+ * ip1+3 <= ip2 (usually < ip1+ml1)
+ */
+ if ((start2 - ip) < OPTIMAL_ML) {
+ int correction;
+ int new_ml = ml;
+ if (new_ml > OPTIMAL_ML)
+ new_ml = OPTIMAL_ML;
+ if (ip + new_ml > start2 + ml2 - MINMATCH)
+ new_ml = (int)(start2 - ip) + ml2 - MINMATCH;
+ correction = new_ml - (int)(start2 - ip);
+ if (correction > 0) {
+ start2 += correction;
+ ref2 += correction;
+ ml2 -= correction;
+ }
+ }
+ /*
+ * Now, we have start2 = ip+new_ml,
+ * with new_ml=min(ml, OPTIMAL_ML=18)
+ */
+ if (start2 + ml2 < mflimit)
+ ml3 = lz4hc_insertandgetwidermatch(ctx,
+ start2 + ml2 - 3, start2, matchlimit,
+ ml2, &ref3, &start3);
+ else
+ ml3 = ml2;
+
+ /* No better match : 2 sequences to encode */
+ if (ml3 == ml2) {
+ /* ip & ref are known; Now for ml */
+ if (start2 < ip+ml)
+ ml = (int)(start2 - ip);
+
+ /* Now, encode 2 sequences */
+ lz4_encodesequence(&ip, &op, &anchor, ml, ref);
+ ip = start2;
+ lz4_encodesequence(&ip, &op, &anchor, ml2, ref2);
+ continue;
+ }
+
+ /* Not enough space for match 2 : remove it */
+ if (start3 < ip + ml + 3) {
+ /*
+ * can write Seq1 immediately ==> Seq2 is removed,
+ * so Seq3 becomes Seq1
+ */
+ if (start3 >= (ip + ml)) {
+ if (start2 < ip + ml) {
+ int correction =
+ (int)(ip + ml - start2);
+ start2 += correction;
+ ref2 += correction;
+ ml2 -= correction;
+ if (ml2 < MINMATCH) {
+ start2 = start3;
+ ref2 = ref3;
+ ml2 = ml3;
+ }
+ }
+
+ lz4_encodesequence(&ip, &op, &anchor, ml, ref);
+ ip = start3;
+ ref = ref3;
+ ml = ml3;
+
+ start0 = start2;
+ ref0 = ref2;
+ ml0 = ml2;
+ goto _search2;
+ }
+
+ start2 = start3;
+ ref2 = ref3;
+ ml2 = ml3;
+ goto _search3;
+ }
+
+ /*
+ * OK, now we have 3 ascending matches; let's write at least
+ * the first one ip & ref are known; Now for ml
+ */
+ if (start2 < ip + ml) {
+ if ((start2 - ip) < (int)ML_MASK) {
+ int correction;
+ if (ml > OPTIMAL_ML)
+ ml = OPTIMAL_ML;
+ if (ip + ml > start2 + ml2 - MINMATCH)
+ ml = (int)(start2 - ip) + ml2
+ - MINMATCH;
+ correction = ml - (int)(start2 - ip);
+ if (correction > 0) {
+ start2 += correction;
+ ref2 += correction;
+ ml2 -= correction;
+ }
+ } else
+ ml = (int)(start2 - ip);
+ }
+ lz4_encodesequence(&ip, &op, &anchor, ml, ref);
+
+ ip = start2;
+ ref = ref2;
+ ml = ml2;
+
+ start2 = start3;
+ ref2 = ref3;
+ ml2 = ml3;
+
+ goto _search3;
+ }
+
+ /* Encode Last Literals */
+ lastrun = (int)(iend - anchor);
+ if (lastrun >= (int)RUN_MASK) {
+ *op++ = (RUN_MASK << ML_BITS);
+ lastrun -= RUN_MASK;
+ for (; lastrun > 254 ; lastrun -= 255)
+ *op++ = 255;
+ *op++ = (u8) lastrun;
+ } else
+ *op++ = (lastrun << ML_BITS);
+ memcpy(op, anchor, iend - anchor);
+ op += iend - anchor;
+ /* End */
+ return (int) (((char *)op) - dest);
+}
+
+int lz4hc_compress(const unsigned char *src, size_t src_len,
+ unsigned char *dst, size_t *dst_len, void *wrkmem)
+{
+ int ret = -1;
+ int out_len = 0;
+
+ struct lz4hc_data *hc4 = (struct lz4hc_data *)wrkmem;
+ lz4hc_init(hc4, (const u8 *)src);
+ out_len = lz4_compresshcctx((struct lz4hc_data *)hc4, (const u8 *)src,
+ (char *)dst, (int)src_len);
+
+ if (out_len < 0)
+ goto exit;
+
+ *dst_len = out_len;
+ return 0;
+
+exit:
+ return ret;
+}
+EXPORT_SYMBOL_GPL(lz4hc_compress);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("LZ4HC compressor");
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
new file mode 100644
index 000000000000..b2bfa1057f7b
--- /dev/null
+++ b/lib/percpu-refcount.c
@@ -0,0 +1,251 @@
+#define pr_fmt(fmt) "%s: " fmt "\n", __func__
+
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/percpu-refcount.h>
+#include <linux/rcupdate.h>
+
+/*
+ * A percpu refcount can be in 4 different modes. The state is tracked in the
+ * low two bits of percpu_ref->pcpu_count:
+ *
+ * PCPU_REF_NONE - the initial state, no percpu counters allocated.
+ *
+ * PCPU_REF_PTR - using percpu counters for the refcount.
+ *
+ * PCPU_REF_DYING - we're shutting down so get()/put() should use the embedded
+ * atomic counter, but we're not finished updating the atomic counter from the
+ * percpu counters - this means that percpu_ref_put() can't check for the ref
+ * hitting 0 yet.
+ *
+ * PCPU_REF_DEAD - we've finished the teardown sequence, percpu_ref_put() should
+ * now check for the ref hitting 0.
+ *
+ * In PCPU_REF_NONE mode, we need to count the number of times percpu_ref_get()
+ * is called; this is done with the high bits of the raw atomic counter. We also
+ * track the time, in jiffies, when the get count last wrapped - this is done
+ * with the remaining bits of percpu_ref->percpu_count.
+ *
+ * So, when percpu_ref_get() is called it increments the get count and checks if
+ * it wrapped; if it did, it checks if the last time it wrapped was less than
+ * one second ago; if so, we want to allocate percpu counters.
+ *
+ * PCPU_COUNT_BITS determines the threshold where we convert to percpu: of the
+ * raw 64 bit counter, we use PCPU_COUNT_BITS for the refcount, and the
+ * remaining (high) bits to count the number of times percpu_ref_get() has been
+ * called. It's currently (completely arbitrarily) 16384 times in one second.
+ *
+ * Percpu mode (PCPU_REF_PTR):
+ *
+ * In percpu mode all we do on get and put is increment or decrement the cpu
+ * local counter, which is a 32 bit unsigned int.
+ *
+ * Note that all the gets() could be happening on one cpu, and all the puts() on
+ * another - the individual cpu counters can wrap (potentially many times).
+ *
+ * But this is fine because we don't need to check for the ref hitting 0 in
+ * percpu mode; before we set the state to PCPU_REF_DEAD we simply sum up all
+ * the percpu counters and add them to the atomic counter. Since addition and
+ * subtraction in modular arithmatic is still associative, the result will be
+ * correct.
+ */
+
+#define PCPU_COUNT_BITS 50
+#define PCPU_COUNT_MASK ((1LL << PCPU_COUNT_BITS) - 1)
+
+#define PCPU_STATUS_BITS 2
+#define PCPU_STATUS_MASK ((1 << PCPU_STATUS_BITS) - 1)
+
+#define PCPU_REF_PTR 0
+#define PCPU_REF_NONE 1
+#define PCPU_REF_DYING 2
+#define PCPU_REF_DEAD 3
+
+#define REF_STATUS(count) (count & PCPU_STATUS_MASK)
+
+/**
+ * percpu_ref_init - initialize a dynamic percpu refcount
+ *
+ * Initializes the refcount in single atomic counter mode with a refcount of 1;
+ * analagous to atomic_set(ref, 1).
+ */
+void percpu_ref_init(struct percpu_ref *ref)
+{
+ unsigned long now = jiffies;
+
+ atomic64_set(&ref->count, 1);
+
+ now <<= PCPU_STATUS_BITS;
+ now |= PCPU_REF_NONE;
+
+ ref->pcpu_count = now;
+}
+
+static void percpu_ref_alloc(struct percpu_ref *ref, unsigned long pcpu_count)
+{
+ unsigned long new, now = jiffies;
+
+ now <<= PCPU_STATUS_BITS;
+ now |= PCPU_REF_NONE;
+
+ if (now - pcpu_count <= HZ << PCPU_STATUS_BITS) {
+ rcu_read_unlock();
+ new = (unsigned long) alloc_percpu(unsigned);
+ rcu_read_lock();
+
+ if (!new)
+ goto update_time;
+
+ BUG_ON(new & PCPU_STATUS_MASK);
+
+ if (cmpxchg(&ref->pcpu_count, pcpu_count, new) != pcpu_count)
+ free_percpu((void __percpu *) new);
+ else
+ pr_debug("created");
+ } else {
+update_time:
+ new = now;
+ cmpxchg(&ref->pcpu_count, pcpu_count, new);
+ }
+}
+
+void __percpu_ref_get(struct percpu_ref *ref, bool alloc)
+{
+ unsigned long pcpu_count;
+ uint64_t v;
+
+ pcpu_count = ACCESS_ONCE(ref->pcpu_count);
+
+ if (REF_STATUS(pcpu_count) == PCPU_REF_PTR) {
+ /* for rcu - we're not using rcu_dereference() */
+ smp_read_barrier_depends();
+ __this_cpu_inc(*((unsigned __percpu *) pcpu_count));
+ } else {
+ v = atomic64_add_return(1 + (1ULL << PCPU_COUNT_BITS),
+ &ref->count);
+
+ /*
+ * The high bits of the counter count the number of gets() that
+ * have occured; we check for overflow to call
+ * percpu_ref_alloc() every (1 << (64 - PCPU_COUNT_BITS))
+ * iterations.
+ */
+
+ if (!(v >> PCPU_COUNT_BITS) &&
+ REF_STATUS(pcpu_count) == PCPU_REF_NONE && alloc)
+ percpu_ref_alloc(ref, pcpu_count);
+ }
+}
+
+/**
+ * percpu_ref_put - decrement a dynamic percpu refcount
+ *
+ * Returns true if the result is 0, otherwise false; only checks for the ref
+ * hitting 0 after percpu_ref_kill() has been called. Analagous to
+ * atomic_dec_and_test().
+ */
+int percpu_ref_put(struct percpu_ref *ref)
+{
+ unsigned long pcpu_count;
+ uint64_t v;
+ int ret = 0;
+
+ rcu_read_lock();
+
+ pcpu_count = ACCESS_ONCE(ref->pcpu_count);
+
+ switch (REF_STATUS(pcpu_count)) {
+ case PCPU_REF_PTR:
+ /* for rcu - we're not using rcu_dereference() */
+ smp_read_barrier_depends();
+ __this_cpu_dec(*((unsigned __percpu *) pcpu_count));
+ break;
+ case PCPU_REF_NONE:
+ case PCPU_REF_DYING:
+ atomic64_dec(&ref->count);
+ break;
+ case PCPU_REF_DEAD:
+ v = atomic64_dec_return(&ref->count);
+ v &= PCPU_COUNT_MASK;
+
+ ret = v == 0;
+ break;
+ }
+
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/**
+ * percpu_ref_kill - prepare a dynamic percpu refcount for teardown
+ *
+ * Must be called before dropping the initial ref, so that percpu_ref_put()
+ * knows to check for the refcount hitting 0. If the refcount was in percpu
+ * mode, converts it back to single atomic counter mode.
+ *
+ * Returns true the first time called on @ref and false if @ref is already
+ * shutting down, so it may be used by the caller for synchronizing other parts
+ * of a two stage shutdown.
+ */
+int percpu_ref_kill(struct percpu_ref *ref)
+{
+ unsigned long old, new, status, pcpu_count;
+
+ pcpu_count = ACCESS_ONCE(ref->pcpu_count);
+
+ do {
+ status = REF_STATUS(pcpu_count);
+
+ switch (status) {
+ case PCPU_REF_PTR:
+ new = PCPU_REF_DYING;
+ break;
+ case PCPU_REF_NONE:
+ new = PCPU_REF_DEAD;
+ break;
+ case PCPU_REF_DYING:
+ case PCPU_REF_DEAD:
+ return 0;
+ }
+
+ old = pcpu_count;
+ pcpu_count = cmpxchg(&ref->pcpu_count, old, new);
+ } while (pcpu_count != old);
+
+ if (status == PCPU_REF_PTR) {
+ unsigned count = 0, cpu;
+
+ synchronize_rcu();
+
+ for_each_possible_cpu(cpu)
+ count += *per_cpu_ptr((unsigned __percpu *)pcpu_count,
+ cpu);
+
+ pr_debug("global %lli pcpu %i",
+ atomic64_read(&ref->count) & PCPU_COUNT_MASK,
+ (int) count);
+
+ atomic64_add((int) count, &ref->count);
+ smp_wmb();
+ /* Between setting global count and setting PCPU_REF_DEAD */
+ ref->pcpu_count = PCPU_REF_DEAD;
+
+ free_percpu((unsigned __percpu *) pcpu_count);
+ }
+
+ return 1;
+}
+
+/**
+ * percpu_ref_dead - check if a dynamic percpu refcount is shutting down
+ *
+ * Returns true if percpu_ref_kill() has been called on @ref, false otherwise.
+ */
+int percpu_ref_dead(struct percpu_ref *ref)
+{
+ unsigned status = REF_STATUS(ref->pcpu_count);
+
+ return status == PCPU_REF_DYING ||
+ status == PCPU_REF_DEAD;
+}
diff --git a/mm/Kconfig b/mm/Kconfig
index e742d06285b7..b0a7dad663f3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -477,3 +477,15 @@ config FRONTSWAP
and swap data is stored as normal on the matching swap device.
If unsure, say Y to enable frontswap.
+
+config MEM_SOFT_DIRTY
+ bool "Track memory changes"
+ depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY
+ select PROC_PAGE_MONITOR
+ help
+ This option enables memory changes tracking by introducing a
+ soft-dirty bit on pte-s. This bit it set when someone writes
+ into a page just as regular dirty bit, but unlike the latter
+ it can be cleared by hands.
+
+ See Documentation/vm/soft-dirty.txt for more details.
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 07dbc8ec46cf..2c8ce496804f 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -242,6 +242,7 @@ bool balloon_page_isolate(struct page *page)
if (__is_movable_balloon_page(page) &&
page_count(page) == 2) {
__isolate_balloon_page(page);
+ balloon_event_count(COMPACTBALLOONISOLATED);
unlock_page(page);
return true;
}
@@ -265,6 +266,7 @@ void balloon_page_putback(struct page *page)
__putback_balloon_page(page);
/* drop the extra ref count taken for page isolation */
put_page(page);
+ balloon_event_count(COMPACTBALLOONRETURNED);
} else {
WARN_ON(1);
dump_page(page);
diff --git a/mm/bounce.c b/mm/bounce.c
index c9f0a4339a7d..708c1e99f57b 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -147,12 +147,14 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
bio_put(bio);
}
-static void bounce_end_io_write(struct bio *bio, int err)
+static void bounce_end_io_write(struct bio *bio, int err,
+ struct batch_complete *batch)
{
bounce_end_io(bio, page_pool, err);
}
-static void bounce_end_io_write_isa(struct bio *bio, int err)
+static void bounce_end_io_write_isa(struct bio *bio, int err,
+ struct batch_complete *batch)
{
bounce_end_io(bio, isa_page_pool, err);
@@ -168,12 +170,14 @@ static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err)
bounce_end_io(bio, pool, err);
}
-static void bounce_end_io_read(struct bio *bio, int err)
+static void bounce_end_io_read(struct bio *bio, int err,
+ struct batch_complete *batch)
{
__bounce_end_io_read(bio, page_pool, err);
}
-static void bounce_end_io_read_isa(struct bio *bio, int err)
+static void bounce_end_io_read_isa(struct bio *bio, int err,
+ struct batch_complete *batch)
{
__bounce_end_io_read(bio, isa_page_pool, err);
}
diff --git a/mm/dmapool.c b/mm/dmapool.c
index c69781e97cf9..668f26316e2e 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -132,6 +132,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
{
struct dma_pool *retval;
size_t allocation;
+ int node;
if (align == 0) {
align = 1;
@@ -156,7 +157,9 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
return NULL;
}
- retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev));
+ node = WARN_ON(!dev) ? -1 : dev_to_node(dev);
+
+ retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, node);
if (!retval)
return retval;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 03a89a2f464b..d252c6e0fd03 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1429,7 +1429,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
if (ret == 1) {
pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
- set_pmd_at(mm, new_addr, new_pmd, pmd);
+ set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
spin_unlock(&mm->page_table_lock);
}
out:
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0f1d92163f30..fe4f123f1170 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -319,14 +319,31 @@ struct mem_cgroup {
/* thresholds for mem+swap usage. RCU-protected */
struct mem_cgroup_thresholds memsw_thresholds;
- /* For oom notifier event fd */
- struct list_head oom_notify;
+ union {
+ /* For oom notifier event fd */
+ struct list_head oom_notify;
+ /*
+ * we can only trigger an oom event if the memcg is alive.
+ * so we will reuse this field to hook the memcg in the list
+ * of dead memcgs.
+ */
+ struct list_head dead;
+ };
- /*
- * Should we move charges of a task when a task is moved into this
- * mem_cgroup ? And what type of charges should we move ?
- */
- unsigned long move_charge_at_immigrate;
+ union {
+ /*
+ * Should we move charges of a task when a task is moved into
+ * this mem_cgroup ? And what type of charges should we move ?
+ */
+ unsigned long move_charge_at_immigrate;
+
+ /*
+ * We are no longer concerned about moving charges after memcg
+ * is dead. So we will fill this up with its name, to aid
+ * debugging.
+ */
+ char *memcg_name;
+ };
/*
* set > 0 if pages under this cgroup are moving to other cgroup.
*/
@@ -380,6 +397,55 @@ static size_t memcg_size(void)
nr_node_ids * sizeof(struct mem_cgroup_per_node);
}
+#ifdef CONFIG_MEMCG_DEBUG_ASYNC_DESTROY
+static LIST_HEAD(dangling_memcgs);
+static DEFINE_MUTEX(dangling_memcgs_mutex);
+
+static inline void memcg_dangling_free(struct mem_cgroup *memcg)
+{
+ mutex_lock(&dangling_memcgs_mutex);
+ list_del(&memcg->dead);
+ mutex_unlock(&dangling_memcgs_mutex);
+ free_pages((unsigned long)memcg->memcg_name, 0);
+}
+
+static inline void memcg_dangling_add(struct mem_cgroup *memcg)
+{
+ /*
+ * cgroup.c will do page-sized allocations most of the time,
+ * so we'll just follow the pattern. Also, __get_free_pages
+ * is a better interface than kmalloc for us here, because
+ * we'd like this memory to be always billed to the root cgroup,
+ * not to the process removing the memcg. While kmalloc would
+ * require us to wrap it into memcg_stop/resume_kmem_account,
+ * with __get_free_pages we just don't pass the memcg flag.
+ */
+ memcg->memcg_name = (char *)__get_free_pages(GFP_KERNEL, 0);
+
+ /*
+ * we will, in general, just ignore failures. No need to go crazy,
+ * being this just a debugging interface. It is nice to copy a memcg
+ * name over, but if we (unlikely) can't, just the address will do
+ */
+ if (!memcg->memcg_name)
+ goto add_list;
+
+ if (cgroup_path(memcg->css.cgroup, memcg->memcg_name, PAGE_SIZE) < 0) {
+ free_pages((unsigned long)memcg->memcg_name, 0);
+ memcg->memcg_name = NULL;
+ }
+
+add_list:
+ INIT_LIST_HEAD(&memcg->dead);
+ mutex_lock(&dangling_memcgs_mutex);
+ list_add(&memcg->dead, &dangling_memcgs);
+ mutex_unlock(&dangling_memcgs_mutex);
+}
+#else
+static inline void memcg_dangling_free(struct mem_cgroup *memcg) {}
+static inline void memcg_dangling_add(struct mem_cgroup *memcg) {}
+#endif
+
/* internal only representation about the status of kmem accounting. */
enum {
KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
@@ -5076,6 +5142,107 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
return simple_read_from_buffer(buf, nbytes, ppos, str, len);
}
+#ifdef CONFIG_MEMCG_DEBUG_ASYNC_DESTROY
+static void
+mem_cgroup_dangling_swap(struct mem_cgroup *memcg, struct seq_file *m)
+{
+#ifdef CONFIG_MEMCG_SWAP
+ u64 kmem;
+ u64 memsw;
+
+ /*
+ * kmem will also propagate here, so we are only interested in the
+ * difference. See comment in mem_cgroup_reparent_charges for details.
+ *
+ * We could save this value for later consumption by kmem reports, but
+ * there is not a lot of problem if the figures differ slightly.
+ */
+ kmem = res_counter_read_u64(&memcg->kmem, RES_USAGE);
+ memsw = res_counter_read_u64(&memcg->memsw, RES_USAGE) - kmem;
+ seq_printf(m, "\t%llu swap bytes\n", memsw);
+#endif
+}
+
+
+static void
+mem_cgroup_dangling_tcp(struct mem_cgroup *memcg, struct seq_file *m)
+{
+#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
+ struct tcp_memcontrol *tcp = &memcg->tcp_mem;
+ s64 tcp_socks;
+ u64 tcp_bytes;
+
+ tcp_socks = percpu_counter_sum_positive(&tcp->tcp_sockets_allocated);
+ tcp_bytes = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE);
+ seq_printf(m, "\t%llu tcp bytes", tcp_bytes);
+ /*
+ * if tcp_bytes == 0, tcp_socks != 0 is a bug. One more reason to print
+ * it!
+ */
+ if (tcp_bytes || tcp_socks)
+ seq_printf(m, ", in %lld sockets", tcp_socks);
+ seq_printf(m, "\n");
+
+#endif
+}
+
+static void
+mem_cgroup_dangling_kmem(struct mem_cgroup *memcg, struct seq_file *m)
+{
+#ifdef CONFIG_MEMCG_KMEM
+ u64 kmem;
+ struct memcg_cache_params *params;
+
+ kmem = res_counter_read_u64(&memcg->kmem, RES_USAGE);
+ seq_printf(m, "\t%llu kmem bytes", kmem);
+
+ /* list below may not be initialized, so not even try */
+ if (!kmem)
+ return;
+
+ seq_printf(m, " in caches");
+ mutex_lock(&memcg->slab_caches_mutex);
+ list_for_each_entry(params, &memcg->memcg_slab_caches, list) {
+ struct kmem_cache *s = memcg_params_to_cache(params);
+
+ seq_printf(m, " %s", s->name);
+ }
+ mutex_unlock(&memcg->slab_caches_mutex);
+ seq_printf(m, "\n");
+#endif
+}
+
+/*
+ * After a memcg is destroyed, it may still be kept around in memory.
+ * Currently, the two main reasons for it are swap entries, and kernel memory.
+ * Because they will be freed assynchronously, they will pin the memcg structure
+ * and its resources until the last reference goes away.
+ *
+ * This root-only file will show information about which users
+ */
+static int mem_cgroup_dangling_read(struct cgroup *cont, struct cftype *cft,
+ struct seq_file *m)
+{
+ struct mem_cgroup *memcg;
+
+ mutex_lock(&dangling_memcgs_mutex);
+
+ list_for_each_entry(memcg, &dangling_memcgs, dead) {
+ if (memcg->memcg_name)
+ seq_printf(m, "%s:\n", memcg->memcg_name);
+ else
+ seq_printf(m, "%p (name lost):\n", memcg);
+
+ mem_cgroup_dangling_swap(memcg, m);
+ mem_cgroup_dangling_tcp(memcg, m);
+ mem_cgroup_dangling_kmem(memcg, m);
+ }
+
+ mutex_unlock(&dangling_memcgs_mutex);
+ return 0;
+}
+#endif
+
static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
{
int ret = -EINVAL;
@@ -5977,6 +6144,14 @@ static struct cftype mem_cgroup_files[] = {
},
#endif
#endif
+
+#ifdef CONFIG_MEMCG_DEBUG_ASYNC_DESTROY
+ {
+ .name = "dangling_memcgs",
+ .read_seq_string = mem_cgroup_dangling_read,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+#endif
{ }, /* terminate */
};
@@ -6126,6 +6301,8 @@ static void free_work(struct work_struct *work)
struct mem_cgroup *memcg;
memcg = container_of(work, struct mem_cgroup, work_freeing);
+
+ memcg_dangling_free(memcg);
__mem_cgroup_free(memcg);
}
@@ -6320,6 +6497,7 @@ static void mem_cgroup_css_free(struct cgroup *cont)
kmem_cgroup_destroy(memcg);
+ memcg_dangling_add(memcg);
mem_cgroup_put(memcg);
}
diff --git a/mm/migrate.c b/mm/migrate.c
index 27ed22579fd9..c9c5eeebe5a4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -876,6 +876,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
dec_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
balloon_page_free(page);
+ balloon_event_count(COMPACTBALLOONMIGRATED);
return MIGRATEPAGE_SUCCESS;
}
out:
diff --git a/mm/mmap.c b/mm/mmap.c
index da3e9c04bf37..eb390ee646c5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1867,15 +1867,6 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
}
#endif
-void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
-{
- /*
- * Is this a new hole at the lowest possible address?
- */
- if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache)
- mm->free_area_cache = addr;
-}
-
/*
* This mmap-allocator allocates new areas top-down from below the
* stack's low limit (the base):
@@ -1932,19 +1923,6 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
}
#endif
-void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr)
-{
- /*
- * Is this a new hole at the highest possible address?
- */
- if (addr > mm->free_area_cache)
- mm->free_area_cache = addr;
-
- /* dont allow allocations above current base */
- if (mm->free_area_cache > mm->mmap_base)
- mm->free_area_cache = mm->mmap_base;
-}
-
unsigned long
get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
@@ -2365,7 +2343,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct vm_area_struct **insertion_point;
struct vm_area_struct *tail_vma = NULL;
- unsigned long addr;
insertion_point = (prev ? &prev->vm_next : &mm->mmap);
vma->vm_prev = NULL;
@@ -2382,11 +2359,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
} else
mm->highest_vm_end = prev ? prev->vm_end : 0;
tail_vma->vm_next = NULL;
- if (mm->unmap_area == arch_unmap_area)
- addr = prev ? prev->vm_end : mm->mmap_base;
- else
- addr = vma ? vma->vm_start : mm->mmap_base;
- mm->unmap_area(mm, addr);
mm->mmap_cache = NULL; /* Kill the cache. */
}
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index 3dcfaf4ed355..8a8cd0265e52 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -14,9 +14,6 @@
* use_mm
* Makes the calling kernel thread take on the specified
* mm context.
- * Called by the retry thread execute retries within the
- * iocb issuer's mm context, so that copy_from/to_user
- * operations work seamlessly for aio.
* (Note: this routine is intended to be called only
* from a kernel thread context)
*/
diff --git a/mm/mremap.c b/mm/mremap.c
index 463a25705ac6..3708655378e9 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -126,7 +126,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
continue;
pte = ptep_get_and_clear(mm, old_addr, old_pte);
pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
- set_pte_at(mm, new_addr, new_pte, pte);
+ set_pte_at(mm, new_addr, new_pte, pte_mksoft_dirty(pte));
}
arch_leave_lazy_mmu_mode();
diff --git a/mm/nommu.c b/mm/nommu.c
index 298884dcd6e7..886e07ce4d1d 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1869,10 +1869,6 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
return -ENOMEM;
}
-void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
-{
-}
-
void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen,
int even_cows)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 98cbdf6e5532..76350aacdfe5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3926,8 +3926,11 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
* exist on hotplugged memory.
*/
if (context == MEMMAP_EARLY) {
- if (!early_pfn_valid(pfn))
+ if (!early_pfn_valid(pfn)) {
+ pfn = ALIGN(pfn + MAX_ORDER_NR_PAGES,
+ MAX_ORDER_NR_PAGES) - 1;
continue;
+ }
if (!early_pfn_in_nid(pfn, nid))
continue;
}
diff --git a/mm/page_io.c b/mm/page_io.c
index e9a3f6334352..a29407666467 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -20,6 +20,7 @@
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/frontswap.h>
+#include <linux/aio.h>
#include <asm/pgtable.h>
static struct bio *get_swap_bio(gfp_t gfp_flags,
@@ -41,7 +42,8 @@ static struct bio *get_swap_bio(gfp_t gfp_flags,
return bio;
}
-void end_swap_bio_write(struct bio *bio, int err)
+void end_swap_bio_write(struct bio *bio, int err,
+ struct batch_complete *batch)
{
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct page *page = bio->bi_io_vec[0].bv_page;
@@ -67,7 +69,7 @@ void end_swap_bio_write(struct bio *bio, int err)
bio_put(bio);
}
-void end_swap_bio_read(struct bio *bio, int err)
+void end_swap_bio_read(struct bio *bio, int err, struct batch_complete *batch)
{
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct page *page = bio->bi_io_vec[0].bv_page;
@@ -202,7 +204,8 @@ out:
}
int __swap_writepage(struct page *page, struct writeback_control *wbc,
- void (*end_write_func)(struct bio *, int))
+ void (*end_write_func)(struct bio *bio, int err,
+ struct batch_complete *batch))
{
struct bio *bio;
int ret = 0, rw = WRITE;
diff --git a/mm/shmem.c b/mm/shmem.c
index 39b2a0b86fe8..5e6a8422658b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -31,6 +31,7 @@
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/swap.h>
+#include <linux/aio.h>
static struct vfsmount *shm_mnt;
diff --git a/mm/swap.c b/mm/swap.c
index acd40bfffa82..dfd7d71d6841 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -30,6 +30,7 @@
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>
#include <linux/gfp.h>
+#include <linux/uio.h>
#include "internal.h"
diff --git a/mm/util.c b/mm/util.c
index ab1424dbe2e6..7441c41d00f6 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -295,7 +295,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
{
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = arch_get_unmapped_area;
- mm->unmap_area = arch_unmap_area;
}
#endif
diff --git a/mm/vmstat.c b/mm/vmstat.c
index f42745e65780..7a35116462a7 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -794,7 +794,14 @@ const char * const vmstat_text[] = {
"compact_stall",
"compact_fail",
"compact_success",
-#endif
+
+#ifdef CONFIG_BALLOON_COMPACTION
+ "compact_balloon_isolated",
+ "compact_balloon_migrated",
+ "compact_balloon_returned",
+#endif /* CONFIG_BALLOON_COMPACTION */
+
+#endif /* CONFIG_COMPACTION */
#ifdef CONFIG_HUGETLB_PAGE
"htlb_buddy_alloc_success",
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 5b142fb16480..70146496e73a 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1716,9 +1716,9 @@ static struct ctl_table vs_vars[] = {
},
{
.procname = "sync_qlen_max",
- .maxlen = sizeof(int),
+ .maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_doulongvec_minmax,
},
{
.procname = "sync_sock_size",
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index 51bb3de680b6..a0ab6d7599f1 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -311,6 +311,11 @@ cmd_lzo = (cat $(filter-out FORCE,$^) | \
lzop -9 && $(call size_append, $(filter-out FORCE,$^))) > $@ || \
(rm -f $@ ; false)
+quiet_cmd_lz4 = LZ4 $@
+cmd_lz4 = (cat $(filter-out FORCE,$^) | \
+ lz4demo -c1 stdin stdout && $(call size_append, $(filter-out FORCE,$^))) > $@ || \
+ (rm -f $@ ; false)
+
# U-Boot mkimage
# ---------------------------------------------------------------------------
diff --git a/security/keys/internal.h b/security/keys/internal.h
index 8bbefc3b55d4..d4f1468b9b50 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -16,6 +16,8 @@
#include <linux/key-type.h>
#include <linux/task_work.h>
+struct iovec;
+
#ifdef __KDEBUG
#define kenter(FMT, ...) \
printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 4b5c948eb414..33cfd27b4de2 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -22,6 +22,7 @@
#include <linux/err.h>
#include <linux/vmalloc.h>
#include <linux/security.h>
+#include <linux/uio.h>
#include <asm/uaccess.h>
#include "internal.h"
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index 23e3c46cd0a4..ccfa383f1fda 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -25,7 +25,7 @@
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/pm_qos.h>
-#include <linux/uio.h>
+#include <linux/aio.h>
#include <linux/dma-mapping.h>
#include <sound/core.h>
#include <sound/control.h>
diff --git a/sound/soc/codecs/si476x.c b/sound/soc/codecs/si476x.c
index 721587c9cd84..73e205c892a0 100644
--- a/sound/soc/codecs/si476x.c
+++ b/sound/soc/codecs/si476x.c
@@ -38,9 +38,9 @@ enum si476x_digital_io_output_format {
SI476X_DIGITAL_IO_SAMPLE_SIZE_SHIFT = 8,
};
-#define SI476X_DIGITAL_IO_OUTPUT_WIDTH_MASK ((0b111 << SI476X_DIGITAL_IO_SLOT_SIZE_SHIFT) | \
- (0b111 << SI476X_DIGITAL_IO_SAMPLE_SIZE_SHIFT))
-#define SI476X_DIGITAL_IO_OUTPUT_FORMAT_MASK (0b1111110)
+#define SI476X_DIGITAL_IO_OUTPUT_WIDTH_MASK ((0x7 << SI476X_DIGITAL_IO_SLOT_SIZE_SHIFT) | \
+ (0x7 << SI476X_DIGITAL_IO_SAMPLE_SIZE_SHIFT))
+#define SI476X_DIGITAL_IO_OUTPUT_FORMAT_MASK (0x7e)
enum si476x_daudio_formats {
SI476X_DAUDIO_MODE_I2S = (0x0 << 1),
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index d4abc59ce1d9..a2c214e39fd4 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -2,6 +2,7 @@ TARGETS = breakpoints
TARGETS += cpu-hotplug
TARGETS += efivarfs
TARGETS += kcmp
+TARGETS += timers
TARGETS += memory-hotplug
TARGETS += mqueue
TARGETS += net
diff --git a/tools/testing/selftests/timers/Makefile b/tools/testing/selftests/timers/Makefile
new file mode 100644
index 000000000000..eb2859f4ad21
--- /dev/null
+++ b/tools/testing/selftests/timers/Makefile
@@ -0,0 +1,8 @@
+all:
+ gcc posix_timers.c -o posix_timers -lrt
+
+run_tests: all
+ ./posix_timers
+
+clean:
+ rm -f ./posix_timers
diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c
new file mode 100644
index 000000000000..4fa655d68a81
--- /dev/null
+++ b/tools/testing/selftests/timers/posix_timers.c
@@ -0,0 +1,221 @@
+/*
+ * Copyright (C) 2013 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com>
+ *
+ * Licensed under the terms of the GNU GPL License version 2
+ *
+ * Selftests for a few posix timers interface.
+ *
+ * Kernel loop code stolen from Steven Rostedt <srostedt@redhat.com>
+ */
+
+#include <sys/time.h>
+#include <stdio.h>
+#include <signal.h>
+#include <unistd.h>
+#include <time.h>
+#include <pthread.h>
+
+#define DELAY 2
+#define USECS_PER_SEC 1000000
+
+static volatile int done;
+
+/* Busy loop in userspace to elapse ITIMER_VIRTUAL */
+static void user_loop(void)
+{
+ while (!done);
+}
+
+/*
+ * Try to spend as much time as possible in kernelspace
+ * to elapse ITIMER_PROF.
+ */
+static void kernel_loop(void)
+{
+ void *addr = sbrk(0);
+
+ while (!done) {
+ brk(addr + 4096);
+ brk(addr);
+ }
+}
+
+/*
+ * Sleep until ITIMER_REAL expiration.
+ */
+static void idle_loop(void)
+{
+ pause();
+}
+
+static void sig_handler(int nr)
+{
+ done = 1;
+}
+
+/*
+ * Check the expected timer expiration matches the GTOD elapsed delta since
+ * we armed the timer. Keep a 0.5 sec error margin due to various jitter.
+ */
+static int check_diff(struct timeval start, struct timeval end)
+{
+ long long diff;
+
+ diff = end.tv_usec - start.tv_usec;
+ diff += (end.tv_sec - start.tv_sec) * USECS_PER_SEC;
+
+ if (abs(diff - DELAY * USECS_PER_SEC) > USECS_PER_SEC / 2) {
+ printf("Diff too high: %lld..", diff);
+ return -1;
+ }
+
+ return 0;
+}
+
+static int check_itimer(int which)
+{
+ int err;
+ struct timeval start, end;
+ struct itimerval val = {
+ .it_value.tv_sec = DELAY,
+ };
+
+ printf("Check itimer ");
+
+ if (which == ITIMER_VIRTUAL)
+ printf("virtual... ");
+ else if (which == ITIMER_PROF)
+ printf("prof... ");
+ else if (which == ITIMER_REAL)
+ printf("real... ");
+
+ fflush(stdout);
+
+ done = 0;
+
+ if (which == ITIMER_VIRTUAL)
+ signal(SIGVTALRM, sig_handler);
+ else if (which == ITIMER_PROF)
+ signal(SIGPROF, sig_handler);
+ else if (which == ITIMER_REAL)
+ signal(SIGALRM, sig_handler);
+
+ err = gettimeofday(&start, NULL);
+ if (err < 0) {
+ perror("Can't call gettimeofday()\n");
+ return -1;
+ }
+
+ err = setitimer(which, &val, NULL);
+ if (err < 0) {
+ perror("Can't set timer\n");
+ return -1;
+ }
+
+ if (which == ITIMER_VIRTUAL)
+ user_loop();
+ else if (which == ITIMER_PROF)
+ kernel_loop();
+ else if (which == ITIMER_REAL)
+ idle_loop();
+
+ gettimeofday(&end, NULL);
+ if (err < 0) {
+ perror("Can't call gettimeofday()\n");
+ return -1;
+ }
+
+ if (!check_diff(start, end))
+ printf("[OK]\n");
+ else
+ printf("[FAIL]\n");
+
+ return 0;
+}
+
+static int check_timer_create(int which)
+{
+ int err;
+ timer_t id;
+ struct timeval start, end;
+ struct itimerspec val = {
+ .it_value.tv_sec = DELAY,
+ };
+
+ printf("Check timer_create() ");
+ if (which == CLOCK_THREAD_CPUTIME_ID) {
+ printf("per thread... ");
+ } else if (which == CLOCK_PROCESS_CPUTIME_ID) {
+ printf("per process... ");
+ }
+ fflush(stdout);
+
+ done = 0;
+ timer_create(which, NULL, &id);
+ if (err < 0) {
+ perror("Can't create timer\n");
+ return -1;
+ }
+ signal(SIGALRM, sig_handler);
+
+ err = gettimeofday(&start, NULL);
+ if (err < 0) {
+ perror("Can't call gettimeofday()\n");
+ return -1;
+ }
+
+ err = timer_settime(id, 0, &val, NULL);
+ if (err < 0) {
+ perror("Can't set timer\n");
+ return -1;
+ }
+
+ user_loop();
+
+ gettimeofday(&end, NULL);
+ if (err < 0) {
+ perror("Can't call gettimeofday()\n");
+ return -1;
+ }
+
+ if (!check_diff(start, end))
+ printf("[OK]\n");
+ else
+ printf("[FAIL]\n");
+
+ return 0;
+}
+
+int main(int argc, char **argv)
+{
+ int err;
+
+ printf("Testing posix timers. False negative may happen on CPU execution \n");
+ printf("based timers if other threads run on the CPU...\n");
+
+ if (check_itimer(ITIMER_VIRTUAL) < 0)
+ return -1;
+
+ if (check_itimer(ITIMER_PROF) < 0)
+ return -1;
+
+ if (check_itimer(ITIMER_REAL) < 0)
+ return -1;
+
+ if (check_timer_create(CLOCK_THREAD_CPUTIME_ID) < 0)
+ return -1;
+
+ /*
+ * It's unfortunately hard to reliably test a timer expiration
+ * on parallel multithread cputime. We could arm it to expire
+ * on DELAY * nr_threads, with nr_threads busy looping, then wait
+ * the normal DELAY since the time is elapsing nr_threads faster.
+ * But for that we need to ensure we have real physical free CPUs
+ * to ensure true parallelism. So test only one thread until we
+ * find a better solution.
+ */
+ if (check_timer_create(CLOCK_PROCESS_CPUTIME_ID) < 0)
+ return -1;
+
+ return 0;
+}
diff --git a/usr/Kconfig b/usr/Kconfig
index 085872bb2bb5..642f503d3e9f 100644
--- a/usr/Kconfig
+++ b/usr/Kconfig
@@ -90,6 +90,15 @@ config RD_LZO
Support loading of a LZO encoded initial ramdisk or cpio buffer
If unsure, say N.
+config RD_LZ4
+ bool "Support initial ramdisks compressed using LZ4" if EXPERT
+ default !EXPERT
+ depends on BLK_DEV_INITRD
+ select DECOMPRESS_LZ4
+ help
+ Support loading of a LZ4 encoded initial ramdisk or cpio buffer
+ If unsure, say N.
+
choice
prompt "Built-in initramfs compression mode" if INITRAMFS_SOURCE!=""
help