author     Stephen Rothwell <sfr@canb.auug.org.au>  2013-03-01 15:18:28 +1100
committer  Stephen Rothwell <sfr@canb.auug.org.au>  2013-03-01 15:31:07 +1100
commit     9c9ce516d8594d02b113cd27b92b15b272e13859 (patch)
tree       426456b03bf50eb440da78bbc471ed408443e868
parent     5e73a53613dab2589117c595acafa1b8c682e951 (diff)
parent     cc321d523dc4a7e0e1a4adaa70342f1ca9af2964 (diff)
Merge branch 'akpm/master'
-rw-r--r--  CREDITS | 5
-rw-r--r--  Documentation/cgroups/memory.txt | 16
-rw-r--r--  Documentation/sysctl/vm.txt | 33
-rw-r--r--  MAINTAINERS | 17
-rw-r--r--  arch/arm/boot/dts/armada-370-xp.dtsi | 6
-rw-r--r--  arch/arm/mm/mmap.c | 2
-rw-r--r--  arch/arm64/mm/mmap.c | 2
-rw-r--r--  arch/mips/mm/mmap.c | 2
-rw-r--r--  arch/powerpc/include/asm/page_64.h | 3
-rw-r--r--  arch/powerpc/mm/hugetlbpage.c | 2
-rw-r--r--  arch/powerpc/mm/mmap_64.c | 2
-rw-r--r--  arch/powerpc/mm/slice.c | 223
-rw-r--r--  arch/powerpc/platforms/cell/spufs/file.c | 2
-rw-r--r--  arch/s390/hypfs/inode.c | 1
-rw-r--r--  arch/s390/mm/mmap.c | 4
-rw-r--r--  arch/sparc/kernel/sys_sparc_64.c | 2
-rw-r--r--  arch/tile/mm/mmap.c | 2
-rw-r--r--  arch/x86/ia32/ia32_aout.c | 2
-rw-r--r--  arch/x86/include/asm/e820.h | 2
-rw-r--r--  arch/x86/kernel/e820.c | 72
-rw-r--r--  arch/x86/mm/mmap.c | 2
-rw-r--r--  arch/x86/platform/efi/efi.c | 15
-rw-r--r--  block/blk-core.c | 34
-rw-r--r--  block/blk-flush.c | 2
-rw-r--r--  block/blk.h | 3
-rw-r--r--  block/genhd.c | 2
-rw-r--r--  block/scsi_ioctl.c | 1
-rw-r--r--  drivers/block/mtip32xx/mtip32xx.c | 64
-rw-r--r--  drivers/block/mtip32xx/mtip32xx.h | 8
-rw-r--r--  drivers/block/swim3.c | 2
-rw-r--r--  drivers/block/virtio_blk.c | 31
-rw-r--r--  drivers/char/mem.c | 36
-rw-r--r--  drivers/gpu/drm/drm_fb_helper.c | 8
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_file_ops.c | 1
-rw-r--r--  drivers/infiniband/hw/qib/qib_file_ops.c | 2
-rw-r--r--  drivers/leds/leds-ot200.c | 14
-rw-r--r--  drivers/md/dm.c | 2
-rw-r--r--  drivers/rtc/rtc-ds1307.c | 60
-rw-r--r--  drivers/rtc/rtc-pxa.c | 64
-rw-r--r--  drivers/scsi/sg.c | 1
-rw-r--r--  drivers/staging/android/logger.c | 1
-rw-r--r--  drivers/usb/gadget/amd5536udc.c | 16
-rw-r--r--  drivers/usb/gadget/inode.c | 42
-rw-r--r--  drivers/video/cyber2000fb.c | 3
-rw-r--r--  fs/9p/vfs_addr.c | 1
-rw-r--r--  fs/afs/write.c | 1
-rw-r--r--  fs/aio.c | 1770
-rw-r--r--  fs/binfmt_aout.c | 2
-rw-r--r--  fs/binfmt_elf.c | 23
-rw-r--r--  fs/bio.c | 52
-rw-r--r--  fs/block_dev.c | 9
-rw-r--r--  fs/btrfs/file.c | 4
-rw-r--r--  fs/btrfs/inode.c | 1
-rw-r--r--  fs/ceph/file.c | 1
-rw-r--r--  fs/cifs/file.c | 3
-rw-r--r--  fs/compat.c | 1
-rw-r--r--  fs/coredump.c | 6
-rw-r--r--  fs/direct-io.c | 21
-rw-r--r--  fs/drop_caches.c | 2
-rw-r--r--  fs/ecryptfs/file.c | 1
-rw-r--r--  fs/eventpoll.c | 40
-rw-r--r--  fs/ext2/inode.c | 1
-rw-r--r--  fs/ext3/inode.c | 1
-rw-r--r--  fs/ext4/file.c | 1
-rw-r--r--  fs/ext4/indirect.c | 1
-rw-r--r--  fs/ext4/inode.c | 1
-rw-r--r--  fs/ext4/page-io.c | 1
-rw-r--r--  fs/f2fs/data.c | 1
-rw-r--r--  fs/fat/inode.c | 1
-rw-r--r--  fs/fuse/cuse.c | 1
-rw-r--r--  fs/fuse/dev.c | 1
-rw-r--r--  fs/fuse/file.c | 4
-rw-r--r--  fs/gfs2/aops.c | 1
-rw-r--r--  fs/gfs2/file.c | 1
-rw-r--r--  fs/hfs/inode.c | 1
-rw-r--r--  fs/hfsplus/inode.c | 1
-rw-r--r--  fs/jfs/inode.c | 1
-rw-r--r--  fs/nfsd/nfs4state.c | 1
-rw-r--r--  fs/nilfs2/inode.c | 2
-rw-r--r--  fs/ntfs/file.c | 4
-rw-r--r--  fs/ntfs/inode.c | 1
-rw-r--r--  fs/ocfs2/aops.h | 2
-rw-r--r--  fs/ocfs2/dlmglue.c | 2
-rw-r--r--  fs/ocfs2/file.c | 7
-rw-r--r--  fs/ocfs2/inode.h | 2
-rw-r--r--  fs/pipe.c | 1
-rw-r--r--  fs/read_write.c | 35
-rw-r--r--  fs/reiserfs/inode.c | 1
-rw-r--r--  fs/splice.c | 3
-rw-r--r--  fs/ubifs/file.c | 1
-rw-r--r--  fs/udf/inode.c | 1
-rw-r--r--  fs/xfs/xfs_aops.c | 1
-rw-r--r--  fs/xfs/xfs_file.c | 4
-rw-r--r--  include/linux/aio.h | 199
-rw-r--r--  include/linux/balloon_compaction.h | 7
-rw-r--r--  include/linux/batch_complete.h | 23
-rw-r--r--  include/linux/bio.h | 36
-rw-r--r--  include/linux/blk_types.h | 11
-rw-r--r--  include/linux/blkdev.h | 12
-rw-r--r--  include/linux/cgroup.h | 1
-rw-r--r--  include/linux/errno.h | 1
-rw-r--r--  include/linux/fs.h | 12
-rw-r--r--  include/linux/hardirq.h | 5
-rw-r--r--  include/linux/mm_types.h | 3
-rw-r--r--  include/linux/mmzone.h | 20
-rw-r--r--  include/linux/percpu-refcount.h | 114
-rw-r--r--  include/linux/pid_namespace.h | 1
-rw-r--r--  include/linux/rtc-pxa.h | 18
-rw-r--r--  include/linux/sched.h | 4
-rw-r--r--  include/linux/vm_event_item.h | 7
-rw-r--r--  include/linux/wait.h | 86
-rw-r--r--  include/linux/writeback.h | 1
-rw-r--r--  include/uapi/linux/eventpoll.h | 1
-rw-r--r--  init/Kconfig | 17
-rw-r--r--  ipc/sem.c | 250
-rw-r--r--  kernel/acct.c | 3
-rw-r--r--  kernel/fork.c | 11
-rw-r--r--  kernel/printk.c | 1
-rw-r--r--  kernel/ptrace.c | 1
-rw-r--r--  kernel/smp.c | 102
-rw-r--r--  kernel/time/timer_list.c | 112
-rw-r--r--  kernel/watchdog.c | 10
-rw-r--r--  lib/Makefile | 2
-rw-r--r--  lib/percpu-refcount.c | 243
-rw-r--r--  mm/balloon_compaction.c | 2
-rw-r--r--  mm/dmapool.c | 5
-rw-r--r--  mm/filemap.c | 3
-rw-r--r--  mm/memcontrol.c | 192
-rw-r--r--  mm/migrate.c | 1
-rw-r--r--  mm/mmap.c | 28
-rw-r--r--  mm/mmu_context.c | 3
-rw-r--r--  mm/nommu.c | 4
-rw-r--r--  mm/page_alloc.c | 5
-rw-r--r--  mm/page_io.c | 1
-rw-r--r--  mm/shmem.c | 1
-rw-r--r--  mm/swap.c | 1
-rw-r--r--  mm/util.c | 1
-rw-r--r--  mm/vmstat.c | 9
-rw-r--r--  scripts/pnmtologo.c | 8
-rw-r--r--  security/keys/internal.h | 2
-rw-r--r--  security/keys/keyctl.c | 1
-rw-r--r--  sound/core/pcm_native.c | 2
-rw-r--r--  tools/testing/selftests/Makefile | 1
-rw-r--r--  tools/testing/selftests/epoll/Makefile | 11
-rw-r--r--  tools/testing/selftests/epoll/test_epoll.c | 364
145 files changed, 2979 insertions(+), 1819 deletions(-)
diff --git a/CREDITS b/CREDITS
index 948e0fb9a70e..ad373cead42b 100644
--- a/CREDITS
+++ b/CREDITS
@@ -1510,6 +1510,11 @@ D: Natsemi ethernet
D: Cobalt Networks (x86) support
D: This-and-That
+N: Mark M. Hoffman
+E: mhoffman@lightlink.com
+D: Maintenance of drivers/hwmon/asb100.c, drivers/i2c/busses/i2c-sis96x.c
+D: i2c core contributions
+
N: Dirk Hohndel
E: hohndel@suse.de
D: The XFree86[tm] Project
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 8b8c28b9864c..addb1f110e9a 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -70,6 +70,7 @@ Brief summary of control files.
memory.move_charge_at_immigrate # set/show controls of moving charges
memory.oom_control # set/show oom controls.
memory.numa_stat # show the number of memory usage per numa node
+ memory.dangling_memcgs # show debugging information about dangling groups
memory.kmem.limit_in_bytes # set/show hard limit for kernel memory
memory.kmem.usage_in_bytes # show current kernel memory allocation
@@ -577,6 +578,21 @@ unevictable=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
And we have total = file + anon + unevictable.
+5.7 dangling_memcgs
+
+This file is only ever present in the root cgroup, and only if the option
+CONFIG_MEMCG_DEBUG_ASYNC_DESTROY is set. When a memcg is destroyed, the memory
+it consumed may not be freed immediately. This is because when some
+extensions are used, such as swap or kernel memory, objects can outlive the
+group and hold a reference to it.
+
+In that case, the dangling_memcgs file will show which memcgs are still
+alive and which references are still preventing them from being freed.
+There is nothing wrong with that, but it is very useful when debugging to
+know where this memory is being held. This is a developer-oriented debugging
+facility only, and no guarantees of interface stability are given. The file
+is read-only, and its sole purpose is to display this information.
+
6. Hierarchy support
The memory controller supports a deep hierarchy and hierarchical accounting.
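[Illustrative sketch by the editor, not part of this merge: a minimal userspace reader for the memory.dangling_memcgs file documented in the hunk above. The mount point /sys/fs/cgroup/memory is an assumption about where the memory controller is mounted; the file only exists in the root cgroup and only with CONFIG_MEMCG_DEBUG_ASYNC_DESTROY enabled.]

/* dangling_memcgs_dump.c - illustrative only, not part of the patch.
 * Assumes the memory controller is mounted at /sys/fs/cgroup/memory.
 */
#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/sys/fs/cgroup/memory/memory.dangling_memcgs", "r");

	if (!f) {
		perror("memory.dangling_memcgs");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* dump the debugging info verbatim */
	fclose(f);
	return 0;
}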
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 078701fdbd4d..21ad1815be8c 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -138,18 +138,39 @@ Setting this to zero disables periodic writeback altogether.
drop_caches
-Writing to this will cause the kernel to drop clean caches, dentries and
-inodes from memory, causing that memory to become free.
+Writing to this will cause the kernel to drop clean caches, as well as
+reclaimable slab objects like dentries and inodes. Once dropped, their
+memory becomes free.
To free pagecache:
echo 1 > /proc/sys/vm/drop_caches
-To free dentries and inodes:
+To free reclaimable slab objects (includes dentries and inodes):
echo 2 > /proc/sys/vm/drop_caches
-To free pagecache, dentries and inodes:
+To free slab objects and pagecache:
echo 3 > /proc/sys/vm/drop_caches
-As this is a non-destructive operation and dirty objects are not freeable, the
-user should run `sync' first.
+This is a non-destructive operation and will not free any dirty objects.
+To increase the number of objects freed by this operation, the user may run
+`sync' prior to writing to /proc/sys/vm/drop_caches. This will minimize the
+number of dirty objects on the system and create more candidates to be
+dropped.
+
+This file is not a means to control the growth of the various kernel caches
+(inodes, dentries, pagecache, etc...) These objects are automatically
+reclaimed by the kernel when memory is needed elsewhere on the system.
+
+Use of this file can cause performance problems. Since it discards cached
+objects, it may cost a significant amount of I/O and CPU to recreate the
+dropped objects, especially if they were under heavy use. Because of this,
+use outside of a testing or debugging environment is not recommended.
+
+You may see informational messages in your kernel log when this file is
+used:
+
+ cat (1234): dropped kernel caches: 3
+
+These are informational only. They do not mean that anything is wrong
+with your system.
==============================================================
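[Illustrative sketch by the editor, not part of this merge: the sync-then-drop sequence recommended by the updated drop_caches text above, done from C instead of the shell. Writing "3" drops both pagecache and reclaimable slab objects; root privileges are required.]

/* drop_caches.c - illustrative only, not part of the patch. */
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>

int main(void)
{
	int fd;

	sync();				/* write back dirty objects first */

	fd = open("/proc/sys/vm/drop_caches", O_WRONLY);
	if (fd < 0) {
		perror("/proc/sys/vm/drop_caches");
		return 1;
	}
	if (write(fd, "3", 1) != 1) {	/* 1 = pagecache, 2 = slab, 3 = both */
		perror("write");
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}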
diff --git a/MAINTAINERS b/MAINTAINERS
index e95b1e944eb7..5a1a1ccd1e6a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1344,12 +1344,6 @@ S: Maintained
F: drivers/platform/x86/asus*.c
F: drivers/platform/x86/eeepc*.c
-ASUS ASB100 HARDWARE MONITOR DRIVER
-M: "Mark M. Hoffman" <mhoffman@lightlink.com>
-L: lm-sensors@lm-sensors.org
-S: Maintained
-F: drivers/hwmon/asb100.c
-
ASYNCHRONOUS TRANSFERS/TRANSFORMS (IOAT) API
M: Dan Williams <djbw@fb.com>
W: http://sourceforge.net/projects/xscaleiop
@@ -3869,7 +3863,7 @@ F: drivers/i2c/busses/i2c-ismt.c
F: Documentation/i2c/busses/i2c-ismt
I2C/SMBUS STUB DRIVER
-M: "Mark M. Hoffman" <mhoffman@lightlink.com>
+M: Jean Delvare <khali@linux-fr.org>
L: linux-i2c@vger.kernel.org
S: Maintained
F: drivers/i2c/i2c-stub.c
@@ -7189,13 +7183,6 @@ L: netdev@vger.kernel.org
S: Maintained
F: drivers/net/ethernet/sis/sis900.*
-SIS 96X I2C/SMBUS DRIVER
-M: "Mark M. Hoffman" <mhoffman@lightlink.com>
-L: linux-i2c@vger.kernel.org
-S: Maintained
-F: Documentation/i2c/busses/i2c-sis96x
-F: drivers/i2c/busses/i2c-sis96x.c
-
SIS FRAMEBUFFER DRIVER
M: Thomas Winischhofer <thomas@winischhofer.net>
W: http://www.winischhofer.net/linuxsisvga.shtml
@@ -7273,7 +7260,7 @@ F: Documentation/hwmon/sch5627
F: drivers/hwmon/sch5627.c
SMSC47B397 HARDWARE MONITOR DRIVER
-M: "Mark M. Hoffman" <mhoffman@lightlink.com>
+M: Jean Delvare <khali@linux-fr.org>
L: lm-sensors@lm-sensors.org
S: Maintained
F: Documentation/hwmon/smsc47b397
diff --git a/arch/arm/boot/dts/armada-370-xp.dtsi b/arch/arm/boot/dts/armada-370-xp.dtsi
index 6f1acc75e155..4e3e4a3ce540 100644
--- a/arch/arm/boot/dts/armada-370-xp.dtsi
+++ b/arch/arm/boot/dts/armada-370-xp.dtsi
@@ -182,6 +182,12 @@
clocks = <&coreclk 0>;
status = "disabled";
};
+
+ rtc@10300 {
+ compatible = "marvell,orion-rtc";
+ reg = <0xd0010300 0x20>;
+ interrupts = <50>;
+ };
};
};
diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c
index 10062ceadd1c..0c6356255fe3 100644
--- a/arch/arm/mm/mmap.c
+++ b/arch/arm/mm/mmap.c
@@ -181,11 +181,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
if (mmap_is_legacy()) {
mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
mm->get_unmapped_area = arch_get_unmapped_area;
- mm->unmap_area = arch_unmap_area;
} else {
mm->mmap_base = mmap_base(random_factor);
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
- mm->unmap_area = arch_unmap_area_topdown;
}
}
diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index 7c7be7855638..8ed6cb1a900f 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -90,11 +90,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
if (mmap_is_legacy()) {
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = arch_get_unmapped_area;
- mm->unmap_area = arch_unmap_area;
} else {
mm->mmap_base = mmap_base();
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
- mm->unmap_area = arch_unmap_area_topdown;
}
}
EXPORT_SYMBOL_GPL(arch_pick_mmap_layout);
diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c
index 7e5fe2790d8a..f1baadd56e82 100644
--- a/arch/mips/mm/mmap.c
+++ b/arch/mips/mm/mmap.c
@@ -158,11 +158,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
if (mmap_is_legacy()) {
mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
mm->get_unmapped_area = arch_get_unmapped_area;
- mm->unmap_area = arch_unmap_area;
} else {
mm->mmap_base = mmap_base(random_factor);
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
- mm->unmap_area = arch_unmap_area_topdown;
}
}
diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h
index cd915d6b093d..88693cef4f3d 100644
--- a/arch/powerpc/include/asm/page_64.h
+++ b/arch/powerpc/include/asm/page_64.h
@@ -99,8 +99,7 @@ extern unsigned long slice_get_unmapped_area(unsigned long addr,
unsigned long len,
unsigned long flags,
unsigned int psize,
- int topdown,
- int use_cache);
+ int topdown);
extern unsigned int get_slice_psize(struct mm_struct *mm,
unsigned long addr);
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 1a6de0a7d8eb..5dc52d803ed8 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -742,7 +742,7 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
struct hstate *hstate = hstate_file(file);
int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
- return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
+ return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1);
}
#endif
diff --git a/arch/powerpc/mm/mmap_64.c b/arch/powerpc/mm/mmap_64.c
index 67a42ed0d2fc..cb8bdbe4972f 100644
--- a/arch/powerpc/mm/mmap_64.c
+++ b/arch/powerpc/mm/mmap_64.c
@@ -92,10 +92,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
if (mmap_is_legacy()) {
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = arch_get_unmapped_area;
- mm->unmap_area = arch_unmap_area;
} else {
mm->mmap_base = mmap_base();
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
- mm->unmap_area = arch_unmap_area_topdown;
}
}
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index cf9dada734b6..3e99c149271a 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -237,134 +237,112 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz
#endif
}
+/*
+ * Compute which slice addr is part of;
+ * set *boundary_addr to the start or end boundary of that slice
+ * (depending on 'end' parameter);
+ * return boolean indicating if the slice is marked as available in the
+ * 'available' slice_mask.
+ */
+static bool slice_scan_available(unsigned long addr,
+ struct slice_mask available,
+ int end,
+ unsigned long *boundary_addr)
+{
+ unsigned long slice;
+ if (addr < SLICE_LOW_TOP) {
+ slice = GET_LOW_SLICE_INDEX(addr);
+ *boundary_addr = (slice + end) << SLICE_LOW_SHIFT;
+ return !!(available.low_slices & (1u << slice));
+ } else {
+ slice = GET_HIGH_SLICE_INDEX(addr);
+ *boundary_addr = (slice + end) ?
+ ((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP;
+ return !!(available.high_slices & (1u << slice));
+ }
+}
+
static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
unsigned long len,
struct slice_mask available,
- int psize, int use_cache)
+ int psize)
{
- struct vm_area_struct *vma;
- unsigned long start_addr, addr;
- struct slice_mask mask;
int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
-
- if (use_cache) {
- if (len <= mm->cached_hole_size) {
- start_addr = addr = TASK_UNMAPPED_BASE;
- mm->cached_hole_size = 0;
- } else
- start_addr = addr = mm->free_area_cache;
- } else
- start_addr = addr = TASK_UNMAPPED_BASE;
-
-full_search:
- for (;;) {
- addr = _ALIGN_UP(addr, 1ul << pshift);
- if ((TASK_SIZE - len) < addr)
- break;
- vma = find_vma(mm, addr);
- BUG_ON(vma && (addr >= vma->vm_end));
-
- mask = slice_range_to_mask(addr, len);
- if (!slice_check_fit(mask, available)) {
- if (addr < SLICE_LOW_TOP)
- addr = _ALIGN_UP(addr + 1, 1ul << SLICE_LOW_SHIFT);
- else
- addr = _ALIGN_UP(addr + 1, 1ul << SLICE_HIGH_SHIFT);
+ unsigned long addr, found, next_end;
+ struct vm_unmapped_area_info info;
+
+ info.flags = 0;
+ info.length = len;
+ info.align_mask = PAGE_MASK & ((1ul << pshift) - 1);
+ info.align_offset = 0;
+
+ addr = TASK_UNMAPPED_BASE;
+ while (addr < TASK_SIZE) {
+ info.low_limit = addr;
+ if (!slice_scan_available(addr, available, 1, &addr))
continue;
+
+ next_slice:
+ /*
+ * At this point [info.low_limit; addr) covers
+ * available slices only and ends at a slice boundary.
+ * Check if we need to reduce the range, or if we can
+ * extend it to cover the next available slice.
+ */
+ if (addr >= TASK_SIZE)
+ addr = TASK_SIZE;
+ else if (slice_scan_available(addr, available, 1, &next_end)) {
+ addr = next_end;
+ goto next_slice;
}
- if (!vma || addr + len <= vma->vm_start) {
- /*
- * Remember the place where we stopped the search:
- */
- if (use_cache)
- mm->free_area_cache = addr + len;
- return addr;
- }
- if (use_cache && (addr + mm->cached_hole_size) < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
- addr = vma->vm_end;
- }
+ info.high_limit = addr;
- /* Make sure we didn't miss any holes */
- if (use_cache && start_addr != TASK_UNMAPPED_BASE) {
- start_addr = addr = TASK_UNMAPPED_BASE;
- mm->cached_hole_size = 0;
- goto full_search;
+ found = vm_unmapped_area(&info);
+ if (!(found & ~PAGE_MASK))
+ return found;
}
+
return -ENOMEM;
}
static unsigned long slice_find_area_topdown(struct mm_struct *mm,
unsigned long len,
struct slice_mask available,
- int psize, int use_cache)
+ int psize)
{
- struct vm_area_struct *vma;
- unsigned long addr;
- struct slice_mask mask;
int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
+ unsigned long addr, found, prev;
+ struct vm_unmapped_area_info info;
- /* check if free_area_cache is useful for us */
- if (use_cache) {
- if (len <= mm->cached_hole_size) {
- mm->cached_hole_size = 0;
- mm->free_area_cache = mm->mmap_base;
- }
-
- /* either no address requested or can't fit in requested
- * address hole
- */
- addr = mm->free_area_cache;
-
- /* make sure it can fit in the remaining address space */
- if (addr > len) {
- addr = _ALIGN_DOWN(addr - len, 1ul << pshift);
- mask = slice_range_to_mask(addr, len);
- if (slice_check_fit(mask, available) &&
- slice_area_is_free(mm, addr, len))
- /* remember the address as a hint for
- * next time
- */
- return (mm->free_area_cache = addr);
- }
- }
+ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ info.length = len;
+ info.align_mask = PAGE_MASK & ((1ul << pshift) - 1);
+ info.align_offset = 0;
addr = mm->mmap_base;
- while (addr > len) {
- /* Go down by chunk size */
- addr = _ALIGN_DOWN(addr - len, 1ul << pshift);
-
- /* Check for hit with different page size */
- mask = slice_range_to_mask(addr, len);
- if (!slice_check_fit(mask, available)) {
- if (addr < SLICE_LOW_TOP)
- addr = _ALIGN_DOWN(addr, 1ul << SLICE_LOW_SHIFT);
- else if (addr < (1ul << SLICE_HIGH_SHIFT))
- addr = SLICE_LOW_TOP;
- else
- addr = _ALIGN_DOWN(addr, 1ul << SLICE_HIGH_SHIFT);
+ while (addr > PAGE_SIZE) {
+ info.high_limit = addr;
+ if (!slice_scan_available(addr - 1, available, 0, &addr))
continue;
- }
+ prev_slice:
/*
- * Lookup failure means no vma is above this address,
- * else if new region fits below vma->vm_start,
- * return with success:
+ * At this point [addr; info.high_limit) covers
+ * available slices only and starts at a slice boundary.
+ * Check if we need to reduce the range, or if we can
+ * extend it to cover the previous available slice.
*/
- vma = find_vma(mm, addr);
- if (!vma || (addr + len) <= vma->vm_start) {
- /* remember the address as a hint for next time */
- if (use_cache)
- mm->free_area_cache = addr;
- return addr;
+ if (addr < PAGE_SIZE)
+ addr = PAGE_SIZE;
+ else if (slice_scan_available(addr - 1, available, 0, &prev)) {
+ addr = prev;
+ goto prev_slice;
}
+ info.low_limit = addr;
- /* remember the largest hole we saw so far */
- if (use_cache && (addr + mm->cached_hole_size) < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
-
- /* try just below the current vma->vm_start */
- addr = vma->vm_start;
+ found = vm_unmapped_area(&info);
+ if (!(found & ~PAGE_MASK))
+ return found;
}
/*
@@ -373,28 +351,18 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm,
* can happen with large stack limits and large mmap()
* allocations.
*/
- addr = slice_find_area_bottomup(mm, len, available, psize, 0);
-
- /*
- * Restore the topdown base:
- */
- if (use_cache) {
- mm->free_area_cache = mm->mmap_base;
- mm->cached_hole_size = ~0UL;
- }
-
- return addr;
+ return slice_find_area_bottomup(mm, len, available, psize);
}
static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len,
struct slice_mask mask, int psize,
- int topdown, int use_cache)
+ int topdown)
{
if (topdown)
- return slice_find_area_topdown(mm, len, mask, psize, use_cache);
+ return slice_find_area_topdown(mm, len, mask, psize);
else
- return slice_find_area_bottomup(mm, len, mask, psize, use_cache);
+ return slice_find_area_bottomup(mm, len, mask, psize);
}
#define or_mask(dst, src) do { \
@@ -415,7 +383,7 @@ static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len,
unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
unsigned long flags, unsigned int psize,
- int topdown, int use_cache)
+ int topdown)
{
struct slice_mask mask = {0, 0};
struct slice_mask good_mask;
@@ -430,8 +398,8 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
BUG_ON(mm->task_size == 0);
slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize);
- slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d, use_cache=%d\n",
- addr, len, flags, topdown, use_cache);
+ slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d\n",
+ addr, len, flags, topdown);
if (len > mm->task_size)
return -ENOMEM;
@@ -503,8 +471,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
/* Now let's see if we can find something in the existing
* slices for that size
*/
- newaddr = slice_find_area(mm, len, good_mask, psize, topdown,
- use_cache);
+ newaddr = slice_find_area(mm, len, good_mask, psize, topdown);
if (newaddr != -ENOMEM) {
/* Found within the good mask, we don't have to setup,
* we thus return directly
@@ -536,8 +503,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
* anywhere in the good area.
*/
if (addr) {
- addr = slice_find_area(mm, len, good_mask, psize, topdown,
- use_cache);
+ addr = slice_find_area(mm, len, good_mask, psize, topdown);
if (addr != -ENOMEM) {
slice_dbg(" found area at 0x%lx\n", addr);
return addr;
@@ -547,15 +513,14 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
/* Now let's see if we can find something in the existing slices
* for that size plus free slices
*/
- addr = slice_find_area(mm, len, potential_mask, psize, topdown,
- use_cache);
+ addr = slice_find_area(mm, len, potential_mask, psize, topdown);
#ifdef CONFIG_PPC_64K_PAGES
if (addr == -ENOMEM && psize == MMU_PAGE_64K) {
/* retry the search with 4k-page slices included */
or_mask(potential_mask, compat_mask);
addr = slice_find_area(mm, len, potential_mask, psize,
- topdown, use_cache);
+ topdown);
}
#endif
@@ -586,8 +551,7 @@ unsigned long arch_get_unmapped_area(struct file *filp,
unsigned long flags)
{
return slice_get_unmapped_area(addr, len, flags,
- current->mm->context.user_psize,
- 0, 1);
+ current->mm->context.user_psize, 0);
}
unsigned long arch_get_unmapped_area_topdown(struct file *filp,
@@ -597,8 +561,7 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp,
const unsigned long flags)
{
return slice_get_unmapped_area(addr0, len, flags,
- current->mm->context.user_psize,
- 1, 1);
+ current->mm->context.user_psize, 1);
}
unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr)
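[Illustrative sketch by the editor, not part of this merge: the slice-boundary arithmetic performed by slice_scan_available() in the hunk above, pulled out as a standalone userspace function. The slice geometry constants (256MB low slices below a 4GB SLICE_LOW_TOP, 1TB high slices above it) are assumptions based on the usual ppc64 defaults, not values taken from this diff.]

/* slice_boundary.c - illustrative only, not part of the patch. */
#include <stdio.h>

#define SLICE_LOW_SHIFT		28		/* 256MB low slices (assumed) */
#define SLICE_HIGH_SHIFT	40		/* 1TB high slices (assumed) */
#define SLICE_LOW_TOP		(1UL << 32)	/* low slices end at 4GB (assumed) */

/* end = 1: return the end boundary of the slice containing addr;
 * end = 0: return its start boundary.
 */
static unsigned long slice_boundary(unsigned long addr, int end)
{
	unsigned long slice;

	if (addr < SLICE_LOW_TOP) {
		slice = addr >> SLICE_LOW_SHIFT;
		return (slice + end) << SLICE_LOW_SHIFT;
	}
	slice = addr >> SLICE_HIGH_SHIFT;
	return (slice + end) ? (slice + end) << SLICE_HIGH_SHIFT
			     : SLICE_LOW_TOP;
}

int main(void)
{
	/* 0x30000000 sits in low slice 3: start 0x30000000, end 0x40000000 */
	printf("start 0x%lx end 0x%lx\n",
	       slice_boundary(0x30000000UL, 0),
	       slice_boundary(0x30000000UL, 1));
	return 0;
}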
diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c
index 68c57d38745a..0026a37e21fd 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -352,7 +352,7 @@ static unsigned long spufs_get_unmapped_area(struct file *file,
/* Else, try to obtain a 64K pages slice */
return slice_get_unmapped_area(addr, len, flags,
- MMU_PAGE_64K, 1, 0);
+ MMU_PAGE_64K, 1);
}
#endif /* CONFIG_SPU_FS_64K_LS */
diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c
index 280ded8b79ba..a71a3dfc1c88 100644
--- a/arch/s390/hypfs/inode.c
+++ b/arch/s390/hypfs/inode.c
@@ -21,6 +21,7 @@
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/mount.h>
+#include <linux/aio.h>
#include <asm/ebcdic.h>
#include "hypfs.h"
diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
index 06bafec00278..40023290ee5b 100644
--- a/arch/s390/mm/mmap.c
+++ b/arch/s390/mm/mmap.c
@@ -91,11 +91,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
if (mmap_is_legacy()) {
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = arch_get_unmapped_area;
- mm->unmap_area = arch_unmap_area;
} else {
mm->mmap_base = mmap_base();
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
- mm->unmap_area = arch_unmap_area_topdown;
}
}
@@ -176,11 +174,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
if (mmap_is_legacy()) {
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = s390_get_unmapped_area;
- mm->unmap_area = arch_unmap_area;
} else {
mm->mmap_base = mmap_base();
mm->get_unmapped_area = s390_get_unmapped_area_topdown;
- mm->unmap_area = arch_unmap_area_topdown;
}
}
diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c
index 708bc29d36a8..f3c169f9d3a1 100644
--- a/arch/sparc/kernel/sys_sparc_64.c
+++ b/arch/sparc/kernel/sys_sparc_64.c
@@ -290,7 +290,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
sysctl_legacy_va_layout) {
mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
mm->get_unmapped_area = arch_get_unmapped_area;
- mm->unmap_area = arch_unmap_area;
} else {
/* We know it's 32-bit */
unsigned long task_size = STACK_TOP32;
@@ -302,7 +301,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
mm->mmap_base = PAGE_ALIGN(task_size - gap - random_factor);
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
- mm->unmap_area = arch_unmap_area_topdown;
}
}
diff --git a/arch/tile/mm/mmap.c b/arch/tile/mm/mmap.c
index f96f4cec602a..d67d91ebf63e 100644
--- a/arch/tile/mm/mmap.c
+++ b/arch/tile/mm/mmap.c
@@ -66,10 +66,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
if (!is_32bit || rlimit(RLIMIT_STACK) == RLIM_INFINITY) {
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = arch_get_unmapped_area;
- mm->unmap_area = arch_unmap_area;
} else {
mm->mmap_base = mmap_base(mm);
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
- mm->unmap_area = arch_unmap_area_topdown;
}
}
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 03abf9b70011..14fb6f9a7363 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -309,8 +309,6 @@ static int load_aout_binary(struct linux_binprm *bprm)
(current->mm->start_data = N_DATADDR(ex));
current->mm->brk = ex.a_bss +
(current->mm->start_brk = N_BSSADDR(ex));
- current->mm->free_area_cache = TASK_UNMAPPED_BASE;
- current->mm->cached_hole_size = 0;
retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT);
if (retval < 0) {
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index cccd07fa5e3a..b8e9224f0b45 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -17,6 +17,8 @@ extern unsigned long pci_mem_start;
extern int e820_any_mapped(u64 start, u64 end, unsigned type);
extern int e820_all_mapped(u64 start, u64 end, unsigned type);
extern void e820_add_region(u64 start, u64 size, int type);
+extern void e820_add_limit_region(u64 start, u64 size, int type);
+extern void e820_adjust_region(u64 *start, u64 *size);
extern void e820_print_map(char *who);
extern int
sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, u32 *pnr_map);
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index d32abeabbda5..0d5bb689649a 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -47,6 +47,7 @@ unsigned long pci_mem_start = 0xaeedbabe;
#ifdef CONFIG_PCI
EXPORT_SYMBOL(pci_mem_start);
#endif
+static u64 mem_limit = ~0ULL;
/*
* This function checks if any part of the range <start,end> is mapped
@@ -108,7 +109,7 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type)
* Add a memory region to the kernel e820 map.
*/
static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
- int type)
+ int type, bool limited)
{
int x = e820x->nr_map;
@@ -119,6 +120,22 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
return;
}
+ if (limited) {
+ if (start >= mem_limit) {
+ printk(KERN_ERR "e820: ignoring [mem %#010llx-%#010llx]\n",
+ (unsigned long long)start,
+ (unsigned long long)(start + size - 1));
+ return;
+ }
+
+ if (mem_limit - start < size) {
+ printk(KERN_ERR "e820: ignoring [mem %#010llx-%#010llx]\n",
+ (unsigned long long)mem_limit,
+ (unsigned long long)(start + size - 1));
+ size = mem_limit - start;
+ }
+ }
+
e820x->map[x].addr = start;
e820x->map[x].size = size;
e820x->map[x].type = type;
@@ -127,7 +144,37 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
void __init e820_add_region(u64 start, u64 size, int type)
{
- __e820_add_region(&e820, start, size, type);
+ __e820_add_region(&e820, start, size, type, false);
+}
+
+/*
+ * This function is called by do_add_efi_memmap().
+ *
+ * Note: BOOT_SERVICES_{CODE,DATA} regions on some EFI machines are marked
+ * as E820_RAM and need to be mapped. Please use e820_add_region() to add
+ * BOOT_SERVICES_{CODE,DATA} regions.
+ */
+void __init e820_add_limit_region(u64 start, u64 size, int type)
+{
+ /*
+ * efi_init() is called after finish_e820_parsing(), so we should
+ * check whether [start, start + size) contains address above
+ * mem_limit if the type is E820_RAM.
+ */
+ __e820_add_region(&e820, start, size, type, type == E820_RAM);
+}
+
+void __init e820_adjust_region(u64 *start, u64 *size)
+{
+ if (*start >= mem_limit) {
+ *size = 0;
+ return;
+ }
+
+ if (mem_limit - *start < *size)
+ *size = mem_limit - *start;
+
+ return;
}
static void __init e820_print_type(u32 type)
@@ -455,8 +502,9 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
/* new range is totally covered? */
if (ei->addr < start && ei_end > end) {
- __e820_add_region(e820x, start, size, new_type);
- __e820_add_region(e820x, end, ei_end - end, ei->type);
+ __e820_add_region(e820x, start, size, new_type, false);
+ __e820_add_region(e820x, end, ei_end - end, ei->type,
+ false);
ei->size = start - ei->addr;
real_updated_size += size;
continue;
@@ -469,7 +517,7 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
continue;
__e820_add_region(e820x, final_start, final_end - final_start,
- new_type);
+ new_type, false);
real_updated_size += final_end - final_start;
@@ -809,7 +857,7 @@ static int userdef __initdata;
/* "mem=nopentium" disables the 4MB page tables. */
static int __init parse_memopt(char *p)
{
- u64 mem_size;
+ char *oldp;
if (!p)
return -EINVAL;
@@ -825,11 +873,11 @@ static int __init parse_memopt(char *p)
}
userdef = 1;
- mem_size = memparse(p, &p);
+ oldp = p;
+ mem_limit = memparse(p, &p);
/* don't remove all of memory when handling "mem={invalid}" param */
- if (mem_size == 0)
+ if (mem_limit == 0 || p == oldp)
return -EINVAL;
- e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
return 0;
}
@@ -895,6 +943,12 @@ early_param("memmap", parse_memmap_opt);
void __init finish_e820_parsing(void)
{
+ if (mem_limit != ~0ULL) {
+ userdef = 1;
+ e820_remove_range(mem_limit, ULLONG_MAX - mem_limit,
+ E820_RAM, 1);
+ }
+
if (userdef) {
u32 nr = e820.nr_map;
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 845df6835f9f..62c29a5bfe26 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -115,10 +115,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
if (mmap_is_legacy()) {
mm->mmap_base = mmap_legacy_base();
mm->get_unmapped_area = arch_get_unmapped_area;
- mm->unmap_area = arch_unmap_area;
} else {
mm->mmap_base = mmap_base();
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
- mm->unmap_area = arch_unmap_area_topdown;
}
}
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 5f2ecaf3f9d8..4f65e1d05119 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -316,10 +316,17 @@ static void __init do_add_efi_memmap(void)
int e820_type;
switch (md->type) {
- case EFI_LOADER_CODE:
- case EFI_LOADER_DATA:
case EFI_BOOT_SERVICES_CODE:
case EFI_BOOT_SERVICES_DATA:
+ /* EFI_BOOT_SERVICES_{CODE,DATA} needs to be mapped */
+ if (md->attribute & EFI_MEMORY_WB)
+ e820_type = E820_RAM;
+ else
+ e820_type = E820_RESERVED;
+ e820_add_region(start, size, e820_type);
+ continue;
+ case EFI_LOADER_CODE:
+ case EFI_LOADER_DATA:
case EFI_CONVENTIONAL_MEMORY:
if (md->attribute & EFI_MEMORY_WB)
e820_type = E820_RAM;
@@ -344,7 +351,7 @@ static void __init do_add_efi_memmap(void)
e820_type = E820_RESERVED;
break;
}
- e820_add_region(start, size, e820_type);
+ e820_add_limit_region(start, size, e820_type);
}
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
}
@@ -451,6 +458,8 @@ void __init efi_free_boot_services(void)
md->type != EFI_BOOT_SERVICES_DATA)
continue;
+ e820_adjust_region(&start, &size);
+
/* Could not reserve boot area */
if (!size)
continue;
diff --git a/block/blk-core.c b/block/blk-core.c
index 074b758efc42..186603b36c44 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -151,7 +151,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
EXPORT_SYMBOL(blk_rq_init);
static void req_bio_endio(struct request *rq, struct bio *bio,
- unsigned int nbytes, int error)
+ unsigned int nbytes, int error,
+ struct batch_complete *batch)
{
if (error)
clear_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -175,7 +176,7 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
/* don't actually finish bio if it's part of flush sequence */
if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
- bio_endio(bio, error);
+ bio_endio_batch(bio, error, batch);
}
void blk_dump_rq_flags(struct request *rq, char *msg)
@@ -2250,7 +2251,8 @@ EXPORT_SYMBOL(blk_fetch_request);
* %false - this request doesn't have any more data
* %true - this request has more data
**/
-bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
+bool blk_update_request(struct request *req, int error, unsigned int nr_bytes,
+ struct batch_complete *batch)
{
int total_bytes, bio_nbytes, next_idx = 0;
struct bio *bio;
@@ -2306,7 +2308,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
if (nr_bytes >= bio->bi_size) {
req->bio = bio->bi_next;
nbytes = bio->bi_size;
- req_bio_endio(req, bio, nbytes, error);
+ req_bio_endio(req, bio, nbytes, error, batch);
next_idx = 0;
bio_nbytes = 0;
} else {
@@ -2368,7 +2370,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
* if the request wasn't completed, update state
*/
if (bio_nbytes) {
- req_bio_endio(req, bio, bio_nbytes, error);
+ req_bio_endio(req, bio, bio_nbytes, error, batch);
bio->bi_idx += next_idx;
bio_iovec(bio)->bv_offset += nr_bytes;
bio_iovec(bio)->bv_len -= nr_bytes;
@@ -2405,14 +2407,15 @@ EXPORT_SYMBOL_GPL(blk_update_request);
static bool blk_update_bidi_request(struct request *rq, int error,
unsigned int nr_bytes,
- unsigned int bidi_bytes)
+ unsigned int bidi_bytes,
+ struct batch_complete *batch)
{
- if (blk_update_request(rq, error, nr_bytes))
+ if (blk_update_request(rq, error, nr_bytes, batch))
return true;
/* Bidi request must be completed as a whole */
if (unlikely(blk_bidi_rq(rq)) &&
- blk_update_request(rq->next_rq, error, bidi_bytes))
+ blk_update_request(rq->next_rq, error, bidi_bytes, batch))
return true;
if (blk_queue_add_random(rq->q))
@@ -2495,7 +2498,7 @@ static bool blk_end_bidi_request(struct request *rq, int error,
struct request_queue *q = rq->q;
unsigned long flags;
- if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
+ if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes, NULL))
return true;
spin_lock_irqsave(q->queue_lock, flags);
@@ -2521,9 +2524,10 @@ static bool blk_end_bidi_request(struct request *rq, int error,
* %true - still buffers pending for this request
**/
bool __blk_end_bidi_request(struct request *rq, int error,
- unsigned int nr_bytes, unsigned int bidi_bytes)
+ unsigned int nr_bytes, unsigned int bidi_bytes,
+ struct batch_complete *batch)
{
- if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
+ if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes, batch))
return true;
blk_finish_request(rq, error);
@@ -2624,7 +2628,7 @@ EXPORT_SYMBOL_GPL(blk_end_request_err);
**/
bool __blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
{
- return __blk_end_bidi_request(rq, error, nr_bytes, 0);
+ return __blk_end_bidi_request(rq, error, nr_bytes, 0, NULL);
}
EXPORT_SYMBOL(__blk_end_request);
@@ -2636,7 +2640,7 @@ EXPORT_SYMBOL(__blk_end_request);
* Description:
* Completely finish @rq. Must be called with queue lock held.
*/
-void __blk_end_request_all(struct request *rq, int error)
+void blk_end_request_all_batch(struct request *rq, int error, struct batch_complete *batch)
{
bool pending;
unsigned int bidi_bytes = 0;
@@ -2644,10 +2648,10 @@ void __blk_end_request_all(struct request *rq, int error)
if (unlikely(blk_bidi_rq(rq)))
bidi_bytes = blk_rq_bytes(rq->next_rq);
- pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
+ pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes, batch);
BUG_ON(pending);
}
-EXPORT_SYMBOL(__blk_end_request_all);
+EXPORT_SYMBOL(blk_end_request_all_batch);
/**
* __blk_end_request_cur - Helper function to finish the current request chunk.
diff --git a/block/blk-flush.c b/block/blk-flush.c
index db8f1b507857..28785079ee23 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -316,7 +316,7 @@ void blk_insert_flush(struct request *rq)
* complete the request.
*/
if (!policy) {
- __blk_end_bidi_request(rq, 0, 0, 0);
+ __blk_end_bidi_request(rq, 0, 0, 0, NULL);
return;
}
diff --git a/block/blk.h b/block/blk.h
index e837b8f619b7..dc8fee6d41d6 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -31,7 +31,8 @@ void blk_queue_bypass_end(struct request_queue *q);
void blk_dequeue_request(struct request *rq);
void __blk_queue_free_tags(struct request_queue *q);
bool __blk_end_bidi_request(struct request *rq, int error,
- unsigned int nr_bytes, unsigned int bidi_bytes);
+ unsigned int nr_bytes, unsigned int bidi_bytes,
+ struct batch_complete *batch);
void blk_rq_timed_out_timer(unsigned long data);
void blk_delete_timer(struct request *);
diff --git a/block/genhd.c b/block/genhd.c
index 3c001fba80c7..a1ed52af5290 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -849,7 +849,7 @@ static int show_partition(struct seq_file *seqf, void *v)
char buf[BDEVNAME_SIZE];
/* Don't show non-partitionable removeable devices or empty devices */
- if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
+ if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) &&
(sgp->flags & GENHD_FL_REMOVABLE)))
return 0;
if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 9a87daa6f4fb..a5ffcc988f0b 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -27,6 +27,7 @@
#include <linux/ratelimit.h>
#include <linux/slab.h>
#include <linux/times.h>
+#include <linux/uio.h>
#include <asm/uaccess.h>
#include <scsi/scsi.h>
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 11cc9522cdd4..8bedaf46792f 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -161,11 +161,9 @@ static void mtip_command_cleanup(struct driver_data *dd)
command = &port->commands[commandindex];
if (atomic_read(&command->active)
- && (command->async_callback)) {
- command->async_callback(command->async_data,
- -ENODEV);
- command->async_callback = NULL;
- command->async_data = NULL;
+ && (command->bio)) {
+ bio_endio(command->bio, -ENODEV);
+ command->bio = NULL;
}
dma_unmap_sg(&port->dd->pdev->dev,
@@ -606,11 +604,9 @@ static void mtip_timeout_function(unsigned long int data)
writel(1 << bit, port->completed[group]);
/* Call the async completion callback. */
- if (likely(command->async_callback))
- command->async_callback(command->async_data,
- -EIO);
- command->async_callback = NULL;
- command->comp_func = NULL;
+ if (likely(command->bio))
+ bio_endio(command->bio, -EIO);
+ command->bio = NULL;
/* Unmap the DMA scatter list entries */
dma_unmap_sg(&port->dd->pdev->dev,
@@ -679,7 +675,8 @@ static void mtip_timeout_function(unsigned long int data)
static void mtip_async_complete(struct mtip_port *port,
int tag,
void *data,
- int status)
+ int status,
+ struct batch_complete *batch)
{
struct mtip_cmd *command;
struct driver_data *dd = data;
@@ -696,11 +693,10 @@ static void mtip_async_complete(struct mtip_port *port,
}
/* Upper layer callback */
- if (likely(command->async_callback))
- command->async_callback(command->async_data, cb_status);
+ if (likely(command->bio))
+ bio_endio_batch(command->bio, cb_status, batch);
- command->async_callback = NULL;
- command->comp_func = NULL;
+ command->bio = NULL;
/* Unmap the DMA scatter list entries */
dma_unmap_sg(&dd->pdev->dev,
@@ -733,24 +729,22 @@ static void mtip_async_complete(struct mtip_port *port,
static void mtip_completion(struct mtip_port *port,
int tag,
void *data,
- int status)
+ int status,
+ struct batch_complete *batch)
{
- struct mtip_cmd *command = &port->commands[tag];
struct completion *waiting = data;
if (unlikely(status == PORT_IRQ_TF_ERR))
dev_warn(&port->dd->pdev->dev,
"Internal command %d completed with TFE\n", tag);
- command->async_callback = NULL;
- command->comp_func = NULL;
-
complete(waiting);
}
static void mtip_null_completion(struct mtip_port *port,
int tag,
void *data,
- int status)
+ int status,
+ struct batch_complete *batch)
{
return;
}
@@ -796,7 +790,7 @@ static void mtip_handle_tfe(struct driver_data *dd)
atomic_inc(&cmd->active); /* active > 1 indicates error */
if (cmd->comp_data && cmd->comp_func) {
cmd->comp_func(port, MTIP_TAG_INTERNAL,
- cmd->comp_data, PORT_IRQ_TF_ERR);
+ cmd->comp_data, PORT_IRQ_TF_ERR, NULL);
}
goto handle_tfe_exit;
}
@@ -829,7 +823,7 @@ static void mtip_handle_tfe(struct driver_data *dd)
cmd->comp_func(port,
tag,
cmd->comp_data,
- 0);
+ 0, NULL);
} else {
dev_err(&port->dd->pdev->dev,
"Missing completion func for tag %d",
@@ -916,7 +910,7 @@ static void mtip_handle_tfe(struct driver_data *dd)
if (cmd->comp_func) {
cmd->comp_func(port, tag,
cmd->comp_data,
- -ENODATA);
+ -ENODATA, NULL);
}
continue;
}
@@ -946,7 +940,7 @@ static void mtip_handle_tfe(struct driver_data *dd)
port,
tag,
cmd->comp_data,
- PORT_IRQ_TF_ERR);
+ PORT_IRQ_TF_ERR, NULL);
else
dev_warn(&port->dd->pdev->dev,
"Bad completion for tag %d\n",
@@ -973,6 +967,9 @@ static inline void mtip_workq_sdbfx(struct mtip_port *port, int group,
struct driver_data *dd = port->dd;
int tag, bit;
struct mtip_cmd *command;
+ struct batch_complete batch;
+
+ batch_complete_init(&batch);
if (!completed) {
WARN_ON_ONCE(!completed);
@@ -997,7 +994,8 @@ static inline void mtip_workq_sdbfx(struct mtip_port *port, int group,
port,
tag,
command->comp_data,
- 0);
+ 0,
+ &batch);
} else {
dev_warn(&dd->pdev->dev,
"Null completion "
@@ -1034,7 +1032,7 @@ static inline void mtip_process_legacy(struct driver_data *dd, u32 port_stat)
cmd->comp_func(port,
MTIP_TAG_INTERNAL,
cmd->comp_data,
- 0);
+ 0, NULL);
return;
}
}
@@ -2554,8 +2552,8 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd,
* None
*/
static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector,
- int nsect, int nents, int tag, void *callback,
- void *data, int dir)
+ int nsect, int nents, int tag,
+ struct bio *bio, int dir)
{
struct host_to_dev_fis *fis;
struct mtip_port *port = dd->port;
@@ -2610,12 +2608,7 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector,
command->comp_func = mtip_async_complete;
command->direction = dma_dir;
- /*
- * Set the completion function and data for the command passed
- * from the upper layer.
- */
- command->async_data = data;
- command->async_callback = callback;
+ command->bio = bio;
/*
* To prevent this command from being issued
@@ -3795,7 +3788,6 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio)
bio_sectors(bio),
nents,
tag,
- bio_endio,
bio,
bio_data_dir(bio));
} else
diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h
index 3bffff5f670c..af8c6f79a8d8 100644
--- a/drivers/block/mtip32xx/mtip32xx.h
+++ b/drivers/block/mtip32xx/mtip32xx.h
@@ -325,11 +325,9 @@ struct mtip_cmd {
void (*comp_func)(struct mtip_port *port,
int tag,
void *data,
- int status);
- /* Additional callback function that may be called by comp_func() */
- void (*async_callback)(void *data, int status);
-
- void *async_data; /* Addl. data passed to async_callback() */
+ int status,
+ struct batch_complete *batch);
+ struct bio *bio;
int scatter_ents; /* Number of scatter list entries used */
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index 758f2ac878cf..deb722d63d68 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -775,7 +775,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
if (intr & ERROR_INTR) {
n = fs->scount - 1 - resid / 512;
if (n > 0) {
- blk_update_request(req, 0, n << 9);
+ blk_update_request(req, 0, n << 9, NULL);
fs->req_sector += n;
}
if (fs->retries < 5) {
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 8ad21a25bc0d..5a9e04a6a6d7 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -210,7 +210,8 @@ static void virtblk_bio_send_flush_work(struct work_struct *work)
virtblk_bio_send_flush(vbr);
}
-static inline void virtblk_request_done(struct virtblk_req *vbr)
+static inline void virtblk_request_done(struct virtblk_req *vbr,
+ struct batch_complete *batch)
{
struct virtio_blk *vblk = vbr->vblk;
struct request *req = vbr->req;
@@ -224,11 +225,12 @@ static inline void virtblk_request_done(struct virtblk_req *vbr)
req->errors = (error != 0);
}
- __blk_end_request_all(req, error);
+ blk_end_request_all_batch(req, error, batch);
mempool_free(vbr, vblk->pool);
}
-static inline void virtblk_bio_flush_done(struct virtblk_req *vbr)
+static inline void virtblk_bio_flush_done(struct virtblk_req *vbr,
+ struct batch_complete *batch)
{
struct virtio_blk *vblk = vbr->vblk;
@@ -237,12 +239,13 @@ static inline void virtblk_bio_flush_done(struct virtblk_req *vbr)
INIT_WORK(&vbr->work, virtblk_bio_send_data_work);
queue_work(virtblk_wq, &vbr->work);
} else {
- bio_endio(vbr->bio, virtblk_result(vbr));
+ bio_endio_batch(vbr->bio, virtblk_result(vbr), batch);
mempool_free(vbr, vblk->pool);
}
}
-static inline void virtblk_bio_data_done(struct virtblk_req *vbr)
+static inline void virtblk_bio_data_done(struct virtblk_req *vbr,
+ struct batch_complete *batch)
{
struct virtio_blk *vblk = vbr->vblk;
@@ -252,17 +255,18 @@ static inline void virtblk_bio_data_done(struct virtblk_req *vbr)
INIT_WORK(&vbr->work, virtblk_bio_send_flush_work);
queue_work(virtblk_wq, &vbr->work);
} else {
- bio_endio(vbr->bio, virtblk_result(vbr));
+ bio_endio_batch(vbr->bio, virtblk_result(vbr), batch);
mempool_free(vbr, vblk->pool);
}
}
-static inline void virtblk_bio_done(struct virtblk_req *vbr)
+static inline void virtblk_bio_done(struct virtblk_req *vbr,
+ struct batch_complete *batch)
{
if (unlikely(vbr->flags & VBLK_IS_FLUSH))
- virtblk_bio_flush_done(vbr);
+ virtblk_bio_flush_done(vbr, batch);
else
- virtblk_bio_data_done(vbr);
+ virtblk_bio_data_done(vbr, batch);
}
static void virtblk_done(struct virtqueue *vq)
@@ -272,16 +276,19 @@ static void virtblk_done(struct virtqueue *vq)
struct virtblk_req *vbr;
unsigned long flags;
unsigned int len;
+ struct batch_complete batch;
+
+ batch_complete_init(&batch);
spin_lock_irqsave(vblk->disk->queue->queue_lock, flags);
do {
virtqueue_disable_cb(vq);
while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) {
if (vbr->bio) {
- virtblk_bio_done(vbr);
+ virtblk_bio_done(vbr, &batch);
bio_done = true;
} else {
- virtblk_request_done(vbr);
+ virtblk_request_done(vbr, &batch);
req_done = true;
}
}
@@ -291,6 +298,8 @@ static void virtblk_done(struct virtqueue *vq)
blk_start_queue(vblk->disk->queue);
spin_unlock_irqrestore(vblk->disk->queue->queue_lock, flags);
+ batch_complete(&batch);
+
if (bio_done)
wake_up(&vblk->queue_wait);
}
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 2c644afbcdd4..1ccbe9482faa 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -28,6 +28,7 @@
#include <linux/pfn.h>
#include <linux/export.h>
#include <linux/io.h>
+#include <linux/aio.h>
#include <asm/uaccess.h>
@@ -627,6 +628,18 @@ static ssize_t write_null(struct file *file, const char __user *buf,
return count;
}
+static ssize_t aio_read_null(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ return 0;
+}
+
+static ssize_t aio_write_null(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ return iov_length(iov, nr_segs);
+}
+
static int pipe_to_null(struct pipe_inode_info *info, struct pipe_buffer *buf,
struct splice_desc *sd)
{
@@ -670,6 +683,24 @@ static ssize_t read_zero(struct file *file, char __user *buf,
return written ? written : -EFAULT;
}
+static ssize_t aio_read_zero(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ size_t written = 0;
+ unsigned long i;
+ ssize_t ret;
+
+ for (i = 0; i < nr_segs; i++) {
+ ret = read_zero(iocb->ki_filp, iov[i].iov_base, iov[i].iov_len,
+ &pos);
+ if (ret < 0)
+ break;
+ written += ret;
+ }
+
+ return written ? written : -EFAULT;
+}
+
static int mmap_zero(struct file *file, struct vm_area_struct *vma)
{
#ifndef CONFIG_MMU
@@ -738,6 +769,7 @@ static int open_port(struct inode *inode, struct file *filp)
#define full_lseek null_lseek
#define write_zero write_null
#define read_full read_zero
+#define aio_write_zero aio_write_null
#define open_mem open_port
#define open_kmem open_mem
#define open_oldmem open_mem
@@ -766,6 +798,8 @@ static const struct file_operations null_fops = {
.llseek = null_lseek,
.read = read_null,
.write = write_null,
+ .aio_read = aio_read_null,
+ .aio_write = aio_write_null,
.splice_write = splice_write_null,
};
@@ -782,6 +816,8 @@ static const struct file_operations zero_fops = {
.llseek = zero_lseek,
.read = read_zero,
.write = write_zero,
+ .aio_read = aio_read_zero,
+ .aio_write = aio_write_zero,
.mmap = mmap_zero,
};
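[Illustrative sketch by the editor, not part of this merge: a userspace libaio program that exercises the aio_read handler this patch wires up for /dev/zero. The buffer size and single-event setup are arbitrary choices; build with -laio.]

/* aio_zero.c - illustrative only, not part of the patch. */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <libaio.h>

int main(void)
{
	io_context_t ctx = 0;
	struct iocb cb, *cbs[1] = { &cb };
	struct io_event ev;
	char buf[4096];
	int fd, ret;

	fd = open("/dev/zero", O_RDONLY);
	if (fd < 0 || io_setup(1, &ctx) < 0) {
		perror("setup");
		return 1;
	}
	io_prep_pread(&cb, fd, buf, sizeof(buf), 0);	/* queue one 4KB read */
	ret = io_submit(ctx, 1, cbs);
	if (ret != 1) {
		fprintf(stderr, "io_submit returned %d\n", ret);
		return 1;
	}
	ret = io_getevents(ctx, 1, 1, &ev, NULL);	/* wait for completion */
	printf("got %d event(s), res=%ld\n", ret, (long)ev.res);
	io_destroy(ctx);
	close(fd);
	return 0;
}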
diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c
index 59d6b9bf204b..cf71f1d627fe 100644
--- a/drivers/gpu/drm/drm_fb_helper.c
+++ b/drivers/gpu/drm/drm_fb_helper.c
@@ -399,6 +399,14 @@ static void drm_fb_helper_dpms(struct fb_info *info, int dpms_mode)
return;
/*
+ * fbdev->blank can be called from irq context in case of a panic.
+ * Since we already have our own special panic handler which will
+ * restore the fbdev console mode completely, just bail out early.
+ */
+ if (oops_in_progress)
+ return;
+
+ /*
* For each CRTC in this fb, turn the connectors on/off.
*/
drm_modeset_lock_all(dev);
diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c
index aed8afee56da..6d7f453b4d05 100644
--- a/drivers/infiniband/hw/ipath/ipath_file_ops.c
+++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c
@@ -40,6 +40,7 @@
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/io.h>
+#include <linux/aio.h>
#include <linux/jiffies.h>
#include <linux/cpu.h>
#include <asm/pgtable.h>
diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c
index 4f7aa301b3b1..b56c9428f3c5 100644
--- a/drivers/infiniband/hw/qib/qib_file_ops.c
+++ b/drivers/infiniband/hw/qib/qib_file_ops.c
@@ -39,7 +39,7 @@
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/io.h>
-#include <linux/uio.h>
+#include <linux/aio.h>
#include <linux/jiffies.h>
#include <asm/pgtable.h>
#include <linux/delay.h>
diff --git a/drivers/leds/leds-ot200.c b/drivers/leds/leds-ot200.c
index ee14662ed5ce..98cae529373f 100644
--- a/drivers/leds/leds-ot200.c
+++ b/drivers/leds/leds-ot200.c
@@ -47,37 +47,37 @@ static struct ot200_led leds[] = {
{
.name = "led_1",
.port = 0x49,
- .mask = BIT(7),
+ .mask = BIT(6),
},
{
.name = "led_2",
.port = 0x49,
- .mask = BIT(6),
+ .mask = BIT(5),
},
{
.name = "led_3",
.port = 0x49,
- .mask = BIT(5),
+ .mask = BIT(4),
},
{
.name = "led_4",
.port = 0x49,
- .mask = BIT(4),
+ .mask = BIT(3),
},
{
.name = "led_5",
.port = 0x49,
- .mask = BIT(3),
+ .mask = BIT(2),
},
{
.name = "led_6",
.port = 0x49,
- .mask = BIT(2),
+ .mask = BIT(1),
},
{
.name = "led_7",
.port = 0x49,
- .mask = BIT(1),
+ .mask = BIT(0),
}
};
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 7e469260fe5e..e52880f6b589 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -695,7 +695,7 @@ static void end_clone_bio(struct bio *clone, int error)
* Do not use blk_end_request() here, because it may complete
* the original request before the clone, and break the ordering.
*/
- blk_update_request(tio->orig, 0, nr_bytes);
+ blk_update_request(tio->orig, 0, nr_bytes, NULL);
}
/*
diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c
index 970a236b147a..a65621c42170 100644
--- a/drivers/rtc/rtc-ds1307.c
+++ b/drivers/rtc/rtc-ds1307.c
@@ -4,6 +4,7 @@
* Copyright (C) 2005 James Chapman (ds1337 core)
* Copyright (C) 2006 David Brownell
* Copyright (C) 2009 Matthias Fuchs (rx8025 support)
+ * Copyright (C) 2012 Bertrand Achard (nvram access fixes)
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -196,7 +197,7 @@ static s32 ds1307_read_block_data_once(const struct i2c_client *client,
static s32 ds1307_read_block_data(const struct i2c_client *client, u8 command,
u8 length, u8 *values)
{
- u8 oldvalues[I2C_SMBUS_BLOCK_MAX];
+ u8 oldvalues[255];
s32 ret;
int tries = 0;
@@ -222,7 +223,7 @@ static s32 ds1307_read_block_data(const struct i2c_client *client, u8 command,
static s32 ds1307_write_block_data(const struct i2c_client *client, u8 command,
u8 length, const u8 *values)
{
- u8 currvalues[I2C_SMBUS_BLOCK_MAX];
+ u8 currvalues[255];
int tries = 0;
dev_dbg(&client->dev, "ds1307_write_block_data (length=%d)\n", length);
@@ -250,6 +251,57 @@ static s32 ds1307_write_block_data(const struct i2c_client *client, u8 command,
/*----------------------------------------------------------------------*/
+/* These RTC devices are not designed to be connected to an SMBus adapter.
+   SMBus limits block operation length to 32 bytes, whereas I2C buses have
+   no such limit. As a result, accesses may exceed 32 bytes; in that case,
+   split them into smaller blocks. */
+
+static s32 ds1307_native_smbus_write_block_data(const struct i2c_client *client,
+ u8 command, u8 length, const u8 *values)
+{
+ u8 suboffset = 0;
+
+ if (length <= I2C_SMBUS_BLOCK_MAX)
+ return i2c_smbus_write_i2c_block_data(client,
+ command, length, values);
+
+ while (suboffset < length) {
+ s32 retval = i2c_smbus_write_i2c_block_data(client,
+ command + suboffset,
+ min(I2C_SMBUS_BLOCK_MAX, length - suboffset),
+ values + suboffset);
+ if (retval < 0)
+ return retval;
+
+ suboffset += I2C_SMBUS_BLOCK_MAX;
+ }
+ return length;
+}
+
+static s32 ds1307_native_smbus_read_block_data(const struct i2c_client *client,
+ u8 command, u8 length, u8 *values)
+{
+ u8 suboffset = 0;
+
+ if (length <= I2C_SMBUS_BLOCK_MAX)
+ return i2c_smbus_read_i2c_block_data(client,
+ command, length, values);
+
+ while (suboffset < length) {
+ s32 retval = i2c_smbus_read_i2c_block_data(client,
+ command + suboffset,
+ min(I2C_SMBUS_BLOCK_MAX, length - suboffset),
+ values + suboffset);
+ if (retval < 0)
+ return retval;
+
+ suboffset += I2C_SMBUS_BLOCK_MAX;
+ }
+ return length;
+}
+
+/*----------------------------------------------------------------------*/
+
/*
* The IRQ logic includes a "real" handler running in IRQ context just
* long enough to schedule this workqueue entry. We need a task context
@@ -646,8 +698,8 @@ static int ds1307_probe(struct i2c_client *client,
buf = ds1307->regs;
if (i2c_check_functionality(adapter, I2C_FUNC_SMBUS_I2C_BLOCK)) {
- ds1307->read_block_data = i2c_smbus_read_i2c_block_data;
- ds1307->write_block_data = i2c_smbus_write_i2c_block_data;
+ ds1307->read_block_data = ds1307_native_smbus_read_block_data;
+ ds1307->write_block_data = ds1307_native_smbus_write_block_data;
} else {
ds1307->read_block_data = ds1307_read_block_data;
ds1307->write_block_data = ds1307_write_block_data;
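
For readers unfamiliar with the SMBus constraint, the helpers added above follow a simple chunking pattern. A minimal standalone C sketch of that pattern (userspace only; xfer_chunk() is a hypothetical stand-in for i2c_smbus_write_i2c_block_data(), not the driver code) looks like this:

#include <stdio.h>

#define SMBUS_BLOCK_MAX 32

/* Hypothetical stand-in for the SMBus block write: just report what would
 * go on the wire. */
static int xfer_chunk(int reg, int len, const unsigned char *buf)
{
	printf("write %2d bytes at register 0x%02x\n", len, reg);
	return len;
}

/* Split a transfer that exceeds the SMBus block limit into <= 32-byte pieces,
 * advancing the register address along with the data, as the driver does. */
static int write_block(int reg, int len, const unsigned char *buf)
{
	int off = 0;

	if (len <= SMBUS_BLOCK_MAX)
		return xfer_chunk(reg, len, buf);

	while (off < len) {
		int chunk = len - off;

		if (chunk > SMBUS_BLOCK_MAX)
			chunk = SMBUS_BLOCK_MAX;
		if (xfer_chunk(reg + off, chunk, buf + off) < 0)
			return -1;
		off += chunk;
	}
	return len;
}

int main(void)
{
	unsigned char nvram[56] = { 0 };

	/* A 56-byte NVRAM access is split into a 32-byte and a 24-byte chunk. */
	return write_block(0x08, (int)sizeof(nvram), nvram) < 0;
}

Reads are split the same way, which is why both new helpers in the hunk above share the same loop shape.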
diff --git a/drivers/rtc/rtc-pxa.c b/drivers/rtc/rtc-pxa.c
index 03c85ee719a7..b7e462978609 100644
--- a/drivers/rtc/rtc-pxa.c
+++ b/drivers/rtc/rtc-pxa.c
@@ -19,6 +19,7 @@
*
*/
+#include <linux/delay.h>
#include <linux/init.h>
#include <linux/platform_device.h>
#include <linux/module.h>
@@ -80,22 +81,29 @@
#define RYAR1 0x1c
#define RTCPICR 0x34
#define PIAR 0x38
+#define PSBR_RTC 0x00
#define rtc_readl(pxa_rtc, reg) \
__raw_readl((pxa_rtc)->base + (reg))
#define rtc_writel(pxa_rtc, reg, value) \
__raw_writel((value), (pxa_rtc)->base + (reg))
+#define rtc_readl_psbr(pxa_rtc, reg) \
+ __raw_readl((pxa_rtc)->base_psbr + (reg))
+#define rtc_writel_psbr(pxa_rtc, reg, value) \
+ __raw_writel((value), (pxa_rtc)->base_psbr + (reg))
struct pxa_rtc {
struct resource *ress;
+ struct resource *ress_psbr;
void __iomem *base;
+ void __iomem *base_psbr;
int irq_1Hz;
int irq_Alrm;
struct rtc_device *rtc;
spinlock_t lock; /* Protects this structure */
};
-
+static struct pxa_rtc *rtc_info;
static u32 ryxr_calc(struct rtc_time *tm)
{
return ((tm->tm_year + 1900) << RYxR_YEAR_S)
@@ -117,7 +125,7 @@ static void tm_calc(u32 rycr, u32 rdcr, struct rtc_time *tm)
tm->tm_year = ((rycr & RYxR_YEAR_MASK) >> RYxR_YEAR_S) - 1900;
tm->tm_mon = (((rycr & RYxR_MONTH_MASK) >> RYxR_MONTH_S)) - 1;
tm->tm_mday = (rycr & RYxR_DAY_MASK);
- tm->tm_wday = ((rycr & RDxR_DOW_MASK) >> RDxR_DOW_S) - 1;
+ tm->tm_wday = ((rdcr & RDxR_DOW_MASK) >> RDxR_DOW_S) - 1;
tm->tm_hour = (rdcr & RDxR_HOUR_MASK) >> RDxR_HOUR_S;
tm->tm_min = (rdcr & RDxR_MIN_MASK) >> RDxR_MIN_S;
tm->tm_sec = rdcr & RDxR_SEC_MASK;
@@ -175,7 +183,6 @@ static irqreturn_t pxa_rtc_irq(int irq, void *dev_id)
/* enable back rtc interrupts */
rtc_writel(pxa_rtc, RTSR, rtsr & ~RTSR_TRIG_MASK);
-
spin_unlock(&pxa_rtc->lock);
return IRQ_HANDLED;
}
@@ -250,12 +257,45 @@ static int pxa_rtc_read_time(struct device *dev, struct rtc_time *tm)
static int pxa_rtc_set_time(struct device *dev, struct rtc_time *tm)
{
struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev);
-
+ /* Sequence to write the PXA RTC registers RCNR, RDCR and RYCR:
+ * 1. set the PSBR[RWE] bit; takes 2x 32 kHz clock cycles to complete
+ * 2. write the RTC register; takes 2x 32 kHz clock cycles to complete
+ * 3. clear the PSBR[RWE] bit; takes 2x 32 kHz clock cycles to complete
+ */
+ if ((tm->tm_year < 70) || (tm->tm_year > 138))
+ return -EINVAL;
+ rtc_writel_psbr(rtc_info, PSBR_RTC, 0x01);
+ udelay(100);
rtc_writel(pxa_rtc, RYCR, ryxr_calc(tm));
rtc_writel(pxa_rtc, RDCR, rdxr_calc(tm));
+ udelay(100);
+ rtc_writel_psbr(rtc_info, PSBR_RTC, 0x00);
+ udelay(100);
+ pxa_rtc_read_time(dev, tm);
+ dev_info(dev, "tm.year = %d, tm.month = %d, tm.day = %d\n",
+ tm->tm_year + 1900, tm->tm_mon, tm->tm_mday);
+ return 0;
+}
+int pxa_rtc_sync_time(unsigned int ticks)
+{
+ /* Sequence to write the PXA RTC registers RCNR, RDCR and RYCR:
+ * 1. set the PSBR[RWE] bit; takes 2x 32 kHz clock cycles to complete
+ * 2. write the RTC register; takes 2x 32 kHz clock cycles to complete
+ * 3. clear the PSBR[RWE] bit; takes 2x 32 kHz clock cycles to complete
+ */
+ struct rtc_time tm;
+ rtc_time_to_tm(ticks, &tm);
+ rtc_writel_psbr(rtc_info, PSBR_RTC, 0x01);
+ udelay(100);
+ rtc_writel(rtc_info, RYCR, ryxr_calc(&tm));
+ rtc_writel(rtc_info, RDCR, rdxr_calc(&tm));
+ udelay(100);
+ rtc_writel_psbr(rtc_info, PSBR_RTC, 0x00);
+ udelay(100);
return 0;
}
+EXPORT_SYMBOL(pxa_rtc_sync_time);
static int pxa_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm)
{
@@ -327,7 +367,7 @@ static int __init pxa_rtc_probe(struct platform_device *pdev)
pxa_rtc = kzalloc(sizeof(struct pxa_rtc), GFP_KERNEL);
if (!pxa_rtc)
return -ENOMEM;
-
+ rtc_info = pxa_rtc;
spin_lock_init(&pxa_rtc->lock);
platform_set_drvdata(pdev, pxa_rtc);
@@ -337,6 +377,11 @@ static int __init pxa_rtc_probe(struct platform_device *pdev)
dev_err(dev, "No I/O memory resource defined\n");
goto err_ress;
}
+ pxa_rtc->ress_psbr = platform_get_resource(pdev, IORESOURCE_MEM, 1);
+ if (!pxa_rtc->ress_psbr) {
+ dev_err(dev, "No I/O memory resource defined\n");
+ goto err_ress;
+ }
pxa_rtc->irq_1Hz = platform_get_irq(pdev, 0);
if (pxa_rtc->irq_1Hz < 0) {
@@ -348,7 +393,6 @@ static int __init pxa_rtc_probe(struct platform_device *pdev)
dev_err(dev, "No alarm IRQ resource defined\n");
goto err_ress;
}
- pxa_rtc_open(dev);
ret = -ENOMEM;
pxa_rtc->base = ioremap(pxa_rtc->ress->start,
resource_size(pxa_rtc->ress));
@@ -357,6 +401,12 @@ static int __init pxa_rtc_probe(struct platform_device *pdev)
goto err_map;
}
+ pxa_rtc->base_psbr = ioremap(pxa_rtc->ress_psbr->start,
+ resource_size(pxa_rtc->ress_psbr));
+ if (!pxa_rtc->base_psbr) {
+ dev_err(&pdev->dev, "Unable to map pxa RTC PSBR I/O memory\n");
+ goto err_map;
+ }
/*
* If the clock divider is uninitialized then reset it to the
* default value to get the 1Hz clock.
@@ -379,7 +429,7 @@ static int __init pxa_rtc_probe(struct platform_device *pdev)
}
device_init_wakeup(dev, 1);
-
+ pxa_rtc_open(dev);
return 0;
err_rtc_reg:
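
The set_time/sync_time hunks above rely on the PSBR[RWE] unlock sequence described in the comment. A rough standalone sketch of that sequence, with the registers mocked as plain variables and usleep() standing in for udelay() (illustrative only, not the driver code):

#include <stdio.h>
#include <unistd.h>

#define PSBR_RWE 0x1

static unsigned int psbr;         /* mocked Power Manager Standby Register */
static unsigned int rycr, rdcr;   /* mocked RTC year/day count registers */

/* Stand-in for the driver's udelay(100): give each step time to settle. */
static void settle(void)
{
	usleep(100);
}

static void rtc_set_counters(unsigned int year_bits, unsigned int day_bits)
{
	psbr |= PSBR_RWE;   /* 1. enable writes to the RTC count registers */
	settle();
	rycr = year_bits;   /* 2. write the count registers */
	rdcr = day_bits;
	settle();
	psbr &= ~PSBR_RWE;  /* 3. lock the registers again */
	settle();
}

int main(void)
{
	rtc_set_counters(0x1f4a01, 0x2d3c0);
	printf("RYCR=0x%x RDCR=0x%x PSBR=0x%x\n", rycr, rdcr, psbr);
	return 0;
}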
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 9f0c46547459..df5e961484e1 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -35,6 +35,7 @@ static int sg_version_num = 30534; /* 2 digits for each component */
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
+#include <linux/aio.h>
#include <linux/errno.h>
#include <linux/mtio.h>
#include <linux/ioctl.h>
diff --git a/drivers/staging/android/logger.c b/drivers/staging/android/logger.c
index dbc63cbb4d3a..b4a313cdbed5 100644
--- a/drivers/staging/android/logger.c
+++ b/drivers/staging/android/logger.c
@@ -28,6 +28,7 @@
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/vmalloc.h>
+#include <linux/aio.h>
#include "logger.h"
#include <asm/ioctls.h>
diff --git a/drivers/usb/gadget/amd5536udc.c b/drivers/usb/gadget/amd5536udc.c
index 75973f33a4c8..142616a42f45 100644
--- a/drivers/usb/gadget/amd5536udc.c
+++ b/drivers/usb/gadget/amd5536udc.c
@@ -3102,7 +3102,7 @@ static int init_dma_pools(struct udc *dev)
}
/* DMA setup */
- dev->data_requests = dma_pool_create("data_requests", NULL,
+ dev->data_requests = dma_pool_create("data_requests", &dev->pdev->dev,
sizeof(struct udc_data_dma), 0, 0);
if (!dev->data_requests) {
DBG(dev, "can't get request data pool\n");
@@ -3114,7 +3114,7 @@ static int init_dma_pools(struct udc *dev)
dev->ep[UDC_EP0IN_IX].dma = &dev->regs->ctl;
/* dma desc for setup data */
- dev->stp_requests = dma_pool_create("setup requests", NULL,
+ dev->stp_requests = dma_pool_create("setup requests", &dev->pdev->dev,
sizeof(struct udc_stp_dma), 0, 0);
if (!dev->stp_requests) {
DBG(dev, "can't get stp request pool\n");
@@ -3235,6 +3235,12 @@ static int udc_pci_probe(
pci_set_master(pdev);
pci_try_set_mwi(pdev);
+ dev->phys_addr = resource;
+ dev->irq = pdev->irq;
+ dev->pdev = pdev;
+ dev->gadget.dev.parent = &pdev->dev;
+ dev->gadget.dev.dma_mask = pdev->dev.dma_mask;
+
/* init dma pools */
if (use_dma) {
retval = init_dma_pools(dev);
@@ -3242,12 +3248,6 @@ static int udc_pci_probe(
goto finished;
}
- dev->phys_addr = resource;
- dev->irq = pdev->irq;
- dev->pdev = pdev;
- dev->gadget.dev.parent = &pdev->dev;
- dev->gadget.dev.dma_mask = pdev->dev.dma_mask;
-
/* general probing */
if (udc_probe(dev) == 0)
return 0;
diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c
index 8ac840f25ba9..f4bbc7d39ae9 100644
--- a/drivers/usb/gadget/inode.c
+++ b/drivers/usb/gadget/inode.c
@@ -24,6 +24,8 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/poll.h>
+#include <linux/mmu_context.h>
+#include <linux/aio.h>
#include <linux/device.h>
#include <linux/moduleparam.h>
@@ -513,6 +515,9 @@ static long ep_ioctl(struct file *fd, unsigned code, unsigned long value)
struct kiocb_priv {
struct usb_request *req;
struct ep_data *epdata;
+ struct kiocb *iocb;
+ struct mm_struct *mm;
+ struct work_struct work;
void *buf;
const struct iovec *iv;
unsigned long nr_segs;
@@ -528,7 +533,6 @@ static int ep_aio_cancel(struct kiocb *iocb, struct io_event *e)
local_irq_disable();
epdata = priv->epdata;
// spin_lock(&epdata->dev->lock);
- kiocbSetCancelled(iocb);
if (likely(epdata && epdata->ep && priv->req))
value = usb_ep_dequeue (epdata->ep, priv->req);
else
@@ -540,15 +544,12 @@ static int ep_aio_cancel(struct kiocb *iocb, struct io_event *e)
return value;
}
-static ssize_t ep_aio_read_retry(struct kiocb *iocb)
+static ssize_t ep_copy_to_user(struct kiocb_priv *priv)
{
- struct kiocb_priv *priv = iocb->private;
ssize_t len, total;
void *to_copy;
int i;
- /* we "retry" to get the right mm context for this: */
-
/* copy stuff into user buffers */
total = priv->actual;
len = 0;
@@ -568,9 +569,26 @@ static ssize_t ep_aio_read_retry(struct kiocb *iocb)
if (total == 0)
break;
}
+
+ return len;
+}
+
+static void ep_user_copy_worker(struct work_struct *work)
+{
+ struct kiocb_priv *priv = container_of(work, struct kiocb_priv, work);
+ struct mm_struct *mm = priv->mm;
+ struct kiocb *iocb = priv->iocb;
+ size_t ret;
+
+ use_mm(mm);
+ ret = ep_copy_to_user(priv);
+ unuse_mm(mm);
+
+ /* completing the iocb can drop the ctx and mm, don't touch mm after */
+ aio_complete(iocb, ret, ret);
+
kfree(priv->buf);
kfree(priv);
- return len;
}
static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req)
@@ -596,14 +614,14 @@ static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req)
aio_complete(iocb, req->actual ? req->actual : req->status,
req->status);
} else {
- /* retry() won't report both; so we hide some faults */
+ /* ep_copy_to_user() won't report both; we hide some faults */
if (unlikely(0 != req->status))
DBG(epdata->dev, "%s fault %d len %d\n",
ep->name, req->status, req->actual);
priv->buf = req->buf;
priv->actual = req->actual;
- kick_iocb(iocb);
+ schedule_work(&priv->work);
}
spin_unlock(&epdata->dev->lock);
@@ -633,8 +651,10 @@ fail:
return value;
}
iocb->private = priv;
+ priv->iocb = iocb;
priv->iv = iv;
priv->nr_segs = nr_segs;
+ INIT_WORK(&priv->work, ep_user_copy_worker);
value = get_ready_ep(iocb->ki_filp->f_flags, epdata);
if (unlikely(value < 0)) {
@@ -642,10 +662,11 @@ fail:
goto fail;
}
- iocb->ki_cancel = ep_aio_cancel;
+ kiocb_set_cancel_fn(iocb, ep_aio_cancel);
get_ep(epdata);
priv->epdata = epdata;
priv->actual = 0;
+ priv->mm = current->mm; /* mm teardown waits for iocbs in exit_aio() */
/* each kiocb is coupled to one usb_request, but we can't
* allocate or submit those if the host disconnected.
@@ -674,7 +695,7 @@ fail:
kfree(priv);
put_ep(epdata);
} else
- value = (iv ? -EIOCBRETRY : -EIOCBQUEUED);
+ value = -EIOCBQUEUED;
return value;
}
@@ -692,7 +713,6 @@ ep_aio_read(struct kiocb *iocb, const struct iovec *iov,
if (unlikely(!buf))
return -ENOMEM;
- iocb->ki_retry = ep_aio_read_retry;
return ep_aio_rwtail(iocb, buf, iocb->ki_left, epdata, iov, nr_segs);
}
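
The gadgetfs hunks above replace the old retry path with a deferred copy: the completion callback only queues work, and a worker running in task context (with the submitter's mm borrowed via use_mm()) performs the copy and completes the iocb. A rough userspace analog of that hand-off, using a pthread worker and hypothetical names rather than the gadgetfs code:

#include <pthread.h>
#include <stdio.h>
#include <string.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int done;

struct request {
	char data[64];		/* what the "hardware" produced */
	char *user_buf;		/* where the submitter wants it copied */
};

static void *copy_worker(void *arg)
{
	struct request *req = arg;

	/* task context: safe to touch the submitter's buffer here */
	memcpy(req->user_buf, req->data, sizeof(req->data));

	pthread_mutex_lock(&lock);
	done = 1;
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	static char user_buf[64];
	static struct request req;
	pthread_t worker;

	req.user_buf = user_buf;
	snprintf(req.data, sizeof(req.data), "payload from endpoint");

	/* "completion callback": only schedules the copy, never does it inline */
	pthread_create(&worker, NULL, copy_worker, &req);

	pthread_mutex_lock(&lock);
	while (!done)
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);

	pthread_join(worker, NULL);
	printf("%s\n", user_buf);
	return 0;
}

The property mirrored here is that the completion path never touches the user buffer directly; only the worker, running with the right address space, does.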
diff --git a/drivers/video/cyber2000fb.c b/drivers/video/cyber2000fb.c
index 57886787ead0..e78d9f2233b8 100644
--- a/drivers/video/cyber2000fb.c
+++ b/drivers/video/cyber2000fb.c
@@ -518,6 +518,9 @@ static void cyber2000fb_set_timing(struct cfb_info *cfb, struct par_info *hw)
cyber2000_grphw(0xb9, 0x00, cfb);
spin_unlock(&cfb->reg_b0_lock);
+ /* wait (for the PLL?) to avoid palette corruption at higher clocks */
+ msleep(1000);
+
cfb->ramdac_ctrl = hw->ramdac;
cyber2000fb_write_ramdac_ctrl(cfb);
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 0ad61c6a65a5..055562c580b4 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -33,6 +33,7 @@
#include <linux/pagemap.h>
#include <linux/idr.h>
#include <linux/sched.h>
+#include <linux/aio.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 7e03eadb40c0..a890db4b9898 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -14,6 +14,7 @@
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
+#include <linux/aio.h>
#include "internal.h"
static int afs_write_back_from_locked_page(struct afs_writeback *wb,
diff --git a/fs/aio.c b/fs/aio.c
index 3f941f2a3059..2512232189d0 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -8,6 +8,8 @@
*
* See ../COPYING for licensing terms.
*/
+#define pr_fmt(fmt) "%s: " fmt, __func__
+
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
@@ -18,14 +20,14 @@
#include <linux/backing-dev.h>
#include <linux/uio.h>
-#define DEBUG 0
-
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/mman.h>
+#include <linux/bio.h>
#include <linux/mmu_context.h>
+#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/timer.h>
#include <linux/aio.h>
@@ -35,15 +37,94 @@
#include <linux/eventfd.h>
#include <linux/blkdev.h>
#include <linux/compat.h>
+#include <linux/percpu-refcount.h>
#include <asm/kmap_types.h>
#include <asm/uaccess.h>
-#if DEBUG > 1
-#define dprintk printk
-#else
-#define dprintk(x...) do { ; } while (0)
-#endif
+#define AIO_RING_MAGIC 0xa10a10a1
+#define AIO_RING_COMPAT_FEATURES 1
+#define AIO_RING_INCOMPAT_FEATURES 0
+struct aio_ring {
+ unsigned id; /* kernel internal index number */
+ unsigned nr; /* number of io_events */
+ unsigned head;
+ unsigned tail;
+
+ unsigned magic;
+ unsigned compat_features;
+ unsigned incompat_features;
+ unsigned header_length; /* size of aio_ring */
+
+
+ struct io_event io_events[0];
+}; /* 128 bytes + ring size */
+
+#define AIO_RING_PAGES 8
+
+struct kioctx_cpu {
+ unsigned reqs_available;
+};
+
+struct kioctx {
+ struct percpu_ref users;
+
+ /* This needs improving */
+ unsigned long user_id;
+ struct hlist_node list;
+
+ struct __percpu kioctx_cpu *cpu;
+
+ unsigned req_batch;
+
+ unsigned nr;
+
+ /* sys_io_setup currently limits this to an unsigned int */
+ unsigned max_reqs;
+
+ unsigned long mmap_base;
+ unsigned long mmap_size;
+
+ struct page **ring_pages;
+ long nr_pages;
+
+ struct rcu_head rcu_head;
+ struct work_struct rcu_work;
+
+ struct {
+ atomic_t reqs_available;
+ } ____cacheline_aligned_in_smp;
+
+ struct {
+ spinlock_t ctx_lock;
+ struct list_head active_reqs; /* used for cancellation */
+ } ____cacheline_aligned_in_smp;
+
+ struct {
+ struct mutex ring_lock;
+ wait_queue_head_t wait;
+
+ /*
+ * Copy of the real tail, that aio_complete uses - to reduce
+ * cacheline bouncing. The real tail will tend to be much more
+ * contended - since typically events are delivered one at a
+ * time, and then aio_read_events() slurps them up a bunch at a
+ * time - so it's helpful if aio_read_events() isn't also
+ * contending for the tail. So, aio_complete() updates
+ * shadow_tail whenever it updates tail.
+ *
+ * Also needed because tail is used as a hacky lock and isn't
+ * always the real tail.
+ */
+ unsigned shadow_tail;
+ } ____cacheline_aligned_in_smp;
+
+ struct {
+ unsigned tail;
+ } ____cacheline_aligned_in_smp;
+
+ struct page *internal_pages[AIO_RING_PAGES];
+};
/*------ sysctl variables----*/
static DEFINE_SPINLOCK(aio_nr_lock);
@@ -54,11 +135,6 @@ unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio request
static struct kmem_cache *kiocb_cachep;
static struct kmem_cache *kioctx_cachep;
-static struct workqueue_struct *aio_wq;
-
-static void aio_kick_handler(struct work_struct *);
-static void aio_queue_work(struct kioctx *);
-
/* aio_setup
* Creates the slab caches used by the aio routines, panic on
* failure as this is done early during the boot sequence.
@@ -68,10 +144,7 @@ static int __init aio_setup(void)
kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
- aio_wq = alloc_workqueue("aio", 0, 1); /* used to limit concurrency */
- BUG_ON(!aio_wq);
-
- pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
+ pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page));
return 0;
}
@@ -79,31 +152,29 @@ __initcall(aio_setup);
static void aio_free_ring(struct kioctx *ctx)
{
- struct aio_ring_info *info = &ctx->ring_info;
long i;
- for (i=0; i<info->nr_pages; i++)
- put_page(info->ring_pages[i]);
+ for (i = 0; i < ctx->nr_pages; i++)
+ put_page(ctx->ring_pages[i]);
- if (info->mmap_size) {
- BUG_ON(ctx->mm != current->mm);
- vm_munmap(info->mmap_base, info->mmap_size);
- }
+ if (ctx->mmap_size)
+ vm_munmap(ctx->mmap_base, ctx->mmap_size);
- if (info->ring_pages && info->ring_pages != info->internal_pages)
- kfree(info->ring_pages);
- info->ring_pages = NULL;
- info->nr = 0;
+ if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages)
+ kfree(ctx->ring_pages);
}
static int aio_setup_ring(struct kioctx *ctx)
{
struct aio_ring *ring;
- struct aio_ring_info *info = &ctx->ring_info;
unsigned nr_events = ctx->max_reqs;
+ struct mm_struct *mm = current->mm;
unsigned long size, populate;
int nr_pages;
+ nr_events = max(nr_events, num_possible_cpus() * 4);
+ nr_events *= 2;
+
/* Compensate for the ring buffer's head/tail overlap entry */
nr_events += 2; /* 1 is required, 2 for good luck */
@@ -116,46 +187,44 @@ static int aio_setup_ring(struct kioctx *ctx)
nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event);
- info->nr = 0;
- info->ring_pages = info->internal_pages;
+ ctx->nr = 0;
+ ctx->ring_pages = ctx->internal_pages;
if (nr_pages > AIO_RING_PAGES) {
- info->ring_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
- if (!info->ring_pages)
+ ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
+ GFP_KERNEL);
+ if (!ctx->ring_pages)
return -ENOMEM;
}
- info->mmap_size = nr_pages * PAGE_SIZE;
- dprintk("attempting mmap of %lu bytes\n", info->mmap_size);
- down_write(&ctx->mm->mmap_sem);
- info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size,
- PROT_READ|PROT_WRITE,
- MAP_ANONYMOUS|MAP_PRIVATE, 0,
- &populate);
- if (IS_ERR((void *)info->mmap_base)) {
- up_write(&ctx->mm->mmap_sem);
- info->mmap_size = 0;
+ ctx->mmap_size = nr_pages * PAGE_SIZE;
+ pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size);
+ down_write(&mm->mmap_sem);
+ ctx->mmap_base = do_mmap_pgoff(NULL, 0, ctx->mmap_size,
+ PROT_READ|PROT_WRITE,
+ MAP_ANONYMOUS|MAP_PRIVATE, 0, &populate);
+ if (IS_ERR((void *)ctx->mmap_base)) {
+ up_write(&mm->mmap_sem);
+ ctx->mmap_size = 0;
aio_free_ring(ctx);
return -EAGAIN;
}
- dprintk("mmap address: 0x%08lx\n", info->mmap_base);
- info->nr_pages = get_user_pages(current, ctx->mm,
- info->mmap_base, nr_pages,
- 1, 0, info->ring_pages, NULL);
- up_write(&ctx->mm->mmap_sem);
+ pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
+ ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages,
+ 1, 0, ctx->ring_pages, NULL);
+ up_write(&mm->mmap_sem);
- if (unlikely(info->nr_pages != nr_pages)) {
+ if (unlikely(ctx->nr_pages != nr_pages)) {
aio_free_ring(ctx);
return -EAGAIN;
}
if (populate)
- mm_populate(info->mmap_base, populate);
-
- ctx->user_id = info->mmap_base;
+ mm_populate(ctx->mmap_base, populate);
- info->nr = nr_events; /* trusted copy */
+ ctx->user_id = ctx->mmap_base;
+ ctx->nr = nr_events; /* trusted copy */
- ring = kmap_atomic(info->ring_pages[0]);
+ ring = kmap_atomic(ctx->ring_pages[0]);
ring->nr = nr_events; /* user copy */
ring->id = ctx->user_id;
ring->head = ring->tail = 0;
@@ -164,72 +233,145 @@ static int aio_setup_ring(struct kioctx *ctx)
ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;
ring->header_length = sizeof(struct aio_ring);
kunmap_atomic(ring);
+ flush_dcache_page(ctx->ring_pages[0]);
return 0;
}
-
-/* aio_ring_event: returns a pointer to the event at the given index from
- * kmap_atomic(). Release the pointer with put_aio_ring_event();
- */
#define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event))
#define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
#define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
-#define aio_ring_event(info, nr) ({ \
- unsigned pos = (nr) + AIO_EVENTS_OFFSET; \
- struct io_event *__event; \
- __event = kmap_atomic( \
- (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE]); \
- __event += pos % AIO_EVENTS_PER_PAGE; \
- __event; \
-})
-
-#define put_aio_ring_event(event) do { \
- struct io_event *__event = (event); \
- (void)__event; \
- kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK)); \
-} while(0)
-
-static void ctx_rcu_free(struct rcu_head *head)
+void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel)
+{
+ struct kioctx *ctx = req->ki_ctx;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->ctx_lock, flags);
+
+ if (!req->ki_list.next)
+ list_add(&req->ki_list, &ctx->active_reqs);
+
+ req->ki_cancel = cancel;
+
+ spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+}
+EXPORT_SYMBOL(kiocb_set_cancel_fn);
+
+static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb,
+ struct io_event *res)
+{
+ kiocb_cancel_fn *old, *cancel;
+ int ret = -EINVAL;
+
+ /*
+ * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it
+ * actually has a cancel function, hence the cmpxchg()
+ */
+
+ cancel = ACCESS_ONCE(kiocb->ki_cancel);
+ do {
+ if (!cancel || cancel == KIOCB_CANCELLED)
+ return ret;
+
+ old = cancel;
+ cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED);
+ } while (cancel != old);
+
+ atomic_inc(&kiocb->ki_users);
+ spin_unlock_irq(&ctx->ctx_lock);
+
+ memset(res, 0, sizeof(*res));
+ res->obj = (u64)(unsigned long)kiocb->ki_obj.user;
+ res->data = kiocb->ki_user_data;
+ ret = cancel(kiocb, res);
+
+ spin_lock_irq(&ctx->ctx_lock);
+
+ return ret;
+}
+
+static void free_ioctx_rcu(struct rcu_head *head)
{
struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
+
+ free_percpu(ctx->cpu);
kmem_cache_free(kioctx_cachep, ctx);
}
-/* __put_ioctx
- * Called when the last user of an aio context has gone away,
- * and the struct needs to be freed.
+/*
+ * When this function runs, the kioctx has been removed from the "hash table"
+ * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
+ * now it's safe to cancel any that need to be.
*/
-static void __put_ioctx(struct kioctx *ctx)
+static void free_ioctx(struct kioctx *ctx)
{
- unsigned nr_events = ctx->max_reqs;
- BUG_ON(ctx->reqs_active);
+ struct aio_ring *ring;
+ struct io_event res;
+ struct kiocb *req;
+ unsigned cpu, head, avail;
- cancel_delayed_work_sync(&ctx->wq);
- aio_free_ring(ctx);
- mmdrop(ctx->mm);
- ctx->mm = NULL;
- if (nr_events) {
- spin_lock(&aio_nr_lock);
- BUG_ON(aio_nr - nr_events > aio_nr);
- aio_nr -= nr_events;
- spin_unlock(&aio_nr_lock);
+ spin_lock_irq(&ctx->ctx_lock);
+
+ while (!list_empty(&ctx->active_reqs)) {
+ req = list_first_entry(&ctx->active_reqs,
+ struct kiocb, ki_list);
+
+ list_del_init(&req->ki_list);
+ kiocb_cancel(ctx, req, &res);
}
- pr_debug("__put_ioctx: freeing %p\n", ctx);
- call_rcu(&ctx->rcu_head, ctx_rcu_free);
-}
-static inline int try_get_ioctx(struct kioctx *kioctx)
-{
- return atomic_inc_not_zero(&kioctx->users);
+ spin_unlock_irq(&ctx->ctx_lock);
+
+ for_each_possible_cpu(cpu) {
+ struct kioctx_cpu *kcpu = per_cpu_ptr(ctx->cpu, cpu);
+
+ atomic_add(kcpu->reqs_available, &ctx->reqs_available);
+ kcpu->reqs_available = 0;
+ }
+
+ ring = kmap_atomic(ctx->ring_pages[0]);
+ head = ring->head;
+ kunmap_atomic(ring);
+
+ while (atomic_read(&ctx->reqs_available) < ctx->nr) {
+ wait_event(ctx->wait,
+ (head != ctx->shadow_tail) ||
+ (atomic_read(&ctx->reqs_available) >= ctx->nr));
+
+ avail = (head <= ctx->shadow_tail ?
+ ctx->shadow_tail : ctx->nr) - head;
+
+ atomic_add(avail, &ctx->reqs_available);
+ head += avail;
+ head %= ctx->nr;
+ }
+
+ WARN_ON(atomic_read(&ctx->reqs_available) > ctx->nr);
+
+ aio_free_ring(ctx);
+
+ spin_lock(&aio_nr_lock);
+ BUG_ON(aio_nr - ctx->max_reqs > aio_nr);
+ aio_nr -= ctx->max_reqs;
+ spin_unlock(&aio_nr_lock);
+
+ pr_debug("freeing %p\n", ctx);
+
+ /*
+ * Here the call_rcu() is between the wait_event() for reqs_active to
+ * hit 0, and freeing the ioctx.
+ *
+ * aio_complete() decrements reqs_active, but it has to touch the ioctx
+ * afterwards to issue a wakeup, so we use RCU.
+ */
+ call_rcu(&ctx->rcu_head, free_ioctx_rcu);
}
-static inline void put_ioctx(struct kioctx *kioctx)
+static void put_ioctx(struct kioctx *ctx)
{
- BUG_ON(atomic_read(&kioctx->users) <= 0);
- if (unlikely(atomic_dec_and_test(&kioctx->users)))
- __put_ioctx(kioctx);
+ if (percpu_ref_put(&ctx->users))
+ free_ioctx(ctx);
}
/* ioctx_alloc
@@ -237,7 +379,7 @@ static inline void put_ioctx(struct kioctx *kioctx)
*/
static struct kioctx *ioctx_alloc(unsigned nr_events)
{
- struct mm_struct *mm;
+ struct mm_struct *mm = current->mm;
struct kioctx *ctx;
int err = -ENOMEM;
@@ -256,21 +398,29 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
return ERR_PTR(-ENOMEM);
ctx->max_reqs = nr_events;
- mm = ctx->mm = current->mm;
- atomic_inc(&mm->mm_count);
- atomic_set(&ctx->users, 2);
+ percpu_ref_init(&ctx->users);
+ rcu_read_lock();
+ percpu_ref_get(&ctx->users);
+ rcu_read_unlock();
+
spin_lock_init(&ctx->ctx_lock);
- spin_lock_init(&ctx->ring_info.ring_lock);
+ mutex_init(&ctx->ring_lock);
init_waitqueue_head(&ctx->wait);
INIT_LIST_HEAD(&ctx->active_reqs);
- INIT_LIST_HEAD(&ctx->run_list);
- INIT_DELAYED_WORK(&ctx->wq, aio_kick_handler);
- if (aio_setup_ring(ctx) < 0)
+ ctx->cpu = alloc_percpu(struct kioctx_cpu);
+ if (!ctx->cpu)
goto out_freectx;
+ if (aio_setup_ring(ctx) < 0)
+ goto out_freepcpu;
+
+ atomic_set(&ctx->reqs_available, ctx->nr);
+ ctx->req_batch = ctx->nr / (num_possible_cpus() * 4);
+ BUG_ON(!ctx->req_batch);
+
/* limit the number of system wide aios */
spin_lock(&aio_nr_lock);
if (aio_nr + nr_events > aio_max_nr ||
@@ -286,64 +436,58 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
hlist_add_head_rcu(&ctx->list, &mm->ioctx_list);
spin_unlock(&mm->ioctx_lock);
- dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
- ctx, ctx->user_id, current->mm, ctx->ring_info.nr);
+ pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
+ ctx, ctx->user_id, mm, ctx->nr);
return ctx;
out_cleanup:
err = -EAGAIN;
aio_free_ring(ctx);
+out_freepcpu:
+ free_percpu(ctx->cpu);
out_freectx:
- mmdrop(mm);
kmem_cache_free(kioctx_cachep, ctx);
- dprintk("aio: error allocating ioctx %d\n", err);
+ pr_debug("error allocating ioctx %d\n", err);
return ERR_PTR(err);
}
-/* kill_ctx
- * Cancels all outstanding aio requests on an aio context. Used
- * when the processes owning a context have all exited to encourage
- * the rapid destruction of the kioctx.
- */
-static void kill_ctx(struct kioctx *ctx)
+static void kill_ioctx_work(struct work_struct *work)
{
- int (*cancel)(struct kiocb *, struct io_event *);
- struct task_struct *tsk = current;
- DECLARE_WAITQUEUE(wait, tsk);
- struct io_event res;
+ struct kioctx *ctx = container_of(work, struct kioctx, rcu_work);
- spin_lock_irq(&ctx->ctx_lock);
- ctx->dead = 1;
- while (!list_empty(&ctx->active_reqs)) {
- struct list_head *pos = ctx->active_reqs.next;
- struct kiocb *iocb = list_kiocb(pos);
- list_del_init(&iocb->ki_list);
- cancel = iocb->ki_cancel;
- kiocbSetCancelled(iocb);
- if (cancel) {
- iocb->ki_users++;
- spin_unlock_irq(&ctx->ctx_lock);
- cancel(iocb, &res);
- spin_lock_irq(&ctx->ctx_lock);
- }
- }
+ wake_up_all(&ctx->wait);
+ put_ioctx(ctx);
+}
- if (!ctx->reqs_active)
- goto out;
+static void kill_ioctx_rcu(struct rcu_head *head)
+{
+ struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
- add_wait_queue(&ctx->wait, &wait);
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
- while (ctx->reqs_active) {
- spin_unlock_irq(&ctx->ctx_lock);
- io_schedule();
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
- spin_lock_irq(&ctx->ctx_lock);
- }
- __set_task_state(tsk, TASK_RUNNING);
- remove_wait_queue(&ctx->wait, &wait);
+ INIT_WORK(&ctx->rcu_work, kill_ioctx_work);
+ schedule_work(&ctx->rcu_work);
+}
-out:
- spin_unlock_irq(&ctx->ctx_lock);
+/* kill_ioctx
+ * Cancels all outstanding aio requests on an aio context. Used
+ * when the processes owning a context have all exited to encourage
+ * the rapid destruction of the kioctx.
+ */
+static void kill_ioctx(struct kioctx *ctx)
+{
+ if (percpu_ref_kill(&ctx->users)) {
+ hlist_del_rcu(&ctx->list);
+ /* Between hlist_del_rcu() and dropping the initial ref */
+ synchronize_rcu();
+
+ /*
+ * We can't punt to workqueue here because put_ioctx() ->
+ * free_ioctx() will unmap the ringbuffer, and that has to be
+ * done in the original process's context. kill_ioctx_rcu/work()
+ * exist for exit_aio(), as in that path free_ioctx() won't do
+ * the unmap.
+ */
+ kill_ioctx_work(&ctx->rcu_work);
+ }
}
/* wait_on_sync_kiocb:
@@ -351,9 +495,9 @@ out:
*/
ssize_t wait_on_sync_kiocb(struct kiocb *iocb)
{
- while (iocb->ki_users) {
+ while (atomic_read(&iocb->ki_users)) {
set_current_state(TASK_UNINTERRUPTIBLE);
- if (!iocb->ki_users)
+ if (!atomic_read(&iocb->ki_users))
break;
io_schedule();
}
@@ -362,28 +506,20 @@ ssize_t wait_on_sync_kiocb(struct kiocb *iocb)
}
EXPORT_SYMBOL(wait_on_sync_kiocb);
-/* exit_aio: called when the last user of mm goes away. At this point,
- * there is no way for any new requests to be submited or any of the
- * io_* syscalls to be called on the context. However, there may be
- * outstanding requests which hold references to the context; as they
- * go away, they will call put_ioctx and release any pinned memory
- * associated with the request (held via struct page * references).
+/*
+ * exit_aio: called when the last user of mm goes away. At this point, there is
+ * no way for any new requests to be submitted or any of the io_* syscalls to be
+ * called on the context.
+ *
+ * There may be outstanding kiocbs, but free_ioctx() will explicitly wait on
+ * them.
*/
void exit_aio(struct mm_struct *mm)
{
struct kioctx *ctx;
+ struct hlist_node *n;
- while (!hlist_empty(&mm->ioctx_list)) {
- ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list);
- hlist_del_rcu(&ctx->list);
-
- kill_ctx(ctx);
-
- if (1 != atomic_read(&ctx->users))
- printk(KERN_DEBUG
- "exit_aio:ioctx still alive: %d %d %d\n",
- atomic_read(&ctx->users), ctx->dead,
- ctx->reqs_active);
+ hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) {
/*
* We don't need to bother with munmap() here -
* exit_mmap(mm) is coming and it'll unmap everything.
@@ -391,150 +527,95 @@ void exit_aio(struct mm_struct *mm)
* as indicator that it needs to unmap the area,
* just set it to 0; aio_free_ring() is the only
* place that uses ->mmap_size, so it's safe.
- * That way we get all munmap done to current->mm -
- * all other callers have ctx->mm == current->mm.
*/
- ctx->ring_info.mmap_size = 0;
- put_ioctx(ctx);
+ ctx->mmap_size = 0;
+
+ if (percpu_ref_kill(&ctx->users)) {
+ hlist_del_rcu(&ctx->list);
+ call_rcu(&ctx->rcu_head, kill_ioctx_rcu);
+ }
}
}
-/* aio_get_req
- * Allocate a slot for an aio request. Increments the users count
- * of the kioctx so that the kioctx stays around until all requests are
- * complete. Returns NULL if no requests are free.
- *
- * Returns with kiocb->users set to 2. The io submit code path holds
- * an extra reference while submitting the i/o.
- * This prevents races between the aio code path referencing the
- * req (after submitting it) and aio_complete() freeing the req.
- */
-static struct kiocb *__aio_get_req(struct kioctx *ctx)
+static void put_reqs_available(struct kioctx *ctx, unsigned nr)
{
- struct kiocb *req = NULL;
-
- req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
- if (unlikely(!req))
- return NULL;
-
- req->ki_flags = 0;
- req->ki_users = 2;
- req->ki_key = 0;
- req->ki_ctx = ctx;
- req->ki_cancel = NULL;
- req->ki_retry = NULL;
- req->ki_dtor = NULL;
- req->private = NULL;
- req->ki_iovec = NULL;
- INIT_LIST_HEAD(&req->ki_run_list);
- req->ki_eventfd = NULL;
+ struct kioctx_cpu *kcpu;
- return req;
-}
+ preempt_disable();
+ kcpu = this_cpu_ptr(ctx->cpu);
-/*
- * struct kiocb's are allocated in batches to reduce the number of
- * times the ctx lock is acquired and released.
- */
-#define KIOCB_BATCH_SIZE 32L
-struct kiocb_batch {
- struct list_head head;
- long count; /* number of requests left to allocate */
-};
+ kcpu->reqs_available += nr;
+ while (kcpu->reqs_available >= ctx->req_batch * 2) {
+ kcpu->reqs_available -= ctx->req_batch;
+ atomic_add(ctx->req_batch, &ctx->reqs_available);
+ }
-static void kiocb_batch_init(struct kiocb_batch *batch, long total)
-{
- INIT_LIST_HEAD(&batch->head);
- batch->count = total;
+ preempt_enable();
}
-static void kiocb_batch_free(struct kioctx *ctx, struct kiocb_batch *batch)
+static bool get_reqs_available(struct kioctx *ctx)
{
- struct kiocb *req, *n;
-
- if (list_empty(&batch->head))
- return;
+ struct kioctx_cpu *kcpu;
+ bool ret = false;
- spin_lock_irq(&ctx->ctx_lock);
- list_for_each_entry_safe(req, n, &batch->head, ki_batch) {
- list_del(&req->ki_batch);
- list_del(&req->ki_list);
- kmem_cache_free(kiocb_cachep, req);
- ctx->reqs_active--;
- }
- if (unlikely(!ctx->reqs_active && ctx->dead))
- wake_up_all(&ctx->wait);
- spin_unlock_irq(&ctx->ctx_lock);
-}
-
-/*
- * Allocate a batch of kiocbs. This avoids taking and dropping the
- * context lock a lot during setup.
- */
-static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch)
-{
- unsigned short allocated, to_alloc;
- long avail;
- struct kiocb *req, *n;
- struct aio_ring *ring;
+ preempt_disable();
+ kcpu = this_cpu_ptr(ctx->cpu);
- to_alloc = min(batch->count, KIOCB_BATCH_SIZE);
- for (allocated = 0; allocated < to_alloc; allocated++) {
- req = __aio_get_req(ctx);
- if (!req)
- /* allocation failed, go with what we've got */
- break;
- list_add(&req->ki_batch, &batch->head);
- }
+ if (!kcpu->reqs_available) {
+ int old, avail = atomic_read(&ctx->reqs_available);
- if (allocated == 0)
- goto out;
+ do {
+ if (avail < ctx->req_batch)
+ goto out;
- spin_lock_irq(&ctx->ctx_lock);
- ring = kmap_atomic(ctx->ring_info.ring_pages[0]);
-
- avail = aio_ring_avail(&ctx->ring_info, ring) - ctx->reqs_active;
- BUG_ON(avail < 0);
- if (avail < allocated) {
- /* Trim back the number of requests. */
- list_for_each_entry_safe(req, n, &batch->head, ki_batch) {
- list_del(&req->ki_batch);
- kmem_cache_free(kiocb_cachep, req);
- if (--allocated <= avail)
- break;
- }
- }
+ old = avail;
+ avail = atomic_cmpxchg(&ctx->reqs_available,
+ avail, avail - ctx->req_batch);
+ } while (avail != old);
- batch->count -= allocated;
- list_for_each_entry(req, &batch->head, ki_batch) {
- list_add(&req->ki_list, &ctx->active_reqs);
- ctx->reqs_active++;
+ kcpu->reqs_available += ctx->req_batch;
}
- kunmap_atomic(ring);
- spin_unlock_irq(&ctx->ctx_lock);
-
+ ret = true;
+ kcpu->reqs_available--;
out:
- return allocated;
+ preempt_enable();
+ return ret;
}
-static inline struct kiocb *aio_get_req(struct kioctx *ctx,
- struct kiocb_batch *batch)
+/* aio_get_req
+ * Allocate a slot for an aio request. Increments the ki_users count
+ * of the kioctx so that the kioctx stays around until all requests are
+ * complete. Returns NULL if no requests are free.
+ *
+ * Returns with kiocb->ki_users set to 2. The io submit code path holds
+ * an extra reference while submitting the i/o.
+ * This prevents races between the aio code path referencing the
+ * req (after submitting it) and aio_complete() freeing the req.
+ */
+static inline struct kiocb *aio_get_req(struct kioctx *ctx)
{
struct kiocb *req;
- if (list_empty(&batch->head))
- if (kiocb_batch_refill(ctx, batch) == 0)
- return NULL;
- req = list_first_entry(&batch->head, struct kiocb, ki_batch);
- list_del(&req->ki_batch);
+ if (!get_reqs_available(ctx))
+ return NULL;
+
+ req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
+ if (unlikely(!req))
+ goto out_put;
+
+ atomic_set(&req->ki_users, 2);
+ req->ki_ctx = ctx;
return req;
+out_put:
+ put_reqs_available(ctx, 1);
+ return NULL;
}
-static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
+static void kiocb_free(struct kiocb *req)
{
- assert_spin_locked(&ctx->ctx_lock);
-
+ if (req->ki_filp)
+ fput(req->ki_filp);
if (req->ki_eventfd != NULL)
eventfd_ctx_put(req->ki_eventfd);
if (req->ki_dtor)
@@ -542,48 +623,12 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
if (req->ki_iovec != &req->ki_inline_vec)
kfree(req->ki_iovec);
kmem_cache_free(kiocb_cachep, req);
- ctx->reqs_active--;
-
- if (unlikely(!ctx->reqs_active && ctx->dead))
- wake_up_all(&ctx->wait);
}
-/* __aio_put_req
- * Returns true if this put was the last user of the request.
- */
-static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
+void aio_put_req(struct kiocb *req)
{
- dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n",
- req, atomic_long_read(&req->ki_filp->f_count));
-
- assert_spin_locked(&ctx->ctx_lock);
-
- req->ki_users--;
- BUG_ON(req->ki_users < 0);
- if (likely(req->ki_users))
- return 0;
- list_del(&req->ki_list); /* remove from active_reqs */
- req->ki_cancel = NULL;
- req->ki_retry = NULL;
-
- fput(req->ki_filp);
- req->ki_filp = NULL;
- really_put_req(ctx, req);
- return 1;
-}
-
-/* aio_put_req
- * Returns true if this put was the last user of the kiocb,
- * false if the request is still in use.
- */
-int aio_put_req(struct kiocb *req)
-{
- struct kioctx *ctx = req->ki_ctx;
- int ret;
- spin_lock_irq(&ctx->ctx_lock);
- ret = __aio_put_req(ctx, req);
- spin_unlock_irq(&ctx->ctx_lock);
- return ret;
+ if (atomic_dec_and_test(&req->ki_users))
+ kiocb_free(req);
}
EXPORT_SYMBOL(aio_put_req);
@@ -594,627 +639,341 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
rcu_read_lock();
- hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) {
- /*
- * RCU protects us against accessing freed memory but
- * we have to be careful not to get a reference when the
- * reference count already dropped to 0 (ctx->dead test
- * is unreliable because of races).
- */
- if (ctx->user_id == ctx_id && !ctx->dead && try_get_ioctx(ctx)){
+ hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list)
+ if (ctx->user_id == ctx_id){
+ percpu_ref_get(&ctx->users);
ret = ctx;
break;
}
- }
rcu_read_unlock();
return ret;
}
-/*
- * Queue up a kiocb to be retried. Assumes that the kiocb
- * has already been marked as kicked, and places it on
- * the retry run list for the corresponding ioctx, if it
- * isn't already queued. Returns 1 if it actually queued
- * the kiocb (to tell the caller to activate the work
- * queue to process it), or 0, if it found that it was
- * already queued.
- */
-static inline int __queue_kicked_iocb(struct kiocb *iocb)
+static inline unsigned kioctx_ring_put(struct kioctx *ctx, struct kiocb *req,
+ unsigned tail)
{
- struct kioctx *ctx = iocb->ki_ctx;
-
- assert_spin_locked(&ctx->ctx_lock);
+ struct io_event *ev_page, *event;
+ unsigned pos = tail + AIO_EVENTS_OFFSET;
- if (list_empty(&iocb->ki_run_list)) {
- list_add_tail(&iocb->ki_run_list,
- &ctx->run_list);
- return 1;
- }
- return 0;
-}
+ if (++tail >= ctx->nr)
+ tail = 0;
-/* aio_run_iocb
- * This is the core aio execution routine. It is
- * invoked both for initial i/o submission and
- * subsequent retries via the aio_kick_handler.
- * Expects to be invoked with iocb->ki_ctx->lock
- * already held. The lock is released and reacquired
- * as needed during processing.
- *
- * Calls the iocb retry method (already setup for the
- * iocb on initial submission) for operation specific
- * handling, but takes care of most of common retry
- * execution details for a given iocb. The retry method
- * needs to be non-blocking as far as possible, to avoid
- * holding up other iocbs waiting to be serviced by the
- * retry kernel thread.
- *
- * The trickier parts in this code have to do with
- * ensuring that only one retry instance is in progress
- * for a given iocb at any time. Providing that guarantee
- * simplifies the coding of individual aio operations as
- * it avoids various potential races.
- */
-static ssize_t aio_run_iocb(struct kiocb *iocb)
-{
- struct kioctx *ctx = iocb->ki_ctx;
- ssize_t (*retry)(struct kiocb *);
- ssize_t ret;
+ ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
+ event = ev_page + pos % AIO_EVENTS_PER_PAGE;
- if (!(retry = iocb->ki_retry)) {
- printk("aio_run_iocb: iocb->ki_retry = NULL\n");
- return 0;
- }
+ event->obj = (u64)(unsigned long)req->ki_obj.user;
+ event->data = req->ki_user_data;
+ event->res = req->ki_res;
+ event->res2 = req->ki_res2;
- /*
- * We don't want the next retry iteration for this
- * operation to start until this one has returned and
- * updated the iocb state. However, wait_queue functions
- * can trigger a kick_iocb from interrupt context in the
- * meantime, indicating that data is available for the next
- * iteration. We want to remember that and enable the
- * next retry iteration _after_ we are through with
- * this one.
- *
- * So, in order to be able to register a "kick", but
- * prevent it from being queued now, we clear the kick
- * flag, but make the kick code *think* that the iocb is
- * still on the run list until we are actually done.
- * When we are done with this iteration, we check if
- * the iocb was kicked in the meantime and if so, queue
- * it up afresh.
- */
+ kunmap_atomic(ev_page);
+ flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
- kiocbClearKicked(iocb);
+ pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n",
+ ctx, tail, req, req->ki_obj.user, req->ki_user_data,
+ req->ki_res, req->ki_res2);
- /*
- * This is so that aio_complete knows it doesn't need to
- * pull the iocb off the run list (We can't just call
- * INIT_LIST_HEAD because we don't want a kick_iocb to
- * queue this on the run list yet)
- */
- iocb->ki_run_list.next = iocb->ki_run_list.prev = NULL;
- spin_unlock_irq(&ctx->ctx_lock);
+ return tail;
+}
- /* Quit retrying if the i/o has been cancelled */
- if (kiocbIsCancelled(iocb)) {
- ret = -EINTR;
- aio_complete(iocb, ret, 0);
- /* must not access the iocb after this */
- goto out;
- }
+static inline unsigned kioctx_ring_lock(struct kioctx *ctx)
+{
+ unsigned tail;
/*
- * Now we are all set to call the retry method in async
- * context.
+ * ctx->tail is both our lock and the canonical version of the tail
+ * pointer.
*/
- ret = retry(iocb);
-
- if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) {
- /*
- * There's no easy way to restart the syscall since other AIO's
- * may be already running. Just fail this IO with EINTR.
- */
- if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
- ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK))
- ret = -EINTR;
- aio_complete(iocb, ret, 0);
- }
-out:
- spin_lock_irq(&ctx->ctx_lock);
+ while ((tail = xchg(&ctx->tail, UINT_MAX)) == UINT_MAX)
+ cpu_relax();
- if (-EIOCBRETRY == ret) {
- /*
- * OK, now that we are done with this iteration
- * and know that there is more left to go,
- * this is where we let go so that a subsequent
- * "kick" can start the next iteration
- */
-
- /* will make __queue_kicked_iocb succeed from here on */
- INIT_LIST_HEAD(&iocb->ki_run_list);
- /* we must queue the next iteration ourselves, if it
- * has already been kicked */
- if (kiocbIsKicked(iocb)) {
- __queue_kicked_iocb(iocb);
-
- /*
- * __queue_kicked_iocb will always return 1 here, because
- * iocb->ki_run_list is empty at this point so it should
- * be safe to unconditionally queue the context into the
- * work queue.
- */
- aio_queue_work(ctx);
- }
- }
- return ret;
+ return tail;
}
-/*
- * __aio_run_iocbs:
- * Process all pending retries queued on the ioctx
- * run list.
- * Assumes it is operating within the aio issuer's mm
- * context.
- */
-static int __aio_run_iocbs(struct kioctx *ctx)
+static inline void kioctx_ring_unlock(struct kioctx *ctx, unsigned tail)
{
- struct kiocb *iocb;
- struct list_head run_list;
+ struct aio_ring *ring;
- assert_spin_locked(&ctx->ctx_lock);
+ if (!ctx)
+ return;
- list_replace_init(&ctx->run_list, &run_list);
- while (!list_empty(&run_list)) {
- iocb = list_entry(run_list.next, struct kiocb,
- ki_run_list);
- list_del(&iocb->ki_run_list);
- /*
- * Hold an extra reference while retrying i/o.
- */
- iocb->ki_users++; /* grab extra reference */
- aio_run_iocb(iocb);
- __aio_put_req(ctx, iocb);
- }
- if (!list_empty(&ctx->run_list))
- return 1;
- return 0;
-}
+ smp_wmb();
+ /* make event visible before updating tail */
-static void aio_queue_work(struct kioctx * ctx)
-{
- unsigned long timeout;
- /*
- * if someone is waiting, get the work started right
- * away, otherwise, use a longer delay
- */
- smp_mb();
- if (waitqueue_active(&ctx->wait))
- timeout = 1;
- else
- timeout = HZ/10;
- queue_delayed_work(aio_wq, &ctx->wq, timeout);
-}
+ ctx->shadow_tail = tail;
-/*
- * aio_run_all_iocbs:
- * Process all pending retries queued on the ioctx
- * run list, and keep running them until the list
- * stays empty.
- * Assumes it is operating within the aio issuer's mm context.
- */
-static inline void aio_run_all_iocbs(struct kioctx *ctx)
-{
- spin_lock_irq(&ctx->ctx_lock);
- while (__aio_run_iocbs(ctx))
- ;
- spin_unlock_irq(&ctx->ctx_lock);
-}
+ ring = kmap_atomic(ctx->ring_pages[0]);
+ ring->tail = tail;
+ kunmap_atomic(ring);
+ flush_dcache_page(ctx->ring_pages[0]);
-/*
- * aio_kick_handler:
- * Work queue handler triggered to process pending
- * retries on an ioctx. Takes on the aio issuer's
- * mm context before running the iocbs, so that
- * copy_xxx_user operates on the issuer's address
- * space.
- * Run on aiod's context.
- */
-static void aio_kick_handler(struct work_struct *work)
-{
- struct kioctx *ctx = container_of(work, struct kioctx, wq.work);
- mm_segment_t oldfs = get_fs();
- struct mm_struct *mm;
- int requeue;
+ /* unlock, make new tail visible before checking waitlist */
+ smp_mb();
- set_fs(USER_DS);
- use_mm(ctx->mm);
- spin_lock_irq(&ctx->ctx_lock);
- requeue =__aio_run_iocbs(ctx);
- mm = ctx->mm;
- spin_unlock_irq(&ctx->ctx_lock);
- unuse_mm(mm);
- set_fs(oldfs);
- /*
- * we're in a worker thread already; no point using non-zero delay
- */
- if (requeue)
- queue_delayed_work(aio_wq, &ctx->wq, 0);
-}
+ ctx->tail = tail;
+ if (waitqueue_active(&ctx->wait))
+ wake_up(&ctx->wait);
+}
-/*
- * Called by kick_iocb to queue the kiocb for retry
- * and if required activate the aio work queue to process
- * it
- */
-static void try_queue_kicked_iocb(struct kiocb *iocb)
+void batch_complete_aio(struct batch_complete *batch)
{
- struct kioctx *ctx = iocb->ki_ctx;
+ struct kioctx *ctx = NULL;
+ struct eventfd_ctx *eventfd = NULL;
+ struct rb_node *n;
unsigned long flags;
- int run = 0;
-
- spin_lock_irqsave(&ctx->ctx_lock, flags);
- /* set this inside the lock so that we can't race with aio_run_iocb()
- * testing it and putting the iocb on the run list under the lock */
- if (!kiocbTryKick(iocb))
- run = __queue_kicked_iocb(iocb);
- spin_unlock_irqrestore(&ctx->ctx_lock, flags);
- if (run)
- aio_queue_work(ctx);
-}
+ unsigned tail = 0;
-/*
- * kick_iocb:
- * Called typically from a wait queue callback context
- * to trigger a retry of the iocb.
- * The retry is usually executed by aio workqueue
- * threads (See aio_kick_handler).
- */
-void kick_iocb(struct kiocb *iocb)
-{
- /* sync iocbs are easy: they can only ever be executing from a
- * single context. */
- if (is_sync_kiocb(iocb)) {
- kiocbSetKicked(iocb);
- wake_up_process(iocb->ki_obj.tsk);
+ if (RB_EMPTY_ROOT(&batch->kiocb))
return;
- }
-
- try_queue_kicked_iocb(iocb);
-}
-EXPORT_SYMBOL(kick_iocb);
-
-/* aio_complete
- * Called when the io request on the given iocb is complete.
- * Returns true if this is the last user of the request. The
- * only other user of the request can be the cancellation code.
- */
-int aio_complete(struct kiocb *iocb, long res, long res2)
-{
- struct kioctx *ctx = iocb->ki_ctx;
- struct aio_ring_info *info;
- struct aio_ring *ring;
- struct io_event *event;
- unsigned long flags;
- unsigned long tail;
- int ret;
/*
- * Special case handling for sync iocbs:
- * - events go directly into the iocb for fast handling
- * - the sync task with the iocb in its stack holds the single iocb
- * ref, no other paths have a way to get another ref
- * - the sync task helpfully left a reference to itself in the iocb
- */
- if (is_sync_kiocb(iocb)) {
- BUG_ON(iocb->ki_users != 1);
- iocb->ki_user_data = res;
- iocb->ki_users = 0;
- wake_up_process(iocb->ki_obj.tsk);
- return 1;
- }
-
- info = &ctx->ring_info;
-
- /* add a completion event to the ring buffer.
- * must be done holding ctx->ctx_lock to prevent
- * other code from messing with the tail
- * pointer since we might be called from irq
- * context.
+ * Take rcu_read_lock() in case the kioctx is being destroyed, as we
+ * need to issue a wakeup after incrementing reqs_available.
*/
- spin_lock_irqsave(&ctx->ctx_lock, flags);
+ rcu_read_lock();
+ local_irq_save(flags);
- if (iocb->ki_run_list.prev && !list_empty(&iocb->ki_run_list))
- list_del_init(&iocb->ki_run_list);
+ n = rb_first(&batch->kiocb);
+ while (n) {
+ struct kiocb *req = container_of(n, struct kiocb, ki_node);
- /*
- * cancelled requests don't get events, userland was given one
- * when the event got cancelled.
- */
- if (kiocbIsCancelled(iocb))
- goto put_rq;
+ if (n->rb_right) {
+ n->rb_right->__rb_parent_color = n->__rb_parent_color;
+ n = n->rb_right;
- ring = kmap_atomic(info->ring_pages[0]);
+ while (n->rb_left)
+ n = n->rb_left;
+ } else {
+ n = rb_parent(n);
+ }
- tail = info->tail;
- event = aio_ring_event(info, tail);
- if (++tail >= info->nr)
- tail = 0;
+ if (unlikely(req->ki_eventfd != eventfd)) {
+ if (eventfd) {
+ /* Make event visible */
+ kioctx_ring_unlock(ctx, tail);
+ ctx = NULL;
- event->obj = (u64)(unsigned long)iocb->ki_obj.user;
- event->data = iocb->ki_user_data;
- event->res = res;
- event->res2 = res2;
+ eventfd_signal(eventfd, 1);
+ eventfd_ctx_put(eventfd);
+ }
- dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n",
- ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
- res, res2);
+ eventfd = req->ki_eventfd;
+ req->ki_eventfd = NULL;
+ }
- /* after flagging the request as done, we
- * must never even look at it again
- */
- smp_wmb(); /* make event visible before updating tail */
+ if (unlikely(req->ki_ctx != ctx)) {
+ kioctx_ring_unlock(ctx, tail);
- info->tail = tail;
- ring->tail = tail;
+ ctx = req->ki_ctx;
+ tail = kioctx_ring_lock(ctx);
+ }
- put_aio_ring_event(event);
- kunmap_atomic(ring);
+ tail = kioctx_ring_put(ctx, req, tail);
+ aio_put_req(req);
+ }
- pr_debug("added to ring %p at [%lu]\n", iocb, tail);
+ kioctx_ring_unlock(ctx, tail);
+ local_irq_restore(flags);
+ rcu_read_unlock();
/*
* Check if the user asked us to deliver the result through an
* eventfd. The eventfd_signal() function is safe to be called
* from IRQ context.
*/
- if (iocb->ki_eventfd != NULL)
- eventfd_signal(iocb->ki_eventfd, 1);
+ if (eventfd) {
+ eventfd_signal(eventfd, 1);
+ eventfd_ctx_put(eventfd);
+ }
+}
+EXPORT_SYMBOL(batch_complete_aio);
-put_rq:
- /* everything turned out well, dispose of the aiocb. */
- ret = __aio_put_req(ctx, iocb);
+/* aio_complete_batch
+ * Called when the io request on the given iocb is complete; @batch may be
+ * NULL.
+ */
+void aio_complete_batch(struct kiocb *req, long res, long res2,
+ struct batch_complete *batch)
+{
+ req->ki_res = res;
+ req->ki_res2 = res2;
+
+ if (req->ki_list.next) {
+ struct kioctx *ctx = req->ki_ctx;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->ctx_lock, flags);
+ list_del(&req->ki_list);
+ spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+ }
/*
- * We have to order our ring_info tail store above and test
- * of the wait list below outside the wait lock. This is
- * like in wake_up_bit() where clearing a bit has to be
- * ordered with the unlocked test.
+ * Special case handling for sync iocbs:
+ * - events go directly into the iocb for fast handling
+ * - the sync task with the iocb in its stack holds the single iocb
+ * ref, no other paths have a way to get another ref
+ * - the sync task helpfully left a reference to itself in the iocb
*/
- smp_mb();
+ if (is_sync_kiocb(req)) {
+ BUG_ON(atomic_read(&req->ki_users) != 1);
+ req->ki_user_data = req->ki_res;
+ atomic_set(&req->ki_users, 0);
+ wake_up_process(req->ki_obj.tsk);
+ } else if (batch) {
+ int res;
+ struct kiocb *t;
+ struct rb_node **n = &batch->kiocb.rb_node, *parent = NULL;
+
+ while (*n) {
+ parent = *n;
+ t = container_of(*n, struct kiocb, ki_node);
+
+ res = req->ki_ctx != t->ki_ctx
+ ? req->ki_ctx < t->ki_ctx
+ : req->ki_eventfd != t->ki_eventfd
+ ? req->ki_eventfd < t->ki_eventfd
+ : req < t;
+
+ n = res ? &(*n)->rb_left : &(*n)->rb_right;
+ }
- if (waitqueue_active(&ctx->wait))
- wake_up(&ctx->wait);
+ rb_link_node(&req->ki_node, parent, n);
+ rb_insert_color(&req->ki_node, &batch->kiocb);
+ } else {
+ struct batch_complete batch_stack;
- spin_unlock_irqrestore(&ctx->ctx_lock, flags);
- return ret;
+ memset(&req->ki_node, 0, sizeof(req->ki_node));
+ batch_stack.kiocb.rb_node = &req->ki_node;
+
+ batch_complete_aio(&batch_stack);
+ }
}
-EXPORT_SYMBOL(aio_complete);
+EXPORT_SYMBOL(aio_complete_batch);
-/* aio_read_evt
- * Pull an event off of the ioctx's event ring. Returns the number of
- * events fetched (0 or 1 ;-)
- * FIXME: make this use cmpxchg.
- * TODO: make the ringbuffer user mmap()able (requires FIXME).
+/* aio_read_events
+ * Pull an event off of the ioctx's event ring. Returns the number of
+ * events fetched
*/
-static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent)
+static long aio_read_events_ring(struct kioctx *ctx,
+ struct io_event __user *event, long nr)
{
- struct aio_ring_info *info = &ioctx->ring_info;
struct aio_ring *ring;
- unsigned long head;
- int ret = 0;
-
- ring = kmap_atomic(info->ring_pages[0]);
- dprintk("in aio_read_evt h%lu t%lu m%lu\n",
- (unsigned long)ring->head, (unsigned long)ring->tail,
- (unsigned long)ring->nr);
-
- if (ring->head == ring->tail)
- goto out;
+ unsigned head, pos;
+ long ret = 0;
+ int copy_ret;
- spin_lock(&info->ring_lock);
-
- head = ring->head % info->nr;
- if (head != ring->tail) {
- struct io_event *evp = aio_ring_event(info, head);
- *ent = *evp;
- head = (head + 1) % info->nr;
- smp_mb(); /* finish reading the event before updatng the head */
- ring->head = head;
- ret = 1;
- put_aio_ring_event(evp);
- }
- spin_unlock(&info->ring_lock);
+ mutex_lock(&ctx->ring_lock);
-out:
+ ring = kmap_atomic(ctx->ring_pages[0]);
+ head = ring->head;
kunmap_atomic(ring);
- dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret,
- (unsigned long)ring->head, (unsigned long)ring->tail);
- return ret;
-}
-struct aio_timeout {
- struct timer_list timer;
- int timed_out;
- struct task_struct *p;
-};
+ pr_debug("h%u t%u m%u\n", head, ctx->shadow_tail, ctx->nr);
-static void timeout_func(unsigned long data)
-{
- struct aio_timeout *to = (struct aio_timeout *)data;
+ if (head == ctx->shadow_tail)
+ goto out;
- to->timed_out = 1;
- wake_up_process(to->p);
-}
+ while (ret < nr) {
+ long avail = (head <= ctx->shadow_tail
+ ? ctx->shadow_tail : ctx->nr) - head;
+ struct io_event *ev;
+ struct page *page;
-static inline void init_timeout(struct aio_timeout *to)
-{
- setup_timer_on_stack(&to->timer, timeout_func, (unsigned long) to);
- to->timed_out = 0;
- to->p = current;
-}
+ if (head == ctx->shadow_tail)
+ break;
-static inline void set_timeout(long start_jiffies, struct aio_timeout *to,
- const struct timespec *ts)
-{
- to->timer.expires = start_jiffies + timespec_to_jiffies(ts);
- if (time_after(to->timer.expires, jiffies))
- add_timer(&to->timer);
- else
- to->timed_out = 1;
-}
+ avail = min(avail, nr - ret);
+ avail = min_t(long, avail, AIO_EVENTS_PER_PAGE -
+ ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE));
-static inline void clear_timeout(struct aio_timeout *to)
-{
- del_singleshot_timer_sync(&to->timer);
-}
+ pos = head + AIO_EVENTS_OFFSET;
+ page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];
+ pos %= AIO_EVENTS_PER_PAGE;
-static int read_events(struct kioctx *ctx,
- long min_nr, long nr,
- struct io_event __user *event,
- struct timespec __user *timeout)
-{
- long start_jiffies = jiffies;
- struct task_struct *tsk = current;
- DECLARE_WAITQUEUE(wait, tsk);
- int ret;
- int i = 0;
- struct io_event ent;
- struct aio_timeout to;
- int retry = 0;
-
- /* needed to zero any padding within an entry (there shouldn't be
- * any, but C is fun!
- */
- memset(&ent, 0, sizeof(ent));
-retry:
- ret = 0;
- while (likely(i < nr)) {
- ret = aio_read_evt(ctx, &ent);
- if (unlikely(ret <= 0))
- break;
+ ev = kmap(page);
+ copy_ret = copy_to_user(event + ret, ev + pos, sizeof(*ev) * avail);
+ kunmap(page);
- dprintk("read event: %Lx %Lx %Lx %Lx\n",
- ent.data, ent.obj, ent.res, ent.res2);
-
- /* Could we split the check in two? */
- ret = -EFAULT;
- if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
- dprintk("aio: lost an event due to EFAULT.\n");
- break;
+ if (unlikely(copy_ret)) {
+ ret = -EFAULT;
+ goto out;
}
- ret = 0;
- /* Good, event copied to userland, update counts. */
- event ++;
- i ++;
+ ret += avail;
+ head += avail;
+ head %= ctx->nr;
}
- if (min_nr <= i)
- return i;
- if (ret)
- return ret;
-
- /* End fast path */
+ ring = kmap_atomic(ctx->ring_pages[0]);
+ ring->head = head;
+ kunmap_atomic(ring);
+ flush_dcache_page(ctx->ring_pages[0]);
- /* racey check, but it gets redone */
- if (!retry && unlikely(!list_empty(&ctx->run_list))) {
- retry = 1;
- aio_run_all_iocbs(ctx);
- goto retry;
- }
+ pr_debug("%li h%u t%u\n", ret, head, ctx->shadow_tail);
- init_timeout(&to);
- if (timeout) {
- struct timespec ts;
- ret = -EFAULT;
- if (unlikely(copy_from_user(&ts, timeout, sizeof(ts))))
- goto out;
+ put_reqs_available(ctx, ret);
+out:
+ mutex_unlock(&ctx->ring_lock);
- set_timeout(start_jiffies, &to, &ts);
- }
+ return ret;
+}
- while (likely(i < nr)) {
- add_wait_queue_exclusive(&ctx->wait, &wait);
- do {
- set_task_state(tsk, TASK_INTERRUPTIBLE);
- ret = aio_read_evt(ctx, &ent);
- if (ret)
- break;
- if (min_nr <= i)
- break;
- if (unlikely(ctx->dead)) {
- ret = -EINVAL;
- break;
- }
- if (to.timed_out) /* Only check after read evt */
- break;
- /* Try to only show up in io wait if there are ops
- * in flight */
- if (ctx->reqs_active)
- io_schedule();
- else
- schedule();
- if (signal_pending(tsk)) {
- ret = -EINTR;
- break;
- }
- /*ret = aio_read_evt(ctx, &ent);*/
- } while (1) ;
+static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
+ struct io_event __user *event, long *i)
+{
+ long ret = aio_read_events_ring(ctx, event + *i, nr - *i);
- set_task_state(tsk, TASK_RUNNING);
- remove_wait_queue(&ctx->wait, &wait);
+ if (ret > 0)
+ *i += ret;
- if (unlikely(ret <= 0))
- break;
-
- ret = -EFAULT;
- if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
- dprintk("aio: lost an event due to EFAULT.\n");
- break;
- }
+ if (unlikely(percpu_ref_dead(&ctx->users)))
+ ret = -EINVAL;
- /* Good, event copied to userland, update counts. */
- event ++;
- i ++;
- }
+ if (!*i)
+ *i = ret;
- if (timeout)
- clear_timeout(&to);
-out:
- destroy_timer_on_stack(&to.timer);
- return i ? i : ret;
+ return ret < 0 || *i >= min_nr;
}
-/* Take an ioctx and remove it from the list of ioctx's. Protects
- * against races with itself via ->dead.
- */
-static void io_destroy(struct kioctx *ioctx)
+static long read_events(struct kioctx *ctx, long min_nr, long nr,
+ struct io_event __user *event,
+ struct timespec __user *timeout)
{
- struct mm_struct *mm = current->mm;
- int was_dead;
+ ktime_t until = { .tv64 = KTIME_MAX };
+ long ret = 0;
- /* delete the entry from the list if someone else hasn't already */
- spin_lock(&mm->ioctx_lock);
- was_dead = ioctx->dead;
- ioctx->dead = 1;
- hlist_del_rcu(&ioctx->list);
- spin_unlock(&mm->ioctx_lock);
+ if (timeout) {
+ struct timespec ts;
- dprintk("aio_release(%p)\n", ioctx);
- if (likely(!was_dead))
- put_ioctx(ioctx); /* twice for the list */
+ if (unlikely(copy_from_user(&ts, timeout, sizeof(ts))))
+ return -EFAULT;
- kill_ctx(ioctx);
+ until = timespec_to_ktime(ts);
+ }
/*
- * Wake up any waiters. The setting of ctx->dead must be seen
- * by other CPUs at this point. Right now, we rely on the
- * locking done by the above calls to ensure this consistency.
+ * Note that aio_read_events() is being called as the conditional - i.e.
+ * we're calling it after prepare_to_wait() has set task state to
+ * TASK_INTERRUPTIBLE.
+ *
+ * But aio_read_events() can block, and if it blocks it's going to flip
+ * the task state back to TASK_RUNNING.
+ *
+ * This should be ok, provided it doesn't flip the state back to
+ * TASK_RUNNING and return 0 too much - that causes us to spin. That
+ * will only happen if the mutex_lock() call blocks, and we then find
+ * the ringbuffer empty. So in practice we should be ok, but it's
+ * something to be aware of when touching this code.
*/
- wake_up_all(&ioctx->wait);
+ wait_event_interruptible_hrtimeout(ctx->wait,
+ aio_read_events(ctx, min_nr, nr, event, &ret), until);
+
+ if (!ret && signal_pending(current))
+ ret = -EINTR;
+
+ return ret;
}
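
From userspace, the path above is what services io_getevents(): sleep until at least min_nr events sit in the ring or the hrtimer expires. A minimal sketch using the raw syscalls (no libaio), assuming a freshly set-up context with nothing in flight, so the call simply waits out the timeout:

	#include <linux/aio_abi.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <stdio.h>
	#include <time.h>

	int main(void)
	{
		aio_context_t ctx = 0;
		struct io_event events[8];
		struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };

		if (syscall(SYS_io_setup, 128, &ctx) < 0)
			return 1;

		/* Nothing submitted: read_events() waits, then returns 0 on timeout. */
		long n = syscall(SYS_io_getevents, ctx, 1, 8, events, &ts);
		printf("io_getevents returned %ld\n", n);

		syscall(SYS_io_destroy, ctx);
		return 0;
	}
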
/* sys_io_setup:
@@ -1252,7 +1011,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
if (!IS_ERR(ioctx)) {
ret = put_user(ioctx->user_id, ctxp);
if (ret)
- io_destroy(ioctx);
+ kill_ioctx(ioctx);
put_ioctx(ioctx);
}
@@ -1270,7 +1029,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
{
struct kioctx *ioctx = lookup_ioctx(ctx);
if (likely(NULL != ioctx)) {
- io_destroy(ioctx);
+ kill_ioctx(ioctx);
put_ioctx(ioctx);
return 0;
}
@@ -1301,24 +1060,15 @@ static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret)
BUG_ON(ret > 0 && iocb->ki_left == 0);
}
-static ssize_t aio_rw_vect_retry(struct kiocb *iocb)
+typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *,
+ unsigned long, loff_t);
+
+static ssize_t aio_rw_vect_retry(struct kiocb *iocb, int rw, aio_rw_op *rw_op)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
- ssize_t (*rw_op)(struct kiocb *, const struct iovec *,
- unsigned long, loff_t);
ssize_t ret = 0;
- unsigned short opcode;
-
- if ((iocb->ki_opcode == IOCB_CMD_PREADV) ||
- (iocb->ki_opcode == IOCB_CMD_PREAD)) {
- rw_op = file->f_op->aio_read;
- opcode = IOCB_CMD_PREADV;
- } else {
- rw_op = file->f_op->aio_write;
- opcode = IOCB_CMD_PWRITEV;
- }
/* This matches the pread()/pwrite() logic */
if (iocb->ki_pos < 0)
@@ -1334,7 +1084,7 @@ static ssize_t aio_rw_vect_retry(struct kiocb *iocb)
/* retry all partial writes. retry partial reads as long as its a
* regular file. */
} while (ret > 0 && iocb->ki_left > 0 &&
- (opcode == IOCB_CMD_PWRITEV ||
+ (rw == WRITE ||
(!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode))));
/* This means we must have transferred all that we could */
@@ -1344,81 +1094,49 @@ static ssize_t aio_rw_vect_retry(struct kiocb *iocb)
/* If we managed to write some out we return that, rather than
* the eventual error. */
- if (opcode == IOCB_CMD_PWRITEV
- && ret < 0 && ret != -EIOCBQUEUED && ret != -EIOCBRETRY
+ if (rw == WRITE
+ && ret < 0 && ret != -EIOCBQUEUED
&& iocb->ki_nbytes - iocb->ki_left)
ret = iocb->ki_nbytes - iocb->ki_left;
return ret;
}
-static ssize_t aio_fdsync(struct kiocb *iocb)
-{
- struct file *file = iocb->ki_filp;
- ssize_t ret = -EINVAL;
-
- if (file->f_op->aio_fsync)
- ret = file->f_op->aio_fsync(iocb, 1);
- return ret;
-}
-
-static ssize_t aio_fsync(struct kiocb *iocb)
-{
- struct file *file = iocb->ki_filp;
- ssize_t ret = -EINVAL;
-
- if (file->f_op->aio_fsync)
- ret = file->f_op->aio_fsync(iocb, 0);
- return ret;
-}
-
-static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat)
+static ssize_t aio_setup_vectored_rw(int rw, struct kiocb *kiocb, bool compat)
{
ssize_t ret;
+ kiocb->ki_nr_segs = kiocb->ki_nbytes;
+
#ifdef CONFIG_COMPAT
if (compat)
- ret = compat_rw_copy_check_uvector(type,
+ ret = compat_rw_copy_check_uvector(rw,
(struct compat_iovec __user *)kiocb->ki_buf,
- kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
+ kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec,
&kiocb->ki_iovec);
else
#endif
- ret = rw_copy_check_uvector(type,
+ ret = rw_copy_check_uvector(rw,
(struct iovec __user *)kiocb->ki_buf,
- kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
+ kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec,
&kiocb->ki_iovec);
if (ret < 0)
- goto out;
-
- ret = rw_verify_area(type, kiocb->ki_filp, &kiocb->ki_pos, ret);
- if (ret < 0)
- goto out;
+ return ret;
- kiocb->ki_nr_segs = kiocb->ki_nbytes;
- kiocb->ki_cur_seg = 0;
- /* ki_nbytes/left now reflect bytes instead of segs */
+ /* ki_nbytes now reflects bytes instead of segs */
kiocb->ki_nbytes = ret;
- kiocb->ki_left = ret;
-
- ret = 0;
-out:
- return ret;
+ return 0;
}
-static ssize_t aio_setup_single_vector(int type, struct file * file, struct kiocb *kiocb)
+static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb)
{
- int bytes;
-
- bytes = rw_verify_area(type, file, &kiocb->ki_pos, kiocb->ki_left);
- if (bytes < 0)
- return bytes;
+ if (unlikely(!access_ok(!rw, kiocb->ki_buf, kiocb->ki_nbytes)))
+ return -EFAULT;
kiocb->ki_iovec = &kiocb->ki_inline_vec;
kiocb->ki_iovec->iov_base = kiocb->ki_buf;
- kiocb->ki_iovec->iov_len = bytes;
+ kiocb->ki_iovec->iov_len = kiocb->ki_nbytes;
kiocb->ki_nr_segs = 1;
- kiocb->ki_cur_seg = 0;
return 0;
}
@@ -1427,96 +1145,94 @@ static ssize_t aio_setup_single_vector(int type, struct file * file, struct kioc
* Performs the initial checks and aio retry method
* setup for the kiocb at the time of io submission.
*/
-static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
+static ssize_t aio_run_iocb(struct kiocb *req, bool compat)
{
- struct file *file = kiocb->ki_filp;
- ssize_t ret = 0;
+ struct file *file = req->ki_filp;
+ ssize_t ret;
+ int rw;
+ fmode_t mode;
+ aio_rw_op *rw_op;
- switch (kiocb->ki_opcode) {
+ switch (req->ki_opcode) {
case IOCB_CMD_PREAD:
- ret = -EBADF;
- if (unlikely(!(file->f_mode & FMODE_READ)))
- break;
- ret = -EFAULT;
- if (unlikely(!access_ok(VERIFY_WRITE, kiocb->ki_buf,
- kiocb->ki_left)))
- break;
- ret = aio_setup_single_vector(READ, file, kiocb);
- if (ret)
- break;
- ret = -EINVAL;
- if (file->f_op->aio_read)
- kiocb->ki_retry = aio_rw_vect_retry;
- break;
- case IOCB_CMD_PWRITE:
- ret = -EBADF;
- if (unlikely(!(file->f_mode & FMODE_WRITE)))
- break;
- ret = -EFAULT;
- if (unlikely(!access_ok(VERIFY_READ, kiocb->ki_buf,
- kiocb->ki_left)))
- break;
- ret = aio_setup_single_vector(WRITE, file, kiocb);
- if (ret)
- break;
- ret = -EINVAL;
- if (file->f_op->aio_write)
- kiocb->ki_retry = aio_rw_vect_retry;
- break;
case IOCB_CMD_PREADV:
- ret = -EBADF;
- if (unlikely(!(file->f_mode & FMODE_READ)))
- break;
- ret = aio_setup_vectored_rw(READ, kiocb, compat);
- if (ret)
- break;
- ret = -EINVAL;
- if (file->f_op->aio_read)
- kiocb->ki_retry = aio_rw_vect_retry;
- break;
+ mode = FMODE_READ;
+ rw = READ;
+ rw_op = file->f_op->aio_read;
+ goto rw_common;
+
+ case IOCB_CMD_PWRITE:
case IOCB_CMD_PWRITEV:
- ret = -EBADF;
- if (unlikely(!(file->f_mode & FMODE_WRITE)))
- break;
- ret = aio_setup_vectored_rw(WRITE, kiocb, compat);
+ mode = FMODE_WRITE;
+ rw = WRITE;
+ rw_op = file->f_op->aio_write;
+ goto rw_common;
+rw_common:
+ if (unlikely(!(file->f_mode & mode)))
+ return -EBADF;
+
+ if (!rw_op)
+ return -EINVAL;
+
+ ret = (req->ki_opcode == IOCB_CMD_PREADV ||
+ req->ki_opcode == IOCB_CMD_PWRITEV)
+ ? aio_setup_vectored_rw(rw, req, compat)
+ : aio_setup_single_vector(rw, req);
if (ret)
- break;
- ret = -EINVAL;
- if (file->f_op->aio_write)
- kiocb->ki_retry = aio_rw_vect_retry;
+ return ret;
+
+ ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
+ if (ret < 0)
+ return ret;
+
+ req->ki_nbytes = ret;
+ req->ki_left = ret;
+
+ ret = aio_rw_vect_retry(req, rw, rw_op);
break;
+
case IOCB_CMD_FDSYNC:
- ret = -EINVAL;
- if (file->f_op->aio_fsync)
- kiocb->ki_retry = aio_fdsync;
+ if (!file->f_op->aio_fsync)
+ return -EINVAL;
+
+ ret = file->f_op->aio_fsync(req, 1);
break;
+
case IOCB_CMD_FSYNC:
- ret = -EINVAL;
- if (file->f_op->aio_fsync)
- kiocb->ki_retry = aio_fsync;
+ if (!file->f_op->aio_fsync)
+ return -EINVAL;
+
+ ret = file->f_op->aio_fsync(req, 0);
break;
+
default:
- dprintk("EINVAL: io_submit: no operation provided\n");
- ret = -EINVAL;
+ pr_debug("EINVAL: no operation provided\n");
+ return -EINVAL;
}
- if (!kiocb->ki_retry)
- return ret;
+ if (ret != -EIOCBQUEUED) {
+ /*
+ * There's no easy way to restart the syscall since other AIO's
+ * may be already running. Just fail this IO with EINTR.
+ */
+ if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
+ ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK))
+ ret = -EINTR;
+ aio_complete(req, ret, 0);
+ }
return 0;
}
static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
- struct iocb *iocb, struct kiocb_batch *batch,
- bool compat)
+ struct iocb *iocb, bool compat)
{
struct kiocb *req;
- struct file *file;
ssize_t ret;
/* enforce forwards compatibility on users */
if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) {
- pr_debug("EINVAL: io_submit: reserve field set\n");
+ pr_debug("EINVAL: reserve field set\n");
return -EINVAL;
}
@@ -1530,16 +1246,16 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
return -EINVAL;
}
- file = fget(iocb->aio_fildes);
- if (unlikely(!file))
- return -EBADF;
-
- req = aio_get_req(ctx, batch); /* returns with 2 references to req */
- if (unlikely(!req)) {
- fput(file);
+ req = aio_get_req(ctx);
+ if (unlikely(!req))
return -EAGAIN;
+
+ req->ki_filp = fget(iocb->aio_fildes);
+ if (unlikely(!req->ki_filp)) {
+ ret = -EBADF;
+ goto out_put_req;
}
- req->ki_filp = file;
+
if (iocb->aio_flags & IOCB_FLAG_RESFD) {
/*
* If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
@@ -1555,9 +1271,9 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
}
}
- ret = put_user(req->ki_key, &user_iocb->aio_key);
+ ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
if (unlikely(ret)) {
- dprintk("EFAULT: aio_key\n");
+ pr_debug("EFAULT: aio_key\n");
goto out_put_req;
}
@@ -1569,41 +1285,14 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
req->ki_left = req->ki_nbytes = iocb->aio_nbytes;
req->ki_opcode = iocb->aio_lio_opcode;
- ret = aio_setup_iocb(req, compat);
-
+ ret = aio_run_iocb(req, compat);
if (ret)
goto out_put_req;
- spin_lock_irq(&ctx->ctx_lock);
- /*
- * We could have raced with io_destroy() and are currently holding a
- * reference to ctx which should be destroyed. We cannot submit IO
- * since ctx gets freed as soon as io_submit() puts its reference. The
- * check here is reliable: io_destroy() sets ctx->dead before waiting
- * for outstanding IO and the barrier between these two is realized by
- * unlock of mm->ioctx_lock and lock of ctx->ctx_lock. Analogously we
- * increment ctx->reqs_active before checking for ctx->dead and the
- * barrier is realized by unlock and lock of ctx->ctx_lock. Thus if we
- * don't see ctx->dead set here, io_destroy() waits for our IO to
- * finish.
- */
- if (ctx->dead) {
- spin_unlock_irq(&ctx->ctx_lock);
- ret = -EINVAL;
- goto out_put_req;
- }
- aio_run_iocb(req);
- if (!list_empty(&ctx->run_list)) {
- /* drain the run list */
- while (__aio_run_iocbs(ctx))
- ;
- }
- spin_unlock_irq(&ctx->ctx_lock);
-
aio_put_req(req); /* drop extra ref to req */
return 0;
-
out_put_req:
+ put_reqs_available(ctx, 1);
aio_put_req(req); /* drop extra ref to req */
aio_put_req(req); /* drop i/o ref to req */
return ret;
@@ -1616,7 +1305,6 @@ long do_io_submit(aio_context_t ctx_id, long nr,
long ret = 0;
int i = 0;
struct blk_plug plug;
- struct kiocb_batch batch;
if (unlikely(nr < 0))
return -EINVAL;
@@ -1629,12 +1317,10 @@ long do_io_submit(aio_context_t ctx_id, long nr,
ctx = lookup_ioctx(ctx_id);
if (unlikely(!ctx)) {
- pr_debug("EINVAL: io_submit: invalid context id\n");
+ pr_debug("EINVAL: invalid context id\n");
return -EINVAL;
}
- kiocb_batch_init(&batch, nr);
-
blk_start_plug(&plug);
/*
@@ -1655,13 +1341,12 @@ long do_io_submit(aio_context_t ctx_id, long nr,
break;
}
- ret = io_submit_one(ctx, user_iocb, &tmp, &batch, compat);
+ ret = io_submit_one(ctx, user_iocb, &tmp, compat);
if (ret)
break;
}
blk_finish_plug(&plug);
- kiocb_batch_free(ctx, &batch);
put_ioctx(ctx);
return i ? i : ret;
}
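
For reference, the submit path above as seen from userspace: one IOCB_CMD_PREAD pushed through io_submit() and reaped with io_getevents(). A hedged sketch with raw syscalls; the file name and buffer size are illustrative only.

	#include <linux/aio_abi.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <fcntl.h>
	#include <string.h>
	#include <stdio.h>

	int main(void)
	{
		aio_context_t ctx = 0;
		char buf[4096];
		struct iocb cb;
		struct iocb *cbs[1] = { &cb };
		struct io_event ev;

		int fd = open("/etc/hostname", O_RDONLY);
		if (fd < 0 || syscall(SYS_io_setup, 32, &ctx) < 0)
			return 1;

		memset(&cb, 0, sizeof(cb));
		cb.aio_fildes     = fd;
		cb.aio_lio_opcode = IOCB_CMD_PREAD;
		cb.aio_buf        = (unsigned long)buf;
		cb.aio_nbytes     = sizeof(buf);
		cb.aio_offset     = 0;

		if (syscall(SYS_io_submit, ctx, 1, cbs) != 1)
			return 1;
		if (syscall(SYS_io_getevents, ctx, 1, 1, &ev, NULL) == 1)
			printf("read %lld bytes\n", (long long)ev.res);

		syscall(SYS_io_destroy, ctx);
		close(fd);
		return 0;
	}
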
@@ -1694,10 +1379,13 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
assert_spin_locked(&ctx->ctx_lock);
+ if (key != KIOCB_KEY)
+ return NULL;
+
/* TODO: use a hash or array, this sucks. */
list_for_each(pos, &ctx->active_reqs) {
struct kiocb *kiocb = list_kiocb(pos);
- if (kiocb->ki_obj.user == iocb && kiocb->ki_key == key)
+ if (kiocb->ki_obj.user == iocb)
return kiocb;
}
return NULL;
@@ -1716,7 +1404,7 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
struct io_event __user *, result)
{
- int (*cancel)(struct kiocb *iocb, struct io_event *res);
+ struct io_event res;
struct kioctx *ctx;
struct kiocb *kiocb;
u32 key;
@@ -1731,32 +1419,22 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
return -EINVAL;
spin_lock_irq(&ctx->ctx_lock);
- ret = -EAGAIN;
+
kiocb = lookup_kiocb(ctx, iocb, key);
- if (kiocb && kiocb->ki_cancel) {
- cancel = kiocb->ki_cancel;
- kiocb->ki_users ++;
- kiocbSetCancelled(kiocb);
- } else
- cancel = NULL;
+ if (kiocb)
+ ret = kiocb_cancel(ctx, kiocb, &res);
+ else
+ ret = -EINVAL;
+
spin_unlock_irq(&ctx->ctx_lock);
- if (NULL != cancel) {
- struct io_event tmp;
- pr_debug("calling cancel\n");
- memset(&tmp, 0, sizeof(tmp));
- tmp.obj = (u64)(unsigned long)kiocb->ki_obj.user;
- tmp.data = kiocb->ki_user_data;
- ret = cancel(kiocb, &tmp);
- if (!ret) {
- /* Cancellation succeeded -- copy the result
- * into the user's buffer.
- */
- if (copy_to_user(result, &tmp, sizeof(tmp)))
- ret = -EFAULT;
- }
- } else
- ret = -EINVAL;
+ if (!ret) {
+ /* Cancellation succeeded -- copy the result
+ * into the user's buffer.
+ */
+ if (copy_to_user(result, &res, sizeof(res)))
+ ret = -EFAULT;
+ }
put_ioctx(ctx);
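
Correspondingly, a small userspace helper for the io_cancel() path above, assuming ctx and cb come from an earlier io_submit(); with this change a successful cancellation also fills the caller's io_event.

	#include <linux/aio_abi.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <errno.h>

	/* Returns 0 if the request was cancelled, -errno otherwise. */
	static int cancel_one(aio_context_t ctx, struct iocb *cb)
	{
		struct io_event result;

		if (syscall(SYS_io_cancel, ctx, cb, &result) == 0)
			return 0;	/* result now holds the completion codes */

		return -errno;		/* typically -EINVAL: unknown or already done */
	}
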
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index bbc8f8827eac..b37033819070 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -256,8 +256,6 @@ static int load_aout_binary(struct linux_binprm * bprm)
(current->mm->start_data = N_DATADDR(ex));
current->mm->brk = ex.a_bss +
(current->mm->start_brk = N_BSSADDR(ex));
- current->mm->free_area_cache = current->mm->mmap_base;
- current->mm->cached_hole_size = 0;
retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
if (retval < 0) {
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 3939829f6c5c..8afbb009b2e2 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -140,6 +140,25 @@ static int padzero(unsigned long elf_bss)
#define ELF_BASE_PLATFORM NULL
#endif
+/*
+ * Use get_random_int() to implement AT_RANDOM while avoiding depletion
+ * of the entropy pool.
+ */
+static void get_atrandom_bytes(unsigned char *buf, size_t nbytes)
+{
+ unsigned char *p = buf;
+
+ while (nbytes) {
+ unsigned int random_variable;
+ size_t chunk = min(nbytes, sizeof(random_variable));
+
+ random_variable = get_random_int();
+ memcpy(p, &random_variable, chunk);
+ p += chunk;
+ nbytes -= chunk;
+ }
+}
+
static int
create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
unsigned long load_addr, unsigned long interp_load_addr)
@@ -201,7 +220,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
/*
* Generate 16 random bytes for userspace PRNG seeding.
*/
- get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes));
+ get_atrandom_bytes(k_rand_bytes, sizeof(k_rand_bytes));
u_rand_bytes = (elf_addr_t __user *)
STACK_ALLOC(p, sizeof(k_rand_bytes));
if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes)))
@@ -735,8 +754,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
/* Do this so that we can load the interpreter, if need be. We will
change some of these later */
- current->mm->free_area_cache = current->mm->mmap_base;
- current->mm->cached_hole_size = 0;
retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
executable_stack);
if (retval < 0) {
diff --git a/fs/bio.c b/fs/bio.c
index bb5768f59b32..0cc1b5562abc 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -27,6 +27,7 @@
#include <linux/mempool.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>
+#include <linux/aio.h>
#include <scsi/sg.h> /* for struct sg_iovec */
#include <trace/events/block.h>
@@ -1407,33 +1408,44 @@ void bio_flush_dcache_pages(struct bio *bi)
EXPORT_SYMBOL(bio_flush_dcache_pages);
#endif
-/**
- * bio_endio - end I/O on a bio
- * @bio: bio
- * @error: error, if any
- *
- * Description:
- * bio_endio() will end I/O on the whole bio. bio_endio() is the
- * preferred way to end I/O on a bio, it takes care of clearing
- * BIO_UPTODATE on error. @error is 0 on success, and and one of the
- * established -Exxxx (-EIO, for instance) error values in case
- * something went wrong. No one should call bi_end_io() directly on a
- * bio unless they own it and thus know that it has an end_io
- * function.
- **/
-void bio_endio(struct bio *bio, int error)
+static inline void __bio_endio(struct bio *bio, struct batch_complete *batch)
{
- if (error)
+ if (bio->bi_error)
clear_bit(BIO_UPTODATE, &bio->bi_flags);
else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
- error = -EIO;
+ bio->bi_error = -EIO;
+
+ if (bio_flagged(bio, BIO_BATCH_ENDIO))
+ bio->bi_batch_end_io(bio, bio->bi_error, batch);
+ else if (bio->bi_end_io)
+ bio->bi_end_io(bio, bio->bi_error);
+}
+
+void bio_endio_batch(struct bio *bio, int error, struct batch_complete *batch)
+{
+ if (error)
+ bio->bi_error = error;
trace_block_bio_complete(bio, error);
- if (bio->bi_end_io)
- bio->bi_end_io(bio, error);
+ if (batch)
+ bio_list_add(&batch->bio, bio);
+ else
+ __bio_endio(bio, batch);
+
+}
+EXPORT_SYMBOL(bio_endio_batch);
+
+void batch_complete(struct batch_complete *batch)
+{
+ struct bio *bio;
+
+ while ((bio = bio_list_pop(&batch->bio)))
+ __bio_endio(bio, batch);
+
+ batch_complete_aio(batch);
}
-EXPORT_SYMBOL(bio_endio);
+EXPORT_SYMBOL(batch_complete);
void bio_pair_release(struct bio_pair *bp)
{
diff --git a/fs/block_dev.c b/fs/block_dev.c
index aea605c98ba6..4d48cf5814f8 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -27,6 +27,7 @@
#include <linux/namei.h>
#include <linux/log2.h>
#include <linux/cleancache.h>
+#include <linux/aio.h>
#include <asm/uaccess.h>
#include "internal.h"
@@ -616,11 +617,9 @@ void bd_forget(struct inode *inode)
struct block_device *bdev = NULL;
spin_lock(&bdev_lock);
- if (inode->i_bdev) {
- if (!sb_is_blkdev_sb(inode->i_sb))
- bdev = inode->i_bdev;
- __bd_forget(inode);
- }
+ if (!sb_is_blkdev_sb(inode->i_sb))
+ bdev = inode->i_bdev;
+ __bd_forget(inode);
spin_unlock(&bdev_lock);
if (bdev)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 3dca405848f2..af71b64d8ca6 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -31,6 +31,7 @@
#include <linux/compat.h>
#include <linux/slab.h>
#include <linux/btrfs.h>
+#include <linux/aio.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -1514,7 +1515,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
size_t count, ocount;
bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
- sb_start_write(inode->i_sb);
+ if (!sb_start_file_write(file))
+ return -EAGAIN;
mutex_lock(&inode->i_mutex);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 77a8e72d9c75..15b9d255ed89 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -41,6 +41,7 @@
#include <linux/mount.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
+#include <linux/aio.h>
#include "compat.h"
#include "ctree.h"
#include "disk-io.h"
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index bf338d9b67e3..eb09f41ee52d 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -7,6 +7,7 @@
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/writeback.h>
+#include <linux/aio.h>
#include "super.h"
#include "mds_client.h"
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 8c0d85577314..517f464c70f5 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2516,7 +2516,8 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov,
BUG_ON(iocb->ki_pos != pos);
- sb_start_write(inode->i_sb);
+ if (!sb_start_file_write(file))
+ return -EAGAIN;
/*
* We need to hold the sem to be sure nobody modifies lock list
diff --git a/fs/compat.c b/fs/compat.c
index fe40fde29111..ba323f8de6fc 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -48,6 +48,7 @@
#include <linux/fs_struct.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
+#include <linux/aio.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
diff --git a/fs/coredump.c b/fs/coredump.c
index c6479658d487..25c9677cbad8 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -32,6 +32,7 @@
#include <linux/pipe_fs_i.h>
#include <linux/oom.h>
#include <linux/compat.h>
+#include <linux/freezer.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
@@ -417,10 +418,13 @@ static void wait_for_dump_helpers(struct file *file)
pipe->readers++;
pipe->writers--;
- while ((pipe->readers > 1) && (!signal_pending(current))) {
+ while ((pipe->readers > 1) && (!fatal_signal_pending(current))) {
wake_up_interruptible_sync(&pipe->wait);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
pipe_wait(pipe);
+ pipe_unlock(pipe);
+ try_to_freeze();
+ pipe_lock(pipe);
}
pipe->readers--;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index f853263cf74f..55683f36a2f1 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -37,6 +37,7 @@
#include <linux/uio.h>
#include <linux/atomic.h>
#include <linux/prefetch.h>
+#include <linux/aio.h>
/*
* How many user pages to map in one call to get_user_pages(). This determines
@@ -229,7 +230,8 @@ static inline struct page *dio_get_page(struct dio *dio,
* filesystems can use it to hold additional state between get_block calls and
* dio_complete.
*/
-static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async)
+static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async,
+ struct batch_complete *batch)
{
ssize_t transferred = 0;
@@ -263,7 +265,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is
} else {
inode_dio_done(dio->inode);
if (is_async)
- aio_complete(dio->iocb, ret, 0);
+ aio_complete_batch(dio->iocb, ret, 0, batch);
}
return ret;
@@ -273,7 +275,7 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio);
/*
* Asynchronous IO callback.
*/
-static void dio_bio_end_aio(struct bio *bio, int error)
+static void dio_bio_end_aio(struct bio *bio, int error, struct batch_complete *batch)
{
struct dio *dio = bio->bi_private;
unsigned long remaining;
@@ -289,7 +291,7 @@ static void dio_bio_end_aio(struct bio *bio, int error)
spin_unlock_irqrestore(&dio->bio_lock, flags);
if (remaining == 0) {
- dio_complete(dio, dio->iocb->ki_pos, 0, true);
+ dio_complete(dio, dio->iocb->ki_pos, 0, true, batch);
kmem_cache_free(dio_cache, dio);
}
}
@@ -328,7 +330,7 @@ void dio_end_io(struct bio *bio, int error)
struct dio *dio = bio->bi_private;
if (dio->is_async)
- dio_bio_end_aio(bio, error);
+ dio_bio_end_aio(bio, error, NULL);
else
dio_bio_end_io(bio, error);
}
@@ -349,9 +351,10 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
bio->bi_bdev = bdev;
bio->bi_sector = first_sector;
- if (dio->is_async)
- bio->bi_end_io = dio_bio_end_aio;
- else
+ if (dio->is_async) {
+ bio->bi_batch_end_io = dio_bio_end_aio;
+ bio->bi_flags |= 1 << BIO_BATCH_ENDIO;
+ } else
bio->bi_end_io = dio_bio_end_io;
sdio->bio = bio;
@@ -1272,7 +1275,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
dio_await_completion(dio);
if (drop_refcount(dio) == 0) {
- retval = dio_complete(dio, offset, retval, false);
+ retval = dio_complete(dio, offset, retval, false, NULL);
kmem_cache_free(dio_cache, dio);
} else
BUG_ON(retval != -EIOCBQUEUED);
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index c00e055b6282..f23d2a7ed438 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -58,6 +58,8 @@ int drop_caches_sysctl_handler(ctl_table *table, int write,
if (ret)
return ret;
if (write) {
+ printk(KERN_NOTICE "%s (%d): dropped kernel caches: %d\n",
+ current->comm, task_pid_nr(current), sysctl_drop_caches);
if (sysctl_drop_caches & 1)
iterate_supers(drop_pagecache_sb, NULL);
if (sysctl_drop_caches & 2)
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 63b1f54b6a1f..201f0a0d6b0a 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -31,6 +31,7 @@
#include <linux/security.h>
#include <linux/compat.h>
#include <linux/fs_stack.h>
+#include <linux/aio.h>
#include "ecryptfs_kernel.h"
/**
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 9fec1836057a..5dcc470879c7 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -348,7 +348,7 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p)
/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
static inline int ep_op_has_event(int op)
{
- return op != EPOLL_CTL_DEL;
+ return op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD;
}
/* Initialize the poll safe wake up structure */
@@ -678,6 +678,36 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
return 0;
}
+/*
+ * Disables a "struct epitem" in the eventpoll set. Returns -EBUSY if the item
+ * had no event flags set, indicating that another thread may be currently
+ * handling that item's events (in the case that EPOLLONESHOT was being
+ * used). Otherwise a zero result indicates that the item has been disabled
+ * from receiving events. A disabled item may be re-enabled via
+ * EPOLL_CTL_MOD. Must be called with "mtx" held.
+ */
+static int ep_disable(struct eventpoll *ep, struct epitem *epi)
+{
+ int result = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ep->lock, flags);
+ if (epi->event.events & EPOLLONESHOT) {
+ if (epi->event.events & ~EP_PRIVATE_BITS) {
+ if (ep_is_linked(&epi->rdllink))
+ list_del_init(&epi->rdllink);
+ /* Ensure ep_poll_callback will not add epi back onto
+ ready list: */
+ epi->event.events &= EP_PRIVATE_BITS;
+ } else
+ result = -EBUSY;
+ } else
+ result = -EINVAL;
+ spin_unlock_irqrestore(&ep->lock, flags);
+
+ return result;
+}
+
static void ep_free(struct eventpoll *ep)
{
struct rb_node *rbp;
@@ -1048,8 +1078,6 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
rb_insert_color(&epi->rbn, &ep->rbr);
}
-
-
#define PATH_ARR_SIZE 5
/*
* These are the number paths of length 1 to 5, that we are allowing to emanate
@@ -1835,6 +1863,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
} else
error = -ENOENT;
break;
+ case EPOLL_CTL_DISABLE:
+ if (epi)
+ error = ep_disable(ep, epi);
+ else
+ error = -ENOENT;
+ break;
}
mutex_unlock(&ep->mtx);
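
A hedged userspace sketch of the new EPOLL_CTL_DISABLE operation: the opcode itself comes from this series' uapi header change (not shown here) and is only meaningful for EPOLLONESHOT items, so the availability of the constant is an assumption tied to this patch.

	#include <sys/epoll.h>
	#include <errno.h>

	/* 0: item disabled, safe to tear down its user state.
	 * -EBUSY: another thread may still be handling the item's events.
	 * -EINVAL/-ENOENT: not an EPOLLONESHOT item, or not registered. */
	static int disable_item(int epfd, int fd)
	{
		if (epoll_ctl(epfd, EPOLL_CTL_DISABLE, fd, NULL) == 0)
			return 0;

		return -errno;
	}
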
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index c3881e56662e..43d72d0ec7e2 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -31,6 +31,7 @@
#include <linux/mpage.h>
#include <linux/fiemap.h>
#include <linux/namei.h>
+#include <linux/aio.h>
#include "ext2.h"
#include "acl.h"
#include "xip.h"
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index d512c4bc4ad7..eac4f041f5fc 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -27,6 +27,7 @@
#include <linux/writeback.h>
#include <linux/mpage.h>
#include <linux/namei.h>
+#include <linux/aio.h>
#include "ext3.h"
#include "xattr.h"
#include "acl.h"
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 64848b595b24..4959e29573b6 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -23,6 +23,7 @@
#include <linux/jbd2.h>
#include <linux/mount.h>
#include <linux/path.h>
+#include <linux/aio.h>
#include <linux/quotaops.h>
#include <linux/pagevec.h>
#include "ext4.h"
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index c541ab8b64dd..cf102246f2da 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -20,6 +20,7 @@
* (sct@redhat.com), 1993, 1998
*/
+#include <linux/aio.h>
#include "ext4_jbd2.h"
#include "truncate.h"
#include "ext4_extents.h" /* Needed for EXT_MAX_BLOCKS */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9ea0cde3fa9e..f513f3dea057 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,6 +37,7 @@
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
+#include <linux/aio.h>
#include "ext4_jbd2.h"
#include "xattr.h"
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 809b31003ecc..d9903af92e51 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -18,6 +18,7 @@
#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/namei.h>
+#include <linux/aio.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 7bd22a201125..d0ed4ba4b61b 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -12,6 +12,7 @@
#include <linux/f2fs_fs.h>
#include <linux/buffer_head.h>
#include <linux/mpage.h>
+#include <linux/aio.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index acf6e479b443..d1d502a026a5 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -20,6 +20,7 @@
#include <linux/buffer_head.h>
#include <linux/exportfs.h>
#include <linux/mount.h>
+#include <linux/aio.h>
#include <linux/vfs.h>
#include <linux/parser.h>
#include <linux/uio.h>
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 6f96a8def147..06b5e086ab3a 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -38,6 +38,7 @@
#include <linux/device.h>
#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/aio.h>
#include <linux/kdev_t.h>
#include <linux/kthread.h>
#include <linux/list.h>
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index e9bdec0b16d9..4b24ff2ee636 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -19,6 +19,7 @@
#include <linux/pipe_fs_i.h>
#include <linux/swap.h>
#include <linux/splice.h>
+#include <linux/aio.h>
MODULE_ALIAS_MISCDEV(FUSE_MINOR);
MODULE_ALIAS("devname:fuse");
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index c8071768b950..3933c534b422 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -15,6 +15,7 @@
#include <linux/module.h>
#include <linux/compat.h>
#include <linux/swap.h>
+#include <linux/aio.h>
static const struct file_operations fuse_direct_io_file_operations;
@@ -971,7 +972,8 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
return err;
count = ocount;
- sb_start_write(inode->i_sb);
+ if (!sb_start_file_write(file))
+ return -EAGAIN;
mutex_lock(&inode->i_mutex);
/* We can write back this queue in page reclaim */
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 24f414f0ce61..371bd144d802 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -20,6 +20,7 @@
#include <linux/swap.h>
#include <linux/gfs2_ondisk.h>
#include <linux/backing-dev.h>
+#include <linux/aio.h>
#include "gfs2.h"
#include "incore.h"
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 019f45e45097..1b78c78cde29 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -25,6 +25,7 @@
#include <asm/uaccess.h>
#include <linux/dlm.h>
#include <linux/dlm_plock.h>
+#include <linux/aio.h>
#include "gfs2.h"
#include "incore.h"
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 3031dfdd2358..a9d60d46ba99 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -14,6 +14,7 @@
#include <linux/pagemap.h>
#include <linux/mpage.h>
#include <linux/sched.h>
+#include <linux/aio.h>
#include "hfs_fs.h"
#include "btree.h"
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 160ccc9cdb4b..cdd181d8ba09 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -14,6 +14,7 @@
#include <linux/pagemap.h>
#include <linux/mpage.h>
#include <linux/sched.h>
+#include <linux/aio.h>
#include "hfsplus_fs.h"
#include "hfsplus_raw.h"
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index b7dc47ba675e..1781f06aa1c1 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -23,6 +23,7 @@
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/writeback.h>
+#include <linux/aio.h>
#include "jfs_incore.h"
#include "jfs_inode.h"
#include "jfs_filsys.h"
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 68cfc118a806..16d39c6c4fbb 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1079,7 +1079,6 @@ free_client(struct nfs4_client *clp)
}
free_svc_cred(&clp->cl_cred);
kfree(clp->cl_name.data);
- idr_remove_all(&clp->cl_stateids);
idr_destroy(&clp->cl_stateids);
kfree(clp);
}
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 6b49f14eac8c..1e92930d59c3 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -25,7 +25,7 @@
#include <linux/gfp.h>
#include <linux/mpage.h>
#include <linux/writeback.h>
-#include <linux/uio.h>
+#include <linux/aio.h>
#include "nilfs.h"
#include "btnode.h"
#include "segment.h"
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 5b2d4f0853ac..b870ae00517a 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -27,6 +27,7 @@
#include <linux/swap.h>
#include <linux/uio.h>
#include <linux/writeback.h>
+#include <linux/aio.h>
#include <asm/page.h>
#include <asm/uaccess.h>
@@ -2129,7 +2130,8 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
BUG_ON(iocb->ki_pos != pos);
- sb_start_write(inode->i_sb);
+ if (!sb_start_file_write(file))
+ return -EAGAIN;
mutex_lock(&inode->i_mutex);
ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos);
mutex_unlock(&inode->i_mutex);
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index d3e118cc6ffa..2778b0255dc6 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -28,6 +28,7 @@
#include <linux/quotaops.h>
#include <linux/slab.h>
#include <linux/log2.h>
+#include <linux/aio.h>
#include "aops.h"
#include "attrib.h"
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index ffb2da370a99..f671e49beb34 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -22,6 +22,8 @@
#ifndef OCFS2_AOPS_H
#define OCFS2_AOPS_H
+#include <linux/aio.h>
+
handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
struct page *page,
unsigned from,
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 12ae194ac943..3a44a648dae7 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2322,7 +2322,7 @@ int ocfs2_inode_lock_full_nested(struct inode *inode,
status = __ocfs2_cluster_lock(osb, lockres, level, dlm_flags,
arg_flags, subclass, _RET_IP_);
if (status < 0) {
- if (status != -EAGAIN && status != -EIOCBRETRY)
+ if (status != -EAGAIN)
mlog_errno(status);
goto bail;
}
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 6474cb44004d..859cef7e8b4a 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2248,7 +2248,8 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
if (iocb->ki_left == 0)
return 0;
- sb_start_write(inode->i_sb);
+ if (!sb_start_file_write(file))
+ return -EAGAIN;
appending = file->f_flags & O_APPEND ? 1 : 0;
direct_io = file->f_flags & O_DIRECT ? 1 : 0;
@@ -2468,6 +2469,9 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
out->f_path.dentry->d_name.len,
out->f_path.dentry->d_name.name, len);
+ if (!sb_start_file_write(out))
+ return -EAGAIN;
+
if (pipe->inode)
mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT);
@@ -2506,6 +2510,7 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
balance_dirty_pages_ratelimited(mapping);
}
+ sb_end_write(inode->i_sb);
return ret;
}
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 88924a3133fa..c765bdf6d60e 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -28,6 +28,8 @@
#include "extent_map.h"
+struct iocb;
+
/* OCFS2 Inode Private Data */
struct ocfs2_inode_info
{
diff --git a/fs/pipe.c b/fs/pipe.c
index 64a494cef0a0..4c8622cfaa80 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -21,6 +21,7 @@
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
+#include <linux/aio.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
diff --git a/fs/read_write.c b/fs/read_write.c
index 3ae6dbe828bf..85de8388d182 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -15,6 +15,7 @@
#include <linux/syscalls.h>
#include <linux/pagemap.h>
#include <linux/splice.h>
+#include <linux/aio.h>
#include "read_write.h"
#include <asm/uaccess.h>
@@ -318,16 +319,6 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count
return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
}
-static void wait_on_retry_sync_kiocb(struct kiocb *iocb)
-{
- set_current_state(TASK_UNINTERRUPTIBLE);
- if (!kiocbIsKicked(iocb))
- schedule();
- else
- kiocbClearKicked(iocb);
- __set_current_state(TASK_RUNNING);
-}
-
ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
struct iovec iov = { .iov_base = buf, .iov_len = len };
@@ -339,13 +330,7 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp
kiocb.ki_left = len;
kiocb.ki_nbytes = len;
- for (;;) {
- ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
- if (ret != -EIOCBRETRY)
- break;
- wait_on_retry_sync_kiocb(&kiocb);
- }
-
+ ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
if (-EIOCBQUEUED == ret)
ret = wait_on_sync_kiocb(&kiocb);
*ppos = kiocb.ki_pos;
@@ -395,13 +380,7 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof
kiocb.ki_left = len;
kiocb.ki_nbytes = len;
- for (;;) {
- ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
- if (ret != -EIOCBRETRY)
- break;
- wait_on_retry_sync_kiocb(&kiocb);
- }
-
+ ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
if (-EIOCBQUEUED == ret)
ret = wait_on_sync_kiocb(&kiocb);
*ppos = kiocb.ki_pos;
@@ -568,13 +547,7 @@ ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
kiocb.ki_left = len;
kiocb.ki_nbytes = len;
- for (;;) {
- ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
- if (ret != -EIOCBRETRY)
- break;
- wait_on_retry_sync_kiocb(&kiocb);
- }
-
+ ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
if (ret == -EIOCBQUEUED)
ret = wait_on_sync_kiocb(&kiocb);
*ppos = kiocb.ki_pos;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index ea5061fd4f3e..77d6d47abc83 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -18,6 +18,7 @@
#include <linux/writeback.h>
#include <linux/quotaops.h>
#include <linux/swap.h>
+#include <linux/aio.h>
int reiserfs_commit_write(struct file *f, struct page *page,
unsigned from, unsigned to);
diff --git a/fs/splice.c b/fs/splice.c
index 718bd0056384..84f0cbe735ce 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -999,7 +999,8 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
};
ssize_t ret;
- sb_start_write(inode->i_sb);
+ if (!sb_start_file_write(out))
+ return -EAGAIN;
pipe_lock(pipe);
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index f12189d2db1d..14374530784c 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -50,6 +50,7 @@
*/
#include "ubifs.h"
+#include <linux/aio.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/slab.h>
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 7a12e48ad819..b6d15d349810 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -38,6 +38,7 @@
#include <linux/slab.h>
#include <linux/crc-itu-t.h>
#include <linux/mpage.h>
+#include <linux/aio.h>
#include "udf_i.h"
#include "udf_sb.h"
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 5f707e537171..c24ce0e9c67c 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -31,6 +31,7 @@
#include "xfs_vnodeops.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
+#include <linux/aio.h>
#include <linux/gfp.h>
#include <linux/mpage.h>
#include <linux/pagevec.h>
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index f03bf1a456fb..a81aa74a7263 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -36,6 +36,7 @@
#include "xfs_ioctl.h"
#include "xfs_trace.h"
+#include <linux/aio.h>
#include <linux/dcache.h>
#include <linux/falloc.h>
#include <linux/pagevec.h>
@@ -775,7 +776,8 @@ xfs_file_aio_write(
if (ocount == 0)
return 0;
- sb_start_write(inode->i_sb);
+ if (!sb_start_file_write(file))
+ return -EAGAIN;
if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
ret = -EIO;
diff --git a/include/linux/aio.h b/include/linux/aio.h
index 31ff6dba4872..a7e4c595825e 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -6,94 +6,38 @@
#include <linux/aio_abi.h>
#include <linux/uio.h>
#include <linux/rcupdate.h>
-
#include <linux/atomic.h>
-
-#define AIO_MAXSEGS 4
-#define AIO_KIOGRP_NR_ATOMIC 8
+#include <linux/batch_complete.h>
struct kioctx;
+struct kiocb;
+struct batch_complete;
-/* Notes on cancelling a kiocb:
- * If a kiocb is cancelled, aio_complete may return 0 to indicate
- * that cancel has not yet disposed of the kiocb. All cancel
- * operations *must* call aio_put_req to dispose of the kiocb
- * to guard against races with the completion code.
- */
-#define KIOCB_C_CANCELLED 0x01
-#define KIOCB_C_COMPLETE 0x02
-
-#define KIOCB_SYNC_KEY (~0U)
+#define KIOCB_KEY 0
-/* ki_flags bits */
/*
- * This may be used for cancel/retry serialization in the future, but
- * for now it's unused and we probably don't want modules to even
- * think they can use it.
+ * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either
+ * cancelled or completed (this makes a certain amount of sense because
+ * successful cancellation - io_cancel() - does deliver the completion to
+ * userspace).
+ *
+ * And since most things don't implement kiocb cancellation and we'd really like
+ * kiocb completion to be lockless when possible, we use ki_cancel to
+ * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED
+ * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel().
*/
-/* #define KIF_LOCKED 0 */
-#define KIF_KICKED 1
-#define KIF_CANCELLED 2
-
-#define kiocbTryLock(iocb) test_and_set_bit(KIF_LOCKED, &(iocb)->ki_flags)
-#define kiocbTryKick(iocb) test_and_set_bit(KIF_KICKED, &(iocb)->ki_flags)
+#define KIOCB_CANCELLED ((void *) (~0ULL))
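
The comment above describes a lock-free hand-off: completion and cancellation both race to swap the sentinel into ki_cancel, and whoever wins owns the kiocb. A userspace analogue of that protocol with C11 atomics (a dummy function stands in for the ~0 sentinel; none of these names exist in the kernel):

	#include <stdatomic.h>
	#include <stdio.h>

	struct req;
	typedef int (cancel_fn)(struct req *);

	struct req {
		_Atomic(cancel_fn *) cancel;
	};

	/* Stand-in for KIOCB_CANCELLED: any unique pointer works in the sketch. */
	static int cancelled_sentinel(struct req *r) { (void)r; return -1; }
	#define REQ_CANCELLED (&cancelled_sentinel)

	static int complete_req(struct req *r)
	{
		cancel_fn *old = atomic_exchange(&r->cancel, REQ_CANCELLED);

		if (old == REQ_CANCELLED)
			return 0;	/* cancel path won the race; it owns the event */
		/* ...deliver the completion to the ring here... */
		return 1;
	}

	static int cancel_req(struct req *r)
	{
		cancel_fn *old = atomic_load(&r->cancel);

		do {
			if (old == REQ_CANCELLED)
				return -1;	/* already completed or cancelled */
		} while (!atomic_compare_exchange_weak(&r->cancel, &old, REQ_CANCELLED));

		return old ? old(r) : -1;	/* run the cancel hook, if one was set */
	}

	static int my_cancel(struct req *r) { (void)r; return 0; }

	int main(void)
	{
		struct req r = { .cancel = my_cancel };

		printf("cancel:   %d\n", cancel_req(&r));	/* runs my_cancel -> 0 */
		printf("complete: %d\n", complete_req(&r));	/* lost the race -> 0 */
		return 0;
	}
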
-#define kiocbSetLocked(iocb) set_bit(KIF_LOCKED, &(iocb)->ki_flags)
-#define kiocbSetKicked(iocb) set_bit(KIF_KICKED, &(iocb)->ki_flags)
-#define kiocbSetCancelled(iocb) set_bit(KIF_CANCELLED, &(iocb)->ki_flags)
+typedef int (kiocb_cancel_fn)(struct kiocb *, struct io_event *);
-#define kiocbClearLocked(iocb) clear_bit(KIF_LOCKED, &(iocb)->ki_flags)
-#define kiocbClearKicked(iocb) clear_bit(KIF_KICKED, &(iocb)->ki_flags)
-#define kiocbClearCancelled(iocb) clear_bit(KIF_CANCELLED, &(iocb)->ki_flags)
-
-#define kiocbIsLocked(iocb) test_bit(KIF_LOCKED, &(iocb)->ki_flags)
-#define kiocbIsKicked(iocb) test_bit(KIF_KICKED, &(iocb)->ki_flags)
-#define kiocbIsCancelled(iocb) test_bit(KIF_CANCELLED, &(iocb)->ki_flags)
-
-/* is there a better place to document function pointer methods? */
-/**
- * ki_retry - iocb forward progress callback
- * @kiocb: The kiocb struct to advance by performing an operation.
- *
- * This callback is called when the AIO core wants a given AIO operation
- * to make forward progress. The kiocb argument describes the operation
- * that is to be performed. As the operation proceeds, perhaps partially,
- * ki_retry is expected to update the kiocb with progress made. Typically
- * ki_retry is set in the AIO core and it itself calls file_operations
- * helpers.
- *
- * ki_retry's return value determines when the AIO operation is completed
- * and an event is generated in the AIO event ring. Except the special
- * return values described below, the value that is returned from ki_retry
- * is transferred directly into the completion ring as the operation's
- * resulting status. Once this has happened ki_retry *MUST NOT* reference
- * the kiocb pointer again.
- *
- * If ki_retry returns -EIOCBQUEUED it has made a promise that aio_complete()
- * will be called on the kiocb pointer in the future. The AIO core will
- * not ask the method again -- ki_retry must ensure forward progress.
- * aio_complete() must be called once and only once in the future, multiple
- * calls may result in undefined behaviour.
- *
- * If ki_retry returns -EIOCBRETRY it has made a promise that kick_iocb()
- * will be called on the kiocb pointer in the future. This may happen
- * through generic helpers that associate kiocb->ki_wait with a wait
- * queue head that ki_retry uses via current->io_wait. It can also happen
- * with custom tracking and manual calls to kick_iocb(), though that is
- * discouraged. In either case, kick_iocb() must be called once and only
- * once. ki_retry must ensure forward progress, the AIO core will wait
- * indefinitely for kick_iocb() to be called.
- */
struct kiocb {
- struct list_head ki_run_list;
- unsigned long ki_flags;
- int ki_users;
- unsigned ki_key; /* id of this request */
+ struct rb_node ki_node;
+
+ atomic_t ki_users;
struct file *ki_filp;
- struct kioctx *ki_ctx; /* may be NULL for sync ops */
- int (*ki_cancel)(struct kiocb *, struct io_event *);
- ssize_t (*ki_retry)(struct kiocb *);
+ struct kioctx *ki_ctx; /* NULL for sync ops */
+ kiocb_cancel_fn *ki_cancel;
void (*ki_dtor)(struct kiocb *);
union {
@@ -102,6 +46,9 @@ struct kiocb {
} ki_obj;
__u64 ki_user_data; /* user's data for completion */
+ long ki_res;
+ long ki_res2;
+
loff_t ki_pos;
void *private;
@@ -117,7 +64,6 @@ struct kiocb {
struct list_head ki_list; /* the aio core uses this
* for cancellation */
- struct list_head ki_batch; /* batch allocation */
/*
* If the aio_resfd field of the userspace iocb is not zero,
@@ -128,108 +74,55 @@ struct kiocb {
static inline bool is_sync_kiocb(struct kiocb *kiocb)
{
- return kiocb->ki_key == KIOCB_SYNC_KEY;
+ return kiocb->ki_ctx == NULL;
}
static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
{
*kiocb = (struct kiocb) {
- .ki_users = 1,
- .ki_key = KIOCB_SYNC_KEY,
+ .ki_users = ATOMIC_INIT(1),
+ .ki_ctx = NULL,
.ki_filp = filp,
.ki_obj.tsk = current,
};
}
-#define AIO_RING_MAGIC 0xa10a10a1
-#define AIO_RING_COMPAT_FEATURES 1
-#define AIO_RING_INCOMPAT_FEATURES 0
-struct aio_ring {
- unsigned id; /* kernel internal index number */
- unsigned nr; /* number of io_events */
- unsigned head;
- unsigned tail;
-
- unsigned magic;
- unsigned compat_features;
- unsigned incompat_features;
- unsigned header_length; /* size of aio_ring */
-
-
- struct io_event io_events[0];
-}; /* 128 bytes + ring size */
-
-#define AIO_RING_PAGES 8
-struct aio_ring_info {
- unsigned long mmap_base;
- unsigned long mmap_size;
-
- struct page **ring_pages;
- spinlock_t ring_lock;
- long nr_pages;
-
- unsigned nr, tail;
-
- struct page *internal_pages[AIO_RING_PAGES];
-};
-
-static inline unsigned aio_ring_avail(struct aio_ring_info *info,
- struct aio_ring *ring)
-{
- return (ring->head + info->nr - 1 - ring->tail) % info->nr;
-}
-
-struct kioctx {
- atomic_t users;
- int dead;
- struct mm_struct *mm;
-
- /* This needs improving */
- unsigned long user_id;
- struct hlist_node list;
-
- wait_queue_head_t wait;
-
- spinlock_t ctx_lock;
-
- int reqs_active;
- struct list_head active_reqs; /* used for cancellation */
- struct list_head run_list; /* used for kicked reqs */
-
- /* sys_io_setup currently limits this to an unsigned int */
- unsigned max_reqs;
-
- struct aio_ring_info ring_info;
-
- struct delayed_work wq;
-
- struct rcu_head rcu_head;
-};
-
/* prototypes */
-extern unsigned aio_max_size;
-
#ifdef CONFIG_AIO
extern ssize_t wait_on_sync_kiocb(struct kiocb *iocb);
-extern int aio_put_req(struct kiocb *iocb);
-extern void kick_iocb(struct kiocb *iocb);
-extern int aio_complete(struct kiocb *iocb, long res, long res2);
+extern void aio_put_req(struct kiocb *iocb);
+extern void batch_complete_aio(struct batch_complete *batch);
+extern void aio_complete_batch(struct kiocb *iocb, long res, long res2,
+ struct batch_complete *batch);
struct mm_struct;
extern void exit_aio(struct mm_struct *mm);
extern long do_io_submit(aio_context_t ctx_id, long nr,
struct iocb __user *__user *iocbpp, bool compat);
+void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel);
#else
static inline ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { return 0; }
-static inline int aio_put_req(struct kiocb *iocb) { return 0; }
-static inline void kick_iocb(struct kiocb *iocb) { }
-static inline int aio_complete(struct kiocb *iocb, long res, long res2) { return 0; }
+static inline void aio_put_req(struct kiocb *iocb) { }
+
+static inline void batch_complete_aio(struct batch_complete *batch) { }
+static inline void aio_complete_batch(struct kiocb *iocb, long res, long res2,
+ struct batch_complete *batch)
+{
+ return;
+}
struct mm_struct;
static inline void exit_aio(struct mm_struct *mm) { }
static inline long do_io_submit(aio_context_t ctx_id, long nr,
struct iocb __user * __user *iocbpp,
bool compat) { return 0; }
+static inline void kiocb_set_cancel_fn(struct kiocb *req,
+ kiocb_cancel_fn *cancel) { }
#endif /* CONFIG_AIO */
+static inline void aio_complete(struct kiocb *iocb, long res, long res2)
+{
+ aio_complete_batch(iocb, res, res2, NULL);
+}
+
static inline struct kiocb *list_kiocb(struct list_head *h)
{
return list_entry(h, struct kiocb, ki_list);
diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
index f7f1d7169b11..6fd5cc80f62f 100644
--- a/include/linux/balloon_compaction.h
+++ b/include/linux/balloon_compaction.h
@@ -213,8 +213,15 @@ static inline bool balloon_compaction_check(void)
return true;
}
+static inline void balloon_event_count(enum vm_event_item item)
+{
+ count_vm_event(item);
+}
#else /* !CONFIG_BALLOON_COMPACTION */
+/* A macro, to avoid generating references to the undefined COMPACTBALLOON* */
+#define balloon_event_count(item) do { } while (0)
+
static inline void *balloon_mapping_alloc(void *balloon_device,
const struct address_space_operations *a_ops)
{
diff --git a/include/linux/batch_complete.h b/include/linux/batch_complete.h
new file mode 100644
index 000000000000..8167a9d306fb
--- /dev/null
+++ b/include/linux/batch_complete.h
@@ -0,0 +1,23 @@
+#ifndef _LINUX_BATCH_COMPLETE_H
+#define _LINUX_BATCH_COMPLETE_H
+
+#include <linux/rbtree.h>
+
+/*
+ * Common stuff to the aio and block code for batch completion. Everything
+ * important is elsewhere:
+ */
+
+struct bio;
+
+struct bio_list {
+ struct bio *head;
+ struct bio *tail;
+};
+
+struct batch_complete {
+ struct bio_list bio;
+ struct rb_root kiocb;
+};
+
+#endif
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 820e7aaad4fd..5f549176745c 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -24,6 +24,7 @@
#include <linux/mempool.h>
#include <linux/ioprio.h>
#include <linux/bug.h>
+#include <linux/batch_complete.h>
#ifdef CONFIG_BLOCK
@@ -68,6 +69,8 @@
#define bio_segments(bio) ((bio)->bi_vcnt - (bio)->bi_idx)
#define bio_sectors(bio) ((bio)->bi_size >> 9)
+void bio_endio_batch(struct bio *bio, int error, struct batch_complete *batch);
+
static inline unsigned int bio_cur_bytes(struct bio *bio)
{
if (bio->bi_vcnt)
@@ -241,7 +244,25 @@ static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask)
}
-extern void bio_endio(struct bio *, int);
+/**
+ * bio_endio - end I/O on a bio
+ * @bio: bio
+ * @error: error, if any
+ *
+ * Description:
+ * bio_endio() will end I/O on the whole bio. bio_endio() is the
+ * preferred way to end I/O on a bio, it takes care of clearing
+ * BIO_UPTODATE on error. @error is 0 on success, and one of the
+ * established -Exxxx (-EIO, for instance) error values in case
+ * something went wrong. No one should call bi_end_io() directly on a
+ * bio unless they own it and thus know that it has an end_io
+ * function.
+ **/
+static inline void bio_endio(struct bio *bio, int error)
+{
+ bio_endio_batch(bio, error, NULL);
+}
+
struct request_queue;
extern int bio_phys_segments(struct request_queue *, struct bio *);
@@ -420,10 +441,6 @@ static inline bool bio_mergeable(struct bio *bio)
* member of the bio. The bio_list also caches the last list member to allow
* fast access to the tail.
*/
-struct bio_list {
- struct bio *head;
- struct bio *tail;
-};
static inline int bio_list_empty(const struct bio_list *bl)
{
@@ -527,6 +544,15 @@ static inline struct bio *bio_list_get(struct bio_list *bl)
return bio;
}
+static inline void batch_complete_init(struct batch_complete *batch)
+{
+ bio_list_init(&batch->bio);
+ batch->kiocb = RB_ROOT;
+}
+
+void batch_complete(struct batch_complete *batch);
+
+
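
As a design note, the batch_complete helpers above exist so completions can be queued and their end_io callbacks run in one pass instead of one call per completion. A stripped-down userspace sketch of that deferral pattern (not the kernel API; the names here are invented for illustration):

	#include <stdio.h>
	#include <stddef.h>

	struct item {
		struct item *next;
		void (*done)(struct item *, int error);
		int error;
	};

	struct batch {
		struct item *head, **tail;
	};

	static void batch_init(struct batch *b)
	{
		b->head = NULL;
		b->tail = &b->head;
	}

	/* Analogue of bio_endio_batch(): record the completion, defer the callback. */
	static void complete_batched(struct batch *b, struct item *it, int error)
	{
		it->error = error;
		it->next = NULL;
		*b->tail = it;
		b->tail = &it->next;
	}

	/* Analogue of batch_complete(): run every deferred callback in one pass. */
	static void batch_run(struct batch *b)
	{
		struct item *it;

		while ((it = b->head) != NULL) {
			b->head = it->next;
			it->done(it, it->error);	/* callback may reuse "it" */
		}
		batch_init(b);
	}

	static void print_done(struct item *it, int error)
	{
		printf("item %p finished, error=%d\n", (void *)it, error);
	}

	int main(void)
	{
		struct batch b;
		struct item a = { .done = print_done }, c = { .done = print_done };

		batch_init(&b);
		complete_batched(&b, &a, 0);
		complete_batched(&b, &c, -5);	/* say, -EIO */
		batch_run(&b);
		return 0;
	}
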
#if defined(CONFIG_BLK_DEV_INTEGRITY)
#define bip_vec_idx(bip, idx) (&(bip->bip_vec[(idx)]))
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index cdf11191e645..d4e7bab9a17e 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -16,7 +16,9 @@ struct page;
struct block_device;
struct io_context;
struct cgroup_subsys_state;
+struct batch_complete;
typedef void (bio_end_io_t) (struct bio *, int);
+typedef void (bio_batch_end_io_t) (struct bio *, int, struct batch_complete *);
typedef void (bio_destructor_t) (struct bio *);
/*
@@ -42,6 +44,7 @@ struct bio {
* top bits priority
*/
+ short bi_error;
unsigned short bi_vcnt; /* how many bio_vec's */
unsigned short bi_idx; /* current index into bvl_vec */
@@ -59,7 +62,10 @@ struct bio {
unsigned int bi_seg_front_size;
unsigned int bi_seg_back_size;
- bio_end_io_t *bi_end_io;
+ union {
+ bio_end_io_t *bi_end_io;
+ bio_batch_end_io_t *bi_batch_end_io;
+ };
void *bi_private;
#ifdef CONFIG_BLK_CGROUP
@@ -111,12 +117,13 @@ struct bio {
#define BIO_FS_INTEGRITY 9 /* fs owns integrity data, not block layer */
#define BIO_QUIET 10 /* Make BIO Quiet */
#define BIO_MAPPED_INTEGRITY 11/* integrity metadata has been remapped */
+#define BIO_BATCH_ENDIO 12
/*
* Flags starting here get preserved by bio_reset() - this includes
* BIO_POOL_IDX()
*/
-#define BIO_RESET_BITS 12
+#define BIO_RESET_BITS 13
#define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag)))
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 78feda9bbae2..2f91edb6e2d3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -877,7 +877,8 @@ extern struct request *blk_fetch_request(struct request_queue *q);
* This prevents code duplication in drivers.
*/
extern bool blk_update_request(struct request *rq, int error,
- unsigned int nr_bytes);
+ unsigned int nr_bytes,
+ struct batch_complete *batch);
extern bool blk_end_request(struct request *rq, int error,
unsigned int nr_bytes);
extern void blk_end_request_all(struct request *rq, int error);
@@ -885,10 +886,17 @@ extern bool blk_end_request_cur(struct request *rq, int error);
extern bool blk_end_request_err(struct request *rq, int error);
extern bool __blk_end_request(struct request *rq, int error,
unsigned int nr_bytes);
-extern void __blk_end_request_all(struct request *rq, int error);
extern bool __blk_end_request_cur(struct request *rq, int error);
extern bool __blk_end_request_err(struct request *rq, int error);
+extern void blk_end_request_all_batch(struct request *rq, int error,
+ struct batch_complete *batch);
+
+static inline void __blk_end_request_all(struct request *rq, int error)
+{
+ blk_end_request_all_batch(rq, error, NULL);
+}
+
extern void blk_complete_request(struct request *);
extern void __blk_complete_request(struct request *);
extern void blk_abort_request(struct request *);
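The same batching applies at the request level: __blk_end_request_all() becomes the NULL-batch case of blk_end_request_all_batch(). A hedged sketch of a driver feeding several finished requests into one batch, assuming the same queue-lock context that __blk_end_request_all() requires (the helper name is illustrative):

static void my_end_requests_batched(struct request **rqs, unsigned int nr,
				    int error, struct batch_complete *batch)
{
	unsigned int i;

	for (i = 0; i < nr; i++)
		blk_end_request_all_batch(rqs[i], error, batch);
	/* the caller flushes the batch with batch_complete() afterwards */
}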
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 900af5964f55..906e5c59be3c 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -27,6 +27,7 @@ struct cgroup_subsys;
struct inode;
struct cgroup;
struct css_id;
+struct eventfd_ctx;
extern int cgroup_init_early(void);
extern int cgroup_init(void);
diff --git a/include/linux/errno.h b/include/linux/errno.h
index f6bf082d4d4f..89627b9187f9 100644
--- a/include/linux/errno.h
+++ b/include/linux/errno.h
@@ -28,6 +28,5 @@
#define EBADTYPE 527 /* Type not supported by server */
#define EJUKEBOX 528 /* Request initiated, but will not complete before timeout */
#define EIOCBQUEUED 529 /* iocb queued, will get completion event */
-#define EIOCBRETRY 530 /* iocb queued, will trigger a retry */
#endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4e686a099465..0b67925c128f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -675,9 +675,11 @@ static inline loff_t i_size_read(const struct inode *inode)
static inline void i_size_write(struct inode *inode, loff_t i_size)
{
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+ preempt_disable();
write_seqcount_begin(&inode->i_size_seqcount);
inode->i_size = i_size;
write_seqcount_end(&inode->i_size_seqcount);
+ preempt_enable();
#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT)
preempt_disable();
inode->i_size = i_size;
@@ -1399,6 +1401,16 @@ static inline int sb_start_write_trylock(struct super_block *sb)
return __sb_start_write(sb, SB_FREEZE_WRITE, false);
}
+/*
+ * sb_start_write() for writing into a file. When file has O_NONBLOCK set,
+ * we use trylock semantics, otherwise we block on frozen filesystem.
+ */
+static inline int sb_start_file_write(struct file *file)
+{
+ return __sb_start_write(file->f_mapping->host->i_sb, SB_FREEZE_WRITE,
+ !(file->f_flags & O_NONBLOCK));
+}
+
/**
* sb_start_pagefault - get write access to a superblock from a page fault
* @sb: the super we write to
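sb_start_file_write() lets O_NONBLOCK writers fail fast instead of sleeping on a frozen filesystem; the mm/filemap.c hunk later in this merge uses it exactly that way. A minimal sketch of the calling convention (my_file_write() is a placeholder, not part of the patch):

static ssize_t my_file_write(struct file *file, const char __user *buf,
			     size_t len, loff_t *ppos)
{
	ssize_t ret;

	if (!sb_start_file_write(file))		/* O_NONBLOCK and fs frozen: bail out */
		return -EAGAIN;
	ret = do_sync_write(file, buf, len, ppos);
	sb_end_write(file->f_mapping->host->i_sb);
	return ret;
}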
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 29eb805ea4a6..7a3046dbe255 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -94,6 +94,11 @@
*/
#define in_nmi() (preempt_count() & NMI_MASK)
+/*
+ * Are we in nmi, hardirq, or serving-softirq context?
+ */
+#define in_serving_irq() (in_nmi() || in_irq() || in_serving_softirq())
+
#if defined(CONFIG_PREEMPT_COUNT)
# define PREEMPT_CHECK_OFFSET 1
#else
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index ace9a5f01c64..fb425aa16c01 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -330,12 +330,9 @@ struct mm_struct {
unsigned long (*get_unmapped_area) (struct file *filp,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags);
- void (*unmap_area) (struct mm_struct *mm, unsigned long addr);
#endif
unsigned long mmap_base; /* base of mmap area */
unsigned long task_size; /* size of task vm space */
- unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */
- unsigned long free_area_cache; /* first hole of size cached_hole_size or larger */
unsigned long highest_vm_end; /* highest vma end address */
pgd_t * pgd;
atomic_t mm_users; /* How many users with user space? */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index ede274957e05..ab20a60d2671 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -815,7 +815,10 @@ unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long);
/*
* zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc.
*/
-#define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones)
+static inline enum zone_type zone_idx(struct zone *zone)
+{
+ return zone - zone->zone_pgdat->node_zones;
+}
static inline int populated_zone(struct zone *zone)
{
@@ -856,25 +859,18 @@ static inline int is_normal_idx(enum zone_type idx)
*/
static inline int is_highmem(struct zone *zone)
{
-#ifdef CONFIG_HIGHMEM
- int zone_off = (char *)zone - (char *)zone->zone_pgdat->node_zones;
- return zone_off == ZONE_HIGHMEM * sizeof(*zone) ||
- (zone_off == ZONE_MOVABLE * sizeof(*zone) &&
- zone_movable_is_highmem());
-#else
- return 0;
-#endif
+ return is_highmem_idx(zone_idx(zone));
}
static inline int is_normal(struct zone *zone)
{
- return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL;
+ return zone_idx(zone) == ZONE_NORMAL;
}
static inline int is_dma32(struct zone *zone)
{
#ifdef CONFIG_ZONE_DMA32
- return zone == zone->zone_pgdat->node_zones + ZONE_DMA32;
+ return zone_idx(zone) == ZONE_DMA32;
#else
return 0;
#endif
@@ -883,7 +879,7 @@ static inline int is_dma32(struct zone *zone)
static inline int is_dma(struct zone *zone)
{
#ifdef CONFIG_ZONE_DMA
- return zone == zone->zone_pgdat->node_zones + ZONE_DMA;
+ return zone_idx(zone) == ZONE_DMA;
#else
return 0;
#endif
diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
new file mode 100644
index 000000000000..d0cf8872dc43
--- /dev/null
+++ b/include/linux/percpu-refcount.h
@@ -0,0 +1,114 @@
+/*
+ * Dynamic percpu refcounts:
+ * (C) 2012 Google, Inc.
+ * Author: Kent Overstreet <koverstreet@google.com>
+ *
+ * This implements a refcount with similar semantics to atomic_t - atomic_inc(),
+ * atomic_dec_and_test() - but potentially percpu.
+ *
+ * There's one important difference between percpu refs and normal atomic_t
+ * refcounts; you have to keep track of your initial refcount, and then when you
+ * start shutting down you call percpu_ref_kill() _before_ dropping the initial
+ * refcount.
+ *
+ * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the
+ * refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill()
+ * puts the ref back in single atomic_t mode, collecting the per cpu refs and
+ * issuing the appropriate barriers, and then marks the ref as shutting down so
+ * that percpu_ref_put() will check for the ref hitting 0. After it returns,
+ * it's safe to drop the initial ref.
+ *
+ * BACKGROUND:
+ *
+ * Percpu refcounts are quite useful for performance, but if we blindly
+ * converted all refcounts to percpu counters we'd waste quite a bit of memory.
+ *
+ * Think about all the refcounts embedded in kobjects, files, etc. most of which
+ * aren't used much. These start out as simple atomic counters - a little bigger
+ * than a bare atomic_t, 16 bytes instead of 4 - but if we exceed some arbitrary
+ * number of gets in one second, we then switch to percpu counters.
+ *
+ * This heuristic isn't perfect because it'll fire if the refcount was only
+ * being used on one cpu; ideally we'd be able to count the number of cache
+ * misses on percpu_ref_get() or something similar, but that'd make the non
+ * percpu path significantly heavier/more complex. We can count the number of
+ * gets() without any extra atomic instructions on arches that support
+ * atomic64_t - simply by changing the atomic_inc() to atomic_add_return().
+ *
+ * USAGE:
+ *
+ * See fs/aio.c for some example usage; it's used there for struct kioctx, which
+ * is created when userspaces calls io_setup(), and destroyed when userspace
+ * calls io_destroy() or the process exits.
+ *
+ * In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it
+ * calls percpu_ref_kill(), then hlist_del_rcu() and synchronize_rcu() to remove
+ * the kioctx from the process's list of kioctxs - after that, there can't be
+ * any new users of the kioctx (from lookup_ioctx()) and it's then safe to drop
+ * the initial ref with percpu_ref_put().
+ *
+ * Code that does a two stage shutdown like this often needs some kind of
+ * explicit synchronization to ensure the initial refcount can only be dropped
+ * once - percpu_ref_kill() does this for you, it returns true once and false if
+ * someone else already called it. The aio code uses it this way, but it's not
+ * necessary if the code has some other mechanism to synchronize teardown.
+ *
+ * As mentioned previously, we decide when to convert a ref to percpu counters
+ * in percpu_ref_get(). However, since percpu_ref_get() will often be called
+ * with rcu_read_lock() held, it's not done there - percpu_ref_get() returns
+ * true if the ref should be converted to percpu counters.
+ *
+ * The caller should then call percpu_ref_alloc() after dropping
+ * rcu_read_lock(); if there is an uncommonly used codepath where it's
+ * inconvenient to call percpu_ref_alloc() after get(), it may be safely skipped
+ * and percpu_ref_get() will return true again the next time the counter wraps
+ * around.
+ */
+
+#ifndef _LINUX_PERCPU_REFCOUNT_H
+#define _LINUX_PERCPU_REFCOUNT_H
+
+#include <linux/atomic.h>
+#include <linux/percpu.h>
+
+struct percpu_ref {
+ atomic64_t count;
+ unsigned long pcpu_count;
+};
+
+void percpu_ref_init(struct percpu_ref *ref);
+void __percpu_ref_get(struct percpu_ref *ref, bool alloc);
+int percpu_ref_put(struct percpu_ref *ref);
+
+int percpu_ref_kill(struct percpu_ref *ref);
+int percpu_ref_dead(struct percpu_ref *ref);
+
+/**
+ * percpu_ref_get - increment a dynamic percpu refcount
+ *
+ * Increments @ref and possibly converts it to percpu counters. Must be called
+ * with rcu_read_lock() held, and may potentially drop/reacquire rcu_read_lock()
+ * to allocate percpu counters - if sleeping/allocation isn't safe for some
+ * other reason (e.g. a spinlock), see percpu_ref_get_noalloc().
+ *
+ * Analogous to atomic_inc().
+ */
+static inline void percpu_ref_get(struct percpu_ref *ref)
+{
+ __percpu_ref_get(ref, true);
+}
+
+/**
+ * percpu_ref_get_noalloc - increment a dynamic percpu refcount
+ *
+ * Increments @ref, to be used when it's not safe to allocate percpu counters.
+ * Must be called with rcu_read_lock() held.
+ *
+ * Analogous to atomic_inc().
+ */
+static inline void percpu_ref_get_noalloc(struct percpu_ref *ref)
+{
+ __percpu_ref_get(ref, false);
+}
+
+#endif
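To make the two-stage shutdown described above concrete, here is a hedged sketch of an object built on this API; struct my_obj and my_obj_free() are hypothetical, and the call pattern mirrors the kioctx usage the comment refers to:

struct my_obj {
	struct percpu_ref	ref;
};

static void my_obj_free(struct my_obj *obj);	/* hypothetical destructor */

static void my_obj_init(struct my_obj *obj)
{
	percpu_ref_init(&obj->ref);		/* holds the initial ref */
}

static void my_obj_use(struct my_obj *obj)
{
	rcu_read_lock();
	percpu_ref_get(&obj->ref);		/* may switch to percpu counters */
	rcu_read_unlock();

	/* ... use obj ... */

	if (percpu_ref_put(&obj->ref))		/* true only after the kill below */
		my_obj_free(obj);
}

static void my_obj_shutdown(struct my_obj *obj)
{
	if (percpu_ref_kill(&obj->ref) &&	/* returns true exactly once */
	    percpu_ref_put(&obj->ref))		/* now drop the initial ref */
		my_obj_free(obj);
}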
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 215e5e3dda10..6051177a890d 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -4,6 +4,7 @@
#include <linux/sched.h>
#include <linux/bug.h>
#include <linux/mm.h>
+#include <linux/workqueue.h>
#include <linux/threads.h>
#include <linux/nsproxy.h>
#include <linux/kref.h>
diff --git a/include/linux/rtc-pxa.h b/include/linux/rtc-pxa.h
new file mode 100644
index 000000000000..71bc45f060fc
--- /dev/null
+++ b/include/linux/rtc-pxa.h
@@ -0,0 +1,18 @@
+/*
+ * include/linux/rtc-pxa.h
+ *
+ * RTC PXA Header file
+ *
+ * Copyright (C) 2010 Marvell International Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __LINUX_RTC_PXA_H
+#define __LINUX_RTC_PXA_H
+
+extern int pxa_rtc_sync_time(unsigned int ticks);
+
+#endif /* __LINUX_RTC_PXA_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d35d2b6ddbfb..1fa7cc4405a9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -325,8 +325,6 @@ extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner);
struct nsproxy;
struct user_namespace;
-#include <linux/aio.h>
-
#ifdef CONFIG_MMU
extern void arch_pick_mmap_layout(struct mm_struct *mm);
extern unsigned long
@@ -336,8 +334,6 @@ extern unsigned long
arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff,
unsigned long flags);
-extern void arch_unmap_area(struct mm_struct *, unsigned long);
-extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
#else
static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}
#endif
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index bd6cf61142be..d4b7a184f08c 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -50,7 +50,12 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
COMPACTMIGRATE_SCANNED, COMPACTFREE_SCANNED,
COMPACTISOLATED,
COMPACTSTALL, COMPACTFAIL, COMPACTSUCCESS,
-#endif
+#ifdef CONFIG_BALLOON_COMPACTION
+ COMPACTBALLOONISOLATED, /* isolated from balloon pagelist */
+ COMPACTBALLOONMIGRATED, /* balloon page successfully migrated */
+ COMPACTBALLOONRETURNED, /* putback to pagelist, not-migrated */
+#endif /* CONFIG_BALLOON_COMPACTION */
+#endif /* CONFIG_COMPACTION */
#ifdef CONFIG_HUGETLB_PAGE
HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL,
#endif
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 7cb64d4b499d..ac38be2692d8 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -330,6 +330,92 @@ do { \
__ret; \
})
+#define __wait_event_hrtimeout(wq, condition, timeout, state) \
+({ \
+ int __ret = 0; \
+ DEFINE_WAIT(__wait); \
+ struct hrtimer_sleeper __t; \
+ \
+ hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC, \
+ HRTIMER_MODE_REL); \
+ hrtimer_init_sleeper(&__t, current); \
+ if ((timeout).tv64 != KTIME_MAX) \
+ hrtimer_start_range_ns(&__t.timer, timeout, \
+ current->timer_slack_ns, \
+ HRTIMER_MODE_REL); \
+ \
+ for (;;) { \
+ prepare_to_wait(&wq, &__wait, state); \
+ if (condition) \
+ break; \
+ if (state == TASK_INTERRUPTIBLE && \
+ signal_pending(current)) { \
+ __ret = -ERESTARTSYS; \
+ break; \
+ } \
+ if (!__t.task) { \
+ __ret = -ETIME; \
+ break; \
+ } \
+ schedule(); \
+ } \
+ \
+ hrtimer_cancel(&__t.timer); \
+ destroy_hrtimer_on_stack(&__t.timer); \
+ finish_wait(&wq, &__wait); \
+ __ret; \
+})
+
+/**
+ * wait_event_hrtimeout - sleep until a condition gets true or a timeout elapses
+ * @wq: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ * @timeout: timeout, as a ktime_t
+ *
+ * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
+ * @condition evaluates to true or the timeout elapses.
+ * The @condition is checked each time the waitqueue @wq is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * The function returns 0 if @condition became true, or -ETIME if the timeout
+ * elapsed.
+ */
+#define wait_event_hrtimeout(wq, condition, timeout) \
+({ \
+ int __ret = 0; \
+ if (!(condition)) \
+ __ret = __wait_event_hrtimeout(wq, condition, timeout, \
+ TASK_UNINTERRUPTIBLE); \
+ __ret; \
+})
+
+/**
+ * wait_event_interruptible_hrtimeout - sleep until a condition gets true or a timeout elapses
+ * @wq: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ * @timeout: timeout, as a ktime_t
+ *
+ * The process is put to sleep (TASK_INTERRUPTIBLE) until the
+ * @condition evaluates to true, a signal is received, or the timeout elapses.
+ * The @condition is checked each time the waitqueue @wq is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * The function returns 0 if @condition became true, -ERESTARTSYS if it was
+ * interrupted by a signal, or -ETIME if the timeout elapsed.
+ */
+#define wait_event_interruptible_hrtimeout(wq, condition, timeout) \
+({ \
+ long __ret = 0; \
+ if (!(condition)) \
+ __ret = __wait_event_hrtimeout(wq, condition, timeout, \
+ TASK_INTERRUPTIBLE); \
+ __ret; \
+})
+
#define __wait_event_interruptible_exclusive(wq, condition, ret) \
do { \
DEFINE_WAIT(__wait); \
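A small usage sketch for the new hrtimer-based waits, assuming an ordinary waitqueue and a flag that the waker sets before calling wake_up(); wait_for_flag() is illustrative only:

static int wait_for_flag(wait_queue_head_t *wq, bool *flag)
{
	long ret;

	/* sleep at most 10ms, with the task's normal timer slack applied */
	ret = wait_event_interruptible_hrtimeout(*wq, *flag,
					ktime_set(0, 10 * NSEC_PER_MSEC));
	if (ret == -ETIME)
		return -ETIMEDOUT;	/* timed out, condition still false */
	return ret;			/* 0 on success, -ERESTARTSYS on a signal */
}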
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 9a9367c0c076..579a5007c696 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -5,6 +5,7 @@
#define WRITEBACK_H
#include <linux/sched.h>
+#include <linux/workqueue.h>
#include <linux/fs.h>
DECLARE_PER_CPU(int, dirty_throttle_leaks);
diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
index 2c267bcbb85c..8c99ce7202c5 100644
--- a/include/uapi/linux/eventpoll.h
+++ b/include/uapi/linux/eventpoll.h
@@ -25,6 +25,7 @@
#define EPOLL_CTL_ADD 1
#define EPOLL_CTL_DEL 2
#define EPOLL_CTL_MOD 3
+#define EPOLL_CTL_DISABLE 4
/*
* Request the handling of system wakeup events so as to prevent system suspends
diff --git a/init/Kconfig b/init/Kconfig
index ae3c7a17ae26..4f8b5792ccca 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -891,6 +891,23 @@ config MEMCG_KMEM
the kmem extension can use it to guarantee that no group of processes
will ever exhaust kernel resources alone.
+config MEMCG_DEBUG_ASYNC_DESTROY
+ bool "Memory Resource Controller Debug asynchronous object destruction"
+ depends on MEMCG_KMEM || MEMCG_SWAP
+ default n
+ help
+ When a memcg is destroyed, the memory consumed by it may not be
+ immediately freed. This is because when some extensions are used, such
+ as swap or kernel memory, objects can outlive the group and hold a
+ reference to it.
+
+ If this is the case, the dangling_memcgs file will show information
+ about which memcgs are still alive and which references are still
+ preventing them from being freed. There is nothing wrong with that, but
+ it is very useful when debugging, to know where this memory is being held.
+ This is a developer-oriented debugging facility only, and no
+ guarantees of interface stability will be given.
+
config CGROUP_HUGETLB
bool "HugeTLB Resource Controller for Control Groups"
depends on RESOURCE_COUNTERS && HUGETLB_PAGE
diff --git a/ipc/sem.c b/ipc/sem.c
index 58d31f1c1eb5..ebd8fececcfc 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -61,8 +61,8 @@
* - A woken up task may not even touch the semaphore array anymore, it may
* have been destroyed already by a semctl(RMID).
* - The synchronizations between wake-ups due to a timeout/signal and a
- * wake-up due to a completed semaphore operation is achieved by using an
- * intermediate state (IN_WAKEUP).
+ * wake-up due to a completed semaphore operation is achieved by using a
+ * special wakeup scheme (queuewakeup_wait and support functions)
* - UNDO values are stored in an array (one per process and per
* semaphore array, lazily allocated). For backwards compatibility, multiple
* modes for the UNDO variables are supported (per process, per thread)
@@ -90,6 +90,135 @@
#include <asm/uaccess.h>
#include "util.h"
+
+#ifdef CONFIG_PREEMPT_RT_BASE
+ #define SYSVSEM_COMPLETION 1
+#else
+ #define SYSVSEM_CUSTOM 1
+#endif
+
+#ifdef SYSVSEM_COMPLETION
+ /* Using a completion causes some overhead, but avoids a busy loop
+ * that increases the worst case latency.
+ */
+ struct queue_done {
+ struct completion done;
+ };
+
+ static void queuewakeup_prepare(void)
+ {
+ /* no preparation necessary */
+ }
+
+ static void queuewakeup_completed(void)
+ {
+ /* empty */
+ }
+
+ static void queuewakeup_block(struct queue_done *qd)
+ {
+ /* empty */
+ }
+
+ static void queuewakeup_handsoff(struct queue_done *qd)
+ {
+ complete_all(&qd->done);
+ }
+
+ static void queuewakeup_init(struct queue_done *qd)
+ {
+ init_completion(&qd->done);
+ }
+
+ static void queuewakeup_wait(struct queue_done *qd)
+ {
+ wait_for_completion(&qd->done);
+ }
+
+#elif defined(SYSVSEM_SPINLOCK)
+ /* Note: Spinlocks do not work because:
+ * - lockdep complains [could be fixed]
+ * - only 255 concurrent spin_lock() calls are permitted, then the
+ * preempt-counter overflows
+ */
+#error SYSVSEM_SPINLOCK is a proof of concept and does not work.
+ struct queue_done {
+ spinlock_t done;
+ };
+
+ static void queuewakeup_prepare(void)
+ {
+ /* empty */
+ }
+
+ static void queuewakeup_completed(void)
+ {
+ /* empty */
+ }
+
+ static void queuewakeup_block(struct queue_done *qd)
+ {
+ BUG_ON(spin_is_locked(&qd->done));
+ spin_lock(&qd->done);
+ }
+
+ static void queuewakeup_handsoff(struct queue_done *qd)
+ {
+ spin_unlock(&qd->done);
+ }
+
+ static void queuewakeup_init(struct queue_done *qd)
+ {
+ spin_lock_init(&qd->done);
+ }
+
+ static void queuewakeup_wait(struct queue_done *qd)
+ {
+ spin_unlock_wait(&qd->done);
+ }
+#else
+ struct queue_done {
+ atomic_t done;
+ };
+
+ static void queuewakeup_prepare(void)
+ {
+ preempt_disable();
+ }
+
+ static void queuewakeup_completed(void)
+ {
+ preempt_enable();
+ }
+
+ static void queuewakeup_block(struct queue_done *qd)
+ {
+ BUG_ON(atomic_read(&qd->done) != 1);
+ atomic_set(&qd->done, 2);
+ }
+
+ static void queuewakeup_handsoff(struct queue_done *qd)
+ {
+ BUG_ON(atomic_read(&qd->done) != 2);
+ smp_mb();
+ atomic_set(&qd->done, 1);
+ }
+
+ static void queuewakeup_init(struct queue_done *qd)
+ {
+ atomic_set(&qd->done, 1);
+ }
+
+ static void queuewakeup_wait(struct queue_done *qd)
+ {
+ while (atomic_read(&qd->done) != 1)
+ cpu_relax();
+
+ smp_mb();
+ }
+#endif
+
+
/* One semaphore structure for each semaphore in the system. */
struct sem {
int semval; /* current value */
@@ -108,6 +237,7 @@ struct sem_queue {
struct sembuf *sops; /* array of pending operations */
int nsops; /* number of operations */
int alter; /* does *sops alter the array? */
+ struct queue_done done; /* completion synchronization */
};
/* Each task has a list of undo requests. They are executed automatically
@@ -245,23 +375,27 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s)
* - queue.status is initialized to -EINTR before blocking.
* - wakeup is performed by
* * unlinking the queue entry from sma->sem_pending
- * * setting queue.status to IN_WAKEUP
- * This is the notification for the blocked thread that a
- * result value is imminent.
+ * * setting queue.status to the actual result code
+ * This is the notification for the blocked thread that someone
+ * (usually: update_queue()) completed the semtimedop() operation.
* * call wake_up_process
- * * set queue.status to the final value.
+ * * queuewakeup_handsoff(&q->done);
* - the previously blocked thread checks queue.status:
- * * if it's IN_WAKEUP, then it must wait until the value changes
- * * if it's not -EINTR, then the operation was completed by
- * update_queue. semtimedop can return queue.status without
- * performing any operation on the sem array.
- * * otherwise it must acquire the spinlock and check what's up.
+ * * if it's not -EINTR, then someone completed the operation.
+ * First, queuewakeup_wait() must be called. Afterwards,
+ * semtimedop must return queue.status without performing any
+ * operation on the sem array.
+ * - otherwise it must acquire the spinlock and repeat the test
+ * - If it is still -EINTR, then no update_queue() completed the
+ * operation, thus semtimedop() can proceed normally.
*
- * The two-stage algorithm is necessary to protect against the following
+ * queuewakeup_wait() is necessary to protect against the following
* races:
* - if queue.status is set after wake_up_process, then the woken up idle
* thread could race forward and try (and fail) to acquire sma->lock
- * before update_queue had a chance to set queue.status
+ * before update_queue had a chance to set queue.status.
+ * More importantly, it would mean that wake_up_process must be done
+ * while holding sma->lock, i.e. this would reduce the scalability.
* - if queue.status is written before wake_up_process and if the
* blocked process is woken up by a signal between writing
* queue.status and the wake_up_process, then the woken up
@@ -271,7 +405,6 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s)
* (yes, this happened on s390 with sysv msg).
*
*/
-#define IN_WAKEUP 1
/**
* newary - Create a new semaphore set
@@ -461,15 +594,11 @@ undo:
static void wake_up_sem_queue_prepare(struct list_head *pt,
struct sem_queue *q, int error)
{
- if (list_empty(pt)) {
- /*
- * Hold preempt off so that we don't get preempted and have the
- * wakee busy-wait until we're scheduled back on.
- */
- preempt_disable();
- }
- q->status = IN_WAKEUP;
- q->pid = error;
+ if (list_empty(pt))
+ queuewakeup_prepare();
+
+ queuewakeup_block(&q->done);
+ q->status = error;
list_add_tail(&q->simple_list, pt);
}
@@ -480,8 +609,8 @@ static void wake_up_sem_queue_prepare(struct list_head *pt,
*
* Do the actual wake-up.
* The function is called without any locks held, thus the semaphore array
- * could be destroyed already and the tasks can disappear as soon as the
- * status is set to the actual return code.
+ * could be destroyed already and the tasks can disappear as soon as
+ * queuewakeup_handsoff() is called.
*/
static void wake_up_sem_queue_do(struct list_head *pt)
{
@@ -491,12 +620,11 @@ static void wake_up_sem_queue_do(struct list_head *pt)
did_something = !list_empty(pt);
list_for_each_entry_safe(q, t, pt, simple_list) {
wake_up_process(q->sleeper);
- /* q can disappear immediately after writing q->status. */
- smp_wmb();
- q->status = q->pid;
+ /* q can disappear immediately after completing q->done */
+ queuewakeup_handsoff(&q->done);
}
if (did_something)
- preempt_enable();
+ queuewakeup_completed();
}
static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
@@ -1302,33 +1430,6 @@ out:
return un;
}
-
-/**
- * get_queue_result - Retrieve the result code from sem_queue
- * @q: Pointer to queue structure
- *
- * Retrieve the return code from the pending queue. If IN_WAKEUP is found in
- * q->status, then we must loop until the value is replaced with the final
- * value: This may happen if a task is woken up by an unrelated event (e.g.
- * signal) and in parallel the task is woken up by another task because it got
- * the requested semaphores.
- *
- * The function can be called with or without holding the semaphore spinlock.
- */
-static int get_queue_result(struct sem_queue *q)
-{
- int error;
-
- error = q->status;
- while (unlikely(error == IN_WAKEUP)) {
- cpu_relax();
- error = q->status;
- }
-
- return error;
-}
-
-
SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
unsigned, nsops, const struct timespec __user *, timeout)
{
@@ -1474,6 +1575,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
queue.status = -EINTR;
queue.sleeper = current;
+ queuewakeup_init(&queue.done);
sleep_again:
current->state = TASK_INTERRUPTIBLE;
@@ -1484,17 +1586,14 @@ sleep_again:
else
schedule();
- error = get_queue_result(&queue);
+ error = queue.status;
if (error != -EINTR) {
/* fast path: update_queue already obtained all requested
- * resources.
- * Perform a smp_mb(): User space could assume that semop()
- * is a memory barrier: Without the mb(), the cpu could
- * speculatively read in user space stale data that was
- * overwritten by the previous owner of the semaphore.
+ * resources. Just ensure that update_queue completed
+ * its access to &queue.
*/
- smp_mb();
+ queuewakeup_wait(&queue.done);
goto out_free;
}
@@ -1504,23 +1603,16 @@ sleep_again:
/*
* Wait until it's guaranteed that no wakeup_sem_queue_do() is ongoing.
*/
- error = get_queue_result(&queue);
-
- /*
- * Array removed? If yes, leave without sem_unlock().
- */
- if (IS_ERR(sma)) {
- goto out_free;
- }
-
-
- /*
- * If queue.status != -EINTR we are woken up by another process.
- * Leave without unlink_queue(), but with sem_unlock().
- */
-
+ error = queue.status;
if (error != -EINTR) {
- goto out_unlock_free;
+ /* If there is a return code, then we can leave immediately. */
+ if (!IS_ERR(sma)) {
+ /* sem_lock() succeeded - then unlock */
+ sem_unlock(sma);
+ }
+ /* Except that we must wait for the hands-off */
+ queuewakeup_wait(&queue.done);
+ goto out_free;
}
/*
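For reference, the hand-off that the rewritten comment describes reduces to the following ordering between waker and sleeper; this is a condensed restatement of the hunks above, not new code:

/*
 * Waker (wake_up_sem_queue_prepare(), with sma->lock held):
 *	queuewakeup_block(&q->done);
 *	q->status = error;
 * Waker (wake_up_sem_queue_do(), without any locks):
 *	wake_up_process(q->sleeper);
 *	queuewakeup_handsoff(&q->done);		// q may disappear after this
 *
 * Sleeper (semtimedop(), after schedule() returns):
 *	error = queue.status;
 *	if (error != -EINTR) {
 *		queuewakeup_wait(&queue.done);	// wait for the hands-off
 *		return error;			// never touch the array again
 *	}
 */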
diff --git a/kernel/acct.c b/kernel/acct.c
index b9bd7f098ee5..a370ccb43840 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -201,7 +201,8 @@ static int acct_on(struct filename *pathname)
struct bsd_acct_struct *acct = NULL;
/* Difference from BSD - they don't do O_APPEND */
- file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
+ file = file_open_name(pathname,
+ O_WRONLY|O_APPEND|O_LARGEFILE|O_NONBLOCK, 0);
if (IS_ERR(file))
return PTR_ERR(file);
diff --git a/kernel/fork.c b/kernel/fork.c
index 8d932b1c9056..b404b6f95a17 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -70,6 +70,7 @@
#include <linux/khugepaged.h>
#include <linux/signalfd.h>
#include <linux/uprobes.h>
+#include <linux/aio.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -364,8 +365,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
mm->locked_vm = 0;
mm->mmap = NULL;
mm->mmap_cache = NULL;
- mm->free_area_cache = oldmm->mmap_base;
- mm->cached_hole_size = ~0UL;
mm->map_count = 0;
cpumask_clear(mm_cpumask(mm));
mm->mm_rb = RB_ROOT;
@@ -539,8 +538,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
mm->nr_ptes = 0;
memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
spin_lock_init(&mm->page_table_lock);
- mm->free_area_cache = TASK_UNMAPPED_BASE;
- mm->cached_hole_size = ~0UL;
mm_init_aio(mm);
mm_init_owner(mm, p);
@@ -722,9 +719,11 @@ static int wait_for_vfork_done(struct task_struct *child,
{
int killed;
- freezer_do_not_count();
+ if (current->mm)
+ freezer_do_not_count();
killed = wait_for_completion_killable(vfork);
- freezer_count();
+ if (current->mm)
+ freezer_count();
if (killed) {
task_lock(child);
diff --git a/kernel/printk.c b/kernel/printk.c
index 546e78184096..7c69b3eff864 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -32,6 +32,7 @@
#include <linux/security.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
+#include <linux/aio.h>
#include <linux/syscalls.h>
#include <linux/kexec.h>
#include <linux/kdb.h>
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index acbd28424d81..c53874bbd35a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -24,6 +24,7 @@
#include <linux/regset.h>
#include <linux/hw_breakpoint.h>
#include <linux/cn_proc.h>
+#include <linux/uio.h>
static int ptrace_trapping_sleep_fn(void *flags)
diff --git a/kernel/smp.c b/kernel/smp.c
index 8e451f3ff51b..31670c8d8f89 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -12,6 +12,7 @@
#include <linux/gfp.h>
#include <linux/smp.h>
#include <linux/cpu.h>
+#include <linux/hardirq.h>
#include "smpboot.h"
@@ -100,16 +101,16 @@ void __init call_function_init(void)
* previous function call. For multi-cpu calls its even more interesting
* as we'll have to ensure no other cpu is observing our csd.
*/
-static void csd_lock_wait(struct call_single_data *data)
+static void csd_lock_wait(struct call_single_data *csd)
{
- while (data->flags & CSD_FLAG_LOCK)
+ while (csd->flags & CSD_FLAG_LOCK)
cpu_relax();
}
-static void csd_lock(struct call_single_data *data)
+static void csd_lock(struct call_single_data *csd)
{
- csd_lock_wait(data);
- data->flags = CSD_FLAG_LOCK;
+ csd_lock_wait(csd);
+ csd->flags = CSD_FLAG_LOCK;
/*
* prevent CPU from reordering the above assignment
@@ -119,16 +120,16 @@ static void csd_lock(struct call_single_data *data)
smp_mb();
}
-static void csd_unlock(struct call_single_data *data)
+static void csd_unlock(struct call_single_data *csd)
{
- WARN_ON(!(data->flags & CSD_FLAG_LOCK));
+ WARN_ON(!(csd->flags & CSD_FLAG_LOCK));
/*
* ensure we're all done before releasing data:
*/
smp_mb();
- data->flags &= ~CSD_FLAG_LOCK;
+ csd->flags &= ~CSD_FLAG_LOCK;
}
/*
@@ -137,7 +138,7 @@ static void csd_unlock(struct call_single_data *data)
* ->func, ->info, and ->flags set.
*/
static
-void generic_exec_single(int cpu, struct call_single_data *data, int wait)
+void generic_exec_single(int cpu, struct call_single_data *csd, int wait)
{
struct call_single_queue *dst = &per_cpu(call_single_queue, cpu);
unsigned long flags;
@@ -145,7 +146,7 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
raw_spin_lock_irqsave(&dst->lock, flags);
ipi = list_empty(&dst->list);
- list_add_tail(&data->list, &dst->list);
+ list_add_tail(&csd->list, &dst->list);
raw_spin_unlock_irqrestore(&dst->lock, flags);
/*
@@ -163,7 +164,7 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
arch_send_call_function_single_ipi(cpu);
if (wait)
- csd_lock_wait(data);
+ csd_lock_wait(csd);
}
/*
@@ -173,7 +174,6 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
void generic_smp_call_function_single_interrupt(void)
{
struct call_single_queue *q = &__get_cpu_var(call_single_queue);
- unsigned int data_flags;
LIST_HEAD(list);
/*
@@ -186,25 +186,26 @@ void generic_smp_call_function_single_interrupt(void)
raw_spin_unlock(&q->lock);
while (!list_empty(&list)) {
- struct call_single_data *data;
+ struct call_single_data *csd;
+ unsigned int csd_flags;
- data = list_entry(list.next, struct call_single_data, list);
- list_del(&data->list);
+ csd = list_entry(list.next, struct call_single_data, list);
+ list_del(&csd->list);
/*
- * 'data' can be invalid after this call if flags == 0
+ * 'csd' can be invalid after this call if flags == 0
* (when called through generic_exec_single()),
* so save them away before making the call:
*/
- data_flags = data->flags;
+ csd_flags = csd->flags;
- data->func(data->info);
+ csd->func(csd->info);
/*
* Unlocked CSDs are valid through generic_exec_single():
*/
- if (data_flags & CSD_FLAG_LOCK)
- csd_unlock(data);
+ if (csd_flags & CSD_FLAG_LOCK)
+ csd_unlock(csd);
}
}
@@ -240,8 +241,9 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
* send smp call function interrupt to this cpu and as such deadlocks
* can't happen.
*/
- WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
- && !oops_in_progress);
+ WARN_ON_ONCE(cpu_online(this_cpu)
+ && (irqs_disabled() || in_serving_irq())
+ && !oops_in_progress);
if (cpu == this_cpu) {
local_irq_save(flags);
@@ -249,16 +251,16 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
local_irq_restore(flags);
} else {
if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
- struct call_single_data *data = &d;
+ struct call_single_data *csd = &d;
if (!wait)
- data = &__get_cpu_var(csd_data);
+ csd = &__get_cpu_var(csd_data);
- csd_lock(data);
+ csd_lock(csd);
- data->func = func;
- data->info = info;
- generic_exec_single(cpu, data, wait);
+ csd->func = func;
+ csd->info = info;
+ generic_exec_single(cpu, csd, wait);
} else {
err = -ENXIO; /* CPU not online */
}
@@ -325,7 +327,7 @@ EXPORT_SYMBOL_GPL(smp_call_function_any);
* pre-allocated data structure. Useful for embedding @data inside
* other structures, for instance.
*/
-void __smp_call_function_single(int cpu, struct call_single_data *data,
+void __smp_call_function_single(int cpu, struct call_single_data *csd,
int wait)
{
unsigned int this_cpu;
@@ -343,11 +345,11 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
if (cpu == this_cpu) {
local_irq_save(flags);
- data->func(data->info);
+ csd->func(csd->info);
local_irq_restore(flags);
} else {
- csd_lock(data);
- generic_exec_single(cpu, data, wait);
+ csd_lock(csd);
+ generic_exec_single(cpu, csd, wait);
}
put_cpu();
}
@@ -369,7 +371,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
void smp_call_function_many(const struct cpumask *mask,
smp_call_func_t func, void *info, bool wait)
{
- struct call_function_data *data;
+ struct call_function_data *cfd;
int cpu, next_cpu, this_cpu = smp_processor_id();
/*
@@ -378,8 +380,9 @@ void smp_call_function_many(const struct cpumask *mask,
* send smp call function interrupt to this cpu and as such deadlocks
* can't happen.
*/
- WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
- && !oops_in_progress && !early_boot_irqs_disabled);
+ WARN_ON_ONCE(cpu_online(this_cpu)
+ && (irqs_disabled() || in_serving_irq())
+ && !oops_in_progress && !early_boot_irqs_disabled);
/* Try to fastpath. So, what's a CPU they want? Ignoring this one. */
cpu = cpumask_first_and(mask, cpu_online_mask);
@@ -401,24 +404,24 @@ void smp_call_function_many(const struct cpumask *mask,
return;
}
- data = &__get_cpu_var(cfd_data);
+ cfd = &__get_cpu_var(cfd_data);
- cpumask_and(data->cpumask, mask, cpu_online_mask);
- cpumask_clear_cpu(this_cpu, data->cpumask);
+ cpumask_and(cfd->cpumask, mask, cpu_online_mask);
+ cpumask_clear_cpu(this_cpu, cfd->cpumask);
/* Some callers race with other cpus changing the passed mask */
- if (unlikely(!cpumask_weight(data->cpumask)))
+ if (unlikely(!cpumask_weight(cfd->cpumask)))
return;
/*
- * After we put an entry into the list, data->cpumask
- * may be cleared again when another CPU sends another IPI for
- * a SMP function call, so data->cpumask will be zero.
+ * After we put an entry into the list, cfd->cpumask may be cleared
+ * again when another CPU sends another IPI for a SMP function call, so
+ * cfd->cpumask will be zero.
*/
- cpumask_copy(data->cpumask_ipi, data->cpumask);
+ cpumask_copy(cfd->cpumask_ipi, cfd->cpumask);
- for_each_cpu(cpu, data->cpumask) {
- struct call_single_data *csd = per_cpu_ptr(data->csd, cpu);
+ for_each_cpu(cpu, cfd->cpumask) {
+ struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);
struct call_single_queue *dst =
&per_cpu(call_single_queue, cpu);
unsigned long flags;
@@ -433,12 +436,13 @@ void smp_call_function_many(const struct cpumask *mask,
}
/* Send a message to all CPUs in the map */
- arch_send_call_function_ipi_mask(data->cpumask_ipi);
+ arch_send_call_function_ipi_mask(cfd->cpumask_ipi);
if (wait) {
- for_each_cpu(cpu, data->cpumask) {
- struct call_single_data *csd =
- per_cpu_ptr(data->csd, cpu);
+ for_each_cpu(cpu, cfd->cpumask) {
+ struct call_single_data *csd;
+
+ csd = per_cpu_ptr(cfd->csd, cpu);
csd_lock_wait(csd);
}
}
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index af5a7e9f164b..9904e4880d04 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -133,7 +133,6 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
int i;
- SEQ_printf(m, "\n");
SEQ_printf(m, "cpu: %d\n", cpu);
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
SEQ_printf(m, " clock %d:\n", i);
@@ -187,6 +186,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
#undef P
#undef P_ns
+ SEQ_printf(m, "\n");
}
#ifdef CONFIG_GENERIC_CLOCKEVENTS
@@ -195,7 +195,6 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
{
struct clock_event_device *dev = td->evtdev;
- SEQ_printf(m, "\n");
SEQ_printf(m, "Tick Device: mode: %d\n", td->mode);
if (cpu < 0)
SEQ_printf(m, "Broadcast device\n");
@@ -230,12 +229,11 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
print_name_offset(m, dev->event_handler);
SEQ_printf(m, "\n");
SEQ_printf(m, " retries: %lu\n", dev->retries);
+ SEQ_printf(m, "\n");
}
-static void timer_list_show_tickdevices(struct seq_file *m)
+static void timer_list_show_tickdevices_header(struct seq_file *m)
{
- int cpu;
-
#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
print_tickdevice(m, tick_get_broadcast_device(), -1);
SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
@@ -246,12 +244,8 @@ static void timer_list_show_tickdevices(struct seq_file *m)
#endif
SEQ_printf(m, "\n");
#endif
- for_each_online_cpu(cpu)
- print_tickdevice(m, tick_get_device(cpu), cpu);
SEQ_printf(m, "\n");
}
-#else
-static void timer_list_show_tickdevices(struct seq_file *m) { }
#endif
static int timer_list_show(struct seq_file *m, void *v)
@@ -259,34 +253,114 @@ static int timer_list_show(struct seq_file *m, void *v)
u64 now = ktime_to_ns(ktime_get());
int cpu;
- SEQ_printf(m, "Timer List Version: v0.7\n");
- SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
- SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
-
- for_each_online_cpu(cpu)
+ if (v == (void *)1) {
+ SEQ_printf(m, "Timer List Version: v0.7\n");
+ SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n",
+ HRTIMER_MAX_CLOCK_BASES);
+ SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
+ SEQ_printf(m, "\n");
+ } else if (v < (void *)(unsigned long)(nr_cpu_ids + 2)) {
+ cpu = (unsigned long)(v - 2);
print_cpu(m, cpu, now);
+ }
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+ else if (v == (void *)(unsigned long)nr_cpu_ids + 2) {
+ timer_list_show_tickdevices_header(m);
+ } else {
+ cpu = (unsigned long)(v - 3 - nr_cpu_ids);
+ print_tickdevice(m, tick_get_device(cpu), cpu);
+ }
+#endif
+ return 0;
+}
- SEQ_printf(m, "\n");
- timer_list_show_tickdevices(m);
+/*
+ * This iterator really needs some explanation, since it is offset and has
+ * two passes, one of which is controlled by a config option.
+ * In a hotplugged system some cpus, including cpu 0, may be missing, so we
+ * have to use cpumask_* to iterate over the cpus.
+ * For the first pass:
+ * It returns 1 for the header position.
+ * For cpu 0 it returns 2 and the final possible cpu would be nr_cpu_ids + 1.
+ * On the second pass:
+ * It returns nr_cpu_ids + 2 for the second header position.
+ * For cpu 0 it returns nr_cpu_ids + 3.
+ * The final possible cpu would be nr_cpu_ids + nr_cpu_ids + 2.
+ */
+static void *timer_list_start(struct seq_file *file, loff_t *offset)
+{
+ unsigned long n = *offset;
+
+ if (n == 0)
+ return (void *) 1;
+
+ if (n < nr_cpu_ids + 1) {
+ n--;
+ if (n > 0)
+ n = cpumask_next(n - 1, cpu_online_mask);
+ else
+ n = cpumask_first(cpu_online_mask);
+ *offset = n + 1;
+ return (void *)(unsigned long)(n + 2);
+ }
- return 0;
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+ if (n == nr_cpu_ids + 1)
+ return (void *)(unsigned long)(nr_cpu_ids + 2);
+
+ if (n < nr_cpu_ids * 2 + 2) {
+ n -= (nr_cpu_ids + 2);
+ if (n > 0)
+ n = cpumask_next(n - 1, cpu_online_mask);
+ else
+ n = cpumask_first(cpu_online_mask);
+ *offset = n + 2 + nr_cpu_ids;
+ return (void *)(unsigned long)(n + 3 + nr_cpu_ids);
+ }
+#endif
+
+ return NULL;
}
+static void *timer_list_next(struct seq_file *file, void *data, loff_t *offset)
+{
+ (*offset)++;
+ return timer_list_start(file, offset);
+}
+
+static void timer_list_stop(struct seq_file *file, void *data)
+{
+}
+
+static const struct seq_operations timer_list_sops = {
+ .start = timer_list_start,
+ .next = timer_list_next,
+ .stop = timer_list_stop,
+ .show = timer_list_show,
+};
+
void sysrq_timer_list_show(void)
{
timer_list_show(NULL, NULL);
}
+static int timer_list_release(struct inode *inode, struct file *filep)
+{
+ seq_release(inode, filep);
+
+ return 0;
+}
+
static int timer_list_open(struct inode *inode, struct file *filp)
{
- return single_open(filp, timer_list_show, NULL);
+ return seq_open(filp, &timer_list_sops);
}
static const struct file_operations timer_list_fops = {
.open = timer_list_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = timer_list_release,
};
static int __init init_timer_list_procfs(void)
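A worked example of the iterator offsets, assuming nr_cpu_ids == 2 with both CPUs online; the values follow directly from timer_list_start() and timer_list_show() above:

/*
 * seq position -> token passed to timer_list_show() -> output
 *	0 -> 1                   -> version / HRTIMER_MAX_CLOCK_BASES / now header
 *	1 -> 2                   -> print_cpu(0)
 *	2 -> 3                   -> print_cpu(1)
 *	3 -> 4  (nr_cpu_ids + 2) -> tick device header (broadcast device)
 *	4 -> 5  (nr_cpu_ids + 3) -> print_tickdevice(cpu 0)
 *	5 -> 6                   -> print_tickdevice(cpu 1)
 *	6 -> NULL                -> iteration stops
 */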
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 4a944676358e..7672bef6c88b 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -239,10 +239,12 @@ static void watchdog_overflow_callback(struct perf_event *event,
if (__this_cpu_read(hard_watchdog_warn) == true)
return;
- if (hardlockup_panic)
+ if (hardlockup_panic) {
+ trigger_all_cpu_backtrace();
panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
- else
+ } else {
WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
+ }
__this_cpu_write(hard_watchdog_warn, true);
return;
@@ -323,8 +325,10 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
else
dump_stack();
- if (softlockup_panic)
+ if (softlockup_panic) {
+ trigger_all_cpu_backtrace();
panic("softlockup: hung tasks");
+ }
__this_cpu_write(soft_watchdog_warn, true);
} else
__this_cpu_write(soft_watchdog_warn, false);
diff --git a/lib/Makefile b/lib/Makefile
index d7946ff75b2e..32f445541423 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -13,7 +13,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \
proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \
is_single_threaded.o plist.o decompress.o kobject_uevent.o \
- earlycpio.o
+ earlycpio.o percpu-refcount.o
lib-$(CONFIG_MMU) += ioremap.o
lib-$(CONFIG_SMP) += cpumask.o
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
new file mode 100644
index 000000000000..79c61580a211
--- /dev/null
+++ b/lib/percpu-refcount.c
@@ -0,0 +1,243 @@
+#define pr_fmt(fmt) "%s: " fmt "\n", __func__
+
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/percpu-refcount.h>
+#include <linux/rcupdate.h>
+
+/*
+ * A percpu refcount can be in 4 different modes. The state is tracked in the
+ * low two bits of percpu_ref->pcpu_count:
+ *
+ * PCPU_REF_NONE - the initial state, no percpu counters allocated.
+ *
+ * PCPU_REF_PTR - using percpu counters for the refcount.
+ *
+ * PCPU_REF_DYING - we're shutting down so get()/put() should use the embedded
+ * atomic counter, but we're not finished updating the atomic counter from the
+ * percpu counters - this means that percpu_ref_put() can't check for the ref
+ * hitting 0 yet.
+ *
+ * PCPU_REF_DEAD - we've finished the teardown sequence, percpu_ref_put() should
+ * now check for the ref hitting 0.
+ *
+ * In PCPU_REF_NONE mode, we need to count the number of times percpu_ref_get()
+ * is called; this is done with the high bits of the raw atomic counter. We also
+ * track the time, in jiffies, when the get count last wrapped - this is done
+ * with the remaining bits of percpu_ref->pcpu_count.
+ *
+ * So, when percpu_ref_get() is called it increments the get count and checks if
+ * it wrapped; if it did, it checks if the last time it wrapped was less than
+ * one second ago; if so, we want to allocate percpu counters.
+ *
+ * PCPU_COUNT_BITS determines the threshold where we convert to percpu: of the
+ * raw 64 bit counter, we use PCPU_COUNT_BITS for the refcount, and the
+ * remaining (high) bits to count the number of times percpu_ref_get() has been
+ * called. It's currently (completely arbitrarily) 16384 times in one second.
+ *
+ * Percpu mode (PCPU_REF_PTR):
+ *
+ * In percpu mode all we do on get and put is increment or decrement the cpu
+ * local counter, which is a 32 bit unsigned int.
+ *
+ * Note that all the gets() could be happening on one cpu, and all the puts() on
+ * another - the individual cpu counters can wrap (potentially many times).
+ *
+ * But this is fine because we don't need to check for the ref hitting 0 in
+ * percpu mode; before we set the state to PCPU_REF_DEAD we simply sum up all
+ * the percpu counters and add them to the atomic counter. Since addition and
+ * subtraction in modular arithmetic is still associative, the result will be
+ * correct.
+ */
+
+#define PCPU_COUNT_BITS 50
+#define PCPU_COUNT_MASK ((1LL << PCPU_COUNT_BITS) - 1)
+
+#define PCPU_STATUS_BITS 2
+#define PCPU_STATUS_MASK ((1 << PCPU_STATUS_BITS) - 1)
+
+#define PCPU_REF_PTR 0
+#define PCPU_REF_NONE 1
+#define PCPU_REF_DYING 2
+#define PCPU_REF_DEAD 3
+
+#define REF_STATUS(count) (count & PCPU_STATUS_MASK)
+
+/**
+ * percpu_ref_init - initialize a dynamic percpu refcount
+ *
+ * Initializes the refcount in single atomic counter mode with a refcount of 1;
+ * analogous to atomic_set(ref, 1).
+ */
+void percpu_ref_init(struct percpu_ref *ref)
+{
+ unsigned long now = jiffies;
+
+ atomic64_set(&ref->count, 1);
+
+ now <<= PCPU_STATUS_BITS;
+ now |= PCPU_REF_NONE;
+
+ ref->pcpu_count = now;
+}
+
+static void percpu_ref_alloc(struct percpu_ref *ref, unsigned long pcpu_count)
+{
+ unsigned long new, now = jiffies;
+
+ now <<= PCPU_STATUS_BITS;
+ now |= PCPU_REF_NONE;
+
+ if (now - pcpu_count <= HZ << PCPU_STATUS_BITS) {
+ rcu_read_unlock();
+ new = (unsigned long) alloc_percpu(unsigned);
+ rcu_read_lock();
+
+ if (!new)
+ goto update_time;
+
+ BUG_ON(new & PCPU_STATUS_MASK);
+
+ if (cmpxchg(&ref->pcpu_count, pcpu_count, new) != pcpu_count)
+ free_percpu((void __percpu *) new);
+ else
+ pr_debug("created");
+ } else {
+update_time:
+ new = now;
+ cmpxchg(&ref->pcpu_count, pcpu_count, new);
+ }
+}
+
+void __percpu_ref_get(struct percpu_ref *ref, bool alloc)
+{
+ unsigned long pcpu_count;
+ uint64_t v;
+
+ pcpu_count = ACCESS_ONCE(ref->pcpu_count);
+
+ if (REF_STATUS(pcpu_count) == PCPU_REF_PTR) {
+ /* for rcu - we're not using rcu_dereference() */
+ smp_read_barrier_depends();
+ __this_cpu_inc(*((unsigned __percpu *) pcpu_count));
+ } else {
+ v = atomic64_add_return(1 + (1ULL << PCPU_COUNT_BITS),
+ &ref->count);
+
+ if (!(v >> PCPU_COUNT_BITS) &&
+ REF_STATUS(pcpu_count) == PCPU_REF_NONE && alloc)
+ percpu_ref_alloc(ref, pcpu_count);
+ }
+}
+
+/**
+ * percpu_ref_put - decrement a dynamic percpu refcount
+ *
+ * Returns true if the result is 0, otherwise false; only checks for the ref
+ * hitting 0 after percpu_ref_kill() has been called. Analagous to
+ * atomic_dec_and_test().
+ */
+int percpu_ref_put(struct percpu_ref *ref)
+{
+ unsigned long pcpu_count;
+ uint64_t v;
+ int ret = 0;
+
+ rcu_read_lock();
+
+ pcpu_count = ACCESS_ONCE(ref->pcpu_count);
+
+ switch (REF_STATUS(pcpu_count)) {
+ case PCPU_REF_PTR:
+ /* for rcu - we're not using rcu_dereference() */
+ smp_read_barrier_depends();
+ __this_cpu_dec(*((unsigned __percpu *) pcpu_count));
+ break;
+ case PCPU_REF_NONE:
+ case PCPU_REF_DYING:
+ atomic64_dec(&ref->count);
+ break;
+ case PCPU_REF_DEAD:
+ v = atomic64_dec_return(&ref->count);
+ v &= PCPU_COUNT_MASK;
+
+ ret = v == 0;
+ break;
+ }
+
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/**
+ * percpu_ref_kill - prepare a dynamic percpu refcount for teardown
+ *
+ * Must be called before dropping the initial ref, so that percpu_ref_put()
+ * knows to check for the refcount hitting 0. If the refcount was in percpu
+ * mode, converts it back to single atomic counter mode.
+ *
+ * Returns true the first time called on @ref and false if @ref is already
+ * shutting down, so it may be used by the caller for synchronizing other parts
+ * of a two stage shutdown.
+ */
+int percpu_ref_kill(struct percpu_ref *ref)
+{
+ unsigned long old, new, status, pcpu_count;
+
+ pcpu_count = ACCESS_ONCE(ref->pcpu_count);
+
+ do {
+ status = REF_STATUS(pcpu_count);
+
+ switch (status) {
+ case PCPU_REF_PTR:
+ new = PCPU_REF_DYING;
+ break;
+ case PCPU_REF_NONE:
+ new = PCPU_REF_DEAD;
+ break;
+ case PCPU_REF_DYING:
+ case PCPU_REF_DEAD:
+ return 0;
+ }
+
+ old = pcpu_count;
+ pcpu_count = cmpxchg(&ref->pcpu_count, old, new);
+ } while (pcpu_count != old);
+
+ if (status == PCPU_REF_PTR) {
+ unsigned count = 0, cpu;
+
+ synchronize_rcu();
+
+ for_each_possible_cpu(cpu)
+ count += *per_cpu_ptr((unsigned __percpu *) pcpu_count, cpu);
+
+ pr_debug("global %lli pcpu %i",
+ atomic64_read(&ref->count) & PCPU_COUNT_MASK,
+ (int) count);
+
+ atomic64_add((int) count, &ref->count);
+ smp_wmb();
+ /* Between setting global count and setting PCPU_REF_DEAD */
+ ref->pcpu_count = PCPU_REF_DEAD;
+
+ free_percpu((unsigned __percpu *) pcpu_count);
+ }
+
+ return 1;
+}
+
+/**
+ * percpu_ref_dead - check if a dynamic percpu refcount is shutting down
+ *
+ * Returns true if percpu_ref_kill() has been called on @ref, false otherwise.
+ */
+int percpu_ref_dead(struct percpu_ref *ref)
+{
+ unsigned status = REF_STATUS(ref->pcpu_count);
+
+ return status == PCPU_REF_DYING ||
+ status == PCPU_REF_DEAD;
+}
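To make the packing described at the top of this file easier to follow, here is an illustrative recap of how the two words are used; it restates the constants above rather than adding anything new:

/*
 * ref->count (atomic64_t):
 *	bits  0..49 (PCPU_COUNT_BITS)	- the refcount proper
 *	bits 50..63			- percpu_ref_get() calls since the last
 *					  wrap (wraps every 2^14 = 16384 gets)
 *
 * ref->pcpu_count (unsigned long), low two bits = state:
 *	PCPU_REF_NONE		- remaining bits hold the jiffies of the last wrap
 *	PCPU_REF_PTR		- the whole word is the percpu pointer (it is
 *				  aligned, so the status bits are naturally 0)
 *	PCPU_REF_DYING/DEAD	- teardown in progress / complete
 */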
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 07dbc8ec46cf..2c8ce496804f 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -242,6 +242,7 @@ bool balloon_page_isolate(struct page *page)
if (__is_movable_balloon_page(page) &&
page_count(page) == 2) {
__isolate_balloon_page(page);
+ balloon_event_count(COMPACTBALLOONISOLATED);
unlock_page(page);
return true;
}
@@ -265,6 +266,7 @@ void balloon_page_putback(struct page *page)
__putback_balloon_page(page);
/* drop the extra ref count taken for page isolation */
put_page(page);
+ balloon_event_count(COMPACTBALLOONRETURNED);
} else {
WARN_ON(1);
dump_page(page);
diff --git a/mm/dmapool.c b/mm/dmapool.c
index c69781e97cf9..668f26316e2e 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -132,6 +132,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
{
struct dma_pool *retval;
size_t allocation;
+ int node;
if (align == 0) {
align = 1;
@@ -156,7 +157,9 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
return NULL;
}
- retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev));
+ node = WARN_ON(!dev) ? -1 : dev_to_node(dev);
+
+ retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, node);
if (!retval)
return retval;
diff --git a/mm/filemap.c b/mm/filemap.c
index e1979fdca805..8984966309ef 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2528,7 +2528,8 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
BUG_ON(iocb->ki_pos != pos);
- sb_start_write(inode->i_sb);
+ if (!sb_start_file_write(file))
+ return -EAGAIN;
mutex_lock(&inode->i_mutex);
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
mutex_unlock(&inode->i_mutex);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 53b8201b31eb..669d16ac56dc 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -310,14 +310,31 @@ struct mem_cgroup {
/* thresholds for mem+swap usage. RCU-protected */
struct mem_cgroup_thresholds memsw_thresholds;
- /* For oom notifier event fd */
- struct list_head oom_notify;
+ union {
+ /* For oom notifier event fd */
+ struct list_head oom_notify;
+ /*
+ * we can only trigger an oom event if the memcg is alive.
+ * so we will reuse this field to hook the memcg in the list
+ * of dead memcgs.
+ */
+ struct list_head dead;
+ };
- /*
- * Should we move charges of a task when a task is moved into this
- * mem_cgroup ? And what type of charges should we move ?
- */
- unsigned long move_charge_at_immigrate;
+ union {
+ /*
+ * Should we move charges of a task when a task is moved into
+ * this mem_cgroup ? And what type of charges should we move ?
+ */
+ unsigned long move_charge_at_immigrate;
+
+ /*
+ * We are no longer concerned about moving charges after memcg
+ * is dead. So we will fill this up with its name, to aid
+ * debugging.
+ */
+ char *memcg_name;
+ };
/*
* set > 0 if pages under this cgroup are moving to other cgroup.
*/
@@ -369,6 +386,55 @@ static size_t memcg_size(void)
nr_node_ids * sizeof(struct mem_cgroup_per_node);
}
+#ifdef CONFIG_MEMCG_DEBUG_ASYNC_DESTROY
+static LIST_HEAD(dangling_memcgs);
+static DEFINE_MUTEX(dangling_memcgs_mutex);
+
+static inline void memcg_dangling_free(struct mem_cgroup *memcg)
+{
+ mutex_lock(&dangling_memcgs_mutex);
+ list_del(&memcg->dead);
+ mutex_unlock(&dangling_memcgs_mutex);
+ free_pages((unsigned long)memcg->memcg_name, 0);
+}
+
+static inline void memcg_dangling_add(struct mem_cgroup *memcg)
+{
+ /*
+ * cgroup.c will do page-sized allocations most of the time,
+ * so we'll just follow the pattern. Also, __get_free_pages
+ * is a better interface than kmalloc for us here, because
+ * we'd like this memory to be always billed to the root cgroup,
+ * not to the process removing the memcg. While kmalloc would
+ * require us to wrap it into memcg_stop/resume_kmem_account,
+ * with __get_free_pages we just don't pass the memcg flag.
+ */
+ memcg->memcg_name = (char *)__get_free_pages(GFP_KERNEL, 0);
+
+ /*
+ * We will, in general, just ignore failures. No need to go crazy,
+ * since this is just a debugging interface. It is nice to copy a memcg
+ * name over, but if we (unlikely) can't, just the address will do.
+ */
+ if (!memcg->memcg_name)
+ goto add_list;
+
+ if (cgroup_path(memcg->css.cgroup, memcg->memcg_name, PAGE_SIZE) < 0) {
+ free_pages((unsigned long)memcg->memcg_name, 0);
+ memcg->memcg_name = NULL;
+ }
+
+add_list:
+ INIT_LIST_HEAD(&memcg->dead);
+ mutex_lock(&dangling_memcgs_mutex);
+ list_add(&memcg->dead, &dangling_memcgs);
+ mutex_unlock(&dangling_memcgs_mutex);
+}
+#else
+static inline void memcg_dangling_free(struct mem_cgroup *memcg) {}
+static inline void memcg_dangling_add(struct mem_cgroup *memcg) {}
+#endif
+
/* internal only representation about the status of kmem accounting. */
enum {
KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
@@ -4970,6 +5036,107 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
return simple_read_from_buffer(buf, nbytes, ppos, str, len);
}
+#ifdef CONFIG_MEMCG_DEBUG_ASYNC_DESTROY
+static void
+mem_cgroup_dangling_swap(struct mem_cgroup *memcg, struct seq_file *m)
+{
+#ifdef CONFIG_MEMCG_SWAP
+ u64 kmem;
+ u64 memsw;
+
+ /*
+ * kmem will also propagate here, so we are only interested in the
+ * difference. See comment in mem_cgroup_reparent_charges for details.
+ *
+ * We could save this value for later consumption by kmem reports, but
+ * it is not much of a problem if the figures differ slightly.
+ */
+ kmem = res_counter_read_u64(&memcg->kmem, RES_USAGE);
+ memsw = res_counter_read_u64(&memcg->memsw, RES_USAGE) - kmem;
+ seq_printf(m, "\t%llu swap bytes\n", memsw);
+#endif
+}
+
+
+static void
+mem_cgroup_dangling_tcp(struct mem_cgroup *memcg, struct seq_file *m)
+{
+#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
+ struct tcp_memcontrol *tcp = &memcg->tcp_mem;
+ s64 tcp_socks;
+ u64 tcp_bytes;
+
+ tcp_socks = percpu_counter_sum_positive(&tcp->tcp_sockets_allocated);
+ tcp_bytes = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE);
+ seq_printf(m, "\t%llu tcp bytes", tcp_bytes);
+ /*
+ * If tcp_bytes == 0 while tcp_socks != 0, that is a bug: one more
+ * reason to print it!
+ */
+ if (tcp_bytes || tcp_socks)
+ seq_printf(m, ", in %lld sockets", tcp_socks);
+ seq_printf(m, "\n");
+
+#endif
+}
+
+static void
+mem_cgroup_dangling_kmem(struct mem_cgroup *memcg, struct seq_file *m)
+{
+#ifdef CONFIG_MEMCG_KMEM
+ u64 kmem;
+ struct memcg_cache_params *params;
+
+ kmem = res_counter_read_u64(&memcg->kmem, RES_USAGE);
+ seq_printf(m, "\t%llu kmem bytes", kmem);
+
+ /* the list below may not be initialized, so don't even try */
+ if (!kmem)
+ return;
+
+ seq_printf(m, " in caches");
+ mutex_lock(&memcg->slab_caches_mutex);
+ list_for_each_entry(params, &memcg->memcg_slab_caches, list) {
+ struct kmem_cache *s = memcg_params_to_cache(params);
+
+ seq_printf(m, " %s", s->name);
+ }
+ mutex_unlock(&memcg->slab_caches_mutex);
+ seq_printf(m, "\n");
+#endif
+}
+
+/*
+ * After a memcg is destroyed, it may still be kept around in memory.
+ * Currently, the two main reasons for that are swap entries and kernel memory.
+ * Because they will be freed asynchronously, they will pin the memcg structure
+ * and its resources until the last reference goes away.
+ *
+ * This root-only file shows which resources are still pinning those memcgs.
+ */
+static int mem_cgroup_dangling_read(struct cgroup *cont, struct cftype *cft,
+ struct seq_file *m)
+{
+ struct mem_cgroup *memcg;
+
+ mutex_lock(&dangling_memcgs_mutex);
+
+ list_for_each_entry(memcg, &dangling_memcgs, dead) {
+ if (memcg->memcg_name)
+ seq_printf(m, "%s:\n", memcg->memcg_name);
+ else
+ seq_printf(m, "%p (name lost):\n", memcg);
+
+ mem_cgroup_dangling_swap(memcg, m);
+ mem_cgroup_dangling_tcp(memcg, m);
+ mem_cgroup_dangling_kmem(memcg, m);
+ }
+
+ mutex_unlock(&dangling_memcgs_mutex);
+ return 0;
+}
+#endif
+
static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
{
int ret = -EINVAL;
@@ -5871,6 +6038,14 @@ static struct cftype mem_cgroup_files[] = {
},
#endif
#endif
+
+#ifdef CONFIG_MEMCG_DEBUG_ASYNC_DESTROY
+ {
+ .name = "dangling_memcgs",
+ .read_seq_string = mem_cgroup_dangling_read,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+#endif
{ }, /* terminate */
};
@@ -6020,6 +6195,8 @@ static void free_work(struct work_struct *work)
struct mem_cgroup *memcg;
memcg = container_of(work, struct mem_cgroup, work_freeing);
+
+ memcg_dangling_free(memcg);
__mem_cgroup_free(memcg);
}
@@ -6194,6 +6371,7 @@ static void mem_cgroup_css_free(struct cgroup *cont)
kmem_cgroup_destroy(memcg);
+ memcg_dangling_add(memcg);
mem_cgroup_put(memcg);
}
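
The struct mem_cgroup hunk above overlays fields that only matter while the memcg is live (oom_notify, move_charge_at_immigrate) with fields that only matter once it is dead (the dangling-list hook and the saved name), so the debug support adds no size to the structure. Below is a minimal userspace sketch of that field-reuse pattern; the 'thing' type and its helpers are hypothetical and only illustrate the idea, they are not kernel code.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct thing {
	int live;	/* selects which union member is currently valid */
	union {
		unsigned long live_only_counter;	/* meaningful while live */
		char *debug_name;			/* reused after retirement */
	};
};

/* Retire the object: the live-only field is no longer needed, so its
 * storage is reused to remember a human-readable name for debugging. */
static void retire(struct thing *t, const char *name)
{
	t->live = 0;
	t->debug_name = strdup(name);
}

int main(void)
{
	struct thing t = { .live = 1, .live_only_counter = 42 };

	printf("live counter: %lu\n", t.live_only_counter);
	retire(&t, "parent/child");
	printf("retired as: %s\n", t.debug_name);
	free(t.debug_name);
	return 0;
}
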
diff --git a/mm/migrate.c b/mm/migrate.c
index 3bbaf5d230b0..4e3dab51f8fa 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -876,6 +876,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
dec_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
balloon_page_free(page);
+ balloon_event_count(COMPACTBALLOONMIGRATED);
return MIGRATEPAGE_SUCCESS;
}
out:
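
The balloon_event_count(COMPACTBALLOONMIGRATED) call above accounts the migration against one of the new "compact_balloon_*" strings added to vmstat_text in the mm/vmstat.c hunk further down; that only works if the event ids and the string table stay in lock-step (same order, same #ifdef coverage). A tiny userspace sketch of that enum-indexes-a-name-table convention follows; the identifiers and the counting helper are made up for illustration and are not the kernel's.

#include <stdio.h>

/* Hypothetical event ids; in the kernel these would sit in the
 * vm_event_item enum under the same config options as the strings. */
enum balloon_event {
	BALLOON_ISOLATED,
	BALLOON_MIGRATED,
	BALLOON_RETURNED,
	NR_BALLOON_EVENTS,
};

/* Must match the enum order one-to-one, just like vmstat_text. */
static const char * const balloon_event_text[NR_BALLOON_EVENTS] = {
	"compact_balloon_isolated",
	"compact_balloon_migrated",
	"compact_balloon_returned",
};

static unsigned long balloon_counters[NR_BALLOON_EVENTS];

static void balloon_event_count(enum balloon_event ev)
{
	balloon_counters[ev]++;	/* the kernel uses per-cpu counters instead */
}

int main(void)
{
	int i;

	balloon_event_count(BALLOON_MIGRATED);
	for (i = 0; i < NR_BALLOON_EVENTS; i++)
		printf("%s %lu\n", balloon_event_text[i], balloon_counters[i]);
	return 0;
}
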
diff --git a/mm/mmap.c b/mm/mmap.c
index 2664a47cec93..49dc7d577b46 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1816,15 +1816,6 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
}
#endif
-void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
-{
- /*
- * Is this a new hole at the lowest possible address?
- */
- if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache)
- mm->free_area_cache = addr;
-}
-
/*
* This mmap-allocator allocates new areas top-down from below the
* stack's low limit (the base):
@@ -1881,19 +1872,6 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
}
#endif
-void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr)
-{
- /*
- * Is this a new hole at the highest possible address?
- */
- if (addr > mm->free_area_cache)
- mm->free_area_cache = addr;
-
- /* dont allow allocations above current base */
- if (mm->free_area_cache > mm->mmap_base)
- mm->free_area_cache = mm->mmap_base;
-}
-
unsigned long
get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
@@ -2317,7 +2295,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct vm_area_struct **insertion_point;
struct vm_area_struct *tail_vma = NULL;
- unsigned long addr;
insertion_point = (prev ? &prev->vm_next : &mm->mmap);
vma->vm_prev = NULL;
@@ -2334,11 +2311,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
} else
mm->highest_vm_end = prev ? prev->vm_end : 0;
tail_vma->vm_next = NULL;
- if (mm->unmap_area == arch_unmap_area)
- addr = prev ? prev->vm_end : mm->mmap_base;
- else
- addr = vma ? vma->vm_start : mm->mmap_base;
- mm->unmap_area(mm, addr);
mm->mmap_cache = NULL; /* Kill the cache. */
}
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index 3dcfaf4ed355..8a8cd0265e52 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -14,9 +14,6 @@
* use_mm
* Makes the calling kernel thread take on the specified
* mm context.
- * Called by the retry thread execute retries within the
- * iocb issuer's mm context, so that copy_from/to_user
- * operations work seamlessly for aio.
* (Note: this routine is intended to be called only
* from a kernel thread context)
*/
diff --git a/mm/nommu.c b/mm/nommu.c
index e19328087534..f5d57a3cca2b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1859,10 +1859,6 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
return -ENOMEM;
}
-void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
-{
-}
-
void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen,
int even_cows)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0dade3f18f7d..6cacfee8355a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3907,8 +3907,11 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
* exist on hotplugged memory.
*/
if (context == MEMMAP_EARLY) {
- if (!early_pfn_valid(pfn))
+ if (!early_pfn_valid(pfn)) {
+ pfn = ALIGN(pfn + MAX_ORDER_NR_PAGES,
+ MAX_ORDER_NR_PAGES) - 1;
continue;
+ }
if (!early_pfn_in_nid(pfn, nid))
continue;
}
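
The memmap_init_zone() change above stops probing every pfn inside an invalid range: it rounds pfn + MAX_ORDER_NR_PAGES up to a MAX_ORDER boundary and backs off by one, so the for loop's own pfn++ resumes exactly on that boundary. A small userspace sketch of just that arithmetic; MAX_ORDER_NR_PAGES is hard-coded to 1024 purely for illustration.

#include <stdio.h>

#define MAX_ORDER_NR_PAGES 1024UL	/* illustrative value only */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long pfn = 5000;	/* pretend early_pfn_valid() failed here */

	/* The patch's expression: round up past at least one full block,
	 * then back off by one page frame... */
	pfn = ALIGN(pfn + MAX_ORDER_NR_PAGES, MAX_ORDER_NR_PAGES) - 1;
	printf("skip to pfn %lu\n", pfn);	/* 6143 */

	/* ...so the loop's pfn++ lands on the MAX_ORDER-aligned boundary. */
	pfn++;
	printf("next pfn checked: %lu\n", pfn);	/* 6144 */
	return 0;
}
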
diff --git a/mm/page_io.c b/mm/page_io.c
index 78eee32ee486..c535d395a440 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -20,6 +20,7 @@
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/frontswap.h>
+#include <linux/aio.h>
#include <asm/pgtable.h>
static struct bio *get_swap_bio(gfp_t gfp_flags,
diff --git a/mm/shmem.c b/mm/shmem.c
index ed2befb4952e..09ee154de93a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -30,6 +30,7 @@
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/swap.h>
+#include <linux/aio.h>
static struct vfsmount *shm_mnt;
diff --git a/mm/swap.c b/mm/swap.c
index 8a529a01e8fc..92a9be551846 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -30,6 +30,7 @@
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>
#include <linux/gfp.h>
+#include <linux/uio.h>
#include "internal.h"
diff --git a/mm/util.c b/mm/util.c
index ab1424dbe2e6..7441c41d00f6 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -295,7 +295,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
{
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = arch_get_unmapped_area;
- mm->unmap_area = arch_unmap_area;
}
#endif
diff --git a/mm/vmstat.c b/mm/vmstat.c
index e1d8ed172c42..292b1cf785e0 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -792,7 +792,14 @@ const char * const vmstat_text[] = {
"compact_stall",
"compact_fail",
"compact_success",
-#endif
+
+#ifdef CONFIG_BALLOON_COMPACTION
+ "compact_balloon_isolated",
+ "compact_balloon_migrated",
+ "compact_balloon_returned",
+#endif /* CONFIG_BALLOON_COMPACTION */
+
+#endif /* CONFIG_COMPACTION */
#ifdef CONFIG_HUGETLB_PAGE
"htlb_buddy_alloc_success",
diff --git a/scripts/pnmtologo.c b/scripts/pnmtologo.c
index 68bb4efc5af4..ce937970cf98 100644
--- a/scripts/pnmtologo.c
+++ b/scripts/pnmtologo.c
@@ -104,9 +104,11 @@ static unsigned int get_number(FILE *fp)
val = 0;
while (isdigit(c)) {
val = 10*val+c-'0';
- /* some PBM are 'broken'; GiMP for example exports a PBM without space
- * between the digits. This is Ok cause we know a PBM can only have a '1'
- * or a '0' for the digit. */
+ /*
+ * Some PBM files are 'broken'; GIMP, for example, exports a PBM without
+ * spaces between the digits. This is OK because we know a plain PBM can
+ * only have a '1' or a '0' for each digit.
+ */
if (is_plain_pbm)
break;
c = fgetc(fp);
diff --git a/security/keys/internal.h b/security/keys/internal.h
index 8bbefc3b55d4..d4f1468b9b50 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -16,6 +16,8 @@
#include <linux/key-type.h>
#include <linux/task_work.h>
+struct iovec;
+
#ifdef __KDEBUG
#define kenter(FMT, ...) \
printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 4b5c948eb414..33cfd27b4de2 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -22,6 +22,7 @@
#include <linux/err.h>
#include <linux/vmalloc.h>
#include <linux/security.h>
+#include <linux/uio.h>
#include <asm/uaccess.h>
#include "internal.h"
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index 71ae86ca64ac..d470c35fcb41 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -25,7 +25,7 @@
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/pm_qos.h>
-#include <linux/uio.h>
+#include <linux/aio.h>
#include <linux/dma-mapping.h>
#include <sound/core.h>
#include <sound/control.h>
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 3cc0ad7ae863..b51381459214 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -1,4 +1,5 @@
TARGETS = breakpoints
+TARGETS += epoll
TARGETS += kcmp
TARGETS += mqueue
TARGETS += vm
diff --git a/tools/testing/selftests/epoll/Makefile b/tools/testing/selftests/epoll/Makefile
new file mode 100644
index 000000000000..19806ed62f50
--- /dev/null
+++ b/tools/testing/selftests/epoll/Makefile
@@ -0,0 +1,11 @@
+# Makefile for epoll selftests
+
+all: test_epoll
+%: %.c
+ gcc -pthread -g -o $@ $^
+
+run_tests: all
+ ./test_epoll
+
+clean:
+ $(RM) test_epoll
diff --git a/tools/testing/selftests/epoll/test_epoll.c b/tools/testing/selftests/epoll/test_epoll.c
new file mode 100644
index 000000000000..1034ed4cc5b4
--- /dev/null
+++ b/tools/testing/selftests/epoll/test_epoll.c
@@ -0,0 +1,365 @@
+/*
+ * tools/testing/selftests/epoll/test_epoll.c
+ *
+ * Copyright 2012 Adobe Systems Incorporated
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Paton J. Lewis <palewis@adobe.com>
+ *
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/epoll.h>
+#include <sys/socket.h>
+
+/*
+ * A pointer to an epoll_item_private structure will be stored in the epoll
+ * item's event structure so that we can get access to the epoll_item_private
+ * data after calling epoll_wait:
+ */
+struct epoll_item_private {
+ int index; /* Position of this struct within the epoll_items array. */
+ int fd;
+ uint32_t events;
+ pthread_mutex_t mutex; /* Guards the following variables... */
+ int stop;
+ int status; /* Stores any error encountered while handling item. */
+ /* The following variable allows us to test whether we have encountered
+ a problem while attempting to cancel and delete the associated
+ event. When the test program exits, 'deleted' should be exactly
+ one. If it is greater than one, then the failed test reflects a real
+ world situation where we would have tried to access the epoll item's
+ private data after deleting it: */
+ int deleted;
+};
+
+struct epoll_item_private *epoll_items;
+
+/*
+ * Delete the specified item from the epoll set. In a real-world scenario this
+ * is where we would free the associated data structure, but in this testing
+ * environment we retain the structure so that we can test for double-deletion:
+ */
+void delete_item(int index)
+{
+ __sync_fetch_and_add(&epoll_items[index].deleted, 1);
+}
+
+/*
+ * A pointer to a read_thread_data structure will be passed as the argument to
+ * each read thread:
+ */
+struct read_thread_data {
+ int stop;
+ int status; /* Indicates any error encountered by the read thread. */
+ int epoll_set;
+};
+
+/*
+ * The function executed by the read threads:
+ */
+void *read_thread_function(void *function_data)
+{
+ struct read_thread_data *thread_data =
+ (struct read_thread_data *)function_data;
+ struct epoll_event event_data;
+ struct epoll_item_private *item_data;
+ char socket_data;
+
+ /* Handle events until we encounter an error or this thread's 'stop'
+ condition is set: */
+ while (1) {
+ int result = epoll_wait(thread_data->epoll_set,
+ &event_data,
+ 1, /* Number of desired events */
+ 1000); /* Timeout in ms */
+ if (result < 0) {
+ /* Breakpoints signal all threads. Ignore that while
+ debugging: */
+ if (errno == EINTR)
+ continue;
+ thread_data->status = errno;
+ return 0;
+ } else if (thread_data->stop)
+ return 0;
+ else if (result == 0) /* Timeout */
+ continue;
+
+ /* We need the mutex here because checking for the stop
+ condition and re-enabling the epoll item need to be done
+ together as one atomic operation when EPOLL_CTL_DISABLE is
+ available: */
+ item_data = (struct epoll_item_private *)event_data.data.ptr;
+ pthread_mutex_lock(&item_data->mutex);
+
+ /* Remove the item from the epoll set if we want to stop
+ handling that event: */
+ if (item_data->stop)
+ delete_item(item_data->index);
+ else {
+ /* Clear the data that was written to the other end of
+ our non-blocking socket: */
+ do {
+ if (read(item_data->fd, &socket_data, 1) < 1) {
+ if ((errno == EAGAIN) ||
+ (errno == EWOULDBLOCK))
+ break;
+ else
+ goto error_unlock;
+ }
+ } while (item_data->events & EPOLLET);
+
+ /* The item was one-shot, so re-enable it: */
+ event_data.events = item_data->events;
+ if (epoll_ctl(thread_data->epoll_set,
+ EPOLL_CTL_MOD,
+ item_data->fd,
+ &event_data) < 0)
+ goto error_unlock;
+ }
+
+ pthread_mutex_unlock(&item_data->mutex);
+ }
+
+error_unlock:
+ thread_data->status = item_data->status = errno;
+ pthread_mutex_unlock(&item_data->mutex);
+ return 0;
+}
+
+/*
+ * A pointer to a write_thread_data structure will be passed as the argument to
+ * the write thread:
+ */
+struct write_thread_data {
+ int stop;
+ int status; /* Indicates any error encountered by the write thread. */
+ int n_fds;
+ int *fds;
+};
+
+/*
+ * The function executed by the write thread. It writes a single byte to each
+ * socket in turn until the stop condition for this thread is set. If writing to
+ * a socket would block (i.e. errno was EAGAIN), we leave that socket alone for
+ * the moment and just move on to the next socket in the list. We don't care
+ * about the order in which we deliver events to the epoll set. In fact we don't
+ * care about the data we're writing to the pipes at all; we just want to
+ * trigger epoll events:
+ */
+void *write_thread_function(void *function_data)
+{
+ const char data = 'X';
+ int index;
+ struct write_thread_data *thread_data =
+ (struct write_thread_data *)function_data;
+ while (!thread_data->stop)
+ for (index = 0;
+ !thread_data->stop && (index < thread_data->n_fds);
+ ++index)
+ if ((write(thread_data->fds[index], &data, 1) < 1) &&
+ (errno != EAGAIN) &&
+ (errno != EWOULDBLOCK)) {
+ thread_data->status = errno;
+ return 0;
+ }
+ return 0;
+}
+
+/*
+ * Arguments are currently ignored:
+ */
+int main(int argc, char **argv)
+{
+ const int n_read_threads = 100;
+ const int n_epoll_items = 500;
+ int index;
+ int epoll_set = epoll_create1(0);
+ struct write_thread_data write_thread_data = {
+ 0, 0, n_epoll_items, malloc(n_epoll_items * sizeof(int))
+ };
+ struct read_thread_data *read_thread_data =
+ malloc(n_read_threads * sizeof(struct read_thread_data));
+ pthread_t *read_threads = malloc(n_read_threads * sizeof(pthread_t));
+ pthread_t write_thread;
+ int socket_pair[2];
+ struct epoll_event event_data;
+
+ printf("------------------\n");
+ printf("Running test_epoll\n");
+ printf("------------------\n");
+
+ epoll_items = malloc(n_epoll_items * sizeof(struct epoll_item_private));
+
+ if (epoll_set < 0 || !epoll_items || write_thread_data.fds == NULL ||
+ !read_thread_data || !read_threads)
+ goto error;
+
+ if (sysconf(_SC_NPROCESSORS_ONLN) < 2) {
+ printf("Error: please run this test on a multi-core system.\n");
+ goto error;
+ }
+
+ /* Create the socket pairs and epoll items: */
+ for (index = 0; index < n_epoll_items; ++index) {
+ if (socketpair(AF_UNIX,
+ SOCK_STREAM | SOCK_NONBLOCK,
+ 0,
+ socket_pair) < 0)
+ goto error;
+ write_thread_data.fds[index] = socket_pair[0];
+ epoll_items[index].index = index;
+ epoll_items[index].fd = socket_pair[1];
+ if (pthread_mutex_init(&epoll_items[index].mutex, NULL) != 0)
+ goto error;
+ /* We always use EPOLLONESHOT because this test is currently
+ structured to demonstrate the need for EPOLL_CTL_DISABLE,
+ which only produces useful information in the EPOLLONESHOT
+ case (without EPOLLONESHOT, calling epoll_ctl with
+ EPOLL_CTL_DISABLE will never return EBUSY). If support for
+ testing events without EPOLLONESHOT is desired, it should
+ probably be implemented in a separate unit test. */
+ epoll_items[index].events = EPOLLIN | EPOLLONESHOT;
+ if (index < n_epoll_items / 2)
+ epoll_items[index].events |= EPOLLET;
+ epoll_items[index].stop = 0;
+ epoll_items[index].status = 0;
+ epoll_items[index].deleted = 0;
+ event_data.events = epoll_items[index].events;
+ event_data.data.ptr = &epoll_items[index];
+ if (epoll_ctl(epoll_set,
+ EPOLL_CTL_ADD,
+ epoll_items[index].fd,
+ &event_data) < 0)
+ goto error;
+ }
+
+#ifdef EPOLL_CTL_DISABLE
+ /* Test to make sure that using EPOLL_CTL_DISABLE without EPOLLONESHOT
+ returns a clear error: */
+ if (socketpair(AF_UNIX,
+ SOCK_STREAM | SOCK_NONBLOCK,
+ 0,
+ socket_pair) < 0)
+ goto error;
+ event_data.events = EPOLLIN;
+ event_data.data.ptr = NULL;
+ if (epoll_ctl(epoll_set, EPOLL_CTL_ADD,
+ socket_pair[1], &event_data) < 0)
+ goto error;
+ if ((epoll_ctl(epoll_set, EPOLL_CTL_DISABLE,
+ socket_pair[1], NULL) == 0) || (errno != EINVAL))
+ goto error;
+ if (epoll_ctl(epoll_set, EPOLL_CTL_DEL, socket_pair[1], NULL) != 0)
+ goto error;
+#endif
+
+ /* Create and start the read threads: */
+ for (index = 0; index < n_read_threads; ++index) {
+ read_thread_data[index].stop = 0;
+ read_thread_data[index].status = 0;
+ read_thread_data[index].epoll_set = epoll_set;
+ if (pthread_create(&read_threads[index],
+ NULL,
+ read_thread_function,
+ &read_thread_data[index]) != 0)
+ goto error;
+ }
+
+ if (pthread_create(&write_thread,
+ NULL,
+ write_thread_function,
+ &write_thread_data) != 0)
+ goto error;
+
+ /* Cancel all event pollers: */
+#ifdef EPOLL_CTL_DISABLE
+ for (index = 0; index < n_epoll_items; ++index) {
+ pthread_mutex_lock(&epoll_items[index].mutex);
+ ++epoll_items[index].stop;
+ if (epoll_ctl(epoll_set,
+ EPOLL_CTL_DISABLE,
+ epoll_items[index].fd,
+ NULL) == 0)
+ delete_item(index);
+ else if (errno != EBUSY) {
+ pthread_mutex_unlock(&epoll_items[index].mutex);
+ goto error;
+ }
+ /* EBUSY means events were being handled; allow the other thread
+ to delete the item. */
+ pthread_mutex_unlock(&epoll_items[index].mutex);
+ }
+#else
+ for (index = 0; index < n_epoll_items; ++index) {
+ pthread_mutex_lock(&epoll_items[index].mutex);
+ ++epoll_items[index].stop;
+ pthread_mutex_unlock(&epoll_items[index].mutex);
+ /* Wait in case a thread running read_thread_function is
+ currently executing code between epoll_wait and
+ pthread_mutex_lock with this item. Note that a longer delay
+ would make double-deletion less likely (at the expense of
+ performance), but there is no guarantee that any delay would
+ ever be sufficient. Note also that we delete all event
+ pollers at once for testing purposes, but in a real-world
+ environment we are likely to want to be able to cancel event
+ pollers at arbitrary times. Therefore we can't improve this
+ situation by just splitting this loop into two loops
+ (i.e. signal 'stop' for all items, sleep, and then delete all
+ items). We also can't fix the problem via EPOLL_CTL_DEL
+ because that command can't prevent the case where some other
+ thread is executing read_thread_function within the region
+ mentioned above: */
+ usleep(1);
+ pthread_mutex_lock(&epoll_items[index].mutex);
+ if (!epoll_items[index].deleted)
+ delete_item(index);
+ pthread_mutex_unlock(&epoll_items[index].mutex);
+ }
+#endif
+
+ /* Shut down the read threads: */
+ for (index = 0; index < n_read_threads; ++index)
+ __sync_fetch_and_add(&read_thread_data[index].stop, 1);
+ for (index = 0; index < n_read_threads; ++index) {
+ if (pthread_join(read_threads[index], NULL) != 0)
+ goto error;
+ if (read_thread_data[index].status)
+ goto error;
+ }
+
+ /* Shut down the write thread: */
+ __sync_fetch_and_add(&write_thread_data.stop, 1);
+ if ((pthread_join(write_thread, NULL) != 0) || write_thread_data.status)
+ goto error;
+
+ /* Check for final error conditions: */
+ for (index = 0; index < n_epoll_items; ++index) {
+ if (epoll_items[index].status != 0)
+ goto error;
+ if (pthread_mutex_destroy(&epoll_items[index].mutex) < 0)
+ goto error;
+ }
+ for (index = 0; index < n_epoll_items; ++index)
+ if (epoll_items[index].deleted != 1) {
+ printf("Error: item data deleted %1d times.\n",
+ epoll_items[index].deleted);
+ goto error;
+ }
+
+ printf("[PASS]\n");
+ return 0;
+
+ error:
+ printf("[FAIL]\n");
+ return errno;
+}
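
For reference, the read threads above rely on the standard EPOLLONESHOT re-arm sequence: once a one-shot event has been delivered, the fd stays in the set but is disarmed until EPOLL_CTL_MOD re-enables it. A minimal single-threaded sketch of just that sequence, kept separate from the selftest itself:

#include <stdio.h>
#include <unistd.h>
#include <sys/epoll.h>
#include <sys/socket.h>

int main(void)
{
	int ep = epoll_create1(0);
	int sp[2];
	struct epoll_event ev = { .events = EPOLLIN | EPOLLONESHOT };
	char c = 'X';
	int i;

	if (ep < 0 || socketpair(AF_UNIX, SOCK_STREAM, 0, sp) < 0)
		return 1;
	ev.data.fd = sp[1];
	if (epoll_ctl(ep, EPOLL_CTL_ADD, sp[1], &ev) < 0)
		return 1;

	for (i = 0; i < 2; i++) {
		if (write(sp[0], &c, 1) != 1)
			return 1;
		if (epoll_wait(ep, &ev, 1, 1000) != 1)
			return 1;
		if (read(ev.data.fd, &c, 1) != 1)
			return 1;
		printf("event %d handled\n", i);

		/* One-shot: without this EPOLL_CTL_MOD the next write would
		 * never be reported for this fd. */
		ev.events = EPOLLIN | EPOLLONESHOT;
		if (epoll_ctl(ep, EPOLL_CTL_MOD, ev.data.fd, &ev) < 0)
			return 1;
	}
	return 0;
}
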