summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/aio.c500
-rw-r--r--fs/autofs4/autofs_i.h4
-rw-r--r--fs/autofs4/dev-ioctl.c16
-rw-r--r--fs/autofs4/inode.c33
-rw-r--r--fs/autofs4/waitq.c16
-rw-r--r--fs/binfmt_aout.c2
-rw-r--r--fs/binfmt_elf.c23
-rw-r--r--fs/bio-integrity.c3
-rw-r--r--fs/bio.c63
-rw-r--r--fs/btrfs/check-integrity.c14
-rw-r--r--fs/btrfs/compression.c6
-rw-r--r--fs/btrfs/disk-io.c6
-rw-r--r--fs/btrfs/extent_io.c12
-rw-r--r--fs/btrfs/inode.c15
-rw-r--r--fs/btrfs/raid56.c9
-rw-r--r--fs/btrfs/scrub.c18
-rw-r--r--fs/btrfs/volumes.c5
-rw-r--r--fs/buffer.c3
-rw-r--r--fs/configfs/file.c2
-rw-r--r--fs/direct-io.c20
-rw-r--r--fs/drop_caches.c2
-rw-r--r--fs/ext4/page-io.c3
-rw-r--r--fs/f2fs/data.c2
-rw-r--r--fs/f2fs/segment.c3
-rw-r--r--fs/fat/file.c108
-rw-r--r--fs/fat/inode.c54
-rw-r--r--fs/fat/misc.c5
-rw-r--r--fs/gfs2/lops.c3
-rw-r--r--fs/gfs2/ops_fstype.c3
-rw-r--r--fs/hfsplus/wrapper.c3
-rw-r--r--fs/jfs/jfs_logmgr.c6
-rw-r--r--fs/jfs/jfs_metapage.c6
-rw-r--r--fs/logfs/dev_bdev.c8
-rw-r--r--fs/mpage.c2
-rw-r--r--fs/ncpfs/mmap.c2
-rw-r--r--fs/nfs/blocklayout/blocklayout.c17
-rw-r--r--fs/nilfs2/segbuf.c3
-rw-r--r--fs/ocfs2/cluster/heartbeat.c4
-rw-r--r--fs/ocfs2/namei.c2
-rw-r--r--fs/proc/kmsg.c10
-rw-r--r--fs/proc/task_mmu.c145
-rw-r--r--fs/xfs/xfs_aops.c3
-rw-r--r--fs/xfs/xfs_buf.c3
43 files changed, 864 insertions, 303 deletions
diff --git a/fs/aio.c b/fs/aio.c
index 7fe5bdee1630..827485113682 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -25,7 +25,9 @@
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/mman.h>
+#include <linux/bio.h>
#include <linux/mmu_context.h>
+#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/timer.h>
#include <linux/aio.h>
@@ -35,6 +37,8 @@
#include <linux/eventfd.h>
#include <linux/blkdev.h>
#include <linux/compat.h>
+#include <linux/percpu-refcount.h>
+#include <linux/radix-tree.h>
#include <asm/kmap_types.h>
#include <asm/uaccess.h>
@@ -59,14 +63,22 @@ struct aio_ring {
#define AIO_RING_PAGES 8
+struct kioctx_cpu {
+ unsigned reqs_available; /* ring slots currently held by this cpu;
+ * get_reqs_available()/put_reqs_available()
+ * move them to and from the kioctx-wide
+ * atomic counter req_batch at a time */
+};
+
struct kioctx {
- atomic_t users;
- atomic_t dead;
+ struct percpu_ref users;
- /* This needs improving */
unsigned long user_id;
- struct hlist_node list;
+ struct __percpu kioctx_cpu *cpu;
+
+ /*
+ * For percpu reqs_available, number of slots we move to/from global
+ * counter at a time:
+ */
+ unsigned req_batch;
/*
* This is what userspace passed to io_setup(), it's not used for
* anything but counting against the global max_reqs quota.
@@ -89,7 +101,15 @@ struct kioctx {
struct work_struct rcu_work;
struct {
- atomic_t reqs_active;
+ /*
+ * This counts the number of available slots in the ringbuffer,
+ * so we avoid overflowing it: it's decremented (if positive)
+ * when allocating a kiocb and incremented when the resulting
+ * io_event is pulled off the ringbuffer.
+ *
+ * We batch accesses to it with a percpu version.
+ */
+ atomic_t reqs_available;
} ____cacheline_aligned_in_smp;
struct {
@@ -100,11 +120,23 @@ struct kioctx {
struct {
struct mutex ring_lock;
wait_queue_head_t wait;
+
+ /*
+ * Copy of the real tail - to reduce cacheline bouncing. Updated
+ * by aio_complete() whenever it updates the real tail.
+ */
+ unsigned shadow_tail;
} ____cacheline_aligned_in_smp;
struct {
+ /*
+ * This is the canonical copy of the tail pointer, updated by
+ * aio_complete(). But aio_complete() also uses it as a lock, so
+ * other code can't use it; aio_complete() keeps shadow_tail in
+ * sync with the real value of the tail pointer for other code
+ * to use.
+ */
unsigned tail;
- spinlock_t completion_lock;
} ____cacheline_aligned_in_smp;
struct page *internal_pages[AIO_RING_PAGES];
@@ -275,6 +307,8 @@ static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb,
static void free_ioctx_rcu(struct rcu_head *head)
{
struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
+
+ free_percpu(ctx->cpu);
kmem_cache_free(kioctx_cachep, ctx);
}
@@ -288,7 +322,7 @@ static void free_ioctx(struct kioctx *ctx)
struct aio_ring *ring;
struct io_event res;
struct kiocb *req;
- unsigned head, avail;
+ unsigned cpu, head, avail;
spin_lock_irq(&ctx->ctx_lock);
@@ -302,23 +336,31 @@ static void free_ioctx(struct kioctx *ctx)
spin_unlock_irq(&ctx->ctx_lock);
+ for_each_possible_cpu(cpu) {
+ struct kioctx_cpu *kcpu = per_cpu_ptr(ctx->cpu, cpu);
+
+ atomic_add(kcpu->reqs_available, &ctx->reqs_available);
+ kcpu->reqs_available = 0;
+ }
+
ring = kmap_atomic(ctx->ring_pages[0]);
head = ring->head;
kunmap_atomic(ring);
- while (atomic_read(&ctx->reqs_active) > 0) {
+ while (atomic_read(&ctx->reqs_available) < ctx->nr_events - 1) {
wait_event(ctx->wait,
- head != ctx->tail ||
- atomic_read(&ctx->reqs_active) <= 0);
+ (head != ctx->shadow_tail) ||
+ (atomic_read(&ctx->reqs_available) >= ctx->nr_events - 1));
- avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head;
+ avail = (head <= ctx->shadow_tail
+ ? ctx->shadow_tail : ctx->nr_events) - head;
- atomic_sub(avail, &ctx->reqs_active);
+ atomic_add(avail, &ctx->reqs_available);
head += avail;
head %= ctx->nr_events;
}
- WARN_ON(atomic_read(&ctx->reqs_active) < 0);
+ WARN_ON(atomic_read(&ctx->reqs_available) > ctx->nr_events - 1);
aio_free_ring(ctx);
@@ -341,7 +383,7 @@ static void free_ioctx(struct kioctx *ctx)
static void put_ioctx(struct kioctx *ctx)
{
- if (unlikely(atomic_dec_and_test(&ctx->users)))
+ if (percpu_ref_put(&ctx->users))
free_ioctx(ctx);
}
@@ -354,6 +396,18 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
struct kioctx *ctx;
int err = -ENOMEM;
+ /*
+ * We keep track of the number of available ringbuffer slots, to prevent
+ * overflow (reqs_available), and we also use percpu counters for this.
+ *
+ * So since up to half the slots might be on other cpu's percpu counters
+ * and unavailable, double nr_events so userspace sees what they
+ * expected: additionally, we move req_batch slots to/from percpu
+ * counters at a time, so make sure that isn't 0:
+ */
+ nr_events = max(nr_events, num_possible_cpus() * 4);
+ nr_events *= 2;
+
/* Prevent overflows */
if ((nr_events > (0x10000000U / sizeof(struct io_event))) ||
(nr_events > (0x10000000U / sizeof(struct kiocb)))) {
@@ -370,18 +424,28 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
ctx->max_reqs = nr_events;
- atomic_set(&ctx->users, 2);
- atomic_set(&ctx->dead, 0);
+ percpu_ref_init(&ctx->users);
+ rcu_read_lock();
+ percpu_ref_get(&ctx->users);
+ rcu_read_unlock();
+
spin_lock_init(&ctx->ctx_lock);
- spin_lock_init(&ctx->completion_lock);
mutex_init(&ctx->ring_lock);
init_waitqueue_head(&ctx->wait);
INIT_LIST_HEAD(&ctx->active_reqs);
- if (aio_setup_ring(ctx) < 0)
+ ctx->cpu = alloc_percpu(struct kioctx_cpu);
+ if (!ctx->cpu)
goto out_freectx;
+ if (aio_setup_ring(ctx) < 0)
+ goto out_freepcpu;
+
+ atomic_set(&ctx->reqs_available, ctx->nr_events - 1);
+ ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4);
+ BUG_ON(!ctx->req_batch);
+
/* limit the number of system wide aios */
spin_lock(&aio_nr_lock);
if (aio_nr + nr_events > aio_max_nr ||
@@ -392,10 +456,18 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
aio_nr += ctx->max_reqs;
spin_unlock(&aio_nr_lock);
- /* now link into global list. */
+ /* now insert into the radix tree */
+ err = radix_tree_preload(GFP_KERNEL);
+ if (err)
+ goto out_cleanup;
spin_lock(&mm->ioctx_lock);
- hlist_add_head_rcu(&ctx->list, &mm->ioctx_list);
+ err = radix_tree_insert(&mm->ioctx_rtree, ctx->user_id, ctx);
spin_unlock(&mm->ioctx_lock);
+ radix_tree_preload_end();
+ if (err) {
+ WARN_ONCE(1, "aio: insert into ioctx tree failed: %d", err);
+ goto out_cleanup;
+ }
pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
ctx, ctx->user_id, mm, ctx->nr_events);
@@ -404,6 +476,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
out_cleanup:
err = -EAGAIN;
aio_free_ring(ctx);
+out_freepcpu:
+ free_percpu(ctx->cpu);
out_freectx:
kmem_cache_free(kioctx_cachep, ctx);
pr_debug("error allocating ioctx %d\n", err);
@@ -433,9 +507,9 @@ static void kill_ioctx_rcu(struct rcu_head *head)
*/
static void kill_ioctx(struct kioctx *ctx)
{
- if (!atomic_xchg(&ctx->dead, 1)) {
- hlist_del_rcu(&ctx->list);
- /* Between hlist_del_rcu() and dropping the initial ref */
+ if (percpu_ref_kill(&ctx->users)) {
+ radix_tree_delete(&current->mm->ioctx_rtree, ctx->user_id);
+ /* Between radix_tree_delete() and dropping the initial ref */
synchronize_rcu();
/*
@@ -475,31 +549,84 @@ EXPORT_SYMBOL(wait_on_sync_kiocb);
*/
void exit_aio(struct mm_struct *mm)
{
- struct kioctx *ctx;
- struct hlist_node *n;
-
- hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) {
- if (1 != atomic_read(&ctx->users))
- printk(KERN_DEBUG
- "exit_aio:ioctx still alive: %d %d %d\n",
- atomic_read(&ctx->users),
- atomic_read(&ctx->dead),
- atomic_read(&ctx->reqs_active));
- /*
- * We don't need to bother with munmap() here -
- * exit_mmap(mm) is coming and it'll unmap everything.
- * Since aio_free_ring() uses non-zero ->mmap_size
- * as indicator that it needs to unmap the area,
- * just set it to 0; aio_free_ring() is the only
- * place that uses ->mmap_size, so it's safe.
- */
- ctx->mmap_size = 0;
+ struct kioctx *ctx[16];
+ unsigned long idx = 0;
+ int count;
- if (!atomic_xchg(&ctx->dead, 1)) {
- hlist_del_rcu(&ctx->list);
- call_rcu(&ctx->rcu_head, kill_ioctx_rcu);
+ do {
+ int i;
+
+ count = radix_tree_gang_lookup(&mm->ioctx_rtree, (void **)ctx,
+ idx, sizeof(ctx)/sizeof(void *));
+ for (i = 0; i < count; i++) {
+ void *ret;
+
+ BUG_ON(ctx[i]->user_id < idx);
+ idx = ctx[i]->user_id;
+
+ /*
+ * We don't need to bother with munmap() here -
+ * exit_mmap(mm) is coming and it'll unmap everything.
+ * Since aio_free_ring() uses non-zero ->mmap_size
+ * as indicator that it needs to unmap the area,
+ * just set it to 0; aio_free_ring() is the only
+ * place that uses ->mmap_size, so it's safe.
+ */
+ ctx[i]->mmap_size = 0;
+
+ if (percpu_ref_kill(&ctx[i]->users)) {
+ ret = radix_tree_delete(&mm->ioctx_rtree, idx);
+ BUG_ON(!ret || ret != ctx[i]);
+ call_rcu(&ctx[i]->rcu_head, kill_ioctx_rcu);
+ }
}
+ } while (count);
+}
+
+static void put_reqs_available(struct kioctx *ctx, unsigned nr)
+{ /*
+ * Give @nr ring slots back to @ctx by crediting the current cpu's
+ * counter. While that counter holds at least two batches, one
+ * req_batch at a time is moved to the shared ctx->reqs_available, so
+ * fewer than two batches are left on this cpu on return.
+ */
+ struct kioctx_cpu *kcpu;
+
+ preempt_disable(); /* keep kcpu valid: no migration until preempt_enable() */
+ kcpu = this_cpu_ptr(ctx->cpu);
+
+ kcpu->reqs_available += nr;
+ while (kcpu->reqs_available >= ctx->req_batch * 2) {
+ kcpu->reqs_available -= ctx->req_batch;
+ atomic_add(ctx->req_batch, &ctx->reqs_available);
}
+
+ preempt_enable();
+}
+
+static bool get_reqs_available(struct kioctx *ctx)
+{ /*
+ * Claim one ring slot for a new request.
+ *
+ * Returns true after decrementing this cpu's counter. If that counter
+ * is empty, a whole req_batch is first taken from the shared
+ * ctx->reqs_available; false is returned when the shared counter
+ * holds less than one batch.
+ */
+ struct kioctx_cpu *kcpu;
+ bool ret = false;
+
+ preempt_disable();
+ kcpu = this_cpu_ptr(ctx->cpu);
+
+ if (!kcpu->reqs_available) {
+ int old, avail = atomic_read(&ctx->reqs_available);
+
+ do { /* retry until the cmpxchg sees the value we based the subtraction on */
+ if (avail < ctx->req_batch) /* NOTE(review): int vs unsigned, compared as unsigned */
+ goto out;
+
+ old = avail;
+ avail = atomic_cmpxchg(&ctx->reqs_available,
+ avail, avail - ctx->req_batch);
+ } while (avail != old);
+
+ kcpu->reqs_available += ctx->req_batch;
+ }
+
+ ret = true;
+ kcpu->reqs_available--;
+out:
+ preempt_enable();
+ return ret;
}
/* aio_get_req
@@ -516,22 +643,18 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx)
{
struct kiocb *req;
- if (atomic_read(&ctx->reqs_active) >= ctx->nr_events)
+ if (!get_reqs_available(ctx))
return NULL;
- if (atomic_inc_return(&ctx->reqs_active) > ctx->nr_events - 1)
- goto out_put;
-
req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
if (unlikely(!req))
goto out_put;
atomic_set(&req->ki_users, 2);
req->ki_ctx = ctx;
-
return req;
out_put:
- atomic_dec(&ctx->reqs_active);
+ put_reqs_available(ctx, 1);
return NULL;
}
@@ -562,78 +685,21 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
rcu_read_lock();
- hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) {
- if (ctx->user_id == ctx_id) {
- atomic_inc(&ctx->users);
- ret = ctx;
- break;
- }
+ ctx = radix_tree_lookup(&mm->ioctx_rtree, ctx_id);
+ if (ctx) {
+ percpu_ref_get(&ctx->users);
+ ret = ctx;
}
rcu_read_unlock();
return ret;
}
-/* aio_complete
- * Called when the io request on the given iocb is complete.
- */
-void aio_complete(struct kiocb *iocb, long res, long res2)
+static inline unsigned kioctx_ring_put(struct kioctx *ctx, struct kiocb *req,
+ unsigned tail)
{
- struct kioctx *ctx = iocb->ki_ctx;
- struct aio_ring *ring;
struct io_event *ev_page, *event;
- unsigned long flags;
- unsigned tail, pos;
-
- /*
- * Special case handling for sync iocbs:
- * - events go directly into the iocb for fast handling
- * - the sync task with the iocb in its stack holds the single iocb
- * ref, no other paths have a way to get another ref
- * - the sync task helpfully left a reference to itself in the iocb
- */
- if (is_sync_kiocb(iocb)) {
- BUG_ON(atomic_read(&iocb->ki_users) != 1);
- iocb->ki_user_data = res;
- atomic_set(&iocb->ki_users, 0);
- wake_up_process(iocb->ki_obj.tsk);
- return;
- }
-
- /*
- * Take rcu_read_lock() in case the kioctx is being destroyed, as we
- * need to issue a wakeup after decrementing reqs_active.
- */
- rcu_read_lock();
-
- if (iocb->ki_list.next) {
- unsigned long flags;
-
- spin_lock_irqsave(&ctx->ctx_lock, flags);
- list_del(&iocb->ki_list);
- spin_unlock_irqrestore(&ctx->ctx_lock, flags);
- }
-
- /*
- * cancelled requests don't get events, userland was given one
- * when the event got cancelled.
- */
- if (unlikely(xchg(&iocb->ki_cancel,
- KIOCB_CANCELLED) == KIOCB_CANCELLED)) {
- atomic_dec(&ctx->reqs_active);
- /* Still need the wake_up in case free_ioctx is waiting */
- goto put_rq;
- }
-
- /*
- * Add a completion event to the ring buffer. Must be done holding
- * ctx->ctx_lock to prevent other code from messing with the tail
- * pointer since we might be called from irq context.
- */
- spin_lock_irqsave(&ctx->completion_lock, flags);
-
- tail = ctx->tail;
- pos = tail + AIO_EVENTS_OFFSET;
+ unsigned pos = tail + AIO_EVENTS_OFFSET;
if (++tail >= ctx->nr_events)
tail = 0;
@@ -641,60 +707,195 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
event = ev_page + pos % AIO_EVENTS_PER_PAGE;
- event->obj = (u64)(unsigned long)iocb->ki_obj.user;
- event->data = iocb->ki_user_data;
- event->res = res;
- event->res2 = res2;
+ event->obj = (u64)(unsigned long)req->ki_obj.user;
+ event->data = req->ki_user_data;
+ event->res = req->ki_res;
+ event->res2 = req->ki_res2;
kunmap_atomic(ev_page);
flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n",
- ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
- res, res2);
+ ctx, tail, req, req->ki_obj.user, req->ki_user_data,
+ req->ki_res, req->ki_res2);
+
+ return tail;
+}
+
+static inline unsigned kioctx_ring_lock(struct kioctx *ctx)
+{
+ unsigned tail;
- /* after flagging the request as done, we
- * must never even look at it again
+ /*
+ * ctx->tail is both our lock and the canonical version of the tail
+ * pointer.
+ *
+ * UINT_MAX is the "held" marker: the xchg stores it and hands back the
+ * previous value. Getting UINT_MAX back means someone else holds the
+ * ring, so spin; any other value is the current tail, which is
+ * returned to the caller. kioctx_ring_unlock() stores a real tail
+ * value back into ctx->tail.
*/
- smp_wmb(); /* make event visible before updating tail */
+ while ((tail = xchg(&ctx->tail, UINT_MAX)) == UINT_MAX)
+ cpu_relax();
- ctx->tail = tail;
+ return tail;
+}
+
+static inline void kioctx_ring_unlock(struct kioctx *ctx, unsigned tail)
+{ /*
+ * Publish @tail as the new ring tail of @ctx (shadow copy, the mapped
+ * ring header, then ctx->tail itself) and wake any waiter on
+ * ctx->wait. A NULL @ctx is a no-op.
+ */
+ struct aio_ring *ring;
+
+ if (!ctx) /* batch_complete_aio() calls this before it has locked any ring */
+ return;
+
+ smp_wmb();
+ /* make event visible before updating tail */
+
+ ctx->shadow_tail = tail;
ring = kmap_atomic(ctx->ring_pages[0]);
ring->tail = tail;
kunmap_atomic(ring);
flush_dcache_page(ctx->ring_pages[0]);
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ /* unlock, make new tail visible before checking waitlist */
+ smp_mb();
+
+ ctx->tail = tail; /* replaces the UINT_MAX marker left by kioctx_ring_lock() */
+
+ if (waitqueue_active(&ctx->wait))
+ wake_up(&ctx->wait);
+}
+
+void batch_complete_aio(struct batch_complete *batch)
+{ /*
+ * Deliver every kiocb queued on batch->kiocb: one ring event is written
+ * per request via kioctx_ring_put(). The ring lock of a kioctx is kept
+ * across consecutive requests that share the same ki_ctx, and an
+ * eventfd is signalled when the walk moves on to a different eventfd
+ * and once more after the loop. Does nothing for an empty tree.
+ */
+ struct kioctx *ctx = NULL;
+ struct eventfd_ctx *eventfd = NULL;
+ struct rb_node *n;
+ unsigned long flags;
+ unsigned tail = 0;
+
+ if (RB_EMPTY_ROOT(&batch->kiocb))
+ return;
+
+ /*
+ * Take rcu_read_lock() in case the kioctx is being destroyed, as we
+ * need to issue a wakeup after incrementing reqs_available.
+ */
+ rcu_read_lock();
+ local_irq_save(flags);
+
+ n = rb_first(&batch->kiocb);
+ while (n) { /*
+ * n is advanced to the next node before req is passed to
+ * aio_put_req() below. NOTE(review): the parent-pointer
+ * rewrite looks like it unlinks the visited node so the
+ * walk never returns to it; the tree is not usable
+ * afterwards - confirm.
+ */
+ struct kiocb *req = container_of(n, struct kiocb, ki_node);
+
+ if (n->rb_right) {
+ n->rb_right->__rb_parent_color = n->__rb_parent_color;
+ n = n->rb_right;
+
+ while (n->rb_left)
+ n = n->rb_left;
+ } else {
+ n = rb_parent(n);
+ }
+
+ if (unlikely(req->ki_eventfd != eventfd)) {
+ if (eventfd) {
+ /* Make event visible */
+ kioctx_ring_unlock(ctx, tail);
+ ctx = NULL;
+
+ eventfd_signal(eventfd, 1);
+ eventfd_ctx_put(eventfd);
+ }
+
+ eventfd = req->ki_eventfd;
+ req->ki_eventfd = NULL; /* presumably hands this reference to us; it is put after signalling */
+ }
+
+ if (unlikely(req->ki_ctx != ctx)) {
+ kioctx_ring_unlock(ctx, tail); /* no-op when ctx is NULL */
+
+ ctx = req->ki_ctx;
+ tail = kioctx_ring_lock(ctx);
+ }
+
+ tail = kioctx_ring_put(ctx, req, tail);
+ aio_put_req(req);
+ }
- pr_debug("added to ring %p at [%u]\n", iocb, tail);
+ kioctx_ring_unlock(ctx, tail);
+ local_irq_restore(flags);
+ rcu_read_unlock();
/*
* Check if the user asked us to deliver the result through an
* eventfd. The eventfd_signal() function is safe to be called
* from IRQ context.
*/
- if (iocb->ki_eventfd != NULL)
- eventfd_signal(iocb->ki_eventfd, 1);
+ if (eventfd) {
+ eventfd_signal(eventfd, 1);
+ eventfd_ctx_put(eventfd);
+ }
+}
+EXPORT_SYMBOL(batch_complete_aio);
-put_rq:
- /* everything turned out well, dispose of the aiocb. */
- aio_put_req(iocb);
+/* aio_complete_batch
+ * Called when the io request on the given iocb is complete; @batch may be
+ * NULL.
+ */
+void aio_complete_batch(struct kiocb *req, long res, long res2,
+ struct batch_complete *batch)
+{
+ req->ki_res = res;
+ req->ki_res2 = res2;
+
+ if (req->ki_list.next) {
+ struct kioctx *ctx = req->ki_ctx;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->ctx_lock, flags);
+ list_del(&req->ki_list);
+ spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+ }
/*
- * We have to order our ring_info tail store above and test
- * of the wait list below outside the wait lock. This is
- * like in wake_up_bit() where clearing a bit has to be
- * ordered with the unlocked test.
+ * Special case handling for sync iocbs:
+ * - events go directly into the iocb for fast handling
+ * - the sync task with the iocb in its stack holds the single iocb
+ * ref, no other paths have a way to get another ref
+ * - the sync task helpfully left a reference to itself in the iocb
*/
- smp_mb();
+ if (is_sync_kiocb(req)) {
+ BUG_ON(atomic_read(&req->ki_users) != 1);
+ req->ki_user_data = req->ki_res;
+ atomic_set(&req->ki_users, 0);
+ wake_up_process(req->ki_obj.tsk);
+ } else if (batch) {
+ int res;
+ struct kiocb *t;
+ struct rb_node **n = &batch->kiocb.rb_node, *parent = NULL;
+
+ while (*n) {
+ parent = *n;
+ t = container_of(*n, struct kiocb, ki_node);
+
+ res = req->ki_ctx != t->ki_ctx
+ ? req->ki_ctx < t->ki_ctx
+ : req->ki_eventfd != t->ki_eventfd
+ ? req->ki_eventfd < t->ki_eventfd
+ : req < t;
+
+ n = res ? &(*n)->rb_left : &(*n)->rb_right;
+ }
- if (waitqueue_active(&ctx->wait))
- wake_up(&ctx->wait);
+ rb_link_node(&req->ki_node, parent, n);
+ rb_insert_color(&req->ki_node, &batch->kiocb);
+ } else {
+ struct batch_complete batch_stack;
- rcu_read_unlock();
+ memset(&req->ki_node, 0, sizeof(req->ki_node));
+ batch_stack.kiocb.rb_node = &req->ki_node;
+
+ batch_complete_aio(&batch_stack);
+ }
}
-EXPORT_SYMBOL(aio_complete);
+EXPORT_SYMBOL(aio_complete_batch);
/* aio_read_events
* Pull an event off of the ioctx's event ring. Returns the number of
@@ -714,9 +915,9 @@ static long aio_read_events_ring(struct kioctx *ctx,
head = ring->head;
kunmap_atomic(ring);
- pr_debug("h%u t%u m%u\n", head, ctx->tail, ctx->nr_events);
+ pr_debug("h%u t%u m%u\n", head, ctx->shadow_tail, ctx->nr_events);
- if (head == ctx->tail)
+ if (head == ctx->shadow_tail)
goto out;
while (ret < nr) {
@@ -724,8 +925,9 @@ static long aio_read_events_ring(struct kioctx *ctx,
struct io_event *ev;
struct page *page;
- avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head;
- if (head == ctx->tail)
+ avail = (head <= ctx->shadow_tail ?
+ ctx->shadow_tail : ctx->nr_events) - head;
+ if (head == ctx->shadow_tail)
break;
avail = min(avail, nr - ret);
@@ -756,9 +958,9 @@ static long aio_read_events_ring(struct kioctx *ctx,
kunmap_atomic(ring);
flush_dcache_page(ctx->ring_pages[0]);
- pr_debug("%li h%u t%u\n", ret, head, ctx->tail);
+ pr_debug("%li h%u t%u\n", ret, head, ctx->shadow_tail);
- atomic_sub(ret, &ctx->reqs_active);
+ put_reqs_available(ctx, ret);
out:
mutex_unlock(&ctx->ring_lock);
@@ -773,7 +975,7 @@ static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
if (ret > 0)
*i += ret;
- if (unlikely(atomic_read(&ctx->dead)))
+ if (unlikely(percpu_ref_dead(&ctx->users)))
ret = -EINVAL;
if (!*i)
@@ -1142,7 +1344,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
aio_put_req(req); /* drop extra ref to req */
return 0;
out_put_req:
- atomic_dec(&ctx->reqs_active);
+ put_reqs_available(ctx, 1);
aio_put_req(req); /* drop extra ref to req */
aio_put_req(req); /* drop i/o ref to req */
return ret;
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 3f1128b37e46..16d3288c808d 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -104,7 +104,7 @@ struct autofs_sb_info {
u32 magic;
int pipefd;
struct file *pipe;
- pid_t oz_pgrp;
+ struct pid *oz_pgrp;
int catatonic;
int version;
int sub_version;
@@ -139,7 +139,7 @@ static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry)
filesystem without "magic".) */
static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) {
- return sbi->catatonic || task_pgrp_nr(current) == sbi->oz_pgrp;
+ return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
}
/* Does a dentry have some pending activity? */
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 743c7c2c949d..91838211b66d 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -346,6 +346,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
{
int pipefd;
int err = 0;
+ struct pid *new_pid = NULL;
if (param->setpipefd.pipefd == -1)
return -EINVAL;
@@ -357,7 +358,17 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
mutex_unlock(&sbi->wq_mutex);
return -EBUSY;
} else {
- struct file *pipe = fget(pipefd);
+ struct file *pipe;
+
+ new_pid = get_task_pid(current, PIDTYPE_PGID);
+
+ if (ns_of_pid(new_pid) != ns_of_pid(sbi->oz_pgrp)) {
+ AUTOFS_WARN("Not allowed to change PID namespace");
+ err = -EINVAL;
+ goto out;
+ }
+
+ pipe = fget(pipefd);
if (!pipe) {
err = -EBADF;
goto out;
@@ -367,12 +378,13 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
fput(pipe);
goto out;
}
- sbi->oz_pgrp = task_pgrp_nr(current);
+ swap(sbi->oz_pgrp, new_pid);
sbi->pipefd = pipefd;
sbi->pipe = pipe;
sbi->catatonic = 0;
}
out:
+ put_pid(new_pid);
mutex_unlock(&sbi->wq_mutex);
return err;
}
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index b104726e2d0a..1b045ecfcea2 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -62,6 +62,8 @@ void autofs4_kill_sb(struct super_block *sb)
/* Free wait queues, close pipe */
autofs4_catatonic_mode(sbi);
+ put_pid(sbi->oz_pgrp);
+
sb->s_fs_info = NULL;
kfree(sbi);
@@ -85,7 +87,7 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root)
if (!gid_eq(root_inode->i_gid, GLOBAL_ROOT_GID))
seq_printf(m, ",gid=%u",
from_kgid_munged(&init_user_ns, root_inode->i_gid));
- seq_printf(m, ",pgrp=%d", sbi->oz_pgrp);
+ seq_printf(m, ",pgrp=%d", pid_vnr(sbi->oz_pgrp));
seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ);
seq_printf(m, ",minproto=%d", sbi->min_proto);
seq_printf(m, ",maxproto=%d", sbi->max_proto);
@@ -129,7 +131,8 @@ static const match_table_t tokens = {
};
static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
- pid_t *pgrp, unsigned int *type, int *minproto, int *maxproto)
+ int *pgrp, bool *pgrp_set, unsigned int *type,
+ int *minproto, int *maxproto)
{
char *p;
substring_t args[MAX_OPT_ARGS];
@@ -137,7 +140,6 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
*uid = current_uid();
*gid = current_gid();
- *pgrp = task_pgrp_nr(current);
*minproto = AUTOFS_MIN_PROTO_VERSION;
*maxproto = AUTOFS_MAX_PROTO_VERSION;
@@ -176,6 +178,7 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
if (match_int(args, &option))
return 1;
*pgrp = option;
+ *pgrp_set = true;
break;
case Opt_minproto:
if (match_int(args, &option))
@@ -211,6 +214,8 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
int pipefd;
struct autofs_sb_info *sbi;
struct autofs_info *ino;
+ int pgrp;
+ bool pgrp_set = false;
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
@@ -223,7 +228,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
sbi->pipe = NULL;
sbi->catatonic = 1;
sbi->exp_timeout = 0;
- sbi->oz_pgrp = task_pgrp_nr(current);
+ sbi->oz_pgrp = NULL;
sbi->sb = s;
sbi->version = 0;
sbi->sub_version = 0;
@@ -260,12 +265,23 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
/* Can this call block? */
if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid,
- &sbi->oz_pgrp, &sbi->type, &sbi->min_proto,
- &sbi->max_proto)) {
+ &pgrp, &pgrp_set, &sbi->type, &sbi->min_proto,
+ &sbi->max_proto)) {
printk("autofs: called with bogus options\n");
goto fail_dput;
}
+ if (pgrp_set) {
+ sbi->oz_pgrp = find_get_pid(pgrp);
+ if (!sbi->oz_pgrp) {
+ pr_warn("autofs: could not find process group %d\n",
+ pgrp);
+ goto fail_dput;
+ }
+ } else {
+ sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID);
+ }
+
if (autofs_type_trigger(sbi->type))
__managed_dentry_set_managed(root);
@@ -289,9 +305,9 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
sbi->version = sbi->max_proto;
sbi->sub_version = AUTOFS_PROTO_SUBVERSION;
- DPRINTK("pipe fd = %d, pgrp = %u", pipefd, sbi->oz_pgrp);
+ DPRINTK("pipe fd = %d, pgrp = %u", pipefd, pid_nr(sbi->oz_pgrp));
pipe = fget(pipefd);
-
+
if (!pipe) {
printk("autofs: could not open pipe file descriptor\n");
goto fail_dput;
@@ -321,6 +337,7 @@ fail_dput:
fail_ino:
kfree(ino);
fail_free:
+ put_pid(sbi->oz_pgrp);
kfree(sbi);
s->s_fs_info = NULL;
fail_unlock:
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 3db70dae40d3..309ca6bcbb09 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -353,11 +353,23 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
struct qstr qstr;
char *name;
int status, ret, type;
+ pid_t pid;
+ pid_t tgid;
/* In catatonic mode, we don't wait for nobody */
if (sbi->catatonic)
return -ENOENT;
+ /*
+ * Try translating pids to the namespace of the daemon.
+ *
+ * Zero means failure: we are in an unrelated pid namespace.
+ */
+ pid = task_pid_nr_ns(current, ns_of_pid(sbi->oz_pgrp));
+ tgid = task_tgid_nr_ns(current, ns_of_pid(sbi->oz_pgrp));
+ if (pid == 0 || tgid == 0)
+ return -ENOENT;
+
if (!dentry->d_inode) {
/*
* A wait for a negative dentry is invalid for certain
@@ -423,8 +435,8 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
wq->ino = autofs4_get_ino(sbi);
wq->uid = current_uid();
wq->gid = current_gid();
- wq->pid = current->pid;
- wq->tgid = current->tgid;
+ wq->pid = pid;
+ wq->tgid = tgid;
wq->status = -EINTR; /* Status return if interrupted */
wq->wait_ctr = 2;
mutex_unlock(&sbi->wq_mutex);
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index bce87694f7b0..89dec7f789a4 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -255,8 +255,6 @@ static int load_aout_binary(struct linux_binprm * bprm)
(current->mm->start_data = N_DATADDR(ex));
current->mm->brk = ex.a_bss +
(current->mm->start_brk = N_BSSADDR(ex));
- current->mm->free_area_cache = current->mm->mmap_base;
- current->mm->cached_hole_size = 0;
retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
if (retval < 0) {
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index f8a0b0efda44..53b6d624c7a9 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -140,6 +140,25 @@ static int padzero(unsigned long elf_bss)
#define ELF_BASE_PLATFORM NULL
#endif
+/*
+ * Use get_random_int() to implement AT_RANDOM while avoiding depletion
+ * of the entropy pool.
+ *
+ * Fills @buf with @nbytes bytes. Each get_random_int() result supplies
+ * at most sizeof(unsigned int) bytes; when @nbytes is not a multiple of
+ * that size, the final call contributes only the remaining bytes.
+ */
+static void get_atrandom_bytes(unsigned char *buf, size_t nbytes)
+{
+ unsigned char *p = buf;
+
+ while (nbytes) {
+ unsigned int random_variable;
+ size_t chunk = min(nbytes, sizeof(random_variable));
+
+ random_variable = get_random_int();
+ memcpy(p, &random_variable, chunk);
+ p += chunk;
+ nbytes -= chunk;
+ }
+}
+
static int
create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
unsigned long load_addr, unsigned long interp_load_addr)
@@ -201,7 +220,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
/*
* Generate 16 random bytes for userspace PRNG seeding.
*/
- get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes));
+ get_atrandom_bytes(k_rand_bytes, sizeof(k_rand_bytes));
u_rand_bytes = (elf_addr_t __user *)
STACK_ALLOC(p, sizeof(k_rand_bytes));
if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes)))
@@ -738,8 +757,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
/* Do this so that we can load the interpreter, if need be. We will
change some of these later */
- current->mm->free_area_cache = current->mm->mmap_base;
- current->mm->cached_hole_size = 0;
retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
executable_stack);
if (retval < 0) {
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 8fb42916d8a2..69f6f802b09e 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -510,7 +510,8 @@ static void bio_integrity_verify_fn(struct work_struct *work)
* in process context. This function postpones completion
* accordingly.
*/
-void bio_integrity_endio(struct bio *bio, int error)
+void bio_integrity_endio(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct bio_integrity_payload *bip = bio->bi_integrity;
diff --git a/fs/bio.c b/fs/bio.c
index 94bbc04dba77..89cf9776f7ab 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -28,6 +28,7 @@
#include <linux/mempool.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>
+#include <linux/aio.h>
#include <scsi/sg.h> /* for struct sg_iovec */
#include <trace/events/block.h>
@@ -760,7 +761,8 @@ struct submit_bio_ret {
int error;
};
-static void submit_bio_wait_endio(struct bio *bio, int error)
+static void submit_bio_wait_endio(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct submit_bio_ret *ret = bio->bi_private;
@@ -1414,7 +1416,8 @@ void bio_unmap_user(struct bio *bio)
}
EXPORT_SYMBOL(bio_unmap_user);
-static void bio_map_kern_endio(struct bio *bio, int err)
+static void bio_map_kern_endio(struct bio *bio, int err,
+ struct batch_complete *batch)
{
bio_put(bio);
}
@@ -1486,7 +1489,8 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
}
EXPORT_SYMBOL(bio_map_kern);
-static void bio_copy_kern_endio(struct bio *bio, int err)
+static void bio_copy_kern_endio(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct bio_vec *bvec;
const int read = bio_data_dir(bio) == READ;
@@ -1685,31 +1689,40 @@ void bio_flush_dcache_pages(struct bio *bi)
EXPORT_SYMBOL(bio_flush_dcache_pages);
#endif
-/**
- * bio_endio - end I/O on a bio
- * @bio: bio
- * @error: error, if any
- *
- * Description:
- * bio_endio() will end I/O on the whole bio. bio_endio() is the
- * preferred way to end I/O on a bio, it takes care of clearing
- * BIO_UPTODATE on error. @error is 0 on success, and and one of the
- * established -Exxxx (-EIO, for instance) error values in case
- * something went wrong. No one should call bi_end_io() directly on a
- * bio unless they own it and thus know that it has an end_io
- * function.
- **/
-void bio_endio(struct bio *bio, int error)
+static inline void __bio_endio(struct bio *bio, struct batch_complete *batch)
{
- if (error)
+ if (bio->bi_error)
clear_bit(BIO_UPTODATE, &bio->bi_flags);
else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
- error = -EIO;
+ bio->bi_error = -EIO;
if (bio->bi_end_io)
- bio->bi_end_io(bio, error);
+ bio->bi_end_io(bio, bio->bi_error, batch);
+}
+
+/**
+ * bio_endio_batch - end I/O on a bio, optionally deferring the completion
+ * @bio:	bio
+ * @error:	error, if any; a non-zero value is stored in @bio->bi_error
+ * @batch:	batch to queue the bio on, or NULL to complete it right away
+ *
+ * Description:
+ *   With a @batch, the bio is only appended to @batch->bio; its
+ *   completion runs later, when batch_complete() drains that list.
+ *   Without one, the bio is completed immediately via __bio_endio().
+ **/
+void bio_endio_batch(struct bio *bio, int error, struct batch_complete *batch)
+{
+	if (error)
+		bio->bi_error = error;
+
+	/* No batch: nothing to defer to, so finish the bio here. */
+	if (!batch) {
+		__bio_endio(bio, NULL);
+		return;
+	}
+
+	bio_list_add(&batch->bio, bio);
+}
+EXPORT_SYMBOL(bio_endio_batch);
+
+/**
+ * batch_complete - run the completions collected in a batch
+ * @batch:	batch whose queued bios are to be completed
+ *
+ * Description:
+ *   Pops every bio off @batch->bio and completes it through
+ *   __bio_endio(), passing @batch along to the bio's end_io handler,
+ *   then hands @batch to batch_complete_aio().
+ **/
+void batch_complete(struct batch_complete *batch)
+{
+	struct bio *bio = bio_list_pop(&batch->bio);
+
+	/* The list is re-read after each completion, so drain until empty. */
+	while (bio) {
+		__bio_endio(bio, batch);
+		bio = bio_list_pop(&batch->bio);
+	}
+
+	batch_complete_aio(batch);
}
-EXPORT_SYMBOL(bio_endio);
+EXPORT_SYMBOL(batch_complete);
void bio_pair_release(struct bio_pair *bp)
{
@@ -1722,7 +1735,8 @@ void bio_pair_release(struct bio_pair *bp)
}
EXPORT_SYMBOL(bio_pair_release);
-static void bio_pair_end_1(struct bio *bi, int err)
+static void bio_pair_end_1(struct bio *bi, int err,
+ struct batch_complete *batch)
{
struct bio_pair *bp = container_of(bi, struct bio_pair, bio1);
@@ -1732,7 +1746,8 @@ static void bio_pair_end_1(struct bio *bi, int err)
bio_pair_release(bp);
}
-static void bio_pair_end_2(struct bio *bi, int err)
+static void bio_pair_end_2(struct bio *bi, int err,
+ struct batch_complete *batch)
{
struct bio_pair *bp = container_of(bi, struct bio_pair, bio2);
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 1431a6965017..29b35350b01d 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -323,7 +323,8 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
static int btrfsic_read_block(struct btrfsic_state *state,
struct btrfsic_block_data_ctx *block_ctx);
static void btrfsic_dump_database(struct btrfsic_state *state);
-static void btrfsic_complete_bio_end_io(struct bio *bio, int err);
+static void btrfsic_complete_bio_end_io(struct bio *bio, int err,
+ struct batch_complete *batch);
static int btrfsic_test_for_metadata(struct btrfsic_state *state,
char **datav, unsigned int num_pages);
static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
@@ -336,7 +337,8 @@ static int btrfsic_process_written_superblock(
struct btrfsic_state *state,
struct btrfsic_block *const block,
struct btrfs_super_block *const super_hdr);
-static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status);
+static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status,
+ struct batch_complete *batch);
static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate);
static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state,
const struct btrfsic_block *block,
@@ -1751,7 +1753,8 @@ static int btrfsic_read_block(struct btrfsic_state *state,
return block_ctx->len;
}
-static void btrfsic_complete_bio_end_io(struct bio *bio, int err)
+static void btrfsic_complete_bio_end_io(struct bio *bio, int err,
+ struct batch_complete *batch)
{
complete((struct completion *)bio->bi_private);
}
@@ -2294,7 +2297,8 @@ continue_loop:
goto again;
}
-static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
+static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status,
+ struct batch_complete *batch)
{
struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private;
int iodone_w_error;
@@ -2342,7 +2346,7 @@ static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
block = next_block;
} while (NULL != block);
- bp->bi_end_io(bp, bio_error_status);
+ bp->bi_end_io(bp, bio_error_status, batch);
}
static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate)
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index b189bd1e7a3e..2298567b48e2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -156,7 +156,8 @@ fail:
* The compressed pages are freed here, and it must be run
* in process context
*/
-static void end_compressed_bio_read(struct bio *bio, int err)
+static void end_compressed_bio_read(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct compressed_bio *cb = bio->bi_private;
struct inode *inode;
@@ -266,7 +267,8 @@ static noinline void end_compressed_writeback(struct inode *inode, u64 start,
* This also calls the writeback end hooks for the file pages so that
* metadata and checksums can be updated in the file.
*/
-static void end_compressed_bio_write(struct bio *bio, int err)
+static void end_compressed_bio_write(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct extent_io_tree *tree;
struct compressed_bio *cb = bio->bi_private;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 40c7bc300075..364ce4fbc3cf 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -685,7 +685,8 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
return -EIO; /* we fixed nothing */
}
-static void end_workqueue_bio(struct bio *bio, int err)
+static void end_workqueue_bio(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct end_io_wq *end_io_wq = bio->bi_private;
struct btrfs_fs_info *fs_info;
@@ -3075,7 +3076,8 @@ static int write_dev_supers(struct btrfs_device *device,
* endio for the write_dev_flush, this will wake anyone waiting
* for the barrier when it is done
*/
-static void btrfs_end_empty_barrier(struct bio *bio, int err)
+static void btrfs_end_empty_barrier(struct bio *bio, int err,
+ struct batch_complete *batch)
{
if (err) {
if (err == -EOPNOTSUPP)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 6bca9472f313..94258a1407c1 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2003,7 +2003,8 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
return err;
}
-static void repair_io_failure_callback(struct bio *bio, int err)
+static void repair_io_failure_callback(struct bio *bio, int err,
+ struct batch_complete *batch)
{
complete(bio->bi_private);
}
@@ -2383,7 +2384,8 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
* Scheduling is not allowed, so the extent state tree is expected
* to have one and only one object corresponding to this IO.
*/
-static void end_bio_extent_writepage(struct bio *bio, int err)
+static void end_bio_extent_writepage(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
struct extent_io_tree *tree;
@@ -2431,7 +2433,8 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
* Scheduling is not allowed, so the extent state tree is expected
* to have one and only one object corresponding to this IO.
*/
-static void end_bio_extent_readpage(struct bio *bio, int err)
+static void end_bio_extent_readpage(struct bio *bio, int err,
+ struct batch_complete *batch)
{
int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -3270,7 +3273,8 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb)
wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
}
-static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
+static void end_bio_extent_buffer_writepage(struct bio *bio, int err,
+ struct batch_complete *batch)
{
int uptodate = err == 0;
struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index db57e6384fbb..06acf922af38 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6939,7 +6939,8 @@ struct btrfs_dio_private {
struct bio *dio_bio;
};
-static void btrfs_endio_direct_read(struct bio *bio, int err)
+static void btrfs_endio_direct_read(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct btrfs_dio_private *dip = bio->bi_private;
struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -6993,11 +6994,12 @@ failed:
/* If we had a csum failure make sure to clear the uptodate flag */
if (err)
clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
- dio_end_io(dio_bio, err);
+ dio_end_io(dio_bio, err, batch);
bio_put(bio);
}
-static void btrfs_endio_direct_write(struct bio *bio, int err)
+static void btrfs_endio_direct_write(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct btrfs_dio_private *dip = bio->bi_private;
struct inode *inode = dip->inode;
@@ -7040,7 +7042,7 @@ out_done:
/* If we had an error make sure to clear the uptodate flag */
if (err)
clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
- dio_end_io(dio_bio, err);
+ dio_end_io(dio_bio, err, batch);
bio_put(bio);
}
@@ -7055,7 +7057,8 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
return 0;
}
-static void btrfs_end_dio_bio(struct bio *bio, int err)
+static void btrfs_end_dio_bio(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct btrfs_dio_private *dip = bio->bi_private;
@@ -7081,7 +7084,7 @@ static void btrfs_end_dio_bio(struct bio *bio, int err)
bio_io_error(dip->orig_bio);
} else {
set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags);
- bio_endio(dip->orig_bio, 0);
+ bio_endio_batch(dip->orig_bio, 0, batch);
}
out:
bio_put(bio);
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 0525e1389f5b..17cef49e21f6 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -850,7 +850,8 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
* end io function used by finish_rmw. When we finally
* get here, we've written a full stripe
*/
-static void raid_write_end_io(struct bio *bio, int err)
+static void raid_write_end_io(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
@@ -1384,7 +1385,8 @@ static void set_bio_pages_uptodate(struct bio *bio)
* This will usually kick off finish_rmw once all the bios are read in, but it
* may trigger parity reconstruction if we had any errors along the way
*/
-static void raid_rmw_end_io(struct bio *bio, int err)
+static void raid_rmw_end_io(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
@@ -1905,7 +1907,8 @@ cleanup_io:
* This is called only for stripes we've read from disk to
* reconstruct the parity.
*/
-static void raid_recover_end_io(struct bio *bio, int err)
+static void raid_recover_end_io(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 79bd479317cb..86114520ba45 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -200,7 +200,8 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
int is_metadata, int have_csum,
const u8 *csum, u64 generation,
u16 csum_size);
-static void scrub_complete_bio_end_io(struct bio *bio, int err);
+static void scrub_complete_bio_end_io(struct bio *bio, int err,
+ struct batch_complete *batch);
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good,
int force_write);
@@ -223,7 +224,8 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
u64 physical, struct btrfs_device *dev, u64 flags,
u64 gen, int mirror_num, u8 *csum, int force,
u64 physical_for_dev_replace);
-static void scrub_bio_end_io(struct bio *bio, int err);
+static void scrub_bio_end_io(struct bio *bio, int err,
+ struct batch_complete *batch);
static void scrub_bio_end_io_worker(struct btrfs_work *work);
static void scrub_block_complete(struct scrub_block *sblock);
static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
@@ -240,7 +242,8 @@ static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
struct scrub_page *spage);
static void scrub_wr_submit(struct scrub_ctx *sctx);
-static void scrub_wr_bio_end_io(struct bio *bio, int err);
+static void scrub_wr_bio_end_io(struct bio *bio, int err,
+ struct batch_complete *batch);
static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
static int write_page_nocow(struct scrub_ctx *sctx,
u64 physical_for_dev_replace, struct page *page);
@@ -1384,7 +1387,8 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
sblock->checksum_error = 1;
}
-static void scrub_complete_bio_end_io(struct bio *bio, int err)
+static void scrub_complete_bio_end_io(struct bio *bio, int err,
+ struct batch_complete *batch)
{
complete((struct completion *)bio->bi_private);
}
@@ -1584,7 +1588,8 @@ static void scrub_wr_submit(struct scrub_ctx *sctx)
btrfsic_submit_bio(WRITE, sbio->bio);
}
-static void scrub_wr_bio_end_io(struct bio *bio, int err)
+static void scrub_wr_bio_end_io(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct scrub_bio *sbio = bio->bi_private;
struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
@@ -2053,7 +2058,8 @@ leave_nomem:
return 0;
}
-static void scrub_bio_end_io(struct bio *bio, int err)
+static void scrub_bio_end_io(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct scrub_bio *sbio = bio->bi_private;
struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8bffb9174afb..34cb538ecd2e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5018,7 +5018,8 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
return 0;
}
-static void btrfs_end_bio(struct bio *bio, int err)
+static void btrfs_end_bio(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct btrfs_bio *bbio = bio->bi_private;
int is_orig_bio = 0;
@@ -5073,7 +5074,7 @@ static void btrfs_end_bio(struct bio *bio, int err)
}
kfree(bbio);
- bio_endio(bio, err);
+ bio_endio_batch(bio, err, batch);
} else if (!is_orig_bio) {
bio_put(bio);
}
diff --git a/fs/buffer.c b/fs/buffer.c
index f93392e2df12..d3b6c14a7b1c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2897,7 +2897,8 @@ sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
}
EXPORT_SYMBOL(generic_block_bmap);
-static void end_bio_bh_io_sync(struct bio *bio, int err)
+static void end_bio_bh_io_sync(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct buffer_head *bh = bio->bi_private;
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 2b6cb23dd14e..1d1c41f1014d 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -203,7 +203,7 @@ configfs_write_file(struct file *file, const char __user *buf, size_t count, lof
mutex_lock(&buffer->mutex);
len = fill_write_buffer(buffer, buf, count);
if (len > 0)
- len = flush_write_buffer(file->f_path.dentry, buffer, count);
+ len = flush_write_buffer(file->f_path.dentry, buffer, len);
if (len > 0)
*ppos += len;
mutex_unlock(&buffer->mutex);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 7ab90f5081ee..b8707bf52b82 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -230,7 +230,8 @@ static inline struct page *dio_get_page(struct dio *dio,
* filesystems can use it to hold additional state between get_block calls and
* dio_complete.
*/
-static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async)
+static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async,
+ struct batch_complete *batch)
{
ssize_t transferred = 0;
@@ -264,7 +265,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is
} else {
inode_dio_done(dio->inode);
if (is_async)
- aio_complete(dio->iocb, ret, 0);
+ aio_complete_batch(dio->iocb, ret, 0, batch);
}
return ret;
@@ -274,7 +275,7 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio);
/*
* Asynchronous IO callback.
*/
-static void dio_bio_end_aio(struct bio *bio, int error)
+static void dio_bio_end_aio(struct bio *bio, int error, struct batch_complete *batch)
{
struct dio *dio = bio->bi_private;
unsigned long remaining;
@@ -290,7 +291,7 @@ static void dio_bio_end_aio(struct bio *bio, int error)
spin_unlock_irqrestore(&dio->bio_lock, flags);
if (remaining == 0) {
- dio_complete(dio, dio->iocb->ki_pos, 0, true);
+ dio_complete(dio, dio->iocb->ki_pos, 0, true, batch);
kmem_cache_free(dio_cache, dio);
}
}
@@ -324,12 +325,12 @@ static void dio_bio_end_io(struct bio *bio, int error)
* so that the DIO specific endio actions are dealt with after the filesystem
* has done it's completion work.
*/
-void dio_end_io(struct bio *bio, int error)
+void dio_end_io(struct bio *bio, int error, struct batch_complete *batch)
{
struct dio *dio = bio->bi_private;
if (dio->is_async)
- dio_bio_end_aio(bio, error);
+ dio_bio_end_aio(bio, error, batch);
else
dio_bio_end_io(bio, error);
}
@@ -350,10 +351,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
bio->bi_bdev = bdev;
bio->bi_sector = first_sector;
- if (dio->is_async)
- bio->bi_end_io = dio_bio_end_aio;
- else
- bio->bi_end_io = dio_bio_end_io;
+ bio->bi_end_io = dio_end_io;
sdio->bio = bio;
sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
@@ -1268,7 +1266,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
dio_await_completion(dio);
if (drop_refcount(dio) == 0) {
- retval = dio_complete(dio, offset, retval, false);
+ retval = dio_complete(dio, offset, retval, false, NULL);
kmem_cache_free(dio_cache, dio);
} else
BUG_ON(retval != -EIOCBQUEUED);
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index c00e055b6282..f23d2a7ed438 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -58,6 +58,8 @@ int drop_caches_sysctl_handler(ctl_table *table, int write,
if (ret)
return ret;
if (write) {
+ printk(KERN_NOTICE "%s (%d): dropped kernel caches: %d\n",
+ current->comm, task_pid_nr(current), sysctl_drop_caches);
if (sysctl_drop_caches & 1)
iterate_supers(drop_pagecache_sb, NULL);
if (sysctl_drop_caches & 2)
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 4acf1f78881b..1678d9ed2354 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -219,7 +219,8 @@ static void buffer_io_error(struct buffer_head *bh)
(unsigned long long)bh->b_blocknr);
}
-static void ext4_end_bio(struct bio *bio, int error)
+static void ext4_end_bio(struct bio *bio, int error,
+ struct batch_complete *batch)
{
ext4_io_end_t *io_end = bio->bi_private;
struct inode *inode;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index f0f2e26e0e53..552bcbbbc8c4 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -344,7 +344,7 @@ repeat:
return page;
}
-static void read_end_io(struct bio *bio, int err)
+static void read_end_io(struct bio *bio, int err, struct batch_complete *batch)
{
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index be668ffb001c..e76c448bd6ca 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -634,7 +634,8 @@ static const struct segment_allocation default_salloc_ops = {
.allocate_segment = allocate_segment_by_default,
};
-static void f2fs_end_io_write(struct bio *bio, int err)
+static void f2fs_end_io_write(struct bio *bio, int err,
+ struct batch_complete *batch)
{
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
diff --git a/fs/fat/file.c b/fs/fat/file.c
index b0b632e50ddb..73264395f705 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -17,8 +17,11 @@
#include <linux/blkdev.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
+#include <linux/falloc.h>
#include "fat.h"
+static long fat_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t len);
static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr)
{
u32 attr;
@@ -140,6 +143,22 @@ static long fat_generic_compat_ioctl(struct file *filp, unsigned int cmd,
static int fat_file_release(struct inode *inode, struct file *filp)
{
+
+ struct super_block *sb = inode->i_sb;
+ loff_t mmu_private_ideal;
+
+ /*
+ * Release unwritten fallocated blocks on file release.
+ * Do this only when the last open file descriptor is closed.
+ */
+ mutex_lock(&inode->i_mutex);
+ mmu_private_ideal = round_up(inode->i_size, sb->s_blocksize);
+
+ if (mmu_private_ideal < MSDOS_I(inode)->mmu_private &&
+ filp->f_dentry->d_count == 1)
+ fat_truncate_blocks(inode, inode->i_size);
+ mutex_unlock(&inode->i_mutex);
+
if ((filp->f_mode & FMODE_WRITE) &&
MSDOS_SB(inode->i_sb)->options.flush) {
fat_flush_inodes(inode->i_sb, inode, NULL);
@@ -174,6 +193,7 @@ const struct file_operations fat_file_operations = {
#endif
.fsync = fat_file_fsync,
.splice_read = generic_file_splice_read,
+ .fallocate = fat_fallocate,
};
static int fat_cont_expand(struct inode *inode, loff_t size)
@@ -212,6 +232,88 @@ out:
return err;
}
+/*
+ * Preallocate space for a file. This implements fat's fallocate file
+ * operation, which gets called from sys_fallocate system call. User
+ * space requests len bytes at offset.
+ *
+ * If FALLOC_FL_KEEP_SIZE is set we just allocate clusters and add them
+ * to the file's chain without zeroing them out, growing
+ * MSDOS_I(inode)->mmu_private by one cluster per allocation. Otherwise
+ * we allocate and zero out clusters via an expanding truncate
+ * (fat_cont_expand()).
+ *
+ * Returns 0 on success, -EOPNOTSUPP for any mode bit other than
+ * FALLOC_FL_KEEP_SIZE, -EINVAL when the requested range ends at or
+ * below mmu_private, or the error reported by fat_get_cluster(),
+ * fat_alloc_clusters(), fat_chain_add() or fat_cont_expand().
+ * inode->i_mutex is held for the whole operation.
+ *
+ * Preallocated clusters that still lie beyond i_size when the file is
+ * released are freed again in fat_file_release().
+ */
+static long fat_fallocate(struct file *file, int mode,
+				loff_t offset, loff_t len)
+{
+	int cluster, fclus, dclus;
+	int nr_cluster; /* Number of clusters to be allocated */
+	loff_t nr_bytes; /* Number of bytes to be allocated */
+	loff_t free_bytes; /* Unused bytes in the last cluster of file */
+	struct inode *inode = file->f_mapping->host;
+	struct super_block *sb = inode->i_sb;
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	int err = 0;
+
+	/* No support for hole punch or other fallocate flags. */
+	if (mode & ~FALLOC_FL_KEEP_SIZE)
+		return -EOPNOTSUPP;
+
+	mutex_lock(&inode->i_mutex);
+	if ((offset + len) <= MSDOS_I(inode)->mmu_private) {
+		fat_msg(sb, KERN_ERR,
+			"fat_fallocate(): Blocks already allocated");
+		err = -EINVAL;
+		goto error;
+	}
+
+	if (mode & FALLOC_FL_KEEP_SIZE) {
+		/* First compute the number of clusters to be allocated */
+		if (inode->i_size > 0) {
+			loff_t chain_bytes; /* Bytes spanned by fclus + 1 clusters */
+
+			err = fat_get_cluster(inode, FAT_ENT_EOF,
+						&fclus, &dclus);
+			if (err < 0) {
+				fat_msg(sb, KERN_ERR,
+					"fat_fallocate(): fat_get_cluster() error");
+				goto error;
+			}
+			/*
+			 * Widen before shifting: fclus is an int, and
+			 * (fclus + 1) << cluster_bits overflows int once
+			 * the chain spans 2GiB or more.
+			 */
+			chain_bytes = (loff_t)(fclus + 1) << sbi->cluster_bits;
+			free_bytes = chain_bytes - inode->i_size;
+			nr_bytes = offset + len - inode->i_size - free_bytes;
+			MSDOS_I(inode)->mmu_private = chain_bytes;
+		} else {
+			nr_bytes = offset + len - inode->i_size;
+		}
+
+		/* Round the byte count up to whole clusters */
+		nr_cluster = (nr_bytes + (sbi->cluster_size - 1)) >>
+				sbi->cluster_bits;
+
+		/* Start the allocation. We are not zeroing out the clusters */
+		while (nr_cluster-- > 0) {
+			err = fat_alloc_clusters(inode, &cluster, 1);
+			if (err) {
+				fat_msg(sb, KERN_ERR,
+					"fat_fallocate(): fat_alloc_clusters() error");
+				goto error;
+			}
+			err = fat_chain_add(inode, cluster, 1);
+			if (err) {
+				/* Could not link it in: give the cluster back */
+				fat_free_clusters(inode, cluster);
+				goto error;
+			}
+			MSDOS_I(inode)->mmu_private += sbi->cluster_size;
+		}
+	} else {
+		/* This is just an expanding truncate */
+		err = fat_cont_expand(inode, (offset + len));
+		if (err) {
+			fat_msg(sb, KERN_ERR,
+				"fat_fallocate(): fat_cont_expand() error");
+		}
+	}
+
+	/* Common exit, reached on success as well as on failure */
+error:
+	mutex_unlock(&inode->i_mutex);
+	return err;
+}
+
/* Free all clusters after the skip'th cluster. */
static int fat_free(struct inode *inode, int skip)
{
@@ -378,6 +480,9 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
struct inode *inode = dentry->d_inode;
unsigned int ia_valid;
int error;
+ loff_t mmu_private_ideal;
+
+ mmu_private_ideal = round_up(inode->i_size, dentry->d_sb->s_blocksize);
/* Check for setting the inode time. */
ia_valid = attr->ia_valid;
@@ -403,7 +508,8 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
if (attr->ia_valid & ATTR_SIZE) {
inode_dio_wait(inode);
- if (attr->ia_size > inode->i_size) {
+ if (attr->ia_size > inode->i_size &&
+ MSDOS_I(inode)->mmu_private <= mmu_private_ideal) {
error = fat_cont_expand(inode, attr->ia_size);
if (error || attr->ia_valid == ATTR_SIZE)
goto out;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 5d4513cb1b3c..f49de9379bcd 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -152,11 +152,65 @@ static void fat_write_failed(struct address_space *mapping, loff_t to)
}
}
+/*
+ * Zero the byte range from the current i_size up to (not including)
+ * pos, one page-cache page at a time, through pagecache_write_begin()
+ * and pagecache_write_end().
+ *
+ * Does nothing when pos is not beyond i_size. Returns 0 on success or
+ * the error reported by the page-cache write helpers. The file
+ * argument is not used; NULL is passed to the write helpers.
+ */
+static int fat_zero_falloc_area(struct file *file,
+				struct address_space *mapping, loff_t pos)
+{
+	struct page *page;
+	struct inode *inode = mapping->host;
+	loff_t curpos = i_size_read(inode);
+	int err = 0;
+
+	/*
+	 * Compare file offsets directly instead of keeping a size_t byte
+	 * count: a count would wrap to a huge value if pos < i_size and
+	 * would cause a zero-length page write if pos == i_size.
+	 */
+	while (curpos < pos) {
+		unsigned offset;
+		size_t bytes;
+		void *fsdata;
+
+		/* Stay within the page that contains curpos */
+		offset = (curpos & (PAGE_CACHE_SIZE - 1));
+		bytes = PAGE_CACHE_SIZE - offset;
+		bytes = min_t(loff_t, bytes, pos - curpos);
+
+		err = pagecache_write_begin(NULL, mapping, curpos, bytes,
+					    AOP_FLAG_UNINTERRUPTIBLE,
+					    &page, &fsdata);
+		if (err)
+			break;
+
+		zero_user(page, offset, bytes);
+
+		err = pagecache_write_end(NULL, mapping, curpos, bytes, bytes,
+					  page, fsdata);
+		if (err < 0)
+			break;
+
+		/* Non-negative results are not errors; report success as 0 */
+		err = 0;
+		curpos += bytes;
+	}
+
+	return err;
+}
+
static int fat_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
int err;
+ loff_t mmu_private_ideal, mmu_private_actual;
+ loff_t size;
+ struct inode *inode = mapping->host;
+ struct super_block *sb = inode->i_sb;
+
+ size = i_size_read(inode);
+ mmu_private_actual = MSDOS_I(inode)->mmu_private;
+ mmu_private_ideal = round_up(size, sb->s_blocksize);
+ if ((mmu_private_actual > mmu_private_ideal) && (pos > size)) {
+ err = fat_zero_falloc_area(file, mapping, pos);
+ if (err) {
+ fat_msg(sb, KERN_ERR,
+ "Error (%d) zeroing fallocated area", err);
+ return err;
+ }
+ }
*pagep = NULL;
err = cont_write_begin(file, mapping, pos, len, flags,
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 359d307b5507..628e22a5a543 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -30,7 +30,7 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
- printk(KERN_ERR "FAT-fs (%s): error, %pV\n", sb->s_id, &vaf);
+ fat_msg(sb, KERN_ERR, "error, %pV", &vaf);
va_end(args);
}
@@ -38,8 +38,7 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id);
else if (opts->errors == FAT_ERRORS_RO && !(sb->s_flags & MS_RDONLY)) {
sb->s_flags |= MS_RDONLY;
- printk(KERN_ERR "FAT-fs (%s): Filesystem has been "
- "set read-only\n", sb->s_id);
+ fat_msg(sb, KERN_ERR, "Filesystem has been set read-only");
}
}
EXPORT_SYMBOL_GPL(__fat_fs_error);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 33f18b7282b2..d3ac40e24111 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -201,7 +201,8 @@ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec,
*
*/
-static void gfs2_end_log_write(struct bio *bio, int error)
+static void gfs2_end_log_write(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct gfs2_sbd *sdp = bio->bi_private;
struct bio_vec *bvec;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 60ede2a0f43f..86eb657aeaca 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -155,7 +155,8 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent)
return -EINVAL;
}
-static void end_bio_io_page(struct bio *bio, int error)
+static void end_bio_io_page(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct page *page = bio->bi_private;
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index b51a6079108d..96375a5124b2 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -24,7 +24,8 @@ struct hfsplus_wd {
u16 embed_count;
};
-static void hfsplus_end_io_sync(struct bio *bio, int err)
+static void hfsplus_end_io_sync(struct bio *bio, int err,
+ struct batch_complete *batch)
{
if (err)
clear_bit(BIO_UPTODATE, &bio->bi_flags);
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 360d27c48887..4c3289c00d0c 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -2012,7 +2012,7 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
/*check if journaling to disk has been disabled*/
if (log->no_integrity) {
bio->bi_size = 0;
- lbmIODone(bio, 0);
+ lbmIODone(bio, 0, NULL);
} else {
submit_bio(READ_SYNC, bio);
}
@@ -2159,7 +2159,7 @@ static void lbmStartIO(struct lbuf * bp)
/* check if journaling to disk has been disabled */
if (log->no_integrity) {
bio->bi_size = 0;
- lbmIODone(bio, 0);
+ lbmIODone(bio, 0, NULL);
} else {
submit_bio(WRITE_SYNC, bio);
INCREMENT(lmStat.submitted);
@@ -2197,7 +2197,7 @@ static int lbmIOWait(struct lbuf * bp, int flag)
*
* executed at INTIODONE level
*/
-static void lbmIODone(struct bio *bio, int error)
+static void lbmIODone(struct bio *bio, int error, struct batch_complete *batch)
{
struct lbuf *bp = bio->bi_private;
struct lbuf *nextbp, *tail;
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 9e3aaff11f89..de87794fedaf 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -283,7 +283,8 @@ static void last_read_complete(struct page *page)
unlock_page(page);
}
-static void metapage_read_end_io(struct bio *bio, int err)
+static void metapage_read_end_io(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct page *page = bio->bi_private;
@@ -338,7 +339,8 @@ static void last_write_complete(struct page *page)
end_page_writeback(page);
}
-static void metapage_write_end_io(struct bio *bio, int err)
+static void metapage_write_end_io(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct page *page = bio->bi_private;
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 550475ca6a0e..0ae2254f74bf 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -14,7 +14,8 @@
#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
-static void request_complete(struct bio *bio, int err)
+static void request_complete(struct bio *bio, int err,
+ struct batch_complete *batch)
{
complete((struct completion *)bio->bi_private);
}
@@ -64,7 +65,8 @@ static int bdev_readpage(void *_sb, struct page *page)
static DECLARE_WAIT_QUEUE_HEAD(wq);
-static void writeseg_end_io(struct bio *bio, int err)
+static void writeseg_end_io(struct bio *bio, int err,
+ struct batch_complete *batch)
{
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -168,7 +170,7 @@ static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len)
}
-static void erase_end_io(struct bio *bio, int err)
+static void erase_end_io(struct bio *bio, int err, struct batch_complete *batch)
{
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct super_block *sb = bio->bi_private;
diff --git a/fs/mpage.c b/fs/mpage.c
index 0face1c4d4c6..a4089bbfee0a 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -41,7 +41,7 @@
* status of that page is hard. See end_buffer_async_read() for the details.
* There is no point in duplicating all that complexity.
*/
-static void mpage_end_io(struct bio *bio, int err)
+static void mpage_end_io(struct bio *bio, int err, struct batch_complete *batch)
{
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index ee24df5af1f9..3c5dd55d284c 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -117,7 +117,7 @@ int ncp_mmap(struct file *file, struct vm_area_struct *vma)
return -EINVAL;
/* we do not support files bigger than 4GB... We eventually
supports just 4GB... */
- if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff
+ if (vma_pages(vma) + vma->vm_pgoff
> (1U << (32 - PAGE_SHIFT)))
return -EFBIG;
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 1e5fdd3506e2..dba178e2a1bd 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -143,7 +143,7 @@ bl_submit_bio(int rw, struct bio *bio)
static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
struct pnfs_block_extent *be,
- void (*end_io)(struct bio *, int err),
+ bio_end_io_t *end_io,
struct parallel_io *par)
{
struct bio *bio;
@@ -167,7 +167,7 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw,
sector_t isect, struct page *page,
struct pnfs_block_extent *be,
- void (*end_io)(struct bio *, int err),
+ bio_end_io_t *end_io,
struct parallel_io *par,
unsigned int offset, int len)
{
@@ -190,7 +190,7 @@ retry:
static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
sector_t isect, struct page *page,
struct pnfs_block_extent *be,
- void (*end_io)(struct bio *, int err),
+ bio_end_io_t *end_io,
struct parallel_io *par)
{
return do_add_page_to_bio(bio, npg, rw, isect, page, be,
@@ -198,7 +198,8 @@ static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
}
/* This is basically copied from mpage_end_io_read */
-static void bl_end_io_read(struct bio *bio, int err)
+static void bl_end_io_read(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct parallel_io *par = bio->bi_private;
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -380,7 +381,8 @@ static void mark_extents_written(struct pnfs_block_layout *bl,
}
}
-static void bl_end_io_write_zero(struct bio *bio, int err)
+static void bl_end_io_write_zero(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct parallel_io *par = bio->bi_private;
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -408,7 +410,8 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
put_parallel(par);
}
-static void bl_end_io_write(struct bio *bio, int err)
+static void bl_end_io_write(struct bio *bio, int err,
+ struct batch_complete *batch)
{
struct parallel_io *par = bio->bi_private;
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -487,7 +490,7 @@ map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
}
static void
-bl_read_single_end_io(struct bio *bio, int error)
+bl_read_single_end_io(struct bio *bio, int error, struct batch_complete *batch)
{
struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
struct page *page = bvec->bv_page;
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index dc9a913784ab..680b65b8a74d 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -338,7 +338,8 @@ void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed)
/*
* BIO operations
*/
-static void nilfs_end_bio_write(struct bio *bio, int err)
+static void nilfs_end_bio_write(struct bio *bio, int err,
+ struct batch_complete *batch)
{
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct nilfs_segment_buffer *segbuf = bio->bi_private;
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 8c3318bf2252..0cc19d0417bf 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -372,8 +372,8 @@ static void o2hb_wait_on_io(struct o2hb_region *reg,
wait_for_completion(&wc->wc_io_complete);
}
-static void o2hb_bio_end_io(struct bio *bio,
- int error)
+static void o2hb_bio_end_io(struct bio *bio, int error,
+ struct batch_complete *batch)
{
struct o2hb_bio_wait_ctxt *wc = bio->bi_private;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 04ee1b57c243..33c7b91c777b 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -947,7 +947,7 @@ leave:
ocfs2_free_dir_lookup_result(&orphan_insert);
ocfs2_free_dir_lookup_result(&lookup);
- if (status)
+ if (status && (status != -ENOTEMPTY))
mlog_errno(status);
return status;
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index bd4b5a740ff1..bdfabdaefdce 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -21,12 +21,12 @@ extern wait_queue_head_t log_wait;
static int kmsg_open(struct inode * inode, struct file * file)
{
- return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_FILE);
+ return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_PROC);
}
static int kmsg_release(struct inode * inode, struct file * file)
{
- (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_FILE);
+ (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_PROC);
return 0;
}
@@ -34,15 +34,15 @@ static ssize_t kmsg_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
if ((file->f_flags & O_NONBLOCK) &&
- !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE))
+ !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC))
return -EAGAIN;
- return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_FILE);
+ return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_PROC);
}
static unsigned int kmsg_poll(struct file *file, poll_table *wait)
{
poll_wait(file, &log_wait, wait);
- if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE))
+ if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC))
return POLLIN | POLLRDNORM;
return 0;
}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3e636d864d56..dbf61f6174f0 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -11,6 +11,7 @@
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/mmu_notifier.h>
#include <asm/elf.h>
#include <asm/uaccess.h>
@@ -688,10 +689,58 @@ const struct file_operations proc_tid_smaps_operations = {
.release = seq_release_private,
};
+/*
+ * We do not want to have constant page-shift bits sitting in
+ * pagemap entries and are about to reuse them some time soon.
+ *
+ * Here's the "migration strategy":
+ * 1. when the system boots these bits remain what they are,
+ * but a warning about future change is printed in log;
+ * 2. once anyone clears soft-dirty bits via clear_refs file,
+ * the soft_dirty_cleared flag is set to denote that the user is
+ * aware of the new API, and those page-shift bits change their meaning.
+ * The respective warning is printed in dmesg;
+ * 3. In a couple of releases we will remove all the mentions
+ * of page-shift in pagemap entries.
+ */
+
+static bool soft_dirty_cleared __read_mostly;
+
+enum clear_refs_types {
+ CLEAR_REFS_ALL = 1,
+ CLEAR_REFS_ANON,
+ CLEAR_REFS_MAPPED,
+ CLEAR_REFS_SOFT_DIRTY,
+ CLEAR_REFS_LAST,
+};
+
+struct clear_refs_private {
+ struct vm_area_struct *vma;
+ enum clear_refs_types type;
+};
+
+static inline void clear_soft_dirty(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *pte)
+{
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ /*
+ * The soft-dirty tracker uses #PF-s to catch writes
+ * to pages, so write-protect the pte as well. See
+ * Documentation/vm/soft-dirty.txt for a full description
+ * of how soft-dirty works.
+ */
+ pte_t ptent = *pte;
+ ptent = pte_wrprotect(ptent);
+ ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
+ set_pte_at(vma->vm_mm, addr, pte, ptent);
+#endif
+}
+
static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
- struct vm_area_struct *vma = walk->private;
+ struct clear_refs_private *cp = walk->private;
+ struct vm_area_struct *vma = cp->vma;
pte_t *pte, ptent;
spinlock_t *ptl;
struct page *page;
@@ -706,6 +755,11 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
if (!pte_present(ptent))
continue;
+ if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
+ clear_soft_dirty(vma, addr, pte);
+ continue;
+ }
+
page = vm_normal_page(vma, addr, ptent);
if (!page)
continue;
@@ -719,10 +773,6 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
return 0;
}
-#define CLEAR_REFS_ALL 1
-#define CLEAR_REFS_ANON 2
-#define CLEAR_REFS_MAPPED 3
-
static ssize_t clear_refs_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
@@ -730,7 +780,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
char buffer[PROC_NUMBUF];
struct mm_struct *mm;
struct vm_area_struct *vma;
- int type;
+ enum clear_refs_types type;
+ int itype;
int rv;
memset(buffer, 0, sizeof(buffer));
@@ -738,23 +789,37 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
return -EFAULT;
- rv = kstrtoint(strstrip(buffer), 10, &type);
+ rv = kstrtoint(strstrip(buffer), 10, &itype);
if (rv < 0)
return rv;
- if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED)
+ type = (enum clear_refs_types)itype;
+ if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
return -EINVAL;
+
+ if (type == CLEAR_REFS_SOFT_DIRTY) {
+ soft_dirty_cleared = true;
+ pr_warn_once("The pagemap bits 55-60 has changed their meaning! "
+ "See the linux/Documentation/vm/pagemap.txt for details.\n");
+ }
+
task = get_proc_task(file_inode(file));
if (!task)
return -ESRCH;
mm = get_task_mm(task);
if (mm) {
+ struct clear_refs_private cp = {
+ .type = type,
+ };
struct mm_walk clear_refs_walk = {
.pmd_entry = clear_refs_pte_range,
.mm = mm,
+ .private = &cp,
};
down_read(&mm->mmap_sem);
+ if (type == CLEAR_REFS_SOFT_DIRTY)
+ mmu_notifier_invalidate_range_start(mm, 0, -1);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
- clear_refs_walk.private = vma;
+ cp.vma = vma;
if (is_vm_hugetlb_page(vma))
continue;
/*
@@ -773,6 +838,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
walk_page_range(vma->vm_start, vma->vm_end,
&clear_refs_walk);
}
+ if (type == CLEAR_REFS_SOFT_DIRTY)
+ mmu_notifier_invalidate_range_end(mm, 0, -1);
flush_tlb_mm(mm);
up_read(&mm->mmap_sem);
mmput(mm);
@@ -794,6 +861,7 @@ typedef struct {
struct pagemapread {
int pos, len;
pagemap_entry_t *buffer;
+ bool v2;
};
#define PAGEMAP_WALK_SIZE (PMD_SIZE)
@@ -807,14 +875,17 @@ struct pagemapread {
#define PM_PSHIFT_BITS 6
#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
-#define PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
+#define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1)
#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
+/* in the "new" pagemap the pshift bits are occupied by more status bits */
+#define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT))
+#define __PM_SOFT_DIRTY (1LL)
#define PM_PRESENT PM_STATUS(4LL)
#define PM_SWAP PM_STATUS(2LL)
#define PM_FILE PM_STATUS(1LL)
-#define PM_NOT_PRESENT PM_PSHIFT(PAGE_SHIFT)
+#define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0)
#define PM_END_OF_BUFFER 1
static inline pagemap_entry_t make_pme(u64 val)
@@ -837,7 +908,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
struct pagemapread *pm = walk->private;
unsigned long addr;
int err = 0;
- pagemap_entry_t pme = make_pme(PM_NOT_PRESENT);
+ pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
for (addr = start; addr < end; addr += PAGE_SIZE) {
err = add_to_pagemap(addr, &pme, pm);
@@ -847,11 +918,12 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
return err;
}
-static void pte_to_pagemap_entry(pagemap_entry_t *pme,
+static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
struct vm_area_struct *vma, unsigned long addr, pte_t pte)
{
u64 frame, flags;
struct page *page = NULL;
+ int flags2 = 0;
if (pte_present(pte)) {
frame = pte_pfn(pte);
@@ -866,19 +938,21 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme,
if (is_migration_entry(entry))
page = migration_entry_to_page(entry);
} else {
- *pme = make_pme(PM_NOT_PRESENT);
+ *pme = make_pme(PM_NOT_PRESENT(pm->v2));
return;
}
if (page && !PageAnon(page))
flags |= PM_FILE;
+ if (pte_soft_dirty(pte))
+ flags2 |= __PM_SOFT_DIRTY;
- *pme = make_pme(PM_PFRAME(frame) | PM_PSHIFT(PAGE_SHIFT) | flags);
+ *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
- pmd_t pmd, int offset)
+static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
+ pmd_t pmd, int offset, int pmd_flags2)
{
/*
* Currently pmd for thp is always present because thp can not be
@@ -887,13 +961,13 @@ static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
*/
if (pmd_present(pmd))
*pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
- | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);
+ | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
else
- *pme = make_pme(PM_NOT_PRESENT);
+ *pme = make_pme(PM_NOT_PRESENT(pm->v2));
}
#else
-static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
- pmd_t pmd, int offset)
+static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
+ pmd_t pmd, int offset, int pmd_flags2)
{
}
#endif
@@ -905,17 +979,20 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
struct pagemapread *pm = walk->private;
pte_t *pte;
int err = 0;
- pagemap_entry_t pme = make_pme(PM_NOT_PRESENT);
+ pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
/* find the first VMA at or above 'addr' */
vma = find_vma(walk->mm, addr);
if (vma && pmd_trans_huge_lock(pmd, vma) == 1) {
+ int pmd_flags2;
+
+ pmd_flags2 = (pmd_soft_dirty(*pmd) ? __PM_SOFT_DIRTY : 0);
for (; addr != end; addr += PAGE_SIZE) {
unsigned long offset;
offset = (addr & ~PAGEMAP_WALK_MASK) >>
PAGE_SHIFT;
- thp_pmd_to_pagemap_entry(&pme, *pmd, offset);
+ thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2);
err = add_to_pagemap(addr, &pme, pm);
if (err)
break;
@@ -932,7 +1009,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
* and need a new, higher one */
if (vma && (addr >= vma->vm_end)) {
vma = find_vma(walk->mm, addr);
- pme = make_pme(PM_NOT_PRESENT);
+ pme = make_pme(PM_NOT_PRESENT(pm->v2));
}
/* check that 'vma' actually covers this address,
@@ -940,7 +1017,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
if (vma && (vma->vm_start <= addr) &&
!is_vm_hugetlb_page(vma)) {
pte = pte_offset_map(pmd, addr);
- pte_to_pagemap_entry(&pme, vma, addr, *pte);
+ pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
/* unmap before userspace copy */
pte_unmap(pte);
}
@@ -955,14 +1032,14 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
}
#ifdef CONFIG_HUGETLB_PAGE
-static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme,
+static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
pte_t pte, int offset)
{
if (pte_present(pte))
*pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset)
- | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);
+ | PM_STATUS2(pm->v2, 0) | PM_PRESENT);
else
- *pme = make_pme(PM_NOT_PRESENT);
+ *pme = make_pme(PM_NOT_PRESENT(pm->v2));
}
/* This function walks within one hugetlb entry in the single call */
@@ -976,7 +1053,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
for (; addr != end; addr += PAGE_SIZE) {
int offset = (addr & ~hmask) >> PAGE_SHIFT;
- huge_pte_to_pagemap_entry(&pme, *pte, offset);
+ huge_pte_to_pagemap_entry(&pme, pm, *pte, offset);
err = add_to_pagemap(addr, &pme, pm);
if (err)
return err;
@@ -1038,6 +1115,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
if (!count)
goto out_task;
+ pm.v2 = soft_dirty_cleared;
pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
pm.buffer = kmalloc(pm.len, GFP_TEMPORARY);
ret = -ENOMEM;
@@ -1110,9 +1188,18 @@ out:
return ret;
}
+static int pagemap_open(struct inode *inode, struct file *file)
+{
+ pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about "
+ "to stop being page-shift some time soon. See the "
+ "linux/Documentation/vm/pagemap.txt for details.\n");
+ return 0;
+}
+
const struct file_operations proc_pagemap_operations = {
.llseek = mem_lseek, /* borrow this */
.read = pagemap_read,
+ .open = pagemap_open,
};
#endif /* CONFIG_PROC_PAGE_MONITOR */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 596ec71da00e..8a58be3f2dcf 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -380,7 +380,8 @@ xfs_imap_valid(
STATIC void
xfs_end_bio(
struct bio *bio,
- int error)
+ int error,
+ struct batch_complete *batch)
{
xfs_ioend_t *ioend = bio->bi_private;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 1b2472a46e46..e8610aa1a0fe 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1225,7 +1225,8 @@ _xfs_buf_ioend(
STATIC void
xfs_buf_bio_end_io(
struct bio *bio,
- int error)
+ int error,
+ struct batch_complete *batch)
{
xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private;