From 4b70ac5fd9b58bfaa5f25b4ea48f528aefbf3308 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 30 Apr 2014 19:02:48 +0200 Subject: aio: change exit_aio() to load mm->ioctx_table once and avoid rcu_read_lock() On 04/30, Benjamin LaHaise wrote: > > > - ctx->mmap_size = 0; > > - > > - kill_ioctx(mm, ctx, NULL); > > + if (ctx) { > > + ctx->mmap_size = 0; > > + kill_ioctx(mm, ctx, NULL); > > + } > > Rather than indenting and moving the two lines changing mmap_size and the > kill_ioctx() call, why not just do "if (!ctx) ... continue;"? That reduces > the number of lines changed and avoid excessive indentation. OK. To me the code looks better/simpler with "if (ctx)", but this is subjective of course, I won't argue. The patch still removes the empty line between mmap_size = 0 and kill_ioctx(), we reset mmap_size only for kill_ioctx(). But feel free to remove this change. ------------------------------------------------------------------------------- Subject: [PATCH v3 1/2] aio: change exit_aio() to load mm->ioctx_table once and avoid rcu_read_lock() 1. We can read ->ioctx_table only once and we do not read rcu_read_lock() or even rcu_dereference(). This mm has no users, nobody else can play with ->ioctx_table. Otherwise the code is buggy anyway, if we need rcu_read_lock() in a loop because ->ioctx_table can be updated then kfree(table) is obviously wrong. 2. Update the comment. "exit_mmap(mm) is coming" is the good reason to avoid munmap(), but another reason is that we simply can't do vm_munmap() unless current->mm == mm and this is not true in general, the caller is mmput(). 3. We do not really need to nullify mm->ioctx_table before return, probably the current code does this to catch the potential problems. But in this case RCU_INIT_POINTER(NULL) looks better. Signed-off-by: Oleg Nesterov Signed-off-by: Benjamin LaHaise --- fs/aio.c | 42 ++++++++++++++++-------------------------- 1 file changed, 16 insertions(+), 26 deletions(-) (limited to 'fs/aio.c') diff --git a/fs/aio.c b/fs/aio.c index 955947ef3e02..b6696462e345 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -791,40 +791,30 @@ EXPORT_SYMBOL(wait_on_sync_kiocb); */ void exit_aio(struct mm_struct *mm) { - struct kioctx_table *table; - struct kioctx *ctx; - unsigned i = 0; - - while (1) { - rcu_read_lock(); - table = rcu_dereference(mm->ioctx_table); - - do { - if (!table || i >= table->nr) { - rcu_read_unlock(); - rcu_assign_pointer(mm->ioctx_table, NULL); - if (table) - kfree(table); - return; - } + struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table); + int i; - ctx = table->table[i++]; - } while (!ctx); + if (!table) + return; - rcu_read_unlock(); + for (i = 0; i < table->nr; ++i) { + struct kioctx *ctx = table->table[i]; + if (!ctx) + continue; /* - * We don't need to bother with munmap() here - - * exit_mmap(mm) is coming and it'll unmap everything. - * Since aio_free_ring() uses non-zero ->mmap_size - * as indicator that it needs to unmap the area, - * just set it to 0; aio_free_ring() is the only - * place that uses ->mmap_size, so it's safe. + * We don't need to bother with munmap() here - exit_mmap(mm) + * is coming and it'll unmap everything. And we simply can't, + * this is not necessarily our ->mm. + * Since kill_ioctx() uses non-zero ->mmap_size as indicator + * that it needs to unmap the area, just set it to 0. */ ctx->mmap_size = 0; - kill_ioctx(mm, ctx, NULL); } + + RCU_INIT_POINTER(mm->ioctx_table, NULL); + kfree(table); } static void put_reqs_available(struct kioctx *ctx, unsigned nr) -- cgit v1.2.3 From 855ef0dec7271ff7be7381feaaf3f4aed80bd503 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 30 Apr 2014 16:16:36 +0200 Subject: aio: kill the misleading rcu read locks in ioctx_add_table() and kill_ioctx() ioctx_add_table() is the writer, it does not need rcu_read_lock() to protect ->ioctx_table. It relies on mm->ioctx_lock and rcu locks just add the confusion. And it doesn't need rcu_dereference() by the same reason, it must see any updates previously done under the same ->ioctx_lock. We could use rcu_dereference_protected() but the patch uses rcu_dereference_raw(), the function is simple enough. The same for kill_ioctx(), although it does not update the pointer. Signed-off-by: Oleg Nesterov Signed-off-by: Benjamin LaHaise --- fs/aio.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'fs/aio.c') diff --git a/fs/aio.c b/fs/aio.c index b6696462e345..c1d8c480c138 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -554,8 +554,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) struct aio_ring *ring; spin_lock(&mm->ioctx_lock); - rcu_read_lock(); - table = rcu_dereference(mm->ioctx_table); + table = rcu_dereference_raw(mm->ioctx_table); while (1) { if (table) @@ -563,7 +562,6 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) if (!table->table[i]) { ctx->id = i; table->table[i] = ctx; - rcu_read_unlock(); spin_unlock(&mm->ioctx_lock); /* While kioctx setup is in progress, @@ -577,8 +575,6 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) } new_nr = (table ? table->nr : 1) * 4; - - rcu_read_unlock(); spin_unlock(&mm->ioctx_lock); table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) * @@ -589,8 +585,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) table->nr = new_nr; spin_lock(&mm->ioctx_lock); - rcu_read_lock(); - old = rcu_dereference(mm->ioctx_table); + old = rcu_dereference_raw(mm->ioctx_table); if (!old) { rcu_assign_pointer(mm->ioctx_table, table); @@ -737,12 +732,9 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, spin_lock(&mm->ioctx_lock); - rcu_read_lock(); - table = rcu_dereference(mm->ioctx_table); - + table = rcu_dereference_raw(mm->ioctx_table); WARN_ON(ctx != table->table[ctx->id]); table->table[ctx->id] = NULL; - rcu_read_unlock(); spin_unlock(&mm->ioctx_lock); /* percpu_ref_kill() will do the necessary call_rcu() */ -- cgit v1.2.3 From 55c6c814ae0aa896781d3c51e4608de542624f64 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 28 Jun 2014 08:10:12 -0400 Subject: percpu-refcount, aio: use percpu_ref_cancel_init() in ioctx_alloc() ioctx_alloc() reaches inside percpu_ref and directly frees ->pcpu_count in its failure path, which is quite gross. percpu_ref has been providing a proper interface to do this, percpu_ref_cancel_init(), for quite some time now. Let's use that instead. This patch doesn't introduce any behavior changes. Signed-off-by: Tejun Heo Acked-by: Benjamin LaHaise Cc: Kent Overstreet --- fs/aio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/aio.c') diff --git a/fs/aio.c b/fs/aio.c index 4f078c054b41..5e0d7f9cb693 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -715,8 +715,8 @@ err_ctx: err: mutex_unlock(&ctx->ring_lock); free_percpu(ctx->cpu); - free_percpu(ctx->reqs.pcpu_count); - free_percpu(ctx->users.pcpu_count); + percpu_ref_cancel_init(&ctx->reqs); + percpu_ref_cancel_init(&ctx->users); kmem_cache_free(kioctx_cachep, ctx); pr_debug("error allocating ioctx %d\n", err); return ERR_PTR(err); -- cgit v1.2.3 From 9a1049da9bd2cd83fe11d46433e603c193aa9c71 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 28 Jun 2014 08:10:14 -0400 Subject: percpu-refcount: require percpu_ref to be exited explicitly Currently, a percpu_ref undoes percpu_ref_init() automatically by freeing the allocated percpu area when the percpu_ref is killed. While seemingly convenient, this has the following niggles. * It's impossible to re-init a released reference counter without going through re-allocation. * In the similar vein, it's impossible to initialize a percpu_ref count with static percpu variables. * We need and have an explicit destructor anyway for failure paths - percpu_ref_cancel_init(). This patch removes the automatic percpu counter freeing in percpu_ref_kill_rcu() and repurposes percpu_ref_cancel_init() into a generic destructor now named percpu_ref_exit(). percpu_ref_destroy() is considered but it gets confusing with percpu_ref_kill() while "exit" clearly indicates that it's the counterpart of percpu_ref_init(). All percpu_ref_cancel_init() users are updated to invoke percpu_ref_exit() instead and explicit percpu_ref_exit() calls are added to the destruction path of all percpu_ref users. Signed-off-by: Tejun Heo Acked-by: Benjamin LaHaise Cc: Kent Overstreet Cc: Christoph Lameter Cc: Benjamin LaHaise Cc: Nicholas A. Bellinger Cc: Li Zefan --- drivers/target/target_core_tpg.c | 4 +++- fs/aio.c | 6 ++++-- include/linux/percpu-refcount.h | 6 ++---- kernel/cgroup.c | 8 +++++--- lib/percpu-refcount.c | 33 ++++++++++----------------------- 5 files changed, 24 insertions(+), 33 deletions(-) (limited to 'fs/aio.c') diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c index c036595b17cf..fddfae61222f 100644 --- a/drivers/target/target_core_tpg.c +++ b/drivers/target/target_core_tpg.c @@ -825,7 +825,7 @@ int core_tpg_add_lun( ret = core_dev_export(dev, tpg, lun); if (ret < 0) { - percpu_ref_cancel_init(&lun->lun_ref); + percpu_ref_exit(&lun->lun_ref); return ret; } @@ -880,5 +880,7 @@ int core_tpg_post_dellun( lun->lun_status = TRANSPORT_LUN_STATUS_FREE; spin_unlock(&tpg->tpg_lun_lock); + percpu_ref_exit(&lun->lun_ref); + return 0; } diff --git a/fs/aio.c b/fs/aio.c index 5e0d7f9cb693..ea1bc2e8f4f3 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -506,6 +506,8 @@ static void free_ioctx(struct work_struct *work) aio_free_ring(ctx); free_percpu(ctx->cpu); + percpu_ref_exit(&ctx->reqs); + percpu_ref_exit(&ctx->users); kmem_cache_free(kioctx_cachep, ctx); } @@ -715,8 +717,8 @@ err_ctx: err: mutex_unlock(&ctx->ring_lock); free_percpu(ctx->cpu); - percpu_ref_cancel_init(&ctx->reqs); - percpu_ref_cancel_init(&ctx->users); + percpu_ref_exit(&ctx->reqs); + percpu_ref_exit(&ctx->users); kmem_cache_free(kioctx_cachep, ctx); pr_debug("error allocating ioctx %d\n", err); return ERR_PTR(err); diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 6f8cd4c0546c..0ddd2839ca84 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -57,9 +57,7 @@ struct percpu_ref { atomic_t count; /* * The low bit of the pointer indicates whether the ref is in percpu - * mode; if set, then get/put will manipulate the atomic_t (this is a - * hack because we need to keep the pointer around for - * percpu_ref_kill_rcu()) + * mode; if set, then get/put will manipulate the atomic_t. */ unsigned long pcpu_count_ptr; percpu_ref_func_t *release; @@ -69,7 +67,7 @@ struct percpu_ref { int __must_check percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release); -void percpu_ref_cancel_init(struct percpu_ref *ref); +void percpu_ref_exit(struct percpu_ref *ref); void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7868fc3c0bc5..c06aa5e257a8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1638,7 +1638,7 @@ destroy_root: exit_root_id: cgroup_exit_root_id(root); cancel_ref: - percpu_ref_cancel_init(&root_cgrp->self.refcnt); + percpu_ref_exit(&root_cgrp->self.refcnt); out: free_cgrp_cset_links(&tmp_links); return ret; @@ -4133,6 +4133,8 @@ static void css_free_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup *cgrp = css->cgroup; + percpu_ref_exit(&css->refcnt); + if (css->ss) { /* css free path */ if (css->parent) @@ -4330,7 +4332,7 @@ err_list_del: err_free_id: cgroup_idr_remove(&ss->css_idr, css->id); err_free_percpu_ref: - percpu_ref_cancel_init(&css->refcnt); + percpu_ref_exit(&css->refcnt); err_free_css: call_rcu(&css->rcu_head, css_free_rcu_fn); return err; @@ -4441,7 +4443,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, out_free_id: cgroup_idr_remove(&root->cgroup_idr, cgrp->id); out_cancel_ref: - percpu_ref_cancel_init(&cgrp->self.refcnt); + percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: kfree(cgrp); out_unlock: diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 94e5b624de64..ac4299120087 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -61,36 +61,25 @@ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release) EXPORT_SYMBOL_GPL(percpu_ref_init); /** - * percpu_ref_cancel_init - cancel percpu_ref_init() - * @ref: percpu_ref to cancel init for + * percpu_ref_exit - undo percpu_ref_init() + * @ref: percpu_ref to exit * - * Once a percpu_ref is initialized, its destruction is initiated by - * percpu_ref_kill() and completes asynchronously, which can be painful to - * do when destroying a half-constructed object in init failure path. - * - * This function destroys @ref without invoking @ref->release and the - * memory area containing it can be freed immediately on return. To - * prevent accidental misuse, it's required that @ref has finished - * percpu_ref_init(), whether successful or not, but never used. - * - * The weird name and usage restriction are to prevent people from using - * this function by mistake for normal shutdown instead of - * percpu_ref_kill(). + * This function exits @ref. The caller is responsible for ensuring that + * @ref is no longer in active use. The usual places to invoke this + * function from are the @ref->release() callback or in init failure path + * where percpu_ref_init() succeeded but other parts of the initialization + * of the embedding object failed. */ -void percpu_ref_cancel_init(struct percpu_ref *ref) +void percpu_ref_exit(struct percpu_ref *ref) { unsigned __percpu *pcpu_count = pcpu_count_ptr(ref); - int cpu; - - WARN_ON_ONCE(atomic_read(&ref->count) != 1 + PCPU_COUNT_BIAS); if (pcpu_count) { - for_each_possible_cpu(cpu) - WARN_ON_ONCE(*per_cpu_ptr(pcpu_count, cpu)); free_percpu(pcpu_count); + ref->pcpu_count_ptr = PCPU_REF_DEAD; } } -EXPORT_SYMBOL_GPL(percpu_ref_cancel_init); +EXPORT_SYMBOL_GPL(percpu_ref_exit); static void percpu_ref_kill_rcu(struct rcu_head *rcu) { @@ -102,8 +91,6 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu) for_each_possible_cpu(cpu) count += *per_cpu_ptr(pcpu_count, cpu); - free_percpu(pcpu_count); - pr_debug("global %i pcpu %i", atomic_read(&ref->count), (int) count); /* -- cgit v1.2.3 From be6fb451a24582732c66e28cb0beb3f19c4289fd Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Tue, 22 Jul 2014 09:56:56 -0400 Subject: aio: remove no longer needed preempt_disable() Based on feedback from Jens Axboe on 263782c1c95bbddbb022dc092fd89a36bb8d5577, clean up get/put_reqs_available() to remove the no longer needed preempt_disable() and preempt_enable() pair. Signed-off-by: Benjamin LaHaise Cc: Jens Axboe --- fs/aio.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'fs/aio.c') diff --git a/fs/aio.c b/fs/aio.c index 8216aa0c7539..9ce9e8ea773f 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -814,10 +814,8 @@ static void put_reqs_available(struct kioctx *ctx, unsigned nr) struct kioctx_cpu *kcpu; unsigned long flags; - preempt_disable(); - kcpu = this_cpu_ptr(ctx->cpu); - local_irq_save(flags); + kcpu = this_cpu_ptr(ctx->cpu); kcpu->reqs_available += nr; while (kcpu->reqs_available >= ctx->req_batch * 2) { @@ -826,7 +824,6 @@ static void put_reqs_available(struct kioctx *ctx, unsigned nr) } local_irq_restore(flags); - preempt_enable(); } static bool get_reqs_available(struct kioctx *ctx) @@ -835,10 +832,8 @@ static bool get_reqs_available(struct kioctx *ctx) bool ret = false; unsigned long flags; - preempt_disable(); - kcpu = this_cpu_ptr(ctx->cpu); - local_irq_save(flags); + kcpu = this_cpu_ptr(ctx->cpu); if (!kcpu->reqs_available) { int old, avail = atomic_read(&ctx->reqs_available); @@ -858,7 +853,6 @@ static bool get_reqs_available(struct kioctx *ctx) kcpu->reqs_available--; out: local_irq_restore(flags); - preempt_enable(); return ret; } -- cgit v1.2.3 From b53f1f82fbde8dcf34ab7d731c2a9ae6f0d8d2e2 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Wed, 23 Jul 2014 18:03:51 +0800 Subject: aio: remove the needless registration of ring file's private_data Remove the registration of ring file's private_data, we do not use it. Reviewed-by: Jeff Moyer Signed-off-by: Gu Zheng Signed-off-by: Benjamin LaHaise --- fs/aio.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/aio.c') diff --git a/fs/aio.c b/fs/aio.c index 9ce9e8ea773f..55e03a858e8b 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -192,7 +192,6 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages) } file->f_flags = O_RDWR; - file->private_data = ctx; return file; } -- cgit v1.2.3 From 8dc4379e17cddad7b2088a3f300ded50d2a6d493 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Wed, 23 Jul 2014 18:03:52 +0800 Subject: aio: use the macro rather than the inline magic number Replace the inline magic number with the ready-made macro(AIO_RING_MAGIC), just clean up. Reviewed-by: Jeff Moyer Signed-off-by: Gu Zheng Signed-off-by: Benjamin LaHaise --- fs/aio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/aio.c') diff --git a/fs/aio.c b/fs/aio.c index 55e03a858e8b..6fc6b9857a58 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -201,7 +201,7 @@ static struct dentry *aio_mount(struct file_system_type *fs_type, static const struct dentry_operations ops = { .d_dname = simple_dname, }; - return mount_pseudo(fs_type, "aio:", NULL, &ops, 0xa10a10a1); + return mount_pseudo(fs_type, "aio:", NULL, &ops, AIO_RING_MAGIC); } /* aio_setup -- cgit v1.2.3 From 2be4e7deec2d4398a0eb2165cc04086ebfc831d2 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Wed, 23 Jul 2014 18:03:53 +0800 Subject: aio: fix some comments The function comments of aio_run_iocb and aio_read_events are out of date, so fix them here. Reviewed-by: Jeff Moyer Signed-off-by: Gu Zheng Signed-off-by: Benjamin LaHaise --- fs/aio.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs/aio.c') diff --git a/fs/aio.c b/fs/aio.c index 6fc6b9857a58..d6d9520f6866 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1020,7 +1020,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2) } EXPORT_SYMBOL(aio_complete); -/* aio_read_events +/* aio_read_events_ring * Pull an event off of the ioctx's event ring. Returns the number of * events fetched */ @@ -1272,9 +1272,8 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb, } /* - * aio_setup_iocb: - * Performs the initial checks and aio retry method - * setup for the kiocb at the time of io submission. + * aio_run_iocb: + * Performs the initial checks and io submission. */ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, char __user *buf, bool compat) -- cgit v1.2.3 From 00fefb9cf2b5493a86912de55ba912bdfae4a207 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Wed, 23 Jul 2014 18:03:54 +0800 Subject: aio: use iovec array rather than the single one Previously, we only offer a single iovec to handle all the read/write cases, so the PREADV/PWRITEV request always need to alloc more iovec buffer when copying user vectors. If we use a tmp iovec array rather than the single one, some small PREADV/PWRITEV workloads(vector size small than the tmp buffer) will not need to alloc more iovec buffer when copying user vectors. Reviewed-by: Jeff Moyer Signed-off-by: Gu Zheng Signed-off-by: Benjamin LaHaise --- fs/aio.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs/aio.c') diff --git a/fs/aio.c b/fs/aio.c index d6d9520f6866..0fd9181d8c0c 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1243,12 +1243,12 @@ static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb, if (compat) ret = compat_rw_copy_check_uvector(rw, (struct compat_iovec __user *)buf, - *nr_segs, 1, *iovec, iovec); + *nr_segs, UIO_FASTIOV, *iovec, iovec); else #endif ret = rw_copy_check_uvector(rw, (struct iovec __user *)buf, - *nr_segs, 1, *iovec, iovec); + *nr_segs, UIO_FASTIOV, *iovec, iovec); if (ret < 0) return ret; @@ -1285,7 +1285,7 @@ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, fmode_t mode; aio_rw_op *rw_op; rw_iter_op *iter_op; - struct iovec inline_vec, *iovec = &inline_vec; + struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct iov_iter iter; switch (opcode) { @@ -1320,7 +1320,7 @@ rw_common: if (!ret) ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); if (ret < 0) { - if (iovec != &inline_vec) + if (iovec != inline_vecs) kfree(iovec); return ret; } @@ -1367,7 +1367,7 @@ rw_common: return -EINVAL; } - if (iovec != &inline_vec) + if (iovec != inline_vecs) kfree(iovec); if (ret != -EIOCBQUEUED) { -- cgit v1.2.3 From d856f32a86b2b015ab180ab7a55e455ed8d3ccc5 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Sun, 24 Aug 2014 13:14:05 -0400 Subject: aio: fix reqs_available handling As reported by Dan Aloni, commit f8567a3845ac ("aio: fix aio request leak when events are reaped by userspace") introduces a regression when user code attempts to perform io_submit() with more events than are available in the ring buffer. Reverting that commit would reintroduce a regression when user space event reaping is used. Fixing this bug is a bit more involved than the previous attempts to fix this regression. Since we do not have a single point at which we can count events as being reaped by user space and io_getevents(), we have to track event completion by looking at the number of events left in the event ring. So long as there are as many events in the ring buffer as there have been completion events generate, we cannot call put_reqs_available(). The code to check for this is now placed in refill_reqs_available(). A test program from Dan and modified by me for verifying this bug is available at http://www.kvack.org/~bcrl/20140824-aio_bug.c . Reported-by: Dan Aloni Signed-off-by: Benjamin LaHaise Acked-by: Dan Aloni Cc: Kent Overstreet Cc: Mateusz Guzik Cc: Petr Matousek Cc: stable@vger.kernel.org # v3.16 and anything that f8567a3845ac was backported to Signed-off-by: Linus Torvalds --- fs/aio.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 73 insertions(+), 4 deletions(-) (limited to 'fs/aio.c') diff --git a/fs/aio.c b/fs/aio.c index ae635872affb..97bc62cbe2da 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -141,6 +141,7 @@ struct kioctx { struct { unsigned tail; + unsigned completed_events; spinlock_t completion_lock; } ____cacheline_aligned_in_smp; @@ -857,6 +858,68 @@ out: return ret; } +/* refill_reqs_available + * Updates the reqs_available reference counts used for tracking the + * number of free slots in the completion ring. This can be called + * from aio_complete() (to optimistically update reqs_available) or + * from aio_get_req() (the we're out of events case). It must be + * called holding ctx->completion_lock. + */ +static void refill_reqs_available(struct kioctx *ctx, unsigned head, + unsigned tail) +{ + unsigned events_in_ring, completed; + + /* Clamp head since userland can write to it. */ + head %= ctx->nr_events; + if (head <= tail) + events_in_ring = tail - head; + else + events_in_ring = ctx->nr_events - (head - tail); + + completed = ctx->completed_events; + if (events_in_ring < completed) + completed -= events_in_ring; + else + completed = 0; + + if (!completed) + return; + + ctx->completed_events -= completed; + put_reqs_available(ctx, completed); +} + +/* user_refill_reqs_available + * Called to refill reqs_available when aio_get_req() encounters an + * out of space in the completion ring. + */ +static void user_refill_reqs_available(struct kioctx *ctx) +{ + spin_lock_irq(&ctx->completion_lock); + if (ctx->completed_events) { + struct aio_ring *ring; + unsigned head; + + /* Access of ring->head may race with aio_read_events_ring() + * here, but that's okay since whether we read the old version + * or the new version, and either will be valid. The important + * part is that head cannot pass tail since we prevent + * aio_complete() from updating tail by holding + * ctx->completion_lock. Even if head is invalid, the check + * against ctx->completed_events below will make sure we do the + * safe/right thing. + */ + ring = kmap_atomic(ctx->ring_pages[0]); + head = ring->head; + kunmap_atomic(ring); + + refill_reqs_available(ctx, head, ctx->tail); + } + + spin_unlock_irq(&ctx->completion_lock); +} + /* aio_get_req * Allocate a slot for an aio request. * Returns NULL if no requests are free. @@ -865,8 +928,11 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx) { struct kiocb *req; - if (!get_reqs_available(ctx)) - return NULL; + if (!get_reqs_available(ctx)) { + user_refill_reqs_available(ctx); + if (!get_reqs_available(ctx)) + return NULL; + } req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); if (unlikely(!req)) @@ -925,8 +991,8 @@ void aio_complete(struct kiocb *iocb, long res, long res2) struct kioctx *ctx = iocb->ki_ctx; struct aio_ring *ring; struct io_event *ev_page, *event; + unsigned tail, pos, head; unsigned long flags; - unsigned tail, pos; /* * Special case handling for sync iocbs: @@ -987,10 +1053,14 @@ void aio_complete(struct kiocb *iocb, long res, long res2) ctx->tail = tail; ring = kmap_atomic(ctx->ring_pages[0]); + head = ring->head; ring->tail = tail; kunmap_atomic(ring); flush_dcache_page(ctx->ring_pages[0]); + ctx->completed_events++; + if (ctx->completed_events > 1) + refill_reqs_available(ctx, head, tail); spin_unlock_irqrestore(&ctx->completion_lock, flags); pr_debug("added to ring %p at [%u]\n", iocb, tail); @@ -1005,7 +1075,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2) /* everything turned out well, dispose of the aiocb. */ kiocb_free(iocb); - put_reqs_available(ctx, 1); /* * We have to order our ring_info tail store above and test -- cgit v1.2.3 From 2ff396be602f10b5eab8e73b24f20348fa2de159 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Tue, 2 Sep 2014 13:17:00 -0400 Subject: aio: add missing smp_rmb() in read_events_ring We ran into a case on ppc64 running mariadb where io_getevents would return zeroed out I/O events. After adding instrumentation, it became clear that there was some missing synchronization between reading the tail pointer and the events themselves. This small patch fixes the problem in testing. Thanks to Zach for helping to look into this, and suggesting the fix. Signed-off-by: Jeff Moyer Signed-off-by: Benjamin LaHaise Cc: stable@vger.kernel.org --- fs/aio.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs/aio.c') diff --git a/fs/aio.c b/fs/aio.c index 97bc62cbe2da..5f2e9c6c328e 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1111,6 +1111,12 @@ static long aio_read_events_ring(struct kioctx *ctx, tail = ring->tail; kunmap_atomic(ring); + /* + * Ensure that once we've read the current tail pointer, that + * we also see the events that were stored up to the tail. + */ + smp_rmb(); + pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events); if (head == tail) -- cgit v1.2.3 From 6098b45b32e6baeacc04790773ced9340601d511 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Wed, 3 Sep 2014 17:45:44 +0800 Subject: aio: block exit_aio() until all context requests are completed It seems that exit_aio() also needs to wait for all iocbs to complete (like io_destroy), but we missed the wait step in current implemention, so fix it in the same way as we did in io_destroy. Signed-off-by: Gu Zheng Signed-off-by: Benjamin LaHaise Cc: stable@vger.kernel.org --- fs/aio.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs/aio.c') diff --git a/fs/aio.c b/fs/aio.c index 5f2e9c6c328e..733750096b71 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -793,6 +793,8 @@ void exit_aio(struct mm_struct *mm) for (i = 0; i < table->nr; ++i) { struct kioctx *ctx = table->table[i]; + struct completion requests_done = + COMPLETION_INITIALIZER_ONSTACK(requests_done); if (!ctx) continue; @@ -804,7 +806,10 @@ void exit_aio(struct mm_struct *mm) * that it needs to unmap the area, just set it to 0. */ ctx->mmap_size = 0; - kill_ioctx(mm, ctx, NULL); + kill_ioctx(mm, ctx, &requests_done); + + /* Wait until all IO for the context are done. */ + wait_for_completion(&requests_done); } RCU_INIT_POINTER(mm->ioctx_table, NULL); -- cgit v1.2.3