From 4b70ac5fd9b58bfaa5f25b4ea48f528aefbf3308 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 30 Apr 2014 19:02:48 +0200
Subject: aio: change exit_aio() to load mm->ioctx_table once and avoid
 rcu_read_lock()

On 04/30, Benjamin LaHaise wrote:
>
> > -		ctx->mmap_size = 0;
> > -
> > -		kill_ioctx(mm, ctx, NULL);
> > +		if (ctx) {
> > +			ctx->mmap_size = 0;
> > +			kill_ioctx(mm, ctx, NULL);
> > +		}
>
> Rather than indenting and moving the two lines changing mmap_size and the
> kill_ioctx() call, why not just do "if (!ctx) ... continue;"?  That reduces
> the number of lines changed and avoid excessive indentation.

OK. To me the code looks better/simpler with "if (ctx)", but this is subjective
of course, I won't argue.

The patch still removes the empty line between mmap_size = 0 and kill_ioctx(),
we reset mmap_size only for kill_ioctx(). But feel free to remove this change.

-------------------------------------------------------------------------------
Subject: [PATCH v3 1/2] aio: change exit_aio() to load mm->ioctx_table once and avoid rcu_read_lock()

1. We can read ->ioctx_table only once and we do not read rcu_read_lock()
   or even rcu_dereference().

   This mm has no users, nobody else can play with ->ioctx_table. Otherwise
   the code is buggy anyway, if we need rcu_read_lock() in a loop because
   ->ioctx_table can be updated then kfree(table) is obviously wrong.

2. Update the comment. "exit_mmap(mm) is coming" is the good reason to avoid
   munmap(), but another reason is that we simply can't do vm_munmap() unless
   current->mm == mm and this is not true in general, the caller is mmput().

3. We do not really need to nullify mm->ioctx_table before return, probably
   the current code does this to catch the potential problems. But in this
   case RCU_INIT_POINTER(NULL) looks better.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
---
 fs/aio.c | 42 ++++++++++++++++--------------------------
 1 file changed, 16 insertions(+), 26 deletions(-)

(limited to 'fs/aio.c')

diff --git a/fs/aio.c b/fs/aio.c
index 955947ef3e02..b6696462e345 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -791,40 +791,30 @@ EXPORT_SYMBOL(wait_on_sync_kiocb);
  */
 void exit_aio(struct mm_struct *mm)
 {
-	struct kioctx_table *table;
-	struct kioctx *ctx;
-	unsigned i = 0;
-
-	while (1) {
-		rcu_read_lock();
-		table = rcu_dereference(mm->ioctx_table);
-
-		do {
-			if (!table || i >= table->nr) {
-				rcu_read_unlock();
-				rcu_assign_pointer(mm->ioctx_table, NULL);
-				if (table)
-					kfree(table);
-				return;
-			}
+	struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table);
+	int i;
 
-			ctx = table->table[i++];
-		} while (!ctx);
+	if (!table)
+		return;
 
-		rcu_read_unlock();
+	for (i = 0; i < table->nr; ++i) {
+		struct kioctx *ctx = table->table[i];
 
+		if (!ctx)
+			continue;
 		/*
-		 * We don't need to bother with munmap() here -
-		 * exit_mmap(mm) is coming and it'll unmap everything.
-		 * Since aio_free_ring() uses non-zero ->mmap_size
-		 * as indicator that it needs to unmap the area,
-		 * just set it to 0; aio_free_ring() is the only
-		 * place that uses ->mmap_size, so it's safe.
+		 * We don't need to bother with munmap() here - exit_mmap(mm)
+		 * is coming and it'll unmap everything. And we simply can't,
+		 * this is not necessarily our ->mm.
+		 * Since kill_ioctx() uses non-zero ->mmap_size as indicator
+		 * that it needs to unmap the area, just set it to 0.
 		 */
 		ctx->mmap_size = 0;
-
 		kill_ioctx(mm, ctx, NULL);
 	}
+
+	RCU_INIT_POINTER(mm->ioctx_table, NULL);
+	kfree(table);
 }
 
 static void put_reqs_available(struct kioctx *ctx, unsigned nr)
-- 
cgit v1.2.3


From 855ef0dec7271ff7be7381feaaf3f4aed80bd503 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 30 Apr 2014 16:16:36 +0200
Subject: aio: kill the misleading rcu read locks in ioctx_add_table() and
 kill_ioctx()

ioctx_add_table() is the writer, it does not need rcu_read_lock() to
protect ->ioctx_table. It relies on mm->ioctx_lock and rcu locks just
add the confusion.

And it doesn't need rcu_dereference() by the same reason, it must see
any updates previously done under the same ->ioctx_lock. We could use
rcu_dereference_protected() but the patch uses rcu_dereference_raw(),
the function is simple enough.

The same for kill_ioctx(), although it does not update the pointer.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
---
 fs/aio.c | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

(limited to 'fs/aio.c')

diff --git a/fs/aio.c b/fs/aio.c
index b6696462e345..c1d8c480c138 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -554,8 +554,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 	struct aio_ring *ring;
 
 	spin_lock(&mm->ioctx_lock);
-	rcu_read_lock();
-	table = rcu_dereference(mm->ioctx_table);
+	table = rcu_dereference_raw(mm->ioctx_table);
 
 	while (1) {
 		if (table)
@@ -563,7 +562,6 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 				if (!table->table[i]) {
 					ctx->id = i;
 					table->table[i] = ctx;
-					rcu_read_unlock();
 					spin_unlock(&mm->ioctx_lock);
 
 					/* While kioctx setup is in progress,
@@ -577,8 +575,6 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 				}
 
 		new_nr = (table ? table->nr : 1) * 4;
-
-		rcu_read_unlock();
 		spin_unlock(&mm->ioctx_lock);
 
 		table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) *
@@ -589,8 +585,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 		table->nr = new_nr;
 
 		spin_lock(&mm->ioctx_lock);
-		rcu_read_lock();
-		old = rcu_dereference(mm->ioctx_table);
+		old = rcu_dereference_raw(mm->ioctx_table);
 
 		if (!old) {
 			rcu_assign_pointer(mm->ioctx_table, table);
@@ -737,12 +732,9 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
 
 
 	spin_lock(&mm->ioctx_lock);
-	rcu_read_lock();
-	table = rcu_dereference(mm->ioctx_table);
-
+	table = rcu_dereference_raw(mm->ioctx_table);
 	WARN_ON(ctx != table->table[ctx->id]);
 	table->table[ctx->id] = NULL;
-	rcu_read_unlock();
 	spin_unlock(&mm->ioctx_lock);
 
 	/* percpu_ref_kill() will do the necessary call_rcu() */
-- 
cgit v1.2.3


From 55c6c814ae0aa896781d3c51e4608de542624f64 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 28 Jun 2014 08:10:12 -0400
Subject: percpu-refcount, aio: use percpu_ref_cancel_init() in ioctx_alloc()

ioctx_alloc() reaches inside percpu_ref and directly frees
->pcpu_count in its failure path, which is quite gross.  percpu_ref
has been providing a proper interface to do this,
percpu_ref_cancel_init(), for quite some time now.  Let's use that
instead.

This patch doesn't introduce any behavior changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Benjamin LaHaise <bcrl@kvack.org>
Cc: Kent Overstreet <kmo@daterainc.com>
---
 fs/aio.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/aio.c')

diff --git a/fs/aio.c b/fs/aio.c
index 4f078c054b41..5e0d7f9cb693 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -715,8 +715,8 @@ err_ctx:
 err:
 	mutex_unlock(&ctx->ring_lock);
 	free_percpu(ctx->cpu);
-	free_percpu(ctx->reqs.pcpu_count);
-	free_percpu(ctx->users.pcpu_count);
+	percpu_ref_cancel_init(&ctx->reqs);
+	percpu_ref_cancel_init(&ctx->users);
 	kmem_cache_free(kioctx_cachep, ctx);
 	pr_debug("error allocating ioctx %d\n", err);
 	return ERR_PTR(err);
-- 
cgit v1.2.3


From 9a1049da9bd2cd83fe11d46433e603c193aa9c71 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 28 Jun 2014 08:10:14 -0400
Subject: percpu-refcount: require percpu_ref to be exited explicitly

Currently, a percpu_ref undoes percpu_ref_init() automatically by
freeing the allocated percpu area when the percpu_ref is killed.
While seemingly convenient, this has the following niggles.

* It's impossible to re-init a released reference counter without
  going through re-allocation.

* In the similar vein, it's impossible to initialize a percpu_ref
  count with static percpu variables.

* We need and have an explicit destructor anyway for failure paths -
  percpu_ref_cancel_init().

This patch removes the automatic percpu counter freeing in
percpu_ref_kill_rcu() and repurposes percpu_ref_cancel_init() into a
generic destructor now named percpu_ref_exit().  percpu_ref_destroy()
is considered but it gets confusing with percpu_ref_kill() while
"exit" clearly indicates that it's the counterpart of
percpu_ref_init().

All percpu_ref_cancel_init() users are updated to invoke
percpu_ref_exit() instead and explicit percpu_ref_exit() calls are
added to the destruction path of all percpu_ref users.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Benjamin LaHaise <bcrl@kvack.org>
Cc: Kent Overstreet <kmo@daterainc.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Cc: Nicholas A. Bellinger <nab@linux-iscsi.org>
Cc: Li Zefan <lizefan@huawei.com>
---
 drivers/target/target_core_tpg.c |  4 +++-
 fs/aio.c                         |  6 ++++--
 include/linux/percpu-refcount.h  |  6 ++----
 kernel/cgroup.c                  |  8 +++++---
 lib/percpu-refcount.c            | 33 ++++++++++-----------------------
 5 files changed, 24 insertions(+), 33 deletions(-)

(limited to 'fs/aio.c')

diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c
index c036595b17cf..fddfae61222f 100644
--- a/drivers/target/target_core_tpg.c
+++ b/drivers/target/target_core_tpg.c
@@ -825,7 +825,7 @@ int core_tpg_add_lun(
 
 	ret = core_dev_export(dev, tpg, lun);
 	if (ret < 0) {
-		percpu_ref_cancel_init(&lun->lun_ref);
+		percpu_ref_exit(&lun->lun_ref);
 		return ret;
 	}
 
@@ -880,5 +880,7 @@ int core_tpg_post_dellun(
 	lun->lun_status = TRANSPORT_LUN_STATUS_FREE;
 	spin_unlock(&tpg->tpg_lun_lock);
 
+	percpu_ref_exit(&lun->lun_ref);
+
 	return 0;
 }
diff --git a/fs/aio.c b/fs/aio.c
index 5e0d7f9cb693..ea1bc2e8f4f3 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -506,6 +506,8 @@ static void free_ioctx(struct work_struct *work)
 
 	aio_free_ring(ctx);
 	free_percpu(ctx->cpu);
+	percpu_ref_exit(&ctx->reqs);
+	percpu_ref_exit(&ctx->users);
 	kmem_cache_free(kioctx_cachep, ctx);
 }
 
@@ -715,8 +717,8 @@ err_ctx:
 err:
 	mutex_unlock(&ctx->ring_lock);
 	free_percpu(ctx->cpu);
-	percpu_ref_cancel_init(&ctx->reqs);
-	percpu_ref_cancel_init(&ctx->users);
+	percpu_ref_exit(&ctx->reqs);
+	percpu_ref_exit(&ctx->users);
 	kmem_cache_free(kioctx_cachep, ctx);
 	pr_debug("error allocating ioctx %d\n", err);
 	return ERR_PTR(err);
diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index 6f8cd4c0546c..0ddd2839ca84 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -57,9 +57,7 @@ struct percpu_ref {
 	atomic_t		count;
 	/*
 	 * The low bit of the pointer indicates whether the ref is in percpu
-	 * mode; if set, then get/put will manipulate the atomic_t (this is a
-	 * hack because we need to keep the pointer around for
-	 * percpu_ref_kill_rcu())
+	 * mode; if set, then get/put will manipulate the atomic_t.
 	 */
 	unsigned long		pcpu_count_ptr;
 	percpu_ref_func_t	*release;
@@ -69,7 +67,7 @@ struct percpu_ref {
 
 int __must_check percpu_ref_init(struct percpu_ref *ref,
 				 percpu_ref_func_t *release);
-void percpu_ref_cancel_init(struct percpu_ref *ref);
+void percpu_ref_exit(struct percpu_ref *ref);
 void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
 				 percpu_ref_func_t *confirm_kill);
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 7868fc3c0bc5..c06aa5e257a8 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1638,7 +1638,7 @@ destroy_root:
 exit_root_id:
 	cgroup_exit_root_id(root);
 cancel_ref:
-	percpu_ref_cancel_init(&root_cgrp->self.refcnt);
+	percpu_ref_exit(&root_cgrp->self.refcnt);
 out:
 	free_cgrp_cset_links(&tmp_links);
 	return ret;
@@ -4133,6 +4133,8 @@ static void css_free_work_fn(struct work_struct *work)
 		container_of(work, struct cgroup_subsys_state, destroy_work);
 	struct cgroup *cgrp = css->cgroup;
 
+	percpu_ref_exit(&css->refcnt);
+
 	if (css->ss) {
 		/* css free path */
 		if (css->parent)
@@ -4330,7 +4332,7 @@ err_list_del:
 err_free_id:
 	cgroup_idr_remove(&ss->css_idr, css->id);
 err_free_percpu_ref:
-	percpu_ref_cancel_init(&css->refcnt);
+	percpu_ref_exit(&css->refcnt);
 err_free_css:
 	call_rcu(&css->rcu_head, css_free_rcu_fn);
 	return err;
@@ -4441,7 +4443,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
 out_free_id:
 	cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
 out_cancel_ref:
-	percpu_ref_cancel_init(&cgrp->self.refcnt);
+	percpu_ref_exit(&cgrp->self.refcnt);
 out_free_cgrp:
 	kfree(cgrp);
 out_unlock:
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index 94e5b624de64..ac4299120087 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -61,36 +61,25 @@ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release)
 EXPORT_SYMBOL_GPL(percpu_ref_init);
 
 /**
- * percpu_ref_cancel_init - cancel percpu_ref_init()
- * @ref: percpu_ref to cancel init for
+ * percpu_ref_exit - undo percpu_ref_init()
+ * @ref: percpu_ref to exit
  *
- * Once a percpu_ref is initialized, its destruction is initiated by
- * percpu_ref_kill() and completes asynchronously, which can be painful to
- * do when destroying a half-constructed object in init failure path.
- *
- * This function destroys @ref without invoking @ref->release and the
- * memory area containing it can be freed immediately on return.  To
- * prevent accidental misuse, it's required that @ref has finished
- * percpu_ref_init(), whether successful or not, but never used.
- *
- * The weird name and usage restriction are to prevent people from using
- * this function by mistake for normal shutdown instead of
- * percpu_ref_kill().
+ * This function exits @ref.  The caller is responsible for ensuring that
+ * @ref is no longer in active use.  The usual places to invoke this
+ * function from are the @ref->release() callback or in init failure path
+ * where percpu_ref_init() succeeded but other parts of the initialization
+ * of the embedding object failed.
  */
-void percpu_ref_cancel_init(struct percpu_ref *ref)
+void percpu_ref_exit(struct percpu_ref *ref)
 {
 	unsigned __percpu *pcpu_count = pcpu_count_ptr(ref);
-	int cpu;
-
-	WARN_ON_ONCE(atomic_read(&ref->count) != 1 + PCPU_COUNT_BIAS);
 
 	if (pcpu_count) {
-		for_each_possible_cpu(cpu)
-			WARN_ON_ONCE(*per_cpu_ptr(pcpu_count, cpu));
 		free_percpu(pcpu_count);
+		ref->pcpu_count_ptr = PCPU_REF_DEAD;
 	}
 }
-EXPORT_SYMBOL_GPL(percpu_ref_cancel_init);
+EXPORT_SYMBOL_GPL(percpu_ref_exit);
 
 static void percpu_ref_kill_rcu(struct rcu_head *rcu)
 {
@@ -102,8 +91,6 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu)
 	for_each_possible_cpu(cpu)
 		count += *per_cpu_ptr(pcpu_count, cpu);
 
-	free_percpu(pcpu_count);
-
 	pr_debug("global %i pcpu %i", atomic_read(&ref->count), (int) count);
 
 	/*
-- 
cgit v1.2.3


From be6fb451a24582732c66e28cb0beb3f19c4289fd Mon Sep 17 00:00:00 2001
From: Benjamin LaHaise <bcrl@kvack.org>
Date: Tue, 22 Jul 2014 09:56:56 -0400
Subject: aio: remove no longer needed preempt_disable()

Based on feedback from Jens Axboe on 263782c1c95bbddbb022dc092fd89a36bb8d5577,
clean up get/put_reqs_available() to remove the no longer needed preempt_disable()
and preempt_enable() pair.

Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
Cc: Jens Axboe <axboe@kernel.dk>
---
 fs/aio.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'fs/aio.c')

diff --git a/fs/aio.c b/fs/aio.c
index 8216aa0c7539..9ce9e8ea773f 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -814,10 +814,8 @@ static void put_reqs_available(struct kioctx *ctx, unsigned nr)
 	struct kioctx_cpu *kcpu;
 	unsigned long flags;
 
-	preempt_disable();
-	kcpu = this_cpu_ptr(ctx->cpu);
-
 	local_irq_save(flags);
+	kcpu = this_cpu_ptr(ctx->cpu);
 	kcpu->reqs_available += nr;
 
 	while (kcpu->reqs_available >= ctx->req_batch * 2) {
@@ -826,7 +824,6 @@ static void put_reqs_available(struct kioctx *ctx, unsigned nr)
 	}
 
 	local_irq_restore(flags);
-	preempt_enable();
 }
 
 static bool get_reqs_available(struct kioctx *ctx)
@@ -835,10 +832,8 @@ static bool get_reqs_available(struct kioctx *ctx)
 	bool ret = false;
 	unsigned long flags;
 
-	preempt_disable();
-	kcpu = this_cpu_ptr(ctx->cpu);
-
 	local_irq_save(flags);
+	kcpu = this_cpu_ptr(ctx->cpu);
 	if (!kcpu->reqs_available) {
 		int old, avail = atomic_read(&ctx->reqs_available);
 
@@ -858,7 +853,6 @@ static bool get_reqs_available(struct kioctx *ctx)
 	kcpu->reqs_available--;
 out:
 	local_irq_restore(flags);
-	preempt_enable();
 	return ret;
 }
 
-- 
cgit v1.2.3


From b53f1f82fbde8dcf34ab7d731c2a9ae6f0d8d2e2 Mon Sep 17 00:00:00 2001
From: Gu Zheng <guz.fnst@cn.fujitsu.com>
Date: Wed, 23 Jul 2014 18:03:51 +0800
Subject: aio: remove the needless registration of ring file's private_data

Remove the registration of ring file's private_data, we do not use
it.

Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
---
 fs/aio.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs/aio.c')

diff --git a/fs/aio.c b/fs/aio.c
index 9ce9e8ea773f..55e03a858e8b 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -192,7 +192,6 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 	}
 
 	file->f_flags = O_RDWR;
-	file->private_data = ctx;
 	return file;
 }
 
-- 
cgit v1.2.3


From 8dc4379e17cddad7b2088a3f300ded50d2a6d493 Mon Sep 17 00:00:00 2001
From: Gu Zheng <guz.fnst@cn.fujitsu.com>
Date: Wed, 23 Jul 2014 18:03:52 +0800
Subject: aio: use the macro rather than the inline magic number

Replace the inline magic number with the ready-made macro(AIO_RING_MAGIC),
just clean up.

Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
---
 fs/aio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/aio.c')

diff --git a/fs/aio.c b/fs/aio.c
index 55e03a858e8b..6fc6b9857a58 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -201,7 +201,7 @@ static struct dentry *aio_mount(struct file_system_type *fs_type,
 	static const struct dentry_operations ops = {
 		.d_dname	= simple_dname,
 	};
-	return mount_pseudo(fs_type, "aio:", NULL, &ops, 0xa10a10a1);
+	return mount_pseudo(fs_type, "aio:", NULL, &ops, AIO_RING_MAGIC);
 }
 
 /* aio_setup
-- 
cgit v1.2.3


From 2be4e7deec2d4398a0eb2165cc04086ebfc831d2 Mon Sep 17 00:00:00 2001
From: Gu Zheng <guz.fnst@cn.fujitsu.com>
Date: Wed, 23 Jul 2014 18:03:53 +0800
Subject: aio: fix some comments

The function comments of aio_run_iocb and aio_read_events are out of date, so
fix them here.

Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
---
 fs/aio.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs/aio.c')

diff --git a/fs/aio.c b/fs/aio.c
index 6fc6b9857a58..d6d9520f6866 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1020,7 +1020,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 }
 EXPORT_SYMBOL(aio_complete);
 
-/* aio_read_events
+/* aio_read_events_ring
  *	Pull an event off of the ioctx's event ring.  Returns the number of
  *	events fetched
  */
@@ -1272,9 +1272,8 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb,
 }
 
 /*
- * aio_setup_iocb:
- *	Performs the initial checks and aio retry method
- *	setup for the kiocb at the time of io submission.
+ * aio_run_iocb:
+ *	Performs the initial checks and io submission.
  */
 static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
 			    char __user *buf, bool compat)
-- 
cgit v1.2.3


From 00fefb9cf2b5493a86912de55ba912bdfae4a207 Mon Sep 17 00:00:00 2001
From: Gu Zheng <guz.fnst@cn.fujitsu.com>
Date: Wed, 23 Jul 2014 18:03:54 +0800
Subject: aio: use iovec array rather than the single one

Previously, we only offer a single iovec to handle all the read/write cases, so
the PREADV/PWRITEV request always need to alloc more iovec buffer when copying
user vectors.
If we use a tmp iovec array rather than the single one, some small PREADV/PWRITEV
workloads(vector size small than the tmp buffer) will not need to alloc more
iovec buffer when copying user vectors.

Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
---
 fs/aio.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs/aio.c')

diff --git a/fs/aio.c b/fs/aio.c
index d6d9520f6866..0fd9181d8c0c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1243,12 +1243,12 @@ static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb,
 	if (compat)
 		ret = compat_rw_copy_check_uvector(rw,
 				(struct compat_iovec __user *)buf,
-				*nr_segs, 1, *iovec, iovec);
+				*nr_segs, UIO_FASTIOV, *iovec, iovec);
 	else
 #endif
 		ret = rw_copy_check_uvector(rw,
 				(struct iovec __user *)buf,
-				*nr_segs, 1, *iovec, iovec);
+				*nr_segs, UIO_FASTIOV, *iovec, iovec);
 	if (ret < 0)
 		return ret;
 
@@ -1285,7 +1285,7 @@ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
 	fmode_t mode;
 	aio_rw_op *rw_op;
 	rw_iter_op *iter_op;
-	struct iovec inline_vec, *iovec = &inline_vec;
+	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct iov_iter iter;
 
 	switch (opcode) {
@@ -1320,7 +1320,7 @@ rw_common:
 		if (!ret)
 			ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
 		if (ret < 0) {
-			if (iovec != &inline_vec)
+			if (iovec != inline_vecs)
 				kfree(iovec);
 			return ret;
 		}
@@ -1367,7 +1367,7 @@ rw_common:
 		return -EINVAL;
 	}
 
-	if (iovec != &inline_vec)
+	if (iovec != inline_vecs)
 		kfree(iovec);
 
 	if (ret != -EIOCBQUEUED) {
-- 
cgit v1.2.3


From d856f32a86b2b015ab180ab7a55e455ed8d3ccc5 Mon Sep 17 00:00:00 2001
From: Benjamin LaHaise <bcrl@kvack.org>
Date: Sun, 24 Aug 2014 13:14:05 -0400
Subject: aio: fix reqs_available handling

As reported by Dan Aloni, commit f8567a3845ac ("aio: fix aio request
leak when events are reaped by userspace") introduces a regression when
user code attempts to perform io_submit() with more events than are
available in the ring buffer.  Reverting that commit would reintroduce a
regression when user space event reaping is used.

Fixing this bug is a bit more involved than the previous attempts to fix
this regression.  Since we do not have a single point at which we can
count events as being reaped by user space and io_getevents(), we have
to track event completion by looking at the number of events left in the
event ring.  So long as there are as many events in the ring buffer as
there have been completion events generate, we cannot call
put_reqs_available().  The code to check for this is now placed in
refill_reqs_available().

A test program from Dan and modified by me for verifying this bug is available
at http://www.kvack.org/~bcrl/20140824-aio_bug.c .

Reported-by: Dan Aloni <dan@kernelim.com>
Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
Acked-by: Dan Aloni <dan@kernelim.com>
Cc: Kent Overstreet <kmo@daterainc.com>
Cc: Mateusz Guzik <mguzik@redhat.com>
Cc: Petr Matousek <pmatouse@redhat.com>
Cc: stable@vger.kernel.org      # v3.16 and anything that f8567a3845ac was backported to
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/aio.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 73 insertions(+), 4 deletions(-)

(limited to 'fs/aio.c')

diff --git a/fs/aio.c b/fs/aio.c
index ae635872affb..97bc62cbe2da 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -141,6 +141,7 @@ struct kioctx {
 
 	struct {
 		unsigned	tail;
+		unsigned	completed_events;
 		spinlock_t	completion_lock;
 	} ____cacheline_aligned_in_smp;
 
@@ -857,6 +858,68 @@ out:
 	return ret;
 }
 
+/* refill_reqs_available
+ *	Updates the reqs_available reference counts used for tracking the
+ *	number of free slots in the completion ring.  This can be called
+ *	from aio_complete() (to optimistically update reqs_available) or
+ *	from aio_get_req() (the we're out of events case).  It must be
+ *	called holding ctx->completion_lock.
+ */
+static void refill_reqs_available(struct kioctx *ctx, unsigned head,
+                                  unsigned tail)
+{
+	unsigned events_in_ring, completed;
+
+	/* Clamp head since userland can write to it. */
+	head %= ctx->nr_events;
+	if (head <= tail)
+		events_in_ring = tail - head;
+	else
+		events_in_ring = ctx->nr_events - (head - tail);
+
+	completed = ctx->completed_events;
+	if (events_in_ring < completed)
+		completed -= events_in_ring;
+	else
+		completed = 0;
+
+	if (!completed)
+		return;
+
+	ctx->completed_events -= completed;
+	put_reqs_available(ctx, completed);
+}
+
+/* user_refill_reqs_available
+ *	Called to refill reqs_available when aio_get_req() encounters an
+ *	out of space in the completion ring.
+ */
+static void user_refill_reqs_available(struct kioctx *ctx)
+{
+	spin_lock_irq(&ctx->completion_lock);
+	if (ctx->completed_events) {
+		struct aio_ring *ring;
+		unsigned head;
+
+		/* Access of ring->head may race with aio_read_events_ring()
+		 * here, but that's okay since whether we read the old version
+		 * or the new version, and either will be valid.  The important
+		 * part is that head cannot pass tail since we prevent
+		 * aio_complete() from updating tail by holding
+		 * ctx->completion_lock.  Even if head is invalid, the check
+		 * against ctx->completed_events below will make sure we do the
+		 * safe/right thing.
+		 */
+		ring = kmap_atomic(ctx->ring_pages[0]);
+		head = ring->head;
+		kunmap_atomic(ring);
+
+		refill_reqs_available(ctx, head, ctx->tail);
+	}
+
+	spin_unlock_irq(&ctx->completion_lock);
+}
+
 /* aio_get_req
  *	Allocate a slot for an aio request.
  * Returns NULL if no requests are free.
@@ -865,8 +928,11 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx)
 {
 	struct kiocb *req;
 
-	if (!get_reqs_available(ctx))
-		return NULL;
+	if (!get_reqs_available(ctx)) {
+		user_refill_reqs_available(ctx);
+		if (!get_reqs_available(ctx))
+			return NULL;
+	}
 
 	req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
 	if (unlikely(!req))
@@ -925,8 +991,8 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 	struct kioctx	*ctx = iocb->ki_ctx;
 	struct aio_ring	*ring;
 	struct io_event	*ev_page, *event;
+	unsigned tail, pos, head;
 	unsigned long	flags;
-	unsigned tail, pos;
 
 	/*
 	 * Special case handling for sync iocbs:
@@ -987,10 +1053,14 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 	ctx->tail = tail;
 
 	ring = kmap_atomic(ctx->ring_pages[0]);
+	head = ring->head;
 	ring->tail = tail;
 	kunmap_atomic(ring);
 	flush_dcache_page(ctx->ring_pages[0]);
 
+	ctx->completed_events++;
+	if (ctx->completed_events > 1)
+		refill_reqs_available(ctx, head, tail);
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
 	pr_debug("added to ring %p at [%u]\n", iocb, tail);
@@ -1005,7 +1075,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 
 	/* everything turned out well, dispose of the aiocb. */
 	kiocb_free(iocb);
-	put_reqs_available(ctx, 1);
 
 	/*
 	 * We have to order our ring_info tail store above and test
-- 
cgit v1.2.3


From 2ff396be602f10b5eab8e73b24f20348fa2de159 Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Tue, 2 Sep 2014 13:17:00 -0400
Subject: aio: add missing smp_rmb() in read_events_ring

We ran into a case on ppc64 running mariadb where io_getevents would
return zeroed out I/O events.  After adding instrumentation, it became
clear that there was some missing synchronization between reading the
tail pointer and the events themselves.  This small patch fixes the
problem in testing.

Thanks to Zach for helping to look into this, and suggesting the fix.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
Cc: stable@vger.kernel.org
---
 fs/aio.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs/aio.c')

diff --git a/fs/aio.c b/fs/aio.c
index 97bc62cbe2da..5f2e9c6c328e 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1111,6 +1111,12 @@ static long aio_read_events_ring(struct kioctx *ctx,
 	tail = ring->tail;
 	kunmap_atomic(ring);
 
+	/*
+	 * Ensure that once we've read the current tail pointer, that
+	 * we also see the events that were stored up to the tail.
+	 */
+	smp_rmb();
+
 	pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events);
 
 	if (head == tail)
-- 
cgit v1.2.3


From 6098b45b32e6baeacc04790773ced9340601d511 Mon Sep 17 00:00:00 2001
From: Gu Zheng <guz.fnst@cn.fujitsu.com>
Date: Wed, 3 Sep 2014 17:45:44 +0800
Subject: aio: block exit_aio() until all context requests are completed

It seems that exit_aio() also needs to wait for all iocbs to complete (like
io_destroy), but we missed the wait step in current implemention, so fix
it in the same way as we did in io_destroy.

Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
Cc: stable@vger.kernel.org
---
 fs/aio.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs/aio.c')

diff --git a/fs/aio.c b/fs/aio.c
index 5f2e9c6c328e..733750096b71 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -793,6 +793,8 @@ void exit_aio(struct mm_struct *mm)
 
 	for (i = 0; i < table->nr; ++i) {
 		struct kioctx *ctx = table->table[i];
+		struct completion requests_done =
+			COMPLETION_INITIALIZER_ONSTACK(requests_done);
 
 		if (!ctx)
 			continue;
@@ -804,7 +806,10 @@ void exit_aio(struct mm_struct *mm)
 		 * that it needs to unmap the area, just set it to 0.
 		 */
 		ctx->mmap_size = 0;
-		kill_ioctx(mm, ctx, NULL);
+		kill_ioctx(mm, ctx, &requests_done);
+
+		/* Wait until all IO for the context are done. */
+		wait_for_completion(&requests_done);
 	}
 
 	RCU_INIT_POINTER(mm->ioctx_table, NULL);
-- 
cgit v1.2.3