From a492f075450f3ba87de36e5ffe92a9d0c7af9723 Mon Sep 17 00:00:00 2001
From: Joe Lawrence <joe.lawrence@stratus.com>
Date: Thu, 28 Aug 2014 08:15:21 -0600
Subject: block,scsi: fixup blk_get_request dead queue scenarios

The blk_get_request function may fail in low-memory conditions or during
device removal (even if __GFP_WAIT is set). To distinguish between these
errors, modify the blk_get_request call stack to return the appropriate
ERR_PTR. Verify that all callers check the return status and consider
IS_ERR instead of a simple NULL pointer check.

For consistency, make a similar change to the blk_mq_alloc_request leg
of blk_get_request.  It may fail if the queue is dead, or the caller was
unwilling to wait.

Signed-off-by: Joe Lawrence <joe.lawrence@stratus.com>
Acked-by: Jiri Kosina <jkosina@suse.cz> [for pktdvd]
Acked-by: Boaz Harrosh <bharrosh@panasas.com> [for osd]
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

(limited to 'block/blk-core.c')

diff --git a/block/blk-core.c b/block/blk-core.c
index c359d72e9d76..93603e6ff479 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -933,9 +933,9 @@ static struct io_context *rq_ioc(struct bio *bio)
  * Get a free request from @q.  This function may fail under memory
  * pressure or if @q is dead.
  *
- * Must be callled with @q->queue_lock held and,
- * Returns %NULL on failure, with @q->queue_lock held.
- * Returns !%NULL on success, with @q->queue_lock *not held*.
+ * Must be called with @q->queue_lock held and,
+ * Returns ERR_PTR on failure, with @q->queue_lock held.
+ * Returns request pointer on success, with @q->queue_lock *not held*.
  */
 static struct request *__get_request(struct request_list *rl, int rw_flags,
 				     struct bio *bio, gfp_t gfp_mask)
@@ -949,7 +949,7 @@ static struct request *__get_request(struct request_list *rl, int rw_flags,
 	int may_queue;
 
 	if (unlikely(blk_queue_dying(q)))
-		return NULL;
+		return ERR_PTR(-ENODEV);
 
 	may_queue = elv_may_queue(q, rw_flags);
 	if (may_queue == ELV_MQUEUE_NO)
@@ -974,7 +974,7 @@ static struct request *__get_request(struct request_list *rl, int rw_flags,
 					 * process is not a "batcher", and not
 					 * exempted by the IO scheduler
 					 */
-					return NULL;
+					return ERR_PTR(-ENOMEM);
 				}
 			}
 		}
@@ -992,7 +992,7 @@ static struct request *__get_request(struct request_list *rl, int rw_flags,
 	 * allocated with any setting of ->nr_requests
 	 */
 	if (rl->count[is_sync] >= (3 * q->nr_requests / 2))
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	q->nr_rqs[is_sync]++;
 	rl->count[is_sync]++;
@@ -1097,7 +1097,7 @@ fail_alloc:
 rq_starved:
 	if (unlikely(rl->count[is_sync] == 0))
 		rl->starved[is_sync] = 1;
-	return NULL;
+	return ERR_PTR(-ENOMEM);
 }
 
 /**
@@ -1110,9 +1110,9 @@ rq_starved:
  * Get a free request from @q.  If %__GFP_WAIT is set in @gfp_mask, this
  * function keeps retrying under memory pressure and fails iff @q is dead.
  *
- * Must be callled with @q->queue_lock held and,
- * Returns %NULL on failure, with @q->queue_lock held.
- * Returns !%NULL on success, with @q->queue_lock *not held*.
+ * Must be called with @q->queue_lock held and,
+ * Returns ERR_PTR on failure, with @q->queue_lock held.
+ * Returns request pointer on success, with @q->queue_lock *not held*.
  */
 static struct request *get_request(struct request_queue *q, int rw_flags,
 				   struct bio *bio, gfp_t gfp_mask)
@@ -1125,12 +1125,12 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
 	rl = blk_get_rl(q, bio);	/* transferred to @rq on success */
 retry:
 	rq = __get_request(rl, rw_flags, bio, gfp_mask);
-	if (rq)
+	if (!IS_ERR(rq))
 		return rq;
 
 	if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dying(q))) {
 		blk_put_rl(rl);
-		return NULL;
+		return rq;
 	}
 
 	/* wait on @rl and retry */
@@ -1167,7 +1167,7 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw,
 
 	spin_lock_irq(q->queue_lock);
 	rq = get_request(q, rw, NULL, gfp_mask);
-	if (!rq)
+	if (IS_ERR(rq))
 		spin_unlock_irq(q->queue_lock);
 	/* q->queue_lock is unlocked at this point */
 
@@ -1219,8 +1219,8 @@ struct request *blk_make_request(struct request_queue *q, struct bio *bio,
 {
 	struct request *rq = blk_get_request(q, bio_data_dir(bio), gfp_mask);
 
-	if (unlikely(!rq))
-		return ERR_PTR(-ENOMEM);
+	if (IS_ERR(rq))
+		return rq;
 
 	blk_rq_set_block_pc(rq);
 
@@ -1615,8 +1615,8 @@ get_rq:
 	 * Returns with the queue unlocked.
 	 */
 	req = get_request(q, rw_flags, bio, GFP_NOIO);
-	if (unlikely(!req)) {
-		bio_endio(bio, -ENODEV);	/* @q is dead */
+	if (IS_ERR(req)) {
+		bio_endio(bio, PTR_ERR(req));	/* @q is dead */
 		goto out_unlock;
 	}
 
-- 
cgit v1.2.3


From ff9ea323816dc1c8ac7144afd4eab3ac97704430 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 8 Sep 2014 08:03:56 +0900
Subject: block, bdi: an active gendisk always has a request_queue associated
 with it

bdev_get_queue() returns the request_queue associated with the
specified block_device.  blk_get_backing_dev_info() makes use of
bdev_get_queue() to determine the associated bdi given a block_device.

All the callers of bdev_get_queue() including
blk_get_backing_dev_info() assume that bdev_get_queue() may return
NULL and implement NULL handling; however, bdev_get_queue() requires
the passed in block_device is opened and attached to its gendisk.
Because an active gendisk always has a valid request_queue associated
with it, bdev_get_queue() can never return NULL and neither can
blk_get_backing_dev_info().

Make it clear that neither of the two functions can return NULL and
remove NULL handling from all the callers.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Chris Mason <clm@fb.com>
Cc: Dave Chinner <david@fromorbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c       | 10 +++-------
 block/compat_ioctl.c   |  4 ----
 block/ioctl.c          |  4 ----
 fs/block_dev.c         |  2 --
 fs/btrfs/disk-io.c     |  2 +-
 fs/xfs/xfs_buf.c       |  2 --
 include/linux/blkdev.h |  2 +-
 7 files changed, 5 insertions(+), 21 deletions(-)

(limited to 'block/blk-core.c')

diff --git a/block/blk-core.c b/block/blk-core.c
index 93603e6ff479..817446175489 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -83,18 +83,14 @@ void blk_queue_congestion_threshold(struct request_queue *q)
  * @bdev:	device
  *
  * Locates the passed device's request queue and returns the address of its
- * backing_dev_info
- *
- * Will return NULL if the request queue cannot be located.
+ * backing_dev_info.  This function can only be called if @bdev is opened
+ * and the return value is never NULL.
  */
 struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
 {
-	struct backing_dev_info *ret = NULL;
 	struct request_queue *q = bdev_get_queue(bdev);
 
-	if (q)
-		ret = &q->backing_dev_info;
-	return ret;
+	return &q->backing_dev_info;
 }
 EXPORT_SYMBOL(blk_get_backing_dev_info);
 
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 18b282ce361e..f678c733df40 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -709,8 +709,6 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 		if (!arg)
 			return -EINVAL;
 		bdi = blk_get_backing_dev_info(bdev);
-		if (bdi == NULL)
-			return -ENOTTY;
 		return compat_put_long(arg,
 				       (bdi->ra_pages * PAGE_CACHE_SIZE) / 512);
 	case BLKROGET: /* compatible */
@@ -731,8 +729,6 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 		if (!capable(CAP_SYS_ADMIN))
 			return -EACCES;
 		bdi = blk_get_backing_dev_info(bdev);
-		if (bdi == NULL)
-			return -ENOTTY;
 		bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE;
 		return 0;
 	case BLKGETSIZE:
diff --git a/block/ioctl.c b/block/ioctl.c
index d6cda8147c91..6c7bf903742f 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -356,8 +356,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 		if (!arg)
 			return -EINVAL;
 		bdi = blk_get_backing_dev_info(bdev);
-		if (bdi == NULL)
-			return -ENOTTY;
 		return put_long(arg, (bdi->ra_pages * PAGE_CACHE_SIZE) / 512);
 	case BLKROGET:
 		return put_int(arg, bdev_read_only(bdev) != 0);
@@ -386,8 +384,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 		if(!capable(CAP_SYS_ADMIN))
 			return -EACCES;
 		bdi = blk_get_backing_dev_info(bdev);
-		if (bdi == NULL)
-			return -ENOTTY;
 		bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE;
 		return 0;
 	case BLKBSZSET:
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 6d7274619bf9..d3251eca6429 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1173,8 +1173,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 			if (!ret) {
 				bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
 				bdi = blk_get_backing_dev_info(bdev);
-				if (bdi == NULL)
-					bdi = &default_backing_dev_info;
 				bdev_inode_switch_bdi(bdev->bd_inode, bdi);
 			}
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d0ed9e664f7d..39ff591ae1b4 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1694,7 +1694,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 		if (!device->bdev)
 			continue;
 		bdi = blk_get_backing_dev_info(device->bdev);
-		if (bdi && bdi_congested(bdi, bdi_bits)) {
+		if (bdi_congested(bdi, bdi_bits)) {
 			ret = 1;
 			break;
 		}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index cd7b8ca9b064..497fcde381d7 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1678,8 +1678,6 @@ xfs_alloc_buftarg(
 	btp->bt_dev =  bdev->bd_dev;
 	btp->bt_bdev = bdev;
 	btp->bt_bdi = blk_get_backing_dev_info(bdev);
-	if (!btp->bt_bdi)
-		goto error;
 
 	if (xfs_setsize_buftarg_early(btp, bdev))
 		goto error;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 518b46555b80..e267bf0db559 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -865,7 +865,7 @@ extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *,
 
 static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
 {
-	return bdev->bd_disk->queue;
+	return bdev->bd_disk->queue;	/* this is never NULL */
 }
 
 /*
-- 
cgit v1.2.3


From f355265571440a7db16e784b6edf4e7d26971a03 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Thu, 25 Sep 2014 23:23:40 +0800
Subject: block: introduce blk_init_flush and its pair

These two temporary functions are introduced for holding flush
initialization and de-initialization, so that we can
introduce 'flush queue' easier in the following patch. And
once 'flush queue' and its allocation/free functions are ready,
they will be removed for sake of code readability.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c  |  5 ++---
 block/blk-flush.c | 19 ++++++++++++++++++-
 block/blk-mq.c    |  2 +-
 block/blk-mq.h    |  1 -
 block/blk-sysfs.c |  4 ++--
 block/blk.h       |  3 +++
 6 files changed, 26 insertions(+), 8 deletions(-)

(limited to 'block/blk-core.c')

diff --git a/block/blk-core.c b/block/blk-core.c
index 6946a4275e6f..0a9d17269957 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -705,8 +705,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
 	if (!q)
 		return NULL;
 
-	q->flush_rq = kzalloc(sizeof(struct request), GFP_KERNEL);
-	if (!q->flush_rq)
+	if (blk_init_flush(q))
 		return NULL;
 
 	if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
@@ -742,7 +741,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
 	return q;
 
 fail:
-	kfree(q->flush_rq);
+	blk_exit_flush(q);
 	return NULL;
 }
 EXPORT_SYMBOL(blk_init_allocated_queue);
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 55028a707927..c72ab32fd8eb 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -472,7 +472,7 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
 }
 EXPORT_SYMBOL(blkdev_issue_flush);
 
-int blk_mq_init_flush(struct request_queue *q)
+static int blk_mq_init_flush(struct request_queue *q)
 {
 	struct blk_mq_tag_set *set = q->tag_set;
 
@@ -485,3 +485,20 @@ int blk_mq_init_flush(struct request_queue *q)
 		return -ENOMEM;
 	return 0;
 }
+
+int blk_init_flush(struct request_queue *q)
+{
+	if (q->mq_ops)
+		return blk_mq_init_flush(q);
+
+	q->flush_rq = kzalloc(sizeof(struct request), GFP_KERNEL);
+	if (!q->flush_rq)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void blk_exit_flush(struct request_queue *q)
+{
+	kfree(q->flush_rq);
+}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 78bcf8bfb22a..2758cdf2de94 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1859,7 +1859,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 
 	blk_mq_add_queue_tag_set(set, q);
 
-	if (blk_mq_init_flush(q))
+	if (blk_init_flush(q))
 		goto err_hw_queues;
 
 	blk_mq_map_swqueue(q);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index ecac69c08937..d567d5283ffa 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -27,7 +27,6 @@ struct blk_mq_ctx {
 
 void __blk_mq_complete_request(struct request *rq);
 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
-int blk_mq_init_flush(struct request_queue *q);
 void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_free_queue(struct request_queue *q);
 void blk_mq_clone_flush_request(struct request *flush_rq,
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 17f5c84ce7bf..949075952119 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -517,11 +517,11 @@ static void blk_release_queue(struct kobject *kobj)
 	if (q->queue_tags)
 		__blk_queue_free_tags(q);
 
+	blk_exit_flush(q);
+
 	if (q->mq_ops)
 		blk_mq_free_queue(q);
 
-	kfree(q->flush_rq);
-
 	blk_trace_shutdown(q);
 
 	bdi_destroy(&q->backing_dev_info);
diff --git a/block/blk.h b/block/blk.h
index e515a285d4c9..c6fa3d4c6a89 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -22,6 +22,9 @@ static inline void __blk_get_queue(struct request_queue *q)
 	kobject_get(&q->kobj);
 }
 
+int blk_init_flush(struct request_queue *q);
+void blk_exit_flush(struct request_queue *q);
+
 int blk_init_rl(struct request_list *rl, struct request_queue *q,
 		gfp_t gfp_mask);
 void blk_exit_rl(struct request_list *rl);
-- 
cgit v1.2.3


From 3c09676c12b1dabf84acbb5849bfc54acadaf092 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Thu, 25 Sep 2014 23:23:41 +0800
Subject: block: move flush initialization to blk_flush_init

These fields are always used with the flush request, so
initialize them together.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c  | 3 ---
 block/blk-flush.c | 4 ++++
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'block/blk-core.c')

diff --git a/block/blk-core.c b/block/blk-core.c
index 0a9d17269957..222fe84d6ac4 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -600,9 +600,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 #ifdef CONFIG_BLK_CGROUP
 	INIT_LIST_HEAD(&q->blkg_list);
 #endif
-	INIT_LIST_HEAD(&q->flush_queue[0]);
-	INIT_LIST_HEAD(&q->flush_queue[1]);
-	INIT_LIST_HEAD(&q->flush_data_in_flight);
 	INIT_DELAYED_WORK(&q->delay_work, blk_delay_work);
 
 	kobject_init(&q->kobj, &blk_queue_ktype);
diff --git a/block/blk-flush.c b/block/blk-flush.c
index c72ab32fd8eb..a49ffbdcfcdc 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -488,6 +488,10 @@ static int blk_mq_init_flush(struct request_queue *q)
 
 int blk_init_flush(struct request_queue *q)
 {
+	INIT_LIST_HEAD(&q->flush_queue[0]);
+	INIT_LIST_HEAD(&q->flush_queue[1]);
+	INIT_LIST_HEAD(&q->flush_data_in_flight);
+
 	if (q->mq_ops)
 		return blk_mq_init_flush(q);
 
-- 
cgit v1.2.3


From 7c94e1c157a227837b04f02f5edeff8301410ba2 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Thu, 25 Sep 2014 23:23:43 +0800
Subject: block: introduce blk_flush_queue to drive flush machinery

This patch introduces 'struct blk_flush_queue' and puts all
flush machinery related fields into this structure, so that

	- flush implementation details aren't exposed to driver
	- it is easy to convert to per dispatch-queue flush machinery

This patch is basically a mechanical replacement.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c       |   4 +-
 block/blk-flush.c      | 109 ++++++++++++++++++++++++++++++-------------------
 block/blk-mq.c         |  10 +++--
 block/blk.h            |  22 +++++++++-
 include/linux/blkdev.h |  10 +----
 5 files changed, 99 insertions(+), 56 deletions(-)

(limited to 'block/blk-core.c')

diff --git a/block/blk-core.c b/block/blk-core.c
index 222fe84d6ac4..cfaca8ca6cc4 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -390,11 +390,13 @@ static void __blk_drain_queue(struct request_queue *q, bool drain_all)
 		 * be drained.  Check all the queues and counters.
 		 */
 		if (drain_all) {
+			struct blk_flush_queue *fq = blk_get_flush_queue(q);
 			drain |= !list_empty(&q->queue_head);
 			for (i = 0; i < 2; i++) {
 				drain |= q->nr_rqs[i];
 				drain |= q->in_flight[i];
-				drain |= !list_empty(&q->flush_queue[i]);
+				if (fq)
+				    drain |= !list_empty(&fq->flush_queue[i]);
 			}
 		}
 
diff --git a/block/blk-flush.c b/block/blk-flush.c
index caf44756d329..b01a86d6bf86 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -28,7 +28,7 @@
  *
  * The actual execution of flush is double buffered.  Whenever a request
  * needs to execute PRE or POSTFLUSH, it queues at
- * q->flush_queue[q->flush_pending_idx].  Once certain criteria are met, a
+ * fq->flush_queue[fq->flush_pending_idx].  Once certain criteria are met, a
  * flush is issued and the pending_idx is toggled.  When the flush
  * completes, all the requests which were pending are proceeded to the next
  * step.  This allows arbitrary merging of different types of FLUSH/FUA
@@ -155,7 +155,7 @@ static bool blk_flush_queue_rq(struct request *rq, bool add_front)
  * completion and trigger the next step.
  *
  * CONTEXT:
- * spin_lock_irq(q->queue_lock or q->mq_flush_lock)
+ * spin_lock_irq(q->queue_lock or fq->mq_flush_lock)
  *
  * RETURNS:
  * %true if requests were added to the dispatch queue, %false otherwise.
@@ -164,7 +164,8 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
 				   int error)
 {
 	struct request_queue *q = rq->q;
-	struct list_head *pending = &q->flush_queue[q->flush_pending_idx];
+	struct blk_flush_queue *fq = blk_get_flush_queue(q);
+	struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
 	bool queued = false, kicked;
 
 	BUG_ON(rq->flush.seq & seq);
@@ -180,12 +181,12 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
 	case REQ_FSEQ_POSTFLUSH:
 		/* queue for flush */
 		if (list_empty(pending))
-			q->flush_pending_since = jiffies;
+			fq->flush_pending_since = jiffies;
 		list_move_tail(&rq->flush.list, pending);
 		break;
 
 	case REQ_FSEQ_DATA:
-		list_move_tail(&rq->flush.list, &q->flush_data_in_flight);
+		list_move_tail(&rq->flush.list, &fq->flush_data_in_flight);
 		queued = blk_flush_queue_rq(rq, true);
 		break;
 
@@ -220,17 +221,18 @@ static void flush_end_io(struct request *flush_rq, int error)
 	bool queued = false;
 	struct request *rq, *n;
 	unsigned long flags = 0;
+	struct blk_flush_queue *fq = blk_get_flush_queue(q);
 
 	if (q->mq_ops) {
-		spin_lock_irqsave(&q->mq_flush_lock, flags);
+		spin_lock_irqsave(&fq->mq_flush_lock, flags);
 		flush_rq->tag = -1;
 	}
 
-	running = &q->flush_queue[q->flush_running_idx];
-	BUG_ON(q->flush_pending_idx == q->flush_running_idx);
+	running = &fq->flush_queue[fq->flush_running_idx];
+	BUG_ON(fq->flush_pending_idx == fq->flush_running_idx);
 
 	/* account completion of the flush request */
-	q->flush_running_idx ^= 1;
+	fq->flush_running_idx ^= 1;
 
 	if (!q->mq_ops)
 		elv_completed_request(q, flush_rq);
@@ -254,13 +256,13 @@ static void flush_end_io(struct request *flush_rq, int error)
 	 * directly into request_fn may confuse the driver.  Always use
 	 * kblockd.
 	 */
-	if (queued || q->flush_queue_delayed) {
+	if (queued || fq->flush_queue_delayed) {
 		WARN_ON(q->mq_ops);
 		blk_run_queue_async(q);
 	}
-	q->flush_queue_delayed = 0;
+	fq->flush_queue_delayed = 0;
 	if (q->mq_ops)
-		spin_unlock_irqrestore(&q->mq_flush_lock, flags);
+		spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
 }
 
 /**
@@ -271,33 +273,34 @@ static void flush_end_io(struct request *flush_rq, int error)
  * Please read the comment at the top of this file for more info.
  *
  * CONTEXT:
- * spin_lock_irq(q->queue_lock or q->mq_flush_lock)
+ * spin_lock_irq(q->queue_lock or fq->mq_flush_lock)
  *
  * RETURNS:
  * %true if flush was issued, %false otherwise.
  */
 static bool blk_kick_flush(struct request_queue *q)
 {
-	struct list_head *pending = &q->flush_queue[q->flush_pending_idx];
+	struct blk_flush_queue *fq = blk_get_flush_queue(q);
+	struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
 	struct request *first_rq =
 		list_first_entry(pending, struct request, flush.list);
-	struct request *flush_rq = q->flush_rq;
+	struct request *flush_rq = fq->flush_rq;
 
 	/* C1 described at the top of this file */
-	if (q->flush_pending_idx != q->flush_running_idx || list_empty(pending))
+	if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending))
 		return false;
 
 	/* C2 and C3 */
-	if (!list_empty(&q->flush_data_in_flight) &&
+	if (!list_empty(&fq->flush_data_in_flight) &&
 	    time_before(jiffies,
-			q->flush_pending_since + FLUSH_PENDING_TIMEOUT))
+			fq->flush_pending_since + FLUSH_PENDING_TIMEOUT))
 		return false;
 
 	/*
 	 * Issue flush and toggle pending_idx.  This makes pending_idx
 	 * different from running_idx, which means flush is in flight.
 	 */
-	q->flush_pending_idx ^= 1;
+	fq->flush_pending_idx ^= 1;
 
 	blk_rq_init(q, flush_rq);
 	if (q->mq_ops)
@@ -329,6 +332,7 @@ static void mq_flush_data_end_io(struct request *rq, int error)
 	struct blk_mq_hw_ctx *hctx;
 	struct blk_mq_ctx *ctx;
 	unsigned long flags;
+	struct blk_flush_queue *fq = blk_get_flush_queue(q);
 
 	ctx = rq->mq_ctx;
 	hctx = q->mq_ops->map_queue(q, ctx->cpu);
@@ -337,10 +341,10 @@ static void mq_flush_data_end_io(struct request *rq, int error)
 	 * After populating an empty queue, kick it to avoid stall.  Read
 	 * the comment in flush_end_io().
 	 */
-	spin_lock_irqsave(&q->mq_flush_lock, flags);
+	spin_lock_irqsave(&fq->mq_flush_lock, flags);
 	if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error))
 		blk_mq_run_hw_queue(hctx, true);
-	spin_unlock_irqrestore(&q->mq_flush_lock, flags);
+	spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
 }
 
 /**
@@ -408,11 +412,13 @@ void blk_insert_flush(struct request *rq)
 	rq->cmd_flags |= REQ_FLUSH_SEQ;
 	rq->flush.saved_end_io = rq->end_io; /* Usually NULL */
 	if (q->mq_ops) {
+		struct blk_flush_queue *fq = blk_get_flush_queue(q);
+
 		rq->end_io = mq_flush_data_end_io;
 
-		spin_lock_irq(&q->mq_flush_lock);
+		spin_lock_irq(&fq->mq_flush_lock);
 		blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);
-		spin_unlock_irq(&q->mq_flush_lock);
+		spin_unlock_irq(&fq->mq_flush_lock);
 		return;
 	}
 	rq->end_io = flush_data_end_io;
@@ -473,31 +479,52 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
 }
 EXPORT_SYMBOL(blkdev_issue_flush);
 
-static int blk_mq_init_flush(struct request_queue *q)
+static struct blk_flush_queue *blk_alloc_flush_queue(
+		struct request_queue *q)
 {
-	struct blk_mq_tag_set *set = q->tag_set;
+	struct blk_flush_queue *fq;
+	int rq_sz = sizeof(struct request);
 
-	spin_lock_init(&q->mq_flush_lock);
+	fq = kzalloc(sizeof(*fq), GFP_KERNEL);
+	if (!fq)
+		goto fail;
 
-	q->flush_rq = kzalloc(round_up(sizeof(struct request) +
-				set->cmd_size, cache_line_size()),
-				GFP_KERNEL);
-	if (!q->flush_rq)
-		return -ENOMEM;
-	return 0;
+	if (q->mq_ops) {
+		spin_lock_init(&fq->mq_flush_lock);
+		rq_sz = round_up(rq_sz + q->tag_set->cmd_size,
+				cache_line_size());
+	}
+
+	fq->flush_rq = kzalloc(rq_sz, GFP_KERNEL);
+	if (!fq->flush_rq)
+		goto fail_rq;
+
+	INIT_LIST_HEAD(&fq->flush_queue[0]);
+	INIT_LIST_HEAD(&fq->flush_queue[1]);
+	INIT_LIST_HEAD(&fq->flush_data_in_flight);
+
+	return fq;
+
+ fail_rq:
+	kfree(fq);
+ fail:
+	return NULL;
 }
 
-int blk_init_flush(struct request_queue *q)
+static void blk_free_flush_queue(struct blk_flush_queue *fq)
 {
-	INIT_LIST_HEAD(&q->flush_queue[0]);
-	INIT_LIST_HEAD(&q->flush_queue[1]);
-	INIT_LIST_HEAD(&q->flush_data_in_flight);
+	/* bio based request queue hasn't flush queue */
+	if (!fq)
+		return;
 
-	if (q->mq_ops)
-		return blk_mq_init_flush(q);
+	kfree(fq->flush_rq);
+	kfree(fq);
+}
 
-	q->flush_rq = kzalloc(sizeof(struct request), GFP_KERNEL);
-	if (!q->flush_rq)
+int blk_init_flush(struct request_queue *q)
+{
+	q->fq = blk_alloc_flush_queue(q);
+	if (!q->fq)
 		return -ENOMEM;
 
 	return 0;
@@ -505,5 +532,5 @@ int blk_init_flush(struct request_queue *q)
 
 void blk_exit_flush(struct request_queue *q)
 {
-	kfree(q->flush_rq);
+	blk_free_flush_queue(q->fq);
 }
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2758cdf2de94..d39e8a5eaeaa 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -508,20 +508,22 @@ void blk_mq_kick_requeue_list(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_mq_kick_requeue_list);
 
-static inline bool is_flush_request(struct request *rq, unsigned int tag)
+static inline bool is_flush_request(struct request *rq,
+		struct blk_flush_queue *fq, unsigned int tag)
 {
 	return ((rq->cmd_flags & REQ_FLUSH_SEQ) &&
-			rq->q->flush_rq->tag == tag);
+			fq->flush_rq->tag == tag);
 }
 
 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 {
 	struct request *rq = tags->rqs[tag];
+	struct blk_flush_queue *fq = blk_get_flush_queue(rq->q);
 
-	if (!is_flush_request(rq, tag))
+	if (!is_flush_request(rq, fq, tag))
 		return rq;
 
-	return rq->q->flush_rq;
+	return fq->flush_rq;
 }
 EXPORT_SYMBOL(blk_mq_tag_to_rq);
 
diff --git a/block/blk.h b/block/blk.h
index c6fa3d4c6a89..833c4ac6c4eb 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -12,11 +12,28 @@
 /* Max future timer expiry for timeouts */
 #define BLK_MAX_TIMEOUT		(5 * HZ)
 
+struct blk_flush_queue {
+	unsigned int		flush_queue_delayed:1;
+	unsigned int		flush_pending_idx:1;
+	unsigned int		flush_running_idx:1;
+	unsigned long		flush_pending_since;
+	struct list_head	flush_queue[2];
+	struct list_head	flush_data_in_flight;
+	struct request		*flush_rq;
+	spinlock_t		mq_flush_lock;
+};
+
 extern struct kmem_cache *blk_requestq_cachep;
 extern struct kmem_cache *request_cachep;
 extern struct kobj_type blk_queue_ktype;
 extern struct ida blk_queue_ida;
 
+static inline struct blk_flush_queue *blk_get_flush_queue(
+		struct request_queue *q)
+{
+	return q->fq;
+}
+
 static inline void __blk_get_queue(struct request_queue *q)
 {
 	kobject_get(&q->kobj);
@@ -89,6 +106,7 @@ void blk_insert_flush(struct request *rq);
 static inline struct request *__elv_next_request(struct request_queue *q)
 {
 	struct request *rq;
+	struct blk_flush_queue *fq = blk_get_flush_queue(q);
 
 	while (1) {
 		if (!list_empty(&q->queue_head)) {
@@ -111,9 +129,9 @@ static inline struct request *__elv_next_request(struct request_queue *q)
 		 * should be restarted later. Please see flush_end_io() for
 		 * details.
 		 */
-		if (q->flush_pending_idx != q->flush_running_idx &&
+		if (fq->flush_pending_idx != fq->flush_running_idx &&
 				!queue_flush_queueable(q)) {
-			q->flush_queue_delayed = 1;
+			fq->flush_queue_delayed = 1;
 			return NULL;
 		}
 		if (unlikely(blk_queue_bypass(q)) ||
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e267bf0db559..49f3461e4272 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -36,6 +36,7 @@ struct request;
 struct sg_io_hdr;
 struct bsg_job;
 struct blkcg_gq;
+struct blk_flush_queue;
 
 #define BLKDEV_MIN_RQ	4
 #define BLKDEV_MAX_RQ	128	/* Default maximum */
@@ -455,14 +456,7 @@ struct request_queue {
 	 */
 	unsigned int		flush_flags;
 	unsigned int		flush_not_queueable:1;
-	unsigned int		flush_queue_delayed:1;
-	unsigned int		flush_pending_idx:1;
-	unsigned int		flush_running_idx:1;
-	unsigned long		flush_pending_since;
-	struct list_head	flush_queue[2];
-	struct list_head	flush_data_in_flight;
-	struct request		*flush_rq;
-	spinlock_t		mq_flush_lock;
+	struct blk_flush_queue	*fq;
 
 	struct list_head	requeue_list;
 	spinlock_t		requeue_lock;
-- 
cgit v1.2.3


From ba483388e3058b3e412632a84e6bf1f134beaf3d Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Thu, 25 Sep 2014 23:23:44 +0800
Subject: block: remove blk_init_flush() and its pair

Now mission of the two helpers is over, and just call
blk_alloc_flush_queue() and blk_free_flush_queue() directly.

Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c  |  5 +++--
 block/blk-flush.c | 19 ++-----------------
 block/blk-mq.c    |  3 ++-
 block/blk-sysfs.c |  2 +-
 block/blk.h       |  4 ++--
 5 files changed, 10 insertions(+), 23 deletions(-)

(limited to 'block/blk-core.c')

diff --git a/block/blk-core.c b/block/blk-core.c
index cfaca8ca6cc4..dba0a8350807 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -704,7 +704,8 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
 	if (!q)
 		return NULL;
 
-	if (blk_init_flush(q))
+	q->fq = blk_alloc_flush_queue(q);
+	if (!q->fq)
 		return NULL;
 
 	if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
@@ -740,7 +741,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
 	return q;
 
 fail:
-	blk_exit_flush(q);
+	blk_free_flush_queue(q->fq);
 	return NULL;
 }
 EXPORT_SYMBOL(blk_init_allocated_queue);
diff --git a/block/blk-flush.c b/block/blk-flush.c
index b01a86d6bf86..d66cbf2b2bc8 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -479,8 +479,7 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
 }
 EXPORT_SYMBOL(blkdev_issue_flush);
 
-static struct blk_flush_queue *blk_alloc_flush_queue(
-		struct request_queue *q)
+struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q)
 {
 	struct blk_flush_queue *fq;
 	int rq_sz = sizeof(struct request);
@@ -511,7 +510,7 @@ static struct blk_flush_queue *blk_alloc_flush_queue(
 	return NULL;
 }
 
-static void blk_free_flush_queue(struct blk_flush_queue *fq)
+void blk_free_flush_queue(struct blk_flush_queue *fq)
 {
 	/* bio based request queue hasn't flush queue */
 	if (!fq)
@@ -520,17 +519,3 @@ static void blk_free_flush_queue(struct blk_flush_queue *fq)
 	kfree(fq->flush_rq);
 	kfree(fq);
 }
-
-int blk_init_flush(struct request_queue *q)
-{
-	q->fq = blk_alloc_flush_queue(q);
-	if (!q->fq)
-		return -ENOMEM;
-
-	return 0;
-}
-
-void blk_exit_flush(struct request_queue *q)
-{
-	blk_free_flush_queue(q->fq);
-}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index d39e8a5eaeaa..59ca79634cb9 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1861,7 +1861,8 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 
 	blk_mq_add_queue_tag_set(set, q);
 
-	if (blk_init_flush(q))
+	q->fq = blk_alloc_flush_queue(q);
+	if (!q->fq)
 		goto err_hw_queues;
 
 	blk_mq_map_swqueue(q);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 949075952119..718cffc4c678 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -517,7 +517,7 @@ static void blk_release_queue(struct kobject *kobj)
 	if (q->queue_tags)
 		__blk_queue_free_tags(q);
 
-	blk_exit_flush(q);
+	blk_free_flush_queue(q->fq);
 
 	if (q->mq_ops)
 		blk_mq_free_queue(q);
diff --git a/block/blk.h b/block/blk.h
index 833c4ac6c4eb..9eaa6e91b13f 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -39,8 +39,8 @@ static inline void __blk_get_queue(struct request_queue *q)
 	kobject_get(&q->kobj);
 }
 
-int blk_init_flush(struct request_queue *q);
-void blk_exit_flush(struct request_queue *q);
+struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q);
+void blk_free_flush_queue(struct blk_flush_queue *fq);
 
 int blk_init_rl(struct request_list *rl, struct request_queue *q,
 		gfp_t gfp_mask);
-- 
cgit v1.2.3


From e97c293cdf77263abdc021de280516e0017afc84 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Thu, 25 Sep 2014 23:23:46 +0800
Subject: block: introduce 'blk_mq_ctx' parameter to blk_get_flush_queue

This patch adds 'blk_mq_ctx' parameter to blk_get_flush_queue(),
so that this function can find the corresponding blk_flush_queue
bound with current mq context since the flush queue will become
per hw-queue.

For legacy queue, the parameter can be simply 'NULL'.

For multiqueue case, the parameter should be set as the context
from which the related request is originated. With this context
info, the hw queue and related flush queue can be found easily.

Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c  |  2 +-
 block/blk-flush.c | 11 +++++------
 block/blk-mq.c    |  3 ++-
 block/blk.h       |  4 ++--
 4 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'block/blk-core.c')

diff --git a/block/blk-core.c b/block/blk-core.c
index dba0a8350807..b1dd4e086740 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -390,7 +390,7 @@ static void __blk_drain_queue(struct request_queue *q, bool drain_all)
 		 * be drained.  Check all the queues and counters.
 		 */
 		if (drain_all) {
-			struct blk_flush_queue *fq = blk_get_flush_queue(q);
+			struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
 			drain |= !list_empty(&q->queue_head);
 			for (i = 0; i < 2; i++) {
 				drain |= q->nr_rqs[i];
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 9bc5b4f35c23..004d95e4098e 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -223,7 +223,7 @@ static void flush_end_io(struct request *flush_rq, int error)
 	bool queued = false;
 	struct request *rq, *n;
 	unsigned long flags = 0;
-	struct blk_flush_queue *fq = blk_get_flush_queue(q);
+	struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx);
 
 	if (q->mq_ops) {
 		spin_lock_irqsave(&fq->mq_flush_lock, flags);
@@ -319,7 +319,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq)
 static void flush_data_end_io(struct request *rq, int error)
 {
 	struct request_queue *q = rq->q;
-	struct blk_flush_queue *fq = blk_get_flush_queue(q);
+	struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
 
 	/*
 	 * After populating an empty queue, kick it to avoid stall.  Read
@@ -333,11 +333,10 @@ static void mq_flush_data_end_io(struct request *rq, int error)
 {
 	struct request_queue *q = rq->q;
 	struct blk_mq_hw_ctx *hctx;
-	struct blk_mq_ctx *ctx;
+	struct blk_mq_ctx *ctx = rq->mq_ctx;
 	unsigned long flags;
-	struct blk_flush_queue *fq = blk_get_flush_queue(q);
+	struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx);
 
-	ctx = rq->mq_ctx;
 	hctx = q->mq_ops->map_queue(q, ctx->cpu);
 
 	/*
@@ -367,7 +366,7 @@ void blk_insert_flush(struct request *rq)
 	struct request_queue *q = rq->q;
 	unsigned int fflags = q->flush_flags;	/* may change, cache */
 	unsigned int policy = blk_flush_policy(fflags, rq);
-	struct blk_flush_queue *fq = blk_get_flush_queue(q);
+	struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx);
 
 	/*
 	 * @policy now records what operations need to be done.  Adjust
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 59ca79634cb9..53b6def12fc4 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -518,7 +518,8 @@ static inline bool is_flush_request(struct request *rq,
 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 {
 	struct request *rq = tags->rqs[tag];
-	struct blk_flush_queue *fq = blk_get_flush_queue(rq->q);
+	/* mq_ctx of flush rq is always cloned from the corresponding req */
+	struct blk_flush_queue *fq = blk_get_flush_queue(rq->q, rq->mq_ctx);
 
 	if (!is_flush_request(rq, fq, tag))
 		return rq;
diff --git a/block/blk.h b/block/blk.h
index 9eaa6e91b13f..7ecdd8517e69 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -29,7 +29,7 @@ extern struct kobj_type blk_queue_ktype;
 extern struct ida blk_queue_ida;
 
 static inline struct blk_flush_queue *blk_get_flush_queue(
-		struct request_queue *q)
+		struct request_queue *q, struct blk_mq_ctx *ctx)
 {
 	return q->fq;
 }
@@ -106,7 +106,7 @@ void blk_insert_flush(struct request *rq);
 static inline struct request *__elv_next_request(struct request_queue *q)
 {
 	struct request *rq;
-	struct blk_flush_queue *fq = blk_get_flush_queue(q);
+	struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
 
 	while (1) {
 		if (!list_empty(&q->queue_head)) {
-- 
cgit v1.2.3


From f70ced09170761acb69840cafaace4abc72cba4b Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Thu, 25 Sep 2014 23:23:47 +0800
Subject: blk-mq: support per-distpatch_queue flush machinery

This patch supports to run one single flush machinery for
each blk-mq dispatch queue, so that:

- current init_request and exit_request callbacks can
cover flush request too, then the buggy copying way of
initializing flush request's pdu can be fixed

- flushing performance gets improved in case of multi hw-queue

In fio sync write test over virtio-blk(4 hw queues, ioengine=sync,
iodepth=64, numjobs=4, bs=4K), it is observed that througput gets
increased a lot over my test environment:
	- throughput: +70% in case of virtio-blk over null_blk
	- throughput: +30% in case of virtio-blk over SSD image

The multi virtqueue feature isn't merged to QEMU yet, and patches for
the feature can be found in below tree:

	git://kernel.ubuntu.com/ming/qemu.git  	v2.1.0-mq.4

And simply passing 'num_queues=4 vectors=5' should be enough to
enable multi queue(quad queue) feature for QEMU virtio-blk.

Suggested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c       |  2 +-
 block/blk-flush.c      | 21 ++++++++++++++-------
 block/blk-mq.c         | 50 ++++++++++++++++++++++++--------------------------
 block/blk-sysfs.c      |  4 ++--
 block/blk.h            | 16 +++++++++++++---
 include/linux/blk-mq.h |  6 ++++++
 6 files changed, 60 insertions(+), 39 deletions(-)

(limited to 'block/blk-core.c')

diff --git a/block/blk-core.c b/block/blk-core.c
index b1dd4e086740..e1c2775c7597 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -704,7 +704,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
 	if (!q)
 		return NULL;
 
-	q->fq = blk_alloc_flush_queue(q);
+	q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, 0);
 	if (!q->fq)
 		return NULL;
 
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 004d95e4098e..20badd7b9d1b 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -305,8 +305,15 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq)
 	fq->flush_pending_idx ^= 1;
 
 	blk_rq_init(q, flush_rq);
-	if (q->mq_ops)
-		blk_mq_clone_flush_request(flush_rq, first_rq);
+
+	/*
+	 * Borrow tag from the first request since they can't
+	 * be in flight at the same time.
+	 */
+	if (q->mq_ops) {
+		flush_rq->mq_ctx = first_rq->mq_ctx;
+		flush_rq->tag = first_rq->tag;
+	}
 
 	flush_rq->cmd_type = REQ_TYPE_FS;
 	flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
@@ -480,22 +487,22 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
 }
 EXPORT_SYMBOL(blkdev_issue_flush);
 
-struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q)
+struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
+		int node, int cmd_size)
 {
 	struct blk_flush_queue *fq;
 	int rq_sz = sizeof(struct request);
 
-	fq = kzalloc(sizeof(*fq), GFP_KERNEL);
+	fq = kzalloc_node(sizeof(*fq), GFP_KERNEL, node);
 	if (!fq)
 		goto fail;
 
 	if (q->mq_ops) {
 		spin_lock_init(&fq->mq_flush_lock);
-		rq_sz = round_up(rq_sz + q->tag_set->cmd_size,
-				cache_line_size());
+		rq_sz = round_up(rq_sz + cmd_size, cache_line_size());
 	}
 
-	fq->flush_rq = kzalloc(rq_sz, GFP_KERNEL);
+	fq->flush_rq = kzalloc_node(rq_sz, GFP_KERNEL, node);
 	if (!fq->flush_rq)
 		goto fail_rq;
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 53b6def12fc4..4e7a31466139 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -281,26 +281,6 @@ void blk_mq_free_request(struct request *rq)
 	__blk_mq_free_request(hctx, ctx, rq);
 }
 
-/*
- * Clone all relevant state from a request that has been put on hold in
- * the flush state machine into the preallocated flush request that hangs
- * off the request queue.
- *
- * For a driver the flush request should be invisible, that's why we are
- * impersonating the original request here.
- */
-void blk_mq_clone_flush_request(struct request *flush_rq,
-		struct request *orig_rq)
-{
-	struct blk_mq_hw_ctx *hctx =
-		orig_rq->q->mq_ops->map_queue(orig_rq->q, orig_rq->mq_ctx->cpu);
-
-	flush_rq->mq_ctx = orig_rq->mq_ctx;
-	flush_rq->tag = orig_rq->tag;
-	memcpy(blk_mq_rq_to_pdu(flush_rq), blk_mq_rq_to_pdu(orig_rq),
-		hctx->cmd_size);
-}
-
 inline void __blk_mq_end_request(struct request *rq, int error)
 {
 	blk_account_io_done(rq);
@@ -1516,12 +1496,20 @@ static void blk_mq_exit_hctx(struct request_queue *q,
 		struct blk_mq_tag_set *set,
 		struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
 {
+	unsigned flush_start_tag = set->queue_depth;
+
 	blk_mq_tag_idle(hctx);
 
+	if (set->ops->exit_request)
+		set->ops->exit_request(set->driver_data,
+				       hctx->fq->flush_rq, hctx_idx,
+				       flush_start_tag + hctx_idx);
+
 	if (set->ops->exit_hctx)
 		set->ops->exit_hctx(hctx, hctx_idx);
 
 	blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
+	blk_free_flush_queue(hctx->fq);
 	kfree(hctx->ctxs);
 	blk_mq_free_bitmap(&hctx->ctx_map);
 }
@@ -1556,6 +1544,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
 		struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
 {
 	int node;
+	unsigned flush_start_tag = set->queue_depth;
 
 	node = hctx->numa_node;
 	if (node == NUMA_NO_NODE)
@@ -1594,8 +1583,23 @@ static int blk_mq_init_hctx(struct request_queue *q,
 	    set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
 		goto free_bitmap;
 
+	hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size);
+	if (!hctx->fq)
+		goto exit_hctx;
+
+	if (set->ops->init_request &&
+	    set->ops->init_request(set->driver_data,
+				   hctx->fq->flush_rq, hctx_idx,
+				   flush_start_tag + hctx_idx, node))
+		goto free_fq;
+
 	return 0;
 
+ free_fq:
+	kfree(hctx->fq);
+ exit_hctx:
+	if (set->ops->exit_hctx)
+		set->ops->exit_hctx(hctx, hctx_idx);
  free_bitmap:
 	blk_mq_free_bitmap(&hctx->ctx_map);
  free_ctxs:
@@ -1862,16 +1866,10 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 
 	blk_mq_add_queue_tag_set(set, q);
 
-	q->fq = blk_alloc_flush_queue(q);
-	if (!q->fq)
-		goto err_hw_queues;
-
 	blk_mq_map_swqueue(q);
 
 	return q;
 
-err_hw_queues:
-	blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
 err_hw:
 	blk_cleanup_queue(q);
 err_hctxs:
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 718cffc4c678..e8f38a36c625 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -517,10 +517,10 @@ static void blk_release_queue(struct kobject *kobj)
 	if (q->queue_tags)
 		__blk_queue_free_tags(q);
 
-	blk_free_flush_queue(q->fq);
-
 	if (q->mq_ops)
 		blk_mq_free_queue(q);
+	else
+		blk_free_flush_queue(q->fq);
 
 	blk_trace_shutdown(q);
 
diff --git a/block/blk.h b/block/blk.h
index 7ecdd8517e69..43b036185712 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -2,6 +2,8 @@
 #define BLK_INTERNAL_H
 
 #include <linux/idr.h>
+#include <linux/blk-mq.h>
+#include "blk-mq.h"
 
 /* Amount of time in which a process may batch requests */
 #define BLK_BATCH_TIME	(HZ/50UL)
@@ -31,7 +33,14 @@ extern struct ida blk_queue_ida;
 static inline struct blk_flush_queue *blk_get_flush_queue(
 		struct request_queue *q, struct blk_mq_ctx *ctx)
 {
-	return q->fq;
+	struct blk_mq_hw_ctx *hctx;
+
+	if (!q->mq_ops)
+		return q->fq;
+
+	hctx = q->mq_ops->map_queue(q, ctx->cpu);
+
+	return hctx->fq;
 }
 
 static inline void __blk_get_queue(struct request_queue *q)
@@ -39,8 +48,9 @@ static inline void __blk_get_queue(struct request_queue *q)
 	kobject_get(&q->kobj);
 }
 
-struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q);
-void blk_free_flush_queue(struct blk_flush_queue *fq);
+struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
+		int node, int cmd_size);
+void blk_free_flush_queue(struct blk_flush_queue *q);
 
 int blk_init_rl(struct request_list *rl, struct request_queue *q,
 		gfp_t gfp_mask);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 325349559fb0..02c5d950f444 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -4,6 +4,7 @@
 #include <linux/blkdev.h>
 
 struct blk_mq_tags;
+struct blk_flush_queue;
 
 struct blk_mq_cpu_notifier {
 	struct list_head list;
@@ -34,6 +35,7 @@ struct blk_mq_hw_ctx {
 
 	struct request_queue	*queue;
 	unsigned int		queue_num;
+	struct blk_flush_queue	*fq;
 
 	void			*driver_data;
 
@@ -119,6 +121,10 @@ struct blk_mq_ops {
 	/*
 	 * Called for every command allocated by the block layer to allow
 	 * the driver to set up driver specific data.
+	 *
+	 * Tag greater than or equal to queue_depth is for setting up
+	 * flush request.
+	 *
 	 * Ditto for exit/teardown.
 	 */
 	init_request_fn		*init_request;
-- 
cgit v1.2.3


From 4a0efdc933680d908de11712a774a2c9492c3d5a Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Wed, 1 Oct 2014 14:32:31 +0200
Subject: block: misplaced rq_complete tracepoint

The rq_complete tracepoint was never issued for empty requests,
causing the resulting blktrace information to never show any
completion for those request.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'block/blk-core.c')

diff --git a/block/blk-core.c b/block/blk-core.c
index e1c2775c7597..4aa9ccd02a50 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2400,11 +2400,11 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 {
 	int total_bytes;
 
+	trace_block_rq_complete(req->q, req, nr_bytes);
+
 	if (!req->bio)
 		return false;
 
-	trace_block_rq_complete(req->q, req, nr_bytes);
-
 	/*
 	 * For fs requests, rq is just carrier of independent bio's
 	 * and each partial completion should be handled separately.
-- 
cgit v1.2.3


From 11dfce509eaa35e8fc81cb50d0910c0e235fd7e2 Mon Sep 17 00:00:00 2001
From: Junichi Nomura <j-nomura@ce.jp.nec.com>
Date: Fri, 3 Oct 2014 17:27:11 -0400
Subject: block: use bio_clone_fast() in blk_rq_prep_clone()

Request cloning clones bios in the request to track the completion
of each bio.
For that purpose, we can use bio_clone_fast() instead of bio_clone()
to avoid unnecessary allocation and copy of bvecs.

This patch reduces memory footprint of request-based device-mapper
(about 1-4KB for each request) and is a preparation for further
reduction of memory usage by removing unused bvec mempool.

Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'block/blk-core.c')

diff --git a/block/blk-core.c b/block/blk-core.c
index 4aa9ccd02a50..ffcb47af35f3 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2926,7 +2926,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
 	blk_rq_init(NULL, rq);
 
 	__rq_for_each_bio(bio_src, rq_src) {
-		bio = bio_clone_bioset(bio_src, gfp_mask, bs);
+		bio = bio_clone_fast(bio_src, gfp_mask, bs);
 		if (!bio)
 			goto free_and_out;
 
-- 
cgit v1.2.3


From ef3ecb66bcd6b2076dc8782e1315cf2807b73c0c Mon Sep 17 00:00:00 2001
From: Robert Elliott <elliott@hp.com>
Date: Wed, 27 Aug 2014 10:50:31 -0500
Subject: block: make blk_update_request print prefix match ratelimited prefix

In blk_update_request, change the printk_ratelimited
prefix from end_request to blk_update_request so it
matches the name printed if rate limiting occurs.

Old:
[10234.933106] blk_update_request: 174 callbacks suppressed
[10234.934940] end_request: critical target error, dev sdr, sector 16
[10234.949788] end_request: critical target error, dev sdr, sector 16

New:
[16863.445173] blk_update_request: 398 callbacks suppressed
[16863.447029] blk_update_request: critical target error, dev sdr, sector
1442066176
[16863.449383] blk_update_request: critical target error, dev sdr, sector
802802888
[16863.451680] blk_update_request: critical target error, dev sdr, sector
1609535456

Signed-off-by: Robert Elliott <elliott@hp.com>
Reviewed-by: Webb Scales <webbnh@hp.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'block/blk-core.c')

diff --git a/block/blk-core.c b/block/blk-core.c
index ffcb47af35f3..ecc124ec53bb 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2444,8 +2444,8 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 			error_type = "I/O";
 			break;
 		}
-		printk_ratelimited(KERN_ERR "end_request: %s error, dev %s, sector %llu\n",
-				   error_type, req->rq_disk ?
+		printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu\n",
+				   __func__, error_type, req->rq_disk ?
 				   req->rq_disk->disk_name : "?",
 				   (unsigned long long)blk_rq_pos(req));
 
-- 
cgit v1.2.3


From 7b2b10e0e2c65ebc11314e1af9924d0824ec1562 Mon Sep 17 00:00:00 2001
From: Robert Elliott <elliott@hp.com>
Date: Wed, 27 Aug 2014 10:50:36 -0500
Subject: block: include func name in __get_request prints

In __get_request calls to printk_ratelimited, include the function name so
the callbacks suppressed message matches the messages that are printed,
and add "dev" before the device name so it matches other block layer
messages.

Signed-off-by: Robert Elliott <elliott@hp.com>
Reviewed-by: Webb Scales <webbnh@hp.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'block/blk-core.c')

diff --git a/block/blk-core.c b/block/blk-core.c
index ecc124ec53bb..d6ec7db9a9f4 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1060,8 +1060,8 @@ fail_elvpriv:
 	 * shouldn't stall IO.  Treat this request as !elvpriv.  This will
 	 * disturb iosched and blkcg but weird is bettern than dead.
 	 */
-	printk_ratelimited(KERN_WARNING "%s: request aux data allocation failed, iosched may be disturbed\n",
-			   dev_name(q->backing_dev_info.dev));
+	printk_ratelimited(KERN_WARNING "%s: dev %s: request aux data allocation failed, iosched may be disturbed\n",
+			   __func__, dev_name(q->backing_dev_info.dev));
 
 	rq->cmd_flags &= ~REQ_ELVPRIV;
 	rq->elv.icq = NULL;
-- 
cgit v1.2.3