From 01c5f85aebaaddfd7e6051fb2ec80c1d4b463554 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 11 Sep 2018 10:59:53 -0600
Subject: blk-cgroup: increase number of supported policies

After merging the iolatency policy, we potentially now have 4 policies
being registered, but only support 3. This causes one of them to fail
loading. Takashi reports that BFQ no longer works for him, because it
fails to load due to policy registration failure.

Bump to 5 policies, and also add a warning for when we have exceeded
the global amount. If we have to touch this again, we should switch
to a dynamic scheme instead.

Reported-by: Takashi Iwai <tiwai@suse.de>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Tested-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index c19f9078da1e..c630e02836a8 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1510,8 +1510,10 @@ int blkcg_policy_register(struct blkcg_policy *pol)
 	for (i = 0; i < BLKCG_MAX_POLS; i++)
 		if (!blkcg_policy[i])
 			break;
-	if (i >= BLKCG_MAX_POLS)
+	if (i >= BLKCG_MAX_POLS) {
+		pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n");
 		goto err_unlock;
+	}
 
 	/* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs */
 	if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
-- 
cgit v1.2.3


From b57e99b4b8b0ebdf9707424e7ddc0c392bdc5fe6 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Fri, 21 Sep 2018 16:44:34 -0700
Subject: block: use nanosecond resolution for iostat

Klaus Kusche reported that the I/O busy time in /proc/diskstats was not
updating properly on 4.18. This is because we started using ktime to
track elapsed time, and we convert nanoseconds to jiffies when we update
the partition counter. However, this gets rounded down, so any I/Os that
take less than a jiffy are not accounted for. Previously in this case,
the value of jiffies would sometimes increment while we were doing I/O,
so at least some I/Os were accounted for.

Let's convert the stats to use nanoseconds internally. We still report
milliseconds as before, now more accurately than ever. The value is
still truncated to 32 bits for backwards compatibility.

Fixes: 522a777566f5 ("block: consolidate struct request timestamp fields")
Cc: stable@vger.kernel.org
Reported-by: Klaus Kusche <klaus.kusche@computerix.info>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c               | 2 +-
 block/blk-core.c          | 4 +---
 block/genhd.c             | 6 +++---
 block/partition-generic.c | 6 +++---
 include/linux/genhd.h     | 5 ++++-
 5 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'block')

diff --git a/block/bio.c b/block/bio.c
index 8c680a776171..0093bed81c0e 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1684,7 +1684,7 @@ void generic_end_io_acct(struct request_queue *q, int req_op,
 	const int sgrp = op_stat_group(req_op);
 	int cpu = part_stat_lock();
 
-	part_stat_add(cpu, part, ticks[sgrp], duration);
+	part_stat_add(cpu, part, nsecs[sgrp], jiffies_to_nsecs(duration));
 	part_round_stats(q, cpu, part);
 	part_dec_in_flight(q, part, op_is_write(req_op));
 
diff --git a/block/blk-core.c b/block/blk-core.c
index 4dbc93f43b38..cff0a60ee200 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2733,17 +2733,15 @@ void blk_account_io_done(struct request *req, u64 now)
 	 * containing request is enough.
 	 */
 	if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) {
-		unsigned long duration;
 		const int sgrp = op_stat_group(req_op(req));
 		struct hd_struct *part;
 		int cpu;
 
-		duration = nsecs_to_jiffies(now - req->start_time_ns);
 		cpu = part_stat_lock();
 		part = req->part;
 
 		part_stat_inc(cpu, part, ios[sgrp]);
-		part_stat_add(cpu, part, ticks[sgrp], duration);
+		part_stat_add(cpu, part, nsecs[sgrp], now - req->start_time_ns);
 		part_round_stats(req->q, cpu, part);
 		part_dec_in_flight(req->q, part, rq_data_dir(req));
 
diff --git a/block/genhd.c b/block/genhd.c
index 8cc719a37b32..be5bab20b2ab 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1343,18 +1343,18 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 			   part_stat_read(hd, ios[STAT_READ]),
 			   part_stat_read(hd, merges[STAT_READ]),
 			   part_stat_read(hd, sectors[STAT_READ]),
-			   jiffies_to_msecs(part_stat_read(hd, ticks[STAT_READ])),
+			   (unsigned int)part_stat_read_msecs(hd, STAT_READ),
 			   part_stat_read(hd, ios[STAT_WRITE]),
 			   part_stat_read(hd, merges[STAT_WRITE]),
 			   part_stat_read(hd, sectors[STAT_WRITE]),
-			   jiffies_to_msecs(part_stat_read(hd, ticks[STAT_WRITE])),
+			   (unsigned int)part_stat_read_msecs(hd, STAT_WRITE),
 			   inflight[0],
 			   jiffies_to_msecs(part_stat_read(hd, io_ticks)),
 			   jiffies_to_msecs(part_stat_read(hd, time_in_queue)),
 			   part_stat_read(hd, ios[STAT_DISCARD]),
 			   part_stat_read(hd, merges[STAT_DISCARD]),
 			   part_stat_read(hd, sectors[STAT_DISCARD]),
-			   jiffies_to_msecs(part_stat_read(hd, ticks[STAT_DISCARD]))
+			   (unsigned int)part_stat_read_msecs(hd, STAT_DISCARD)
 			);
 	}
 	disk_part_iter_exit(&piter);
diff --git a/block/partition-generic.c b/block/partition-generic.c
index 5a8975a1201c..d3d14e81fb12 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -136,18 +136,18 @@ ssize_t part_stat_show(struct device *dev,
 		part_stat_read(p, ios[STAT_READ]),
 		part_stat_read(p, merges[STAT_READ]),
 		(unsigned long long)part_stat_read(p, sectors[STAT_READ]),
-		jiffies_to_msecs(part_stat_read(p, ticks[STAT_READ])),
+		(unsigned int)part_stat_read_msecs(p, STAT_READ),
 		part_stat_read(p, ios[STAT_WRITE]),
 		part_stat_read(p, merges[STAT_WRITE]),
 		(unsigned long long)part_stat_read(p, sectors[STAT_WRITE]),
-		jiffies_to_msecs(part_stat_read(p, ticks[STAT_WRITE])),
+		(unsigned int)part_stat_read_msecs(p, STAT_WRITE),
 		inflight[0],
 		jiffies_to_msecs(part_stat_read(p, io_ticks)),
 		jiffies_to_msecs(part_stat_read(p, time_in_queue)),
 		part_stat_read(p, ios[STAT_DISCARD]),
 		part_stat_read(p, merges[STAT_DISCARD]),
 		(unsigned long long)part_stat_read(p, sectors[STAT_DISCARD]),
-		jiffies_to_msecs(part_stat_read(p, ticks[STAT_DISCARD])));
+		(unsigned int)part_stat_read_msecs(p, STAT_DISCARD));
 }
 
 ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 57864422a2c8..25c08c6c7f99 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -83,10 +83,10 @@ struct partition {
 } __attribute__((packed));
 
 struct disk_stats {
+	u64 nsecs[NR_STAT_GROUPS];
 	unsigned long sectors[NR_STAT_GROUPS];
 	unsigned long ios[NR_STAT_GROUPS];
 	unsigned long merges[NR_STAT_GROUPS];
-	unsigned long ticks[NR_STAT_GROUPS];
 	unsigned long io_ticks;
 	unsigned long time_in_queue;
 };
@@ -354,6 +354,9 @@ static inline void free_part_stats(struct hd_struct *part)
 
 #endif /* CONFIG_SMP */
 
+#define part_stat_read_msecs(part, which)				\
+	div_u64(part_stat_read(part, nsecs[which]), NSEC_PER_MSEC)
+
 #define part_stat_read_accum(part, field)				\
 	(part_stat_read(part, field[STAT_READ]) +			\
 	 part_stat_read(part, field[STAT_WRITE]) +			\
-- 
cgit v1.2.3


From 530ca2c9bd6949c72c9b5cfc330cb3dbccaa3f5b Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Tue, 25 Sep 2018 10:36:20 -0600
Subject: blk-mq: Allow blocking queue tag iter callbacks

A recent commit runs tag iterator callbacks under the rcu read lock,
but existing callbacks do not satisfy the non-blocking requirement.
The commit intended to prevent an iterator from accessing a queue that's
being modified. This patch fixes the original issue by taking a queue
reference instead of reading it, which allows callbacks to make blocking
calls.

Fixes: f5bbbbe4d6357 ("blk-mq: sync the update nr_hw_queues with blk_mq_queue_tag_busy_iter")
Acked-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-tag.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 94e1ed667b6e..41317c50a446 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -322,16 +322,11 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
 
 	/*
 	 * __blk_mq_update_nr_hw_queues will update the nr_hw_queues and
-	 * queue_hw_ctx after freeze the queue. So we could use q_usage_counter
-	 * to avoid race with it. __blk_mq_update_nr_hw_queues will users
-	 * synchronize_rcu to ensure all of the users go out of the critical
-	 * section below and see zeroed q_usage_counter.
+	 * queue_hw_ctx after freeze the queue, so we use q_usage_counter
+	 * to avoid race with it.
 	 */
-	rcu_read_lock();
-	if (percpu_ref_is_zero(&q->q_usage_counter)) {
-		rcu_read_unlock();
+	if (!percpu_ref_tryget(&q->q_usage_counter))
 		return;
-	}
 
 	queue_for_each_hw_ctx(q, hctx, i) {
 		struct blk_mq_tags *tags = hctx->tags;
@@ -347,7 +342,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
 			bt_for_each(hctx, &tags->breserved_tags, fn, priv, true);
 		bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false);
 	}
-	rcu_read_unlock();
+	blk_queue_exit(q);
 }
 
 static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth,
-- 
cgit v1.2.3


From 854f31ccdd7964c9c2e68da234a3a8aedb51cf6b Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Thu, 27 Sep 2018 10:55:13 +0900
Subject: block: fix deadline elevator drain for zoned block devices

When the deadline scheduler is used with a zoned block device, writes
to a zone will be dispatched one at a time. This causes the warning
message:

deadline: forced dispatching is broken (nr_sorted=X), please report this

to be displayed when switching to another elevator with the legacy I/O
path while write requests to a zone are being retained in the scheduler
queue.

Prevent this message from being displayed when executing
elv_drain_elevator() for a zoned block device. __blk_drain_queue() will
loop until all writes are dispatched and completed, resulting in the
desired elevator queue drain without extensive modifications to the
deadline code itself to handle forced-dispatch calls.

Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Fixes: 8dc8146f9c92 ("deadline-iosched: Introduce zone locking support")
Cc: stable@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/elevator.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/elevator.c b/block/elevator.c
index 6a06b5d040e5..fae58b2f906f 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -609,7 +609,7 @@ void elv_drain_elevator(struct request_queue *q)
 
 	while (e->type->ops.sq.elevator_dispatch_fn(q, 1))
 		;
-	if (q->nr_sorted && printed++ < 10) {
+	if (q->nr_sorted && !blk_queue_is_zoned(q) && printed++ < 10 ) {
 		printk(KERN_ERR "%s: forced dispatching is broken "
 		       "(nr_sorted=%u), please report this\n",
 		       q->elevator->type->elevator_name, q->nr_sorted);
-- 
cgit v1.2.3


From 587562d0c7cd6861f4f90a2eb811cccb1a376f5f Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Wed, 26 Sep 2018 14:35:50 +0200
Subject: blk-mq: I/O and timer unplugs are inverted in blktrace

trace_block_unplug() takes true for explicit unplugs and false for
implicit unplugs.  schedule() unplugs are implicit and should be
reported as timer unplugs.  While correct in the legacy code, this has
been inverted in blk-mq since 4.11.

Cc: stable@vger.kernel.org
Fixes: bd166ef183c2 ("blk-mq-sched: add framework for MQ capable IO schedulers")
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 85a1c1a59c72..e3c39ea8e17b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1628,7 +1628,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 		BUG_ON(!rq->q);
 		if (rq->mq_ctx != this_ctx) {
 			if (this_ctx) {
-				trace_block_unplug(this_q, depth, from_schedule);
+				trace_block_unplug(this_q, depth, !from_schedule);
 				blk_mq_sched_insert_requests(this_q, this_ctx,
 								&ctx_list,
 								from_schedule);
@@ -1648,7 +1648,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 	 * on 'ctx_list'. Do those.
 	 */
 	if (this_ctx) {
-		trace_block_unplug(this_q, depth, from_schedule);
+		trace_block_unplug(this_q, depth, !from_schedule);
 		blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list,
 						from_schedule);
 	}
-- 
cgit v1.2.3