From 01c5f85aebaaddfd7e6051fb2ec80c1d4b463554 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Sep 2018 10:59:53 -0600 Subject: blk-cgroup: increase number of supported policies After merging the iolatency policy, we potentially now have 4 policies being registered, but only support 3. This causes one of them to fail loading. Takashi reports that BFQ no longer works for him, because it fails to load due to policy registration failure. Bump to 5 policies, and also add a warning for when we have exceeded the global amount. If we have to touch this again, we should switch to a dynamic scheme instead. Reported-by: Takashi Iwai Reviewed-by: Jeff Moyer Tested-by: Takashi Iwai Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index c19f9078da1e..c630e02836a8 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1510,8 +1510,10 @@ int blkcg_policy_register(struct blkcg_policy *pol) for (i = 0; i < BLKCG_MAX_POLS; i++) if (!blkcg_policy[i]) break; - if (i >= BLKCG_MAX_POLS) + if (i >= BLKCG_MAX_POLS) { + pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n"); goto err_unlock; + } /* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs */ if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) || -- cgit v1.2.3 From b57e99b4b8b0ebdf9707424e7ddc0c392bdc5fe6 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 21 Sep 2018 16:44:34 -0700 Subject: block: use nanosecond resolution for iostat Klaus Kusche reported that the I/O busy time in /proc/diskstats was not updating properly on 4.18. This is because we started using ktime to track elapsed time, and we convert nanoseconds to jiffies when we update the partition counter. However, this gets rounded down, so any I/Os that take less than a jiffy are not accounted for. Previously in this case, the value of jiffies would sometimes increment while we were doing I/O, so at least some I/Os were accounted for. Let's convert the stats to use nanoseconds internally. We still report milliseconds as before, now more accurately than ever. The value is still truncated to 32 bits for backwards compatibility. Fixes: 522a777566f5 ("block: consolidate struct request timestamp fields") Cc: stable@vger.kernel.org Reported-by: Klaus Kusche Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/bio.c | 2 +- block/blk-core.c | 4 +--- block/genhd.c | 6 +++--- block/partition-generic.c | 6 +++--- include/linux/genhd.h | 5 ++++- 5 files changed, 12 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 8c680a776171..0093bed81c0e 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1684,7 +1684,7 @@ void generic_end_io_acct(struct request_queue *q, int req_op, const int sgrp = op_stat_group(req_op); int cpu = part_stat_lock(); - part_stat_add(cpu, part, ticks[sgrp], duration); + part_stat_add(cpu, part, nsecs[sgrp], jiffies_to_nsecs(duration)); part_round_stats(q, cpu, part); part_dec_in_flight(q, part, op_is_write(req_op)); diff --git a/block/blk-core.c b/block/blk-core.c index 4dbc93f43b38..cff0a60ee200 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2733,17 +2733,15 @@ void blk_account_io_done(struct request *req, u64 now) * containing request is enough. */ if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) { - unsigned long duration; const int sgrp = op_stat_group(req_op(req)); struct hd_struct *part; int cpu; - duration = nsecs_to_jiffies(now - req->start_time_ns); cpu = part_stat_lock(); part = req->part; part_stat_inc(cpu, part, ios[sgrp]); - part_stat_add(cpu, part, ticks[sgrp], duration); + part_stat_add(cpu, part, nsecs[sgrp], now - req->start_time_ns); part_round_stats(req->q, cpu, part); part_dec_in_flight(req->q, part, rq_data_dir(req)); diff --git a/block/genhd.c b/block/genhd.c index 8cc719a37b32..be5bab20b2ab 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1343,18 +1343,18 @@ static int diskstats_show(struct seq_file *seqf, void *v) part_stat_read(hd, ios[STAT_READ]), part_stat_read(hd, merges[STAT_READ]), part_stat_read(hd, sectors[STAT_READ]), - jiffies_to_msecs(part_stat_read(hd, ticks[STAT_READ])), + (unsigned int)part_stat_read_msecs(hd, STAT_READ), part_stat_read(hd, ios[STAT_WRITE]), part_stat_read(hd, merges[STAT_WRITE]), part_stat_read(hd, sectors[STAT_WRITE]), - jiffies_to_msecs(part_stat_read(hd, ticks[STAT_WRITE])), + (unsigned int)part_stat_read_msecs(hd, STAT_WRITE), inflight[0], jiffies_to_msecs(part_stat_read(hd, io_ticks)), jiffies_to_msecs(part_stat_read(hd, time_in_queue)), part_stat_read(hd, ios[STAT_DISCARD]), part_stat_read(hd, merges[STAT_DISCARD]), part_stat_read(hd, sectors[STAT_DISCARD]), - jiffies_to_msecs(part_stat_read(hd, ticks[STAT_DISCARD])) + (unsigned int)part_stat_read_msecs(hd, STAT_DISCARD) ); } disk_part_iter_exit(&piter); diff --git a/block/partition-generic.c b/block/partition-generic.c index 5a8975a1201c..d3d14e81fb12 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -136,18 +136,18 @@ ssize_t part_stat_show(struct device *dev, part_stat_read(p, ios[STAT_READ]), part_stat_read(p, merges[STAT_READ]), (unsigned long long)part_stat_read(p, sectors[STAT_READ]), - jiffies_to_msecs(part_stat_read(p, ticks[STAT_READ])), + (unsigned int)part_stat_read_msecs(p, STAT_READ), part_stat_read(p, ios[STAT_WRITE]), part_stat_read(p, merges[STAT_WRITE]), (unsigned long long)part_stat_read(p, sectors[STAT_WRITE]), - jiffies_to_msecs(part_stat_read(p, ticks[STAT_WRITE])), + (unsigned int)part_stat_read_msecs(p, STAT_WRITE), inflight[0], jiffies_to_msecs(part_stat_read(p, io_ticks)), jiffies_to_msecs(part_stat_read(p, time_in_queue)), part_stat_read(p, ios[STAT_DISCARD]), part_stat_read(p, merges[STAT_DISCARD]), (unsigned long long)part_stat_read(p, sectors[STAT_DISCARD]), - jiffies_to_msecs(part_stat_read(p, ticks[STAT_DISCARD]))); + (unsigned int)part_stat_read_msecs(p, STAT_DISCARD)); } ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 57864422a2c8..25c08c6c7f99 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -83,10 +83,10 @@ struct partition { } __attribute__((packed)); struct disk_stats { + u64 nsecs[NR_STAT_GROUPS]; unsigned long sectors[NR_STAT_GROUPS]; unsigned long ios[NR_STAT_GROUPS]; unsigned long merges[NR_STAT_GROUPS]; - unsigned long ticks[NR_STAT_GROUPS]; unsigned long io_ticks; unsigned long time_in_queue; }; @@ -354,6 +354,9 @@ static inline void free_part_stats(struct hd_struct *part) #endif /* CONFIG_SMP */ +#define part_stat_read_msecs(part, which) \ + div_u64(part_stat_read(part, nsecs[which]), NSEC_PER_MSEC) + #define part_stat_read_accum(part, field) \ (part_stat_read(part, field[STAT_READ]) + \ part_stat_read(part, field[STAT_WRITE]) + \ -- cgit v1.2.3 From 530ca2c9bd6949c72c9b5cfc330cb3dbccaa3f5b Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 25 Sep 2018 10:36:20 -0600 Subject: blk-mq: Allow blocking queue tag iter callbacks A recent commit runs tag iterator callbacks under the rcu read lock, but existing callbacks do not satisfy the non-blocking requirement. The commit intended to prevent an iterator from accessing a queue that's being modified. This patch fixes the original issue by taking a queue reference instead of reading it, which allows callbacks to make blocking calls. Fixes: f5bbbbe4d6357 ("blk-mq: sync the update nr_hw_queues with blk_mq_queue_tag_busy_iter") Acked-by: Jianchao Wang Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 94e1ed667b6e..41317c50a446 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -322,16 +322,11 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, /* * __blk_mq_update_nr_hw_queues will update the nr_hw_queues and - * queue_hw_ctx after freeze the queue. So we could use q_usage_counter - * to avoid race with it. __blk_mq_update_nr_hw_queues will users - * synchronize_rcu to ensure all of the users go out of the critical - * section below and see zeroed q_usage_counter. + * queue_hw_ctx after freeze the queue, so we use q_usage_counter + * to avoid race with it. */ - rcu_read_lock(); - if (percpu_ref_is_zero(&q->q_usage_counter)) { - rcu_read_unlock(); + if (!percpu_ref_tryget(&q->q_usage_counter)) return; - } queue_for_each_hw_ctx(q, hctx, i) { struct blk_mq_tags *tags = hctx->tags; @@ -347,7 +342,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, bt_for_each(hctx, &tags->breserved_tags, fn, priv, true); bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false); } - rcu_read_unlock(); + blk_queue_exit(q); } static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, -- cgit v1.2.3 From 854f31ccdd7964c9c2e68da234a3a8aedb51cf6b Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 27 Sep 2018 10:55:13 +0900 Subject: block: fix deadline elevator drain for zoned block devices When the deadline scheduler is used with a zoned block device, writes to a zone will be dispatched one at a time. This causes the warning message: deadline: forced dispatching is broken (nr_sorted=X), please report this to be displayed when switching to another elevator with the legacy I/O path while write requests to a zone are being retained in the scheduler queue. Prevent this message from being displayed when executing elv_drain_elevator() for a zoned block device. __blk_drain_queue() will loop until all writes are dispatched and completed, resulting in the desired elevator queue drain without extensive modifications to the deadline code itself to handle forced-dispatch calls. Signed-off-by: Damien Le Moal Fixes: 8dc8146f9c92 ("deadline-iosched: Introduce zone locking support") Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- block/elevator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 6a06b5d040e5..fae58b2f906f 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -609,7 +609,7 @@ void elv_drain_elevator(struct request_queue *q) while (e->type->ops.sq.elevator_dispatch_fn(q, 1)) ; - if (q->nr_sorted && printed++ < 10) { + if (q->nr_sorted && !blk_queue_is_zoned(q) && printed++ < 10 ) { printk(KERN_ERR "%s: forced dispatching is broken " "(nr_sorted=%u), please report this\n", q->elevator->type->elevator_name, q->nr_sorted); -- cgit v1.2.3 From 587562d0c7cd6861f4f90a2eb811cccb1a376f5f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 26 Sep 2018 14:35:50 +0200 Subject: blk-mq: I/O and timer unplugs are inverted in blktrace trace_block_unplug() takes true for explicit unplugs and false for implicit unplugs. schedule() unplugs are implicit and should be reported as timer unplugs. While correct in the legacy code, this has been inverted in blk-mq since 4.11. Cc: stable@vger.kernel.org Fixes: bd166ef183c2 ("blk-mq-sched: add framework for MQ capable IO schedulers") Reviewed-by: Omar Sandoval Signed-off-by: Ilya Dryomov Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 85a1c1a59c72..e3c39ea8e17b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1628,7 +1628,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) BUG_ON(!rq->q); if (rq->mq_ctx != this_ctx) { if (this_ctx) { - trace_block_unplug(this_q, depth, from_schedule); + trace_block_unplug(this_q, depth, !from_schedule); blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list, from_schedule); @@ -1648,7 +1648,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) * on 'ctx_list'. Do those. */ if (this_ctx) { - trace_block_unplug(this_q, depth, from_schedule); + trace_block_unplug(this_q, depth, !from_schedule); blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list, from_schedule); } -- cgit v1.2.3