From 66cb45aa41315d1d9972cada354fbdf7870d7714 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Tue, 24 Jun 2014 16:22:24 -0600
Subject: block: add support for limiting gaps in SG lists

Another restriction inherited from NVMe - those devices don't support
SG lists that have "gaps" in them. Gaps refer to cases where the
previous SG entry doesn't end on a page boundary. For NVMe, all SG
entries must start at offset 0 (except the first) and end on a page
boundary (except the last).

Signed-off-by: Jens Axboe
---
 include/linux/blkdev.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux/blkdev.h')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 713f8b62b435..8699bcf5f099 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -512,6 +512,7 @@ struct request_queue {
 #define QUEUE_FLAG_DEAD        19  /* queue tear-down finished */
 #define QUEUE_FLAG_INIT_DONE   20  /* queue is initialized */
 #define QUEUE_FLAG_NO_SG_MERGE 21  /* don't attempt to merge SG segments*/
+#define QUEUE_FLAG_SG_GAPS     22  /* queue doesn't support SG gaps */
 
 #define QUEUE_FLAG_DEFAULT      ((1 << QUEUE_FLAG_IO_STAT) |           \
                                  (1 << QUEUE_FLAG_STACKABLE)    |      \
-- 
cgit v1.2.3
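
The rule the new flag encodes is easy to model outside the kernel. The sketch
below is illustrative only - it is not kernel code, it assumes a 4096-byte
page, and the sg_seg/sg_list_has_gaps names are invented for the example. In
the tree, a driver such as NVMe simply sets QUEUE_FLAG_SG_GAPS so that the
block layer can refuse bio merges that would introduce such a gap.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define MODEL_PAGE_SIZE 4096u   /* stand-in for PAGE_SIZE */

struct sg_seg {                 /* hypothetical segment descriptor */
        unsigned int offset;    /* byte offset into the first page */
        unsigned int len;       /* segment length in bytes */
};

/*
 * A list has a "gap" if any segment other than the first starts at a
 * non-zero offset, or any segment other than the last does not end on
 * a page boundary - the property QUEUE_FLAG_SG_GAPS forbids.
 */
static bool sg_list_has_gaps(const struct sg_seg *segs, size_t n)
{
        for (size_t i = 0; i < n; i++) {
                if (i > 0 && segs[i].offset != 0)
                        return true;
                if (i < n - 1 &&
                    (segs[i].offset + segs[i].len) % MODEL_PAGE_SIZE != 0)
                        return true;
        }
        return false;
}

int main(void)
{
        struct sg_seg ok[]  = { { 512, 3584 }, { 0, 4096 }, { 0, 100 } };
        struct sg_seg bad[] = { { 0, 4096 }, { 512, 1024 } };

        printf("ok  has gaps: %d\n", sg_list_has_gaps(ok, 3));  /* prints 0 */
        printf("bad has gaps: %d\n", sg_list_has_gaps(bad, 2)); /* prints 1 */
        return 0;
}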
From 780db2071ac4d167ee4154ad9c96088f1bba044b Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Tue, 1 Jul 2014 10:31:13 -0600
Subject: blk-mq: decouple blk-mq freezing from generic bypassing

blk_mq freezing is entangled with generic bypassing, which bypasses
blkcg and the io scheduler and lets IO requests fall through the block
layer to the drivers in FIFO order. This allows forward progress on
IOs with the advanced features disabled so that those features can be
configured or altered without worrying about stalling IO, which may
lead to deadlock through memory allocation.

However, generic bypassing doesn't quite fit blk-mq. blk-mq currently
doesn't make use of blkcg or io schedulers, and it maps bypassing to
freezing, which blocks request processing and drains all the in-flight
ones. This causes problems as bypassing assumes that request
processing is online. blk-mq works around this by conditionally
allowing request processing for the problem case - during queue
initialization.

Another oddity is that, except during queue cleanup, bypassing started
on the generic side prevents blk-mq from processing new requests but
doesn't drain the in-flight ones. This shouldn't break anything but
again highlights that something isn't quite right here.

The root cause is conflating blk-mq freezing and generic bypassing,
which are two different mechanisms. The only intersecting purpose they
serve is during queue cleanup. Let's properly separate blk-mq freezing
from generic bypassing and simply use freezing where necessary.

* request_queue->mq_freeze_depth is added and blk_mq_[un]freeze_queue()
  now operate on this counter instead of ->bypass_depth. The
  replacement for QUEUE_FLAG_BYPASS isn't added but the counter is
  tested directly. This will be further updated by later changes.

* blk_mq_drain_queue() is dropped and the "__" prefix is dropped from
  blk_mq_freeze_queue(). The queue cleanup path now calls
  blk_mq_freeze_queue() directly.

* blk_mq_queue_enter()'s fast path condition is simplified to simply
  check @q->mq_freeze_depth. Previously, the condition was

        !blk_queue_dying(q) &&
            (!blk_queue_bypass(q) || !blk_queue_init_done(q))

  mq_freeze_depth is incremented right after dying is set, and the
  blk_queue_init_done() exception isn't necessary as blk-mq doesn't
  start frozen, which only leaves the blk_queue_bypass() test, which
  can be replaced by the @q->mq_freeze_depth test.

This change simplifies the code and reduces confusion in the area.

Signed-off-by: Tejun Heo
Cc: Jens Axboe
Cc: Nicholas A. Bellinger
Signed-off-by: Jens Axboe
---
 block/blk-core.c       |  2 +-
 block/blk-mq.c         | 17 ++++++-----------
 block/blk-mq.h         |  2 +-
 include/linux/blkdev.h |  1 +
 4 files changed, 9 insertions(+), 13 deletions(-)

(limited to 'include/linux/blkdev.h')

diff --git a/block/blk-core.c b/block/blk-core.c
index 0d0bdd65b2d7..c359d72e9d76 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -514,7 +514,7 @@ void blk_cleanup_queue(struct request_queue *q)
          * prevent that q->request_fn() gets invoked after draining finished.
          */
         if (q->mq_ops) {
-                blk_mq_drain_queue(q);
+                blk_mq_freeze_queue(q);
                 spin_lock_irq(lock);
         } else {
                 spin_lock_irq(lock);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index f4bdddd7ed99..1e324a123d40 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -84,15 +84,14 @@ static int blk_mq_queue_enter(struct request_queue *q)
         smp_mb();
 
         /* we have problems freezing the queue if it's initializing */
-        if (!blk_queue_dying(q) &&
-            (!blk_queue_bypass(q) || !blk_queue_init_done(q)))
+        if (!q->mq_freeze_depth)
                 return 0;
 
         __percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
 
         spin_lock_irq(q->queue_lock);
         ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq,
-                !blk_queue_bypass(q) || blk_queue_dying(q),
+                !q->mq_freeze_depth || blk_queue_dying(q),
                 *q->queue_lock);
         /* inc usage with lock hold to avoid freeze_queue runs here */
         if (!ret && !blk_queue_dying(q))
@@ -129,11 +128,10 @@ void blk_mq_drain_queue(struct request_queue *q)
  * Guarantee no request is in use, so we can change any data structure of
  * the queue afterward.
  */
-static void blk_mq_freeze_queue(struct request_queue *q)
+void blk_mq_freeze_queue(struct request_queue *q)
 {
         spin_lock_irq(q->queue_lock);
-        q->bypass_depth++;
-        queue_flag_set(QUEUE_FLAG_BYPASS, q);
+        q->mq_freeze_depth++;
         spin_unlock_irq(q->queue_lock);
 
         blk_mq_drain_queue(q);
@@ -144,11 +142,8 @@ static void blk_mq_unfreeze_queue(struct request_queue *q)
         bool wake = false;
 
         spin_lock_irq(q->queue_lock);
-        if (!--q->bypass_depth) {
-                queue_flag_clear(QUEUE_FLAG_BYPASS, q);
-                wake = true;
-        }
-        WARN_ON_ONCE(q->bypass_depth < 0);
+        wake = !--q->mq_freeze_depth;
+        WARN_ON_ONCE(q->mq_freeze_depth < 0);
         spin_unlock_irq(q->queue_lock);
         if (wake)
                 wake_up_all(&q->mq_freeze_wq);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 26460884c6cd..ca4964a6295d 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -28,7 +28,7 @@ struct blk_mq_ctx {
 void __blk_mq_complete_request(struct request *rq);
 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
 void blk_mq_init_flush(struct request_queue *q);
-void blk_mq_drain_queue(struct request_queue *q);
+void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_free_queue(struct request_queue *q);
 void blk_mq_clone_flush_request(struct request *flush_rq,
                 struct request *orig_rq);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8699bcf5f099..c8f344ff74fe 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -470,6 +470,7 @@ struct request_queue {
         struct mutex            sysfs_lock;
 
         int                     bypass_depth;
+        int                     mq_freeze_depth;
 
 #if defined(CONFIG_BLK_DEV_BSG)
         bsg_job_fn              *bsg_job_fn;
-- 
cgit v1.2.3
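
To see what the new counter buys on its own, here is a freestanding,
single-threaded model of the gating logic - not kernel code, and the
mq_model/model_* names are invented for the example. Enter succeeds only
while the freeze depth is zero, and freezes nest the same way bypass_depth
used to:

#include <stdbool.h>
#include <stdio.h>

/* hypothetical single-threaded model of the queue state */
struct mq_model {
        int mq_freeze_depth;   /* >0 means frozen, nests like bypass_depth did */
        int usage;             /* in-flight request count */
};

/* mirrors the simplified fast-path test: only the freeze depth matters */
static bool model_queue_enter(struct mq_model *q)
{
        if (q->mq_freeze_depth)
                return false;  /* the real caller would sleep on mq_freeze_wq */
        q->usage++;
        return true;
}

static void model_queue_exit(struct mq_model *q)
{
        q->usage--;
}

static void model_freeze(struct mq_model *q)
{
        q->mq_freeze_depth++;
        /* the real code also drains q->usage down to zero before returning */
}

static void model_unfreeze(struct mq_model *q)
{
        if (!--q->mq_freeze_depth)
                printf("wake waiters\n");
}

int main(void)
{
        struct mq_model q = { 0, 0 };

        printf("enter while unfrozen: %d\n", model_queue_enter(&q));  /* 1 */
        model_queue_exit(&q);

        model_freeze(&q);
        model_freeze(&q);                    /* freezes nest */
        printf("enter while frozen: %d\n", model_queue_enter(&q));    /* 0 */
        model_unfreeze(&q);                  /* still frozen, no wake */
        model_unfreeze(&q);                  /* prints "wake waiters" */
        printf("enter after unfreeze: %d\n", model_queue_enter(&q));  /* 1 */
        return 0;
}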
From add703fda981b9719d37f371498b9f129acbd997 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Tue, 1 Jul 2014 10:34:38 -0600
Subject: blk-mq: use percpu_ref for mq usage count

Currently, blk-mq uses a percpu_counter to keep track of how many
usages are in flight. The percpu_counter is drained while freezing to
ensure that no usage is left in-flight after freezing is complete.
blk_mq_queue_enter/exit() and blk_mq_[un]freeze_queue() implement this
per-cpu gating mechanism.

This type of code has a relatively high chance of subtle bugs which
are extremely difficult to trigger, and it's way too hairy to be open
coded in blk-mq. percpu_ref can serve the same purpose after the
recent changes. This patch replaces the open-coded per-cpu usage
counting and draining mechanism with percpu_ref.

blk_mq_queue_enter() performs tryget_live on the ref and exit()
performs put. blk_mq_freeze_queue() kills the ref and waits until the
reference count reaches zero. blk_mq_unfreeze_queue() revives the ref
and wakes up the waiters.

Signed-off-by: Tejun Heo
Cc: Jens Axboe
Cc: Nicholas A. Bellinger
Cc: Kent Overstreet
Signed-off-by: Jens Axboe
---
 block/blk-mq.c         | 68 +++++++++++++++++++++----------------------------
 include/linux/blkdev.h |  3 ++-
 2 files changed, 31 insertions(+), 40 deletions(-)

(limited to 'include/linux/blkdev.h')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 22682fb4be65..5189cb1e478a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -78,34 +78,32 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
 
 static int blk_mq_queue_enter(struct request_queue *q)
 {
-        int ret;
-
-        __percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
-        smp_mb();
-
-        /* we have problems freezing the queue if it's initializing */
-        if (!q->mq_freeze_depth)
-                return 0;
-
-        __percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
+        while (true) {
+                int ret;
 
-        spin_lock_irq(q->queue_lock);
-        ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq,
-                !q->mq_freeze_depth || blk_queue_dying(q),
-                *q->queue_lock);
-        /* inc usage with lock hold to avoid freeze_queue runs here */
-        if (!ret && !blk_queue_dying(q))
-                __percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
-        else if (blk_queue_dying(q))
-                ret = -ENODEV;
-        spin_unlock_irq(q->queue_lock);
+                if (percpu_ref_tryget_live(&q->mq_usage_counter))
+                        return 0;
 
-        return ret;
+                ret = wait_event_interruptible(q->mq_freeze_wq,
+                                !q->mq_freeze_depth || blk_queue_dying(q));
+                if (blk_queue_dying(q))
+                        return -ENODEV;
+                if (ret)
+                        return ret;
+        }
 }
 
 static void blk_mq_queue_exit(struct request_queue *q)
 {
-        __percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
+        percpu_ref_put(&q->mq_usage_counter);
+}
+
+static void blk_mq_usage_counter_release(struct percpu_ref *ref)
+{
+        struct request_queue *q =
+                container_of(ref, struct request_queue, mq_usage_counter);
+
+        wake_up_all(&q->mq_freeze_wq);
 }
 
 /*
@@ -118,18 +116,9 @@ void blk_mq_freeze_queue(struct request_queue *q)
         q->mq_freeze_depth++;
         spin_unlock_irq(q->queue_lock);
 
-        while (true) {
-                s64 count;
-
-                spin_lock_irq(q->queue_lock);
-                count = percpu_counter_sum(&q->mq_usage_counter);
-                spin_unlock_irq(q->queue_lock);
-
-                if (count == 0)
-                        break;
-                blk_mq_start_hw_queues(q);
-                msleep(10);
-        }
+        percpu_ref_kill(&q->mq_usage_counter);
+        blk_mq_run_queues(q, false);
+        wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter));
 }
 
 static void blk_mq_unfreeze_queue(struct request_queue *q)
@@ -140,8 +129,10 @@ static void blk_mq_unfreeze_queue(struct request_queue *q)
         wake = !--q->mq_freeze_depth;
         WARN_ON_ONCE(q->mq_freeze_depth < 0);
         spin_unlock_irq(q->queue_lock);
-        if (wake)
+        if (wake) {
+                percpu_ref_reinit(&q->mq_usage_counter);
                 wake_up_all(&q->mq_freeze_wq);
+        }
 }
 
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
@@ -1785,7 +1776,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
         if (!q)
                 goto err_hctxs;
 
-        if (percpu_counter_init(&q->mq_usage_counter, 0))
+        if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release))
                 goto err_map;
 
         setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
@@ -1878,7 +1869,7 @@ void blk_mq_free_queue(struct request_queue *q)
         blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
         blk_mq_free_hw_queues(q, set);
 
-        percpu_counter_destroy(&q->mq_usage_counter);
+        percpu_ref_exit(&q->mq_usage_counter);
 
         free_percpu(q->queue_ctx);
         kfree(q->queue_hw_ctx);
@@ -2037,8 +2028,7 @@ static int __init blk_mq_init(void)
 {
         blk_mq_cpu_init();
 
-        /* Must be called after percpu_counter_hotcpu_callback() */
-        hotcpu_notifier(blk_mq_queue_reinit_notify, -10);
+        hotcpu_notifier(blk_mq_queue_reinit_notify, 0);
 
         return 0;
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c8f344ff74fe..518b46555b80 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -21,6 +21,7 @@
 #include <linux/bsg.h>
 #include <linux/smp.h>
 #include <linux/rcupdate.h>
+#include <linux/percpu-refcount.h>
 
 #include <asm/scsi.h>
 
@@ -484,7 +485,7 @@ struct request_queue {
 #endif
         struct rcu_head         rcu_head;
         wait_queue_head_t       mq_freeze_wq;
-        struct percpu_counter   mq_usage_counter;
+        struct percpu_ref       mq_usage_counter;
         struct list_head        all_q_node;
 
         struct blk_mq_tag_set   *tag_set;
-- 
cgit v1.2.3
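
The life cycle the series ends up with - tryget_live on entry, kill to
freeze, reinit to unfreeze, and a release callback that wakes the freeze
waitqueue - can be sketched with an ordinary counter. The toy model below is
illustrative only: it is single-threaded, the toy_ref names are invented, and
a real percpu_ref keeps per-CPU counters and only collapses to a shared
atomic once it has been killed.

#include <stdbool.h>
#include <stdio.h>

/* toy stand-in for percpu_ref: one shared counter plus a "dead" flag */
struct toy_ref {
        long count;    /* usage count; the base reference is held implicitly */
        bool dead;     /* set by kill, cleared by reinit */
};

static void toy_ref_init(struct toy_ref *r)   { r->count = 1; r->dead = false; }

/* like tryget_live: fails once the ref has been killed */
static bool toy_ref_tryget_live(struct toy_ref *r)
{
        if (r->dead)
                return false;
        r->count++;
        return true;
}

static void toy_ref_put(struct toy_ref *r)
{
        if (--r->count == 0)
                printf("release -> wake mq_freeze_wq waiters\n");
}

/* like kill: drop the base ref and refuse new gets */
static void toy_ref_kill(struct toy_ref *r)   { r->dead = true; toy_ref_put(r); }

/* like reinit: restore the base ref and accept gets again */
static void toy_ref_reinit(struct toy_ref *r) { r->dead = false; r->count = 1; }

int main(void)
{
        struct toy_ref ref;

        toy_ref_init(&ref);
        toy_ref_tryget_live(&ref);            /* a request enters the queue */

        toy_ref_kill(&ref);                   /* freeze: no new entries */
        printf("enter while frozen: %d\n", toy_ref_tryget_live(&ref));   /* 0 */
        toy_ref_put(&ref);                    /* last request completes; wakes */

        toy_ref_reinit(&ref);                 /* unfreeze */
        printf("enter after unfreeze: %d\n", toy_ref_tryget_live(&ref)); /* 1 */
        return 0;
}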