From 55c022bbddb2c056b5dff1bd1b1758d31b6d64c9 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 8 Jul 2011 08:19:20 +0200 Subject: block: avoid building too big plug list When I test fio script with big I/O depth, I found the total throughput drops compared to some relative small I/O depth. The reason is the thread accumulates big requests in its plug list and causes some delays (surely this depends on CPU speed). I thought we'd better have a threshold for requests. When a threshold reaches, this means there is no request merge and queue lock contention isn't severe when pushing per-task requests to queue, so the main advantages of blk plug don't exist. We can force a plug list flush in this case. With this, my test throughput actually increases and almost equals to small I/O depth. Another side effect is irq off time decreases in blk_flush_plug_list() for big I/O depth. The BLK_MAX_REQUEST_COUNT is choosen arbitarily, but 16 is efficiently to reduce lock contention to me. But I'm open here, 32 is ok in my test too. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-core.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'block/blk-core.c') diff --git a/block/blk-core.c b/block/blk-core.c index d2f8f4049abd..a56485292062 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1302,7 +1302,10 @@ get_rq: plug->should_sort = 1; } list_add_tail(&req->queuelist, &plug->list); + plug->count++; drive_stat_acct(req, 1); + if (plug->count >= BLK_MAX_REQUEST_COUNT) + blk_flush_plug_list(plug, false); } else { spin_lock_irq(q->queue_lock); add_acct_request(q, req, where); @@ -2626,6 +2629,7 @@ void blk_start_plug(struct blk_plug *plug) INIT_LIST_HEAD(&plug->list); INIT_LIST_HEAD(&plug->cb_list); plug->should_sort = 0; + plug->count = 0; /* * If this is a nested plug, don't actually assign it. It will be @@ -2709,6 +2713,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) return; list_splice_init(&plug->list, &list); + plug->count = 0; if (plug->should_sort) { list_sort(NULL, &list, plug_rq_cmp); -- cgit v1.2.3 From 5757a6d76cdf6dda2a492c09b985c015e86779b1 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 23 Jul 2011 20:44:25 +0200 Subject: block: strict rq_affinity Some systems benefit from completions always being steered to the strict requester cpu rather than the looser "per-socket" steering that blk_cpu_to_group() attempts by default. This is because the first CPU in the group mask ends up being completely overloaded with work, while the others (including the original submitter) has power left to spare. Allow the strict mode to be set by writing '2' to the sysfs control file. This is identical to the scheme used for the nomerges file, where '2' is a more aggressive setting than just being turned on. echo 2 > /sys/block//queue/rq_affinity Cc: Christoph Hellwig Cc: Roland Dreier Tested-by: Dave Jiang Signed-off-by: Dan Williams Signed-off-by: Jens Axboe --- Documentation/block/queue-sysfs.txt | 10 +++++++--- block/blk-core.c | 6 ++---- block/blk-softirq.c | 11 +++++++---- block/blk-sysfs.c | 13 +++++++++---- include/linux/blkdev.h | 3 ++- 5 files changed, 27 insertions(+), 16 deletions(-) (limited to 'block/blk-core.c') diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt index f65274081c8d..d8147b336c35 100644 --- a/Documentation/block/queue-sysfs.txt +++ b/Documentation/block/queue-sysfs.txt @@ -45,9 +45,13 @@ device. rq_affinity (RW) ---------------- -If this option is enabled, the block layer will migrate request completions -to the CPU that originally submitted the request. For some workloads -this provides a significant reduction in CPU cycles due to caching effects. +If this option is '1', the block layer will migrate request completions to the +cpu "group" that originally submitted the request. For some workloads this +provides a significant reduction in CPU cycles due to caching effects. + +For storage configurations that need to maximize distribution of completion +processing setting this option to '2' forces the completion to run on the +requesting cpu (bypassing the "group" aggregation logic). scheduler (RW) -------------- diff --git a/block/blk-core.c b/block/blk-core.c index a56485292062..b3228255304d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1279,10 +1279,8 @@ get_rq: init_request_from_bio(req, bio); if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || - bio_flagged(bio, BIO_CPU_AFFINE)) { - req->cpu = blk_cpu_to_group(get_cpu()); - put_cpu(); - } + bio_flagged(bio, BIO_CPU_AFFINE)) + req->cpu = smp_processor_id(); plug = current->plug; if (plug) { diff --git a/block/blk-softirq.c b/block/blk-softirq.c index ee9c21602228..475fab809a80 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -103,22 +103,25 @@ static struct notifier_block __cpuinitdata blk_cpu_notifier = { void __blk_complete_request(struct request *req) { + int ccpu, cpu, group_cpu = NR_CPUS; struct request_queue *q = req->q; unsigned long flags; - int ccpu, cpu, group_cpu; BUG_ON(!q->softirq_done_fn); local_irq_save(flags); cpu = smp_processor_id(); - group_cpu = blk_cpu_to_group(cpu); /* * Select completion CPU */ - if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1) + if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1) { ccpu = req->cpu; - else + if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) { + ccpu = blk_cpu_to_group(ccpu); + group_cpu = blk_cpu_to_group(cpu); + } + } else ccpu = cpu; if (ccpu == cpu || ccpu == group_cpu) { diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index d935bd859c87..0ee17b5e7fb6 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -244,8 +244,9 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page, static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page) { bool set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags); + bool force = test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags); - return queue_var_show(set, page); + return queue_var_show(set << force, page); } static ssize_t @@ -257,10 +258,14 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) ret = queue_var_store(&val, page, count); spin_lock_irq(q->queue_lock); - if (val) + if (val) { queue_flag_set(QUEUE_FLAG_SAME_COMP, q); - else - queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); + if (val == 2) + queue_flag_set(QUEUE_FLAG_SAME_FORCE, q); + } else { + queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); + queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); + } spin_unlock_irq(q->queue_lock); #endif return ret; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c0cd9a2f22ef..0e67c45b3bc9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -392,7 +392,7 @@ struct request_queue { #define QUEUE_FLAG_ELVSWITCH 6 /* don't use elevator, just do FIFO */ #define QUEUE_FLAG_BIDI 7 /* queue supports bidi requests */ #define QUEUE_FLAG_NOMERGES 8 /* disable merge attempts */ -#define QUEUE_FLAG_SAME_COMP 9 /* force complete on same CPU */ +#define QUEUE_FLAG_SAME_COMP 9 /* complete on same CPU-group */ #define QUEUE_FLAG_FAIL_IO 10 /* fake timeout */ #define QUEUE_FLAG_STACKABLE 11 /* supports request stacking */ #define QUEUE_FLAG_NONROT 12 /* non-rotational device (SSD) */ @@ -402,6 +402,7 @@ struct request_queue { #define QUEUE_FLAG_NOXMERGES 15 /* No extended merges */ #define QUEUE_FLAG_ADD_RANDOM 16 /* Contributes to random pool */ #define QUEUE_FLAG_SECDISCARD 17 /* supports SECDISCARD */ +#define QUEUE_FLAG_SAME_FORCE 18 /* force complete on same CPU */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ -- cgit v1.2.3