From 1bf70d08cc3b55abd1763e6dff5855cb8dd8318b Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Tue, 4 Mar 2025 15:52:33 +0530 Subject: block: introduce a dedicated lock for protecting queue elevator updates A queue's elevator can be updated either when modifying nr_hw_queues or through the sysfs scheduler attribute. Currently, elevator switching/ updating is protected using q->sysfs_lock, but this has led to lockdep splats[1] due to inconsistent lock ordering between q->sysfs_lock and the freeze-lock in multiple block layer call sites. As the scope of q->sysfs_lock is not well-defined, its (mis)use has resulted in numerous lockdep warnings. To address this, introduce a new q->elevator_lock, dedicated specifically for protecting elevator switches/updates. And we'd now use this new q->elevator_lock instead of q->sysfs_lock for protecting elevator switches/updates. While at it, make elv_iosched_load_module() a static function, as it is only called from elv_iosched_store(). Also, remove redundant parameters from elv_iosched_load_module() function signature. [1] https://lore.kernel.org/all/67637e70.050a0220.3157ee.000c.GAE@google.com/ Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Signed-off-by: Nilay Shroff Link: https://lore.kernel.org/r/20250304102551.2533767-5-nilay@linux.ibm.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 248416ecd01c..31b1b635c710 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -560,6 +560,14 @@ struct request_queue { struct blk_flush_queue *fq; struct list_head flush_list; + /* + * Protects against I/O scheduler switching, specifically when + * updating q->elevator. To ensure proper locking order during + * an elevator update, first freeze the queue, then acquire + * ->elevator_lock. + */ + struct mutex elevator_lock; + struct mutex sysfs_lock; struct mutex limits_lock; -- cgit v1.2.3 From 3efe7571c3ae2b6481253a2616c2bb3fbadd503b Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Tue, 4 Mar 2025 15:52:34 +0530 Subject: block: protect nr_requests update using q->elevator_lock The sysfs attribute nr_requests could be simultaneously updated from elevator switch/update or nr_hw_queue update code path. The update to nr_requests for each of those code paths runs holding q->elevator_lock. So we should protect access to sysfs attribute nr_requests using q-> elevator_lock instead of q->sysfs_lock. Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Signed-off-by: Nilay Shroff Link: https://lore.kernel.org/r/20250304102551.2533767-6-nilay@linux.ibm.com Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 10 +++++----- include/linux/blkdev.h | 10 ++++++---- 2 files changed, 11 insertions(+), 9 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 1562e22877e1..f1fa57de29ed 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -55,9 +55,9 @@ static ssize_t queue_requests_show(struct gendisk *disk, char *page) { ssize_t ret; - mutex_lock(&disk->queue->sysfs_lock); + mutex_lock(&disk->queue->elevator_lock); ret = queue_var_show(disk->queue->nr_requests, page); - mutex_unlock(&disk->queue->sysfs_lock); + mutex_unlock(&disk->queue->elevator_lock); return ret; } @@ -76,16 +76,16 @@ queue_requests_store(struct gendisk *disk, const char *page, size_t count) if (ret < 0) return ret; - mutex_lock(&q->sysfs_lock); memflags = blk_mq_freeze_queue(q); + mutex_lock(&q->elevator_lock); if (nr < BLKDEV_MIN_RQ) nr = BLKDEV_MIN_RQ; err = blk_mq_update_nr_requests(disk->queue, nr); if (err) ret = err; + mutex_unlock(&q->elevator_lock); blk_mq_unfreeze_queue(q, memflags); - mutex_unlock(&q->sysfs_lock); return ret; } @@ -692,7 +692,6 @@ static struct attribute *blk_mq_queue_attrs[] = { /* * Attributes which are protected with q->sysfs_lock. */ - &queue_requests_entry.attr, #ifdef CONFIG_BLK_WBT &queue_wb_lat_entry.attr, #endif @@ -701,6 +700,7 @@ static struct attribute *blk_mq_queue_attrs[] = { * q->sysfs_lock. */ &elv_iosched_entry.attr, + &queue_requests_entry.attr, /* * Attributes which don't require locking. diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 31b1b635c710..3e66ad016a23 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -561,10 +561,12 @@ struct request_queue { struct list_head flush_list; /* - * Protects against I/O scheduler switching, specifically when - * updating q->elevator. To ensure proper locking order during - * an elevator update, first freeze the queue, then acquire - * ->elevator_lock. + * Protects against I/O scheduler switching, particularly when + * updating q->elevator. Since the elevator update code path may + * also modify q->nr_requests, this lock also protects the sysfs + * attribute nr_requests. + * To ensure proper locking order during an elevator update, first + * freeze the queue, then acquire ->elevator_lock. */ struct mutex elevator_lock; -- cgit v1.2.3 From 245618f8e45ff4f79327627b474b563da71c2c75 Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Tue, 4 Mar 2025 15:52:35 +0530 Subject: block: protect wbt_lat_usec using q->elevator_lock The wbt latency and state could be updated while initializing the elevator or exiting the elevator. It could be also updated while configuring IO latency QoS parameters using cgroup. The elevator code path is now protected with q->elevator_lock. So we should protect the access to sysfs attribute wbt_lat_usec using q->elevator _lock instead of q->sysfs_lock. White we're at it, also protect ioc_qos_write(), which configures wbt parameters via cgroup, using q->elevator_lock. Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Signed-off-by: Nilay Shroff Link: https://lore.kernel.org/r/20250304102551.2533767-7-nilay@linux.ibm.com Signed-off-by: Jens Axboe --- block/blk-iocost.c | 2 ++ block/blk-sysfs.c | 20 ++++++++------------ include/linux/blkdev.h | 4 ++-- 3 files changed, 12 insertions(+), 14 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 6be46e28459b..38e7bf3c3b4f 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -3248,6 +3248,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, } memflags = blk_mq_freeze_queue(disk->queue); + mutex_lock(&disk->queue->elevator_lock); blk_mq_quiesce_queue(disk->queue); spin_lock_irq(&ioc->lock); @@ -3355,6 +3356,7 @@ einval: spin_unlock_irq(&ioc->lock); blk_mq_unquiesce_queue(disk->queue); + mutex_unlock(&disk->queue->elevator_lock); blk_mq_unfreeze_queue(disk->queue, memflags); ret = -EINVAL; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index f1fa57de29ed..223da196a548 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -557,7 +557,7 @@ static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page) ssize_t ret; struct request_queue *q = disk->queue; - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->elevator_lock); if (!wbt_rq_qos(q)) { ret = -EINVAL; goto out; @@ -570,7 +570,7 @@ static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page) ret = sysfs_emit(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000)); out: - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); return ret; } @@ -589,8 +589,8 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, if (val < -1) return -EINVAL; - mutex_lock(&q->sysfs_lock); memflags = blk_mq_freeze_queue(q); + mutex_lock(&q->elevator_lock); rqos = wbt_rq_qos(q); if (!rqos) { @@ -619,8 +619,8 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, blk_mq_unquiesce_queue(q); out: + mutex_unlock(&q->elevator_lock); blk_mq_unfreeze_queue(q, memflags); - mutex_unlock(&q->sysfs_lock); return ret; } @@ -689,19 +689,15 @@ static struct attribute *queue_attrs[] = { /* Request-based queue attributes that are not relevant for bio-based queues. */ static struct attribute *blk_mq_queue_attrs[] = { - /* - * Attributes which are protected with q->sysfs_lock. - */ -#ifdef CONFIG_BLK_WBT - &queue_wb_lat_entry.attr, -#endif /* * Attributes which require some form of locking other than * q->sysfs_lock. */ &elv_iosched_entry.attr, &queue_requests_entry.attr, - +#ifdef CONFIG_BLK_WBT + &queue_wb_lat_entry.attr, +#endif /* * Attributes which don't require locking. */ @@ -882,10 +878,10 @@ int blk_register_queue(struct gendisk *disk) goto out_crypto_sysfs_unregister; } } + wbt_enable_default(disk); mutex_unlock(&q->elevator_lock); blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); - wbt_enable_default(disk); /* Now everything is ready and send out KOBJ_ADD uevent */ kobject_uevent(&disk->queue_kobj, KOBJ_ADD); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3e66ad016a23..0ee3b5c9388e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -563,8 +563,8 @@ struct request_queue { /* * Protects against I/O scheduler switching, particularly when * updating q->elevator. Since the elevator update code path may - * also modify q->nr_requests, this lock also protects the sysfs - * attribute nr_requests. + * also modify q->nr_requests and wbt latency, this lock also + * protects the sysfs attributes nr_requests and wbt_lat_usec. * To ensure proper locking order during an elevator update, first * freeze the queue, then acquire ->elevator_lock. */ -- cgit v1.2.3 From 5e40f4452dc9a3fb44d13bb6bc7032f3911a2675 Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Tue, 4 Mar 2025 15:52:36 +0530 Subject: block: protect read_ahead_kb using q->limits_lock The bdi->ra_pages could be updated under q->limits_lock because it's usually calculated from the queue limits by queue_limits_commit_update. So protect reading/writing the sysfs attribute read_ahead_kb using q->limits_lock instead of q->sysfs_lock. Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Signed-off-by: Nilay Shroff Link: https://lore.kernel.org/r/20250304102551.2533767-8-nilay@linux.ibm.com Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 16 ++++++++++------ include/linux/blkdev.h | 3 +++ 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 223da196a548..d584461a1d84 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -93,9 +93,9 @@ static ssize_t queue_ra_show(struct gendisk *disk, char *page) { ssize_t ret; - mutex_lock(&disk->queue->sysfs_lock); + mutex_lock(&disk->queue->limits_lock); ret = queue_var_show(disk->bdi->ra_pages << (PAGE_SHIFT - 10), page); - mutex_unlock(&disk->queue->sysfs_lock); + mutex_unlock(&disk->queue->limits_lock); return ret; } @@ -111,12 +111,15 @@ queue_ra_store(struct gendisk *disk, const char *page, size_t count) ret = queue_var_store(&ra_kb, page, count); if (ret < 0) return ret; - - mutex_lock(&q->sysfs_lock); + /* + * ->ra_pages is protected by ->limits_lock because it is usually + * calculated from the queue limits by queue_limits_commit_update. + */ + mutex_lock(&q->limits_lock); memflags = blk_mq_freeze_queue(q); disk->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10); + mutex_unlock(&q->limits_lock); blk_mq_unfreeze_queue(q, memflags); - mutex_unlock(&q->sysfs_lock); return ret; } @@ -670,7 +673,8 @@ static struct attribute *queue_attrs[] = { &queue_dma_alignment_entry.attr, /* - * Attributes which are protected with q->sysfs_lock. + * Attributes which require some form of locking other than + * q->sysfs_lock. */ &queue_ra_entry.attr, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0ee3b5c9388e..3bee1b4858b6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -571,6 +571,9 @@ struct request_queue { struct mutex elevator_lock; struct mutex sysfs_lock; + /* + * Protects queue limits and also sysfs attribute read_ahead_kb. + */ struct mutex limits_lock; /* -- cgit v1.2.3 From 5abba4cebec0a591ca7e7f55701e42cd5dc059af Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Thu, 6 Mar 2025 15:09:53 +0530 Subject: block: protect hctx attributes/params using q->elevator_lock Currently, hctx attributes (nr_tags, nr_reserved_tags, and cpu_list) are protected using `q->sysfs_lock`. However, these attributes can be updated in multiple scenarios: - During the driver's probe method. - When updating nr_hw_queues. - When writing to the sysfs attribute nr_requests, which can modify nr_tags. The nr_requests attribute is already protected using q->elevator_lock, but none of the update paths actually use q->sysfs_lock to protect hctx attributes. So to ensure proper synchronization, replace q->sysfs_lock with q->elevator_lock when reading hctx attributes through sysfs. Additionally, blk_mq_update_nr_hw_queues allocates and updates hctx. The allocation of hctx is protected using q->elevator_lock, however, updating hctx params happens without any protection, so safeguard hctx param update path by also using q->elevator_lock. Signed-off-by: Nilay Shroff Link: https://lore.kernel.org/r/20250306093956.2818808-1-nilay@linux.ibm.com [axboe: wrap comment at 80 chars] Signed-off-by: Jens Axboe --- block/blk-mq-sysfs.c | 4 ++-- block/blk-mq.c | 4 ++++ include/linux/blkdev.h | 14 ++++++++------ 3 files changed, 14 insertions(+), 8 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 3feeeccf8a99..24656980f443 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -61,9 +61,9 @@ static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj, if (!entry->show) return -EIO; - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->elevator_lock); res = entry->show(hctx, page); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); return res; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 5a2d63927525..b9550a127c8e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4094,6 +4094,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) struct blk_mq_ctx *ctx; struct blk_mq_tag_set *set = q->tag_set; + mutex_lock(&q->elevator_lock); + queue_for_each_hw_ctx(q, hctx, i) { cpumask_clear(hctx->cpumask); hctx->nr_ctx = 0; @@ -4198,6 +4200,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) hctx->next_cpu = blk_mq_first_mapped_cpu(hctx); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } + + mutex_unlock(&q->elevator_lock); } /* diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3bee1b4858b6..dcf8fce15e23 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -561,12 +561,14 @@ struct request_queue { struct list_head flush_list; /* - * Protects against I/O scheduler switching, particularly when - * updating q->elevator. Since the elevator update code path may - * also modify q->nr_requests and wbt latency, this lock also - * protects the sysfs attributes nr_requests and wbt_lat_usec. - * To ensure proper locking order during an elevator update, first - * freeze the queue, then acquire ->elevator_lock. + * Protects against I/O scheduler switching, particularly when updating + * q->elevator. Since the elevator update code path may also modify q-> + * nr_requests and wbt latency, this lock also protects the sysfs attrs + * nr_requests and wbt_lat_usec. Additionally the nr_hw_queues update + * may modify hctx tags, reserved-tags and cpumask, so this lock also + * helps protect the hctx attrs. To ensure proper locking order during + * an elevator or nr_hw_queue update, first freeze the queue, then + * acquire ->elevator_lock. */ struct mutex elevator_lock; -- cgit v1.2.3 From a3996d11f3ab743e6cc4e3529ce9459c2cd27139 Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Thu, 13 Mar 2025 17:21:50 +0530 Subject: block: protect debugfs attrs using elevator_lock instead of sysfs_lock Currently, the block debugfs attributes (tags, tags_bitmap, sched_tags, and sched_tags_bitmap) are protected using q->sysfs_lock. However, these attributes are updated in multiple scenarios: - During driver probe method - During an elevator switch/update - During an nr_hw_queues update - When writing to the sysfs attribute nr_requests All these update paths (except driver probe method, which doesn't require any protection) are already protected using q->elevator_lock. To ensure consistency and proper synchronization, replace q->sysfs_lock with q->elevator_lock for protecting these debugfs attributes. Signed-off-by: Nilay Shroff Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250313115235.3707600-2-nilay@linux.ibm.com [axboe: some commit message rewording/fixes] Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 16 ++++++++-------- include/linux/blkdev.h | 6 +++--- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index adf5f0697b6b..62775b132d4c 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -400,12 +400,12 @@ static int hctx_tags_show(void *data, struct seq_file *m) struct request_queue *q = hctx->queue; int res; - res = mutex_lock_interruptible(&q->sysfs_lock); + res = mutex_lock_interruptible(&q->elevator_lock); if (res) goto out; if (hctx->tags) blk_mq_debugfs_tags_show(m, hctx->tags); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); out: return res; @@ -417,12 +417,12 @@ static int hctx_tags_bitmap_show(void *data, struct seq_file *m) struct request_queue *q = hctx->queue; int res; - res = mutex_lock_interruptible(&q->sysfs_lock); + res = mutex_lock_interruptible(&q->elevator_lock); if (res) goto out; if (hctx->tags) sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); out: return res; @@ -434,12 +434,12 @@ static int hctx_sched_tags_show(void *data, struct seq_file *m) struct request_queue *q = hctx->queue; int res; - res = mutex_lock_interruptible(&q->sysfs_lock); + res = mutex_lock_interruptible(&q->elevator_lock); if (res) goto out; if (hctx->sched_tags) blk_mq_debugfs_tags_show(m, hctx->sched_tags); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); out: return res; @@ -451,12 +451,12 @@ static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m) struct request_queue *q = hctx->queue; int res; - res = mutex_lock_interruptible(&q->sysfs_lock); + res = mutex_lock_interruptible(&q->elevator_lock); if (res) goto out; if (hctx->sched_tags) sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); out: return res; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index dcf8fce15e23..8d072042790e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -566,9 +566,9 @@ struct request_queue { * nr_requests and wbt latency, this lock also protects the sysfs attrs * nr_requests and wbt_lat_usec. Additionally the nr_hw_queues update * may modify hctx tags, reserved-tags and cpumask, so this lock also - * helps protect the hctx attrs. To ensure proper locking order during - * an elevator or nr_hw_queue update, first freeze the queue, then - * acquire ->elevator_lock. + * helps protect the hctx sysfs/debugfs attrs. To ensure proper locking + * order during an elevator or nr_hw_queue update, first freeze the + * queue, then acquire ->elevator_lock. */ struct mutex elevator_lock; -- cgit v1.2.3