From 590347e4000356f55eb10b03ced2686bd74dab40 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 22 Feb 2018 16:56:16 +0100 Subject: dm bufio: avoid false-positive Wmaybe-uninitialized warning gcc-6.3 and earlier show a new warning after a seemingly unrelated change to the arm64 PAGE_KERNEL definition: In file included from drivers/md/dm-bufio.c:14:0: drivers/md/dm-bufio.c: In function 'alloc_buffer': include/linux/sched/mm.h:182:56: warning: 'noio_flag' may be used uninitialized in this function [-Wmaybe-uninitialized] current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags; ^ The same warning happened earlier on linux-3.18 for MIPS and I did a workaround for that, but now it's come back. gcc-7 and newer are apparently smart enough to figure this out, and other architectures don't show it, so the best I could come up with is to rework the caller slightly in a way that makes it obvious enough to all arm64 compilers what is happening here. Fixes: 41acec624087 ("arm64: kpti: Make use of nG dependent on arm64_kernel_unmapped_at_el0()") Link: https://patchwork.kernel.org/patch/9692829/ Cc: stable@vger.kernel.org Signed-off-by: Arnd Bergmann [snitzer: moved declarations inside conditional, altered vmalloc return] Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 414c9af54ded..aa2032fa80d4 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -386,9 +386,6 @@ static void __cache_size_refresh(void) static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask, enum data_mode *data_mode) { - unsigned noio_flag; - void *ptr; - if (c->block_size <= DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT) { *data_mode = DATA_MODE_SLAB; return kmem_cache_alloc(DM_BUFIO_CACHE(c), gfp_mask); @@ -412,16 +409,15 @@ static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask, * all allocations done by this process (including pagetables) are done * as if GFP_NOIO was specified. */ + if (gfp_mask & __GFP_NORETRY) { + unsigned noio_flag = memalloc_noio_save(); + void *ptr = __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL); - if (gfp_mask & __GFP_NORETRY) - noio_flag = memalloc_noio_save(); - - ptr = __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL); - - if (gfp_mask & __GFP_NORETRY) memalloc_noio_restore(noio_flag); + return ptr; + } - return ptr; + return __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL); } /* -- cgit v1.2.3 From 519049afead4f7c3e6446028c41e99fde958cc04 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 22 Feb 2018 13:31:20 -0500 Subject: dm: use blkdev_get rather than bdgrab when issuing pass-through ioctl Otherwise an underlying device's teardown (e.g. SCSI) may race with the DM ioctl or persistent reservation and result in dereferencing driver memory that gets freed when the underlying device's final blkdev_put() occurs. bdgrab() only increases the refcount for the block_device's inode to ensure the block_device struct itself will not be freed, but does not guarantee the block_device will remain associated with the gendisk or its storage. Cc: stable@vger.kernel.org # 4.8+ Reported-by: David Jeffery Suggested-by: David Jeffery Reviewed-by: Ben Marzinski Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 68136806d365..45328d8b2859 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -458,9 +458,11 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) return dm_get_geometry(md, geo); } -static int dm_grab_bdev_for_ioctl(struct mapped_device *md, - struct block_device **bdev, - fmode_t *mode) +static char *_dm_claim_ptr = "I belong to device-mapper"; + +static int dm_get_bdev_for_ioctl(struct mapped_device *md, + struct block_device **bdev, + fmode_t *mode) { struct dm_target *tgt; struct dm_table *map; @@ -490,6 +492,10 @@ retry: goto out; bdgrab(*bdev); + r = blkdev_get(*bdev, *mode, _dm_claim_ptr); + if (r < 0) + goto out; + dm_put_live_table(md, srcu_idx); return r; @@ -508,7 +514,7 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, struct mapped_device *md = bdev->bd_disk->private_data; int r; - r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); + r = dm_get_bdev_for_ioctl(md, &bdev, &mode); if (r < 0) return r; @@ -528,7 +534,7 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, r = __blkdev_driver_ioctl(bdev, mode, cmd, arg); out: - bdput(bdev); + blkdev_put(bdev, mode); return r; } @@ -708,14 +714,13 @@ static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU) static int open_table_device(struct table_device *td, dev_t dev, struct mapped_device *md) { - static char *_claim_ptr = "I belong to device-mapper"; struct block_device *bdev; int r; BUG_ON(td->dm_dev.bdev); - bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _claim_ptr); + bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr); if (IS_ERR(bdev)) return PTR_ERR(bdev); @@ -3011,7 +3016,7 @@ static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type, fmode_t mode; int r; - r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); + r = dm_get_bdev_for_ioctl(md, &bdev, &mode); if (r < 0) return r; @@ -3021,7 +3026,7 @@ static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type, else r = -EOPNOTSUPP; - bdput(bdev); + blkdev_put(bdev, mode); return r; } @@ -3032,7 +3037,7 @@ static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type) fmode_t mode; int r; - r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); + r = dm_get_bdev_for_ioctl(md, &bdev, &mode); if (r < 0) return r; @@ -3042,7 +3047,7 @@ static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type) else r = -EOPNOTSUPP; - bdput(bdev); + blkdev_put(bdev, mode); return r; } @@ -3054,7 +3059,7 @@ static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key, fmode_t mode; int r; - r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); + r = dm_get_bdev_for_ioctl(md, &bdev, &mode); if (r < 0) return r; @@ -3064,7 +3069,7 @@ static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key, else r = -EOPNOTSUPP; - bdput(bdev); + blkdev_put(bdev, mode); return r; } @@ -3075,7 +3080,7 @@ static int dm_pr_clear(struct block_device *bdev, u64 key) fmode_t mode; int r; - r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); + r = dm_get_bdev_for_ioctl(md, &bdev, &mode); if (r < 0) return r; @@ -3085,7 +3090,7 @@ static int dm_pr_clear(struct block_device *bdev, u64 key) else r = -EOPNOTSUPP; - bdput(bdev); + blkdev_put(bdev, mode); return r; } -- cgit v1.2.3 From da1e148803e0b98961599b0295418bb7a8fc79f3 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Tue, 27 Feb 2018 21:58:59 +0100 Subject: dm raid: fix incorrect sync_ratio when degraded MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Upstream commit 4102d9de6d375 ("dm raid: fix rs_get_progress() synchronization state/ratio") in combination with commit 7c29744ecce ("dm raid: simplify rs_get_progress()") introduced a regression by incorrectly reporting a sync_ratio of 0 for degraded raid sets. This caused lvm2 to fail to repair raid legs automatically. Fix by identifying the degraded state by checking the MD_RECOVERY_INTR flag and returning mddev->recovery_cp in case it is set. MD sets recovery = [ MD_RECOVERY_RECOVER MD_RECOVERY_INTR MD_RECOVERY_NEEDED ] when a RAID member fails. It then shuts down any sync thread that is running and leaves us with all MD_RECOVERY_* flags cleared. The bug occurs if a status is requested in the short time it takes to shut down any sync thread and clear the flags, because we were keying in on the MD_RECOVERY_NEEDED - understanding it to be the initial phase of a “recover” sync thread. However, this is an incorrect interpretation if MD_RECOVERY_INTR is also set. This also explains why the bug only happened when automatic repair was enabled and not a normal ‘manual’ method. It is impossible to react quick enough to hit the problematic window without it being automated. Fix passes automatic repair tests. Fixes: 7c29744ecce ("dm raid: simplify rs_get_progress()") Signed-off-by: Jonathan Brassow Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 7ef469e902c6..c1d1034ff7b7 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3408,9 +3408,10 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); } else { - if (test_bit(MD_RECOVERY_NEEDED, &recovery) || - test_bit(MD_RECOVERY_RESHAPE, &recovery) || - test_bit(MD_RECOVERY_RUNNING, &recovery)) + if (!test_bit(MD_RECOVERY_INTR, &recovery) && + (test_bit(MD_RECOVERY_NEEDED, &recovery) || + test_bit(MD_RECOVERY_RESHAPE, &recovery) || + test_bit(MD_RECOVERY_RUNNING, &recovery))) r = mddev->curr_resync_completed; else r = mddev->recovery_cp; -- cgit v1.2.3 From 99243b922c9ddb4976b8db2eeffb0aed6e06c6f9 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 26 Feb 2018 15:22:32 -0500 Subject: dm table: fix "nvme" test The strncmp function should compare 4 bytes. Fixes: 22c11858e8002 ("dm: introduce DM_TYPE_NVME_BIO_BASED") Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-table.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 5fe7ec356c33..30b3294a8778 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1755,7 +1755,7 @@ static int device_no_partial_completion(struct dm_target *ti, struct dm_dev *dev char b[BDEVNAME_SIZE]; /* For now, NVMe devices are the only devices of this class */ - return (strncmp(bdevname(dev->bdev, b), "nvme", 3) == 0); + return (strncmp(bdevname(dev->bdev, b), "nvme", 4) == 0); } static bool dm_table_does_not_support_partial_completion(struct dm_table *t) -- cgit v1.2.3 From 8d47e65948ddea4398892946d9e50778a316b397 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 5 Mar 2018 14:10:11 -0500 Subject: dm mpath: remove unnecessary NVMe branching in favor of scsi_dh checks This eliminates the "queue_mode" configuration's "nvme" mode. There wasn't anything NVMe-specific about that mode. It was named "nvme" because it was a short name for the mode. But the entire point of the mode was to optimize the multipath target for underlying devices that are _not_ SCSI-based. Devices that aren't SCSI have no need for the various SCSI device handler (scsi_dh) specific code in DM multipath. But rather than narrowly define this scsi_dh vs not branching in terms of "nvme": invert the logic so that we're just checking whether a multipath device is layered on SCSI devices with scsi_dh attached. This allows any future storage technology to avoid scsi_dh specific code in the multipath target too. Fixes: 848b8aefd4 ("dm mpath: optimize NVMe bio-based support") Suggested-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 66 ++++++++++++++++++++++----------------------------- 1 file changed, 29 insertions(+), 37 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 7d3e572072f5..3fde9e9faddd 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -211,25 +212,13 @@ static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m) else m->queue_mode = DM_TYPE_REQUEST_BASED; - } else if (m->queue_mode == DM_TYPE_BIO_BASED || - m->queue_mode == DM_TYPE_NVME_BIO_BASED) { + } else if (m->queue_mode == DM_TYPE_BIO_BASED) { INIT_WORK(&m->process_queued_bios, process_queued_bios); - - if (m->queue_mode == DM_TYPE_BIO_BASED) { - /* - * bio-based doesn't support any direct scsi_dh management; - * it just discovers if a scsi_dh is attached. - */ - set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags); - } - } - - if (m->queue_mode != DM_TYPE_NVME_BIO_BASED) { - set_bit(MPATHF_QUEUE_IO, &m->flags); - atomic_set(&m->pg_init_in_progress, 0); - atomic_set(&m->pg_init_count, 0); - m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; - init_waitqueue_head(&m->pg_init_wait); + /* + * bio-based doesn't support any direct scsi_dh management; + * it just discovers if a scsi_dh is attached. + */ + set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags); } dm_table_set_type(ti->table, m->queue_mode); @@ -337,14 +326,12 @@ static void __switch_pg(struct multipath *m, struct priority_group *pg) { m->current_pg = pg; - if (m->queue_mode == DM_TYPE_NVME_BIO_BASED) - return; - /* Must we initialise the PG first, and queue I/O till it's ready? */ if (m->hw_handler_name) { set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags); set_bit(MPATHF_QUEUE_IO, &m->flags); } else { + /* FIXME: not needed if no scsi_dh is attached */ clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags); clear_bit(MPATHF_QUEUE_IO, &m->flags); } @@ -385,8 +372,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) unsigned bypassed = 1; if (!atomic_read(&m->nr_valid_paths)) { - if (m->queue_mode != DM_TYPE_NVME_BIO_BASED) - clear_bit(MPATHF_QUEUE_IO, &m->flags); + clear_bit(MPATHF_QUEUE_IO, &m->flags); goto failed; } @@ -599,7 +585,7 @@ static struct pgpath *__map_bio(struct multipath *m, struct bio *bio) return pgpath; } -static struct pgpath *__map_bio_nvme(struct multipath *m, struct bio *bio) +static struct pgpath *__map_bio_fast(struct multipath *m, struct bio *bio) { struct pgpath *pgpath; unsigned long flags; @@ -634,8 +620,8 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, { struct pgpath *pgpath; - if (m->queue_mode == DM_TYPE_NVME_BIO_BASED) - pgpath = __map_bio_nvme(m, bio); + if (!m->hw_handler_name) + pgpath = __map_bio_fast(m, bio); else pgpath = __map_bio(m, bio); @@ -675,8 +661,7 @@ static void process_queued_io_list(struct multipath *m) { if (m->queue_mode == DM_TYPE_MQ_REQUEST_BASED) dm_mq_kick_requeue_list(dm_table_get_md(m->ti->table)); - else if (m->queue_mode == DM_TYPE_BIO_BASED || - m->queue_mode == DM_TYPE_NVME_BIO_BASED) + else if (m->queue_mode == DM_TYPE_BIO_BASED) queue_work(kmultipathd, &m->process_queued_bios); } @@ -838,6 +823,16 @@ retain: */ kfree(m->hw_handler_name); m->hw_handler_name = attached_handler_name; + + /* + * Init fields that are only used when a scsi_dh is attached + */ + if (!test_and_set_bit(MPATHF_QUEUE_IO, &m->flags)) { + atomic_set(&m->pg_init_in_progress, 0); + atomic_set(&m->pg_init_count, 0); + m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; + init_waitqueue_head(&m->pg_init_wait); + } } } @@ -873,6 +868,7 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps int r; struct pgpath *p; struct multipath *m = ti->private; + struct scsi_device *sdev; /* we need at least a path arg */ if (as->argc < 1) { @@ -891,7 +887,9 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps goto bad; } - if (m->queue_mode != DM_TYPE_NVME_BIO_BASED) { + sdev = scsi_device_from_queue(bdev_get_queue(p->path.dev->bdev)); + if (sdev) { + put_device(&sdev->sdev_gendev); INIT_DELAYED_WORK(&p->activate_path, activate_path_work); r = setup_scsi_dh(p->path.dev->bdev, m, &ti->error); if (r) { @@ -1001,8 +999,7 @@ static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m) if (!hw_argc) return 0; - if (m->queue_mode == DM_TYPE_BIO_BASED || - m->queue_mode == DM_TYPE_NVME_BIO_BASED) { + if (m->queue_mode == DM_TYPE_BIO_BASED) { dm_consume_args(as, hw_argc); DMERR("bio-based multipath doesn't allow hardware handler args"); return 0; @@ -1091,8 +1088,6 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m) if (!strcasecmp(queue_mode_name, "bio")) m->queue_mode = DM_TYPE_BIO_BASED; - else if (!strcasecmp(queue_mode_name, "nvme")) - m->queue_mode = DM_TYPE_NVME_BIO_BASED; else if (!strcasecmp(queue_mode_name, "rq")) m->queue_mode = DM_TYPE_REQUEST_BASED; else if (!strcasecmp(queue_mode_name, "mq")) @@ -1193,7 +1188,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->num_discard_bios = 1; ti->num_write_same_bios = 1; ti->num_write_zeroes_bios = 1; - if (m->queue_mode == DM_TYPE_BIO_BASED || m->queue_mode == DM_TYPE_NVME_BIO_BASED) + if (m->queue_mode == DM_TYPE_BIO_BASED) ti->per_io_data_size = multipath_per_bio_data_size(); else ti->per_io_data_size = sizeof(struct dm_mpath_io); @@ -1730,9 +1725,6 @@ static void multipath_status(struct dm_target *ti, status_type_t type, case DM_TYPE_BIO_BASED: DMEMIT("queue_mode bio "); break; - case DM_TYPE_NVME_BIO_BASED: - DMEMIT("queue_mode nvme "); - break; case DM_TYPE_MQ_REQUEST_BASED: DMEMIT("queue_mode mq "); break; -- cgit v1.2.3 From c934edadcc7a64e399942ae34b912939057a77a7 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 5 Mar 2018 15:26:06 -0500 Subject: dm table: allow upgrade from bio-based to specialized bio-based variant In practice this is really only meaningful in the context of the DM multipath target (which uses dm_table_set_type() to set the type of device DM should create via its "queue_mode" option). So this change allows a DM multipath device with "queue_mode bio" to be upgraded from DM_TYPE_BIO_BASED to DM_TYPE_NVME_BIO_BASED -- iff the underlying device(s) are NVMe. DM_TYPE_NVME_BIO_BASED is just a DM core implementation detail that allows for NVMe-specific optimizations (e.g. use direct_make_request instead of generic_make_request). If in the future there is no benefit or need to distinguish NVMe vs not: then it will be removed. Signed-off-by: Mike Snitzer --- drivers/md/dm-table.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 30b3294a8778..7eb3e2a3c07d 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -942,17 +942,12 @@ static int dm_table_determine_type(struct dm_table *t) if (t->type != DM_TYPE_NONE) { /* target already set the table's type */ - if (t->type == DM_TYPE_BIO_BASED) - return 0; - else if (t->type == DM_TYPE_NVME_BIO_BASED) { - if (!dm_table_does_not_support_partial_completion(t)) { - DMERR("nvme bio-based is only possible with devices" - " that don't support partial completion"); - return -EINVAL; - } - /* Fallthru, also verify all devices are blk-mq */ + if (t->type == DM_TYPE_BIO_BASED) { + /* possibly upgrade to a variant of bio-based */ + goto verify_bio_based; } BUG_ON(t->type == DM_TYPE_DAX_BIO_BASED); + BUG_ON(t->type == DM_TYPE_NVME_BIO_BASED); goto verify_rq_based; } @@ -985,6 +980,7 @@ static int dm_table_determine_type(struct dm_table *t) } if (bio_based) { +verify_bio_based: /* We must use this table as bio-based */ t->type = DM_TYPE_BIO_BASED; if (dm_table_supports_dax(t) || -- cgit v1.2.3