From 5651cd3c43368873d0787b52acb2e0e08f3c5da4 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 28 May 2019 22:49:04 -0700 Subject: nvme-rdma: fix queue mapping when queue count is limited When the controller supports less queues than requested, we should make sure that queue mapping does the right thing and not assume that all queues are available. This fixes a crash when the controller supports less queues than requested. The rules are: 1. if no write/poll queues are requested, we assign the available queues to the default queue map. The default and read queue maps share the existing queues. 2. if write queues are requested: - first make sure that read queue map gets the requested nr_io_queues count - then grant the default queue map the minimum between the requested nr_write_queues and the remaining queues. If there are no available queues to dedicate to the default queue map, fallback to (1) and share all the queues in the existing queue map. 3. if poll queues are requested: - map the remaining queues to the poll queue map. Also, provide a log indication on how we constructed the different queue maps. Reported-by: Harris, James R Reviewed-by: Max Gurtovoy Tested-by: Jim Harris Cc: # v5.0+ Signed-off-by: Sagi Grimberg --- drivers/nvme/host/rdma.c | 99 +++++++++++++++++++++++++++++------------------- 1 file changed, 61 insertions(+), 38 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index f383146e7d0f..26709a2ab593 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -641,34 +641,16 @@ static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl) { struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; struct ib_device *ibdev = ctrl->device->dev; - unsigned int nr_io_queues; + unsigned int nr_io_queues, nr_default_queues; + unsigned int nr_read_queues, nr_poll_queues; int i, ret; - nr_io_queues = min(opts->nr_io_queues, num_online_cpus()); - - /* - * we map queues according to the device irq vectors for - * optimal locality so we don't need more queues than - * completion vectors. - */ - nr_io_queues = min_t(unsigned int, nr_io_queues, - ibdev->num_comp_vectors); - - if (opts->nr_write_queues) { - ctrl->io_queues[HCTX_TYPE_DEFAULT] = - min(opts->nr_write_queues, nr_io_queues); - nr_io_queues += ctrl->io_queues[HCTX_TYPE_DEFAULT]; - } else { - ctrl->io_queues[HCTX_TYPE_DEFAULT] = nr_io_queues; - } - - ctrl->io_queues[HCTX_TYPE_READ] = nr_io_queues; - - if (opts->nr_poll_queues) { - ctrl->io_queues[HCTX_TYPE_POLL] = - min(opts->nr_poll_queues, num_online_cpus()); - nr_io_queues += ctrl->io_queues[HCTX_TYPE_POLL]; - } + nr_read_queues = min_t(unsigned int, ibdev->num_comp_vectors, + min(opts->nr_io_queues, num_online_cpus())); + nr_default_queues = min_t(unsigned int, ibdev->num_comp_vectors, + min(opts->nr_write_queues, num_online_cpus())); + nr_poll_queues = min(opts->nr_poll_queues, num_online_cpus()); + nr_io_queues = nr_read_queues + nr_default_queues + nr_poll_queues; ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues); if (ret) @@ -681,6 +663,34 @@ static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl) dev_info(ctrl->ctrl.device, "creating %d I/O queues.\n", nr_io_queues); + if (opts->nr_write_queues && nr_read_queues < nr_io_queues) { + /* + * separate read/write queues + * hand out dedicated default queues only after we have + * sufficient read queues. + */ + ctrl->io_queues[HCTX_TYPE_READ] = nr_read_queues; + nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ]; + ctrl->io_queues[HCTX_TYPE_DEFAULT] = + min(nr_default_queues, nr_io_queues); + nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT]; + } else { + /* + * shared read/write queues + * either no write queues were requested, or we don't have + * sufficient queue count to have dedicated default queues. + */ + ctrl->io_queues[HCTX_TYPE_DEFAULT] = + min(nr_read_queues, nr_io_queues); + nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT]; + } + + if (opts->nr_poll_queues && nr_io_queues) { + /* map dedicated poll queues only if we have queues left */ + ctrl->io_queues[HCTX_TYPE_POLL] = + min(nr_poll_queues, nr_io_queues); + } + for (i = 1; i < ctrl->ctrl.queue_count; i++) { ret = nvme_rdma_alloc_queue(ctrl, i, ctrl->ctrl.sqsize + 1); @@ -1763,17 +1773,24 @@ static void nvme_rdma_complete_rq(struct request *rq) static int nvme_rdma_map_queues(struct blk_mq_tag_set *set) { struct nvme_rdma_ctrl *ctrl = set->driver_data; + struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; - set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; - set->map[HCTX_TYPE_DEFAULT].nr_queues = - ctrl->io_queues[HCTX_TYPE_DEFAULT]; - set->map[HCTX_TYPE_READ].nr_queues = ctrl->io_queues[HCTX_TYPE_READ]; - if (ctrl->ctrl.opts->nr_write_queues) { + if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) { /* separate read/write queues */ + set->map[HCTX_TYPE_DEFAULT].nr_queues = + ctrl->io_queues[HCTX_TYPE_DEFAULT]; + set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; + set->map[HCTX_TYPE_READ].nr_queues = + ctrl->io_queues[HCTX_TYPE_READ]; set->map[HCTX_TYPE_READ].queue_offset = - ctrl->io_queues[HCTX_TYPE_DEFAULT]; + ctrl->io_queues[HCTX_TYPE_DEFAULT]; } else { - /* mixed read/write queues */ + /* shared read/write queues */ + set->map[HCTX_TYPE_DEFAULT].nr_queues = + ctrl->io_queues[HCTX_TYPE_DEFAULT]; + set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; + set->map[HCTX_TYPE_READ].nr_queues = + ctrl->io_queues[HCTX_TYPE_DEFAULT]; set->map[HCTX_TYPE_READ].queue_offset = 0; } blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_DEFAULT], @@ -1781,16 +1798,22 @@ static int nvme_rdma_map_queues(struct blk_mq_tag_set *set) blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_READ], ctrl->device->dev, 0); - if (ctrl->ctrl.opts->nr_poll_queues) { + if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) { + /* map dedicated poll queues only if we have queues left */ set->map[HCTX_TYPE_POLL].nr_queues = ctrl->io_queues[HCTX_TYPE_POLL]; set->map[HCTX_TYPE_POLL].queue_offset = - ctrl->io_queues[HCTX_TYPE_DEFAULT]; - if (ctrl->ctrl.opts->nr_write_queues) - set->map[HCTX_TYPE_POLL].queue_offset += - ctrl->io_queues[HCTX_TYPE_READ]; + ctrl->io_queues[HCTX_TYPE_DEFAULT] + + ctrl->io_queues[HCTX_TYPE_READ]; blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]); } + + dev_info(ctrl->ctrl.device, + "mapped %d/%d/%d default/read/poll queues.\n", + ctrl->io_queues[HCTX_TYPE_DEFAULT], + ctrl->io_queues[HCTX_TYPE_READ], + ctrl->io_queues[HCTX_TYPE_POLL]); + return 0; } -- cgit v1.2.3 From 6486199378a505c58fddc47459631235c9fb7638 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 28 May 2019 22:49:05 -0700 Subject: nvme-tcp: fix queue mapping when queue count is limited When the controller supports less queues than requested, we should make sure that queue mapping does the right thing and not assume that all queues are available. This fixes a crash when the controller supports less queues than requested. The rules are: 1. if no write queues are requested, we assign the available queues to the default queue map. The default and read queue maps share the existing queues. 2. if write queues are requested: - first make sure that read queue map gets the requested nr_io_queues count - then grant the default queue map the minimum between the requested nr_write_queues and the remaining queues. If there are no available queues to dedicate to the default queue map, fallback to (1) and share all the queues in the existing queue map. Also, provide a log indication on how we constructed the different queue maps. Reported-by: Harris, James R Tested-by: Jim Harris Cc: # v5.0+ Suggested-by: Roy Shterman Signed-off-by: Sagi Grimberg --- drivers/nvme/host/tcp.c | 57 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 50 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 2b107a1d152b..08a2501b9357 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -111,6 +111,7 @@ struct nvme_tcp_ctrl { struct work_struct err_work; struct delayed_work connect_work; struct nvme_tcp_request async_req; + u32 io_queues[HCTX_MAX_TYPES]; }; static LIST_HEAD(nvme_tcp_ctrl_list); @@ -1564,6 +1565,35 @@ static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl) return nr_io_queues; } +static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl, + unsigned int nr_io_queues) +{ + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + struct nvmf_ctrl_options *opts = nctrl->opts; + + if (opts->nr_write_queues && opts->nr_io_queues < nr_io_queues) { + /* + * separate read/write queues + * hand out dedicated default queues only after we have + * sufficient read queues. + */ + ctrl->io_queues[HCTX_TYPE_READ] = opts->nr_io_queues; + nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ]; + ctrl->io_queues[HCTX_TYPE_DEFAULT] = + min(opts->nr_write_queues, nr_io_queues); + nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT]; + } else { + /* + * shared read/write queues + * either no write queues were requested, or we don't have + * sufficient queue count to have dedicated default queues. + */ + ctrl->io_queues[HCTX_TYPE_DEFAULT] = + min(opts->nr_io_queues, nr_io_queues); + nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT]; + } +} + static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) { unsigned int nr_io_queues; @@ -1581,6 +1611,8 @@ static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) dev_info(ctrl->device, "creating %d I/O queues.\n", nr_io_queues); + nvme_tcp_set_io_queues(ctrl, nr_io_queues); + return __nvme_tcp_alloc_io_queues(ctrl); } @@ -2089,23 +2121,34 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx, static int nvme_tcp_map_queues(struct blk_mq_tag_set *set) { struct nvme_tcp_ctrl *ctrl = set->driver_data; + struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; - set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; - set->map[HCTX_TYPE_READ].nr_queues = ctrl->ctrl.opts->nr_io_queues; - if (ctrl->ctrl.opts->nr_write_queues) { + if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) { /* separate read/write queues */ set->map[HCTX_TYPE_DEFAULT].nr_queues = - ctrl->ctrl.opts->nr_write_queues; + ctrl->io_queues[HCTX_TYPE_DEFAULT]; + set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; + set->map[HCTX_TYPE_READ].nr_queues = + ctrl->io_queues[HCTX_TYPE_READ]; set->map[HCTX_TYPE_READ].queue_offset = - ctrl->ctrl.opts->nr_write_queues; + ctrl->io_queues[HCTX_TYPE_DEFAULT]; } else { - /* mixed read/write queues */ + /* shared read/write queues */ set->map[HCTX_TYPE_DEFAULT].nr_queues = - ctrl->ctrl.opts->nr_io_queues; + ctrl->io_queues[HCTX_TYPE_DEFAULT]; + set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; + set->map[HCTX_TYPE_READ].nr_queues = + ctrl->io_queues[HCTX_TYPE_DEFAULT]; set->map[HCTX_TYPE_READ].queue_offset = 0; } blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); blk_mq_map_queues(&set->map[HCTX_TYPE_READ]); + + dev_info(ctrl->ctrl.device, + "mapped %d/%d default/read queues.\n", + ctrl->io_queues[HCTX_TYPE_DEFAULT], + ctrl->io_queues[HCTX_TYPE_READ]); + return 0; } -- cgit v1.2.3 From 3562f5d9f21e7779ae442a45197fed6cb247fd22 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sun, 2 Jun 2019 12:43:39 +0900 Subject: nvmet: fix data_len to 0 for bdev-backed write_zeroes The WRITE ZEROES command has no data transfer so that we need to initialize the struct (nvmet_req *req)->data_len to 0x0. While (nvmet_req *req)->transfer_len is initialized in nvmet_req_init(), data_len will be initialized by nowhere which might cause the failure with status code NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR randomly. It's because nvmet_req_execute() checks like: if (unlikely(req->data_len != req->transfer_len)) { req->error_loc = offsetof(struct nvme_common_command, dptr); nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR); } else req->execute(req); This patch fixes req->data_len not to be a randomly assigned by initializing it to 0x0 when preparing the command in nvmet_bdev_parse_io_cmd(). nvmet_file_parse_io_cmd() which is for file-backed I/O has already initialized the data_len field to 0x0, though. Cc: Christoph Hellwig Cc: Sagi Grimberg Cc: Chaitanya Kulkarni Signed-off-by: Minwoo Im Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/target/io-cmd-bdev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers') diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 3efc52f9c309..7a1cf6437a6a 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -293,6 +293,7 @@ u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req) return 0; case nvme_cmd_write_zeroes: req->execute = nvmet_bdev_execute_write_zeroes; + req->data_len = 0; return 0; default: pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode, -- cgit v1.2.3 From e0241fb0b94399bb4131da7f29dc92cf5dc1ae71 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 3 Jun 2019 21:47:54 +0200 Subject: block: aoe: no need to check return value of debugfs_create functions When calling debugfs functions, there is no need to ever check the return value. The function can work or not, but the code logic should never do something different based on this. Cc: "Ed L. Cashin" Cc: linux-block@vger.kernel.org Cc: Omar Sandoval Acked-by: Justin Sanders Signed-off-by: Greg Kroah-Hartman Signed-off-by: Jens Axboe --- drivers/block/aoe/aoeblk.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'drivers') diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index e2c6aae2d636..bd19f8af950b 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -196,7 +196,6 @@ static const struct file_operations aoe_debugfs_fops = { static void aoedisk_add_debugfs(struct aoedev *d) { - struct dentry *entry; char *p; if (aoe_debugfs_dir == NULL) @@ -207,15 +206,8 @@ aoedisk_add_debugfs(struct aoedev *d) else p++; BUG_ON(*p == '\0'); - entry = debugfs_create_file(p, 0444, aoe_debugfs_dir, d, - &aoe_debugfs_fops); - if (IS_ERR_OR_NULL(entry)) { - pr_info("aoe: cannot create debugfs file for %s\n", - d->gd->disk_name); - return; - } - BUG_ON(d->debugfs); - d->debugfs = entry; + d->debugfs = debugfs_create_file(p, 0444, aoe_debugfs_dir, d, + &aoe_debugfs_fops); } void aoedisk_rm_debugfs(struct aoedev *d) @@ -472,10 +464,6 @@ aoeblk_init(void) if (buf_pool_cache == NULL) return -ENOMEM; aoe_debugfs_dir = debugfs_create_dir("aoe", NULL); - if (IS_ERR_OR_NULL(aoe_debugfs_dir)) { - pr_info("aoe: cannot create debugfs directory\n"); - aoe_debugfs_dir = NULL; - } return 0; } -- cgit v1.2.3 From a48bc520011ea7a701826a9e3a770b128f283328 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 Jun 2019 21:08:24 +0200 Subject: nvme-pci: don't limit DMA segement size NVMe uses PRPs (or optionally unlimited SGLs) for data transfers and has no specific limit for a single DMA segement. Limiting the size will cause problems because the block layer assumes PRP-ish devices using a virt boundary mask don't have a segment limit. And while this is true, we also really need to tell the DMA mapping layer about it, otherwise dma-debug will trip over it. Signed-off-by: Christoph Hellwig Reported-by: Sebastian Ott Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f562154551ce..524d6bd6d095 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2513,6 +2513,12 @@ static void nvme_reset_work(struct work_struct *work) */ dev->ctrl.max_hw_sectors = NVME_MAX_KB_SZ << 1; dev->ctrl.max_segments = NVME_MAX_SEGS; + + /* + * Don't limit the IOMMU merged segment size. + */ + dma_set_max_seg_size(dev->dev, 0xffffffff); + mutex_unlock(&dev->shutdown_lock); /* -- cgit v1.2.3 From 84f3fc7aec42cefba34839d541d8bf8a3087e123 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 Jun 2019 21:08:25 +0200 Subject: rsxx: don't call dma_set_max_seg_size This driver does never uses dma_map_sg, so the setting is rather pointless. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/rsxx/core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers') diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c index de9b2d2f8654..76b73ddf8fd7 100644 --- a/drivers/block/rsxx/core.c +++ b/drivers/block/rsxx/core.c @@ -767,7 +767,6 @@ static int rsxx_pci_probe(struct pci_dev *dev, goto failed_enable; pci_set_master(dev); - dma_set_max_seg_size(&dev->dev, RSXX_HW_BLK_SIZE); st = dma_set_mask(&dev->dev, DMA_BIT_MASK(64)); if (st) { -- cgit v1.2.3 From bb6f59af309c69643b6b07d9372c01a1cc0792e7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 Jun 2019 21:08:26 +0200 Subject: mtip32xx: also set max_segment_size in the device If we only set the max_segment_size on the queue an IOMMU merge might create bigger segments again, so limit the IOMMU merges as well. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers') diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index bacfdac7161c..a14b09ab3a41 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3676,6 +3676,7 @@ skip_create_disk: blk_queue_physical_block_size(dd->queue, 4096); blk_queue_max_hw_sectors(dd->queue, 0xffff); blk_queue_max_segment_size(dd->queue, 0x400000); + dma_set_max_seg_size(&dd->pdev->dev, 0x400000); blk_queue_io_min(dd->queue, 4096); /* Set the capacity of the device in 512 byte sectors. */ -- cgit v1.2.3 From cf1db7fc8c2d31222701bd5c01b9cbaf89d8e7ce Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 Jun 2019 21:08:27 +0200 Subject: mmc: also set max_segment_size in the device If we only set the max_segment_size on the queue an IOMMU merge might create bigger segments again, so limit the IOMMU merges as well. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/mmc/core/queue.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers') diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index b5b9c6142f08..92900a095796 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -377,6 +377,8 @@ static void mmc_setup_queue(struct mmc_queue *mq, struct mmc_card *card) blk_queue_max_segment_size(mq->queue, round_down(host->max_seg_size, block_size)); + dma_set_max_seg_size(mmc_dev(host), queue_max_segment_size(mq->queue)); + INIT_WORK(&mq->recovery_work, mmc_mq_recovery_handler); INIT_WORK(&mq->complete_work, mmc_blk_mq_complete_work); -- cgit v1.2.3 From c8e8c77b3bdbade6e26e8e76595f141ede12b692 Mon Sep 17 00:00:00 2001 From: Jaesoo Lee Date: Mon, 3 Jun 2019 16:42:28 -0700 Subject: nvme: Fix u32 overflow in the number of namespace list calculation The Number of Namespaces (nn) field in the identify controller data structure is defined as u32 and the maximum allowed value in NVMe specification is 0xFFFFFFFEUL. This change fixes the possible overflow of the DIV_ROUND_UP() operation used in nvme_scan_ns_list() by casting the nn to u64. Signed-off-by: Jaesoo Lee Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1b7c2afd84cb..120fb593d1da 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3400,7 +3400,8 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn) { struct nvme_ns *ns; __le32 *ns_list; - unsigned i, j, nsid, prev = 0, num_lists = DIV_ROUND_UP(nn, 1024); + unsigned i, j, nsid, prev = 0; + unsigned num_lists = DIV_ROUND_UP_ULL((u64)nn, 1024); int ret = 0; ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL); -- cgit v1.2.3 From 62f99b62e5e3b88d23b6ced4380199e8386965af Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 6 Jun 2019 12:27:36 +0300 Subject: nvme-rdma: use dynamic dma mapping per command Commit 87fd125344d6 ("nvme-rdma: remove redundant reference between ib_device and tagset") caused a kernel panic when disconnecting from an inaccessible controller (disconnect during re-connection). -- nvme nvme0: Removing ctrl: NQN "testnqn1" nvme_rdma: nvme_rdma_exit_request: hctx 0 queue_idx 1 BUG: unable to handle kernel paging request at 0000000080000228 PGD 0 P4D 0 Oops: 0000 [#1] SMP PTI ... Call Trace: blk_mq_exit_hctx+0x5c/0xf0 blk_mq_exit_queue+0xd4/0x100 blk_cleanup_queue+0x9a/0xc0 nvme_rdma_destroy_io_queues+0x52/0x60 [nvme_rdma] nvme_rdma_shutdown_ctrl+0x3e/0x80 [nvme_rdma] nvme_do_delete_ctrl+0x53/0x80 [nvme_core] nvme_sysfs_delete+0x45/0x60 [nvme_core] kernfs_fop_write+0x105/0x180 vfs_write+0xad/0x1a0 ksys_write+0x5a/0xd0 do_syscall_64+0x55/0x110 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7fa215417154 -- The reason for this crash is accessing an already freed ib_device for performing dma_unmap during exit_request commands. The root cause for that is that during re-connection all the queues are destroyed and re-created (and the ib_device is reference counted by the queues and freed as well) but the tagset stays alive and all the DMA mappings (that we perform in init_request) kept in the request context. The original commit fixed a different bug that was introduced during bonding (aka nic teaming) tests that for some scenarios change the underlying ib_device and caused memory leakage and possible segmentation fault. This commit is a complementary commit that also changes the wrong DMA mappings that were saved in the request context and making the request sqe dma mappings dynamic with the command lifetime (i.e. mapped in .queue_rq and unmapped in .complete). It also fixes the above crash of accessing freed ib_device during destruction of the tagset. Fixes: 87fd125344d6 ("nvme-rdma: remove redundant reference between ib_device and tagset") Reported-by: Jim Harris Suggested-by: Sagi Grimberg Tested-by: Jim Harris Reviewed-by: Christoph Hellwig Signed-off-by: Max Gurtovoy Signed-off-by: Sagi Grimberg --- drivers/nvme/host/rdma.c | 53 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 17 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 26709a2ab593..97f668a39ae1 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -213,6 +213,11 @@ static struct nvme_rdma_qe *nvme_rdma_alloc_ring(struct ib_device *ibdev, if (!ring) return NULL; + /* + * Bind the CQEs (post recv buffers) DMA mapping to the RDMA queue + * lifetime. It's safe, since any chage in the underlying RDMA device + * will issue error recovery and queue re-creation. + */ for (i = 0; i < ib_queue_size; i++) { if (nvme_rdma_alloc_qe(ibdev, &ring[i], capsule_size, dir)) goto out_free_ring; @@ -274,14 +279,9 @@ static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor) static void nvme_rdma_exit_request(struct blk_mq_tag_set *set, struct request *rq, unsigned int hctx_idx) { - struct nvme_rdma_ctrl *ctrl = set->driver_data; struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); - int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; - struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx]; - struct nvme_rdma_device *dev = queue->device; - nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command), - DMA_TO_DEVICE); + kfree(req->sqe.data); } static int nvme_rdma_init_request(struct blk_mq_tag_set *set, @@ -292,15 +292,11 @@ static int nvme_rdma_init_request(struct blk_mq_tag_set *set, struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx]; - struct nvme_rdma_device *dev = queue->device; - struct ib_device *ibdev = dev->dev; - int ret; nvme_req(rq)->ctrl = &ctrl->ctrl; - ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command), - DMA_TO_DEVICE); - if (ret) - return ret; + req->sqe.data = kzalloc(sizeof(struct nvme_command), GFP_KERNEL); + if (!req->sqe.data) + return -ENOMEM; req->queue = queue; @@ -779,6 +775,11 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev); + /* + * Bind the async event SQE DMA mapping to the admin queue lifetime. + * It's safe, since any chage in the underlying RDMA device will issue + * error recovery and queue re-creation. + */ error = nvme_rdma_alloc_qe(ctrl->device->dev, &ctrl->async_event_sqe, sizeof(struct nvme_command), DMA_TO_DEVICE); if (error) @@ -1719,12 +1720,20 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq); dev = queue->device->dev; + + req->sqe.dma = ib_dma_map_single(dev, req->sqe.data, + sizeof(struct nvme_command), + DMA_TO_DEVICE); + err = ib_dma_mapping_error(dev, req->sqe.dma); + if (unlikely(err)) + return BLK_STS_RESOURCE; + ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(struct nvme_command), DMA_TO_DEVICE); ret = nvme_setup_cmd(ns, rq, c); if (ret) - return ret; + goto unmap_qe; blk_mq_start_request(rq); @@ -1749,10 +1758,16 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, } return BLK_STS_OK; + err: if (err == -ENOMEM || err == -EAGAIN) - return BLK_STS_RESOURCE; - return BLK_STS_IOERR; + ret = BLK_STS_RESOURCE; + else + ret = BLK_STS_IOERR; +unmap_qe: + ib_dma_unmap_single(dev, req->sqe.dma, sizeof(struct nvme_command), + DMA_TO_DEVICE); + return ret; } static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx) @@ -1765,8 +1780,12 @@ static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx) static void nvme_rdma_complete_rq(struct request *rq) { struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_queue *queue = req->queue; + struct ib_device *ibdev = queue->device->dev; - nvme_rdma_unmap_data(req->queue, rq); + nvme_rdma_unmap_data(queue, rq); + ib_dma_unmap_single(ibdev, req->sqe.dma, sizeof(struct nvme_command), + DMA_TO_DEVICE); nvme_complete_rq(rq); } -- cgit v1.2.3