diff options
Diffstat (limited to 'drivers/nvme/host/rdma.c')
-rw-r--r-- | drivers/nvme/host/rdma.c | 150 |
1 files changed, 77 insertions, 73 deletions
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 6d4119dfbdaa..a7f7d0ae3331 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -19,6 +19,7 @@ #include <linux/string.h> #include <linux/atomic.h> #include <linux/blk-mq.h> +#include <linux/blk-mq-rdma.h> #include <linux/types.h> #include <linux/list.h> #include <linux/mutex.h> @@ -86,7 +87,7 @@ enum nvme_rdma_queue_flags { struct nvme_rdma_queue { struct nvme_rdma_qe *rsp_ring; - u8 sig_count; + atomic_t sig_count; int queue_size; size_t cmnd_capsule_len; struct nvme_rdma_ctrl *ctrl; @@ -103,7 +104,6 @@ struct nvme_rdma_queue { struct nvme_rdma_ctrl { /* read only in the hot path */ struct nvme_rdma_queue *queues; - u32 queue_count; /* other member variables */ struct blk_mq_tag_set tag_set; @@ -119,7 +119,6 @@ struct nvme_rdma_ctrl { struct blk_mq_tag_set admin_tag_set; struct nvme_rdma_device *device; - u64 cap; u32 max_fr_pages; struct sockaddr_storage addr; @@ -274,9 +273,6 @@ static int nvme_rdma_reinit_request(void *data, struct request *rq) struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); int ret = 0; - if (!req->mr->need_inval) - goto out; - ib_dereg_mr(req->mr); req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG, @@ -349,7 +345,7 @@ static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, struct nvme_rdma_ctrl *ctrl = data; struct nvme_rdma_queue *queue = &ctrl->queues[hctx_idx + 1]; - BUG_ON(hctx_idx >= ctrl->queue_count); + BUG_ON(hctx_idx >= ctrl->ctrl.queue_count); hctx->driver_data = queue; return 0; @@ -468,14 +464,10 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) ibdev = queue->device->dev; /* - * The admin queue is barely used once the controller is live, so don't - * bother to spread it out. + * Spread I/O queues completion vectors according their queue index. + * Admin queues can always go on completion vector 0. */ - if (idx == 0) - comp_vector = 0; - else - comp_vector = idx % ibdev->num_comp_vectors; - + comp_vector = idx == 0 ? idx : idx - 1; /* +1 for ib_stop_cq */ queue->ib_cq = ib_alloc_cq(ibdev, queue, @@ -525,6 +517,7 @@ static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl, queue->cmnd_capsule_len = sizeof(struct nvme_command); queue->queue_size = queue_size; + atomic_set(&queue->sig_count, 0); queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue, RDMA_PS_TCP, IB_QPT_RC); @@ -587,7 +580,7 @@ static void nvme_rdma_free_io_queues(struct nvme_rdma_ctrl *ctrl) { int i; - for (i = 1; i < ctrl->queue_count; i++) + for (i = 1; i < ctrl->ctrl.queue_count; i++) nvme_rdma_stop_and_free_queue(&ctrl->queues[i]); } @@ -595,7 +588,7 @@ static int nvme_rdma_connect_io_queues(struct nvme_rdma_ctrl *ctrl) { int i, ret = 0; - for (i = 1; i < ctrl->queue_count; i++) { + for (i = 1; i < ctrl->ctrl.queue_count; i++) { ret = nvmf_connect_io_queue(&ctrl->ctrl, i); if (ret) { dev_info(ctrl->ctrl.device, @@ -615,22 +608,32 @@ out_free_queues: static int nvme_rdma_init_io_queues(struct nvme_rdma_ctrl *ctrl) { struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; + struct ib_device *ibdev = ctrl->device->dev; unsigned int nr_io_queues; int i, ret; nr_io_queues = min(opts->nr_io_queues, num_online_cpus()); + + /* + * we map queues according to the device irq vectors for + * optimal locality so we don't need more queues than + * completion vectors. + */ + nr_io_queues = min_t(unsigned int, nr_io_queues, + ibdev->num_comp_vectors); + ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues); if (ret) return ret; - ctrl->queue_count = nr_io_queues + 1; - if (ctrl->queue_count < 2) + ctrl->ctrl.queue_count = nr_io_queues + 1; + if (ctrl->ctrl.queue_count < 2) return 0; dev_info(ctrl->ctrl.device, "creating %d I/O queues.\n", nr_io_queues); - for (i = 1; i < ctrl->queue_count; i++) { + for (i = 1; i < ctrl->ctrl.queue_count; i++) { ret = nvme_rdma_init_queue(ctrl, i, ctrl->ctrl.opts->queue_size); if (ret) { @@ -705,7 +708,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) ++ctrl->ctrl.nr_reconnects; - if (ctrl->queue_count > 1) { + if (ctrl->ctrl.queue_count > 1) { nvme_rdma_free_io_queues(ctrl); ret = blk_mq_reinit_tagset(&ctrl->tag_set); @@ -729,13 +732,11 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags); - ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap); + ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); if (ret) goto requeue; - nvme_start_keep_alive(&ctrl->ctrl); - - if (ctrl->queue_count > 1) { + if (ctrl->ctrl.queue_count > 1) { ret = nvme_rdma_init_io_queues(ctrl); if (ret) goto requeue; @@ -743,16 +744,16 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) ret = nvme_rdma_connect_io_queues(ctrl); if (ret) goto requeue; + + blk_mq_update_nr_hw_queues(&ctrl->tag_set, + ctrl->ctrl.queue_count - 1); } changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); WARN_ON_ONCE(!changed); ctrl->ctrl.nr_reconnects = 0; - if (ctrl->queue_count > 1) { - nvme_queue_scan(&ctrl->ctrl); - nvme_queue_async_events(&ctrl->ctrl); - } + nvme_start_ctrl(&ctrl->ctrl); dev_info(ctrl->ctrl.device, "Successfully reconnected\n"); @@ -770,17 +771,17 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) struct nvme_rdma_ctrl, err_work); int i; - nvme_stop_keep_alive(&ctrl->ctrl); + nvme_stop_ctrl(&ctrl->ctrl); - for (i = 0; i < ctrl->queue_count; i++) + for (i = 0; i < ctrl->ctrl.queue_count; i++) clear_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[i].flags); - if (ctrl->queue_count > 1) + if (ctrl->ctrl.queue_count > 1) nvme_stop_queues(&ctrl->ctrl); - blk_mq_stop_hw_queues(ctrl->ctrl.admin_q); + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); /* We must take care of fastfail/requeue all our inflight requests */ - if (ctrl->queue_count > 1) + if (ctrl->ctrl.queue_count > 1) blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request, &ctrl->ctrl); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, @@ -790,7 +791,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) * queues are not a live anymore, so restart the queues to fail fast * new IO */ - blk_mq_start_stopped_hw_queues(ctrl->ctrl.admin_q, true); + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); nvme_start_queues(&ctrl->ctrl); nvme_rdma_reconnect_or_remove(ctrl); @@ -926,7 +927,11 @@ static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue, struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl; int nr; - nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, PAGE_SIZE); + /* + * Align the MR to a 4K page size to match the ctrl page size and + * the block virtual boundary. + */ + nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, SZ_4K); if (nr < count) { if (nr < 0) return nr; @@ -1008,17 +1013,16 @@ static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) nvme_rdma_wr_error(cq, wc, "SEND"); } -static inline int nvme_rdma_queue_sig_limit(struct nvme_rdma_queue *queue) +/* + * We want to signal completion at least every queue depth/2. This returns the + * largest power of two that is not above half of (queue size + 1) to optimize + * (avoid divisions). + */ +static inline bool nvme_rdma_queue_sig_limit(struct nvme_rdma_queue *queue) { - int sig_limit; + int limit = 1 << ilog2((queue->queue_size + 1) / 2); - /* - * We signal completion every queue depth/2 and also handle the - * degenerated case of a device with queue_depth=1, where we - * would need to signal every message. - */ - sig_limit = max(queue->queue_size / 2, 1); - return (++queue->sig_count % sig_limit) == 0; + return (atomic_inc_return(&queue->sig_count) & (limit - 1)) == 0; } static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, @@ -1505,6 +1509,13 @@ static void nvme_rdma_complete_rq(struct request *rq) nvme_complete_rq(rq); } +static int nvme_rdma_map_queues(struct blk_mq_tag_set *set) +{ + struct nvme_rdma_ctrl *ctrl = set->driver_data; + + return blk_mq_rdma_map_queues(set, ctrl->device->dev, 0); +} + static const struct blk_mq_ops nvme_rdma_mq_ops = { .queue_rq = nvme_rdma_queue_rq, .complete = nvme_rdma_complete_rq, @@ -1514,6 +1525,7 @@ static const struct blk_mq_ops nvme_rdma_mq_ops = { .init_hctx = nvme_rdma_init_hctx, .poll = nvme_rdma_poll, .timeout = nvme_rdma_timeout, + .map_queues = nvme_rdma_map_queues, }; static const struct blk_mq_ops nvme_rdma_admin_mq_ops = { @@ -1574,7 +1586,8 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl) set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags); - error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->cap); + error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, + &ctrl->ctrl.cap); if (error) { dev_err(ctrl->ctrl.device, "prop_get NVME_REG_CAP failed\n"); @@ -1582,14 +1595,14 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl) } ctrl->ctrl.sqsize = - min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->ctrl.sqsize); + min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize); - error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap); + error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); if (error) goto out_cleanup_queue; ctrl->ctrl.max_hw_sectors = - (ctrl->max_fr_pages - 1) << (PAGE_SHIFT - 9); + (ctrl->max_fr_pages - 1) << (ilog2(SZ_4K) - 9); error = nvme_init_identify(&ctrl->ctrl); if (error) @@ -1601,8 +1614,6 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl) if (error) goto out_cleanup_queue; - nvme_start_keep_alive(&ctrl->ctrl); - return 0; out_cleanup_queue: @@ -1620,11 +1631,10 @@ out_free_queue: static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl) { - nvme_stop_keep_alive(&ctrl->ctrl); cancel_work_sync(&ctrl->err_work); cancel_delayed_work_sync(&ctrl->reconnect_work); - if (ctrl->queue_count > 1) { + if (ctrl->ctrl.queue_count > 1) { nvme_stop_queues(&ctrl->ctrl); blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request, &ctrl->ctrl); @@ -1634,18 +1644,21 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl) if (test_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags)) nvme_shutdown_ctrl(&ctrl->ctrl); - blk_mq_stop_hw_queues(ctrl->ctrl.admin_q); + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request, &ctrl->ctrl); + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); nvme_rdma_destroy_admin_queue(ctrl); } static void __nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) { - nvme_uninit_ctrl(&ctrl->ctrl); + nvme_stop_ctrl(&ctrl->ctrl); + nvme_remove_namespaces(&ctrl->ctrl); if (shutdown) nvme_rdma_shutdown_ctrl(ctrl); + nvme_uninit_ctrl(&ctrl->ctrl); if (ctrl->ctrl.tagset) { blk_cleanup_queue(ctrl->ctrl.connect_q); blk_mq_free_tag_set(&ctrl->tag_set); @@ -1707,6 +1720,7 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) int ret; bool changed; + nvme_stop_ctrl(&ctrl->ctrl); nvme_rdma_shutdown_ctrl(ctrl); ret = nvme_rdma_configure_admin_queue(ctrl); @@ -1716,7 +1730,7 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) goto del_dead_ctrl; } - if (ctrl->queue_count > 1) { + if (ctrl->ctrl.queue_count > 1) { ret = blk_mq_reinit_tagset(&ctrl->tag_set); if (ret) goto del_dead_ctrl; @@ -1728,16 +1742,15 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) ret = nvme_rdma_connect_io_queues(ctrl); if (ret) goto del_dead_ctrl; + + blk_mq_update_nr_hw_queues(&ctrl->tag_set, + ctrl->ctrl.queue_count - 1); } changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); WARN_ON_ONCE(!changed); - if (ctrl->queue_count > 1) { - nvme_start_queues(&ctrl->ctrl); - nvme_queue_scan(&ctrl->ctrl); - nvme_queue_async_events(&ctrl->ctrl); - } + nvme_start_ctrl(&ctrl->ctrl); return; @@ -1785,7 +1798,7 @@ static int nvme_rdma_create_io_queues(struct nvme_rdma_ctrl *ctrl) ctrl->tag_set.cmd_size = sizeof(struct nvme_rdma_request) + SG_CHUNK_SIZE * sizeof(struct scatterlist); ctrl->tag_set.driver_data = ctrl; - ctrl->tag_set.nr_hw_queues = ctrl->queue_count - 1; + ctrl->tag_set.nr_hw_queues = ctrl->ctrl.queue_count - 1; ctrl->tag_set.timeout = NVME_IO_TIMEOUT; ret = blk_mq_alloc_tag_set(&ctrl->tag_set); @@ -1863,12 +1876,12 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, INIT_WORK(&ctrl->delete_work, nvme_rdma_del_ctrl_work); INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work); - ctrl->queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */ + ctrl->ctrl.queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */ ctrl->ctrl.sqsize = opts->queue_size - 1; ctrl->ctrl.kato = opts->kato; ret = -ENOMEM; - ctrl->queues = kcalloc(ctrl->queue_count, sizeof(*ctrl->queues), + ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues), GFP_KERNEL); if (!ctrl->queues) goto out_uninit_ctrl; @@ -1925,15 +1938,11 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list); mutex_unlock(&nvme_rdma_ctrl_mutex); - if (opts->nr_io_queues) { - nvme_queue_scan(&ctrl->ctrl); - nvme_queue_async_events(&ctrl->ctrl); - } + nvme_start_ctrl(&ctrl->ctrl); return &ctrl->ctrl; out_remove_admin_queue: - nvme_stop_keep_alive(&ctrl->ctrl); nvme_rdma_destroy_admin_queue(ctrl); out_kfree_queues: kfree(ctrl->queues); @@ -1956,10 +1965,6 @@ static struct nvmf_transport_ops nvme_rdma_transport = { .create_ctrl = nvme_rdma_create_ctrl, }; -static void nvme_rdma_add_one(struct ib_device *ib_device) -{ -} - static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data) { struct nvme_rdma_ctrl *ctrl; @@ -1981,7 +1986,6 @@ static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data) static struct ib_client nvme_rdma_ib_client = { .name = "nvme_rdma", - .add = nvme_rdma_add_one, .remove = nvme_rdma_remove_one }; |