summaryrefslogtreecommitdiff
path: root/drivers/nvme/host/rdma.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/nvme/host/rdma.c')
-rw-r--r--drivers/nvme/host/rdma.c150
1 files changed, 77 insertions, 73 deletions
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 6d4119dfbdaa..a7f7d0ae3331 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -19,6 +19,7 @@
#include <linux/string.h>
#include <linux/atomic.h>
#include <linux/blk-mq.h>
+#include <linux/blk-mq-rdma.h>
#include <linux/types.h>
#include <linux/list.h>
#include <linux/mutex.h>
@@ -86,7 +87,7 @@ enum nvme_rdma_queue_flags {
struct nvme_rdma_queue {
struct nvme_rdma_qe *rsp_ring;
- u8 sig_count;
+ atomic_t sig_count;
int queue_size;
size_t cmnd_capsule_len;
struct nvme_rdma_ctrl *ctrl;
@@ -103,7 +104,6 @@ struct nvme_rdma_queue {
struct nvme_rdma_ctrl {
/* read only in the hot path */
struct nvme_rdma_queue *queues;
- u32 queue_count;
/* other member variables */
struct blk_mq_tag_set tag_set;
@@ -119,7 +119,6 @@ struct nvme_rdma_ctrl {
struct blk_mq_tag_set admin_tag_set;
struct nvme_rdma_device *device;
- u64 cap;
u32 max_fr_pages;
struct sockaddr_storage addr;
@@ -274,9 +273,6 @@ static int nvme_rdma_reinit_request(void *data, struct request *rq)
struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
int ret = 0;
- if (!req->mr->need_inval)
- goto out;
-
ib_dereg_mr(req->mr);
req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG,
@@ -349,7 +345,7 @@ static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
struct nvme_rdma_ctrl *ctrl = data;
struct nvme_rdma_queue *queue = &ctrl->queues[hctx_idx + 1];
- BUG_ON(hctx_idx >= ctrl->queue_count);
+ BUG_ON(hctx_idx >= ctrl->ctrl.queue_count);
hctx->driver_data = queue;
return 0;
@@ -468,14 +464,10 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
ibdev = queue->device->dev;
/*
- * The admin queue is barely used once the controller is live, so don't
- * bother to spread it out.
+ * Spread I/O queues completion vectors according their queue index.
+ * Admin queues can always go on completion vector 0.
*/
- if (idx == 0)
- comp_vector = 0;
- else
- comp_vector = idx % ibdev->num_comp_vectors;
-
+ comp_vector = idx == 0 ? idx : idx - 1;
/* +1 for ib_stop_cq */
queue->ib_cq = ib_alloc_cq(ibdev, queue,
@@ -525,6 +517,7 @@ static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl,
queue->cmnd_capsule_len = sizeof(struct nvme_command);
queue->queue_size = queue_size;
+ atomic_set(&queue->sig_count, 0);
queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue,
RDMA_PS_TCP, IB_QPT_RC);
@@ -587,7 +580,7 @@ static void nvme_rdma_free_io_queues(struct nvme_rdma_ctrl *ctrl)
{
int i;
- for (i = 1; i < ctrl->queue_count; i++)
+ for (i = 1; i < ctrl->ctrl.queue_count; i++)
nvme_rdma_stop_and_free_queue(&ctrl->queues[i]);
}
@@ -595,7 +588,7 @@ static int nvme_rdma_connect_io_queues(struct nvme_rdma_ctrl *ctrl)
{
int i, ret = 0;
- for (i = 1; i < ctrl->queue_count; i++) {
+ for (i = 1; i < ctrl->ctrl.queue_count; i++) {
ret = nvmf_connect_io_queue(&ctrl->ctrl, i);
if (ret) {
dev_info(ctrl->ctrl.device,
@@ -615,22 +608,32 @@ out_free_queues:
static int nvme_rdma_init_io_queues(struct nvme_rdma_ctrl *ctrl)
{
struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
+ struct ib_device *ibdev = ctrl->device->dev;
unsigned int nr_io_queues;
int i, ret;
nr_io_queues = min(opts->nr_io_queues, num_online_cpus());
+
+ /*
+ * we map queues according to the device irq vectors for
+ * optimal locality so we don't need more queues than
+ * completion vectors.
+ */
+ nr_io_queues = min_t(unsigned int, nr_io_queues,
+ ibdev->num_comp_vectors);
+
ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues);
if (ret)
return ret;
- ctrl->queue_count = nr_io_queues + 1;
- if (ctrl->queue_count < 2)
+ ctrl->ctrl.queue_count = nr_io_queues + 1;
+ if (ctrl->ctrl.queue_count < 2)
return 0;
dev_info(ctrl->ctrl.device,
"creating %d I/O queues.\n", nr_io_queues);
- for (i = 1; i < ctrl->queue_count; i++) {
+ for (i = 1; i < ctrl->ctrl.queue_count; i++) {
ret = nvme_rdma_init_queue(ctrl, i,
ctrl->ctrl.opts->queue_size);
if (ret) {
@@ -705,7 +708,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
++ctrl->ctrl.nr_reconnects;
- if (ctrl->queue_count > 1) {
+ if (ctrl->ctrl.queue_count > 1) {
nvme_rdma_free_io_queues(ctrl);
ret = blk_mq_reinit_tagset(&ctrl->tag_set);
@@ -729,13 +732,11 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags);
- ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap);
+ ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
if (ret)
goto requeue;
- nvme_start_keep_alive(&ctrl->ctrl);
-
- if (ctrl->queue_count > 1) {
+ if (ctrl->ctrl.queue_count > 1) {
ret = nvme_rdma_init_io_queues(ctrl);
if (ret)
goto requeue;
@@ -743,16 +744,16 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
ret = nvme_rdma_connect_io_queues(ctrl);
if (ret)
goto requeue;
+
+ blk_mq_update_nr_hw_queues(&ctrl->tag_set,
+ ctrl->ctrl.queue_count - 1);
}
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
WARN_ON_ONCE(!changed);
ctrl->ctrl.nr_reconnects = 0;
- if (ctrl->queue_count > 1) {
- nvme_queue_scan(&ctrl->ctrl);
- nvme_queue_async_events(&ctrl->ctrl);
- }
+ nvme_start_ctrl(&ctrl->ctrl);
dev_info(ctrl->ctrl.device, "Successfully reconnected\n");
@@ -770,17 +771,17 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
struct nvme_rdma_ctrl, err_work);
int i;
- nvme_stop_keep_alive(&ctrl->ctrl);
+ nvme_stop_ctrl(&ctrl->ctrl);
- for (i = 0; i < ctrl->queue_count; i++)
+ for (i = 0; i < ctrl->ctrl.queue_count; i++)
clear_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[i].flags);
- if (ctrl->queue_count > 1)
+ if (ctrl->ctrl.queue_count > 1)
nvme_stop_queues(&ctrl->ctrl);
- blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
+ blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
/* We must take care of fastfail/requeue all our inflight requests */
- if (ctrl->queue_count > 1)
+ if (ctrl->ctrl.queue_count > 1)
blk_mq_tagset_busy_iter(&ctrl->tag_set,
nvme_cancel_request, &ctrl->ctrl);
blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
@@ -790,7 +791,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
* queues are not a live anymore, so restart the queues to fail fast
* new IO
*/
- blk_mq_start_stopped_hw_queues(ctrl->ctrl.admin_q, true);
+ blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
nvme_start_queues(&ctrl->ctrl);
nvme_rdma_reconnect_or_remove(ctrl);
@@ -926,7 +927,11 @@ static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue,
struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
int nr;
- nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, PAGE_SIZE);
+ /*
+ * Align the MR to a 4K page size to match the ctrl page size and
+ * the block virtual boundary.
+ */
+ nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, SZ_4K);
if (nr < count) {
if (nr < 0)
return nr;
@@ -1008,17 +1013,16 @@ static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
nvme_rdma_wr_error(cq, wc, "SEND");
}
-static inline int nvme_rdma_queue_sig_limit(struct nvme_rdma_queue *queue)
+/*
+ * We want to signal completion at least every queue depth/2. This returns the
+ * largest power of two that is not above half of (queue size + 1) to optimize
+ * (avoid divisions).
+ */
+static inline bool nvme_rdma_queue_sig_limit(struct nvme_rdma_queue *queue)
{
- int sig_limit;
+ int limit = 1 << ilog2((queue->queue_size + 1) / 2);
- /*
- * We signal completion every queue depth/2 and also handle the
- * degenerated case of a device with queue_depth=1, where we
- * would need to signal every message.
- */
- sig_limit = max(queue->queue_size / 2, 1);
- return (++queue->sig_count % sig_limit) == 0;
+ return (atomic_inc_return(&queue->sig_count) & (limit - 1)) == 0;
}
static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
@@ -1505,6 +1509,13 @@ static void nvme_rdma_complete_rq(struct request *rq)
nvme_complete_rq(rq);
}
+static int nvme_rdma_map_queues(struct blk_mq_tag_set *set)
+{
+ struct nvme_rdma_ctrl *ctrl = set->driver_data;
+
+ return blk_mq_rdma_map_queues(set, ctrl->device->dev, 0);
+}
+
static const struct blk_mq_ops nvme_rdma_mq_ops = {
.queue_rq = nvme_rdma_queue_rq,
.complete = nvme_rdma_complete_rq,
@@ -1514,6 +1525,7 @@ static const struct blk_mq_ops nvme_rdma_mq_ops = {
.init_hctx = nvme_rdma_init_hctx,
.poll = nvme_rdma_poll,
.timeout = nvme_rdma_timeout,
+ .map_queues = nvme_rdma_map_queues,
};
static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
@@ -1574,7 +1586,8 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl)
set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags);
- error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->cap);
+ error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP,
+ &ctrl->ctrl.cap);
if (error) {
dev_err(ctrl->ctrl.device,
"prop_get NVME_REG_CAP failed\n");
@@ -1582,14 +1595,14 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl)
}
ctrl->ctrl.sqsize =
- min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->ctrl.sqsize);
+ min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize);
- error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap);
+ error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
if (error)
goto out_cleanup_queue;
ctrl->ctrl.max_hw_sectors =
- (ctrl->max_fr_pages - 1) << (PAGE_SHIFT - 9);
+ (ctrl->max_fr_pages - 1) << (ilog2(SZ_4K) - 9);
error = nvme_init_identify(&ctrl->ctrl);
if (error)
@@ -1601,8 +1614,6 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl)
if (error)
goto out_cleanup_queue;
- nvme_start_keep_alive(&ctrl->ctrl);
-
return 0;
out_cleanup_queue:
@@ -1620,11 +1631,10 @@ out_free_queue:
static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl)
{
- nvme_stop_keep_alive(&ctrl->ctrl);
cancel_work_sync(&ctrl->err_work);
cancel_delayed_work_sync(&ctrl->reconnect_work);
- if (ctrl->queue_count > 1) {
+ if (ctrl->ctrl.queue_count > 1) {
nvme_stop_queues(&ctrl->ctrl);
blk_mq_tagset_busy_iter(&ctrl->tag_set,
nvme_cancel_request, &ctrl->ctrl);
@@ -1634,18 +1644,21 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl)
if (test_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags))
nvme_shutdown_ctrl(&ctrl->ctrl);
- blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
+ blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
nvme_cancel_request, &ctrl->ctrl);
+ blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
nvme_rdma_destroy_admin_queue(ctrl);
}
static void __nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
{
- nvme_uninit_ctrl(&ctrl->ctrl);
+ nvme_stop_ctrl(&ctrl->ctrl);
+ nvme_remove_namespaces(&ctrl->ctrl);
if (shutdown)
nvme_rdma_shutdown_ctrl(ctrl);
+ nvme_uninit_ctrl(&ctrl->ctrl);
if (ctrl->ctrl.tagset) {
blk_cleanup_queue(ctrl->ctrl.connect_q);
blk_mq_free_tag_set(&ctrl->tag_set);
@@ -1707,6 +1720,7 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
int ret;
bool changed;
+ nvme_stop_ctrl(&ctrl->ctrl);
nvme_rdma_shutdown_ctrl(ctrl);
ret = nvme_rdma_configure_admin_queue(ctrl);
@@ -1716,7 +1730,7 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
goto del_dead_ctrl;
}
- if (ctrl->queue_count > 1) {
+ if (ctrl->ctrl.queue_count > 1) {
ret = blk_mq_reinit_tagset(&ctrl->tag_set);
if (ret)
goto del_dead_ctrl;
@@ -1728,16 +1742,15 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
ret = nvme_rdma_connect_io_queues(ctrl);
if (ret)
goto del_dead_ctrl;
+
+ blk_mq_update_nr_hw_queues(&ctrl->tag_set,
+ ctrl->ctrl.queue_count - 1);
}
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
WARN_ON_ONCE(!changed);
- if (ctrl->queue_count > 1) {
- nvme_start_queues(&ctrl->ctrl);
- nvme_queue_scan(&ctrl->ctrl);
- nvme_queue_async_events(&ctrl->ctrl);
- }
+ nvme_start_ctrl(&ctrl->ctrl);
return;
@@ -1785,7 +1798,7 @@ static int nvme_rdma_create_io_queues(struct nvme_rdma_ctrl *ctrl)
ctrl->tag_set.cmd_size = sizeof(struct nvme_rdma_request) +
SG_CHUNK_SIZE * sizeof(struct scatterlist);
ctrl->tag_set.driver_data = ctrl;
- ctrl->tag_set.nr_hw_queues = ctrl->queue_count - 1;
+ ctrl->tag_set.nr_hw_queues = ctrl->ctrl.queue_count - 1;
ctrl->tag_set.timeout = NVME_IO_TIMEOUT;
ret = blk_mq_alloc_tag_set(&ctrl->tag_set);
@@ -1863,12 +1876,12 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
INIT_WORK(&ctrl->delete_work, nvme_rdma_del_ctrl_work);
INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work);
- ctrl->queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */
+ ctrl->ctrl.queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */
ctrl->ctrl.sqsize = opts->queue_size - 1;
ctrl->ctrl.kato = opts->kato;
ret = -ENOMEM;
- ctrl->queues = kcalloc(ctrl->queue_count, sizeof(*ctrl->queues),
+ ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
GFP_KERNEL);
if (!ctrl->queues)
goto out_uninit_ctrl;
@@ -1925,15 +1938,11 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list);
mutex_unlock(&nvme_rdma_ctrl_mutex);
- if (opts->nr_io_queues) {
- nvme_queue_scan(&ctrl->ctrl);
- nvme_queue_async_events(&ctrl->ctrl);
- }
+ nvme_start_ctrl(&ctrl->ctrl);
return &ctrl->ctrl;
out_remove_admin_queue:
- nvme_stop_keep_alive(&ctrl->ctrl);
nvme_rdma_destroy_admin_queue(ctrl);
out_kfree_queues:
kfree(ctrl->queues);
@@ -1956,10 +1965,6 @@ static struct nvmf_transport_ops nvme_rdma_transport = {
.create_ctrl = nvme_rdma_create_ctrl,
};
-static void nvme_rdma_add_one(struct ib_device *ib_device)
-{
-}
-
static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data)
{
struct nvme_rdma_ctrl *ctrl;
@@ -1981,7 +1986,6 @@ static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data)
static struct ib_client nvme_rdma_ib_client = {
.name = "nvme_rdma",
- .add = nvme_rdma_add_one,
.remove = nvme_rdma_remove_one
};