summaryrefslogtreecommitdiff
path: root/drivers/nvme/host
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-09-17 16:57:47 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2019-09-17 16:57:47 -0700
commit7ad67ca5534ee7c958559c4ad610f05c4578e361 (patch)
treedc6b6a8a6b70b5f25b07bcdc06d8e77e705f6822 /drivers/nvme/host
parent5260c2b863ef1152445ce93476c95d8c8a727eef (diff)
parent9c7eddf1b080f98fed1aadb74fe784f29bf77a08 (diff)
Merge tag 'for-5.4/block-2019-09-16' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe: - Two NVMe pull requests: - ana log parse fix from Anton - nvme quirks support for Apple devices from Ben - fix missing bio completion tracing for multipath stack devices from Hannes and Mikhail - IP TOS settings for nvme rdma and tcp transports from Israel - rq_dma_dir cleanups from Israel - tracing for Get LBA Status command from Minwoo - Some nvme-tcp cleanups from Minwoo, Potnuri and Myself - Some consolidation between the fabrics transports for handling the CAP register - reset race with ns scanning fix for fabrics (move fabrics commands to a dedicated request queue with a different lifetime from the admin request queue)." - controller reset and namespace scan races fixes - nvme discovery log change uevent support - naming improvements from Keith - multiple discovery controllers reject fix from James - some regular cleanups from various people - Series fixing (and re-fixing) null_blk debug printing and nr_devices checks (André) - A few pull requests from Song, with fixes from Andy, Guoqing, Guilherme, Neil, Nigel, and Yufen. - REQ_OP_ZONE_RESET_ALL support (Chaitanya) - Bio merge handling unification (Christoph) - Pick default elevator correctly for devices with special needs (Damien) - Block stats fixes (Hou) - Timeout and support devices nbd fixes (Mike) - Series fixing races around elevator switching and device add/remove (Ming) - sed-opal cleanups (Revanth) - Per device weight support for BFQ (Fam) - Support for blk-iocost, a new model that can properly account cost of IO workloads. (Tejun) - blk-cgroup writeback fixes (Tejun) - paride queue init fixes (zhengbin) - blk_set_runtime_active() cleanup (Stanley) - Block segment mapping optimizations (Bart) - lightnvm fixes (Hans/Minwoo/YueHaibing) - Various little fixes and cleanups * tag 'for-5.4/block-2019-09-16' of git://git.kernel.dk/linux-block: (186 commits) null_blk: format pr_* logs with pr_fmt null_blk: match the type of parameter nr_devices null_blk: do not fail the module load with zero devices block: also check RQF_STATS in blk_mq_need_time_stamp() block: make rq sector size accessible for block stats bfq: Fix bfq linkage error raid5: use bio_end_sector in r5_next_bio raid5: remove STRIPE_OPS_REQ_PENDING md: add feature flag MD_FEATURE_RAID0_LAYOUT md/raid0: avoid RAID0 data corruption due to layout confusion. raid5: don't set STRIPE_HANDLE to stripe which is in batch list raid5: don't increment read_errors on EILSEQ return nvmet: fix a wrong error status returned in error log page nvme: send discovery log page change events to userspace nvme: add uevent variables for controller devices nvme: enable aen regardless of the presence of I/O queues nvme-fabrics: allow discovery subsystems accept a kato nvmet: Use PTR_ERR_OR_ZERO() in nvmet_init_discovery() nvme: Remove redundant assignment of cq vector nvme: Assign subsys instance from first ctrl ...
Diffstat (limited to 'drivers/nvme/host')
-rw-r--r--drivers/nvme/host/Kconfig1
-rw-r--r--drivers/nvme/host/core.c201
-rw-r--r--drivers/nvme/host/fabrics.c38
-rw-r--r--drivers/nvme/host/fabrics.h3
-rw-r--r--drivers/nvme/host/fc.c73
-rw-r--r--drivers/nvme/host/lightnvm.c45
-rw-r--r--drivers/nvme/host/multipath.c8
-rw-r--r--drivers/nvme/host/nvme.h36
-rw-r--r--drivers/nvme/host/pci.c102
-rw-r--r--drivers/nvme/host/rdma.c61
-rw-r--r--drivers/nvme/host/tcp.c144
-rw-r--r--drivers/nvme/host/trace.c18
12 files changed, 500 insertions, 230 deletions
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index ec43ac9199e2..2b36f052bfb9 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -64,6 +64,7 @@ config NVME_TCP
depends on INET
depends on BLK_DEV_NVME
select NVME_FABRICS
+ select CRYPTO_CRC32C
help
This provides support for the NVMe over Fabrics protocol using
the TCP transport. This allows you to use remote block devices
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index d3d6b7bd6903..1ede1763a5ee 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -22,12 +22,12 @@
#include <linux/pm_qos.h>
#include <asm/unaligned.h>
-#define CREATE_TRACE_POINTS
-#include "trace.h"
-
#include "nvme.h"
#include "fabrics.h"
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
#define NVME_MINORS (1U << MINORBITS)
unsigned int admin_timeout = 60;
@@ -81,7 +81,6 @@ EXPORT_SYMBOL_GPL(nvme_reset_wq);
struct workqueue_struct *nvme_delete_wq;
EXPORT_SYMBOL_GPL(nvme_delete_wq);
-static DEFINE_IDA(nvme_subsystems_ida);
static LIST_HEAD(nvme_subsystems);
static DEFINE_MUTEX(nvme_subsystems_lock);
@@ -197,9 +196,9 @@ static inline bool nvme_ns_has_pi(struct nvme_ns *ns)
return ns->pi_type && ns->ms == sizeof(struct t10_pi_tuple);
}
-static blk_status_t nvme_error_status(struct request *req)
+static blk_status_t nvme_error_status(u16 status)
{
- switch (nvme_req(req)->status & 0x7ff) {
+ switch (status & 0x7ff) {
case NVME_SC_SUCCESS:
return BLK_STS_OK;
case NVME_SC_CAP_EXCEEDED:
@@ -226,6 +225,8 @@ static blk_status_t nvme_error_status(struct request *req)
return BLK_STS_PROTECTION;
case NVME_SC_RESERVATION_CONFLICT:
return BLK_STS_NEXUS;
+ case NVME_SC_HOST_PATH_ERROR:
+ return BLK_STS_TRANSPORT;
default:
return BLK_STS_IOERR;
}
@@ -260,7 +261,7 @@ static void nvme_retry_req(struct request *req)
void nvme_complete_rq(struct request *req)
{
- blk_status_t status = nvme_error_status(req);
+ blk_status_t status = nvme_error_status(nvme_req(req)->status);
trace_nvme_complete_rq(req);
@@ -279,6 +280,8 @@ void nvme_complete_rq(struct request *req)
return;
}
}
+
+ nvme_trace_bio_complete(req, status);
blk_mq_end_request(req, status);
}
EXPORT_SYMBOL_GPL(nvme_complete_rq);
@@ -288,8 +291,12 @@ bool nvme_cancel_request(struct request *req, void *data, bool reserved)
dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
"Cancelling I/O %d", req->tag);
- nvme_req(req)->status = NVME_SC_ABORT_REQ;
- blk_mq_complete_request_sync(req);
+ /* don't abort one completed request */
+ if (blk_mq_request_completed(req))
+ return true;
+
+ nvme_req(req)->status = NVME_SC_HOST_PATH_ERROR;
+ blk_mq_complete_request(req);
return true;
}
EXPORT_SYMBOL_GPL(nvme_cancel_request);
@@ -1088,10 +1095,9 @@ static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *n
NVME_IDENTIFY_DATA_SIZE);
}
-static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl,
- unsigned nsid)
+static int nvme_identify_ns(struct nvme_ctrl *ctrl,
+ unsigned nsid, struct nvme_id_ns **id)
{
- struct nvme_id_ns *id;
struct nvme_command c = { };
int error;
@@ -1100,18 +1106,17 @@ static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl,
c.identify.nsid = cpu_to_le32(nsid);
c.identify.cns = NVME_ID_CNS_NS;
- id = kmalloc(sizeof(*id), GFP_KERNEL);
- if (!id)
- return NULL;
+ *id = kmalloc(sizeof(**id), GFP_KERNEL);
+ if (!*id)
+ return -ENOMEM;
- error = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
+ error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id));
if (error) {
dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
- kfree(id);
- return NULL;
+ kfree(*id);
}
- return id;
+ return error;
}
static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
@@ -1180,7 +1185,8 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
EXPORT_SYMBOL_GPL(nvme_set_queue_count);
#define NVME_AEN_SUPPORTED \
- (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | NVME_AEN_CFG_ANA_CHANGE)
+ (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | \
+ NVME_AEN_CFG_ANA_CHANGE | NVME_AEN_CFG_DISC_CHANGE)
static void nvme_enable_aen(struct nvme_ctrl *ctrl)
{
@@ -1195,6 +1201,8 @@ static void nvme_enable_aen(struct nvme_ctrl *ctrl)
if (status)
dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n",
supported_aens);
+
+ queue_work(nvme_wq, &ctrl->async_event_work);
}
static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
@@ -1594,9 +1602,11 @@ static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns)
blk_queue_max_write_zeroes_sectors(disk->queue, max_sectors);
}
-static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
+static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
struct nvme_id_ns *id, struct nvme_ns_ids *ids)
{
+ int ret = 0;
+
memset(ids, 0, sizeof(*ids));
if (ctrl->vs >= NVME_VS(1, 1, 0))
@@ -1607,10 +1617,12 @@ static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
/* Don't treat error as fatal we potentially
* already have a NGUID or EUI-64
*/
- if (nvme_identify_ns_descs(ctrl, nsid, ids))
+ ret = nvme_identify_ns_descs(ctrl, nsid, ids);
+ if (ret)
dev_warn(ctrl->device,
- "%s: Identify Descriptors failed\n", __func__);
+ "Identify Descriptors failed (%d)\n", ret);
}
+ return ret;
}
static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids)
@@ -1738,25 +1750,37 @@ static int nvme_revalidate_disk(struct gendisk *disk)
return -ENODEV;
}
- id = nvme_identify_ns(ctrl, ns->head->ns_id);
- if (!id)
- return -ENODEV;
+ ret = nvme_identify_ns(ctrl, ns->head->ns_id, &id);
+ if (ret)
+ goto out;
if (id->ncap == 0) {
ret = -ENODEV;
- goto out;
+ goto free_id;
}
__nvme_revalidate_disk(disk, id);
- nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids);
+ ret = nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids);
+ if (ret)
+ goto free_id;
+
if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) {
dev_err(ctrl->device,
"identifiers changed for nsid %d\n", ns->head->ns_id);
ret = -ENODEV;
}
-out:
+free_id:
kfree(id);
+out:
+ /*
+ * Only fail the function if we got a fatal error back from the
+ * device, otherwise ignore the error and just move on.
+ */
+ if (ret == -ENOMEM || (ret > 0 && !(ret & NVME_SC_DNR)))
+ ret = 0;
+ else if (ret > 0)
+ ret = blk_status_to_errno(nvme_error_status(ret));
return ret;
}
@@ -1952,7 +1976,7 @@ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
* bits', but doing so may cause the device to complete commands to the
* admin queue ... and we don't know what memory that might be pointing at!
*/
-int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
+int nvme_disable_ctrl(struct nvme_ctrl *ctrl)
{
int ret;
@@ -1966,20 +1990,27 @@ int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
msleep(NVME_QUIRK_DELAY_AMOUNT);
- return nvme_wait_ready(ctrl, cap, false);
+ return nvme_wait_ready(ctrl, ctrl->cap, false);
}
EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
-int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
+int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
{
/*
* Default to a 4K page size, with the intention to update this
* path in the future to accomodate architectures with differing
* kernel and IO page sizes.
*/
- unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12;
+ unsigned dev_page_min, page_shift = 12;
int ret;
+ ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
+ if (ret) {
+ dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
+ return ret;
+ }
+ dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12;
+
if (page_shift < dev_page_min) {
dev_err(ctrl->device,
"Minimum device page size %u too large for host (%u)\n",
@@ -1998,7 +2029,7 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
if (ret)
return ret;
- return nvme_wait_ready(ctrl, cap, true);
+ return nvme_wait_ready(ctrl, ctrl->cap, true);
}
EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
@@ -2332,7 +2363,8 @@ static void nvme_release_subsystem(struct device *dev)
struct nvme_subsystem *subsys =
container_of(dev, struct nvme_subsystem, dev);
- ida_simple_remove(&nvme_subsystems_ida, subsys->instance);
+ if (subsys->instance >= 0)
+ ida_simple_remove(&nvme_instance_ida, subsys->instance);
kfree(subsys);
}
@@ -2361,6 +2393,17 @@ static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn)
lockdep_assert_held(&nvme_subsystems_lock);
+ /*
+ * Fail matches for discovery subsystems. This results
+ * in each discovery controller bound to a unique subsystem.
+ * This avoids issues with validating controller values
+ * that can only be true when there is a single unique subsystem.
+ * There may be multiple and completely independent entities
+ * that provide discovery controllers.
+ */
+ if (!strcmp(subsysnqn, NVME_DISC_SUBSYS_NAME))
+ return NULL;
+
list_for_each_entry(subsys, &nvme_subsystems, entry) {
if (strcmp(subsys->subnqn, subsysnqn))
continue;
@@ -2461,12 +2504,8 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
if (!subsys)
return -ENOMEM;
- ret = ida_simple_get(&nvme_subsystems_ida, 0, 0, GFP_KERNEL);
- if (ret < 0) {
- kfree(subsys);
- return ret;
- }
- subsys->instance = ret;
+
+ subsys->instance = -1;
mutex_init(&subsys->lock);
kref_init(&subsys->ref);
INIT_LIST_HEAD(&subsys->ctrls);
@@ -2485,7 +2524,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
subsys->dev.class = nvme_subsys_class;
subsys->dev.release = nvme_release_subsystem;
subsys->dev.groups = nvme_subsys_attrs_groups;
- dev_set_name(&subsys->dev, "nvme-subsys%d", subsys->instance);
+ dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance);
device_initialize(&subsys->dev);
mutex_lock(&nvme_subsystems_lock);
@@ -2517,6 +2556,8 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
goto out_put_subsystem;
}
+ if (!found)
+ subsys->instance = ctrl->instance;
ctrl->subsys = subsys;
list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
mutex_unlock(&nvme_subsystems_lock);
@@ -2574,7 +2615,6 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl)
int nvme_init_identify(struct nvme_ctrl *ctrl)
{
struct nvme_id_ctrl *id;
- u64 cap;
int ret, page_shift;
u32 max_hw_sectors;
bool prev_apst_enabled;
@@ -2584,16 +2624,11 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
return ret;
}
-
- ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap);
- if (ret) {
- dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
- return ret;
- }
- page_shift = NVME_CAP_MPSMIN(cap) + 12;
+ page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12;
+ ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
if (ctrl->vs >= NVME_VS(1, 1, 0))
- ctrl->subsystem = NVME_CAP_NSSRC(cap);
+ ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap);
ret = nvme_identify_ctrl(ctrl, &id);
if (ret) {
@@ -3184,7 +3219,9 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
head->ns_id = nsid;
kref_init(&head->ref);
- nvme_report_ns_ids(ctrl, nsid, id, &head->ids);
+ ret = nvme_report_ns_ids(ctrl, nsid, id, &head->ids);
+ if (ret)
+ goto out_cleanup_srcu;
ret = __nvme_check_ids(ctrl->subsys, head);
if (ret) {
@@ -3209,6 +3246,8 @@ out_ida_remove:
out_free_head:
kfree(head);
out:
+ if (ret > 0)
+ ret = blk_status_to_errno(nvme_error_status(ret));
return ERR_PTR(ret);
}
@@ -3232,7 +3271,10 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
} else {
struct nvme_ns_ids ids;
- nvme_report_ns_ids(ctrl, nsid, id, &ids);
+ ret = nvme_report_ns_ids(ctrl, nsid, id, &ids);
+ if (ret)
+ goto out_unlock;
+
if (!nvme_ns_ids_equal(&head->ids, &ids)) {
dev_err(ctrl->device,
"IDs don't match for shared namespace %d\n",
@@ -3247,6 +3289,8 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
out_unlock:
mutex_unlock(&ctrl->subsys->lock);
+ if (ret > 0)
+ ret = blk_status_to_errno(nvme_error_status(ret));
return ret;
}
@@ -3338,11 +3382,9 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
nvme_set_queue_limits(ctrl, ns->queue);
- id = nvme_identify_ns(ctrl, nsid);
- if (!id) {
- ret = -EIO;
+ ret = nvme_identify_ns(ctrl, nsid, &id);
+ if (ret)
goto out_free_queue;
- }
if (id->ncap == 0) {
ret = -EINVAL;
@@ -3404,6 +3446,8 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
blk_cleanup_queue(ns->queue);
out_free_ns:
kfree(ns);
+ if (ret > 0)
+ ret = blk_status_to_errno(nvme_error_status(ret));
return ret;
}
@@ -3617,6 +3661,33 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
}
EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
+static int nvme_class_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+ struct nvme_ctrl *ctrl =
+ container_of(dev, struct nvme_ctrl, ctrl_device);
+ struct nvmf_ctrl_options *opts = ctrl->opts;
+ int ret;
+
+ ret = add_uevent_var(env, "NVME_TRTYPE=%s", ctrl->ops->name);
+ if (ret)
+ return ret;
+
+ if (opts) {
+ ret = add_uevent_var(env, "NVME_TRADDR=%s", opts->traddr);
+ if (ret)
+ return ret;
+
+ ret = add_uevent_var(env, "NVME_TRSVCID=%s",
+ opts->trsvcid ?: "none");
+ if (ret)
+ return ret;
+
+ ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s",
+ opts->host_traddr ?: "none");
+ }
+ return ret;
+}
+
static void nvme_aen_uevent(struct nvme_ctrl *ctrl)
{
char *envp[2] = { NULL, NULL };
@@ -3723,6 +3794,9 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
queue_work(nvme_wq, &ctrl->ana_work);
break;
#endif
+ case NVME_AER_NOTICE_DISC_CHANGED:
+ ctrl->aen_result = result;
+ break;
default:
dev_warn(ctrl->device, "async event result %08x\n", result);
}
@@ -3769,10 +3843,10 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl)
if (ctrl->kato)
nvme_start_keep_alive(ctrl);
+ nvme_enable_aen(ctrl);
+
if (ctrl->queue_count > 1) {
nvme_queue_scan(ctrl);
- nvme_enable_aen(ctrl);
- queue_work(nvme_wq, &ctrl->async_event_work);
nvme_start_queues(ctrl);
}
}
@@ -3792,7 +3866,9 @@ static void nvme_free_ctrl(struct device *dev)
container_of(dev, struct nvme_ctrl, ctrl_device);
struct nvme_subsystem *subsys = ctrl->subsys;
- ida_simple_remove(&nvme_instance_ida, ctrl->instance);
+ if (subsys && ctrl->instance != subsys->instance)
+ ida_simple_remove(&nvme_instance_ida, ctrl->instance);
+
kfree(ctrl->effects);
nvme_mpath_uninit(ctrl);
__free_page(ctrl->discard_page);
@@ -3992,6 +4068,9 @@ void nvme_sync_queues(struct nvme_ctrl *ctrl)
list_for_each_entry(ns, &ctrl->namespaces, list)
blk_sync_queue(ns->queue);
up_read(&ctrl->namespaces_rwsem);
+
+ if (ctrl->admin_q)
+ blk_sync_queue(ctrl->admin_q);
}
EXPORT_SYMBOL_GPL(nvme_sync_queues);
@@ -4050,6 +4129,7 @@ static int __init nvme_core_init(void)
result = PTR_ERR(nvme_class);
goto unregister_chrdev;
}
+ nvme_class->dev_uevent = nvme_class_uevent;
nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem");
if (IS_ERR(nvme_subsys_class)) {
@@ -4074,7 +4154,6 @@ out:
static void __exit nvme_core_exit(void)
{
- ida_destroy(&nvme_subsystems_ida);
class_destroy(nvme_subsys_class);
class_destroy(nvme_class);
unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 1994d5b42f94..74b8818ac9a1 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -150,7 +150,7 @@ int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
cmd.prop_get.fctype = nvme_fabrics_type_property_get;
cmd.prop_get.offset = cpu_to_le32(off);
- ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0,
+ ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0,
NVME_QID_ANY, 0, 0, false);
if (ret >= 0)
@@ -197,7 +197,7 @@ int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
cmd.prop_get.attrib = 1;
cmd.prop_get.offset = cpu_to_le32(off);
- ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0,
+ ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0,
NVME_QID_ANY, 0, 0, false);
if (ret >= 0)
@@ -243,7 +243,7 @@ int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
cmd.prop_set.offset = cpu_to_le32(off);
cmd.prop_set.value = cpu_to_le64(val);
- ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, NULL, 0, 0,
+ ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, NULL, NULL, 0, 0,
NVME_QID_ANY, 0, 0, false);
if (unlikely(ret))
dev_err(ctrl->device,
@@ -381,8 +381,8 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
* Set keep-alive timeout in seconds granularity (ms * 1000)
* and add a grace period for controller kato enforcement
*/
- cmd.connect.kato = ctrl->opts->discovery_nqn ? 0 :
- cpu_to_le32((ctrl->kato + NVME_KATO_GRACE) * 1000);
+ cmd.connect.kato = ctrl->kato ?
+ cpu_to_le32((ctrl->kato + NVME_KATO_GRACE) * 1000) : 0;
if (ctrl->opts->disable_sqflow)
cmd.connect.cattr |= NVME_CONNECT_DISABLE_SQFLOW;
@@ -396,7 +396,7 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE);
strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE);
- ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res,
+ ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res,
data, sizeof(*data), 0, NVME_QID_ANY, 1,
BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT, false);
if (ret) {
@@ -611,6 +611,7 @@ static const match_table_t opt_tokens = {
{ NVMF_OPT_DATA_DIGEST, "data_digest" },
{ NVMF_OPT_NR_WRITE_QUEUES, "nr_write_queues=%d" },
{ NVMF_OPT_NR_POLL_QUEUES, "nr_poll_queues=%d" },
+ { NVMF_OPT_TOS, "tos=%d" },
{ NVMF_OPT_ERR, NULL }
};
@@ -632,6 +633,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
opts->duplicate_connect = false;
opts->hdr_digest = false;
opts->data_digest = false;
+ opts->tos = -1; /* < 0 == use transport default */
options = o = kstrdup(buf, GFP_KERNEL);
if (!options)
@@ -738,13 +740,6 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
pr_warn("keep_alive_tmo 0 won't execute keep alives!!!\n");
}
opts->kato = token;
-
- if (opts->discovery_nqn && opts->kato) {
- pr_err("Discovery controllers cannot accept KATO != 0\n");
- ret = -EINVAL;
- goto out;
- }
-
break;
case NVMF_OPT_CTRL_LOSS_TMO:
if (match_int(args, &token)) {
@@ -856,6 +851,22 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
}
opts->nr_poll_queues = token;
break;
+ case NVMF_OPT_TOS:
+ if (match_int(args, &token)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (token < 0) {
+ pr_err("Invalid type of service %d\n", token);
+ ret = -EINVAL;
+ goto out;
+ }
+ if (token > 255) {
+ pr_warn("Clamping type of service to 255\n");
+ token = 255;
+ }
+ opts->tos = token;
+ break;
default:
pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
p);
@@ -865,7 +876,6 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
}
if (opts->discovery_nqn) {
- opts->kato = 0;
opts->nr_io_queues = 0;
opts->nr_write_queues = 0;
opts->nr_poll_queues = 0;
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 3044d8b99a24..93f08d77c896 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -55,6 +55,7 @@ enum {
NVMF_OPT_DATA_DIGEST = 1 << 16,
NVMF_OPT_NR_WRITE_QUEUES = 1 << 17,
NVMF_OPT_NR_POLL_QUEUES = 1 << 18,
+ NVMF_OPT_TOS = 1 << 19,
};
/**
@@ -87,6 +88,7 @@ enum {
* @data_digest: generate/verify data digest (TCP)
* @nr_write_queues: number of queues for write I/O
* @nr_poll_queues: number of queues for polling I/O
+ * @tos: type of service
*/
struct nvmf_ctrl_options {
unsigned mask;
@@ -108,6 +110,7 @@ struct nvmf_ctrl_options {
bool data_digest;
unsigned int nr_write_queues;
unsigned int nr_poll_queues;
+ int tos;
};
/*
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 232d8094091b..265f89e11d8b 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -1608,9 +1608,13 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
sizeof(op->rsp_iu), DMA_FROM_DEVICE);
if (opstate == FCPOP_STATE_ABORTED)
- status = cpu_to_le16(NVME_SC_ABORT_REQ << 1);
- else if (freq->status)
- status = cpu_to_le16(NVME_SC_INTERNAL << 1);
+ status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1);
+ else if (freq->status) {
+ status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1);
+ dev_info(ctrl->ctrl.device,
+ "NVME-FC{%d}: io failed due to lldd error %d\n",
+ ctrl->cnum, freq->status);
+ }
/*
* For the linux implementation, if we have an unsuccesful
@@ -1637,8 +1641,13 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
* no payload in the CQE by the transport.
*/
if (freq->transferred_length !=
- be32_to_cpu(op->cmd_iu.data_len)) {
- status = cpu_to_le16(NVME_SC_INTERNAL << 1);
+ be32_to_cpu(op->cmd_iu.data_len)) {
+ status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1);
+ dev_info(ctrl->ctrl.device,
+ "NVME-FC{%d}: io failed due to bad transfer "
+ "length: %d vs expected %d\n",
+ ctrl->cnum, freq->transferred_length,
+ be32_to_cpu(op->cmd_iu.data_len));
goto done;
}
result.u64 = 0;
@@ -1655,7 +1664,17 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
freq->transferred_length ||
op->rsp_iu.status_code ||
sqe->common.command_id != cqe->command_id)) {
- status = cpu_to_le16(NVME_SC_INTERNAL << 1);
+ status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1);
+ dev_info(ctrl->ctrl.device,
+ "NVME-FC{%d}: io failed due to bad NVMe_ERSP: "
+ "iu len %d, xfr len %d vs %d, status code "
+ "%d, cmdid %d vs %d\n",
+ ctrl->cnum, be16_to_cpu(op->rsp_iu.iu_len),
+ be32_to_cpu(op->rsp_iu.xfrd_len),
+ freq->transferred_length,
+ op->rsp_iu.status_code,
+ sqe->common.command_id,
+ cqe->command_id);
goto done;
}
result = cqe->result;
@@ -1663,7 +1682,11 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
break;
default:
- status = cpu_to_le16(NVME_SC_INTERNAL << 1);
+ status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1);
+ dev_info(ctrl->ctrl.device,
+ "NVME-FC{%d}: io failed due to odd NVMe_xRSP iu "
+ "len %d\n",
+ ctrl->cnum, freq->rcv_rsplen);
goto done;
}
@@ -2006,6 +2029,7 @@ nvme_fc_ctrl_free(struct kref *ref)
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
blk_cleanup_queue(ctrl->ctrl.admin_q);
+ blk_cleanup_queue(ctrl->ctrl.fabrics_q);
blk_mq_free_tag_set(&ctrl->admin_tag_set);
kfree(ctrl->queues);
@@ -2107,7 +2131,6 @@ nvme_fc_map_data(struct nvme_fc_ctrl *ctrl, struct request *rq,
struct nvme_fc_fcp_op *op)
{
struct nvmefc_fcp_req *freq = &op->fcp_req;
- enum dma_data_direction dir;
int ret;
freq->sg_cnt = 0;
@@ -2124,9 +2147,8 @@ nvme_fc_map_data(struct nvme_fc_ctrl *ctrl, struct request *rq,
op->nents = blk_rq_map_sg(rq->q, rq, freq->sg_table.sgl);
WARN_ON(op->nents > blk_rq_nr_phys_segments(rq));
- dir = (rq_data_dir(rq) == WRITE) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
freq->sg_cnt = fc_dma_map_sg(ctrl->lport->dev, freq->sg_table.sgl,
- op->nents, dir);
+ op->nents, rq_dma_dir(rq));
if (unlikely(freq->sg_cnt <= 0)) {
sg_free_table_chained(&freq->sg_table, SG_CHUNK_SIZE);
freq->sg_cnt = 0;
@@ -2149,8 +2171,7 @@ nvme_fc_unmap_data(struct nvme_fc_ctrl *ctrl, struct request *rq,
return;
fc_dma_unmap_sg(ctrl->lport->dev, freq->sg_table.sgl, op->nents,
- ((rq_data_dir(rq) == WRITE) ?
- DMA_TO_DEVICE : DMA_FROM_DEVICE));
+ rq_dma_dir(rq));
nvme_cleanup_cmd(rq);
@@ -2633,8 +2654,6 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
if (ret)
goto out_delete_hw_queue;
- blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
-
ret = nvmf_connect_admin_queue(&ctrl->ctrl);
if (ret)
goto out_disconnect_admin_queue;
@@ -2648,23 +2667,15 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
* prior connection values
*/
- ret = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->ctrl.cap);
- if (ret) {
- dev_err(ctrl->ctrl.device,
- "prop_get NVME_REG_CAP failed\n");
- goto out_disconnect_admin_queue;
- }
-
- ctrl->ctrl.sqsize =
- min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize);
-
- ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
+ ret = nvme_enable_ctrl(&ctrl->ctrl);
if (ret)
goto out_disconnect_admin_queue;
ctrl->ctrl.max_hw_sectors =
(ctrl->lport->ops->max_sgl_segments - 1) << (PAGE_SHIFT - 9);
+ blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+
ret = nvme_init_identify(&ctrl->ctrl);
if (ret)
goto out_disconnect_admin_queue;
@@ -2774,6 +2785,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
nvme_stop_queues(&ctrl->ctrl);
blk_mq_tagset_busy_iter(&ctrl->tag_set,
nvme_fc_terminate_exchange, &ctrl->ctrl);
+ blk_mq_tagset_wait_completed_request(&ctrl->tag_set);
}
/*
@@ -2796,6 +2808,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
nvme_fc_terminate_exchange, &ctrl->ctrl);
+ blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set);
/* kill the aens as they are a separate path */
nvme_fc_abort_aen_ops(ctrl);
@@ -3109,10 +3122,16 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
goto out_free_queues;
ctrl->ctrl.admin_tagset = &ctrl->admin_tag_set;
+ ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set);
+ if (IS_ERR(ctrl->ctrl.fabrics_q)) {
+ ret = PTR_ERR(ctrl->ctrl.fabrics_q);
+ goto out_free_admin_tag_set;
+ }
+
ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
if (IS_ERR(ctrl->ctrl.admin_q)) {
ret = PTR_ERR(ctrl->ctrl.admin_q);
- goto out_free_admin_tag_set;
+ goto out_cleanup_fabrics_q;
}
/*
@@ -3184,6 +3203,8 @@ fail_ctrl:
out_cleanup_admin_q:
blk_cleanup_queue(ctrl->ctrl.admin_q);
+out_cleanup_fabrics_q:
+ blk_cleanup_queue(ctrl->ctrl.fabrics_q);
out_free_admin_tag_set:
blk_mq_free_tag_set(&ctrl->admin_tag_set);
out_free_queues:
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index ba009d4c9dfa..ec46693f6b64 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -667,11 +667,14 @@ static struct request *nvme_nvm_alloc_request(struct request_queue *q,
return rq;
}
-static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
+static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd,
+ void *buf)
{
+ struct nvm_geo *geo = &dev->geo;
struct request_queue *q = dev->q;
struct nvme_nvm_command *cmd;
struct request *rq;
+ int ret;
cmd = kzalloc(sizeof(struct nvme_nvm_command), GFP_KERNEL);
if (!cmd)
@@ -679,8 +682,15 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
rq = nvme_nvm_alloc_request(q, rqd, cmd);
if (IS_ERR(rq)) {
- kfree(cmd);
- return PTR_ERR(rq);
+ ret = PTR_ERR(rq);
+ goto err_free_cmd;
+ }
+
+ if (buf) {
+ ret = blk_rq_map_kern(q, rq, buf, geo->csecs * rqd->nr_ppas,
+ GFP_KERNEL);
+ if (ret)
+ goto err_free_cmd;
}
rq->end_io_data = rqd;
@@ -688,33 +698,9 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
blk_execute_rq_nowait(q, NULL, rq, 0, nvme_nvm_end_io);
return 0;
-}
-
-static int nvme_nvm_submit_io_sync(struct nvm_dev *dev, struct nvm_rq *rqd)
-{
- struct request_queue *q = dev->q;
- struct request *rq;
- struct nvme_nvm_command cmd;
- int ret = 0;
-
- memset(&cmd, 0, sizeof(struct nvme_nvm_command));
-
- rq = nvme_nvm_alloc_request(q, rqd, &cmd);
- if (IS_ERR(rq))
- return PTR_ERR(rq);
-
- /* I/Os can fail and the error is signaled through rqd. Callers must
- * handle the error accordingly.
- */
- blk_execute_rq(q, NULL, rq, 0);
- if (nvme_req(rq)->flags & NVME_REQ_CANCELLED)
- ret = -EINTR;
-
- rqd->ppa_status = le64_to_cpu(nvme_req(rq)->result.u64);
- rqd->error = nvme_req(rq)->status;
-
- blk_mq_free_request(rq);
+err_free_cmd:
+ kfree(cmd);
return ret;
}
@@ -754,7 +740,6 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = {
.get_chk_meta = nvme_nvm_get_chk_meta,
.submit_io = nvme_nvm_submit_io,
- .submit_io_sync = nvme_nvm_submit_io_sync,
.create_dma_pool = nvme_nvm_create_dma_pool,
.destroy_dma_pool = nvme_nvm_destroy_dma_pool,
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index af831d3d15d0..30de7efef003 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -509,14 +509,16 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
down_write(&ctrl->namespaces_rwsem);
list_for_each_entry(ns, &ctrl->namespaces, list) {
- if (ns->head->ns_id != le32_to_cpu(desc->nsids[n]))
+ unsigned nsid = le32_to_cpu(desc->nsids[n]);
+
+ if (ns->head->ns_id < nsid)
continue;
- nvme_update_ns_ana_state(desc, ns);
+ if (ns->head->ns_id == nsid)
+ nvme_update_ns_ana_state(desc, ns);
if (++n == nr_nsids)
break;
}
up_write(&ctrl->namespaces_rwsem);
- WARN_ON_ONCE(n < nr_nsids);
return 0;
}
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 2d678fb968c7..b5013c101b35 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -16,6 +16,8 @@
#include <linux/fault-inject.h>
#include <linux/rcupdate.h>
+#include <trace/events/block.h>
+
extern unsigned int nvme_io_timeout;
#define NVME_IO_TIMEOUT (nvme_io_timeout * HZ)
@@ -97,6 +99,21 @@ enum nvme_quirks {
* Force simple suspend/resume path.
*/
NVME_QUIRK_SIMPLE_SUSPEND = (1 << 10),
+
+ /*
+ * Use only one interrupt vector for all queues
+ */
+ NVME_QUIRK_SINGLE_VECTOR = (1 << 11),
+
+ /*
+ * Use non-standard 128 bytes SQEs.
+ */
+ NVME_QUIRK_128_BYTES_SQES = (1 << 12),
+
+ /*
+ * Prevent tag overlap between queues
+ */
+ NVME_QUIRK_SHARED_TAGS = (1 << 13),
};
/*
@@ -169,6 +186,7 @@ struct nvme_ctrl {
const struct nvme_ctrl_ops *ops;
struct request_queue *admin_q;
struct request_queue *connect_q;
+ struct request_queue *fabrics_q;
struct device *dev;
int instance;
int numa_node;
@@ -431,8 +449,8 @@ void nvme_complete_rq(struct request *req);
bool nvme_cancel_request(struct request *req, void *data, bool reserved);
bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
enum nvme_ctrl_state new_state);
-int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap);
-int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap);
+int nvme_disable_ctrl(struct nvme_ctrl *ctrl);
+int nvme_enable_ctrl(struct nvme_ctrl *ctrl);
int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl);
int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
const struct nvme_ctrl_ops *ops, unsigned long quirks);
@@ -520,6 +538,16 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
kblockd_schedule_work(&head->requeue_work);
}
+static inline void nvme_trace_bio_complete(struct request *req,
+ blk_status_t status)
+{
+ struct nvme_ns *ns = req->q->queuedata;
+
+ if (req->cmd_flags & REQ_NVME_MPATH)
+ trace_block_bio_complete(ns->head->disk->queue,
+ req->bio, status);
+}
+
extern struct device_attribute dev_attr_ana_grpid;
extern struct device_attribute dev_attr_ana_state;
extern struct device_attribute subsys_attr_iopolicy;
@@ -567,6 +595,10 @@ static inline void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
{
}
+static inline void nvme_trace_bio_complete(struct request *req,
+ blk_status_t status)
+{
+}
static inline int nvme_mpath_init(struct nvme_ctrl *ctrl,
struct nvme_id_ctrl *id)
{
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 732d5b63ec05..6b4d7b064b38 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -28,8 +28,8 @@
#include "trace.h"
#include "nvme.h"
-#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command))
-#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion))
+#define SQ_SIZE(q) ((q)->q_depth << (q)->sqes)
+#define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion))
#define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc))
@@ -100,6 +100,7 @@ struct nvme_dev {
unsigned io_queues[HCTX_MAX_TYPES];
unsigned int num_vecs;
int q_depth;
+ int io_sqes;
u32 db_stride;
void __iomem *bar;
unsigned long bar_mapped_size;
@@ -162,7 +163,7 @@ static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
struct nvme_queue {
struct nvme_dev *dev;
spinlock_t sq_lock;
- struct nvme_command *sq_cmds;
+ void *sq_cmds;
/* only used for poll queues: */
spinlock_t cq_poll_lock ____cacheline_aligned_in_smp;
volatile struct nvme_completion *cqes;
@@ -178,6 +179,7 @@ struct nvme_queue {
u16 last_cq_head;
u16 qid;
u8 cq_phase;
+ u8 sqes;
unsigned long flags;
#define NVMEQ_ENABLED 0
#define NVMEQ_SQ_CMB 1
@@ -488,7 +490,8 @@ static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
bool write_sq)
{
spin_lock(&nvmeq->sq_lock);
- memcpy(&nvmeq->sq_cmds[nvmeq->sq_tail], cmd, sizeof(*cmd));
+ memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes),
+ cmd, sizeof(*cmd));
if (++nvmeq->sq_tail == nvmeq->q_depth)
nvmeq->sq_tail = 0;
nvme_write_sq_db(nvmeq, write_sq);
@@ -534,14 +537,13 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- enum dma_data_direction dma_dir = rq_data_dir(req) ?
- DMA_TO_DEVICE : DMA_FROM_DEVICE;
const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1;
dma_addr_t dma_addr = iod->first_dma, next_dma_addr;
int i;
if (iod->dma_len) {
- dma_unmap_page(dev->dev, dma_addr, iod->dma_len, dma_dir);
+ dma_unmap_page(dev->dev, dma_addr, iod->dma_len,
+ rq_dma_dir(req));
return;
}
@@ -1344,16 +1346,16 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
static void nvme_free_queue(struct nvme_queue *nvmeq)
{
- dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq->q_depth),
+ dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq),
(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
if (!nvmeq->sq_cmds)
return;
if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) {
pci_free_p2pmem(to_pci_dev(nvmeq->dev->dev),
- nvmeq->sq_cmds, SQ_SIZE(nvmeq->q_depth));
+ nvmeq->sq_cmds, SQ_SIZE(nvmeq));
} else {
- dma_free_coherent(nvmeq->dev->dev, SQ_SIZE(nvmeq->q_depth),
+ dma_free_coherent(nvmeq->dev->dev, SQ_SIZE(nvmeq),
nvmeq->sq_cmds, nvmeq->sq_dma_addr);
}
}
@@ -1403,7 +1405,7 @@ static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
if (shutdown)
nvme_shutdown_ctrl(&dev->ctrl);
else
- nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap);
+ nvme_disable_ctrl(&dev->ctrl);
nvme_poll_irqdisable(nvmeq, -1);
}
@@ -1433,12 +1435,12 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
}
static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
- int qid, int depth)
+ int qid)
{
struct pci_dev *pdev = to_pci_dev(dev->dev);
if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
- nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(depth));
+ nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(nvmeq));
if (nvmeq->sq_cmds) {
nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev,
nvmeq->sq_cmds);
@@ -1447,11 +1449,11 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
return 0;
}
- pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(depth));
+ pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(nvmeq));
}
}
- nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
+ nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(nvmeq),
&nvmeq->sq_dma_addr, GFP_KERNEL);
if (!nvmeq->sq_cmds)
return -ENOMEM;
@@ -1465,12 +1467,14 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
if (dev->ctrl.queue_count > qid)
return 0;
- nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(depth),
+ nvmeq->sqes = qid ? dev->io_sqes : NVME_ADM_SQES;
+ nvmeq->q_depth = depth;
+ nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(nvmeq),
&nvmeq->cq_dma_addr, GFP_KERNEL);
if (!nvmeq->cqes)
goto free_nvmeq;
- if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth))
+ if (nvme_alloc_sq_cmds(dev, nvmeq, qid))
goto free_cqdma;
nvmeq->dev = dev;
@@ -1479,15 +1483,14 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
nvmeq->cq_head = 0;
nvmeq->cq_phase = 1;
nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
- nvmeq->q_depth = depth;
nvmeq->qid = qid;
dev->ctrl.queue_count++;
return 0;
free_cqdma:
- dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
- nvmeq->cq_dma_addr);
+ dma_free_coherent(dev->dev, CQ_SIZE(nvmeq), (void *)nvmeq->cqes,
+ nvmeq->cq_dma_addr);
free_nvmeq:
return -ENOMEM;
}
@@ -1515,7 +1518,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
nvmeq->cq_head = 0;
nvmeq->cq_phase = 1;
nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
- memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
+ memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq));
nvme_dbbuf_init(dev, nvmeq, qid);
dev->online_queues++;
wmb(); /* ensure the first interrupt sees the initialization */
@@ -1552,7 +1555,6 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
nvme_init_queue(nvmeq, qid);
if (!polled) {
- nvmeq->cq_vector = vector;
result = queue_request_irq(nvmeq);
if (result < 0)
goto release_sq;
@@ -1679,7 +1681,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
(readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);
- result = nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap);
+ result = nvme_disable_ctrl(&dev->ctrl);
if (result < 0)
return result;
@@ -1695,7 +1697,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ);
lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ);
- result = nvme_enable_ctrl(&dev->ctrl, dev->ctrl.cap);
+ result = nvme_enable_ctrl(&dev->ctrl);
if (result)
return result;
@@ -2077,6 +2079,13 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
dev->io_queues[HCTX_TYPE_READ] = 0;
+ /*
+ * Some Apple controllers require all queues to use the
+ * first vector.
+ */
+ if (dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR)
+ irq_queues = 1;
+
return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
}
@@ -2095,6 +2104,14 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
unsigned long size;
nr_io_queues = max_io_queues();
+
+ /*
+ * If tags are shared with admin queue (Apple bug), then
+ * make sure we only use one IO queue.
+ */
+ if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
+ nr_io_queues = 1;
+
result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
if (result < 0)
return result;
@@ -2265,6 +2282,14 @@ static int nvme_dev_add(struct nvme_dev *dev)
dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
dev->tagset.driver_data = dev;
+ /*
+ * Some Apple controllers requires tags to be unique
+ * across admin and IO queue, so reserve the first 32
+ * tags of the IO queue.
+ */
+ if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
+ dev->tagset.reserved_tags = NVME_AQ_DEPTH;
+
ret = blk_mq_alloc_tag_set(&dev->tagset);
if (ret) {
dev_warn(dev->ctrl.device,
@@ -2314,10 +2339,21 @@ static int nvme_pci_enable(struct nvme_dev *dev)
dev->q_depth = min_t(int, NVME_CAP_MQES(dev->ctrl.cap) + 1,
io_queue_depth);
+ dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */
dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap);
dev->dbs = dev->bar + 4096;
/*
+ * Some Apple controllers require a non-standard SQE size.
+ * Interestingly they also seem to ignore the CC:IOSQES register
+ * so we don't bother updating it here.
+ */
+ if (dev->ctrl.quirks & NVME_QUIRK_128_BYTES_SQES)
+ dev->io_sqes = 7;
+ else
+ dev->io_sqes = NVME_NVM_IOSQES;
+
+ /*
* Temporary fix for the Apple controller found in the MacBook8,1 and
* some MacBook7,1 to avoid controller resets and data loss.
*/
@@ -2334,6 +2370,18 @@ static int nvme_pci_enable(struct nvme_dev *dev)
"set queue depth=%u\n", dev->q_depth);
}
+ /*
+ * Controllers with the shared tags quirk need the IO queue to be
+ * big enough so that we get 32 tags for the admin queue
+ */
+ if ((dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) &&
+ (dev->q_depth < (NVME_AQ_DEPTH + 2))) {
+ dev->q_depth = NVME_AQ_DEPTH + 2;
+ dev_warn(dev->ctrl.device, "IO queue depth clamped to %d\n",
+ dev->q_depth);
+ }
+
+
nvme_map_cmb(dev);
pci_enable_pcie_error_reporting(pdev);
@@ -2401,6 +2449,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
+ blk_mq_tagset_wait_completed_request(&dev->tagset);
+ blk_mq_tagset_wait_completed_request(&dev->admin_tagset);
/*
* The driver will not be starting up queues again if shutting down so
@@ -3041,6 +3091,10 @@ static const struct pci_device_id nvme_id_table[] = {
{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
+ { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005),
+ .driver_data = NVME_QUIRK_SINGLE_VECTOR |
+ NVME_QUIRK_128_BYTES_SQES |
+ NVME_QUIRK_SHARED_TAGS },
{ 0, }
};
MODULE_DEVICE_TABLE(pci, nvme_id_table);
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 1a6449bc547b..dfa07bb9dfeb 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -757,6 +757,7 @@ static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl,
{
if (remove) {
blk_cleanup_queue(ctrl->ctrl.admin_q);
+ blk_cleanup_queue(ctrl->ctrl.fabrics_q);
blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
}
if (ctrl->async_event_sqe.data) {
@@ -798,10 +799,16 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
goto out_free_async_qe;
}
+ ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set);
+ if (IS_ERR(ctrl->ctrl.fabrics_q)) {
+ error = PTR_ERR(ctrl->ctrl.fabrics_q);
+ goto out_free_tagset;
+ }
+
ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
if (IS_ERR(ctrl->ctrl.admin_q)) {
error = PTR_ERR(ctrl->ctrl.admin_q);
- goto out_free_tagset;
+ goto out_cleanup_fabrics_q;
}
}
@@ -809,24 +816,15 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
if (error)
goto out_cleanup_queue;
- error = ctrl->ctrl.ops->reg_read64(&ctrl->ctrl, NVME_REG_CAP,
- &ctrl->ctrl.cap);
- if (error) {
- dev_err(ctrl->ctrl.device,
- "prop_get NVME_REG_CAP failed\n");
- goto out_stop_queue;
- }
-
- ctrl->ctrl.sqsize =
- min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize);
-
- error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
+ error = nvme_enable_ctrl(&ctrl->ctrl);
if (error)
goto out_stop_queue;
ctrl->ctrl.max_hw_sectors =
(ctrl->max_fr_pages - 1) << (ilog2(SZ_4K) - 9);
+ blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+
error = nvme_init_identify(&ctrl->ctrl);
if (error)
goto out_stop_queue;
@@ -838,6 +836,9 @@ out_stop_queue:
out_cleanup_queue:
if (new)
blk_cleanup_queue(ctrl->ctrl.admin_q);
+out_cleanup_fabrics_q:
+ if (new)
+ blk_cleanup_queue(ctrl->ctrl.fabrics_q);
out_free_tagset:
if (new)
blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
@@ -907,10 +908,13 @@ static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
{
blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
nvme_rdma_stop_queue(&ctrl->queues[0]);
- if (ctrl->ctrl.admin_tagset)
+ if (ctrl->ctrl.admin_tagset) {
blk_mq_tagset_busy_iter(ctrl->ctrl.admin_tagset,
nvme_cancel_request, &ctrl->ctrl);
- blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+ blk_mq_tagset_wait_completed_request(ctrl->ctrl.admin_tagset);
+ }
+ if (remove)
+ blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
nvme_rdma_destroy_admin_queue(ctrl, remove);
}
@@ -920,9 +924,11 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
if (ctrl->ctrl.queue_count > 1) {
nvme_stop_queues(&ctrl->ctrl);
nvme_rdma_stop_io_queues(ctrl);
- if (ctrl->ctrl.tagset)
+ if (ctrl->ctrl.tagset) {
blk_mq_tagset_busy_iter(ctrl->ctrl.tagset,
nvme_cancel_request, &ctrl->ctrl);
+ blk_mq_tagset_wait_completed_request(ctrl->ctrl.tagset);
+ }
if (remove)
nvme_start_queues(&ctrl->ctrl);
nvme_rdma_destroy_io_queues(ctrl, remove);
@@ -1059,6 +1065,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
nvme_rdma_teardown_io_queues(ctrl, false);
nvme_start_queues(&ctrl->ctrl);
nvme_rdma_teardown_admin_queue(ctrl, false);
+ blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
/* state change failure is ok if we're in DELETING state */
@@ -1145,9 +1152,7 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue,
req->mr = NULL;
}
- ib_dma_unmap_sg(ibdev, req->sg_table.sgl,
- req->nents, rq_data_dir(rq) ==
- WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, rq_dma_dir(rq));
nvme_cleanup_cmd(rq);
sg_free_table_chained(&req->sg_table, SG_CHUNK_SIZE);
@@ -1273,7 +1278,7 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
req->nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl);
count = ib_dma_map_sg(ibdev, req->sg_table.sgl, req->nents,
- rq_data_dir(rq) == WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ rq_dma_dir(rq));
if (unlikely(count <= 0)) {
ret = -EIO;
goto out_free_table;
@@ -1302,9 +1307,7 @@ out:
return 0;
out_unmap_sg:
- ib_dma_unmap_sg(ibdev, req->sg_table.sgl,
- req->nents, rq_data_dir(rq) ==
- WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, rq_dma_dir(rq));
out_free_table:
sg_free_table_chained(&req->sg_table, SG_CHUNK_SIZE);
return ret;
@@ -1547,16 +1550,18 @@ static int nvme_rdma_conn_rejected(struct nvme_rdma_queue *queue,
static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue)
{
+ struct nvme_ctrl *ctrl = &queue->ctrl->ctrl;
int ret;
ret = nvme_rdma_create_queue_ib(queue);
if (ret)
return ret;
+ if (ctrl->opts->tos >= 0)
+ rdma_set_service_type(queue->cm_id, ctrl->opts->tos);
ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS);
if (ret) {
- dev_err(queue->ctrl->ctrl.device,
- "rdma_resolve_route failed (%d).\n",
+ dev_err(ctrl->device, "rdma_resolve_route failed (%d).\n",
queue->cm_error);
goto out_destroy_queue;
}
@@ -1869,10 +1874,11 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
cancel_delayed_work_sync(&ctrl->reconnect_work);
nvme_rdma_teardown_io_queues(ctrl, shutdown);
+ blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
if (shutdown)
nvme_shutdown_ctrl(&ctrl->ctrl);
else
- nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
+ nvme_disable_ctrl(&ctrl->ctrl);
nvme_rdma_teardown_admin_queue(ctrl, shutdown);
}
@@ -2051,7 +2057,8 @@ static struct nvmf_transport_ops nvme_rdma_transport = {
.required_opts = NVMF_OPT_TRADDR,
.allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
- NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES,
+ NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
+ NVMF_OPT_TOS,
.create_ctrl = nvme_rdma_create_ctrl,
};
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 606b13d35d16..4ffd5957637a 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -13,6 +13,7 @@
#include <net/tcp.h>
#include <linux/blk-mq.h>
#include <crypto/hash.h>
+#include <net/busy_poll.h>
#include "nvme.h"
#include "fabrics.h"
@@ -72,6 +73,7 @@ struct nvme_tcp_queue {
int pdu_offset;
size_t data_remaining;
size_t ddgst_remaining;
+ unsigned int nr_cqe;
/* send state */
struct nvme_tcp_request *request;
@@ -438,6 +440,7 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
}
nvme_end_request(rq, cqe->status, cqe->result);
+ queue->nr_cqe++;
return 0;
}
@@ -608,23 +611,18 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
switch (hdr->type) {
case nvme_tcp_c2h_data:
- ret = nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
- break;
+ return nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
case nvme_tcp_rsp:
nvme_tcp_init_recv_ctx(queue);
- ret = nvme_tcp_handle_comp(queue, (void *)queue->pdu);
- break;
+ return nvme_tcp_handle_comp(queue, (void *)queue->pdu);
case nvme_tcp_r2t:
nvme_tcp_init_recv_ctx(queue);
- ret = nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
- break;
+ return nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
default:
dev_err(queue->ctrl->ctrl.device,
"unsupported pdu type (%d)\n", hdr->type);
return -EINVAL;
}
-
- return ret;
}
static inline void nvme_tcp_end_request(struct request *rq, u16 status)
@@ -701,8 +699,10 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
} else {
- if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS)
+ if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
+ queue->nr_cqe++;
+ }
nvme_tcp_init_recv_ctx(queue);
}
}
@@ -742,6 +742,7 @@ static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
pdu->command_id);
nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
+ queue->nr_cqe++;
}
nvme_tcp_init_recv_ctx(queue);
@@ -841,7 +842,7 @@ static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
{
- nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_DATA_XFER_ERROR);
+ nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_HOST_PATH_ERROR);
}
static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
@@ -1023,14 +1024,16 @@ done:
static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
{
- struct sock *sk = queue->sock->sk;
+ struct socket *sock = queue->sock;
+ struct sock *sk = sock->sk;
read_descriptor_t rd_desc;
int consumed;
rd_desc.arg.data = queue;
rd_desc.count = 1;
lock_sock(sk);
- consumed = tcp_read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
+ queue->nr_cqe = 0;
+ consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
release_sock(sk);
return consumed;
}
@@ -1255,7 +1258,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
queue->queue_size = queue_size;
if (qid > 0)
- queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16;
+ queue->cmnd_capsule_len = nctrl->ioccsz * 16;
else
queue->cmnd_capsule_len = sizeof(struct nvme_command) +
NVME_TCP_ADMIN_CCSZ;
@@ -1263,7 +1266,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM,
IPPROTO_TCP, &queue->sock);
if (ret) {
- dev_err(ctrl->ctrl.device,
+ dev_err(nctrl->device,
"failed to create socket: %d\n", ret);
return ret;
}
@@ -1273,7 +1276,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT,
(char *)&opt, sizeof(opt));
if (ret) {
- dev_err(ctrl->ctrl.device,
+ dev_err(nctrl->device,
"failed to set TCP_SYNCNT sock opt %d\n", ret);
goto err_sock;
}
@@ -1283,7 +1286,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
ret = kernel_setsockopt(queue->sock, IPPROTO_TCP,
TCP_NODELAY, (char *)&opt, sizeof(opt));
if (ret) {
- dev_err(ctrl->ctrl.device,
+ dev_err(nctrl->device,
"failed to set TCP_NODELAY sock opt %d\n", ret);
goto err_sock;
}
@@ -1296,11 +1299,23 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER,
(char *)&sol, sizeof(sol));
if (ret) {
- dev_err(ctrl->ctrl.device,
+ dev_err(nctrl->device,
"failed to set SO_LINGER sock opt %d\n", ret);
goto err_sock;
}
+ /* Set socket type of service */
+ if (nctrl->opts->tos >= 0) {
+ opt = nctrl->opts->tos;
+ ret = kernel_setsockopt(queue->sock, SOL_IP, IP_TOS,
+ (char *)&opt, sizeof(opt));
+ if (ret) {
+ dev_err(nctrl->device,
+ "failed to set IP_TOS sock opt %d\n", ret);
+ goto err_sock;
+ }
+ }
+
queue->sock->sk->sk_allocation = GFP_ATOMIC;
if (!qid)
n = 0;
@@ -1314,11 +1329,11 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
queue->pdu_offset = 0;
sk_set_memalloc(queue->sock->sk);
- if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) {
+ if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr,
sizeof(ctrl->src_addr));
if (ret) {
- dev_err(ctrl->ctrl.device,
+ dev_err(nctrl->device,
"failed to bind queue %d socket %d\n",
qid, ret);
goto err_sock;
@@ -1330,7 +1345,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
if (queue->hdr_digest || queue->data_digest) {
ret = nvme_tcp_alloc_crypto(queue);
if (ret) {
- dev_err(ctrl->ctrl.device,
+ dev_err(nctrl->device,
"failed to allocate queue %d crypto\n", qid);
goto err_sock;
}
@@ -1344,13 +1359,13 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
goto err_crypto;
}
- dev_dbg(ctrl->ctrl.device, "connecting queue %d\n",
+ dev_dbg(nctrl->device, "connecting queue %d\n",
nvme_tcp_queue_id(queue));
ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr,
sizeof(ctrl->addr), 0);
if (ret) {
- dev_err(ctrl->ctrl.device,
+ dev_err(nctrl->device,
"failed to connect socket: %d\n", ret);
goto err_rcv_pdu;
}
@@ -1371,6 +1386,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
queue->sock->sk->sk_state_change = nvme_tcp_state_change;
queue->sock->sk->sk_write_space = nvme_tcp_write_space;
+ queue->sock->sk->sk_ll_usec = 1;
write_unlock_bh(&queue->sock->sk->sk_callback_lock);
return 0;
@@ -1469,7 +1485,7 @@ static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
set->driver_data = ctrl;
set->nr_hw_queues = nctrl->queue_count - 1;
set->timeout = NVME_IO_TIMEOUT;
- set->nr_maps = 2 /* default + read */;
+ set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
}
ret = blk_mq_alloc_tag_set(set);
@@ -1568,6 +1584,7 @@ static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus());
nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus());
+ nr_io_queues += min(ctrl->opts->nr_poll_queues, num_online_cpus());
return nr_io_queues;
}
@@ -1599,6 +1616,12 @@ static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl,
min(opts->nr_io_queues, nr_io_queues);
nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
}
+
+ if (opts->nr_poll_queues && nr_io_queues) {
+ /* map dedicated poll queues only if we have queues left */
+ ctrl->io_queues[HCTX_TYPE_POLL] =
+ min(opts->nr_poll_queues, nr_io_queues);
+ }
}
static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
@@ -1680,6 +1703,7 @@ static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
nvme_tcp_stop_queue(ctrl, 0);
if (remove) {
blk_cleanup_queue(ctrl->admin_q);
+ blk_cleanup_queue(ctrl->fabrics_q);
blk_mq_free_tag_set(ctrl->admin_tagset);
}
nvme_tcp_free_admin_queue(ctrl);
@@ -1700,10 +1724,16 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
goto out_free_queue;
}
+ ctrl->fabrics_q = blk_mq_init_queue(ctrl->admin_tagset);
+ if (IS_ERR(ctrl->fabrics_q)) {
+ error = PTR_ERR(ctrl->fabrics_q);
+ goto out_free_tagset;
+ }
+
ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset);
if (IS_ERR(ctrl->admin_q)) {
error = PTR_ERR(ctrl->admin_q);
- goto out_free_tagset;
+ goto out_cleanup_fabrics_q;
}
}
@@ -1711,19 +1741,12 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
if (error)
goto out_cleanup_queue;
- error = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
- if (error) {
- dev_err(ctrl->device,
- "prop_get NVME_REG_CAP failed\n");
- goto out_stop_queue;
- }
-
- ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
-
- error = nvme_enable_ctrl(ctrl, ctrl->cap);
+ error = nvme_enable_ctrl(ctrl);
if (error)
goto out_stop_queue;
+ blk_mq_unquiesce_queue(ctrl->admin_q);
+
error = nvme_init_identify(ctrl);
if (error)
goto out_stop_queue;
@@ -1735,6 +1758,9 @@ out_stop_queue:
out_cleanup_queue:
if (new)
blk_cleanup_queue(ctrl->admin_q);
+out_cleanup_fabrics_q:
+ if (new)
+ blk_cleanup_queue(ctrl->fabrics_q);
out_free_tagset:
if (new)
blk_mq_free_tag_set(ctrl->admin_tagset);
@@ -1748,10 +1774,13 @@ static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
{
blk_mq_quiesce_queue(ctrl->admin_q);
nvme_tcp_stop_queue(ctrl, 0);
- if (ctrl->admin_tagset)
+ if (ctrl->admin_tagset) {
blk_mq_tagset_busy_iter(ctrl->admin_tagset,
nvme_cancel_request, ctrl);
- blk_mq_unquiesce_queue(ctrl->admin_q);
+ blk_mq_tagset_wait_completed_request(ctrl->admin_tagset);
+ }
+ if (remove)
+ blk_mq_unquiesce_queue(ctrl->admin_q);
nvme_tcp_destroy_admin_queue(ctrl, remove);
}
@@ -1762,9 +1791,11 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
return;
nvme_stop_queues(ctrl);
nvme_tcp_stop_io_queues(ctrl);
- if (ctrl->tagset)
+ if (ctrl->tagset) {
blk_mq_tagset_busy_iter(ctrl->tagset,
nvme_cancel_request, ctrl);
+ blk_mq_tagset_wait_completed_request(ctrl->tagset);
+ }
if (remove)
nvme_start_queues(ctrl);
nvme_tcp_destroy_io_queues(ctrl, remove);
@@ -1793,7 +1824,7 @@ static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
{
struct nvmf_ctrl_options *opts = ctrl->opts;
- int ret = -EINVAL;
+ int ret;
ret = nvme_tcp_configure_admin_queue(ctrl, new);
if (ret)
@@ -1876,6 +1907,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
/* unquiesce to fail fast pending requests */
nvme_start_queues(ctrl);
nvme_tcp_teardown_admin_queue(ctrl, false);
+ blk_mq_unquiesce_queue(ctrl->admin_q);
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
/* state change failure is ok if we're in DELETING state */
@@ -1892,10 +1924,11 @@ static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
nvme_tcp_teardown_io_queues(ctrl, shutdown);
+ blk_mq_quiesce_queue(ctrl->admin_q);
if (shutdown)
nvme_shutdown_ctrl(ctrl);
else
- nvme_disable_ctrl(ctrl, ctrl->cap);
+ nvme_disable_ctrl(ctrl);
nvme_tcp_teardown_admin_queue(ctrl, shutdown);
}
@@ -2151,14 +2184,36 @@ static int nvme_tcp_map_queues(struct blk_mq_tag_set *set)
blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);
+ if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
+ /* map dedicated poll queues only if we have queues left */
+ set->map[HCTX_TYPE_POLL].nr_queues =
+ ctrl->io_queues[HCTX_TYPE_POLL];
+ set->map[HCTX_TYPE_POLL].queue_offset =
+ ctrl->io_queues[HCTX_TYPE_DEFAULT] +
+ ctrl->io_queues[HCTX_TYPE_READ];
+ blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
+ }
+
dev_info(ctrl->ctrl.device,
- "mapped %d/%d default/read queues.\n",
+ "mapped %d/%d/%d default/read/poll queues.\n",
ctrl->io_queues[HCTX_TYPE_DEFAULT],
- ctrl->io_queues[HCTX_TYPE_READ]);
+ ctrl->io_queues[HCTX_TYPE_READ],
+ ctrl->io_queues[HCTX_TYPE_POLL]);
return 0;
}
+static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx)
+{
+ struct nvme_tcp_queue *queue = hctx->driver_data;
+ struct sock *sk = queue->sock->sk;
+
+ if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue))
+ sk_busy_loop(sk, true);
+ nvme_tcp_try_recv(queue);
+ return queue->nr_cqe;
+}
+
static struct blk_mq_ops nvme_tcp_mq_ops = {
.queue_rq = nvme_tcp_queue_rq,
.complete = nvme_complete_rq,
@@ -2167,6 +2222,7 @@ static struct blk_mq_ops nvme_tcp_mq_ops = {
.init_hctx = nvme_tcp_init_hctx,
.timeout = nvme_tcp_timeout,
.map_queues = nvme_tcp_map_queues,
+ .poll = nvme_tcp_poll,
};
static struct blk_mq_ops nvme_tcp_admin_mq_ops = {
@@ -2220,7 +2276,8 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
INIT_LIST_HEAD(&ctrl->list);
ctrl->ctrl.opts = opts;
- ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + 1;
+ ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
+ opts->nr_poll_queues + 1;
ctrl->ctrl.sqsize = opts->queue_size - 1;
ctrl->ctrl.kato = opts->kato;
@@ -2314,7 +2371,8 @@ static struct nvmf_transport_ops nvme_tcp_transport = {
.allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
- NVMF_OPT_NR_WRITE_QUEUES,
+ NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
+ NVMF_OPT_TOS,
.create_ctrl = nvme_tcp_create_ctrl,
};
diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c
index 9778eb0406b3..5c3cb6928f3c 100644
--- a/drivers/nvme/host/trace.c
+++ b/drivers/nvme/host/trace.c
@@ -86,6 +86,22 @@ static const char *nvme_trace_admin_get_features(struct trace_seq *p,
return ret;
}
+static const char *nvme_trace_get_lba_status(struct trace_seq *p,
+ u8 *cdw10)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+ u64 slba = get_unaligned_le64(cdw10);
+ u32 mndw = get_unaligned_le32(cdw10 + 8);
+ u16 rl = get_unaligned_le16(cdw10 + 12);
+ u8 atype = cdw10[15];
+
+ trace_seq_printf(p, "slba=0x%llx, mndw=0x%x, rl=0x%x, atype=%u",
+ slba, mndw, rl, atype);
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+
static const char *nvme_trace_read_write(struct trace_seq *p, u8 *cdw10)
{
const char *ret = trace_seq_buffer_ptr(p);
@@ -141,6 +157,8 @@ const char *nvme_trace_parse_admin_cmd(struct trace_seq *p,
return nvme_trace_admin_identify(p, cdw10);
case nvme_admin_get_features:
return nvme_trace_admin_get_features(p, cdw10);
+ case nvme_admin_get_lba_status:
+ return nvme_trace_get_lba_status(p, cdw10);
default:
return nvme_trace_common(p, cdw10);
}