From 5c11f7d9f843bdd24cd29b95401938bc3f168070 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Mon, 21 Dec 2020 00:03:39 -0800
Subject: nvme-tcp: Fix possible race of io_work and direct send

We may send a request (with or without its data) from two paths:

  1. From our I/O context nvme_tcp_io_work which is triggered from:
    - queue_rq
    - r2t reception
    - socket data_ready and write_space callbacks
  2. Directly from queue_rq if the send_list is empty (because we want to
     save the context switch associated with scheduling our io_work).

However, given that now we have the send_mutex, we may run into a race
condition where none of these contexts will send the pending payload to
the controller. Both io_work send path and queue_rq send path
opportunistically attempt to acquire the send_mutex however queue_rq only
attempts to send a single request, and if io_work context fails to
acquire the send_mutex it will complete without rescheduling itself.

The race can trigger with the following sequence:

  1. queue_rq sends request (no incapsule data) and blocks
  2. RX path receives r2t - prepares data PDU to send, adds h2cdata PDU
     to the send_list and schedules io_work
  3. io_work triggers and cannot acquire the send_mutex - because of (1),
     ends without self rescheduling
  4. queue_rq completes the send, and completes

==> no context will send the h2cdata - timeout.

Fix this by having queue_rq sending as much as it can from the send_list
such that if it still has any left, its because the socket buffer is
full and the socket write_space callback will trigger, thus guaranteeing
that a context will be scheduled to send the h2cdata PDU.

Fixes: db5ad6b7f8cd ("nvme-tcp: try to send request in queue_rq context")
Reported-by: Potnuri Bharat Teja <bharat@chelsio.com>
Reported-by: Samuel Jones <sjones@kalrayinc.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Tested-by: Potnuri Bharat Teja <bharat@chelsio.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/tcp.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'drivers/nvme/host/tcp.c')

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 1ba659927442..979ee31b8dd1 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -262,6 +262,16 @@ static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
 	}
 }
 
+static inline void nvme_tcp_send_all(struct nvme_tcp_queue *queue)
+{
+	int ret;
+
+	/* drain the send queue as much as we can... */
+	do {
+		ret = nvme_tcp_try_send(queue);
+	} while (ret > 0);
+}
+
 static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
 		bool sync, bool last)
 {
@@ -279,7 +289,7 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
 	if (queue->io_cpu == smp_processor_id() &&
 	    sync && empty && mutex_trylock(&queue->send_mutex)) {
 		queue->more_requests = !last;
-		nvme_tcp_try_send(queue);
+		nvme_tcp_send_all(queue);
 		queue->more_requests = false;
 		mutex_unlock(&queue->send_mutex);
 	} else if (last) {
-- 
cgit v1.2.3


From ada831772188192243f9ea437c46e37e97a5975d Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Wed, 13 Jan 2021 14:03:04 -0800
Subject: nvme-tcp: Fix warning with CONFIG_DEBUG_PREEMPT

We shouldn't call smp_processor_id() in a preemptible
context, but this is advisory at best, so instead
call __smp_processor_id().

Fixes: db5ad6b7f8cd ("nvme-tcp: try to send request in queue_rq context")
Reported-by: Or Gerlitz <gerlitz.or@gmail.com>
Reported-by: Yi Zhang <yi.zhang@redhat.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/tcp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/nvme/host/tcp.c')

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 979ee31b8dd1..b2e0865785ef 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -286,7 +286,7 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
 	 * directly, otherwise queue io_work. Also, only do that if we
 	 * are on the same cpu, so we don't introduce contention.
 	 */
-	if (queue->io_cpu == smp_processor_id() &&
+	if (queue->io_cpu == __smp_processor_id() &&
 	    sync && empty && mutex_trylock(&queue->send_mutex)) {
 		queue->more_requests = !last;
 		nvme_tcp_send_all(queue);
-- 
cgit v1.2.3


From ca1ff67d0fb14f39cf0cc5102b1fbcc3b14f6fb9 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Wed, 13 Jan 2021 13:56:57 -0800
Subject: nvme-tcp: fix possible data corruption with bio merges

When a bio merges, we can get a request that spans multiple
bios, and the overall request payload size is the sum of
all bios. When we calculate how much we need to send
from the existing bio (and bvec), we did not take into
account the iov_iter byte count cap.

Since multipage bvecs support, bvecs can split in the middle
which means that when we account for the last bvec send we
should also take the iov_iter byte count cap as it might be
lower than the last bvec size.

Reported-by: Hao Wang <pkuwangh@gmail.com>
Fixes: 3f2304f8c6d6 ("nvme-tcp: add NVMe over TCP host driver")
Tested-by: Hao Wang <pkuwangh@gmail.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/tcp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/nvme/host/tcp.c')

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index b2e0865785ef..216619926563 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -201,7 +201,7 @@ static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req)
 
 static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req)
 {
-	return min_t(size_t, req->iter.bvec->bv_len - req->iter.iov_offset,
+	return min_t(size_t, iov_iter_single_seg_count(&req->iter),
 			req->pdu_len - req->pdu_sent);
 }
 
-- 
cgit v1.2.3


From 9ebbfe495ecd2e51bc92ac21ed5817c3b9e223ce Mon Sep 17 00:00:00 2001
From: Chao Leng <lengchao@huawei.com>
Date: Thu, 14 Jan 2021 17:09:26 +0800
Subject: nvme-tcp: avoid request double completion for concurrent
 nvme_tcp_timeout

Each name space has a request queue, if complete request long time,
multi request queues may have time out requests at the same time,
nvme_tcp_timeout will execute concurrently. Multi requests in different
request queues may be queued in the same tcp queue, multi
nvme_tcp_timeout may call nvme_tcp_stop_queue at the same time.
The first nvme_tcp_stop_queue will clear NVME_TCP_Q_LIVE and continue
stopping the tcp queue(cancel io_work), but the others check
NVME_TCP_Q_LIVE is already cleared, and then directly complete the
requests, complete request before the io work is completely canceled may
lead to a use-after-free condition.
Add a multex lock to serialize nvme_tcp_stop_queue.

Signed-off-by: Chao Leng <lengchao@huawei.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/tcp.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'drivers/nvme/host/tcp.c')

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 216619926563..881d28eb15e9 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -76,6 +76,7 @@ struct nvme_tcp_queue {
 	struct work_struct	io_work;
 	int			io_cpu;
 
+	struct mutex		queue_lock;
 	struct mutex		send_mutex;
 	struct llist_head	req_list;
 	struct list_head	send_list;
@@ -1219,6 +1220,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
 
 	sock_release(queue->sock);
 	kfree(queue->pdu);
+	mutex_destroy(&queue->queue_lock);
 }
 
 static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
@@ -1380,6 +1382,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 	struct nvme_tcp_queue *queue = &ctrl->queues[qid];
 	int ret, rcv_pdu_size;
 
+	mutex_init(&queue->queue_lock);
 	queue->ctrl = ctrl;
 	init_llist_head(&queue->req_list);
 	INIT_LIST_HEAD(&queue->send_list);
@@ -1398,7 +1401,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 	if (ret) {
 		dev_err(nctrl->device,
 			"failed to create socket: %d\n", ret);
-		return ret;
+		goto err_destroy_mutex;
 	}
 
 	/* Single syn retry */
@@ -1507,6 +1510,8 @@ err_crypto:
 err_sock:
 	sock_release(queue->sock);
 	queue->sock = NULL;
+err_destroy_mutex:
+	mutex_destroy(&queue->queue_lock);
 	return ret;
 }
 
@@ -1534,9 +1539,10 @@ static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
 	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
 	struct nvme_tcp_queue *queue = &ctrl->queues[qid];
 
-	if (!test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
-		return;
-	__nvme_tcp_stop_queue(queue);
+	mutex_lock(&queue->queue_lock);
+	if (test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
+		__nvme_tcp_stop_queue(queue);
+	mutex_unlock(&queue->queue_lock);
 }
 
 static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
-- 
cgit v1.2.3