* [PATCH 1/5] nvme-tcp: open-code nvme_tcp_queue_request() for R2T
2025-03-27 15:48 [PATCHv2 0/5] nvme-tcp: fixup I/O stall on congested sockets Hannes Reinecke
@ 2025-03-27 15:48 ` Hannes Reinecke
2025-03-27 15:48 ` [PATCH 2/5] nvme-tcp: sanitize request list handling Hannes Reinecke
` (3 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: Hannes Reinecke @ 2025-03-27 15:48 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Sagi Grimberg, Keith Busch, linux-nvme, Hannes Reinecke
When handling an R2T PDU we short-circuit nvme_tcp_queue_request()
as we should not attempt to send consecutive PDUs. So open-code
nvme_tcp_queue_request() for R2T and drop the last argument.
Signed-off-by: Hannes Reinecke <hare@kernel.org>
---
drivers/nvme/host/tcp.c | 12 +++++++-----
1 file changed, 7 insertions(+), 5 deletions(-)
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 327f3f2f5399..b2f2aef2e2f2 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -404,7 +404,7 @@ static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
}
static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
- bool sync, bool last)
+ bool last)
{
struct nvme_tcp_queue *queue = req->queue;
bool empty;
@@ -418,7 +418,7 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
* are on the same cpu, so we don't introduce contention.
*/
if (queue->io_cpu == raw_smp_processor_id() &&
- sync && empty && mutex_trylock(&queue->send_mutex)) {
+ empty && mutex_trylock(&queue->send_mutex)) {
nvme_tcp_send_all(queue);
mutex_unlock(&queue->send_mutex);
}
@@ -771,7 +771,9 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
req->ttag = pdu->ttag;
nvme_tcp_setup_h2c_data_pdu(req);
- nvme_tcp_queue_request(req, false, true);
+
+ llist_add(&req->lentry, &queue->req_list);
+ queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
return 0;
}
@@ -2557,7 +2559,7 @@ static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
ctrl->async_req.curr_bio = NULL;
ctrl->async_req.data_len = 0;
- nvme_tcp_queue_request(&ctrl->async_req, true, true);
+ nvme_tcp_queue_request(&ctrl->async_req, true);
}
static void nvme_tcp_complete_timed_out(struct request *rq)
@@ -2709,7 +2711,7 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
nvme_start_request(rq);
- nvme_tcp_queue_request(req, true, bd->last);
+ nvme_tcp_queue_request(req, bd->last);
return BLK_STS_OK;
}
--
2.35.3
^ permalink raw reply related [flat|nested] 6+ messages in thread* [PATCH 2/5] nvme-tcp: sanitize request list handling
2025-03-27 15:48 [PATCHv2 0/5] nvme-tcp: fixup I/O stall on congested sockets Hannes Reinecke
2025-03-27 15:48 ` [PATCH 1/5] nvme-tcp: open-code nvme_tcp_queue_request() for R2T Hannes Reinecke
@ 2025-03-27 15:48 ` Hannes Reinecke
2025-03-27 15:48 ` [PATCH 3/5] nvme-tcp: fix I/O stalls on congested sockets Hannes Reinecke
` (2 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: Hannes Reinecke @ 2025-03-27 15:48 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Sagi Grimberg, Keith Busch, linux-nvme, Hannes Reinecke
Validate the request in nvme_tcp_handle_r2t() to ensure it's not
part of any list, otherwise a malicious R2T PDU might inject a
loop in request list processing.
Signed-off-by: Hannes Reinecke <hare@kernel.org>
---
drivers/nvme/host/tcp.c | 17 ++++++++++++++++-
1 file changed, 16 insertions(+), 1 deletion(-)
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index b2f2aef2e2f2..1a319cb86453 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -454,6 +454,7 @@ nvme_tcp_fetch_request(struct nvme_tcp_queue *queue)
}
list_del(&req->entry);
+ init_llist_node(&req->lentry);
return req;
}
@@ -561,6 +562,8 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
req->queue = queue;
nvme_req(rq)->ctrl = &ctrl->ctrl;
nvme_req(rq)->cmd = &pdu->cmd;
+ init_llist_node(&req->lentry);
+ INIT_LIST_HEAD(&req->entry);
return 0;
}
@@ -765,6 +768,15 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
return -EPROTO;
}
+ if (queue->request == req ||
+ llist_on_list(&req->lentry) ||
+ !list_empty(&req->entry)) {
+ dev_err(queue->ctrl->ctrl.device,
+ "req %d unexpected r2t while processing request\n",
+ rq->tag);
+ return -EPROTO;
+ }
+
req->pdu_len = 0;
req->h2cdata_left = r2t_length;
req->h2cdata_offset = r2t_offset;
@@ -773,7 +785,8 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
nvme_tcp_setup_h2c_data_pdu(req);
llist_add(&req->lentry, &queue->req_list);
- queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+ if (list_empty(&queue->send_list))
+ queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
return 0;
}
@@ -2558,6 +2571,8 @@ static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
ctrl->async_req.offset = 0;
ctrl->async_req.curr_bio = NULL;
ctrl->async_req.data_len = 0;
+ init_llist_node(&ctrl->async_req.lentry);
+ INIT_LIST_HEAD(&ctrl->async_req.entry);
nvme_tcp_queue_request(&ctrl->async_req, true);
}
--
2.35.3
^ permalink raw reply related [flat|nested] 6+ messages in thread* [PATCH 3/5] nvme-tcp: fix I/O stalls on congested sockets
2025-03-27 15:48 [PATCHv2 0/5] nvme-tcp: fixup I/O stall on congested sockets Hannes Reinecke
2025-03-27 15:48 ` [PATCH 1/5] nvme-tcp: open-code nvme_tcp_queue_request() for R2T Hannes Reinecke
2025-03-27 15:48 ` [PATCH 2/5] nvme-tcp: sanitize request list handling Hannes Reinecke
@ 2025-03-27 15:48 ` Hannes Reinecke
2025-03-27 15:48 ` [PATCH 4/5] nvme: requeue namespace scan on missed AENs Hannes Reinecke
2025-03-27 15:48 ` [PATCH 5/5] nvme: re-read ANA log page after ns scan completes Hannes Reinecke
4 siblings, 0 replies; 6+ messages in thread
From: Hannes Reinecke @ 2025-03-27 15:48 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Sagi Grimberg, Keith Busch, linux-nvme, Hannes Reinecke,
Chris Leech
When the socket is busy processing nvme_tcp_try_recv() might
return -EAGAIN, but this doesn't automatically imply that
the sending side is blocked, too.
So check if there are pending requests once nvme_tcp_try_recv()
returns -EAGAIN and continue with the sending loop to avoid
I/O stalls.
Acked-by: Chris Leech <cleech@redhat.com>
Signed-off-by: Hannes Reinecke <hare@kernel.org>
---
drivers/nvme/host/tcp.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 1a319cb86453..87f1d7a4ea06 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1389,9 +1389,12 @@ static void nvme_tcp_io_work(struct work_struct *w)
result = nvme_tcp_try_recv(queue);
if (result > 0)
pending = true;
- else if (unlikely(result < 0))
+ else if (unlikely(result < 0) && result != -EAGAIN)
return;
+ if (nvme_tcp_queue_has_pending(queue))
+ pending = true;
+
if (!pending || !queue->rd_enabled)
return;
--
2.35.3
^ permalink raw reply related [flat|nested] 6+ messages in thread
* [PATCH 4/5] nvme: requeue namespace scan on missed AENs
2025-03-27 15:48 [PATCHv2 0/5] nvme-tcp: fixup I/O stall on congested sockets Hannes Reinecke
` (2 preceding siblings ...)
2025-03-27 15:48 ` [PATCH 3/5] nvme-tcp: fix I/O stalls on congested sockets Hannes Reinecke
@ 2025-03-27 15:48 ` Hannes Reinecke
2025-03-27 15:48 ` [PATCH 5/5] nvme: re-read ANA log page after ns scan completes Hannes Reinecke
4 siblings, 0 replies; 6+ messages in thread
From: Hannes Reinecke @ 2025-03-27 15:48 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Sagi Grimberg, Keith Busch, linux-nvme, Hannes Reinecke,
Hannes Reinecke
From: Hannes Reinecke <hare@suse.de>
Scanning for namespaces can take some time, so if the target is
reconfigured while the scan is running we will miss the AEN.
So check if the NVME_AER_NOTICE_NS_CHANGED bit is set once the scan is
finished, and requeue scanning to pick up any missed changes.
Signed-off-by: Hannes Reinecke <hare@kernel.org>
---
drivers/nvme/host/core.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 8359d0aa0e44..70f9c2d2b113 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4292,6 +4292,10 @@ static void nvme_scan_work(struct work_struct *work)
nvme_scan_ns_sequential(ctrl);
}
mutex_unlock(&ctrl->scan_lock);
+
+ /* Requeue if we have missed AENs */
+ if (test_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events))
+ nvme_queue_scan(ctrl);
}
/*
--
2.35.3
^ permalink raw reply related [flat|nested] 6+ messages in thread* [PATCH 5/5] nvme: re-read ANA log page after ns scan completes
2025-03-27 15:48 [PATCHv2 0/5] nvme-tcp: fixup I/O stall on congested sockets Hannes Reinecke
` (3 preceding siblings ...)
2025-03-27 15:48 ` [PATCH 4/5] nvme: requeue namespace scan on missed AENs Hannes Reinecke
@ 2025-03-27 15:48 ` Hannes Reinecke
4 siblings, 0 replies; 6+ messages in thread
From: Hannes Reinecke @ 2025-03-27 15:48 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Sagi Grimberg, Keith Busch, linux-nvme, Hannes Reinecke,
Hannes Reinecke
From: Hannes Reinecke <hare@suse.de>
When scanning for new namespaces we might have missed an ANA AEN,
so we should always re-read the ANA log page after scanning to ensure
we don't miss updates there.
Signed-off-by: Hannes Reinecke <hare@kernel.org>
---
drivers/nvme/host/core.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 70f9c2d2b113..161dd2b49865 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4296,6 +4296,9 @@ static void nvme_scan_work(struct work_struct *work)
/* Requeue if we have missed AENs */
if (test_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events))
nvme_queue_scan(ctrl);
+ else
+ /* Re-read the ANA log page to not miss updates */
+ queue_work(nvme_wq, &ctrl->ana_work);
}
/*
--
2.35.3
^ permalink raw reply related [flat|nested] 6+ messages in thread