[PATCH RFC] nvme-tcp: Implement recvmsg() receive flow

Linux-NVME Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH RFC] nvme-tcp: Implement recvmsg() receive flow
@ 2025-09-12 11:58 Hannes Reinecke
  2025-09-22 17:41 ` Christoph Hellwig
  2026-02-25 10:56 ` Alistair Francis
  0 siblings, 2 replies; 10+ messages in thread
From: Hannes Reinecke @ 2025-09-12 11:58 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Keith Busch, Sagi Grimberg, linux-nvme, Alistair Francis,
	Hannes Reinecke

Switch to use recvmsg() so that we get access to TLS control
messages eg for handling TLS KeyUpdate.

Signed-off-by: Hannes Reinecke <hare@kernel.org>
---
 drivers/nvme/host/tcp.c | 204 ++++++++++++++++++++++------------------
 1 file changed, 111 insertions(+), 93 deletions(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index c0fe8cfb7229..9ef1d4aea838 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -17,6 +17,7 @@
 #include <net/tls_prot.h>
 #include <net/handshake.h>
 #include <linux/blk-mq.h>
+#include <linux/iov_iter.h>
 #include <net/busy_poll.h>
 #include <trace/events/sock.h>
 
@@ -476,6 +477,28 @@ static inline void nvme_tcp_ddgst_update(u32 *crcp,
 	}
 }
 
+static size_t nvme_tcp_ddgst_step(void *iter_base, size_t progress, size_t len,
+				  void *priv, void *priv2)
+{
+	u32 *crcp = priv;
+
+	*crcp = crc32c(*crcp, iter_base, len);
+        return 0;
+}
+
+static int nvme_tcp_ddgst_calc(struct nvme_tcp_request *req, u32 *crcp,
+			       size_t maxsize)
+{
+	struct iov_iter tmp = req->iter;
+	int err = 0;
+
+	tmp.count = maxsize;
+	if (iterate_and_advance_kernel(&tmp, maxsize, crcp, &err,
+				       nvme_tcp_ddgst_step) != maxsize)
+		return err;
+	return 0;
+}
+
 static inline __le32 nvme_tcp_ddgst_final(u32 crc)
 {
 	return cpu_to_le32(~crc);
@@ -827,23 +850,26 @@ static void nvme_tcp_handle_c2h_term(struct nvme_tcp_queue *queue,
 		"Received C2HTermReq (FES = %s)\n", msg);
 }
 
-static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
-		unsigned int *offset, size_t *len)
+static int nvme_tcp_recvmsg_pdu(struct nvme_tcp_queue *queue)
 {
-	struct nvme_tcp_hdr *hdr;
 	char *pdu = queue->pdu;
-	size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
+	struct msghdr msg = {
+		.msg_flags = MSG_DONTWAIT,
+	};
+	struct kvec iov = {
+		.iov_base = pdu + queue->pdu_offset,
+		.iov_len = queue->pdu_remaining,
+	};
+	struct nvme_tcp_hdr *hdr;
 	int ret;
 
-	ret = skb_copy_bits(skb, *offset,
-		&pdu[queue->pdu_offset], rcv_len);
-	if (unlikely(ret))
+	ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
+			     iov.iov_len, msg.msg_flags);
+	if (ret <= 0)
 		return ret;
 
-	queue->pdu_remaining -= rcv_len;
-	queue->pdu_offset += rcv_len;
-	*offset += rcv_len;
-	*len -= rcv_len;
+	queue->pdu_remaining -= ret;
+	queue->pdu_offset += ret;
 	if (queue->pdu_remaining)
 		return 0;
 
@@ -907,20 +933,19 @@ static inline void nvme_tcp_end_request(struct request *rq, u16 status)
 		nvme_complete_rq(rq);
 }
 
-static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
-			      unsigned int *offset, size_t *len)
+static int nvme_tcp_recvmsg_data(struct nvme_tcp_queue *queue)
 {
 	struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
 	struct request *rq =
 		nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
 	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
 
-	while (true) {
-		int recv_len, ret;
+	if (nvme_tcp_recv_state(queue) != NVME_TCP_RECV_DATA)
+		return 0;
 
-		recv_len = min_t(size_t, *len, queue->data_remaining);
-		if (!recv_len)
-			break;
+	while (queue->data_remaining) {
+		struct msghdr msg;
+		int ret;
 
 		if (!iov_iter_count(&req->iter)) {
 			req->curr_bio = req->curr_bio->bi_next;
@@ -940,25 +965,22 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
 		}
 
 		/* we can read only from what is left in this bio */
-		recv_len = min_t(size_t, recv_len,
-				iov_iter_count(&req->iter));
+		memset(&msg, 0, sizeof(msg));
+		msg.msg_iter = req->iter;
+		msg.msg_flags = MSG_DONTWAIT;
 
-		if (queue->data_digest)
-			ret = skb_copy_and_crc32c_datagram_iter(skb, *offset,
-				&req->iter, recv_len, &queue->rcv_crc);
-		else
-			ret = skb_copy_datagram_iter(skb, *offset,
-					&req->iter, recv_len);
-		if (ret) {
+		ret = sock_recvmsg(queue->sock, &msg, msg.msg_flags);
+		if (ret < 0) {
 			dev_err(queue->ctrl->ctrl.device,
-				"queue %d failed to copy request %#x data",
+				"queue %d failed to receive request %#x data",
 				nvme_tcp_queue_id(queue), rq->tag);
 			return ret;
 		}
-
-		*len -= recv_len;
-		*offset += recv_len;
-		queue->data_remaining -= recv_len;
+		if (queue->data_digest)
+			nvme_tcp_ddgst_calc(req, &queue->rcv_crc, ret);
+		queue->data_remaining -= ret;
+		if (queue->data_remaining)
+			nvme_tcp_advance_req(req, ret);
 	}
 
 	if (!queue->data_remaining) {
@@ -968,7 +990,7 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
 		} else {
 			if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
 				nvme_tcp_end_request(rq,
-						le16_to_cpu(req->status));
+						     le16_to_cpu(req->status));
 				queue->nr_cqe++;
 			}
 			nvme_tcp_init_recv_ctx(queue);
@@ -978,24 +1000,9 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
 	return 0;
 }
 
-static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
-		struct sk_buff *skb, unsigned int *offset, size_t *len)
+static int __nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue)
 {
 	struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
-	char *ddgst = (char *)&queue->recv_ddgst;
-	size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
-	off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
-	int ret;
-
-	ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
-	if (unlikely(ret))
-		return ret;
-
-	queue->ddgst_remaining -= recv_len;
-	*offset += recv_len;
-	*len -= recv_len;
-	if (queue->ddgst_remaining)
-		return 0;
 
 	if (queue->recv_ddgst != queue->exp_ddgst) {
 		struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue),
@@ -1023,40 +1030,32 @@ static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
 	return 0;
 }
 
-static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
-			     unsigned int offset, size_t len)
+static int nvme_tcp_recvmsg_ddgst(struct nvme_tcp_queue *queue)
 {
-	struct nvme_tcp_queue *queue = desc->arg.data;
-	size_t consumed = len;
-	int result;
+	char *ddgst = (char *)&queue->recv_ddgst;
+	off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
+	struct msghdr msg = {
+		.msg_flags = MSG_WAITALL,
+	};
+	struct kvec iov = {
+		.iov_base = (u8 *)ddgst + off,
+		.iov_len = queue->ddgst_remaining,
+	};
+	int ret;
 
-	if (unlikely(!queue->rd_enabled))
-		return -EFAULT;
+	if (nvme_tcp_recv_state(queue) != NVME_TCP_RECV_DDGST)
+		return 0;
 
-	while (len) {
-		switch (nvme_tcp_recv_state(queue)) {
-		case NVME_TCP_RECV_PDU:
-			result = nvme_tcp_recv_pdu(queue, skb, &offset, &len);
-			break;
-		case NVME_TCP_RECV_DATA:
-			result = nvme_tcp_recv_data(queue, skb, &offset, &len);
-			break;
-		case NVME_TCP_RECV_DDGST:
-			result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len);
-			break;
-		default:
-			result = -EFAULT;
-		}
-		if (result) {
-			dev_err(queue->ctrl->ctrl.device,
-				"receive failed:  %d\n", result);
-			queue->rd_enabled = false;
-			nvme_tcp_error_recovery(&queue->ctrl->ctrl);
-			return result;
-		}
-	}
+	ret = kernel_recvmsg(queue->sock, &msg, &iov, 1, iov.iov_len,
+			     msg.msg_flags);
+	if (ret <= 0)
+		return ret;
+
+	queue->ddgst_remaining -= ret;
+	if (queue->ddgst_remaining)
+		return 0;
 
-	return consumed;
+	return __nvme_tcp_recv_ddgst(queue);
 }
 
 static void nvme_tcp_data_ready(struct sock *sk)
@@ -1353,20 +1352,39 @@ static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
 	return ret;
 }
 
-static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
+static int nvme_tcp_try_recvmsg(struct nvme_tcp_queue *queue)
 {
-	struct socket *sock = queue->sock;
-	struct sock *sk = sock->sk;
-	read_descriptor_t rd_desc;
-	int consumed;
+	int result;
+	int nr_cqe = queue->nr_cqe;
+
+	if (unlikely(!queue->rd_enabled))
+		return -EFAULT;
+
+	do {
+		switch (nvme_tcp_recv_state(queue)) {
+		case NVME_TCP_RECV_PDU:
+			result = nvme_tcp_recvmsg_pdu(queue);
+			break;
+		case NVME_TCP_RECV_DATA:
+			result = nvme_tcp_recvmsg_data(queue);
+			break;
+		case NVME_TCP_RECV_DDGST:
+			result = nvme_tcp_recvmsg_ddgst(queue);
+			break;
+		default:
+			result = -EFAULT;
+		}
+	} while (result >= 0);
+
+	if (result < 0 && result != -EAGAIN) {
+		dev_err(queue->ctrl->ctrl.device,
+			"receive failed:  %d\n", result);
+		queue->rd_enabled = false;
+		nvme_tcp_error_recovery(&queue->ctrl->ctrl);
+	} else if (result == -EAGAIN)
+		result = 0;
 
-	rd_desc.arg.data = queue;
-	rd_desc.count = 1;
-	lock_sock(sk);
-	queue->nr_cqe = 0;
-	consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
-	release_sock(sk);
-	return consumed == -EAGAIN ? 0 : consumed;
+	return result < 0 ? result : (queue->nr_cqe = nr_cqe);
 }
 
 static void nvme_tcp_io_work(struct work_struct *w)
@@ -1388,7 +1406,7 @@ static void nvme_tcp_io_work(struct work_struct *w)
 				break;
 		}
 
-		result = nvme_tcp_try_recv(queue);
+		result = nvme_tcp_try_recvmsg(queue);
 		if (result > 0)
 			pending = true;
 		else if (unlikely(result < 0))
@@ -2794,7 +2812,7 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
 	set_bit(NVME_TCP_Q_POLLING, &queue->flags);
 	if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
 		sk_busy_loop(sk, true);
-	ret = nvme_tcp_try_recv(queue);
+	ret = nvme_tcp_try_recvmsg(queue);
 	clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
 	return ret < 0 ? ret : queue->nr_cqe;
 }
-- 
2.43.0



^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH RFC] nvme-tcp: Implement recvmsg() receive flow
  2025-09-12 11:58 [PATCH RFC] nvme-tcp: Implement recvmsg() receive flow Hannes Reinecke
@ 2025-09-22 17:41 ` Christoph Hellwig
  2025-09-23  6:30   ` Hannes Reinecke
  2026-02-25 10:56 ` Alistair Francis
  1 sibling, 1 reply; 10+ messages in thread
From: Christoph Hellwig @ 2025-09-22 17:41 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Christoph Hellwig, Keith Busch, Sagi Grimberg, linux-nvme,
	Alistair Francis

On Fri, Sep 12, 2025 at 01:58:29PM +0200, Hannes Reinecke wrote:
> Switch to use recvmsg() so that we get access to TLS control
> messages eg for handling TLS KeyUpdate.

That is a very spare commit message for a huge change.  Why did it not
use recvmsg before?  I'm pretty sure there are some tradeoffs here.

> +static size_t nvme_tcp_ddgst_step(void *iter_base, size_t progress, size_t len,
> +				  void *priv, void *priv2)
> +{
> +	u32 *crcp = priv;
> +
> +	*crcp = crc32c(*crcp, iter_base, len);
> +        return 0;

And fix the whitespace damage while you're at it.



^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH RFC] nvme-tcp: Implement recvmsg() receive flow
  2025-09-22 17:41 ` Christoph Hellwig
@ 2025-09-23  6:30   ` Hannes Reinecke
  0 siblings, 0 replies; 10+ messages in thread
From: Hannes Reinecke @ 2025-09-23  6:30 UTC (permalink / raw)
  To: Christoph Hellwig, Hannes Reinecke
  Cc: Keith Busch, Sagi Grimberg, linux-nvme, Alistair Francis

On 9/22/25 19:41, Christoph Hellwig wrote:
> On Fri, Sep 12, 2025 at 01:58:29PM +0200, Hannes Reinecke wrote:
>> Switch to use recvmsg() so that we get access to TLS control
>> messages eg for handling TLS KeyUpdate.
> 
> That is a very spare commit message for a huge change.  Why did it not
> use recvmsg before?  I'm pretty sure there are some tradeoffs here.
> 
Well, main reason was that the iSCSI driver has been used as a template
for NVMe-TCP, and that one uses ->read_sock().

But technically the reason is that ->read_sock() is driven from the
ingress side; the callback is triggered for each skb received.
Which (theoretically) can drive down latency as the callback is
only invoked when data is to be processed.
Drawback is that we lose access to the control messages, as these
are only extracted by the recvmsg code, so we cannot handle any
specific control messages like TLS alerts or KeyUpdate messages.
And pacing of ->read_sock() invocation gets tricky with TLS,
as the TLS stream is overlaid on TCP segments, so the skbs
received by TCP are not the skbs seen by the ->read_sock()
TLS implementation (stream parser magic).

The recvmsg workflow has the benefit that we're getting full
access to the control messages, so we can handle things like
TLS alerts and KeyUpdate messages. And the recvmsg() call
is driven from the consumer side, so we can get a better
pacing between sendmsg() and recvmsg() calls, which is
particularly important for the NVMe-TCP workflow where
we need to wait for responses before we can continue
with sending more data.

Sadly all performance differences between those two
implementations are completely obscured by some
network buffering effects, so I haven't been able
to come up with a meaningful comparison here.

But I'll improve the description.

>> +static size_t nvme_tcp_ddgst_step(void *iter_base, size_t progress, size_t len,
>> +				  void *priv, void *priv2)
>> +{
>> +	u32 *crcp = priv;
>> +
>> +	*crcp = crc32c(*crcp, iter_base, len);
>> +        return 0;
> 
> And fix the whitespace damage while you're at it.
> 
Sure.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH RFC] nvme-tcp: Implement recvmsg() receive flow
  2025-09-12 11:58 [PATCH RFC] nvme-tcp: Implement recvmsg() receive flow Hannes Reinecke
  2025-09-22 17:41 ` Christoph Hellwig
@ 2026-02-25 10:56 ` Alistair Francis
  2026-02-25 11:41   ` Hannes Reinecke
  1 sibling, 1 reply; 10+ messages in thread
From: Alistair Francis @ 2026-02-25 10:56 UTC (permalink / raw)
  To: hch, hare@kernel.org
  Cc: kbusch@kernel.org, sagi@grimberg.me,
	linux-nvme@lists.infradead.org

On Fri, 2025-09-12 at 13:58 +0200, Hannes Reinecke wrote:
> Switch to use recvmsg() so that we get access to TLS control
> messages eg for handling TLS KeyUpdate.
> 
> Signed-off-by: Hannes Reinecke <hare@kernel.org>
> ---
>  drivers/nvme/host/tcp.c | 204 ++++++++++++++++++++++----------------
> --
>  1 file changed, 111 insertions(+), 93 deletions(-)
> 
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index c0fe8cfb7229..9ef1d4aea838 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -17,6 +17,7 @@
>  #include <net/tls_prot.h>
>  #include <net/handshake.h>
>  #include <linux/blk-mq.h>
> +#include <linux/iov_iter.h>
>  #include <net/busy_poll.h>
>  #include <trace/events/sock.h>
>  
> @@ -476,6 +477,28 @@ static inline void nvme_tcp_ddgst_update(u32
> *crcp,
>  	}
>  }
>  
> +static size_t nvme_tcp_ddgst_step(void *iter_base, size_t progress,
> size_t len,
> +				  void *priv, void *priv2)
> +{
> +	u32 *crcp = priv;
> +
> +	*crcp = crc32c(*crcp, iter_base, len);
> +        return 0;
> +}
> +
> +static int nvme_tcp_ddgst_calc(struct nvme_tcp_request *req, u32
> *crcp,
> +			       size_t maxsize)
> +{
> +	struct iov_iter tmp = req->iter;
> +	int err = 0;
> +
> +	tmp.count = maxsize;
> +	if (iterate_and_advance_kernel(&tmp, maxsize, crcp, &err,
> +				       nvme_tcp_ddgst_step) !=
> maxsize)
> +		return err;
> +	return 0;
> +}
> +
>  static inline __le32 nvme_tcp_ddgst_final(u32 crc)
>  {
>  	return cpu_to_le32(~crc);
> @@ -827,23 +850,26 @@ static void nvme_tcp_handle_c2h_term(struct
> nvme_tcp_queue *queue,
>  		"Received C2HTermReq (FES = %s)\n", msg);
>  }
>  
> -static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct
> sk_buff *skb,
> -		unsigned int *offset, size_t *len)
> +static int nvme_tcp_recvmsg_pdu(struct nvme_tcp_queue *queue)
>  {
> -	struct nvme_tcp_hdr *hdr;
>  	char *pdu = queue->pdu;
> -	size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
> +	struct msghdr msg = {
> +		.msg_flags = MSG_DONTWAIT,
> +	};
> +	struct kvec iov = {
> +		.iov_base = pdu + queue->pdu_offset,
> +		.iov_len = queue->pdu_remaining,
> +	};
> +	struct nvme_tcp_hdr *hdr;
>  	int ret;
>  
> -	ret = skb_copy_bits(skb, *offset,
> -		&pdu[queue->pdu_offset], rcv_len);
> -	if (unlikely(ret))
> +	ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
> +			     iov.iov_len, msg.msg_flags);
> +	if (ret <= 0)
>  		return ret;
>  
> -	queue->pdu_remaining -= rcv_len;
> -	queue->pdu_offset += rcv_len;
> -	*offset += rcv_len;
> -	*len -= rcv_len;
> +	queue->pdu_remaining -= ret;
> +	queue->pdu_offset += ret;
>  	if (queue->pdu_remaining)
>  		return 0;
>  
> @@ -907,20 +933,19 @@ static inline void nvme_tcp_end_request(struct
> request *rq, u16 status)
>  		nvme_complete_rq(rq);
>  }
>  
> -static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct
> sk_buff *skb,
> -			      unsigned int *offset, size_t *len)
> +static int nvme_tcp_recvmsg_data(struct nvme_tcp_queue *queue)
>  {
>  	struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
>  	struct request *rq =
>  		nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu-
> >command_id);
>  	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
>  
> -	while (true) {
> -		int recv_len, ret;
> +	if (nvme_tcp_recv_state(queue) != NVME_TCP_RECV_DATA)
> +		return 0;
>  
> -		recv_len = min_t(size_t, *len, queue-
> >data_remaining);
> -		if (!recv_len)
> -			break;
> +	while (queue->data_remaining) {
> +		struct msghdr msg;
> +		int ret;
>  
>  		if (!iov_iter_count(&req->iter)) {
>  			req->curr_bio = req->curr_bio->bi_next;
> @@ -940,25 +965,22 @@ static int nvme_tcp_recv_data(struct
> nvme_tcp_queue *queue, struct sk_buff *skb,
>  		}
>  
>  		/* we can read only from what is left in this bio */
> -		recv_len = min_t(size_t, recv_len,
> -				iov_iter_count(&req->iter));
> +		memset(&msg, 0, sizeof(msg));
> +		msg.msg_iter = req->iter;
> +		msg.msg_flags = MSG_DONTWAIT;
>  
> -		if (queue->data_digest)
> -			ret = skb_copy_and_crc32c_datagram_iter(skb,
> *offset,
> -				&req->iter, recv_len, &queue-
> >rcv_crc);
> -		else
> -			ret = skb_copy_datagram_iter(skb, *offset,
> -					&req->iter, recv_len);
> -		if (ret) {
> +		ret = sock_recvmsg(queue->sock, &msg,
> msg.msg_flags);

This doesn't work unfortunately.

The problem is what happens if queue->data_remaining is smaller then
iov_iter_count(&req->iter)?

queue->data_remaining is set by the data length in the c2h, while the
length of the request iov_iter_count(&req->iter) is set when the
request is submitted.

If queue->data_remaining ends up being smaller then
iov_iter_count(&req->iter) then we need to read less data then the
actual count of req->iter.

So we need a iov_iter_truncate(), but then we end up overwriting the
data on the next iteration as we have no way to keep...

> +		if (ret < 0) {
>  			dev_err(queue->ctrl->ctrl.device,
> -				"queue %d failed to copy request %#x
> data",
> +				"queue %d failed to receive request
> %#x data",
>  				nvme_tcp_queue_id(queue), rq->tag);
>  			return ret;
>  		}
> -
> -		*len -= recv_len;
> -		*offset += recv_len;

...this offset.

PDU and ddgst in this patch continue using an offset as they use
kernel_recvmsg() instead of sock_recvmsg() directly.

So I think either:

 1. I'm missing something and there is a nice way to do this, please
let me know if that's the case.
 2. We need to convert this sock_recvmsg() to kernel_recvmsg(), with
the overhead of an extra iov_iter (I don't know if that is high or not)
 3. We use Chuck's read_sock with cmsg approach [1]

1:
https://lore.kernel.org/kernel-tls-handshake/20260217222033.1929211-1-cel@kernel.org/T/#t

Alistair

> -		queue->data_remaining -= recv_len;
> +		if (queue->data_digest)
> +			nvme_tcp_ddgst_calc(req, &queue->rcv_crc,
> ret);
> +		queue->data_remaining -= ret;
> +		if (queue->data_remaining)
> +			nvme_tcp_advance_req(req, ret);
>  	}
>  
>  	if (!queue->data_remaining) {
> @@ -968,7 +990,7 @@ static int nvme_tcp_recv_data(struct
> nvme_tcp_queue *queue, struct sk_buff *skb,
>  		} else {
>  			if (pdu->hdr.flags &
> NVME_TCP_F_DATA_SUCCESS) {
>  				nvme_tcp_end_request(rq,
> -						le16_to_cpu(req-
> >status));
> +						    
> le16_to_cpu(req->status));
>  				queue->nr_cqe++;
>  			}
>  			nvme_tcp_init_recv_ctx(queue);
> @@ -978,24 +1000,9 @@ static int nvme_tcp_recv_data(struct
> nvme_tcp_queue *queue, struct sk_buff *skb,
>  	return 0;
>  }
>  
> -static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
> -		struct sk_buff *skb, unsigned int *offset, size_t
> *len)
> +static int __nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue)
>  {
>  	struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
> -	char *ddgst = (char *)&queue->recv_ddgst;
> -	size_t recv_len = min_t(size_t, *len, queue-
> >ddgst_remaining);
> -	off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
> -	int ret;
> -
> -	ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
> -	if (unlikely(ret))
> -		return ret;
> -
> -	queue->ddgst_remaining -= recv_len;
> -	*offset += recv_len;
> -	*len -= recv_len;
> -	if (queue->ddgst_remaining)
> -		return 0;
>  
>  	if (queue->recv_ddgst != queue->exp_ddgst) {
>  		struct request *rq =
> nvme_cid_to_rq(nvme_tcp_tagset(queue),
> @@ -1023,40 +1030,32 @@ static int nvme_tcp_recv_ddgst(struct
> nvme_tcp_queue *queue,
>  	return 0;
>  }
>  
> -static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff
> *skb,
> -			     unsigned int offset, size_t len)
> +static int nvme_tcp_recvmsg_ddgst(struct nvme_tcp_queue *queue)
>  {
> -	struct nvme_tcp_queue *queue = desc->arg.data;
> -	size_t consumed = len;
> -	int result;
> +	char *ddgst = (char *)&queue->recv_ddgst;
> +	off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
> +	struct msghdr msg = {
> +		.msg_flags = MSG_WAITALL,
> +	};
> +	struct kvec iov = {
> +		.iov_base = (u8 *)ddgst + off,
> +		.iov_len = queue->ddgst_remaining,
> +	};
> +	int ret;
>  
> -	if (unlikely(!queue->rd_enabled))
> -		return -EFAULT;
> +	if (nvme_tcp_recv_state(queue) != NVME_TCP_RECV_DDGST)
> +		return 0;
>  
> -	while (len) {
> -		switch (nvme_tcp_recv_state(queue)) {
> -		case NVME_TCP_RECV_PDU:
> -			result = nvme_tcp_recv_pdu(queue, skb,
> &offset, &len);
> -			break;
> -		case NVME_TCP_RECV_DATA:
> -			result = nvme_tcp_recv_data(queue, skb,
> &offset, &len);
> -			break;
> -		case NVME_TCP_RECV_DDGST:
> -			result = nvme_tcp_recv_ddgst(queue, skb,
> &offset, &len);
> -			break;
> -		default:
> -			result = -EFAULT;
> -		}
> -		if (result) {
> -			dev_err(queue->ctrl->ctrl.device,
> -				"receive failed:  %d\n", result);
> -			queue->rd_enabled = false;
> -			nvme_tcp_error_recovery(&queue->ctrl->ctrl);
> -			return result;
> -		}
> -	}
> +	ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
> iov.iov_len,
> +			     msg.msg_flags);
> +	if (ret <= 0)
> +		return ret;
> +
> +	queue->ddgst_remaining -= ret;
> +	if (queue->ddgst_remaining)
> +		return 0;
>  
> -	return consumed;
> +	return __nvme_tcp_recv_ddgst(queue);
>  }
>  
>  static void nvme_tcp_data_ready(struct sock *sk)
> @@ -1353,20 +1352,39 @@ static int nvme_tcp_try_send(struct
> nvme_tcp_queue *queue)
>  	return ret;
>  }
>  
> -static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
> +static int nvme_tcp_try_recvmsg(struct nvme_tcp_queue *queue)
>  {
> -	struct socket *sock = queue->sock;
> -	struct sock *sk = sock->sk;
> -	read_descriptor_t rd_desc;
> -	int consumed;
> +	int result;
> +	int nr_cqe = queue->nr_cqe;
> +
> +	if (unlikely(!queue->rd_enabled))
> +		return -EFAULT;
> +
> +	do {
> +		switch (nvme_tcp_recv_state(queue)) {
> +		case NVME_TCP_RECV_PDU:
> +			result = nvme_tcp_recvmsg_pdu(queue);
> +			break;
> +		case NVME_TCP_RECV_DATA:
> +			result = nvme_tcp_recvmsg_data(queue);
> +			break;
> +		case NVME_TCP_RECV_DDGST:
> +			result = nvme_tcp_recvmsg_ddgst(queue);
> +			break;
> +		default:
> +			result = -EFAULT;
> +		}
> +	} while (result >= 0);
> +
> +	if (result < 0 && result != -EAGAIN) {
> +		dev_err(queue->ctrl->ctrl.device,
> +			"receive failed:  %d\n", result);
> +		queue->rd_enabled = false;
> +		nvme_tcp_error_recovery(&queue->ctrl->ctrl);
> +	} else if (result == -EAGAIN)
> +		result = 0;
>  
> -	rd_desc.arg.data = queue;
> -	rd_desc.count = 1;
> -	lock_sock(sk);
> -	queue->nr_cqe = 0;
> -	consumed = sock->ops->read_sock(sk, &rd_desc,
> nvme_tcp_recv_skb);
> -	release_sock(sk);
> -	return consumed == -EAGAIN ? 0 : consumed;
> +	return result < 0 ? result : (queue->nr_cqe = nr_cqe);
>  }
>  
>  static void nvme_tcp_io_work(struct work_struct *w)
> @@ -1388,7 +1406,7 @@ static void nvme_tcp_io_work(struct work_struct
> *w)
>  				break;
>  		}
>  
> -		result = nvme_tcp_try_recv(queue);
> +		result = nvme_tcp_try_recvmsg(queue);
>  		if (result > 0)
>  			pending = true;
>  		else if (unlikely(result < 0))
> @@ -2794,7 +2812,7 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx
> *hctx, struct io_comp_batch *iob)
>  	set_bit(NVME_TCP_Q_POLLING, &queue->flags);
>  	if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk-
> >sk_receive_queue))
>  		sk_busy_loop(sk, true);
> -	ret = nvme_tcp_try_recv(queue);
> +	ret = nvme_tcp_try_recvmsg(queue);
>  	clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
>  	return ret < 0 ? ret : queue->nr_cqe;
>  }

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH RFC] nvme-tcp: Implement recvmsg() receive flow
  2026-02-25 10:56 ` Alistair Francis
@ 2026-02-25 11:41   ` Hannes Reinecke
  2026-02-25 13:15     ` Alistair Francis
  0 siblings, 1 reply; 10+ messages in thread
From: Hannes Reinecke @ 2026-02-25 11:41 UTC (permalink / raw)
  To: Alistair Francis, hch, hare@kernel.org
  Cc: kbusch@kernel.org, sagi@grimberg.me,
	linux-nvme@lists.infradead.org

On 2/25/26 11:56, Alistair Francis wrote:
> On Fri, 2025-09-12 at 13:58 +0200, Hannes Reinecke wrote:
>> Switch to use recvmsg() so that we get access to TLS control
>> messages eg for handling TLS KeyUpdate.
>>
>> Signed-off-by: Hannes Reinecke <hare@kernel.org>
>> ---
>>   drivers/nvme/host/tcp.c | 204 ++++++++++++++++++++++----------------
>> --
>>   1 file changed, 111 insertions(+), 93 deletions(-)
>>
>> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
>> index c0fe8cfb7229..9ef1d4aea838 100644
>> --- a/drivers/nvme/host/tcp.c
>> +++ b/drivers/nvme/host/tcp.c
>> @@ -17,6 +17,7 @@
>>   #include <net/tls_prot.h>
>>   #include <net/handshake.h>
>>   #include <linux/blk-mq.h>
>> +#include <linux/iov_iter.h>
>>   #include <net/busy_poll.h>
>>   #include <trace/events/sock.h>
>>   
>> @@ -476,6 +477,28 @@ static inline void nvme_tcp_ddgst_update(u32
>> *crcp,
>>   	}
>>   }
>>   
>> +static size_t nvme_tcp_ddgst_step(void *iter_base, size_t progress,
>> size_t len,
>> +				  void *priv, void *priv2)
>> +{
>> +	u32 *crcp = priv;
>> +
>> +	*crcp = crc32c(*crcp, iter_base, len);
>> +        return 0;
>> +}
>> +
>> +static int nvme_tcp_ddgst_calc(struct nvme_tcp_request *req, u32
>> *crcp,
>> +			       size_t maxsize)
>> +{
>> +	struct iov_iter tmp = req->iter;
>> +	int err = 0;
>> +
>> +	tmp.count = maxsize;
>> +	if (iterate_and_advance_kernel(&tmp, maxsize, crcp, &err,
>> +				       nvme_tcp_ddgst_step) !=
>> maxsize)
>> +		return err;
>> +	return 0;
>> +}
>> +
>>   static inline __le32 nvme_tcp_ddgst_final(u32 crc)
>>   {
>>   	return cpu_to_le32(~crc);
>> @@ -827,23 +850,26 @@ static void nvme_tcp_handle_c2h_term(struct
>> nvme_tcp_queue *queue,
>>   		"Received C2HTermReq (FES = %s)\n", msg);
>>   }
>>   
>> -static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct
>> sk_buff *skb,
>> -		unsigned int *offset, size_t *len)
>> +static int nvme_tcp_recvmsg_pdu(struct nvme_tcp_queue *queue)
>>   {
>> -	struct nvme_tcp_hdr *hdr;
>>   	char *pdu = queue->pdu;
>> -	size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
>> +	struct msghdr msg = {
>> +		.msg_flags = MSG_DONTWAIT,
>> +	};
>> +	struct kvec iov = {
>> +		.iov_base = pdu + queue->pdu_offset,
>> +		.iov_len = queue->pdu_remaining,
>> +	};
>> +	struct nvme_tcp_hdr *hdr;
>>   	int ret;
>>   
>> -	ret = skb_copy_bits(skb, *offset,
>> -		&pdu[queue->pdu_offset], rcv_len);
>> -	if (unlikely(ret))
>> +	ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
>> +			     iov.iov_len, msg.msg_flags);
>> +	if (ret <= 0)
>>   		return ret;
>>   
>> -	queue->pdu_remaining -= rcv_len;
>> -	queue->pdu_offset += rcv_len;
>> -	*offset += rcv_len;
>> -	*len -= rcv_len;
>> +	queue->pdu_remaining -= ret;
>> +	queue->pdu_offset += ret;
>>   	if (queue->pdu_remaining)
>>   		return 0;
>>   
>> @@ -907,20 +933,19 @@ static inline void nvme_tcp_end_request(struct
>> request *rq, u16 status)
>>   		nvme_complete_rq(rq);
>>   }
>>   
>> -static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct
>> sk_buff *skb,
>> -			      unsigned int *offset, size_t *len)
>> +static int nvme_tcp_recvmsg_data(struct nvme_tcp_queue *queue)
>>   {
>>   	struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
>>   	struct request *rq =
>>   		nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu-
>>> command_id);
>>   	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
>>   
>> -	while (true) {
>> -		int recv_len, ret;
>> +	if (nvme_tcp_recv_state(queue) != NVME_TCP_RECV_DATA)
>> +		return 0;
>>   
>> -		recv_len = min_t(size_t, *len, queue-
>>> data_remaining);
>> -		if (!recv_len)
>> -			break;
>> +	while (queue->data_remaining) {
>> +		struct msghdr msg;
>> +		int ret;
>>   
>>   		if (!iov_iter_count(&req->iter)) {
>>   			req->curr_bio = req->curr_bio->bi_next;
>> @@ -940,25 +965,22 @@ static int nvme_tcp_recv_data(struct
>> nvme_tcp_queue *queue, struct sk_buff *skb,
>>   		}
>>   
>>   		/* we can read only from what is left in this bio */
>> -		recv_len = min_t(size_t, recv_len,
>> -				iov_iter_count(&req->iter));
>> +		memset(&msg, 0, sizeof(msg));
>> +		msg.msg_iter = req->iter;
>> +		msg.msg_flags = MSG_DONTWAIT;
>>   
>> -		if (queue->data_digest)
>> -			ret = skb_copy_and_crc32c_datagram_iter(skb,
>> *offset,
>> -				&req->iter, recv_len, &queue-
>>> rcv_crc);
>> -		else
>> -			ret = skb_copy_datagram_iter(skb, *offset,
>> -					&req->iter, recv_len);
>> -		if (ret) {
>> +		ret = sock_recvmsg(queue->sock, &msg,
>> msg.msg_flags);
> 
> This doesn't work unfortunately.
> 
> The problem is what happens if queue->data_remaining is smaller then
> iov_iter_count(&req->iter)?
> 
> queue->data_remaining is set by the data length in the c2h, while the
> length of the request iov_iter_count(&req->iter) is set when the
> request is submitted.
> 
> If queue->data_remaining ends up being smaller then
> iov_iter_count(&req->iter) then we need to read less data then the
> actual count of req->iter.
> 
> So we need a iov_iter_truncate(), but then we end up overwriting the
> data on the next iteration as we have no way to keep...
> 
Question is, though: what _is_ in the remaining iov?
The most reasonable explanation would be that it's the start of the
next PDU (which we haven't accounted for, and hence haven't set
up pointers correctly).
I can see this happening for TLS when the sender doesn't space
the records correctly (ie if the PDU end is not falling on a
TLS record boundary).

But yeah, I can see the issue. While we can (and do)
advance the iterator to complete the request, we still
have the remaining data in the iterator.

What we can do, though, is to copy the remaining data over
to 'queue->pdu' (as we assume it's the start of the next PDU),
set up pointers, and let it rip.

Sounds okay-ish.

And it would certainly do away with the restriction of PDUs
having to be spaced on TLS record boundaries...


>> +		if (ret < 0) {
>>   			dev_err(queue->ctrl->ctrl.device,
>> -				"queue %d failed to copy request %#x
>> data",
>> +				"queue %d failed to receive request
>> %#x data",
>>   				nvme_tcp_queue_id(queue), rq->tag);
>>   			return ret;
>>   		}
>> -
>> -		*len -= recv_len;
>> -		*offset += recv_len;
> 
> ...this offset.
> 
Yikes. The is wrong, indeed.

> PDU and ddgst in this patch continue using an offset as they use
> kernel_recvmsg() instead of sock_recvmsg() directly.
> 
Argl. Rather not. I don't want to repackage iterators ...

> So I think either:
> 
>   1. I'm missing something and there is a nice way to do this, please
> let me know if that's the case.
See above. We could copy the remaining iter data onto 'queue->pdu',
and restart parsing.
(It _must_ be the start of a PDU anyway, otherwise queue->remaining 
would've covered it).

>   2. We need to convert this sock_recvmsg() to kernel_recvmsg(), with
> the overhead of an extra iov_iter (I don't know if that is high or not)

Rather not.

>   3. We use Chuck's read_sock with cmsg approach [1]

Another possibility.
Especially as now Chuck has worked on improving the TLS record spacing,
so this might be a possible avenue.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH RFC] nvme-tcp: Implement recvmsg() receive flow
  2026-02-25 11:41   ` Hannes Reinecke
@ 2026-02-25 13:15     ` Alistair Francis
  2026-02-25 15:02       ` Hannes Reinecke
  0 siblings, 1 reply; 10+ messages in thread
From: Alistair Francis @ 2026-02-25 13:15 UTC (permalink / raw)
  To: hch, hare@kernel.org, hare@suse.de
  Cc: kbusch@kernel.org, sagi@grimberg.me,
	linux-nvme@lists.infradead.org

On Wed, 2026-02-25 at 12:41 +0100, Hannes Reinecke wrote:
> On 2/25/26 11:56, Alistair Francis wrote:
> > On Fri, 2025-09-12 at 13:58 +0200, Hannes Reinecke wrote:
> > > Switch to use recvmsg() so that we get access to TLS control
> > > messages eg for handling TLS KeyUpdate.
> > > 
> > > Signed-off-by: Hannes Reinecke <hare@kernel.org>
> > > ---
> > >   drivers/nvme/host/tcp.c | 204 ++++++++++++++++++++++-----------
> > > -----
> > > --
> > >   1 file changed, 111 insertions(+), 93 deletions(-)
> > > 
> > > diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> > > index c0fe8cfb7229..9ef1d4aea838 100644
> > > --- a/drivers/nvme/host/tcp.c
> > > +++ b/drivers/nvme/host/tcp.c
> > > @@ -17,6 +17,7 @@
> > >   #include <net/tls_prot.h>
> > >   #include <net/handshake.h>
> > >   #include <linux/blk-mq.h>
> > > +#include <linux/iov_iter.h>
> > >   #include <net/busy_poll.h>
> > >   #include <trace/events/sock.h>
> > >   
> > > @@ -476,6 +477,28 @@ static inline void nvme_tcp_ddgst_update(u32
> > > *crcp,
> > >   	}
> > >   }
> > >   
> > > +static size_t nvme_tcp_ddgst_step(void *iter_base, size_t
> > > progress,
> > > size_t len,
> > > +				  void *priv, void *priv2)
> > > +{
> > > +	u32 *crcp = priv;
> > > +
> > > +	*crcp = crc32c(*crcp, iter_base, len);
> > > +        return 0;
> > > +}
> > > +
> > > +static int nvme_tcp_ddgst_calc(struct nvme_tcp_request *req, u32
> > > *crcp,
> > > +			       size_t maxsize)
> > > +{
> > > +	struct iov_iter tmp = req->iter;
> > > +	int err = 0;
> > > +
> > > +	tmp.count = maxsize;
> > > +	if (iterate_and_advance_kernel(&tmp, maxsize, crcp,
> > > &err,
> > > +				       nvme_tcp_ddgst_step) !=
> > > maxsize)
> > > +		return err;
> > > +	return 0;
> > > +}
> > > +
> > >   static inline __le32 nvme_tcp_ddgst_final(u32 crc)
> > >   {
> > >   	return cpu_to_le32(~crc);
> > > @@ -827,23 +850,26 @@ static void nvme_tcp_handle_c2h_term(struct
> > > nvme_tcp_queue *queue,
> > >   		"Received C2HTermReq (FES = %s)\n", msg);
> > >   }
> > >   
> > > -static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue,
> > > struct
> > > sk_buff *skb,
> > > -		unsigned int *offset, size_t *len)
> > > +static int nvme_tcp_recvmsg_pdu(struct nvme_tcp_queue *queue)
> > >   {
> > > -	struct nvme_tcp_hdr *hdr;
> > >   	char *pdu = queue->pdu;
> > > -	size_t rcv_len = min_t(size_t, *len, queue-
> > > >pdu_remaining);
> > > +	struct msghdr msg = {
> > > +		.msg_flags = MSG_DONTWAIT,
> > > +	};
> > > +	struct kvec iov = {
> > > +		.iov_base = pdu + queue->pdu_offset,
> > > +		.iov_len = queue->pdu_remaining,
> > > +	};
> > > +	struct nvme_tcp_hdr *hdr;
> > >   	int ret;
> > >   
> > > -	ret = skb_copy_bits(skb, *offset,
> > > -		&pdu[queue->pdu_offset], rcv_len);
> > > -	if (unlikely(ret))
> > > +	ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
> > > +			     iov.iov_len, msg.msg_flags);
> > > +	if (ret <= 0)
> > >   		return ret;
> > >   
> > > -	queue->pdu_remaining -= rcv_len;
> > > -	queue->pdu_offset += rcv_len;
> > > -	*offset += rcv_len;
> > > -	*len -= rcv_len;
> > > +	queue->pdu_remaining -= ret;
> > > +	queue->pdu_offset += ret;
> > >   	if (queue->pdu_remaining)
> > >   		return 0;
> > >   
> > > @@ -907,20 +933,19 @@ static inline void
> > > nvme_tcp_end_request(struct
> > > request *rq, u16 status)
> > >   		nvme_complete_rq(rq);
> > >   }
> > >   
> > > -static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue,
> > > struct
> > > sk_buff *skb,
> > > -			      unsigned int *offset, size_t *len)
> > > +static int nvme_tcp_recvmsg_data(struct nvme_tcp_queue *queue)
> > >   {
> > >   	struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
> > >   	struct request *rq =
> > >   		nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu-
> > > > command_id);
> > >   	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
> > >   
> > > -	while (true) {
> > > -		int recv_len, ret;
> > > +	if (nvme_tcp_recv_state(queue) != NVME_TCP_RECV_DATA)
> > > +		return 0;
> > >   
> > > -		recv_len = min_t(size_t, *len, queue-
> > > > data_remaining);
> > > -		if (!recv_len)
> > > -			break;
> > > +	while (queue->data_remaining) {
> > > +		struct msghdr msg;
> > > +		int ret;
> > >   
> > >   		if (!iov_iter_count(&req->iter)) {
> > >   			req->curr_bio = req->curr_bio->bi_next;
> > > @@ -940,25 +965,22 @@ static int nvme_tcp_recv_data(struct
> > > nvme_tcp_queue *queue, struct sk_buff *skb,
> > >   		}
> > >   
> > >   		/* we can read only from what is left in this
> > > bio */
> > > -		recv_len = min_t(size_t, recv_len,
> > > -				iov_iter_count(&req->iter));
> > > +		memset(&msg, 0, sizeof(msg));
> > > +		msg.msg_iter = req->iter;
> > > +		msg.msg_flags = MSG_DONTWAIT;
> > >   
> > > -		if (queue->data_digest)
> > > -			ret =
> > > skb_copy_and_crc32c_datagram_iter(skb,
> > > *offset,
> > > -				&req->iter, recv_len, &queue-
> > > > rcv_crc);
> > > -		else
> > > -			ret = skb_copy_datagram_iter(skb,
> > > *offset,
> > > -					&req->iter, recv_len);
> > > -		if (ret) {
> > > +		ret = sock_recvmsg(queue->sock, &msg,
> > > msg.msg_flags);
> > 
> > This doesn't work unfortunately.
> > 
> > The problem is what happens if queue->data_remaining is smaller
> > then
> > iov_iter_count(&req->iter)?
> > 
> > queue->data_remaining is set by the data length in the c2h, while
> > the
> > length of the request iov_iter_count(&req->iter) is set when the
> > request is submitted.
> > 
> > If queue->data_remaining ends up being smaller then
> > iov_iter_count(&req->iter) then we need to read less data then the
> > actual count of req->iter.
> > 
> > So we need a iov_iter_truncate(), but then we end up overwriting
> > the
> > data on the next iteration as we have no way to keep...
> > 
> Question is, though: what _is_ in the remaining iov?

Which remaining iov?

> The most reasonable explanation would be that it's the start of the
> next PDU (which we haven't accounted for, and hence haven't set
> up pointers correctly).

The next PDU seems fine, it's just the next data that goes on the
current (and correct) IOV, just overwriting the previous data as there
is no offset.

> I can see this happening for TLS when the sender doesn't space
> the records correctly (ie if the PDU end is not falling on a
> TLS record boundary).
> 
> But yeah, I can see the issue. While we can (and do)
> advance the iterator to complete the request, we still
> have the remaining data in the iterator.

> What we can do, though, is to copy the remaining data over
> to 'queue->pdu' (as we assume it's the start of the next PDU),
> set up pointers, and let it rip.

Ah, I think that's the other way around of what I'm talking about. That
sounds more like you overflow the current iterator. I'm talking about
an underflow, where we don't transfer enough data to complete a
request. So when we continue the transfer we overwrite the previous
data.

> 
> Sounds okay-ish.
> 
> And it would certainly do away with the restriction of PDUs
> having to be spaced on TLS record boundaries...
> 
> 
> > > +		if (ret < 0) {
> > >   			dev_err(queue->ctrl->ctrl.device,
> > > -				"queue %d failed to copy request
> > > %#x
> > > data",
> > > +				"queue %d failed to receive
> > > request
> > > %#x data",
> > >   				nvme_tcp_queue_id(queue), rq-
> > > >tag);
> > >   			return ret;
> > >   		}
> > > -
> > > -		*len -= recv_len;
> > > -		*offset += recv_len;
> > 
> > ...this offset.
> > 
> Yikes. The is wrong, indeed.
> 
> > PDU and ddgst in this patch continue using an offset as they use
> > kernel_recvmsg() instead of sock_recvmsg() directly.
> > 
> Argl. Rather not. I don't want to repackage iterators ...
> 
> > So I think either:
> > 
> >   1. I'm missing something and there is a nice way to do this,
> > please
> > let me know if that's the case.
> See above. We could copy the remaining iter data onto 'queue->pdu',
> and restart parsing.

I don't think that addresses the issue. The next PDU seems fine, we
don't lose that, it's the next data that we end up putting in the wrong
place.

> (It _must_ be the start of a PDU anyway, otherwise queue->remaining 
> would've covered it).
> 
> >   2. We need to convert this sock_recvmsg() to kernel_recvmsg(),
> > with
> > the overhead of an extra iov_iter (I don't know if that is high or
> > not)
> 
> Rather not.

Agreed, but unless 1 works then we need 3.

> 
> >   3. We use Chuck's read_sock with cmsg approach [1]
> 
> Another possibility.
> Especially as now Chuck has worked on improving the TLS record
> spacing,
> so this might be a possible avenue.

I do think this is the way to go, albeit a little slower then I was
hoping for.

Alistair

> 
> Cheers,
> 
> Hannes

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH RFC] nvme-tcp: Implement recvmsg() receive flow
  2026-02-25 13:15     ` Alistair Francis
@ 2026-02-25 15:02       ` Hannes Reinecke
  2026-02-25 23:37         ` Alistair Francis
  0 siblings, 1 reply; 10+ messages in thread
From: Hannes Reinecke @ 2026-02-25 15:02 UTC (permalink / raw)
  To: Alistair Francis, hch, hare@kernel.org
  Cc: kbusch@kernel.org, sagi@grimberg.me,
	linux-nvme@lists.infradead.org

On 2/25/26 14:15, Alistair Francis wrote:
> On Wed, 2026-02-25 at 12:41 +0100, Hannes Reinecke wrote:
>> On 2/25/26 11:56, Alistair Francis wrote:
[ .. ]
>>>
>>> This doesn't work unfortunately.
>>>
>>> The problem is what happens if queue->data_remaining is smaller
>>> then iov_iter_count(&req->iter)?
>>>
>>> queue->data_remaining is set by the data length in the c2h, while
>>> the length of the request iov_iter_count(&req->iter) is set when the
>>> request is submitted.
>>>
>>> If queue->data_remaining ends up being smaller then
>>> iov_iter_count(&req->iter) then we need to read less data then the
>>> actual count of req->iter.
>>>
>>> So we need a iov_iter_truncate(), but then we end up overwriting
>>> the data on the next iteration as we have no way to keep...
>>>
>> Question is, though: what _is_ in the remaining iov?
> 
> Which remaining iov?
> 

Well, if queue->
>> The most reasonable explanation would be that it's the start of the
>> next PDU (which we haven't accounted for, and hence haven't set
>> up pointers correctly).
> 
> The next PDU seems fine, it's just the next data that goes on the
> current (and correct) IOV, just overwriting the previous data as there
> is no offset.
> 

The offset is in the iov (ie you advance the iov iter to capture the offset)

>> I can see this happening for TLS when the sender doesn't space
>> the records correctly (ie if the PDU end is not falling on a
>> TLS record boundary).
>>
>> But yeah, I can see the issue. While we can (and do)
>> advance the iterator to complete the request, we still
>> have the remaining data in the iterator.
> 
>> What we can do, though, is to copy the remaining data over
>> to 'queue->pdu' (as we assume it's the start of the next PDU),
>> set up pointers, and let it rip.
> 
> Ah, I think that's the other way around of what I'm talking about. That
> sounds more like you overflow the current iterator. I'm talking about
> an underflow, where we don't transfer enough data to complete a
> request. So when we continue the transfer we overwrite the previous
> data.
> 
To my understanding queue->remaining is the number of bytes left
for _this_ PDU (actually, for the PDU data; digests are counted
separately).
And the iterator points to the iovec storing the eventual data.
So in your case iov_len() is _larger_ than queue->remaining.
How so? iov_len() is calculated from the request, and that
should have the correct length set.
So the only way I see how we could arrive in the situation
where iov_len() is larger thatn queue->remaining is when
we have a short read.
But really, both values are setup right at the start
before we even start receiving data.
Shouldn't we catch this issue there?

I guess I'm still confused how you could end up in a situation
where queue->remaining is smaller that the iovec length...

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH RFC] nvme-tcp: Implement recvmsg() receive flow
  2026-02-25 15:02       ` Hannes Reinecke
@ 2026-02-25 23:37         ` Alistair Francis
  2026-02-26  8:40           ` Hannes Reinecke
  0 siblings, 1 reply; 10+ messages in thread
From: Alistair Francis @ 2026-02-25 23:37 UTC (permalink / raw)
  To: hch, hare@kernel.org, hare@suse.de
  Cc: kbusch@kernel.org, sagi@grimberg.me,
	linux-nvme@lists.infradead.org

On Wed, 2026-02-25 at 16:02 +0100, Hannes Reinecke wrote:
> On 2/25/26 14:15, Alistair Francis wrote:
> > On Wed, 2026-02-25 at 12:41 +0100, Hannes Reinecke wrote:
> > > On 2/25/26 11:56, Alistair Francis wrote:
> [ .. ]
> > > > 
> > > > This doesn't work unfortunately.
> > > > 
> > > > The problem is what happens if queue->data_remaining is smaller
> > > > then iov_iter_count(&req->iter)?
> > > > 
> > > > queue->data_remaining is set by the data length in the c2h,
> > > > while
> > > > the length of the request iov_iter_count(&req->iter) is set
> > > > when the
> > > > request is submitted.
> > > > 
> > > > If queue->data_remaining ends up being smaller then
> > > > iov_iter_count(&req->iter) then we need to read less data then
> > > > the
> > > > actual count of req->iter.
> > > > 
> > > > So we need a iov_iter_truncate(), but then we end up
> > > > overwriting
> > > > the data on the next iteration as we have no way to keep...
> > > > 
> > > Question is, though: what _is_ in the remaining iov?
> > 
> > Which remaining iov?
> > 
> 
> Well, if queue->
> > > The most reasonable explanation would be that it's the start of
> > > the
> > > next PDU (which we haven't accounted for, and hence haven't set
> > > up pointers correctly).
> > 
> > The next PDU seems fine, it's just the next data that goes on the
> > current (and correct) IOV, just overwriting the previous data as
> > there
> > is no offset.
> > 
> 
> The offset is in the iov (ie you advance the iov iter to capture the
> offset)
> 
> > > I can see this happening for TLS when the sender doesn't space
> > > the records correctly (ie if the PDU end is not falling on a
> > > TLS record boundary).
> > > 
> > > But yeah, I can see the issue. While we can (and do)
> > > advance the iterator to complete the request, we still
> > > have the remaining data in the iterator.
> > 
> > > What we can do, though, is to copy the remaining data over
> > > to 'queue->pdu' (as we assume it's the start of the next PDU),
> > > set up pointers, and let it rip.

I think I understand this part a bit more now. I'm currently using
iov_iter_truncate() to reduce the count of req->iter to match queue-
>data_remaining, so I don't see this.

If there was no truncation then the req->iter would fill up with the
current data and the next PDU. This is where we could copy that data
over and let it rip.

But it still has the same issue in that we can't add future data to an
offset in the req->iter. At least not that I can figure out.

> > 
> > Ah, I think that's the other way around of what I'm talking about.
> > That
> > sounds more like you overflow the current iterator. I'm talking
> > about
> > an underflow, where we don't transfer enough data to complete a
> > request. So when we continue the transfer we overwrite the previous
> > data.
> > 
> To my understanding queue->remaining is the number of bytes left
> for _this_ PDU (actually, for the PDU data; digests are counted
> separately).

Yeah, that's correct.

> And the iterator points to the iovec storing the eventual data.
> So in your case iov_len() is _larger_ than queue->remaining.
> How so? iov_len() is calculated from the request, and that
> should have the correct length set.

What I'm seeing is that nvme_submit_sync_cmd() is called with a length
of 4096 (called from nvme_identify_ctrl()).

That results in a req->iter with a count of 4096.

The data from the device is sent in 3 transfers (data_remaining)

1412 bytes
1412 bytes
1272 bytes

They all target the same req->iter.

In the previous code the offset handled this fine, but we no longer
have that.

> So the only way I see how we could arrive in the situation
> where iov_len() is larger thatn queue->remaining is when
> we have a short read.
> But really, both values are setup right at the start
> before we even start receiving data.
> Shouldn't we catch this issue there?

This happens really early as part of nvme_identify_ctrl()

Alistair

> 
> I guess I'm still confused how you could end up in a situation
> where queue->remaining is smaller that the iovec length...
> 
> Cheers,
> 
> Hannes

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH RFC] nvme-tcp: Implement recvmsg() receive flow
  2026-02-25 23:37         ` Alistair Francis
@ 2026-02-26  8:40           ` Hannes Reinecke
  2026-02-26 10:42             ` Alistair Francis
  0 siblings, 1 reply; 10+ messages in thread
From: Hannes Reinecke @ 2026-02-26  8:40 UTC (permalink / raw)
  To: Alistair Francis, hch, hare@kernel.org
  Cc: kbusch@kernel.org, sagi@grimberg.me,
	linux-nvme@lists.infradead.org

On 2/26/26 00:37, Alistair Francis wrote:
> On Wed, 2026-02-25 at 16:02 +0100, Hannes Reinecke wrote:
>> On 2/25/26 14:15, Alistair Francis wrote:
>>> On Wed, 2026-02-25 at 12:41 +0100, Hannes Reinecke wrote:
>>>> On 2/25/26 11:56, Alistair Francis wrote:
>> [ .. ]
>>>>>
>>>>> This doesn't work unfortunately.
>>>>>
>>>>> The problem is what happens if queue->data_remaining is smaller
>>>>> then iov_iter_count(&req->iter)?
>>>>>
>>>>> queue->data_remaining is set by the data length in the c2h,
>>>>> while
>>>>> the length of the request iov_iter_count(&req->iter) is set
>>>>> when the
>>>>> request is submitted.
>>>>>
>>>>> If queue->data_remaining ends up being smaller then
>>>>> iov_iter_count(&req->iter) then we need to read less data then
>>>>> the
>>>>> actual count of req->iter.
>>>>>
>>>>> So we need a iov_iter_truncate(), but then we end up
>>>>> overwriting
>>>>> the data on the next iteration as we have no way to keep...
>>>>>
>>>> Question is, though: what _is_ in the remaining iov?
>>>
>>> Which remaining iov?
>>>
>>
>> Well, if queue->
>>>> The most reasonable explanation would be that it's the start of
>>>> the
>>>> next PDU (which we haven't accounted for, and hence haven't set
>>>> up pointers correctly).
>>>
>>> The next PDU seems fine, it's just the next data that goes on the
>>> current (and correct) IOV, just overwriting the previous data as
>>> there
>>> is no offset.
>>>
>>
>> The offset is in the iov (ie you advance the iov iter to capture the
>> offset)
>>
>>>> I can see this happening for TLS when the sender doesn't space
>>>> the records correctly (ie if the PDU end is not falling on a
>>>> TLS record boundary).
>>>>
>>>> But yeah, I can see the issue. While we can (and do)
>>>> advance the iterator to complete the request, we still
>>>> have the remaining data in the iterator.
>>>
>>>> What we can do, though, is to copy the remaining data over
>>>> to 'queue->pdu' (as we assume it's the start of the next PDU),
>>>> set up pointers, and let it rip.
> 
> I think I understand this part a bit more now. I'm currently using
> iov_iter_truncate() to reduce the count of req->iter to match queue-
>> data_remaining, so I don't see this.
> 
> If there was no truncation then the req->iter would fill up with the
> current data and the next PDU. This is where we could copy that data
> over and let it rip.
> 
> But it still has the same issue in that we can't add future data to an
> offset in the req->iter. At least not that I can figure out.
> 

Ah, I think I see it now.
The iovec might indeed be larger than queue->remaining
(data underflows are not that uncommon), and then recvmsg
might indeed fill the iovec with more data than the PDU
requires.

The old code had that bit:
  		/* we can read only from what is left in this bio */
		recv_len = min_t(size_t, recv_len,
				iov_iter_count(&req->iter));
(with recv_len being set to queue->data_remaining) to prevent that
from happening.
So we should do a

if (iov_iter_count(&req->iter) > queue->remaining)
    iov_iter_truncate(&req->iter, queue->remaining)

before issuing recvmsg. That should take care of things.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH RFC] nvme-tcp: Implement recvmsg() receive flow
  2026-02-26  8:40           ` Hannes Reinecke
@ 2026-02-26 10:42             ` Alistair Francis
  0 siblings, 0 replies; 10+ messages in thread
From: Alistair Francis @ 2026-02-26 10:42 UTC (permalink / raw)
  To: hch, hare@kernel.org, hare@suse.de
  Cc: kbusch@kernel.org, sagi@grimberg.me,
	linux-nvme@lists.infradead.org

On Thu, 2026-02-26 at 09:40 +0100, Hannes Reinecke wrote:
> On 2/26/26 00:37, Alistair Francis wrote:
> > On Wed, 2026-02-25 at 16:02 +0100, Hannes Reinecke wrote:
> > > On 2/25/26 14:15, Alistair Francis wrote:
> > > > On Wed, 2026-02-25 at 12:41 +0100, Hannes Reinecke wrote:
> > > > > On 2/25/26 11:56, Alistair Francis wrote:
> > > [ .. ]
> > > > > > 
> > > > > > This doesn't work unfortunately.
> > > > > > 
> > > > > > The problem is what happens if queue->data_remaining is
> > > > > > smaller
> > > > > > then iov_iter_count(&req->iter)?
> > > > > > 
> > > > > > queue->data_remaining is set by the data length in the c2h,
> > > > > > while
> > > > > > the length of the request iov_iter_count(&req->iter) is set
> > > > > > when the
> > > > > > request is submitted.
> > > > > > 
> > > > > > If queue->data_remaining ends up being smaller then
> > > > > > iov_iter_count(&req->iter) then we need to read less data
> > > > > > then
> > > > > > the
> > > > > > actual count of req->iter.
> > > > > > 
> > > > > > So we need a iov_iter_truncate(), but then we end up
> > > > > > overwriting
> > > > > > the data on the next iteration as we have no way to keep...
> > > > > > 
> > > > > Question is, though: what _is_ in the remaining iov?
> > > > 
> > > > Which remaining iov?
> > > > 
> > > 
> > > Well, if queue->
> > > > > The most reasonable explanation would be that it's the start
> > > > > of
> > > > > the
> > > > > next PDU (which we haven't accounted for, and hence haven't
> > > > > set
> > > > > up pointers correctly).
> > > > 
> > > > The next PDU seems fine, it's just the next data that goes on
> > > > the
> > > > current (and correct) IOV, just overwriting the previous data
> > > > as
> > > > there
> > > > is no offset.
> > > > 
> > > 
> > > The offset is in the iov (ie you advance the iov iter to capture
> > > the
> > > offset)
> > > 
> > > > > I can see this happening for TLS when the sender doesn't
> > > > > space
> > > > > the records correctly (ie if the PDU end is not falling on a
> > > > > TLS record boundary).
> > > > > 
> > > > > But yeah, I can see the issue. While we can (and do)
> > > > > advance the iterator to complete the request, we still
> > > > > have the remaining data in the iterator.
> > > > 
> > > > > What we can do, though, is to copy the remaining data over
> > > > > to 'queue->pdu' (as we assume it's the start of the next
> > > > > PDU),
> > > > > set up pointers, and let it rip.
> > 
> > I think I understand this part a bit more now. I'm currently using
> > iov_iter_truncate() to reduce the count of req->iter to match
> > queue-
> > > data_remaining, so I don't see this.
> > 
> > If there was no truncation then the req->iter would fill up with
> > the
> > current data and the next PDU. This is where we could copy that
> > data
> > over and let it rip.
> > 
> > But it still has the same issue in that we can't add future data to
> > an
> > offset in the req->iter. At least not that I can figure out.
> > 
> 
> Ah, I think I see it now.
> The iovec might indeed be larger than queue->remaining
> (data underflows are not that uncommon), and then recvmsg
> might indeed fill the iovec with more data than the PDU
> requires.
> 
> The old code had that bit:
>   		/* we can read only from what is left in this bio */
> 		recv_len = min_t(size_t, recv_len,
> 				iov_iter_count(&req->iter));

That's part of the problem. But the old code also had an offset into
the skb_copy_datagram_iter() function, which the new code doesn't have.
That's the issue.

> (with recv_len being set to queue->data_remaining) to prevent that
> from happening.
> So we should do a
> 
> if (iov_iter_count(&req->iter) > queue->remaining)

You don't need the if(), iov_iter_truncate() handles that for you.

>     iov_iter_truncate(&req->iter, queue->remaining)

I have this already locally. It avoids the req->iter reading too much
data, but it doesn't fix the issue that the next receive will overwrite
the data in req->iter as there is no offset

Alistair

> 
> before issuing recvmsg. That should take care of things.
> 
> Cheers,
> 
> Hannes

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2026-02-26 10:42 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-09-12 11:58 [PATCH RFC] nvme-tcp: Implement recvmsg() receive flow Hannes Reinecke
2025-09-22 17:41 ` Christoph Hellwig
2025-09-23  6:30   ` Hannes Reinecke
2026-02-25 10:56 ` Alistair Francis
2026-02-25 11:41   ` Hannes Reinecke
2026-02-25 13:15     ` Alistair Francis
2026-02-25 15:02       ` Hannes Reinecke
2026-02-25 23:37         ` Alistair Francis
2026-02-26  8:40           ` Hannes Reinecke
2026-02-26 10:42             ` Alistair Francis

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox