[PATCHv2] nvme-tcp: Implement recvmsg() receive flow

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCHv2] nvme-tcp: Implement recvmsg() receive flow
@ 2025-10-20  8:58 Hannes Reinecke
  2025-11-26  7:32 ` Sagi Grimberg
  0 siblings, 1 reply; 13+ messages in thread
From: Hannes Reinecke @ 2025-10-20  8:58 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Keith Busch, Sagi Grimberg, linux-nvme, Hannes Reinecke

The nvme-tcp code is using the ->read_sock() interface to
read data from the wire. While this interface gives us access
to the skbs themselves (and so might be able to reduce latency)
it does not interpret the skbs.
Additionally for TLS these skbs have to be re-constructed from
the TLS stream data, rendering any advantage questionable.
But the main drawback for TLS is that we do not get access to
the TLS control messages, so if we receive any of those message
the only choice we have is to tear down the connection and restart.
This patch switches the receive side over to use recvmsg(), which
provides us full access to the TLS control messages and is also
more efficient when working with TLS as skbs do not need to be
artificially constructed.

Signed-off-by: Hannes Reinecke <hare@kernel.org>
---
 drivers/nvme/host/tcp.c | 203 ++++++++++++++++++++++------------------
 1 file changed, 110 insertions(+), 93 deletions(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 9a96df1a511c..081f53fa9fc4 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -17,6 +17,7 @@
 #include <net/tls_prot.h>
 #include <net/handshake.h>
 #include <linux/blk-mq.h>
+#include <linux/iov_iter.h>
 #include <net/busy_poll.h>
 #include <trace/events/sock.h>
 
@@ -476,6 +477,28 @@ static inline void nvme_tcp_ddgst_update(u32 *crcp,
 	}
 }
 
+static size_t nvme_tcp_ddgst_step(void *iter_base, size_t progress, size_t len,
+				  void *priv, void *priv2)
+{
+	u32 *crcp = priv;
+
+	*crcp = crc32c(*crcp, iter_base, len);
+	return 0;
+}
+
+static int nvme_tcp_ddgst_calc(struct nvme_tcp_request *req, u32 *crcp,
+			       size_t maxsize)
+{
+	struct iov_iter tmp = req->iter;
+	int err = 0;
+
+	tmp.count = maxsize;
+	if (iterate_and_advance_kernel(&tmp, maxsize, crcp, &err,
+				       nvme_tcp_ddgst_step) != maxsize)
+		return err;
+	return 0;
+}
+
 static inline __le32 nvme_tcp_ddgst_final(u32 crc)
 {
 	return cpu_to_le32(~crc);
@@ -827,23 +850,26 @@ static void nvme_tcp_handle_c2h_term(struct nvme_tcp_queue *queue,
 		"Received C2HTermReq (FES = %s)\n", msg);
 }
 
-static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
-		unsigned int *offset, size_t *len)
+static int nvme_tcp_recvmsg_pdu(struct nvme_tcp_queue *queue)
 {
-	struct nvme_tcp_hdr *hdr;
 	char *pdu = queue->pdu;
-	size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
+	struct msghdr msg = {
+		.msg_flags = MSG_DONTWAIT,
+	};
+	struct kvec iov = {
+		.iov_base = pdu + queue->pdu_offset,
+		.iov_len = queue->pdu_remaining,
+	};
+	struct nvme_tcp_hdr *hdr;
 	int ret;
 
-	ret = skb_copy_bits(skb, *offset,
-		&pdu[queue->pdu_offset], rcv_len);
-	if (unlikely(ret))
+	ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
+			     iov.iov_len, msg.msg_flags);
+	if (ret <= 0)
 		return ret;
 
-	queue->pdu_remaining -= rcv_len;
-	queue->pdu_offset += rcv_len;
-	*offset += rcv_len;
-	*len -= rcv_len;
+	queue->pdu_remaining -= ret;
+	queue->pdu_offset += ret;
 	if (queue->pdu_remaining)
 		return 0;
 
@@ -907,20 +933,19 @@ static inline void nvme_tcp_end_request(struct request *rq, u16 status)
 		nvme_complete_rq(rq);
 }
 
-static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
-			      unsigned int *offset, size_t *len)
+static int nvme_tcp_recvmsg_data(struct nvme_tcp_queue *queue)
 {
 	struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
 	struct request *rq =
 		nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
 	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
 
-	while (true) {
-		int recv_len, ret;
+	if (nvme_tcp_recv_state(queue) != NVME_TCP_RECV_DATA)
+		return 0;
 
-		recv_len = min_t(size_t, *len, queue->data_remaining);
-		if (!recv_len)
-			break;
+	while (queue->data_remaining) {
+		struct msghdr msg;
+		int ret;
 
 		if (!iov_iter_count(&req->iter)) {
 			req->curr_bio = req->curr_bio->bi_next;
@@ -940,25 +965,22 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
 		}
 
 		/* we can read only from what is left in this bio */
-		recv_len = min_t(size_t, recv_len,
-				iov_iter_count(&req->iter));
+		memset(&msg, 0, sizeof(msg));
+		msg.msg_iter = req->iter;
+		msg.msg_flags = MSG_DONTWAIT;
 
-		if (queue->data_digest)
-			ret = skb_copy_and_crc32c_datagram_iter(skb, *offset,
-				&req->iter, recv_len, &queue->rcv_crc);
-		else
-			ret = skb_copy_datagram_iter(skb, *offset,
-					&req->iter, recv_len);
-		if (ret) {
+		ret = sock_recvmsg(queue->sock, &msg, msg.msg_flags);
+		if (ret < 0) {
 			dev_err(queue->ctrl->ctrl.device,
-				"queue %d failed to copy request %#x data",
+				"queue %d failed to receive request %#x data",
 				nvme_tcp_queue_id(queue), rq->tag);
 			return ret;
 		}
-
-		*len -= recv_len;
-		*offset += recv_len;
-		queue->data_remaining -= recv_len;
+		if (queue->data_digest)
+			nvme_tcp_ddgst_calc(req, &queue->rcv_crc, ret);
+		queue->data_remaining -= ret;
+		if (queue->data_remaining)
+			nvme_tcp_advance_req(req, ret);
 	}
 
 	if (!queue->data_remaining) {
@@ -968,7 +990,7 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
 		} else {
 			if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
 				nvme_tcp_end_request(rq,
-						le16_to_cpu(req->status));
+						     le16_to_cpu(req->status));
 				queue->nr_cqe++;
 			}
 			nvme_tcp_init_recv_ctx(queue);
@@ -978,24 +1000,9 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
 	return 0;
 }
 
-static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
-		struct sk_buff *skb, unsigned int *offset, size_t *len)
+static int __nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue)
 {
 	struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
-	char *ddgst = (char *)&queue->recv_ddgst;
-	size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
-	off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
-	int ret;
-
-	ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
-	if (unlikely(ret))
-		return ret;
-
-	queue->ddgst_remaining -= recv_len;
-	*offset += recv_len;
-	*len -= recv_len;
-	if (queue->ddgst_remaining)
-		return 0;
 
 	if (queue->recv_ddgst != queue->exp_ddgst) {
 		struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue),
@@ -1023,40 +1030,32 @@ static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
 	return 0;
 }
 
-static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
-			     unsigned int offset, size_t len)
+static int nvme_tcp_recvmsg_ddgst(struct nvme_tcp_queue *queue)
 {
-	struct nvme_tcp_queue *queue = desc->arg.data;
-	size_t consumed = len;
-	int result;
+	char *ddgst = (char *)&queue->recv_ddgst;
+	off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
+	struct msghdr msg = {
+		.msg_flags = MSG_WAITALL,
+	};
+	struct kvec iov = {
+		.iov_base = (u8 *)ddgst + off,
+		.iov_len = queue->ddgst_remaining,
+	};
+	int ret;
 
-	if (unlikely(!queue->rd_enabled))
-		return -EFAULT;
+	if (nvme_tcp_recv_state(queue) != NVME_TCP_RECV_DDGST)
+		return 0;
 
-	while (len) {
-		switch (nvme_tcp_recv_state(queue)) {
-		case NVME_TCP_RECV_PDU:
-			result = nvme_tcp_recv_pdu(queue, skb, &offset, &len);
-			break;
-		case NVME_TCP_RECV_DATA:
-			result = nvme_tcp_recv_data(queue, skb, &offset, &len);
-			break;
-		case NVME_TCP_RECV_DDGST:
-			result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len);
-			break;
-		default:
-			result = -EFAULT;
-		}
-		if (result) {
-			dev_err(queue->ctrl->ctrl.device,
-				"receive failed:  %d\n", result);
-			queue->rd_enabled = false;
-			nvme_tcp_error_recovery(&queue->ctrl->ctrl);
-			return result;
-		}
-	}
+	ret = kernel_recvmsg(queue->sock, &msg, &iov, 1, iov.iov_len,
+			     msg.msg_flags);
+	if (ret <= 0)
+		return ret;
 
-	return consumed;
+	queue->ddgst_remaining -= ret;
+	if (queue->ddgst_remaining)
+		return 0;
+
+	return __nvme_tcp_recv_ddgst(queue);
 }
 
 static void nvme_tcp_data_ready(struct sock *sk)
@@ -1356,20 +1355,38 @@ static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
 	return ret;
 }
 
-static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
+static int nvme_tcp_try_recvmsg(struct nvme_tcp_queue *queue)
 {
-	struct socket *sock = queue->sock;
-	struct sock *sk = sock->sk;
-	read_descriptor_t rd_desc;
-	int consumed;
+	int result;
+	int nr_cqe = queue->nr_cqe;
+
+	if (unlikely(!queue->rd_enabled))
+		return -EFAULT;
+
+	do {
+		switch (nvme_tcp_recv_state(queue)) {
+		case NVME_TCP_RECV_PDU:
+			result = nvme_tcp_recvmsg_pdu(queue);
+			break;
+		case NVME_TCP_RECV_DATA:
+			result = nvme_tcp_recvmsg_data(queue);
+			break;
+		case NVME_TCP_RECV_DDGST:
+			result = nvme_tcp_recvmsg_ddgst(queue);
+			break;
+		default:
+			result = -EFAULT;
+		}
+	} while (result >= 0);
+
+	if (result < 0 && result != -EAGAIN) {
+		dev_err(queue->ctrl->ctrl.device,
+			"receive failed:  %d\n", result);
+		queue->rd_enabled = false;
+		nvme_tcp_error_recovery(&queue->ctrl->ctrl);
+	}
 
-	rd_desc.arg.data = queue;
-	rd_desc.count = 1;
-	lock_sock(sk);
-	queue->nr_cqe = 0;
-	consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
-	release_sock(sk);
-	return consumed == -EAGAIN ? 0 : consumed;
+	return result < 0 ? result : (queue->nr_cqe = nr_cqe);
 }
 
 static void nvme_tcp_io_work(struct work_struct *w)
@@ -1391,7 +1408,7 @@ static void nvme_tcp_io_work(struct work_struct *w)
 				break;
 		}
 
-		result = nvme_tcp_try_recv(queue);
+		result = nvme_tcp_try_recvmsg(queue);
 		if (result > 0)
 			pending = true;
 		else if (unlikely(result < 0))
@@ -2800,7 +2817,7 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
 	set_bit(NVME_TCP_Q_POLLING, &queue->flags);
 	if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
 		sk_busy_loop(sk, true);
-	ret = nvme_tcp_try_recv(queue);
+	ret = nvme_tcp_try_recvmsg(queue);
 	clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
 	return ret < 0 ? ret : queue->nr_cqe;
 }
-- 
2.51.0



^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCHv2] nvme-tcp: Implement recvmsg() receive flow
  2025-10-20  8:58 [PATCHv2] nvme-tcp: Implement recvmsg() receive flow Hannes Reinecke
@ 2025-11-26  7:32 ` Sagi Grimberg
  2025-11-26  8:58   ` Sagi Grimberg
  2025-11-27  7:52   ` Hannes Reinecke
  0 siblings, 2 replies; 13+ messages in thread
From: Sagi Grimberg @ 2025-11-26  7:32 UTC (permalink / raw)
  To: Hannes Reinecke, Christoph Hellwig; +Cc: Keith Busch, linux-nvme



On 20/10/2025 11:58, Hannes Reinecke wrote:
> The nvme-tcp code is using the ->read_sock() interface to
> read data from the wire. While this interface gives us access
> to the skbs themselves (and so might be able to reduce latency)
> it does not interpret the skbs.
> Additionally for TLS these skbs have to be re-constructed from
> the TLS stream data, rendering any advantage questionable.
> But the main drawback for TLS is that we do not get access to
> the TLS control messages, so if we receive any of those message
> the only choice we have is to tear down the connection and restart.
> This patch switches the receive side over to use recvmsg(), which
> provides us full access to the TLS control messages and is also
> more efficient when working with TLS as skbs do not need to be
> artificially constructed.

Hannes,

I generally agree with this approach. I'd like to point out though
that this is going to give up running RX from directly from softirq context.
I've gone back and forth on weather nvme-tcp should do that, but never
got to do a thorough comparison between the two. This probably shuts
the door on that option.

Having said that, I am fine with this approach. recvmsg is a saner 
interface.

>
> Signed-off-by: Hannes Reinecke <hare@kernel.org>
> ---
>   drivers/nvme/host/tcp.c | 203 ++++++++++++++++++++++------------------
>   1 file changed, 110 insertions(+), 93 deletions(-)
>
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index 9a96df1a511c..081f53fa9fc4 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -17,6 +17,7 @@
>   #include <net/tls_prot.h>
>   #include <net/handshake.h>
>   #include <linux/blk-mq.h>
> +#include <linux/iov_iter.h>
>   #include <net/busy_poll.h>
>   #include <trace/events/sock.h>
>   
> @@ -476,6 +477,28 @@ static inline void nvme_tcp_ddgst_update(u32 *crcp,
>   	}
>   }
>   
> +static size_t nvme_tcp_ddgst_step(void *iter_base, size_t progress, size_t len,
> +				  void *priv, void *priv2)
> +{
> +	u32 *crcp = priv;
> +
> +	*crcp = crc32c(*crcp, iter_base, len);
> +	return 0;
> +}
> +
> +static int nvme_tcp_ddgst_calc(struct nvme_tcp_request *req, u32 *crcp,
> +			       size_t maxsize)
> +{
> +	struct iov_iter tmp = req->iter;
> +	int err = 0;
> +
> +	tmp.count = maxsize;
> +	if (iterate_and_advance_kernel(&tmp, maxsize, crcp, &err,
> +				       nvme_tcp_ddgst_step) != maxsize)
> +		return err;
> +	return 0;
> +}
> +
>   static inline __le32 nvme_tcp_ddgst_final(u32 crc)
>   {
>   	return cpu_to_le32(~crc);
> @@ -827,23 +850,26 @@ static void nvme_tcp_handle_c2h_term(struct nvme_tcp_queue *queue,
>   		"Received C2HTermReq (FES = %s)\n", msg);
>   }
>   
> -static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
> -		unsigned int *offset, size_t *len)
> +static int nvme_tcp_recvmsg_pdu(struct nvme_tcp_queue *queue)
>   {
> -	struct nvme_tcp_hdr *hdr;
>   	char *pdu = queue->pdu;
> -	size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
> +	struct msghdr msg = {
> +		.msg_flags = MSG_DONTWAIT,
> +	};
> +	struct kvec iov = {
> +		.iov_base = pdu + queue->pdu_offset,
> +		.iov_len = queue->pdu_remaining,
> +	};
> +	struct nvme_tcp_hdr *hdr;
>   	int ret;
>   
> -	ret = skb_copy_bits(skb, *offset,
> -		&pdu[queue->pdu_offset], rcv_len);
> -	if (unlikely(ret))
> +	ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
> +			     iov.iov_len, msg.msg_flags);
> +	if (ret <= 0)
>   		return ret;
>   
> -	queue->pdu_remaining -= rcv_len;
> -	queue->pdu_offset += rcv_len;
> -	*offset += rcv_len;
> -	*len -= rcv_len;
> +	queue->pdu_remaining -= ret;
> +	queue->pdu_offset += ret;
>   	if (queue->pdu_remaining)
>   		return 0;
>   
> @@ -907,20 +933,19 @@ static inline void nvme_tcp_end_request(struct request *rq, u16 status)
>   		nvme_complete_rq(rq);
>   }
>   
> -static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
> -			      unsigned int *offset, size_t *len)
> +static int nvme_tcp_recvmsg_data(struct nvme_tcp_queue *queue)
>   {
>   	struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
>   	struct request *rq =
>   		nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
>   	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
>   
> -	while (true) {
> -		int recv_len, ret;
> +	if (nvme_tcp_recv_state(queue) != NVME_TCP_RECV_DATA)
> +		return 0;
>   
> -		recv_len = min_t(size_t, *len, queue->data_remaining);
> -		if (!recv_len)
> -			break;
> +	while (queue->data_remaining) {
> +		struct msghdr msg;
> +		int ret;
>   
>   		if (!iov_iter_count(&req->iter)) {
>   			req->curr_bio = req->curr_bio->bi_next;
> @@ -940,25 +965,22 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
>   		}
>   
>   		/* we can read only from what is left in this bio */
> -		recv_len = min_t(size_t, recv_len,
> -				iov_iter_count(&req->iter));
> +		memset(&msg, 0, sizeof(msg));
> +		msg.msg_iter = req->iter;
> +		msg.msg_flags = MSG_DONTWAIT;
>   
> -		if (queue->data_digest)
> -			ret = skb_copy_and_crc32c_datagram_iter(skb, *offset,
> -				&req->iter, recv_len, &queue->rcv_crc);
> -		else
> -			ret = skb_copy_datagram_iter(skb, *offset,
> -					&req->iter, recv_len);
> -		if (ret) {
> +		ret = sock_recvmsg(queue->sock, &msg, msg.msg_flags);
> +		if (ret < 0) {
>   			dev_err(queue->ctrl->ctrl.device,
> -				"queue %d failed to copy request %#x data",
> +				"queue %d failed to receive request %#x data",
>   				nvme_tcp_queue_id(queue), rq->tag);
>   			return ret;
>   		}
> -
> -		*len -= recv_len;
> -		*offset += recv_len;
> -		queue->data_remaining -= recv_len;
> +		if (queue->data_digest)
> +			nvme_tcp_ddgst_calc(req, &queue->rcv_crc, ret);
> +		queue->data_remaining -= ret;
> +		if (queue->data_remaining)
> +			nvme_tcp_advance_req(req, ret);
>   	}
>   
>   	if (!queue->data_remaining) {
> @@ -968,7 +990,7 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
>   		} else {
>   			if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
>   				nvme_tcp_end_request(rq,
> -						le16_to_cpu(req->status));
> +						     le16_to_cpu(req->status));
>   				queue->nr_cqe++;
>   			}
>   			nvme_tcp_init_recv_ctx(queue);
> @@ -978,24 +1000,9 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
>   	return 0;
>   }
>   
> -static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
> -		struct sk_buff *skb, unsigned int *offset, size_t *len)
> +static int __nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue)
>   {
>   	struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
> -	char *ddgst = (char *)&queue->recv_ddgst;
> -	size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
> -	off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
> -	int ret;
> -
> -	ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
> -	if (unlikely(ret))
> -		return ret;
> -
> -	queue->ddgst_remaining -= recv_len;
> -	*offset += recv_len;
> -	*len -= recv_len;
> -	if (queue->ddgst_remaining)
> -		return 0;
>   
>   	if (queue->recv_ddgst != queue->exp_ddgst) {
>   		struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue),
> @@ -1023,40 +1030,32 @@ static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
>   	return 0;
>   }
>   
> -static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
> -			     unsigned int offset, size_t len)
> +static int nvme_tcp_recvmsg_ddgst(struct nvme_tcp_queue *queue)
>   {
> -	struct nvme_tcp_queue *queue = desc->arg.data;
> -	size_t consumed = len;
> -	int result;
> +	char *ddgst = (char *)&queue->recv_ddgst;
> +	off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
> +	struct msghdr msg = {
> +		.msg_flags = MSG_WAITALL,
> +	};

I don't think we want to use MSG_WAITALL ever. Why are you opting to do 
that?

> +	struct kvec iov = {
> +		.iov_base = (u8 *)ddgst + off,
> +		.iov_len = queue->ddgst_remaining,
> +	};
> +	int ret;
>   
> -	if (unlikely(!queue->rd_enabled))
> -		return -EFAULT;
> +	if (nvme_tcp_recv_state(queue) != NVME_TCP_RECV_DDGST)
> +		return 0;
>   
> -	while (len) {
> -		switch (nvme_tcp_recv_state(queue)) {
> -		case NVME_TCP_RECV_PDU:
> -			result = nvme_tcp_recv_pdu(queue, skb, &offset, &len);
> -			break;
> -		case NVME_TCP_RECV_DATA:
> -			result = nvme_tcp_recv_data(queue, skb, &offset, &len);
> -			break;
> -		case NVME_TCP_RECV_DDGST:
> -			result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len);
> -			break;
> -		default:
> -			result = -EFAULT;
> -		}
> -		if (result) {
> -			dev_err(queue->ctrl->ctrl.device,
> -				"receive failed:  %d\n", result);
> -			queue->rd_enabled = false;
> -			nvme_tcp_error_recovery(&queue->ctrl->ctrl);
> -			return result;
> -		}
> -	}
> +	ret = kernel_recvmsg(queue->sock, &msg, &iov, 1, iov.iov_len,
> +			     msg.msg_flags);
> +	if (ret <= 0)
> +		return ret;
>   
> -	return consumed;
> +	queue->ddgst_remaining -= ret;
> +	if (queue->ddgst_remaining)
> +		return 0;
> +
> +	return __nvme_tcp_recv_ddgst(queue);
>   }
>   
>   static void nvme_tcp_data_ready(struct sock *sk)
> @@ -1356,20 +1355,38 @@ static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
>   	return ret;
>   }
>   
> -static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
> +static int nvme_tcp_try_recvmsg(struct nvme_tcp_queue *queue)
>   {
> -	struct socket *sock = queue->sock;
> -	struct sock *sk = sock->sk;
> -	read_descriptor_t rd_desc;
> -	int consumed;
> +	int result;
> +	int nr_cqe = queue->nr_cqe;
> +
> +	if (unlikely(!queue->rd_enabled))
> +		return -EFAULT;
> +
> +	do {
> +		switch (nvme_tcp_recv_state(queue)) {
> +		case NVME_TCP_RECV_PDU:
> +			result = nvme_tcp_recvmsg_pdu(queue);
> +			break;
> +		case NVME_TCP_RECV_DATA:
> +			result = nvme_tcp_recvmsg_data(queue);
> +			break;
> +		case NVME_TCP_RECV_DDGST:
> +			result = nvme_tcp_recvmsg_ddgst(queue);
> +			break;
> +		default:
> +			result = -EFAULT;
> +		}
> +	} while (result >= 0);
> +
> +	if (result < 0 && result != -EAGAIN) {
> +		dev_err(queue->ctrl->ctrl.device,
> +			"receive failed:  %d\n", result);
> +		queue->rd_enabled = false;
> +		nvme_tcp_error_recovery(&queue->ctrl->ctrl);
> +	}
>   
> -	rd_desc.arg.data = queue;
> -	rd_desc.count = 1;
> -	lock_sock(sk);
> -	queue->nr_cqe = 0;
> -	consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
> -	release_sock(sk);
> -	return consumed == -EAGAIN ? 0 : consumed;
> +	return result < 0 ? result : (queue->nr_cqe = nr_cqe);

This is changing the semantics of the retcode - can you please document 
in the commit msg
the change?

>   }
>   
>   static void nvme_tcp_io_work(struct work_struct *w)
> @@ -1391,7 +1408,7 @@ static void nvme_tcp_io_work(struct work_struct *w)
>   				break;
>   		}
>   
> -		result = nvme_tcp_try_recv(queue);
> +		result = nvme_tcp_try_recvmsg(queue);
>   		if (result > 0)
>   			pending = true;
>   		else if (unlikely(result < 0))
> @@ -2800,7 +2817,7 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
>   	set_bit(NVME_TCP_Q_POLLING, &queue->flags);
>   	if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
>   		sk_busy_loop(sk, true);
> -	ret = nvme_tcp_try_recv(queue);
> +	ret = nvme_tcp_try_recvmsg(queue);
>   	clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
>   	return ret < 0 ? ret : queue->nr_cqe;
>   }



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCHv2] nvme-tcp: Implement recvmsg() receive flow
  2025-11-26  7:32 ` Sagi Grimberg
@ 2025-11-26  8:58   ` Sagi Grimberg
  2025-11-27  7:53     ` Hannes Reinecke
  2025-11-27  7:52   ` Hannes Reinecke
  1 sibling, 1 reply; 13+ messages in thread
From: Sagi Grimberg @ 2025-11-26  8:58 UTC (permalink / raw)
  To: Hannes Reinecke, Christoph Hellwig; +Cc: Keith Busch, linux-nvme



On 26/11/2025 9:32, Sagi Grimberg wrote:
>
>
> On 20/10/2025 11:58, Hannes Reinecke wrote:
>> The nvme-tcp code is using the ->read_sock() interface to
>> read data from the wire. While this interface gives us access
>> to the skbs themselves (and so might be able to reduce latency)
>> it does not interpret the skbs.
>> Additionally for TLS these skbs have to be re-constructed from
>> the TLS stream data, rendering any advantage questionable.
>> But the main drawback for TLS is that we do not get access to
>> the TLS control messages, so if we receive any of those message
>> the only choice we have is to tear down the connection and restart.
>> This patch switches the receive side over to use recvmsg(), which
>> provides us full access to the TLS control messages and is also
>> more efficient when working with TLS as skbs do not need to be
>> artificially constructed.
>
> Hannes,
>
> I generally agree with this approach. I'd like to point out though
> that this is going to give up running RX from directly from softirq 
> context.
> I've gone back and forth on weather nvme-tcp should do that, but never
> got to do a thorough comparison between the two. This probably shuts
> the door on that option.
>
> Having said that, I am fine with this approach. recvmsg is a saner 
> interface.
>
>>
>> Signed-off-by: Hannes Reinecke <hare@kernel.org>
>> ---
>>   drivers/nvme/host/tcp.c | 203 ++++++++++++++++++++++------------------
>>   1 file changed, 110 insertions(+), 93 deletions(-)
>>
>> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
>> index 9a96df1a511c..081f53fa9fc4 100644
>> --- a/drivers/nvme/host/tcp.c
>> +++ b/drivers/nvme/host/tcp.c
>> @@ -17,6 +17,7 @@
>>   #include <net/tls_prot.h>
>>   #include <net/handshake.h>
>>   #include <linux/blk-mq.h>
>> +#include <linux/iov_iter.h>
>>   #include <net/busy_poll.h>
>>   #include <trace/events/sock.h>
>>   @@ -476,6 +477,28 @@ static inline void nvme_tcp_ddgst_update(u32 
>> *crcp,
>>       }
>>   }
>>   +static size_t nvme_tcp_ddgst_step(void *iter_base, size_t 
>> progress, size_t len,
>> +                  void *priv, void *priv2)
>> +{
>> +    u32 *crcp = priv;
>> +
>> +    *crcp = crc32c(*crcp, iter_base, len);
>> +    return 0;
>> +}
>> +
>> +static int nvme_tcp_ddgst_calc(struct nvme_tcp_request *req, u32 *crcp,
>> +                   size_t maxsize)
>> +{
>> +    struct iov_iter tmp = req->iter;
>> +    int err = 0;
>> +
>> +    tmp.count = maxsize;
>> +    if (iterate_and_advance_kernel(&tmp, maxsize, crcp, &err,
>> +                       nvme_tcp_ddgst_step) != maxsize)
>> +        return err;
>> +    return 0;
>> +}
>> +
>>   static inline __le32 nvme_tcp_ddgst_final(u32 crc)
>>   {
>>       return cpu_to_le32(~crc);
>> @@ -827,23 +850,26 @@ static void nvme_tcp_handle_c2h_term(struct 
>> nvme_tcp_queue *queue,
>>           "Received C2HTermReq (FES = %s)\n", msg);
>>   }
>>   -static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct 
>> sk_buff *skb,
>> -        unsigned int *offset, size_t *len)
>> +static int nvme_tcp_recvmsg_pdu(struct nvme_tcp_queue *queue)
>>   {
>> -    struct nvme_tcp_hdr *hdr;
>>       char *pdu = queue->pdu;
>> -    size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
>> +    struct msghdr msg = {
>> +        .msg_flags = MSG_DONTWAIT,
>> +    };
>> +    struct kvec iov = {
>> +        .iov_base = pdu + queue->pdu_offset,
>> +        .iov_len = queue->pdu_remaining,
>> +    };
>> +    struct nvme_tcp_hdr *hdr;
>>       int ret;
>>   -    ret = skb_copy_bits(skb, *offset,
>> -        &pdu[queue->pdu_offset], rcv_len);
>> -    if (unlikely(ret))
>> +    ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
>> +                 iov.iov_len, msg.msg_flags);
>> +    if (ret <= 0)
>>           return ret;
>>   -    queue->pdu_remaining -= rcv_len;
>> -    queue->pdu_offset += rcv_len;
>> -    *offset += rcv_len;
>> -    *len -= rcv_len;
>> +    queue->pdu_remaining -= ret;
>> +    queue->pdu_offset += ret;
>>       if (queue->pdu_remaining)
>>           return 0;
>>   @@ -907,20 +933,19 @@ static inline void 
>> nvme_tcp_end_request(struct request *rq, u16 status)
>>           nvme_complete_rq(rq);
>>   }
>>   -static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct 
>> sk_buff *skb,
>> -                  unsigned int *offset, size_t *len)
>> +static int nvme_tcp_recvmsg_data(struct nvme_tcp_queue *queue)
>>   {
>>       struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
>>       struct request *rq =
>>           nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
>>       struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
>>   -    while (true) {
>> -        int recv_len, ret;
>> +    if (nvme_tcp_recv_state(queue) != NVME_TCP_RECV_DATA)
>> +        return 0;
>>   -        recv_len = min_t(size_t, *len, queue->data_remaining);
>> -        if (!recv_len)
>> -            break;
>> +    while (queue->data_remaining) {
>> +        struct msghdr msg;
>> +        int ret;
>>             if (!iov_iter_count(&req->iter)) {
>>               req->curr_bio = req->curr_bio->bi_next;
>> @@ -940,25 +965,22 @@ static int nvme_tcp_recv_data(struct 
>> nvme_tcp_queue *queue, struct sk_buff *skb,
>>           }
>>             /* we can read only from what is left in this bio */
>> -        recv_len = min_t(size_t, recv_len,
>> -                iov_iter_count(&req->iter));
>> +        memset(&msg, 0, sizeof(msg));
>> +        msg.msg_iter = req->iter;
>> +        msg.msg_flags = MSG_DONTWAIT;

Shouldn't this be initialized outside of the loop? then the loop becomes 
while (msg_data_left(&msg)) ?

>>   -        if (queue->data_digest)
>> -            ret = skb_copy_and_crc32c_datagram_iter(skb, *offset,
>> -                &req->iter, recv_len, &queue->rcv_crc);
>> -        else
>> -            ret = skb_copy_datagram_iter(skb, *offset,
>> -                    &req->iter, recv_len);
>> -        if (ret) {
>> +        ret = sock_recvmsg(queue->sock, &msg, msg.msg_flags);
>> +        if (ret < 0) {
>>               dev_err(queue->ctrl->ctrl.device,
>> -                "queue %d failed to copy request %#x data",
>> +                "queue %d failed to receive request %#x data",
>>                   nvme_tcp_queue_id(queue), rq->tag);
>>               return ret;
>>           }
>> -
>> -        *len -= recv_len;
>> -        *offset += recv_len;
>> -        queue->data_remaining -= recv_len;
>> +        if (queue->data_digest)
>> +            nvme_tcp_ddgst_calc(req, &queue->rcv_crc, ret);
>> +        queue->data_remaining -= ret;
>> +        if (queue->data_remaining)
>> +            nvme_tcp_advance_req(req, ret);
>>       }
>>         if (!queue->data_remaining) {
>> @@ -968,7 +990,7 @@ static int nvme_tcp_recv_data(struct 
>> nvme_tcp_queue *queue, struct sk_buff *skb,
>>           } else {
>>               if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
>>                   nvme_tcp_end_request(rq,
>> -                        le16_to_cpu(req->status));
>> +                             le16_to_cpu(req->status));
>>                   queue->nr_cqe++;
>>               }
>>               nvme_tcp_init_recv_ctx(queue);
>> @@ -978,24 +1000,9 @@ static int nvme_tcp_recv_data(struct 
>> nvme_tcp_queue *queue, struct sk_buff *skb,
>>       return 0;
>>   }
>>   -static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
>> -        struct sk_buff *skb, unsigned int *offset, size_t *len)
>> +static int __nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue)
>>   {
>>       struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
>> -    char *ddgst = (char *)&queue->recv_ddgst;
>> -    size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
>> -    off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
>> -    int ret;
>> -
>> -    ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
>> -    if (unlikely(ret))
>> -        return ret;
>> -
>> -    queue->ddgst_remaining -= recv_len;
>> -    *offset += recv_len;
>> -    *len -= recv_len;
>> -    if (queue->ddgst_remaining)
>> -        return 0;
>>         if (queue->recv_ddgst != queue->exp_ddgst) {
>>           struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue),
>> @@ -1023,40 +1030,32 @@ static int nvme_tcp_recv_ddgst(struct 
>> nvme_tcp_queue *queue,
>>       return 0;
>>   }
>>   -static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct 
>> sk_buff *skb,
>> -                 unsigned int offset, size_t len)
>> +static int nvme_tcp_recvmsg_ddgst(struct nvme_tcp_queue *queue)
>>   {
>> -    struct nvme_tcp_queue *queue = desc->arg.data;
>> -    size_t consumed = len;
>> -    int result;
>> +    char *ddgst = (char *)&queue->recv_ddgst;
>> +    off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
>> +    struct msghdr msg = {
>> +        .msg_flags = MSG_WAITALL,
>> +    };
>
> I don't think we want to use MSG_WAITALL ever. Why are you opting to 
> do that?
>
>> +    struct kvec iov = {
>> +        .iov_base = (u8 *)ddgst + off,
>> +        .iov_len = queue->ddgst_remaining,
>> +    };
>> +    int ret;
>>   -    if (unlikely(!queue->rd_enabled))
>> -        return -EFAULT;
>> +    if (nvme_tcp_recv_state(queue) != NVME_TCP_RECV_DDGST)
>> +        return 0;
>>   -    while (len) {
>> -        switch (nvme_tcp_recv_state(queue)) {
>> -        case NVME_TCP_RECV_PDU:
>> -            result = nvme_tcp_recv_pdu(queue, skb, &offset, &len);
>> -            break;
>> -        case NVME_TCP_RECV_DATA:
>> -            result = nvme_tcp_recv_data(queue, skb, &offset, &len);
>> -            break;
>> -        case NVME_TCP_RECV_DDGST:
>> -            result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len);
>> -            break;
>> -        default:
>> -            result = -EFAULT;
>> -        }
>> -        if (result) {
>> -            dev_err(queue->ctrl->ctrl.device,
>> -                "receive failed:  %d\n", result);
>> -            queue->rd_enabled = false;
>> - nvme_tcp_error_recovery(&queue->ctrl->ctrl);
>> -            return result;
>> -        }
>> -    }
>> +    ret = kernel_recvmsg(queue->sock, &msg, &iov, 1, iov.iov_len,
>> +                 msg.msg_flags);
>> +    if (ret <= 0)
>> +        return ret;
>>   -    return consumed;
>> +    queue->ddgst_remaining -= ret;
>> +    if (queue->ddgst_remaining)
>> +        return 0;
>> +
>> +    return __nvme_tcp_recv_ddgst(queue);
>>   }
>>     static void nvme_tcp_data_ready(struct sock *sk)
>> @@ -1356,20 +1355,38 @@ static int nvme_tcp_try_send(struct 
>> nvme_tcp_queue *queue)
>>       return ret;
>>   }
>>   -static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
>> +static int nvme_tcp_try_recvmsg(struct nvme_tcp_queue *queue)
>>   {
>> -    struct socket *sock = queue->sock;
>> -    struct sock *sk = sock->sk;
>> -    read_descriptor_t rd_desc;
>> -    int consumed;
>> +    int result;
>> +    int nr_cqe = queue->nr_cqe;
>> +
>> +    if (unlikely(!queue->rd_enabled))
>> +        return -EFAULT;
>> +
>> +    do {
>> +        switch (nvme_tcp_recv_state(queue)) {
>> +        case NVME_TCP_RECV_PDU:
>> +            result = nvme_tcp_recvmsg_pdu(queue);
>> +            break;
>> +        case NVME_TCP_RECV_DATA:
>> +            result = nvme_tcp_recvmsg_data(queue);
>> +            break;
>> +        case NVME_TCP_RECV_DDGST:
>> +            result = nvme_tcp_recvmsg_ddgst(queue);
>> +            break;
>> +        default:
>> +            result = -EFAULT;
>> +        }
>> +    } while (result >= 0);
>> +
>> +    if (result < 0 && result != -EAGAIN) {
>> +        dev_err(queue->ctrl->ctrl.device,
>> +            "receive failed:  %d\n", result);
>> +        queue->rd_enabled = false;
>> +        nvme_tcp_error_recovery(&queue->ctrl->ctrl);
>> +    }
>>   -    rd_desc.arg.data = queue;
>> -    rd_desc.count = 1;
>> -    lock_sock(sk);
>> -    queue->nr_cqe = 0;
>> -    consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
>> -    release_sock(sk);
>> -    return consumed == -EAGAIN ? 0 : consumed;
>> +    return result < 0 ? result : (queue->nr_cqe = nr_cqe);
>
> This is changing the semantics of the retcode - can you please 
> document in the commit msg
> the change?
>
>>   }
>>     static void nvme_tcp_io_work(struct work_struct *w)
>> @@ -1391,7 +1408,7 @@ static void nvme_tcp_io_work(struct work_struct 
>> *w)
>>                   break;
>>           }
>>   -        result = nvme_tcp_try_recv(queue);
>> +        result = nvme_tcp_try_recvmsg(queue);
>>           if (result > 0)
>>               pending = true;
>>           else if (unlikely(result < 0))
>> @@ -2800,7 +2817,7 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx 
>> *hctx, struct io_comp_batch *iob)
>>       set_bit(NVME_TCP_Q_POLLING, &queue->flags);
>>       if (sk_can_busy_loop(sk) && 
>> skb_queue_empty_lockless(&sk->sk_receive_queue))
>>           sk_busy_loop(sk, true);
>> -    ret = nvme_tcp_try_recv(queue);
>> +    ret = nvme_tcp_try_recvmsg(queue);
>>       clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
>>       return ret < 0 ? ret : queue->nr_cqe;
>>   }
>



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCHv2] nvme-tcp: Implement recvmsg() receive flow
  2025-11-26  7:32 ` Sagi Grimberg
  2025-11-26  8:58   ` Sagi Grimberg
@ 2025-11-27  7:52   ` Hannes Reinecke
  2025-11-27 11:47     ` Hannes Reinecke
  2025-11-30 21:35     ` Sagi Grimberg
  1 sibling, 2 replies; 13+ messages in thread
From: Hannes Reinecke @ 2025-11-27  7:52 UTC (permalink / raw)
  To: Sagi Grimberg, Hannes Reinecke, Christoph Hellwig; +Cc: Keith Busch, linux-nvme

On 11/26/25 08:32, Sagi Grimberg wrote:
> 
> 
> On 20/10/2025 11:58, Hannes Reinecke wrote:
>> The nvme-tcp code is using the ->read_sock() interface to
>> read data from the wire. While this interface gives us access
>> to the skbs themselves (and so might be able to reduce latency)
>> it does not interpret the skbs.
>> Additionally for TLS these skbs have to be re-constructed from
>> the TLS stream data, rendering any advantage questionable.
>> But the main drawback for TLS is that we do not get access to
>> the TLS control messages, so if we receive any of those message
>> the only choice we have is to tear down the connection and restart.
>> This patch switches the receive side over to use recvmsg(), which
>> provides us full access to the TLS control messages and is also
>> more efficient when working with TLS as skbs do not need to be
>> artificially constructed.
> 
> Hannes,
> 
> I generally agree with this approach. I'd like to point out though
> that this is going to give up running RX from directly from softirq 
> context.

Yes.

> I've gone back and forth on weather nvme-tcp should do that, but never
> got to do a thorough comparison between the two. This probably shuts
> the door on that option.
> 
The thing with running from softirq context is that it would only
make sense if we could _ensure_ that the softirq context is running
on the cpu where the blk-mq hardware context is expecting it to.
Not only would that require fiddling with RFS contexts, but we also
found that NVMe-over-fabrics should _not_ try to align with hardware
interrupts but rather rely on the driver to abstract things away.

> Having said that, I am fine with this approach. recvmsg is a saner 
> interface.
> 
Thanks!


[ .. ]>> @@ -1023,40 +1030,32 @@ static int nvme_tcp_recv_ddgst(struct
>> nvme_tcp_queue *queue,
>>       return 0;
>>   }
>> -static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff 
>> *skb,
>> -                 unsigned int offset, size_t len)
>> +static int nvme_tcp_recvmsg_ddgst(struct nvme_tcp_queue *queue)
>>   {
>> -    struct nvme_tcp_queue *queue = desc->arg.data;
>> -    size_t consumed = len;
>> -    int result;
>> +    char *ddgst = (char *)&queue->recv_ddgst;
>> +    off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
>> +    struct msghdr msg = {
>> +        .msg_flags = MSG_WAITALL,
>> +    };
> 
> I don't think we want to use MSG_WAITALL ever. Why are you opting to do 
> that?
> 
Hmm. You are right, it's not needed.

[ .. ]>> @@ -1356,20 +1355,38 @@ static int nvme_tcp_try_send(struct
>> nvme_tcp_queue *queue)
>>       return ret;
>>   }
>> -static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
>> +static int nvme_tcp_try_recvmsg(struct nvme_tcp_queue *queue)
>>   {
>> -    struct socket *sock = queue->sock;
>> -    struct sock *sk = sock->sk;
>> -    read_descriptor_t rd_desc;
>> -    int consumed;
>> +    int result;
>> +    int nr_cqe = queue->nr_cqe;
>> +
>> +    if (unlikely(!queue->rd_enabled))
>> +        return -EFAULT;
>> +
>> +    do {
>> +        switch (nvme_tcp_recv_state(queue)) {
>> +        case NVME_TCP_RECV_PDU:
>> +            result = nvme_tcp_recvmsg_pdu(queue);
>> +            break;
>> +        case NVME_TCP_RECV_DATA:
>> +            result = nvme_tcp_recvmsg_data(queue);
>> +            break;
>> +        case NVME_TCP_RECV_DDGST:
>> +            result = nvme_tcp_recvmsg_ddgst(queue);
>> +            break;
>> +        default:
>> +            result = -EFAULT;
>> +        }
>> +    } while (result >= 0);
>> +
>> +    if (result < 0 && result != -EAGAIN) {
>> +        dev_err(queue->ctrl->ctrl.device,
>> +            "receive failed:  %d\n", result);
>> +        queue->rd_enabled = false;
>> +        nvme_tcp_error_recovery(&queue->ctrl->ctrl);
>> +    }
>> -    rd_desc.arg.data = queue;
>> -    rd_desc.count = 1;
>> -    lock_sock(sk);
>> -    queue->nr_cqe = 0;
>> -    consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
>> -    release_sock(sk);
>> -    return consumed == -EAGAIN ? 0 : consumed;
>> +    return result < 0 ? result : (queue->nr_cqe = nr_cqe);
> 
> This is changing the semantics of the retcode - can you please document 
> in the commit msg
> the change?
> 
That was done to be compatible with the polling interface.
I'll check if it's needed after all.

>>   }
>>   static void nvme_tcp_io_work(struct work_struct *w)
>> @@ -1391,7 +1408,7 @@ static void nvme_tcp_io_work(struct work_struct *w)
>>                   break;
>>           }
>> -        result = nvme_tcp_try_recv(queue);
>> +        result = nvme_tcp_try_recvmsg(queue);
>>           if (result > 0)
>>               pending = true;
>>           else if (unlikely(result < 0))
>> @@ -2800,7 +2817,7 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx 
>> *hctx, struct io_comp_batch *iob)
>>       set_bit(NVME_TCP_Q_POLLING, &queue->flags);
>>       if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk- 
>> >sk_receive_queue))
>>           sk_busy_loop(sk, true);
>> -    ret = nvme_tcp_try_recv(queue);
>> +    ret = nvme_tcp_try_recvmsg(queue);
>>       clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
>>       return ret < 0 ? ret : queue->nr_cqe;
>>   
Cheers,
Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCHv2] nvme-tcp: Implement recvmsg() receive flow
  2025-11-26  8:58   ` Sagi Grimberg
@ 2025-11-27  7:53     ` Hannes Reinecke
  2025-11-27 11:37       ` Hannes Reinecke
  0 siblings, 1 reply; 13+ messages in thread
From: Hannes Reinecke @ 2025-11-27  7:53 UTC (permalink / raw)
  To: Sagi Grimberg, Hannes Reinecke, Christoph Hellwig; +Cc: Keith Busch, linux-nvme

On 11/26/25 09:58, Sagi Grimberg wrote:
> 
> 
> On 26/11/2025 9:32, Sagi Grimberg wrote:
>>
>>
>> On 20/10/2025 11:58, Hannes Reinecke wrote:
>>> The nvme-tcp code is using the ->read_sock() interface to
>>> read data from the wire. While this interface gives us access
>>> to the skbs themselves (and so might be able to reduce latency)
>>> it does not interpret the skbs.
>>> Additionally for TLS these skbs have to be re-constructed from
>>> the TLS stream data, rendering any advantage questionable.
>>> But the main drawback for TLS is that we do not get access to
>>> the TLS control messages, so if we receive any of those message
>>> the only choice we have is to tear down the connection and restart.
>>> This patch switches the receive side over to use recvmsg(), which
>>> provides us full access to the TLS control messages and is also
>>> more efficient when working with TLS as skbs do not need to be
>>> artificially constructed.
>>
>> Hannes,
>>
[ .. ]>>> @@ -940,25 +965,22 @@ static int nvme_tcp_recv_data(struct
>>> nvme_tcp_queue *queue, struct sk_buff *skb,
>>>           }
>>>             /* we can read only from what is left in this bio */
>>> -        recv_len = min_t(size_t, recv_len,
>>> -                iov_iter_count(&req->iter));
>>> +        memset(&msg, 0, sizeof(msg));
>>> +        msg.msg_iter = req->iter;
>>> +        msg.msg_flags = MSG_DONTWAIT;
> 
> Shouldn't this be initialized outside of the loop? then the loop becomes 
> while (msg_data_left(&msg)) ?
> 
I'll check.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCHv2] nvme-tcp: Implement recvmsg() receive flow
  2025-11-27  7:53     ` Hannes Reinecke
@ 2025-11-27 11:37       ` Hannes Reinecke
  0 siblings, 0 replies; 13+ messages in thread
From: Hannes Reinecke @ 2025-11-27 11:37 UTC (permalink / raw)
  To: Sagi Grimberg, Hannes Reinecke, Christoph Hellwig; +Cc: Keith Busch, linux-nvme

On 11/27/25 08:53, Hannes Reinecke wrote:
> On 11/26/25 09:58, Sagi Grimberg wrote:
>>
>>
>> On 26/11/2025 9:32, Sagi Grimberg wrote:
>>>
>>>
>>> On 20/10/2025 11:58, Hannes Reinecke wrote:
>>>> The nvme-tcp code is using the ->read_sock() interface to
>>>> read data from the wire. While this interface gives us access
>>>> to the skbs themselves (and so might be able to reduce latency)
>>>> it does not interpret the skbs.
>>>> Additionally for TLS these skbs have to be re-constructed from
>>>> the TLS stream data, rendering any advantage questionable.
>>>> But the main drawback for TLS is that we do not get access to
>>>> the TLS control messages, so if we receive any of those message
>>>> the only choice we have is to tear down the connection and restart.
>>>> This patch switches the receive side over to use recvmsg(), which
>>>> provides us full access to the TLS control messages and is also
>>>> more efficient when working with TLS as skbs do not need to be
>>>> artificially constructed.
>>>
>>> Hannes,
>>>
> [ .. ]>>> @@ -940,25 +965,22 @@ static int nvme_tcp_recv_data(struct
>>>> nvme_tcp_queue *queue, struct sk_buff *skb,
>>>>           }
>>>>             /* we can read only from what is left in this bio */
>>>> -        recv_len = min_t(size_t, recv_len,
>>>> -                iov_iter_count(&req->iter));
>>>> +        memset(&msg, 0, sizeof(msg));
>>>> +        msg.msg_iter = req->iter;
>>>> +        msg.msg_flags = MSG_DONTWAIT;
>>
>> Shouldn't this be initialized outside of the loop? then the loop 
>> becomes while (msg_data_left(&msg)) ?
>>
> I'll check.
> 
Ah, no, we can't.
The conditional above might move to the next bio, and reset 'req->iter'
to point to the next bio iterator.
So we need re-initialize the message here.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCHv2] nvme-tcp: Implement recvmsg() receive flow
  2025-11-27  7:52   ` Hannes Reinecke
@ 2025-11-27 11:47     ` Hannes Reinecke
  2025-11-30 21:38       ` Sagi Grimberg
  2025-11-30 21:35     ` Sagi Grimberg
  1 sibling, 1 reply; 13+ messages in thread
From: Hannes Reinecke @ 2025-11-27 11:47 UTC (permalink / raw)
  To: Sagi Grimberg, Hannes Reinecke, Christoph Hellwig; +Cc: Keith Busch, linux-nvme

On 11/27/25 08:52, Hannes Reinecke wrote:
> On 11/26/25 08:32, Sagi Grimberg wrote:
>>
>>
>> On 20/10/2025 11:58, Hannes Reinecke wrote:
>>> The nvme-tcp code is using the ->read_sock() interface to
>>> read data from the wire. While this interface gives us access
>>> to the skbs themselves (and so might be able to reduce latency)
>>> it does not interpret the skbs.
>>> Additionally for TLS these skbs have to be re-constructed from
>>> the TLS stream data, rendering any advantage questionable.
>>> But the main drawback for TLS is that we do not get access to
>>> the TLS control messages, so if we receive any of those message
>>> the only choice we have is to tear down the connection and restart.
>>> This patch switches the receive side over to use recvmsg(), which
>>> provides us full access to the TLS control messages and is also
>>> more efficient when working with TLS as skbs do not need to be
>>> artificially constructed.
>>
>> Hannes,
>>
>> I generally agree with this approach. I'd like to point out though
>> that this is going to give up running RX from directly from softirq 
>> context.
> 
> Yes.
> 
>> I've gone back and forth on weather nvme-tcp should do that, but never
>> got to do a thorough comparison between the two. This probably shuts
>> the door on that option.
>>
> The thing with running from softirq context is that it would only
> make sense if we could _ensure_ that the softirq context is running
> on the cpu where the blk-mq hardware context is expecting it to.
> Not only would that require fiddling with RFS contexts, but we also
> found that NVMe-over-fabrics should _not_ try to align with hardware
> interrupts but rather rely on the driver to abstract things away.
> 
>> Having said that, I am fine with this approach. recvmsg is a saner 
>> interface.
>>
> Thanks!
> 
> 
> [ .. ]>> @@ -1023,40 +1030,32 @@ static int nvme_tcp_recv_ddgst(struct
>>> nvme_tcp_queue *queue,
>>>       return 0;
>>>   }
>>> -static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff 
>>> *skb,
>>> -                 unsigned int offset, size_t len)
>>> +static int nvme_tcp_recvmsg_ddgst(struct nvme_tcp_queue *queue)
>>>   {
>>> -    struct nvme_tcp_queue *queue = desc->arg.data;
>>> -    size_t consumed = len;
>>> -    int result;
>>> +    char *ddgst = (char *)&queue->recv_ddgst;
>>> +    off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
>>> +    struct msghdr msg = {
>>> +        .msg_flags = MSG_WAITALL,
>>> +    };
>>
>> I don't think we want to use MSG_WAITALL ever. Why are you opting to 
>> do that?
>>
> Hmm. You are right, it's not needed.
> 
> [ .. ]>> @@ -1356,20 +1355,38 @@ static int nvme_tcp_try_send(struct
>>> nvme_tcp_queue *queue)
>>>       return ret;
>>>   }
>>> -static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
>>> +static int nvme_tcp_try_recvmsg(struct nvme_tcp_queue *queue)
>>>   {
>>> -    struct socket *sock = queue->sock;
>>> -    struct sock *sk = sock->sk;
>>> -    read_descriptor_t rd_desc;
>>> -    int consumed;
>>> +    int result;
>>> +    int nr_cqe = queue->nr_cqe;
>>> +
>>> +    if (unlikely(!queue->rd_enabled))
>>> +        return -EFAULT;
>>> +
>>> +    do {
>>> +        switch (nvme_tcp_recv_state(queue)) {
>>> +        case NVME_TCP_RECV_PDU:
>>> +            result = nvme_tcp_recvmsg_pdu(queue);
>>> +            break;
>>> +        case NVME_TCP_RECV_DATA:
>>> +            result = nvme_tcp_recvmsg_data(queue);
>>> +            break;
>>> +        case NVME_TCP_RECV_DDGST:
>>> +            result = nvme_tcp_recvmsg_ddgst(queue);
>>> +            break;
>>> +        default:
>>> +            result = -EFAULT;
>>> +        }
>>> +    } while (result >= 0);
>>> +
>>> +    if (result < 0 && result != -EAGAIN) {
>>> +        dev_err(queue->ctrl->ctrl.device,
>>> +            "receive failed:  %d\n", result);
>>> +        queue->rd_enabled = false;
>>> +        nvme_tcp_error_recovery(&queue->ctrl->ctrl);
>>> +    }
>>> -    rd_desc.arg.data = queue;
>>> -    rd_desc.count = 1;
>>> -    lock_sock(sk);
>>> -    queue->nr_cqe = 0;
>>> -    consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
>>> -    release_sock(sk);
>>> -    return consumed == -EAGAIN ? 0 : consumed;
>>> +    return result < 0 ? result : (queue->nr_cqe = nr_cqe);
>>
>> This is changing the semantics of the retcode - can you please 
>> document in the commit msg
>> the change?
>>
> That was done to be compatible with the polling interface.
> I'll check if it's needed after all.
> 
Actually, I don't think I'm changing the interface.
The original interface would return either a negative number on error,
'0' if no PDUs have been processed, or the number of processed PDUs.
And the new recvmsg interface does the same (I think).
The only change which I did was to _not_ map out -EAGAIN, as this will
be handled by the 'data_ready' callback.
(Reasoning: -EAGAIN is returned when there is no (or not enough) data
to be read, and the network layer will call ->data_ready once the
situation improves. If we continue with the loop we will race/compete
with the networking layer, causing us to do duplicated work as the
io_work thread will be picking up the new data _and_ the data_ready
callback will be scheduling the io_work callback to pick up the new
data. So we can safely stop on -EAGAIN knowing that we'll be woken
up once we can continue to read data.).
(And arguably this was an issue with the original code, too.).

So where is the change in semantics?

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCHv2] nvme-tcp: Implement recvmsg() receive flow
  2025-11-27  7:52   ` Hannes Reinecke
  2025-11-27 11:47     ` Hannes Reinecke
@ 2025-11-30 21:35     ` Sagi Grimberg
  2025-12-01  8:49       ` Hannes Reinecke
  1 sibling, 1 reply; 13+ messages in thread
From: Sagi Grimberg @ 2025-11-30 21:35 UTC (permalink / raw)
  To: Hannes Reinecke, Hannes Reinecke, Christoph Hellwig
  Cc: Keith Busch, linux-nvme



On 27/11/2025 9:52, Hannes Reinecke wrote:
> On 11/26/25 08:32, Sagi Grimberg wrote:
>>
>>
>> On 20/10/2025 11:58, Hannes Reinecke wrote:
>>> The nvme-tcp code is using the ->read_sock() interface to
>>> read data from the wire. While this interface gives us access
>>> to the skbs themselves (and so might be able to reduce latency)
>>> it does not interpret the skbs.
>>> Additionally for TLS these skbs have to be re-constructed from
>>> the TLS stream data, rendering any advantage questionable.
>>> But the main drawback for TLS is that we do not get access to
>>> the TLS control messages, so if we receive any of those message
>>> the only choice we have is to tear down the connection and restart.
>>> This patch switches the receive side over to use recvmsg(), which
>>> provides us full access to the TLS control messages and is also
>>> more efficient when working with TLS as skbs do not need to be
>>> artificially constructed.
>>
>> Hannes,
>>
>> I generally agree with this approach. I'd like to point out though
>> that this is going to give up running RX from directly from softirq 
>> context.
>
> Yes.
>
>> I've gone back and forth on weather nvme-tcp should do that, but never
>> got to do a thorough comparison between the two. This probably shuts
>> the door on that option.
>>
> The thing with running from softirq context is that it would only
> make sense if we could _ensure_ that the softirq context is running
> on the cpu where the blk-mq hardware context is expecting it to.

What is this statement based on? softirq runs where the NIC interrupt
happens, which eliminates the context switch to the workqueue io_cpu
which is not guaranteed to affinitize where userspace is, in fact it often
isn't in nvme-tcp...

> Not only would that require fiddling with RFS contexts, but we also
> found that NVMe-over-fabrics should _not_ try to align with hardware
> interrupts but rather rely on the driver to abstract things away.

I did not expect anyone to fiddle with RFS for softirq context. The main
benefit of softirq context RX (outside of latency reduction) is that it
makes io_work handle ONLY TX, which is probably somewhat more efficient.


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCHv2] nvme-tcp: Implement recvmsg() receive flow
  2025-11-27 11:47     ` Hannes Reinecke
@ 2025-11-30 21:38       ` Sagi Grimberg
  2025-12-01  9:02         ` Hannes Reinecke
  0 siblings, 1 reply; 13+ messages in thread
From: Sagi Grimberg @ 2025-11-30 21:38 UTC (permalink / raw)
  To: Hannes Reinecke, Hannes Reinecke, Christoph Hellwig
  Cc: Keith Busch, linux-nvme



On 27/11/2025 13:47, Hannes Reinecke wrote:
> On 11/27/25 08:52, Hannes Reinecke wrote:
>> On 11/26/25 08:32, Sagi Grimberg wrote:
>>>
>>>
>>> On 20/10/2025 11:58, Hannes Reinecke wrote:
>>>> The nvme-tcp code is using the ->read_sock() interface to
>>>> read data from the wire. While this interface gives us access
>>>> to the skbs themselves (and so might be able to reduce latency)
>>>> it does not interpret the skbs.
>>>> Additionally for TLS these skbs have to be re-constructed from
>>>> the TLS stream data, rendering any advantage questionable.
>>>> But the main drawback for TLS is that we do not get access to
>>>> the TLS control messages, so if we receive any of those message
>>>> the only choice we have is to tear down the connection and restart.
>>>> This patch switches the receive side over to use recvmsg(), which
>>>> provides us full access to the TLS control messages and is also
>>>> more efficient when working with TLS as skbs do not need to be
>>>> artificially constructed.
>>>
>>> Hannes,
>>>
>>> I generally agree with this approach. I'd like to point out though
>>> that this is going to give up running RX from directly from softirq 
>>> context.
>>
>> Yes.
>>
>>> I've gone back and forth on weather nvme-tcp should do that, but never
>>> got to do a thorough comparison between the two. This probably shuts
>>> the door on that option.
>>>
>> The thing with running from softirq context is that it would only
>> make sense if we could _ensure_ that the softirq context is running
>> on the cpu where the blk-mq hardware context is expecting it to.
>> Not only would that require fiddling with RFS contexts, but we also
>> found that NVMe-over-fabrics should _not_ try to align with hardware
>> interrupts but rather rely on the driver to abstract things away.
>>
>>> Having said that, I am fine with this approach. recvmsg is a saner 
>>> interface.
>>>
>> Thanks!
>>
>>
>> [ .. ]>> @@ -1023,40 +1030,32 @@ static int nvme_tcp_recv_ddgst(struct
>>>> nvme_tcp_queue *queue,
>>>>       return 0;
>>>>   }
>>>> -static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct 
>>>> sk_buff *skb,
>>>> -                 unsigned int offset, size_t len)
>>>> +static int nvme_tcp_recvmsg_ddgst(struct nvme_tcp_queue *queue)
>>>>   {
>>>> -    struct nvme_tcp_queue *queue = desc->arg.data;
>>>> -    size_t consumed = len;
>>>> -    int result;
>>>> +    char *ddgst = (char *)&queue->recv_ddgst;
>>>> +    off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
>>>> +    struct msghdr msg = {
>>>> +        .msg_flags = MSG_WAITALL,
>>>> +    };
>>>
>>> I don't think we want to use MSG_WAITALL ever. Why are you opting to 
>>> do that?
>>>
>> Hmm. You are right, it's not needed.
>>
>> [ .. ]>> @@ -1356,20 +1355,38 @@ static int nvme_tcp_try_send(struct
>>>> nvme_tcp_queue *queue)
>>>>       return ret;
>>>>   }
>>>> -static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
>>>> +static int nvme_tcp_try_recvmsg(struct nvme_tcp_queue *queue)
>>>>   {
>>>> -    struct socket *sock = queue->sock;
>>>> -    struct sock *sk = sock->sk;
>>>> -    read_descriptor_t rd_desc;
>>>> -    int consumed;
>>>> +    int result;
>>>> +    int nr_cqe = queue->nr_cqe;
>>>> +
>>>> +    if (unlikely(!queue->rd_enabled))
>>>> +        return -EFAULT;
>>>> +
>>>> +    do {
>>>> +        switch (nvme_tcp_recv_state(queue)) {
>>>> +        case NVME_TCP_RECV_PDU:
>>>> +            result = nvme_tcp_recvmsg_pdu(queue);
>>>> +            break;
>>>> +        case NVME_TCP_RECV_DATA:
>>>> +            result = nvme_tcp_recvmsg_data(queue);
>>>> +            break;
>>>> +        case NVME_TCP_RECV_DDGST:
>>>> +            result = nvme_tcp_recvmsg_ddgst(queue);
>>>> +            break;
>>>> +        default:
>>>> +            result = -EFAULT;
>>>> +        }
>>>> +    } while (result >= 0);
>>>> +
>>>> +    if (result < 0 && result != -EAGAIN) {
>>>> +        dev_err(queue->ctrl->ctrl.device,
>>>> +            "receive failed:  %d\n", result);
>>>> +        queue->rd_enabled = false;
>>>> + nvme_tcp_error_recovery(&queue->ctrl->ctrl);
>>>> +    }
>>>> -    rd_desc.arg.data = queue;
>>>> -    rd_desc.count = 1;
>>>> -    lock_sock(sk);
>>>> -    queue->nr_cqe = 0;
>>>> -    consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
>>>> -    release_sock(sk);
>>>> -    return consumed == -EAGAIN ? 0 : consumed;
>>>> +    return result < 0 ? result : (queue->nr_cqe = nr_cqe);
>>>
>>> This is changing the semantics of the retcode - can you please 
>>> document in the commit msg
>>> the change?
>>>
>> That was done to be compatible with the polling interface.
>> I'll check if it's needed after all.
>>
> Actually, I don't think I'm changing the interface.
> The original interface would return either a negative number on error,
> '0' if no PDUs have been processed, or the number of processed PDUs.
> And the new recvmsg interface does the same (I think).
> The only change which I did was to _not_ map out -EAGAIN, as this will
> be handled by the 'data_ready' callback.

Which is a change that needs to be documented.

> (Reasoning: -EAGAIN is returned when there is no (or not enough) data
> to be read, and the network layer will call ->data_ready once the
> situation improves. If we continue with the loop we will race/compete
> with the networking layer, causing us to do duplicated work as the
> io_work thread will be picking up the new data _and_ the data_ready
> callback will be scheduling the io_work callback to pick up the new
> data. So we can safely stop on -EAGAIN knowing that we'll be woken
> up once we can continue to read data.).

There is TX work that may be pending as well, IIRC there is an 
assumption that
io_work will NOT return if there is any TX work pending (we've seen 
stalls in the
past due to issues like that).

> (And arguably this was an issue with the original code, too.).
>
> So where is the change in semantics?

Exactly where you described it.


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCHv2] nvme-tcp: Implement recvmsg() receive flow
  2025-11-30 21:35     ` Sagi Grimberg
@ 2025-12-01  8:49       ` Hannes Reinecke
  2025-12-01 17:45         ` Sagi Grimberg
  0 siblings, 1 reply; 13+ messages in thread
From: Hannes Reinecke @ 2025-12-01  8:49 UTC (permalink / raw)
  To: Sagi Grimberg, Hannes Reinecke, Christoph Hellwig; +Cc: Keith Busch, linux-nvme

On 11/30/25 22:35, Sagi Grimberg wrote:
> 
> 
> On 27/11/2025 9:52, Hannes Reinecke wrote:
>> On 11/26/25 08:32, Sagi Grimberg wrote:
>>>
>>>
>>> On 20/10/2025 11:58, Hannes Reinecke wrote:
>>>> The nvme-tcp code is using the ->read_sock() interface to
>>>> read data from the wire. While this interface gives us access
>>>> to the skbs themselves (and so might be able to reduce latency)
>>>> it does not interpret the skbs.
>>>> Additionally for TLS these skbs have to be re-constructed from
>>>> the TLS stream data, rendering any advantage questionable.
>>>> But the main drawback for TLS is that we do not get access to
>>>> the TLS control messages, so if we receive any of those message
>>>> the only choice we have is to tear down the connection and restart.
>>>> This patch switches the receive side over to use recvmsg(), which
>>>> provides us full access to the TLS control messages and is also
>>>> more efficient when working with TLS as skbs do not need to be
>>>> artificially constructed.
>>>
>>> Hannes,
>>>
>>> I generally agree with this approach. I'd like to point out though
>>> that this is going to give up running RX from directly from softirq 
>>> context.
>>
>> Yes.
>>
>>> I've gone back and forth on weather nvme-tcp should do that, but never
>>> got to do a thorough comparison between the two. This probably shuts
>>> the door on that option.
>>>
>> The thing with running from softirq context is that it would only
>> make sense if we could _ensure_ that the softirq context is running
>> on the cpu where the blk-mq hardware context is expecting it to.
> 
> What is this statement based on? softirq runs where the NIC interrupt
> happens, which eliminates the context switch to the workqueue io_cpu
> which is not guaranteed to affinitize where userspace is, in fact it often
> isn't in nvme-tcp...
> 
My thinking was that if we were to split TX and RX paths it would make
sense to align RX paths with the blk-mq CPU mapping. But that is not
a simple operation, and quite often not possible.
But if that wasn't the goal then fine, ignore my comment.

>> Not only would that require fiddling with RFS contexts, but we also
>> found that NVMe-over-fabrics should _not_ try to align with hardware
>> interrupts but rather rely on the driver to abstract things away.
> 
> I did not expect anyone to fiddle with RFS for softirq context. The main
> benefit of softirq context RX (outside of latency reduction) is that it
> makes io_work handle ONLY TX, which is probably somewhat more efficient.

But we still could do that, right?
We can easily split the current io_work() into two parts, the TX part 
driven from queue_rq(), and the RX part driven from the data_ready()
callback, no?

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCHv2] nvme-tcp: Implement recvmsg() receive flow
  2025-11-30 21:38       ` Sagi Grimberg
@ 2025-12-01  9:02         ` Hannes Reinecke
  2025-12-01 17:49           ` Sagi Grimberg
  0 siblings, 1 reply; 13+ messages in thread
From: Hannes Reinecke @ 2025-12-01  9:02 UTC (permalink / raw)
  To: Sagi Grimberg, Hannes Reinecke, Christoph Hellwig; +Cc: Keith Busch, linux-nvme

On 11/30/25 22:38, Sagi Grimberg wrote:
> 
> 
> On 27/11/2025 13:47, Hannes Reinecke wrote:
>> On 11/27/25 08:52, Hannes Reinecke wrote:
>>> On 11/26/25 08:32, Sagi Grimberg wrote:
>>>>
>>>>
>>>> On 20/10/2025 11:58, Hannes Reinecke wrote:
>>>>> The nvme-tcp code is using the ->read_sock() interface to
>>>>> read data from the wire. While this interface gives us access
>>>>> to the skbs themselves (and so might be able to reduce latency)
>>>>> it does not interpret the skbs.
>>>>> Additionally for TLS these skbs have to be re-constructed from
>>>>> the TLS stream data, rendering any advantage questionable.
>>>>> But the main drawback for TLS is that we do not get access to
>>>>> the TLS control messages, so if we receive any of those message
>>>>> the only choice we have is to tear down the connection and restart.
>>>>> This patch switches the receive side over to use recvmsg(), which
>>>>> provides us full access to the TLS control messages and is also
>>>>> more efficient when working with TLS as skbs do not need to be
>>>>> artificially constructed.
>>>>
>>>> Hannes,
>>>>
>>>> I generally agree with this approach. I'd like to point out though
>>>> that this is going to give up running RX from directly from softirq 
>>>> context.
>>>
>>> Yes.
>>>
>>>> I've gone back and forth on weather nvme-tcp should do that, but never
>>>> got to do a thorough comparison between the two. This probably shuts
>>>> the door on that option.
>>>>
>>> The thing with running from softirq context is that it would only
>>> make sense if we could _ensure_ that the softirq context is running
>>> on the cpu where the blk-mq hardware context is expecting it to.
>>> Not only would that require fiddling with RFS contexts, but we also
>>> found that NVMe-over-fabrics should _not_ try to align with hardware
>>> interrupts but rather rely on the driver to abstract things away.
>>>
>>>> Having said that, I am fine with this approach. recvmsg is a saner 
>>>> interface.
>>>>
>>> Thanks!
>>>
>>>
>>> [ .. ]>> @@ -1023,40 +1030,32 @@ static int nvme_tcp_recv_ddgst(struct
>>>>> nvme_tcp_queue *queue,
>>>>>       return 0;
>>>>>   }
>>>>> -static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct 
>>>>> sk_buff *skb,
>>>>> -                 unsigned int offset, size_t len)
>>>>> +static int nvme_tcp_recvmsg_ddgst(struct nvme_tcp_queue *queue)
>>>>>   {
>>>>> -    struct nvme_tcp_queue *queue = desc->arg.data;
>>>>> -    size_t consumed = len;
>>>>> -    int result;
>>>>> +    char *ddgst = (char *)&queue->recv_ddgst;
>>>>> +    off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
>>>>> +    struct msghdr msg = {
>>>>> +        .msg_flags = MSG_WAITALL,
>>>>> +    };
>>>>
>>>> I don't think we want to use MSG_WAITALL ever. Why are you opting to 
>>>> do that?
>>>>
>>> Hmm. You are right, it's not needed.
>>>
>>> [ .. ]>> @@ -1356,20 +1355,38 @@ static int nvme_tcp_try_send(struct
>>>>> nvme_tcp_queue *queue)
>>>>>       return ret;
>>>>>   }
>>>>> -static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
>>>>> +static int nvme_tcp_try_recvmsg(struct nvme_tcp_queue *queue)
>>>>>   {
>>>>> -    struct socket *sock = queue->sock;
>>>>> -    struct sock *sk = sock->sk;
>>>>> -    read_descriptor_t rd_desc;
>>>>> -    int consumed;
>>>>> +    int result;
>>>>> +    int nr_cqe = queue->nr_cqe;
>>>>> +
>>>>> +    if (unlikely(!queue->rd_enabled))
>>>>> +        return -EFAULT;
>>>>> +
>>>>> +    do {
>>>>> +        switch (nvme_tcp_recv_state(queue)) {
>>>>> +        case NVME_TCP_RECV_PDU:
>>>>> +            result = nvme_tcp_recvmsg_pdu(queue);
>>>>> +            break;
>>>>> +        case NVME_TCP_RECV_DATA:
>>>>> +            result = nvme_tcp_recvmsg_data(queue);
>>>>> +            break;
>>>>> +        case NVME_TCP_RECV_DDGST:
>>>>> +            result = nvme_tcp_recvmsg_ddgst(queue);
>>>>> +            break;
>>>>> +        default:
>>>>> +            result = -EFAULT;
>>>>> +        }
>>>>> +    } while (result >= 0);
>>>>> +
>>>>> +    if (result < 0 && result != -EAGAIN) {
>>>>> +        dev_err(queue->ctrl->ctrl.device,
>>>>> +            "receive failed:  %d\n", result);
>>>>> +        queue->rd_enabled = false;
>>>>> + nvme_tcp_error_recovery(&queue->ctrl->ctrl);
>>>>> +    }
>>>>> -    rd_desc.arg.data = queue;
>>>>> -    rd_desc.count = 1;
>>>>> -    lock_sock(sk);
>>>>> -    queue->nr_cqe = 0;
>>>>> -    consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
>>>>> -    release_sock(sk);
>>>>> -    return consumed == -EAGAIN ? 0 : consumed;
>>>>> +    return result < 0 ? result : (queue->nr_cqe = nr_cqe);
>>>>
>>>> This is changing the semantics of the retcode - can you please 
>>>> document in the commit msg
>>>> the change?
>>>>
>>> That was done to be compatible with the polling interface.
>>> I'll check if it's needed after all.
>>>
>> Actually, I don't think I'm changing the interface.
>> The original interface would return either a negative number on error,
>> '0' if no PDUs have been processed, or the number of processed PDUs.
>> And the new recvmsg interface does the same (I think).
>> The only change which I did was to _not_ map out -EAGAIN, as this will
>> be handled by the 'data_ready' callback.
> 
> Which is a change that needs to be documented.
> 
Okay, will do.

>> (Reasoning: -EAGAIN is returned when there is no (or not enough) data
>> to be read, and the network layer will call ->data_ready once the
>> situation improves. If we continue with the loop we will race/compete
>> with the networking layer, causing us to do duplicated work as the
>> io_work thread will be picking up the new data _and_ the data_ready
>> callback will be scheduling the io_work callback to pick up the new
>> data. So we can safely stop on -EAGAIN knowing that we'll be woken
>> up once we can continue to read data.).
> 
> There is TX work that may be pending as well, IIRC there is an 
> assumption that io_work will NOT return if there is any TX work
 > pending (we've seen stalls in the past due to issues like that).>
Hmm.
But I've seen the opposite; the current io_work prioritizes TX
over RX, so a 'data_ready' callback will trigger TX first, letting
the RX data to stew for a bit before the TX data has been sent.
Maybe we should be looking into splitting TX and RX paths after all...

>> (And arguably this was an issue with the original code, too.).
>>
>> So where is the change in semantics?
> 
> Exactly where you described it.

Okay, will be doing it.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCHv2] nvme-tcp: Implement recvmsg() receive flow
  2025-12-01  8:49       ` Hannes Reinecke
@ 2025-12-01 17:45         ` Sagi Grimberg
  0 siblings, 0 replies; 13+ messages in thread
From: Sagi Grimberg @ 2025-12-01 17:45 UTC (permalink / raw)
  To: Hannes Reinecke, Hannes Reinecke, Christoph Hellwig
  Cc: Keith Busch, linux-nvme


>> I did not expect anyone to fiddle with RFS for softirq context. The main
>> benefit of softirq context RX (outside of latency reduction) is that it
>> makes io_work handle ONLY TX, which is probably somewhat more efficient.
>
> But we still could do that, right?
> We can easily split the current io_work() into two parts, the TX part 
> driven from queue_rq(), and the RX part driven from the data_ready()
> callback, no?

We could, but then we create yet another kthread context. The main thing 
that makes this appealing is running RX from soft-irq.


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCHv2] nvme-tcp: Implement recvmsg() receive flow
  2025-12-01  9:02         ` Hannes Reinecke
@ 2025-12-01 17:49           ` Sagi Grimberg
  0 siblings, 0 replies; 13+ messages in thread
From: Sagi Grimberg @ 2025-12-01 17:49 UTC (permalink / raw)
  To: Hannes Reinecke, Hannes Reinecke, Christoph Hellwig
  Cc: Keith Busch, linux-nvme


>> There is TX work that may be pending as well, IIRC there is an 
>> assumption that io_work will NOT return if there is any TX work
> > pending (we've seen stalls in the past due to issues like that).>
> Hmm.
> But I've seen the opposite; the current io_work prioritizes TX
> over RX, so a 'data_ready' callback will trigger TX first, letting
> the RX data to stew for a bit before the TX data has been sent.

I wouldn't consider that prioritizing TX. the fact that it goes first is 
because
you want to send request to the ctrl to go off and work while you 
receive data.

> Maybe we should be looking into splitting TX and RX paths after all...

If both TX/RX remain kthreads I don't see how it will help performance.
But if someone provides data that shows otherwise, we can do that.


^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2025-12-01 17:49 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-10-20  8:58 [PATCHv2] nvme-tcp: Implement recvmsg() receive flow Hannes Reinecke
2025-11-26  7:32 ` Sagi Grimberg
2025-11-26  8:58   ` Sagi Grimberg
2025-11-27  7:53     ` Hannes Reinecke
2025-11-27 11:37       ` Hannes Reinecke
2025-11-27  7:52   ` Hannes Reinecke
2025-11-27 11:47     ` Hannes Reinecke
2025-11-30 21:38       ` Sagi Grimberg
2025-12-01  9:02         ` Hannes Reinecke
2025-12-01 17:49           ` Sagi Grimberg
2025-11-30 21:35     ` Sagi Grimberg
2025-12-01  8:49       ` Hannes Reinecke
2025-12-01 17:45         ` Sagi Grimberg

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.