From: Hannes Reinecke <hare@kernel.org>
To: Christoph Hellwig <hch@lst.de>
Cc: Keith Busch <kbusch@kernel.org>, Sagi Grimberg <sagi@grimberg.me>,
linux-nvme@lists.infradead.org,
Alistair Francis <alistair.francis@wdc.com>,
Hannes Reinecke <hare@kernel.org>
Subject: [PATCH RFC] nvme-tcp: Implement recvmsg() receive flow
Date: Fri, 12 Sep 2025 13:58:29 +0200 [thread overview]
Message-ID: <20250912115829.58669-1-hare@kernel.org> (raw)
Switch to using recvmsg() so that we get access to TLS control
messages, e.g. for handling TLS KeyUpdate.
Signed-off-by: Hannes Reinecke <hare@kernel.org>
---
drivers/nvme/host/tcp.c | 204 ++++++++++++++++++++++------------------
1 file changed, 111 insertions(+), 93 deletions(-)
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index c0fe8cfb7229..9ef1d4aea838 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -17,6 +17,7 @@
#include <net/tls_prot.h>
#include <net/handshake.h>
#include <linux/blk-mq.h>
+#include <linux/iov_iter.h>
#include <net/busy_poll.h>
#include <trace/events/sock.h>
@@ -476,6 +477,28 @@ static inline void nvme_tcp_ddgst_update(u32 *crcp,
}
}
+static size_t nvme_tcp_ddgst_step(void *iter_base, size_t progress, size_t len,
+ void *priv, void *priv2)
+{
+ u32 *crcp = priv;
+
+ *crcp = crc32c(*crcp, iter_base, len);
+ return 0;
+}
+
+static int nvme_tcp_ddgst_calc(struct nvme_tcp_request *req, u32 *crcp,
+ size_t maxsize)
+{
+ struct iov_iter tmp = req->iter;
+ int err = 0;
+
+ tmp.count = maxsize;
+ if (iterate_and_advance_kernel(&tmp, maxsize, crcp, &err,
+ nvme_tcp_ddgst_step) != maxsize)
+ return err;
+ return 0;
+}
+
static inline __le32 nvme_tcp_ddgst_final(u32 crc)
{
return cpu_to_le32(~crc);
@@ -827,23 +850,26 @@ static void nvme_tcp_handle_c2h_term(struct nvme_tcp_queue *queue,
"Received C2HTermReq (FES = %s)\n", msg);
}
-static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
- unsigned int *offset, size_t *len)
+static int nvme_tcp_recvmsg_pdu(struct nvme_tcp_queue *queue)
{
- struct nvme_tcp_hdr *hdr;
char *pdu = queue->pdu;
- size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
+ struct msghdr msg = {
+ .msg_flags = MSG_DONTWAIT,
+ };
+ struct kvec iov = {
+ .iov_base = pdu + queue->pdu_offset,
+ .iov_len = queue->pdu_remaining,
+ };
+ struct nvme_tcp_hdr *hdr;
int ret;
- ret = skb_copy_bits(skb, *offset,
- &pdu[queue->pdu_offset], rcv_len);
- if (unlikely(ret))
+ ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
+ iov.iov_len, msg.msg_flags);
+ if (ret <= 0)
return ret;
- queue->pdu_remaining -= rcv_len;
- queue->pdu_offset += rcv_len;
- *offset += rcv_len;
- *len -= rcv_len;
+ queue->pdu_remaining -= ret;
+ queue->pdu_offset += ret;
if (queue->pdu_remaining)
return 0;
@@ -907,20 +933,19 @@ static inline void nvme_tcp_end_request(struct request *rq, u16 status)
nvme_complete_rq(rq);
}
-static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
- unsigned int *offset, size_t *len)
+static int nvme_tcp_recvmsg_data(struct nvme_tcp_queue *queue)
{
struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
struct request *rq =
nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
- while (true) {
- int recv_len, ret;
+ if (nvme_tcp_recv_state(queue) != NVME_TCP_RECV_DATA)
+ return 0;
- recv_len = min_t(size_t, *len, queue->data_remaining);
- if (!recv_len)
- break;
+ while (queue->data_remaining) {
+ struct msghdr msg;
+ int ret;
if (!iov_iter_count(&req->iter)) {
req->curr_bio = req->curr_bio->bi_next;
@@ -940,25 +965,22 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
}
/* we can read only from what is left in this bio */
- recv_len = min_t(size_t, recv_len,
- iov_iter_count(&req->iter));
+ memset(&msg, 0, sizeof(msg));
+ msg.msg_iter = req->iter;
+ msg.msg_flags = MSG_DONTWAIT;
- if (queue->data_digest)
- ret = skb_copy_and_crc32c_datagram_iter(skb, *offset,
- &req->iter, recv_len, &queue->rcv_crc);
- else
- ret = skb_copy_datagram_iter(skb, *offset,
- &req->iter, recv_len);
- if (ret) {
+ ret = sock_recvmsg(queue->sock, &msg, msg.msg_flags);
+ if (ret < 0) {
dev_err(queue->ctrl->ctrl.device,
- "queue %d failed to copy request %#x data",
+ "queue %d failed to receive request %#x data",
nvme_tcp_queue_id(queue), rq->tag);
return ret;
}
-
- *len -= recv_len;
- *offset += recv_len;
- queue->data_remaining -= recv_len;
+ if (queue->data_digest)
+ nvme_tcp_ddgst_calc(req, &queue->rcv_crc, ret);
+ queue->data_remaining -= ret;
+ if (queue->data_remaining)
+ nvme_tcp_advance_req(req, ret);
}
if (!queue->data_remaining) {
@@ -968,7 +990,7 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
} else {
if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
nvme_tcp_end_request(rq,
- le16_to_cpu(req->status));
+ le16_to_cpu(req->status));
queue->nr_cqe++;
}
nvme_tcp_init_recv_ctx(queue);
@@ -978,24 +1000,9 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
return 0;
}
-static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
- struct sk_buff *skb, unsigned int *offset, size_t *len)
+static int __nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue)
{
struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
- char *ddgst = (char *)&queue->recv_ddgst;
- size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
- off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
- int ret;
-
- ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
- if (unlikely(ret))
- return ret;
-
- queue->ddgst_remaining -= recv_len;
- *offset += recv_len;
- *len -= recv_len;
- if (queue->ddgst_remaining)
- return 0;
if (queue->recv_ddgst != queue->exp_ddgst) {
struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue),
@@ -1023,40 +1030,32 @@ static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
return 0;
}
-static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
- unsigned int offset, size_t len)
+static int nvme_tcp_recvmsg_ddgst(struct nvme_tcp_queue *queue)
{
- struct nvme_tcp_queue *queue = desc->arg.data;
- size_t consumed = len;
- int result;
+ char *ddgst = (char *)&queue->recv_ddgst;
+ off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
+ struct msghdr msg = {
+ .msg_flags = MSG_WAITALL,
+ };
+ struct kvec iov = {
+ .iov_base = (u8 *)ddgst + off,
+ .iov_len = queue->ddgst_remaining,
+ };
+ int ret;
- if (unlikely(!queue->rd_enabled))
- return -EFAULT;
+ if (nvme_tcp_recv_state(queue) != NVME_TCP_RECV_DDGST)
+ return 0;
- while (len) {
- switch (nvme_tcp_recv_state(queue)) {
- case NVME_TCP_RECV_PDU:
- result = nvme_tcp_recv_pdu(queue, skb, &offset, &len);
- break;
- case NVME_TCP_RECV_DATA:
- result = nvme_tcp_recv_data(queue, skb, &offset, &len);
- break;
- case NVME_TCP_RECV_DDGST:
- result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len);
- break;
- default:
- result = -EFAULT;
- }
- if (result) {
- dev_err(queue->ctrl->ctrl.device,
- "receive failed: %d\n", result);
- queue->rd_enabled = false;
- nvme_tcp_error_recovery(&queue->ctrl->ctrl);
- return result;
- }
- }
+ ret = kernel_recvmsg(queue->sock, &msg, &iov, 1, iov.iov_len,
+ msg.msg_flags);
+ if (ret <= 0)
+ return ret;
+
+ queue->ddgst_remaining -= ret;
+ if (queue->ddgst_remaining)
+ return 0;
- return consumed;
+ return __nvme_tcp_recv_ddgst(queue);
}
static void nvme_tcp_data_ready(struct sock *sk)
@@ -1353,20 +1352,39 @@ static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
return ret;
}
-static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
+static int nvme_tcp_try_recvmsg(struct nvme_tcp_queue *queue)
{
- struct socket *sock = queue->sock;
- struct sock *sk = sock->sk;
- read_descriptor_t rd_desc;
- int consumed;
+ int result;
+ int nr_cqe = queue->nr_cqe;
+
+ if (unlikely(!queue->rd_enabled))
+ return -EFAULT;
+
+ do {
+ switch (nvme_tcp_recv_state(queue)) {
+ case NVME_TCP_RECV_PDU:
+ result = nvme_tcp_recvmsg_pdu(queue);
+ break;
+ case NVME_TCP_RECV_DATA:
+ result = nvme_tcp_recvmsg_data(queue);
+ break;
+ case NVME_TCP_RECV_DDGST:
+ result = nvme_tcp_recvmsg_ddgst(queue);
+ break;
+ default:
+ result = -EFAULT;
+ }
+ } while (result >= 0);
+
+ if (result < 0 && result != -EAGAIN) {
+ dev_err(queue->ctrl->ctrl.device,
+ "receive failed: %d\n", result);
+ queue->rd_enabled = false;
+ nvme_tcp_error_recovery(&queue->ctrl->ctrl);
+ } else if (result == -EAGAIN)
+ result = 0;
- rd_desc.arg.data = queue;
- rd_desc.count = 1;
- lock_sock(sk);
- queue->nr_cqe = 0;
- consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
- release_sock(sk);
- return consumed == -EAGAIN ? 0 : consumed;
+ return result < 0 ? result : (queue->nr_cqe = nr_cqe);
}
static void nvme_tcp_io_work(struct work_struct *w)
@@ -1388,7 +1406,7 @@ static void nvme_tcp_io_work(struct work_struct *w)
break;
}
- result = nvme_tcp_try_recv(queue);
+ result = nvme_tcp_try_recvmsg(queue);
if (result > 0)
pending = true;
else if (unlikely(result < 0))
@@ -2794,7 +2812,7 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
set_bit(NVME_TCP_Q_POLLING, &queue->flags);
if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
sk_busy_loop(sk, true);
- ret = nvme_tcp_try_recv(queue);
+ ret = nvme_tcp_try_recvmsg(queue);
clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
return ret < 0 ? ret : queue->nr_cqe;
}
--
2.43.0
next reply other threads:[~2025-09-12 11:58 UTC|newest]
Thread overview: 10+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-09-12 11:58 Hannes Reinecke [this message]
2025-09-22 17:41 ` [PATCH RFC] nvme-tcp: Implement recvmsg() receive flow Christoph Hellwig
2025-09-23 6:30 ` Hannes Reinecke
2026-02-25 10:56 ` Alistair Francis
2026-02-25 11:41 ` Hannes Reinecke
2026-02-25 13:15 ` Alistair Francis
2026-02-25 15:02 ` Hannes Reinecke
2026-02-25 23:37 ` Alistair Francis
2026-02-26 8:40 ` Hannes Reinecke
2026-02-26 10:42 ` Alistair Francis
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250912115829.58669-1-hare@kernel.org \
--to=hare@kernel.org \
--cc=alistair.francis@wdc.com \
--cc=hch@lst.de \
--cc=kbusch@kernel.org \
--cc=linux-nvme@lists.infradead.org \
--cc=sagi@grimberg.me \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.