From: Chuck Lever <cel@kernel.org>
To: Hannes Reinecke <hare@suse.de>, Olga Kornievskaia <okorniev@redhat.com>
Cc: kernel-tls-handshake@lists.linux.dev,
Chuck Lever <chuck.lever@oracle.com>
Subject: [RFC PATCH 4/4] sunrpc: Remove sock_recvmsg path from svcsock TCP receives
Date: Tue, 17 Feb 2026 17:20:33 -0500 [thread overview]
Message-ID: <20260217222033.1929211-5-cel@kernel.org> (raw)
In-Reply-To: <20260217222033.1929211-1-cel@kernel.org>
From: Chuck Lever <chuck.lever@oracle.com>
The svcsock TCP receive code currently maintains two paths: one
based on the read_sock_cmsg callback and a legacy path based on
sock_recvmsg. Plain TCP sockets already provide read_sock
(tcp_read_sock) in their proto_ops, so the read_sock_cmsg path
can handle all cases relevant to NFSD by falling back to
read_sock when kTLS is not active. Remove the now-unneeded
legacy sock_recvmsg path.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
net/sunrpc/svcsock.c | 314 +++----------------------------------------
1 file changed, 22 insertions(+), 292 deletions(-)
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 9600d15287e7..7d614dc44a05 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -238,140 +238,6 @@ static int svc_one_sock_name(struct svc_sock *svsk, char *buf, int remaining)
return len;
}
-static int
-svc_tcp_sock_process_cmsg(struct socket *sock, struct msghdr *msg,
- struct cmsghdr *cmsg, int ret)
-{
- u8 content_type = tls_get_record_type(sock->sk, cmsg);
- u8 level, description;
-
- switch (content_type) {
- case 0:
- break;
- case TLS_RECORD_TYPE_DATA:
- /* TLS sets EOR at the end of each application data
- * record, even though there might be more frames
- * waiting to be decrypted.
- */
- msg->msg_flags &= ~MSG_EOR;
- break;
- case TLS_RECORD_TYPE_ALERT:
- tls_alert_recv(sock->sk, msg, &level, &description);
- ret = (level == TLS_ALERT_LEVEL_FATAL) ?
- -ENOTCONN : -EAGAIN;
- break;
- default:
- /* discard this record type */
- ret = -EAGAIN;
- }
- return ret;
-}
-
-static int
-svc_tcp_sock_recv_cmsg(struct socket *sock, unsigned int *msg_flags)
-{
- union {
- struct cmsghdr cmsg;
- u8 buf[CMSG_SPACE(sizeof(u8))];
- } u;
- u8 alert[2];
- struct kvec alert_kvec = {
- .iov_base = alert,
- .iov_len = sizeof(alert),
- };
- struct msghdr msg = {
- .msg_flags = *msg_flags,
- .msg_control = &u,
- .msg_controllen = sizeof(u),
- };
- int ret;
-
- iov_iter_kvec(&msg.msg_iter, ITER_DEST, &alert_kvec, 1,
- alert_kvec.iov_len);
- ret = sock_recvmsg(sock, &msg, MSG_DONTWAIT);
- if (ret > 0 &&
- tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT) {
- iov_iter_revert(&msg.msg_iter, ret);
- ret = svc_tcp_sock_process_cmsg(sock, &msg, &u.cmsg, -EAGAIN);
- }
- return ret;
-}
-
-static int
-svc_tcp_sock_recvmsg(struct svc_sock *svsk, struct msghdr *msg)
-{
- int ret;
- struct socket *sock = svsk->sk_sock;
-
- ret = sock_recvmsg(sock, msg, MSG_DONTWAIT);
- if (msg->msg_flags & MSG_CTRUNC) {
- msg->msg_flags &= ~(MSG_CTRUNC | MSG_EOR);
- if (ret == 0 || ret == -EIO)
- ret = svc_tcp_sock_recv_cmsg(sock, &msg->msg_flags);
- }
- return ret;
-}
-
-#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
-static void svc_flush_bvec(const struct bio_vec *bvec, size_t size, size_t seek)
-{
- struct bvec_iter bi = {
- .bi_size = size + seek,
- };
- struct bio_vec bv;
-
- bvec_iter_advance(bvec, &bi, seek & PAGE_MASK);
- for_each_bvec(bv, bvec, bi, bi)
- flush_dcache_page(bv.bv_page);
-}
-#else
-static inline void svc_flush_bvec(const struct bio_vec *bvec, size_t size,
- size_t seek)
-{
-}
-#endif
-
-/*
- * Read from @rqstp's transport socket. The incoming message fills whole
- * pages in @rqstp's rq_pages array until the last page of the message
- * has been received into a partial page.
- */
-static ssize_t svc_tcp_read_msg(struct svc_rqst *rqstp, size_t buflen,
- size_t seek)
-{
- struct svc_sock *svsk =
- container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
- struct bio_vec *bvec = rqstp->rq_bvec;
- struct msghdr msg = { NULL };
- unsigned int i;
- ssize_t len;
- size_t t;
-
- clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
-
- for (i = 0, t = 0; t < buflen; i++, t += PAGE_SIZE)
- bvec_set_page(&bvec[i], rqstp->rq_pages[i], PAGE_SIZE, 0);
- rqstp->rq_respages = &rqstp->rq_pages[i];
- rqstp->rq_next_page = rqstp->rq_respages + 1;
-
- iov_iter_bvec(&msg.msg_iter, ITER_DEST, bvec, i, buflen);
- if (seek) {
- iov_iter_advance(&msg.msg_iter, seek);
- buflen -= seek;
- }
- len = svc_tcp_sock_recvmsg(svsk, &msg);
- if (len > 0)
- svc_flush_bvec(bvec, len, seek);
-
- /* If we read a full record, then assume there may be more
- * data to read (stream based sockets only!)
- */
- if (len == buflen)
- set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
-
- return len;
-}
-
/*
* Set socket snd and rcv buffer lengths
*/
@@ -1038,50 +904,6 @@ static void svc_tcp_clear_pages(struct svc_sock *svsk)
svsk->sk_datalen = 0;
}
-/*
- * Receive fragment record header into sk_marker.
- */
-static ssize_t svc_tcp_read_marker(struct svc_sock *svsk,
- struct svc_rqst *rqstp)
-{
- ssize_t want, len;
-
- /* If we haven't gotten the record length yet,
- * get the next four bytes.
- */
- if (svsk->sk_tcplen < sizeof(rpc_fraghdr)) {
- struct msghdr msg = { NULL };
- struct kvec iov;
-
- want = sizeof(rpc_fraghdr) - svsk->sk_tcplen;
- iov.iov_base = ((char *)&svsk->sk_marker) + svsk->sk_tcplen;
- iov.iov_len = want;
- iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, want);
- len = svc_tcp_sock_recvmsg(svsk, &msg);
- if (len < 0)
- return len;
- svsk->sk_tcplen += len;
- if (len < want) {
- /* call again to read the remaining bytes */
- goto err_short;
- }
- trace_svcsock_marker(&svsk->sk_xprt, svsk->sk_marker);
- if (svc_sock_reclen(svsk) + svsk->sk_datalen >
- svsk->sk_xprt.xpt_server->sv_max_mesg)
- goto err_too_large;
- }
- return svc_sock_reclen(svsk);
-
-err_too_large:
- net_notice_ratelimited("svc: %s oversized RPC fragment (%u octets) from %pISpc\n",
- svsk->sk_xprt.xpt_server->sv_name,
- svc_sock_reclen(svsk),
- (struct sockaddr *)&svsk->sk_xprt.xpt_remote);
- svc_xprt_deferred_close(&svsk->sk_xprt);
-err_short:
- return -EAGAIN;
-}
-
static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp)
{
struct rpc_xprt *bc_xprt = svsk->sk_xprt.xpt_bc_xprt;
@@ -1252,7 +1074,21 @@ static int svc_tcp_cmsg_actor(read_descriptor_t *desc,
return -EAGAIN;
}
-static int svc_tcp_recvfrom_readsock(struct svc_rqst *rqstp)
+/**
+ * svc_tcp_recvfrom - Receive data from a TCP socket
+ * @rqstp: request structure into which to receive an RPC Call
+ *
+ * Called in a loop when XPT_DATA has been set.
+ *
+ * Returns:
+ * On success, the number of bytes in a received RPC Call, or
+ * %0 if a complete RPC Call message was not ready to return
+ *
+ * The zero return case handles partial receives and callback Replies.
+ * The state of a partial receive is preserved in the svc_sock for
+ * the next call to svc_tcp_recvfrom.
+ */
+static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
{
struct svc_sock *svsk =
container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
@@ -1278,9 +1114,13 @@ static int svc_tcp_recvfrom_readsock(struct svc_rqst *rqstp)
desc.count = serv->sv_max_mesg;
lock_sock(sk);
- len = svsk->sk_sock->ops->read_sock_cmsg(sk, &desc,
- svc_tcp_recv_actor,
- svc_tcp_cmsg_actor);
+ if (svsk->sk_sock->ops->read_sock_cmsg)
+ len = svsk->sk_sock->ops->read_sock_cmsg(sk, &desc,
+ svc_tcp_recv_actor,
+ svc_tcp_cmsg_actor);
+ else
+ len = svsk->sk_sock->ops->read_sock(sk, &desc,
+ svc_tcp_recv_actor);
release_sock(sk);
if (desc.error == -EMSGSIZE)
@@ -1366,116 +1206,6 @@ static int svc_tcp_recvfrom_readsock(struct svc_rqst *rqstp)
return 0;
}
-/**
- * svc_tcp_recvfrom - Receive data from a TCP socket
- * @rqstp: request structure into which to receive an RPC Call
- *
- * Called in a loop when XPT_DATA has been set.
- *
- * Read the 4-byte stream record marker, then use the record length
- * in that marker to set up exactly the resources needed to receive
- * the next RPC message into @rqstp.
- *
- * Returns:
- * On success, the number of bytes in a received RPC Call, or
- * %0 if a complete RPC Call message was not ready to return
- *
- * The zero return case handles partial receives and callback Replies.
- * The state of a partial receive is preserved in the svc_sock for
- * the next call to svc_tcp_recvfrom.
- */
-static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
-{
- struct svc_sock *svsk =
- container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
- struct svc_serv *serv = svsk->sk_xprt.xpt_server;
- size_t want, base;
- ssize_t len;
- __be32 *p;
- __be32 calldir;
-
- if (svsk->sk_sock->ops->read_sock_cmsg)
- return svc_tcp_recvfrom_readsock(rqstp);
-
- clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
- len = svc_tcp_read_marker(svsk, rqstp);
- if (len < 0)
- goto error;
-
- base = svc_tcp_restore_pages(svsk, rqstp);
- want = len - (svsk->sk_tcplen - sizeof(rpc_fraghdr));
- len = svc_tcp_read_msg(rqstp, base + want, base);
- if (len >= 0) {
- trace_svcsock_tcp_recv(&svsk->sk_xprt, len);
- svsk->sk_tcplen += len;
- svsk->sk_datalen += len;
- }
- if (len != want || !svc_sock_final_rec(svsk))
- goto err_incomplete;
- if (svsk->sk_datalen < 8)
- goto err_nuts;
-
- rqstp->rq_arg.len = svsk->sk_datalen;
- rqstp->rq_arg.page_base = 0;
- if (rqstp->rq_arg.len <= rqstp->rq_arg.head[0].iov_len) {
- rqstp->rq_arg.head[0].iov_len = rqstp->rq_arg.len;
- rqstp->rq_arg.page_len = 0;
- } else
- rqstp->rq_arg.page_len = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len;
-
- rqstp->rq_xprt_ctxt = NULL;
- rqstp->rq_prot = IPPROTO_TCP;
- if (test_bit(XPT_LOCAL, &svsk->sk_xprt.xpt_flags))
- set_bit(RQ_LOCAL, &rqstp->rq_flags);
- else
- clear_bit(RQ_LOCAL, &rqstp->rq_flags);
-
- p = (__be32 *)rqstp->rq_arg.head[0].iov_base;
- calldir = p[1];
- if (calldir)
- len = receive_cb_reply(svsk, rqstp);
-
- /* Reset TCP read info */
- svsk->sk_datalen = 0;
- svc_tcp_fragment_received(svsk);
-
- if (len < 0)
- goto error;
-
- svc_xprt_copy_addrs(rqstp, &svsk->sk_xprt);
- if (serv->sv_stats)
- serv->sv_stats->nettcpcnt++;
-
- svc_sock_secure_port(rqstp);
- svc_xprt_received(rqstp->rq_xprt);
- return rqstp->rq_arg.len;
-
-err_incomplete:
- svc_tcp_save_pages(svsk, rqstp);
- if (len < 0 && len != -EAGAIN)
- goto err_delete;
- if (len == want)
- svc_tcp_fragment_received(svsk);
- else
- trace_svcsock_tcp_recv_short(&svsk->sk_xprt,
- svc_sock_reclen(svsk),
- svsk->sk_tcplen - sizeof(rpc_fraghdr));
- goto err_noclose;
-error:
- if (len != -EAGAIN)
- goto err_delete;
- trace_svcsock_tcp_recv_eagain(&svsk->sk_xprt, 0);
- goto err_noclose;
-err_nuts:
- svsk->sk_datalen = 0;
-err_delete:
- trace_svcsock_tcp_recv_err(&svsk->sk_xprt, len);
- svc_xprt_deferred_close(&svsk->sk_xprt);
-err_noclose:
- svc_xprt_received(rqstp->rq_xprt);
- return 0; /* record not complete */
-}
-
/*
* MSG_SPLICE_PAGES is used exclusively to reduce the number of
* copy operations in this path. Therefore the caller must ensure
--
2.53.0
prev parent reply other threads:[~2026-02-17 22:20 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-02-17 22:20 [RFC PATCH 0/4] ->read_sock with cmsg Chuck Lever
2026-02-17 22:20 ` [RFC PATCH 1/4] net: Introduce read_sock_cmsg proto_ops for control message delivery Chuck Lever
2026-02-18 7:29 ` Hannes Reinecke
2026-02-18 14:33 ` Chuck Lever
2026-02-18 15:52 ` Hannes Reinecke
2026-02-18 16:12 ` Chuck Lever
2026-02-19 4:06 ` Alistair Francis
2026-02-19 8:05 ` Hannes Reinecke
2026-02-19 8:10 ` Hannes Reinecke
2026-02-19 13:59 ` Chuck Lever
2026-02-28 11:09 ` Alistair Francis
2026-02-17 22:20 ` [RFC PATCH 2/4] tls: Implement read_sock_cmsg for kTLS software path Chuck Lever
2026-02-17 22:20 ` [RFC PATCH 3/4] sunrpc: Use read_sock_cmsg for svcsock TCP receives Chuck Lever
2026-02-17 22:20 ` Chuck Lever [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260217222033.1929211-5-cel@kernel.org \
--to=cel@kernel.org \
--cc=chuck.lever@oracle.com \
--cc=hare@suse.de \
--cc=kernel-tls-handshake@lists.linux.dev \
--cc=okorniev@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox