From: Chuck Lever <cel@kernel.org>
To: Hannes Reinecke <hare@suse.de>, Olga Kornievskaia <okorniev@redhat.com>
Cc: kernel-tls-handshake@lists.linux.dev,
Chuck Lever <chuck.lever@oracle.com>
Subject: [RFC PATCH 4/4] sunrpc: Remove sock_recvmsg path from svcsock TCP receives
Date: Tue, 17 Feb 2026 17:20:33 -0500 [thread overview]
Message-ID: <20260217222033.1929211-5-cel@kernel.org> (raw)
In-Reply-To: <20260217222033.1929211-1-cel@kernel.org>
From: Chuck Lever <chuck.lever@oracle.com>
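The svcsock TCP receive path maintains two code paths: one
using read_sock/read_sock_cmsg and a legacy path using
sock_recvmsg. Plain TCP sockets already provide read_sock
(tcp_read_sock) in their proto_ops, so the read_sock_cmsg
path can handle all cases relevant to NFSD by falling back
to read_sock when kTLS is not active (that is, when the
socket's proto_ops does not supply a read_sock_cmsg method).
Remove the sock_recvmsg-based receive path and the helpers
that only it used, and rename svc_tcp_recvfrom_readsock to
svc_tcp_recvfrom so that it becomes the sole TCP receive
entry point.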
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
net/sunrpc/svcsock.c | 314 +++----------------------------------------
1 file changed, 22 insertions(+), 292 deletions(-)
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 9600d15287e7..7d614dc44a05 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -238,140 +238,6 @@ static int svc_one_sock_name(struct svc_sock *svsk, char *buf, int remaining)
return len;
}
-static int
-svc_tcp_sock_process_cmsg(struct socket *sock, struct msghdr *msg,
- struct cmsghdr *cmsg, int ret)
-{
- u8 content_type = tls_get_record_type(sock->sk, cmsg);
- u8 level, description;
-
- switch (content_type) {
- case 0:
- break;
- case TLS_RECORD_TYPE_DATA:
- /* TLS sets EOR at the end of each application data
- * record, even though there might be more frames
- * waiting to be decrypted.
- */
- msg->msg_flags &= ~MSG_EOR;
- break;
- case TLS_RECORD_TYPE_ALERT:
- tls_alert_recv(sock->sk, msg, &level, &description);
- ret = (level == TLS_ALERT_LEVEL_FATAL) ?
- -ENOTCONN : -EAGAIN;
- break;
- default:
- /* discard this record type */
- ret = -EAGAIN;
- }
- return ret;
-}
-
-static int
-svc_tcp_sock_recv_cmsg(struct socket *sock, unsigned int *msg_flags)
-{
- union {
- struct cmsghdr cmsg;
- u8 buf[CMSG_SPACE(sizeof(u8))];
- } u;
- u8 alert[2];
- struct kvec alert_kvec = {
- .iov_base = alert,
- .iov_len = sizeof(alert),
- };
- struct msghdr msg = {
- .msg_flags = *msg_flags,
- .msg_control = &u,
- .msg_controllen = sizeof(u),
- };
- int ret;
-
- iov_iter_kvec(&msg.msg_iter, ITER_DEST, &alert_kvec, 1,
- alert_kvec.iov_len);
- ret = sock_recvmsg(sock, &msg, MSG_DONTWAIT);
- if (ret > 0 &&
- tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT) {
- iov_iter_revert(&msg.msg_iter, ret);
- ret = svc_tcp_sock_process_cmsg(sock, &msg, &u.cmsg, -EAGAIN);
- }
- return ret;
-}
-
-static int
-svc_tcp_sock_recvmsg(struct svc_sock *svsk, struct msghdr *msg)
-{
- int ret;
- struct socket *sock = svsk->sk_sock;
-
- ret = sock_recvmsg(sock, msg, MSG_DONTWAIT);
- if (msg->msg_flags & MSG_CTRUNC) {
- msg->msg_flags &= ~(MSG_CTRUNC | MSG_EOR);
- if (ret == 0 || ret == -EIO)
- ret = svc_tcp_sock_recv_cmsg(sock, &msg->msg_flags);
- }
- return ret;
-}
-
-#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
-static void svc_flush_bvec(const struct bio_vec *bvec, size_t size, size_t seek)
-{
- struct bvec_iter bi = {
- .bi_size = size + seek,
- };
- struct bio_vec bv;
-
- bvec_iter_advance(bvec, &bi, seek & PAGE_MASK);
- for_each_bvec(bv, bvec, bi, bi)
- flush_dcache_page(bv.bv_page);
-}
-#else
-static inline void svc_flush_bvec(const struct bio_vec *bvec, size_t size,
- size_t seek)
-{
-}
-#endif
-
-/*
- * Read from @rqstp's transport socket. The incoming message fills whole
- * pages in @rqstp's rq_pages array until the last page of the message
- * has been received into a partial page.
- */
-static ssize_t svc_tcp_read_msg(struct svc_rqst *rqstp, size_t buflen,
- size_t seek)
-{
- struct svc_sock *svsk =
- container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
- struct bio_vec *bvec = rqstp->rq_bvec;
- struct msghdr msg = { NULL };
- unsigned int i;
- ssize_t len;
- size_t t;
-
- clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
-
- for (i = 0, t = 0; t < buflen; i++, t += PAGE_SIZE)
- bvec_set_page(&bvec[i], rqstp->rq_pages[i], PAGE_SIZE, 0);
- rqstp->rq_respages = &rqstp->rq_pages[i];
- rqstp->rq_next_page = rqstp->rq_respages + 1;
-
- iov_iter_bvec(&msg.msg_iter, ITER_DEST, bvec, i, buflen);
- if (seek) {
- iov_iter_advance(&msg.msg_iter, seek);
- buflen -= seek;
- }
- len = svc_tcp_sock_recvmsg(svsk, &msg);
- if (len > 0)
- svc_flush_bvec(bvec, len, seek);
-
- /* If we read a full record, then assume there may be more
- * data to read (stream based sockets only!)
- */
- if (len == buflen)
- set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
-
- return len;
-}
-
/*
* Set socket snd and rcv buffer lengths
*/
@@ -1038,50 +904,6 @@ static void svc_tcp_clear_pages(struct svc_sock *svsk)
svsk->sk_datalen = 0;
}
-/*
- * Receive fragment record header into sk_marker.
- */
-static ssize_t svc_tcp_read_marker(struct svc_sock *svsk,
- struct svc_rqst *rqstp)
-{
- ssize_t want, len;
-
- /* If we haven't gotten the record length yet,
- * get the next four bytes.
- */
- if (svsk->sk_tcplen < sizeof(rpc_fraghdr)) {
- struct msghdr msg = { NULL };
- struct kvec iov;
-
- want = sizeof(rpc_fraghdr) - svsk->sk_tcplen;
- iov.iov_base = ((char *)&svsk->sk_marker) + svsk->sk_tcplen;
- iov.iov_len = want;
- iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, want);
- len = svc_tcp_sock_recvmsg(svsk, &msg);
- if (len < 0)
- return len;
- svsk->sk_tcplen += len;
- if (len < want) {
- /* call again to read the remaining bytes */
- goto err_short;
- }
- trace_svcsock_marker(&svsk->sk_xprt, svsk->sk_marker);
- if (svc_sock_reclen(svsk) + svsk->sk_datalen >
- svsk->sk_xprt.xpt_server->sv_max_mesg)
- goto err_too_large;
- }
- return svc_sock_reclen(svsk);
-
-err_too_large:
- net_notice_ratelimited("svc: %s oversized RPC fragment (%u octets) from %pISpc\n",
- svsk->sk_xprt.xpt_server->sv_name,
- svc_sock_reclen(svsk),
- (struct sockaddr *)&svsk->sk_xprt.xpt_remote);
- svc_xprt_deferred_close(&svsk->sk_xprt);
-err_short:
- return -EAGAIN;
-}
-
static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp)
{
struct rpc_xprt *bc_xprt = svsk->sk_xprt.xpt_bc_xprt;
@@ -1252,7 +1074,21 @@ static int svc_tcp_cmsg_actor(read_descriptor_t *desc,
return -EAGAIN;
}
-static int svc_tcp_recvfrom_readsock(struct svc_rqst *rqstp)
+/**
+ * svc_tcp_recvfrom - Receive data from a TCP socket
+ * @rqstp: request structure into which to receive an RPC Call
+ *
+ * Called in a loop when XPT_DATA has been set.
+ *
+ * Returns:
+ * On success, the number of bytes in a received RPC Call, or
+ * %0 if a complete RPC Call message was not ready to return
+ *
+ * The zero return case handles partial receives and callback Replies.
+ * The state of a partial receive is preserved in the svc_sock for
+ * the next call to svc_tcp_recvfrom.
+ */
+static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
{
struct svc_sock *svsk =
container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
@@ -1278,9 +1114,13 @@ static int svc_tcp_recvfrom_readsock(struct svc_rqst *rqstp)
desc.count = serv->sv_max_mesg;
lock_sock(sk);
- len = svsk->sk_sock->ops->read_sock_cmsg(sk, &desc,
- svc_tcp_recv_actor,
- svc_tcp_cmsg_actor);
+ if (svsk->sk_sock->ops->read_sock_cmsg)
+ len = svsk->sk_sock->ops->read_sock_cmsg(sk, &desc,
+ svc_tcp_recv_actor,
+ svc_tcp_cmsg_actor);
+ else
+ len = svsk->sk_sock->ops->read_sock(sk, &desc,
+ svc_tcp_recv_actor);
release_sock(sk);
if (desc.error == -EMSGSIZE)
@@ -1366,116 +1206,6 @@ static int svc_tcp_recvfrom_readsock(struct svc_rqst *rqstp)
return 0;
}
-/**
- * svc_tcp_recvfrom - Receive data from a TCP socket
- * @rqstp: request structure into which to receive an RPC Call
- *
- * Called in a loop when XPT_DATA has been set.
- *
- * Read the 4-byte stream record marker, then use the record length
- * in that marker to set up exactly the resources needed to receive
- * the next RPC message into @rqstp.
- *
- * Returns:
- * On success, the number of bytes in a received RPC Call, or
- * %0 if a complete RPC Call message was not ready to return
- *
- * The zero return case handles partial receives and callback Replies.
- * The state of a partial receive is preserved in the svc_sock for
- * the next call to svc_tcp_recvfrom.
- */
-static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
-{
- struct svc_sock *svsk =
- container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
- struct svc_serv *serv = svsk->sk_xprt.xpt_server;
- size_t want, base;
- ssize_t len;
- __be32 *p;
- __be32 calldir;
-
- if (svsk->sk_sock->ops->read_sock_cmsg)
- return svc_tcp_recvfrom_readsock(rqstp);
-
- clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
- len = svc_tcp_read_marker(svsk, rqstp);
- if (len < 0)
- goto error;
-
- base = svc_tcp_restore_pages(svsk, rqstp);
- want = len - (svsk->sk_tcplen - sizeof(rpc_fraghdr));
- len = svc_tcp_read_msg(rqstp, base + want, base);
- if (len >= 0) {
- trace_svcsock_tcp_recv(&svsk->sk_xprt, len);
- svsk->sk_tcplen += len;
- svsk->sk_datalen += len;
- }
- if (len != want || !svc_sock_final_rec(svsk))
- goto err_incomplete;
- if (svsk->sk_datalen < 8)
- goto err_nuts;
-
- rqstp->rq_arg.len = svsk->sk_datalen;
- rqstp->rq_arg.page_base = 0;
- if (rqstp->rq_arg.len <= rqstp->rq_arg.head[0].iov_len) {
- rqstp->rq_arg.head[0].iov_len = rqstp->rq_arg.len;
- rqstp->rq_arg.page_len = 0;
- } else
- rqstp->rq_arg.page_len = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len;
-
- rqstp->rq_xprt_ctxt = NULL;
- rqstp->rq_prot = IPPROTO_TCP;
- if (test_bit(XPT_LOCAL, &svsk->sk_xprt.xpt_flags))
- set_bit(RQ_LOCAL, &rqstp->rq_flags);
- else
- clear_bit(RQ_LOCAL, &rqstp->rq_flags);
-
- p = (__be32 *)rqstp->rq_arg.head[0].iov_base;
- calldir = p[1];
- if (calldir)
- len = receive_cb_reply(svsk, rqstp);
-
- /* Reset TCP read info */
- svsk->sk_datalen = 0;
- svc_tcp_fragment_received(svsk);
-
- if (len < 0)
- goto error;
-
- svc_xprt_copy_addrs(rqstp, &svsk->sk_xprt);
- if (serv->sv_stats)
- serv->sv_stats->nettcpcnt++;
-
- svc_sock_secure_port(rqstp);
- svc_xprt_received(rqstp->rq_xprt);
- return rqstp->rq_arg.len;
-
-err_incomplete:
- svc_tcp_save_pages(svsk, rqstp);
- if (len < 0 && len != -EAGAIN)
- goto err_delete;
- if (len == want)
- svc_tcp_fragment_received(svsk);
- else
- trace_svcsock_tcp_recv_short(&svsk->sk_xprt,
- svc_sock_reclen(svsk),
- svsk->sk_tcplen - sizeof(rpc_fraghdr));
- goto err_noclose;
-error:
- if (len != -EAGAIN)
- goto err_delete;
- trace_svcsock_tcp_recv_eagain(&svsk->sk_xprt, 0);
- goto err_noclose;
-err_nuts:
- svsk->sk_datalen = 0;
-err_delete:
- trace_svcsock_tcp_recv_err(&svsk->sk_xprt, len);
- svc_xprt_deferred_close(&svsk->sk_xprt);
-err_noclose:
- svc_xprt_received(rqstp->rq_xprt);
- return 0; /* record not complete */
-}
-
/*
* MSG_SPLICE_PAGES is used exclusively to reduce the number of
* copy operations in this path. Therefore the caller must ensure
--
2.53.0
prev parent reply other threads:[~2026-02-17 22:20 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-02-17 22:20 [RFC PATCH 0/4] ->read_sock with cmsg Chuck Lever
2026-02-17 22:20 ` [RFC PATCH 1/4] net: Introduce read_sock_cmsg proto_ops for control message delivery Chuck Lever
2026-02-18 7:29 ` Hannes Reinecke
2026-02-18 14:33 ` Chuck Lever
2026-02-18 15:52 ` Hannes Reinecke
2026-02-18 16:12 ` Chuck Lever
2026-02-19 4:06 ` Alistair Francis
2026-02-19 8:05 ` Hannes Reinecke
2026-02-19 8:10 ` Hannes Reinecke
2026-02-19 13:59 ` Chuck Lever
2026-02-28 11:09 ` Alistair Francis
2026-02-17 22:20 ` [RFC PATCH 2/4] tls: Implement read_sock_cmsg for kTLS software path Chuck Lever
2026-02-17 22:20 ` [RFC PATCH 3/4] sunrpc: Use read_sock_cmsg for svcsock TCP receives Chuck Lever
2026-02-17 22:20 ` Chuck Lever [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260217222033.1929211-5-cel@kernel.org \
--to=cel@kernel.org \
--cc=chuck.lever@oracle.com \
--cc=hare@suse.de \
--cc=kernel-tls-handshake@lists.linux.dev \
--cc=okorniev@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.