All of lore.kernel.org
 help / color / mirror / Atom feed
From: Chuck Lever <cel@kernel.org>
To: NeilBrown <neilb@ownmail.net>, Jeff Layton <jlayton@kernel.org>,
	Olga Kornievskaia <okorniev@redhat.com>,
	Dai Ngo <dai.ngo@oracle.com>, Tom Talpey <tom@talpey.com>,
	daire@dneg.com, Mike Snitzer <snitzer@kernel.org>
Cc: <linux-nfs@vger.kernel.org>, Chuck Lever <chuck.lever@oracle.com>
Subject: [RFC PATCH 7/7] SUNRPC: Set explicit TCP socket buffer sizes for NFSD
Date: Thu,  5 Feb 2026 10:57:29 -0500	[thread overview]
Message-ID: <20260205155729.6841-8-cel@kernel.org> (raw)
In-Reply-To: <20260205155729.6841-1-cel@kernel.org>

From: Chuck Lever <chuck.lever@oracle.com>

NFSD TCP sockets currently rely on system defaults with TCP
auto-tuning. On networks with large bandwidth-delay products, the
default maximum buffer sizes (6MB receive, 4MB send) can throttle
throughput. Administrators must resort to system-wide sysctl
adjustments (tcp_rmem/tcp_wmem), which affect all TCP connections
rather than just NFS traffic.

This change sets explicit buffer sizes for NFSD TCP data sockets.
The target size is 4 * sv_max_mesg, yielding approximately 16MB
with default NFS payload sizes. The result is capped at the smaller
of 16MB and 1/1024 of physical RAM, so memory-constrained systems
get proportionally smaller buffers. Setting SOCK_SNDBUF_LOCK and
SOCK_RCVBUF_LOCK in sk_userlocks disables auto-tuning, providing
predictable memory consumption.

The existing svc_sock_setbufsize() is renamed to
svc_udp_setbufsize() to reflect its UDP-specific purpose, and a
new svc_tcp_setbufsize() handles TCP data connections. Listener
sockets remain unaffected, as listeners do not transfer data.

This approach improves throughput on high-speed networks without
requiring system-wide configuration changes, while automatically
scaling down buffer sizes on small systems.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/svcsock.c | 52 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 48 insertions(+), 4 deletions(-)

diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 8d7ac777dfe3..e019ae285d47 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -50,6 +50,7 @@
 #include <net/handshake.h>
 #include <linux/uaccess.h>
 #include <linux/highmem.h>
+#include <linux/mm.h>
 #include <asm/ioctls.h>
 #include <linux/key.h>
 
@@ -377,9 +378,12 @@ static ssize_t svc_tcp_read_msg(struct svc_rqst *rqstp, size_t buflen,
 }
 
 /*
- * Set socket snd and rcv buffer lengths
+ * Set socket snd and rcv buffer lengths for UDP sockets.
+ *
+ * UDP sockets need large buffers because pending requests remain
+ * in the receive buffer until processed by a worker thread.
  */
-static void svc_sock_setbufsize(struct svc_sock *svsk, unsigned int nreqs)
+static void svc_udp_setbufsize(struct svc_sock *svsk, unsigned int nreqs)
 {
 	unsigned int max_mesg = svsk->sk_xprt.xpt_server->sv_max_mesg;
 	struct socket *sock = svsk->sk_sock;
@@ -393,6 +397,45 @@ static void svc_sock_setbufsize(struct svc_sock *svsk, unsigned int nreqs)
 	release_sock(sock->sk);
 }
 
+/* Accommodate high bandwidth-delay product connections */
+#define SVC_TCP_SNDBUF_MAX	(16 * 1024 * 1024)
+#define SVC_TCP_RCVBUF_MAX	(16 * 1024 * 1024)
+
+/*
+ * Set fixed snd and rcv buffer lengths for a TCP data socket.
+ *
+ * Each buffer targets 4 * sv_max_mesg to accommodate connections
+ * with a large bandwidth-delay product, limited to the smaller of
+ * SVC_TCP_{SND,RCV}BUF_MAX and 1/1024 of physical RAM. Both sizes
+ * are locked so auto-tuning cannot alter server memory consumption.
+ */
+static void svc_tcp_setbufsize(struct svc_sock *svsk)
+{
+	struct svc_serv *serv = svsk->sk_xprt.xpt_server;
+	struct socket *sock = svsk->sk_sock;
+	unsigned long mem_cap, ideal;
+	unsigned int sndbuf, rcvbuf;
+
+	/* Ideal size: buffer four sv_max_mesg-sized RPC messages */
+	ideal = serv->sv_max_mesg * 4;
+
+	/* Memory-based cap in bytes: 1/1024 of physical RAM */
+	mem_cap = (totalram_pages() >> 10) << PAGE_SHIFT;
+	/* NOTE(review): ideal >= sv_max_mesg, so only the upper bound bites */
+	sndbuf = clamp_t(unsigned long, ideal,
+			 serv->sv_max_mesg, min(mem_cap, SVC_TCP_SNDBUF_MAX));
+	rcvbuf = clamp_t(unsigned long, ideal,
+			 serv->sv_max_mesg, min(mem_cap, SVC_TCP_RCVBUF_MAX));
+
+	lock_sock(sock->sk);
+	sock->sk->sk_sndbuf = sndbuf;
+	sock->sk->sk_rcvbuf = rcvbuf;
+	sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+	sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+	sock->sk->sk_write_space(sock->sk);
+	release_sock(sock->sk);
+}
+
 static void svc_sock_secure_port(struct svc_rqst *rqstp)
 {
 	if (svc_port_is_privileged(svc_addr(rqstp)))
@@ -656,7 +699,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
 	     * provides an upper bound on the number of threads
 	     * which will access the socket.
 	     */
-	    svc_sock_setbufsize(svsk, serv->sv_nrthreads + 3);
+	    svc_udp_setbufsize(svsk, serv->sv_nrthreads + 3);
 
 	clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
 	err = kernel_recvmsg(svsk->sk_sock, &msg, NULL,
@@ -872,7 +915,7 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv)
 	 * receive and respond to one request.
 	 * svc_udp_recvfrom will re-adjust if necessary
 	 */
-	svc_sock_setbufsize(svsk, 3);
+	svc_udp_setbufsize(svsk, 3);
 
 	/* data might have come in before data_ready set up */
 	set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
@@ -1986,6 +2029,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
 		       svsk->sk_maxpages * sizeof(struct page *));
 
 		tcp_sock_set_nodelay(sk);
+		svc_tcp_setbufsize(svsk);
 
 		switch (sk->sk_state) {
 		case TCP_SYN_RECV:
-- 
2.52.0


  parent reply	other threads:[~2026-02-05 15:57 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-02-05 15:57 [RFC PATCH 0/7] sunrpc: Reduce lock contention for NFSD TCP sockets Chuck Lever
2026-02-05 15:57 ` [RFC PATCH 1/7] workqueue: Automatic affinity scope fallback for single-pod topologies Chuck Lever
2026-02-06 14:57   ` Chuck Lever
2026-02-05 15:57 ` [RFC PATCH 2/7] sunrpc: split svc_data_ready into protocol-specific callbacks Chuck Lever
2026-02-05 15:57 ` [RFC PATCH 3/7] sunrpc: add per-transport page recycling pool Chuck Lever
2026-02-05 15:57 ` [RFC PATCH 4/7] sunrpc: add dedicated TCP receiver thread Chuck Lever
2026-02-05 15:57 ` [RFC PATCH 5/7] sunrpc: implement flat combining for TCP socket sends Chuck Lever
2026-02-05 15:57 ` [RFC PATCH 6/7] sunrpc: unify fore and backchannel server TCP send paths Chuck Lever
2026-02-05 15:57 ` Chuck Lever [this message]
2026-03-30 18:57 ` [RFC PATCH 0/7] sunrpc: Reduce lock contention for NFSD TCP sockets Mike Snitzer
2026-03-30 19:04   ` Chuck Lever

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260205155729.6841-8-cel@kernel.org \
    --to=cel@kernel.org \
    --cc=chuck.lever@oracle.com \
    --cc=dai.ngo@oracle.com \
    --cc=daire@dneg.com \
    --cc=jlayton@kernel.org \
    --cc=linux-nfs@vger.kernel.org \
    --cc=neilb@ownmail.net \
    --cc=okorniev@redhat.com \
    --cc=snitzer@kernel.org \
    --cc=tom@talpey.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.