public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: atwellwea@gmail.com
To: netdev@vger.kernel.org, davem@davemloft.net, kuba@kernel.org,
	pabeni@redhat.com, edumazet@google.com, ncardwell@google.com
Cc: linux-kernel@vger.kernel.org, linux-api@vger.kernel.org,
	linux-doc@vger.kernel.org, linux-kselftest@vger.kernel.org,
	linux-trace-kernel@vger.kernel.org, mptcp@lists.linux.dev,
	dsahern@kernel.org, horms@kernel.org, kuniyu@google.com,
	andrew+netdev@lunn.ch, willemdebruijn.kernel@gmail.com,
	jasowang@redhat.com, skhan@linuxfoundation.org, corbet@lwn.net,
	matttbe@kernel.org, martineau@kernel.org, geliang@kernel.org,
	rostedt@goodmis.org, mhiramat@kernel.org,
	mathieu.desnoyers@efficios.com, 0x7f454c46@gmail.com
Subject: [PATCH net-next v2 05/14] tcp: grow rcvbuf to back scaled-window quantization slack
Date: Sat, 14 Mar 2026 14:13:39 -0600	[thread overview]
Message-ID: <20260314201348.1786972-6-atwellwea@gmail.com> (raw)
In-Reply-To: <20260314201348.1786972-1-atwellwea@gmail.com>

From: Wesley Atwell <atwellwea@gmail.com>

Teach TCP to grow sk_rcvbuf when scale rounding would otherwise expose
more sender-visible window than the current hard receive-memory backing
can cover.

The new helper keeps backlog and memory-pressure limits in the same
units as the rest of the receive path, while __tcp_select_window()
backs any rounding slack before advertising it.

Signed-off-by: Wesley Atwell <atwellwea@gmail.com>
---
 include/net/tcp.h     | 12 ++++++++++++
 net/ipv4/tcp_input.c  | 36 ++++++++++++++++++++++++++++++++++--
 net/ipv4/tcp_output.c | 15 +++++++++++++--
 3 files changed, 59 insertions(+), 4 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index fc22ab6b80d5..5b479ad44f89 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -397,6 +397,7 @@ int tcp_ioctl(struct sock *sk, int cmd, int *karg);
 enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
 void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
 void tcp_rcvbuf_grow(struct sock *sk, u32 newval);
+bool tcp_try_grow_rcvbuf(struct sock *sk, int needed);
 void tcp_rcv_space_adjust(struct sock *sk);
 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
 void tcp_twsk_destructor(struct sock *sk);
@@ -1844,6 +1845,17 @@ static inline int tcp_rwnd_avail(const struct sock *sk)
 	return tcp_rmem_avail(sk) - READ_ONCE(sk->sk_backlog.len);
 }
 
+/* Passive children clone the listener's sk_socket until accept() grafts
+ * their own struct socket, so only sockets that point back to themselves
+ * should autotune receive-buffer backing.
+ */
+static inline bool tcp_rcvbuf_grow_allowed(const struct sock *sk)
+{
+	struct socket *sock = READ_ONCE(sk->sk_socket);
+
+	return sock && READ_ONCE(sock->sk) == sk;
+}
+
 /* Note: caller must be prepared to deal with negative returns */
 static inline int tcp_space(const struct sock *sk)
 {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 352f814a4ff6..32256519a085 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -774,6 +774,38 @@ static void tcp_init_buffer_space(struct sock *sk)
 				    (u32)TCP_INIT_CWND * tp->advmss);
 }
 
+/* Try to grow sk_rcvbuf so the hard receive-memory limit covers @needed
+ * bytes beyond sk_rmem_alloc while preserving sender-visible headroom
+ * already consumed by sk_backlog.len.
+ */
+bool tcp_try_grow_rcvbuf(struct sock *sk, int needed)
+{
+	struct net *net = sock_net(sk);
+	int backlog;
+	int rmem2;
+	int target;
+
+	needed = max(needed, 0);
+	backlog = READ_ONCE(sk->sk_backlog.len);
+	target = tcp_rmem_used(sk) + backlog + needed;
+
+	if (target <= READ_ONCE(sk->sk_rcvbuf))
+		return true;
+
+	rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
+	if (READ_ONCE(sk->sk_rcvbuf) >= rmem2 ||
+	    (sk->sk_userlocks & SOCK_RCVBUF_LOCK) ||
+	    tcp_under_memory_pressure(sk) ||
+	    sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
+		return false;
+
+	WRITE_ONCE(sk->sk_rcvbuf,
+		   min_t(int, rmem2,
+			 max_t(int, READ_ONCE(sk->sk_rcvbuf), target)));
+
+	return target <= READ_ONCE(sk->sk_rcvbuf);
+}
+
 /* 4. Recalculate window clamp after socket hit its memory bounds. */
 static void tcp_clamp_window(struct sock *sk)
 {
@@ -785,14 +817,14 @@ static void tcp_clamp_window(struct sock *sk)
 	icsk->icsk_ack.quick = 0;
 	rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
 
-	if (sk->sk_rcvbuf < rmem2 &&
+	if (READ_ONCE(sk->sk_rcvbuf) < rmem2 &&
 	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
 	    !tcp_under_memory_pressure(sk) &&
 	    sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
 		WRITE_ONCE(sk->sk_rcvbuf,
 			   min(atomic_read(&sk->sk_rmem_alloc), rmem2));
 	}
-	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
+	if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf))
 		tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
 }
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 57a2a6daaad3..53781cf591d2 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3375,13 +3375,24 @@ u32 __tcp_select_window(struct sock *sk)
 	 * scaled window will not line up with the MSS boundary anyway.
 	 */
 	if (tp->rx_opt.rcv_wscale) {
+		int rcv_wscale = 1 << tp->rx_opt.rcv_wscale;
+
 		window = free_space;
 
 		/* Advertise enough space so that it won't get scaled away.
-		 * Import case: prevent zero window announcement if
+		 * Important case: prevent zero-window announcement if
 		 * 1<<rcv_wscale > mss.
 		 */
-		window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale));
+		window = ALIGN(window, rcv_wscale);
+
+		/* Back any scale-quantization slack before we expose it.
+		 * Otherwise tcp_can_ingest() can reject data which is still
+		 * within the sender-visible window.
+		 */
+		if (window > free_space &&
+		    (!tcp_rcvbuf_grow_allowed(sk) ||
+		     !tcp_try_grow_rcvbuf(sk, tcp_space_from_win(sk, window))))
+			window = round_down(free_space, rcv_wscale);
 	} else {
 		window = tp->rcv_wnd;
 		/* Get the largest window that is a nice multiple of mss.
-- 
2.43.0


  parent reply	other threads:[~2026-03-14 20:14 UTC|newest]

Thread overview: 25+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-03-14 20:13 [PATCH net-next v2 00/14] tcp: preserve receive-window accounting across ratio drift atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 01/14] tcp: factor receive-memory accounting helpers atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 02/14] tcp: snapshot advertise-time scaling for rcv_wnd atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 03/14] tcp: refresh rcv_wnd snapshots at TCP write sites atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 04/14] tcp: snapshot the maximum advertised receive window atwellwea
2026-03-14 20:13 ` atwellwea [this message]
2026-03-16 11:04   ` [PATCH net-next v2 05/14] tcp: grow rcvbuf to back scaled-window quantization slack Paolo Abeni
2026-03-16 11:24   ` Paolo Abeni
2026-03-16 11:31   ` Paolo Abeni
2026-03-14 20:13 ` [PATCH net-next v2 06/14] tcp: regrow rcvbuf when scaling_ratio drops after advertisement atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 07/14] tcp: honor the maximum advertised window after live retraction atwellwea
2026-03-16 11:44   ` Paolo Abeni
2026-03-14 20:13 ` [PATCH net-next v2 08/14] tcp: extend TCP_REPAIR_WINDOW for live and max-window snapshots atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 09/14] mptcp: refresh TCP receive-window snapshots on subflows atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 10/14] tcp: expose rmem and backlog in tcp and mptcp rcvbuf_grow tracepoints atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 11/14] selftests: tcp_ao: cover legacy, v1, and retracted repair windows atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 12/14] tun/selftests: add RX truesize injection for TCP window tests atwellwea
2026-03-15  1:18   ` Jakub Kicinski
2026-03-14 20:13 ` [PATCH net-next v2 13/14] netdevsim: add peer RX truesize support for selftests atwellwea
2026-03-15  1:18   ` Jakub Kicinski
2026-03-14 20:13 ` [PATCH net-next v2 14/14] netdevsim: release pinned PSP ext on drop paths atwellwea
2026-03-15  1:19 ` [PATCH net-next v2 00/14] tcp: preserve receive-window accounting across ratio drift Jakub Kicinski
2026-03-16 11:09 ` Paolo Abeni
     [not found]   ` <CAN=sVvyNpkyok_bt8eQSmqc4f7g7QoZBUmRmNRLoFz1HasEzMA@mail.gmail.com>
2026-03-16 17:47     ` Paolo Abeni
2026-03-16 18:03       ` Wesley Atwell

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260314201348.1786972-6-atwellwea@gmail.com \
    --to=atwellwea@gmail.com \
    --cc=0x7f454c46@gmail.com \
    --cc=andrew+netdev@lunn.ch \
    --cc=corbet@lwn.net \
    --cc=davem@davemloft.net \
    --cc=dsahern@kernel.org \
    --cc=edumazet@google.com \
    --cc=geliang@kernel.org \
    --cc=horms@kernel.org \
    --cc=jasowang@redhat.com \
    --cc=kuba@kernel.org \
    --cc=kuniyu@google.com \
    --cc=linux-api@vger.kernel.org \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-kselftest@vger.kernel.org \
    --cc=linux-trace-kernel@vger.kernel.org \
    --cc=martineau@kernel.org \
    --cc=mathieu.desnoyers@efficios.com \
    --cc=matttbe@kernel.org \
    --cc=mhiramat@kernel.org \
    --cc=mptcp@lists.linux.dev \
    --cc=ncardwell@google.com \
    --cc=netdev@vger.kernel.org \
    --cc=pabeni@redhat.com \
    --cc=rostedt@goodmis.org \
    --cc=skhan@linuxfoundation.org \
    --cc=willemdebruijn.kernel@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox