From: atwellwea@gmail.com
To: netdev@vger.kernel.org, davem@davemloft.net, kuba@kernel.org,
pabeni@redhat.com, edumazet@google.com, ncardwell@google.com
Cc: linux-kernel@vger.kernel.org, linux-api@vger.kernel.org,
linux-doc@vger.kernel.org, linux-kselftest@vger.kernel.org,
linux-trace-kernel@vger.kernel.org, mptcp@lists.linux.dev,
dsahern@kernel.org, horms@kernel.org, kuniyu@google.com,
andrew+netdev@lunn.ch, willemdebruijn.kernel@gmail.com,
jasowang@redhat.com, skhan@linuxfoundation.org, corbet@lwn.net,
matttbe@kernel.org, martineau@kernel.org, geliang@kernel.org,
rostedt@goodmis.org, mhiramat@kernel.org,
mathieu.desnoyers@efficios.com, 0x7f454c46@gmail.com
Subject: [PATCH net-next v2 05/14] tcp: grow rcvbuf to back scaled-window quantization slack
Date: Sat, 14 Mar 2026 14:13:39 -0600 [thread overview]
Message-ID: <20260314201348.1786972-6-atwellwea@gmail.com> (raw)
In-Reply-To: <20260314201348.1786972-1-atwellwea@gmail.com>
From: Wesley Atwell <atwellwea@gmail.com>
Teach TCP to grow sk_rcvbuf when scale rounding would otherwise expose
more sender-visible window than the current hard receive-memory backing
can cover.
The new helper keeps backlog and memory-pressure limits in the same
units as the rest of the receive path, while __tcp_select_window()
backs any rounding slack before advertising it.
Signed-off-by: Wesley Atwell <atwellwea@gmail.com>
---
include/net/tcp.h | 12 ++++++++++++
net/ipv4/tcp_input.c | 36 ++++++++++++++++++++++++++++++++++--
net/ipv4/tcp_output.c | 15 +++++++++++++--
3 files changed, 59 insertions(+), 4 deletions(-)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index fc22ab6b80d5..5b479ad44f89 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -397,6 +397,7 @@ int tcp_ioctl(struct sock *sk, int cmd, int *karg);
enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
void tcp_rcvbuf_grow(struct sock *sk, u32 newval);
+bool tcp_try_grow_rcvbuf(struct sock *sk, int needed);
void tcp_rcv_space_adjust(struct sock *sk);
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
void tcp_twsk_destructor(struct sock *sk);
@@ -1844,6 +1845,17 @@ static inline int tcp_rwnd_avail(const struct sock *sk)
return tcp_rmem_avail(sk) - READ_ONCE(sk->sk_backlog.len);
}
+/* Passive children clone the listener's sk_socket until accept() grafts
+ * their own struct socket, so only sockets that point back to themselves
+ * should autotune receive-buffer backing.
+ */
+static inline bool tcp_rcvbuf_grow_allowed(const struct sock *sk)
+{
+ struct socket *sock = READ_ONCE(sk->sk_socket);
+
+ return sock && READ_ONCE(sock->sk) == sk;
+}
+
/* Note: caller must be prepared to deal with negative returns */
static inline int tcp_space(const struct sock *sk)
{
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 352f814a4ff6..32256519a085 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -774,6 +774,38 @@ static void tcp_init_buffer_space(struct sock *sk)
(u32)TCP_INIT_CWND * tp->advmss);
}
+/* Try to grow sk_rcvbuf so the hard receive-memory limit covers @needed
+ * bytes beyond sk_rmem_alloc while preserving sender-visible headroom
+ * already consumed by sk_backlog.len.
+ */
+bool tcp_try_grow_rcvbuf(struct sock *sk, int needed)
+{
+ struct net *net = sock_net(sk);
+ int backlog;
+ int rmem2;
+ int target;
+
+ needed = max(needed, 0);
+ backlog = READ_ONCE(sk->sk_backlog.len);
+ target = tcp_rmem_used(sk) + backlog + needed;
+
+ if (target <= READ_ONCE(sk->sk_rcvbuf))
+ return true;
+
+ rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
+ if (READ_ONCE(sk->sk_rcvbuf) >= rmem2 ||
+ (sk->sk_userlocks & SOCK_RCVBUF_LOCK) ||
+ tcp_under_memory_pressure(sk) ||
+ sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
+ return false;
+
+ WRITE_ONCE(sk->sk_rcvbuf,
+ min_t(int, rmem2,
+ max_t(int, READ_ONCE(sk->sk_rcvbuf), target)));
+
+ return target <= READ_ONCE(sk->sk_rcvbuf);
+}
+
/* 4. Recalculate window clamp after socket hit its memory bounds. */
static void tcp_clamp_window(struct sock *sk)
{
@@ -785,14 +817,14 @@ static void tcp_clamp_window(struct sock *sk)
icsk->icsk_ack.quick = 0;
rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
- if (sk->sk_rcvbuf < rmem2 &&
+ if (READ_ONCE(sk->sk_rcvbuf) < rmem2 &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
!tcp_under_memory_pressure(sk) &&
sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
WRITE_ONCE(sk->sk_rcvbuf,
min(atomic_read(&sk->sk_rmem_alloc), rmem2));
}
- if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
+ if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf))
tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 57a2a6daaad3..53781cf591d2 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3375,13 +3375,24 @@ u32 __tcp_select_window(struct sock *sk)
* scaled window will not line up with the MSS boundary anyway.
*/
if (tp->rx_opt.rcv_wscale) {
+ int rcv_wscale = 1 << tp->rx_opt.rcv_wscale;
+
window = free_space;
/* Advertise enough space so that it won't get scaled away.
- * Import case: prevent zero window announcement if
+ * Important case: prevent zero-window announcement if
* 1<<rcv_wscale > mss.
*/
- window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale));
+ window = ALIGN(window, rcv_wscale);
+
+ /* Back any scale-quantization slack before we expose it.
+ * Otherwise tcp_can_ingest() can reject data which is still
+ * within the sender-visible window.
+ */
+ if (window > free_space &&
+ (!tcp_rcvbuf_grow_allowed(sk) ||
+ !tcp_try_grow_rcvbuf(sk, tcp_space_from_win(sk, window))))
+ window = round_down(free_space, rcv_wscale);
} else {
window = tp->rcv_wnd;
/* Get the largest window that is a nice multiple of mss.
--
2.43.0
next prev parent reply other threads:[~2026-03-14 20:14 UTC|newest]
Thread overview: 25+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-14 20:13 [PATCH net-next v2 00/14] tcp: preserve receive-window accounting across ratio drift atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 01/14] tcp: factor receive-memory accounting helpers atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 02/14] tcp: snapshot advertise-time scaling for rcv_wnd atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 03/14] tcp: refresh rcv_wnd snapshots at TCP write sites atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 04/14] tcp: snapshot the maximum advertised receive window atwellwea
2026-03-14 20:13 ` atwellwea [this message]
2026-03-16 11:04 ` [PATCH net-next v2 05/14] tcp: grow rcvbuf to back scaled-window quantization slack Paolo Abeni
2026-03-16 11:24 ` Paolo Abeni
2026-03-16 11:31 ` Paolo Abeni
2026-03-14 20:13 ` [PATCH net-next v2 06/14] tcp: regrow rcvbuf when scaling_ratio drops after advertisement atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 07/14] tcp: honor the maximum advertised window after live retraction atwellwea
2026-03-16 11:44 ` Paolo Abeni
2026-03-14 20:13 ` [PATCH net-next v2 08/14] tcp: extend TCP_REPAIR_WINDOW for live and max-window snapshots atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 09/14] mptcp: refresh TCP receive-window snapshots on subflows atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 10/14] tcp: expose rmem and backlog in tcp and mptcp rcvbuf_grow tracepoints atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 11/14] selftests: tcp_ao: cover legacy, v1, and retracted repair windows atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 12/14] tun/selftests: add RX truesize injection for TCP window tests atwellwea
2026-03-15 1:18 ` Jakub Kicinski
2026-03-14 20:13 ` [PATCH net-next v2 13/14] netdevsim: add peer RX truesize support for selftests atwellwea
2026-03-15 1:18 ` Jakub Kicinski
2026-03-14 20:13 ` [PATCH net-next v2 14/14] netdevsim: release pinned PSP ext on drop paths atwellwea
2026-03-15 1:19 ` [PATCH net-next v2 00/14] tcp: preserve receive-window accounting across ratio drift Jakub Kicinski
2026-03-16 11:09 ` Paolo Abeni
[not found] ` <CAN=sVvyNpkyok_bt8eQSmqc4f7g7QoZBUmRmNRLoFz1HasEzMA@mail.gmail.com>
2026-03-16 17:47 ` Paolo Abeni
2026-03-16 18:03 ` Wesley Atwell
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260314201348.1786972-6-atwellwea@gmail.com \
--to=atwellwea@gmail.com \
--cc=0x7f454c46@gmail.com \
--cc=andrew+netdev@lunn.ch \
--cc=corbet@lwn.net \
--cc=davem@davemloft.net \
--cc=dsahern@kernel.org \
--cc=edumazet@google.com \
--cc=geliang@kernel.org \
--cc=horms@kernel.org \
--cc=jasowang@redhat.com \
--cc=kuba@kernel.org \
--cc=kuniyu@google.com \
--cc=linux-api@vger.kernel.org \
--cc=linux-doc@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-kselftest@vger.kernel.org \
--cc=linux-trace-kernel@vger.kernel.org \
--cc=martineau@kernel.org \
--cc=mathieu.desnoyers@efficios.com \
--cc=matttbe@kernel.org \
--cc=mhiramat@kernel.org \
--cc=mptcp@lists.linux.dev \
--cc=ncardwell@google.com \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
--cc=rostedt@goodmis.org \
--cc=skhan@linuxfoundation.org \
--cc=willemdebruijn.kernel@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.