From: Wesley Atwell <atwellwea@gmail.com>
To: davem@davemloft.net, kuba@kernel.org, pabeni@redhat.com,
edumazet@google.com, ncardwell@google.com, dsahern@kernel.org,
matttbe@kernel.org, martineau@kernel.org, netdev@vger.kernel.org,
mptcp@lists.linux.dev
Cc: kuniyu@google.com, horms@kernel.org, geliang@kernel.org,
corbet@lwn.net, skhan@linuxfoundation.org, rostedt@goodmis.org,
mhiramat@kernel.org, mathieu.desnoyers@efficios.com,
0x7f454c46@gmail.com, linux-doc@vger.kernel.org,
linux-trace-kernel@vger.kernel.org,
linux-kselftest@vger.kernel.org, linux-kernel@vger.kernel.org,
linux-api@vger.kernel.org, atwellwea@gmail.com
Subject: [PATCH net 3/7] tcp: honor advertised receive window in memory admission and clamping
Date: Wed, 11 Mar 2026 01:55:56 -0600 [thread overview]
Message-ID: <20260311075600.948413-4-atwellwea@gmail.com> (raw)
In-Reply-To: <20260311075600.948413-1-atwellwea@gmail.com>
tp->rcv_wnd is an advertised promise to the sender, but receive-memory
accounting was still reconstructing that promise through mutable live
state.
Switch the receive-side decisions over to the advertise-time snapshot.
Use it when deciding whether a packet can be admitted, when deciding how
far to clamp future window growth, and when handling the scaled-window
quantization slack in __tcp_select_window(). If a snapshot is not
available, keep the legacy fallback behavior.
This keeps sender-visible rwnd and the local hard rmem budget in the
same unit system instead of letting scaling_ratio drift create
accounting mismatches.
Signed-off-by: Wesley Atwell <atwellwea@gmail.com>
---
include/net/tcp.h | 1 +
net/ipv4/tcp_input.c | 86 ++++++++++++++++++++++++++++++++++++++++---
net/ipv4/tcp_output.c | 14 ++++++-
3 files changed, 93 insertions(+), 8 deletions(-)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 187e6d660f62..88ddf7ee826e 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -384,6 +384,7 @@ int tcp_ioctl(struct sock *sk, int cmd, int *karg);
enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
void tcp_rcvbuf_grow(struct sock *sk, u32 newval);
+bool tcp_try_grow_rcvbuf(struct sock *sk, int needed);
void tcp_rcv_space_adjust(struct sock *sk);
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
void tcp_twsk_destructor(struct sock *sk);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index cba89733d121..f76011fc1b7a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -774,8 +774,37 @@ static void tcp_init_buffer_space(struct sock *sk)
(u32)TCP_INIT_CWND * tp->advmss);
}
+/* Try to grow sk_rcvbuf so the hard receive-memory limit covers @needed
+ * bytes beyond the memory already charged in sk_rmem_alloc.
+ */
+bool tcp_try_grow_rcvbuf(struct sock *sk, int needed)
+{
+ struct net *net = sock_net(sk);
+ int target;
+ int rmem2;
+
+ needed = max(needed, 0);
+ target = tcp_rmem_used(sk) + needed;
+
+ if (target <= READ_ONCE(sk->sk_rcvbuf))
+ return true;
+
+ rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
+ if (READ_ONCE(sk->sk_rcvbuf) >= rmem2 ||
+ (sk->sk_userlocks & SOCK_RCVBUF_LOCK) ||
+ tcp_under_memory_pressure(sk) ||
+ sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
+ return false;
+
+ WRITE_ONCE(sk->sk_rcvbuf,
+ min_t(int, rmem2,
+ max_t(int, READ_ONCE(sk->sk_rcvbuf), target)));
+
+ return target <= READ_ONCE(sk->sk_rcvbuf);
+}
+
/* 4. Recalculate window clamp after socket hit its memory bounds. */
-static void tcp_clamp_window(struct sock *sk)
+static void tcp_clamp_window_legacy(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -785,14 +814,42 @@ static void tcp_clamp_window(struct sock *sk)
icsk->icsk_ack.quick = 0;
rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
- if (sk->sk_rcvbuf < rmem2 &&
+ if (READ_ONCE(sk->sk_rcvbuf) < rmem2 &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
!tcp_under_memory_pressure(sk) &&
sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
WRITE_ONCE(sk->sk_rcvbuf,
min(atomic_read(&sk->sk_rmem_alloc), rmem2));
}
- if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
+ if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf))
+ tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
+}
+
+static void tcp_clamp_window(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 cur_rwnd = tcp_receive_window(tp);
+ int need;
+
+ if (!tcp_space_from_rcv_wnd(tp, cur_rwnd, &need)) {
+ tcp_clamp_window_legacy(sk);
+ return;
+ }
+
+ inet_csk(sk)->icsk_ack.quick = 0;
+ need = max_t(int, need, 0);
+
+ /* Keep the hard receive-memory cap large enough to honor the
+ * remaining receive window we already exposed to the sender. Use
+ * the scaling_ratio snapshot taken when tp->rcv_wnd was advertised,
+ * not the mutable live ratio which may drift later in the flow.
+ */
+ tcp_try_grow_rcvbuf(sk, need);
+
+ /* If the remaining advertised rwnd no longer fits the hard budget,
+ * slow future window growth until the accounting converges again.
+ */
+ if (need > tcp_rmem_avail(sk))
tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
}
@@ -5374,11 +5431,28 @@ static void tcp_ofo_queue(struct sock *sk)
static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb);
static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb);
+/* Sequence checks run against the sender-visible receive window before this
+ * point. Convert the incoming payload back to the hard receive-memory budget
+ * using the scaling_ratio that was in force when tp->rcv_wnd was advertised,
+ * so admission keeps honoring the same exposed window even if the live ratio
+ * changes later in the flow. Legacy TCP_REPAIR restores do not have that
+ * advertise-time basis, so they fall back to the pre-series admission rule
+ * until a fresh local advertisement refreshes the pair.
+ *
+ * Do not subtract sk_backlog.len here. tcp_space() already reserves backlog
+ * bytes when selecting future advertised windows, and sk_backlog.len stays
+ * inflated until __release_sock() finishes draining backlog. Subtracting it
+ * again here would double count already-queued backlog packets as they move
+ * into sk_rmem_alloc.
+ */
static bool tcp_can_ingest(const struct sock *sk, const struct sk_buff *skb)
{
- unsigned int rmem = atomic_read(&sk->sk_rmem_alloc);
+ int need;
+
+ if (!tcp_space_from_rcv_wnd(tcp_sk(sk), skb->len, &need))
+ return atomic_read(&sk->sk_rmem_alloc) <= READ_ONCE(sk->sk_rcvbuf);
- return rmem <= sk->sk_rcvbuf;
+ return need <= tcp_rmem_avail(sk);
}
static int tcp_try_rmem_schedule(struct sock *sk, const struct sk_buff *skb,
@@ -6014,7 +6088,7 @@ static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb)
struct tcp_sock *tp = tcp_sk(sk);
/* Do nothing if our queues are empty. */
- if (!atomic_read(&sk->sk_rmem_alloc))
+ if (!tcp_rmem_used(sk))
return -1;
NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c1b94d67d8fe..5e69fc31a4da 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3377,13 +3377,23 @@ u32 __tcp_select_window(struct sock *sk)
* scaled window will not line up with the MSS boundary anyway.
*/
if (tp->rx_opt.rcv_wscale) {
+ int rcv_wscale = 1 << tp->rx_opt.rcv_wscale;
+
window = free_space;
/* Advertise enough space so that it won't get scaled away.
- * Import case: prevent zero window announcement if
+ * Important case: prevent zero-window announcement if
* 1<<rcv_wscale > mss.
*/
- window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale));
+ window = ALIGN(window, rcv_wscale);
+
+ /* Back any scale-quantization slack before we expose it.
+ * Otherwise tcp_can_ingest() can reject data which is still
+ * within the sender-visible window.
+ */
+ if (window > free_space &&
+ !tcp_try_grow_rcvbuf(sk, tcp_space_from_win(sk, window)))
+ window = round_down(free_space, rcv_wscale);
} else {
window = tp->rcv_wnd;
/* Get the largest window that is a nice multiple of mss.
--
2.34.1
next prev parent reply other threads:[~2026-03-11 7:56 UTC|newest]
Thread overview: 12+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-11 7:55 [PATCH net 0/7] tcp: preserve advertised rwnd accounting across receive-memory decisions Wesley Atwell
2026-03-11 7:55 ` [PATCH net 1/7] tcp: track advertise-time scaling basis for rcv_wnd Wesley Atwell
2026-03-11 7:55 ` [PATCH net 2/7] tcp: preserve rcv_wnd snapshot when updating advertised windows Wesley Atwell
2026-03-11 7:55 ` Wesley Atwell [this message]
2026-03-11 7:55 ` [PATCH net 4/7] tcp: extend TCP_REPAIR_WINDOW with receive-window scaling snapshot Wesley Atwell
2026-03-11 7:55 ` [PATCH net 5/7] mptcp: refresh tcp rcv_wnd snapshot when syncing receive windows Wesley Atwell
2026-03-11 7:55 ` [PATCH net 6/7] tcp: expose rmem and backlog accounting in rcvbuf_grow tracepoints Wesley Atwell
2026-03-11 7:56 ` [PATCH net 7/7] selftests: tcp_ao: cover legacy and extended TCP_REPAIR_WINDOW layouts Wesley Atwell
2026-03-11 8:34 ` [PATCH net 0/7] tcp: preserve advertised rwnd accounting across receive-memory decisions Eric Dumazet
2026-03-12 0:41 ` Jakub Kicinski
2026-03-12 1:49 ` Eric Dumazet
2026-03-12 0:43 ` Jakub Kicinski
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260311075600.948413-4-atwellwea@gmail.com \
--to=atwellwea@gmail.com \
--cc=0x7f454c46@gmail.com \
--cc=corbet@lwn.net \
--cc=davem@davemloft.net \
--cc=dsahern@kernel.org \
--cc=edumazet@google.com \
--cc=geliang@kernel.org \
--cc=horms@kernel.org \
--cc=kuba@kernel.org \
--cc=kuniyu@google.com \
--cc=linux-api@vger.kernel.org \
--cc=linux-doc@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-kselftest@vger.kernel.org \
--cc=linux-trace-kernel@vger.kernel.org \
--cc=martineau@kernel.org \
--cc=mathieu.desnoyers@efficios.com \
--cc=matttbe@kernel.org \
--cc=mhiramat@kernel.org \
--cc=mptcp@lists.linux.dev \
--cc=ncardwell@google.com \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
--cc=rostedt@goodmis.org \
--cc=skhan@linuxfoundation.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox