* [PATCH net-next] tcp: better handle TCP_TX_DELAY on established flows
From: Eric Dumazet @ 2025-10-13 14:59 UTC
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, Neal Cardwell, Willem de Bruijn, Kuniyuki Iwashima,
	netdev, eric.dumazet, Eric Dumazet

Some applications use the TCP_TX_DELAY socket option after the TCP flow
is established.

Some metrics then need to be updated; otherwise TCP might take a long
time to adapt to the new (emulated) RTT.

This patch adjusts tp->srtt_us, tp->rtt_min, icsk_rto
and sk->sk_pacing_rate.
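
As a worked example of the srtt adjustment: tp->srtt_us holds the
smoothed RTT in usec, left-shifted by 3, so changing the delay from
old to new shifts it by (new - old) << 3. For instance, raising the
delay from 0 to 10000 usec (10 ms) adds 10000 << 3 = 80000 to
tp->srtt_us, i.e. the smoothed RTT estimate grows by 10 ms at once.
This is also why setsockopt() now rejects val >= (1U << (31 - 3)):
the shifted value must fit in 31 bits.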

This is best effort; for instance, icsk_rto is reset without taking
backoff into account.
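
For reference, a minimal userspace sketch of changing the delay on a
live connection (assumptions: fd is an already-connected TCP socket,
the fallback define matches include/uapi/linux/tcp.h, and
TCP_TX_DELAY takes the delay in microseconds):

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <sys/socket.h>

#ifndef TCP_TX_DELAY
#define TCP_TX_DELAY 37	/* from include/uapi/linux/tcp.h */
#endif

/* Emulate extra RTT on an established flow. With this patch the
 * kernel also rescales tp->srtt_us, resets tp->rtt_min, and
 * recomputes the RTO and the pacing rate, instead of waiting for
 * fresh RTT samples to absorb the change.
 */
static int set_tx_delay(int fd, int delay_usec)
{
	if (setsockopt(fd, IPPROTO_TCP, TCP_TX_DELAY,
		       &delay_usec, sizeof(delay_usec)) < 0) {
		perror("setsockopt(TCP_TX_DELAY)");
		return -1;
	}
	return 0;
}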

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/net/tcp.h    |  2 ++
 net/ipv4/tcp.c       | 31 +++++++++++++++++++++++++++----
 net/ipv4/tcp_input.c |  4 ++--
 3 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 5ca230ed526ae02711e8d2a409b91664b73390f2..1e547138f4fb7f5c47d15990954d4d135f465f73 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -461,6 +461,8 @@ enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child,
 void tcp_enter_loss(struct sock *sk);
 void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, int flag);
 void tcp_clear_retrans(struct tcp_sock *tp);
+void tcp_update_pacing_rate(struct sock *sk);
+void tcp_set_rto(struct sock *sk);
 void tcp_update_metrics(struct sock *sk);
 void tcp_init_metrics(struct sock *sk);
 void tcp_metrics_init(void);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8a18aeca7ab07480844946120f51a0555699b4c3..84662904ca96ed5685e56a827d067b62fdac3063 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3583,9 +3583,12 @@ static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf,
 DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
 EXPORT_IPV6_MOD(tcp_tx_delay_enabled);
 
-static void tcp_enable_tx_delay(void)
+static void tcp_enable_tx_delay(struct sock *sk, int val)
 {
-	if (!static_branch_unlikely(&tcp_tx_delay_enabled)) {
+	struct tcp_sock *tp = tcp_sk(sk);
+	s32 delta = (val - tp->tcp_tx_delay) << 3;
+
+	if (val && !static_branch_unlikely(&tcp_tx_delay_enabled)) {
 		static int __tcp_tx_delay_enabled = 0;
 
 		if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) {
@@ -3593,6 +3596,22 @@ static void tcp_enable_tx_delay(void)
 			pr_info("TCP_TX_DELAY enabled\n");
 		}
 	}
+	/* If we change tcp_tx_delay on a live flow, adjust tp->srtt_us,
+	 * tp->rtt_min, icsk_rto and sk->sk_pacing_rate.
+	 * This is best effort.
+	 */
+	if (delta && sk->sk_state == TCP_ESTABLISHED) {
+		s64 srtt = (s64)tp->srtt_us + delta;
+
+		tp->srtt_us = clamp_t(s64, srtt, 1, ~0U);
+
+		/* Note: does not deal with non zero icsk_backoff */
+		tcp_set_rto(sk);
+
+		minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);
+
+		tcp_update_pacing_rate(sk);
+	}
 }
 
 /* When set indicates to always queue non-full frames.  Later the user clears
@@ -4119,8 +4138,12 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
 			tp->recvmsg_inq = val;
 		break;
 	case TCP_TX_DELAY:
-		if (val)
-			tcp_enable_tx_delay();
+		/* tp->srtt_us is u32, and is shifted by 3 */
+		if (val < 0 || val >= (1U << (31 - 3)) ) {
+			err = -EINVAL;
+			break;
+		}
+		tcp_enable_tx_delay(sk, val);
 		WRITE_ONCE(tp->tcp_tx_delay, val);
 		break;
 	default:
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 31ea5af49f2dc8a6f95f3f8c24065369765b8987..8fc97f4d8a6b2f8e39cabf6c9b3e6cdae294a5f5 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1095,7 +1095,7 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
 	tp->srtt_us = max(1U, srtt);
 }
 
-static void tcp_update_pacing_rate(struct sock *sk)
+void tcp_update_pacing_rate(struct sock *sk)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	u64 rate;
@@ -1132,7 +1132,7 @@ static void tcp_update_pacing_rate(struct sock *sk)
 /* Calculate rto without backoff.  This is the second half of Van Jacobson's
  * routine referred to above.
  */
-static void tcp_set_rto(struct sock *sk)
+void tcp_set_rto(struct sock *sk)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	/* Old crap is replaced with new one. 8)
-- 
2.51.0.740.g6adb054d12-goog




Thread overview: 13+ messages
2025-10-13 14:59 [PATCH net-next] tcp: better handle TCP_TX_DELAY on established flows Eric Dumazet
2025-10-14  8:22 ` Paolo Abeni
2025-10-14  8:29   ` Eric Dumazet
2025-10-14  8:54     ` Eric Dumazet
2025-10-14  9:37       ` Paolo Abeni
2025-10-14  9:40         ` Eric Dumazet
2025-10-14 16:06           ` Jakub Kicinski
2025-10-14 16:16             ` Eric Dumazet
2025-10-14 17:04               ` Jakub Kicinski
2025-10-14  9:38       ` Eric Dumazet
2025-10-15  2:34 ` Jakub Kicinski
2025-10-15  5:17   ` Eric Dumazet
2025-10-15 16:00 ` patchwork-bot+netdevbpf
