[PATCH v3 net-next 1/1] tcp: Replace min_tso_segs() with tso

Netdev List
 help / color / mirror / Atom feed

* [PATCH v3 net-next 1/1] tcp: Replace min_tso_segs() with tso_segs() CC callback
@ 2026-06-30 12:01 chia-yu.chang
  2026-06-30 23:20 ` Alexei Starovoitov
  0 siblings, 1 reply; 5+ messages in thread
From: chia-yu.chang @ 2026-06-30 12:01 UTC (permalink / raw)
  To: jolsa, yonghong.song, song, linux-kselftest, memxor, shuah,
	martin.lau, ast, daniel, andrii, eddyz87, horms, dsahern, bpf,
	netdev, pabeni, jhs, kuba, stephen, davem, edumazet,
	andrew+netdev, donald.hunter, kuniyu, ij, ncardwell,
	koen.de_schepper, g.white, ingemar.s.johansson, mirja.kuehlewind,
	cheshire, rs.ietf, Jason_Livingood, vidhi_goel
  Cc: Chia-Yu Chang

From: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>

This patch replaces the existing min_tso_segs() CC callback with
tso_segs(), which lets a CC algorithm provide the explicit number of
TSO segments for each data burst and overrides tcp_tso_autosize().

This change has the following impacts on BPF struct_ops users:
- The callback is renamed from min_tso_segs to tso_segs
- The signature gains an extra u32 mss_now argument
- The return value semantics is changed from "floor value passed into
  tcp_tso_autosize()" to "final tso_segs value", bypassing autosizing

As a result, BPF programs must be updated, because returning a small
constant will now directly set the final tso_segs instead of its minimum.

Signed-off-by: Ilpo Järvinen <ij@kernel.org>
Signed-off-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
---
 include/net/tcp.h                                | 13 +++++++++++--
 net/ipv4/bpf_tcp_ca.c                            |  8 +++++---
 net/ipv4/tcp_bbr.c                               | 13 ++++++++++---
 net/ipv4/tcp_output.c                            | 13 +++++++------
 tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c |  8 ++++----
 5 files changed, 37 insertions(+), 18 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6d376ea4d1c0..7fb42a0ce7da 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -824,6 +824,9 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu);
 unsigned int tcp_current_mss(struct sock *sk);
 u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when);
 
+u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
+		     int min_tso_segs);
+
 /* Bound MSS / TSO packet size with the half of the window */
 static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
 {
@@ -1361,8 +1364,14 @@ struct tcp_congestion_ops {
 	/* hook for packet ack accounting (optional) */
 	void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
 
-	/* override sysctl_tcp_min_tso_segs (optional) */
-	u32 (*min_tso_segs)(struct sock *sk);
+	/*
+	 * Override tcp_tso_autosize (optional)
+	 *
+	 * If provided, this callback returns the final TSO segment number
+	 * and will bypass tcp_tso_autosize() entirely. The implementation
+	 * must derive an appropriate value and ensure the result is valid.
+	 */
+	u32 (*tso_segs)(struct sock *sk, u32 mss_now);
 
 	/* new value of cwnd after loss (required) */
 	u32  (*undo_cwnd)(struct sock *sk);
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 791e15063237..27c4cdfd80a8 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -284,9 +284,11 @@ static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *samp
 {
 }
 
-static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk)
+static u32 bpf_tcp_ca_tso_segs(struct sock *sk, u32 mss_now)
 {
-	return 0;
+	if (unlikely(!mss_now))
+		return U32_MAX;
+	return tcp_tso_autosize(sk, mss_now, 0);
 }
 
 static void bpf_tcp_ca_cong_control(struct sock *sk, u32 ack, int flag,
@@ -320,7 +322,7 @@ static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = {
 	.cwnd_event_tx_start = bpf_tcp_ca_cwnd_event_tx_start,
 	.in_ack_event = bpf_tcp_ca_in_ack_event,
 	.pkts_acked = bpf_tcp_ca_pkts_acked,
-	.min_tso_segs = bpf_tcp_ca_min_tso_segs,
+	.tso_segs = bpf_tcp_ca_tso_segs,
 	.cong_control = bpf_tcp_ca_cong_control,
 	.undo_cwnd = bpf_tcp_ca_undo_cwnd,
 	.sndbuf_expand = bpf_tcp_ca_sndbuf_expand,
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 82378a2bfd1e..b63e77b14c65 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -297,11 +297,18 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
 }
 
 /* override sysctl_tcp_min_tso_segs */
-__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk)
+static u32 bbr_min_tso_segs(struct sock *sk)
 {
 	return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2;
 }
 
+__bpf_kfunc static u32 bbr_tso_segs(struct sock *sk, u32 mss_now)
+{
+	if (unlikely(!mss_now))
+		return U32_MAX;
+	return tcp_tso_autosize(sk, mss_now, bbr_min_tso_segs(sk));
+}
+
 static u32 bbr_tso_segs_goal(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -1151,7 +1158,7 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
 	.undo_cwnd	= bbr_undo_cwnd,
 	.cwnd_event_tx_start	= bbr_cwnd_event_tx_start,
 	.ssthresh	= bbr_ssthresh,
-	.min_tso_segs	= bbr_min_tso_segs,
+	.tso_segs	= bbr_tso_segs,
 	.get_info	= bbr_get_info,
 	.set_state	= bbr_set_state,
 };
@@ -1163,7 +1170,7 @@ BTF_ID_FLAGS(func, bbr_sndbuf_expand)
 BTF_ID_FLAGS(func, bbr_undo_cwnd)
 BTF_ID_FLAGS(func, bbr_cwnd_event_tx_start)
 BTF_ID_FLAGS(func, bbr_ssthresh)
-BTF_ID_FLAGS(func, bbr_min_tso_segs)
+BTF_ID_FLAGS(func, bbr_tso_segs)
 BTF_ID_FLAGS(func, bbr_set_state)
 BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids)
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 00ec4b5900f2..f3fc4b64e61d 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2253,8 +2253,8 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
  * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance
  * is below 1500 bytes after 6 * ~500 usec = 3ms.
  */
-static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
-			    int min_tso_segs)
+u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
+		     int min_tso_segs)
 {
 	unsigned long bytes;
 	u32 r;
@@ -2269,6 +2269,7 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
 
 	return max_t(u32, bytes / mss_now, min_tso_segs);
 }
+EXPORT_SYMBOL(tcp_tso_autosize);
 
 /* Return the number of segments we want in the skb we are transmitting.
  * See if congestion control module wants to decide; otherwise, autosize.
@@ -2278,11 +2279,11 @@ static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
 	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
 	u32 min_tso, tso_segs;
 
-	min_tso = ca_ops->min_tso_segs ?
-			ca_ops->min_tso_segs(sk) :
-			READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
+	min_tso = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
 
-	tso_segs = tcp_tso_autosize(sk, mss_now, min_tso);
+	tso_segs = ca_ops->tso_segs ?
+			ca_ops->tso_segs(sk, mss_now) :
+			tcp_tso_autosize(sk, mss_now, min_tso);
 	return min_t(u32, tso_segs, sk->sk_gso_max_segs);
 }
 
diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c b/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c
index 0a3e9d35bf6f..58262e490336 100644
--- a/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c
+++ b/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c
@@ -10,7 +10,7 @@ extern u32 bbr_sndbuf_expand(struct sock *sk) __ksym;
 extern u32 bbr_undo_cwnd(struct sock *sk) __ksym;
 extern void bbr_cwnd_event_tx_start(struct sock *sk) __ksym;
 extern u32 bbr_ssthresh(struct sock *sk) __ksym;
-extern u32 bbr_min_tso_segs(struct sock *sk) __ksym;
+extern u32 bbr_tso_segs(struct sock *sk, u32 mss_now) __ksym;
 extern void bbr_set_state(struct sock *sk, u8 new_state) __ksym;
 
 extern void dctcp_init(struct sock *sk) __ksym;
@@ -90,9 +90,9 @@ u32 BPF_PROG(ssthresh, struct sock *sk)
 }
 
 SEC("struct_ops")
-u32 BPF_PROG(min_tso_segs, struct sock *sk)
+u32 BPF_PROG(tso_segs, struct sock *sk, u32 mss_now)
 {
-	return bbr_min_tso_segs(sk);
+	return bbr_tso_segs(sk, mss_now);
 }
 
 SEC("struct_ops")
@@ -120,7 +120,7 @@ struct tcp_congestion_ops tcp_ca_kfunc = {
 	.cwnd_event	= (void *)cwnd_event,
 	.cwnd_event_tx_start = (void *)cwnd_event_tx_start,
 	.ssthresh	= (void *)ssthresh,
-	.min_tso_segs	= (void *)min_tso_segs,
+	.tso_segs	= (void *)tso_segs,
 	.set_state	= (void *)set_state,
 	.pkts_acked     = (void *)pkts_acked,
 	.name		= "tcp_ca_kfunc",
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH v3 net-next 1/1] tcp: Replace min_tso_segs() with tso_segs() CC callback
  2026-06-30 12:01 [PATCH v3 net-next 1/1] tcp: Replace min_tso_segs() with tso_segs() CC callback chia-yu.chang
@ 2026-06-30 23:20 ` Alexei Starovoitov
  2026-07-01  5:46   ` Chia-Yu Chang (Nokia)
  0 siblings, 1 reply; 5+ messages in thread
From: Alexei Starovoitov @ 2026-06-30 23:20 UTC (permalink / raw)
  To: chia-yu.chang, jolsa, yonghong.song, song, linux-kselftest,
	memxor, shuah, martin.lau, ast, daniel, andrii, eddyz87, horms,
	dsahern, bpf, netdev, pabeni, jhs, kuba, stephen, davem, edumazet,
	andrew+netdev, donald.hunter, kuniyu, ij, ncardwell,
	koen.de_schepper, g.white, ingemar.s.johansson, mirja.kuehlewind,
	cheshire, rs.ietf, Jason_Livingood, vidhi_goel

On Tue Jun 30, 2026 at 5:01 AM PDT, chia-yu.chang wrote:
> From: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
>
> This patch replaces existing min_tso_segs() with tso_segs() CC callbak
> for CC algorithm to provides explicit tso segment number of each data
> burst and overrides tcp_tso_autosize().
>
> This change provides below impacts on BPF struct_ops users:
> - The callback is renamed from min_tso_segs to tso_segs
> - The signature gains an extra u32 mss_now argument
> - The return value semantics is changed from "floor value passed into
>   tcp_tso_autosize()" to "final tso_segs value", bypassing autosizing
>
> As a result, BPF programs shall be updated, beccause retuning a small
> constans will now directly limit tso_segs instead of the minimum.
>
> Signed-off-by: Ilpo Järvinen <ij@kernel.org>
> Signed-off-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
> ---
>  include/net/tcp.h                                | 13 +++++++++++--
>  net/ipv4/bpf_tcp_ca.c                            |  8 +++++---
>  net/ipv4/tcp_bbr.c                               | 13 ++++++++++---
>  net/ipv4/tcp_output.c                            | 13 +++++++------
>  tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c |  8 ++++----
>  5 files changed, 37 insertions(+), 18 deletions(-)
>
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 6d376ea4d1c0..7fb42a0ce7da 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -824,6 +824,9 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu);
>  unsigned int tcp_current_mss(struct sock *sk);
>  u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when);
>  
> +u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
> +		     int min_tso_segs);
> +
>  /* Bound MSS / TSO packet size with the half of the window */
>  static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
>  {
> @@ -1361,8 +1364,14 @@ struct tcp_congestion_ops {
>  	/* hook for packet ack accounting (optional) */
>  	void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
>  
> -	/* override sysctl_tcp_min_tso_segs (optional) */
> -	u32 (*min_tso_segs)(struct sock *sk);
> +	/*
> +	 * Override tcp_tso_autosize (optional)
> +	 *
> +	 * If provided, this callback returns the final TSO segment number
> +	 * and will bypass tcp_tso_autosize() entirely. The implementation
> +	 * must derive an appropriate value and ensure the result is valid.
> +	 */
> +	u32 (*tso_segs)(struct sock *sk, u32 mss_now);

I don't like this interface change.
It introduces churn for no good reason.
At least I don't see why you cannot live with the existing api.


^ permalink raw reply	[flat|nested] 5+ messages in thread

* RE: [PATCH v3 net-next 1/1] tcp: Replace min_tso_segs() with tso_segs() CC callback
  2026-06-30 23:20 ` Alexei Starovoitov
@ 2026-07-01  5:46   ` Chia-Yu Chang (Nokia)
  2026-07-01  5:56     ` Alexei Starovoitov
  0 siblings, 1 reply; 5+ messages in thread
From: Chia-Yu Chang (Nokia) @ 2026-07-01  5:46 UTC (permalink / raw)
  To: Alexei Starovoitov, jolsa@kernel.org, yonghong.song@linux.dev,
	song@kernel.org, linux-kselftest@vger.kernel.org,
	memxor@gmail.com, shuah@kernel.org, martin.lau@linux.dev,
	ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org,
	eddyz87@gmail.com, horms@kernel.org, dsahern@kernel.org,
	bpf@vger.kernel.org, netdev@vger.kernel.org, pabeni@redhat.com,
	jhs@mojatatu.com, kuba@kernel.org, stephen@networkplumber.org,
	davem@davemloft.net, edumazet@google.com, andrew+netdev@lunn.ch,
	donald.hunter@gmail.com, kuniyu@google.com, ij@kernel.org,
	ncardwell@google.com, Koen De Schepper (Nokia),
	g.white@cablelabs.com, ingemar.s.johansson@ericsson.com,
	mirja.kuehlewind@ericsson.com, cheshire@apple.com, rs.ietf@gmx.at,
	Jason_Livingood@comcast.com, vidhi_goel@apple.com

> -----Original Message-----
> From: Alexei Starovoitov <alexei.starovoitov@gmail.com> 
> Sent: Wednesday, July 1, 2026 1:21 AM
> To: Chia-Yu Chang (Nokia) <chia-yu.chang@nokia-bell-labs.com>; jolsa@kernel.org; yonghong.song@linux.dev; song@kernel.org; linux-kselftest@vger.kernel.org; memxor@gmail.com; shuah@kernel.org; martin.lau@linux.dev; ast@kernel.org; daniel@iogearbox.net; andrii@kernel.org; eddyz87@gmail.com; horms@kernel.org; dsahern@kernel.org; bpf@vger.kernel.org; netdev@vger.kernel.org; pabeni@redhat.com; jhs@mojatatu.com; kuba@kernel.org; stephen@networkplumber.org; davem@davemloft.net; edumazet@google.com; andrew+netdev@lunn.ch; donald.hunter@gmail.com; kuniyu@google.com; ij@kernel.org; ncardwell@google.com; Koen De Schepper (Nokia) <koen.de_schepper@nokia-bell-labs.com>; g.white@cablelabs.com; ingemar.s.johansson@ericsson.com; mirja.kuehlewind@ericsson.com; cheshire@apple.com; rs.ietf@gmx.at; Jason_Livingood@comcast.com; vidhi_goel@apple.com
> Subject: Re: [PATCH v3 net-next 1/1] tcp: Replace min_tso_segs() with tso_segs() CC callback
> 
> [You don't often get email from alexei.starovoitov@gmail.com. Learn why this is important at https://aka.ms/LearnAboutSenderIdentification ]
> 
> CAUTION: This is an external email. Please be very careful when clicking links or opening attachments. See the URL nok.it/ext for additional information.
> 
> 
> 
> On Tue Jun 30, 2026 at 5:01 AM PDT, chia-yu.chang wrote:
> > From: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
> >
> > This patch replaces existing min_tso_segs() with tso_segs() CC callbak 
> > for CC algorithm to provides explicit tso segment number of each data 
> > burst and overrides tcp_tso_autosize().
> >
> > This change provides below impacts on BPF struct_ops users:
> > - The callback is renamed from min_tso_segs to tso_segs
> > - The signature gains an extra u32 mss_now argument
> > - The return value semantics is changed from "floor value passed into
> >   tcp_tso_autosize()" to "final tso_segs value", bypassing autosizing
> >
> > As a result, BPF programs shall be updated, beccause retuning a small 
> > constans will now directly limit tso_segs instead of the minimum.
> >
> > Signed-off-by: Ilpo Järvinen <ij@kernel.org>
> > Signed-off-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
> > ---
> >  include/net/tcp.h                                | 13 +++++++++++--
> >  net/ipv4/bpf_tcp_ca.c                            |  8 +++++---
> >  net/ipv4/tcp_bbr.c                               | 13 ++++++++++---
> >  net/ipv4/tcp_output.c                            | 13 +++++++------
> >  tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c |  8 ++++----
> >  5 files changed, 37 insertions(+), 18 deletions(-)
> >
> > diff --git a/include/net/tcp.h b/include/net/tcp.h index 
> > 6d376ea4d1c0..7fb42a0ce7da 100644
> > --- a/include/net/tcp.h
> > +++ b/include/net/tcp.h
> > @@ -824,6 +824,9 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 
> > pmtu);  unsigned int tcp_current_mss(struct sock *sk);
> >  u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 
> > when);
> >
> > +u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
> > +                  int min_tso_segs);
> > +
> >  /* Bound MSS / TSO packet size with the half of the window */  static 
> > inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)  { 
> > @@ -1361,8 +1364,14 @@ struct tcp_congestion_ops {
> >       /* hook for packet ack accounting (optional) */
> >       void (*pkts_acked)(struct sock *sk, const struct ack_sample 
> > *sample);
> >
> > -     /* override sysctl_tcp_min_tso_segs (optional) */
> > -     u32 (*min_tso_segs)(struct sock *sk);
> > +     /*
> > +      * Override tcp_tso_autosize (optional)
> > +      *
> > +      * If provided, this callback returns the final TSO segment number
> > +      * and will bypass tcp_tso_autosize() entirely. The implementation
> > +      * must derive an appropriate value and ensure the result is valid.
> > +      */
> > +     u32 (*tso_segs)(struct sock *sk, u32 mss_now);
> 
> I don't like this interface change.
> It introduces churn for no good reason.
> At least I don't see why you cannot live with the existing api.

Hi Alexei,

This patch was part of TCP Prague preparation series: https://lore.kernel.org/all/20260611161504.228319-4-chia-yu.chang@nokia-bell-labs.com/
Our original patch added an extra tso_segs callback; after discussion, it was recommended to replace the existing min_tso_segs instead.

This is needed because TCP Prague would set the exact TSO size rather than using autosizing from TCP.
The TCP Prague itself is planned to be submitted after all preparation commits are accepted.
You can find its current status: https://github.com/L4STeam/linux-net-next/blob/upstream_l4steam/net/ipv4/tcp_prague.c

Thanks.
Chia-Yu

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v3 net-next 1/1] tcp: Replace min_tso_segs() with tso_segs() CC callback
  2026-07-01  5:46   ` Chia-Yu Chang (Nokia)
@ 2026-07-01  5:56     ` Alexei Starovoitov
  2026-07-01 16:16       ` Koen De Schepper (Nokia)
  0 siblings, 1 reply; 5+ messages in thread
From: Alexei Starovoitov @ 2026-07-01  5:56 UTC (permalink / raw)
  To: Chia-Yu Chang (Nokia), jolsa@kernel.org, yonghong.song@linux.dev,
	song@kernel.org, linux-kselftest@vger.kernel.org,
	memxor@gmail.com, shuah@kernel.org, martin.lau@linux.dev,
	ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org,
	eddyz87@gmail.com, horms@kernel.org, dsahern@kernel.org,
	bpf@vger.kernel.org, netdev@vger.kernel.org, pabeni@redhat.com,
	jhs@mojatatu.com, kuba@kernel.org, stephen@networkplumber.org,
	davem@davemloft.net, edumazet@google.com, andrew+netdev@lunn.ch,
	donald.hunter@gmail.com, kuniyu@google.com, ij@kernel.org,
	ncardwell@google.com, Koen De Schepper (Nokia),
	g.white@cablelabs.com, ingemar.s.johansson@ericsson.com,
	mirja.kuehlewind@ericsson.com, cheshire@apple.com, rs.ietf@gmx.at,
	Jason_Livingood@comcast.com, vidhi_goel@apple.com

On Tue Jun 30, 2026 at 10:46 PM PDT, Chia-Yu Chang (Nokia) wrote:
>> > -     /* override sysctl_tcp_min_tso_segs (optional) */
>> > -     u32 (*min_tso_segs)(struct sock *sk);
>> > +     /*
>> > +      * Override tcp_tso_autosize (optional)
>> > +      *
>> > +      * If provided, this callback returns the final TSO segment number
>> > +      * and will bypass tcp_tso_autosize() entirely. The implementation
>> > +      * must derive an appropriate value and ensure the result is valid.
>> > +      */
>> > +     u32 (*tso_segs)(struct sock *sk, u32 mss_now);
>> 
>> I don't like this interface change.
>> It introduces churn for no good reason.
>> At least I don't see why you cannot live with the existing api.
>
> Hi Alexei,
>
> This patch was part of TCP Prague preparation series: https://lore.kernel.org/all/20260611161504.228319-4-chia-yu.chang@nokia-bell-labs.com/
> Our original patch is to add an extra tso_segs, and after discussion it's recommended to replace exisiting min_tso_segs.
>
> This is needed because TCP Prague would set the exact TSO size rather than using autosizing from TCP.
> The TCP Prague itself is planned to be submitted after all preparation commits are accepted.
> You can find its current stauts: https://github.com/L4STeam/linux-net-next/blob/upstream_l4steam/net/ipv4/tcp_prague.c

You have to explain why Prague CC cannot rely on autosizing.
To me it sounds like a red flag. autosizing logic was there for a decade, if not more.
And now you're arguing that your CC logic is special and it deserves new API
and breakage of existing convention.
Maybe you should step back and reconsider.


^ permalink raw reply	[flat|nested] 5+ messages in thread

* RE: [PATCH v3 net-next 1/1] tcp: Replace min_tso_segs() with tso_segs() CC callback
  2026-07-01  5:56     ` Alexei Starovoitov
@ 2026-07-01 16:16       ` Koen De Schepper (Nokia)
  0 siblings, 0 replies; 5+ messages in thread
From: Koen De Schepper (Nokia) @ 2026-07-01 16:16 UTC (permalink / raw)
  To: Alexei Starovoitov, Chia-Yu Chang (Nokia), jolsa@kernel.org,
	yonghong.song@linux.dev, song@kernel.org,
	linux-kselftest@vger.kernel.org, memxor@gmail.com,
	shuah@kernel.org, martin.lau@linux.dev, ast@kernel.org,
	daniel@iogearbox.net, andrii@kernel.org, eddyz87@gmail.com,
	horms@kernel.org, dsahern@kernel.org, bpf@vger.kernel.org,
	netdev@vger.kernel.org, pabeni@redhat.com, jhs@mojatatu.com,
	kuba@kernel.org, stephen@networkplumber.org, davem@davemloft.net,
	edumazet@google.com, andrew+netdev@lunn.ch,
	donald.hunter@gmail.com, kuniyu@google.com, ij@kernel.org,
	ncardwell@google.com, g.white@cablelabs.com,
	ingemar.s.johansson@ericsson.com, mirja.kuehlewind@ericsson.com,
	cheshire@apple.com, rs.ietf@gmx.at, Jason_Livingood@comcast.com,
	vidhi_goel@apple.com

Hi Alexei,

>>You have to explain why Prague CC cannot rely on autosizing.
>>To me it sounds like a red flag. autosizing logic was there for a decade, if not more.
>>And now you're arguing that your CC logic is special and it deserves new API and breakage of existing convention.
>>Maybe you should step back and reconsider.

The objective behind Prague auto-sizing is that the TSO burst should not create a queuing latency bigger than 250us, on a bottleneck link rate that is the pacing rate. The rationale is that the L4S AQMs are allowed to have a very small threshold (often somewhere between 0.5ms to 1ms). When the pacing rate is 96Mbps we start allowing TSO size of 2, etc... Below 96Mbps the TSO size is 1, and below 48Mbps the potential serialization time of a single packet is bigger than 250us. You will also see in other patches that we limit the serialization time of one MTU to 10ms when the rate goes lower than 1Mbps by limiting the max MTU (going down to a minimum pacing rate of 100kbps as the minimum sending rate). 
L4S AQMs are designed with these parameters in mind. As AQMs usually work packet per packet and evaluate waiting time in the queue, definitely the minimum inter-packet departure time needs to be bigger than the serialization time - 0.5ms or packets will get marked without necessarily using the full link capacity.

The current autosizing algo has as objective to scale the TSO size depending on both pacing rate and RTT. It targets a 1ms queue delay burst when the RTT is bigger than 3ms, but when the RTT is smaller it further increases the TSO-size. The rationale behind this is that a bigger TSO-size results in a bigger chance of loss, and that loss is easier to retransmit if the RTT is smaller. 

So, the current algorithm does not line up with the Prague expectations. It allows bursts of 1ms instead of 250us and it allows even bigger bursts if the minimum RTT is below 3ms (for instance 27ms burst at 10Mbps and 1ms RTT).

I see the following options for accommodating the Prague requirement of a 250us burst instead of 1ms, without increasing the TSO size when the RTT is lower than 3ms:
- Let Prague do the calculation and set the desired TSO size
       - previously done with an extra hook to override the autosizing, but review feedback already asked us to avoid the extra hook
       - change the existing hook and add a fixed or max TSO size (current patch did the fixed TSO overruling, maybe a max could be more general as it would always be lower than the current calculated autosize)
- Add a cc settable parameter max-TSO-burst-time to let the current autosize do the extra calculation and take this extra check into account
- we are open to any other suggestions...

Koen.




-----Original Message-----
From: Alexei Starovoitov <alexei.starovoitov@gmail.com> 
Sent: Wednesday, July 1, 2026 7:56 AM
To: Chia-Yu Chang (Nokia) <chia-yu.chang@nokia-bell-labs.com>; jolsa@kernel.org; yonghong.song@linux.dev; song@kernel.org; linux-kselftest@vger.kernel.org; memxor@gmail.com; shuah@kernel.org; martin.lau@linux.dev; ast@kernel.org; daniel@iogearbox.net; andrii@kernel.org; eddyz87@gmail.com; horms@kernel.org; dsahern@kernel.org; bpf@vger.kernel.org; netdev@vger.kernel.org; pabeni@redhat.com; jhs@mojatatu.com; kuba@kernel.org; stephen@networkplumber.org; davem@davemloft.net; edumazet@google.com; andrew+netdev@lunn.ch; donald.hunter@gmail.com; kuniyu@google.com; ij@kernel.org; ncardwell@google.com; Koen De Schepper (Nokia) <koen.de_schepper@nokia-bell-labs.com>; g.white@cablelabs.com; ingemar.s.johansson@ericsson.com; mirja.kuehlewind@ericsson.com; cheshire@apple.com; rs.ietf@gmx.at; Jason_Livingood@comcast.com; vidhi_goel@apple.com
Subject: Re: [PATCH v3 net-next 1/1] tcp: Replace min_tso_segs() with tso_segs() CC callback


CAUTION: This is an external email. Please be very careful when clicking links or opening attachments. See the URL nok.it/ext for additional information.



On Tue Jun 30, 2026 at 10:46 PM PDT, Chia-Yu Chang (Nokia) wrote:
>> > -     /* override sysctl_tcp_min_tso_segs (optional) */
>> > -     u32 (*min_tso_segs)(struct sock *sk);
>> > +     /*
>> > +      * Override tcp_tso_autosize (optional)
>> > +      *
>> > +      * If provided, this callback returns the final TSO segment number
>> > +      * and will bypass tcp_tso_autosize() entirely. The implementation
>> > +      * must derive an appropriate value and ensure the result is valid.
>> > +      */
>> > +     u32 (*tso_segs)(struct sock *sk, u32 mss_now);
>>
>> I don't like this interface change.
>> It introduces churn for no good reason.
>> At least I don't see why you cannot live with the existing api.
>
> Hi Alexei,
>
> This patch was part of TCP Prague preparation series: 
> https://eur03.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore
> .kernel.org%2Fall%2F20260611161504.228319-4-chia-yu.chang%40nokia-bell
> -labs.com%2F&data=05%7C02%7Ckoen.de_schepper%40nokia-bell-labs.com%7C5
> a4af4e411b5485a2abb08ded735768f%7C5d4717519675428d917b70f44f9630b0%7C0
> %7C0%7C639184821777198173%7CUnknown%7CTWFpbGZsb3d8eyJFbXB0eU1hcGkiOnRy
> dWUsIlYiOiIwLjAuMDAwMCIsIlAiOiJXaW4zMiIsIkFOIjoiTWFpbCIsIldUIjoyfQ%3D%
> 3D%7C0%7C%7C%7C&sdata=yBTs89k8g2CSHfi0preWag0X%2BR%2F5vJAmRw6ijEK4E1k%
> 3D&reserved=0 Our original patch is to add an extra tso_segs, and 
> after discussion it's recommended to replace exisiting min_tso_segs.
>
> This is needed because TCP Prague would set the exact TSO size rather than using autosizing from TCP.
> The TCP Prague itself is planned to be submitted after all preparation commits are accepted.
> You can find its current stauts: 
> https://eur03.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgith
> ub.com%2FL4STeam%2Flinux-net-next%2Fblob%2Fupstream_l4steam%2Fnet%2Fip
> v4%2Ftcp_prague.c&data=05%7C02%7Ckoen.de_schepper%40nokia-bell-labs.co
> m%7C5a4af4e411b5485a2abb08ded735768f%7C5d4717519675428d917b70f44f9630b
> 0%7C0%7C0%7C639184821777240736%7CUnknown%7CTWFpbGZsb3d8eyJFbXB0eU1hcGk
> iOnRydWUsIlYiOiIwLjAuMDAwMCIsIlAiOiJXaW4zMiIsIkFOIjoiTWFpbCIsIldUIjoyf
> Q%3D%3D%7C0%7C%7C%7C&sdata=1tkkWXoSYqOyvr3FkmKpybDt2M6P7BOY240SEvkjfqY
> %3D&reserved=0

You have to explain why Prague CC cannot rely on autosizing.
To me it sounds like a red flag. autosizing logic was there for a decade, if not more.
And now you're arguing that your CC logic is special and it deserves new API and breakage of existing convention.
Maybe you should step back and reconsider.


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2026-07-01 16:16 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-06-30 12:01 [PATCH v3 net-next 1/1] tcp: Replace min_tso_segs() with tso_segs() CC callback chia-yu.chang
2026-06-30 23:20 ` Alexei Starovoitov
2026-07-01  5:46   ` Chia-Yu Chang (Nokia)
2026-07-01  5:56     ` Alexei Starovoitov
2026-07-01 16:16       ` Koen De Schepper (Nokia)

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox