* [PATCH v2 net-next 1/1] tcp: Replace min_tso_segs() with tso_segs() CC callback for TCP Prague
@ 2026-06-14 7:17 chia-yu.chang
2026-06-15 7:19 ` sashiko-bot
2026-06-16 1:51 ` Jakub Kicinski
0 siblings, 2 replies; 4+ messages in thread
From: chia-yu.chang @ 2026-06-14 7:17 UTC (permalink / raw)
To: jolsa, yonghong.song, song, linux-kselftest, memxor, shuah,
martin.lau, ast, daniel, andrii, eddyz87, horms, dsahern, bpf,
netdev, pabeni, jhs, kuba, stephen, davem, edumazet,
andrew+netdev, donald.hunter, kuniyu, ij, ncardwell,
koen.de_schepper, g.white, ingemar.s.johansson, mirja.kuehlewind,
cheshire, rs.ietf, Jason_Livingood, vidhi_goel
Cc: Chia-Yu Chang
From: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
This patch replaces the existing min_tso_segs() with a tso_segs() CC callback,
which allows a CC algorithm to provide the explicit TSO segment count of each
data burst and to override tcp_tso_autosize().
No functional change.
Signed-off-by: Ilpo Järvinen <ij@kernel.org>
Signed-off-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
---
include/net/tcp.h | 7 +++++--
net/ipv4/bpf_tcp_ca.c | 4 ++--
net/ipv4/tcp_bbr.c | 14 +++++++++++---
net/ipv4/tcp_output.c | 13 +++++++------
tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c | 8 ++++----
5 files changed, 29 insertions(+), 17 deletions(-)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index f063eccbbba3..34d370ea9ceb 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -824,6 +824,9 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu);
unsigned int tcp_current_mss(struct sock *sk);
u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when);
+u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
+ int min_tso_segs);
+
/* Bound MSS / TSO packet size with the half of the window */
static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
{
@@ -1361,8 +1364,8 @@ struct tcp_congestion_ops {
/* hook for packet ack accounting (optional) */
void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
- /* override sysctl_tcp_min_tso_segs (optional) */
- u32 (*min_tso_segs)(struct sock *sk);
+ /* override tcp_tso_autosize (optional)*/
+ u32 (*tso_segs)(struct sock *sk, u32 mss_now);
/* new value of cwnd after loss (required) */
u32 (*undo_cwnd)(struct sock *sk);
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 791e15063237..ed4fea98dfde 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -284,7 +284,7 @@ static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *samp
{
}
-static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk)
+static u32 bpf_tcp_ca_tso_segs(struct sock *sk, u32 mss_now)
{
return 0;
}
@@ -320,7 +320,7 @@ static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = {
.cwnd_event_tx_start = bpf_tcp_ca_cwnd_event_tx_start,
.in_ack_event = bpf_tcp_ca_in_ack_event,
.pkts_acked = bpf_tcp_ca_pkts_acked,
- .min_tso_segs = bpf_tcp_ca_min_tso_segs,
+ .tso_segs = bpf_tcp_ca_tso_segs,
.cong_control = bpf_tcp_ca_cong_control,
.undo_cwnd = bpf_tcp_ca_undo_cwnd,
.sndbuf_expand = bpf_tcp_ca_sndbuf_expand,
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 82378a2bfd1e..15536564246c 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -297,11 +297,19 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
}
/* override sysctl_tcp_min_tso_segs */
-__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk)
+static u32 bbr_min_tso_segs(struct sock *sk)
{
return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2;
}
+__bpf_kfunc static u32 bbr_tso_segs(struct sock *sk, u32 mss_now)
+{
+ u32 min_tso;
+
+ min_tso = bbr_min_tso_segs(sk);
+ return tcp_tso_autosize(sk, mss_now, min_tso);
+}
+
static u32 bbr_tso_segs_goal(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -1151,7 +1159,7 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
.undo_cwnd = bbr_undo_cwnd,
.cwnd_event_tx_start = bbr_cwnd_event_tx_start,
.ssthresh = bbr_ssthresh,
- .min_tso_segs = bbr_min_tso_segs,
+ .tso_segs = bbr_tso_segs,
.get_info = bbr_get_info,
.set_state = bbr_set_state,
};
@@ -1163,7 +1171,7 @@ BTF_ID_FLAGS(func, bbr_sndbuf_expand)
BTF_ID_FLAGS(func, bbr_undo_cwnd)
BTF_ID_FLAGS(func, bbr_cwnd_event_tx_start)
BTF_ID_FLAGS(func, bbr_ssthresh)
-BTF_ID_FLAGS(func, bbr_min_tso_segs)
+BTF_ID_FLAGS(func, bbr_tso_segs)
BTF_ID_FLAGS(func, bbr_set_state)
BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 26dd751ec72a..a09b00c5483e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2253,8 +2253,8 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
* for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance
* is below 1500 bytes after 6 * ~500 usec = 3ms.
*/
-static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
- int min_tso_segs)
+u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
+ int min_tso_segs)
{
unsigned long bytes;
u32 r;
@@ -2269,6 +2269,7 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
return max_t(u32, bytes / mss_now, min_tso_segs);
}
+EXPORT_SYMBOL(tcp_tso_autosize);
/* Return the number of segments we want in the skb we are transmitting.
* See if congestion control module wants to decide; otherwise, autosize.
@@ -2278,11 +2279,11 @@ static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
u32 min_tso, tso_segs;
- min_tso = ca_ops->min_tso_segs ?
- ca_ops->min_tso_segs(sk) :
- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
+ min_tso = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
- tso_segs = tcp_tso_autosize(sk, mss_now, min_tso);
+ tso_segs = ca_ops->tso_segs ?
+ ca_ops->tso_segs(sk, mss_now) :
+ tcp_tso_autosize(sk, mss_now, min_tso);
return min_t(u32, tso_segs, sk->sk_gso_max_segs);
}
diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c b/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c
index 0a3e9d35bf6f..58262e490336 100644
--- a/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c
+++ b/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c
@@ -10,7 +10,7 @@ extern u32 bbr_sndbuf_expand(struct sock *sk) __ksym;
extern u32 bbr_undo_cwnd(struct sock *sk) __ksym;
extern void bbr_cwnd_event_tx_start(struct sock *sk) __ksym;
extern u32 bbr_ssthresh(struct sock *sk) __ksym;
-extern u32 bbr_min_tso_segs(struct sock *sk) __ksym;
+extern u32 bbr_tso_segs(struct sock *sk, u32 mss_now) __ksym;
extern void bbr_set_state(struct sock *sk, u8 new_state) __ksym;
extern void dctcp_init(struct sock *sk) __ksym;
@@ -90,9 +90,9 @@ u32 BPF_PROG(ssthresh, struct sock *sk)
}
SEC("struct_ops")
-u32 BPF_PROG(min_tso_segs, struct sock *sk)
+u32 BPF_PROG(tso_segs, struct sock *sk, u32 mss_now)
{
- return bbr_min_tso_segs(sk);
+ return bbr_tso_segs(sk, mss_now);
}
SEC("struct_ops")
@@ -120,7 +120,7 @@ struct tcp_congestion_ops tcp_ca_kfunc = {
.cwnd_event = (void *)cwnd_event,
.cwnd_event_tx_start = (void *)cwnd_event_tx_start,
.ssthresh = (void *)ssthresh,
- .min_tso_segs = (void *)min_tso_segs,
+ .tso_segs = (void *)tso_segs,
.set_state = (void *)set_state,
.pkts_acked = (void *)pkts_acked,
.name = "tcp_ca_kfunc",
--
2.34.1
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [PATCH v2 net-next 1/1] tcp: Replace min_tso_segs() with tso_segs() CC callback for TCP Prague
2026-06-14 7:17 [PATCH v2 net-next 1/1] tcp: Replace min_tso_segs() with tso_segs() CC callback for TCP Prague chia-yu.chang
@ 2026-06-15 7:19 ` sashiko-bot
2026-06-16 1:51 ` Jakub Kicinski
1 sibling, 0 replies; 4+ messages in thread
From: sashiko-bot @ 2026-06-15 7:19 UTC (permalink / raw)
To: chia-yu.chang; +Cc: bpf
Thank you for your contribution! Sashiko AI review found 2 potential issue(s) to consider:
- [High] BPF programs can cause a kernel panic by passing `mss_now == 0` to the `bbr_tso_segs` kfunc, leading to a division by zero.
- [Medium] The `tcp_tso_autosize` function is not registered as a BPF kfunc, preventing BPF congestion control algorithms from reusing default TSO autosizing logic.
--
commit 39a9bf39a3860ba9dd1b2bed8669b97491df87c6
Author: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
tcp: Replace min_tso_segs() with tso_segs() CC callback for TCP Prague
This patch replaces existing min_tso_segs() with tso_segs() CC callbak
for CC algorithm to provides explicit tso segment number of each data
burst and overrides tcp_tso_autosize().
> diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
> --- a/net/ipv4/tcp_bbr.c
> +++ b/net/ipv4/tcp_bbr.c
[ ... ]
> @@ -297,11 +297,19 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
> }
>
> /* override sysctl_tcp_min_tso_segs */
> -__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk)
> +static u32 bbr_min_tso_segs(struct sock *sk)
> {
> return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2;
> }
>
> +__bpf_kfunc static u32 bbr_tso_segs(struct sock *sk, u32 mss_now)
> +{
> + u32 min_tso;
> +
> + min_tso = bbr_min_tso_segs(sk);
> + return tcp_tso_autosize(sk, mss_now, min_tso);
> +}
[Severity: High]
Can a buggy BPF program trigger a divide-by-zero panic here?
Since bbr_tso_segs() is exposed as a BPF kfunc, the verifier does not
restrict the value ranges of scalar arguments like mss_now. If mss_now is 0,
it is passed directly to tcp_tso_autosize(), where it is used as a divisor:
tcp_tso_autosize()
...
return max_t(u32, bytes / mss_now, min_tso_segs);
Does this need an explicit check for mss_now == 0 to prevent a local denial
of service?
> +
> static u32 bbr_tso_segs_goal(struct sock *sk)
> {
> struct tcp_sock *tp = tcp_sk(sk);
[ ... ]
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
[ ... ]
> @@ -2269,6 +2269,7 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
>
> return max_t(u32, bytes / mss_now, min_tso_segs);
> }
> +EXPORT_SYMBOL(tcp_tso_autosize);
[Severity: Medium]
Is there a way for out-of-tree BPF congestion controls to reuse the default
autosizing logic?
With the removal of min_tso_segs, BPF congestion control algorithms that
want a custom minimum TSO size must implement tso_segs and handle the
autosizing themselves. However, while tcp_tso_autosize() is exported to C
modules here, it does not appear to be registered as a BPF kfunc (e.g., via
BTF_KFUNCS_START).
Does this force BPF developers to either lose custom minimum TSO sizing
capabilities or duplicate the tcp_tso_autosize() logic within their BPF
programs?
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260614071756.239686-1-chia-yu.chang@nokia-bell-labs.com?part=1
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH v2 net-next 1/1] tcp: Replace min_tso_segs() with tso_segs() CC callback for TCP Prague
2026-06-14 7:17 [PATCH v2 net-next 1/1] tcp: Replace min_tso_segs() with tso_segs() CC callback for TCP Prague chia-yu.chang
2026-06-15 7:19 ` sashiko-bot
@ 2026-06-16 1:51 ` Jakub Kicinski
2026-06-16 2:17 ` Jakub Kicinski
1 sibling, 1 reply; 4+ messages in thread
From: Jakub Kicinski @ 2026-06-16 1:51 UTC (permalink / raw)
To: edumazet, ncardwell
Cc: chia-yu.chang, jolsa, yonghong.song, song, linux-kselftest,
memxor, shuah, martin.lau, ast, daniel, andrii, eddyz87, horms,
dsahern, bpf, netdev, pabeni, jhs, stephen, davem, andrew+netdev,
donald.hunter, kuniyu, ij, koen.de_schepper, g.white,
ingemar.s.johansson, mirja.kuehlewind, cheshire, rs.ietf,
Jason_Livingood, vidhi_goel
On Sun, 14 Jun 2026 09:17:56 +0200 chia-yu.chang@nokia-bell-labs.com
wrote:
> This patch replaces existing min_tso_segs() with tso_segs() CC callbak
> for CC algorithm to provides explicit tso segment number of each data
> burst and overrides tcp_tso_autosize().
>
> No functional change.
Eric, Neal, looks good?
The min rtt thing in tcp_tso_autosize() helps a bit but if the sender
gets congested for a longer stretch min_rtts on new connections are
high and we're back to sending small TSO, keeping the sender overloaded.
Which is to say - I _hope_ this also solves some of Meta's problems :)
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH v2 net-next 1/1] tcp: Replace min_tso_segs() with tso_segs() CC callback for TCP Prague
2026-06-16 1:51 ` Jakub Kicinski
@ 2026-06-16 2:17 ` Jakub Kicinski
0 siblings, 0 replies; 4+ messages in thread
From: Jakub Kicinski @ 2026-06-16 2:17 UTC (permalink / raw)
To: edumazet, ncardwell
Cc: chia-yu.chang, jolsa, yonghong.song, song, linux-kselftest,
memxor, shuah, martin.lau, ast, daniel, andrii, eddyz87, horms,
dsahern, bpf, netdev, pabeni, jhs, stephen, davem, andrew+netdev,
donald.hunter, kuniyu, ij, koen.de_schepper, g.white,
ingemar.s.johansson, mirja.kuehlewind, cheshire, rs.ietf,
Jason_Livingood, vidhi_goel
On Mon, 15 Jun 2026 18:51:02 -0700 Jakub Kicinski wrote:
> On Sun, 14 Jun 2026 09:17:56 +0200 chia-yu.chang@nokia-bell-labs.com
> wrote:
> > This patch replaces existing min_tso_segs() with tso_segs() CC callbak
> > for CC algorithm to provides explicit tso segment number of each data
> > burst and overrides tcp_tso_autosize().
> >
> > No functional change.
>
> Eric, Neal, looks good?
>
> The min rtt thing in tcp_tso_autosize() helps a bit but if the sender
> gets congested for a longer stretch min_rtts on new connections are
> high and we're back to sending small TSO, keeping the sender overloaded.
> Which is to say - I _hope_ this also solves some of Meta's problems :)
Ugh, I didn't see the Sashiko report, it's only CCed to the author and
bpf@, not to netdev :/
The zero-check sounds legit. Let's revisit this after the merge window.
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2026-06-16 2:17 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2026-06-14 7:17 [PATCH v2 net-next 1/1] tcp: Replace min_tso_segs() with tso_segs() CC callback for TCP Prague chia-yu.chang
2026-06-15 7:19 ` sashiko-bot
2026-06-16 1:51 ` Jakub Kicinski
2026-06-16 2:17 ` Jakub Kicinski
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.