Netdev List
 help / color / mirror / Atom feed
From: Mariusz Klimek <maklimek97@gmail.com>
To: netdev@vger.kernel.org
Cc: andrew+netdev@lunn.ch, davem@davemloft.net, edumazet@google.com,
	kuba@kernel.org, pabeni@redhat.com, dsahern@kernel.org,
	idosch@nvidia.com, ncardwell@google.com, shuah@kernel.org,
	kuniyu@google.com, alice@isovalent.com,
	Mariusz Klimek <maklimek97@gmail.com>
Subject: [PATCH net-next 04/10] tcp: decouple TSO segment length from MSS
Date: Mon,  8 Jun 2026 15:07:49 +0200	[thread overview]
Message-ID: <20260608130755.5626-5-maklimek97@gmail.com> (raw)
In-Reply-To: <20260608130755.5626-1-maklimek97@gmail.com>

Decouple the TSO segment length from the MSS so that the MSS can exceed
65535 even though the TSO segment length is limited to 16 bits. Ideally,
TSO/GSO would support jumbogram segments, which would make this decoupling
unnecessary, but that would require a much larger change.

Add a new helper, tcp_tso_seglen(), that returns the segment length for a
given MSS, capped at 65535 - MAX_TCP_HEADER, and use it wherever the MSS is
treated as the segment length. The cap leaves enough room for the TCP/IPv6
headers, including TCP options and extension headers.

Change __tcp_retransmit_skb(), tcp_retransmit_skb() and
tcp_tso_should_defer() to take a maximum length in bytes (max_len) instead
of a segment count, since the count was only used to compute the maximum
length of a TSO packet.

Signed-off-by: Mariusz Klimek <maklimek97@gmail.com>
---
 include/net/tcp.h     | 12 ++++++--
 net/ipv4/tcp.c        | 10 ++++---
 net/ipv4/tcp_output.c | 67 +++++++++++++++++++++++++------------------
 net/ipv4/tcp_timer.c  |  4 +--
 4 files changed, 57 insertions(+), 36 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index f063eccbbba3..b3a50f6d3381 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -747,8 +747,8 @@ void tcp_skb_entail(struct sock *sk, struct sk_buff *skb);
 void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb);
 void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
 			       int nonagle);
-int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs);
-int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs);
+int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int max_len);
+int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int max_len);
 void tcp_retransmit_timer(struct sock *sk);
 void tcp_xmit_retransmit_queue(struct sock *);
 void tcp_simple_retransmit(struct sock *);
@@ -1219,6 +1219,14 @@ static inline void tcp_skb_pcount_add(struct sk_buff *skb, int segs)
 	TCP_SKB_CB(skb)->tcp_gso_segs += segs;
 }
 
+/* Return the segment length we want for the given MSS. We cap the segment
+ * length to prevent the segments from becoming jumbograms.
+ */
+static inline u16 tcp_tso_seglen(u32 mss_now)
+{
+	return min_t(u32, GSO_BY_FRAGS - MAX_TCP_HEADER, mss_now);
+}
+
 /* This is valid iff skb is in write queue and tcp_skb_pcount() > 1. */
 static inline int tcp_skb_mss(const struct sk_buff *skb)
 {
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 890182a151e1..5ac2befbdc58 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -960,6 +960,7 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	u32 new_size_goal, size_goal;
+	u16 gso_size;
 
 	if (!large_allowed)
 		return mss_now;
@@ -968,12 +969,13 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
 	new_size_goal = tcp_bound_to_half_wnd(tp, sk->sk_gso_max_size);
 
 	/* We try hard to avoid divides here */
-	size_goal = tp->gso_segs * mss_now;
+	gso_size = tcp_tso_seglen(mss_now);
+	size_goal = tp->gso_segs * gso_size;
 	if (unlikely(new_size_goal < size_goal ||
-		     new_size_goal >= size_goal + mss_now)) {
-		tp->gso_segs = min_t(u16, new_size_goal / mss_now,
+		     new_size_goal >= size_goal + gso_size)) {
+		tp->gso_segs = min_t(u16, new_size_goal / gso_size,
 				     sk->sk_gso_max_segs);
-		size_goal = tp->gso_segs * mss_now;
+		size_goal = tp->gso_segs * gso_size;
 	}
 
 	return max(size_goal, mss_now);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index d3b8e61d3c5e..a66a3622006d 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1748,7 +1748,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
 /* Initialize TSO segments for a packet. */
 static int tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
 {
-	int tso_segs;
+	int tso_size, tso_segs;
 
 	if (skb->len <= mss_now) {
 		/* Avoid the costly divide in the normal
@@ -1758,8 +1758,9 @@ static int tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
 		tcp_skb_pcount_set(skb, 1);
 		return 1;
 	}
-	TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
-	tso_segs = DIV_ROUND_UP(skb->len, mss_now);
+	tso_size = tcp_tso_seglen(mss_now);
+	TCP_SKB_CB(skb)->tcp_gso_size = tso_size;
+	tso_segs = DIV_ROUND_UP(skb->len, tso_size);
 	tcp_skb_pcount_set(skb, tso_segs);
 	return tso_segs;
 }
@@ -2207,12 +2208,14 @@ static bool tcp_minshall_check(const struct tcp_sock *tp)
  * if ((skb->len % mss) != 0)
  *        tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
  * But we can avoid doing the divide again given we already have
- *  skb_pcount = skb->len / mss_now
+ *  skb_pcount = skb->len / tcp_skb_seglen(skb)
  */
 static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
 				const struct sk_buff *skb)
 {
-	if (skb->len < tcp_skb_pcount(skb) * mss_now)
+	u32 seglen = tcp_skb_pcount(skb) == 1 ? mss_now : tcp_skb_mss(skb);
+
+	if (skb->len < tcp_skb_pcount(skb) * seglen)
 		tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
 }
 
@@ -2245,7 +2248,7 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
  * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance
  * is below 1500 bytes after 6 * ~500 usec = 3ms.
  */
-static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
+static u32 tcp_tso_autosize(const struct sock *sk, unsigned int tso_size,
 			    int min_tso_segs)
 {
 	unsigned long bytes;
@@ -2259,7 +2262,7 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
 
 	bytes = min_t(unsigned long, bytes, sk->sk_gso_max_size);
 
-	return max_t(u32, bytes / mss_now, min_tso_segs);
+	return max_t(u32, bytes / tso_size, min_tso_segs);
 }
 
 /* Return the number of segments we want in the skb we are transmitting.
@@ -2274,14 +2277,14 @@ static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
 			ca_ops->min_tso_segs(sk) :
 			READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
 
-	tso_segs = tcp_tso_autosize(sk, mss_now, min_tso);
+	tso_segs = tcp_tso_autosize(sk, tcp_tso_seglen(mss_now), min_tso);
 	return min_t(u32, tso_segs, sk->sk_gso_max_segs);
 }
 
 /* Returns the portion of skb which can be sent right away */
 static unsigned int tcp_mss_split_point(const struct sock *sk,
 					const struct sk_buff *skb,
-					unsigned int mss_now,
+					unsigned int seglen,
 					unsigned int max_segs,
 					int nonagle)
 {
@@ -2289,7 +2292,7 @@ static unsigned int tcp_mss_split_point(const struct sock *sk,
 	u32 partial, needed, window, max_len;
 
 	window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
-	max_len = mss_now * max_segs;
+	max_len = seglen * max_segs;
 
 	if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
 		return max_len;
@@ -2299,7 +2302,7 @@ static unsigned int tcp_mss_split_point(const struct sock *sk,
 	if (max_len <= needed)
 		return max_len;
 
-	partial = needed % mss_now;
+	partial = needed % seglen;
 	/* If last segment is not a full MSS, check if Nagle rules allow us
 	 * to include this last segment in this skb.
 	 * Otherwise, we'll split the skb at last MSS boundary
@@ -2337,7 +2340,8 @@ static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
 {
 	int tso_segs = tcp_skb_pcount(skb);
 
-	if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now))
+	if (!tso_segs ||
+	    (tso_segs > 1 && tcp_skb_mss(skb) != tcp_tso_seglen(mss_now)))
 		return tcp_set_skb_tso_segs(skb, mss_now);
 
 	return tso_segs;
@@ -2444,7 +2448,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
 				 bool *is_cwnd_limited,
 				 bool *is_rwnd_limited,
-				 u32 max_segs)
+				 u32 max_len)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	u32 send_win, cong_win, limit, in_flight, threshold;
@@ -2479,7 +2483,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
 	limit = min(send_win, cong_win);
 
 	/* If a full-sized TSO skb can be sent, do it. */
-	if (limit >= max_segs * tp->mss_cache)
+	if (limit >= max_len)
 		goto send_now;
 
 	/* Middle in queue won't get any more data, full sendable already? */
@@ -2956,10 +2960,10 @@ static void tcp_grow_skb(struct sock *sk, struct sk_buff *skb, int amount)
 static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			   int push_one, gfp_t gfp)
 {
+	u32 cwnd_quota, max_segs, max_len;
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 	unsigned int tso_segs, sent_pkts;
-	u32 cwnd_quota, max_segs;
 	int result;
 	bool is_cwnd_limited = false, is_rwnd_limited = false;
 
@@ -3007,7 +3011,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 				break;
 		}
 		cwnd_quota = min(cwnd_quota, max_segs);
-		missing_bytes = cwnd_quota * mss_now - skb->len;
+
+		max_len = max(mss_now, cwnd_quota * tcp_tso_seglen(mss_now));
+		missing_bytes = max_len - skb->len;
 		if (missing_bytes > 0)
 			tcp_grow_skb(sk, skb, missing_bytes);
 
@@ -3026,13 +3032,13 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		} else {
 			if (!push_one &&
 			    tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
-						 &is_rwnd_limited, max_segs))
+						 &is_rwnd_limited, max_len))
 				break;
 		}
 
 		limit = mss_now;
 		if (tso_segs > 1 && !tcp_urg_mode(tp))
-			limit = tcp_mss_split_point(sk, skb, mss_now,
+			limit = tcp_mss_split_point(sk, skb, tcp_tso_seglen(mss_now),
 						    cwnd_quota,
 						    nonagle);
 
@@ -3193,10 +3199,10 @@ void tcp_send_loss_probe(struct sock *sk)
 	if (WARN_ON(!pcount))
 		goto rearm_timer;
 
-	if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
+	if ((pcount > 1) && (skb->len > (pcount - 1) * tcp_tso_seglen(mss))) {
 		if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
-					  (pcount - 1) * mss, mss,
-					  GFP_ATOMIC)))
+					  (pcount - 1) *  tcp_tso_seglen(mss),
+					  mss, GFP_ATOMIC)))
 			goto rearm_timer;
 		skb = skb_rb_next(skb);
 	}
@@ -3204,7 +3210,7 @@ void tcp_send_loss_probe(struct sock *sk)
 	if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
 		goto rearm_timer;
 
-	if (__tcp_retransmit_skb(sk, skb, 1))
+	if (__tcp_retransmit_skb(sk, skb, mss))
 		goto rearm_timer;
 
 	tp->tlp_retrans = 1;
@@ -3539,13 +3545,14 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
  * state updates are done by the caller.  Returns non-zero if an
  * error occurred which prevented the send.
  */
-int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
+int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int max_len)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned int cur_mss;
 	int diff, len, err;
 	int avail_wnd;
+	int segs;
 
 	/* Inconclusive MTU probe */
 	if (icsk->icsk_mtup.probe_size)
@@ -3595,7 +3602,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 		avail_wnd = cur_mss;
 	}
 
-	len = cur_mss * segs;
+	len = max_len;
 	if (len > avail_wnd) {
 		len = rounddown(avail_wnd, cur_mss);
 		if (!len)
@@ -3684,10 +3691,10 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 	return err;
 }
 
-int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
+int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int max_len)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	int err = __tcp_retransmit_skb(sk, skb, segs);
+	int err = __tcp_retransmit_skb(sk, skb, max_len);
 
 	if (err == 0) {
 #if FASTRETRANS_DEBUG > 0
@@ -3721,6 +3728,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 	bool rearm_timer = false;
 	u32 max_segs;
+	u32 mss_now;
 	int mib_idx;
 
 	if (!tp->packets_out)
@@ -3728,9 +3736,11 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 
 	rtx_head = tcp_rtx_queue_head(sk);
 	skb = tp->retransmit_skb_hint ?: rtx_head;
-	max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
+	mss_now = tcp_current_mss(sk);
+	max_segs = tcp_tso_segs(sk, mss_now);
 	skb_rbtree_walk_from(skb) {
 		__u8 sacked;
+		u32 max_len;
 		int segs;
 
 		if (tcp_pacing_check(sk))
@@ -3748,6 +3758,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 		 * we need to make sure not sending too bigs TSO packets
 		 */
 		segs = min_t(int, segs, max_segs);
+		max_len = max_t(u32, mss_now, segs * tcp_tso_seglen(mss_now));
 
 		if (tp->retrans_out >= tp->lost_out) {
 			break;
@@ -3769,7 +3780,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 		if (tcp_small_queue_check(sk, skb, 1))
 			break;
 
-		if (tcp_retransmit_skb(sk, skb, segs))
+		if (tcp_retransmit_skb(sk, skb, max_len))
 			break;
 
 		NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb));
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 322db13333c7..2e5331441469 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -595,7 +595,7 @@ void tcp_retransmit_timer(struct sock *sk)
 			goto out;
 		}
 		tcp_enter_loss(sk);
-		tcp_retransmit_skb(sk, skb, 1);
+		tcp_retransmit_skb(sk, skb, tcp_current_mss(sk));
 		__sk_dst_reset(sk);
 		goto out_reset_timer;
 	}
@@ -628,7 +628,7 @@ void tcp_retransmit_timer(struct sock *sk)
 	tcp_enter_loss(sk);
 
 	tcp_update_rto_stats(sk);
-	if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) {
+	if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), tcp_current_mss(sk)) > 0) {
 		/* Retransmission failed because of local congestion,
 		 * Let senders fight for local resources conservatively.
 		 */
-- 
2.47.3


  parent reply	other threads:[~2026-06-08 13:09 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <20260608130755.5626-1-maklimek97@gmail.com>
2026-06-08 13:07 ` [PATCH net-next 01/10] ipv6: do not fragment packets into jumbograms Mariusz Klimek
2026-06-08 13:07 ` [PATCH net-next 02/10] ipv6: allow route exceptions with MTUs above 65535 Mariusz Klimek
2026-06-08 13:07 ` [PATCH net-next 03/10] ipv6: add jumbo payload option to non-gso jumbograms Mariusz Klimek
2026-06-08 13:07 ` Mariusz Klimek [this message]
2026-06-08 13:07 ` [PATCH net-next 05/10] tcp: split jumbograms with urgent pointer correctly Mariusz Klimek
2026-06-08 13:07 ` [PATCH net-next 06/10] tcp: set MSS correctly for PMTU above 65535 Mariusz Klimek
2026-06-08 13:07 ` [PATCH net-next 07/10] veth: raise the max MTU " Mariusz Klimek
2026-06-08 13:07 ` [PATCH net-next 08/10] selftests/net: test sending TCP jumbograms over veth Mariusz Klimek
2026-06-08 13:07 ` [PATCH net-next 09/10] selftests/net: add test cases with MTU above 65535 to big_tcp.sh Mariusz Klimek
2026-06-08 13:07 ` [PATCH net-next 10/10] selftests/net: add jumbogram test case to msg_zerocopy.sh Mariusz Klimek

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260608130755.5626-5-maklimek97@gmail.com \
    --to=maklimek97@gmail.com \
    --cc=alice@isovalent.com \
    --cc=andrew+netdev@lunn.ch \
    --cc=davem@davemloft.net \
    --cc=dsahern@kernel.org \
    --cc=edumazet@google.com \
    --cc=idosch@nvidia.com \
    --cc=kuba@kernel.org \
    --cc=kuniyu@google.com \
    --cc=ncardwell@google.com \
    --cc=netdev@vger.kernel.org \
    --cc=pabeni@redhat.com \
    --cc=shuah@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox