[PATCH v10 mptcp-next 0/3] mptcp: address stall under memory pressure

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v10 mptcp-next 0/3] mptcp: address stall under memory pressure
@ 2026-05-29 10:44 Paolo Abeni
  2026-05-29 10:44 ` [PATCH v10 mptcp-next 1/3] Squash-to: "mptcp: implemented OoO queue pruning" Paolo Abeni
                   ` (3 more replies)
  0 siblings, 4 replies; 11+ messages in thread
From: Paolo Abeni @ 2026-05-29 10:44 UTC (permalink / raw)
  To: mptcp

This brings (hopefully) the final bits required to address the
data transfer stall reported by Geliang and Gang 

Patch 1 is a mostly cosmetic change over a previously accepted patch,
while patch 2 and 3 improves mptcp retranmission to make them reliable:
pruning can require some of them.

The main change over the previous iteration is in patch 3, addressing
sashiko feedback over the previous iteration (missing timer reschedule
in some corner cases).

Paolo Abeni (3):
  Squash-to: "mptcp: implemented OoO queue pruning"
  mptcp: move the retrans loop to a separate helper
  mptcp: let the retrans scheduler do its job.

 net/mptcp/protocol.c | 177 ++++++++++++++++++++++++++++++-------------
 1 file changed, 123 insertions(+), 54 deletions(-)

-- 
2.54.0

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH v10 mptcp-next 1/3] Squash-to: "mptcp: implemented OoO queue pruning"
  2026-05-29 10:44 [PATCH v10 mptcp-next 0/3] mptcp: address stall under memory pressure Paolo Abeni
@ 2026-05-29 10:44 ` Paolo Abeni
  2026-05-30  8:52   ` Matthieu Baerts
  2026-05-29 10:44 ` [PATCH v10 mptcp-next 2/3] mptcp: move the retrans loop to a separate helper Paolo Abeni
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 11+ messages in thread
From: Paolo Abeni @ 2026-05-29 10:44 UTC (permalink / raw)
  To: mptcp

As discussed in the original patch, use the proper helper to
get the currently used memory, and let mptcp_prune_ofo_queue()
return true when skb can be accepted, to avoid code churn in
the caller.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/mptcp/protocol.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 29cb10c02ed8..03234e8cc26c 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -376,7 +376,7 @@ static void mptcp_init_skb(struct sock *ssk, struct sk_buff *skb, int offset,
 /* "Inspired" from the TCP version; main difference: stop as soon as the MPTCP
  * socket is under memory limit.
  */
-static void mptcp_prune_ofo_queue(struct sock *sk, u64 seq)
+static bool mptcp_prune_ofo_queue(struct sock *sk, u64 seq)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	struct rb_node *node, *prev;
@@ -384,7 +384,7 @@ static void mptcp_prune_ofo_queue(struct sock *sk, u64 seq)
 	u64 mem;
 
 	if (RB_EMPTY_ROOT(&msk->out_of_order_queue))
-		return;
+		goto out;
 
 	node = &msk->ooo_last_skb->rbnode;
 
@@ -401,7 +401,7 @@ static void mptcp_prune_ofo_queue(struct sock *sk, u64 seq)
 		mptcp_drop(sk, skb);
 		msk->ooo_last_skb = rb_to_skb(prev);
 
-		mem = (unsigned int)atomic_read(&sk->sk_rmem_alloc);
+		mem = (unsigned int)sk_rmem_alloc_get(sk);
 		if (mem < sk->sk_rcvbuf)
 			break;
 
@@ -410,6 +410,10 @@ static void mptcp_prune_ofo_queue(struct sock *sk, u64 seq)
 
 	if (pruned)
 		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFO_PRUNED);
+
+out:
+	mem = (unsigned int)sk_rmem_alloc_get(sk);
+	return mem < sk->sk_rcvbuf;
 }
 
 static bool __mptcp_move_skb(struct sock *sk, struct sk_buff *skb)
@@ -424,13 +428,11 @@ static bool __mptcp_move_skb(struct sock *sk, struct sk_buff *skb)
 	 * will break.
 	 */
 	if (unlikely(sk_rmem_alloc_get(sk) > READ_ONCE(sk->sk_rcvbuf)) &&
-	    !__mptcp_check_fallback(msk)) {
-		mptcp_prune_ofo_queue(sk, MPTCP_SKB_CB(skb)->map_seq);
-		if (sk_rmem_alloc_get(sk) > READ_ONCE(sk->sk_rcvbuf)) {
-			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED);
-			mptcp_drop(sk, skb);
-			return false;
-		}
+	    !__mptcp_check_fallback(msk) &&
+	    !mptcp_prune_ofo_queue(sk, MPTCP_SKB_CB(skb)->map_seq)) {
+		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED);
+		mptcp_drop(sk, skb);
+		return false;
 	}
 
 	if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) {
-- 
2.54.0


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH v10 mptcp-next 2/3] mptcp: move the retrans loop to a separate helper
  2026-05-29 10:44 [PATCH v10 mptcp-next 0/3] mptcp: address stall under memory pressure Paolo Abeni
  2026-05-29 10:44 ` [PATCH v10 mptcp-next 1/3] Squash-to: "mptcp: implemented OoO queue pruning" Paolo Abeni
@ 2026-05-29 10:44 ` Paolo Abeni
  2026-05-29 10:44 ` [PATCH v10 mptcp-next 3/3] mptcp: let the retrans scheduler do its job Paolo Abeni
  2026-05-29 12:00 ` [PATCH v10 mptcp-next 0/3] mptcp: address stall under memory pressure MPTCP CI
  3 siblings, 0 replies; 11+ messages in thread
From: Paolo Abeni @ 2026-05-29 10:44 UTC (permalink / raw)
  To: mptcp

This is a cleanup in order to make the next patch simpler.
No functional change intended.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/mptcp/protocol.c | 74 +++++++++++++++++++++++++-------------------
 1 file changed, 43 insertions(+), 31 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 03234e8cc26c..51756800edc2 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -2830,41 +2830,14 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk)
 	sk_error_report(sk);
 }
 
-static void __mptcp_retrans(struct sock *sk)
+/* Retransmit the specified data fragment on all the selected subflows. */
+static int __mptcp_push_retrans(struct sock *sk, struct mptcp_data_frag *dfrag)
 {
 	struct mptcp_sendmsg_info info = { .data_lock_held = true, };
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	struct mptcp_subflow_context *subflow;
-	struct mptcp_data_frag *dfrag;
 	struct sock *ssk;
-	int ret, err;
-	u16 len = 0;
-
-	mptcp_clean_una_wakeup(sk);
-
-	/* first check ssk: need to kick "stale" logic */
-	err = mptcp_sched_get_retrans(msk);
-	dfrag = mptcp_rtx_head(sk);
-	if (!dfrag) {
-		if (mptcp_data_fin_enabled(msk)) {
-			struct inet_connection_sock *icsk = inet_csk(sk);
-
-			WRITE_ONCE(icsk->icsk_retransmits,
-				   icsk->icsk_retransmits + 1);
-			mptcp_set_datafin_timeout(sk);
-			mptcp_send_ack(msk);
-
-			goto reset_timer;
-		}
-
-		if (!mptcp_send_head(sk))
-			goto clear_scheduled;
-
-		goto reset_timer;
-	}
-
-	if (err)
-		goto reset_timer;
+	int ret, len = 0;
 
 	mptcp_for_each_subflow(msk, subflow) {
 		if (READ_ONCE(subflow->scheduled)) {
@@ -2892,7 +2865,7 @@ static void __mptcp_retrans(struct sock *sk)
 			    !msk->allow_subflows) {
 				spin_unlock_bh(&msk->fallback_lock);
 				release_sock(ssk);
-				goto clear_scheduled;
+				return -1;
 			}
 
 			while (info.sent < info.limit) {
@@ -2915,6 +2888,45 @@ static void __mptcp_retrans(struct sock *sk)
 			release_sock(ssk);
 		}
 	}
+	return len;
+}
+
+static void __mptcp_retrans(struct sock *sk)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+	struct mptcp_subflow_context *subflow;
+	struct mptcp_data_frag *dfrag;
+	int err, len;
+
+	mptcp_clean_una_wakeup(sk);
+
+	/* first check ssk: need to kick "stale" logic */
+	err = mptcp_sched_get_retrans(msk);
+	dfrag = mptcp_rtx_head(sk);
+	if (!dfrag) {
+		if (mptcp_data_fin_enabled(msk)) {
+			struct inet_connection_sock *icsk = inet_csk(sk);
+
+			WRITE_ONCE(icsk->icsk_retransmits,
+				   icsk->icsk_retransmits + 1);
+			mptcp_set_datafin_timeout(sk);
+			mptcp_send_ack(msk);
+
+			goto reset_timer;
+		}
+
+		if (!mptcp_send_head(sk))
+			goto clear_scheduled;
+
+		goto reset_timer;
+	}
+
+	if (err)
+		goto reset_timer;
+
+	len = __mptcp_push_retrans(sk, dfrag);
+	if (len < 0)
+		goto clear_scheduled;
 
 	msk->bytes_retrans += len;
 	dfrag->already_sent = max(dfrag->already_sent, len);
-- 
2.54.0


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH v10 mptcp-next 3/3] mptcp: let the retrans scheduler do its job.
  2026-05-29 10:44 [PATCH v10 mptcp-next 0/3] mptcp: address stall under memory pressure Paolo Abeni
  2026-05-29 10:44 ` [PATCH v10 mptcp-next 1/3] Squash-to: "mptcp: implemented OoO queue pruning" Paolo Abeni
  2026-05-29 10:44 ` [PATCH v10 mptcp-next 2/3] mptcp: move the retrans loop to a separate helper Paolo Abeni
@ 2026-05-29 10:44 ` Paolo Abeni
  2026-05-29 13:27   ` Geliang Tang
  2026-05-29 12:00 ` [PATCH v10 mptcp-next 0/3] mptcp: address stall under memory pressure MPTCP CI
  3 siblings, 1 reply; 11+ messages in thread
From: Paolo Abeni @ 2026-05-29 10:44 UTC (permalink / raw)
  To: mptcp

Currently the MPTCP core enforces that when MPTCP-level retrans timer
fires, at most a single dfrag is retransmitted. If some corner-cases it
may be necessary retransmit multiple dfrags, and the MPTCP socket will
need to wait multiple retrans timeout to accomplish that.

Remove the mentioned constraint, allowing to transmit multiple dfrags per
retrans period, as long as the scheduler keeps selecting subflows for
retransmissions and pending data is available in the rtx queue.
The default scheduler will transmit a dfrag per available subflow.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
v9 -> v10:
  - simpler handling for data-fin rtx

v7 -> v8
  - fix corner-case retrans_seq update

v4 -> v5:
  - fixed already_sent update

v3 -> v4:
  - avoid quadratic behavior, fix retrans_seq update
  - fix rtx timer re-schedule miss

v2 -> v3:
  - fix infinite loop issue (should address tls tests failures)

v1 -> v2:
  - fix retrans sequence update (sashiko)

Note:
 - sashiko may see issues when dfrag = mptcp_rtx_head(sk) != NULL and
   dfrag->already_sent == 0. That condition should not possible: if
   mptcp_rtx_head() is not NULL there should be some data already
   sent.
 - sashiko may see missing data-fin rtx when the initial `dfrag` is
   not NULL. data-fin RTX is NOT needed in such scenario.
---
 net/mptcp/protocol.c | 119 +++++++++++++++++++++++++++++++------------
 1 file changed, 87 insertions(+), 32 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 51756800edc2..7fe618e22d1b 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1201,13 +1201,6 @@ static void __mptcp_clean_una_wakeup(struct sock *sk)
 	mptcp_write_space(sk);
 }
 
-static void mptcp_clean_una_wakeup(struct sock *sk)
-{
-	mptcp_data_lock(sk);
-	__mptcp_clean_una_wakeup(sk);
-	mptcp_data_unlock(sk);
-}
-
 static void mptcp_enter_memory_pressure(struct sock *sk)
 {
 	struct mptcp_subflow_context *subflow;
@@ -2830,8 +2823,12 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk)
 	sk_error_report(sk);
 }
 
-/* Retransmit the specified data fragment on all the selected subflows. */
-static int __mptcp_push_retrans(struct sock *sk, struct mptcp_data_frag *dfrag)
+/*
+ * Retransmit the specified data fragment on all the selected subflows,
+ * starting from the specified sequence
+ */
+static int __mptcp_push_retrans(struct sock *sk, struct mptcp_data_frag *dfrag,
+				u64 sent_seq)
 {
 	struct mptcp_sendmsg_info info = { .data_lock_held = true, };
 	struct mptcp_sock *msk = mptcp_sk(sk);
@@ -2841,6 +2838,7 @@ static int __mptcp_push_retrans(struct sock *sk, struct mptcp_data_frag *dfrag)
 
 	mptcp_for_each_subflow(msk, subflow) {
 		if (READ_ONCE(subflow->scheduled)) {
+			u16 offset = sent_seq - dfrag->data_seq;
 			u16 copied = 0;
 
 			mptcp_subflow_set_scheduled(subflow, false);
@@ -2850,9 +2848,12 @@ static int __mptcp_push_retrans(struct sock *sk, struct mptcp_data_frag *dfrag)
 			lock_sock(ssk);
 
 			/* limit retransmission to the bytes already sent on some subflows */
-			info.sent = 0;
+			info.sent = offset;
 			info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len :
 								    dfrag->already_sent;
+			DEBUG_NET_WARN_ON_ONCE(!before64(sent_seq,
+							 dfrag->data_seq +
+							 info.limit));
 
 			/*
 			 * make the whole retrans decision, xmit, disallow
@@ -2896,14 +2897,85 @@ static void __mptcp_retrans(struct sock *sk)
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	struct mptcp_subflow_context *subflow;
 	struct mptcp_data_frag *dfrag;
+	bool need_retrans;
+	u64 retrans_seq;
 	int err, len;
 
-	mptcp_clean_una_wakeup(sk);
-
-	/* first check ssk: need to kick "stale" logic */
-	err = mptcp_sched_get_retrans(msk);
+	mptcp_data_lock(sk);
+	__mptcp_clean_una_wakeup(sk);
+	retrans_seq = msk->snd_una;
 	dfrag = mptcp_rtx_head(sk);
-	if (!dfrag) {
+	need_retrans = !!dfrag;
+	mptcp_data_unlock(sk);
+	if (!dfrag)
+		goto check_data_fin;
+
+	for (;;) {
+		bool already_retrans;
+		u64 sent_seq;
+
+		/* The default scheduler will kick "stale" logic, that in
+		 * turn can process incoming acks and clean the RTX queue;
+		 * ensure that the current dfrag will still be around
+		 * afterwards.
+		 */
+		get_page(dfrag->page);
+		err = mptcp_sched_get_retrans(msk);
+		if (err) {
+			put_page(dfrag->page);
+			break;
+		}
+
+		/* Incoming acks can have moved retrans sequence after
+		 * the current dfrag, if so try to start again from RTX head.
+		 */
+		mptcp_data_lock(sk);
+		already_retrans = !dfrag->already_sent ||
+				  !before64(msk->snd_una, dfrag->data_seq +
+					    dfrag->already_sent);
+		put_page(dfrag->page);
+		if (already_retrans) {
+			__mptcp_clean_una_wakeup(sk);
+			retrans_seq = msk->snd_una;
+			dfrag = mptcp_rtx_head(sk);
+			need_retrans = !!dfrag;
+		} else if (after64(msk->snd_una, retrans_seq)) {
+			retrans_seq = msk->snd_una;
+		}
+		mptcp_data_unlock(sk);
+		if (!dfrag)
+			break;
+
+		/* Can fail only in case of fallback. */
+		len = __mptcp_push_retrans(sk, dfrag, retrans_seq);
+		if (len < 0)
+			goto clear_scheduled;
+
+		retrans_seq += len;
+		msk->bytes_retrans += len;
+		dfrag->already_sent = max_t(u16, dfrag->already_sent,
+					    retrans_seq - dfrag->data_seq);
+
+		/* With csum enabled retransmission can send new data. */
+		sent_seq = dfrag->already_sent + dfrag->data_seq;
+		if (after64(sent_seq, msk->snd_nxt))
+			WRITE_ONCE(msk->snd_nxt, sent_seq);
+
+		/* Attempt the next fragment only if the current one is
+		 * completely retransmitted.
+		 */
+		if (before64(retrans_seq, dfrag->data_seq + dfrag->data_len))
+			break;
+
+		dfrag = list_is_last(&dfrag->list, &msk->rtx_queue) ?
+				NULL : list_next_entry(dfrag, list);
+		if (!dfrag || !dfrag->already_sent)
+			break;
+	}
+
+	/* Attempt data-fin retransmission only when the RTX queue is empty. */
+	if (!need_retrans) {
+check_data_fin:
 		if (mptcp_data_fin_enabled(msk)) {
 			struct inet_connection_sock *icsk = inet_csk(sk);
 
@@ -2911,30 +2983,13 @@ static void __mptcp_retrans(struct sock *sk)
 				   icsk->icsk_retransmits + 1);
 			mptcp_set_datafin_timeout(sk);
 			mptcp_send_ack(msk);
-
 			goto reset_timer;
 		}
 
 		if (!mptcp_send_head(sk))
 			goto clear_scheduled;
-
-		goto reset_timer;
 	}
 
-	if (err)
-		goto reset_timer;
-
-	len = __mptcp_push_retrans(sk, dfrag);
-	if (len < 0)
-		goto clear_scheduled;
-
-	msk->bytes_retrans += len;
-	dfrag->already_sent = max(dfrag->already_sent, len);
-
-	/* With csum enabled retransmission can send new data. */
-	if (after64(dfrag->already_sent + dfrag->data_seq, msk->snd_nxt))
-		WRITE_ONCE(msk->snd_nxt, dfrag->already_sent + dfrag->data_seq);
-
 reset_timer:
 	mptcp_check_and_set_pending(sk);
 
-- 
2.54.0


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH v10 mptcp-next 0/3] mptcp: address stall under memory pressure
  2026-05-29 10:44 [PATCH v10 mptcp-next 0/3] mptcp: address stall under memory pressure Paolo Abeni
                   ` (2 preceding siblings ...)
  2026-05-29 10:44 ` [PATCH v10 mptcp-next 3/3] mptcp: let the retrans scheduler do its job Paolo Abeni
@ 2026-05-29 12:00 ` MPTCP CI
  3 siblings, 0 replies; 11+ messages in thread
From: MPTCP CI @ 2026-05-29 12:00 UTC (permalink / raw)
  To: Paolo Abeni; +Cc: mptcp

Hi Paolo,

Thank you for your modifications, that's great!

Our CI did some validations and here is its report:

- KVM Validation: normal (except selftest_mptcp_join): Success! ✅
- KVM Validation: normal (only selftest_mptcp_join): Success! ✅
- KVM Validation: debug (except selftest_mptcp_join): Success! ✅
- KVM Validation: debug (only selftest_mptcp_join): Success! ✅
- KVM Validation: btf-normal (only bpftest_all): Success! ✅
- KVM Validation: btf-debug (only bpftest_all): Success! ✅
- Task: https://github.com/multipath-tcp/mptcp_net-next/actions/runs/26633750821

Initiator: Patchew Applier
Commits: https://github.com/multipath-tcp/mptcp_net-next/commits/8aeb83d616d6
Patchwork: https://patchwork.kernel.org/project/mptcp/list/?series=1102825


If there are some issues, you can reproduce them using the same environment as
the one used by the CI thanks to a docker image, e.g.:

    $ cd [kernel source code]
    $ docker run -v "${PWD}:${PWD}:rw" -w "${PWD}" --privileged --rm -it \
        --pull always mptcp/mptcp-upstream-virtme-docker:latest \
        auto-normal

For more details:

    https://github.com/multipath-tcp/mptcp-upstream-virtme-docker


Please note that despite all the efforts that have been already done to have a
stable tests suite when executed on a public CI like here, it is possible some
reported issues are not due to your modifications. Still, do not hesitate to
help us improve that ;-)

Cheers,
MPTCP GH Action bot
Bot operated by Matthieu Baerts (NGI0 Core)

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v10 mptcp-next 3/3] mptcp: let the retrans scheduler do its job.
  2026-05-29 10:44 ` [PATCH v10 mptcp-next 3/3] mptcp: let the retrans scheduler do its job Paolo Abeni
@ 2026-05-29 13:27   ` Geliang Tang
  2026-05-29 14:14     ` Paolo Abeni
  0 siblings, 1 reply; 11+ messages in thread
From: Geliang Tang @ 2026-05-29 13:27 UTC (permalink / raw)
  To: Paolo Abeni, mptcp

Hi Paolo,

On Fri, 2026-05-29 at 12:44 +0200, Paolo Abeni wrote:
> Currently the MPTCP core enforces that when MPTCP-level retrans timer
> fires, at most a single dfrag is retransmitted. If some corner-cases
> it
> may be necessary retransmit multiple dfrags, and the MPTCP socket
> will
> need to wait multiple retrans timeout to accomplish that.
> 
> Remove the mentioned constraint, allowing to transmit multiple dfrags
> per
> retrans period, as long as the scheduler keeps selecting subflows for
> retransmissions and pending data is available in the rtx queue.
> The default scheduler will transmit a dfrag per available subflow.
> 
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
> v9 -> v10:
>   - simpler handling for data-fin rtx
> 
> v7 -> v8
>   - fix corner-case retrans_seq update
> 
> v4 -> v5:
>   - fixed already_sent update
> 
> v3 -> v4:
>   - avoid quadratic behavior, fix retrans_seq update
>   - fix rtx timer re-schedule miss
> 
> v2 -> v3:
>   - fix infinite loop issue (should address tls tests failures)
> 
> v1 -> v2:
>   - fix retrans sequence update (sashiko)
> 
> Note:
>  - sashiko may see issues when dfrag = mptcp_rtx_head(sk) != NULL and
>    dfrag->already_sent == 0. That condition should not possible: if
>    mptcp_rtx_head() is not NULL there should be some data already
>    sent.
>  - sashiko may see missing data-fin rtx when the initial `dfrag` is
>    not NULL. data-fin RTX is NOT needed in such scenario.
> ---
>  net/mptcp/protocol.c | 119 +++++++++++++++++++++++++++++++----------
> --
>  1 file changed, 87 insertions(+), 32 deletions(-)
> 
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 51756800edc2..7fe618e22d1b 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -1201,13 +1201,6 @@ static void __mptcp_clean_una_wakeup(struct
> sock *sk)
>  	mptcp_write_space(sk);
>  }
>  
> -static void mptcp_clean_una_wakeup(struct sock *sk)
> -{
> -	mptcp_data_lock(sk);
> -	__mptcp_clean_una_wakeup(sk);
> -	mptcp_data_unlock(sk);
> -}
> -
>  static void mptcp_enter_memory_pressure(struct sock *sk)
>  {
>  	struct mptcp_subflow_context *subflow;
> @@ -2830,8 +2823,12 @@ static void mptcp_check_fastclose(struct
> mptcp_sock *msk)
>  	sk_error_report(sk);
>  }
>  
> -/* Retransmit the specified data fragment on all the selected
> subflows. */
> -static int __mptcp_push_retrans(struct sock *sk, struct
> mptcp_data_frag *dfrag)
> +/*
> + * Retransmit the specified data fragment on all the selected
> subflows,
> + * starting from the specified sequence
> + */
> +static int __mptcp_push_retrans(struct sock *sk, struct
> mptcp_data_frag *dfrag,
> +				u64 sent_seq)
>  {
>  	struct mptcp_sendmsg_info info = { .data_lock_held = true,
> };
>  	struct mptcp_sock *msk = mptcp_sk(sk);
> @@ -2841,6 +2838,7 @@ static int __mptcp_push_retrans(struct sock
> *sk, struct mptcp_data_frag *dfrag)
>  
>  	mptcp_for_each_subflow(msk, subflow) {
>  		if (READ_ONCE(subflow->scheduled)) {
> +			u16 offset = sent_seq - dfrag->data_seq;
>  			u16 copied = 0;
>  
>  			mptcp_subflow_set_scheduled(subflow, false);
> @@ -2850,9 +2848,12 @@ static int __mptcp_push_retrans(struct sock
> *sk, struct mptcp_data_frag *dfrag)
>  			lock_sock(ssk);
>  
>  			/* limit retransmission to the bytes already
> sent on some subflows */
> -			info.sent = 0;
> +			info.sent = offset;
>  			info.limit = READ_ONCE(msk->csum_enabled) ?
> dfrag->data_len :
>  								   
> dfrag->already_sent;
> +			DEBUG_NET_WARN_ON_ONCE(!before64(sent_seq,
> +							 dfrag-
> >data_seq +
> +							
> info.limit));

Same as I reported in v8 of this patch [1], when testing based on v10 I
encountered the following error:

# Starting 4 threads
[  879.268966][ T4341] ------------[ cut here ]------------
[  879.269413][ T4341] !before64(sent_seq, dfrag->data_seq +
info.limit)
[  879.269417][ T4341] WARNING: net/mptcp/protocol.c:2883 at
__mptcp_push_retrans+0x562/0x6f0, CPU#18: kworker/18:6H/4341
[  879.271883][ T4341] Modules linked in: mptcp_diag tcp_diag inet_diag
sch_netem
[  879.272450][ T4341] CPU: 18 UID: 0 PID: 4341 Comm: kworker/18:6H Not
tainted 7.1.0-rc5+ #94 PREEMPT(full) 
[  879.272813][ T4341] Hardware name: Bochs Bochs, BIOS Bochs
01/01/2011
[  879.273225][ T4341] Workqueue: nvmet_tcp_wq nvmet_tcp_io_work
[  879.273527][ T4341] RIP: 0010:__mptcp_push_retrans+0x562/0x6f0

Thanks,
-Geliang

[1]
https://patchwork.kernel.org/project/mptcp/patch/a35934164aad8a08ed8108853cff7a282e5c25ef.1779485511.git.pabeni@redhat.com/

>  
>  			/*
>  			 * make the whole retrans decision, xmit,
> disallow
> @@ -2896,14 +2897,85 @@ static void __mptcp_retrans(struct sock *sk)
>  	struct mptcp_sock *msk = mptcp_sk(sk);
>  	struct mptcp_subflow_context *subflow;
>  	struct mptcp_data_frag *dfrag;
> +	bool need_retrans;
> +	u64 retrans_seq;
>  	int err, len;
>  
> -	mptcp_clean_una_wakeup(sk);
> -
> -	/* first check ssk: need to kick "stale" logic */
> -	err = mptcp_sched_get_retrans(msk);
> +	mptcp_data_lock(sk);
> +	__mptcp_clean_una_wakeup(sk);
> +	retrans_seq = msk->snd_una;
>  	dfrag = mptcp_rtx_head(sk);
> -	if (!dfrag) {
> +	need_retrans = !!dfrag;
> +	mptcp_data_unlock(sk);
> +	if (!dfrag)
> +		goto check_data_fin;
> +
> +	for (;;) {
> +		bool already_retrans;
> +		u64 sent_seq;
> +
> +		/* The default scheduler will kick "stale" logic,
> that in
> +		 * turn can process incoming acks and clean the RTX
> queue;
> +		 * ensure that the current dfrag will still be
> around
> +		 * afterwards.
> +		 */
> +		get_page(dfrag->page);
> +		err = mptcp_sched_get_retrans(msk);
> +		if (err) {
> +			put_page(dfrag->page);
> +			break;
> +		}
> +
> +		/* Incoming acks can have moved retrans sequence
> after
> +		 * the current dfrag, if so try to start again from
> RTX head.
> +		 */
> +		mptcp_data_lock(sk);
> +		already_retrans = !dfrag->already_sent ||
> +				  !before64(msk->snd_una, dfrag-
> >data_seq +
> +					    dfrag->already_sent);
> +		put_page(dfrag->page);
> +		if (already_retrans) {
> +			__mptcp_clean_una_wakeup(sk);
> +			retrans_seq = msk->snd_una;
> +			dfrag = mptcp_rtx_head(sk);
> +			need_retrans = !!dfrag;
> +		} else if (after64(msk->snd_una, retrans_seq)) {
> +			retrans_seq = msk->snd_una;
> +		}
> +		mptcp_data_unlock(sk);
> +		if (!dfrag)
> +			break;
> +
> +		/* Can fail only in case of fallback. */
> +		len = __mptcp_push_retrans(sk, dfrag, retrans_seq);
> +		if (len < 0)
> +			goto clear_scheduled;
> +
> +		retrans_seq += len;
> +		msk->bytes_retrans += len;
> +		dfrag->already_sent = max_t(u16, dfrag-
> >already_sent,
> +					    retrans_seq - dfrag-
> >data_seq);
> +
> +		/* With csum enabled retransmission can send new
> data. */
> +		sent_seq = dfrag->already_sent + dfrag->data_seq;
> +		if (after64(sent_seq, msk->snd_nxt))
> +			WRITE_ONCE(msk->snd_nxt, sent_seq);
> +
> +		/* Attempt the next fragment only if the current one
> is
> +		 * completely retransmitted.
> +		 */
> +		if (before64(retrans_seq, dfrag->data_seq + dfrag-
> >data_len))
> +			break;
> +
> +		dfrag = list_is_last(&dfrag->list, &msk->rtx_queue)
> ?
> +				NULL : list_next_entry(dfrag, list);
> +		if (!dfrag || !dfrag->already_sent)
> +			break;
> +	}
> +
> +	/* Attempt data-fin retransmission only when the RTX queue
> is empty. */
> +	if (!need_retrans) {
> +check_data_fin:
>  		if (mptcp_data_fin_enabled(msk)) {
>  			struct inet_connection_sock *icsk =
> inet_csk(sk);
>  
> @@ -2911,30 +2983,13 @@ static void __mptcp_retrans(struct sock *sk)
>  				   icsk->icsk_retransmits + 1);
>  			mptcp_set_datafin_timeout(sk);
>  			mptcp_send_ack(msk);
> -
>  			goto reset_timer;
>  		}
>  
>  		if (!mptcp_send_head(sk))
>  			goto clear_scheduled;
> -
> -		goto reset_timer;
>  	}
>  
> -	if (err)
> -		goto reset_timer;
> -
> -	len = __mptcp_push_retrans(sk, dfrag);
> -	if (len < 0)
> -		goto clear_scheduled;
> -
> -	msk->bytes_retrans += len;
> -	dfrag->already_sent = max(dfrag->already_sent, len);
> -
> -	/* With csum enabled retransmission can send new data. */
> -	if (after64(dfrag->already_sent + dfrag->data_seq, msk-
> >snd_nxt))
> -		WRITE_ONCE(msk->snd_nxt, dfrag->already_sent +
> dfrag->data_seq);
> -
>  reset_timer:
>  	mptcp_check_and_set_pending(sk);
>  

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v10 mptcp-next 3/3] mptcp: let the retrans scheduler do its job.
  2026-05-29 13:27   ` Geliang Tang
@ 2026-05-29 14:14     ` Paolo Abeni
  2026-05-29 14:41       ` Paolo Abeni
  2026-05-29 17:04       ` Paolo Abeni
  0 siblings, 2 replies; 11+ messages in thread
From: Paolo Abeni @ 2026-05-29 14:14 UTC (permalink / raw)
  To: Geliang Tang, mptcp

On 5/29/26 3:27 PM, Geliang Tang wrote:
> On Fri, 2026-05-29 at 12:44 +0200, Paolo Abeni wrote:
>> Currently the MPTCP core enforces that when MPTCP-level retrans timer
>> fires, at most a single dfrag is retransmitted. If some corner-cases
>> it
>> may be necessary retransmit multiple dfrags, and the MPTCP socket
>> will
>> need to wait multiple retrans timeout to accomplish that.
>>
>> Remove the mentioned constraint, allowing to transmit multiple dfrags
>> per
>> retrans period, as long as the scheduler keeps selecting subflows for
>> retransmissions and pending data is available in the rtx queue.
>> The default scheduler will transmit a dfrag per available subflow.
>>
>> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
>> ---
>> v9 -> v10:
>>   - simpler handling for data-fin rtx
>>
>> v7 -> v8
>>   - fix corner-case retrans_seq update
>>
>> v4 -> v5:
>>   - fixed already_sent update
>>
>> v3 -> v4:
>>   - avoid quadratic behavior, fix retrans_seq update
>>   - fix rtx timer re-schedule miss
>>
>> v2 -> v3:
>>   - fix infinite loop issue (should address tls tests failures)
>>
>> v1 -> v2:
>>   - fix retrans sequence update (sashiko)
>>
>> Note:
>>  - sashiko may see issues when dfrag = mptcp_rtx_head(sk) != NULL and
>>    dfrag->already_sent == 0. That condition should not possible: if
>>    mptcp_rtx_head() is not NULL there should be some data already
>>    sent.
>>  - sashiko may see missing data-fin rtx when the initial `dfrag` is
>>    not NULL. data-fin RTX is NOT needed in such scenario.
>> ---
>>  net/mptcp/protocol.c | 119 +++++++++++++++++++++++++++++++----------
>> --
>>  1 file changed, 87 insertions(+), 32 deletions(-)
>>
>> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
>> index 51756800edc2..7fe618e22d1b 100644
>> --- a/net/mptcp/protocol.c
>> +++ b/net/mptcp/protocol.c
>> @@ -1201,13 +1201,6 @@ static void __mptcp_clean_una_wakeup(struct
>> sock *sk)
>>  	mptcp_write_space(sk);
>>  }
>>  
>> -static void mptcp_clean_una_wakeup(struct sock *sk)
>> -{
>> -	mptcp_data_lock(sk);
>> -	__mptcp_clean_una_wakeup(sk);
>> -	mptcp_data_unlock(sk);
>> -}
>> -
>>  static void mptcp_enter_memory_pressure(struct sock *sk)
>>  {
>>  	struct mptcp_subflow_context *subflow;
>> @@ -2830,8 +2823,12 @@ static void mptcp_check_fastclose(struct
>> mptcp_sock *msk)
>>  	sk_error_report(sk);
>>  }
>>  
>> -/* Retransmit the specified data fragment on all the selected
>> subflows. */
>> -static int __mptcp_push_retrans(struct sock *sk, struct
>> mptcp_data_frag *dfrag)
>> +/*
>> + * Retransmit the specified data fragment on all the selected
>> subflows,
>> + * starting from the specified sequence
>> + */
>> +static int __mptcp_push_retrans(struct sock *sk, struct
>> mptcp_data_frag *dfrag,
>> +				u64 sent_seq)
>>  {
>>  	struct mptcp_sendmsg_info info = { .data_lock_held = true,
>> };
>>  	struct mptcp_sock *msk = mptcp_sk(sk);
>> @@ -2841,6 +2838,7 @@ static int __mptcp_push_retrans(struct sock
>> *sk, struct mptcp_data_frag *dfrag)
>>  
>>  	mptcp_for_each_subflow(msk, subflow) {
>>  		if (READ_ONCE(subflow->scheduled)) {
>> +			u16 offset = sent_seq - dfrag->data_seq;
>>  			u16 copied = 0;
>>  
>>  			mptcp_subflow_set_scheduled(subflow, false);
>> @@ -2850,9 +2848,12 @@ static int __mptcp_push_retrans(struct sock
>> *sk, struct mptcp_data_frag *dfrag)
>>  			lock_sock(ssk);
>>  
>>  			/* limit retransmission to the bytes already
>> sent on some subflows */
>> -			info.sent = 0;
>> +			info.sent = offset;
>>  			info.limit = READ_ONCE(msk->csum_enabled) ?
>> dfrag->data_len :
>>  								   
>> dfrag->already_sent;
>> +			DEBUG_NET_WARN_ON_ONCE(!before64(sent_seq,
>> +							 dfrag-
>>> data_seq +
>> +							
>> info.limit));
> 
> Same as I reported in v8 of this patch [1], 

I'm sorry, I missed the previous report. Thanks for persisting :)

> when testing based on v10 I encountered the following error:
> 
> # Starting 4 threads
> [  879.268966][ T4341] ------------[ cut here ]------------
> [  879.269413][ T4341] !before64(sent_seq, dfrag->data_seq +
> info.limit)
> [  879.269417][ T4341] WARNING: net/mptcp/protocol.c:2883 at
> __mptcp_push_retrans+0x562/0x6f0, CPU#18: kworker/18:6H/4341
> [  879.271883][ T4341] Modules linked in: mptcp_diag tcp_diag inet_diag
> sch_netem
> [  879.272450][ T4341] CPU: 18 UID: 0 PID: 4341 Comm: kworker/18:6H Not
> tainted 7.1.0-rc5+ #94 PREEMPT(full) 
> [  879.272813][ T4341] Hardware name: Bochs Bochs, BIOS Bochs
> 01/01/2011
> [  879.273225][ T4341] Workqueue: nvmet_tcp_wq nvmet_tcp_io_work
> [  879.273527][ T4341] RIP: 0010:__mptcp_push_retrans+0x562/0x6f0

Which test-case produces the above splat? Is the splat deterministic?

Sashiko reported such problem, but I don't see how that can happen
without i.e. an independently generated data corruption as root cause:

info.limit can be 0 only when dfrag->already_sent == 0, and that can
happen only if mptcp_rtx_head(sk) == NULL - if the current rtx head is
not null, the current retransmit head data fragment must have some
unackledged bytes, that is already_sent must be greater than 0.

That incoming ack can touch/modify the RTX queue while __mptcp_retrans()
is running only when __mptcp_clean_una_wakeup() is explicitly invoked,
and the code always check `dfrag` afterwards.

AFAICS if that warning fires, there is some bug elsewhere.

/P


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v10 mptcp-next 3/3] mptcp: let the retrans scheduler do its job.
  2026-05-29 14:14     ` Paolo Abeni
@ 2026-05-29 14:41       ` Paolo Abeni
  2026-05-29 17:04       ` Paolo Abeni
  1 sibling, 0 replies; 11+ messages in thread
From: Paolo Abeni @ 2026-05-29 14:41 UTC (permalink / raw)
  To: Geliang Tang, mptcp

On 5/29/26 4:14 PM, Paolo Abeni wrote:
> On 5/29/26 3:27 PM, Geliang Tang wrote:
>> On Fri, 2026-05-29 at 12:44 +0200, Paolo Abeni wrote:
>>> Currently the MPTCP core enforces that when MPTCP-level retrans timer
>>> fires, at most a single dfrag is retransmitted. If some corner-cases
>>> it
>>> may be necessary retransmit multiple dfrags, and the MPTCP socket
>>> will
>>> need to wait multiple retrans timeout to accomplish that.
>>>
>>> Remove the mentioned constraint, allowing to transmit multiple dfrags
>>> per
>>> retrans period, as long as the scheduler keeps selecting subflows for
>>> retransmissions and pending data is available in the rtx queue.
>>> The default scheduler will transmit a dfrag per available subflow.
>>>
>>> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
>>> ---
>>> v9 -> v10:
>>>   - simpler handling for data-fin rtx
>>>
>>> v7 -> v8
>>>   - fix corner-case retrans_seq update
>>>
>>> v4 -> v5:
>>>   - fixed already_sent update
>>>
>>> v3 -> v4:
>>>   - avoid quadratic behavior, fix retrans_seq update
>>>   - fix rtx timer re-schedule miss
>>>
>>> v2 -> v3:
>>>   - fix infinite loop issue (should address tls tests failures)
>>>
>>> v1 -> v2:
>>>   - fix retrans sequence update (sashiko)
>>>
>>> Note:
>>>  - sashiko may see issues when dfrag = mptcp_rtx_head(sk) != NULL and
>>>    dfrag->already_sent == 0. That condition should not possible: if
>>>    mptcp_rtx_head() is not NULL there should be some data already
>>>    sent.
>>>  - sashiko may see missing data-fin rtx when the initial `dfrag` is
>>>    not NULL. data-fin RTX is NOT needed in such scenario.
>>> ---
>>>  net/mptcp/protocol.c | 119 +++++++++++++++++++++++++++++++----------
>>> --
>>>  1 file changed, 87 insertions(+), 32 deletions(-)
>>>
>>> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
>>> index 51756800edc2..7fe618e22d1b 100644
>>> --- a/net/mptcp/protocol.c
>>> +++ b/net/mptcp/protocol.c
>>> @@ -1201,13 +1201,6 @@ static void __mptcp_clean_una_wakeup(struct
>>> sock *sk)
>>>  	mptcp_write_space(sk);
>>>  }
>>>  
>>> -static void mptcp_clean_una_wakeup(struct sock *sk)
>>> -{
>>> -	mptcp_data_lock(sk);
>>> -	__mptcp_clean_una_wakeup(sk);
>>> -	mptcp_data_unlock(sk);
>>> -}
>>> -
>>>  static void mptcp_enter_memory_pressure(struct sock *sk)
>>>  {
>>>  	struct mptcp_subflow_context *subflow;
>>> @@ -2830,8 +2823,12 @@ static void mptcp_check_fastclose(struct
>>> mptcp_sock *msk)
>>>  	sk_error_report(sk);
>>>  }
>>>  
>>> -/* Retransmit the specified data fragment on all the selected
>>> subflows. */
>>> -static int __mptcp_push_retrans(struct sock *sk, struct
>>> mptcp_data_frag *dfrag)
>>> +/*
>>> + * Retransmit the specified data fragment on all the selected
>>> subflows,
>>> + * starting from the specified sequence
>>> + */
>>> +static int __mptcp_push_retrans(struct sock *sk, struct
>>> mptcp_data_frag *dfrag,
>>> +				u64 sent_seq)
>>>  {
>>>  	struct mptcp_sendmsg_info info = { .data_lock_held = true,
>>> };
>>>  	struct mptcp_sock *msk = mptcp_sk(sk);
>>> @@ -2841,6 +2838,7 @@ static int __mptcp_push_retrans(struct sock
>>> *sk, struct mptcp_data_frag *dfrag)
>>>  
>>>  	mptcp_for_each_subflow(msk, subflow) {
>>>  		if (READ_ONCE(subflow->scheduled)) {
>>> +			u16 offset = sent_seq - dfrag->data_seq;
>>>  			u16 copied = 0;
>>>  
>>>  			mptcp_subflow_set_scheduled(subflow, false);
>>> @@ -2850,9 +2848,12 @@ static int __mptcp_push_retrans(struct sock
>>> *sk, struct mptcp_data_frag *dfrag)
>>>  			lock_sock(ssk);
>>>  
>>>  			/* limit retransmission to the bytes already
>>> sent on some subflows */
>>> -			info.sent = 0;
>>> +			info.sent = offset;
>>>  			info.limit = READ_ONCE(msk->csum_enabled) ?
>>> dfrag->data_len :
>>>  								   
>>> dfrag->already_sent;
>>> +			DEBUG_NET_WARN_ON_ONCE(!before64(sent_seq,
>>> +							 dfrag-
>>>> data_seq +
>>> +							
>>> info.limit));
>>
>> Same as I reported in v8 of this patch [1], 
> 
> I'm sorry, I missed the previous report. Thanks for persisting :)
> 
>> when testing based on v10 I encountered the following error:
>>
>> # Starting 4 threads
>> [  879.268966][ T4341] ------------[ cut here ]------------
>> [  879.269413][ T4341] !before64(sent_seq, dfrag->data_seq +
>> info.limit)
>> [  879.269417][ T4341] WARNING: net/mptcp/protocol.c:2883 at
>> __mptcp_push_retrans+0x562/0x6f0, CPU#18: kworker/18:6H/4341
>> [  879.271883][ T4341] Modules linked in: mptcp_diag tcp_diag inet_diag
>> sch_netem
>> [  879.272450][ T4341] CPU: 18 UID: 0 PID: 4341 Comm: kworker/18:6H Not
>> tainted 7.1.0-rc5+ #94 PREEMPT(full) 
>> [  879.272813][ T4341] Hardware name: Bochs Bochs, BIOS Bochs
>> 01/01/2011
>> [  879.273225][ T4341] Workqueue: nvmet_tcp_wq nvmet_tcp_io_work
>> [  879.273527][ T4341] RIP: 0010:__mptcp_push_retrans+0x562/0x6f0
> 
> Which test-case produces the above splat? Is the splat deterministic?
> 
> Sashiko reported such problem, but I don't see how that can happen
> without i.e. an independently generated data corruption as root cause:
> 
> info.limit can be 0 only when dfrag->already_sent == 0, and that can
> happen only if mptcp_rtx_head(sk) == NULL - if the current rtx head is
> not null, the current retransmit head data fragment must have some
> unackledged bytes, that is already_sent must be greater than 0.
> 
> That incoming ack can touch/modify the RTX queue while __mptcp_retrans()
> is running only when __mptcp_clean_una_wakeup() is explicitly invoked,
> and the code always check `dfrag` afterwards.
> 
> AFAICS if that warning fires, there is some bug elsewhere.

Well, sort of: __mptcp_retransmit_pending_data() explicitly breaks the
assumption above by explicitly clearing `already_sent` in all the data
fragments in the retransmission queue.

__mptcp_retransmit_pending_data() behavior is actually "correct" - to
ensure recovery. I'll and explicit `already_sent == 0` check and will
drop the DEBUG_NET_WARN_ON_ONCE().

/P


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v10 mptcp-next 3/3] mptcp: let the retrans scheduler do its job.
  2026-05-29 14:14     ` Paolo Abeni
  2026-05-29 14:41       ` Paolo Abeni
@ 2026-05-29 17:04       ` Paolo Abeni
  2026-05-30  4:23         ` Geliang Tang
  1 sibling, 1 reply; 11+ messages in thread
From: Paolo Abeni @ 2026-05-29 17:04 UTC (permalink / raw)
  To: Geliang Tang, mptcp

On 5/29/26 4:14 PM, Paolo Abeni wrote:
> On 5/29/26 3:27 PM, Geliang Tang wrote:
>> On Fri, 2026-05-29 at 12:44 +0200, Paolo Abeni wrote:
>>> Currently the MPTCP core enforces that when MPTCP-level retrans timer
>>> fires, at most a single dfrag is retransmitted. If some corner-cases
>>> it
>>> may be necessary retransmit multiple dfrags, and the MPTCP socket
>>> will
>>> need to wait multiple retrans timeout to accomplish that.
>>>
>>> Remove the mentioned constraint, allowing to transmit multiple dfrags
>>> per
>>> retrans period, as long as the scheduler keeps selecting subflows for
>>> retransmissions and pending data is available in the rtx queue.
>>> The default scheduler will transmit a dfrag per available subflow.
>>>
>>> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
>>> ---
>>> v9 -> v10:
>>>   - simpler handling for data-fin rtx
>>>
>>> v7 -> v8
>>>   - fix corner-case retrans_seq update
>>>
>>> v4 -> v5:
>>>   - fixed already_sent update
>>>
>>> v3 -> v4:
>>>   - avoid quadratic behavior, fix retrans_seq update
>>>   - fix rtx timer re-schedule miss
>>>
>>> v2 -> v3:
>>>   - fix infinite loop issue (should address tls tests failures)
>>>
>>> v1 -> v2:
>>>   - fix retrans sequence update (sashiko)
>>>
>>> Note:
>>>  - sashiko may see issues when dfrag = mptcp_rtx_head(sk) != NULL and
>>>    dfrag->already_sent == 0. That condition should not possible: if
>>>    mptcp_rtx_head() is not NULL there should be some data already
>>>    sent.
>>>  - sashiko may see missing data-fin rtx when the initial `dfrag` is
>>>    not NULL. data-fin RTX is NOT needed in such scenario.
>>> ---
>>>  net/mptcp/protocol.c | 119 +++++++++++++++++++++++++++++++----------
>>> --
>>>  1 file changed, 87 insertions(+), 32 deletions(-)
>>>
>>> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
>>> index 51756800edc2..7fe618e22d1b 100644
>>> --- a/net/mptcp/protocol.c
>>> +++ b/net/mptcp/protocol.c
>>> @@ -1201,13 +1201,6 @@ static void __mptcp_clean_una_wakeup(struct
>>> sock *sk)
>>>  	mptcp_write_space(sk);
>>>  }
>>>  
>>> -static void mptcp_clean_una_wakeup(struct sock *sk)
>>> -{
>>> -	mptcp_data_lock(sk);
>>> -	__mptcp_clean_una_wakeup(sk);
>>> -	mptcp_data_unlock(sk);
>>> -}
>>> -
>>>  static void mptcp_enter_memory_pressure(struct sock *sk)
>>>  {
>>>  	struct mptcp_subflow_context *subflow;
>>> @@ -2830,8 +2823,12 @@ static void mptcp_check_fastclose(struct
>>> mptcp_sock *msk)
>>>  	sk_error_report(sk);
>>>  }
>>>  
>>> -/* Retransmit the specified data fragment on all the selected
>>> subflows. */
>>> -static int __mptcp_push_retrans(struct sock *sk, struct
>>> mptcp_data_frag *dfrag)
>>> +/*
>>> + * Retransmit the specified data fragment on all the selected
>>> subflows,
>>> + * starting from the specified sequence
>>> + */
>>> +static int __mptcp_push_retrans(struct sock *sk, struct
>>> mptcp_data_frag *dfrag,
>>> +				u64 sent_seq)
>>>  {
>>>  	struct mptcp_sendmsg_info info = { .data_lock_held = true,
>>> };
>>>  	struct mptcp_sock *msk = mptcp_sk(sk);
>>> @@ -2841,6 +2838,7 @@ static int __mptcp_push_retrans(struct sock
>>> *sk, struct mptcp_data_frag *dfrag)
>>>  
>>>  	mptcp_for_each_subflow(msk, subflow) {
>>>  		if (READ_ONCE(subflow->scheduled)) {
>>> +			u16 offset = sent_seq - dfrag->data_seq;
>>>  			u16 copied = 0;
>>>  
>>>  			mptcp_subflow_set_scheduled(subflow, false);
>>> @@ -2850,9 +2848,12 @@ static int __mptcp_push_retrans(struct sock
>>> *sk, struct mptcp_data_frag *dfrag)
>>>  			lock_sock(ssk);
>>>  
>>>  			/* limit retransmission to the bytes already
>>> sent on some subflows */
>>> -			info.sent = 0;
>>> +			info.sent = offset;
>>>  			info.limit = READ_ONCE(msk->csum_enabled) ?
>>> dfrag->data_len :
>>>  								   
>>> dfrag->already_sent;
>>> +			DEBUG_NET_WARN_ON_ONCE(!before64(sent_seq,
>>> +							 dfrag-
>>>> data_seq +
>>> +							
>>> info.limit));
>>
>> Same as I reported in v8 of this patch [1], 
> 
> I'm sorry, I missed the previous report. Thanks for persisting :)
> 
>> when testing based on v10 I encountered the following error:
>>
>> # Starting 4 threads
>> [  879.268966][ T4341] ------------[ cut here ]------------
>> [  879.269413][ T4341] !before64(sent_seq, dfrag->data_seq +
>> info.limit)
>> [  879.269417][ T4341] WARNING: net/mptcp/protocol.c:2883 at
>> __mptcp_push_retrans+0x562/0x6f0, CPU#18: kworker/18:6H/4341
>> [  879.271883][ T4341] Modules linked in: mptcp_diag tcp_diag inet_diag
>> sch_netem
>> [  879.272450][ T4341] CPU: 18 UID: 0 PID: 4341 Comm: kworker/18:6H Not
>> tainted 7.1.0-rc5+ #94 PREEMPT(full) 
>> [  879.272813][ T4341] Hardware name: Bochs Bochs, BIOS Bochs
>> 01/01/2011
>> [  879.273225][ T4341] Workqueue: nvmet_tcp_wq nvmet_tcp_io_work
>> [  879.273527][ T4341] RIP: 0010:__mptcp_push_retrans+0x562/0x6f0
> 
> Which test-case produces the above splat? Is the splat deterministic?
> 
> Sashiko reported such problem, but I don't see how that can happen
> without i.e. an independently generated data corruption as root cause:
> 
> info.limit can be 0 only when dfrag->already_sent == 0, and that can
> happen only if mptcp_rtx_head(sk) == NULL - if the current rtx head is
> not null, the current retransmit head data fragment must have some
> unackledged bytes, that is already_sent must be greater than 0.
> 
> That incoming ack can touch/modify the RTX queue while __mptcp_retrans()
> is running only when __mptcp_clean_una_wakeup() is explicitly invoked,
> and the code always check `dfrag` afterwards.
> 
> AFAICS if that warning fires, there is some bug elsewhere.

Well, sort of: __mptcp_retransmit_pending_data() explicitly breaks the
assumption above by explicitly clearing `already_sent` in all the data
fragments in the retransmission queue.

__mptcp_retransmit_pending_data() behavior is actually "correct" - to
ensure recovery. I'll and explicit `already_sent == 0` check and will
drop the DEBUG_NET_WARN_ON_ONCE().

/P


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v10 mptcp-next 3/3] mptcp: let the retrans scheduler do its job.
  2026-05-29 17:04       ` Paolo Abeni
@ 2026-05-30  4:23         ` Geliang Tang
  0 siblings, 0 replies; 11+ messages in thread
From: Geliang Tang @ 2026-05-30  4:23 UTC (permalink / raw)
  To: Paolo Abeni, mptcp

Hi Paolo,

On Fri, 2026-05-29 at 19:04 +0200, Paolo Abeni wrote:
> On 5/29/26 4:14 PM, Paolo Abeni wrote:
> > On 5/29/26 3:27 PM, Geliang Tang wrote:
> > > On Fri, 2026-05-29 at 12:44 +0200, Paolo Abeni wrote:
> > > > Currently the MPTCP core enforces that when MPTCP-level retrans
> > > > timer
> > > > fires, at most a single dfrag is retransmitted. If some corner-
> > > > cases
> > > > it
> > > > may be necessary retransmit multiple dfrags, and the MPTCP
> > > > socket
> > > > will
> > > > need to wait multiple retrans timeout to accomplish that.
> > > > 
> > > > Remove the mentioned constraint, allowing to transmit multiple
> > > > dfrags
> > > > per
> > > > retrans period, as long as the scheduler keeps selecting
> > > > subflows for
> > > > retransmissions and pending data is available in the rtx queue.
> > > > The default scheduler will transmit a dfrag per available
> > > > subflow.
> > > > 
> > > > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > > > ---
> > > > v9 -> v10:
> > > >   - simpler handling for data-fin rtx
> > > > 
> > > > v7 -> v8
> > > >   - fix corner-case retrans_seq update
> > > > 
> > > > v4 -> v5:
> > > >   - fixed already_sent update
> > > > 
> > > > v3 -> v4:
> > > >   - avoid quadratic behavior, fix retrans_seq update
> > > >   - fix rtx timer re-schedule miss
> > > > 
> > > > v2 -> v3:
> > > >   - fix infinite loop issue (should address tls tests failures)
> > > > 
> > > > v1 -> v2:
> > > >   - fix retrans sequence update (sashiko)
> > > > 
> > > > Note:
> > > >  - sashiko may see issues when dfrag = mptcp_rtx_head(sk) !=
> > > > NULL and
> > > >    dfrag->already_sent == 0. That condition should not
> > > > possible: if
> > > >    mptcp_rtx_head() is not NULL there should be some data
> > > > already
> > > >    sent.
> > > >  - sashiko may see missing data-fin rtx when the initial
> > > > `dfrag` is
> > > >    not NULL. data-fin RTX is NOT needed in such scenario.
> > > > ---
> > > >  net/mptcp/protocol.c | 119 +++++++++++++++++++++++++++++++----
> > > > ------
> > > > --
> > > >  1 file changed, 87 insertions(+), 32 deletions(-)
> > > > 
> > > > diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> > > > index 51756800edc2..7fe618e22d1b 100644
> > > > --- a/net/mptcp/protocol.c
> > > > +++ b/net/mptcp/protocol.c
> > > > @@ -1201,13 +1201,6 @@ static void
> > > > __mptcp_clean_una_wakeup(struct
> > > > sock *sk)
> > > >  	mptcp_write_space(sk);
> > > >  }
> > > >  
> > > > -static void mptcp_clean_una_wakeup(struct sock *sk)
> > > > -{
> > > > -	mptcp_data_lock(sk);
> > > > -	__mptcp_clean_una_wakeup(sk);
> > > > -	mptcp_data_unlock(sk);
> > > > -}
> > > > -
> > > >  static void mptcp_enter_memory_pressure(struct sock *sk)
> > > >  {
> > > >  	struct mptcp_subflow_context *subflow;
> > > > @@ -2830,8 +2823,12 @@ static void mptcp_check_fastclose(struct
> > > > mptcp_sock *msk)
> > > >  	sk_error_report(sk);
> > > >  }
> > > >  
> > > > -/* Retransmit the specified data fragment on all the selected
> > > > subflows. */
> > > > -static int __mptcp_push_retrans(struct sock *sk, struct
> > > > mptcp_data_frag *dfrag)
> > > > +/*
> > > > + * Retransmit the specified data fragment on all the selected
> > > > subflows,
> > > > + * starting from the specified sequence
> > > > + */
> > > > +static int __mptcp_push_retrans(struct sock *sk, struct
> > > > mptcp_data_frag *dfrag,
> > > > +				u64 sent_seq)
> > > >  {
> > > >  	struct mptcp_sendmsg_info info = { .data_lock_held =
> > > > true,
> > > > };
> > > >  	struct mptcp_sock *msk = mptcp_sk(sk);
> > > > @@ -2841,6 +2838,7 @@ static int __mptcp_push_retrans(struct
> > > > sock
> > > > *sk, struct mptcp_data_frag *dfrag)
> > > >  
> > > >  	mptcp_for_each_subflow(msk, subflow) {
> > > >  		if (READ_ONCE(subflow->scheduled)) {
> > > > +			u16 offset = sent_seq - dfrag-
> > > > >data_seq;
> > > >  			u16 copied = 0;
> > > >  
> > > >  			mptcp_subflow_set_scheduled(subflow,
> > > > false);
> > > > @@ -2850,9 +2848,12 @@ static int __mptcp_push_retrans(struct
> > > > sock
> > > > *sk, struct mptcp_data_frag *dfrag)
> > > >  			lock_sock(ssk);
> > > >  
> > > >  			/* limit retransmission to the bytes
> > > > already
> > > > sent on some subflows */
> > > > -			info.sent = 0;
> > > > +			info.sent = offset;
> > > >  			info.limit = READ_ONCE(msk-
> > > > >csum_enabled) ?
> > > > dfrag->data_len :
> > > >  							
> > > > 	   
> > > > dfrag->already_sent;
> > > > +			DEBUG_NET_WARN_ON_ONCE(!before64(sent_
> > > > seq,
> > > > +							
> > > > dfrag-
> > > > > data_seq +
> > > > +							
> > > > info.limit));
> > > 
> > > Same as I reported in v8 of this patch [1], 
> > 
> > I'm sorry, I missed the previous report. Thanks for persisting :)
> > 
> > > when testing based on v10 I encountered the following error:
> > > 
> > > # Starting 4 threads
> > > [  879.268966][ T4341] ------------[ cut here ]------------
> > > [  879.269413][ T4341] !before64(sent_seq, dfrag->data_seq +
> > > info.limit)
> > > [  879.269417][ T4341] WARNING: net/mptcp/protocol.c:2883 at
> > > __mptcp_push_retrans+0x562/0x6f0, CPU#18: kworker/18:6H/4341
> > > [  879.271883][ T4341] Modules linked in: mptcp_diag tcp_diag
> > > inet_diag
> > > sch_netem
> > > [  879.272450][ T4341] CPU: 18 UID: 0 PID: 4341 Comm:
> > > kworker/18:6H Not
> > > tainted 7.1.0-rc5+ #94 PREEMPT(full) 
> > > [  879.272813][ T4341] Hardware name: Bochs Bochs, BIOS Bochs
> > > 01/01/2011
> > > [  879.273225][ T4341] Workqueue: nvmet_tcp_wq nvmet_tcp_io_work
> > > [  879.273527][ T4341] RIP: 0010:__mptcp_push_retrans+0x562/0x6f0
> > 
> > Which test-case produces the above splat? Is the splat
> > deterministic?
> > 
> > Sashiko reported such problem, but I don't see how that can happen
> > without i.e. an independently generated data corruption as root
> > cause:
> > 
> > info.limit can be 0 only when dfrag->already_sent == 0, and that
> > can
> > happen only if mptcp_rtx_head(sk) == NULL - if the current rtx head
> > is
> > not null, the current retransmit head data fragment must have some
> > unackledged bytes, that is already_sent must be greater than 0.
> > 
> > That incoming ack can touch/modify the RTX queue while
> > __mptcp_retrans()
> > is running only when __mptcp_clean_una_wakeup() is explicitly
> > invoked,
> > and the code always check `dfrag` afterwards.
> > 
> > AFAICS if that warning fires, there is some bug elsewhere.
> 
> Well, sort of: __mptcp_retransmit_pending_data() explicitly breaks
> the
> assumption above by explicitly clearing `already_sent` in all the
> data
> fragments in the retransmission queue.
> 
> __mptcp_retransmit_pending_data() behavior is actually "correct" - to
> ensure recovery. I'll and explicit `already_sent == 0` check and will
> drop the DEBUG_NET_WARN_ON_ONCE().

Thank you very much. Except for this DEBUG_NET_WARN_ON_ONCE, all tests
are working well. Please add my tag for this series:

Tested-and-Acked-by: Geliang Tang <geliang@kernel.org>

-Geliang

> 
> /P

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v10 mptcp-next 1/3] Squash-to: "mptcp: implemented OoO queue pruning"
  2026-05-29 10:44 ` [PATCH v10 mptcp-next 1/3] Squash-to: "mptcp: implemented OoO queue pruning" Paolo Abeni
@ 2026-05-30  8:52   ` Matthieu Baerts
  0 siblings, 0 replies; 11+ messages in thread
From: Matthieu Baerts @ 2026-05-30  8:52 UTC (permalink / raw)
  To: Paolo Abeni, mptcp

Hi Paolo,

On 29/05/2026 20:44, Paolo Abeni wrote:
> As discussed in the original patch, use the proper helper to
> get the currently used memory, and let mptcp_prune_ofo_queue()
> return true when skb can be accepted, to avoid code churn in
> the caller.
I just applied this patch 1/3, but not 2-3/3.

New patches for t/upstream:
- cba0bf051b53: Squash-to: "mptcp: implemented OoO queue pruning"
- Results: 4d6bda29d05f..24c473fa6c91 (export)

Tests are now in progress:

- export:
https://github.com/multipath-tcp/mptcp_net-next/commit/9d89b8f76d6640109b7e412a3710c62c67000ac1/checks

Cheers,
Matt
-- 
Sponsored by the NGI0 Core fund.


^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2026-05-30  8:52 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-05-29 10:44 [PATCH v10 mptcp-next 0/3] mptcp: address stall under memory pressure Paolo Abeni
2026-05-29 10:44 ` [PATCH v10 mptcp-next 1/3] Squash-to: "mptcp: implemented OoO queue pruning" Paolo Abeni
2026-05-30  8:52   ` Matthieu Baerts
2026-05-29 10:44 ` [PATCH v10 mptcp-next 2/3] mptcp: move the retrans loop to a separate helper Paolo Abeni
2026-05-29 10:44 ` [PATCH v10 mptcp-next 3/3] mptcp: let the retrans scheduler do its job Paolo Abeni
2026-05-29 13:27   ` Geliang Tang
2026-05-29 14:14     ` Paolo Abeni
2026-05-29 14:41       ` Paolo Abeni
2026-05-29 17:04       ` Paolo Abeni
2026-05-30  4:23         ` Geliang Tang
2026-05-29 12:00 ` [PATCH v10 mptcp-next 0/3] mptcp: address stall under memory pressure MPTCP CI

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.