* [PATCH v2 mptcp-next 1/7] trace: mptcp: add mptcp_rcvbuf_grow tracepoint
2025-11-04 21:51 [PATCH v2 mptcp-next 0/7] mptcp: autotune related improvement Paolo Abeni
@ 2025-11-04 21:51 ` Paolo Abeni
2025-11-05 0:24 ` Mat Martineau
2025-11-04 21:51 ` [PATCH v2 mptcp-next 2/7] mptcp: avoid unneeded subflow-level drops Paolo Abeni
` (6 subsequent siblings)
7 siblings, 1 reply; 11+ messages in thread
From: Paolo Abeni @ 2025-11-04 21:51 UTC (permalink / raw)
To: mptcp
Similar to tcp, provide a new tracepoint to better understand
mptcp_rcv_space_adjust() behavior, which presents many artifacts.
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
include/trace/events/mptcp.h | 74 ++++++++++++++++++++++++++++++++++++
net/mptcp/protocol.c | 3 ++
2 files changed, 77 insertions(+)
diff --git a/include/trace/events/mptcp.h b/include/trace/events/mptcp.h
index 085b749cdd97..71fd6d33f48b 100644
--- a/include/trace/events/mptcp.h
+++ b/include/trace/events/mptcp.h
@@ -178,6 +178,80 @@ TRACE_EVENT(subflow_check_data_avail,
__entry->skb)
);
+#include <trace/events/net_probe_common.h>
+
+TRACE_EVENT(mptcp_rcvbuf_grow,
+
+ TP_PROTO(struct sock *sk, int time),
+
+ TP_ARGS(sk, time),
+
+ TP_STRUCT__entry(
+ __field(int, time)
+ __field(__u32, rtt_us)
+ __field(__u32, copied)
+ __field(__u32, inq)
+ __field(__u32, space)
+ __field(__u32, ooo_space)
+ __field(__u32, rcvbuf)
+ __field(__u32, rcv_wnd)
+ __field(__u8, scaling_ratio)
+ __field(__u16, sport)
+ __field(__u16, dport)
+ __field(__u16, family)
+ __array(__u8, saddr, 4)
+ __array(__u8, daddr, 4)
+ __array(__u8, saddr_v6, 16)
+ __array(__u8, daddr_v6, 16)
+ __field(const void *, skaddr)
+ ),
+
+ TP_fast_assign(
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct inet_sock *inet = inet_sk(sk);
+ __be32 *p32;
+
+ __entry->time = time;
+ __entry->rtt_us = msk->rcvq_space.rtt_us >> 3;
+ __entry->copied = msk->rcvq_space.copied;
+ __entry->inq = mptcp_inq_hint(sk);
+ __entry->space = msk->rcvq_space.space;
+ __entry->ooo_space = RB_EMPTY_ROOT(&msk->out_of_order_queue) ? 0 :
+ MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq -
+ msk->ack_seq;
+
+ __entry->rcvbuf = sk->sk_rcvbuf;
+ __entry->rcv_wnd = atomic64_read(&msk->rcv_wnd_sent) - msk->ack_seq;
+ __entry->scaling_ratio = msk->scaling_ratio;
+ __entry->sport = ntohs(inet->inet_sport);
+ __entry->dport = ntohs(inet->inet_dport);
+ __entry->family = sk->sk_family;
+
+ p32 = (__be32 *) __entry->saddr;
+ *p32 = inet->inet_saddr;
+
+ p32 = (__be32 *) __entry->daddr;
+ *p32 = inet->inet_daddr;
+
+ TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
+ sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
+
+ __entry->skaddr = sk;
+ ),
+
+ TP_printk("time=%u rtt_us=%u copied=%u inq=%u space=%u ooo=%u scaling_ratio=%u rcvbuf=%u "
+ "rcv_wnd=%u "
+ "sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 "
+ "saddrv6=%pI6c daddrv6=%pI6c skaddr=%p",
+ __entry->time, __entry->rtt_us, __entry->copied,
+ __entry->inq, __entry->space, __entry->ooo_space,
+ __entry->scaling_ratio, __entry->rcvbuf,
+ __entry->rcv_wnd,
+ __entry->sport, __entry->dport,
+ __entry->saddr, __entry->daddr,
+ __entry->saddr_v6, __entry->daddr_v6,
+ __entry->skaddr)
+);
#endif /* _TRACE_MPTCP_H */
/* This part must be outside protection */
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 01114456dec6..443406bc4a54 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -28,6 +28,8 @@
#include "protocol.h"
#include "mib.h"
+static unsigned int mptcp_inq_hint(const struct sock *sk);
+
#define CREATE_TRACE_POINTS
#include <trace/events/mptcp.h>
@@ -2101,6 +2103,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
if (msk->rcvq_space.copied <= msk->rcvq_space.space)
goto new_measure;
+ trace_mptcp_rcvbuf_grow(sk, time);
if (mptcp_rcvbuf_grow(sk, msk->rcvq_space.copied)) {
/* Make subflows follow along. If we do not do this, we
* get drops at subflow level if skbs can't be moved to
--
2.51.0
^ permalink raw reply related [flat|nested] 11+ messages in thread* Re: [PATCH v2 mptcp-next 1/7] trace: mptcp: add mptcp_rcvbuf_grow tracepoint
2025-11-04 21:51 ` [PATCH v2 mptcp-next 1/7] trace: mptcp: add mptcp_rcvbuf_grow tracepoint Paolo Abeni
@ 2025-11-05 0:24 ` Mat Martineau
2025-11-05 9:40 ` Paolo Abeni
0 siblings, 1 reply; 11+ messages in thread
From: Mat Martineau @ 2025-11-05 0:24 UTC (permalink / raw)
To: Paolo Abeni; +Cc: mptcp
On Tue, 4 Nov 2025, Paolo Abeni wrote:
> Similar to tcp, provide a new tracepoint to better understand
> mptcp_rcv_space_adjust() behavior, which presents many artifacts.
>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
> include/trace/events/mptcp.h | 74 ++++++++++++++++++++++++++++++++++++
> net/mptcp/protocol.c | 3 ++
> 2 files changed, 77 insertions(+)
>
> diff --git a/include/trace/events/mptcp.h b/include/trace/events/mptcp.h
> index 085b749cdd97..71fd6d33f48b 100644
> --- a/include/trace/events/mptcp.h
> +++ b/include/trace/events/mptcp.h
> @@ -178,6 +178,80 @@ TRACE_EVENT(subflow_check_data_avail,
> __entry->skb)
> );
>
> +#include <trace/events/net_probe_common.h>
> +
> +TRACE_EVENT(mptcp_rcvbuf_grow,
> +
> + TP_PROTO(struct sock *sk, int time),
> +
> + TP_ARGS(sk, time),
> +
> + TP_STRUCT__entry(
> + __field(int, time)
> + __field(__u32, rtt_us)
> + __field(__u32, copied)
> + __field(__u32, inq)
> + __field(__u32, space)
> + __field(__u32, ooo_space)
> + __field(__u32, rcvbuf)
> + __field(__u32, rcv_wnd)
> + __field(__u8, scaling_ratio)
> + __field(__u16, sport)
> + __field(__u16, dport)
> + __field(__u16, family)
> + __array(__u8, saddr, 4)
> + __array(__u8, daddr, 4)
> + __array(__u8, saddr_v6, 16)
> + __array(__u8, daddr_v6, 16)
> + __field(const void *, skaddr)
> + ),
> +
> + TP_fast_assign(
> + struct mptcp_sock *msk = mptcp_sk(sk);
> + struct inet_sock *inet = inet_sk(sk);
> + __be32 *p32;
> +
> + __entry->time = time;
> + __entry->rtt_us = msk->rcvq_space.rtt_us >> 3;
> + __entry->copied = msk->rcvq_space.copied;
> + __entry->inq = mptcp_inq_hint(sk);
> + __entry->space = msk->rcvq_space.space;
> + __entry->ooo_space = RB_EMPTY_ROOT(&msk->out_of_order_queue) ? 0 :
> + MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq -
> + msk->ack_seq;
> +
> + __entry->rcvbuf = sk->sk_rcvbuf;
> + __entry->rcv_wnd = atomic64_read(&msk->rcv_wnd_sent) - msk->ack_seq;
> + __entry->scaling_ratio = msk->scaling_ratio;
> + __entry->sport = ntohs(inet->inet_sport);
> + __entry->dport = ntohs(inet->inet_dport);
> + __entry->family = sk->sk_family;
Hi Paolo -
__entry->family isn't referenced in the TP_printk() below.
Other than that, the series is looking good. I still need to work on
understanding the last 2 patches, even with the commit messages & comments
the behavioral changes/consequences aren't clear to me yet.
- Mat
> +
> + p32 = (__be32 *) __entry->saddr;
> + *p32 = inet->inet_saddr;
> +
> + p32 = (__be32 *) __entry->daddr;
> + *p32 = inet->inet_daddr;
> +
> + TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
> + sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
> +
> + __entry->skaddr = sk;
> + ),
> +
> + TP_printk("time=%u rtt_us=%u copied=%u inq=%u space=%u ooo=%u scaling_ratio=%u rcvbuf=%u "
> + "rcv_wnd=%u "
> + "sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 "
> + "saddrv6=%pI6c daddrv6=%pI6c skaddr=%p",
> + __entry->time, __entry->rtt_us, __entry->copied,
> + __entry->inq, __entry->space, __entry->ooo_space,
> + __entry->scaling_ratio, __entry->rcvbuf,
> + __entry->rcv_wnd,
> + __entry->sport, __entry->dport,
> + __entry->saddr, __entry->daddr,
> + __entry->saddr_v6, __entry->daddr_v6,
> + __entry->skaddr)
> +);
> #endif /* _TRACE_MPTCP_H */
>
> /* This part must be outside protection */
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 01114456dec6..443406bc4a54 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -28,6 +28,8 @@
> #include "protocol.h"
> #include "mib.h"
>
> +static unsigned int mptcp_inq_hint(const struct sock *sk);
> +
> #define CREATE_TRACE_POINTS
> #include <trace/events/mptcp.h>
>
> @@ -2101,6 +2103,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
> if (msk->rcvq_space.copied <= msk->rcvq_space.space)
> goto new_measure;
>
> + trace_mptcp_rcvbuf_grow(sk, time);
> if (mptcp_rcvbuf_grow(sk, msk->rcvq_space.copied)) {
> /* Make subflows follow along. If we do not do this, we
> * get drops at subflow level if skbs can't be moved to
> --
> 2.51.0
>
>
>
^ permalink raw reply [flat|nested] 11+ messages in thread* Re: [PATCH v2 mptcp-next 1/7] trace: mptcp: add mptcp_rcvbuf_grow tracepoint
2025-11-05 0:24 ` Mat Martineau
@ 2025-11-05 9:40 ` Paolo Abeni
0 siblings, 0 replies; 11+ messages in thread
From: Paolo Abeni @ 2025-11-05 9:40 UTC (permalink / raw)
To: Mat Martineau; +Cc: mptcp
On 11/5/25 1:24 AM, Mat Martineau wrote:
> On Tue, 4 Nov 2025, Paolo Abeni wrote:
>
>> Similar to tcp, provide a new tracepoint to better understand
>> mptcp_rcv_space_adjust() behavior, which presents many artifacts.
>>
>> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
>> ---
>> include/trace/events/mptcp.h | 74 ++++++++++++++++++++++++++++++++++++
>> net/mptcp/protocol.c | 3 ++
>> 2 files changed, 77 insertions(+)
>>
>> diff --git a/include/trace/events/mptcp.h b/include/trace/events/mptcp.h
>> index 085b749cdd97..71fd6d33f48b 100644
>> --- a/include/trace/events/mptcp.h
>> +++ b/include/trace/events/mptcp.h
>> @@ -178,6 +178,80 @@ TRACE_EVENT(subflow_check_data_avail,
>> __entry->skb)
>> );
>>
>> +#include <trace/events/net_probe_common.h>
>> +
>> +TRACE_EVENT(mptcp_rcvbuf_grow,
>> +
>> + TP_PROTO(struct sock *sk, int time),
>> +
>> + TP_ARGS(sk, time),
>> +
>> + TP_STRUCT__entry(
>> + __field(int, time)
>> + __field(__u32, rtt_us)
>> + __field(__u32, copied)
>> + __field(__u32, inq)
>> + __field(__u32, space)
>> + __field(__u32, ooo_space)
>> + __field(__u32, rcvbuf)
>> + __field(__u32, rcv_wnd)
>> + __field(__u8, scaling_ratio)
>> + __field(__u16, sport)
>> + __field(__u16, dport)
>> + __field(__u16, family)
>> + __array(__u8, saddr, 4)
>> + __array(__u8, daddr, 4)
>> + __array(__u8, saddr_v6, 16)
>> + __array(__u8, daddr_v6, 16)
>> + __field(const void *, skaddr)
>> + ),
>> +
>> + TP_fast_assign(
>> + struct mptcp_sock *msk = mptcp_sk(sk);
>> + struct inet_sock *inet = inet_sk(sk);
>> + __be32 *p32;
>> +
>> + __entry->time = time;
>> + __entry->rtt_us = msk->rcvq_space.rtt_us >> 3;
>> + __entry->copied = msk->rcvq_space.copied;
>> + __entry->inq = mptcp_inq_hint(sk);
>> + __entry->space = msk->rcvq_space.space;
>> + __entry->ooo_space = RB_EMPTY_ROOT(&msk->out_of_order_queue) ? 0 :
>> + MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq -
>> + msk->ack_seq;
>> +
>> + __entry->rcvbuf = sk->sk_rcvbuf;
>> + __entry->rcv_wnd = atomic64_read(&msk->rcv_wnd_sent) - msk->ack_seq;
>> + __entry->scaling_ratio = msk->scaling_ratio;
>> + __entry->sport = ntohs(inet->inet_sport);
>> + __entry->dport = ntohs(inet->inet_dport);
>> + __entry->family = sk->sk_family;
>
> Hi Paolo -
>
> __entry->family isn't referenced in the TP_printk() below.
>
> Other than that, the series is looking good. I still need to work on
> understanding the last 2 patches, even with the commit messages & comments
> the behavioral changes/consequences aren't clear to me yet.
patch 7/7 is just 'inspired' by a similar tcp change (commit
ea33537d82921e71f852ea2ed985acc562125efe); the goal is making DRS
converge faster to the right size. It should not have any downside.
patch 6/7 'fixes rtt estimation'; with the current algo there are some
major issues:
- max() is simply wrong, as we need to react reasonably fast. If a link
has extreme high latency and another is very fast, DRS will converge
very slowly. This is addressed using min()
- the subflow rtt is biased by mptcp; i.e. on the rx side it is the time
measured between an ack and the next data. If the connection is CPU
bounded, and the scheduler picks a different subflow in response to an
incoming ack, the rtt estimated by the acked subflow on the rx side could
be much higher than the actual delay. This is addressed by explicitly
filtering out "too high" rtt values (i.e. double the previous sample)
- the most subtle but very effective problem. When the link latency is
very low (i.e. I have 2 hosts b2b connected) the first rtt sample will
be much higher than the next ones (in my scenario 40K us vs 80 us, note
the missing 'K'), because the first sample includes all the process
scheduling and socket creation overhead. With the current algo I see:
mptcp_rcv_space_adjust() // time = ~40K us
mptcp_rcv_space_init() // msk rtt = ~40K us, ssk rtt = ~40K
mptcp_rcvbuf_grow()
mptcp_rcv_space_adjust() // for 40K us keeps increasing `copied`
...
mptcp_rcv_space_adjust() // time = ~40K us, copied is very high
// ssk rtt ~= 80us, msk rtt = ~80 us
mptcp_rcvbuf_grow() // set sk_rcvbuf to tcp_rmem[2], because
// `copied` accumulated data for much more than
// one rtt
the root cause of the problem is that the msk rtt is updated with a
period equal to the previous rtt sample, which in turn is too high at
connection start.
The solution is keeping the msk rtt up to date with the subflow ones,
updating it every rcv wnd. In theory it could be updated at every
incoming packet, but that would be useless because the subflows update
the rtt every rcv wnd, and touching the msk field too often could cause
performance regressions.
Side note: the msk rtt needs periodic 'reset'/'refresh' because
otherwise it could not deal with events like 'subflow with lowest rtt is
closed'. Such reset happens every <rcv wnd> * <number of subflows>
incoming bytes.
Please LMK if the above helps.
Cheers,
Paolo
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH v2 mptcp-next 2/7] mptcp: avoid unneeded subflow-level drops.
2025-11-04 21:51 [PATCH v2 mptcp-next 0/7] mptcp: autotune related improvement Paolo Abeni
2025-11-04 21:51 ` [PATCH v2 mptcp-next 1/7] trace: mptcp: add mptcp_rcvbuf_grow tracepoint Paolo Abeni
@ 2025-11-04 21:51 ` Paolo Abeni
2025-11-04 21:51 ` [PATCH v2 mptcp-next 3/7] mptcp: fix receive space timestamp initialization Paolo Abeni
` (5 subsequent siblings)
7 siblings, 0 replies; 11+ messages in thread
From: Paolo Abeni @ 2025-11-04 21:51 UTC (permalink / raw)
To: mptcp
The rcv window is shared among all the subflows. Currently, MPTCP syncs
the TCP-level rcv window with the MPTCP one at tcp_transmit_skb() time.
The above means that incoming data may sporadically observe an outdated
TCP-level rcv window and be wrongly dropped by TCP.
Address the issue checking for the edge condition before queuing the data
at TCP level, and eventually syncing the rcv window as needed.
Note that the issue is actually present from the very first MPTCP
implementation, but backports older than the blamed commit below will range
from impossible to useless.
Before:
nstat >/dev/null ;sleep 1; nstat -z TcpExtBeyondWindow
TcpExtBeyondWindow 14 0.0
After:
nstat >/dev/null ;sleep 1; nstat -z TcpExtBeyondWindow
TcpExtBeyondWindow 0 0.0
Fixes: fa3fe2b15031 ("mptcp: track window announced to peer")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
net/mptcp/options.c | 31 +++++++++++++++++++++++++++++++
net/mptcp/protocol.h | 1 +
2 files changed, 32 insertions(+)
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index cf531f2d815c..ad51dcf18984 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -1042,6 +1042,31 @@ static void __mptcp_snd_una_update(struct mptcp_sock *msk, u64 new_snd_una)
WRITE_ONCE(msk->snd_una, new_snd_una);
}
+static void rwin_update(struct mptcp_sock *msk, struct sock *ssk,
+ struct sk_buff *skb)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ struct tcp_sock *tp = tcp_sk(ssk);
+ u64 mptcp_rcv_wnd;
+
+ /* Avoid touching extra cachelines if TCP is going to accept this
+ * skb without filling the TCP-level window even with a possibly
+ * outdated mptcp-level rwin.
+ */
+ if (!skb->len || skb->len < tcp_receive_window(tp))
+ return;
+
+ mptcp_rcv_wnd = atomic64_read(&msk->rcv_wnd_sent);
+ if (!after64(mptcp_rcv_wnd, subflow->rcv_wnd_sent))
+ return;
+
+ /* Some other subflow grew the mptcp-level rwin since rcv_wup,
+ * resync.
+ */
+ tp->rcv_wnd += mptcp_rcv_wnd - subflow->rcv_wnd_sent;
+ subflow->rcv_wnd_sent = mptcp_rcv_wnd;
+}
+
static void ack_update_msk(struct mptcp_sock *msk,
struct sock *ssk,
struct mptcp_options_received *mp_opt)
@@ -1209,6 +1234,7 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
*/
if (mp_opt.use_ack)
ack_update_msk(msk, sk, &mp_opt);
+ rwin_update(msk, sk, skb);
/* Zero-data-length packets are dropped by the caller and not
* propagated to the MPTCP layer, so the skb extension does not
@@ -1295,6 +1321,10 @@ static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th)
if (rcv_wnd_new != rcv_wnd_old) {
raise_win:
+ /* The msk-level rcv wnd is after the tcp level one,
+ * sync the latter.
+ */
+ rcv_wnd_new = rcv_wnd_old;
win = rcv_wnd_old - ack_seq;
tp->rcv_wnd = min_t(u64, win, U32_MAX);
new_win = tp->rcv_wnd;
@@ -1318,6 +1348,7 @@ static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th)
update_wspace:
WRITE_ONCE(msk->old_wspace, tp->rcv_wnd);
+ subflow->rcv_wnd_sent = rcv_wnd_new;
}
__sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum)
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index d0881db16b12..f14eeb4fd884 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -513,6 +513,7 @@ struct mptcp_subflow_context {
u64 remote_key;
u64 idsn;
u64 map_seq;
+ u64 rcv_wnd_sent;
u32 snd_isn;
u32 token;
u32 rel_write_seq;
--
2.51.0
^ permalink raw reply related [flat|nested] 11+ messages in thread* [PATCH v2 mptcp-next 3/7] mptcp: fix receive space timestamp initialization.
2025-11-04 21:51 [PATCH v2 mptcp-next 0/7] mptcp: autotune related improvement Paolo Abeni
2025-11-04 21:51 ` [PATCH v2 mptcp-next 1/7] trace: mptcp: add mptcp_rcvbuf_grow tracepoint Paolo Abeni
2025-11-04 21:51 ` [PATCH v2 mptcp-next 2/7] mptcp: avoid unneeded subflow-level drops Paolo Abeni
@ 2025-11-04 21:51 ` Paolo Abeni
2025-11-04 21:51 ` [PATCH v2 mptcp-next 4/7] mptcp: consolidate rcv space init Paolo Abeni
` (4 subsequent siblings)
7 siblings, 0 replies; 11+ messages in thread
From: Paolo Abeni @ 2025-11-04 21:51 UTC (permalink / raw)
To: mptcp
MPTCP initializes the receive buffer stamp in mptcp_rcv_space_init(),
using the provided subflow stamp. Such helper is invoked in several
places; for passive sockets, space init happened at clone time.
In such a scenario, MPTCP ends up accessing the subflow stamp before
its initialization, leading to quite random timing for the first
receive buffer auto-tune event, as the timestamp for the newly created
subflow is not refreshed there.
Fix the issue by moving the stamp initialization out of the mentioned
helper to as soon as the data transfer starts, and always using a fresh
timestamp. This will also make the next patch cleaner.
Fixes: 013e3179dbd2 ("mptcp: fix rcv space initialization")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
v1 -> v2:
-- factor out only the tstamp change for better reviewability
---
net/mptcp/protocol.c | 7 ++++---
net/mptcp/protocol.h | 5 +++++
2 files changed, 9 insertions(+), 3 deletions(-)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 443406bc4a54..fd10565d9287 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -2072,8 +2072,8 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
msk->rcvq_space.copied += copied;
- mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC);
- time = tcp_stamp_us_delta(mstamp, msk->rcvq_space.time);
+ mstamp = mptcp_stamp();
+ time = tcp_stamp_us_delta(mstamp, READ_ONCE(msk->rcvq_space.time));
rtt_us = msk->rcvq_space.rtt_us;
if (rtt_us && time < (rtt_us >> 3))
@@ -3491,7 +3491,7 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk,
mptcp_copy_inaddrs(nsk, ssk);
__mptcp_propagate_sndbuf(nsk, ssk);
- mptcp_rcv_space_init(msk, ssk);
+ msk->rcvq_space.time = mptcp_stamp();
if (mp_opt->suboptions & OPTION_MPTCP_MPC_ACK)
__mptcp_subflow_fully_established(msk, subflow, mp_opt);
@@ -3706,6 +3706,7 @@ void mptcp_finish_connect(struct sock *ssk)
* accessing the field below
*/
WRITE_ONCE(msk->local_key, subflow->local_key);
+ WRITE_ONCE(msk->rcvq_space.time, mptcp_stamp());
mptcp_pm_new_connection(msk, ssk, 0);
}
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index f14eeb4fd884..49f211e427bf 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -913,6 +913,11 @@ static inline bool mptcp_is_fully_established(struct sock *sk)
READ_ONCE(mptcp_sk(sk)->fully_established);
}
+static inline u64 mptcp_stamp(void)
+{
+ return div_u64(tcp_clock_ns(), NSEC_PER_USEC);
+}
+
void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk);
void mptcp_data_ready(struct sock *sk, struct sock *ssk);
bool mptcp_finish_join(struct sock *sk);
--
2.51.0
^ permalink raw reply related [flat|nested] 11+ messages in thread* [PATCH v2 mptcp-next 4/7] mptcp: consolidate rcv space init
2025-11-04 21:51 [PATCH v2 mptcp-next 0/7] mptcp: autotune related improvement Paolo Abeni
` (2 preceding siblings ...)
2025-11-04 21:51 ` [PATCH v2 mptcp-next 3/7] mptcp: fix receive space timestamp initialization Paolo Abeni
@ 2025-11-04 21:51 ` Paolo Abeni
2025-11-04 21:51 ` [PATCH v2 mptcp-next 5/7] mptcp: better rcv space initialization Paolo Abeni
` (3 subsequent siblings)
7 siblings, 0 replies; 11+ messages in thread
From: Paolo Abeni @ 2025-11-04 21:51 UTC (permalink / raw)
To: mptcp
MPTCP uses several calls of the mptcp_rcv_space_init() helper
to initialize the receive space, with a catch-up call in
mptcp_rcv_space_adjust().
Drop all the other strictly not needed invocations and move constant
fields initialization at socket init/reset time.
This removes a bit of complexity and will simplify the following patches.
No functional changes intended.
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
v1 -> v2:
- split helper consolidation out of v1 patch
- additionally move 'copied' and 'rtt_us' initialization out of
mptcp_rcv_space_init()
---
net/mptcp/protocol.c | 34 +++++++++++++++++-----------------
net/mptcp/protocol.h | 1 -
net/mptcp/subflow.c | 2 --
3 files changed, 17 insertions(+), 20 deletions(-)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index fd10565d9287..07e1703688ea 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -2050,6 +2050,19 @@ static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg,
return copied;
}
+static void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
+{
+ const struct tcp_sock *tp = tcp_sk(ssk);
+
+ msk->rcvspace_init = 1;
+
+ /* initial rcv_space offering made to peer */
+ msk->rcvq_space.space = min_t(u32, tp->rcv_wnd,
+ TCP_INIT_CWND * tp->advmss);
+ if (msk->rcvq_space.space == 0)
+ msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT;
+}
+
/* receive buffer autotuning. See tcp_rcv_space_adjust for more information.
*
* Only difference: Use highest rtt estimate of the subflows in use.
@@ -2922,6 +2935,8 @@ static void __mptcp_init_sock(struct sock *sk)
msk->timer_ival = TCP_RTO_MIN;
msk->scaling_ratio = TCP_DEFAULT_SCALING_RATIO;
msk->backlog_len = 0;
+ msk->rcvq_space.copied = 0;
+ msk->rcvq_space.rtt_us = 0;
WRITE_ONCE(msk->first, NULL);
inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
@@ -3364,6 +3379,8 @@ static int mptcp_disconnect(struct sock *sk, int flags)
msk->bytes_sent = 0;
msk->bytes_retrans = 0;
msk->rcvspace_init = 0;
+ msk->rcvq_space.copied = 0;
+ msk->rcvq_space.rtt_us = 0;
/* for fallback's sake */
WRITE_ONCE(msk->ack_seq, 0);
@@ -3501,23 +3518,6 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk,
return nsk;
}
-void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
-{
- const struct tcp_sock *tp = tcp_sk(ssk);
-
- msk->rcvspace_init = 1;
- msk->rcvq_space.copied = 0;
- msk->rcvq_space.rtt_us = 0;
-
- msk->rcvq_space.time = tp->tcp_mstamp;
-
- /* initial rcv_space offering made to peer */
- msk->rcvq_space.space = min_t(u32, tp->rcv_wnd,
- TCP_INIT_CWND * tp->advmss);
- if (msk->rcvq_space.space == 0)
- msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT;
-}
-
static void mptcp_destroy(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 49f211e427bf..7c9e143c0fb5 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -918,7 +918,6 @@ static inline u64 mptcp_stamp(void)
return div_u64(tcp_clock_ns(), NSEC_PER_USEC);
}
-void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk);
void mptcp_data_ready(struct sock *sk, struct sock *ssk);
bool mptcp_finish_join(struct sock *sk);
bool mptcp_schedule_work(struct sock *sk);
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index ac8616e7521e..b64ab7649908 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -462,8 +462,6 @@ void __mptcp_sync_state(struct sock *sk, int state)
subflow = mptcp_subflow_ctx(ssk);
__mptcp_propagate_sndbuf(sk, ssk);
- if (!msk->rcvspace_init)
- mptcp_rcv_space_init(msk, ssk);
if (sk->sk_state == TCP_SYN_SENT) {
/* subflow->idsn is always available is TCP_SYN_SENT state,
--
2.51.0
^ permalink raw reply related [flat|nested] 11+ messages in thread* [PATCH v2 mptcp-next 5/7] mptcp: better rcv space initialization
2025-11-04 21:51 [PATCH v2 mptcp-next 0/7] mptcp: autotune related improvement Paolo Abeni
` (3 preceding siblings ...)
2025-11-04 21:51 ` [PATCH v2 mptcp-next 4/7] mptcp: consolidate rcv space init Paolo Abeni
@ 2025-11-04 21:51 ` Paolo Abeni
2025-11-04 21:51 ` [PATCH v2 mptcp-next 6/7] mptcp: better mptcp-level rtt estimator Paolo Abeni
` (2 subsequent siblings)
7 siblings, 0 replies; 11+ messages in thread
From: Paolo Abeni @ 2025-11-04 21:51 UTC (permalink / raw)
To: mptcp
Each additional subflow can overshoot the rcv space by a full rcv wnd,
as the TCP socket at creation time will announce such window even if
the MPTCP-level window is closed.
Underestimating the initial real rcv space can make the initial rcv
win increase overshoot significantly.
Keep track explicitly of the rcv space contribution by newly created
subflow, updating rcvq_space.space accordingly for every successfully
completed subflow handshake.
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
net/mptcp/protocol.c | 44 ++++++++++++++++++++++++++++++--------------
net/mptcp/protocol.h | 30 ++++++++++++++++++++++++++++++
net/mptcp/subflow.c | 3 +++
3 files changed, 63 insertions(+), 14 deletions(-)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 07e1703688ea..c85ad7ef29b0 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -923,6 +923,7 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk)
mptcp_sockopt_sync_locked(msk, ssk);
mptcp_stop_tout_timer(sk);
__mptcp_propagate_sndbuf(sk, ssk);
+ __mptcp_propagate_rcvspace(sk, ssk);
return true;
}
@@ -2050,17 +2051,21 @@ static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg,
return copied;
}
-static void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
+static void mptcp_rcv_space_init(struct mptcp_sock *msk)
{
- const struct tcp_sock *tp = tcp_sk(ssk);
+ struct sock *sk = (struct sock *)msk;
msk->rcvspace_init = 1;
- /* initial rcv_space offering made to peer */
- msk->rcvq_space.space = min_t(u32, tp->rcv_wnd,
- TCP_INIT_CWND * tp->advmss);
- if (msk->rcvq_space.space == 0)
+ mptcp_data_lock(sk);
+ __mptcp_sync_rcvspace(sk);
+
+ /* Paranoid check: at least one subflow pushed data to the msk. */
+ if (msk->rcvq_space.space == 0) {
+ DEBUG_NET_WARN_ON_ONCE(1);
msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT;
+ }
+ mptcp_data_unlock(sk);
}
/* receive buffer autotuning. See tcp_rcv_space_adjust for more information.
@@ -2081,7 +2086,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
return;
if (!msk->rcvspace_init)
- mptcp_rcv_space_init(msk, msk->first);
+ mptcp_rcv_space_init(msk);
msk->rcvq_space.copied += copied;
@@ -3507,6 +3512,7 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk,
*/
mptcp_copy_inaddrs(nsk, ssk);
__mptcp_propagate_sndbuf(nsk, ssk);
+ __mptcp_propagate_rcvspace(nsk, ssk);
msk->rcvq_space.time = mptcp_stamp();
@@ -3606,8 +3612,10 @@ static void mptcp_release_cb(struct sock *sk)
__mptcp_sync_state(sk, msk->pending_state);
if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags))
__mptcp_error_report(sk);
- if (__test_and_clear_bit(MPTCP_SYNC_SNDBUF, &msk->cb_flags))
+ if (__test_and_clear_bit(MPTCP_SYNC_SNDBUF, &msk->cb_flags)) {
__mptcp_sync_sndbuf(sk);
+ __mptcp_sync_rcvspace(sk);
+ }
}
}
@@ -3723,13 +3731,13 @@ bool mptcp_finish_join(struct sock *ssk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
- struct sock *parent = (void *)msk;
+ struct sock *sk = (void *)msk;
bool ret = true;
pr_debug("msk=%p, subflow=%p\n", msk, subflow);
/* mptcp socket already closing? */
- if (!mptcp_is_fully_established(parent)) {
+ if (!mptcp_is_fully_established(sk)) {
subflow->reset_reason = MPTCP_RST_EMPTCP;
return false;
}
@@ -3743,7 +3751,15 @@ bool mptcp_finish_join(struct sock *ssk)
}
mptcp_subflow_joined(msk, ssk);
spin_unlock_bh(&msk->fallback_lock);
- mptcp_propagate_sndbuf(parent, ssk);
+ mptcp_data_lock(sk);
+ if (!sock_owned_by_user(sk)) {
+ __mptcp_propagate_sndbuf(sk, ssk);
+ __mptcp_propagate_rcvspace(sk, ssk);
+ } else {
+ __mptcp_bl_rcvspace(sk, ssk);
+ __set_bit(MPTCP_SYNC_SNDBUF, &mptcp_sk(sk)->cb_flags);
+ }
+ mptcp_data_unlock(sk);
return true;
}
@@ -3755,8 +3771,8 @@ bool mptcp_finish_join(struct sock *ssk)
/* If we can't acquire msk socket lock here, let the release callback
* handle it
*/
- mptcp_data_lock(parent);
- if (!sock_owned_by_user(parent)) {
+ mptcp_data_lock(sk);
+ if (!sock_owned_by_user(sk)) {
ret = __mptcp_finish_join(msk, ssk);
if (ret) {
sock_hold(ssk);
@@ -3767,7 +3783,7 @@ bool mptcp_finish_join(struct sock *ssk)
list_add_tail(&subflow->node, &msk->join_list);
__set_bit(MPTCP_FLUSH_JOIN_LIST, &msk->cb_flags);
}
- mptcp_data_unlock(parent);
+ mptcp_data_unlock(sk);
if (!ret) {
err_prohibited:
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 7c9e143c0fb5..adc0851bad69 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -360,6 +360,7 @@ struct mptcp_sock {
struct list_head backlog_list; /* protected by the data lock */
u32 backlog_len;
+ u32 bl_space; /* rcvspace propagation via bl */
};
#define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock)
@@ -976,6 +977,35 @@ static inline void mptcp_write_space(struct sock *sk)
sk_stream_write_space(sk);
}
+static inline void __mptcp_sync_rcvspace(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ msk->rcvq_space.space += msk->bl_space;
+ msk->bl_space = 0;
+}
+
+static inline u32 mptcp_subflow_rcvspace(struct sock *ssk)
+{
+ struct tcp_sock *tp = tcp_sk(ssk);
+ int space;
+
+ space = min_t(u32, tp->rcv_wnd, TCP_INIT_CWND * tp->advmss);
+ if (space == 0)
+ space = TCP_INIT_CWND * TCP_MSS_DEFAULT;
+ return space;
+}
+
+static inline void __mptcp_propagate_rcvspace(struct sock *sk, struct sock *ssk)
+{
+ mptcp_sk(sk)->rcvq_space.space += mptcp_subflow_rcvspace(ssk);
+}
+
+static inline void __mptcp_bl_rcvspace(struct sock *sk, struct sock *ssk)
+{
+ mptcp_sk(sk)->bl_space += mptcp_subflow_rcvspace(ssk);
+}
+
static inline void __mptcp_sync_sndbuf(struct sock *sk)
{
struct mptcp_subflow_context *subflow;
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index b64ab7649908..33b88991cffb 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -462,6 +462,7 @@ void __mptcp_sync_state(struct sock *sk, int state)
subflow = mptcp_subflow_ctx(ssk);
__mptcp_propagate_sndbuf(sk, ssk);
+ __mptcp_sync_rcvspace(sk);
if (sk->sk_state == TCP_SYN_SENT) {
/* subflow->idsn is always available is TCP_SYN_SENT state,
@@ -516,8 +517,10 @@ static void mptcp_propagate_state(struct sock *sk, struct sock *ssk,
if (!sock_owned_by_user(sk)) {
__mptcp_sync_state(sk, ssk->sk_state);
+ __mptcp_propagate_rcvspace(sk, ssk);
} else {
msk->pending_state = ssk->sk_state;
+ __mptcp_bl_rcvspace(sk, ssk);
__set_bit(MPTCP_SYNC_STATE, &msk->cb_flags);
}
mptcp_data_unlock(sk);
--
2.51.0
^ permalink raw reply related [flat|nested] 11+ messages in thread

* [PATCH v2 mptcp-next 6/7] mptcp: better mptcp-level rtt estimator
2025-11-04 21:51 [PATCH v2 mptcp-next 0/7] mptcp: autotune related improvement Paolo Abeni
` (4 preceding siblings ...)
2025-11-04 21:51 ` [PATCH v2 mptcp-next 5/7] mptcp: better rcv space initialization Paolo Abeni
@ 2025-11-04 21:51 ` Paolo Abeni
2025-11-04 21:51 ` [PATCH v2 mptcp-next 7/7] mptcp: add receive queue awareness in tcp_rcv_space_adjust() Paolo Abeni
2025-11-05 11:27 ` [PATCH v2 mptcp-next 0/7] mptcp: autotune related improvement MPTCP CI
7 siblings, 0 replies; 11+ messages in thread
From: Paolo Abeni @ 2025-11-04 21:51 UTC (permalink / raw)
To: mptcp
On high speed links, the MPTCP-level receive buffer auto-tuning
happens with a frequency well above the TCP-level's one. That in
turn can cause excessive/unneeded receive buffer increase.
On such links, the initial rtt_us value is considerably higher
than the actual delay, but the current mptcp_rcv_space_adjust() logic
prevents msk->rcvq_space.rtt_us from decreasing.
Address the issue with a more accurate RTT estimation strategy: the
MPTCP-level RTT is set to the minimum of all the subflows, in a rcv-win
based interval, feeding data into the MPTCP-receive buffer.
Use some care to avoid updating msk and ssk level fields too often.
Fixes: a6b118febbab ("mptcp: add receive buffer auto-tuning")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
v1 -> v2:
- do not use explicit reset flags - do rcv win based decision instead
- discard 0 rtt_us samples from subflows
- discard samples on non empty rx queue
- discard "too high" samples, see the code comments WRT the whys
---
include/trace/events/mptcp.h | 2 +-
net/mptcp/protocol.c | 74 ++++++++++++++++++++++--------------
net/mptcp/protocol.h | 7 +++-
3 files changed, 53 insertions(+), 30 deletions(-)
diff --git a/include/trace/events/mptcp.h b/include/trace/events/mptcp.h
index 71fd6d33f48b..999133000cb8 100644
--- a/include/trace/events/mptcp.h
+++ b/include/trace/events/mptcp.h
@@ -212,7 +212,7 @@ TRACE_EVENT(mptcp_rcvbuf_grow,
__be32 *p32;
__entry->time = time;
- __entry->rtt_us = msk->rcvq_space.rtt_us >> 3;
+ __entry->rtt_us = msk->rcv_rtt_est.rtt_us >> 3;
__entry->copied = msk->rcvq_space.copied;
__entry->inq = mptcp_inq_hint(sk);
__entry->space = msk->rcvq_space.space;
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index c85ad7ef29b0..414cca078541 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -868,6 +868,42 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
return moved;
}
+static void mptcp_rcv_rtt_update(struct mptcp_sock *msk,
+ struct mptcp_subflow_context *subflow)
+{
+ const struct tcp_sock *tp = tcp_sk(subflow->tcp_sock);
+ u32 rtt_us = tp->rcv_rtt_est.rtt_us;
+ u8 sr = tp->scaling_ratio;
+
+ /* MPTCP can react to incoming acks pushing data on different subflows,
+ * causing apparent high RTT: ignore large samples; also do the update
+ * only on RTT changes
+ */
+ if (tp->rcv_rtt_est.seq == subflow->prev_rtt_seq ||
+ (subflow->prev_rtt_us && (rtt_us >> 1) > subflow->prev_rtt_us))
+ return;
+
+ /* Similar to plain TCP, only consider samples with empty RX queue. */
+ if (!rtt_us || mptcp_data_avail(msk))
+ return;
+
+ /* Refresh the RTT after a full win per subflow */
+ subflow->prev_rtt_us = rtt_us;
+ subflow->prev_rtt_seq = tp->rcv_rtt_est.seq;
+ if (after(subflow->map_seq, msk->rcv_rtt_est.seq)) {
+ msk->rcv_rtt_est.seq = subflow->map_seq +
+ tp->rcv_wnd * msk->pm.extra_subflows;
+ msk->rcv_rtt_est.rtt_us = rtt_us;
+ msk->scaling_ratio = sr;
+ return;
+ }
+
+ if (rtt_us < msk->rcv_rtt_est.rtt_us)
+ msk->rcv_rtt_est.rtt_us = rtt_us;
+ if (sr < msk->scaling_ratio)
+ msk->scaling_ratio = sr;
+}
+
void mptcp_data_ready(struct sock *sk, struct sock *ssk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
@@ -881,6 +917,7 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
return;
mptcp_data_lock(sk);
+ mptcp_rcv_rtt_update(msk, subflow);
if (!sock_owned_by_user(sk)) {
/* Wake-up the reader only for in-sequence data */
if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk))
@@ -2058,6 +2095,7 @@ static void mptcp_rcv_space_init(struct mptcp_sock *msk)
msk->rcvspace_init = 1;
mptcp_data_lock(sk);
+ msk->rcv_rtt_est.seq = atomic64_read(&msk->rcv_wnd_sent);
__mptcp_sync_rcvspace(sk);
/* Paranoid check: at least one subflow pushed data to the msk. */
@@ -2076,9 +2114,8 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
{
struct mptcp_subflow_context *subflow;
struct sock *sk = (struct sock *)msk;
- u8 scaling_ratio = U8_MAX;
- u32 time, advmss = 1;
- u64 rtt_us, mstamp;
+ u32 rtt_us, time;
+ u64 mstamp;
msk_owned_by_me(msk);
@@ -2093,29 +2130,8 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
mstamp = mptcp_stamp();
time = tcp_stamp_us_delta(mstamp, READ_ONCE(msk->rcvq_space.time));
- rtt_us = msk->rcvq_space.rtt_us;
- if (rtt_us && time < (rtt_us >> 3))
- return;
-
- rtt_us = 0;
- mptcp_for_each_subflow(msk, subflow) {
- const struct tcp_sock *tp;
- u64 sf_rtt_us;
- u32 sf_advmss;
-
- tp = tcp_sk(mptcp_subflow_tcp_sock(subflow));
-
- sf_rtt_us = READ_ONCE(tp->rcv_rtt_est.rtt_us);
- sf_advmss = READ_ONCE(tp->advmss);
-
- rtt_us = max(sf_rtt_us, rtt_us);
- advmss = max(sf_advmss, advmss);
- scaling_ratio = min(tp->scaling_ratio, scaling_ratio);
- }
-
- msk->rcvq_space.rtt_us = rtt_us;
- msk->scaling_ratio = scaling_ratio;
- if (time < (rtt_us >> 3) || rtt_us == 0)
+ rtt_us = READ_ONCE(msk->rcv_rtt_est.rtt_us);
+ if (rtt_us == U32_MAX || time < (rtt_us >> 3))
return;
if (msk->rcvq_space.copied <= msk->rcvq_space.space)
@@ -2941,7 +2957,8 @@ static void __mptcp_init_sock(struct sock *sk)
msk->scaling_ratio = TCP_DEFAULT_SCALING_RATIO;
msk->backlog_len = 0;
msk->rcvq_space.copied = 0;
- msk->rcvq_space.rtt_us = 0;
+ msk->rcv_rtt_est.rtt_us = U32_MAX;
+ msk->scaling_ratio = U8_MAX;
WRITE_ONCE(msk->first, NULL);
inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
@@ -3385,7 +3402,8 @@ static int mptcp_disconnect(struct sock *sk, int flags)
msk->bytes_retrans = 0;
msk->rcvspace_init = 0;
msk->rcvq_space.copied = 0;
- msk->rcvq_space.rtt_us = 0;
+ msk->scaling_ratio = U8_MAX;
+ msk->rcv_rtt_est.rtt_us = U32_MAX;
/* for fallback's sake */
WRITE_ONCE(msk->ack_seq, 0);
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index adc0851bad69..051f21b06d33 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -340,11 +340,14 @@ struct mptcp_sock {
*/
struct mptcp_pm_data pm;
struct mptcp_sched_ops *sched;
+ struct {
+ u32 rtt_us; /* Minimum RTT of subflows */
+ u64 seq; /* Refresh RTT after this seq */
+ } rcv_rtt_est;
struct {
int space; /* bytes copied in last measurement window */
int copied; /* bytes copied in this measurement window */
u64 time; /* start time of measurement window */
- u64 rtt_us; /* last maximum rtt of subflows */
} rcvq_space;
u8 scaling_ratio;
bool allow_subflows;
@@ -523,6 +526,8 @@ struct mptcp_subflow_context {
u32 map_data_len;
__wsum map_data_csum;
u32 map_csum_len;
+ u32 prev_rtt_us;
+ u32 prev_rtt_seq;
u32 request_mptcp : 1, /* send MP_CAPABLE */
request_join : 1, /* send MP_JOIN */
request_bkup : 1,
--
2.51.0
^ permalink raw reply related [flat|nested] 11+ messages in thread

* [PATCH v2 mptcp-next 7/7] mptcp: add receive queue awareness in tcp_rcv_space_adjust()
2025-11-04 21:51 [PATCH v2 mptcp-next 0/7] mptcp: autotune related improvement Paolo Abeni
` (5 preceding siblings ...)
2025-11-04 21:51 ` [PATCH v2 mptcp-next 6/7] mptcp: better mptcp-level rtt estimator Paolo Abeni
@ 2025-11-04 21:51 ` Paolo Abeni
2025-11-05 11:27 ` [PATCH v2 mptcp-next 0/7] mptcp: autotune related improvement MPTCP CI
7 siblings, 0 replies; 11+ messages in thread
From: Paolo Abeni @ 2025-11-04 21:51 UTC (permalink / raw)
To: mptcp
This is the mptcp counter-part of commit ea33537d8292 ("tcp: add receive
queue awareness in tcp_rcv_space_adjust()").
Prior to this commit:
ESTAB 33165568 0 192.168.255.2:5201 192.168.255.1:53380 \
skmem:(r33076416,rb33554432,t0,tb91136,f448,w0,o0,bl0,d0)
After:
ESTAB 3279168 0 192.168.255.2:5201 192.168.255.1:53042 \
skmem:(r3190912,rb3719956,t0,tb91136,f1536,w0,o0,bl0,d0)
(same tput)
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
net/mptcp/protocol.c | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 414cca078541..4b1944e40d91 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -2134,11 +2134,13 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
if (rtt_us == U32_MAX || time < (rtt_us >> 3))
return;
- if (msk->rcvq_space.copied <= msk->rcvq_space.space)
+ copied = msk->rcvq_space.copied;
+ copied -= mptcp_inq_hint(sk);
+ if (copied <= msk->rcvq_space.space)
goto new_measure;
trace_mptcp_rcvbuf_grow(sk, time);
- if (mptcp_rcvbuf_grow(sk, msk->rcvq_space.copied)) {
+ if (mptcp_rcvbuf_grow(sk, copied)) {
/* Make subflows follow along. If we do not do this, we
* get drops at subflow level if skbs can't be moved to
* the mptcp rx queue fast enough (announced rcv_win can
@@ -2152,7 +2154,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
slow = lock_sock_fast(ssk);
/* subflows can be added before tcp_init_transfer() */
if (tcp_sk(ssk)->rcvq_space.space)
- tcp_rcvbuf_grow(ssk, msk->rcvq_space.copied);
+ tcp_rcvbuf_grow(ssk, copied);
unlock_sock_fast(ssk, slow);
}
}
--
2.51.0
^ permalink raw reply related [flat|nested] 11+ messages in thread

* Re: [PATCH v2 mptcp-next 0/7] mptcp: autotune related improvement
2025-11-04 21:51 [PATCH v2 mptcp-next 0/7] mptcp: autotune related improvement Paolo Abeni
` (6 preceding siblings ...)
2025-11-04 21:51 ` [PATCH v2 mptcp-next 7/7] mptcp: add receive queue awareness in tcp_rcv_space_adjust() Paolo Abeni
@ 2025-11-05 11:27 ` MPTCP CI
7 siblings, 0 replies; 11+ messages in thread
From: MPTCP CI @ 2025-11-05 11:27 UTC (permalink / raw)
To: Paolo Abeni; +Cc: mptcp
Hi Paolo,
Thank you for your modifications, that's great!
Our CI did some validations and here is its report:
- KVM Validation: normal (except selftest_mptcp_join): Success! ✅
- KVM Validation: normal (only selftest_mptcp_join): Success! ✅
- KVM Validation: debug (except selftest_mptcp_join): Unstable: 1 failed test(s): selftest_mptcp_connect_mmap 🔴
- KVM Validation: debug (only selftest_mptcp_join): Success! ✅
- KVM Validation: btf-normal (only bpftest_all): Success! ✅
- KVM Validation: btf-debug (only bpftest_all): Success! ✅
- Task: https://github.com/multipath-tcp/mptcp_net-next/actions/runs/19098415180
Initiator: Patchew Applier
Commits: https://github.com/multipath-tcp/mptcp_net-next/commits/45957e0ea29b
Patchwork: https://patchwork.kernel.org/project/mptcp/list/?series=1019624
If there are some issues, you can reproduce them using the same environment as
the one used by the CI thanks to a docker image, e.g.:
$ cd [kernel source code]
$ docker run -v "${PWD}:${PWD}:rw" -w "${PWD}" --privileged --rm -it \
--pull always mptcp/mptcp-upstream-virtme-docker:latest \
auto-normal
For more details:
https://github.com/multipath-tcp/mptcp-upstream-virtme-docker
Please note that despite all the efforts that have been already done to have a
stable tests suite when executed on a public CI like here, it is possible some
reported issues are not due to your modifications. Still, do not hesitate to
help us improve that ;-)
Cheers,
MPTCP GH Action bot
Bot operated by Matthieu Baerts (NGI0 Core)
^ permalink raw reply [flat|nested] 11+ messages in thread