From: Mat Martineau <martineau@kernel.org>
To: Paolo Abeni <pabeni@redhat.com>
Cc: mptcp@lists.linux.dev, geliang@kernel.org
Subject: Re: [PATCH RESENT v7 mptcp-next 2/4] mptcp: borrow forward memory from subflow
Date: Fri, 31 Oct 2025 15:44:13 -0700 (PDT) [thread overview]
Message-ID: <77fb4c39-fb48-1f88-614d-0cec18e9c687@kernel.org> (raw)
In-Reply-To: <501fc84688c32f846e262a9fd44683afa73ea509.1761576117.git.pabeni@redhat.com>
On Mon, 27 Oct 2025, Paolo Abeni wrote:
> In the MPTCP receive path, we release the subflow allocated fwd
> memory just to allocate it again shortly after for the msk.
>
> That could increases the failures chances, especially when we will
> add backlog processing, with other actions could consume the just
> released memory before the msk socket has a chance to do the
> rcv allocation.
>
> Replace the skb_orphan() call with an open-coded variant that
> explicitly borrows, the fwd memory from the subflow socket instead
> of releasing it.
>
> The borrowed memory does not have PAGE_SIZE granularity; rounding to
> the page size will make the fwd allocated memory higher than what is
> strictly required and could make the incoming subflow fwd mem
> consistently negative. Instead, keep track of the accumulated frag and
> borrow the full page at subflow close time.
>
> This allow removing the last drop in the TCP to MPTCP transition and
> the associated, now unused, MIB.
>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
> net/mptcp/fastopen.c | 4 +++-
> net/mptcp/mib.c | 1 -
> net/mptcp/mib.h | 1 -
> net/mptcp/protocol.c | 23 +++++++++++++++--------
> net/mptcp/protocol.h | 23 +++++++++++++++++++++++
> 5 files changed, 41 insertions(+), 11 deletions(-)
>
> diff --git a/net/mptcp/fastopen.c b/net/mptcp/fastopen.c
> index b9e451197902..82ec15bcfd7f 100644
> --- a/net/mptcp/fastopen.c
> +++ b/net/mptcp/fastopen.c
> @@ -32,7 +32,8 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf
> /* dequeue the skb from sk receive queue */
> __skb_unlink(skb, &ssk->sk_receive_queue);
> skb_ext_reset(skb);
> - skb_orphan(skb);
> +
> + mptcp_subflow_lend_fwdmem(subflow, skb);
>
> /* We copy the fastopen data, but that don't belong to the mptcp sequence
> * space, need to offset it in the subflow sequence, see mptcp_subflow_get_map_offset()
> @@ -50,6 +51,7 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf
> mptcp_data_lock(sk);
> DEBUG_NET_WARN_ON_ONCE(sock_owned_by_user_nocheck(sk));
>
> + mptcp_borrow_fwdmem(sk, skb);
> skb_set_owner_r(skb, sk);
> __skb_queue_tail(&sk->sk_receive_queue, skb);
> mptcp_sk(sk)->bytes_received += skb->len;
> diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c
> index 171643815076..f23fda0c55a7 100644
> --- a/net/mptcp/mib.c
> +++ b/net/mptcp/mib.c
> @@ -71,7 +71,6 @@ static const struct snmp_mib mptcp_snmp_list[] = {
> SNMP_MIB_ITEM("MPFastcloseRx", MPTCP_MIB_MPFASTCLOSERX),
> SNMP_MIB_ITEM("MPRstTx", MPTCP_MIB_MPRSTTX),
> SNMP_MIB_ITEM("MPRstRx", MPTCP_MIB_MPRSTRX),
> - SNMP_MIB_ITEM("RcvPruned", MPTCP_MIB_RCVPRUNED),
> SNMP_MIB_ITEM("SubflowStale", MPTCP_MIB_SUBFLOWSTALE),
> SNMP_MIB_ITEM("SubflowRecover", MPTCP_MIB_SUBFLOWRECOVER),
> SNMP_MIB_ITEM("SndWndShared", MPTCP_MIB_SNDWNDSHARED),
> diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h
> index a1d3e9369fbb..812218b5ed2b 100644
> --- a/net/mptcp/mib.h
> +++ b/net/mptcp/mib.h
> @@ -70,7 +70,6 @@ enum linux_mptcp_mib_field {
> MPTCP_MIB_MPFASTCLOSERX, /* Received a MP_FASTCLOSE */
> MPTCP_MIB_MPRSTTX, /* Transmit a MP_RST */
> MPTCP_MIB_MPRSTRX, /* Received a MP_RST */
> - MPTCP_MIB_RCVPRUNED, /* Incoming packet dropped due to memory limit */
> MPTCP_MIB_SUBFLOWSTALE, /* Subflows entered 'stale' status */
> MPTCP_MIB_SUBFLOWRECOVER, /* Subflows returned to active status after being stale */
> MPTCP_MIB_SNDWNDSHARED, /* Subflow snd wnd is overridden by msk's one */
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 74be417be980..f6d96cb01e00 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -349,7 +349,7 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
> static void mptcp_init_skb(struct sock *ssk, struct sk_buff *skb, int offset,
> int copy_len)
> {
> - const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
> + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
> bool has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;
>
> /* the skb map_seq accounts for the skb offset:
> @@ -374,11 +374,7 @@ static bool __mptcp_move_skb(struct sock *sk, struct sk_buff *skb)
> struct mptcp_sock *msk = mptcp_sk(sk);
> struct sk_buff *tail;
>
> - /* try to fetch required memory from subflow */
> - if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
> - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED);
> - goto drop;
> - }
> + mptcp_borrow_fwdmem(sk, skb);
>
> if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) {
> /* in sequence */
> @@ -400,7 +396,6 @@ static bool __mptcp_move_skb(struct sock *sk, struct sk_buff *skb)
> * will retransmit as needed, if needed.
> */
> MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
> -drop:
> mptcp_drop(sk, skb);
> return false;
> }
> @@ -701,7 +696,7 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
> size_t len = skb->len - offset;
>
> mptcp_init_skb(ssk, skb, offset, len);
> - skb_orphan(skb);
> + mptcp_subflow_lend_fwdmem(subflow, skb);
> ret = __mptcp_move_skb(sk, skb) || ret;
> seq += len;
>
> @@ -2428,6 +2423,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
> {
> struct mptcp_sock *msk = mptcp_sk(sk);
> bool dispose_it, need_push = false;
Hi Paolo -
> + int fwd_remaning;
One spelling fix, "fwd_remaining".
>
> /* Do not pass RX data to the msk, even if the subflow socket is not
> * going to be freed (i.e. even for the first subflow on graceful
> @@ -2436,6 +2432,17 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
> lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
> subflow->closing = 1;
>
> + /* Borrow the fwd allocated page left-over; fwd memory for the subflow
> + * could be negative at this point, but will be reach zero soon - when
> + * the data allocated using such fragment will be freed.
> + */
> + if (subflow->lent_mem_frag) {
> + fwd_remaning = PAGE_SIZE - subflow->lent_mem_frag;
> + sk_forward_alloc_add(sk, fwd_remaning);
> + sk_forward_alloc_add(ssk, -fwd_remaning);
> + subflow->lent_mem_frag = 0;
> + }
> +
> /* If the first subflow moved to a close state before accept, e.g. due
> * to an incoming reset or listener shutdown, the subflow socket is
> * already deleted by inet_child_forget() and the mptcp socket can't
> diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> index 9f7e5f2c964d..80d520888235 100644
> --- a/net/mptcp/protocol.h
> +++ b/net/mptcp/protocol.h
> @@ -547,6 +547,7 @@ struct mptcp_subflow_context {
> bool scheduled;
> bool pm_listener; /* a listener managed by the kernel PM? */
> bool fully_established; /* path validated */
> + u32 lent_mem_frag;
> u32 remote_nonce;
> u64 thmac;
> u32 local_nonce;
> @@ -646,6 +647,28 @@ mptcp_send_active_reset_reason(struct sock *sk)
> tcp_send_active_reset(sk, GFP_ATOMIC, reason);
> }
>
> +static inline void mptcp_borrow_fwdmem(struct sock *sk, struct sk_buff *skb)
> +{
> + struct sock *ssk = skb->sk;
> +
> + /* The subflow just lend the skb fwd memory, and we know that the skb
> + * is only accounted on the incoming subflow rcvbuf.
> + */
> + skb->sk = NULL;
Looks like the intent is to always pair mptcp_subflow_lend_fwd() with
mptcp_borrow_fwdmem() (in that order).
Given that skb->sk and skb->destructor are usually cleared together,
should mptcp_borrow_fwdmem() have a WARN_ON_ONCE(skb->destructor)? Or is
that excessive paranoia?
- Mat
> + sk_forward_alloc_add(sk, skb->truesize);
> + atomic_sub(skb->truesize, &ssk->sk_rmem_alloc);
> +}
> +
> +static inline void
> +mptcp_subflow_lend_fwdmem(struct mptcp_subflow_context *subflow,
> + struct sk_buff *skb)
> +{
> + int frag = (subflow->lent_mem_frag + skb->truesize) & (PAGE_SIZE - 1);
> +
> + skb->destructor = NULL;
> + subflow->lent_mem_frag = frag;
> +}
> +
> static inline u64
> mptcp_subflow_get_map_offset(const struct mptcp_subflow_context *subflow)
> {
> --
> 2.51.0
>
>
next prev parent reply other threads:[~2025-10-31 22:44 UTC|newest]
Thread overview: 11+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-10-27 14:57 [PATCH RESENT v7 mptcp-next 0/4] mptcp: introduce backlog processing Paolo Abeni
2025-10-27 14:57 ` [PATCH RESENT v7 mptcp-next 1/4] mptcp: handle first subflow closing consistently Paolo Abeni
2025-10-27 14:58 ` [PATCH RESENT v7 mptcp-next 2/4] mptcp: borrow forward memory from subflow Paolo Abeni
2025-10-31 22:44 ` Mat Martineau [this message]
2025-11-03 16:26 ` Paolo Abeni
2025-10-27 14:58 ` [PATCH RESENT v7 mptcp-next 3/4] mptcp: introduce mptcp-level backlog Paolo Abeni
2025-10-27 14:58 ` [PATCH RESENT v7 mptcp-next 4/4] mptcp: leverage the backlog for RX packet processing Paolo Abeni
2025-11-01 0:04 ` Mat Martineau
2025-11-03 16:23 ` Paolo Abeni
2025-11-04 22:32 ` Mat Martineau
2025-10-27 16:13 ` [PATCH RESENT v7 mptcp-next 0/4] mptcp: introduce backlog processing MPTCP CI
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=77fb4c39-fb48-1f88-614d-0cec18e9c687@kernel.org \
--to=martineau@kernel.org \
--cc=geliang@kernel.org \
--cc=mptcp@lists.linux.dev \
--cc=pabeni@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox