From: Geliang Tang <geliang@kernel.org>
To: Paolo Abeni <pabeni@redhat.com>, mptcp@lists.linux.dev
Subject: Re: [PATCH v5 mptcp-next 09/10] mptcp: introduce mptcp-level backlog
Date: Wed, 08 Oct 2025 11:09:29 +0800 [thread overview]
Message-ID: <8d1a5a81a52c26ea19d6e10e9137b22875b58d7b.camel@kernel.org> (raw)
In-Reply-To: <16ead1024784cf5913a9c9ef69d7d15cb73512a3.1759737859.git.pabeni@redhat.com>
Hi Paolo,
On Mon, 2025-10-06 at 10:12 +0200, Paolo Abeni wrote:
> We are soon using it for incoming data processing.
> MPTCP can't leverage the sk_backlog, as the latter is processed
> before the release callback, and such callback for MPTCP releases
> and re-acquire the socket spinlock, breaking the sk_backlog
> processing
> assumption.
>
> Add a skb backlog list inside the mptcp sock struct, and implement
> basic helper to transfer packet to and purge such list.
>
> Packets in the backlog are not memory accounted, but still use the
> incoming subflow receive memory, to allow back-pressure.
>
> No packet is currently added to the backlog, so no functional changes
> intended here.
>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> --
> v4 -> v5:
> - split out of the next path, to make the latter smaller
> - set a custom destructor for skbs in the backlog, this avoids
> duplicate code, and fixes a few places where the needed ssk cleanup
> was not performed.
> - factor out the backlog purge in a new helper,
> use spinlock protection, clear the backlog list and zero the
> backlog len
> - explicitly init the backlog_len at mptcp_init_sock() time
> ---
> net/mptcp/protocol.c | 70 +++++++++++++++++++++++++++++++++++++++++-
> --
> net/mptcp/protocol.h | 4 +++
> 2 files changed, 70 insertions(+), 4 deletions(-)
>
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 05ee6bd26b7fa..2d5d3da67d1ac 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -337,6 +337,11 @@ static void mptcp_data_queue_ofo(struct
> mptcp_sock *msk, struct sk_buff *skb)
> mptcp_rcvbuf_grow(sk);
> }
>
> +static void mptcp_bl_free(struct sk_buff *skb)
> +{
> + atomic_sub(skb->truesize, &skb->sk->sk_rmem_alloc);
> +}
> +
> static int mptcp_init_skb(struct sock *ssk,
> struct sk_buff *skb, int offset, int copy_len)
> {
> @@ -360,7 +365,7 @@ static int mptcp_init_skb(struct sock *ssk,
> skb_dst_drop(skb);
>
> /* "borrow" the fwd memory from the subflow, instead of reclaiming
> it */
> - skb->destructor = NULL;
> + skb->destructor = mptcp_bl_free;
> borrowed = ssk->sk_forward_alloc - sk_unused_reserved_mem(ssk);
> borrowed &= ~(PAGE_SIZE - 1);
> sk_forward_alloc_add(ssk, skb->truesize - borrowed);
> @@ -373,6 +378,13 @@ static bool __mptcp_move_skb(struct sock *sk,
> struct sk_buff *skb)
> struct mptcp_sock *msk = mptcp_sk(sk);
> struct sk_buff *tail;
>
> + /* Avoid the indirect call overhead, we know destructor is
> + * mptcp_bl_free at this point.
> + */
> + atomic_sub(skb->truesize, &skb->sk->sk_rmem_alloc);
> + skb->sk = NULL;
> + skb->destructor = NULL;
> +
> /* try to fetch required memory from subflow */
> if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
> MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED);
> @@ -654,6 +666,35 @@ static void mptcp_dss_corruption(struct
> mptcp_sock *msk, struct sock *ssk)
> }
> }
>
> +static void __mptcp_add_backlog(struct sock *sk, struct sk_buff
> *skb)
> +{
> + struct mptcp_sock *msk = mptcp_sk(sk);
> + struct sk_buff *tail = NULL;
> + bool fragstolen;
> + int delta;
> +
> + if (unlikely(sk->sk_state == TCP_CLOSE)) {
> + kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
> + return;
> + }
> +
> + /* Try to coalesce with the last skb in our backlog */
> + if (!list_empty(&msk->backlog_list))
> + tail = list_last_entry(&msk->backlog_list, struct sk_buff, list);
> +
> + if (tail && MPTCP_SKB_CB(skb)->map_seq == MPTCP_SKB_CB(tail)-
> >end_seq &&
> + skb->sk == tail->sk &&
> + __mptcp_try_coalesce(sk, tail, skb, &fragstolen, &delta)) {
> + skb->truesize -= delta;
> + kfree_skb_partial(skb, fragstolen);
> + WRITE_ONCE(msk->backlog_len, msk->backlog_len + delta);
> + return;
> + }
> +
> + list_add_tail(&skb->list, &msk->backlog_list);
> + WRITE_ONCE(msk->backlog_len, msk->backlog_len + skb->truesize);
> +}
> +
> static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
> struct sock *ssk)
> {
> @@ -701,10 +742,12 @@ static bool
> __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
> int bmem;
>
> bmem = mptcp_init_skb(ssk, skb, offset, len);
> - skb->sk = NULL;
> sk_forward_alloc_add(sk, bmem);
> - atomic_sub(skb->truesize, &ssk->sk_rmem_alloc);
> - ret = __mptcp_move_skb(sk, skb) || ret;
> +
> + if (true)
nit: How about adding an own_msk parameter to
__mptcp_move_skbs_from_subflow() in this patch? That would allow us to
avoid the 'if (true)' here by using 'if (own_msk)' instead.
Thanks,
-Geliang
> + ret |= __mptcp_move_skb(sk, skb);
> + else
> + __mptcp_add_backlog(sk, skb);
> seq += len;
>
> if (unlikely(map_remaining < len)) {
> @@ -2753,12 +2796,28 @@ static void mptcp_mp_fail_no_response(struct
> mptcp_sock *msk)
> unlock_sock_fast(ssk, slow);
> }
>
> +static void mptcp_backlog_purge(struct sock *sk)
> +{
> + struct mptcp_sock *msk = mptcp_sk(sk);
> + struct sk_buff *tmp, *skb;
> + LIST_HEAD(backlog);
> +
> + mptcp_data_lock(sk);
> + list_splice_init(&msk->backlog_list, &backlog);
> + msk->backlog_len = 0;
> + mptcp_data_unlock(sk);
> +
> + list_for_each_entry_safe(skb, tmp, &backlog, list)
> + kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
> +}
> +
> static void mptcp_do_fastclose(struct sock *sk)
> {
> struct mptcp_subflow_context *subflow, *tmp;
> struct mptcp_sock *msk = mptcp_sk(sk);
>
> mptcp_set_state(sk, TCP_CLOSE);
> + mptcp_backlog_purge(sk);
> mptcp_for_each_subflow_safe(msk, subflow, tmp)
> __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow),
> subflow, MPTCP_CF_FASTCLOSE);
> @@ -2816,11 +2875,13 @@ static void __mptcp_init_sock(struct sock
> *sk)
> INIT_LIST_HEAD(&msk->conn_list);
> INIT_LIST_HEAD(&msk->join_list);
> INIT_LIST_HEAD(&msk->rtx_queue);
> + INIT_LIST_HEAD(&msk->backlog_list);
> INIT_WORK(&msk->work, mptcp_worker);
> msk->out_of_order_queue = RB_ROOT;
> msk->first_pending = NULL;
> msk->timer_ival = TCP_RTO_MIN;
> msk->scaling_ratio = TCP_DEFAULT_SCALING_RATIO;
> + msk->backlog_len = 0;
>
> WRITE_ONCE(msk->first, NULL);
> inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
> @@ -3197,6 +3258,7 @@ static void mptcp_destroy_common(struct
> mptcp_sock *msk, unsigned int flags)
> struct sock *sk = (struct sock *)msk;
>
> __mptcp_clear_xmit(sk);
> + mptcp_backlog_purge(sk);
>
> /* join list will be eventually flushed (with rst) at sock lock
> release time */
> mptcp_for_each_subflow_safe(msk, subflow, tmp)
> diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> index 46d8432c72ee7..a21c4955f4cfb 100644
> --- a/net/mptcp/protocol.h
> +++ b/net/mptcp/protocol.h
> @@ -358,6 +358,9 @@ struct mptcp_sock {
> * allow_infinite_fallback and
> * allow_join
> */
> +
> + struct list_head backlog_list; /*protected by the data lock */
> + u32 backlog_len;
> };
>
> #define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock)
> @@ -408,6 +411,7 @@ static inline int mptcp_space_from_win(const
> struct sock *sk, int win)
> static inline int __mptcp_space(const struct sock *sk)
> {
> return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) -
> + READ_ONCE(mptcp_sk(sk)->backlog_len) -
> sk_rmem_alloc_get(sk));
> }
>
next prev parent reply other threads:[~2025-10-08 3:09 UTC|newest]
Thread overview: 33+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-10-06 8:11 [PATCH v5 mptcp-next 00/10] mptcp: introduce backlog processing Paolo Abeni
2025-10-06 8:12 ` [PATCH v5 mptcp-next 01/10] mptcp: borrow forward memory from subflow Paolo Abeni
2025-10-06 8:12 ` [PATCH v5 mptcp-next 02/10] mptcp: cleanup fallback data fin reception Paolo Abeni
2025-10-06 8:12 ` [PATCH v5 mptcp-next 03/10] mptcp: cleanup fallback dummy mapping generation Paolo Abeni
2025-10-06 8:12 ` [PATCH v5 mptcp-next 04/10] mptcp: fix MSG_PEEK stream corruption Paolo Abeni
2025-10-06 8:12 ` [PATCH v5 mptcp-next 05/10] mptcp: ensure the kernel PM does not take action too late Paolo Abeni
2025-10-06 8:12 ` [PATCH v5 mptcp-next 06/10] mptcp: do not miss early first subflow close event notification Paolo Abeni
2025-10-06 8:12 ` [PATCH v5 mptcp-next 07/10] mptcp: make mptcp_destroy_common() static Paolo Abeni
2025-10-06 8:12 ` [PATCH v5 mptcp-next 08/10] mptcp: drop the __mptcp_data_ready() helper Paolo Abeni
2025-10-06 8:12 ` [PATCH v5 mptcp-next 09/10] mptcp: introduce mptcp-level backlog Paolo Abeni
2025-10-08 3:09 ` Geliang Tang [this message]
2025-10-20 19:45 ` Mat Martineau
2025-10-06 8:12 ` [PATCH v5 mptcp-next 10/10] mptcp: leverage the backlog for RX packet processing Paolo Abeni
2025-10-20 23:32 ` Mat Martineau
2025-10-21 17:21 ` Paolo Abeni
2025-10-21 23:53 ` Mat Martineau
2025-10-06 17:07 ` [PATCH v5 mptcp-next 00/10] mptcp: introduce backlog processing Matthieu Baerts
2025-10-08 3:07 ` Geliang Tang
2025-10-08 7:30 ` Paolo Abeni
2025-10-09 6:54 ` Geliang Tang
2025-10-09 7:52 ` Paolo Abeni
2025-10-09 9:02 ` Geliang Tang
2025-10-09 10:23 ` Paolo Abeni
2025-10-09 13:58 ` Paolo Abeni
2025-10-10 8:21 ` Paolo Abeni
2025-10-10 12:22 ` Geliang Tang
2025-10-13 9:07 ` Geliang Tang
2025-10-13 13:29 ` Paolo Abeni
2025-10-13 17:07 ` Paolo Abeni
2025-10-15 9:00 ` Paolo Abeni
2025-10-17 6:38 ` Geliang Tang
2025-10-18 0:16 ` Mat Martineau
2025-10-06 17:43 ` MPTCP CI
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=8d1a5a81a52c26ea19d6e10e9137b22875b58d7b.camel@kernel.org \
--to=geliang@kernel.org \
--cc=mptcp@lists.linux.dev \
--cc=pabeni@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox