All of lore.kernel.org
 help / color / mirror / Atom feed
From: gang.yan@linux.dev
To: "Geliang Tang" <geliang@kernel.org>, mptcp@lists.linux.dev
Cc: "Geliang Tang" <tanggeliang@kylinos.cn>, "Gang Yan" <yangang@kylinos.cn>
Subject: Re: [RFC mptcp-next v20 05/15] mptcp: implement tls_mptcp_ops
Date: Wed, 27 May 2026 06:52:29 +0000	[thread overview]
Message-ID: <836e21cb106dd2288b48132e736dff6efe2d0178@linux.dev> (raw)
In-Reply-To: <6557d95ab11416b3e798781cf95811bd6dd60d9e.1779788090.git.tanggeliang@kylinos.cn>

May 26, 2026 at 5:46 PM, "Geliang Tang" <geliang@kernel.org mailto:geliang@kernel.org?to=%22Geliang%20Tang%22%20%3Cgeliang%40kernel.org%3E > wrote:


> 
> From: Geliang Tang <tanggeliang@kylinos.cn>
> 
> This patch implements the MPTCP-specific struct tls_prot_ops, named
> 'tls_mptcp_ops'.
> 
> Passing an MPTCP socket to tcp_sock_rate_check_app_limited() can
> trigger a crash. Here, an MPTCP version of check_app_limited() is
> implemented, which calls tcp_sock_rate_check_app_limited() for each
> subflow.
> 
> When MPTCP implements lock_is_held interface, it not only checks
> sock_owned_by_user_nocheck(sk) as TCP does, but also needs to check
> whether the MPTCP data lock is held. This is required because TLS
> may call lock_is_held from softirq context with bh_lock_sock held.
> Checking both conditions ensures TLS always defers to workqueue when
> the MPTCP data lock is held, avoiding deadlock.
> 
> Co-developed-by: Gang Yan <yangang@kylinos.cn>
> Signed-off-by: Gang Yan <yangang@kylinos.cn>
> Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
> ---
>  include/net/mptcp.h | 2 +
>  include/net/tcp.h | 1 +
>  net/ipv4/tcp.c | 9 +++-
>  net/mptcp/protocol.c | 121 +++++++++++++++++++++++++++++++++++++++++--
>  net/mptcp/protocol.h | 1 +
>  net/tls/tls_main.c | 13 +++++
>  6 files changed, 141 insertions(+), 6 deletions(-)
> 
> diff --git a/include/net/mptcp.h b/include/net/mptcp.h
> index 4cf59e83c1c5..02564eceeb7e 100644
> --- a/include/net/mptcp.h
> +++ b/include/net/mptcp.h
> @@ -132,6 +132,8 @@ struct mptcp_pm_ops {
>  void (*release)(struct mptcp_sock *msk);
>  } ____cacheline_aligned_in_smp;
>  
> +extern struct tls_prot_ops tls_mptcp_ops;
> +
>  #ifdef CONFIG_MPTCP
>  void mptcp_init(void);
>  
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index f063eccbbba3..1c8201f69ef1 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -849,6 +849,7 @@ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
>  
>  /* tcp.c */
>  void tcp_get_info(struct sock *, struct tcp_info *);
> +void tcp_sock_rate_check_app_limited(struct tcp_sock *tp);
>  void tcp_rate_check_app_limited(struct sock *sk);
>  
>  /* Read 'sendfile()'-style from a TCP socket */
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index a058f350a759..bdad459e6605 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -1097,9 +1097,9 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
>  }
>  
>  /* If a gap is detected between sends, mark the socket application-limited. */
> -void tcp_rate_check_app_limited(struct sock *sk)
> +void tcp_sock_rate_check_app_limited(struct tcp_sock *tp)
>  {
> - struct tcp_sock *tp = tcp_sk(sk);
> + struct sock *sk = (struct sock *)tp;
>  
>  if (/* We have less than one packet to send. */
>  tp->write_seq - tp->snd_nxt < tp->mss_cache &&
> @@ -1112,6 +1112,11 @@ void tcp_rate_check_app_limited(struct sock *sk)
>  tp->app_limited =
>  (tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
>  }
> +
> +void tcp_rate_check_app_limited(struct sock *sk)
> +{
> + tcp_sock_rate_check_app_limited(tcp_sk(sk));
> +}
>  EXPORT_SYMBOL_GPL(tcp_rate_check_app_limited);
>  
>  int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index a36ef97155a7..505eac23f35d 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -24,6 +24,7 @@
>  #include <net/mptcp.h>
>  #include <net/hotdata.h>
>  #include <net/xfrm.h>
> +#include <net/tls.h>
>  #include <asm/ioctls.h>
>  #include "protocol.h"
>  #include "mib.h"
> @@ -1909,7 +1910,7 @@ static void mptcp_rps_record_subflows(const struct mptcp_sock *msk)
>  }
>  }
>  
> -static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
> +static int mptcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t len)
>  {
>  struct mptcp_sock *msk = mptcp_sk(sk);
>  struct page_frag *pfrag;
> @@ -1921,8 +1922,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>  msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL |
>  MSG_FASTOPEN | MSG_EOR;
>  
> - lock_sock(sk);
> -
>  mptcp_rps_record_subflows(msk);
>  
>  if (unlikely(inet_test_bit(DEFER_CONNECT, sk) ||
> @@ -2038,7 +2037,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>  }
>  
>  out:
> - release_sock(sk);
>  return copied;
>  
>  do_error:
> @@ -2049,6 +2047,17 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>  goto out;
>  }
>  
> +static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
> +{
> + int ret;
> +
> + lock_sock(sk);
> + ret = mptcp_sendmsg_locked(sk, msg, len);
> + release_sock(sk);
> +
> + return ret;
> +}
> +
>  static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied);
>  
>  static void mptcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb)
> @@ -4726,3 +4735,107 @@ int __init mptcp_proto_v6_init(void)
>  return err;
>  }
>  #endif
> +
> +static int mptcp_inq(struct sock *sk)
> +{
> + const struct mptcp_sock *msk = mptcp_sk(sk);
> + const struct sk_buff *skb;
> +
> + if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
> + return 0;
> +
> + skb = skb_peek(&sk->sk_receive_queue);
> + if (skb) {
> + u64 answ = READ_ONCE(msk->ack_seq) - MPTCP_SKB_CB(skb)->map_seq;
> +
> + if (answ >= INT_MAX)
> + answ = INT_MAX;
> +
> + /* Subtract 1, if FIN was received */
> + if (answ &&
> + (sk->sk_state == TCP_CLOSE ||
> + (sk->sk_shutdown & RCV_SHUTDOWN)))
> + answ--;
> +
> + return (int)answ;
> + }
> +
> + return 0;
> +}
> +
> +static bool mptcp_lock_is_held(struct sock *sk)
> +{
> + return sock_owned_by_user_nocheck(sk) ||
> + mptcp_data_is_locked(sk);
> +}
> +
> +static void mptcp_read_done(struct sock *sk, size_t len)
> +{
> + struct mptcp_sock *msk = mptcp_sk(sk);
> + struct sk_buff *skb;
> + size_t left;
> + u32 offset;
> +
> + msk_owned_by_me(msk);
> +
> + if (sk->sk_state == TCP_LISTEN)
> + return;
> +
> + left = len;
> + while (left && (skb = mptcp_recv_skb(sk, &offset)) != NULL) {
> + int used;
> +
> + used = min_t(size_t, skb->len - offset, left);
> + msk->bytes_consumed += used;
> + MPTCP_SKB_CB(skb)->offset += used;
> + MPTCP_SKB_CB(skb)->map_seq += used;
> + left -= used;
> +
> + if (skb->len > offset + used)
> + break;
> +
> + mptcp_eat_recv_skb(sk, skb);
> + }
> +
> + mptcp_rcv_space_adjust(msk, len - left);
> +
> + /* Clean up data we have read: This will do ACK frames. */
> + if (left != len)
> + mptcp_cleanup_rbuf(msk, len - left);
> +}
> +
> +static u32 mptcp_get_skb_seq(struct sk_buff *skb)
> +{
> + return MPTCP_SKB_CB(skb)->map_seq - MPTCP_SKB_CB(skb)->offset;
> +}
> +
> +static void mptcp_check_app_limited(struct sock *sk)
> +{
> + struct mptcp_sock *msk = mptcp_sk(sk);
> + struct mptcp_subflow_context *subflow;
> +
> + mptcp_for_each_subflow(msk, subflow) {
> + struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
> + bool slow;
> +
> + slow = lock_sock_fast(ssk);
> + tcp_sock_rate_check_app_limited(tcp_sk(ssk));
> + unlock_sock_fast(ssk, slow);
> + }
> +}
> +
> +struct tls_prot_ops tls_mptcp_ops = {
> + .owner = THIS_MODULE,
> + .protocol = IPPROTO_MPTCP,
> + .inq = mptcp_inq,
> + .sendmsg_locked = mptcp_sendmsg_locked,
> + .recv_skb = mptcp_recv_skb,
> + .lock_is_held = mptcp_lock_is_held,
> + .read_sock = mptcp_read_sock,
> + .read_done = mptcp_read_done,
> + .get_skb_seq = mptcp_get_skb_seq,

Hi Geliang,

I think we need a get_skb_off callback, something like:

+static u32 mptcp_get_skb_off(struct sk_buff *skb)
+{
+       return MPTCP_SKB_CB(skb)->offset;
+}
+
 struct tls_prot_ops tls_mptcp_ops = {
        .owner                  = THIS_MODULE,
        .protocol               = IPPROTO_MPTCP,
@@ -5156,6 +5181,7 @@ struct tls_prot_ops tls_mptcp_ops = {
        .read_sock              = mptcp_read_sock,
        .read_done              = mptcp_read_done,
        .get_skb_seq            = mptcp_get_skb_seq,
+       .get_skb_off            = mptcp_get_skb_off,

The reason is that some functions in TLS still call skb_copy_bits directly,
which completely ignores the MPTCP-level offset stored in the SKB. That causes
data corruption / transmission errors.

Here is one concrete example:

The expected TLS record header is 17 03 03. We get 17 03 from the first skb, and the final 03 should come from the next skb at offset 2. But skb_copy_bits reads from offset 0 of that next skb, so we end up with 17 03 17 — and that causes error -22:
'''
  [  180.277464][ T2744] tls_rx_msg_size: bad TLS header: 17 03 17, offset=30 anchor_len=4590 anchor_dlen=4590  map_seq=12790065556595467861 end_seq=12790065556595467863frag_list=00000000683d062a fl_len=32 next=00000000d63e8c38 next_len=510          
  map_seq=12790065556595467863                                                                                                                                                                                                                            
  [  180.279420][ T2744] tls_rx_msg_size: next_skb first 8 bytes: 17 03 03 00 19 00 00 00                                                                                                                                                                 
  # # tls.c:629:multi_chunk_sendfile:Expected recv(self->cfd, buf, test_payload_size, MSG_WAITALL) (3945) == test_payload_size (4097)                                                                                                                     
  [  180.279828][ T2744] YGYG return: copied:3945, err:-22      
'''

In other places the same class of problem also triggers a -74 error.

After looking into this, here is a work-around patch I came up with:
diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c
index c24aa4ec2687..bcd05f15bc68 100644
--- a/net/tls/tls_strp.c
+++ b/net/tls/tls_strp.c
@@ -452,7 +452,8 @@ static bool tls_strp_check_queue_ok(struct tls_strparser *strp)
                seq += skb->len;
                len -= skb->len;
                skb = skb->next;
-
+               if (ctx->proto->ops->get_skb_off(skb))
+                       return false;
                if (ctx->proto->ops->get_skb_seq(skb) != seq)
                        return false;
                if (skb_cmp_decrypted(first, skb))
@@ -531,6 +532,18 @@ static int tls_strp_read_sock(struct tls_strparser *strp)
                return tls_strp_read_copy(strp, true);
 
        tls_strp_load_anchor_with_queue(strp, inq);
+
+       /* If the next skb has non-zero MPTCP offset, skb_copy_bits
+        * would read stale data. Fallback to copy mode before
+        * tls_rx_msg_size touches the data.
+        */
+       if (ctx->proto->ops->get_skb_off) {
+               struct sk_buff *first = skb_shinfo(strp->anchor)->frag_list;
+
+               if (first->next && ctx->proto->ops->get_skb_off(first->next))
+                       return tls_strp_read_copy(strp, false);
+       }
+

The first chunk prevents the -74 error by forcing TLS into copy mode
(no direct skb_copy_bits on those SKBs for decryption) when it detects an MPTCP
offset.

The second chunk is a work-around that avoids the bad skb_copy_bits read inside
tls_rx_msg_size by falling back to copy mode when the next SKB in the frag_list
has a non-zero offset.

Longer term I think we should refactor tls_rx_msg_size to properly handle the
MPTCP offset, rather than relying on these fallback paths.

Thanks
Gang


> + .poll = mptcp_poll,
> + .epollin_ready = mptcp_epollin_ready,
> + .check_app_limited = mptcp_check_app_limited,
> +};
> +EXPORT_SYMBOL(tls_mptcp_ops);
> diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> index 661600f8b573..1c604a1ded6f 100644
> --- a/net/mptcp/protocol.h
> +++ b/net/mptcp/protocol.h
> @@ -380,6 +380,7 @@ struct mptcp_sock {
>  
>  #define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock)
>  #define mptcp_data_unlock(sk) spin_unlock_bh(&(sk)->sk_lock.slock)
> +#define mptcp_data_is_locked(sk) spin_is_locked(&(sk)->sk_lock.slock)
>  
>  #define mptcp_for_each_subflow(__msk, __subflow) \
>  list_for_each_entry(__subflow, &((__msk)->conn_list), node)
> diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
> index c840a228af0d..fbb60dc1832e 100644
> --- a/net/tls/tls_main.c
> +++ b/net/tls/tls_main.c
> @@ -1425,6 +1425,12 @@ static int __init tls_register(void)
>  if (err)
>  goto err_strp;
>  
> +#ifdef CONFIG_MPTCP
> + err = tls_register_prot_ops(&tls_mptcp_ops);
> + if (err)
> + goto err_tcp;
> +#endif
> +
>  err = tls_device_init();
>  if (err)
>  goto err_ops;
> @@ -1433,6 +1439,10 @@ static int __init tls_register(void)
>  
>  return 0;
>  err_ops:
> +#ifdef CONFIG_MPTCP
> + tls_unregister_prot_ops(&tls_mptcp_ops);
> +err_tcp:
> +#endif
>  tls_unregister_prot_ops(&tls_tcp_ops);
>  err_strp:
>  tls_strp_dev_exit();
> @@ -1444,6 +1454,9 @@ static int __init tls_register(void)
>  static void __exit tls_unregister(void)
>  {
>  tcp_unregister_ulp(&tcp_tls_ulp_ops);
> +#ifdef CONFIG_MPTCP
> + tls_unregister_prot_ops(&tls_mptcp_ops);
> +#endif
>  tls_unregister_prot_ops(&tls_tcp_ops);
>  tls_proto_cleanup();
>  tls_strp_dev_exit();
> -- 
> 2.53.0
>

  reply	other threads:[~2026-05-27  6:52 UTC|newest]

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-05-26  9:46 [RFC mptcp-next v20 00/15] MPTCP KTLS support Geliang Tang
2026-05-26  9:46 ` [RFC mptcp-next v20 01/15] tls: add per-protocol cache to support mptcp Geliang Tang
2026-05-26  9:46 ` [RFC mptcp-next v20 02/15] tls: introduce struct tls_prot_ops Geliang Tang
2026-05-26  9:46 ` [RFC mptcp-next v20 03/15] tls: add tls_prot_ops pointer to tls_proto Geliang Tang
2026-05-26  9:46 ` [RFC mptcp-next v20 04/15] mptcp: update mptcp_check_readable Geliang Tang
2026-05-26  9:46 ` [RFC mptcp-next v20 05/15] mptcp: implement tls_mptcp_ops Geliang Tang
2026-05-27  6:52   ` gang.yan [this message]
2026-05-26  9:46 ` [RFC mptcp-next v20 06/15] tls: disable device offload for mptcp sockets Geliang Tang
2026-05-26  9:46 ` [RFC mptcp-next v20 07/15] mptcp: update ulp getsockopt for tls support Geliang Tang
2026-05-26  9:46 ` [RFC mptcp-next v20 08/15] mptcp: enable ulp setsockopt " Geliang Tang
2026-05-26  9:46 ` [RFC mptcp-next v20 09/15] selftests: mptcp: connect: use espintcp for ulp test Geliang Tang
2026-05-26  9:46 ` [RFC mptcp-next v20 10/15] selftests: tls: add mptcp variant for testing Geliang Tang
2026-05-26  9:46 ` [RFC mptcp-next v20 11/15] selftests: tls: increase pollin timeouts for mptcp Geliang Tang
2026-05-26  9:46 ` [RFC mptcp-next v20 12/15] selftests: tls: increase nonblocking data size " Geliang Tang
2026-05-26  9:46 ` [RFC mptcp-next v20 13/15] selftests: tls: retry bind on EINVAL in shutdown_reuse Geliang Tang
2026-05-26  9:46 ` [RFC mptcp-next v20 14/15] selftests: tls: add mptcp test cases Geliang Tang
2026-05-26  9:46 ` [RFC mptcp-next v20 15/15] selftests: mptcp: cover mptcp tls tests Geliang Tang
2026-05-27  6:03 ` [RFC mptcp-next v20 00/15] MPTCP KTLS support Geliang Tang
2026-05-27  6:28   ` Matthieu Baerts
2026-05-27  7:36 ` MPTCP CI

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=836e21cb106dd2288b48132e736dff6efe2d0178@linux.dev \
    --to=gang.yan@linux.dev \
    --cc=geliang@kernel.org \
    --cc=mptcp@lists.linux.dev \
    --cc=tanggeliang@kylinos.cn \
    --cc=yangang@kylinos.cn \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.