Netdev List

Netdev List
 help / color / mirror / Atom feed

* [RFC net-next 03/17] tls: add protocol dimension to tls operation cache
From: Geliang Tang @ 2026-06-22 10:43 UTC (permalink / raw)
  To: Matthieu Baerts, Mat Martineau, Geliang Tang, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Neal Cardwell, Kuniyuki Iwashima, John Fastabend, Sabrina Dubroca,
	Hannes Reinecke
  Cc: Geliang Tang, netdev, mptcp, Gang Yan, Zqiang
In-Reply-To: <cover.1782123118.git.tanggeliang@kylinos.cn>

From: Geliang Tang <tanggeliang@kylinos.cn>

The current TLS operation cache is indexed solely by IP version
(IPv4/IPv6). This was sufficient when only TCP was supported.
Rename TLS_NUM_PROTS to TLS_NUM_FAMILY to accurately reflect that it
represents the number of address families.

With the introduction of MPTCP, both TCP and MPTCP sockets within the
same IP version now share the same cache entries. When an MPTCP socket
enables TLS, it overwrites the cache with MPTCP-specific operations,
causing existing TCP TLS sockets to use the wrong ops, leading to type
confusion and kernel panics.

Fix by extending the cache arrays with a protocol dimension to separate
TCP and MPTCP. Introduce TLSTCP and TLSMPTCP enum values, along with
separate saved protocol pointers and mutexes for MPTCP. update_sk_prot()
and __tls_build_proto() now select the appropriate cache based on
sk->sk_protocol.

Co-developed-by: Gang Yan <yangang@kylinos.cn>
Signed-off-by: Gang Yan <yangang@kylinos.cn>
Co-developed-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
---
 net/tls/tls_main.c | 40 +++++++++++++++++++++++++++++-----------
 1 file changed, 29 insertions(+), 11 deletions(-)

diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index be824affd1b1..94133d62f73e 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -53,7 +53,13 @@ MODULE_ALIAS_TCP_ULP("tls");
 enum {
 	TLSV4,
 	TLSV6,
-	TLS_NUM_PROTS,
+	TLS_NUM_FAMILY,
+};
+
+enum {
+	TLSTCP,
+	TLSMPTCP,
+	TLS_NUM_PROTO,
 };
 
 #define CHECK_CIPHER_DESC(cipher,ci)				\
@@ -117,23 +123,30 @@ CHECK_CIPHER_DESC(TLS_CIPHER_SM4_CCM, tls12_crypto_info_sm4_ccm);
 CHECK_CIPHER_DESC(TLS_CIPHER_ARIA_GCM_128, tls12_crypto_info_aria_gcm_128);
 CHECK_CIPHER_DESC(TLS_CIPHER_ARIA_GCM_256, tls12_crypto_info_aria_gcm_256);
 
+static const struct proto *saved_mptcpv6_prot;
+static DEFINE_MUTEX(mptcpv6_prot_mutex);
 static const struct proto *saved_tcpv6_prot;
 static DEFINE_MUTEX(tcpv6_prot_mutex);
+static const struct proto *saved_mptcpv4_prot;
+static DEFINE_MUTEX(mptcpv4_prot_mutex);
 static const struct proto *saved_tcpv4_prot;
 static DEFINE_MUTEX(tcpv4_prot_mutex);
-static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG][TLS_NUM_CONFIG];
-static struct proto_ops tls_proto_ops[TLS_NUM_PROTS][TLS_NUM_CONFIG][TLS_NUM_CONFIG];
+static struct proto
+tls_prots[TLS_NUM_FAMILY][TLS_NUM_PROTO][TLS_NUM_CONFIG][TLS_NUM_CONFIG];
+static struct proto_ops
+tls_proto_ops[TLS_NUM_FAMILY][TLS_NUM_PROTO][TLS_NUM_CONFIG][TLS_NUM_CONFIG];
 static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
 			 const struct proto *base);
 
 static void update_sk_prot(struct sock *sk, struct tls_context *ctx)
 {
+	int proto = sk->sk_protocol == IPPROTO_MPTCP ? TLSMPTCP : TLSTCP;
 	int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4;
 
 	WRITE_ONCE(sk->sk_prot,
-		   &tls_prots[ip_ver][ctx->tx_conf][ctx->rx_conf]);
+		   &tls_prots[ip_ver][proto][ctx->tx_conf][ctx->rx_conf]);
 	WRITE_ONCE(sk->sk_socket->ops,
-		   &tls_proto_ops[ip_ver][ctx->tx_conf][ctx->rx_conf]);
+		   &tls_proto_ops[ip_ver][proto][ctx->tx_conf][ctx->rx_conf]);
 }
 
 int wait_on_pending_writer(struct sock *sk, long *timeo)
@@ -971,18 +984,19 @@ static void build_proto_ops(struct proto_ops ops[TLS_NUM_CONFIG][TLS_NUM_CONFIG]
 static void __tls_build_proto(struct sock *sk,
 			      const struct proto *saved_prot,
 			      struct mutex *prot_mutex,
-			      int family)
+			      int family, int protocol)
 {
+	int proto = sk->sk_protocol == IPPROTO_MPTCP ? TLSMPTCP : TLSTCP;
 	int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4;
 	struct proto *prot = READ_ONCE(sk->sk_prot);
 
-	if (ip_ver == family) {
+	if (ip_ver == family && proto == protocol) {
 		/* smp_load_acquire pairs with smp_store_release below */
 		if (unlikely(prot != smp_load_acquire(&saved_prot))) {
 			mutex_lock(prot_mutex);
 			if (likely(prot != saved_prot)) {
-				build_protos(tls_prots[family], prot);
-				build_proto_ops(tls_proto_ops[family],
+				build_protos(tls_prots[family][protocol], prot);
+				build_proto_ops(tls_proto_ops[family][protocol],
 						sk->sk_socket->ops);
 				/* pairs with smp_load_acquire above */
 				smp_store_release(&saved_prot, prot);
@@ -995,10 +1009,14 @@ static void __tls_build_proto(struct sock *sk,
 static void tls_build_proto(struct sock *sk)
 {
 	/* Build IPv6 TLS whenever the address of tcpv6 _prot changes */
+	__tls_build_proto(sk, saved_mptcpv6_prot, &mptcpv6_prot_mutex,
+			  TLSV6, TLSMPTCP);
 	__tls_build_proto(sk, saved_tcpv6_prot, &tcpv6_prot_mutex,
-			  TLSV6);
+			  TLSV6, TLSTCP);
+	__tls_build_proto(sk, saved_mptcpv4_prot, &mptcpv4_prot_mutex,
+			  TLSV4, TLSMPTCP);
 	__tls_build_proto(sk, saved_tcpv4_prot, &tcpv4_prot_mutex,
-			  TLSV4);
+			  TLSV4, TLSTCP);
 }
 
 static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
-- 
2.53.0


^ permalink raw reply related

* [RFC net-next 04/17] mptcp: add sendmsg_locked to proto_ops
From: Geliang Tang @ 2026-06-22 10:43 UTC (permalink / raw)
  To: Matthieu Baerts, Mat Martineau, Geliang Tang, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Neal Cardwell, Kuniyuki Iwashima, John Fastabend, Sabrina Dubroca,
	Hannes Reinecke
  Cc: Geliang Tang, netdev, mptcp, Gang Yan, Zqiang
In-Reply-To: <cover.1782123118.git.tanggeliang@kylinos.cn>

From: Geliang Tang <tanggeliang@kylinos.cn>

MPTCP currently provides a standard sendmsg() implementation which
acquires and releases the socket lock internally. However, certain
upper layers (e.g., TLS) need to call the sendmsg method while the
socket lock is already held.

Split the existing mptcp_sendmsg() into mptcp_sendmsg_locked() which
assumes the caller holds the socket lock, and a tiny wrapper
mptcp_sendmsg() that acquires the lock and calls the locked version.

Expose .sendmsg_locked in both mptcp_stream_ops and mptcp_v6_stream_ops.

Co-developed-by: Gang Yan <yangang@kylinos.cn>
Signed-off-by: Gang Yan <yangang@kylinos.cn>
Co-developed-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
---
 net/mptcp/protocol.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index a4f7e99b30db..7f0c560f6b7e 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1967,7 +1967,7 @@ static void mptcp_rps_record_subflows(const struct mptcp_sock *msk)
 	}
 }
 
-static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+static int mptcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t len)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	struct page_frag *pfrag;
@@ -1979,8 +1979,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL |
 			  MSG_FASTOPEN | MSG_EOR;
 
-	lock_sock(sk);
-
 	mptcp_rps_record_subflows(msk);
 
 	if (unlikely(inet_test_bit(DEFER_CONNECT, sk) ||
@@ -2096,7 +2094,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	}
 
 out:
-	release_sock(sk);
 	return copied;
 
 do_error:
@@ -2107,6 +2104,17 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	goto out;
 }
 
+static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+{
+	int ret;
+
+	lock_sock(sk);
+	ret = mptcp_sendmsg_locked(sk, msg, len);
+	release_sock(sk);
+
+	return ret;
+}
+
 static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied);
 
 static void mptcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb)
@@ -4703,6 +4711,7 @@ static const struct proto_ops mptcp_stream_ops = {
 	.set_rcvlowat	   = mptcp_set_rcvlowat,
 	.read_sock	   = mptcp_read_sock,
 	.splice_read	   = mptcp_splice_read,
+	.sendmsg_locked	   = mptcp_sendmsg_locked,
 };
 
 static struct inet_protosw mptcp_protosw = {
@@ -4815,6 +4824,7 @@ static const struct proto_ops mptcp_v6_stream_ops = {
 	.set_rcvlowat	   = mptcp_set_rcvlowat,
 	.read_sock	   = mptcp_read_sock,
 	.splice_read	   = mptcp_splice_read,
+	.sendmsg_locked	   = mptcp_sendmsg_locked,
 };
 
 static struct proto mptcp_v6_prot;
-- 
2.53.0


^ permalink raw reply related

* [RFC net-next 05/17] tls: use sendmsg_locked from the underlying socket
From: Geliang Tang @ 2026-06-22 10:43 UTC (permalink / raw)
  To: Matthieu Baerts, Mat Martineau, Geliang Tang, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Neal Cardwell, Kuniyuki Iwashima, John Fastabend, Sabrina Dubroca,
	Hannes Reinecke
  Cc: Geliang Tang, netdev, mptcp, Gang Yan, Zqiang
In-Reply-To: <cover.1782123118.git.tanggeliang@kylinos.cn>

From: Geliang Tang <tanggeliang@kylinos.cn>

TLS offload (device and sw) may call tcp_sendmsg_locked() directly
when pushing TLS records. This assumes the underlying socket is always
a TCP socket. With MPTCP, the underlying socket can be an MPTCP socket,
for which tcp_sendmsg_locked() is the wrong function to call. Until the
previous commit, MPTCP also did not expose a sendmsg_locked method via
its proto_ops.

Replace the hard-coded tcp_sendmsg_locked() call with
sk->sk_socket->ops->sendmsg_locked(). This enables TLS to work
transparently over any socket that implements .sendmsg_locked,
including MPTCP after the previous commit.

The change is safe because both TCP and MPTCP now provide a conformant
.sendmsg_locked implementation.

Co-developed-by: Gang Yan <yangang@kylinos.cn>
Signed-off-by: Gang Yan <yangang@kylinos.cn>
Co-developed-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
---
 net/tls/tls_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 94133d62f73e..b6adfa67491b 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -205,7 +205,7 @@ int tls_push_sg(struct sock *sk,
 		bvec_set_page(&bvec, p, size, offset);
 		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size);

-		ret = tcp_sendmsg_locked(sk, &msg, size);
+		ret = sk->sk_socket->ops->sendmsg_locked(sk, &msg, size);

 		if (ret != size) {
 			if (ret > 0) {
-- 
2.53.0

^ permalink raw reply related

* [RFC net-next 06/17] mptcp: implement peek_len for proto_ops
From: Geliang Tang @ 2026-06-22 10:43 UTC (permalink / raw)
  To: Matthieu Baerts, Mat Martineau, Geliang Tang, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Neal Cardwell, Kuniyuki Iwashima, John Fastabend, Sabrina Dubroca,
	Hannes Reinecke
  Cc: Geliang Tang, netdev, mptcp, Gang Yan, Zqiang
In-Reply-To: <cover.1782123118.git.tanggeliang@kylinos.cn>

From: Geliang Tang <tanggeliang@kylinos.cn>

The TLS stack uses tcp_inq() to query the amount of data available
in the receive queue without consuming it. For MPTCP sockets, this
information is not directly available from a TCP subflow; it must be
computed from the MPTCP receive queue and the current mapping.

Introduce mptcp_peek_len() which returns the number of bytes that
can be peeked from the MPTCP socket. It is a thin wrapper around the
mptcp_inq() helper (the SIOCINQ-style in-queue computation), which is
also added by this patch. The helper considers the first skb in the
receive queue and the current ack_seq, and handles the FIN case.

Assign .peek_len in both mptcp_stream_ops and mptcp_v6_stream_ops
so that upper layers (e.g., TLS) can obtain the correct in-queue
byte count for an MPTCP connection.

Co-developed-by: Gang Yan <yangang@kylinos.cn>
Signed-off-by: Gang Yan <yangang@kylinos.cn>
Co-developed-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
---
 net/mptcp/protocol.c | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 7f0c560f6b7e..18c8b6c64c3f 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -4689,6 +4689,38 @@ static ssize_t mptcp_splice_read(struct socket *sock, loff_t *ppos,
 	return ret;
 }
 
+static int mptcp_inq(struct sock *sk)
+{
+	const struct mptcp_sock *msk = mptcp_sk(sk);
+	const struct sk_buff *skb;
+
+	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
+		return 0;
+
+	skb = skb_peek(&sk->sk_receive_queue);
+	if (skb) {
+		u64 answ = READ_ONCE(msk->ack_seq) - MPTCP_SKB_CB(skb)->map_seq;
+
+		if (answ >= INT_MAX)
+			answ = INT_MAX;
+
+		/* Subtract 1, if FIN was received */
+		if (answ &&
+		    (sk->sk_state == TCP_CLOSE ||
+		     (sk->sk_shutdown & RCV_SHUTDOWN)))
+			answ--;
+
+		return (int)answ;
+	}
+
+	return 0;
+}
+
+static int mptcp_peek_len(struct socket *sock)
+{
+	return mptcp_inq(sock->sk);
+}
+
 static const struct proto_ops mptcp_stream_ops = {
 	.family		   = PF_INET,
 	.owner		   = THIS_MODULE,
@@ -4712,6 +4744,7 @@ static const struct proto_ops mptcp_stream_ops = {
 	.read_sock	   = mptcp_read_sock,
 	.splice_read	   = mptcp_splice_read,
 	.sendmsg_locked	   = mptcp_sendmsg_locked,
+	.peek_len	   = mptcp_peek_len,
 };
 
 static struct inet_protosw mptcp_protosw = {
@@ -4825,6 +4858,7 @@ static const struct proto_ops mptcp_v6_stream_ops = {
 	.read_sock	   = mptcp_read_sock,
 	.splice_read	   = mptcp_splice_read,
 	.sendmsg_locked	   = mptcp_sendmsg_locked,
+	.peek_len	   = mptcp_peek_len,
 };
 
 static struct proto mptcp_v6_prot;
-- 
2.53.0


^ permalink raw reply related

* [RFC net-next 07/17] tls: replace tcp_inq with socket peek_len
From: Geliang Tang @ 2026-06-22 10:43 UTC (permalink / raw)
  To: Matthieu Baerts, Mat Martineau, Geliang Tang, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Neal Cardwell, Kuniyuki Iwashima, John Fastabend, Sabrina Dubroca,
	Hannes Reinecke
  Cc: Geliang Tang, netdev, mptcp, Gang Yan, Zqiang
In-Reply-To: <cover.1782123118.git.tanggeliang@kylinos.cn>

From: Geliang Tang <tanggeliang@kylinos.cn>

TLS (device, strparser, and software) calls tcp_inq() directly to
determine how much data is still pending in the socket receive queue.
This breaks when the underlying socket is not TCP (e.g., MPTCP).

Switch all occurrences of tcp_inq(sk) to sk->sk_socket->ops->
peek_len(sk->sk_socket). This operation is implemented for both TCP
and MPTCP (after the previous commits), making TLS transparently
usable over MPTCP connections.

The change is straightforward: every place where TLS needed the
available in-queue bytes now uses the protocol-specific peek_len
method instead of assuming a TCP socket.

Co-developed-by: Gang Yan <yangang@kylinos.cn>
Signed-off-by: Gang Yan <yangang@kylinos.cn>
Co-developed-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
---
 net/tls/tls_device.c | 4 ++--
 net/tls/tls_strp.c   | 6 ++++--
 net/tls/tls_sw.c     | 4 +++-
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 741aef09bfd3..c44a59d9d715 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -805,7 +805,7 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq)
 		/* head of next rec is already in, note that the sock_inq will
 		 * include the currently parsed message when called from parser
 		 */
-		sock_data = tcp_inq(sk);
+		sock_data = sk->sk_socket->ops->peek_len(sk->sk_socket);
 		if (sock_data > rcd_len) {
 			trace_tls_device_rx_resync_nh_delay(sk, sock_data,
 							    rcd_len);
@@ -864,7 +864,7 @@ static void tls_device_core_ctrl_rx_resync(struct tls_context *tls_ctx,
 	rxm = strp_msg(skb);
 
 	/* head of next rec is already in, parser will sync for us */
-	if (tcp_inq(sk) > rxm->full_len) {
+	if (sk->sk_socket->ops->peek_len(sk->sk_socket) > rxm->full_len) {
 		trace_tls_device_rx_resync_nh_schedule(sk);
 		ctx->resync_nh_do_now = 1;
 	} else {
diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c
index 61b10c697ecc..82a5b64b5f48 100644
--- a/net/tls/tls_strp.c
+++ b/net/tls/tls_strp.c
@@ -484,12 +484,14 @@ bool tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
 {
 	struct strp_msg *rxm;
 	struct tls_msg *tlm;
+	int inq;
 
 	DEBUG_NET_WARN_ON_ONCE(!strp->msg_ready);
 	DEBUG_NET_WARN_ON_ONCE(!strp->stm.full_len);
 
 	if (!strp->copy_mode && force_refresh) {
-		if (unlikely(tcp_inq(strp->sk) < strp->stm.full_len)) {
+		inq = strp->sk->sk_socket->ops->peek_len(strp->sk->sk_socket);
+		if (unlikely(inq < strp->stm.full_len)) {
 			WRITE_ONCE(strp->msg_ready, 0);
 			strp->msg_announced = 0;
 			memset(&strp->stm, 0, sizeof(strp->stm));
@@ -513,7 +515,7 @@ static int tls_strp_read_sock(struct tls_strparser *strp)
 {
 	int sz, inq;
 
-	inq = tcp_inq(strp->sk);
+	inq = strp->sk->sk_socket->ops->peek_len(strp->sk->sk_socket);
 	if (inq < 1)
 		return 0;
 
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 9324e4ed20a3..35fb0c3c965a 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1706,12 +1706,14 @@ tls_read_flush_backlog(struct sock *sk, struct tls_prot_info *prot,
 		       size_t *flushed_at)
 {
 	size_t max_rec;
+	int inq;
 
 	if (len_left <= decrypted)
 		return false;
 
+	inq = sk->sk_socket->ops->peek_len(sk->sk_socket);
 	max_rec = prot->overhead_size - prot->tail_size + TLS_MAX_PAYLOAD_SIZE;
-	if (done - *flushed_at < SZ_128K && tcp_inq(sk) > max_rec)
+	if (done - *flushed_at < SZ_128K && inq > max_rec)
 		return false;
 
 	*flushed_at = done;
-- 
2.53.0


^ permalink raw reply related

* [RFC net-next 08/17] tls: store original read_sock for non-tcp sockets
From: Geliang Tang @ 2026-06-22 10:43 UTC (permalink / raw)
  To: Matthieu Baerts, Mat Martineau, Geliang Tang, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Neal Cardwell, Kuniyuki Iwashima, John Fastabend, Sabrina Dubroca,
	Hannes Reinecke
  Cc: Geliang Tang, netdev, mptcp, Gang Yan, Zqiang
In-Reply-To: <cover.1782123118.git.tanggeliang@kylinos.cn>

From: Geliang Tang <tanggeliang@kylinos.cn>

TLS strparser uses tcp_read_sock() to copy data from the underlying
socket. This assumes the socket is always TCP, which fails when TLS
is used over MPTCP.

Store the original socket's read_sock method (sk->sk_socket->ops->
read_sock) in a new .sk_read_sock callback inside struct tls_context.
Then in tls_strp_read_copyin(), call this stored callback instead of
the hard-coded tcp_read_sock().

With this change, TLS strparser works transparently over any socket
that implements .read_sock (including MPTCP, which already provides
mptcp_read_sock). Behavior for plain TCP remains unchanged.

Co-developed-by: Gang Yan <yangang@kylinos.cn>
Signed-off-by: Gang Yan <yangang@kylinos.cn>
Co-developed-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
---
 include/net/tls.h  | 2 ++
 net/tls/tls_main.c | 1 +
 net/tls/tls_strp.c | 3 ++-
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/include/net/tls.h b/include/net/tls.h
index e57bef58851e..aee4f74dc3d9 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -262,6 +262,8 @@ struct tls_context {
 	struct sock *sk;
 
 	void (*sk_destruct)(struct sock *sk);
+	int (*sk_read_sock)(struct sock *sk, read_descriptor_t *desc,
+			    sk_read_actor_t recv_actor);
 
 	union tls_crypto_context crypto_send;
 	union tls_crypto_context crypto_recv;
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index b6adfa67491b..c9499bfd7a1d 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -1086,6 +1086,7 @@ static int tls_init(struct sock *sk)
 	ctx->tx_conf = TLS_BASE;
 	ctx->rx_conf = TLS_BASE;
 	ctx->tx_max_payload_len = TLS_MAX_PAYLOAD_SIZE;
+	ctx->sk_read_sock = sk->sk_socket->ops->read_sock;
 	update_sk_prot(sk, ctx);
 out:
 	write_unlock_bh(&sk->sk_callback_lock);
diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c
index 82a5b64b5f48..9945d17b2f8c 100644
--- a/net/tls/tls_strp.c
+++ b/net/tls/tls_strp.c
@@ -375,6 +375,7 @@ static int tls_strp_copyin(read_descriptor_t *desc, struct sk_buff *in_skb,
 
 static int tls_strp_read_copyin(struct tls_strparser *strp)
 {
+	struct tls_context *ctx = tls_get_ctx(strp->sk);
 	read_descriptor_t desc;
 
 	desc.arg.data = strp;
@@ -382,7 +383,7 @@ static int tls_strp_read_copyin(struct tls_strparser *strp)
 	desc.count = 1; /* give more than one skb per call */
 
 	/* sk should be locked here, so okay to do read_sock */
-	tcp_read_sock(strp->sk, &desc, tls_strp_copyin);
+	ctx->sk_read_sock(strp->sk, &desc, tls_strp_copyin);
 
 	return desc.error;
 }
-- 
2.53.0


^ permalink raw reply related

* [RFC net-next 09/17] tls: introduce tls protocol ops structure
From: Geliang Tang @ 2026-06-22 10:43 UTC (permalink / raw)
  To: Matthieu Baerts, Mat Martineau, Geliang Tang, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Neal Cardwell, Kuniyuki Iwashima, John Fastabend, Sabrina Dubroca,
	Hannes Reinecke
  Cc: Geliang Tang, netdev, mptcp, Gang Yan, Zqiang
In-Reply-To: <cover.1782123118.git.tanggeliang@kylinos.cn>

From: Geliang Tang <tanggeliang@kylinos.cn>

To allow TLS to be extended from TCP to MPTCP, introduce a tls_prot_ops
structure that encapsulates the protocol-specific helpers TLS relies on.

Add helpers to validate, register, unregister and look up a tls_prot_ops
on the global list tls_prot_ops_list.

Register the TCP-specific instance, tls_tcp_ops, in tls_register().

Co-developed-by: Gang Yan <yangang@kylinos.cn>
Signed-off-by: Gang Yan <yangang@kylinos.cn>
Co-developed-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
---
 include/net/tls.h  |  15 +++++++
 net/tls/tls_main.c | 101 ++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 115 insertions(+), 1 deletion(-)

diff --git a/include/net/tls.h b/include/net/tls.h
index aee4f74dc3d9..500fe87b50d2 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -224,6 +224,21 @@ struct tls_prot_info {
 	u16 tail_size;
 };
 
+struct tls_prot_ops {
+	struct module		*owner;
+	struct list_head	list;
+	int			protocol;
+
+	struct sk_buff *(*recv_skb)(struct sock *sk, u32 *off);
+	bool (*lock_is_held)(struct sock *sk);
+	void (*read_done)(struct sock *sk, size_t len);
+	u32 (*get_skb_seq)(struct sk_buff *skb);
+	int (*skb_get_header)(const struct sk_buff *skb, int offset,
+			      void *to, int len);
+	bool (*epollin_ready)(const struct sock *sk);
+	void (*check_app_limited)(struct sock *sk);
+};
+
 struct tls_context {
 	/* read-only cache line */
 	struct tls_prot_info prot_info;
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index c9499bfd7a1d..296d133fa61f 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -137,6 +137,8 @@ static struct proto_ops
 tls_proto_ops[TLS_NUM_FAMILY][TLS_NUM_PROTO][TLS_NUM_CONFIG][TLS_NUM_CONFIG];
 static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
 			 const struct proto *base);
+static LIST_HEAD(tls_prot_ops_list);
+static DEFINE_SPINLOCK(tls_prot_ops_lock);
 
 static void update_sk_prot(struct sock *sk, struct tls_context *ctx)
 {
@@ -1059,6 +1061,22 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
 #endif
 }
 
+static struct tls_prot_ops *tls_prot_ops_find(int protocol)
+{
+	struct tls_prot_ops *ops, *ret = NULL;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ops, &tls_prot_ops_list, list) {
+		if (ops->protocol == protocol) {
+			ret = ops;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+
 static int tls_init(struct sock *sk)
 {
 	struct tls_context *ctx;
@@ -1245,6 +1263,80 @@ static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = {
 	.get_info_size		= tls_get_info_size,
 };
 
+static int tls_validate_prot_ops(const struct tls_prot_ops *ops)
+{
+	if (!ops->recv_skb || !ops->lock_is_held ||
+	    !ops->read_done || !ops->get_skb_seq ||
+	    !ops->skb_get_header || !ops->epollin_ready ||
+	    !ops->check_app_limited) {
+		pr_err("%d does not implement required ops\n", ops->protocol);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int tls_register_prot_ops(struct tls_prot_ops *ops)
+{
+	int ret;
+
+	ret = tls_validate_prot_ops(ops);
+	if (ret)
+		return ret;
+
+	spin_lock_bh(&tls_prot_ops_lock);
+	if (tls_prot_ops_find(ops->protocol)) {
+		spin_unlock_bh(&tls_prot_ops_lock);
+		return -EEXIST;
+	}
+
+	list_add_tail_rcu(&ops->list, &tls_prot_ops_list);
+	spin_unlock_bh(&tls_prot_ops_lock);
+
+	pr_debug("tls_prot_ops %d registered\n", ops->protocol);
+	return 0;
+}
+
+static void tls_unregister_prot_ops(struct tls_prot_ops *ops)
+{
+	spin_lock_bh(&tls_prot_ops_lock);
+	list_del_rcu(&ops->list);
+	spin_unlock_bh(&tls_prot_ops_lock);
+	synchronize_rcu();
+}
+
+static struct sk_buff *tls_tcp_recv_skb(struct sock *sk, u32 *off)
+{
+	return tcp_recv_skb(sk, tcp_sk(sk)->copied_seq, off);
+}
+
+static bool tls_tcp_lock_is_held(struct sock *sk)
+{
+	return sock_owned_by_user_nocheck(sk);
+}
+
+static u32 tls_tcp_get_skb_seq(struct sk_buff *skb)
+{
+	return TCP_SKB_CB(skb)->seq;
+}
+
+static bool tls_tcp_epollin_ready(const struct sock *sk)
+{
+	return tcp_epollin_ready(sk, INT_MAX);
+}
+
+static struct tls_prot_ops tls_tcp_ops = {
+	.owner			= THIS_MODULE,
+	.protocol		= IPPROTO_TCP,
+	.recv_skb		= tls_tcp_recv_skb,
+	.lock_is_held		= tls_tcp_lock_is_held,
+	.read_done		= tcp_read_done,
+	.get_skb_seq		= tls_tcp_get_skb_seq,
+	.skb_get_header		= skb_copy_bits,
+	.epollin_ready		= tls_tcp_epollin_ready,
+	.check_app_limited	= tcp_rate_check_app_limited,
+};
+
 static int __init tls_register(void)
 {
 	int err;
@@ -1257,13 +1349,19 @@ static int __init tls_register(void)
 	if (err)
 		goto err_pernet;
 
-	err = tls_device_init();
+	err = tls_register_prot_ops(&tls_tcp_ops);
 	if (err)
 		goto err_strp;
 
+	err = tls_device_init();
+	if (err)
+		goto err_ops;
+
 	tcp_register_ulp(&tcp_tls_ulp_ops);
 
 	return 0;
+err_ops:
+	tls_unregister_prot_ops(&tls_tcp_ops);
 err_strp:
 	tls_strp_dev_exit();
 err_pernet:
@@ -1274,6 +1372,7 @@ static int __init tls_register(void)
 static void __exit tls_unregister(void)
 {
 	tcp_unregister_ulp(&tcp_tls_ulp_ops);
+	tls_unregister_prot_ops(&tls_tcp_ops);
 	tls_strp_dev_exit();
 	tls_device_cleanup();
 	unregister_pernet_subsys(&tls_proc_ops);
-- 
2.53.0


^ permalink raw reply related

* [RFC net-next 10/17] tls: use protocol ops via tls_context
From: Geliang Tang @ 2026-06-22 10:43 UTC (permalink / raw)
  To: Matthieu Baerts, Mat Martineau, Geliang Tang, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Neal Cardwell, Kuniyuki Iwashima, John Fastabend, Sabrina Dubroca,
	Hannes Reinecke
  Cc: Geliang Tang, netdev, mptcp, Gang Yan, Zqiang
In-Reply-To: <cover.1782123118.git.tanggeliang@kylinos.cn>

From: Geliang Tang <tanggeliang@kylinos.cn>

Currently, TLS code directly calls TCP-specific functions (e.g.,
tcp_rate_check_app_limited, tcp_read_done, tcp_epollin_ready, etc.)
and accesses TCP-specific fields (e.g., TCP_SKB_CB(skb)->seq).
This makes it hard to support other protocols like MPTCP.

Introduce a struct tls_prot_ops pointer in tls_context, which is
initialized during tls_init based on the socket protocol (TCP or
MPTCP). All protocol-dependent operations are now invoked via this
ops pointer, allowing each protocol to provide its own implementation.

Also add module reference counting for the ops owner: tls_init() takes
a reference with try_module_get() and tls_ctx_free() drops it with
module_put().

Co-developed-by: Gang Yan <yangang@kylinos.cn>
Signed-off-by: Gang Yan <yangang@kylinos.cn>
Co-developed-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
---
 include/net/tls.h  |  2 ++
 net/tls/tls_main.c | 15 ++++++++++++++-
 net/tls/tls_strp.c | 26 ++++++++++++++++++--------
 net/tls/tls_sw.c   |  6 ++++--
 4 files changed, 38 insertions(+), 11 deletions(-)

diff --git a/include/net/tls.h b/include/net/tls.h
index 500fe87b50d2..9270de42787b 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -280,6 +280,8 @@ struct tls_context {
 	int (*sk_read_sock)(struct sock *sk, read_descriptor_t *desc,
 			    sk_read_actor_t recv_actor);
 
+	const struct tls_prot_ops *ops;
+
 	union tls_crypto_context crypto_send;
 	union tls_crypto_context crypto_recv;
 
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 296d133fa61f..b45890e75c9e 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -201,7 +201,7 @@ int tls_push_sg(struct sock *sk,
 	ctx->splicing_pages = true;
 	while (1) {
 		/* is sending application-limited? */
-		tcp_rate_check_app_limited(sk);
+		ctx->ops->check_app_limited(sk);
 		p = sg_page(sg);
 retry:
 		bvec_set_page(&bvec, p, size, offset);
@@ -340,6 +340,11 @@ void tls_ctx_free(struct sock *sk, struct tls_context *ctx)
 	if (!ctx)
 		return;
 
+	if (ctx->ops) {
+		module_put(ctx->ops->owner);
+		ctx->ops = NULL;
+	}
+
 	memzero_explicit(&ctx->crypto_send, sizeof(ctx->crypto_send));
 	memzero_explicit(&ctx->crypto_recv, sizeof(ctx->crypto_recv));
 	mutex_destroy(&ctx->tx_lock);
@@ -1079,6 +1084,7 @@ static struct tls_prot_ops *tls_prot_ops_find(int protocol)
 
 static int tls_init(struct sock *sk)
 {
+	struct tls_prot_ops *ops;
 	struct tls_context *ctx;
 	int rc = 0;
 
@@ -1101,10 +1107,17 @@ static int tls_init(struct sock *sk)
 		goto out;
 	}
 
+	ops = tls_prot_ops_find(sk->sk_protocol);
+	if (!ops || !try_module_get(ops->owner)) {
+		rc = -EINVAL;
+		goto out;
+	}
+
 	ctx->tx_conf = TLS_BASE;
 	ctx->rx_conf = TLS_BASE;
 	ctx->tx_max_payload_len = TLS_MAX_PAYLOAD_SIZE;
 	ctx->sk_read_sock = sk->sk_socket->ops->read_sock;
+	ctx->ops = ops;
 	update_sk_prot(sk, ctx);
 out:
 	write_unlock_bh(&sk->sk_callback_lock);
diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c
index 9945d17b2f8c..48eb4b692f47 100644
--- a/net/tls/tls_strp.c
+++ b/net/tls/tls_strp.c
@@ -120,6 +120,7 @@ struct sk_buff *tls_strp_msg_detach(struct tls_sw_context_rx *ctx)
 int tls_strp_msg_cow(struct tls_sw_context_rx *ctx)
 {
 	struct tls_strparser *strp = &ctx->strp;
+	struct tls_context *tls_ctx = tls_get_ctx(strp->sk);
 	struct sk_buff *skb;
 
 	if (strp->copy_mode)
@@ -132,7 +133,7 @@ int tls_strp_msg_cow(struct tls_sw_context_rx *ctx)
 	tls_strp_anchor_free(strp);
 	strp->anchor = skb;
 
-	tcp_read_done(strp->sk, strp->stm.full_len);
+	tls_ctx->ops->read_done(strp->sk, strp->stm.full_len);
 	strp->copy_mode = 1;
 
 	return 0;
@@ -390,6 +391,7 @@ static int tls_strp_read_copyin(struct tls_strparser *strp)
 
 static int tls_strp_read_copy(struct tls_strparser *strp, bool qshort)
 {
+	struct tls_context *ctx = tls_get_ctx(strp->sk);
 	struct skb_shared_info *shinfo;
 	struct page *page;
 	int need_spc, len;
@@ -398,7 +400,7 @@ static int tls_strp_read_copy(struct tls_strparser *strp, bool qshort)
 	 * to read the data out. Otherwise the connection will stall.
 	 * Without pressure threshold of INT_MAX will never be ready.
 	 */
-	if (likely(qshort && !tcp_epollin_ready(strp->sk, INT_MAX)))
+	if (likely(qshort && !ctx->ops->epollin_ready(strp->sk)))
 		return 0;
 
 	shinfo = skb_shinfo(strp->anchor);
@@ -434,12 +436,13 @@ static int tls_strp_read_copy(struct tls_strparser *strp, bool qshort)
 static bool tls_strp_check_queue_ok(struct tls_strparser *strp)
 {
 	unsigned int len = strp->stm.offset + strp->stm.full_len;
+	struct tls_context *ctx = tls_get_ctx(strp->sk);
 	struct sk_buff *first, *skb;
 	u32 seq;
 
 	first = skb_shinfo(strp->anchor)->frag_list;
 	skb = first;
-	seq = TCP_SKB_CB(first)->seq;
+	seq = ctx->ops->get_skb_seq(first);
 
 	/* Make sure there's no duplicate data in the queue,
 	 * and the decrypted status matches.
@@ -449,7 +452,7 @@ static bool tls_strp_check_queue_ok(struct tls_strparser *strp)
 		len -= skb->len;
 		skb = skb->next;
 
-		if (TCP_SKB_CB(skb)->seq != seq)
+		if (ctx->ops->get_skb_seq(skb) != seq)
 			return false;
 		if (skb_cmp_decrypted(first, skb))
 			return false;
@@ -460,11 +463,11 @@ static bool tls_strp_check_queue_ok(struct tls_strparser *strp)
 
 static void tls_strp_load_anchor_with_queue(struct tls_strparser *strp, int len)
 {
-	struct tcp_sock *tp = tcp_sk(strp->sk);
+	struct tls_context *ctx = tls_get_ctx(strp->sk);
 	struct sk_buff *first;
 	u32 offset;
 
-	first = tcp_recv_skb(strp->sk, tp->copied_seq, &offset);
+	first = ctx->ops->recv_skb(strp->sk, &offset);
 	if (WARN_ON_ONCE(!first))
 		return;
 
@@ -565,6 +568,11 @@ void tls_strp_check_rcv(struct tls_strparser *strp, bool announce)
 /* Lower sock lock held */
 void tls_strp_data_ready(struct tls_strparser *strp)
 {
+	struct tls_context *ctx = tls_get_ctx(strp->sk);
+
+	if (!ctx)
+		return;
+
 	/* This check is needed to synchronize with do_tls_strp_work.
 	 * do_tls_strp_work acquires a process lock (lock_sock) whereas
 	 * the lock held here is bh_lock_sock. The two locks can be
@@ -572,7 +580,7 @@ void tls_strp_data_ready(struct tls_strparser *strp)
 	 * allows a thread in BH context to safely check if the process
 	 * lock is held. In this case, if the lock is held, queue work.
 	 */
-	if (sock_owned_by_user_nocheck(strp->sk)) {
+	if (ctx->ops->lock_is_held(strp->sk)) {
 		queue_work(tls_strp_wq, &strp->work);
 		return;
 	}
@@ -597,10 +605,12 @@ static void tls_strp_work(struct work_struct *w)
  */
 void tls_strp_msg_consume(struct tls_strparser *strp)
 {
+	struct tls_context *ctx = tls_get_ctx(strp->sk);
+
 	WARN_ON(!strp->stm.full_len);
 
 	if (likely(!strp->copy_mode))
-		tcp_read_done(strp->sk, strp->stm.full_len);
+		ctx->ops->read_done(strp->sk, strp->stm.full_len);
 	else
 		tls_strp_flush_anchor_copy(strp);
 
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 35fb0c3c965a..71ab9763b1ed 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -2171,7 +2171,8 @@ int tls_rx_msg_size(struct tls_strparser *strp, struct sk_buff *skb)
 	}
 
 	/* Linearize header to local buffer */
-	ret = skb_copy_bits(skb, strp->stm.offset, header, prot->prepend_size);
+	ret = tls_ctx->ops->skb_get_header(skb, strp->stm.offset, header,
+					   prot->prepend_size);
 	if (ret < 0)
 		goto read_failure;
 
@@ -2202,7 +2203,8 @@ int tls_rx_msg_size(struct tls_strparser *strp, struct sk_buff *skb)
 	}
 
 	tls_device_rx_resync_new_rec(strp->sk, data_len + TLS_HEADER_SIZE,
-				     TCP_SKB_CB(skb)->seq + strp->stm.offset);
+				     tls_ctx->ops->get_skb_seq(skb) +
+				     strp->stm.offset);
 	return data_len + TLS_HEADER_SIZE;
 
 read_failure:
-- 
2.53.0


^ permalink raw reply related

* [RFC net-next 11/17] mptcp: implement mptcp-specific tls protocol ops
From: Geliang Tang @ 2026-06-22 10:43 UTC (permalink / raw)
  To: Matthieu Baerts, Mat Martineau, Geliang Tang, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Neal Cardwell, Kuniyuki Iwashima, John Fastabend, Sabrina Dubroca,
	Hannes Reinecke
  Cc: Geliang Tang, netdev, mptcp, Gang Yan, Zqiang
In-Reply-To: <cover.1782123118.git.tanggeliang@kylinos.cn>

From: Geliang Tang <tanggeliang@kylinos.cn>

This patch implements the MPTCP-specific struct tls_prot_ops, named
'tls_mptcp_ops'.

Passing an MPTCP socket to tcp_sock_rate_check_app_limited() can
trigger a crash. Here, an MPTCP version of check_app_limited() is
implemented, which calls tcp_sock_rate_check_app_limited() for each
subflow.

The MPTCP implementation of the lock_is_held interface not only checks
sock_owned_by_user_nocheck(sk), as TCP does, but also checks whether
the MPTCP data lock is held. This is required because TLS may call
lock_is_held from softirq context with bh_lock_sock held. Checking
both conditions ensures that TLS always defers to the workqueue when
the MPTCP data lock is held, avoiding a deadlock.

Implement mptcp_skb_get_header() to handle fragmented MPTCP skbs when
copying TLS record headers.

In tls_strp_read_sock(), tls_strp_load_anchor_with_queue() first
attaches the skbs from TCP/MPTCP to the frag_list of strp->anchor.
In TCP, this is fine because the skb data is contiguous; in MPTCP,
however, each skb has its own offset, so the data is not contiguous.
As a result, when tls_rx_msg_size() runs afterwards, skb_copy_bits()
may read across skbs and, for MPTCP, ignore the offset of the second
skb, which leads to data access errors. mptcp_skb_get_header() instead
walks the frag_list and applies each skb's own offset, so the correct
TLS header is obtained.

In the later process, tls_strp_check_queue_ok() handles the copy_mode
scenario. When an MPTCP skb has a non-zero offset, it falls back to
copy_mode, copying the valid data from each skb one by one into
anchor->frag_list, thus resolving the offset issue. Hence, the impact
of the offset within the TLS module is completely eliminated.

Co-developed-by: Gang Yan <yangang@kylinos.cn>
Signed-off-by: Gang Yan <yangang@kylinos.cn>
Co-developed-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
---
 include/net/mptcp.h  |   2 +
 include/net/tcp.h    |   1 +
 net/ipv4/tcp.c       |   9 +++-
 net/mptcp/protocol.c | 113 +++++++++++++++++++++++++++++++++++++++++++
 net/mptcp/protocol.h |   1 +
 net/tls/tls_main.c   |  13 +++++
 6 files changed, 137 insertions(+), 2 deletions(-)

diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index 333bde2a0b76..ba2257986b13 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -132,6 +132,8 @@ struct mptcp_pm_ops {
 	void (*release)(struct mptcp_sock *msk);
 } ____cacheline_aligned_in_smp;
 
+extern struct tls_prot_ops tls_mptcp_ops;
+
 #ifdef CONFIG_MPTCP
 void mptcp_init(void);
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6d376ea4d1c0..ac823492d3e4 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -849,6 +849,7 @@ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
 
 /* tcp.c */
 void tcp_get_info(struct sock *, struct tcp_info *);
+void tcp_sock_rate_check_app_limited(struct tcp_sock *tp);
 void tcp_rate_check_app_limited(struct sock *sk);
 
 /* Read 'sendfile()'-style from a TCP socket */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b427f924608c..b875be6ae5bc 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1096,9 +1096,9 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
 }
 
 /* If a gap is detected between sends, mark the socket application-limited. */
-void tcp_rate_check_app_limited(struct sock *sk)
+void tcp_sock_rate_check_app_limited(struct tcp_sock *tp)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
+	struct sock *sk = (struct sock *)tp;
 
 	if (/* We have less than one packet to send. */
 	    tp->write_seq - tp->snd_nxt < tp->mss_cache &&
@@ -1111,6 +1111,11 @@ void tcp_rate_check_app_limited(struct sock *sk)
 		tp->app_limited =
 			(tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
 }
+
+void tcp_rate_check_app_limited(struct sock *sk)
+{
+	tcp_sock_rate_check_app_limited(tcp_sk(sk));
+}
 EXPORT_SYMBOL_GPL(tcp_rate_check_app_limited);
 
 int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 18c8b6c64c3f..f4cd7a6e5770 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -24,6 +24,7 @@
 #include <net/mptcp.h>
 #include <net/hotdata.h>
 #include <net/xfrm.h>
+#include <net/tls.h>
 #include <asm/ioctls.h>
 #include "protocol.h"
 #include "mib.h"
@@ -4894,3 +4895,115 @@ int __init mptcp_proto_v6_init(void)
 	return err;
 }
 #endif
+
+static bool mptcp_lock_is_held(struct sock *sk)
+{
+	return sock_owned_by_user_nocheck(sk) ||
+	       mptcp_data_is_locked(sk);
+}
+
+static void mptcp_read_done(struct sock *sk, size_t len)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+	struct sk_buff *skb;
+	size_t left;
+	u32 offset;
+
+	msk_owned_by_me(msk);
+
+	if (sk->sk_state == TCP_LISTEN)
+		return;
+
+	left = len;
+	while (left && (skb = mptcp_recv_skb(sk, &offset)) != NULL) {
+		int used;
+
+		used = min_t(size_t, skb->len - offset, left);
+		msk->bytes_consumed += used;
+		MPTCP_SKB_CB(skb)->offset += used;
+		MPTCP_SKB_CB(skb)->map_seq += used;
+		left -= used;
+
+		if (skb->len > offset + used)
+			break;
+
+		mptcp_eat_recv_skb(sk, skb);
+	}
+
+	mptcp_rcv_space_adjust(msk, len - left);
+
+	/* Clean up data we have read: This will do ACK frames. */
+	if (left != len)
+		mptcp_cleanup_rbuf(msk, len - left);
+}
+
+static u32 mptcp_get_skb_seq(struct sk_buff *skb)
+{
+	return MPTCP_SKB_CB(skb)->map_seq - MPTCP_SKB_CB(skb)->offset;
+}
+
+static int mptcp_skb_get_header(const struct sk_buff *skb, int off,
+				void *buf, int len)
+{
+	const struct sk_buff *iter = skb_shinfo(skb)->frag_list;
+	int copied = 0;
+	int ret = 0;
+
+	if (!iter)
+		return skb_copy_bits(skb, off, buf, len);
+
+	/* Make absolute to positive */
+	off -= MPTCP_SKB_CB(iter)->offset;
+
+	while (iter && copied < len) {
+		int skb_off  = MPTCP_SKB_CB(iter)->offset;
+		int data_len = iter->len - skb_off;
+		int count;
+
+		if (off >= data_len) {
+			off -= data_len; /* MPTCP skb avail data */
+			iter = iter->next;
+			continue;
+		}
+
+		count = min((int)(data_len - off), len - copied);
+		ret = skb_copy_bits(iter, skb_off + off, buf + copied, count);
+		if (ret)
+			break;
+		copied += count;
+		off = 0;
+		iter = iter->next;
+	}
+
+	if (copied < len && !ret)
+		ret = -EFAULT;
+	return ret;
+}
+
+static void mptcp_check_app_limited(struct sock *sk)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+	struct mptcp_subflow_context *subflow;
+
+	mptcp_for_each_subflow(msk, subflow) {
+		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+		bool slow;
+
+		slow = lock_sock_fast(ssk);
+		tcp_sock_rate_check_app_limited(tcp_sk(ssk));
+		unlock_sock_fast(ssk, slow);
+	}
+}
+
+struct tls_prot_ops tls_mptcp_ops = {
+	.owner			= THIS_MODULE,
+	.protocol		= IPPROTO_MPTCP,
+	.recv_skb		= mptcp_recv_skb,
+	.lock_is_held		= mptcp_lock_is_held,
+	.read_done		= mptcp_read_done,
+	.get_skb_seq		= mptcp_get_skb_seq,
+	.skb_get_header		= mptcp_skb_get_header,
+	.epollin_ready		= mptcp_epollin_ready,
+	.check_app_limited	= mptcp_check_app_limited,
+};
+EXPORT_SYMBOL(tls_mptcp_ops);
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index da40c6f3705f..6dea626348d9 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -380,6 +380,7 @@ struct mptcp_sock {
 
 #define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock)
 #define mptcp_data_unlock(sk) spin_unlock_bh(&(sk)->sk_lock.slock)
+#define mptcp_data_is_locked(sk) spin_is_locked(&(sk)->sk_lock.slock)
 
 #define mptcp_for_each_subflow(__msk, __subflow)			\
 	list_for_each_entry(__subflow, &((__msk)->conn_list), node)
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index b45890e75c9e..170ccbb9d36d 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -1366,6 +1366,12 @@ static int __init tls_register(void)
 	if (err)
 		goto err_strp;
 
+#ifdef CONFIG_MPTCP
+	err = tls_register_prot_ops(&tls_mptcp_ops);
+	if (err)
+		goto err_tcp;
+#endif
+
 	err = tls_device_init();
 	if (err)
 		goto err_ops;
@@ -1374,6 +1380,10 @@ static int __init tls_register(void)
 
 	return 0;
 err_ops:
+#ifdef CONFIG_MPTCP
+	tls_unregister_prot_ops(&tls_mptcp_ops);
+err_tcp:
+#endif
 	tls_unregister_prot_ops(&tls_tcp_ops);
 err_strp:
 	tls_strp_dev_exit();
@@ -1385,6 +1395,9 @@ static int __init tls_register(void)
 static void __exit tls_unregister(void)
 {
 	tcp_unregister_ulp(&tcp_tls_ulp_ops);
+#ifdef CONFIG_MPTCP
+	tls_unregister_prot_ops(&tls_mptcp_ops);
+#endif
 	tls_unregister_prot_ops(&tls_tcp_ops);
 	tls_strp_dev_exit();
 	tls_device_cleanup();
-- 
2.53.0


^ permalink raw reply related

* [RFC net-next 12/17] tls: add mptcp support for sk_poll
From: Geliang Tang @ 2026-06-22 10:43 UTC (permalink / raw)
  To: Matthieu Baerts, Mat Martineau, Geliang Tang, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Neal Cardwell, Kuniyuki Iwashima, John Fastabend, Sabrina Dubroca,
	Hannes Reinecke
  Cc: Geliang Tang, netdev, mptcp, Gang Yan, Zqiang
In-Reply-To: <cover.1782123118.git.tanggeliang@kylinos.cn>

From: Geliang Tang <tanggeliang@kylinos.cn>

The tls_sk_poll() function currently uses tcp_poll() unconditionally
to obtain the base poll mask, which works only for TCP. This prevents
TLS over MPTCP from working correctly with poll().

Make the poll function protocol-aware by selecting the appropriate
poll function based on sk->sk_protocol. For TCP it calls tcp_poll(),
for MPTCP it calls mptcp_poll() (guarded by CONFIG_MPTCP). Any other
protocol returns 0.

Also export mptcp_poll() symbol so that the TLS module can use it.

Co-developed-by: Gang Yan <yangang@kylinos.cn>
Signed-off-by: Gang Yan <yangang@kylinos.cn>
Co-developed-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
---
 include/net/mptcp.h  |  9 +++++++++
 net/mptcp/protocol.c |  5 +++--
 net/tls/tls_main.c   | 17 ++++++++++++++++-
 3 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index ba2257986b13..b0a172c38891 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -239,6 +239,9 @@ static inline __be32 mptcp_reset_option(const struct sk_buff *skb)
 }
 
 void mptcp_active_detect_blackhole(struct sock *sk, bool expired);
+
+__poll_t mptcp_poll(struct file *file, struct socket *sock,
+		    struct poll_table_struct *wait);
 #else
 
 static inline void mptcp_init(void)
@@ -316,6 +319,12 @@ static inline struct request_sock *mptcp_subflow_reqsk_alloc(const struct reques
 static inline __be32 mptcp_reset_option(const struct sk_buff *skb)  { return htonl(0u); }
 
 static inline void mptcp_active_detect_blackhole(struct sock *sk, bool expired) { }
+
+static inline __poll_t mptcp_poll(struct file *file, struct socket *sock,
+				  struct poll_table_struct *wait)
+{
+	return 0;
+}
 #endif /* CONFIG_MPTCP */
 
 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index f4cd7a6e5770..169bd468f212 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -4446,8 +4446,8 @@ static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
 	return 0;
 }
 
-static __poll_t mptcp_poll(struct file *file, struct socket *sock,
-			   struct poll_table_struct *wait)
+__poll_t mptcp_poll(struct file *file, struct socket *sock,
+		    struct poll_table_struct *wait)
 {
 	struct sock *sk = sock->sk;
 	struct mptcp_sock *msk;
@@ -4494,6 +4494,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
 
 	return mask;
 }
+EXPORT_SYMBOL_GPL(mptcp_poll);
 
 static struct sk_buff *mptcp_recv_skb(struct sock *sk, u32 *off)
 {
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 170ccbb9d36d..fa9fda3480da 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -416,6 +416,21 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
 		tls_ctx_free(sk, ctx);
 }
 
+static __poll_t tls_proto_poll(struct file *file, struct socket *sock,
+			       struct poll_table_struct *wait)
+{
+	switch (sock->sk->sk_protocol) {
+	case IPPROTO_TCP:
+		return tcp_poll(file, sock, wait);
+#ifdef CONFIG_MPTCP
+	case IPPROTO_MPTCP:
+		return mptcp_poll(file, sock, wait);
+#endif
+	default:
+		return 0;
+	}
+}
+
 static __poll_t tls_sk_poll(struct file *file, struct socket *sock,
 			    struct poll_table_struct *wait)
 {
@@ -426,7 +441,7 @@ static __poll_t tls_sk_poll(struct file *file, struct socket *sock,
 	u8 shutdown;
 	int state;
 
-	mask = tcp_poll(file, sock, wait);
+	mask = tls_proto_poll(file, sock, wait);
 
 	state = inet_sk_state_load(sk);
 	shutdown = READ_ONCE(sk->sk_shutdown);
-- 
2.53.0


^ permalink raw reply related

* [RFC net-next 13/17] tls: disable device offload for mptcp sockets
From: Geliang Tang @ 2026-06-22 10:43 UTC (permalink / raw)
  To: Matthieu Baerts, Mat Martineau, Geliang Tang, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Neal Cardwell, Kuniyuki Iwashima, John Fastabend, Sabrina Dubroca,
	Hannes Reinecke
  Cc: Geliang Tang, netdev, mptcp, Gang Yan, Zqiang
In-Reply-To: <cover.1782123118.git.tanggeliang@kylinos.cn>

From: Geliang Tang <tanggeliang@kylinos.cn>

MPTCP TLS hardware offload is not yet implemented. Return -EOPNOTSUPP
when attempting to enable device offload on MPTCP sockets.

Co-developed-by: Gang Yan <yangang@kylinos.cn>
Signed-off-by: Gang Yan <yangang@kylinos.cn>
Co-developed-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
---
 net/tls/tls_device.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index c44a59d9d715..e535edc23d0d 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -1074,6 +1074,9 @@ int tls_set_device_offload(struct sock *sk)
 	ctx = tls_get_ctx(sk);
 	prot = &ctx->prot_info;
 
+	if (sk->sk_protocol == IPPROTO_MPTCP)
+		return -EOPNOTSUPP;
+
 	if (ctx->priv_ctx_tx)
 		return -EEXIST;
 
@@ -1196,6 +1199,9 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx)
 	struct net_device *netdev;
 	int rc = 0;
 
+	if (sk->sk_protocol == IPPROTO_MPTCP)
+		return -EOPNOTSUPP;
+
 	if (ctx->crypto_recv.info.version != TLS_1_2_VERSION)
 		return -EOPNOTSUPP;
 
-- 
2.53.0


^ permalink raw reply related

* [RFC net-next 14/17] mptcp: update mptcp_check_readable helper
From: Geliang Tang @ 2026-06-22 10:43 UTC (permalink / raw)
  To: Matthieu Baerts, Mat Martineau, Geliang Tang, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Neal Cardwell, Kuniyuki Iwashima, John Fastabend, Sabrina Dubroca,
	Hannes Reinecke
  Cc: Gang Yan, netdev, mptcp
In-Reply-To: <cover.1782123118.git.tanggeliang@kylinos.cn>

From: Gang Yan <yangang@kylinos.cn>

This patch aligns mptcp_check_readable() with TCP and renames it to
mptcp_stream_is_readable(): it now returns a bool and, when
mptcp_epollin_ready() is false, falls back to sk_is_readable(). This is
needed for KTLS: because 'prot' will be modified,
tls_sw_sock_is_readable() is expected to be called through
prot->sock_is_readable().

Co-developed-by: Geliang Tang <geliang@kernel.org>
Signed-off-by: Geliang Tang <geliang@kernel.org>
Signed-off-by: Gang Yan <yangang@kylinos.cn>
---
 net/mptcp/protocol.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 169bd468f212..4951b1dd013b 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -3410,9 +3410,11 @@ void __mptcp_unaccepted_force_close(struct sock *sk)
 	__mptcp_destroy_sock(sk);
 }
 
-static __poll_t mptcp_check_readable(struct sock *sk)
+static bool mptcp_stream_is_readable(struct sock *sk)
 {
-	return mptcp_epollin_ready(sk) ? EPOLLIN | EPOLLRDNORM : 0;
+	if (mptcp_epollin_ready(sk))
+		return true;
+	return sk_is_readable(sk);
 }
 
 static void mptcp_check_listen_stop(struct sock *sk)
@@ -4476,7 +4478,8 @@ __poll_t mptcp_poll(struct file *file, struct socket *sock,
 		mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
 
 	if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
-		mask |= mptcp_check_readable(sk);
+		if (mptcp_stream_is_readable(sk))
+			mask |= EPOLLIN | EPOLLRDNORM;
 		if (shutdown & SEND_SHUTDOWN)
 			mask |= EPOLLOUT | EPOLLWRNORM;
 		else
-- 
2.53.0


^ permalink raw reply related

* [RFC net-next 15/17] mptcp: implement ulp getsockopt for tls support
From: Geliang Tang @ 2026-06-22 10:43 UTC (permalink / raw)
  To: Matthieu Baerts, Mat Martineau, Geliang Tang, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Neal Cardwell, Kuniyuki Iwashima, John Fastabend, Sabrina Dubroca,
	Hannes Reinecke
  Cc: Geliang Tang, netdev, mptcp, Gang Yan, Zqiang
In-Reply-To: <cover.1782123118.git.tanggeliang@kylinos.cn>

From: Geliang Tang <tanggeliang@kylinos.cn>

Add mptcp_getsockopt_tcp_ulp() to handle TCP_ULP getsockopt on MPTCP
sockets. The helper reads the user-supplied length once, rejects a
negative value with -EINVAL, takes the socket lock, caps the length to
TCP_ULP_NAME_MAX, and copies the ULP name to userspace (or sets the
length to zero if no ULP is attached). The lock ensures safe access to
icsk->icsk_ulp_ops.

Co-developed-by: Gang Yan <yangang@kylinos.cn>
Signed-off-by: Gang Yan <yangang@kylinos.cn>
Co-developed-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
---
 net/mptcp/sockopt.c | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index fcf6feb2a9eb..cc45491cd3b2 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -1408,6 +1408,39 @@ static int mptcp_put_int_option(struct mptcp_sock *msk, char __user *optval,
 	return 0;
 }
 
+static int mptcp_getsockopt_tcp_ulp(struct sock *sk,
+				    char __user *optval,
+				    int __user *optlen)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	int ret = 0, len;
+
+	if (copy_from_sockptr(&len, USER_SOCKPTR(optlen), sizeof(int)))
+		return -EFAULT;
+
+	if (len < 0)
+		return -EINVAL;
+
+	lock_sock(sk);
+	len = min_t(unsigned int, len, TCP_ULP_NAME_MAX);
+	if (!icsk->icsk_ulp_ops) {
+		len = 0;
+		if (copy_to_sockptr(USER_SOCKPTR(optlen), &len, sizeof(int)))
+			ret = -EFAULT;
+		goto out;
+	}
+	if (copy_to_sockptr(USER_SOCKPTR(optlen), &len, sizeof(int))) {
+		ret = -EFAULT;
+		goto out;
+	}
+	if (copy_to_sockptr(USER_SOCKPTR(optval), icsk->icsk_ulp_ops->name,
+			    len))
+		ret = -EFAULT;
+out:
+	release_sock(sk);
+	return ret;
+}
+
 static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
 				    char __user *optval, int __user *optlen)
 {
@@ -1415,6 +1448,7 @@ static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
 
 	switch (optname) {
 	case TCP_ULP:
+		return mptcp_getsockopt_tcp_ulp(sk, optval, optlen);
 	case TCP_CONGESTION:
 	case TCP_INFO:
 	case TCP_CC_INFO:
-- 
2.53.0


^ permalink raw reply related

* [RFC net-next 16/17] mptcp: implement ulp setsockopt for tls support
From: Geliang Tang @ 2026-06-22 10:43 UTC (permalink / raw)
  To: Matthieu Baerts, Mat Martineau, Geliang Tang, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Neal Cardwell, Kuniyuki Iwashima, John Fastabend, Sabrina Dubroca,
	Hannes Reinecke
  Cc: Geliang Tang, netdev, mptcp, Gang Yan, Zqiang
In-Reply-To: <cover.1782123118.git.tanggeliang@kylinos.cn>

From: Geliang Tang <tanggeliang@kylinos.cn>

Allow MPTCP sockets to set the TCP_ULP socket option to enable TLS.
Add mptcp_setsockopt_tcp_ulp() which validates the socket state (must
not be CLOSE or LISTEN), only accepts "tls" as the ULP name, and then
calls tcp_set_ulp().

Include TCP_ULP in the list of supported options in
mptcp_supported_sockopt(), and handle it in mptcp_setsockopt_sol_tcp()
instead of returning -EOPNOTSUPP.

Call tcp_cleanup_ulp() in mptcp_destroy(), right after
mptcp_destroy_common(), to release the ULP module's reference count.

Co-developed-by: Gang Yan <yangang@kylinos.cn>
Signed-off-by: Gang Yan <yangang@kylinos.cn>
Co-developed-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
---
 net/mptcp/protocol.c |  1 +
 net/mptcp/sockopt.c  | 34 +++++++++++++++++++++++++++++++++-
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 4951b1dd013b..a13acee67688 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -3765,6 +3765,7 @@ static void mptcp_destroy(struct sock *sk)
 	/* allow the following to close even the initial subflow */
 	msk->free_first = 1;
 	mptcp_destroy_common(msk);
+	tcp_cleanup_ulp(sk);
 	sk_sockets_allocated_dec(sk);
 }
 
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index cc45491cd3b2..eeb348336195 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -577,6 +577,7 @@ static bool mptcp_supported_sockopt(int level, int optname)
 		case TCP_FASTOPEN_CONNECT:
 		case TCP_FASTOPEN_KEY:
 		case TCP_FASTOPEN_NO_COOKIE:
+		case TCP_ULP:
 			return true;
 		}
 
@@ -830,6 +831,37 @@ static int mptcp_setsockopt_all_sf(struct mptcp_sock *msk, int level,
 	return ret;
 }
 
+static int mptcp_setsockopt_tcp_ulp(struct sock *sk, sockptr_t optval,
+				    unsigned int optlen)
+{
+	char name[TCP_ULP_NAME_MAX];
+	int err = 0;
+	size_t len;
+	int val;
+
+	if (optlen < 1)
+		return -EINVAL;
+
+	len = min_t(long, TCP_ULP_NAME_MAX - 1, optlen);
+	val = strncpy_from_sockptr(name, optval, len);
+	if (val < 0)
+		return -EFAULT;
+	name[val] = 0;
+
+	if (strcmp(name, "tls"))
+		return -EOPNOTSUPP;
+
+	sockopt_lock_sock(sk);
+	if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) {
+		err = -ENOTCONN;
+		goto out;
+	}
+	err = tcp_set_ulp(sk, name);
+out:
+	sockopt_release_sock(sk);
+	return err;
+}
+
 static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
 				    sockptr_t optval, unsigned int optlen)
 {
@@ -838,7 +870,7 @@ static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
 
 	switch (optname) {
 	case TCP_ULP:
-		return -EOPNOTSUPP;
+		return mptcp_setsockopt_tcp_ulp(sk, optval, optlen);
 	case TCP_CONGESTION:
 		return mptcp_setsockopt_sol_tcp_congestion(msk, optval, optlen);
 	case TCP_DEFER_ACCEPT:
-- 
2.53.0


^ permalink raw reply related

* [RFC net-next 17/17] selftests: mptcp: connect: use espintcp for ulp test
From: Geliang Tang @ 2026-06-22 10:43 UTC (permalink / raw)
  To: Matthieu Baerts, Mat Martineau, Geliang Tang, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Neal Cardwell, Kuniyuki Iwashima, John Fastabend, Sabrina Dubroca,
	Hannes Reinecke
  Cc: Geliang Tang, netdev, mptcp, Gang Yan, Zqiang
In-Reply-To: <cover.1782123118.git.tanggeliang@kylinos.cn>

From: Geliang Tang <tanggeliang@kylinos.cn>

With KTLS now implemented, "tls" can no longer be used in
sock_test_tcpulp(): the test expects setting the ULP to fail, so using
"tls" breaks the mptcp_connect.sh tests. This patch uses another ULP
name, "espintcp", instead.

Co-developed-by: Gang Yan <yangang@kylinos.cn>
Signed-off-by: Gang Yan <yangang@kylinos.cn>
Co-developed-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
---
 tools/testing/selftests/net/mptcp/config          | 4 ++++
 tools/testing/selftests/net/mptcp/mptcp_connect.c | 4 ++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/net/mptcp/config b/tools/testing/selftests/net/mptcp/config
index 59051ee2a986..f48bd5183fb3 100644
--- a/tools/testing/selftests/net/mptcp/config
+++ b/tools/testing/selftests/net/mptcp/config
@@ -34,3 +34,7 @@ CONFIG_NFT_SOCKET=m
 CONFIG_NFT_TPROXY=m
 CONFIG_SYN_COOKIES=y
 CONFIG_VETH=y
+CONFIG_INET_ESP=y
+CONFIG_INET_ESPINTCP=y
+CONFIG_INET6_ESP=y
+CONFIG_INET6_ESPINTCP=y
diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c
index cbe573c4ab3a..299a7a02d6f5 100644
--- a/tools/testing/selftests/net/mptcp/mptcp_connect.c
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c
@@ -285,11 +285,11 @@ static void sock_test_tcpulp(int sock, int proto, unsigned int line)
 	if (buflen > 0) {
 		if (strcmp(buf, "mptcp") != 0)
 			xerror("unexpected ULP '%s' for proto %d at line %u", buf, proto, line);
-		ret = do_ulp_so(sock, "tls");
+		ret = do_ulp_so(sock, "espintcp");
 		if (ret == 0)
 			X("setsockopt");
 	} else if (proto == IPPROTO_MPTCP) {
-		ret = do_ulp_so(sock, "tls");
+		ret = do_ulp_so(sock, "espintcp");
 		if (ret != -1)
 			X("setsockopt");
 	}
-- 
2.53.0


^ permalink raw reply related

* Re: [PATCH v3 0/7] Prepare mutable list iterators to cache cursor state
From: Andy Shevchenko @ 2026-06-22 10:46 UTC (permalink / raw)
  To: Kaitao Cheng
  Cc: Alexei Starovoitov, Andrew Morton, David Hildenbrand, Jens Axboe,
	Tejun Heo, Alexander Viro, Christian Brauner, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Johannes Weiner, Peter Zijlstra,
	Ingo Molnar, Arnaldo Carvalho de Melo, Namhyung Kim,
	Thomas Gleixner, Juri Lelli, Vincent Guittot, Paul Moore,
	Paul E. McKenney, Shakeel Butt, Christian König,
	David Howells, Simona Vetter, Randy Dunlap, Luca Ceresoli,
	Philipp Stanner, linux-block, LKML,
	open list:CONTROL GROUP (CGROUP), linux-ntfs-dev, Linux-Fsdevel,
	io-uring, audit, bpf, Network Development, dri-devel,
	linux-perf-use., linux-trace-kernel, kexec, live-patching,
	linux-modules, Linux Crypto Mailing List, Linux Power Management,
	rcu, sched-ext, linux-mm, virtualization, damon,
	clang-built-linux, chengkaitao, Muchun Song
In-Reply-To: <8c8f1849-86d3-4c69-be27-30bbdffdf616@linux.dev>

On Mon, Jun 22, 2026 at 02:15:01PM +0800, Kaitao Cheng wrote:
> 在 2026/6/22 13:28, Alexei Starovoitov 写道:
> > On Sun, Jun 21, 2026 at 9:06 PM Kaitao Cheng <kaitao.cheng@linux.dev> wrote:

...

> >>  block/bfq-iosched.c                 |  17 +-
> >>  block/blk-cgroup.c                  |  12 +-
> >>  block/blk-flush.c                   |   4 +-
> >>  block/blk-iocost.c                  |  18 +-
> >>  block/blk-mq.c                      |   8 +-
> >>  block/blk-throttle.c                |   4 +-
> >>  block/kyber-iosched.c               |   4 +-
> >>  block/partitions/ldm.c              |   8 +-
> >>  block/sed-opal.c                    |   4 +-
> >>  include/linux/list.h                | 269 ++++++++++++++++++++++++----
> >>  include/linux/llist.h               |  81 +++++++--
> >>  init/initramfs.c                    |   5 +-
> >>  io_uring/cancel.c                   |   6 +-
> >>  io_uring/poll.c                     |   3 +-
> >>  io_uring/rw.c                       |   4 +-
> >>  io_uring/timeout.c                  |   8 +-
> >>  io_uring/uring_cmd.c                |   3 +-
> >>  kernel/audit_tree.c                 |   4 +-
> >>  kernel/audit_watch.c                |  16 +-
> >>  kernel/auditfilter.c                |   4 +-
> >>  kernel/auditsc.c                    |   4 +-
> >>  kernel/bpf/arena.c                  |  10 +-
> >>  kernel/bpf/arraymap.c               |   8 +-
> >>  kernel/bpf/bpf_local_storage.c      |   3 +-
> >>  kernel/bpf/bpf_lru_list.c           |  25 ++-
> >>  kernel/bpf/btf.c                    |  18 +-
> >>  kernel/bpf/cgroup.c                 |   7 +-
> >>  kernel/bpf/cpumap.c                 |   4 +-
> >>  kernel/bpf/devmap.c                 |  10 +-
> >>  kernel/bpf/helpers.c                |   8 +-
> >>  kernel/bpf/local_storage.c          |   4 +-
> >>  kernel/bpf/memalloc.c               |  16 +-
> >>  kernel/bpf/offload.c                |   8 +-
> >>  kernel/bpf/states.c                 |   4 +-
> >>  kernel/bpf/stream.c                 |   4 +-
> >>  kernel/bpf/verifier.c               |   6 +-
> >>  kernel/cgroup/cgroup-v1.c           |   4 +-
> >>  kernel/cgroup/cgroup.c              |  54 +++---
> >>  kernel/cgroup/dmem.c                |  12 +-
> >>  kernel/cgroup/rdma.c                |   8 +-
> >>  kernel/events/core.c                |  44 +++--
> >>  kernel/events/uprobes.c             |  12 +-
> >>  kernel/exit.c                       |   8 +-
> >>  kernel/fail_function.c              |   4 +-
> >>  kernel/gcov/clang.c                 |   4 +-
> >>  kernel/irq_work.c                   |   4 +-
> >>  kernel/kexec_core.c                 |   4 +-
> >>  kernel/kprobes.c                    |  16 +-
> >>  kernel/livepatch/core.c             |   4 +-
> >>  kernel/livepatch/core.h             |   4 +-
> >>  kernel/liveupdate/kho_block.c       |   4 +-
> >>  kernel/liveupdate/luo_flb.c         |   4 +-
> >>  kernel/locking/rwsem.c              |   2 +-
> >>  kernel/locking/test-ww_mutex.c      |   2 +-
> >>  kernel/module/main.c                |  11 +-
> >>  kernel/padata.c                     |   4 +-
> >>  kernel/power/snapshot.c             |   8 +-
> >>  kernel/power/wakelock.c             |   4 +-
> >>  kernel/printk/printk.c              |  11 +-
> >>  kernel/ptrace.c                     |   4 +-
> >>  kernel/rcu/rcutorture.c             |   3 +-
> >>  kernel/rcu/tasks.h                  |   9 +-
> >>  kernel/rcu/tree.c                   |   6 +-
> >>  kernel/resource.c                   |   4 +-
> >>  kernel/sched/core.c                 |   4 +-
> >>  kernel/sched/ext.c                  |  22 +--
> >>  kernel/sched/fair.c                 |  28 +--
> >>  kernel/sched/topology.c             |   4 +-
> >>  kernel/sched/wait.c                 |   4 +-
> >>  kernel/seccomp.c                    |   4 +-
> >>  kernel/signal.c                     |  11 +-
> >>  kernel/smp.c                        |   4 +-
> >>  kernel/taskstats.c                  |   8 +-
> >>  kernel/time/clockevents.c           |   6 +-
> >>  kernel/time/clocksource.c           |   4 +-
> >>  kernel/time/posix-cpu-timers.c      |   4 +-
> >>  kernel/time/posix-timers.c          |   3 +-
> >>  kernel/torture.c                    |   3 +-
> >>  kernel/trace/bpf_trace.c            |   4 +-
> >>  kernel/trace/ftrace.c               |  49 +++--
> >>  kernel/trace/ring_buffer.c          |  25 ++-
> >>  kernel/trace/trace.c                |  12 +-
> >>  kernel/trace/trace_dynevent.c       |   6 +-
> >>  kernel/trace/trace_dynevent.h       |   5 +-
> >>  kernel/trace/trace_events.c         |  35 ++--
> >>  kernel/trace/trace_events_filter.c  |   4 +-
> >>  kernel/trace/trace_events_hist.c    |   8 +-
> >>  kernel/trace/trace_events_trigger.c |  17 +-
> >>  kernel/trace/trace_events_user.c    |  16 +-
> >>  kernel/trace/trace_stat.c           |   4 +-
> >>  kernel/user-return-notifier.c       |   3 +-
> >>  kernel/workqueue.c                  |  16 +-
> >>  mm/backing-dev.c                    |   8 +-
> >>  mm/balloon.c                        |   8 +-
> >>  mm/cma.c                            |   4 +-
> >>  mm/compaction.c                     |   4 +-
> >>  mm/damon/core.c                     |   4 +-
> >>  mm/damon/sysfs-schemes.c            |   4 +-
> >>  mm/dmapool.c                        |   4 +-
> >>  mm/huge_memory.c                    |   8 +-
> >>  mm/hugetlb.c                        |  56 +++---
> >>  mm/hugetlb_vmemmap.c                |  16 +-
> >>  mm/khugepaged.c                     |  14 +-
> >>  mm/kmemleak.c                       |   7 +-
> >>  mm/ksm.c                            |  25 +--
> >>  mm/list_lru.c                       |   4 +-
> >>  mm/memcontrol-v1.c                  |   8 +-
> >>  mm/memory-failure.c                 |  12 +-
> >>  mm/memory-tiers.c                   |   4 +-
> >>  mm/migrate.c                        |  23 ++-
> >>  mm/mmu_notifier.c                   |   9 +-
> >>  mm/page_alloc.c                     |   8 +-
> >>  mm/page_reporting.c                 |   2 +-
> >>  mm/percpu.c                         |  11 +-
> >>  mm/pgtable-generic.c                |   4 +-
> >>  mm/rmap.c                           |  10 +-
> >>  mm/shmem.c                          |   9 +-
> >>  mm/slab_common.c                    |  14 +-
> >>  mm/slub.c                           |  33 ++--
> >>  mm/swapfile.c                       |   4 +-
> >>  mm/userfaultfd.c                    |  12 +-
> >>  mm/vmalloc.c                        |  24 +--
> >>  mm/vmscan.c                         |   7 +-
> >>  mm/zsmalloc.c                       |   4 +-
> >>  124 files changed, 875 insertions(+), 681 deletions(-)
> > 
> > Not sure what you were thinking, but this diff stat
> > is not landable.
> 
> [PATCH v3 1/7] and [PATCH v3 2/7] contain the main logic and can
> be merged directly. They are also compatible with the old API.
> [PATCH v3 3/7] through [PATCH v3 7/7] are just simple interface
> replacements and do not change any functional logic. They can be
> left unmerged for now; individual modules can pick them up later
> if needed.
> 
> In v2, Andy Shevchenko mentioned: "If it's done by Linus himself
> during the day when he prepares -rc1, it's fine."

Yes, but you need to get his blessing first to go with this.
Have you communicated with him on this?

> Even so, the
> changes in this patch series are indeed quite large and touch
> almost every subsystem. I have only converted part of them for
> now, so I wanted to send this out first and see what people think.

That's why it's better to provide a script to convert (e.g., coccinelle)
instead of tons of patches.

-- 
With Best Regards,
Andy Shevchenko



^ permalink raw reply

* Re: [Intel-wired-lan] [PATCH net v6 0/4] Fix i40e/ice/iavf VF bonding after netdev lock changes
From: Jose Ignacio Tornos Martinez @ 2026-06-22 10:51 UTC (permalink / raw)
  To: horms
  Cc: aleksandr.loktionov, anthony.l.nguyen, davem, edumazet,
	intel-wired-lan, jacob.e.keller, jesse.brandeburg, jtornosm, kuba,
	netdev, pabeni, przemyslaw.kitszel
In-Reply-To: <20260622103159.GZ827683@horms.kernel.org>

Hello Simon,

> Unfortunately the Netdev CI was unable to apply this series cleanly against net.
> Would you be able to rebase and repost?
Sure, I'll rebase on current net and send v7 shortly.

Thanks

Best regards
José Ignacio


^ permalink raw reply

* Re: [PATCH net] octeontx2-af: Block VFs from clobbering special CGX PKIND state
From: Simon Horman @ 2026-06-22 10:51 UTC (permalink / raw)
  To: rkannoth
  Cc: Simon Horman, davem, gakula, linux-kernel, netdev, sgoutham,
	andrew+netdev, edumazet, kuba, pabeni, hkelam
In-Reply-To: <20260619041002.1773822-1-rkannoth@marvell.com>

This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.
Full review at: https://sashiko.dev
---
(Note: Due to recitation restrictions, the standard unified diff is omitted 
and the findings are summarized below instead of inline quoting.)

commit 40480a8afd593111da0b00679b782cbe187572e6
Author: Hariprasad Kelam <hkelam@marvell.com>

octeontx2-af: Block VFs from clobbering special CGX PKIND state

This commit adds checks to prevent VFs from overwriting LMAC RX PKIND or
default TX parse configs when a shared LMAC uses HIGIG2 or EDSA parsing.
It introduces helpers to verify if the PKIND configuration is permitted
before applying updates.

[Severity: High]
Can a Time-of-Check to Time-of-Use (TOCTOU) race occur between a VF checking
the LMAC PKIND state and subsequently updating it?

Looking at rvu_cgx_is_pkind_config_permitted() in rvu_cgx.c, it reads the
hardware LMAC PKIND register locklessly to check if HIGIG or EDSA parsing
is enabled.

If the check returns true, the VF proceeds to write its own default PKIND
to the LMAC via cgx_set_pkind() in nix_interface_init(), or updates the TX
parse config in rvu_mbox_handler_nix_lf_alloc() in rvu_nix.c.

Since VF mailbox messages are handled by rvu_afvf_mbox_handler() on a separate
workqueue without holding rvu->mbox_lock, could a PF executing a mailbox
message like rvu_mbox_handler_npc_set_pkind interleave between the VF's
lockless read and its subsequent write?

If so, the VF might observe the default PKIND, the PF could then set
HIGIG_PKIND or EDSA_PKIND, and the VF would then erroneously overwrite the
PF's configuration. Does this sequence need to be protected by a lock such
as cgx_cfg_lock or rsrc_lock to prevent bypassing the new protection?

^ permalink raw reply

* RE: [PATCH net v3] rtase: Workaround for TX hang caused by short UDP packets entering hardware PTP parsing
From: Justin Lai @ 2026-06-22 10:46 UTC (permalink / raw)
  To: Simon Horman
  Cc: kuba@kernel.org, davem@davemloft.net, edumazet@google.com,
	pabeni@redhat.com, andrew+netdev@lunn.ch,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	stable@vger.kernel.org, richardcochran@gmail.com,
	david.laight.linux@gmail.com, aleksander.lobakin@intel.com,
	Ping-Ke Shih, Larry Chiu
In-Reply-To: <20260617085416.841731-2-horms@kernel.org>

Simon Horman <horms@kernel.org> wrote:
> 
> This is an AI-generated review of your patch. The human sending this
> email has considered the AI review valid, or at least plausible.
> Full review at: https://netdev-ai.bots.linux.dev/sashiko/
> ---
> rtase: Workaround for TX hang caused by short UDP packets entering
> hardware PTP parsing
> 
> This patch adds a workaround for a hardware TX hang that occurs when
> short UDP packets matching destination ports 319/320 enter a PTP parsing
> path that reads beyond the packet end. The workaround pads affected
> packets so the transport data reaches RTASE_MIN_PAD_LEN before
> transmission.
> 
> > diff --git a/drivers/net/ethernet/realtek/rtase/rtase_main.c
> b/drivers/net/ethernet/realtek/rtase/rtase_main.c
> > index 55105d34bc79..4c295a39c7a0 100644
> > --- a/drivers/net/ethernet/realtek/rtase/rtase_main.c
> > +++ b/drivers/net/ethernet/realtek/rtase/rtase_main.c
> 
> [ ... ]
> 
> > +static bool rtase_get_udp_offset(struct sk_buff *skb, u32 *udp_offset)
> > +{
> > +     int no = skb_network_offset(skb);
> > +     struct ipv6hdr *i6h, _i6h;
> > +     struct iphdr *ih, _ih;
> > +
> > +     switch (vlan_get_protocol(skb)) {
> > +     case htons(ETH_P_IP):
> > +             ih = skb_header_pointer(skb, no, sizeof(_ih), &_ih);
> > +             if (!ih)
> > +                     return false;
> > +
> > +             if (ih->ihl < 5)
> > +                     return false;
> > +
> > +             if (ih->protocol != IPPROTO_UDP)
> > +                     return false;
> > +
> > +             *udp_offset = no + ih->ihl * 4;
> 
> [Low]
> Should ih->ihl also be checked against the actual data present in the
> skb? With ihl up to 15 (60 bytes), if the skb only contains the basic
> 20-byte IP header, the computed udp_offset can exceed skb->len.
> 
> Downstream in rtase_skb_pad():
> 
>         trans_data_len = skb->len - udp_offset;
> 
> trans_data_len is u32, so when udp_offset > skb->len this underflows to
> a very large value, which trivially satisfies:
> 
>         trans_data_len >= RTASE_MIN_PAD_LEN
> 
> and rtase_skb_pad() silently skips the padding. A malformed packet from
> an AF_PACKET sender with CAP_NET_RAW could craft a mismatched ihl and
> bypass the workaround. Would clamping or validating udp_offset against
> skb->len before the subtraction be appropriate?
> 
Thanks for pointing this out.
 
I'll add a check for udp_offset > skb->len before calculating
trans_data_len to avoid the potential underflow.
 
If the computed UDP offset is beyond the packet length, the packet is
malformed and cannot be safely handled by this workaround, so it should
not be transmitted.

> > +
> > +             return true;
> > +     case htons(ETH_P_IPV6):
> > +             i6h = skb_header_pointer(skb, no, sizeof(_i6h), &_i6h);
> > +             if (!i6h)
> > +                     return false;
> > +
> > +             if (i6h->nexthdr != IPPROTO_UDP)
> > +                     return false;
> > +
> > +             *udp_offset = no + sizeof(*i6h);
> > +
> > +             return true;
> > +     default:
> > +             return false;
> > +     }
> > +}
> > +
> > +static bool rtase_skb_pad(struct sk_buff *skb)
> > +{
> > +     __be16 *dest, _dest;
> > +     u32 trans_data_len;
> > +     u32 udp_offset;
> > +     u16 dest_port;
> > +     u32 pad_len;
> > +
> > +     if (!rtase_get_udp_offset(skb, &udp_offset))
> > +             return true;
> > +
> > +     trans_data_len = skb->len - udp_offset;
> > +     if (trans_data_len < offsetof(struct udphdr, len) ||
> > +         trans_data_len >= RTASE_MIN_PAD_LEN)
> > +             return true;
> 
> [Low]
> When trans_data_len is less than offsetof(struct udphdr, len) (i.e., the
> transport region is under 4 bytes), this returns true and skips the
> padding entirely.
> 
> The commit message states the hardware "may access data beyond the end
> of the packet" during PTP parsing. If the hardware also reads out of
> bounds while classifying the packet, an IPv4 last fragment with 1-3
> trailing bytes could still be misclassified as PTP based on whatever
> garbage memory the hardware happens to read, and the workaround would
> be bypassed.
> 
> The implicit assumption here seems to be "if the driver cannot read the
> dest port from the skb, the hardware cannot classify it either", which
> appears to contradict the premise that the hardware reads beyond the
> packet boundary. Should these very short transport-data cases also be
> padded to be safe?
> 


Thanks for the review.

The out-of-bounds access occurs after the packet has already matched
the PTP classification criteria (UDP destination port 319/320) and
enters the hardware PTP parsing path.

If the transport data is shorter than the destination port field
itself, the packet cannot match the affected PTP classification path.

Thanks,
Justin

> > +
> > +     dest = skb_header_pointer(skb,
> > +                               udp_offset + offsetof(struct udphdr,
> dest),
> > +                               sizeof(_dest), &_dest);
> 
> [ ... ]

^ permalink raw reply

* [PATCH net] net: do not acquire dev->tx_global_lock in netdev_watchdog_up()
From: Eric Dumazet @ 2026-06-22 11:01 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, netdev, eric.dumazet, Eric Dumazet,
	Marek Szyprowski

Marek Szyprowski reported a deadlock during system resume when virtio_net
driver is used.

The deadlock occurs because netif_device_attach() is called while holding
dev->tx_global_lock (via netif_tx_lock_bh() in virtnet_restore_up()).
netif_device_attach() calls netdev_watchdog_up(), which now also tries
to acquire dev->tx_global_lock to synchronize with dev_watchdog().

This recursive lock acquisition results in a deadlock.

Fix this by removing the tx_global_lock acquisition from netdev_watchdog_up().

The critical state (watchdog_timer and watchdog_ref_held) is already
protected by dev->watchdog_lock, which was introduced in the blamed commit.

Fixes: 8eed5519e496 ("net: watchdog: fix refcount tracking races")
Reported-by: Marek Szyprowski <m.szyprowski@samsung.com>
Closes: https://lore.kernel.org/netdev/a443376e-5187-4268-93b3-58047ef113a8@samsung.com/
Signed-off-by: Eric Dumazet <edumazet@google.com>
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
---
 net/sched/sch_generic.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 3f1c510df850dbdbaf10d483547c7b1f3a5d5482..ef2b4bf51564173751c74fefe17e3913ed2fa056 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -594,9 +594,8 @@ void netdev_watchdog_up(struct net_device *dev)
 		return;
 	if (dev->watchdog_timeo <= 0)
 		dev->watchdog_timeo = 5*HZ;
-	spin_lock_bh(&dev->tx_global_lock);
 
-	spin_lock(&dev->watchdog_lock);
+	spin_lock_bh(&dev->watchdog_lock);
 	if (!mod_timer(&dev->watchdog_timer,
 		       round_jiffies(jiffies + dev->watchdog_timeo))) {
 		if (!dev->watchdog_ref_held) {
@@ -605,9 +604,7 @@ void netdev_watchdog_up(struct net_device *dev)
 			dev->watchdog_ref_held = true;
 		}
 	}
-	spin_unlock(&dev->watchdog_lock);
-
-	spin_unlock_bh(&dev->tx_global_lock);
+	spin_unlock_bh(&dev->watchdog_lock);
 }
 EXPORT_SYMBOL_GPL(netdev_watchdog_up);
 
-- 
2.55.0.rc0.786.g65d90a0328-goog


^ permalink raw reply related

* RE: [PATCH net v3] rtase: Workaround for TX hang caused by short UDP packets entering hardware PTP parsing
From: Justin Lai @ 2026-06-22 10:56 UTC (permalink / raw)
  To: Simon Horman
  Cc: kuba@kernel.org, davem@davemloft.net, edumazet@google.com,
	pabeni@redhat.com, andrew+netdev@lunn.ch,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	stable@vger.kernel.org, richardcochran@gmail.com,
	david.laight.linux@gmail.com, aleksander.lobakin@intel.com,
	Ping-Ke Shih, Larry Chiu
In-Reply-To: <20260617085942.843250-2-horms@kernel.org>

Simon Horman <horms@kernel.org> wrote:
> 
> This is an AI-generated review of your patch. The human sending this
> email has considered the AI review valid, or at least plausible.
> Full review at: https://sashiko.dev
> ---
> rtase: Workaround for TX hang caused by short UDP packets entering
> hardware PTP parsing
> 
> The patch adds a workaround for a hardware bug in rtase where short UDP
> packets triggering PTP parsing can cause a TX hang due to out-of-bounds
> reads.
> It pads affected packets so the transport data reaches a minimum length.
> 
> Cc: stable@vger.kernel.org
> 
> > diff --git a/drivers/net/ethernet/realtek/rtase/rtase_main.c
> b/drivers/net/ethernet/realtek/rtase/rtase_main.c
> > index 55105d34bc797..4c295a39c7a0c 100644
> > --- a/drivers/net/ethernet/realtek/rtase/rtase_main.c
> > +++ b/drivers/net/ethernet/realtek/rtase/rtase_main.c
> [ ... ]
> > @@ -1249,6 +1250,81 @@ static u32 rtase_tx_csum(struct sk_buff *skb,
> const struct net_device *dev)
> >       return csum_cmd;
> >  }
> >
> > +static bool rtase_get_udp_offset(struct sk_buff *skb, u32 *udp_offset)
> > +{
> > +     int no = skb_network_offset(skb);
> > +     struct ipv6hdr *i6h, _i6h;
> > +     struct iphdr *ih, _ih;
> > +
> > +     switch (vlan_get_protocol(skb)) {
> > +     case htons(ETH_P_IP):
> 
> [Severity: High]
> Does relying on vlan_get_protocol(skb) allow an unprivileged user to bypass
> this workaround?
> 
> If an attacker uses an AF_PACKET/SOCK_RAW socket to inject a raw Ethernet
> frame containing an IPv4/UDP packet, they could set skb->protocol to an
> arbitrary value like htons(ETH_P_ALL). The switch statement will fall
> through to the default case and return false, causing rtase_skb_pad() to
> skip the padding.
> 
> Since the hardware independently parses the raw frame based on the
> EtherType,
> could it still process the unpadded packet and trigger the TX hang?
> 

Thanks for pointing this out.
 
Since the hardware parser operates on the actual Ethernet frame
contents, relying on skb protocol metadata is not ideal for this
workaround.
 
I'll update the code to derive the L3 protocol and network offset from
the Ethernet/VLAN headers in the skb data before parsing IPv4/IPv6.
This makes the software check match what the hardware parser sees and
avoids depending on skb protocol metadata.

> > +             ih = skb_header_pointer(skb, no, sizeof(_ih), &_ih);
> > +             if (!ih)
> > +                     return false;
> [ ... ]
> > +static bool rtase_skb_pad(struct sk_buff *skb)
> > +{
> > +     __be16 *dest, _dest;
> > +     u32 trans_data_len;
> > +     u32 udp_offset;
> > +     u16 dest_port;
> > +     u32 pad_len;
> > +
> > +     if (!rtase_get_udp_offset(skb, &udp_offset))
> > +             return true;
> > +
> > +     trans_data_len = skb->len - udp_offset;
> > +     if (trans_data_len < offsetof(struct udphdr, len) ||
> > +         trans_data_len >= RTASE_MIN_PAD_LEN)
> > +             return true;
> 
> [Severity: High]
> Does this logic inadvertently skip padding for underflowed lengths and tiny
> payloads?
> 
> If the IP header is malformed such that skb->len < udp_offset, the unsigned
> subtraction for trans_data_len will underflow to a large positive value.
> This causes trans_data_len >= RTASE_MIN_PAD_LEN to be true, bypassing the
> padding.
> 
> Additionally, if the packet has fewer than 4 bytes of UDP payload, meaning
> trans_data_len < offsetof(struct udphdr, len), it explicitly returns true.
> 
> Since the hardware blindly fetches the UDP destination port at
> udp_offset + 2 via DMA, won't transmitting these short or malformed packets
> without padding still expose the hardware to out-of-bounds reads?
> 

Thanks for the review.
 
I'll add a check for udp_offset > skb->len before calculating
trans_data_len to avoid the potential underflow.
 
If the computed UDP offset is beyond the packet length, the packet is
malformed and cannot be safely handled by this workaround, so it should
not be transmitted.
 
The out-of-bounds access occurs after the packet has already matched
the PTP classification criteria (UDP destination port 319/320) and
enters the hardware PTP parsing path.
 
If the transport data is shorter than the destination port field
itself, the packet cannot match the affected PTP classification path.

> > +
> > +     dest = skb_header_pointer(skb,
> > +                               udp_offset + offsetof(struct udphdr,
> dest),
> > +                               sizeof(_dest), &_dest);
> [ ... ]
> > @@ -1362,6 +1438,9 @@ static netdev_tx_t rtase_start_xmit(struct sk_buff
> *skb,
> >               opts2 |= rtase_tx_csum(skb, dev);
> >       }
> >
> > +     if (!rtase_skb_pad(skb))
> > +             goto err_dma_0;
> > +
> >       frags = rtase_xmit_frags(ring, skb, opts1, opts2);
> >       if (unlikely(frags < 0))
> >               goto err_dma_0;
> 
> [Severity: High]
> This is a pre-existing issue, but does the err_dma_1 error path in
> rtase_start_xmit() contain a double free of the skb?
> 
> When transmitting a fragmented packet, rtase_xmit_frags() maps the
> fragments and assigns ring->skbuff[entry] = skb for the last fragment
> descriptor. Back in rtase_start_xmit(), if dma_map_single() fails for the
> linear part, it jumps to err_dma_1:
> 
> err_dma_1:
>         ring->skbuff[entry] = NULL;
>         rtase_tx_clear_range(ring, ring->cur_idx + 1, frags);
> 
> err_dma_0:
>         tp->stats.tx_dropped++;
>         dev_kfree_skb_any(skb);
>         return NETDEV_TX_OK;
> 
> rtase_tx_clear_range() iterates over the mapped fragments and calls
> dev_kfree_skb_any() when it finds the skb pointer. Execution then falls
> through to err_dma_0, which unconditionally calls dev_kfree_skb_any(skb) a
> second time on the same skb pointer.

Thanks for pointing this out.

This appears to be a pre-existing issue and is unrelated to the change
in this patch.

I'll investigate the reported double free scenario further and address
it separately if a fix is needed.

Thanks,
Justin

^ permalink raw reply

* Re: [patch V2 18/25] timekeeping: Prepare for cross timestamps on arbitrary clock IDs
From: Thomas Gleixner @ 2026-06-22 11:07 UTC (permalink / raw)
  To: David Woodhouse, LKML
  Cc: Miroslav Lichvar, John Stultz, Stephen Boyd, Anna-Maria Behnsen,
	Frederic Weisbecker, thomas.weissschuh, Arthur Kiyanovski,
	Rodolfo Giometti, Vincent Donnefort, Marc Zyngier, Oliver Upton,
	kvmarm, Oliver Upton, Richard Cochran, netdev, Takashi Iwai,
	Miri Korenblit, Johannes Berg, Jacob Keller, Tony Nguyen,
	Saeed Mahameed, Peter Hilber, Michael S. Tsirkin, virtualization,
	linux-wireless, linux-sound, Vadim Fedorenko
In-Reply-To: <b296182e2e2c1ed2fe1c4879fd6f12d67a7ad22f.camel@infradead.org>

On Mon, Jun 22 2026 at 09:55, David Woodhouse wrote:
> On Fri, 2026-05-29 at 22:01 +0200, Thomas Gleixner wrote:
>> From: Thomas Gleixner <tglx@kernel.org>
>> 
>> PTP device system crosstime stamps support only CLOCK_REALTIME, which is
>> meaningless for AUX clocks. The PTP core hands in the clock ID already, so
>> prepare the core code to honor it.
>> 
>>  - Add a new sys_systime field to struct system_device_crosststamp which
>>    aliases the sys_realtime field. Once all users are converted
>>    sys_realtime can be removed.
>> 
>>  - Prepare get_device_system_crosststamp() and the related code for it by
>>    switching to sys_systime and providing the initial changes to utilize
>>    different time keepers.
>> 
>> No functional change intended.
>
> We ended up with ktime_get_snapshot_id() also supporting CLOCK_BOOTTIME
> and CLOCK_MONOTONIC_RAW, but not get_device_system_crosststamp().
> Should we make that consistent?

Maybe. The BOOTTIME support is only there for that ARM64 hyper trace muck,
but has no other relevance.

MONORAW is there for the PTP EXTENDED IOCTL, but with PRECISE the
snapshot already contains the raw value and you'd have to prevent the
historical adjustment part for RAW. So I don't see the actual value, but
I don't have a strong opinion either.

Thanks,

        tglx




^ permalink raw reply

* [PATCH] fsl/fman: Free init resources on KeyGen failure in fman_init()
From: Haoxiang Li @ 2026-06-22 11:16 UTC (permalink / raw)
  To: madalin.bucur, sean.anderson, andrew+netdev, davem, edumazet,
	kuba, pabeni, florinel.iordache
  Cc: netdev, linux-kernel, Haoxiang Li, stable

fman_muram_alloc() allocates initialization resources before
initializing the KeyGen block. If keygen_init() fails, the
function returns -EINVAL directly and leaves those resources
allocated. Free the initialization resources before returning
from the KeyGen failure path.

While at it, drop the redundant error check on the return value of
enable(), which always returns 0.

Fixes: 7472f4f281d0 ("fsl/fman: enable FMan Keygen")
Cc: stable@kernel.org
Signed-off-by: Haoxiang Li <haoxiang_li2024@163.com>
---
 drivers/net/ethernet/freescale/fman/fman.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/freescale/fman/fman.c b/drivers/net/ethernet/freescale/fman/fman.c
index 013273a2de32..3a2a57207e55 100644
--- a/drivers/net/ethernet/freescale/fman/fman.c
+++ b/drivers/net/ethernet/freescale/fman/fman.c
@@ -1995,12 +1995,12 @@ static int fman_init(struct fman *fman)
 
 	/* Init KeyGen */
 	fman->keygen = keygen_init(fman->kg_regs);
-	if (!fman->keygen)
+	if (!fman->keygen) {
+		free_init_resources(fman);
 		return -EINVAL;
+	}
 
-	err = enable(fman, cfg);
-	if (err != 0)
-		return err;
+	enable(fman, cfg);
 
 	enable_time_stamp(fman);
 
-- 
2.25.1


^ permalink raw reply related

* [PATCH net] veth: fix NAPI leak in XDP enable error path
From: Eric Dumazet @ 2026-06-22 11:18 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, netdev, eric.dumazet, Eric Dumazet, Guenter Roeck,
	Björn Töpel, Daniel Borkmann, Ilias Apalodimas,
	Michael S. Tsirkin, Tariq Toukan

During XDP enablement in veth, if xdp_rxq_info_reg() or
xdp_rxq_info_reg_mem_model() fails, the driver rolls back the changes.

However, the rollback loop:
	for (i--; i >= start; i--) {

decrements the loop index 'i' before the first iteration. This
correctly skips unregistering the rxq for the failed index 'i' (as
registration failed or was already cleaned up), but it also
erroneously skips calling netif_napi_del() for rq[i].xdp_napi.

Since netif_napi_add() was already called for index 'i', this leaves
a dangling napi_struct in the device's napi_list. When the veth
device is later destroyed, the freed queue memory (which contains the
leaked NAPI structure) can be reused.

The subsequent device teardown iterates the NAPI list and
corrupts the reallocated memory, leading to UAF.

Fix this by explicitly deleting the NAPI association for the failed
index 'i' before rolling back the successfully configured queues.

Fixes: b02e5a0ebb17 ("xsk: Propagate napi_id to XDP socket Rx path")
Reported-by: Guenter Roeck <groeck@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Björn Töpel <bjorn.topel@intel.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Tariq Toukan <tariqt@nvidia.com>
---
 drivers/net/veth.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 0cfb19b760dd54eb896f469c02bb02ecf5eef504..1c5142149175369a642342849addfbb9c07404bc 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -1137,6 +1137,8 @@ static int veth_enable_xdp_range(struct net_device *dev, int start, int end,
 err_reg_mem:
 	xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);
 err_rxq_reg:
+	if (!napi_already_on)
+		netif_napi_del(&priv->rq[i].xdp_napi);
 	for (i--; i >= start; i--) {
 		struct veth_rq *rq = &priv->rq[i];

-- 
2.55.0.rc0.786.g65d90a0328-goog

^ permalink raw reply related

* RE: [PATCH net v6 4/4] ice: skip unnecessary VF reset when setting trust
From: Loktionov, Aleksandr @ 2026-06-22 11:18 UTC (permalink / raw)
  To: Jose Ignacio Tornos Martinez, netdev@vger.kernel.org
  Cc: intel-wired-lan@lists.osuosl.org, Kitszel, Przemyslaw,
	Keller, Jacob E, horms@kernel.org, jesse.brandeburg@intel.com,
	Nguyen, Anthony L, davem@davemloft.net, edumazet@google.com,
	kuba@kernel.org, pabeni@redhat.com
In-Reply-To: <20260619061321.8554-5-jtornosm@redhat.com>



> -----Original Message-----
> From: Jose Ignacio Tornos Martinez <jtornosm@redhat.com>
> Sent: Friday, June 19, 2026 8:13 AM
> To: netdev@vger.kernel.org
> Cc: intel-wired-lan@lists.osuosl.org; Kitszel, Przemyslaw
> <przemyslaw.kitszel@intel.com>; Loktionov, Aleksandr
> <aleksandr.loktionov@intel.com>; Keller, Jacob E
> <jacob.e.keller@intel.com>; horms@kernel.org;
> jesse.brandeburg@intel.com; Nguyen, Anthony L
> <anthony.l.nguyen@intel.com>; davem@davemloft.net;
> edumazet@google.com; kuba@kernel.org; pabeni@redhat.com; Jose Ignacio
> Tornos Martinez <jtornosm@redhat.com>
> Subject: [PATCH net v6 4/4] ice: skip unnecessary VF reset when
> setting trust
> 
> Similar to the i40e fix, ice_set_vf_trust() unconditionally calls
> ice_reset_vf() when the trust setting changes. While the delay is
> smaller than i40e, this reset is still unnecessary in most cases.
> 
> When granting trust, no reset is needed - we can just set the
> capability flag to allow privileged operations.
> 
> When revoking trust, we only need to reset (conservative approach) if
> the VF has actually configured advanced features that require cleanup
> (MAC LLDP filters, promiscuous mode). For VFs in a clean state, we can
> safely change the trust setting without the disruptive reset.
> 
> When we do reset, we maintain the original ice pattern that has been
> reliable in production: cleanup LLDP filters first, then set
> vf->trusted, then reset. This ensures the privilege capability bit is
> handled correctly during reset rebuild.
> 
> When we don't reset, we manually handle the capability flag via helper
> function, eliminating the delay.
> 
> Signed-off-by: Jose Ignacio Tornos Martinez <jtornosm@redhat.com>
> ---
> v6: AI review identified issues with v5's reset-before-cleanup approach.
>     Revert to original reset procedure (cleanup before reset) which has
>     proven reliable, just adding the conditional check to skip reset when
>     VF has no advanced features configured.
> v5: https://lore.kernel.org/all/20260429102426.210750-5-jtornosm@redhat.com/
> 
>  drivers/net/ethernet/intel/ice/ice_sriov.c | 33 +++++++++++++++++++--
> -
>  1 file changed, 29 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/ice/ice_sriov.c
> b/drivers/net/ethernet/intel/ice/ice_sriov.c
> index 7e00e091756d..XXXXXXXXXXXXXXXX 100644
> --- a/drivers/net/ethernet/intel/ice/ice_sriov.c
> +++ b/drivers/net/ethernet/intel/ice/ice_sriov.c
> @@ -1364,6 +1364,23 @@ int ice_set_vf_mac(struct net_device *netdev,
> int vf_id, u8 *mac)
>  	return __ice_set_vf_mac(ice_netdev_to_pf(netdev), vf_id, mac);
> }
> 
> +/**
> + * ice_setup_vf_trust - Enable/disable VF trust mode without reset
> + * @vf: VF to configure
> + * @setting: trust setting
> + *
> + * Update VF flags when changing trust without performing a VF reset.
> + * This is only called when it's safe to skip the reset (VF has no
> +advanced
> + * features configured that need cleanup).
> + */
> +static void ice_setup_vf_trust(struct ice_vf *vf, bool setting) {
> +	if (setting)
> +		set_bit(ICE_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps);
> +	else
> +		clear_bit(ICE_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps);
> }
> +
>  /**
>   * ice_set_vf_trust
>   * @netdev: network interface device structure @@ -1399,11 +1416,19
> @@ int ice_set_vf_trust(struct net_device *netdev, int vf_id, bool
> trusted)
> 
>  	mutex_lock(&vf->cfg_lock);
> 
> -	while (!trusted && vf->num_mac_lldp)
> -		ice_vf_update_mac_lldp_num(vf, ice_get_vf_vsi(vf),
> false);
> -
> +	/* Reset only if revoking trust and VF has advanced features
> configured */
> +	if (!trusted &&
> +	    (vf->num_mac_lldp > 0 ||
> +	     test_bit(ICE_VF_STATE_UC_PROMISC, vf->vf_states) ||
> +	     test_bit(ICE_VF_STATE_MC_PROMISC, vf->vf_states))) {
> +		while (vf->num_mac_lldp)
> +			ice_vf_update_mac_lldp_num(vf,
> ice_get_vf_vsi(vf), false);
> +		vf->trusted = trusted;
> +		ice_reset_vf(vf, ICE_VF_RESET_NOTIFY);
> +	} else {
> +		vf->trusted = trusted;
> +		ice_setup_vf_trust(vf, trusted);
> +	}
> -	vf->trusted = trusted;
> -	ice_reset_vf(vf, ICE_VF_RESET_NOTIFY);
>  	dev_info(ice_pf_to_dev(pf), "VF %u is now %strusted\n",
>  		 vf_id, trusted ? "" : "un");
> 
> --
> 2.43.0

Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>


^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox