Netdev List
 help / color / mirror / Atom feed
* [PATCH net 1/1] xfrm: espintcp: publish ULP context before entry points
       [not found] <cover.1778464688.git.zylzyl2333@gmail.com>
@ 2026-05-11 13:40 ` Ren Wei
  2026-05-12 10:07   ` Sabrina Dubroca
  0 siblings, 1 reply; 2+ messages in thread
From: Ren Wei @ 2026-05-11 13:40 UTC (permalink / raw)
  To: netdev
  Cc: davem, edumazet, kuba, pabeni, horms, steffen.klassert, herbert,
	sd, yuantan098, yifanwucs, tomapufckgml, bird, ronbogo,
	zylzyl2333, n05ec

From: Yilin Zhu <zylzyl2333@gmail.com>

espintcp_init_sk() installs sk_prot, sk_socket->ops, and socket callbacks
while setting up the ULP state. These entry points can be observed by
receive and write-space paths before the attaching thread finishes.

Previously, espintcp_data_ready() could dereference icsk_ulp_data before
the new context was published, and espintcp_write_space() could schedule
ctx->work before INIT_WORK() initialized it.

Initialize the queues, saved callbacks, and TX work item before storing
the context in icsk_ulp_data.  Then publish sk_prot, sk_socket->ops, and
the socket callbacks only after smp_wmb(), paired with an ordered context
load in espintcp_getctx().  Use READ_ONCE()/WRITE_ONCE() for lockless
socket pointer accesses.

Fixes: e27cca96cd68 ("xfrm: add espintcp (RFC 8229)")
Cc: stable@kernel.org
Reported-by: Yuan Tan <yuantan098@gmail.com>
Reported-by: Yifan Wu <yifanwucs@gmail.com>
Reported-by: Juefei Pu <tomapufckgml@gmail.com>
Reported-by: Xin Liu <bird@lzu.edu.cn>
Co-developed-by: Peihan Liu <ronbogo@outlook.com>
Signed-off-by: Peihan Liu <ronbogo@outlook.com>
Signed-off-by: Yilin Zhu <zylzyl2333@gmail.com>
Signed-off-by: Ren Wei <n05ec@lzu.edu.cn>
---
 include/net/espintcp.h | 13 +++++++++--
 net/xfrm/espintcp.c    | 52 +++++++++++++++++++++++++++---------------
 2 files changed, 45 insertions(+), 20 deletions(-)

diff --git a/include/net/espintcp.h b/include/net/espintcp.h
index c70efd704b6d..034be559786b 100644
--- a/include/net/espintcp.h
+++ b/include/net/espintcp.h
@@ -34,7 +34,16 @@ static inline struct espintcp_ctx *espintcp_getctx(const struct sock *sk)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 
-	/* RCU is only needed for diag */
-	return (__force void *)icsk->icsk_ulp_data;
+	/*
+	 * The caller reached an ESP entry point by observing sk_prot,
+	 * sk_socket->ops, or one of the socket callbacks.  Keep the ctx
+	 * load after that observation so the caller cannot see the new
+	 * entry point while still seeing stale icsk_ulp_data.
+	 *
+	 * Pairs with smp_wmb() in espintcp_init_sk().
+	 */
+	smp_rmb();
+
+	return (__force void *)READ_ONCE(icsk->icsk_ulp_data);
 }
 #endif
diff --git a/net/xfrm/espintcp.c b/net/xfrm/espintcp.c
index a2756186e13a..d847632e7d4e 100644
--- a/net/xfrm/espintcp.c
+++ b/net/xfrm/espintcp.c
@@ -440,7 +440,9 @@ static void espintcp_destruct(struct sock *sk)
 
 bool tcp_is_ulp_esp(struct sock *sk)
 {
-	return sk->sk_prot == &espintcp_prot || sk->sk_prot == &espintcp6_prot;
+	const struct proto *prot = READ_ONCE(sk->sk_prot);
+
+	return prot == &espintcp_prot || prot == &espintcp6_prot;
 }
 EXPORT_SYMBOL_GPL(tcp_is_ulp_esp);
 
@@ -451,10 +453,12 @@ static void build_protos(struct proto *espintcp_prot,
 static int espintcp_init_sk(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct proto_ops *ops;
 	struct strp_callbacks cb = {
 		.rcv_msg = espintcp_rcv,
 		.parse_msg = espintcp_parse,
 	};
 	struct espintcp_ctx *ctx;
+	struct proto *prot;
 	int err;
 
@@ -472,34 +476,46 @@ static int espintcp_init_sk(struct sock *sk)
 
 	__sk_dst_reset(sk);
 
-	strp_check_rcv(&ctx->strp);
 	skb_queue_head_init(&ctx->ike_queue);
 	skb_queue_head_init(&ctx->out_queue);
+	ctx->saved_data_ready = READ_ONCE(sk->sk_data_ready);
+	ctx->saved_write_space = READ_ONCE(sk->sk_write_space);
+	ctx->saved_destruct = READ_ONCE(sk->sk_destruct);
+	INIT_WORK(&ctx->work, espintcp_tx_work);
+
+	/* avoid using task_frag */
+	sk->sk_allocation = GFP_ATOMIC;
+	sk->sk_use_task_frag = false;
 
 	if (sk->sk_family == AF_INET) {
-		sk->sk_prot = &espintcp_prot;
-		sk->sk_socket->ops = &espintcp_ops;
+		prot = &espintcp_prot;
+		ops = &espintcp_ops;
 	} else {
 		mutex_lock(&tcpv6_prot_mutex);
 		if (!espintcp6_prot.recvmsg)
-			build_protos(&espintcp6_prot, &espintcp6_ops, sk->sk_prot, sk->sk_socket->ops);
+			build_protos(&espintcp6_prot, &espintcp6_ops,
+				     READ_ONCE(sk->sk_prot),
+				     READ_ONCE(sk->sk_socket->ops));
 		mutex_unlock(&tcpv6_prot_mutex);
 
-		sk->sk_prot = &espintcp6_prot;
-		sk->sk_socket->ops = &espintcp6_ops;
+		prot = &espintcp6_prot;
+		ops = &espintcp6_ops;
 	}
-	ctx->saved_data_ready = sk->sk_data_ready;
-	ctx->saved_write_space = sk->sk_write_space;
-	ctx->saved_destruct = sk->sk_destruct;
-	sk->sk_data_ready = espintcp_data_ready;
-	sk->sk_write_space = espintcp_write_space;
-	sk->sk_destruct = espintcp_destruct;
 	rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
-	INIT_WORK(&ctx->work, espintcp_tx_work);
 
-	/* avoid using task_frag */
-	sk->sk_allocation = GFP_ATOMIC;
-	sk->sk_use_task_frag = false;
+	/*
+	 * Publish the fully initialized ctx before publishing any entry point
+	 * that can call espintcp_getctx().  The read barrier there runs after
+	 * the caller has observed one of these pointers.
+	 */
+	smp_wmb();
+	WRITE_ONCE(sk->sk_prot, prot);
+	WRITE_ONCE(sk->sk_socket->ops, ops);
+	WRITE_ONCE(sk->sk_data_ready, espintcp_data_ready);
+	WRITE_ONCE(sk->sk_write_space, espintcp_write_space);
+	WRITE_ONCE(sk->sk_destruct, espintcp_destruct);
+
+	strp_check_rcv(&ctx->strp);
 
 	return 0;
 
@@ -530,7 +546,7 @@ static void espintcp_close(struct sock *sk, long timeout)
 
 	strp_stop(&ctx->strp);
 
-	sk->sk_prot = &tcp_prot;
+	WRITE_ONCE(sk->sk_prot, &tcp_prot);
 	barrier();
 
 	disable_work_sync(&ctx->work);
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 2+ messages in thread

* Re: [PATCH net 1/1] xfrm: espintcp: publish ULP context before entry points
  2026-05-11 13:40 ` [PATCH net 1/1] xfrm: espintcp: publish ULP context before entry points Ren Wei
@ 2026-05-12 10:07   ` Sabrina Dubroca
  0 siblings, 0 replies; 2+ messages in thread
From: Sabrina Dubroca @ 2026-05-12 10:07 UTC (permalink / raw)
  To: Ren Wei
  Cc: netdev, davem, edumazet, kuba, pabeni, horms, steffen.klassert,
	herbert, yuantan098, yifanwucs, tomapufckgml, bird, ronbogo,
	zylzyl2333

Thanks for the fix. A small note: IPsec fixes go through the "ipsec"
tree, not "net", so the subject prefix should be [PATCH ipsec].

Some comments inline:

2026-05-11, 21:40:58 +0800, Ren Wei wrote:
> diff --git a/include/net/espintcp.h b/include/net/espintcp.h
> index c70efd704b6d..034be559786b 100644
> --- a/include/net/espintcp.h
> +++ b/include/net/espintcp.h
> @@ -34,7 +34,16 @@ static inline struct espintcp_ctx *espintcp_getctx(const struct sock *sk)
>  {
>  	const struct inet_connection_sock *icsk = inet_csk(sk);
>  
> -	/* RCU is only needed for diag */
> -	return (__force void *)icsk->icsk_ulp_data;
> +	/*
> +	 * The caller reached an ESP entry point by observing sk_prot,
> +	 * sk_socket->ops, or one of the socket callbacks.  Keep the ctx
> +	 * load after that observation so the caller cannot see the new
> +	 * entry point while still seeing stale icsk_ulp_data.

I don't think this comment is really helpful.

> +	 *
> +	 * Pairs with smp_wmb() in espintcp_init_sk().
> +	 */
> +	smp_rmb();
> +
> +	return (__force void *)READ_ONCE(icsk->icsk_ulp_data);

I think smp_store_release/smp_load_acquire is the "standard spelling"
for this now.


[...]
> @@ -472,34 +476,46 @@ static int espintcp_init_sk(struct sock *sk)
>  
>  	__sk_dst_reset(sk);
>  
> -	strp_check_rcv(&ctx->strp);
>  	skb_queue_head_init(&ctx->ike_queue);
>  	skb_queue_head_init(&ctx->out_queue);
> +	ctx->saved_data_ready = READ_ONCE(sk->sk_data_ready);
> +	ctx->saved_write_space = READ_ONCE(sk->sk_write_space);
> +	ctx->saved_destruct = READ_ONCE(sk->sk_destruct);

If something is changing those while espintcp_init_sk is running,
READ_ONCE won't help us. We'll end up with the wrong saved_*
values. Can this actually happen here?


> +	INIT_WORK(&ctx->work, espintcp_tx_work);
> +
> +	/* avoid using task_frag */
> +	sk->sk_allocation = GFP_ATOMIC;
> +	sk->sk_use_task_frag = false;
>  
>  	if (sk->sk_family == AF_INET) {
> -		sk->sk_prot = &espintcp_prot;
> -		sk->sk_socket->ops = &espintcp_ops;
> +		prot = &espintcp_prot;
> +		ops = &espintcp_ops;
>  	} else {
>  		mutex_lock(&tcpv6_prot_mutex);
>  		if (!espintcp6_prot.recvmsg)
> -			build_protos(&espintcp6_prot, &espintcp6_ops, sk->sk_prot, sk->sk_socket->ops);
> +			build_protos(&espintcp6_prot, &espintcp6_ops,
> +				     READ_ONCE(sk->sk_prot),
> +				     READ_ONCE(sk->sk_socket->ops));

And similar here. Those should always be tcpv6_prot/inet6_stream_ops,
but I wrote it this way to avoid having to use stubs, back when IPv6
could be built as a module. This could now be moved into espintcp_init
like the ipv4 variant of this.

>  		mutex_unlock(&tcpv6_prot_mutex);
>  
> -		sk->sk_prot = &espintcp6_prot;
> -		sk->sk_socket->ops = &espintcp6_ops;
> +		prot = &espintcp6_prot;
> +		ops = &espintcp6_ops;
>  	}

Or just move the whole block to the end, instead of introducing those
temporary variables?

> -	ctx->saved_data_ready = sk->sk_data_ready;
> -	ctx->saved_write_space = sk->sk_write_space;
> -	ctx->saved_destruct = sk->sk_destruct;
> -	sk->sk_data_ready = espintcp_data_ready;
> -	sk->sk_write_space = espintcp_write_space;
> -	sk->sk_destruct = espintcp_destruct;
>  	rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
> -	INIT_WORK(&ctx->work, espintcp_tx_work);
>  
> -	/* avoid using task_frag */
> -	sk->sk_allocation = GFP_ATOMIC;
> -	sk->sk_use_task_frag = false;
> +	/*
> +	 * Publish the fully initialized ctx before publishing any entry point
> +	 * that can call espintcp_getctx().  The read barrier there runs after
> +	 * the caller has observed one of these pointers.
> +	 */
> +	smp_wmb();
> +	WRITE_ONCE(sk->sk_prot, prot);
> +	WRITE_ONCE(sk->sk_socket->ops, ops);
> +	WRITE_ONCE(sk->sk_data_ready, espintcp_data_ready);
> +	WRITE_ONCE(sk->sk_write_space, espintcp_write_space);
> +	WRITE_ONCE(sk->sk_destruct, espintcp_destruct);
> +
> +	strp_check_rcv(&ctx->strp);
>  
>  	return 0;
>  
> @@ -530,7 +546,7 @@ static void espintcp_close(struct sock *sk, long timeout)
>  
>  	strp_stop(&ctx->strp);
>  
> -	sk->sk_prot = &tcp_prot;
> +	WRITE_ONCE(sk->sk_prot, &tcp_prot);

Actually this should be the original sk_prot, which could be
tcpv6_prot.

I'm not sure how much the WRITE_ONCE matters here. What is it
protecting against/synchronizing with?

-- 
Sabrina

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2026-05-12 10:07 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <cover.1778464688.git.zylzyl2333@gmail.com>
2026-05-11 13:40 ` [PATCH net 1/1] xfrm: espintcp: publish ULP context before entry points Ren Wei
2026-05-12 10:07   ` Sabrina Dubroca

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.