* [PATCH net 1/1] xfrm: espintcp: publish ULP context before entry points
[not found] <cover.1778464688.git.zylzyl2333@gmail.com>
@ 2026-05-11 13:40 ` Ren Wei
2026-05-12 10:07 ` Sabrina Dubroca
0 siblings, 1 reply; 2+ messages in thread
From: Ren Wei @ 2026-05-11 13:40 UTC (permalink / raw)
To: netdev
Cc: davem, edumazet, kuba, pabeni, horms, steffen.klassert, herbert,
sd, yuantan098, yifanwucs, tomapufckgml, bird, ronbogo,
zylzyl2333, n05ec
From: Yilin Zhu <zylzyl2333@gmail.com>
espintcp_init_sk() installs sk_prot, sk_socket->ops, and socket callbacks
while setting up the ULP state. These entry points can be observed by
receive and write-space paths before the attaching thread finishes.
Previously, espintcp_data_ready() could dereference icsk_ulp_data before
the new context was published, and espintcp_write_space() could schedule
ctx->work before INIT_WORK() initialized it.
Initialize the queues, saved callbacks, and TX work item before storing
the context in icsk_ulp_data. Then publish sk_prot, sk_socket->ops, and
the socket callbacks only after smp_wmb(), paired with an ordered context
load in espintcp_getctx(). Use READ_ONCE()/WRITE_ONCE() for lockless
socket pointer accesses.
Fixes: e27cca96cd68 ("xfrm: add espintcp (RFC 8229)")
Cc: stable@kernel.org
Reported-by: Yuan Tan <yuantan098@gmail.com>
Reported-by: Yifan Wu <yifanwucs@gmail.com>
Reported-by: Juefei Pu <tomapufckgml@gmail.com>
Reported-by: Xin Liu <bird@lzu.edu.cn>
Co-developed-by: Peihan Liu <ronbogo@outlook.com>
Signed-off-by: Peihan Liu <ronbogo@outlook.com>
Signed-off-by: Yilin Zhu <zylzyl2333@gmail.com>
Signed-off-by: Ren Wei <n05ec@lzu.edu.cn>
---
include/net/espintcp.h | 13 +++++++++--
net/xfrm/espintcp.c | 52 +++++++++++++++++++++++++++---------------
2 files changed, 45 insertions(+), 20 deletions(-)
diff --git a/include/net/espintcp.h b/include/net/espintcp.h
index c70efd704b6d..034be559786b 100644
--- a/include/net/espintcp.h
+++ b/include/net/espintcp.h
@@ -34,7 +34,16 @@ static inline struct espintcp_ctx *espintcp_getctx(const struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
- /* RCU is only needed for diag */
- return (__force void *)icsk->icsk_ulp_data;
+ /*
+ * The caller reached an ESP entry point by observing sk_prot,
+ * sk_socket->ops, or one of the socket callbacks. Keep the ctx
+ * load after that observation so the caller cannot see the new
+ * entry point while still seeing stale icsk_ulp_data.
+ *
+ * Pairs with smp_wmb() in espintcp_init_sk().
+ */
+ smp_rmb();
+
+ return (__force void *)READ_ONCE(icsk->icsk_ulp_data);
}
#endif
diff --git a/net/xfrm/espintcp.c b/net/xfrm/espintcp.c
index a2756186e13a..d847632e7d4e 100644
--- a/net/xfrm/espintcp.c
+++ b/net/xfrm/espintcp.c
@@ -440,7 +440,9 @@ static void espintcp_destruct(struct sock *sk)
bool tcp_is_ulp_esp(struct sock *sk)
{
- return sk->sk_prot == &espintcp_prot || sk->sk_prot == &espintcp6_prot;
+ const struct proto *prot = READ_ONCE(sk->sk_prot);
+
+ return prot == &espintcp_prot || prot == &espintcp6_prot;
}
EXPORT_SYMBOL_GPL(tcp_is_ulp_esp);
@@ -451,10 +453,12 @@ static void build_protos(struct proto *espintcp_prot,
static int espintcp_init_sk(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct proto_ops *ops;
struct strp_callbacks cb = {
.rcv_msg = espintcp_rcv,
.parse_msg = espintcp_parse,
};
struct espintcp_ctx *ctx;
+ struct proto *prot;
int err;
@@ -472,34 +476,46 @@ static int espintcp_init_sk(struct sock *sk)
__sk_dst_reset(sk);
- strp_check_rcv(&ctx->strp);
skb_queue_head_init(&ctx->ike_queue);
skb_queue_head_init(&ctx->out_queue);
+ ctx->saved_data_ready = READ_ONCE(sk->sk_data_ready);
+ ctx->saved_write_space = READ_ONCE(sk->sk_write_space);
+ ctx->saved_destruct = READ_ONCE(sk->sk_destruct);
+ INIT_WORK(&ctx->work, espintcp_tx_work);
+
+ /* avoid using task_frag */
+ sk->sk_allocation = GFP_ATOMIC;
+ sk->sk_use_task_frag = false;
if (sk->sk_family == AF_INET) {
- sk->sk_prot = &espintcp_prot;
- sk->sk_socket->ops = &espintcp_ops;
+ prot = &espintcp_prot;
+ ops = &espintcp_ops;
} else {
mutex_lock(&tcpv6_prot_mutex);
if (!espintcp6_prot.recvmsg)
- build_protos(&espintcp6_prot, &espintcp6_ops, sk->sk_prot, sk->sk_socket->ops);
+ build_protos(&espintcp6_prot, &espintcp6_ops,
+ READ_ONCE(sk->sk_prot),
+ READ_ONCE(sk->sk_socket->ops));
mutex_unlock(&tcpv6_prot_mutex);
- sk->sk_prot = &espintcp6_prot;
- sk->sk_socket->ops = &espintcp6_ops;
+ prot = &espintcp6_prot;
+ ops = &espintcp6_ops;
}
- ctx->saved_data_ready = sk->sk_data_ready;
- ctx->saved_write_space = sk->sk_write_space;
- ctx->saved_destruct = sk->sk_destruct;
- sk->sk_data_ready = espintcp_data_ready;
- sk->sk_write_space = espintcp_write_space;
- sk->sk_destruct = espintcp_destruct;
rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
- INIT_WORK(&ctx->work, espintcp_tx_work);
- /* avoid using task_frag */
- sk->sk_allocation = GFP_ATOMIC;
- sk->sk_use_task_frag = false;
+ /*
+ * Publish the fully initialized ctx before publishing any entry point
+ * that can call espintcp_getctx(). The read barrier there runs after
+ * the caller has observed one of these pointers.
+ */
+ smp_wmb();
+ WRITE_ONCE(sk->sk_prot, prot);
+ WRITE_ONCE(sk->sk_socket->ops, ops);
+ WRITE_ONCE(sk->sk_data_ready, espintcp_data_ready);
+ WRITE_ONCE(sk->sk_write_space, espintcp_write_space);
+ WRITE_ONCE(sk->sk_destruct, espintcp_destruct);
+
+ strp_check_rcv(&ctx->strp);
return 0;
@@ -530,7 +546,7 @@ static void espintcp_close(struct sock *sk, long timeout)
strp_stop(&ctx->strp);
- sk->sk_prot = &tcp_prot;
+ WRITE_ONCE(sk->sk_prot, &tcp_prot);
barrier();
disable_work_sync(&ctx->work);
--
2.47.3
^ permalink raw reply related [flat|nested] 2+ messages in thread
* Re: [PATCH net 1/1] xfrm: espintcp: publish ULP context before entry points
2026-05-11 13:40 ` [PATCH net 1/1] xfrm: espintcp: publish ULP context before entry points Ren Wei
@ 2026-05-12 10:07 ` Sabrina Dubroca
0 siblings, 0 replies; 2+ messages in thread
From: Sabrina Dubroca @ 2026-05-12 10:07 UTC (permalink / raw)
To: Ren Wei
Cc: netdev, davem, edumazet, kuba, pabeni, horms, steffen.klassert,
herbert, yuantan098, yifanwucs, tomapufckgml, bird, ronbogo,
zylzyl2333
Thanks for the fix. A small note: IPsec fixes go through the "ipsec"
tree, not "net", so the prefix should be [PATCH ipsec].
Some comments inline:
2026-05-11, 21:40:58 +0800, Ren Wei wrote:
> diff --git a/include/net/espintcp.h b/include/net/espintcp.h
> index c70efd704b6d..034be559786b 100644
> --- a/include/net/espintcp.h
> +++ b/include/net/espintcp.h
> @@ -34,7 +34,16 @@ static inline struct espintcp_ctx *espintcp_getctx(const struct sock *sk)
> {
> const struct inet_connection_sock *icsk = inet_csk(sk);
>
> - /* RCU is only needed for diag */
> - return (__force void *)icsk->icsk_ulp_data;
> + /*
> + * The caller reached an ESP entry point by observing sk_prot,
> + * sk_socket->ops, or one of the socket callbacks. Keep the ctx
> + * load after that observation so the caller cannot see the new
> + * entry point while still seeing stale icsk_ulp_data.
I don't think this comment is really helpful.
> + *
> + * Pairs with smp_wmb() in espintcp_init_sk().
> + */
> + smp_rmb();
> +
> + return (__force void *)READ_ONCE(icsk->icsk_ulp_data);
I think smp_store_release/smp_load_acquire is the "standard spelling"
for this now.
[...]
> @@ -472,34 +476,46 @@ static int espintcp_init_sk(struct sock *sk)
>
> __sk_dst_reset(sk);
>
> - strp_check_rcv(&ctx->strp);
> skb_queue_head_init(&ctx->ike_queue);
> skb_queue_head_init(&ctx->out_queue);
> + ctx->saved_data_ready = READ_ONCE(sk->sk_data_ready);
> + ctx->saved_write_space = READ_ONCE(sk->sk_write_space);
> + ctx->saved_destruct = READ_ONCE(sk->sk_destruct);
If something is changing those while espintcp_init_sk is running,
READ_ONCE won't help us. We'll end up with the wrong saved_*
values. Can this actually happen here?
> + INIT_WORK(&ctx->work, espintcp_tx_work);
> +
> + /* avoid using task_frag */
> + sk->sk_allocation = GFP_ATOMIC;
> + sk->sk_use_task_frag = false;
>
> if (sk->sk_family == AF_INET) {
> - sk->sk_prot = &espintcp_prot;
> - sk->sk_socket->ops = &espintcp_ops;
> + prot = &espintcp_prot;
> + ops = &espintcp_ops;
> } else {
> mutex_lock(&tcpv6_prot_mutex);
> if (!espintcp6_prot.recvmsg)
> - build_protos(&espintcp6_prot, &espintcp6_ops, sk->sk_prot, sk->sk_socket->ops);
> + build_protos(&espintcp6_prot, &espintcp6_ops,
> + READ_ONCE(sk->sk_prot),
> + READ_ONCE(sk->sk_socket->ops));
And similar here. Those should always be tcpv6_prot/inet6_stream_ops,
but I wrote it this way to avoid having to use stubs, back when IPv6
could be built as a module. This could now be moved into espintcp_init
like the ipv4 variant of this.
> mutex_unlock(&tcpv6_prot_mutex);
>
> - sk->sk_prot = &espintcp6_prot;
> - sk->sk_socket->ops = &espintcp6_ops;
> + prot = &espintcp6_prot;
> + ops = &espintcp6_ops;
> }
Or just move the whole block to the end, instead of introducing those
temporary variables?
> - ctx->saved_data_ready = sk->sk_data_ready;
> - ctx->saved_write_space = sk->sk_write_space;
> - ctx->saved_destruct = sk->sk_destruct;
> - sk->sk_data_ready = espintcp_data_ready;
> - sk->sk_write_space = espintcp_write_space;
> - sk->sk_destruct = espintcp_destruct;
> rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
> - INIT_WORK(&ctx->work, espintcp_tx_work);
>
> - /* avoid using task_frag */
> - sk->sk_allocation = GFP_ATOMIC;
> - sk->sk_use_task_frag = false;
> + /*
> + * Publish the fully initialized ctx before publishing any entry point
> + * that can call espintcp_getctx(). The read barrier there runs after
> + * the caller has observed one of these pointers.
> + */
> + smp_wmb();
> + WRITE_ONCE(sk->sk_prot, prot);
> + WRITE_ONCE(sk->sk_socket->ops, ops);
> + WRITE_ONCE(sk->sk_data_ready, espintcp_data_ready);
> + WRITE_ONCE(sk->sk_write_space, espintcp_write_space);
> + WRITE_ONCE(sk->sk_destruct, espintcp_destruct);
> +
> + strp_check_rcv(&ctx->strp);
>
> return 0;
>
> @@ -530,7 +546,7 @@ static void espintcp_close(struct sock *sk, long timeout)
>
> strp_stop(&ctx->strp);
>
> - sk->sk_prot = &tcp_prot;
> + WRITE_ONCE(sk->sk_prot, &tcp_prot);
Actually this should be the original sk_prot, which could be
tcpv6_prot.
I'm not sure how much the WRITE_ONCE matters here. What is it
protecting against/synchronizing with?
--
Sabrina
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2026-05-12 10:07 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
[not found] <cover.1778464688.git.zylzyl2333@gmail.com>
2026-05-11 13:40 ` [PATCH net 1/1] xfrm: espintcp: publish ULP context before entry points Ren Wei
2026-05-12 10:07 ` Sabrina Dubroca
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.