public inbox for stable@vger.kernel.org
 help / color / mirror / Atom feed
From: Qi Tang <tpluszz77@gmail.com>
To: Steffen Klassert <steffen.klassert@secunet.com>,
	Herbert Xu <herbert@gondor.apana.org.au>
Cc: "David S . Miller" <davem@davemloft.net>,
	Eric Dumazet <edumazet@google.com>,
	Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
	Simon Horman <horms@kernel.org>, David Ahern <dsahern@kernel.org>,
	netdev@vger.kernel.org, Qi Tang <tpluszz77@gmail.com>,
	stable@vger.kernel.org
Subject: [PATCH v2] xfrm: delay dev_put in xfrm_input to after transport reinject
Date: Tue, 31 Mar 2026 17:27:37 +0800	[thread overview]
Message-ID: <20260331092737.1937-1-tpluszz77@gmail.com> (raw)

xfrm_trans_queue() queues transport-mode packets for asynchronous
reinjection by the xfrm_trans_reinject() work handler.  After async
crypto completes, xfrm_input_resume() re-enters xfrm_input() with
encap_type == -1, which immediately calls dev_put(skb->dev), before
the skb reaches transport_finish and the reinject queue.  The device
can therefore be freed before the work handler runs, causing a
use-after-free when xfrm_trans_reinject() dereferences skb->dev.

Remove the dev_put from the async resumption entry and let the
reference survive through the transport reinject path.  Introduce
async variants of the NF_HOOK okfn callbacks that queue the skb
with dev_held=true and drop the reference on error.  The reinject
worker checks this flag and puts the reference after the callback
completes.

For the synchronous crypto path, the existing dev_hold/dev_put
around x->type->input() is unchanged — the reference is balanced
within the same softirq context before the skb reaches the queue.

If the loop re-enters async crypto (multi-SPI with a second
-EINPROGRESS), drop the extra reference from the earlier async
resume so exactly one reference accompanies the skb.

Fixes: acf568ee859f ("xfrm: Reinject transport-mode packets through tasklet")
Cc: stable@vger.kernel.org
Signed-off-by: Qi Tang <tpluszz77@gmail.com>
---
Changes in v2:
  - Do not add extra dev_hold/dev_put pair (reviewer feedback:
    "expensive operation, we just drop it too early")
  - Reuse existing dev_hold from xfrm_input, delay dev_put to
    reinject completion
  - Add async okfn variants for IPv4/IPv6 transport_finish so
    the reinject queue knows whether a dev ref is held
  - Drop the cb->dev field from v1; use bool dev_held flag instead

Link: https://lore.kernel.org/all/20260320073023.21873-1-tpluszz77@gmail.com/
---
 include/net/xfrm.h     |  3 ++-
 net/ipv4/esp4.c        |  3 ++-
 net/ipv4/xfrm4_input.c | 25 ++++++++++++++++++++++++-
 net/ipv6/esp6.c        |  3 ++-
 net/ipv6/xfrm6_input.c | 16 +++++++++++++++-
 net/xfrm/xfrm_input.c  | 35 ++++++++++++++++++++++++++---------
 6 files changed, 71 insertions(+), 14 deletions(-)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 10d3edde6b2f..1dd8b3b36649 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1779,7 +1779,8 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type);
 int xfrm_input_resume(struct sk_buff *skb, int nexthdr);
 int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb,
 			 int (*finish)(struct net *, struct sock *,
-				       struct sk_buff *));
+				       struct sk_buff *),
+			 bool dev_held);
 int xfrm_trans_queue(struct sk_buff *skb,
 		     int (*finish)(struct net *, struct sock *,
 				   struct sk_buff *));
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 6dfc0bcdef65..0114c92b10d4 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -187,7 +187,8 @@ static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
 	int err;
 
 	local_bh_disable();
-	err = xfrm_trans_queue_net(xs_net(x), skb, esp_output_tcp_encap_cb);
+	err = xfrm_trans_queue_net(xs_net(x), skb, esp_output_tcp_encap_cb,
+				   false);
 	local_bh_enable();
 
 	/* EINPROGRESS just happens to do the right thing.  It
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index f28cfd88eaf5..9765fdc63ffc 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -46,6 +46,28 @@ static inline int xfrm4_rcv_encap_finish(struct net *net, struct sock *sk,
 	return NET_RX_DROP;
 }
 
+static int xfrm4_rcv_encap_finish_async(struct net *net, struct sock *sk,
+					struct sk_buff *skb)
+{
+	if (!skb_dst(skb)) {
+		const struct iphdr *iph = ip_hdr(skb);
+
+		if (ip_route_input_noref(skb, iph->daddr, iph->saddr,
+					 ip4h_dscp(iph), skb->dev))
+			goto drop;
+	}
+
+	if (xfrm_trans_queue_net(dev_net(skb->dev), skb,
+				 xfrm4_rcv_encap_finish2, true))
+		goto drop;
+
+	return 0;
+drop:
+	dev_put(skb->dev);
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}
+
 int xfrm4_transport_finish(struct sk_buff *skb, int async)
 {
 	struct xfrm_offload *xo = xfrm_offload(skb);
@@ -74,7 +96,8 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async)
 
 	NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
 		dev_net(skb->dev), NULL, skb, skb->dev, NULL,
-		xfrm4_rcv_encap_finish);
+		async ? xfrm4_rcv_encap_finish_async :
+			xfrm4_rcv_encap_finish);
 	return 0;
 }
 
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 9f75313734f8..8a0a44d7d010 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -204,7 +204,8 @@ static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
 	int err;
 
 	local_bh_disable();
-	err = xfrm_trans_queue_net(xs_net(x), skb, esp_output_tcp_encap_cb);
+	err = xfrm_trans_queue_net(xs_net(x), skb, esp_output_tcp_encap_cb,
+				   false);
 	local_bh_enable();
 
 	/* EINPROGRESS just happens to do the right thing.  It
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index 9005fc156a20..d4eede5315ac 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -40,6 +40,19 @@ static int xfrm6_transport_finish2(struct net *net, struct sock *sk,
 	return 0;
 }
 
+static int xfrm6_transport_finish2_async(struct net *net, struct sock *sk,
+					 struct sk_buff *skb)
+{
+	if (xfrm_trans_queue_net(dev_net(skb->dev), skb, ip6_rcv_finish,
+				 true)) {
+		dev_put(skb->dev);
+		kfree_skb(skb);
+		return NET_RX_DROP;
+	}
+
+	return 0;
+}
+
 int xfrm6_transport_finish(struct sk_buff *skb, int async)
 {
 	struct xfrm_offload *xo = xfrm_offload(skb);
@@ -69,7 +82,8 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async)
 
 	NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING,
 		dev_net(skb->dev), NULL, skb, skb->dev, NULL,
-		xfrm6_transport_finish2);
+		async ? xfrm6_transport_finish2_async :
+			xfrm6_transport_finish2);
 	return 0;
 }
 
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index dc1312ed5a09..2d75f984532a 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -40,6 +40,7 @@ struct xfrm_trans_cb {
 	} header;
 	int (*finish)(struct net *net, struct sock *sk, struct sk_buff *skb);
 	struct net *net;
+	bool dev_held;
 };
 
 #define XFRM_TRANS_SKB_CB(__skb) ((struct xfrm_trans_cb *)&((__skb)->cb[0]))
@@ -506,7 +507,6 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
 		/* An encap_type of -1 indicates async resumption. */
 		if (encap_type == -1) {
 			async = 1;
-			dev_put(skb->dev);
 			seq = XFRM_SKB_CB(skb)->seq.input.low;
 			spin_lock(&x->lock);
 			goto resume;
@@ -659,8 +659,11 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
 			dev_hold(skb->dev);
 
 			nexthdr = x->type->input(x, skb);
-			if (nexthdr == -EINPROGRESS)
+			if (nexthdr == -EINPROGRESS) {
+				if (async)
+					dev_put(skb->dev);
 				return 0;
+			}
 
 			dev_put(skb->dev);
 			spin_lock(&x->lock);
@@ -695,9 +698,11 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
 		XFRM_MODE_SKB_CB(skb)->protocol = nexthdr;
 
 		err = xfrm_inner_mode_input(x, skb);
-		if (err == -EINPROGRESS)
+		if (err == -EINPROGRESS) {
+			if (async)
+				dev_put(skb->dev);
 			return 0;
-		else if (err) {
+		} else if (err) {
 			XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR);
 			goto drop;
 		}
@@ -734,6 +739,8 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
 			sp->olen = 0;
 		if (skb_valid_dst(skb))
 			skb_dst_drop(skb);
+		if (async)
+			dev_put(skb->dev);
 		gro_cells_receive(&gro_cells, skb);
 		return 0;
 	} else {
@@ -753,6 +760,8 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
 				sp->olen = 0;
 			if (skb_valid_dst(skb))
 				skb_dst_drop(skb);
+			if (async)
+				dev_put(skb->dev);
 			gro_cells_receive(&gro_cells, skb);
 			return err;
 		}
@@ -763,6 +772,8 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
 drop_unlock:
 	spin_unlock(&x->lock);
 drop:
+	if (async)
+		dev_put(skb->dev);
 	xfrm_rcv_cb(skb, family, x && x->type ? x->type->proto : nexthdr, -1);
 	kfree_skb(skb);
 	return 0;
@@ -787,15 +798,20 @@ static void xfrm_trans_reinject(struct work_struct *work)
 	spin_unlock_bh(&trans->queue_lock);
 
 	local_bh_disable();
-	while ((skb = __skb_dequeue(&queue)))
-		XFRM_TRANS_SKB_CB(skb)->finish(XFRM_TRANS_SKB_CB(skb)->net,
-					       NULL, skb);
+	while ((skb = __skb_dequeue(&queue))) {
+		struct xfrm_trans_cb *cb = XFRM_TRANS_SKB_CB(skb);
+		struct net_device *dev = cb->dev_held ? skb->dev : NULL;
+
+		cb->finish(cb->net, NULL, skb);
+		dev_put(dev);
+	}
 	local_bh_enable();
 }
 
 int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb,
 			 int (*finish)(struct net *, struct sock *,
-				       struct sk_buff *))
+				       struct sk_buff *),
+			 bool dev_held)
 {
 	struct xfrm_trans_tasklet *trans;
 
@@ -808,6 +824,7 @@ int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb,
 
 	XFRM_TRANS_SKB_CB(skb)->finish = finish;
 	XFRM_TRANS_SKB_CB(skb)->net = net;
+	XFRM_TRANS_SKB_CB(skb)->dev_held = dev_held;
 	spin_lock_bh(&trans->queue_lock);
 	__skb_queue_tail(&trans->queue, skb);
 	spin_unlock_bh(&trans->queue_lock);
@@ -820,7 +837,7 @@ int xfrm_trans_queue(struct sk_buff *skb,
 		     int (*finish)(struct net *, struct sock *,
 				   struct sk_buff *))
 {
-	return xfrm_trans_queue_net(dev_net(skb->dev), skb, finish);
+	return xfrm_trans_queue_net(dev_net(skb->dev), skb, finish, false);
 }
 EXPORT_SYMBOL(xfrm_trans_queue);
 
-- 
2.43.0


             reply	other threads:[~2026-03-31  9:27 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-03-31  9:27 Qi Tang [this message]
2026-04-02 10:36 ` [PATCH v2] xfrm: delay dev_put in xfrm_input to after transport reinject Steffen Klassert
2026-04-02 10:54   ` Florian Westphal
2026-04-02 11:26     ` Qi Tang

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260331092737.1937-1-tpluszz77@gmail.com \
    --to=tpluszz77@gmail.com \
    --cc=davem@davemloft.net \
    --cc=dsahern@kernel.org \
    --cc=edumazet@google.com \
    --cc=herbert@gondor.apana.org.au \
    --cc=horms@kernel.org \
    --cc=kuba@kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=pabeni@redhat.com \
    --cc=stable@vger.kernel.org \
    --cc=steffen.klassert@secunet.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox