Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next v2] ipv4: hold a consistent view of rt->dst.dev under RCU
@ 2026-07-01  3:24 xuanqiang.luo
  0 siblings, 0 replies; 2+ messages in thread
From: xuanqiang.luo @ 2026-07-01  3:24 UTC (permalink / raw)
  To: David S . Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	David Ahern, Ido Schimmel
  Cc: Simon Horman, Kuniyuki Iwashima, netdev, linux-kernel,
	Xuanqiang Luo

From: Xuanqiang Luo <luoxuanqiang@kylinos.cn>

rt_flush_dev() walks the per-CPU uncached route list and rewrites
rt->dst.dev in-place to blackhole_netdev under spin_lock_bh().
This lock does not exclude RCU readers, which may load rt->dst.dev
multiple times within a single rcu_read_lock() region.

ip_rt_send_redirect() is a typical example: it reads rt->dst.dev
three times to obtain in_dev, the L3 master ifindex, and net.
A concurrent device unregistration can repoint rt->dst.dev to
blackhole_netdev between those reads, making the reader combine
state from two different net_devices — for instance, an in_dev
from the real device but a netns and peer lookup from the blackhole
device.  ip_rt_get_source() has the same problem: it reads
rt->dst.dev four times to obtain the output ifindex, the netns,
and the source address, so a concurrent flush can cause the source
selection to mix state from different devices.

Take a single dst_dev_rcu() snapshot of rt->dst.dev at the start
of each affected RCU reader and use that snapshot throughout, so
concurrent flushes cannot cause mid-function inconsistency.
In ip_rt_get_source(), take rcu_read_lock() at function entry so
that the snapshot is made inside the read-side critical section.
Publish the in-place write in rt_flush_dev() with rcu_assign_pointer()
to match the readers.

Fixes: caacf05e5ad1a ("ipv4: Properly purge netdev references on uncached routes.")
Signed-off-by: Xuanqiang Luo <luoxuanqiang@kylinos.cn>
---
v2:
- Use dst_dev_rcu() and dev_net_rcu() for the RCU readers.
- Use rcu_assign_pointer() when publishing the uncached route device
  replacement.
- Slightly adjust the commit message wording because this issue was found
  by inspection, not from an observed user-visible failure.

v1: https://lore.kernel.org/all/20260630094250.29386-1-xuanqiang.luo@linux.dev/

 net/ipv4/route.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 3f3de5164d6e5..57f38467e6d0c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -873,6 +873,7 @@ static void ipv4_negative_advice(struct sock *sk,
 void ip_rt_send_redirect(struct sk_buff *skb)
 {
 	struct rtable *rt = skb_rtable(skb);
+	struct net_device *dev;
 	struct in_device *in_dev;
 	struct inet_peer *peer;
 	struct net *net;
@@ -880,15 +881,16 @@ void ip_rt_send_redirect(struct sk_buff *skb)
 	int vif;
 
 	rcu_read_lock();
-	in_dev = __in_dev_get_rcu(rt->dst.dev);
+	dev = dst_dev_rcu(&rt->dst);
+	in_dev = __in_dev_get_rcu(dev);
 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 		rcu_read_unlock();
 		return;
 	}
 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
-	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
+	vif = l3mdev_master_ifindex_rcu(dev);
 
-	net = dev_net(rt->dst.dev);
+	net = dev_net_rcu(dev);
 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif);
 	if (!peer) {
 		rcu_read_unlock();
@@ -1287,29 +1289,32 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
 {
 	__be32 src;
 
-	if (rt_is_output_route(rt))
+	rcu_read_lock();
+	if (rt_is_output_route(rt)) {
 		src = ip_hdr(skb)->saddr;
-	else {
+	} else {
 		struct fib_result res;
 		struct iphdr *iph = ip_hdr(skb);
+		struct net_device *dev = dst_dev_rcu(&rt->dst);
+		struct net *net = dev_net_rcu(dev);
 		struct flowi4 fl4 = {
 			.daddr = iph->daddr,
 			.saddr = iph->saddr,
 			.flowi4_dscp = ip4h_dscp(iph),
-			.flowi4_oif = rt->dst.dev->ifindex,
+			.flowi4_oif = dev->ifindex,
 			.flowi4_iif = skb->dev->ifindex,
 			.flowi4_mark = skb->mark,
 		};
 
-		rcu_read_lock();
-		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
-			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
+		if (fib_lookup(net, &fl4, &res, 0) == 0)
+			src = fib_result_prefsrc(net, &res);
 		else
-			src = inet_select_addr(rt->dst.dev,
+			src = inet_select_addr(dev,
 					       rt_nexthop(rt, iph->daddr),
 					       RT_SCOPE_UNIVERSE);
-		rcu_read_unlock();
 	}
+	rcu_read_unlock();
+
 	memcpy(addr, &src, 4);
 }
 
@@ -1565,7 +1570,7 @@ void rt_flush_dev(struct net_device *dev)
 		list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) {
 			if (rt->dst.dev != dev)
 				continue;
-			rt->dst.dev = blackhole_netdev;
+			rcu_assign_pointer(rt->dst.dev_rcu, blackhole_netdev);
 			netdev_ref_replace(dev, blackhole_netdev,
 					   &rt->dst.dev_tracker, GFP_ATOMIC);
 			list_del_init(&rt->dst.rt_uncached);
-- 
2.43.0

^ permalink raw reply related	[flat|nested] 2+ messages in thread
* [PATCH net-next v1] ipv4: hold a consistent view of rt->dst.dev under RCU
@ 2026-06-30  9:42 xuanqiang.luo
  2026-07-01  3:16 ` [PATCH net-next v2] " xuanqiang.luo
  0 siblings, 1 reply; 2+ messages in thread
From: xuanqiang.luo @ 2026-06-30  9:42 UTC (permalink / raw)
  To: David S . Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	David Ahern, Ido Schimmel
  Cc: Simon Horman, Kuniyuki Iwashima, netdev, linux-kernel,
	Xuanqiang Luo

From: Xuanqiang Luo <luoxuanqiang@kylinos.cn>

rt_flush_dev() walks the per-CPU uncached route list and rewrites
rt->dst.dev in-place to blackhole_netdev under spin_lock_bh().
This lock does not exclude RCU readers, which may load rt->dst.dev
multiple times within a single rcu_read_lock() region.

ip_rt_send_redirect() is a typical example: it reads rt->dst.dev
three times to obtain in_dev, the L3 master ifindex, and net.
A concurrent device unregistration can repoint rt->dst.dev to
blackhole_netdev between those reads, making the reader combine
state from two different net_devices — for instance, an in_dev
from the real device but a netns and peer lookup from the blackhole
device, causing ICMP redirects to be issued against the wrong
namespace.  ip_rt_get_source() has the same problem: it reads
rt->dst.dev four times to obtain the output ifindex, the netns,
and the source address, so a concurrent flush can cause the source
selection to mix state from different devices.

Take a single READ_ONCE() snapshot of rt->dst.dev at the start of
each affected RCU reader and use that snapshot throughout, so
concurrent flushes cannot cause mid-function inconsistency.
Publish the in-place write in rt_flush_dev() with WRITE_ONCE() to
pair with the readers.

Fixes: caacf05e5ad1a ("ipv4: Properly purge netdev references on uncached routes.")
Signed-off-by: Xuanqiang Luo <luoxuanqiang@kylinos.cn>
---
 net/ipv4/route.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 3f3de5164d6e5..e14325c4929ab 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -873,6 +873,7 @@ static void ipv4_negative_advice(struct sock *sk,
 void ip_rt_send_redirect(struct sk_buff *skb)
 {
 	struct rtable *rt = skb_rtable(skb);
+	struct net_device *dev;
 	struct in_device *in_dev;
 	struct inet_peer *peer;
 	struct net *net;
@@ -880,15 +881,16 @@ void ip_rt_send_redirect(struct sk_buff *skb)
 	int vif;
 
 	rcu_read_lock();
-	in_dev = __in_dev_get_rcu(rt->dst.dev);
+	dev = READ_ONCE(rt->dst.dev);
+	in_dev = __in_dev_get_rcu(dev);
 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 		rcu_read_unlock();
 		return;
 	}
 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
-	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
+	vif = l3mdev_master_ifindex_rcu(dev);
 
-	net = dev_net(rt->dst.dev);
+	net = dev_net(dev);
 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif);
 	if (!peer) {
 		rcu_read_unlock();
@@ -1287,29 +1289,32 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
 {
 	__be32 src;
 
-	if (rt_is_output_route(rt))
+	rcu_read_lock();
+	if (rt_is_output_route(rt)) {
 		src = ip_hdr(skb)->saddr;
-	else {
+	} else {
 		struct fib_result res;
 		struct iphdr *iph = ip_hdr(skb);
+		struct net_device *dev = READ_ONCE(rt->dst.dev);
+		struct net *net = dev_net(dev);
 		struct flowi4 fl4 = {
 			.daddr = iph->daddr,
 			.saddr = iph->saddr,
 			.flowi4_dscp = ip4h_dscp(iph),
-			.flowi4_oif = rt->dst.dev->ifindex,
+			.flowi4_oif = dev->ifindex,
 			.flowi4_iif = skb->dev->ifindex,
 			.flowi4_mark = skb->mark,
 		};
 
-		rcu_read_lock();
-		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
-			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
+		if (fib_lookup(net, &fl4, &res, 0) == 0)
+			src = fib_result_prefsrc(net, &res);
 		else
-			src = inet_select_addr(rt->dst.dev,
+			src = inet_select_addr(dev,
 					       rt_nexthop(rt, iph->daddr),
 					       RT_SCOPE_UNIVERSE);
-		rcu_read_unlock();
 	}
+	rcu_read_unlock();
+
 	memcpy(addr, &src, 4);
 }
 
@@ -1565,7 +1570,7 @@ void rt_flush_dev(struct net_device *dev)
 		list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) {
 			if (rt->dst.dev != dev)
 				continue;
-			rt->dst.dev = blackhole_netdev;
+			WRITE_ONCE(rt->dst.dev, blackhole_netdev);
 			netdev_ref_replace(dev, blackhole_netdev,
 					   &rt->dst.dev_tracker, GFP_ATOMIC);
 			list_del_init(&rt->dst.rt_uncached);
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2026-07-01  3:25 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz follow: Atom feed)
-- links below jump to the message on this page --
2026-07-01  3:24 [PATCH net-next v2] ipv4: hold a consistent view of rt->dst.dev under RCU xuanqiang.luo
  -- strict thread matches above, loose matches on Subject: below --
2026-06-30  9:42 [PATCH net-next v1] " xuanqiang.luo
2026-07-01  3:16 ` [PATCH net-next v2] " xuanqiang.luo

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.