Netdev List
 help / color / mirror / Atom feed
* [PATCH v1 net] ipmr: Free mr_table after RCU grace period.
@ 2026-04-23  5:34 Kuniyuki Iwashima
  2026-04-28  1:50 ` patchwork-bot+netdevbpf
  2026-05-06  5:59 ` Lai, Yi
  0 siblings, 2 replies; 4+ messages in thread
From: Kuniyuki Iwashima @ 2026-04-23  5:34 UTC (permalink / raw)
  To: David S. Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev

With CONFIG_IP_MROUTE_MULTIPLE_TABLES=n, ipmr_fib_lookup()
does not check if net->ipv4.mrt is NULL.

Since default_device_exit_batch() is called after ->exit_rtnl(),
a device could receive IGMP packets and access net->ipv4.mrt
during/after ipmr_rules_exit_rtnl().

If ipmr_rules_exit_rtnl() had already cleared it and freed the
memory, the access would trigger null-ptr-deref or use-after-free.

Let's fix it by using RCU helper and free mrt after RCU grace
period.

In addition, check_net(net) is added to mroute_clean_tables()
and ipmr_cache_unresolved() to synchronise via mfc_unres_lock.
This prevents ipmr_cache_unresolved() from putting skb into
c->_c.mfc_un.unres.unresolved after mroute_clean_tables()
purges it.

For the same reason, timer_shutdown_sync() is moved after
mroute_clean_tables().

Since rhltable_destroy() holds mutex internally, rcu_work is
used, and it is placed as the first member because rcu_head
must be placed within <4K offset.  mr_table is already 3864
bytes without rcu_work.

Note that IP6MR is not yet converted to ->exit_rtnl(), so this
change is not needed for now but will be.

Fixes: b22b01867406 ("ipmr: Convert ipmr_net_exit_batch() to ->exit_rtnl().")
Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
 include/linux/mroute_base.h |   3 +
 net/ipv4/ipmr.c             | 108 +++++++++++++++++++-----------------
 net/ipv4/ipmr_base.c        |  16 ++++++
 3 files changed, 77 insertions(+), 50 deletions(-)

diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index cf3374580f74..5d75cc5b057e 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -226,6 +226,7 @@ struct mr_table_ops {
 
 /**
  * struct mr_table - a multicast routing table
+ * @work: used for table destruction
  * @list: entry within a list of multicast routing tables
  * @net: net where this table belongs
  * @ops: protocol specific operations
@@ -243,6 +244,7 @@ struct mr_table_ops {
  * @mroute_reg_vif_num: PIM-device vif index
  */
 struct mr_table {
+	struct rcu_work		work;
 	struct list_head	list;
 	possible_net_t		net;
 	struct mr_table_ops	ops;
@@ -274,6 +276,7 @@ void vif_device_init(struct vif_device *v,
 		     unsigned short flags,
 		     unsigned short get_iflink_mask);
 
+void mr_table_free(struct mr_table *mrt);
 struct mr_table *
 mr_table_alloc(struct net *net, u32 id,
 	       struct mr_table_ops *ops,
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 8a08d09b4c30..2058ca860294 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -151,16 +151,6 @@ static struct mr_table *__ipmr_get_table(struct net *net, u32 id)
 	return NULL;
 }
 
-static struct mr_table *ipmr_get_table(struct net *net, u32 id)
-{
-	struct mr_table *mrt;
-
-	rcu_read_lock();
-	mrt = __ipmr_get_table(net, id);
-	rcu_read_unlock();
-	return mrt;
-}
-
 static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
 			   struct mr_table **mrt)
 {
@@ -293,7 +283,7 @@ static void __net_exit ipmr_rules_exit_rtnl(struct net *net,
 	struct mr_table *mrt, *next;
 
 	list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
-		list_del(&mrt->list);
+		list_del_rcu(&mrt->list);
 		ipmr_free_table(mrt, dev_kill_list);
 	}
 }
@@ -315,28 +305,30 @@ bool ipmr_rule_default(const struct fib_rule *rule)
 }
 EXPORT_SYMBOL(ipmr_rule_default);
 #else
-#define ipmr_for_each_table(mrt, net) \
-	for (mrt = net->ipv4.mrt; mrt; mrt = NULL)
-
 static struct mr_table *ipmr_mr_table_iter(struct net *net,
 					   struct mr_table *mrt)
 {
 	if (!mrt)
-		return net->ipv4.mrt;
+		return rcu_dereference(net->ipv4.mrt);
 	return NULL;
 }
 
-static struct mr_table *ipmr_get_table(struct net *net, u32 id)
+static struct mr_table *__ipmr_get_table(struct net *net, u32 id)
 {
-	return net->ipv4.mrt;
+	return rcu_dereference_check(net->ipv4.mrt,
+				     lockdep_rtnl_is_held() ||
+				     !rcu_access_pointer(net->ipv4.mrt));
 }
 
-#define __ipmr_get_table ipmr_get_table
+#define ipmr_for_each_table(mrt, net)				\
+	for (mrt = __ipmr_get_table(net, 0); mrt; mrt = NULL)
 
 static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
 			   struct mr_table **mrt)
 {
-	*mrt = net->ipv4.mrt;
+	*mrt = rcu_dereference(net->ipv4.mrt);
+	if (!*mrt)
+		return -EAGAIN;
 	return 0;
 }
 
@@ -347,7 +339,8 @@ static int __net_init ipmr_rules_init(struct net *net)
 	mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
 	if (IS_ERR(mrt))
 		return PTR_ERR(mrt);
-	net->ipv4.mrt = mrt;
+
+	rcu_assign_pointer(net->ipv4.mrt, mrt);
 	return 0;
 }
 
@@ -358,9 +351,10 @@ static void __net_exit ipmr_rules_exit(struct net *net)
 static void __net_exit ipmr_rules_exit_rtnl(struct net *net,
 					    struct list_head *dev_kill_list)
 {
-	ipmr_free_table(net->ipv4.mrt, dev_kill_list);
+	struct mr_table *mrt = rcu_dereference_protected(net->ipv4.mrt, 1);
 
-	net->ipv4.mrt = NULL;
+	RCU_INIT_POINTER(net->ipv4.mrt, NULL);
+	ipmr_free_table(mrt, dev_kill_list);
 }
 
 static int ipmr_rules_dump(struct net *net, struct notifier_block *nb,
@@ -381,6 +375,17 @@ bool ipmr_rule_default(const struct fib_rule *rule)
 EXPORT_SYMBOL(ipmr_rule_default);
 #endif
 
+static struct mr_table *ipmr_get_table(struct net *net, u32 id)
+{
+	struct mr_table *mrt;
+
+	rcu_read_lock();
+	mrt = __ipmr_get_table(net, id);
+	rcu_read_unlock();
+
+	return mrt;
+}
+
 static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg,
 				const void *ptr)
 {
@@ -441,12 +446,11 @@ static void ipmr_free_table(struct mr_table *mrt, struct list_head *dev_kill_lis
 
 	WARN_ON_ONCE(!mr_can_free_table(net));
 
-	timer_shutdown_sync(&mrt->ipmr_expire_timer);
 	mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC |
 			    MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC,
 			    &ipmr_dev_kill_list);
-	rhltable_destroy(&mrt->mfc_hash);
-	kfree(mrt);
+	timer_shutdown_sync(&mrt->ipmr_expire_timer);
+	mr_table_free(mrt);
 
 	WARN_ON_ONCE(!net_initialized(net) && !list_empty(&ipmr_dev_kill_list));
 	list_splice(&ipmr_dev_kill_list, dev_kill_list);
@@ -1135,12 +1139,19 @@ static int ipmr_cache_report(const struct mr_table *mrt,
 static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
 				 struct sk_buff *skb, struct net_device *dev)
 {
+	struct net *net = read_pnet(&mrt->net);
 	const struct iphdr *iph = ip_hdr(skb);
-	struct mfc_cache *c;
+	struct mfc_cache *c = NULL;
 	bool found = false;
 	int err;
 
 	spin_lock_bh(&mfc_unres_lock);
+
+	if (!check_net(net)) {
+		err = -EINVAL;
+		goto err;
+	}
+
 	list_for_each_entry(c, &mrt->mfc_unres_queue, _c.list) {
 		if (c->mfc_mcastgrp == iph->daddr &&
 		    c->mfc_origin == iph->saddr) {
@@ -1153,10 +1164,8 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
 		/* Create a new entry if allowable */
 		c = ipmr_cache_alloc_unres();
 		if (!c) {
-			spin_unlock_bh(&mfc_unres_lock);
-
-			kfree_skb(skb);
-			return -ENOBUFS;
+			err = -ENOBUFS;
+			goto err;
 		}
 
 		/* Fill in the new cache entry */
@@ -1166,17 +1175,8 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
 
 		/* Reflect first query at mrouted. */
 		err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
-
-		if (err < 0) {
-			/* If the report failed throw the cache entry
-			   out - Brad Parker
-			 */
-			spin_unlock_bh(&mfc_unres_lock);
-
-			ipmr_cache_free(c);
-			kfree_skb(skb);
-			return err;
-		}
+		if (err < 0)
+			goto err;
 
 		atomic_inc(&mrt->cache_resolve_queue_len);
 		list_add(&c->_c.list, &mrt->mfc_unres_queue);
@@ -1189,18 +1189,26 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
 
 	/* See if we can append the packet */
 	if (c->_c.mfc_un.unres.unresolved.qlen > 3) {
-		kfree_skb(skb);
+		c = NULL;
 		err = -ENOBUFS;
-	} else {
-		if (dev) {
-			skb->dev = dev;
-			skb->skb_iif = dev->ifindex;
-		}
-		skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb);
-		err = 0;
+		goto err;
+	}
+
+	if (dev) {
+		skb->dev = dev;
+		skb->skb_iif = dev->ifindex;
 	}
 
+	skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb);
+
 	spin_unlock_bh(&mfc_unres_lock);
+	return 0;
+
+err:
+	spin_unlock_bh(&mfc_unres_lock);
+	if (c)
+		ipmr_cache_free(c);
+	kfree_skb(skb);
 	return err;
 }
 
@@ -1346,7 +1354,7 @@ static void mroute_clean_tables(struct mr_table *mrt, int flags,
 	}
 
 	if (flags & MRT_FLUSH_MFC) {
-		if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
+		if (atomic_read(&mrt->cache_resolve_queue_len) != 0 || !check_net(net)) {
 			spin_lock_bh(&mfc_unres_lock);
 			list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
 				list_del(&c->list);
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index 37a3c144276c..3930d612c3de 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -28,6 +28,20 @@ void vif_device_init(struct vif_device *v,
 		v->link = dev->ifindex;
 }
 
+static void __mr_free_table(struct work_struct *work)
+{
+	struct mr_table *mrt = container_of(to_rcu_work(work),
+					    struct mr_table, work);
+
+	rhltable_destroy(&mrt->mfc_hash);
+	kfree(mrt);
+}
+
+void mr_table_free(struct mr_table *mrt)
+{
+	queue_rcu_work(system_unbound_wq, &mrt->work);
+}
+
 struct mr_table *
 mr_table_alloc(struct net *net, u32 id,
 	       struct mr_table_ops *ops,
@@ -50,6 +64,8 @@ mr_table_alloc(struct net *net, u32 id,
 		kfree(mrt);
 		return ERR_PTR(err);
 	}
+
+	INIT_RCU_WORK(&mrt->work, __mr_free_table);
 	INIT_LIST_HEAD(&mrt->mfc_cache_list);
 	INIT_LIST_HEAD(&mrt->mfc_unres_queue);
 
-- 
2.54.0.rc2.533.g4f5dca5207-goog


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH v1 net] ipmr: Free mr_table after RCU grace period.
  2026-04-23  5:34 [PATCH v1 net] ipmr: Free mr_table after RCU grace period Kuniyuki Iwashima
@ 2026-04-28  1:50 ` patchwork-bot+netdevbpf
  2026-05-06  5:59 ` Lai, Yi
  1 sibling, 0 replies; 4+ messages in thread
From: patchwork-bot+netdevbpf @ 2026-04-28  1:50 UTC (permalink / raw)
  To: Kuniyuki Iwashima
  Cc: davem, dsahern, edumazet, kuba, pabeni, horms, kuni1840, netdev

Hello:

This patch was applied to netdev/net.git (main)
by Jakub Kicinski <kuba@kernel.org>:

On Thu, 23 Apr 2026 05:34:54 +0000 you wrote:
> With CONFIG_IP_MROUTE_MULTIPLE_TABLES=n, ipmr_fib_lookup()
> does not check if net->ipv4.mrt is NULL.
> 
> Since default_device_exit_batch() is called after ->exit_rtnl(),
> a device could receive IGMP packets and access net->ipv4.mrt
> during/after ipmr_rules_exit_rtnl().
> 
> [...]

Here is the summary with links:
  - [v1,net] ipmr: Free mr_table after RCU grace period.
    https://git.kernel.org/netdev/net/c/b3b6babf4751

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH v1 net] ipmr: Free mr_table after RCU grace period.
  2026-04-23  5:34 [PATCH v1 net] ipmr: Free mr_table after RCU grace period Kuniyuki Iwashima
  2026-04-28  1:50 ` patchwork-bot+netdevbpf
@ 2026-05-06  5:59 ` Lai, Yi
  2026-05-06  6:20   ` Kuniyuki Iwashima
  1 sibling, 1 reply; 4+ messages in thread
From: Lai, Yi @ 2026-05-06  5:59 UTC (permalink / raw)
  To: Kuniyuki Iwashima
  Cc: David S. Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Simon Horman, Kuniyuki Iwashima, netdev,
	linux-kernel

On Thu, Apr 23, 2026 at 05:34:54AM +0000, Kuniyuki Iwashima wrote:
> With CONFIG_IP_MROUTE_MULTIPLE_TABLES=n, ipmr_fib_lookup()
> does not check if net->ipv4.mrt is NULL.
> 
> Since default_device_exit_batch() is called after ->exit_rtnl(),
> a device could receive IGMP packets and access net->ipv4.mrt
> during/after ipmr_rules_exit_rtnl().
> 
> If ipmr_rules_exit_rtnl() had already cleared it and freed the
> memory, the access would trigger null-ptr-deref or use-after-free.
> 
> Let's fix it by using RCU helper and free mrt after RCU grace
> period.
> 
> In addition, check_net(net) is added to mroute_clean_tables()
> and ipmr_cache_unresolved() to synchronise via mfc_unres_lock.
> This prevents ipmr_cache_unresolved() from putting skb into
> c->_c.mfc_un.unres.unresolved after mroute_clean_tables()
> purges it.
> 
> For the same reason, timer_shutdown_sync() is moved after
> mroute_clean_tables().
> 
> Since rhltable_destroy() holds mutex internally, rcu_work is
> used, and it is placed as the first member because rcu_head
> must be placed within <4K offset.  mr_table is already 3864
> bytes without rcu_work.
> 
> Note that IP6MR is not yet converted to ->exit_rtnl(), so this
> change is not needed for now but will be.
> 
> Fixes: b22b01867406 ("ipmr: Convert ipmr_net_exit_batch() to ->exit_rtnl().")
> Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>

Hi Kuniyuki Iwashima,

Greetings!

I used Syzkaller and found that there is WARNING: suspicious RCU usage in reg_vif_xmit in linux-next next-20260505.

After bisection, the first bad commit is:
"
b3b6babf4751 ipmr: Free mr_table after RCU grace period
"

All detailed info can be found at:
https://github.com/laifryiee/syzkaller_logs/tree/main/260506_091248_reg_vif_xmit
Syzkaller repro code:
https://github.com/laifryiee/syzkaller_logs/tree/main/260506_091248_reg_vif_xmit/repro.c
Syzkaller repro syscall steps:
https://github.com/laifryiee/syzkaller_logs/tree/main/260506_091248_reg_vif_xmit/repro.prog
Syzkaller report:
https://github.com/laifryiee/syzkaller_logs/tree/main/260506_091248_reg_vif_xmit/repro.report
Kconfig(make olddefconfig):
https://github.com/laifryiee/syzkaller_logs/tree/main/260506_091248_reg_vif_xmit/kconfig_origin
Bisect info:
https://github.com/laifryiee/syzkaller_logs/tree/main/260506_091248_reg_vif_xmit/bisect_info.log
bzImage:
https://github.com/laifryiee/syzkaller_logs/raw/refs/heads/main/260506_091248_reg_vif_xmit/bzImage_next-20260505
Issue dmesg:
https://github.com/laifryiee/syzkaller_logs/blob/main/260506_091248_reg_vif_xmit/next-20260505_dmesg.log

"
[   18.611146] =============================
[   18.611406] WARNING: suspicious RCU usage
[   18.611657] 7.1.0-rc2-next-20260505-next-2026050 #1 Not tainted
[   18.612022] -----------------------------
[   18.612289] net/ipv4/ipmr.c:329 suspicious rcu_dereference_check() usage!
[   18.612755]
[   18.612755] other info that might help us debug this:
[   18.612755]
[   18.613314]
[   18.613314] rcu_scheduler_active = 2, debug_locks = 1
[   18.613758] 2 locks held by repro/725:
[   18.614195]  #0: ffffffff87896440 (rcu_read_lock_bh){....}-{1:3}, at: __dev_queue_xmit+0x239/0x4140
[   18.614860]  #1: ff1100000df5b918 (_xmit_PIMREG#2){+...}-{3:3}, at: __dev_queue_xmit+0x1d5d/0x4140
[   18.615505]
[   18.615505] stack backtrace:
[   18.615814] CPU: 0 UID: 0 PID: 725 Comm: repro Not tainted 7.1.0-rc2-next-20260505-next-2026050 #1 PREEMPT(lazy)
[   18.615826] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu4
[   18.615831] Call Trace:
[   18.615834]  <TASK>
[   18.615838]  dump_stack_lvl+0x121/0x150
[   18.615853]  dump_stack+0x19/0x20
[   18.615864]  lockdep_rcu_suspicious+0x15b/0x1f0
[   18.615882]  reg_vif_xmit+0x2ee/0x3c0
[   18.615898]  dev_hard_start_xmit+0x170/0x700
[   18.615915]  __dev_queue_xmit+0x1df1/0x4140
[   18.615931]  ? __might_fault+0x14a/0x1b0
[   18.615943]  ? __this_cpu_preempt_check+0x21/0x30
[   18.615961]  ? __pfx___dev_queue_xmit+0x10/0x10
[   18.615977]  ? _copy_from_iter+0x288/0x15e0
[   18.615989]  ? __virt_addr_valid+0x22c/0x420
[   18.616004]  ? __virt_addr_valid+0x22c/0x420
[   18.616018]  ? __this_cpu_preempt_check+0x21/0x30
[   18.616030]  ? __pfx__copy_from_iter+0x10/0x10
[   18.616048]  ? __sanitizer_cov_trace_const_cmp1+0x1e/0x30
[   18.616064]  ? packet_parse_headers+0x439/0x7b0
[   18.616076]  ? packet_parse_headers+0x202/0x7b0
[   18.616088]  ? __pfx_packet_parse_headers+0x10/0x10
[   18.616103]  packet_xmit+0x252/0x370
[   18.616119]  packet_sendmsg+0x39ad/0x5650
[   18.616136]  ? __lock_acquire+0x412/0x2390
[   18.616174]  ? __pfx_packet_sendmsg+0x10/0x10
[   18.616189]  ? audit_watch_handle_event+0x130/0x900
[   18.616201]  ? __import_iovec+0x1df/0x660
[   18.616213]  ? _copy_from_user+0x75/0xa0
[   18.616229]  ? __pfx_packet_sendmsg+0x10/0x10
[   18.616242]  ____sys_sendmsg+0xa21/0xba0
[   18.616257]  ? __pfx_____sys_sendmsg+0x10/0x10
[   18.616274]  ? __this_cpu_preempt_check+0x21/0x30
[   18.616285]  ? lock_release+0x14f/0x2c0
[   18.616305]  ___sys_sendmsg+0x121/0x1c0
[   18.616322]  ? __pfx____sys_sendmsg+0x10/0x10
[   18.616347]  ? __handle_mm_fault+0x656/0x2cb0
[   18.616388]  __sys_sendmsg+0x177/0x220
[   18.616403]  ? __pfx___sys_sendmsg+0x10/0x10
[   18.616428]  ? seqcount_lockdep_reader_access.constprop.0+0xc0/0xd0
[   18.616440]  ? __sanitizer_cov_trace_cmp4+0x1a/0x20
[   18.616453]  ? ktime_get_coarse_real_ts64+0xad/0xf0
[   18.616471]  __x64_sys_sendmsg+0x80/0xc0
[   18.616487]  x64_sys_call+0x1d9c/0x21c0
[   18.616499]  do_syscall_64+0xc1/0x1020
[   18.616517]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[   18.616527] RIP: 0033:0x7f93b863ee5d
[   18.616536] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c8
[   18.616546] RSP: 002b:00007fff211cf048 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
[   18.616555] RAX: ffffffffffffffda RBX: 0000200000000380 RCX: 00007f93b863ee5d
[   18.616561] RDX: 0000000000000000 RSI: 00002000000012c0 RDI: 0000000000000004
[   18.616567] RBP: 00007fff211cf070 R08: 0000000200000000 R09: 0000000200000000
[   18.616573] R10: 0000000200000000 R11: 0000000000000246 R12: 00007fff211cf188
[   18.616579] R13: 0000000000401164 R14: 0000000000403e08 R15: 00007f93b886e000
[   18.616601]  </TASK>
"

Hope this could be insightful to you.

Regards,
Yi Lai

---

If you don't need the following environment to reproduce the problem or if you
already have one reproduced environment, please ignore the following information.

How to reproduce:
git clone https://gitlab.com/xupengfe/repro_vm_env.git
cd repro_vm_env
tar -xvf repro_vm_env.tar.gz
cd repro_vm_env; ./start3.sh  // it needs qemu-system-x86_64 and I used v7.1.0
  // start3.sh will load bzImage_2241ab53cbb5cdb08a6b2d4688feb13971058f65 v6.2-rc5 kernel
  // You could change the bzImage_xxx as you want
  // Maybe you need to remove line "-drive if=pflash,format=raw,readonly=on,file=./OVMF_CODE.fd \" for different qemu version
You could use below command to log in, there is no password for root.
ssh -p 10023 root@localhost

After login vm(virtual machine) successfully, you could transfer reproduced
binary to the vm by below way, and reproduce the problem in vm:
gcc -pthread -o repro repro.c
scp -P 10023 repro root@localhost:/root/

Get the bzImage for target kernel:
Please use target kconfig and copy it to kernel_src/.config
make olddefconfig
make -jx bzImage           //x should be equal to or less than the number of CPUs your PC has

Fill the bzImage file into above start3.sh to load the target kernel in vm.


Tips:
If you already have qemu-system-x86_64, please ignore below info.
If you want to install qemu v7.1.0 version:
git clone https://github.com/qemu/qemu.git
cd qemu
git checkout -f v7.1.0
mkdir build
cd build
yum install -y ninja-build.x86_64
yum -y install libslirp-devel.x86_64
../configure --target-list=x86_64-softmmu --enable-kvm --enable-vnc --enable-gtk --enable-sdl --enable-usb-redir --enable-slirp
make
make install 

> ---
>  include/linux/mroute_base.h |   3 +
>  net/ipv4/ipmr.c             | 108 +++++++++++++++++++-----------------
>  net/ipv4/ipmr_base.c        |  16 ++++++
>  3 files changed, 77 insertions(+), 50 deletions(-)
> 
> diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
> index cf3374580f74..5d75cc5b057e 100644
> --- a/include/linux/mroute_base.h
> +++ b/include/linux/mroute_base.h
> @@ -226,6 +226,7 @@ struct mr_table_ops {
>  
>  /**
>   * struct mr_table - a multicast routing table
> + * @work: used for table destruction
>   * @list: entry within a list of multicast routing tables
>   * @net: net where this table belongs
>   * @ops: protocol specific operations
> @@ -243,6 +244,7 @@ struct mr_table_ops {
>   * @mroute_reg_vif_num: PIM-device vif index
>   */
>  struct mr_table {
> +	struct rcu_work		work;
>  	struct list_head	list;
>  	possible_net_t		net;
>  	struct mr_table_ops	ops;
> @@ -274,6 +276,7 @@ void vif_device_init(struct vif_device *v,
>  		     unsigned short flags,
>  		     unsigned short get_iflink_mask);
>  
> +void mr_table_free(struct mr_table *mrt);
>  struct mr_table *
>  mr_table_alloc(struct net *net, u32 id,
>  	       struct mr_table_ops *ops,
> diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
> index 8a08d09b4c30..2058ca860294 100644
> --- a/net/ipv4/ipmr.c
> +++ b/net/ipv4/ipmr.c
> @@ -151,16 +151,6 @@ static struct mr_table *__ipmr_get_table(struct net *net, u32 id)
>  	return NULL;
>  }
>  
> -static struct mr_table *ipmr_get_table(struct net *net, u32 id)
> -{
> -	struct mr_table *mrt;
> -
> -	rcu_read_lock();
> -	mrt = __ipmr_get_table(net, id);
> -	rcu_read_unlock();
> -	return mrt;
> -}
> -
>  static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
>  			   struct mr_table **mrt)
>  {
> @@ -293,7 +283,7 @@ static void __net_exit ipmr_rules_exit_rtnl(struct net *net,
>  	struct mr_table *mrt, *next;
>  
>  	list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
> -		list_del(&mrt->list);
> +		list_del_rcu(&mrt->list);
>  		ipmr_free_table(mrt, dev_kill_list);
>  	}
>  }
> @@ -315,28 +305,30 @@ bool ipmr_rule_default(const struct fib_rule *rule)
>  }
>  EXPORT_SYMBOL(ipmr_rule_default);
>  #else
> -#define ipmr_for_each_table(mrt, net) \
> -	for (mrt = net->ipv4.mrt; mrt; mrt = NULL)
> -
>  static struct mr_table *ipmr_mr_table_iter(struct net *net,
>  					   struct mr_table *mrt)
>  {
>  	if (!mrt)
> -		return net->ipv4.mrt;
> +		return rcu_dereference(net->ipv4.mrt);
>  	return NULL;
>  }
>  
> -static struct mr_table *ipmr_get_table(struct net *net, u32 id)
> +static struct mr_table *__ipmr_get_table(struct net *net, u32 id)
>  {
> -	return net->ipv4.mrt;
> +	return rcu_dereference_check(net->ipv4.mrt,
> +				     lockdep_rtnl_is_held() ||
> +				     !rcu_access_pointer(net->ipv4.mrt));
>  }
>  
> -#define __ipmr_get_table ipmr_get_table
> +#define ipmr_for_each_table(mrt, net)				\
> +	for (mrt = __ipmr_get_table(net, 0); mrt; mrt = NULL)
>  
>  static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
>  			   struct mr_table **mrt)
>  {
> -	*mrt = net->ipv4.mrt;
> +	*mrt = rcu_dereference(net->ipv4.mrt);
> +	if (!*mrt)
> +		return -EAGAIN;
>  	return 0;
>  }
>  
> @@ -347,7 +339,8 @@ static int __net_init ipmr_rules_init(struct net *net)
>  	mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
>  	if (IS_ERR(mrt))
>  		return PTR_ERR(mrt);
> -	net->ipv4.mrt = mrt;
> +
> +	rcu_assign_pointer(net->ipv4.mrt, mrt);
>  	return 0;
>  }
>  
> @@ -358,9 +351,10 @@ static void __net_exit ipmr_rules_exit(struct net *net)
>  static void __net_exit ipmr_rules_exit_rtnl(struct net *net,
>  					    struct list_head *dev_kill_list)
>  {
> -	ipmr_free_table(net->ipv4.mrt, dev_kill_list);
> +	struct mr_table *mrt = rcu_dereference_protected(net->ipv4.mrt, 1);
>  
> -	net->ipv4.mrt = NULL;
> +	RCU_INIT_POINTER(net->ipv4.mrt, NULL);
> +	ipmr_free_table(mrt, dev_kill_list);
>  }
>  
>  static int ipmr_rules_dump(struct net *net, struct notifier_block *nb,
> @@ -381,6 +375,17 @@ bool ipmr_rule_default(const struct fib_rule *rule)
>  EXPORT_SYMBOL(ipmr_rule_default);
>  #endif
>  
> +static struct mr_table *ipmr_get_table(struct net *net, u32 id)
> +{
> +	struct mr_table *mrt;
> +
> +	rcu_read_lock();
> +	mrt = __ipmr_get_table(net, id);
> +	rcu_read_unlock();
> +
> +	return mrt;
> +}
> +
>  static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg,
>  				const void *ptr)
>  {
> @@ -441,12 +446,11 @@ static void ipmr_free_table(struct mr_table *mrt, struct list_head *dev_kill_lis
>  
>  	WARN_ON_ONCE(!mr_can_free_table(net));
>  
> -	timer_shutdown_sync(&mrt->ipmr_expire_timer);
>  	mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC |
>  			    MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC,
>  			    &ipmr_dev_kill_list);
> -	rhltable_destroy(&mrt->mfc_hash);
> -	kfree(mrt);
> +	timer_shutdown_sync(&mrt->ipmr_expire_timer);
> +	mr_table_free(mrt);
>  
>  	WARN_ON_ONCE(!net_initialized(net) && !list_empty(&ipmr_dev_kill_list));
>  	list_splice(&ipmr_dev_kill_list, dev_kill_list);
> @@ -1135,12 +1139,19 @@ static int ipmr_cache_report(const struct mr_table *mrt,
>  static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
>  				 struct sk_buff *skb, struct net_device *dev)
>  {
> +	struct net *net = read_pnet(&mrt->net);
>  	const struct iphdr *iph = ip_hdr(skb);
> -	struct mfc_cache *c;
> +	struct mfc_cache *c = NULL;
>  	bool found = false;
>  	int err;
>  
>  	spin_lock_bh(&mfc_unres_lock);
> +
> +	if (!check_net(net)) {
> +		err = -EINVAL;
> +		goto err;
> +	}
> +
>  	list_for_each_entry(c, &mrt->mfc_unres_queue, _c.list) {
>  		if (c->mfc_mcastgrp == iph->daddr &&
>  		    c->mfc_origin == iph->saddr) {
> @@ -1153,10 +1164,8 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
>  		/* Create a new entry if allowable */
>  		c = ipmr_cache_alloc_unres();
>  		if (!c) {
> -			spin_unlock_bh(&mfc_unres_lock);
> -
> -			kfree_skb(skb);
> -			return -ENOBUFS;
> +			err = -ENOBUFS;
> +			goto err;
>  		}
>  
>  		/* Fill in the new cache entry */
> @@ -1166,17 +1175,8 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
>  
>  		/* Reflect first query at mrouted. */
>  		err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
> -
> -		if (err < 0) {
> -			/* If the report failed throw the cache entry
> -			   out - Brad Parker
> -			 */
> -			spin_unlock_bh(&mfc_unres_lock);
> -
> -			ipmr_cache_free(c);
> -			kfree_skb(skb);
> -			return err;
> -		}
> +		if (err < 0)
> +			goto err;
>  
>  		atomic_inc(&mrt->cache_resolve_queue_len);
>  		list_add(&c->_c.list, &mrt->mfc_unres_queue);
> @@ -1189,18 +1189,26 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
>  
>  	/* See if we can append the packet */
>  	if (c->_c.mfc_un.unres.unresolved.qlen > 3) {
> -		kfree_skb(skb);
> +		c = NULL;
>  		err = -ENOBUFS;
> -	} else {
> -		if (dev) {
> -			skb->dev = dev;
> -			skb->skb_iif = dev->ifindex;
> -		}
> -		skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb);
> -		err = 0;
> +		goto err;
> +	}
> +
> +	if (dev) {
> +		skb->dev = dev;
> +		skb->skb_iif = dev->ifindex;
>  	}
>  
> +	skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb);
> +
>  	spin_unlock_bh(&mfc_unres_lock);
> +	return 0;
> +
> +err:
> +	spin_unlock_bh(&mfc_unres_lock);
> +	if (c)
> +		ipmr_cache_free(c);
> +	kfree_skb(skb);
>  	return err;
>  }
>  
> @@ -1346,7 +1354,7 @@ static void mroute_clean_tables(struct mr_table *mrt, int flags,
>  	}
>  
>  	if (flags & MRT_FLUSH_MFC) {
> -		if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
> +		if (atomic_read(&mrt->cache_resolve_queue_len) != 0 || !check_net(net)) {
>  			spin_lock_bh(&mfc_unres_lock);
>  			list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
>  				list_del(&c->list);
> diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
> index 37a3c144276c..3930d612c3de 100644
> --- a/net/ipv4/ipmr_base.c
> +++ b/net/ipv4/ipmr_base.c
> @@ -28,6 +28,20 @@ void vif_device_init(struct vif_device *v,
>  		v->link = dev->ifindex;
>  }
>  
> +static void __mr_free_table(struct work_struct *work)
> +{
> +	struct mr_table *mrt = container_of(to_rcu_work(work),
> +					    struct mr_table, work);
> +
> +	rhltable_destroy(&mrt->mfc_hash);
> +	kfree(mrt);
> +}
> +
> +void mr_table_free(struct mr_table *mrt)
> +{
> +	queue_rcu_work(system_unbound_wq, &mrt->work);
> +}
> +
>  struct mr_table *
>  mr_table_alloc(struct net *net, u32 id,
>  	       struct mr_table_ops *ops,
> @@ -50,6 +64,8 @@ mr_table_alloc(struct net *net, u32 id,
>  		kfree(mrt);
>  		return ERR_PTR(err);
>  	}
> +
> +	INIT_RCU_WORK(&mrt->work, __mr_free_table);
>  	INIT_LIST_HEAD(&mrt->mfc_cache_list);
>  	INIT_LIST_HEAD(&mrt->mfc_unres_queue);
>  
> -- 
> 2.54.0.rc2.533.g4f5dca5207-goog
> 

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH v1 net] ipmr: Free mr_table after RCU grace period.
  2026-05-06  5:59 ` Lai, Yi
@ 2026-05-06  6:20   ` Kuniyuki Iwashima
  0 siblings, 0 replies; 4+ messages in thread
From: Kuniyuki Iwashima @ 2026-05-06  6:20 UTC (permalink / raw)
  To: Lai, Yi
  Cc: David S. Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Simon Horman, Kuniyuki Iwashima, netdev,
	linux-kernel

On Tue, May 5, 2026 at 11:00 PM Lai, Yi <yi1.lai@intel.com> wrote:
>
> On Thu, Apr 23, 2026 at 05:34:54AM +0000, Kuniyuki Iwashima wrote:
> > With CONFIG_IP_MROUTE_MULTIPLE_TABLES=n, ipmr_fib_lookup()
> > does not check if net->ipv4.mrt is NULL.
> >
> > Since default_device_exit_batch() is called after ->exit_rtnl(),
> > a device could receive IGMP packets and access net->ipv4.mrt
> > during/after ipmr_rules_exit_rtnl().
> >
> > If ipmr_rules_exit_rtnl() had already cleared it and freed the
> > memory, the access would trigger null-ptr-deref or use-after-free.
> >
> > Let's fix it by using RCU helper and free mrt after RCU grace
> > period.
> >
> > In addition, check_net(net) is added to mroute_clean_tables()
> > and ipmr_cache_unresolved() to synchronise via mfc_unres_lock.
> > This prevents ipmr_cache_unresolved() from putting skb into
> > c->_c.mfc_un.unres.unresolved after mroute_clean_tables()
> > purges it.
> >
> > For the same reason, timer_shutdown_sync() is moved after
> > mroute_clean_tables().
> >
> > Since rhltable_destroy() holds mutex internally, rcu_work is
> > used, and it is placed as the first member because rcu_head
> > must be placed within <4K offset.  mr_table is already 3864
> > bytes without rcu_work.
> >
> > Note that IP6MR is not yet converted to ->exit_rtnl(), so this
> > change is not needed for now but will be.
> >
> > Fixes: b22b01867406 ("ipmr: Convert ipmr_net_exit_batch() to ->exit_rtnl().")
> > Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
>
> Hi Kuniyuki Iwashima,
>
> Greetings!
>
> I used Syzkaller and found that there is WARNING: suspicious RCU usage in reg_vif_xmit in linux-next next-20260505.
>
> After bisection, the first bad commit is:
> "
> b3b6babf4751 ipmr: Free mr_table after RCU grace period
> "
>
> All detailed info can be found at:
> https://github.com/laifryiee/syzkaller_logs/tree/main/260506_091248_reg_vif_xmit
> Syzkaller repro code:
> https://github.com/laifryiee/syzkaller_logs/tree/main/260506_091248_reg_vif_xmit/repro.c
> Syzkaller repro syscall steps:
> https://github.com/laifryiee/syzkaller_logs/tree/main/260506_091248_reg_vif_xmit/repro.prog
> Syzkaller report:
> https://github.com/laifryiee/syzkaller_logs/tree/main/260506_091248_reg_vif_xmit/repro.report
> Kconfig(make olddefconfig):
> https://github.com/laifryiee/syzkaller_logs/tree/main/260506_091248_reg_vif_xmit/kconfig_origin
> Bisect info:
> https://github.com/laifryiee/syzkaller_logs/tree/main/260506_091248_reg_vif_xmit/bisect_info.log
> bzImage:
> https://github.com/laifryiee/syzkaller_logs/raw/refs/heads/main/260506_091248_reg_vif_xmit/bzImage_next-20260505
> Issue dmesg:
> https://github.com/laifryiee/syzkaller_logs/blob/main/260506_091248_reg_vif_xmit/next-20260505_dmesg.log
>
> "
> [   18.611146] =============================
> [   18.611406] WARNING: suspicious RCU usage
> [   18.611657] 7.1.0-rc2-next-20260505-next-2026050 #1 Not tainted
> [   18.612022] -----------------------------
> [   18.612289] net/ipv4/ipmr.c:329 suspicious rcu_dereference_check() usage!
> [   18.612755]
> [   18.612755] other info that might help us debug this:
> [   18.612755]
> [   18.613314]
> [   18.613314] rcu_scheduler_active = 2, debug_locks = 1
> [   18.613758] 2 locks held by repro/725:
> [   18.614195]  #0: ffffffff87896440 (rcu_read_lock_bh){....}-{1:3}, at: __dev_queue_xmit+0x239/0x4140
> [   18.614860]  #1: ff1100000df5b918 (_xmit_PIMREG#2){+...}-{3:3}, at: __dev_queue_xmit+0x1d5d/0x4140
> [   18.615505]
> [   18.615505] stack backtrace:
> [   18.615814] CPU: 0 UID: 0 PID: 725 Comm: repro Not tainted 7.1.0-rc2-next-20260505-next-2026050 #1 PREEMPT(lazy)
> [   18.615826] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu4
> [   18.615831] Call Trace:
> [   18.615834]  <TASK>
> [   18.615838]  dump_stack_lvl+0x121/0x150
> [   18.615853]  dump_stack+0x19/0x20
> [   18.615864]  lockdep_rcu_suspicious+0x15b/0x1f0
> [   18.615882]  reg_vif_xmit+0x2ee/0x3c0

Thanks for the report.

I'll just move up rcu_read_lock() in reg_vif_xmit().

ipmr_fib_lookup() for CONFIG_IP_MROUTE_MULTIPLE_TABLES=y
calls rcu_read_lock() at the same timing anyway.

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2026-05-06  6:20 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-04-23  5:34 [PATCH v1 net] ipmr: Free mr_table after RCU grace period Kuniyuki Iwashima
2026-04-28  1:50 ` patchwork-bot+netdevbpf
2026-05-06  5:59 ` Lai, Yi
2026-05-06  6:20   ` Kuniyuki Iwashima

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox