Netdev List
 help / color / mirror / Atom feed
* [PATCH v2 net-next 14/15] ip6mr: Call fib_rules_unregister() without RTNL.
From: Kuniyuki Iwashima @ 2026-04-10 21:17 UTC (permalink / raw)
  To: David S . Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
In-Reply-To: <20260410211726.1668756-1-kuniyu@google.com>

fib_rules_unregister() removes ops from net->rules_ops under
spinlock, calls ops->delete() for each rule, and frees the ops.

ip6mr_rules_ops_template does not have ->delete(), and no
operation there requires RTNL.

Let's move fib_rules_unregister() from ip6mr_rules_exit_rtnl()
to ip6mr_net_exit().

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
 net/ipv6/ip6mr.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 3b8867e150fe..a31e3b740581 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -259,6 +259,11 @@ static int __net_init ip6mr_rules_init(struct net *net)
 	return err;
 }
 
+static void __net_exit ip6mr_rules_exit(struct net *net)
+{
+	fib_rules_unregister(net->ipv6.mr6_rules_ops);
+}
+
 static void __net_exit ip6mr_rules_exit_rtnl(struct net *net,
 					     struct list_head *dev_kill_list)
 {
@@ -268,8 +273,6 @@ static void __net_exit ip6mr_rules_exit_rtnl(struct net *net,
 		list_del_rcu(&mrt->list);
 		ip6mr_free_table(mrt, dev_kill_list);
 	}
-
-	fib_rules_unregister(net->ipv6.mr6_rules_ops);
 }
 
 static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb,
@@ -329,6 +332,10 @@ static int __net_init ip6mr_rules_init(struct net *net)
 	return 0;
 }
 
+static void __net_exit ip6mr_rules_exit(struct net *net)
+{
+}
+
 static void __net_exit ip6mr_rules_exit_rtnl(struct net *net,
 					     struct list_head *dev_kill_list)
 {
@@ -1367,6 +1374,7 @@ static int __net_init ip6mr_net_init(struct net *net)
 	remove_proc_entry("ip6_mr_vif", net->proc_net);
 proc_vif_fail:
 	ip6mr_rules_exit_rtnl(net, &dev_kill_list);
+	ip6mr_rules_exit(net);
 #endif
 ip6mr_rules_fail:
 	ip6mr_notifier_exit(net);
@@ -1379,6 +1387,7 @@ static void __net_exit ip6mr_net_exit(struct net *net)
 	remove_proc_entry("ip6_mr_cache", net->proc_net);
 	remove_proc_entry("ip6_mr_vif", net->proc_net);
 #endif
+	ip6mr_rules_exit(net);
 	ip6mr_notifier_exit(net);
 }
 
-- 
2.53.0.1213.gd9a14994de-goog


^ permalink raw reply related

* [PATCH v2 net-next 13/15] ip6mr: Remove RTNL in ip6mr_rules_init() and ip6mr_net_init().
From: Kuniyuki Iwashima @ 2026-04-10 21:17 UTC (permalink / raw)
  To: David S . Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
In-Reply-To: <20260410211726.1668756-1-kuniyu@google.com>

When ip6mr_free_table() is called from ip6mr_rules_init() or
ip6mr_net_init(), the netns is not yet published.

Thus, no device should have been registered, and
mroute_clean_tables() will not call mif6_delete(), so
unregister_netdevice_many() is unnecessary.

unregister_netdevice_many() does nothing if the list is empty,
but it requires RTNL due to the unconditional ASSERT_RTNL()
at the entry of unregister_netdevice_many_notify().

Let's remove unnecessary RTNL and ASSERT_RTNL() and instead
add WARN_ON_ONCE() in ip6mr_free_table().

Note that we use a local list for the new WARN_ON_ONCE() because
dev_kill_list passed from ip6mr_rules_exit_rtnl() may have some
devices when other ops->init() fails after ipmr during setup_net().

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
 net/ipv6/ip6mr.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 860fce51819e..3b8867e150fe 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -253,10 +253,7 @@ static int __net_init ip6mr_rules_init(struct net *net)
 	return 0;
 
 err2:
-	rtnl_lock();
 	ip6mr_free_table(mrt, &dev_kill_list);
-	unregister_netdevice_many(&dev_kill_list);
-	rtnl_unlock();
 err1:
 	fib_rules_unregister(ops);
 	return err;
@@ -267,7 +264,6 @@ static void __net_exit ip6mr_rules_exit_rtnl(struct net *net,
 {
 	struct mr_table *mrt, *next;
 
-	ASSERT_RTNL();
 	list_for_each_entry_safe(mrt, next, &net->ipv6.mr6_tables, list) {
 		list_del_rcu(&mrt->list);
 		ip6mr_free_table(mrt, dev_kill_list);
@@ -338,8 +334,6 @@ static void __net_exit ip6mr_rules_exit_rtnl(struct net *net,
 {
 	struct mr_table *mrt = rcu_dereference_protected(net->ipv6.mrt6, 1);
 
-	ASSERT_RTNL();
-
 	RCU_INIT_POINTER(net->ipv6.mrt6, NULL);
 	ip6mr_free_table(mrt, dev_kill_list);
 }
@@ -420,15 +414,19 @@ static void ip6mr_free_table(struct mr_table *mrt,
 			     struct list_head *dev_kill_list)
 {
 	struct net *net = read_pnet(&mrt->net);
+	LIST_HEAD(ip6mr_dev_kill_list);
 
 	WARN_ON_ONCE(!mr_can_free_table(net));
 
 	timer_shutdown_sync(&mrt->ipmr_expire_timer);
 	mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC |
 			    MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC,
-			    dev_kill_list);
+			    &ip6mr_dev_kill_list);
 	rhltable_destroy(&mrt->mfc_hash);
 	kfree_rcu(mrt, rcu);
+
+	WARN_ON_ONCE(!net_initialized(net) && !list_empty(&ip6mr_dev_kill_list));
+	list_splice(&ip6mr_dev_kill_list, dev_kill_list);
 }
 
 #ifdef CONFIG_PROC_FS
@@ -1368,10 +1366,7 @@ static int __net_init ip6mr_net_init(struct net *net)
 proc_cache_fail:
 	remove_proc_entry("ip6_mr_vif", net->proc_net);
 proc_vif_fail:
-	rtnl_lock();
 	ip6mr_rules_exit_rtnl(net, &dev_kill_list);
-	unregister_netdevice_many(&dev_kill_list);
-	rtnl_unlock();
 #endif
 ip6mr_rules_fail:
 	ip6mr_notifier_exit(net);
-- 
2.53.0.1213.gd9a14994de-goog


^ permalink raw reply related

* [PATCH v2 net-next 12/15] ip6mr: Convert ip6mr_net_exit_batch() to ->exit_rtnl().
From: Kuniyuki Iwashima @ 2026-04-10 21:17 UTC (permalink / raw)
  To: David S . Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
In-Reply-To: <20260410211726.1668756-1-kuniyu@google.com>

ip6mr_net_ops uses ->exit_batch() to acquire RTNL only once
for dying network namespaces.

ip6mr does not depend on the ordering of ->exit_rtnl() and
->exit_batch() of other pernet_operations (unlike fib_net_ops).

Once ip6mr_free_table() is called and all devices are
queued for destruction in ->exit_rtnl(), later during
NETDEV_UNREGISTER, ip6mr_device_event() will not see anything
in vif table and just do nothing.

Let's convert ip6mr_net_exit_batch() to ->exit_rtnl().

Note that fib_rules_unregister() does not need RTNL and
we will remove RTNL and unregister_netdevice_many() in
ip6mr_rules_init().

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
 net/ipv6/ip6mr.c | 30 +++++++++++++-----------------
 1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index af11fd883831..860fce51819e 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -262,18 +262,17 @@ static int __net_init ip6mr_rules_init(struct net *net)
 	return err;
 }
 
-static void __net_exit ip6mr_rules_exit(struct net *net)
+static void __net_exit ip6mr_rules_exit_rtnl(struct net *net,
+					     struct list_head *dev_kill_list)
 {
 	struct mr_table *mrt, *next;
-	LIST_HEAD(dev_kill_list);
 
 	ASSERT_RTNL();
 	list_for_each_entry_safe(mrt, next, &net->ipv6.mr6_tables, list) {
 		list_del_rcu(&mrt->list);
-		ip6mr_free_table(mrt, &dev_kill_list);
+		ip6mr_free_table(mrt, dev_kill_list);
 	}
 
-	unregister_netdevice_many(&dev_kill_list);
 	fib_rules_unregister(net->ipv6.mr6_rules_ops);
 }
 
@@ -334,16 +333,15 @@ static int __net_init ip6mr_rules_init(struct net *net)
 	return 0;
 }
 
-static void __net_exit ip6mr_rules_exit(struct net *net)
+static void __net_exit ip6mr_rules_exit_rtnl(struct net *net,
+					     struct list_head *dev_kill_list)
 {
 	struct mr_table *mrt = rcu_dereference_protected(net->ipv6.mrt6, 1);
-	LIST_HEAD(dev_kill_list);
 
 	ASSERT_RTNL();
 
 	RCU_INIT_POINTER(net->ipv6.mrt6, NULL);
-	ip6mr_free_table(mrt, &dev_kill_list);
-	unregister_netdevice_many(&dev_kill_list);
+	ip6mr_free_table(mrt, dev_kill_list);
 }
 
 static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb,
@@ -1343,6 +1341,7 @@ static void __net_exit ip6mr_notifier_exit(struct net *net)
 /* Setup for IP multicast routing */
 static int __net_init ip6mr_net_init(struct net *net)
 {
+	LIST_HEAD(dev_kill_list);
 	int err;
 
 	err = ip6mr_notifier_init(net);
@@ -1370,7 +1369,8 @@ static int __net_init ip6mr_net_init(struct net *net)
 	remove_proc_entry("ip6_mr_vif", net->proc_net);
 proc_vif_fail:
 	rtnl_lock();
-	ip6mr_rules_exit(net);
+	ip6mr_rules_exit_rtnl(net, &dev_kill_list);
+	unregister_netdevice_many(&dev_kill_list);
 	rtnl_unlock();
 #endif
 ip6mr_rules_fail:
@@ -1387,20 +1387,16 @@ static void __net_exit ip6mr_net_exit(struct net *net)
 	ip6mr_notifier_exit(net);
 }
 
-static void __net_exit ip6mr_net_exit_batch(struct list_head *net_list)
+static void __net_exit ip6mr_net_exit_rtnl(struct net *net,
+					   struct list_head *dev_kill_list)
 {
-	struct net *net;
-
-	rtnl_lock();
-	list_for_each_entry(net, net_list, exit_list)
-		ip6mr_rules_exit(net);
-	rtnl_unlock();
+	ip6mr_rules_exit_rtnl(net, dev_kill_list);
 }
 
 static struct pernet_operations ip6mr_net_ops = {
 	.init = ip6mr_net_init,
 	.exit = ip6mr_net_exit,
-	.exit_batch = ip6mr_net_exit_batch,
+	.exit_rtnl = ip6mr_net_exit_rtnl,
 };
 
 static const struct rtnl_msg_handler ip6mr_rtnl_msg_handlers[] __initconst_or_module = {
-- 
2.53.0.1213.gd9a14994de-goog


^ permalink raw reply related

* [PATCH v2 net-next 11/15] ip6mr: Move unregister_netdevice_many() out of ip6mr_free_table().
From: Kuniyuki Iwashima @ 2026-04-10 21:17 UTC (permalink / raw)
  To: David S . Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
In-Reply-To: <20260410211726.1668756-1-kuniyu@google.com>

This is a prep commit to convert ip6mr_net_exit_batch() to
->exit_rtnl().

Let's move unregister_netdevice_many() in ip6mr_free_table()
to its callers.

Now ip6mr_rules_exit() can batch all tables per netns.

Note that later we will remove RTNL and unregister_netdevice_many()
in ip6mr_rules_init().

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
 net/ipv6/ip6mr.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index dd72eb346eb1..af11fd883831 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -85,7 +85,8 @@ static DEFINE_SPINLOCK(mfc_unres_lock);
 static struct kmem_cache *mrt_cachep __read_mostly;
 
 static struct mr_table *ip6mr_new_table(struct net *net, u32 id);
-static void ip6mr_free_table(struct mr_table *mrt);
+static void ip6mr_free_table(struct mr_table *mrt,
+			     struct list_head *dev_kill_list);
 
 static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
 			   struct net_device *dev, struct sk_buff *skb,
@@ -228,6 +229,7 @@ static const struct fib_rules_ops __net_initconst ip6mr_rules_ops_template = {
 static int __net_init ip6mr_rules_init(struct net *net)
 {
 	struct fib_rules_ops *ops;
+	LIST_HEAD(dev_kill_list);
 	struct mr_table *mrt;
 	int err;
 
@@ -252,7 +254,8 @@ static int __net_init ip6mr_rules_init(struct net *net)
 
 err2:
 	rtnl_lock();
-	ip6mr_free_table(mrt);
+	ip6mr_free_table(mrt, &dev_kill_list);
+	unregister_netdevice_many(&dev_kill_list);
 	rtnl_unlock();
 err1:
 	fib_rules_unregister(ops);
@@ -262,12 +265,15 @@ static int __net_init ip6mr_rules_init(struct net *net)
 static void __net_exit ip6mr_rules_exit(struct net *net)
 {
 	struct mr_table *mrt, *next;
+	LIST_HEAD(dev_kill_list);
 
 	ASSERT_RTNL();
 	list_for_each_entry_safe(mrt, next, &net->ipv6.mr6_tables, list) {
 		list_del_rcu(&mrt->list);
-		ip6mr_free_table(mrt);
+		ip6mr_free_table(mrt, &dev_kill_list);
 	}
+
+	unregister_netdevice_many(&dev_kill_list);
 	fib_rules_unregister(net->ipv6.mr6_rules_ops);
 }
 
@@ -331,11 +337,13 @@ static int __net_init ip6mr_rules_init(struct net *net)
 static void __net_exit ip6mr_rules_exit(struct net *net)
 {
 	struct mr_table *mrt = rcu_dereference_protected(net->ipv6.mrt6, 1);
+	LIST_HEAD(dev_kill_list);
 
 	ASSERT_RTNL();
 
 	RCU_INIT_POINTER(net->ipv6.mrt6, NULL);
-	ip6mr_free_table(mrt);
+	ip6mr_free_table(mrt, &dev_kill_list);
+	unregister_netdevice_many(&dev_kill_list);
 }
 
 static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb,
@@ -410,18 +418,17 @@ static struct mr_table *ip6mr_new_table(struct net *net, u32 id)
 			      ipmr_expire_process, ip6mr_new_table_set);
 }
 
-static void ip6mr_free_table(struct mr_table *mrt)
+static void ip6mr_free_table(struct mr_table *mrt,
+			     struct list_head *dev_kill_list)
 {
 	struct net *net = read_pnet(&mrt->net);
-	LIST_HEAD(dev_kill_list);
 
 	WARN_ON_ONCE(!mr_can_free_table(net));
 
 	timer_shutdown_sync(&mrt->ipmr_expire_timer);
 	mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC |
 			    MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC,
-			    &dev_kill_list);
-	unregister_netdevice_many(&dev_kill_list);
+			    dev_kill_list);
 	rhltable_destroy(&mrt->mfc_hash);
 	kfree_rcu(mrt, rcu);
 }
-- 
2.53.0.1213.gd9a14994de-goog


^ permalink raw reply related

* [PATCH v2 net-next 10/15] ip6mr: Move unregister_netdevice_many() out of mroute_clean_tables().
From: Kuniyuki Iwashima @ 2026-04-10 21:17 UTC (permalink / raw)
  To: David S . Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
In-Reply-To: <20260410211726.1668756-1-kuniyu@google.com>

This is a prep commit to convert ip6mr_net_exit_batch() to
->exit_rtnl().

Let's move unregister_netdevice_many() in mroute_clean_tables()
to its callers.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
 net/ipv6/ip6mr.c | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index fdec7a541cf6..dd72eb346eb1 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -99,7 +99,8 @@ static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 			      struct netlink_ext_ack *extack);
 static int ip6mr_rtm_dumproute(struct sk_buff *skb,
 			       struct netlink_callback *cb);
-static void mroute_clean_tables(struct mr_table *mrt, int flags);
+static void mroute_clean_tables(struct mr_table *mrt, int flags,
+				struct list_head *dev_kill_list);
 static void ipmr_expire_process(struct timer_list *t);
 
 #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
@@ -412,12 +413,15 @@ static struct mr_table *ip6mr_new_table(struct net *net, u32 id)
 static void ip6mr_free_table(struct mr_table *mrt)
 {
 	struct net *net = read_pnet(&mrt->net);
+	LIST_HEAD(dev_kill_list);
 
 	WARN_ON_ONCE(!mr_can_free_table(net));
 
 	timer_shutdown_sync(&mrt->ipmr_expire_timer);
 	mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC |
-				 MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC);
+			    MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC,
+			    &dev_kill_list);
+	unregister_netdevice_many(&dev_kill_list);
 	rhltable_destroy(&mrt->mfc_hash);
 	kfree_rcu(mrt, rcu);
 }
@@ -1541,10 +1545,10 @@ static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt,
  *	Close the multicast socket, and clear the vif tables etc
  */
 
-static void mroute_clean_tables(struct mr_table *mrt, int flags)
+static void mroute_clean_tables(struct mr_table *mrt, int flags,
+				struct list_head *dev_kill_list)
 {
 	struct mr_mfc *c, *tmp;
-	LIST_HEAD(list);
 	int i;
 
 	/* Shut down all active vif entries */
@@ -1554,9 +1558,8 @@ static void mroute_clean_tables(struct mr_table *mrt, int flags)
 			     !(flags & MRT6_FLUSH_MIFS_STATIC)) ||
 			    (!(mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT6_FLUSH_MIFS)))
 				continue;
-			mif6_delete(mrt, i, 0, &list);
+			mif6_delete(mrt, i, 0, dev_kill_list);
 		}
-		unregister_netdevice_many(&list);
 	}
 
 	/* Wipe the cache */
@@ -1619,6 +1622,7 @@ int ip6mr_sk_done(struct sock *sk)
 {
 	struct net *net = sock_net(sk);
 	struct ipv6_devconf *devconf;
+	LIST_HEAD(dev_kill_list);
 	struct mr_table *mrt;
 	int err = -EACCES;
 
@@ -1646,11 +1650,13 @@ int ip6mr_sk_done(struct sock *sk)
 						     NETCONFA_IFINDEX_ALL,
 						     net->ipv6.devconf_all);
 
-			mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MFC);
+			mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MFC,
+					    &dev_kill_list);
 			err = 0;
 			break;
 		}
 	}
+	unregister_netdevice_many(&dev_kill_list);
 	rtnl_unlock();
 
 	return err;
@@ -1765,14 +1771,17 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval,
 
 	case MRT6_FLUSH:
 	{
+		LIST_HEAD(dev_kill_list);
 		int flags;
 
 		if (optlen != sizeof(flags))
 			return -EINVAL;
 		if (copy_from_sockptr(&flags, optval, sizeof(flags)))
 			return -EFAULT;
+
 		rtnl_lock();
-		mroute_clean_tables(mrt, flags);
+		mroute_clean_tables(mrt, flags, &dev_kill_list);
+		unregister_netdevice_many(&dev_kill_list);
 		rtnl_unlock();
 		return 0;
 	}
-- 
2.53.0.1213.gd9a14994de-goog


^ permalink raw reply related

* [PATCH v2 net-next 09/15] ip6mr: Free mr_table after RCU grace period.
From: Kuniyuki Iwashima @ 2026-04-10 21:17 UTC (permalink / raw)
  To: David S . Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
In-Reply-To: <20260410211726.1668756-1-kuniyu@google.com>

Since default_device_exit_batch() is called after ->exit_rtnl(),
idev->mc_ifc_work could finally call mroute6_is_socket() under RCU
while ->exit_rtnl() is running. [0]

With CONFIG_IPV6_MROUTE_MULTIPLE_TABLES=n, ip6mr_fib_lookup() does
not check if net->ipv6.mrt6 is NULL.  If ip6mr_net_exit_batch()
set net->ipv6.mrt6 to NULL and freed it, the mrt->mroute_sk access
could result in null-ptr-deref or use-after-free.

Let's prepare for that situation by applying RCU rule to ip6mr
table similarly.

Link: https://lore.kernel.org/netdev/20260407184202.34cfe2d6@kernel.org/ #[0]
Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
 net/ipv6/ip6mr.c | 53 +++++++++++++++++++++++++++---------------------
 1 file changed, 30 insertions(+), 23 deletions(-)

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 2b04e52ec61c..fdec7a541cf6 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -136,16 +136,6 @@ static struct mr_table *__ip6mr_get_table(struct net *net, u32 id)
 	return NULL;
 }
 
-static struct mr_table *ip6mr_get_table(struct net *net, u32 id)
-{
-	struct mr_table *mrt;
-
-	rcu_read_lock();
-	mrt = __ip6mr_get_table(net, id);
-	rcu_read_unlock();
-	return mrt;
-}
-
 static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6,
 			    struct mr_table **mrt)
 {
@@ -274,7 +264,7 @@ static void __net_exit ip6mr_rules_exit(struct net *net)
 
 	ASSERT_RTNL();
 	list_for_each_entry_safe(mrt, next, &net->ipv6.mr6_tables, list) {
-		list_del(&mrt->list);
+		list_del_rcu(&mrt->list);
 		ip6mr_free_table(mrt);
 	}
 	fib_rules_unregister(net->ipv6.mr6_rules_ops);
@@ -298,28 +288,30 @@ bool ip6mr_rule_default(const struct fib_rule *rule)
 }
 EXPORT_SYMBOL(ip6mr_rule_default);
 #else
-#define ip6mr_for_each_table(mrt, net) \
-	for (mrt = net->ipv6.mrt6; mrt; mrt = NULL)
-
 static struct mr_table *ip6mr_mr_table_iter(struct net *net,
 					    struct mr_table *mrt)
 {
 	if (!mrt)
-		return net->ipv6.mrt6;
+		return rcu_dereference(net->ipv6.mrt6);
 	return NULL;
 }
 
-static struct mr_table *ip6mr_get_table(struct net *net, u32 id)
+static struct mr_table *__ip6mr_get_table(struct net *net, u32 id)
 {
-	return net->ipv6.mrt6;
+	return rcu_dereference_check(net->ipv6.mrt6,
+				     lockdep_rtnl_is_held() ||
+				     !rcu_access_pointer(net->ipv6.mrt6));
 }
 
-#define __ip6mr_get_table ip6mr_get_table
+#define ip6mr_for_each_table(mrt, net)				\
+	for (mrt = __ip6mr_get_table(net, 0); mrt; mrt = NULL)
 
 static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6,
 			    struct mr_table **mrt)
 {
-	*mrt = net->ipv6.mrt6;
+	*mrt = rcu_dereference(net->ipv6.mrt6);
+	if (!*mrt)
+		return -EAGAIN;
 	return 0;
 }
 
@@ -330,15 +322,19 @@ static int __net_init ip6mr_rules_init(struct net *net)
 	mrt = ip6mr_new_table(net, RT6_TABLE_DFLT);
 	if (IS_ERR(mrt))
 		return PTR_ERR(mrt);
-	net->ipv6.mrt6 = mrt;
+
+	rcu_assign_pointer(net->ipv6.mrt6, mrt);
 	return 0;
 }
 
 static void __net_exit ip6mr_rules_exit(struct net *net)
 {
+	struct mr_table *mrt = rcu_dereference_protected(net->ipv6.mrt6, 1);
+
 	ASSERT_RTNL();
-	ip6mr_free_table(net->ipv6.mrt6);
-	net->ipv6.mrt6 = NULL;
+
+	RCU_INIT_POINTER(net->ipv6.mrt6, NULL);
+	ip6mr_free_table(mrt);
 }
 
 static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb,
@@ -353,6 +349,17 @@ static unsigned int ip6mr_rules_seq_read(const struct net *net)
 }
 #endif
 
+static struct mr_table *ip6mr_get_table(struct net *net, u32 id)
+{
+	struct mr_table *mrt;
+
+	rcu_read_lock();
+	mrt = __ip6mr_get_table(net, id);
+	rcu_read_unlock();
+
+	return mrt;
+}
+
 static int ip6mr_hash_cmp(struct rhashtable_compare_arg *arg,
 			  const void *ptr)
 {
@@ -412,7 +419,7 @@ static void ip6mr_free_table(struct mr_table *mrt)
 	mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC |
 				 MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC);
 	rhltable_destroy(&mrt->mfc_hash);
-	kfree(mrt);
+	kfree_rcu(mrt, rcu);
 }
 
 #ifdef CONFIG_PROC_FS
-- 
2.53.0.1213.gd9a14994de-goog


^ permalink raw reply related

* [PATCH v2 net-next 08/15] ipmr: Free mr_table after RCU grace period.
From: Kuniyuki Iwashima @ 2026-04-10 21:17 UTC (permalink / raw)
  To: David S . Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
In-Reply-To: <20260410211726.1668756-1-kuniyu@google.com>

With CONFIG_IP_MROUTE_MULTIPLE_TABLES=n, ipmr_fib_lookup()
does not check if net->ipv4.mrt is NULL.

Since default_device_exit_batch() is called after ->exit_rtnl(),
a device could receive IGMP packets and access net->ipv4.mrt
during/after ipmr_rules_exit_rtnl().

If ipmr_rules_exit_rtnl() had already cleared it and freed the
memory, the access would trigger null-ptr-deref or use-after-free.

Let's fix it by using RCU helper and free mrt after RCU grace
period.

Note that rcu_head must be placed within <4K offset and mr_table
is already 3864 bytes without rcu_head.

Fixes: b22b01867406 ("ipmr: Convert ipmr_net_exit_batch() to ->exit_rtnl().")
Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
 include/linux/mroute_base.h |  2 ++
 net/ipv4/ipmr.c             | 51 ++++++++++++++++++++-----------------
 2 files changed, 30 insertions(+), 23 deletions(-)

diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index cf3374580f74..db3f98cae4c9 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -226,6 +226,7 @@ struct mr_table_ops {
 
 /**
  * struct mr_table - a multicast routing table
+ * @rcu: used for table destruction
  * @list: entry within a list of multicast routing tables
  * @net: net where this table belongs
  * @ops: protocol specific operations
@@ -243,6 +244,7 @@ struct mr_table_ops {
  * @mroute_reg_vif_num: PIM-device vif index
  */
 struct mr_table {
+	struct rcu_head		rcu;
 	struct list_head	list;
 	possible_net_t		net;
 	struct mr_table_ops	ops;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index fa168513295d..3bf63f8ea606 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -151,16 +151,6 @@ static struct mr_table *__ipmr_get_table(struct net *net, u32 id)
 	return NULL;
 }
 
-static struct mr_table *ipmr_get_table(struct net *net, u32 id)
-{
-	struct mr_table *mrt;
-
-	rcu_read_lock();
-	mrt = __ipmr_get_table(net, id);
-	rcu_read_unlock();
-	return mrt;
-}
-
 static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
 			   struct mr_table **mrt)
 {
@@ -293,7 +283,7 @@ static void __net_exit ipmr_rules_exit_rtnl(struct net *net,
 	struct mr_table *mrt, *next;
 
 	list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
-		list_del(&mrt->list);
+		list_del_rcu(&mrt->list);
 		ipmr_free_table(mrt, dev_kill_list);
 	}
 }
@@ -315,28 +305,30 @@ bool ipmr_rule_default(const struct fib_rule *rule)
 }
 EXPORT_SYMBOL(ipmr_rule_default);
 #else
-#define ipmr_for_each_table(mrt, net) \
-	for (mrt = net->ipv4.mrt; mrt; mrt = NULL)
-
 static struct mr_table *ipmr_mr_table_iter(struct net *net,
 					   struct mr_table *mrt)
 {
 	if (!mrt)
-		return net->ipv4.mrt;
+		return rcu_dereference(net->ipv4.mrt);
 	return NULL;
 }
 
-static struct mr_table *ipmr_get_table(struct net *net, u32 id)
+static struct mr_table *__ipmr_get_table(struct net *net, u32 id)
 {
-	return net->ipv4.mrt;
+	return rcu_dereference_check(net->ipv4.mrt,
+				     lockdep_rtnl_is_held() ||
+				     !rcu_access_pointer(net->ipv4.mrt));
 }
 
-#define __ipmr_get_table ipmr_get_table
+#define ipmr_for_each_table(mrt, net)				\
+	for (mrt = __ipmr_get_table(net, 0); mrt; mrt = NULL)
 
 static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
 			   struct mr_table **mrt)
 {
-	*mrt = net->ipv4.mrt;
+	*mrt = rcu_dereference(net->ipv4.mrt);
+	if (!*mrt)
+		return -EAGAIN;
 	return 0;
 }
 
@@ -347,7 +339,8 @@ static int __net_init ipmr_rules_init(struct net *net)
 	mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
 	if (IS_ERR(mrt))
 		return PTR_ERR(mrt);
-	net->ipv4.mrt = mrt;
+
+	rcu_assign_pointer(net->ipv4.mrt, mrt);
 	return 0;
 }
 
@@ -358,9 +351,10 @@ static void __net_exit ipmr_rules_exit(struct net *net)
 static void __net_exit ipmr_rules_exit_rtnl(struct net *net,
 					    struct list_head *dev_kill_list)
 {
-	ipmr_free_table(net->ipv4.mrt, dev_kill_list);
+	struct mr_table *mrt = rcu_dereference_protected(net->ipv4.mrt, 1);
 
-	net->ipv4.mrt = NULL;
+	RCU_INIT_POINTER(net->ipv4.mrt, NULL);
+	ipmr_free_table(mrt, dev_kill_list);
 }
 
 static int ipmr_rules_dump(struct net *net, struct notifier_block *nb,
@@ -381,6 +375,17 @@ bool ipmr_rule_default(const struct fib_rule *rule)
 EXPORT_SYMBOL(ipmr_rule_default);
 #endif
 
+static struct mr_table *ipmr_get_table(struct net *net, u32 id)
+{
+	struct mr_table *mrt;
+
+	rcu_read_lock();
+	mrt = __ipmr_get_table(net, id);
+	rcu_read_unlock();
+
+	return mrt;
+}
+
 static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg,
 				const void *ptr)
 {
@@ -446,7 +451,7 @@ static void ipmr_free_table(struct mr_table *mrt, struct list_head *dev_kill_lis
 			    MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC,
 			    &ipmr_dev_kill_list);
 	rhltable_destroy(&mrt->mfc_hash);
-	kfree(mrt);
+	kfree_rcu(mrt, rcu);
 
 	WARN_ON_ONCE(!net_initialized(net) && !list_empty(&ipmr_dev_kill_list));
 	list_splice(&ipmr_dev_kill_list, dev_kill_list);
-- 
2.53.0.1213.gd9a14994de-goog


^ permalink raw reply related

* [PATCH v2 net-next 07/15] net: Remove rtnl_held of struct fib_dump_filter.
From: Kuniyuki Iwashima @ 2026-04-10 21:17 UTC (permalink / raw)
  To: David S . Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
In-Reply-To: <20260410211726.1668756-1-kuniyu@google.com>

Commit 22e36ea9f5d7 ("inet: allow ip_valid_fib_dump_req() to
be called with RTNL or RCU") introduced the rtnl_held field in
struct fib_dump_filter to switch __dev_get_by_index() and
dev_get_by_index_rcu() depending on the caller's context.

This field served as an interim measure while we were incrementally
converting all callers of ip_valid_fib_dump_req() to RCU.

Now that all users (IPv4, IPv6, ipmr, ip6mr, and MPLS) have
been converted to RCU, the field is no longer necessary.

Let's remove it.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
 include/net/ip_fib.h    |  1 -
 net/ipv4/fib_frontend.c | 19 ++++++-------------
 net/ipv4/ipmr.c         |  4 +---
 net/ipv6/ip6_fib.c      |  1 -
 net/ipv6/ip6mr.c        |  4 +---
 net/mpls/af_mpls.c      |  6 ++----
 6 files changed, 10 insertions(+), 25 deletions(-)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 318593743b6e..1142ffad7444 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -269,7 +269,6 @@ struct fib_dump_filter {
 	bool			filter_set;
 	bool			dump_routes;
 	bool			dump_exceptions;
-	bool			rtnl_held;
 	unsigned char		protocol;
 	unsigned char		rt_type;
 	unsigned int		flags;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 1dab44e13d3b..ceeb87b13b93 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -946,9 +946,6 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 	struct rtmsg *rtm;
 	int err, i;
 
-	if (filter->rtnl_held)
-		ASSERT_RTNL();
-
 	rtm = nlmsg_payload(nlh, sizeof(*rtm));
 	if (!rtm) {
 		NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request");
@@ -992,10 +989,8 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 			break;
 		case RTA_OIF:
 			ifindex = nla_get_u32(tb[i]);
-			if (filter->rtnl_held)
-				filter->dev = __dev_get_by_index(net, ifindex);
-			else
-				filter->dev = dev_get_by_index_rcu(net, ifindex);
+
+			filter->dev = dev_get_by_index_rcu(net, ifindex);
 			if (!filter->dev)
 				return -ENODEV;
 			break;
@@ -1017,18 +1012,16 @@ EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req);
 
 static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 {
+	const struct nlmsghdr *nlh = cb->nlh;
+	struct net *net = sock_net(skb->sk);
 	struct fib_dump_filter filter = {
 		.dump_routes = true,
 		.dump_exceptions = true,
-		.rtnl_held = false,
 	};
-	const struct nlmsghdr *nlh = cb->nlh;
-	struct net *net = sock_net(skb->sk);
-	unsigned int h, s_h;
-	unsigned int e = 0, s_e;
-	struct fib_table *tb;
+	unsigned int e = 0, s_e, h, s_h;
 	struct hlist_head *head;
 	int dumped = 0, err = 0;
+	struct fib_table *tb;
 
 	rcu_read_lock();
 	if (cb->strict_check) {
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 8a08d09b4c30..fa168513295d 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2767,9 +2767,7 @@ static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 
 static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 {
-	struct fib_dump_filter filter = {
-		.rtnl_held = false,
-	};
+	struct fib_dump_filter filter = {};
 	int err;
 
 	rcu_read_lock();
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index b897b3c5023b..fc95738ded76 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -633,7 +633,6 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 	struct rt6_rtnl_dump_arg arg = {
 		.filter.dump_exceptions = true,
 		.filter.dump_routes = true,
-		.filter.rtnl_held = false,
 	};
 	const struct nlmsghdr *nlh = cb->nlh;
 	struct net *net = sock_net(skb->sk);
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 9d02cd3b274c..2b04e52ec61c 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -2747,9 +2747,7 @@ static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	const struct nlmsghdr *nlh = cb->nlh;
-	struct fib_dump_filter filter = {
-		.rtnl_held = false,
-	};
+	struct fib_dump_filter filter = {};
 	int err;
 
 	rcu_read_lock();
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 26340a7306b5..ca504d9626cf 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -2221,12 +2221,10 @@ static bool mpls_rt_uses_dev(struct mpls_route *rt,
 
 static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb)
 {
+	struct mpls_route __rcu **platform_label;
 	const struct nlmsghdr *nlh = cb->nlh;
 	struct net *net = sock_net(skb->sk);
-	struct mpls_route __rcu **platform_label;
-	struct fib_dump_filter filter = {
-		.rtnl_held = false,
-	};
+	struct fib_dump_filter filter = {};
 	unsigned int flags = NLM_F_MULTI;
 	size_t platform_labels;
 	unsigned int index;
-- 
2.53.0.1213.gd9a14994de-goog


^ permalink raw reply related

* [PATCH v2 net-next 06/15] ip6mr: Convert ip6mr_rtm_dumproute() to RCU.
From: Kuniyuki Iwashima @ 2026-04-10 21:17 UTC (permalink / raw)
  To: David S . Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
In-Reply-To: <20260410211726.1668756-1-kuniyu@google.com>

ip6mr_rtm_dumproute() calls mr_table_dump() or mr_rtm_dumproute(),
and mr_rtm_dumproute() finally calls mr_table_dump().

mr_table_dump() calls the passed function, _ip6mr_fill_mroute().

_ip6mr_fill_mroute() is a wrapper for ip6mr_fill_mroute() to cast
struct mr_mfc * to struct mfc6_cache *.

ip6mr_fill_mroute() can already be called safely under RCU.

Let's convert ip6mr_rtm_dumproute() to RCU.

Now there is no user of the rtnl_held field in struct
fib_dump_filter, and the next patch will remove it.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
 net/ipv6/ip6mr.c | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 0054db00fadf..9d02cd3b274c 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -1389,7 +1389,7 @@ static const struct rtnl_msg_handler ip6mr_rtnl_msg_handlers[] __initconst_or_mo
 	{.owner = THIS_MODULE, .protocol = RTNL_FAMILY_IP6MR,
 	 .msgtype = RTM_GETROUTE,
 	 .doit = ip6mr_rtm_getroute, .dumpit = ip6mr_rtm_dumproute,
-	 .flags = RTNL_FLAG_DOIT_UNLOCKED},
+	 .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED},
 };
 
 int __init ip6_mr_init(void)
@@ -2748,15 +2748,17 @@ static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	const struct nlmsghdr *nlh = cb->nlh;
 	struct fib_dump_filter filter = {
-		.rtnl_held = true,
+		.rtnl_held = false,
 	};
 	int err;
 
+	rcu_read_lock();
+
 	if (cb->strict_check) {
 		err = ip_valid_fib_dump_req(sock_net(skb->sk), nlh,
 					    &filter, cb);
 		if (err < 0)
-			return err;
+			goto unlock;
 	}
 
 	if (filter.table_id) {
@@ -2764,17 +2766,26 @@ static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 
 		mrt = __ip6mr_get_table(sock_net(skb->sk), filter.table_id);
 		if (!mrt) {
-			if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IP6MR)
-				return skb->len;
+			if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IP6MR) {
+				err = skb->len;
+				goto unlock;
+			}
 
 			NL_SET_ERR_MSG_MOD(cb->extack, "MR table does not exist");
-			return -ENOENT;
+			err = -ENOENT;
+			goto unlock;
 		}
+
 		err = mr_table_dump(mrt, skb, cb, _ip6mr_fill_mroute,
 				    &mfc_unres_lock, &filter);
-		return skb->len ? : err;
+		err = skb->len ? : err;
+		goto unlock;
 	}
 
-	return mr_rtm_dumproute(skb, cb, ip6mr_mr_table_iter,
-				_ip6mr_fill_mroute, &mfc_unres_lock, &filter);
+	err = mr_rtm_dumproute(skb, cb, ip6mr_mr_table_iter,
+			       _ip6mr_fill_mroute, &mfc_unres_lock, &filter);
+unlock:
+	rcu_read_unlock();
+
+	return err;
 }
-- 
2.53.0.1213.gd9a14994de-goog


^ permalink raw reply related

* [PATCH v2 net-next 05/15] ip6mr: Convert ip6mr_rtm_getroute() to RCU.
From: Kuniyuki Iwashima @ 2026-04-10 21:17 UTC (permalink / raw)
  To: David S . Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
In-Reply-To: <20260410211726.1668756-1-kuniyu@google.com>

ip6mr_rtm_getroute() calls __ip6mr_get_table(), ip6mr_cache_find(),
and ip6mr_fill_mroute().

Once created, struct mr_table is not freed until netns dismantle,
so it's safe under RCU.

ip6mr_cache_find() iterates mrt->mfc_hash with rhl_for_each_entry_rcu().
struct mr_mfc is freed with call_rcu(), so this is also safe under
RCU.

ip6mr_fill_mroute() calls mr_fill_mroute(), which properly uses
RCU helpers.

Let's call them under RCU and register ip6mr_rtm_getroute() with
RTNL_FLAG_DOIT_UNLOCKED.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
 net/ipv6/ip6mr.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 5356957bfe94..0054db00fadf 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -1388,7 +1388,8 @@ static struct pernet_operations ip6mr_net_ops = {
 static const struct rtnl_msg_handler ip6mr_rtnl_msg_handlers[] __initconst_or_module = {
 	{.owner = THIS_MODULE, .protocol = RTNL_FAMILY_IP6MR,
 	 .msgtype = RTM_GETROUTE,
-	 .doit = ip6mr_rtm_getroute, .dumpit = ip6mr_rtm_dumproute},
+	 .doit = ip6mr_rtm_getroute, .dumpit = ip6mr_rtm_dumproute,
+	 .flags = RTNL_FLAG_DOIT_UNLOCKED},
 };
 
 int __init ip6_mr_init(void)
@@ -2712,6 +2713,8 @@ static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 		grp = nla_get_in6_addr(tb[RTA_DST]);
 	tableid = nla_get_u32_default(tb[RTA_TABLE], 0);
 
+	rcu_read_lock();
+
 	mrt = __ip6mr_get_table(net, tableid ?: RT_TABLE_DEFAULT);
 	if (!mrt) {
 		NL_SET_ERR_MSG_MOD(extack, "MR table does not exist");
@@ -2719,10 +2722,7 @@ static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 		goto err;
 	}
 
-	/* entries are added/deleted only under RTNL */
-	rcu_read_lock();
 	cache = ip6mr_cache_find(mrt, &src, &grp);
-	rcu_read_unlock();
 	if (!cache) {
 		NL_SET_ERR_MSG_MOD(extack, "MR cache entry not found");
 		err = -ENOENT;
@@ -2734,9 +2734,12 @@ static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	if (err < 0)
 		goto err;
 
+	rcu_read_unlock();
+
 	return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
 
 err:
+	rcu_read_unlock();
 	kfree_skb(skb);
 	return err;
 }
-- 
2.53.0.1213.gd9a14994de-goog


^ permalink raw reply related

* [PATCH v2 net-next 04/15] ip6mr: Allocate skb earlier in ip6mr_rtm_getroute().
From: Kuniyuki Iwashima @ 2026-04-10 21:17 UTC (permalink / raw)
  To: David S . Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
In-Reply-To: <20260410211726.1668756-1-kuniyu@google.com>

We will convert ip6mr_rtm_getroute() to RCU in the following patch,
where __ip6mr_get_table() will be called under RCU.

nlmsg_new() uses GFP_KERNEL and needs to be called before holding
rcu_read_lock().

As a prep, let's move nlmsg_new() before __ip6mr_get_table().

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
 net/ipv6/ip6mr.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 7ea572db9075..5356957bfe94 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -2702,6 +2702,10 @@ static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	if (err < 0)
 		return err;
 
+	skb = nlmsg_new(mr6_msgsize(false), GFP_KERNEL);
+	if (!skb)
+		return -ENOBUFS;
+
 	if (tb[RTA_SRC])
 		src = nla_get_in6_addr(tb[RTA_SRC]);
 	if (tb[RTA_DST])
@@ -2711,7 +2715,8 @@ static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	mrt = __ip6mr_get_table(net, tableid ?: RT_TABLE_DEFAULT);
 	if (!mrt) {
 		NL_SET_ERR_MSG_MOD(extack, "MR table does not exist");
-		return -ENOENT;
+		err = -ENOENT;
+		goto err;
 	}
 
 	/* entries are added/deleted only under RTNL */
@@ -2720,21 +2725,20 @@ static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	rcu_read_unlock();
 	if (!cache) {
 		NL_SET_ERR_MSG_MOD(extack, "MR cache entry not found");
-		return -ENOENT;
+		err = -ENOENT;
+		goto err;
 	}
 
-	skb = nlmsg_new(mr6_msgsize(false), GFP_KERNEL);
-	if (!skb)
-		return -ENOBUFS;
-
 	err = ip6mr_fill_mroute(mrt, skb, NETLINK_CB(in_skb).portid,
 				nlh->nlmsg_seq, cache, RTM_NEWROUTE, 0);
-	if (err < 0) {
-		kfree_skb(skb);
-		return err;
-	}
+	if (err < 0)
+		goto err;
 
 	return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+
+err:
+	kfree_skb(skb);
+	return err;
 }
 
 static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
-- 
2.53.0.1213.gd9a14994de-goog


^ permalink raw reply related

* [PATCH v2 net-next 03/15] ip6mr: Use MAXMIFS in mr6_msgsize().
From: Kuniyuki Iwashima @ 2026-04-10 21:16 UTC (permalink / raw)
  To: David S . Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
In-Reply-To: <20260410211726.1668756-1-kuniyu@google.com>

mr6_msgsize() calculates skb size needed for ip6mr_fill_mroute().

The size differs based on mrt->maxvif.

We will drop RTNL for ip6mr_rtm_getroute() and mrt->maxvif may
change under RCU.

To avoid -EMSGSIZE, let's calculate the size with the maximum
value of mrt->maxvif, MAXMIFS.

struct rtnexthop is 8 bytes and MAXMIFS is 32, so the maximum delta
is 256 bytes, which is small enough.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
 net/ipv6/ip6mr.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index b263d3c69a5a..7ea572db9075 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -2539,7 +2539,7 @@ static int _ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 				 cmd, flags);
 }
 
-static int mr6_msgsize(bool unresolved, int maxvif)
+static int mr6_msgsize(bool unresolved)
 {
 	size_t len =
 		NLMSG_ALIGN(sizeof(struct rtmsg))
@@ -2552,7 +2552,7 @@ static int mr6_msgsize(bool unresolved, int maxvif)
 		len = len
 		      + nla_total_size(4)	/* RTA_IIF */
 		      + nla_total_size(0)	/* RTA_MULTIPATH */
-		      + maxvif * NLA_ALIGN(sizeof(struct rtnexthop))
+		      + MAXMIFS * NLA_ALIGN(sizeof(struct rtnexthop))
 						/* RTA_MFC_STATS */
 		      + nla_total_size_64bit(sizeof(struct rta_mfc_stats))
 		;
@@ -2567,8 +2567,7 @@ static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc,
 	struct sk_buff *skb;
 	int err = -ENOBUFS;
 
-	skb = nlmsg_new(mr6_msgsize(mfc->_c.mfc_parent >= MAXMIFS, mrt->maxvif),
-			GFP_ATOMIC);
+	skb = nlmsg_new(mr6_msgsize(mfc->_c.mfc_parent >= MAXMIFS), GFP_ATOMIC);
 	if (!skb)
 		goto errout;
 
@@ -2724,7 +2723,7 @@ static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 		return -ENOENT;
 	}
 
-	skb = nlmsg_new(mr6_msgsize(false, mrt->maxvif), GFP_KERNEL);
+	skb = nlmsg_new(mr6_msgsize(false), GFP_KERNEL);
 	if (!skb)
 		return -ENOBUFS;
 
-- 
2.53.0.1213.gd9a14994de-goog


^ permalink raw reply related

* [PATCH v2 net-next 02/15] ip6mr: Annotate access to mrt->mroute_do_{pim,assert,wrvifwhole}.
From: Kuniyuki Iwashima @ 2026-04-10 21:16 UTC (permalink / raw)
  To: David S . Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
In-Reply-To: <20260410211726.1668756-1-kuniyu@google.com>

These fields in struct mr_table are updated in ip6_mroute_setsockopt()
under RTNL:

  * mroute_do_pim
  * mroute_do_assert (MRT6_PIM is under RTNL while MRT6_ASSERT is lockless)
  * mroute_do_wrvifwhole

However, ip6_mroute_getsockopt() does not hold RTNL and reads the first
two fields locklessly, and ip6_mr_forward() reads all three under
RCU.

Let's use WRITE_ONCE() and READ_ONCE() for them.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
 net/ipv6/ip6mr.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 85010ff21c98..b263d3c69a5a 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -1780,7 +1780,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval,
 			return -EINVAL;
 		if (copy_from_sockptr(&v, optval, sizeof(v)))
 			return -EFAULT;
-		mrt->mroute_do_assert = v;
+		WRITE_ONCE(mrt->mroute_do_assert, v);
 		return 0;
 	}
 
@@ -1800,9 +1800,9 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval,
 		rtnl_lock();
 		ret = 0;
 		if (v != mrt->mroute_do_pim) {
-			mrt->mroute_do_pim = v;
-			mrt->mroute_do_assert = v;
-			mrt->mroute_do_wrvifwhole = do_wrmifwhole;
+			WRITE_ONCE(mrt->mroute_do_pim, v);
+			WRITE_ONCE(mrt->mroute_do_assert, v);
+			WRITE_ONCE(mrt->mroute_do_wrvifwhole, do_wrmifwhole);
 		}
 		rtnl_unlock();
 		return ret;
@@ -1870,11 +1870,11 @@ int ip6_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval,
 		break;
 #ifdef CONFIG_IPV6_PIMSM_V2
 	case MRT6_PIM:
-		val = mrt->mroute_do_pim;
+		val = READ_ONCE(mrt->mroute_do_pim);
 		break;
 #endif
 	case MRT6_ASSERT:
-		val = mrt->mroute_do_assert;
+		val = READ_ONCE(mrt->mroute_do_assert);
 		break;
 	default:
 		return -ENOPROTOOPT;
@@ -2177,20 +2177,20 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
 	if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) {
 		atomic_long_inc(&c->_c.mfc_un.res.wrong_if);
 
-		if (true_vifi >= 0 && mrt->mroute_do_assert &&
+		if (true_vifi >= 0 && READ_ONCE(mrt->mroute_do_assert) &&
 		    /* pimsm uses asserts, when switching from RPT to SPT,
 		       so that we cannot check that packet arrived on an oif.
 		       It is bad, but otherwise we would need to move pretty
 		       large chunk of pimd to kernel. Ough... --ANK
 		     */
-		    (mrt->mroute_do_pim ||
+		    (READ_ONCE(mrt->mroute_do_pim) ||
 		     c->_c.mfc_un.res.ttls[true_vifi] < 255) &&
 		    time_after(jiffies,
 			       c->_c.mfc_un.res.last_assert +
 			       MFC_ASSERT_THRESH)) {
 			c->_c.mfc_un.res.last_assert = jiffies;
 			ip6mr_cache_report(mrt, skb, true_vifi, MRT6MSG_WRONGMIF);
-			if (mrt->mroute_do_wrvifwhole)
+			if (READ_ONCE(mrt->mroute_do_wrvifwhole))
 				ip6mr_cache_report(mrt, skb, true_vifi,
 						   MRT6MSG_WRMIFWHOLE);
 		}
-- 
2.53.0.1213.gd9a14994de-goog


^ permalink raw reply related

* [PATCH v2 net-next 01/15] selftest: net: Extend ipmr.c for IP6MR.
From: Kuniyuki Iwashima @ 2026-04-10 21:16 UTC (permalink / raw)
  To: David S . Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
In-Reply-To: <20260410211726.1668756-1-kuniyu@google.com>

This commit extends most test cases in ipmr.c for IP6MR.

Note that IP6MR does not provide an rtnetlink interface for MFC,
so such tests will be skipped.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
 tools/testing/selftests/net/forwarding/ipmr.c | 163 ++++++++++++------
 1 file changed, 110 insertions(+), 53 deletions(-)

diff --git a/tools/testing/selftests/net/forwarding/ipmr.c b/tools/testing/selftests/net/forwarding/ipmr.c
index df870aad9ead..cfd00173bcd6 100644
--- a/tools/testing/selftests/net/forwarding/ipmr.c
+++ b/tools/testing/selftests/net/forwarding/ipmr.c
@@ -2,7 +2,9 @@
 /* Copyright 2026 Google LLC */
 
 #include <linux/if.h>
+#include <linux/in6.h>
 #include <linux/mroute.h>
+#include <linux/mroute6.h>
 #include <linux/netlink.h>
 #include <linux/rtnetlink.h>
 #include <linux/socket.h>
@@ -17,6 +19,14 @@ FIXTURE(ipmr)
 	int netlink_sk;
 	int raw_sk;
 	int veth_ifindex;
+	union {
+		struct vifctl vif;
+		struct mif6ctl vif6;
+	};
+	union {
+		struct mfcctl mfc;
+		struct mf6cctl mfc6;
+	};
 };
 
 FIXTURE_VARIANT(ipmr)
@@ -25,6 +35,11 @@ FIXTURE_VARIANT(ipmr)
 	int protocol;
 	int level;
 	int opts[MRT_MAX - MRT_BASE + 1];
+	int vif_size;
+	char vif_check_cmd_pimreg[64];
+	char vif_check_cmd_veth[64];
+	int mfc_size;
+	char mfc_check_cmd[1024];
 };
 
 FIXTURE_VARIANT_ADD(ipmr, ipv4)
@@ -47,6 +62,39 @@ FIXTURE_VARIANT_ADD(ipmr, ipv4)
 		MRT_DEL_MFC_PROXY,
 		MRT_FLUSH,
 	},
+	.vif_size = sizeof(struct vifctl),
+	.vif_check_cmd_pimreg = "cat /proc/net/ip_mr_vif | grep -q pimreg",
+	.vif_check_cmd_veth = "cat /proc/net/ip_mr_vif | grep -q veth",
+	.mfc_size = sizeof(struct mfcctl),
+	.mfc_check_cmd = "cat /proc/net/ip_mr_cache | grep -q '00000000 00000000'",
+};
+
+FIXTURE_VARIANT_ADD(ipmr, ipv6)
+{
+	.family = AF_INET6,
+	.protocol = IPPROTO_ICMPV6,
+	.level = IPPROTO_IPV6,
+	.opts = {
+		MRT6_INIT,
+		MRT6_DONE,
+		MRT6_ADD_MIF,
+		MRT6_DEL_MIF,
+		MRT6_ADD_MFC,
+		MRT6_DEL_MFC,
+		MRT6_VERSION,
+		MRT6_ASSERT,
+		MRT6_PIM,
+		MRT6_TABLE,
+		MRT6_ADD_MFC_PROXY,
+		MRT6_DEL_MFC_PROXY,
+		MRT_FLUSH,
+	},
+	.vif_size = sizeof(struct mif6ctl),
+	.vif_check_cmd_pimreg = "cat /proc/net/ip6_mr_vif | grep -q pim6reg",
+	.vif_check_cmd_veth = "cat /proc/net/ip6_mr_vif | grep -q veth",
+	.mfc_size = sizeof(struct mf6cctl),
+	.mfc_check_cmd = "cat /proc/net/ip6_mr_cache | "
+		"grep -q '0000:0000:0000:0000:0000:0000:0000:0000 0000:0000:0000:0000:0000:0000:0000:0000'",
 };
 
 struct mfc_attr {
@@ -144,6 +192,18 @@ FIXTURE_SETUP(ipmr)
 	ASSERT_EQ(0, err);
 
 	self->veth_ifindex = ifr.ifr_ifindex;
+
+	if (variant->family == AF_INET) {
+		self->vif = (struct vifctl){
+			.vifc_flags = VIFF_USE_IFINDEX,
+			.vifc_lcl_ifindex = self->veth_ifindex,
+		};
+	} else {
+		self->vif6 = (struct mif6ctl){
+			.mif6c_flags = 0,
+			.mif6c_pifi = self->veth_ifindex,
+		};
+	}
 }
 
 FIXTURE_TEARDOWN(ipmr)
@@ -169,41 +229,39 @@ TEST_F(ipmr, mrt_init)
 
 TEST_F(ipmr, mrt_add_vif_register)
 {
-	struct vifctl vif = {
-		.vifc_vifi = 0,
-		.vifc_flags = VIFF_REGISTER,
-	};
 	int err;
 
+	memset(&self->vif, 0, variant->vif_size);
+
+	if (variant->family == AF_INET)
+		self->vif.vifc_flags = VIFF_REGISTER;
+	else
+		self->vif6.mif6c_flags = MIFF_REGISTER;
+
 	err = setsockopt(self->raw_sk,
 			 variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE],
-			 &vif,  sizeof(vif));
+			 &self->vif,  variant->vif_size);
 	ASSERT_EQ(0, err);
 
-	err = system("cat /proc/net/ip_mr_vif | grep -q pimreg");
+	err = system(variant->vif_check_cmd_pimreg);
 	ASSERT_EQ(0, err);
 
 	err = setsockopt(self->raw_sk,
 			 variant->level, variant->opts[MRT_DEL_VIF - MRT_BASE],
-			 &vif,  sizeof(vif));
+			 &self->vif,  variant->vif_size);
 	ASSERT_EQ(0, err);
 }
 
 TEST_F(ipmr, mrt_del_vif_unreg)
 {
-	struct vifctl vif = {
-		.vifc_vifi = 0,
-		.vifc_flags = VIFF_USE_IFINDEX,
-		.vifc_lcl_ifindex = self->veth_ifindex,
-	};
 	int err;
 
 	err = setsockopt(self->raw_sk,
 			 variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE],
-			 &vif,  sizeof(vif));
+			 &self->vif,  variant->vif_size);
 	ASSERT_EQ(0, err);
 
-	err = system("cat /proc/net/ip_mr_vif | grep -q veth0");
+	err = system(variant->vif_check_cmd_veth);
 	ASSERT_EQ(0, err);
 
 	/* VIF is removed along with its device. */
@@ -213,23 +271,18 @@ TEST_F(ipmr, mrt_del_vif_unreg)
 	/* mrt->vif_table[veth_ifindex]->dev is NULL. */
 	err = setsockopt(self->raw_sk,
 			 variant->level, variant->opts[MRT_DEL_VIF - MRT_BASE],
-			 &vif,  sizeof(vif));
+			 &self->vif,  variant->vif_size);
 	ASSERT_EQ(-1, err);
 	ASSERT_EQ(EADDRNOTAVAIL, errno);
 }
 
 TEST_F(ipmr, mrt_del_vif_netns_dismantle)
 {
-	struct vifctl vif = {
-		.vifc_vifi = 0,
-		.vifc_flags = VIFF_USE_IFINDEX,
-		.vifc_lcl_ifindex = self->veth_ifindex,
-	};
 	int err;
 
 	err = setsockopt(self->raw_sk,
 			 variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE],
-			 &vif,  sizeof(vif));
+			 &self->vif,  variant->vif_size);
 	ASSERT_EQ(0, err);
 
 	/* Let cleanup_net() remove veth0 and VIF. */
@@ -237,49 +290,49 @@ TEST_F(ipmr, mrt_del_vif_netns_dismantle)
 
 TEST_F(ipmr, mrt_add_mfc)
 {
-	struct mfcctl mfc = {};
 	int err;
 
 	/* MRT_ADD_MFC / MRT_ADD_MFC_PROXY does not need vif to exist (unlike netlink). */
 	err = setsockopt(self->raw_sk,
 			 variant->level, variant->opts[MRT_ADD_MFC - MRT_BASE],
-			 &mfc,  sizeof(mfc));
+			 &self->mfc, variant->mfc_size);
 	ASSERT_EQ(0, err);
 
 	/* (0.0.0.0 -> 0.0.0.0) */
-	err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' ");
+	err = system(variant->mfc_check_cmd);
 	ASSERT_EQ(0, err);
 
 	err = setsockopt(self->raw_sk,
 			 variant->level, variant->opts[MRT_DEL_MFC - MRT_BASE],
-			 &mfc,  sizeof(mfc));
+			 &self->mfc, variant->mfc_size);
 }
 
 TEST_F(ipmr, mrt_add_mfc_proxy)
 {
-	struct mfcctl mfc = {};
 	int err;
 
 	err = setsockopt(self->raw_sk,
 			 variant->level, variant->opts[MRT_ADD_MFC_PROXY - MRT_BASE],
-			 &mfc,  sizeof(mfc));
+			 &self->mfc, variant->mfc_size);
 	ASSERT_EQ(0, err);
 
-	err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' ");
+	err = system(variant->mfc_check_cmd);
 	ASSERT_EQ(0, err);
 
 	err = setsockopt(self->raw_sk,
 			 variant->level, variant->opts[MRT_DEL_MFC_PROXY - MRT_BASE],
-			 &mfc,  sizeof(mfc));
+			 &self->mfc, variant->mfc_size);
 }
 
+#define SKIP_IPV6()						\
+	do {							\
+		if (variant->family == AF_INET6)		\
+			SKIP(return,				\
+			     "no netlink MFC interface");	\
+	} while (0)
+
 TEST_F(ipmr, mrt_add_mfc_netlink)
 {
-	struct vifctl vif = {
-		.vifc_vifi = 0,
-		.vifc_flags = VIFF_USE_IFINDEX,
-		.vifc_lcl_ifindex = self->veth_ifindex,
-	};
 	struct mfc_attr mfc_attr = {
 		.table = RT_TABLE_DEFAULT,
 		.origin = 0,
@@ -289,15 +342,17 @@ TEST_F(ipmr, mrt_add_mfc_netlink)
 	};
 	int err;
 
+	SKIP_IPV6();
+
 	err = setsockopt(self->raw_sk,
 			 variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE],
-			 &vif,  sizeof(vif));
+			 &self->vif, variant->vif_size);
 	ASSERT_EQ(0, err);
 
 	err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr);
 	ASSERT_EQ(0, err);
 
-	err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' ");
+	err = system(variant->mfc_check_cmd);
 	ASSERT_EQ(0, err);
 
 	err = nl_sendmsg_mfc(_metadata, self, RTM_DELROUTE, &mfc_attr);
@@ -306,11 +361,6 @@ TEST_F(ipmr, mrt_add_mfc_netlink)
 
 TEST_F(ipmr, mrt_add_mfc_netlink_proxy)
 {
-	struct vifctl vif = {
-		.vifc_vifi = 0,
-		.vifc_flags = VIFF_USE_IFINDEX,
-		.vifc_lcl_ifindex = self->veth_ifindex,
-	};
 	struct mfc_attr mfc_attr = {
 		.table = RT_TABLE_DEFAULT,
 		.origin = 0,
@@ -320,15 +370,17 @@ TEST_F(ipmr, mrt_add_mfc_netlink_proxy)
 	};
 	int err;
 
+	SKIP_IPV6();
+
 	err = setsockopt(self->raw_sk,
 			 variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE],
-			 &vif,  sizeof(vif));
+			 &self->vif, variant->vif_size);
 	ASSERT_EQ(0, err);
 
 	err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr);
 	ASSERT_EQ(0, err);
 
-	err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' ");
+	err = system(variant->mfc_check_cmd);
 	ASSERT_EQ(0, err);
 
 	err = nl_sendmsg_mfc(_metadata, self, RTM_DELROUTE, &mfc_attr);
@@ -345,6 +397,8 @@ TEST_F(ipmr, mrt_add_mfc_netlink_no_vif)
 	};
 	int err;
 
+	SKIP_IPV6();
+
 	/* netlink always requires RTA_IIF of an existing vif. */
 	mfc_attr.ifindex = 0;
 	err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr);
@@ -378,6 +432,8 @@ TEST_F(ipmr, mrt_del_mfc_netlink_netns_dismantle)
 	};
 	int i, err;
 
+	SKIP_IPV6();
+
 	for (i = 0; i < 2; i++) {
 		/* Create 2 VIFs just to avoid -ENFILE later. */
 		err = setsockopt(self->raw_sk,
@@ -390,7 +446,7 @@ TEST_F(ipmr, mrt_del_mfc_netlink_netns_dismantle)
 	err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr);
 	ASSERT_EQ(0, err);
 
-	err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' ");
+	err = system(variant->mfc_check_cmd);
 	ASSERT_EQ(0, err);
 
 	/* Remove mrt->vif_table[0]. */
@@ -398,7 +454,7 @@ TEST_F(ipmr, mrt_del_mfc_netlink_netns_dismantle)
 	ASSERT_EQ(0, err);
 
 	/* MFC entry is NOT removed even if the tied VIF is removed... */
-	err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' ");
+	err = system(variant->mfc_check_cmd);
 	ASSERT_EQ(0, err);
 
 	/* ... and netlink is not capable of removing such an entry
@@ -412,11 +468,6 @@ TEST_F(ipmr, mrt_del_mfc_netlink_netns_dismantle)
 
 TEST_F(ipmr, mrt_table_flush)
 {
-	struct vifctl vif = {
-		.vifc_vifi = 0,
-		.vifc_flags = VIFF_USE_IFINDEX,
-		.vifc_lcl_ifindex = self->veth_ifindex,
-	};
 	struct mfc_attr mfc_attr = {
 		.origin = 0,
 		.group = 0,
@@ -436,11 +487,17 @@ TEST_F(ipmr, mrt_table_flush)
 
 	err = setsockopt(self->raw_sk,
 			 variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE],
-			 &vif,  sizeof(vif));
+			 &self->vif,  variant->vif_size);
 	ASSERT_EQ(0, err);
 
-	mfc_attr.table = table_id;
-	err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr);
+	if (variant->family == AF_INET) {
+		mfc_attr.table = table_id;
+		err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr);
+	} else {
+		err = setsockopt(self->raw_sk,
+				 variant->level, variant->opts[MRT_ADD_MFC - MRT_BASE],
+				 &self->mfc, variant->mfc_size);
+	}
 	ASSERT_EQ(0, err);
 
 	/* Flush mrt->vif_table[] and all caches. */
-- 
2.53.0.1213.gd9a14994de-goog


^ permalink raw reply related

* [PATCH v2 net-next 00/15] ip6mr: No RTNL for RTNL_FAMILY_IP6MR rtnetlink.
From: Kuniyuki Iwashima @ 2026-04-10 21:16 UTC (permalink / raw)
  To: David S . Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev

This series is the IPv6 version of

  https://lore.kernel.org/netdev/20260228221800.1082070-1-kuniyu@google.com/

and removes RTNL from ip6mr rtnetlink handlers.

After this series, there are a few RTNL users left in net/ipv6/ip6mr.c,
and such users will be converted to per-netns RTNL in another
series.

Patch 1 extends the ipmr selftest to exercise most of the RTNL
 paths in net/ipv6/ip6mr.c.

Patches 2 - 6 convert RTM_GETROUTE handlers to RCU.

Patch 7 removes struct fib_dump_filter.rtnl_held.

Patch 8 - 9 use RCU for mr_table for CONFIG_IP_MROUTE_MULTIPLE_TABLES=n
 and CONFIG_IPV6_MROUTE_MULTIPLE_TABLES=n for ->exit_rtnl().

Patches 10 - 12 convert ->exit_batch() to ->exit_rtnl() to
 save one RTNL in cleanup_net().

Patches 13 - 14 remove unnecessary RTNL during setup_net()
 failure.

Patch 15 drops RTNL for MRT6_(ADD|DEL)_MFC(_PROXY)?.


Changes:
  v2:
    Add patch 8 & 9 for ->exit_rtnl() conversion and
     drop 2 trivial patches (patch 2 & 14 in v1)

  v1: https://lore.kernel.org/netdev/20260407212001.2368593-1-kuniyu@google.com/


Kuniyuki Iwashima (15):
  selftest: net: Extend ipmr.c for IP6MR.
  ip6mr: Annotate access to mrt->mroute_do_{pim,assert,wrvifwhole}.
  ip6mr: Use MAXMIFS in mr6_msgsize().
  ip6mr: Allocate skb earlier in ip6mr_rtm_getroute().
  ip6mr: Convert ip6mr_rtm_getroute() to RCU.
  ip6mr: Convert ip6mr_rtm_dumproute() to RCU.
  net: Remove rtnl_held of struct fib_dump_filter.
  ipmr: Free mr_table after RCU grace period.
  ip6mr: Free mr_table after RCU grace period.
  ip6mr: Move unregister_netdevice_many() out of mroute_clean_tables().
  ip6mr: Move unregister_netdevice_many() out of ip6mr_free_table().
  ip6mr: Convert ip6mr_net_exit_batch() to ->exit_rtnl().
  ip6mr: Remove RTNL in ip6mr_rules_init() and ip6mr_net_init().
  ip6mr: Call fib_rules_unregister() without RTNL.
  ip6mr: Replace RTNL with a dedicated mutex for MFC.

 include/linux/mroute_base.h                   |   2 +
 include/net/ip_fib.h                          |   1 -
 include/net/netns/ipv6.h                      |   1 +
 net/ipv4/fib_frontend.c                       |  19 +-
 net/ipv4/ipmr.c                               |  55 +++--
 net/ipv6/ip6_fib.c                            |   1 -
 net/ipv6/ip6mr.c                              | 233 +++++++++++-------
 net/mpls/af_mpls.c                            |   6 +-
 tools/testing/selftests/net/forwarding/ipmr.c | 163 ++++++++----
 9 files changed, 289 insertions(+), 192 deletions(-)

-- 
2.53.0.1213.gd9a14994de-goog


^ permalink raw reply

* Re: [PATCH RFC net-next 02/10] net: stmmac: rename dev_id to userver
From: Jitendra Vegiraju @ 2026-04-10 21:04 UTC (permalink / raw)
  To: Russell King (Oracle)
  Cc: Andrew Lunn, Alexandre Torgue, Andrew Lunn, Chen-Yu Tsai,
	David S. Miller, Eric Dumazet, Jakub Kicinski, linux-arm-kernel,
	linux-stm32, linux-sunxi, netdev, Paolo Abeni, Samuel Holland
In-Reply-To: <adi3Vks-N0a83ylE@shell.armlinux.org.uk>

[-- Attachment #1: Type: text/plain, Size: 2139 bytes --]

On Fri, Apr 10, 2026 at 1:39 AM Russell King (Oracle)
<linux@armlinux.org.uk> wrote:
>
> On Thu, Apr 09, 2026 at 04:07:42PM -0700, Jitendra Vegiraju wrote:
> > Hi Russell,
> >
> > On Wed, Apr 8, 2026 at 2:27 AM Russell King (Oracle)
> > <rmk+kernel@armlinux.org.uk> wrote:
> > >
> > > The Synopsys Databook and several implementation TRMs identify bits
> > > 15:8 of the version register in dwmac v3.xx and v4.xx as "userver".
> > > We even print its value with "User ID". Rather than using "dev_id",
> > > use "userver" instead.
> > >
> > > Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
> > > ---
> > >  drivers/net/ethernet/stmicro/stmmac/hwif.c | 18 +++++++++---------
> > >  1 file changed, 9 insertions(+), 9 deletions(-)
> > >
> > > diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.c b/drivers/net/ethernet/stmicro/stmmac/hwif.c
> > > index 3774af66db48..830ff816ab4f 100644
> > > --- a/drivers/net/ethernet/stmicro/stmmac/hwif.c
> > > +++ b/drivers/net/ethernet/stmicro/stmmac/hwif.c
> > > @@ -15,7 +15,7 @@
> > >
> > >  struct stmmac_version {
> > >         u8 snpsver;
> > > -       u8 dev_id;
> > > +       u8 userver;
> > >  };
> > From the XGMAC databook that I have access to bits(15:8) identify the
> > DEVID field of MAC_version register.
> > The userver field is from bits(23:16) of the same register. This is a
> > customer defined field (configured with coreConsultant).
> > Currently stmmac doesn't care about bits(23:16).
>
> Thanks for the additional information.
>
> I don't have any XGMAC documentation, but this indicates that it differs
> between XGMAC and previous cores - GMAC and GMAC4 cores, 15:8 are
> documented as userver, and 31:16 are marked as reserved.
>
> Note that the dev_info() also prints 15:8 as "User ID" not "Device ID".
>
> To confirm, is the XGMAC version register at offset 0x20 ? Later GMAC
> cores moved it to 0x110.
The XGMAC version register is at offset 0x110.

>
> --
> RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
> FTTP is here! 80Mbps down 10Mbps up. Decent connectivity at last!

[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/pkcs7-signature, Size: 5435 bytes --]

^ permalink raw reply

* Re: [PATCH net-next] tcp: add indirect call wrapper in tcp_conn_request()
From: Kuniyuki Iwashima @ 2026-04-10 20:57 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Neal Cardwell, netdev, eric.dumazet
In-Reply-To: <20260410174950.745670-1-edumazet@google.com>

On Fri, Apr 10, 2026 at 10:49 AM Eric Dumazet <edumazet@google.com> wrote:
>
> Small improvement in SYN processing, to directly call
> tcp_v6_init_seq_and_ts_off() or tcp_v4_init_seq_and_ts_off().
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>

Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>

^ permalink raw reply

* Re: [PATCH net-next 1/3] psp: add crypt-offset and spi-threshold get/set attributes
From: Jakub Kicinski @ 2026-04-10 20:57 UTC (permalink / raw)
  To: Akhilesh Samineni
  Cc: Willem de Bruijn, davem, edumazet, pabeni, andrew+netdev, horms,
	willemb, daniel.zahka, netdev, linux-kernel,
	jayakrishnan.udayavarma, ajit.khaparde, kiran.kella, sachin.suman
In-Reply-To: <CANQF7iDj1E6=3pod078-++5wbweyMg0H7gpeq0kbGFZHNytDdA@mail.gmail.com>

On Sat, 11 Apr 2026 01:06:06 +0530 Akhilesh Samineni wrote:
> On Wed, Apr 8, 2026 at 6:34 AM Jakub Kicinski <kuba@kernel.org> wrote:
> > On Tue, 07 Apr 2026 17:37:41 -0400 Willem de Bruijn wrote:  
> > > PSP defines a 6-bit field in 4 octet units. Does this need bounds checking?  
> >
> > More fundamentally, were we to support this -- is it a device property
> > or an assoc property?  
> 
> It's a device property. All associations under the device will share
> the same crypt-offset.

I don't think there's anything in the spec that says the crypto
offset is device level.
At the very least every L4 proto may want to have a different offset.
We should probably hold off adding this until a real user appears.

^ permalink raw reply

* [patch V1.1 11/38] misc: sgi-gru: Remove get_cycles() [ab]use
From: Thomas Gleixner @ 2026-04-10 20:56 UTC (permalink / raw)
  To: LKML
  Cc: Arnd Bergmann, x86, Lu Baolu, iommu, Michael Grzeschik, netdev,
	linux-wireless, Herbert Xu, linux-crypto, Vlastimil Babka,
	linux-mm, David Woodhouse, Bernie Thompson, linux-fbdev,
	Theodore Tso, linux-ext4, Andrew Morton, Uladzislau Rezki,
	Marco Elver, Dmitry Vyukov, kasan-dev, Andrey Ryabinin,
	Thomas Sailer, linux-hams, Jason A. Donenfeld, Richard Henderson,
	linux-alpha, Russell King, linux-arm-kernel, Catalin Marinas,
	Huacai Chen, loongarch, Geert Uytterhoeven, linux-m68k,
	Dinh Nguyen, Jonas Bonn, linux-openrisc, Helge Deller,
	linux-parisc, Michael Ellerman, linuxppc-dev, Paul Walmsley,
	linux-riscv, Heiko Carstens, linux-s390, David S. Miller,
	sparclinux
In-Reply-To: <20260410120318.320727701@kernel.org>

Calculating a timeout from get_cycles() is a historical leftover without
any functional requirement.

Use ktime_get() instead.

Signed-off-by: Thomas Gleixner <tglx@kernel.org>
---
V2: Fix typo
---
 drivers/misc/sgi-gru/gruhandles.c   |   20 ++++++++------------
 drivers/misc/sgi-gru/grukservices.c |    3 ++-
 drivers/misc/sgi-gru/grutlbpurge.c  |    5 ++---
 3 files changed, 12 insertions(+), 16 deletions(-)

--- a/drivers/misc/sgi-gru/gruhandles.c
+++ b/drivers/misc/sgi-gru/gruhandles.c
@@ -6,26 +6,22 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/timekeeping.h>
 #include "gru.h"
 #include "grulib.h"
 #include "grutables.h"
 
-/* 10 sec */
 #include <linux/sync_core.h>
-#include <asm/tsc.h>
-#define GRU_OPERATION_TIMEOUT	((cycles_t) tsc_khz*10*1000)
-#define CLKS2NSEC(c)		((c) * 1000000 / tsc_khz)
+
+#define GRU_OPERATION_TIMEOUT_NSEC	(((ktime_t)10 * NSEC_PER_SEC))
 
 /* Extract the status field from a kernel handle */
 #define GET_MSEG_HANDLE_STATUS(h)	(((*(unsigned long *)(h)) >> 16) & 3)
 
 struct mcs_op_statistic mcs_op_statistics[mcsop_last];
 
-static void update_mcs_stats(enum mcs_op op, unsigned long clks)
+static void update_mcs_stats(enum mcs_op op, unsigned long nsec)
 {
-	unsigned long nsec;
-
-	nsec = CLKS2NSEC(clks);
 	atomic_long_inc(&mcs_op_statistics[op].count);
 	atomic_long_add(nsec, &mcs_op_statistics[op].total);
 	if (mcs_op_statistics[op].max < nsec)
@@ -58,21 +54,21 @@ static void report_instruction_timeout(v
 
 static int wait_instruction_complete(void *h, enum mcs_op opc)
 {
+	ktime_t start_time = ktime_get();
 	int status;
-	unsigned long start_time = get_cycles();
 
 	while (1) {
 		cpu_relax();
 		status = GET_MSEG_HANDLE_STATUS(h);
 		if (status != CCHSTATUS_ACTIVE)
 			break;
-		if (GRU_OPERATION_TIMEOUT < (get_cycles() - start_time)) {
+		if (GRU_OPERATION_TIMEOUT_NSEC < (ktime_get() - start_time)) {
 			report_instruction_timeout(h);
-			start_time = get_cycles();
+			start_time = ktime_get();
 		}
 	}
 	if (gru_options & OPT_STATS)
-		update_mcs_stats(opc, get_cycles() - start_time);
+		update_mcs_stats(opc, (unsigned long)(ktime_get() - start_time));
 	return status;
 }
 
--- a/drivers/misc/sgi-gru/grukservices.c
+++ b/drivers/misc/sgi-gru/grukservices.c
@@ -20,6 +20,7 @@
 #include <linux/uaccess.h>
 #include <linux/delay.h>
 #include <linux/export.h>
+#include <linux/random.h>
 #include <asm/io_apic.h>
 #include "gru.h"
 #include "grulib.h"
@@ -1106,7 +1107,7 @@ static int quicktest3(unsigned long arg)
 	int ret = 0;
 
 	memset(buf2, 0, sizeof(buf2));
-	memset(buf1, get_cycles() & 255, sizeof(buf1));
+	memset(buf1, get_random_u32() & 255, sizeof(buf1));
 	gru_copy_gpa(uv_gpa(buf2), uv_gpa(buf1), BUFSIZE);
 	if (memcmp(buf1, buf2, BUFSIZE)) {
 		printk(KERN_DEBUG "GRU:%d quicktest3 error\n", smp_processor_id());
--- a/drivers/misc/sgi-gru/grutlbpurge.c
+++ b/drivers/misc/sgi-gru/grutlbpurge.c
@@ -22,13 +22,12 @@
 #include <linux/delay.h>
 #include <linux/timex.h>
 #include <linux/srcu.h>
+#include <linux/random.h>
 #include <asm/processor.h>
 #include "gru.h"
 #include "grutables.h"
 #include <asm/uv/uv_hub.h>
 
-#define gru_random()	get_cycles()
-
 /* ---------------------------------- TLB Invalidation functions --------
  * get_tgh_handle
  *
@@ -49,7 +48,7 @@ static inline int get_off_blade_tgh(stru
 	int n;
 
 	n = GRU_NUM_TGH - gru->gs_tgh_first_remote;
-	n = gru_random() % n;
+	n = get_random_u32() % n;
 	n += gru->gs_tgh_first_remote;
 	return n;
 }

^ permalink raw reply

* [patch V1.1 02/38] x86: Cleanup include recursion hell
From: Thomas Gleixner @ 2026-04-10 20:55 UTC (permalink / raw)
  To: LKML
  Cc: Arnd Bergmann, x86, Lu Baolu, iommu, Michael Grzeschik, netdev,
	linux-wireless, Herbert Xu, linux-crypto, Vlastimil Babka,
	linux-mm, David Woodhouse, Bernie Thompson, linux-fbdev,
	Theodore Tso, linux-ext4, Andrew Morton, Uladzislau Rezki,
	Marco Elver, Dmitry Vyukov, kasan-dev, Andrey Ryabinin,
	Thomas Sailer, linux-hams, Jason A. Donenfeld, Richard Henderson,
	linux-alpha, Russell King, linux-arm-kernel, Catalin Marinas,
	Huacai Chen, loongarch, Geert Uytterhoeven, linux-m68k,
	Dinh Nguyen, Jonas Bonn, linux-openrisc, Helge Deller,
	linux-parisc, Michael Ellerman, linuxppc-dev, Paul Walmsley,
	linux-riscv, Heiko Carstens, linux-s390, David S. Miller,
	sparclinux
In-Reply-To: <20260410120317.709923681@kernel.org>

Including a random architecture specific header which requires global
headers just to avoid including that header at the two usage sites is
really beyond lazy and tasteless. Including global headers just to get the
__percpu macro from linux/compiler_types.h falls into the same category.

Remove the linux/percpu.h and asm/cpumask.h includes from msr.h and smp.h
and fix the resulting fallout by a simple forward struct declaration and by
including the x86 specific asm/cpumask.h header where it is actually
required.

Signed-off-by: Thomas Gleixner <tglx@kernel.org>
---
V1.1: Fix PARAVIRT_XXL fallout....
---
 arch/x86/include/asm/cache.h             |    1 +
 arch/x86/include/asm/msr.h               |    5 +++--
 arch/x86/include/asm/paravirt.h          |    3 ++-
 arch/x86/include/asm/pvclock.h           |    1 +
 arch/x86/include/asm/smp.h               |    2 --
 arch/x86/include/asm/vdso/gettimeofday.h |    5 ++---
 arch/x86/kernel/cpu/mce/core.c           |    1 +
 arch/x86/kernel/nmi.c                    |    1 +
 arch/x86/kernel/smpboot.c                |    1 +
 9 files changed, 12 insertions(+), 8 deletions(-)

--- a/arch/x86/include/asm/cache.h
+++ b/arch/x86/include/asm/cache.h
@@ -2,6 +2,7 @@
 #ifndef _ASM_X86_CACHE_H
 #define _ASM_X86_CACHE_H
 
+#include <vdso/page.h>
 #include <linux/linkage.h>
 
 /* L1 cache line size */
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -8,12 +8,11 @@
 
 #include <asm/asm.h>
 #include <asm/errno.h>
-#include <asm/cpumask.h>
 #include <uapi/asm/msr.h>
 #include <asm/shared/msr.h>
 
+#include <linux/compiler_types.h>
 #include <linux/types.h>
-#include <linux/percpu.h>
 
 struct msr_info {
 	u32			msr_no;
@@ -256,6 +255,8 @@ int msr_set_bit(u32 msr, u8 bit);
 int msr_clear_bit(u32 msr, u8 bit);
 
 #ifdef CONFIG_SMP
+struct cpumask;
+
 int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
 int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
 int rdmsrq_on_cpu(unsigned int cpu, u32 msr_no, u64 *q);
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -16,9 +16,10 @@
 
 #ifndef __ASSEMBLER__
 #include <linux/types.h>
-#include <linux/cpumask.h>
 #include <asm/frame.h>
 
+struct cpumask;
+
 /* The paravirtualized I/O functions */
 static inline void slow_down_io(void)
 {
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -2,6 +2,7 @@
 #ifndef _ASM_X86_PVCLOCK_H
 #define _ASM_X86_PVCLOCK_H
 
+#include <asm/barrier.h>
 #include <asm/clocksource.h>
 #include <asm/pvclock-abi.h>
 
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -5,8 +5,6 @@
 #include <linux/cpumask.h>
 #include <linux/thread_info.h>
 
-#include <asm/cpumask.h>
-
 DECLARE_PER_CPU_CACHE_HOT(int, cpu_number);
 
 DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
--- a/arch/x86/include/asm/vdso/gettimeofday.h
+++ b/arch/x86/include/asm/vdso/gettimeofday.h
@@ -11,13 +11,12 @@
 #define __ASM_VDSO_GETTIMEOFDAY_H
 
 #ifndef __ASSEMBLER__
-
+#include <clocksource/hyperv_timer.h>
 #include <uapi/linux/time.h>
+
 #include <asm/vgtod.h>
 #include <asm/unistd.h>
-#include <asm/msr.h>
 #include <asm/pvclock.h>
-#include <clocksource/hyperv_timer.h>
 #include <asm/vdso/sys_call.h>
 
 #define VDSO_HAS_TIME 1
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -48,6 +48,7 @@
 #include <linux/vmcore_info.h>
 
 #include <asm/fred.h>
+#include <asm/cpumask.h>
 #include <asm/cpu_device_id.h>
 #include <asm/processor.h>
 #include <asm/traps.h>
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -26,6 +26,7 @@
 #include <linux/sched/clock.h>
 #include <linux/kvm_types.h>
 
+#include <asm/cpumask.h>
 #include <asm/cpu_entry_area.h>
 #include <asm/traps.h>
 #include <asm/mach_traps.h>
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -70,6 +70,7 @@
 #include <asm/irq.h>
 #include <asm/realmode.h>
 #include <asm/cpu.h>
+#include <asm/cpumask.h>
 #include <asm/numa.h>
 #include <asm/tlbflush.h>
 #include <asm/mtrr.h>

^ permalink raw reply

* Re: [PATCH net-next v3 00/12] net: airoha: Support multiple net_devices connected to the same GDM port
From: Jakub Kicinski @ 2026-04-10 20:49 UTC (permalink / raw)
  To: Lorenzo Bianconi
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Paolo Abeni,
	Rob Herring, Krzysztof Kozlowski, Conor Dooley, Christian Marangi,
	Benjamin Larsson, linux-arm-kernel, linux-mediatek, netdev,
	devicetree, Xuegang Lu
In-Reply-To: <adiop-9Mo4ADfvfw@lore-desk>

On Fri, 10 Apr 2026 09:37:11 +0200 Lorenzo Bianconi wrote:
> > On Mon, 06 Apr 2026 12:34:05 +0200 Lorenzo Bianconi wrote:  
> > > EN7581 or AN7583 SoCs support connecting multiple external SerDes (e.g.
> > > Ethernet or USB SerDes) to GDM3 or GDM4 ports via a hw arbiter that
> > > manages the traffic in a TDM manner. As a result multiple net_devices can
> > > connect to the same GDM{3,4} port and there is a theoretical "1:n"
> > > relation between GDM ports and net_devices.  
> > 
> > Looks like this driver uses page pool.
> > If you're sharing the same page pool across multiple netdevs
> > it must not be linked to a netdev.  
> 
> are you referring to slow.netdev pointer? If so, this is not set in airoha_eth
> driver.

Yes. Alright, thanks for checking. Pretty sure I saw it set somewhere 
in a file called airoha* but must be another component :)

^ permalink raw reply

* Re: [PATCH v3] selftests: vsock: avoid races creating Unix socket paths
From: Jakub Kicinski @ 2026-04-10 20:47 UTC (permalink / raw)
  To: Cao Ruichuang
  Cc: Stefano Garzarella, Shuah Khan, Stefano Garzarella, Simon Horman,
	Bobby Eshleman, virtualization, netdev, linux-kselftest,
	linux-kernel
In-Reply-To: <177581562073.13887.468247298173578281@163.com>

On Fri, 10 Apr 2026 18:07:00 +0800 Cao Ruichuang wrote:
> vmtest.sh currently uses mktemp -u to precompute Unix socket paths for the
> namespace bridge helpers. That only returns an unused pathname and leaves a
> time-of-check/time-of-use window before socat binds or connects to it.
> 
> Create a private temporary directory with mktemp -d and place the
> socket path inside it instead. This removes the pathname race while
> keeping cleanup straightforward.

And you actually run into this as a real problem?
How do you repro the failure?

Basic netdev rules:
 - don't post new version of patches in reply to the old ones
 - no more than 1 posting in a 24h period

^ permalink raw reply

* Re: [PATCH 2/4] net: ionic: Add PHC state page for user space access
From: Jakub Kicinski @ 2026-04-10 20:43 UTC (permalink / raw)
  To: Allen Hubbe
  Cc: Abhijit Gangurde, jgg, leon, brett.creeley, andrew+netdev, davem,
	edumazet, pabeni, nikhil.agarwal, linux-rdma, netdev,
	linux-kernel
In-Reply-To: <52cee89f-50e2-4569-a622-b03e711ab26b@amd.com>

On Fri, 10 Apr 2026 09:10:09 -0400 Allen Hubbe wrote:
> >> +struct ionic_phc_state {
> >> +     __u32 seq;
> >> +     __u32 rsvd;
> >> +     __aligned_u64 mask;
> >> +     __aligned_u64 tick;
> >> +     __aligned_u64 nsec;
> >> +     __aligned_u64 frac;
> >> +     __u32 mult;
> >> +     __u32 shift;
> >> +};  
> > 
> > You're just exposing kernel timecounter internals.
> > Why is this ionic uAPI and not something reusable by other drivers?  
> 
> The simple answer is just following the same approach as an existing 
> implementation.  See struct mlx5_ib_clock_info and 
> mlx5_update_clock_info_page().
> 
> Making this common might risk presuming that other implementations will 
> be a similar design.  Compare these to the sfc driver.  The clock is 
> quite different from ionic and mlx5, not using timecounter, because 
> instead of a free-running cycle counter the hardware itself provides an 
> adjustable clock for timestamping.

So your argument is basically that drivers which don't use sw timecounter
exist so we shouldn't bother creating common definitions for drivers
that do? Why do we have common implementation of timecounter in the
kernel at all then?

These are rhetorical questions.

^ permalink raw reply

* Re: [PATCH v5 net-next 0/8] dpll/ice: Add TXC DPLL type and full TX reference clock control for E825
From: Jakub Kicinski @ 2026-04-10 20:38 UTC (permalink / raw)
  To: Nitka, Grzegorz
  Cc: netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	intel-wired-lan@lists.osuosl.org, Oros, Petr,
	richardcochran@gmail.com, andrew+netdev@lunn.ch,
	Kitszel, Przemyslaw, Nguyen, Anthony L,
	Prathosh.Satish@microchip.com, Vecera, Ivan, jiri@resnulli.us,
	Kubalewski, Arkadiusz, vadim.fedorenko@linux.dev,
	donald.hunter@gmail.com, horms@kernel.org, pabeni@redhat.com,
	davem@davemloft.net, edumazet@google.com
In-Reply-To: <IA1PR11MB62194BF52262FCEB7FD5E76D92592@IA1PR11MB6219.namprd11.prod.outlook.com>

On Fri, 10 Apr 2026 14:23:58 +0000 Nitka, Grzegorz wrote:
> Here is the high-level connection diagram for E825 device. I hope you find it helpful:
> [..]

It does, thanks a lot.

> Before this series, we tried different approaches.
> One of them was to create MUX pin associated with netdev interface.
> EXT_REF and SYNCE pins were registered with this MUX pin.
> However I recall there were at least two issues with this solution:
> - when using DPLL subsystem not all the connections/relations were visible
>   from DPLL pin-get perspective. RT netlink was required
> - due to mixing pins from different modules (like fwnode based pin from zl driver
>   and the pins from ice), we were not able to safely clean the references between
>   pins and dpll (basically .. we observed crashes)
> 
> Proposed solution just seems to be clean and fully reflects current
> connection topology.

Do you have the link to the old proposal that was adding stuff to
rtnetlink? I remember some discussion long-ish ago, maybe I was wrong.

> What's actually your biggest concern?
> The fact we introduce a new DPLL type? Or multiply DPLL instances? Or both?
> Do you prefer to see "one big" DPLL with 16 pins in our case (8 ports x 2 tx-clk pins)?
> Each pin with the name like, for example, PF0-SyncE/PF0-eRef etc.?

My concern is that I think this is a pretty run of the mill SyncE
design. If we need to pretend we have two DPLLs here if we really
only have one and a mux - then our APIs are mis-designed :(

^ permalink raw reply

* Re: [PATCH net 1/1] af_unix: Hold receive queue lock in ioctl(SIOCATMARK)
From: Kuniyuki Iwashima @ 2026-04-10 20:06 UTC (permalink / raw)
  To: n05ec
  Cc: bird, davem, edumazet, enjou1224z, horms, kuba, kuniyu, netdev,
	pabeni, rao.shoaib, tomapufckgml, wangjiexun2025, yifanwucs,
	yuantan098
In-Reply-To: <f6cbbc8da90e95584847b5ceb60aae830d1631c2.1775731983.git.wangjiexun2025@gmail.com>

From: Ren Wei <n05ec@lzu.edu.cn>
Date: Fri, 10 Apr 2026 14:31:57 +0800
> From: Jiexun Wang <wangjiexun2025@gmail.com>
> 
> unix_ioctl() peeks at the receive queue and may check both the head skb
> and its successor while deciding whether SIOCATMARK should report the
> mark. However, u->iolock does not stabilize receive-queue element
> lifetime. Queue teardown paths can purge or splice the queue under

Please be more specific here.


> sk->sk_receive_queue.lock and free the skb while unix_ioctl() still
> uses it.
> 
> Take sk->sk_receive_queue.lock while inspecting the queue so the skb
> and next_skb stay alive for the whole decision.
> 
> Fixes: 314001f0bf92 ("af_unix: Add OOB support")
> Reported-by: Yifan Wu <yifanwucs@gmail.com>
> Reported-by: Juefei Pu <tomapufckgml@gmail.com>
> Co-developed-by: Yuan Tan <yuantan098@gmail.com>
> Signed-off-by: Yuan Tan <yuantan098@gmail.com>
> Suggested-by: Xin Liu <bird@lzu.edu.cn>
> Tested-by: Ren Wei <enjou1224z@gmail.com>
> Signed-off-by: Jiexun Wang <wangjiexun2025@gmail.com>
> Signed-off-by: Ren Wei <n05ec@lzu.edu.cn>
> ---
>  net/unix/af_unix.c | 3 +++
>  1 file changed, 3 insertions(+)
> 
> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
> index b23c33df8b46..54f12d5cda37 100644
> --- a/net/unix/af_unix.c
> +++ b/net/unix/af_unix.c
> @@ -3301,6 +3301,8 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
>  			int answ = 0;
>  
>  			mutex_lock(&u->iolock);
> +			/* The receive queue lock keeps skb and next_skb alive. */
> +			spin_lock(&sk->sk_receive_queue.lock);

I think this is not the correct fix.

SIOCATMARK is apparently for MSG_OOB,

  $ man 3 sockatmark

and non SOCK_STREAM sockets do not support MSG_OOB
and should not abuse SIOCATMARK.

---8<---
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index eebabf0bd850..868b26c963ab 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -3299,6 +3299,9 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 			struct sk_buff *skb;
 			int answ = 0;
 
+			if (sk->sk_type != SOCK_STREAM)
+				return -EOPNOTSUPP;
+
 			mutex_lock(&u->iolock);
 
 			skb = skb_peek(&sk->sk_receive_queue);
---8<---

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox