public inbox for netdev@vger.kernel.org
 help / color / mirror / Atom feed
From: Daniel Borkmann <daniel@iogearbox.net>
To: netdev@vger.kernel.org
Cc: bpf@vger.kernel.org, kuba@kernel.org, davem@davemloft.net,
	razor@blackwall.org, pabeni@redhat.com, willemb@google.com,
	sdf@fomichev.me, john.fastabend@gmail.com, martin.lau@kernel.org,
	jordan@jrife.io, maciej.fijalkowski@intel.com,
	magnus.karlsson@intel.com, dw@davidwei.uk, toke@redhat.com,
	yangzhenze@bytedance.com, wangdongdong.6@bytedance.com
Subject: [PATCH net-next v9 12/14] netkit: Add netkit notifier to check for unregistering devices
Date: Fri, 20 Mar 2026 23:18:12 +0100	[thread overview]
Message-ID: <20260320221814.236775-13-daniel@iogearbox.net> (raw)
In-Reply-To: <20260320221814.236775-1-daniel@iogearbox.net>

Add a netdevice notifier in netkit to watch for NETDEV_UNREGISTER events.
If the target device is indeed NETREG_UNREGISTERING and previously leased
a queue to a netkit device, then collect the related netkit devices and
batch-unregister_netdevice_many() them.

If this were not done, then the netkit device would hold a reference on
the physical device preventing it from going away. However, in case of
both io_uring zero-copy as well as AF_XDP this situation is handled
gracefully and the allocated resources are torn down.

In the case where mentioned infra is used through netkit, the applications
have a reference on netkit, and netkit in turn holds a reference on the
physical device. In order to have netkit release the reference on the
physical device, we need such watcher to then unregister the netkit ones.

This is generally quite similar to the dependency handling in case of
tunnels (e.g. vxlan bound to a underlying netdev) where the tunnel device
gets removed along with the physical device.

  # ip a
  [...]
  4: enp10s0f0np0: <BROADCAST,MULTICAST> mtu 1500 qdisc mq state DOWN group default qlen 1000
      link/ether e8:eb:d3:a3:43:f6 brd ff:ff:ff:ff:ff:ff
      inet 10.0.0.2/24 scope global enp10s0f0np0
         valid_lft forever preferred_lft forever
  [...]
  8: nk@NONE: <BROADCAST,MULTICAST,NOARP> mtu 1500 qdisc noop state DOWN group default qlen 1000
      link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff
  [...]

  # rmmod mlx5_ib
  # rmmod mlx5_core
  [...]
  [  309.261822] mlx5_core 0000:0a:00.0 mlx5_0: Port: 1 Link DOWN
  [  344.235236] mlx5_core 0000:0a:00.1: E-Switch: Unload vfs: mode(LEGACY), nvfs(0), necvfs(0), active vports(0)
  [  344.246948] mlx5_core 0000:0a:00.1: E-Switch: Disable: mode(LEGACY), nvfs(0), necvfs(0), active vports(0)
  [  344.463754] mlx5_core 0000:0a:00.1: E-Switch: Disable: mode(LEGACY), nvfs(0), necvfs(0), active vports(0)
  [  344.770155] mlx5_core 0000:0a:00.1: E-Switch: cleanup
  [...]

  # ip a
  [...]
  [ both enp10s0f0np0 and nk gone ]
  [...]

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Co-developed-by: David Wei <dw@davidwei.uk>
Signed-off-by: David Wei <dw@davidwei.uk>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
---
 drivers/net/netkit.c      | 59 ++++++++++++++++++++++++++++++++++++++-
 include/linux/netdevice.h |  2 ++
 net/core/dev.c            |  6 ++++
 3 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c
index ed0d56bfb60c..018734c9897b 100644
--- a/drivers/net/netkit.c
+++ b/drivers/net/netkit.c
@@ -1048,6 +1048,50 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[],
 	return 0;
 }
 
+static void netkit_check_lease_unregister(struct net_device *dev)
+{
+	LIST_HEAD(list_kill);
+	u32 q_idx;
+
+	if (READ_ONCE(dev->reg_state) != NETREG_UNREGISTERING ||
+	    !dev->dev.parent)
+		return;
+
+	netdev_lock_ops(dev);
+	for (q_idx = 0; q_idx < dev->real_num_rx_queues; q_idx++) {
+		struct net_device *tmp = dev;
+		struct netdev_rx_queue *rxq;
+		u32 tmp_q_idx = q_idx;
+
+		rxq = __netif_get_rx_queue_lease(&tmp, &tmp_q_idx,
+						 NETIF_PHYS_TO_VIRT);
+		if (rxq && tmp != dev &&
+		    tmp->netdev_ops == &netkit_netdev_ops) {
+			/* A single phys device can have multiple queues leased
+			 * to one netkit device. We can only queue that netkit
+			 * device once to the list_kill. Queues of that phys
+			 * device can be leased with different individual netkit
+			 * devices, hence we batch via list_kill.
+			 */
+			if (unregister_netdevice_queued(tmp))
+				continue;
+			netkit_del_link(tmp, &list_kill);
+		}
+	}
+	netdev_unlock_ops(dev);
+	unregister_netdevice_many(&list_kill);
+}
+
+static int netkit_notifier(struct notifier_block *this,
+			   unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+	if (event == NETDEV_UNREGISTER)
+		netkit_check_lease_unregister(dev);
+	return NOTIFY_DONE;
+}
+
 static size_t netkit_get_size(const struct net_device *dev)
 {
 	return nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_POLICY */
@@ -1124,18 +1168,31 @@ static struct rtnl_link_ops netkit_link_ops = {
 	.maxtype	= IFLA_NETKIT_MAX,
 };
 
+static struct notifier_block netkit_netdev_notifier = {
+	.notifier_call	= netkit_notifier,
+};
+
 static __init int netkit_mod_init(void)
 {
+	int ret;
+
 	BUILD_BUG_ON((int)NETKIT_NEXT != (int)TCX_NEXT ||
 		     (int)NETKIT_PASS != (int)TCX_PASS ||
 		     (int)NETKIT_DROP != (int)TCX_DROP ||
 		     (int)NETKIT_REDIRECT != (int)TCX_REDIRECT);
 
-	return rtnl_link_register(&netkit_link_ops);
+	ret = rtnl_link_register(&netkit_link_ops);
+	if (ret)
+		return ret;
+	ret = register_netdevice_notifier(&netkit_netdev_notifier);
+	if (ret)
+		rtnl_link_unregister(&netkit_link_ops);
+	return ret;
 }
 
 static __exit void netkit_mod_exit(void)
 {
+	unregister_netdevice_notifier(&netkit_netdev_notifier);
 	rtnl_link_unregister(&netkit_link_ops);
 }
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 35b194e57c3f..0b26b27a3d92 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3411,6 +3411,8 @@ static inline int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
 int register_netdevice(struct net_device *dev);
 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head);
 void unregister_netdevice_many(struct list_head *head);
+bool unregister_netdevice_queued(const struct net_device *dev);
+
 static inline void unregister_netdevice(struct net_device *dev)
 {
 	unregister_netdevice_queue(dev, NULL);
diff --git a/net/core/dev.c b/net/core/dev.c
index 763a2c6c3bf1..60d23a49c929 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -12360,6 +12360,12 @@ static void netif_close_many_and_unlock_cond(struct list_head *close_head)
 #endif
 }
 
+bool unregister_netdevice_queued(const struct net_device *dev)
+{
+	ASSERT_RTNL();
+	return !list_empty(&dev->unreg_list);
+}
+
 void unregister_netdevice_many_notify(struct list_head *head,
 				      u32 portid, const struct nlmsghdr *nlh)
 {
-- 
2.43.0


  parent reply	other threads:[~2026-03-20 22:18 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-03-20 22:18 [PATCH net-next v9 00/14] netkit: Support for io_uring zero-copy and AF_XDP Daniel Borkmann
2026-03-20 22:18 ` [PATCH net-next v9 01/14] net: Add queue-create operation Daniel Borkmann
2026-03-20 22:18 ` [PATCH net-next v9 02/14] net: Implement netdev_nl_queue_create_doit Daniel Borkmann
2026-03-20 22:18 ` [PATCH net-next v9 03/14] net: Add lease info to queue-get response Daniel Borkmann
2026-03-20 22:18 ` [PATCH net-next v9 04/14] net, ethtool: Disallow leased real rxqs to be resized Daniel Borkmann
2026-03-20 22:18 ` [PATCH net-next v9 05/14] net: Slightly simplify net_mp_{open,close}_rxq Daniel Borkmann
2026-03-20 22:18 ` [PATCH net-next v9 06/14] net: Proxy netif_mp_{open,close}_rxq for leased queues Daniel Borkmann
2026-03-20 22:18 ` [PATCH net-next v9 07/14] net: Proxy netdev_queue_get_dma_dev " Daniel Borkmann
2026-03-20 22:18 ` [PATCH net-next v9 08/14] xsk: Extend xsk_rcv_check validation Daniel Borkmann
2026-03-20 22:18 ` [PATCH net-next v9 09/14] xsk: Proxy pool management for leased queues Daniel Borkmann
2026-03-20 22:18 ` [PATCH net-next v9 10/14] netkit: Add single device mode for netkit Daniel Borkmann
2026-03-20 22:18 ` [PATCH net-next v9 11/14] netkit: Implement rtnl_link_ops->alloc and ndo_queue_create Daniel Borkmann
2026-03-20 22:18 ` Daniel Borkmann [this message]
2026-03-20 22:18 ` [PATCH net-next v9 13/14] netkit: Add xsk support for af_xdp applications Daniel Borkmann
2026-03-20 22:18 ` [PATCH net-next v9 14/14] selftests/net: Add queue lease tests Daniel Borkmann
2026-03-24  2:55   ` Jakub Kicinski
2026-03-24  9:29     ` Daniel Borkmann
2026-03-24 21:40       ` Jakub Kicinski

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260320221814.236775-13-daniel@iogearbox.net \
    --to=daniel@iogearbox.net \
    --cc=bpf@vger.kernel.org \
    --cc=davem@davemloft.net \
    --cc=dw@davidwei.uk \
    --cc=john.fastabend@gmail.com \
    --cc=jordan@jrife.io \
    --cc=kuba@kernel.org \
    --cc=maciej.fijalkowski@intel.com \
    --cc=magnus.karlsson@intel.com \
    --cc=martin.lau@kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=pabeni@redhat.com \
    --cc=razor@blackwall.org \
    --cc=sdf@fomichev.me \
    --cc=toke@redhat.com \
    --cc=wangdongdong.6@bytedance.com \
    --cc=willemb@google.com \
    --cc=yangzhenze@bytedance.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox