From: Jakub Kicinski <kuba@kernel.org>
To: davem@davemloft.net
Cc: netdev@vger.kernel.org, edumazet@google.com, pabeni@redhat.com,
andrew+netdev@lunn.ch, horms@kernel.org, jv@jvosburgh.net,
sdf@fomichev.me, dongchenchen2@huawei.com, idosch@nvidia.com,
n05ec@lzu.edu.cn, yuantan098@gmail.com, kuniyu@google.com,
nb@tipi-net.de, aleksandr.loktionov@intel.com,
dtatulea@nvidia.com, Jakub Kicinski <kuba@kernel.org>
Subject: [PATCH net 1/4] net: turn the rx_mode work into a generic netdev_work facility
Date: Wed, 24 Jun 2026 11:20:15 -0700 [thread overview]
Message-ID: <20260624182018.2445732-2-kuba@kernel.org> (raw)
In-Reply-To: <20260624182018.2445732-1-kuba@kernel.org>
The rx_mode update runs from a workqueue: drivers have their
ndo_set_rx_mode_async() callback executed by a single global
work item under RTNL and the ops lock. This is a useful pattern,
so generalize it: support multiple "events" that need to be serviced
and make the RX_MODE sync the first one. Call the events "core"
because later on we will let drivers define and schedule their own.
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
net/core/Makefile | 2 +-
include/linux/netdevice.h | 10 ++--
net/core/dev.h | 11 +++-
net/core/dev.c | 1 +
net/core/dev_addr_lists.c | 77 +------------------------
net/core/netdev_work.c | 117 ++++++++++++++++++++++++++++++++++++++
6 files changed, 138 insertions(+), 80 deletions(-)
create mode 100644 net/core/netdev_work.c
diff --git a/net/core/Makefile b/net/core/Makefile
index dc17c5a61e9a..b3fdcb4e355f 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -13,7 +13,7 @@ obj-y += dev.o dev_api.o dev_addr_lists.o dst.o netevent.o \
neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \
fib_notifier.o xdp.o flow_offload.o gro.o \
- netdev-genl.o netdev-genl-gen.o gso.o
+ netdev-genl.o netdev-genl-gen.o netdev_work.o gso.o
obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b67a12541eac..732506787db3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1930,8 +1930,9 @@ enum netdev_reg_state {
* has been enabled due to the need to listen to
* additional unicast addresses in a device that
* does not implement ndo_set_rx_mode()
- * @rx_mode_node: List entry for rx_mode work processing
- * @rx_mode_tracker: Refcount tracker for rx_mode work
+ * @work_node: List entry for async netdev_work processing
+ * @work_tracker: Refcount tracker for async netdev_work
+ * @work_core_pending: Core-defined pending netdev_work (NETDEV_WORK_*)
* @rx_mode_addr_cache: Recycled snapshot entries for rx_mode work
* @rx_mode_retry_timer: Timer that re-queues rx_mode work after failure
* @rx_mode_retry_count: Number of consecutive retries already scheduled
@@ -2326,8 +2327,9 @@ struct net_device {
unsigned int promiscuity;
unsigned int allmulti;
bool uc_promisc;
- struct list_head rx_mode_node;
- netdevice_tracker rx_mode_tracker;
+ struct list_head work_node;
+ netdevice_tracker work_tracker;
+ unsigned long work_core_pending;
struct netdev_hw_addr_list rx_mode_addr_cache;
struct timer_list rx_mode_retry_timer;
unsigned int rx_mode_retry_count;
diff --git a/net/core/dev.h b/net/core/dev.h
index 4121c50e7c88..5d0b0305d3ba 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -167,10 +167,19 @@ int dev_change_carrier(struct net_device *dev, bool new_carrier);
void __dev_set_rx_mode(struct net_device *dev);
int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify);
void netif_rx_mode_init(struct net_device *dev);
-bool netif_rx_mode_clean(struct net_device *dev);
+void netif_rx_mode_run(struct net_device *dev);
void netif_rx_mode_sync(struct net_device *dev);
void netif_rx_mode_cancel_retry(struct net_device *dev);
+/* Core events for the async netdev work; bits of netdev->work_core_pending. */
+enum netdev_work_core {
+ NETDEV_WORK_RX_MODE = BIT(0), /* run the rx_mode update, netif_rx_mode_run() */
+};
+
+void __netdev_work_core_sched(struct net_device *dev, unsigned long event);
+unsigned long
+__netdev_work_core_cancel(struct net_device *dev, unsigned long mask);
+
void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
unsigned int gchanges, u32 portid,
const struct nlmsghdr *nlh);
diff --git a/net/core/dev.c b/net/core/dev.c
index 5c01dfaa6c44..e1d8af0ef6ab 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -12093,6 +12093,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
INIT_LIST_HEAD(&dev->ptype_all);
INIT_LIST_HEAD(&dev->ptype_specific);
INIT_LIST_HEAD(&dev->net_notifier_list);
+ INIT_LIST_HEAD(&dev->work_node);
#ifdef CONFIG_NET_SCHED
hash_init(dev->qdisc_hash);
#endif
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index e17f64a65e17..08528ca0a8b3 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -12,17 +12,10 @@
#include <linux/export.h>
#include <linux/list.h>
#include <linux/spinlock.h>
-#include <linux/workqueue.h>
#include <kunit/visibility.h>
#include "dev.h"
-static void netdev_rx_mode_work(struct work_struct *work);
-
-static LIST_HEAD(rx_mode_list);
-static DEFINE_SPINLOCK(rx_mode_lock);
-static DECLARE_WORK(rx_mode_work, netdev_rx_mode_work);
-
/*
* General list handling functions
*/
@@ -1281,7 +1274,7 @@ void netif_rx_mode_cancel_retry(struct net_device *dev)
dev->rx_mode_retry_count = 0;
}
-static void netif_rx_mode_run(struct net_device *dev)
+void netif_rx_mode_run(struct net_device *dev)
{
struct netdev_hw_addr_list uc_snap, mc_snap, uc_ref, mc_ref;
const struct net_device_ops *ops = dev->netdev_ops;
@@ -1339,49 +1332,9 @@ static void netif_rx_mode_run(struct net_device *dev)
}
}
-static void netdev_rx_mode_work(struct work_struct *work)
-{
- struct net_device *dev;
-
- rtnl_lock();
-
- while (true) {
- spin_lock_bh(&rx_mode_lock);
- if (list_empty(&rx_mode_list)) {
- spin_unlock_bh(&rx_mode_lock);
- break;
- }
- dev = list_first_entry(&rx_mode_list, struct net_device,
- rx_mode_node);
- list_del_init(&dev->rx_mode_node);
- /* We must free netdev tracker under
- * the spinlock protection.
- */
- netdev_tracker_free(dev, &dev->rx_mode_tracker);
- spin_unlock_bh(&rx_mode_lock);
-
- netdev_lock_ops(dev);
- netif_rx_mode_run(dev);
- netdev_unlock_ops(dev);
- /* Use __dev_put() because netdev_tracker_free() was already
- * called above. Must be after netdev_unlock_ops() to prevent
- * netdev_run_todo() from freeing the device while still in use.
- */
- __dev_put(dev);
- }
-
- rtnl_unlock();
-}
-
static void netif_rx_mode_queue(struct net_device *dev)
{
- spin_lock_bh(&rx_mode_lock);
- if (list_empty(&dev->rx_mode_node)) {
- list_add_tail(&dev->rx_mode_node, &rx_mode_list);
- netdev_hold(dev, &dev->rx_mode_tracker, GFP_ATOMIC);
- }
- spin_unlock_bh(&rx_mode_lock);
- schedule_work(&rx_mode_work);
+ __netdev_work_core_sched(dev, NETDEV_WORK_RX_MODE);
}
static void netif_rx_mode_retry(struct timer_list *t)
@@ -1394,7 +1347,6 @@ static void netif_rx_mode_retry(struct timer_list *t)
void netif_rx_mode_init(struct net_device *dev)
{
- INIT_LIST_HEAD(&dev->rx_mode_node);
__hw_addr_init(&dev->rx_mode_addr_cache);
timer_setup(&dev->rx_mode_retry_timer, netif_rx_mode_retry, 0);
}
@@ -1442,24 +1394,6 @@ void dev_set_rx_mode(struct net_device *dev)
netif_addr_unlock_bh(dev);
}
-bool netif_rx_mode_clean(struct net_device *dev)
-{
- bool clean = false;
-
- spin_lock_bh(&rx_mode_lock);
- if (!list_empty(&dev->rx_mode_node)) {
- list_del_init(&dev->rx_mode_node);
- clean = true;
- /* We must release netdev tracker under
- * the spinlock protection.
- */
- netdev_tracker_free(dev, &dev->rx_mode_tracker);
- }
- spin_unlock_bh(&rx_mode_lock);
-
- return clean;
-}
-
/**
* netif_rx_mode_sync() - sync rx mode inline
* @dev: network device
@@ -1473,11 +1407,6 @@ bool netif_rx_mode_clean(struct net_device *dev)
*/
void netif_rx_mode_sync(struct net_device *dev)
{
- if (netif_rx_mode_clean(dev)) {
+ if (__netdev_work_core_cancel(dev, NETDEV_WORK_RX_MODE))
netif_rx_mode_run(dev);
- /* Use __dev_put() because netdev_tracker_free() was already
- * called inside netif_rx_mode_clean().
- */
- __dev_put(dev);
- }
}
diff --git a/net/core/netdev_work.c b/net/core/netdev_work.c
new file mode 100644
index 000000000000..c121c24dc493
--- /dev/null
+++ b/net/core/netdev_work.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <net/netdev_lock.h>
+
+#include "dev.h"
+
+static void netdev_work_proc(struct work_struct *work);
+
+/* @netdev_work_lock protects:
+ * - @netdev_work_list
+ * - within the list entries (struct net_device fields):
+ * - work_node
+ * - work_tracker
+ * - work_core_pending
+ */
+static LIST_HEAD(netdev_work_list);
+static DEFINE_SPINLOCK(netdev_work_lock);
+static DECLARE_WORK(netdev_work, netdev_work_proc);
+
+void __netdev_work_core_sched(struct net_device *dev, unsigned long event)
+{
+ spin_lock_bh(&netdev_work_lock);
+ if (list_empty(&dev->work_node)) { /* not queued yet: enqueue @dev */
+ list_add_tail(&dev->work_node, &netdev_work_list);
+ netdev_hold(dev, &dev->work_tracker, GFP_ATOMIC); /* held while queued */
+ }
+ dev->work_core_pending |= event; /* merge with events already pending */
+ spin_unlock_bh(&netdev_work_lock);
+
+ schedule_work(&netdev_work); /* netdev_work_proc() services the list */
+}
+
+/**
+ * __netdev_work_core_cancel() - cancel selected core work for a netdev
+ * @dev: net_device
+ * @mask: events to cancel (NETDEV_WORK_* bits)
+ *
+ * Clear @mask from the device's pending work mask. If no work is left pending
+ * the device is dequeued and the reference taken when it was queued is dropped.
+ *
+ * No expectations on locking, but also no guarantees provided. If the caller
+ * wants to touch @dev afterwards (e.g. run the work that got canceled)
+ * they have to ensure @dev does not get freed.
+ *
+ * Returns: the subset of @mask that was actually pending, so the caller can run
+ * those events inline.
+ */
+unsigned long
+__netdev_work_core_cancel(struct net_device *dev, unsigned long mask)
+{
+ unsigned long event;
+
+ spin_lock_bh(&netdev_work_lock);
+ event = dev->work_core_pending & mask; /* bits of @mask that were pending */
+ dev->work_core_pending &= ~mask;
+ if (!list_empty(&dev->work_node) && !dev->work_core_pending) {
+ list_del_init(&dev->work_node);
+ netdev_put(dev, &dev->work_tracker); /* ref taken at enqueue time */
+ }
+ spin_unlock_bh(&netdev_work_lock);
+
+ return event;
+}
+
+static void netdev_work_proc(struct work_struct *work)
+{
+ rtnl_lock(); /* every queued event below runs under RTNL */
+
+ while (true) {
+ netdevice_tracker tracker;
+ struct net_device *dev;
+ unsigned long core = 0;
+
+ spin_lock_bh(&netdev_work_lock);
+ if (list_empty(&netdev_work_list)) { /* nothing left to service */
+ spin_unlock_bh(&netdev_work_lock);
+ break;
+ }
+ dev = list_first_entry(&netdev_work_list, struct net_device,
+ work_node);
+ /* Take a temporary reference so @dev can't be freed while we
+ * drop the spinlock to take its ops lock; the queue reference
+ * (work_tracker) is only released once we claim the work below.
+ * Re-taking the spinlock under the ops lock means that holding
+ * the ops lock is enough for a cancel not to race with dequeue.
+ */
+ netdev_hold(dev, &tracker, GFP_ATOMIC);
+ spin_unlock_bh(&netdev_work_lock);
+
+ netdev_lock_ops(dev);
+ spin_lock_bh(&netdev_work_lock);
+ if (!list_empty(&dev->work_node)) { /* not dequeued meanwhile */
+ list_del_init(&dev->work_node);
+ core = dev->work_core_pending;
+ dev->work_core_pending = 0;
+ /* Safe to drop: the temporary ref above still pins @dev */
+ netdev_put(dev, &dev->work_tracker);
+
+ if (!dev_isalive(dev))
+ core = 0; /* discard the claimed events, run nothing */
+ }
+ spin_unlock_bh(&netdev_work_lock);
+
+ if (core & NETDEV_WORK_RX_MODE)
+ netif_rx_mode_run(dev);
+ netdev_unlock_ops(dev);
+
+ netdev_put(dev, &tracker); /* release the temporary reference */
+ }
+
+ rtnl_unlock();
+}
--
2.54.0
next prev parent reply other threads:[~2026-06-24 18:20 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-06-24 18:20 [PATCH net 0/4] net: avoid nested UP notifier events Jakub Kicinski
2026-06-24 18:20 ` Jakub Kicinski [this message]
2026-06-24 18:20 ` [PATCH net 2/4] net: add the driver-facing netdev_work scheduling API Jakub Kicinski
2026-06-24 18:20 ` [PATCH net 3/4] vlan: defer real device state propagation to netdev_work Jakub Kicinski
2026-06-24 18:20 ` [PATCH net 4/4] selftests: bonding: add a test for VLAN propagation over a bonded real device Jakub Kicinski
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260624182018.2445732-2-kuba@kernel.org \
--to=kuba@kernel.org \
--cc=aleksandr.loktionov@intel.com \
--cc=andrew+netdev@lunn.ch \
--cc=davem@davemloft.net \
--cc=dongchenchen2@huawei.com \
--cc=dtatulea@nvidia.com \
--cc=edumazet@google.com \
--cc=horms@kernel.org \
--cc=idosch@nvidia.com \
--cc=jv@jvosburgh.net \
--cc=kuniyu@google.com \
--cc=n05ec@lzu.edu.cn \
--cc=nb@tipi-net.de \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
--cc=sdf@fomichev.me \
--cc=yuantan098@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox