From: Stanislav Fomichev <sdf@fomichev.me>
To: netdev@vger.kernel.org
Cc: davem@davemloft.net, edumazet@google.com, kuba@kernel.org,
pabeni@redhat.com
Subject: [PATCH net-next v4 02/14] net: introduce ndo_set_rx_mode_async and netdev_rx_mode_work
Date: Tue, 31 Mar 2026 18:19:02 -0700 [thread overview]
Message-ID: <20260401011914.1716692-3-sdf@fomichev.me> (raw)
In-Reply-To: <20260401011914.1716692-1-sdf@fomichev.me>
Add ndo_set_rx_mode_async callback that drivers can implement instead
of the legacy ndo_set_rx_mode. The legacy callback runs under the
netif_addr_lock spinlock with BHs disabled, preventing drivers from
sleeping. The async variant runs from a work queue with rtnl_lock and
netdev_lock_ops held, in fully sleepable context.
When __dev_set_rx_mode() sees ndo_set_rx_mode_async, it schedules
netdev_rx_mode_work instead of calling the driver inline. The work
function takes two snapshots of each address list (uc/mc) under
the addr_lock, then drops the lock and calls the driver with the
work copies. After the driver returns, it reconciles the snapshots
back to the real lists under the lock.
Add netif_rx_mode_sync() to opportunistically execute the pending
workqueue update inline, so that rx mode changes are committed
before returning to userspace:
- dev_change_flags (SIOCSIFFLAGS / RTM_NEWLINK)
- dev_set_promiscuity
- dev_set_allmulti
- dev_ifsioc SIOCADDMULTI / SIOCDELMULTI
- do_setlink (RTM_SETLINK)
Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
---
Documentation/networking/netdevices.rst | 9 ++
include/linux/netdevice.h | 16 ++
net/core/dev.c | 46 +-----
net/core/dev.h | 3 +
net/core/dev_addr_lists.c | 194 ++++++++++++++++++++++++
net/core/dev_api.c | 3 +
net/core/dev_ioctl.c | 6 +-
net/core/rtnetlink.c | 1 +
8 files changed, 235 insertions(+), 43 deletions(-)
diff --git a/Documentation/networking/netdevices.rst b/Documentation/networking/netdevices.rst
index 35704d115312..8a488c21fd7c 100644
--- a/Documentation/networking/netdevices.rst
+++ b/Documentation/networking/netdevices.rst
@@ -289,6 +289,15 @@ struct net_device synchronization rules
ndo_set_rx_mode:
Synchronization: netif_addr_lock spinlock.
Context: BHs disabled
+ Notes: Deprecated in favor of ndo_set_rx_mode_async which runs
+ in process context.
+
+ndo_set_rx_mode_async:
+ Synchronization: rtnl_lock() semaphore. In addition, netdev instance
+ lock if the driver implements queue management or shaper API.
+ Context: process (from a work queue)
+ Notes: Async version of ndo_set_rx_mode which runs in process
+ context. Receives snapshots of the unicast and multicast address lists.
ndo_setup_tc:
``TC_SETUP_BLOCK`` and ``TC_SETUP_FT`` are running under NFT locks
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 151d6f4fd9b3..cf3dd3f1eb0f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1119,6 +1119,16 @@ struct netdev_net_notifier {
* This function is called device changes address list filtering.
* If driver handles unicast address filtering, it should set
* IFF_UNICAST_FLT in its priv_flags.
+ * Cannot sleep, called with netif_addr_lock_bh held.
+ * Deprecated in favor of ndo_set_rx_mode_async.
+ *
+ * void (*ndo_set_rx_mode_async)(struct net_device *dev,
+ * struct netdev_hw_addr_list *uc,
+ * struct netdev_hw_addr_list *mc);
+ * Async version of ndo_set_rx_mode which runs in process context
+ * with rtnl_lock and netdev_lock_ops(dev) held. The uc/mc parameters
+ * are snapshots of the address lists - iterate with
+ * netdev_hw_addr_list_for_each(ha, uc).
*
* int (*ndo_set_mac_address)(struct net_device *dev, void *addr);
* This function is called when the Media Access Control address
@@ -1439,6 +1449,10 @@ struct net_device_ops {
void (*ndo_change_rx_flags)(struct net_device *dev,
int flags);
void (*ndo_set_rx_mode)(struct net_device *dev);
+ void (*ndo_set_rx_mode_async)(
+ struct net_device *dev,
+ struct netdev_hw_addr_list *uc,
+ struct netdev_hw_addr_list *mc);
int (*ndo_set_mac_address)(struct net_device *dev,
void *addr);
int (*ndo_validate_addr)(struct net_device *dev);
@@ -1903,6 +1917,7 @@ enum netdev_reg_state {
* has been enabled due to the need to listen to
* additional unicast addresses in a device that
* does not implement ndo_set_rx_mode()
+ * @rx_mode_node: List entry for rx_mode work processing
* @uc: unicast mac addresses
* @mc: multicast mac addresses
* @dev_addrs: list of device hw addresses
@@ -2294,6 +2309,7 @@ struct net_device {
unsigned int promiscuity;
unsigned int allmulti;
bool uc_promisc;
+ struct list_head rx_mode_node;
#ifdef CONFIG_LOCKDEP
unsigned char nested_level;
#endif
diff --git a/net/core/dev.c b/net/core/dev.c
index 90d041390e29..b9f9e6a30963 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -9574,7 +9574,7 @@ static void dev_change_rx_flags(struct net_device *dev, int flags)
ops->ndo_change_rx_flags(dev, flags);
}
-static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
+int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
{
unsigned int old_flags = dev->flags;
unsigned int promiscuity, flags;
@@ -9678,46 +9678,6 @@ int netif_set_allmulti(struct net_device *dev, int inc, bool notify)
return 0;
}
-/*
- * Upload unicast and multicast address lists to device and
- * configure RX filtering. When the device doesn't support unicast
- * filtering it is put in promiscuous mode while unicast addresses
- * are present.
- */
-void __dev_set_rx_mode(struct net_device *dev)
-{
- const struct net_device_ops *ops = dev->netdev_ops;
-
- /* dev_open will call this function so the list will stay sane. */
- if (!(dev->flags&IFF_UP))
- return;
-
- if (!netif_device_present(dev))
- return;
-
- if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
- /* Unicast addresses changes may only happen under the rtnl,
- * therefore calling __dev_set_promiscuity here is safe.
- */
- if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
- __dev_set_promiscuity(dev, 1, false);
- dev->uc_promisc = true;
- } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
- __dev_set_promiscuity(dev, -1, false);
- dev->uc_promisc = false;
- }
- }
-
- if (ops->ndo_set_rx_mode)
- ops->ndo_set_rx_mode(dev);
-}
-
-void dev_set_rx_mode(struct net_device *dev)
-{
- netif_addr_lock_bh(dev);
- __dev_set_rx_mode(dev);
- netif_addr_unlock_bh(dev);
-}
/**
* netif_get_flags() - get flags reported to userspace
@@ -12108,6 +12068,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
#endif
mutex_init(&dev->lock);
+ INIT_LIST_HEAD(&dev->rx_mode_node);
dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
setup(dev);
@@ -12212,6 +12173,9 @@ void free_netdev(struct net_device *dev)
kfree(rcu_dereference_protected(dev->ingress_queue, 1));
+ if (netif_rx_mode_clean(dev))
+ dev_put(dev);
+
/* Flush device addresses */
dev_addr_flush(dev);
diff --git a/net/core/dev.h b/net/core/dev.h
index acc925b7b337..50edb380ca94 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -146,6 +146,9 @@ int netif_change_carrier(struct net_device *dev, bool new_carrier);
int dev_change_carrier(struct net_device *dev, bool new_carrier);
void __dev_set_rx_mode(struct net_device *dev);
+int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify);
+bool netif_rx_mode_clean(struct net_device *dev);
+void netif_rx_mode_sync(struct net_device *dev);
void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
unsigned int gchanges, u32 portid,
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index 284fcfe9389b..a05db3ac43ea 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -11,10 +11,18 @@
#include <linux/rtnetlink.h>
#include <linux/export.h>
#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
#include <kunit/visibility.h>
#include "dev.h"
+static void netdev_rx_mode_work(struct work_struct *work);
+
+static LIST_HEAD(rx_mode_list);
+static DEFINE_SPINLOCK(rx_mode_lock);
+static DECLARE_WORK(rx_mode_work, netdev_rx_mode_work);
+
/*
* General list handling functions
*/
@@ -1159,3 +1167,189 @@ void dev_mc_init(struct net_device *dev)
__hw_addr_init(&dev->mc);
}
EXPORT_SYMBOL(dev_mc_init);
+
+static int netif_addr_lists_snapshot(struct net_device *dev,
+ struct netdev_hw_addr_list *uc_snap,
+ struct netdev_hw_addr_list *mc_snap,
+ struct netdev_hw_addr_list *uc_ref,
+ struct netdev_hw_addr_list *mc_ref)
+{
+ int err;
+
+ err = __hw_addr_list_snapshot(uc_snap, &dev->uc, dev->addr_len);
+ if (!err)
+ err = __hw_addr_list_snapshot(uc_ref, &dev->uc, dev->addr_len);
+ if (!err)
+ err = __hw_addr_list_snapshot(mc_snap, &dev->mc,
+ dev->addr_len);
+ if (!err)
+ err = __hw_addr_list_snapshot(mc_ref, &dev->mc, dev->addr_len);
+
+ if (err) {
+ __hw_addr_flush(uc_snap);
+ __hw_addr_flush(uc_ref);
+ __hw_addr_flush(mc_snap);
+ }
+
+ return err;
+}
+
+static void netif_addr_lists_reconcile(struct net_device *dev,
+ struct netdev_hw_addr_list *uc_snap,
+ struct netdev_hw_addr_list *mc_snap,
+ struct netdev_hw_addr_list *uc_ref,
+ struct netdev_hw_addr_list *mc_ref)
+{
+ __hw_addr_list_reconcile(&dev->uc, uc_snap, uc_ref, dev->addr_len);
+ __hw_addr_list_reconcile(&dev->mc, mc_snap, mc_ref, dev->addr_len);
+}
+
+static void netif_rx_mode_run(struct net_device *dev)
+{
+ struct netdev_hw_addr_list uc_snap, mc_snap, uc_ref, mc_ref;
+ const struct net_device_ops *ops = dev->netdev_ops;
+ int err;
+
+ might_sleep();
+ netdev_ops_assert_locked(dev);
+
+ __hw_addr_init(&uc_snap);
+ __hw_addr_init(&mc_snap);
+ __hw_addr_init(&uc_ref);
+ __hw_addr_init(&mc_ref);
+
+ if (!(dev->flags & IFF_UP) || !netif_device_present(dev))
+ return;
+
+ netif_addr_lock_bh(dev);
+ err = netif_addr_lists_snapshot(dev, &uc_snap, &mc_snap,
+ &uc_ref, &mc_ref);
+ if (err) {
+ netdev_WARN(dev, "failed to sync uc/mc addresses\n");
+ netif_addr_unlock_bh(dev);
+ return;
+ }
+ netif_addr_unlock_bh(dev);
+
+ ops->ndo_set_rx_mode_async(dev, &uc_snap, &mc_snap);
+
+ netif_addr_lock_bh(dev);
+ netif_addr_lists_reconcile(dev, &uc_snap, &mc_snap,
+ &uc_ref, &mc_ref);
+ netif_addr_unlock_bh(dev);
+}
+
+static void netdev_rx_mode_work(struct work_struct *work)
+{
+ struct net_device *dev;
+
+ rtnl_lock();
+
+ while (true) {
+ spin_lock_bh(&rx_mode_lock);
+ if (list_empty(&rx_mode_list)) {
+ spin_unlock_bh(&rx_mode_lock);
+ break;
+ }
+ dev = list_first_entry(&rx_mode_list, struct net_device,
+ rx_mode_node);
+ list_del_init(&dev->rx_mode_node);
+ spin_unlock_bh(&rx_mode_lock);
+
+ netdev_lock_ops(dev);
+ netif_rx_mode_run(dev);
+ netdev_unlock_ops(dev);
+ dev_put(dev);
+ }
+
+ rtnl_unlock();
+}
+
+static void netif_rx_mode_queue(struct net_device *dev)
+{
+ spin_lock_bh(&rx_mode_lock);
+ if (list_empty(&dev->rx_mode_node)) {
+ list_add_tail(&dev->rx_mode_node, &rx_mode_list);
+ dev_hold(dev);
+ }
+ spin_unlock_bh(&rx_mode_lock);
+ schedule_work(&rx_mode_work);
+}
+
+/**
+ * __dev_set_rx_mode() - upload unicast and multicast address lists to device
+ * and configure RX filtering.
+ * @dev: device
+ *
+ * When the device doesn't support unicast filtering it is put in promiscuous
+ * mode while unicast addresses are present.
+ */
+void __dev_set_rx_mode(struct net_device *dev)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ /* dev_open will call this function so the list will stay sane. */
+ if (!(dev->flags & IFF_UP))
+ return;
+
+ if (!netif_device_present(dev))
+ return;
+
+ if (ops->ndo_set_rx_mode_async) {
+ netif_rx_mode_queue(dev);
+ return;
+ }
+
+ if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
+ if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
+ __dev_set_promiscuity(dev, 1, false);
+ dev->uc_promisc = true;
+ } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
+ __dev_set_promiscuity(dev, -1, false);
+ dev->uc_promisc = false;
+ }
+ }
+
+ if (ops->ndo_set_rx_mode)
+ ops->ndo_set_rx_mode(dev);
+}
+
+void dev_set_rx_mode(struct net_device *dev)
+{
+ netif_addr_lock_bh(dev);
+ __dev_set_rx_mode(dev);
+ netif_addr_unlock_bh(dev);
+}
+
+bool netif_rx_mode_clean(struct net_device *dev)
+{
+ bool clean = false;
+
+ spin_lock_bh(&rx_mode_lock);
+ if (!list_empty(&dev->rx_mode_node)) {
+ list_del_init(&dev->rx_mode_node);
+ clean = true;
+ }
+ spin_unlock_bh(&rx_mode_lock);
+
+ return clean;
+}
+
+/**
+ * netif_rx_mode_sync() - sync rx mode inline
+ * @dev: network device
+ *
+ * Drivers implementing ndo_set_rx_mode_async() have their rx mode callback
+ * executed from a workqueue. This allows the callback to sleep, but means
+ * the hardware update is deferred and may not be visible to userspace
+ * by the time the initiating syscall returns. netif_rx_mode_sync() steals
+ * the pending workqueue update and executes it inline. This preserves the
+ * atomicity of operations as observed by userspace.
+ */
+void netif_rx_mode_sync(struct net_device *dev)
+{
+ if (netif_rx_mode_clean(dev)) {
+ netif_rx_mode_run(dev);
+ dev_put(dev);
+ }
+}
diff --git a/net/core/dev_api.c b/net/core/dev_api.c
index f28852078aa6..437947dd08ed 100644
--- a/net/core/dev_api.c
+++ b/net/core/dev_api.c
@@ -66,6 +66,7 @@ int dev_change_flags(struct net_device *dev, unsigned int flags,
netdev_lock_ops(dev);
ret = netif_change_flags(dev, flags, extack);
+ netif_rx_mode_sync(dev);
netdev_unlock_ops(dev);
return ret;
@@ -285,6 +286,7 @@ int dev_set_promiscuity(struct net_device *dev, int inc)
netdev_lock_ops(dev);
ret = netif_set_promiscuity(dev, inc);
+ netif_rx_mode_sync(dev);
netdev_unlock_ops(dev);
return ret;
@@ -311,6 +313,7 @@ int dev_set_allmulti(struct net_device *dev, int inc)
netdev_lock_ops(dev);
ret = netif_set_allmulti(dev, inc, true);
+ netif_rx_mode_sync(dev);
netdev_unlock_ops(dev);
return ret;
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 7a8966544c9d..f3979b276090 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -586,24 +586,26 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
return err;
case SIOCADDMULTI:
- if (!ops->ndo_set_rx_mode ||
+ if ((!ops->ndo_set_rx_mode && !ops->ndo_set_rx_mode_async) ||
ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
return -EINVAL;
if (!netif_device_present(dev))
return -ENODEV;
netdev_lock_ops(dev);
err = dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
+ netif_rx_mode_sync(dev);
netdev_unlock_ops(dev);
return err;
case SIOCDELMULTI:
- if (!ops->ndo_set_rx_mode ||
+ if ((!ops->ndo_set_rx_mode && !ops->ndo_set_rx_mode_async) ||
ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
return -EINVAL;
if (!netif_device_present(dev))
return -ENODEV;
netdev_lock_ops(dev);
err = dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
+ netif_rx_mode_sync(dev);
netdev_unlock_ops(dev);
return err;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index fae8034efbff..f4e5ac70709d 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -3431,6 +3431,7 @@ static int do_setlink(const struct sk_buff *skb, struct net_device *dev,
dev->name);
}
+ netif_rx_mode_sync(dev);
netdev_unlock_ops(dev);
return err;
--
2.53.0
next prev parent reply other threads:[~2026-04-01 1:19 UTC|newest]
Thread overview: 18+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-01 1:19 [PATCH net-next v4 00/14] net: sleepable ndo_set_rx_mode Stanislav Fomichev
2026-04-01 1:19 ` [PATCH net-next v4 01/14] net: add address list snapshot and reconciliation infrastructure Stanislav Fomichev
2026-04-01 1:19 ` Stanislav Fomichev [this message]
2026-04-01 1:19 ` [PATCH net-next v4 03/14] net: move promiscuity handling into netdev_rx_mode_work Stanislav Fomichev
2026-04-01 1:19 ` [PATCH net-next v4 04/14] net: cache snapshot entries for ndo_set_rx_mode_async Stanislav Fomichev
2026-04-01 1:19 ` [PATCH net-next v4 05/14] fbnic: convert to ndo_set_rx_mode_async Stanislav Fomichev
2026-04-01 1:19 ` [PATCH net-next v4 06/14] mlx5: " Stanislav Fomichev
2026-04-01 1:19 ` [PATCH net-next v4 07/14] bnxt: " Stanislav Fomichev
2026-04-01 1:19 ` [PATCH net-next v4 08/14] bnxt: use snapshot in bnxt_cfg_rx_mode Stanislav Fomichev
2026-04-01 23:34 ` Michael Chan
2026-04-01 1:19 ` [PATCH net-next v4 09/14] iavf: convert to ndo_set_rx_mode_async Stanislav Fomichev
2026-04-01 1:19 ` [PATCH net-next v4 10/14] netdevsim: " Stanislav Fomichev
2026-04-01 1:19 ` [PATCH net-next v4 11/14] dummy: " Stanislav Fomichev
2026-04-01 1:19 ` [PATCH net-next v4 12/14] net: warn ops-locked drivers still using ndo_set_rx_mode Stanislav Fomichev
2026-04-01 1:19 ` [PATCH net-next v4 13/14] selftests: net: add team_bridge_macvlan rx_mode test Stanislav Fomichev
2026-04-01 1:19 ` [PATCH net-next v4 14/14] selftests: net: use ip commands instead of teamd in team " Stanislav Fomichev
2026-04-02 3:38 ` [PATCH net-next v4 00/14] net: sleepable ndo_set_rx_mode Jakub Kicinski
2026-04-02 15:06 ` Stanislav Fomichev
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260401011914.1716692-3-sdf@fomichev.me \
--to=sdf@fomichev.me \
--cc=davem@davemloft.net \
--cc=edumazet@google.com \
--cc=kuba@kernel.org \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox