From: Stanislav Fomichev <sdf@fomichev.me>
To: netdev@vger.kernel.org
Cc: davem@davemloft.net, edumazet@google.com, kuba@kernel.org,
pabeni@redhat.com
Subject: [PATCH net-next 02/11] net: introduce ndo_set_rx_mode_async and dev_rx_mode_work
Date: Fri, 13 Mar 2026 07:51:04 -0700 [thread overview]
Message-ID: <20260313145113.1424442-3-sdf@fomichev.me> (raw)
In-Reply-To: <20260313145113.1424442-1-sdf@fomichev.me>
Add ndo_set_rx_mode_async callback that drivers can implement instead
of the legacy ndo_set_rx_mode. The legacy callback runs under the
netif_addr_lock spinlock with BHs disabled, preventing drivers from
sleeping. The async variant runs from a work queue with rtnl_lock and
netdev_lock_ops held, in fully sleepable context.
When __dev_set_rx_mode() sees ndo_set_rx_mode_async, it schedules
dev_rx_mode_work instead of calling the driver inline. The work
function takes two snapshots of each address list (uc/mc) under
the addr_lock: a work copy that is handed to the driver, and a
reference copy that is kept for the later reconciliation. It then
drops the lock and calls the driver with the work copies. After the
driver returns, it reconciles the snapshots back to the real lists
under the lock.
Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
---
Documentation/networking/netdevices.rst | 8 +++
include/linux/netdevice.h | 20 ++++++
net/core/dev.c | 94 +++++++++++++++++++++++--
3 files changed, 115 insertions(+), 7 deletions(-)
diff --git a/Documentation/networking/netdevices.rst b/Documentation/networking/netdevices.rst
index 35704d115312..dc83d78d3b27 100644
--- a/Documentation/networking/netdevices.rst
+++ b/Documentation/networking/netdevices.rst
@@ -289,6 +289,14 @@ struct net_device synchronization rules
ndo_set_rx_mode:
Synchronization: netif_addr_lock spinlock.
Context: BHs disabled
+ Notes: Deprecated in favor of sleepable ndo_set_rx_mode_async.
+
+ndo_set_rx_mode_async:
+ Synchronization: rtnl_lock() semaphore. In addition, netdev instance
+ lock if the driver implements queue management or shaper API.
+ Context: process (from a work queue)
+ Notes: Sleepable version of ndo_set_rx_mode. Receives snapshots
+ of the unicast and multicast address lists.
ndo_setup_tc:
``TC_SETUP_BLOCK`` and ``TC_SETUP_FT`` are running under NFT locks
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 469b7cdb3237..7ede1f56bd70 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1117,6 +1117,16 @@ struct netdev_net_notifier {
* This function is called device changes address list filtering.
* If driver handles unicast address filtering, it should set
* IFF_UNICAST_FLT in its priv_flags.
+ * Cannot sleep, called with netif_addr_lock_bh held.
+ * Deprecated in favor of sleepable ndo_set_rx_mode_async.
+ *
+ * void (*ndo_set_rx_mode_async)(struct net_device *dev,
+ * struct netdev_hw_addr_list *uc,
+ * struct netdev_hw_addr_list *mc);
+ * Sleepable version of ndo_set_rx_mode. Called from a work queue
+ * with rtnl_lock and netdev_lock_ops(dev) held. The uc/mc parameters
+ * are snapshots of the address lists - iterate with
+ * netdev_hw_addr_list_for_each(ha, uc).
*
* int (*ndo_set_mac_address)(struct net_device *dev, void *addr);
* This function is called when the Media Access Control address
@@ -1437,6 +1447,9 @@ struct net_device_ops {
void (*ndo_change_rx_flags)(struct net_device *dev,
int flags);
void (*ndo_set_rx_mode)(struct net_device *dev);
+ void (*ndo_set_rx_mode_async)(struct net_device *dev,
+ struct netdev_hw_addr_list *uc,
+ struct netdev_hw_addr_list *mc);
int (*ndo_set_mac_address)(struct net_device *dev,
void *addr);
int (*ndo_validate_addr)(struct net_device *dev);
@@ -1903,6 +1916,7 @@ enum netdev_reg_state {
* has been enabled due to the need to listen to
* additional unicast addresses in a device that
* does not implement ndo_set_rx_mode()
+ * @rx_mode_work: Work queue entry for ndo_set_rx_mode_async()
* @uc: unicast mac addresses
* @mc: multicast mac addresses
* @dev_addrs: list of device hw addresses
@@ -2293,6 +2307,7 @@ struct net_device {
unsigned int promiscuity;
unsigned int allmulti;
bool uc_promisc;
+ struct work_struct rx_mode_work;
#ifdef CONFIG_LOCKDEP
unsigned char nested_level;
#endif
@@ -4661,6 +4676,11 @@ static inline bool netif_device_present(const struct net_device *dev)
return test_bit(__LINK_STATE_PRESENT, &dev->state);
}
+/**
+ * netif_up_and_present() - test whether a device is both up and present
+ * @dev: network device
+ *
+ * Return: true if IFF_UP is set in @dev->flags and
+ * netif_device_present() reports the device as present, false otherwise.
+ */
+static inline bool netif_up_and_present(const struct net_device *dev)
+{
+	if (!(dev->flags & IFF_UP))
+		return false;
+
+	return netif_device_present(dev);
+}
+
void netif_device_detach(struct net_device *dev);
void netif_device_attach(struct net_device *dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index f48dc299e4b2..4b9375afcd85 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2381,6 +2381,8 @@ static void netstamp_clear(struct work_struct *work)
static DECLARE_WORK(netstamp_work, netstamp_clear);
#endif
+static struct workqueue_struct *rx_mode_wq;
+
void net_enable_timestamp(void)
{
#ifdef CONFIG_JUMP_LABEL
@@ -9666,22 +9668,83 @@ int netif_set_allmulti(struct net_device *dev, int inc, bool notify)
return 0;
}
-/*
- * Upload unicast and multicast address lists to device and
- * configure RX filtering. When the device doesn't support unicast
- * filtering it is put in promiscuous mode while unicast addresses
- * are present.
+/**
+ * dev_rx_mode_work() - deferred, sleepable RX filter update
+ * @work: the rx_mode_work member embedded in the net_device to update
+ *
+ * Calls the driver's ndo_set_rx_mode_async() with rtnl_lock and
+ * netdev_lock_ops() held but without the address-list spinlock, so the
+ * driver is allowed to sleep.
+ *
+ * Each of dev->uc and dev->mc is snapshotted twice under
+ * netif_addr_lock_bh(): the *_snap copy is handed to the driver, and the
+ * *_ref copy is passed together with *_snap to
+ * __hw_addr_list_reconcile() once the driver has returned.
+ *
+ * Nothing is done if the device is not up and present, or if taking any
+ * of the snapshots fails.
+ */
+static void dev_rx_mode_work(struct work_struct *work)
+{
+	struct net_device *dev = container_of(work, struct net_device,
+					      rx_mode_work);
+	struct netdev_hw_addr_list uc_snap, mc_snap, uc_ref, mc_ref;
+	const struct net_device_ops *ops = dev->netdev_ops;
+	int err;
+
+	/* Initialize every list up front so the error path below can
+	 * flush all of them unconditionally.
+	 */
+	__hw_addr_init(&uc_snap);
+	__hw_addr_init(&mc_snap);
+	__hw_addr_init(&uc_ref);
+	__hw_addr_init(&mc_ref);
+
+	rtnl_lock();
+	netdev_lock_ops(dev);
+
+	if (!netif_up_and_present(dev))
+		goto out;
+
+	if (ops->ndo_set_rx_mode_async) {
+		/* Copy the live lists while holding the addr lock. */
+		netif_addr_lock_bh(dev);
+
+		err = __hw_addr_list_snapshot(&uc_snap, &dev->uc,
+					      dev->addr_len);
+		if (!err)
+			err = __hw_addr_list_snapshot(&uc_ref, &dev->uc,
+						      dev->addr_len);
+		if (!err)
+			err = __hw_addr_list_snapshot(&mc_snap, &dev->mc,
+						      dev->addr_len);
+		if (!err)
+			err = __hw_addr_list_snapshot(&mc_ref, &dev->mc,
+						      dev->addr_len);
+		netif_addr_unlock_bh(dev);
+
+		if (err) {
+			/* Flush all four lists, mc_ref included. Flushing
+			 * a list that was only initialized is harmless,
+			 * and this keeps the cleanup complete no matter
+			 * which snapshot failed or how far it got.
+			 */
+			__hw_addr_flush(&uc_snap);
+			__hw_addr_flush(&uc_ref);
+			__hw_addr_flush(&mc_snap);
+			__hw_addr_flush(&mc_ref);
+			goto out;
+		}
+
+		/* The addr lock is dropped here; the driver only sees the
+		 * private copies.
+		 */
+		ops->ndo_set_rx_mode_async(dev, &uc_snap, &mc_snap);
+
+		/* Reconcile the snapshots back to the real lists. */
+		netif_addr_lock_bh(dev);
+		__hw_addr_list_reconcile(&dev->uc, &uc_snap,
+					 &uc_ref, dev->addr_len);
+		__hw_addr_list_reconcile(&dev->mc, &mc_snap,
+					 &mc_ref, dev->addr_len);
+		netif_addr_unlock_bh(dev);
+	}
+
+out:
+	netdev_unlock_ops(dev);
+	rtnl_unlock();
+}
+
+/**
+ * __dev_set_rx_mode() - upload unicast and multicast address lists to device
+ * and configure RX filtering.
+ * @dev: device
+ *
+ * When the device doesn't support unicast filtering it is put in promiscuous
+ * mode while unicast addresses are present.
*/
void __dev_set_rx_mode(struct net_device *dev)
{
const struct net_device_ops *ops = dev->netdev_ops;
/* dev_open will call this function so the list will stay sane. */
- if (!(dev->flags&IFF_UP))
+ if (!netif_up_and_present(dev))
return;
- if (!netif_device_present(dev))
+ if (ops->ndo_set_rx_mode_async) {
+ queue_work(rx_mode_wq, &dev->rx_mode_work);
return;
+ }
if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
/* Unicast addresses changes may only happen under the rtnl,
@@ -11705,6 +11768,16 @@ void netdev_run_todo(void)
__rtnl_unlock();
+ /* Make sure all pending rx_mode work completes before returning.
+ *
+ * rx_mode_wq may be NULL during early boot:
+ * core_initcall(netlink_proto_init) vs subsys_initcall(net_dev_init).
+ *
+ * Check current_work() to avoid flushing from the wq.
+ */
+ if (rx_mode_wq && !current_work())
+ flush_workqueue(rx_mode_wq);
+
/* Wait for rcu callbacks to finish before next phase */
if (!list_empty(&list))
rcu_barrier();
@@ -12096,6 +12169,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
#endif
mutex_init(&dev->lock);
+ INIT_WORK(&dev->rx_mode_work, dev_rx_mode_work);
dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
setup(dev);
@@ -12200,6 +12274,8 @@ void free_netdev(struct net_device *dev)
kfree(rcu_dereference_protected(dev->ingress_queue, 1));
+ cancel_work_sync(&dev->rx_mode_work);
+
/* Flush device addresses */
dev_addr_flush(dev);
@@ -13293,6 +13369,10 @@ static int __init net_dev_init(void)
if (register_pernet_device(&default_device_ops))
goto out;
+ rx_mode_wq = alloc_ordered_workqueue("rx_mode_wq", 0);
+ if (!rx_mode_wq)
+ goto out;
+
open_softirq(NET_TX_SOFTIRQ, net_tx_action);
open_softirq(NET_RX_SOFTIRQ, net_rx_action);
--
2.53.0
next prev parent reply other threads:[~2026-03-13 14:51 UTC|newest]
Thread overview: 20+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-13 14:51 [PATCH net-next 00/11] net: sleepable ndo_set_rx_mode Stanislav Fomichev
2026-03-13 14:51 ` [PATCH net-next 01/11] net: add address list snapshot and reconciliation infrastructure Stanislav Fomichev
2026-03-13 14:51 ` Stanislav Fomichev [this message]
2026-03-13 14:51 ` [PATCH net-next 03/11] net: move promiscuity handling into dev_rx_mode_work Stanislav Fomichev
2026-03-13 14:51 ` [PATCH net-next 04/11] fbnic: convert to ndo_set_rx_mode_async Stanislav Fomichev
2026-03-13 14:51 ` [PATCH net-next 05/11] mlx5: " Stanislav Fomichev
2026-03-13 16:13 ` Cosmin Ratiu
2026-03-16 15:42 ` Stanislav Fomichev
2026-03-13 14:51 ` [PATCH net-next 06/11] bnxt: " Stanislav Fomichev
2026-03-13 18:36 ` Michael Chan
2026-03-16 15:50 ` Stanislav Fomichev
2026-03-16 17:33 ` Michael Chan
2026-03-13 14:51 ` [PATCH net-next 07/11] iavf: " Stanislav Fomichev
2026-03-13 14:51 ` [PATCH net-next 08/11] netdevsim: " Stanislav Fomichev
2026-03-13 14:51 ` [PATCH net-next 09/11] dummy: " Stanislav Fomichev
2026-03-13 14:51 ` [PATCH net-next 10/11] net: warn ops-locked drivers still using ndo_set_rx_mode Stanislav Fomichev
2026-03-13 14:51 ` [PATCH net-next 11/11] selftests: net: add team_bridge_macvlan rx_mode test Stanislav Fomichev
2026-03-13 19:38 ` [PATCH net-next 00/11] net: sleepable ndo_set_rx_mode Jakub Kicinski
2026-03-16 15:58 ` Stanislav Fomichev
2026-03-14 18:48 ` [syzbot ci] " syzbot ci
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260313145113.1424442-3-sdf@fomichev.me \
--to=sdf@fomichev.me \
--cc=davem@davemloft.net \
--cc=edumazet@google.com \
--cc=kuba@kernel.org \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox