* [RFC net-next PATCH 1/2] net: Add ndo_write_rx_config and helper structs and functions.
2025-10-20 13:48 [RFC net-next PATCH 0/2] net: Split ndo_set_rx_mode into snapshot and deferred write I Viswanath
@ 2025-10-20 13:48 ` I Viswanath
2025-10-20 13:48 ` [RFC net-next PATCH 2/2] net: ethernet: Implement ndo_write_rx_config callback for the 8139cp driver I Viswanath
2025-10-22 23:46 ` [RFC net-next PATCH 0/2] net: Split ndo_set_rx_mode into snapshot and deferred write Jacob Keller
2 siblings, 0 replies; 6+ messages in thread
From: I Viswanath @ 2025-10-20 13:48 UTC (permalink / raw)
To: davem, edumazet, kuba, pabeni, horms, sdf, kuniyu, ahmed.zaki,
aleksander.lobakin, andrew+netdev
Cc: netdev, linux-kernel, skhan, linux-kernel-mentees,
david.hunter.linux, khalid, I Viswanath
Add ndo_write_rx_config callback and following helper structs/functions:
rx_config_work - To schedule the callback and handle synchronization
read_snapshot/update_snapshot - Helper functions to read/update the
rx_config snapshot
set_and_schedule_rx_config - Helper function to call ndo_set_rx_mode
and schedule ndo_write_rx_config
execute_write_rx_config - Helper function that will be scheduled
by rx_work->config_write
Signed-off-by: I Viswanath <viswanathiyyappan@gmail.com>
---
I expect that shallow copy should be good enough as rx_config should consist exclusively
of integer types (primitives and arrays)
Would flushing the work queue be necessary for functions like *_init_hw()?
include/linux/netdevice.h | 38 ++++++++++++++++++++++++++++++-
net/core/dev.c | 48 +++++++++++++++++++++++++++++++++++----
2 files changed, 81 insertions(+), 5 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d1a687444b27..37a48e41a004 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1100,7 +1100,12 @@ struct netdev_net_notifier {
* void (*ndo_set_rx_mode)(struct net_device *dev);
* This function is called device changes address list filtering.
* If driver handles unicast address filtering, it should set
- * IFF_UNICAST_FLT in its priv_flags.
+ * IFF_UNICAST_FLT in its priv_flags. This sets up the snapshot of
+ * rx_config that will be written to the device.
+ *
+ * void (*ndo_write_rx_config)(struct net_device *dev);
+ * This function is scheduled immediately after ndo_set_rx_mode to
+ * write rx_config to the device.
*
* int (*ndo_set_mac_address)(struct net_device *dev, void *addr);
* This function is called when the Media Access Control address
@@ -1421,6 +1426,7 @@ struct net_device_ops {
void (*ndo_change_rx_flags)(struct net_device *dev,
int flags);
void (*ndo_set_rx_mode)(struct net_device *dev);
+ void (*ndo_write_rx_config)(struct net_device *dev);
int (*ndo_set_mac_address)(struct net_device *dev,
void *addr);
int (*ndo_validate_addr)(struct net_device *dev);
@@ -1767,6 +1773,12 @@ enum netdev_reg_state {
NETREG_DUMMY, /* dummy device for NAPI poll */
};
+struct rx_config_work {
+ struct work_struct config_write;
+ struct net_device *dev;
+ spinlock_t config_lock;
+};
+
/**
* struct net_device - The DEVICE structure.
*
@@ -2082,6 +2094,8 @@ enum netdev_reg_state {
* dev_list, one per address-family.
* @hwprov: Tracks which PTP performs hardware packet time stamping.
*
+ * @rx_work: helper struct to schedule rx config write to the hardware.
+ *
* FIXME: cleanup struct net_device such that network protocol info
* moves out.
*/
@@ -2559,6 +2573,8 @@ struct net_device {
struct hwtstamp_provider __rcu *hwprov;
+ struct rx_config_work *rx_work;
+
u8 priv[] ____cacheline_aligned
__counted_by(priv_len);
} ____cacheline_aligned;
@@ -2734,6 +2750,26 @@ void dev_net_set(struct net_device *dev, struct net *net)
write_pnet(&dev->nd_net, net);
}
+#define update_snapshot(config_ptr, type) \
+ do { \
+ typeof((config_ptr)) rx_config = ((type *)(dev->priv))->rx_config; \
+ unsigned long flags; \
+ spin_lock_irqsave(&((dev)->rx_work->config_lock), flags); \
+ *rx_config = *(config_ptr); \
+ spin_unlock_irqrestore(&((dev)->rx_work->config_lock), flags); \
+ } while (0)
+
+#define read_snapshot(config_ptr, type) \
+ do { \
+ typeof((config_ptr)) rx_config = ((type *)(dev->priv))->rx_config; \
+ unsigned long flags; \
+ spin_lock_irqsave(&((dev)->rx_work->config_lock), flags); \
+ *(config_ptr) = *rx_config; \
+ spin_unlock_irqrestore(&((dev)->rx_work->config_lock), flags); \
+ } while (0)
+
+void set_and_schedule_rx_config(struct net_device *dev, bool flush);
+
/**
* netdev_priv - access network device private data
* @dev: network device
diff --git a/net/core/dev.c b/net/core/dev.c
index 2acfa44927da..24eeaec5881b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -9524,6 +9524,37 @@ int netif_set_allmulti(struct net_device *dev, int inc, bool notify)
return 0;
}
+static void execute_write_rx_config(struct work_struct *param)
+{
+ struct rx_config_work *rx_work = container_of(param,
+ struct rx_config_work,
+ config_write);
+ struct net_device *dev = rx_work->dev;
+
+ // This path should not be hit outside the work item
+ WARN_ON(!dev->netdev_ops->ndo_write_rx_config);
+ dev->netdev_ops->ndo_write_rx_config(dev);
+}
+
+/*
+ * Sets up the rx_config snapshot and schedules write_rx_config. If
+ * it's necessary to wait for completion of write_rx_config, set
+ * flush to true.
+ */
+void set_and_schedule_rx_config(struct net_device *dev, bool flush)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (ops->ndo_set_rx_mode)
+ ops->ndo_set_rx_mode(dev);
+
+ if (ops->ndo_write_rx_config) {
+ schedule_work(&dev->rx_work->config_write);
+ if (flush)
+ flush_work(&dev->rx_work->config_write);
+ }
+}
+
/*
* Upload unicast and multicast address lists to device and
* configure RX filtering. When the device doesn't support unicast
@@ -9532,8 +9563,6 @@ int netif_set_allmulti(struct net_device *dev, int inc, bool notify)
*/
void __dev_set_rx_mode(struct net_device *dev)
{
- const struct net_device_ops *ops = dev->netdev_ops;
-
/* dev_open will call this function so the list will stay sane. */
if (!(dev->flags&IFF_UP))
return;
@@ -9554,8 +9583,7 @@ void __dev_set_rx_mode(struct net_device *dev)
}
}
- if (ops->ndo_set_rx_mode)
- ops->ndo_set_rx_mode(dev);
+ set_and_schedule_rx_config(dev, false);
}
void dev_set_rx_mode(struct net_device *dev)
@@ -11946,6 +11974,15 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
INIT_LIST_HEAD(&dev->ptype_all);
INIT_LIST_HEAD(&dev->ptype_specific);
INIT_LIST_HEAD(&dev->net_notifier_list);
+
+ dev->rx_work = kmalloc(sizeof(*dev->rx_work), GFP_KERNEL);
+ if (!dev->rx_work)
+ goto free_all;
+
+ dev->rx_work->dev = dev;
+ spin_lock_init(&dev->rx_work->config_lock);
+ INIT_WORK(&dev->rx_work->config_write, execute_write_rx_config);
+
#ifdef CONFIG_NET_SCHED
hash_init(dev->qdisc_hash);
#endif
@@ -12083,6 +12120,9 @@ void free_netdev(struct net_device *dev)
return;
}
+ cancel_work_sync(&dev->rx_work->config_write);
+ kfree(dev->rx_work);
+
BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
WRITE_ONCE(dev->reg_state, NETREG_RELEASED);
--
2.47.3
^ permalink raw reply related [flat|nested] 6+ messages in thread* [RFC net-next PATCH 2/2] net: ethernet: Implement ndo_write_rx_config callback for the 8139cp driver
2025-10-20 13:48 [RFC net-next PATCH 0/2] net: Split ndo_set_rx_mode into snapshot and deferred write I Viswanath
2025-10-20 13:48 ` [RFC net-next PATCH 1/2] net: Add ndo_write_rx_config and helper structs and functions I Viswanath
@ 2025-10-20 13:48 ` I Viswanath
2025-10-22 23:46 ` [RFC net-next PATCH 0/2] net: Split ndo_set_rx_mode into snapshot and deferred write Jacob Keller
2 siblings, 0 replies; 6+ messages in thread
From: I Viswanath @ 2025-10-20 13:48 UTC (permalink / raw)
To: davem, edumazet, kuba, pabeni, horms, sdf, kuniyu, ahmed.zaki,
aleksander.lobakin, andrew+netdev
Cc: netdev, linux-kernel, skhan, linux-kernel-mentees,
david.hunter.linux, khalid, I Viswanath
Implement ndo_write_rx_config for the 8139cp driver
Signed-off-by: I Viswanath <viswanathiyyappan@gmail.com>
---
I modified and set ndo_set_rx_mode to be __cp_set_rx_mode as using
cp_set_rx_mode as ndo_set_rx_mode callback would have led to a deadlock
in cp_init_hw()
drivers/net/ethernet/realtek/8139cp.c | 67 ++++++++++++++++++---------
1 file changed, 46 insertions(+), 21 deletions(-)
diff --git a/drivers/net/ethernet/realtek/8139cp.c b/drivers/net/ethernet/realtek/8139cp.c
index 5652da8a178c..ff6195c7f3b6 100644
--- a/drivers/net/ethernet/realtek/8139cp.c
+++ b/drivers/net/ethernet/realtek/8139cp.c
@@ -319,6 +319,11 @@ struct cp_extra_stats {
unsigned long rx_frags;
};
+struct cp_rx_config {
+ int rx_mode;
+ u32 mc_filter[2]; /* Multicast hash filter */
+};
+
struct cp_private {
void __iomem *regs;
struct net_device *dev;
@@ -328,7 +333,7 @@ struct cp_private {
struct napi_struct napi;
struct pci_dev *pdev;
- u32 rx_config;
+ struct cp_rx_config *rx_config;
u16 cpcmd;
struct cp_extra_stats cp_stats;
@@ -882,45 +887,53 @@ static netdev_tx_t cp_start_xmit (struct sk_buff *skb,
goto out_unlock;
}
+static void cp_write_rx_config(struct net_device *dev)
+{
+ struct cp_private *cp = netdev_priv(dev);
+ struct cp_rx_config snapshot;
+
+ read_snapshot((&snapshot), struct cp_private);
+
+ /* We can safely update without stopping the chip. */
+ cpw32_f(RxConfig, snapshot.rx_mode);
+
+ cpw32_f(MAR0 + 0, snapshot.mc_filter[0]);
+ cpw32_f(MAR0 + 4, snapshot.mc_filter[1]);
+}
+
/* Set or clear the multicast filter for this adaptor.
This routine is not state sensitive and need not be SMP locked. */
static void __cp_set_rx_mode (struct net_device *dev)
{
- struct cp_private *cp = netdev_priv(dev);
- u32 mc_filter[2]; /* Multicast hash filter */
- int rx_mode;
+ struct cp_rx_config new_config;
/* Note: do not reorder, GCC is clever about common statements. */
if (dev->flags & IFF_PROMISC) {
/* Unconditionally log net taps. */
- rx_mode =
+ new_config.rx_mode =
AcceptBroadcast | AcceptMulticast | AcceptMyPhys |
AcceptAllPhys;
- mc_filter[1] = mc_filter[0] = 0xffffffff;
+ new_config.mc_filter[1] = new_config.mc_filter[0] = 0xffffffff;
} else if ((netdev_mc_count(dev) > multicast_filter_limit) ||
(dev->flags & IFF_ALLMULTI)) {
/* Too many to filter perfectly -- accept all multicasts. */
- rx_mode = AcceptBroadcast | AcceptMulticast | AcceptMyPhys;
- mc_filter[1] = mc_filter[0] = 0xffffffff;
+ new_config.rx_mode = AcceptBroadcast | AcceptMulticast | AcceptMyPhys;
+ new_config.mc_filter[1] = new_config.mc_filter[0] = 0xffffffff;
} else {
struct netdev_hw_addr *ha;
- rx_mode = AcceptBroadcast | AcceptMyPhys;
- mc_filter[1] = mc_filter[0] = 0;
+ new_config.rx_mode = AcceptBroadcast | AcceptMyPhys;
+ new_config.mc_filter[1] = new_config.mc_filter[0] = 0;
netdev_for_each_mc_addr(ha, dev) {
int bit_nr = ether_crc(ETH_ALEN, ha->addr) >> 26;
- mc_filter[bit_nr >> 5] |= 1 << (bit_nr & 31);
- rx_mode |= AcceptMulticast;
+ new_config.mc_filter[bit_nr >> 5] |= 1 << (bit_nr & 31);
+ new_config.rx_mode |= AcceptMulticast;
}
}
- /* We can safely update without stopping the chip. */
- cp->rx_config = cp_rx_config | rx_mode;
- cpw32_f(RxConfig, cp->rx_config);
-
- cpw32_f (MAR0 + 0, mc_filter[0]);
- cpw32_f (MAR0 + 4, mc_filter[1]);
+ new_config.rx_mode = cp_rx_config | new_config.rx_mode;
+ update_snapshot(&new_config, struct cp_private);
}
static void cp_set_rx_mode (struct net_device *dev)
@@ -1040,7 +1053,7 @@ static void cp_init_hw (struct cp_private *cp)
cp_start_hw(cp);
cpw8(TxThresh, 0x06); /* XXX convert magic num to a constant */
- __cp_set_rx_mode(dev);
+ set_and_schedule_rx_config(dev, true);
cpw32_f (TxConfig, IFG | (TX_DMA_BURST << TxDMAShift));
cpw8(Config1, cpr8(Config1) | DriverLoaded | PMEnable);
@@ -1188,6 +1201,12 @@ static int cp_open (struct net_device *dev)
if (rc)
return rc;
+ cp->rx_config = kmalloc(sizeof(*cp->rx_config), GFP_KERNEL);
+ if (!cp->rx_config) {
+ rc = -ENOMEM;
+ goto err_out_rx_config;
+ }
+
napi_enable(&cp->napi);
cp_init_hw(cp);
@@ -1207,6 +1226,9 @@ static int cp_open (struct net_device *dev)
err_out_hw:
napi_disable(&cp->napi);
cp_stop_hw(cp);
+ kfree(cp->rx_config);
+
+err_out_rx_config:
cp_free_rings(cp);
return rc;
}
@@ -1227,6 +1249,8 @@ static int cp_close (struct net_device *dev)
cp_stop_hw(cp);
+ kfree(cp->rx_config);
+
spin_unlock_irqrestore(&cp->lock, flags);
free_irq(cp->pdev->irq, dev);
@@ -1262,7 +1286,7 @@ static void cp_tx_timeout(struct net_device *dev, unsigned int txqueue)
cp_clean_rings(cp);
cp_init_rings(cp);
cp_start_hw(cp);
- __cp_set_rx_mode(dev);
+ set_and_schedule_rx_config(dev, false);
cpw16_f(IntrMask, cp_norx_intr_mask);
netif_wake_queue(dev);
@@ -1869,7 +1893,8 @@ static const struct net_device_ops cp_netdev_ops = {
.ndo_stop = cp_close,
.ndo_validate_addr = eth_validate_addr,
.ndo_set_mac_address = cp_set_mac_address,
- .ndo_set_rx_mode = cp_set_rx_mode,
+ .ndo_set_rx_mode = __cp_set_rx_mode,
+ .ndo_write_rx_config = cp_write_rx_config,
.ndo_get_stats = cp_get_stats,
.ndo_eth_ioctl = cp_ioctl,
.ndo_start_xmit = cp_start_xmit,
--
2.47.3
^ permalink raw reply related [flat|nested] 6+ messages in thread* Re: [RFC net-next PATCH 0/2] net: Split ndo_set_rx_mode into snapshot and deferred write
2025-10-20 13:48 [RFC net-next PATCH 0/2] net: Split ndo_set_rx_mode into snapshot and deferred write I Viswanath
2025-10-20 13:48 ` [RFC net-next PATCH 1/2] net: Add ndo_write_rx_config and helper structs and functions I Viswanath
2025-10-20 13:48 ` [RFC net-next PATCH 2/2] net: ethernet: Implement ndo_write_rx_config callback for the 8139cp driver I Viswanath
@ 2025-10-22 23:46 ` Jacob Keller
2025-10-24 5:31 ` I Viswanath
2 siblings, 1 reply; 6+ messages in thread
From: Jacob Keller @ 2025-10-22 23:46 UTC (permalink / raw)
To: I Viswanath, davem, edumazet, kuba, pabeni, horms, sdf, kuniyu,
ahmed.zaki, aleksander.lobakin, andrew+netdev
Cc: netdev, linux-kernel, skhan, linux-kernel-mentees,
david.hunter.linux, khalid
[-- Attachment #1.1: Type: text/plain, Size: 601 bytes --]
On 10/20/2025 6:48 AM, I Viswanath wrote:
> 3. All calls to modify rx_mode should pass through the set_rx_mode +
> schedule write_rx_config execution flow.
>
> 1 and 2 are guaranteed because of the properties of work queues
>
> Drivers need to ensure 3
>
Is there any mechanism to make this guarantee either implemented or at
least verified by the core? If not that, what about some sort of way to
lint driver code and make sure its correct?
In my experience, requiring drivers to do something right typically
results in a long tail of correcting drivers into the future..
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 236 bytes --]
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [RFC net-next PATCH 0/2] net: Split ndo_set_rx_mode into snapshot and deferred write
2025-10-22 23:46 ` [RFC net-next PATCH 0/2] net: Split ndo_set_rx_mode into snapshot and deferred write Jacob Keller
@ 2025-10-24 5:31 ` I Viswanath
2025-10-24 20:20 ` Jacob Keller
0 siblings, 1 reply; 6+ messages in thread
From: I Viswanath @ 2025-10-24 5:31 UTC (permalink / raw)
To: Jacob Keller
Cc: davem, edumazet, kuba, pabeni, horms, sdf, kuniyu, ahmed.zaki,
aleksander.lobakin, andrew+netdev, netdev, linux-kernel, skhan,
linux-kernel-mentees, david.hunter.linux, khalid
On Thu, 23 Oct 2025 at 05:16, Jacob Keller <jacob.e.keller@intel.com> wrote:
>
> Is there any mechanism to make this guarantee either implemented or at
> least verified by the core? If not that, what about some sort of way to
> lint driver code and make sure its correct?
From my observations, The sane drivers modify rx_config related
registers either through the set_rx_mode function or the unlocked
version (prefixed with __)
I am not sure how to convert this to a validation of that kind.
Basically the end result should be that warnings are generated when
those functions are called
normally but not when they are called through ops->set_rx_mode.
Coccinelle might be able to do
something like this.
Related to this, I don't think a sed would be sufficient as there
might be (in theory) cases where
the function has to do a "synchronous" rx write (flush the work queue)
for correctness
but it should be good enough for most cases.
I am also not sure what is to be done if the scheduled function just
never executes.
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [RFC net-next PATCH 0/2] net: Split ndo_set_rx_mode into snapshot and deferred write
2025-10-24 5:31 ` I Viswanath
@ 2025-10-24 20:20 ` Jacob Keller
0 siblings, 0 replies; 6+ messages in thread
From: Jacob Keller @ 2025-10-24 20:20 UTC (permalink / raw)
To: I Viswanath
Cc: davem, edumazet, kuba, pabeni, horms, sdf, kuniyu, ahmed.zaki,
aleksander.lobakin, andrew+netdev, netdev, linux-kernel, skhan,
linux-kernel-mentees, david.hunter.linux, khalid
[-- Attachment #1.1: Type: text/plain, Size: 1672 bytes --]
On 10/23/2025 10:31 PM, I Viswanath wrote:
> On Thu, 23 Oct 2025 at 05:16, Jacob Keller <jacob.e.keller@intel.com> wrote:
>>
>> Is there any mechanism to make this guarantee either implemented or at
>> least verified by the core? If not that, what about some sort of way to
>> lint driver code and make sure its correct?
>
> From my observations, The sane drivers modify rx_config related
> registers either through the set_rx_mode function or the unlocked
> version (prefixed with __)
> I am not sure how to convert this to a validation of that kind.
>
Right.
> Basically the end result should be that warnings are generated when
> those functions are called
> normally but not when they are called through ops->set_rx_mode.
> Coccinelle might be able to do
> something like this.
>
> Related to this, I don't think a sed would be sufficient as there
> might be (in theory) cases where
> the function has to do a "synchronous" rx write (flush the work queue)
> for correctness
> but it should be good enough for most cases.
>
> I am also not sure what is to be done if the scheduled function just
> never executes.
I'm not sure what the best mechanism is for helping make sure drivers
get it right. I just know that past experience shows that without some
sort of check, we end up burdening reviewers with another thing they
have to double check, and inevitably some drivers will screw it up and
we'll end up with a long tail of fixes.
If we can't come up with something, I don't think it would prevent
moving this forward. I just want to spend a little thought to make sure
nothing obvious was missed.
Thanks,
Jake
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 236 bytes --]
^ permalink raw reply [flat|nested] 6+ messages in thread