* [RFC net-next v4 1/9] net: napi: Make napi_defer_hard_irqs per-NAPI
2024-10-01 23:52 [RFC net-next v4 0/9] Add support for per-NAPI config via netlink Joe Damato
@ 2024-10-01 23:52 ` Joe Damato
2024-10-08 22:08 ` Jakub Kicinski
2024-10-01 23:52 ` [RFC net-next v4 3/9] net: napi: Make gro_flush_timeout per-NAPI Joe Damato
` (2 subsequent siblings)
3 siblings, 1 reply; 14+ messages in thread
From: Joe Damato @ 2024-10-01 23:52 UTC (permalink / raw)
To: netdev
Cc: mkarsten, skhawaja, sdf, bjorn, amritha.nambiar,
sridhar.samudrala, willemdebruijn.kernel, Joe Damato,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Jonathan Corbet, Jiri Pirko, Sebastian Andrzej Siewior,
Lorenzo Bianconi, Kory Maincent, Johannes Berg, Breno Leitao,
Alexander Lobakin, open list:DOCUMENTATION, open list
Add defer_hard_irqs to napi_struct in preparation for per-NAPI
settings.
The existing sysfs parameter is respected; writes to sysfs will write to
all NAPI structs for the device and to the net_device napi_defer_hard_irqs
field. Reads from sysfs show the net_device field.
The ability to set defer_hard_irqs on specific NAPI instances will be
added in a later commit, via netdev-genl.
Signed-off-by: Joe Damato <jdamato@fastly.com>
---
.../networking/net_cachelines/net_device.rst | 2 +-
include/linux/netdevice.h | 3 +-
net/core/dev.c | 10 +++---
net/core/dev.h | 36 +++++++++++++++++++
net/core/net-sysfs.c | 2 +-
5 files changed, 45 insertions(+), 8 deletions(-)
diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst
index 22b07c814f4a..eeeb7c925ec5 100644
--- a/Documentation/networking/net_cachelines/net_device.rst
+++ b/Documentation/networking/net_cachelines/net_device.rst
@@ -99,7 +99,6 @@ unsigned_int num_rx_queues
unsigned_int real_num_rx_queues - read_mostly get_rps_cpu
struct_bpf_prog* xdp_prog - read_mostly netif_elide_gro()
unsigned_long gro_flush_timeout - read_mostly napi_complete_done
-u32 napi_defer_hard_irqs - read_mostly napi_complete_done
unsigned_int gro_max_size - read_mostly skb_gro_receive
unsigned_int gro_ipv4_max_size - read_mostly skb_gro_receive
rx_handler_func_t* rx_handler read_mostly - __netif_receive_skb_core
@@ -183,3 +182,4 @@ struct_devlink_port* devlink_port
struct_dpll_pin* dpll_pin
struct hlist_head page_pools
struct dim_irq_moder* irq_moder
+u32 napi_defer_hard_irqs
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e87b5e488325..55764efc5c93 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -377,6 +377,7 @@ struct napi_struct {
struct list_head dev_list;
struct hlist_node napi_hash_node;
int irq;
+ u32 defer_hard_irqs;
};
enum {
@@ -2075,7 +2076,6 @@ struct net_device {
unsigned int real_num_rx_queues;
struct netdev_rx_queue *_rx;
unsigned long gro_flush_timeout;
- u32 napi_defer_hard_irqs;
unsigned int gro_max_size;
unsigned int gro_ipv4_max_size;
rx_handler_func_t __rcu *rx_handler;
@@ -2398,6 +2398,7 @@ struct net_device {
/** @irq_moder: dim parameters used if IS_ENABLED(CONFIG_DIMLIB). */
struct dim_irq_moder *irq_moder;
+ u32 napi_defer_hard_irqs;
u8 priv[] ____cacheline_aligned
__counted_by(priv_len);
diff --git a/net/core/dev.c b/net/core/dev.c
index cd479f5f22f6..748739958d2a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6227,7 +6227,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
if (work_done) {
if (n->gro_bitmask)
timeout = READ_ONCE(n->dev->gro_flush_timeout);
- n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
+ n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n);
}
if (n->defer_hard_irqs_count > 0) {
n->defer_hard_irqs_count--;
@@ -6365,7 +6365,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
if (flags & NAPI_F_PREFER_BUSY_POLL) {
- napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
+ napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi);
timeout = READ_ONCE(napi->dev->gro_flush_timeout);
if (napi->defer_hard_irqs_count && timeout) {
hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
@@ -6647,6 +6647,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
INIT_HLIST_NODE(&napi->napi_hash_node);
hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
napi->timer.function = napi_watchdog;
+ napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs));
init_gro_hash(napi);
napi->skb = NULL;
INIT_LIST_HEAD(&napi->rx_list);
@@ -11053,7 +11054,7 @@ void netdev_sw_irq_coalesce_default_on(struct net_device *dev)
if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
dev->gro_flush_timeout = 20000;
- dev->napi_defer_hard_irqs = 1;
+ netdev_set_defer_hard_irqs(dev, 1);
}
}
EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on);
@@ -11991,7 +11992,6 @@ static void __init net_dev_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_flush_timeout);
- CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, napi_defer_hard_irqs);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler);
@@ -12003,7 +12003,7 @@ static void __init net_dev_struct_check(void)
#ifdef CONFIG_NET_XGRESS
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress);
#endif
- CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 104);
+ CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 100);
}
/*
diff --git a/net/core/dev.h b/net/core/dev.h
index 5654325c5b71..b3792219879b 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -138,6 +138,42 @@ static inline void netif_set_gro_ipv4_max_size(struct net_device *dev,
WRITE_ONCE(dev->gro_ipv4_max_size, size);
}
+/**
+ * napi_get_defer_hard_irqs - get the NAPI's defer_hard_irqs
+ * @n: napi struct to get the defer_hard_irqs field from
+ *
+ * Return: the per-NAPI value of the defer_hard_irqs field.
+ */
+static inline u32 napi_get_defer_hard_irqs(const struct napi_struct *n)
+{
+ return READ_ONCE(n->defer_hard_irqs);
+}
+
+/**
+ * napi_set_defer_hard_irqs - set the defer_hard_irqs for a napi
+ * @n: napi_struct to set the defer_hard_irqs field
+ * @defer: the value the field should be set to
+ */
+static inline void napi_set_defer_hard_irqs(struct napi_struct *n, u32 defer)
+{
+ WRITE_ONCE(n->defer_hard_irqs, defer);
+}
+
+/**
+ * netdev_set_defer_hard_irqs - set defer_hard_irqs for all NAPIs of a netdev
+ * @netdev: the net_device for which all NAPIs will have defer_hard_irqs set
+ * @defer: the defer_hard_irqs value to set
+ */
+static inline void netdev_set_defer_hard_irqs(struct net_device *netdev,
+ u32 defer)
+{
+ struct napi_struct *napi;
+
+ WRITE_ONCE(netdev->napi_defer_hard_irqs, defer);
+ list_for_each_entry(napi, &netdev->napi_list, dev_list)
+ napi_set_defer_hard_irqs(napi, defer);
+}
+
int rps_cpumask_housekeeping(struct cpumask *mask);
#if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 05cf5347f25e..25125f356a15 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -429,7 +429,7 @@ static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val
if (val > S32_MAX)
return -ERANGE;
- WRITE_ONCE(dev->napi_defer_hard_irqs, val);
+ netdev_set_defer_hard_irqs(dev, (u32)val);
return 0;
}
--
2.25.1
* Re: [RFC net-next v4 1/9] net: napi: Make napi_defer_hard_irqs per-NAPI
2024-10-01 23:52 ` [RFC net-next v4 1/9] net: napi: Make napi_defer_hard_irqs per-NAPI Joe Damato
@ 2024-10-08 22:08 ` Jakub Kicinski
0 siblings, 0 replies; 14+ messages in thread
From: Jakub Kicinski @ 2024-10-08 22:08 UTC (permalink / raw)
To: Joe Damato
Cc: netdev, mkarsten, skhawaja, sdf, bjorn, amritha.nambiar,
sridhar.samudrala, willemdebruijn.kernel, David S. Miller,
Eric Dumazet, Paolo Abeni, Jonathan Corbet, Jiri Pirko,
Sebastian Andrzej Siewior, Lorenzo Bianconi, Kory Maincent,
Johannes Berg, Breno Leitao, Alexander Lobakin,
open list:DOCUMENTATION, open list
On Tue, 1 Oct 2024 23:52:32 +0000 Joe Damato wrote:
> @@ -377,6 +377,7 @@ struct napi_struct {
> struct list_head dev_list;
> struct hlist_node napi_hash_node;
> int irq;
> + u32 defer_hard_irqs;
This will be read on every unmasking; let's put it above
the "/* control-path-only fields follow */" comment.
* [RFC net-next v4 3/9] net: napi: Make gro_flush_timeout per-NAPI
2024-10-01 23:52 [RFC net-next v4 0/9] Add support for per-NAPI config via netlink Joe Damato
2024-10-01 23:52 ` [RFC net-next v4 1/9] net: napi: Make napi_defer_hard_irqs per-NAPI Joe Damato
@ 2024-10-01 23:52 ` Joe Damato
2024-10-08 18:22 ` Joe Damato
2024-10-08 22:10 ` Jakub Kicinski
2024-10-01 23:52 ` [RFC net-next v4 5/9] net: napi: Add napi_config Joe Damato
2024-10-03 23:29 ` [RFC net-next v4 0/9] Add support for per-NAPI config via netlink Stanislav Fomichev
3 siblings, 2 replies; 14+ messages in thread
From: Joe Damato @ 2024-10-01 23:52 UTC (permalink / raw)
To: netdev
Cc: mkarsten, skhawaja, sdf, bjorn, amritha.nambiar,
sridhar.samudrala, willemdebruijn.kernel, Joe Damato,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Jonathan Corbet, Tony Nguyen, Przemek Kitszel, Jiri Pirko,
Sebastian Andrzej Siewior, Lorenzo Bianconi, David Ahern,
Kory Maincent, Johannes Berg, Breno Leitao, Alexander Lobakin,
open list:DOCUMENTATION, open list,
moderated list:INTEL ETHERNET DRIVERS
Allow per-NAPI gro_flush_timeout setting.
The existing sysfs parameter is respected; writes to sysfs will write to
all NAPI structs for the device and the net_device gro_flush_timeout
field. Reads from sysfs will read from the net_device field.
The ability to set gro_flush_timeout on specific NAPI instances will be
added in a later commit, via netdev-genl.
Note that idpf embeds napi_struct in its internals and has established
a series of asserts that involve the size of the napi struct. Since
this change increases the napi_struct size from 400 to
416 (according to pahole on my system), I've increased the assertion in
idpf by 16 bytes. No attention whatsoever was paid to the cacheline
placement of idpf internals as a result of this change.
Signed-off-by: Joe Damato <jdamato@fastly.com>
---
.../networking/net_cachelines/net_device.rst | 2 +-
drivers/net/ethernet/intel/idpf/idpf_txrx.h | 2 +-
include/linux/netdevice.h | 3 +-
net/core/dev.c | 12 +++---
net/core/dev.h | 40 +++++++++++++++++++
net/core/net-sysfs.c | 2 +-
6 files changed, 51 insertions(+), 10 deletions(-)
diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst
index eeeb7c925ec5..3d02ae79c850 100644
--- a/Documentation/networking/net_cachelines/net_device.rst
+++ b/Documentation/networking/net_cachelines/net_device.rst
@@ -98,7 +98,6 @@ struct_netdev_queue* _rx read_mostly
unsigned_int num_rx_queues
unsigned_int real_num_rx_queues - read_mostly get_rps_cpu
struct_bpf_prog* xdp_prog - read_mostly netif_elide_gro()
-unsigned_long gro_flush_timeout - read_mostly napi_complete_done
unsigned_int gro_max_size - read_mostly skb_gro_receive
unsigned_int gro_ipv4_max_size - read_mostly skb_gro_receive
rx_handler_func_t* rx_handler read_mostly - __netif_receive_skb_core
@@ -182,4 +181,5 @@ struct_devlink_port* devlink_port
struct_dpll_pin* dpll_pin
struct hlist_head page_pools
struct dim_irq_moder* irq_moder
+unsigned_long gro_flush_timeout
u32 napi_defer_hard_irqs
diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h
index f0537826f840..fcdf73486d46 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h
+++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h
@@ -438,7 +438,7 @@ struct idpf_q_vector {
__cacheline_group_end_aligned(cold);
};
libeth_cacheline_set_assert(struct idpf_q_vector, 112,
- 424 + 2 * sizeof(struct dim),
+ 440 + 2 * sizeof(struct dim),
8 + sizeof(cpumask_var_t));
struct idpf_rx_queue_stats {
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 55764efc5c93..33897edd16c8 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -377,6 +377,7 @@ struct napi_struct {
struct list_head dev_list;
struct hlist_node napi_hash_node;
int irq;
+ unsigned long gro_flush_timeout;
u32 defer_hard_irqs;
};
@@ -2075,7 +2076,6 @@ struct net_device {
int ifindex;
unsigned int real_num_rx_queues;
struct netdev_rx_queue *_rx;
- unsigned long gro_flush_timeout;
unsigned int gro_max_size;
unsigned int gro_ipv4_max_size;
rx_handler_func_t __rcu *rx_handler;
@@ -2398,6 +2398,7 @@ struct net_device {
/** @irq_moder: dim parameters used if IS_ENABLED(CONFIG_DIMLIB). */
struct dim_irq_moder *irq_moder;
+ unsigned long gro_flush_timeout;
u32 napi_defer_hard_irqs;
u8 priv[] ____cacheline_aligned
diff --git a/net/core/dev.c b/net/core/dev.c
index 748739958d2a..056ed44f766f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6226,12 +6226,12 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
if (work_done) {
if (n->gro_bitmask)
- timeout = READ_ONCE(n->dev->gro_flush_timeout);
+ timeout = napi_get_gro_flush_timeout(n);
n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n);
}
if (n->defer_hard_irqs_count > 0) {
n->defer_hard_irqs_count--;
- timeout = READ_ONCE(n->dev->gro_flush_timeout);
+ timeout = napi_get_gro_flush_timeout(n);
if (timeout)
ret = false;
}
@@ -6366,7 +6366,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
if (flags & NAPI_F_PREFER_BUSY_POLL) {
napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi);
- timeout = READ_ONCE(napi->dev->gro_flush_timeout);
+ timeout = napi_get_gro_flush_timeout(napi);
if (napi->defer_hard_irqs_count && timeout) {
hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
skip_schedule = true;
@@ -6648,6 +6648,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
napi->timer.function = napi_watchdog;
napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs));
+ napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout));
init_gro_hash(napi);
napi->skb = NULL;
INIT_LIST_HEAD(&napi->rx_list);
@@ -11053,7 +11054,7 @@ void netdev_sw_irq_coalesce_default_on(struct net_device *dev)
WARN_ON(dev->reg_state == NETREG_REGISTERED);
if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
- dev->gro_flush_timeout = 20000;
+ netdev_set_gro_flush_timeout(dev, 20000);
netdev_set_defer_hard_irqs(dev, 1);
}
}
@@ -11991,7 +11992,6 @@ static void __init net_dev_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx);
- CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_flush_timeout);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler);
@@ -12003,7 +12003,7 @@ static void __init net_dev_struct_check(void)
#ifdef CONFIG_NET_XGRESS
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress);
#endif
- CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 100);
+ CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 92);
}
/*
diff --git a/net/core/dev.h b/net/core/dev.h
index b3792219879b..26e598aa56c3 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -174,6 +174,46 @@ static inline void netdev_set_defer_hard_irqs(struct net_device *netdev,
napi_set_defer_hard_irqs(napi, defer);
}
+/**
+ * napi_get_gro_flush_timeout - get the gro_flush_timeout
+ * @n: napi struct to get the gro_flush_timeout from
+ *
+ * Return: the per-NAPI value of the gro_flush_timeout field.
+ */
+static inline unsigned long
+napi_get_gro_flush_timeout(const struct napi_struct *n)
+{
+ return READ_ONCE(n->gro_flush_timeout);
+}
+
+/**
+ * napi_set_gro_flush_timeout - set the gro_flush_timeout for a napi
+ * @n: napi struct to set the gro_flush_timeout
+ * @timeout: timeout value to set
+ *
+ * napi_set_gro_flush_timeout sets the per-NAPI gro_flush_timeout
+ */
+static inline void napi_set_gro_flush_timeout(struct napi_struct *n,
+ unsigned long timeout)
+{
+ WRITE_ONCE(n->gro_flush_timeout, timeout);
+}
+
+/**
+ * netdev_set_gro_flush_timeout - set gro_flush_timeout of a netdev's NAPIs
+ * @netdev: the net_device for which all NAPIs will have gro_flush_timeout set
+ * @timeout: the timeout value to set
+ */
+static inline void netdev_set_gro_flush_timeout(struct net_device *netdev,
+ unsigned long timeout)
+{
+ struct napi_struct *napi;
+
+ WRITE_ONCE(netdev->gro_flush_timeout, timeout);
+ list_for_each_entry(napi, &netdev->napi_list, dev_list)
+ napi_set_gro_flush_timeout(napi, timeout);
+}
+
int rps_cpumask_housekeeping(struct cpumask *mask);
#if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 25125f356a15..2d9afc6e2161 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -409,7 +409,7 @@ NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec);
static int change_gro_flush_timeout(struct net_device *dev, unsigned long val)
{
- WRITE_ONCE(dev->gro_flush_timeout, val);
+ netdev_set_gro_flush_timeout(dev, val);
return 0;
}
--
2.25.1
* Re: [RFC net-next v4 3/9] net: napi: Make gro_flush_timeout per-NAPI
2024-10-01 23:52 ` [RFC net-next v4 3/9] net: napi: Make gro_flush_timeout per-NAPI Joe Damato
@ 2024-10-08 18:22 ` Joe Damato
2024-10-08 22:10 ` Jakub Kicinski
1 sibling, 0 replies; 14+ messages in thread
From: Joe Damato @ 2024-10-08 18:22 UTC (permalink / raw)
To: netdev
Cc: mkarsten, skhawaja, sdf, bjorn, amritha.nambiar,
sridhar.samudrala, willemdebruijn.kernel, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Jonathan Corbet,
Tony Nguyen, Przemek Kitszel, Jiri Pirko,
Sebastian Andrzej Siewior, Lorenzo Bianconi, David Ahern,
Kory Maincent, Johannes Berg, Breno Leitao, Alexander Lobakin,
open list:DOCUMENTATION, open list,
moderated list:INTEL ETHERNET DRIVERS
On Tue, Oct 01, 2024 at 11:52:34PM +0000, Joe Damato wrote:
[...]
> Note that idpf embeds napi_struct in its internals and has established
> a series of asserts that involve the size of the napi struct. Since
> this change increases the napi_struct size from 400 to
> 416 (according to pahole on my system), I've increased the assertion in
> idpf by 16 bytes. No attention whatsoever was paid to the cacheline
> placement of idpf internals as a result of this change.
>
> Signed-off-by: Joe Damato <jdamato@fastly.com>
> ---
> .../networking/net_cachelines/net_device.rst | 2 +-
> drivers/net/ethernet/intel/idpf/idpf_txrx.h | 2 +-
> include/linux/netdevice.h | 3 +-
> net/core/dev.c | 12 +++---
> net/core/dev.h | 40 +++++++++++++++++++
> net/core/net-sysfs.c | 2 +-
> 6 files changed, 51 insertions(+), 10 deletions(-)
[...]
> diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h
> index f0537826f840..fcdf73486d46 100644
> --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h
> +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h
> @@ -438,7 +438,7 @@ struct idpf_q_vector {
> __cacheline_group_end_aligned(cold);
> };
> libeth_cacheline_set_assert(struct idpf_q_vector, 112,
> - 424 + 2 * sizeof(struct dim),
> + 440 + 2 * sizeof(struct dim),
> 8 + sizeof(cpumask_var_t));
>
> struct idpf_rx_queue_stats {
Now that idpf was fixed separately [1], this will be removed in v5.
[1]: https://lore.kernel.org/netdev/20241004105407.73585-1-jdamato@fastly.com/
* Re: [RFC net-next v4 3/9] net: napi: Make gro_flush_timeout per-NAPI
2024-10-01 23:52 ` [RFC net-next v4 3/9] net: napi: Make gro_flush_timeout per-NAPI Joe Damato
2024-10-08 18:22 ` Joe Damato
@ 2024-10-08 22:10 ` Jakub Kicinski
1 sibling, 0 replies; 14+ messages in thread
From: Jakub Kicinski @ 2024-10-08 22:10 UTC (permalink / raw)
To: Joe Damato
Cc: netdev, mkarsten, skhawaja, sdf, bjorn, amritha.nambiar,
sridhar.samudrala, willemdebruijn.kernel, David S. Miller,
Eric Dumazet, Paolo Abeni, Jonathan Corbet, Tony Nguyen,
Przemek Kitszel, Jiri Pirko, Sebastian Andrzej Siewior,
Lorenzo Bianconi, David Ahern, Kory Maincent, Johannes Berg,
Breno Leitao, Alexander Lobakin, open list:DOCUMENTATION,
open list, moderated list:INTEL ETHERNET DRIVERS
On Tue, 1 Oct 2024 23:52:34 +0000 Joe Damato wrote:
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 55764efc5c93..33897edd16c8 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -377,6 +377,7 @@ struct napi_struct {
> struct list_head dev_list;
> struct hlist_node napi_hash_node;
> int irq;
> + unsigned long gro_flush_timeout;
> u32 defer_hard_irqs;
> };
Same story
* [RFC net-next v4 5/9] net: napi: Add napi_config
2024-10-01 23:52 [RFC net-next v4 0/9] Add support for per-NAPI config via netlink Joe Damato
2024-10-01 23:52 ` [RFC net-next v4 1/9] net: napi: Make napi_defer_hard_irqs per-NAPI Joe Damato
2024-10-01 23:52 ` [RFC net-next v4 3/9] net: napi: Make gro_flush_timeout per-NAPI Joe Damato
@ 2024-10-01 23:52 ` Joe Damato
2024-10-08 18:19 ` Joe Damato
2024-10-08 22:17 ` Jakub Kicinski
2024-10-03 23:29 ` [RFC net-next v4 0/9] Add support for per-NAPI config via netlink Stanislav Fomichev
3 siblings, 2 replies; 14+ messages in thread
From: Joe Damato @ 2024-10-01 23:52 UTC (permalink / raw)
To: netdev
Cc: mkarsten, skhawaja, sdf, bjorn, amritha.nambiar,
sridhar.samudrala, willemdebruijn.kernel, Joe Damato,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Jonathan Corbet, Jiri Pirko, Sebastian Andrzej Siewior,
Lorenzo Bianconi, Johannes Berg, open list:DOCUMENTATION,
open list
Add a persistent NAPI config area for NAPI configuration to the core.
Drivers opt-in to setting the persistent config for a NAPI by passing an
index when calling netif_napi_add_config.
napi_config is allocated in alloc_netdev_mqs, freed in free_netdev
(after the NAPIs are deleted).
Drivers which call netif_napi_add_config will have persistent per-NAPI
settings: NAPI IDs, gro_flush_timeout, and defer_hard_irq settings.
Per-NAPI settings are saved in napi_disable and restored in napi_enable.
Co-developed-by: Martin Karsten <mkarsten@uwaterloo.ca>
Signed-off-by: Martin Karsten <mkarsten@uwaterloo.ca>
Signed-off-by: Joe Damato <jdamato@fastly.com>
---
.../networking/net_cachelines/net_device.rst | 1 +
include/linux/netdevice.h | 32 ++++++++
net/core/dev.c | 79 +++++++++++++++++--
net/core/dev.h | 14 ++++
4 files changed, 119 insertions(+), 7 deletions(-)
diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst
index 3d02ae79c850..11d659051f5e 100644
--- a/Documentation/networking/net_cachelines/net_device.rst
+++ b/Documentation/networking/net_cachelines/net_device.rst
@@ -183,3 +183,4 @@ struct hlist_head page_pools
struct dim_irq_moder* irq_moder
unsigned_long gro_flush_timeout
u32 napi_defer_hard_irqs
+struct napi_config* napi_config
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 33897edd16c8..51cff55e7ab8 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -342,6 +342,15 @@ struct gro_list {
*/
#define GRO_HASH_BUCKETS 8
+/*
+ * Structure for per-NAPI config
+ */
+struct napi_config {
+ u64 gro_flush_timeout;
+ u32 defer_hard_irqs;
+ unsigned int napi_id;
+};
+
/*
* Structure for NAPI scheduling similar to tasklet but with weighting
*/
@@ -379,6 +388,8 @@ struct napi_struct {
int irq;
unsigned long gro_flush_timeout;
u32 defer_hard_irqs;
+ int index;
+ struct napi_config *config;
};
enum {
@@ -2011,6 +2022,9 @@ enum netdev_reg_state {
* @dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem,
* where the clock is recovered.
*
+ * @napi_config: An array of napi_config structures containing per-NAPI
+ * settings.
+ *
* FIXME: cleanup struct net_device such that network protocol info
* moves out.
*/
@@ -2400,6 +2414,7 @@ struct net_device {
struct dim_irq_moder *irq_moder;
unsigned long gro_flush_timeout;
u32 napi_defer_hard_irqs;
+ struct napi_config *napi_config;
u8 priv[] ____cacheline_aligned
__counted_by(priv_len);
@@ -2650,6 +2665,23 @@ netif_napi_add_tx_weight(struct net_device *dev,
netif_napi_add_weight(dev, napi, poll, weight);
}
+/**
+ * netif_napi_add_config - initialize a NAPI context with persistent config
+ * @dev: network device
+ * @napi: NAPI context
+ * @poll: polling function
+ * @weight: the poll weight of this NAPI
+ * @index: the NAPI index
+ */
+static inline void
+netif_napi_add_config(struct net_device *dev, struct napi_struct *napi,
+ int (*poll)(struct napi_struct *, int), int index)
+{
+ napi->index = index;
+ napi->config = &dev->napi_config[index];
+ netif_napi_add_weight(dev, napi, poll, NAPI_POLL_WEIGHT);
+}
+
/**
* netif_napi_add_tx() - initialize a NAPI context to be used for Tx only
* @dev: network device
diff --git a/net/core/dev.c b/net/core/dev.c
index 056ed44f766f..9acb6db19200 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6499,6 +6499,22 @@ EXPORT_SYMBOL(napi_busy_loop);
#endif /* CONFIG_NET_RX_BUSY_POLL */
+static void __napi_hash_add_with_id(struct napi_struct *napi,
+ unsigned int napi_id)
+{
+ napi->napi_id = napi_id;
+ hlist_add_head_rcu(&napi->napi_hash_node,
+ &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
+}
+
+static void napi_hash_add_with_id(struct napi_struct *napi,
+ unsigned int napi_id)
+{
+ spin_lock(&napi_hash_lock);
+ __napi_hash_add_with_id(napi, napi_id);
+ spin_unlock(&napi_hash_lock);
+}
+
static void napi_hash_add(struct napi_struct *napi)
{
if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
@@ -6511,10 +6527,8 @@ static void napi_hash_add(struct napi_struct *napi)
if (unlikely(++napi_gen_id < MIN_NAPI_ID))
napi_gen_id = MIN_NAPI_ID;
} while (napi_by_id(napi_gen_id));
- napi->napi_id = napi_gen_id;
- hlist_add_head_rcu(&napi->napi_hash_node,
- &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
+ __napi_hash_add_with_id(napi, napi_gen_id);
spin_unlock(&napi_hash_lock);
}
@@ -6637,6 +6651,28 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
}
EXPORT_SYMBOL(netif_queue_set_napi);
+static void napi_restore_config(struct napi_struct *n)
+{
+ n->defer_hard_irqs = n->config->defer_hard_irqs;
+ n->gro_flush_timeout = n->config->gro_flush_timeout;
+ /* a NAPI ID might be stored in the config, if so use it. if not, use
+ * napi_hash_add to generate one for us. It will be saved to the config
+ * in napi_disable.
+ */
+ if (n->config->napi_id)
+ napi_hash_add_with_id(n, n->config->napi_id);
+ else
+ napi_hash_add(n);
+}
+
+static void napi_save_config(struct napi_struct *n)
+{
+ n->config->defer_hard_irqs = n->defer_hard_irqs;
+ n->config->gro_flush_timeout = n->gro_flush_timeout;
+ n->config->napi_id = n->napi_id;
+ napi_hash_del(n);
+}
+
void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
@@ -6647,8 +6683,6 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
INIT_HLIST_NODE(&napi->napi_hash_node);
hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
napi->timer.function = napi_watchdog;
- napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs));
- napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout));
init_gro_hash(napi);
napi->skb = NULL;
INIT_LIST_HEAD(&napi->rx_list);
@@ -6666,7 +6700,13 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
set_bit(NAPI_STATE_SCHED, &napi->state);
set_bit(NAPI_STATE_NPSVC, &napi->state);
list_add_rcu(&napi->dev_list, &dev->napi_list);
- napi_hash_add(napi);
+
+ /* default settings from sysfs are applied to all NAPIs. any per-NAPI
+ * configuration will be loaded in napi_enable
+ */
+ napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs));
+ napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout));
+
napi_get_frags_check(napi);
/* Create kthread for this napi if dev->threaded is set.
* Clear dev->threaded if kthread creation failed so that
@@ -6698,6 +6738,11 @@ void napi_disable(struct napi_struct *n)
hrtimer_cancel(&n->timer);
+ if (n->config)
+ napi_save_config(n);
+ else
+ napi_hash_del(n);
+
clear_bit(NAPI_STATE_DISABLE, &n->state);
}
EXPORT_SYMBOL(napi_disable);
@@ -6713,6 +6758,11 @@ void napi_enable(struct napi_struct *n)
{
unsigned long new, val = READ_ONCE(n->state);
+ if (n->config)
+ napi_restore_config(n);
+ else
+ napi_hash_add(n);
+
do {
BUG_ON(!test_bit(NAPI_STATE_SCHED, &val));
@@ -6742,7 +6792,11 @@ void __netif_napi_del(struct napi_struct *napi)
if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
return;
- napi_hash_del(napi);
+ if (napi->config) {
+ napi->index = -1;
+ napi->config = NULL;
+ }
+
list_del_rcu(&napi->dev_list);
napi_free_frags(napi);
@@ -11079,6 +11133,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
unsigned int txqs, unsigned int rxqs)
{
struct net_device *dev;
+ size_t napi_config_sz;
+ unsigned int maxqs;
BUG_ON(strlen(name) >= sizeof(dev->name));
@@ -11092,6 +11148,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
return NULL;
}
+ maxqs = max(txqs, rxqs);
+
dev = kvzalloc(struct_size(dev, priv, sizeof_priv),
GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
if (!dev)
@@ -11166,6 +11224,11 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
if (!dev->ethtool)
goto free_all;
+ napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config));
+ dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT);
+ if (!dev->napi_config)
+ goto free_all;
+
strscpy(dev->name, name);
dev->name_assign_type = name_assign_type;
dev->group = INIT_NETDEV_GROUP;
@@ -11227,6 +11290,8 @@ void free_netdev(struct net_device *dev)
list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
netif_napi_del(p);
+ kvfree(dev->napi_config);
+
ref_tracker_dir_exit(&dev->refcnt_tracker);
#ifdef CONFIG_PCPU_DEV_REFCNT
free_percpu(dev->pcpu_refcnt);
diff --git a/net/core/dev.h b/net/core/dev.h
index 26e598aa56c3..7365fa0ffdc7 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -167,11 +167,18 @@ static inline void napi_set_defer_hard_irqs(struct napi_struct *n, u32 defer)
static inline void netdev_set_defer_hard_irqs(struct net_device *netdev,
u32 defer)
{
+ unsigned int count = max(netdev->num_rx_queues,
+ netdev->num_tx_queues);
struct napi_struct *napi;
+ int i;
WRITE_ONCE(netdev->napi_defer_hard_irqs, defer);
list_for_each_entry(napi, &netdev->napi_list, dev_list)
napi_set_defer_hard_irqs(napi, defer);
+
+ if (netdev->napi_config)
+ for (i = 0; i < count; i++)
+ netdev->napi_config[i].defer_hard_irqs = defer;
}
/**
@@ -207,11 +214,18 @@ static inline void napi_set_gro_flush_timeout(struct napi_struct *n,
static inline void netdev_set_gro_flush_timeout(struct net_device *netdev,
unsigned long timeout)
{
+ unsigned int count = max(netdev->num_rx_queues,
+ netdev->num_tx_queues);
struct napi_struct *napi;
+ int i;
WRITE_ONCE(netdev->gro_flush_timeout, timeout);
list_for_each_entry(napi, &netdev->napi_list, dev_list)
napi_set_gro_flush_timeout(napi, timeout);
+
+ if (netdev->napi_config)
+ for (i = 0; i < count; i++)
+ netdev->napi_config[i].gro_flush_timeout = timeout;
}
int rps_cpumask_housekeeping(struct cpumask *mask);
--
2.25.1
* Re: [RFC net-next v4 5/9] net: napi: Add napi_config
2024-10-01 23:52 ` [RFC net-next v4 5/9] net: napi: Add napi_config Joe Damato
@ 2024-10-08 18:19 ` Joe Damato
2024-10-08 22:17 ` Jakub Kicinski
1 sibling, 0 replies; 14+ messages in thread
From: Joe Damato @ 2024-10-08 18:19 UTC (permalink / raw)
To: netdev
Cc: mkarsten, skhawaja, sdf, bjorn, amritha.nambiar,
sridhar.samudrala, willemdebruijn.kernel, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Jonathan Corbet,
Jiri Pirko, Sebastian Andrzej Siewior, Lorenzo Bianconi,
Johannes Berg, open list:DOCUMENTATION, open list
On Tue, Oct 01, 2024 at 11:52:36PM +0000, Joe Damato wrote:
> Add a persistent NAPI config area for NAPI configuration to the core.
> Drivers opt-in to setting the persistent config for a NAPI by passing an
> index when calling netif_napi_add_config.
>
> napi_config is allocated in alloc_netdev_mqs, freed in free_netdev
> (after the NAPIs are deleted).
>
> Drivers which call netif_napi_add_config will have persistent per-NAPI
> settings: NAPI IDs, gro_flush_timeout, and defer_hard_irq settings.
>
> Per-NAPI settings are saved in napi_disable and restored in napi_enable.
>
> Co-developed-by: Martin Karsten <mkarsten@uwaterloo.ca>
> Signed-off-by: Martin Karsten <mkarsten@uwaterloo.ca>
> Signed-off-by: Joe Damato <jdamato@fastly.com>
> ---
> .../networking/net_cachelines/net_device.rst | 1 +
> include/linux/netdevice.h | 32 ++++++++
> net/core/dev.c | 79 +++++++++++++++++--
> net/core/dev.h | 14 ++++
> 4 files changed, 119 insertions(+), 7 deletions(-)
[...]
> +/**
> + * netif_napi_add_config - initialize a NAPI context with persistent config
> + * @dev: network device
> + * @napi: NAPI context
> + * @poll: polling function
> + * @weight: the poll weight of this NAPI
For anyone following along, I noticed this unnecessary bit of kdoc
left in from a previous revision and will remove it when submitting
v5.
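For reference, the kdoc presumably becomes the following in v5, with
the stale @weight line dropped:

	/**
	 * netif_napi_add_config - initialize a NAPI context with persistent config
	 * @dev: network device
	 * @napi: NAPI context
	 * @poll: polling function
	 * @index: the NAPI index
	 */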
* Re: [RFC net-next v4 5/9] net: napi: Add napi_config
2024-10-01 23:52 ` [RFC net-next v4 5/9] net: napi: Add napi_config Joe Damato
2024-10-08 18:19 ` Joe Damato
@ 2024-10-08 22:17 ` Jakub Kicinski
2024-10-08 22:28 ` Joe Damato
1 sibling, 1 reply; 14+ messages in thread
From: Jakub Kicinski @ 2024-10-08 22:17 UTC (permalink / raw)
To: Joe Damato
Cc: netdev, mkarsten, skhawaja, sdf, bjorn, amritha.nambiar,
sridhar.samudrala, willemdebruijn.kernel, David S. Miller,
Eric Dumazet, Paolo Abeni, Jonathan Corbet, Jiri Pirko,
Sebastian Andrzej Siewior, Lorenzo Bianconi, Johannes Berg,
open list:DOCUMENTATION, open list
On Tue, 1 Oct 2024 23:52:36 +0000 Joe Damato wrote:
> static inline void netdev_set_defer_hard_irqs(struct net_device *netdev,
> u32 defer)
> {
> + unsigned int count = max(netdev->num_rx_queues,
> + netdev->num_tx_queues);
> struct napi_struct *napi;
> + int i;
>
> WRITE_ONCE(netdev->napi_defer_hard_irqs, defer);
> list_for_each_entry(napi, &netdev->napi_list, dev_list)
> napi_set_defer_hard_irqs(napi, defer);
> +
> + if (netdev->napi_config)
Could this ever be NULL ?
> + for (i = 0; i < count; i++)
> + netdev->napi_config[i].defer_hard_irqs = defer;
> }
* Re: [RFC net-next v4 5/9] net: napi: Add napi_config
2024-10-08 22:17 ` Jakub Kicinski
@ 2024-10-08 22:28 ` Joe Damato
0 siblings, 0 replies; 14+ messages in thread
From: Joe Damato @ 2024-10-08 22:28 UTC (permalink / raw)
To: Jakub Kicinski
Cc: netdev, mkarsten, skhawaja, sdf, bjorn, amritha.nambiar,
sridhar.samudrala, willemdebruijn.kernel, David S. Miller,
Eric Dumazet, Paolo Abeni, Jonathan Corbet, Jiri Pirko,
Sebastian Andrzej Siewior, Lorenzo Bianconi, Johannes Berg,
open list:DOCUMENTATION, open list
On Tue, Oct 08, 2024 at 03:17:01PM -0700, Jakub Kicinski wrote:
> On Tue, 1 Oct 2024 23:52:36 +0000 Joe Damato wrote:
> > static inline void netdev_set_defer_hard_irqs(struct net_device *netdev,
> > u32 defer)
> > {
> > + unsigned int count = max(netdev->num_rx_queues,
> > + netdev->num_tx_queues);
> > struct napi_struct *napi;
> > + int i;
> >
> > WRITE_ONCE(netdev->napi_defer_hard_irqs, defer);
> > list_for_each_entry(napi, &netdev->napi_list, dev_list)
> > napi_set_defer_hard_irqs(napi, defer);
> > +
> > + if (netdev->napi_config)
>
> Could this ever be NULL ?
Ah, good catch. I think this was an artifact from a previous
revision where it could have been. But, I'll double check.
In the current proposed implementation, however, I don't think it
can be NULL as it is always allocated.
* Re: [RFC net-next v4 0/9] Add support for per-NAPI config via netlink
2024-10-01 23:52 [RFC net-next v4 0/9] Add support for per-NAPI config via netlink Joe Damato
` (2 preceding siblings ...)
2024-10-01 23:52 ` [RFC net-next v4 5/9] net: napi: Add napi_config Joe Damato
@ 2024-10-03 23:29 ` Stanislav Fomichev
2024-10-03 23:53 ` Joe Damato
3 siblings, 1 reply; 14+ messages in thread
From: Stanislav Fomichev @ 2024-10-03 23:29 UTC (permalink / raw)
To: Joe Damato
Cc: netdev, mkarsten, skhawaja, sdf, bjorn, amritha.nambiar,
sridhar.samudrala, willemdebruijn.kernel, Alexander Lobakin,
Breno Leitao, Daniel Jurgens, David Ahern, David S. Miller,
Donald Hunter, Eric Dumazet,
moderated list:INTEL ETHERNET DRIVERS, Jakub Kicinski,
Jesper Dangaard Brouer, Jiri Pirko, Johannes Berg,
Jonathan Corbet, Kory Maincent, Leon Romanovsky,
open list:DOCUMENTATION, open list,
open list:MELLANOX MLX4 core VPI driver, Lorenzo Bianconi,
Michael Chan, Mina Almasry, Paolo Abeni, Przemek Kitszel,
Saeed Mahameed, Sebastian Andrzej Siewior, Tariq Toukan,
Tony Nguyen, Xuan Zhuo
On 10/01, Joe Damato wrote:
> Greetings:
>
> Welcome to RFC v4.
>
> Very important and significant changes have been made since RFC v3 [1],
> please see the changelog below for details.
>
> A couple important call outs for this revision for reviewers:
>
> 1. idpf embeds a napi_struct in an internal data structure and
> includes an assertion on the size of napi_struct. The maintainers
> have stated that they think anyone touching napi_struct should update
> the assertion [2], so I've done this in patch 3.
>
> Even though the assertion has been updated, I've given the
> cacheline placement of napi_struct within idpf's internals no
> thought or consideration.
>
> Would appreciate other opinions on this; I think idpf should be
> fixed. It seems unreasonable to me that anyone changing the size of
> a struct in the core should need to think about cachelines in idpf.
[..]
> 2. This revision seems to work (see below for a full walk through). Is
> this the behavior we want? Am I missing some use case or some
> behavioral thing other folks need?
The walk through looks good!
> 3. Re a previous point made by Stanislav regarding "taking over a NAPI
> ID" when the channel count changes: mlx5 seems to call napi_disable
> followed by netif_napi_del for the old queues and then calls
> napi_enable for the new ones. In this RFC, the NAPI ID generation
> is deferred to napi_enable. This means we won't end up with two of
> the same NAPI IDs added to the hash at the same time (I am pretty
> sure).
[..]
> Can we assume all drivers will napi_disable the old queues before
> napi_enable the new ones? If yes, we might not need to worry about
> a NAPI ID takeover function.
With the explicit driver opt-in via netif_napi_add_config, this
shouldn't matter? When somebody gets to converting the drivers that
don't follow this common pattern they'll have to solve the takeover
part :-)
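For reference, a minimal sketch of what the opt-in looks like with this
series (the driver names and surrounding open path are hypothetical;
only netif_napi_add_config() and its (dev, napi, poll, index) signature
come from patch 5/9):

	static int my_drv_init_queue(struct my_drv_priv *priv, int idx)
	{
		struct napi_struct *napi = &priv->rxq[idx].napi;

		/* idx doubles as the persistent per-NAPI config slot */
		netif_napi_add_config(priv->netdev, napi, my_drv_poll, idx);
		napi_enable(napi);
		return 0;
	}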
* Re: [RFC net-next v4 0/9] Add support for per-NAPI config via netlink
2024-10-03 23:29 ` [RFC net-next v4 0/9] Add support for per-NAPI config via netlink Stanislav Fomichev
@ 2024-10-03 23:53 ` Joe Damato
2024-10-04 2:33 ` Joe Damato
0 siblings, 1 reply; 14+ messages in thread
From: Joe Damato @ 2024-10-03 23:53 UTC (permalink / raw)
To: Stanislav Fomichev
Cc: netdev, mkarsten, skhawaja, sdf, bjorn, amritha.nambiar,
sridhar.samudrala, willemdebruijn.kernel, Alexander Lobakin,
Breno Leitao, Daniel Jurgens, David Ahern, David S. Miller,
Donald Hunter, Eric Dumazet,
moderated list:INTEL ETHERNET DRIVERS, Jakub Kicinski,
Jesper Dangaard Brouer, Jiri Pirko, Johannes Berg,
Jonathan Corbet, Kory Maincent, Leon Romanovsky,
open list:DOCUMENTATION, open list,
open list:MELLANOX MLX4 core VPI driver, Lorenzo Bianconi,
Michael Chan, Mina Almasry, Paolo Abeni, Przemek Kitszel,
Saeed Mahameed, Sebastian Andrzej Siewior, Tariq Toukan,
Tony Nguyen, Xuan Zhuo
On Thu, Oct 03, 2024 at 04:29:37PM -0700, Stanislav Fomichev wrote:
> On 10/01, Joe Damato wrote:
[...]
> > 2. This revision seems to work (see below for a full walk through). Is
> > this the behavior we want? Am I missing some use case or some
> > behavioral thing other folks need?
>
> The walk through looks good!
Thanks for taking a look.
> > 3. Re a previous point made by Stanislav regarding "taking over a NAPI
> > ID" when the channel count changes: mlx5 seems to call napi_disable
> > followed by netif_napi_del for the old queues and then calls
> > napi_enable for the new ones. In this RFC, the NAPI ID generation
> > is deferred to napi_enable. This means we won't end up with two of
> > the same NAPI IDs added to the hash at the same time (I am pretty
> > sure).
>
>
> [..]
>
> > Can we assume all drivers will napi_disable the old queues before
> > napi_enable the new ones? If yes, we might not need to worry about
> > a NAPI ID takeover function.
>
> With the explicit driver opt-in via netif_napi_add_config, this
> shouldn't matter? When somebody gets to converting the drivers that
> don't follow this common pattern they'll have to solve the takeover
> part :-)
That is true; that's a good point. I'll let the RFC hang out on the
list for another day or two just to give Jakub time to catch up on
his mails ;) but if you all agree... this might be ready to be
resent as a PATCH instead of an RFC.
* Re: [RFC net-next v4 0/9] Add support for per-NAPI config via netlink
2024-10-03 23:53 ` Joe Damato
@ 2024-10-04 2:33 ` Joe Damato
2024-10-04 16:22 ` Stanislav Fomichev
0 siblings, 1 reply; 14+ messages in thread
From: Joe Damato @ 2024-10-04 2:33 UTC (permalink / raw)
To: Stanislav Fomichev, netdev, mkarsten, skhawaja, sdf, bjorn,
amritha.nambiar, sridhar.samudrala, willemdebruijn.kernel,
Alexander Lobakin, Breno Leitao, Daniel Jurgens, David Ahern,
David S. Miller, Donald Hunter, Eric Dumazet,
moderated list:INTEL ETHERNET DRIVERS, Jakub Kicinski,
Jesper Dangaard Brouer, Jiri Pirko, Johannes Berg,
Jonathan Corbet, Kory Maincent, Leon Romanovsky,
open list:DOCUMENTATION, open list,
open list:MELLANOX MLX4 core VPI driver, Lorenzo Bianconi,
Michael Chan, Mina Almasry, Paolo Abeni, Przemek Kitszel,
Saeed Mahameed, Sebastian Andrzej Siewior, Tariq Toukan,
Tony Nguyen, Xuan Zhuo
On Thu, Oct 03, 2024 at 04:53:13PM -0700, Joe Damato wrote:
> On Thu, Oct 03, 2024 at 04:29:37PM -0700, Stanislav Fomichev wrote:
> > On 10/01, Joe Damato wrote:
>
> [...]
>
> > > 2. This revision seems to work (see below for a full walk through). Is
> > > this the behavior we want? Am I missing some use case or some
> > > behavioral thing other folks need?
> >
> > The walk through looks good!
>
> Thanks for taking a look.
>
> > > 3. Re a previous point made by Stanislav regarding "taking over a NAPI
> > > ID" when the channel count changes: mlx5 seems to call napi_disable
> > > followed by netif_napi_del for the old queues and then calls
> > > napi_enable for the new ones. In this RFC, the NAPI ID generation
> > > is deferred to napi_enable. This means we won't end up with two of
> > > the same NAPI IDs added to the hash at the same time (I am pretty
> > > sure).
> >
> >
> > [..]
> >
> > > Can we assume all drivers will napi_disable the old queues before
> > > napi_enable the new ones? If yes, we might not need to worry about
> > > a NAPI ID takeover function.
> >
> > With the explicit driver opt-in via netif_napi_add_config, this
> > shouldn't matter? When somebody gets to converting the drivers that
> > don't follow this common pattern they'll have to solve the takeover
> > part :-)
>
> That is true; that's a good point.
Actually, sorry, that isn't strictly true. NAPI ID generation is
moved for everything to napi_enable; they just are (or are not)
persisted depending on whether the driver opted in to add_config or
not.
So, the change does affect all drivers. NAPI IDs won't be generated
and added to the hash until napi_enable and they will be removed
from the hash in napi_disable... even if you didn't opt-in to having
storage.
Opt-ing in to storage via netif_napi_add_config just means that your
NAPI IDs (and other settings) will be persistent.
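Condensed from patch 5/9 (not verbatim), the enable path is roughly:

	void napi_enable(struct napi_struct *n)
	{
		/* restore the NAPI ID, gro_flush_timeout and defer_hard_irqs
		 * saved by the last napi_disable(); otherwise generate a
		 * fresh NAPI ID with nothing persisted
		 */
		if (n->config)
			napi_restore_config(n);
		else
			napi_hash_add(n);

		/* ... existing state transition ... */
	}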
Sorry about my confusion when replying earlier.
* Re: [RFC net-next v4 0/9] Add support for per-NAPI config via netlink
2024-10-04 2:33 ` Joe Damato
@ 2024-10-04 16:22 ` Stanislav Fomichev
0 siblings, 0 replies; 14+ messages in thread
From: Stanislav Fomichev @ 2024-10-04 16:22 UTC (permalink / raw)
To: Joe Damato, netdev, mkarsten, skhawaja, sdf, bjorn,
amritha.nambiar, sridhar.samudrala, willemdebruijn.kernel,
Alexander Lobakin, Breno Leitao, Daniel Jurgens, David Ahern,
David S. Miller, Donald Hunter, Eric Dumazet,
moderated list:INTEL ETHERNET DRIVERS, Jakub Kicinski,
Jesper Dangaard Brouer, Jiri Pirko, Johannes Berg,
Jonathan Corbet, Kory Maincent, Leon Romanovsky,
open list:DOCUMENTATION, open list,
open list:MELLANOX MLX4 core VPI driver, Lorenzo Bianconi,
Michael Chan, Mina Almasry, Paolo Abeni, Przemek Kitszel,
Saeed Mahameed, Sebastian Andrzej Siewior, Tariq Toukan,
Tony Nguyen, Xuan Zhuo
On 10/03, Joe Damato wrote:
> On Thu, Oct 03, 2024 at 04:53:13PM -0700, Joe Damato wrote:
> > On Thu, Oct 03, 2024 at 04:29:37PM -0700, Stanislav Fomichev wrote:
> > > On 10/01, Joe Damato wrote:
> >
> > [...]
> >
> > > > 2. This revision seems to work (see below for a full walk through). Is
> > > > this the behavior we want? Am I missing some use case or some
> > > > behavioral thing other folks need?
> > >
> > > The walk through looks good!
> >
> > Thanks for taking a look.
> >
> > > > 3. Re a previous point made by Stanislav regarding "taking over a NAPI
> > > > ID" when the channel count changes: mlx5 seems to call napi_disable
> > > > followed by netif_napi_del for the old queues and then calls
> > > > napi_enable for the new ones. In this RFC, the NAPI ID generation
> > > > is deferred to napi_enable. This means we won't end up with two of
> > > > the same NAPI IDs added to the hash at the same time (I am pretty
> > > > sure).
> > >
> > >
> > > [..]
> > >
> > > > Can we assume all drivers will napi_disable the old queues before
> > > > napi_enable the new ones? If yes, we might not need to worry about
> > > > a NAPI ID takeover function.
> > >
> > > With the explicit driver opt-in via netif_napi_add_config, this
> > > shouldn't matter? When somebody gets to converting the drivers that
> > > don't follow this common pattern they'll have to solve the takeover
> > > part :-)
> >
> > That is true; that's a good point.
>
> Actually, sorry, that isn't strictly true. NAPI ID generation is
> moved for everything to napi_enable; they just are (or are not)
> persisted depending on whether the driver opted in to add_config or
> not.
>
> So, the change does affect all drivers. NAPI IDs won't be generated
> and added to the hash until napi_enable and they will be removed
> from the hash in napi_disable... even if you didn't opt-in to having
> storage.
>
> Opt-ing in to storage via netif_napi_add_config just means that your
> NAPI IDs (and other settings) will be persistent.
>
> Sorry about my confusion when replying earlier.
AFAIA, all control operations (ethtool or similar ones via netlink)
should grab the rtnl lock. So as long as both enable/disable happen
under rtnl (and in my mind they should), I don't think there is gonna
be any user-visible side effect of your change. But I might be wrong;
let's see if others can come up with some corner cases.
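A tiny sketch of a reconfiguration path relying on that (not part of
this series; the driver helpers are hypothetical, and ASSERT_RTNL() is
the existing helper that warns when rtnl_lock is not held):

	static void my_drv_set_channels(struct net_device *dev, int new_count)
	{
		ASSERT_RTNL();

		my_drv_disable_napis(dev);	/* napi_disable() + netif_napi_del() */
		my_drv_resize(dev, new_count);
		my_drv_enable_napis(dev);	/* netif_napi_add_config() + napi_enable() */
	}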