* [net-next v5 1/9] net: napi: Make napi_defer_hard_irqs per-NAPI
2024-10-09 0:54 [net-next v5 0/9] Add support for per-NAPI config via netlink Joe Damato
@ 2024-10-09 0:54 ` Joe Damato
2024-10-10 4:10 ` Eric Dumazet
2024-10-09 0:54 ` [net-next v5 2/9] netdev-genl: Dump napi_defer_hard_irqs Joe Damato
` (8 subsequent siblings)
9 siblings, 1 reply; 31+ messages in thread
From: Joe Damato @ 2024-10-09 0:54 UTC (permalink / raw)
To: netdev
Cc: mkarsten, skhawaja, sdf, bjorn, amritha.nambiar,
sridhar.samudrala, willemdebruijn.kernel, Joe Damato,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Jonathan Corbet, Jiri Pirko, Sebastian Andrzej Siewior,
Lorenzo Bianconi, Johannes Berg, Breno Leitao, Alexander Lobakin,
open list:DOCUMENTATION, open list
Add defer_hard_irqs to napi_struct in preparation for per-NAPI
settings.
The existing sysfs parameter is respected; writes to sysfs will write to
all NAPI structs for the device and the net_device defer_hard_irqs field.
Reads from sysfs show the net_device field.
The ability to set defer_hard_irqs on specific NAPI instances will be
added in a later commit, via netdev-genl.
Signed-off-by: Joe Damato <jdamato@fastly.com>
---
.../networking/net_cachelines/net_device.rst | 2 +-
include/linux/netdevice.h | 3 +-
net/core/dev.c | 10 +++---
net/core/dev.h | 36 +++++++++++++++++++
net/core/net-sysfs.c | 2 +-
5 files changed, 45 insertions(+), 8 deletions(-)
diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst
index 556711c4d3cf..d78b1362f12a 100644
--- a/Documentation/networking/net_cachelines/net_device.rst
+++ b/Documentation/networking/net_cachelines/net_device.rst
@@ -100,7 +100,6 @@ unsigned_int num_rx_queues
unsigned_int real_num_rx_queues - read_mostly get_rps_cpu
struct_bpf_prog* xdp_prog - read_mostly netif_elide_gro()
unsigned_long gro_flush_timeout - read_mostly napi_complete_done
-u32 napi_defer_hard_irqs - read_mostly napi_complete_done
unsigned_int gro_max_size - read_mostly skb_gro_receive
unsigned_int gro_ipv4_max_size - read_mostly skb_gro_receive
rx_handler_func_t* rx_handler read_mostly - __netif_receive_skb_core
@@ -185,3 +184,4 @@ struct_dpll_pin* dpll_pin
struct hlist_head page_pools
struct dim_irq_moder* irq_moder
u64 max_pacing_offload_horizon
+u32 napi_defer_hard_irqs
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3baf8e539b6f..9042920cdd1a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -373,6 +373,7 @@ struct napi_struct {
unsigned int napi_id;
struct hrtimer timer;
struct task_struct *thread;
+ u32 defer_hard_irqs;
/* control-path-only fields follow */
struct list_head dev_list;
struct hlist_node napi_hash_node;
@@ -2077,7 +2078,6 @@ struct net_device {
unsigned int real_num_rx_queues;
struct netdev_rx_queue *_rx;
unsigned long gro_flush_timeout;
- u32 napi_defer_hard_irqs;
unsigned int gro_max_size;
unsigned int gro_ipv4_max_size;
rx_handler_func_t __rcu *rx_handler;
@@ -2405,6 +2405,7 @@ struct net_device {
struct dim_irq_moder *irq_moder;
u64 max_pacing_offload_horizon;
+ u32 napi_defer_hard_irqs;
u8 priv[] ____cacheline_aligned
__counted_by(priv_len);
diff --git a/net/core/dev.c b/net/core/dev.c
index ea5fbcd133ae..3487eec284a6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6231,7 +6231,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
if (work_done) {
if (n->gro_bitmask)
timeout = READ_ONCE(n->dev->gro_flush_timeout);
- n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
+ n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n);
}
if (n->defer_hard_irqs_count > 0) {
n->defer_hard_irqs_count--;
@@ -6369,7 +6369,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
if (flags & NAPI_F_PREFER_BUSY_POLL) {
- napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
+ napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi);
timeout = READ_ONCE(napi->dev->gro_flush_timeout);
if (napi->defer_hard_irqs_count && timeout) {
hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
@@ -6651,6 +6651,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
INIT_HLIST_NODE(&napi->napi_hash_node);
hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
napi->timer.function = napi_watchdog;
+ napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs));
init_gro_hash(napi);
napi->skb = NULL;
INIT_LIST_HEAD(&napi->rx_list);
@@ -11057,7 +11058,7 @@ void netdev_sw_irq_coalesce_default_on(struct net_device *dev)
if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
dev->gro_flush_timeout = 20000;
- dev->napi_defer_hard_irqs = 1;
+ netdev_set_defer_hard_irqs(dev, 1);
}
}
EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on);
@@ -11995,7 +11996,6 @@ static void __init net_dev_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_flush_timeout);
- CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, napi_defer_hard_irqs);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler);
@@ -12007,7 +12007,7 @@ static void __init net_dev_struct_check(void)
#ifdef CONFIG_NET_XGRESS
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress);
#endif
- CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 104);
+ CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 100);
}
/*
diff --git a/net/core/dev.h b/net/core/dev.h
index 5654325c5b71..b3792219879b 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -138,6 +138,42 @@ static inline void netif_set_gro_ipv4_max_size(struct net_device *dev,
WRITE_ONCE(dev->gro_ipv4_max_size, size);
}
+/**
+ * napi_get_defer_hard_irqs - get the NAPI's defer_hard_irqs
+ * @n: napi struct to get the defer_hard_irqs field from
+ *
+ * Return: the per-NAPI value of the defer_hard_irqs field.
+ */
+static inline u32 napi_get_defer_hard_irqs(const struct napi_struct *n)
+{
+ return READ_ONCE(n->defer_hard_irqs);
+}
+
+/**
+ * napi_set_defer_hard_irqs - set the defer_hard_irqs for a napi
+ * @n: napi_struct to set the defer_hard_irqs field
+ * @defer: the value the field should be set to
+ */
+static inline void napi_set_defer_hard_irqs(struct napi_struct *n, u32 defer)
+{
+ WRITE_ONCE(n->defer_hard_irqs, defer);
+}
+
+/**
+ * netdev_set_defer_hard_irqs - set defer_hard_irqs for all NAPIs of a netdev
+ * @netdev: the net_device for which all NAPIs will have defer_hard_irqs set
+ * @defer: the defer_hard_irqs value to set
+ */
+static inline void netdev_set_defer_hard_irqs(struct net_device *netdev,
+ u32 defer)
+{
+ struct napi_struct *napi;
+
+ WRITE_ONCE(netdev->napi_defer_hard_irqs, defer);
+ list_for_each_entry(napi, &netdev->napi_list, dev_list)
+ napi_set_defer_hard_irqs(napi, defer);
+}
+
int rps_cpumask_housekeeping(struct cpumask *mask);
#if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 05cf5347f25e..25125f356a15 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -429,7 +429,7 @@ static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val
if (val > S32_MAX)
return -ERANGE;
- WRITE_ONCE(dev->napi_defer_hard_irqs, val);
+ netdev_set_defer_hard_irqs(dev, (u32)val);
return 0;
}
--
2.34.1
^ permalink raw reply related [flat|nested] 31+ messages in thread* Re: [net-next v5 1/9] net: napi: Make napi_defer_hard_irqs per-NAPI
2024-10-09 0:54 ` [net-next v5 1/9] net: napi: Make napi_defer_hard_irqs per-NAPI Joe Damato
@ 2024-10-10 4:10 ` Eric Dumazet
0 siblings, 0 replies; 31+ messages in thread
From: Eric Dumazet @ 2024-10-10 4:10 UTC (permalink / raw)
To: Joe Damato
Cc: netdev, mkarsten, skhawaja, sdf, bjorn, amritha.nambiar,
sridhar.samudrala, willemdebruijn.kernel, David S. Miller,
Jakub Kicinski, Paolo Abeni, Jonathan Corbet, Jiri Pirko,
Sebastian Andrzej Siewior, Lorenzo Bianconi, Johannes Berg,
Breno Leitao, Alexander Lobakin, open list:DOCUMENTATION,
open list
On Wed, Oct 9, 2024 at 2:55 AM Joe Damato <jdamato@fastly.com> wrote:
>
> Add defer_hard_irqs to napi_struct in preparation for per-NAPI
> settings.
>
> The existing sysfs parameter is respected; writes to sysfs will write to
> all NAPI structs for the device and the net_device defer_hard_irqs field.
> Reads from sysfs show the net_device field.
>
> The ability to set defer_hard_irqs on specific NAPI instances will be
> added in a later commit, via netdev-genl.
>
> Signed-off-by: Joe Damato <jdamato@fastly.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
^ permalink raw reply [flat|nested] 31+ messages in thread
* [net-next v5 2/9] netdev-genl: Dump napi_defer_hard_irqs
2024-10-09 0:54 [net-next v5 0/9] Add support for per-NAPI config via netlink Joe Damato
2024-10-09 0:54 ` [net-next v5 1/9] net: napi: Make napi_defer_hard_irqs per-NAPI Joe Damato
@ 2024-10-09 0:54 ` Joe Damato
2024-10-10 4:11 ` Eric Dumazet
2024-10-09 0:54 ` [net-next v5 3/9] net: napi: Make gro_flush_timeout per-NAPI Joe Damato
` (7 subsequent siblings)
9 siblings, 1 reply; 31+ messages in thread
From: Joe Damato @ 2024-10-09 0:54 UTC (permalink / raw)
To: netdev
Cc: mkarsten, skhawaja, sdf, bjorn, amritha.nambiar,
sridhar.samudrala, willemdebruijn.kernel, Joe Damato,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Donald Hunter, Jesper Dangaard Brouer, Mina Almasry, Xuan Zhuo,
Daniel Jurgens, open list
Support dumping defer_hard_irqs for a NAPI ID.
Signed-off-by: Joe Damato <jdamato@fastly.com>
---
Documentation/netlink/specs/netdev.yaml | 8 ++++++++
include/uapi/linux/netdev.h | 1 +
net/core/netdev-genl.c | 6 ++++++
tools/include/uapi/linux/netdev.h | 1 +
4 files changed, 16 insertions(+)
diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
index 08412c279297..585e87ec3c16 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -248,6 +248,13 @@ attribute-sets:
threaded mode. If NAPI is not in threaded mode (i.e. uses normal
softirq context), the attribute will be absent.
type: u32
+ -
+ name: defer-hard-irqs
+ doc: The number of consecutive empty polls before IRQ deferral ends
+ and hardware IRQs are re-enabled.
+ type: u32
+ checks:
+ max: s32-max
-
name: queue
attributes:
@@ -636,6 +643,7 @@ operations:
- ifindex
- irq
- pid
+ - defer-hard-irqs
dump:
request:
attributes:
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index 7c308f04e7a0..13dc0b027e86 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -122,6 +122,7 @@ enum {
NETDEV_A_NAPI_ID,
NETDEV_A_NAPI_IRQ,
NETDEV_A_NAPI_PID,
+ NETDEV_A_NAPI_DEFER_HARD_IRQS,
__NETDEV_A_NAPI_MAX,
NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1)
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index 1cb954f2d39e..de9bd76f43f8 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -161,6 +161,7 @@ static int
netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi,
const struct genl_info *info)
{
+ u32 napi_defer_hard_irqs;
void *hdr;
pid_t pid;
@@ -189,6 +190,11 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi,
goto nla_put_failure;
}
+ napi_defer_hard_irqs = napi_get_defer_hard_irqs(napi);
+ if (nla_put_s32(rsp, NETDEV_A_NAPI_DEFER_HARD_IRQS,
+ napi_defer_hard_irqs))
+ goto nla_put_failure;
+
genlmsg_end(rsp, hdr);
return 0;
diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index 7c308f04e7a0..13dc0b027e86 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -122,6 +122,7 @@ enum {
NETDEV_A_NAPI_ID,
NETDEV_A_NAPI_IRQ,
NETDEV_A_NAPI_PID,
+ NETDEV_A_NAPI_DEFER_HARD_IRQS,
__NETDEV_A_NAPI_MAX,
NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1)
--
2.34.1
^ permalink raw reply related [flat|nested] 31+ messages in thread* Re: [net-next v5 2/9] netdev-genl: Dump napi_defer_hard_irqs
2024-10-09 0:54 ` [net-next v5 2/9] netdev-genl: Dump napi_defer_hard_irqs Joe Damato
@ 2024-10-10 4:11 ` Eric Dumazet
0 siblings, 0 replies; 31+ messages in thread
From: Eric Dumazet @ 2024-10-10 4:11 UTC (permalink / raw)
To: Joe Damato
Cc: netdev, mkarsten, skhawaja, sdf, bjorn, amritha.nambiar,
sridhar.samudrala, willemdebruijn.kernel, David S. Miller,
Jakub Kicinski, Paolo Abeni, Donald Hunter,
Jesper Dangaard Brouer, Mina Almasry, Xuan Zhuo, Daniel Jurgens,
open list
On Wed, Oct 9, 2024 at 2:55 AM Joe Damato <jdamato@fastly.com> wrote:
>
> Support dumping defer_hard_irqs for a NAPI ID.
>
> Signed-off-by: Joe Damato <jdamato@fastly.com>
> ---
Reviewed-by: Eric Dumazet <edumazet@google.com>
^ permalink raw reply [flat|nested] 31+ messages in thread
* [net-next v5 3/9] net: napi: Make gro_flush_timeout per-NAPI
2024-10-09 0:54 [net-next v5 0/9] Add support for per-NAPI config via netlink Joe Damato
2024-10-09 0:54 ` [net-next v5 1/9] net: napi: Make napi_defer_hard_irqs per-NAPI Joe Damato
2024-10-09 0:54 ` [net-next v5 2/9] netdev-genl: Dump napi_defer_hard_irqs Joe Damato
@ 2024-10-09 0:54 ` Joe Damato
2024-10-10 4:13 ` Eric Dumazet
2024-10-09 0:54 ` [net-next v5 4/9] netdev-genl: Dump gro_flush_timeout Joe Damato
` (6 subsequent siblings)
9 siblings, 1 reply; 31+ messages in thread
From: Joe Damato @ 2024-10-09 0:54 UTC (permalink / raw)
To: netdev
Cc: mkarsten, skhawaja, sdf, bjorn, amritha.nambiar,
sridhar.samudrala, willemdebruijn.kernel, Joe Damato,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Jonathan Corbet, Jiri Pirko, Sebastian Andrzej Siewior,
Lorenzo Bianconi, Kory Maincent, Johannes Berg, Breno Leitao,
Alexander Lobakin, open list:DOCUMENTATION, open list
Allow per-NAPI gro_flush_timeout setting.
The existing sysfs parameter is respected; writes to sysfs will write to
all NAPI structs for the device and the net_device gro_flush_timeout
field. Reads from sysfs will read from the net_device field.
The ability to set gro_flush_timeout on specific NAPI instances will be
added in a later commit, via netdev-genl.
Signed-off-by: Joe Damato <jdamato@fastly.com>
---
.../networking/net_cachelines/net_device.rst | 2 +-
include/linux/netdevice.h | 3 +-
net/core/dev.c | 12 +++---
net/core/dev.h | 40 +++++++++++++++++++
net/core/net-sysfs.c | 2 +-
5 files changed, 50 insertions(+), 9 deletions(-)
diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst
index d78b1362f12a..3ab663b6cf16 100644
--- a/Documentation/networking/net_cachelines/net_device.rst
+++ b/Documentation/networking/net_cachelines/net_device.rst
@@ -99,7 +99,6 @@ struct_netdev_queue* _rx read_mostly
unsigned_int num_rx_queues
unsigned_int real_num_rx_queues - read_mostly get_rps_cpu
struct_bpf_prog* xdp_prog - read_mostly netif_elide_gro()
-unsigned_long gro_flush_timeout - read_mostly napi_complete_done
unsigned_int gro_max_size - read_mostly skb_gro_receive
unsigned_int gro_ipv4_max_size - read_mostly skb_gro_receive
rx_handler_func_t* rx_handler read_mostly - __netif_receive_skb_core
@@ -184,4 +183,5 @@ struct_dpll_pin* dpll_pin
struct hlist_head page_pools
struct dim_irq_moder* irq_moder
u64 max_pacing_offload_horizon
+unsigned_long gro_flush_timeout
u32 napi_defer_hard_irqs
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9042920cdd1a..4239a4a9d295 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -373,6 +373,7 @@ struct napi_struct {
unsigned int napi_id;
struct hrtimer timer;
struct task_struct *thread;
+ unsigned long gro_flush_timeout;
u32 defer_hard_irqs;
/* control-path-only fields follow */
struct list_head dev_list;
@@ -2077,7 +2078,6 @@ struct net_device {
int ifindex;
unsigned int real_num_rx_queues;
struct netdev_rx_queue *_rx;
- unsigned long gro_flush_timeout;
unsigned int gro_max_size;
unsigned int gro_ipv4_max_size;
rx_handler_func_t __rcu *rx_handler;
@@ -2405,6 +2405,7 @@ struct net_device {
struct dim_irq_moder *irq_moder;
u64 max_pacing_offload_horizon;
+ unsigned long gro_flush_timeout;
u32 napi_defer_hard_irqs;
u8 priv[] ____cacheline_aligned
diff --git a/net/core/dev.c b/net/core/dev.c
index 3487eec284a6..fca2295f4d95 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6230,12 +6230,12 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
if (work_done) {
if (n->gro_bitmask)
- timeout = READ_ONCE(n->dev->gro_flush_timeout);
+ timeout = napi_get_gro_flush_timeout(n);
n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n);
}
if (n->defer_hard_irqs_count > 0) {
n->defer_hard_irqs_count--;
- timeout = READ_ONCE(n->dev->gro_flush_timeout);
+ timeout = napi_get_gro_flush_timeout(n);
if (timeout)
ret = false;
}
@@ -6370,7 +6370,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
if (flags & NAPI_F_PREFER_BUSY_POLL) {
napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi);
- timeout = READ_ONCE(napi->dev->gro_flush_timeout);
+ timeout = napi_get_gro_flush_timeout(napi);
if (napi->defer_hard_irqs_count && timeout) {
hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
skip_schedule = true;
@@ -6652,6 +6652,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
napi->timer.function = napi_watchdog;
napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs));
+ napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout));
init_gro_hash(napi);
napi->skb = NULL;
INIT_LIST_HEAD(&napi->rx_list);
@@ -11057,7 +11058,7 @@ void netdev_sw_irq_coalesce_default_on(struct net_device *dev)
WARN_ON(dev->reg_state == NETREG_REGISTERED);
if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
- dev->gro_flush_timeout = 20000;
+ netdev_set_gro_flush_timeout(dev, 20000);
netdev_set_defer_hard_irqs(dev, 1);
}
}
@@ -11995,7 +11996,6 @@ static void __init net_dev_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx);
- CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_flush_timeout);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler);
@@ -12007,7 +12007,7 @@ static void __init net_dev_struct_check(void)
#ifdef CONFIG_NET_XGRESS
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress);
#endif
- CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 100);
+ CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 92);
}
/*
diff --git a/net/core/dev.h b/net/core/dev.h
index b3792219879b..26e598aa56c3 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -174,6 +174,46 @@ static inline void netdev_set_defer_hard_irqs(struct net_device *netdev,
napi_set_defer_hard_irqs(napi, defer);
}
+/**
+ * napi_get_gro_flush_timeout - get the gro_flush_timeout
+ * @n: napi struct to get the gro_flush_timeout from
+ *
+ * Return: the per-NAPI value of the gro_flush_timeout field.
+ */
+static inline unsigned long
+napi_get_gro_flush_timeout(const struct napi_struct *n)
+{
+ return READ_ONCE(n->gro_flush_timeout);
+}
+
+/**
+ * napi_set_gro_flush_timeout - set the gro_flush_timeout for a napi
+ * @n: napi struct to set the gro_flush_timeout
+ * @timeout: timeout value to set
+ *
+ * napi_set_gro_flush_timeout sets the per-NAPI gro_flush_timeout
+ */
+static inline void napi_set_gro_flush_timeout(struct napi_struct *n,
+ unsigned long timeout)
+{
+ WRITE_ONCE(n->gro_flush_timeout, timeout);
+}
+
+/**
+ * netdev_set_gro_flush_timeout - set gro_flush_timeout of a netdev's NAPIs
+ * @netdev: the net_device for which all NAPIs will have gro_flush_timeout set
+ * @timeout: the timeout value to set
+ */
+static inline void netdev_set_gro_flush_timeout(struct net_device *netdev,
+ unsigned long timeout)
+{
+ struct napi_struct *napi;
+
+ WRITE_ONCE(netdev->gro_flush_timeout, timeout);
+ list_for_each_entry(napi, &netdev->napi_list, dev_list)
+ napi_set_gro_flush_timeout(napi, timeout);
+}
+
int rps_cpumask_housekeeping(struct cpumask *mask);
#if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 25125f356a15..2d9afc6e2161 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -409,7 +409,7 @@ NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec);
static int change_gro_flush_timeout(struct net_device *dev, unsigned long val)
{
- WRITE_ONCE(dev->gro_flush_timeout, val);
+ netdev_set_gro_flush_timeout(dev, val);
return 0;
}
--
2.34.1
^ permalink raw reply related [flat|nested] 31+ messages in thread* Re: [net-next v5 3/9] net: napi: Make gro_flush_timeout per-NAPI
2024-10-09 0:54 ` [net-next v5 3/9] net: napi: Make gro_flush_timeout per-NAPI Joe Damato
@ 2024-10-10 4:13 ` Eric Dumazet
0 siblings, 0 replies; 31+ messages in thread
From: Eric Dumazet @ 2024-10-10 4:13 UTC (permalink / raw)
To: Joe Damato
Cc: netdev, mkarsten, skhawaja, sdf, bjorn, amritha.nambiar,
sridhar.samudrala, willemdebruijn.kernel, David S. Miller,
Jakub Kicinski, Paolo Abeni, Jonathan Corbet, Jiri Pirko,
Sebastian Andrzej Siewior, Lorenzo Bianconi, Kory Maincent,
Johannes Berg, Breno Leitao, Alexander Lobakin,
open list:DOCUMENTATION, open list
On Wed, Oct 9, 2024 at 2:56 AM Joe Damato <jdamato@fastly.com> wrote:
>
> Allow per-NAPI gro_flush_timeout setting.
>
> The existing sysfs parameter is respected; writes to sysfs will write to
> all NAPI structs for the device and the net_device gro_flush_timeout
> field. Reads from sysfs will read from the net_device field.
>
> The ability to set gro_flush_timeout on specific NAPI instances will be
> added in a later commit, via netdev-genl.
>
> Signed-off-by: Joe Damato <jdamato@fastly.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
^ permalink raw reply [flat|nested] 31+ messages in thread
* [net-next v5 4/9] netdev-genl: Dump gro_flush_timeout
2024-10-09 0:54 [net-next v5 0/9] Add support for per-NAPI config via netlink Joe Damato
` (2 preceding siblings ...)
2024-10-09 0:54 ` [net-next v5 3/9] net: napi: Make gro_flush_timeout per-NAPI Joe Damato
@ 2024-10-09 0:54 ` Joe Damato
2024-10-10 3:14 ` Jakub Kicinski
2024-10-09 0:54 ` [net-next v5 5/9] net: napi: Add napi_config Joe Damato
` (5 subsequent siblings)
9 siblings, 1 reply; 31+ messages in thread
From: Joe Damato @ 2024-10-09 0:54 UTC (permalink / raw)
To: netdev
Cc: mkarsten, skhawaja, sdf, bjorn, amritha.nambiar,
sridhar.samudrala, willemdebruijn.kernel, Joe Damato,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Donald Hunter, Jesper Dangaard Brouer, Mina Almasry, Xuan Zhuo,
open list
Support dumping gro_flush_timeout for a NAPI ID.
Signed-off-by: Joe Damato <jdamato@fastly.com>
---
Documentation/netlink/specs/netdev.yaml | 6 ++++++
include/uapi/linux/netdev.h | 1 +
net/core/netdev-genl.c | 6 ++++++
tools/include/uapi/linux/netdev.h | 1 +
4 files changed, 14 insertions(+)
diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
index 585e87ec3c16..bf13613eaa0d 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -255,6 +255,11 @@ attribute-sets:
type: u32
checks:
max: s32-max
+ -
+ name: gro-flush-timeout
+ doc: The timeout, in nanoseconds, of when to trigger the NAPI
+ watchdog timer and schedule NAPI processing.
+ type: uint
-
name: queue
attributes:
@@ -644,6 +649,7 @@ operations:
- irq
- pid
- defer-hard-irqs
+ - gro-flush-timeout
dump:
request:
attributes:
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index 13dc0b027e86..cacd33359c76 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -123,6 +123,7 @@ enum {
NETDEV_A_NAPI_IRQ,
NETDEV_A_NAPI_PID,
NETDEV_A_NAPI_DEFER_HARD_IRQS,
+ NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT,
__NETDEV_A_NAPI_MAX,
NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1)
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index de9bd76f43f8..64e5e4cee60d 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -161,6 +161,7 @@ static int
netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi,
const struct genl_info *info)
{
+ unsigned long gro_flush_timeout;
u32 napi_defer_hard_irqs;
void *hdr;
pid_t pid;
@@ -195,6 +196,11 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi,
napi_defer_hard_irqs))
goto nla_put_failure;
+ gro_flush_timeout = napi_get_gro_flush_timeout(napi);
+ if (nla_put_uint(rsp, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT,
+ gro_flush_timeout))
+ goto nla_put_failure;
+
genlmsg_end(rsp, hdr);
return 0;
diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index 13dc0b027e86..cacd33359c76 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -123,6 +123,7 @@ enum {
NETDEV_A_NAPI_IRQ,
NETDEV_A_NAPI_PID,
NETDEV_A_NAPI_DEFER_HARD_IRQS,
+ NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT,
__NETDEV_A_NAPI_MAX,
NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1)
--
2.34.1
^ permalink raw reply related [flat|nested] 31+ messages in thread* Re: [net-next v5 4/9] netdev-genl: Dump gro_flush_timeout
2024-10-09 0:54 ` [net-next v5 4/9] netdev-genl: Dump gro_flush_timeout Joe Damato
@ 2024-10-10 3:14 ` Jakub Kicinski
2024-10-10 4:34 ` Joe Damato
0 siblings, 1 reply; 31+ messages in thread
From: Jakub Kicinski @ 2024-10-10 3:14 UTC (permalink / raw)
To: Joe Damato
Cc: netdev, mkarsten, skhawaja, sdf, bjorn, amritha.nambiar,
sridhar.samudrala, willemdebruijn.kernel, David S. Miller,
Eric Dumazet, Paolo Abeni, Donald Hunter, Jesper Dangaard Brouer,
Mina Almasry, Xuan Zhuo, open list
On Wed, 9 Oct 2024 00:54:58 +0000 Joe Damato wrote:
> + name: gro-flush-timeout
> + doc: The timeout, in nanoseconds, of when to trigger the NAPI
> + watchdog timer and schedule NAPI processing.
You gotta respin because we reformatted the cacheline info.
So while at it perhaps throw in a sentence here about the GRO effects?
The initial use of GRO flush timeout was to hold incomplete GRO
super-frames in the GRO engine across NAPI cycles.
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [net-next v5 4/9] netdev-genl: Dump gro_flush_timeout
2024-10-10 3:14 ` Jakub Kicinski
@ 2024-10-10 4:34 ` Joe Damato
2024-10-10 4:45 ` Eric Dumazet
0 siblings, 1 reply; 31+ messages in thread
From: Joe Damato @ 2024-10-10 4:34 UTC (permalink / raw)
To: Jakub Kicinski
Cc: netdev, mkarsten, skhawaja, sdf, bjorn, amritha.nambiar,
sridhar.samudrala, willemdebruijn.kernel, David S. Miller,
Eric Dumazet, Paolo Abeni, Donald Hunter, Jesper Dangaard Brouer,
Mina Almasry, Xuan Zhuo, open list
On Wed, Oct 09, 2024 at 08:14:40PM -0700, Jakub Kicinski wrote:
> On Wed, 9 Oct 2024 00:54:58 +0000 Joe Damato wrote:
> > + name: gro-flush-timeout
> > + doc: The timeout, in nanoseconds, of when to trigger the NAPI
> > + watchdog timer and schedule NAPI processing.
>
> You gotta respin because we reformatted the cacheline info.
Yea, I figured I'd be racing with that change and would need a
respin.
I'm not sure how the queue works exactly, but it looks like I might
also be racing with another change [1], I think.
I think I'm just over 24hr and could respin and resend now, but
should I wait longer in case [1] is merged before you see my
respin?
Just trying to figure out how to get the fewest number of respins
possible ;)
> So while at it perhaps throw in a sentence here about the GRO effects?
> The initial use of GRO flush timeout was to hold incomplete GRO
> super-frames in the GRO engine across NAPI cycles.
From my reading of the code, if the timeout is non-zero, then
napi_gro_flush will flush only "old" super-frames in
napi_complete_done.
If that's accurate (and maybe I missed something?), then how about:
doc: The timeout, in nanoseconds, of when to trigger the NAPI
watchdog timer which schedules NAPI processing. Additionally, a
non-zero value will also prevent GRO from flushing recent
super-frames at the end of a NAPI cycle. This may add receive
latency in exchange for reducing the number of frames processed
by the network stack.
LMK if that's accurate and sounds OK or if it's wrong / too verbose?
[1]: https://lore.kernel.org/netdev/20241009232728.107604-1-edumazet@google.com/T/#m3f11aae53b3244037ac641ef36985c5e85e2ed5e
^ permalink raw reply [flat|nested] 31+ messages in thread* Re: [net-next v5 4/9] netdev-genl: Dump gro_flush_timeout
2024-10-10 4:34 ` Joe Damato
@ 2024-10-10 4:45 ` Eric Dumazet
2024-10-10 4:59 ` Joe Damato
0 siblings, 1 reply; 31+ messages in thread
From: Eric Dumazet @ 2024-10-10 4:45 UTC (permalink / raw)
To: Joe Damato, Jakub Kicinski, netdev, mkarsten, skhawaja, sdf,
bjorn, amritha.nambiar, sridhar.samudrala, willemdebruijn.kernel,
David S. Miller, Eric Dumazet, Paolo Abeni, Donald Hunter,
Jesper Dangaard Brouer, Mina Almasry, Xuan Zhuo, open list
On Thu, Oct 10, 2024 at 6:34 AM Joe Damato <jdamato@fastly.com> wrote:
>
> On Wed, Oct 09, 2024 at 08:14:40PM -0700, Jakub Kicinski wrote:
> > On Wed, 9 Oct 2024 00:54:58 +0000 Joe Damato wrote:
> > > + name: gro-flush-timeout
> > > + doc: The timeout, in nanoseconds, of when to trigger the NAPI
> > > + watchdog timer and schedule NAPI processing.
> >
> > You gotta respin because we reformatted the cacheline info.
>
> Yea, I figured I'd be racing with that change and would need a
> respin.
>
> I'm not sure how the queue works exactly, but it looks like I might
> also be racing with another change [1], I think.
>
> I think I'm just over 24hr and could respin and resend now, but
> should I wait longer in case [1] is merged before you see my
> respin?
I would avoid the rtnl_lock() addition in "netdev-genl: Support
setting per-NAPI config values"
before re-sending ?
>
> Just trying to figure out how to get the fewest number of respins
> possible ;)
>
> > So while at it perhaps throw in a sentence here about the GRO effects?
> > The initial use of GRO flush timeout was to hold incomplete GRO
> > super-frames in the GRO engine across NAPI cycles.
>
> From my reading of the code, if the timeout is non-zero, then
> napi_gro_flush will flush only "old" super-frames in
> napi_complete_done.
>
> If that's accurate (and maybe I missed something?), then how about:
>
> doc: The timeout, in nanoseconds, of when to trigger the NAPI
> watchdog timer which schedules NAPI processing. Additionally, a
> non-zero value will also prevent GRO from flushing recent
> super-frames at the end of a NAPI cycle. This may add receive
> latency in exchange for reducing the number of frames processed
> by the network stack.
Note that linux TCP always has a PSH flag at the end of each TSO packet,
so the latency increase is only possible in presence of tail drop,
if the last MSS (with the PSH) was dropped.
>
> LMK if that's accurate and sounds OK or if it's wrong / too verbose?
I do not think it is too verbose.
>
> [1]: https://lore.kernel.org/netdev/20241009232728.107604-1-edumazet@google.com/T/#m3f11aae53b3244037ac641ef36985c5e85e2ed5e
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [net-next v5 4/9] netdev-genl: Dump gro_flush_timeout
2024-10-10 4:45 ` Eric Dumazet
@ 2024-10-10 4:59 ` Joe Damato
0 siblings, 0 replies; 31+ messages in thread
From: Joe Damato @ 2024-10-10 4:59 UTC (permalink / raw)
To: Eric Dumazet
Cc: Jakub Kicinski, netdev, mkarsten, skhawaja, sdf, bjorn,
amritha.nambiar, sridhar.samudrala, willemdebruijn.kernel,
David S. Miller, Paolo Abeni, Donald Hunter,
Jesper Dangaard Brouer, Mina Almasry, Xuan Zhuo, open list
On Thu, Oct 10, 2024 at 06:45:11AM +0200, Eric Dumazet wrote:
> On Thu, Oct 10, 2024 at 6:34 AM Joe Damato <jdamato@fastly.com> wrote:
> >
> > On Wed, Oct 09, 2024 at 08:14:40PM -0700, Jakub Kicinski wrote:
> > > On Wed, 9 Oct 2024 00:54:58 +0000 Joe Damato wrote:
> > > > + name: gro-flush-timeout
> > > > + doc: The timeout, in nanoseconds, of when to trigger the NAPI
> > > > + watchdog timer and schedule NAPI processing.
> > >
> > > You gotta respin because we reformatted the cacheline info.
> >
> > Yea, I figured I'd be racing with that change and would need a
> > respin.
> >
> > I'm not sure how the queue works exactly, but it looks like I might
> > also be racing with another change [1], I think.
> >
> > I think I'm just over 24hr and could respin and resend now, but
> > should I wait longer in case [1] is merged before you see my
> > respin?
>
> I would avoid the rtnl_lock() addition in "netdev-genl: Support
> setting per-NAPI config values"
> before re-sending?
OK.
> >
> > Just trying to figure out how to get the fewest number of respins
> > possible ;)
> >
> > > So while at it perhaps throw in a sentence here about the GRO effects?
> > > The initial use of GRO flush timeout was to hold incomplete GRO
> > > super-frames in the GRO engine across NAPI cycles.
> >
> > From my reading of the code, if the timeout is non-zero, then
> > napi_gro_flush will flush only "old" super-frames in
> > napi_complete_done.
> >
> > If that's accurate (and maybe I missed something?), then how about:
> >
> > doc: The timeout, in nanoseconds, of when to trigger the NAPI
> > watchdog timer which schedules NAPI processing. Additionally, a
> > non-zero value will also prevent GRO from flushing recent
> > super-frames at the end of a NAPI cycle. This may add receive
> > latency in exchange for reducing the number of frames processed
> > by the network stack.
>
> Note that linux TCP always has a PSH flag at the end of each TSO packet,
> so the latency increase is only possible in presence of tail drop,
> if the last MSS (with the PSH) was dropped.
Would you like me to note that in the doc, as well?
^ permalink raw reply [flat|nested] 31+ messages in thread
* [net-next v5 5/9] net: napi: Add napi_config
2024-10-09 0:54 [net-next v5 0/9] Add support for per-NAPI config via netlink Joe Damato
` (3 preceding siblings ...)
2024-10-09 0:54 ` [net-next v5 4/9] netdev-genl: Dump gro_flush_timeout Joe Damato
@ 2024-10-09 0:54 ` Joe Damato
2024-10-10 4:20 ` Eric Dumazet
2024-10-09 0:55 ` [net-next v5 6/9] netdev-genl: Support setting per-NAPI config values Joe Damato
` (4 subsequent siblings)
9 siblings, 1 reply; 31+ messages in thread
From: Joe Damato @ 2024-10-09 0:54 UTC (permalink / raw)
To: netdev
Cc: mkarsten, skhawaja, sdf, bjorn, amritha.nambiar,
sridhar.samudrala, willemdebruijn.kernel, Joe Damato,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Jonathan Corbet, Jiri Pirko, Sebastian Andrzej Siewior,
Lorenzo Bianconi, Kory Maincent, Johannes Berg,
open list:DOCUMENTATION, open list
Add a persistent NAPI config area for NAPI configuration to the core.
Drivers opt-in to setting the persistent config for a NAPI by passing an
index when calling netif_napi_add_config.
napi_config is allocated in alloc_netdev_mqs, freed in free_netdev
(after the NAPIs are deleted).
Drivers which call netif_napi_add_config will have persistent per-NAPI
settings: NAPI IDs, gro_flush_timeout, and defer_hard_irq settings.
Per-NAPI settings are saved in napi_disable and restored in napi_enable.
Co-developed-by: Martin Karsten <mkarsten@uwaterloo.ca>
Signed-off-by: Martin Karsten <mkarsten@uwaterloo.ca>
Signed-off-by: Joe Damato <jdamato@fastly.com>
---
.../networking/net_cachelines/net_device.rst | 1 +
include/linux/netdevice.h | 36 ++++++++-
net/core/dev.c | 79 +++++++++++++++++--
net/core/dev.h | 12 +++
4 files changed, 118 insertions(+), 10 deletions(-)
diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst
index 3ab663b6cf16..9d86720cb722 100644
--- a/Documentation/networking/net_cachelines/net_device.rst
+++ b/Documentation/networking/net_cachelines/net_device.rst
@@ -183,5 +183,6 @@ struct_dpll_pin* dpll_pin
struct hlist_head page_pools
struct dim_irq_moder* irq_moder
u64 max_pacing_offload_horizon
+struct_napi_config* napi_config
unsigned_long gro_flush_timeout
u32 napi_defer_hard_irqs
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4239a4a9d295..b65a901ab4e7 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -342,6 +342,15 @@ struct gro_list {
*/
#define GRO_HASH_BUCKETS 8
+/*
+ * Structure for per-NAPI config
+ */
+struct napi_config {
+ u64 gro_flush_timeout;
+ u32 defer_hard_irqs;
+ unsigned int napi_id;
+};
+
/*
* Structure for NAPI scheduling similar to tasklet but with weighting
*/
@@ -379,6 +388,8 @@ struct napi_struct {
struct list_head dev_list;
struct hlist_node napi_hash_node;
int irq;
+ int index;
+ struct napi_config *config;
};
enum {
@@ -1860,9 +1871,6 @@ enum netdev_reg_state {
* allocated at register_netdev() time
* @real_num_rx_queues: Number of RX queues currently active in device
* @xdp_prog: XDP sockets filter program pointer
- * @gro_flush_timeout: timeout for GRO layer in NAPI
- * @napi_defer_hard_irqs: If not zero, provides a counter that would
- * allow to avoid NIC hard IRQ, on busy queues.
*
* @rx_handler: handler for received packets
* @rx_handler_data: XXX: need comments on this one
@@ -2012,6 +2020,11 @@ enum netdev_reg_state {
* where the clock is recovered.
*
* @max_pacing_offload_horizon: max EDT offload horizon in nsec.
+ * @napi_config: An array of napi_config structures containing per-NAPI
+ * settings.
+ * @gro_flush_timeout: timeout for GRO layer in NAPI
+ * @napi_defer_hard_irqs: If not zero, provides a counter that would
+ * allow to avoid NIC hard IRQ, on busy queues.
*
* FIXME: cleanup struct net_device such that network protocol info
* moves out.
@@ -2405,6 +2418,7 @@ struct net_device {
struct dim_irq_moder *irq_moder;
u64 max_pacing_offload_horizon;
+ struct napi_config *napi_config;
unsigned long gro_flush_timeout;
u32 napi_defer_hard_irqs;
@@ -2657,6 +2671,22 @@ netif_napi_add_tx_weight(struct net_device *dev,
netif_napi_add_weight(dev, napi, poll, weight);
}
+/**
+ * netif_napi_add_config - initialize a NAPI context with persistent config
+ * @dev: network device
+ * @napi: NAPI context
+ * @poll: polling function
+ * @index: the NAPI index
+ */
+static inline void
+netif_napi_add_config(struct net_device *dev, struct napi_struct *napi,
+ int (*poll)(struct napi_struct *, int), int index)
+{
+ napi->index = index;
+ napi->config = &dev->napi_config[index];
+ netif_napi_add_weight(dev, napi, poll, NAPI_POLL_WEIGHT);
+}
+
/**
* netif_napi_add_tx() - initialize a NAPI context to be used for Tx only
* @dev: network device
diff --git a/net/core/dev.c b/net/core/dev.c
index fca2295f4d95..bd87232f7b37 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6503,6 +6503,22 @@ EXPORT_SYMBOL(napi_busy_loop);
#endif /* CONFIG_NET_RX_BUSY_POLL */
+static void __napi_hash_add_with_id(struct napi_struct *napi,
+ unsigned int napi_id)
+{
+ napi->napi_id = napi_id;
+ hlist_add_head_rcu(&napi->napi_hash_node,
+ &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
+}
+
+static void napi_hash_add_with_id(struct napi_struct *napi,
+ unsigned int napi_id)
+{
+ spin_lock(&napi_hash_lock);
+ __napi_hash_add_with_id(napi, napi_id);
+ spin_unlock(&napi_hash_lock);
+}
+
static void napi_hash_add(struct napi_struct *napi)
{
if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
@@ -6515,10 +6531,8 @@ static void napi_hash_add(struct napi_struct *napi)
if (unlikely(++napi_gen_id < MIN_NAPI_ID))
napi_gen_id = MIN_NAPI_ID;
} while (napi_by_id(napi_gen_id));
- napi->napi_id = napi_gen_id;
- hlist_add_head_rcu(&napi->napi_hash_node,
- &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
+ __napi_hash_add_with_id(napi, napi_gen_id);
spin_unlock(&napi_hash_lock);
}
@@ -6641,6 +6655,28 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
}
EXPORT_SYMBOL(netif_queue_set_napi);
+static void napi_restore_config(struct napi_struct *n)
+{
+ n->defer_hard_irqs = n->config->defer_hard_irqs;
+ n->gro_flush_timeout = n->config->gro_flush_timeout;
+ /* a NAPI ID might be stored in the config, if so use it. if not, use
+ * napi_hash_add to generate one for us. It will be saved to the config
+ * in napi_disable.
+ */
+ if (n->config->napi_id)
+ napi_hash_add_with_id(n, n->config->napi_id);
+ else
+ napi_hash_add(n);
+}
+
+static void napi_save_config(struct napi_struct *n)
+{
+ n->config->defer_hard_irqs = n->defer_hard_irqs;
+ n->config->gro_flush_timeout = n->gro_flush_timeout;
+ n->config->napi_id = n->napi_id;
+ napi_hash_del(n);
+}
+
void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
@@ -6651,8 +6687,6 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
INIT_HLIST_NODE(&napi->napi_hash_node);
hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
napi->timer.function = napi_watchdog;
- napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs));
- napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout));
init_gro_hash(napi);
napi->skb = NULL;
INIT_LIST_HEAD(&napi->rx_list);
@@ -6670,7 +6704,13 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
set_bit(NAPI_STATE_SCHED, &napi->state);
set_bit(NAPI_STATE_NPSVC, &napi->state);
list_add_rcu(&napi->dev_list, &dev->napi_list);
- napi_hash_add(napi);
+
+ /* default settings from sysfs are applied to all NAPIs. any per-NAPI
+ * configuration will be loaded in napi_enable
+ */
+ napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs));
+ napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout));
+
napi_get_frags_check(napi);
/* Create kthread for this napi if dev->threaded is set.
* Clear dev->threaded if kthread creation failed so that
@@ -6702,6 +6742,11 @@ void napi_disable(struct napi_struct *n)
hrtimer_cancel(&n->timer);
+ if (n->config)
+ napi_save_config(n);
+ else
+ napi_hash_del(n);
+
clear_bit(NAPI_STATE_DISABLE, &n->state);
}
EXPORT_SYMBOL(napi_disable);
@@ -6717,6 +6762,11 @@ void napi_enable(struct napi_struct *n)
{
unsigned long new, val = READ_ONCE(n->state);
+ if (n->config)
+ napi_restore_config(n);
+ else
+ napi_hash_add(n);
+
do {
BUG_ON(!test_bit(NAPI_STATE_SCHED, &val));
@@ -6746,7 +6796,11 @@ void __netif_napi_del(struct napi_struct *napi)
if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
return;
- napi_hash_del(napi);
+ if (napi->config) {
+ napi->index = -1;
+ napi->config = NULL;
+ }
+
list_del_rcu(&napi->dev_list);
napi_free_frags(napi);
@@ -11083,6 +11137,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
unsigned int txqs, unsigned int rxqs)
{
struct net_device *dev;
+ size_t napi_config_sz;
+ unsigned int maxqs;
BUG_ON(strlen(name) >= sizeof(dev->name));
@@ -11096,6 +11152,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
return NULL;
}
+ maxqs = max(txqs, rxqs);
+
dev = kvzalloc(struct_size(dev, priv, sizeof_priv),
GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
if (!dev)
@@ -11170,6 +11228,11 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
if (!dev->ethtool)
goto free_all;
+ napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config));
+ dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT);
+ if (!dev->napi_config)
+ goto free_all;
+
strscpy(dev->name, name);
dev->name_assign_type = name_assign_type;
dev->group = INIT_NETDEV_GROUP;
@@ -11231,6 +11294,8 @@ void free_netdev(struct net_device *dev)
list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
netif_napi_del(p);
+ kvfree(dev->napi_config);
+
ref_tracker_dir_exit(&dev->refcnt_tracker);
#ifdef CONFIG_PCPU_DEV_REFCNT
free_percpu(dev->pcpu_refcnt);
diff --git a/net/core/dev.h b/net/core/dev.h
index 26e598aa56c3..f22cb532de7a 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -167,11 +167,17 @@ static inline void napi_set_defer_hard_irqs(struct napi_struct *n, u32 defer)
static inline void netdev_set_defer_hard_irqs(struct net_device *netdev,
u32 defer)
{
+ unsigned int count = max(netdev->num_rx_queues,
+ netdev->num_tx_queues);
struct napi_struct *napi;
+ int i;
WRITE_ONCE(netdev->napi_defer_hard_irqs, defer);
list_for_each_entry(napi, &netdev->napi_list, dev_list)
napi_set_defer_hard_irqs(napi, defer);
+
+ for (i = 0; i < count; i++)
+ netdev->napi_config[i].defer_hard_irqs = defer;
}
/**
@@ -207,11 +213,17 @@ static inline void napi_set_gro_flush_timeout(struct napi_struct *n,
static inline void netdev_set_gro_flush_timeout(struct net_device *netdev,
unsigned long timeout)
{
+ unsigned int count = max(netdev->num_rx_queues,
+ netdev->num_tx_queues);
struct napi_struct *napi;
+ int i;
WRITE_ONCE(netdev->gro_flush_timeout, timeout);
list_for_each_entry(napi, &netdev->napi_list, dev_list)
napi_set_gro_flush_timeout(napi, timeout);
+
+ for (i = 0; i < count; i++)
+ netdev->napi_config[i].gro_flush_timeout = timeout;
}
int rps_cpumask_housekeeping(struct cpumask *mask);
--
2.34.1
^ permalink raw reply related [flat|nested] 31+ messages in thread* Re: [net-next v5 5/9] net: napi: Add napi_config
2024-10-09 0:54 ` [net-next v5 5/9] net: napi: Add napi_config Joe Damato
@ 2024-10-10 4:20 ` Eric Dumazet
2024-10-10 15:59 ` Joe Damato
0 siblings, 1 reply; 31+ messages in thread
From: Eric Dumazet @ 2024-10-10 4:20 UTC (permalink / raw)
To: Joe Damato
Cc: netdev, mkarsten, skhawaja, sdf, bjorn, amritha.nambiar,
sridhar.samudrala, willemdebruijn.kernel, David S. Miller,
Jakub Kicinski, Paolo Abeni, Jonathan Corbet, Jiri Pirko,
Sebastian Andrzej Siewior, Lorenzo Bianconi, Kory Maincent,
Johannes Berg, open list:DOCUMENTATION, open list
On Wed, Oct 9, 2024 at 2:56 AM Joe Damato <jdamato@fastly.com> wrote:
>
> Add a persistent NAPI config area for NAPI configuration to the core.
> Drivers opt-in to setting the persistent config for a NAPI by passing an
> index when calling netif_napi_add_config.
>
> napi_config is allocated in alloc_netdev_mqs, freed in free_netdev
> (after the NAPIs are deleted).
>
> Drivers which call netif_napi_add_config will have persistent per-NAPI
> settings: NAPI IDs, gro_flush_timeout, and defer_hard_irq settings.
>
> Per-NAPI settings are saved in napi_disable and restored in napi_enable.
>
> Co-developed-by: Martin Karsten <mkarsten@uwaterloo.ca>
> Signed-off-by: Martin Karsten <mkarsten@uwaterloo.ca>
> Signed-off-by: Joe Damato <jdamato@fastly.com>
> ---
> .../networking/net_cachelines/net_device.rst | 1 +
> include/linux/netdevice.h | 36 ++++++++-
> net/core/dev.c | 79 +++++++++++++++++--
> net/core/dev.h | 12 +++
> 4 files changed, 118 insertions(+), 10 deletions(-)
>
> diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst
> index 3ab663b6cf16..9d86720cb722 100644
> --- a/Documentation/networking/net_cachelines/net_device.rst
> +++ b/Documentation/networking/net_cachelines/net_device.rst
> @@ -183,5 +183,6 @@ struct_dpll_pin* dpll_pin
> struct hlist_head page_pools
> struct dim_irq_moder* irq_moder
> u64 max_pacing_offload_horizon
> +struct_napi_config* napi_config
> unsigned_long gro_flush_timeout
> u32 napi_defer_hard_irqs
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 4239a4a9d295..b65a901ab4e7 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -342,6 +342,15 @@ struct gro_list {
> */
> #define GRO_HASH_BUCKETS 8
>
> +/*
> + * Structure for per-NAPI config
> + */
> +struct napi_config {
> + u64 gro_flush_timeout;
> + u32 defer_hard_irqs;
> + unsigned int napi_id;
> +};
> +
> /*
> * Structure for NAPI scheduling similar to tasklet but with weighting
> */
> @@ -379,6 +388,8 @@ struct napi_struct {
> struct list_head dev_list;
> struct hlist_node napi_hash_node;
> int irq;
> + int index;
> + struct napi_config *config;
> };
>
> enum {
> @@ -1860,9 +1871,6 @@ enum netdev_reg_state {
> * allocated at register_netdev() time
> * @real_num_rx_queues: Number of RX queues currently active in device
> * @xdp_prog: XDP sockets filter program pointer
> - * @gro_flush_timeout: timeout for GRO layer in NAPI
> - * @napi_defer_hard_irqs: If not zero, provides a counter that would
> - * allow to avoid NIC hard IRQ, on busy queues.
> *
> * @rx_handler: handler for received packets
> * @rx_handler_data: XXX: need comments on this one
> @@ -2012,6 +2020,11 @@ enum netdev_reg_state {
> * where the clock is recovered.
> *
> * @max_pacing_offload_horizon: max EDT offload horizon in nsec.
> + * @napi_config: An array of napi_config structures containing per-NAPI
> + * settings.
> + * @gro_flush_timeout: timeout for GRO layer in NAPI
> + * @napi_defer_hard_irqs: If not zero, provides a counter that would
> + * allow to avoid NIC hard IRQ, on busy queues.
> *
> * FIXME: cleanup struct net_device such that network protocol info
> * moves out.
> @@ -2405,6 +2418,7 @@ struct net_device {
> struct dim_irq_moder *irq_moder;
>
> u64 max_pacing_offload_horizon;
> + struct napi_config *napi_config;
> unsigned long gro_flush_timeout;
> u32 napi_defer_hard_irqs;
>
> @@ -2657,6 +2671,22 @@ netif_napi_add_tx_weight(struct net_device *dev,
> netif_napi_add_weight(dev, napi, poll, weight);
> }
>
> +/**
> + * netif_napi_add_config - initialize a NAPI context with persistent config
> + * @dev: network device
> + * @napi: NAPI context
> + * @poll: polling function
> + * @index: the NAPI index
> + */
> +static inline void
> +netif_napi_add_config(struct net_device *dev, struct napi_struct *napi,
> + int (*poll)(struct napi_struct *, int), int index)
> +{
> + napi->index = index;
> + napi->config = &dev->napi_config[index];
> + netif_napi_add_weight(dev, napi, poll, NAPI_POLL_WEIGHT);
> +}
> +
> /**
> * netif_napi_add_tx() - initialize a NAPI context to be used for Tx only
> * @dev: network device
> diff --git a/net/core/dev.c b/net/core/dev.c
> index fca2295f4d95..bd87232f7b37 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -6503,6 +6503,22 @@ EXPORT_SYMBOL(napi_busy_loop);
>
> #endif /* CONFIG_NET_RX_BUSY_POLL */
>
> +static void __napi_hash_add_with_id(struct napi_struct *napi,
> + unsigned int napi_id)
> +{
> + napi->napi_id = napi_id;
> + hlist_add_head_rcu(&napi->napi_hash_node,
> + &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
> +}
> +
> +static void napi_hash_add_with_id(struct napi_struct *napi,
> + unsigned int napi_id)
> +{
> + spin_lock(&napi_hash_lock);
> + __napi_hash_add_with_id(napi, napi_id);
Hmmm... there is no check if 'napi_id' is already used and hashed.
I would add
WARN_ON_ONCE(napi_by_id(napi_id));
> + spin_unlock(&napi_hash_lock);
> +}
> +
> static void napi_hash_add(struct napi_struct *napi)
> {
> if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
> @@ -6515,10 +6531,8 @@ static void napi_hash_add(struct napi_struct *napi)
> if (unlikely(++napi_gen_id < MIN_NAPI_ID))
> napi_gen_id = MIN_NAPI_ID;
> } while (napi_by_id(napi_gen_id));
> - napi->napi_id = napi_gen_id;
>
> - hlist_add_head_rcu(&napi->napi_hash_node,
> - &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
> + __napi_hash_add_with_id(napi, napi_gen_id);
>
> spin_unlock(&napi_hash_lock);
> }
>
^ permalink raw reply [flat|nested] 31+ messages in thread* Re: [net-next v5 5/9] net: napi: Add napi_config
2024-10-10 4:20 ` Eric Dumazet
@ 2024-10-10 15:59 ` Joe Damato
2024-10-10 16:08 ` Jakub Kicinski
0 siblings, 1 reply; 31+ messages in thread
From: Joe Damato @ 2024-10-10 15:59 UTC (permalink / raw)
To: Eric Dumazet
Cc: netdev, mkarsten, skhawaja, sdf, bjorn, amritha.nambiar,
sridhar.samudrala, willemdebruijn.kernel, David S. Miller,
Jakub Kicinski, Paolo Abeni, Jonathan Corbet, Jiri Pirko,
Sebastian Andrzej Siewior, Lorenzo Bianconi, Kory Maincent,
Johannes Berg, open list:DOCUMENTATION, open list
On Thu, Oct 10, 2024 at 06:20:57AM +0200, Eric Dumazet wrote:
> On Wed, Oct 9, 2024 at 2:56 AM Joe Damato <jdamato@fastly.com> wrote:
[...]
> > +
> > +static void napi_hash_add_with_id(struct napi_struct *napi,
> > + unsigned int napi_id)
> > +{
> > + spin_lock(&napi_hash_lock);
> > + __napi_hash_add_with_id(napi, napi_id);
>
> Hmmm... there is no check if 'napi_id' is already used and hashed.
>
> I would add
>
> WARN_ON_ONCE(napi_by_id(napi_id));
Thanks for the careful review, Eric.
I agree that adding this is a good idea and I will do so for the v6.
Jakub: I hope that it's OK if I retain your Reviewed-by tag as the
change Eric suggested is a minor one?
If you'd prefer I drop your Reviewed-by from this one, please let me
know.
^ permalink raw reply [flat|nested] 31+ messages in thread* Re: [net-next v5 5/9] net: napi: Add napi_config
2024-10-10 15:59 ` Joe Damato
@ 2024-10-10 16:08 ` Jakub Kicinski
0 siblings, 0 replies; 31+ messages in thread
From: Jakub Kicinski @ 2024-10-10 16:08 UTC (permalink / raw)
To: Joe Damato
Cc: Eric Dumazet, netdev, mkarsten, skhawaja, sdf, bjorn,
amritha.nambiar, sridhar.samudrala, willemdebruijn.kernel,
David S. Miller, Paolo Abeni, Jonathan Corbet, Jiri Pirko,
Sebastian Andrzej Siewior, Lorenzo Bianconi, Kory Maincent,
Johannes Berg, open list:DOCUMENTATION, open list
On Thu, 10 Oct 2024 08:59:29 -0700 Joe Damato wrote:
> Jakub: I hope that it's OK if I retain your Reviewed-by tag as the
> change Eric suggested is a minor one?
Yes
^ permalink raw reply [flat|nested] 31+ messages in thread
* [net-next v5 6/9] netdev-genl: Support setting per-NAPI config values
2024-10-09 0:54 [net-next v5 0/9] Add support for per-NAPI config via netlink Joe Damato
` (4 preceding siblings ...)
2024-10-09 0:54 ` [net-next v5 5/9] net: napi: Add napi_config Joe Damato
@ 2024-10-09 0:55 ` Joe Damato
2024-10-10 4:24 ` Eric Dumazet
2024-10-09 0:55 ` [net-next v5 7/9] bnxt: Add support for persistent NAPI config Joe Damato
` (3 subsequent siblings)
9 siblings, 1 reply; 31+ messages in thread
From: Joe Damato @ 2024-10-09 0:55 UTC (permalink / raw)
To: netdev
Cc: mkarsten, skhawaja, sdf, bjorn, amritha.nambiar,
sridhar.samudrala, willemdebruijn.kernel, Joe Damato,
Donald Hunter, Jakub Kicinski, David S. Miller, Eric Dumazet,
Paolo Abeni, Jesper Dangaard Brouer, Mina Almasry, Xuan Zhuo,
open list
Add support to set per-NAPI defer_hard_irqs and gro_flush_timeout.
Signed-off-by: Joe Damato <jdamato@fastly.com>
---
Documentation/netlink/specs/netdev.yaml | 11 ++++++
include/uapi/linux/netdev.h | 1 +
net/core/netdev-genl-gen.c | 18 ++++++++++
net/core/netdev-genl-gen.h | 1 +
net/core/netdev-genl.c | 45 +++++++++++++++++++++++++
tools/include/uapi/linux/netdev.h | 1 +
6 files changed, 77 insertions(+)
diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
index bf13613eaa0d..7b4ea5a6e73d 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -690,6 +690,17 @@ operations:
reply:
attributes:
- id
+ -
+ name: napi-set
+ doc: Set configurable NAPI instance settings.
+ attribute-set: napi
+ flags: [ admin-perm ]
+ do:
+ request:
+ attributes:
+ - id
+ - defer-hard-irqs
+ - gro-flush-timeout
kernel-family:
headers: [ "linux/list.h"]
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index cacd33359c76..e3ebb49f60d2 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -201,6 +201,7 @@ enum {
NETDEV_CMD_NAPI_GET,
NETDEV_CMD_QSTATS_GET,
NETDEV_CMD_BIND_RX,
+ NETDEV_CMD_NAPI_SET,
__NETDEV_CMD_MAX,
NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)
diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c
index b28424ae06d5..e197bd84997c 100644
--- a/net/core/netdev-genl-gen.c
+++ b/net/core/netdev-genl-gen.c
@@ -22,6 +22,10 @@ static const struct netlink_range_validation netdev_a_page_pool_ifindex_range =
.max = 2147483647ULL,
};
+static const struct netlink_range_validation netdev_a_napi_defer_hard_irqs_range = {
+ .max = 2147483647ULL,
+};
+
/* Common nested types */
const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1] = {
[NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range),
@@ -87,6 +91,13 @@ static const struct nla_policy netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_FD + 1]
[NETDEV_A_DMABUF_QUEUES] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy),
};
+/* NETDEV_CMD_NAPI_SET - do */
+static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT + 1] = {
+ [NETDEV_A_NAPI_ID] = { .type = NLA_U32, },
+ [NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range),
+ [NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, },
+};
+
/* Ops table for netdev */
static const struct genl_split_ops netdev_nl_ops[] = {
{
@@ -171,6 +182,13 @@ static const struct genl_split_ops netdev_nl_ops[] = {
.maxattr = NETDEV_A_DMABUF_FD,
.flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
},
+ {
+ .cmd = NETDEV_CMD_NAPI_SET,
+ .doit = netdev_nl_napi_set_doit,
+ .policy = netdev_napi_set_nl_policy,
+ .maxattr = NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
};
static const struct genl_multicast_group netdev_nl_mcgrps[] = {
diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h
index 8cda334fd042..e09dd7539ff2 100644
--- a/net/core/netdev-genl-gen.h
+++ b/net/core/netdev-genl-gen.h
@@ -33,6 +33,7 @@ int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,
struct netlink_callback *cb);
int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info);
+int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info);
enum {
NETDEV_NLGRP_MGMT,
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index 64e5e4cee60d..59523318d620 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -303,6 +303,51 @@ int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
return err;
}
+static int
+netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info)
+{
+ u64 gro_flush_timeout = 0;
+ u32 defer = 0;
+
+ if (info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]) {
+ defer = nla_get_u32(info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]);
+ napi_set_defer_hard_irqs(napi, defer);
+ }
+
+ if (info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]) {
+ gro_flush_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]);
+ napi_set_gro_flush_timeout(napi, gro_flush_timeout);
+ }
+
+ return 0;
+}
+
+int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct napi_struct *napi;
+ unsigned int napi_id;
+ int err;
+
+ if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID))
+ return -EINVAL;
+
+ napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]);
+
+ rtnl_lock();
+
+ napi = napi_by_id(napi_id);
+ if (napi) {
+ err = netdev_nl_napi_set_config(napi, info);
+ } else {
+ NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]);
+ err = -ENOENT;
+ }
+
+ rtnl_unlock();
+
+ return err;
+}
+
static int
netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev,
u32 q_idx, u32 q_type, const struct genl_info *info)
diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index cacd33359c76..e3ebb49f60d2 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -201,6 +201,7 @@ enum {
NETDEV_CMD_NAPI_GET,
NETDEV_CMD_QSTATS_GET,
NETDEV_CMD_BIND_RX,
+ NETDEV_CMD_NAPI_SET,
__NETDEV_CMD_MAX,
NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)
--
2.34.1
^ permalink raw reply related [flat|nested] 31+ messages in thread* Re: [net-next v5 6/9] netdev-genl: Support setting per-NAPI config values
2024-10-09 0:55 ` [net-next v5 6/9] netdev-genl: Support setting per-NAPI config values Joe Damato
@ 2024-10-10 4:24 ` Eric Dumazet
2024-10-10 15:19 ` Jakub Kicinski
0 siblings, 1 reply; 31+ messages in thread
From: Eric Dumazet @ 2024-10-10 4:24 UTC (permalink / raw)
To: Joe Damato
Cc: netdev, mkarsten, skhawaja, sdf, bjorn, amritha.nambiar,
sridhar.samudrala, willemdebruijn.kernel, Donald Hunter,
Jakub Kicinski, David S. Miller, Paolo Abeni,
Jesper Dangaard Brouer, Mina Almasry, Xuan Zhuo, open list
On Wed, Oct 9, 2024 at 2:56 AM Joe Damato <jdamato@fastly.com> wrote:
>
> Add support to set per-NAPI defer_hard_irqs and gro_flush_timeout.
>
> Signed-off-by: Joe Damato <jdamato@fastly.com>
> ---
> Documentation/netlink/specs/netdev.yaml | 11 ++++++
> include/uapi/linux/netdev.h | 1 +
> net/core/netdev-genl-gen.c | 18 ++++++++++
> net/core/netdev-genl-gen.h | 1 +
> net/core/netdev-genl.c | 45 +++++++++++++++++++++++++
> tools/include/uapi/linux/netdev.h | 1 +
> 6 files changed, 77 insertions(+)
>
> diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
> index bf13613eaa0d..7b4ea5a6e73d 100644
> --- a/Documentation/netlink/specs/netdev.yaml
> +++ b/Documentation/netlink/specs/netdev.yaml
> @@ -690,6 +690,17 @@ operations:
> reply:
> attributes:
> - id
> + -
> + name: napi-set
> + doc: Set configurable NAPI instance settings.
> + attribute-set: napi
> + flags: [ admin-perm ]
> + do:
> + request:
> + attributes:
> + - id
> + - defer-hard-irqs
> + - gro-flush-timeout
>
> kernel-family:
> headers: [ "linux/list.h"]
> diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
> index cacd33359c76..e3ebb49f60d2 100644
> --- a/include/uapi/linux/netdev.h
> +++ b/include/uapi/linux/netdev.h
> @@ -201,6 +201,7 @@ enum {
> NETDEV_CMD_NAPI_GET,
> NETDEV_CMD_QSTATS_GET,
> NETDEV_CMD_BIND_RX,
> + NETDEV_CMD_NAPI_SET,
>
> __NETDEV_CMD_MAX,
> NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)
> diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c
> index b28424ae06d5..e197bd84997c 100644
> --- a/net/core/netdev-genl-gen.c
> +++ b/net/core/netdev-genl-gen.c
> @@ -22,6 +22,10 @@ static const struct netlink_range_validation netdev_a_page_pool_ifindex_range =
> .max = 2147483647ULL,
> };
>
> +static const struct netlink_range_validation netdev_a_napi_defer_hard_irqs_range = {
> + .max = 2147483647ULL,
Would (u64)INT_MAX work ?
> +};
> +
> /* Common nested types */
> const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1] = {
> [NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range),
> @@ -87,6 +91,13 @@ static const struct nla_policy netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_FD + 1]
> [NETDEV_A_DMABUF_QUEUES] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy),
> };
>
> +/* NETDEV_CMD_NAPI_SET - do */
> +static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT + 1] = {
> + [NETDEV_A_NAPI_ID] = { .type = NLA_U32, },
> + [NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range),
> + [NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, },
> +};
> +
> /* Ops table for netdev */
> static const struct genl_split_ops netdev_nl_ops[] = {
> {
> @@ -171,6 +182,13 @@ static const struct genl_split_ops netdev_nl_ops[] = {
> .maxattr = NETDEV_A_DMABUF_FD,
> .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
> },
> + {
> + .cmd = NETDEV_CMD_NAPI_SET,
> + .doit = netdev_nl_napi_set_doit,
> + .policy = netdev_napi_set_nl_policy,
> + .maxattr = NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT,
> + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
> + },
> };
>
> static const struct genl_multicast_group netdev_nl_mcgrps[] = {
> diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h
> index 8cda334fd042..e09dd7539ff2 100644
> --- a/net/core/netdev-genl-gen.h
> +++ b/net/core/netdev-genl-gen.h
> @@ -33,6 +33,7 @@ int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
> int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,
> struct netlink_callback *cb);
> int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info);
> +int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info);
>
> enum {
> NETDEV_NLGRP_MGMT,
> diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
> index 64e5e4cee60d..59523318d620 100644
> --- a/net/core/netdev-genl.c
> +++ b/net/core/netdev-genl.c
> @@ -303,6 +303,51 @@ int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
> return err;
> }
>
> +static int
> +netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info)
> +{
> + u64 gro_flush_timeout = 0;
> + u32 defer = 0;
> +
> + if (info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]) {
> + defer = nla_get_u32(info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]);
> + napi_set_defer_hard_irqs(napi, defer);
> + }
> +
> + if (info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]) {
> + gro_flush_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]);
> + napi_set_gro_flush_timeout(napi, gro_flush_timeout);
> + }
> +
> + return 0;
> +}
> +
> +int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info)
> +{
> + struct napi_struct *napi;
> + unsigned int napi_id;
> + int err;
> +
> + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID))
> + return -EINVAL;
> +
> + napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]);
> +
> + rtnl_lock();
Hmm.... please see my patch there :
https://patchwork.kernel.org/project/netdevbpf/patch/20241009232728.107604-2-edumazet@google.com/
Let's not add another rtnl_lock() :/
> +
> + napi = napi_by_id(napi_id);
> + if (napi) {
> + err = netdev_nl_napi_set_config(napi, info);
> + } else {
> + NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]);
> + err = -ENOENT;
> + }
> +
> + rtnl_unlock();
> +
> + return err;
> +}
> +
> static int
> netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev,
> u32 q_idx, u32 q_type, const struct genl_info *info)
> diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
> index cacd33359c76..e3ebb49f60d2 100644
> --- a/tools/include/uapi/linux/netdev.h
> +++ b/tools/include/uapi/linux/netdev.h
> @@ -201,6 +201,7 @@ enum {
> NETDEV_CMD_NAPI_GET,
> NETDEV_CMD_QSTATS_GET,
> NETDEV_CMD_BIND_RX,
> + NETDEV_CMD_NAPI_SET,
>
> __NETDEV_CMD_MAX,
> NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)
> --
> 2.34.1
>
^ permalink raw reply [flat|nested] 31+ messages in thread* Re: [net-next v5 6/9] netdev-genl: Support setting per-NAPI config values
2024-10-10 4:24 ` Eric Dumazet
@ 2024-10-10 15:19 ` Jakub Kicinski
2024-10-10 15:30 ` Eric Dumazet
0 siblings, 1 reply; 31+ messages in thread
From: Jakub Kicinski @ 2024-10-10 15:19 UTC (permalink / raw)
To: Eric Dumazet
Cc: Joe Damato, netdev, mkarsten, skhawaja, sdf, bjorn,
amritha.nambiar, sridhar.samudrala, willemdebruijn.kernel,
Donald Hunter, David S. Miller, Paolo Abeni,
Jesper Dangaard Brouer, Mina Almasry, Xuan Zhuo, open list
On Thu, 10 Oct 2024 06:24:54 +0200 Eric Dumazet wrote:
> > +static const struct netlink_range_validation netdev_a_napi_defer_hard_irqs_range = {
> > + .max = 2147483647ULL,
>
> Would (u64)INT_MAX work ?
I sent a codegen change for this. The codegen is a bit of a mess.
> > +int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info)
> > +{
> > + struct napi_struct *napi;
> > + unsigned int napi_id;
> > + int err;
> > +
> > + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID))
> > + return -EINVAL;
> > +
> > + napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]);
> > +
> > + rtnl_lock();
>
> Hmm.... please see my patch there :
>
> https://patchwork.kernel.org/project/netdevbpf/patch/20241009232728.107604-2-edumazet@google.com/
>
> Lets not add another rtnl_lock() :/
It's not as easy since NAPIs can come and go at driver's whim.
I'm quietly hoping we can convert all netdev-nl NAPI accesses
to use the netdev->lock protection I strong-armed Paolo into
adding in his shaper series. But perhaps we can do that after
this series? NAPI GET already takes RTNL lock.
^ permalink raw reply [flat|nested] 31+ messages in thread* Re: [net-next v5 6/9] netdev-genl: Support setting per-NAPI config values
2024-10-10 15:19 ` Jakub Kicinski
@ 2024-10-10 15:30 ` Eric Dumazet
2024-10-10 16:40 ` Joe Damato
0 siblings, 1 reply; 31+ messages in thread
From: Eric Dumazet @ 2024-10-10 15:30 UTC (permalink / raw)
To: Jakub Kicinski
Cc: Joe Damato, netdev, mkarsten, skhawaja, sdf, bjorn,
amritha.nambiar, sridhar.samudrala, willemdebruijn.kernel,
Donald Hunter, David S. Miller, Paolo Abeni,
Jesper Dangaard Brouer, Mina Almasry, Xuan Zhuo, open list
On Thu, Oct 10, 2024 at 5:19 PM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Thu, 10 Oct 2024 06:24:54 +0200 Eric Dumazet wrote:
> > > +static const struct netlink_range_validation netdev_a_napi_defer_hard_irqs_range = {
> > > + .max = 2147483647ULL,
> >
> > Would (u64)INT_MAX work ?
>
> I sent a codegen change for this. The codegen is a bit of a mess.
>
> > > +int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info)
> > > +{
> > > + struct napi_struct *napi;
> > > + unsigned int napi_id;
> > > + int err;
> > > +
> > > + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID))
> > > + return -EINVAL;
> > > +
> > > + napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]);
> > > +
> > > + rtnl_lock();
> >
> > Hmm.... please see my patch there :
> >
> > https://patchwork.kernel.org/project/netdevbpf/patch/20241009232728.107604-2-edumazet@google.com/
> >
> > Lets not add another rtnl_lock() :/
>
> It's not as easy since NAPIs can come and go at driver's whim.
> I'm quietly hoping we can convert all netdev-nl NAPI accesses
> to use the netdev->lock protection I strong-armed Paolo into
> adding in his shaper series. But perhaps we can do that after
> this series? NAPI GET already takes RTNL lock.
napi_by_id() is protected by rcu and its own spinlock ( napi_hash_lock )
I do not see why rtnl is needed.
This will also be a big issue with per netns-RTNL anyway.
^ permalink raw reply [flat|nested] 31+ messages in thread* Re: [net-next v5 6/9] netdev-genl: Support setting per-NAPI config values
2024-10-10 15:30 ` Eric Dumazet
@ 2024-10-10 16:40 ` Joe Damato
2024-10-11 17:19 ` Eric Dumazet
0 siblings, 1 reply; 31+ messages in thread
From: Joe Damato @ 2024-10-10 16:40 UTC (permalink / raw)
To: Eric Dumazet
Cc: Jakub Kicinski, netdev, mkarsten, skhawaja, sdf, bjorn,
amritha.nambiar, sridhar.samudrala, willemdebruijn.kernel,
Donald Hunter, David S. Miller, Paolo Abeni,
Jesper Dangaard Brouer, Mina Almasry, Xuan Zhuo, open list
On Thu, Oct 10, 2024 at 05:30:26PM +0200, Eric Dumazet wrote:
> On Thu, Oct 10, 2024 at 5:19 PM Jakub Kicinski <kuba@kernel.org> wrote:
> >
> > On Thu, 10 Oct 2024 06:24:54 +0200 Eric Dumazet wrote:
> > > > +static const struct netlink_range_validation netdev_a_napi_defer_hard_irqs_range = {
> > > > + .max = 2147483647ULL,
> > >
> > > Would (u64)INT_MAX work ?
> >
> > I sent a codegen change for this. The codegen is a bit of a mess.
> >
> > > > +int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info)
> > > > +{
> > > > + struct napi_struct *napi;
> > > > + unsigned int napi_id;
> > > > + int err;
> > > > +
> > > > + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID))
> > > > + return -EINVAL;
> > > > +
> > > > + napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]);
> > > > +
> > > > + rtnl_lock();
> > >
> > > Hmm.... please see my patch there :
> > >
> > > https://patchwork.kernel.org/project/netdevbpf/patch/20241009232728.107604-2-edumazet@google.com/
> > >
> > > Lets not add another rtnl_lock() :/
> >
> > It's not as easy since NAPIs can come and go at driver's whim.
> > I'm quietly hoping we can convert all netdev-nl NAPI accesses
> > to use the netdev->lock protection I strong-armed Paolo into
> > adding in his shaper series. But perhaps we can do that after
> > this series? NAPI GET already takes RTNL lock.
>
>
> napi_by_id() is protected by rcu and its own spinlock ( napi_hash_lock )
> I do not see why rtnl is needed.
> This will also be a big issue with per netns-RTNL anyway.
I deeply appreciate and respect both of your thoughts on this; I
will hold off on sending a v6 until a decision is made on this
particular issue.
Thank you both for your careful review and analysis.
^ permalink raw reply [flat|nested] 31+ messages in thread* Re: [net-next v5 6/9] netdev-genl: Support setting per-NAPI config values
2024-10-10 16:40 ` Joe Damato
@ 2024-10-11 17:19 ` Eric Dumazet
2024-10-11 17:50 ` Joe Damato
0 siblings, 1 reply; 31+ messages in thread
From: Eric Dumazet @ 2024-10-11 17:19 UTC (permalink / raw)
To: Joe Damato, Eric Dumazet, Jakub Kicinski, netdev, mkarsten,
skhawaja, sdf, bjorn, amritha.nambiar, sridhar.samudrala,
willemdebruijn.kernel, Donald Hunter, David S. Miller,
Paolo Abeni, Jesper Dangaard Brouer, Mina Almasry, Xuan Zhuo,
open list
On Thu, Oct 10, 2024 at 6:40 PM Joe Damato <jdamato@fastly.com> wrote:
>
> On Thu, Oct 10, 2024 at 05:30:26PM +0200, Eric Dumazet wrote:
> > On Thu, Oct 10, 2024 at 5:19 PM Jakub Kicinski <kuba@kernel.org> wrote:
> > >
> > > On Thu, 10 Oct 2024 06:24:54 +0200 Eric Dumazet wrote:
> > > > > +static const struct netlink_range_validation netdev_a_napi_defer_hard_irqs_range = {
> > > > > + .max = 2147483647ULL,
> > > >
> > > > Would (u64)INT_MAX work ?
> > >
> > > I sent a codegen change for this. The codegen is a bit of a mess.
> > >
> > > > > +int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info)
> > > > > +{
> > > > > + struct napi_struct *napi;
> > > > > + unsigned int napi_id;
> > > > > + int err;
> > > > > +
> > > > > + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID))
> > > > > + return -EINVAL;
> > > > > +
> > > > > + napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]);
> > > > > +
> > > > > + rtnl_lock();
> > > >
> > > > Hmm.... please see my patch there :
> > > >
> > > > https://patchwork.kernel.org/project/netdevbpf/patch/20241009232728.107604-2-edumazet@google.com/
> > > >
> > > > Lets not add another rtnl_lock() :/
> > >
> > > It's not as easy since NAPIs can come and go at driver's whim.
> > > I'm quietly hoping we can convert all netdev-nl NAPI accesses
> > > to use the netdev->lock protection I strong-armed Paolo into
> > > adding in his shaper series. But perhaps we can do that after
> > > this series? NAPI GET already takes RTNL lock.
> >
> >
> > napi_by_id() is protected by rcu and its own spinlock ( napi_hash_lock )
> > I do not see why rtnl is needed.
> > This will also be a big issue with per netns-RTNL anyway.
>
> I deeply appreciate and respect both of your thoughts on this; I
> will hold off on sending a v6 until a decision is made on this
> particular issue.
>
I do not want to block your series.
Whatever is needed later, I can handle.
^ permalink raw reply [flat|nested] 31+ messages in thread* Re: [net-next v5 6/9] netdev-genl: Support setting per-NAPI config values
2024-10-11 17:19 ` Eric Dumazet
@ 2024-10-11 17:50 ` Joe Damato
0 siblings, 0 replies; 31+ messages in thread
From: Joe Damato @ 2024-10-11 17:50 UTC (permalink / raw)
To: Eric Dumazet
Cc: Jakub Kicinski, netdev, mkarsten, skhawaja, sdf, bjorn,
amritha.nambiar, sridhar.samudrala, willemdebruijn.kernel,
Donald Hunter, David S. Miller, Paolo Abeni,
Jesper Dangaard Brouer, Mina Almasry, Xuan Zhuo, open list
On Fri, Oct 11, 2024 at 07:19:47PM +0200, Eric Dumazet wrote:
> On Thu, Oct 10, 2024 at 6:40 PM Joe Damato <jdamato@fastly.com> wrote:
> >
> > On Thu, Oct 10, 2024 at 05:30:26PM +0200, Eric Dumazet wrote:
> > > On Thu, Oct 10, 2024 at 5:19 PM Jakub Kicinski <kuba@kernel.org> wrote:
> > > >
> > > > On Thu, 10 Oct 2024 06:24:54 +0200 Eric Dumazet wrote:
> > > > > > +static const struct netlink_range_validation netdev_a_napi_defer_hard_irqs_range = {
> > > > > > + .max = 2147483647ULL,
> > > > >
> > > > > Would (u64)INT_MAX work ?
> > > >
> > > > I sent a codegen change for this. The codegen is a bit of a mess.
> > > >
> > > > > > +int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info)
> > > > > > +{
> > > > > > + struct napi_struct *napi;
> > > > > > + unsigned int napi_id;
> > > > > > + int err;
> > > > > > +
> > > > > > + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID))
> > > > > > + return -EINVAL;
> > > > > > +
> > > > > > + napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]);
> > > > > > +
> > > > > > + rtnl_lock();
> > > > >
> > > > > Hmm.... please see my patch there :
> > > > >
> > > > > https://patchwork.kernel.org/project/netdevbpf/patch/20241009232728.107604-2-edumazet@google.com/
> > > > >
> > > > > Lets not add another rtnl_lock() :/
> > > >
> > > > It's not as easy since NAPIs can come and go at driver's whim.
> > > > I'm quietly hoping we can convert all netdev-nl NAPI accesses
> > > > to use the netdev->lock protection I strong-armed Paolo into
> > > > adding in his shaper series. But perhaps we can do that after
> > > > this series? NAPI GET already takes RTNL lock.
> > >
> > >
> > > napi_by_id() is protected by rcu and its own spinlock ( napi_hash_lock )
> > > I do not see why rtnl is needed.
> > > This will also be a big issue with per netns-RTNL anyway.
> >
> > I deeply appreciate and respect both of your thoughts on this; I
> > will hold off on sending a v6 until a decision is made on this
> > particular issue.
> >
>
> I do not want to block your series.
>
> Whatever is needed later, I can handle.
Thank you, Eric.
I am happy to help with the future changes if needed. Feel free to
reach out if you'd like me to assist in any way as I know you have a
tremendous amount of work on your plate.
I will submit the v6 shortly, after I've rebased and retested.
^ permalink raw reply [flat|nested] 31+ messages in thread
* [net-next v5 7/9] bnxt: Add support for persistent NAPI config
2024-10-09 0:54 [net-next v5 0/9] Add support for per-NAPI config via netlink Joe Damato
` (5 preceding siblings ...)
2024-10-09 0:55 ` [net-next v5 6/9] netdev-genl: Support setting per-NAPI config values Joe Damato
@ 2024-10-09 0:55 ` Joe Damato
2024-10-10 4:25 ` Eric Dumazet
2024-10-09 0:55 ` [net-next v5 8/9] mlx5: " Joe Damato
` (2 subsequent siblings)
9 siblings, 1 reply; 31+ messages in thread
From: Joe Damato @ 2024-10-09 0:55 UTC (permalink / raw)
To: netdev
Cc: mkarsten, skhawaja, sdf, bjorn, amritha.nambiar,
sridhar.samudrala, willemdebruijn.kernel, Joe Damato,
Michael Chan, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, open list
Use netif_napi_add_config to assign persistent per-NAPI config when
initializing NAPIs.
Signed-off-by: Joe Damato <jdamato@fastly.com>
---
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 6e422e24750a..f5da2dace982 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -10986,7 +10986,8 @@ static void bnxt_init_napi(struct bnxt *bp)
cp_nr_rings--;
for (i = 0; i < cp_nr_rings; i++) {
bnapi = bp->bnapi[i];
- netif_napi_add(bp->dev, &bnapi->napi, poll_fn);
+ netif_napi_add_config(bp->dev, &bnapi->napi, poll_fn,
+ bnapi->index);
}
if (BNXT_CHIP_TYPE_NITRO_A0(bp)) {
bnapi = bp->bnapi[cp_nr_rings];
--
2.34.1
^ permalink raw reply related [flat|nested] 31+ messages in thread* Re: [net-next v5 7/9] bnxt: Add support for persistent NAPI config
2024-10-09 0:55 ` [net-next v5 7/9] bnxt: Add support for persistent NAPI config Joe Damato
@ 2024-10-10 4:25 ` Eric Dumazet
0 siblings, 0 replies; 31+ messages in thread
From: Eric Dumazet @ 2024-10-10 4:25 UTC (permalink / raw)
To: Joe Damato
Cc: netdev, mkarsten, skhawaja, sdf, bjorn, amritha.nambiar,
sridhar.samudrala, willemdebruijn.kernel, Michael Chan,
David S. Miller, Jakub Kicinski, Paolo Abeni, open list
On Wed, Oct 9, 2024 at 2:56 AM Joe Damato <jdamato@fastly.com> wrote:
>
> Use netif_napi_add_config to assign persistent per-NAPI config when
> initializing NAPIs.
>
> Signed-off-by: Joe Damato <jdamato@fastly.com>
> ---
Reviewed-by: Eric Dumazet <edumazet@google.com>
^ permalink raw reply [flat|nested] 31+ messages in thread
* [net-next v5 8/9] mlx5: Add support for persistent NAPI config
2024-10-09 0:54 [net-next v5 0/9] Add support for per-NAPI config via netlink Joe Damato
` (6 preceding siblings ...)
2024-10-09 0:55 ` [net-next v5 7/9] bnxt: Add support for persistent NAPI config Joe Damato
@ 2024-10-09 0:55 ` Joe Damato
2024-10-10 4:26 ` Eric Dumazet
2024-10-09 0:55 ` [net-next v5 9/9] mlx4: Add support for persistent NAPI config to RX CQs Joe Damato
2024-10-10 3:17 ` [net-next v5 0/9] Add support for per-NAPI config via netlink Jakub Kicinski
9 siblings, 1 reply; 31+ messages in thread
From: Joe Damato @ 2024-10-09 0:55 UTC (permalink / raw)
To: netdev
Cc: mkarsten, skhawaja, sdf, bjorn, amritha.nambiar,
sridhar.samudrala, willemdebruijn.kernel, Joe Damato,
Saeed Mahameed, Tariq Toukan, Leon Romanovsky, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni,
open list:MELLANOX MLX5 core VPI driver, open list
Use netif_napi_add_config to assign persistent per-NAPI config when
initializing NAPIs.
Signed-off-by: Joe Damato <jdamato@fastly.com>
---
drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index a5659c0c4236..09ab7ac07c29 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -2697,7 +2697,7 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
c->aff_mask = irq_get_effective_affinity_mask(irq);
c->lag_port = mlx5e_enumerate_lag_port(mdev, ix);
- netif_napi_add(netdev, &c->napi, mlx5e_napi_poll);
+ netif_napi_add_config(netdev, &c->napi, mlx5e_napi_poll, ix);
netif_napi_set_irq(&c->napi, irq);
err = mlx5e_open_queues(c, params, cparam);
--
2.34.1
^ permalink raw reply related [flat|nested] 31+ messages in thread* Re: [net-next v5 8/9] mlx5: Add support for persistent NAPI config
2024-10-09 0:55 ` [net-next v5 8/9] mlx5: " Joe Damato
@ 2024-10-10 4:26 ` Eric Dumazet
0 siblings, 0 replies; 31+ messages in thread
From: Eric Dumazet @ 2024-10-10 4:26 UTC (permalink / raw)
To: Joe Damato
Cc: netdev, mkarsten, skhawaja, sdf, bjorn, amritha.nambiar,
sridhar.samudrala, willemdebruijn.kernel, Saeed Mahameed,
Tariq Toukan, Leon Romanovsky, David S. Miller, Jakub Kicinski,
Paolo Abeni, open list:MELLANOX MLX5 core VPI driver, open list
On Wed, Oct 9, 2024 at 2:56 AM Joe Damato <jdamato@fastly.com> wrote:
>
> Use netif_napi_add_config to assign persistent per-NAPI config when
> initializing NAPIs.
>
> Signed-off-by: Joe Damato <jdamato@fastly.com>
> ---
Reviewed-by: Eric Dumazet <edumazet@google.com>
^ permalink raw reply [flat|nested] 31+ messages in thread
* [net-next v5 9/9] mlx4: Add support for persistent NAPI config to RX CQs
2024-10-09 0:54 [net-next v5 0/9] Add support for per-NAPI config via netlink Joe Damato
` (7 preceding siblings ...)
2024-10-09 0:55 ` [net-next v5 8/9] mlx5: " Joe Damato
@ 2024-10-09 0:55 ` Joe Damato
2024-10-10 4:28 ` Eric Dumazet
2024-10-10 3:17 ` [net-next v5 0/9] Add support for per-NAPI config via netlink Jakub Kicinski
9 siblings, 1 reply; 31+ messages in thread
From: Joe Damato @ 2024-10-09 0:55 UTC (permalink / raw)
To: netdev
Cc: mkarsten, skhawaja, sdf, bjorn, amritha.nambiar,
sridhar.samudrala, willemdebruijn.kernel, Joe Damato,
Tariq Toukan, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, open list:MELLANOX MLX4 core VPI driver, open list
Use netif_napi_add_config to assign persistent per-NAPI config when
initializing RX CQ NAPIs.
Presently, struct napi_config only has support for two fields used for
RX, so there is no need to support them with TX CQs, yet.
Signed-off-by: Joe Damato <jdamato@fastly.com>
---
drivers/net/ethernet/mellanox/mlx4/en_cq.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_cq.c b/drivers/net/ethernet/mellanox/mlx4/en_cq.c
index 461cc2c79c71..0e92956e84cf 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_cq.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_cq.c
@@ -156,7 +156,8 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq,
break;
case RX:
cq->mcq.comp = mlx4_en_rx_irq;
- netif_napi_add(cq->dev, &cq->napi, mlx4_en_poll_rx_cq);
+ netif_napi_add_config(cq->dev, &cq->napi, mlx4_en_poll_rx_cq,
+ cq_idx);
netif_napi_set_irq(&cq->napi, irq);
napi_enable(&cq->napi);
netif_queue_set_napi(cq->dev, cq_idx, NETDEV_QUEUE_TYPE_RX, &cq->napi);
--
2.34.1
^ permalink raw reply related [flat|nested] 31+ messages in thread* Re: [net-next v5 9/9] mlx4: Add support for persistent NAPI config to RX CQs
2024-10-09 0:55 ` [net-next v5 9/9] mlx4: Add support for persistent NAPI config to RX CQs Joe Damato
@ 2024-10-10 4:28 ` Eric Dumazet
2024-10-10 16:07 ` Joe Damato
0 siblings, 1 reply; 31+ messages in thread
From: Eric Dumazet @ 2024-10-10 4:28 UTC (permalink / raw)
To: Joe Damato
Cc: netdev, mkarsten, skhawaja, sdf, bjorn, amritha.nambiar,
sridhar.samudrala, willemdebruijn.kernel, Tariq Toukan,
David S. Miller, Jakub Kicinski, Paolo Abeni,
open list:MELLANOX MLX4 core VPI driver, open list
On Wed, Oct 9, 2024 at 2:56 AM Joe Damato <jdamato@fastly.com> wrote:
>
> Use netif_napi_add_config to assign persistent per-NAPI config when
> initializing RX CQ NAPIs.
>
> Presently, struct napi_config only has support for two fields used for
> RX, so there is no need to support them with TX CQs, yet.
>
> Signed-off-by: Joe Damato <jdamato@fastly.com>
> ---
nit: technically, the napi_defer_hard_irqs could benefit TX completions as well.
Reviewed-by: Eric Dumazet <edumazet@google.com>
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [net-next v5 9/9] mlx4: Add support for persistent NAPI config to RX CQs
2024-10-10 4:28 ` Eric Dumazet
@ 2024-10-10 16:07 ` Joe Damato
0 siblings, 0 replies; 31+ messages in thread
From: Joe Damato @ 2024-10-10 16:07 UTC (permalink / raw)
To: Eric Dumazet
Cc: netdev, mkarsten, skhawaja, sdf, bjorn, amritha.nambiar,
sridhar.samudrala, willemdebruijn.kernel, Tariq Toukan,
David S. Miller, Jakub Kicinski, Paolo Abeni,
open list:MELLANOX MLX4 core VPI driver, open list
On Thu, Oct 10, 2024 at 06:28:59AM +0200, Eric Dumazet wrote:
> On Wed, Oct 9, 2024 at 2:56 AM Joe Damato <jdamato@fastly.com> wrote:
> >
> > Use netif_napi_add_config to assign persistent per-NAPI config when
> > initializing RX CQ NAPIs.
> >
> > Presently, struct napi_config only has support for two fields used for
> > RX, so there is no need to support them with TX CQs, yet.
> >
> > Signed-off-by: Joe Damato <jdamato@fastly.com>
> > ---
>
> nit: technically, the napi_defer_hard_irqs could benefit TX completions as well.
That's true - I think I missed updating this commit message when I
realized it. I can correct the commit message while retaining your
Reviewed-by for the v6.
Note: This adds to the confusion I have around the support for
allocating max(rxqs, txqs) config structs; it would seem we'll be
missing config structure for some queues if the system is configured
to use the maximum number of each? Perhaps that configuration is
uncommon enough that it doesn't matter?
> Reviewed-by: Eric Dumazet <edumazet@google.com>
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [net-next v5 0/9] Add support for per-NAPI config via netlink
2024-10-09 0:54 [net-next v5 0/9] Add support for per-NAPI config via netlink Joe Damato
` (8 preceding siblings ...)
2024-10-09 0:55 ` [net-next v5 9/9] mlx4: Add support for persistent NAPI config to RX CQs Joe Damato
@ 2024-10-10 3:17 ` Jakub Kicinski
9 siblings, 0 replies; 31+ messages in thread
From: Jakub Kicinski @ 2024-10-10 3:17 UTC (permalink / raw)
To: Joe Damato
Cc: netdev, mkarsten, skhawaja, sdf, bjorn, amritha.nambiar,
sridhar.samudrala, willemdebruijn.kernel, Alexander Lobakin,
Breno Leitao, Daniel Jurgens, David Ahern, David S. Miller,
Donald Hunter, Eric Dumazet, Jesper Dangaard Brouer, Jiri Pirko,
Johannes Berg, Jonathan Corbet, Kory Maincent, Leon Romanovsky,
open list:DOCUMENTATION, open list,
open list:MELLANOX MLX4 core VPI driver, Lorenzo Bianconi,
Michael Chan, Mina Almasry, Paolo Abeni, Saeed Mahameed,
Sebastian Andrzej Siewior, Tariq Toukan, Xuan Zhuo
On Wed, 9 Oct 2024 00:54:54 +0000 Joe Damato wrote:
> Welcome to v5, the first non-RFC version of this code! See the changelog
> below for more details on changes between revisions.
I'm probably going to end up applying this but just in case:
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Very neat work !
^ permalink raw reply [flat|nested] 31+ messages in thread