From: Shuhao Tan <tanshuhao@google.com>
To: Andrew Lunn <andrew+netdev@lunn.ch>,
"David S . Miller" <davem@davemloft.net>,
Eric Dumazet <edumazet@google.com>,
Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
Simon Horman <horms@kernel.org>, Shuah Khan <shuah@kernel.org>
Cc: Shuhao Tan <tanshuhao@google.com>,
Mina Almasry <almasrymina@google.com>,
Samiullah Khawaja <skhawaja@google.com>,
Kuniyuki Iwashima <kuniyu@google.com>,
netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
linux-kselftest@vger.kernel.org
Subject: [RFC net-next v1 1/2] net: Save kthread of threaded NAPI in napi_config in napi_del and restore in napi_add
Date: Fri, 12 Jun 2026 10:36:41 -0700 [thread overview]
Message-ID: <20260612173644.380972-2-tanshuhao@google.com> (raw)
In-Reply-To: <20260612173644.380972-1-tanshuhao@google.com>
Replace napi->thread with a new thread_node struct that has a back
pointer to napi_struct.
Make the NAPI thread to use the thread_node as data pointer so that
it can poll on different NAPIs thoughout its lifetime.
Park the thread and save the thread_node in napi_config on napi_del.
Restore the node and unpark the thread on napi_add_config.
Signed-off-by: Shuhao Tan <tanshuhao@google.com>
---
include/linux/netdevice.h | 13 +++-
net/core/dev.c | 151 +++++++++++++++++++++++++++++---------
net/core/netdev-genl.c | 12 ++-
3 files changed, 139 insertions(+), 37 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 7f4f0837c09f..1cda88607e99 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -63,6 +63,7 @@ struct dsa_port;
struct ip_tunnel_parm_kern;
struct macsec_context;
struct macsec_ops;
+struct napi_struct;
struct netdev_config;
struct netdev_name_node;
struct sd_flow_limit;
@@ -363,10 +364,20 @@ struct gro_node {
u32 cached_napi_id;
};
+/*
+ * Structure for persisting threaded NAPI kthread
+ */
+struct napi_thread_node {
+ struct task_struct *thread;
+ struct napi_struct *napi;
+ struct rcu_head rcu;
+};
+
/*
* Structure for per-NAPI config
*/
struct napi_config {
+ struct napi_thread_node *thread_node;
u64 gro_flush_timeout;
u64 irq_suspend_timeout;
u32 defer_hard_irqs;
@@ -403,7 +414,7 @@ struct napi_struct {
struct gro_node gro;
struct hrtimer timer;
/* all fields past this point are write-protected by netdev_lock */
- struct task_struct *thread;
+ struct napi_thread_node __rcu *thread_node;
unsigned long gro_flush_timeout;
unsigned long irq_suspend_timeout;
u32 defer_hard_irqs;
diff --git a/net/core/dev.c b/net/core/dev.c
index 202e35acb15b..f5e3b9e526af 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1645,25 +1645,62 @@ EXPORT_SYMBOL(netdev_notify_peers);
static int napi_threaded_poll(void *data);
-static int napi_kthread_create(struct napi_struct *n)
+static int napi_thread_node_create(struct napi_struct *n)
{
+ struct napi_thread_node *thread_node = NULL;
+ struct task_struct *thread = NULL;
int err = 0;
+ thread_node = kvzalloc_obj(*thread_node);
+ if (!thread_node)
+ return -ENOMEM;
+
/* Create and wake up the kthread once to put it in
* TASK_INTERRUPTIBLE mode to avoid the blocked task
* warning and work with loadavg.
*/
- n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
- n->dev->name, n->napi_id);
- if (IS_ERR(n->thread)) {
- err = PTR_ERR(n->thread);
+ thread_node->napi = n;
+ thread = kthread_run(napi_threaded_poll, thread_node, "napi/%s-%d",
+ n->dev->name, n->napi_id);
+ if (IS_ERR(thread)) {
+ err = PTR_ERR(thread);
pr_err("kthread_run failed with err %d\n", err);
- n->thread = NULL;
+ goto free_thread_node;
}
+ thread_node->thread = thread;
+ rcu_assign_pointer(n->thread_node, thread_node);
+
+ return 0;
+
+free_thread_node:
+ kvfree(thread_node);
+
return err;
}
+static void napi_thread_node_stop(struct napi_thread_node *thread_node)
+{
+ kthread_stop(thread_node->thread);
+ kvfree_rcu(thread_node, rcu);
+}
+
+static int napi_kthread_create(struct napi_struct *n)
+{
+ struct napi_thread_node *thread_node;
+
+ if (n->config && n->config->thread_node) {
+ thread_node = n->config->thread_node;
+ rcu_assign_pointer(n->thread_node, thread_node);
+ n->config->thread_node = NULL;
+ WRITE_ONCE(thread_node->napi, n);
+ kthread_unpark(thread_node->thread);
+ return 0;
+ }
+
+ return napi_thread_node_create(n);
+}
+
static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
{
const struct net_device_ops *ops = dev->netdev_ops;
@@ -4949,7 +4986,7 @@ EXPORT_SYMBOL(__dev_direct_xmit);
/*************************************************************************
* Receiver routines
*************************************************************************/
-static DEFINE_PER_CPU(struct task_struct *, backlog_napi);
+static DEFINE_PER_CPU(struct napi_thread_node, backlog_napi);
int weight_p __read_mostly = 64; /* old backlog weight */
int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */
@@ -4959,10 +4996,11 @@ int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */
static inline void ____napi_schedule(struct softnet_data *sd,
struct napi_struct *napi)
{
- struct task_struct *thread;
+ struct napi_thread_node *thread_node;
lockdep_assert_irqs_disabled();
+ rcu_read_lock();
if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
/* Paired with smp_mb__before_atomic() in
* napi_enable()/netif_set_threaded().
@@ -4970,18 +5008,21 @@ static inline void ____napi_schedule(struct softnet_data *sd,
* read on napi->thread. Only call
* wake_up_process() when it's not NULL.
*/
- thread = READ_ONCE(napi->thread);
- if (thread) {
- if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi))
+ thread_node = rcu_dereference(napi->thread_node);
+ if (thread_node) {
+ if (use_backlog_threads() &&
+ thread_node == this_cpu_ptr(&backlog_napi))
goto use_local_napi;
set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
- wake_up_process(thread);
+ wake_up_process(thread_node->thread);
+ rcu_read_unlock();
return;
}
}
use_local_napi:
+ rcu_read_unlock();
DEBUG_NET_WARN_ON_ONCE(!list_empty(&napi->poll_list));
list_add_tail(&napi->poll_list, &sd->poll_list);
WRITE_ONCE(napi->list_owner, smp_processor_id());
@@ -7148,6 +7189,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
static void napi_stop_kthread(struct napi_struct *napi)
{
+ struct napi_thread_node *thread_node;
unsigned long val, new;
/* Wait until the napi STATE_THREADED is unset. */
@@ -7180,8 +7222,9 @@ static void napi_stop_kthread(struct napi_struct *napi)
msleep(20);
}
- kthread_stop(napi->thread);
- napi->thread = NULL;
+ thread_node = netdev_lock_dereference(napi->thread_node, napi->dev);
+ rcu_assign_pointer(napi->thread_node, NULL);
+ napi_thread_node_stop(thread_node);
}
static void napi_set_threaded_state(struct napi_struct *napi,
@@ -7197,9 +7240,13 @@ static void napi_set_threaded_state(struct napi_struct *napi,
int napi_set_threaded(struct napi_struct *napi,
enum netdev_napi_threaded threaded)
{
+ struct napi_thread_node *thread_node;
+
+ thread_node = netdev_lock_dereference(napi->thread_node, napi->dev);
+
if (threaded) {
- if (!napi->thread) {
- int err = napi_kthread_create(napi);
+ if (!thread_node) {
+ int err = napi_thread_node_create(napi);
if (err)
return err;
@@ -7215,7 +7262,7 @@ int napi_set_threaded(struct napi_struct *napi,
* softirq mode will happen in the next round of napi_schedule().
* This should not cause hiccups/stalls to the live traffic.
*/
- if (!threaded && napi->thread) {
+ if (!threaded && thread_node) {
napi_stop_kthread(napi);
} else {
/* Make sure kthread is created before THREADED bit is set. */
@@ -7236,8 +7283,9 @@ int netif_set_threaded(struct net_device *dev,
if (threaded) {
list_for_each_entry(napi, &dev->napi_list, dev_list) {
- if (!napi->thread) {
- err = napi_kthread_create(napi);
+ /* protected by assertion above */
+ if (!rcu_dereference_protected(napi->thread_node, 1)) {
+ err = napi_thread_node_create(napi);
if (err) {
threaded = NETDEV_NAPI_THREADED_DISABLED;
break;
@@ -7253,8 +7301,14 @@ int netif_set_threaded(struct net_device *dev,
WARN_ON_ONCE(napi_set_threaded(napi, threaded));
/* Override the config for all NAPIs even if currently not listed */
- for (i = 0; i < dev->num_napi_configs; i++)
+ for (i = 0; i < dev->num_napi_configs; i++) {
dev->napi_config[i].threaded = threaded;
+ /* Stop parked threads in inactive napi_configs */
+ if (!threaded && dev->napi_config[i].thread_node) {
+ napi_thread_node_stop(dev->napi_config[i].thread_node);
+ dev->napi_config[i].thread_node = NULL;
+ }
+ }
return err;
}
@@ -7657,7 +7711,7 @@ void napi_enable_locked(struct napi_struct *n)
BUG_ON(!test_bit(NAPI_STATE_SCHED, &val));
new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC);
- if (n->dev->threaded && n->thread)
+ if (n->dev->threaded && n->thread_node)
new |= NAPIF_STATE_THREADED;
} while (!try_cmpxchg(&n->state, &val, new));
}
@@ -7682,6 +7736,8 @@ EXPORT_SYMBOL(napi_enable);
/* Must be called in process context */
void __netif_napi_del_locked(struct napi_struct *napi)
{
+ struct napi_thread_node *thread_node;
+
netdev_assert_locked(napi->dev);
if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
@@ -7693,6 +7749,18 @@ void __netif_napi_del_locked(struct napi_struct *napi)
if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state))
irq_set_affinity_notifier(napi->irq, NULL);
+ thread_node = netdev_lock_dereference(napi->thread_node, napi->dev);
+ if (thread_node) {
+ rcu_assign_pointer(napi->thread_node, NULL);
+ if (napi->config) {
+ kthread_park(thread_node->thread);
+ napi->config->thread_node = thread_node;
+ napi->config->thread_node->napi = NULL;
+ } else {
+ napi_thread_node_stop(thread_node);
+ }
+ }
+
if (napi->config) {
napi->index = -1;
napi->config = NULL;
@@ -7702,11 +7770,6 @@ void __netif_napi_del_locked(struct napi_struct *napi)
napi_free_frags(napi);
gro_cleanup(&napi->gro);
-
- if (napi->thread) {
- kthread_stop(napi->thread);
- napi->thread = NULL;
- }
}
EXPORT_SYMBOL(__netif_napi_del_locked);
@@ -7802,11 +7865,21 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
return work;
}
-static int napi_thread_wait(struct napi_struct *napi)
+static struct napi_struct *
+napi_thread_wait(struct napi_thread_node *thread_node)
{
+ struct napi_struct *napi = READ_ONCE(thread_node->napi);
+
set_current_state(TASK_INTERRUPTIBLE);
while (!kthread_should_stop()) {
+ if (kthread_should_park()) {
+ kthread_parkme();
+ napi = READ_ONCE(thread_node->napi);
+ /* Might be awakened for stopping */
+ continue;
+ }
+
/* Testing SCHED_THREADED bit here to make sure the current
* kthread owns this napi and could poll on this napi.
* Testing SCHED bit is not enough because SCHED bit might be
@@ -7815,7 +7888,7 @@ static int napi_thread_wait(struct napi_struct *napi)
if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) {
WARN_ON(!list_empty(&napi->poll_list));
__set_current_state(TASK_RUNNING);
- return 0;
+ return napi;
}
schedule();
@@ -7823,7 +7896,7 @@ static int napi_thread_wait(struct napi_struct *napi)
}
__set_current_state(TASK_RUNNING);
- return -1;
+ return NULL;
}
static void napi_threaded_poll_loop(struct napi_struct *napi,
@@ -7880,13 +7953,19 @@ static void napi_threaded_poll_loop(struct napi_struct *napi,
static int napi_threaded_poll(void *data)
{
- struct napi_struct *napi = data;
+ struct napi_thread_node *thread_node = data;
unsigned long last_qs = jiffies;
+ struct napi_struct *napi;
bool want_busy_poll;
bool in_busy_poll;
unsigned long val;
- while (!napi_thread_wait(napi)) {
+ while (1) {
+ napi = napi_thread_wait(thread_node);
+
+ if (!napi)
+ break;
+
val = READ_ONCE(napi->state);
want_busy_poll = val & NAPIF_STATE_THREADED_BUSY_POLL;
@@ -12155,6 +12234,8 @@ EXPORT_SYMBOL(alloc_netdev_mqs);
static void netdev_napi_exit(struct net_device *dev)
{
+ unsigned int i;
+
if (!list_empty(&dev->napi_list)) {
struct napi_struct *p, *n;
@@ -12166,6 +12247,10 @@ static void netdev_napi_exit(struct net_device *dev)
synchronize_net();
}
+ for (i = 0; i < dev->num_napi_configs; i++) {
+ if (dev->napi_config[i].thread_node)
+ napi_thread_node_stop(dev->napi_config[i].thread_node);
+ }
kvfree(dev->napi_config);
}
@@ -13204,12 +13289,12 @@ static void backlog_napi_setup(unsigned int cpu)
struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
struct napi_struct *napi = &sd->backlog;
- napi->thread = this_cpu_read(backlog_napi);
+ rcu_assign_pointer(napi->thread_node, this_cpu_ptr(&backlog_napi));
set_bit(NAPI_STATE_THREADED, &napi->state);
}
static struct smp_hotplug_thread backlog_threads = {
- .store = &backlog_napi,
+ .store = &backlog_napi.thread,
.thread_should_run = backlog_napi_should_run,
.thread_fn = run_backlog_napi,
.thread_comm = "backlog_napi/%u",
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index 11b0b91683d7..f2ecdb26d6f1 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -162,6 +162,7 @@ static int
netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi,
const struct genl_info *info)
{
+ struct napi_thread_node *thread_node;
unsigned long irq_suspend_timeout;
unsigned long gro_flush_timeout;
u32 napi_defer_hard_irqs;
@@ -188,11 +189,16 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi,
napi_get_threaded(napi)))
goto nla_put_failure;
- if (napi->thread) {
- pid = task_pid_nr(napi->thread);
- if (nla_put_u32(rsp, NETDEV_A_NAPI_PID, pid))
+ rcu_read_lock();
+ thread_node = rcu_dereference(napi->thread_node);
+ if (thread_node) {
+ pid = task_pid_nr(thread_node->thread);
+ if (nla_put_u32(rsp, NETDEV_A_NAPI_PID, pid)) {
+ rcu_read_unlock();
goto nla_put_failure;
+ }
}
+ rcu_read_unlock();
napi_defer_hard_irqs = napi_get_defer_hard_irqs(napi);
if (nla_put_s32(rsp, NETDEV_A_NAPI_DEFER_HARD_IRQS,
--
2.54.0.1136.gdb2ca164c4-goog
next prev parent reply other threads:[~2026-06-12 17:36 UTC|newest]
Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-06-12 17:36 [RFC net-next v1 0/2] Reuse threaded NAPI kthread across napi_del()/napi_add() Shuhao Tan
2026-06-12 17:36 ` Shuhao Tan [this message]
2026-06-12 17:36 ` [RFC net-next v1 2/2] selftests: net: Add kthread preserving test in napi_threaded and busy_poll_test Shuhao Tan
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260612173644.380972-2-tanshuhao@google.com \
--to=tanshuhao@google.com \
--cc=almasrymina@google.com \
--cc=andrew+netdev@lunn.ch \
--cc=davem@davemloft.net \
--cc=edumazet@google.com \
--cc=horms@kernel.org \
--cc=kuba@kernel.org \
--cc=kuniyu@google.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-kselftest@vger.kernel.org \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
--cc=shuah@kernel.org \
--cc=skhawaja@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox