netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH net-next v2] net: stop napi kthreads when THREADED napi is disabled
@ 2025-05-19 22:43 Samiullah Khawaja
  2025-05-20  0:34 ` Wei Wang
  2025-05-21  2:09 ` Jakub Kicinski
  0 siblings, 2 replies; 9+ messages in thread
From: Samiullah Khawaja @ 2025-05-19 22:43 UTC (permalink / raw)
  To: Jakub Kicinski, David S . Miller , Eric Dumazet, Paolo Abeni,
	almasrymina, willemb, jdamato, mkarsten, weiwan
  Cc: netdev, skhawaja

Once the THREADED napi is disabled, the napi kthread should also be
stopped. Keeping the kthread intact after disabling THREADED napi makes
the PID of this kthread show up in the output of netlink 'napi-get' and
ps -ef output.

This is discussed in the patch below:
https://lore.kernel.org/all/20250502191548.559cc416@kernel.org

The napi THREADED state should be unset before stopping the thread. This
makes sure that this napi will not be scheduled again for threaded
polling (SCHED_THREADED). In the napi_thread_wait function we need to
make sure that the SCHED_THREADED is not set before stopping. Once the
SCHED_THREADED is unset it means that the napi kthread doesn't own this
napi and it is safe to stop it.

____napi_schedule for threaded napi is also modified to make sure that
the STATE_THREADED is not unset while we are setting SCHED_THREADED and
waking up the napi kthread. If STATE_THREADED is unset while the
threaded napi is being scheduled, the schedule code will recheck the
STATE_THREADED to prevent wakeup of a stopped thread. Use try_cmpxchg to
make sure the value of napi->state is not changed.

Add a new test in nl_netdev to verify this behaviour.

Tested:
 ./tools/testing/selftests/net/nl_netdev.py
 TAP version 13
 1..6
 ok 1 nl_netdev.empty_check
 ok 2 nl_netdev.lo_check
 ok 3 nl_netdev.page_pool_check
 ok 4 nl_netdev.napi_list_check
 ok 5 nl_netdev.dev_set_threaded
 ok 6 nl_netdev.nsim_rxq_reset_down
 # Totals: pass:6 fail:0 xfail:0 xpass:0 skip:0 error:0

Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
---

v2:
- Handle the race where the STATE_THREADED is disabled and the kthread is
  stopped while the ____napi_schedule code has already checked the
  STATE_THREADED and tries to wake up a napi kthread that is stopped.

 net/core/dev.c                           | 77 +++++++++++++++++++-----
 tools/testing/selftests/net/nl_netdev.py | 38 +++++++++++-
 2 files changed, 98 insertions(+), 17 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index d0563ddff6ca..52d173f5206c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4754,15 +4754,18 @@ int weight_p __read_mostly = 64;           /* old backlog weight */
 int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
 int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
 
-/* Called with irq disabled */
-static inline void ____napi_schedule(struct softnet_data *sd,
-				     struct napi_struct *napi)
+static inline bool ____try_napi_schedule_threaded(struct softnet_data *sd,
+						  struct napi_struct *napi)
 {
 	struct task_struct *thread;
+	unsigned long new, val;
 
-	lockdep_assert_irqs_disabled();
+	do {
+		val = READ_ONCE(napi->state);
+
+		if (!(val & NAPIF_STATE_THREADED))
+			return false;
 
-	if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
 		/* Paired with smp_mb__before_atomic() in
 		 * napi_enable()/dev_set_threaded().
 		 * Use READ_ONCE() to guarantee a complete
@@ -4770,17 +4773,30 @@ static inline void ____napi_schedule(struct softnet_data *sd,
 		 * wake_up_process() when it's not NULL.
 		 */
 		thread = READ_ONCE(napi->thread);
-		if (thread) {
-			if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi))
-				goto use_local_napi;
+		if (!thread)
+			return false;
 
-			set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
-			wake_up_process(thread);
-			return;
-		}
-	}
+		if (use_backlog_threads() &&
+		    thread == raw_cpu_read(backlog_napi))
+			return false;
+
+		new = val | NAPIF_STATE_SCHED_THREADED;
+	} while (!try_cmpxchg(&napi->state, &val, new));
+
+	wake_up_process(thread);
+	return true;
+}
+
+/* Called with irq disabled */
+static inline void ____napi_schedule(struct softnet_data *sd,
+				     struct napi_struct *napi)
+{
+	lockdep_assert_irqs_disabled();
+
+	/* try to schedule threaded napi if enabled */
+	if (____try_napi_schedule_threaded(sd, napi))
+		return;
 
-use_local_napi:
 	list_add_tail(&napi->poll_list, &sd->poll_list);
 	WRITE_ONCE(napi->list_owner, smp_processor_id());
 	/* If not called from net_rx_action()
@@ -6888,6 +6904,18 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
 	return HRTIMER_NORESTART;
 }
 
+static void dev_stop_napi_threads(struct net_device *dev)
+{
+	struct napi_struct *napi;
+
+	list_for_each_entry(napi, &dev->napi_list, dev_list) {
+		if (napi->thread) {
+			kthread_stop(napi->thread);
+			napi->thread = NULL;
+		}
+	}
+}
+
 int dev_set_threaded(struct net_device *dev, bool threaded)
 {
 	struct napi_struct *napi;
@@ -6926,6 +6954,15 @@ int dev_set_threaded(struct net_device *dev, bool threaded)
 	list_for_each_entry(napi, &dev->napi_list, dev_list)
 		assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);
 
+	/* Calling kthread_stop on napi threads should be safe now as the
+	 * threaded state is disabled.
+	 */
+	if (!threaded) {
+		/* Make sure the state is set before stopping threads.*/
+		smp_mb__before_atomic();
+		dev_stop_napi_threads(dev);
+	}
+
 	return err;
 }
 EXPORT_SYMBOL(dev_set_threaded);
@@ -7451,7 +7488,8 @@ static int napi_thread_wait(struct napi_struct *napi)
 {
 	set_current_state(TASK_INTERRUPTIBLE);
 
-	while (!kthread_should_stop()) {
+	/* Wait until we are scheduled or asked to stop. */
+	while (true) {
 		/* Testing SCHED_THREADED bit here to make sure the current
 		 * kthread owns this napi and could poll on this napi.
 		 * Testing SCHED bit is not enough because SCHED bit might be
@@ -7463,6 +7501,15 @@ static int napi_thread_wait(struct napi_struct *napi)
 			return 0;
 		}
 
+		/* Since the SCHED_THREADED is not set so this napi kthread does
+		 * not own this napi and it is safe to stop here. Checking the
+		 * SCHED_THREADED before stopping here makes sure that this napi
+		 * was not scheduled again while napi threaded was being
+		 * disabled.
+		 */
+		if (kthread_should_stop())
+			break;
+
 		schedule();
 		set_current_state(TASK_INTERRUPTIBLE);
 	}
diff --git a/tools/testing/selftests/net/nl_netdev.py b/tools/testing/selftests/net/nl_netdev.py
index beaee5e4e2aa..c9109627a741 100755
--- a/tools/testing/selftests/net/nl_netdev.py
+++ b/tools/testing/selftests/net/nl_netdev.py
@@ -2,8 +2,9 @@
 # SPDX-License-Identifier: GPL-2.0
 
 import time
+from os import system
 from lib.py import ksft_run, ksft_exit, ksft_pr
-from lib.py import ksft_eq, ksft_ge, ksft_busy_wait
+from lib.py import ksft_eq, ksft_ge, ksft_ne, ksft_busy_wait
 from lib.py import NetdevFamily, NetdevSimDev, ip
 
 
@@ -34,6 +35,39 @@ def napi_list_check(nf) -> None:
                 ksft_eq(len(napis), 100,
                         comment=f"queue count after reset queue {q} mode {i}")
 
+def dev_set_threaded(nf) -> None:
+    """
+    Test that verifies various cases of napi threaded
+    set and unset at device level using sysfs.
+    """
+    with NetdevSimDev(queue_count=2) as nsimdev:
+        nsim = nsimdev.nsims[0]
+
+        ip(f"link set dev {nsim.ifname} up")
+
+        napis = nf.napi_get({'ifindex': nsim.ifindex}, dump=True)
+        ksft_eq(len(napis), 2)
+
+        napi0_id = napis[0]['id']
+        napi1_id = napis[1]['id']
+
+        # set threaded
+        system(f"echo 1 > /sys/class/net/{nsim.ifname}/threaded")
+
+        # check napi threaded is set for both napis
+        napi0 = nf.napi_get({'id': napi0_id})
+        ksft_ne(napi0.get('pid'), None)
+        napi1 = nf.napi_get({'id': napi1_id})
+        ksft_ne(napi1.get('pid'), None)
+
+        # unset threaded
+        system(f"echo 0 > /sys/class/net/{nsim.ifname}/threaded")
+
+        # check napi threaded is unset for both napis
+        napi0 = nf.napi_get({'id': napi0_id})
+        ksft_eq(napi0.get('pid'), None)
+        napi1 = nf.napi_get({'id': napi1_id})
+        ksft_eq(napi1.get('pid'), None)
 
 def nsim_rxq_reset_down(nf) -> None:
     """
@@ -122,7 +156,7 @@ def page_pool_check(nf) -> None:
 def main() -> None:
     nf = NetdevFamily()
     ksft_run([empty_check, lo_check, page_pool_check, napi_list_check,
-              nsim_rxq_reset_down],
+              dev_set_threaded, nsim_rxq_reset_down],
              args=(nf, ))
     ksft_exit()
 

base-commit: b65999e7238e6f2a48dc77c8c2109c48318ff41b
-- 
2.49.0.1101.gccaa498523-goog


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH net-next v2] net: stop napi kthreads when THREADED napi is disabled
  2025-05-19 22:43 [PATCH net-next v2] net: stop napi kthreads when THREADED napi is disabled Samiullah Khawaja
@ 2025-05-20  0:34 ` Wei Wang
  2025-05-21  2:09 ` Jakub Kicinski
  1 sibling, 0 replies; 9+ messages in thread
From: Wei Wang @ 2025-05-20  0:34 UTC (permalink / raw)
  To: Samiullah Khawaja
  Cc: Jakub Kicinski, David S . Miller, Eric Dumazet, Paolo Abeni,
	almasrymina, willemb, jdamato, mkarsten, netdev

On Mon, May 19, 2025 at 3:43 PM Samiullah Khawaja <skhawaja@google.com> wrote:
>
> Once the THREADED napi is disabled, the napi kthread should also be
> stopped. Keeping the kthread intact after disabling THREADED napi makes
> the PID of this kthread show up in the output of netlink 'napi-get' and
> ps -ef output.
>
> The is discussed in the patch below:
> https://lore.kernel.org/all/20250502191548.559cc416@kernel.org
>
> The napi THREADED state should be unset before stopping the thread. This
> makes sure that this napi will not be scheduled again for threaded
> polling (SCHED_THREADED). In the napi_thread_wait function we need to
> make sure that the SCHED_THREADED is not set before stopping. Once the
> SCHED_THREADED is unset it means that the napi kthread doesn't own this
> napi and it is safe to stop it.
>
> ____napi_schedule for threaded napi is also modified to make sure that
> the STATE_THREADED is not unset while we are setting SCHED_THREADED and
> waking up the napi kthread. If STATE_THREADED is unset while the
> threaded napi is being scheduled, the schedule code will recheck the
> STATE_THREADED to prevent wakeup of a stopped thread. Use try_cmpxchg to
> make sure the value of napi->state is not changed.
>
> Add a new test in nl_netdev to verify this behaviour.
>
> Tested:
>  ./tools/testing/selftests/net/nl_netdev.py
>  TAP version 13
>  1..6
>  ok 1 nl_netdev.empty_check
>  ok 2 nl_netdev.lo_check
>  ok 3 nl_netdev.page_pool_check
>  ok 4 nl_netdev.napi_list_check
>  ok 5 nl_netdev.dev_set_threaded
>  ok 6 nl_netdev.nsim_rxq_reset_down
>  # Totals: pass:6 fail:0 xfail:0 xpass:0 skip:0 error:0
>
> Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
> ---
>
> v2:
> - Handle the race where the STATE_THREADED is disabled and kthread is
>   stopped while the ____napi_schedule code has already checked the
>   STATE_THREADED and tries to wakeup napi kthread that is stopped.
>
>  net/core/dev.c                           | 77 +++++++++++++++++++-----
>  tools/testing/selftests/net/nl_netdev.py | 38 +++++++++++-
>  2 files changed, 98 insertions(+), 17 deletions(-)
>
> diff --git a/net/core/dev.c b/net/core/dev.c
> index d0563ddff6ca..52d173f5206c 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -4754,15 +4754,18 @@ int weight_p __read_mostly = 64;           /* old backlog weight */
>  int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
>  int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
>
> -/* Called with irq disabled */
> -static inline void ____napi_schedule(struct softnet_data *sd,
> -                                    struct napi_struct *napi)
> +static inline bool ____try_napi_schedule_threaded(struct softnet_data *sd,
> +                                                 struct napi_struct *napi)
>  {
>         struct task_struct *thread;
> +       unsigned long new, val;
>
> -       lockdep_assert_irqs_disabled();
> +       do {
> +               val = READ_ONCE(napi->state);
> +
> +               if (!(val & NAPIF_STATE_THREADED))
> +                       return false;
>
> -       if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
>                 /* Paired with smp_mb__before_atomic() in
>                  * napi_enable()/dev_set_threaded().
>                  * Use READ_ONCE() to guarantee a complete
> @@ -4770,17 +4773,30 @@ static inline void ____napi_schedule(struct softnet_data *sd,
>                  * wake_up_process() when it's not NULL.
>                  */
>                 thread = READ_ONCE(napi->thread);
> -               if (thread) {
> -                       if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi))
> -                               goto use_local_napi;
> +               if (!thread)
> +                       return false;
>
> -                       set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
> -                       wake_up_process(thread);
> -                       return;
> -               }
> -       }
> +               if (use_backlog_threads() &&
> +                   thread == raw_cpu_read(backlog_napi))
> +                       return false;
> +
> +               new = val | NAPIF_STATE_SCHED_THREADED;
> +       } while (!try_cmpxchg(&napi->state, &val, new));
> +
> +       wake_up_process(thread);
> +       return true;
> +}
> +
> +/* Called with irq disabled */
> +static inline void ____napi_schedule(struct softnet_data *sd,
> +                                    struct napi_struct *napi)
> +{
> +       lockdep_assert_irqs_disabled();
> +
> +       /* try to schedule threaded napi if enabled */
> +       if (____try_napi_schedule_threaded(sd, napi))
> +               return;
>
> -use_local_napi:
>         list_add_tail(&napi->poll_list, &sd->poll_list);
>         WRITE_ONCE(napi->list_owner, smp_processor_id());
>         /* If not called from net_rx_action()
> @@ -6888,6 +6904,18 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
>         return HRTIMER_NORESTART;
>  }
>
> +static void dev_stop_napi_threads(struct net_device *dev)
> +{
> +       struct napi_struct *napi;
> +
> +       list_for_each_entry(napi, &dev->napi_list, dev_list) {
> +               if (napi->thread) {
> +                       kthread_stop(napi->thread);
> +                       napi->thread = NULL;
> +               }
> +       }
> +}
> +
>  int dev_set_threaded(struct net_device *dev, bool threaded)
>  {
>         struct napi_struct *napi;
> @@ -6926,6 +6954,15 @@ int dev_set_threaded(struct net_device *dev, bool threaded)
>         list_for_each_entry(napi, &dev->napi_list, dev_list)
>                 assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);
>
> +       /* Calling kthread_stop on napi threads should be safe now as the
> +        * threaded state is disabled.
> +        */
> +       if (!threaded) {
> +               /* Make sure the state is set before stopping threads.*/
> +               smp_mb__before_atomic();

This should be smp_mb__after_atomic() right?

> +               dev_stop_napi_threads(dev);
> +       }
> +
>         return err;
>  }
>  EXPORT_SYMBOL(dev_set_threaded);
> @@ -7451,7 +7488,8 @@ static int napi_thread_wait(struct napi_struct *napi)
>  {
>         set_current_state(TASK_INTERRUPTIBLE);
>
> -       while (!kthread_should_stop()) {
> +       /* Wait until we are scheduled or asked to stop. */
> +       while (true) {
>                 /* Testing SCHED_THREADED bit here to make sure the current
>                  * kthread owns this napi and could poll on this napi.
>                  * Testing SCHED bit is not enough because SCHED bit might be
> @@ -7463,6 +7501,15 @@ static int napi_thread_wait(struct napi_struct *napi)
>                         return 0;
>                 }
>
> +               /* Since the SCHED_THREADED is not set so this napi kthread does
> +                * not own this napi and it is safe to stop here. Checking the
> +                * SCHED_THREADED before stopping here makes sure that this napi
> +                * was not scheduled again while napi threaded was being
> +                * disabled.
> +                */
> +               if (kthread_should_stop())
> +                       break;
> +
>                 schedule();
>                 set_current_state(TASK_INTERRUPTIBLE);
>         }
> diff --git a/tools/testing/selftests/net/nl_netdev.py b/tools/testing/selftests/net/nl_netdev.py
> index beaee5e4e2aa..c9109627a741 100755
> --- a/tools/testing/selftests/net/nl_netdev.py
> +++ b/tools/testing/selftests/net/nl_netdev.py
> @@ -2,8 +2,9 @@
>  # SPDX-License-Identifier: GPL-2.0
>
>  import time
> +from os import system
>  from lib.py import ksft_run, ksft_exit, ksft_pr
> -from lib.py import ksft_eq, ksft_ge, ksft_busy_wait
> +from lib.py import ksft_eq, ksft_ge, ksft_ne, ksft_busy_wait
>  from lib.py import NetdevFamily, NetdevSimDev, ip
>
>
> @@ -34,6 +35,39 @@ def napi_list_check(nf) -> None:
>                  ksft_eq(len(napis), 100,
>                          comment=f"queue count after reset queue {q} mode {i}")
>
> +def dev_set_threaded(nf) -> None:
> +    """
> +    Test that verifies various cases of napi threaded
> +    set and unset at device level using sysfs.
> +    """
> +    with NetdevSimDev(queue_count=2) as nsimdev:
> +        nsim = nsimdev.nsims[0]
> +
> +        ip(f"link set dev {nsim.ifname} up")
> +
> +        napis = nf.napi_get({'ifindex': nsim.ifindex}, dump=True)
> +        ksft_eq(len(napis), 2)
> +
> +        napi0_id = napis[0]['id']
> +        napi1_id = napis[1]['id']
> +
> +        # set threaded
> +        system(f"echo 1 > /sys/class/net/{nsim.ifname}/threaded")
> +
> +        # check napi threaded is set for both napis
> +        napi0 = nf.napi_get({'id': napi0_id})
> +        ksft_ne(napi0.get('pid'), None)
> +        napi1 = nf.napi_get({'id': napi1_id})
> +        ksft_ne(napi1.get('pid'), None)
> +
> +        # unset threaded
> +        system(f"echo 0 > /sys/class/net/{nsim.ifname}/threaded")
> +
> +        # check napi threaded is unset for both napis
> +        napi0 = nf.napi_get({'id': napi0_id})
> +        ksft_eq(napi0.get('pid'), None)
> +        napi1 = nf.napi_get({'id': napi1_id})
> +        ksft_eq(napi1.get('pid'), None)
>
>  def nsim_rxq_reset_down(nf) -> None:
>      """
> @@ -122,7 +156,7 @@ def page_pool_check(nf) -> None:
>  def main() -> None:
>      nf = NetdevFamily()
>      ksft_run([empty_check, lo_check, page_pool_check, napi_list_check,
> -              nsim_rxq_reset_down],
> +              dev_set_threaded, nsim_rxq_reset_down],
>               args=(nf, ))
>      ksft_exit()
>
>
> base-commit: b65999e7238e6f2a48dc77c8c2109c48318ff41b
> --
> 2.49.0.1101.gccaa498523-goog
>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH net-next v2] net: stop napi kthreads when THREADED napi is disabled
  2025-05-19 22:43 [PATCH net-next v2] net: stop napi kthreads when THREADED napi is disabled Samiullah Khawaja
  2025-05-20  0:34 ` Wei Wang
@ 2025-05-21  2:09 ` Jakub Kicinski
  2025-05-21 16:41   ` Wei Wang
  1 sibling, 1 reply; 9+ messages in thread
From: Jakub Kicinski @ 2025-05-21  2:09 UTC (permalink / raw)
  To: Samiullah Khawaja
  Cc: David S . Miller , Eric Dumazet, Paolo Abeni, almasrymina,
	willemb, jdamato, mkarsten, weiwan, netdev

On Mon, 19 May 2025 22:43:25 +0000 Samiullah Khawaja wrote:
> -/* Called with irq disabled */
> -static inline void ____napi_schedule(struct softnet_data *sd,
> -				     struct napi_struct *napi)
> +static inline bool ____try_napi_schedule_threaded(struct softnet_data *sd,
> +						  struct napi_struct *napi)
>  {
>  	struct task_struct *thread;
> +	unsigned long new, val;
>  
> -	lockdep_assert_irqs_disabled();
> +	do {
> +		val = READ_ONCE(napi->state);
> +
> +		if (!(val & NAPIF_STATE_THREADED))
> +			return false;

Do we really need to complicate the fastpath to make the slowpath easy?

Plus I'm not sure it works. 

          CPU 0 (IRQ)             CPU 1 (NAPI thr)          CPU 2 (config)
                         if (test_bit(NAPI_STATE_SCHED_THREADED))
                               ...

  ____napi_schedule()
  cmpxchg(...)
  wake_up_process(thread);
                                                       clear_bit(NAPI_STATE_THREADED)
                                                       kthread_stop(thread)

                         if (kthread_should_stop())
                                exit

Right?

I think the shutting down thread should do this:

	while (true) {
		state = READ_ONCE()

		// safe to clear if thread owns the NAPI,
		// or NAPI is completely idle
		if (state & SCHED_THREADED || !(state & SCHED)) {
			state &= ~THREADED;
		} else {
			msleep(1);
			continue;
		}

		if (try_cmpxchg())
			break;
	}

But that's just an idea, it could also be wrong... :S

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH net-next v2] net: stop napi kthreads when THREADED napi is disabled
  2025-05-21  2:09 ` Jakub Kicinski
@ 2025-05-21 16:41   ` Wei Wang
  2025-05-21 17:28     ` Samiullah Khawaja
  0 siblings, 1 reply; 9+ messages in thread
From: Wei Wang @ 2025-05-21 16:41 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Samiullah Khawaja, David S . Miller, Eric Dumazet, Paolo Abeni,
	almasrymina, willemb, jdamato, mkarsten, netdev

On Tue, May 20, 2025 at 7:09 PM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Mon, 19 May 2025 22:43:25 +0000 Samiullah Khawaja wrote:
> > -/* Called with irq disabled */
> > -static inline void ____napi_schedule(struct softnet_data *sd,
> > -                                  struct napi_struct *napi)
> > +static inline bool ____try_napi_schedule_threaded(struct softnet_data *sd,
> > +                                               struct napi_struct *napi)
> >  {
> >       struct task_struct *thread;
> > +     unsigned long new, val;
> >
> > -     lockdep_assert_irqs_disabled();
> > +     do {
> > +             val = READ_ONCE(napi->state);
> > +
> > +             if (!(val & NAPIF_STATE_THREADED))
> > +                     return false;
>
> Do we really need to complicate the fastpath to make the slowpath easy?
>
> Plus I'm not sure it works.
>
>           CPU 0 (IRQ)             CPU 1 (NAPI thr)          CPU 2 (config)
>                          if (test_bit(NAPI_STATE_SCHED_THREADED))
>                                ...
>
>   ____napi_schedule()
>   cmpxchg(...)
>   wake_up_process(thread);
>                                                        clear_bit(NAPI_STATE_THREADED)
>                                                        kthread_stop(thread)
>
>                          if (kthread_should_stop())
>                                 exit
>
> Right?
>
Hmm right... I think the main issue is that while dev_set_threaded()
clears STATE_THREADED, SCHED_THREADED could already be set meaning the
napi is already scheduled. And napi_thread_wait() does not really
check STATE_THREADED...

> I think the shutting down thread should do this:
>
>         while (true) {
>                 state = READ_ONCE()
>
>                 // safe to clear if thread owns the NAPI,
>                 // or NAPI is completely idle
>                 if (state & SCHED_THREADED || !(state & SCHED)) {

I think we should make sure SCHED_THREADED is cleared as well, or
otherwise the thread is in the middle of calling napi_threaded_poll()
and we can't just disable the thread?

>                         state &= ~THREADED;

STATE_THREADED to be exact. Right?

>                 } else {
>                         msleep(1);
>                         continue;
>                 }
>
>                 if (try_cmpxchg())
>                         break;
>         }
>
> But that's just an idea, it could also be wrong... :S

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH net-next v2] net: stop napi kthreads when THREADED napi is disabled
  2025-05-21 16:41   ` Wei Wang
@ 2025-05-21 17:28     ` Samiullah Khawaja
  2025-05-21 19:51       ` Samiullah Khawaja
  0 siblings, 1 reply; 9+ messages in thread
From: Samiullah Khawaja @ 2025-05-21 17:28 UTC (permalink / raw)
  To: Wei Wang
  Cc: Jakub Kicinski, David S . Miller, Eric Dumazet, Paolo Abeni,
	almasrymina, willemb, jdamato, mkarsten, netdev

On Wed, May 21, 2025 at 9:41 AM Wei Wang <weiwan@google.com> wrote:
>
> On Tue, May 20, 2025 at 7:09 PM Jakub Kicinski <kuba@kernel.org> wrote:
> >
> > On Mon, 19 May 2025 22:43:25 +0000 Samiullah Khawaja wrote:
> > > -/* Called with irq disabled */
> > > -static inline void ____napi_schedule(struct softnet_data *sd,
> > > -                                  struct napi_struct *napi)
> > > +static inline bool ____try_napi_schedule_threaded(struct softnet_data *sd,
> > > +                                               struct napi_struct *napi)
> > >  {
> > >       struct task_struct *thread;
> > > +     unsigned long new, val;
> > >
> > > -     lockdep_assert_irqs_disabled();
> > > +     do {
> > > +             val = READ_ONCE(napi->state);
> > > +
> > > +             if (!(val & NAPIF_STATE_THREADED))
> > > +                     return false;
> >
> > Do we really need to complicate the fastpath to make the slowpath easy?
> >
> > Plus I'm not sure it works.
> >
> >           CPU 0 (IRQ)             CPU 1 (NAPI thr)          CPU 2 (config)
> >                          if (test_bit(NAPI_STATE_SCHED_THREADED))
> >                                ...
> >
> >   ____napi_schedule()
> >   cmpxchg(...)
> >   wake_up_process(thread);
> >                                                        clear_bit(NAPI_STATE_THREADED)
> >                                                        kthread_stop(thread)
> >
> >                          if (kthread_should_stop())
> >                                 exit
> >
> > Right?
+1

If the kthread checks whether it was not SCHED before it died then
this should not occur.
> >
> Hmm right... I think the main issue is that while dev_set_threaded()
> clears STATE_THREADED, SCHED_THREADED could already be set meaning the
> napi is already scheduled. And napi_thread_wait() does not really
> check STATE_THREADED...
>
> > I think the shutting down thread should do this:
> >
> >         while (true) {
> >                 state = READ_ONCE()
> >
> >                 // safe to clear if thread owns the NAPI,
> >                 // or NAPI is completely idle
> >                 if (state & SCHED_THREADED || !(state & SCHED)) {
This might suffer from the problem you highlighted earlier,
CPU 0 (IRQ)             CPU 1 (NAPI thr)          CPU 2 (config)

  ____napi_schedule()
    if (test_bit(NAPI_STATE_THREADED))
    if (thread) {

 kthread_stop()
                              if (state & SCHED_THREADED || !(state & SCHED)) {
                                   state &= ~THREADED;
                              if (try_cmp_xchg())
                                   break

       set_bit(NAPI_STATE_SCHED_THREADED)
       wake_up_process(thread);

This would happen without the try_cmp_xchg logic that I added in my
patch in the __napi_schedule (in the fast path). __napi_schedule would
have to make sure that the kthread is not stopping while it is trying
to do SCHED. This is similar to the logic we have in
napi_schedule_prep that handles the STATE_DISABLE, STATE_SCHED and
STATE_MISSED scenarios. Also if it falls back to normal softirq, it
needs to make sure that the kthread is not polling at the same time.

> I think we should make sure SCHED_THREADED is cleared as well, or
> otherwise the thread is in the middle of calling napi_threaded_poll()
> and we can't just disable the thread?
+1

We need to make sure that any scheduled polling should be completed.

>
> >                         state &= ~THREADED;
>
> STATE_THREADED to be exact. Right?
>
> >                 } else {
> >                         msleep(1);
> >                         continue;
> >                 }
> >
> >                 if (try_cmpxchg())
> >                         break;
> >         }
> >
> > But that's just an idea, it could also be wrong... :S

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH net-next v2] net: stop napi kthreads when THREADED napi is disabled
  2025-05-21 17:28     ` Samiullah Khawaja
@ 2025-05-21 19:51       ` Samiullah Khawaja
  2025-05-21 22:21         ` Jakub Kicinski
  0 siblings, 1 reply; 9+ messages in thread
From: Samiullah Khawaja @ 2025-05-21 19:51 UTC (permalink / raw)
  To: Wei Wang
  Cc: Jakub Kicinski, David S . Miller, Eric Dumazet, Paolo Abeni,
	almasrymina, willemb, jdamato, mkarsten, netdev

On Wed, May 21, 2025 at 10:28 AM Samiullah Khawaja <skhawaja@google.com> wrote:
>
> On Wed, May 21, 2025 at 9:41 AM Wei Wang <weiwan@google.com> wrote:
> >
> > On Tue, May 20, 2025 at 7:09 PM Jakub Kicinski <kuba@kernel.org> wrote:
> > >
> > > On Mon, 19 May 2025 22:43:25 +0000 Samiullah Khawaja wrote:
> > > > -/* Called with irq disabled */
> > > > -static inline void ____napi_schedule(struct softnet_data *sd,
> > > > -                                  struct napi_struct *napi)
> > > > +static inline bool ____try_napi_schedule_threaded(struct softnet_data *sd,
> > > > +                                               struct napi_struct *napi)
> > > >  {
> > > >       struct task_struct *thread;
> > > > +     unsigned long new, val;
> > > >
> > > > -     lockdep_assert_irqs_disabled();
> > > > +     do {
> > > > +             val = READ_ONCE(napi->state);
> > > > +
> > > > +             if (!(val & NAPIF_STATE_THREADED))
> > > > +                     return false;
> > >
> > > Do we really need to complicate the fastpath to make the slowpath easy?
> > >
> > > Plus I'm not sure it works.
> > >
> > >           CPU 0 (IRQ)             CPU 1 (NAPI thr)          CPU 2 (config)
> > >                          if (test_bit(NAPI_STATE_SCHED_THREADED))
> > >                                ...
> > >
> > >   ____napi_schedule()
> > >   cmpxchg(...)
> > >   wake_up_process(thread);
> > >                                                        clear_bit(NAPI_STATE_THREADED)
> > >                                                        kthread_stop(thread)
> > >
> > >                          if (kthread_should_stop())
> > >                                 exit
> > >
> > > Right?
> +1
>
> If the kthread checks whether it was not SCHED before it died then
> this should not occur.
> > >
> > Hmm right... I think the main issue is that while dev_set_threaded()
> > clears STATE_THREADED, SCHED_THREADED could already be set meaning the
> > napi is already scheduled. And napi_thread_wait() does not really
> > check STATE_THREADED...
> >
> > > I think the shutting down thread should do this:
> > >
> > >         while (true) {
> > >                 state = READ_ONCE()
> > >
> > >                 // safe to clear if thread owns the NAPI,
> > >                 // or NAPI is completely idle
> > >                 if (state & SCHED_THREADED || !(state & SCHED)) {
> This might suffer from the problem you highlighted earlier,
> CPU 0 (IRQ)             CPU 1 (NAPI thr)          CPU 2 (config)
>
>   ____napi_schedule()
>     if (test_bit(NAPI_STATE_THREADED))
>     if (thread) {
>
>  kthread_stop()
>                               if (state & SCHED_THREADED || !(state & SCHED)) {
>                                    state &= ~THREADED;
>                               if (try_cmp_xchg())
>                                    break
>
>        set_bit(NAPI_STATE_SCHED_THREADED)
>        wake_up_process(thread);
>
> This would happen without the try_cmp_xchg logic that I added in my
> patch in the __napi_schedule (in the fast path). __napi_schedule would
> have to make sure that the kthread is not stopping while it is trying
> to do SCHED. This is similar to the logic we have in
> napi_schedule_prep that handles the STATE_DISABLE, STATE_SCHED and
> STATE_MISSED scenarios. Also if it falls back to normal softirq, it
> needs to make sure that the kthread is not polling at the same time.
Discard this as the SCHED would be set in napi_schedule_prep before
__napi_schedule is called in IRQ, so try_cmpxchg would return false.
I think if the thread stops only when the napi is idle (SCHED is not
set), then it should work. This should make sure any pending
SCHED_THREADED polls are also completed. The existing logic in
napi_schedule_prep should handle all the cases.
>
> > I think we should make sure SCHED_THREADED is cleared as well, or
> > otherwise the thread is in the middle of calling napi_threaded_poll()
> > and we can't just disable the thread?
> +1
>
> We need to make sure that any scheduled polling should be completed.
>
> >
> > >                         state &= ~THREADED;
> >
> > STATE_THREADED to be exact. Right?
> >
> > >                 } else {
> > >                         msleep(1);
> > >                         continue;
> > >                 }
> > >
> > >                 if (try_cmpxchg())
> > >                         break;
> > >         }
> > >
> > > But that's just an idea, it could also be wrong... :S

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH net-next v2] net: stop napi kthreads when THREADED napi is disabled
  2025-05-21 19:51       ` Samiullah Khawaja
@ 2025-05-21 22:21         ` Jakub Kicinski
  2025-05-21 22:50           ` Samiullah Khawaja
  0 siblings, 1 reply; 9+ messages in thread
From: Jakub Kicinski @ 2025-05-21 22:21 UTC (permalink / raw)
  To: Samiullah Khawaja
  Cc: Wei Wang, David S . Miller, Eric Dumazet, Paolo Abeni,
	almasrymina, willemb, jdamato, mkarsten, netdev

On Wed, 21 May 2025 12:51:33 -0700 Samiullah Khawaja wrote:
> > This might suffer from the problem you highlighted earlier,
> > CPU 0 (IRQ)             CPU 1 (NAPI thr)          CPU 2 (config)
> >
> >   ____napi_schedule()
> >     if (test_bit(NAPI_STATE_THREADED))
> >     if (thread) {
> >
> >  kthread_stop()
> >                               if (state & SCHED_THREADED || !(state & SCHED)) {
> >                                    state &= ~THREADED;
> >                               if (try_cmp_xchg())
> >                                    break
> >
> >        set_bit(NAPI_STATE_SCHED_THREADED)
> >        wake_up_process(thread);

This got a bit line wrapped for me so can't judge :(

> > This would happen without the try_cmp_xchg logic that I added in my
> > patch in the __napi_schedule (in the fast path). __napi_schedule would
> > have to make sure that the kthread is not stopping while it is trying
> > to do SCHED. This is similar to the logic we have in
> > napi_schedule_prep that handles the STATE_DISABLE, STATE_SCHED and
> > STATE_MISSED scenarios. Also if it falls back to normal softirq, it
> > needs to make sure that the kthread is not polling at the same time.  
> Discard this as the SCHED would be set in napi_schedule_prepare before
> __napi_schedule is called in IRQ, so try_cmp_xchg would return false.
> I think if the thread stops only when the napi is idle (SCHED is not
> set), then it should work. This should make sure any pending
> SCHED_THREADED polls are also done. The existing logic in
> napi_schedule_prep should handle all
> the cases.

I think we're on the same page. We're clearing the THREADED bit 
(not SCHED_THREADED). Only napi_schedule() path looks at that bit, 
after setting SCHED. So if we cmpxchg on a state where SCHED was 
clear - we can't race with anything that cares about THREADED bit.

Just to be clear - the stopping of the thread has to be after the
proposed loop, so kthread_should_stop() does not come into play.

And FWIW my understanding is that we don't need any barriers on the
fast path (SCHED vs checking THREADED) because memory ordering is
a thing which exists only between distinct memory words.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH net-next v2] net: stop napi kthreads when THREADED napi is disabled
  2025-05-21 22:21         ` Jakub Kicinski
@ 2025-05-21 22:50           ` Samiullah Khawaja
  2025-05-29  0:04             ` Jakub Kicinski
  0 siblings, 1 reply; 9+ messages in thread
From: Samiullah Khawaja @ 2025-05-21 22:50 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Wei Wang, David S . Miller, Eric Dumazet, Paolo Abeni,
	almasrymina, willemb, jdamato, mkarsten, netdev

On Wed, May 21, 2025 at 3:21 PM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Wed, 21 May 2025 12:51:33 -0700 Samiullah Khawaja wrote:
> > > This might suffer from the problem you highlighted earlier,
> > > CPU 0 (IRQ)             CPU 1 (NAPI thr)          CPU 2 (config)
> > >
> > >   ____napi_schedule()
> > >     if (test_bit(NAPI_STATE_THREADED))
> > >     if (thread) {
> > >
> > >  kthread_stop()
> > >                               if (state & SCHED_THREADED || !(state & SCHED)) {
> > >                                    state &= ~THREADED;
> > >                               if (try_cmp_xchg())
> > >                                    break
> > >
> > >        set_bit(NAPI_STATE_SCHED_THREADED)
> > >        wake_up_process(thread);
>
> This got a bit line wrapped for me so can't judge :(
Yep, sorry about that. But I think we are on the same page.
>
> > > This would happen without the try_cmp_xchg logic that I added in my
> > > patch in the __napi_schedule (in the fast path). __napi_schedule would
> > > have to make sure that the kthread is not stopping while it is trying
> > > to do SCHED. This is similar to the logic we have in
> > > napi_schedule_prep that handles the STATE_DISABLE, STATE_SCHED and
> > > STATE_MISSED scenarios. Also if it falls back to normal softirq, it
> > > needs to make sure that the kthread is not polling at the same time.
> > Discard this as the SCHED would be set in napi_schedule_prepare before
> > __napi_schedule is called in IRQ, so try_cmp_xchg would return false.
> > I think if the thread stops only when the napi is idle (SCHED is not
> > set), then it should work. This should make sure any pending
> > SCHED_THREADED polls are also done. The existing logic in
> > napi_schedule_prep should handle all
> > the cases.
>
> I think we're on the same page. We're clearing the THREADED bit
> (not SCHED_THREADED). Only napi_schedule() path looks at that bit,
> after setting SCHED. So if we cmpxchg on a state where SCHED was
> clear - we can't race with anything that cares about THREADED bit.
+1

Yes, the logic in napi_schedule_prep should provide enough mutual
exclusion. The only problem is that we have to do a poll if
SCHED_THREADED is set. But if we unset THREADED (using the cmpxchg
logic) then do a poll, the final poll should unset SCHED_THREADED also
and would be safe to stop. I am going to move this loop to the
napi_thread_poll function so it is a little bit clean.
>
> Just to be clear - the stopping of the thread has to be after the
> proposed loop, so kthread_should_stop() does not come into play.
Wait, the thread will unset STATE_THREADED if kthread_should_stop is
true, right? Otherwise how would the thread know that it has to unset
the bit and stop?

As I understand, we should be doing something like following:

while (true) {
   state = READ_ONCE()
   can_stop = false;

   if (kthread_should_stop) {
       if (SCHED_THREADED || !SCHED) {
           state &= ~THREADED
       } else {
           msleep(1);
           continue;
       }

        if (try_cmpxchg) {
            can_stop = true;
            if (!SCHED_THREADED)
                break;
        }
   }

   if (SCHED_THREADED)
       poll()

   if (can_stop)
       break;
}

dev_set_threaded() {
    ...
   if (!threaded)
      kthread_stop(thread)
}

>
> And FWIW my understanding is that we don't need any barriers on the
> fast path (SCHED vs checking THREADED) because memory ordering is
> a thing which exists only between distinct memory words.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH net-next v2] net: stop napi kthreads when THREADED napi is disabled
  2025-05-21 22:50           ` Samiullah Khawaja
@ 2025-05-29  0:04             ` Jakub Kicinski
  0 siblings, 0 replies; 9+ messages in thread
From: Jakub Kicinski @ 2025-05-29  0:04 UTC (permalink / raw)
  To: Samiullah Khawaja
  Cc: Wei Wang, David S . Miller, Eric Dumazet, Paolo Abeni,
	almasrymina, willemb, jdamato, mkarsten, netdev

On Wed, 21 May 2025 15:50:19 -0700 Samiullah Khawaja wrote:
> > Just to be clear - the stopping of the thread has to be after the
> > proposed loop, so kthread_should_stop() does not come into play.  
> Wait, the thread will unset STATE_THREADED if kthread_should_stop is
> true. right? Otherwise how would thread know that it has to unset the
> bit and stop?
> 
> As I understand, we should be doing something like following:
> 
> while (true) {
>    state = READ_ONCE()
>    can_stop = false;
> 
>    if (kthread_should_stop) {
>        if (SCHED_THREADED || !SCHED) {
> >            state &= ~THREADED
>        } else {
>            msleep(1);
>            continue;
>        }
> 
>         if (try_cmpxchg) {
>             can_stop = true;
>             if (!SCHED_THREADED)
>                 break;
>         }
>    }
> 
>    if (SCHED_THREADED)
>        poll()
> 
> >    if (can_stop)
>        break;
> }

So moving the stopping logic into the polling thread? I don't think this
helps anything. 

Once we unset the THREADED bit we should wait for the thread to clear
SCHED_THREADED (before trying to stop it). SCHED_THREADED going away
is our signal that it's safe to reap the thread.

If you're afraid that the thread will not terminate (because packets
continue to flow in) - we probably want something like:

diff --git a/net/core/dev.c b/net/core/dev.c
index 2b514d95c528..33d4b726395b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7548,6 +7548,13 @@ static void napi_threaded_poll_loop(struct napi_struct *napi)
                if (!repoll)
                        break;
 
+               /* Thread is going away, give up the ownership */
+               if (!likely(READ_ONCE(napi->state) & NAPIF_STATE_THREADED)) {
+                       clear_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
+                       __napi_schedule(napi);
+                       break;
+               }
+
                rcu_softirq_qs_periodic(last_qs);
                cond_resched();
        }

^ permalink raw reply related	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2025-05-29  0:04 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-05-19 22:43 [PATCH net-next v2] net: stop napi kthreads when THREADED napi is disabled Samiullah Khawaja
2025-05-20  0:34 ` Wei Wang
2025-05-21  2:09 ` Jakub Kicinski
2025-05-21 16:41   ` Wei Wang
2025-05-21 17:28     ` Samiullah Khawaja
2025-05-21 19:51       ` Samiullah Khawaja
2025-05-21 22:21         ` Jakub Kicinski
2025-05-21 22:50           ` Samiullah Khawaja
2025-05-29  0:04             ` Jakub Kicinski

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).