Linux RCU subsystem development
* [PATCH 12/19] kthread: Default affine kthread to its preferred NUMA node
       [not found] <20240807160228.26206-1-frederic@kernel.org>
@ 2024-08-07 16:02 ` Frederic Weisbecker
  2024-08-07 17:01   ` Vlastimil Babka
  0 siblings, 1 reply; 22+ messages in thread
From: Frederic Weisbecker @ 2024-08-07 16:02 UTC
  To: LKML
  Cc: Frederic Weisbecker, Andrew Morton, Kees Cook, Peter Zijlstra,
	Thomas Gleixner, Michal Hocko, Vlastimil Babka, linux-mm,
	Paul E. McKenney, Neeraj Upadhyay, Joel Fernandes, Boqun Feng,
	Zqiang, rcu

Kthreads attached to a preferred NUMA node for their task structure
allocation can also be assumed to run preferably within that same node.

A more precise affinity is usually specified by calling
kthread_create_on_cpu() or kthread_bind[_mask]() before the first wakeup.
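
As an illustration (not part of the patch), a minimal sketch of both
creation patterns; the my_*() helpers and names are hypothetical, only
the kthread_*() and wake_up_process() calls are existing API:

	#include <linux/kthread.h>
	#include <linux/sched.h>
	#include <linux/err.h>

	static int my_thread_fn(void *data)
	{
		/* Toy loop: sleep until someone calls kthread_stop(). */
		while (!kthread_should_stop())
			schedule_timeout_interruptible(HZ);
		return 0;
	}

	/* Precise affinity: bound to a single CPU before the first wakeup. */
	static struct task_struct *my_create_pinned(unsigned int cpu)
	{
		struct task_struct *t;

		t = kthread_create_on_cpu(my_thread_fn, NULL, cpu, "my_kd/%u");
		if (!IS_ERR(t))
			wake_up_process(t);
		return t;
	}

	/* Node preference only: the case the new default node affinity covers. */
	static struct task_struct *my_create_preferred(int node)
	{
		struct task_struct *t;

		t = kthread_create_on_node(my_thread_fn, NULL, node, "my_kd");
		if (!IS_ERR(t))
			wake_up_process(t);
		return t;
	}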

For the others, a default affinity to the node is desired and sometimes
implemented with more or less success when it comes to dealing with hotplug
events and nohz_full / CPU Isolation interactions:

- kcompactd is affine to its node and handles hotplug but not CPU Isolation
- kswapd is affine to its node and ignores hotplug and CPU Isolation
- A bunch of drivers create their kthreads on a specific node and
  don't bother affining them any further.

Handle that default node affinity preference at the generic level
instead, provided a kthread is created on an actual node and doesn't
apply any specific affinity such as a given CPU or a custom cpumask to
bind to before its first wake-up.

This generic handling is aware of CPU hotplug events and CPU isolation
such that:

* When a housekeeping CPU goes up and is part of the node of a given
  kthread, it is added to its applied affinity set (and
  possibly the default last resort online housekeeping set is removed
  from the set).

* When a housekeeping CPU goes down while it was part of the node of a
  kthread, it is removed from the kthread's applied
  affinity. The last resort is to affine the kthread to all online
  housekeeping CPUs.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/cpuhotplug.h |   1 +
 kernel/kthread.c           | 120 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 120 insertions(+), 1 deletion(-)

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 9316c39260e0..89d852538b72 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -240,6 +240,7 @@ enum cpuhp_state {
 	CPUHP_AP_WORKQUEUE_ONLINE,
 	CPUHP_AP_RANDOM_ONLINE,
 	CPUHP_AP_RCUTREE_ONLINE,
+	CPUHP_AP_KTHREADS_ONLINE,
 	CPUHP_AP_BASE_CACHEINFO_ONLINE,
 	CPUHP_AP_ONLINE_DYN,
 	CPUHP_AP_ONLINE_DYN_END		= CPUHP_AP_ONLINE_DYN + 40,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index ecb719f54f7a..eee5925e7725 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -35,6 +35,10 @@ static DEFINE_SPINLOCK(kthread_create_lock);
 static LIST_HEAD(kthread_create_list);
 struct task_struct *kthreadd_task;
 
+static struct cpumask kthread_online_mask;
+static LIST_HEAD(kthreads_hotplug);
+static DEFINE_MUTEX(kthreads_hotplug_lock);
+
 struct kthread_create_info
 {
 	/* Information passed to kthread() from kthreadd. */
@@ -53,6 +57,7 @@ struct kthread_create_info
 struct kthread {
 	unsigned long flags;
 	unsigned int cpu;
+	unsigned int node;
 	int started;
 	int result;
 	int (*threadfn)(void *);
@@ -64,6 +69,8 @@ struct kthread {
 #endif
 	/* To store the full name if task comm is truncated. */
 	char *full_name;
+	struct task_struct *task;
+	struct list_head hotplug_node;
 };
 
 enum KTHREAD_BITS {
@@ -122,8 +129,11 @@ bool set_kthread_struct(struct task_struct *p)
 
 	init_completion(&kthread->exited);
 	init_completion(&kthread->parked);
+	INIT_LIST_HEAD(&kthread->hotplug_node);
 	p->vfork_done = &kthread->exited;
 
+	kthread->task = p;
+	kthread->node = tsk_fork_get_node(current);
 	p->worker_private = kthread;
 	return true;
 }
@@ -314,6 +324,13 @@ void __noreturn kthread_exit(long result)
 {
 	struct kthread *kthread = to_kthread(current);
 	kthread->result = result;
+	if (!list_empty(&kthread->hotplug_node)) {
+		mutex_lock(&kthreads_hotplug_lock);
+		list_del(&kthread->hotplug_node);
+		/* Make sure the kthread never gets re-affined globally */
+		set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KTHREAD));
+		mutex_unlock(&kthreads_hotplug_lock);
+	}
 	do_exit(0);
 }
 EXPORT_SYMBOL(kthread_exit);
@@ -339,6 +356,45 @@ void __noreturn kthread_complete_and_exit(struct completion *comp, long code)
 }
 EXPORT_SYMBOL(kthread_complete_and_exit);
 
+static void kthread_fetch_affinity(struct kthread *k, struct cpumask *mask)
+{
+	if (k->node == NUMA_NO_NODE) {
+		cpumask_copy(mask, housekeeping_cpumask(HK_TYPE_KTHREAD));
+	} else {
+		/*
+		 * The node cpumask is racy when read from kthread() but:
+		 * - a racing CPU going down won't be present in kthread_online_mask
+		 * - a racing CPU going up will be handled by kthreads_online_cpu()
+		 */
+		cpumask_and(mask, cpumask_of_node(k->node), &kthread_online_mask);
+		cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_KTHREAD));
+		if (cpumask_empty(mask))
+			cpumask_copy(mask, housekeeping_cpumask(HK_TYPE_KTHREAD));
+	}
+}
+
+static int kthread_affine_node(void)
+{
+	struct kthread *kthread = to_kthread(current);
+	cpumask_var_t affinity;
+
+	WARN_ON_ONCE(kthread_is_per_cpu(current));
+
+	if (!zalloc_cpumask_var(&affinity, GFP_KERNEL))
+		return -ENOMEM;
+
+	mutex_lock(&kthreads_hotplug_lock);
+	WARN_ON_ONCE(!list_empty(&kthread->hotplug_node));
+	list_add_tail(&kthread->hotplug_node, &kthreads_hotplug);
+	kthread_fetch_affinity(kthread, affinity);
+	set_cpus_allowed_ptr(current, affinity);
+	mutex_unlock(&kthreads_hotplug_lock);
+
+	free_cpumask_var(affinity);
+
+	return 0;
+}
+
 static int kthread(void *_create)
 {
 	static const struct sched_param param = { .sched_priority = 0 };
@@ -369,7 +425,6 @@ static int kthread(void *_create)
 	 * back to default in case they have been changed.
 	 */
 	sched_setscheduler_nocheck(current, SCHED_NORMAL, &param);
-	set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KTHREAD));
 
 	/* OK, tell user we're spawned, wait for stop or wakeup */
 	__set_current_state(TASK_UNINTERRUPTIBLE);
@@ -385,6 +440,9 @@ static int kthread(void *_create)
 
 	self->started = 1;
 
+	if (!(current->flags & PF_NO_SETAFFINITY))
+		kthread_affine_node();
+
 	ret = -EINTR;
 	if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) {
 		cgroup_kthread_ready();
@@ -779,6 +837,66 @@ int kthreadd(void *unused)
 	return 0;
 }
 
+static int kthreads_hotplug_update(void)
+{
+	cpumask_var_t affinity;
+	struct kthread *k;
+	int err;
+
+	if (list_empty(&kthreads_hotplug))
+		return 0;
+
+	if (!zalloc_cpumask_var(&affinity, GFP_KERNEL))
+		return -ENOMEM;
+
+	err = 0;
+
+	list_for_each_entry(k, &kthreads_hotplug, hotplug_node) {
+		if (WARN_ON_ONCE((k->task->flags & PF_NO_SETAFFINITY) ||
+				 kthread_is_per_cpu(k->task))) {
+			err = -EINVAL;
+			continue;
+		}
+		kthread_fetch_affinity(k, affinity);
+		set_cpus_allowed_ptr(k->task, affinity);
+	}
+
+	free_cpumask_var(affinity);
+
+	return err;
+}
+
+static int kthreads_offline_cpu(unsigned int cpu)
+{
+	int ret = 0;
+
+	mutex_lock(&kthreads_hotplug_lock);
+	cpumask_clear_cpu(cpu, &kthread_online_mask);
+	ret = kthreads_hotplug_update();
+	mutex_unlock(&kthreads_hotplug_lock);
+
+	return ret;
+}
+
+static int kthreads_online_cpu(unsigned int cpu)
+{
+	int ret = 0;
+
+	mutex_lock(&kthreads_hotplug_lock);
+	cpumask_set_cpu(cpu, &kthread_online_mask);
+	ret = kthreads_hotplug_update();
+	mutex_unlock(&kthreads_hotplug_lock);
+
+	return ret;
+}
+
+static int kthreads_init(void)
+{
+	return cpuhp_setup_state(CPUHP_AP_KTHREADS_ONLINE, "kthreads:online",
+				kthreads_online_cpu, kthreads_offline_cpu);
+}
+early_initcall(kthreads_init);
+
 void __kthread_init_worker(struct kthread_worker *worker,
 				const char *name,
 				struct lock_class_key *key)
-- 
2.45.2



* Re: [PATCH 12/19] kthread: Default affine kthread to its preferred NUMA node
  2024-08-07 16:02 ` Frederic Weisbecker
@ 2024-08-07 17:01   ` Vlastimil Babka
  0 siblings, 0 replies; 22+ messages in thread
From: Vlastimil Babka @ 2024-08-07 17:01 UTC
  To: Frederic Weisbecker, LKML
  Cc: Andrew Morton, Kees Cook, Peter Zijlstra, Thomas Gleixner,
	Michal Hocko, linux-mm, Paul E. McKenney, Neeraj Upadhyay,
	Joel Fernandes, Boqun Feng, Zqiang, rcu

On 8/7/24 18:02, Frederic Weisbecker wrote:
> Kthreads attached to a preferred NUMA node for their task structure
> allocation can also be assumed to run preferably within that same node.
> 
> A more precise affinity is usually specified by calling
> kthread_create_on_cpu() or kthread_bind[_mask]() before the first wakeup.
> 
> For the others, a default affinity to the node is desired and sometimes
> implemented with more or less success when it comes to dealing with hotplug
> events and nohz_full / CPU Isolation interactions:
> 
> - kcompactd is affine to its node and handles hotplug but not CPU Isolation
> - kswapd is affine to its node and ignores hotplug and CPU Isolation
> - A bunch of drivers create their kthreads on a specific node and
>   don't bother affining them any further.
> 
> Handle that default node affinity preference at the generic level
> instead, provided a kthread is created on an actual node and doesn't
> apply any specific affinity such as a given CPU or a custom cpumask to
> bind to before its first wake-up.
> 
> This generic handling is aware of CPU hotplug events and CPU isolation
> such that:
> 
> * When a housekeeping CPU goes up and is part of the node of a given
>   kthread, it is added to its applied affinity set (and
>   possibly the default last resort online housekeeping set is removed
>   from the set).
> 
> * When a housekeeping CPU goes down while it was part of the node of a
>   kthread, it is removed from the kthread's applied
>   affinity. The last resort is to affine the kthread to all online
>   housekeeping CPUs.
> 
> Signed-off-by: Frederic Weisbecker <frederic@kernel.org>

Acked-by: Vlastimil Babka <vbabka@suse.cz>



* [PATCH 12/19] kthread: Default affine kthread to its preferred NUMA node
       [not found] <20240916224925.20540-1-frederic@kernel.org>
@ 2024-09-16 22:49 ` Frederic Weisbecker
  2024-09-17  6:26   ` Michal Hocko
  0 siblings, 1 reply; 22+ messages in thread
From: Frederic Weisbecker @ 2024-09-16 22:49 UTC
  To: LKML
  Cc: Frederic Weisbecker, Andrew Morton, Kees Cook, Peter Zijlstra,
	Thomas Gleixner, Michal Hocko, Vlastimil Babka, linux-mm,
	Paul E. McKenney, Neeraj Upadhyay, Joel Fernandes, Boqun Feng,
	Zqiang, rcu

Kthreads attached to a preferred NUMA node for their task structure
allocation can also be assumed to run preferably within that same node.

A more precise affinity is usually specified by calling
kthread_create_on_cpu() or kthread_bind[_mask]() before the first wakeup.

For the others, a default affinity to the node is desired and sometimes
implemented with more or less success when it comes to dealing with hotplug
events and nohz_full / CPU Isolation interactions:

- kcompactd is affine to its node and handles hotplug but not CPU Isolation
- kswapd is affine to its node and ignores hotplug and CPU Isolation
- A bunch of drivers create their kthreads on a specific node and
  don't bother affining them any further.

Handle that default node affinity preference at the generic level
instead, provided a kthread is created on an actual node and doesn't
apply any specific affinity such as a given CPU or a custom cpumask to
bind to before its first wake-up.

This generic handling is aware of CPU hotplug events and CPU isolation
such that:

* When a housekeeping CPU goes up and is part of the node of a given
  kthread, it is added to its applied affinity set (and
  possibly the default last resort online housekeeping set is removed
  from the set).

* When a housekeeping CPU goes down while it was part of the node of a
  kthread, it is removed from the kthread's applied
  affinity. The last resort is to affine the kthread to all online
  housekeeping CPUs.

Acked-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/cpuhotplug.h |   1 +
 kernel/kthread.c           | 120 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 120 insertions(+), 1 deletion(-)

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 9316c39260e0..89d852538b72 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -240,6 +240,7 @@ enum cpuhp_state {
 	CPUHP_AP_WORKQUEUE_ONLINE,
 	CPUHP_AP_RANDOM_ONLINE,
 	CPUHP_AP_RCUTREE_ONLINE,
+	CPUHP_AP_KTHREADS_ONLINE,
 	CPUHP_AP_BASE_CACHEINFO_ONLINE,
 	CPUHP_AP_ONLINE_DYN,
 	CPUHP_AP_ONLINE_DYN_END		= CPUHP_AP_ONLINE_DYN + 40,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index ecb719f54f7a..eee5925e7725 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -35,6 +35,10 @@ static DEFINE_SPINLOCK(kthread_create_lock);
 static LIST_HEAD(kthread_create_list);
 struct task_struct *kthreadd_task;
 
+static struct cpumask kthread_online_mask;
+static LIST_HEAD(kthreads_hotplug);
+static DEFINE_MUTEX(kthreads_hotplug_lock);
+
 struct kthread_create_info
 {
 	/* Information passed to kthread() from kthreadd. */
@@ -53,6 +57,7 @@ struct kthread_create_info
 struct kthread {
 	unsigned long flags;
 	unsigned int cpu;
+	unsigned int node;
 	int started;
 	int result;
 	int (*threadfn)(void *);
@@ -64,6 +69,8 @@ struct kthread {
 #endif
 	/* To store the full name if task comm is truncated. */
 	char *full_name;
+	struct task_struct *task;
+	struct list_head hotplug_node;
 };
 
 enum KTHREAD_BITS {
@@ -122,8 +129,11 @@ bool set_kthread_struct(struct task_struct *p)
 
 	init_completion(&kthread->exited);
 	init_completion(&kthread->parked);
+	INIT_LIST_HEAD(&kthread->hotplug_node);
 	p->vfork_done = &kthread->exited;
 
+	kthread->task = p;
+	kthread->node = tsk_fork_get_node(current);
 	p->worker_private = kthread;
 	return true;
 }
@@ -314,6 +324,13 @@ void __noreturn kthread_exit(long result)
 {
 	struct kthread *kthread = to_kthread(current);
 	kthread->result = result;
+	if (!list_empty(&kthread->hotplug_node)) {
+		mutex_lock(&kthreads_hotplug_lock);
+		list_del(&kthread->hotplug_node);
+		/* Make sure the kthread never gets re-affined globally */
+		set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KTHREAD));
+		mutex_unlock(&kthreads_hotplug_lock);
+	}
 	do_exit(0);
 }
 EXPORT_SYMBOL(kthread_exit);
@@ -339,6 +356,45 @@ void __noreturn kthread_complete_and_exit(struct completion *comp, long code)
 }
 EXPORT_SYMBOL(kthread_complete_and_exit);
 
+static void kthread_fetch_affinity(struct kthread *k, struct cpumask *mask)
+{
+	if (k->node == NUMA_NO_NODE) {
+		cpumask_copy(mask, housekeeping_cpumask(HK_TYPE_KTHREAD));
+	} else {
+		/*
+		 * The node cpumask is racy when read from kthread() but:
+		 * - a racing CPU going down won't be present in kthread_online_mask
+		 * - a racing CPU going up will be handled by kthreads_online_cpu()
+		 */
+		cpumask_and(mask, cpumask_of_node(k->node), &kthread_online_mask);
+		cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_KTHREAD));
+		if (cpumask_empty(mask))
+			cpumask_copy(mask, housekeeping_cpumask(HK_TYPE_KTHREAD));
+	}
+}
+
+static int kthread_affine_node(void)
+{
+	struct kthread *kthread = to_kthread(current);
+	cpumask_var_t affinity;
+
+	WARN_ON_ONCE(kthread_is_per_cpu(current));
+
+	if (!zalloc_cpumask_var(&affinity, GFP_KERNEL))
+		return -ENOMEM;
+
+	mutex_lock(&kthreads_hotplug_lock);
+	WARN_ON_ONCE(!list_empty(&kthread->hotplug_node));
+	list_add_tail(&kthread->hotplug_node, &kthreads_hotplug);
+	kthread_fetch_affinity(kthread, affinity);
+	set_cpus_allowed_ptr(current, affinity);
+	mutex_unlock(&kthreads_hotplug_lock);
+
+	free_cpumask_var(affinity);
+
+	return 0;
+}
+
 static int kthread(void *_create)
 {
 	static const struct sched_param param = { .sched_priority = 0 };
@@ -369,7 +425,6 @@ static int kthread(void *_create)
 	 * back to default in case they have been changed.
 	 */
 	sched_setscheduler_nocheck(current, SCHED_NORMAL, &param);
-	set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KTHREAD));
 
 	/* OK, tell user we're spawned, wait for stop or wakeup */
 	__set_current_state(TASK_UNINTERRUPTIBLE);
@@ -385,6 +440,9 @@ static int kthread(void *_create)
 
 	self->started = 1;
 
+	if (!(current->flags & PF_NO_SETAFFINITY))
+		kthread_affine_node();
+
 	ret = -EINTR;
 	if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) {
 		cgroup_kthread_ready();
@@ -779,6 +837,66 @@ int kthreadd(void *unused)
 	return 0;
 }
 
+static int kthreads_hotplug_update(void)
+{
+	cpumask_var_t affinity;
+	struct kthread *k;
+	int err;
+
+	if (list_empty(&kthreads_hotplug))
+		return 0;
+
+	if (!zalloc_cpumask_var(&affinity, GFP_KERNEL))
+		return -ENOMEM;
+
+	err = 0;
+
+	list_for_each_entry(k, &kthreads_hotplug, hotplug_node) {
+		if (WARN_ON_ONCE((k->task->flags & PF_NO_SETAFFINITY) ||
+				 kthread_is_per_cpu(k->task))) {
+			err = -EINVAL;
+			continue;
+		}
+		kthread_fetch_affinity(k, affinity);
+		set_cpus_allowed_ptr(k->task, affinity);
+	}
+
+	free_cpumask_var(affinity);
+
+	return err;
+}
+
+static int kthreads_offline_cpu(unsigned int cpu)
+{
+	int ret = 0;
+
+	mutex_lock(&kthreads_hotplug_lock);
+	cpumask_clear_cpu(cpu, &kthread_online_mask);
+	ret = kthreads_hotplug_update();
+	mutex_unlock(&kthreads_hotplug_lock);
+
+	return ret;
+}
+
+static int kthreads_online_cpu(unsigned int cpu)
+{
+	int ret = 0;
+
+	mutex_lock(&kthreads_hotplug_lock);
+	cpumask_set_cpu(cpu, &kthread_online_mask);
+	ret = kthreads_hotplug_update();
+	mutex_unlock(&kthreads_hotplug_lock);
+
+	return ret;
+}
+
+static int kthreads_init(void)
+{
+	return cpuhp_setup_state(CPUHP_AP_KTHREADS_ONLINE, "kthreads:online",
+				kthreads_online_cpu, kthreads_offline_cpu);
+}
+early_initcall(kthreads_init);
+
 void __kthread_init_worker(struct kthread_worker *worker,
 				const char *name,
 				struct lock_class_key *key)
-- 
2.46.0



* Re: [PATCH 12/19] kthread: Default affine kthread to its preferred NUMA node
  2024-09-16 22:49 ` [PATCH 12/19] kthread: Default affine kthread to its preferred NUMA node Frederic Weisbecker
@ 2024-09-17  6:26   ` Michal Hocko
  2024-09-17  7:01     ` Vlastimil Babka
  2024-09-17 10:34     ` Frederic Weisbecker
  0 siblings, 2 replies; 22+ messages in thread
From: Michal Hocko @ 2024-09-17  6:26 UTC
  To: Frederic Weisbecker
  Cc: LKML, Andrew Morton, Kees Cook, Peter Zijlstra, Thomas Gleixner,
	Vlastimil Babka, linux-mm, Paul E. McKenney, Neeraj Upadhyay,
	Joel Fernandes, Boqun Feng, Zqiang, rcu

On Tue 17-09-24 00:49:16, Frederic Weisbecker wrote:
> Kthreads attached to a preferred NUMA node for their task structure
> allocation can also be assumed to run preferably within that same node.
> 
> A more precise affinity is usually specified by calling
> kthread_create_on_cpu() or kthread_bind[_mask]() before the first wakeup.
> 
> For the others, a default affinity to the node is desired and sometimes
> implemented with more or less success when it comes to dealing with hotplug
> events and nohz_full / CPU Isolation interactions:
> 
> - kcompactd is affine to its node and handles hotplug but not CPU Isolation
> - kswapd is affine to its node and ignores hotplug and CPU Isolation
> - A bunch of drivers create their kthreads on a specific node and
>   don't bother affining them any further.
> 
> Handle that default node affinity preference at the generic level
> instead, provided a kthread is created on an actual node and doesn't
> apply any specific affinity such as a given CPU or a custom cpumask to
> bind to before its first wake-up.

Makes sense.

> This generic handling is aware of CPU hotplug events and CPU isolation
> such that:
> 
> * When a housekeeping CPU goes up and is part of the node of a given
>   kthread, it is added to its applied affinity set (and
>   possibly the default last resort online housekeeping set is removed
>   from the set).
> 
> * When a housekeeping CPU goes down while it was part of the node of a
>   kthread, it is removed from the kthread's applied
>   affinity. The last resort is to affine the kthread to all online
>   housekeeping CPUs.

But I am not really sure about this part. Sure it makes sense to set the
affinity to exclude isolated CPUs but why do we care about hotplug
events at all. Let's say we offline all cpus from a given node (or
that all but isolated cpus are offline - is this even a
realistic/reasonable usecase?). Wouldn't the scheduler ignore the kthread's
affinity in such a case? In other words how is that different from
tasksetting a userspace task to a cpu that goes offline? We still do
allow such a task to run, right? We just do not care about affinity
anymore.
-- 
Michal Hocko
SUSE Labs


* Re: [PATCH 12/19] kthread: Default affine kthread to its preferred NUMA node
  2024-09-17  6:26   ` Michal Hocko
@ 2024-09-17  7:01     ` Vlastimil Babka
  2024-09-17  7:05       ` Michal Hocko
  2024-09-17 10:34     ` Frederic Weisbecker
  1 sibling, 1 reply; 22+ messages in thread
From: Vlastimil Babka @ 2024-09-17  7:01 UTC
  To: Michal Hocko, Frederic Weisbecker
  Cc: LKML, Andrew Morton, Kees Cook, Peter Zijlstra, Thomas Gleixner,
	linux-mm, Paul E. McKenney, Neeraj Upadhyay, Joel Fernandes,
	Boqun Feng, Zqiang, rcu

On 9/17/24 8:26 AM, Michal Hocko wrote:
> On Tue 17-09-24 00:49:16, Frederic Weisbecker wrote:
>> Kthreads attached to a preferred NUMA node for their task structure
>> allocation can also be assumed to run preferably within that same node.
>>
>> A more precise affinity is usually specified by calling
>> kthread_create_on_cpu() or kthread_bind[_mask]() before the first wakeup.
>>
>> For the others, a default affinity to the node is desired and sometimes
>> implemented with more or less success when it comes to dealing with hotplug
>> events and nohz_full / CPU Isolation interactions:
>>
>> - kcompactd is affine to its node and handles hotplug but not CPU Isolation
>> - kswapd is affine to its node and ignores hotplug and CPU Isolation
>> - A bunch of drivers create their kthreads on a specific node and
>>   don't bother affining them any further.
>>
>> Handle that default node affinity preference at the generic level
>> instead, provided a kthread is created on an actual node and doesn't
>> apply any specific affinity such as a given CPU or a custom cpumask to
>> bind to before its first wake-up.
> 
> Makes sense.
> 
>> This generic handling is aware of CPU hotplug events and CPU isolation
>> such that:
>>
>> * When a housekeeping CPU goes up and is part of the node of a given
>>   kthread, it is added to its applied affinity set (and
>>   possibly the default last resort online housekeeping set is removed
>>   from the set).
>>
>> * When a housekeeping CPU goes down while it was part of the node of a
>>   kthread, it is removed from the kthread's applied
>>   affinity. The last resort is to affine the kthread to all online
>>   housekeeping CPUs.
> 
> But I am not really sure about this part. Sure it makes sense to set the
> affinity to exclude isolated CPUs but why do we care about hotplug
> events at all. Let's say we offline all cpus from a given node (or
> that all but isolated cpus are offline - is this even a
> realistic/reasonable usecase?). Wouldn't the scheduler ignore the kthread's
> affinity in such a case? In other words how is that different from
> tasksetting a userspace task to a cpu that goes offline? We still do
> allow such a task to run, right? We just do not care about affinity
> anymore.

AFAIU it better handles the situation where all housekeeping cpus from
the preferred node go down: then it affines to housekeeping cpus from any
node, versus any cpu including isolated ones.
Yes it's probably a scenario that's not recommended, but someone might
do it anyway...


* Re: [PATCH 12/19] kthread: Default affine kthread to its preferred NUMA node
  2024-09-17  7:01     ` Vlastimil Babka
@ 2024-09-17  7:05       ` Michal Hocko
  2024-09-17  7:14         ` Vlastimil Babka
  0 siblings, 1 reply; 22+ messages in thread
From: Michal Hocko @ 2024-09-17  7:05 UTC
  To: Vlastimil Babka
  Cc: Frederic Weisbecker, LKML, Andrew Morton, Kees Cook,
	Peter Zijlstra, Thomas Gleixner, linux-mm, Paul E. McKenney,
	Neeraj Upadhyay, Joel Fernandes, Boqun Feng, Zqiang, rcu

On Tue 17-09-24 09:01:08, Vlastimil Babka wrote:
> On 9/17/24 8:26 AM, Michal Hocko wrote:
> > On Tue 17-09-24 00:49:16, Frederic Weisbecker wrote:
> >> Kthreads attached to a preferred NUMA node for their task structure
> >> allocation can also be assumed to run preferably within that same node.
> >>
> >> A more precise affinity is usually specified by calling
> >> kthread_create_on_cpu() or kthread_bind[_mask]() before the first wakeup.
> >>
> >> For the others, a default affinity to the node is desired and sometimes
> >> implemented with more or less success when it comes to dealing with hotplug
> >> events and nohz_full / CPU Isolation interactions:
> >>
> >> - kcompactd is affine to its node and handles hotplug but not CPU Isolation
> >> - kswapd is affine to its node and ignores hotplug and CPU Isolation
> >> - A bunch of drivers create their kthreads on a specific node and
> >>   don't bother affining them any further.
> >>
> >> Handle that default node affinity preference at the generic level
> >> instead, provided a kthread is created on an actual node and doesn't
> >> apply any specific affinity such as a given CPU or a custom cpumask to
> >> bind to before its first wake-up.
> > 
> > Makes sense.
> > 
> >> This generic handling is aware of CPU hotplug events and CPU isolation
> >> such that:
> >>
> >> * When a housekeeping CPU goes up and is part of the node of a given
> >>   kthread, it is added to its applied affinity set (and
> >>   possibly the default last resort online housekeeping set is removed
> >>   from the set).
> >>
> >> * When a housekeeping CPU goes down while it was part of the node of a
> >>   kthread, it is removed from the kthread's applied
> >>   affinity. The last resort is to affine the kthread to all online
> >>   housekeeping CPUs.
> > 
> > But I am not really sure about this part. Sure it makes sense to set the
> > affinity to exclude isolated CPUs but why do we care about hotplug
> > events at all. Let's say we offline all cpus from a given node (or
> > that all but isolated cpus are offline - is this even a
> > realistic/reasonable usecase?). Wouldn't the scheduler ignore the kthread's
> > affinity in such a case? In other words how is that different from
> > tasksetting a userspace task to a cpu that goes offline? We still do
> > allow such a task to run, right? We just do not care about affinity
> > anymore.
> 
> AFAIU it better handles the situation where all housekeeping cpus from
> the preferred node go down: then it affines to housekeeping cpus from any
> node, versus any cpu including isolated ones.

Doesn't that happen automagically? Or can it end up on a random
isolated cpu?

-- 
Michal Hocko
SUSE Labs


* Re: [PATCH 12/19] kthread: Default affine kthread to its preferred NUMA node
  2024-09-17  7:05       ` Michal Hocko
@ 2024-09-17  7:14         ` Vlastimil Babka
  0 siblings, 0 replies; 22+ messages in thread
From: Vlastimil Babka @ 2024-09-17  7:14 UTC
  To: Michal Hocko
  Cc: Frederic Weisbecker, LKML, Andrew Morton, Kees Cook,
	Peter Zijlstra, Thomas Gleixner, linux-mm, Paul E. McKenney,
	Neeraj Upadhyay, Joel Fernandes, Boqun Feng, Zqiang, rcu

On 9/17/24 9:05 AM, Michal Hocko wrote:
> On Tue 17-09-24 09:01:08, Vlastimil Babka wrote:
>> On 9/17/24 8:26 AM, Michal Hocko wrote:
>>> On Tue 17-09-24 00:49:16, Frederic Weisbecker wrote:
>>>> Kthreads attached to a preferred NUMA node for their task structure
>>>> allocation can also be assumed to run preferably within that same node.
>>>>
>>>> A more precise affinity is usually specified by calling
>>>> kthread_create_on_cpu() or kthread_bind[_mask]() before the first wakeup.
>>>>
>>>> For the others, a default affinity to the node is desired and sometimes
>>>> implemented with more or less success when it comes to dealing with hotplug
>>>> events and nohz_full / CPU Isolation interactions:
>>>>
>>>> - kcompactd is affine to its node and handles hotplug but not CPU Isolation
>>>> - kswapd is affine to its node and ignores hotplug and CPU Isolation
>>>> - A bunch of drivers create their kthreads on a specific node and
>>>>   don't bother affining them any further.
>>>>
>>>> Handle that default node affinity preference at the generic level
>>>> instead, provided a kthread is created on an actual node and doesn't
>>>> apply any specific affinity such as a given CPU or a custom cpumask to
>>>> bind to before its first wake-up.
>>>
>>> Makes sense.
>>>
>>>> This generic handling is aware of CPU hotplug events and CPU isolation
>>>> such that:
>>>>
>>>> * When a housekeeping CPU goes up and is part of the node of a given
>>>>   kthread, it is added to its applied affinity set (and
>>>>   possibly the default last resort online housekeeping set is removed
>>>>   from the set).
>>>>
>>>> * When a housekeeping CPU goes down while it was part of the node of a
>>>>   kthread, it is removed from the kthread's applied
>>>>   affinity. The last resort is to affine the kthread to all online
>>>>   housekeeping CPUs.
>>>
>>> But I am not really sure about this part. Sure it makes sense to set the
>>> affinity to exclude isolated CPUs but why do we care about hotplug
>>> events at all. Let's say we offline all cpus from a given node (or
>>> that all but isolated cpus are offline - is this even a
>>> realistic/reasonable usecase?). Wouldn't the scheduler ignore the kthread's
>>> affinity in such a case? In other words how is that different from
>>> tasksetting a userspace task to a cpu that goes offline? We still do
>>> allow such a task to run, right? We just do not care about affinity
>>> anymore.
>>
>> AFAIU it better handles the situation where all housekeeping cpus from
>> the preferred node go down: then it affines to housekeeping cpus from any
>> node, versus any cpu including isolated ones.
> 
> Doesn't that happen automagically? Or can it end up on a random
> isolated cpu?

Good question, perhaps it can and there's no automagic, as I see code like:

+		/* Make sure the kthread never gets re-affined globally */
+		set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KTHREAD));
 


* Re: [PATCH 12/19] kthread: Default affine kthread to its preferred NUMA node
  2024-09-17  6:26   ` Michal Hocko
  2024-09-17  7:01     ` Vlastimil Babka
@ 2024-09-17 10:34     ` Frederic Weisbecker
  2024-09-17 11:07       ` Michal Hocko
  1 sibling, 1 reply; 22+ messages in thread
From: Frederic Weisbecker @ 2024-09-17 10:34 UTC
  To: Michal Hocko
  Cc: LKML, Andrew Morton, Kees Cook, Peter Zijlstra, Thomas Gleixner,
	Vlastimil Babka, linux-mm, Paul E. McKenney, Neeraj Upadhyay,
	Joel Fernandes, Boqun Feng, Zqiang, rcu

On Tue, Sep 17, 2024 at 08:26:49AM +0200, Michal Hocko wrote:
> On Tue 17-09-24 00:49:16, Frederic Weisbecker wrote:
> > Kthreads attached to a preferred NUMA node for their task structure
> > allocation can also be assumed to run preferably within that same node.
> > 
> > A more precise affinity is usually specified by calling
> > kthread_create_on_cpu() or kthread_bind[_mask]() before the first wakeup.
> > 
> > For the others, a default affinity to the node is desired and sometimes
> > implemented with more or less success when it comes to dealing with hotplug
> > events and nohz_full / CPU Isolation interactions:
> > 
> > - kcompactd is affine to its node and handles hotplug but not CPU Isolation
> > - kswapd is affine to its node and ignores hotplug and CPU Isolation
> > - A bunch of drivers create their kthreads on a specific node and
> >   don't bother affining them any further.
> > 
> > Handle that default node affinity preference at the generic level
> > instead, provided a kthread is created on an actual node and doesn't
> > apply any specific affinity such as a given CPU or a custom cpumask to
> > bind to before its first wake-up.
> 
> Makes sense.
> 
> > This generic handling is aware of CPU hotplug events and CPU isolation
> > such that:
> > 
> > * When a housekeeping CPU goes up and is part of the node of a given
> >   kthread, it is added to its applied affinity set (and
> >   possibly the default last resort online housekeeping set is removed
> >   from the set).
> > 
> > * When a housekeeping CPU goes down while it was part of the node of a
> >   kthread, it is removed from the kthread's applied
> >   affinity. The last resort is to affine the kthread to all online
> >   housekeeping CPUs.
> 
> But I am not really sure about this part. Sure it makes sense to set the
> affinity to exclude isolated CPUs but why do we care about hotplug
> events at all. Let's say we offline all cpus from a given node (or
> that all but isolated cpus are offline - is this even a
> realistic/reasonable usecase?). Wouldn't the scheduler ignore the kthread's
> affinity in such a case? In other words how is that different from
> tasksetting a userspace task to a cpu that goes offline? We still do
> allow such a task to run, right? We just do not care about affinity
> anymore.

Suppose we have this artificial online set:

NODE 0 -> CPU 0
NODE 1 -> CPU 1
NODE 2 -> CPU 2

And we have nohz_full=1,2

So there is kswapd/2 that is affine to NODE 2 and thus CPU 2 for now.

Now CPU 2 goes offline. The scheduler migrates off all
tasks. select_fallback_rq() for kswapd/2 doesn't find a suitable CPU
to run on, so it affines kswapd/2 to all remaining online CPUs (CPU 0, CPU 1)
(see the "No more Mr. Nice Guy" comment).

But CPU 1 is nohz_full, so kswapd/2 could run on that isolated CPU. Unless we
handle things beforehand, like this patchset does.

And note that adding isolcpus=domain,1,2 or setting 1,2 as an isolated
cpuset partition (like most isolated workloads should do) is not helping
here. And I'm not sure this last resort scheduler code is the right place
to handle isolated cpumasks.

So it looks necessary, unless I am missing something else?

And that is just the re-affining on CPU down. CPU up needs the mirroring
treatment and must also handle new CPUs freshly added to a node.

Thanks.

> -- 
> Michal Hocko
> SUSE Labs
> 


* Re: [PATCH 12/19] kthread: Default affine kthread to its preferred NUMA node
  2024-09-17 10:34     ` Frederic Weisbecker
@ 2024-09-17 11:07       ` Michal Hocko
  2024-09-18  9:37         ` Frederic Weisbecker
  0 siblings, 1 reply; 22+ messages in thread
From: Michal Hocko @ 2024-09-17 11:07 UTC
  To: Frederic Weisbecker
  Cc: LKML, Andrew Morton, Kees Cook, Peter Zijlstra, Thomas Gleixner,
	Vlastimil Babka, linux-mm, Paul E. McKenney, Neeraj Upadhyay,
	Joel Fernandes, Boqun Feng, Zqiang, rcu

On Tue 17-09-24 12:34:51, Frederic Weisbecker wrote:
> On Tue, Sep 17, 2024 at 08:26:49AM +0200, Michal Hocko wrote:
> > On Tue 17-09-24 00:49:16, Frederic Weisbecker wrote:
> > > Kthreads attached to a preferred NUMA node for their task structure
> > > allocation can also be assumed to run preferably within that same node.
> > > 
> > > A more precise affinity is usually specified by calling
> > > kthread_create_on_cpu() or kthread_bind[_mask]() before the first wakeup.
> > > 
> > > For the others, a default affinity to the node is desired and sometimes
> > > implemented with more or less success when it comes to dealing with hotplug
> > > events and nohz_full / CPU Isolation interactions:
> > > 
> > > - kcompactd is affine to its node and handles hotplug but not CPU Isolation
> > > - kswapd is affine to its node and ignores hotplug and CPU Isolation
> > > - A bunch of drivers create their kthreads on a specific node and
> > >   don't bother affining them any further.
> > > 
> > > Handle that default node affinity preference at the generic level
> > > instead, provided a kthread is created on an actual node and doesn't
> > > apply any specific affinity such as a given CPU or a custom cpumask to
> > > bind to before its first wake-up.
> > 
> > Makes sense.
> > 
> > > This generic handling is aware of CPU hotplug events and CPU isolation
> > > such that:
> > > 
> > > * When a housekeeping CPU goes up and is part of the node of a given
> > >   kthread, it is added to its applied affinity set (and
> > >   possibly the default last resort online housekeeping set is removed
> > >   from the set).
> > > 
> > > * When a housekeeping CPU goes down while it was part of the node of a
> > >   kthread, it is removed from the kthread's applied
> > >   affinity. The last resort is to affine the kthread to all online
> > >   housekeeping CPUs.
> > 
> > But I am not really sure about this part. Sure it makes sense to set the
> > affinity to exclude isolated CPUs but why do we care about hotplug
> > events at all. Let's say we offline all cpus from a given node (or
> > that all but isolated cpus are offline - is this even a
> > realistic/reasonable usecase?). Wouldn't the scheduler ignore the kthread's
> > affinity in such a case? In other words how is that different from
> > tasksetting a userspace task to a cpu that goes offline? We still do
> > allow such a task to run, right? We just do not care about affinity
> > anymore.
> 
> Suppose we have this artificial online set:
> 
> NODE 0 -> CPU 0
> NODE 1 -> CPU 1
> NODE 2 -> CPU 2
> 
> And we have nohz_full=1,2
> 
> So there is kswapd/2 that is affine to NODE 2 and thus CPU 2 for now.
> 
> Now CPU 2 goes offline. The scheduler migrates off all
> tasks. select_fallback_rq() for kswapd/2 doesn't find a suitable CPU
> to run on, so it affines kswapd/2 to all remaining online CPUs (CPU 0, CPU 1)
> (see the "No more Mr. Nice Guy" comment).
> 
> But CPU 1 is nohz_full, so kswapd/2 could run on that isolated CPU. Unless we
> handle things beforehand, like this patchset does.

But that is just as broken as before, no? CPU2 is isolated as well, so it
doesn't really make much of a difference.

> And note that adding isolcpus=domain,1,2 or setting 1,2 as an isolated
> cpuset partition (like most isolated workloads should do) is not helping
> here. And I'm not sure this last resort scheduler code is the right place
> to handle isolated cpumasks.

Well, we would have the same situation with userspace tasks, no? Say I
have taskset -p 2 (because I want binding to node2) and that CPU2 goes
offline. The task needs to be moved somewhere. And it would be the last
resort logic to do that unless I am missing anything. Why should kernel
threads be any different?

> So it looks necessary, unless I am missing something else?

I am not objecting to the patch per se. I am just not sure this is really
needed. It is great to have kernel threads bound to non isolated cpus by
default if they have node preferences. But as soon as somebody starts
offlining cpus excessively and makes the initial cpumask empty then
select_fallback_rq sounds like the right thing to do.

Not my call though. I was just curious why this is needed and it seems
to me you are looking for some sort of correctness for broken setups.
-- 
Michal Hocko
SUSE Labs


* Re: [PATCH 12/19] kthread: Default affine kthread to its preferred NUMA node
  2024-09-17 11:07       ` Michal Hocko
@ 2024-09-18  9:37         ` Frederic Weisbecker
  2024-09-18 11:17           ` Michal Hocko
  0 siblings, 1 reply; 22+ messages in thread
From: Frederic Weisbecker @ 2024-09-18  9:37 UTC
  To: Michal Hocko
  Cc: LKML, Andrew Morton, Kees Cook, Peter Zijlstra, Thomas Gleixner,
	Vlastimil Babka, linux-mm, Paul E. McKenney, Neeraj Upadhyay,
	Joel Fernandes, Boqun Feng, Zqiang, rcu

On Tue, Sep 17, 2024 at 01:07:25PM +0200, Michal Hocko wrote:
> On Tue 17-09-24 12:34:51, Frederic Weisbecker wrote:
> > On Tue, Sep 17, 2024 at 08:26:49AM +0200, Michal Hocko wrote:
> > > On Tue 17-09-24 00:49:16, Frederic Weisbecker wrote:
> > > > Kthreads attached to a preferred NUMA node for their task structure
> > > > allocation can also be assumed to run preferably within that same node.
> > > > 
> > > > A more precise affinity is usually specified by calling
> > > > kthread_create_on_cpu() or kthread_bind[_mask]() before the first wakeup.
> > > > 
> > > > For the others, a default affinity to the node is desired and sometimes
> > > > implemented with more or less success when it comes to dealing with hotplug
> > > > events and nohz_full / CPU Isolation interactions:
> > > > 
> > > > - kcompactd is affine to its node and handles hotplug but not CPU Isolation
> > > > - kswapd is affine to its node and ignores hotplug and CPU Isolation
> > > > - A bunch of drivers create their kthreads on a specific node and
> > > >   don't bother affining them any further.
> > > > 
> > > > Handle that default node affinity preference at the generic level
> > > > instead, provided a kthread is created on an actual node and doesn't
> > > > apply any specific affinity such as a given CPU or a custom cpumask to
> > > > bind to before its first wake-up.
> > > 
> > > Makes sense.
> > > 
> > > > This generic handling is aware of CPU hotplug events and CPU isolation
> > > > such that:
> > > > 
> > > > * When a housekeeping CPU goes up and is part of the node of a given
> > > >   kthread, it is added to its applied affinity set (and
> > > >   possibly the default last resort online housekeeping set is removed
> > > >   from the set).
> > > > 
> > > > * When a housekeeping CPU goes down while it was part of the node of a
> > > >   kthread, it is removed from the kthread's applied
> > > >   affinity. The last resort is to affine the kthread to all online
> > > >   housekeeping CPUs.
> > > 
> > > But I am not really sure about this part. Sure it makes sense to set the
> > > affinity to exclude isolated CPUs but why do we care about hotplug
> > > events at all. Let's say we offline all cpus from a given node (or
> > > that all but isolated cpus are offline - is this even a
> > > realistic/reasonable usecase?). Wouldn't the scheduler ignore the kthread's
> > > affinity in such a case? In other words how is that different from
> > > tasksetting a userspace task to a cpu that goes offline? We still do
> > > allow such a task to run, right? We just do not care about affinity
> > > anymore.
> > 
> > Suppose we have this artificial online set:
> > 
> > NODE 0 -> CPU 0
> > NODE 1 -> CPU 1
> > NODE 2 -> CPU 2
> > 
> > And we have nohz_full=1,2
> > 
> > So there is kswapd/2 that is affine to NODE 2 and thus CPU 2 for now.
> > 
> > Now CPU 2 goes offline. The scheduler migrates off all
> > tasks. select_fallback_rq() for kswapd/2 doesn't find a suitable CPU
> > to run on, so it affines kswapd/2 to all remaining online CPUs (CPU 0, CPU 1)
> > (see the "No more Mr. Nice Guy" comment).
> > 
> > But CPU 1 is nohz_full, so kswapd/2 could run on that isolated CPU. Unless we
> > handle things beforehand, like this patchset does.
> 
> But that is just as broken as before, no? CPU2 is isolated as well, so it
> doesn't really make much of a difference.

Right. I should correct my example with nohz_full=1 only.

> 
> > And note that adding isolcpus=domain,1,2 or setting 1,2 as an isolated
> > cpuset partition (like most isolated workloads should do) is not helping
> > here. And I'm not sure this last resort scheduler code is the right place
> > to handle isolated cpumasks.
> 
> Well, we would have the same situation with userspace tasks, no? Say I
> have taskset -p 2 (because I want binding to node2) and that CPU2 goes
> offline. The task needs to be moved somewhere. And it would be the last
> resort logic to do that unless I am missing anything. Why should kernel
> threads be any different?

Good point.

> 
> > So it looks necessary, unless I am missing something else?
> 
> > I am not objecting to the patch per se. I am just not sure this is really
> > needed. It is great to have kernel threads bound to non isolated cpus by
> > default if they have node preferences. But as soon as somebody starts
> > offlining cpus excessively and makes the initial cpumask empty then
> select_fallback_rq sounds like the right thing to do.
> 
> Not my call though. I was just curious why this is needed and it seems
> to me you are looking for some sort of correctness for broken setups.

It looks like it makes sense to explore that path. We still need the
cpu up probe to reaffine when a suitable target comes up. But it seems
the CPU down part can be handled by select_fallback_rq. I'll try that.

Thanks.

> -- 
> Michal Hocko
> SUSE Labs
> 


* Re: [PATCH 12/19] kthread: Default affine kthread to its preferred NUMA node
  2024-09-18  9:37         ` Frederic Weisbecker
@ 2024-09-18 11:17           ` Michal Hocko
  0 siblings, 0 replies; 22+ messages in thread
From: Michal Hocko @ 2024-09-18 11:17 UTC
  To: Frederic Weisbecker
  Cc: LKML, Andrew Morton, Kees Cook, Peter Zijlstra, Thomas Gleixner,
	Vlastimil Babka, linux-mm, Paul E. McKenney, Neeraj Upadhyay,
	Joel Fernandes, Boqun Feng, Zqiang, rcu

On Wed 18-09-24 11:37:42, Frederic Weisbecker wrote:
> On Tue, Sep 17, 2024 at 01:07:25PM +0200, Michal Hocko wrote:
[...]
> > I am not objecting to patch per se. I am just not sure this is really
> > needed. It is great to have kernel threads bound to non isolated cpus by
> > default if they have node preferences. But as soon as somebody starts
> > offlining cpus excessively and make the initial cpumask empty then
> > select_fallback_rq sounds like the right thing to do.
> > 
> > Not my call though. I was just curious why this is needed and it seems
> > to me you are looking for some sort of correctness for broken setups.
> 
> It looks like it makes sense to explore that path. We still need the
> cpu up probe to reaffine when a suitable target comes up. But it seems
> the CPU down part can be handled by select_fallback_rq. I'll try that.

Thanks! Btw, when you are looking at this, would it make sense to make
select_fallback_rq more cpu isolation aware as well? I mean using
housekeeping cpus before falling back to task_cpu_possible_mask?
-- 
Michal Hocko
SUSE Labs


* [PATCH 10/19] sched,arm64: Handle CPU isolation on last resort fallback rq selection
       [not found] <20241211154035.75565-1-frederic@kernel.org>
@ 2024-12-11 15:40 ` Frederic Weisbecker
  2025-01-03 15:27   ` Will Deacon
  2024-12-11 15:40 ` [PATCH 11/19] kthread: Make sure kthread hasn't started while binding it Frederic Weisbecker
                   ` (6 subsequent siblings)
  7 siblings, 1 reply; 22+ messages in thread
From: Frederic Weisbecker @ 2024-12-11 15:40 UTC
  To: LKML
  Cc: Frederic Weisbecker, Catalin Marinas, Will Deacon, Marc Zyngier,
	Oliver Upton, Ard Biesheuvel, Mark Rutland, Peter Zijlstra,
	Vincent Guittot, Thomas Gleixner, Vlastimil Babka,
	Paul E. McKenney, Neeraj Upadhyay, Joel Fernandes, Boqun Feng,
	Zqiang, Uladzislau Rezki, rcu, Michal Hocko, Andrew Morton

When a kthread or any other task has an affinity mask that is fully
offline or unallowed, the scheduler reaffines the task to all possible
CPUs as a last resort.

This default decision doesn't mix very well with nohz_full CPUs that
are part of the possible cpumask but don't want to be disturbed by
unbound kthreads or even detached pinned user tasks.

Make the fallback affinity setting aware of nohz_full.

Suggested-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 arch/arm64/include/asm/cpufeature.h  |  1 +
 arch/arm64/include/asm/mmu_context.h |  2 ++
 arch/arm64/kernel/cpufeature.c       | 11 +++++++++++
 include/linux/mmu_context.h          |  1 +
 kernel/sched/core.c                  |  2 +-
 5 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 8b4e5a3cd24c..cac5efc836c0 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -671,6 +671,7 @@ static inline bool supports_clearbhb(int scope)
 }
 
 const struct cpumask *system_32bit_el0_cpumask(void);
+const struct cpumask *fallback_32bit_el0_cpumask(void);
 DECLARE_STATIC_KEY_FALSE(arm64_mismatched_32bit_el0);
 
 static inline bool system_supports_32bit_el0(void)
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index 48b3d9553b67..7883abd6b29a 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -283,6 +283,8 @@ task_cpu_possible_mask(struct task_struct *p)
 }
 #define task_cpu_possible_mask	task_cpu_possible_mask
 
+const struct cpumask *task_cpu_fallback_mask(struct task_struct *p);
+
 void verify_cpu_asid_bits(void);
 void post_ttbr_update_workaround(void);
 
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 7ce1b8ab417f..2b7aa32bf436 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1642,6 +1642,17 @@ const struct cpumask *system_32bit_el0_cpumask(void)
 	return cpu_possible_mask;
 }
 
+const struct cpumask *task_cpu_fallback_mask(struct task_struct *p)
+{
+	if (!static_branch_unlikely(&arm64_mismatched_32bit_el0))
+		return housekeeping_cpumask(HK_TYPE_TICK);
+
+	if (!is_compat_thread(task_thread_info(p)))
+		return housekeeping_cpumask(HK_TYPE_TICK);
+
+	return system_32bit_el0_cpumask();
+}
+
 static int __init parse_32bit_el0_param(char *str)
 {
 	allow_mismatched_32bit_el0 = true;
diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h
index bbaec80c78c5..ac01dc4eb2ce 100644
--- a/include/linux/mmu_context.h
+++ b/include/linux/mmu_context.h
@@ -24,6 +24,7 @@ static inline void leave_mm(void) { }
 #ifndef task_cpu_possible_mask
 # define task_cpu_possible_mask(p)	cpu_possible_mask
 # define task_cpu_possible(cpu, p)	true
+# define task_cpu_fallback_mask(p)	housekeeping_cpumask(HK_TYPE_TICK)
 #else
 # define task_cpu_possible(cpu, p)	cpumask_test_cpu((cpu), task_cpu_possible_mask(p))
 #endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 95e40895a519..233b50b0e123 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3534,7 +3534,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 			 *
 			 * More yuck to audit.
 			 */
-			do_set_cpus_allowed(p, task_cpu_possible_mask(p));
+			do_set_cpus_allowed(p, task_cpu_fallback_mask(p));
 			state = fail;
 			break;
 		case fail:
-- 
2.46.0



* [PATCH 11/19] kthread: Make sure kthread hasn't started while binding it
       [not found] <20241211154035.75565-1-frederic@kernel.org>
  2024-12-11 15:40 ` [PATCH 10/19] sched,arm64: Handle CPU isolation on last resort fallback rq selection Frederic Weisbecker
@ 2024-12-11 15:40 ` Frederic Weisbecker
  2024-12-11 15:40 ` [PATCH 12/19] kthread: Default affine kthread to its preferred NUMA node Frederic Weisbecker
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 22+ messages in thread
From: Frederic Weisbecker @ 2024-12-11 15:40 UTC
  To: LKML
  Cc: Frederic Weisbecker, Andrew Morton, Kees Cook, Peter Zijlstra,
	Thomas Gleixner, Michal Hocko, Vlastimil Babka, linux-mm,
	Paul E. McKenney, Neeraj Upadhyay, Joel Fernandes, Boqun Feng,
	Zqiang, rcu, Uladzislau Rezki

Make sure the kthread is sleeping in the schedule_preempt_disabled()
call before calling its handler when kthread_bind[_mask]() is called
on it. This provides a sanity check verifying that the task is not
randomly blocked later at some point within its function handler, in
which case it could just be concurrently awakened, leaving the call to
do_set_cpus_allowed() without any effect until the next voluntary sleep.

Rely on the wake-up ordering to ensure that the newly introduced "started"
field returns the expected value:

    TASK A                                   TASK B
    ------                                   ------
READ kthread->started
wake_up_process(B)
   rq_lock()
   ...
   rq_unlock() // RELEASE
                                           schedule()
                                              rq_lock() // ACQUIRE
                                              // schedule task B
                                              rq_unlock()
                                              WRITE kthread->started

Similarly, writing kthread->started before subsequent voluntary sleeps
will be visible after calling wait_task_inactive() in
__kthread_bind_mask(), reporting potential misuse of the API.

Upcoming patches will make further use of this facility.
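
As an illustration of the misuse this now reports (hypothetical driver
code, not from the patch; only the kthread_*() calls are existing API):

	static int my_thread_fn(void *data)
	{
		while (!kthread_should_stop())
			schedule_timeout_interruptible(HZ);
		return 0;
	}

	static void my_buggy_setup(void)
	{
		/* kthread_run() creates the kthread and issues its first wakeup. */
		struct task_struct *t = kthread_run(my_thread_fn, NULL, "my_kd");

		if (IS_ERR(t))
			return;
		/*
		 * my_thread_fn() has started and may currently be blocked
		 * in a voluntary sleep inside its loop, so the
		 * wait_task_inactive() call in __kthread_bind() can still
		 * succeed by accident...
		 */
		kthread_bind(t, 3);	/* ...but started == 1: the WARN fires */
	}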

Acked-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/kthread.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/kernel/kthread.c b/kernel/kthread.c
index a5ac612b1609..b6f9ce475a4f 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -53,6 +53,7 @@ struct kthread_create_info
 struct kthread {
 	unsigned long flags;
 	unsigned int cpu;
+	int started;
 	int result;
 	int (*threadfn)(void *);
 	void *data;
@@ -382,6 +383,8 @@ static int kthread(void *_create)
 	schedule_preempt_disabled();
 	preempt_enable();
 
+	self->started = 1;
+
 	ret = -EINTR;
 	if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) {
 		cgroup_kthread_ready();
@@ -540,7 +543,9 @@ static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int
 
 void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask)
 {
+	struct kthread *kthread = to_kthread(p);
 	__kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE);
+	WARN_ON_ONCE(kthread->started);
 }
 
 /**
@@ -554,7 +559,9 @@ void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask)
  */
 void kthread_bind(struct task_struct *p, unsigned int cpu)
 {
+	struct kthread *kthread = to_kthread(p);
 	__kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE);
+	WARN_ON_ONCE(kthread->started);
 }
 EXPORT_SYMBOL(kthread_bind);
 
-- 
2.46.0



* [PATCH 12/19] kthread: Default affine kthread to its preferred NUMA node
       [not found] <20241211154035.75565-1-frederic@kernel.org>
  2024-12-11 15:40 ` [PATCH 10/19] sched,arm64: Handle CPU isolation on last resort fallback rq selection Frederic Weisbecker
  2024-12-11 15:40 ` [PATCH 11/19] kthread: Make sure kthread hasn't started while binding it Frederic Weisbecker
@ 2024-12-11 15:40 ` Frederic Weisbecker
  2024-12-11 15:40 ` [PATCH 15/19] kthread: Implement preferred affinity Frederic Weisbecker
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 22+ messages in thread
From: Frederic Weisbecker @ 2024-12-11 15:40 UTC
  To: LKML
  Cc: Frederic Weisbecker, Andrew Morton, Kees Cook, Peter Zijlstra,
	Thomas Gleixner, Michal Hocko, Vlastimil Babka, linux-mm,
	Paul E. McKenney, Neeraj Upadhyay, Joel Fernandes, Boqun Feng,
	Uladzislau Rezki, Zqiang, rcu

Kthreads attached to a preferred NUMA node for their task structure
allocation can also be assumed to run preferably within that same node.

A more precise affinity is usually specified by calling
kthread_create_on_cpu() or kthread_bind[_mask]() before the first wakeup.

For the others, a default affinity to the node is desired and sometimes
implemented with more or less success when it comes to dealing with hotplug
events and nohz_full / CPU Isolation interactions:

- kcompactd is affine to its node and handles hotplug but not CPU Isolation
- kswapd is affine to its node and ignores hotplug and CPU Isolation
- A bunch of drivers create their kthreads on a specific node and
  don't bother affining them any further.

Handle that default node affinity preference at the generic level
instead, provided a kthread is created on an actual node and doesn't
apply any specific affinity such as a given CPU or a custom cpumask to
bind to before its first wake-up.

This generic handling is aware of CPU hotplug events and CPU isolation
such that:

* When a housekeeping CPU goes up that is part of the node of a given
  kthread, the related task is re-affined to its own node if it was
  previously running on the default last resort online housekeeping set
  from other nodes.

* When a housekeeping CPU goes down while it was part of the node of a
  kthread, the running task is migrated (or the sleeping task is woken
  up) automatically by the scheduler to other housekeepers within the
  same node or, as a last resort, to all housekeepers from other nodes.
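
For illustration only (not part of the patch; my_thread_fn, my_data and
the node number are hypothetical), a driver kthread created on a node
now gets the default node affinity without any driver-side handling:

	struct task_struct *t;

	/* Task structure allocated on node 1 */
	t = kthread_create_on_node(my_thread_fn, my_data, 1, "my_kthread");
	if (!IS_ERR(t)) {
		/*
		 * No kthread_bind[_mask]() issued: on its first wake-up
		 * the kthread is affined by default to the housekeeping
		 * CPUs of node 1, and re-affined on hotplug events.
		 */
		wake_up_process(t);
	}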

Acked-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/cpuhotplug.h |   1 +
 kernel/kthread.c           | 106 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 106 insertions(+), 1 deletion(-)

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index a04b73c40173..6cc5e484547c 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -240,6 +240,7 @@ enum cpuhp_state {
 	CPUHP_AP_WORKQUEUE_ONLINE,
 	CPUHP_AP_RANDOM_ONLINE,
 	CPUHP_AP_RCUTREE_ONLINE,
+	CPUHP_AP_KTHREADS_ONLINE,
 	CPUHP_AP_BASE_CACHEINFO_ONLINE,
 	CPUHP_AP_ONLINE_DYN,
 	CPUHP_AP_ONLINE_DYN_END		= CPUHP_AP_ONLINE_DYN + 40,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index b6f9ce475a4f..3394ff024a5a 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -35,6 +35,9 @@ static DEFINE_SPINLOCK(kthread_create_lock);
 static LIST_HEAD(kthread_create_list);
 struct task_struct *kthreadd_task;
 
+static LIST_HEAD(kthreads_hotplug);
+static DEFINE_MUTEX(kthreads_hotplug_lock);
+
 struct kthread_create_info
 {
 	/* Information passed to kthread() from kthreadd. */
@@ -53,6 +56,7 @@ struct kthread_create_info
 struct kthread {
 	unsigned long flags;
 	unsigned int cpu;
+	unsigned int node;
 	int started;
 	int result;
 	int (*threadfn)(void *);
@@ -64,6 +68,8 @@ struct kthread {
 #endif
 	/* To store the full name if task comm is truncated. */
 	char *full_name;
+	struct task_struct *task;
+	struct list_head hotplug_node;
 };
 
 enum KTHREAD_BITS {
@@ -122,8 +128,11 @@ bool set_kthread_struct(struct task_struct *p)
 
 	init_completion(&kthread->exited);
 	init_completion(&kthread->parked);
+	INIT_LIST_HEAD(&kthread->hotplug_node);
 	p->vfork_done = &kthread->exited;
 
+	kthread->task = p;
+	kthread->node = tsk_fork_get_node(current);
 	p->worker_private = kthread;
 	return true;
 }
@@ -314,6 +323,11 @@ void __noreturn kthread_exit(long result)
 {
 	struct kthread *kthread = to_kthread(current);
 	kthread->result = result;
+	if (!list_empty(&kthread->hotplug_node)) {
+		mutex_lock(&kthreads_hotplug_lock);
+		list_del(&kthread->hotplug_node);
+		mutex_unlock(&kthreads_hotplug_lock);
+	}
 	do_exit(0);
 }
 EXPORT_SYMBOL(kthread_exit);
@@ -339,6 +353,48 @@ void __noreturn kthread_complete_and_exit(struct completion *comp, long code)
 }
 EXPORT_SYMBOL(kthread_complete_and_exit);
 
+static void kthread_fetch_affinity(struct kthread *kthread, struct cpumask *cpumask)
+{
+	cpumask_and(cpumask, cpumask_of_node(kthread->node),
+		    housekeeping_cpumask(HK_TYPE_KTHREAD));
+
+	if (cpumask_empty(cpumask))
+		cpumask_copy(cpumask, housekeeping_cpumask(HK_TYPE_KTHREAD));
+}
+
+static void kthread_affine_node(void)
+{
+	struct kthread *kthread = to_kthread(current);
+	cpumask_var_t affinity;
+
+	WARN_ON_ONCE(kthread_is_per_cpu(current));
+
+	if (kthread->node == NUMA_NO_NODE) {
+		housekeeping_affine(current, HK_TYPE_KTHREAD);
+	} else {
+		if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) {
+			WARN_ON_ONCE(1);
+			return;
+		}
+
+		mutex_lock(&kthreads_hotplug_lock);
+		WARN_ON_ONCE(!list_empty(&kthread->hotplug_node));
+		list_add_tail(&kthread->hotplug_node, &kthreads_hotplug);
+		/*
+		 * The node cpumask is racy when read from kthread() but:
+		 * - a racing CPU going down will either fail on the subsequent
+		 *   call to set_cpus_allowed_ptr() or be migrated to housekeepers
+		 *   afterwards by the scheduler.
+		 * - a racing CPU going up will be handled by kthreads_online_cpu()
+		 */
+		kthread_fetch_affinity(kthread, affinity);
+		set_cpus_allowed_ptr(current, affinity);
+		mutex_unlock(&kthreads_hotplug_lock);
+
+		free_cpumask_var(affinity);
+	}
+}
+
 static int kthread(void *_create)
 {
 	static const struct sched_param param = { .sched_priority = 0 };
@@ -369,7 +425,6 @@ static int kthread(void *_create)
 	 * back to default in case they have been changed.
 	 */
 	sched_setscheduler_nocheck(current, SCHED_NORMAL, &param);
-	set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KTHREAD));
 
 	/* OK, tell user we're spawned, wait for stop or wakeup */
 	__set_current_state(TASK_UNINTERRUPTIBLE);
@@ -385,6 +440,9 @@ static int kthread(void *_create)
 
 	self->started = 1;
 
+	if (!(current->flags & PF_NO_SETAFFINITY))
+		kthread_affine_node();
+
 	ret = -EINTR;
 	if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) {
 		cgroup_kthread_ready();
@@ -781,6 +839,52 @@ int kthreadd(void *unused)
 	return 0;
 }
 
+/*
+ * Re-affine kthreads according to their preferences
+ * and the newly online CPU. The CPU down part is handled
+ * by select_fallback_rq() which default re-affines to
+ * housekeepers in case the preferred affinity doesn't
+ * apply anymore.
+ */
+static int kthreads_online_cpu(unsigned int cpu)
+{
+	cpumask_var_t affinity;
+	struct kthread *k;
+	int ret;
+
+	guard(mutex)(&kthreads_hotplug_lock);
+
+	if (list_empty(&kthreads_hotplug))
+		return 0;
+
+	if (!zalloc_cpumask_var(&affinity, GFP_KERNEL))
+		return -ENOMEM;
+
+	ret = 0;
+
+	list_for_each_entry(k, &kthreads_hotplug, hotplug_node) {
+		if (WARN_ON_ONCE((k->task->flags & PF_NO_SETAFFINITY) ||
+				 kthread_is_per_cpu(k->task) ||
+				 k->node == NUMA_NO_NODE)) {
+			ret = -EINVAL;
+			continue;
+		}
+		kthread_fetch_affinity(k, affinity);
+		set_cpus_allowed_ptr(k->task, affinity);
+	}
+
+	free_cpumask_var(affinity);
+
+	return ret;
+}
+
+static int kthreads_init(void)
+{
+	return cpuhp_setup_state(CPUHP_AP_KTHREADS_ONLINE, "kthreads:online",
+				kthreads_online_cpu, NULL);
+}
+early_initcall(kthreads_init);
+
 void __kthread_init_worker(struct kthread_worker *worker,
 				const char *name,
 				struct lock_class_key *key)
-- 
2.46.0


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 15/19] kthread: Implement preferred affinity
       [not found] <20241211154035.75565-1-frederic@kernel.org>
                   ` (2 preceding siblings ...)
  2024-12-11 15:40 ` [PATCH 12/19] kthread: Default affine kthread to its preferred NUMA node Frederic Weisbecker
@ 2024-12-11 15:40 ` Frederic Weisbecker
  2024-12-11 15:40 ` [PATCH 16/19] rcu: Use kthread preferred affinity for RCU boost Frederic Weisbecker
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 22+ messages in thread
From: Frederic Weisbecker @ 2024-12-11 15:40 UTC (permalink / raw)
  To: LKML
  Cc: Frederic Weisbecker, Andrew Morton, Kees Cook, Peter Zijlstra,
	Thomas Gleixner, Michal Hocko, Vlastimil Babka, linux-mm,
	Paul E. McKenney, Neeraj Upadhyay, Joel Fernandes, Boqun Feng,
	Uladzislau Rezki, Zqiang, rcu

Affining kthreads currently follows one of four different patterns:

1) Per-CPU kthreads must stay affine to a single CPU and never execute
   relevant code on any other CPU. This is currently handled by smpboot
   code which takes care of CPU-hotplug operations.

2) Kthreads that _have_ to be affine to a specific set of CPUs and can't
   run anywhere else. The affinity is set through kthread_bind_mask()
   and the subsystem handles CPU-hotplug operations by itself.

3) Kthreads that prefer to be affine to a specific NUMA node. That
   preferred affinity is applied by default when an actual node ID is
   passed on kthread creation, provided the kthread is not per-CPU and
   no call to kthread_bind_mask() has been issued before the first
   wake-up.

4) Similar to the previous point but kthreads have a preferred affinity
   different from a node. It is set manually like for any other task and
   CPU-hotplug is supposed to be handled by the relevant subsystem so
   that the task is properly reaffined whenever a given CPU from the
   preferred affinity comes up. Also care must be taken so that the
   preferred affinity doesn't cross housekeeping cpumask boundaries.

Provide a function to handle the last use case, mostly reusing the
current node default affinity infrastructure. kthread_affine_preferred()
is introduced, to be used just like kthread_bind_mask(), right after
kthread creation and before the first wake-up (see the sketch below).
The kthread is then affined right away to the cpumask passed through the
API if it contains online housekeeping CPUs. Otherwise it is affined to
all online housekeeping CPUs as a last resort.

As with node affinity, it is aware of CPU hotplug events such that:

* When a housekeeping CPU goes up that is part of the preferred affinity
  of a given kthread, the related task is re-affined to that preferred
  affinity if it was previously running on the default last resort
  online housekeeping set.

* When a housekeeping CPU goes down while it was part of the preferred
  affinity of a kthread, the running task is migrated (or the sleeping
  task is woken up) automatically by the scheduler to other housekeepers
  within the preferred affinity or, as a last resort, to all
  housekeepers from other nodes.
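
A minimal usage sketch (illustration only; pref_thread_fn is a
hypothetical callback and node 0 an arbitrary choice):

	struct task_struct *t;

	t = kthread_create(pref_thread_fn, NULL, "pref_kthread");
	if (IS_ERR(t))
		return PTR_ERR(t);

	/* Must be applied after creation and before the first wake-up */
	if (kthread_affine_preferred(t, cpumask_of_node(0)))
		pr_warn("falling back to default kthread affinity\n");
	wake_up_process(t);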

Acked-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/kthread.h |  1 +
 kernel/kthread.c        | 68 ++++++++++++++++++++++++++++++++++++-----
 2 files changed, 62 insertions(+), 7 deletions(-)

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index b11f53c1ba2e..30209bdf83a2 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -85,6 +85,7 @@ kthread_run_on_cpu(int (*threadfn)(void *data), void *data,
 void free_kthread_struct(struct task_struct *k);
 void kthread_bind(struct task_struct *k, unsigned int cpu);
 void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask);
+int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask);
 int kthread_stop(struct task_struct *k);
 int kthread_stop_put(struct task_struct *k);
 bool kthread_should_stop(void);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 3394ff024a5a..6bb958a75a0b 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -70,6 +70,7 @@ struct kthread {
 	char *full_name;
 	struct task_struct *task;
 	struct list_head hotplug_node;
+	struct cpumask *preferred_affinity;
 };
 
 enum KTHREAD_BITS {
@@ -327,6 +328,11 @@ void __noreturn kthread_exit(long result)
 		mutex_lock(&kthreads_hotplug_lock);
 		list_del(&kthread->hotplug_node);
 		mutex_unlock(&kthreads_hotplug_lock);
+
+		if (kthread->preferred_affinity) {
+			kfree(kthread->preferred_affinity);
+			kthread->preferred_affinity = NULL;
+		}
 	}
 	do_exit(0);
 }
@@ -355,9 +361,17 @@ EXPORT_SYMBOL(kthread_complete_and_exit);
 
 static void kthread_fetch_affinity(struct kthread *kthread, struct cpumask *cpumask)
 {
-	cpumask_and(cpumask, cpumask_of_node(kthread->node),
-		    housekeeping_cpumask(HK_TYPE_KTHREAD));
+	const struct cpumask *pref;
 
+	if (kthread->preferred_affinity) {
+		pref = kthread->preferred_affinity;
+	} else {
+		if (WARN_ON_ONCE(kthread->node == NUMA_NO_NODE))
+			return;
+		pref = cpumask_of_node(kthread->node);
+	}
+
+	cpumask_and(cpumask, pref, housekeeping_cpumask(HK_TYPE_KTHREAD));
 	if (cpumask_empty(cpumask))
 		cpumask_copy(cpumask, housekeeping_cpumask(HK_TYPE_KTHREAD));
 }
@@ -440,7 +454,7 @@ static int kthread(void *_create)
 
 	self->started = 1;
 
-	if (!(current->flags & PF_NO_SETAFFINITY))
+	if (!(current->flags & PF_NO_SETAFFINITY) && !self->preferred_affinity)
 		kthread_affine_node();
 
 	ret = -EINTR;
@@ -839,12 +853,53 @@ int kthreadd(void *unused)
 	return 0;
 }
 
+int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
+{
+	struct kthread *kthread = to_kthread(p);
+	cpumask_var_t affinity;
+	unsigned long flags;
+	int ret = 0;
+
+	if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) {
+		WARN_ON(1);
+		return -EINVAL;
+	}
+
+	WARN_ON_ONCE(kthread->preferred_affinity);
+
+	if (!zalloc_cpumask_var(&affinity, GFP_KERNEL))
+		return -ENOMEM;
+
+	kthread->preferred_affinity = kzalloc(sizeof(struct cpumask), GFP_KERNEL);
+	if (!kthread->preferred_affinity) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	mutex_lock(&kthreads_hotplug_lock);
+	cpumask_copy(kthread->preferred_affinity, mask);
+	WARN_ON_ONCE(!list_empty(&kthread->hotplug_node));
+	list_add_tail(&kthread->hotplug_node, &kthreads_hotplug);
+	kthread_fetch_affinity(kthread, affinity);
+
+	/* It's safe because the task is inactive. */
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	do_set_cpus_allowed(p, affinity);
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+	mutex_unlock(&kthreads_hotplug_lock);
+out:
+	free_cpumask_var(affinity);
+
+	return ret;
+}
+
 /*
  * Re-affine kthreads according to their preferences
  * and the newly online CPU. The CPU down part is handled
  * by select_fallback_rq() which default re-affines to
- * housekeepers in case the preferred affinity doesn't
- * apply anymore.
+ * housekeepers from other nodes in case the preferred
+ * affinity doesn't apply anymore.
  */
 static int kthreads_online_cpu(unsigned int cpu)
 {
@@ -864,8 +919,7 @@ static int kthreads_online_cpu(unsigned int cpu)
 
 	list_for_each_entry(k, &kthreads_hotplug, hotplug_node) {
 		if (WARN_ON_ONCE((k->task->flags & PF_NO_SETAFFINITY) ||
-				 kthread_is_per_cpu(k->task) ||
-				 k->node == NUMA_NO_NODE)) {
+				 kthread_is_per_cpu(k->task))) {
 			ret = -EINVAL;
 			continue;
 		}
-- 
2.46.0


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 16/19] rcu: Use kthread preferred affinity for RCU boost
       [not found] <20241211154035.75565-1-frederic@kernel.org>
                   ` (3 preceding siblings ...)
  2024-12-11 15:40 ` [PATCH 15/19] kthread: Implement preferred affinity Frederic Weisbecker
@ 2024-12-11 15:40 ` Frederic Weisbecker
  2024-12-11 15:40 ` [PATCH 17/19] kthread: Unify kthread_create_on_cpu() and kthread_create_worker_on_cpu() automatic format Frederic Weisbecker
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 22+ messages in thread
From: Frederic Weisbecker @ 2024-12-11 15:40 UTC (permalink / raw)
  To: LKML
  Cc: Frederic Weisbecker, Paul E. McKenney, Uladzislau Rezki,
	Neeraj Upadhyay, Joel Fernandes, Boqun Feng, Zqiang, rcu,
	Andrew Morton, Peter Zijlstra, Thomas Gleixner, Michal Hocko,
	Vlastimil Babka

Now that kthreads have an infrastructure to handle their preferred
affinity against CPU hotplug and the housekeeping cpumask, convert RCU
boost to use it instead of handling all the constraints by itself.

Acked-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/rcu/tree.c        | 27 +++++++++++++++++++--------
 kernel/rcu/tree_plugin.h | 11 ++---------
 2 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index ff98233d4aa5..4a4c49821058 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -149,7 +149,6 @@ static int rcu_scheduler_fully_active __read_mostly;
 
 static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
 			      unsigned long gps, unsigned long flags);
-static struct task_struct *rcu_boost_task(struct rcu_node *rnp);
 static void invoke_rcu_core(void);
 static void rcu_report_exp_rdp(struct rcu_data *rdp);
 static void sync_sched_exp_online_cleanup(int cpu);
@@ -5011,6 +5010,22 @@ int rcutree_prepare_cpu(unsigned int cpu)
 	return 0;
 }
 
+static void rcu_thread_affine_rnp(struct task_struct *t, struct rcu_node *rnp)
+{
+	cpumask_var_t affinity;
+	int cpu;
+
+	if (!zalloc_cpumask_var(&affinity, GFP_KERNEL))
+		return;
+
+	for_each_leaf_node_possible_cpu(rnp, cpu)
+		cpumask_set_cpu(cpu, affinity);
+
+	kthread_affine_preferred(t, affinity);
+
+	free_cpumask_var(affinity);
+}
+
 /*
  * Update kthreads affinity during CPU-hotplug changes.
  *
@@ -5030,19 +5045,18 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoingcpu)
 	unsigned long mask;
 	struct rcu_data *rdp;
 	struct rcu_node *rnp;
-	struct task_struct *task_boost, *task_exp;
+	struct task_struct *task_exp;
 
 	rdp = per_cpu_ptr(&rcu_data, cpu);
 	rnp = rdp->mynode;
 
-	task_boost = rcu_boost_task(rnp);
 	task_exp = rcu_exp_par_gp_task(rnp);
 
 	/*
-	 * If CPU is the boot one, those tasks are created later from early
+	 * If CPU is the boot one, this task is created later from early
 	 * initcall since kthreadd must be created first.
 	 */
-	if (!task_boost && !task_exp)
+	if (!task_exp)
 		return;
 
 	if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
@@ -5064,9 +5078,6 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoingcpu)
 	if (task_exp)
 		set_cpus_allowed_ptr(task_exp, cm);
 
-	if (task_boost)
-		set_cpus_allowed_ptr(task_boost, cm);
-
 	mutex_unlock(&rnp->kthread_mutex);
 
 	free_cpumask_var(cm);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 3927ea5f7955..b141b8cc4f41 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1217,16 +1217,13 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
 	raw_spin_lock_irqsave_rcu_node(rnp, flags);
 	rnp->boost_kthread_task = t;
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
 	sp.sched_priority = kthread_prio;
 	sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+	rcu_thread_affine_rnp(t, rnp);
 	wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
 }
 
-static struct task_struct *rcu_boost_task(struct rcu_node *rnp)
-{
-	return READ_ONCE(rnp->boost_kthread_task);
-}
-
 #else /* #ifdef CONFIG_RCU_BOOST */
 
 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
@@ -1243,10 +1240,6 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
 {
 }
 
-static struct task_struct *rcu_boost_task(struct rcu_node *rnp)
-{
-	return NULL;
-}
 #endif /* #else #ifdef CONFIG_RCU_BOOST */
 
 /*
-- 
2.46.0


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 17/19] kthread: Unify kthread_create_on_cpu() and kthread_create_worker_on_cpu() automatic format
       [not found] <20241211154035.75565-1-frederic@kernel.org>
                   ` (4 preceding siblings ...)
  2024-12-11 15:40 ` [PATCH 16/19] rcu: Use kthread preferred affinity for RCU boost Frederic Weisbecker
@ 2024-12-11 15:40 ` Frederic Weisbecker
  2024-12-11 15:40 ` [PATCH 18/19] treewide: Introduce kthread_run_worker[_on_cpu]() Frederic Weisbecker
  2024-12-11 15:40 ` [PATCH 19/19] rcu: Use kthread preferred affinity for RCU exp kworkers Frederic Weisbecker
  7 siblings, 0 replies; 22+ messages in thread
From: Frederic Weisbecker @ 2024-12-11 15:40 UTC (permalink / raw)
  To: LKML
  Cc: Frederic Weisbecker, Paul E. McKenney, Uladzislau Rezki,
	Neeraj Upadhyay, Joel Fernandes, Boqun Feng, Zqiang, rcu,
	Andrew Morton, Peter Zijlstra, Thomas Gleixner, Michal Hocko,
	Vlastimil Babka

kthread_create_on_cpu() uses the CPU argument as an implicit and unique
printf argument appended to the format, whereas
kthread_create_worker_on_cpu() still relies on the caller explicitly
passing the printf arguments. This difference in behaviour is
error-prone and doesn't help standardize per-CPU kthread names.

Unify the behaviours and convert kthread_create_worker_on_cpu() to
use the printf behaviour of kthread_create_on_cpu().
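
For illustration, with a hypothetical "my_worker" name, the calling
convention changes as follows:

	/* Before: the CPU is passed explicitly as a printf argument */
	worker = kthread_create_worker_on_cpu(cpu, 0, "my_worker/%u", cpu);

	/*
	 * After: the CPU number is filled in implicitly, just like
	 * kthread_create_on_cpu() does.
	 */
	worker = kthread_create_worker_on_cpu(cpu, 0, "my_worker/%u");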

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 fs/erofs/zdata.c        |  2 +-
 include/linux/kthread.h | 21 +++++++++++----
 kernel/kthread.c        | 59 ++++++++++++++++++++++++-----------------
 3 files changed, 52 insertions(+), 30 deletions(-)

diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 01f147505487..a23392327ce2 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -320,7 +320,7 @@ static void erofs_destroy_percpu_workers(void)
 static struct kthread_worker *erofs_init_percpu_worker(int cpu)
 {
 	struct kthread_worker *worker =
-		kthread_create_worker_on_cpu(cpu, 0, "erofs_worker/%u", cpu);
+		kthread_create_worker_on_cpu(cpu, 0, "erofs_worker/%u");
 
 	if (IS_ERR(worker))
 		return worker;
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 30209bdf83a2..0c66e7c1092a 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -187,13 +187,24 @@ extern void __kthread_init_worker(struct kthread_worker *worker,
 
 int kthread_worker_fn(void *worker_ptr);
 
-__printf(2, 3)
+__printf(3, 4)
+struct kthread_worker *kthread_create_worker_on_node(unsigned int flags,
+						     int node,
+						     const char namefmt[], ...);
+
+#define kthread_create_worker(flags, namefmt, ...) \
+({									   \
+	struct kthread_worker *__kw					   \
+		= kthread_create_worker_on_node(flags, NUMA_NO_NODE,	   \
+						namefmt, ## __VA_ARGS__);  \
+	if (!IS_ERR(__kw))						   \
+		wake_up_process(__kw->task);				   \
+	__kw;								   \
+})
+
 struct kthread_worker *
-kthread_create_worker(unsigned int flags, const char namefmt[], ...);
-
-__printf(3, 4) struct kthread_worker *
 kthread_create_worker_on_cpu(int cpu, unsigned int flags,
-			     const char namefmt[], ...);
+			     const char namefmt[]);
 
 bool kthread_queue_work(struct kthread_worker *worker,
 			struct kthread_work *work);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 6bb958a75a0b..429ada0dbe93 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1030,12 +1030,11 @@ int kthread_worker_fn(void *worker_ptr)
 EXPORT_SYMBOL_GPL(kthread_worker_fn);
 
 static __printf(3, 0) struct kthread_worker *
-__kthread_create_worker(int cpu, unsigned int flags,
-			const char namefmt[], va_list args)
+__kthread_create_worker_on_node(unsigned int flags, int node,
+				const char namefmt[], va_list args)
 {
 	struct kthread_worker *worker;
 	struct task_struct *task;
-	int node = NUMA_NO_NODE;
 
 	worker = kzalloc(sizeof(*worker), GFP_KERNEL);
 	if (!worker)
@@ -1043,20 +1042,14 @@ __kthread_create_worker(int cpu, unsigned int flags,
 
 	kthread_init_worker(worker);
 
-	if (cpu >= 0)
-		node = cpu_to_node(cpu);
-
 	task = __kthread_create_on_node(kthread_worker_fn, worker,
-						node, namefmt, args);
+					node, namefmt, args);
 	if (IS_ERR(task))
 		goto fail_task;
 
-	if (cpu >= 0)
-		kthread_bind(task, cpu);
-
 	worker->flags = flags;
 	worker->task = task;
-	wake_up_process(task);
+
 	return worker;
 
 fail_task:
@@ -1067,6 +1060,7 @@ __kthread_create_worker(int cpu, unsigned int flags,
 /**
  * kthread_create_worker - create a kthread worker
  * @flags: flags modifying the default behavior of the worker
+ * @node: task structure for the thread is allocated on this node
  * @namefmt: printf-style name for the kthread worker (task).
  *
  * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM)
@@ -1074,25 +1068,49 @@ __kthread_create_worker(int cpu, unsigned int flags,
  * when the caller was killed by a fatal signal.
  */
 struct kthread_worker *
-kthread_create_worker(unsigned int flags, const char namefmt[], ...)
+kthread_create_worker_on_node(unsigned int flags, int node, const char namefmt[], ...)
 {
 	struct kthread_worker *worker;
 	va_list args;
 
 	va_start(args, namefmt);
-	worker = __kthread_create_worker(-1, flags, namefmt, args);
+	worker = __kthread_create_worker_on_node(flags, node, namefmt, args);
 	va_end(args);
 
+	if (worker)
+		wake_up_process(worker->task);
+
+	return worker;
+}
+EXPORT_SYMBOL(kthread_create_worker_on_node);
+
+static __printf(3, 4) struct kthread_worker *
+__kthread_create_worker_on_cpu(int cpu, unsigned int flags,
+			       const char namefmt[], ...)
+{
+	struct kthread_worker *worker;
+	va_list args;
+
+	va_start(args, namefmt);
+	worker = __kthread_create_worker_on_node(flags, cpu_to_node(cpu),
+						 namefmt, args);
+	va_end(args);
+
+	if (worker) {
+		kthread_bind(worker->task, cpu);
+		wake_up_process(worker->task);
+	}
+
 	return worker;
 }
-EXPORT_SYMBOL(kthread_create_worker);
 
 /**
  * kthread_create_worker_on_cpu - create a kthread worker and bind it
  *	to a given CPU and the associated NUMA node.
  * @cpu: CPU number
  * @flags: flags modifying the default behavior of the worker
- * @namefmt: printf-style name for the kthread worker (task).
+ * @namefmt: printf-style name for the thread. Format is restricted
+ *	     to "name.*%u". Code fills in cpu number.
  *
  * Use a valid CPU number if you want to bind the kthread worker
  * to the given CPU and the associated NUMA node.
@@ -1124,16 +1142,9 @@ EXPORT_SYMBOL(kthread_create_worker);
  */
 struct kthread_worker *
 kthread_create_worker_on_cpu(int cpu, unsigned int flags,
-			     const char namefmt[], ...)
+			     const char namefmt[])
 {
-	struct kthread_worker *worker;
-	va_list args;
-
-	va_start(args, namefmt);
-	worker = __kthread_create_worker(cpu, flags, namefmt, args);
-	va_end(args);
-
-	return worker;
+	return __kthread_create_worker_on_cpu(cpu, flags, namefmt, cpu);
 }
 EXPORT_SYMBOL(kthread_create_worker_on_cpu);
 
-- 
2.46.0


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 18/19] treewide: Introduce kthread_run_worker[_on_cpu]()
       [not found] <20241211154035.75565-1-frederic@kernel.org>
                   ` (5 preceding siblings ...)
  2024-12-11 15:40 ` [PATCH 17/19] kthread: Unify kthread_create_on_cpu() and kthread_create_worker_on_cpu() automatic format Frederic Weisbecker
@ 2024-12-11 15:40 ` Frederic Weisbecker
  2024-12-11 15:40 ` [PATCH 19/19] rcu: Use kthread preferred affinity for RCU exp kworkers Frederic Weisbecker
  7 siblings, 0 replies; 22+ messages in thread
From: Frederic Weisbecker @ 2024-12-11 15:40 UTC (permalink / raw)
  To: LKML
  Cc: Frederic Weisbecker, Paul E. McKenney, Uladzislau Rezki,
	Neeraj Upadhyay, Joel Fernandes, Boqun Feng, Zqiang, rcu,
	Andrew Morton, Peter Zijlstra, Thomas Gleixner, Michal Hocko,
	Vlastimil Babka

kthread_create() creates a kthread without running it yet. kthread_run()
creates a kthread and runs it.

On the other hand, kthread_create_worker() creates a kthread worker and
runs it.

This inconsistency in behaviour is confusing. Also, there is no way to
create a kthread worker and affine it using kthread_bind_mask() or
kthread_affine_preferred() before starting it.

Consolidate the behaviours and introduce kthread_run_worker[_on_cpu]()
that behaves just like kthread_run(). kthread_create_worker[_on_cpu]()
will now only create a kthread worker without starting it.
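
For illustration, this is the kind of sequence that becomes possible
(hypothetical names, not part of the patch):

	struct kthread_worker *kw;

	/* Create only: the worker task is not running yet */
	kw = kthread_create_worker(0, "my_worker");
	if (IS_ERR(kw))
		return PTR_ERR(kw);

	/* Affinity can now be applied before the first wake-up */
	kthread_affine_preferred(kw->task, my_preferred_mask);
	wake_up_process(kw->task);

Callers that don't need the two-step setup keep the old one-step
behaviour through kthread_run_worker(0, "my_worker").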

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 arch/x86/kvm/i8254.c                          |  2 +-
 crypto/crypto_engine.c                        |  2 +-
 drivers/cpufreq/cppc_cpufreq.c                |  2 +-
 drivers/gpu/drm/drm_vblank_work.c             |  2 +-
 .../drm/i915/gem/selftests/i915_gem_context.c |  2 +-
 drivers/gpu/drm/i915/gt/selftest_execlists.c  |  2 +-
 drivers/gpu/drm/i915/gt/selftest_hangcheck.c  |  2 +-
 drivers/gpu/drm/i915/gt/selftest_slpc.c       |  2 +-
 drivers/gpu/drm/i915/selftests/i915_request.c |  8 ++--
 drivers/gpu/drm/msm/disp/msm_disp_snapshot.c  |  2 +-
 drivers/gpu/drm/msm/msm_atomic.c              |  2 +-
 drivers/gpu/drm/msm/msm_gpu.c                 |  2 +-
 drivers/gpu/drm/msm/msm_kms.c                 |  2 +-
 .../platform/chips-media/wave5/wave5-vpu.c    |  2 +-
 drivers/net/dsa/mv88e6xxx/chip.c              |  2 +-
 drivers/net/ethernet/intel/ice/ice_dpll.c     |  2 +-
 drivers/net/ethernet/intel/ice/ice_gnss.c     |  2 +-
 drivers/net/ethernet/intel/ice/ice_ptp.c      |  2 +-
 drivers/platform/chrome/cros_ec_spi.c         |  2 +-
 drivers/ptp/ptp_clock.c                       |  2 +-
 drivers/spi/spi.c                             |  2 +-
 drivers/usb/typec/tcpm/tcpm.c                 |  2 +-
 drivers/vdpa/vdpa_sim/vdpa_sim.c              |  2 +-
 drivers/watchdog/watchdog_dev.c               |  2 +-
 fs/erofs/zdata.c                              |  2 +-
 include/linux/kthread.h                       | 48 ++++++++++++++++---
 kernel/kthread.c                              | 31 +++---------
 kernel/rcu/tree.c                             |  4 +-
 kernel/sched/ext.c                            |  2 +-
 kernel/workqueue.c                            |  2 +-
 net/dsa/tag_ksz.c                             |  2 +-
 net/dsa/tag_ocelot_8021q.c                    |  2 +-
 net/dsa/tag_sja1105.c                         |  2 +-
 33 files changed, 83 insertions(+), 66 deletions(-)

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index cd57a517d04a..d7ab8780ab9e 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -681,7 +681,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 	pid_nr = pid_vnr(pid);
 	put_pid(pid);
 
-	pit->worker = kthread_create_worker(0, "kvm-pit/%d", pid_nr);
+	pit->worker = kthread_run_worker(0, "kvm-pit/%d", pid_nr);
 	if (IS_ERR(pit->worker))
 		goto fail_kthread;
 
diff --git a/crypto/crypto_engine.c b/crypto/crypto_engine.c
index e60a0eb628e8..c7c16da5e649 100644
--- a/crypto/crypto_engine.c
+++ b/crypto/crypto_engine.c
@@ -517,7 +517,7 @@ struct crypto_engine *crypto_engine_alloc_init_and_set(struct device *dev,
 	crypto_init_queue(&engine->queue, qlen);
 	spin_lock_init(&engine->queue_lock);
 
-	engine->kworker = kthread_create_worker(0, "%s", engine->name);
+	engine->kworker = kthread_run_worker(0, "%s", engine->name);
 	if (IS_ERR(engine->kworker)) {
 		dev_err(dev, "failed to create crypto request pump task\n");
 		return NULL;
diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
index bd8f75accfa0..2486a6c5256a 100644
--- a/drivers/cpufreq/cppc_cpufreq.c
+++ b/drivers/cpufreq/cppc_cpufreq.c
@@ -225,7 +225,7 @@ static void __init cppc_freq_invariance_init(void)
 	if (fie_disabled)
 		return;
 
-	kworker_fie = kthread_create_worker(0, "cppc_fie");
+	kworker_fie = kthread_run_worker(0, "cppc_fie");
 	if (IS_ERR(kworker_fie)) {
 		pr_warn("%s: failed to create kworker_fie: %ld\n", __func__,
 			PTR_ERR(kworker_fie));
diff --git a/drivers/gpu/drm/drm_vblank_work.c b/drivers/gpu/drm/drm_vblank_work.c
index 1752ffb44e1d..9cc71120246f 100644
--- a/drivers/gpu/drm/drm_vblank_work.c
+++ b/drivers/gpu/drm/drm_vblank_work.c
@@ -277,7 +277,7 @@ int drm_vblank_worker_init(struct drm_vblank_crtc *vblank)
 
 	INIT_LIST_HEAD(&vblank->pending_work);
 	init_waitqueue_head(&vblank->work_wait_queue);
-	worker = kthread_create_worker(0, "card%d-crtc%d",
+	worker = kthread_run_worker(0, "card%d-crtc%d",
 				       vblank->dev->primary->index,
 				       vblank->pipe);
 	if (IS_ERR(worker))
diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
index 89d4dc8b60c6..eb0158e43417 100644
--- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
@@ -369,7 +369,7 @@ static int live_parallel_switch(void *arg)
 		if (!data[n].ce[0])
 			continue;
 
-		worker = kthread_create_worker(0, "igt/parallel:%s",
+		worker = kthread_run_worker(0, "igt/parallel:%s",
 					       data[n].ce[0]->engine->name);
 		if (IS_ERR(worker)) {
 			err = PTR_ERR(worker);
diff --git a/drivers/gpu/drm/i915/gt/selftest_execlists.c b/drivers/gpu/drm/i915/gt/selftest_execlists.c
index 222ca7c44951..81c31396eceb 100644
--- a/drivers/gpu/drm/i915/gt/selftest_execlists.c
+++ b/drivers/gpu/drm/i915/gt/selftest_execlists.c
@@ -3574,7 +3574,7 @@ static int smoke_crescendo(struct preempt_smoke *smoke, unsigned int flags)
 			arg[id].batch = NULL;
 		arg[id].count = 0;
 
-		worker[id] = kthread_create_worker(0, "igt/smoke:%d", id);
+		worker[id] = kthread_run_worker(0, "igt/smoke:%d", id);
 		if (IS_ERR(worker[id])) {
 			err = PTR_ERR(worker[id]);
 			break;
diff --git a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
index 9ce8ff1c04fe..9d3aeb237295 100644
--- a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
+++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
@@ -1025,7 +1025,7 @@ static int __igt_reset_engines(struct intel_gt *gt,
 			threads[tmp].engine = other;
 			threads[tmp].flags = flags;
 
-			worker = kthread_create_worker(0, "igt/%s",
+			worker = kthread_run_worker(0, "igt/%s",
 						       other->name);
 			if (IS_ERR(worker)) {
 				err = PTR_ERR(worker);
diff --git a/drivers/gpu/drm/i915/gt/selftest_slpc.c b/drivers/gpu/drm/i915/gt/selftest_slpc.c
index 4ecc4ae74a54..e218b229681f 100644
--- a/drivers/gpu/drm/i915/gt/selftest_slpc.c
+++ b/drivers/gpu/drm/i915/gt/selftest_slpc.c
@@ -489,7 +489,7 @@ static int live_slpc_tile_interaction(void *arg)
 		return -ENOMEM;
 
 	for_each_gt(gt, i915, i) {
-		threads[i].worker = kthread_create_worker(0, "igt/slpc_parallel:%d", gt->info.id);
+		threads[i].worker = kthread_run_worker(0, "igt/slpc_parallel:%d", gt->info.id);
 
 		if (IS_ERR(threads[i].worker)) {
 			ret = PTR_ERR(threads[i].worker);
diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index acae30a04a94..88870844b5bd 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -492,7 +492,7 @@ static int mock_breadcrumbs_smoketest(void *arg)
 	for (n = 0; n < ncpus; n++) {
 		struct kthread_worker *worker;
 
-		worker = kthread_create_worker(0, "igt/%d", n);
+		worker = kthread_run_worker(0, "igt/%d", n);
 		if (IS_ERR(worker)) {
 			ret = PTR_ERR(worker);
 			ncpus = n;
@@ -1645,7 +1645,7 @@ static int live_parallel_engines(void *arg)
 		for_each_uabi_engine(engine, i915) {
 			struct kthread_worker *worker;
 
-			worker = kthread_create_worker(0, "igt/parallel:%s",
+			worker = kthread_run_worker(0, "igt/parallel:%s",
 						       engine->name);
 			if (IS_ERR(worker)) {
 				err = PTR_ERR(worker);
@@ -1806,7 +1806,7 @@ static int live_breadcrumbs_smoketest(void *arg)
 			unsigned int i = idx * ncpus + n;
 			struct kthread_worker *worker;
 
-			worker = kthread_create_worker(0, "igt/%d.%d", idx, n);
+			worker = kthread_run_worker(0, "igt/%d.%d", idx, n);
 			if (IS_ERR(worker)) {
 				ret = PTR_ERR(worker);
 				goto out_flush;
@@ -3219,7 +3219,7 @@ static int perf_parallel_engines(void *arg)
 
 			memset(&engines[idx].p, 0, sizeof(engines[idx].p));
 
-			worker = kthread_create_worker(0, "igt:%s",
+			worker = kthread_run_worker(0, "igt:%s",
 						       engine->name);
 			if (IS_ERR(worker)) {
 				err = PTR_ERR(worker);
diff --git a/drivers/gpu/drm/msm/disp/msm_disp_snapshot.c b/drivers/gpu/drm/msm/disp/msm_disp_snapshot.c
index e75b97127c0d..2be00b11e557 100644
--- a/drivers/gpu/drm/msm/disp/msm_disp_snapshot.c
+++ b/drivers/gpu/drm/msm/disp/msm_disp_snapshot.c
@@ -109,7 +109,7 @@ int msm_disp_snapshot_init(struct drm_device *drm_dev)
 
 	mutex_init(&kms->dump_mutex);
 
-	kms->dump_worker = kthread_create_worker(0, "%s", "disp_snapshot");
+	kms->dump_worker = kthread_run_worker(0, "%s", "disp_snapshot");
 	if (IS_ERR(kms->dump_worker))
 		DRM_ERROR("failed to create disp state task\n");
 
diff --git a/drivers/gpu/drm/msm/msm_atomic.c b/drivers/gpu/drm/msm/msm_atomic.c
index 9c45d641b521..a7a2384044ff 100644
--- a/drivers/gpu/drm/msm/msm_atomic.c
+++ b/drivers/gpu/drm/msm/msm_atomic.c
@@ -115,7 +115,7 @@ int msm_atomic_init_pending_timer(struct msm_pending_timer *timer,
 	timer->kms = kms;
 	timer->crtc_idx = crtc_idx;
 
-	timer->worker = kthread_create_worker(0, "atomic-worker-%d", crtc_idx);
+	timer->worker = kthread_run_worker(0, "atomic-worker-%d", crtc_idx);
 	if (IS_ERR(timer->worker)) {
 		int ret = PTR_ERR(timer->worker);
 		timer->worker = NULL;
diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
index 0d4a3744cfcb..8557998e0c92 100644
--- a/drivers/gpu/drm/msm/msm_gpu.c
+++ b/drivers/gpu/drm/msm/msm_gpu.c
@@ -859,7 +859,7 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
 	gpu->funcs = funcs;
 	gpu->name = name;
 
-	gpu->worker = kthread_create_worker(0, "gpu-worker");
+	gpu->worker = kthread_run_worker(0, "gpu-worker");
 	if (IS_ERR(gpu->worker)) {
 		ret = PTR_ERR(gpu->worker);
 		gpu->worker = NULL;
diff --git a/drivers/gpu/drm/msm/msm_kms.c b/drivers/gpu/drm/msm/msm_kms.c
index f3326d09bdbc..dac831ba6219 100644
--- a/drivers/gpu/drm/msm/msm_kms.c
+++ b/drivers/gpu/drm/msm/msm_kms.c
@@ -269,7 +269,7 @@ int msm_drm_kms_init(struct device *dev, const struct drm_driver *drv)
 		/* initialize event thread */
 		ev_thread = &priv->event_thread[drm_crtc_index(crtc)];
 		ev_thread->dev = ddev;
-		ev_thread->worker = kthread_create_worker(0, "crtc_event:%d", crtc->base.id);
+		ev_thread->worker = kthread_run_worker(0, "crtc_event:%d", crtc->base.id);
 		if (IS_ERR(ev_thread->worker)) {
 			ret = PTR_ERR(ev_thread->worker);
 			DRM_DEV_ERROR(dev, "failed to create crtc_event kthread\n");
diff --git a/drivers/media/platform/chips-media/wave5/wave5-vpu.c b/drivers/media/platform/chips-media/wave5/wave5-vpu.c
index 6b294a2d6717..d1320298a0f7 100644
--- a/drivers/media/platform/chips-media/wave5/wave5-vpu.c
+++ b/drivers/media/platform/chips-media/wave5/wave5-vpu.c
@@ -271,7 +271,7 @@ static int wave5_vpu_probe(struct platform_device *pdev)
 		dev_err(&pdev->dev, "failed to get irq resource, falling back to polling\n");
 		hrtimer_init(&dev->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
 		dev->hrtimer.function = &wave5_vpu_timer_callback;
-		dev->worker = kthread_create_worker(0, "vpu_irq_thread");
+		dev->worker = kthread_run_worker(0, "vpu_irq_thread");
 		if (IS_ERR(dev->worker)) {
 			dev_err(&pdev->dev, "failed to create vpu irq worker\n");
 			ret = PTR_ERR(dev->worker);
diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 3a792f79270d..377e66cf7a48 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -394,7 +394,7 @@ static int mv88e6xxx_irq_poll_setup(struct mv88e6xxx_chip *chip)
 	kthread_init_delayed_work(&chip->irq_poll_work,
 				  mv88e6xxx_irq_poll);
 
-	chip->kworker = kthread_create_worker(0, "%s", dev_name(chip->dev));
+	chip->kworker = kthread_run_worker(0, "%s", dev_name(chip->dev));
 	if (IS_ERR(chip->kworker))
 		return PTR_ERR(chip->kworker);
 
diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.c b/drivers/net/ethernet/intel/ice/ice_dpll.c
index d5ad6d84007c..75570be61fef 100644
--- a/drivers/net/ethernet/intel/ice/ice_dpll.c
+++ b/drivers/net/ethernet/intel/ice/ice_dpll.c
@@ -2053,7 +2053,7 @@ static int ice_dpll_init_worker(struct ice_pf *pf)
 	struct kthread_worker *kworker;
 
 	kthread_init_delayed_work(&d->work, ice_dpll_periodic_work);
-	kworker = kthread_create_worker(0, "ice-dplls-%s",
+	kworker = kthread_run_worker(0, "ice-dplls-%s",
 					dev_name(ice_pf_to_dev(pf)));
 	if (IS_ERR(kworker))
 		return PTR_ERR(kworker);
diff --git a/drivers/net/ethernet/intel/ice/ice_gnss.c b/drivers/net/ethernet/intel/ice/ice_gnss.c
index f02e8ca55375..b2148dbe49b2 100644
--- a/drivers/net/ethernet/intel/ice/ice_gnss.c
+++ b/drivers/net/ethernet/intel/ice/ice_gnss.c
@@ -182,7 +182,7 @@ static struct gnss_serial *ice_gnss_struct_init(struct ice_pf *pf)
 	pf->gnss_serial = gnss;
 
 	kthread_init_delayed_work(&gnss->read_work, ice_gnss_read);
-	kworker = kthread_create_worker(0, "ice-gnss-%s", dev_name(dev));
+	kworker = kthread_run_worker(0, "ice-gnss-%s", dev_name(dev));
 	if (IS_ERR(kworker)) {
 		kfree(gnss);
 		return NULL;
diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c
index a999fface272..3154bb674dd3 100644
--- a/drivers/net/ethernet/intel/ice/ice_ptp.c
+++ b/drivers/net/ethernet/intel/ice/ice_ptp.c
@@ -3080,7 +3080,7 @@ static int ice_ptp_init_work(struct ice_pf *pf, struct ice_ptp *ptp)
 	/* Allocate a kworker for handling work required for the ports
 	 * connected to the PTP hardware clock.
 	 */
-	kworker = kthread_create_worker(0, "ice-ptp-%s",
+	kworker = kthread_run_worker(0, "ice-ptp-%s",
 					dev_name(ice_pf_to_dev(pf)));
 	if (IS_ERR(kworker))
 		return PTR_ERR(kworker);
diff --git a/drivers/platform/chrome/cros_ec_spi.c b/drivers/platform/chrome/cros_ec_spi.c
index 86a3d32a7763..08f566cc1480 100644
--- a/drivers/platform/chrome/cros_ec_spi.c
+++ b/drivers/platform/chrome/cros_ec_spi.c
@@ -715,7 +715,7 @@ static int cros_ec_spi_devm_high_pri_alloc(struct device *dev,
 	int err;
 
 	ec_spi->high_pri_worker =
-		kthread_create_worker(0, "cros_ec_spi_high_pri");
+		kthread_run_worker(0, "cros_ec_spi_high_pri");
 
 	if (IS_ERR(ec_spi->high_pri_worker)) {
 		err = PTR_ERR(ec_spi->high_pri_worker);
diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c
index 77a36e7bddd5..b932425ddc6a 100644
--- a/drivers/ptp/ptp_clock.c
+++ b/drivers/ptp/ptp_clock.c
@@ -296,7 +296,7 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info,
 
 	if (ptp->info->do_aux_work) {
 		kthread_init_delayed_work(&ptp->aux_work, ptp_aux_kworker);
-		ptp->kworker = kthread_create_worker(0, "ptp%d", ptp->index);
+		ptp->kworker = kthread_run_worker(0, "ptp%d", ptp->index);
 		if (IS_ERR(ptp->kworker)) {
 			err = PTR_ERR(ptp->kworker);
 			pr_err("failed to create ptp aux_worker %d\n", err);
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index ff1add2ecb91..e4aa8f838934 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -2060,7 +2060,7 @@ static int spi_init_queue(struct spi_controller *ctlr)
 	ctlr->busy = false;
 	ctlr->queue_empty = true;
 
-	ctlr->kworker = kthread_create_worker(0, dev_name(&ctlr->dev));
+	ctlr->kworker = kthread_run_worker(0, dev_name(&ctlr->dev));
 	if (IS_ERR(ctlr->kworker)) {
 		dev_err(&ctlr->dev, "failed to create message pump kworker\n");
 		return PTR_ERR(ctlr->kworker);
diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c
index 6021eeb903fe..95c0c63119ac 100644
--- a/drivers/usb/typec/tcpm/tcpm.c
+++ b/drivers/usb/typec/tcpm/tcpm.c
@@ -7635,7 +7635,7 @@ struct tcpm_port *tcpm_register_port(struct device *dev, struct tcpc_dev *tcpc)
 	mutex_init(&port->lock);
 	mutex_init(&port->swap_lock);
 
-	port->wq = kthread_create_worker(0, dev_name(dev));
+	port->wq = kthread_run_worker(0, dev_name(dev));
 	if (IS_ERR(port->wq))
 		return ERR_CAST(port->wq);
 	sched_set_fifo(port->wq->task);
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index 8ffea8430f95..c204fc8e471a 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -229,7 +229,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr,
 	dev = &vdpasim->vdpa.dev;
 
 	kthread_init_work(&vdpasim->work, vdpasim_work_fn);
-	vdpasim->worker = kthread_create_worker(0, "vDPA sim worker: %s",
+	vdpasim->worker = kthread_run_worker(0, "vDPA sim worker: %s",
 						dev_attr->name);
 	if (IS_ERR(vdpasim->worker))
 		goto err_iommu;
diff --git a/drivers/watchdog/watchdog_dev.c b/drivers/watchdog/watchdog_dev.c
index 4190cb800cc4..19698d87dc57 100644
--- a/drivers/watchdog/watchdog_dev.c
+++ b/drivers/watchdog/watchdog_dev.c
@@ -1229,7 +1229,7 @@ int __init watchdog_dev_init(void)
 {
 	int err;
 
-	watchdog_kworker = kthread_create_worker(0, "watchdogd");
+	watchdog_kworker = kthread_run_worker(0, "watchdogd");
 	if (IS_ERR(watchdog_kworker)) {
 		pr_err("Failed to create watchdog kworker\n");
 		return PTR_ERR(watchdog_kworker);
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index a23392327ce2..35381c00ee09 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -320,7 +320,7 @@ static void erofs_destroy_percpu_workers(void)
 static struct kthread_worker *erofs_init_percpu_worker(int cpu)
 {
 	struct kthread_worker *worker =
-		kthread_create_worker_on_cpu(cpu, 0, "erofs_worker/%u");
+		kthread_run_worker_on_cpu(cpu, 0, "erofs_worker/%u");
 
 	if (IS_ERR(worker))
 		return worker;
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 0c66e7c1092a..8d27403888ce 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -193,19 +193,53 @@ struct kthread_worker *kthread_create_worker_on_node(unsigned int flags,
 						     const char namefmt[], ...);
 
 #define kthread_create_worker(flags, namefmt, ...) \
-({									   \
-	struct kthread_worker *__kw					   \
-		= kthread_create_worker_on_node(flags, NUMA_NO_NODE,	   \
-						namefmt, ## __VA_ARGS__);  \
-	if (!IS_ERR(__kw))						   \
-		wake_up_process(__kw->task);				   \
-	__kw;								   \
+	kthread_create_worker_on_node(flags, NUMA_NO_NODE, namefmt, ## __VA_ARGS__);
+
+/**
+ * kthread_run_worker - create and wake a kthread worker.
+ * @flags: flags modifying the default behavior of the worker
+ * @namefmt: printf-style name for the thread.
+ *
+ * Description: Convenient wrapper for kthread_create_worker() followed by
+ * wake_up_process().  Returns the kthread_worker or ERR_PTR(-ENOMEM).
+ */
+#define kthread_run_worker(flags, namefmt, ...)					\
+({										\
+	struct kthread_worker *__kw						\
+		= kthread_create_worker(flags, namefmt, ## __VA_ARGS__);	\
+	if (!IS_ERR(__kw))							\
+		wake_up_process(__kw->task);					\
+	__kw;									\
 })
 
 struct kthread_worker *
 kthread_create_worker_on_cpu(int cpu, unsigned int flags,
 			     const char namefmt[]);
 
+/**
+ * kthread_run_worker_on_cpu - create and wake a cpu bound kthread worker.
+ * @cpu: CPU number
+ * @flags: flags modifying the default behavior of the worker
+ * @namefmt: printf-style name for the thread. Format is restricted
+ *	     to "name.*%u". Code fills in cpu number.
+ *
+ * Description: Convenient wrapper for kthread_create_worker_on_cpu()
+ * followed by wake_up_process().  Returns the kthread_worker or
+ * ERR_PTR(-ENOMEM).
+ */
+static inline struct kthread_worker *
+kthread_run_worker_on_cpu(int cpu, unsigned int flags,
+			  const char namefmt[])
+{
+	struct kthread_worker *kw;
+
+	kw = kthread_create_worker_on_cpu(cpu, flags, namefmt);
+	if (!IS_ERR(kw))
+		wake_up_process(kw->task);
+
+	return kw;
+}
+
 bool kthread_queue_work(struct kthread_worker *worker,
 			struct kthread_work *work);
 
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 429ada0dbe93..97a62e88b858 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1077,33 +1077,10 @@ kthread_create_worker_on_node(unsigned int flags, int node, const char namefmt[]
 	worker = __kthread_create_worker_on_node(flags, node, namefmt, args);
 	va_end(args);
 
-	if (worker)
-		wake_up_process(worker->task);
-
 	return worker;
 }
 EXPORT_SYMBOL(kthread_create_worker_on_node);
 
-static __printf(3, 4) struct kthread_worker *
-__kthread_create_worker_on_cpu(int cpu, unsigned int flags,
-			       const char namefmt[], ...)
-{
-	struct kthread_worker *worker;
-	va_list args;
-
-	va_start(args, namefmt);
-	worker = __kthread_create_worker_on_node(flags, cpu_to_node(cpu),
-						 namefmt, args);
-	va_end(args);
-
-	if (worker) {
-		kthread_bind(worker->task, cpu);
-		wake_up_process(worker->task);
-	}
-
-	return worker;
-}
-
 /**
  * kthread_create_worker_on_cpu - create a kthread worker and bind it
  *	to a given CPU and the associated NUMA node.
@@ -1144,7 +1121,13 @@ struct kthread_worker *
 kthread_create_worker_on_cpu(int cpu, unsigned int flags,
 			     const char namefmt[])
 {
-	return __kthread_create_worker_on_cpu(cpu, flags, namefmt, cpu);
+	struct kthread_worker *worker;
+
+	worker = kthread_create_worker_on_node(flags, cpu_to_node(cpu), namefmt, cpu);
+	if (!IS_ERR(worker))
+		kthread_bind(worker->task, cpu);
+
+	return worker;
 }
 EXPORT_SYMBOL(kthread_create_worker_on_cpu);
 
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 4a4c49821058..d4b8e87a473b 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4906,7 +4906,7 @@ static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp)
 	if (rnp->exp_kworker)
 		return;
 
-	kworker = kthread_create_worker(0, name, rnp_index);
+	kworker = kthread_run_worker(0, name, rnp_index);
 	if (IS_ERR_OR_NULL(kworker)) {
 		pr_err("Failed to create par gp kworker on %d/%d\n",
 		       rnp->grplo, rnp->grphi);
@@ -4933,7 +4933,7 @@ static void __init rcu_start_exp_gp_kworker(void)
 	const char *name = "rcu_exp_gp_kthread_worker";
 	struct sched_param param = { .sched_priority = kthread_prio };
 
-	rcu_exp_gp_kworker = kthread_create_worker(0, name);
+	rcu_exp_gp_kworker = kthread_run_worker(0, name);
 	if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) {
 		pr_err("Failed to create %s!\n", name);
 		rcu_exp_gp_kworker = NULL;
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 7fff1d045477..ab8962d2e9d3 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -5352,7 +5352,7 @@ static struct kthread_worker *scx_create_rt_helper(const char *name)
 {
 	struct kthread_worker *helper;
 
-	helper = kthread_create_worker(0, name);
+	helper = kthread_run_worker(0, name);
 	if (helper)
 		sched_set_fifo(helper->task);
 	return helper;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8b07576814a5..fe01c1f8095c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -7828,7 +7828,7 @@ static void __init wq_cpu_intensive_thresh_init(void)
 	unsigned long thresh;
 	unsigned long bogo;
 
-	pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release");
+	pwq_release_worker = kthread_run_worker(0, "pool_workqueue_release");
 	BUG_ON(IS_ERR(pwq_release_worker));
 
 	/* if the user set it to a specific value, keep it */
diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c
index 281bbac5539d..c33d4bf17929 100644
--- a/net/dsa/tag_ksz.c
+++ b/net/dsa/tag_ksz.c
@@ -66,7 +66,7 @@ static int ksz_connect(struct dsa_switch *ds)
 	if (!priv)
 		return -ENOMEM;
 
-	xmit_worker = kthread_create_worker(0, "dsa%d:%d_xmit",
+	xmit_worker = kthread_run_worker(0, "dsa%d:%d_xmit",
 					    ds->dst->index, ds->index);
 	if (IS_ERR(xmit_worker)) {
 		ret = PTR_ERR(xmit_worker);
diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c
index 8e8b1bef6af6..6ce0bc166792 100644
--- a/net/dsa/tag_ocelot_8021q.c
+++ b/net/dsa/tag_ocelot_8021q.c
@@ -110,7 +110,7 @@ static int ocelot_connect(struct dsa_switch *ds)
 	if (!priv)
 		return -ENOMEM;
 
-	priv->xmit_worker = kthread_create_worker(0, "felix_xmit");
+	priv->xmit_worker = kthread_run_worker(0, "felix_xmit");
 	if (IS_ERR(priv->xmit_worker)) {
 		err = PTR_ERR(priv->xmit_worker);
 		kfree(priv);
diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
index 3e902af7eea6..02adec693811 100644
--- a/net/dsa/tag_sja1105.c
+++ b/net/dsa/tag_sja1105.c
@@ -707,7 +707,7 @@ static int sja1105_connect(struct dsa_switch *ds)
 
 	spin_lock_init(&priv->meta_lock);
 
-	xmit_worker = kthread_create_worker(0, "dsa%d:%d_xmit",
+	xmit_worker = kthread_run_worker(0, "dsa%d:%d_xmit",
 					    ds->dst->index, ds->index);
 	if (IS_ERR(xmit_worker)) {
 		err = PTR_ERR(xmit_worker);
-- 
2.46.0


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 19/19] rcu: Use kthread preferred affinity for RCU exp kworkers
       [not found] <20241211154035.75565-1-frederic@kernel.org>
                   ` (6 preceding siblings ...)
  2024-12-11 15:40 ` [PATCH 18/19] treewide: Introduce kthread_run_worker[_on_cpu]() Frederic Weisbecker
@ 2024-12-11 15:40 ` Frederic Weisbecker
  7 siblings, 0 replies; 22+ messages in thread
From: Frederic Weisbecker @ 2024-12-11 15:40 UTC (permalink / raw)
  To: LKML
  Cc: Frederic Weisbecker, Paul E. McKenney, Uladzislau Rezki,
	Neeraj Upadhyay, Joel Fernandes, Boqun Feng, Zqiang, rcu,
	Andrew Morton, Peter Zijlstra, Thomas Gleixner, Michal Hocko,
	Vlastimil Babka

Now that kthreads have an infrastructure to handle their preferred
affinity against CPU hotplug and the housekeeping cpumask, convert RCU
exp workers to use it instead of handling all the constraints by
themselves.

Acked-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/rcu/tree.c | 105 +++++++++-------------------------------------
 1 file changed, 19 insertions(+), 86 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index d4b8e87a473b..c160e05dfb7c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4894,6 +4894,22 @@ rcu_boot_init_percpu_data(int cpu)
 	rcu_boot_init_nocb_percpu_data(rdp);
 }
 
+static void rcu_thread_affine_rnp(struct task_struct *t, struct rcu_node *rnp)
+{
+	cpumask_var_t affinity;
+	int cpu;
+
+	if (!zalloc_cpumask_var(&affinity, GFP_KERNEL))
+		return;
+
+	for_each_leaf_node_possible_cpu(rnp, cpu)
+		cpumask_set_cpu(cpu, affinity);
+
+	kthread_affine_preferred(t, affinity);
+
+	free_cpumask_var(affinity);
+}
+
 struct kthread_worker *rcu_exp_gp_kworker;
 
 static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp)
@@ -4906,7 +4922,7 @@ static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp)
 	if (rnp->exp_kworker)
 		return;
 
-	kworker = kthread_run_worker(0, name, rnp_index);
+	kworker = kthread_create_worker(0, name, rnp_index);
 	if (IS_ERR_OR_NULL(kworker)) {
 		pr_err("Failed to create par gp kworker on %d/%d\n",
 		       rnp->grplo, rnp->grphi);
@@ -4916,16 +4932,9 @@ static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp)
 
 	if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD))
 		sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param);
-}
 
-static struct task_struct *rcu_exp_par_gp_task(struct rcu_node *rnp)
-{
-	struct kthread_worker *kworker = READ_ONCE(rnp->exp_kworker);
-
-	if (!kworker)
-		return NULL;
-
-	return kworker->task;
+	rcu_thread_affine_rnp(kworker->task, rnp);
+	wake_up_process(kworker->task);
 }
 
 static void __init rcu_start_exp_gp_kworker(void)
@@ -5010,79 +5019,6 @@ int rcutree_prepare_cpu(unsigned int cpu)
 	return 0;
 }
 
-static void rcu_thread_affine_rnp(struct task_struct *t, struct rcu_node *rnp)
-{
-	cpumask_var_t affinity;
-	int cpu;
-
-	if (!zalloc_cpumask_var(&affinity, GFP_KERNEL))
-		return;
-
-	for_each_leaf_node_possible_cpu(rnp, cpu)
-		cpumask_set_cpu(cpu, affinity);
-
-	kthread_affine_preferred(t, affinity);
-
-	free_cpumask_var(affinity);
-}
-
-/*
- * Update kthreads affinity during CPU-hotplug changes.
- *
- * Set the per-rcu_node kthread's affinity to cover all CPUs that are
- * served by the rcu_node in question.  The CPU hotplug lock is still
- * held, so the value of rnp->qsmaskinit will be stable.
- *
- * We don't include outgoingcpu in the affinity set, use -1 if there is
- * no outgoing CPU.  If there are no CPUs left in the affinity set,
- * this function allows the kthread to execute on any CPU.
- *
- * Any future concurrent calls are serialized via ->kthread_mutex.
- */
-static void rcutree_affinity_setting(unsigned int cpu, int outgoingcpu)
-{
-	cpumask_var_t cm;
-	unsigned long mask;
-	struct rcu_data *rdp;
-	struct rcu_node *rnp;
-	struct task_struct *task_exp;
-
-	rdp = per_cpu_ptr(&rcu_data, cpu);
-	rnp = rdp->mynode;
-
-	task_exp = rcu_exp_par_gp_task(rnp);
-
-	/*
-	 * If CPU is the boot one, this task is created later from early
-	 * initcall since kthreadd must be created first.
-	 */
-	if (!task_exp)
-		return;
-
-	if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
-		return;
-
-	mutex_lock(&rnp->kthread_mutex);
-	mask = rcu_rnp_online_cpus(rnp);
-	for_each_leaf_node_possible_cpu(rnp, cpu)
-		if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
-		    cpu != outgoingcpu)
-			cpumask_set_cpu(cpu, cm);
-	cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU));
-	if (cpumask_empty(cm)) {
-		cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU));
-		if (outgoingcpu >= 0)
-			cpumask_clear_cpu(outgoingcpu, cm);
-	}
-
-	if (task_exp)
-		set_cpus_allowed_ptr(task_exp, cm);
-
-	mutex_unlock(&rnp->kthread_mutex);
-
-	free_cpumask_var(cm);
-}
-
 /*
  * Has the specified (known valid) CPU ever been fully online?
  */
@@ -5111,7 +5047,6 @@ int rcutree_online_cpu(unsigned int cpu)
 	if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
 		return 0; /* Too early in boot for scheduler work. */
 	sync_sched_exp_online_cleanup(cpu);
-	rcutree_affinity_setting(cpu, -1);
 
 	// Stop-machine done, so allow nohz_full to disable tick.
 	tick_dep_clear(TICK_DEP_BIT_RCU);
@@ -5328,8 +5263,6 @@ int rcutree_offline_cpu(unsigned int cpu)
 	rnp->ffmask &= ~rdp->grpmask;
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 
-	rcutree_affinity_setting(cpu, cpu);
-
 	// nohz_full CPUs need the tick for stop-machine to work quickly
 	tick_dep_set(TICK_DEP_BIT_RCU);
 	return 0;
-- 
2.46.0


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH 10/19] sched,arm64: Handle CPU isolation on last resort fallback rq selection
  2024-12-11 15:40 ` [PATCH 10/19] sched,arm64: Handle CPU isolation on last resort fallback rq selection Frederic Weisbecker
@ 2025-01-03 15:27   ` Will Deacon
  2025-01-04 23:22     ` Frederic Weisbecker
  0 siblings, 1 reply; 22+ messages in thread
From: Will Deacon @ 2025-01-03 15:27 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: LKML, Catalin Marinas, Marc Zyngier, Oliver Upton, Ard Biesheuvel,
	Mark Rutland, Peter Zijlstra, Vincent Guittot, Thomas Gleixner,
	Vlastimil Babka, Paul E. McKenney, Neeraj Upadhyay,
	Joel Fernandes, Boqun Feng, Zqiang, Uladzislau Rezki, rcu,
	Michal Hocko, Andrew Morton

On Wed, Dec 11, 2024 at 04:40:23PM +0100, Frederic Weisbecker wrote:
> When a kthread or any other task has an affinity mask that is fully
> offline or disallowed, the scheduler reaffines the task to all possible
> CPUs as a last resort.
> 
> This default decision doesn't mix well with nohz_full CPUs, which are
> part of the possible cpumask but don't want to be disturbed by unbound
> kthreads or even detached pinned user tasks.
> 
> Make the fallback affinity setting aware of nohz_full.
> 
> Suggested-by: Michal Hocko <mhocko@suse.com>
> Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
> ---
>  arch/arm64/include/asm/cpufeature.h  |  1 +
>  arch/arm64/include/asm/mmu_context.h |  2 ++
>  arch/arm64/kernel/cpufeature.c       | 11 +++++++++++
>  include/linux/mmu_context.h          |  1 +
>  kernel/sched/core.c                  |  2 +-
>  5 files changed, 16 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
> index 8b4e5a3cd24c..cac5efc836c0 100644
> --- a/arch/arm64/include/asm/cpufeature.h
> +++ b/arch/arm64/include/asm/cpufeature.h
> @@ -671,6 +671,7 @@ static inline bool supports_clearbhb(int scope)
>  }
>  
>  const struct cpumask *system_32bit_el0_cpumask(void);
> +const struct cpumask *fallback_32bit_el0_cpumask(void);
>  DECLARE_STATIC_KEY_FALSE(arm64_mismatched_32bit_el0);
>  
>  static inline bool system_supports_32bit_el0(void)
> diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
> index 48b3d9553b67..7883abd6b29a 100644
> --- a/arch/arm64/include/asm/mmu_context.h
> +++ b/arch/arm64/include/asm/mmu_context.h
> @@ -283,6 +283,8 @@ task_cpu_possible_mask(struct task_struct *p)
>  }
>  #define task_cpu_possible_mask	task_cpu_possible_mask
>  
> +const struct cpumask *task_cpu_fallback_mask(struct task_struct *p);
> +
>  void verify_cpu_asid_bits(void);
>  void post_ttbr_update_workaround(void);
>  
> diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
> index 7ce1b8ab417f..2b7aa32bf436 100644
> --- a/arch/arm64/kernel/cpufeature.c
> +++ b/arch/arm64/kernel/cpufeature.c
> @@ -1642,6 +1642,17 @@ const struct cpumask *system_32bit_el0_cpumask(void)
>  	return cpu_possible_mask;
>  }
>  
> +const struct cpumask *task_cpu_fallback_mask(struct task_struct *p)
> +{
> +	if (!static_branch_unlikely(&arm64_mismatched_32bit_el0))
> +		return housekeeping_cpumask(HK_TYPE_TICK);
> +
> +	if (!is_compat_thread(task_thread_info(p)))
> +		return housekeeping_cpumask(HK_TYPE_TICK);
> +
> +	return system_32bit_el0_cpumask();
> +}

I think this is correct, but damn, what we really want to ask for is
the intersection of task_cpu_possible_mask(p) and
housekeeping_cpumask(HK_TYPE_TICK). It's a shame to duplicate the logic
of task_cpu_possible_mask() here, but we don't want to allocate a
temporary mask just to compute that intersection.

Maybe we could have a helper to consolidate things a little?

static inline const struct cpumask *
__task_cpu_possible_mask(struct task_struct *p, const struct cpumask *mask)
{
	if (!static_branch_unlikely(&arm64_mismatched_32bit_el0))
		return mask;

	if (!is_compat_thread(task_thread_info(p)))
		return mask;

	return system_32bit_el0_cpumask();
}

Then we could call that from both task_cpu_possible_mask() and
task_cpu_fallback_mask(), but passing 'cpu_possible_mask' and
housekeeping_cpumask(HK_TYPE_TICK) for the 'mask' argument respectively?
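
For concreteness, a sketch of the two thin wrappers that scheme implies
(Frederic's updated patch later in the thread takes exactly this shape):

static inline const struct cpumask *
task_cpu_possible_mask(struct task_struct *p)
{
	return __task_cpu_possible_mask(p, cpu_possible_mask);
}

const struct cpumask *task_cpu_fallback_mask(struct task_struct *p)
{
	return __task_cpu_possible_mask(p, housekeeping_cpumask(HK_TYPE_TICK));
}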

Will


* Re: [PATCH 10/19] sched,arm64: Handle CPU isolation on last resort fallback rq selection
  2025-01-03 15:27   ` Will Deacon
@ 2025-01-04 23:22     ` Frederic Weisbecker
  2025-01-08 15:50       ` Will Deacon
  0 siblings, 1 reply; 22+ messages in thread
From: Frederic Weisbecker @ 2025-01-04 23:22 UTC (permalink / raw)
  To: Will Deacon
  Cc: LKML, Catalin Marinas, Marc Zyngier, Oliver Upton, Ard Biesheuvel,
	Mark Rutland, Peter Zijlstra, Vincent Guittot, Thomas Gleixner,
	Vlastimil Babka, Paul E. McKenney, Neeraj Upadhyay,
	Joel Fernandes, Boqun Feng, Zqiang, Uladzislau Rezki, rcu,
	Michal Hocko, Andrew Morton

On Fri, Jan 03, 2025 at 03:27:03PM +0000, Will Deacon wrote:
> On Wed, Dec 11, 2024 at 04:40:23PM +0100, Frederic Weisbecker wrote:
> > +const struct cpumask *task_cpu_fallback_mask(struct task_struct *p)
> > +{
> > +	if (!static_branch_unlikely(&arm64_mismatched_32bit_el0))
> > +		return housekeeping_cpumask(HK_TYPE_TICK);
> > +
> > +	if (!is_compat_thread(task_thread_info(p)))
> > +		return housekeeping_cpumask(HK_TYPE_TICK);
> > +
> > +	return system_32bit_el0_cpumask();
> > +}
> 
> I think this is correct, but damn, what we really want to ask for is
> the intersection of task_cpu_possible_mask(p) and
> housekeeping_cpumask(HK_TYPE_TICK). It's a shame to duplicate the logic
> of task_cpu_possible_mask() here, but we don't want to allocate a
> temporary mask just to compute that intersection.

Yeah I know :-/

> 
> Maybe we could have a helper to consolidate things a little?
> 
> static inline const struct cpumask *
> __task_cpu_possible_mask(struct task_struct *p, const struct cpumask *mask)
> {
> 	if (!static_branch_unlikely(&arm64_mismatched_32bit_el0))
> 		return mask;
> 
> 	if (!is_compat_thread(task_thread_info(p)))
> 		return mask;
> 
> 	return system_32bit_el0_cpumask();
> }
> 
> Then we could call that from both task_cpu_possible_mask() and
> task_cpu_fallback_mask(), but passing 'cpu_possible_mask' and
> housekeeping_cpumask(HK_TYPE_TICK) for the 'mask' argument respectively?

Good point! How about the following updated version?

---
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 27 Sep 2024 00:48:59 +0200
Subject: [PATCH 2/2] sched,arm64: Handle CPU isolation on last resort fallback
 rq selection

When a kthread or any other task has an affinity mask that is fully
offline or disallowed, the scheduler reaffines the task to all possible
CPUs as a last resort.

This default decision doesn't mix well with nohz_full CPUs, which are
part of the possible cpumask but don't want to be disturbed by unbound
kthreads or even detached pinned user tasks.

Make the fallback affinity setting aware of nohz_full.

Suggested-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 arch/arm64/include/asm/cpufeature.h  |  1 +
 arch/arm64/include/asm/mmu_context.h | 14 +++++++++++---
 arch/arm64/kernel/cpufeature.c       |  5 +++++
 include/linux/mmu_context.h          |  1 +
 kernel/sched/core.c                  |  2 +-
 5 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 8b4e5a3cd24c..cac5efc836c0 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -671,6 +671,7 @@ static inline bool supports_clearbhb(int scope)
 }
 
 const struct cpumask *system_32bit_el0_cpumask(void);
+const struct cpumask *fallback_32bit_el0_cpumask(void);
 DECLARE_STATIC_KEY_FALSE(arm64_mismatched_32bit_el0);
 
 static inline bool system_supports_32bit_el0(void)
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index 48b3d9553b67..0dbe3b29049b 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -271,18 +271,26 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next,
 }
 
 static inline const struct cpumask *
-task_cpu_possible_mask(struct task_struct *p)
+__task_cpu_possible_mask(struct task_struct *p, const struct cpumask *mask)
 {
 	if (!static_branch_unlikely(&arm64_mismatched_32bit_el0))
-		return cpu_possible_mask;
+		return mask;
 
 	if (!is_compat_thread(task_thread_info(p)))
-		return cpu_possible_mask;
+		return mask;
 
 	return system_32bit_el0_cpumask();
 }
+
+static inline const struct cpumask *
+task_cpu_possible_mask(struct task_struct *p)
+{
+	return __task_cpu_possible_mask(p, cpu_possible_mask);
+}
 #define task_cpu_possible_mask	task_cpu_possible_mask
 
+const struct cpumask *task_cpu_fallback_mask(struct task_struct *p);
+
 void verify_cpu_asid_bits(void);
 void post_ttbr_update_workaround(void);
 
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 3c87659c14db..a983e8660987 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1642,6 +1642,11 @@ const struct cpumask *system_32bit_el0_cpumask(void)
 	return cpu_possible_mask;
 }
 
+const struct cpumask *task_cpu_fallback_mask(struct task_struct *p)
+{
+	return __task_cpu_possible_mask(p, housekeeping_cpumask(HK_TYPE_TICK));
+}
+
 static int __init parse_32bit_el0_param(char *str)
 {
 	allow_mismatched_32bit_el0 = true;
diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h
index bbaec80c78c5..ac01dc4eb2ce 100644
--- a/include/linux/mmu_context.h
+++ b/include/linux/mmu_context.h
@@ -24,6 +24,7 @@ static inline void leave_mm(void) { }
 #ifndef task_cpu_possible_mask
 # define task_cpu_possible_mask(p)	cpu_possible_mask
 # define task_cpu_possible(cpu, p)	true
+# define task_cpu_fallback_mask(p)	housekeeping_cpumask(HK_TYPE_TICK)
 #else
 # define task_cpu_possible(cpu, p)	cpumask_test_cpu((cpu), task_cpu_possible_mask(p))
 #endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 95e40895a519..233b50b0e123 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3534,7 +3534,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 			 *
 			 * More yuck to audit.
 			 */
-			do_set_cpus_allowed(p, task_cpu_possible_mask(p));
+			do_set_cpus_allowed(p, task_cpu_fallback_mask(p));
 			state = fail;
 			break;
 		case fail:
-- 
2.46.0
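
To make the change concrete, here is a toy userspace model of the
fallback mask selection -- a hedged sketch with made-up masks, not
kernel code:

#include <stdbool.h>
#include <stdio.h>

/* Toy 8-CPU system, one bit per CPU; the masks below are assumptions. */
typedef unsigned long cpumask_t;

static const cpumask_t cpu_possible = 0xff; /* CPUs 0-7 */
static const cpumask_t hk_tick      = 0x03; /* nohz_full=2-7 -> CPUs 0-1 */
static const cpumask_t el0_32bit    = 0x0f; /* CPUs 0-3 can run 32-bit EL0 */

/*
 * Models __task_cpu_possible_mask(): a compat task on a mismatched
 * 32-bit EL0 system is constrained to the 32-bit-capable CPUs;
 * everybody else gets the base mask passed in.
 */
static cpumask_t task_mask(bool mismatched_32bit_el0, bool compat,
			   cpumask_t base)
{
	if (!mismatched_32bit_el0 || !compat)
		return base;
	return el0_32bit;
}

int main(void)
{
	/* Before the patch the last resort was all possible CPUs... */
	printf("old fallback (64-bit task): %#lx\n",
	       task_mask(true, false, cpu_possible));
	/* ...afterwards it is the tick housekeeping CPUs only... */
	printf("new fallback (64-bit task): %#lx\n",
	       task_mask(true, false, hk_tick));
	/* ...except for compat tasks, which keep the 32-bit-capable set. */
	printf("new fallback (compat task): %#lx\n",
	       task_mask(true, true, hk_tick));
	return 0;
}

So with nohz_full=2-7, a task whose affinity became fully offline now
lands on CPUs 0-1 instead of being spread over all eight CPUs.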



* Re: [PATCH 10/19] sched,arm64: Handle CPU isolation on last resort fallback rq selection
  2025-01-04 23:22     ` Frederic Weisbecker
@ 2025-01-08 15:50       ` Will Deacon
  0 siblings, 0 replies; 22+ messages in thread
From: Will Deacon @ 2025-01-08 15:50 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: LKML, Catalin Marinas, Marc Zyngier, Oliver Upton, Ard Biesheuvel,
	Mark Rutland, Peter Zijlstra, Vincent Guittot, Thomas Gleixner,
	Vlastimil Babka, Paul E. McKenney, Neeraj Upadhyay,
	Joel Fernandes, Boqun Feng, Zqiang, Uladzislau Rezki, rcu,
	Michal Hocko, Andrew Morton

On Sun, Jan 05, 2025 at 12:22:33AM +0100, Frederic Weisbecker wrote:
> On Fri, Jan 03, 2025 at 03:27:03PM +0000, Will Deacon wrote:
> > Maybe we could have a helper to consolidate things a little?
> > 
> > static inline const struct cpumask *
> > __task_cpu_possible_mask(struct task_struct *p, const struct cpumask *mask)
> > {
> > 	if (!static_branch_unlikely(&arm64_mismatched_32bit_el0))
> > 		return mask;
> > 
> > 	if (!is_compat_thread(task_thread_info(p)))
> > 		return mask;
> > 
> > 	return system_32bit_el0_cpumask();
> > }
> > 
> > Then we could call that from both task_cpu_possible_mask() and
> > task_cpu_fallback_mask(), but passing 'cpu_possible_mask' and
> > housekeeping_cpumask(HK_TYPE_TICK) for the 'mask' argument respectively?
> 
> Good point! How about the following updated version?

That's exactly what I had in mind, thanks!

Acked-by: Will Deacon <will@kernel.org>

Will

> ---
> From: Frederic Weisbecker <frederic@kernel.org>
> Date: Fri, 27 Sep 2024 00:48:59 +0200
> Subject: [PATCH 2/2] sched,arm64: Handle CPU isolation on last resort fallback
>  rq selection
> 
> When a kthread or any other task has an affinity mask that is fully
> offline or disallowed, the scheduler reaffines the task to all possible
> CPUs as a last resort.
> 
> This default decision doesn't mix well with nohz_full CPUs, which are
> part of the possible cpumask but don't want to be disturbed by unbound
> kthreads or even detached pinned user tasks.
> 
> Make the fallback affinity setting aware of nohz_full.
> 
> Suggested-by: Michal Hocko <mhocko@suse.com>
> Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
> ---
>  arch/arm64/include/asm/cpufeature.h  |  1 +
>  arch/arm64/include/asm/mmu_context.h | 14 +++++++++++---
>  arch/arm64/kernel/cpufeature.c       |  5 +++++
>  include/linux/mmu_context.h          |  1 +
>  kernel/sched/core.c                  |  2 +-
>  5 files changed, 19 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
> index 8b4e5a3cd24c..cac5efc836c0 100644
> --- a/arch/arm64/include/asm/cpufeature.h
> +++ b/arch/arm64/include/asm/cpufeature.h
> @@ -671,6 +671,7 @@ static inline bool supports_clearbhb(int scope)
>  }
>  
>  const struct cpumask *system_32bit_el0_cpumask(void);
> +const struct cpumask *fallback_32bit_el0_cpumask(void);
>  DECLARE_STATIC_KEY_FALSE(arm64_mismatched_32bit_el0);
>  
>  static inline bool system_supports_32bit_el0(void)
> diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
> index 48b3d9553b67..0dbe3b29049b 100644
> --- a/arch/arm64/include/asm/mmu_context.h
> +++ b/arch/arm64/include/asm/mmu_context.h
> @@ -271,18 +271,26 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next,
>  }
>  
>  static inline const struct cpumask *
> -task_cpu_possible_mask(struct task_struct *p)
> +__task_cpu_possible_mask(struct task_struct *p, const struct cpumask *mask)
>  {
>  	if (!static_branch_unlikely(&arm64_mismatched_32bit_el0))
> -		return cpu_possible_mask;
> +		return mask;
>  
>  	if (!is_compat_thread(task_thread_info(p)))
> -		return cpu_possible_mask;
> +		return mask;
>  
>  	return system_32bit_el0_cpumask();
>  }
> +
> +static inline const struct cpumask *
> +task_cpu_possible_mask(struct task_struct *p)
> +{
> +	return __task_cpu_possible_mask(p, cpu_possible_mask);
> +}
>  #define task_cpu_possible_mask	task_cpu_possible_mask
>  
> +const struct cpumask *task_cpu_fallback_mask(struct task_struct *p);
> +
>  void verify_cpu_asid_bits(void);
>  void post_ttbr_update_workaround(void);
>  
> diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
> index 3c87659c14db..a983e8660987 100644
> --- a/arch/arm64/kernel/cpufeature.c
> +++ b/arch/arm64/kernel/cpufeature.c
> @@ -1642,6 +1642,11 @@ const struct cpumask *system_32bit_el0_cpumask(void)
>  	return cpu_possible_mask;
>  }
>  
> +const struct cpumask *task_cpu_fallback_mask(struct task_struct *p)
> +{
> +	return __task_cpu_possible_mask(p, housekeeping_cpumask(HK_TYPE_TICK));
> +}
> +
>  static int __init parse_32bit_el0_param(char *str)
>  {
>  	allow_mismatched_32bit_el0 = true;
> diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h
> index bbaec80c78c5..ac01dc4eb2ce 100644
> --- a/include/linux/mmu_context.h
> +++ b/include/linux/mmu_context.h
> @@ -24,6 +24,7 @@ static inline void leave_mm(void) { }
>  #ifndef task_cpu_possible_mask
>  # define task_cpu_possible_mask(p)	cpu_possible_mask
>  # define task_cpu_possible(cpu, p)	true
> +# define task_cpu_fallback_mask(p)	housekeeping_cpumask(HK_TYPE_TICK)
>  #else
>  # define task_cpu_possible(cpu, p)	cpumask_test_cpu((cpu), task_cpu_possible_mask(p))
>  #endif
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 95e40895a519..233b50b0e123 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3534,7 +3534,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
>  			 *
>  			 * More yuck to audit.
>  			 */
> -			do_set_cpus_allowed(p, task_cpu_possible_mask(p));
> +			do_set_cpus_allowed(p, task_cpu_fallback_mask(p));
>  			state = fail;
>  			break;
>  		case fail:
> -- 
> 2.46.0
> 


Thread overview: 22+ messages
     [not found] <20241211154035.75565-1-frederic@kernel.org>
2024-12-11 15:40 ` [PATCH 10/19] sched,arm64: Handle CPU isolation on last resort fallback rq selection Frederic Weisbecker
2025-01-03 15:27   ` Will Deacon
2025-01-04 23:22     ` Frederic Weisbecker
2025-01-08 15:50       ` Will Deacon
2024-12-11 15:40 ` [PATCH 11/19] kthread: Make sure kthread hasn't started while binding it Frederic Weisbecker
2024-12-11 15:40 ` [PATCH 12/19] kthread: Default affine kthread to its preferred NUMA node Frederic Weisbecker
2024-12-11 15:40 ` [PATCH 15/19] kthread: Implement preferred affinity Frederic Weisbecker
2024-12-11 15:40 ` [PATCH 16/19] rcu: Use kthread preferred affinity for RCU boost Frederic Weisbecker
2024-12-11 15:40 ` [PATCH 17/19] kthread: Unify kthread_create_on_cpu() and kthread_create_worker_on_cpu() automatic format Frederic Weisbecker
2024-12-11 15:40 ` [PATCH 18/19] treewide: Introduce kthread_run_worker[_on_cpu]() Frederic Weisbecker
2024-12-11 15:40 ` [PATCH 19/19] rcu: Use kthread preferred affinity for RCU exp kworkers Frederic Weisbecker
     [not found] <20240916224925.20540-1-frederic@kernel.org>
2024-09-16 22:49 ` [PATCH 12/19] kthread: Default affine kthread to its preferred NUMA node Frederic Weisbecker
2024-09-17  6:26   ` Michal Hocko
2024-09-17  7:01     ` Vlastimil Babka
2024-09-17  7:05       ` Michal Hocko
2024-09-17  7:14         ` Vlastimil Babka
2024-09-17 10:34     ` Frederic Weisbecker
2024-09-17 11:07       ` Michal Hocko
2024-09-18  9:37         ` Frederic Weisbecker
2024-09-18 11:17           ` Michal Hocko
     [not found] <20240807160228.26206-1-frederic@kernel.org>
2024-08-07 16:02 ` Frederic Weisbecker
2024-08-07 17:01   ` Vlastimil Babka
