From: Frederic Weisbecker <frederic@kernel.org>
To: LKML <linux-kernel@vger.kernel.org>
Cc: Frederic Weisbecker <frederic@kernel.org>,
Andrew Morton <akpm@linux-foundation.org>,
Kees Cook <kees@kernel.org>,
Peter Zijlstra <peterz@infradead.org>,
Thomas Gleixner <tglx@linutronix.de>,
Michal Hocko <mhocko@kernel.org>,
Vlastimil Babka <vbabka@suse.cz>,
linux-mm@kvack.org, "Paul E. McKenney" <paulmck@kernel.org>,
Neeraj Upadhyay <neeraj.upadhyay@kernel.org>,
Joel Fernandes <joel@joelfernandes.org>,
Boqun Feng <boqun.feng@gmail.com>,
Uladzislau Rezki <urezki@gmail.com>,
Zqiang <qiang.zhang1211@gmail.com>,
rcu@vger.kernel.org
Subject: [PATCH 12/19] kthread: Default affine kthread to its preferred NUMA node
Date: Wed, 11 Dec 2024 16:40:25 +0100 [thread overview]
Message-ID: <20241211154035.75565-13-frederic@kernel.org> (raw)
In-Reply-To: <20241211154035.75565-1-frederic@kernel.org>
Kthreads attached to a preferred NUMA node for their task structure
allocation can also be assumed to run preferrably within that same node.
A more precise affinity is usually notified by calling
kthread_create_on_cpu() or kthread_bind[_mask]() before the first wakeup.
For the others, a default affinity to the node is desired and sometimes
implemented with more or less success when it comes to deal with hotplug
events and nohz_full / CPU Isolation interactions:
- kcompactd is affine to its node and handles hotplug but not CPU Isolation
- kswapd is affine to its node and ignores hotplug and CPU Isolation
- A bunch of drivers create their kthreads on a specific node and
don't take care about affining further.
Handle that default node affinity preference at the generic level
instead, provided a kthread is created on an actual node and doesn't
apply any specific affinity such as a given CPU or a custom cpumask to
bind to before its first wake-up.
This generic handling is aware of CPU hotplug events and CPU isolation
such that:
* When a housekeeping CPU goes up that is part of the node of a given
kthread, the related task is re-affined to that own node if it was
previously running on the default last resort online housekeeping set
from other nodes.
* When a housekeeping CPU goes down while it was part of the node of a
kthread, the running task is migrated (or the sleeping task is woken
up) automatically by the scheduler to other housekeepers within the
same node or, as a last resort, to all housekeepers from other nodes.
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
include/linux/cpuhotplug.h | 1 +
kernel/kthread.c | 106 ++++++++++++++++++++++++++++++++++++-
2 files changed, 106 insertions(+), 1 deletion(-)
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index a04b73c40173..6cc5e484547c 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -240,6 +240,7 @@ enum cpuhp_state {
CPUHP_AP_WORKQUEUE_ONLINE,
CPUHP_AP_RANDOM_ONLINE,
CPUHP_AP_RCUTREE_ONLINE,
+ CPUHP_AP_KTHREADS_ONLINE,
CPUHP_AP_BASE_CACHEINFO_ONLINE,
CPUHP_AP_ONLINE_DYN,
CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 40,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index b6f9ce475a4f..3394ff024a5a 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -35,6 +35,9 @@ static DEFINE_SPINLOCK(kthread_create_lock);
static LIST_HEAD(kthread_create_list);
struct task_struct *kthreadd_task;
+static LIST_HEAD(kthreads_hotplug);
+static DEFINE_MUTEX(kthreads_hotplug_lock);
+
struct kthread_create_info
{
/* Information passed to kthread() from kthreadd. */
@@ -53,6 +56,7 @@ struct kthread_create_info
struct kthread {
unsigned long flags;
unsigned int cpu;
+ unsigned int node;
int started;
int result;
int (*threadfn)(void *);
@@ -64,6 +68,8 @@ struct kthread {
#endif
/* To store the full name if task comm is truncated. */
char *full_name;
+ struct task_struct *task;
+ struct list_head hotplug_node;
};
enum KTHREAD_BITS {
@@ -122,8 +128,11 @@ bool set_kthread_struct(struct task_struct *p)
init_completion(&kthread->exited);
init_completion(&kthread->parked);
+ INIT_LIST_HEAD(&kthread->hotplug_node);
p->vfork_done = &kthread->exited;
+ kthread->task = p;
+ kthread->node = tsk_fork_get_node(current);
p->worker_private = kthread;
return true;
}
@@ -314,6 +323,11 @@ void __noreturn kthread_exit(long result)
{
struct kthread *kthread = to_kthread(current);
kthread->result = result;
+ if (!list_empty(&kthread->hotplug_node)) {
+ mutex_lock(&kthreads_hotplug_lock);
+ list_del(&kthread->hotplug_node);
+ mutex_unlock(&kthreads_hotplug_lock);
+ }
do_exit(0);
}
EXPORT_SYMBOL(kthread_exit);
@@ -339,6 +353,48 @@ void __noreturn kthread_complete_and_exit(struct completion *comp, long code)
}
EXPORT_SYMBOL(kthread_complete_and_exit);
+static void kthread_fetch_affinity(struct kthread *kthread, struct cpumask *cpumask)
+{
+ cpumask_and(cpumask, cpumask_of_node(kthread->node),
+ housekeeping_cpumask(HK_TYPE_KTHREAD));
+
+ if (cpumask_empty(cpumask))
+ cpumask_copy(cpumask, housekeeping_cpumask(HK_TYPE_KTHREAD));
+}
+
+static void kthread_affine_node(void)
+{
+ struct kthread *kthread = to_kthread(current);
+ cpumask_var_t affinity;
+
+ WARN_ON_ONCE(kthread_is_per_cpu(current));
+
+ if (kthread->node == NUMA_NO_NODE) {
+ housekeeping_affine(current, HK_TYPE_KTHREAD);
+ } else {
+ if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ mutex_lock(&kthreads_hotplug_lock);
+ WARN_ON_ONCE(!list_empty(&kthread->hotplug_node));
+ list_add_tail(&kthread->hotplug_node, &kthreads_hotplug);
+ /*
+ * The node cpumask is racy when read from kthread() but:
+ * - a racing CPU going down will either fail on the subsequent
+ * call to set_cpus_allowed_ptr() or be migrated to housekeepers
+ * afterwards by the scheduler.
+ * - a racing CPU going up will be handled by kthreads_online_cpu()
+ */
+ kthread_fetch_affinity(kthread, affinity);
+ set_cpus_allowed_ptr(current, affinity);
+ mutex_unlock(&kthreads_hotplug_lock);
+
+ free_cpumask_var(affinity);
+ }
+}
+
static int kthread(void *_create)
{
static const struct sched_param param = { .sched_priority = 0 };
@@ -369,7 +425,6 @@ static int kthread(void *_create)
* back to default in case they have been changed.
*/
sched_setscheduler_nocheck(current, SCHED_NORMAL, ¶m);
- set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KTHREAD));
/* OK, tell user we're spawned, wait for stop or wakeup */
__set_current_state(TASK_UNINTERRUPTIBLE);
@@ -385,6 +440,9 @@ static int kthread(void *_create)
self->started = 1;
+ if (!(current->flags & PF_NO_SETAFFINITY))
+ kthread_affine_node();
+
ret = -EINTR;
if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) {
cgroup_kthread_ready();
@@ -781,6 +839,52 @@ int kthreadd(void *unused)
return 0;
}
+/*
+ * Re-affine kthreads according to their preferences
+ * and the newly online CPU. The CPU down part is handled
+ * by select_fallback_rq() which default re-affines to
+ * housekeepers in case the preferred affinity doesn't
+ * apply anymore.
+ */
+static int kthreads_online_cpu(unsigned int cpu)
+{
+ cpumask_var_t affinity;
+ struct kthread *k;
+ int ret;
+
+ guard(mutex)(&kthreads_hotplug_lock);
+
+ if (list_empty(&kthreads_hotplug))
+ return 0;
+
+ if (!zalloc_cpumask_var(&affinity, GFP_KERNEL))
+ return -ENOMEM;
+
+ ret = 0;
+
+ list_for_each_entry(k, &kthreads_hotplug, hotplug_node) {
+ if (WARN_ON_ONCE((k->task->flags & PF_NO_SETAFFINITY) ||
+ kthread_is_per_cpu(k->task) ||
+ k->node == NUMA_NO_NODE)) {
+ ret = -EINVAL;
+ continue;
+ }
+ kthread_fetch_affinity(k, affinity);
+ set_cpus_allowed_ptr(k->task, affinity);
+ }
+
+ free_cpumask_var(affinity);
+
+ return ret;
+}
+
+static int kthreads_init(void)
+{
+ return cpuhp_setup_state(CPUHP_AP_KTHREADS_ONLINE, "kthreads:online",
+ kthreads_online_cpu, NULL);
+}
+early_initcall(kthreads_init);
+
void __kthread_init_worker(struct kthread_worker *worker,
const char *name,
struct lock_class_key *key)
--
2.46.0
next prev parent reply other threads:[~2024-12-11 15:41 UTC|newest]
Thread overview: 44+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-12-11 15:40 [PATCH 00/19] kthread: Introduce preferred affinity v6 Frederic Weisbecker
2024-12-11 15:40 ` [PATCH 01/19] arm/bL_switcher: Use kthread_run_on_cpu() Frederic Weisbecker
2024-12-11 15:40 ` [PATCH 02/19] firmware: stratix10-svc: " Frederic Weisbecker
2024-12-17 0:37 ` Dinh Nguyen
2024-12-11 15:40 ` [PATCH 03/19] scsi: bnx2fc: Use kthread_create_on_cpu() Frederic Weisbecker
2025-01-02 17:54 ` Martin K. Petersen
2024-12-11 15:40 ` [PATCH 04/19] scsi: bnx2i: " Frederic Weisbecker
2025-01-02 17:55 ` Martin K. Petersen
2024-12-11 15:40 ` [PATCH 05/19] scsi: qedi: " Frederic Weisbecker
2025-01-02 17:55 ` Martin K. Petersen
2024-12-11 15:40 ` [PATCH 06/19] soc/qman: test: Use kthread_run_on_cpu() Frederic Weisbecker
2024-12-13 7:27 ` LEROY Christophe
2024-12-13 11:46 ` Frederic Weisbecker
2024-12-11 15:40 ` [PATCH 07/19] kallsyms: " Frederic Weisbecker
2024-12-11 15:40 ` [PATCH 08/19] lib: test_objpool: " Frederic Weisbecker
2024-12-11 15:40 ` [PATCH 09/19] arm64: Exclude nohz_full CPUs from 32bits el0 support Frederic Weisbecker
2025-01-03 15:20 ` Will Deacon
2025-01-04 23:18 ` Frederic Weisbecker
2025-01-08 15:49 ` Will Deacon
2024-12-11 15:40 ` [PATCH 10/19] sched,arm64: Handle CPU isolation on last resort fallback rq selection Frederic Weisbecker
2025-01-03 15:27 ` Will Deacon
2025-01-04 23:22 ` Frederic Weisbecker
2025-01-08 15:50 ` Will Deacon
2024-12-11 15:40 ` [PATCH 11/19] kthread: Make sure kthread hasn't started while binding it Frederic Weisbecker
2024-12-11 15:40 ` Frederic Weisbecker [this message]
2024-12-11 15:40 ` [PATCH 13/19] mm: Create/affine kcompactd to its preferred node Frederic Weisbecker
2024-12-11 15:40 ` [PATCH 14/19] mm: Create/affine kswapd " Frederic Weisbecker
2024-12-11 15:40 ` [PATCH 15/19] kthread: Implement preferred affinity Frederic Weisbecker
2024-12-11 15:40 ` [PATCH 16/19] rcu: Use kthread preferred affinity for RCU boost Frederic Weisbecker
2024-12-11 15:40 ` [PATCH 17/19] kthread: Unify kthread_create_on_cpu() and kthread_create_worker_on_cpu() automatic format Frederic Weisbecker
2024-12-11 15:40 ` [PATCH 18/19] treewide: Introduce kthread_run_worker[_on_cpu]() Frederic Weisbecker
2024-12-11 15:40 ` [PATCH 19/19] rcu: Use kthread preferred affinity for RCU exp kworkers Frederic Weisbecker
2025-01-10 21:16 ` (subset) [PATCH 00/19] kthread: Introduce preferred affinity v6 Martin K. Petersen
-- strict thread matches above, loose matches on Subject: below --
2024-09-16 22:49 [PATCH 00/19] kthread: Introduce preferred affinity v3 Frederic Weisbecker
2024-09-16 22:49 ` [PATCH 12/19] kthread: Default affine kthread to its preferred NUMA node Frederic Weisbecker
2024-09-17 6:26 ` Michal Hocko
2024-09-17 7:01 ` Vlastimil Babka
2024-09-17 7:05 ` Michal Hocko
2024-09-17 7:14 ` Vlastimil Babka
2024-09-17 10:34 ` Frederic Weisbecker
2024-09-17 11:07 ` Michal Hocko
2024-09-18 9:37 ` Frederic Weisbecker
2024-09-18 11:17 ` Michal Hocko
2024-08-07 16:02 [PATCH 00/19] kthread: Introduce preferred affinity v2 Frederic Weisbecker
2024-08-07 16:02 ` [PATCH 12/19] kthread: Default affine kthread to its preferred NUMA node Frederic Weisbecker
2024-08-07 17:01 ` Vlastimil Babka
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20241211154035.75565-13-frederic@kernel.org \
--to=frederic@kernel.org \
--cc=akpm@linux-foundation.org \
--cc=boqun.feng@gmail.com \
--cc=joel@joelfernandes.org \
--cc=kees@kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=mhocko@kernel.org \
--cc=neeraj.upadhyay@kernel.org \
--cc=paulmck@kernel.org \
--cc=peterz@infradead.org \
--cc=qiang.zhang1211@gmail.com \
--cc=rcu@vger.kernel.org \
--cc=tglx@linutronix.de \
--cc=urezki@gmail.com \
--cc=vbabka@suse.cz \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.