From: Frederic Weisbecker <frederic@kernel.org>
To: LKML <linux-kernel@vger.kernel.org>
Cc: Frederic Weisbecker <frederic@kernel.org>,
Andrew Morton <akpm@linux-foundation.org>,
Kees Cook <kees@kernel.org>,
Peter Zijlstra <peterz@infradead.org>,
Thomas Gleixner <tglx@linutronix.de>,
Michal Hocko <mhocko@kernel.org>,
Vlastimil Babka <vbabka@suse.cz>,
linux-mm@kvack.org, "Paul E. McKenney" <paulmck@kernel.org>,
Neeraj Upadhyay <neeraj.upadhyay@kernel.org>,
Joel Fernandes <joel@joelfernandes.org>,
Boqun Feng <boqun.feng@gmail.com>,
Zqiang <qiang.zhang1211@gmail.com>,
rcu@vger.kernel.org
Subject: [PATCH 12/19] kthread: Default affine kthread to its preferred NUMA node
Date: Wed, 7 Aug 2024 18:02:18 +0200 [thread overview]
Message-ID: <20240807160228.26206-13-frederic@kernel.org> (raw)
In-Reply-To: <20240807160228.26206-1-frederic@kernel.org>
Kthreads attached to a preferred NUMA node for their task structure
allocation can also be assumed to run preferably within that same node.
A more precise affinity is usually requested by calling
kthread_create_on_cpu() or kthread_bind[_mask]() before the first wakeup.
For the others, a default affinity to the node is desired and sometimes
implemented with more or less success when it comes to dealing with hotplug
events and nohz_full / CPU Isolation interactions:
- kcompactd is affine to its node and handles hotplug but not CPU Isolation
- kswapd is affine to its node and ignores hotplug and CPU Isolation
- A bunch of drivers create their kthreads on a specific node and
don't take care of affining them further.
Handle that default node affinity preference at the generic level
instead, provided a kthread is created on an actual node and doesn't
apply any specific affinity such as a given CPU or a custom cpumask to
bind to before its first wake-up.
This generic handling is aware of CPU hotplug events and CPU isolation
such that:
* When a housekeeping CPU goes up and is part of the node of a given
kthread, it is added to its applied affinity set (and the default
last-resort online housekeeping set is possibly removed from that
affinity set).
* When a housekeeping CPU goes down while it was part of the node of a
kthread, it is removed from the kthread's applied
affinity. The last resort is to affine the kthread to all online
housekeeping CPUs.
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
include/linux/cpuhotplug.h | 1 +
kernel/kthread.c | 120 ++++++++++++++++++++++++++++++++++++-
2 files changed, 120 insertions(+), 1 deletion(-)
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 9316c39260e0..89d852538b72 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -240,6 +240,7 @@ enum cpuhp_state {
CPUHP_AP_WORKQUEUE_ONLINE,
CPUHP_AP_RANDOM_ONLINE,
CPUHP_AP_RCUTREE_ONLINE,
+ CPUHP_AP_KTHREADS_ONLINE,
CPUHP_AP_BASE_CACHEINFO_ONLINE,
CPUHP_AP_ONLINE_DYN,
CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 40,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index ecb719f54f7a..eee5925e7725 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -35,6 +35,10 @@ static DEFINE_SPINLOCK(kthread_create_lock);
static LIST_HEAD(kthread_create_list);
struct task_struct *kthreadd_task;
+static struct cpumask kthread_online_mask;
+static LIST_HEAD(kthreads_hotplug);
+static DEFINE_MUTEX(kthreads_hotplug_lock);
+
struct kthread_create_info
{
/* Information passed to kthread() from kthreadd. */
@@ -53,6 +57,7 @@ struct kthread_create_info
struct kthread {
unsigned long flags;
unsigned int cpu;
+ unsigned int node;
int started;
int result;
int (*threadfn)(void *);
@@ -64,6 +69,8 @@ struct kthread {
#endif
/* To store the full name if task comm is truncated. */
char *full_name;
+ struct task_struct *task;
+ struct list_head hotplug_node;
};
enum KTHREAD_BITS {
@@ -122,8 +129,11 @@ bool set_kthread_struct(struct task_struct *p)
init_completion(&kthread->exited);
init_completion(&kthread->parked);
+ INIT_LIST_HEAD(&kthread->hotplug_node);
p->vfork_done = &kthread->exited;
+ kthread->task = p;
+ kthread->node = tsk_fork_get_node(current);
p->worker_private = kthread;
return true;
}
@@ -314,6 +324,13 @@ void __noreturn kthread_exit(long result)
{
struct kthread *kthread = to_kthread(current);
kthread->result = result;
+ if (!list_empty(&kthread->hotplug_node)) {
+ mutex_lock(&kthreads_hotplug_lock);
+ list_del(&kthread->hotplug_node);
+ /* Make sure the kthread never gets re-affined globally */
+ set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KTHREAD));
+ mutex_unlock(&kthreads_hotplug_lock);
+ }
do_exit(0);
}
EXPORT_SYMBOL(kthread_exit);
@@ -339,6 +356,45 @@ void __noreturn kthread_complete_and_exit(struct completion *comp, long code)
}
EXPORT_SYMBOL(kthread_complete_and_exit);
+static void kthread_fetch_affinity(struct kthread *k, struct cpumask *mask)
+{
+ if (k->node == NUMA_NO_NODE) {
+ cpumask_copy(mask, housekeeping_cpumask(HK_TYPE_KTHREAD));
+ } else {
+ /*
+ * The node cpumask is racy when read from kthread() but:
+ * - a racing CPU going down won't be present in kthread_online_mask
+ * - a racing CPU going up will be handled by kthreads_online_cpu()
+ */
+ cpumask_and(mask, cpumask_of_node(k->node), &kthread_online_mask);
+ cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_KTHREAD));
+ if (cpumask_empty(mask))
+ cpumask_copy(mask, housekeeping_cpumask(HK_TYPE_KTHREAD));
+ }
+}
+
+static int kthread_affine_node(void)
+{
+ struct kthread *kthread = to_kthread(current);
+ cpumask_var_t affinity;
+
+ WARN_ON_ONCE(kthread_is_per_cpu(current));
+
+ if (!zalloc_cpumask_var(&affinity, GFP_KERNEL))
+ return -ENOMEM;
+
+ mutex_lock(&kthreads_hotplug_lock);
+ WARN_ON_ONCE(!list_empty(&kthread->hotplug_node));
+ list_add_tail(&kthread->hotplug_node, &kthreads_hotplug);
+ kthread_fetch_affinity(kthread, affinity);
+ set_cpus_allowed_ptr(current, affinity);
+ mutex_unlock(&kthreads_hotplug_lock);
+
+ free_cpumask_var(affinity);
+
+ return 0;
+}
+
static int kthread(void *_create)
{
static const struct sched_param param = { .sched_priority = 0 };
@@ -369,7 +425,6 @@ static int kthread(void *_create)
* back to default in case they have been changed.
*/
sched_setscheduler_nocheck(current, SCHED_NORMAL, ¶m);
- set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KTHREAD));
/* OK, tell user we're spawned, wait for stop or wakeup */
__set_current_state(TASK_UNINTERRUPTIBLE);
@@ -385,6 +440,9 @@ static int kthread(void *_create)
self->started = 1;
+ if (!(current->flags & PF_NO_SETAFFINITY))
+ kthread_affine_node();
+
ret = -EINTR;
if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) {
cgroup_kthread_ready();
@@ -779,6 +837,66 @@ int kthreadd(void *unused)
return 0;
}
+static int kthreads_hotplug_update(void)
+{
+ cpumask_var_t affinity;
+ struct kthread *k;
+ int err;
+
+ if (list_empty(&kthreads_hotplug))
+ return 0;
+
+ if (!zalloc_cpumask_var(&affinity, GFP_KERNEL))
+ return -ENOMEM;
+
+ err = 0;
+
+ list_for_each_entry(k, &kthreads_hotplug, hotplug_node) {
+ if (WARN_ON_ONCE((k->task->flags & PF_NO_SETAFFINITY) ||
+ kthread_is_per_cpu(k->task))) {
+ err = -EINVAL;
+ continue;
+ }
+ kthread_fetch_affinity(k, affinity);
+ set_cpus_allowed_ptr(k->task, affinity);
+ }
+
+ free_cpumask_var(affinity);
+
+ return err;
+}
+
+static int kthreads_offline_cpu(unsigned int cpu)
+{
+ int ret = 0;
+
+ mutex_lock(&kthreads_hotplug_lock);
+ cpumask_clear_cpu(cpu, &kthread_online_mask);
+ ret = kthreads_hotplug_update();
+ mutex_unlock(&kthreads_hotplug_lock);
+
+ return ret;
+}
+
+static int kthreads_online_cpu(unsigned int cpu)
+{
+ int ret = 0;
+
+ mutex_lock(&kthreads_hotplug_lock);
+ cpumask_set_cpu(cpu, &kthread_online_mask);
+ ret = kthreads_hotplug_update();
+ mutex_unlock(&kthreads_hotplug_lock);
+
+ return ret;
+}
+
+static int kthreads_init(void)
+{
+ return cpuhp_setup_state(CPUHP_AP_KTHREADS_ONLINE, "kthreads:online",
+ kthreads_online_cpu, kthreads_offline_cpu);
+}
+early_initcall(kthreads_init);
+
void __kthread_init_worker(struct kthread_worker *worker,
const char *name,
struct lock_class_key *key)
--
2.45.2
next prev parent reply other threads:[~2024-08-07 16:03 UTC|newest]
Thread overview: 37+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-08-07 16:02 [PATCH 00/19] kthread: Introduce preferred affinity v2 Frederic Weisbecker
2024-08-07 16:02 ` [PATCH 01/19] arm/bL_switcher: Use kthread_run_on_cpu() Frederic Weisbecker
2024-08-07 16:02 ` [PATCH 02/19] x86/resctrl: " Frederic Weisbecker
2024-12-02 15:19 ` [tip: x86/cache] " tip-bot2 for Frederic Weisbecker
2024-12-13 21:02 ` tip-bot2 for Frederic Weisbecker
2024-08-07 16:02 ` [PATCH 03/19] firmware: stratix10-svc: " Frederic Weisbecker
2024-08-07 16:02 ` [PATCH 04/19] scsi: bnx2fc: Use kthread_create_on_cpu() Frederic Weisbecker
2024-08-07 16:02 ` [PATCH 05/19] scsi: bnx2i: " Frederic Weisbecker
2024-08-07 16:02 ` [PATCH 06/19] scsi: qedi: " Frederic Weisbecker
2024-08-07 16:02 ` [PATCH 07/19] soc/qman: test: Use kthread_run_on_cpu() Frederic Weisbecker
2024-08-07 16:02 ` Frederic Weisbecker
2024-08-07 16:02 ` [PATCH 08/19] kallsyms: " Frederic Weisbecker
2024-08-07 16:02 ` [PATCH 09/19] lib: test_objpool: " Frederic Weisbecker
2024-08-14 2:53 ` wuqiang.matt
2024-08-07 16:02 ` [PATCH 10/19] net: pktgen: Use kthread_create_on_node() Frederic Weisbecker
2024-08-07 16:02 ` [PATCH 11/19] kthread: Make sure kthread hasn't started while binding it Frederic Weisbecker
2024-08-07 16:02 ` Frederic Weisbecker [this message]
2024-08-07 17:01 ` [PATCH 12/19] kthread: Default affine kthread to its preferred NUMA node Vlastimil Babka
2024-08-07 16:02 ` [PATCH 13/19] mm: Create/affine kcompactd to its preferred node Frederic Weisbecker
2024-08-07 17:02 ` Vlastimil Babka
2024-08-07 16:02 ` [PATCH 14/19] mm: Create/affine kswapd " Frederic Weisbecker
2024-08-07 17:05 ` Vlastimil Babka
2024-08-07 16:02 ` [PATCH 15/19] kthread: Implement preferred affinity Frederic Weisbecker
2024-08-07 16:02 ` [PATCH 16/19] rcu: Use kthread preferred affinity for RCU boost Frederic Weisbecker
2024-08-07 16:02 ` [PATCH 17/19] kthread: Unify kthread_create_on_cpu() and kthread_create_worker_on_cpu() automatic format Frederic Weisbecker
2024-08-07 16:02 ` [PATCH 18/19] treewide: Introduce kthread_run_worker[_on_cpu]() Frederic Weisbecker
2024-08-07 16:02 ` [PATCH 19/19] rcu: Use kthread preferred affinity for RCU exp kworkers Frederic Weisbecker
-- strict thread matches above, loose matches on Subject: below --
2024-09-16 22:49 [PATCH 00/19] kthread: Introduce preferred affinity v3 Frederic Weisbecker
2024-09-16 22:49 ` [PATCH 12/19] kthread: Default affine kthread to its preferred NUMA node Frederic Weisbecker
2024-09-17 6:26 ` Michal Hocko
2024-09-17 7:01 ` Vlastimil Babka
2024-09-17 7:05 ` Michal Hocko
2024-09-17 7:14 ` Vlastimil Babka
2024-09-17 10:34 ` Frederic Weisbecker
2024-09-17 11:07 ` Michal Hocko
2024-09-18 9:37 ` Frederic Weisbecker
2024-09-18 11:17 ` Michal Hocko
2024-12-11 15:40 [PATCH 00/19] kthread: Introduce preferred affinity v6 Frederic Weisbecker
2024-12-11 15:40 ` [PATCH 12/19] kthread: Default affine kthread to its preferred NUMA node Frederic Weisbecker
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240807160228.26206-13-frederic@kernel.org \
--to=frederic@kernel.org \
--cc=akpm@linux-foundation.org \
--cc=boqun.feng@gmail.com \
--cc=joel@joelfernandes.org \
--cc=kees@kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=mhocko@kernel.org \
--cc=neeraj.upadhyay@kernel.org \
--cc=paulmck@kernel.org \
--cc=peterz@infradead.org \
--cc=qiang.zhang1211@gmail.com \
--cc=rcu@vger.kernel.org \
--cc=tglx@linutronix.de \
--cc=vbabka@suse.cz \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.