From: Frederic Weisbecker <frederic@kernel.org>
To: LKML <linux-kernel@vger.kernel.org>
Cc: Frederic Weisbecker <frederic@kernel.org>,
Anna-Maria Behnsen <anna-maria@linutronix.de>,
Sehee Jeong <sehee1.jeong@samsung.com>,
Thomas Gleixner <tglx@linutronix.de>
Subject: [PATCH 4/6] timers/migration: Split per-capacity hierarchies
Date: Thu, 23 Apr 2026 18:53:52 +0200 [thread overview]
Message-ID: <20260423165354.95152-5-frederic@kernel.org> (raw)
In-Reply-To: <20260423165354.95152-1-frederic@kernel.org>
Systems with heterogeneous CPU capacities, such as big.LITTLE, have
reported power issues since the introduction of the new timer migration
code.
Timers migrate from small capacity CPUs to big ones, degrading their
target residency and thus overall power consumption.
Solve this with splitting hierarchies per CPU capacity. For example in
a big.LITTLE machine, split a single hierarchy in two: one for big
capacity CPUs and another one for small capacity CPUs. This way global
timers only migrate across CPUs of the same capacity.
For the sake of simplicity, split hierarchies keep the same number of
possible levels as if there were a single hierarchy, even though the
CPUs are distributed between multiple hierarchies. This could be a
problem on NUMA systems with heterogeneous CPU capacities (should such
systems ever exist) where useless intermediate nodes may be created.
Solving this properly would require knowing in advance, at boot time,
how many capacities are available and the number of CPUs for each of them.
Reported-by: Sehee Jeong <sehee1.jeong@samsung.com>
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
kernel/time/timer_migration.c | 125 +++++++++++++++++++++++++---------
kernel/time/timer_migration.h | 7 ++
2 files changed, 101 insertions(+), 31 deletions(-)
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index d0de9f64528e..0a8c893353a2 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -417,7 +417,7 @@
static DEFINE_MUTEX(tmigr_mutex);
-static struct tmigr_hierarchy *hierarchy;
+static LIST_HEAD(tmigr_hierarchy_list);
static unsigned int tmigr_hierarchy_levels __read_mostly;
static unsigned int tmigr_crossnode_level __read_mostly;
@@ -1893,6 +1893,12 @@ static int tmigr_setup_groups(struct tmigr_hierarchy *hier, unsigned int cpu,
data.childmask = start->groupmask;
__walk_groups_from(tmigr_active_up, &data, start, start->parent);
}
+ } else if (start) {
+ union tmigr_state state;
+
+ /* Remote activation assumes the whole target's hierarchy is inactive */
+ state.state = atomic_read(&start->migr_state);
+ WARN_ON_ONCE(state.active);
}
/* Root update */
@@ -1912,36 +1918,80 @@ static int tmigr_setup_groups(struct tmigr_hierarchy *hier, unsigned int cpu,
return err;
}
-static struct tmigr_hierarchy *tmigr_get_hierarchy(void)
+static struct tmigr_hierarchy *tmigr_get_hierarchy(unsigned int capacity)
{
- if (hierarchy)
- return hierarchy;
+ struct tmigr_hierarchy *hier = NULL, *iter;
- hierarchy = kzalloc(sizeof(*hierarchy), GFP_KERNEL);
- if (!hierarchy)
+ list_for_each_entry(iter, &tmigr_hierarchy_list, node) {
+ if (iter->capacity == capacity)
+ hier = iter;
+ }
+
+ if (hier)
+ return hier;
+
+ hier = kzalloc(sizeof(*hier), GFP_KERNEL);
+ if (!hier)
return ERR_PTR(-ENOMEM);
- hierarchy->cpumask = kzalloc(cpumask_size(), GFP_KERNEL);
- if (!hierarchy->cpumask)
+ hier->cpumask = kzalloc(cpumask_size(), GFP_KERNEL);
+ if (!hier->cpumask)
goto err;
- hierarchy->level_list = kzalloc_objs(struct list_head,
- tmigr_hierarchy_levels);
- if (!hierarchy->level_list)
+ hier->level_list = kzalloc_objs(struct list_head,
+ tmigr_hierarchy_levels);
+ if (!hier->level_list)
goto err;
for (int i = 0; i < tmigr_hierarchy_levels; i++)
- INIT_LIST_HEAD(&hierarchy->level_list[i]);
+ INIT_LIST_HEAD(&hier->level_list[i]);
- return hierarchy;
+ hier->capacity = capacity;
+ list_add_tail(&hier->node, &tmigr_hierarchy_list);
+
+ return hier;
err:
- kfree(hierarchy->cpumask);
- kfree(hierarchy);
- hierarchy = NULL;
+ kfree(hier->cpumask);
+ kfree(hier);
return ERR_PTR(-ENOMEM);
}
+static int tmigr_connect_old_root(struct tmigr_hierarchy *hier, int cpu,
+ struct tmigr_group *old_root, bool activate)
+{
+ /*
+ * The target CPU must never do the prepare work, except
+ * on early boot when the boot CPU is the target. Otherwise
+ * it may spuriously activate the old top level group inside
+ * the new one (regardless of whether the old top level group is
+ * active or not) and/or release an uninitialized childmask.
+ */
+ WARN_ON_ONCE(cpu == smp_processor_id());
+ if (activate) {
+ /*
+ * The current CPU is expected to be online in the hierarchy,
+ * otherwise the old root may not be active as expected.
+ */
+ WARN_ON_ONCE(!__this_cpu_read(tmigr_cpu.available));
+ }
+
+ return tmigr_setup_groups(hier, -1, old_root->numa_node, old_root, activate);
+}
+
+static long connect_old_root_work(void *arg)
+{
+ struct tmigr_group *old_root = arg;
+ struct tmigr_hierarchy *hier;
+ int cpu = smp_processor_id();
+
+ hier = tmigr_get_hierarchy(arch_scale_cpu_capacity(cpu));
+ if (IS_ERR(hier))
+ return PTR_ERR(hier);
+
+ return tmigr_connect_old_root(hier, cpu, old_root, true);
+}
+
static int tmigr_add_cpu(unsigned int cpu)
{
struct tmigr_hierarchy *hier;
@@ -1951,7 +2001,7 @@ static int tmigr_add_cpu(unsigned int cpu)
guard(mutex)(&tmigr_mutex);
- hier = tmigr_get_hierarchy();
+ hier = tmigr_get_hierarchy(arch_scale_cpu_capacity(cpu));
if (IS_ERR(hier))
return PTR_ERR(hier);
@@ -1964,20 +2014,33 @@ static int tmigr_add_cpu(unsigned int cpu)
/* Root has changed? Connect the old one to the new */
if (old_root && old_root != hier->root) {
- /*
- * The target CPU must never do the prepare work, except
- * on early boot when the boot CPU is the target. Otherwise
- * it may spuriously activate the old top level group inside
- * the new one (nevertheless whether old top level group is
- * active or not) and/or release an uninitialized childmask.
- */
- WARN_ON_ONCE(cpu == raw_smp_processor_id());
- /*
- * The (likely) current CPU is expected to be online in the hierarchy,
- * otherwise the old root may not be active as expected.
- */
- WARN_ON_ONCE(!per_cpu_ptr(&tmigr_cpu, raw_smp_processor_id())->available);
- ret = tmigr_setup_groups(hier, -1, old_root->numa_node, old_root, true);
+ guard(migrate)();
+
+ if (cpumask_test_cpu(smp_processor_id(), hier->cpumask)) {
+ /*
+ * If the target belongs to the same hierarchy, the old root is expected
+ * to be active. Link and propagate to the new root.
+ */
+ ret = tmigr_connect_old_root(hier, cpu, old_root, true);
+ } else {
+ int target = cpumask_first_and(hier->cpumask, tmigr_available_cpumask);
+
+ if (target < nr_cpu_ids) {
+ /*
+ * If the target doesn't belong to the same hierarchy as the current
+ * CPU, activate from a relevant one to make sure the old root is
+ * active.
+ */
+ ret = work_on_cpu(target, connect_old_root_work, old_root);
+ } else {
+ /*
+ * No other available CPUs in the remote hierarchy. Link the
+ * old root remotely but don't propagate activation since the
+ * old root is not expected to be active.
+ */
+ ret = tmigr_connect_old_root(hier, cpu, old_root, false);
+ }
+ }
}
if (ret >= 0)
diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h
index 0cfbb8d799a6..291bfb6adfc3 100644
--- a/kernel/time/timer_migration.h
+++ b/kernel/time/timer_migration.h
@@ -7,14 +7,21 @@
/**
* struct tmigr_hierarchy - a hierarchy associated to a given CPU capacity.
+ * Homogeneous systems have only one hierarchy.
+ * Heterogeneous systems have one hierarchy per CPU capacity.
* @level_list: Per level lists of tmigr groups
* @cpumask: CPUs belonging to this hierarchy
* @root: The current root of the hierarchy
+ * @capacity: CPU capacity associated to this hierarchy
+ * @node: Node in the global hierarchy list
*/
struct tmigr_hierarchy {
struct list_head *level_list;
struct cpumask *cpumask;
struct tmigr_group *root;
+ unsigned long capacity;
+ struct list_head node;
+
};
/**
--
2.53.0
next prev parent reply other threads:[~2026-04-23 16:54 UTC|newest]
Thread overview: 7+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-23 16:53 [PATCH 0/6] timers/migration: Handle heterogenous CPU capacities Frederic Weisbecker
2026-04-23 16:53 ` [PATCH 1/6] timers/migration: Fix another hotplug activation race Frederic Weisbecker
2026-04-23 16:53 ` [PATCH 2/6] timers/migration: Abstract out hierarchy to prepare for CPU capacity awareness Frederic Weisbecker
2026-04-23 16:53 ` [PATCH 3/6] timers/migration: Track CPUs in a hierarchy Frederic Weisbecker
2026-04-23 16:53 ` Frederic Weisbecker [this message]
2026-04-23 16:53 ` [PATCH 5/6] timers/migration: Handle capacity in connect tracepoints Frederic Weisbecker
2026-04-23 16:53 ` [PATCH 6/6] scripts/timers: Add timer_migration_tree.py Frederic Weisbecker
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260423165354.95152-5-frederic@kernel.org \
--to=frederic@kernel.org \
--cc=anna-maria@linutronix.de \
--cc=linux-kernel@vger.kernel.org \
--cc=sehee1.jeong@samsung.com \
--cc=tglx@linutronix.de \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox