* [PATCH 1/6] timers/migration: Revert per CPU capacity hierarchy
2026-06-25 16:41 [RFT][DONOTMERGE][PATCH 0/6] timers/migration: Prioritize lower capacity CPUs as migrators Frederic Weisbecker
@ 2026-06-25 16:41 ` Frederic Weisbecker
2026-06-25 16:41 ` [PATCH 2/6] timers/migration: Defer initialization after capacity topology is setup Frederic Weisbecker
` (4 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: Frederic Weisbecker @ 2026-06-25 16:41 UTC (permalink / raw)
To: Christian Loehle
Cc: LKML, Frederic Weisbecker, Anna-Maria Behnsen, Sehee Jeong,
Thomas Gleixner, Peter Zijlstra
The per-CPU-capacity hierarchies may not be the right answer to the
problem of timers migrating too often to high capacity CPUs. Revert that in
favour of later code that will gather groups per capacity under a common
root and prefer lower capacity groups as migrators.
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
include/trace/events/timer_migration.h | 24 +--
kernel/time/timer_migration.c | 241 ++++++-------------------
kernel/time/timer_migration.h | 18 --
scripts/timer_migration_tree.py | 50 ++---
4 files changed, 87 insertions(+), 246 deletions(-)
diff --git a/include/trace/events/timer_migration.h b/include/trace/events/timer_migration.h
index 0b135e9301b1..61171b13c687 100644
--- a/include/trace/events/timer_migration.h
+++ b/include/trace/events/timer_migration.h
@@ -33,16 +33,15 @@ TRACE_EVENT(tmigr_group_set,
TRACE_EVENT(tmigr_connect_child_parent,
- TP_PROTO(struct tmigr_hierarchy *hier, struct tmigr_group *child),
+ TP_PROTO(struct tmigr_group *child),
- TP_ARGS(hier, child),
+ TP_ARGS(child),
TP_STRUCT__entry(
__field( void *, child )
__field( void *, parent )
__field( unsigned int, lvl )
__field( unsigned int, numa_node )
- __field( unsigned int, capacity )
__field( unsigned int, num_children )
__field( u32, groupmask )
),
@@ -52,28 +51,26 @@ TRACE_EVENT(tmigr_connect_child_parent,
__entry->parent = child->parent;
__entry->lvl = child->parent->level;
__entry->numa_node = child->parent->numa_node;
- __entry->capacity = hier->capacity;
__entry->num_children = child->parent->num_children;
__entry->groupmask = child->groupmask;
),
- TP_printk("group=%p groupmask=%0x parent=%p lvl=%d numa=%d capacity=%d num_children=%d",
- __entry->child, __entry->groupmask, __entry->parent, __entry->lvl,
- __entry->numa_node, __entry->capacity, __entry->num_children)
+ TP_printk("group=%p groupmask=%0x parent=%p lvl=%d numa=%d num_children=%d",
+ __entry->child, __entry->groupmask, __entry->parent,
+ __entry->lvl, __entry->numa_node, __entry->num_children)
);
TRACE_EVENT(tmigr_connect_cpu_parent,
- TP_PROTO(struct tmigr_hierarchy *hier, struct tmigr_cpu *tmc),
+ TP_PROTO(struct tmigr_cpu *tmc),
- TP_ARGS(hier, tmc),
+ TP_ARGS(tmc),
TP_STRUCT__entry(
__field( void *, parent )
__field( unsigned int, cpu )
__field( unsigned int, lvl )
__field( unsigned int, numa_node )
- __field( unsigned int, capacity )
__field( unsigned int, num_children )
__field( u32, groupmask )
),
@@ -83,14 +80,13 @@ TRACE_EVENT(tmigr_connect_cpu_parent,
__entry->cpu = tmc->cpuevt.cpu;
__entry->lvl = tmc->tmgroup->level;
__entry->numa_node = tmc->tmgroup->numa_node;
- __entry->capacity = hier->capacity;
__entry->num_children = tmc->tmgroup->num_children;
__entry->groupmask = tmc->groupmask;
),
- TP_printk("cpu=%d groupmask=%0x parent=%p lvl=%d numa=%d capacity=%d num_children=%d",
- __entry->cpu, __entry->groupmask, __entry->parent, __entry->lvl,
- __entry->numa_node, __entry->capacity, __entry->num_children)
+ TP_printk("cpu=%d groupmask=%0x parent=%p lvl=%d numa=%d num_children=%d",
+ __entry->cpu, __entry->groupmask, __entry->parent,
+ __entry->lvl, __entry->numa_node, __entry->num_children)
);
DECLARE_EVENT_CLASS(tmigr_group_and_cpu,
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index 806c23cf71fc..52c15affdbff 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -102,7 +102,7 @@
* active CPU/group information atomic_try_cmpxchg() is used instead and only
* the per CPU tmigr_cpu->lock is held.
*
- * During the setup of groups, hier->level_list is required. It is protected by
+ * During the setup of groups tmigr_level_list is required. It is protected by
* @tmigr_mutex.
*
* When @timer_base->lock as well as tmigr related locks are required, the lock
@@ -416,12 +416,13 @@
*/
static DEFINE_MUTEX(tmigr_mutex);
-
-static LIST_HEAD(tmigr_hierarchy_list);
+static struct list_head *tmigr_level_list __read_mostly;
static unsigned int tmigr_hierarchy_levels __read_mostly;
static unsigned int tmigr_crossnode_level __read_mostly;
+static struct tmigr_group *tmigr_root;
+
static DEFINE_PER_CPU(struct tmigr_cpu, tmigr_cpu);
/*
@@ -1468,34 +1469,6 @@ static long tmigr_trigger_active(void *unused)
return 0;
}
-static unsigned int tmigr_get_capacity(int cpu)
-{
- /*
- * nohz_full CPUs need to make sure there is always an available (online)
- * and never idle migrator to handle all their global timers. That duty
- * is served by the timekeeper which then never stops its tick. But the
- * timekeeper must then belong to the same hierarchy as all the nohz_full
- * CPUs. Simply turn off capacity awareness when nohz_full is running.
- */
- if (tick_nohz_full_enabled() || !IS_ENABLED(CONFIG_BROKEN))
- return SCHED_CAPACITY_SCALE;
- else
- return arch_scale_cpu_capacity(cpu);
-}
-
-static struct tmigr_hierarchy *__tmigr_get_hierarchy(int cpu)
-{
- unsigned int capacity = tmigr_get_capacity(cpu);
- struct tmigr_hierarchy *iter;
-
- list_for_each_entry(iter, &tmigr_hierarchy_list, node) {
- if (iter->capacity == capacity)
- return iter;
- }
-
- return NULL;
-}
-
static int tmigr_clear_cpu_available(unsigned int cpu)
{
struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
@@ -1520,21 +1493,8 @@ static int tmigr_clear_cpu_available(unsigned int cpu)
}
if (firstexp != KTIME_MAX) {
- struct tmigr_hierarchy *hier = __tmigr_get_hierarchy(cpu);
-
- if (WARN_ON_ONCE(!hier))
- return -EINVAL;
-
- migrator = cpumask_any_and(tmigr_available_cpumask, hier->cpumask);
- if (migrator < nr_cpu_ids) {
- work_on_cpu(migrator, tmigr_trigger_active, NULL);
- } else {
- /*
- * If deactivation returned an expiration, it belongs to an available
- * nohz CPU in the hierarchy.
- */
- WARN_ONCE(1, "Expected available CPU in the hierarchy\n");
- }
+ migrator = cpumask_any(tmigr_available_cpumask);
+ work_on_cpu(migrator, tmigr_trigger_active, NULL);
}
return 0;
@@ -1697,14 +1657,14 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
group->groupevt.ignore = true;
}
-static struct tmigr_group *tmigr_get_group(struct tmigr_hierarchy *hier, int node, unsigned int lvl)
+static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl)
{
struct tmigr_group *tmp, *group = NULL;
lockdep_assert_held(&tmigr_mutex);
/* Try to attach to an existing group first */
- list_for_each_entry(tmp, &hier->level_list[lvl], list) {
+ list_for_each_entry(tmp, &tmigr_level_list[lvl], list) {
/*
* If @lvl is below the cross NUMA node level, check whether
* this group belongs to the same NUMA node.
@@ -1738,14 +1698,14 @@ static struct tmigr_group *tmigr_get_group(struct tmigr_hierarchy *hier, int nod
tmigr_init_group(group, lvl, node);
/* Setup successful. Add it to the hierarchy */
- list_add(&group->list, &hier->level_list[lvl]);
+ list_add(&group->list, &tmigr_level_list[lvl]);
trace_tmigr_group_set(group);
return group;
}
-static bool tmigr_init_root(struct tmigr_hierarchy *hier, struct tmigr_group *group, bool activate)
+static bool tmigr_init_root(struct tmigr_group *group, bool activate)
{
- if (!group->parent && group != hier->root) {
+ if (!group->parent && group != tmigr_root) {
/*
* This is the new top-level, prepare its groupmask in advance
* to avoid accidents where yet another new top-level is
@@ -1761,10 +1721,11 @@ static bool tmigr_init_root(struct tmigr_hierarchy *hier, struct tmigr_group *gr
}
-static void tmigr_connect_child_parent(struct tmigr_hierarchy *hier, struct tmigr_group *child,
- struct tmigr_group *parent, bool activate)
+static void tmigr_connect_child_parent(struct tmigr_group *child,
+ struct tmigr_group *parent,
+ bool activate)
{
- if (tmigr_init_root(hier, parent, activate)) {
+ if (tmigr_init_root(parent, activate)) {
/*
* The previous top level had prepared its groupmask already,
* simply account it in advance as the first child. If some groups
@@ -1797,13 +1758,13 @@ static void tmigr_connect_child_parent(struct tmigr_hierarchy *hier, struct tmig
*/
smp_store_release(&child->parent, parent);
- trace_tmigr_connect_child_parent(hier, child);
+ trace_tmigr_connect_child_parent(child);
}
-static int tmigr_setup_groups(struct tmigr_hierarchy *hier, unsigned int cpu,
- unsigned int node, struct tmigr_group *start, bool activate)
+static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
+ struct tmigr_group *start, bool activate)
{
- struct tmigr_group *root = hier->root, *group, *child, **stack;
+ struct tmigr_group *group, *child, **stack;
int i, top = 0, err = 0, start_lvl = 0;
bool root_mismatch = false;
@@ -1816,11 +1777,11 @@ static int tmigr_setup_groups(struct tmigr_hierarchy *hier, unsigned int cpu,
start_lvl = start->level + 1;
}
- if (root)
- root_mismatch = root->numa_node != node;
+ if (tmigr_root)
+ root_mismatch = tmigr_root->numa_node != node;
for (i = start_lvl; i < tmigr_hierarchy_levels; i++) {
- group = tmigr_get_group(hier, node, i);
+ group = tmigr_get_group(node, i);
if (IS_ERR(group)) {
err = PTR_ERR(group);
i--;
@@ -1842,7 +1803,7 @@ static int tmigr_setup_groups(struct tmigr_hierarchy *hier, unsigned int cpu,
if (group->parent)
break;
if ((!root_mismatch || i >= tmigr_crossnode_level) &&
- list_is_singular(&hier->level_list[i]))
+ list_is_singular(&tmigr_level_list[i]))
break;
}
@@ -1870,15 +1831,15 @@ static int tmigr_setup_groups(struct tmigr_hierarchy *hier, unsigned int cpu,
tmc->tmgroup = group;
tmc->groupmask = BIT(group->num_children++);
- tmigr_init_root(hier, group, activate);
+ tmigr_init_root(group, activate);
- trace_tmigr_connect_cpu_parent(hier, tmc);
+ trace_tmigr_connect_cpu_parent(tmc);
/* There are no children that need to be connected */
continue;
} else {
child = stack[i - 1];
- tmigr_connect_child_parent(hier, child, group, activate);
+ tmigr_connect_child_parent(child, group, activate);
}
}
@@ -1934,23 +1895,18 @@ static int tmigr_setup_groups(struct tmigr_hierarchy *hier, unsigned int cpu,
data.childmask = start->groupmask;
__walk_groups_from(tmigr_active_up, &data, start, start->parent);
}
- } else if (start) {
- union tmigr_state state;
-
- /* Remote activation assumes the whole target's hierarchy is inactive */
- state.state = atomic_read(&start->migr_state);
- WARN_ON_ONCE(state.active);
}
/* Root update */
- if (list_is_singular(&hier->level_list[top])) {
- group = list_first_entry(&hier->level_list[top], typeof(*group), list);
+ if (list_is_singular(&tmigr_level_list[top])) {
+ group = list_first_entry(&tmigr_level_list[top],
+ typeof(*group), list);
WARN_ON_ONCE(group->parent);
- if (root) {
+ if (tmigr_root) {
/* Old root should be the same or below */
- WARN_ON_ONCE(root->level > top);
+ WARN_ON_ONCE(tmigr_root->level > top);
}
- hier->root = group;
+ tmigr_root = group;
}
out:
kfree(stack);
@@ -1958,123 +1914,34 @@ static int tmigr_setup_groups(struct tmigr_hierarchy *hier, unsigned int cpu,
return err;
}
-static struct tmigr_hierarchy *tmigr_get_hierarchy(int cpu)
-{
- struct tmigr_hierarchy *hier;
-
- hier = __tmigr_get_hierarchy(cpu);
-
- if (hier)
- return hier;
-
- hier = kzalloc_flex(*hier, level_list, tmigr_hierarchy_levels);
- if (!hier)
- return ERR_PTR(-ENOMEM);
-
- hier->cpumask = kzalloc(cpumask_size(), GFP_KERNEL);
- if (!hier->cpumask) {
- kfree(hier);
- return ERR_PTR(-ENOMEM);
- }
-
- for (int i = 0; i < tmigr_hierarchy_levels; i++)
- INIT_LIST_HEAD(&hier->level_list[i]);
-
- hier->capacity = tmigr_get_capacity(cpu);
- list_add_tail(&hier->node, &tmigr_hierarchy_list);
-
- return hier;
-}
-
-static int tmigr_connect_old_root(struct tmigr_hierarchy *hier, int cpu,
- struct tmigr_group *old_root, bool activate)
-{
- /*
- * The target CPU must never do the prepare work, except
- * on early boot when the boot CPU is the target. Otherwise
- * it may spuriously activate the old top level group inside
- * the new one (nevertheless whether old top level group is
- * active or not) and/or release an uninitialized childmask.
- */
- WARN_ON_ONCE(cpu == smp_processor_id());
- if (activate) {
- /*
- * The current CPU is expected to be online in the hierarchy,
- * otherwise the old root may not be active as expected.
- */
- WARN_ON_ONCE(!__this_cpu_read(tmigr_cpu.available));
- }
-
- return tmigr_setup_groups(hier, -1, old_root->numa_node, old_root, activate);
-}
-
-static long connect_old_root_work(void *arg)
-{
- struct tmigr_group *old_root = arg;
- struct tmigr_hierarchy *hier;
- int cpu = smp_processor_id();
-
- hier = __tmigr_get_hierarchy(cpu);
- if (WARN_ON_ONCE(!hier))
- return -EINVAL;
-
- return tmigr_connect_old_root(hier, cpu, old_root, true);
-}
-
static int tmigr_add_cpu(unsigned int cpu)
{
- struct tmigr_hierarchy *hier;
- struct tmigr_group *old_root;
+ struct tmigr_group *old_root = tmigr_root;
int node = cpu_to_node(cpu);
int ret;
guard(mutex)(&tmigr_mutex);
- hier = tmigr_get_hierarchy(cpu);
- if (IS_ERR(hier))
- return PTR_ERR(hier);
-
- old_root = hier->root;
-
- ret = tmigr_setup_groups(hier, cpu, node, NULL, false);
-
- if (ret < 0)
- return ret;
+ ret = tmigr_setup_groups(cpu, node, NULL, false);
/* Root has changed? Connect the old one to the new */
- if (old_root && old_root != hier->root) {
- guard(migrate)();
-
- if (cpumask_test_cpu(smp_processor_id(), hier->cpumask)) {
- /*
- * If the target belong to the same hierarchy, the old root is expected
- * to be active. Link and propagate to the new root.
- */
- ret = tmigr_connect_old_root(hier, cpu, old_root, true);
- } else {
- int target = cpumask_first_and(hier->cpumask, tmigr_available_cpumask);
-
- if (target < nr_cpu_ids) {
- /*
- * If the target doesn't belong to the same hierarchy as the current
- * CPU, activate from a relevant one to make sure the old root is
- * active.
- */
- ret = work_on_cpu(target, connect_old_root_work, old_root);
- } else {
- /*
- * No other available CPUs in the remote hierarchy. Link the
- * old root remotely but don't propagate activation since the
- * old root is not expected to be active.
- */
- ret = tmigr_connect_old_root(hier, cpu, old_root, false);
- }
- }
+ if (ret >= 0 && old_root && old_root != tmigr_root) {
+ /*
+ * The target CPU must never do the prepare work, except
+ * on early boot when the boot CPU is the target. Otherwise
+ * it may spuriously activate the old top level group inside
+ * the new one (nevertheless whether old top level group is
+ * active or not) and/or release an uninitialized childmask.
+ */
+ WARN_ON_ONCE(cpu == raw_smp_processor_id());
+ /*
+ * The (likely) current CPU is expected to be online in the hierarchy,
+ * otherwise the old root may not be active as expected.
+ */
+ WARN_ON_ONCE(!per_cpu_ptr(&tmigr_cpu, raw_smp_processor_id())->available);
+ ret = tmigr_setup_groups(-1, old_root->numa_node, old_root, true);
}
- if (ret >= 0)
- cpumask_set_cpu(cpu, hier->cpumask);
-
return ret;
}
@@ -2107,7 +1974,7 @@ static int tmigr_cpu_prepare(unsigned int cpu)
static int __init tmigr_init(void)
{
- unsigned int cpulvl, nodelvl, cpus_per_node;
+ unsigned int cpulvl, nodelvl, cpus_per_node, i;
unsigned int nnodes = num_possible_nodes();
unsigned int ncpus = num_possible_cpus();
int ret = -ENOMEM;
@@ -2154,6 +2021,14 @@ static int __init tmigr_init(void)
*/
tmigr_crossnode_level = cpulvl;
+ tmigr_level_list = kzalloc_objs(struct list_head,
+ tmigr_hierarchy_levels);
+ if (!tmigr_level_list)
+ goto err;
+
+ for (i = 0; i < tmigr_hierarchy_levels; i++)
+ INIT_LIST_HEAD(&tmigr_level_list[i]);
+
pr_info("Timer migration: %d hierarchy levels; %d children per group;"
" %d crossnode level\n",
tmigr_hierarchy_levels, TMIGR_CHILDREN_PER_GROUP,
diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h
index 31735dd52327..4c0073f3d321 100644
--- a/kernel/time/timer_migration.h
+++ b/kernel/time/timer_migration.h
@@ -5,24 +5,6 @@
/* Per group capacity. Must be a power of 2! */
#define TMIGR_CHILDREN_PER_GROUP 8
-/**
- * struct tmigr_hierarchy - a hierarchy associated to a given CPU capacity.
- * Homogeneous systems have only one hierarchy.
- * Heterogenous have one hierarchy per CPU capacity.
- * @cpumask: CPUs belonging to this hierarchy
- * @root: The current root of the hierarchy
- * @capacity: CPU capacity associated to this hierarchy
- * @node: Node in the global hierarchy list
- * @level_list: Per level lists of tmigr groups
- */
-struct tmigr_hierarchy {
- struct cpumask *cpumask;
- struct tmigr_group *root;
- unsigned long capacity;
- struct list_head node;
- struct list_head level_list[];
-};
-
/**
* struct tmigr_event - a timer event associated to a CPU
* @nextevt: The node to enqueue an event in the parent group queue
diff --git a/scripts/timer_migration_tree.py b/scripts/timer_migration_tree.py
index faac9de854bd..abb321b903c4 100755
--- a/scripts/timer_migration_tree.py
+++ b/scripts/timer_migration_tree.py
@@ -12,6 +12,7 @@ import re, sys
from ete3 import Tree
class Node:
+ node_list = { }
def __init__(self, group):
self.group = group
self.children = []
@@ -45,29 +46,21 @@ class Node:
parent_grp = "-"
return "Group: %s mask: %s parent: %s lvl: %d numa: %d num_children: %d" % (self.group, self.groupmask, parent_grp, self.lvl, self.numa, self.num_children)
-hierarchies = { }
-
-def get_hierarchy(capacity):
- if capacity not in hierarchies:
- hierarchies[capacity] = {}
- return hierarchies[capacity]
-
-def get_node(capacity, group):
- hier = get_hierarchy(capacity)
- if group in hier:
- return hier[group]
+def get_node(group):
+ if group in Node.node_list:
+ return Node.node_list[group]
else:
n = Node(group)
- hier[group] = n
+ Node.node_list[group] = n
return n
def tmigr_connect_cpu_parent(ts, line):
- s = re.search("tmigr_connect_cpu_parent: cpu=([0-9]+) groupmask=([0-9a-zA-Z]+) parent=([0-9a-zA-Z]+) lvl=([0-9]+) numa=([-]?[0-9]+) capacity=([-]?[0-9]+) num_children=([0-9]+)", line)
+ s = re.search("tmigr_connect_cpu_parent: cpu=([0-9]+) groupmask=([0-9a-zA-Z]+) parent=([0-9a-zA-Z]+) lvl=([0-9]+) numa=([-]?[0-9]+) num_children=([0-9]+)", line)
if s is None:
return False
- (cpu, groupmask, parent, lvl, numa, capacity, num_children) = (int(s.group(1)), s.group(2), s.group(3), int(s.group(4)), int(s.group(5)), int(s.group(6)), int(s.group(7)))
- n = get_node(capacity, cpu)
- p = get_node(capacity, parent)
+ (cpu, groupmask, parent, lvl, numa, num_children) = (int(s.group(1)), s.group(2), s.group(3), int(s.group(4)), int(s.group(5)), int(s.group(6)))
+ n = get_node(cpu)
+ p = get_node(parent)
n.set_parent(p)
n.set_groupmask(groupmask)
n.set_lvl(-1)
@@ -78,12 +71,12 @@ def tmigr_connect_cpu_parent(ts, line):
p.add_child(n)
def tmigr_connect_child_parent(ts, line):
- s = re.search("tmigr_connect_child_parent: group=([0-9a-zA-Z]+) groupmask=([0-9a-zA-Z]+) parent=([0-9a-zA-Z]+) lvl=([0-9]+) numa=([-]?[0-9]+) capacity=([-]?[0-9]+) num_children=([0-9]+)", line)
+ s = re.search("tmigr_connect_child_parent: group=([0-9a-zA-Z]+) groupmask=([0-9a-zA-Z]+) parent=([0-9a-zA-Z]+) lvl=([0-9]+) numa=([-]?[0-9]+) num_children=([0-9]+)", line)
if s is None:
return False
- (group, groupmask, parent, lvl, numa, capacity, num_children) = (s.group(1), s.group(2), s.group(3), int(s.group(4)), int(s.group(5)), int(s.group(6)), int(s.group(7)))
- n = get_node(capacity, group)
- p = get_node(capacity, parent)
+ (group, groupmask, parent, lvl, numa, num_children) = (s.group(1), s.group(2), s.group(3), int(s.group(4)), int(s.group(5)), int(s.group(6)))
+ n = get_node(group)
+ p = get_node(parent)
n.set_parent(p)
n.set_groupmask(groupmask)
p.set_lvl(lvl)
@@ -109,14 +102,9 @@ if __name__ == "__main__":
if tmigr_connect_child_parent(float(s.group(1)), s.group(2)):
continue
- for cap in hierarchies:
- h = hierarchies[cap]
- print("Tree for capacity %d" % cap)
- for k in h:
- n = h[k]
- while n.parent != None:
- n = n.parent
- root = Tree()
- populate(root, n)
- print(root.get_ascii(show_internal=True, attributes=["name", "numa", "lvl"]))
- break
+ group = list(Node.node_list.values())[0]
+ while group.parent != None:
+ group = group.parent
+ root = Tree()
+ populate(root, group)
+ print(root.get_ascii(show_internal=True, attributes=["name", "numa", "lvl"]))
--
2.53.0
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH 2/6] timers/migration: Defer initialization after capacity topology is setup
2026-06-25 16:41 [RFT][DONOTMERGE][PATCH 0/6] timers/migration: Prioritize lower capacity CPUs as migrators Frederic Weisbecker
2026-06-25 16:41 ` [PATCH 1/6] timers/migration: Revert per CPU capacity hierarchy Frederic Weisbecker
@ 2026-06-25 16:41 ` Frederic Weisbecker
2026-06-25 16:41 ` [PATCH 3/6] sched/topology: Account asym capacities number Frederic Weisbecker
` (3 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: Frederic Weisbecker @ 2026-06-25 16:41 UTC (permalink / raw)
To: Christian Loehle
Cc: LKML, Frederic Weisbecker, Anna-Maria Behnsen, Sehee Jeong,
Thomas Gleixner, Peter Zijlstra
In order for the timer migration code to get information from the CPU
capacity list, the tree must be prepared after the scheduler topology is
initialized.
Defer this part and the hotplug callbacks to a late initcall. Two new
cases are taken care of now that the registration of the callbacks
is made after CPUs have booted:
* The prepare callback may now run on the target.
* A subsequent root level can be created even if no CPU is available in
the hierarchy because prepare callbacks are called on all CPUs before
online callbacks.
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
kernel/time/timer_migration.c | 92 +++++++++++++++++++++++++----------
1 file changed, 67 insertions(+), 25 deletions(-)
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index 52c15affdbff..4209e695ec7b 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -1631,7 +1631,6 @@ static int __init tmigr_init_isolation(void)
/* Protect against RCU torture hotplug testing */
return tmigr_isolated_exclude_cpumask(cpumask);
}
-late_initcall(tmigr_init_isolation);
static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
int node)
@@ -1703,7 +1702,7 @@ static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl)
return group;
}
-static bool tmigr_init_root(struct tmigr_group *group, bool activate)
+static bool tmigr_init_root(struct tmigr_group *group, bool root_up)
{
if (!group->parent && group != tmigr_root) {
/*
@@ -1712,7 +1711,7 @@ static bool tmigr_init_root(struct tmigr_group *group, bool activate)
* created in the future and made visible before this groupmask.
*/
group->groupmask = BIT(0);
- WARN_ON_ONCE(activate);
+ WARN_ON_ONCE(root_up);
return true;
}
@@ -1723,9 +1722,9 @@ static bool tmigr_init_root(struct tmigr_group *group, bool activate)
static void tmigr_connect_child_parent(struct tmigr_group *child,
struct tmigr_group *parent,
- bool activate)
+ bool root_up)
{
- if (tmigr_init_root(parent, activate)) {
+ if (tmigr_init_root(parent, root_up)) {
/*
* The previous top level had prepared its groupmask already,
* simply account it in advance as the first child. If some groups
@@ -1736,7 +1735,7 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
}
/* Connecting old root to new root ? */
- if (!parent->parent && activate) {
+ if (!parent->parent && root_up) {
/*
* @child is the old top, or in case of node mismatch, some
* intermediate group between the old top and the new one in
@@ -1831,7 +1830,7 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
tmc->tmgroup = group;
tmc->groupmask = BIT(group->num_children++);
- tmigr_init_root(group, activate);
+ tmigr_init_root(group, start);
trace_tmigr_connect_cpu_parent(tmc);
@@ -1839,7 +1838,7 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
continue;
} else {
child = stack[i - 1];
- tmigr_connect_child_parent(child, group, activate);
+ tmigr_connect_child_parent(child, group, start);
}
}
@@ -1894,8 +1893,14 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
if (state.active) {
data.childmask = start->groupmask;
__walk_groups_from(tmigr_active_up, &data, start, start->parent);
+ } else if (start) {
+ union tmigr_state state;
+
+ /* No available CPU so the old root should be inactive */
+ state.state = atomic_read(&start->migr_state);
+ WARN_ON_ONCE(state.active);
}
- }
+ }
/* Root update */
if (list_is_singular(&tmigr_level_list[top])) {
@@ -1914,18 +1919,9 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
return err;
}
-static int tmigr_add_cpu(unsigned int cpu)
+static int tmigr_connect_old_root(int cpu, struct tmigr_group *old_root, bool activate)
{
- struct tmigr_group *old_root = tmigr_root;
- int node = cpu_to_node(cpu);
- int ret;
-
- guard(mutex)(&tmigr_mutex);
-
- ret = tmigr_setup_groups(cpu, node, NULL, false);
-
- /* Root has changed? Connect the old one to the new */
- if (ret >= 0 && old_root && old_root != tmigr_root) {
+ if (activate) {
/*
* The target CPU must never do the prepare work, except
* on early boot when the boot CPU is the target. Otherwise
@@ -1933,13 +1929,55 @@ static int tmigr_add_cpu(unsigned int cpu)
* the new one (nevertheless whether old top level group is
* active or not) and/or release an uninitialized childmask.
*/
- WARN_ON_ONCE(cpu == raw_smp_processor_id());
+ WARN_ON_ONCE(cpu == smp_processor_id());
/*
- * The (likely) current CPU is expected to be online in the hierarchy,
+ * The current CPU is expected to be online in the hierarchy,
* otherwise the old root may not be active as expected.
*/
- WARN_ON_ONCE(!per_cpu_ptr(&tmigr_cpu, raw_smp_processor_id())->available);
- ret = tmigr_setup_groups(-1, old_root->numa_node, old_root, true);
+ WARN_ON_ONCE(!__this_cpu_read(tmigr_cpu.available));
+ }
+
+ return tmigr_setup_groups(-1, old_root->numa_node, old_root, activate);
+}
+
+static long connect_old_root_work(void *arg)
+{
+ struct tmigr_group *old_root = arg;
+
+ return tmigr_connect_old_root(smp_processor_id(), old_root, true);
+}
+
+static int tmigr_add_cpu(unsigned int cpu)
+{
+ struct tmigr_group *old_root = tmigr_root;
+ int node = cpu_to_node(cpu);
+ int ret;
+
+ guard(mutex)(&tmigr_mutex);
+
+ ret = tmigr_setup_groups(cpu, node, NULL, false);
+
+ if (ret < 0 || !old_root || old_root == tmigr_root)
+ return ret;
+
+ /* Root has changed. Connect the old one to the new */
+ guard(migrate)();
+ if (cpumask_test_cpu(smp_processor_id(), tmigr_available_cpumask)) {
+ /*
+ * If the CPU is available in the hierarchy, the old root is expected
+ * to be active. Link and propagate to the new root.
+ */
+ ret = tmigr_connect_old_root(cpu, old_root, true);
+ } else {
+ int target = cpumask_first(tmigr_available_cpumask);
+
+ if (target < nr_cpu_ids) {
+ /* Defer the connection to an available CPU to propagate activation */
+ ret = work_on_cpu(target, connect_old_root_work, old_root);
+ } else {
+ /* No CPU available yet, connect but don't activate */
+ ret = tmigr_connect_old_root(cpu, old_root, false);
+ }
}
return ret;
@@ -2044,10 +2082,14 @@ static int __init tmigr_init(void)
if (ret)
goto err;
+ ret = tmigr_init_isolation();
+ if (ret)
+ goto err;
+
return 0;
err:
pr_err("Timer migration setup failed\n");
return ret;
}
-early_initcall(tmigr_init);
+late_initcall(tmigr_init);
--
2.53.0
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH 3/6] sched/topology: Account asym capacities number
2026-06-25 16:41 [RFT][DONOTMERGE][PATCH 0/6] timers/migration: Prioritize lower capacity CPUs as migrators Frederic Weisbecker
2026-06-25 16:41 ` [PATCH 1/6] timers/migration: Revert per CPU capacity hierarchy Frederic Weisbecker
2026-06-25 16:41 ` [PATCH 2/6] timers/migration: Defer initialization after capacity topology is setup Frederic Weisbecker
@ 2026-06-25 16:41 ` Frederic Weisbecker
2026-06-25 16:41 ` [PATCH 4/6] timers/migration: Group CPUs per capacity Frederic Weisbecker
` (2 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: Frederic Weisbecker @ 2026-06-25 16:41 UTC (permalink / raw)
To: Christian Loehle
Cc: LKML, Frederic Weisbecker, Anna-Maria Behnsen, Sehee Jeong,
Thomas Gleixner, Peter Zijlstra
The timer migration code will need to know the number of capacities
in the system and the maximum number of CPUs a capacity can contain
in order to build the tree accordingly.
Prepare the relevant APIs.
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
include/linux/sched/topology.h | 2 ++
kernel/sched/topology.c | 24 ++++++++++++++++++++++++
2 files changed, 26 insertions(+)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index b5d9d7c2b8ad..88632825136e 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -50,6 +50,8 @@ extern const struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl,
extern const struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu);
extern int arch_asym_cpu_priority(int cpu);
+extern int sched_asym_count(void);
+extern int sched_asym_max_cpus(void);
struct sched_domain_attr {
int relax_domain_level;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 622e2e01974c..3b3bd32aea40 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1746,6 +1746,20 @@ static inline void asym_cpu_capacity_update_data(int cpu)
__cpumask_set_cpu(cpu, cpu_capacity_span(entry));
}
+static int asym_capacity_nr;
+
+int sched_asym_count(void)
+{
+ return asym_capacity_nr;
+}
+
+static int asym_capacity_max_cpus;
+
+int sched_asym_max_cpus(void)
+{
+ return asym_capacity_max_cpus;
+}
+
/*
* Build-up/update list of CPUs grouped by their capacities
* An update requires explicit request to rebuild sched domains
@@ -1756,6 +1770,9 @@ static void asym_cpu_capacity_scan(void)
struct asym_cap_data *entry, *next;
int cpu;
+ asym_capacity_nr = 0;
+ asym_capacity_max_cpus = 0;
+
list_for_each_entry(entry, &asym_cap_list, link)
cpumask_clear(cpu_capacity_span(entry));
@@ -1766,6 +1783,13 @@ static void asym_cpu_capacity_scan(void)
if (cpumask_empty(cpu_capacity_span(entry))) {
list_del_rcu(&entry->link);
call_rcu(&entry->rcu, free_asym_cap_entry);
+ } else {
+ int weight;
+
+ asym_capacity_nr++;
+ weight = cpumask_weight(cpu_capacity_span(entry));
+ if (weight > asym_capacity_max_cpus)
+ asym_capacity_max_cpus = weight;
}
}
--
2.53.0
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH 4/6] timers/migration: Group CPUs per capacity
2026-06-25 16:41 [RFT][DONOTMERGE][PATCH 0/6] timers/migration: Prioritize lower capacity CPUs as migrators Frederic Weisbecker
` (2 preceding siblings ...)
2026-06-25 16:41 ` [PATCH 3/6] sched/topology: Account asym capacities number Frederic Weisbecker
@ 2026-06-25 16:41 ` Frederic Weisbecker
2026-06-25 16:41 ` [PATCH 5/6] timers/migration: Prefer lower capacity groups as migrators Frederic Weisbecker
2026-06-25 16:41 ` [PATCH 6/6] scripts/timer_migration_tree.py: Dump mask of each group Frederic Weisbecker
5 siblings, 0 replies; 7+ messages in thread
From: Frederic Weisbecker @ 2026-06-25 16:41 UTC (permalink / raw)
To: Christian Loehle
Cc: LKML, Frederic Weisbecker, Anna-Maria Behnsen, Sehee Jeong,
Thomas Gleixner, Peter Zijlstra
In the same way CPUs are grouped by node, group CPUs by capacity (this
assumes that asymmetric capacity systems aren't NUMA, but reality either
is or will certainly be disappointing about this).
This way, timers have fewer chances to migrate from low to high capacity
CPUs, and related ground work is more likely to execute while consuming
less energy.
Also in order to stimulate migration from high to low capacity CPUs, a
further change will encourage low capacity global migrators.
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
include/trace/events/timer_migration.h | 32 +++----
kernel/time/timer_migration.c | 125 ++++++++++++++++---------
kernel/time/timer_migration.h | 13 +--
3 files changed, 103 insertions(+), 67 deletions(-)
diff --git a/include/trace/events/timer_migration.h b/include/trace/events/timer_migration.h
index 61171b13c687..5484056b0012 100644
--- a/include/trace/events/timer_migration.h
+++ b/include/trace/events/timer_migration.h
@@ -18,17 +18,17 @@ TRACE_EVENT(tmigr_group_set,
TP_STRUCT__entry(
__field( void *, group )
__field( unsigned int, lvl )
- __field( unsigned int, numa_node )
+ __field( unsigned int, family )
),
TP_fast_assign(
__entry->group = group;
__entry->lvl = group->level;
- __entry->numa_node = group->numa_node;
+ __entry->family = group->family;
),
- TP_printk("group=%p lvl=%d numa=%d",
- __entry->group, __entry->lvl, __entry->numa_node)
+ TP_printk("group=%p lvl=%d family=%d",
+ __entry->group, __entry->lvl, __entry->family)
);
TRACE_EVENT(tmigr_connect_child_parent,
@@ -41,7 +41,7 @@ TRACE_EVENT(tmigr_connect_child_parent,
__field( void *, child )
__field( void *, parent )
__field( unsigned int, lvl )
- __field( unsigned int, numa_node )
+ __field( unsigned int, family )
__field( unsigned int, num_children )
__field( u32, groupmask )
),
@@ -50,14 +50,14 @@ TRACE_EVENT(tmigr_connect_child_parent,
__entry->child = child;
__entry->parent = child->parent;
__entry->lvl = child->parent->level;
- __entry->numa_node = child->parent->numa_node;
+ __entry->family = child->parent->family;
__entry->num_children = child->parent->num_children;
__entry->groupmask = child->groupmask;
),
- TP_printk("group=%p groupmask=%0x parent=%p lvl=%d numa=%d num_children=%d",
+ TP_printk("group=%p groupmask=%0x parent=%p lvl=%d family=%d num_children=%d",
__entry->child, __entry->groupmask, __entry->parent,
- __entry->lvl, __entry->numa_node, __entry->num_children)
+ __entry->lvl, __entry->family, __entry->num_children)
);
TRACE_EVENT(tmigr_connect_cpu_parent,
@@ -70,7 +70,7 @@ TRACE_EVENT(tmigr_connect_cpu_parent,
__field( void *, parent )
__field( unsigned int, cpu )
__field( unsigned int, lvl )
- __field( unsigned int, numa_node )
+ __field( unsigned int, family )
__field( unsigned int, num_children )
__field( u32, groupmask )
),
@@ -79,14 +79,14 @@ TRACE_EVENT(tmigr_connect_cpu_parent,
__entry->parent = tmc->tmgroup;
__entry->cpu = tmc->cpuevt.cpu;
__entry->lvl = tmc->tmgroup->level;
- __entry->numa_node = tmc->tmgroup->numa_node;
+ __entry->family = tmc->tmgroup->family;
__entry->num_children = tmc->tmgroup->num_children;
__entry->groupmask = tmc->groupmask;
),
- TP_printk("cpu=%d groupmask=%0x parent=%p lvl=%d numa=%d num_children=%d",
+ TP_printk("cpu=%d groupmask=%0x parent=%p lvl=%d family=%d num_children=%d",
__entry->cpu, __entry->groupmask, __entry->parent,
- __entry->lvl, __entry->numa_node, __entry->num_children)
+ __entry->lvl, __entry->family, __entry->num_children)
);
DECLARE_EVENT_CLASS(tmigr_group_and_cpu,
@@ -99,7 +99,7 @@ DECLARE_EVENT_CLASS(tmigr_group_and_cpu,
__field( void *, group )
__field( void *, parent )
__field( unsigned int, lvl )
- __field( unsigned int, numa_node )
+ __field( unsigned int, family )
__field( u32, childmask )
__field( u8, active )
__field( u8, migrator )
@@ -109,15 +109,15 @@ DECLARE_EVENT_CLASS(tmigr_group_and_cpu,
__entry->group = group;
__entry->parent = group->parent;
__entry->lvl = group->level;
- __entry->numa_node = group->numa_node;
+ __entry->family = group->family;
__entry->childmask = childmask;
__entry->active = state.active;
__entry->migrator = state.migrator;
),
- TP_printk("group=%p lvl=%d numa=%d active=%0x migrator=%0x "
+ TP_printk("group=%p lvl=%d family=%d active=%0x migrator=%0x "
"parent=%p childmask=%0x",
- __entry->group, __entry->lvl, __entry->numa_node,
+ __entry->group, __entry->lvl, __entry->family,
__entry->active, __entry->migrator,
__entry->parent, __entry->childmask)
);
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index 4209e695ec7b..2c2925046f43 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -21,10 +21,10 @@
/*
* The timer migration mechanism is built on a hierarchy of groups. The
* lowest level group contains CPUs, the next level groups of CPU groups
- * and so forth. The CPU groups are kept per node so for the normal case
- * lock contention won't happen across nodes. Depending on the number of
- * CPUs per node even the next level might be kept as groups of CPU groups
- * per node and only the levels above cross the node topology.
+ * and so forth. The CPU groups are kept per family so for the normal case
+ * lock contention won't happen across nodes/capacity. Depending on the
+ * number of CPUs per family even the next level might be kept as groups of
+ * CPU groups per family and only the levels above cross the family topology.
*
* Example topology for a two node system with 24 CPUs each.
*
@@ -419,7 +419,7 @@ static DEFINE_MUTEX(tmigr_mutex);
static struct list_head *tmigr_level_list __read_mostly;
static unsigned int tmigr_hierarchy_levels __read_mostly;
-static unsigned int tmigr_crossnode_level __read_mostly;
+static unsigned int tmigr_crossfamily_level __read_mostly;
static struct tmigr_group *tmigr_root;
@@ -1633,14 +1633,14 @@ static int __init tmigr_init_isolation(void)
}
static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
- int node)
+ int family)
{
union tmigr_state s;
raw_spin_lock_init(&group->lock);
group->level = lvl;
- group->numa_node = lvl < tmigr_crossnode_level ? node : NUMA_NO_NODE;
+ group->family = lvl < tmigr_crossfamily_level ? family : NUMA_NO_NODE;
group->num_children = 0;
@@ -1656,19 +1656,20 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
group->groupevt.ignore = true;
}
-static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl)
+static struct tmigr_group *tmigr_get_group(int family, unsigned int lvl)
{
struct tmigr_group *tmp, *group = NULL;
+ int node;
lockdep_assert_held(&tmigr_mutex);
/* Try to attach to an existing group first */
list_for_each_entry(tmp, &tmigr_level_list[lvl], list) {
/*
- * If @lvl is below the cross NUMA node level, check whether
- * this group belongs to the same NUMA node.
+ * If @lvl is below the cross family level, check whether
+ * this group belongs to the same family.
*/
- if (lvl < tmigr_crossnode_level && tmp->numa_node != node)
+ if (lvl < tmigr_crossfamily_level && tmp->family != family)
continue;
/* Capacity left? */
@@ -1689,12 +1690,17 @@ static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl)
if (group)
return group;
+ if (sched_asym_count() > 1)
+ node = NUMA_NO_NODE;
+ else
+ node = family;
+
/* Allocate and set up a new group */
group = kzalloc_node(sizeof(*group), GFP_KERNEL, node);
if (!group)
return ERR_PTR(-ENOMEM);
- tmigr_init_group(group, lvl, node);
+ tmigr_init_group(group, lvl, family);
/* Setup successful. Add it to the hierarchy */
list_add(&group->list, &tmigr_level_list[lvl]);
@@ -1728,7 +1734,7 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
/*
* The previous top level had prepared its groupmask already,
* simply account it in advance as the first child. If some groups
- * have been created between the old and new root due to node
+ * have been created between the old and new root due to family
* mismatch, the new root's child will be intialized accordingly.
*/
parent->num_children = 1;
@@ -1737,7 +1743,7 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
/* Connecting old root to new root ? */
if (!parent->parent && root_up) {
/*
- * @child is the old top, or in case of node mismatch, some
+ * @child is the old top, or in case of family mismatch, some
* intermediate group between the old top and the new one in
* @parent. In this case the @child must be pre-accounted above
* as the first child. Its new inactive sibling corresponding
@@ -1760,7 +1766,7 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
trace_tmigr_connect_child_parent(child);
}
-static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
+static int tmigr_setup_groups(unsigned int cpu, unsigned int family,
struct tmigr_group *start, bool activate)
{
struct tmigr_group *group, *child, **stack;
@@ -1777,10 +1783,10 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
}
if (tmigr_root)
- root_mismatch = tmigr_root->numa_node != node;
+ root_mismatch = tmigr_root->family != family;
for (i = start_lvl; i < tmigr_hierarchy_levels; i++) {
- group = tmigr_get_group(node, i);
+ group = tmigr_get_group(family, i);
if (IS_ERR(group)) {
err = PTR_ERR(group);
i--;
@@ -1793,15 +1799,15 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
/*
* When booting only less CPUs of a system than CPUs are
* available, not all calculated hierarchy levels are required,
- * unless a node mismatch is detected.
+ * unless a family mismatch is detected.
*
* The loop is aborted as soon as the highest level, which might
* be different from tmigr_hierarchy_levels, contains only a
- * single group, unless the nodes mismatch below tmigr_crossnode_level
+ * single group, unless the family mismatch below tmigr_crossfamily_level
*/
if (group->parent)
break;
- if ((!root_mismatch || i >= tmigr_crossnode_level) &&
+ if ((!root_mismatch || i >= tmigr_crossfamily_level) &&
list_is_singular(&tmigr_level_list[i]))
break;
}
@@ -1937,7 +1943,7 @@ static int tmigr_connect_old_root(int cpu, struct tmigr_group *old_root, bool ac
WARN_ON_ONCE(!__this_cpu_read(tmigr_cpu.available));
}
- return tmigr_setup_groups(-1, old_root->numa_node, old_root, activate);
+ return tmigr_setup_groups(-1, old_root->family, old_root, activate);
}
static long connect_old_root_work(void *arg)
@@ -1947,15 +1953,23 @@ static long connect_old_root_work(void *arg)
return tmigr_connect_old_root(smp_processor_id(), old_root, true);
}
+static int tmigr_get_cpu_family(unsigned int cpu)
+{
+ if (sched_asym_count() > 1)
+ return arch_scale_cpu_capacity(cpu);
+ else
+ return cpu_to_node(cpu);
+}
+
static int tmigr_add_cpu(unsigned int cpu)
{
struct tmigr_group *old_root = tmigr_root;
- int node = cpu_to_node(cpu);
+ int family = tmigr_get_cpu_family(cpu);
int ret;
guard(mutex)(&tmigr_mutex);
- ret = tmigr_setup_groups(cpu, node, NULL, false);
+ ret = tmigr_setup_groups(cpu, family, NULL, false);
if (ret < 0 || !old_root || old_root == tmigr_root)
return ret;
@@ -2010,10 +2024,41 @@ static int tmigr_cpu_prepare(unsigned int cpu)
return ret;
}
+static int __init tmigr_num_possible_families(void)
+{
+ if (sched_asym_count() > 1)
+ return sched_asym_count();
+ else
+ return num_possible_nodes();
+}
+
+static int __init tmigr_cpus_per_family(int ncpus, int nfamilies)
+{
+ if (sched_asym_count() > 1) {
+ /*
+ * CPUs may not be equally distributed across capacities.
+ * Pick the maximum number of CPUs a capacity can hold.
+ */
+ return sched_asym_max_cpus();
+ } else {
+ /*
+ * Calculate the required hierarchy levels. Unfortunately there is no
+ * reliable information available, unless all possible CPUs have been
+ * brought up and all NUMA nodes are populated.
+ *
+ * Estimate the number of levels with the number of possible nodes and
+ * the number of possible CPUs. Assume CPUs are spread evenly across
+ * nodes. We cannot rely on cpumask_of_node() because it only works for
+ * online CPUs.
+ */
+ return DIV_ROUND_UP(ncpus, nfamilies);
+ }
+}
+
static int __init tmigr_init(void)
{
- unsigned int cpulvl, nodelvl, cpus_per_node, i;
- unsigned int nnodes = num_possible_nodes();
+ unsigned int cpulvl, familylvl, cpus_per_family, i;
+ unsigned int nfamilies = tmigr_num_possible_families();
unsigned int ncpus = num_possible_cpus();
int ret = -ENOMEM;
@@ -2028,36 +2073,26 @@ static int __init tmigr_init(void)
goto err;
}
- /*
- * Calculate the required hierarchy levels. Unfortunately there is no
- * reliable information available, unless all possible CPUs have been
- * brought up and all NUMA nodes are populated.
- *
- * Estimate the number of levels with the number of possible nodes and
- * the number of possible CPUs. Assume CPUs are spread evenly across
- * nodes. We cannot rely on cpumask_of_node() because it only works for
- * online CPUs.
- */
- cpus_per_node = DIV_ROUND_UP(ncpus, nnodes);
+ cpus_per_family = tmigr_cpus_per_family(ncpus, nfamilies);
/* Calc the hierarchy levels required to hold the CPUs of a node */
- cpulvl = DIV_ROUND_UP(order_base_2(cpus_per_node),
+ cpulvl = DIV_ROUND_UP(order_base_2(cpus_per_family),
ilog2(TMIGR_CHILDREN_PER_GROUP));
/* Calculate the extra levels to connect all nodes */
- nodelvl = DIV_ROUND_UP(order_base_2(nnodes),
- ilog2(TMIGR_CHILDREN_PER_GROUP));
+ familylvl = DIV_ROUND_UP(order_base_2(nfamilies),
+ ilog2(TMIGR_CHILDREN_PER_GROUP));
- tmigr_hierarchy_levels = cpulvl + nodelvl;
+ tmigr_hierarchy_levels = cpulvl + familylvl;
/*
- * If a NUMA node spawns more than one CPU level group then the next
+ * If a family spawns more than one CPU level group then the next
* level(s) of the hierarchy contains groups which handle all CPU groups
- * of the same NUMA node. The level above goes across NUMA nodes. Store
+ * of the same family. The level above goes across families. Store
* this information for the setup code to decide in which level node
* matching is no longer required.
*/
- tmigr_crossnode_level = cpulvl;
+ tmigr_crossfamily_level = cpulvl;
tmigr_level_list = kzalloc_objs(struct list_head,
tmigr_hierarchy_levels);
@@ -2068,9 +2103,9 @@ static int __init tmigr_init(void)
INIT_LIST_HEAD(&tmigr_level_list[i]);
pr_info("Timer migration: %d hierarchy levels; %d children per group;"
- " %d crossnode level\n",
+ " %d crossfamily level\n",
tmigr_hierarchy_levels, TMIGR_CHILDREN_PER_GROUP,
- tmigr_crossnode_level);
+ tmigr_crossfamily_level);
ret = cpuhp_setup_state(CPUHP_TMIGR_PREPARE, "tmigr:prepare",
tmigr_cpu_prepare, NULL);
diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h
index 4c0073f3d321..3f6c7a110e3c 100644
--- a/kernel/time/timer_migration.h
+++ b/kernel/time/timer_migration.h
@@ -43,11 +43,12 @@ struct tmigr_event {
* @events: Timer queue for child events queued in the group
* @migr_state: State of the group (see union tmigr_state)
* @level: Hierarchy level of the group; Required during setup
- * @numa_node: Required for setup only to make sure CPU and low level
- * group information is NUMA local. It is set to NUMA node
- * as long as the group level is per NUMA node (level <
- * tmigr_crossnode_level); otherwise it is set to
- * NUMA_NO_NODE
+ * @family: Either NUMA node id or asym CPU capacity. Required for
+ * setup only to make sure CPU and low level
+ * group information is NUMA/capacity local. It is set to
+ * the NUMA node or CPU capacity as long as the group level
+ * is per family (level < tmigr_crossfamily_level);
+ * otherwise it is set to NUMA_NO_NODE
* @num_children: Counter of group children to make sure the group is only
* filled with TMIGR_CHILDREN_PER_GROUP; Required for setup
* only
@@ -66,7 +67,7 @@ struct tmigr_group {
struct timerqueue_head events;
atomic_t migr_state;
unsigned int level;
- int numa_node;
+ int family;
unsigned int num_children;
u8 groupmask;
struct list_head list;
--
2.53.0
^ permalink raw reply related [flat|nested] 7+ messages in thread* [PATCH 5/6] timers/migration: Prefer lower capacity groups as migrators
2026-06-25 16:41 [RFT][DONOTMERGE][PATCH 0/6] timers/migration: Prioritize lower capacity CPUs as migrators Frederic Weisbecker
` (3 preceding siblings ...)
2026-06-25 16:41 ` [PATCH 4/6] timers/migration: Group CPUs per capacity Frederic Weisbecker
@ 2026-06-25 16:41 ` Frederic Weisbecker
2026-06-25 16:41 ` [PATCH 6/6] scripts/timer_migration_tree.py: Dump mask of each group Frederic Weisbecker
5 siblings, 0 replies; 7+ messages in thread
From: Frederic Weisbecker @ 2026-06-25 16:41 UTC (permalink / raw)
To: Christian Loehle
Cc: LKML, Frederic Weisbecker, Anna-Maria Behnsen, Sehee Jeong,
Thomas Gleixner, Peter Zijlstra
Pulling timers to low capacity CPUs may improve power consumption
by executing non-performance-critical ground work there and increasing
the chances to turn the small CPUs into idle global migrators if such
work keeps them alive for long enough. This way low power CPUs may be
woken up from deep idle instead of high power CPUs.
To implement this, migrators going idle will select the lowest capacity
groups still alive as new migrators. And CPUs exiting idle will force
select themselves as the new migrator if their capacity is lower than
that of the current migrator.
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
include/linux/sched/topology.h | 1 +
kernel/sched/topology.c | 20 ++++++++++++++++++++
kernel/time/timer_migration.c | 33 +++++++++++++++++++++++++++------
kernel/time/timer_migration.h | 2 ++
scripts/timer_migration_tree.py | 24 ++++++++++++------------
5 files changed, 62 insertions(+), 18 deletions(-)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 88632825136e..de584bde7dee 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -52,6 +52,7 @@ extern const struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl,
extern int arch_asym_cpu_priority(int cpu);
extern int sched_asym_count(void);
extern int sched_asym_max_cpus(void);
+extern int sched_asym_capacity_rank(unsigned long capacity);
struct sched_domain_attr {
int relax_domain_level;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 3b3bd32aea40..7673dfd579db 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1760,6 +1760,26 @@ int sched_asym_max_cpus(void)
return asym_capacity_max_cpus;
}
+int sched_asym_capacity_rank(unsigned long capacity)
+{
+ struct asym_cap_data *entry;
+ int i = 0;
+
+ /*
+ * Walk the capacity list in reverse and return the position of the
+ * entry matching @capacity. Warn and return 0 if there is no match.
+ */
+ list_for_each_entry_reverse(entry, &asym_cap_list, link) {
+ if (capacity == entry->capacity)
+ return i;
+ i++;
+ }
+
+ WARN_ONCE(1, "Capacity %lu not found in capacity list", capacity);
+
+ return 0;
+}
+
/*
* Build-up/update list of CPUs grouped by their capacities
* An update requires explicit request to rebuild sched domains
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index 2c2925046f43..a16d265df33e 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -677,11 +677,14 @@ static bool tmigr_active_up(struct tmigr_group *group,
newstate = curstate;
walk_done = true;
- if (newstate.migrator == TMIGR_NONE) {
+ if (curstate.migrator == TMIGR_NONE ||
+ (group->want_low_migrator && childmask < curstate.migrator)) {
newstate.migrator = childmask;
- /* Changes need to be propagated */
- walk_done = false;
+ if (curstate.migrator == TMIGR_NONE) {
+ /* Changes need to be propagated */
+ walk_done = false;
+ }
}
newstate.active |= childmask;
@@ -1644,6 +1647,12 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
group->num_children = 0;
+ /* Always prefer a migrator with lower capacity */
+ if (sched_asym_count() > 1 && lvl == tmigr_crossfamily_level)
+ group->want_low_migrator = true;
+ else
+ group->want_low_migrator = false;
+
s.migrator = TMIGR_NONE;
s.active = 0;
s.seq = 0;
@@ -1708,6 +1717,18 @@ static struct tmigr_group *tmigr_get_group(int family, unsigned int lvl)
return group;
}
+static void tmigr_init_groupmask(struct tmigr_group *group, u8 groupmask)
+{
+ /*
+ * Overwrite the groupmask if this is a whole capacity group so that
+ * candidate migrators are sorted by capacity.
+ */
+ if (sched_asym_count() > 1 && group->level == tmigr_crossfamily_level - 1)
+ groupmask = BIT(sched_asym_capacity_rank(group->family));
+
+ group->groupmask = groupmask;
+}
+
static bool tmigr_init_root(struct tmigr_group *group, bool root_up)
{
if (!group->parent && group != tmigr_root) {
@@ -1716,7 +1737,7 @@ static bool tmigr_init_root(struct tmigr_group *group, bool root_up)
* to avoid accidents where yet another new top-level is
* created in the future and made visible before this groupmask.
*/
- group->groupmask = BIT(0);
+ tmigr_init_groupmask(group, BIT(0));
WARN_ON_ONCE(root_up);
return true;
@@ -1750,10 +1771,10 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
* to the CPU going up has been accounted as the second child.
*/
WARN_ON_ONCE(parent->num_children != 2);
- child->groupmask = BIT(0);
+ tmigr_init_groupmask(child, BIT(0));
} else {
/* Common case adding @child for the CPU going up to @parent. */
- child->groupmask = BIT(parent->num_children++);
+ tmigr_init_groupmask(child, BIT(parent->num_children++));
}
/*
diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h
index 3f6c7a110e3c..0bf3a0e7d54c 100644
--- a/kernel/time/timer_migration.h
+++ b/kernel/time/timer_migration.h
@@ -58,6 +58,7 @@ struct tmigr_event {
* tmigr_level_list; is required during setup when a
* new group needs to be connected to the existing
* hierarchy groups
+ * @want_low_migrator: Group wants the lowest capacity migrator
*/
struct tmigr_group {
raw_spinlock_t lock;
@@ -71,6 +72,7 @@ struct tmigr_group {
unsigned int num_children;
u8 groupmask;
struct list_head list;
+ bool want_low_migrator;
};
/**
diff --git a/scripts/timer_migration_tree.py b/scripts/timer_migration_tree.py
index abb321b903c4..4f055fc08435 100755
--- a/scripts/timer_migration_tree.py
+++ b/scripts/timer_migration_tree.py
@@ -33,8 +33,8 @@ class Node:
def set_lvl(self, lvl):
self.lvl = lvl
- def set_numa(self, numa):
- self.numa = numa
+ def set_family(self, family):
+ self.family = family
def set_num_children(self, num_children):
self.num_children = num_children
@@ -44,7 +44,7 @@ class Node:
parent_grp = self.parent.group
else:
parent_grp = "-"
- return "Group: %s mask: %s parent: %s lvl: %d numa: %d num_children: %d" % (self.group, self.groupmask, parent_grp, self.lvl, self.numa, self.num_children)
+ return "Group: %s mask: %s parent: %s lvl: %d family: %d num_children: %d" % (self.group, self.groupmask, parent_grp, self.lvl, self.family, self.num_children)
def get_node(group):
if group in Node.node_list:
@@ -55,32 +55,32 @@ def get_node(group):
return n
def tmigr_connect_cpu_parent(ts, line):
- s = re.search("tmigr_connect_cpu_parent: cpu=([0-9]+) groupmask=([0-9a-zA-Z]+) parent=([0-9a-zA-Z]+) lvl=([0-9]+) numa=([-]?[0-9]+) num_children=([0-9]+)", line)
+ s = re.search("tmigr_connect_cpu_parent: cpu=([0-9]+) groupmask=([0-9a-zA-Z]+) parent=([0-9a-zA-Z]+) lvl=([0-9]+) family=([-]?[0-9]+) num_children=([0-9]+)", line)
if s is None:
return False
- (cpu, groupmask, parent, lvl, numa, num_children) = (int(s.group(1)), s.group(2), s.group(3), int(s.group(4)), int(s.group(5)), int(s.group(6)))
+ (cpu, groupmask, parent, lvl, family, num_children) = (int(s.group(1)), s.group(2), s.group(3), int(s.group(4)), int(s.group(5)), int(s.group(6)))
n = get_node(cpu)
p = get_node(parent)
n.set_parent(p)
n.set_groupmask(groupmask)
n.set_lvl(-1)
p.set_lvl(lvl)
- p.set_numa(numa)
- n.set_numa(numa)
+ p.set_family(family)
+ n.set_family(family)
p.set_num_children(num_children)
p.add_child(n)
def tmigr_connect_child_parent(ts, line):
- s = re.search("tmigr_connect_child_parent: group=([0-9a-zA-Z]+) groupmask=([0-9a-zA-Z]+) parent=([0-9a-zA-Z]+) lvl=([0-9]+) numa=([-]?[0-9]+) num_children=([0-9]+)", line)
+ s = re.search("tmigr_connect_child_parent: group=([0-9a-zA-Z]+) groupmask=([0-9a-zA-Z]+) parent=([0-9a-zA-Z]+) lvl=([0-9]+) family=([-]?[0-9]+) num_children=([0-9]+)", line)
if s is None:
return False
- (group, groupmask, parent, lvl, numa, num_children) = (s.group(1), s.group(2), s.group(3), int(s.group(4)), int(s.group(5)), int(s.group(6)))
+ (group, groupmask, parent, lvl, family, num_children) = (s.group(1), s.group(2), s.group(3), int(s.group(4)), int(s.group(5)), int(s.group(6)))
n = get_node(group)
p = get_node(parent)
n.set_parent(p)
n.set_groupmask(groupmask)
p.set_lvl(lvl)
- p.set_numa(numa)
+ p.set_family(family)
p.set_num_children(num_children)
p.add_child(n)
@@ -88,7 +88,7 @@ def populate(enode, node):
enode = enode.add_child(name = node.group)
enode.add_feature("groupmask", "m:%s" % node.groupmask)
enode.add_feature("lvl", "lvl:%d" % node.lvl)
- enode.add_feature("numa", "node %d" % node.numa)
+ enode.add_feature("family", "family %d" % node.family)
enode.add_feature("num_children", "c=%d" % node.num_children)
for child in node.children:
populate(enode, child)
@@ -107,4 +107,4 @@ if __name__ == "__main__":
group = group.parent
root = Tree()
populate(root, group)
- print(root.get_ascii(show_internal=True, attributes=["name", "numa", "lvl"]))
+ print(root.get_ascii(show_internal=True, attributes=["name", "family", "lvl"]))
--
2.53.0
^ permalink raw reply related [flat|nested] 7+ messages in thread* [PATCH 6/6] scripts/timer_migration_tree.py: Dump mask of each group
2026-06-25 16:41 [RFT][DONOTMERGE][PATCH 0/6] timers/migration: Prioritize lower capacity CPUs as migrators Frederic Weisbecker
` (4 preceding siblings ...)
2026-06-25 16:41 ` [PATCH 5/6] timers/migration: Prefer lower capacity groups as migrators Frederic Weisbecker
@ 2026-06-25 16:41 ` Frederic Weisbecker
5 siblings, 0 replies; 7+ messages in thread
From: Frederic Weisbecker @ 2026-06-25 16:41 UTC (permalink / raw)
To: Christian Loehle
Cc: LKML, Frederic Weisbecker, Anna-Maria Behnsen, Sehee Jeong,
Thomas Gleixner, Peter Zijlstra
Dump the mask of each group in order to verify they are correct and well
sorted with respect to CPU capacities.
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
scripts/timer_migration_tree.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/scripts/timer_migration_tree.py b/scripts/timer_migration_tree.py
index 4f055fc08435..275b05fd6c1b 100755
--- a/scripts/timer_migration_tree.py
+++ b/scripts/timer_migration_tree.py
@@ -90,6 +90,7 @@ def populate(enode, node):
enode.add_feature("lvl", "lvl:%d" % node.lvl)
enode.add_feature("family", "family %d" % node.family)
enode.add_feature("num_children", "c=%d" % node.num_children)
+ enode.add_feature("groupmask", "mask=%s" % node.groupmask)
for child in node.children:
populate(enode, child)
@@ -107,4 +108,4 @@ if __name__ == "__main__":
group = group.parent
root = Tree()
populate(root, group)
- print(root.get_ascii(show_internal=True, attributes=["name", "family", "lvl"]))
+ print(root.get_ascii(show_internal=True, attributes=["name", "family", "lvl", "groupmask"]))
--
2.53.0
^ permalink raw reply related [flat|nested] 7+ messages in thread