[PATCH 4/6] timers/migration: Group CPUs per capacity

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Frederic Weisbecker <frederic@kernel.org>
To: Christian Loehle <christian.loehle@arm.com>
Cc: LKML <linux-kernel@vger.kernel.org>,
	Frederic Weisbecker <frederic@kernel.org>,
	Anna-Maria Behnsen <anna-maria@linutronix.de>,
	Sehee Jeong <sehee1.jeong@samsung.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	Peter Zijlstra <peterz@infradead.org>
Subject: [PATCH 4/6] timers/migration: Group CPUs per capacity
Date: Thu, 25 Jun 2026 18:41:12 +0200	[thread overview]
Message-ID: <20260625164114.51454-5-frederic@kernel.org> (raw)
In-Reply-To: <20260625164114.51454-1-frederic@kernel.org>

In the same way CPUs are grouped by node, group CPUs by capacity (this
assumes that asymmetric capacity systems aren't NUMA, but reality either
is or certainly will be disappointing about this).

This way, timers have fewer chances to migrate from low to high capacity
CPUs, and related ground work is more likely to execute while consuming
less energy.

Also in order to stimulate migration from high to low capacity CPUs, a
further change will encourage low capacity global migrators.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/trace/events/timer_migration.h |  32 +++----
 kernel/time/timer_migration.c          | 125 ++++++++++++++++---------
 kernel/time/timer_migration.h          |  13 +--
 3 files changed, 103 insertions(+), 67 deletions(-)

diff --git a/include/trace/events/timer_migration.h b/include/trace/events/timer_migration.h
index 61171b13c687..5484056b0012 100644
--- a/include/trace/events/timer_migration.h
+++ b/include/trace/events/timer_migration.h
@@ -18,17 +18,17 @@ TRACE_EVENT(tmigr_group_set,
 	TP_STRUCT__entry(
 		__field( void *,	group		)
 		__field( unsigned int,	lvl		)
-		__field( unsigned int,	numa_node	)
+		__field( unsigned int,	family	)
 	),
 
 	TP_fast_assign(
 		__entry->group		= group;
 		__entry->lvl		= group->level;
-		__entry->numa_node	= group->numa_node;
+		__entry->family		= group->family;
 	),
 
-	TP_printk("group=%p lvl=%d numa=%d",
-		  __entry->group, __entry->lvl, __entry->numa_node)
+	TP_printk("group=%p lvl=%d family=%d",
+		  __entry->group, __entry->lvl, __entry->family)
 );
 
 TRACE_EVENT(tmigr_connect_child_parent,
@@ -41,7 +41,7 @@ TRACE_EVENT(tmigr_connect_child_parent,
 		__field( void *,	child		)
 		__field( void *,	parent		)
 		__field( unsigned int,	lvl		)
-		__field( unsigned int,	numa_node	)
+		__field( unsigned int,	family	)
 		__field( unsigned int,	num_children	)
 		__field( u32,		groupmask	)
 	),
@@ -50,14 +50,14 @@ TRACE_EVENT(tmigr_connect_child_parent,
 		__entry->child		= child;
 		__entry->parent		= child->parent;
 		__entry->lvl		= child->parent->level;
-		__entry->numa_node	= child->parent->numa_node;
+		__entry->family		= child->parent->family;
 		__entry->num_children	= child->parent->num_children;
 		__entry->groupmask	= child->groupmask;
 	),
 
-	TP_printk("group=%p groupmask=%0x parent=%p lvl=%d numa=%d num_children=%d",
+	TP_printk("group=%p groupmask=%0x parent=%p lvl=%d family=%d num_children=%d",
 		  __entry->child,  __entry->groupmask, __entry->parent,
-		  __entry->lvl, __entry->numa_node, __entry->num_children)
+		  __entry->lvl, __entry->family, __entry->num_children)
 );
 
 TRACE_EVENT(tmigr_connect_cpu_parent,
@@ -70,7 +70,7 @@ TRACE_EVENT(tmigr_connect_cpu_parent,
 		__field( void *,	parent		)
 		__field( unsigned int,	cpu		)
 		__field( unsigned int,	lvl		)
-		__field( unsigned int,	numa_node	)
+		__field( unsigned int,	family	)
 		__field( unsigned int,	num_children	)
 		__field( u32,		groupmask	)
 	),
@@ -79,14 +79,14 @@ TRACE_EVENT(tmigr_connect_cpu_parent,
 		__entry->parent		= tmc->tmgroup;
 		__entry->cpu		= tmc->cpuevt.cpu;
 		__entry->lvl		= tmc->tmgroup->level;
-		__entry->numa_node	= tmc->tmgroup->numa_node;
+		__entry->family		= tmc->tmgroup->family;
 		__entry->num_children	= tmc->tmgroup->num_children;
 		__entry->groupmask	= tmc->groupmask;
 	),
 
-	TP_printk("cpu=%d groupmask=%0x parent=%p lvl=%d numa=%d num_children=%d",
+	TP_printk("cpu=%d groupmask=%0x parent=%p lvl=%d family=%d num_children=%d",
 		  __entry->cpu,	 __entry->groupmask, __entry->parent,
-		  __entry->lvl, __entry->numa_node, __entry->num_children)
+		  __entry->lvl, __entry->family, __entry->num_children)
 );
 
 DECLARE_EVENT_CLASS(tmigr_group_and_cpu,
@@ -99,7 +99,7 @@ DECLARE_EVENT_CLASS(tmigr_group_and_cpu,
 		__field( void *,	group		)
 		__field( void *,	parent		)
 		__field( unsigned int,	lvl		)
-		__field( unsigned int,	numa_node	)
+		__field( unsigned int,	family	)
 		__field( u32,		childmask	)
 		__field( u8,		active		)
 		__field( u8,		migrator	)
@@ -109,15 +109,15 @@ DECLARE_EVENT_CLASS(tmigr_group_and_cpu,
 		__entry->group		= group;
 		__entry->parent		= group->parent;
 		__entry->lvl		= group->level;
-		__entry->numa_node	= group->numa_node;
+		__entry->family		= group->family;
 		__entry->childmask	= childmask;
 		__entry->active		= state.active;
 		__entry->migrator	= state.migrator;
 	),
 
-	TP_printk("group=%p lvl=%d numa=%d active=%0x migrator=%0x "
+	TP_printk("group=%p lvl=%d family=%d active=%0x migrator=%0x "
 		  "parent=%p childmask=%0x",
-		  __entry->group, __entry->lvl, __entry->numa_node,
+		  __entry->group, __entry->lvl, __entry->family,
 		  __entry->active, __entry->migrator,
 		  __entry->parent, __entry->childmask)
 );
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index 4209e695ec7b..2c2925046f43 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -21,10 +21,10 @@
 /*
  * The timer migration mechanism is built on a hierarchy of groups. The
  * lowest level group contains CPUs, the next level groups of CPU groups
- * and so forth. The CPU groups are kept per node so for the normal case
- * lock contention won't happen across nodes. Depending on the number of
- * CPUs per node even the next level might be kept as groups of CPU groups
- * per node and only the levels above cross the node topology.
+ * and so forth. The CPU groups are kept per family so for the normal case
+ * lock contention won't happen across nodes/capacity. Depending on the
+ * number of CPUs per family even the next level might be kept as groups of
+ * CPU groups per family and only the levels above cross the family topology.
  *
  * Example topology for a two node system with 24 CPUs each.
  *
@@ -419,7 +419,7 @@ static DEFINE_MUTEX(tmigr_mutex);
 static struct list_head *tmigr_level_list __read_mostly;
 
 static unsigned int tmigr_hierarchy_levels __read_mostly;
-static unsigned int tmigr_crossnode_level __read_mostly;
+static unsigned int tmigr_crossfamily_level __read_mostly;
 
 static struct tmigr_group *tmigr_root;
 
@@ -1633,14 +1633,14 @@ static int __init tmigr_init_isolation(void)
 }
 
 static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
-			     int node)
+			     int family)
 {
 	union tmigr_state s;
 
 	raw_spin_lock_init(&group->lock);
 
 	group->level = lvl;
-	group->numa_node = lvl < tmigr_crossnode_level ? node : NUMA_NO_NODE;
+	group->family = lvl < tmigr_crossfamily_level ? family : NUMA_NO_NODE;
 
 	group->num_children = 0;
 
@@ -1656,19 +1656,20 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
 	group->groupevt.ignore = true;
 }
 
-static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl)
+static struct tmigr_group *tmigr_get_group(int family, unsigned int lvl)
 {
 	struct tmigr_group *tmp, *group = NULL;
+	int node;
 
 	lockdep_assert_held(&tmigr_mutex);
 
 	/* Try to attach to an existing group first */
 	list_for_each_entry(tmp, &tmigr_level_list[lvl], list) {
 		/*
-		 * If @lvl is below the cross NUMA node level, check whether
-		 * this group belongs to the same NUMA node.
+		 * If @lvl is below the cross family level, check whether
+		 * this group belongs to the same family.
 		 */
-		if (lvl < tmigr_crossnode_level && tmp->numa_node != node)
+		if (lvl < tmigr_crossfamily_level && tmp->family != family)
 			continue;
 
 		/* Capacity left? */
@@ -1689,12 +1690,17 @@ static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl)
 	if (group)
 		return group;
 
+	if (sched_asym_count() > 1)
+		node = NUMA_NO_NODE;
+	else
+		node = family;
+
 	/* Allocate and	set up a new group */
 	group = kzalloc_node(sizeof(*group), GFP_KERNEL, node);
 	if (!group)
 		return ERR_PTR(-ENOMEM);
 
-	tmigr_init_group(group, lvl, node);
+	tmigr_init_group(group, lvl, family);
 
 	/* Setup successful. Add it to the hierarchy */
 	list_add(&group->list, &tmigr_level_list[lvl]);
@@ -1728,7 +1734,7 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
 		/*
 		 * The previous top level had prepared its groupmask already,
 		 * simply account it in advance as the first child. If some groups
-		 * have been created between the old and new root due to node
+		 * have been created between the old and new root due to family
 		 * mismatch, the new root's child will be intialized accordingly.
 		 */
 		parent->num_children = 1;
@@ -1737,7 +1743,7 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
 	/* Connecting old root to new root ? */
 	if (!parent->parent && root_up) {
 		/*
-		 * @child is the old top, or in case of node mismatch, some
+		 * @child is the old top, or in case of family mismatch, some
 		 * intermediate group between the old top and the new one in
 		 * @parent. In this case the @child must be pre-accounted above
 		 * as the first child. Its new inactive sibling corresponding
@@ -1760,7 +1766,7 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
 	trace_tmigr_connect_child_parent(child);
 }
 
-static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
+static int tmigr_setup_groups(unsigned int cpu, unsigned int family,
 			      struct tmigr_group *start, bool activate)
 {
 	struct tmigr_group *group, *child, **stack;
@@ -1777,10 +1783,10 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
 	}
 
 	if (tmigr_root)
-		root_mismatch = tmigr_root->numa_node != node;
+		root_mismatch = tmigr_root->family != family;
 
 	for (i = start_lvl; i < tmigr_hierarchy_levels; i++) {
-		group = tmigr_get_group(node, i);
+		group = tmigr_get_group(family, i);
 		if (IS_ERR(group)) {
 			err = PTR_ERR(group);
 			i--;
@@ -1793,15 +1799,15 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
 		/*
 		 * When booting only less CPUs of a system than CPUs are
 		 * available, not all calculated hierarchy levels are required,
-		 * unless a node mismatch is detected.
+		 * unless a family mismatch is detected.
 		 *
 		 * The loop is aborted as soon as the highest level, which might
 		 * be different from tmigr_hierarchy_levels, contains only a
-		 * single group, unless the nodes mismatch below tmigr_crossnode_level
+		 * single group, unless the families mismatch below tmigr_crossfamily_level
 		 */
 		if (group->parent)
 			break;
-		if ((!root_mismatch || i >= tmigr_crossnode_level) &&
+		if ((!root_mismatch || i >= tmigr_crossfamily_level) &&
 		    list_is_singular(&tmigr_level_list[i]))
 			break;
 	}
@@ -1937,7 +1943,7 @@ static int tmigr_connect_old_root(int cpu, struct tmigr_group *old_root, bool ac
 		WARN_ON_ONCE(!__this_cpu_read(tmigr_cpu.available));
 	}
 
-	return tmigr_setup_groups(-1, old_root->numa_node, old_root, activate);
+	return tmigr_setup_groups(-1, old_root->family, old_root, activate);
 }
 
 static long connect_old_root_work(void *arg)
@@ -1947,15 +1953,23 @@ static long connect_old_root_work(void *arg)
 	return tmigr_connect_old_root(smp_processor_id(), old_root, true);
 }
 
+static int tmigr_get_cpu_family(unsigned int cpu)
+{
+	if (sched_asym_count() > 1)
+		return arch_scale_cpu_capacity(cpu);
+	else
+		return cpu_to_node(cpu);
+}
+
 static int tmigr_add_cpu(unsigned int cpu)
 {
 	struct tmigr_group *old_root = tmigr_root;
-	int node = cpu_to_node(cpu);
+	int family = tmigr_get_cpu_family(cpu);
 	int ret;
 
 	guard(mutex)(&tmigr_mutex);
 
-	ret = tmigr_setup_groups(cpu, node, NULL, false);
+	ret = tmigr_setup_groups(cpu, family, NULL, false);
 
 	if (ret < 0 || !old_root || old_root == tmigr_root)
 		return ret;
@@ -2010,10 +2024,41 @@ static int tmigr_cpu_prepare(unsigned int cpu)
 	return ret;
 }
 
+static int __init tmigr_num_possible_families(void)
+{
+	if (sched_asym_count() > 1)
+		return sched_asym_count();
+	else
+		return num_possible_nodes();
+}
+
+static int __init tmigr_cpus_per_family(int ncpus, int nfamilies)
+{
+	if (sched_asym_count() > 1) {
+		/*
+		 * CPUs may not be equally distributed across capacities.
+		 * Pick the maximum number of CPUs a capacity can hold.
+		 */
+		return sched_asym_max_cpus();
+	} else {
+		/*
+		 * Calculate the required hierarchy levels. Unfortunately there is no
+		 * reliable information available, unless all possible CPUs have been
+		 * brought up and all NUMA nodes are populated.
+		 *
+		 * Estimate the number of levels with the number of possible nodes and
+		 * the number of possible CPUs. Assume CPUs are spread evenly across
+		 * nodes. We cannot rely on cpumask_of_node() because it only works for
+		 * online CPUs.
+		 */
+		return DIV_ROUND_UP(ncpus, nfamilies);
+	}
+}
+
 static int __init tmigr_init(void)
 {
-	unsigned int cpulvl, nodelvl, cpus_per_node, i;
-	unsigned int nnodes = num_possible_nodes();
+	unsigned int cpulvl, familylvl, cpus_per_family, i;
+	unsigned int nfamilies = tmigr_num_possible_families();
 	unsigned int ncpus = num_possible_cpus();
 	int ret = -ENOMEM;
 
@@ -2028,36 +2073,26 @@ static int __init tmigr_init(void)
 		goto err;
 	}
 
-	/*
-	 * Calculate the required hierarchy levels. Unfortunately there is no
-	 * reliable information available, unless all possible CPUs have been
-	 * brought up and all NUMA nodes are populated.
-	 *
-	 * Estimate the number of levels with the number of possible nodes and
-	 * the number of possible CPUs. Assume CPUs are spread evenly across
-	 * nodes. We cannot rely on cpumask_of_node() because it only works for
-	 * online CPUs.
-	 */
-	cpus_per_node = DIV_ROUND_UP(ncpus, nnodes);
+	cpus_per_family = tmigr_cpus_per_family(ncpus, nfamilies);
 
 	/* Calc the hierarchy levels required to hold the CPUs of a node */
-	cpulvl = DIV_ROUND_UP(order_base_2(cpus_per_node),
+	cpulvl = DIV_ROUND_UP(order_base_2(cpus_per_family),
 			      ilog2(TMIGR_CHILDREN_PER_GROUP));
 
 	/* Calculate the extra levels to connect all nodes */
-	nodelvl = DIV_ROUND_UP(order_base_2(nnodes),
-			       ilog2(TMIGR_CHILDREN_PER_GROUP));
+	familylvl = DIV_ROUND_UP(order_base_2(nfamilies),
+				 ilog2(TMIGR_CHILDREN_PER_GROUP));
 
-	tmigr_hierarchy_levels = cpulvl + nodelvl;
+	tmigr_hierarchy_levels = cpulvl + familylvl;
 
 	/*
-	 * If a NUMA node spawns more than one CPU level group then the next
+	 * If a family spawns more than one CPU level group then the next
 	 * level(s) of the hierarchy contains groups which handle all CPU groups
-	 * of the same NUMA node. The level above goes across NUMA nodes. Store
+	 * of the same family. The level above goes across families. Store
 	 * this information for the setup code to decide in which level node
 	 * matching is no longer required.
 	 */
-	tmigr_crossnode_level = cpulvl;
+	tmigr_crossfamily_level = cpulvl;
 
 	tmigr_level_list = kzalloc_objs(struct list_head,
 					tmigr_hierarchy_levels);
@@ -2068,9 +2103,9 @@ static int __init tmigr_init(void)
 		INIT_LIST_HEAD(&tmigr_level_list[i]);
 
 	pr_info("Timer migration: %d hierarchy levels; %d children per group;"
-		" %d crossnode level\n",
+		" %d crossfamily level\n",
 		tmigr_hierarchy_levels, TMIGR_CHILDREN_PER_GROUP,
-		tmigr_crossnode_level);
+		tmigr_crossfamily_level);
 
 	ret = cpuhp_setup_state(CPUHP_TMIGR_PREPARE, "tmigr:prepare",
 				tmigr_cpu_prepare, NULL);
diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h
index 4c0073f3d321..3f6c7a110e3c 100644
--- a/kernel/time/timer_migration.h
+++ b/kernel/time/timer_migration.h
@@ -43,11 +43,12 @@ struct tmigr_event {
  * @events:		Timer queue for child events queued in the group
  * @migr_state:		State of the group (see union tmigr_state)
  * @level:		Hierarchy level of the group; Required during setup
- * @numa_node:		Required for setup only to make sure CPU and low level
- *			group information is NUMA local. It is set to NUMA node
- *			as long as the group level is per NUMA node (level <
- *			tmigr_crossnode_level); otherwise it is set to
- *			NUMA_NO_NODE
+ * @family:		Either NUMA node id or asym CPU capacity. Required for
+ *			setup only to make sure CPU and low level
+ *			group information is NUMA/capacity local. It is set to
+ *			the node id or capacity as long as the group level is
+ *			per family (level < tmigr_crossfamily_level); otherwise
+ *			it is set to NUMA_NO_NODE.
  * @num_children:	Counter of group children to make sure the group is only
  *			filled with TMIGR_CHILDREN_PER_GROUP; Required for setup
  *			only
@@ -66,7 +67,7 @@ struct tmigr_group {
 	struct timerqueue_head	events;
 	atomic_t		migr_state;
 	unsigned int		level;
-	int			numa_node;
+	int			family;
 	unsigned int		num_children;
 	u8			groupmask;
 	struct list_head	list;
-- 
2.53.0

next prev parent reply	other threads:[~2026-06-25 16:41 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-25 16:41 [RFT][DONOTMERGE][PATCH 0/6] timers/migration: Prioritize lower capacity CPUs as migrators Frederic Weisbecker
2026-06-25 16:41 ` [PATCH 1/6] timers/migration: Revert per CPU capacity hierarchy Frederic Weisbecker
2026-06-25 16:41 ` [PATCH 2/6] timers/migration: Defer initialization after capacity topology is setup Frederic Weisbecker
2026-06-25 16:41 ` [PATCH 3/6] sched/topology: Account asym capacities number Frederic Weisbecker
2026-06-25 16:41 ` Frederic Weisbecker [this message]
2026-06-25 16:41 ` [PATCH 5/6] timers/migration: Prefer lower capacity groups as migrators Frederic Weisbecker
2026-06-25 16:41 ` [PATCH 6/6] scripts/timer_migration_tree.py: Dump mask of each group Frederic Weisbecker

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:61171b13c68 dfblob:5484056b001 dfblob:4209e695ec7
dfblob:2c2925046f4 dfblob:4c0073f3d32 dfblob:3f6c7a110e3 )
 OR (
bs:"[PATCH 4/6] timers/migration: Group CPUs per capacity" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260625164114.51454-5-frederic@kernel.org \
    --to=frederic@kernel.org \
    --cc=anna-maria@linutronix.de \
    --cc=christian.loehle@arm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=peterz@infradead.org \
    --cc=sehee1.jeong@samsung.com \
    --cc=tglx@linutronix.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.