From: Andrea Righi <arighi@nvidia.com>
To: Tejun Heo <tj@kernel.org>, David Vernet <void@manifault.com>,
Changwoo Min <changwoo@igalia.com>
Cc: Yury Norov <yury.norov@gmail.com>, Ingo Molnar <mingo@redhat.com>,
Peter Zijlstra <peterz@infradead.org>,
Juri Lelli <juri.lelli@redhat.com>,
Vincent Guittot <vincent.guittot@linaro.org>,
Dietmar Eggemann <dietmar.eggemann@arm.com>,
Steven Rostedt <rostedt@goodmis.org>,
Ben Segall <bsegall@google.com>, Mel Gorman <mgorman@suse.de>,
Valentin Schneider <vschneid@redhat.com>,
bpf@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [PATCH 09/10] sched_ext: idle: Get rid of the scx_selcpu_topo_numa logic
Date: Fri, 20 Dec 2024 16:11:41 +0100
Message-ID: <20241220154107.287478-10-arighi@nvidia.com>
In-Reply-To: <20241220154107.287478-1-arighi@nvidia.com>
With the introduction of separate per-NUMA node cpumasks, idle CPUs
are now tracked automatically within each NUMA node.

This makes the special scx_selcpu_topo_numa logic for searching idle
CPUs in the NUMA domain redundant, so we can get rid of it.
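
For illustration, a simplified sketch of what the generic idle CPU
search can now cover on its own (this is not the actual
implementation: pick_idle_cpu_numa_aware() is a hypothetical name,
the exact for_each_numa_hop_node() arguments are assumed from patch
01/10, and pick_idle_cpu_from_node() is the per-node helper
introduced earlier in this series):

  static s32 pick_idle_cpu_numa_aware(const struct cpumask *cpus_allowed,
				      int node, u64 flags)
  {
	int n;
	s32 cpu;

	/*
	 * Visit NUMA nodes starting from @node in order of
	 * increasing distance; the first iteration scans the very
	 * node that the removed numa_cpus pass used to cover.
	 */
	for_each_numa_hop_node(n, node) {
		cpu = pick_idle_cpu_from_node(cpus_allowed, n, flags);
		if (cpu >= 0)
			return cpu;
	}
	return -EBUSY;
  }

Since the first node visited is the one containing the previous CPU,
the dedicated NUMA fallback step is subsumed by this walk.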
Signed-off-by: Andrea Righi <arighi@nvidia.com>
---
kernel/sched/ext_idle.c | 93 ++++++++++-------------------------------
1 file changed, 23 insertions(+), 70 deletions(-)
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index 013deaa08f12..b36e93da1b75 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -82,7 +82,6 @@ static void idle_masks_init(void)
}
static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc);
-static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa);
/*
* Return the node id associated to a target idle CPU (used to determine
@@ -259,25 +258,6 @@ static unsigned int numa_weight(s32 cpu)
return sg->group_weight;
}
-/*
- * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA
- * domain is not defined).
- */
-static struct cpumask *numa_span(s32 cpu)
-{
- struct sched_domain *sd;
- struct sched_group *sg;
-
- sd = rcu_dereference(per_cpu(sd_numa, cpu));
- if (!sd)
- return NULL;
- sg = sd->groups;
- if (!sg)
- return NULL;
-
- return sched_group_span(sg);
-}
-
/*
* Return true if the LLC domains do not perfectly overlap with the NUMA
* domains, false otherwise.
@@ -329,7 +309,7 @@ static bool llc_numa_mismatch(void)
*/
static void update_selcpu_topology(struct sched_ext_ops *ops)
{
- bool enable_llc = false, enable_numa = false;
+ bool enable_llc = false;
unsigned int nr_cpus;
s32 cpu = cpumask_first(cpu_online_mask);
@@ -348,41 +328,34 @@ static void update_selcpu_topology(struct sched_ext_ops *ops)
if (nr_cpus > 0) {
if (nr_cpus < num_online_cpus())
enable_llc = true;
+ /*
+ * No need to enable LLC optimization if the LLC domains are
+ * perfectly overlapping with the NUMA domains when per-node
+ * cpumasks are enabled.
+ */
+ if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) &&
+ !llc_numa_mismatch())
+ enable_llc = false;
pr_debug("sched_ext: LLC=%*pb weight=%u\n",
cpumask_pr_args(llc_span(cpu)), llc_weight(cpu));
}
-
- /*
- * Enable NUMA optimization only when there are multiple NUMA domains
- * among the online CPUs and the NUMA domains don't perfectly overlaps
- * with the LLC domains.
- *
- * If all CPUs belong to the same NUMA node and the same LLC domain,
- * enabling both NUMA and LLC optimizations is unnecessary, as checking
- * for an idle CPU in the same domain twice is redundant.
- */
- nr_cpus = numa_weight(cpu);
- if (nr_cpus > 0) {
- if (nr_cpus < num_online_cpus() && llc_numa_mismatch())
- enable_numa = true;
- pr_debug("sched_ext: NUMA=%*pb weight=%u\n",
- cpumask_pr_args(numa_span(cpu)), numa_weight(cpu));
- }
rcu_read_unlock();
pr_debug("sched_ext: LLC idle selection %s\n",
enable_llc ? "enabled" : "disabled");
- pr_debug("sched_ext: NUMA idle selection %s\n",
- enable_numa ? "enabled" : "disabled");
if (enable_llc)
static_branch_enable_cpuslocked(&scx_selcpu_topo_llc);
else
static_branch_disable_cpuslocked(&scx_selcpu_topo_llc);
- if (enable_numa)
- static_branch_enable_cpuslocked(&scx_selcpu_topo_numa);
+
+ /*
+ * Check if we need to enable per-node cpumasks.
+ */
+ if (ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)
+ static_branch_enable_cpuslocked(&scx_builtin_idle_per_node);
else
- static_branch_disable_cpuslocked(&scx_selcpu_topo_numa);
+ static_branch_disable_cpuslocked(&scx_builtin_idle_per_node);
}
/*
@@ -405,9 +378,8 @@ static void update_selcpu_topology(struct sched_ext_ops *ops)
*
* 5. Pick any idle CPU usable by the task.
*
- * Step 3 and 4 are performed only if the system has, respectively, multiple
- * LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and
- * scx_selcpu_topo_numa).
+ * Step 3 is performed only if the system has multiple LLC domains that are not
+ * perfectly overlapping with the NUMA domains (see scx_selcpu_topo_llc).
*
* NOTE: tasks that can only run on 1 CPU are excluded by this logic, because
* we never call ops.select_cpu() for them, see select_task_rq().
@@ -416,7 +388,6 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
u64 wake_flags, bool *found)
{
const struct cpumask *llc_cpus = NULL;
- const struct cpumask *numa_cpus = NULL;
int node = idle_cpu_to_node(prev_cpu);
s32 cpu;
@@ -438,13 +409,9 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
* CPU affinity), the task will simply use the flat scheduling domain
* defined by user-space.
*/
- if (p->nr_cpus_allowed >= num_possible_cpus()) {
- if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa))
- numa_cpus = numa_span(prev_cpu);
-
+ if (p->nr_cpus_allowed >= num_possible_cpus())
if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc))
llc_cpus = llc_span(prev_cpu);
- }
/*
* If WAKE_SYNC, try to migrate the wakee to the waker's CPU.
@@ -507,15 +474,6 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
goto cpu_found;
}
- /*
- * Search for any fully idle core in the same NUMA node.
- */
- if (numa_cpus) {
- cpu = scx_pick_idle_cpu(numa_cpus, node, SCX_PICK_IDLE_CORE);
- if (cpu >= 0)
- goto cpu_found;
- }
-
/*
* Search for any full idle core usable by the task.
*
@@ -545,17 +503,12 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
goto cpu_found;
}
- /*
- * Search for any idle CPU in the same NUMA node.
- */
- if (numa_cpus) {
- cpu = pick_idle_cpu_from_node(numa_cpus, node, 0);
- if (cpu >= 0)
- goto cpu_found;
- }
-
/*
* Search for any idle CPU usable by the task.
+ *
+ * If NUMA aware idle selection is enabled, the search will begin
+ * in prev_cpu's node and proceed to other nodes in order of
+ * increasing distance.
*/
cpu = scx_pick_idle_cpu(p->cpus_ptr, node, 0);
if (cpu >= 0)
--
2.47.1