From: Shrikanth Hegde <sshegde@linux.ibm.com>
To: mingo@kernel.org, peterz@infradead.org,
vincent.guittot@linaro.org, linux-kernel@vger.kernel.org
Cc: sshegde@linux.ibm.com, kprateek.nayak@amd.com,
juri.lelli@redhat.com, vschneid@redhat.com, tglx@linutronix.de,
dietmar.eggemann@arm.com, frederic@kernel.org,
longman@redhat.com
Subject: [PATCH 1/2] sched/fair: consider hk_mask early in triggering ilb
Date: Thu, 19 Mar 2026 12:23:13 +0530 [thread overview]
Message-ID: <20260319065314.343932-2-sshegde@linux.ibm.com> (raw)
In-Reply-To: <20260319065314.343932-1-sshegde@linux.ibm.com>
Current code around nohz_balancer_kick and kick_ilb:
1. Checks for nohz.idle_cpus_mask to see if idle load balance (ilb) is
needed.
2. Does a few checks to see if any conditions meet the criteria.
3. Tries to find the idle CPU. But the idle CPU found should be part of
housekeeping CPUs.
If there is no housekeeping idle CPU, then step 2 is done
unnecessarily, since step 3 bails out without doing the ilb.
Fix that by making the decision early and pass it on to find_new_ilb.
Use a per-CPU cpumask instead of allocating it every time, since this is
in the fast path.
If flags is set to NOHZ_STATS_KICK (because the time is after
nohz.next_blocked but before nohz.next_balance and there are idle CPUs
which are part of housekeeping), the same logic needs to be applied
there too.
While there, fix the stale comments around nohz.nr_cpus.
Signed-off-by: Shrikanth Hegde <sshegde@linux.ibm.com>
---
Didn't add the fixes tag since it addresses more than stale comments.
kernel/sched/fair.c | 45 +++++++++++++++++++++++++++++++--------------
1 file changed, 31 insertions(+), 14 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b19aeaa51ebc..02cca2c7a98d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7392,6 +7392,7 @@ static inline unsigned int cfs_h_nr_delayed(struct rq *rq)
static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
+static DEFINE_PER_CPU(cpumask_var_t, kick_ilb_tmpmask);
#ifdef CONFIG_NO_HZ_COMMON
@@ -12629,15 +12630,14 @@ static inline int on_null_domain(struct rq *rq)
* - When one of the busy CPUs notices that there may be an idle rebalancing
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
+ *
+ * @cpus: idle CPUs in the HK_TYPE_KERNEL_NOISE housekeeping set
*/
-static inline int find_new_ilb(void)
+static inline int find_new_ilb(struct cpumask *cpus)
{
- const struct cpumask *hk_mask;
int ilb_cpu;
- hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);
-
- for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) {
+ for_each_cpu(ilb_cpu, cpus) {
if (ilb_cpu == smp_processor_id())
continue;
@@ -12656,7 +12656,7 @@ static inline int find_new_ilb(void)
* We pick the first idle CPU in the HK_TYPE_KERNEL_NOISE housekeeping set
* (if there is one).
*/
-static void kick_ilb(unsigned int flags)
+static void kick_ilb(unsigned int flags, struct cpumask *cpus)
{
int ilb_cpu;
@@ -12667,7 +12667,7 @@ static void kick_ilb(unsigned int flags)
if (flags & NOHZ_BALANCE_KICK)
nohz.next_balance = jiffies+1;
- ilb_cpu = find_new_ilb();
+ ilb_cpu = find_new_ilb(cpus);
if (ilb_cpu < 0)
return;
@@ -12700,6 +12700,7 @@ static void kick_ilb(unsigned int flags)
*/
static void nohz_balancer_kick(struct rq *rq)
{
+ struct cpumask *ilb_cpus = this_cpu_cpumask_var_ptr(kick_ilb_tmpmask);
unsigned long now = jiffies;
struct sched_domain_shared *sds;
struct sched_domain *sd;
@@ -12715,27 +12716,41 @@ static void nohz_balancer_kick(struct rq *rq)
*/
nohz_balance_exit_idle(rq);
+ /* ILB considers only HK_TYPE_KERNEL_NOISE housekeeping CPUs */
+
if (READ_ONCE(nohz.has_blocked_load) &&
- time_after(now, READ_ONCE(nohz.next_blocked)))
+ time_after(now, READ_ONCE(nohz.next_blocked))) {
flags = NOHZ_STATS_KICK;
+ cpumask_and(ilb_cpus, nohz.idle_cpus_mask,
+ housekeeping_cpumask(HK_TYPE_KERNEL_NOISE));
+ }
/*
- * Most of the time system is not 100% busy. i.e nohz.nr_cpus > 0
- * Skip the read if time is not due.
+ * Most of the time the system is not 100% busy, i.e. there are idle
+ * housekeeping CPUs.
+ *
+ * So, skip reading idle_cpus_mask if time is not due.
*
* If none are in tickless mode, there maybe a narrow window
* (28 jiffies, HZ=1000) where flags maybe set and kick_ilb called.
* But idle load balancing is not done as find_new_ilb fails.
- * That's very rare. So read nohz.nr_cpus only if time is due.
+ * That's very rare. So check (idle_cpus_mask & HK_TYPE_KERNEL_NOISE)
+ * only if time is due.
+ *
*/
if (time_before(now, nohz.next_balance))
goto out;
+ /* Avoid the double computation */
+ if (flags != NOHZ_STATS_KICK)
+ cpumask_and(ilb_cpus, nohz.idle_cpus_mask,
+ housekeeping_cpumask(HK_TYPE_KERNEL_NOISE));
+
/*
* None are in tickless mode and hence no need for NOHZ idle load
* balancing
*/
- if (unlikely(cpumask_empty(nohz.idle_cpus_mask)))
+ if (unlikely(cpumask_empty(ilb_cpus)))
return;
if (rq->nr_running >= 2) {
@@ -12767,7 +12782,7 @@ static void nohz_balancer_kick(struct rq *rq)
* When balancing between cores, all the SMT siblings of the
* preferred CPU must be idle.
*/
- for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
+ for_each_cpu_and(i, sched_domain_span(sd), ilb_cpus) {
if (sched_asym(sd, i, cpu)) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
@@ -12820,7 +12835,7 @@ static void nohz_balancer_kick(struct rq *rq)
flags |= NOHZ_NEXT_KICK;
if (flags)
- kick_ilb(flags);
+ kick_ilb(flags, ilb_cpus);
}
static void set_cpu_sd_state_busy(int cpu)
@@ -14253,6 +14268,8 @@ __init void init_sched_fair_class(void)
zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i));
zalloc_cpumask_var_node(&per_cpu(should_we_balance_tmpmask, i),
GFP_KERNEL, cpu_to_node(i));
+ zalloc_cpumask_var_node(&per_cpu(kick_ilb_tmpmask, i),
+ GFP_KERNEL, cpu_to_node(i));
#ifdef CONFIG_CFS_BANDWIDTH
INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));
--
2.43.0
next prev parent reply other threads:[~2026-03-19 6:54 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-19 6:53 [PATCH 0/2] sched/fair: Minor improvements while triggering idle load balance Shrikanth Hegde
2026-03-19 6:53 ` Shrikanth Hegde [this message]
2026-03-19 8:15 ` [PATCH 1/2] sched/fair: consider hk_mask early in triggering ilb Mukesh Kumar Chaurasiya
2026-03-19 13:13 ` Shrikanth Hegde
2026-03-19 22:58 ` Shubhang Kaushik
2026-03-20 2:47 ` Shrikanth Hegde
2026-03-20 3:37 ` K Prateek Nayak
2026-03-20 9:19 ` Shrikanth Hegde
2026-03-20 11:43 ` Peter Zijlstra
2026-03-20 14:12 ` Shrikanth Hegde
2026-03-20 14:28 ` Shrikanth Hegde
2026-03-19 6:53 ` [PATCH 2/2] sched/fair: get this cpu once in find_new_ilb Shrikanth Hegde
2026-03-19 8:18 ` Mukesh Kumar Chaurasiya
2026-03-19 9:20 ` Peter Zijlstra
2026-03-19 13:03 ` Shrikanth Hegde
2026-03-19 13:39 ` Peter Zijlstra
2026-03-20 3:40 ` K Prateek Nayak
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260319065314.343932-2-sshegde@linux.ibm.com \
--to=sshegde@linux.ibm.com \
--cc=dietmar.eggemann@arm.com \
--cc=frederic@kernel.org \
--cc=juri.lelli@redhat.com \
--cc=kprateek.nayak@amd.com \
--cc=linux-kernel@vger.kernel.org \
--cc=longman@redhat.com \
--cc=mingo@kernel.org \
--cc=peterz@infradead.org \
--cc=tglx@linutronix.de \
--cc=vincent.guittot@linaro.org \
--cc=vschneid@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox