* Re: [RESEND PATCH] sched/fair: Skip sched_balance_running cmpxchg when balance is not due
2025-10-14 9:24 ` Peter Zijlstra
@ 2025-10-14 9:33 ` Shrikanth Hegde
2025-10-14 9:42 ` Peter Zijlstra
2025-10-14 13:50 ` Srikar Dronamraju
` (2 subsequent siblings)
3 siblings, 1 reply; 19+ messages in thread
From: Shrikanth Hegde @ 2025-10-14 9:33 UTC (permalink / raw)
To: Peter Zijlstra, Tim Chen
Cc: Ingo Molnar, Chen Yu, Doug Nelson, Mohini Narkhede, linux-kernel,
Vincent Guittot, K Prateek Nayak
On 10/14/25 2:54 PM, Peter Zijlstra wrote:
> On Mon, Oct 13, 2025 at 02:54:19PM -0700, Tim Chen wrote:
>
>>> So I'm not sure I understand the situation, @continue_balancing should
>>> limit this concurrency to however many groups are on this domain -- your
>>> granite thing with SNC on would have something like 6 groups?
>>
>> That's a good point. But I think the contention is worse than
>> 6 CPUs.
>>
>> The hierarchy would be
>>
>> SMT
>> NUMA-level1
>> NUMA-level2
>> NUMA-level3
>> NUMA-level4
>
> Aren't you missing the LLC/NODE domain here? We should have at least one
> !SD_NUMA domain above SMT.
>
>> There would be multiple CPUs in that are first in the SMT group
>> with continue_balancing=1 going up in the hierarchy and
>> attempting the cmpxchg in the first NUMA domain level,
>> before calling should_we_balance() and finding that they are
>> not the first in the NUMA domain and set continue_balancing=0
>> and abort. Those CPUs are in the same L3.
>> But at the same time, there could be CPUs in other sockets
>> cmpxchg on sched_balance_running.
>
> Right, Yu Chen said something like that as well, should_we_balance() is
> too late.
>
> Should we instead move the whole serialize thing inside
> sched_balance_rq() like so:
>
> ---
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index bc0b7ce8a65d..e9f719ba17e1 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -11722,6 +11722,22 @@ static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd
> }
> }
>
> +
> +/*
> + * This flag serializes load-balancing passes over large domains
> + * (above the NODE topology level) - only one load-balancing instance
> + * may run at a time, to reduce overhead on very large systems with
> + * lots of CPUs and large NUMA distances.
> + *
> + * - Note that load-balancing passes triggered while another one
> + * is executing are skipped and not re-tried.
> + *
> + * - Also note that this does not serialize rebalance_domains()
> + * execution, as non-SD_SERIALIZE domains will still be
> + * load-balanced in parallel.
> + */
> +static atomic_t sched_balance_running = ATOMIC_INIT(0);
> +
> /*
> * Check this_cpu to ensure it is balanced within domain. Attempt to move
> * tasks if there is an imbalance.
> @@ -11747,6 +11763,7 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq,
> .fbq_type = all,
> .tasks = LIST_HEAD_INIT(env.tasks),
> };
> + int need_unlock = false;
>
> cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
>
> @@ -11758,6 +11775,12 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq,
> goto out_balanced;
> }
>
> + if (idle != CPU_NEWLY_IDLE && (sd->flags & SD_SERIALIZE)) {
> + if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1))
> + goto out_balanced;
Maybe goto out instead of out_balanced ?
> + need_unlock = true;
> + }
> +
> group = sched_balance_find_src_group(&env);
> if (!group) {
> schedstat_inc(sd->lb_nobusyg[idle]);
> @@ -11998,6 +12021,9 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq,
> sd->balance_interval < sd->max_interval)
> sd->balance_interval *= 2;
> out:
> + if (need_unlock)
> + atomic_set_release(&sched_balance_running, 0);
> +
> return ld_moved;
> }
>
> @@ -12122,21 +12148,6 @@ static int active_load_balance_cpu_stop(void *data)
> return 0;
> }
>
> -/*
> - * This flag serializes load-balancing passes over large domains
> - * (above the NODE topology level) - only one load-balancing instance
> - * may run at a time, to reduce overhead on very large systems with
> - * lots of CPUs and large NUMA distances.
> - *
> - * - Note that load-balancing passes triggered while another one
> - * is executing are skipped and not re-tried.
> - *
> - * - Also note that this does not serialize rebalance_domains()
> - * execution, as non-SD_SERIALIZE domains will still be
> - * load-balanced in parallel.
> - */
> -static atomic_t sched_balance_running = ATOMIC_INIT(0);
> -
> /*
> * Scale the max sched_balance_rq interval with the number of CPUs in the system.
> * This trades load-balance latency on larger machines for less cross talk.
> @@ -12192,7 +12203,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
> /* Earliest time when we have to do rebalance again */
> unsigned long next_balance = jiffies + 60*HZ;
> int update_next_balance = 0;
> - int need_serialize, need_decay = 0;
> + int need_decay = 0;
> u64 max_cost = 0;
>
> rcu_read_lock();
> @@ -12216,13 +12227,6 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
> }
>
> interval = get_sd_balance_interval(sd, busy);
> -
> - need_serialize = sd->flags & SD_SERIALIZE;
> - if (need_serialize) {
> - if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1))
> - goto out;
> - }
> -
> if (time_after_eq(jiffies, sd->last_balance + interval)) {
> if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) {
> /*
> @@ -12236,9 +12240,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
> sd->last_balance = jiffies;
> interval = get_sd_balance_interval(sd, busy);
> }
> - if (need_serialize)
> - atomic_set_release(&sched_balance_running, 0);
> -out:
> +
> if (time_after(next_balance, sd->last_balance + interval)) {
> next_balance = sd->last_balance + interval;
> update_next_balance = 1;
^ permalink raw reply [flat|nested] 19+ messages in thread* Re: [RESEND PATCH] sched/fair: Skip sched_balance_running cmpxchg when balance is not due
2025-10-14 9:33 ` Shrikanth Hegde
@ 2025-10-14 9:42 ` Peter Zijlstra
2025-10-14 9:51 ` Shrikanth Hegde
2025-10-16 14:03 ` Shrikanth Hegde
0 siblings, 2 replies; 19+ messages in thread
From: Peter Zijlstra @ 2025-10-14 9:42 UTC (permalink / raw)
To: Shrikanth Hegde
Cc: Tim Chen, Ingo Molnar, Chen Yu, Doug Nelson, Mohini Narkhede,
linux-kernel, Vincent Guittot, K Prateek Nayak
On Tue, Oct 14, 2025 at 03:03:41PM +0530, Shrikanth Hegde wrote:
> > @@ -11758,6 +11775,12 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq,
> > goto out_balanced;
> > }
> > + if (idle != CPU_NEWLY_IDLE && (sd->flags & SD_SERIALIZE)) {
> > + if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1))
> > + goto out_balanced;
>
> Maybe goto out instead of out_balanced ?
That would be inconsistent with the !should_we_balance() goto
out_balanced right above this, no?
> > + need_unlock = true;
> > + }
> > +
> > group = sched_balance_find_src_group(&env);
> > if (!group) {
> > schedstat_inc(sd->lb_nobusyg[idle]);
> > @@ -11998,6 +12021,9 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq,
> > sd->balance_interval < sd->max_interval)
> > sd->balance_interval *= 2;
> > out:
> > + if (need_unlock)
> > + atomic_set_release(&sched_balance_running, 0);
> > +
> > return ld_moved;
> > }
> > @@ -12122,21 +12148,6 @@ static int active_load_balance_cpu_stop(void *data)
> > return 0;
> > }
> > -/*
> > - * This flag serializes load-balancing passes over large domains
> > - * (above the NODE topology level) - only one load-balancing instance
> > - * may run at a time, to reduce overhead on very large systems with
> > - * lots of CPUs and large NUMA distances.
> > - *
> > - * - Note that load-balancing passes triggered while another one
> > - * is executing are skipped and not re-tried.
> > - *
> > - * - Also note that this does not serialize rebalance_domains()
> > - * execution, as non-SD_SERIALIZE domains will still be
> > - * load-balanced in parallel.
> > - */
> > -static atomic_t sched_balance_running = ATOMIC_INIT(0);
> > -
> > /*
> > * Scale the max sched_balance_rq interval with the number of CPUs in the system.
> > * This trades load-balance latency on larger machines for less cross talk.
> > @@ -12192,7 +12203,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
> > /* Earliest time when we have to do rebalance again */
> > unsigned long next_balance = jiffies + 60*HZ;
> > int update_next_balance = 0;
> > - int need_serialize, need_decay = 0;
> > + int need_decay = 0;
> > u64 max_cost = 0;
> > rcu_read_lock();
> > @@ -12216,13 +12227,6 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
> > }
> > interval = get_sd_balance_interval(sd, busy);
> > -
> > - need_serialize = sd->flags & SD_SERIALIZE;
> > - if (need_serialize) {
> > - if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1))
> > - goto out;
> > - }
> > -
> > if (time_after_eq(jiffies, sd->last_balance + interval)) {
> > if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) {
> > /*
> > @@ -12236,9 +12240,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
> > sd->last_balance = jiffies;
> > interval = get_sd_balance_interval(sd, busy);
> > }
> > - if (need_serialize)
> > - atomic_set_release(&sched_balance_running, 0);
> > -out:
> > +
> > if (time_after(next_balance, sd->last_balance + interval)) {
> > next_balance = sd->last_balance + interval;
> > update_next_balance = 1;
>
^ permalink raw reply [flat|nested] 19+ messages in thread* Re: [RESEND PATCH] sched/fair: Skip sched_balance_running cmpxchg when balance is not due
2025-10-14 9:42 ` Peter Zijlstra
@ 2025-10-14 9:51 ` Shrikanth Hegde
2025-10-16 14:03 ` Shrikanth Hegde
1 sibling, 0 replies; 19+ messages in thread
From: Shrikanth Hegde @ 2025-10-14 9:51 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Tim Chen, Ingo Molnar, Chen Yu, Doug Nelson, Mohini Narkhede,
linux-kernel, Vincent Guittot, K Prateek Nayak
On 10/14/25 3:12 PM, Peter Zijlstra wrote:
> On Tue, Oct 14, 2025 at 03:03:41PM +0530, Shrikanth Hegde wrote:
>
>>> @@ -11758,6 +11775,12 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq,
>>> goto out_balanced;
>>> }
>>> + if (idle != CPU_NEWLY_IDLE && (sd->flags & SD_SERIALIZE)) {
>>> + if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1))
>>> + goto out_balanced;
>>
>> Maybe goto out instead of out_balanced ?
>
> That would be inconsistent with the !should_we_balance() goto
> out_balanced right above this, no?
Yes. But what's the reason for saying out_balanced for !should_we_balance?
Load balance wasn't even attempted there right? Could this be updating it wrongly?
At least the comments around out_all_pinned don't make sense if we came here via !swb
schedstat_inc(sd->lb_balanced[idle]);
sd->nr_balance_failed = 0;
>
>>> + need_unlock = true;
>>> + }
>>> +
>>> group = sched_balance_find_src_group(&env);
>>> if (!group) {
>>> schedstat_inc(sd->lb_nobusyg[idle]);
>>> @@ -11998,6 +12021,9 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq,
>>> sd->balance_interval < sd->max_interval)
>>> sd->balance_interval *= 2;
>>> out:
>>> + if (need_unlock)
>>> + atomic_set_release(&sched_balance_running, 0);
>>> +
>>> return ld_moved;
>>> }
>>> @@ -12122,21 +12148,6 @@ static int active_load_balance_cpu_stop(void *data)
>>> return 0;
>>> }
>>> -/*
>>> - * This flag serializes load-balancing passes over large domains
>>> - * (above the NODE topology level) - only one load-balancing instance
>>> - * may run at a time, to reduce overhead on very large systems with
>>> - * lots of CPUs and large NUMA distances.
>>> - *
>>> - * - Note that load-balancing passes triggered while another one
>>> - * is executing are skipped and not re-tried.
>>> - *
>>> - * - Also note that this does not serialize rebalance_domains()
>>> - * execution, as non-SD_SERIALIZE domains will still be
>>> - * load-balanced in parallel.
>>> - */
>>> -static atomic_t sched_balance_running = ATOMIC_INIT(0);
>>> -
>>> /*
>>> * Scale the max sched_balance_rq interval with the number of CPUs in the system.
>>> * This trades load-balance latency on larger machines for less cross talk.
>>> @@ -12192,7 +12203,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
>>> /* Earliest time when we have to do rebalance again */
>>> unsigned long next_balance = jiffies + 60*HZ;
>>> int update_next_balance = 0;
>>> - int need_serialize, need_decay = 0;
>>> + int need_decay = 0;
>>> u64 max_cost = 0;
>>> rcu_read_lock();
>>> @@ -12216,13 +12227,6 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
>>> }
>>> interval = get_sd_balance_interval(sd, busy);
>>> -
>>> - need_serialize = sd->flags & SD_SERIALIZE;
>>> - if (need_serialize) {
>>> - if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1))
>>> - goto out;
>>> - }
>>> -
>>> if (time_after_eq(jiffies, sd->last_balance + interval)) {
>>> if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) {
>>> /*
>>> @@ -12236,9 +12240,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
>>> sd->last_balance = jiffies;
>>> interval = get_sd_balance_interval(sd, busy);
>>> }
>>> - if (need_serialize)
>>> - atomic_set_release(&sched_balance_running, 0);
>>> -out:
>>> +
>>> if (time_after(next_balance, sd->last_balance + interval)) {
>>> next_balance = sd->last_balance + interval;
>>> update_next_balance = 1;
>>
^ permalink raw reply [flat|nested] 19+ messages in thread* Re: [RESEND PATCH] sched/fair: Skip sched_balance_running cmpxchg when balance is not due
2025-10-14 9:42 ` Peter Zijlstra
2025-10-14 9:51 ` Shrikanth Hegde
@ 2025-10-16 14:03 ` Shrikanth Hegde
2025-10-22 17:42 ` Shrikanth Hegde
1 sibling, 1 reply; 19+ messages in thread
From: Shrikanth Hegde @ 2025-10-16 14:03 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Tim Chen, Ingo Molnar, Chen Yu, Doug Nelson, Mohini Narkhede,
linux-kernel, Vincent Guittot, K Prateek Nayak
On 10/14/25 3:12 PM, Peter Zijlstra wrote:
> On Tue, Oct 14, 2025 at 03:03:41PM +0530, Shrikanth Hegde wrote:
>
>>> @@ -11758,6 +11775,12 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq,
>>> goto out_balanced;
>>> }
>>> + if (idle != CPU_NEWLY_IDLE && (sd->flags & SD_SERIALIZE)) {
>>> + if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1))
>>> + goto out_balanced;
>>
>> Maybe goto out instead of out_balanced ?
>
> That would be inconsistent with the !should_we_balance() goto
> out_balanced right above this, no?
>
Hi Peter.
Did similar probe points numbers compared to this. Even the patch is quite similar to what
was suggested there a while ago.
https://lore.kernel.org/all/41e11090-a100-48a7-a0dd-c989772822d7@linux.ibm.com/
480 CPUs system with 6 NUMA nodes. (different system than last time)
tl;dr
- Number of times sched_balance_running is taken is way less after the swb check. (which is great)
- Number of times it fails to set is very low after swb. (So out_balanced vs out may not make a
significant difference.)
- Patch is at the end. It is this patch + redo stuff + (ref_variable_stuff(ignore))
--- detailed log----
++++++++++++ probe points +++++++++++++++
(added a ref("crap") so i could put a probe where i want )
0 static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
...
20 max_cost += sd->max_newidle_lb_cost;
/*
* Stop the load balance at this level. There is another
* CPU in our sched group which is doing load balancing more
* actively.
*/
if (!continue_balancing) {
if (need_decay)
continue;
break;
}
33 if (sd->flags & SD_SERIALIZE)
34 ref = ref + 5;
<sched_balance_rq@/home/shrikanth/sched_tip/kernel/sched/fair.c:0>
0 static int sched_balance_rq(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
int *continue_balancing)
...
int need_unlock = false;
cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
25 schedstat_inc(sd->lb_count[idle]);
...
34 if (idle != CPU_NEWLY_IDLE && (sd->flags & SD_SERIALIZE)) {
35 if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1)) {
36 ref = ref+1;
37 goto out_balanced;
}
39 ref = ref + 2;
40 need_unlock = true;
...
env.loop_break = SCHED_NR_MIGRATE_BREAK;
167 if (need_unlock) {
168 ref = ref+3;
169 atomic_set_release(&sched_balance_running, 0);
}
goto redo;
...
out:
287 if (need_unlock) {
288 ref = ref +4;
289 atomic_set_release(&sched_balance_running, 0);
}
return ld_moved;
probe:sched_balance_domains_L34 (on sched_balance_domains:34@kernel/sched/fair.c)
probe:sched_balance_rq_L168 (on sched_balance_rq:168@kernel/sched/fair.c)
probe:sched_balance_rq_L21 (on sched_balance_rq+312@kernel/sched/fair.c)
probe:sched_balance_rq_L288 (on sched_balance_rq+312@kernel/sched/fair.c)
probe:sched_balance_rq_L35 (on sched_balance_rq+312@kernel/sched/fair.c)
probe:sched_balance_rq_L36 (on sched_balance_rq+312@kernel/sched/fair.c)
probe:sched_balance_rq_L39 (on sched_balance_rq+312@kernel/sched/fair.c)
+++++++++++ Data on various load points ++++++++++++++++++++++++
--- idle ---
perf stat -a -e probe:* sleep 10
6,123 probe:sched_balance_domains_L34
10,378 probe:sched_balance_rq_L21
79 probe:sched_balance_rq_L35
17 probe:sched_balance_rq_L36
62 probe:sched_balance_rq_L39
0 probe:sched_balance_rq_L168
62 probe:sched_balance_rq_L288
--- 25% load ---
perf stat -a -e probe:* stress-ng --cpu=480 -l 25 -t 10
510,551 probe:sched_balance_domains_L34
303,892 probe:sched_balance_rq_L21
442 probe:sched_balance_rq_L35
3 probe:sched_balance_rq_L36
439 probe:sched_balance_rq_L39
0 probe:sched_balance_rq_L168
439 probe:sched_balance_rq_L288
--- 50% load ---
248,969 probe:sched_balance_domains_L34
187,864 probe:sched_balance_rq_L21
926 probe:sched_balance_rq_L35
6 probe:sched_balance_rq_L36
920 probe:sched_balance_rq_L39
0 probe:sched_balance_rq_L168
920 probe:sched_balance_rq_L288
--- 75% load ---
110,294 probe:sched_balance_domains_L34
71,568 probe:sched_balance_rq_L21
861 probe:sched_balance_rq_L35
6 probe:sched_balance_rq_L36
855 probe:sched_balance_rq_L39
0 probe:sched_balance_rq_L168
855 probe:sched_balance_rq_L288
--- 100% load ---
85,960 probe:sched_balance_domains_L34
48,169 probe:sched_balance_rq_L21
71 probe:sched_balance_rq_L35
4 probe:sched_balance_rq_L36
67 probe:sched_balance_rq_L39
0 probe:sched_balance_rq_L168
67 probe:sched_balance_rq_L288
++++++++++++++++++ patch ++++++++++++++++++++++++++++++++++++
(ignore ref crap)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cee1793e8277..832104705500 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -11722,10 +11722,29 @@ static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd
}
}
+
+/*
+ * This flag serializes load-balancing passes over large domains
+ * (above the NODE topology level) - only one load-balancing instance
+ * may run at a time, to reduce overhead on very large systems with
+ * lots of CPUs and large NUMA distances.
+ *
+ * - Note that load-balancing passes triggered while another one
+ * is executing are skipped and not re-tried.
+ *
+ * - Also note that this does not serialize rebalance_domains()
+ * execution, as non-SD_SERIALIZE domains will still be
+ * load-balanced in parallel.
+ */
+static atomic_t sched_balance_running = ATOMIC_INIT(0);
+
/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
*/
+
+int ref = 0;
+
static int sched_balance_rq(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
int *continue_balancing)
@@ -11747,10 +11766,12 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq,
.fbq_type = all,
.tasks = LIST_HEAD_INIT(env.tasks),
};
+ int need_unlock = false;
cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
schedstat_inc(sd->lb_count[idle]);
+ ref = 1;
redo:
if (!should_we_balance(&env)) {
@@ -11758,6 +11779,15 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq,
goto out_balanced;
}
+ if (idle != CPU_NEWLY_IDLE && (sd->flags & SD_SERIALIZE)) {
+ if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1)) {
+ ref = ref+1;
+ goto out_balanced;
+ }
+ ref = ref + 2;
+ need_unlock = true;
+ }
+
group = sched_balance_find_src_group(&env);
if (!group) {
schedstat_inc(sd->lb_nobusyg[idle]);
@@ -11882,6 +11912,10 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq,
if (!cpumask_subset(cpus, env.dst_grpmask)) {
env.loop = 0;
env.loop_break = SCHED_NR_MIGRATE_BREAK;
+ if (need_unlock) {
+ ref = ref+3;
+ atomic_set_release(&sched_balance_running, 0);
+ }
goto redo;
}
goto out_all_pinned;
@@ -11998,6 +12032,11 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq,
sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2;
out:
+ if (need_unlock) {
+ ref = ref +4;
+ atomic_set_release(&sched_balance_running, 0);
+ }
+
return ld_moved;
}
@@ -12122,21 +12161,6 @@ static int active_load_balance_cpu_stop(void *data)
return 0;
}
-/*
- * This flag serializes load-balancing passes over large domains
- * (above the NODE topology level) - only one load-balancing instance
- * may run at a time, to reduce overhead on very large systems with
- * lots of CPUs and large NUMA distances.
- *
- * - Note that load-balancing passes triggered while another one
- * is executing are skipped and not re-tried.
- *
- * - Also note that this does not serialize rebalance_domains()
- * execution, as non-SD_SERIALIZE domains will still be
- * load-balanced in parallel.
- */
-static atomic_t sched_balance_running = ATOMIC_INIT(0);
-
/*
* Scale the max sched_balance_rq interval with the number of CPUs in the system.
* This trades load-balance latency on larger machines for less cross talk.
@@ -12192,7 +12216,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
/* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
int update_next_balance = 0;
- int need_serialize, need_decay = 0;
+ int need_decay = 0;
u64 max_cost = 0;
rcu_read_lock();
@@ -12215,14 +12239,10 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
break;
}
- interval = get_sd_balance_interval(sd, busy);
-
- need_serialize = sd->flags & SD_SERIALIZE;
- if (need_serialize) {
- if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1))
- goto out;
- }
+ if (sd->flags & SD_SERIALIZE)
+ ref = ref + 5;
+ interval = get_sd_balance_interval(sd, busy);
if (time_after_eq(jiffies, sd->last_balance + interval)) {
if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) {
/*
@@ -12236,9 +12256,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
sd->last_balance = jiffies;
interval = get_sd_balance_interval(sd, busy);
}
- if (need_serialize)
- atomic_set_release(&sched_balance_running, 0);
-out:
+
if (time_after(next_balance, sd->last_balance + interval)) {
next_balance = sd->last_balance + interval;
update_next_balance = 1;
^ permalink raw reply related [flat|nested] 19+ messages in thread* Re: [RESEND PATCH] sched/fair: Skip sched_balance_running cmpxchg when balance is not due
2025-10-16 14:03 ` Shrikanth Hegde
@ 2025-10-22 17:42 ` Shrikanth Hegde
0 siblings, 0 replies; 19+ messages in thread
From: Shrikanth Hegde @ 2025-10-22 17:42 UTC (permalink / raw)
To: Peter Zijlstra, Tim Chen
Cc: Ingo Molnar, Chen Yu, Doug Nelson, Mohini Narkhede, linux-kernel,
Vincent Guittot, K Prateek Nayak
On 10/16/25 7:33 PM, Shrikanth Hegde wrote:
>
>
> On 10/14/25 3:12 PM, Peter Zijlstra wrote:
>> On Tue, Oct 14, 2025 at 03:03:41PM +0530, Shrikanth Hegde wrote:
>>
>>>> @@ -11758,6 +11775,12 @@ static int sched_balance_rq(int this_cpu,
>>>> struct rq *this_rq,
>>>> goto out_balanced;
>>>> }
>>>> + if (idle != CPU_NEWLY_IDLE && (sd->flags & SD_SERIALIZE)) {
>>>> + if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1))
>>>> + goto out_balanced;
>>>
>>> Maybe goto out instead of out_balanced ?
>>
>> That would be inconsistent with the !should_we_balance() goto
>> out_balanced right above this, no?
>>
> Hi Peter.
>
> Did similar probe points numbers compared to this. Even the patch is
> quite similar to what
> was suggested there a while ago.
> https://lore.kernel.org/all/41e11090-a100-48a7-a0dd-
> c989772822d7@linux.ibm.com/
>
> 480 CPUs system with 6 NUMA nodes. (different system than last time)
>
Hi Peter, Tim.
How are we proceeding here? Should I send the below patch with a changelog, or
is it being worked out by either of you?
It is really beneficial for large systems to avoid unnecessary cache bouncing.
> tl;dr
>
> - Number of time sched_balance_running is taken is way less after the
> swb check. (which is great)
> - Number of time it fails to set is very low after swb. (So out_balanced
> vs out may not make a
> significant difference.)
> - Patch is at the end. It is this patch + redo stuff +
> (ref_variable_stuff(ignore))
>
>
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RESEND PATCH] sched/fair: Skip sched_balance_running cmpxchg when balance is not due
2025-10-14 9:24 ` Peter Zijlstra
2025-10-14 9:33 ` Shrikanth Hegde
@ 2025-10-14 13:50 ` Srikar Dronamraju
2025-10-14 13:59 ` Peter Zijlstra
2025-10-14 14:28 ` Shrikanth Hegde
2025-10-14 18:05 ` Tim Chen
3 siblings, 1 reply; 19+ messages in thread
From: Srikar Dronamraju @ 2025-10-14 13:50 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Tim Chen, Ingo Molnar, Chen Yu, Doug Nelson, Mohini Narkhede,
linux-kernel, Vincent Guittot, Shrikanth Hegde, K Prateek Nayak
* Peter Zijlstra <peterz@infradead.org> [2025-10-14 11:24:36]:
> On Mon, Oct 13, 2025 at 02:54:19PM -0700, Tim Chen wrote:
>
>
> Right, Yu Chen said something like that as well, should_we_balance() is
> too late.
>
> Should we instead move the whole serialize thing inside
> sched_balance_rq() like so:
>
> @@ -12122,21 +12148,6 @@ static int active_load_balance_cpu_stop(void *data)
> return 0;
> }
>
> -/*
> - * This flag serializes load-balancing passes over large domains
> - * (above the NODE topology level) - only one load-balancing instance
> - * may run at a time, to reduce overhead on very large systems with
> - * lots of CPUs and large NUMA distances.
> - *
> - * - Note that load-balancing passes triggered while another one
> - * is executing are skipped and not re-tried.
> - *
> - * - Also note that this does not serialize rebalance_domains()
> - * execution, as non-SD_SERIALIZE domains will still be
> - * load-balanced in parallel.
> - */
> -static atomic_t sched_balance_running = ATOMIC_INIT(0);
> -
> /*
> * Scale the max sched_balance_rq interval with the number of CPUs in the system.
> * This trades load-balance latency on larger machines for less cross talk.
> @@ -12192,7 +12203,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
> /* Earliest time when we have to do rebalance again */
> unsigned long next_balance = jiffies + 60*HZ;
> int update_next_balance = 0;
> - int need_serialize, need_decay = 0;
> + int need_decay = 0;
> u64 max_cost = 0;
>
> rcu_read_lock();
> @@ -12216,13 +12227,6 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
> }
>
> interval = get_sd_balance_interval(sd, busy);
> -
> - need_serialize = sd->flags & SD_SERIALIZE;
> - if (need_serialize) {
> - if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1))
> - goto out;
> - }
> -
> if (time_after_eq(jiffies, sd->last_balance + interval)) {
> if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) {
> /*
> @@ -12236,9 +12240,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
> sd->last_balance = jiffies;
> interval = get_sd_balance_interval(sd, busy);
> }
> - if (need_serialize)
> - atomic_set_release(&sched_balance_running, 0);
> -out:
> +
> if (time_after(next_balance, sd->last_balance + interval)) {
> next_balance = sd->last_balance + interval;
> update_next_balance = 1;
I think this is better since previously the one CPU which was not supposed to
do the balancing may increment the atomic variable. If the CPU that was
supposed to do the balance now tries, it may fail since the variable was not
yet decremented.
--
Thanks and Regards
Srikar Dronamraju
^ permalink raw reply [flat|nested] 19+ messages in thread* Re: [RESEND PATCH] sched/fair: Skip sched_balance_running cmpxchg when balance is not due
2025-10-14 13:50 ` Srikar Dronamraju
@ 2025-10-14 13:59 ` Peter Zijlstra
0 siblings, 0 replies; 19+ messages in thread
From: Peter Zijlstra @ 2025-10-14 13:59 UTC (permalink / raw)
To: Srikar Dronamraju
Cc: Tim Chen, Ingo Molnar, Chen Yu, Doug Nelson, Mohini Narkhede,
linux-kernel, Vincent Guittot, Shrikanth Hegde, K Prateek Nayak
On Tue, Oct 14, 2025 at 07:20:35PM +0530, Srikar Dronamraju wrote:
> * Peter Zijlstra <peterz@infradead.org> [2025-10-14 11:24:36]:
>
> > On Mon, Oct 13, 2025 at 02:54:19PM -0700, Tim Chen wrote:
> >
> >
> > Right, Yu Chen said something like that as well, should_we_balance() is
> > too late.
> >
> > Should we instead move the whole serialize thing inside
> > sched_balance_rq() like so:
> >
> > @@ -12122,21 +12148,6 @@ static int active_load_balance_cpu_stop(void *data)
> > return 0;
> > }
> >
> > -/*
> > - * This flag serializes load-balancing passes over large domains
> > - * (above the NODE topology level) - only one load-balancing instance
> > - * may run at a time, to reduce overhead on very large systems with
> > - * lots of CPUs and large NUMA distances.
> > - *
> > - * - Note that load-balancing passes triggered while another one
> > - * is executing are skipped and not re-tried.
> > - *
> > - * - Also note that this does not serialize rebalance_domains()
> > - * execution, as non-SD_SERIALIZE domains will still be
> > - * load-balanced in parallel.
> > - */
> > -static atomic_t sched_balance_running = ATOMIC_INIT(0);
> > -
> > /*
> > * Scale the max sched_balance_rq interval with the number of CPUs in the system.
> > * This trades load-balance latency on larger machines for less cross talk.
> > @@ -12192,7 +12203,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
> > /* Earliest time when we have to do rebalance again */
> > unsigned long next_balance = jiffies + 60*HZ;
> > int update_next_balance = 0;
> > - int need_serialize, need_decay = 0;
> > + int need_decay = 0;
> > u64 max_cost = 0;
> >
> > rcu_read_lock();
> > @@ -12216,13 +12227,6 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
> > }
> >
> > interval = get_sd_balance_interval(sd, busy);
> > -
> > - need_serialize = sd->flags & SD_SERIALIZE;
> > - if (need_serialize) {
> > - if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1))
> > - goto out;
> > - }
> > -
> > if (time_after_eq(jiffies, sd->last_balance + interval)) {
> > if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) {
> > /*
> > @@ -12236,9 +12240,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
> > sd->last_balance = jiffies;
> > interval = get_sd_balance_interval(sd, busy);
> > }
> > - if (need_serialize)
> > - atomic_set_release(&sched_balance_running, 0);
> > -out:
> > +
> > if (time_after(next_balance, sd->last_balance + interval)) {
> > next_balance = sd->last_balance + interval;
> > update_next_balance = 1;
>
> I think this is better since previously the one CPU which was not suppose to
> do the balancing may increment the atomic variable. If the CPU, that was
> suppose to do the balance now tries it may fail since the variable was not
> yet decremented.
Right, it would do that acquire and then still have at least 2 ways to
not actually balance, which is a waste.
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RESEND PATCH] sched/fair: Skip sched_balance_running cmpxchg when balance is not due
2025-10-14 9:24 ` Peter Zijlstra
2025-10-14 9:33 ` Shrikanth Hegde
2025-10-14 13:50 ` Srikar Dronamraju
@ 2025-10-14 14:28 ` Shrikanth Hegde
2025-10-14 18:05 ` Tim Chen
3 siblings, 0 replies; 19+ messages in thread
From: Shrikanth Hegde @ 2025-10-14 14:28 UTC (permalink / raw)
To: Peter Zijlstra, Tim Chen
Cc: Ingo Molnar, Chen Yu, Doug Nelson, Mohini Narkhede, linux-kernel,
Vincent Guittot, K Prateek Nayak
On 10/14/25 2:54 PM, Peter Zijlstra wrote:
>
> Should we instead move the whole serialize thing inside
> sched_balance_rq() like so:
>
> ---
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index bc0b7ce8a65d..e9f719ba17e1 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -11722,6 +11722,22 @@ static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd
> }
> }
>
> +
> +/*
> + * This flag serializes load-balancing passes over large domains
> + * (above the NODE topology level) - only one load-balancing instance
> + * may run at a time, to reduce overhead on very large systems with
> + * lots of CPUs and large NUMA distances.
> + *
> + * - Note that load-balancing passes triggered while another one
> + * is executing are skipped and not re-tried.
> + *
> + * - Also note that this does not serialize rebalance_domains()
> + * execution, as non-SD_SERIALIZE domains will still be
> + * load-balanced in parallel.
> + */
> +static atomic_t sched_balance_running = ATOMIC_INIT(0);
> +
> /*
> * Check this_cpu to ensure it is balanced within domain. Attempt to move
> * tasks if there is an imbalance.
> @@ -11747,6 +11763,7 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq,
> .fbq_type = all,
> .tasks = LIST_HEAD_INIT(env.tasks),
> };
> + int need_unlock = false;
>
> cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
>
> @@ -11758,6 +11775,12 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq,
> goto out_balanced;
> }
>
> + if (idle != CPU_NEWLY_IDLE && (sd->flags & SD_SERIALIZE)) {
> + if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1))
> + goto out_balanced;
> + need_unlock = true;
> + }
> +
> group = sched_balance_find_src_group(&env);
> if (!group) {
> schedstat_inc(sd->lb_nobusyg[idle]);
> @@ -11998,6 +12021,9 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq,
> sd->balance_interval < sd->max_interval)
> sd->balance_interval *= 2;
> out:
> + if (need_unlock)
> + atomic_set_release(&sched_balance_running, 0);
> +
> return ld_moved;
> }
>
> @@ -12122,21 +12148,6 @@ static int active_load_balance_cpu_stop(void *data)
> return 0;
> }
>
> -/*
> - * This flag serializes load-balancing passes over large domains
> - * (above the NODE topology level) - only one load-balancing instance
> - * may run at a time, to reduce overhead on very large systems with
> - * lots of CPUs and large NUMA distances.
> - *
> - * - Note that load-balancing passes triggered while another one
> - * is executing are skipped and not re-tried.
> - *
> - * - Also note that this does not serialize rebalance_domains()
> - * execution, as non-SD_SERIALIZE domains will still be
> - * load-balanced in parallel.
> - */
> -static atomic_t sched_balance_running = ATOMIC_INIT(0);
> -
> /*
> * Scale the max sched_balance_rq interval with the number of CPUs in the system.
> * This trades load-balance latency on larger machines for less cross talk.
> @@ -12192,7 +12203,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
> /* Earliest time when we have to do rebalance again */
> unsigned long next_balance = jiffies + 60*HZ;
> int update_next_balance = 0;
> - int need_serialize, need_decay = 0;
> + int need_decay = 0;
> u64 max_cost = 0;
>
> rcu_read_lock();
> @@ -12216,13 +12227,6 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
> }
>
> interval = get_sd_balance_interval(sd, busy);
> -
> - need_serialize = sd->flags & SD_SERIALIZE;
> - if (need_serialize) {
> - if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1))
> - goto out;
> - }
> -
> if (time_after_eq(jiffies, sd->last_balance + interval)) {
> if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) {
> /*
> @@ -12236,9 +12240,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
> sd->last_balance = jiffies;
> interval = get_sd_balance_interval(sd, busy);
> }
> - if (need_serialize)
> - atomic_set_release(&sched_balance_running, 0);
> -out:
> +
> if (time_after(next_balance, sd->last_balance + interval)) {
> next_balance = sd->last_balance + interval;
> update_next_balance = 1;
One thing is missing: we need to reset the flag to 0 for the redo logic to work.
@@ -11882,6 +11905,8 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq,
if (!cpumask_subset(cpus, env.dst_grpmask)) {
env.loop = 0;
env.loop_break = SCHED_NR_MIGRATE_BREAK;
+ if (need_unlock)
+ atomic_set_release(&sched_balance_running, 0);
goto redo;
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RESEND PATCH] sched/fair: Skip sched_balance_running cmpxchg when balance is not due
2025-10-14 9:24 ` Peter Zijlstra
` (2 preceding siblings ...)
2025-10-14 14:28 ` Shrikanth Hegde
@ 2025-10-14 18:05 ` Tim Chen
3 siblings, 0 replies; 19+ messages in thread
From: Tim Chen @ 2025-10-14 18:05 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Ingo Molnar, Chen Yu, Doug Nelson, Mohini Narkhede, linux-kernel,
Vincent Guittot, Shrikanth Hegde, K Prateek Nayak
On Tue, 2025-10-14 at 11:24 +0200, Peter Zijlstra wrote:
> On Mon, Oct 13, 2025 at 02:54:19PM -0700, Tim Chen wrote:
>
> > > So I'm not sure I understand the situation, @continue_balancing should
> > > limit this concurrency to however many groups are on this domain -- your
> > > granite thing with SNC on would have something like 6 groups?
> >
> > That's a good point. But I think the contention is worse than
> > 6 CPUs.
> >
> > The hierarchy would be
> >
> > SMT
> > NUMA-level1
> > NUMA-level2
> > NUMA-level3
> > NUMA-level4
>
> Aren't you missing the LLC/NODE domain here? We should have at least one
> !SD_NUMA domain above SMT.
Yeah, I should have said the MC level which contains SMT groups
SMT
MC
NUMA-level1
...
Actual dmesg log:
[ 7.977893] CPU0 attaching sched-domain(s):
[ 7.977897] domain-0: span=0,192 level=SMT
[ 7.977902] groups: 0:{ span=0 cap=972 }, 192:{ span=192 cap=1022 }
[ 7.977907] domain-1: span=0-31,192-223 level=MC
[ 7.977909] groups: 0:{ span=0,192 cap=1994 }, 1:{ span=1,193
cap=2048 }, 2:{ span=2,194 cap=2047 }, 3:{ span=3,195 cap=2047 }, 4:{ span=4,196 cap=2048 }, 5:{ span=5,197 cap=2046 }, 6:{ span=6,198 cap=2047 }, 7:{ span=7,199 cap=2048 }, 8:{ span=8,200 cap=2047 },
9:{ span=9,201 cap=2046 }, 10:{ span=10,202 cap=2046 }, 11:{ span=11,203 cap=2046 }, 12:{ span=12,204 cap=2044 }, 13:{ span=13,205 cap=2048 }, 14:{ span=14,206 cap=2046 }, 15:{ span=15,207 cap=2043 },
16:{ span=16,208 cap=2046 }, 17:{ span=17,209 cap=2048 }, 18:{ span=18,210 cap=2046 }, 19:{ span=19,211 cap=2045 }, 20:{ span=20,212 cap=2046 }, 21:{ span=21,213 cap=2044 }, 22:{ span=22,214 cap=2045
}, 23:{ span=23,215 cap=2046 }, 24:{ span=24,216 cap=2045 }, 25:{ span=25,217 cap=2044 }, 26:{ span=26,218 cap=2046 }, 27:{ span=27,219 cap=2045 }, 28:{ span=28,220 cap=2045 }, 29:{ span=29,221
cap=2046 }, 30:{ span=30,222 cap=2045 }, 31:{ span=31,223 cap=2045 }
[ 7.977956] domain-2: span=0-63,192-255 level=NUMA
[ 7.977958] groups: 0:{ span=0-31,192-223 cap=65418 }, 32:{ span=32-63,224-255 cap=65453 }
[ 7.977962] domain-3: span=0-95,192-287 level=NUMA
[ 7.977963] groups: 0:{ span=0-63,192-255 mask=0-31,192-223 cap=130871 }, 64:{ span=32-95,224-287 mask=64-95,256-287 cap=130929 }
[ 7.977968] domain-4: span=0-127,192-319 level=NUMA
[ 7.977970] groups: 0:{ span=0-95,192-287 cap=196314 }, 96:{ span=96-127,288-319 cap=65439 }
[ 7.977974] domain-5: span=0-127,160-319,352-383 level=NUMA
[ 7.977975] groups: 0:{ span=0-127,192-319 mask=0-31,192-223 cap=261753 }, 160:{ span=160-191,352-383 cap=65430 }
[ 7.977980] domain-6: span=0-383 level=NUMA
>
> > There would be multiple CPUs in that are first in the SMT group
> > with continue_balancing=1 going up in the hierachy and
> > attempting the cmpxchg in the first NUMA domain level,
> > before calling should_we_balance() and finding that they are
> > not the first in the NUMA domain and set continue_balancing=0
> > and abort. Those CPUS are in same L3.
> > But at the same time, there could be CPUs in other sockets
> > cmpxchg on sched_balance_running.
>
> Right, Yu Chen said something like that as well, should_we_balance() is
> too late.
>
> Should we instead move the whole serialize thing inside
> sched_balance_rq() like so:
I think that makes sense. Probably a separate patch.
Tim
>
> ---
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index bc0b7ce8a65d..e9f719ba17e1 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -11722,6 +11722,22 @@ static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd
> }
> }
>
> +
> +/*
> + * This flag serializes load-balancing passes over large domains
> + * (above the NODE topology level) - only one load-balancing instance
> + * may run at a time, to reduce overhead on very large systems with
> + * lots of CPUs and large NUMA distances.
> + *
> + * - Note that load-balancing passes triggered while another one
> + * is executing are skipped and not re-tried.
> + *
> + * - Also note that this does not serialize rebalance_domains()
> + * execution, as non-SD_SERIALIZE domains will still be
> + * load-balanced in parallel.
> + */
> +static atomic_t sched_balance_running = ATOMIC_INIT(0);
> +
> /*
> * Check this_cpu to ensure it is balanced within domain. Attempt to move
> * tasks if there is an imbalance.
> @@ -11747,6 +11763,7 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq,
> .fbq_type = all,
> .tasks = LIST_HEAD_INIT(env.tasks),
> };
> + int need_unlock = false;
>
> cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
>
> @@ -11758,6 +11775,12 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq,
> goto out_balanced;
> }
>
> + if (idle != CPU_NEWLY_IDLE && (sd->flags & SD_SERIALIZE)) {
> + if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1))
> + goto out_balanced;
> + need_unlock = true;
> + }
> +
> group = sched_balance_find_src_group(&env);
> if (!group) {
> schedstat_inc(sd->lb_nobusyg[idle]);
> @@ -11998,6 +12021,9 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq,
> sd->balance_interval < sd->max_interval)
> sd->balance_interval *= 2;
> out:
> + if (need_unlock)
> + atomic_set_release(&sched_balance_running, 0);
> +
> return ld_moved;
> }
>
> @@ -12122,21 +12148,6 @@ static int active_load_balance_cpu_stop(void *data)
> return 0;
> }
>
> -/*
> - * This flag serializes load-balancing passes over large domains
> - * (above the NODE topology level) - only one load-balancing instance
> - * may run at a time, to reduce overhead on very large systems with
> - * lots of CPUs and large NUMA distances.
> - *
> - * - Note that load-balancing passes triggered while another one
> - * is executing are skipped and not re-tried.
> - *
> - * - Also note that this does not serialize rebalance_domains()
> - * execution, as non-SD_SERIALIZE domains will still be
> - * load-balanced in parallel.
> - */
> -static atomic_t sched_balance_running = ATOMIC_INIT(0);
> -
> /*
> * Scale the max sched_balance_rq interval with the number of CPUs in the system.
> * This trades load-balance latency on larger machines for less cross talk.
> @@ -12192,7 +12203,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
> /* Earliest time when we have to do rebalance again */
> unsigned long next_balance = jiffies + 60*HZ;
> int update_next_balance = 0;
> - int need_serialize, need_decay = 0;
> + int need_decay = 0;
> u64 max_cost = 0;
>
> rcu_read_lock();
> @@ -12216,13 +12227,6 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
> }
>
> interval = get_sd_balance_interval(sd, busy);
> -
> - need_serialize = sd->flags & SD_SERIALIZE;
> - if (need_serialize) {
> - if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1))
> - goto out;
> - }
> -
> if (time_after_eq(jiffies, sd->last_balance + interval)) {
> if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) {
> /*
> @@ -12236,9 +12240,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
> sd->last_balance = jiffies;
> interval = get_sd_balance_interval(sd, busy);
> }
> - if (need_serialize)
> - atomic_set_release(&sched_balance_running, 0);
> -out:
> +
> if (time_after(next_balance, sd->last_balance + interval)) {
> next_balance = sd->last_balance + interval;
> update_next_balance = 1;
^ permalink raw reply [flat|nested] 19+ messages in thread