* [patch 1/6] sched, nohz: introduce nohz_flags in the struct rq
2011-11-18 23:03 [patch 0/6] sched, nohz: load balancing patches Suresh Siddha
@ 2011-11-18 23:03 ` Suresh Siddha
2011-11-24 10:24 ` Peter Zijlstra
2011-11-18 23:03 ` [patch 2/6] sched, nohz: track nr_busy_cpus in the sched_group_power Suresh Siddha
` (4 subsequent siblings)
5 siblings, 1 reply; 22+ messages in thread
From: Suresh Siddha @ 2011-11-18 23:03 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Venki Pallipadi, Srivatsa Vaddagiri,
Mike Galbraith
Cc: linux-kernel, Tim Chen, alex.shi, Suresh Siddha
[-- Attachment #1: introduce_rq_nohz_flags.patch --]
[-- Type: text/plain, Size: 4787 bytes --]
Introduce nohz_flags in the struct rq, which will track two flags for now.
NOHZ_TICK_STOPPED will convey the stop tick status that gets set when
the tick is stopped and cleared during the first busy tick after the tick
is restarted.
NOHZ_BALANCE_KICK will track the need for nohz idle load balance
on this rq.
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
---
kernel/sched/core.c | 5 +++--
kernel/sched/fair.c | 40 +++++++++++++++++++++-------------------
kernel/sched/sched.h | 10 +++++++++-
3 files changed, 33 insertions(+), 22 deletions(-)
Index: tip/kernel/sched/core.c
===================================================================
--- tip.orig/kernel/sched/core.c
+++ tip/kernel/sched/core.c
@@ -575,7 +575,8 @@ void wake_up_idle_cpu(int cpu)
static inline bool got_nohz_idle_kick(void)
{
- return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick;
+ int cpu = smp_processor_id();
+ return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
}
#else /* CONFIG_NO_HZ */
@@ -6833,7 +6834,7 @@ void __init sched_init(void)
rq->avg_idle = 2*sysctl_sched_migration_cost;
rq_attach_root(rq, &def_root_domain);
#ifdef CONFIG_NO_HZ
- rq->nohz_balance_kick = 0;
+ rq->nohz_flags = 0;
#endif
#endif
init_rq_hrtick(rq);
Index: tip/kernel/sched/fair.c
===================================================================
--- tip.orig/kernel/sched/fair.c
+++ tip/kernel/sched/fair.c
@@ -4866,18 +4866,15 @@ static void nohz_balancer_kick(int cpu)
return;
}
- if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
- cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
-
- smp_mb();
- /*
- * Use smp_send_reschedule() instead of resched_cpu().
- * This way we generate a sched IPI on the target cpu which
- * is idle. And the softirq performing nohz idle load balance
- * will be run before returning from the IPI.
- */
- smp_send_reschedule(ilb_cpu);
- }
+ if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
+ return;
+ /*
+ * Use smp_send_reschedule() instead of resched_cpu().
+ * This way we generate a sched IPI on the target cpu which
+ * is idle. And the softirq performing nohz idle load balance
+ * will be run before returning from the IPI.
+ */
+ smp_send_reschedule(ilb_cpu);
return;
}
@@ -4941,6 +4938,8 @@ void select_nohz_load_balancer(int stop_
}
return;
}
+
+ set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
} else {
if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
return;
@@ -5056,8 +5055,9 @@ static void nohz_idle_balance(int this_c
struct rq *rq;
int balance_cpu;
- if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
- return;
+ if (idle != CPU_IDLE ||
+ !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
+ goto end;
for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
if (balance_cpu == this_cpu)
@@ -5068,10 +5068,8 @@ static void nohz_idle_balance(int this_c
* work being done for other cpus. Next load
* balancing owner will pick it up.
*/
- if (need_resched()) {
- this_rq->nohz_balance_kick = 0;
+ if (need_resched())
break;
- }
raw_spin_lock_irq(&this_rq->lock);
update_rq_clock(this_rq);
@@ -5085,7 +5083,8 @@ static void nohz_idle_balance(int this_c
this_rq->next_balance = rq->next_balance;
}
nohz.next_balance = this_rq->next_balance;
- this_rq->nohz_balance_kick = 0;
+end:
+ clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
}
/*
@@ -5106,6 +5105,9 @@ static inline int nohz_kick_needed(struc
int ret;
int first_pick_cpu, second_pick_cpu;
+ if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))))
+ clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
+
if (time_before(now, nohz.next_balance))
return 0;
@@ -5173,7 +5175,7 @@ void trigger_load_balance(struct rq *rq,
likely(!on_null_domain(cpu)))
raise_softirq(SCHED_SOFTIRQ);
#ifdef CONFIG_NO_HZ
- else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
+ if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
nohz_balancer_kick(cpu);
#endif
}
Index: tip/kernel/sched/sched.h
===================================================================
--- tip.orig/kernel/sched/sched.h
+++ tip/kernel/sched/sched.h
@@ -371,7 +371,7 @@ struct rq {
unsigned long last_load_update_tick;
#ifdef CONFIG_NO_HZ
u64 nohz_stamp;
- unsigned char nohz_balance_kick;
+ unsigned long nohz_flags;
#endif
int skip_clock_update;
@@ -1062,3 +1062,11 @@ extern void init_rt_rq(struct rt_rq *rt_
extern void unthrottle_offline_cfs_rqs(struct rq *rq);
extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
+
+#ifdef CONFIG_NO_HZ
+enum rq_nohz_flag_bits {
+ NOHZ_TICK_STOPPED,
+ NOHZ_BALANCE_KICK,
+};
+#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
+#endif /* CONFIG_NO_HZ */
^ permalink raw reply [flat|nested] 22+ messages in thread* Re: [patch 1/6] sched, nohz: introduce nohz_flags in the struct rq
2011-11-18 23:03 ` [patch 1/6] sched, nohz: introduce nohz_flags in the struct rq Suresh Siddha
@ 2011-11-24 10:24 ` Peter Zijlstra
2011-11-28 23:59 ` Suresh Siddha
0 siblings, 1 reply; 22+ messages in thread
From: Peter Zijlstra @ 2011-11-24 10:24 UTC (permalink / raw)
To: Suresh Siddha
Cc: Ingo Molnar, Venki Pallipadi, Srivatsa Vaddagiri, Mike Galbraith,
linux-kernel, Tim Chen, alex.shi
On Fri, 2011-11-18 at 15:03 -0800, Suresh Siddha wrote:
> plain text document attachment (introduce_rq_nohz_flags.patch)
> Introduce nohz_flags in the struct rq, which will track two flags for now.
>
> NOHZ_TICK_STOPPED will convey the stop tick status that gets set when
> the tick is stopped and cleared during the first busy tick after the tick
> is restarted.
>
> NOHZ_BALANCE_KICK will track the need for nohz idle load balance
> on this rq.
The changelog is missing the crucial part: Why are we doing this? :-)
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [patch 1/6] sched, nohz: introduce nohz_flags in the struct rq
2011-11-24 10:24 ` Peter Zijlstra
@ 2011-11-28 23:59 ` Suresh Siddha
2011-11-29 9:47 ` Peter Zijlstra
0 siblings, 1 reply; 22+ messages in thread
From: Suresh Siddha @ 2011-11-28 23:59 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Ingo Molnar, Venki Pallipadi, Srivatsa Vaddagiri, Mike Galbraith,
linux-kernel, Tim Chen, Shi, Alex
On Thu, 2011-11-24 at 02:24 -0800, Peter Zijlstra wrote:
> On Fri, 2011-11-18 at 15:03 -0800, Suresh Siddha wrote:
> > plain text document attachment (introduce_rq_nohz_flags.patch)
> > Introduce nohz_flags in the struct rq, which will track two flags for now.
> >
> > NOHZ_TICK_STOPPED will convey the stop tick status that gets set when
> > the tick is stopped and cleared during the first busy tick after the tick
> > is restarted.
> >
> > NOHZ_BALANCE_KICK will track the need for nohz idle load balance
> > on this rq.
>
> The changelog is missing the crucial part: Why are we doing this? :-)
We already had nohz_balance_kick in the rq, which was not being updated
atomically. So it is quite possible that two cpu's can simultaneously
kick the idle load balancing cpu. NOHZ_BALANCE_KICK bit of the
nohz_flags will track the need for nohz idle load balance on this rq and
will be updated atomically.
When the rq is ready to run a task when coming out of tickless idle,
tick is restarted and data structures used for nohz idle load balancing
are updated immediately. This will result in too many nohz idle load
balancer status updates, if the rq comes out of tickless for running one
(or very few) short running tasks before going back to tickless idle.
This is resulting in scalability issues especially if there are a lot of
logical cpu's that enter and exit idle often. There is no need to
update the nohz idle load balancer data structures for the semi-idle cpu
so often (as the information for the need of the idle load balancing and
who needs to be kicked is needed only at the busy cpu's tick).
NOHZ_TICK_STOPPED in rq's nohz_flags is introduced to keep track of the
tick stopped status that gets set when the tick is stopped. It will be
used to update the nohz idle load balancer data structures during the
first busy tick after the tick is restarted. At this first busy tick
after tickless idle, NOHZ_TICK_STOPPED flag will be reset.
I will include this info in the next update.
thanks,
suresh
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [patch 1/6] sched, nohz: introduce nohz_flags in the struct rq
2011-11-28 23:59 ` Suresh Siddha
@ 2011-11-29 9:47 ` Peter Zijlstra
0 siblings, 0 replies; 22+ messages in thread
From: Peter Zijlstra @ 2011-11-29 9:47 UTC (permalink / raw)
To: Suresh Siddha
Cc: Ingo Molnar, Venki Pallipadi, Srivatsa Vaddagiri, Mike Galbraith,
linux-kernel, Tim Chen, Shi, Alex
On Mon, 2011-11-28 at 15:59 -0800, Suresh Siddha wrote:
> I will include this info in the next update.
That was slightly more than I'd expected, but great, thanks!
Always try to answer the: What and Why of patches, the how etc. is
mostly clear from the patch itself. I know its hard, I often write
crappy changelogs myself (and often kick myself for having written
crappy changelogs), so I'm trying to improve the situation by pestering
other people ;-)
^ permalink raw reply [flat|nested] 22+ messages in thread
* [patch 2/6] sched, nohz: track nr_busy_cpus in the sched_group_power
2011-11-18 23:03 [patch 0/6] sched, nohz: load balancing patches Suresh Siddha
2011-11-18 23:03 ` [patch 1/6] sched, nohz: introduce nohz_flags in the struct rq Suresh Siddha
@ 2011-11-18 23:03 ` Suresh Siddha
2011-11-18 23:03 ` [patch 3/6] sched, nohz: sched group, domain aware nohz idle load balancing Suresh Siddha
` (3 subsequent siblings)
5 siblings, 0 replies; 22+ messages in thread
From: Suresh Siddha @ 2011-11-18 23:03 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Venki Pallipadi, Srivatsa Vaddagiri,
Mike Galbraith
Cc: linux-kernel, Tim Chen, alex.shi, Suresh Siddha
[-- Attachment #1: track_nr_busy_cpus_in_sched_group.patch --]
[-- Type: text/plain, Size: 2877 bytes --]
Introduce nr_busy_cpus in the struct sched_group_power [Not in sched_group
because sched groups are duplicated for the SD_OVERLAP scheduler domain]
and for each cpu that enters and exits tickless, this parameter will
be updated in each scheduler group of the scheduler domain that this cpu
belongs to.
To avoid the frequent update of this state as the cpu enters
and exits tickless idle, the update of the stat during tickless exit is
delayed to the first timer tick that happens after the cpu becomes busy.
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
---
include/linux/sched.h | 4 ++++
kernel/sched/core.c | 1 +
kernel/sched/fair.c | 18 +++++++++++++++++-
3 files changed, 22 insertions(+), 1 deletion(-)
Index: tip/include/linux/sched.h
===================================================================
--- tip.orig/include/linux/sched.h
+++ tip/include/linux/sched.h
@@ -901,6 +901,10 @@ struct sched_group_power {
* single CPU.
*/
unsigned int power, power_orig;
+ /*
+ * Number of busy cpus in this group.
+ */
+ atomic_t nr_busy_cpus;
};
struct sched_group {
Index: tip/kernel/sched/core.c
===================================================================
--- tip.orig/kernel/sched/core.c
+++ tip/kernel/sched/core.c
@@ -6017,6 +6017,7 @@ static void init_sched_groups_power(int
return;
update_group_power(sd, cpu);
+ atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
}
int __weak arch_sd_sibling_asym_packing(void)
Index: tip/kernel/sched/fair.c
===================================================================
--- tip.orig/kernel/sched/fair.c
+++ tip/kernel/sched/fair.c
@@ -4894,6 +4894,7 @@ static void nohz_balancer_kick(int cpu)
void select_nohz_load_balancer(int stop_tick)
{
int cpu = smp_processor_id();
+ struct sched_domain *sd;
if (stop_tick) {
if (!cpu_active(cpu)) {
@@ -4940,6 +4941,12 @@ void select_nohz_load_balancer(int stop_
}
set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
+ /*
+ * Indicate the idle state to all the scheduler groups that
+ * this cpu is part of.
+ */
+ for_each_domain(cpu, sd)
+ atomic_dec(&sd->groups->sgp->nr_busy_cpus);
} else {
if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
return;
@@ -5104,10 +5111,19 @@ static inline int nohz_kick_needed(struc
unsigned long now = jiffies;
int ret;
int first_pick_cpu, second_pick_cpu;
+ struct sched_domain *sd;
- if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))))
+ /*
+ * We were recently in tickless idle mode. At the first busy tick
+ * after returning from idle, we will update the busy stats.
+ */
+ if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
+ for_each_domain(cpu, sd)
+ atomic_inc(&sd->groups->sgp->nr_busy_cpus);
+ }
+
if (time_before(now, nohz.next_balance))
return 0;
^ permalink raw reply [flat|nested] 22+ messages in thread* [patch 3/6] sched, nohz: sched group, domain aware nohz idle load balancing
2011-11-18 23:03 [patch 0/6] sched, nohz: load balancing patches Suresh Siddha
2011-11-18 23:03 ` [patch 1/6] sched, nohz: introduce nohz_flags in the struct rq Suresh Siddha
2011-11-18 23:03 ` [patch 2/6] sched, nohz: track nr_busy_cpus in the sched_group_power Suresh Siddha
@ 2011-11-18 23:03 ` Suresh Siddha
2011-11-24 11:47 ` Peter Zijlstra
2011-11-24 11:53 ` Peter Zijlstra
2011-11-18 23:03 ` [patch 4/6] sched, nohz: cleanup the find_new_ilb() using sched groups nr_busy_cpus Suresh Siddha
` (2 subsequent siblings)
5 siblings, 2 replies; 22+ messages in thread
From: Suresh Siddha @ 2011-11-18 23:03 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Venki Pallipadi, Srivatsa Vaddagiri,
Mike Galbraith
Cc: linux-kernel, Tim Chen, alex.shi, Suresh Siddha
[-- Attachment #1: simplify_ilb_using_grp_nr_busy_cpus.patch --]
[-- Type: text/plain, Size: 9455 bytes --]
Make nohz idle load balancing more scalable by using the nr_busy_cpus in
the struct sched_group_power.
Idle load balance is kicked on one of the idle cpu's when there is at least
one idle cpu and
- a busy rq having more than one task or
- a busy scheduler group having multiple busy cpus that exceed the sched group
power or
- for the SD_ASYM_PACKING domain, if the lower numbered cpu's in that
domain are idle compared to the busy ones.
This will help in kicking the idle load balancing request only when
there is a real imbalance. And once it is mostly balanced, these kicks will
be minimized.
These changes helped improve the workload that is context switch intensive
between number of task pairs by 2x on a 8 socket NHM-EX based system.
Reported-by: Tim Chen <tim.c.chen@intel.com>
Tested-by: Alex Shi <alex.shi@intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
---
kernel/sched/fair.c | 161 ++++++++++++++--------------------------------------
1 file changed, 46 insertions(+), 115 deletions(-)
Index: tip/kernel/sched/fair.c
===================================================================
--- tip.orig/kernel/sched/fair.c
+++ tip/kernel/sched/fair.c
@@ -4704,28 +4704,17 @@ out_unlock:
#ifdef CONFIG_NO_HZ
/*
* idle load balancing details
- * - One of the idle CPUs nominates itself as idle load_balancer, while
- * entering idle.
- * - This idle load balancer CPU will also go into tickless mode when
- * it is idle, just like all other idle CPUs
* - When one of the busy CPUs notice that there may be an idle rebalancing
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
*/
static struct {
- atomic_t load_balancer;
- atomic_t first_pick_cpu;
- atomic_t second_pick_cpu;
cpumask_var_t idle_cpus_mask;
cpumask_var_t grp_idle_mask;
+ atomic_t nr_cpus;
unsigned long next_balance; /* in jiffy units */
} nohz ____cacheline_aligned;
-int get_nohz_load_balancer(void)
-{
- return atomic_read(&nohz.load_balancer);
-}
-
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
/**
* lowest_flag_domain - Return lowest sched_domain containing flag.
@@ -4802,9 +4791,9 @@ static inline int is_semi_idle_group(str
*/
static int find_new_ilb(int cpu)
{
+ int ilb = cpumask_first(nohz.idle_cpus_mask);
struct sched_domain *sd;
struct sched_group *ilb_group;
- int ilb = nr_cpu_ids;
/*
* Have idle load balancer selection from semi-idle packages only
@@ -4858,13 +4847,10 @@ static void nohz_balancer_kick(int cpu)
nohz.next_balance++;
- ilb_cpu = get_nohz_load_balancer();
+ ilb_cpu = find_new_ilb(cpu);
- if (ilb_cpu >= nr_cpu_ids) {
- ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
- if (ilb_cpu >= nr_cpu_ids)
- return;
- }
+ if (ilb_cpu >= nr_cpu_ids)
+ return;
if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
return;
@@ -4879,17 +4865,8 @@ static void nohz_balancer_kick(int cpu)
}
/*
- * This routine will try to nominate the ilb (idle load balancing)
- * owner among the cpus whose ticks are stopped. ilb owner will do the idle
- * load balancing on behalf of all those cpus.
- *
- * When the ilb owner becomes busy, we will not have new ilb owner until some
- * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
- * idle load balancing by kicking one of the idle CPUs.
- *
- * Ticks are stopped for the ilb owner as well, with busy CPU kicking this
- * ilb owner CPU in future (when there is a need for idle load balancing on
- * behalf of all idle CPUs).
+ * This routine will record that this cpu is going idle with tick stopped.
+ * This info will be used in performing idle load balancing in the future.
*/
void select_nohz_load_balancer(int stop_tick)
{
@@ -4897,49 +4874,11 @@ void select_nohz_load_balancer(int stop_
struct sched_domain *sd;
if (stop_tick) {
- if (!cpu_active(cpu)) {
- if (atomic_read(&nohz.load_balancer) != cpu)
- return;
-
- /*
- * If we are going offline and still the leader,
- * give up!
- */
- if (atomic_cmpxchg(&nohz.load_balancer, cpu,
- nr_cpu_ids) != cpu)
- BUG();
-
+ if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
return;
- }
cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
-
- if (atomic_read(&nohz.first_pick_cpu) == cpu)
- atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
- if (atomic_read(&nohz.second_pick_cpu) == cpu)
- atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
-
- if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
- int new_ilb;
-
- /* make me the ilb owner */
- if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
- cpu) != nr_cpu_ids)
- return;
-
- /*
- * Check to see if there is a more power-efficient
- * ilb.
- */
- new_ilb = find_new_ilb(cpu);
- if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
- atomic_set(&nohz.load_balancer, nr_cpu_ids);
- resched_cpu(new_ilb);
- return;
- }
- return;
- }
-
+ atomic_inc(&nohz.nr_cpus);
set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
/*
* Indicate the idle state to all the scheduler groups that
@@ -4947,16 +4886,6 @@ void select_nohz_load_balancer(int stop_
*/
for_each_domain(cpu, sd)
atomic_dec(&sd->groups->sgp->nr_busy_cpus);
- } else {
- if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
- return;
-
- cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
-
- if (atomic_read(&nohz.load_balancer) == cpu)
- if (atomic_cmpxchg(&nohz.load_balancer, cpu,
- nr_cpu_ids) != cpu)
- BUG();
}
return;
}
@@ -5067,7 +4996,7 @@ static void nohz_idle_balance(int this_c
goto end;
for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
- if (balance_cpu == this_cpu)
+ if (balance_cpu == this_cpu || !idle_cpu(this_cpu))
continue;
/*
@@ -5095,24 +5024,22 @@ end:
}
/*
- * Current heuristic for kicking the idle load balancer
- * - first_pick_cpu is the one of the busy CPUs. It will kick
- * idle load balancer when it has more than one process active. This
- * eliminates the need for idle load balancing altogether when we have
- * only one running process in the system (common case).
- * - If there are more than one busy CPU, idle load balancer may have
- * to run for active_load_balance to happen (i.e., two busy CPUs are
- * SMT or core siblings and can run better if they move to different
- * physical CPUs). So, second_pick_cpu is the second of the busy CPUs
- * which will kick idle load balancer as soon as it has any load.
+ * Current heuristic for kicking the idle load balancer in the presence
+ * of an idle cpu is the system.
+ * - This rq has more than one task.
+ * - At any scheduler domain level, this cpu's scheduler group has multiple
+ * busy cpu's exceeding the group's power.
+ * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
+ * domain span are idle.
*/
static inline int nohz_kick_needed(struct rq *rq, int cpu)
{
unsigned long now = jiffies;
- int ret;
- int first_pick_cpu, second_pick_cpu;
struct sched_domain *sd;
+ if (unlikely(idle_cpu(cpu)))
+ return 0;
+
/*
* We were recently in tickless idle mode. At the first busy tick
* after returning from idle, we will update the busy stats.
@@ -5120,36 +5047,43 @@ static inline int nohz_kick_needed(struc
if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
+ cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
+ atomic_dec(&nohz.nr_cpus);
+
for_each_domain(cpu, sd)
atomic_inc(&sd->groups->sgp->nr_busy_cpus);
}
- if (time_before(now, nohz.next_balance))
+ /*
+ * None are in tickless mode and hence no need for NOHZ idle load
+ * balancing.
+ */
+ if (likely(!atomic_read(&nohz.nr_cpus)))
return 0;
- if (idle_cpu(cpu))
+ if (time_before(now, nohz.next_balance))
return 0;
- first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
- second_pick_cpu = atomic_read(&nohz.second_pick_cpu);
-
- if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
- second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
- return 0;
+ if (rq->nr_running >= 2)
+ goto need_kick;
- ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
- if (ret == nr_cpu_ids || ret == cpu) {
- atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
- if (rq->nr_running > 1)
- return 1;
- } else {
- ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
- if (ret == nr_cpu_ids || ret == cpu) {
- if (rq->nr_running)
- return 1;
- }
+ for_each_domain(cpu, sd) {
+ struct sched_group *sg = sd->groups;
+ struct sched_group_power *sgp = sg->sgp;
+ int nr_busy = atomic_read(&sgp->nr_busy_cpus);
+
+ if (nr_busy > 1 && (nr_busy * SCHED_LOAD_SCALE > sgp->power))
+ goto need_kick;
+
+ if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
+ && (cpumask_first_and(nohz.idle_cpus_mask,
+ sched_domain_span(sd)) < cpu))
+ goto need_kick;
}
+
return 0;
+need_kick:
+ return 1;
}
#else
static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
@@ -5610,9 +5544,6 @@ __init void init_sched_fair_class(void)
#ifdef CONFIG_NO_HZ
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
- atomic_set(&nohz.load_balancer, nr_cpu_ids);
- atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
- atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
#endif
#endif /* SMP */
^ permalink raw reply [flat|nested] 22+ messages in thread* Re: [patch 3/6] sched, nohz: sched group, domain aware nohz idle load balancing
2011-11-18 23:03 ` [patch 3/6] sched, nohz: sched group, domain aware nohz idle load balancing Suresh Siddha
@ 2011-11-24 11:47 ` Peter Zijlstra
2011-11-28 23:51 ` Suresh Siddha
2011-11-24 11:53 ` Peter Zijlstra
1 sibling, 1 reply; 22+ messages in thread
From: Peter Zijlstra @ 2011-11-24 11:47 UTC (permalink / raw)
To: Suresh Siddha
Cc: Ingo Molnar, Venki Pallipadi, Srivatsa Vaddagiri, Mike Galbraith,
linux-kernel, Tim Chen, alex.shi
On Fri, 2011-11-18 at 15:03 -0800, Suresh Siddha wrote:
> static inline int nohz_kick_needed(struct rq *rq, int cpu)
> {
> unsigned long now = jiffies;
> struct sched_domain *sd;
>
> + if (unlikely(idle_cpu(cpu)))
> + return 0;
> +
> /*
> * We were recently in tickless idle mode. At the first busy tick
> * after returning from idle, we will update the busy stats.
> @@ -5120,36 +5047,43 @@ static inline int nohz_kick_needed(struc
> if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
> clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
>
> + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
> + atomic_dec(&nohz.nr_cpus);
> +
> for_each_domain(cpu, sd)
> atomic_inc(&sd->groups->sgp->nr_busy_cpus);
> }
>
> + /*
> + * None are in tickless mode and hence no need for NOHZ idle load
> + * balancing.
> + */
> + if (likely(!atomic_read(&nohz.nr_cpus)))
> return 0;
>
> + if (time_before(now, nohz.next_balance))
> return 0;
>
> + if (rq->nr_running >= 2)
> + goto need_kick;
>
> + for_each_domain(cpu, sd) {
> + struct sched_group *sg = sd->groups;
> + struct sched_group_power *sgp = sg->sgp;
> + int nr_busy = atomic_read(&sgp->nr_busy_cpus);
> +
> + if (nr_busy > 1 && (nr_busy * SCHED_LOAD_SCALE > sgp->power))
> + goto need_kick;
This looks wrong, its basically always true for a box with HT.
sgp->power is a measure of how much compute power this group has, its
basic form is sg->weight * SCHED_POWER_SCALE and is reduced from there;
HT siblings get less since they're not as powerful as two actual cores
and we deduct time spend on RT-tasks and IRQs etc..
So how does comparing the load of non-nohz cpus to that make sense?
> +
> + if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
> + && (cpumask_first_and(nohz.idle_cpus_mask,
> + sched_domain_span(sd)) < cpu))
> + goto need_kick;
> }
> +
> return 0;
> +need_kick:
> + return 1;
> }
^ permalink raw reply [flat|nested] 22+ messages in thread* Re: [patch 3/6] sched, nohz: sched group, domain aware nohz idle load balancing
2011-11-24 11:47 ` Peter Zijlstra
@ 2011-11-28 23:51 ` Suresh Siddha
2011-11-29 9:44 ` Peter Zijlstra
0 siblings, 1 reply; 22+ messages in thread
From: Suresh Siddha @ 2011-11-28 23:51 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Ingo Molnar, Venki Pallipadi, Srivatsa Vaddagiri, Mike Galbraith,
linux-kernel, Tim Chen, Shi, Alex
On Thu, 2011-11-24 at 03:47 -0800, Peter Zijlstra wrote:
> On Fri, 2011-11-18 at 15:03 -0800, Suresh Siddha wrote:
> > + for_each_domain(cpu, sd) {
> > + struct sched_group *sg = sd->groups;
> > + struct sched_group_power *sgp = sg->sgp;
> > + int nr_busy = atomic_read(&sgp->nr_busy_cpus);
> > +
> > + if (nr_busy > 1 && (nr_busy * SCHED_LOAD_SCALE > sgp->power))
> > + goto need_kick;
>
> This looks wrong, its basically always true for a box with HT.
In the presence of two busy HT siblings, we need to do the idle load
balance to figure out if the load from the busy core can be migrated to
any other idle core/sibling in the platform. And at this point, we
already know there are idle cpu's in the platform.
But you are right. using group power like the above is not right. For
example in the case of two sockets with each socket having dual core
with no HT, if one socket is completely busy with another completely
idle, we would like to identify this. But the group power of that socket
will be 2 * SCHED_POWER_SCALE.
In the older kernels, for the domains which were sharing package
resources, we were setting the group power to SCHED_POWER_SCALE for the
default performance mode. And I had that old code in mind, while
doing the above check.
I will modify the above check to:
if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
goto need_kick;
This way, if there is a SMT/MC domain with more than one busy cpu in the
group, then we will request for the idle load balancing.
Current mainline code kicks the idle load balancer if there are two busy
cpus in the system. Above mentioned modification makes this decision
some what better. For example, two busy cpu's in two different sockets
or two busy cpu's in a dual-core single socket system will never kick
idle load balancer (as there is no need).
In future we can add more heuristics to kick the idle load balancer only
when it is really necessary (for example when there is a real imbalance
between the highest and lowest loaded groups etc). Only catch is to
identify those scenarios without adding much penalty to the busy cpu
which is identifying the imbalance and kicking the idle load balancer.
Above proposed approach is the simplest approach that is trying to do
better than the current logic we have in the kernel now.
Any more thoughts in making the kick decisions (for doing idle load
balancing) more robust are welcome.
thanks,
suresh
^ permalink raw reply [flat|nested] 22+ messages in thread* Re: [patch 3/6] sched, nohz: sched group, domain aware nohz idle load balancing
2011-11-28 23:51 ` Suresh Siddha
@ 2011-11-29 9:44 ` Peter Zijlstra
2011-12-01 1:03 ` Suresh Siddha
2011-12-01 1:17 ` Suresh Siddha
0 siblings, 2 replies; 22+ messages in thread
From: Peter Zijlstra @ 2011-11-29 9:44 UTC (permalink / raw)
To: Suresh Siddha
Cc: Ingo Molnar, Venki Pallipadi, Srivatsa Vaddagiri, Mike Galbraith,
linux-kernel, Tim Chen, Shi, Alex
On Mon, 2011-11-28 at 15:51 -0800, Suresh Siddha wrote:
> On Thu, 2011-11-24 at 03:47 -0800, Peter Zijlstra wrote:
> > On Fri, 2011-11-18 at 15:03 -0800, Suresh Siddha wrote:
> > > + for_each_domain(cpu, sd) {
> > > + struct sched_group *sg = sd->groups;
> > > + struct sched_group_power *sgp = sg->sgp;
> > > + int nr_busy = atomic_read(&sgp->nr_busy_cpus);
> > > +
> > > + if (nr_busy > 1 && (nr_busy * SCHED_LOAD_SCALE > sgp->power))
> > > + goto need_kick;
> >
> > This looks wrong, its basically always true for a box with HT.
>
> In the presence of two busy HT siblings, we need to do the idle load
> balance to figure out if the load from the busy core can be migrated to
> any other idle core/sibling in the platform. And at this point, we
> already know there are idle cpu's in the platform.
might have to, this nr_busy doesn't mean its actually busy, just that
its not nohz, it might very well be idle.
> But you are right. using group power like the above is not right. For
> example in the case of two sockets with each socket having dual core
> with no HT, if one socket is completely busy with another completely
> idle, we would like to identify this. But the group power of that socket
> will be 2 * SCHED_POWER_SCALE.
Right, minus whatever time was taken by interrupts and RT tasks.
> In the older kernels, for the domains which were sharing package
> resources, we were setting the group power to SCHED_POWER_SCALE for the
> default performance mode. And I had that old code in mind, while
> doing the above check.
>
> I will modify the above check to:
>
> if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
> goto need_kick;
>
> This way, if there is a SMT/MC domain with more than one busy cpu in the
> group, then we will request for the idle load balancing.
Potentially 1 more than 1 busy, right? And we do the balancing just in
case there are indeed busy cpus.
I think its useful to mention that somewhere near, that this nr_busy
measure we use is an upper bound on actual busy.
> Current mainline code kicks the idle load balancer if there are two busy
> cpus in the system. Above mentioned modification makes this decision
> some what better. For example, two busy cpu's in two different sockets
> or two busy cpu's in a dual-core single socket system will never kick
> idle load balancer (as there is no need).
Except in power balance mode (although that's probably busted anyway),
where we want to aggregate the two tasks over two sockets onto one
socket.
> In future we can add more heuristics to kick the idle load balancer only
> when it is really necessary (for example when there is a real imbalance
> between the highest and lowest loaded groups etc). Only catch is to
> identify those scenarios without adding much penalty to the busy cpu
> which is identifying the imbalance and kicking the idle load balancer.
> Above proposed approach is the simplest approach that is trying to do
> better than the current logic we have in the kernel now.
Fair enough, and yeah, each patch only needs to do better and eventually
we'll get somewhere ;-)
> Any more thoughts in making the kick decisions (for doing idle load
> balancing) more robust are welcome.
Ha, I'll certainly share them if I have any.. I'd still love to split
this up somehow, but I've yet to figure out how to do so without
observing the whole machine.
^ permalink raw reply [flat|nested] 22+ messages in thread* Re: [patch 3/6] sched, nohz: sched group, domain aware nohz idle load balancing
2011-11-29 9:44 ` Peter Zijlstra
@ 2011-12-01 1:03 ` Suresh Siddha
2011-12-01 1:17 ` Suresh Siddha
1 sibling, 0 replies; 22+ messages in thread
From: Suresh Siddha @ 2011-12-01 1:03 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Ingo Molnar, Venki Pallipadi, Srivatsa Vaddagiri, Mike Galbraith,
linux-kernel, Tim Chen, Shi, Alex
On Tue, 2011-11-29 at 01:44 -0800, Peter Zijlstra wrote:
> On Mon, 2011-11-28 at 15:51 -0800, Suresh Siddha wrote:
> > On Thu, 2011-11-24 at 03:47 -0800, Peter Zijlstra wrote:
> > > On Fri, 2011-11-18 at 15:03 -0800, Suresh Siddha wrote:
> > > > + for_each_domain(cpu, sd) {
> > > > + struct sched_group *sg = sd->groups;
> > > > + struct sched_group_power *sgp = sg->sgp;
> > > > + int nr_busy = atomic_read(&sgp->nr_busy_cpus);
> > > > +
> > > > + if (nr_busy > 1 && (nr_busy * SCHED_LOAD_SCALE > sgp->power))
> > > > + goto need_kick;
> > >
> > > This looks wrong, its basically always true for a box with HT.
> >
> > In the presence of two busy HT siblings, we need to do the idle load
> > balance to figure out if the load from the busy core can be migrated to
> > any other idle core/sibling in the platform. And at this point, we
> > already know there are idle cpu's in the platform.
>
> might have to, this nr_busy doesn't mean its actually busy, just that
> its not nohz, it might very well be idle.
correct. But we can change that.
We can track nr_busy_cpus separately and can be updated when ever the rq
goes into idle and during the first busy tick after idle. Whereas the
nohz.idle_cpus_mask can be updated only during tickless entry.
> > I will modify the above check to:
> >
> > if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
> > goto need_kick;
> >
> > This way, if there is a SMT/MC domain with more than one busy cpu in the
> > group, then we will request for the idle load balancing.
>
> Potentially 1 more than 1 busy, right? And we do the balancing just in
> case there are indeed busy cpus.
>
> I think its useful to mention that somewhere near, that this nr_busy
> measure we use is an upper bound on actual busy.
The above should cover this.
I will send the updated version shortly.
thanks,
suresh
^ permalink raw reply [flat|nested] 22+ messages in thread* Re: [patch 3/6] sched, nohz: sched group, domain aware nohz idle load balancing
2011-11-29 9:44 ` Peter Zijlstra
2011-12-01 1:03 ` Suresh Siddha
@ 2011-12-01 1:17 ` Suresh Siddha
2011-12-01 8:36 ` Peter Zijlstra
1 sibling, 1 reply; 22+ messages in thread
From: Suresh Siddha @ 2011-12-01 1:17 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Ingo Molnar, Venki Pallipadi, Srivatsa Vaddagiri, Mike Galbraith,
linux-kernel, Tim Chen, Shi, Alex, PaulTurner
On Tue, 2011-11-29 at 01:44 -0800, Peter Zijlstra wrote:
> On Mon, 2011-11-28 at 15:51 -0800, Suresh Siddha wrote:
> > Current mainline code kicks the idle load balancer if there are two busy
> > cpus in the system. Above mentioned modification makes this decision
> > some what better. For example, two busy cpu's in two different sockets
> > or two busy cpu's in a dual-core single socket system will never kick
> > idle load balancer (as there is no need).
>
> Except in power balance mode (although that's probably busted anyway),
It is already busted, no doubt about that.
> where we want to aggregate the two tasks over two sockets onto one
> socket.
So was there any discussion regarding restoring the power savings mode
at the recent scheduler bof? Even in the latest platforms, there is a
small power savings when we consolidate the two tasks in two sockets
onto one socket (though the power difference is shrinking). Turbo
headroom also plays some role in this. Two tasks in two sockets
potentially have more turbo headroom compared to the two tasks in one
socket and thus increased performance with increased power.
thanks,
suresh
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [patch 3/6] sched, nohz: sched group, domain aware nohz idle load balancing
2011-12-01 1:17 ` Suresh Siddha
@ 2011-12-01 8:36 ` Peter Zijlstra
0 siblings, 0 replies; 22+ messages in thread
From: Peter Zijlstra @ 2011-12-01 8:36 UTC (permalink / raw)
To: Suresh Siddha
Cc: Ingo Molnar, Venki Pallipadi, Srivatsa Vaddagiri, Mike Galbraith,
linux-kernel, Tim Chen, Shi, Alex, PaulTurner
On Wed, 2011-11-30 at 17:17 -0800, Suresh Siddha wrote:
> So was there any discussion regarding restoring the power savings mode
> at the recent scheduler bof? Even in the latest platforms, there is a
> small power savings when we consolidate the two tasks in two sockets
> onto one socket (though the power difference is shrinking). Turbo
> headroom also plays some role in this. Two tasks in two sockets
> potentially have more turbo headroom compared to the two tasks in one
> socket and thus increased performance with increased power.
Right, so if someone is very keen on it they could try now, but
otherwise it'll come once pjt's got his linsched stuff public and we're
going to do the big topology overhaul.
Anyway, for anybody going to work on it, I think the most important
first step is removing those crazy ass {smt,mc}_power knobs and their
multiple values :-)
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [patch 3/6] sched, nohz: sched group, domain aware nohz idle load balancing
2011-11-18 23:03 ` [patch 3/6] sched, nohz: sched group, domain aware nohz idle load balancing Suresh Siddha
2011-11-24 11:47 ` Peter Zijlstra
@ 2011-11-24 11:53 ` Peter Zijlstra
2011-11-28 23:58 ` Suresh Siddha
1 sibling, 1 reply; 22+ messages in thread
From: Peter Zijlstra @ 2011-11-24 11:53 UTC (permalink / raw)
To: Suresh Siddha
Cc: Ingo Molnar, Venki Pallipadi, Srivatsa Vaddagiri, Mike Galbraith,
linux-kernel, Tim Chen, alex.shi
On Fri, 2011-11-18 at 15:03 -0800, Suresh Siddha wrote:
> Make nohz idle load balancing more scalable by using the nr_busy_cpus in
> the struct sched_group_power.
>
> Idle load balance is kicked on one of the idle cpu's when there is at least
> one idle cpu and
>
> - a busy rq having more than one task or
>
> - a busy scheduler group having multiple busy cpus that exceed the sched group
> power or
>
> - for the SD_ASYM_PACKING domain, if the lower numbered cpu's in that
> domain are idle compared to the busy ones.
>
> This will help in kicking the idle load balancing request only when
> there is a real imbalance. And once it is mostly balanced, these kicks will
> be minimized.
>
> These changes helped improve the workload that is context switch intensive
> between number of task pairs by 2x on a 8 socket NHM-EX based system.
OK, but the nohz idle balance will still iterate the whole machine
instead of smaller parts, right?
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [patch 3/6] sched, nohz: sched group, domain aware nohz idle load balancing
2011-11-24 11:53 ` Peter Zijlstra
@ 2011-11-28 23:58 ` Suresh Siddha
2011-11-29 9:45 ` Peter Zijlstra
0 siblings, 1 reply; 22+ messages in thread
From: Suresh Siddha @ 2011-11-28 23:58 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Ingo Molnar, Venki Pallipadi, Srivatsa Vaddagiri, Mike Galbraith,
linux-kernel, Tim Chen, Shi, Alex
On Thu, 2011-11-24 at 03:53 -0800, Peter Zijlstra wrote:
> On Fri, 2011-11-18 at 15:03 -0800, Suresh Siddha wrote:
> > Make nohz idle load balancing more scalable by using the nr_busy_cpus in
> > the struct sched_group_power.
> >
> > Idle load balance is kicked on one of the idle cpu's when there is at least
> > one idle cpu and
> >
> > - a busy rq having more than one task or
> >
> > - a busy scheduler group having multiple busy cpus that exceed the sched group
> > power or
> >
> > - for the SD_ASYM_PACKING domain, if the lower numbered cpu's in that
> > domain are idle compared to the busy ones.
> >
> > This will help in kicking the idle load balancing request only when
> > there is a real imbalance. And once it is mostly balanced, these kicks will
> > be minimized.
> >
> > These changes helped improve the workload that is context switch intensive
> > between number of task pairs by 2x on a 8 socket NHM-EX based system.
>
> OK, but the nohz idle balance will still iterate the whole machine
> instead of smaller parts, right?
In the current series, yes. one idle cpu spending a bit more time doing
idle load balancing might be better compared to waking up multiple idle
cpu's from deep c-states.
But if needed, we can easily partition the nohz idle load balancer load
to multiple idle cpu's. But we need a balance between the right
partition size vs how many idle cpu's we need to bring out of tickless
mode to do this idle load balancing.
Current proposed series already has the infrastructure to identify
which scheduler domain has the imbalance. Perhaps we can use that to do
the nohz idle load balancing only for that domain.
For now, I am trying to do better than what mainline has.
thanks,
suresh
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [patch 3/6] sched, nohz: sched group, domain aware nohz idle load balancing
2011-11-28 23:58 ` Suresh Siddha
@ 2011-11-29 9:45 ` Peter Zijlstra
0 siblings, 0 replies; 22+ messages in thread
From: Peter Zijlstra @ 2011-11-29 9:45 UTC (permalink / raw)
To: Suresh Siddha
Cc: Ingo Molnar, Venki Pallipadi, Srivatsa Vaddagiri, Mike Galbraith,
linux-kernel, Tim Chen, Shi, Alex
On Mon, 2011-11-28 at 15:58 -0800, Suresh Siddha wrote:
> On Thu, 2011-11-24 at 03:53 -0800, Peter Zijlstra wrote:
> > On Fri, 2011-11-18 at 15:03 -0800, Suresh Siddha wrote:
> > > Make nohz idle load balancing more scalable by using the nr_busy_cpus in
> > > the struct sched_group_power.
> > >
> > > Idle load balance is kicked on one of the idle cpu's when there is at least
> > > one idle cpu and
> > >
> > > - a busy rq having more than one task or
> > >
> > > - a busy scheduler group having multiple busy cpus that exceed the sched group
> > > power or
> > >
> > > - for the SD_ASYM_PACKING domain, if the lower numbered cpu's in that
> > > domain are idle compared to the busy ones.
> > >
> > > This will help in kicking the idle load balancing request only when
> > > there is a real imbalance. And once it is mostly balanced, these kicks will
> > > be minimized.
> > >
> > > These changes helped improve the workload that is context switch intensive
> > > between number of task pairs by 2x on a 8 socket NHM-EX based system.
> >
> > OK, but the nohz idle balance will still iterate the whole machine
> > instead of smaller parts, right?
>
> In the current series, yes. one idle cpu spending a bit more time doing
> idle load balancing might be better compared to waking up multiple idle
> cpu's from deep c-states.
>
> But if needed, we can easily partition the nohz idle load balancer load
> to multiple idle cpu's. But we need a balance between the right
> partition size vs how many idle cpu's we need to bring out of tickless
> mode to do this idle load balancing.
>
> Current proposed series already has the infrastructure to identify
> which scheduler domain has the imbalance. Perhaps we can use that to do
> the nohz idle load balancing only for that domain.
Yeah, trouble with that is if there's inter-domain balance things, like
for example balance for power, where you want to idle a domain. If you
only ever look within the one domain, you'll never see the possibility
to move your last one task to this other domain.
Pesky stuff that.
^ permalink raw reply [flat|nested] 22+ messages in thread
* [patch 4/6] sched, nohz: cleanup the find_new_ilb() using sched groups nr_busy_cpus
2011-11-18 23:03 [patch 0/6] sched, nohz: load balancing patches Suresh Siddha
` (2 preceding siblings ...)
2011-11-18 23:03 ` [patch 3/6] sched, nohz: sched group, domain aware nohz idle load balancing Suresh Siddha
@ 2011-11-18 23:03 ` Suresh Siddha
2011-11-18 23:03 ` [patch 5/6] sched: disable sched feature TTWU_QUEUE by default Suresh Siddha
2011-11-18 23:03 ` [patch 6/6] sched: fix the sched group node allocation for SD_OVERLAP domain Suresh Siddha
5 siblings, 0 replies; 22+ messages in thread
From: Suresh Siddha @ 2011-11-18 23:03 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Venki Pallipadi, Srivatsa Vaddagiri,
Mike Galbraith
Cc: linux-kernel, Tim Chen, alex.shi, Suresh Siddha
[-- Attachment #1: cleanup_find_ilb.patch --]
[-- Type: text/plain, Size: 3258 bytes --]
nr_busy_cpus in the sched_group_power indicates whether the group
is semi idle or not. This helps remove the is_semi_idle_group() and simplify
the find_new_ilb() in the context of finding an optimal cpu that can do
idle load balancing.
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
---
kernel/sched/fair.c | 48 ++++++++++++------------------------------------
1 file changed, 12 insertions(+), 36 deletions(-)
Index: tip/kernel/sched/fair.c
===================================================================
--- tip.orig/kernel/sched/fair.c
+++ tip/kernel/sched/fair.c
@@ -4710,7 +4710,6 @@ out_unlock:
*/
static struct {
cpumask_var_t idle_cpus_mask;
- cpumask_var_t grp_idle_mask;
atomic_t nr_cpus;
unsigned long next_balance; /* in jiffy units */
} nohz ____cacheline_aligned;
@@ -4751,33 +4750,6 @@ static inline struct sched_domain *lowes
(sd && (sd->flags & flag)); sd = sd->parent)
/**
- * is_semi_idle_group - Checks if the given sched_group is semi-idle.
- * @ilb_group: group to be checked for semi-idleness
- *
- * Returns: 1 if the group is semi-idle. 0 otherwise.
- *
- * We define a sched_group to be semi idle if it has atleast one idle-CPU
- * and atleast one non-idle CPU. This helper function checks if the given
- * sched_group is semi-idle or not.
- */
-static inline int is_semi_idle_group(struct sched_group *ilb_group)
-{
- cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
- sched_group_cpus(ilb_group));
-
- /*
- * A sched_group is semi-idle when it has atleast one busy cpu
- * and atleast one idle cpu.
- */
- if (cpumask_empty(nohz.grp_idle_mask))
- return 0;
-
- if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
- return 0;
-
- return 1;
-}
-/**
* find_new_ilb - Finds the optimum idle load balancer for nomination.
* @cpu: The cpu which is nominating a new idle_load_balancer.
*
@@ -4792,8 +4764,8 @@ static inline int is_semi_idle_group(str
static int find_new_ilb(int cpu)
{
int ilb = cpumask_first(nohz.idle_cpus_mask);
+ struct sched_group *ilbg;
struct sched_domain *sd;
- struct sched_group *ilb_group;
/*
* Have idle load balancer selection from semi-idle packages only
@@ -4811,23 +4783,28 @@ static int find_new_ilb(int cpu)
rcu_read_lock();
for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
- ilb_group = sd->groups;
+ ilbg = sd->groups;
do {
- if (is_semi_idle_group(ilb_group)) {
- ilb = cpumask_first(nohz.grp_idle_mask);
+ if (ilbg->group_weight !=
+ atomic_read(&ilbg->sgp->nr_busy_cpus)) {
+ ilb = cpumask_first_and(nohz.idle_cpus_mask,
+ sched_group_cpus(ilbg));
goto unlock;
}
- ilb_group = ilb_group->next;
+ ilbg = ilbg->next;
- } while (ilb_group != sd->groups);
+ } while (ilbg != sd->groups);
}
unlock:
rcu_read_unlock();
out_done:
- return ilb;
+ if (ilb < nr_cpu_ids && idle_cpu(ilb))
+ return ilb;
+
+ return nr_cpu_ids;
}
#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
static inline int find_new_ilb(int call_cpu)
@@ -5543,7 +5520,6 @@ __init void init_sched_fair_class(void)
#ifdef CONFIG_NO_HZ
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
- alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
#endif
#endif /* SMP */
^ permalink raw reply [flat|nested] 22+ messages in thread* [patch 5/6] sched: disable sched feature TTWU_QUEUE by default
2011-11-18 23:03 [patch 0/6] sched, nohz: load balancing patches Suresh Siddha
` (3 preceding siblings ...)
2011-11-18 23:03 ` [patch 4/6] sched, nohz: cleanup the find_new_ilb() using sched groups nr_busy_cpus Suresh Siddha
@ 2011-11-18 23:03 ` Suresh Siddha
2011-11-19 4:30 ` Mike Galbraith
2011-11-18 23:03 ` [patch 6/6] sched: fix the sched group node allocation for SD_OVERLAP domain Suresh Siddha
5 siblings, 1 reply; 22+ messages in thread
From: Suresh Siddha @ 2011-11-18 23:03 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Venki Pallipadi, Srivatsa Vaddagiri,
Mike Galbraith
Cc: linux-kernel, Tim Chen, alex.shi, Suresh Siddha
[-- Attachment #1: disable_sched_ttwu_queue.patch --]
[-- Type: text/plain, Size: 1044 bytes --]
Context-switch intensive microbenchmark on a 8-socket system had
~600K times more resched IPI's on each logical CPU with this feature enabled
by default. Disabling this feature makes that microbenchmark perform 5 times
better.
Also disabling this feature showed 2% performance improvement on a 8-socket
OLTP workload.
More heuristics are needed on when and how to use this feature by default.
For now, disable it by default.
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
---
kernel/sched/features.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
Index: tip/kernel/sched/features.h
===================================================================
--- tip.orig/kernel/sched/features.h
+++ tip/kernel/sched/features.h
@@ -64,7 +64,7 @@ SCHED_FEAT(NONTASK_POWER, 1)
* Queue remote wakeups on the target CPU and process them
* using the scheduler IPI. Reduces rq->lock contention/bounces.
*/
-SCHED_FEAT(TTWU_QUEUE, 1)
+SCHED_FEAT(TTWU_QUEUE, 0)
SCHED_FEAT(FORCE_SD_OVERLAP, 0)
SCHED_FEAT(RT_RUNTIME_SHARE, 1)
^ permalink raw reply [flat|nested] 22+ messages in thread* Re: [patch 5/6] sched: disable sched feature TTWU_QUEUE by default
2011-11-18 23:03 ` [patch 5/6] sched: disable sched feature TTWU_QUEUE by default Suresh Siddha
@ 2011-11-19 4:30 ` Mike Galbraith
2011-11-19 4:41 ` Mike Galbraith
0 siblings, 1 reply; 22+ messages in thread
From: Mike Galbraith @ 2011-11-19 4:30 UTC (permalink / raw)
To: Suresh Siddha
Cc: Peter Zijlstra, Ingo Molnar, Venki Pallipadi, Srivatsa Vaddagiri,
linux-kernel, Tim Chen, alex.shi
On Fri, 2011-11-18 at 15:03 -0800, Suresh Siddha wrote:
> plain text document attachment (disable_sched_ttwu_queue.patch)
> Context-switch intensive microbenchmark on a 8-socket system had
> ~600K times more resched IPI's on each logical CPU with this feature enabled
> by default. Disabling this feature makes that microbenchmark perform 5 times
> better.
>
> Also disabling this feature showed 2% performance improvement on a 8-socket
> OLTP workload.
>
> More heuristics are needed on when and how to use this feature by default.
> For now, disable it by default.
Yeah, the overhead for very hefty switchers is high enough to increase
TCP_RR latency up to 13% in my testing. I used a trylock() to generally
not eat that, but leave the contended case improvement intact.
Peter suggested trying doing the IPI only when crossing cache
boundaries, which worked for me as well.
---
kernel/sched.c | 24 +++++++++++++++++++++++-
1 file changed, 23 insertions(+), 1 deletion(-)
Index: linux-3.0-tip/kernel/sched.c
===================================================================
--- linux-3.0-tip.orig/kernel/sched.c
+++ linux-3.0-tip/kernel/sched.c
@@ -2784,12 +2784,34 @@ static int ttwu_activate_remote(struct t
#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
#endif /* CONFIG_SMP */
+static int ttwu_share_cache(int this_cpu, int cpu)
+{
+#ifndef CONFIG_X86
+ struct sched_domain *sd;
+ int ret = 0;
+
+ rcu_read_lock();
+ for_each_domain(this_cpu, sd) {
+ if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
+ continue;
+
+ ret = (sd->flags & SD_SHARE_PKG_RESOURCES);
+ break;
+ }
+ rcu_read_unlock();
+
+ return ret;
+#else
+ return per_cpu(cpu_llc_id, this_cpu) == per_cpu(cpu_llc_id, cpu);
+#endif
+}
+
static void ttwu_queue(struct task_struct *p, int cpu)
{
struct rq *rq = cpu_rq(cpu);
#if defined(CONFIG_SMP)
- if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
+ if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) {
sched_clock_cpu(cpu); /* sync clocks x-cpu */
ttwu_queue_remote(p, cpu);
return;
^ permalink raw reply [flat|nested] 22+ messages in thread* Re: [patch 5/6] sched: disable sched feature TTWU_QUEUE by default
2011-11-19 4:30 ` Mike Galbraith
@ 2011-11-19 4:41 ` Mike Galbraith
0 siblings, 0 replies; 22+ messages in thread
From: Mike Galbraith @ 2011-11-19 4:41 UTC (permalink / raw)
To: Suresh Siddha
Cc: Peter Zijlstra, Ingo Molnar, Venki Pallipadi, Srivatsa Vaddagiri,
linux-kernel, Tim Chen, alex.shi
On Sat, 2011-11-19 at 05:30 +0100, Mike Galbraith wrote:
> On Fri, 2011-11-18 at 15:03 -0800, Suresh Siddha wrote:
> > plain text document attachment (disable_sched_ttwu_queue.patch)
> > Context-switch intensive microbenchmark on a 8-socket system had
> > ~600K times more resched IPI's on each logical CPU with this feature enabled
> > by default. Disabling this features makes that microbenchmark perform 5 times
> > better.
> >
> > Also disabling this feature showed 2% performance improvement on a 8-socket
> > OLTP workload.
> >
> > More heurestics are needed when and how to use this feature by default.
> > For now, disable it by default.
>
> Yeah, the overhead for very hefty switchers is high enough to increase
> TCP_RR latency up to 13% in my testing. I used a trylock() to generally
> not eat that, but leave the contended case improvement intact.
>
> Peter suggested trying doing the IPI only when crossing cache
> boundaries, which worked for me as well.
On a related TTWU_QUEUE note, I was pondering idle_balance().
---
kernel/sched_fair.c | 25 ++++++++++++++++---------
1 file changed, 16 insertions(+), 9 deletions(-)
Index: linux-3.0/kernel/sched_fair.c
===================================================================
--- linux-3.0.orig/kernel/sched_fair.c
+++ linux-3.0/kernel/sched_fair.c
@@ -3500,8 +3500,7 @@ out:
static void idle_balance(int this_cpu, struct rq *this_rq)
{
struct sched_domain *sd;
- int pulled_task = 0;
- unsigned long next_balance = jiffies + HZ;
+ unsigned long next_balance;
if (this_rq->avg_idle < sysctl_sched_migration_cost)
return;
@@ -3512,33 +3511,41 @@ static void idle_balance(int this_cpu, s
raw_spin_unlock(&this_rq->lock);
update_shares(this_cpu);
+ next_balance = jiffies + HZ;
rcu_read_lock();
for_each_domain(this_cpu, sd) {
unsigned long interval;
int balance = 1;
+ if (this_rq->nr_running || this_rq->wake_list)
+ break;
+
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
- if (sd->flags & SD_BALANCE_NEWIDLE) {
- /* If we've pulled tasks over stop searching: */
- pulled_task = load_balance(this_cpu, this_rq,
- sd, CPU_NEWLY_IDLE, &balance);
- }
+ if (!(sd->flags & SD_BALANCE_NEWIDLE))
+ continue;
+
+ load_balance(this_cpu, this_rq, sd, CPU_NEWLY_IDLE, &balance);
interval = msecs_to_jiffies(sd->balance_interval);
if (time_after(next_balance, sd->last_balance + interval))
next_balance = sd->last_balance + interval;
- if (pulled_task) {
+ if (this_rq->nr_running || this_rq->wake_list) {
this_rq->idle_stamp = 0;
break;
}
}
rcu_read_unlock();
+ /* IPI in flight? Let it happen */
+ if (unlikely(this_rq->wake_list)) {
+ local_irq_enable();
+ local_irq_disable();
+ }
raw_spin_lock(&this_rq->lock);
- if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
+ if (this_rq->nr_running || time_after(jiffies, this_rq->next_balance)) {
/*
* We are going idle. next_balance may be set based on
* a busy processor. So reset next_balance.
^ permalink raw reply [flat|nested] 22+ messages in thread
* [patch 6/6] sched: fix the sched group node allocation for SD_OVERLAP domain
2011-11-18 23:03 [patch 0/6] sched, nohz: load balancing patches Suresh Siddha
` (4 preceding siblings ...)
2011-11-18 23:03 ` [patch 5/6] sched: disable sched feature TTWU_QUEUE by default Suresh Siddha
@ 2011-11-18 23:03 ` Suresh Siddha
2011-12-06 9:51 ` [tip:sched/core] sched: Fix the sched group node allocation for SD_OVERLAP domains tip-bot for Suresh Siddha
5 siblings, 1 reply; 22+ messages in thread
From: Suresh Siddha @ 2011-11-18 23:03 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Venki Pallipadi, Srivatsa Vaddagiri,
Mike Galbraith
Cc: linux-kernel, Tim Chen, alex.shi, Suresh Siddha
[-- Attachment #1: fix_sched_group_node_allocation.patch --]
[-- Type: text/plain, Size: 786 bytes --]
For the SD_OVERLAP domain, sched_groups for each CPU's sched_domain are
privately allocated and not shared with any other cpu. So the
sched group allocation should come from the cpu's node for which
SD_OVERLAP sched domain is being setup.
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
---
kernel/sched/core.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
Index: tip/kernel/sched/core.c
===================================================================
--- tip.orig/kernel/sched/core.c
+++ tip/kernel/sched/core.c
@@ -5879,7 +5879,7 @@ build_overlap_sched_groups(struct sched_
continue;
sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
- GFP_KERNEL, cpu_to_node(i));
+ GFP_KERNEL, cpu_to_node(cpu));
if (!sg)
goto fail;
^ permalink raw reply [flat|nested] 22+ messages in thread* [tip:sched/core] sched: Fix the sched group node allocation for SD_OVERLAP domains
2011-11-18 23:03 ` [patch 6/6] sched: fix the sched group node allocation for SD_OVERLAP domain Suresh Siddha
@ 2011-12-06 9:51 ` tip-bot for Suresh Siddha
0 siblings, 0 replies; 22+ messages in thread
From: tip-bot for Suresh Siddha @ 2011-12-06 9:51 UTC (permalink / raw)
To: linux-tip-commits
Cc: linux-kernel, hpa, mingo, a.p.zijlstra, suresh.b.siddha, tglx,
mingo
Commit-ID: 4d78a2239e393f09e0964a2f8da394cc91d75155
Gitweb: http://git.kernel.org/tip/4d78a2239e393f09e0964a2f8da394cc91d75155
Author: Suresh Siddha <suresh.b.siddha@intel.com>
AuthorDate: Fri, 18 Nov 2011 15:03:29 -0800
Committer: Ingo Molnar <mingo@elte.hu>
CommitDate: Tue, 6 Dec 2011 09:06:26 +0100
sched: Fix the sched group node allocation for SD_OVERLAP domains
For the SD_OVERLAP domain, sched_groups for each CPU's sched_domain are
privately allocated and not shared with any other cpu. So the
sched group allocation should come from the cpu's node for which
SD_OVERLAP sched domain is being setup.
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20111118230554.164910950@sbsiddha-desk.sc.intel.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
kernel/sched/core.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index db313c3..07f1e99 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5885,7 +5885,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
continue;
sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
- GFP_KERNEL, cpu_to_node(i));
+ GFP_KERNEL, cpu_to_node(cpu));
if (!sg)
goto fail;
^ permalink raw reply related [flat|nested] 22+ messages in thread