* [RFC][PATCH 1/6] sched: restore __cpu_power to a straight sum of power
2009-08-27 15:00 [RFC][PATCH 0/6] load-balancing and cpu_power Peter Zijlstra
@ 2009-08-27 15:00 ` Peter Zijlstra
2009-08-27 15:00 ` [RFC][PATCH 2/6] sched: SD_PREFER_SIBLING Peter Zijlstra
` (5 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: Peter Zijlstra @ 2009-08-27 15:00 UTC
To: Ingo Molnar
Cc: linux-kernel, Gautham R Shenoy, Andreas Herrmann, Balbir Singh,
Peter Zijlstra
[-- Attachment #1: sched-lb-1.patch --]
[-- Type: text/plain, Size: 3387 bytes --]
cpu_power is supposed to be a representation of the processing capacity
of the cpu, not a value to randomly tweak in order to affect task
placement.
Remove the placement hacks and add SMT gain.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
---
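[ Illustration, not part of the patch: a minimal userspace sketch of the
per-sibling power computation the sched.c hunk below introduces. The
SCHED_LOAD_SHIFT/SCHED_LOAD_SCALE values and the smt_gain default of 1178
come from the patch; the divide-by-weight is applied only on the SMT path
here, which is how the hunk reads with whitespace lost; everything else is
made up for the example. ]

#include <stdio.h>

#define SCHED_LOAD_SHIFT	10
#define SCHED_LOAD_SCALE	(1UL << SCHED_LOAD_SHIFT)	/* 1024 */

/*
 * Per-sibling cpu_power for an SMT domain: the siblings share one core,
 * boosted by smt_gain to reflect that multiple threads usually get more
 * out of the core than a single thread would.
 */
static unsigned long smt_sibling_power(unsigned long smt_gain,
				       unsigned long weight)
{
	unsigned long power = SCHED_LOAD_SCALE;

	if (weight > 1) {
		power *= smt_gain;		/* 1178 ~= 1.15 * 1024 */
		power /= weight;
		power >>= SCHED_LOAD_SHIFT;
	}

	return power;
}

int main(void)
{
	/* two siblings: ~589 each, ~1178 for the pair, i.e. ~15% > one core */
	printf("per-sibling power: %lu\n", smt_sibling_power(1178, 2));
	return 0;
}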
include/linux/sched.h | 1 +
include/linux/topology.h | 1 +
kernel/sched.c | 34 ++++++++++++++++++----------------
3 files changed, 20 insertions(+), 16 deletions(-)
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -930,6 +930,7 @@ struct sched_domain {
unsigned int newidle_idx;
unsigned int wake_idx;
unsigned int forkexec_idx;
+ unsigned int smt_gain;
int flags; /* See SD_* */
enum sched_domain_level level;
Index: linux-2.6/include/linux/topology.h
===================================================================
--- linux-2.6.orig/include/linux/topology.h
+++ linux-2.6/include/linux/topology.h
@@ -99,6 +99,7 @@ int arch_update_cpu_topology(void);
| SD_SHARE_CPUPOWER, \
.last_balance = jiffies, \
.balance_interval = 1, \
+ .smt_gain = 1178, /* 15% */ \
}
#endif
#endif /* CONFIG_SCHED_SMT */
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -8468,15 +8468,13 @@ static void free_sched_groups(const stru
* there are asymmetries in the topology. If there are asymmetries, group
* having more cpu_power will pickup more load compared to the group having
* less cpu_power.
- *
- * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
- * the maximum number of tasks a group can handle in the presence of other idle
- * or lightly loaded groups in the same sched domain.
*/
static void init_sched_groups_power(int cpu, struct sched_domain *sd)
{
struct sched_domain *child;
struct sched_group *group;
+ long power;
+ int weight;
WARN_ON(!sd || !sd->groups);
@@ -8487,22 +8485,26 @@ static void init_sched_groups_power(int
sd->groups->__cpu_power = 0;
- /*
- * For perf policy, if the groups in child domain share resources
- * (for example cores sharing some portions of the cache hierarchy
- * or SMT), then set this domain groups cpu_power such that each group
- * can handle only one task, when there are other idle groups in the
- * same sched domain.
- */
- if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
- (child->flags &
- (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
- sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
+ if (!child) {
+ power = SCHED_LOAD_SCALE;
+ weight = cpumask_weight(sched_domain_span(sd));
+ /*
+ * SMT siblings share the power of a single core.
+ * Usually multiple threads get a better yield out of
+ * that one core than a single thread would have,
+ * reflect that in sd->smt_gain.
+ */
+ if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1)
+ power *= sd->smt_gain;
+ power /= weight;
+ power >>= SCHED_LOAD_SHIFT;
+ }
+ sg_inc_cpu_power(sd->groups, power);
return;
}
/*
- * add cpu_power of each child group to this groups cpu_power
+ * Add cpu_power of each child group to this groups cpu_power.
*/
group = child->groups;
do {
--
* [RFC][PATCH 2/6] sched: SD_PREFER_SIBLING
2009-08-27 15:00 [RFC][PATCH 0/6] load-balancing and cpu_power Peter Zijlstra
2009-08-27 15:00 ` [RFC][PATCH 1/6] sched: restore __cpu_power to a straight sum of power Peter Zijlstra
@ 2009-08-27 15:00 ` Peter Zijlstra
2009-08-27 15:00 ` [RFC][PATCH 3/6] sched: update the cpu_power sum during load-balance Peter Zijlstra
` (4 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: Peter Zijlstra @ 2009-08-27 15:00 UTC
To: Ingo Molnar
Cc: linux-kernel, Gautham R Shenoy, Andreas Herrmann, Balbir Singh,
Peter Zijlstra
[-- Attachment #1: sched-lb-2.patch --]
[-- Type: text/plain, Size: 3934 bytes --]
Express the placement preference that patch 1 removed from cpu_power
through a new SD flag: SD_PREFER_SIBLING.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
---
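[ Illustration, not part of the patch: a toy sketch of what the new flag
does in update_sd_lb_stats() -- when the child domain sets
SD_PREFER_SIBLING, the parent caps each group's capacity at one task so
the excess spills over to sibling groups. The struct and the numbers are
simplified stand-ins, not the kernel's types. ]

#include <stdio.h>

#define SD_PREFER_SIBLING	0x1000	/* flag value from this patch */

struct toy_sg_stats {
	unsigned long group_load;
	unsigned long group_capacity;	/* nr of tasks this group should carry */
};

/* if the child domain prefers siblings, pretend each group holds one task */
static void apply_prefer_sibling(int child_flags, struct toy_sg_stats *sgs)
{
	if (child_flags & SD_PREFER_SIBLING)
		sgs->group_capacity = 1;
}

int main(void)
{
	struct toy_sg_stats sgs = { .group_load = 2048, .group_capacity = 2 };

	apply_prefer_sibling(SD_PREFER_SIBLING, &sgs);

	/* capacity is now 1: the balancer will try to move the excess away */
	printf("group capacity: %lu\n", sgs.group_capacity);
	return 0;
}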
include/linux/sched.h | 29 +++++++++++++++--------------
kernel/sched.c | 14 +++++++++++++-
2 files changed, 28 insertions(+), 15 deletions(-)
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -808,18 +808,19 @@ enum cpu_idle_type {
#define SCHED_LOAD_SCALE_FUZZ SCHED_LOAD_SCALE
#ifdef CONFIG_SMP
-#define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */
-#define SD_BALANCE_NEWIDLE 2 /* Balance when about to become idle */
-#define SD_BALANCE_EXEC 4 /* Balance on exec */
-#define SD_BALANCE_FORK 8 /* Balance on fork, clone */
-#define SD_WAKE_IDLE 16 /* Wake to idle CPU on task wakeup */
-#define SD_WAKE_AFFINE 32 /* Wake task to waking CPU */
-#define SD_WAKE_BALANCE 64 /* Perform balancing at task wakeup */
-#define SD_SHARE_CPUPOWER 128 /* Domain members share cpu power */
-#define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */
-#define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */
-#define SD_SERIALIZE 1024 /* Only a single load balancing instance */
-#define SD_WAKE_IDLE_FAR 2048 /* Gain latency sacrificing cache hit */
+#define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. */
+#define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */
+#define SD_BALANCE_EXEC 0x0004 /* Balance on exec */
+#define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */
+#define SD_WAKE_IDLE 0x0010 /* Wake to idle CPU on task wakeup */
+#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
+#define SD_WAKE_BALANCE 0x0040 /* Perform balancing at task wakeup */
+#define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */
+#define SD_POWERSAVINGS_BALANCE 0x0100 /* Balance for power savings */
+#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */
+#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
+#define SD_WAKE_IDLE_FAR 0x0800 /* Gain latency sacrificing cache hit */
+#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
enum powersavings_balance_level {
POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */
@@ -839,7 +840,7 @@ static inline int sd_balance_for_mc_powe
if (sched_smt_power_savings)
return SD_POWERSAVINGS_BALANCE;
- return 0;
+ return SD_PREFER_SIBLING;
}
static inline int sd_balance_for_package_power(void)
@@ -847,7 +848,7 @@ static inline int sd_balance_for_package
if (sched_mc_power_savings | sched_smt_power_savings)
return SD_POWERSAVINGS_BALANCE;
- return 0;
+ return SD_PREFER_SIBLING;
}
/*
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -3803,9 +3803,13 @@ static inline void update_sd_lb_stats(st
const struct cpumask *cpus, int *balance,
struct sd_lb_stats *sds)
{
+ struct sched_domain *child = sd->child;
struct sched_group *group = sd->groups;
struct sg_lb_stats sgs;
- int load_idx;
+ int load_idx, prefer_sibling = 0;
+
+ if (child && child->flags & SD_PREFER_SIBLING)
+ prefer_sibling = 1;
init_sd_power_savings_stats(sd, sds, idle);
load_idx = get_sd_load_idx(sd, idle);
@@ -3825,6 +3829,14 @@ static inline void update_sd_lb_stats(st
sds->total_load += sgs.group_load;
sds->total_pwr += group->__cpu_power;
+ /*
+ * In case the child domain prefers tasks go to siblings
+ * first, lower the group capacity to one so that we'll try
+ * and move all the excess tasks away.
+ */
+ if (prefer_sibling)
+ sgs.group_capacity = 1;
+
if (local_group) {
sds->this_load = sgs.avg_load;
sds->this = group;
--
* [RFC][PATCH 3/6] sched: update the cpu_power sum during load-balance
2009-08-27 15:00 [RFC][PATCH 0/6] load-balancing and cpu_power Peter Zijlstra
2009-08-27 15:00 ` [RFC][PATCH 1/6] sched: restore __cpu_power to a straight sum of power Peter Zijlstra
2009-08-27 15:00 ` [RFC][PATCH 2/6] sched: SD_PREFER_SIBLING Peter Zijlstra
@ 2009-08-27 15:00 ` Peter Zijlstra
2009-08-27 15:00 ` [RFC][PATCH 4/6] sched: dynamic cpu_power Peter Zijlstra
` (3 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: Peter Zijlstra @ 2009-08-27 15:00 UTC
To: Ingo Molnar
Cc: linux-kernel, Gautham R Shenoy, Andreas Herrmann, Balbir Singh,
Peter Zijlstra
[-- Attachment #1: sched-lb-3.patch --]
[-- Type: text/plain, Size: 1897 bytes --]
In order to prepare for a more dynamic cpu_power, update the group sum
while walking the sched domains during load-balance.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
---
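[ Illustration, not part of the patch: a standalone sketch of the sum that
update_sched_power() performs below -- walk the child domain's circular
list of groups, add up their power, and never report less than one full
cpu. The struct and the load-scale constant are toy stand-ins. ]

#include <stdio.h>

#define TOY_LOAD_SCALE	1024UL

/* child groups form a circular singly linked list, like sched_group->next */
struct toy_group {
	unsigned long power;
	struct toy_group *next;
};

static unsigned long sum_child_power(struct toy_group *child)
{
	struct toy_group *g = child;
	unsigned long sum = 0;

	do {
		sum += g->power;
		g = g->next;
	} while (g != child);

	/* never report less than one full cpu worth of power */
	if (sum < TOY_LOAD_SCALE)
		sum = TOY_LOAD_SCALE;

	return sum;
}

int main(void)
{
	struct toy_group a = { .power = 589 }, b = { .power = 589 };

	a.next = &b;			/* two SMT siblings, as in patch 1/6 */
	b.next = &a;

	printf("parent group power: %lu\n", sum_child_power(&a));	/* 1178 */
	return 0;
}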
kernel/sched.c | 31 +++++++++++++++++++++++++++++--
1 file changed, 29 insertions(+), 2 deletions(-)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -3691,6 +3691,31 @@ static inline int check_power_save_busie
}
#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+static void update_sched_power(struct sched_domain *sd)
+{
+ struct sched_domain *child = sd->child;
+ struct sched_group *group, *sdg = sd->groups;
+ unsigned long power = sdg->__cpu_power;
+
+ if (!child) {
+ /* compute cpu power for this cpu */
+ return;
+ }
+
+ sdg->__cpu_power = 0;
+
+ group = child->groups;
+ do {
+ sdg->__cpu_power += group->__cpu_power;
+ group = group->next;
+ } while (group != child->groups);
+
+ if (sdg->__cpu_power < SCHED_LOAD_SCALE)
+ sdg->__cpu_power = SCHED_LOAD_SCALE;
+
+ if (power != sdg->__cpu_power)
+ sdg->reciprocal_cpu_power = reciprocal_value(sdg->__cpu_power);
+}
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
@@ -3715,8 +3740,11 @@ static inline void update_sg_lb_stats(st
unsigned long sum_avg_load_per_task;
unsigned long avg_load_per_task;
- if (local_group)
+ if (local_group) {
balance_cpu = group_first_cpu(group);
+ if (balance_cpu == this_cpu)
+ update_sched_power(sd);
+ }
/* Tally up the load of all CPUs in the group */
sum_avg_load_per_task = avg_load_per_task = 0;
@@ -3855,7 +3883,6 @@ static inline void update_sd_lb_stats(st
update_sd_power_savings_stats(group, sds, local_group, &sgs);
group = group->next;
} while (group != sd->groups);
-
}
/**
--
* [RFC][PATCH 4/6] sched: dynamic cpu_power
2009-08-27 15:00 [RFC][PATCH 0/6] load-balancing and cpu_power Peter Zijlstra
` (2 preceding siblings ...)
2009-08-27 15:00 ` [RFC][PATCH 3/6] sched: update the cpu_power sum during load-balance Peter Zijlstra
@ 2009-08-27 15:00 ` Peter Zijlstra
2009-08-27 15:00 ` [RFC][PATCH 5/6] sched: scale down cpu_power due to RT tasks Peter Zijlstra
` (2 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: Peter Zijlstra @ 2009-08-27 15:00 UTC
To: Ingo Molnar
Cc: linux-kernel, Gautham R Shenoy, Andreas Herrmann, Balbir Singh,
Peter Zijlstra
[-- Attachment #1: sched-lb-4.patch --]
[-- Type: text/plain, Size: 2016 bytes --]
Recompute the cpu_power for each cpu during load-balance, rather than
only once when the sched domains are built.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
---
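[ Illustration, not part of the patch: arch_smt_gain() below is declared
__weak so an architecture can replace the generic "split the gain evenly
over the siblings" policy. A toy sketch of that override pattern using gcc
weak symbols; the function name is made up, only the split-by-weight
default mirrors the patch. ]

#include <stdio.h>

/* generic default, same policy as the patch: split the gain over siblings */
unsigned long __attribute__((weak))
toy_arch_smt_gain(unsigned long smt_gain, unsigned long weight)
{
	return smt_gain / weight;
}

/*
 * An architecture that can measure its real SMT yield would provide a
 * strong definition of toy_arch_smt_gain() in another object file, and
 * the weak default above would be dropped at link time.
 */

int main(void)
{
	/* default split of the 1178 gain over two siblings */
	printf("per-sibling gain: %lu\n", toy_arch_smt_gain(1178, 2));	/* 589 */
	return 0;
}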
kernel/sched.c | 38 +++++++++++++++++++++++++++++++++++---
1 file changed, 35 insertions(+), 3 deletions(-)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -3691,14 +3691,46 @@ static inline int check_power_save_busie
}
#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-static void update_sched_power(struct sched_domain *sd)
+unsigned long __weak arch_smt_gain(struct sched_domain *sd, int cpu)
+{
+ unsigned long weight = cpumask_weight(sched_domain_span(sd));
+ unsigned long smt_gain = sd->smt_gain;
+
+ smt_gain /= weight;
+
+ return smt_gain;
+}
+
+static void update_cpu_power(struct sched_domain *sd, int cpu)
+{
+ unsigned long weight = cpumask_weight(sched_domain_span(sd));
+ unsigned long power = SCHED_LOAD_SCALE;
+ struct sched_group *sdg = sd->groups;
+ unsigned long old = sdg->__cpu_power;
+
+ /* here we could scale based on cpufreq */
+
+ if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
+ power *= arch_smt_gain(sd, cpu);
+ power >>= SCHED_LOAD_SHIFT;
+ }
+
+ /* here we could scale based on RT time */
+
+ if (power != old) {
+ sdg->__cpu_power = power;
+ sdg->reciprocal_cpu_power = reciprocal_value(power);
+ }
+}
+
+static void update_group_power(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
unsigned long power = sdg->__cpu_power;
if (!child) {
- /* compute cpu power for this cpu */
+ update_cpu_power(sd, cpu);
return;
}
@@ -3743,7 +3775,7 @@ static inline void update_sg_lb_stats(st
if (local_group) {
balance_cpu = group_first_cpu(group);
if (balance_cpu == this_cpu)
- update_sched_power(sd);
+ update_group_power(sd, this_cpu);
}
/* Tally up the load of all CPUs in the group */
--
* [RFC][PATCH 5/6] sched: scale down cpu_power due to RT tasks
2009-08-27 15:00 [RFC][PATCH 0/6] load-balancing and cpu_power Peter Zijlstra
` (3 preceding siblings ...)
2009-08-27 15:00 ` [RFC][PATCH 4/6] sched: dynamic cpu_power Peter Zijlstra
@ 2009-08-27 15:00 ` Peter Zijlstra
2009-08-27 15:00 ` [RFC][PATCH 6/6] sched: try to deal with low capacity Peter Zijlstra
2009-08-28 18:17 ` [RFC][PATCH 0/6] load-balancing and cpu_power Balbir Singh
6 siblings, 0 replies; 8+ messages in thread
From: Peter Zijlstra @ 2009-08-27 15:00 UTC
To: Ingo Molnar
Cc: linux-kernel, Gautham R Shenoy, Andreas Herrmann, Balbir Singh,
Peter Zijlstra
[-- Attachment #1: sched-lb-5.patch --]
[-- Type: text/plain, Size: 4855 bytes --]
Keep an average of the amount of time spent on RT tasks and use that
fraction to scale down the cpu_power available for regular tasks.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
---
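[ Illustration, not part of the patch: a worked example of the intended
scaling -- cpu_power is multiplied by the fraction of the recent window
not consumed by RT tasks. The 1000ms window and the 250ms of RT time are
made-up numbers; SCHED_LOAD_SHIFT/SCHED_LOAD_SCALE match the kernel. ]

#include <stdio.h>
#include <stdint.h>

#define SCHED_LOAD_SHIFT	10
#define SCHED_LOAD_SCALE	(1UL << SCHED_LOAD_SHIFT)

/* fraction of the window left for regular tasks, in SCHED_LOAD_SCALE units */
static unsigned long rt_scale_example(uint64_t total_ns, uint64_t rt_ns)
{
	uint64_t available = total_ns > rt_ns ? total_ns - rt_ns : 0;

	if (total_ns < SCHED_LOAD_SCALE)
		total_ns = SCHED_LOAD_SCALE;

	return (unsigned long)(available / (total_ns >> SCHED_LOAD_SHIFT));
}

int main(void)
{
	unsigned long power = SCHED_LOAD_SCALE;

	/* RT tasks ate 250ms of a 1000ms window -> ~75% of the power is left */
	power = power * rt_scale_example(1000000000ULL, 250000000ULL);
	power >>= SCHED_LOAD_SHIFT;

	printf("scaled power: %lu (of %lu)\n", power, SCHED_LOAD_SCALE);  /* ~768 */
	return 0;
}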
include/linux/sched.h | 1
kernel/sched.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++---
kernel/sched_rt.c | 2 +
kernel/sysctl.c | 8 ++++++
4 files changed, 70 insertions(+), 3 deletions(-)
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -1837,6 +1837,7 @@ extern unsigned int sysctl_sched_child_r
extern unsigned int sysctl_sched_features;
extern unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_nr_migrate;
+extern unsigned int sysctl_sched_time_avg;
extern unsigned int sysctl_timer_migration;
int sched_nr_latency_handler(struct ctl_table *table, int write,
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -627,6 +627,9 @@ struct rq {
struct task_struct *migration_thread;
struct list_head migration_queue;
+
+ u64 rt_avg;
+ u64 age_stamp;
#endif
/* calc_load related fields */
@@ -863,6 +866,14 @@ unsigned int sysctl_sched_shares_ratelim
unsigned int sysctl_sched_shares_thresh = 4;
/*
+ * period over which we average the RT time consumption, measured
+ * in ms.
+ *
+ * default: 1s
+ */
+const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
+
+/*
* period over which we measure -rt task cpu usage in us.
* default: 1s
*/
@@ -1280,12 +1291,36 @@ void wake_up_idle_cpu(int cpu)
}
#endif /* CONFIG_NO_HZ */
+static u64 sched_avg_period(void)
+{
+ return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
+}
+
+static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
+{
+ rq->rt_avg += rt_delta;
+}
+
+static void sched_avg_update(struct rq *rq)
+{
+ s64 period = sched_avg_period();
+
+ while ((s64)(rq->clock - rq->age_stamp) > period) {
+ rq->age_stamp += period;
+ rq->rt_avg /= 2;
+ }
+}
+
#else /* !CONFIG_SMP */
static void resched_task(struct task_struct *p)
{
assert_spin_locked(&task_rq(p)->lock);
set_tsk_need_resched(p);
}
+
+static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
+{
+}
#endif /* CONFIG_SMP */
#if BITS_PER_LONG == 32
@@ -3691,7 +3726,7 @@ static inline int check_power_save_busie
}
#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-unsigned long __weak arch_smt_gain(struct sched_domain *sd, int cpu)
+unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
{
unsigned long weight = cpumask_weight(sched_domain_span(sd));
unsigned long smt_gain = sd->smt_gain;
@@ -3701,6 +3736,23 @@ unsigned long __weak arch_smt_gain(struc
return smt_gain;
}
+unsigned long scale_rt_power(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ u64 total, available;
+
+ total = sched_avg_period() + (rq->clock - rq->age_stamp);
+ available = total - rq->rt_avg;
+
+ if (unlikely((s64)available < 0))
+ available = 0;
+
+ if (unlikely((s64)total < (1 << 15)))
+ total = 1 << 15;
+
+ available >>= 15 - SCHED_LOAD_SHIFT;
+ total >>= 15;
+
+ return div_u64(available, total);
+}
+
static void update_cpu_power(struct sched_domain *sd, int cpu)
{
unsigned long weight = cpumask_weight(sched_domain_span(sd));
@@ -3711,11 +3763,15 @@ static void update_cpu_power(struct sche
/* here we could scale based on cpufreq */
if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
- power *= arch_smt_gain(sd, cpu);
+ power *= arch_scale_smt_power(sd, cpu);
power >>= SCHED_LOAD_SHIFT;
}
- /* here we could scale based on RT time */
+ power *= scale_rt_power(cpu);
+ power >>= SCHED_LOAD_SHIFT;
+
+ if (!power)
+ power = 1;
if (power != old) {
sdg->__cpu_power = power;
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -615,6 +615,8 @@ static void update_curr_rt(struct rq *rq
curr->se.exec_start = rq->clock;
cpuacct_charge(curr, delta_exec);
+ sched_rt_avg_update(rq, delta_exec);
+
if (!rt_bandwidth_enabled())
return;
Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -332,6 +332,14 @@ static struct ctl_table kern_table[] = {
},
{
.ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_time_avg",
+ .data = &sysctl_sched_time_avg,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
.procname = "timer_migration",
.data = &sysctl_timer_migration,
.maxlen = sizeof(unsigned int),
--
* [RFC][PATCH 6/6] sched: try to deal with low capacity
2009-08-27 15:00 [RFC][PATCH 0/6] load-balancing and cpu_power Peter Zijlstra
` (4 preceding siblings ...)
2009-08-27 15:00 ` [RFC][PATCH 5/6] sched: scale down cpu_power due to RT tasks Peter Zijlstra
@ 2009-08-27 15:00 ` Peter Zijlstra
2009-08-28 18:17 ` [RFC][PATCH 0/6] load-balancing and cpu_power Balbir Singh
6 siblings, 0 replies; 8+ messages in thread
From: Peter Zijlstra @ 2009-08-27 15:00 UTC
To: Ingo Molnar
Cc: linux-kernel, Gautham R Shenoy, Andreas Herrmann, Balbir Singh,
Peter Zijlstra
[-- Attachment #1: sched-lb-6.patch --]
[-- Type: text/plain, Size: 1784 bytes --]
When the capacity drops low, we want to migrate load away. Allow the
load-balancer to remove all tasks when we hit rock bottom.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
---
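[ Illustration, not part of the patch: what the find_busiest_queue()
change below amounts to -- normalize each cpu's raw weighted load by its
cpu_power, and drop the "single running task, leave it alone" shortcut
once the capacity rounds down to zero. The numbers are made up. ]

#include <stdio.h>

#define SCHED_LOAD_SHIFT	10
#define SCHED_LOAD_SCALE	(1UL << SCHED_LOAD_SHIFT)

/*
 * Normalize a runqueue's raw weighted load by the cpu's power, so a cpu
 * whose power has been scaled down (RT work, SMT sharing) looks busier.
 */
static unsigned long normalized_load(unsigned long raw_load, unsigned long power)
{
	return raw_load * SCHED_LOAD_SCALE / power;
}

int main(void)
{
	/* same raw load (one nice-0 task), but only half the power left */
	printf("full power load: %lu\n", normalized_load(1024, 1024));	/* 1024 */
	printf("half power load: %lu\n", normalized_load(1024, 512));	/* 2048 */

	/*
	 * capacity = power >> SCHED_LOAD_SHIFT; at power 512 that is 0, so
	 * the single-task shortcut no longer applies and even a lone task
	 * can be migrated away.
	 */
	printf("capacity at half power: %lu\n", 512UL >> SCHED_LOAD_SHIFT);
	return 0;
}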
kernel/sched.c | 29 ++++++++++++++++++++++++++---
1 file changed, 26 insertions(+), 3 deletions(-)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -3951,7 +3951,7 @@ static inline void update_sd_lb_stats(st
* and move all the excess tasks away.
*/
if (prefer_sibling)
- sgs.group_capacity = 1;
+ sgs.group_capacity = min(sgs.group_capacity, 1UL);
if (local_group) {
sds->this_load = sgs.avg_load;
@@ -4183,6 +4183,26 @@ ret:
return NULL;
}
+static struct sched_group *group_of(int cpu)
+{
+ struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
+
+ if (!sd)
+ return NULL;
+
+ return sd->groups;
+}
+
+static unsigned long power_of(int cpu)
+{
+ struct sched_group *group = group_of(cpu);
+
+ if (!group)
+ return SCHED_LOAD_SCALE;
+
+ return group->__cpu_power;
+}
+
/*
* find_busiest_queue - find the busiest runqueue among the cpus in group.
*/
@@ -4195,15 +4215,18 @@ find_busiest_queue(struct sched_group *g
int i;
for_each_cpu(i, sched_group_cpus(group)) {
+ unsigned long power = power_of(i);
+ unsigned long capacity = power >> SCHED_LOAD_SHIFT;
unsigned long wl;
if (!cpumask_test_cpu(i, cpus))
continue;
rq = cpu_rq(i);
- wl = weighted_cpuload(i);
+ wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
+ wl /= power;
- if (rq->nr_running == 1 && wl > imbalance)
+ if (capacity && rq->nr_running == 1 && wl > imbalance)
continue;
if (wl > max_load) {
--
* Re: [RFC][PATCH 0/6] load-balancing and cpu_power
2009-08-27 15:00 [RFC][PATCH 0/6] load-balancing and cpu_power Peter Zijlstra
` (5 preceding siblings ...)
2009-08-27 15:00 ` [RFC][PATCH 6/6] sched: try to deal with low capacity Peter Zijlstra
@ 2009-08-28 18:17 ` Balbir Singh
6 siblings, 0 replies; 8+ messages in thread
From: Balbir Singh @ 2009-08-28 18:17 UTC
To: Peter Zijlstra
Cc: Ingo Molnar, linux-kernel, Gautham R Shenoy, Andreas Herrmann
* Peter Zijlstra <a.p.zijlstra@chello.nl> [2009-08-27 17:00:51]:
> Hi
>
> These are utterly untested patches, and even if they'd compile I guess they'll
> balance your system straight like a banana. But it should show the direction
> I'm wanting to take this thing.
>
> I'll continue prodding at this, but wanted to put this out early so we don't
> all start doing the same, and so that you can poke holes in it early.
>
Let me put them on my TODO queue for testing.
--
Balbir