* [PATCH] sched: properly account IRQ and RT load in SCHED_OTHER load balancing
@ 2008-08-21 9:18 Peter Zijlstra
2008-08-21 10:47 ` Ingo Molnar
` (2 more replies)
0 siblings, 3 replies; 11+ messages in thread
From: Peter Zijlstra @ 2008-08-21 9:18 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Nick Piggin, Gregory Haskins, vatsa, linux-kernel
Subject: sched: properly account IRQ and RT load in SCHED_OTHER load balancing
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu Aug 14 09:31:20 CEST 2008
We used to account for RT tasks in SCHED_OTHER load-balancing by giving
them some phantom weight.
This is incorrect because there is no saying how much time an RT task will
actually consume. Also, it doesn't take IRQ time into account.
This patch tries to solve this issue by accounting the time spent on both
Real-Time tasks and IRQ handling, and using that to proportionally inflate
the SCHED_OTHER load.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/hardirq.h | 10 +++
include/linux/sched.h | 1
kernel/sched.c | 126 +++++++++++++++++++++++++++++++++++++++++++-----
kernel/sched_debug.c | 2
kernel/sched_rt.c | 8 +++
kernel/softirq.c | 1
kernel/sysctl.c | 8 +++
7 files changed, 145 insertions(+), 11 deletions(-)
Index: linux-2.6/include/linux/hardirq.h
===================================================================
--- linux-2.6.orig/include/linux/hardirq.h
+++ linux-2.6/include/linux/hardirq.h
@@ -127,6 +127,14 @@ static inline void account_system_vtime(
}
#endif
+#ifdef CONFIG_SMP
+extern void sched_irq_enter(void);
+extern void sched_irq_exit(void);
+#else
+# define sched_irq_enter() do { } while (0)
+# define sched_irq_exit() do { } while (0)
+#endif
+
#if defined(CONFIG_PREEMPT_RCU) && defined(CONFIG_NO_HZ)
extern void rcu_irq_enter(void);
extern void rcu_irq_exit(void);
@@ -143,6 +151,7 @@ extern void rcu_irq_exit(void);
*/
#define __irq_enter() \
do { \
+ sched_irq_enter(); \
rcu_irq_enter(); \
account_system_vtime(current); \
add_preempt_count(HARDIRQ_OFFSET); \
@@ -163,6 +172,7 @@ extern void irq_enter(void);
account_system_vtime(current); \
sub_preempt_count(HARDIRQ_OFFSET); \
rcu_irq_exit(); \
+ sched_irq_exit(); \
} while (0)
/*
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -1614,6 +1614,7 @@ extern unsigned int sysctl_sched_feature
extern unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_nr_migrate;
extern unsigned int sysctl_sched_shares_ratelimit;
+extern unsigned int sysctl_sched_time_avg;
int sched_nr_latency_handler(struct ctl_table *table, int write,
struct file *file, void __user *buffer, size_t *length,
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -571,6 +571,12 @@ struct rq {
struct task_struct *migration_thread;
struct list_head migration_queue;
+
+ u64 irq_stamp;
+ unsigned long irq_time;
+ unsigned long rt_time;
+ u64 age_stamp;
+
#endif
#ifdef CONFIG_SCHED_HRTICK
@@ -816,14 +822,21 @@ const_debug unsigned int sysctl_sched_nr
unsigned int sysctl_sched_shares_ratelimit = 250000;
/*
- * period over which we measure -rt task cpu usage in us.
+ * period over which we average the IRQ and RT cpu consumption, measured in
+ * jiffies.
* default: 1s
*/
-unsigned int sysctl_sched_rt_period = 1000000;
+const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
static __read_mostly int scheduler_running;
/*
+ * period over which we measure -rt task cpu usage in us.
+ * default: 1s
+ */
+unsigned int sysctl_sched_rt_period = 1000000;
+
+/*
* part of the period that we allow rt tasks to run in us.
* default: 0.95s
*/
@@ -1143,6 +1156,82 @@ static inline void init_hrtick(void)
}
#endif
+#ifdef CONFIG_SMP
+/*
+ * Measure IRQ time, we start when we first enter IRQ state
+ * and stop when we last leave IRQ state (nested IRQs).
+ */
+void sched_irq_enter(void)
+{
+ if (!in_irq()) {
+ struct rq *rq = this_rq();
+
+ update_rq_clock(rq);
+ rq->irq_stamp = rq->clock;
+ }
+}
+
+void sched_irq_exit(void)
+{
+ if (!in_irq()) {
+ struct rq *rq = this_rq();
+
+ update_rq_clock(rq);
+ rq->irq_time += rq->clock - rq->irq_stamp;
+ }
+}
+
+static inline u64 sched_avg_period(void)
+{
+ return (u64)sysctl_sched_time_avg * (NSEC_PER_MSEC / 2);
+}
+
+/*
+ * Every period/2 we half the accumulated time. See lib/proportions.c
+ */
+static void sched_age_time(struct rq *rq)
+{
+ if (rq->clock - rq->age_stamp >= sched_avg_period()) {
+ rq->irq_time /= 2;
+ rq->rt_time /= 2;
+ rq->age_stamp = rq->clock;
+ }
+}
+
+/*
+ * Scale the SCHED_OTHER load on this rq up to compensate for the pressure
+ * of IRQ and RT usage of this CPU.
+ *
+ * See lib/proportions.c
+ */
+static unsigned long sched_scale_load(struct rq *rq, u64 load)
+{
+ u64 total = sched_avg_period() + (rq->clock - rq->age_stamp);
+ u64 available = total - rq->irq_time - rq->rt_time;
+
+ /*
+ * Shift back to roughly us scale, so that the divisor fits in u32.
+ */
+ total >>= 10;
+ available >>= 10;
+
+ if (unlikely((s64)available <= 0))
+ available = 1;
+
+ load *= total;
+ load = div_u64(load, available);
+
+ /*
+ * Clip the maximal load value to something plenty high.
+ */
+ return min_t(unsigned long, load, 1UL << 22);
+}
+#else
+static inline void sched_age_time(struct rq *rq)
+{
+}
+#endif
+
/*
* resched_task - mark a task 'to be rescheduled now'.
*
@@ -1635,8 +1724,12 @@ static void dec_nr_running(struct rq *rq
static void set_load_weight(struct task_struct *p)
{
if (task_has_rt_policy(p)) {
- p->se.load.weight = prio_to_weight[0] * 2;
- p->se.load.inv_weight = prio_to_wmult[0] >> 1;
+ /*
+ * Real-time tasks do not contribute to SCHED_OTHER load
+ * this is compensated by sched_scale_load() usage.
+ */
+ p->se.load.weight = 0;
+ p->se.load.inv_weight = 0;
return;
}
@@ -2028,10 +2121,10 @@ static unsigned long source_load(int cpu
struct rq *rq = cpu_rq(cpu);
unsigned long total = weighted_cpuload(cpu);
- if (type == 0 || !sched_feat(LB_BIAS))
- return total;
+ if (type && sched_feat(LB_BIAS))
+ total = min(rq->cpu_load[type-1], total);
- return min(rq->cpu_load[type-1], total);
+ return sched_scale_load(rq, total);
}
/*
@@ -2043,10 +2136,10 @@ static unsigned long target_load(int cpu
struct rq *rq = cpu_rq(cpu);
unsigned long total = weighted_cpuload(cpu);
- if (type == 0 || !sched_feat(LB_BIAS))
- return total;
+ if (type && sched_feat(LB_BIAS))
+ total = max(rq->cpu_load[type-1], total);
- return max(rq->cpu_load[type-1], total);
+ return sched_scale_load(rq, total);
}
/*
@@ -2956,10 +3049,20 @@ balance_tasks(struct rq *this_rq, int th
int loops = 0, pulled = 0, pinned = 0;
struct task_struct *p;
long rem_load_move = max_load_move;
+ unsigned long busy_weight, this_weight, weight_scale;
if (max_load_move == 0)
goto out;
+ /*
+ * Compute a weight scale to properly account for the varying
+ * load inflation between these CPUs.
+ */
+ busy_weight = sched_scale_load(busiest, NICE_0_LOAD);
+ this_weight = sched_scale_load(this_rq, NICE_0_LOAD);
+
+ weight_scale = div_u64((u64)this_weight * NICE_0_LOAD, busy_weight);
+
pinned = 1;
/*
@@ -2978,7 +3081,7 @@ next:
pull_task(busiest, p, this_rq, this_cpu);
pulled++;
- rem_load_move -= p->se.load.weight;
+ rem_load_move -= (weight_scale * p->se.load.weight) >> NICE_0_SHIFT;
/*
* We only want to steal up to the prescribed amount of weighted load.
@@ -4211,6 +4314,7 @@ void scheduler_tick(void)
spin_lock(&rq->lock);
update_rq_clock(rq);
update_cpu_load(rq);
+ sched_age_time(rq);
curr->sched_class->task_tick(rq, curr, 0);
spin_unlock(&rq->lock);
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -478,6 +478,14 @@ static void update_curr_rt(struct rq *rq
if (unlikely((s64)delta_exec < 0))
delta_exec = 0;
+#ifdef CONFIG_SMP
+ /*
+ * Account the time spend running RT tasks on this rq. Used to inflate
+ * this rq's load values.
+ */
+ rq->rt_time += delta_exec;
+#endif
+
schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
curr->se.sum_exec_runtime += delta_exec;
Index: linux-2.6/kernel/softirq.c
===================================================================
--- linux-2.6.orig/kernel/softirq.c
+++ linux-2.6/kernel/softirq.c
@@ -280,6 +280,7 @@ void irq_exit(void)
account_system_vtime(current);
trace_hardirq_exit();
sub_preempt_count(IRQ_EXIT_OFFSET);
+ sched_irq_exit();
if (!in_interrupt() && local_softirq_pending())
invoke_softirq();
Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -309,6 +309,14 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_time_avg_ms",
+ .data = &sysctl_sched_time_avg,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
#endif
{
.ctl_name = CTL_UNNUMBERED,
Index: linux-2.6/kernel/sched_debug.c
===================================================================
--- linux-2.6.orig/kernel/sched_debug.c
+++ linux-2.6/kernel/sched_debug.c
@@ -245,6 +245,8 @@ static void print_cpu(struct seq_file *m
P(nr_running);
SEQ_printf(m, " .%-30s: %lu\n", "load",
rq->load.weight);
+ SEQ_printf(m, " .%-30s: %ld\n", "scaled_load",
+ sched_scale_load(rq, rq->load.weight));
P(nr_switches);
P(nr_load_updates);
P(nr_uninterruptible);
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] sched: properly account IRQ and RT load in SCHED_OTHER load balancing
2008-08-21 9:18 [PATCH] sched: properly account IRQ and RT load in SCHED_OTHER load balancing Peter Zijlstra
@ 2008-08-21 10:47 ` Ingo Molnar
2008-08-21 11:17 ` Ingo Molnar
2008-08-21 11:36 ` Gregory Haskins
2008-08-21 12:43 ` Peter Zijlstra
2 siblings, 1 reply; 11+ messages in thread
From: Ingo Molnar @ 2008-08-21 10:47 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Nick Piggin, Gregory Haskins, vatsa, linux-kernel
* Peter Zijlstra <peterz@infradead.org> wrote:
> Subject: sched: properly account IRQ and RT load in SCHED_OTHER load balancing
> From: Peter Zijlstra <a.p.zijlstra@chello.nl>
> Date: Thu Aug 14 09:31:20 CEST 2008
>
> We used to account for RT tasks in SCHED_OTHER load-balancing by giving
> them some phantom weight.
>
> This is incorrect because there is no saying how much time a RT task
> will actually consume. Also, it doesn't take IRQ time into account.
>
> This patch tries to solve this issue by accounting the time spend on
> both Real-Time tasks and IRQ handling, and using that to
> proportionally inflate the SCHED_OTHER load.
applied it to tip/sched/devel for more testing.
this bit:
> +void sched_irq_enter(void)
> +{
> + if (!in_irq()) {
> + struct rq *rq = this_rq();
> +
> + update_rq_clock(rq);
> + rq->irq_stamp = rq->clock;
> + }
if we do this we might as well use the opportunity to do accurate IRQ
(and softirq) CPU time accounting. I.e. right now it only drives
balancing, but isnt fed into the wider IRQ/softirq /proc usage stats.
Ingo
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] sched: properly account IRQ and RT load in SCHED_OTHER load balancing
2008-08-21 10:47 ` Ingo Molnar
@ 2008-08-21 11:17 ` Ingo Molnar
2008-08-21 11:22 ` Peter Zijlstra
0 siblings, 1 reply; 11+ messages in thread
From: Ingo Molnar @ 2008-08-21 11:17 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Nick Piggin, Gregory Haskins, vatsa, linux-kernel
* Ingo Molnar <mingo@elte.hu> wrote:
> > This patch tries to solve this issue by accounting the time spend on
> > both Real-Time tasks and IRQ handling, and using that to
> > proportionally inflate the SCHED_OTHER load.
>
> applied it to tip/sched/devel for more testing.
-tip testing found that it wouldnt build on UP in sched_debug.c - made
that bit dependent on CONFIG_SMP. sched_scale_load() does not seem to be
too meaningful on UP.
Ingo
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] sched: properly account IRQ and RT load in SCHED_OTHER load balancing
2008-08-21 11:17 ` Ingo Molnar
@ 2008-08-21 11:22 ` Peter Zijlstra
2008-08-21 11:40 ` Ingo Molnar
0 siblings, 1 reply; 11+ messages in thread
From: Peter Zijlstra @ 2008-08-21 11:22 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Nick Piggin, Gregory Haskins, vatsa, linux-kernel
On Thu, 2008-08-21 at 13:17 +0200, Ingo Molnar wrote:
> * Ingo Molnar <mingo@elte.hu> wrote:
>
> > > This patch tries to solve this issue by accounting the time spend on
> > > both Real-Time tasks and IRQ handling, and using that to
> > > proportionally inflate the SCHED_OTHER load.
> >
> > applied it to tip/sched/devel for more testing.
>
> -tip testing found that it wouldnt build on UP in sched_debug.c - made
> that bit dependent on CONFIG_SMP. sched_scale_load() does not seem to be
> too meaningful on UP.
Thanks - just prior to posting I removed the empty stub for
sched_scale_load() because it didn't make sense for UP code to use it...
So much for double checking myself.
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] sched: properly account IRQ and RT load in SCHED_OTHER load balancing
2008-08-21 9:18 [PATCH] sched: properly account IRQ and RT load in SCHED_OTHER load balancing Peter Zijlstra
2008-08-21 10:47 ` Ingo Molnar
@ 2008-08-21 11:36 ` Gregory Haskins
2008-08-21 11:41 ` Ingo Molnar
2008-08-21 12:43 ` Peter Zijlstra
2 siblings, 1 reply; 11+ messages in thread
From: Gregory Haskins @ 2008-08-21 11:36 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Ingo Molnar, Nick Piggin, vatsa, linux-kernel
[-- Attachment #1: Type: text/plain, Size: 10874 bytes --]
Peter Zijlstra wrote:
> Subject: sched: properly account IRQ and RT load in SCHED_OTHER load balancing
> From: Peter Zijlstra <a.p.zijlstra@chello.nl>
> Date: Thu Aug 14 09:31:20 CEST 2008
>
> We used to account for RT tasks in SCHED_OTHER load-balancing by giving
> them some phantom weight.
>
> This is incorrect because there is no saying how much time a RT task will
> actually consume. Also, it doesn't take IRQ time into account.
>
> This patch tries to solve this issue by accounting the time spend on both
> Real-Time tasks and IRQ handling, and using that to proportionally inflate
> the SCHED_OTHER load.
>
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
>
I haven't had a chance to review the code thoroughly yet, but I had been
working on a similar fix and know that this is sorely needed. So...
Acked-by: Gregory Haskins <ghaskins@novell.com>
> ---
> include/linux/hardirq.h | 10 +++
> include/linux/sched.h | 1
> kernel/sched.c | 126 +++++++++++++++++++++++++++++++++++++++++++-----
> kernel/sched_debug.c | 2
> kernel/sched_rt.c | 8 +++
> kernel/softirq.c | 1
> kernel/sysctl.c | 8 +++
> 7 files changed, 145 insertions(+), 11 deletions(-)
>
> Index: linux-2.6/include/linux/hardirq.h
> ===================================================================
> --- linux-2.6.orig/include/linux/hardirq.h
> +++ linux-2.6/include/linux/hardirq.h
> @@ -127,6 +127,14 @@ static inline void account_system_vtime(
> }
> #endif
>
> +#ifdef CONFIG_SMP
> +extern void sched_irq_enter(void);
> +extern void sched_irq_exit(void);
> +#else
> +# define sched_irq_enter() do { } while (0)
> +# define sched_irq_exit() do { } while (0)
> +#endif
> +
> #if defined(CONFIG_PREEMPT_RCU) && defined(CONFIG_NO_HZ)
> extern void rcu_irq_enter(void);
> extern void rcu_irq_exit(void);
> @@ -143,6 +151,7 @@ extern void rcu_irq_exit(void);
> */
> #define __irq_enter() \
> do { \
> + sched_irq_enter(); \
> rcu_irq_enter(); \
> account_system_vtime(current); \
> add_preempt_count(HARDIRQ_OFFSET); \
> @@ -163,6 +172,7 @@ extern void irq_enter(void);
> account_system_vtime(current); \
> sub_preempt_count(HARDIRQ_OFFSET); \
> rcu_irq_exit(); \
> + sched_irq_exit(); \
> } while (0)
>
> /*
> Index: linux-2.6/include/linux/sched.h
> ===================================================================
> --- linux-2.6.orig/include/linux/sched.h
> +++ linux-2.6/include/linux/sched.h
> @@ -1614,6 +1614,7 @@ extern unsigned int sysctl_sched_feature
> extern unsigned int sysctl_sched_migration_cost;
> extern unsigned int sysctl_sched_nr_migrate;
> extern unsigned int sysctl_sched_shares_ratelimit;
> +extern unsigned int sysctl_sched_time_avg;
>
> int sched_nr_latency_handler(struct ctl_table *table, int write,
> struct file *file, void __user *buffer, size_t *length,
> Index: linux-2.6/kernel/sched.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched.c
> +++ linux-2.6/kernel/sched.c
> @@ -571,6 +571,12 @@ struct rq {
>
> struct task_struct *migration_thread;
> struct list_head migration_queue;
> +
> + u64 irq_stamp;
> + unsigned long irq_time;
> + unsigned long rt_time;
> + u64 age_stamp;
> +
> #endif
>
> #ifdef CONFIG_SCHED_HRTICK
> @@ -816,14 +822,21 @@ const_debug unsigned int sysctl_sched_nr
> unsigned int sysctl_sched_shares_ratelimit = 250000;
>
> /*
> - * period over which we measure -rt task cpu usage in us.
> + * period over which we average the IRQ and RT cpu consumption, measured in
> + * jiffies.
> * default: 1s
> */
> -unsigned int sysctl_sched_rt_period = 1000000;
> +const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
>
> static __read_mostly int scheduler_running;
>
> /*
> + * period over which we measure -rt task cpu usage in us.
> + * default: 1s
> + */
> +unsigned int sysctl_sched_rt_period = 1000000;
> +
> +/*
> * part of the period that we allow rt tasks to run in us.
> * default: 9.5s
> */
> @@ -1143,6 +1156,82 @@ static inline void init_hrtick(void)
> }
> #endif
>
> +#ifdef CONFIG_SMP
> +/*
> + * Measure IRQ time, we start when we first enter IRQ state
> + * and stop when we last leave IRQ state (nested IRQs).
> + */
> +void sched_irq_enter(void)
> +{
> + if (!in_irq()) {
> + struct rq *rq = this_rq();
> +
> + update_rq_clock(rq);
> + rq->irq_stamp = rq->clock;
> + }
> +}
> +
> +void sched_irq_exit(void)
> +{
> + if (!in_irq()) {
> + struct rq *rq = this_rq();
> +
> + update_rq_clock(rq);
> + rq->irq_time += rq->clock - rq->irq_stamp;
> + }
> +}
> +
> +static inline u64 sched_avg_period(void)
> +{
> + return (u64)sysctl_sched_time_avg * (NSEC_PER_MSEC / 2);
> +}
> +
> +/*
> + * Every period/2 we half the accumulated time. See lib/proportions.c
> + */
> +static void sched_age_time(struct rq *rq)
> +{
> + if (rq->clock - rq->age_stamp >= sched_avg_period()) {
> + rq->irq_time /= 2;
> + rq->rt_time /= 2;
> + rq->age_stamp = rq->clock;
> + }
> +}
> +
> +/*
> + * Scale the SCHED_OTHER load on this rq up to compensate for the pressure
> + * of IRQ and RT usage of this CPU.
> + *
> + * See lib/proportions.c
> + */
> +static unsigned long sched_scale_load(struct rq *rq, u64 load)
> +{
> + u64 total = sched_avg_period() + (rq->clock - rq->age_stamp);
> + u64 available = total - rq->irq_time - rq->rt_time;
> +
> + /*
> + * Shift back to roughly us scale, so that the divisor fits in u32.
> + */
> + total >>= 10;
> + available >>= 10;
> +
> + if (unlikely((s64)available <= 0))
> + available = 1;
> +
> + load *= total;
> + load = div_u64(load, available);
> +
> + /*
> + * Clip the maximal load value to something plenty high.
> + */
> + return min_t(unsigned long, load, 1UL << 22);
> +}
> +#else
> +static inline void sched_age_time(struct rq *rq)
> +{
> +}
> +#endif
> +
> /*
> * resched_task - mark a task 'to be rescheduled now'.
> *
> @@ -1635,8 +1724,12 @@ static void dec_nr_running(struct rq *rq
> static void set_load_weight(struct task_struct *p)
> {
> if (task_has_rt_policy(p)) {
> - p->se.load.weight = prio_to_weight[0] * 2;
> - p->se.load.inv_weight = prio_to_wmult[0] >> 1;
> + /*
> + * Real-time tasks do not contribute to SCHED_OTHER load
> + * this is compensated by sched_scale_load() usage.
> + */
> + p->se.load.weight = 0;
> + p->se.load.inv_weight = 0;
> return;
> }
>
> @@ -2028,10 +2121,10 @@ static unsigned long source_load(int cpu
> struct rq *rq = cpu_rq(cpu);
> unsigned long total = weighted_cpuload(cpu);
>
> - if (type == 0 || !sched_feat(LB_BIAS))
> - return total;
> + if (type && sched_feat(LB_BIAS))
> + total = min(rq->cpu_load[type-1], total);
>
> - return min(rq->cpu_load[type-1], total);
> + return sched_scale_load(rq, total);
> }
>
> /*
> @@ -2043,10 +2136,10 @@ static unsigned long target_load(int cpu
> struct rq *rq = cpu_rq(cpu);
> unsigned long total = weighted_cpuload(cpu);
>
> - if (type == 0 || !sched_feat(LB_BIAS))
> - return total;
> + if (type && sched_feat(LB_BIAS))
> + total = max(rq->cpu_load[type-1], total);
>
> - return max(rq->cpu_load[type-1], total);
> + return sched_scale_load(rq, total);
> }
>
> /*
> @@ -2956,10 +3049,20 @@ balance_tasks(struct rq *this_rq, int th
> int loops = 0, pulled = 0, pinned = 0;
> struct task_struct *p;
> long rem_load_move = max_load_move;
> + unsigned long busy_weight, this_weight, weight_scale;
>
> if (max_load_move == 0)
> goto out;
>
> + /*
> + * Compute a weight scale to properly account for the varying
> + * load inflation between these CPUs.
> + */
> + busy_weight = sched_scale_load(busiest, NICE_0_LOAD);
> + this_weight = sched_scale_load(this_rq, NICE_0_LOAD);
> +
> + weight_scale = div_u64((u64)this_weight * NICE_0_LOAD, busy_weight);
> +
> pinned = 1;
>
> /*
> @@ -2978,7 +3081,7 @@ next:
>
> pull_task(busiest, p, this_rq, this_cpu);
> pulled++;
> - rem_load_move -= p->se.load.weight;
> + rem_load_move -= (weight_scale * p->se.load.weight) >> NICE_0_SHIFT;
>
> /*
> * We only want to steal up to the prescribed amount of weighted load.
> @@ -4211,6 +4314,7 @@ void scheduler_tick(void)
> spin_lock(&rq->lock);
> update_rq_clock(rq);
> update_cpu_load(rq);
> + sched_age_time(rq);
> curr->sched_class->task_tick(rq, curr, 0);
> spin_unlock(&rq->lock);
>
> Index: linux-2.6/kernel/sched_rt.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched_rt.c
> +++ linux-2.6/kernel/sched_rt.c
> @@ -478,6 +478,14 @@ static void update_curr_rt(struct rq *rq
> if (unlikely((s64)delta_exec < 0))
> delta_exec = 0;
>
> +#ifdef CONFIG_SMP
> + /*
> + * Account the time spend running RT tasks on this rq. Used to inflate
> + * this rq's load values.
> + */
> + rq->rt_time += delta_exec;
> +#endif
> +
> schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
>
> curr->se.sum_exec_runtime += delta_exec;
> Index: linux-2.6/kernel/softirq.c
> ===================================================================
> --- linux-2.6.orig/kernel/softirq.c
> +++ linux-2.6/kernel/softirq.c
> @@ -280,6 +280,7 @@ void irq_exit(void)
> account_system_vtime(current);
> trace_hardirq_exit();
> sub_preempt_count(IRQ_EXIT_OFFSET);
> + sched_irq_exit();
> if (!in_interrupt() && local_softirq_pending())
> invoke_softirq();
>
> Index: linux-2.6/kernel/sysctl.c
> ===================================================================
> --- linux-2.6.orig/kernel/sysctl.c
> +++ linux-2.6/kernel/sysctl.c
> @@ -309,6 +309,14 @@ static struct ctl_table kern_table[] = {
> .mode = 0644,
> .proc_handler = &proc_dointvec,
> },
> + {
> + .ctl_name = CTL_UNNUMBERED,
> + .procname = "sched_time_avg_ms",
> + .data = &sysctl_sched_time_avg,
> + .maxlen = sizeof(unsigned int),
> + .mode = 0644,
> + .proc_handler = &proc_dointvec,
> + },
> #endif
> {
> .ctl_name = CTL_UNNUMBERED,
> Index: linux-2.6/kernel/sched_debug.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched_debug.c
> +++ linux-2.6/kernel/sched_debug.c
> @@ -245,6 +245,8 @@ static void print_cpu(struct seq_file *m
> P(nr_running);
> SEQ_printf(m, " .%-30s: %lu\n", "load",
> rq->load.weight);
> + SEQ_printf(m, " .%-30s: %ld\n", "scaled_load",
> + sched_scale_load(rq, rq->load.weight));
> P(nr_switches);
> P(nr_load_updates);
> P(nr_uninterruptible);
>
>
>
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 257 bytes --]
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] sched: properly account IRQ and RT load in SCHED_OTHER load balancing
2008-08-21 11:22 ` Peter Zijlstra
@ 2008-08-21 11:40 ` Ingo Molnar
0 siblings, 0 replies; 11+ messages in thread
From: Ingo Molnar @ 2008-08-21 11:40 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Nick Piggin, Gregory Haskins, vatsa, linux-kernel
* Peter Zijlstra <peterz@infradead.org> wrote:
> On Thu, 2008-08-21 at 13:17 +0200, Ingo Molnar wrote:
> > * Ingo Molnar <mingo@elte.hu> wrote:
> >
> > > > This patch tries to solve this issue by accounting the time spend on
> > > > both Real-Time tasks and IRQ handling, and using that to
> > > > proportionally inflate the SCHED_OTHER load.
> > >
> > > applied it to tip/sched/devel for more testing.
> >
> > -tip testing found that it wouldnt build on UP in sched_debug.c - made
> > that bit dependent on CONFIG_SMP. sched_scale_load() does not seem to be
> > too meaningful on UP.
>
> Thanks - just prior to posting I removed the empty stub for
> sched_scale_load() because it didn't make sense for UP code to use
> it...
>
> So much for double checking myself.
hm, the empty stub might be useful still and would remove the #ifdef
from sched_debug.c. Could you please send a delta patch against
tip/master?
Ingo
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] sched: properly account IRQ and RT load in SCHED_OTHER load balancing
2008-08-21 11:36 ` Gregory Haskins
@ 2008-08-21 11:41 ` Ingo Molnar
2008-08-21 12:26 ` Gregory Haskins
0 siblings, 1 reply; 11+ messages in thread
From: Ingo Molnar @ 2008-08-21 11:41 UTC (permalink / raw)
To: Gregory Haskins; +Cc: Peter Zijlstra, Nick Piggin, vatsa, linux-kernel
* Gregory Haskins <ghaskins@novell.com> wrote:
> I haven't had a chance to review the code thoroughly yet, but I had
> been working on a similar fix and know that this is sorely needed.
> So...
btw., why exactly does this patch speed up certain workloads? I'm not
quite sure about the exact reasons of that.
Ingo
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] sched: properly account IRQ and RT load in SCHED_OTHER load balancing
2008-08-21 11:41 ` Ingo Molnar
@ 2008-08-21 12:26 ` Gregory Haskins
0 siblings, 0 replies; 11+ messages in thread
From: Gregory Haskins @ 2008-08-21 12:26 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Peter Zijlstra, Nick Piggin, vatsa, linux-kernel, D. Bahi
[-- Attachment #1: Type: text/plain, Size: 4241 bytes --]
Ingo Molnar wrote:
> * Gregory Haskins <ghaskins@novell.com> wrote:
>
>
>> I haven't had a chance to review the code thoroughly yet, but I had
>> been working on a similar fix and know that this is sorely needed.
>> So...
>>
>
> btw., why exactly does this patch speed up certain workloads? I'm not
> quite sure about the exact reasons of that.
>
> Ingo
>
I used to have a great demo for the prototype I was working on, but I'd
have to dig it up. The gist of it is that the pre-patched scheduler
basically gets thrown for a complete loop in the presence of a mixed
CFS/RT environment. This isn't a PREEMPT_RT specific problem per se,
though PREEMPT_RT does bring the problem to the forefront since it has
so many active RT tasks by default (for the IRQs, etc) which make it
more evident.
Since an RT task's previous usage of declaring "load" did not actually
express the true nature of the RQ load, CFS tasks would have a few
really nasty things happen to them while trying to run on the system
simultaneously. One of them was that you could starve out CFS tasks
from certain cores (even though there was plenty of CPU bandwidth
available elsewhere) and the load-balancer would think everything is
fine and thus fail to make adjustments.
Say you have a 4 core system. You could, for instance, get into a
situation where the softirq-net-rx thread was consuming 80% of core 0,
yet the load balancer would still spread, say, a 40 thread CFS load
evenly across all cores (approximately 10 per core, though you would
account for the "load" that the softirq thread contributed too). The
threads on the other cores would of course enjoy 100% bandwidth, while
the ~10 threads on core 0 would only see 1/5th of that bandwidth.
What it comes down to is that the CFS load should have been evenly
distributed across the available bandwidth of 3*100% + 1*20%, not 4*100%
as it does today. The net result is that the application performs in a
very lopsided manner, with some threads getting significantly less (or
sometimes zero!) cpu time compared to their peers. You can make this
more obvious by nice'ing the CFS load up as high as it will go, which
will approximate 1/2 of the load of the softirq (since RT tasks
previously enjoyed a 2*MAX_SCHED_OTHER_LOAD rating).
I have observed this phenomenon (and its fix) while looking at things
like network intensive workloads. I'm sure there are plenty of others
that could cause similar ripples.
The fact is, the scheduler treats "load" to mean certain things which
simply did not apply to RT tasks. As you know very well I'm sure ;),
"load" is a metric which expresses the share of the cpu that will be
consumed and this is used by the load balancer to make its decisions.
However, you can put whatever rating you want on an RT task and it would
always be irrelevant. RT tasks run as frequently and as long as they
want (w.r.t. SCHED_OTHER) independent of what their load rating implies
to the balancer, so you cannot make an accurate assessment of the true
"available shares". This is why the load-balancer would become confused
and fail to see true imbalance in a mixed environment. Fixing this, as
Peter has attempted to do, will result in a much better distribution of
SCHED_OTHER tasks across the true available bandwidth, and thus improve
overall performance.
In previous discussions with people, I had always used a metaphor of a
stream. A system running SCHED_OTHER tasks is like a smooth running
stream, but dispatching an RT task (or an IRQ, even) is like throwing a
boulder into the water. It makes a big disruptive splash and causes
turbulent white water behind it. And the stream has no influence over
the size of the boulder, its placement in the stream, nor how long it
will be staying.
This fix (at least in concept) allows it to become more like gently
slipping a streamlined aerodynamic object into the water. The stream
still cannot do anything about the size or placement of the object, but
it can at least flow around it and smoothly adapt to the reduced volume
of water that the stream can carry. :)
HTH
-Greg
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 257 bytes --]
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] sched: properly account IRQ and RT load in SCHED_OTHER load balancing
2008-08-21 9:18 [PATCH] sched: properly account IRQ and RT load in SCHED_OTHER load balancing Peter Zijlstra
2008-08-21 10:47 ` Ingo Molnar
2008-08-21 11:36 ` Gregory Haskins
@ 2008-08-21 12:43 ` Peter Zijlstra
2008-08-21 12:47 ` Gregory Haskins
2 siblings, 1 reply; 11+ messages in thread
From: Peter Zijlstra @ 2008-08-21 12:43 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Nick Piggin, Gregory Haskins, vatsa, linux-kernel
OK, how overboard is this? (utterly uncompiled and such)
I realized while trying to do the (soft)irq accounting Ingo asked for,
that IRQs can preempt SoftIRQs which can preempt RT tasks.
Therefore we actually need to account all these times, so that we can
subtract irq time from measured softirq time, etc.
So this patch does all that.. we could even use this more accurate time
spent on the task delta to drive the scheduler.
NOTE - for now I've only considered softirq from hardirq time, as
ksoftirqd is its own task and is already accounted the regular way.
---
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -572,9 +572,17 @@ struct rq {
struct task_struct *migration_thread;
struct list_head migration_queue;
- u64 irq_stamp;
- unsigned long irq_time;
- unsigned long rt_time;
+ u64 irq_clock_stamp;
+ u64 sirq_clock_stamp, sirq_irq_stamp;
+ u64 rt_sirq_stamp, rt_irq_stamp;
+
+ u64 irq_time;
+ u64 sirq_time;
+ u64 rt_time;
+
+ unsigned long irq_avg;
+ unsigned long sirq_avg;
+ unsigned long rt_avg;
u64 age_stamp;
#endif
@@ -1167,7 +1175,7 @@ void sched_irq_enter(void)
struct rq *rq = this_rq();
update_rq_clock(rq);
- rq->irq_stamp = rq->clock;
+ rq->irq_clock_stamp = rq->clock;
}
}
@@ -1175,12 +1183,58 @@ void sched_irq_exit(void)
{
if (!in_irq()) {
struct rq *rq = this_rq();
+ u64 irq_delta;
update_rq_clock(rq);
- rq->irq_time += rq->clock - rq->irq_stamp;
+ irq_delta = rq->clock - rq->irq_clock_stamp;
+ rq->irq_time += irq_delta;
+ rq->irq_avg += irq_delta;
}
}
+void sched_softirq_enter(void)
+{
+ struct rq *rq = this_rq();
+
+ update_rq_clock(rq);
+ rq->sirq_clock_stamp = rq->clock;
+ rq->sirq_irq_stamp = rq->irq_time;
+}
+
+void sched_softirq_exit(void)
+{
+ struct rq *rq = this_rq();
+ u64 sirq_delta, irq_delta;
+
+ update_rq_clock(rq);
+ sirq_delta = rq->clock - rq->sirq_clock_stamp;
+ irq_delta = rq->irq_time - rq->sirq_irq_stamp;
+ sirq_delta -= irq_delta;
+ rq->sirq_time += sirq_delta;
+ rq->sirq_avg += sirq_delta;
+}
+
+void sched_rt_start(struct rq *rq)
+{
+ rq->rt_sirq_stamp = rq->sirq_time;
+ rq->rt_irq_stamp = rq->irq_time;
+}
+
+void sched_rt_update(struct rq *rq, u64 rt_delta)
+{
+ u64 sirq_delta, irq_delta;
+
+ sirq_delta = rq->sirq_time - rq->rt_sirq_stamp;
+ irq_delta = rq->irq_time - rq->rt_irq_stamp;
+
+ rt_delta -= sirq_delta + irq_delta;
+
+ rq->rt_time += rt_delta;
+ rq->rt_avg += rt_delta;
+
+ sched_rt_start(rq);
+}
+
static inline u64 sched_avg_period(void)
{
return (u64)sysctl_sched_time_avg * (NSEC_PER_MSEC / 2);
@@ -1192,8 +1246,9 @@ static inline u64 sched_avg_period(void)
static void sched_age_time(struct rq *rq)
{
if (rq->clock - rq->age_stamp >= sched_avg_period()) {
- rq->irq_time /= 2;
- rq->rt_time /= 2;
+ rq->rt_avg /= 2;
+ rq->irq_avg /= 2;
+ rq->sirq_avg /= 2;
rq->age_stamp = rq->clock;
}
}
@@ -1207,7 +1262,7 @@ static void sched_age_time(struct rq *rq
static unsigned long sched_scale_load(struct rq *rq, u64 load)
{
u64 total = sched_avg_period() + (rq->clock - rq->age_stamp);
- u64 available = total - rq->irq_time - rq->rt_time;
+ u64 available = total - rq->sirq_avg - rq->irq_avg - rq->rt_avg;
/*
* Shift back to roughly us scale, so that the divisor fits in u32.
@@ -1227,9 +1282,22 @@ static unsigned long sched_scale_load(st
return min_t(unsigned long, load, 1UL << 22);
}
#else
+static inline void sched_rt_start(struct rq *rq)
+{
+}
+
+static inline void sched_rt_update(struct rq *rq, u64 delta)
+{
+}
+
static inline void sched_age_time(struct rq *rq)
{
}
+
+static inline unsigned long sched_scale_load(unsigned long load)
+{
+ return load;
+}
#endif
/*
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -478,13 +478,7 @@ static void update_curr_rt(struct rq *rq
if (unlikely((s64)delta_exec < 0))
delta_exec = 0;
-#ifdef CONFIG_SMP
- /*
- * Account the time spend running RT tasks on this rq. Used to inflate
- * this rq's load values.
- */
- rq->rt_time += delta_exec;
-#endif
+ sched_rt_update(rq, delta_exec);
schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
@@ -678,8 +672,6 @@ static void enqueue_task_rt(struct rq *r
rt_se->timeout = 0;
enqueue_rt_entity(rt_se);
-
- inc_cpu_load(rq, p->se.load.weight);
}
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -688,8 +680,6 @@ static void dequeue_task_rt(struct rq *r
update_curr_rt(rq);
dequeue_rt_entity(rt_se);
-
- dec_cpu_load(rq, p->se.load.weight);
}
/*
@@ -1458,6 +1448,7 @@ static void set_curr_task_rt(struct rq *
struct task_struct *p = rq->curr;
p->se.exec_start = rq->clock;
+ sched_rt_start(rq);
}
static const struct sched_class rt_sched_class = {
Index: linux-2.6/kernel/softirq.c
===================================================================
--- linux-2.6.orig/kernel/softirq.c
+++ linux-2.6/kernel/softirq.c
@@ -272,6 +272,14 @@ void irq_enter(void)
# define invoke_softirq() do_softirq()
#endif
+#ifdef CONFIG_SMP
+extern void sched_softirq_enter(void);
+extern void sched_softirq_exit(void);
+#else
+#define sched_softirq_enter() do { } while (0)
+#define sched_softirq_exit() do { } while (0)
+#endif
+
/*
* Exit an interrupt context. Process softirqs if needed and possible:
*/
@@ -281,8 +289,11 @@ void irq_exit(void)
trace_hardirq_exit();
sub_preempt_count(IRQ_EXIT_OFFSET);
sched_irq_exit();
- if (!in_interrupt() && local_softirq_pending())
+ if (!in_interrupt() && local_softirq_pending()) {
+ sched_softirq_enter();
invoke_softirq();
+ sched_softirq_exit();
+ }
#ifdef CONFIG_NO_HZ
/* Make sure that timer wheel updates are propagated */
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] sched: properly account IRQ and RT load in SCHED_OTHER load balancing
2008-08-21 12:43 ` Peter Zijlstra
@ 2008-08-21 12:47 ` Gregory Haskins
2008-08-21 12:56 ` Peter Zijlstra
0 siblings, 1 reply; 11+ messages in thread
From: Gregory Haskins @ 2008-08-21 12:47 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Ingo Molnar, Nick Piggin, vatsa, linux-kernel
[-- Attachment #1: Type: text/plain, Size: 7291 bytes --]
Peter Zijlstra wrote:
> OK, how overboard is this? (utterly uncompiled and such)
>
> I realized while trying to do the (soft)irq accounting Ingo asked for,
> that IRQs can preempt SoftIRQs which can preempt RT tasks.
>
> Therefore we actually need to account all these times, so that we can
> subtract irq time from measured softirq time, etc.
>
> So this patch does all that.. we could even use this more accurate time
> spent on the task delta to drive the scheduler.
>
> NOTE - for now I've only considered softirq from hardirq time, as
> ksoftirqd is its own task and is already accounted the regular way.
>
Actually, if you really want to get crazy, you could account for each RT
prio level as well ;)
e.g. RT98 tasks have to account for RT99 + softirqs + irqs, RT97 need to
look at RT98, 99, softirqs, irqs, etc.
I'm not suggesting we do this, per se. Just food for thought. It
would have the benefit of allowing us to make even better routing
decisions for RT tasks. E.g. if cores 2 and 6 both have the lowest
priority, we currently sort by sched-domain topology, but we could also
factor in the load that is "above" us.
BTW: this is probably not a bad idea even if its just to look at the
softirq/hardirq load. Perhaps I will draft up a patch.
-Greg
> ---
> Index: linux-2.6/kernel/sched.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched.c
> +++ linux-2.6/kernel/sched.c
> @@ -572,9 +572,17 @@ struct rq {
> struct task_struct *migration_thread;
> struct list_head migration_queue;
>
> - u64 irq_stamp;
> - unsigned long irq_time;
> - unsigned long rt_time;
> + u64 irq_clock_stamp;
> + u64 sirq_clock_stamp, sirq_irq_stamp;
> + u64 rt_sirq_stamp, rt_irq_stamp;
> +
> + u64 irq_time;
> + u64 sirq_time;
> + u64 rt_time;
> +
> + unsigned long irq_avg;
> + unsigned long sirq_avg;
> + unsigned long rt_avg;
> u64 age_stamp;
>
> #endif
> @@ -1167,7 +1175,7 @@ void sched_irq_enter(void)
> struct rq *rq = this_rq();
>
> update_rq_clock(rq);
> - rq->irq_stamp = rq->clock;
> + rq->irq_clock_stamp = rq->clock;
> }
> }
>
> @@ -1175,12 +1183,58 @@ void sched_irq_exit(void)
> {
> if (!in_irq()) {
> struct rq *rq = this_rq();
> + u64 irq_delta;
>
> update_rq_clock(rq);
> - rq->irq_time += rq->clock - rq->irq_stamp;
> + irq_delta = rq->clock - rq->irq_clock_stamp;
> + rq->irq_time += irq_delta;
> + rq->irq_avg += irq_delta;
> }
> }
>
> +void sched_softirq_enter(void)
> +{
> + struct rq *rq = this_rq();
> +
> + update_rq_clock(rq);
> + rq->sirq_clock_stamp = rq->clock;
> + rq->sirq_irq_stamp = rq->irq_time;
> +}
> +
> +void sched_softirq_exit(void)
> +{
> + struct rq *rq = this_rq();
> + u64 sirq_delta, irq_delta;
> +
> + update_rq_clock(rq);
> + sirq_delta = rq->clock - rq->sirq_clock_stamp;
> + irq_delta = rq->irq_time - rq->sirq_irq_stamp;
> + sirq_delta -= irq_delta;
> + rq->sirq_time += sirq_delta;
> + rq->sirq_avg += sirq_delta;
> +}
> +
> +void sched_rt_start(struct rq *rq)
> +{
> + rq->rt_sirq_stamp = rq->sirq_time;
> + rq->rt_irq_stamp = rq->irq_time;
> +}
> +
> +void sched_rt_update(struct rq *rq, u64 rt_delta)
> +{
> + u64 sirq_delta, irq_delta;
> +
> + sirq_delta = rq->sirq_time - rq->rt_sirq_stamp;
> + irq_delta = rq->irq_time - rq->rt_irq_stamp;
> +
> + rt_delta -= sirq_delta + irq_delta;
> +
> + rq->rt_time += rt_delta;
> + rq->rt_avg += rt_delta;
> +
> + sched_rt_start(rq);
> +}
> +
> static inline u64 sched_avg_period(void)
> {
> return (u64)sysctl_sched_time_avg * (NSEC_PER_MSEC / 2);
> @@ -1192,8 +1246,9 @@ static inline u64 sched_avg_period(void)
> static void sched_age_time(struct rq *rq)
> {
> if (rq->clock - rq->age_stamp >= sched_avg_period()) {
> - rq->irq_time /= 2;
> - rq->rt_time /= 2;
> + rq->rt_avg /= 2;
> + rq->irq_avg /= 2;
> + rq->sirq_avg /= 2;
> rq->age_stamp = rq->clock;
> }
> }
> @@ -1207,7 +1262,7 @@ static void sched_age_time(struct rq *rq
> static unsigned long sched_scale_load(struct rq *rq, u64 load)
> {
> u64 total = sched_avg_period() + (rq->clock - rq->age_stamp);
> - u64 available = total - rq->irq_time - rq->rt_time;
> + u64 available = total - rq->sirq_avg - rq->irq_avg - rq->rt_avg;
>
> /*
> * Shift back to roughly us scale, so that the divisor fits in u32.
> @@ -1227,9 +1282,22 @@ static unsigned long sched_scale_load(st
> return min_t(unsigned long, load, 1UL << 22);
> }
> #else
> +static inline void sched_rt_start(struct rq *rq)
> +{
> +}
> +
> +static inline void sched_rt_update(struct rq *rq, u64 delta)
> +{
> +}
> +
> static inline void sched_age_time(struct rq *rq)
> {
> }
> +
> +static inline unsigned long sched_scale_load(unsigned long load)
> +{
> + return load;
> +}
> #endif
>
> /*
> Index: linux-2.6/kernel/sched_rt.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched_rt.c
> +++ linux-2.6/kernel/sched_rt.c
> @@ -478,13 +478,7 @@ static void update_curr_rt(struct rq *rq
> if (unlikely((s64)delta_exec < 0))
> delta_exec = 0;
>
> -#ifdef CONFIG_SMP
> - /*
> - * Account the time spend running RT tasks on this rq. Used to inflate
> - * this rq's load values.
> - */
> - rq->rt_time += delta_exec;
> -#endif
> + sched_rt_update(rq, delta_exec);
>
> schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
>
> @@ -678,8 +672,6 @@ static void enqueue_task_rt(struct rq *r
> rt_se->timeout = 0;
>
> enqueue_rt_entity(rt_se);
> -
> - inc_cpu_load(rq, p->se.load.weight);
> }
>
> static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
> @@ -688,8 +680,6 @@ static void dequeue_task_rt(struct rq *r
>
> update_curr_rt(rq);
> dequeue_rt_entity(rt_se);
> -
> - dec_cpu_load(rq, p->se.load.weight);
> }
>
> /*
> @@ -1458,6 +1448,7 @@ static void set_curr_task_rt(struct rq *
> struct task_struct *p = rq->curr;
>
> p->se.exec_start = rq->clock;
> + sched_rt_start(rq);
> }
>
> static const struct sched_class rt_sched_class = {
> Index: linux-2.6/kernel/softirq.c
> ===================================================================
> --- linux-2.6.orig/kernel/softirq.c
> +++ linux-2.6/kernel/softirq.c
> @@ -272,6 +272,14 @@ void irq_enter(void)
> # define invoke_softirq() do_softirq()
> #endif
>
> +#ifdef CONFIG_SMP
> +extern void sched_softirq_enter(void);
> +extern void sched_softirq_exit(void);
> +#else
> +#define sched_softirq_enter() do { } while (0)
> +#define sched_softirq_exit() do { } while (0)
> +#endif
> +
> /*
> * Exit an interrupt context. Process softirqs if needed and possible:
> */
> @@ -281,8 +289,11 @@ void irq_exit(void)
> trace_hardirq_exit();
> sub_preempt_count(IRQ_EXIT_OFFSET);
> sched_irq_exit();
> - if (!in_interrupt() && local_softirq_pending())
> + if (!in_interrupt() && local_softirq_pending()) {
> + sched_softirq_enter();
> invoke_softirq();
> + sched_softirq_exit();
> + }
>
> #ifdef CONFIG_NO_HZ
> /* Make sure that timer wheel updates are propagated */
>
>
>
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 257 bytes --]
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] sched: properly account IRQ and RT load in SCHED_OTHER load balancing
2008-08-21 12:47 ` Gregory Haskins
@ 2008-08-21 12:56 ` Peter Zijlstra
0 siblings, 0 replies; 11+ messages in thread
From: Peter Zijlstra @ 2008-08-21 12:56 UTC (permalink / raw)
To: Gregory Haskins; +Cc: Ingo Molnar, Nick Piggin, vatsa, linux-kernel
On Thu, 2008-08-21 at 08:47 -0400, Gregory Haskins wrote:
> Peter Zijlstra wrote:
> > OK, how overboard is this? (utterly uncompiled and such)
> >
> > I realized while trying to do the (soft)irq accounting Ingo asked for,
> > that IRQs can preempt SoftIRQs which can preempt RT tasks.
> >
> > Therefore we actually need to account all these times, so that we can
> > subtract irq time from measured softirq time, etc.
> >
> > So this patch does all that.. we could even use this more accurate time
> > spent on the task delta to drive the scheduler.
> >
> > NOTE - for now I've only considered softirq from hardirq time, as
> > ksoftirqd is its own task and is already accounted the regular way.
> >
>
> Actually, if you really want to get crazy, you could account for each RT
> prio level as well ;)
>
> e.g. RT98 tasks have to account for RT99 + softirqs + irqs, RT97 need to
> look at RT98, 99, softirqs, irqs, etc.
>
> I'm not suggesting we do this, per se. Just food for thought. It
> would have the benefit of allowing us to make even better routing
> decisions for RT tasks. E.g. if cores 2 and 6 both have the lowest
> priority, we currently sort by sched-domain topology, but we could also
> factor in the load that is "above" us.
I'll let you be that crazy ;-) It'd be a 3-rd order placement decision,
I doubt that's going to make a large difference.
^ permalink raw reply [flat|nested] 11+ messages in thread
end of thread, other threads:[~2008-08-21 12:57 UTC | newest]
Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-08-21 9:18 [PATCH] sched: properly account IRQ and RT load in SCHED_OTHER load balancing Peter Zijlstra
2008-08-21 10:47 ` Ingo Molnar
2008-08-21 11:17 ` Ingo Molnar
2008-08-21 11:22 ` Peter Zijlstra
2008-08-21 11:40 ` Ingo Molnar
2008-08-21 11:36 ` Gregory Haskins
2008-08-21 11:41 ` Ingo Molnar
2008-08-21 12:26 ` Gregory Haskins
2008-08-21 12:43 ` Peter Zijlstra
2008-08-21 12:47 ` Gregory Haskins
2008-08-21 12:56 ` Peter Zijlstra
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox