public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] sched: properly account IRQ and RT load in SCHED_OTHER load balancing
@ 2008-08-21  9:18 Peter Zijlstra
  2008-08-21 10:47 ` Ingo Molnar
                   ` (2 more replies)
  0 siblings, 3 replies; 11+ messages in thread
From: Peter Zijlstra @ 2008-08-21  9:18 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Nick Piggin, Gregory Haskins, vatsa, linux-kernel

Subject: sched: properly account IRQ and RT load in SCHED_OTHER load balancing
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu Aug 14 09:31:20 CEST 2008

We used to account for RT tasks in SCHED_OTHER load-balancing by giving
them some phantom weight.

This is incorrect because there is no saying how much time an RT task will
actually consume. Also, it doesn't take IRQ time into account.

This patch tries to solve this issue by accounting the time spent on both
Real-Time tasks and IRQ handling, and using that to proportionally inflate
the SCHED_OTHER load.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/hardirq.h |   10 +++
 include/linux/sched.h   |    1 
 kernel/sched.c          |  126 +++++++++++++++++++++++++++++++++++++++++++-----
 kernel/sched_debug.c    |    2 
 kernel/sched_rt.c       |    8 +++
 kernel/softirq.c        |    1 
 kernel/sysctl.c         |    8 +++
 7 files changed, 145 insertions(+), 11 deletions(-)

Index: linux-2.6/include/linux/hardirq.h
===================================================================
--- linux-2.6.orig/include/linux/hardirq.h
+++ linux-2.6/include/linux/hardirq.h
@@ -127,6 +127,14 @@ static inline void account_system_vtime(
 }
 #endif
 
+#ifdef CONFIG_SMP
+extern void sched_irq_enter(void);
+extern void sched_irq_exit(void);
+#else
+# define sched_irq_enter() do { } while (0)
+# define sched_irq_exit() do { } while (0)
+#endif
+
 #if defined(CONFIG_PREEMPT_RCU) && defined(CONFIG_NO_HZ)
 extern void rcu_irq_enter(void);
 extern void rcu_irq_exit(void);
@@ -143,6 +151,7 @@ extern void rcu_irq_exit(void);
  */
 #define __irq_enter()					\
 	do {						\
+		sched_irq_enter();			\
 		rcu_irq_enter();			\
 		account_system_vtime(current);		\
 		add_preempt_count(HARDIRQ_OFFSET);	\
@@ -163,6 +172,7 @@ extern void irq_enter(void);
 		account_system_vtime(current);		\
 		sub_preempt_count(HARDIRQ_OFFSET);	\
 		rcu_irq_exit();				\
+		sched_irq_exit();			\
 	} while (0)
 
 /*
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -1614,6 +1614,7 @@ extern unsigned int sysctl_sched_feature
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
 extern unsigned int sysctl_sched_shares_ratelimit;
+extern unsigned int sysctl_sched_time_avg;
 
 int sched_nr_latency_handler(struct ctl_table *table, int write,
 		struct file *file, void __user *buffer, size_t *length,
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -571,6 +571,12 @@ struct rq {
 
 	struct task_struct *migration_thread;
 	struct list_head migration_queue;
+
+	u64 irq_stamp;
+	unsigned long irq_time;
+	unsigned long rt_time;
+	u64 age_stamp;
+
 #endif
 
 #ifdef CONFIG_SCHED_HRTICK
@@ -816,14 +822,21 @@ const_debug unsigned int sysctl_sched_nr
 unsigned int sysctl_sched_shares_ratelimit = 250000;
 
 /*
- * period over which we measure -rt task cpu usage in us.
+ * period over which we average the IRQ and RT cpu consumption, measured in
+ * jiffies.
  * default: 1s
  */
-unsigned int sysctl_sched_rt_period = 1000000;
+const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
 
 static __read_mostly int scheduler_running;
 
 /*
+ * period over which we measure -rt task cpu usage in us.
+ * default: 1s
+ */
+unsigned int sysctl_sched_rt_period = 1000000;
+
+/*
  * part of the period that we allow rt tasks to run in us.
  * default: 9.5s
  */
@@ -1143,6 +1156,82 @@ static inline void init_hrtick(void)
 }
 #endif
 
+#ifdef CONFIG_SMP
+/*
+ * Measure IRQ time, we start when we first enter IRQ state
+ * and stop when we last leave IRQ state (nested IRQs).
+ */
+void sched_irq_enter(void)
+{
+	if (!in_irq()) {
+		struct rq *rq = this_rq();
+
+		update_rq_clock(rq);
+		rq->irq_stamp = rq->clock;
+	}
+}
+
+void sched_irq_exit(void)
+{
+	if (!in_irq()) {
+		struct rq *rq = this_rq();
+
+		update_rq_clock(rq);
+		rq->irq_time += rq->clock - rq->irq_stamp;
+	}
+}
+
+static inline u64 sched_avg_period(void)
+{
+	return (u64)sysctl_sched_time_avg * (NSEC_PER_MSEC / 2);
+}
+
+/*
+ * Every period/2 we half the accumulated time. See lib/proportions.c
+ */
+static void sched_age_time(struct rq *rq)
+{
+	if (rq->clock - rq->age_stamp >= sched_avg_period()) {
+		rq->irq_time /= 2;
+		rq->rt_time /= 2;
+		rq->age_stamp = rq->clock;
+	}
+}
+
+/*
+ * Scale the SCHED_OTHER load on this rq up to compensate for the pressure
+ * of IRQ and RT usage of this CPU.
+ *
+ * See lib/proportions.c
+ */
+static unsigned long sched_scale_load(struct rq *rq, u64 load)
+{
+	u64 total = sched_avg_period() + (rq->clock - rq->age_stamp);
+	u64 available = total - rq->irq_time - rq->rt_time;
+
+	/*
+	 * Shift back to roughly us scale, so that the divisor fits in u32.
+	 */
+	total >>= 10;
+	available >>= 10;
+
+	if (unlikely((s64)available <= 0))
+		available = 1;
+
+	load *= total;
+	load = div_u64(load, available);
+
+	/*
+	 * Clip the maximal load value to something plenty high.
+	 */
+	return min_t(unsigned long, load, 1UL << 22);
+}
+#else
+static inline void sched_age_time(struct rq *rq)
+{
+}
+#endif
+
 /*
  * resched_task - mark a task 'to be rescheduled now'.
  *
@@ -1635,8 +1724,12 @@ static void dec_nr_running(struct rq *rq
 static void set_load_weight(struct task_struct *p)
 {
 	if (task_has_rt_policy(p)) {
-		p->se.load.weight = prio_to_weight[0] * 2;
-		p->se.load.inv_weight = prio_to_wmult[0] >> 1;
+		/*
+		 * Real-time tasks do not contribute to SCHED_OTHER load
+		 * this is compensated by sched_scale_load() usage.
+		 */
+		p->se.load.weight = 0;
+		p->se.load.inv_weight = 0;
 		return;
 	}
 
@@ -2028,10 +2121,10 @@ static unsigned long source_load(int cpu
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long total = weighted_cpuload(cpu);
 
-	if (type == 0 || !sched_feat(LB_BIAS))
-		return total;
+	if (type && sched_feat(LB_BIAS))
+		total = min(rq->cpu_load[type-1], total);
 
-	return min(rq->cpu_load[type-1], total);
+	return sched_scale_load(rq, total);
 }
 
 /*
@@ -2043,10 +2136,10 @@ static unsigned long target_load(int cpu
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long total = weighted_cpuload(cpu);
 
-	if (type == 0 || !sched_feat(LB_BIAS))
-		return total;
+	if (type && sched_feat(LB_BIAS))
+		total = max(rq->cpu_load[type-1], total);
 
-	return max(rq->cpu_load[type-1], total);
+	return sched_scale_load(rq, total);
 }
 
 /*
@@ -2956,10 +3049,20 @@ balance_tasks(struct rq *this_rq, int th
 	int loops = 0, pulled = 0, pinned = 0;
 	struct task_struct *p;
 	long rem_load_move = max_load_move;
+	unsigned long busy_weight, this_weight, weight_scale;
 
 	if (max_load_move == 0)
 		goto out;
 
+	/*
+	 * Compute a weight scale to properly account for the varying
+	 * load inflation between these CPUs.
+	 */
+	busy_weight = sched_scale_load(busiest, NICE_0_LOAD);
+	this_weight = sched_scale_load(this_rq, NICE_0_LOAD);
+
+	weight_scale = div_u64((u64)this_weight * NICE_0_LOAD, busy_weight);
+
 	pinned = 1;
 
 	/*
@@ -2978,7 +3081,7 @@ next:
 
 	pull_task(busiest, p, this_rq, this_cpu);
 	pulled++;
-	rem_load_move -= p->se.load.weight;
+	rem_load_move -= (weight_scale * p->se.load.weight) >> NICE_0_SHIFT;
 
 	/*
 	 * We only want to steal up to the prescribed amount of weighted load.
@@ -4211,6 +4314,7 @@ void scheduler_tick(void)
 	spin_lock(&rq->lock);
 	update_rq_clock(rq);
 	update_cpu_load(rq);
+	sched_age_time(rq);
 	curr->sched_class->task_tick(rq, curr, 0);
 	spin_unlock(&rq->lock);
 
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -478,6 +478,14 @@ static void update_curr_rt(struct rq *rq
 	if (unlikely((s64)delta_exec < 0))
 		delta_exec = 0;
 
+#ifdef CONFIG_SMP
+	/*
+	 * Account the time spend running RT tasks on this rq. Used to inflate
+	 * this rq's load values.
+	 */
+	rq->rt_time += delta_exec;
+#endif
+
 	schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
 
 	curr->se.sum_exec_runtime += delta_exec;
Index: linux-2.6/kernel/softirq.c
===================================================================
--- linux-2.6.orig/kernel/softirq.c
+++ linux-2.6/kernel/softirq.c
@@ -280,6 +280,7 @@ void irq_exit(void)
 	account_system_vtime(current);
 	trace_hardirq_exit();
 	sub_preempt_count(IRQ_EXIT_OFFSET);
+	sched_irq_exit();
 	if (!in_interrupt() && local_softirq_pending())
 		invoke_softirq();
 
Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -309,6 +309,14 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_time_avg_ms",
+		.data		= &sysctl_sched_time_avg,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 #endif
 	{
 		.ctl_name	= CTL_UNNUMBERED,
Index: linux-2.6/kernel/sched_debug.c
===================================================================
--- linux-2.6.orig/kernel/sched_debug.c
+++ linux-2.6/kernel/sched_debug.c
@@ -245,6 +245,8 @@ static void print_cpu(struct seq_file *m
 	P(nr_running);
 	SEQ_printf(m, "  .%-30s: %lu\n", "load",
 		   rq->load.weight);
+	SEQ_printf(m, "  .%-30s: %ld\n", "scaled_load",
+			sched_scale_load(rq, rq->load.weight));
 	P(nr_switches);
 	P(nr_load_updates);
 	P(nr_uninterruptible);



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] sched: properly account IRQ and RT load in SCHED_OTHER load balancing
  2008-08-21  9:18 [PATCH] sched: properly account IRQ and RT load in SCHED_OTHER load balancing Peter Zijlstra
@ 2008-08-21 10:47 ` Ingo Molnar
  2008-08-21 11:17   ` Ingo Molnar
  2008-08-21 11:36 ` Gregory Haskins
  2008-08-21 12:43 ` Peter Zijlstra
  2 siblings, 1 reply; 11+ messages in thread
From: Ingo Molnar @ 2008-08-21 10:47 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Nick Piggin, Gregory Haskins, vatsa, linux-kernel


* Peter Zijlstra <peterz@infradead.org> wrote:

> Subject: sched: properly account IRQ and RT load in SCHED_OTHER load balancing
> From: Peter Zijlstra <a.p.zijlstra@chello.nl>
> Date: Thu Aug 14 09:31:20 CEST 2008
> 
> We used to account for RT tasks in SCHED_OTHER load-balancing by giving
> them some phantom weight.
> 
> This is incorrect because there is no saying how much time a RT task 
> will actually consume. Also, it doesn't take IRQ time into account.
> 
> This patch tries to solve this issue by accounting the time spend on 
> both Real-Time tasks and IRQ handling, and using that to 
> proportionally inflate the SCHED_OTHER load.

applied it to tip/sched/devel for more testing.

this bit:

> +void sched_irq_enter(void)
> +{
> +	if (!in_irq()) {
> +		struct rq *rq = this_rq();
> +
> +		update_rq_clock(rq);
> +		rq->irq_stamp = rq->clock;
> +	}

if we do this we might as well use the opportunity to do accurate IRQ 
(and softirq) CPU time accounting. I.e. right now it only drives 
balancing, but isnt fed into the wider IRQ/softirq /proc usage stats.

	Ingo

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] sched: properly account IRQ and RT load in SCHED_OTHER load balancing
  2008-08-21 10:47 ` Ingo Molnar
@ 2008-08-21 11:17   ` Ingo Molnar
  2008-08-21 11:22     ` Peter Zijlstra
  0 siblings, 1 reply; 11+ messages in thread
From: Ingo Molnar @ 2008-08-21 11:17 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Nick Piggin, Gregory Haskins, vatsa, linux-kernel


* Ingo Molnar <mingo@elte.hu> wrote:

> > This patch tries to solve this issue by accounting the time spend on 
> > both Real-Time tasks and IRQ handling, and using that to 
> > proportionally inflate the SCHED_OTHER load.
> 
> applied it to tip/sched/devel for more testing.

-tip testing found that it wouldnt build on UP in sched_debug.c - made 
that bit dependent on CONFIG_SMP. sched_scale_load() does not seem to be 
too meaningful on UP.

	Ingo

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] sched: properly account IRQ and RT load in SCHED_OTHER load balancing
  2008-08-21 11:17   ` Ingo Molnar
@ 2008-08-21 11:22     ` Peter Zijlstra
  2008-08-21 11:40       ` Ingo Molnar
  0 siblings, 1 reply; 11+ messages in thread
From: Peter Zijlstra @ 2008-08-21 11:22 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Nick Piggin, Gregory Haskins, vatsa, linux-kernel

On Thu, 2008-08-21 at 13:17 +0200, Ingo Molnar wrote:
> * Ingo Molnar <mingo@elte.hu> wrote:
> 
> > > This patch tries to solve this issue by accounting the time spend on 
> > > both Real-Time tasks and IRQ handling, and using that to 
> > > proportionally inflate the SCHED_OTHER load.
> > 
> > applied it to tip/sched/devel for more testing.
> 
> -tip testing found that it wouldnt build on UP in sched_debug.c - made 
> that bit dependent on CONFIG_SMP. sched_scale_load() does not seem to be 
> too meaningful on UP.

Thanks - just prior to posting I removed the empty stub for
sched_scale_load() because it didn't make sense for UP code to use it...

So much for double checking myself.


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] sched: properly account IRQ and RT load in SCHED_OTHER load balancing
  2008-08-21  9:18 [PATCH] sched: properly account IRQ and RT load in SCHED_OTHER load balancing Peter Zijlstra
  2008-08-21 10:47 ` Ingo Molnar
@ 2008-08-21 11:36 ` Gregory Haskins
  2008-08-21 11:41   ` Ingo Molnar
  2008-08-21 12:43 ` Peter Zijlstra
  2 siblings, 1 reply; 11+ messages in thread
From: Gregory Haskins @ 2008-08-21 11:36 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Ingo Molnar, Nick Piggin, vatsa, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 10874 bytes --]

Peter Zijlstra wrote:
> Subject: sched: properly account IRQ and RT load in SCHED_OTHER load balancing
> From: Peter Zijlstra <a.p.zijlstra@chello.nl>
> Date: Thu Aug 14 09:31:20 CEST 2008
>
> We used to account for RT tasks in SCHED_OTHER load-balancing by giving
> them some phantom weight.
>
> This is incorrect because there is no saying how much time a RT task will
> actually consume. Also, it doesn't take IRQ time into account.
>
> This patch tries to solve this issue by accounting the time spend on both
> Real-Time tasks and IRQ handling, and using that to proportionally inflate
> the SCHED_OTHER load.
>
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
>   

I haven't had a chance to review the code thoroughly yet, but I had been 
working on a similar fix and know that this is sorely needed.  So...

Acked-by: Gregory Haskins <ghaskins@novell.com>

> ---
>  include/linux/hardirq.h |   10 +++
>  include/linux/sched.h   |    1 
>  kernel/sched.c          |  126 +++++++++++++++++++++++++++++++++++++++++++-----
>  kernel/sched_debug.c    |    2 
>  kernel/sched_rt.c       |    8 +++
>  kernel/softirq.c        |    1 
>  kernel/sysctl.c         |    8 +++
>  7 files changed, 145 insertions(+), 11 deletions(-)
>
> Index: linux-2.6/include/linux/hardirq.h
> ===================================================================
> --- linux-2.6.orig/include/linux/hardirq.h
> +++ linux-2.6/include/linux/hardirq.h
> @@ -127,6 +127,14 @@ static inline void account_system_vtime(
>  }
>  #endif
>  
> +#ifdef CONFIG_SMP
> +extern void sched_irq_enter(void);
> +extern void sched_irq_exit(void);
> +#else
> +# define sched_irq_enter() do { } while (0)
> +# define sched_irq_exit() do { } while (0)
> +#endif
> +
>  #if defined(CONFIG_PREEMPT_RCU) && defined(CONFIG_NO_HZ)
>  extern void rcu_irq_enter(void);
>  extern void rcu_irq_exit(void);
> @@ -143,6 +151,7 @@ extern void rcu_irq_exit(void);
>   */
>  #define __irq_enter()					\
>  	do {						\
> +		sched_irq_enter();			\
>  		rcu_irq_enter();			\
>  		account_system_vtime(current);		\
>  		add_preempt_count(HARDIRQ_OFFSET);	\
> @@ -163,6 +172,7 @@ extern void irq_enter(void);
>  		account_system_vtime(current);		\
>  		sub_preempt_count(HARDIRQ_OFFSET);	\
>  		rcu_irq_exit();				\
> +		sched_irq_exit();			\
>  	} while (0)
>  
>  /*
> Index: linux-2.6/include/linux/sched.h
> ===================================================================
> --- linux-2.6.orig/include/linux/sched.h
> +++ linux-2.6/include/linux/sched.h
> @@ -1614,6 +1614,7 @@ extern unsigned int sysctl_sched_feature
>  extern unsigned int sysctl_sched_migration_cost;
>  extern unsigned int sysctl_sched_nr_migrate;
>  extern unsigned int sysctl_sched_shares_ratelimit;
> +extern unsigned int sysctl_sched_time_avg;
>  
>  int sched_nr_latency_handler(struct ctl_table *table, int write,
>  		struct file *file, void __user *buffer, size_t *length,
> Index: linux-2.6/kernel/sched.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched.c
> +++ linux-2.6/kernel/sched.c
> @@ -571,6 +571,12 @@ struct rq {
>  
>  	struct task_struct *migration_thread;
>  	struct list_head migration_queue;
> +
> +	u64 irq_stamp;
> +	unsigned long irq_time;
> +	unsigned long rt_time;
> +	u64 age_stamp;
> +
>  #endif
>  
>  #ifdef CONFIG_SCHED_HRTICK
> @@ -816,14 +822,21 @@ const_debug unsigned int sysctl_sched_nr
>  unsigned int sysctl_sched_shares_ratelimit = 250000;
>  
>  /*
> - * period over which we measure -rt task cpu usage in us.
> + * period over which we average the IRQ and RT cpu consumption, measured in
> + * jiffies.
>   * default: 1s
>   */
> -unsigned int sysctl_sched_rt_period = 1000000;
> +const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
>  
>  static __read_mostly int scheduler_running;
>  
>  /*
> + * period over which we measure -rt task cpu usage in us.
> + * default: 1s
> + */
> +unsigned int sysctl_sched_rt_period = 1000000;
> +
> +/*
>   * part of the period that we allow rt tasks to run in us.
>   * default: 9.5s
>   */
> @@ -1143,6 +1156,82 @@ static inline void init_hrtick(void)
>  }
>  #endif
>  
> +#ifdef CONFIG_SMP
> +/*
> + * Measure IRQ time, we start when we first enter IRQ state
> + * and stop when we last leave IRQ state (nested IRQs).
> + */
> +void sched_irq_enter(void)
> +{
> +	if (!in_irq()) {
> +		struct rq *rq = this_rq();
> +
> +		update_rq_clock(rq);
> +		rq->irq_stamp = rq->clock;
> +	}
> +}
> +
> +void sched_irq_exit(void)
> +{
> +	if (!in_irq()) {
> +		struct rq *rq = this_rq();
> +
> +		update_rq_clock(rq);
> +		rq->irq_time += rq->clock - rq->irq_stamp;
> +	}
> +}
> +
> +static inline u64 sched_avg_period(void)
> +{
> +	return (u64)sysctl_sched_time_avg * (NSEC_PER_MSEC / 2);
> +}
> +
> +/*
> + * Every period/2 we half the accumulated time. See lib/proportions.c
> + */
> +static void sched_age_time(struct rq *rq)
> +{
> +	if (rq->clock - rq->age_stamp >= sched_avg_period()) {
> +		rq->irq_time /= 2;
> +		rq->rt_time /= 2;
> +		rq->age_stamp = rq->clock;
> +	}
> +}
> +
> +/*
> + * Scale the SCHED_OTHER load on this rq up to compensate for the pressure
> + * of IRQ and RT usage of this CPU.
> + *
> + * See lib/proportions.c
> + */
> +static unsigned long sched_scale_load(struct rq *rq, u64 load)
> +{
> +	u64 total = sched_avg_period() + (rq->clock - rq->age_stamp);
> +	u64 available = total - rq->irq_time - rq->rt_time;
> +
> +	/*
> +	 * Shift back to roughly us scale, so that the divisor fits in u32.
> +	 */
> +	total >>= 10;
> +	available >>= 10;
> +
> +	if (unlikely((s64)available <= 0))
> +		available = 1;
> +
> +	load *= total;
> +	load = div_u64(load, available);
> +
> +	/*
> +	 * Clip the maximal load value to something plenty high.
> +	 */
> +	return min_t(unsigned long, load, 1UL << 22);
> +}
> +#else
> +static inline void sched_age_time(struct rq *rq)
> +{
> +}
> +#endif
> +
>  /*
>   * resched_task - mark a task 'to be rescheduled now'.
>   *
> @@ -1635,8 +1724,12 @@ static void dec_nr_running(struct rq *rq
>  static void set_load_weight(struct task_struct *p)
>  {
>  	if (task_has_rt_policy(p)) {
> -		p->se.load.weight = prio_to_weight[0] * 2;
> -		p->se.load.inv_weight = prio_to_wmult[0] >> 1;
> +		/*
> +		 * Real-time tasks do not contribute to SCHED_OTHER load
> +		 * this is compensated by sched_scale_load() usage.
> +		 */
> +		p->se.load.weight = 0;
> +		p->se.load.inv_weight = 0;
>  		return;
>  	}
>  
> @@ -2028,10 +2121,10 @@ static unsigned long source_load(int cpu
>  	struct rq *rq = cpu_rq(cpu);
>  	unsigned long total = weighted_cpuload(cpu);
>  
> -	if (type == 0 || !sched_feat(LB_BIAS))
> -		return total;
> +	if (type && sched_feat(LB_BIAS))
> +		total = min(rq->cpu_load[type-1], total);
>  
> -	return min(rq->cpu_load[type-1], total);
> +	return sched_scale_load(rq, total);
>  }
>  
>  /*
> @@ -2043,10 +2136,10 @@ static unsigned long target_load(int cpu
>  	struct rq *rq = cpu_rq(cpu);
>  	unsigned long total = weighted_cpuload(cpu);
>  
> -	if (type == 0 || !sched_feat(LB_BIAS))
> -		return total;
> +	if (type && sched_feat(LB_BIAS))
> +		total = max(rq->cpu_load[type-1], total);
>  
> -	return max(rq->cpu_load[type-1], total);
> +	return sched_scale_load(rq, total);
>  }
>  
>  /*
> @@ -2956,10 +3049,20 @@ balance_tasks(struct rq *this_rq, int th
>  	int loops = 0, pulled = 0, pinned = 0;
>  	struct task_struct *p;
>  	long rem_load_move = max_load_move;
> +	unsigned long busy_weight, this_weight, weight_scale;
>  
>  	if (max_load_move == 0)
>  		goto out;
>  
> +	/*
> +	 * Compute a weight scale to properly account for the varying
> +	 * load inflation between these CPUs.
> +	 */
> +	busy_weight = sched_scale_load(busiest, NICE_0_LOAD);
> +	this_weight = sched_scale_load(this_rq, NICE_0_LOAD);
> +
> +	weight_scale = div_u64((u64)this_weight * NICE_0_LOAD, busy_weight);
> +
>  	pinned = 1;
>  
>  	/*
> @@ -2978,7 +3081,7 @@ next:
>  
>  	pull_task(busiest, p, this_rq, this_cpu);
>  	pulled++;
> -	rem_load_move -= p->se.load.weight;
> +	rem_load_move -= (weight_scale * p->se.load.weight) >> NICE_0_SHIFT;
>  
>  	/*
>  	 * We only want to steal up to the prescribed amount of weighted load.
> @@ -4211,6 +4314,7 @@ void scheduler_tick(void)
>  	spin_lock(&rq->lock);
>  	update_rq_clock(rq);
>  	update_cpu_load(rq);
> +	sched_age_time(rq);
>  	curr->sched_class->task_tick(rq, curr, 0);
>  	spin_unlock(&rq->lock);
>  
> Index: linux-2.6/kernel/sched_rt.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched_rt.c
> +++ linux-2.6/kernel/sched_rt.c
> @@ -478,6 +478,14 @@ static void update_curr_rt(struct rq *rq
>  	if (unlikely((s64)delta_exec < 0))
>  		delta_exec = 0;
>  
> +#ifdef CONFIG_SMP
> +	/*
> +	 * Account the time spend running RT tasks on this rq. Used to inflate
> +	 * this rq's load values.
> +	 */
> +	rq->rt_time += delta_exec;
> +#endif
> +
>  	schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
>  
>  	curr->se.sum_exec_runtime += delta_exec;
> Index: linux-2.6/kernel/softirq.c
> ===================================================================
> --- linux-2.6.orig/kernel/softirq.c
> +++ linux-2.6/kernel/softirq.c
> @@ -280,6 +280,7 @@ void irq_exit(void)
>  	account_system_vtime(current);
>  	trace_hardirq_exit();
>  	sub_preempt_count(IRQ_EXIT_OFFSET);
> +	sched_irq_exit();
>  	if (!in_interrupt() && local_softirq_pending())
>  		invoke_softirq();
>  
> Index: linux-2.6/kernel/sysctl.c
> ===================================================================
> --- linux-2.6.orig/kernel/sysctl.c
> +++ linux-2.6/kernel/sysctl.c
> @@ -309,6 +309,14 @@ static struct ctl_table kern_table[] = {
>  		.mode		= 0644,
>  		.proc_handler	= &proc_dointvec,
>  	},
> +	{
> +		.ctl_name	= CTL_UNNUMBERED,
> +		.procname	= "sched_time_avg_ms",
> +		.data		= &sysctl_sched_time_avg,
> +		.maxlen		= sizeof(unsigned int),
> +		.mode		= 0644,
> +		.proc_handler	= &proc_dointvec,
> +	},
>  #endif
>  	{
>  		.ctl_name	= CTL_UNNUMBERED,
> Index: linux-2.6/kernel/sched_debug.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched_debug.c
> +++ linux-2.6/kernel/sched_debug.c
> @@ -245,6 +245,8 @@ static void print_cpu(struct seq_file *m
>  	P(nr_running);
>  	SEQ_printf(m, "  .%-30s: %lu\n", "load",
>  		   rq->load.weight);
> +	SEQ_printf(m, "  .%-30s: %ld\n", "scaled_load",
> +			sched_scale_load(rq, rq->load.weight));
>  	P(nr_switches);
>  	P(nr_load_updates);
>  	P(nr_uninterruptible);
>
>
>   



[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 257 bytes --]

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] sched: properly account IRQ and RT load in SCHED_OTHER load balancing
  2008-08-21 11:22     ` Peter Zijlstra
@ 2008-08-21 11:40       ` Ingo Molnar
  0 siblings, 0 replies; 11+ messages in thread
From: Ingo Molnar @ 2008-08-21 11:40 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Nick Piggin, Gregory Haskins, vatsa, linux-kernel


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Thu, 2008-08-21 at 13:17 +0200, Ingo Molnar wrote:
> > * Ingo Molnar <mingo@elte.hu> wrote:
> > 
> > > > This patch tries to solve this issue by accounting the time spend on 
> > > > both Real-Time tasks and IRQ handling, and using that to 
> > > > proportionally inflate the SCHED_OTHER load.
> > > 
> > > applied it to tip/sched/devel for more testing.
> > 
> > -tip testing found that it wouldnt build on UP in sched_debug.c - made 
> > that bit dependent on CONFIG_SMP. sched_scale_load() does not seem to be 
> > too meaningful on UP.
> 
> Thanks - just prior to posting I removed the empty stub for 
> sched_scale_load() because it didn't make sense for UP code to use 
> it...
> 
> So much for double checking myself.

hm, the empty stub might be useful still and would remove the #ifdef 
from sched_debug.c. Could you please send a delta patch against 
tip/master?

	Ingo

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] sched: properly account IRQ and RT load in SCHED_OTHER load balancing
  2008-08-21 11:36 ` Gregory Haskins
@ 2008-08-21 11:41   ` Ingo Molnar
  2008-08-21 12:26     ` Gregory Haskins
  0 siblings, 1 reply; 11+ messages in thread
From: Ingo Molnar @ 2008-08-21 11:41 UTC (permalink / raw)
  To: Gregory Haskins; +Cc: Peter Zijlstra, Nick Piggin, vatsa, linux-kernel


* Gregory Haskins <ghaskins@novell.com> wrote:

> I haven't had a chance to review the code thoroughly yet, but I had 
> been working on a similar fix and know that this is sorely needed.  
> So...

btw., why exactly does this patch speed up certain workloads? I'm not 
quite sure about the exact reasons of that.

	Ingo

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] sched: properly account IRQ and RT load in SCHED_OTHER load balancing
  2008-08-21 11:41   ` Ingo Molnar
@ 2008-08-21 12:26     ` Gregory Haskins
  0 siblings, 0 replies; 11+ messages in thread
From: Gregory Haskins @ 2008-08-21 12:26 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Peter Zijlstra, Nick Piggin, vatsa, linux-kernel, D. Bahi

[-- Attachment #1: Type: text/plain, Size: 4241 bytes --]

Ingo Molnar wrote:
> * Gregory Haskins <ghaskins@novell.com> wrote:
>
>   
>> I haven't had a chance to review the code thoroughly yet, but I had 
>> been working on a similar fix and know that this is sorely needed.  
>> So...
>>     
>
> btw., why exactly does this patch speed up certain workloads? I'm not 
> quite sure about the exact reasons of that.
>
> 	Ingo
>   

I used to have a great demo for the prototype I was working on, but id 
have to dig it up.  The gist of it is that the pre-patched scheduler 
basically gets thrown for a completely loop in the presence of a mixed 
CFS/RT environment.  This isn't a PREEMPT_RT specific problem per se, 
though PREEMPT_RT does bring the problem to the forefront since it has 
so many active RT tasks by default (for the IRQs, etc) which make it 
more evident.

Since an RT task's previous usage of declaring "load" did not actually 
express the true nature of the RQ load, CFS tasks would have a few 
really nasty things happen to them while trying to run on the system 
simultaneously.  One of them was that you could starve out CFS tasks 
from certain cores (even though there was plenty of CPU bandwidth 
available elsewhere) and the load-balancer would think everything is 
fine and thus fail to make adjustments.

Say you have a 4 core system.  You could, for instance, get into a 
situation where the softirq-net-rx thread was consuming 80% of core 0, 
yet the load balancer would still spread, say, a 40 thread CFS load 
evenly across all cores (approximately 10 per core, though you would 
account for the "load" that the softirq thread contributed too).  The 
threads on the other cores would of course enjoy 100% bandwidth, while 
the ~10 threads on core 0 would only see 1/5th of that bandwidth.

What it comes down to is that the CFS load should have been evenly 
distributed across the available bandwidth of 3*100% + 1*20%, not 4*100% 
as it does today.  The net result is that the application performs in a 
very lopsided manner, with some threads getting significantly less (or 
sometimes zero!) cpu time compared to their peers.  You can make this 
more obvious by nice'ing the CFS load up as high as it will go, which 
will approximate 1/2 of the load of the softirq (since RT tasks 
previously enjoyed a 2*MAX_SCHED_OTHER_LOAD rating).

I have observed this phenomenon (and its fix) while looking at things 
like network intensive workloads.  I'm sure there are plenty of others 
that could cause similar ripples.

The fact is, the scheduler treats "load" to mean certain things which 
simply did not apply to RT tasks.  As you know very well im sure ;), 
"load" is a metric which expresses the share of the cpu that will be 
consumed and this is used by the load balancer to make its decisions.  
However, you can put whatever rating you want on an RT task and it would 
always be irrelevant.  RT tasks run as frequently and as long as they 
want (w.r.t. SCHED_OTHER) independent of what their load rating implies 
to the balancer, so you cannot make an accurate assessment of the true 
"available shares".  This is why the load-balancer would become confused 
and fail to see true imbalance in a mixed environment.  Fixing this, as 
Peter has attempted to do, will result in a much better distribution of 
SCHED_OTHER tasks across the true available bandwidth, and thus improve 
overall performance.

In previous discussions with people, I had always used a metaphor of a 
stream.  A system running SCHED_OTHER tasks is like a smooth running 
stream, but  dispatching an RT task (or an IRQ, even) is like throwing a 
boulder into the water.  It makes a big disruptive splash and causes 
turbulent white water behind it.  And the stream has no influence over 
the size of the boulder, its placement in the stream, nor how long it 
will be staying.

This fix (at least in concept) allows it to become more like gently 
slipping a streamlined aerodynamic object into the water.  The stream 
still cannot do anything about the size or placement of the object, but 
it can at least flow around it and smoothly adapt to the reduced volume 
of water that the stream can carry. :)

HTH
-Greg


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 257 bytes --]

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] sched: properly account IRQ and RT load in SCHED_OTHER load balancing
  2008-08-21  9:18 [PATCH] sched: properly account IRQ and RT load in SCHED_OTHER load balancing Peter Zijlstra
  2008-08-21 10:47 ` Ingo Molnar
  2008-08-21 11:36 ` Gregory Haskins
@ 2008-08-21 12:43 ` Peter Zijlstra
  2008-08-21 12:47   ` Gregory Haskins
  2 siblings, 1 reply; 11+ messages in thread
From: Peter Zijlstra @ 2008-08-21 12:43 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Nick Piggin, Gregory Haskins, vatsa, linux-kernel

OK, how overboard is this? (utterly uncompiled and such)

I realized while trying to do the (soft)irq accounting Ingo asked for,
that IRQs can preempt SoftIRQs which can preempt RT tasks.

Therefore we actually need to account all these times, so that we can
subtract irq time from measured softirq time, etc.

So this patch does all that.. we could even use this more accurate time
spent on the task delta to drive the scheduler.

NOTE - for now I've only considered softirq from hardirq time, as
ksoftirqd is its own task and is already accounted the regular way.

---
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -572,9 +572,17 @@ struct rq {
 	struct task_struct *migration_thread;
 	struct list_head migration_queue;
 
-	u64 irq_stamp;
-	unsigned long irq_time;
-	unsigned long rt_time;
+	u64 irq_clock_stamp;
+	u64 sirq_clock_stamp, sirq_irq_stamp;
+	u64 rt_sirq_stamp, rt_irq_stamp;
+
+	u64 irq_time;
+	u64 sirq_time;
+	u64 rt_time;
+
+	unsigned long irq_avg;
+	unsigned long sirq_avg;
+	unsigned long rt_avg;
 	u64 age_stamp;
 
 #endif
@@ -1167,7 +1175,7 @@ void sched_irq_enter(void)
 		struct rq *rq = this_rq();
 
 		update_rq_clock(rq);
-		rq->irq_stamp = rq->clock;
+		rq->irq_clock_stamp = rq->clock;
 	}
 }
 
@@ -1175,12 +1183,58 @@ void sched_irq_exit(void)
 {
 	if (!in_irq()) {
 		struct rq *rq = this_rq();
+		u64 irq_delta;
 
 		update_rq_clock(rq);
-		rq->irq_time += rq->clock - rq->irq_stamp;
+		irq_delta = rq->clock - rq->irq_clock_stamp;
+		rq->irq_time += irq_delta;
+		rq->irq_avg += irq_delta;
 	}
 }
 
+void sched_softirq_enter(void)
+{
+	struct rq *rq = this_rq();
+
+	update_rq_clock(rq);
+	rq->sirq_clock_stamp = rq->clock;
+	rq->sirq_irq_stamp = rq->irq_time;
+}
+
+void sched_softirq_exit(void)
+{
+	struct rq *rq = this_rq();
+	u64 sirq_delta, irq_delta;
+
+	update_rq_clock(rq);
+	sirq_delta = rq->clock - rq->sirq_clock_stamp;
+	irq_delta = rq->irq_time - rq->sirq_irq_stamp;
+	sirq_delta -= irq_delta;
+	rq->sirq_time += sirq_delta;
+	rq->sirq_avg += sirq_delta;
+}
+
+void sched_rt_start(struct rq *rq)
+{
+	rq->rt_sirq_stamp = rq->sirq_time;
+	rq->rt_irq_stamp = rq->irq_time;
+}
+
+void sched_rt_update(struct rq *rq, u64 rt_delta)
+{
+	u64 sirq_delta, irq_delta;
+
+	sirq_delta = rq->sirq_time - rq->rt_sirq_stamp;
+	irq_delta = rq->irq_time - rq->rt_irq_stamp;
+
+	rt_delta -= sirq_delta + irq_delta;
+
+	rq->rt_time += rt_delta;
+	rq->rt_avg += rt_delta;
+
+	sched_rt_start(rq);
+}
+
 static inline u64 sched_avg_period(void)
 {
 	return (u64)sysctl_sched_time_avg * (NSEC_PER_MSEC / 2);
@@ -1192,8 +1246,9 @@ static inline u64 sched_avg_period(void)
 static void sched_age_time(struct rq *rq)
 {
 	if (rq->clock - rq->age_stamp >= sched_avg_period()) {
-		rq->irq_time /= 2;
-		rq->rt_time /= 2;
+		rq->rt_avg /= 2;
+		rq->irq_avg /= 2;
+		rq->sirq_avg /= 2;
 		rq->age_stamp = rq->clock;
 	}
 }
@@ -1207,7 +1262,7 @@ static void sched_age_time(struct rq *rq
 static unsigned long sched_scale_load(struct rq *rq, u64 load)
 {
 	u64 total = sched_avg_period() + (rq->clock - rq->age_stamp);
-	u64 available = total - rq->irq_time - rq->rt_time;
+	u64 available = total - rq->sirq_avg - rq->irq_avg - rq->rt_avg;
 
 	/*
 	 * Shift back to roughly us scale, so that the divisor fits in u32.
@@ -1227,9 +1282,22 @@ static unsigned long sched_scale_load(st
 	return min_t(unsigned long, load, 1UL << 22);
 }
 #else
+static inline void sched_rt_start(struct rq *rq)
+{
+}
+
+static inline void sched_rt_update(struct rq *rq, u64 delta)
+{
+}
+
 static inline void sched_age_time(struct rq *rq)
 {
 }
+
+static inline unsigned long sched_scale_load(struct rq *rq, u64 load)
+{
+	return load;
+}
 #endif
 
 /*
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -478,13 +478,7 @@ static void update_curr_rt(struct rq *rq
 	if (unlikely((s64)delta_exec < 0))
 		delta_exec = 0;
 
-#ifdef CONFIG_SMP
-	/*
-	 * Account the time spend running RT tasks on this rq. Used to inflate
-	 * this rq's load values.
-	 */
-	rq->rt_time += delta_exec;
-#endif
+	sched_rt_update(rq, delta_exec);
 
 	schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
 
@@ -678,8 +672,6 @@ static void enqueue_task_rt(struct rq *r
 		rt_se->timeout = 0;
 
 	enqueue_rt_entity(rt_se);
-
-	inc_cpu_load(rq, p->se.load.weight);
 }
 
 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -688,8 +680,6 @@ static void dequeue_task_rt(struct rq *r
 
 	update_curr_rt(rq);
 	dequeue_rt_entity(rt_se);
-
-	dec_cpu_load(rq, p->se.load.weight);
 }
 
 /*
@@ -1458,6 +1448,7 @@ static void set_curr_task_rt(struct rq *
 	struct task_struct *p = rq->curr;
 
 	p->se.exec_start = rq->clock;
+	sched_rt_start(rq);
 }
 
 static const struct sched_class rt_sched_class = {
Index: linux-2.6/kernel/softirq.c
===================================================================
--- linux-2.6.orig/kernel/softirq.c
+++ linux-2.6/kernel/softirq.c
@@ -272,6 +272,14 @@ void irq_enter(void)
 # define invoke_softirq()	do_softirq()
 #endif
 
+#ifdef CONFIG_SMP
+extern void sched_softirq_enter(void);
+extern void sched_softirq_exit(void);
+#else
+#define sched_softirq_enter() do { } while (0)
+#define sched_softirq_exit()  do { } while (0)
+#endif
+
 /*
  * Exit an interrupt context. Process softirqs if needed and possible:
  */
@@ -281,8 +289,11 @@ void irq_exit(void)
 	trace_hardirq_exit();
 	sub_preempt_count(IRQ_EXIT_OFFSET);
 	sched_irq_exit();
-	if (!in_interrupt() && local_softirq_pending())
+	if (!in_interrupt() && local_softirq_pending()) {
+		sched_softirq_enter();
 		invoke_softirq();
+		sched_softirq_exit();
+	}
 
 #ifdef CONFIG_NO_HZ
 	/* Make sure that timer wheel updates are propagated */



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] sched: properly account IRQ and RT load in SCHED_OTHER load balancing
  2008-08-21 12:43 ` Peter Zijlstra
@ 2008-08-21 12:47   ` Gregory Haskins
  2008-08-21 12:56     ` Peter Zijlstra
  0 siblings, 1 reply; 11+ messages in thread
From: Gregory Haskins @ 2008-08-21 12:47 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Ingo Molnar, Nick Piggin, vatsa, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 7291 bytes --]

Peter Zijlstra wrote:
> OK, how overboard is this? (utterly uncompiled and such)
>
> I realized while trying to do the (soft)irq accounting Ingo asked for,
> that IRQs can preempt SoftIRQs which can preempt RT tasks.
>
> Therefore we actually need to account all these times, so that we can
> subtract irq time from measured softirq time, etc.
>
> So this patch does all that.. we could even use this more accurate time
> spend on the task delta to drive the scheduler.
>
> NOTE - for now I've only considered softirq from hardirq time, as
> ksoftirqd is its own task and is already accounted the regular way.
>   

Actually, if you really want to get crazy, you could account for each RT 
prio level as well ;)

e.g. RT98 tasks have to account for RT99 + softirqs + irqs, RT97 tasks need to 
look at RT98, 99, softirqs, irqs, etc.

I'm not suggesting we do this, per se.   Just food for thought.  It 
would have the benefit of allowing us to make even better routing 
decisions for RT tasks.  E.g. if cores 2 and 6 both have the lowest 
priority, we currently sort by sched-domain topology, but we could also 
factor in the load that is "above" us.

BTW: this is probably not a bad idea even if its just to look at the 
softirq/hardirq load.  Perhaps I will draft up a patch.

-Greg

> ---
> Index: linux-2.6/kernel/sched.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched.c
> +++ linux-2.6/kernel/sched.c
> @@ -572,9 +572,17 @@ struct rq {
>  	struct task_struct *migration_thread;
>  	struct list_head migration_queue;
>  
> -	u64 irq_stamp;
> -	unsigned long irq_time;
> -	unsigned long rt_time;
> +	u64 irq_clock_stamp;
> +	u64 sirq_clock_stamp, sirq_irq_stamp;
> +	u64 rt_sirq_stamp, rt_irq_stamp;
> +
> +	u64 irq_time;
> +	u64 sirq_time;
> +	u64 rt_time;
> +
> +	unsigned long irq_avg;
> +	unsigned long sirq_avg;
> +	unsigned long rt_avg;
>  	u64 age_stamp;
>  
>  #endif
> @@ -1167,7 +1175,7 @@ void sched_irq_enter(void)
>  		struct rq *rq = this_rq();
>  
>  		update_rq_clock(rq);
> -		rq->irq_stamp = rq->clock;
> +		rq->irq_clock_stamp = rq->clock;
>  	}
>  }
>  
> @@ -1175,12 +1183,58 @@ void sched_irq_exit(void)
>  {
>  	if (!in_irq()) {
>  		struct rq *rq = this_rq();
> +		u64 irq_delta;
>  
>  		update_rq_clock(rq);
> -		rq->irq_time += rq->clock - rq->irq_stamp;
> +		irq_delta = rq->clock - rq->irq_clock_stamp;
> +		rq->irq_time += irq_delta;
> +		rq->irq_avg += irq_delta;
>  	}
>  }
>  
> +void sched_softirq_enter(void)
> +{
> +	struct rq *rq = this_rq();
> +
> +	update_rq_clock(rq);
> +	rq->sirq_clock_stamp = rq->clock;
> +	rq->sirq_irq_stamp = rq->irq_time;
> +}
> +
> +void sched_softirq_exit(void)
> +{
> +	struct rq *rq = this_rq();
> +	u64 sirq_delta, irq_delta;
> +
> +	update_rq_clock(rq);
> +	sirq_delta = rq->clock - rq->sirq_clock_stamp;
> +	irq_delta = rq->irq_time - rq->sirq_irq_stamp;
> +	sirq_delta -= irq_delta;
> +	rq->sirq_time += sirq_delta;
> +	rq->sirq_avg += sirq_delta;
> +}
> +
> +void sched_rt_start(struct rq *rq)
> +{
> +	rq->rt_sirq_stamp = rq->sirq_time;
> +	rq->rt_irq_stamp = rq->irq_time;
> +}
> +
> +void sched_rt_update(struct rq *rq, u64 rt_delta)
> +{
> +	u64 sirq_delta, irq_delta;
> +
> +	sirq_delta = rq->sirq_time - rq->rt_sirq_stamp;
> +	irq_delta = rq->irq_time - rq->rt_irq_stamp;
> +
> +	rt_delta -= sirq_delta + irq_delta;
> +
> +	rq->rt_time += rt_delta;
> +	rq->rt_avg += rt_delta;
> +
> +	sched_rt_start(rq);
> +}
> +
>  static inline u64 sched_avg_period(void)
>  {
>  	return (u64)sysctl_sched_time_avg * (NSEC_PER_MSEC / 2);
> @@ -1192,8 +1246,9 @@ static inline u64 sched_avg_period(void)
>  static void sched_age_time(struct rq *rq)
>  {
>  	if (rq->clock - rq->age_stamp >= sched_avg_period()) {
> -		rq->irq_time /= 2;
> -		rq->rt_time /= 2;
> +		rq->rt_avg /= 2;
> +		rq->irq_avg /= 2;
> +		rq->sirq_avg /= 2;
>  		rq->age_stamp = rq->clock;
>  	}
>  }
> @@ -1207,7 +1262,7 @@ static void sched_age_time(struct rq *rq
>  static unsigned long sched_scale_load(struct rq *rq, u64 load)
>  {
>  	u64 total = sched_avg_period() + (rq->clock - rq->age_stamp);
> -	u64 available = total - rq->irq_time - rq->rt_time;
> +	u64 available = total - rq->sirq_avg - rq->irq_avg - rq->rt_avg;
>  
>  	/*
>  	 * Shift back to roughly us scale, so that the divisor fits in u32.
> @@ -1227,9 +1282,22 @@ static unsigned long sched_scale_load(st
>  	return min_t(unsigned long, load, 1UL << 22);
>  }
>  #else
> +static inline void sched_rt_start(struct rq *rq)
> +{
> +}
> +
> +static inline void sched_rt_update(struct rq *rq, u64 delta)
> +{
> +}
> +
>  static inline void sched_age_time(struct rq *rq)
>  {
>  }
> +
> +static inline unsigned long sched_scale_load(struct rq *rq, u64 load)
> +{
> +	return load;
> +}
>  #endif
>  
>  /*
> Index: linux-2.6/kernel/sched_rt.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched_rt.c
> +++ linux-2.6/kernel/sched_rt.c
> @@ -478,13 +478,7 @@ static void update_curr_rt(struct rq *rq
>  	if (unlikely((s64)delta_exec < 0))
>  		delta_exec = 0;
>  
> -#ifdef CONFIG_SMP
> -	/*
> -	 * Account the time spend running RT tasks on this rq. Used to inflate
> -	 * this rq's load values.
> -	 */
> -	rq->rt_time += delta_exec;
> -#endif
> +	sched_rt_update(rq, delta_exec);
>  
>  	schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
>  
> @@ -678,8 +672,6 @@ static void enqueue_task_rt(struct rq *r
>  		rt_se->timeout = 0;
>  
>  	enqueue_rt_entity(rt_se);
> -
> -	inc_cpu_load(rq, p->se.load.weight);
>  }
>  
>  static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
> @@ -688,8 +680,6 @@ static void dequeue_task_rt(struct rq *r
>  
>  	update_curr_rt(rq);
>  	dequeue_rt_entity(rt_se);
> -
> -	dec_cpu_load(rq, p->se.load.weight);
>  }
>  
>  /*
> @@ -1458,6 +1448,7 @@ static void set_curr_task_rt(struct rq *
>  	struct task_struct *p = rq->curr;
>  
>  	p->se.exec_start = rq->clock;
> +	sched_rt_start(rq);
>  }
>  
>  static const struct sched_class rt_sched_class = {
> Index: linux-2.6/kernel/softirq.c
> ===================================================================
> --- linux-2.6.orig/kernel/softirq.c
> +++ linux-2.6/kernel/softirq.c
> @@ -272,6 +272,14 @@ void irq_enter(void)
>  # define invoke_softirq()	do_softirq()
>  #endif
>  
> +#ifdef CONFIG_SMP
> +extern void sched_softirq_enter(void);
> +extern void sched_softirq_exit(void);
> +#else
> +#define sched_softirq_enter() do { } while (0)
> +#define sched_softirq_exit()  do { } while (0)
> +#endif
> +
>  /*
>   * Exit an interrupt context. Process softirqs if needed and possible:
>   */
> @@ -281,8 +289,11 @@ void irq_exit(void)
>  	trace_hardirq_exit();
>  	sub_preempt_count(IRQ_EXIT_OFFSET);
>  	sched_irq_exit();
> -	if (!in_interrupt() && local_softirq_pending())
> +	if (!in_interrupt() && local_softirq_pending()) {
> +		sched_softirq_enter();
>  		invoke_softirq();
> +		sched_softirq_exit();
> +	}
>  
>  #ifdef CONFIG_NO_HZ
>  	/* Make sure that timer wheel updates are propagated */
>
>
>   



[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 257 bytes --]

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] sched: properly account IRQ and RT load in SCHED_OTHER load balancing
  2008-08-21 12:47   ` Gregory Haskins
@ 2008-08-21 12:56     ` Peter Zijlstra
  0 siblings, 0 replies; 11+ messages in thread
From: Peter Zijlstra @ 2008-08-21 12:56 UTC (permalink / raw)
  To: Gregory Haskins; +Cc: Ingo Molnar, Nick Piggin, vatsa, linux-kernel

On Thu, 2008-08-21 at 08:47 -0400, Gregory Haskins wrote:
> Peter Zijlstra wrote:
> > OK, how overboard is this? (utterly uncompiled and such)
> >
> > I realized while trying to do the (soft)irq accounting Ingo asked for,
> > that IRQs can preempt SoftIRQs which can preempt RT tasks.
> >
> > Therefore we actually need to account all these times, so that we can
> > subtract irq time from measured softirq time, etc.
> >
> > So this patch does all that.. we could even use this more accurate time
> > spend on the task delta to drive the scheduler.
> >
> > NOTE - for now I've only considered softirq from hardirq time, as
> > ksoftirqd is its own task and is already accounted the regular way.
> >   
> 
> Actually, if you really want to get crazy, you could account for each RT 
> prio level as well ;)
> 
> e.g. RT98 tasks have to account for RT99 + softirqs + irqs, RT97 need to 
> look at RT98, 99, softirqs, irqs, etc.
> 
> I'm not suggesting we do this, per se.   Just food for thought.  It 
> would have the benefit of allowing us to make even better routing 
> decisions for RT tasks.  E.g. if cores 2 and 6 both have the lowest 
> priority, we currently sort by sched-domain topology, but we could also 
> factor in the load that is "above" us.

I'll let you be that crazy ;-) It'd be a 3-rd order placement decision,
I doubt that's going to make a large difference.


^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2008-08-21 12:57 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-08-21  9:18 [PATCH] sched: properly account IRQ and RT load in SCHED_OTHER load balancing Peter Zijlstra
2008-08-21 10:47 ` Ingo Molnar
2008-08-21 11:17   ` Ingo Molnar
2008-08-21 11:22     ` Peter Zijlstra
2008-08-21 11:40       ` Ingo Molnar
2008-08-21 11:36 ` Gregory Haskins
2008-08-21 11:41   ` Ingo Molnar
2008-08-21 12:26     ` Gregory Haskins
2008-08-21 12:43 ` Peter Zijlstra
2008-08-21 12:47   ` Gregory Haskins
2008-08-21 12:56     ` Peter Zijlstra

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox