sysbench+mysql(oltp, readonly) 30% regression with 2.6.26-rc1

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* sysbench+mysql(oltp, readonly) 30% regression with 2.6.26-rc1
@ 2008-05-07  4:55 Zhang, Yanmin
  2008-05-07  9:16 ` Ingo Molnar
  2008-05-07 16:26 ` Peter Zijlstra
  0 siblings, 2 replies; 27+ messages in thread
From: Zhang, Yanmin @ 2008-05-07  4:55 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: LKML, Ingo Molnar

Compared with kernel 2.6.25, sysbench+mysql(oltp, readonly) shows a large
regression with 2.6.26-rc1.

1) 8-core stoakley: 28%;
2) 16-core tigerton: 20%;
3) Itanium Montvale: 50%.

Bisect located below patch.

8f1bc385cfbab474db6c27b5af1e439614f3025c is first bad commit
commit 8f1bc385cfbab474db6c27b5af1e439614f3025c
Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date:   Sat Apr 19 19:45:00 2008 +0200

    sched: fair: weight calculations
    
    In order to level the hierarchy, we need to calculate load based on the
    root view. That is, each task's load is in the same unit.



After I manually reverted the patch against 2.6.26-rc1, fixing a couple of
conflicts/errors along the way, the sysbench oltp regression dropped to less
than 3% on 8-core stoakley.

-yanmin



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: sysbench+mysql(oltp, readonly) 30% regression with 2.6.26-rc1
  2008-05-07  4:55 sysbench+mysql(oltp, readonly) 30% regression with 2.6.26-rc1 Zhang, Yanmin
@ 2008-05-07  9:16 ` Ingo Molnar
  2008-05-07  9:33   ` Zhang, Yanmin
  2008-05-07 16:26 ` Peter Zijlstra
  1 sibling, 1 reply; 27+ messages in thread
From: Ingo Molnar @ 2008-05-07  9:16 UTC (permalink / raw)
  To: Zhang, Yanmin; +Cc: Peter Zijlstra, LKML


* Zhang, Yanmin <yanmin_zhang@linux.intel.com> wrote:

> Comparing with kernel 2.6.25, sysbench+mysql(oltp, readonly) has many
> regression with 2.6.26-rc1.
> 
> 1) 8-core stoakley: 28%;
> 2) 16-core tigerton: 20%;
> 3) Itanium Montvale: 50%.
> 
> Bisect located below patch.

thanks Yanmin, i've queued up your reverter patch.

	Ingo

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: sysbench+mysql(oltp, readonly) 30% regression with 2.6.26-rc1
  2008-05-07  9:16 ` Ingo Molnar
@ 2008-05-07  9:33   ` Zhang, Yanmin
  2008-05-07 10:40     ` Ingo Molnar
  0 siblings, 1 reply; 27+ messages in thread
From: Zhang, Yanmin @ 2008-05-07  9:33 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Peter Zijlstra, LKML

[-- Attachment #1: Type: text/plain, Size: 514 bytes --]


On Wed, 2008-05-07 at 11:16 +0200, Ingo Molnar wrote:
> * Zhang, Yanmin <yanmin_zhang@linux.intel.com> wrote:
> 
> > Comparing with kernel 2.6.25, sysbench+mysql(oltp, readonly) has many
> > regression with 2.6.26-rc1.
> > 
> > 1) 8-core stoakley: 28%;
> > 2) 16-core tigerton: 20%;
> > 3) Itanium Montvale: 50%.
> > 
> > Bisect located below patch.
> 
> thanks Yanmin, i've queued up your reverter patch.
Sorry. The reverting patch has a commented-out block. I need to delete it if you queue the
patch officially.

[-- Attachment #2: revert_fair_weight_calculation.patch --]
[-- Type: text/x-patch, Size: 5216 bytes --]

diff -Nraup linux-2.6.26-rc1/kernel/sched.c linux-2.6.26-rc1_oltp/kernel/sched.c
--- linux-2.6.26-rc1/kernel/sched.c	2008-05-06 06:27:56.000000000 +0800
+++ linux-2.6.26-rc1_oltp/kernel/sched.c	2008-05-07 03:57:39.000000000 +0800
@@ -1429,9 +1429,6 @@ static void __resched_task(struct task_s
  */
 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
 
-/*
- * delta *= weight / lw
- */
 static unsigned long
 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 		struct load_weight *lw)
@@ -1454,6 +1451,12 @@ calc_delta_mine(unsigned long delta_exec
 	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
 }
 
+static inline unsigned long
+calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
+{
+	return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
+}
+
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 {
 	lw->weight += inc;
diff -Nraup linux-2.6.26-rc1/kernel/sched_fair.c linux-2.6.26-rc1_oltp/kernel/sched_fair.c
--- linux-2.6.26-rc1/kernel/sched_fair.c	2008-05-06 06:27:56.000000000 +0800
+++ linux-2.6.26-rc1_oltp/kernel/sched_fair.c	2008-05-07 10:28:25.000000000 +0800
@@ -334,34 +334,6 @@ int sched_nr_latency_handler(struct ctl_
 #endif
 
 /*
- * delta *= w / rw
- */
-static inline unsigned long
-calc_delta_weight(unsigned long delta, struct sched_entity *se)
-{
-	for_each_sched_entity(se) {
-		delta = calc_delta_mine(delta,
-				se->load.weight, &cfs_rq_of(se)->load);
-	}
-
-	return delta;
-}
-
-/*
- * delta *= rw / w
- */
-static inline unsigned long
-calc_delta_fair(unsigned long delta, struct sched_entity *se)
-{
-	for_each_sched_entity(se) {
-		delta = calc_delta_mine(delta,
-				cfs_rq_of(se)->load.weight, &se->load);
-	}
-
-	return delta;
-}
-
-/*
  * The idea is to set a period in which each task runs once.
  *
  * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
@@ -390,54 +362,47 @@ static u64 __sched_period(unsigned long 
  */
 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
+	u64 slice = __sched_period(cfs_rq->nr_running);
+
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+
+		slice *= se->load.weight;
+		do_div(slice, cfs_rq->load.weight);
+	}
+
+
+	return slice;
 }
 
 /*
  * We calculate the vruntime slice of a to be inserted task
  *
- * vs = s*rw/w = p
+ * vs = s/w = p/rw
  */
 static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	unsigned long nr_running = cfs_rq->nr_running;
+	unsigned long weight;
+	u64 vslice;
 
 	if (!se->on_rq)
 		nr_running++;
 
-	return __sched_period(nr_running);
-}
-
-/*
- * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
- * that it favours >=0 over <0.
- *
- *   -20         |
- *               |
- *     0 --------+-------
- *             .'
- *    19     .'
- *
- */
-static unsigned long
-calc_delta_asym(unsigned long delta, struct sched_entity *se)
-{
-	struct load_weight lw = {
-		.weight = NICE_0_LOAD,
-		.inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
-	};
+	vslice = __sched_period(nr_running);
 
 	for_each_sched_entity(se) {
-		struct load_weight *se_lw = &se->load;
+		cfs_rq = cfs_rq_of(se);
 
-		if (se->load.weight < NICE_0_LOAD)
-			se_lw = &lw;
+		weight = cfs_rq->load.weight;
+		if (!se->on_rq)
+			weight += se->load.weight;
 
-		delta = calc_delta_mine(delta,
-				cfs_rq_of(se)->load.weight, se_lw);
+		vslice *= NICE_0_LOAD;
+		do_div(vslice, weight);
 	}
 
-	return delta;
+	return vslice;
 }
 
 /*
@@ -454,7 +419,11 @@ __update_curr(struct cfs_rq *cfs_rq, str
 
 	curr->sum_exec_runtime += delta_exec;
 	schedstat_add(cfs_rq, exec_clock, delta_exec);
-	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
+	delta_exec_weighted = delta_exec;
+	if (unlikely(curr->load.weight != NICE_0_LOAD)) {
+		delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
+							&curr->load);
+	}
 	curr->vruntime += delta_exec_weighted;
 }
 
@@ -663,7 +632,8 @@ place_entity(struct cfs_rq *cfs_rq, stru
 		/* sleeps upto a single latency don't count. */
 		if (sched_feat(NEW_FAIR_SLEEPERS)) {
 			if (sched_feat(NORMALIZED_SLEEPER))
-				vruntime -= calc_delta_weight(sysctl_sched_latency, se);
+				vruntime -= calc_delta_fair(sysctl_sched_latency,
+						&cfs_rq->load);
 			else
 				vruntime -= sysctl_sched_latency;
 		}
@@ -1162,10 +1132,11 @@ static unsigned long wakeup_gran(struct 
 	unsigned long gran = sysctl_sched_wakeup_granularity;
 
 	/*
-	 * More easily preempt - nice tasks, while not making it harder for
-	 * + nice tasks.
+	 * More easily preempt - nice tasks, while not making
+	 * it harder for + nice tasks.
 	 */
-	gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
+	if (unlikely(se->load.weight > NICE_0_LOAD))
+		gran = calc_delta_fair(gran, &se->load);
 
 	return gran;
 }
@@ -1625,11 +1596,6 @@ print_cfs_rq_tasks(struct seq_file *m, s
 		for (i = depth; i; i--)
 			seq_puts(m, "  ");
 
-		seq_printf(m, "%lu %s %lu\n",
-				se->load.weight,
-				entity_is_task(se) ? "T" : "G",
-				calc_delta_weight(SCHED_LOAD_SCALE, se)
-				);
 		if (!entity_is_task(se))
 			print_cfs_rq_tasks(m, group_cfs_rq(se), depth + 1);
 	}

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: sysbench+mysql(oltp, readonly) 30% regression with 2.6.26-rc1
  2008-05-07  9:33   ` Zhang, Yanmin
@ 2008-05-07 10:40     ` Ingo Molnar
  0 siblings, 0 replies; 27+ messages in thread
From: Ingo Molnar @ 2008-05-07 10:40 UTC (permalink / raw)
  To: Zhang, Yanmin; +Cc: Peter Zijlstra, LKML


* Zhang, Yanmin <yanmin_zhang@linux.intel.com> wrote:

> > thanks Yanmin, i've queued up your reverter patch.
>
> Sorry. The reverting patch has a comment-out block. I need delete it 
> if you queue the patch officially.

yep, i noticed that and skipped that bit already.

	Ingo

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: sysbench+mysql(oltp, readonly) 30% regression with 2.6.26-rc1
  2008-05-07  4:55 sysbench+mysql(oltp, readonly) 30% regression with 2.6.26-rc1 Zhang, Yanmin
  2008-05-07  9:16 ` Ingo Molnar
@ 2008-05-07 16:26 ` Peter Zijlstra
  2008-05-08  6:35   ` Zhang, Yanmin
  1 sibling, 1 reply; 27+ messages in thread
From: Peter Zijlstra @ 2008-05-07 16:26 UTC (permalink / raw)
  To: Zhang, Yanmin; +Cc: LKML, Ingo Molnar, Mike Galbraith

On Wed, 2008-05-07 at 12:55 +0800, Zhang, Yanmin wrote:
> Comparing with kernel 2.6.25, sysbench+mysql(oltp, readonly) has many
> regression with 2.6.26-rc1.
> 
> 1) 8-core stoakley: 28%;
> 2) 16-core tigerton: 20%;
> 3) Itanium Montvale: 50%.
> 
> Bisect located below patch.
> 
> 8f1bc385cfbab474db6c27b5af1e439614f3025c is first bad commit
> commit 8f1bc385cfbab474db6c27b5af1e439614f3025c
> Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
> Date:   Sat Apr 19 19:45:00 2008 +0200
> 
>     sched: fair: weight calculations
>     
>     In order to level the hierarchy, we need to calculate load based on the
>     root view. That is, each task's load is in the same unit.
> 
> 
> 
> After I manually reverted the patch against 2.6.26-rc1 while fixing a couple of
> conflictions/errors, sysbench oltp regression became less than 3% on 8-core
> stoakley.

Does this patch help?

---
From: Mike Galbraith <efault@gmx.de>
Subject: sched: fix weight calculations

The conversion between virtual and real time is as follows:

  dvt = rw/w * dt <=> dt = w/rw * dvt

Since we want the fair sleeper granularity to be in real time, we actually
need to do:

  dvt = - rw/w * l

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/sched_fair.c |   11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

Index: linux-2.6-2/kernel/sched_fair.c
===================================================================
--- linux-2.6-2.orig/kernel/sched_fair.c
+++ linux-2.6-2/kernel/sched_fair.c
@@ -662,10 +662,15 @@ place_entity(struct cfs_rq *cfs_rq, stru
 	if (!initial) {
 		/* sleeps upto a single latency don't count. */
 		if (sched_feat(NEW_FAIR_SLEEPERS)) {
+			unsigned long thresh = sysctl_sched_latency;
+
+			/*
+			 * convert the sleeper threshold into virtual time
+			 */
 			if (sched_feat(NORMALIZED_SLEEPER))
-				vruntime -= calc_delta_weight(sysctl_sched_latency, se);
-			else
-				vruntime -= sysctl_sched_latency;
+				thresh = calc_delta_fair(thresh, se);
+
+			vruntime -= thresh;
 		}
 
 		/* ensure we never gain time by being placed backwards. */



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: sysbench+mysql(oltp, readonly) 30% regression with 2.6.26-rc1
  2008-05-07 16:26 ` Peter Zijlstra
@ 2008-05-08  6:35   ` Zhang, Yanmin
  2008-05-08  8:00     ` Mike Galbraith
  0 siblings, 1 reply; 27+ messages in thread
From: Zhang, Yanmin @ 2008-05-08  6:35 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: LKML, Ingo Molnar, Mike Galbraith


On Wed, 2008-05-07 at 18:26 +0200, Peter Zijlstra wrote: 
> On Wed, 2008-05-07 at 12:55 +0800, Zhang, Yanmin wrote:
> > Comparing with kernel 2.6.25, sysbench+mysql(oltp, readonly) has many
> > regression with 2.6.26-rc1.
> > 
> > 1) 8-core stoakley: 28%;
> > 2) 16-core tigerton: 20%;
> > 3) Itanium Montvale: 50%.
> > 
> > Bisect located below patch.
> > 
> > 8f1bc385cfbab474db6c27b5af1e439614f3025c is first bad commit
> > commit 8f1bc385cfbab474db6c27b5af1e439614f3025c
> > Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
> > Date:   Sat Apr 19 19:45:00 2008 +0200
> > 
> >     sched: fair: weight calculations
> >     
> >     In order to level the hierarchy, we need to calculate load based on the
> >     root view. That is, each task's load is in the same unit.
> > 
> > 
> > 
> > After I manually reverted the patch against 2.6.26-rc1 while fixing a couple of
> > conflictions/errors, sysbench oltp regression became less than 3% on 8-core
> > stoakley.
> 
> Does this patch help?
With the patch, the oltp test result is about 50% worse than that of pure
2.6.26-rc1.

CPU idle with 16 threads (a parameter of sysbench):
1) 2.6.25: 1%
2) 2.6.26-rc1: 33%
3) 2.6.26-rc1+new_patch: 70%.

> 
> ---
> From: Mike Galbraith <efault@gmx.de>
> Subject: sched: fix weight calculations
> 
> The conversion between virtual and real time is as follows:
> 
>   dvt = rw/w * dt <=> dt = w/rw * dvt
> 
> Since we want the fair sleeper granularity to be in real time, we actually
> need to do:
> 
>   dvt = - rw/w * l
> 
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> ---
>  kernel/sched_fair.c |   11 ++++++++---
>  1 file changed, 8 insertions(+), 3 deletions(-)
> 
> Index: linux-2.6-2/kernel/sched_fair.c
> ===================================================================
> --- linux-2.6-2.orig/kernel/sched_fair.c
> +++ linux-2.6-2/kernel/sched_fair.c
> @@ -662,10 +662,15 @@ place_entity(struct cfs_rq *cfs_rq, stru
>  	if (!initial) {
>  		/* sleeps upto a single latency don't count. */
>  		if (sched_feat(NEW_FAIR_SLEEPERS)) {
> +			unsigned long thresh = sysctl_sched_latency;
> +
> +			/*
> +			 * convert the sleeper threshold into virtual time
> +			 */
>  			if (sched_feat(NORMALIZED_SLEEPER))
> -				vruntime -= calc_delta_weight(sysctl_sched_latency, se);
> -			else
> -				vruntime -= sysctl_sched_latency;
> +				thresh = calc_delta_fair(thresh, se);
> +
> +			vruntime -= thresh;
>  		}
>  
>  		/* ensure we never gain time by being placed backwards. */
> 
> 


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: sysbench+mysql(oltp, readonly) 30% regression with 2.6.26-rc1
  2008-05-08  6:35   ` Zhang, Yanmin
@ 2008-05-08  8:00     ` Mike Galbraith
  2008-05-08  8:46       ` Ingo Molnar
  2008-05-08  9:01       ` Zhang, Yanmin
  0 siblings, 2 replies; 27+ messages in thread
From: Mike Galbraith @ 2008-05-08  8:00 UTC (permalink / raw)
  To: Zhang, Yanmin; +Cc: Peter Zijlstra, LKML, Ingo Molnar

[-- Attachment #1: Type: text/plain, Size: 1482 bytes --]


On Thu, 2008-05-08 at 14:35 +0800, Zhang, Yanmin wrote: 
> On Wed, 2008-05-07 at 18:26 +0200, Peter Zijlstra wrote: 
> > On Wed, 2008-05-07 at 12:55 +0800, Zhang, Yanmin wrote:
> > > Comparing with kernel 2.6.25, sysbench+mysql(oltp, readonly) has many
> > > regression with 2.6.26-rc1.
> > > 
> > > 1) 8-core stoakley: 28%;
> > > 2) 16-core tigerton: 20%;
> > > 3) Itanium Montvale: 50%.
> > > 
> > > Bisect located below patch.
> > > 
> > > 8f1bc385cfbab474db6c27b5af1e439614f3025c is first bad commit
> > > commit 8f1bc385cfbab474db6c27b5af1e439614f3025c
> > > Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
> > > Date:   Sat Apr 19 19:45:00 2008 +0200
> > > 
> > >     sched: fair: weight calculations
> > >     
> > >     In order to level the hierarchy, we need to calculate load based on the
> > >     root view. That is, each task's load is in the same unit.
> > > 
> > > 
> > > 
> > > After I manually reverted the patch against 2.6.26-rc1 while fixing a couple of
> > > conflictions/errors, sysbench oltp regression became less than 3% on 8-core
> > > stoakley.
> > 
> > Does this patch help?
> With the patch, oltp testing result is about 50% worse than the one of pure
> 2.6.26-rc1.

Hm.  I was doing some sysbench+postgres(oltp, ro) testing on my little
Q6600 box this morning, and saw a different picture.

In attached pdf, .bkl refers to Linus' BKL patch, .weight is the weight
fix, both are applied to git.today.  The script I used is also attached.

	-Mike  

[-- Attachment #2: xx.pdf --]
[-- Type: application/pdf, Size: 18377 bytes --]

[-- Attachment #3: sysbench.test --]
[-- Type: application/x-shellscript, Size: 2468 bytes --]

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: sysbench+mysql(oltp, readonly) 30% regression with 2.6.26-rc1
  2008-05-08  8:00     ` Mike Galbraith
@ 2008-05-08  8:46       ` Ingo Molnar
  2008-05-08  9:25         ` Ingo Molnar
  2008-05-08  9:37         ` Mike Galbraith
  2008-05-08  9:01       ` Zhang, Yanmin
  1 sibling, 2 replies; 27+ messages in thread
From: Ingo Molnar @ 2008-05-08  8:46 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Zhang, Yanmin, Peter Zijlstra, LKML


* Mike Galbraith <efault@gmx.de> wrote:

> > With the patch, oltp testing result is about 50% worse than the one 
> > of pure 2.6.26-rc1.
> 
> Hm.  I was doing some sysbench+postgress(oltp, ro) testing on my 
> little Q6600 box this morning, and saw a different picture.
> 
> In attached pdf, .bkl refers to Linus' BKL patch, .weight is the 
> weight fix, both are applied to git.today.  The script I used is also 
> attached.

nice numbers. I'm curious about the following detail related to Linus's 
BKL-spinlock patch: am i correct that it shows a small but systematic 
improvement with many clients? sysbench has not shown sensitivity to any 
BKL detail before.

Could you perhaps try the hack below that uses /proc/sys/kernel/panic as 
a flag whether the BKL should cause us to block or should be spun upon.

Can you confirm that with that patch too sysbench shows sensitivity to 
the value of the panic flag? [this is the surest way to measure such 
effects as flipping the sysctl only minimally impacts the system.]

if sysbench truly is dependent on the BKL then that's another argument 
in favor of Linus's patch.

	Ingo

-------------------------->
Subject: BKL: spin on acquire
From: Ingo Molnar <mingo@elte.hu>
Date: Wed May 07 19:05:40 CEST 2008

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 lib/kernel_lock.c |   13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

Index: linux/lib/kernel_lock.c
===================================================================
--- linux.orig/lib/kernel_lock.c
+++ linux/lib/kernel_lock.c
@@ -46,7 +46,10 @@ int __lockfunc __reacquire_kernel_lock(v
 	task->lock_depth = -1;
 	preempt_enable_no_resched();
 
-	down(&kernel_sem);
+	if (panic_timeout) {
+		while (down_trylock(&kernel_sem))
+			cpu_relax();
+	}
 
 	preempt_disable();
 	task->lock_depth = saved_lock_depth;
@@ -67,11 +70,15 @@ void __lockfunc lock_kernel(void)
 	struct task_struct *task = current;
 	int depth = task->lock_depth + 1;
 
-	if (likely(!depth))
+	if (likely(!depth)) {
 		/*
 		 * No recursion worries - we set up lock_depth _after_
 		 */
-		down(&kernel_sem);
+		if (panic_timeout) {
+			while (down_trylock(&kernel_sem))
+				cpu_relax();
+		}
+	}
 
 	task->lock_depth = depth;
 }

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: sysbench+mysql(oltp, readonly) 30% regression with 2.6.26-rc1
  2008-05-08  8:00     ` Mike Galbraith
  2008-05-08  8:46       ` Ingo Molnar
@ 2008-05-08  9:01       ` Zhang, Yanmin
  2008-05-08  9:13         ` Peter Zijlstra
  2008-05-08  9:15         ` Mike Galbraith
  1 sibling, 2 replies; 27+ messages in thread
From: Zhang, Yanmin @ 2008-05-08  9:01 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Peter Zijlstra, LKML, Ingo Molnar


On Thu, 2008-05-08 at 10:00 +0200, Mike Galbraith wrote:
> On Thu, 2008-05-08 at 14:35 +0800, Zhang, Yanmin wrote: 
> > On Wed, 2008-05-07 at 18:26 +0200, Peter Zijlstra wrote: 
> > > On Wed, 2008-05-07 at 12:55 +0800, Zhang, Yanmin wrote:
> > > > Comparing with kernel 2.6.25, sysbench+mysql(oltp, readonly) has many
> > > > regression with 2.6.26-rc1.
> > > > 
> > > > 1) 8-core stoakley: 28%;
> > > > 2) 16-core tigerton: 20%;
> > > > 3) Itanium Montvale: 50%.
> > > > 
> > > > Bisect located below patch.
> > > > 
> > > > 8f1bc385cfbab474db6c27b5af1e439614f3025c is first bad commit
> > > > commit 8f1bc385cfbab474db6c27b5af1e439614f3025c
> > > > Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
> > > > Date:   Sat Apr 19 19:45:00 2008 +0200
> > > > 
> > > >     sched: fair: weight calculations
> > > >     
> > > >     In order to level the hierarchy, we need to calculate load based on the
> > > >     root view. That is, each task's load is in the same unit.
> > > > 
> > > > 
> > > > 
> > > > After I manually reverted the patch against 2.6.26-rc1 while fixing a couple of
> > > > conflictions/errors, sysbench oltp regression became less than 3% on 8-core
> > > > stoakley.
> > > 
> > > Does this patch help?
> > With the patch, oltp testing result is about 50% worse than the one of pure
> > 2.6.26-rc1.
> 
> Hm.  I was doing some sysbench+postgress(oltp, ro) testing on my little
> Q6600 box this morning, and saw a different picture.
How many CPUs are in the Q6600?

> 
> In attached pdf, .bkl refers to Linus' BKL patch, .weight is the weight
> fix, both are applied to git.today.  The script I used is also attached.
With my 8-core stoakley (using mysql):
1) 2.6.25:
Number of threads: 6
    read/write requests:                 8025024 (66874.53 per sec.)
Number of threads: 8
    read/write requests:                 9132816 (76106.14 per sec.)
Number of threads: 10
    read/write requests:                 9244998 (77040.75 per sec.)
Number of threads: 12
    read/write requests:                 8994174 (74950.36 per sec.)
Number of threads: 14
    read/write requests:                 9051322 (75426.54 per sec.)
Number of threads: 16
    read/write requests:                 9015412 (75126.93 per sec.)

2) 2.6.26-rc1:
Number of threads: 6
    read/write requests:                 5754056 (47949.87 per sec.)
Number of threads: 8
    read/write requests:                 6528480 (54403.29 per sec.)
Number of threads: 10
    read/write requests:                 6444690 (53705.16 per sec.)
Number of threads: 12
    read/write requests:                 6544258 (54534.23 per sec.)
Number of threads: 14
    read/write requests:                 6796650 (56637.65 per sec.)
Number of threads: 16
    read/write requests:                 6718110 (55983.18 per sec.)

3) 2.6.26-rc1+weight
Number of threads: 16
	read/write requests:                 3219076 (26824.22 per sec.)

I'm not sure whether more CPUs could introduce more contention in this test.

-yanmin



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: sysbench+mysql(oltp, readonly) 30% regression with 2.6.26-rc1
  2008-05-08  9:01       ` Zhang, Yanmin
@ 2008-05-08  9:13         ` Peter Zijlstra
  2008-05-08  9:15         ` Mike Galbraith
  1 sibling, 0 replies; 27+ messages in thread
From: Peter Zijlstra @ 2008-05-08  9:13 UTC (permalink / raw)
  To: Zhang, Yanmin; +Cc: Mike Galbraith, LKML, Ingo Molnar

On Thu, 2008-05-08 at 17:01 +0800, Zhang, Yanmin wrote:
> On Thu, 2008-05-08 at 10:00 +0200, Mike Galbraith wrote:
> > On Thu, 2008-05-08 at 14:35 +0800, Zhang, Yanmin wrote: 
> > > On Wed, 2008-05-07 at 18:26 +0200, Peter Zijlstra wrote: 
> > > > On Wed, 2008-05-07 at 12:55 +0800, Zhang, Yanmin wrote:
> > > > > Comparing with kernel 2.6.25, sysbench+mysql(oltp, readonly) has many
> > > > > regression with 2.6.26-rc1.
> > > > > 
> > > > > 1) 8-core stoakley: 28%;
> > > > > 2) 16-core tigerton: 20%;
> > > > > 3) Itanium Montvale: 50%.
> > > > > 
> > > > > Bisect located below patch.
> > > > > 
> > > > > 8f1bc385cfbab474db6c27b5af1e439614f3025c is first bad commit
> > > > > commit 8f1bc385cfbab474db6c27b5af1e439614f3025c
> > > > > Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
> > > > > Date:   Sat Apr 19 19:45:00 2008 +0200
> > > > > 
> > > > >     sched: fair: weight calculations
> > > > >     
> > > > >     In order to level the hierarchy, we need to calculate load based on the
> > > > >     root view. That is, each task's load is in the same unit.
> > > > > 
> > > > > 
> > > > > 
> > > > > After I manually reverted the patch against 2.6.26-rc1 while fixing a couple of
> > > > > conflictions/errors, sysbench oltp regression became less than 3% on 8-core
> > > > > stoakley.
> > > > 
> > > > Does this patch help?
> > > With the patch, oltp testing result is about 50% worse than the one of pure
> > > 2.6.26-rc1.
> > 
> > Hm.  I was doing some sysbench+postgress(oltp, ro) testing on my little
> > Q6600 box this morning, and saw a different picture.
> How many cpu are in the Q6600?

That's an Intel Quad core (Kentsfield).

> > In attached pdf, .bkl refers to Linus' BKL patch, .weight is the weight
> > fix, both are applied to git.today.  The script I used is also attached.
> With my 8-core stoakley (using mysql):
> 1) 2.6.25:
> Number of threads: 6
>     read/write requests:                 8025024 (66874.53 per sec.)
> Number of threads: 8
>     read/write requests:                 9132816 (76106.14 per sec.)
> Number of threads: 10
>     read/write requests:                 9244998 (77040.75 per sec.)
> Number of threads: 12
>     read/write requests:                 8994174 (74950.36 per sec.)
> Number of threads: 14
>     read/write requests:                 9051322 (75426.54 per sec.)
> Number of threads: 16
>     read/write requests:                 9015412 (75126.93 per sec.)
> 
> 2) 2.6.26-rc1:
> Number of threads: 6
>     read/write requests:                 5754056 (47949.87 per sec.)
> Number of threads: 8
>     read/write requests:                 6528480 (54403.29 per sec.)
> Number of threads: 10
>     read/write requests:                 6444690 (53705.16 per sec.)
> Number of threads: 12
>     read/write requests:                 6544258 (54534.23 per sec.)
> Number of threads: 14
>     read/write requests:                 6796650 (56637.65 per sec.)
> Number of threads: 16
>     read/write requests:                 6718110 (55983.18 per sec.)
> 
> 3) 2.6.26-rc1+weight
> Number of threads: 16
> 	read/write requests:                 3219076 (26824.22 per sec.)
> 
> I'm not sure if more cpu could introduce more contention in this test.

those numbers make me sad :-(


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: sysbench+mysql(oltp, readonly) 30% regression with 2.6.26-rc1
  2008-05-08  9:01       ` Zhang, Yanmin
  2008-05-08  9:13         ` Peter Zijlstra
@ 2008-05-08  9:15         ` Mike Galbraith
  2008-05-08  9:22           ` Zhang, Yanmin
  2008-05-08  9:23           ` Peter Zijlstra
  1 sibling, 2 replies; 27+ messages in thread
From: Mike Galbraith @ 2008-05-08  9:15 UTC (permalink / raw)
  To: Zhang, Yanmin; +Cc: Peter Zijlstra, LKML, Ingo Molnar


On Thu, 2008-05-08 at 17:01 +0800, Zhang, Yanmin wrote:
> On Thu, 2008-05-08 at 10:00 +0200, Mike Galbraith wrote:

> > Hm.  I was doing some sysbench+postgress(oltp, ro) testing on my little
> > Q6600 box this morning, and saw a different picture.

> How many cpu are in the Q6600?

1.

> > 
> > In attached pdf, .bkl refers to Linus' BKL patch, .weight is the weight
> > fix, both are applied to git.today.  The script I used is also attached.
> With my 8-core stoakley (using mysql):
> 1) 2.6.25:
> Number of threads: 6
>     read/write requests:                 8025024 (66874.53 per sec.)
> Number of threads: 8
>     read/write requests:                 9132816 (76106.14 per sec.)
> Number of threads: 10
>     read/write requests:                 9244998 (77040.75 per sec.)
> Number of threads: 12
>     read/write requests:                 8994174 (74950.36 per sec.)
> Number of threads: 14
>     read/write requests:                 9051322 (75426.54 per sec.)
> Number of threads: 16
>     read/write requests:                 9015412 (75126.93 per sec.)
> 
> 2) 2.6.26-rc1:
> Number of threads: 6
>     read/write requests:                 5754056 (47949.87 per sec.)
> Number of threads: 8
>     read/write requests:                 6528480 (54403.29 per sec.)
> Number of threads: 10
>     read/write requests:                 6444690 (53705.16 per sec.)
> Number of threads: 12
>     read/write requests:                 6544258 (54534.23 per sec.)
> Number of threads: 14
>     read/write requests:                 6796650 (56637.65 per sec.)
> Number of threads: 16
>     read/write requests:                 6718110 (55983.18 per sec.)
> 
> 3) 2.6.26-rc1+weight
> Number of threads: 16
> 	read/write requests:                 3219076 (26824.22 per sec.)
> 
> I'm not sure if more cpu could introduce more contention in this test.

-rc1.  Do you have the fix below applied?

commit a992241de614dd2b7c97a9ba64e28c0e563f19bf
Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date:   Mon May 5 23:56:17 2008 +0200

    sched: fix normalized sleeper
    
    Normalized sleeper uses calc_delta*() which requires that the rq load is
    already updated, so move account_entity_enqueue() before place_entity()
    
    Tested-by: Frans Pop <elendil@planet.nl>
    Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 89fa32b..1295ddc 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -682,6 +682,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
+	account_entity_enqueue(cfs_rq, se);
 
 	if (wakeup) {
 		place_entity(cfs_rq, se, 0);
@@ -692,7 +693,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
 	check_spread(cfs_rq, se);
 	if (se != cfs_rq->curr)
 		__enqueue_entity(cfs_rq, se);
-	account_entity_enqueue(cfs_rq, se);
 }
 
 static void update_avg(u64 *avg, u64 sample)



^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: sysbench+mysql(oltp, readonly) 30% regression with 2.6.26-rc1
  2008-05-08  9:15         ` Mike Galbraith
@ 2008-05-08  9:22           ` Zhang, Yanmin
  2008-05-08  9:23           ` Peter Zijlstra
  1 sibling, 0 replies; 27+ messages in thread
From: Zhang, Yanmin @ 2008-05-08  9:22 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Peter Zijlstra, LKML, Ingo Molnar


On Thu, 2008-05-08 at 11:15 +0200, Mike Galbraith wrote:
> On Thu, 2008-05-08 at 17:01 +0800, Zhang, Yanmin wrote:
> > On Thu, 2008-05-08 at 10:00 +0200, Mike Galbraith wrote:
> 
> > > Hm.  I was doing some sysbench+postgress(oltp, ro) testing on my little
> > > Q6600 box this morning, and saw a different picture.
> 
> > How many cpu are in the Q6600?
> 
> 1.

> > 3) 2.6.26-rc1+weight
> > Number of threads: 16
> > 	read/write requests:                 3219076 (26824.22 per sec.)
> > 
> > I'm not sure if more cpu could introduce more contention in this test.
> 
> -rc1.  Do you have the fix below applied?
No. 

> 
> commit a992241de614dd2b7c97a9ba64e28c0e563f19bf
> Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
> Date:   Mon May 5 23:56:17 2008 +0200
> 
>     sched: fix normalized sleeper
>     
>     Normalized sleeper uses calc_delta*() which requires that the rq load is
>     already updated, so move account_entity_enqueue() before place_entity()
>     
>     Tested-by: Frans Pop <elendil@planet.nl>
>     Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
>     Signed-off-by: Ingo Molnar <mingo@elte.hu>
> 
> diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
> index 89fa32b..1295ddc 100644
> --- a/kernel/sched_fair.c
> +++ b/kernel/sched_fair.c
> @@ -682,6 +682,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
>  	 * Update run-time statistics of the 'current'.
>  	 */
>  	update_curr(cfs_rq);
> +	account_entity_enqueue(cfs_rq, se);
>  
>  	if (wakeup) {
>  		place_entity(cfs_rq, se, 0);
> @@ -692,7 +693,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
>  	check_spread(cfs_rq, se);
>  	if (se != cfs_rq->curr)
>  		__enqueue_entity(cfs_rq, se);
> -	account_entity_enqueue(cfs_rq, se);
>  }
>  
>  static void update_avg(u64 *avg, u64 sample)
> 
> 


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: sysbench+mysql(oltp, readonly) 30% regression with 2.6.26-rc1
  2008-05-08  9:15         ` Mike Galbraith
  2008-05-08  9:22           ` Zhang, Yanmin
@ 2008-05-08  9:23           ` Peter Zijlstra
  2008-05-09  1:16             ` Zhang, Yanmin
  1 sibling, 1 reply; 27+ messages in thread
From: Peter Zijlstra @ 2008-05-08  9:23 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Zhang, Yanmin, LKML, Ingo Molnar

On Thu, 2008-05-08 at 11:15 +0200, Mike Galbraith wrote:
> On Thu, 2008-05-08 at 17:01 +0800, Zhang, Yanmin wrote:
> > On Thu, 2008-05-08 at 10:00 +0200, Mike Galbraith wrote:
> 
> > > Hm.  I was doing some sysbench+postgres(oltp, ro) testing on my little
> > > Q6600 box this morning, and saw a different picture.
> 
> > How many cpu are in the Q6600?
> 
> 1.
> 
> > > 
> > > In attached pdf, .bkl refers to Linus' BKL patch, .weight is the weight
> > > fix, both are applied to git.today.  The script I used is also attached.
> > With my 8-core stoakley (using mysql):
> > 1) 2.6.25:
> > Number of threads: 6
> >     read/write requests:                 8025024 (66874.53 per sec.)
> > Number of threads: 8
> >     read/write requests:                 9132816 (76106.14 per sec.)
> > Number of threads: 10
> >     read/write requests:                 9244998 (77040.75 per sec.)
> > Number of threads: 12
> >     read/write requests:                 8994174 (74950.36 per sec.)
> > Number of threads: 14
> >     read/write requests:                 9051322 (75426.54 per sec.)
> > Number of threads: 16
> >     read/write requests:                 9015412 (75126.93 per sec.)
> > 
> > 2) 2.6.26-rc1:
> > Number of threads: 6
> >     read/write requests:                 5754056 (47949.87 per sec.)
> > Number of threads: 8
> >     read/write requests:                 6528480 (54403.29 per sec.)
> > Number of threads: 10
> >     read/write requests:                 6444690 (53705.16 per sec.)
> > Number of threads: 12
> >     read/write requests:                 6544258 (54534.23 per sec.)
> > Number of threads: 14
> >     read/write requests:                 6796650 (56637.65 per sec.)
> > Number of threads: 16
> >     read/write requests:                 6718110 (55983.18 per sec.)
> > 
> > 3) 2.6.26-rc1+weight
> > Number of threads: 16
> > 	read/write requests:                 3219076 (26824.22 per sec.)
> > 
> > I'm not sure if more cpu could introduce more contention in this test.
> 
> -rc1.  Do you have the fix below applied?

Oooh - good catch, that seems to be a post -rc1 merge.

Yes this is required.

> commit a992241de614dd2b7c97a9ba64e28c0e563f19bf
> Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
> Date:   Mon May 5 23:56:17 2008 +0200
> 
>     sched: fix normalized sleeper
>     
>     Normalized sleeper uses calc_delta*() which requires that the rq load is
>     already updated, so move account_entity_enqueue() before place_entity()
>     
>     Tested-by: Frans Pop <elendil@planet.nl>
>     Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
>     Signed-off-by: Ingo Molnar <mingo@elte.hu>
> 
> diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
> index 89fa32b..1295ddc 100644
> --- a/kernel/sched_fair.c
> +++ b/kernel/sched_fair.c
> @@ -682,6 +682,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
>  	 * Update run-time statistics of the 'current'.
>  	 */
>  	update_curr(cfs_rq);
> +	account_entity_enqueue(cfs_rq, se);
>  
>  	if (wakeup) {
>  		place_entity(cfs_rq, se, 0);
> @@ -692,7 +693,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
>  	check_spread(cfs_rq, se);
>  	if (se != cfs_rq->curr)
>  		__enqueue_entity(cfs_rq, se);
> -	account_entity_enqueue(cfs_rq, se);
>  }
>  
>  static void update_avg(u64 *avg, u64 sample)
> 
> 


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: sysbench+mysql(oltp, readonly) 30% regression with 2.6.26-rc1
  2008-05-08  8:46       ` Ingo Molnar
@ 2008-05-08  9:25         ` Ingo Molnar
  2008-05-08 10:00           ` Mike Galbraith
  2008-05-08  9:37         ` Mike Galbraith
  1 sibling, 1 reply; 27+ messages in thread
From: Ingo Molnar @ 2008-05-08  9:25 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Zhang, Yanmin, Peter Zijlstra, LKML


* Ingo Molnar <mingo@elte.hu> wrote:

> -	down(&kernel_sem);
> +	if (panic_timeout) {
> +		while (down_trylock(&kernel_sem))
> +			cpu_relax();

updated one below - this will work fine in the !panic_timeout case too 
;-)

	Ingo

Index: linux/lib/kernel_lock.c
===================================================================
--- linux.orig/lib/kernel_lock.c
+++ linux/lib/kernel_lock.c
@@ -46,7 +46,12 @@ int __lockfunc __reacquire_kernel_lock(v
 	task->lock_depth = -1;
 	preempt_enable_no_resched();
 
-	down(&kernel_sem);
+	if (panic_timeout) {
+		while (down_trylock(&kernel_sem))
+			cpu_relax();
+	} else {
+		down(&kernel_sem);
+	}
 
 	preempt_disable();
 	task->lock_depth = saved_lock_depth;
@@ -67,11 +72,17 @@ void __lockfunc lock_kernel(void)
 	struct task_struct *task = current;
 	int depth = task->lock_depth + 1;
 
-	if (likely(!depth))
+	if (likely(!depth)) {
 		/*
 		 * No recursion worries - we set up lock_depth _after_
 		 */
-		down(&kernel_sem);
+		if (panic_timeout) {
+			while (down_trylock(&kernel_sem))
+				cpu_relax();
+		} else {
+			down(&kernel_sem);
+		}
+	}
 
 	task->lock_depth = depth;
 }

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: sysbench+mysql(oltp, readonly) 30% regression with 2.6.26-rc1
  2008-05-08  8:46       ` Ingo Molnar
  2008-05-08  9:25         ` Ingo Molnar
@ 2008-05-08  9:37         ` Mike Galbraith
  1 sibling, 0 replies; 27+ messages in thread
From: Mike Galbraith @ 2008-05-08  9:37 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Zhang, Yanmin, Peter Zijlstra, LKML


On Thu, 2008-05-08 at 10:46 +0200, Ingo Molnar wrote:

> Could you perhaps try the hack below that uses /proc/sys/kernel/panic as 
> a flag whether the BKL should cause us to block or should be spun upon.
> 
> Can you confirm that with that patch too sysbench shows sensitivity to 
> the value of the panic flag? [this is the surest way to measure such 
> effects as flipping the sysctl only minimally impacts the system.]

The difference between the averages of 3 runs at 128 clients is 0.3% with
this patch. IOW, nope: no sensitivity to the panic flag here.

	-Mike


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: sysbench+mysql(oltp, readonly) 30% regression with 2.6.26-rc1
  2008-05-08  9:25         ` Ingo Molnar
@ 2008-05-08 10:00           ` Mike Galbraith
  0 siblings, 0 replies; 27+ messages in thread
From: Mike Galbraith @ 2008-05-08 10:00 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Zhang, Yanmin, Peter Zijlstra, LKML


On Thu, 2008-05-08 at 11:25 +0200, Ingo Molnar wrote:
> * Ingo Molnar <mingo@elte.hu> wrote:
> 
> > -	down(&kernel_sem);
> > +	if (panic_timeout) {
> > +		while (down_trylock(&kernel_sem))
> > +			cpu_relax();
> 
> updated one below - this will work fine in the !panic_timeout case too 
> ;-)

Same result.

	-Mike


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: sysbench+mysql(oltp, readonly) 30% regression with 2.6.26-rc1
  2008-05-08  9:23           ` Peter Zijlstra
@ 2008-05-09  1:16             ` Zhang, Yanmin
  2008-05-09  6:51               ` Peter Zijlstra
  0 siblings, 1 reply; 27+ messages in thread
From: Zhang, Yanmin @ 2008-05-09  1:16 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Mike Galbraith, LKML, Ingo Molnar


On Thu, 2008-05-08 at 11:23 +0200, Peter Zijlstra wrote:
> On Thu, 2008-05-08 at 11:15 +0200, Mike Galbraith wrote:
> > On Thu, 2008-05-08 at 17:01 +0800, Zhang, Yanmin wrote:
> > > On Thu, 2008-05-08 at 10:00 +0200, Mike Galbraith wrote:
> > 
> > > > Hm.  I was doing some sysbench+postgres(oltp, ro) testing on my little
> > > > Q6600 box this morning, and saw a different picture.
> > 
> > > How many cpu are in the Q6600?
> > 
> > 1.
> > 
> > > > 
> > > > In attached pdf, .bkl refers to Linus' BKL patch, .weight is the weight
> > > > fix, both are applied to git.today.  The script I used is also attached.
> > > With my 8-core stoakley (using mysql):
> > > 1) 2.6.25:
> > > Number of threads: 6
> > >     read/write requests:                 8025024 (66874.53 per sec.)
> > > Number of threads: 8
> > >     read/write requests:                 9132816 (76106.14 per sec.)
> > > Number of threads: 10
> > >     read/write requests:                 9244998 (77040.75 per sec.)
> > > Number of threads: 12
> > >     read/write requests:                 8994174 (74950.36 per sec.)
> > > Number of threads: 14
> > >     read/write requests:                 9051322 (75426.54 per sec.)
> > > Number of threads: 16
> > >     read/write requests:                 9015412 (75126.93 per sec.)
> > > 
> > > 2) 2.6.26-rc1:
> > > Number of threads: 6
> > >     read/write requests:                 5754056 (47949.87 per sec.)
> > > Number of threads: 8
> > >     read/write requests:                 6528480 (54403.29 per sec.)
> > > Number of threads: 10
> > >     read/write requests:                 6444690 (53705.16 per sec.)
> > > Number of threads: 12
> > >     read/write requests:                 6544258 (54534.23 per sec.)
> > > Number of threads: 14
> > >     read/write requests:                 6796650 (56637.65 per sec.)
> > > Number of threads: 16
> > >     read/write requests:                 6718110 (55983.18 per sec.)
> > > 
> > > 3) 2.6.26-rc1+weight
> > > Number of threads: 16
> > > 	read/write requests:                 3219076 (26824.22 per sec.)
> > > 
> > > I'm not sure if more cpu could introduce more contention in this test.
> > 
> > -rc1.  Do you have the fix below applied?
> 
> Oooh - good catch, that seems to be a post -rc1 merge.
> 
> Yes this is required.
With the patch below plus the previous fix_weight_calc patch, the result is
improved, but it is still about 10% worse than that of pure 2.6.26-rc1.
Number of threads: 16
    read/write requests:                 6104336 (50867.11 per sec.)


> 
> > commit a992241de614dd2b7c97a9ba64e28c0e563f19bf
> > Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
> > Date:   Mon May 5 23:56:17 2008 +0200
> > 
> >     sched: fix normalized sleeper
> >     
> >     Normalized sleeper uses calc_delta*() which requires that the rq load is
> >     already updated, so move account_entity_enqueue() before place_entity()
> >     
> >     Tested-by: Frans Pop <elendil@planet.nl>
> >     Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> >     Signed-off-by: Ingo Molnar <mingo@elte.hu>
> > 
> > diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
> > index 89fa32b..1295ddc 100644
> > --- a/kernel/sched_fair.c
> > +++ b/kernel/sched_fair.c
> > @@ -682,6 +682,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
> >  	 * Update run-time statistics of the 'current'.
> >  	 */
> >  	update_curr(cfs_rq);
> > +	account_entity_enqueue(cfs_rq, se);
> >  
> >  	if (wakeup) {
> >  		place_entity(cfs_rq, se, 0);
> > @@ -692,7 +693,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
> >  	check_spread(cfs_rq, se);
> >  	if (se != cfs_rq->curr)
> >  		__enqueue_entity(cfs_rq, se);
> > -	account_entity_enqueue(cfs_rq, se);
> >  }
> >  
> >  static void update_avg(u64 *avg, u64 sample)
> > 
> > 
> 


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: sysbench+mysql(oltp, readonly) 30% regression with 2.6.26-rc1
  2008-05-09  1:16             ` Zhang, Yanmin
@ 2008-05-09  6:51               ` Peter Zijlstra
  2008-05-09  7:32                 ` Mike Galbraith
  0 siblings, 1 reply; 27+ messages in thread
From: Peter Zijlstra @ 2008-05-09  6:51 UTC (permalink / raw)
  To: Zhang, Yanmin; +Cc: Mike Galbraith, LKML, Ingo Molnar


> > > -rc1.  Do you have the fix below applied?
> > 
> > Oooh - good catch, that seems to be a post -rc1 merge.
> > 
> > Yes this is required.

> With below patch+previous_fix_weight_calc, the result is improved, but is still
> about 10% worse than the one of pure 2.6.26-rc1.
> Number of threads: 16
>     read/write requests:                 6104336 (50867.11 per sec.)
> 

So puzzling - could you try -git + fix_weight_calc, as I think that is
what Mike tested and proved good on his Quad.




^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: sysbench+mysql(oltp, readonly) 30% regression with 2.6.26-rc1
  2008-05-09  6:51               ` Peter Zijlstra
@ 2008-05-09  7:32                 ` Mike Galbraith
  2008-05-09  7:50                   ` Peter Zijlstra
  0 siblings, 1 reply; 27+ messages in thread
From: Mike Galbraith @ 2008-05-09  7:32 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Zhang, Yanmin, LKML, Ingo Molnar

[-- Attachment #1: Type: text/plain, Size: 842 bytes --]


On Fri, 2008-05-09 at 08:51 +0200, Peter Zijlstra wrote:
> > > > -rc1.  Do you have the fix below applied?
> > > 
> > > Oooh - good catch, that seems to be a post -rc1 merge.
> > > 
> > > Yes this is required.
> 
> > With below patch+previous_fix_weight_calc, the result is improved, but is still
> > about 10% worse than the one of pure 2.6.26-rc1.
> > Number of threads: 16
> >     read/write requests:                 6104336 (50867.11 per sec.)
> > 
> 
> So puzzling - could you try -git + fix_weight_calc, as I think that is
> what Mike tested and proved good on his Quad.

If he still has group scheduling enabled, my box agrees that there's a
regression.  I got mysql running, and tested with group scheduling both
enabled and disabled, results attached.  (for some reason, I can only
test up to 64 threads with mysql though)

	-Mike

[-- Attachment #2: yy.pdf --]
[-- Type: application/pdf, Size: 16940 bytes --]

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: sysbench+mysql(oltp, readonly) 30% regression with 2.6.26-rc1
  2008-05-09  7:32                 ` Mike Galbraith
@ 2008-05-09  7:50                   ` Peter Zijlstra
  2008-05-09  7:58                     ` Mike Galbraith
  2008-05-09  7:59                     ` Mike Galbraith
  0 siblings, 2 replies; 27+ messages in thread
From: Peter Zijlstra @ 2008-05-09  7:50 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Zhang, Yanmin, LKML, Ingo Molnar

On Fri, 2008-05-09 at 09:32 +0200, Mike Galbraith wrote:
> On Fri, 2008-05-09 at 08:51 +0200, Peter Zijlstra wrote:
> > > > > -rc1.  Do you have the fix below applied?
> > > > 
> > > > Oooh - good catch, that seems to be a post -rc1 merge.
> > > > 
> > > > Yes this is required.
> > 
> > > With below patch+previous_fix_weight_calc, the result is improved, but is still
> > > about 10% worse than the one of pure 2.6.26-rc1.
> > > Number of threads: 16
> > >     read/write requests:                 6104336 (50867.11 per sec.)
> > > 
> > 
> > So puzzling - could you try -git + fix_weight_calc, as I think that is
> > what Mike tested and proved good on his Quad.
> 
> If he still has group scheduling enabled, my box agrees that there's a
> regression.  I got mysql running, and tested with group scheduling both
> enabled and disabled, results attached.  (for some reason, I can only
> test up to 64 threads with mysql though)

Hm, ok. Does the patch at hand improve or regress the group config for
you?


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: sysbench+mysql(oltp, readonly) 30% regression with 2.6.26-rc1
  2008-05-09  7:50                   ` Peter Zijlstra
@ 2008-05-09  7:58                     ` Mike Galbraith
  2008-05-09  8:02                       ` Peter Zijlstra
  2008-05-09  7:59                     ` Mike Galbraith
  1 sibling, 1 reply; 27+ messages in thread
From: Mike Galbraith @ 2008-05-09  7:58 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Zhang, Yanmin, LKML, Ingo Molnar


On Fri, 2008-05-09 at 09:50 +0200, Peter Zijlstra wrote:
> On Fri, 2008-05-09 at 09:32 +0200, Mike Galbraith wrote:
> > On Fri, 2008-05-09 at 08:51 +0200, Peter Zijlstra wrote:
> > > > > > -rc1.  Do you have the fix below applied?
> > > > > 
> > > > > Oooh - good catch, that seems to be a post -rc1 merge.
> > > > > 
> > > > > Yes this is required.
> > > 
> > > > With below patch+previous_fix_weight_calc, the result is improved, but is still
> > > > about 10% worse than the one of pure 2.6.26-rc1.
> > > > Number of threads: 16
> > > >     read/write requests:                 6104336 (50867.11 per sec.)
> > > > 
> > > 
> > > So puzzling - could you try -git + fix_weight_calc, as I think that is
> > > what Mike tested and proved good on his Quad.
> > 
> > If he still has group scheduling enabled, my box agrees that there's a
> > regression.  I got mysql running, and tested with group scheduling both
> > enabled and disabled, results attached.  (for some reason, I can only
> > test up to 64 threads with mysql though)
> 
> Hm, ok. Does the patch at hand improve or regress the group config for
> you?

I'm testing git.today.

You mean 46151122e0a2e80e5a6b2889f595e371fe2b600d?

	-Mike


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: sysbench+mysql(oltp, readonly) 30% regression with 2.6.26-rc1
  2008-05-09  7:50                   ` Peter Zijlstra
  2008-05-09  7:58                     ` Mike Galbraith
@ 2008-05-09  7:59                     ` Mike Galbraith
  2008-05-09  8:10                       ` Peter Zijlstra
  1 sibling, 1 reply; 27+ messages in thread
From: Mike Galbraith @ 2008-05-09  7:59 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Zhang, Yanmin, LKML, Ingo Molnar


On Fri, 2008-05-09 at 09:50 +0200, Peter Zijlstra wrote:
> On Fri, 2008-05-09 at 09:32 +0200, Mike Galbraith wrote:
> > On Fri, 2008-05-09 at 08:51 +0200, Peter Zijlstra wrote:
> > > > > > -rc1.  Do you have the fix below applied?
> > > > > 
> > > > > Oooh - good catch, that seems to be a post -rc1 merge.
> > > > > 
> > > > > Yes this is required.
> > > 
> > > > With below patch+previous_fix_weight_calc, the result is improved, but is still
> > > > about 10% worse than the one of pure 2.6.26-rc1.
> > > > Number of threads: 16
> > > >     read/write requests:                 6104336 (50867.11 per sec.)
> > > > 
> > > 
> > > So puzzling - could you try -git + fix_weight_calc, as I think that is
> > > what Mike tested and proved good on his Quad.
> > 
> > If he still has group scheduling enabled, my box agrees that there's a
> > regression.  I got mysql running, and tested with group scheduling both
> > enabled and disabled, results attached.  (for some reason, I can only
> > test up to 64 threads with mysql though)
> 
> Hm, ok. Does the patch at hand improve or regress the group config for
> you?

Oh, you want me to revert it and retest group?

	-Mike


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: sysbench+mysql(oltp, readonly) 30% regression with 2.6.26-rc1
  2008-05-09  7:58                     ` Mike Galbraith
@ 2008-05-09  8:02                       ` Peter Zijlstra
  2008-05-09  8:23                         ` Mike Galbraith
  0 siblings, 1 reply; 27+ messages in thread
From: Peter Zijlstra @ 2008-05-09  8:02 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Zhang, Yanmin, LKML, Ingo Molnar, Dhaval Giani

On Fri, 2008-05-09 at 09:58 +0200, Mike Galbraith wrote:
> On Fri, 2008-05-09 at 09:50 +0200, Peter Zijlstra wrote:
> > On Fri, 2008-05-09 at 09:32 +0200, Mike Galbraith wrote:
> > > On Fri, 2008-05-09 at 08:51 +0200, Peter Zijlstra wrote:
> > > > > > > -rc1.  Do you have the fix below applied?
> > > > > > 
> > > > > > Oooh - good catch, that seems to be a post -rc1 merge.
> > > > > > 
> > > > > > Yes this is required.
> > > > 
> > > > > With below patch+previous_fix_weight_calc, the result is improved, but is still
> > > > > about 10% worse than the one of pure 2.6.26-rc1.
> > > > > Number of threads: 16
> > > > >     read/write requests:                 6104336 (50867.11 per sec.)
> > > > > 
> > > > 
> > > > So puzzling - could you try -git + fix_weight_calc, as I think that is
> > > > what Mike tested and proved good on his Quad.
> > > 
> > > If he still has group scheduling enabled, my box agrees that there's a
> > > regression.  I got mysql running, and tested with group scheduling both
> > > enabled and disabled, results attached.  (for some reason, I can only
> > > test up to 64 threads with mysql though)
> > 
> > Hm, ok. Does the patch at hand improve or regress the group config for
> > you?
> 
> > I'm testing git.today.
> 
> You mean 46151122e0a2e80e5a6b2889f595e371fe2b600d?

Exactly - Yanmin says it makes it worse for him, I'm wondering if you
can see a similar problem with group scheduling enabled.


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: sysbench+mysql(oltp, readonly) 30% regression with 2.6.26-rc1
  2008-05-09  7:59                     ` Mike Galbraith
@ 2008-05-09  8:10                       ` Peter Zijlstra
  2008-05-09  8:25                         ` Mike Galbraith
  0 siblings, 1 reply; 27+ messages in thread
From: Peter Zijlstra @ 2008-05-09  8:10 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Zhang, Yanmin, LKML, Ingo Molnar, Dhaval Giani

On Fri, 2008-05-09 at 09:59 +0200, Mike Galbraith wrote:
> On Fri, 2008-05-09 at 09:50 +0200, Peter Zijlstra wrote:
> > On Fri, 2008-05-09 at 09:32 +0200, Mike Galbraith wrote:
> > > On Fri, 2008-05-09 at 08:51 +0200, Peter Zijlstra wrote:
> > > > > > > -rc1.  Do you have the fix below applied?
> > > > > > 
> > > > > > Oooh - good catch, that seems to be a post -rc1 merge.
> > > > > > 
> > > > > > Yes this is required.
> > > > 
> > > > > With below patch+previous_fix_weight_calc, the result is improved, but is still
> > > > > about 10% worse than the one of pure 2.6.26-rc1.
> > > > > Number of threads: 16
> > > > >     read/write requests:                 6104336 (50867.11 per sec.)
> > > > > 
> > > > 
> > > > So puzzling - could you try -git + fix_weight_calc, as I think that is
> > > > what Mike tested and proved good on his Quad.
> > > 
> > > If he still has group scheduling enabled, my box agrees that there's a
> > > regression.  I got mysql running, and tested with group scheduling both
> > > enabled and disabled, results attached.  (for some reason, I can only
> > > test up to 64 threads with mysql though)
> > 
> > Hm, ok. Does the patch at hand improve or regress the group config for
> > you?
> 
> Oh, you want me to revert it and retest group?

Yes, I guess that would be what I'm asking ;-)


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: sysbench+mysql(oltp, readonly) 30% regression with 2.6.26-rc1
  2008-05-09  8:02                       ` Peter Zijlstra
@ 2008-05-09  8:23                         ` Mike Galbraith
  2008-05-09  9:47                           ` Mike Galbraith
  0 siblings, 1 reply; 27+ messages in thread
From: Mike Galbraith @ 2008-05-09  8:23 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Zhang, Yanmin, LKML, Ingo Molnar, Dhaval Giani

[-- Attachment #1: Type: text/plain, Size: 1597 bytes --]


On Fri, 2008-05-09 at 10:02 +0200, Peter Zijlstra wrote:
> On Fri, 2008-05-09 at 09:58 +0200, Mike Galbraith wrote:
> > On Fri, 2008-05-09 at 09:50 +0200, Peter Zijlstra wrote:
> > > On Fri, 2008-05-09 at 09:32 +0200, Mike Galbraith wrote:
> > > > On Fri, 2008-05-09 at 08:51 +0200, Peter Zijlstra wrote:
> > > > > > > > -rc1.  Do you have the fix below applied?
> > > > > > > 
> > > > > > > Oooh - good catch, that seems to be a post -rc1 merge.
> > > > > > > 
> > > > > > > Yes this is required.
> > > > > 
> > > > > > With below patch+previous_fix_weight_calc, the result is improved, but is still
> > > > > > about 10% worse than the one of pure 2.6.26-rc1.
> > > > > > Number of threads: 16
> > > > > >     read/write requests:                 6104336 (50867.11 per sec.)
> > > > > > 
> > > > > 
> > > > > So puzzling - could you try -git + fix_weight_calc, as I think that is
> > > > > what Mike tested and proved good on his Quad.
> > > > 
> > > > If he still has group scheduling enabled, my box agrees that there's a
> > > > regression.  I got mysql running, and tested with group scheduling both
> > > > enabled and disabled, results attached.  (for some reason, I can only
> > > > test up to 64 threads with mysql though)
> > > 
> > > Hm, ok. Does the patch at hand improve or regress the group config for
> > > you?
> > 
> > > I'm testing git.today.
> > 
> > You mean 46151122e0a2e80e5a6b2889f595e371fe2b600d?
> 
> Exactly - Yanmin says it makes it worse for him, I'm wondering if you
> can see a similar problem with group scheduling enabled.

Nope, same fugly curve +-jitter.

	-Mike

[-- Attachment #2: yy.pdf --]
[-- Type: application/pdf, Size: 17674 bytes --]

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: sysbench+mysql(oltp, readonly) 30% regression with 2.6.26-rc1
  2008-05-09  8:10                       ` Peter Zijlstra
@ 2008-05-09  8:25                         ` Mike Galbraith
  0 siblings, 0 replies; 27+ messages in thread
From: Mike Galbraith @ 2008-05-09  8:25 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Zhang, Yanmin, LKML, Ingo Molnar, Dhaval Giani


On Fri, 2008-05-09 at 10:10 +0200, Peter Zijlstra wrote:
> On Fri, 2008-05-09 at 09:59 +0200, Mike Galbraith wrote:
> > On Fri, 2008-05-09 at 09:50 +0200, Peter Zijlstra wrote:
> > > On Fri, 2008-05-09 at 09:32 +0200, Mike Galbraith wrote:
> > > > On Fri, 2008-05-09 at 08:51 +0200, Peter Zijlstra wrote:
> > > > > > > > -rc1.  Do you have the fix below applied?
> > > > > > > 
> > > > > > > Oooh - good catch, that seems to be a post -rc1 merge.
> > > > > > > 
> > > > > > > Yes this is required.
> > > > > 
> > > > > > With below patch+previous_fix_weight_calc, the result is improved, but is still
> > > > > > about 10% worse than the one of pure 2.6.26-rc1.
> > > > > > Number of threads: 16
> > > > > >     read/write requests:                 6104336 (50867.11 per sec.)
> > > > > > 
> > > > > 
> > > > > So puzzling - could you try -git + fix_weight_calc, as I think that is
> > > > > what Mike tested and proved good on his Quad.
> > > > 
> > > > If he still has group scheduling enabled, my box agrees that there's a
> > > > regression.  I got mysql running, and tested with group scheduling both
> > > > enabled and disabled, results attached.  (for some reason, I can only
> > > > test up to 64 threads with mysql though)
> > > 
> > > Hm, ok. Does the patch at hand improve or regress the group config for
> > > you?
> > 
> > Oh, you want me to revert it and retest group?
> 
> Yes, I guess that would be what I'm asking ;-)

(yeah yeah, ok, i went dense, results sent;)


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: sysbench+mysql(oltp, readonly) 30% regression with 2.6.26-rc1
  2008-05-09  8:23                         ` Mike Galbraith
@ 2008-05-09  9:47                           ` Mike Galbraith
  0 siblings, 0 replies; 27+ messages in thread
From: Mike Galbraith @ 2008-05-09  9:47 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Zhang, Yanmin, LKML, Ingo Molnar, Dhaval Giani

[-- Attachment #1: Type: text/plain, Size: 22826 bytes --]


On Fri, 2008-05-09 at 10:23 +0200, Mike Galbraith wrote:
> On Fri, 2008-05-09 at 10:02 +0200, Peter Zijlstra wrote:

> > > > Hm, ok. Does the patch at hand improve or regress the group config for
> > > > you?
> > > 
> > > I'm testing git.today.
> > > 
> > > You mean 46151122e0a2e80e5a6b2889f595e371fe2b600d?
> > 
> > Exactly - Yanmin says it makes it worse for him, I'm wondering if you
> > can see a similar problem with group scheduling enabled.
> 
> Nope, same fugly curve +-jitter.

Seems to be smpnice, see 2.6.26.git.grp.n- in attached.

	-Mike

Index: linux-2.6.26.git/include/linux/sched.h
===================================================================
--- linux-2.6.26.git.orig/include/linux/sched.h
+++ linux-2.6.26.git/include/linux/sched.h
@@ -766,7 +766,6 @@ struct sched_domain {
 	struct sched_domain *child;	/* bottom domain must be null terminated */
 	struct sched_group *groups;	/* the balancing groups of the domain */
 	cpumask_t span;			/* span of all CPUs in this domain */
-	int first_cpu;			/* cache of the first cpu in this domain */
 	unsigned long min_interval;	/* Minimum balance interval ms */
 	unsigned long max_interval;	/* Maximum balance interval ms */
 	unsigned int busy_factor;	/* less balancing by factor if busy */
Index: linux-2.6.26.git/kernel/sched.c
===================================================================
--- linux-2.6.26.git.orig/kernel/sched.c
+++ linux-2.6.26.git/kernel/sched.c
@@ -398,43 +398,6 @@ struct cfs_rq {
 	 */
 	struct list_head leaf_cfs_rq_list;
 	struct task_group *tg;	/* group that "owns" this runqueue */
-
-#ifdef CONFIG_SMP
-	unsigned long task_weight;
-	unsigned long shares;
-	/*
-	 * We need space to build a sched_domain wide view of the full task
-	 * group tree, in order to avoid depending on dynamic memory allocation
-	 * during the load balancing we place this in the per cpu task group
-	 * hierarchy. This limits the load balancing to one instance per cpu,
-	 * but more should not be needed anyway.
-	 */
-	struct aggregate_struct {
-		/*
-		 *   load = weight(cpus) * f(tg)
-		 *
-		 * Where f(tg) is the recursive weight fraction assigned to
-		 * this group.
-		 */
-		unsigned long load;
-
-		/*
-		 * part of the group weight distributed to this span.
-		 */
-		unsigned long shares;
-
-		/*
-		 * The sum of all runqueue weights within this span.
-		 */
-		unsigned long rq_weight;
-
-		/*
-		 * Weight contributed by tasks; this is the part we can
-		 * influence by moving tasks around.
-		 */
-		unsigned long task_weight;
-	} aggregate;
-#endif
 #endif
 };
 
@@ -1490,349 +1453,11 @@ static void cpuacct_charge(struct task_s
 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
 #endif
 
-static inline void inc_cpu_load(struct rq *rq, unsigned long load)
-{
-	update_load_add(&rq->load, load);
-}
-
-static inline void dec_cpu_load(struct rq *rq, unsigned long load)
-{
-	update_load_sub(&rq->load, load);
-}
-
 #ifdef CONFIG_SMP
 static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
 static unsigned long cpu_avg_load_per_task(int cpu);
 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-/*
- * Group load balancing.
- *
- * We calculate a few balance domain wide aggregate numbers; load and weight.
- * Given the pictures below, and assuming each item has equal weight:
- *
- *         root          1 - thread
- *         / | \         A - group
- *        A  1  B
- *       /|\   / \
- *      C 2 D 3   4
- *      |   |
- *      5   6
- *
- * load:
- *    A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
- *    which equals 1/9-th of the total load.
- *
- * shares:
- *    The weight of this group on the selected cpus.
- *
- * rq_weight:
- *    Direct sum of all the cpu's their rq weight, e.g. A would get 3 while
- *    B would get 2.
- *
- * task_weight:
- *    Part of the rq_weight contributed by tasks; all groups except B would
- *    get 1, B gets 2.
- */
-
-static inline struct aggregate_struct *
-aggregate(struct task_group *tg, struct sched_domain *sd)
-{
-	return &tg->cfs_rq[sd->first_cpu]->aggregate;
-}
-
-typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
-
-/*
- * Iterate the full tree, calling @down when first entering a node and @up when
- * leaving it for the final time.
- */
-static
-void aggregate_walk_tree(aggregate_func down, aggregate_func up,
-			 struct sched_domain *sd)
-{
-	struct task_group *parent, *child;
-
-	rcu_read_lock();
-	parent = &root_task_group;
-down:
-	(*down)(parent, sd);
-	list_for_each_entry_rcu(child, &parent->children, siblings) {
-		parent = child;
-		goto down;
-
-up:
-		continue;
-	}
-	(*up)(parent, sd);
-
-	child = parent;
-	parent = parent->parent;
-	if (parent)
-		goto up;
-	rcu_read_unlock();
-}
-
-/*
- * Calculate the aggregate runqueue weight.
- */
-static
-void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
-{
-	unsigned long rq_weight = 0;
-	unsigned long task_weight = 0;
-	int i;
-
-	for_each_cpu_mask(i, sd->span) {
-		rq_weight += tg->cfs_rq[i]->load.weight;
-		task_weight += tg->cfs_rq[i]->task_weight;
-	}
-
-	aggregate(tg, sd)->rq_weight = rq_weight;
-	aggregate(tg, sd)->task_weight = task_weight;
-}
-
-/*
- * Compute the weight of this group on the given cpus.
- */
-static
-void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
-{
-	unsigned long shares = 0;
-	int i;
-
-	for_each_cpu_mask(i, sd->span)
-		shares += tg->cfs_rq[i]->shares;
-
-	if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares)
-		shares = tg->shares;
-
-	aggregate(tg, sd)->shares = shares;
-}
-
-/*
- * Compute the load fraction assigned to this group, relies on the aggregate
- * weight and this group's parent's load, i.e. top-down.
- */
-static
-void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
-{
-	unsigned long load;
-
-	if (!tg->parent) {
-		int i;
-
-		load = 0;
-		for_each_cpu_mask(i, sd->span)
-			load += cpu_rq(i)->load.weight;
-
-	} else {
-		load = aggregate(tg->parent, sd)->load;
-
-		/*
-		 * shares is our weight in the parent's rq so
-		 * shares/parent->rq_weight gives our fraction of the load
-		 */
-		load *= aggregate(tg, sd)->shares;
-		load /= aggregate(tg->parent, sd)->rq_weight + 1;
-	}
-
-	aggregate(tg, sd)->load = load;
-}
-
-static void __set_se_shares(struct sched_entity *se, unsigned long shares);
-
-/*
- * Calculate and set the cpu's group shares.
- */
-static void
-__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
-			  int tcpu)
-{
-	int boost = 0;
-	unsigned long shares;
-	unsigned long rq_weight;
-
-	if (!tg->se[tcpu])
-		return;
-
-	rq_weight = tg->cfs_rq[tcpu]->load.weight;
-
-	/*
-	 * If there are currently no tasks on the cpu pretend there is one of
-	 * average load so that when a new task gets to run here it will not
-	 * get delayed by group starvation.
-	 */
-	if (!rq_weight) {
-		boost = 1;
-		rq_weight = NICE_0_LOAD;
-	}
-
-	/*
-	 *           \Sum shares * rq_weight
-	 * shares =  -----------------------
-	 *               \Sum rq_weight
-	 *
-	 */
-	shares = aggregate(tg, sd)->shares * rq_weight;
-	shares /= aggregate(tg, sd)->rq_weight + 1;
-
-	/*
-	 * record the actual number of shares, not the boosted amount.
-	 */
-	tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;
-
-	if (shares < MIN_SHARES)
-		shares = MIN_SHARES;
-	else if (shares > MAX_SHARES)
-		shares = MAX_SHARES;
-
-	__set_se_shares(tg->se[tcpu], shares);
-}
-
-/*
- * Re-adjust the weights on the cpu the task came from and on the cpu the
- * task went to.
- */
-static void
-__move_group_shares(struct task_group *tg, struct sched_domain *sd,
-		    int scpu, int dcpu)
-{
-	unsigned long shares;
-
-	shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
-
-	__update_group_shares_cpu(tg, sd, scpu);
-	__update_group_shares_cpu(tg, sd, dcpu);
-
-	/*
-	 * ensure we never loose shares due to rounding errors in the
-	 * above redistribution.
-	 */
-	shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
-	if (shares)
-		tg->cfs_rq[dcpu]->shares += shares;
-}
-
-/*
- * Because changing a group's shares changes the weight of the super-group
- * we need to walk up the tree and change all shares until we hit the root.
- */
-static void
-move_group_shares(struct task_group *tg, struct sched_domain *sd,
-		  int scpu, int dcpu)
-{
-	while (tg) {
-		__move_group_shares(tg, sd, scpu, dcpu);
-		tg = tg->parent;
-	}
-}
-
-static
-void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
-{
-	unsigned long shares = aggregate(tg, sd)->shares;
-	int i;
-
-	for_each_cpu_mask(i, sd->span) {
-		struct rq *rq = cpu_rq(i);
-		unsigned long flags;
-
-		spin_lock_irqsave(&rq->lock, flags);
-		__update_group_shares_cpu(tg, sd, i);
-		spin_unlock_irqrestore(&rq->lock, flags);
-	}
-
-	aggregate_group_shares(tg, sd);
-
-	/*
-	 * ensure we never loose shares due to rounding errors in the
-	 * above redistribution.
-	 */
-	shares -= aggregate(tg, sd)->shares;
-	if (shares) {
-		tg->cfs_rq[sd->first_cpu]->shares += shares;
-		aggregate(tg, sd)->shares += shares;
-	}
-}
-
-/*
- * Calculate the accumulative weight and recursive load of each task group
- * while walking down the tree.
- */
-static
-void aggregate_get_down(struct task_group *tg, struct sched_domain *sd)
-{
-	aggregate_group_weight(tg, sd);
-	aggregate_group_shares(tg, sd);
-	aggregate_group_load(tg, sd);
-}
-
-/*
- * Rebalance the cpu shares while walking back up the tree.
- */
-static
-void aggregate_get_up(struct task_group *tg, struct sched_domain *sd)
-{
-	aggregate_group_set_shares(tg, sd);
-}
-
-static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
-
-static void __init init_aggregate(void)
-{
-	int i;
-
-	for_each_possible_cpu(i)
-		spin_lock_init(&per_cpu(aggregate_lock, i));
-}
-
-static int get_aggregate(struct sched_domain *sd)
-{
-	if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu)))
-		return 0;
-
-	aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd);
-	return 1;
-}
-
-static void put_aggregate(struct sched_domain *sd)
-{
-	spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu));
-}
-
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
-{
-	cfs_rq->shares = shares;
-}
-
-#else
-
-static inline void init_aggregate(void)
-{
-}
-
-static inline int get_aggregate(struct sched_domain *sd)
-{
-	return 0;
-}
-
-static inline void put_aggregate(struct sched_domain *sd)
-{
-}
-#endif
-
-#else /* CONFIG_SMP */
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
-{
-}
-#endif
-
 #endif /* CONFIG_SMP */
 
 #include "sched_stats.h"
@@ -1845,14 +1470,26 @@ static void cfs_rq_set_shares(struct cfs
 
 #define sched_class_highest (&rt_sched_class)
 
-static void inc_nr_running(struct rq *rq)
+static inline void inc_load(struct rq *rq, const struct task_struct *p)
+{
+	update_load_add(&rq->load, p->se.load.weight);
+}
+
+static inline void dec_load(struct rq *rq, const struct task_struct *p)
+{
+	update_load_sub(&rq->load, p->se.load.weight);
+}
+
+static void inc_nr_running(struct task_struct *p, struct rq *rq)
 {
 	rq->nr_running++;
+	inc_load(rq, p);
 }
 
-static void dec_nr_running(struct rq *rq)
+static void dec_nr_running(struct task_struct *p, struct rq *rq)
 {
 	rq->nr_running--;
+	dec_load(rq, p);
 }
 
 static void set_load_weight(struct task_struct *p)
@@ -1944,7 +1581,7 @@ static void activate_task(struct rq *rq,
 		rq->nr_uninterruptible--;
 
 	enqueue_task(rq, p, wakeup);
-	inc_nr_running(rq);
+	inc_nr_running(p, rq);
 }
 
 /*
@@ -1956,7 +1593,7 @@ static void deactivate_task(struct rq *r
 		rq->nr_uninterruptible++;
 
 	dequeue_task(rq, p, sleep);
-	dec_nr_running(rq);
+	dec_nr_running(p, rq);
 }
 
 /**
@@ -2609,7 +2246,7 @@ void wake_up_new_task(struct task_struct
 		 * management (if any):
 		 */
 		p->sched_class->task_new(rq, p);
-		inc_nr_running(rq);
+		inc_nr_running(p, rq);
 	}
 	check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
@@ -3600,12 +3237,9 @@ static int load_balance(int this_cpu, st
 	unsigned long imbalance;
 	struct rq *busiest;
 	unsigned long flags;
-	int unlock_aggregate;
 
 	cpus_setall(*cpus);
 
-	unlock_aggregate = get_aggregate(sd);
-
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
 	 * sibling can pick up load irrespective of busy siblings. In this case,
@@ -3721,9 +3355,8 @@ redo:
 
 	if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
 	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		ld_moved = -1;
-
-	goto out;
+		return -1;
+	return ld_moved;
 
 out_balanced:
 	schedstat_inc(sd, lb_balanced[idle]);
@@ -3738,13 +3371,8 @@ out_one_pinned:
 
 	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
 	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		ld_moved = -1;
-	else
-		ld_moved = 0;
-out:
-	if (unlock_aggregate)
-		put_aggregate(sd);
-	return ld_moved;
+		return -1;
+	return 0;
 }
 
 /*
@@ -4955,8 +4583,10 @@ void set_user_nice(struct task_struct *p
 		goto out_unlock;
 	}
 	on_rq = p->se.on_rq;
-	if (on_rq)
+	if (on_rq) {
 		dequeue_task(rq, p, 0);
+		dec_load(rq, p);
+	}
 
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
@@ -4966,6 +4596,7 @@ void set_user_nice(struct task_struct *p
 
 	if (on_rq) {
 		enqueue_task(rq, p, 0);
+		inc_load(rq, p);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -7339,7 +6970,6 @@ static int __build_sched_domains(const c
 			SD_INIT(sd, ALLNODES);
 			set_domain_attribute(sd, attr);
 			sd->span = *cpu_map;
-			sd->first_cpu = first_cpu(sd->span);
 			cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
 			p = sd;
 			sd_allnodes = 1;
@@ -7350,7 +6980,6 @@ static int __build_sched_domains(const c
 		SD_INIT(sd, NODE);
 		set_domain_attribute(sd, attr);
 		sched_domain_node_span(cpu_to_node(i), &sd->span);
-		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -7362,7 +6991,6 @@ static int __build_sched_domains(const c
 		SD_INIT(sd, CPU);
 		set_domain_attribute(sd, attr);
 		sd->span = *nodemask;
-		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -7374,7 +7002,6 @@ static int __build_sched_domains(const c
 		SD_INIT(sd, MC);
 		set_domain_attribute(sd, attr);
 		sd->span = cpu_coregroup_map(i);
-		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
@@ -7387,7 +7014,6 @@ static int __build_sched_domains(const c
 		SD_INIT(sd, SIBLING);
 		set_domain_attribute(sd, attr);
 		sd->span = per_cpu(cpu_sibling_map, i);
-		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
@@ -8057,7 +7683,6 @@ void __init sched_init(void)
 	}
 
 #ifdef CONFIG_SMP
-	init_aggregate();
 	init_defrootdomain();
 #endif
 
@@ -8622,11 +8247,14 @@ void sched_move_task(struct task_struct 
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void __set_se_shares(struct sched_entity *se, unsigned long shares)
+static void set_se_shares(struct sched_entity *se, unsigned long shares)
 {
 	struct cfs_rq *cfs_rq = se->cfs_rq;
+	struct rq *rq = cfs_rq->rq;
 	int on_rq;
 
+	spin_lock_irq(&rq->lock);
+
 	on_rq = se->on_rq;
 	if (on_rq)
 		dequeue_entity(cfs_rq, se, 0);
@@ -8636,17 +8264,8 @@ static void __set_se_shares(struct sched
 
 	if (on_rq)
 		enqueue_entity(cfs_rq, se, 0);
-}
 
-static void set_se_shares(struct sched_entity *se, unsigned long shares)
-{
-	struct cfs_rq *cfs_rq = se->cfs_rq;
-	struct rq *rq = cfs_rq->rq;
-	unsigned long flags;
-
-	spin_lock_irqsave(&rq->lock, flags);
-	__set_se_shares(se, shares);
-	spin_unlock_irqrestore(&rq->lock, flags);
+	spin_unlock_irq(&rq->lock);
 }
 
 static DEFINE_MUTEX(shares_mutex);
@@ -8685,13 +8304,8 @@ int sched_group_set_shares(struct task_g
 	 * w/o tripping rebalance_share or load_balance_fair.
 	 */
 	tg->shares = shares;
-	for_each_possible_cpu(i) {
-		/*
-		 * force a rebalance
-		 */
-		cfs_rq_set_shares(tg->cfs_rq[i], 0);
+	for_each_possible_cpu(i)
 		set_se_shares(tg->se[i], shares);
-	}
 
 	/*
 	 * Enable load balance activity on this group, by inserting it back on
Index: linux-2.6.26.git/kernel/sched_fair.c
===================================================================
--- linux-2.6.26.git.orig/kernel/sched_fair.c
+++ linux-2.6.26.git/kernel/sched_fair.c
@@ -541,27 +541,10 @@ update_stats_curr_start(struct cfs_rq *c
  * Scheduling class queueing methods:
  */
 
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
-static void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
-{
-	cfs_rq->task_weight += weight;
-}
-#else
-static inline void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
-{
-}
-#endif
-
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_add(&cfs_rq->load, se->load.weight);
-	if (!parent_entity(se))
-		inc_cpu_load(rq_of(cfs_rq), se->load.weight);
-	if (entity_is_task(se))
-		add_cfs_task_weight(cfs_rq, se->load.weight);
 	cfs_rq->nr_running++;
 	se->on_rq = 1;
 	list_add(&se->group_node, &cfs_rq->tasks);
@@ -571,10 +554,6 @@ static void
 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_sub(&cfs_rq->load, se->load.weight);
-	if (!parent_entity(se))
-		dec_cpu_load(rq_of(cfs_rq), se->load.weight);
-	if (entity_is_task(se))
-		add_cfs_task_weight(cfs_rq, -se->load.weight);
 	cfs_rq->nr_running--;
 	se->on_rq = 0;
 	list_del_init(&se->group_node);
@@ -1366,90 +1345,75 @@ static struct task_struct *load_balance_
 	return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
 }
 
-static unsigned long
-__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
-		unsigned long max_load_move, struct sched_domain *sd,
-		enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
-		struct cfs_rq *cfs_rq)
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
 {
-	struct rq_iterator cfs_rq_iterator;
+	struct sched_entity *curr;
+	struct task_struct *p;
 
-	cfs_rq_iterator.start = load_balance_start_fair;
-	cfs_rq_iterator.next = load_balance_next_fair;
-	cfs_rq_iterator.arg = cfs_rq;
+	if (!cfs_rq->nr_running || !first_fair(cfs_rq))
+		return MAX_PRIO;
+
+	curr = cfs_rq->curr;
+	if (!curr)
+		curr = __pick_next_entity(cfs_rq);
 
-	return balance_tasks(this_rq, this_cpu, busiest,
-			max_load_move, sd, idle, all_pinned,
-			this_best_prio, &cfs_rq_iterator);
+	p = task_of(curr);
+
+	return p->prio;
 }
+#endif
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
 		  struct sched_domain *sd, enum cpu_idle_type idle,
 		  int *all_pinned, int *this_best_prio)
 {
+	struct cfs_rq *busy_cfs_rq;
 	long rem_load_move = max_load_move;
-	int busiest_cpu = cpu_of(busiest);
-	struct task_group *tg;
-
-	rcu_read_lock();
-	list_for_each_entry(tg, &task_groups, list) {
-		long imbalance;
-		unsigned long this_weight, busiest_weight;
-		long rem_load, max_load, moved_load;
-
-		/*
-		 * empty group
-		 */
-		if (!aggregate(tg, sd)->task_weight)
-			continue;
-
-		rem_load = rem_load_move * aggregate(tg, sd)->rq_weight;
-		rem_load /= aggregate(tg, sd)->load + 1;
-
-		this_weight = tg->cfs_rq[this_cpu]->task_weight;
-		busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
+	struct rq_iterator cfs_rq_iterator;
 
-		imbalance = (busiest_weight - this_weight) / 2;
+	cfs_rq_iterator.start = load_balance_start_fair;
+	cfs_rq_iterator.next = load_balance_next_fair;
 
-		if (imbalance < 0)
-			imbalance = busiest_weight;
+	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+		struct cfs_rq *this_cfs_rq;
+		long imbalance;
+		unsigned long maxload;
 
-		max_load = max(rem_load, imbalance);
-		moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
-				max_load, sd, idle, all_pinned, this_best_prio,
-				tg->cfs_rq[busiest_cpu]);
+		this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
 
-		if (!moved_load)
+		imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
+		/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
+		if (imbalance <= 0)
 			continue;
 
-		move_group_shares(tg, sd, busiest_cpu, this_cpu);
+		/* Don't pull more than imbalance/2 */
+		imbalance /= 2;
+		maxload = min(rem_load_move, imbalance);
 
-		moved_load *= aggregate(tg, sd)->load;
-		moved_load /= aggregate(tg, sd)->rq_weight + 1;
+		*this_best_prio = cfs_rq_best_prio(this_cfs_rq);
+#else
+# define maxload rem_load_move
+#endif
+		/*
+		 * pass busy_cfs_rq argument into
+		 * load_balance_[start|next]_fair iterators
+		 */
+		cfs_rq_iterator.arg = busy_cfs_rq;
+		rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
+					       maxload, sd, idle, all_pinned,
+					       this_best_prio,
+					       &cfs_rq_iterator);
 
-		rem_load_move -= moved_load;
-		if (rem_load_move < 0)
+		if (rem_load_move <= 0)
 			break;
 	}
-	rcu_read_unlock();
 
 	return max_load_move - rem_load_move;
 }
-#else
-static unsigned long
-load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
-		  unsigned long max_load_move,
-		  struct sched_domain *sd, enum cpu_idle_type idle,
-		  int *all_pinned, int *this_best_prio)
-{
-	return __load_balance_fair(this_rq, this_cpu, busiest,
-			max_load_move, sd, idle, all_pinned,
-			this_best_prio, &busiest->cfs);
-}
-#endif
 
 static int
 move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
Index: linux-2.6.26.git/kernel/sched_rt.c
===================================================================
--- linux-2.6.26.git.orig/kernel/sched_rt.c
+++ linux-2.6.26.git/kernel/sched_rt.c
@@ -513,8 +513,6 @@ static void enqueue_task_rt(struct rq *r
 	 */
 	for_each_sched_rt_entity(rt_se)
 		enqueue_rt_entity(rt_se);
-
-	inc_cpu_load(rq, p->se.load.weight);
 }
 
 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -534,8 +532,6 @@ static void dequeue_task_rt(struct rq *r
 		if (rt_rq && rt_rq->rt_nr_running)
 			enqueue_rt_entity(rt_se);
 	}
-
-	dec_cpu_load(rq, p->se.load.weight);
 }
 
 /*
Index: linux-2.6.26.git/kernel/sched_debug.c
===================================================================
--- linux-2.6.26.git.orig/kernel/sched_debug.c
+++ linux-2.6.26.git/kernel/sched_debug.c
@@ -169,7 +169,7 @@ void print_cfs_rq(struct seq_file *m, in
 			cfs_rq->nr_spread_over);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 #ifdef CONFIG_SMP
-	SEQ_printf(m, "  .%-30s: %lu\n", "shares", cfs_rq->shares);
+	SEQ_printf(m, "  .%-30s: %lu\n", "shares", cfs_rq->tg->shares);
 #endif
 #endif
 }


[-- Attachment #2: yy.pdf --]
[-- Type: application/pdf, Size: 17099 bytes --]

^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2008-05-09  9:48 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-05-07  4:55 sysbench+mysql(oltp, readonly) 30% regression with 2.6.26-rc1 Zhang, Yanmin
2008-05-07  9:16 ` Ingo Molnar
2008-05-07  9:33   ` Zhang, Yanmin
2008-05-07 10:40     ` Ingo Molnar
2008-05-07 16:26 ` Peter Zijlstra
2008-05-08  6:35   ` Zhang, Yanmin
2008-05-08  8:00     ` Mike Galbraith
2008-05-08  8:46       ` Ingo Molnar
2008-05-08  9:25         ` Ingo Molnar
2008-05-08 10:00           ` Mike Galbraith
2008-05-08  9:37         ` Mike Galbraith
2008-05-08  9:01       ` Zhang, Yanmin
2008-05-08  9:13         ` Peter Zijlstra
2008-05-08  9:15         ` Mike Galbraith
2008-05-08  9:22           ` Zhang, Yanmin
2008-05-08  9:23           ` Peter Zijlstra
2008-05-09  1:16             ` Zhang, Yanmin
2008-05-09  6:51               ` Peter Zijlstra
2008-05-09  7:32                 ` Mike Galbraith
2008-05-09  7:50                   ` Peter Zijlstra
2008-05-09  7:58                     ` Mike Galbraith
2008-05-09  8:02                       ` Peter Zijlstra
2008-05-09  8:23                         ` Mike Galbraith
2008-05-09  9:47                           ` Mike Galbraith
2008-05-09  7:59                     ` Mike Galbraith
2008-05-09  8:10                       ` Peter Zijlstra
2008-05-09  8:25                         ` Mike Galbraith

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox