public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] kernel <linux-2.6.11.10> kernel/sched.c
@ 2005-05-19 16:56 chen Shang
  2005-05-20  3:26 ` Nick Piggin
  0 siblings, 1 reply; 15+ messages in thread
From: chen Shang @ 2005-05-19 16:56 UTC (permalink / raw)
  To: linux-kernel, rml; +Cc: shangcs

Given the frequency with which schedule() is called, there is room to
improve it. In the task-priority recalculation step, the scheduler
first dequeues the task from one priority queue, calls
recalc_task_prio(), then enqueues the task into another priority
queue. However, statistics show that only around 0.5% of
recalculations change the task priority (see below). Since the
remaining 99.5% of recalculations do not change the priority, it is
reasonable to use requeue_task() to avoid the overhead of dequeue and
enqueue on the same priority queue.

The patch implements the above idea. Note that a new helper function,
change_queue_task(), combines dequeue() and enqueue() to save one
function-call overhead. Two statistics fields, sched_prio_changed and
sched_prio_unchanged, are added to provide statistical data on
priority recalculation.

Thanks,

chen
shangcs@gmail.com

/*===== Statistics ===== */
The statistics are based on an Intel x86 machine with 2 Xeon 1.8GHz
processors with hyperthreading enabled.
	Prio_unchanged	prio_changed	sched_cnt
CPU0	109			22743			59123 CPU1	120			23733			60407
CPU2	73			29981			86153 CPU3	96			22050			53094

/*===== Patch <linux-2.6.11.10> kernel/sched.c =====*/
--- linux-2.6.11.10.orig/kernel/sched.c	2005-05-16 10:51:53.000000000 -0700
+++ linux-2.6.11.10/kernel/sched.c	2005-05-18 22:31:32.000000000 -0700
@@ -249,6 +249,8 @@
 	unsigned long sched_noswitch;
 	unsigned long sched_switch;
 	unsigned long sched_cnt;
+	unsigned long sched_prio_changed;
+	unsigned long sched_prio_unchanged;
 	unsigned long sched_goidle;
 
 	/* pull_task() stats */
@@ -347,12 +349,20 @@
 
 		/* runqueue-specific stats */
 		seq_printf(seq,
-		    "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu "
-		    "%lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
+		    "cpu%d \n\tyld: both(%lu) act(%lu) both(%lu) cnt(%lu) "
+			"\n\tsched: noswitch(%lu) switch(%lu) "
+			"\n\t\t cnt(%lu) prio_changed(%lu) prio_unchanged(%lu)"
+			"\n\t\t goidle(%lu) "
+			"\n\talb: cnt(%lu) gained(%lu) lost(%lu) failed(%lu) "
+		    "\n\tttwu:cnt(%lu) moved(%lu) attempts(%lu) "
+			"\n\twunt: cnt(%lu) moved(%lu) "
+			"\n\tsmt: cnt(%lu) \n\tsbe: cnt(%lu) "
+			"\n\trq_sched_info: cpu_time(%lu) run_delay(%lu) pcnt(%lu)",
 		    cpu, rq->yld_both_empty,
 		    rq->yld_act_empty, rq->yld_exp_empty,
 		    rq->yld_cnt, rq->sched_noswitch,
-		    rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
+		    rq->sched_switch, rq->sched_cnt, rq->sched_prio_changed, 
+			rq->sched_prio_unchanged, rq->sched_goidle,
 		    rq->alb_cnt, rq->alb_gained, rq->alb_lost,
 		    rq->alb_failed,
 		    rq->ttwu_cnt, rq->ttwu_moved, rq->ttwu_attempts,
@@ -374,14 +384,14 @@
 			seq_printf(seq, "domain%d %s", dcnt++, mask_str);
 			for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
 						itype++) {
-				seq_printf(seq, " %lu %lu %lu %lu %lu",
+				seq_printf(seq, "lb: cnt(%lu) failed(%lu) imbl(%lu) nobzq(%lu) %lu",
 				    sd->lb_cnt[itype],
 				    sd->lb_failed[itype],
 				    sd->lb_imbalance[itype],
 				    sd->lb_nobusyq[itype],
 				    sd->lb_nobusyg[itype]);
 			}
-			seq_printf(seq, " %lu %lu %lu %lu\n",
+			seq_printf(seq, "sbe: pushed(%lu) attempts(%lu) %lu %lu\n",
 			    sd->sbe_pushed, sd->sbe_attempts,
 			    sd->ttwu_wake_affine, sd->ttwu_wake_balance);
 		}
@@ -580,6 +590,18 @@
 	p->array = array;
 }
 
+static void change_queue_task(struct task_struct *p, prio_array_t *array, 
+							int old_prio)
+{
+	list_del(&p->run_list);
+	if (list_empty(array->queue + old_prio))
+		__clear_bit(old_prio, array->bitmap);
+	
+	sched_info_queued(p);
+	list_add_tail(&p->run_list, array->queue + p->prio);
+	__set_bit(p->prio, array->bitmap);
+	p->array = array;
+}
 /*
  * Put task to the end of the run list without the overhead of dequeue
  * followed by enqueue.
@@ -2668,7 +2690,7 @@
 	struct list_head *queue;
 	unsigned long long now;
 	unsigned long run_time;
-	int cpu, idx;
+	int cpu, idx, prio;
 
 	/*
 	 * Test if we are atomic.  Since do_exit() needs to call into
@@ -2787,9 +2809,19 @@
 			delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
 
 		array = next->array;
-		dequeue_task(next, array);
+		prio = next->prio;
 		recalc_task_prio(next, next->timestamp + delta);
-		enqueue_task(next, array);
+		
+		if (unlikely(prio != next->prio))
+		{
+			change_queue_task(next, array, prio);
+			schedstat_inc(rq, sched_prio_changed);
+		}
+		else
+		{
+			requeue_task(next, array);
+			schedstat_inc(rq, sched_prio_unchanged);
+		}
 	}
 	next->activated = 0;
 switch_tasks:

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kernel <linux-2.6.11.10> kernel/sched.c
  2005-05-19 16:56 [PATCH] kernel <linux-2.6.11.10> kernel/sched.c chen Shang
@ 2005-05-20  3:26 ` Nick Piggin
  2005-05-20  4:17   ` chen Shang
  0 siblings, 1 reply; 15+ messages in thread
From: Nick Piggin @ 2005-05-20  3:26 UTC (permalink / raw)
  To: chen Shang; +Cc: linux-kernel, rml

chen Shang wrote:

>Given the frequency of schedule() calling, there is a margin to
>improve it. In step of recalculate task priority, it first dequeues
>from one priority queue, calls recalc_task_prio(), then enqueue the
>task into another priority queue. However, statistics shows only
>around 0.5% of recalculation changed task priority (see below). While
>rest of 99.5% of recalculation do not change priority, it is
>reasonably to use requeue_task() to avoid overhead of dequeue and
>enqueue on the same priority queue.
>
>The patch is implemented with above idea. Note, a new help function,
>change_queue_task(), to combine dequeue() and enqueue() to reduce one
>function call overhead. Two statistics fields, sched_prio_changed and
>sched_prio_unchanged, are added to provide statistic data on priority
>recalculation.
>
>Thanks,
>
>chen
>shangcs@gmail.com
>

Hi Chen,
With the added branch and the extra icache footprint, it isn't clear
that this would be a win.

Also, you didn't say where your statistics came from (what workload).

So you really need to start by demonstrating some increase on some workload.

Also, minor comments on the patch: please work against mm kernels, 
please follow
kernel coding style, and don't change schedstat output format in the 
same patch
(makes it easier for those with schedstat parsing tools).

> 
>+static void change_queue_task(struct task_struct *p, prio_array_t *array, 
>+							int old_prio)
>+{
>+	list_del(&p->run_list);
>+	if (list_empty(array->queue + old_prio))
>+		__clear_bit(old_prio, array->bitmap);
>+	
>+	sched_info_queued(p);
>+	list_add_tail(&p->run_list, array->queue + p->prio);
>+	__set_bit(p->prio, array->bitmap);
>+	p->array = array;
>+}
> /*
>  * Put task to the end of the run list without the overhead of dequeue
>  * followed by enqueue.
>@@ -2668,7 +2690,7 @@
> 	struct list_head *queue;
> 	unsigned long long now;
> 	unsigned long run_time;
>-	int cpu, idx;
>+	int cpu, idx, prio;
> 
> 	/*
> 	 * Test if we are atomic.  Since do_exit() needs to call into
>@@ -2787,9 +2809,19 @@
> 			delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
> 
> 		array = next->array;
>-		dequeue_task(next, array);
>+		prio = next->prio;
> 		recalc_task_prio(next, next->timestamp + delta);
>-		enqueue_task(next, array);
>+		
>+		if (unlikely(prio != next->prio))
>+		{
>+			change_queue_task(next, array, prio);
>+			schedstat_inc(rq, sched_prio_changed);
>+		}
>+		else
>+		{
>+			requeue_task(next, array);
>+			schedstat_inc(rq, sched_prio_unchanged);
>+		}
> 	}
> 	next->activated = 0;
> switch_tasks:
>-
>To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>the body of a message to majordomo@vger.kernel.org
>More majordomo info at  http://vger.kernel.org/majordomo-info.html
>Please read the FAQ at  http://www.tux.org/lkml/
>
>



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kernel <linux-2.6.11.10> kernel/sched.c
  2005-05-20  3:26 ` Nick Piggin
@ 2005-05-20  4:17   ` chen Shang
  2005-05-20  4:32     ` Lee Revell
  2005-05-20  5:13     ` Nick Piggin
  0 siblings, 2 replies; 15+ messages in thread
From: chen Shang @ 2005-05-20  4:17 UTC (permalink / raw)
  To: Nick Piggin; +Cc: linux-kernel

> Hi Chen,
> With the added branch and the extra icache footprint, it isn't clear
> that this would be a win.
> 
> Also, you didn't say where your statistics came from (what workload).
> 
> So you really need to start by demonstrating some increase on some workload.
> 
> Also, minor comments on the patch: please work against mm kernels,
> please follow
> kernel coding style, and don't change schedstat output format in the
> same patch
> (makes it easier for those with schedstat parsing tools).
> 
Hi Nick,

Thank you very much for your comments. This is my first time doing
kernel hacking. I will reduce the number of changed lines as much as
possible. As regards the statistics, they are just counts, i.e., the
total number of priority recalculations vs. the number of times the
priority changed after recalculation.

Thanks again,
-chen

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kernel <linux-2.6.11.10> kernel/sched.c
  2005-05-20  4:17   ` chen Shang
@ 2005-05-20  4:32     ` Lee Revell
  2005-05-20  5:13     ` Nick Piggin
  1 sibling, 0 replies; 15+ messages in thread
From: Lee Revell @ 2005-05-20  4:32 UTC (permalink / raw)
  To: chen Shang; +Cc: Nick Piggin, linux-kernel

On Thu, 2005-05-19 at 21:17 -0700, chen Shang wrote:
> > Hi Chen,
> > With the added branch and the extra icache footprint, it isn't clear
> > that this would be a win.
> > 
> > Also, you didn't say where your statistics came from (what workload).
> > 
> > So you really need to start by demonstrating some increase on some workload.
> > 
> > Also, minor comments on the patch: please work against mm kernels,
> > please follow
> > kernel coding style, and don't change schedstat output format in the
> > same patch
> > (makes it easier for those with schedstat parsing tools).
> > 
> Hi Nick,
> 
> Thank you very much for your comments. This is the first time of my
> kernel hacking. I will reduce the lines of changes as much as
> possible. As regard to the statistics, there are just count, ie, the
> total number of priority-recalculations vs. the number of priority
> changed from the former recalculation.

A kernel profile (check list archives for oprofile) would easily
demonstrate any performance gain.  On my system the residency of
schedule() is around 1% so this will be easy to spot.

Lee


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kernel <linux-2.6.11.10> kernel/sched.c
  2005-05-20  4:17   ` chen Shang
  2005-05-20  4:32     ` Lee Revell
@ 2005-05-20  5:13     ` Nick Piggin
  2005-05-20  7:12       ` chen Shang
  1 sibling, 1 reply; 15+ messages in thread
From: Nick Piggin @ 2005-05-20  5:13 UTC (permalink / raw)
  To: chen Shang; +Cc: linux-kernel

chen Shang wrote:

>>Hi Chen,
>>With the added branch and the extra icache footprint, it isn't clear
>>that this would be a win.
>>
>>Also, you didn't say where your statistics came from (what workload).
>>
>>So you really need to start by demonstrating some increase on some workload.
>>
>>Also, minor comments on the patch: please work against mm kernels,
>>please follow
>>kernel coding style, and don't change schedstat output format in the
>>same patch
>>(makes it easier for those with schedstat parsing tools).
>>
>>
>Hi Nick,
>
>Thank you very much for your comments. This is the first time of my
>kernel hacking. I will reduce the lines of changes as much as
>possible. As regard to the statistics, there are just count, ie, the
>total number of priority-recalculations vs. the number of priority
>changed from the former recalculation.
>
>

The only problem there is that changing the format will break
at least my schedstats parser.

Seeing as it is not some functional change to the scheduler,
your patch should not change schedstats. If you are *also*
interested in the priority changed vs unchanged statistic, then
you can do a patch for that seperately (ie. it doesn't depend
on your optimisation in question).

Thanks,
Nick



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kernel <linux-2.6.11.10> kernel/sched.c
  2005-05-20  5:13     ` Nick Piggin
@ 2005-05-20  7:12       ` chen Shang
  2005-05-20  7:21         ` Nick Piggin
  2005-05-20  9:49         ` Ingo Molnar
  0 siblings, 2 replies; 15+ messages in thread
From: chen Shang @ 2005-05-20  7:12 UTC (permalink / raw)
  To: Nick Piggin; +Cc: linux-kernel, rml

I minimized my patch and against to 2.6.12-rc4 this time, see below.

The new schedstat fields were for testing purposes only, so I removed
them completely from the patch. Theoretically, requeue_task() is always
cheaper than dequeue_task() followed by enqueue_task(). So, if 99% of
priority recalculations trigger requeue_task(), it will save time.

In addition, my load was building the kernel, which took around 30
minutes with around 30% CPU usage on 2x2 processors (dual processors
with HT enabled).
Here is the statistics:
         
CPU0: priority_changed (669 times), priority_unchanged(335,138 times)
CPU1: priority_changed (784 times), priority_unchanged(342,419 times)
CPU2: priority_changed (782 times), priority_unchanged(283,494 times)
CPU3: priority_changed (872 times), priority_unchanged(365,865 times)

Thanks,
-chen


/*=====Patch=====*/
--- linux-2.6.12-rc4.orig/kernel/sched.c	2005-05-19 14:57:55.000000000 -0700
+++ linux-2.6.12-rc4/kernel/sched.c	2005-05-19 23:47:22.000000000 -0700
@@ -2613,7 +2613,7 @@
 	struct list_head *queue;
 	unsigned long long now;
 	unsigned long run_time;
-	int cpu, idx;
+	int cpu, idx, prio;
 
 	/*
 	 * Test if we are atomic.  Since do_exit() needs to call into
@@ -2735,9 +2735,17 @@
 			delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
 
 		array = next->array;
-		dequeue_task(next, array);
+		prio = next->prio;
+		
 		recalc_task_prio(next, next->timestamp + delta);
-		enqueue_task(next, array);
+
+		if (unlikely(prio != next->prio))
+		{
+			dequeue_task(next, array);
+			enqueue_task(next, array);
+		}
+		else
+			requeue_task(next, array);
 	}
 	next->activated = 0;
 switch_tasks:

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kernel <linux-2.6.11.10> kernel/sched.c
  2005-05-20  7:12       ` chen Shang
@ 2005-05-20  7:21         ` Nick Piggin
  2005-05-20  7:36           ` Con Kolivas
  2005-05-20  9:49         ` Ingo Molnar
  1 sibling, 1 reply; 15+ messages in thread
From: Nick Piggin @ 2005-05-20  7:21 UTC (permalink / raw)
  To: chen Shang; +Cc: linux-kernel, rml

chen Shang wrote:

>I minimized my patch and against to 2.6.12-rc4 this time, see below.
>
>The new schedstat fields are for the test propose only, so I removed
>them completedly from patch. Theoritically, requeue_task() is always
>cheaper than dequeue_task() followed by enqueue_task(). So, if 99% of
>priority recalculation trigger requeue_task(), it will save.
>
>In addition, my load is to build the kernel, which took around 30
>minutes with around 30% CPU usage on 2x2 processors (duel processors
>with HT enable).
>Here is the statistics:
>         
>CPU0: priority_changed (669 times), priority_unchanged(335,138 times)
>CPU1: priority_changed (784 times), priority_unchanged(342,419 times)
>CPU2: priority_changed (782 times), priority_unchanged(283,494 times)
>CPU3: priority_changed (872 times), priority_unchanged(365,865 times)
>
>

OK that gives you a good grounds to look at the patch, but _performance_
improvement is what is needed to get it included.

>Thanks,
>-chen
>
>
>/*=====Patch=====*/
>--- linux-2.6.12-rc4.orig/kernel/sched.c	2005-05-19 14:57:55.000000000 -0700
>+++ linux-2.6.12-rc4/kernel/sched.c	2005-05-19 23:47:22.000000000 -0700
>@@ -2613,7 +2613,7 @@
> 	struct list_head *queue;
> 	unsigned long long now;
> 	unsigned long run_time;
>-	int cpu, idx;
>+	int cpu, idx, prio;
> 
> 	/*
> 	 * Test if we are atomic.  Since do_exit() needs to call into
>@@ -2735,9 +2735,17 @@
> 			delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
> 
> 		array = next->array;
>-		dequeue_task(next, array);
>+		prio = next->prio;
>+		
> 		recalc_task_prio(next, next->timestamp + delta);
>-		enqueue_task(next, array);
>+
>+		if (unlikely(prio != next->prio))
>+		{
>+			dequeue_task(next, array);
>+			enqueue_task(next, array);
>+		}
>+		else
>+			requeue_task(next, array);
>

Coding style says
if (unlikely()) {
    ...
} else
    ...



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kernel <linux-2.6.11.10> kernel/sched.c
  2005-05-20  7:21         ` Nick Piggin
@ 2005-05-20  7:36           ` Con Kolivas
  2005-05-20 13:41             ` chen Shang
  0 siblings, 1 reply; 15+ messages in thread
From: Con Kolivas @ 2005-05-20  7:36 UTC (permalink / raw)
  To: Nick Piggin; +Cc: chen Shang, linux-kernel, rml

[-- Attachment #1: Type: text/plain, Size: 1417 bytes --]

On Fri, 20 May 2005 17:21, Nick Piggin wrote:
> chen Shang wrote:
> >I minimized my patch and against to 2.6.12-rc4 this time, see below.
> >
> >The new schedstat fields are for the test propose only, so I removed
> >them completedly from patch. Theoritically, requeue_task() is always
> >cheaper than dequeue_task() followed by enqueue_task(). So, if 99% of
> >priority recalculation trigger requeue_task(), it will save.
> >
> >In addition, my load is to build the kernel, which took around 30
> >minutes with around 30% CPU usage on 2x2 processors (duel processors
> >with HT enable).
> >Here is the statistics:
> >
> >CPU0: priority_changed (669 times), priority_unchanged(335,138 times)
> >CPU1: priority_changed (784 times), priority_unchanged(342,419 times)
> >CPU2: priority_changed (782 times), priority_unchanged(283,494 times)
> >CPU3: priority_changed (872 times), priority_unchanged(365,865 times)
>
> OK that gives you a good grounds to look at the patch, but _performance_
> improvement is what is needed to get it included.

If you end up using requeue_task() in the fast path and it is hit frequently 
with your code you'll need to modify requeue_task to be inline as well. 
Currently it is hit only via sched_yield and once every 10 scheduler ticks 
which is why it is not inline. The performance hit will be demonstrable if it 
is hit in every schedule()

Cheers,
Con

[-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kernel <linux-2.6.11.10> kernel/sched.c
  2005-05-20  7:12       ` chen Shang
  2005-05-20  7:21         ` Nick Piggin
@ 2005-05-20  9:49         ` Ingo Molnar
  2005-05-20 10:40           ` Con Kolivas
  1 sibling, 1 reply; 15+ messages in thread
From: Ingo Molnar @ 2005-05-20  9:49 UTC (permalink / raw)
  To: chen Shang; +Cc: Nick Piggin, linux-kernel, rml, Andrew Morton


* chen Shang <shangcs@gmail.com> wrote:

> I minimized my patch and against to 2.6.12-rc4 this time, see below.

looks good - i've done some small style/whitespace cleanups and renamed 
prio to old_prio, patch against -rc4 below.

	Ingo

From: Chen Shang <shangcs@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

--- linux/kernel/sched.c.orig
+++ linux/kernel/sched.c
@@ -2613,7 +2613,7 @@ asmlinkage void __sched schedule(void)
 	struct list_head *queue;
 	unsigned long long now;
 	unsigned long run_time;
-	int cpu, idx;
+	int cpu, idx, old_prio;
 
 	/*
 	 * Test if we are atomic.  Since do_exit() needs to call into
@@ -2735,9 +2735,15 @@ go_idle:
 			delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
 
 		array = next->array;
-		dequeue_task(next, array);
+		old_prio = next->prio;
+
 		recalc_task_prio(next, next->timestamp + delta);
-		enqueue_task(next, array);
+
+		if (unlikely(old_prio != next->prio)) {
+			dequeue_task(next, array);
+			enqueue_task(next, array);
+		} else
+			requeue_task(next, array);
 	}
 	next->activated = 0;
 switch_tasks:

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kernel <linux-2.6.11.10> kernel/sched.c
  2005-05-20  9:49         ` Ingo Molnar
@ 2005-05-20 10:40           ` Con Kolivas
  2005-05-20 11:34             ` Ingo Molnar
  0 siblings, 1 reply; 15+ messages in thread
From: Con Kolivas @ 2005-05-20 10:40 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: chen Shang, Nick Piggin, linux-kernel, rml, Andrew Morton


[-- Attachment #1.1: Type: text/plain, Size: 333 bytes --]

On Fri, 20 May 2005 19:49, Ingo Molnar wrote:
> * chen Shang <shangcs@gmail.com> wrote:
> > I minimized my patch and against to 2.6.12-rc4 this time, see below.
>
> looks good - i've done some small style/whitespace cleanups and renamed
> prio to old_prio, patch against -rc4 below.

We should inline requeue_task as well.

Con
----

[-- Attachment #1.2: inline_requeue_task.diff --]
[-- Type: text/x-diff, Size: 810 bytes --]

Putting requeue_task into the common fast path code in schedule() will
benefit generically from inlining the requeue_task function.

Signed-off-by: Con Kolivas <kernel@kolivas.org>

Index: linux-2.6.12-rc4/kernel/sched.c
===================================================================
--- linux-2.6.12-rc4.orig/kernel/sched.c	2005-05-20 20:28:29.000000000 +1000
+++ linux-2.6.12-rc4/kernel/sched.c	2005-05-20 20:28:55.000000000 +1000
@@ -560,7 +560,7 @@ static void enqueue_task(struct task_str
  * Put task to the end of the run list without the overhead of dequeue
  * followed by enqueue.
  */
-static void requeue_task(struct task_struct *p, prio_array_t *array)
+static inline void requeue_task(struct task_struct *p, prio_array_t *array)
 {
 	list_move_tail(&p->run_list, array->queue + p->prio);
 }

[-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kernel <linux-2.6.11.10> kernel/sched.c
  2005-05-20 10:40           ` Con Kolivas
@ 2005-05-20 11:34             ` Ingo Molnar
  2005-05-22  4:41               ` Chen Shang
  0 siblings, 1 reply; 15+ messages in thread
From: Ingo Molnar @ 2005-05-20 11:34 UTC (permalink / raw)
  To: Con Kolivas; +Cc: chen Shang, Nick Piggin, linux-kernel, rml, Andrew Morton


* Con Kolivas <kernel@kolivas.org> wrote:

> On Fri, 20 May 2005 19:49, Ingo Molnar wrote:
> > * chen Shang <shangcs@gmail.com> wrote:
> > > I minimized my patch and against to 2.6.12-rc4 this time, see below.
> >
> > looks good - i've done some small style/whitespace cleanups and renamed
> > prio to old_prio, patch against -rc4 below.
> 
> We should inline requeue_task as well.

yeah.

Acked-by: Ingo Molnar <mingo@elte.hu>

	Ingo

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kernel <linux-2.6.11.10> kernel/sched.c
  2005-05-20  7:36           ` Con Kolivas
@ 2005-05-20 13:41             ` chen Shang
  0 siblings, 0 replies; 15+ messages in thread
From: chen Shang @ 2005-05-20 13:41 UTC (permalink / raw)
  To: Con Kolivas; +Cc: Nick Piggin, linux-kernel, rml

Here are the statistics supporting inlining requeue_task(), with the
sched_cnt counter over the same period. In brief, every 10 schedule()
calls trigger recalc_task_prio() a little more than 4 times.

CPU0: priority_changed (669 times), priority_unchanged(335,138 times),
schedule_cnt(787,085 times)
CPU1: priority_changed (784 times), priority_unchanged(342,419 times),
schedule_cnt(784,873 times)
CPU2: priority_changed (782 times), priority_unchanged(283,494 times),
schedule_cnt(681,917 times)
CPU3: priority_changed (872 times), priority_unchanged(365,865 times),
schedule_cnt(809,624 times)

On 5/20/05, Con Kolivas <kernel@kolivas.org> wrote:
> On Fri, 20 May 2005 17:21, Nick Piggin wrote:
> > chen Shang wrote:
> > >I minimized my patch and against to 2.6.12-rc4 this time, see below.
> > >
> > >The new schedstat fields are for the test propose only, so I removed
> > >them completedly from patch. Theoritically, requeue_task() is always
> > >cheaper than dequeue_task() followed by enqueue_task(). So, if 99% of
> > >priority recalculation trigger requeue_task(), it will save.
> > >
> > >In addition, my load is to build the kernel, which took around 30
> > >minutes with around 30% CPU usage on 2x2 processors (duel processors
> > >with HT enable).
> > >Here is the statistics:
> > >
> > >CPU0: priority_changed (669 times), priority_unchanged(335,138 times)
> > >CPU1: priority_changed (784 times), priority_unchanged(342,419 times)
> > >CPU2: priority_changed (782 times), priority_unchanged(283,494 times)
> > >CPU3: priority_changed (872 times), priority_unchanged(365,865 times)
> >
> > OK that gives you a good grounds to look at the patch, but _performance_
> > improvement is what is needed to get it included.
> 
> If you end up using requeue_task() in the fast path and it is hit frequently
> with your code you'll need to modify requeue_task to be inline as well.
> Currently it is hit only via sched_yield and once every 10 scheduler ticks
> which is why it is not inline. The performance hit will be demonstrable if it
> is hit in every schedule()
> 
> Cheers,
> Con
> 
> 
>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kernel <linux-2.6.11.10> kernel/sched.c
  2005-05-20 11:34             ` Ingo Molnar
@ 2005-05-22  4:41               ` Chen Shang
  2005-05-23  7:11                 ` Ingo Molnar
  0 siblings, 1 reply; 15+ messages in thread
From: Chen Shang @ 2005-05-22  4:41 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Con Kolivas, Nick Piggin, linux-kernel, rml, Andrew Morton

/*===== ISSUE ====*/
My second version of patch has a defect.

+  if (unlikely(old_prio != next->prio))             {
+      dequeue_task(next, array);  --> ### dequeue should against
old_prio, NOT next->prio ###
+      enqueue_task(next, array);
+  }

Unfortunately, dequeue_task() does not accept a third parameter to make
the adjustment. Personally, I feel it is good to add an extra function,
as in the first version of my patch, to combine dequeue and enqueue.
Reasons as follows:
1) adding a third parameter to dequeue_task() would require code
changes in other places;
2) for scheduler functions, performance is the first consideration.
Notice that both dequeue_task() and enqueue_task() are NOT inline.
Combining the two in one function saves one function-call overhead;


/* ===== NEW PATCH ===== */
The new patch, see below, adds new function change_queue_task() to
dequeue from "old_prio queue" and enqueue the "next task" to
"next->prio queue".

The patch also inlines requeue_task().

The patch has been tested with 2.6.11.10 and looks good. For some
reason, 2.6.12-rc4 is still not stable on my machine (Fedora 3).


/* ===== [PATCH 2.6.11.?] kernel/sched.c =====*/
--- linux-2.6.12-rc4.orig/kernel/sched.c	2005-05-06 22:20:31.000000000 -0700
+++ linux12/kernel/sched.c	2005-05-21 16:19:11.000000000 -0700
@@ -556,11 +556,23 @@
 	p->array = array;
 }
 
+static void change_queue_task(struct task_struct *p, prio_array_t
*array, int old_prio)
+{
+	list_del(&p->run_list);
+	if (list_empty(array->queue + old_prio))
+		__clear_bit(old_prio, array->bitmap);
+
+	sched_info_queued(p);
+	list_add_tail(&p->run_list, array->queue + p->prio);
+	__set_bit(p->prio, array->bitmap);
+	p->array = array;
+}
+
 /*
  * Put task to the end of the run list without the overhead of dequeue
  * followed by enqueue.
  */
-static void requeue_task(struct task_struct *p, prio_array_t *array)
+static inline void requeue_task(struct task_struct *p, prio_array_t *array)
 {
 	list_move_tail(&p->run_list, array->queue + p->prio);
 }
@@ -2613,7 +2625,7 @@
 	struct list_head *queue;
 	unsigned long long now;
 	unsigned long run_time;
-	int cpu, idx;
+	int cpu, idx, old_prio;
 
 	/*
 	 * Test if we are atomic.  Since do_exit() needs to call into
@@ -2735,9 +2747,14 @@
 			delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
 
 		array = next->array;
-		dequeue_task(next, array);
+		old_prio = next->prio;
+
 		recalc_task_prio(next, next->timestamp + delta);
-		enqueue_task(next, array);
+		
+		if (unlikely(old_prio != next->prio)) 
+			change_queue_task(next, array, old_prio);
+		else
+			requeue_task(next, array);
 	}
 	next->activated = 0;
 switch_tasks:

/* ===== PATCH END ===== */

Thanks,
-chen

On 5/20/05, Ingo Molnar <mingo@elte.hu> wrote:
> 
> * Con Kolivas <kernel@kolivas.org> wrote:
> 
> > On Fri, 20 May 2005 19:49, Ingo Molnar wrote:
> > > * chen Shang <shangcs@gmail.com> wrote:
> > > > I minimized my patch and against to 2.6.12-rc4 this time, see below.
> > >
> > > looks good - i've done some small style/whitespace cleanups and renamed
> > > prio to old_prio, patch against -rc4 below.
> >
> > We should inline requeue_task as well.
> 
> yeah.
> 
> Acked-by: Ingo Molnar <mingo@elte.hu>
> 
>         Ingo
>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kernel <linux-2.6.11.10> kernel/sched.c
  2005-05-22  4:41               ` Chen Shang
@ 2005-05-23  7:11                 ` Ingo Molnar
  2005-05-23 14:45                   ` Chen Shang
  0 siblings, 1 reply; 15+ messages in thread
From: Ingo Molnar @ 2005-05-23  7:11 UTC (permalink / raw)
  To: Chen Shang; +Cc: Con Kolivas, Nick Piggin, linux-kernel, rml, Andrew Morton


* Chen Shang <shangcs@gmail.com> wrote:

> /*===== ISSUE ====*/
> My second version of patch has a defect.
> 
> +  if (unlikely(old_prio != next->prio))             {
> +      dequeue_task(next, array);  --> ### dequeue should against
> old_prio, NOT next->prio ###
> +      enqueue_task(next, array);
> +  }

indeed...

> unforunately, dequeue_task does not accept the third parameter to make
> adjustment. Personally, I feel it's good to add extra function as my
> first version of patch to combine dequeue and enqueue together.
> Reasons as following:
> 1) adding the third parameter to dequeue_task() would cause other
> places' code change;
> 2) for schedule functions, performance is the first consideration.
> Notice both dequeue_task() and enqueue_task() are NOT inline.
> Combining those two in one saves one function call overhead;

the real problem comes from recalc_task_prio() having a side-effect, 
which makes requeueing of tasks harder. The solution is to return the 
prio from recalc_task_prio() - see the tested patch below. Agreed?

	Ingo

--

micro-optimize task requeueing in schedule() & clean up 
recalc_task_prio().

From: Chen Shang <shangcs@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

--- linux/kernel/sched.c.orig
+++ linux/kernel/sched.c
@@ -675,7 +675,7 @@ static inline void __activate_idle_task(
 	rq->nr_running++;
 }
 
-static void recalc_task_prio(task_t *p, unsigned long long now)
+static int recalc_task_prio(task_t *p, unsigned long long now)
 {
 	/* Caller must always ensure 'now >= p->timestamp' */
 	unsigned long long __sleep_time = now - p->timestamp;
@@ -734,7 +734,7 @@ static void recalc_task_prio(task_t *p, 
 		}
 	}
 
-	p->prio = effective_prio(p);
+	return effective_prio(p);
 }
 
 /*
@@ -757,7 +757,7 @@ static void activate_task(task_t *p, run
 	}
 #endif
 
-	recalc_task_prio(p, now);
+	p->prio = recalc_task_prio(p, now);
 
 	/*
 	 * This checks to make sure it's not an uninterruptible task
@@ -2751,7 +2751,7 @@ asmlinkage void __sched schedule(void)
 	struct list_head *queue;
 	unsigned long long now;
 	unsigned long run_time;
-	int cpu, idx;
+	int cpu, idx, new_prio;
 
 	/*
 	 * Test if we are atomic.  Since do_exit() needs to call into
@@ -2873,9 +2873,14 @@ go_idle:
 			delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
 
 		array = next->array;
-		dequeue_task(next, array);
-		recalc_task_prio(next, next->timestamp + delta);
-		enqueue_task(next, array);
+		new_prio = recalc_task_prio(next, next->timestamp + delta);
+
+		if (unlikely(next->prio != new_prio)) {
+			dequeue_task(next, array);
+			next->prio = new_prio;
+			enqueue_task(next, array);
+		} else
+			requeue_task(next, array);
 	}
 	next->activated = 0;
 switch_tasks:

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kernel <linux-2.6.11.10> kernel/sched.c
  2005-05-23  7:11                 ` Ingo Molnar
@ 2005-05-23 14:45                   ` Chen Shang
  0 siblings, 0 replies; 15+ messages in thread
From: Chen Shang @ 2005-05-23 14:45 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Con Kolivas, Nick Piggin, linux-kernel, rml, Andrew Morton

much better.
If so, another place, activate_task(), needs to be adjusted as well.

-chen

/* ============================ */
 /*
@@ -704,7 +704,7 @@
 	}
 #endif
 
-	recalc_task_prio(p, now);
+	p->prio = recalc_task_prio(p, now);

/* ============================ */

On 5/23/05, Ingo Molnar <mingo@elte.hu> wrote:
> 
> the real problem comes from recalc_task_prio() having a side-effect,
> which makes requeueing of tasks harder. The solution is to return the
> prio from recalc_task_prio() - see the tested patch below. Agreed?
> 
>         Ingo
> 
> --
> 
> micro-optimize task requeueing in schedule() & clean up
> recalc_task_prio().
> 
> --- linux/kernel/sched.c.orig
> +++ linux/kernel/sched.c
> @@ -675,7 +675,7 @@ static inline void __activate_idle_task(
>         rq->nr_running++;
>  }
> 
> -static void recalc_task_prio(task_t *p, unsigned long long now)
> +static int recalc_task_prio(task_t *p, unsigned long long now)
>  {
>         /* Caller must always ensure 'now >= p->timestamp' */
>         unsigned long long __sleep_time = now - p->timestamp;
> @@ -734,7 +734,7 @@ static void recalc_task_prio(task_t *p,
>                 }
>         }
> 
> -       p->prio = effective_prio(p);
> +       return effective_prio(p);
>  }
> 
>  /*
> @@ -757,7 +757,7 @@ static void activate_task(task_t *p, run
>         }
>  #endif
> 
> -       recalc_task_prio(p, now);
> +       p->prio = recalc_task_prio(p, now);
> 
>         /*
>          * This checks to make sure it's not an uninterruptible task
> @@ -2751,7 +2751,7 @@ asmlinkage void __sched schedule(void)
>         struct list_head *queue;
>         unsigned long long now;
>         unsigned long run_time;
> -       int cpu, idx;
> +       int cpu, idx, new_prio;
> 
>         /*
>          * Test if we are atomic.  Since do_exit() needs to call into
> @@ -2873,9 +2873,14 @@ go_idle:
>                         delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
> 
>                 array = next->array;
> -               dequeue_task(next, array);
> -               recalc_task_prio(next, next->timestamp + delta);
> -               enqueue_task(next, array);
> +               new_prio = recalc_task_prio(next, next->timestamp + delta);
> +
> +               if (unlikely(next->prio != new_prio)) {
> +                       dequeue_task(next, array);
> +                       next->prio = new_prio;
> +                       enqueue_task(next, array);
> +               } else
> +                       requeue_task(next, array);
>         }
>         next->activated = 0;
>  switch_tasks:
>

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2005-05-23 14:46 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-05-19 16:56 [PATCH] kernel <linux-2.6.11.10> kernel/sched.c chen Shang
2005-05-20  3:26 ` Nick Piggin
2005-05-20  4:17   ` chen Shang
2005-05-20  4:32     ` Lee Revell
2005-05-20  5:13     ` Nick Piggin
2005-05-20  7:12       ` chen Shang
2005-05-20  7:21         ` Nick Piggin
2005-05-20  7:36           ` Con Kolivas
2005-05-20 13:41             ` chen Shang
2005-05-20  9:49         ` Ingo Molnar
2005-05-20 10:40           ` Con Kolivas
2005-05-20 11:34             ` Ingo Molnar
2005-05-22  4:41               ` Chen Shang
2005-05-23  7:11                 ` Ingo Molnar
2005-05-23 14:45                   ` Chen Shang

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox