* [PATCH 1/6] sched: move the group scheduling primitives around
2007-10-31 21:10 [PATCH 0/6] various scheduler patches Peter Zijlstra
@ 2007-10-31 21:10 ` Peter Zijlstra
2007-10-31 21:10 ` [PATCH 2/6] sched: make sched_slice() group scheduling savvy Peter Zijlstra
` (5 subsequent siblings)
6 siblings, 0 replies; 22+ messages in thread
From: Peter Zijlstra @ 2007-10-31 21:10 UTC (permalink / raw)
To: linux-kernel
Cc: Ingo Molnar, Mike Galbraith, Dmitry Adamushko, Peter Zijlstra,
Srivatsa Vaddagiri
[-- Attachment #1: sched-move-group.patch --]
[-- Type: text/plain, Size: 6092 bytes --]
The next patch will make sched_slice group aware, reorder the group scheduling
primitives so that they don't need fwd declarations.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
---
kernel/sched_fair.c | 190 +++++++++++++++++++++++++---------------------------
1 file changed, 92 insertions(+), 98 deletions(-)
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -80,6 +80,11 @@ const_debug unsigned int sysctl_sched_mi
* CFS operations on generic schedulable entities:
*/
+static inline struct task_struct *task_of(struct sched_entity *se)
+{
+ return container_of(se, struct task_struct, se);
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
/* cpu runqueue to which this cfs_rq is attached */
@@ -91,6 +96,54 @@ static inline struct rq *rq_of(struct cf
/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se) (!se->my_q)
+/* Walk up scheduling entities hierarchy */
+#define for_each_sched_entity(se) \
+ for (; se; se = se->parent)
+
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+{
+ return p->se.cfs_rq;
+}
+
+/* runqueue on which this entity is (to be) queued */
+static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+{
+ return se->cfs_rq;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+ return grp->my_q;
+}
+
+/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
+ * another cpu ('this_cpu')
+ */
+static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
+{
+ return cfs_rq->tg->cfs_rq[this_cpu];
+}
+
+/* Iterate thr' all leaf cfs_rq's on a runqueue */
+#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+ list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+
+/* Do the two (enqueued) entities belong to the same group ? */
+static inline int
+is_same_group(struct sched_entity *se, struct sched_entity *pse)
+{
+ if (se->cfs_rq == pse->cfs_rq)
+ return 1;
+
+ return 0;
+}
+
+static inline struct sched_entity *parent_entity(struct sched_entity *se)
+{
+ return se->parent;
+}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
@@ -100,13 +153,49 @@ static inline struct rq *rq_of(struct cf
#define entity_is_task(se) 1
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#define for_each_sched_entity(se) \
+ for (; se; se = NULL)
-static inline struct task_struct *task_of(struct sched_entity *se)
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
- return container_of(se, struct task_struct, se);
+ return &task_rq(p)->cfs;
}
+static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+{
+ struct task_struct *p = task_of(se);
+ struct rq *rq = task_rq(p);
+
+ return &rq->cfs;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+ return NULL;
+}
+
+static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
+{
+ return &cpu_rq(this_cpu)->cfs;
+}
+
+#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+ for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+
+static inline int
+is_same_group(struct sched_entity *se, struct sched_entity *pse)
+{
+ return 1;
+}
+
+static inline struct sched_entity *parent_entity(struct sched_entity *se)
+{
+ return NULL;
+}
+
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
/**************************************************************
* Scheduling class tree data structure manipulation methods:
@@ -626,101 +715,6 @@ static void entity_tick(struct cfs_rq *c
* CFS operations on tasks:
*/
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-/* Walk up scheduling entities hierarchy */
-#define for_each_sched_entity(se) \
- for (; se; se = se->parent)
-
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
-{
- return p->se.cfs_rq;
-}
-
-/* runqueue on which this entity is (to be) queued */
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
-{
- return se->cfs_rq;
-}
-
-/* runqueue "owned" by this group */
-static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
-{
- return grp->my_q;
-}
-
-/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
- * another cpu ('this_cpu')
- */
-static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
-{
- return cfs_rq->tg->cfs_rq[this_cpu];
-}
-
-/* Iterate thr' all leaf cfs_rq's on a runqueue */
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
- list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
-
-/* Do the two (enqueued) entities belong to the same group ? */
-static inline int
-is_same_group(struct sched_entity *se, struct sched_entity *pse)
-{
- if (se->cfs_rq == pse->cfs_rq)
- return 1;
-
- return 0;
-}
-
-static inline struct sched_entity *parent_entity(struct sched_entity *se)
-{
- return se->parent;
-}
-
-#else /* CONFIG_FAIR_GROUP_SCHED */
-
-#define for_each_sched_entity(se) \
- for (; se; se = NULL)
-
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
-{
- return &task_rq(p)->cfs;
-}
-
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
-{
- struct task_struct *p = task_of(se);
- struct rq *rq = task_rq(p);
-
- return &rq->cfs;
-}
-
-/* runqueue "owned" by this group */
-static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
-{
- return NULL;
-}
-
-static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
-{
- return &cpu_rq(this_cpu)->cfs;
-}
-
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
- for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
-
-static inline int
-is_same_group(struct sched_entity *se, struct sched_entity *pse)
-{
- return 1;
-}
-
-static inline struct sched_entity *parent_entity(struct sched_entity *se)
-{
- return NULL;
-}
-
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
--
^ permalink raw reply [flat|nested] 22+ messages in thread* [PATCH 2/6] sched: make sched_slice() group scheduling savvy
2007-10-31 21:10 [PATCH 0/6] various scheduler patches Peter Zijlstra
2007-10-31 21:10 ` [PATCH 1/6] sched: move the group scheduling primitives around Peter Zijlstra
@ 2007-10-31 21:10 ` Peter Zijlstra
2007-11-01 11:31 ` Srivatsa Vaddagiri
2007-10-31 21:10 ` [PATCH 3/6] sched: high-res preemption tick Peter Zijlstra
` (4 subsequent siblings)
6 siblings, 1 reply; 22+ messages in thread
From: Peter Zijlstra @ 2007-10-31 21:10 UTC (permalink / raw)
To: linux-kernel
Cc: Ingo Molnar, Mike Galbraith, Dmitry Adamushko, Peter Zijlstra,
Srivatsa Vaddagiri
[-- Attachment #1: sched-slice-group.patch --]
[-- Type: text/plain, Size: 2481 bytes --]
Currently the ideal slice length does not take group scheduling into account.
Change it so that it properly takes all the runnable tasks on this cpu into
account and calculate the weight according to the grouping hierarchy.
Also fixes a bug in vslice which missed a factor NICE_0_LOAD.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
---
kernel/sched_fair.c | 42 +++++++++++++++++++++++++++++++-----------
1 file changed, 31 insertions(+), 11 deletions(-)
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -331,10 +331,15 @@ static u64 __sched_period(unsigned long
*/
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- u64 slice = __sched_period(cfs_rq->nr_running);
+ unsigned long nr_running = rq_of(cfs_rq)->nr_running;
+ u64 slice = __sched_period(nr_running);
- slice *= se->load.weight;
- do_div(slice, cfs_rq->load.weight);
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ slice *= se->load.weight;
+ do_div(slice, cfs_rq->load.weight);
+ }
return slice;
}
@@ -344,24 +349,39 @@ static u64 sched_slice(struct cfs_rq *cf
*
* vs = s/w = p/rw
*/
-static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running)
+static u64 __sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *new)
{
- u64 vslice = __sched_period(nr_running);
+ struct sched_entity *se = cfs_rq->curr;
+ unsigned long nr_running = rq_of(cfs_rq)->nr_running;
+ unsigned long weight = 0;
+ u64 vslice;
+
+ if (new) {
+ nr_running++;
+ weight = new->load.weight;
+ }
- do_div(vslice, rq_weight);
+ vslice = __sched_period(nr_running);
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ vslice *= NICE_0_LOAD;
+ do_div(vslice, cfs_rq->load.weight + weight);
+ weight = 0;
+ }
return vslice;
}
-static u64 sched_vslice(struct cfs_rq *cfs_rq)
+static inline u64 sched_vslice(struct cfs_rq *cfs_rq)
{
- return __sched_vslice(cfs_rq->load.weight, cfs_rq->nr_running);
+ return __sched_vslice(cfs_rq, NULL);
}
-static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static inline u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *new)
{
- return __sched_vslice(cfs_rq->load.weight + se->load.weight,
- cfs_rq->nr_running + 1);
+ return __sched_vslice(cfs_rq, new);
}
/*
--
^ permalink raw reply [flat|nested] 22+ messages in thread* Re: [PATCH 2/6] sched: make sched_slice() group scheduling savvy
2007-10-31 21:10 ` [PATCH 2/6] sched: make sched_slice() group scheduling savvy Peter Zijlstra
@ 2007-11-01 11:31 ` Srivatsa Vaddagiri
2007-11-01 11:51 ` Peter Zijlstra
0 siblings, 1 reply; 22+ messages in thread
From: Srivatsa Vaddagiri @ 2007-11-01 11:31 UTC (permalink / raw)
To: Peter Zijlstra
Cc: linux-kernel, Ingo Molnar, Mike Galbraith, Dmitry Adamushko
On Wed, Oct 31, 2007 at 10:10:32PM +0100, Peter Zijlstra wrote:
> Currently the ideal slice length does not take group scheduling into account.
> Change it so that it properly takes all the runnable tasks on this cpu into
> account and caluclate the weight according to the grouping hierarchy.
>
> Also fixes a bug in vslice which missed a factor NICE_0_LOAD.
>
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> CC: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
> ---
> kernel/sched_fair.c | 42 +++++++++++++++++++++++++++++++-----------
> 1 file changed, 31 insertions(+), 11 deletions(-)
>
> Index: linux-2.6/kernel/sched_fair.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched_fair.c
> +++ linux-2.6/kernel/sched_fair.c
> @@ -331,10 +331,15 @@ static u64 __sched_period(unsigned long
> */
> static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
> {
> - u64 slice = __sched_period(cfs_rq->nr_running);
> + unsigned long nr_running = rq_of(cfs_rq)->nr_running;
> + u64 slice = __sched_period(nr_running);
>
> - slice *= se->load.weight;
> - do_div(slice, cfs_rq->load.weight);
> + for_each_sched_entity(se) {
> + cfs_rq = cfs_rq_of(se);
> +
> + slice *= se->load.weight;
> + do_div(slice, cfs_rq->load.weight);
> + }
>
> return slice;
Lets say we have two groups A and B on CPU0, of equal weight (1024).
Further,
A has 1 task (A0)
B has 1000 tasks (B0 .. B999)
Agreed it's an extreme case, but illustrates the problem I have in mind
for this patch.
All tasks of same weight=1024.
Before this patch
=================
sched_slice(grp A) = 20ms * 1/2 = 10ms
sched_slice(A0) = 20ms
sched_slice(grp B) = 20ms * 1/2 = 10ms
sched_slice(B0) = (20ms * 1000/20) * 1 / 1000 = 1ms
sched_slice(B1) = ... = sched_slice(B99) = 1 ms
Fairness between groups and tasks would be obtained as below:
A0 B0-B9 A0 B10-B19 A0 B20-B29
|--------|--------|--------|--------|--------|--------|-----//--|
0 10ms 20ms 30ms 40ms 50ms 60ms
After this patch
================
sched_slice(grp A) = (20ms * 1001/20) * 1/2 ~= 500ms
sched_slice(A0) = 500ms
sched_slice(grp B) = 500ms
sched_slice(B0) = 0.5ms
Fairness between groups and tasks would be obtained as below:
A0 B0 - B99 A0
|-----------------------|-----------------------|-----------------------|
0 500ms 1000ms 1500ms
Did I get it right? If so, I don't like the fact that group A is allowed to run
for a long time (500ms) before giving chance to group B.
Can I know what real problem is being addressed by this change to
sched_slice()?
--
Regards,
vatsa
^ permalink raw reply [flat|nested] 22+ messages in thread* Re: [PATCH 2/6] sched: make sched_slice() group scheduling savvy
2007-11-01 11:31 ` Srivatsa Vaddagiri
@ 2007-11-01 11:51 ` Peter Zijlstra
2007-11-01 11:58 ` Peter Zijlstra
0 siblings, 1 reply; 22+ messages in thread
From: Peter Zijlstra @ 2007-11-01 11:51 UTC (permalink / raw)
To: vatsa; +Cc: linux-kernel, Ingo Molnar, Mike Galbraith, Dmitry Adamushko
[-- Attachment #1: Type: text/plain, Size: 3374 bytes --]
On Thu, 2007-11-01 at 17:01 +0530, Srivatsa Vaddagiri wrote:
> On Wed, Oct 31, 2007 at 10:10:32PM +0100, Peter Zijlstra wrote:
> > Currently the ideal slice length does not take group scheduling into account.
> > Change it so that it properly takes all the runnable tasks on this cpu into
> > account and caluclate the weight according to the grouping hierarchy.
> >
> > Also fixes a bug in vslice which missed a factor NICE_0_LOAD.
> >
> > Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> > CC: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
> > ---
> > kernel/sched_fair.c | 42 +++++++++++++++++++++++++++++++-----------
> > 1 file changed, 31 insertions(+), 11 deletions(-)
> >
> > Index: linux-2.6/kernel/sched_fair.c
> > ===================================================================
> > --- linux-2.6.orig/kernel/sched_fair.c
> > +++ linux-2.6/kernel/sched_fair.c
> > @@ -331,10 +331,15 @@ static u64 __sched_period(unsigned long
> > */
> > static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
> > {
> > - u64 slice = __sched_period(cfs_rq->nr_running);
> > + unsigned long nr_running = rq_of(cfs_rq)->nr_running;
> > + u64 slice = __sched_period(nr_running);
> >
> > - slice *= se->load.weight;
> > - do_div(slice, cfs_rq->load.weight);
> > + for_each_sched_entity(se) {
> > + cfs_rq = cfs_rq_of(se);
> > +
> > + slice *= se->load.weight;
> > + do_div(slice, cfs_rq->load.weight);
> > + }
> >
> > return slice;
>
>
> Lets say we have two groups A and B on CPU0, of equal weight (1024).
>
> Further,
>
> A has 1 task (A0)
> B has 1000 tasks (B0 .. B999)
>
> Agreed its a extreme case, but illustrates the problem I have in mind
> for this patch.
>
> All tasks of same weight=1024.
>
> Before this patch
> =================
>
> sched_slice(grp A) = 20ms * 1/2 = 10ms
> sched_slice(A0) = 20ms
>
> sched_slice(grp B) = 20ms * 1/2 = 10ms
> sched_slice(B0) = (20ms * 1000/20) * 1 / 1000 = 1ms
> sched_slice(B1) = ... = sched_slice(B99) = 1 ms
>
> Fairness between groups and tasks would be obtained as below:
>
> A0 B0-B9 A0 B10-B19 A0 B20-B29
> |--------|--------|--------|--------|--------|--------|-----//--|
> 0 10ms 20ms 30ms 40ms 50ms 60ms
>
> After this patch
> ================
>
> sched_slice(grp A) = (20ms * 1001/20) * 1/2 ~= 500ms
> sched_slice(A0) = 500ms
Hmm, right that is indeed not intended
> sched_slice(grp B) = 500ms
> sched_slice(B0) = 0.5ms
This 0.5 is indeed correct, whereas the previous 1ms was not
> Fairness between groups and tasks would be obtained as below:
>
> A0 B0 - B99 A0
> |-----------------------|-----------------------|-----------------------|
> 0 500ms 1000ms 1500ms
>
> Did I get it right? If so, I don't like the fact that group A is allowed to run
> for a long time (500ms) before giving chance to group B.
Hmm, quite bad indeed.
> Can I know what real problem is being addressed by this change to
> sched_slice()?
sched_slice() is about latency, its intended purpose is to ensure each
task is ran exactly once during sched_period() - which is
sysctl_sched_latency when nr_running <= sysctl_sched_nr_latency, and
otherwise linearly scales latency.
[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]
^ permalink raw reply [flat|nested] 22+ messages in thread* Re: [PATCH 2/6] sched: make sched_slice() group scheduling savvy
2007-11-01 11:51 ` Peter Zijlstra
@ 2007-11-01 11:58 ` Peter Zijlstra
2007-11-01 12:03 ` Peter Zijlstra
0 siblings, 1 reply; 22+ messages in thread
From: Peter Zijlstra @ 2007-11-01 11:58 UTC (permalink / raw)
To: vatsa; +Cc: linux-kernel, Ingo Molnar, Mike Galbraith, Dmitry Adamushko
On Thu, 2007-11-01 at 12:51 +0100, Peter Zijlstra wrote:
> On Thu, 2007-11-01 at 17:01 +0530, Srivatsa Vaddagiri wrote:
> > On Wed, Oct 31, 2007 at 10:10:32PM +0100, Peter Zijlstra wrote:
> > > Currently the ideal slice length does not take group scheduling into account.
> > > Change it so that it properly takes all the runnable tasks on this cpu into
> > > account and caluclate the weight according to the grouping hierarchy.
> > >
> > > Also fixes a bug in vslice which missed a factor NICE_0_LOAD.
> > >
> > > Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> > > CC: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
> > > ---
> > > kernel/sched_fair.c | 42 +++++++++++++++++++++++++++++++-----------
> > > 1 file changed, 31 insertions(+), 11 deletions(-)
> > >
> > > Index: linux-2.6/kernel/sched_fair.c
> > > ===================================================================
> > > --- linux-2.6.orig/kernel/sched_fair.c
> > > +++ linux-2.6/kernel/sched_fair.c
> > > @@ -331,10 +331,15 @@ static u64 __sched_period(unsigned long
> > > */
> > > static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
> > > {
> > > - u64 slice = __sched_period(cfs_rq->nr_running);
> > > + unsigned long nr_running = rq_of(cfs_rq)->nr_running;
> > > + u64 slice = __sched_period(nr_running);
> > >
> > > - slice *= se->load.weight;
> > > - do_div(slice, cfs_rq->load.weight);
> > > + for_each_sched_entity(se) {
> > > + cfs_rq = cfs_rq_of(se);
> > > +
> > > + slice *= se->load.weight;
> > > + do_div(slice, cfs_rq->load.weight);
> > > + }
> > >
> > > return slice;
> >
> >
> > Lets say we have two groups A and B on CPU0, of equal weight (1024).
> >
> > Further,
> >
> > A has 1 task (A0)
> > B has 1000 tasks (B0 .. B999)
> >
> > Agreed its a extreme case, but illustrates the problem I have in mind
> > for this patch.
> >
> > All tasks of same weight=1024.
> >
> > Before this patch
> > =================
> >
> > sched_slice(grp A) = 20ms * 1/2 = 10ms
> > sched_slice(A0) = 20ms
> >
> > sched_slice(grp B) = 20ms * 1/2 = 10ms
> > sched_slice(B0) = (20ms * 1000/20) * 1 / 1000 = 1ms
> > sched_slice(B1) = ... = sched_slice(B99) = 1 ms
> >
> > Fairness between groups and tasks would be obtained as below:
> >
> > A0 B0-B9 A0 B10-B19 A0 B20-B29
> > |--------|--------|--------|--------|--------|--------|-----//--|
> > 0 10ms 20ms 30ms 40ms 50ms 60ms
> >
> > After this patch
> > ================
> >
> > sched_slice(grp A) = (20ms * 1001/20) * 1/2 ~= 500ms
> > sched_slice(A0) = 500ms
>
> Hmm, right that is indeed not intended
>
> > sched_slice(grp B) = 500ms
> > sched_slice(B0) = 0.5ms
>
> This 0.5 is indeed correct, whereas the previous 1ms was not
>
> > Fairness between groups and tasks would be obtained as below:
> >
> > A0 B0 - B99 A0
> > |-----------------------|-----------------------|-----------------------|
> > 0 500ms 1000ms 1500ms
> >
> > Did I get it right? If so, I don't like the fact that group A is allowed to run
> > for a long time (500ms) before giving chance to group B.
>
> Hmm, quite bad indeed.
hmm, then again, with 1001 tasks running, that is exactly what should
happen.
> > Can I know what real problem is being addressed by this change to
> > sched_slice()?
>
> sched_slice() is about lantecy, its intended purpose is to ensure each
> task is ran exactly once during sched_period() - which is
> sysctl_sched_latency when nr_running <= sysctl_sched_nr_latency, and
> otherwise linearly scales latency.
>
>
^ permalink raw reply [flat|nested] 22+ messages in thread* Re: [PATCH 2/6] sched: make sched_slice() group scheduling savvy
2007-11-01 11:58 ` Peter Zijlstra
@ 2007-11-01 12:03 ` Peter Zijlstra
2007-11-01 12:20 ` Peter Zijlstra
0 siblings, 1 reply; 22+ messages in thread
From: Peter Zijlstra @ 2007-11-01 12:03 UTC (permalink / raw)
To: vatsa; +Cc: linux-kernel, Ingo Molnar, Mike Galbraith, Dmitry Adamushko
On Thu, 2007-11-01 at 12:58 +0100, Peter Zijlstra wrote:
> > sched_slice() is about lantecy, its intended purpose is to ensure each
> > task is ran exactly once during sched_period() - which is
> > sysctl_sched_latency when nr_running <= sysctl_sched_nr_latency, and
> > otherwise linearly scales latency.
The thing that got my brain in a twist is what to do about the non-leaf
nodes, for those it seems I'm not doing the right thing - I think.
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH 2/6] sched: make sched_slice() group scheduling savvy
2007-11-01 12:03 ` Peter Zijlstra
@ 2007-11-01 12:20 ` Peter Zijlstra
2007-11-01 16:31 ` Srivatsa Vaddagiri
0 siblings, 1 reply; 22+ messages in thread
From: Peter Zijlstra @ 2007-11-01 12:20 UTC (permalink / raw)
To: vatsa; +Cc: linux-kernel, Ingo Molnar, Mike Galbraith, Dmitry Adamushko
On Thu, 2007-11-01 at 13:03 +0100, Peter Zijlstra wrote:
> On Thu, 2007-11-01 at 12:58 +0100, Peter Zijlstra wrote:
>
> > > sched_slice() is about lantecy, its intended purpose is to ensure each
> > > task is ran exactly once during sched_period() - which is
> > > sysctl_sched_latency when nr_running <= sysctl_sched_nr_latency, and
> > > otherwise linearly scales latency.
>
> The thing that got my brain in a twist is what to do about the non-leaf
> nodes, for those it seems I'm not doing the right thing - I think.
Ok, suppose a tree like so:
level 2 cfs_rq
A B
level 1 cfs_rqA cfs_rqB
A0 B0 - B99
So for sake of determinism, we want to impose a period in which all
level 1 tasks will have ran (at least) once.
Now what sched_slice() does is calculate the weighted proportion of the
given period for each task to run, so that each task runs exactly once.
Now level 2, can introduce these large weight differences, which in this
case result in 'lumps' of time.
In the given example above the weight difference is 1:100, which is
already at the edges of what regular nice levels could do.
How about limiting the max output of sched_slice() to
sysctl_sched_latency in order to break up these large stretches of time?
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -341,7 +341,7 @@ static u64 sched_slice(struct cfs_rq *cf
do_div(slice, cfs_rq->load.weight);
}
- return slice;
+ return min_t(u64, sysctl_sched_latency, slice);
}
/*
^ permalink raw reply [flat|nested] 22+ messages in thread* Re: [PATCH 2/6] sched: make sched_slice() group scheduling savvy
2007-11-01 12:20 ` Peter Zijlstra
@ 2007-11-01 16:31 ` Srivatsa Vaddagiri
2007-11-01 16:55 ` Peter Zijlstra
0 siblings, 1 reply; 22+ messages in thread
From: Srivatsa Vaddagiri @ 2007-11-01 16:31 UTC (permalink / raw)
To: Peter Zijlstra
Cc: linux-kernel, Ingo Molnar, Mike Galbraith, Dmitry Adamushko
On Thu, Nov 01, 2007 at 01:20:08PM +0100, Peter Zijlstra wrote:
> On Thu, 2007-11-01 at 13:03 +0100, Peter Zijlstra wrote:
> > On Thu, 2007-11-01 at 12:58 +0100, Peter Zijlstra wrote:
> >
> > > > sched_slice() is about lantecy, its intended purpose is to ensure each
> > > > task is ran exactly once during sched_period() - which is
> > > > sysctl_sched_latency when nr_running <= sysctl_sched_nr_latency, and
> > > > otherwise linearly scales latency.
> >
> > The thing that got my brain in a twist is what to do about the non-leaf
> > nodes, for those it seems I'm not doing the right thing - I think.
>
> Ok, suppose a tree like so:
>
>
> level 2 cfs_rq
> A B
>
> level 1 cfs_rqA cfs_rqB
> A0 B0 - B99
>
>
> So for sake of determinism, we want to impose a period in which all
> level 1 tasks will have ran (at least) once.
Peter,
I fail to see why this requirement to "determine a period in
which all level 1 tasks will have ran (at least) once" is essential.
I am visualizing each of the groups to be similar to Xen-like partitions
which are given fair timeslices by the hypervisor (Linux kernel in this
case). How each partition (group in this case) manages the allocated
timeslice(s) to provide fairness to tasks within that partition/group should not
(IMHO) depend on other groups and esp. how many tasks other groups has.
For ex: before this patch, fair time would be allocated to group and
their tasks as below:
A0 B0-B9 A0 B10-B19 A0 B20-B29
|--------|--------|--------|--------|--------|--------|-----//--|
0 10ms 20ms 30ms 40ms 50ms 60ms
i.e during the first 10ms allocated to group B, B0-B9 run,
during the next 10ms allocated to group B, B10-B19 run etc
What's wrong with this scheme?
By letting __sched_period() be determined for each group independently,
we are building stronger isolation between them, which is good IMO
(imagine a rogue container that does a fork bomb).
> Now what sched_slice() does is calculate the weighted proportion of the
> given period for each task to run, so that each task runs exactly once.
>
> Now level 2, can introduce these large weight differences, which in this
> case result in 'lumps' of time.
>
> In the given example above the weight difference is 1:100, which is
> already at the edges of what regular nice levels could do.
>
> How about limiting the max output of sched_slice() to
> sysctl_sched_latency in order to break up these large stretches of time?
>
> Index: linux-2.6/kernel/sched_fair.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched_fair.c
> +++ linux-2.6/kernel/sched_fair.c
> @@ -341,7 +341,7 @@ static u64 sched_slice(struct cfs_rq *cf
> do_div(slice, cfs_rq->load.weight);
> }
>
> - return slice;
> + return min_t(u64, sysctl_sched_latency, slice);
Hmm, going back to the previous example I cited, this will lead to:
sched_slice(grp A) = min(20ms, 500ms) = 20ms
sched_slice(A0) = min(20ms, 500ms) = 20ms
sched_slice(grp B) = min(20ms, 500ms) = 20ms
sched_slice(B0) = min(20ms, 0.5ms) = 0.5ms
Fairness between groups and tasks would be obtained as below:
A0 B0-B39 A0 B40-B79 A0
|--------|--------|--------|--------|--------|
0 20ms 40ms 60ms 80ms
which seems to be more or less giving what we already have w/o the
patch?
--
Regards,
vatsa
^ permalink raw reply [flat|nested] 22+ messages in thread* Re: [PATCH 2/6] sched: make sched_slice() group scheduling savvy
2007-11-01 16:31 ` Srivatsa Vaddagiri
@ 2007-11-01 16:55 ` Peter Zijlstra
0 siblings, 0 replies; 22+ messages in thread
From: Peter Zijlstra @ 2007-11-01 16:55 UTC (permalink / raw)
To: vatsa; +Cc: linux-kernel, Ingo Molnar, Mike Galbraith, Dmitry Adamushko
On Thu, 2007-11-01 at 22:01 +0530, Srivatsa Vaddagiri wrote:
> On Thu, Nov 01, 2007 at 01:20:08PM +0100, Peter Zijlstra wrote:
> > On Thu, 2007-11-01 at 13:03 +0100, Peter Zijlstra wrote:
> > > On Thu, 2007-11-01 at 12:58 +0100, Peter Zijlstra wrote:
> > >
> > > > > sched_slice() is about lantecy, its intended purpose is to ensure each
> > > > > task is ran exactly once during sched_period() - which is
> > > > > sysctl_sched_latency when nr_running <= sysctl_sched_nr_latency, and
> > > > > otherwise linearly scales latency.
> > >
> > > The thing that got my brain in a twist is what to do about the non-leaf
> > > nodes, for those it seems I'm not doing the right thing - I think.
> >
> > Ok, suppose a tree like so:
> >
> >
> > level 2 cfs_rq
> > A B
> >
> > level 1 cfs_rqA cfs_rqB
> > A0 B0 - B99
> >
> >
> > So for sake of determinism, we want to impose a period in which all
> > level 1 tasks will have ran (at least) once.
>
> Peter,
> I fail to see why this requirement to "determine a period in
> which all level 1 tasks will have ran (at least) once" is essential.
Because it gives a steady feel to things. For humans it's most essential
that things run in a predictable fashion. So not only does it matter how
much time a task gets, it also very much matters when it gets that.
It contributes to the feeling of gradual slow down.
> I am visualizing each of the groups to be similar to Xen-like partitions
> which are given fair timeslices by the hypervisor (Linux kernel in this
> case). How each partition (group in this case) manages the allocated
> timeslice(s) to provide fairness to tasks within that partition/group should not
> (IMHO) depend on other groups and esp. how many tasks other groups has.
Agreed, I've realised this since my last mail, one group should not
influence another in such a fashion, in this respect you don't want to
flatten it like I did.
> For ex: before this patch, fair time would be allocated to group and
> their tasks as below:
>
> A0 B0-B9 A0 B10-B19 A0 B20-B29
> |--------|--------|--------|--------|--------|--------|-----//--|
> 0 10ms 20ms 30ms 40ms 50ms 60ms
>
> i.e during the first 10ms allocated to group B, B0-B9 run,
> during the next 10ms allocated to group B, B10-B19 run etc
>
> What's wrong with this scheme?
What made me start tinkering here is that the nested level is again
distributing wall-time without being aware of the fraction it gets from
the upper levels.
So if we have two groups, A and B, and B is selected for 1/2 of period,
then Bn will again divide period, even though in reality it will only
have p/2.
So I guess, I need a top down traversal, not a bottom up traversal to
get this fixed up... I'll ponder this.
> By letting __sched_period() be determined for each group independently,
> we are building stronger isolation between them, which is good IMO
> (imagine a rogue container that does a fork bomb).
Agreed.
> > Index: linux-2.6/kernel/sched_fair.c
> > ===================================================================
> > --- linux-2.6.orig/kernel/sched_fair.c
> > +++ linux-2.6/kernel/sched_fair.c
> > @@ -341,7 +341,7 @@ static u64 sched_slice(struct cfs_rq *cf
> > do_div(slice, cfs_rq->load.weight);
> > }
> >
> > - return slice;
> > + return min_t(u64, sysctl_sched_latency, slice);
> which seems to be more or less giving what we already have w/o the
> patch?
Well, it's basically giving up on overload, admittedly not very nice.
^ permalink raw reply [flat|nested] 22+ messages in thread
* [PATCH 3/6] sched: high-res preemption tick
2007-10-31 21:10 [PATCH 0/6] various scheduler patches Peter Zijlstra
2007-10-31 21:10 ` [PATCH 1/6] sched: move the group scheduling primitives around Peter Zijlstra
2007-10-31 21:10 ` [PATCH 2/6] sched: make sched_slice() group scheduling savvy Peter Zijlstra
@ 2007-10-31 21:10 ` Peter Zijlstra
2007-10-31 21:53 ` Andi Kleen
2007-10-31 21:10 ` [PATCH 4/6] sched: sched_rt_entity Peter Zijlstra
` (3 subsequent siblings)
6 siblings, 1 reply; 22+ messages in thread
From: Peter Zijlstra @ 2007-10-31 21:10 UTC (permalink / raw)
To: linux-kernel
Cc: Ingo Molnar, Mike Galbraith, Dmitry Adamushko, Peter Zijlstra,
Thomas Gleixner
[-- Attachment #1: sched-hrtick.patch --]
[-- Type: text/plain, Size: 19647 bytes --]
Use HR-timers (when available) to deliver an accurate preemption tick.
The regular scheduler tick that runs at 1/HZ can be too coarse when nice
levels are used. The fairness system will still keep the cpu utilisation 'fair'
by then delaying the task that got an excessive amount of CPU time, but tries to
minimize this by delivering preemption points spot-on.
The average frequency of this extra interrupt is sched_latency / nr_latency.
This need not be higher than 1/HZ; it's just that the distribution within the
sched_latency period is important.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Thomas Gleixner <tglx@linutronix.de>
---
arch/x86/kernel/entry_64.S | 6 -
arch/x86/kernel/signal_32.c | 3
arch/x86/kernel/signal_64.c | 3
include/asm-x86/thread_info_32.h | 2
include/asm-x86/thread_info_64.h | 5
include/linux/hrtimer.h | 9 +
include/linux/sched.h | 3
kernel/Kconfig.hz | 4
kernel/sched.c | 205 ++++++++++++++++++++++++++++++++++++---
kernel/sched_fair.c | 83 +++++++++++++++
kernel/sched_idletask.c | 2
kernel/sched_rt.c | 2
12 files changed, 306 insertions(+), 21 deletions(-)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -64,6 +64,7 @@
#include <linux/reciprocal_div.h>
#include <linux/unistd.h>
#include <linux/pagemap.h>
+#include <linux/hrtimer.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
@@ -338,6 +339,12 @@ struct rq {
struct list_head migration_queue;
#endif
+#ifdef CONFIG_SCHED_HRTICK
+ unsigned long hrtick_flags;
+ ktime_t hrtick_expire;
+ struct hrtimer hrtick_timer;
+#endif
+
#ifdef CONFIG_SCHEDSTATS
/* latency stats */
struct sched_info rq_sched_info;
@@ -461,6 +468,8 @@ enum {
SCHED_FEAT_APPROX_AVG = 8,
SCHED_FEAT_WAKEUP_PREEMPT = 16,
SCHED_FEAT_PREEMPT_RESTRICT = 32,
+ SCHED_FEAT_HRTICK = 64,
+ SCHED_FEAT_DOUBLE_TICK = 128,
};
const_debug unsigned int sysctl_sched_features =
@@ -469,7 +478,9 @@ const_debug unsigned int sysctl_sched_fe
SCHED_FEAT_TREE_AVG * 0 |
SCHED_FEAT_APPROX_AVG * 0 |
SCHED_FEAT_WAKEUP_PREEMPT * 1 |
- SCHED_FEAT_PREEMPT_RESTRICT * 1;
+ SCHED_FEAT_PREEMPT_RESTRICT * 1 |
+ SCHED_FEAT_HRTICK * 1 |
+ SCHED_FEAT_DOUBLE_TICK * 0;
#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)
@@ -669,6 +680,168 @@ void sched_clock_idle_wakeup_event(u64 d
}
EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
+static void __resched_task(struct task_struct *p, int tif_bit);
+
+static inline void resched_task(struct task_struct *p)
+{
+ __resched_task(p, TIF_NEED_RESCHED);
+}
+
+#ifdef CONFIG_SCHED_HRTICK
+/*
+ * Use HR-timers to deliver accurate preemption points.
+ *
+ * Its all a bit involved since we cannot program an hrt while holding the
+ * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a
+ * reschedule event.
+ *
+ * When we get rescheduled we reprogram the hrtick_timer outside of the
+ * rq->lock.
+ */
+static inline void resched_hrt(struct task_struct *p)
+{
+ __resched_task(p, TIF_HRTICK_RESCHED);
+}
+
+static inline void resched_rq(struct rq *rq)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rq->lock, flags);
+ resched_task(rq->curr);
+ spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+enum {
+ HRTICK_SET, /* re-programm hrtick_timer */
+ HRTICK_RESET, /* not a new slice */
+};
+
+/*
+ * Use hrtick when:
+ * - enabled by features
+ * - hrtimer is actually high res
+ */
+static inline int hrtick_enabled(struct rq *rq)
+{
+ if (!sched_feat(HRTICK))
+ return 0;
+ return hrtimer_is_hres_active(&rq->hrtick_timer);
+}
+
+/*
+ * Called to set the hrtick timer state.
+ *
+ * called with rq->lock held and irqs disabled
+ */
+static void hrtick_start(struct rq *rq, u64 delay, int reset)
+{
+ assert_spin_locked(&rq->lock);
+
+ /*
+ * preempt at: now + delay
+ */
+ rq->hrtick_expire =
+ ktime_add_ns(rq->hrtick_timer.base->get_time(), delay);
+ /*
+ * indicate we need to program the timer
+ */
+ __set_bit(HRTICK_SET, &rq->hrtick_flags);
+ if (reset)
+ __set_bit(HRTICK_RESET, &rq->hrtick_flags);
+
+ /*
+ * New slices are called from the schedule path and don't need a
+ * forced reschedule.
+ */
+ if (reset)
+ resched_hrt(rq->curr);
+}
+
+static void hrtick_clear(struct rq *rq)
+{
+ if (hrtimer_active(&rq->hrtick_timer))
+ hrtimer_cancel(&rq->hrtick_timer);
+}
+
+/*
+ * Update the timer from the possible pending state.
+ */
+static void hrtick_set(struct rq *rq)
+{
+ ktime_t time;
+ int set, reset;
+ unsigned long flags;
+
+ WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
+
+ spin_lock_irqsave(&rq->lock, flags);
+ set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags);
+ reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags);
+ time = rq->hrtick_expire;
+ clear_thread_flag(TIF_HRTICK_RESCHED);
+ spin_unlock_irqrestore(&rq->lock, flags);
+
+ if (set) {
+ hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS);
+ if (reset && !hrtimer_active(&rq->hrtick_timer))
+ resched_rq(rq);
+ } else hrtick_clear(rq);
+}
+
+/*
+ * High-resolution timer tick.
+ * Runs from hardirq context with interrupts disabled.
+ */
+static enum hrtimer_restart hrtick(struct hrtimer *timer)
+{
+ struct rq *rq = container_of(timer, struct rq, hrtick_timer);
+
+ WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
+
+ spin_lock(&rq->lock);
+ __update_rq_clock(rq);
+ rq->curr->sched_class->task_tick(rq, rq->curr, 1);
+ spin_unlock(&rq->lock);
+
+ return HRTIMER_NORESTART;
+}
+
+static inline void init_rq_hrtick(struct rq *rq)
+{
+ rq->hrtick_flags = 0;
+ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ rq->hrtick_timer.function = hrtick;
+ rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART;
+}
+#else
+static inline void hrtick_clear(struct rq *rq)
+{
+}
+
+static inline void hrtick_set(struct rq *rq)
+{
+}
+
+static inline void init_rq_hrtick(struct rq *rq)
+{
+}
+#endif
+
+void hrtick_resched(void)
+{
+ struct rq *rq;
+ unsigned long flags;
+
+ if (!test_thread_flag(TIF_HRTICK_RESCHED))
+ return;
+
+ local_irq_save(flags);
+ rq = cpu_rq(smp_processor_id());
+ hrtick_set(rq);
+ local_irq_restore(flags);
+}
+
/*
* resched_task - mark a task 'to be rescheduled now'.
*
@@ -682,16 +855,16 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeu
#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
#endif
-static void resched_task(struct task_struct *p)
+static void __resched_task(struct task_struct *p, int tif_bit)
{
int cpu;
assert_spin_locked(&task_rq(p)->lock);
- if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
+ if (unlikely(test_tsk_thread_flag(p, tif_bit)))
return;
- set_tsk_thread_flag(p, TIF_NEED_RESCHED);
+ set_tsk_thread_flag(p, tif_bit);
cpu = task_cpu(p);
if (cpu == smp_processor_id())
@@ -714,10 +887,10 @@ static void resched_cpu(int cpu)
spin_unlock_irqrestore(&rq->lock, flags);
}
#else
-static inline void resched_task(struct task_struct *p)
+static inline void __resched_task(struct task_struct *p, int tif_bit)
{
assert_spin_locked(&task_rq(p)->lock);
- set_tsk_need_resched(p);
+ set_tsk_thread_flag(tsk, tif_bit);
}
#endif
@@ -3474,7 +3647,7 @@ void scheduler_tick(void)
rq->tick_timestamp = rq->clock;
update_cpu_load(rq);
if (curr != rq->idle) /* FIXME: needed? */
- curr->sched_class->task_tick(rq, curr);
+ curr->sched_class->task_tick(rq, curr, 0);
spin_unlock(&rq->lock);
#ifdef CONFIG_SMP
@@ -3620,6 +3793,8 @@ need_resched_nonpreemptible:
schedule_debug(prev);
+ hrtick_clear(rq);
+
/*
* Do the rq-clock update outside the rq lock:
*/
@@ -3652,14 +3827,20 @@ need_resched_nonpreemptible:
++*switch_count;
context_switch(rq, prev, next); /* unlocks the rq */
+ /*
+ * the context switch might have flipped the stack from under
+ * us, hence refresh the local variables.
+ */
+ cpu = smp_processor_id();
+ rq = cpu_rq(cpu);
} else
spin_unlock_irq(&rq->lock);
- if (unlikely(reacquire_kernel_lock(current) < 0)) {
- cpu = smp_processor_id();
- rq = cpu_rq(cpu);
+ hrtick_set(rq);
+
+ if (unlikely(reacquire_kernel_lock(current) < 0))
goto need_resched_nonpreemptible;
- }
+
preempt_enable_no_resched();
if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
goto need_resched;
@@ -6765,6 +6946,8 @@ void __init sched_init(void)
rq->migration_thread = NULL;
INIT_LIST_HEAD(&rq->migration_queue);
#endif
+ init_rq_hrtick(rq);
+
atomic_set(&rq->nr_iowait, 0);
array = &rq->rt.active;
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -443,6 +443,7 @@ static void update_curr(struct cfs_rq *c
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ se->exec_start = rq_of(cfs_rq)->clock;
schedstat_set(se->wait_start, rq_of(cfs_rq)->clock);
}
@@ -720,13 +721,29 @@ static void put_prev_entity(struct cfs_r
cfs_rq->curr = NULL;
}
-static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+static void
+entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
{
/*
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
+#ifdef CONFIG_SCHED_HRTICK
+ /*
+ * queued ticks are scheduled to match the slice, so don't bother
+ * validating it and just reschedule.
+ */
+ if (queued)
+ return resched_task(rq_of(cfs_rq)->curr);
+ /*
+ * don't let the period tick interfere with the hrtick preemption
+ */
+ if (!sched_feat(DOUBLE_TICK) &&
+ hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
+ return;
+#endif
+
if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
check_preempt_tick(cfs_rq, curr);
}
@@ -735,6 +752,58 @@ static void entity_tick(struct cfs_rq *c
* CFS operations on tasks:
*/
+#ifdef CONFIG_SCHED_HRTICK
+static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
+{
+ int requeue = rq->curr == p;
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ WARN_ON(task_rq(p) != rq);
+
+ if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) {
+ u64 slice = sched_slice(cfs_rq, se);
+ u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+ s64 delta = slice - ran;
+
+ if (delta < 0) {
+ if (rq->curr == p)
+ resched_task(p);
+ return;
+ }
+
+ /*
+ * Don't schedule slices shorter than 10000ns, that just
+ * doesn't make sense. Rely on vruntime for fairness.
+ */
+ if (!requeue)
+ delta = max(10000LL, delta);
+
+ if (sched_feat(PREEMPT_RESTRICT) && first_fair(cfs_rq)) {
+ struct sched_entity *next = __pick_next_entity(cfs_rq);
+
+ if (next->vruntime < se->vruntime) {
+ s64 wakeup = sysctl_sched_wakeup_granularity;
+ u64 now = rq->clock;
+
+ wakeup -= now - next->exec_start;
+ if (wakeup < 0)
+ resched_task(rq->curr);
+ else
+ delta = min(delta, wakeup);
+ }
+ }
+
+ hrtick_start(rq, delta, requeue);
+ }
+}
+#else
+static inline void
+hrtick_start_fair(struct rq *rq, struct task_struct *p)
+{
+}
+#endif
+
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
@@ -752,6 +821,7 @@ static void enqueue_task_fair(struct rq
enqueue_entity(cfs_rq, se, wakeup);
wakeup = 1;
}
+ hrtick_start_fair(rq, rq->curr);
}
/*
@@ -772,6 +842,7 @@ static void dequeue_task_fair(struct rq
break;
sleep = 1;
}
+ hrtick_start_fair(rq, rq->curr);
}
/*
@@ -862,6 +933,7 @@ static void check_preempt_wakeup(struct
static struct task_struct *pick_next_task_fair(struct rq *rq)
{
+ struct task_struct *p;
struct cfs_rq *cfs_rq = &rq->cfs;
struct sched_entity *se;
@@ -873,7 +945,10 @@ static struct task_struct *pick_next_tas
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
- return task_of(se);
+ p = task_of(se);
+ hrtick_start_fair(rq, p);
+
+ return p;
}
/*
@@ -1028,14 +1103,14 @@ move_one_task_fair(struct rq *this_rq, i
/*
* scheduler tick hitting a task of our scheduling class:
*/
-static void task_tick_fair(struct rq *rq, struct task_struct *curr)
+static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &curr->se;
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
- entity_tick(cfs_rq, se);
+ entity_tick(cfs_rq, se, queued);
}
}
Index: linux-2.6/kernel/Kconfig.hz
===================================================================
--- linux-2.6.orig/kernel/Kconfig.hz
+++ linux-2.6/kernel/Kconfig.hz
@@ -54,3 +54,7 @@ config HZ
default 300 if HZ_300
default 1000 if HZ_1000
+config SCHED_HRTICK
+ bool "High Resolution Timer preemption tick"
+ depends on HIGH_RES_TIMERS && X86
+ default y
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -256,6 +256,7 @@ extern void cpu_init (void);
extern void trap_init(void);
extern void update_process_times(int user);
extern void scheduler_tick(void);
+extern void hrtick_resched(void);
#ifdef CONFIG_DETECT_SOFTLOCKUP
extern void softlockup_tick(void);
@@ -840,7 +841,7 @@ struct sched_class {
#endif
void (*set_curr_task) (struct rq *rq);
- void (*task_tick) (struct rq *rq, struct task_struct *p);
+ void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
void (*task_new) (struct rq *rq, struct task_struct *p);
};
Index: linux-2.6/include/linux/hrtimer.h
===================================================================
--- linux-2.6.orig/include/linux/hrtimer.h
+++ linux-2.6/include/linux/hrtimer.h
@@ -217,6 +217,11 @@ static inline ktime_t hrtimer_cb_get_tim
return timer->base->get_time();
}
+static inline int hrtimer_is_hres_active(struct hrtimer *timer)
+{
+ return timer->base->cpu_base->hres_active;
+}
+
/*
* The resolution of the clocks. The resolution value is returned in
* the clock_getres() system call to give application programmers an
@@ -248,6 +253,10 @@ static inline ktime_t hrtimer_cb_get_tim
return timer->base->softirq_time;
}
+static inline int hrtimer_is_hres_active(struct hrtimer *timer)
+{
+ return 0;
+}
#endif
extern ktime_t ktime_get(void);
Index: linux-2.6/kernel/sched_idletask.c
===================================================================
--- linux-2.6.orig/kernel/sched_idletask.c
+++ linux-2.6/kernel/sched_idletask.c
@@ -55,7 +55,7 @@ move_one_task_idle(struct rq *this_rq, i
}
#endif
-static void task_tick_idle(struct rq *rq, struct task_struct *curr)
+static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
{
}
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -205,7 +205,7 @@ move_one_task_rt(struct rq *this_rq, int
}
#endif
-static void task_tick_rt(struct rq *rq, struct task_struct *p)
+static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{
/*
* RR tasks need a special form of timeslice management.
Index: linux-2.6/arch/x86/kernel/signal_32.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/signal_32.c
+++ linux-2.6/arch/x86/kernel/signal_32.c
@@ -658,6 +658,9 @@ void do_notify_resume(struct pt_regs *re
/* deal with pending signal delivery */
if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK))
do_signal(regs);
+
+ if (thread_info_flags & _TIF_HRTICK_RESCHED)
+ hrtick_resched();
clear_thread_flag(TIF_IRET);
}
Index: linux-2.6/include/asm-x86/thread_info_32.h
===================================================================
--- linux-2.6.orig/include/asm-x86/thread_info_32.h
+++ linux-2.6/include/asm-x86/thread_info_32.h
@@ -132,6 +132,7 @@ static inline struct thread_info *curren
#define TIF_SYSCALL_AUDIT 6 /* syscall auditing active */
#define TIF_SECCOMP 7 /* secure computing */
#define TIF_RESTORE_SIGMASK 8 /* restore signal mask in do_signal() */
+#define TIF_HRTICK_RESCHED 9 /* reprogram hrtick timer */
#define TIF_MEMDIE 16
#define TIF_DEBUG 17 /* uses debug registers */
#define TIF_IO_BITMAP 18 /* uses I/O bitmap */
@@ -147,6 +148,7 @@ static inline struct thread_info *curren
#define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT)
#define _TIF_SECCOMP (1<<TIF_SECCOMP)
#define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK)
+#define _TIF_HRTICK_RESCHED (1<<TIF_HRTICK_RESCHED)
#define _TIF_DEBUG (1<<TIF_DEBUG)
#define _TIF_IO_BITMAP (1<<TIF_IO_BITMAP)
#define _TIF_FREEZE (1<<TIF_FREEZE)
Index: linux-2.6/arch/x86/kernel/signal_64.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/signal_64.c
+++ linux-2.6/arch/x86/kernel/signal_64.c
@@ -480,6 +480,9 @@ do_notify_resume(struct pt_regs *regs, v
/* deal with pending signal delivery */
if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK))
do_signal(regs);
+
+ if (thread_info_flags & _TIF_HRTICK_RESCHED)
+ hrtick_resched();
}
void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
Index: linux-2.6/include/asm-x86/thread_info_64.h
===================================================================
--- linux-2.6.orig/include/asm-x86/thread_info_64.h
+++ linux-2.6/include/asm-x86/thread_info_64.h
@@ -115,6 +115,7 @@ static inline struct thread_info *stack_
#define TIF_SECCOMP 8 /* secure computing */
#define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal */
#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
+#define TIF_HRTICK_RESCHED 11 /* reprogram hrtick timer */
/* 16 free */
#define TIF_IA32 17 /* 32bit process */
#define TIF_FORK 18 /* ret_from_fork */
@@ -133,6 +134,7 @@ static inline struct thread_info *stack_
#define _TIF_SECCOMP (1<<TIF_SECCOMP)
#define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK)
#define _TIF_MCE_NOTIFY (1<<TIF_MCE_NOTIFY)
+#define _TIF_HRTICK_RESCHED (1<<TIF_HRTICK_RESCHED)
#define _TIF_IA32 (1<<TIF_IA32)
#define _TIF_FORK (1<<TIF_FORK)
#define _TIF_ABI_PENDING (1<<TIF_ABI_PENDING)
@@ -146,6 +148,9 @@ static inline struct thread_info *stack_
/* work to do on any return to user space */
#define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP)
+#define _TIF_DO_NOTIFY_MASK \
+ (_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY|_TIF_HRTICK_RESCHED)
+
/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW (_TIF_DEBUG|_TIF_IO_BITMAP)
Index: linux-2.6/arch/x86/kernel/entry_64.S
===================================================================
--- linux-2.6.orig/arch/x86/kernel/entry_64.S
+++ linux-2.6/arch/x86/kernel/entry_64.S
@@ -283,7 +283,7 @@ sysret_careful:
sysret_signal:
TRACE_IRQS_ON
sti
- testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
+ testl $_TIF_DO_NOTIFY_MASK,%edx
jz 1f
/* Really a signal */
@@ -377,7 +377,7 @@ int_very_careful:
jmp int_restore_rest
int_signal:
- testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
+ testl $_TIF_DO_NOTIFY_MASK,%edx
jz 1f
movq %rsp,%rdi # &ptregs -> arg1
xorl %esi,%esi # oldset -> arg2
@@ -603,7 +603,7 @@ retint_careful:
jmp retint_check
retint_signal:
- testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
+ testl $_TIF_DO_NOTIFY_MASK,%edx
jz retint_swapgs
TRACE_IRQS_ON
sti
--
^ permalink raw reply [flat|nested] 22+ messages in thread* Re: [PATCH 3/6] sched: high-res preemption tick
2007-10-31 21:10 ` [PATCH 3/6] sched: high-res preemption tick Peter Zijlstra
@ 2007-10-31 21:53 ` Andi Kleen
2007-10-31 22:04 ` Peter Zijlstra
2007-11-01 10:12 ` Peter Zijlstra
0 siblings, 2 replies; 22+ messages in thread
From: Andi Kleen @ 2007-10-31 21:53 UTC (permalink / raw)
To: Peter Zijlstra
Cc: linux-kernel, Ingo Molnar, Mike Galbraith, Dmitry Adamushko,
Thomas Gleixner
Peter Zijlstra <a.p.zijlstra@chello.nl> writes:
> Use HR-timers (when available) to deliver an accurate preemption tick.
>
> The regular scheduler tick that runs at 1/HZ can be too coarse when nice
> level are used. The fairness system will still keep the cpu utilisation 'fair'
> by then delaying the task that got an excessive amount of CPU time but try to
> minimize this by delivering preemption points spot-on.
This might be costly when hrtimers happen to use a more expensive-to-reprogram
time source. Even an APIC timer access is fairly slow.
And you'll potentially add that cost to lots of context switches.
Not sure that is a good idea for performance in general.
-Andi
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH 3/6] sched: high-res preemption tick
2007-10-31 21:53 ` Andi Kleen
@ 2007-10-31 22:04 ` Peter Zijlstra
2007-11-01 10:12 ` Peter Zijlstra
1 sibling, 0 replies; 22+ messages in thread
From: Peter Zijlstra @ 2007-10-31 22:04 UTC (permalink / raw)
To: Andi Kleen
Cc: linux-kernel, Ingo Molnar, Mike Galbraith, Dmitry Adamushko,
Thomas Gleixner
On Wed, 2007-10-31 at 22:53 +0100, Andi Kleen wrote:
> Peter Zijlstra <a.p.zijlstra@chello.nl> writes:
>
> > Use HR-timers (when available) to deliver an accurate preemption tick.
> >
> > The regular scheduler tick that runs at 1/HZ can be too coarse when nice
> > level are used. The fairness system will still keep the cpu utilisation 'fair'
> > by then delaying the task that got an excessive amount of CPU time but try to
> > minimize this by delivering preemption points spot-on.
>
> This might be costly when hrtimers happen to use an more expensive
> to reprogram time source. Even an APIC timer access is fairly slow.
> And you'll potentially add the to lots of context switces.
>
> Not sure that is a good idea for performance in general.
Well, me neither, it was just an idea, and a challenge to get
working :-)
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH 3/6] sched: high-res preemption tick
2007-10-31 21:53 ` Andi Kleen
2007-10-31 22:04 ` Peter Zijlstra
@ 2007-11-01 10:12 ` Peter Zijlstra
1 sibling, 0 replies; 22+ messages in thread
From: Peter Zijlstra @ 2007-11-01 10:12 UTC (permalink / raw)
To: Andi Kleen
Cc: linux-kernel, Ingo Molnar, Mike Galbraith, Dmitry Adamushko,
Thomas Gleixner
[-- Attachment #1: Type: text/plain, Size: 1122 bytes --]
On Wed, 2007-10-31 at 22:53 +0100, Andi Kleen wrote:
> Peter Zijlstra <a.p.zijlstra@chello.nl> writes:
>
> > Use HR-timers (when available) to deliver an accurate preemption tick.
> >
> > The regular scheduler tick that runs at 1/HZ can be too coarse when nice
> > level are used. The fairness system will still keep the cpu utilisation 'fair'
> > by then delaying the task that got an excessive amount of CPU time but try to
> > minimize this by delivering preemption points spot-on.
>
> This might be costly when hrtimers happen to use an more expensive
> to reprogram time source. Even an APIC timer access is fairly slow.
> And you'll potentially add the to lots of context switces.
>
> Not sure that is a good idea for performance in general.
Right, now I remember.
The idea was to run the rest of the kernel at HZ=50 or so, nothing but
scheduling needs it anymore, and with this patch the scheduler doesn't
need it anymore either. Should be good for power. This new hrtick thing
only does a lot of ticks when there are a lot of runnable tasks, it
starts at 2 with a tick per latency/2.
[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]
^ permalink raw reply [flat|nested] 22+ messages in thread
* [PATCH 4/6] sched: sched_rt_entity
2007-10-31 21:10 [PATCH 0/6] various scheduler patches Peter Zijlstra
` (2 preceding siblings ...)
2007-10-31 21:10 ` [PATCH 3/6] sched: high-res preemption tick Peter Zijlstra
@ 2007-10-31 21:10 ` Peter Zijlstra
2007-10-31 21:10 ` [PATCH 5/6] sched: SCHED_FIFO/SCHED_RR watchdog timer Peter Zijlstra
` (2 subsequent siblings)
6 siblings, 0 replies; 22+ messages in thread
From: Peter Zijlstra @ 2007-10-31 21:10 UTC (permalink / raw)
To: linux-kernel
Cc: Ingo Molnar, Mike Galbraith, Dmitry Adamushko, Peter Zijlstra,
Srivatsa Vaddagiri
[-- Attachment #1: sched-rt-entity.patch --]
[-- Type: text/plain, Size: 5165 bytes --]
Move the task_struct members specific to rt scheduling together.
A future optimization could be to put sched_entity and sched_rt_entity
into a union.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
---
include/linux/init_task.h | 5 +++--
include/linux/sched.h | 8 ++++++--
kernel/sched.c | 2 +-
kernel/sched_rt.c | 18 +++++++++---------
mm/oom_kill.c | 2 +-
5 files changed, 20 insertions(+), 15 deletions(-)
Index: linux-2.6/include/linux/init_task.h
===================================================================
--- linux-2.6.orig/include/linux/init_task.h
+++ linux-2.6/include/linux/init_task.h
@@ -132,9 +132,10 @@ extern struct group_info init_groups;
.cpus_allowed = CPU_MASK_ALL, \
.mm = NULL, \
.active_mm = &init_mm, \
- .run_list = LIST_HEAD_INIT(tsk.run_list), \
+ .rt = { \
+ .run_list = LIST_HEAD_INIT(tsk.rt.run_list), \
+ .time_slice = HZ, }, \
.ioprio = 0, \
- .time_slice = HZ, \
.tasks = LIST_HEAD_INIT(tsk.tasks), \
.ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \
.ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -911,6 +911,11 @@ struct sched_entity {
#endif
};
+struct sched_rt_entity {
+ struct list_head run_list;
+ unsigned int time_slice;
+};
+
struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
void *stack;
@@ -927,9 +932,9 @@ struct task_struct {
#endif
int prio, static_prio, normal_prio;
- struct list_head run_list;
const struct sched_class *sched_class;
struct sched_entity se;
+ struct sched_rt_entity rt;
#ifdef CONFIG_PREEMPT_NOTIFIERS
/* list of struct preempt_notifier: */
@@ -953,7 +958,6 @@ struct task_struct {
unsigned int policy;
cpumask_t cpus_allowed;
- unsigned int time_slice;
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
struct sched_info sched_info;
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -29,7 +29,7 @@ static void enqueue_task_rt(struct rq *r
{
struct rt_prio_array *array = &rq->rt.active;
- list_add_tail(&p->run_list, array->queue + p->prio);
+ list_add_tail(&p->rt.run_list, array->queue + p->prio);
__set_bit(p->prio, array->bitmap);
}
@@ -42,7 +42,7 @@ static void dequeue_task_rt(struct rq *r
update_curr_rt(rq);
- list_del(&p->run_list);
+ list_del(&p->rt.run_list);
if (list_empty(array->queue + p->prio))
__clear_bit(p->prio, array->bitmap);
}
@@ -55,7 +55,7 @@ static void requeue_task_rt(struct rq *r
{
struct rt_prio_array *array = &rq->rt.active;
- list_move_tail(&p->run_list, array->queue + p->prio);
+ list_move_tail(&p->rt.run_list, array->queue + p->prio);
}
static void
@@ -85,7 +85,7 @@ static struct task_struct *pick_next_tas
return NULL;
queue = array->queue + idx;
- next = list_entry(queue->next, struct task_struct, run_list);
+ next = list_entry(queue->next, struct task_struct, rt.run_list);
next->se.exec_start = rq->clock;
@@ -121,7 +121,7 @@ static struct task_struct *load_balance_
head = array->queue + idx;
curr = head->prev;
- p = list_entry(curr, struct task_struct, run_list);
+ p = list_entry(curr, struct task_struct, rt.run_list);
curr = curr->prev;
@@ -162,7 +162,7 @@ static struct task_struct *load_balance_
rq->rt.rt_load_balance_head = head;
}
- p = list_entry(curr, struct task_struct, run_list);
+ p = list_entry(curr, struct task_struct, rt.run_list);
curr = curr->prev;
@@ -214,16 +214,16 @@ static void task_tick_rt(struct rq *rq,
if (p->policy != SCHED_RR)
return;
- if (--p->time_slice)
+ if (--p->rt.time_slice)
return;
- p->time_slice = DEF_TIMESLICE;
+ p->rt.time_slice = DEF_TIMESLICE;
/*
* Requeue to the end of queue if we are not the only element
* on the queue:
*/
- if (p->run_list.prev != p->run_list.next) {
+ if (p->rt.run_list.prev != p->rt.run_list.next) {
requeue_task_rt(rq, p);
set_tsk_need_resched(p);
}
Index: linux-2.6/mm/oom_kill.c
===================================================================
--- linux-2.6.orig/mm/oom_kill.c
+++ linux-2.6/mm/oom_kill.c
@@ -286,7 +286,7 @@ static void __oom_kill_task(struct task_
* all the memory it needs. That way it should be able to
* exit() and clear out its resources quickly...
*/
- p->time_slice = HZ;
+ p->rt.time_slice = HZ;
set_tsk_thread_flag(p, TIF_MEMDIE);
force_sig(SIGKILL, p);
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -1838,7 +1838,7 @@ static void __sched_fork(struct task_str
p->se.wait_max = 0;
#endif
- INIT_LIST_HEAD(&p->run_list);
+ INIT_LIST_HEAD(&p->rt.run_list);
p->se.on_rq = 0;
#ifdef CONFIG_PREEMPT_NOTIFIERS
--
^ permalink raw reply [flat|nested] 22+ messages in thread* [PATCH 5/6] sched: SCHED_FIFO/SCHED_RR watchdog timer
2007-10-31 21:10 [PATCH 0/6] various scheduler patches Peter Zijlstra
` (3 preceding siblings ...)
2007-10-31 21:10 ` [PATCH 4/6] sched: sched_rt_entity Peter Zijlstra
@ 2007-10-31 21:10 ` Peter Zijlstra
2007-10-31 21:49 ` Andi Kleen
2007-10-31 21:10 ` [PATCH 6/6] sched: place_entity() comments Peter Zijlstra
2007-11-01 8:29 ` [PATCH 0/6] various scheduler patches Ingo Molnar
6 siblings, 1 reply; 22+ messages in thread
From: Peter Zijlstra @ 2007-10-31 21:10 UTC (permalink / raw)
To: linux-kernel
Cc: Ingo Molnar, Mike Galbraith, Dmitry Adamushko, Peter Zijlstra,
Thomas Gleixner, Lennart Poettering
[-- Attachment #1: sched-watchdog.patch --]
[-- Type: text/plain, Size: 2719 bytes --]
Introduce a new rlimit that allows the user to set a runtime timeout
on real-time tasks. Once this limit is exceeded the task will receive
SIGXCPU.
Input and ideas by Thomas Gleixner and Lennart Poettering.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Lennart Poettering <mzxreary@0pointer.de>
---
include/asm-generic/resource.h | 5 +++--
include/linux/sched.h | 1 +
kernel/sched.c | 2 +-
kernel/sched_rt.c | 18 ++++++++++++++++++
4 files changed, 23 insertions(+), 3 deletions(-)
Index: linux-2.6/include/asm-generic/resource.h
===================================================================
--- linux-2.6.orig/include/asm-generic/resource.h
+++ linux-2.6/include/asm-generic/resource.h
@@ -44,8 +44,8 @@
#define RLIMIT_NICE 13 /* max nice prio allowed to raise to
0-39 for nice level 19 .. -20 */
#define RLIMIT_RTPRIO 14 /* maximum realtime priority */
-
-#define RLIM_NLIMITS 15
+#define RLIMIT_RTTIME 15 /* timeout for RT tasks in us */
+#define RLIM_NLIMITS 16
/*
* SuS says limits have to be unsigned.
@@ -86,6 +86,7 @@
[RLIMIT_MSGQUEUE] = { MQ_BYTES_MAX, MQ_BYTES_MAX }, \
[RLIMIT_NICE] = { 0, 0 }, \
[RLIMIT_RTPRIO] = { 0, 0 }, \
+ [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY }, \
}
#endif /* __KERNEL__ */
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -31,6 +31,9 @@ static void enqueue_task_rt(struct rq *r
list_add_tail(&p->rt.run_list, array->queue + p->prio);
__set_bit(p->prio, array->bitmap);
+
+ if (wakeup)
+ p->rt.timeout = p->signal->rlim[RLIMIT_RTTIME].rlim_cur;
}
/*
@@ -205,8 +208,23 @@ move_one_task_rt(struct rq *this_rq, int
}
#endif
+#define TICK_US (1000000UL / HZ)
+
+static void watchdog(struct rq *rq, struct task_struct *p)
+{
+ if (p->rt.timeout != RLIM_INFINITY) {
+ if (p->rt.timeout < TICK_US) {
+ __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, p);
+ return;
+ }
+ p->rt.timeout -= TICK_US;
+ }
+}
+
static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{
+ watchdog(rq, p);
+
/*
* RR tasks need a special form of timeslice management.
* FIFO tasks have no timeslices.
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -914,6 +914,7 @@ struct sched_entity {
struct sched_rt_entity {
struct list_head run_list;
unsigned int time_slice;
+ unsigned long timeout;
};
struct task_struct {
--
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH 5/6] sched: SCHED_FIFO/SCHED_RR watchdog timer
2007-10-31 21:10 ` [PATCH 5/6] sched: SCHED_FIFO/SCHED_RR watchdog timer Peter Zijlstra
@ 2007-10-31 21:49 ` Andi Kleen
2007-10-31 22:03 ` Peter Zijlstra
0 siblings, 1 reply; 22+ messages in thread
From: Andi Kleen @ 2007-10-31 21:49 UTC (permalink / raw)
To: Peter Zijlstra
Cc: linux-kernel, Ingo Molnar, Mike Galbraith, Dmitry Adamushko,
Thomas Gleixner, Lennart Poettering
Peter Zijlstra <a.p.zijlstra@chello.nl> writes:
> Introduce a new rlimit that allows the user to set a runtime timeout
> on real-time tasks. Once this limit is exceeded the task will receive
> SIGXCPU.
Nice idea.
It would be even nicer if you could allow a couple of them. Partition
the RT priorities into a few classes and have a separate limit for each of them.
A small number of classes (3-4) would be probably enough and not bloat
the rlimits too much.
I'm thinking of the case where you have different kinds of real
time processes. Like your mp3 player which you want to be slightly
real time, but with a low SIGXCPU limit.
And then something else real time which is more important and
you would set a higher limit. etc.
-Andi
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH 5/6] sched: SCHED_FIFO/SCHED_RR watchdog timer
2007-10-31 21:49 ` Andi Kleen
@ 2007-10-31 22:03 ` Peter Zijlstra
2007-11-03 18:16 ` Andi Kleen
0 siblings, 1 reply; 22+ messages in thread
From: Peter Zijlstra @ 2007-10-31 22:03 UTC (permalink / raw)
To: Andi Kleen
Cc: linux-kernel, Ingo Molnar, Mike Galbraith, Dmitry Adamushko,
Thomas Gleixner, Lennart Poettering
On Wed, 2007-10-31 at 22:49 +0100, Andi Kleen wrote:
> Peter Zijlstra <a.p.zijlstra@chello.nl> writes:
>
> > Introduce a new rlimit that allows the user to set a runtime timeout
> > on real-time tasks. Once this limit is exceeded the task will receive
> > SIGXCPU.
>
> Nice idea.
>
> It would be even nicer if you could allow a couple of them. Partition
> the RT priorities into a few classes and have a separate limit for each of them.
>
> A small number of classes (3-4) would be probably enough and not bloat
> the rlimits too much.
>
> I'm thinking of the case where you have different kinds of real
> time processes. Like your mp3 player which you want to be slightly
> real time, but with a low SIGXCPU limit.
>
> And then something else real time which is more important and
> you would set a higher limit. etc.
But it's an rlimit, it can be set per process. Not sure what multiple
classes per process would gain us, let alone how that process has to
figure out which class to use.
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH 5/6] sched: SCHED_FIFO/SCHED_RR watchdog timer
2007-10-31 22:03 ` Peter Zijlstra
@ 2007-11-03 18:16 ` Andi Kleen
0 siblings, 0 replies; 22+ messages in thread
From: Andi Kleen @ 2007-11-03 18:16 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Andi Kleen, linux-kernel, Ingo Molnar, Mike Galbraith,
Dmitry Adamushko, Thomas Gleixner, Lennart Poettering
On Wed, Oct 31, 2007 at 11:03:28PM +0100, Peter Zijlstra wrote:
>
> On Wed, 2007-10-31 at 22:49 +0100, Andi Kleen wrote:
> > Peter Zijlstra <a.p.zijlstra@chello.nl> writes:
> >
> > > Introduce a new rlimit that allows the user to set a runtime timeout
> > > on real-time tasks. Once this limit is exceeded the task will receive
> > > SIGXCPU.
> >
> > Nice idea.
> >
> > It would be even nicer if you could allow a couple of them. Partition
> > the RT priorities into a few classes and have a separate limit for each of them.
> >
> > A small number of classes (3-4) would be probably enough and not bloat
> > the rlimits too much.
> >
> > I'm thinking of the case where you have different kinds of real
> > time processes. Like your mp3 player which you want to be slightly
> > real time, but with a low SIGXCPU limit.
> >
> > And then something else real time which is more important and
> > you would set a higher limit. etc.
>
> But it's an rlimit, it can be set per process. Not sure what multiple
That's impractical -- you would need to patch the process or call
it from a special program, which is not nice.
rlimits are useful to set a limit during log in. For that the
children can be all kinds of different processes and possibly use
different settings.
> classes per process would gain us, let alone how that process has to
> figure out which class to use.
You set the classes once per rlimit (e.g. in a pam module)
Then the processes set different scheduling priorities by themselves
(standard programs do that). Then that priority would map to a different
class.
-Andi
^ permalink raw reply [flat|nested] 22+ messages in thread
* [PATCH 6/6] sched: place_entity() comments
2007-10-31 21:10 [PATCH 0/6] various scheduler patches Peter Zijlstra
` (4 preceding siblings ...)
2007-10-31 21:10 ` [PATCH 5/6] sched: SCHED_FIFO/SCHED_RR watchdog timer Peter Zijlstra
@ 2007-10-31 21:10 ` Peter Zijlstra
2007-11-01 8:29 ` [PATCH 0/6] various scheduler patches Ingo Molnar
6 siblings, 0 replies; 22+ messages in thread
From: Peter Zijlstra @ 2007-10-31 21:10 UTC (permalink / raw)
To: linux-kernel
Cc: Ingo Molnar, Mike Galbraith, Dmitry Adamushko, Peter Zijlstra
[-- Attachment #1: sched-place-entity.patch --]
[-- Type: text/plain, Size: 1292 bytes --]
Add a few comments to place_entity().
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
kernel/sched_fair.c | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -582,19 +582,26 @@ place_entity(struct cfs_rq *cfs_rq, stru
} else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running)
vruntime += sched_vslice(cfs_rq)/2;
+ /*
+ * The 'current' period is already promised to the current tasks,
+ * however the extra weight of the new task will slow them down a
+ * little, place the new task so that it fits in the slot that
+ * stays open at the end.
+ */
if (initial && sched_feat(START_DEBIT))
vruntime += sched_vslice_add(cfs_rq, se);
if (!initial) {
+ /* sleeps upto a single latency don't count. */
if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se) &&
task_of(se)->policy != SCHED_BATCH)
vruntime -= sysctl_sched_latency;
- vruntime = max_t(s64, vruntime, se->vruntime);
+ /* ensure we never gain time by being placed backwards. */
+ vruntime = max_vruntime(se->vruntime, vruntime);
}
se->vruntime = vruntime;
-
}
static void
--
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH 0/6] various scheduler patches
2007-10-31 21:10 [PATCH 0/6] various scheduler patches Peter Zijlstra
` (5 preceding siblings ...)
2007-10-31 21:10 ` [PATCH 6/6] sched: place_entity() comments Peter Zijlstra
@ 2007-11-01 8:29 ` Ingo Molnar
2007-11-01 10:08 ` Peter Zijlstra
6 siblings, 1 reply; 22+ messages in thread
From: Ingo Molnar @ 2007-11-01 8:29 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: linux-kernel, Mike Galbraith, Dmitry Adamushko
* Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> My current scheduler queue, seems to work well on lappy
nice stuff! Both the hrtimers-tick feature and the rtlimit looks pretty
good. I'm wondering how well it works on SMP?
Ingo
^ permalink raw reply [flat|nested] 22+ messages in thread