public inbox for linux-kernel@vger.kernel.org
* [PATCH 0/7] Single RQ group scheduling
@ 2008-02-18  9:55 Peter Zijlstra
  2008-02-18  9:55 ` [PATCH 1/7] sched: cleanup old and rarely used debug features Peter Zijlstra
                   ` (7 more replies)
  0 siblings, 8 replies; 9+ messages in thread
From: Peter Zijlstra @ 2008-02-18  9:55 UTC (permalink / raw)
  To: Ingo Molnar, Mike Galbraith, Srivatsa Vaddagiri, dhaval,
	tong.n.li
  Cc: linux-kernel

This is my current queue for single RQ group scheduling.

Next on the list is SMP load balancing; for that I'm looking at Tong's
DWRR algorithm.
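
For reference, a very rough sketch of how I read DWRR -- everything below is
hypothetical, nothing like it exists in the tree: each CPU keeps round-active
and round-expired queues, a task of weight w may consume w round-slices per
round, and the balancer keeps all CPUs within one round of each other.

	struct dwrr_rq {
		struct list_head round_active;	/* tasks still owed time this round */
		struct list_head round_expired;	/* tasks done with this round */
		unsigned long round;		/* this cpu's round number */
	};

	struct dwrr_task {
		struct list_head node;
		unsigned int weight;		/* round-slices granted per round */
		unsigned int used;		/* round-slices consumed so far */
	};

	/* charge @p one round-slice; rotate rounds when everybody is done */
	static void dwrr_charge(struct dwrr_rq *rq, struct dwrr_task *p)
	{
		if (++p->used < p->weight)
			return;

		p->used = 0;
		list_move_tail(&p->node, &rq->round_expired);

		if (list_empty(&rq->round_active)) {
			list_splice_init(&rq->round_expired, &rq->round_active);
			rq->round++;
		}
	}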




* [PATCH 1/7] sched: cleanup old and rarely used debug features.
  2008-02-18  9:55 [PATCH 0/7] Single RQ group scheduling Peter Zijlstra
@ 2008-02-18  9:55 ` Peter Zijlstra
  2008-02-18  9:55 ` [PATCH 2/7] sched: fair-group scheduling vs latency Peter Zijlstra
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: Peter Zijlstra @ 2008-02-18  9:55 UTC (permalink / raw)
  To: Ingo Molnar, Mike Galbraith, Srivatsa Vaddagiri, dhaval,
	tong.n.li
  Cc: linux-kernel, Peter Zijlstra

[-- Attachment #1: sched-fair-clean-up.patch --]
[-- Type: text/plain, Size: 2105 bytes --]

TREE_AVG and APPROX_AVG are initial task placement policies that have been
disabled for a long while; time to remove them.
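
For context, since the renumbering below might otherwise look odd: these
constants are single bits in sysctl_sched_features, tested through the
sched_feat() macro, so the surviving features must stay distinct powers of
two. Roughly:

	#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)

	if (sched_feat(WAKEUP_PREEMPT))
		/* ... */;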

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/sched.c      |    8 ++------
 kernel/sched_fair.c |   14 --------------
 2 files changed, 2 insertions(+), 20 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -717,18 +717,14 @@ enum {
 	SCHED_FEAT_NEW_FAIR_SLEEPERS	= 1,
 	SCHED_FEAT_WAKEUP_PREEMPT	= 2,
 	SCHED_FEAT_START_DEBIT		= 4,
-	SCHED_FEAT_TREE_AVG		= 8,
-	SCHED_FEAT_APPROX_AVG		= 16,
-	SCHED_FEAT_HRTICK		= 32,
-	SCHED_FEAT_DOUBLE_TICK		= 64,
+	SCHED_FEAT_HRTICK		= 8,
+	SCHED_FEAT_DOUBLE_TICK		= 16,
 };
 
 const_debug unsigned int sysctl_sched_features =
 		SCHED_FEAT_NEW_FAIR_SLEEPERS	* 1 |
 		SCHED_FEAT_WAKEUP_PREEMPT	* 1 |
 		SCHED_FEAT_START_DEBIT		* 1 |
-		SCHED_FEAT_TREE_AVG		* 0 |
-		SCHED_FEAT_APPROX_AVG		* 0 |
 		SCHED_FEAT_HRTICK		* 1 |
 		SCHED_FEAT_DOUBLE_TICK		* 0;
 
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -288,11 +288,6 @@ static u64 __sched_vslice(unsigned long 
 	return vslice;
 }
 
-static u64 sched_vslice(struct cfs_rq *cfs_rq)
-{
-	return __sched_vslice(cfs_rq->load.weight, cfs_rq->nr_running);
-}
-
 static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	return __sched_vslice(cfs_rq->load.weight + se->load.weight,
@@ -500,15 +495,6 @@ place_entity(struct cfs_rq *cfs_rq, stru
 
 	vruntime = cfs_rq->min_vruntime;
 
-	if (sched_feat(TREE_AVG)) {
-		struct sched_entity *last = __pick_last_entity(cfs_rq);
-		if (last) {
-			vruntime += last->vruntime;
-			vruntime >>= 1;
-		}
-	} else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running)
-		vruntime += sched_vslice(cfs_rq)/2;
-
 	/*
 	 * The 'current' period is already promised to the current tasks,
 	 * however the extra weight of the new task will slow them down a

--



* [PATCH 2/7] sched: fair-group scheduling vs latency
  2008-02-18  9:55 [PATCH 0/7] Single RQ group scheduling Peter Zijlstra
  2008-02-18  9:55 ` [PATCH 1/7] sched: cleanup old and rarely used debug features Peter Zijlstra
@ 2008-02-18  9:55 ` Peter Zijlstra
  2008-02-18  9:55 ` [PATCH 3/7] sched: fair-group: de-couple load-balancing from the rb-trees Peter Zijlstra
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: Peter Zijlstra @ 2008-02-18  9:55 UTC (permalink / raw)
  To: Ingo Molnar, Mike Galbraith, Srivatsa Vaddagiri, dhaval,
	tong.n.li
  Cc: linux-kernel, Peter Zijlstra

[-- Attachment #1: sched-fair-group-slice.patch --]
[-- Type: text/plain, Size: 7398 bytes --]

Currently FAIR_GROUP scheduling grows the effective scheduler latency beyond
sysctl_sched_latency; invert this so it stays within that bound.
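
The crux, extracted from the interleaved hunks below for readability: the
patched sched_slice() scales one latency period by the entity's share of the
runqueue weight at every level of the hierarchy, so the per-task slices still
sum to a single sysctl_sched_latency period:

	static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
	{
		u64 slice = __sched_period(cfs_rq->nr_running);

		for_each_sched_entity(se) {
			cfs_rq = cfs_rq_of(se);

			slice *= se->load.weight;
			do_div(slice, cfs_rq->load.weight);
		}

		return slice;
	}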

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/sched_fair.c |  233 ++++++++++++++++++++++++++--------------------------
 1 file changed, 120 insertions(+), 113 deletions(-)

Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -87,6 +87,11 @@ const_debug unsigned int sysctl_sched_mi
  * CFS operations on generic schedulable entities:
  */
 
+static inline struct task_struct *task_of(struct sched_entity *se)
+{
+	return container_of(se, struct task_struct, se);
+}
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
 /* cpu runqueue to which this cfs_rq is attached */
@@ -98,6 +103,56 @@ static inline struct rq *rq_of(struct cf
 /* An entity is a task if it doesn't "own" a runqueue */
 #define entity_is_task(se)	(!se->my_q)
 
+/* Walk up scheduling entities hierarchy */
+#define for_each_sched_entity(se) \
+		for (; se; se = se->parent)
+
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+{
+	return p->se.cfs_rq;
+}
+
+/* runqueue on which this entity is (to be) queued */
+static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+{
+	return se->cfs_rq;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+	return grp->my_q;
+}
+
+/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
+ * another cpu ('this_cpu')
+ */
+static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
+{
+	return cfs_rq->tg->cfs_rq[this_cpu];
+}
+
+/* Iterate thr' all leaf cfs_rq's on a runqueue */
+#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+
+/* Do the two (enqueued) entities belong to the same group ? */
+static inline int
+is_same_group(struct sched_entity *se, struct sched_entity *pse)
+{
+	if (se->cfs_rq == pse->cfs_rq)
+		return 1;
+
+	return 0;
+}
+
+static inline struct sched_entity *parent_entity(struct sched_entity *se)
+{
+	return se->parent;
+}
+
+#define GROUP_IMBALANCE_PCT	20
+
 #else	/* CONFIG_FAIR_GROUP_SCHED */
 
 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
@@ -107,13 +162,49 @@ static inline struct rq *rq_of(struct cf
 
 #define entity_is_task(se)	1
 
-#endif	/* CONFIG_FAIR_GROUP_SCHED */
+#define for_each_sched_entity(se) \
+		for (; se; se = NULL)
 
-static inline struct task_struct *task_of(struct sched_entity *se)
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 {
-	return container_of(se, struct task_struct, se);
+	return &task_rq(p)->cfs;
+}
+
+static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+{
+	struct task_struct *p = task_of(se);
+	struct rq *rq = task_rq(p);
+
+	return &rq->cfs;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+	return NULL;
+}
+
+static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
+{
+	return &cpu_rq(this_cpu)->cfs;
+}
+
+#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+
+static inline int
+is_same_group(struct sched_entity *se, struct sched_entity *pse)
+{
+	return 1;
+}
+
+static inline struct sched_entity *parent_entity(struct sched_entity *se)
+{
+	return NULL;
 }
 
+#endif	/* CONFIG_FAIR_GROUP_SCHED */
+
 
 /**************************************************************
  * Scheduling class tree data structure manipulation methods:
@@ -267,31 +358,44 @@ static u64 sched_slice(struct cfs_rq *cf
 {
 	u64 slice = __sched_period(cfs_rq->nr_running);
 
-	slice *= se->load.weight;
-	do_div(slice, cfs_rq->load.weight);
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+
+		slice *= se->load.weight;
+		do_div(slice, cfs_rq->load.weight);
+	}
 
 	return slice;
 }
 
 /*
- * We calculate the vruntime slice.
+ * We calculate the vruntime slice of a to be inserted task
  *
  * vs = s/w = p/rw
  */
-static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running)
+static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	u64 vslice = __sched_period(nr_running);
+	unsigned long nr_running = cfs_rq->nr_running;
+	unsigned long weight;
+	u64 vslice;
 
-	vslice *= NICE_0_LOAD;
-	do_div(vslice, rq_weight);
+	if (!se->on_rq)
+		nr_running++;
 
-	return vslice;
-}
+	vslice = __sched_period(nr_running);
 
-static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-	return __sched_vslice(cfs_rq->load.weight + se->load.weight,
-			cfs_rq->nr_running + 1);
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+
+		weight = cfs_rq->load.weight;
+		if (!se->on_rq)
+			weight += se->load.weight;
+
+		vslice *= NICE_0_LOAD;
+		do_div(vslice, weight);
+	}
+
+	return vslice;
 }
 
 /*
@@ -668,103 +772,6 @@ entity_tick(struct cfs_rq *cfs_rq, struc
  * CFS operations on tasks:
  */
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-/* Walk up scheduling entities hierarchy */
-#define for_each_sched_entity(se) \
-		for (; se; se = se->parent)
-
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
-{
-	return p->se.cfs_rq;
-}
-
-/* runqueue on which this entity is (to be) queued */
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
-{
-	return se->cfs_rq;
-}
-
-/* runqueue "owned" by this group */
-static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
-{
-	return grp->my_q;
-}
-
-/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
- * another cpu ('this_cpu')
- */
-static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
-{
-	return cfs_rq->tg->cfs_rq[this_cpu];
-}
-
-/* Iterate thr' all leaf cfs_rq's on a runqueue */
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
-	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
-
-/* Do the two (enqueued) entities belong to the same group ? */
-static inline int
-is_same_group(struct sched_entity *se, struct sched_entity *pse)
-{
-	if (se->cfs_rq == pse->cfs_rq)
-		return 1;
-
-	return 0;
-}
-
-static inline struct sched_entity *parent_entity(struct sched_entity *se)
-{
-	return se->parent;
-}
-
-#define GROUP_IMBALANCE_PCT	20
-
-#else	/* CONFIG_FAIR_GROUP_SCHED */
-
-#define for_each_sched_entity(se) \
-		for (; se; se = NULL)
-
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
-{
-	return &task_rq(p)->cfs;
-}
-
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
-{
-	struct task_struct *p = task_of(se);
-	struct rq *rq = task_rq(p);
-
-	return &rq->cfs;
-}
-
-/* runqueue "owned" by this group */
-static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
-{
-	return NULL;
-}
-
-static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
-{
-	return &cpu_rq(this_cpu)->cfs;
-}
-
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
-		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
-
-static inline int
-is_same_group(struct sched_entity *se, struct sched_entity *pse)
-{
-	return 1;
-}
-
-static inline struct sched_entity *parent_entity(struct sched_entity *se)
-{
-	return NULL;
-}
-
-#endif	/* CONFIG_FAIR_GROUP_SCHED */
-
 #ifdef CONFIG_SCHED_HRTICK
 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
 {

--



* [PATCH 3/7] sched: fair-group: de-couple load-balancing from the rb-trees
  2008-02-18  9:55 [PATCH 0/7] Single RQ group scheduling Peter Zijlstra
  2008-02-18  9:55 ` [PATCH 1/7] sched: cleanup old and rarely used debug features Peter Zijlstra
  2008-02-18  9:55 ` [PATCH 2/7] sched: fair-group scheduling vs latency Peter Zijlstra
@ 2008-02-18  9:55 ` Peter Zijlstra
  2008-02-18  9:55 ` [PATCH 4/7] sched: fair-group: single RQ approach Peter Zijlstra
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: Peter Zijlstra @ 2008-02-18  9:55 UTC (permalink / raw)
  To: Ingo Molnar, Mike Galbraith, Srivatsa Vaddagiri, dhaval,
	tong.n.li
  Cc: linux-kernel, Peter Zijlstra

[-- Attachment #1: sched-fair-group-smp.patch --]
[-- Type: text/plain, Size: 4286 bytes --]

De-couple load balancing from the rb-trees so that I can change their
organization. Afterwards we'll probably want to redo load balancing so that
it no longer needs the grouping info.
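
For context, the balancer consumes these start/next callbacks through the
existing rq_iterator glue; roughly (paraphrased from kernel/sched.c of this
era):

	struct rq_iterator {
		void *arg;
		struct task_struct *(*start)(void *);
		struct task_struct *(*next)(void *);
	};

	/* balance_tasks() then walks a class's tasks like: */
	struct task_struct *p;

	for (p = iterator->start(iterator->arg); p;
	     p = iterator->next(iterator->arg)) {
		/* can we migrate p? if so, pull it */
	}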

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/init_task.h |    3 +++
 include/linux/sched.h     |    1 +
 kernel/sched.c            |   12 ++++++++++--
 kernel/sched_fair.c       |   15 +++++++++------
 4 files changed, 23 insertions(+), 8 deletions(-)

Index: linux-2.6/include/linux/init_task.h
===================================================================
--- linux-2.6.orig/include/linux/init_task.h
+++ linux-2.6/include/linux/init_task.h
@@ -132,6 +132,9 @@ extern struct group_info init_groups;
 	.cpus_allowed	= CPU_MASK_ALL,					\
 	.mm		= NULL,						\
 	.active_mm	= &init_mm,					\
+	.se		= {						\
+		.group_node 	= LIST_HEAD_INIT(tsk.se.group_node),	\
+	},								\
 	.rt		= {						\
 		.run_list	= LIST_HEAD_INIT(tsk.rt.run_list),	\
 		.time_slice	= HZ, 					\
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -909,6 +909,7 @@ struct load_weight {
 struct sched_entity {
 	struct load_weight	load;		/* for load-balancing */
 	struct rb_node		run_node;
+	struct list_head	group_node;
 	unsigned int		on_rq;
 
 	u64			exec_start;
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -437,8 +437,12 @@ struct cfs_rq {
 
 	struct rb_root tasks_timeline;
 	struct rb_node *rb_leftmost;
-	struct rb_node *rb_load_balance_curr;
-	/* 'curr' points to currently running entity on this cfs_rq.
+
+	struct list_head tasks;
+	struct list_head *balance_iterator;
+
+	/*
+	 * 'curr' points to currently running entity on this cfs_rq.
 	 * It is set to NULL otherwise (i.e when none are currently running).
 	 */
 	struct sched_entity *curr;
@@ -2025,6 +2029,7 @@ static void __sched_fork(struct task_str
 
 	INIT_LIST_HEAD(&p->rt.run_list);
 	p->se.on_rq = 0;
+	INIT_LIST_HEAD(&p->se.group_node);
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -7196,6 +7201,7 @@ int in_sched_functions(unsigned long add
 static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
 {
 	cfs_rq->tasks_timeline = RB_ROOT;
+	INIT_LIST_HEAD(&cfs_rq->tasks);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	cfs_rq->rq = rq;
 #endif
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -523,6 +523,7 @@ account_entity_enqueue(struct cfs_rq *cf
 	update_load_add(&cfs_rq->load, se->load.weight);
 	cfs_rq->nr_running++;
 	se->on_rq = 1;
+	list_add(&se->group_node, &cfs_rq->tasks);
 }
 
 static void
@@ -531,6 +532,7 @@ account_entity_dequeue(struct cfs_rq *cf
 	update_load_sub(&cfs_rq->load, se->load.weight);
 	cfs_rq->nr_running--;
 	se->on_rq = 0;
+	list_del_init(&se->group_node);
 }
 
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -1156,16 +1158,17 @@ static void put_prev_task_fair(struct rq
  * achieve that by always pre-iterating before returning
  * the current task:
  */
+
 static struct task_struct *
-__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr)
+__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
 {
 	struct task_struct *p;
 
-	if (!curr)
+	if (next == &cfs_rq->tasks)
 		return NULL;
 
-	p = rb_entry(curr, struct task_struct, se.run_node);
-	cfs_rq->rb_load_balance_curr = rb_next(curr);
+	p = list_entry(next, struct task_struct, se.group_node);
+	cfs_rq->balance_iterator = next->next;
 
 	return p;
 }
@@ -1174,14 +1177,14 @@ static struct task_struct *load_balance_
 {
 	struct cfs_rq *cfs_rq = arg;
 
-	return __load_balance_iterator(cfs_rq, first_fair(cfs_rq));
+	return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next);
 }
 
 static struct task_struct *load_balance_next_fair(void *arg)
 {
 	struct cfs_rq *cfs_rq = arg;
 
-	return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
+	return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
 }
 
 static unsigned long

--



* [PATCH 4/7] sched: fair-group: single RQ approach
  2008-02-18  9:55 [PATCH 0/7] Single RQ group scheduling Peter Zijlstra
                   ` (2 preceding siblings ...)
  2008-02-18  9:55 ` [PATCH 3/7] sched: fair-group: de-couple load-balancing from the rb-trees Peter Zijlstra
@ 2008-02-18  9:55 ` Peter Zijlstra
  2008-02-18  9:55 ` [PATCH 5/7] sched: remove sysctl_sched_batch_wakeup_granularity Peter Zijlstra
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: Peter Zijlstra @ 2008-02-18  9:55 UTC (permalink / raw)
  To: Ingo Molnar, Mike Galbraith, Srivatsa Vaddagiri, dhaval,
	tong.n.li
  Cc: linux-kernel, Peter Zijlstra

[-- Attachment #1: sched-fair-group-single-rq.patch --]
[-- Type: text/plain, Size: 9659 bytes --]

The current hierarchical RQ group scheduler suffers from a number of problems:
 - yield
 - wakeup preemption
 - sleeper fairness

All of these stem from the isolation caused by the multiple RQ design: vruntime
becomes a local, per-group property and is no longer comparable across groups.

Solve this by returning to a single RQ model; a condensed sketch of the crux
follows.
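
Condensed from the hunks below: group entities no longer get an rb-tree slot
at all, and every task is enqueued into the root cfs_rq's tree, which makes
vruntime globally comparable again:

	static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
	{
		if (!entity_is_task(se))	/* group entities own no tree slot */
			return;
		if (se == cfs_rq->curr)		/* 'current' is kept out of the tree */
			return;

		cfs_rq = &rq_of(cfs_rq)->cfs;	/* redirect to the root runqueue */

		/* ... ordinary rb-tree insert keyed on entity_key(cfs_rq, se) ... */
	}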

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/sched.c      |    9 --
 kernel/sched_fair.c |  160 +++++++++++++++++++++++++++++++++-------------------
 2 files changed, 105 insertions(+), 64 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -1197,6 +1197,9 @@ static void __resched_task(struct task_s
  */
 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
 
+/*
+ * delta *= weight / lw
+ */
 static unsigned long
 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 		struct load_weight *lw)
@@ -1219,12 +1222,6 @@ calc_delta_mine(unsigned long delta_exec
 	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
 }
 
-static inline unsigned long
-calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
-{
-	return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
-}
-
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 {
 	lw->weight += inc;
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -238,12 +238,22 @@ static inline s64 entity_key(struct cfs_
  */
 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
+	struct rb_node **link;
 	struct rb_node *parent = NULL;
 	struct sched_entity *entry;
-	s64 key = entity_key(cfs_rq, se);
+	s64 key;
 	int leftmost = 1;
 
+	if (!entity_is_task(se))
+		return;
+
+	if (se == cfs_rq->curr)
+		return;
+
+	cfs_rq = &rq_of(cfs_rq)->cfs;
+
+	link = &cfs_rq->tasks_timeline.rb_node;
+	key = entity_key(cfs_rq, se);
 	/*
 	 * Find the right place in the rbtree:
 	 */
@@ -275,6 +285,14 @@ static void __enqueue_entity(struct cfs_
 
 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
+	if (!entity_is_task(se))
+		return;
+
+	if (se == cfs_rq->curr)
+		return;
+
+	cfs_rq = &rq_of(cfs_rq)->cfs;
+
 	if (cfs_rq->rb_leftmost == &se->run_node)
 		cfs_rq->rb_leftmost = rb_next(&se->run_node);
 
@@ -358,6 +376,13 @@ static u64 sched_slice(struct cfs_rq *cf
 {
 	u64 slice = __sched_period(cfs_rq->nr_running);
 
+	/*
+	 * FIXME: curious 'hack' to make it boot - when the tick is started we
+	 * hit this with the init_task, which is not actually enqueued.
+	 */
+	if (unlikely(rq_of(cfs_rq)->load.weight <= se->load.weight))
+		goto out;
+
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
 
@@ -365,37 +390,66 @@ static u64 sched_slice(struct cfs_rq *cf
 		do_div(slice, cfs_rq->load.weight);
 	}
 
+out:
 	return slice;
 }
 
 /*
  * We calculate the vruntime slice of a to be inserted task
  *
- * vs = s/w = p/rw
+ * vs = s*rw/w = p
  */
 static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	unsigned long nr_running = cfs_rq->nr_running;
-	unsigned long weight;
-	u64 vslice;
 
 	if (!se->on_rq)
 		nr_running++;
 
-	vslice = __sched_period(nr_running);
+	return __sched_period(nr_running);
+}
 
+static inline unsigned long
+calc_delta_fair(unsigned long delta, struct sched_entity *se)
+{
 	for_each_sched_entity(se) {
-		cfs_rq = cfs_rq_of(se);
+		delta = calc_delta_mine(delta,
+				cfs_rq_of(se)->load.weight, &se->load);
+	}
 
-		weight = cfs_rq->load.weight;
-		if (!se->on_rq)
-			weight += se->load.weight;
+	return delta;
+}
 
-		vslice *= NICE_0_LOAD;
-		do_div(vslice, weight);
+/*
+ * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
+ * that it favours >=0 over <0.
+ *
+ *   -20         |
+ *               |
+ *     0 --------+-------
+ *             .'
+ *    19     .'
+ *
+ */
+static unsigned long
+calc_delta_asym(unsigned long delta, struct sched_entity *se)
+{
+	struct load_weight lw = {
+		.weight = NICE_0_LOAD,
+		.inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
+	};
+
+	for_each_sched_entity(se) {
+		struct load_weight *se_lw = &se->load;
+
+		if (se->load.weight < NICE_0_LOAD)
+			se_lw = &lw;
+
+		delta = calc_delta_mine(delta,
+				cfs_rq_of(se)->load.weight, se_lw);
 	}
 
-	return vslice;
+	return delta;
 }
 
 /*
@@ -413,13 +467,14 @@ __update_curr(struct cfs_rq *cfs_rq, str
 
 	curr->sum_exec_runtime += delta_exec;
 	schedstat_add(cfs_rq, exec_clock, delta_exec);
-	delta_exec_weighted = delta_exec;
-	if (unlikely(curr->load.weight != NICE_0_LOAD)) {
-		delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
-							&curr->load);
-	}
+	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
 	curr->vruntime += delta_exec_weighted;
 
+	if (!entity_is_task(curr))
+		return;
+
+	cfs_rq = &rq_of(cfs_rq)->cfs;
+
 	/*
 	 * maintain cfs_rq->min_vruntime to be a monotonic increasing
 	 * value tracking the leftmost vruntime in the tree.
@@ -599,6 +654,11 @@ place_entity(struct cfs_rq *cfs_rq, stru
 {
 	u64 vruntime;
 
+	if (!entity_is_task(se))
+		return;
+
+	cfs_rq = &rq_of(cfs_rq)->cfs;
+
 	vruntime = cfs_rq->min_vruntime;
 
 	/*
@@ -613,7 +673,7 @@ place_entity(struct cfs_rq *cfs_rq, stru
 	if (!initial) {
 		/* sleeps upto a single latency don't count. */
 		if (sched_feat(NEW_FAIR_SLEEPERS))
-			vruntime -= sysctl_sched_latency;
+			vruntime -= calc_delta_asym(sysctl_sched_latency, se);
 
 		/* ensure we never gain time by being placed backwards. */
 		vruntime = max_vruntime(se->vruntime, vruntime);
@@ -637,8 +697,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
 
 	update_stats_enqueue(cfs_rq, se);
 	check_spread(cfs_rq, se);
-	if (se != cfs_rq->curr)
-		__enqueue_entity(cfs_rq, se);
+	__enqueue_entity(cfs_rq, se);
 	account_entity_enqueue(cfs_rq, se);
 }
 
@@ -664,8 +723,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
 #endif
 	}
 
-	if (se != cfs_rq->curr)
-		__dequeue_entity(cfs_rq, se);
+	__dequeue_entity(cfs_rq, se);
 	account_entity_dequeue(cfs_rq, se);
 }
 
@@ -694,6 +752,8 @@ set_next_entity(struct cfs_rq *cfs_rq, s
 		 * runqueue.
 		 */
 		update_stats_wait_end(cfs_rq, se);
+		if (WARN_ON_ONCE(cfs_rq->curr))
+			cfs_rq->curr = NULL;
 		__dequeue_entity(cfs_rq, se);
 	}
 
@@ -713,18 +773,6 @@ set_next_entity(struct cfs_rq *cfs_rq, s
 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
 }
 
-static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
-{
-	struct sched_entity *se = NULL;
-
-	if (first_fair(cfs_rq)) {
-		se = __pick_next_entity(cfs_rq);
-		set_next_entity(cfs_rq, se);
-	}
-
-	return se;
-}
-
 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 {
 	/*
@@ -735,12 +783,12 @@ static void put_prev_entity(struct cfs_r
 		update_curr(cfs_rq);
 
 	check_spread(cfs_rq, prev);
+	cfs_rq->curr = NULL;
 	if (prev->on_rq) {
 		update_stats_wait_start(cfs_rq, prev);
 		/* Put 'current' back into the tree. */
 		__enqueue_entity(cfs_rq, prev);
 	}
-	cfs_rq->curr = NULL;
 }
 
 static void
@@ -751,6 +799,9 @@ entity_tick(struct cfs_rq *cfs_rq, struc
 	 */
 	update_curr(cfs_rq);
 
+	if (!entity_is_task(curr))
+		return;
+
 #ifdef CONFIG_SCHED_HRTICK
 	/*
 	 * queued ticks are scheduled to match the slice, so don't bother
@@ -766,7 +817,8 @@ entity_tick(struct cfs_rq *cfs_rq, struc
 		return;
 #endif
 
-	if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
+	if (rq_of(cfs_rq)->load.weight != curr->load.weight ||
+			!sched_feat(WAKEUP_PREEMPT))
 		check_preempt_tick(cfs_rq, curr);
 }
 
@@ -891,7 +943,7 @@ static void yield_task_fair(struct rq *r
 	/*
 	 * Are we the only task in the tree?
 	 */
-	if (unlikely(cfs_rq->nr_running == 1))
+	if (unlikely(rq->load.weight == curr->se.load.weight))
 		return;
 
 	if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
@@ -906,7 +958,7 @@ static void yield_task_fair(struct rq *r
 	/*
 	 * Find the rightmost entry in the rbtree:
 	 */
-	rightmost = __pick_last_entity(cfs_rq);
+	rightmost = __pick_last_entity(&rq->cfs);
 	/*
 	 * Already in the rightmost position?
 	 */
@@ -1068,7 +1120,6 @@ out_set_cpu:
 }
 #endif /* CONFIG_SMP */
 
-
 /*
  * Preempt the current task with a newly woken task if needed:
  */
@@ -1095,19 +1146,11 @@ static void check_preempt_wakeup(struct 
 	if (!sched_feat(WAKEUP_PREEMPT))
 		return;
 
-	while (!is_same_group(se, pse)) {
-		se = parent_entity(se);
-		pse = parent_entity(pse);
-	}
-
-	gran = sysctl_sched_wakeup_granularity;
 	/*
-	 * More easily preempt - nice tasks, while not making
-	 * it harder for + nice tasks.
+	 * More easily preempt - nice tasks, while not making it harder for
+	 * + nice tasks.
 	 */
-	if (unlikely(se->load.weight > NICE_0_LOAD))
-		gran = calc_delta_fair(gran, &se->load);
-
+	gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
 	if (pse->vruntime + gran < se->vruntime)
 		resched_task(curr);
 }
@@ -1116,17 +1159,18 @@ static struct task_struct *pick_next_tas
 {
 	struct task_struct *p;
 	struct cfs_rq *cfs_rq = &rq->cfs;
-	struct sched_entity *se;
+	struct sched_entity *se, *next;
 
-	if (unlikely(!cfs_rq->nr_running))
+	if (!first_fair(cfs_rq))
 		return NULL;
 
-	do {
-		se = pick_next_entity(cfs_rq);
-		cfs_rq = group_cfs_rq(se);
-	} while (cfs_rq);
+	next = se = __pick_next_entity(cfs_rq);
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+		set_next_entity(cfs_rq, se);
+	}
 
-	p = task_of(se);
+	p = task_of(next);
 	hrtick_start_fair(rq, p);
 
 	return p;

--



* [PATCH 5/7] sched: remove sysctl_sched_batch_wakeup_granularity
  2008-02-18  9:55 [PATCH 0/7] Single RQ group scheduling Peter Zijlstra
                   ` (3 preceding siblings ...)
  2008-02-18  9:55 ` [PATCH 4/7] sched: fair-group: single RQ approach Peter Zijlstra
@ 2008-02-18  9:55 ` Peter Zijlstra
  2008-02-18  9:55 ` [PATCH 6/7] sched: fair: optimize sched_slice() Peter Zijlstra
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: Peter Zijlstra @ 2008-02-18  9:55 UTC (permalink / raw)
  To: Ingo Molnar, Mike Galbraith, Srivatsa Vaddagiri, dhaval,
	tong.n.li
  Cc: linux-kernel, Peter Zijlstra

[-- Attachment #1: sched-fair-remove-batch-wakeup.patch --]
[-- Type: text/plain, Size: 3256 bytes --]

It's unused; toss it!

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/sched.h |    1 -
 kernel/sched.c        |    1 -
 kernel/sched_debug.c  |    1 -
 kernel/sched_fair.c   |   10 ----------
 kernel/sysctl.c       |   11 -----------
 5 files changed, 24 deletions(-)

Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -1526,7 +1526,6 @@ extern void sched_idle_next(void);
 extern unsigned int sysctl_sched_latency;
 extern unsigned int sysctl_sched_min_granularity;
 extern unsigned int sysctl_sched_wakeup_granularity;
-extern unsigned int sysctl_sched_batch_wakeup_granularity;
 extern unsigned int sysctl_sched_child_runs_first;
 extern unsigned int sysctl_sched_features;
 extern unsigned int sysctl_sched_migration_cost;
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -62,16 +62,6 @@ const_debug unsigned int sysctl_sched_ch
 unsigned int __read_mostly sysctl_sched_compat_yield;
 
 /*
- * SCHED_BATCH wake-up granularity.
- * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
- *
- * This option delays the preemption effects of decoupled workloads
- * and reduces their over-scheduling. Synchronous workloads will still
- * have immediate wakeup/sleep latencies.
- */
-unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;
-
-/*
  * SCHED_OTHER wake-up granularity.
  * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
  *
Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -270,17 +270,6 @@ static struct ctl_table kern_table[] = {
 	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "sched_batch_wakeup_granularity_ns",
-		.data		= &sysctl_sched_batch_wakeup_granularity,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
-		.extra1		= &min_wakeup_granularity_ns,
-		.extra2		= &max_wakeup_granularity_ns,
-	},
-	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "sched_child_runs_first",
 		.data		= &sysctl_sched_child_runs_first,
 		.maxlen		= sizeof(unsigned int),
Index: linux-2.6/kernel/sched_debug.c
===================================================================
--- linux-2.6.orig/kernel/sched_debug.c
+++ linux-2.6/kernel/sched_debug.c
@@ -214,7 +214,6 @@ static int sched_debug_show(struct seq_f
 	PN(sysctl_sched_latency);
 	PN(sysctl_sched_min_granularity);
 	PN(sysctl_sched_wakeup_granularity);
-	PN(sysctl_sched_batch_wakeup_granularity);
 	PN(sysctl_sched_child_runs_first);
 	P(sysctl_sched_features);
 #undef PN
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -5381,7 +5381,6 @@ static inline void sched_init_granularit
 		sysctl_sched_latency = limit;
 
 	sysctl_sched_wakeup_granularity *= factor;
-	sysctl_sched_batch_wakeup_granularity *= factor;
 }
 
 #ifdef CONFIG_SMP

--



* [PATCH 6/7] sched: fair: optimize sched_slice()
  2008-02-18  9:55 [PATCH 0/7] Single RQ group scheduling Peter Zijlstra
                   ` (4 preceding siblings ...)
  2008-02-18  9:55 ` [PATCH 5/7] sched: remove sysctl_sched_batch_wakeup_granularity Peter Zijlstra
@ 2008-02-18  9:55 ` Peter Zijlstra
  2008-02-18  9:55 ` [PATCH 7/7] sched: fair: vruntime spread Peter Zijlstra
  2008-02-19  6:10 ` [PATCH 0/7] Single RQ group scheduling Mike Galbraith
  7 siblings, 0 replies; 9+ messages in thread
From: Peter Zijlstra @ 2008-02-18  9:55 UTC (permalink / raw)
  To: Ingo Molnar, Mike Galbraith, Srivatsa Vaddagiri, dhaval,
	tong.n.li
  Cc: linux-kernel, Peter Zijlstra

[-- Attachment #1: sched-opt-sched_slice.patch --]
[-- Type: text/plain, Size: 1824 bytes --]

Cache the divide in the hope that we can avoid doing it in steady-state
operation.
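
The idea, as a userspace sketch (the kernel's calc_delta_mine() splits the
shift via SRR() instead of relying on a 128-bit multiply):
update_load_{add,sub}() just zero inv_weight, and the divide is redone lazily
only when the weight actually changed:

	#define WMULT_SHIFT	32

	struct load_weight { unsigned long weight, inv_weight; };

	static unsigned long
	calc_delta(unsigned long delta, unsigned long weight, struct load_weight *lw)
	{
		if (!lw->inv_weight)		/* invalidated by update_load_*() */
			lw->inv_weight = (1ULL << WMULT_SHIFT) / lw->weight;

		/* delta * weight / lw->weight, via the cached reciprocal */
		return ((unsigned __int128)delta * weight * lw->inv_weight)
				>> WMULT_SHIFT;
	}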

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/sched.c       |    2 ++
 kernel/sched_debug.c |    1 +
 kernel/sched_fair.c  |    6 ++----
 3 files changed, 5 insertions(+), 4 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -1225,11 +1225,13 @@ calc_delta_mine(unsigned long delta_exec
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 {
 	lw->weight += inc;
+	lw->inv_weight = 0;
 }
 
 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
 {
 	lw->weight -= dec;
+	lw->inv_weight = 0;
 }
 
 /*
Index: linux-2.6/kernel/sched_debug.c
===================================================================
--- linux-2.6.orig/kernel/sched_debug.c
+++ linux-2.6/kernel/sched_debug.c
@@ -137,6 +137,7 @@ void print_cfs_rq(struct seq_file *m, in
 			SPLIT_NS(spread0));
 	SEQ_printf(m, "  .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
 	SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);
+	SEQ_printf(m, "  .%-30s: %ld\n", "load^-1", cfs_rq->load.inv_weight);
 #ifdef CONFIG_SCHEDSTATS
 	SEQ_printf(m, "  .%-30s: %d\n", "bkl_count",
 			rq->bkl_count);
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -374,10 +374,8 @@ static u64 sched_slice(struct cfs_rq *cf
 		goto out;
 
 	for_each_sched_entity(se) {
-		cfs_rq = cfs_rq_of(se);
-
-		slice *= se->load.weight;
-		do_div(slice, cfs_rq->load.weight);
+		slice = calc_delta_mine(slice,
+				se->load.weight, &cfs_rq_of(se)->load);
 	}
 
 out:

--



* [PATCH 7/7] sched: fair: vruntime spread
  2008-02-18  9:55 [PATCH 0/7] Single RQ group scheduling Peter Zijlstra
                   ` (5 preceding siblings ...)
  2008-02-18  9:55 ` [PATCH 6/7] sched: fair: optimize sched_slice() Peter Zijlstra
@ 2008-02-18  9:55 ` Peter Zijlstra
  2008-02-19  6:10 ` [PATCH 0/7] Single RQ group scheduling Mike Galbraith
  7 siblings, 0 replies; 9+ messages in thread
From: Peter Zijlstra @ 2008-02-18  9:55 UTC (permalink / raw)
  To: Ingo Molnar, Mike Galbraith, Srivatsa Vaddagiri, dhaval,
	tong.n.li
  Cc: linux-kernel, Peter Zijlstra

[-- Attachment #1: sched-fair-spread.patch --]
[-- Type: text/plain, Size: 5635 bytes --]

By its very nature we try to converge vruntime between tasks. This makes it
very hard to interleave groups that have varying latency requirements; they
end up in a single 'lump'. Avoid this by introducing an artificial
vruntime offset:

A1 |--------------|
A2      |--------------|
A3           |--------------|

New            |--------------|

Because tasks have no stable (dense) enumeration within a group
and we want the tasks evenly spaced within the period in a regular
fashion, we use an ascending iterator (nr_iter).

This ensures that in a steady state, each task will have the same offset.

However, when a new task gets inserted we cannot re-adjust all offsets,
hence we approximate by inserting the new task at p(1-1/n).
This is why account_entity_enqueue() sets nr_iter to nr_running-1.
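
Condensed from the sched_voffset() hunk below: offsets step through
0..nr_running-1 in units of sysctl_sched_min_granularity (so a freshly
enqueued task starts at nr_running-1, roughly p(1-1/n)), and entity_key()
then sorts on vruntime + voffset:

	u64 frac = sysctl_sched_min_granularity;

	frac *= cfs_rq->nr_iter;		/* i-th insertion gets offset i*g */
	if (++cfs_rq->nr_iter >= cfs_rq->nr_running)
		cfs_rq->nr_iter = 0;		/* wrap for the next period */

	se->voffset = frac;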

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/sched.h |    1 
 kernel/sched.c        |    2 +
 kernel/sched_fair.c   |   56 ++++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 55 insertions(+), 4 deletions(-)

Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -923,6 +923,7 @@ struct sched_entity {
 	u64			exec_start;
 	u64			sum_exec_runtime;
 	u64			vruntime;
+	u64			voffset;
 	u64			prev_sum_exec_runtime;
 
 #ifdef CONFIG_SCHEDSTATS
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -220,9 +220,11 @@ static inline u64 min_vruntime(u64 min_v
 
 static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	return se->vruntime - cfs_rq->min_vruntime;
+	return se->vruntime + se->voffset - cfs_rq->min_vruntime;
 }
 
+static u64 sched_voffset(struct cfs_rq *cfs_rq, struct sched_entity *se);
+
 /*
  * Enqueue an entity into the rb-tree:
  */
@@ -240,6 +242,8 @@ static void __enqueue_entity(struct cfs_
 	if (se == cfs_rq->curr)
 		return;
 
+	se->voffset = sched_voffset(cfs_rq, se);
+
 	cfs_rq = &rq_of(cfs_rq)->cfs;
 
 	link = &cfs_rq->tasks_timeline.rb_node;
@@ -387,7 +391,7 @@ out:
  *
  * vs = s*rw/w = p
  */
-static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static inline u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	unsigned long nr_running = cfs_rq->nr_running;
 
@@ -397,6 +401,46 @@ static u64 sched_vslice_add(struct cfs_r
 	return __sched_period(nr_running);
 }
 
+/*
+ * By its very nature we try to converge vruntime between tasks. This makes it
+ * very hard to interleave the groups that have varying latency requirements;
+ * they end up in a single 'lump'. Avoid this by introducing an artificial
+ * vruntime offset:
+ *
+ * A1 |--------------|
+ * A2      |--------------|
+ * A3           |--------------|
+ *
+ * New            |--------------|
+ *
+ * Because tasks have no stable (dense) enumeration within a group [*]
+ * and we'd want the tasks evenly spaced within the period in a regular
+ * fashion, we use an ascending iterator (nr_iter).
+ *
+ * This ensures that in a steady state, each task will have the same offset.
+ *
+ * However, when a new task gets inserted we cannot re-adjust all offsets,
+ * hence we will approximate by inserting the new task at p(1-1/n).
+ * This is why account_entity_enqueue() sets nr_iter to nr_running-1.
+ *
+ * [*] it would be possible to arrange for one, but it seems unnecessarily
+ *     complex, esp. as we still can't re-adjust all tasks on insert.
+ */
+static u64 sched_voffset(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	/*
+	 * approximate p/n with min_granularity
+	 */
+	u64 frac = sysctl_sched_min_granularity;
+
+	frac *= cfs_rq->nr_iter;
+	cfs_rq->nr_iter++;
+	if (cfs_rq->nr_iter >= cfs_rq->nr_running)
+		cfs_rq->nr_iter = 0;
+
+	return frac;
+}
+
 static inline unsigned long
 calc_delta_fair(unsigned long delta, struct sched_entity *se)
 {
@@ -564,6 +608,7 @@ static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_add(&cfs_rq->load, se->load.weight);
+	cfs_rq->nr_iter = cfs_rq->nr_running;
 	cfs_rq->nr_running++;
 	se->on_rq = 1;
 	list_add(&se->group_node, &cfs_rq->tasks);
@@ -574,6 +619,8 @@ account_entity_dequeue(struct cfs_rq *cf
 {
 	update_load_sub(&cfs_rq->load, se->load.weight);
 	cfs_rq->nr_running--;
+	if (cfs_rq->nr_iter >= cfs_rq->nr_running)
+		cfs_rq->nr_iter = 0;
 	se->on_rq = 0;
 	list_del_init(&se->group_node);
 }
@@ -656,7 +703,7 @@ place_entity(struct cfs_rq *cfs_rq, stru
 	 * stays open at the end.
 	 */
 	if (initial && sched_feat(START_DEBIT))
-		vruntime += sched_vslice_add(cfs_rq, se);
+		vruntime += sched_vslice(cfs_rq, se);
 
 	if (!initial) {
 		/* sleeps upto a single latency don't count. */
@@ -678,6 +725,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
 	 */
 	update_curr(cfs_rq);
 
+	account_entity_enqueue(cfs_rq, se);
+
 	if (wakeup) {
 		place_entity(cfs_rq, se, 0);
 		enqueue_sleeper(cfs_rq, se);
@@ -686,7 +735,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
 	update_stats_enqueue(cfs_rq, se);
 	check_spread(cfs_rq, se);
 	__enqueue_entity(cfs_rq, se);
-	account_entity_enqueue(cfs_rq, se);
 }
 
 static void
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -425,6 +425,8 @@ struct cfs_rq {
 	struct load_weight load;
 	unsigned long nr_running;
 
+	unsigned long nr_iter;
+
 	u64 exec_clock;
 	u64 min_vruntime;
 

--



* Re: [PATCH 0/7] Single RQ group scheduling
  2008-02-18  9:55 [PATCH 0/7] Single RQ group scheduling Peter Zijlstra
                   ` (6 preceding siblings ...)
  2008-02-18  9:55 ` [PATCH 7/7] sched: fair: vruntime spread Peter Zijlstra
@ 2008-02-19  6:10 ` Mike Galbraith
  7 siblings, 0 replies; 9+ messages in thread
From: Mike Galbraith @ 2008-02-19  6:10 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Srivatsa Vaddagiri, dhaval, tong.n.li, linux-kernel


On Mon, 2008-02-18 at 10:55 +0100, Peter Zijlstra wrote:
> This is my current queue for single RQ group scheduling.

I took these out for a brief maxcpus=1 spin yesterday, and noticed
something.  Running 4 copies of chew-max, one as a user, the context
switch rate was high (~1800/s).  I increased sched_min_granularity_ns to
see if that would lower it, and it did, but it also upset fairness.
With sched_min_granularity_ns bumped to half of sched_latency_ns (40ms
default), the skew was largish:

top - 06:58:46 up 3 min, 15 users,  load average: 4.71, 2.53, 1.04
Tasks: 214 total,   9 running, 204 sleeping,   0 stopped,   1 zombie
Cpu(s): 39.0%us, 61.0%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st

  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  P COMMAND
 5389 mikeg     20   0  1464  364  304 R 40.1  0.0   0:29.71 0 chew-max
 5373 root      20   0  1464  364  304 R 19.0  0.0   0:16.30 0 chew-max
 5388 root      20   0  1464  364  304 R 19.0  0.0   0:15.79 0 chew-max
 5374 root      20   0  1464  364  304 R 18.6  0.0   0:15.90 0 chew-max

	-Mike



