* [PATCH 0/7] Single RQ group scheduling
@ 2008-02-18 9:55 Peter Zijlstra
2008-02-18 9:55 ` [PATCH 1/7] sched: cleanup old and rarely used debug features Peter Zijlstra
` (7 more replies)
0 siblings, 8 replies; 9+ messages in thread
From: Peter Zijlstra @ 2008-02-18 9:55 UTC (permalink / raw)
To: Ingo Molnar, Mike Galbraith, Srivatsa Vaddagiri, dhaval,
tong.n.li
Cc: linux-kernel
This is my current queue for single RQ group scheduling.
Next on the list is the SMP load balancing; for that I'm looking at Tong's
DWRR algorithm.
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH 1/7] sched: cleanup old and rarely used debug features.
2008-02-18 9:55 [PATCH 0/7] Single RQ group scheduling Peter Zijlstra
@ 2008-02-18 9:55 ` Peter Zijlstra
2008-02-18 9:55 ` [PATCH 2/7] sched: fair-group scheduling vs latency Peter Zijlstra
` (6 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Peter Zijlstra @ 2008-02-18 9:55 UTC (permalink / raw)
To: Ingo Molnar, Mike Galbraith, Srivatsa Vaddagiri, dhaval,
tong.n.li
Cc: linux-kernel, Peter Zijlstra
[-- Attachment #1: sched-fair-clean-up.patch --]
[-- Type: text/plain, Size: 2105 bytes --]
TREE_AVG and APPROX_AVG are initial task placement policies that have been
disabled for a long while; time to remove them.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
kernel/sched.c | 8 ++------
kernel/sched_fair.c | 14 --------------
2 files changed, 2 insertions(+), 20 deletions(-)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -717,18 +717,14 @@ enum {
SCHED_FEAT_NEW_FAIR_SLEEPERS = 1,
SCHED_FEAT_WAKEUP_PREEMPT = 2,
SCHED_FEAT_START_DEBIT = 4,
- SCHED_FEAT_TREE_AVG = 8,
- SCHED_FEAT_APPROX_AVG = 16,
- SCHED_FEAT_HRTICK = 32,
- SCHED_FEAT_DOUBLE_TICK = 64,
+ SCHED_FEAT_HRTICK = 8,
+ SCHED_FEAT_DOUBLE_TICK = 16,
};
const_debug unsigned int sysctl_sched_features =
SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 |
SCHED_FEAT_WAKEUP_PREEMPT * 1 |
SCHED_FEAT_START_DEBIT * 1 |
- SCHED_FEAT_TREE_AVG * 0 |
- SCHED_FEAT_APPROX_AVG * 0 |
SCHED_FEAT_HRTICK * 1 |
SCHED_FEAT_DOUBLE_TICK * 0;
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -288,11 +288,6 @@ static u64 __sched_vslice(unsigned long
return vslice;
}
-static u64 sched_vslice(struct cfs_rq *cfs_rq)
-{
- return __sched_vslice(cfs_rq->load.weight, cfs_rq->nr_running);
-}
-
static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
return __sched_vslice(cfs_rq->load.weight + se->load.weight,
@@ -500,15 +495,6 @@ place_entity(struct cfs_rq *cfs_rq, stru
vruntime = cfs_rq->min_vruntime;
- if (sched_feat(TREE_AVG)) {
- struct sched_entity *last = __pick_last_entity(cfs_rq);
- if (last) {
- vruntime += last->vruntime;
- vruntime >>= 1;
- }
- } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running)
- vruntime += sched_vslice(cfs_rq)/2;
-
/*
* The 'current' period is already promised to the current tasks,
* however the extra weight of the new task will slow them down a
--
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH 2/7] sched: fair-group scheduling vs latency
2008-02-18 9:55 [PATCH 0/7] Single RQ group scheduling Peter Zijlstra
2008-02-18 9:55 ` [PATCH 1/7] sched: cleanup old and rarely used debug features Peter Zijlstra
@ 2008-02-18 9:55 ` Peter Zijlstra
2008-02-18 9:55 ` [PATCH 3/7] sched: fair-group: de-couple load-balancing from the rb-trees Peter Zijlstra
` (5 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Peter Zijlstra @ 2008-02-18 9:55 UTC (permalink / raw)
To: Ingo Molnar, Mike Galbraith, Srivatsa Vaddagiri, dhaval,
tong.n.li
Cc: linux-kernel, Peter Zijlstra
[-- Attachment #1: sched-fair-group-slice.patch --]
[-- Type: text/plain, Size: 7398 bytes --]
Currently FAIR_GROUP sched grows the scheduler latency outside of
sysctl_sched_latency; invert this so it stays within.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
kernel/sched_fair.c | 233 ++++++++++++++++++++++++++--------------------------
1 file changed, 120 insertions(+), 113 deletions(-)
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -87,6 +87,11 @@ const_debug unsigned int sysctl_sched_mi
* CFS operations on generic schedulable entities:
*/
+static inline struct task_struct *task_of(struct sched_entity *se)
+{
+ return container_of(se, struct task_struct, se);
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
/* cpu runqueue to which this cfs_rq is attached */
@@ -98,6 +103,56 @@ static inline struct rq *rq_of(struct cf
/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se) (!se->my_q)
+/* Walk up scheduling entities hierarchy */
+#define for_each_sched_entity(se) \
+ for (; se; se = se->parent)
+
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+{
+ return p->se.cfs_rq;
+}
+
+/* runqueue on which this entity is (to be) queued */
+static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+{
+ return se->cfs_rq;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+ return grp->my_q;
+}
+
+/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
+ * another cpu ('this_cpu')
+ */
+static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
+{
+ return cfs_rq->tg->cfs_rq[this_cpu];
+}
+
+/* Iterate thr' all leaf cfs_rq's on a runqueue */
+#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+ list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+
+/* Do the two (enqueued) entities belong to the same group ? */
+static inline int
+is_same_group(struct sched_entity *se, struct sched_entity *pse)
+{
+ if (se->cfs_rq == pse->cfs_rq)
+ return 1;
+
+ return 0;
+}
+
+static inline struct sched_entity *parent_entity(struct sched_entity *se)
+{
+ return se->parent;
+}
+
+#define GROUP_IMBALANCE_PCT 20
+
#else /* CONFIG_FAIR_GROUP_SCHED */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
@@ -107,13 +162,49 @@ static inline struct rq *rq_of(struct cf
#define entity_is_task(se) 1
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#define for_each_sched_entity(se) \
+ for (; se; se = NULL)
-static inline struct task_struct *task_of(struct sched_entity *se)
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
- return container_of(se, struct task_struct, se);
+ return &task_rq(p)->cfs;
+}
+
+static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+{
+ struct task_struct *p = task_of(se);
+ struct rq *rq = task_rq(p);
+
+ return &rq->cfs;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+ return NULL;
+}
+
+static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
+{
+ return &cpu_rq(this_cpu)->cfs;
+}
+
+#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+ for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+
+static inline int
+is_same_group(struct sched_entity *se, struct sched_entity *pse)
+{
+ return 1;
+}
+
+static inline struct sched_entity *parent_entity(struct sched_entity *se)
+{
+ return NULL;
}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
/**************************************************************
* Scheduling class tree data structure manipulation methods:
@@ -267,31 +358,44 @@ static u64 sched_slice(struct cfs_rq *cf
{
u64 slice = __sched_period(cfs_rq->nr_running);
- slice *= se->load.weight;
- do_div(slice, cfs_rq->load.weight);
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ slice *= se->load.weight;
+ do_div(slice, cfs_rq->load.weight);
+ }
return slice;
}
/*
- * We calculate the vruntime slice.
+ * We calculate the vruntime slice of a to be inserted task
*
* vs = s/w = p/rw
*/
-static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running)
+static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- u64 vslice = __sched_period(nr_running);
+ unsigned long nr_running = cfs_rq->nr_running;
+ unsigned long weight;
+ u64 vslice;
- vslice *= NICE_0_LOAD;
- do_div(vslice, rq_weight);
+ if (!se->on_rq)
+ nr_running++;
- return vslice;
-}
+ vslice = __sched_period(nr_running);
-static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- return __sched_vslice(cfs_rq->load.weight + se->load.weight,
- cfs_rq->nr_running + 1);
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ weight = cfs_rq->load.weight;
+ if (!se->on_rq)
+ weight += se->load.weight;
+
+ vslice *= NICE_0_LOAD;
+ do_div(vslice, weight);
+ }
+
+ return vslice;
}
/*
@@ -668,103 +772,6 @@ entity_tick(struct cfs_rq *cfs_rq, struc
* CFS operations on tasks:
*/
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-/* Walk up scheduling entities hierarchy */
-#define for_each_sched_entity(se) \
- for (; se; se = se->parent)
-
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
-{
- return p->se.cfs_rq;
-}
-
-/* runqueue on which this entity is (to be) queued */
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
-{
- return se->cfs_rq;
-}
-
-/* runqueue "owned" by this group */
-static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
-{
- return grp->my_q;
-}
-
-/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
- * another cpu ('this_cpu')
- */
-static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
-{
- return cfs_rq->tg->cfs_rq[this_cpu];
-}
-
-/* Iterate thr' all leaf cfs_rq's on a runqueue */
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
- list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
-
-/* Do the two (enqueued) entities belong to the same group ? */
-static inline int
-is_same_group(struct sched_entity *se, struct sched_entity *pse)
-{
- if (se->cfs_rq == pse->cfs_rq)
- return 1;
-
- return 0;
-}
-
-static inline struct sched_entity *parent_entity(struct sched_entity *se)
-{
- return se->parent;
-}
-
-#define GROUP_IMBALANCE_PCT 20
-
-#else /* CONFIG_FAIR_GROUP_SCHED */
-
-#define for_each_sched_entity(se) \
- for (; se; se = NULL)
-
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
-{
- return &task_rq(p)->cfs;
-}
-
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
-{
- struct task_struct *p = task_of(se);
- struct rq *rq = task_rq(p);
-
- return &rq->cfs;
-}
-
-/* runqueue "owned" by this group */
-static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
-{
- return NULL;
-}
-
-static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
-{
- return &cpu_rq(this_cpu)->cfs;
-}
-
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
- for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
-
-static inline int
-is_same_group(struct sched_entity *se, struct sched_entity *pse)
-{
- return 1;
-}
-
-static inline struct sched_entity *parent_entity(struct sched_entity *se)
-{
- return NULL;
-}
-
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-
#ifdef CONFIG_SCHED_HRTICK
static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
--
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH 3/7] sched: fair-group: de-couple load-balancing from the rb-trees
2008-02-18 9:55 [PATCH 0/7] Single RQ group scheduling Peter Zijlstra
2008-02-18 9:55 ` [PATCH 1/7] sched: cleanup old and rarely used debug features Peter Zijlstra
2008-02-18 9:55 ` [PATCH 2/7] sched: fair-group scheduling vs latency Peter Zijlstra
@ 2008-02-18 9:55 ` Peter Zijlstra
2008-02-18 9:55 ` [PATCH 4/7] sched: fair-group: single RQ approach Peter Zijlstra
` (4 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Peter Zijlstra @ 2008-02-18 9:55 UTC (permalink / raw)
To: Ingo Molnar, Mike Galbraith, Srivatsa Vaddagiri, dhaval,
tong.n.li
Cc: linux-kernel, Peter Zijlstra
[-- Attachment #1: sched-fair-group-smp.patch --]
[-- Type: text/plain, Size: 4286 bytes --]
De-couple load-balancing from the rb-trees, so that I can change their
organization. Afterwards we'll probably want to redo load balancing to
not need the grouping info.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/init_task.h | 3 +++
include/linux/sched.h | 1 +
kernel/sched.c | 12 ++++++++++--
kernel/sched_fair.c | 15 +++++++++------
4 files changed, 23 insertions(+), 8 deletions(-)
Index: linux-2.6/include/linux/init_task.h
===================================================================
--- linux-2.6.orig/include/linux/init_task.h
+++ linux-2.6/include/linux/init_task.h
@@ -132,6 +132,9 @@ extern struct group_info init_groups;
.cpus_allowed = CPU_MASK_ALL, \
.mm = NULL, \
.active_mm = &init_mm, \
+ .se = { \
+ .group_node = LIST_HEAD_INIT(tsk.se.group_node), \
+ }, \
.rt = { \
.run_list = LIST_HEAD_INIT(tsk.rt.run_list), \
.time_slice = HZ, \
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -909,6 +909,7 @@ struct load_weight {
struct sched_entity {
struct load_weight load; /* for load-balancing */
struct rb_node run_node;
+ struct list_head group_node;
unsigned int on_rq;
u64 exec_start;
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -437,8 +437,12 @@ struct cfs_rq {
struct rb_root tasks_timeline;
struct rb_node *rb_leftmost;
- struct rb_node *rb_load_balance_curr;
- /* 'curr' points to currently running entity on this cfs_rq.
+
+ struct list_head tasks;
+ struct list_head *balance_iterator;
+
+ /*
+ * 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
struct sched_entity *curr;
@@ -2025,6 +2029,7 @@ static void __sched_fork(struct task_str
INIT_LIST_HEAD(&p->rt.run_list);
p->se.on_rq = 0;
+ INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -7196,6 +7201,7 @@ int in_sched_functions(unsigned long add
static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
{
cfs_rq->tasks_timeline = RB_ROOT;
+ INIT_LIST_HEAD(&cfs_rq->tasks);
#ifdef CONFIG_FAIR_GROUP_SCHED
cfs_rq->rq = rq;
#endif
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -523,6 +523,7 @@ account_entity_enqueue(struct cfs_rq *cf
update_load_add(&cfs_rq->load, se->load.weight);
cfs_rq->nr_running++;
se->on_rq = 1;
+ list_add(&se->group_node, &cfs_rq->tasks);
}
static void
@@ -531,6 +532,7 @@ account_entity_dequeue(struct cfs_rq *cf
update_load_sub(&cfs_rq->load, se->load.weight);
cfs_rq->nr_running--;
se->on_rq = 0;
+ list_del_init(&se->group_node);
}
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -1156,16 +1158,17 @@ static void put_prev_task_fair(struct rq
* achieve that by always pre-iterating before returning
* the current task:
*/
+
static struct task_struct *
-__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr)
+__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
{
struct task_struct *p;
- if (!curr)
+ if (next == &cfs_rq->tasks)
return NULL;
- p = rb_entry(curr, struct task_struct, se.run_node);
- cfs_rq->rb_load_balance_curr = rb_next(curr);
+ p = list_entry(next, struct task_struct, se.group_node);
+ cfs_rq->balance_iterator = next->next;
return p;
}
@@ -1174,14 +1177,14 @@ static struct task_struct *load_balance_
{
struct cfs_rq *cfs_rq = arg;
- return __load_balance_iterator(cfs_rq, first_fair(cfs_rq));
+ return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next);
}
static struct task_struct *load_balance_next_fair(void *arg)
{
struct cfs_rq *cfs_rq = arg;
- return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
+ return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
}
static unsigned long
--
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH 4/7] sched: fair-group: single RQ approach
2008-02-18 9:55 [PATCH 0/7] Single RQ group scheduling Peter Zijlstra
` (2 preceding siblings ...)
2008-02-18 9:55 ` [PATCH 3/7] sched: fair-group: de-couple load-balancing from the rb-trees Peter Zijlstra
@ 2008-02-18 9:55 ` Peter Zijlstra
2008-02-18 9:55 ` [PATCH 5/7] sched: remove sysctl_sched_batch_wakeup_granularity Peter Zijlstra
` (3 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Peter Zijlstra @ 2008-02-18 9:55 UTC (permalink / raw)
To: Ingo Molnar, Mike Galbraith, Srivatsa Vaddagiri, dhaval,
tong.n.li
Cc: linux-kernel, Peter Zijlstra
[-- Attachment #1: sched-fair-group-single-rq.patch --]
[-- Type: text/plain, Size: 9659 bytes --]
The current hierarchical RQ group scheduler suffers from a number of problems:
- yield
- wakeup preemption
- sleeper fairness
All these problems are due to the isolation caused by the multiple RQ design.
They are caused by the fact that vruntime becomes a local property.
Solve this by returning to a single RQ model.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
kernel/sched.c | 9 --
kernel/sched_fair.c | 160 +++++++++++++++++++++++++++++++++-------------------
2 files changed, 105 insertions(+), 64 deletions(-)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -1197,6 +1197,9 @@ static void __resched_task(struct task_s
*/
#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
+/*
+ * delta *= weight / lw
+ */
static unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
struct load_weight *lw)
@@ -1219,12 +1222,6 @@ calc_delta_mine(unsigned long delta_exec
return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
}
-static inline unsigned long
-calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
-{
- return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
-}
-
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
lw->weight += inc;
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -238,12 +238,22 @@ static inline s64 entity_key(struct cfs_
*/
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
+ struct rb_node **link;
struct rb_node *parent = NULL;
struct sched_entity *entry;
- s64 key = entity_key(cfs_rq, se);
+ s64 key;
int leftmost = 1;
+ if (!entity_is_task(se))
+ return;
+
+ if (se == cfs_rq->curr)
+ return;
+
+ cfs_rq = &rq_of(cfs_rq)->cfs;
+
+ link = &cfs_rq->tasks_timeline.rb_node;
+ key = entity_key(cfs_rq, se);
/*
* Find the right place in the rbtree:
*/
@@ -275,6 +285,14 @@ static void __enqueue_entity(struct cfs_
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ if (!entity_is_task(se))
+ return;
+
+ if (se == cfs_rq->curr)
+ return;
+
+ cfs_rq = &rq_of(cfs_rq)->cfs;
+
if (cfs_rq->rb_leftmost == &se->run_node)
cfs_rq->rb_leftmost = rb_next(&se->run_node);
@@ -358,6 +376,13 @@ static u64 sched_slice(struct cfs_rq *cf
{
u64 slice = __sched_period(cfs_rq->nr_running);
+ /*
+ * FIXME: curious 'hack' to make it boot - when the tick is started we
+ * hit this with the init_task, which is not actually enqueued.
+ */
+ if (unlikely(rq_of(cfs_rq)->load.weight <= se->load.weight))
+ goto out;
+
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
@@ -365,37 +390,66 @@ static u64 sched_slice(struct cfs_rq *cf
do_div(slice, cfs_rq->load.weight);
}
+out:
return slice;
}
/*
* We calculate the vruntime slice of a to be inserted task
*
- * vs = s/w = p/rw
+ * vs = s*rw/w = p
*/
static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
unsigned long nr_running = cfs_rq->nr_running;
- unsigned long weight;
- u64 vslice;
if (!se->on_rq)
nr_running++;
- vslice = __sched_period(nr_running);
+ return __sched_period(nr_running);
+}
+static inline unsigned long
+calc_delta_fair(unsigned long delta, struct sched_entity *se)
+{
for_each_sched_entity(se) {
- cfs_rq = cfs_rq_of(se);
+ delta = calc_delta_mine(delta,
+ cfs_rq_of(se)->load.weight, &se->load);
+ }
- weight = cfs_rq->load.weight;
- if (!se->on_rq)
- weight += se->load.weight;
+ return delta;
+}
- vslice *= NICE_0_LOAD;
- do_div(vslice, weight);
+/*
+ * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
+ * that it favours >=0 over <0.
+ *
+ * -20 |
+ * |
+ * 0 --------+-------
+ * .'
+ * 19 .'
+ *
+ */
+static unsigned long
+calc_delta_asym(unsigned long delta, struct sched_entity *se)
+{
+ struct load_weight lw = {
+ .weight = NICE_0_LOAD,
+ .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
+ };
+
+ for_each_sched_entity(se) {
+ struct load_weight *se_lw = &se->load;
+
+ if (se->load.weight < NICE_0_LOAD)
+ se_lw = &lw;
+
+ delta = calc_delta_mine(delta,
+ cfs_rq_of(se)->load.weight, se_lw);
}
- return vslice;
+ return delta;
}
/*
@@ -413,13 +467,14 @@ __update_curr(struct cfs_rq *cfs_rq, str
curr->sum_exec_runtime += delta_exec;
schedstat_add(cfs_rq, exec_clock, delta_exec);
- delta_exec_weighted = delta_exec;
- if (unlikely(curr->load.weight != NICE_0_LOAD)) {
- delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
- &curr->load);
- }
+ delta_exec_weighted = calc_delta_fair(delta_exec, curr);
curr->vruntime += delta_exec_weighted;
+ if (!entity_is_task(curr))
+ return;
+
+ cfs_rq = &rq_of(cfs_rq)->cfs;
+
/*
* maintain cfs_rq->min_vruntime to be a monotonic increasing
* value tracking the leftmost vruntime in the tree.
@@ -599,6 +654,11 @@ place_entity(struct cfs_rq *cfs_rq, stru
{
u64 vruntime;
+ if (!entity_is_task(se))
+ return;
+
+ cfs_rq = &rq_of(cfs_rq)->cfs;
+
vruntime = cfs_rq->min_vruntime;
/*
@@ -613,7 +673,7 @@ place_entity(struct cfs_rq *cfs_rq, stru
if (!initial) {
/* sleeps upto a single latency don't count. */
if (sched_feat(NEW_FAIR_SLEEPERS))
- vruntime -= sysctl_sched_latency;
+ vruntime -= calc_delta_asym(sysctl_sched_latency, se);
/* ensure we never gain time by being placed backwards. */
vruntime = max_vruntime(se->vruntime, vruntime);
@@ -637,8 +697,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
update_stats_enqueue(cfs_rq, se);
check_spread(cfs_rq, se);
- if (se != cfs_rq->curr)
- __enqueue_entity(cfs_rq, se);
+ __enqueue_entity(cfs_rq, se);
account_entity_enqueue(cfs_rq, se);
}
@@ -664,8 +723,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
#endif
}
- if (se != cfs_rq->curr)
- __dequeue_entity(cfs_rq, se);
+ __dequeue_entity(cfs_rq, se);
account_entity_dequeue(cfs_rq, se);
}
@@ -694,6 +752,8 @@ set_next_entity(struct cfs_rq *cfs_rq, s
* runqueue.
*/
update_stats_wait_end(cfs_rq, se);
+ if (WARN_ON_ONCE(cfs_rq->curr))
+ cfs_rq->curr = NULL;
__dequeue_entity(cfs_rq, se);
}
@@ -713,18 +773,6 @@ set_next_entity(struct cfs_rq *cfs_rq, s
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
-static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
-{
- struct sched_entity *se = NULL;
-
- if (first_fair(cfs_rq)) {
- se = __pick_next_entity(cfs_rq);
- set_next_entity(cfs_rq, se);
- }
-
- return se;
-}
-
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
/*
@@ -735,12 +783,12 @@ static void put_prev_entity(struct cfs_r
update_curr(cfs_rq);
check_spread(cfs_rq, prev);
+ cfs_rq->curr = NULL;
if (prev->on_rq) {
update_stats_wait_start(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
}
- cfs_rq->curr = NULL;
}
static void
@@ -751,6 +799,9 @@ entity_tick(struct cfs_rq *cfs_rq, struc
*/
update_curr(cfs_rq);
+ if (!entity_is_task(curr))
+ return;
+
#ifdef CONFIG_SCHED_HRTICK
/*
* queued ticks are scheduled to match the slice, so don't bother
@@ -766,7 +817,8 @@ entity_tick(struct cfs_rq *cfs_rq, struc
return;
#endif
- if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
+ if (rq_of(cfs_rq)->load.weight != curr->load.weight ||
+ !sched_feat(WAKEUP_PREEMPT))
check_preempt_tick(cfs_rq, curr);
}
@@ -891,7 +943,7 @@ static void yield_task_fair(struct rq *r
/*
* Are we the only task in the tree?
*/
- if (unlikely(cfs_rq->nr_running == 1))
+ if (unlikely(rq->load.weight == curr->se.load.weight))
return;
if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
@@ -906,7 +958,7 @@ static void yield_task_fair(struct rq *r
/*
* Find the rightmost entry in the rbtree:
*/
- rightmost = __pick_last_entity(cfs_rq);
+ rightmost = __pick_last_entity(&rq->cfs);
/*
* Already in the rightmost position?
*/
@@ -1068,7 +1120,6 @@ out_set_cpu:
}
#endif /* CONFIG_SMP */
-
/*
* Preempt the current task with a newly woken task if needed:
*/
@@ -1095,19 +1146,11 @@ static void check_preempt_wakeup(struct
if (!sched_feat(WAKEUP_PREEMPT))
return;
- while (!is_same_group(se, pse)) {
- se = parent_entity(se);
- pse = parent_entity(pse);
- }
-
- gran = sysctl_sched_wakeup_granularity;
/*
- * More easily preempt - nice tasks, while not making
- * it harder for + nice tasks.
+ * More easily preempt - nice tasks, while not making it harder for
+ * + nice tasks.
*/
- if (unlikely(se->load.weight > NICE_0_LOAD))
- gran = calc_delta_fair(gran, &se->load);
-
+ gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
if (pse->vruntime + gran < se->vruntime)
resched_task(curr);
}
@@ -1116,17 +1159,18 @@ static struct task_struct *pick_next_tas
{
struct task_struct *p;
struct cfs_rq *cfs_rq = &rq->cfs;
- struct sched_entity *se;
+ struct sched_entity *se, *next;
- if (unlikely(!cfs_rq->nr_running))
+ if (!first_fair(cfs_rq))
return NULL;
- do {
- se = pick_next_entity(cfs_rq);
- cfs_rq = group_cfs_rq(se);
- } while (cfs_rq);
+ next = se = __pick_next_entity(cfs_rq);
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ set_next_entity(cfs_rq, se);
+ }
- p = task_of(se);
+ p = task_of(next);
hrtick_start_fair(rq, p);
return p;
--
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH 5/7] sched: remove sysctl_sched_batch_wakeup_granularity
2008-02-18 9:55 [PATCH 0/7] Single RQ group scheduling Peter Zijlstra
` (3 preceding siblings ...)
2008-02-18 9:55 ` [PATCH 4/7] sched: fair-group: single RQ approach Peter Zijlstra
@ 2008-02-18 9:55 ` Peter Zijlstra
2008-02-18 9:55 ` [PATCH 6/7] sched: fair: optimize sched_slice() Peter Zijlstra
` (2 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Peter Zijlstra @ 2008-02-18 9:55 UTC (permalink / raw)
To: Ingo Molnar, Mike Galbraith, Srivatsa Vaddagiri, dhaval,
tong.n.li
Cc: linux-kernel, Peter Zijlstra
[-- Attachment #1: sched-fair-remove-batch-wakeup.patch --]
[-- Type: text/plain, Size: 3256 bytes --]
It's unused — toss it!
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/sched.h | 1 -
kernel/sched.c | 1 -
kernel/sched_debug.c | 1 -
kernel/sched_fair.c | 10 ----------
kernel/sysctl.c | 11 -----------
5 files changed, 24 deletions(-)
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -1526,7 +1526,6 @@ extern void sched_idle_next(void);
extern unsigned int sysctl_sched_latency;
extern unsigned int sysctl_sched_min_granularity;
extern unsigned int sysctl_sched_wakeup_granularity;
-extern unsigned int sysctl_sched_batch_wakeup_granularity;
extern unsigned int sysctl_sched_child_runs_first;
extern unsigned int sysctl_sched_features;
extern unsigned int sysctl_sched_migration_cost;
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -62,16 +62,6 @@ const_debug unsigned int sysctl_sched_ch
unsigned int __read_mostly sysctl_sched_compat_yield;
/*
- * SCHED_BATCH wake-up granularity.
- * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
- *
- * This option delays the preemption effects of decoupled workloads
- * and reduces their over-scheduling. Synchronous workloads will still
- * have immediate wakeup/sleep latencies.
- */
-unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;
-
-/*
* SCHED_OTHER wake-up granularity.
* (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
*
Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -270,17 +270,6 @@ static struct ctl_table kern_table[] = {
},
{
.ctl_name = CTL_UNNUMBERED,
- .procname = "sched_batch_wakeup_granularity_ns",
- .data = &sysctl_sched_batch_wakeup_granularity,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .strategy = &sysctl_intvec,
- .extra1 = &min_wakeup_granularity_ns,
- .extra2 = &max_wakeup_granularity_ns,
- },
- {
- .ctl_name = CTL_UNNUMBERED,
.procname = "sched_child_runs_first",
.data = &sysctl_sched_child_runs_first,
.maxlen = sizeof(unsigned int),
Index: linux-2.6/kernel/sched_debug.c
===================================================================
--- linux-2.6.orig/kernel/sched_debug.c
+++ linux-2.6/kernel/sched_debug.c
@@ -214,7 +214,6 @@ static int sched_debug_show(struct seq_f
PN(sysctl_sched_latency);
PN(sysctl_sched_min_granularity);
PN(sysctl_sched_wakeup_granularity);
- PN(sysctl_sched_batch_wakeup_granularity);
PN(sysctl_sched_child_runs_first);
P(sysctl_sched_features);
#undef PN
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -5381,7 +5381,6 @@ static inline void sched_init_granularit
sysctl_sched_latency = limit;
sysctl_sched_wakeup_granularity *= factor;
- sysctl_sched_batch_wakeup_granularity *= factor;
}
#ifdef CONFIG_SMP
--
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH 6/7] sched: fair: optimize sched_slice()
2008-02-18 9:55 [PATCH 0/7] Single RQ group scheduling Peter Zijlstra
` (4 preceding siblings ...)
2008-02-18 9:55 ` [PATCH 5/7] sched: remove sysctl_sched_batch_wakeup_granularity Peter Zijlstra
@ 2008-02-18 9:55 ` Peter Zijlstra
2008-02-18 9:55 ` [PATCH 7/7] sched: fair: vruntime spread Peter Zijlstra
2008-02-19 6:10 ` [PATCH 0/7] Single RQ group scheduling Mike Galbraith
7 siblings, 0 replies; 9+ messages in thread
From: Peter Zijlstra @ 2008-02-18 9:55 UTC (permalink / raw)
To: Ingo Molnar, Mike Galbraith, Srivatsa Vaddagiri, dhaval,
tong.n.li
Cc: linux-kernel, Peter Zijlstra
[-- Attachment #1: sched-opt-sched_slice.patch --]
[-- Type: text/plain, Size: 1824 bytes --]
Cache the divide in the hope we can avoid doing it in steady-state operation.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
kernel/sched.c | 2 ++
kernel/sched_debug.c | 1 +
kernel/sched_fair.c | 6 ++----
3 files changed, 5 insertions(+), 4 deletions(-)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -1225,11 +1225,13 @@ calc_delta_mine(unsigned long delta_exec
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
lw->weight += inc;
+ lw->inv_weight = 0;
}
static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
{
lw->weight -= dec;
+ lw->inv_weight = 0;
}
/*
Index: linux-2.6/kernel/sched_debug.c
===================================================================
--- linux-2.6.orig/kernel/sched_debug.c
+++ linux-2.6/kernel/sched_debug.c
@@ -137,6 +137,7 @@ void print_cfs_rq(struct seq_file *m, in
SPLIT_NS(spread0));
SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
+ SEQ_printf(m, " .%-30s: %ld\n", "load^-1", cfs_rq->load.inv_weight);
#ifdef CONFIG_SCHEDSTATS
SEQ_printf(m, " .%-30s: %d\n", "bkl_count",
rq->bkl_count);
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -374,10 +374,8 @@ static u64 sched_slice(struct cfs_rq *cf
goto out;
for_each_sched_entity(se) {
- cfs_rq = cfs_rq_of(se);
-
- slice *= se->load.weight;
- do_div(slice, cfs_rq->load.weight);
+ slice = calc_delta_mine(slice,
+ se->load.weight, &cfs_rq_of(se)->load);
}
out:
--
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH 7/7] sched: fair: vruntime spread
2008-02-18 9:55 [PATCH 0/7] Single RQ group scheduling Peter Zijlstra
` (5 preceding siblings ...)
2008-02-18 9:55 ` [PATCH 6/7] sched: fair: optimize sched_slice() Peter Zijlstra
@ 2008-02-18 9:55 ` Peter Zijlstra
2008-02-19 6:10 ` [PATCH 0/7] Single RQ group scheduling Mike Galbraith
7 siblings, 0 replies; 9+ messages in thread
From: Peter Zijlstra @ 2008-02-18 9:55 UTC (permalink / raw)
To: Ingo Molnar, Mike Galbraith, Srivatsa Vaddagiri, dhaval,
tong.n.li
Cc: linux-kernel, Peter Zijlstra
[-- Attachment #1: sched-fair-spread.patch --]
[-- Type: text/plain, Size: 5635 bytes --]
By its very nature we try to converge vruntime between tasks. This makes it
very hard to interleave the groups that have varying latency requirements,
they end up in a single 'lump'. Avoid this by introducing an artificial
vruntime offset:
A1 |--------------|
A2 |--------------|
A3 |--------------|
New |--------------|
Because tasks have no stable (dense) enumeration within a group
and we'd want the tasks evenly spaced within the period in a regular
fashion, we use an ascending iterator (nr_iter).
This ensures that in a steady state, each task will have the same offset.
However, when a new task gets inserted we cannot re-adjust all offsets,
hence we will approximate by inserting the new task at p(1-1/n).
This is why account_entity_enqueue() sets nr_iter to nr_running-1.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/sched.h | 1
kernel/sched.c | 2 +
kernel/sched_fair.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++----
3 files changed, 55 insertions(+), 4 deletions(-)
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -923,6 +923,7 @@ struct sched_entity {
u64 exec_start;
u64 sum_exec_runtime;
u64 vruntime;
+ u64 voffset;
u64 prev_sum_exec_runtime;
#ifdef CONFIG_SCHEDSTATS
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -220,9 +220,11 @@ static inline u64 min_vruntime(u64 min_v
static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- return se->vruntime - cfs_rq->min_vruntime;
+ return se->vruntime + se->voffset - cfs_rq->min_vruntime;
}
+static u64 sched_voffset(struct cfs_rq *cfs_rq, struct sched_entity *se);
+
/*
* Enqueue an entity into the rb-tree:
*/
@@ -240,6 +242,8 @@ static void __enqueue_entity(struct cfs_
if (se == cfs_rq->curr)
return;
+ se->voffset = sched_voffset(cfs_rq, se);
+
cfs_rq = &rq_of(cfs_rq)->cfs;
link = &cfs_rq->tasks_timeline.rb_node;
@@ -387,7 +391,7 @@ out:
*
* vs = s*rw/w = p
*/
-static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static inline u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
unsigned long nr_running = cfs_rq->nr_running;
@@ -397,6 +401,46 @@ static u64 sched_vslice_add(struct cfs_r
return __sched_period(nr_running);
}
+/*
+ * By its very nature we try to converge vruntime between tasks. This makes it
+ * very hard to interleave the groups that have varying latency requirements,
+ * they end up in a single 'lump'. Avoid this by introducing an artificial
+ * vruntime offset:
+ *
+ * A1 |--------------|
+ * A2 |--------------|
+ * A3 |--------------|
+ *
+ * New |--------------|
+ *
+ * Because tasks have no stable (dense) enumeration within a group [*]
+ * and we'd want the tasks evenly spaced within the period in a regular
+ * fashion, we use an ascending iterator (nr_iter).
+ *
+ * This ensures that in a steady state, each task will have the same offset
+ *
+ * However, when a new task gets inserted we cannot re-adjust all offsets,
+ * hence we will approximate by inserting the new task at p(1-1/n).
+ * This is why account_entity_enqueue() sets nr_iter to nr_running-1.
+ *
+ * [*] it would be possible to arrange for one, but it seems unnecessarily
+ * complex, esp. as we still can't re-adjust all tasks on insert.
+ */
+static u64 sched_voffset(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ /*
+ * approximate p/n with min_granularity
+ */
+ u64 frac = sysctl_sched_min_granularity;
+
+ frac *= cfs_rq->nr_iter;
+ cfs_rq->nr_iter++;
+ if (cfs_rq->nr_iter >= cfs_rq->nr_running)
+ cfs_rq->nr_iter = 0;
+
+ return frac;
+}
+
static inline unsigned long
calc_delta_fair(unsigned long delta, struct sched_entity *se)
{
@@ -564,6 +608,7 @@ static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_add(&cfs_rq->load, se->load.weight);
+ cfs_rq->nr_iter = cfs_rq->nr_running;
cfs_rq->nr_running++;
se->on_rq = 1;
list_add(&se->group_node, &cfs_rq->tasks);
@@ -574,6 +619,8 @@ account_entity_dequeue(struct cfs_rq *cf
{
update_load_sub(&cfs_rq->load, se->load.weight);
cfs_rq->nr_running--;
+ if (cfs_rq->nr_iter >= cfs_rq->nr_running)
+ cfs_rq->nr_iter = 0;
se->on_rq = 0;
list_del_init(&se->group_node);
}
@@ -656,7 +703,7 @@ place_entity(struct cfs_rq *cfs_rq, stru
* stays open at the end.
*/
if (initial && sched_feat(START_DEBIT))
- vruntime += sched_vslice_add(cfs_rq, se);
+ vruntime += sched_vslice(cfs_rq, se);
if (!initial) {
/* sleeps upto a single latency don't count. */
@@ -678,6 +725,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
*/
update_curr(cfs_rq);
+ account_entity_enqueue(cfs_rq, se);
+
if (wakeup) {
place_entity(cfs_rq, se, 0);
enqueue_sleeper(cfs_rq, se);
@@ -686,7 +735,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
update_stats_enqueue(cfs_rq, se);
check_spread(cfs_rq, se);
__enqueue_entity(cfs_rq, se);
- account_entity_enqueue(cfs_rq, se);
}
static void
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -425,6 +425,8 @@ struct cfs_rq {
struct load_weight load;
unsigned long nr_running;
+ unsigned long nr_iter;
+
u64 exec_clock;
u64 min_vruntime;
--
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 0/7] Single RQ group scheduling
2008-02-18 9:55 [PATCH 0/7] Single RQ group scheduling Peter Zijlstra
` (6 preceding siblings ...)
2008-02-18 9:55 ` [PATCH 7/7] sched: fair: vruntime spread Peter Zijlstra
@ 2008-02-19 6:10 ` Mike Galbraith
7 siblings, 0 replies; 9+ messages in thread
From: Mike Galbraith @ 2008-02-19 6:10 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Ingo Molnar, Srivatsa Vaddagiri, dhaval, tong.n.li, linux-kernel
On Mon, 2008-02-18 at 10:55 +0100, Peter Zijlstra wrote:
> This is my current queue for single RQ group scheduling.
I took these out for a brief maxcpus=1 spin yesterday, and noticed
something. Running 4 copies of chew-max, one as a user, the context
switch rate was high (~1800/s). I increased sched_min_granularity_ns to
see if that would lower it, and it did, but it also upset fairness.
With sched_min_granularity_ns bumped to half of sched_latency_ns (40ms
default) it was a largish skew.
top - 06:58:46 up 3 min, 15 users, load average: 4.71, 2.53, 1.04
Tasks: 214 total, 9 running, 204 sleeping, 0 stopped, 1 zombie
Cpu(s): 39.0%us, 61.0%sy, 0.0%ni, 0.0%id, 0.0%wa, 0.0%hi, 0.0%si, 0.0%st
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ P COMMAND
5389 mikeg 20 0 1464 364 304 R 40.1 0.0 0:29.71 0 chew-max
5373 root 20 0 1464 364 304 R 19.0 0.0 0:16.30 0 chew-max
5388 root 20 0 1464 364 304 R 19.0 0.0 0:15.79 0 chew-max
5374 root 20 0 1464 364 304 R 18.6 0.0 0:15.90 0 chew-max
-Mike
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2008-02-19 6:10 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-02-18 9:55 [PATCH 0/7] Single RQ group scheduling Peter Zijlstra
2008-02-18 9:55 ` [PATCH 1/7] sched: cleanup old and rarely used debug features Peter Zijlstra
2008-02-18 9:55 ` [PATCH 2/7] sched: fair-group scheduling vs latency Peter Zijlstra
2008-02-18 9:55 ` [PATCH 3/7] sched: fair-group: de-couple load-balancing from the rb-trees Peter Zijlstra
2008-02-18 9:55 ` [PATCH 4/7] sched: fair-group: single RQ approach Peter Zijlstra
2008-02-18 9:55 ` [PATCH 5/7] sched: remove sysctl_sched_batch_wakeup_granularity Peter Zijlstra
2008-02-18 9:55 ` [PATCH 6/7] sched: fair: optimize sched_slice() Peter Zijlstra
2008-02-18 9:55 ` [PATCH 7/7] sched: fair: vruntime spread Peter Zijlstra
2008-02-19 6:10 ` [PATCH 0/7] Single RQ group scheduling Mike Galbraith
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox