public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
To: Ingo Molnar <mingo@elte.hu>, Mike Galbraith <efault@gmx.de>,
	Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>,
	dhaval@linux.vnet.ibm.com, tong.n.li@intel.com
Cc: linux-kernel@vger.kernel.org, Peter Zijlstra <a.p.zijlstra@chello.nl>
Subject: [PATCH 7/7] sched: fair: vruntime spread
Date: Mon, 18 Feb 2008 10:55:42 +0100	[thread overview]
Message-ID: <20080218095625.836989000@chello.nl> (raw)
In-Reply-To: 20080218095535.629736000@chello.nl

[-- Attachment #1: sched-fair-spread.patch --]
[-- Type: text/plain, Size: 5635 bytes --]

By its very nature we try to converge vruntime between tasks. This makes it
very hard to interleave groups that have varying latency requirements;
they end up in a single 'lump'. Avoid this by introducing an artificial
vruntime offset:

A1 |--------------|
A2      |--------------|
A3           |--------------|

New            |--------------|

Because tasks have no stable (dense) enumeration within a group
and we'd want the tasks evenly spaced within the period in a regular
fashion, we use an ascending iterator (nr_iter).

This ensures that in a steady state, each task will have the same offset.

However, when a new task gets inserted we cannot re-adjust all offsets,
hence we will approximate by inserting the new task at p(1-1/n).
This is why account_entity_enqueue() sets nr_iter to nr_running-1.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/sched.h |    1 
 kernel/sched.c        |    2 +
 kernel/sched_fair.c   |   56 ++++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 55 insertions(+), 4 deletions(-)

Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -923,6 +923,7 @@ struct sched_entity {
 	u64			exec_start;
 	u64			sum_exec_runtime;
 	u64			vruntime;
+	u64			voffset;
 	u64			prev_sum_exec_runtime;
 
 #ifdef CONFIG_SCHEDSTATS
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -220,9 +220,11 @@ static inline u64 min_vruntime(u64 min_v
 
 static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	return se->vruntime - cfs_rq->min_vruntime;
+	return se->vruntime + se->voffset - cfs_rq->min_vruntime;
 }
 
+static u64 sched_voffset(struct cfs_rq *cfs_rq, struct sched_entity *se);
+
 /*
  * Enqueue an entity into the rb-tree:
  */
@@ -240,6 +242,8 @@ static void __enqueue_entity(struct cfs_
 	if (se == cfs_rq->curr)
 		return;
 
+	se->voffset = sched_voffset(cfs_rq, se);
+
 	cfs_rq = &rq_of(cfs_rq)->cfs;
 
 	link = &cfs_rq->tasks_timeline.rb_node;
@@ -387,7 +391,7 @@ out:
  *
  * vs = s*rw/w = p
  */
-static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static inline u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	unsigned long nr_running = cfs_rq->nr_running;
 
@@ -397,6 +401,46 @@ static u64 sched_vslice_add(struct cfs_r
 	return __sched_period(nr_running);
 }
 
+/*
+ * By its very nature we try to converge vruntime between tasks. This makes it
+ * very hard to interleave the groups that have varying latency requirements,
+ * they end up in a single 'lump'. Avoid this by introducing an artificial
+ * vruntime offset:
+ *
+ * A1 |--------------|
+ * A2      |--------------|
+ * A3           |--------------|
+ *
+ * New            |--------------|
+ *
+ * Because tasks have no stable (dense) enumeration within a group [*]
+ * and we'd want the tasks evenly spaced within the period in a regular
+ * fashion, we use an ascending iterator (nr_iter).
+ *
+ * This ensures that in a steady state, each task will have the same offset
+ *
+ * However, when a new task gets inserted we cannot re-adjust all offsets,
+ * hence we will approximate by inserting the new task at p(1-1/n).
+ * This is why account_entity_enqueue() sets nr_iter to nr_running-1.
+ *
+ * [*] it would be possible to arrange for one, but it seems unnecessarily
+ *     complex, esp. as we still can't re-adjust all tasks on insert.
+ */
+static u64 sched_voffset(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	/*
+	 * approximate p/n with min_granularity
+	 */
+	u64 frac = sysctl_sched_min_granularity;
+
+	frac *= cfs_rq->nr_iter;
+	cfs_rq->nr_iter++;
+	if (cfs_rq->nr_iter >= cfs_rq->nr_running)
+		cfs_rq->nr_iter = 0;
+
+	return frac;
+}
+
 static inline unsigned long
 calc_delta_fair(unsigned long delta, struct sched_entity *se)
 {
@@ -564,6 +608,7 @@ static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_add(&cfs_rq->load, se->load.weight);
+	cfs_rq->nr_iter = cfs_rq->nr_running;
 	cfs_rq->nr_running++;
 	se->on_rq = 1;
 	list_add(&se->group_node, &cfs_rq->tasks);
@@ -574,6 +619,8 @@ account_entity_dequeue(struct cfs_rq *cf
 {
 	update_load_sub(&cfs_rq->load, se->load.weight);
 	cfs_rq->nr_running--;
+	if (cfs_rq->nr_iter >= cfs_rq->nr_running)
+		cfs_rq->nr_iter = 0;
 	se->on_rq = 0;
 	list_del_init(&se->group_node);
 }
@@ -656,7 +703,7 @@ place_entity(struct cfs_rq *cfs_rq, stru
 	 * stays open at the end.
 	 */
 	if (initial && sched_feat(START_DEBIT))
-		vruntime += sched_vslice_add(cfs_rq, se);
+		vruntime += sched_vslice(cfs_rq, se);
 
 	if (!initial) {
 		/* sleeps upto a single latency don't count. */
@@ -678,6 +725,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
 	 */
 	update_curr(cfs_rq);
 
+	account_entity_enqueue(cfs_rq, se);
+
 	if (wakeup) {
 		place_entity(cfs_rq, se, 0);
 		enqueue_sleeper(cfs_rq, se);
@@ -686,7 +735,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
 	update_stats_enqueue(cfs_rq, se);
 	check_spread(cfs_rq, se);
 	__enqueue_entity(cfs_rq, se);
-	account_entity_enqueue(cfs_rq, se);
 }
 
 static void
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -425,6 +425,8 @@ struct cfs_rq {
 	struct load_weight load;
 	unsigned long nr_running;
 
+	unsigned long nr_iter;
+
 	u64 exec_clock;
 	u64 min_vruntime;
 

--


  parent reply	other threads:[~2008-02-18 10:00 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-02-18  9:55 [PATCH 0/7] Single RQ group scheduling Peter Zijlstra
2008-02-18  9:55 ` [PATCH 1/7] sched: cleanup old and rarely used debug features Peter Zijlstra
2008-02-18  9:55 ` [PATCH 2/7] sched: fair-group scheduling vs latency Peter Zijlstra
2008-02-18  9:55 ` [PATCH 3/7] sched: fair-group: de-couple load-balancing from the rb-trees Peter Zijlstra
2008-02-18  9:55 ` [PATCH 4/7] sched: fair-group: single RQ approach Peter Zijlstra
2008-02-18  9:55 ` [PATCH 5/7] sched: remove sysctl_sched_batch_wakeup_granularity Peter Zijlstra
2008-02-18  9:55 ` [PATCH 6/7] sched: fair: optimize sched_slice() Peter Zijlstra
2008-02-18  9:55 ` Peter Zijlstra [this message]
2008-02-19  6:10 ` [PATCH 0/7] Single RQ group scheduling Mike Galbraith

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20080218095625.836989000@chello.nl \
    --to=a.p.zijlstra@chello.nl \
    --cc=dhaval@linux.vnet.ibm.com \
    --cc=efault@gmx.de \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=tong.n.li@intel.com \
    --cc=vatsa@linux.vnet.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox