Re: [PATCH 6/8] sched: avg_vruntime

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Peter Zijlstra <a.p.zijlstra@chello.nl>
To: linux-kernel@vger.kernel.org
Cc: mingo@elte.hu, efault@gmx.de, vatsa@in.ibm.com,
	fabio <fabio@gandalf.sssup.it>
Subject: Re: [PATCH 6/8] sched: avg_vruntime
Date: Wed, 29 Oct 2008 16:48:34 +0100	[thread overview]
Message-ID: <1225295314.9315.10.camel@lappy.programming.kicks-ass.net> (raw)
In-Reply-To: <20081024091109.832347739@chello.nl>

On Fri, 2008-10-24 at 11:06 +0200, Peter Zijlstra wrote:
> plain text document attachment (sched-avg-vruntime.patch)
> Renicing requires scaling the lag. Therefore we need a way to compute it.
> Lag is defined as the difference between the service time received from the
> ideal model and the actual scheduler.
> 
> The defining property of a fair scheduler is that the sum of all lags is zero;
> which can be seen to be trivially true for the ideal case, as all lags are zero.
> 
> Therefore, the average of all virtual runtimes will be the point of zero lag.
> 
> We cannot prove fairness for CFS due to sleeper fairness (without it we can).
> However since we can observe it does converge to fairness in stable operation,
> we can say the zero lag point converges to the average.
> 
> We can't just take the average of vruntime - as it will use the full range
> of its u64 and will wrap around. Instead we'll use the average of
> (vruntime - min_vruntime)
> 
> \Sum_{i}^{n} 1/n (v_{i} - v) = 1/n ((\Sum_{i}^{n} v_{i}) - vn)
> 
> By factoring out the 1/n (never storing that) we avoid rounding, which
> would bring an accumulating error.

Hi Fabio,

you were right, this is wrong.

How about this..

The fluid model would, for each task t_i, generate an execution time e_i

  de_i = w_i / w_sum * dt

However, any real scheduler will be imperfect and have an error eps_i

  dE_i = de_i + eps_i,

But since only dt of actual time has passed, we can state that

  \Sum_i dE_i = dt, therefore  \Sum_i eps_i = 0.

This will be reflected in a virtual runtime skew of

  dv_i = eps_i / w_i

If we now wish to obtain the zero lag point, the point where all tasks would
be in the fluid model, we get

  eps_i = dv_i * w_i, which yields: \Sum dv_i * w_i = 0

IOW avg(v_i*w_i) = v_fluid

1/n \Sum_i v_i*w_i, [v_i -> v_i-x] ->
1/n \Sum_i (v_i-x)*w_i =
1/n (\Sum_i v_i*w_i - \Sum_i x*w_i) =
1/n (\Sum_i v_i*w_i - x \Sum_i w_i)

which in turn would yield a patch like below..

I'll also try and quantify the error and effect of using min_vruntime as
zero lag point as Ingo suggested.

---
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c	2008-10-29 16:43:16.000000000 +0100
+++ linux-2.6/kernel/sched.c	2008-10-29 16:43:27.000000000 +0100
@@ -384,6 +384,10 @@ struct cfs_rq {
 	struct load_weight load;
 	unsigned long nr_running;
 
+	long nr_queued;
+	long avg_load;
+	s64 avg_vruntime;
+
 	u64 exec_clock;
 	u64 min_vruntime;
 
Index: linux-2.6/kernel/sched_debug.c
===================================================================
--- linux-2.6.orig/kernel/sched_debug.c	2008-10-29 16:43:04.000000000 +0100
+++ linux-2.6/kernel/sched_debug.c	2008-10-29 16:43:37.000000000 +0100
@@ -161,6 +161,9 @@ void print_cfs_rq(struct seq_file *m, in
 			SPLIT_NS(spread0));
 	SEQ_printf(m, "  .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
 	SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);
+	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "avg_vruntime",
+			SPLIT_NS(avg_vruntime(cfs_rq)));
+
 #ifdef CONFIG_SCHEDSTATS
 #define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n);
 
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c	2008-10-29 16:43:17.000000000 +0100
+++ linux-2.6/kernel/sched_fair.c	2008-10-29 16:46:41.000000000 +0100
@@ -271,6 +271,60 @@ static inline s64 entity_key(struct cfs_
 	return se->vruntime - cfs_rq->min_vruntime;
 }
 
+static void
+avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	s64 key = entity_key(cfs_rq, se);
+	cfs_rq->avg_load += se->load.weight;
+	cfs_rq->avg_vruntime += key * se->load.weight;
+	cfs_rq->nr_queued++;
+}
+
+static void
+avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	s64 key = entity_key(cfs_rq, se);
+	cfs_rq->avg_load -= se->load.weight;
+	cfs_rq->avg_vruntime -= key * se->load.weight;
+	cfs_rq->nr_queued--;
+}
+
+static inline
+void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
+{
+	cfs_rq->avg_vruntime -= cfs_rq->nr_queued * cfs_rq->avg_load * delta;
+}
+
+static u64 avg_vruntime(struct cfs_rq *cfs_rq)
+{
+	s64 avg = cfs_rq->avg_vruntime;
+	long nr_queued = cfs_rq->nr_queued;
+
+	if (cfs_rq->curr) {
+		nr_queued++;
+		avg += entity_key(cfs_rq, cfs_rq->curr) * cfs_rq->curr->load.weight;
+	}
+
+	avg >>= NICE_0_SHIFT;
+
+	if (nr_queued)
+		avg = div_s64(avg, nr_queued);
+
+	return cfs_rq->min_vruntime + avg;
+}
+
+static void __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
+{
+	/*
+	 * open coded max_vruntime() to allow updating avg_vruntime
+	 */
+	s64 delta = (s64)(vruntime - cfs_rq->min_vruntime);
+	if (delta > 0) {
+		avg_vruntime_update(cfs_rq, delta);
+		cfs_rq->min_vruntime = vruntime;
+	}
+}
+
 static void update_min_vruntime(struct cfs_rq *cfs_rq)
 {
 	u64 vruntime = cfs_rq->min_vruntime;
@@ -289,7 +343,7 @@ static void update_min_vruntime(struct c
 			vruntime = min_vruntime(vruntime, se->vruntime);
 	}
 
-	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
+	__update_min_vruntime(cfs_rq, vruntime);
 }
 
 /*
@@ -303,6 +357,8 @@ static void __enqueue_entity(struct cfs_
 	s64 key = entity_key(cfs_rq, se);
 	int leftmost = 1;
 
+	avg_vruntime_add(cfs_rq, se);
+
 	/*
 	 * Find the right place in the rbtree:
 	 */
@@ -345,6 +401,7 @@ static void __dequeue_entity(struct cfs_
 		cfs_rq->next = NULL;
 
 	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
+	avg_vruntime_sub(cfs_rq, se);
 }
 
 static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)

next prev parent reply	other threads:[~2008-10-29 18:03 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-10-24  9:06 [PATCH 0/8] scheduler patches Peter Zijlstra
2008-10-24  9:06 ` [PATCH 1/8] sched: fix a find_busiest_group buglet Peter Zijlstra
2008-10-24  9:06 ` [PATCH 2/8] sched: more accurate min_vruntime accounting Peter Zijlstra
2008-10-24  9:06 ` [PATCH 3/8] sched: weaken sync hint Peter Zijlstra
2008-10-24  9:06 ` [PATCH 4/8] sched: re-instate vruntime based wakeup preemption Peter Zijlstra
2008-10-24  9:06 ` [PATCH 5/8] sched: virtual time buddy preemption Peter Zijlstra
2008-10-24  9:06 ` [PATCH 6/8] sched: avg_vruntime Peter Zijlstra
2008-10-29 15:48   ` Peter Zijlstra [this message]
2008-11-01 18:13     ` Fabio Checconi
2008-10-24  9:06 ` [PATCH 7/8] sched: non-zero lag renice Peter Zijlstra
2008-10-24 17:47   ` Chris Friesen
2008-10-24 20:28     ` Peter Zijlstra
2008-10-24 21:13       ` Chris Friesen
2008-10-24  9:06 ` [PATCH 8/8] use avg_vruntime for task placement Peter Zijlstra
2008-10-24 10:26 ` [PATCH 0/8] scheduler patches Ingo Molnar
2008-10-24 10:29   ` Peter Zijlstra

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1225295314.9315.10.camel@lappy.programming.kicks-ass.net \
    --to=a.p.zijlstra@chello.nl \
    --cc=efault@gmx.de \
    --cc=fabio@gandalf.sssup.it \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=vatsa@in.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.