linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: pjt@google.com
To: linux-kernel@vger.kernel.org
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>,
	Ingo Molnar <mingo@elte.hu>,
	Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>,
	Srivatsa Vaddagiri <vatsa@in.ibm.com>,
	Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>,
	Venki Pallipadi <venki@google.com>,
	Ben Segall <bsegall@google.com>, Mike Galbraith <efault@gmx.de>,
	Vincent Guittot <vincent.guittot@linaro.org>,
	Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>,
	Morten Rasmussen <Morten.Rasmussen@arm.com>,
	"Paul E. McKenney" <paulmck@linux.vnet.ibm.com>,
	Namhyung Kim <namhyung@kernel.org>
Subject: [patch 01/16] sched: track the runnable average on a per-task entitiy basis
Date: Thu, 23 Aug 2012 07:14:23 -0700	[thread overview]
Message-ID: <20120823141506.372695337@google.com> (raw)
In-Reply-To: 20120823141422.444396696@google.com

[-- Attachment #1: sched-sched_avg.patch --]
[-- Type: text/plain, Size: 7771 bytes --]

From: Paul Turner <pjt@google.com>

Instead of tracking averaging the load parented by a cfs_rq, we can track
entity load directly.  With the load for a given cfs_rq then being the sum of
its children.

To do this we represent the historical contribution to runnable average within each
trailing 1024us of execution as the coefficients of a geometric series.

We can express this for a given task t as:
  runnable_sum(t) = \Sum u_i * y^i, runnable_avg_period(t) = \Sum 1024 * y^i
  load(t) = weight_t * runnable_sum(t) / runnable_avg_period(t)

Where: u_i is the usage in the last i`th 1024us period (approximately 1ms) ~ms
and y is chosen such that y^k = 1/2.  We currently choose k to be 32 which
roughly translates to about a sched period.

Signed-off-by: Paul Turner <pjt@google.com>
Reviewed-by: Ben Segall <bsegall@google.com>
---
 include/linux/sched.h |   13 +++++
 kernel/sched/core.c   |    5 ++
 kernel/sched/debug.c  |    4 ++
 kernel/sched/fair.c   |  128 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 150 insertions(+), 0 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f3eebc1..f553da9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1139,6 +1139,16 @@ struct load_weight {
 	unsigned long weight, inv_weight;
 };
 
+struct sched_avg {
+	/*
+	 * These sums represent an infinite geometric series and so are bound
+	 * above by 1024/(1-y).  Thus we only need a u32 to store them for for all
+	 * choices of y < 1-2^(-32)*1024.
+	 */
+	u32 runnable_avg_sum, runnable_avg_period;
+	u64 last_runnable_update;
+};
+
 #ifdef CONFIG_SCHEDSTATS
 struct sched_statistics {
 	u64			wait_start;
@@ -1199,6 +1209,9 @@ struct sched_entity {
 	/* rq "owned" by this entity/group: */
 	struct cfs_rq		*my_q;
 #endif
+#ifdef CONFIG_SMP
+	struct sched_avg	avg;
+#endif
 };
 
 struct sched_rt_entity {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 78d9c96..fcc3cad 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1556,6 +1556,11 @@ static void __sched_fork(struct task_struct *p)
 	p->se.vruntime			= 0;
 	INIT_LIST_HEAD(&p->se.group_node);
 
+#ifdef CONFIG_SMP
+	p->se.avg.runnable_avg_period = 0;
+	p->se.avg.runnable_avg_sum = 0;
+#endif
+
 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 6f79596..61f7097 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -85,6 +85,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
 	P(se->statistics.wait_count);
 #endif
 	P(se->load.weight);
+#ifdef CONFIG_SMP
+	P(se->avg.runnable_avg_sum);
+	P(se->avg.runnable_avg_period);
+#endif
 #undef PN
 #undef P
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 01d3eda..2c53263 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -971,6 +971,125 @@ static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
 }
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
+#ifdef CONFIG_SMP
+/*
+ * Approximate:
+ *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
+ */
+static __always_inline u64 decay_load(u64 val, u64 n)
+{
+	for (; n && val; n--) {
+		val *= 4008;
+		val >>= 12;
+	}
+
+	return val;
+}
+
+/* We can represent the historical contribution to runnable average as the
+ * coefficients of a geometric series.  To do this we sub-divide our runnable
+ * history into segments of approximately 1ms (1024us); label the segment that
+ * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
+ *
+ * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
+ *      p0            p1           p1
+ *     (now)       (~1ms ago)  (~2ms ago)
+ *
+ * Let u_i denote the fraction of p_i that the entity was runnable.
+ *
+ * We then designate the fractions u_i as our co-efficients, yielding the
+ * following representation of historical load:
+ *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
+ *
+ * We choose y based on the with of a reasonably scheduling period, fixing:
+ *   y^32 = 0.5
+ *
+ * This means that the contribution to load ~32ms ago (u_32) will be weighted
+ * approximately half as much as the contribution to load within the last ms
+ * (u_0).
+ *
+ * When a period "rolls over" and we have new u_0`, multiplying the previous
+ * sum again by y is sufficient to update:
+ *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
+ *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1]
+ */
+static __always_inline int __update_entity_runnable_avg(u64 now,
+							struct sched_avg *sa,
+							int runnable)
+{
+	u64 delta;
+	int delta_w, decayed = 0;
+
+	delta = now - sa->last_runnable_update;
+	/*
+	 * This should only happen when time goes backwards, which it
+	 * unfortunately does during sched clock init when we swap over to TSC.
+	 */
+	if ((s64)delta < 0) {
+		sa->last_runnable_update = now;
+		return 0;
+	}
+
+	/*
+	 * Use 1024ns as the unit of measurement since it's a reasonable
+	 * approximation of 1us and fast to compute.
+	 */
+	delta >>= 10;
+	if (!delta)
+		return 0;
+	sa->last_runnable_update = now;
+
+	/* delta_w is the amount already accumulated against our next period */
+	delta_w = sa->runnable_avg_period % 1024;
+	if (delta + delta_w >= 1024) {
+		/* period roll-over */
+		decayed = 1;
+
+		/*
+		 * Now that we know we're crossing a period boundary, figure
+		 * out how much from delta we need to complete the current
+		 * period and accrue it.
+		 */
+		delta_w = 1024 - delta_w;
+		BUG_ON(delta_w > delta);
+		do {
+			if (runnable)
+				sa->runnable_avg_sum += delta_w;
+			sa->runnable_avg_period += delta_w;
+
+			/*
+			 * Remainder of delta initiates a new period, roll over
+			 * the previous.
+			 */
+			sa->runnable_avg_sum =
+				decay_load(sa->runnable_avg_sum, 1);
+			sa->runnable_avg_period =
+				decay_load(sa->runnable_avg_period, 1);
+
+			delta -= delta_w;
+			/* New period is empty */
+			delta_w = 1024;
+		} while (delta >= 1024);
+	}
+
+	/* Remainder of delta accrued against u_0` */
+	if (runnable)
+		sa->runnable_avg_sum += delta;
+	sa->runnable_avg_period += delta;
+
+	return decayed;
+}
+
+/* Update a sched_entity's runnable average */
+static inline void update_entity_load_avg(struct sched_entity *se)
+{
+	__update_entity_runnable_avg(rq_of(cfs_rq_of(se))->clock_task, &se->avg,
+				     se->on_rq);
+}
+#else
+static inline void update_entity_load_avg(struct sched_entity *se) {}
+#endif
+
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 #ifdef CONFIG_SCHEDSTATS
@@ -1097,6 +1216,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 */
 	update_curr(cfs_rq);
 	update_cfs_load(cfs_rq, 0);
+	update_entity_load_avg(se);
 	account_entity_enqueue(cfs_rq, se);
 	update_cfs_shares(cfs_rq);
 
@@ -1171,6 +1291,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
+	update_entity_load_avg(se);
 
 	update_stats_dequeue(cfs_rq, se);
 	if (flags & DEQUEUE_SLEEP) {
@@ -1340,6 +1461,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 		update_stats_wait_start(cfs_rq, prev);
 		/* Put 'current' back into the tree. */
 		__enqueue_entity(cfs_rq, prev);
+		/* in !on_rq case, update occurred at dequeue */
+		update_entity_load_avg(prev);
 	}
 	cfs_rq->curr = NULL;
 }
@@ -1353,6 +1476,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	update_curr(cfs_rq);
 
 	/*
+	 * Ensure that runnable average is periodically updated.
+	 */
+	update_entity_load_avg(curr);
+
+	/*
 	 * Update share accounting for long-running entities.
 	 */
 	update_entity_shares_tick(cfs_rq);



  reply	other threads:[~2012-08-23 14:16 UTC|newest]

Thread overview: 59+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-08-23 14:14 [patch 00/16] sched: per-entity load-tracking pjt
2012-08-23 14:14 ` pjt [this message]
2012-08-24  8:20   ` [patch 01/16] sched: track the runnable average on a per-task entitiy basis Namhyung Kim
2012-08-28 22:12     ` Paul Turner
2012-10-24  9:43   ` [tip:sched/core] sched: Track the runnable average on a per-task entity basis tip-bot for Paul Turner
2012-10-25  3:28     ` li guang
2012-10-25 16:58       ` Benjamin Segall
2012-08-23 14:14 ` [patch 02/16] sched: maintain per-rq runnable averages pjt
2012-10-24  9:44   ` [tip:sched/core] sched: Maintain " tip-bot for Ben Segall
2012-10-28 10:12   ` [patch 02/16] sched: maintain " Preeti Murthy
2012-10-29 17:38     ` Benjamin Segall
2012-11-07  8:28       ` Preeti U Murthy
2012-08-23 14:14 ` [patch 03/16] sched: aggregate load contributed by task entities on parenting cfs_rq pjt
2012-10-24  9:45   ` [tip:sched/core] sched: Aggregate " tip-bot for Paul Turner
2012-08-23 14:14 ` [patch 04/16] sched: maintain the load contribution of blocked entities pjt
2012-10-24  9:46   ` [tip:sched/core] sched: Maintain " tip-bot for Paul Turner
2012-08-23 14:14 ` [patch 05/16] sched: add an rq migration call-back to sched_class pjt
2012-10-24  9:47   ` [tip:sched/core] sched: Add " tip-bot for Paul Turner
2012-08-23 14:14 ` [patch 06/16] sched: account for blocked load waking back up pjt
     [not found]   ` <CAM4v1pO8SPCmqJTTBHpqwrwuO7noPdskg0RSooxyPsWoE395_A@mail.gmail.com>
2012-09-04 17:29     ` Benjamin Segall
2012-10-24  9:48   ` [tip:sched/core] sched: Account " tip-bot for Paul Turner
2012-08-23 14:14 ` [patch 07/16] sched: aggregate total task_group load pjt
2012-10-24  9:49   ` [tip:sched/core] sched: Aggregate " tip-bot for Paul Turner
2012-08-23 14:14 ` [patch 08/16] sched: compute load contribution by a group entity pjt
2012-10-24  9:50   ` [tip:sched/core] sched: Compute " tip-bot for Paul Turner
2012-08-23 14:14 ` [patch 09/16] sched: normalize tg load contributions against runnable time pjt
2012-10-24  9:51   ` [tip:sched/core] sched: Normalize " tip-bot for Paul Turner
2012-08-23 14:14 ` [patch 10/16] sched: maintain runnable averages across throttled periods pjt
2012-10-24  9:52   ` [tip:sched/core] sched: Maintain " tip-bot for Paul Turner
2012-08-23 14:14 ` [patch 11/16] sched: replace update_shares weight distribution with per-entity computation pjt
2012-09-24 19:44   ` "Jan H. Schönherr"
2012-09-24 20:39     ` Benjamin Segall
2012-10-02 21:14       ` Paul Turner
2012-10-24  9:53   ` [tip:sched/core] sched: Replace " tip-bot for Paul Turner
2012-08-23 14:14 ` [patch 12/16] sched: refactor update_shares_cpu() -> update_blocked_avgs() pjt
2012-10-24  9:54   ` [tip:sched/core] sched: Refactor " tip-bot for Paul Turner
2012-08-23 14:14 ` [patch 13/16] sched: update_cfs_shares at period edge pjt
2012-09-24 19:51   ` "Jan H. Schönherr"
2012-10-02 21:09     ` Paul Turner
2012-10-24  9:55   ` [tip:sched/core] sched: Update_cfs_shares " tip-bot for Paul Turner
2012-08-23 14:14 ` [patch 14/16] sched: make __update_entity_runnable_avg() fast pjt
2012-08-24  8:28   ` Namhyung Kim
2012-08-28 22:18     ` Paul Turner
2012-10-24  9:56   ` [tip:sched/core] sched: Make " tip-bot for Paul Turner
2012-08-23 14:14 ` [patch 15/16] sched: implement usage tracking pjt
2012-10-19 12:18   ` Vincent Guittot
2012-08-23 14:14 ` [patch 16/16] sched: introduce temporary FAIR_GROUP_SCHED dependency for load-tracking pjt
2012-10-24  9:57   ` [tip:sched/core] sched: Introduce " tip-bot for Paul Turner
2012-09-24  9:30 ` [patch 00/16] sched: per-entity load-tracking "Jan H. Schönherr"
2012-09-24 17:16   ` Benjamin Segall
2012-10-05  9:07     ` Paul Turner
2012-11-26 13:08 ` Jassi Brar
2012-12-20  7:39   ` Stephen Boyd
2012-12-20  8:08     ` Jassi Brar
  -- strict thread matches above, loose matches on Subject: below --
2012-06-28  2:24 [PATCH 00/16] Series short description Paul Turner
2012-06-28  2:24 ` [PATCH 01/16] sched: track the runnable average on a per-task entitiy basis Paul Turner
2012-06-28  6:06   ` Namhyung Kim
2012-07-12  0:14     ` Paul Turner
2012-07-04 15:32   ` Peter Zijlstra
2012-07-12  0:12     ` Paul Turner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20120823141506.372695337@google.com \
    --to=pjt@google.com \
    --cc=Morten.Rasmussen@arm.com \
    --cc=a.p.zijlstra@chello.nl \
    --cc=bsegall@google.com \
    --cc=efault@gmx.de \
    --cc=kamalesh@linux.vnet.ibm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=namhyung@kernel.org \
    --cc=nikunj@linux.vnet.ibm.com \
    --cc=paulmck@linux.vnet.ibm.com \
    --cc=svaidy@linux.vnet.ibm.com \
    --cc=vatsa@in.ibm.com \
    --cc=venki@google.com \
    --cc=vincent.guittot@linaro.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).