[PATCH v2 6/6] sched: hierarchical task accounting for FAIR_GROUP_SCHED

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Paul Turner <pjt@google.com>
To: linux-kernel@vger.kernel.org
Cc: Paul Menage <menage@google.com>,
	Srivatsa Vaddagiri <vatsa@in.ibm.com>,
	Dhaval Giani <dhaval@linux.vnet.ibm.com>,
	Gautham R Shenoy <ego@in.ibm.com>,
	Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>,
	Herbert Poetzl <herbert@13thfloor.at>,
	Balbir Singh <balbir@linux.vnet.ibm.com>,
	Chris Friesen <cfriesen@nortel.com>, Avi Kivity <avi@redhat.com>,
	Bharata B Rao <bharata@linux.vnet.ibm.com>,
	Nikhil Rao <ncrao@google.com>, Ingo Molnar <mingo@elte.hu>,
	Pavel Emelyanov <xemul@openvz.org>,
	Mike Waychison <mikew@google.com>,
	Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>,
	Peter Zijlstra <a.p.zijlstra@chello.nl>
Subject: [PATCH v2 6/6] sched: hierarchical task accounting for FAIR_GROUP_SCHED
Date: Wed, 28 Apr 2010 04:17:17 -0700	[thread overview]
Message-ID: <20100428111717.7954.30963.stgit@kitami.corp.google.com> (raw)
In-Reply-To: <20100428110720.7954.53537.stgit@kitami.corp.google.com>

With task entities participating in throttled sub-trees, it is possible for
task activation/de-activation to not produce root-visible changes to
rq->nr_running.  This in turn leads to incorrect idle and weight-per-task load
balance decisions.

To allow correct accounting, we move responsibility for updating
rq->nr_running to the respective scheduling classes.  In the fair-group case
this update is hierarchical, tracking the number of active tasks rooted at
each group entity.

Note: technically this issue also exists with the existing sched_rt
throttling; however, due to the nearly complete provisioning of system
resources for rt scheduling, this is much less common by default.
---
 kernel/sched.c      |    9 ++++++---
 kernel/sched_fair.c |   42 ++++++++++++++++++++++++++++++++++++++++++
 kernel/sched_rt.c   |    5 ++++-
 3 files changed, 52 insertions(+), 4 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index ac74d3a..87fb0c0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -368,7 +368,7 @@ static inline struct task_group *task_group(struct task_struct *p)
 /* CFS-related fields in a runqueue */
 struct cfs_rq {
 	struct load_weight load;
-	unsigned long nr_running;
+	unsigned long nr_running, h_nr_tasks;
 
 	u64 exec_clock;
 	u64 min_vruntime;
@@ -1967,6 +1967,11 @@ static inline u64 sched_cfs_bandwidth_slice(void)
 
 #include "sched_stats.h"
 
+static void mod_nr_running(struct rq *rq, long delta)
+{
+	rq->nr_running += delta;
+}
+
 static void inc_nr_running(struct rq *rq)
 {
 	rq->nr_running++;
@@ -2042,7 +2047,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
 		rq->nr_uninterruptible--;
 
 	enqueue_task(rq, p, wakeup, false);
-	inc_nr_running(rq);
 }
 
 /*
@@ -2054,7 +2058,6 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
 		rq->nr_uninterruptible++;
 
 	dequeue_task(rq, p, sleep);
-	dec_nr_running(rq);
 }
 
 #include "sched_idletask.c"
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index edea44e..eb6ed15 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -76,6 +76,8 @@ unsigned int sysctl_sched_child_runs_first __read_mostly;
  */
 unsigned int __read_mostly sysctl_sched_compat_yield;
 
+static void account_hier_tasks(struct sched_entity *se, int delta);
+
 /*
  * SCHED_OTHER wake-up granularity.
  * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
@@ -682,6 +684,40 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	se->on_rq = 0;
 }
 
+#ifdef CONFIG_CFS_BANDWIDTH
+/* maintain hierarchical task counts on group entities */
+static void account_hier_tasks(struct sched_entity *se, int delta)
+{
+	struct rq *rq = rq_of(cfs_rq_of(se));
+	struct cfs_rq *cfs_rq;
+
+	for_each_sched_entity(se) {
+		/* a throttled entity cannot affect its parent hierarchy */
+		if (group_cfs_rq(se) && cfs_rq_throttled(group_cfs_rq(se)))
+			break;
+
+		/* we affect our queuing entity */
+		cfs_rq = cfs_rq_of(se);
+		cfs_rq->h_nr_tasks += delta;
+	}
+
+	/* account for global nr_running delta to hierarchy change */
+	if (!se)
+		mod_nr_running(rq, delta);
+}
+#else
+/*
+ * In the absence of group throttling, all operations are guaranteed to be
+ * globally visible at the root rq level.
+ */
+static void account_hier_tasks(struct sched_entity *se, int delta)
+{
+	struct rq *rq = rq_of(cfs_rq_of(se));
+
+	mod_nr_running(rq, delta);
+}
+#endif
+
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 #ifdef CONFIG_SCHEDSTATS
@@ -1117,6 +1153,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
 		flags = ENQUEUE_WAKEUP;
 	}
 
+	account_hier_tasks(&p->se, 1);
 	hrtick_update(rq);
 }
 
@@ -1142,6 +1179,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
 		sleep = 1;
 	}
 
+	account_hier_tasks(&p->se, -1);
 	hrtick_update(rq);
 }
 
@@ -1215,12 +1253,15 @@ static u64 tg_request_cfs_quota(struct task_group *tg)
 	return delta;
 }
 
+static void account_hier_tasks(struct sched_entity *se, int delta);
+
 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *se;
 
 	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
 
+	account_hier_tasks(se, -cfs_rq->h_nr_tasks);
 	for_each_sched_entity(se) {
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
@@ -1249,6 +1290,7 @@ static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 	cfs_rq->throttled = 0;
 	cfs_rq->throttled_timestamp = 0;
 
+	account_hier_tasks(se, cfs_rq->h_nr_tasks);
 	for_each_sched_entity(se) {
 		if (se->on_rq)
 			break;
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 15bbc45..c908bc0 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -882,6 +882,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head)
 
 	if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
 		enqueue_pushable_task(rq, p);
+
+	inc_nr_running(rq);
 }
 
 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -892,6 +894,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 	dequeue_rt_entity(rt_se);
 
 	dequeue_pushable_task(rq, p);
+
+	dec_nr_running(rq);
 }
 
 /*
@@ -1758,4 +1762,3 @@ static void print_rt_stats(struct seq_file *m, int cpu)
 	rcu_read_unlock();
 }
 #endif /* CONFIG_SCHED_DEBUG */
-

     prev parent reply	other threads:[~2010-04-28 11:18 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-04-28 11:16 [PATCH v2 0/6] CFS Bandwidth Control Paul Turner
2010-04-28 11:16 ` [PATCH v2 1/6] sched: introduce primitives to account for CFS bandwidth tracking Paul Turner
2010-04-28 11:16 ` [PATCH v2 2/6] sched: accumulate per-cfs_rq cpu usage Paul Turner
2010-04-28 11:17 ` [PATCH v2 3/6] sched: throttle cfs_rq entities which exceed their local quota Paul Turner
2010-04-28 11:17 ` [PATCH v2 4/6] sched: unthrottle cfs_rq(s) who ran out of quota at period refresh Paul Turner
2010-04-28 11:17 ` [PATCH v2 5/6] sched: add exports tracking cfs bandwidth control statistics Paul Turner
2010-04-28 11:17 ` Paul Turner [this message]

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:ac74d3a dfblob:87fb0c0 dfblob:edea44e dfblob:eb6ed15
dfblob:15bbc45 dfblob:c908bc0 )
 OR (
bs:"[PATCH v2 6/6] sched: hierarchical task accounting for FAIR_GROUP_SCHED" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20100428111717.7954.30963.stgit@kitami.corp.google.com \
    --to=pjt@google.com \
    --cc=a.p.zijlstra@chello.nl \
    --cc=avi@redhat.com \
    --cc=balbir@linux.vnet.ibm.com \
    --cc=bharata@linux.vnet.ibm.com \
    --cc=cfriesen@nortel.com \
    --cc=dhaval@linux.vnet.ibm.com \
    --cc=ego@in.ibm.com \
    --cc=herbert@13thfloor.at \
    --cc=kamalesh@linux.vnet.ibm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=menage@google.com \
    --cc=mikew@google.com \
    --cc=mingo@elte.hu \
    --cc=ncrao@google.com \
    --cc=svaidy@linux.vnet.ibm.com \
    --cc=vatsa@in.ibm.com \
    --cc=xemul@openvz.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.