From: Paul Turner <pjt@google.com>
To: linux-kernel@vger.kernel.org
Cc: Bharata B Rao <bharata@linux.vnet.ibm.com>,
Dhaval Giani <dhaval@linux.vnet.ibm.com>,
Balbir Singh <balbir@linux.vnet.ibm.com>,
Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>,
Gautham R Shenoy <ego@in.ibm.com>,
Srivatsa Vaddagiri <vatsa@in.ibm.com>,
Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>,
Ingo Molnar <mingo@elte.hu>,
Peter Zijlstra <a.p.zijlstra@chello.nl>,
Pavel Emelyanov <xemul@openvz.org>,
Herbert Poetzl <herbert@13thfloor.at>,
Avi Kivity <avi@redhat.com>, Chris Friesen <cfriesen@nortel.com>
Subject: [CFS Bandwidth Control v4 6/7] sched: hierarchical task accounting for SCHED_OTHER
Date: Tue, 15 Feb 2011 19:18:37 -0800 [thread overview]
Message-ID: <20110216031841.352204682@google.com> (raw)
In-Reply-To: 20110216031831.571628191@google.com
[-- Attachment #1: sched-bwc-account_cfs_tasks.patch --]
[-- Type: text/plain, Size: 5254 bytes --]
With task entities participating in throttled sub-trees, it is possible for
task activation/deactivation not to lead to root-visible changes to
rq->nr_running. This in turn leads to incorrect idle and weight-per-task load
balance decisions.
To allow correct accounting, we move responsibility for updating rq->nr_running
to the respective scheduling classes (sched_class). In the fair-group case this
update is hierarchical, tracking the number of active tasks rooted at each group
entity.
Note: technically this issue also exists with the existing sched_rt
throttling; however, due to the nearly complete provisioning of system
resources for rt scheduling, this is much less common by default.
Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
---
kernel/sched.c | 9 ++++++---
kernel/sched_fair.c | 42 ++++++++++++++++++++++++++++++++++++++++++
kernel/sched_rt.c | 5 ++++-
3 files changed, 52 insertions(+), 4 deletions(-)
Index: tip/kernel/sched.c
===================================================================
--- tip.orig/kernel/sched.c
+++ tip/kernel/sched.c
@@ -330,7 +330,7 @@ struct task_group root_task_group;
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
- unsigned long nr_running;
+ unsigned long nr_running, h_nr_tasks;
u64 exec_clock;
u64 min_vruntime;
@@ -1846,6 +1846,11 @@ static const struct sched_class rt_sched
#include "sched_stats.h"
+static void mod_nr_running(struct rq *rq, long delta)
+{
+ rq->nr_running += delta;
+}
+
static void inc_nr_running(struct rq *rq)
{
rq->nr_running++;
@@ -1896,7 +1901,6 @@ static void activate_task(struct rq *rq,
rq->nr_uninterruptible--;
enqueue_task(rq, p, flags);
- inc_nr_running(rq);
}
/*
@@ -1908,7 +1912,6 @@ static void deactivate_task(struct rq *r
rq->nr_uninterruptible++;
dequeue_task(rq, p, flags);
- dec_nr_running(rq);
}
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
Index: tip/kernel/sched_fair.c
===================================================================
--- tip.orig/kernel/sched_fair.c
+++ tip/kernel/sched_fair.c
@@ -81,6 +81,8 @@ unsigned int normalized_sysctl_sched_wak
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+static void account_hier_tasks(struct sched_entity *se, int delta);
+
/*
* The exponential sliding window over which load is averaged for shares
* distribution.
@@ -933,6 +935,40 @@ static inline void update_entity_shares_
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
+#ifdef CONFIG_CFS_BANDWIDTH
+/* maintain hierarchical task counts on group entities */
+static void account_hier_tasks(struct sched_entity *se, int delta)
+{
+ struct rq *rq = rq_of(cfs_rq_of(se));
+ struct cfs_rq *cfs_rq;
+
+ for_each_sched_entity(se) {
+ /* a throttled entity cannot affect its parent hierarchy */
+ if (group_cfs_rq(se) && cfs_rq_throttled(group_cfs_rq(se)))
+ break;
+
+ /* we affect our queuing entity */
+ cfs_rq = cfs_rq_of(se);
+ cfs_rq->h_nr_tasks += delta;
+ }
+
+ /* account for global nr_running delta to hierarchy change */
+ if (!se)
+ mod_nr_running(rq, delta);
+}
+#else
+/*
+ * In the absence of group throttling, all operations are guaranteed to be
+ * globally visible at the root rq level.
+ */
+static void account_hier_tasks(struct sched_entity *se, int delta)
+{
+ struct rq *rq = rq_of(cfs_rq_of(se));
+
+ mod_nr_running(rq, delta);
+}
+#endif
+
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
@@ -1428,6 +1464,7 @@ enqueue_task_fair(struct rq *rq, struct
update_cfs_shares(cfs_rq);
}
+ account_hier_tasks(&p->se, 1);
hrtick_update(rq);
}
@@ -1461,6 +1498,7 @@ static void dequeue_task_fair(struct rq
update_cfs_shares(cfs_rq);
}
+ account_hier_tasks(&p->se, -1);
hrtick_update(rq);
}
@@ -1488,6 +1526,8 @@ static u64 tg_request_cfs_quota(struct t
return delta;
}
+static void account_hier_tasks(struct sched_entity *se, int delta);
+
static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct sched_entity *se;
@@ -1507,6 +1547,7 @@ static void throttle_cfs_rq(struct cfs_r
if (!se->on_rq)
goto out_throttled;
+ account_hier_tasks(se, -cfs_rq->h_nr_tasks);
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
@@ -1541,6 +1582,7 @@ static void unthrottle_cfs_rq(struct cfs
cfs_rq->load_stamp = cfs_rq->load_last = rq->clock_task;
cfs_rq->throttled = 0;
+ account_hier_tasks(se, cfs_rq->h_nr_tasks);
for_each_sched_entity(se) {
if (se->on_rq)
break;
Index: tip/kernel/sched_rt.c
===================================================================
--- tip.orig/kernel/sched_rt.c
+++ tip/kernel/sched_rt.c
@@ -906,6 +906,8 @@ enqueue_task_rt(struct rq *rq, struct ta
if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
+
+ inc_nr_running(rq);
}
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -916,6 +918,8 @@ static void dequeue_task_rt(struct rq *r
dequeue_rt_entity(rt_se);
dequeue_pushable_task(rq, p);
+
+ dec_nr_running(rq);
}
/*
@@ -1783,4 +1787,3 @@ static void print_rt_stats(struct seq_fi
rcu_read_unlock();
}
#endif /* CONFIG_SCHED_DEBUG */
-
next prev parent reply other threads:[~2011-02-16 3:20 UTC|newest]
Thread overview: 71+ messages / expand[flat|nested] mbox.gz Atom feed top
2011-02-16 3:18 [CFS Bandwidth Control v4 0/7] Introduction Paul Turner
2011-02-16 3:18 ` [CFS Bandwidth Control v4 1/7] sched: introduce primitives to account for CFS bandwidth tracking Paul Turner
2011-02-16 16:52 ` Balbir Singh
2011-02-17 2:54 ` Bharata B Rao
2011-02-23 13:32 ` Peter Zijlstra
2011-02-25 3:11 ` Paul Turner
2011-02-25 20:53 ` Paul Turner
2011-02-16 3:18 ` [CFS Bandwidth Control v4 2/7] sched: accumulate per-cfs_rq cpu usage Paul Turner
2011-02-16 17:45 ` Balbir Singh
2011-02-23 13:32 ` Peter Zijlstra
2011-02-25 3:33 ` Paul Turner
2011-02-25 12:31 ` Peter Zijlstra
2011-02-16 3:18 ` [CFS Bandwidth Control v4 3/7] sched: throttle cfs_rq entities which exceed their local quota Paul Turner
2011-02-18 6:52 ` Balbir Singh
2011-02-23 13:32 ` Peter Zijlstra
2011-02-24 5:21 ` Bharata B Rao
2011-02-24 11:05 ` Peter Zijlstra
2011-02-24 15:45 ` Bharata B Rao
2011-02-24 15:52 ` Peter Zijlstra
2011-02-24 16:39 ` Bharata B Rao
2011-02-24 17:20 ` Peter Zijlstra
2011-02-25 3:59 ` Paul Turner
2011-02-25 3:41 ` Paul Turner
2011-02-25 3:10 ` Paul Turner
2011-02-25 13:58 ` Bharata B Rao
2011-02-25 20:51 ` Paul Turner
2011-02-28 3:50 ` Bharata B Rao
2011-02-28 6:38 ` Paul Turner
2011-02-28 13:48 ` Peter Zijlstra
2011-03-01 8:31 ` Paul Turner
2011-03-02 7:23 ` Bharata B Rao
2011-03-02 8:05 ` Paul Turner
2011-02-16 3:18 ` [CFS Bandwidth Control v4 4/7] sched: unthrottle cfs_rq(s) who ran out of quota at period refresh Paul Turner
2011-02-18 7:19 ` Balbir Singh
2011-02-18 8:10 ` Bharata B Rao
2011-02-23 12:23 ` Peter Zijlstra
2011-02-23 13:32 ` Peter Zijlstra
2011-02-24 7:04 ` Bharata B Rao
2011-02-24 11:14 ` Peter Zijlstra
2011-02-26 0:02 ` Paul Turner
2011-02-16 3:18 ` [CFS Bandwidth Control v4 5/7] sched: add exports tracking cfs bandwidth control statistics Paul Turner
2011-02-22 3:14 ` Balbir Singh
2011-02-22 4:13 ` Bharata B Rao
2011-02-22 4:40 ` Balbir Singh
2011-02-23 8:03 ` Paul Turner
2011-02-23 10:13 ` Balbir Singh
2011-02-23 13:32 ` Peter Zijlstra
2011-02-25 3:26 ` Paul Turner
2011-02-25 8:54 ` Peter Zijlstra
2011-02-16 3:18 ` Paul Turner [this message]
2011-02-22 3:17 ` [CFS Bandwidth Control v4 6/7] sched: hierarchical task accounting for SCHED_OTHER Balbir Singh
2011-02-23 8:05 ` Paul Turner
2011-02-23 2:02 ` Hidetoshi Seto
2011-02-23 2:20 ` Paul Turner
2011-02-23 2:43 ` Balbir Singh
2011-02-23 13:32 ` Peter Zijlstra
2011-02-25 3:25 ` Paul Turner
2011-02-25 12:17 ` Peter Zijlstra
2011-02-16 3:18 ` [CFS Bandwidth Control v4 7/7] sched: add documentation for bandwidth control Paul Turner
2011-02-21 2:47 ` [CFS Bandwidth Control v4 0/7] Introduction Xiao Guangrong
2011-02-22 10:28 ` Bharata B Rao
2011-02-23 7:42 ` Paul Turner
2011-02-23 7:51 ` Balbir Singh
2011-02-23 7:56 ` Paul Turner
2011-02-23 8:31 ` Bharata B Rao
[not found] ` <20110224161111.7d83a884@jacob-laptop>
2011-02-25 10:03 ` Paul Turner
2011-02-25 13:06 ` jacob pan
2011-03-08 3:57 ` Balbir Singh
2011-03-08 18:18 ` Jacob Pan
2011-03-09 10:12 ` Paul Turner
2011-03-09 21:57 ` jacob pan
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20110216031841.352204682@google.com \
--to=pjt@google.com \
--cc=a.p.zijlstra@chello.nl \
--cc=avi@redhat.com \
--cc=balbir@linux.vnet.ibm.com \
--cc=bharata@linux.vnet.ibm.com \
--cc=cfriesen@nortel.com \
--cc=dhaval@linux.vnet.ibm.com \
--cc=ego@in.ibm.com \
--cc=herbert@13thfloor.at \
--cc=kamalesh@linux.vnet.ibm.com \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@elte.hu \
--cc=svaidy@linux.vnet.ibm.com \
--cc=vatsa@in.ibm.com \
--cc=xemul@openvz.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.