From: Paul Turner <pjt@google.com>
To: linux-kernel@vger.kernel.org
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>,
Bharata B Rao <bharata@linux.vnet.ibm.com>,
Dhaval Giani <dhaval.giani@gmail.com>,
Balbir Singh <balbir@linux.vnet.ibm.com>,
Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>,
Srivatsa Vaddagiri <vatsa@in.ibm.com>,
Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>,
Ingo Molnar <mingo@elte.hu>, Pavel Emelyanov <xemul@openvz.org>
Subject: [patch 07/15] sched: prevent interactions between throttled entities and load-balance
Date: Tue, 22 Mar 2011 20:03:33 -0700 [thread overview]
Message-ID: <20110323030449.333793180@google.com> (raw)
In-Reply-To: 20110323030326.789836913@google.com
[-- Attachment #1: sched-bwc-throttled_shares.patch --]
[-- Type: text/plain, Size: 5032 bytes --]
From the perspective of load-balance and shares distribution, throttled
entities should be invisible.
However, both of these operations work on 'active' lists and are not
inherently aware of what group hierarchies may be present. In some cases this
may be side-stepped (e.g. we could sideload via tg_load_down in load balance)
while in others (e.g. update_shares()) it is more difficult to compute without
incurring some O(n**2) costs.
Instead, track hierarchical throttled state at time of transition. This allows
us to easily identify whether an entity belongs to a throttled hierarchy and
avoid incorrect interactions with it.
Also, when an entity leaves a throttled hierarchy we need to advance the
timestamps used for its shares load averaging so that the elapsed throttled
time is not considered as part of the cfs_rq's operation.
Signed-off-by: Paul Turner <pjt@google.com>
---
kernel/sched.c | 2 -
kernel/sched_fair.c | 78 +++++++++++++++++++++++++++++++++++++---------------
2 files changed, 57 insertions(+), 23 deletions(-)
Index: tip/kernel/sched_fair.c
===================================================================
--- tip.orig/kernel/sched_fair.c
+++ tip/kernel/sched_fair.c
@@ -739,13 +739,15 @@ static void update_cfs_rq_load_contribut
}
}
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
+
static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
{
u64 period = sysctl_sched_shares_window;
u64 now, delta;
unsigned long load = cfs_rq->load.weight;
- if (cfs_rq->tg == &root_task_group)
+ if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
return;
now = rq_of(cfs_rq)->clock_task;
@@ -1312,23 +1314,7 @@ static inline int cfs_rq_throttled(struc
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
- struct task_group *tg;
- struct sched_entity *se;
-
- if (cfs_rq_throttled(cfs_rq))
- return 1;
-
- tg = cfs_rq->tg;
- se = tg->se[cpu_of(rq_of(cfs_rq))];
- if (!se)
- return 0;
-
- for_each_sched_entity(se) {
- if (cfs_rq_throttled(cfs_rq_of(se)))
- return 1;
- }
-
- return 0;
+ return cfs_rq->throttle_count > 0;
}
static inline int within_bandwidth(struct cfs_rq *cfs_rq)
@@ -1381,6 +1367,41 @@ static void account_cfs_rq_quota(struct
request_cfs_rq_quota(cfs_rq);
}
+struct tg_unthrottle_down_data {
+ int cpu;
+ u64 now;
+};
+
+static int tg_unthrottle_down(struct task_group *tg, void *data)
+{
+ struct tg_unthrottle_down_data *udd = data;
+ struct cfs_rq *cfs_rq = tg->cfs_rq[udd->cpu];
+ u64 delta;
+
+ cfs_rq->throttle_count--;
+ if (!cfs_rq->throttle_count) {
+ /* leaving throttled state, move up windows */
+ delta = udd->now - cfs_rq->load_stamp;
+ cfs_rq->load_stamp += delta;
+ cfs_rq->load_last += delta;
+ }
+
+ return 0;
+}
+
+static int tg_throttle_down(struct task_group *tg, void *data)
+{
+ long cpu = (long)data;
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
+
+ /* group is entering throttled state, record last load */
+ if (!cfs_rq->throttle_count)
+ update_cfs_load(cfs_rq, 0);
+ cfs_rq->throttle_count++;
+
+ return 0;
+}
+
static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct sched_entity *se;
@@ -1388,7 +1409,10 @@ static void throttle_cfs_rq(struct cfs_r
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
/* account load preceding throttle */
- update_cfs_load(cfs_rq, 0);
+ rcu_read_lock();
+ walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop,
+ (void*)(long)rq_of(cfs_rq)->cpu);
+ rcu_read_unlock();
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
@@ -1408,11 +1432,18 @@ static void unthrottle_cfs_rq(struct cfs
{
struct rq *rq = rq_of(cfs_rq);
struct sched_entity *se;
+ struct tg_unthrottle_down_data udd;
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
update_rq_clock(rq);
+ /* don't include throttled window for load statistics */
+ udd.cpu = rq->cpu;
+ udd.now = rq->clock_task;
+ walk_tg_tree_from(cfs_rq->tg, tg_unthrottle_down, tg_nop,
+ (void*)&udd);
+
cfs_rq->throttled = 0;
if (!cfs_rq->load.weight)
return;
@@ -2488,8 +2519,10 @@ static void update_shares(int cpu)
struct rq *rq = cpu_rq(cpu);
rcu_read_lock();
- for_each_leaf_cfs_rq(rq, cfs_rq)
- update_shares_cpu(cfs_rq->tg, cpu);
+ for_each_leaf_cfs_rq(rq, cfs_rq) {
+ if (!throttled_hierarchy(cfs_rq))
+ update_shares_cpu(cfs_rq->tg, cpu);
+ }
rcu_read_unlock();
}
@@ -2515,7 +2548,8 @@ load_balance_fair(struct rq *this_rq, in
/*
* empty group
*/
- if (!busiest_cfs_rq->task_weight)
+ if (!busiest_cfs_rq->task_weight ||
+ throttled_hierarchy(busiest_cfs_rq))
continue;
rem_load = (u64)rem_load_move * busiest_weight;
Index: tip/kernel/sched.c
===================================================================
--- tip.orig/kernel/sched.c
+++ tip/kernel/sched.c
@@ -386,7 +386,7 @@ struct cfs_rq {
unsigned long load_contribution;
#endif
#ifdef CONFIG_CFS_BANDWIDTH
- int quota_enabled, throttled;
+ int quota_enabled, throttled, throttle_count;
s64 quota_remaining;
#endif
#endif
next prev parent reply other threads:[~2011-03-23 3:10 UTC|newest]
Thread overview: 63+ messages / expand[flat|nested] mbox.gz Atom feed top
2011-03-23 3:03 [patch 00/15] CFS Bandwidth Control V5 Paul Turner
2011-03-23 3:03 ` [patch 01/15] sched: introduce primitives to account for CFS bandwidth tracking Paul Turner
2011-03-24 12:38 ` Kamalesh Babulal
2011-04-05 13:28 ` Peter Zijlstra
2011-03-23 3:03 ` [patch 02/15] sched: validate CFS quota hierarchies Paul Turner
2011-03-23 10:39 ` torbenh
2011-03-23 20:49 ` Paul Turner
2011-03-24 6:31 ` Bharata B Rao
2011-04-08 17:01 ` Peter Zijlstra
2011-03-29 6:57 ` Hidetoshi Seto
2011-04-04 23:10 ` Paul Turner
2011-04-05 13:28 ` Peter Zijlstra
2011-03-23 3:03 ` [patch 03/15] sched: accumulate per-cfs_rq cpu usage Paul Turner
2011-04-05 13:28 ` Peter Zijlstra
2011-04-06 20:44 ` Paul Turner
2011-04-05 13:28 ` Peter Zijlstra
2011-04-06 20:47 ` Paul Turner
2011-03-23 3:03 ` [patch 04/15] sched: throttle cfs_rq entities which exceed their local quota Paul Turner
2011-03-23 5:09 ` Mike Galbraith
2011-03-23 20:53 ` Paul Turner
2011-03-24 6:36 ` Bharata B Rao
2011-03-24 7:40 ` Paul Turner
2011-04-05 13:28 ` Peter Zijlstra
2011-04-05 23:15 ` Paul Turner
2011-03-23 3:03 ` [patch 05/15] sched: unthrottle cfs_rq(s) who ran out of quota at period refresh Paul Turner
2011-04-05 13:28 ` Peter Zijlstra
2011-04-05 13:33 ` Peter Zijlstra
2011-04-05 13:28 ` Peter Zijlstra
2011-04-05 13:28 ` Peter Zijlstra
2011-03-23 3:03 ` [patch 06/15] sched: allow for positional tg_tree walks Paul Turner
2011-03-23 3:03 ` Paul Turner [this message]
2011-04-05 13:28 ` [patch 07/15] sched: prevent interactions between throttled entities and load-balance Peter Zijlstra
2011-03-23 3:03 ` [patch 08/15] sched: migrate throttled tasks on HOTPLUG Paul Turner
2011-04-05 13:28 ` Peter Zijlstra
2011-04-06 2:31 ` Paul Turner
2011-03-23 3:03 ` [patch 09/15] sched: add exports tracking cfs bandwidth control statistics Paul Turner
2011-04-05 13:28 ` Peter Zijlstra
2011-03-23 3:03 ` [patch 10/15] sched: (fixlet) dont update shares twice on on_rq parent Paul Turner
2011-04-05 13:28 ` Peter Zijlstra
2011-03-23 3:03 ` [patch 11/15] sched: hierarchical task accounting for SCHED_OTHER Paul Turner
2011-04-05 13:28 ` Peter Zijlstra
2011-03-23 3:03 ` [patch 12/15] sched: maintain throttled rqs as a list Paul Turner
2011-04-22 2:50 ` Hidetoshi Seto
2011-04-24 21:23 ` Paul Turner
2011-03-23 3:03 ` [patch 13/15] sched: expire slack quota using generation counters Paul Turner
2011-04-05 13:28 ` Peter Zijlstra
2011-04-06 7:22 ` Paul Turner
2011-04-06 8:15 ` Peter Zijlstra
2011-04-06 11:26 ` Peter Zijlstra
2011-03-23 3:03 ` [patch 14/15] sched: return unused quota on voluntary sleep Paul Turner
2011-04-05 13:28 ` Peter Zijlstra
2011-04-06 2:25 ` Paul Turner
2011-03-23 3:03 ` [patch 15/15] sched: add documentation for bandwidth control Paul Turner
2011-03-24 6:38 ` Bharata B Rao
2011-03-24 16:12 ` [patch 00/15] CFS Bandwidth Control V5 Bharata B Rao
2011-03-31 7:57 ` Xiao Guangrong
2011-04-04 23:10 ` Paul Turner
2011-04-05 13:28 ` Peter Zijlstra
2011-05-20 2:12 ` Test for CFS Bandwidth Control V6 Xiao Guangrong
2011-05-24 0:53 ` Hidetoshi Seto
2011-05-24 7:56 ` Xiao Guangrong
2011-06-08 2:54 ` Paul Turner
2011-06-08 5:55 ` Hidetoshi Seto
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20110323030449.333793180@google.com \
--to=pjt@google.com \
--cc=a.p.zijlstra@chello.nl \
--cc=balbir@linux.vnet.ibm.com \
--cc=bharata@linux.vnet.ibm.com \
--cc=dhaval.giani@gmail.com \
--cc=kamalesh@linux.vnet.ibm.com \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@elte.hu \
--cc=svaidy@linux.vnet.ibm.com \
--cc=vatsa@in.ibm.com \
--cc=xemul@openvz.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.