From: Paul Turner <pjt@google.com>
To: linux-kernel@vger.kernel.org
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>,
Bharata B Rao <bharata@linux.vnet.ibm.com>,
Dhaval Giani <dhaval.giani@gmail.com>,
Balbir Singh <bsingharora@gmail.com>,
Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>,
Srivatsa Vaddagiri <vatsa@in.ibm.com>,
Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>,
Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>,
Ingo Molnar <mingo@elte.hu>, Pavel Emelyanov <xemul@openvz.org>,
Jason Baron <jbaron@redhat.com>
Subject: [patch 16/18] sched: return unused runtime on group dequeue
Date: Thu, 21 Jul 2011 09:43:41 -0700
Message-ID: <20110721184758.306848658@google.com>
In-Reply-To: <20110721164325.231521704@google.com>
[-- Attachment #1: sched-bwc-simple_return_quota.patch --]
[-- Type: text/plain, Size: 8152 bytes --]
When a local cfs_rq blocks, we return the majority of its remaining quota to
the global bandwidth pool for use by other runqueues.  We do this only when
the quota is current and there is more than min_cfs_rq_runtime (1ms by
default) of runtime remaining on the rq.

In the case where there are throttled runqueues and we have sufficient
bandwidth to meter out a slice, a second (slack) timer is kicked off to
handle this delivery, unthrottling where appropriate.
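For reference while reading the diff below, the per-dequeue decision reduces
to roughly the following condensed sketch (locking and the
RUNTIME_INF/expiration checks are elided; see __return_cfs_rq_runtime() in
the patch for the full version):

  static void __return_cfs_rq_runtime_sketch(struct cfs_rq *cfs_rq)
  {
  	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
  	s64 slack = cfs_rq->runtime_remaining - min_cfs_rq_runtime;

  	if (slack <= 0)
  		return;			/* keep at least 1ms locally */

  	cfs_b->runtime += slack;	/* donate excess to the global pool */
  	cfs_rq->runtime_remaining -= slack;

  	/*
  	 * Enough banked for a slice and someone is throttled?  Defer the
  	 * actual unthrottle to the slack timer since we hold rq->lock.
  	 */
  	if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
  	    !list_empty(&cfs_b->throttled_cfs_rq))
  		start_cfs_slack_bandwidth(cfs_b);
  }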

Using a 'worst case' antagonist that executes on each cpu for 1ms before
moving on to the next (a sketch of such an antagonist follows the results
below), on a fairly large machine:
no quota generations:
197.47 ms /cgroup/a/cpuacct.usage
199.46 ms /cgroup/a/cpuacct.usage
205.46 ms /cgroup/a/cpuacct.usage
198.46 ms /cgroup/a/cpuacct.usage
208.39 ms /cgroup/a/cpuacct.usage
Since we are allowed to use "stale" quota, our usage is effectively bounded
by the rate of input into the global pool and performance is relatively
stable.
with quota generations (1s increments):
119.58 ms /cgroup/a/cpuacct.usage
119.65 ms /cgroup/a/cpuacct.usage
119.64 ms /cgroup/a/cpuacct.usage
119.63 ms /cgroup/a/cpuacct.usage
119.60 ms /cgroup/a/cpuacct.usage
The large deficit here is due to quota generations (/intentionally/)
preventing us from using previously stranded slack quota: quota left
stranded on an rq when its period expires simply becomes unavailable.
with quota generations and quota return:
200.09 ms /cgroup/a/cpuacct.usage
200.09 ms /cgroup/a/cpuacct.usage
198.09 ms /cgroup/a/cpuacct.usage
200.09 ms /cgroup/a/cpuacct.usage
200.06 ms /cgroup/a/cpuacct.usage
By returning unused quota we're able to both stably consume our desired quota
and prevent unintentional overages due to the abuse of slack quota from
previous quota periods (especially on a large machine).
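
The antagonist itself is not included in this posting; a minimal sketch of
the behaviour described above (names and the busy-wait loop are illustrative,
and it assumes the task has already been attached to the bandwidth-limited
group, e.g. /cgroup/a with cpu.cfs_quota_us and cpu.cfs_period_us
configured):

  /* antagonist.c: hop across cpus, burning ~1ms on each before moving on */
  #define _GNU_SOURCE
  #include <sched.h>
  #include <time.h>
  #include <unistd.h>

  static long long now_ns(void)
  {
  	struct timespec ts;

  	clock_gettime(CLOCK_MONOTONIC, &ts);
  	return ts.tv_sec * 1000000000LL + ts.tv_nsec;
  }

  int main(void)
  {
  	int ncpus = sysconf(_SC_NPROCESSORS_ONLN);
  	cpu_set_t mask;

  	for (;;) {
  		for (int cpu = 0; cpu < ncpus; cpu++) {
  			long long start;

  			CPU_ZERO(&mask);
  			CPU_SET(cpu, &mask);
  			sched_setaffinity(0, sizeof(mask), &mask);

  			start = now_ns();
  			while (now_ns() - start < 1000000LL)
  				; /* busy-wait ~1ms on this cpu */
  		}
  	}
  }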
Signed-off-by: Paul Turner <pjt@google.com>
---
kernel/sched.c | 15 ++++++-
kernel/sched_fair.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 122 insertions(+), 1 deletion(-)
Index: tip/kernel/sched.c
===================================================================
--- tip.orig/kernel/sched.c
+++ tip/kernel/sched.c
@@ -256,7 +256,7 @@ struct cfs_bandwidth {
u64 runtime_expires;
int idle, timer_active;
- struct hrtimer period_timer;
+ struct hrtimer period_timer, slack_timer;
struct list_head throttled_cfs_rq;
/* statistics */
@@ -417,6 +417,16 @@ static inline struct cfs_bandwidth *tg_c
static inline u64 default_cfs_period(void);
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
+static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
+
+static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
+{
+ struct cfs_bandwidth *cfs_b =
+ container_of(timer, struct cfs_bandwidth, slack_timer);
+ do_sched_cfs_slack_timer(cfs_b);
+
+ return HRTIMER_NORESTART;
+}
static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
{
@@ -449,6 +459,8 @@ static void init_cfs_bandwidth(struct cf
INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cfs_b->period_timer.function = sched_cfs_period_timer;
+ hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ cfs_b->slack_timer.function = sched_cfs_slack_timer;
}
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
@@ -484,6 +496,7 @@ static void __start_cfs_bandwidth(struct
static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
hrtimer_cancel(&cfs_b->period_timer);
+ hrtimer_cancel(&cfs_b->slack_timer);
}
#else
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
Index: tip/kernel/sched_fair.c
===================================================================
--- tip.orig/kernel/sched_fair.c
+++ tip/kernel/sched_fair.c
@@ -1071,6 +1071,8 @@ static void clear_buddies(struct cfs_rq
__clear_buddies_skip(se);
}
+static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
@@ -1109,6 +1111,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
if (!(flags & DEQUEUE_SLEEP))
se->vruntime -= cfs_rq->min_vruntime;
+ /* return excess runtime on last dequeue */
+ return_cfs_rq_runtime(cfs_rq);
+
update_min_vruntime(cfs_rq);
update_cfs_shares(cfs_rq);
}
@@ -1696,6 +1701,108 @@ out_unlock:
return idle;
}
+/* a cfs_rq won't donate quota below this amount */
+static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
+/* minimum remaining period time to redistribute slack quota */
+static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
+/* how long we wait to gather additional slack before distributing */
+static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
+
+/* are we near the end of the current quota period? */
+static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
+{
+ struct hrtimer *refresh_timer = &cfs_b->period_timer;
+ u64 remaining;
+
+ /* if the call-back is running a quota refresh is already occurring */
+ if (hrtimer_callback_running(refresh_timer))
+ return 1;
+
+ /* is a quota refresh about to occur? */
+ remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
+ if (remaining < min_expire)
+ return 1;
+
+ return 0;
+}
+
+static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+ u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
+
+ /* if there's a quota refresh soon don't bother with slack */
+ if (runtime_refresh_within(cfs_b, min_left))
+ return;
+
+ start_bandwidth_timer(&cfs_b->slack_timer,
+ ns_to_ktime(cfs_bandwidth_slack_period));
+}
+
+/* we know any runtime found here is valid as update_curr() precedes return */
+static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+ s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
+
+ if (slack_runtime <= 0)
+ return;
+
+ raw_spin_lock(&cfs_b->lock);
+ if (cfs_b->quota != RUNTIME_INF &&
+ cfs_rq->runtime_expires == cfs_b->runtime_expires) {
+ cfs_b->runtime += slack_runtime;
+
+ /* we are under rq->lock, defer unthrottling using a timer */
+ if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
+ !list_empty(&cfs_b->throttled_cfs_rq))
+ start_cfs_slack_bandwidth(cfs_b);
+ }
+ raw_spin_unlock(&cfs_b->lock);
+
+ /* even if it's not valid for return we don't want to try again */
+ cfs_rq->runtime_remaining -= slack_runtime;
+}
+
+static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+ if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running)
+ return;
+
+ __return_cfs_rq_runtime(cfs_rq);
+}
+
+/*
+ * This is done with a timer (instead of inline with bandwidth return) since
+ * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
+ */
+static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
+{
+ u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
+ u64 expires;
+
+ /* confirm we're still not at a refresh boundary */
+ if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
+ return;
+
+ raw_spin_lock(&cfs_b->lock);
+ if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
+ runtime = cfs_b->runtime;
+ cfs_b->runtime = 0;
+ }
+ expires = cfs_b->runtime_expires;
+ raw_spin_unlock(&cfs_b->lock);
+
+ if (!runtime)
+ return;
+
+ runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
+
+ raw_spin_lock(&cfs_b->lock);
+ if (expires == cfs_b->runtime_expires)
+ cfs_b->runtime = runtime;
+ raw_spin_unlock(&cfs_b->lock);
+}
+
/*
* When a group wakes up we want to make sure that its quota is not already
* expired/exceeded, otherwise it may be allowed to steal additional ticks of
@@ -1737,6 +1844,7 @@ static void account_cfs_rq_runtime(struc
unsigned long delta_exec) {}
static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
+static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{