[patch 14/15] sched: return unused quota on voluntary sleep

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Paul Turner <pjt@google.com>
To: linux-kernel@vger.kernel.org
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>,
	Bharata B Rao <bharata@linux.vnet.ibm.com>,
	Dhaval Giani <dhaval.giani@gmail.com>,
	Balbir Singh <balbir@linux.vnet.ibm.com>,
	Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>,
	Srivatsa Vaddagiri <vatsa@in.ibm.com>,
	Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>,
	Ingo Molnar <mingo@elte.hu>, Pavel Emelyanov <xemul@openvz.org>
Subject: [patch 14/15] sched: return unused quota on voluntary sleep
Date: Tue, 22 Mar 2011 20:03:40 -0700	[thread overview]
Message-ID: <20110323030450.042414995@google.com> (raw)
In-Reply-To: 20110323030326.789836913@google.com

[-- Attachment #1: sched-bwc-simple_return_quota.patch --]
[-- Type: text/plain, Size: 7540 bytes --]

When a local cfs_rq blocks we return the majority of its remaining quota to the
global bandwidth pool for use by other runqueues.

We do this only when the quota is current and there is more than 
min_cfs_rq_quota [1ms by default] of runtime remaining on the rq.

In the case where there are throttled runqueues and we have sufficient
bandwidth to meter out a slice, a second timer is kicked off to handle this
delivery, unthrottling where appropriate.

Using a 'worst case' antagonist which executes on each cpu
for 1ms before moving on to the next, on a fairly large machine:

no quota generations:
 197.47 ms       /cgroup/a/cpuacct.usage
 199.46 ms       /cgroup/a/cpuacct.usage
 205.46 ms       /cgroup/a/cpuacct.usage
 198.46 ms       /cgroup/a/cpuacct.usage
 208.39 ms       /cgroup/a/cpuacct.usage
Since we are allowed to use "stale" quota our usage is effectively bounded by
the rate of input into the global pool and performance is relatively stable.

with quota generations [1s increments]:
 119.58 ms       /cgroup/a/cpuacct.usage
 119.65 ms       /cgroup/a/cpuacct.usage
 119.64 ms       /cgroup/a/cpuacct.usage
 119.63 ms       /cgroup/a/cpuacct.usage
 119.60 ms       /cgroup/a/cpuacct.usage
The large deficit here is due to quota generations (/intentionally/) preventing
us from using previously stranded slack quota.  The cost is that this quota
becomes unavailable.

with quota generations and quota return:
 200.09 ms       /cgroup/a/cpuacct.usage
 200.09 ms       /cgroup/a/cpuacct.usage
 198.09 ms       /cgroup/a/cpuacct.usage
 200.09 ms       /cgroup/a/cpuacct.usage
 200.06 ms       /cgroup/a/cpuacct.usage
By returning unused quota we're able to both stably consume our desired quota
and prevent unintentional overages due to the abuse of slack quota from 
previous quota periods (especially on a large machine).

Bharata's idea to use a slack timer to handle the return helped make this patch
cleaner.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>

---
 kernel/sched.c      |   16 ++++++++-
 kernel/sched_fair.c |   92 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 107 insertions(+), 1 deletion(-)

Index: tip/kernel/sched.c
===================================================================
--- tip.orig/kernel/sched.c
+++ tip/kernel/sched.c
@@ -254,7 +254,7 @@ struct cfs_bandwidth {
 	ktime_t period;
 	u64 runtime, runtime_assigned, quota;
 	s64 hierarchal_quota; /* used for validating consistency */
-	struct hrtimer period_timer;
+	struct hrtimer period_timer, slack_timer;
 
 	int quota_generation;
 	struct list_head throttled_cfs_rq;
@@ -432,6 +432,17 @@ static enum hrtimer_restart sched_cfs_pe
 	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
 }
 
+static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
+
+static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
+{
+	struct cfs_bandwidth *cfs_b =
+		container_of(timer, struct cfs_bandwidth, slack_timer);
+	do_sched_cfs_slack_timer(cfs_b);
+
+	return HRTIMER_NORESTART;
+}
+
 static
 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, u64 quota, u64 period)
 {
@@ -444,6 +455,8 @@ void init_cfs_bandwidth(struct cfs_bandw
 
 	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	cfs_b->period_timer.function = sched_cfs_period_timer;
+	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	cfs_b->slack_timer.function = sched_cfs_slack_timer;
 
 	cfs_b->nr_periods = 0;
 	cfs_b->nr_throttled = 0;
@@ -477,6 +490,7 @@ static void start_cfs_bandwidth(struct c
 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 {
 	hrtimer_cancel(&cfs_b->period_timer);
+	hrtimer_cancel(&cfs_b->slack_timer);
 }
 #else
 #ifdef CONFIG_FAIR_GROUP_SCHED
Index: tip/kernel/sched_fair.c
===================================================================
--- tip.orig/kernel/sched_fair.c
+++ tip/kernel/sched_fair.c
@@ -1225,6 +1225,7 @@ static struct sched_entity *pick_next_en
 
 static void throttle_cfs_rq(struct cfs_rq *cfs_rq);
 static inline int within_bandwidth(struct cfs_rq *cfs_rq);
+static void return_cfs_rq_quota(struct cfs_rq *cfs_rq);
 
 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 {
@@ -1237,6 +1238,8 @@ static void put_prev_entity(struct cfs_r
 
 	if (!within_bandwidth(cfs_rq))
 		throttle_cfs_rq(cfs_rq);
+	else
+		return_cfs_rq_quota(cfs_rq);
 
 	check_spread(cfs_rq, prev);
 	if (prev->on_rq) {
@@ -1589,6 +1592,94 @@ static int do_sched_cfs_period_timer(str
 
 	return idle;
 }
+
+/* a cfs_rq won't donate quota below this amount */
+static const u64 min_cfs_rq_quota = 1 * NSEC_PER_MSEC;
+/* minimum remaining period time to redistribute slack quota */
+static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
+/* how long we wait to gather additional slack before distributing */
+static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
+
+/* are we near the end of the current quota period? */
+static int quota_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
+{
+	struct hrtimer *refresh_timer = &cfs_b->period_timer;
+	u64 remaining;
+
+	/* if the call back is running a quota refresh is occurring */
+	if (hrtimer_callback_running(refresh_timer))
+		return 1;
+
+	/* is a quota refresh about to occur? */
+	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
+	if (remaining < min_expire)
+		return 1;
+
+	return 0;
+}
+
+static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
+{
+	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
+	int generation;
+
+	/* confirm we're still not at a refresh boundary */
+	if (quota_refresh_within(cfs_b, min_bandwidth_expiration))
+		return;
+
+	raw_spin_lock(&cfs_b->lock);
+	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
+		runtime = cfs_b->runtime;
+		cfs_b->runtime = 0;
+	}
+	generation = cfs_b->quota_generation;
+	raw_spin_unlock(&cfs_b->lock);
+
+	if (!runtime)
+		return;
+
+	runtime = distribute_cfs_bandwidth(cfs_b, runtime, generation);
+
+	raw_spin_lock(&cfs_b->lock);
+	cfs_b->runtime = runtime;
+	raw_spin_unlock(&cfs_b->lock);
+}
+
+static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
+
+	/* if there's a quota refresh soon don't bother with slack */
+	if (quota_refresh_within(cfs_b, min_left))
+		return;
+
+	start_bandwidth_timer(&cfs_b->slack_timer,
+				ns_to_ktime(cfs_bandwidth_slack_period));
+}
+
+static void return_cfs_rq_quota(struct cfs_rq *cfs_rq)
+{
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+	s64 slack_quota = cfs_rq->quota_remaining - min_cfs_rq_quota;
+
+	if (!cfs_rq->quota_enabled || cfs_rq->load.weight)
+		return;
+
+	if (slack_quota <= 0 || !cfs_rq_quota_current(cfs_rq))
+		return;
+
+	raw_spin_lock(&cfs_b->lock);
+	if (cfs_b->quota != RUNTIME_INF && cfs_rq_quota_current(cfs_rq)) {
+		cfs_b->runtime += slack_quota;
+
+		if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
+		    !list_empty(&cfs_b->throttled_cfs_rq))
+			start_cfs_slack_bandwidth(cfs_b);
+	}
+	raw_spin_unlock(&cfs_b->lock);
+	cfs_rq->quota_remaining -= slack_quota;
+}
+
 #else
 static inline u64 default_cfs_period(void)
 {
@@ -1614,6 +1705,7 @@ static void check_cfs_rq_quota(struct cf
 static void throttle_cfs_rq(struct cfs_rq *cfs_rq) {}
 static void account_cfs_rq_quota(struct cfs_rq *cfs_rq,
 		unsigned long delta_exec) {}
+static void return_cfs_rq_quota(struct cfs_rq *cfs_rq) {}
 #endif

next prev parent reply	other threads:[~2011-03-23  3:10 UTC|newest]

Thread overview: 63+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-03-23  3:03 [patch 00/15] CFS Bandwidth Control V5 Paul Turner
2011-03-23  3:03 ` [patch 01/15] sched: introduce primitives to account for CFS bandwidth tracking Paul Turner
2011-03-24 12:38   ` Kamalesh Babulal
2011-04-05 13:28   ` Peter Zijlstra
2011-03-23  3:03 ` [patch 02/15] sched: validate CFS quota hierarchies Paul Turner
2011-03-23 10:39   ` torbenh
2011-03-23 20:49     ` Paul Turner
2011-03-24  6:31   ` Bharata B Rao
2011-04-08 17:01     ` Peter Zijlstra
2011-03-29  6:57   ` Hidetoshi Seto
2011-04-04 23:10     ` Paul Turner
2011-04-05 13:28   ` Peter Zijlstra
2011-03-23  3:03 ` [patch 03/15] sched: accumulate per-cfs_rq cpu usage Paul Turner
2011-04-05 13:28   ` Peter Zijlstra
2011-04-06 20:44     ` Paul Turner
2011-04-05 13:28   ` Peter Zijlstra
2011-04-06 20:47     ` Paul Turner
2011-03-23  3:03 ` [patch 04/15] sched: throttle cfs_rq entities which exceed their local quota Paul Turner
2011-03-23  5:09   ` Mike Galbraith
2011-03-23 20:53     ` Paul Turner
2011-03-24  6:36   ` Bharata B Rao
2011-03-24  7:40     ` Paul Turner
2011-04-05 13:28   ` Peter Zijlstra
2011-04-05 23:15     ` Paul Turner
2011-03-23  3:03 ` [patch 05/15] sched: unthrottle cfs_rq(s) who ran out of quota at period refresh Paul Turner
2011-04-05 13:28   ` Peter Zijlstra
2011-04-05 13:33     ` Peter Zijlstra
2011-04-05 13:28   ` Peter Zijlstra
2011-04-05 13:28   ` Peter Zijlstra
2011-03-23  3:03 ` [patch 06/15] sched: allow for positional tg_tree walks Paul Turner
2011-03-23  3:03 ` [patch 07/15] sched: prevent interactions between throttled entities and load-balance Paul Turner
2011-04-05 13:28   ` Peter Zijlstra
2011-03-23  3:03 ` [patch 08/15] sched: migrate throttled tasks on HOTPLUG Paul Turner
2011-04-05 13:28   ` Peter Zijlstra
2011-04-06  2:31     ` Paul Turner
2011-03-23  3:03 ` [patch 09/15] sched: add exports tracking cfs bandwidth control statistics Paul Turner
2011-04-05 13:28   ` Peter Zijlstra
2011-03-23  3:03 ` [patch 10/15] sched: (fixlet) dont update shares twice on on_rq parent Paul Turner
2011-04-05 13:28   ` Peter Zijlstra
2011-03-23  3:03 ` [patch 11/15] sched: hierarchical task accounting for SCHED_OTHER Paul Turner
2011-04-05 13:28   ` Peter Zijlstra
2011-03-23  3:03 ` [patch 12/15] sched: maintain throttled rqs as a list Paul Turner
2011-04-22  2:50   ` Hidetoshi Seto
2011-04-24 21:23     ` Paul Turner
2011-03-23  3:03 ` [patch 13/15] sched: expire slack quota using generation counters Paul Turner
2011-04-05 13:28   ` Peter Zijlstra
2011-04-06  7:22     ` Paul Turner
2011-04-06  8:15       ` Peter Zijlstra
2011-04-06 11:26       ` Peter Zijlstra
2011-03-23  3:03 ` Paul Turner [this message]
2011-04-05 13:28   ` [patch 14/15] sched: return unused quota on voluntary sleep Peter Zijlstra
2011-04-06  2:25     ` Paul Turner
2011-03-23  3:03 ` [patch 15/15] sched: add documentation for bandwidth control Paul Turner
2011-03-24  6:38   ` Bharata B Rao
2011-03-24 16:12 ` [patch 00/15] CFS Bandwidth Control V5 Bharata B Rao
2011-03-31  7:57 ` Xiao Guangrong
2011-04-04 23:10   ` Paul Turner
2011-04-05 13:28 ` Peter Zijlstra
2011-05-20  2:12 ` Test for CFS Bandwidth Control V6 Xiao Guangrong
2011-05-24  0:53   ` Hidetoshi Seto
2011-05-24  7:56     ` Xiao Guangrong
2011-06-08  2:54     ` Paul Turner
2011-06-08  5:55       ` Hidetoshi Seto

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20110323030450.042414995@google.com \
    --to=pjt@google.com \
    --cc=a.p.zijlstra@chello.nl \
    --cc=balbir@linux.vnet.ibm.com \
    --cc=bharata@linux.vnet.ibm.com \
    --cc=dhaval.giani@gmail.com \
    --cc=kamalesh@linux.vnet.ibm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=svaidy@linux.vnet.ibm.com \
    --cc=vatsa@in.ibm.com \
    --cc=xemul@openvz.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.