All of lore.kernel.org
 help / color / mirror / Atom feed
From: Bharata B Rao <bharata@linux.vnet.ibm.com>
To: linux-kernel@vger.kernel.org
Cc: Dhaval Giani <dhaval.giani@gmail.com>,
	Balbir Singh <balbir@linux.vnet.ibm.com>,
	Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>,
	Srivatsa Vaddagiri <vatsa@in.ibm.com>,
	Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>,
	Ingo Molnar <mingo@elte.hu>,
	Peter Zijlstra <a.p.zijlstra@chello.nl>,
	Pavel Emelyanov <xemul@openvz.org>,
	Herbert Poetzl <herbert@13thfloor.at>,
	Avi Kivity <avi@redhat.com>, Chris Friesen <cfriesen@nortel.com>,
	Paul Menage <menage@google.com>,
	Mike Waychison <mikew@google.com>, Paul Turner <pjt@google.com>,
	Nikhil Rao <ncrao@google.com>
Subject: [PATCH v3 7/7] sched: Return/expire slack quota using generation counters
Date: Tue, 12 Oct 2010 13:25:16 +0530	[thread overview]
Message-ID: <20101012075516.GH9893@in.ibm.com> (raw)
In-Reply-To: <20101012074910.GA9893@in.ibm.com>

From: Paul Turner <pjt@google.com>

sched: Return/expire slack quota using generation counters

This patch adds generation counters to track and expire quotas.

This allows for two useful semantics:

1) On voluntary dequeue, quota can be returned to the global pool provided it
   is still "current".  In this patch we return all but one tick's worth of
   quota so that workloads with high rates of turnover do not incur
   significant contention.

   When returning quota to the global pool, if there are throttled runqueues
   and we have more than a slice of quota available, attempt to unthrottle
   them (again, this is to prevent contention in the high-turnover case).

2) On period expiration the generation counter is incremented, naturally
   expiring outstanding slack quota in the system.


A separate hrtimer is used to drive the slack quota redistribution and
subsequent unthrottling of throttled entities.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Nikhil Rao <ncrao@google.com>
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
---
 kernel/sched.c      |   54 +++++++++++++++++++++++--
 kernel/sched_fair.c |  111 ++++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 146 insertions(+), 19 deletions(-)

--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -250,8 +250,10 @@ static LIST_HEAD(task_groups);
 struct cfs_bandwidth {
 	raw_spinlock_t		lock;
 	ktime_t			period;
-	u64			runtime, quota;
+	u64			runtime, quota, generation;
+	int 			throttled_rqs;
 	struct hrtimer		period_timer;
+	struct hrtimer		slack_timer;
 
 	/* throttle statistics */
 	u64			nr_periods;
@@ -391,7 +393,7 @@ struct cfs_rq {
 	unsigned long rq_weight;
 #endif
 #ifdef CONFIG_CFS_BANDWIDTH
-	u64 quota_assigned, quota_used;
+	u64 quota_assigned, quota_used, quota_generation;
 	int throttled;
 	u64 throttled_timestamp;
 #endif
@@ -399,6 +401,17 @@ struct cfs_rq {
 };
 
 #ifdef CONFIG_CFS_BANDWIDTH
+
+static int do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
+
+static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
+{
+	struct cfs_bandwidth *cfs_b =
+		container_of(timer, struct cfs_bandwidth, slack_timer);
+	do_sched_cfs_slack_timer(cfs_b);	/* return value is not used */
+	return HRTIMER_NORESTART;	/* the handler never asks for a restart */
+}
+
 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
 
 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
@@ -428,9 +441,11 @@ void init_cfs_bandwidth(struct cfs_bandw
 	raw_spin_lock_init(&cfs_b->lock);
 	cfs_b->quota = cfs_b->runtime = quota;
 	cfs_b->period = ns_to_ktime(period);
-
+	cfs_b->generation = 0;
 	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	cfs_b->period_timer.function = sched_cfs_period_timer;
+	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	cfs_b->slack_timer.function = sched_cfs_slack_timer;
 
 	cfs_b->nr_periods = 0;
 	cfs_b->nr_throttled = 0;
@@ -464,6 +479,35 @@ static void destroy_cfs_bandwidth(struct
 {
 	hrtimer_cancel(&cfs_b->period_timer);
 }
+
+
+/* Interval handed to the slack timer. Should this be a tunable? */
+#define CFS_SLACK_PERIOD	2000000	/* 2ms, in ns (see ns_to_ktime) */
+
+static void destroy_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+	hrtimer_cancel(&cfs_b->slack_timer);	/* used when the group is freed */
+}
+
+static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+	if (cfs_b->quota == RUNTIME_INF)
+		return;	/* nothing to do for a RUNTIME_INF quota */
+
+	if (hrtimer_active(&cfs_b->slack_timer))
+		return;	/* NOTE(review): checked before cfs_b->lock is taken */
+
+	raw_spin_lock(&cfs_b->lock);
+
+	/*
+	 * TODO: Don't start the slack timer if the
+	 * period timer is about to fire.
+	 */
+	start_bandwidth_timer(&cfs_b->slack_timer,
+		ns_to_ktime(CFS_SLACK_PERIOD));
+	raw_spin_unlock(&cfs_b->lock);
+}
+
 #endif
 
 /* Real-Time classes' related field in a runqueue: */
@@ -8182,6 +8226,7 @@ static void free_fair_sched_group(struct
 
 #ifdef CONFIG_CFS_BANDWIDTH
 	destroy_cfs_bandwidth(&tg->cfs_bandwidth);
+	destroy_cfs_slack_bandwidth(&tg->cfs_bandwidth);
 #endif
 
 	for_each_possible_cpu(i) {
@@ -8936,6 +8981,7 @@ static u64 cpu_shares_read_u64(struct cg
 static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 {
 	int i;
+	u64 next_generation;
 	static DEFINE_MUTEX(mutex);
 
 	if (tg == &init_task_group)
@@ -8956,6 +9002,7 @@ static int tg_set_cfs_bandwidth(struct t
 	raw_spin_lock_irq(&tg->cfs_bandwidth.lock);
 	tg->cfs_bandwidth.period = ns_to_ktime(period);
 	tg->cfs_bandwidth.runtime = tg->cfs_bandwidth.quota = quota;
+	next_generation = ++tg->cfs_bandwidth.generation;
 	raw_spin_unlock_irq(&tg->cfs_bandwidth.lock);
 
 	for_each_possible_cpu(i) {
@@ -8964,6 +9011,7 @@ static int tg_set_cfs_bandwidth(struct t
 
 		raw_spin_lock_irq(&rq->lock);
 		init_cfs_rq_quota(cfs_rq);
+		cfs_rq->quota_generation = next_generation;
 		if (cfs_rq_throttled(cfs_rq))
 			unthrottle_cfs_rq(cfs_rq);
 		raw_spin_unlock_irq(&rq->lock);
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -287,6 +287,8 @@ static inline int cfs_rq_throttled(struc
 	return cfs_rq->throttled;
 }
 
+static void cfs_rq_return_unused_quota(struct cfs_rq *cfs_rq);
+
 static void account_cfs_rq_quota(struct cfs_rq *cfs_rq,
 		unsigned long delta_exec);
 #else
@@ -912,6 +914,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
 	 */
 	if (!(flags & DEQUEUE_SLEEP))
 		se->vruntime -= cfs_rq->min_vruntime;
+#ifdef CONFIG_CFS_BANDWIDTH
+	else if (cfs_rq->quota_assigned != RUNTIME_INF)
+		cfs_rq_return_unused_quota(cfs_rq);
+#endif
 }
 
 /*
@@ -1266,6 +1272,7 @@ static void throttle_cfs_rq(struct cfs_r
 out_throttled:
 	cfs_rq->throttled = 1;
 	cfs_rq->throttled_timestamp = rq_of(cfs_rq)->clock;
+	tg_cfs_bandwidth(cfs_rq->tg)->throttled_rqs = 1;
 }
 
 static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
@@ -1304,16 +1311,24 @@ static void unthrottle_cfs_rq(struct cfs
 static void account_cfs_rq_quota(struct cfs_rq *cfs_rq,
 		unsigned long delta_exec)
 {
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
 	if (cfs_rq->quota_assigned == RUNTIME_INF)
 		return;
 
 	cfs_rq->quota_used += delta_exec;
 
-	if (cfs_rq_throttled(cfs_rq) ||
-		cfs_rq->quota_used < cfs_rq->quota_assigned)
+	if (cfs_rq_throttled(cfs_rq))
+		return;
+
+	if (cfs_rq->quota_generation != cfs_b->generation)
+		cfs_rq->quota_assigned = min(cfs_rq->quota_used,
+				cfs_rq->quota_assigned);
+
+	if (cfs_rq->quota_used < cfs_rq->quota_assigned)
 		return;
 
 	cfs_rq->quota_assigned += tg_request_cfs_quota(cfs_rq->tg);
+	cfs_rq->quota_generation = cfs_b->generation;
 
 	if (cfs_rq->quota_used >= cfs_rq->quota_assigned) {
 		throttle_cfs_rq(cfs_rq);
@@ -1321,19 +1336,11 @@ static void account_cfs_rq_quota(struct 
 	}
 }
 
-static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
+static int redistribute_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 {
-	int i, idle = 1, num_throttled = 0;
-	u64 delta;
+	int i, idle = 1, num_throttled = 0, throttled_rqs = 0;
 	const struct cpumask *span;
-
-	if (cfs_b->quota == RUNTIME_INF)
-		return 1;
-
-	/* reset group quota */
-	raw_spin_lock(&cfs_b->lock);
-	cfs_b->runtime = cfs_b->quota;
-	raw_spin_unlock(&cfs_b->lock);
+	u64 delta;
 
 	span = sched_bw_period_mask();
 	for_each_cpu(i, span) {
@@ -1346,27 +1353,99 @@ static int do_sched_cfs_period_timer(str
 		if (!cfs_rq_throttled(cfs_rq))
 			continue;
 		num_throttled++;
+		throttled_rqs++;
 
 		delta = tg_request_cfs_quota(cfs_rq->tg);
 
 		if (delta) {
 			raw_spin_lock(&rq->lock);
 			cfs_rq->quota_assigned += delta;
+			cfs_rq->quota_generation = cfs_b->generation;
 
-			if (cfs_rq->quota_used < cfs_rq->quota_assigned)
+			if (cfs_rq->quota_used < cfs_rq->quota_assigned) {
 				unthrottle_cfs_rq(cfs_rq);
+				throttled_rqs--;
+			}
 			raw_spin_unlock(&rq->lock);
 		}
 	}
 
-	/* update throttled stats */
-	cfs_b->nr_periods++;
 	if (num_throttled)
 		cfs_b->nr_throttled++;
 
+	cfs_b->throttled_rqs = throttled_rqs;
 	return idle;
 }
 
+static void cfs_rq_return_unused_quota(struct cfs_rq *cfs_rq)
+{
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+	s64 quota_remaining;
+
+	if (cfs_rq->quota_used > cfs_rq->quota_assigned ||
+	    cfs_rq->quota_generation != cfs_b->generation)
+		return;	/* over budget or stale generation: nothing to return */
+
+	quota_remaining = cfs_rq->quota_assigned - cfs_rq->quota_used;
+	/* hold 1 tick of quota in reserve for workloads with high turnover */
+	if (NS_TO_JIFFIES(quota_remaining) < 1)
+		return;	/* less than one tick left: keep all of it */
+
+	quota_remaining -= NSEC_PER_SEC / HZ;
+	BUG_ON(quota_remaining < 0);	/* NOTE(review): relies on check above */
+
+	if (!quota_remaining)
+		return;
+
+	raw_spin_lock(&cfs_b->lock);
+	/* the unlocked generation check above was speculative; recheck */
+	if (cfs_rq->quota_generation == cfs_b->generation) {
+		cfs_b->runtime += quota_remaining;
+		cfs_rq->quota_assigned -= quota_remaining;
+	}
+	raw_spin_unlock(&cfs_b->lock);
+
+	/*
+	 * if we've re-accumulated more than a slice and there are throttled
+	 * rqs, start the slack timer (fields are read without cfs_b->lock).
+	 */
+	if (cfs_b->throttled_rqs &&
+		cfs_b->runtime > sched_cfs_bandwidth_slice())
+		start_cfs_slack_bandwidth(cfs_b);
+}
+
+
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
+{
+	int idle = 1;
+
+	if (cfs_b->quota == RUNTIME_INF)
+		return 1;
+
+	/* refill group runtime and start a new quota generation */
+	raw_spin_lock(&cfs_b->lock);
+	idle = cfs_b->runtime == cfs_b->quota;	/* NOTE(review): overwritten below */
+	cfs_b->runtime = cfs_b->quota;
+	cfs_b->generation++;
+	raw_spin_unlock(&cfs_b->lock);
+
+	idle = redistribute_cfs_bandwidth(cfs_b);
+
+	/* update period statistics */
+	cfs_b->nr_periods++;
+
+	return idle;
+}
+
+static int do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
+{
+	if (cfs_b->quota == RUNTIME_INF)
+		return 0;
+
+	redistribute_cfs_bandwidth(cfs_b);	/* its result is ignored here */
+	return 0;	/* every path returns 0 */
+}
+
 #endif
 
 #ifdef CONFIG_SMP

  parent reply	other threads:[~2010-10-12  7:55 UTC|newest]

Thread overview: 51+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-10-12  7:49 [PATCH v3 0/7] CFS Bandwidth Control Bharata B Rao
2010-10-12  7:50 ` [PATCH v3 1/7] sched: introduce primitives to account for CFS bandwidth tracking Bharata B Rao
2010-10-13 13:00   ` Balbir Singh
2010-10-14  5:14     ` Bharata B Rao
2010-10-14  7:52     ` Peter Zijlstra
2010-10-14 12:38       ` Balbir Singh
2010-10-14 13:24         ` Peter Zijlstra
2010-12-06  9:02         ` Bharata B Rao
2010-10-12  7:51 ` [PATCH v3 2/7] sched: accumulate per-cfs_rq cpu usage Bharata B Rao
2010-10-13 13:30   ` Balbir Singh
2010-10-13 13:46     ` Nikhil Rao
2010-10-13 13:59       ` Balbir Singh
2010-10-13 14:41         ` Nikhil Rao
2010-10-14  5:39           ` Balbir Singh
2010-10-14  8:57   ` Peter Zijlstra
2010-10-14  9:07     ` Paul Turner
2010-10-14  9:13       ` Peter Zijlstra
2010-10-14  9:01   ` Peter Zijlstra
2010-10-14  9:14     ` Paul Turner
2010-10-14  9:27       ` Peter Zijlstra
2010-10-14  9:53         ` Paul Turner
2010-10-14  9:19   ` Peter Zijlstra
2010-10-14  9:27     ` Paul Turner
2010-10-14  9:40       ` Bharata B Rao
2010-10-12  7:52 ` [PATCH v3 3/7] sched: throttle cfs_rq entities which exceed their local quota Bharata B Rao
2010-10-13  6:34   ` KAMEZAWA Hiroyuki
2010-10-13  6:44     ` Paul Turner
2010-10-13  6:47       ` Bharata B Rao
2010-10-13  6:52         ` Paul Turner
2010-10-13  7:00       ` KAMEZAWA Hiroyuki
2010-10-13  7:13         ` Paul Turner
2010-10-14  9:12     ` Peter Zijlstra
2010-10-14  9:50       ` KAMEZAWA Hiroyuki
2010-10-14  9:59         ` Peter Zijlstra
2010-10-14 10:08           ` KAMEZAWA Hiroyuki
2010-10-14 10:25             ` Paul Turner
2010-10-14 10:41               ` Peter Zijlstra
2010-10-14 23:30                 ` KAMEZAWA Hiroyuki
2010-10-14 10:37             ` Peter Zijlstra
2010-10-14  9:58       ` Paul Turner
2010-10-12  7:52 ` [PATCH v3 4/7] sched: unthrottle cfs_rq(s) who ran out of quota at period refresh Bharata B Rao
2010-10-15  4:45   ` Balbir Singh
2010-12-07 13:13     ` Bharata B Rao
2010-10-12  7:53 ` [PATCH v3 5/7] sched: add exports tracking cfs bandwidth control statistics Bharata B Rao
2010-10-12  7:54 ` [PATCH v3 6/7] sched: hierarchical task accounting for FAIR_GROUP_SCHED Bharata B Rao
2010-10-12  7:55 ` Bharata B Rao [this message]
2010-10-13  5:14 ` [PATCH v3 0/7] CFS Bandwidth Control KAMEZAWA Hiroyuki
2010-10-13  5:44 ` Herbert Poetzl
2010-10-13  6:26   ` Paul Turner
2010-11-17  8:32 ` Lai Jiangshan
2010-11-19  3:24   ` Bharata B Rao

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20101012075516.GH9893@in.ibm.com \
    --to=bharata@linux.vnet.ibm.com \
    --cc=a.p.zijlstra@chello.nl \
    --cc=avi@redhat.com \
    --cc=balbir@linux.vnet.ibm.com \
    --cc=cfriesen@nortel.com \
    --cc=dhaval.giani@gmail.com \
    --cc=herbert@13thfloor.at \
    --cc=kamalesh@linux.vnet.ibm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=menage@google.com \
    --cc=mikew@google.com \
    --cc=mingo@elte.hu \
    --cc=ncrao@google.com \
    --cc=pjt@google.com \
    --cc=svaidy@linux.vnet.ibm.com \
    --cc=vatsa@in.ibm.com \
    --cc=xemul@openvz.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.