public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Paul Turner <pjt@google.com>
To: linux-kernel@vger.kernel.org
Cc: Bharata B Rao <bharata@linux.vnet.ibm.com>,
	Dhaval Giani <dhaval@linux.vnet.ibm.com>,
	Balbir Singh <balbir@linux.vnet.ibm.com>,
	Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>,
	Gautham R Shenoy <ego@in.ibm.com>,
	Srivatsa Vaddagiri <vatsa@in.ibm.com>,
	Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>,
	Ingo Molnar <mingo@elte.hu>,
	Peter Zijlstra <a.p.zijlstra@chello.nl>,
	Pavel Emelyanov <xemul@openvz.org>,
	Herbert Poetzl <herbert@13thfloor.at>,
	Avi Kivity <avi@redhat.com>, Chris Friesen <cfriesen@nortel.com>,
	Nikhil Rao <ncrao@google.com>
Subject: [CFS Bandwidth Control v4 3/7] sched: throttle cfs_rq entities which exceed their local quota
Date: Tue, 15 Feb 2011 19:18:34 -0800	[thread overview]
Message-ID: <20110216031841.068673650@google.com> (raw)
In-Reply-To: 20110216031831.571628191@google.com

[-- Attachment #1: sched-bwc-throttle_entities.patch --]
[-- Type: text/plain, Size: 7813 bytes --]

In account_cfs_rq_quota() (via update_curr()) we track consumption versus a
cfs_rq's local quota and whether there is global quota available to continue
enabling it in the event we run out.

This patch adds the required support for the latter case, throttling entities
until quota is available to run.  Throttling dequeues the entity in question
and sends a reschedule to the owning cpu so that it can be evicted.

The following restrictions apply to a throttled cfs_rq:
- It is dequeued from the sched_entity hierarchy and restricted from being
  re-enqueued.  This means that new/waking children of this entity will be
  queued up to it, but not past it.
- It does not contribute to weight calculations in tg_shares_up
- In the case that the cfs_rq of the cpu we are trying to pull from is throttled
  it is ignored by the load balancer in __load_balance_fair() and
  move_one_task_fair().

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Nikhil Rao <ncrao@google.com>
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
---
 kernel/sched.c      |    3 +
 kernel/sched_fair.c |  121 +++++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 114 insertions(+), 10 deletions(-)

Index: tip/kernel/sched.c
===================================================================
--- tip.orig/kernel/sched.c
+++ tip/kernel/sched.c
@@ -388,6 +388,7 @@ struct cfs_rq {
 #endif
 #ifdef CONFIG_CFS_BANDWIDTH
 	u64 quota_assigned, quota_used;
+	int throttled;
 #endif
 #endif
 };
@@ -1656,6 +1657,8 @@ static void update_h_load(long cpu)
 
 static void double_rq_lock(struct rq *rq1, struct rq *rq2);
 
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
+
 /*
  * fair double_lock_balance: Safely acquires both rq->locks in a fair
  * way at the expense of forcing extra atomic operations in all
Index: tip/kernel/sched_fair.c
===================================================================
--- tip.orig/kernel/sched_fair.c
+++ tip/kernel/sched_fair.c
@@ -331,8 +331,34 @@ static inline struct cfs_bandwidth *tg_c
 	return &tg->cfs_bandwidth;
 }
 
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq->throttled;
+}
+
+/* it's possible to be 'on_rq' in a dequeued (e.g. throttled) hierarchy */
+static inline int entity_on_rq(struct sched_entity *se)
+{
+	for_each_sched_entity(se)
+		if (!se->on_rq)
+			return 0;
+
+	return 1;
+}
+
 static void account_cfs_rq_quota(struct cfs_rq *cfs_rq,
 		unsigned long delta_exec);
+#else
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+{
+	return 0;
+}
+
+static inline int entity_on_rq(struct sched_entity *se)
+{
+	return se->on_rq;
+}
+
 #endif
 
 
@@ -744,9 +770,10 @@ static void update_cfs_rq_load_contribut
 					    int global_update)
 {
 	struct task_group *tg = cfs_rq->tg;
-	long load_avg;
+	long load_avg = 0;
 
-	load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
+	if (!cfs_rq_throttled(cfs_rq))
+		load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
 	load_avg -= cfs_rq->load_contribution;
 
 	if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
@@ -761,7 +788,11 @@ static void update_cfs_load(struct cfs_r
 	u64 now, delta;
 	unsigned long load = cfs_rq->load.weight;
 
-	if (cfs_rq->tg == &root_task_group)
+	/*
+	 * Don't maintain averages for the root task group, or while we are
+	 * throttled.
+	 */
+	if (cfs_rq->tg == &root_task_group || cfs_rq_throttled(cfs_rq))
 		return;
 
 	now = rq_of(cfs_rq)->clock_task;
@@ -1015,6 +1046,14 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
+
+
+#ifdef CONFIG_CFS_BANDWIDTH
+	if (!entity_is_task(se) && (cfs_rq_throttled(group_cfs_rq(se)) ||
+	     !group_cfs_rq(se)->nr_running))
+		return;
+#endif
+
 	update_cfs_load(cfs_rq, 0);
 	account_entity_enqueue(cfs_rq, se);
 	update_cfs_shares(cfs_rq);
@@ -1087,6 +1126,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
 	 */
 	update_curr(cfs_rq);
 
+#ifdef CONFIG_CFS_BANDWIDTH
+	if (!entity_is_task(se) && cfs_rq_throttled(group_cfs_rq(se)))
+		return;
+#endif
+
 	update_stats_dequeue(cfs_rq, se);
 	if (flags & DEQUEUE_SLEEP) {
 #ifdef CONFIG_SCHEDSTATS
@@ -1363,6 +1407,9 @@ enqueue_task_fair(struct rq *rq, struct 
 			break;
 		cfs_rq = cfs_rq_of(se);
 		enqueue_entity(cfs_rq, se, flags);
+		/* don't continue to enqueue if our parent is throttled */
+		if (cfs_rq_throttled(cfs_rq))
+			break;
 		flags = ENQUEUE_WAKEUP;
 	}
 
@@ -1390,8 +1437,11 @@ static void dequeue_task_fair(struct rq 
 		cfs_rq = cfs_rq_of(se);
 		dequeue_entity(cfs_rq, se, flags);
 
-		/* Don't dequeue parent if it has other entities besides us */
-		if (cfs_rq->load.weight)
+		/*
+		 * Don't dequeue parent if it has other entities besides us,
+		 * or if it is throttled
+		 */
+		if (cfs_rq->load.weight || cfs_rq_throttled(cfs_rq))
 			break;
 		flags |= DEQUEUE_SLEEP;
 	}
@@ -1430,6 +1480,42 @@ static u64 tg_request_cfs_quota(struct t
 	return delta;
 }
 
+static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	struct sched_entity *se;
+
+	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+
+	/* account load preceding throttle */
+	update_cfs_load(cfs_rq, 0);
+
+	/* prevent previous buddy nominations from re-picking this se */
+	clear_buddies(cfs_rq_of(se), se);
+
+	/*
+	 * It's possible for the current task to block and re-wake before task
+	 * switch, leading to a throttle within enqueue_task->update_curr()
+	 * versus an entity that has not technically been enqueued yet.
+	 *
+	 * In this case, since we haven't actually done the enqueue yet, cut
+	 * out and allow enqueue_entity() to short-circuit
+	 */
+	if (!se->on_rq)
+		goto out_throttled;
+
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		dequeue_entity(cfs_rq, se, 1);
+		if (cfs_rq->load.weight || cfs_rq_throttled(cfs_rq))
+			break;
+	}
+
+out_throttled:
+	cfs_rq->throttled = 1;
+	update_cfs_rq_load_contribution(cfs_rq, 1);
+}
+
 static void account_cfs_rq_quota(struct cfs_rq *cfs_rq,
 		unsigned long delta_exec)
 {
@@ -1438,10 +1524,16 @@ static void account_cfs_rq_quota(struct 
 
 	cfs_rq->quota_used += delta_exec;
 
-	if (cfs_rq->quota_used < cfs_rq->quota_assigned)
+	if (cfs_rq_throttled(cfs_rq) ||
+		cfs_rq->quota_used < cfs_rq->quota_assigned)
 		return;
 
 	cfs_rq->quota_assigned += tg_request_cfs_quota(cfs_rq->tg);
+
+	if (cfs_rq->quota_used >= cfs_rq->quota_assigned) {
+		throttle_cfs_rq(cfs_rq);
+		resched_task(cfs_rq->rq->curr);
+	}
 }
 
 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
@@ -1941,6 +2033,12 @@ static void check_preempt_wakeup(struct 
 	if (unlikely(se == pse))
 		return;
 
+#ifdef CONFIG_CFS_BANDWIDTH
+	/* avoid pre-emption check/buddy nomination for throttled tasks */
+	if (!entity_on_rq(pse))
+		return;
+#endif
+
 	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK))
 		set_next_buddy(pse);
 
@@ -2060,7 +2158,8 @@ static bool yield_to_task_fair(struct rq
 {
 	struct sched_entity *se = &p->se;
 
-	if (!se->on_rq)
+	/* ensure entire hierarchy is on rq (e.g. running & not throttled) */
+	if (!entity_on_rq(se))
 		return false;
 
 	/* Tell the scheduler that we'd really like pse to run next. */
@@ -2280,7 +2379,8 @@ static void update_shares(int cpu)
 
 	rcu_read_lock();
 	for_each_leaf_cfs_rq(rq, cfs_rq)
-		update_shares_cpu(cfs_rq->tg, cpu);
+		if (!cfs_rq_throttled(cfs_rq))
+			update_shares_cpu(cfs_rq->tg, cpu);
 	rcu_read_unlock();
 }
 
@@ -2304,9 +2404,10 @@ load_balance_fair(struct rq *this_rq, in
 		u64 rem_load, moved_load;
 
 		/*
-		 * empty group
+		 * empty group or throttled cfs_rq
 		 */
-		if (!busiest_cfs_rq->task_weight)
+		if (!busiest_cfs_rq->task_weight ||
+				cfs_rq_throttled(busiest_cfs_rq))
 			continue;
 
 		rem_load = (u64)rem_load_move * busiest_weight;



  parent reply	other threads:[~2011-02-16  3:20 UTC|newest]

Thread overview: 71+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-02-16  3:18 [CFS Bandwidth Control v4 0/7] Introduction Paul Turner
2011-02-16  3:18 ` [CFS Bandwidth Control v4 1/7] sched: introduce primitives to account for CFS bandwidth tracking Paul Turner
2011-02-16 16:52   ` Balbir Singh
2011-02-17  2:54     ` Bharata B Rao
2011-02-23 13:32   ` Peter Zijlstra
2011-02-25  3:11     ` Paul Turner
2011-02-25 20:53     ` Paul Turner
2011-02-16  3:18 ` [CFS Bandwidth Control v4 2/7] sched: accumulate per-cfs_rq cpu usage Paul Turner
2011-02-16 17:45   ` Balbir Singh
2011-02-23 13:32   ` Peter Zijlstra
2011-02-25  3:33     ` Paul Turner
2011-02-25 12:31       ` Peter Zijlstra
2011-02-16  3:18 ` Paul Turner [this message]
2011-02-18  6:52   ` [CFS Bandwidth Control v4 3/7] sched: throttle cfs_rq entities which exceed their local quota Balbir Singh
2011-02-23 13:32   ` Peter Zijlstra
2011-02-24  5:21     ` Bharata B Rao
2011-02-24 11:05       ` Peter Zijlstra
2011-02-24 15:45         ` Bharata B Rao
2011-02-24 15:52           ` Peter Zijlstra
2011-02-24 16:39             ` Bharata B Rao
2011-02-24 17:20               ` Peter Zijlstra
2011-02-25  3:59                 ` Paul Turner
2011-02-25  3:41         ` Paul Turner
2011-02-25  3:10     ` Paul Turner
2011-02-25 13:58       ` Bharata B Rao
2011-02-25 20:51         ` Paul Turner
2011-02-28  3:50           ` Bharata B Rao
2011-02-28  6:38             ` Paul Turner
2011-02-28 13:48       ` Peter Zijlstra
2011-03-01  8:31         ` Paul Turner
2011-03-02  7:23   ` Bharata B Rao
2011-03-02  8:05     ` Paul Turner
2011-02-16  3:18 ` [CFS Bandwidth Control v4 4/7] sched: unthrottle cfs_rq(s) who ran out of quota at period refresh Paul Turner
2011-02-18  7:19   ` Balbir Singh
2011-02-18  8:10     ` Bharata B Rao
2011-02-23 12:23   ` Peter Zijlstra
2011-02-23 13:32   ` Peter Zijlstra
2011-02-24  7:04     ` Bharata B Rao
2011-02-24 11:14       ` Peter Zijlstra
2011-02-26  0:02     ` Paul Turner
2011-02-16  3:18 ` [CFS Bandwidth Control v4 5/7] sched: add exports tracking cfs bandwidth control statistics Paul Turner
2011-02-22  3:14   ` Balbir Singh
2011-02-22  4:13     ` Bharata B Rao
2011-02-22  4:40       ` Balbir Singh
2011-02-23  8:03         ` Paul Turner
2011-02-23 10:13           ` Balbir Singh
2011-02-23 13:32   ` Peter Zijlstra
2011-02-25  3:26     ` Paul Turner
2011-02-25  8:54       ` Peter Zijlstra
2011-02-16  3:18 ` [CFS Bandwidth Control v4 6/7] sched: hierarchical task accounting for SCHED_OTHER Paul Turner
2011-02-22  3:17   ` Balbir Singh
2011-02-23  8:05     ` Paul Turner
2011-02-23  2:02   ` Hidetoshi Seto
2011-02-23  2:20     ` Paul Turner
2011-02-23  2:43     ` Balbir Singh
2011-02-23 13:32   ` Peter Zijlstra
2011-02-25  3:25     ` Paul Turner
2011-02-25 12:17       ` Peter Zijlstra
2011-02-16  3:18 ` [CFS Bandwidth Control v4 7/7] sched: add documentation for bandwidth control Paul Turner
2011-02-21  2:47 ` [CFS Bandwidth Control v4 0/7] Introduction Xiao Guangrong
2011-02-22 10:28   ` Bharata B Rao
2011-02-23  7:42   ` Paul Turner
2011-02-23  7:51     ` Balbir Singh
2011-02-23  7:56       ` Paul Turner
2011-02-23  8:31         ` Bharata B Rao
     [not found] ` <20110224161111.7d83a884@jacob-laptop>
2011-02-25 10:03   ` Paul Turner
2011-02-25 13:06     ` jacob pan
2011-03-08  3:57       ` Balbir Singh
2011-03-08 18:18         ` Jacob Pan
2011-03-09 10:12       ` Paul Turner
2011-03-09 21:57         ` jacob pan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20110216031841.068673650@google.com \
    --to=pjt@google.com \
    --cc=a.p.zijlstra@chello.nl \
    --cc=avi@redhat.com \
    --cc=balbir@linux.vnet.ibm.com \
    --cc=bharata@linux.vnet.ibm.com \
    --cc=cfriesen@nortel.com \
    --cc=dhaval@linux.vnet.ibm.com \
    --cc=ego@in.ibm.com \
    --cc=herbert@13thfloor.at \
    --cc=kamalesh@linux.vnet.ibm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=ncrao@google.com \
    --cc=svaidy@linux.vnet.ibm.com \
    --cc=vatsa@in.ibm.com \
    --cc=xemul@openvz.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox