[RFC v1 PATCH 4/7] sched: Enforce hard limits by throttling

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Bharata B Rao <bharata@linux.vnet.ibm.com>
To: linux-kernel@vger.kernel.org
Cc: Dhaval Giani <dhaval@linux.vnet.ibm.com>,
	Balbir Singh <balbir@linux.vnet.ibm.com>,
	Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>,
	Gautham R Shenoy <ego@in.ibm.com>,
	Srivatsa Vaddagiri <vatsa@in.ibm.com>,
	Ingo Molnar <mingo@elte.hu>,
	Peter Zijlstra <a.p.zijlstra@chello.nl>,
	Pavel Emelyanov <xemul@openvz.org>,
	Herbert Poetzl <herbert@13thfloor.at>,
	Avi Kivity <avi@redhat.com>, Chris Friesen <cfriesen@nortel.com>,
	Paul Menage <menage@google.com>,
	Mike Waychison <mikew@google.com>
Subject: [RFC v1 PATCH 4/7] sched: Enforce hard limits by throttling
Date: Tue, 25 Aug 2009 15:20:28 +0530	[thread overview]
Message-ID: <20090825095028.GT3663@in.ibm.com> (raw)
In-Reply-To: <20090825094729.GP3663@in.ibm.com>

sched: Enforce hard limits by throttling.

From: Bharata B Rao <bharata@linux.vnet.ibm.com>

Throttle the task-groups which exceed the runtime allocated to them.
Throttled group entities are removed from the run queue.

Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
---
 include/linux/sched.h |    1 
 kernel/sched.c        |   32 ++++++++++
 kernel/sched_debug.c  |    2 
 kernel/sched_fair.c   |  146 ++++++++++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 177 insertions(+), 4 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1124,6 +1124,7 @@ struct sched_entity {
 	u64			nr_failed_migrations_affine;
 	u64			nr_failed_migrations_running;
 	u64			nr_failed_migrations_hot;
+	u64			nr_failed_migrations_throttled;
 	u64			nr_forced_migrations;
 	u64			nr_forced2_migrations;
 
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1580,6 +1580,7 @@ update_group_shares_cpu(struct task_grou
 	}
 }
 
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
 /*
  * Re-compute the task group their per cpu shares over the given domain.
  * This needs to be done in a bottom-up fashion because the rq weight of a
@@ -1597,9 +1598,11 @@ static int tg_shares_up(struct task_grou
 		 * If there are currently no tasks on the cpu pretend there
 		 * is one of average load so that when a new task gets to
 		 * run here it will not get delayed by group starvation.
+		 * Also if the group is throttled on this cpu, pretend that
+		 * it has no tasks.
 		 */
 		weight = tg->cfs_rq[i]->load.weight;
-		if (!weight)
+		if (!weight || cfs_rq_throttled(tg->cfs_rq[i]))
 			weight = NICE_0_LOAD;
 
 		tg->cfs_rq[i]->rq_weight = weight;
@@ -1623,6 +1626,7 @@ static int tg_shares_up(struct task_grou
  * Compute the cpu's hierarchical load factor for each task group.
  * This needs to be done in a top-down fashion because the load of a child
  * group is a fraction of its parents load.
+ * A throttled group's h_load is set to 0.
  */
 static int tg_load_down(struct task_group *tg, void *data)
 {
@@ -1631,6 +1635,8 @@ static int tg_load_down(struct task_grou
 
 	if (!tg->parent) {
 		load = cpu_rq(cpu)->load.weight;
+	} else if (cfs_rq_throttled(tg->cfs_rq[cpu])) {
+		load = 0;
 	} else {
 		load = tg->parent->cfs_rq[cpu]->h_load;
 		load *= tg->cfs_rq[cpu]->shares;
@@ -1808,6 +1814,8 @@ static inline u64 global_cfs_runtime(voi
 	return RUNTIME_INF;
 }
 
+int task_group_throttled(struct task_group *tg, int cpu);
+
 static inline int cfs_bandwidth_enabled(struct task_group *tg)
 {
 	return tg->hard_limit_enabled;
@@ -1892,7 +1900,18 @@ static void init_cfs_hard_limits(struct 
 	return;
 }
 
+int task_group_throttled(struct task_group *tg, int cpu)
+{
+	return 0;
+}
+
 #endif /* CONFIG_CFS_HARD_LIMITS */
+
+#else /* CONFIG_FAIR_GROUP_SCHED */
+int task_group_throttled(struct task_group *tg, int cpu)
+{
+	return 0;
+}
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #include "sched_stats.h"
@@ -3364,6 +3383,7 @@ int can_migrate_task(struct task_struct 
 	 * 1) running (obviously), or
 	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
 	 * 3) are cache-hot on their current CPU.
+	 * 4) end up in throttled task groups on this CPU.
 	 */
 	if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
 		schedstat_inc(p, se.nr_failed_migrations_affine);
@@ -3377,6 +3397,16 @@ int can_migrate_task(struct task_struct 
 	}
 
 	/*
+	 * Don't migrate the task if
+	 * - it belongs to a group which is throttled on this_cpu or
+	 * - it belongs to a group whose hierarchy is throttled on this_cpu
+	 */
+	if (task_group_throttled(task_group(p), this_cpu)) {
+		schedstat_inc(p, se.nr_failed_migrations_throttled);
+		return 0;
+	}
+
+	/*
 	 * Aggressive migration if:
 	 * 1) task is cache cold, or
 	 * 2) too many balance attempts have failed.
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -417,6 +417,7 @@ void proc_sched_show_task(struct task_st
 	P(se.nr_failed_migrations_affine);
 	P(se.nr_failed_migrations_running);
 	P(se.nr_failed_migrations_hot);
+	P(se.nr_failed_migrations_throttled);
 	P(se.nr_forced_migrations);
 	P(se.nr_forced2_migrations);
 	P(se.nr_wakeups);
@@ -491,6 +492,7 @@ void proc_sched_set_task(struct task_str
 	p->se.nr_failed_migrations_affine	= 0;
 	p->se.nr_failed_migrations_running	= 0;
 	p->se.nr_failed_migrations_hot		= 0;
+	p->se.nr_failed_migrations_throttled	= 0;
 	p->se.nr_forced_migrations		= 0;
 	p->se.nr_forced2_migrations		= 0;
 	p->se.nr_wakeups			= 0;
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -186,6 +186,89 @@ find_matching_se(struct sched_entity **s
 	}
 }
 
+#ifdef CONFIG_CFS_HARD_LIMITS
+
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq->cfs_throttled;
+}
+
+/*
+ * Check if group entity exceeded its runtime. If so, mark the cfs_rq as
+ * throttled and mark the current task for rescheduling.
+ */
+static void sched_cfs_runtime_exceeded(struct sched_entity *se,
+	struct task_struct *tsk_curr, unsigned long delta_exec)
+{
+	struct cfs_rq *cfs_rq;
+
+	cfs_rq = group_cfs_rq(se);
+
+	if (!cfs_bandwidth_enabled(cfs_rq->tg))
+		return;
+
+	if (cfs_rq->cfs_runtime == RUNTIME_INF)
+		return;
+
+	cfs_rq->cfs_time += delta_exec;
+
+	if (cfs_rq_throttled(cfs_rq))
+		return;
+
+	if (cfs_rq->cfs_time > cfs_rq->cfs_runtime) {
+		cfs_rq->cfs_throttled = 1;
+		resched_task(tsk_curr);
+	}
+}
+
+/*
+ * Check if the entity is throttled.
+ */
+static int entity_throttled(struct sched_entity *se)
+{
+	struct cfs_rq *cfs_rq;
+
+	/* Only group entities can be throttled */
+	if (entity_is_task(se))
+		return 0;
+
+	cfs_rq = group_cfs_rq(se);
+	if (cfs_rq_throttled(cfs_rq))
+		return 1;
+	return 0;
+}
+
+int task_group_throttled(struct task_group *tg, int cpu)
+{
+	struct sched_entity *se = tg->se[cpu];
+
+	for_each_sched_entity(se) {
+		if (entity_throttled(se))
+			return 1;
+	}
+	return 0;
+}
+
+#else
+
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+{
+	return 0;
+}
+
+static void sched_cfs_runtime_exceeded(struct sched_entity *se,
+	struct task_struct *tsk_curr, unsigned long delta_exec)
+{
+	return;
+}
+
+static int entity_throttled(struct sched_entity *se)
+{
+	return 0;
+}
+
+#endif /* CONFIG_CFS_HARD_LIMITS */
+
 #else	/* CONFIG_FAIR_GROUP_SCHED */
 
 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
@@ -241,6 +324,17 @@ find_matching_se(struct sched_entity **s
 {
 }
 
+static void sched_cfs_runtime_exceeded(struct sched_entity *se,
+	struct task_struct *tsk_curr, unsigned long delta_exec)
+{
+	return;
+}
+
+static int entity_throttled(struct sched_entity *se)
+{
+	return 0;
+}
+
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
 static void add_cfs_rq_tasks_running(struct sched_entity *se,
@@ -505,7 +599,9 @@ __update_curr(struct cfs_rq *cfs_rq, str
 static void update_curr(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *curr = cfs_rq->curr;
-	u64 now = rq_of(cfs_rq)->clock;
+	struct rq *rq = rq_of(cfs_rq);
+	struct task_struct *tsk_curr = rq->curr;
+	u64 now = rq->clock;
 	unsigned long delta_exec;
 
 	if (unlikely(!curr))
@@ -528,6 +624,8 @@ static void update_curr(struct cfs_rq *c
 
 		cpuacct_charge(curtask, delta_exec);
 		account_group_exec_runtime(curtask, delta_exec);
+	} else {
+		sched_cfs_runtime_exceeded(curr, tsk_curr, delta_exec);
 	}
 }
 
@@ -865,8 +963,40 @@ static struct sched_entity *pick_next_en
 	return se;
 }
 
+/*
+ * Called from put_prev_entity()
+ * If a group entity (@se) is found to be throttled, it will not be put back
+ * on @cfs_rq, which is equivalent to dequeuing it.
+ */
+static void dequeue_throttled_entity(struct cfs_rq *cfs_rq,
+		struct sched_entity *se)
+{
+	unsigned long nr_tasks = group_cfs_rq(se)->nr_tasks_running;
+
+	__clear_buddies(cfs_rq, se);
+	account_entity_dequeue(cfs_rq, se);
+	cfs_rq->curr = NULL;
+
+	if (!nr_tasks)
+		return;
+
+	/*
+	 * Decrement the number of tasks this entity has from
+	 * all of its parent entities.
+	 */
+	sub_cfs_rq_tasks_running(se, nr_tasks);
+
+	/*
+	 * Decrement the number of tasks this entity has from
+	 * this cpu's rq.
+	 */
+	rq_of(cfs_rq)->nr_running -= nr_tasks;
+}
+
 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 {
+	struct cfs_rq *gcfs_rq = group_cfs_rq(prev);
+
 	/*
 	 * If still on the runqueue then deactivate_task()
 	 * was not called and update_curr() has to be done:
@@ -876,6 +1006,15 @@ static void put_prev_entity(struct cfs_r
 
 	check_spread(cfs_rq, prev);
 	if (prev->on_rq) {
+		/*
+		 * If the group entity is throttled or if it has no
+		 * child entities, then don't enqueue it back.
+		 */
+		if (entity_throttled(prev) ||
+			(gcfs_rq && !gcfs_rq->nr_running)) {
+			dequeue_throttled_entity(cfs_rq, prev);
+			return;
+		}
 		update_stats_wait_start(cfs_rq, prev);
 		/* Put 'current' back into the tree. */
 		__enqueue_entity(cfs_rq, prev);
@@ -1541,6 +1680,7 @@ static struct task_struct *pick_next_tas
 
 	do {
 		se = pick_next_entity(cfs_rq);
+
 		/*
 		 * If se was a buddy, clear it so that it will have to earn
 		 * the favour again.
@@ -1650,9 +1790,9 @@ load_balance_fair(struct rq *this_rq, in
 		u64 rem_load, moved_load;
 
 		/*
-		 * empty group
+		 * empty group or a group with no h_load (throttled)
 		 */
-		if (!busiest_cfs_rq->task_weight)
+		if (!busiest_cfs_rq->task_weight || !busiest_h_load)
 			continue;
 
 		rem_load = (u64)rem_load_move * busiest_weight;

next prev parent reply	other threads:[~2009-08-25  9:50 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-08-25  9:47 [RFC v1 PATCH 0/7] CFS Hard limits - v1 Bharata B Rao
2009-08-25  9:48 ` [RFC v1 PATCH 1/7] 1/7 sched: Rename sched_rt_period_mask() and use it in CFS also Bharata B Rao
2009-08-25  9:49 ` [RFC v1 PATCH 2/7] sched: Maintain aggregated tasks count in cfs_rq at each hierarchy level Bharata B Rao
2009-08-25  9:49 ` [RFC v1 PATCH 3/7] sched: Bandwidth initialization for fair task groups Bharata B Rao
2009-09-04 10:43   ` Andrea Righi
2009-09-04 12:32     ` Bharata B Rao
2009-09-04 12:36       ` Andrea Righi
2009-08-25  9:50 ` Bharata B Rao [this message]
2009-08-25  9:51 ` [RFC v1 PATCH 5/7] sched: Unthrottle the throttled tasks Bharata B Rao
2009-08-25  9:51 ` [RFC v1 PATCH 6/7] sched: Add throttle time statistics to /proc/sched_debug Bharata B Rao
2009-08-25  9:53 ` [RFC v1 PATCH 7/7] sched: Hard limits documentation Bharata B Rao

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20090825095028.GT3663@in.ibm.com \
    --to=bharata@linux.vnet.ibm.com \
    --cc=a.p.zijlstra@chello.nl \
    --cc=avi@redhat.com \
    --cc=balbir@linux.vnet.ibm.com \
    --cc=cfriesen@nortel.com \
    --cc=dhaval@linux.vnet.ibm.com \
    --cc=ego@in.ibm.com \
    --cc=herbert@13thfloor.at \
    --cc=linux-kernel@vger.kernel.org \
    --cc=menage@google.com \
    --cc=mikew@google.com \
    --cc=mingo@elte.hu \
    --cc=svaidy@linux.vnet.ibm.com \
    --cc=vatsa@in.ibm.com \
    --cc=xemul@openvz.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.