[RFC v5 PATCH 3/8] sched: Bandwidth initialization for fair task groups

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Bharata B Rao <bharata@linux.vnet.ibm.com>
To: linux-kernel@vger.kernel.org
Cc: Dhaval Giani <dhaval@linux.vnet.ibm.com>,
	Balbir Singh <balbir@linux.vnet.ibm.com>,
	Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>,
	Gautham R Shenoy <ego@in.ibm.com>,
	Srivatsa Vaddagiri <vatsa@in.ibm.com>,
	Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>,
	Ingo Molnar <mingo@elte.hu>,
	Peter Zijlstra <a.p.zijlstra@chello.nl>,
	Pavel Emelyanov <xemul@openvz.org>,
	Herbert Poetzl <herbert@13thfloor.at>,
	Avi Kivity <avi@redhat.com>, Chris Friesen <cfriesen@nortel.com>,
	Paul Menage <menage@google.com>,
	Mike Waychison <mikew@google.com>
Subject: [RFC v5 PATCH 3/8] sched: Bandwidth initialization for fair task groups
Date: Tue, 5 Jan 2010 13:30:21 +0530	[thread overview]
Message-ID: <20100105080021.GH27899@in.ibm.com> (raw)
In-Reply-To: <20100105075703.GE27899@in.ibm.com>

sched: Bandwidth initialization for fair task groups.

From: Bharata B Rao <bharata@linux.vnet.ibm.com>

Introduce the notion of hard limiting for CFS groups by bringing in
the concept of runtime and period for them. Add cgroup files to control
runtime and period.

Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
---
 init/Kconfig        |   13 ++++
 kernel/sched.c      |  164 +++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched_fair.c |   18 ++++++
 3 files changed, 195 insertions(+), 0 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index a23da9f..6b76df4 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -486,6 +486,19 @@ config CGROUP_SCHED
 
 endchoice
 
+config CFS_HARD_LIMITS
+	bool "Hard Limits for CFS Group Scheduler"
+	depends on EXPERIMENTAL
+	depends on FAIR_GROUP_SCHED && CGROUP_SCHED
+	default n
+	help
+	  This option enables hard limiting of CPU time obtained by
+	  a fair task group. Use this if you want to throttle a group of tasks
+	  based on its CPU usage. For more details refer to
+	  Documentation/scheduler/sched-cfs-hard-limits.txt
+
+	  Say N if unsure.
+
 menuconfig CGROUPS
 	boolean "Control Group support"
 	help
diff --git a/kernel/sched.c b/kernel/sched.c
index 4a24d62..48d5483 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -151,6 +151,14 @@ static struct sched_bandwidth def_rt_bandwidth;
 
 static int do_sched_rt_period_timer(struct sched_bandwidth *sched_b, int overrun);
 
+/*
+ * Stub for now; it will be populated in subsequent hard limit patches.
+ */
+static int do_sched_cfs_period_timer(struct sched_bandwidth *sched_b, int overrun)
+{
+	return 0;	/* becomes the caller's 'idle'; 0 selects HRTIMER_RESTART */
+}
+
 static enum hrtimer_restart sched_period_timer(struct hrtimer *timer, int rt)
 {
 	struct sched_bandwidth *sched_b =
@@ -168,6 +176,8 @@ static enum hrtimer_restart sched_period_timer(struct hrtimer *timer, int rt)
 
 		if (rt)
 			idle = do_sched_rt_period_timer(sched_b, overrun);
+		else
+			idle = do_sched_cfs_period_timer(sched_b, overrun);
 	}
 
 	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
@@ -266,6 +276,7 @@ struct task_group {
 	/* runqueue "owned" by this group on each cpu */
 	struct cfs_rq **cfs_rq;
 	unsigned long shares;
+	struct sched_bandwidth cfs_bandwidth;
 #endif
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -463,6 +474,7 @@ struct cfs_rq {
 	unsigned long rq_weight;
 #endif
 #endif
+	struct rq_bandwidth rq_bandwidth;
 };
 
 /* Real-Time classes' related field in a runqueue: */
@@ -2075,6 +2087,38 @@ static inline void balance_runtime(struct rq_bandwidth *rq_b,
 }
 #endif /* CONFIG_SMP */
 
+/*
+ * Runtime a cfs group may consume before it is hard limited.
+ * Default: RUNTIME_INF (infinite), which means no hard limiting.
+ */
+u64 sched_cfs_runtime = RUNTIME_INF;
+
+/*
+ * Period, in microseconds, over which a cfs group's bandwidth is hard limited.
+ * Default: 500000us = 0.5s.
+ */
+u64 sched_cfs_period = 500000;
+
+static inline u64 global_cfs_period(void)
+{
+	return sched_cfs_period * NSEC_PER_USEC;	/* us -> ns */
+}
+
+static inline u64 global_cfs_runtime(void)
+{
+	return RUNTIME_INF;	/* NOTE(review): sched_cfs_runtime is not consulted - confirm */
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * Refresh the runtimes of the throttled groups.
+ */
+static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
+{
+	return sched_period_timer(timer, 0);	/* rt == 0 selects the cfs branch */
+}
+#endif
+
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -9640,6 +9684,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 	struct rq *rq = cpu_rq(cpu);
 	tg->cfs_rq[cpu] = cfs_rq;
 	init_cfs_rq(cfs_rq, rq);
+	init_rq_bandwidth(&cfs_rq->rq_bandwidth, tg->cfs_bandwidth.runtime);
 	cfs_rq->tg = tg;
 	if (add)
 		list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
@@ -9765,6 +9810,12 @@ void __init sched_init(void)
 #endif /* CONFIG_USER_SCHED */
 #endif /* CONFIG_RT_GROUP_SCHED */
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	init_sched_bandwidth(&init_task_group.cfs_bandwidth,
+		global_cfs_period(), global_cfs_runtime(),
+		&sched_cfs_period_timer);
+#endif
+
 #ifdef CONFIG_GROUP_SCHED
 	list_add(&init_task_group.list, &task_groups);
 	INIT_LIST_HEAD(&init_task_group.children);
@@ -9791,6 +9842,8 @@ void __init sched_init(void)
 		init_cfs_rq(&rq->cfs, rq);
 		init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
+		init_rq_bandwidth(&rq->cfs.rq_bandwidth,
+				init_task_group.cfs_bandwidth.runtime);
 		init_task_group.shares = init_task_group_load;
 		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
 #ifdef CONFIG_CGROUP_SCHED
@@ -10070,6 +10123,7 @@ static void free_fair_sched_group(struct task_group *tg)
 {
 	int i;
 
+	destroy_sched_bandwidth(&tg->cfs_bandwidth);
 	for_each_possible_cpu(i) {
 		if (tg->cfs_rq)
 			kfree(tg->cfs_rq[i]);
@@ -10096,6 +10150,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 	if (!tg->se)
 		goto err;
 
+	init_sched_bandwidth(&tg->cfs_bandwidth, global_cfs_period(),
+		global_cfs_runtime(), &sched_cfs_period_timer);
 	tg->shares = NICE_0_LOAD;
 
 	for_each_possible_cpu(i) {
@@ -10824,6 +10880,102 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
 
 	return (u64) tg->shares;
 }
+
+#ifdef CONFIG_CFS_HARD_LIMITS
+
+static int tg_set_cfs_bandwidth(struct task_group *tg,
+		u64 cfs_period, u64 cfs_runtime)	/* both in ns */
+{
+	int i;
+
+	if (tg == &init_task_group)
+		return -EINVAL;	/* the root group's bandwidth cannot be changed */
+
+	raw_spin_lock_irq(&tg->cfs_bandwidth.runtime_lock);	/* group lock, taken first */
+	tg->cfs_bandwidth.period = ns_to_ktime(cfs_period);
+	tg->cfs_bandwidth.runtime = cfs_runtime;
+
+	for_each_possible_cpu(i) {	/* copy the new runtime into every cpu's cfs_rq */
+		struct cfs_rq *cfs_rq = tg->cfs_rq[i];
+
+		raw_spin_lock(&cfs_rq->rq_bandwidth.runtime_lock);	/* nested inside the group lock */
+		cfs_rq->rq_bandwidth.runtime = cfs_runtime;
+		raw_spin_unlock(&cfs_rq->rq_bandwidth.runtime_lock);
+	}
+
+	raw_spin_unlock_irq(&tg->cfs_bandwidth.runtime_lock);
+	return 0;	/* 0 on success */
+}
+
+int tg_set_cfs_runtime(struct task_group *tg, long cfs_runtime_us)
+{
+	u64 cfs_runtime, cfs_period;
+
+	cfs_period = ktime_to_ns(tg->cfs_bandwidth.period);	/* period is passed back unchanged */
+	cfs_runtime = (u64)cfs_runtime_us * NSEC_PER_USEC;	/* us -> ns */
+	if (cfs_runtime_us < 0)
+		cfs_runtime = RUNTIME_INF;	/* a negative request means no limit */
+
+	return tg_set_cfs_bandwidth(tg, cfs_period, cfs_runtime);
+}
+
+long tg_get_cfs_runtime(struct task_group *tg)
+{
+	u64 cfs_runtime_us;
+
+	if (tg->cfs_bandwidth.runtime == RUNTIME_INF)
+		return -1;	/* an unlimited group is reported as -1 */
+
+	cfs_runtime_us = tg->cfs_bandwidth.runtime;	/* still in ns here */
+	do_div(cfs_runtime_us, NSEC_PER_USEC);	/* ns -> us, result left in cfs_runtime_us */
+	return cfs_runtime_us;
+}
+
+int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
+{
+	u64 cfs_runtime, cfs_period;
+
+	cfs_period = (u64)cfs_period_us * NSEC_PER_USEC;	/* us -> ns; NOTE(review): negative input is not rejected */
+	cfs_runtime = tg->cfs_bandwidth.runtime;	/* runtime is passed back unchanged */
+
+	if (cfs_period == 0)
+		return -EINVAL;	/* a zero period is refused */
+
+	return tg_set_cfs_bandwidth(tg, cfs_period, cfs_runtime);
+}
+
+long tg_get_cfs_period(struct task_group *tg)
+{
+	u64 cfs_period_us;
+
+	cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);	/* still in ns here */
+	do_div(cfs_period_us, NSEC_PER_USEC);	/* ns -> us, result left in cfs_period_us */
+	return cfs_period_us;
+}
+
+static s64 cpu_cfs_runtime_read_s64(struct cgroup *cgrp, struct cftype *cft)
+{
+	return tg_get_cfs_runtime(cgroup_tg(cgrp));	/* runtime in us, or -1 when unlimited */
+}
+
+static int cpu_cfs_runtime_write_s64(struct cgroup *cgrp, struct cftype *cftype,
+				s64 cfs_runtime_us)
+{
+	return tg_set_cfs_runtime(cgroup_tg(cgrp), cfs_runtime_us);	/* negative value => unlimited */
+}
+
+static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
+{
+	return tg_get_cfs_period(cgroup_tg(cgrp));	/* period in us */
+}
+
+static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
+				u64 cfs_period_us)
+{
+	return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);	/* NOTE(review): u64 narrows to the callee's long */
+}
+
+#endif /* CONFIG_CFS_HARD_LIMITS */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -10857,6 +11009,18 @@ static struct cftype cpu_files[] = {
 		.read_u64 = cpu_shares_read_u64,
 		.write_u64 = cpu_shares_write_u64,
 	},
+#ifdef CONFIG_CFS_HARD_LIMITS
+	{
+		.name = "cfs_runtime_us",
+		.read_s64 = cpu_cfs_runtime_read_s64,
+		.write_s64 = cpu_cfs_runtime_write_s64,
+	},
+	{
+		.name = "cfs_period_us",
+		.read_u64 = cpu_cfs_period_read_u64,
+		.write_u64 = cpu_cfs_period_write_u64,
+	},
+#endif /* CONFIG_CFS_HARD_LIMITS */
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
 	{
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 42ac3c9..0dfb7a5 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -205,6 +205,18 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 	}
 }
 
+static inline struct sched_bandwidth *sched_cfs_bandwidth(struct cfs_rq *cfs_rq)
+{
+	return &cfs_rq->tg->cfs_bandwidth;	/* bandwidth lives in the task group; tg is not checked here */
+}
+
+static inline void start_cfs_bandwidth(struct cfs_rq *cfs_rq)
+{
+	if (cfs_rq->tg)	/* nothing to start for a cfs_rq without a task group */
+		start_sched_bandwidth(sched_cfs_bandwidth(cfs_rq), 0);	/* 0: presumably "not rt" - confirm */
+	return;
+}
+
 #else	/* !CONFIG_FAIR_GROUP_SCHED */
 
 static inline struct task_struct *task_of(struct sched_entity *se)
@@ -265,6 +277,11 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 {
 }
 
+static inline void start_cfs_bandwidth(struct cfs_rq *cfs_rq)
+{
+	return;	/* no-op in the !CONFIG_FAIR_GROUP_SCHED build */
+}
+
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
 
@@ -360,6 +377,7 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 	rb_link_node(&se->run_node, parent, link);
 	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
+	start_cfs_bandwidth(cfs_rq);
 }
 
 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)

next prev parent reply	other threads:[~2010-01-05  8:01 UTC|newest]

Thread overview: 22+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-01-05  7:57 [RFC v5 PATCH 0/8] CFS Hard limits - v5 Bharata B Rao
2010-01-05  7:58 ` [RFC v5 PATCH 1/8] sched: Rename struct rt_bandwidth to sched_bandwidth Bharata B Rao
2010-01-29  8:59   ` Balbir Singh
2010-01-29 14:07     ` Bharata B Rao
2010-01-05  7:59 ` [RFC v5 PATCH 2/8] sched: Make rt bandwidth timer and runtime related code generic Bharata B Rao
2010-01-05  8:00 ` Bharata B Rao [this message]
2010-01-05  8:01 ` [RFC v5 PATCH 4/8] sched: Enforce hard limits by throttling Bharata B Rao
2010-01-05  8:01 ` [RFC v5 PATCH 5/8] sched: Unthrottle the throttled tasks Bharata B Rao
2010-01-05  8:02 ` [RFC v5 PATCH 6/8] sched: Add throttle time statistics to /proc/sched_debug Bharata B Rao
2010-01-05  8:03 ` [RFC v5 PATCH 7/8] sched: CFS runtime borrowing Bharata B Rao
2010-01-06  5:02   ` Bharata B Rao
2010-01-05  8:04 ` [RFC v5 PATCH 8/8] sched: Hard limits documentation Bharata B Rao
2010-01-05  8:06 ` [RFC v5 PATCH 0/8] CFS Hard limits - v5 Bharata B Rao
2010-01-08 20:45 ` Paul Turner
2010-01-29  3:49   ` Bharata B Rao
2010-01-29  4:26     ` Paul Turner
2010-02-01  8:21       ` Bharata B Rao
2010-02-01 11:04         ` Paul Turner
2010-02-01 18:25           ` Paul Turner
2010-02-02  4:14             ` Bharata B Rao
2010-02-02  7:13               ` Paul Turner
2010-02-02  7:57                 ` Bharata B Rao

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:a23da9f dfblob:6b76df4 dfblob:4a24d62 dfblob:48d5483
dfblob:42ac3c9 dfblob:0dfb7a5 )
 OR (
bs:"[RFC v5 PATCH 3/8] sched: Bandwidth initialization for fair task groups" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20100105080021.GH27899@in.ibm.com \
    --to=bharata@linux.vnet.ibm.com \
    --cc=a.p.zijlstra@chello.nl \
    --cc=avi@redhat.com \
    --cc=balbir@linux.vnet.ibm.com \
    --cc=cfriesen@nortel.com \
    --cc=dhaval@linux.vnet.ibm.com \
    --cc=ego@in.ibm.com \
    --cc=herbert@13thfloor.at \
    --cc=kamalesh@linux.vnet.ibm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=menage@google.com \
    --cc=mikew@google.com \
    --cc=mingo@elte.hu \
    --cc=svaidy@linux.vnet.ibm.com \
    --cc=vatsa@in.ibm.com \
    --cc=xemul@openvz.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.