* [RFC v1 PATCH 1/7] 1/7 sched: Rename sched_rt_period_mask() and use it in CFS also
2009-08-25 9:47 [RFC v1 PATCH 0/7] CFS Hard limits - v1 Bharata B Rao
@ 2009-08-25 9:48 ` Bharata B Rao
2009-08-25 9:49 ` [RFC v1 PATCH 2/7] sched: Maintain aggregated tasks count in cfs_rq at each hierarchy level Bharata B Rao
` (5 subsequent siblings)
6 siblings, 0 replies; 11+ messages in thread
From: Bharata B Rao @ 2009-08-25 9:48 UTC (permalink / raw)
To: linux-kernel
Cc: Dhaval Giani, Balbir Singh, Vaidyanathan Srinivasan,
Gautham R Shenoy, Srivatsa Vaddagiri, Ingo Molnar, Peter Zijlstra,
Pavel Emelyanov, Herbert Poetzl, Avi Kivity, Chris Friesen,
Paul Menage, Mike Waychison
sched: Rename sched_rt_period_mask() and use it in CFS also.
From: Bharata B Rao <bharata@linux.vnet.ibm.com>
sched_rt_period_mask() is needed in CFS also. Rename it to a generic name
and move it to kernel/sched.c. No functionality change in this patch.
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
---
kernel/sched.c | 23 +++++++++++++++++++++++
kernel/sched_rt.c | 19 +------------------
2 files changed, 24 insertions(+), 18 deletions(-)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1732,6 +1732,29 @@ static void cfs_rq_set_shares(struct cfs
static void calc_load_account_active(struct rq *this_rq);
+
+#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_FAIR_GROUP_SCHED)
+
+#ifdef CONFIG_SMP
+static inline const struct cpumask *sched_bw_period_mask(void)
+{
+ return cpu_rq(smp_processor_id())->rd->span;
+}
+#else /* !CONFIG_SMP */
+static inline const struct cpumask *sched_bw_period_mask(void)
+{
+ return cpu_online_mask;
+}
+#endif /* CONFIG_SMP */
+
+#else
+static inline const struct cpumask *sched_bw_period_mask(void)
+{
+ return cpu_online_mask;
+}
+
+#endif
+
#include "sched_stats.h"
#include "sched_idletask.c"
#include "sched_fair.c"
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -222,18 +222,6 @@ static int rt_se_boosted(struct sched_rt
return p->prio != p->normal_prio;
}
-#ifdef CONFIG_SMP
-static inline const struct cpumask *sched_rt_period_mask(void)
-{
- return cpu_rq(smp_processor_id())->rd->span;
-}
-#else
-static inline const struct cpumask *sched_rt_period_mask(void)
-{
- return cpu_online_mask;
-}
-#endif
-
static inline
struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
{
@@ -283,11 +271,6 @@ static inline int rt_rq_throttled(struct
return rt_rq->rt_throttled;
}
-static inline const struct cpumask *sched_rt_period_mask(void)
-{
- return cpu_online_mask;
-}
-
static inline
struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
{
@@ -505,7 +488,7 @@ static int do_sched_rt_period_timer(stru
if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
return 1;
- span = sched_rt_period_mask();
+ span = sched_bw_period_mask();
for_each_cpu(i, span) {
int enqueue = 0;
struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
^ permalink raw reply [flat|nested] 11+ messages in thread* [RFC v1 PATCH 2/7] sched: Maintain aggregated tasks count in cfs_rq at each hierarchy level
2009-08-25 9:47 [RFC v1 PATCH 0/7] CFS Hard limits - v1 Bharata B Rao
2009-08-25 9:48 ` [RFC v1 PATCH 1/7] 1/7 sched: Rename sched_rt_period_mask() and use it in CFS also Bharata B Rao
@ 2009-08-25 9:49 ` Bharata B Rao
2009-08-25 9:49 ` [RFC v1 PATCH 3/7] sched: Bandwidth initialization for fair task groups Bharata B Rao
` (4 subsequent siblings)
6 siblings, 0 replies; 11+ messages in thread
From: Bharata B Rao @ 2009-08-25 9:49 UTC (permalink / raw)
To: linux-kernel
Cc: Dhaval Giani, Balbir Singh, Vaidyanathan Srinivasan,
Gautham R Shenoy, Srivatsa Vaddagiri, Ingo Molnar, Peter Zijlstra,
Pavel Emelyanov, Herbert Poetzl, Avi Kivity, Chris Friesen,
Paul Menage, Mike Waychison
sched: Maintain aggregated tasks count in cfs_rq at each hierarchy level
From: Bharata B Rao <bharata@linux.vnet.ibm.com>
This patch adds a counter to cfs_rq (->nr_tasks_running) to record the
aggregated tasks count at each level in the task group hierarchy.
This is needed by later hard limit patches where it is required to
know how many tasks go off the rq when a throttled group entity
is dequeued.
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
---
kernel/sched.c | 4 ++++
kernel/sched_debug.c | 2 ++
kernel/sched_fair.c | 23 +++++++++++++++++++++++
3 files changed, 29 insertions(+)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -477,6 +477,10 @@ struct cfs_rq {
unsigned long rq_weight;
#endif
#endif
+ /*
+ * Number of tasks at this hierarchy.
+ */
+ unsigned long nr_tasks_running;
};
/* Real-Time classes' related field in a runqueue: */
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -214,6 +214,8 @@ void print_cfs_rq(struct seq_file *m, in
#ifdef CONFIG_SMP
SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares);
#endif
+ SEQ_printf(m, " .%-30s: %ld\n", "nr_tasks_running",
+ cfs_rq->nr_tasks_running);
print_cfs_group_stats(m, cpu, cfs_rq->tg);
#endif
}
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -243,6 +243,27 @@ find_matching_se(struct sched_entity **s
#endif /* CONFIG_FAIR_GROUP_SCHED */
+static void add_cfs_rq_tasks_running(struct sched_entity *se,
+ unsigned long count)
+{
+ struct cfs_rq *cfs_rq;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ cfs_rq->nr_tasks_running += count;
+ }
+}
+
+static void sub_cfs_rq_tasks_running(struct sched_entity *se,
+ unsigned long count)
+{
+ struct cfs_rq *cfs_rq;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ cfs_rq->nr_tasks_running -= count;
+ }
+}
/**************************************************************
* Scheduling class tree data structure manipulation methods:
@@ -969,6 +990,7 @@ static void enqueue_task_fair(struct rq
wakeup = 1;
}
+ add_cfs_rq_tasks_running(&p->se, 1);
hrtick_update(rq);
}
@@ -991,6 +1013,7 @@ static void dequeue_task_fair(struct rq
sleep = 1;
}
+ sub_cfs_rq_tasks_running(&p->se, 1);
hrtick_update(rq);
}
^ permalink raw reply [flat|nested] 11+ messages in thread* [RFC v1 PATCH 3/7] sched: Bandwidth initialization for fair task groups
2009-08-25 9:47 [RFC v1 PATCH 0/7] CFS Hard limits - v1 Bharata B Rao
2009-08-25 9:48 ` [RFC v1 PATCH 1/7] 1/7 sched: Rename sched_rt_period_mask() and use it in CFS also Bharata B Rao
2009-08-25 9:49 ` [RFC v1 PATCH 2/7] sched: Maintain aggregated tasks count in cfs_rq at each hierarchy level Bharata B Rao
@ 2009-08-25 9:49 ` Bharata B Rao
2009-09-04 10:43 ` Andrea Righi
2009-08-25 9:50 ` [RFC v1 PATCH 4/7] sched: Enforce hard limits by throttling Bharata B Rao
` (3 subsequent siblings)
6 siblings, 1 reply; 11+ messages in thread
From: Bharata B Rao @ 2009-08-25 9:49 UTC (permalink / raw)
To: linux-kernel
Cc: Dhaval Giani, Balbir Singh, Vaidyanathan Srinivasan,
Gautham R Shenoy, Srivatsa Vaddagiri, Ingo Molnar, Peter Zijlstra,
Pavel Emelyanov, Herbert Poetzl, Avi Kivity, Chris Friesen,
Paul Menage, Mike Waychison
sched: Bandwidth initialization for fair task groups.
From: Bharata B Rao <bharata@linux.vnet.ibm.com>
Introduce the notion of hard limiting for CFS groups by bringing in
the concept of runtime and period for them. Add cgroup files to control
runtime and period.
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
---
init/Kconfig | 13 ++
kernel/sched.c | 279 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 292 insertions(+)
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -456,6 +456,19 @@ config FAIR_GROUP_SCHED
depends on GROUP_SCHED
default GROUP_SCHED
+config CFS_HARD_LIMITS
+ bool "Hard Limits for CFS Group Scheduler"
+ depends on EXPERIMENTAL
+ depends on FAIR_GROUP_SCHED
+ default n
+ help
+ This option enables hard limiting of CPU time obtained by
+ a fair task group. Use this if you want to throttle a group of tasks
+ based on its CPU usage. For more details refer to
+ Documentation/scheduler/sched-cfs-hard-limits.txt
+
+ Say N if unsure.
+
config RT_GROUP_SCHED
bool "Group scheduling for SCHED_RR/FIFO"
depends on EXPERIMENTAL
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -262,6 +262,15 @@ static DEFINE_MUTEX(sched_domains_mutex)
#include <linux/cgroup.h>
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_CFS_HARD_LIMITS)
+struct cfs_bandwidth {
+ spinlock_t cfs_runtime_lock;
+ ktime_t cfs_period;
+ u64 cfs_runtime;
+ struct hrtimer cfs_period_timer;
+};
+#endif
+
struct cfs_rq;
static LIST_HEAD(task_groups);
@@ -282,6 +291,11 @@ struct task_group {
/* runqueue "owned" by this group on each cpu */
struct cfs_rq **cfs_rq;
unsigned long shares;
+#ifdef CONFIG_CFS_HARD_LIMITS
+ struct cfs_bandwidth cfs_bandwidth;
+ /* If set, throttle when the group exceeds its bandwidth */
+ int hard_limit_enabled;
+#endif
#endif
#ifdef CONFIG_RT_GROUP_SCHED
@@ -477,6 +491,16 @@ struct cfs_rq {
unsigned long rq_weight;
#endif
#endif
+#ifdef CONFIG_CFS_HARD_LIMITS
+ /* set when the group is throttled on this cpu */
+ int cfs_throttled;
+
+ /* runtime currently consumed by the group on this rq */
+ u64 cfs_time;
+
+ /* runtime available to the group on this rq */
+ u64 cfs_runtime;
+#endif
/*
* Number of tasks at this hierarchy.
*/
@@ -1759,6 +1783,118 @@ static inline const struct cpumask *sche
#endif
+#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_CFS_HARD_LIMITS
+
+/*
+ * Runtime allowed for a cfs group before it is hard limited.
+ * default: Infinite which means no hard limiting.
+ */
+u64 sched_cfs_runtime = RUNTIME_INF;
+
+/*
+ * period over which we hard limit the cfs group's bandwidth.
+ * default: 0.5s
+ */
+u64 sched_cfs_period = 500000;
+
+static inline u64 global_cfs_period(void)
+{
+ return sched_cfs_period * NSEC_PER_USEC;
+}
+
+static inline u64 global_cfs_runtime(void)
+{
+ return RUNTIME_INF;
+}
+
+static inline int cfs_bandwidth_enabled(struct task_group *tg)
+{
+ return tg->hard_limit_enabled;
+}
+
+/*
+ * Refresh the runtimes of the throttled groups.
+ * But nothing much to do now, will populate this in later patches.
+ */
+static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
+{
+ struct cfs_bandwidth *cfs_b =
+ container_of(timer, struct cfs_bandwidth, cfs_period_timer);
+
+ hrtimer_add_expires_ns(timer, ktime_to_ns(cfs_b->cfs_period));
+ return HRTIMER_RESTART;
+}
+
+/*
+ * TODO: Check if this kind of timer setup is sufficient for cfs or
+ * should we do what rt is doing.
+ */
+static void start_cfs_bandwidth(struct task_group *tg)
+{
+ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+
+ /*
+ * Timer isn't setup for groups with infinite runtime or for groups
+ * for which hard limiting isn't enabled.
+ */
+ if (!cfs_bandwidth_enabled(tg) || (cfs_b->cfs_runtime == RUNTIME_INF))
+ return;
+
+ if (hrtimer_active(&cfs_b->cfs_period_timer))
+ return;
+
+ hrtimer_start_range_ns(&cfs_b->cfs_period_timer, cfs_b->cfs_period,
+ 0, HRTIMER_MODE_REL);
+}
+
+static void init_cfs_bandwidth(struct task_group *tg)
+{
+ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+
+ cfs_b->cfs_period = ns_to_ktime(global_cfs_period());
+ cfs_b->cfs_runtime = global_cfs_runtime();
+
+ spin_lock_init(&cfs_b->cfs_runtime_lock);
+
+ hrtimer_init(&cfs_b->cfs_period_timer,
+ CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ cfs_b->cfs_period_timer.function = &sched_cfs_period_timer;
+}
+
+static inline void destroy_cfs_bandwidth(struct task_group *tg)
+{
+ hrtimer_cancel(&tg->cfs_bandwidth.cfs_period_timer);
+}
+
+static void init_cfs_hard_limits(struct cfs_rq *cfs_rq, struct task_group *tg)
+{
+ cfs_rq->cfs_time = 0;
+ cfs_rq->cfs_throttled = 0;
+ cfs_rq->cfs_runtime = tg->cfs_bandwidth.cfs_runtime;
+ tg->hard_limit_enabled = 0;
+}
+
+#else /* !CONFIG_CFS_HARD_LIMITS */
+
+static void init_cfs_bandwidth(struct task_group *tg)
+{
+ return;
+}
+
+static inline void destroy_cfs_bandwidth(struct task_group *tg)
+{
+ return;
+}
+
+static void init_cfs_hard_limits(struct cfs_rq *cfs_rq, struct task_group *tg)
+{
+ return;
+}
+
+#endif /* CONFIG_CFS_HARD_LIMITS */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
#include "sched_stats.h"
#include "sched_idletask.c"
#include "sched_fair.c"
@@ -9146,6 +9282,7 @@ static void init_tg_cfs_entry(struct tas
struct rq *rq = cpu_rq(cpu);
tg->cfs_rq[cpu] = cfs_rq;
init_cfs_rq(cfs_rq, rq);
+ init_cfs_hard_limits(cfs_rq, tg);
cfs_rq->tg = tg;
if (add)
list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
@@ -9275,6 +9412,10 @@ void __init sched_init(void)
#endif /* CONFIG_USER_SCHED */
#endif /* CONFIG_RT_GROUP_SCHED */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ init_cfs_bandwidth(&init_task_group);
+#endif
+
#ifdef CONFIG_GROUP_SCHED
list_add(&init_task_group.list, &task_groups);
INIT_LIST_HEAD(&init_task_group.children);
@@ -9564,6 +9705,7 @@ static void free_fair_sched_group(struct
{
int i;
+ destroy_cfs_bandwidth(tg);
for_each_possible_cpu(i) {
if (tg->cfs_rq)
kfree(tg->cfs_rq[i]);
@@ -9590,6 +9732,7 @@ int alloc_fair_sched_group(struct task_g
if (!tg->se)
goto err;
+ init_cfs_bandwidth(tg);
tg->shares = NICE_0_LOAD;
for_each_possible_cpu(i) {
@@ -10284,6 +10427,125 @@ static u64 cpu_shares_read_u64(struct cg
return (u64) tg->shares;
}
+
+#ifdef CONFIG_CFS_HARD_LIMITS
+
+static int tg_set_cfs_bandwidth(struct task_group *tg,
+ u64 cfs_period, u64 cfs_runtime)
+{
+ int i, err = 0;
+
+ spin_lock_irq(&tg->cfs_bandwidth.cfs_runtime_lock);
+ tg->cfs_bandwidth.cfs_period = ns_to_ktime(cfs_period);
+ tg->cfs_bandwidth.cfs_runtime = cfs_runtime;
+
+ for_each_possible_cpu(i) {
+ struct cfs_rq *cfs_rq = tg->cfs_rq[i];
+
+ spin_lock(&(rq_of(cfs_rq)->lock));
+ cfs_rq->cfs_runtime = cfs_runtime;
+ spin_unlock(&(rq_of(cfs_rq)->lock));
+ }
+
+ start_cfs_bandwidth(tg);
+ spin_unlock_irq(&tg->cfs_bandwidth.cfs_runtime_lock);
+ return err;
+}
+
+int tg_set_cfs_runtime(struct task_group *tg, long cfs_runtime_us)
+{
+ u64 cfs_runtime, cfs_period;
+
+ cfs_period = ktime_to_ns(tg->cfs_bandwidth.cfs_period);
+ cfs_runtime = (u64)cfs_runtime_us * NSEC_PER_USEC;
+ if (cfs_runtime_us < 0)
+ cfs_runtime = RUNTIME_INF;
+
+ return tg_set_cfs_bandwidth(tg, cfs_period, cfs_runtime);
+}
+
+long tg_get_cfs_runtime(struct task_group *tg)
+{
+ u64 cfs_runtime_us;
+
+ if (tg->cfs_bandwidth.cfs_runtime == RUNTIME_INF)
+ return -1;
+
+ cfs_runtime_us = tg->cfs_bandwidth.cfs_runtime;
+ do_div(cfs_runtime_us, NSEC_PER_USEC);
+ return cfs_runtime_us;
+}
+
+int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
+{
+ u64 cfs_runtime, cfs_period;
+
+ cfs_period = (u64)cfs_period_us * NSEC_PER_USEC;
+ cfs_runtime = tg->cfs_bandwidth.cfs_runtime;
+
+ if (cfs_period == 0)
+ return -EINVAL;
+
+ return tg_set_cfs_bandwidth(tg, cfs_period, cfs_runtime);
+}
+
+long tg_get_cfs_period(struct task_group *tg)
+{
+ u64 cfs_period_us;
+
+ cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.cfs_period);
+ do_div(cfs_period_us, NSEC_PER_USEC);
+ return cfs_period_us;
+}
+
+int tg_set_hard_limit_enabled(struct task_group *tg, u64 val)
+{
+ spin_lock_irq(&tg->cfs_bandwidth.cfs_runtime_lock);
+ if (val > 0) {
+ tg->hard_limit_enabled = 1;
+ start_cfs_bandwidth(tg);
+ } else {
+ destroy_cfs_bandwidth(tg);
+ tg->hard_limit_enabled = 0;
+ }
+ spin_unlock_irq(&tg->cfs_bandwidth.cfs_runtime_lock);
+ return 0;
+}
+
+static s64 cpu_cfs_runtime_read_s64(struct cgroup *cgrp, struct cftype *cft)
+{
+ return tg_get_cfs_runtime(cgroup_tg(cgrp));
+}
+
+static int cpu_cfs_runtime_write_s64(struct cgroup *cgrp, struct cftype *cftype,
+ s64 cfs_runtime_us)
+{
+ return tg_set_cfs_runtime(cgroup_tg(cgrp), cfs_runtime_us);
+}
+
+static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
+{
+ return tg_get_cfs_period(cgroup_tg(cgrp));
+}
+
+static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
+ u64 cfs_period_us)
+{
+ return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
+}
+
+static u64 cpu_cfs_hard_limit_read_u64(struct cgroup *cgrp, struct cftype *cft)
+{
+ return cfs_bandwidth_enabled(cgroup_tg(cgrp));
+}
+
+static int cpu_cfs_hard_limit_write_u64(struct cgroup *cgrp,
+ struct cftype *cftype, u64 val)
+{
+ return tg_set_hard_limit_enabled(cgroup_tg(cgrp), val);
+}
+
+#endif /* CONFIG_CFS_HARD_LIMITS */
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
@@ -10317,6 +10579,23 @@ static struct cftype cpu_files[] = {
.read_u64 = cpu_shares_read_u64,
.write_u64 = cpu_shares_write_u64,
},
+#ifdef CONFIG_CFS_HARD_LIMITS
+ {
+ .name = "cfs_runtime_us",
+ .read_s64 = cpu_cfs_runtime_read_s64,
+ .write_s64 = cpu_cfs_runtime_write_s64,
+ },
+ {
+ .name = "cfs_period_us",
+ .read_u64 = cpu_cfs_period_read_u64,
+ .write_u64 = cpu_cfs_period_write_u64,
+ },
+ {
+ .name = "cfs_hard_limit",
+ .read_u64 = cpu_cfs_hard_limit_read_u64,
+ .write_u64 = cpu_cfs_hard_limit_write_u64,
+ },
+#endif /* CONFIG_CFS_HARD_LIMITS */
#endif
#ifdef CONFIG_RT_GROUP_SCHED
{
^ permalink raw reply [flat|nested] 11+ messages in thread* Re: [RFC v1 PATCH 3/7] sched: Bandwidth initialization for fair task groups
2009-08-25 9:49 ` [RFC v1 PATCH 3/7] sched: Bandwidth initialization for fair task groups Bharata B Rao
@ 2009-09-04 10:43 ` Andrea Righi
2009-09-04 12:32 ` Bharata B Rao
0 siblings, 1 reply; 11+ messages in thread
From: Andrea Righi @ 2009-09-04 10:43 UTC (permalink / raw)
To: Bharata B Rao
Cc: linux-kernel, Dhaval Giani, Balbir Singh, Vaidyanathan Srinivasan,
Gautham R Shenoy, Srivatsa Vaddagiri, Ingo Molnar, Peter Zijlstra,
Pavel Emelyanov, Herbert Poetzl, Avi Kivity, Chris Friesen,
Paul Menage, Mike Waychison
On Tue, Aug 25, 2009 at 03:19:50PM +0530, Bharata B Rao wrote:
> +config CFS_HARD_LIMITS
> + bool "Hard Limits for CFS Group Scheduler"
> + depends on EXPERIMENTAL
> + depends on FAIR_GROUP_SCHED
Shouldn't depend also on CGROUPS and CGROUP_SCHED? without them hard
limits can't be defined, right?
Signed-off-by: Andrea Righi <arighi@develer.com>
---
init/Kconfig | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/init/Kconfig b/init/Kconfig
index 71868a0..19c0290 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -459,7 +459,7 @@ config FAIR_GROUP_SCHED
config CFS_HARD_LIMITS
bool "Hard Limits for CFS Group Scheduler"
depends on EXPERIMENTAL
- depends on FAIR_GROUP_SCHED
+ depends on FAIR_GROUP_SCHED && CGROUPS && CGROUP_SCHED
default n
help
This option enables hard limiting of CPU time obtained by
^ permalink raw reply related [flat|nested] 11+ messages in thread
* Re: [RFC v1 PATCH 3/7] sched: Bandwidth initialization for fair task groups
2009-09-04 10:43 ` Andrea Righi
@ 2009-09-04 12:32 ` Bharata B Rao
2009-09-04 12:36 ` Andrea Righi
0 siblings, 1 reply; 11+ messages in thread
From: Bharata B Rao @ 2009-09-04 12:32 UTC (permalink / raw)
To: Andrea Righi
Cc: linux-kernel, Dhaval Giani, Balbir Singh, Vaidyanathan Srinivasan,
Gautham R Shenoy, Srivatsa Vaddagiri, Ingo Molnar, Peter Zijlstra,
Pavel Emelyanov, Herbert Poetzl, Avi Kivity, Chris Friesen,
Paul Menage, Mike Waychison
On Fri, Sep 04, 2009 at 12:43:25PM +0200, Andrea Righi wrote:
> On Tue, Aug 25, 2009 at 03:19:50PM +0530, Bharata B Rao wrote:
> > +config CFS_HARD_LIMITS
> > + bool "Hard Limits for CFS Group Scheduler"
> > + depends on EXPERIMENTAL
> > + depends on FAIR_GROUP_SCHED
>
> Shouldn't depend also on CGROUPS and CGROUP_SCHED? without them hard
> limits can't be defined, right?
Right, but do we need to explicitly mention CGROUPS as dependency since
CGROUP_SCHED is already dependent on it ?
>
> Signed-off-by: Andrea Righi <arighi@develer.com>
> ---
> init/Kconfig | 2 +-
> 1 files changed, 1 insertions(+), 1 deletions(-)
>
> diff --git a/init/Kconfig b/init/Kconfig
> index 71868a0..19c0290 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -459,7 +459,7 @@ config FAIR_GROUP_SCHED
> config CFS_HARD_LIMITS
> bool "Hard Limits for CFS Group Scheduler"
> depends on EXPERIMENTAL
> - depends on FAIR_GROUP_SCHED
> + depends on FAIR_GROUP_SCHED && CGROUPS && CGROUP_SCHED
> default n
> help
> This option enables hard limiting of CPU time obtained by
Thanks for looking at the patches.
Regards,
Bharata.
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [RFC v1 PATCH 3/7] sched: Bandwidth initialization for fair task groups
2009-09-04 12:32 ` Bharata B Rao
@ 2009-09-04 12:36 ` Andrea Righi
0 siblings, 0 replies; 11+ messages in thread
From: Andrea Righi @ 2009-09-04 12:36 UTC (permalink / raw)
To: Bharata B Rao
Cc: linux-kernel, Dhaval Giani, Balbir Singh, Vaidyanathan Srinivasan,
Gautham R Shenoy, Srivatsa Vaddagiri, Ingo Molnar, Peter Zijlstra,
Pavel Emelyanov, Herbert Poetzl, Avi Kivity, Chris Friesen,
Paul Menage, Mike Waychison
On Fri, Sep 04, 2009 at 06:02:15PM +0530, Bharata B Rao wrote:
> On Fri, Sep 04, 2009 at 12:43:25PM +0200, Andrea Righi wrote:
> > On Tue, Aug 25, 2009 at 03:19:50PM +0530, Bharata B Rao wrote:
> > > +config CFS_HARD_LIMITS
> > > + bool "Hard Limits for CFS Group Scheduler"
> > > + depends on EXPERIMENTAL
> > > + depends on FAIR_GROUP_SCHED
> >
> > Shouldn't depend also on CGROUPS and CGROUP_SCHED? without them hard
> > limits can't be defined, right?
>
> Right, but do we need to explicitly mention CGROUPS as dependency since
> CGROUP_SCHED is already dependent on it ?
Correct, CGROUP_SCHED is enough.
Thanks,
-Andrea
--
Andrea Righi - Develer s.r.l
http://www.develer.com
^ permalink raw reply [flat|nested] 11+ messages in thread
* [RFC v1 PATCH 4/7] sched: Enforce hard limits by throttling
2009-08-25 9:47 [RFC v1 PATCH 0/7] CFS Hard limits - v1 Bharata B Rao
` (2 preceding siblings ...)
2009-08-25 9:49 ` [RFC v1 PATCH 3/7] sched: Bandwidth initialization for fair task groups Bharata B Rao
@ 2009-08-25 9:50 ` Bharata B Rao
2009-08-25 9:51 ` [RFC v1 PATCH 5/7] sched: Unthrottle the throttled tasks Bharata B Rao
` (2 subsequent siblings)
6 siblings, 0 replies; 11+ messages in thread
From: Bharata B Rao @ 2009-08-25 9:50 UTC (permalink / raw)
To: linux-kernel
Cc: Dhaval Giani, Balbir Singh, Vaidyanathan Srinivasan,
Gautham R Shenoy, Srivatsa Vaddagiri, Ingo Molnar, Peter Zijlstra,
Pavel Emelyanov, Herbert Poetzl, Avi Kivity, Chris Friesen,
Paul Menage, Mike Waychison
sched: Enforce hard limits by throttling.
From: Bharata B Rao <bharata@linux.vnet.ibm.com>
Throttle the task-groups which exceed the runtime allocated to them.
Throttled group entities are removed from the run queue.
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
---
include/linux/sched.h | 1
kernel/sched.c | 32 ++++++++++
kernel/sched_debug.c | 2
kernel/sched_fair.c | 146 ++++++++++++++++++++++++++++++++++++++++++++++++--
4 files changed, 177 insertions(+), 4 deletions(-)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1124,6 +1124,7 @@ struct sched_entity {
u64 nr_failed_migrations_affine;
u64 nr_failed_migrations_running;
u64 nr_failed_migrations_hot;
+ u64 nr_failed_migrations_throttled;
u64 nr_forced_migrations;
u64 nr_forced2_migrations;
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1580,6 +1580,7 @@ update_group_shares_cpu(struct task_grou
}
}
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
/*
* Re-compute the task group their per cpu shares over the given domain.
* This needs to be done in a bottom-up fashion because the rq weight of a
@@ -1597,9 +1598,11 @@ static int tg_shares_up(struct task_grou
* If there are currently no tasks on the cpu pretend there
* is one of average load so that when a new task gets to
* run here it will not get delayed by group starvation.
+ * Also if the group is throttled on this cpu, pretend that
+ * it has no tasks.
*/
weight = tg->cfs_rq[i]->load.weight;
- if (!weight)
+ if (!weight || cfs_rq_throttled(tg->cfs_rq[i]))
weight = NICE_0_LOAD;
tg->cfs_rq[i]->rq_weight = weight;
@@ -1623,6 +1626,7 @@ static int tg_shares_up(struct task_grou
* Compute the cpu's hierarchical load factor for each task group.
* This needs to be done in a top-down fashion because the load of a child
* group is a fraction of its parents load.
+ * A throttled group's h_load is set to 0.
*/
static int tg_load_down(struct task_group *tg, void *data)
{
@@ -1631,6 +1635,8 @@ static int tg_load_down(struct task_grou
if (!tg->parent) {
load = cpu_rq(cpu)->load.weight;
+ } else if (cfs_rq_throttled(tg->cfs_rq[cpu])) {
+ load = 0;
} else {
load = tg->parent->cfs_rq[cpu]->h_load;
load *= tg->cfs_rq[cpu]->shares;
@@ -1808,6 +1814,8 @@ static inline u64 global_cfs_runtime(voi
return RUNTIME_INF;
}
+int task_group_throttled(struct task_group *tg, int cpu);
+
static inline int cfs_bandwidth_enabled(struct task_group *tg)
{
return tg->hard_limit_enabled;
@@ -1892,7 +1900,18 @@ static void init_cfs_hard_limits(struct
return;
}
+int task_group_throttled(struct task_group *tg, int cpu)
+{
+ return 0;
+}
+
#endif /* CONFIG_CFS_HARD_LIMITS */
+
+#else /* CONFIG_FAIR_GROUP_SCHED */
+int task_group_throttled(struct task_group *tg, int cpu)
+{
+ return 0;
+}
#endif /* CONFIG_FAIR_GROUP_SCHED */
#include "sched_stats.h"
@@ -3364,6 +3383,7 @@ int can_migrate_task(struct task_struct
* 1) running (obviously), or
* 2) cannot be migrated to this CPU due to cpus_allowed, or
* 3) are cache-hot on their current CPU.
+ * 4) end up in throttled task groups on this CPU.
*/
if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
schedstat_inc(p, se.nr_failed_migrations_affine);
@@ -3377,6 +3397,16 @@ int can_migrate_task(struct task_struct
}
/*
+ * Don't migrate the task if
+ * - it belongs to a group which is throttled on this_cpu or
+ * - it belongs to a group whose hierarchy is throttled on this_cpu
+ */
+ if (task_group_throttled(task_group(p), this_cpu)) {
+ schedstat_inc(p, se.nr_failed_migrations_throttled);
+ return 0;
+ }
+
+ /*
* Aggressive migration if:
* 1) task is cache cold, or
* 2) too many balance attempts have failed.
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -417,6 +417,7 @@ void proc_sched_show_task(struct task_st
P(se.nr_failed_migrations_affine);
P(se.nr_failed_migrations_running);
P(se.nr_failed_migrations_hot);
+ P(se.nr_failed_migrations_throttled);
P(se.nr_forced_migrations);
P(se.nr_forced2_migrations);
P(se.nr_wakeups);
@@ -491,6 +492,7 @@ void proc_sched_set_task(struct task_str
p->se.nr_failed_migrations_affine = 0;
p->se.nr_failed_migrations_running = 0;
p->se.nr_failed_migrations_hot = 0;
+ p->se.nr_failed_migrations_throttled = 0;
p->se.nr_forced_migrations = 0;
p->se.nr_forced2_migrations = 0;
p->se.nr_wakeups = 0;
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -186,6 +186,89 @@ find_matching_se(struct sched_entity **s
}
}
+#ifdef CONFIG_CFS_HARD_LIMITS
+
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->cfs_throttled;
+}
+
+/*
+ * Check if group entity exceeded its runtime. If so, mark the cfs_rq as
+ * throttled and mark the current task for rescheduling.
+ */
+static void sched_cfs_runtime_exceeded(struct sched_entity *se,
+ struct task_struct *tsk_curr, unsigned long delta_exec)
+{
+ struct cfs_rq *cfs_rq;
+
+ cfs_rq = group_cfs_rq(se);
+
+ if (!cfs_bandwidth_enabled(cfs_rq->tg))
+ return;
+
+ if (cfs_rq->cfs_runtime == RUNTIME_INF)
+ return;
+
+ cfs_rq->cfs_time += delta_exec;
+
+ if (cfs_rq_throttled(cfs_rq))
+ return;
+
+ if (cfs_rq->cfs_time > cfs_rq->cfs_runtime) {
+ cfs_rq->cfs_throttled = 1;
+ resched_task(tsk_curr);
+ }
+}
+
+/*
+ * Check if the entity is throttled.
+ */
+static int entity_throttled(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq;
+
+ /* Only group entities can be throttled */
+ if (entity_is_task(se))
+ return 0;
+
+ cfs_rq = group_cfs_rq(se);
+ if (cfs_rq_throttled(cfs_rq))
+ return 1;
+ return 0;
+}
+
+int task_group_throttled(struct task_group *tg, int cpu)
+{
+ struct sched_entity *se = tg->se[cpu];
+
+ for_each_sched_entity(se) {
+ if (entity_throttled(se))
+ return 1;
+ }
+ return 0;
+}
+
+#else
+
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+{
+ return 0;
+}
+
+static void sched_cfs_runtime_exceeded(struct sched_entity *se,
+ struct task_struct *tsk_curr, unsigned long delta_exec)
+{
+ return;
+}
+
+static int entity_throttled(struct sched_entity *se)
+{
+ return 0;
+}
+
+#endif /* CONFIG_CFS_HARD_LIMITS */
+
#else /* CONFIG_FAIR_GROUP_SCHED */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
@@ -241,6 +324,17 @@ find_matching_se(struct sched_entity **s
{
}
+static void sched_cfs_runtime_exceeded(struct sched_entity *se,
+ struct task_struct *tsk_curr, unsigned long delta_exec)
+{
+ return;
+}
+
+static int entity_throttled(struct sched_entity *se)
+{
+ return 0;
+}
+
#endif /* CONFIG_FAIR_GROUP_SCHED */
static void add_cfs_rq_tasks_running(struct sched_entity *se,
@@ -505,7 +599,9 @@ __update_curr(struct cfs_rq *cfs_rq, str
static void update_curr(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
- u64 now = rq_of(cfs_rq)->clock;
+ struct rq *rq = rq_of(cfs_rq);
+ struct task_struct *tsk_curr = rq->curr;
+ u64 now = rq->clock;
unsigned long delta_exec;
if (unlikely(!curr))
@@ -528,6 +624,8 @@ static void update_curr(struct cfs_rq *c
cpuacct_charge(curtask, delta_exec);
account_group_exec_runtime(curtask, delta_exec);
+ } else {
+ sched_cfs_runtime_exceeded(curr, tsk_curr, delta_exec);
}
}
@@ -865,8 +963,40 @@ static struct sched_entity *pick_next_en
return se;
}
+/*
+ * Called from put_prev_entity()
+ * If a group entity (@se) is found to be throttled, it will not be put back
+ * on @cfs_rq, which is equivalent to dequeuing it.
+ */
+static void dequeue_throttled_entity(struct cfs_rq *cfs_rq,
+ struct sched_entity *se)
+{
+ unsigned long nr_tasks = group_cfs_rq(se)->nr_tasks_running;
+
+ __clear_buddies(cfs_rq, se);
+ account_entity_dequeue(cfs_rq, se);
+ cfs_rq->curr = NULL;
+
+ if (!nr_tasks)
+ return;
+
+ /*
+ * Decrement the number of tasks this entity has from
+ * all of its parent entities.
+ */
+ sub_cfs_rq_tasks_running(se, nr_tasks);
+
+ /*
+ * Decrement the number of tasks this entity has from
+ * this cpu's rq.
+ */
+ rq_of(cfs_rq)->nr_running -= nr_tasks;
+}
+
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
+ struct cfs_rq *gcfs_rq = group_cfs_rq(prev);
+
/*
* If still on the runqueue then deactivate_task()
* was not called and update_curr() has to be done:
@@ -876,6 +1006,15 @@ static void put_prev_entity(struct cfs_r
check_spread(cfs_rq, prev);
if (prev->on_rq) {
+ /*
+ * If the group entity is throttled or if it has
+ * no child entities, then don't enqueue it back.
+ */
+ if (entity_throttled(prev) ||
+ (gcfs_rq && !gcfs_rq->nr_running)) {
+ dequeue_throttled_entity(cfs_rq, prev);
+ return;
+ }
update_stats_wait_start(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
@@ -1541,6 +1680,7 @@ static struct task_struct *pick_next_tas
do {
se = pick_next_entity(cfs_rq);
+
/*
* If se was a buddy, clear it so that it will have to earn
* the favour again.
@@ -1650,9 +1790,9 @@ load_balance_fair(struct rq *this_rq, in
u64 rem_load, moved_load;
/*
- * empty group
+ * empty group or a group with no h_load (throttled)
*/
- if (!busiest_cfs_rq->task_weight)
+ if (!busiest_cfs_rq->task_weight || !busiest_h_load)
continue;
rem_load = (u64)rem_load_move * busiest_weight;
^ permalink raw reply [flat|nested] 11+ messages in thread* [RFC v1 PATCH 5/7] sched: Unthrottle the throttled tasks
2009-08-25 9:47 [RFC v1 PATCH 0/7] CFS Hard limits - v1 Bharata B Rao
` (3 preceding siblings ...)
2009-08-25 9:50 ` [RFC v1 PATCH 4/7] sched: Enforce hard limits by throttling Bharata B Rao
@ 2009-08-25 9:51 ` Bharata B Rao
2009-08-25 9:51 ` [RFC v1 PATCH 6/7] sched: Add throttle time statistics to /proc/sched_debug Bharata B Rao
2009-08-25 9:53 ` [RFC v1 PATCH 7/7] sched: Hard limits documentation Bharata B Rao
6 siblings, 0 replies; 11+ messages in thread
From: Bharata B Rao @ 2009-08-25 9:51 UTC (permalink / raw)
To: linux-kernel
Cc: Dhaval Giani, Balbir Singh, Vaidyanathan Srinivasan,
Gautham R Shenoy, Srivatsa Vaddagiri, Ingo Molnar, Peter Zijlstra,
Pavel Emelyanov, Herbert Poetzl, Avi Kivity, Chris Friesen,
Paul Menage, Mike Waychison
sched: Unthrottle the throttled tasks.
From: Bharata B Rao <bharata@linux.vnet.ibm.com>
Refresh runtimes when group's bandwidth period expires. Unthrottle any
throttled groups at that time. Refreshing runtimes is driven through
a periodic timer.
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
---
kernel/sched.c | 8 +++++
kernel/sched_fair.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 87 insertions(+)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1815,6 +1815,7 @@ static inline u64 global_cfs_runtime(voi
}
int task_group_throttled(struct task_group *tg, int cpu);
+void do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b);
static inline int cfs_bandwidth_enabled(struct task_group *tg)
{
@@ -1830,6 +1831,7 @@ static enum hrtimer_restart sched_cfs_pe
struct cfs_bandwidth *cfs_b =
container_of(timer, struct cfs_bandwidth, cfs_period_timer);
+ do_sched_cfs_period_timer(cfs_b);
hrtimer_add_expires_ns(timer, ktime_to_ns(cfs_b->cfs_period));
return HRTIMER_RESTART;
}
@@ -10536,6 +10538,12 @@ int tg_set_hard_limit_enabled(struct tas
start_cfs_bandwidth(tg);
} else {
destroy_cfs_bandwidth(tg);
+ /*
+ * Hard limiting is being disabled for this group.
+ * Refresh runtimes and put the throttled entities
+ * of the group back onto runqueue.
+ */
+ do_sched_cfs_period_timer(&tg->cfs_bandwidth);
tg->hard_limit_enabled = 0;
}
spin_unlock_irq(&tg->cfs_bandwidth.cfs_runtime_lock);
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -249,6 +249,78 @@ int task_group_throttled(struct task_gro
return 0;
}
+static void
+enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup);
+static void add_cfs_rq_tasks_running(struct sched_entity *se,
+ unsigned long count);
+static void sub_cfs_rq_tasks_running(struct sched_entity *se,
+ unsigned long count);
+
+static void enqueue_throttled_entity(struct rq *rq, struct sched_entity *se)
+{
+ unsigned long nr_tasks = 0;
+ struct sched_entity *se_tmp = se;
+ int throttled = 0;
+
+ for_each_sched_entity(se) {
+ if (se->on_rq)
+ break;
+
+ if (entity_throttled(se)) {
+ throttled = 1;
+ break;
+ }
+
+ enqueue_entity(cfs_rq_of(se), se, 0);
+ nr_tasks += group_cfs_rq(se)->nr_tasks_running;
+ }
+
+ if (!nr_tasks)
+ return;
+
+ /*
+ * Add the number of tasks this entity has to
+ * all of its parent entities.
+ */
+ add_cfs_rq_tasks_running(se_tmp, nr_tasks);
+
+ /*
+ * Add the number of tasks this entity has to
+ * this cpu's rq only if the entity got enqueued all the
+ * way up without any throttled entity in the hierarchy.
+ */
+ if (!throttled)
+ rq->nr_running += nr_tasks;
+}
+
+/*
+ * Refresh runtimes of all cfs_rqs in this group, i.e.,
+ * refresh runtimes of the representative cfs_rq of this
+ * tg on all cpus. Enqueue any throttled entity back.
+ */
+void do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b)
+{
+ int i;
+ const struct cpumask *span = sched_bw_period_mask();
+ struct task_group *tg = container_of(cfs_b, struct task_group,
+ cfs_bandwidth);
+ unsigned long flags;
+
+ for_each_cpu(i, span) {
+ struct rq *rq = cpu_rq(i);
+ struct cfs_rq *cfs_rq = tg->cfs_rq[i];
+ struct sched_entity *se = tg->se[i];
+
+ spin_lock_irqsave(&rq->lock, flags);
+ cfs_rq->cfs_time = 0;
+ if (cfs_rq_throttled(cfs_rq)) {
+ cfs_rq->cfs_throttled = 0;
+ enqueue_throttled_entity(rq, se);
+ }
+ spin_unlock_irqrestore(&rq->lock, flags);
+ }
+}
+
#else
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
@@ -343,6 +415,13 @@ static void add_cfs_rq_tasks_running(str
struct cfs_rq *cfs_rq;
for_each_sched_entity(se) {
+ /*
+ * If any entity in the hierarchy is throttled, don't
+ * propagate the tasks count up since this entity isn't
+ * on rq yet.
+ */
+ if (entity_throttled(se))
+ break;
cfs_rq = cfs_rq_of(se);
cfs_rq->nr_tasks_running += count;
}
^ permalink raw reply [flat|nested] 11+ messages in thread* [RFC v1 PATCH 6/7] sched: Add throttle time statistics to /proc/sched_debug
2009-08-25 9:47 [RFC v1 PATCH 0/7] CFS Hard limits - v1 Bharata B Rao
` (4 preceding siblings ...)
2009-08-25 9:51 ` [RFC v1 PATCH 5/7] sched: Unthrottle the throttled tasks Bharata B Rao
@ 2009-08-25 9:51 ` Bharata B Rao
2009-08-25 9:53 ` [RFC v1 PATCH 7/7] sched: Hard limits documentation Bharata B Rao
6 siblings, 0 replies; 11+ messages in thread
From: Bharata B Rao @ 2009-08-25 9:51 UTC (permalink / raw)
To: linux-kernel
Cc: Dhaval Giani, Balbir Singh, Vaidyanathan Srinivasan,
Gautham R Shenoy, Srivatsa Vaddagiri, Ingo Molnar, Peter Zijlstra,
Pavel Emelyanov, Herbert Poetzl, Avi Kivity, Chris Friesen,
Paul Menage, Mike Waychison
sched: Add throttle time statistics to /proc/sched_debug
From: Bharata B Rao <bharata@linux.vnet.ibm.com>
With hard limits, provide stats about throttle time, throttle count
and max throttle time for group sched entities in /proc/sched_debug
Throttle stats are collected only for group entities.
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
---
include/linux/sched.h | 6 ++++++
kernel/sched_debug.c | 17 ++++++++++++++++-
kernel/sched_fair.c | 20 ++++++++++++++++++++
3 files changed, 42 insertions(+), 1 deletion(-)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1137,6 +1137,12 @@ struct sched_entity {
u64 nr_wakeups_affine_attempts;
u64 nr_wakeups_passive;
u64 nr_wakeups_idle;
+#ifdef CONFIG_CFS_HARD_LIMITS
+ u64 throttle_start;
+ u64 throttle_max;
+ u64 throttle_count;
+ u64 throttle_sum;
+#endif
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -80,6 +80,11 @@ static void print_cfs_group_stats(struct
PN(se->wait_max);
PN(se->wait_sum);
P(se->wait_count);
+#ifdef CONFIG_CFS_HARD_LIMITS
+ PN(se->throttle_max);
+ PN(se->throttle_sum);
+ P(se->throttle_count);
+#endif
#endif
P(se->load.weight);
#undef PN
@@ -216,6 +221,16 @@ void print_cfs_rq(struct seq_file *m, in
#endif
SEQ_printf(m, " .%-30s: %ld\n", "nr_tasks_running",
cfs_rq->nr_tasks_running);
+#ifdef CONFIG_CFS_HARD_LIMITS
+ spin_lock_irqsave(&rq->lock, flags);
+ SEQ_printf(m, " .%-30s: %d\n", "cfs_throttled",
+ cfs_rq->cfs_throttled);
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "cfs_time",
+ SPLIT_NS(cfs_rq->cfs_time));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "cfs_runtime",
+ SPLIT_NS(cfs_rq->cfs_runtime));
+ spin_unlock_irqrestore(&rq->lock, flags);
+#endif
print_cfs_group_stats(m, cpu, cfs_rq->tg);
#endif
}
@@ -312,7 +327,7 @@ static int sched_debug_show(struct seq_f
u64 now = ktime_to_ns(ktime_get());
int cpu;
- SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n",
+ SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n",
init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
init_utsname()->version);
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -188,6 +188,23 @@ find_matching_se(struct sched_entity **s
#ifdef CONFIG_CFS_HARD_LIMITS
+static inline void update_stats_throttle_start(struct cfs_rq *cfs_rq,
+ struct sched_entity *se)
+{
+ schedstat_set(se->throttle_start, rq_of(cfs_rq)->clock);
+}
+
+static inline void update_stats_throttle_end(struct cfs_rq *cfs_rq,
+ struct sched_entity *se)
+{
+ schedstat_set(se->throttle_max, max(se->throttle_max,
+ rq_of(cfs_rq)->clock - se->throttle_start));
+ schedstat_set(se->throttle_count, se->throttle_count + 1);
+ schedstat_set(se->throttle_sum, se->throttle_sum +
+ rq_of(cfs_rq)->clock - se->throttle_start);
+ schedstat_set(se->throttle_start, 0);
+}
+
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
return cfs_rq->cfs_throttled;
@@ -217,6 +234,7 @@ static void sched_cfs_runtime_exceeded(s
if (cfs_rq->cfs_time > cfs_rq->cfs_runtime) {
cfs_rq->cfs_throttled = 1;
+ update_stats_throttle_start(cfs_rq, se);
resched_task(tsk_curr);
}
}
@@ -314,6 +332,8 @@ void do_sched_cfs_period_timer(struct cf
spin_lock_irqsave(&rq->lock, flags);
cfs_rq->cfs_time = 0;
if (cfs_rq_throttled(cfs_rq)) {
+ update_rq_clock(rq);
+ update_stats_throttle_end(cfs_rq, se);
cfs_rq->cfs_throttled = 0;
enqueue_throttled_entity(rq, se);
}
^ permalink raw reply [flat|nested] 11+ messages in thread* [RFC v1 PATCH 7/7] sched: Hard limits documentation
2009-08-25 9:47 [RFC v1 PATCH 0/7] CFS Hard limits - v1 Bharata B Rao
` (5 preceding siblings ...)
2009-08-25 9:51 ` [RFC v1 PATCH 6/7] sched: Add throttle time statistics to /proc/sched_debug Bharata B Rao
@ 2009-08-25 9:53 ` Bharata B Rao
6 siblings, 0 replies; 11+ messages in thread
From: Bharata B Rao @ 2009-08-25 9:53 UTC (permalink / raw)
To: linux-kernel
Cc: Dhaval Giani, Balbir Singh, Vaidyanathan Srinivasan,
Gautham R Shenoy, Srivatsa Vaddagiri, Ingo Molnar, Peter Zijlstra,
Pavel Emelyanov, Herbert Poetzl, Avi Kivity, Chris Friesen,
Paul Menage, Mike Waychison
sched: Hard limits documentation
From: Bharata B Rao <bharata@linux.vnet.ibm.com>
Documentation for hard limits feature.
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
---
Documentation/scheduler/sched-cfs-hard-limits.txt | 52 ++++++++++++++++++++++
1 file changed, 52 insertions(+)
create mode 100644 Documentation/scheduler/sched-cfs-hard-limits.txt
--- /dev/null
+++ b/Documentation/scheduler/sched-cfs-hard-limits.txt
@@ -0,0 +1,52 @@
+CPU HARD LIMITS FOR CFS GROUPS
+==============================
+
+1. Overview
+2. Interface
+3. Examples
+
+1. Overview
+-----------
+
+CFS is a proportional share scheduler which tries to divide the CPU time
+proportionately between tasks or groups of tasks (task group/cgroup) depending
+on the priority/weight of the task or shares assigned to groups of tasks.
+In CFS, a task/task group can get more than its share of CPU if there are
+enough idle CPU cycles available in the system, due to the work conserving
+nature of the scheduler. However in certain scenarios (like pay-per-use),
+it is desirable not to provide extra time to a group even in the presence
+of idle CPU cycles. This is where hard limiting can be of use.
+
+Hard limits for task groups can be set by specifying how much CPU runtime a
+group can consume within a given period. If the group consumes more CPU time
+than the runtime in a given period, it gets throttled. None of the tasks of
+the throttled group gets to run until the runtime of the group gets refreshed
+at the beginning of the next period.
+
+2. Interface
+------------
+
+Hard limit feature adds 3 cgroup files for CFS group scheduler:
+
+cfs_runtime_us: Hard limit for the group in microseconds.
+
+cfs_period_us: Time period in microseconds within which the hard limit is
+enforced.
+
+cfs_hard_limit: The control file to enable or disable hard limiting for the
+group.
+
+A group gets created with default values for runtime and period and with
+hard limit disabled. Each group can set its own values for runtime and period
+independent of other groups in the system.
+
+3. Examples
+-----------
+
+# mount -t cgroup -ocpu none /cgroups/
+# cd /cgroups
+# mkdir 1
+# cd 1/
+# echo 250000 > cfs_runtime_us /* set a 250ms runtime or limit */
+# echo 500000 > cfs_period_us /* set a 500ms period */
+# echo 1 > cfs_hard_limit /* enable hard limiting for group 1/ */
^ permalink raw reply [flat|nested] 11+ messages in thread