From: Peter Zijlstra <a.p.zijlstra@chello.nl>
To: LKML <linux-kernel@vger.kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>,
Balbir Singh <balbir@linux.vnet.ibm.com>,
dmitry.adamushko@gmail.com,
Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>,
Steven Rostedt <rostedt@goodmis.org>,
Gregory Haskins <ghaskins@novell.com>,
Peter Zijlstra <a.p.zijlstra@chello.nl>,
Thomas Gleixner <tglx@linutronix.de>
Subject: [PATCH 07/11] sched: rt-group: per group period
Date: Sun, 06 Jan 2008 17:11:35 +0100 [thread overview]
Message-ID: <20080106162124.111762000@chello.nl> (raw)
In-Reply-To: 20080106161128.152634000@chello.nl
[-- Attachment #1: sched-rt-rq-hrtimer.patch --]
[-- Type: text/plain, Size: 14233 bytes --]
Steven asked for per group periods in order to get closer to RMA or EDF
scheduling.
Use the fancy new hrtimers to provide a per-group period.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/sched.h | 2
kernel/sched.c | 229 ++++++++++++++++++++++++++++++++++++++++++-----
kernel/sched_rt.c | 61 ++++++------
kernel/sysctl.c | 2
kernel/time/tick-sched.c | 5 -
5 files changed, 237 insertions(+), 62 deletions(-)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -177,6 +177,7 @@ struct task_group {
struct rt_rq **rt_rq;
unsigned int rt_ratio;
+ ktime_t rt_period;
/*
* shares assigned to a task group governs how much of cpu bandwidth
@@ -372,6 +373,7 @@ struct rt_rq {
#endif
int rt_throttled;
u64 rt_time;
+ struct hrtimer rt_period_timer;
#ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq;
@@ -441,8 +443,6 @@ struct rq {
struct cfs_rq cfs;
struct rt_rq rt;
- u64 rt_period_expire;
- int rt_throttled;
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
@@ -595,23 +595,6 @@ static void update_rq_clock(struct rq *r
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
-unsigned long rt_needs_cpu(int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
- u64 delta;
-
- if (!rq->rt_throttled)
- return 0;
-
- if (rq->clock > rq->rt_period_expire)
- return 1;
-
- delta = rq->rt_period_expire - rq->clock;
- do_div(delta, NSEC_PER_SEC / HZ);
-
- return (unsigned long)delta;
-}
-
/*
* Tunables that become constants when CONFIG_SCHED_DEBUG is off:
*/
@@ -652,10 +635,10 @@ const_debug unsigned int sysctl_sched_fe
const_debug unsigned int sysctl_sched_nr_migrate = 32;
/*
- * period over which we measure -rt task cpu usage in ms.
+ * period over which we measure -rt task cpu usage in us.
* default: 1s
*/
-const_debug unsigned int sysctl_sched_rt_period = 1000;
+const_debug unsigned int sysctl_sched_rt_period = 1000000;
#define SCHED_RT_FRAC_SHIFT 16
#define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT)
@@ -1245,6 +1228,12 @@ static unsigned long cpu_avg_load_per_ta
static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
#endif /* CONFIG_SMP */
+/*
+ * Build a ktime_t that represents @ns nanoseconds by adding @ns to a
+ * zero-valued ktime.
+ */
+static inline ktime_t ns_to_ktime(u64 ns)
+{
+	const ktime_t origin = { .tv64 = 0 };
+
+	return ktime_add_ns(origin, ns);
+}
+
#include "sched_stats.h"
#include "sched_idletask.c"
#include "sched_fair.c"
@@ -3741,7 +3730,6 @@ void scheduler_tick(void)
rq->tick_timestamp = rq->clock;
update_cpu_load(rq);
curr->sched_class->task_tick(rq, curr, 0);
- update_sched_rt_period(rq);
spin_unlock(&rq->lock);
#ifdef CONFIG_SMP
@@ -5287,6 +5275,158 @@ static inline void sched_init_granularit
sysctl_sched_batch_wakeup_granularity *= factor;
}
+/*
+ * hrtimer callback of an rt_rq's period timer.
+ *
+ * With rq->lock held, runs update_sched_rt_period() for the rt_rq that
+ * embeds @timer, then forwards the timer by the rt_rq's period relative to
+ * the time sampled on entry and requests a restart.
+ *
+ * Warns when it does not run on the rq's own CPU or outside hard-irq context.
+ */
+static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
+{
+	struct rt_rq *rt_rq;
+	struct rq *rq;
+	ktime_t now;
+
+	rt_rq = container_of(timer, struct rt_rq, rt_period_timer);
+	rq = rq_of_rt_rq(rt_rq);
+	now = ktime_get();
+
+	WARN_ON(smp_processor_id() != cpu_of(rq));
+	WARN_ON(!in_irq());
+
+	spin_lock(&rq->lock);
+	update_sched_rt_period(rt_rq);
+	spin_unlock(&rq->lock);
+
+	hrtimer_forward(timer, now, sched_rt_period(rt_rq));
+
+	return HRTIMER_RESTART;
+}
+
+/*
+ * Arm the period timer of @rt_rq.
+ *
+ * Forwards the timer by the rt_rq's period relative to the current time and
+ * starts it at the resulting absolute expiry, repeating until the timer
+ * reports itself active. Warns when called on a CPU other than the one
+ * owning @rt_rq.
+ */
+static void sched_rt_period_start(struct rt_rq *rt_rq)
+{
+	struct hrtimer *timer = &rt_rq->rt_period_timer;
+	ktime_t period = sched_rt_period(rt_rq);
+
+	WARN_ON(smp_processor_id() != cpu_of(rq_of_rt_rq(rt_rq)));
+
+	do {
+		ktime_t now = ktime_get();
+
+		hrtimer_forward(timer, now, period);
+		hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
+	} while (!hrtimer_active(timer));
+}
+
+#if defined(CONFIG_SMP) || defined(CONFIG_FAIR_GROUP_SCHED)
+/*
+ * Cancel the period timer of @rt_rq.
+ */
+static void sched_rt_period_stop(struct rt_rq *rt_rq)
+{
+	struct hrtimer *timer = &rt_rq->rt_period_timer;
+
+	hrtimer_cancel(timer);
+}
+#endif
+
+/*
+ * Start the period timer of every leaf rt_rq on @cpu's runqueue.
+ */
+static void sched_rt_period_start_cpu(int cpu)
+{
+	struct rq *cpu_runq = cpu_rq(cpu);
+	struct rt_rq *leaf;
+
+	for_each_leaf_rt_rq(leaf, cpu_runq) {
+		sched_rt_period_start(leaf);
+	}
+}
+
+#ifdef CONFIG_SMP
+/*
+ * Stop the period timer of every leaf rt_rq on @cpu's runqueue.
+ */
+static void sched_rt_period_stop_cpu(int cpu)
+{
+	struct rq *cpu_runq = cpu_rq(cpu);
+	struct rt_rq *leaf;
+
+	for_each_leaf_rt_rq(leaf, cpu_runq) {
+		sched_rt_period_stop(leaf);
+	}
+}
+
+/*
+ * CPU hotplug notifier for the rt period timers.
+ *
+ * Starts the period timers of @hcpu's leaf rt_rqs when the CPU is being
+ * prepared for bring-up or a take-down failed, and stops them when the CPU
+ * is being prepared for take-down or a bring-up was canceled. CPU_ONLINE
+ * and CPU_DEAD are acknowledged without any work.
+ *
+ * Returns NOTIFY_OK for every action listed above, NOTIFY_DONE otherwise.
+ *
+ * NOTE(review): sched_rt_period_start() WARNs unless it runs on the CPU
+ * that owns the rt_rq; this notifier presumably runs on the CPU driving the
+ * hotplug operation rather than on @hcpu -- confirm.
+ */
+static int sched_rt_period_hotplug(struct notifier_block *nfb,
+		unsigned long action, void *hcpu)
+{
+	int cpu = (unsigned long)hcpu;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+	case CPU_DOWN_FAILED:
+	case CPU_DOWN_FAILED_FROZEN:
+		sched_rt_period_start_cpu(cpu);
+		break;
+
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+		sched_rt_period_stop_cpu(cpu);
+		break;
+
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		/* nothing to do, but the event is ours */
+		break;
+
+	default:
+		return NOTIFY_DONE;
+	}
+
+	/* single exit for all handled actions; previously dead code */
+	return NOTIFY_OK;
+}
+
+/*
+ * Per-CPU init callback: start the period timers of the CPU it runs on.
+ * @arg is unused.
+ */
+static void __init __sched_rt_period_init(void *arg)
+{
+	sched_rt_period_start_cpu(smp_processor_id());
+}
+
+/*
+ * Boot-time setup of the rt period timers (SMP): run
+ * __sched_rt_period_init() through on_each_cpu(), then register
+ * sched_rt_period_hotplug() as a CPU hotplug notifier.
+ */
+static void __init sched_rt_period_init(void)
+{
+ on_each_cpu(__sched_rt_period_init, NULL, 0, 1);
+ hotcpu_notifier(sched_rt_period_hotplug, 0);
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * Per-CPU callback: start the period timer of the rt_rq that the task
+ * group passed in @arg owns on the CPU this runs on.
+ */
+static void __sched_rt_period_init_tg(void *arg)
+{
+	struct task_group *tg = arg;
+
+	sched_rt_period_start(tg->rt_rq[smp_processor_id()]);
+}
+
+/*
+ * Start the period timers of @tg (SMP) by running
+ * __sched_rt_period_init_tg() through on_each_cpu().
+ */
+static void sched_rt_period_init_tg(struct task_group *tg)
+{
+ on_each_cpu(__sched_rt_period_init_tg, tg, 0, 1);
+}
+
+/*
+ * Per-CPU callback: stop the period timer of the rt_rq that the task
+ * group passed in @arg owns on the CPU this runs on.
+ */
+static void __sched_rt_period_destroy_tg(void *arg)
+{
+	struct task_group *tg = arg;
+
+	sched_rt_period_stop(tg->rt_rq[smp_processor_id()]);
+}
+
+/*
+ * Stop the period timers of @tg (SMP) by running
+ * __sched_rt_period_destroy_tg() through on_each_cpu().
+ */
+static void sched_rt_period_destroy_tg(struct task_group *tg)
+{
+ on_each_cpu(__sched_rt_period_destroy_tg, tg, 0, 1);
+}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+#else /* CONFIG_SMP */
+/*
+ * Boot-time setup of the rt period timers (!SMP): start them for CPU 0.
+ */
+static void __init sched_rt_period_init(void)
+{
+ sched_rt_period_start_cpu(0);
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * Start the period timer of @tg's rt_rq for CPU 0 (!SMP).
+ */
+static void sched_rt_period_init_tg(struct task_group *tg)
+{
+ sched_rt_period_start(tg->rt_rq[0]);
+}
+
+/*
+ * Stop the period timer of @tg's rt_rq for CPU 0 (!SMP).
+ */
+static void sched_rt_period_destroy_tg(struct task_group *tg)
+{
+ sched_rt_period_stop(tg->rt_rq[0]);
+}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* CONFIG_SMP */
+
#ifdef CONFIG_SMP
/*
* This is how migration works:
@@ -7068,6 +7208,7 @@ void __init sched_init_smp(void)
if (set_cpus_allowed(current, non_isolated_cpus) < 0)
BUG();
sched_init_granularity();
+ sched_rt_period_init();
#ifdef CONFIG_FAIR_GROUP_SCHED
if (nr_cpu_ids == 1)
@@ -7088,6 +7229,7 @@ void __init sched_init_smp(void)
void __init sched_init_smp(void)
{
sched_init_granularity();
+ sched_rt_period_init();
}
#endif /* CONFIG_SMP */
@@ -7131,6 +7273,11 @@ static void init_rt_rq(struct rt_rq *rt_
rt_rq->rt_time = 0;
rt_rq->rt_throttled = 0;
+ hrtimer_init(&rt_rq->rt_period_timer,
+ CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ rt_rq->rt_period_timer.function = sched_rt_period_timer;
+ rt_rq->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+
#ifdef CONFIG_FAIR_GROUP_SCHED
rt_rq->rq = rq;
#endif
@@ -7201,6 +7348,8 @@ void __init sched_init(void)
&per_cpu(init_sched_entity, i), i, 1);
init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */
+ /* widen to u64 before scaling so the us -> ns product cannot wrap */
+ init_task_group.rt_period =
+ ns_to_ktime((u64)sysctl_sched_rt_period * NSEC_PER_USEC);
INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
init_tg_rt_entry(rq, &init_task_group,
&per_cpu(init_rt_rq, i),
@@ -7208,8 +7357,6 @@ void __init sched_init(void)
list_add(&init_task_group.list, &task_groups);
#endif
- rq->rt_period_expire = 0;
- rq->rt_throttled = 0;
for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
rq->cpu_load[j] = 0;
@@ -7598,6 +7745,7 @@ struct task_group *sched_create_group(vo
tg->shares = NICE_0_LOAD;
tg->rt_ratio = 0; /* XXX */
+ /* widen to u64 before scaling so the us -> ns product cannot wrap */
+ tg->rt_period = ns_to_ktime((u64)sysctl_sched_rt_period * NSEC_PER_USEC);
for_each_possible_cpu(i) {
rq = cpu_rq(i);
@@ -7637,6 +7785,8 @@ struct task_group *sched_create_group(vo
list_add_rcu(&tg->list, &task_groups);
unlock_task_group_list();
+ sched_rt_period_init_tg(tg);
+
return tg;
err:
@@ -7658,6 +7808,8 @@ void sched_destroy_group(struct task_gro
struct rt_rq *rt_rq = NULL;
int i;
+ sched_rt_period_destroy_tg(tg);
+
lock_task_group_list();
for_each_possible_cpu(i) {
cfs_rq = tg->cfs_rq[i];
@@ -7815,6 +7967,19 @@ unsigned long sched_group_rt_ratio(struc
return tg->rt_ratio;
}
+/*
+ * Set the rt period of @tg.
+ *
+ * @rt_period is given in microseconds and stored as a ktime_t.
+ *
+ * Returns 0 on success, or -EINVAL when @rt_period is zero: a zero period
+ * makes the per-period budget computed from it zero as well, and would have
+ * the period timer forwarded by a zero interval.
+ */
+int sched_group_set_rt_period(struct task_group *tg, unsigned long rt_period)
+{
+	if (!rt_period)
+		return -EINVAL;
+
+	tg->rt_period = ns_to_ktime((u64)rt_period * NSEC_PER_USEC);
+
+	return 0;
+}
+
+/*
+ * Return the rt period of @tg in microseconds.
+ */
+unsigned long sched_group_rt_period(struct task_group *tg)
+{
+	u64 period = ktime_to_ns(tg->rt_period);
+
+	/* do_div() divides in place: ns -> us */
+	do_div(period, NSEC_PER_USEC);
+
+	return period;
+}
+
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_FAIR_CGROUP_SCHED
@@ -7903,6 +8068,17 @@ static u64 cpu_rt_ratio_read_uint(struct
return (u64) tg->rt_ratio;
}
+/*
+ * cgroup write handler for "rt_period_us": set the rt period of the task
+ * group behind @cgrp to @rt_period_val microseconds.
+ *
+ * Returns the result of sched_group_set_rt_period(), or -EINVAL when
+ * @rt_period_val does not fit the unsigned long that function takes
+ * (instead of silently truncating it).
+ */
+static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
+		u64 rt_period_val)
+{
+	unsigned long period_us = rt_period_val;
+
+	/* the round trip differs only if narrowing dropped high bits */
+	if (period_us != rt_period_val)
+		return -EINVAL;
+
+	return sched_group_set_rt_period(cgroup_tg(cgrp), period_us);
+}
+
+/*
+ * cgroup read handler for "rt_period_us": report the rt period of the task
+ * group behind @cgrp, as returned by sched_group_rt_period().
+ */
+static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+
+	return (u64) sched_group_rt_period(tg);
+}
+
static struct cftype cpu_files[] = {
{
.name = "shares",
@@ -7914,6 +8090,11 @@ static struct cftype cpu_files[] = {
.read_uint = cpu_rt_ratio_read_uint,
.write_uint = cpu_rt_ratio_write_uint,
},
+ {
+ .name = "rt_period_us",
+ .read_uint = cpu_rt_period_read_uint,
+ .write_uint = cpu_rt_period_write_uint,
+ },
};
static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -230,8 +230,6 @@ static inline int select_nohz_load_balan
}
#endif
-extern unsigned long rt_needs_cpu(int cpu);
-
/*
* Only dump TASK_* tasks. (0 for all tasks)
*/
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -65,6 +65,17 @@ static inline unsigned int sched_rt_rati
return rt_rq->tg->rt_ratio;
}
+/*
+ * Period of the task group owning @rt_rq. BUGs if @rt_rq has no group.
+ */
+static inline ktime_t sched_rt_period(struct rt_rq *rt_rq)
+{
+	struct task_group *tg = rt_rq->tg;
+
+	BUG_ON(!tg);
+
+	return tg->rt_period;
+}
+
+/*
+ * Same as sched_rt_period(), expressed in nanoseconds.
+ */
+static inline u64 sched_rt_period_ns(struct rt_rq *rt_rq)
+{
+	ktime_t period = sched_rt_period(rt_rq);
+
+	return ktime_to_ns(period);
+}
+
#define for_each_leaf_rt_rq(rt_rq, rq) \
list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
@@ -117,6 +128,16 @@ static inline unsigned int sched_rt_rati
return sysctl_sched_rt_ratio;
}
+/*
+ * Without group scheduling the period is the global sysctl value
+ * (microseconds), converted to a ktime_t. @rt_rq is not consulted.
+ */
+static inline ktime_t sched_rt_period(struct rt_rq *rt_rq)
+{
+	u64 period_ns = (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
+
+	return ns_to_ktime(period_ns);
+}
+
+/*
+ * Without group scheduling the period is the global sysctl value
+ * (microseconds), returned in nanoseconds. @rt_rq is not consulted.
+ */
+static inline u64 sched_rt_period_ns(struct rt_rq *rt_rq)
+{
+	u64 period_us = sysctl_sched_rt_period;
+
+	return period_us * NSEC_PER_USEC;
+}
+
#define for_each_leaf_rt_rq(rt_rq, rq) \
for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
@@ -174,15 +195,11 @@ static int sched_rt_ratio_exceeded(struc
if (rt_rq->rt_throttled)
return 1;
- period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
+ period = sched_rt_period_ns(rt_rq);
ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
if (rt_rq->rt_time > ratio) {
- struct rq *rq = rq_of_rt_rq(rt_rq);
-
- rq->rt_throttled = 1;
rt_rq->rt_throttled = 1;
-
sched_rt_ratio_dequeue(rt_rq);
return 1;
}
@@ -190,27 +207,16 @@ static int sched_rt_ratio_exceeded(struc
return 0;
}
-static void update_sched_rt_period(struct rq *rq)
+/*
+ * Period rollover for a single rt_rq: subtract one period's worth of
+ * budget (period * rt_ratio >> SCHED_RT_FRAC_SHIFT) from the accumulated
+ * rt_time, never going below zero, and if the rt_rq was throttled clear
+ * the flag and re-enqueue it via sched_rt_ratio_enqueue().
+ */
+static void update_sched_rt_period(struct rt_rq *rt_rq)
{
- struct rt_rq *rt_rq;
- u64 period;
-
- while (rq->clock > rq->rt_period_expire) {
- period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
- rq->rt_period_expire += period;
-
- for_each_leaf_rt_rq(rt_rq, rq) {
- unsigned long rt_ratio = sched_rt_ratio(rt_rq);
- u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
-
- rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
- if (rt_rq->rt_throttled) {
- rt_rq->rt_throttled = 0;
- sched_rt_ratio_enqueue(rt_rq);
- }
- }
-
- rq->rt_throttled = 0;
+ u64 period = sched_rt_period_ns(rt_rq);
+ unsigned long rt_ratio = sched_rt_ratio(rt_rq);
+ u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
+
+ rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
+ if (rt_rq->rt_throttled) {
+ rt_rq->rt_throttled = 0;
+ sched_rt_ratio_enqueue(rt_rq);
}
}
@@ -238,11 +244,6 @@ static void update_curr_rt(struct rq *rq
cpuacct_charge(curr, delta_exec);
rt_rq->rt_time += delta_exec;
- /*
- * might make it a tad more accurate:
- *
- * update_sched_rt_period(rq);
- */
if (sched_rt_ratio_exceeded(rt_rq))
resched_task(curr);
}
Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -311,7 +311,7 @@ static struct ctl_table kern_table[] = {
},
{
.ctl_name = CTL_UNNUMBERED,
- .procname = "sched_rt_period_ms",
+ .procname = "sched_rt_period_us",
.data = &sysctl_sched_rt_period,
.maxlen = sizeof(unsigned int),
.mode = 0644,
Index: linux-2.6/kernel/time/tick-sched.c
===================================================================
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -153,7 +153,6 @@ void tick_nohz_update_jiffies(void)
void tick_nohz_stop_sched_tick(void)
{
unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
- unsigned long rt_jiffies;
struct tick_sched *ts;
ktime_t last_update, expires, now, delta;
struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
@@ -217,10 +216,6 @@ void tick_nohz_stop_sched_tick(void)
next_jiffies = get_next_timer_interrupt(last_jiffies);
delta_jiffies = next_jiffies - last_jiffies;
- rt_jiffies = rt_needs_cpu(cpu);
- if (rt_jiffies && rt_jiffies < delta_jiffies)
- delta_jiffies = rt_jiffies;
-
if (rcu_needs_cpu(cpu))
delta_jiffies = 1;
/*
--
next prev parent reply other threads:[~2008-01-06 16:26 UTC|newest]
Thread overview: 31+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-01-06 16:11 [PATCH 00/11] another rt group sched update Peter Zijlstra
2008-01-06 16:11 ` [PATCH 01/11] sched: rt throttling vs no_hz Peter Zijlstra
2008-01-06 16:11 ` [PATCH 02/11] sched: load_balance_monitor rename Peter Zijlstra
2008-01-06 16:11 ` [PATCH 03/11] hrtimer: clean up cpu->base locking tricks Peter Zijlstra
2008-01-06 16:11 ` [PATCH 04/11] hrtimer: fixup the HRTIMER_CB_IRQSAFE_NO_SOFTIRQ fallback Peter Zijlstra
2008-01-07 11:56 ` Peter Zijlstra
2008-01-08 11:16 ` Ingo Molnar
2008-01-06 16:11 ` [PATCH 05/11] hrtimer: unlock hrtimer_wakeup Peter Zijlstra
2008-01-06 16:11 ` [PATCH 06/11] sched: rt-group: reduce rescheduling Peter Zijlstra
2008-01-06 16:11 ` Peter Zijlstra [this message]
2008-01-06 16:11 ` [PATCH 08/11] sched: rt-group: deal with PI Peter Zijlstra
2008-01-06 16:11 ` [PATCH 09/11] sched: rt-group: dynamic period ticks Peter Zijlstra
2008-01-06 16:11 ` [PATCH 10/11] sched: rt-group: EDF Peter Zijlstra
2008-01-06 16:11 ` [PATCH 11/11] sched: rt-group: interface Peter Zijlstra
2008-01-07 10:51 ` [PATCH 00/11] another rt group sched update Peter Zijlstra
2008-01-07 11:24 ` Peter Zijlstra
2008-01-07 12:23 ` Srivatsa Vaddagiri
2008-01-07 12:12 ` Peter Zijlstra
2008-01-07 16:57 ` [PATCH 12/11] sched: rt-group: uid-group interface Peter Zijlstra
2008-01-08 10:33 ` Ingo Molnar
2008-01-08 10:57 ` Dhaval Giani
2008-01-08 11:02 ` Peter Zijlstra
2008-01-08 14:31 ` Kay Sievers
2008-01-08 23:35 ` Peter Zijlstra
2008-01-08 23:58 ` Greg KH
2008-01-08 23:57 ` Ingo Molnar
2008-01-10 0:05 ` Greg KH
2008-02-07 4:17 ` Dhaval Giani
2008-02-07 5:42 ` Greg KH
2008-01-08 23:26 ` Peter Zijlstra
2008-01-07 11:17 ` [PATCH 00/11] another rt group sched update Ingo Molnar
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20080106162124.111762000@chello.nl \
--to=a.p.zijlstra@chello.nl \
--cc=balbir@linux.vnet.ibm.com \
--cc=dmitry.adamushko@gmail.com \
--cc=ghaskins@novell.com \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@elte.hu \
--cc=rostedt@goodmis.org \
--cc=tglx@linutronix.de \
--cc=vatsa@linux.vnet.ibm.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.