* [PATCH 0/7] more rt group sched updates
@ 2008-01-04 13:54 Peter Zijlstra
2008-01-04 13:54 ` [PATCH 1/7] sched: rt throttling vs no_hz Peter Zijlstra
` (7 more replies)
0 siblings, 8 replies; 11+ messages in thread
From: Peter Zijlstra @ 2008-01-04 13:54 UTC (permalink / raw)
To: LKML
Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner
includes the two patches from yesterday,
series against sched-devel
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 1/7] sched: rt throttling vs no_hz
2008-01-04 13:54 [PATCH 0/7] more rt group sched updates Peter Zijlstra
@ 2008-01-04 13:54 ` Peter Zijlstra
2008-01-04 13:54 ` [PATCH 2/7] sched: load_balance_monitor rename Peter Zijlstra
` (6 subsequent siblings)
7 siblings, 0 replies; 11+ messages in thread
From: Peter Zijlstra @ 2008-01-04 13:54 UTC (permalink / raw)
To: LKML
Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner
[-- Attachment #1: sched-rt-group-update.patch --]
[-- Type: text/plain, Size: 4621 bytes --]
We need to teach no_hz about the rt throttling because it's tick driven.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/sched.h | 2 ++
kernel/sched.c | 23 ++++++++++++++++++++++-
kernel/sched_rt.c | 30 ++++++++++++++++--------------
kernel/time/tick-sched.c | 5 +++++
4 files changed, 45 insertions(+), 15 deletions(-)
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -230,6 +230,8 @@ static inline int select_nohz_load_balan
}
#endif
+extern unsigned long rt_needs_cpu(int cpu);
+
/*
* Only dump TASK_* tasks. (0 for all tasks)
*/
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -442,6 +442,7 @@ struct rq {
struct cfs_rq cfs;
struct rt_rq rt;
u64 rt_period_expire;
+ int rt_throttled;
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
@@ -594,6 +595,23 @@ static void update_rq_clock(struct rq *r
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
+unsigned long rt_needs_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ u64 delta;
+
+ if (!rq->rt_throttled)
+ return 0;
+
+ if (rq->clock > rq->rt_period_expire)
+ return 1;
+
+ delta = rq->rt_period_expire - rq->clock;
+ do_div(delta, NSEC_PER_SEC / HZ);
+
+ return (unsigned long)delta;
+}
+
/*
* Tunables that become constants when CONFIG_SCHED_DEBUG is off:
*/
@@ -7099,9 +7117,11 @@ static void init_rt_rq(struct rt_rq *rt_
/* delimiter for bitsearch: */
__set_bit(MAX_RT_PRIO, array->bitmap);
+#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
+ rt_rq->highest_prio = MAX_RT_PRIO;
+#endif
#ifdef CONFIG_SMP
rt_rq->rt_nr_migratory = 0;
- rt_rq->highest_prio = MAX_RT_PRIO;
rt_rq->overloaded = 0;
#endif
@@ -7186,6 +7206,7 @@ void __init sched_init(void)
list_add(&init_task_group.list, &task_groups);
#endif
rq->rt_period_expire = 0;
+ rq->rt_throttled = 0;
for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
rq->cpu_load[j] = 0;
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -175,7 +175,11 @@ static int sched_rt_ratio_exceeded(struc
ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
if (rt_rq->rt_time > ratio) {
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ rq->rt_throttled = 1;
rt_rq->rt_throttled = 1;
+
sched_rt_ratio_dequeue(rt_rq);
return 1;
}
@@ -183,18 +187,6 @@ static int sched_rt_ratio_exceeded(struc
return 0;
}
-static void __update_sched_rt_period(struct rt_rq *rt_rq, u64 period)
-{
- unsigned long rt_ratio = sched_rt_ratio(rt_rq);
- u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
-
- rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
- if (rt_rq->rt_throttled) {
- rt_rq->rt_throttled = 0;
- sched_rt_ratio_enqueue(rt_rq);
- }
-}
-
static void update_sched_rt_period(struct rq *rq)
{
struct rt_rq *rt_rq;
@@ -204,8 +196,18 @@ static void update_sched_rt_period(struc
period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
rq->rt_period_expire += period;
- for_each_leaf_rt_rq(rt_rq, rq)
- __update_sched_rt_period(rt_rq, period);
+ for_each_leaf_rt_rq(rt_rq, rq) {
+ unsigned long rt_ratio = sched_rt_ratio(rt_rq);
+ u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
+
+ rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
+ if (rt_rq->rt_throttled) {
+ rt_rq->rt_throttled = 0;
+ sched_rt_ratio_enqueue(rt_rq);
+ }
+ }
+
+ rq->rt_throttled = 0;
}
}
Index: linux-2.6/kernel/time/tick-sched.c
===================================================================
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -153,6 +153,7 @@ void tick_nohz_update_jiffies(void)
void tick_nohz_stop_sched_tick(void)
{
unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
+ unsigned long rt_jiffies;
struct tick_sched *ts;
ktime_t last_update, expires, now, delta;
struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
@@ -216,6 +217,10 @@ void tick_nohz_stop_sched_tick(void)
next_jiffies = get_next_timer_interrupt(last_jiffies);
delta_jiffies = next_jiffies - last_jiffies;
+ rt_jiffies = rt_needs_cpu(cpu);
+ if (rt_jiffies && rt_jiffies < delta_jiffies)
+ delta_jiffies = rt_jiffies;
+
if (rcu_needs_cpu(cpu))
delta_jiffies = 1;
/*
--
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 2/7] sched: load_balance_monitor rename
2008-01-04 13:54 [PATCH 0/7] more rt group sched updates Peter Zijlstra
2008-01-04 13:54 ` [PATCH 1/7] sched: rt throttling vs no_hz Peter Zijlstra
@ 2008-01-04 13:54 ` Peter Zijlstra
2008-01-04 13:55 ` [PATCH 3/7] hrtimer: clean up cpu->base locking tricks Peter Zijlstra
` (5 subsequent siblings)
7 siblings, 0 replies; 11+ messages in thread
From: Peter Zijlstra @ 2008-01-04 13:54 UTC (permalink / raw)
To: LKML
Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner
[-- Attachment #1: sched-group-fixes.patch --]
[-- Type: text/plain, Size: 830 bytes --]
don't start the load_balance_monitor when there is only a single cpu.
rename the kthread because it's currently longer than TASK_COMM_LEN.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
kernel/sched.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -7070,8 +7070,11 @@ void __init sched_init_smp(void)
sched_init_granularity();
#ifdef CONFIG_FAIR_GROUP_SCHED
+ if (nr_cpu_ids == 1)
+ return;
+
lb_monitor_task = kthread_create(load_balance_monitor, NULL,
- "load_balance_monitor");
+ "group_balance");
if (!IS_ERR(lb_monitor_task)) {
lb_monitor_task->flags |= PF_NOFREEZE;
wake_up_process(lb_monitor_task);
--
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 3/7] hrtimer: clean up cpu->base locking tricks
2008-01-04 13:54 [PATCH 0/7] more rt group sched updates Peter Zijlstra
2008-01-04 13:54 ` [PATCH 1/7] sched: rt throttling vs no_hz Peter Zijlstra
2008-01-04 13:54 ` [PATCH 2/7] sched: load_balance_monitor rename Peter Zijlstra
@ 2008-01-04 13:55 ` Peter Zijlstra
2008-01-04 13:55 ` [PATCH 4/7] hrtimer: fixup the HRTIMER_CB_IRQSAFE_NO_SOFTIRQ fallback Peter Zijlstra
` (4 subsequent siblings)
7 siblings, 0 replies; 11+ messages in thread
From: Peter Zijlstra @ 2008-01-04 13:55 UTC (permalink / raw)
To: LKML
Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner
[-- Attachment #1: hrtimer-unlocked-callback.patch --]
[-- Type: text/plain, Size: 3319 bytes --]
In order to more easily allow for the scheduler to use timers, clean up
the locking a bit.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
kernel/hrtimer.c | 109 +++++++++++++++++++++++++++++++++++++++++++----
kernel/time/tick-sched.c | 8 ---
2 files changed, 102 insertions(+), 15 deletions(-)
Index: linux-2.6/kernel/hrtimer.c
===================================================================
--- linux-2.6.orig/kernel/hrtimer.c
+++ linux-2.6/kernel/hrtimer.c
@@ -1063,7 +1063,9 @@ void hrtimer_interrupt(struct clock_even
basenow = ktime_add(now, base->offset);
while ((node = base->first)) {
+ enum hrtimer_restart (*fn)(struct hrtimer *);
struct hrtimer *timer;
+ int restart;
timer = rb_entry(node, struct hrtimer, node);
@@ -1091,13 +1093,29 @@ void hrtimer_interrupt(struct clock_even
HRTIMER_STATE_CALLBACK, 0);
timer_stats_account_hrtimer(timer);
+ fn = timer->function;
+ if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
+ /*
+ * Used for scheduler timers, avoid lock
+ * inversion with rq->lock and tasklist_lock.
+ *
+ * These timers are required to deal with
+ * enqueue expiry themselves and are not
+ * allowed to migrate.
+ */
+ spin_unlock(&cpu_base->lock);
+ restart = fn(timer);
+ spin_lock(&cpu_base->lock);
+ } else
+ restart = fn(timer);
+
/*
* Note: We clear the CALLBACK bit after
* enqueue_hrtimer to avoid reprogramming of
* the event hardware. This happens at the end
* of this function anyway.
*/
- if (timer->function(timer) != HRTIMER_NORESTART) {
+ if (restart != HRTIMER_NORESTART) {
BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
enqueue_hrtimer(timer, base, 0);
}
@@ -1268,7 +1361,7 @@ void hrtimer_init_sleeper(struct hrtimer
sl->timer.function = hrtimer_wakeup;
sl->task = task;
#ifdef CONFIG_HIGH_RES_TIMERS
- sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART;
+ sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
#endif
}
@@ -1279,6 +1372,8 @@ static int __sched do_nanosleep(struct h
do {
set_current_state(TASK_INTERRUPTIBLE);
hrtimer_start(&t->timer, t->timer.expires, mode);
+ if (!hrtimer_active(&t->timer))
+ t->task = NULL;
if (likely(t->task))
schedule();
Index: linux-2.6/kernel/time/tick-sched.c
===================================================================
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -514,7 +514,6 @@ static enum hrtimer_restart tick_sched_t
{
struct tick_sched *ts =
container_of(timer, struct tick_sched, sched_timer);
- struct hrtimer_cpu_base *base = timer->base->cpu_base;
struct pt_regs *regs = get_irq_regs();
ktime_t now = ktime_get();
int cpu = smp_processor_id();
@@ -552,15 +551,8 @@ static enum hrtimer_restart tick_sched_t
touch_softlockup_watchdog();
ts->idle_jiffies++;
}
- /*
- * update_process_times() might take tasklist_lock, hence
- * drop the base lock. sched-tick hrtimers are per-CPU and
- * never accessible by userspace APIs, so this is safe to do.
- */
- spin_unlock(&base->lock);
update_process_times(user_mode(regs));
profile_tick(CPU_PROFILING);
- spin_lock(&base->lock);
}
/* Do not restart, when we are in the idle loop */
--
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 4/7] hrtimer: fixup the HRTIMER_CB_IRQSAFE_NO_SOFTIRQ fallback
2008-01-04 13:54 [PATCH 0/7] more rt group sched updates Peter Zijlstra
` (2 preceding siblings ...)
2008-01-04 13:55 ` [PATCH 3/7] hrtimer: clean up cpu->base locking tricks Peter Zijlstra
@ 2008-01-04 13:55 ` Peter Zijlstra
2008-01-04 13:55 ` [PATCH 5/7] sched: rt-group: reduce rescheduling Peter Zijlstra
` (3 subsequent siblings)
7 siblings, 0 replies; 11+ messages in thread
From: Peter Zijlstra @ 2008-01-04 13:55 UTC (permalink / raw)
To: LKML
Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner
[-- Attachment #1: hrtimer-fallback.patch --]
[-- Type: text/plain, Size: 9833 bytes --]
Currently all highres=off timers are run from softirq context, but
HRTIMER_CB_IRQSAFE_NO_SOFTIRQ timers expect to run from irq context.
Fix this up by splitting it similar to the highres=on case.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/hrtimer.h | 1
kernel/hrtimer.c | 223 ++++++++++++++++++++++++------------------------
kernel/timer.c | 3
3 files changed, 117 insertions(+), 110 deletions(-)
Index: linux-2.6/kernel/hrtimer.c
===================================================================
--- linux-2.6.orig/kernel/hrtimer.c
+++ linux-2.6/kernel/hrtimer.c
@@ -1030,6 +1030,85 @@ int hrtimer_get_res(const clockid_t whic
}
EXPORT_SYMBOL_GPL(hrtimer_get_res);
+static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
+{
+ spin_lock_irq(&cpu_base->lock);
+
+ while (!list_empty(&cpu_base->cb_pending)) {
+ enum hrtimer_restart (*fn)(struct hrtimer *);
+ struct hrtimer *timer;
+ int restart;
+
+ timer = list_entry(cpu_base->cb_pending.next,
+ struct hrtimer, cb_entry);
+
+ timer_stats_account_hrtimer(timer);
+
+ fn = timer->function;
+ __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
+ spin_unlock_irq(&cpu_base->lock);
+
+ restart = fn(timer);
+
+ spin_lock_irq(&cpu_base->lock);
+
+ timer->state &= ~HRTIMER_STATE_CALLBACK;
+ if (restart == HRTIMER_RESTART) {
+ BUG_ON(hrtimer_active(timer));
+ /*
+ * Enqueue the timer, allow reprogramming of the event
+ * device
+ */
+ enqueue_hrtimer(timer, timer->base, 1);
+ } else if (hrtimer_active(timer)) {
+ /*
+ * If the timer was rearmed on another CPU, reprogram
+ * the event device.
+ */
+ if (timer->base->first == &timer->node)
+ hrtimer_reprogram(timer, timer->base);
+ }
+ }
+ spin_unlock_irq(&cpu_base->lock);
+}
+
+static void __run_hrtimer(struct hrtimer *timer)
+{
+ struct hrtimer_clock_base *base = timer->base;
+ struct hrtimer_cpu_base *cpu_base = base->cpu_base;
+ enum hrtimer_restart (*fn)(struct hrtimer *);
+ int restart;
+
+ __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+ timer_stats_account_hrtimer(timer);
+
+ fn = timer->function;
+ if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
+ /*
+ * Used for scheduler timers, avoid lock inversion with
+ * rq->lock and tasklist_lock.
+ *
+ * These timers are required to deal with enqueue expiry
+ * themselves and are not allowed to migrate.
+ */
+ spin_unlock(&cpu_base->lock);
+ restart = fn(timer);
+ spin_lock(&cpu_base->lock);
+ } else
+ restart = fn(timer);
+
+ /*
+ * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid
+ * reprogramming of the event hardware. This happens at the end of this
+ * function anyway.
+ */
+ if (restart != HRTIMER_NORESTART) {
+ BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
+ enqueue_hrtimer(timer, base, 0);
+ }
+ timer->state &= ~HRTIMER_STATE_CALLBACK;
+}
+
#ifdef CONFIG_HIGH_RES_TIMERS
/*
@@ -1063,9 +1142,7 @@ void hrtimer_interrupt(struct clock_even
basenow = ktime_add(now, base->offset);
while ((node = base->first)) {
- enum hrtimer_restart (*fn)(struct hrtimer *);
struct hrtimer *timer;
- int restart;
timer = rb_entry(node, struct hrtimer, node);
@@ -1089,37 +1166,7 @@ void hrtimer_interrupt(struct clock_even
continue;
}
- __remove_hrtimer(timer, base,
- HRTIMER_STATE_CALLBACK, 0);
- timer_stats_account_hrtimer(timer);
-
- fn = timer->function;
- if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
- /*
- * Used for scheduler timers, avoid lock
- * inversion with rq->lock and tasklist_lock.
- *
- * These timers are required to deal with
- * enqueue expiry themselves and are not
- * allowed to migrate.
- */
- spin_unlock(&cpu_base->lock);
- restart = fn(timer);
- spin_lock(&cpu_base->lock);
- } else
- restart = fn(timer);
-
- /*
- * Note: We clear the CALLBACK bit after
- * enqueue_hrtimer to avoid reprogramming of
- * the event hardware. This happens at the end
- * of this function anyway.
- */
- if (restart != HRTIMER_NORESTART) {
- BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
- enqueue_hrtimer(timer, base, 0);
- }
- timer->state &= ~HRTIMER_STATE_CALLBACK;
+ __run_hrtimer(timer);
}
spin_unlock(&cpu_base->lock);
base++;
@@ -1140,52 +1187,41 @@ void hrtimer_interrupt(struct clock_even
static void run_hrtimer_softirq(struct softirq_action *h)
{
- struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
-
- spin_lock_irq(&cpu_base->lock);
-
- while (!list_empty(&cpu_base->cb_pending)) {
- enum hrtimer_restart (*fn)(struct hrtimer *);
- struct hrtimer *timer;
- int restart;
-
- timer = list_entry(cpu_base->cb_pending.next,
- struct hrtimer, cb_entry);
+ run_hrtimer_pending(&__get_cpu_var(hrtimer_bases));
+}
- timer_stats_account_hrtimer(timer);
+#endif /* CONFIG_HIGH_RES_TIMERS */
- fn = timer->function;
- __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
- spin_unlock_irq(&cpu_base->lock);
+/*
+ * Called from timer softirq every jiffy, expire hrtimers:
+ *
+ * For HRT its the fall back code to run the softirq in the timer
+ * softirq context in case the hrtimer initialization failed or has
+ * not been done yet.
+ */
+void hrtimer_run_pending(void)
+{
+ struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
- restart = fn(timer);
+ if (hrtimer_hres_active())
+ return;
- spin_lock_irq(&cpu_base->lock);
+ /*
+ * This _is_ ugly: We have to check in the softirq context,
+ * whether we can switch to highres and / or nohz mode. The
+ * clocksource switch happens in the timer interrupt with
+ * xtime_lock held. Notification from there only sets the
+ * check bit in the tick_oneshot code, otherwise we might
+ * deadlock vs. xtime_lock.
+ */
+ if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
+ hrtimer_switch_to_hres();
- timer->state &= ~HRTIMER_STATE_CALLBACK;
- if (restart == HRTIMER_RESTART) {
- BUG_ON(hrtimer_active(timer));
- /*
- * Enqueue the timer, allow reprogramming of the event
- * device
- */
- enqueue_hrtimer(timer, timer->base, 1);
- } else if (hrtimer_active(timer)) {
- /*
- * If the timer was rearmed on another CPU, reprogram
- * the event device.
- */
- if (timer->base->first == &timer->node)
- hrtimer_reprogram(timer, timer->base);
- }
- }
- spin_unlock_irq(&cpu_base->lock);
+ run_hrtimer_pending(cpu_base);
}
-#endif /* CONFIG_HIGH_RES_TIMERS */
-
/*
- * Expire the per base hrtimer-queue:
+ * Called from hardirq context every jiffy
*/
static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base,
int index)
@@ -1203,42 +1239,23 @@ static inline void run_hrtimer_queue(str
while ((node = base->first)) {
struct hrtimer *timer;
- enum hrtimer_restart (*fn)(struct hrtimer *);
- int restart;
timer = rb_entry(node, struct hrtimer, node);
if (base->softirq_time.tv64 <= timer->expires.tv64)
break;
-#ifdef CONFIG_HIGH_RES_TIMERS
- WARN_ON_ONCE(timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ);
-#endif
- timer_stats_account_hrtimer(timer);
-
- fn = timer->function;
- __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
- spin_unlock_irq(&cpu_base->lock);
-
- restart = fn(timer);
-
- spin_lock_irq(&cpu_base->lock);
-
- timer->state &= ~HRTIMER_STATE_CALLBACK;
- if (restart != HRTIMER_NORESTART) {
- BUG_ON(hrtimer_active(timer));
- enqueue_hrtimer(timer, base, 0);
+ if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
+ __remove_hrtimer(timer, base, HRTIMER_STATE_PENDING, 0);
+ list_add_tail(&timer->cb_entry,
+ &base->cpu_base->cb_pending);
+ continue;
}
+
+ __run_hrtimer(timer);
}
spin_unlock_irq(&cpu_base->lock);
}
-/*
- * Called from timer softirq every jiffy, expire hrtimers:
- *
- * For HRT its the fall back code to run the softirq in the timer
- * softirq context in case the hrtimer initialization failed or has
- * not been done yet.
- */
void hrtimer_run_queues(void)
{
struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
@@ -1247,18 +1264,6 @@ void hrtimer_run_queues(void)
if (hrtimer_hres_active())
return;
- /*
- * This _is_ ugly: We have to check in the softirq context,
- * whether we can switch to highres and / or nohz mode. The
- * clocksource switch happens in the timer interrupt with
- * xtime_lock held. Notification from there only sets the
- * check bit in the tick_oneshot code, otherwise we might
- * deadlock vs. xtime_lock.
- */
- if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
- if (hrtimer_switch_to_hres())
- return;
-
hrtimer_get_softirq_time(cpu_base);
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
Index: linux-2.6/kernel/timer.c
===================================================================
--- linux-2.6.orig/kernel/timer.c
+++ linux-2.6/kernel/timer.c
@@ -896,7 +896,7 @@ static void run_timer_softirq(struct sof
{
tvec_base_t *base = __get_cpu_var(tvec_bases);
- hrtimer_run_queues();
+ hrtimer_run_pending();
if (time_after_eq(jiffies, base->timer_jiffies))
__run_timers(base);
@@ -907,6 +907,7 @@ static void run_timer_softirq(struct sof
*/
void run_local_timers(void)
{
+ hrtimer_run_queues();
raise_softirq(TIMER_SOFTIRQ);
softlockup_tick();
}
Index: linux-2.6/include/linux/hrtimer.h
===================================================================
--- linux-2.6.orig/include/linux/hrtimer.h
+++ linux-2.6/include/linux/hrtimer.h
@@ -319,6 +319,7 @@ extern void hrtimer_init_sleeper(struct
/* Soft interrupt function to run the hrtimer queues: */
extern void hrtimer_run_queues(void);
+extern void hrtimer_run_pending(void);
/* Bootup initialization: */
extern void __init hrtimers_init(void);
--
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 5/7] sched: rt-group: reduce rescheduling
2008-01-04 13:54 [PATCH 0/7] more rt group sched updates Peter Zijlstra
` (3 preceding siblings ...)
2008-01-04 13:55 ` [PATCH 4/7] hrtimer: fixup the HRTIMER_CB_IRQSAFE_NO_SOFTIRQ fallback Peter Zijlstra
@ 2008-01-04 13:55 ` Peter Zijlstra
2008-01-04 13:55 ` [PATCH 6/7] sched: rt-group: per group period Peter Zijlstra
` (2 subsequent siblings)
7 siblings, 0 replies; 11+ messages in thread
From: Peter Zijlstra @ 2008-01-04 13:55 UTC (permalink / raw)
To: LKML
Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner
[-- Attachment #1: sched-rt-group-fix-enqueue.patch --]
[-- Type: text/plain, Size: 748 bytes --]
Only reschedule if the new group has a higher prio task.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
kernel/sched_rt.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -94,8 +94,11 @@ static void sched_rt_ratio_enqueue(struc
struct sched_rt_entity *rt_se = rt_rq->rt_se;
if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) {
+ struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
+
enqueue_rt_entity(rt_se);
- resched_task(rq_of_rt_rq(rt_rq)->curr);
+ if (rt_rq->highest_prio < curr->prio)
+ resched_task(curr);
}
}
--
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 6/7] sched: rt-group: per group period
2008-01-04 13:54 [PATCH 0/7] more rt group sched updates Peter Zijlstra
` (4 preceding siblings ...)
2008-01-04 13:55 ` [PATCH 5/7] sched: rt-group: reduce rescheduling Peter Zijlstra
@ 2008-01-04 13:55 ` Peter Zijlstra
2008-01-05 14:51 ` Peter Zijlstra
2008-01-04 13:55 ` [PATCH 7/7] sched: rt-group: deal with PI Peter Zijlstra
2008-01-05 13:32 ` [PATCH 0/7] more rt group sched updates Ingo Molnar
7 siblings, 1 reply; 11+ messages in thread
From: Peter Zijlstra @ 2008-01-04 13:55 UTC (permalink / raw)
To: LKML
Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner
[-- Attachment #1: sched-rt-rq-hrtimer.patch --]
[-- Type: text/plain, Size: 14318 bytes --]
Steven asked for per group periods in order to get closer to RMA or EDF
scheduling.
Use the fancy new hrtimers to provide a per group period
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/sched.h | 2
kernel/sched.c | 225 +++++++++++++++++++++++++++++++++++++++++------
kernel/sched_rt.c | 61 ++++++------
kernel/sysctl.c | 2
kernel/time/tick-sched.c | 5 -
5 files changed, 232 insertions(+), 63 deletions(-)
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -230,8 +230,6 @@ static inline int select_nohz_load_balan
}
#endif
-extern unsigned long rt_needs_cpu(int cpu);
-
/*
* Only dump TASK_* tasks. (0 for all tasks)
*/
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -177,6 +177,7 @@ struct task_group {
struct rt_rq **rt_rq;
unsigned int rt_ratio;
+ ktime_t rt_period;
/*
* shares assigned to a task group governs how much of cpu bandwidth
@@ -372,6 +373,7 @@ struct rt_rq {
#endif
int rt_throttled;
u64 rt_time;
+ struct hrtimer rt_period_timer;
#ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq;
@@ -441,8 +443,6 @@ struct rq {
struct cfs_rq cfs;
struct rt_rq rt;
- u64 rt_period_expire;
- int rt_throttled;
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
@@ -595,23 +595,6 @@ static void update_rq_clock(struct rq *r
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
-unsigned long rt_needs_cpu(int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
- u64 delta;
-
- if (!rq->rt_throttled)
- return 0;
-
- if (rq->clock > rq->rt_period_expire)
- return 1;
-
- delta = rq->rt_period_expire - rq->clock;
- do_div(delta, NSEC_PER_SEC / HZ);
-
- return (unsigned long)delta;
-}
-
/*
* Tunables that become constants when CONFIG_SCHED_DEBUG is off:
*/
@@ -652,10 +635,10 @@ const_debug unsigned int sysctl_sched_fe
const_debug unsigned int sysctl_sched_nr_migrate = 32;
/*
- * period over which we measure -rt task cpu usage in ms.
+ * period over which we measure -rt task cpu usage in us.
* default: 1s
*/
-const_debug unsigned int sysctl_sched_rt_period = 1000;
+const_debug unsigned int sysctl_sched_rt_period = 1000000;
#define SCHED_RT_FRAC_SHIFT 16
#define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT)
@@ -664,7 +647,7 @@ const_debug unsigned int sysctl_sched_rt
* ratio of time -rt tasks may consume.
* default: 95%
*/
-const_debug unsigned int sysctl_sched_rt_ratio = 62259;
+const_debug unsigned int sysctl_sched_rt_ratio = 32768; //62259;
/*
* For kernel-internal use: high-speed (but slightly incorrect) per-cpu
@@ -1245,6 +1228,12 @@ static unsigned long cpu_avg_load_per_ta
static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
#endif /* CONFIG_SMP */
+static inline ktime_t ns_to_ktime(u64 ns)
+{
+ static const ktime_t ktime_zero = { .tv64 = 0 };
+ return ktime_add_ns(ktime_zero, ns);
+}
+
#include "sched_stats.h"
#include "sched_idletask.c"
#include "sched_fair.c"
@@ -3741,7 +3730,6 @@ void scheduler_tick(void)
rq->tick_timestamp = rq->clock;
update_cpu_load(rq);
curr->sched_class->task_tick(rq, curr, 0);
- update_sched_rt_period(rq);
spin_unlock(&rq->lock);
#ifdef CONFIG_SMP
@@ -5287,6 +5275,152 @@ static inline void sched_init_granularit
sysctl_sched_batch_wakeup_granularity *= factor;
}
+static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
+{
+ struct rt_rq *rt_rq =
+ container_of(timer, struct rt_rq, rt_period_timer);
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+ ktime_t now = ktime_get();
+
+ WARN_ON(smp_processor_id() != cpu_of(rq));
+ WARN_ON(!in_irq());
+
+ spin_lock(&rq->lock);
+ update_sched_rt_period(rt_rq);
+ spin_unlock(&rq->lock);
+
+ hrtimer_forward(timer, now, sched_rt_period(rt_rq));
+ return HRTIMER_RESTART;
+}
+
+static void sched_rt_period_start(struct rt_rq *rt_rq)
+{
+ ktime_t period = sched_rt_period(rt_rq);
+
+ WARN_ON(smp_processor_id() != cpu_of(rq_of_rt_rq(rt_rq)));
+
+ for (;;) {
+ ktime_t now = ktime_get();
+ hrtimer_forward(&rt_rq->rt_period_timer, now, period);
+ hrtimer_start(&rt_rq->rt_period_timer,
+ rt_rq->rt_period_timer.expires,
+ HRTIMER_MODE_ABS);
+ if (hrtimer_active(&rt_rq->rt_period_timer))
+ break;
+ }
+}
+
+static void sched_rt_period_stop(struct rt_rq *rt_rq)
+{
+ hrtimer_cancel(&rt_rq->rt_period_timer);
+}
+
+static void sched_rt_period_start_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct rt_rq *rt_rq;
+
+ for_each_leaf_rt_rq(rt_rq, rq)
+ sched_rt_period_start(rt_rq);
+}
+
+#ifdef CONFIG_SMP
+static void sched_rt_period_stop_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct rt_rq *rt_rq;
+
+ for_each_leaf_rt_rq(rt_rq, rq)
+ sched_rt_period_stop(rt_rq);
+}
+
+static int sched_rt_period_hotplug(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ int cpu = (unsigned long)hcpu;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
+ sched_rt_period_start_cpu(cpu);
+ return NOTIFY_OK;
+
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
+ sched_rt_period_stop_cpu(cpu);
+ return NOTIFY_OK;
+
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ return NOTIFY_OK;
+
+ default:
+ return NOTIFY_DONE;
+ }
+
+ return NOTIFY_OK;
+}
+
+static void __init __sched_rt_period_init(void *arg)
+{
+ int cpu = smp_processor_id();
+ sched_rt_period_start_cpu(cpu);
+}
+
+static void __init sched_rt_period_init(void)
+{
+ on_each_cpu(__sched_rt_period_init, NULL, 0, 1);
+ hotcpu_notifier(sched_rt_period_hotplug, 0);
+}
+
+static void __sched_rt_period_init_tg(void *arg)
+{
+ struct task_group *tg = arg;
+ int cpu = smp_processor_id();
+
+ sched_rt_period_start(tg->rt_rq[cpu]);
+}
+
+static void sched_rt_period_init_tg(struct task_group *tg)
+{
+ on_each_cpu(__sched_rt_period_init_tg, tg, 0, 1);
+}
+
+static void __sched_rt_period_destroy_tg(void *arg)
+{
+ struct task_group *tg = arg;
+ int cpu = smp_processor_id();
+
+ sched_rt_period_stop(tg->rt_rq[cpu]);
+}
+
+static void sched_rt_period_destroy_tg(struct task_group *tg)
+{
+ on_each_cpu(__sched_rt_period_destroy_tg, tg, 0, 1);
+}
+#else
+static void __init sched_rt_period_init(void)
+{
+ sched_rt_period_start_cpu(0);
+}
+
+static void sched_rt_period_init_tg(struct task_group *tg)
+{
+ sched_rt_period_start(tg->rt_rq[0]);
+}
+
+static void sched_rt_period_destroy_tg(struct task_group *tg)
+{
+ sched_rt_period_stop(tg->rt_rq[0]);
+}
+#endif
+
#ifdef CONFIG_SMP
/*
* This is how migration works:
@@ -7068,6 +7202,7 @@ void __init sched_init_smp(void)
if (set_cpus_allowed(current, non_isolated_cpus) < 0)
BUG();
sched_init_granularity();
+ sched_rt_period_init();
#ifdef CONFIG_FAIR_GROUP_SCHED
if (nr_cpu_ids == 1)
@@ -7088,6 +7223,7 @@ void __init sched_init_smp(void)
void __init sched_init_smp(void)
{
sched_init_granularity();
+ sched_rt_period_init();
}
#endif /* CONFIG_SMP */
@@ -7131,6 +7267,11 @@ static void init_rt_rq(struct rt_rq *rt_
rt_rq->rt_time = 0;
rt_rq->rt_throttled = 0;
+ hrtimer_init(&rt_rq->rt_period_timer,
+ CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ rt_rq->rt_period_timer.function = sched_rt_period_timer;
+ rt_rq->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+
#ifdef CONFIG_FAIR_GROUP_SCHED
rt_rq->rq = rq;
#endif
@@ -7201,6 +7342,8 @@ void __init sched_init(void)
&per_cpu(init_sched_entity, i), i, 1);
init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */
+ init_task_group.rt_period =
+ ns_to_ktime(sysctl_sched_rt_period * NSEC_PER_USEC);
INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
init_tg_rt_entry(rq, &init_task_group,
&per_cpu(init_rt_rq, i),
@@ -7208,8 +7351,6 @@ void __init sched_init(void)
list_add(&init_task_group.list, &task_groups);
#endif
- rq->rt_period_expire = 0;
- rq->rt_throttled = 0;
for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
rq->cpu_load[j] = 0;
@@ -7598,6 +7739,7 @@ struct task_group *sched_create_group(vo
tg->shares = NICE_0_LOAD;
tg->rt_ratio = 0; /* XXX */
+ tg->rt_period = ns_to_ktime(sysctl_sched_rt_period * NSEC_PER_USEC);
for_each_possible_cpu(i) {
rq = cpu_rq(i);
@@ -7637,6 +7779,8 @@ struct task_group *sched_create_group(vo
list_add_rcu(&tg->list, &task_groups);
unlock_task_group_list();
+ sched_rt_period_init_tg(tg);
+
return tg;
err:
@@ -7658,6 +7802,8 @@ void sched_destroy_group(struct task_gro
struct rt_rq *rt_rq = NULL;
int i;
+ sched_rt_period_destroy_tg(tg);
+
lock_task_group_list();
for_each_possible_cpu(i) {
cfs_rq = tg->cfs_rq[i];
@@ -7815,6 +7961,19 @@ unsigned long sched_group_rt_ratio(struc
return tg->rt_ratio;
}
+int sched_group_set_rt_period(struct task_group *tg, unsigned long rt_period)
+{
+ tg->rt_period = ns_to_ktime((u64)rt_period * NSEC_PER_USEC);
+ return 0;
+}
+
+unsigned long sched_group_rt_period(struct task_group *tg)
+{
+ u64 ns = ktime_to_ns(tg->rt_period);
+ do_div(ns, NSEC_PER_USEC);
+ return ns;
+}
+
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_FAIR_CGROUP_SCHED
@@ -7903,6 +8062,17 @@ static u64 cpu_rt_ratio_read_uint(struct
return (u64) tg->rt_ratio;
}
+static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
+ u64 rt_period_val)
+{
+ return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_val);
+}
+
+static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
+{
+ return (u64) sched_group_rt_period(cgroup_tg(cgrp));
+}
+
static struct cftype cpu_files[] = {
{
.name = "shares",
@@ -7914,6 +8084,11 @@ static struct cftype cpu_files[] = {
.read_uint = cpu_rt_ratio_read_uint,
.write_uint = cpu_rt_ratio_write_uint,
},
+ {
+ .name = "rt_period_us",
+ .read_uint = cpu_rt_period_read_uint,
+ .write_uint = cpu_rt_period_write_uint,
+ },
};
static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -65,6 +65,17 @@ static inline unsigned int sched_rt_rati
return rt_rq->tg->rt_ratio;
}
+static inline ktime_t sched_rt_period(struct rt_rq *rt_rq)
+{
+ BUG_ON(!rt_rq->tg);
+ return rt_rq->tg->rt_period;
+}
+
+static inline u64 sched_rt_period_ns(struct rt_rq *rt_rq)
+{
+ return ktime_to_ns(sched_rt_period(rt_rq));
+}
+
#define for_each_leaf_rt_rq(rt_rq, rq) \
list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
@@ -117,6 +128,16 @@ static inline unsigned int sched_rt_rati
return sysctl_sched_rt_ratio;
}
+static inline ktime_t sched_rt_period(struct rt_rq *rt_rq)
+{
+ return ns_to_ktime((u64)sysctl_sched_rt_period * NSEC_PER_USEC);
+}
+
+static inline u64 sched_rt_period_ns(struct rt_rq *rt_rq)
+{
+ return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
+}
+
#define for_each_leaf_rt_rq(rt_rq, rq) \
for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
@@ -174,15 +195,11 @@ static int sched_rt_ratio_exceeded(struc
if (rt_rq->rt_throttled)
return 1;
- period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
+ period = sched_rt_period_ns(rt_rq);
ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
if (rt_rq->rt_time > ratio) {
- struct rq *rq = rq_of_rt_rq(rt_rq);
-
- rq->rt_throttled = 1;
rt_rq->rt_throttled = 1;
-
sched_rt_ratio_dequeue(rt_rq);
return 1;
}
@@ -190,27 +207,16 @@ static int sched_rt_ratio_exceeded(struc
return 0;
}
-static void update_sched_rt_period(struct rq *rq)
+static void update_sched_rt_period(struct rt_rq *rt_rq)
{
- struct rt_rq *rt_rq;
- u64 period;
-
- while (rq->clock > rq->rt_period_expire) {
- period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
- rq->rt_period_expire += period;
-
- for_each_leaf_rt_rq(rt_rq, rq) {
- unsigned long rt_ratio = sched_rt_ratio(rt_rq);
- u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
-
- rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
- if (rt_rq->rt_throttled) {
- rt_rq->rt_throttled = 0;
- sched_rt_ratio_enqueue(rt_rq);
- }
- }
-
- rq->rt_throttled = 0;
+ u64 period = sched_rt_period_ns(rt_rq);
+ unsigned long rt_ratio = sched_rt_ratio(rt_rq);
+ u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
+
+ rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
+ if (rt_rq->rt_throttled) {
+ rt_rq->rt_throttled = 0;
+ sched_rt_ratio_enqueue(rt_rq);
}
}
@@ -238,11 +244,6 @@ static void update_curr_rt(struct rq *rq
cpuacct_charge(curr, delta_exec);
rt_rq->rt_time += delta_exec;
- /*
- * might make it a tad more accurate:
- *
- * update_sched_rt_period(rq);
- */
if (sched_rt_ratio_exceeded(rt_rq))
resched_task(curr);
}
Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -311,7 +311,7 @@ static struct ctl_table kern_table[] = {
},
{
.ctl_name = CTL_UNNUMBERED,
- .procname = "sched_rt_period_ms",
+ .procname = "sched_rt_period_us",
.data = &sysctl_sched_rt_period,
.maxlen = sizeof(unsigned int),
.mode = 0644,
Index: linux-2.6/kernel/time/tick-sched.c
===================================================================
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -153,7 +153,6 @@ void tick_nohz_update_jiffies(void)
void tick_nohz_stop_sched_tick(void)
{
unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
- unsigned long rt_jiffies;
struct tick_sched *ts;
ktime_t last_update, expires, now, delta;
struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
@@ -217,10 +216,6 @@ void tick_nohz_stop_sched_tick(void)
next_jiffies = get_next_timer_interrupt(last_jiffies);
delta_jiffies = next_jiffies - last_jiffies;
- rt_jiffies = rt_needs_cpu(cpu);
- if (rt_jiffies && rt_jiffies < delta_jiffies)
- delta_jiffies = rt_jiffies;
-
if (rcu_needs_cpu(cpu))
delta_jiffies = 1;
/*
--
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 7/7] sched: rt-group: deal with PI
2008-01-04 13:54 [PATCH 0/7] more rt group sched updates Peter Zijlstra
` (5 preceding siblings ...)
2008-01-04 13:55 ` [PATCH 6/7] sched: rt-group: per group period Peter Zijlstra
@ 2008-01-04 13:55 ` Peter Zijlstra
2008-01-05 13:32 ` [PATCH 0/7] more rt group sched updates Ingo Molnar
7 siblings, 0 replies; 11+ messages in thread
From: Peter Zijlstra @ 2008-01-04 13:55 UTC (permalink / raw)
To: LKML
Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner
[-- Attachment #1: sched-rt-group-pi.patch --]
[-- Type: text/plain, Size: 4082 bytes --]
Steven mentioned the fun case where a lock holding task will be throttled.
Simple fix: allow groups that have boosted tasks to run anyway.
This is of course not quite correct. Needs more tricks.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
kernel/sched.c | 3 +++
kernel/sched_rt.c | 50 ++++++++++++++++++++++++++++++++++++++++++--------
2 files changed, 45 insertions(+), 8 deletions(-)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -376,6 +376,8 @@ struct rt_rq {
struct hrtimer rt_period_timer;
#ifdef CONFIG_FAIR_GROUP_SCHED
+ unsigned long rt_nr_boosted;
+
struct rq *rq;
struct list_head leaf_rt_rq_list;
struct task_group *tg;
@@ -7273,6 +7275,7 @@ static void init_rt_rq(struct rt_rq *rt_
rt_rq->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
#ifdef CONFIG_FAIR_GROUP_SCHED
+ rt_rq->rt_nr_boosted = 0;
rt_rq->rq = rq;
#endif
}
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -121,6 +121,11 @@ static void sched_rt_ratio_dequeue(struc
dequeue_rt_entity(rt_se);
}
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+ return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
+}
+
#else
static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
@@ -170,6 +175,10 @@ static inline void sched_rt_ratio_dequeu
{
}
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+ return rt_rq->rt_throttled;
+}
#endif
static inline int rt_se_prio(struct sched_rt_entity *rt_se)
@@ -184,27 +193,42 @@ static inline int rt_se_prio(struct sche
return rt_task_of(rt_se)->prio;
}
+static int rt_se_boosted(struct sched_rt_entity *rt_se)
+{
+ struct task_struct *p;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ struct rt_rq *rt_rq = group_rt_rq(rt_se);
+
+ if (rt_rq)
+ return !!rt_rq->rt_nr_boosted;
+#endif
+
+ p = rt_task_of(rt_se);
+ return p->prio != p->normal_prio;
+}
+
static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq)
{
unsigned int rt_ratio = sched_rt_ratio(rt_rq);
u64 period, ratio;
if (rt_ratio == SCHED_RT_FRAC)
- return 0;
+ goto out;
if (rt_rq->rt_throttled)
- return 1;
+ goto out;
period = sched_rt_period_ns(rt_rq);
ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
if (rt_rq->rt_time > ratio) {
rt_rq->rt_throttled = 1;
- sched_rt_ratio_dequeue(rt_rq);
- return 1;
+ if (rt_rq_throttled(rt_rq))
+ sched_rt_ratio_dequeue(rt_rq);
}
- return 0;
+out:
+ return rt_rq_throttled(rt_rq);
}
static void update_sched_rt_period(struct rt_rq *rt_rq)
@@ -265,6 +289,10 @@ void inc_rt_tasks(struct sched_rt_entity
update_rt_migration(rq_of_rt_rq(rt_rq));
#endif
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ if (rt_se_boosted(rt_se))
+ rt_rq->rt_nr_boosted++;
+#endif
}
static inline
@@ -295,6 +323,12 @@ void dec_rt_tasks(struct sched_rt_entity
update_rt_migration(rq_of_rt_rq(rt_rq));
#endif /* CONFIG_SMP */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ if (rt_se_boosted(rt_se))
+ rt_rq->rt_nr_boosted--;
+
+ WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
+#endif
}
static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
@@ -303,7 +337,7 @@ static void enqueue_rt_entity(struct sch
struct rt_prio_array *array = &rt_rq->active;
struct rt_rq *group_rq = group_rt_rq(rt_se);
- if (group_rq && group_rq->rt_throttled)
+ if (group_rq && rt_rq_throttled(group_rq))
return;
list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
@@ -476,7 +510,7 @@ static struct sched_rt_entity *pick_next
struct list_head *queue;
int idx;
- if (sched_rt_ratio_exceeded(rt_rq))
+ if (rt_rq_throttled(rt_rq))
goto out;
idx = sched_find_first_bit(array->bitmap);
@@ -500,7 +534,7 @@ static struct task_struct *pick_next_tas
if (unlikely(!rt_rq->rt_nr_running))
return NULL;
- if (sched_rt_ratio_exceeded(rt_rq))
+ if (rt_rq_throttled(rt_rq))
return NULL;
do {
--
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 0/7] more rt group sched updates
2008-01-04 13:54 [PATCH 0/7] more rt group sched updates Peter Zijlstra
` (6 preceding siblings ...)
2008-01-04 13:55 ` [PATCH 7/7] sched: rt-group: deal with PI Peter Zijlstra
@ 2008-01-05 13:32 ` Ingo Molnar
7 siblings, 0 replies; 11+ messages in thread
From: Ingo Molnar @ 2008-01-05 13:32 UTC (permalink / raw)
To: Peter Zijlstra
Cc: LKML, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
Steven Rostedt, Gregory Haskins, Thomas Gleixner
* Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> includes the two patches from yesterday,
>
> series against sched-devel
thanks, applied.
Ingo
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 6/7] sched: rt-group: per group period
2008-01-04 13:55 ` [PATCH 6/7] sched: rt-group: per group period Peter Zijlstra
@ 2008-01-05 14:51 ` Peter Zijlstra
2008-01-05 15:05 ` Ingo Molnar
0 siblings, 1 reply; 11+ messages in thread
From: Peter Zijlstra @ 2008-01-05 14:51 UTC (permalink / raw)
To: LKML
Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
Steven Rostedt, Gregory Haskins, Thomas Gleixner
Could you please fold this into the 6/7 patch.
It reverts a wandering chunk (the 32768 thing), but more importantly
it fixes !FAIR_GROUP_SCHED compilation.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
kernel/sched.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -647,7 +647,7 @@ const_debug unsigned int sysctl_sched_rt
* ratio of time -rt tasks may consume.
* default: 95%
*/
-const_debug unsigned int sysctl_sched_rt_ratio = 32768; //62259;
+const_debug unsigned int sysctl_sched_rt_ratio = 62259;
/*
* For kernel-internal use: high-speed (but slightly incorrect) per-cpu
@@ -5379,6 +5379,7 @@ static void __init sched_rt_period_init(
hotcpu_notifier(sched_rt_period_hotplug, 0);
}
+#ifdef CONFIG_FAIR_GROUP_SCHED
static void __sched_rt_period_init_tg(void *arg)
{
struct task_group *tg = arg;
@@ -5404,12 +5405,14 @@ static void sched_rt_period_destroy_tg(s
{
on_each_cpu(__sched_rt_period_destroy_tg, tg, 0, 1);
}
-#else
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+#else /* CONFIG_SMP */
static void __init sched_rt_period_init(void)
{
sched_rt_period_start_cpu(0);
}
+#ifdef CONFIG_FAIR_GROUP_SCHED
static void sched_rt_period_init_tg(struct task_group *tg)
{
sched_rt_period_start(tg->rt_rq[0]);
@@ -5419,7 +5422,8 @@ static void sched_rt_period_destroy_tg(s
{
sched_rt_period_stop(tg->rt_rq[0]);
}
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* CONFIG_SMP */
#ifdef CONFIG_SMP
/*
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 6/7] sched: rt-group: per group period
2008-01-05 14:51 ` Peter Zijlstra
@ 2008-01-05 15:05 ` Ingo Molnar
0 siblings, 0 replies; 11+ messages in thread
From: Ingo Molnar @ 2008-01-05 15:05 UTC (permalink / raw)
To: Peter Zijlstra
Cc: LKML, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
Steven Rostedt, Gregory Haskins, Thomas Gleixner
* Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> Could you please fold this into the 6/7 patch.
>
> It reverts a wandering chunk (the 32768 thing), but more importantly
> it fixes !FAIR_GROUP_SCHED compilation.
done. Btw., there's a new warning:
kernel/sched_rt.c:197: warning: 'rt_se_boosted' defined but not used
Ingo
^ permalink raw reply [flat|nested] 11+ messages in thread
end of thread, other threads:[~2008-01-05 15:06 UTC | newest]
Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-01-04 13:54 [PATCH 0/7] more rt group sched updates Peter Zijlstra
2008-01-04 13:54 ` [PATCH 1/7] sched: rt throttling vs no_hz Peter Zijlstra
2008-01-04 13:54 ` [PATCH 2/7] sched: load_balance_monitor rename Peter Zijlstra
2008-01-04 13:55 ` [PATCH 3/7] hrtimer: clean up cpu->base locking tricks Peter Zijlstra
2008-01-04 13:55 ` [PATCH 4/7] hrtimer: fixup the HRTIMER_CB_IRQSAFE_NO_SOFTIRQ fallback Peter Zijlstra
2008-01-04 13:55 ` [PATCH 5/7] sched: rt-group: reduce rescheduling Peter Zijlstra
2008-01-04 13:55 ` [PATCH 6/7] sched: rt-group: per group period Peter Zijlstra
2008-01-05 14:51 ` Peter Zijlstra
2008-01-05 15:05 ` Ingo Molnar
2008-01-04 13:55 ` [PATCH 7/7] sched: rt-group: deal with PI Peter Zijlstra
2008-01-05 13:32 ` [PATCH 0/7] more rt group sched updates Ingo Molnar
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox