public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/7] more rt group sched updates
@ 2008-01-04 13:54 Peter Zijlstra
  2008-01-04 13:54 ` [PATCH 1/7] sched: rt throttling vs no_hz Peter Zijlstra
                   ` (7 more replies)
  0 siblings, 8 replies; 11+ messages in thread
From: Peter Zijlstra @ 2008-01-04 13:54 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner

includes the two patches from yesterday,

series against sched-devel



^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 1/7] sched: rt throttling vs no_hz
  2008-01-04 13:54 [PATCH 0/7] more rt group sched updates Peter Zijlstra
@ 2008-01-04 13:54 ` Peter Zijlstra
  2008-01-04 13:54 ` [PATCH 2/7] sched: load_balance_monitor rename Peter Zijlstra
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 11+ messages in thread
From: Peter Zijlstra @ 2008-01-04 13:54 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner

[-- Attachment #1: sched-rt-group-update.patch --]
[-- Type: text/plain, Size: 4621 bytes --]

We need to teach no_hz about the rt throttling because it's tick-driven.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/sched.h    |    2 ++
 kernel/sched.c           |   23 ++++++++++++++++++++++-
 kernel/sched_rt.c        |   30 ++++++++++++++++--------------
 kernel/time/tick-sched.c |    5 +++++
 4 files changed, 45 insertions(+), 15 deletions(-)

Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -230,6 +230,8 @@ static inline int select_nohz_load_balan
 }
 #endif
 
+extern unsigned long rt_needs_cpu(int cpu);
+
 /*
  * Only dump TASK_* tasks. (0 for all tasks)
  */
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -442,6 +442,7 @@ struct rq {
 	struct cfs_rq cfs;
 	struct rt_rq rt;
 	u64 rt_period_expire;
+	int rt_throttled;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* list of leaf cfs_rq on this cpu: */
@@ -594,6 +595,23 @@ static void update_rq_clock(struct rq *r
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 
+unsigned long rt_needs_cpu(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	u64 delta;
+
+	if (!rq->rt_throttled)
+		return 0;
+
+	if (rq->clock > rq->rt_period_expire)
+		return 1;
+
+	delta = rq->rt_period_expire - rq->clock;
+	do_div(delta, NSEC_PER_SEC / HZ);
+
+	return (unsigned long)delta;
+}
+
 /*
  * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
  */
@@ -7099,9 +7117,11 @@ static void init_rt_rq(struct rt_rq *rt_
 	/* delimiter for bitsearch: */
 	__set_bit(MAX_RT_PRIO, array->bitmap);
 
+#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
+	rt_rq->highest_prio = MAX_RT_PRIO;
+#endif
 #ifdef CONFIG_SMP
 	rt_rq->rt_nr_migratory = 0;
-	rt_rq->highest_prio = MAX_RT_PRIO;
 	rt_rq->overloaded = 0;
 #endif
 
@@ -7186,6 +7206,7 @@ void __init sched_init(void)
 		list_add(&init_task_group.list, &task_groups);
 #endif
 		rq->rt_period_expire = 0;
+		rq->rt_throttled = 0;
 
 		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
 			rq->cpu_load[j] = 0;
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -175,7 +175,11 @@ static int sched_rt_ratio_exceeded(struc
 	ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
 
 	if (rt_rq->rt_time > ratio) {
+		struct rq *rq = rq_of_rt_rq(rt_rq);
+
+		rq->rt_throttled = 1;
 		rt_rq->rt_throttled = 1;
+
 		sched_rt_ratio_dequeue(rt_rq);
 		return 1;
 	}
@@ -183,18 +187,6 @@ static int sched_rt_ratio_exceeded(struc
 	return 0;
 }
 
-static void __update_sched_rt_period(struct rt_rq *rt_rq, u64 period)
-{
-	unsigned long rt_ratio = sched_rt_ratio(rt_rq);
-	u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
-
-	rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
-	if (rt_rq->rt_throttled) {
-		rt_rq->rt_throttled = 0;
-		sched_rt_ratio_enqueue(rt_rq);
-	}
-}
-
 static void update_sched_rt_period(struct rq *rq)
 {
 	struct rt_rq *rt_rq;
@@ -204,8 +196,18 @@ static void update_sched_rt_period(struc
 		period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
 		rq->rt_period_expire += period;
 
-		for_each_leaf_rt_rq(rt_rq, rq)
-			__update_sched_rt_period(rt_rq, period);
+		for_each_leaf_rt_rq(rt_rq, rq) {
+			unsigned long rt_ratio = sched_rt_ratio(rt_rq);
+			u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
+
+			rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
+			if (rt_rq->rt_throttled) {
+				rt_rq->rt_throttled = 0;
+				sched_rt_ratio_enqueue(rt_rq);
+			}
+		}
+
+		rq->rt_throttled = 0;
 	}
 }
 
Index: linux-2.6/kernel/time/tick-sched.c
===================================================================
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -153,6 +153,7 @@ void tick_nohz_update_jiffies(void)
 void tick_nohz_stop_sched_tick(void)
 {
 	unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
+	unsigned long rt_jiffies;
 	struct tick_sched *ts;
 	ktime_t last_update, expires, now, delta;
 	struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
@@ -216,6 +217,10 @@ void tick_nohz_stop_sched_tick(void)
 	next_jiffies = get_next_timer_interrupt(last_jiffies);
 	delta_jiffies = next_jiffies - last_jiffies;
 
+	rt_jiffies = rt_needs_cpu(cpu);
+	if (rt_jiffies && rt_jiffies < delta_jiffies)
+		delta_jiffies = rt_jiffies;
+
 	if (rcu_needs_cpu(cpu))
 		delta_jiffies = 1;
 	/*

--


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 2/7] sched: load_balance_monitor rename
  2008-01-04 13:54 [PATCH 0/7] more rt group sched updates Peter Zijlstra
  2008-01-04 13:54 ` [PATCH 1/7] sched: rt throttling vs no_hz Peter Zijlstra
@ 2008-01-04 13:54 ` Peter Zijlstra
  2008-01-04 13:55 ` [PATCH 3/7] hrtimer: clean up cpu->base locking tricks Peter Zijlstra
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 11+ messages in thread
From: Peter Zijlstra @ 2008-01-04 13:54 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner

[-- Attachment #1: sched-group-fixes.patch --]
[-- Type: text/plain, Size: 830 bytes --]

don't start the load_balance_monitor when there is only a single cpu.
rename the kthread because it's currently longer than TASK_COMM_LEN

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/sched.c |    5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -7070,8 +7070,11 @@ void __init sched_init_smp(void)
 	sched_init_granularity();
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	if (nr_cpu_ids == 1)
+		return;
+
 	lb_monitor_task = kthread_create(load_balance_monitor, NULL,
-					 "load_balance_monitor");
+					 "group_balance");
 	if (!IS_ERR(lb_monitor_task)) {
 		lb_monitor_task->flags |= PF_NOFREEZE;
 		wake_up_process(lb_monitor_task);

--


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 3/7] hrtimer: clean up cpu->base locking tricks
  2008-01-04 13:54 [PATCH 0/7] more rt group sched updates Peter Zijlstra
  2008-01-04 13:54 ` [PATCH 1/7] sched: rt throttling vs no_hz Peter Zijlstra
  2008-01-04 13:54 ` [PATCH 2/7] sched: load_balance_monitor rename Peter Zijlstra
@ 2008-01-04 13:55 ` Peter Zijlstra
  2008-01-04 13:55 ` [PATCH 4/7] hrtimer: fixup the HRTIMER_CB_IRQSAFE_NO_SOFTIRQ fallback Peter Zijlstra
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 11+ messages in thread
From: Peter Zijlstra @ 2008-01-04 13:55 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner

[-- Attachment #1: hrtimer-unlocked-callback.patch --]
[-- Type: text/plain, Size: 3319 bytes --]

In order to more easily allow for the scheduler to use timers, clean up
the locking a bit.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/hrtimer.c         |  109 +++++++++++++++++++++++++++++++++++++++++++----
 kernel/time/tick-sched.c |    8 ---
 2 files changed, 102 insertions(+), 15 deletions(-)

Index: linux-2.6/kernel/hrtimer.c
===================================================================
--- linux-2.6.orig/kernel/hrtimer.c
+++ linux-2.6/kernel/hrtimer.c
@@ -1063,7 +1063,9 @@ void hrtimer_interrupt(struct clock_even
 		basenow = ktime_add(now, base->offset);
 
 		while ((node = base->first)) {
+			enum hrtimer_restart (*fn)(struct hrtimer *);
 			struct hrtimer *timer;
+			int restart;
 
 			timer = rb_entry(node, struct hrtimer, node);
 
@@ -1091,13 +1093,29 @@ void hrtimer_interrupt(struct clock_even
 					 HRTIMER_STATE_CALLBACK, 0);
 			timer_stats_account_hrtimer(timer);
 
+			fn = timer->function;
+			if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
+				/*
+				 * Used for scheduler timers, avoid lock
+				 * inversion with rq->lock and tasklist_lock.
+				 *
+				 * These timers are required to deal with
+				 * enqueue expiry themselves and are not
+				 * allowed to migrate.
+				 */
+				spin_unlock(&cpu_base->lock);
+				restart = fn(timer);
+				spin_lock(&cpu_base->lock);
+			} else
+				restart = fn(timer);
+
 			/*
 			 * Note: We clear the CALLBACK bit after
 			 * enqueue_hrtimer to avoid reprogramming of
 			 * the event hardware. This happens at the end
 			 * of this function anyway.
 			 */
-			if (timer->function(timer) != HRTIMER_NORESTART) {
+			if (restart != HRTIMER_NORESTART) {
 				BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
 				enqueue_hrtimer(timer, base, 0);
 			}
@@ -1268,7 +1361,7 @@ void hrtimer_init_sleeper(struct hrtimer
 	sl->timer.function = hrtimer_wakeup;
 	sl->task = task;
 #ifdef CONFIG_HIGH_RES_TIMERS
-	sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART;
+	sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
 #endif
 }
 
@@ -1279,6 +1372,8 @@ static int __sched do_nanosleep(struct h
 	do {
 		set_current_state(TASK_INTERRUPTIBLE);
 		hrtimer_start(&t->timer, t->timer.expires, mode);
+		if (!hrtimer_active(&t->timer))
+			t->task = NULL;
 
 		if (likely(t->task))
 			schedule();
Index: linux-2.6/kernel/time/tick-sched.c
===================================================================
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -514,7 +514,6 @@ static enum hrtimer_restart tick_sched_t
 {
 	struct tick_sched *ts =
 		container_of(timer, struct tick_sched, sched_timer);
-	struct hrtimer_cpu_base *base = timer->base->cpu_base;
 	struct pt_regs *regs = get_irq_regs();
 	ktime_t now = ktime_get();
 	int cpu = smp_processor_id();
@@ -552,15 +551,8 @@ static enum hrtimer_restart tick_sched_t
 			touch_softlockup_watchdog();
 			ts->idle_jiffies++;
 		}
-		/*
-		 * update_process_times() might take tasklist_lock, hence
-		 * drop the base lock. sched-tick hrtimers are per-CPU and
-		 * never accessible by userspace APIs, so this is safe to do.
-		 */
-		spin_unlock(&base->lock);
 		update_process_times(user_mode(regs));
 		profile_tick(CPU_PROFILING);
-		spin_lock(&base->lock);
 	}
 
 	/* Do not restart, when we are in the idle loop */

--


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 4/7] hrtimer: fixup the HRTIMER_CB_IRQSAFE_NO_SOFTIRQ fallback
  2008-01-04 13:54 [PATCH 0/7] more rt group sched updates Peter Zijlstra
                   ` (2 preceding siblings ...)
  2008-01-04 13:55 ` [PATCH 3/7] hrtimer: clean up cpu->base locking tricks Peter Zijlstra
@ 2008-01-04 13:55 ` Peter Zijlstra
  2008-01-04 13:55 ` [PATCH 5/7] sched: rt-group: reduce rescheduling Peter Zijlstra
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 11+ messages in thread
From: Peter Zijlstra @ 2008-01-04 13:55 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner

[-- Attachment #1: hrtimer-fallback.patch --]
[-- Type: text/plain, Size: 9833 bytes --]

Currently all highres=off timers are run from softirq context, but
HRTIMER_CB_IRQSAFE_NO_SOFTIRQ timers expect to run from irq context.

Fix this up by splitting it similar to the highres=on case.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/hrtimer.h |    1 
 kernel/hrtimer.c        |  223 ++++++++++++++++++++++++------------------------
 kernel/timer.c          |    3 
 3 files changed, 117 insertions(+), 110 deletions(-)

Index: linux-2.6/kernel/hrtimer.c
===================================================================
--- linux-2.6.orig/kernel/hrtimer.c
+++ linux-2.6/kernel/hrtimer.c
@@ -1030,6 +1030,85 @@ int hrtimer_get_res(const clockid_t whic
 }
 EXPORT_SYMBOL_GPL(hrtimer_get_res);
 
+static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
+{
+	spin_lock_irq(&cpu_base->lock);
+
+	while (!list_empty(&cpu_base->cb_pending)) {
+		enum hrtimer_restart (*fn)(struct hrtimer *);
+		struct hrtimer *timer;
+		int restart;
+
+		timer = list_entry(cpu_base->cb_pending.next,
+				   struct hrtimer, cb_entry);
+
+		timer_stats_account_hrtimer(timer);
+
+		fn = timer->function;
+		__remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
+		spin_unlock_irq(&cpu_base->lock);
+
+		restart = fn(timer);
+
+		spin_lock_irq(&cpu_base->lock);
+
+		timer->state &= ~HRTIMER_STATE_CALLBACK;
+		if (restart == HRTIMER_RESTART) {
+			BUG_ON(hrtimer_active(timer));
+			/*
+			 * Enqueue the timer, allow reprogramming of the event
+			 * device
+			 */
+			enqueue_hrtimer(timer, timer->base, 1);
+		} else if (hrtimer_active(timer)) {
+			/*
+			 * If the timer was rearmed on another CPU, reprogram
+			 * the event device.
+			 */
+			if (timer->base->first == &timer->node)
+				hrtimer_reprogram(timer, timer->base);
+		}
+	}
+	spin_unlock_irq(&cpu_base->lock);
+}
+
+static void __run_hrtimer(struct hrtimer *timer)
+{
+	struct hrtimer_clock_base *base = timer->base;
+	struct hrtimer_cpu_base *cpu_base = base->cpu_base;
+	enum hrtimer_restart (*fn)(struct hrtimer *);
+	int restart;
+
+	__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+	timer_stats_account_hrtimer(timer);
+
+	fn = timer->function;
+	if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
+		/*
+		 * Used for scheduler timers, avoid lock inversion with
+		 * rq->lock and tasklist_lock.
+		 *
+		 * These timers are required to deal with enqueue expiry
+		 * themselves and are not allowed to migrate.
+		 */
+		spin_unlock(&cpu_base->lock);
+		restart = fn(timer);
+		spin_lock(&cpu_base->lock);
+	} else
+		restart = fn(timer);
+
+	/*
+	 * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid
+	 * reprogramming of the event hardware. This happens at the end of this
+	 * function anyway.
+	 */
+	if (restart != HRTIMER_NORESTART) {
+		BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
+		enqueue_hrtimer(timer, base, 0);
+	}
+	timer->state &= ~HRTIMER_STATE_CALLBACK;
+}
+
 #ifdef CONFIG_HIGH_RES_TIMERS
 
 /*
@@ -1063,9 +1142,7 @@ void hrtimer_interrupt(struct clock_even
 		basenow = ktime_add(now, base->offset);
 
 		while ((node = base->first)) {
-			enum hrtimer_restart (*fn)(struct hrtimer *);
 			struct hrtimer *timer;
-			int restart;
 
 			timer = rb_entry(node, struct hrtimer, node);
 
@@ -1089,37 +1166,7 @@ void hrtimer_interrupt(struct clock_even
 				continue;
 			}
 
-			__remove_hrtimer(timer, base,
-					 HRTIMER_STATE_CALLBACK, 0);
-			timer_stats_account_hrtimer(timer);
-
-			fn = timer->function;
-			if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
-				/*
-				 * Used for scheduler timers, avoid lock
-				 * inversion with rq->lock and tasklist_lock.
-				 *
-				 * These timers are required to deal with
-				 * enqueue expiry themselves and are not
-				 * allowed to migrate.
-				 */
-				spin_unlock(&cpu_base->lock);
-				restart = fn(timer);
-				spin_lock(&cpu_base->lock);
-			} else
-				restart = fn(timer);
-
-			/*
-			 * Note: We clear the CALLBACK bit after
-			 * enqueue_hrtimer to avoid reprogramming of
-			 * the event hardware. This happens at the end
-			 * of this function anyway.
-			 */
-			if (restart != HRTIMER_NORESTART) {
-				BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
-				enqueue_hrtimer(timer, base, 0);
-			}
-			timer->state &= ~HRTIMER_STATE_CALLBACK;
+			__run_hrtimer(timer);
 		}
 		spin_unlock(&cpu_base->lock);
 		base++;
@@ -1140,52 +1187,41 @@ void hrtimer_interrupt(struct clock_even
 
 static void run_hrtimer_softirq(struct softirq_action *h)
 {
-	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
-
-	spin_lock_irq(&cpu_base->lock);
-
-	while (!list_empty(&cpu_base->cb_pending)) {
-		enum hrtimer_restart (*fn)(struct hrtimer *);
-		struct hrtimer *timer;
-		int restart;
-
-		timer = list_entry(cpu_base->cb_pending.next,
-				   struct hrtimer, cb_entry);
+	run_hrtimer_pending(&__get_cpu_var(hrtimer_bases));
+}
 
-		timer_stats_account_hrtimer(timer);
+#endif	/* CONFIG_HIGH_RES_TIMERS */
 
-		fn = timer->function;
-		__remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
-		spin_unlock_irq(&cpu_base->lock);
+/*
+ * Called from timer softirq every jiffy, expire hrtimers:
+ *
+ * For HRT its the fall back code to run the softirq in the timer
+ * softirq context in case the hrtimer initialization failed or has
+ * not been done yet.
+ */
+void hrtimer_run_pending(void)
+{
+	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
 
-		restart = fn(timer);
+	if (hrtimer_hres_active())
+		return;
 
-		spin_lock_irq(&cpu_base->lock);
+	/*
+	 * This _is_ ugly: We have to check in the softirq context,
+	 * whether we can switch to highres and / or nohz mode. The
+	 * clocksource switch happens in the timer interrupt with
+	 * xtime_lock held. Notification from there only sets the
+	 * check bit in the tick_oneshot code, otherwise we might
+	 * deadlock vs. xtime_lock.
+	 */
+	if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
+		hrtimer_switch_to_hres();
 
-		timer->state &= ~HRTIMER_STATE_CALLBACK;
-		if (restart == HRTIMER_RESTART) {
-			BUG_ON(hrtimer_active(timer));
-			/*
-			 * Enqueue the timer, allow reprogramming of the event
-			 * device
-			 */
-			enqueue_hrtimer(timer, timer->base, 1);
-		} else if (hrtimer_active(timer)) {
-			/*
-			 * If the timer was rearmed on another CPU, reprogram
-			 * the event device.
-			 */
-			if (timer->base->first == &timer->node)
-				hrtimer_reprogram(timer, timer->base);
-		}
-	}
-	spin_unlock_irq(&cpu_base->lock);
+	run_hrtimer_pending(cpu_base);
 }
 
-#endif	/* CONFIG_HIGH_RES_TIMERS */
-
 /*
- * Expire the per base hrtimer-queue:
+ * Called from hardirq context every jiffy
  */
 static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base,
 				     int index)
@@ -1203,42 +1239,23 @@ static inline void run_hrtimer_queue(str
 
 	while ((node = base->first)) {
 		struct hrtimer *timer;
-		enum hrtimer_restart (*fn)(struct hrtimer *);
-		int restart;
 
 		timer = rb_entry(node, struct hrtimer, node);
 		if (base->softirq_time.tv64 <= timer->expires.tv64)
 			break;
 
-#ifdef CONFIG_HIGH_RES_TIMERS
-		WARN_ON_ONCE(timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ);
-#endif
-		timer_stats_account_hrtimer(timer);
-
-		fn = timer->function;
-		__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
-		spin_unlock_irq(&cpu_base->lock);
-
-		restart = fn(timer);
-
-		spin_lock_irq(&cpu_base->lock);
-
-		timer->state &= ~HRTIMER_STATE_CALLBACK;
-		if (restart != HRTIMER_NORESTART) {
-			BUG_ON(hrtimer_active(timer));
-			enqueue_hrtimer(timer, base, 0);
+		if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
+			__remove_hrtimer(timer, base, HRTIMER_STATE_PENDING, 0);
+			list_add_tail(&timer->cb_entry,
+					&base->cpu_base->cb_pending);
+			continue;
 		}
+
+		__run_hrtimer(timer);
 	}
 	spin_unlock_irq(&cpu_base->lock);
 }
 
-/*
- * Called from timer softirq every jiffy, expire hrtimers:
- *
- * For HRT its the fall back code to run the softirq in the timer
- * softirq context in case the hrtimer initialization failed or has
- * not been done yet.
- */
 void hrtimer_run_queues(void)
 {
 	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
@@ -1247,18 +1264,6 @@ void hrtimer_run_queues(void)
 	if (hrtimer_hres_active())
 		return;
 
-	/*
-	 * This _is_ ugly: We have to check in the softirq context,
-	 * whether we can switch to highres and / or nohz mode. The
-	 * clocksource switch happens in the timer interrupt with
-	 * xtime_lock held. Notification from there only sets the
-	 * check bit in the tick_oneshot code, otherwise we might
-	 * deadlock vs. xtime_lock.
-	 */
-	if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
-		if (hrtimer_switch_to_hres())
-			return;
-
 	hrtimer_get_softirq_time(cpu_base);
 
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
Index: linux-2.6/kernel/timer.c
===================================================================
--- linux-2.6.orig/kernel/timer.c
+++ linux-2.6/kernel/timer.c
@@ -896,7 +896,7 @@ static void run_timer_softirq(struct sof
 {
 	tvec_base_t *base = __get_cpu_var(tvec_bases);
 
-	hrtimer_run_queues();
+	hrtimer_run_pending();
 
 	if (time_after_eq(jiffies, base->timer_jiffies))
 		__run_timers(base);
@@ -907,6 +907,7 @@ static void run_timer_softirq(struct sof
  */
 void run_local_timers(void)
 {
+	hrtimer_run_queues();
 	raise_softirq(TIMER_SOFTIRQ);
 	softlockup_tick();
 }
Index: linux-2.6/include/linux/hrtimer.h
===================================================================
--- linux-2.6.orig/include/linux/hrtimer.h
+++ linux-2.6/include/linux/hrtimer.h
@@ -319,6 +319,7 @@ extern void hrtimer_init_sleeper(struct 
 
 /* Soft interrupt function to run the hrtimer queues: */
 extern void hrtimer_run_queues(void);
+extern void hrtimer_run_pending(void);
 
 /* Bootup initialization: */
 extern void __init hrtimers_init(void);

--


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 5/7] sched: rt-group: reduce rescheduling
  2008-01-04 13:54 [PATCH 0/7] more rt group sched updates Peter Zijlstra
                   ` (3 preceding siblings ...)
  2008-01-04 13:55 ` [PATCH 4/7] hrtimer: fixup the HRTIMER_CB_IRQSAFE_NO_SOFTIRQ fallback Peter Zijlstra
@ 2008-01-04 13:55 ` Peter Zijlstra
  2008-01-04 13:55 ` [PATCH 6/7] sched: rt-group: per group period Peter Zijlstra
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 11+ messages in thread
From: Peter Zijlstra @ 2008-01-04 13:55 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner

[-- Attachment #1: sched-rt-group-fix-enqueue.patch --]
[-- Type: text/plain, Size: 748 bytes --]

Only reschedule if the new group has a higher prio task.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/sched_rt.c |    5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -94,8 +94,11 @@ static void sched_rt_ratio_enqueue(struc
 	struct sched_rt_entity *rt_se = rt_rq->rt_se;
 
 	if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) {
+		struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
+
 		enqueue_rt_entity(rt_se);
-		resched_task(rq_of_rt_rq(rt_rq)->curr);
+		if (rt_rq->highest_prio < curr->prio)
+			resched_task(curr);
 	}
 }
 

--


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 6/7] sched: rt-group: per group period
  2008-01-04 13:54 [PATCH 0/7] more rt group sched updates Peter Zijlstra
                   ` (4 preceding siblings ...)
  2008-01-04 13:55 ` [PATCH 5/7] sched: rt-group: reduce rescheduling Peter Zijlstra
@ 2008-01-04 13:55 ` Peter Zijlstra
  2008-01-05 14:51   ` Peter Zijlstra
  2008-01-04 13:55 ` [PATCH 7/7] sched: rt-group: deal with PI Peter Zijlstra
  2008-01-05 13:32 ` [PATCH 0/7] more rt group sched updates Ingo Molnar
  7 siblings, 1 reply; 11+ messages in thread
From: Peter Zijlstra @ 2008-01-04 13:55 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner

[-- Attachment #1: sched-rt-rq-hrtimer.patch --]
[-- Type: text/plain, Size: 14318 bytes --]

Steven asked for per group periods in order to get closer to RMA or EDF
scheduling.

Use the fancy new hrtimers to provide a per group period

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/sched.h    |    2 
 kernel/sched.c           |  225 +++++++++++++++++++++++++++++++++++++++++------
 kernel/sched_rt.c        |   61 ++++++------
 kernel/sysctl.c          |    2 
 kernel/time/tick-sched.c |    5 -
 5 files changed, 232 insertions(+), 63 deletions(-)

Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -230,8 +230,6 @@ static inline int select_nohz_load_balan
 }
 #endif
 
-extern unsigned long rt_needs_cpu(int cpu);
-
 /*
  * Only dump TASK_* tasks. (0 for all tasks)
  */
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -177,6 +177,7 @@ struct task_group {
 	struct rt_rq **rt_rq;
 
 	unsigned int rt_ratio;
+	ktime_t rt_period;
 
 	/*
 	 * shares assigned to a task group governs how much of cpu bandwidth
@@ -372,6 +373,7 @@ struct rt_rq {
 #endif
 	int rt_throttled;
 	u64 rt_time;
+	struct hrtimer rt_period_timer;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	struct rq *rq;
@@ -441,8 +443,6 @@ struct rq {
 
 	struct cfs_rq cfs;
 	struct rt_rq rt;
-	u64 rt_period_expire;
-	int rt_throttled;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* list of leaf cfs_rq on this cpu: */
@@ -595,23 +595,6 @@ static void update_rq_clock(struct rq *r
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 
-unsigned long rt_needs_cpu(int cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-	u64 delta;
-
-	if (!rq->rt_throttled)
-		return 0;
-
-	if (rq->clock > rq->rt_period_expire)
-		return 1;
-
-	delta = rq->rt_period_expire - rq->clock;
-	do_div(delta, NSEC_PER_SEC / HZ);
-
-	return (unsigned long)delta;
-}
-
 /*
  * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
  */
@@ -652,10 +635,10 @@ const_debug unsigned int sysctl_sched_fe
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
 
 /*
- * period over which we measure -rt task cpu usage in ms.
+ * period over which we measure -rt task cpu usage in us.
  * default: 1s
  */
-const_debug unsigned int sysctl_sched_rt_period = 1000;
+const_debug unsigned int sysctl_sched_rt_period = 1000000;
 
 #define SCHED_RT_FRAC_SHIFT	16
 #define SCHED_RT_FRAC		(1UL << SCHED_RT_FRAC_SHIFT)
@@ -664,7 +647,7 @@ const_debug unsigned int sysctl_sched_rt
  * ratio of time -rt tasks may consume.
  * default: 95%
  */
-const_debug unsigned int sysctl_sched_rt_ratio = 62259;
+const_debug unsigned int sysctl_sched_rt_ratio = 32768; //62259;
 
 /*
  * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
@@ -1245,6 +1228,12 @@ static unsigned long cpu_avg_load_per_ta
 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 #endif /* CONFIG_SMP */
 
+static inline ktime_t ns_to_ktime(u64 ns)
+{
+	static const ktime_t ktime_zero = { .tv64 = 0 };
+	return ktime_add_ns(ktime_zero, ns);
+}
+
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -3741,7 +3730,6 @@ void scheduler_tick(void)
 	rq->tick_timestamp = rq->clock;
 	update_cpu_load(rq);
 	curr->sched_class->task_tick(rq, curr, 0);
-	update_sched_rt_period(rq);
 	spin_unlock(&rq->lock);
 
 #ifdef CONFIG_SMP
@@ -5287,6 +5275,152 @@ static inline void sched_init_granularit
 	sysctl_sched_batch_wakeup_granularity *= factor;
 }
 
+static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
+{
+	struct rt_rq *rt_rq =
+		container_of(timer, struct rt_rq, rt_period_timer);
+	struct rq *rq = rq_of_rt_rq(rt_rq);
+	ktime_t now = ktime_get();
+
+	WARN_ON(smp_processor_id() != cpu_of(rq));
+	WARN_ON(!in_irq());
+
+	spin_lock(&rq->lock);
+	update_sched_rt_period(rt_rq);
+	spin_unlock(&rq->lock);
+
+	hrtimer_forward(timer, now, sched_rt_period(rt_rq));
+	return HRTIMER_RESTART;
+}
+
+static void sched_rt_period_start(struct rt_rq *rt_rq)
+{
+	ktime_t period = sched_rt_period(rt_rq);
+
+	WARN_ON(smp_processor_id() != cpu_of(rq_of_rt_rq(rt_rq)));
+
+	for (;;) {
+		ktime_t now = ktime_get();
+		hrtimer_forward(&rt_rq->rt_period_timer, now, period);
+		hrtimer_start(&rt_rq->rt_period_timer,
+				rt_rq->rt_period_timer.expires,
+				HRTIMER_MODE_ABS);
+		if (hrtimer_active(&rt_rq->rt_period_timer))
+			break;
+	}
+}
+
+static void sched_rt_period_stop(struct rt_rq *rt_rq)
+{
+	hrtimer_cancel(&rt_rq->rt_period_timer);
+}
+
+static void sched_rt_period_start_cpu(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	struct rt_rq *rt_rq;
+
+	for_each_leaf_rt_rq(rt_rq, rq)
+		sched_rt_period_start(rt_rq);
+}
+
+#ifdef CONFIG_SMP
+static void sched_rt_period_stop_cpu(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	struct rt_rq *rt_rq;
+
+	for_each_leaf_rt_rq(rt_rq, rq)
+		sched_rt_period_stop(rt_rq);
+}
+
+static int sched_rt_period_hotplug(struct notifier_block *nfb,
+		unsigned long action, void *hcpu)
+{
+	int cpu = (unsigned long)hcpu;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+	case CPU_DOWN_FAILED:
+	case CPU_DOWN_FAILED_FROZEN:
+		sched_rt_period_start_cpu(cpu);
+		return NOTIFY_OK;
+
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+		sched_rt_period_stop_cpu(cpu);
+		return NOTIFY_OK;
+
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		return NOTIFY_OK;
+
+	default:
+		return NOTIFY_DONE;
+	}
+
+	return NOTIFY_OK;
+}
+
+static void __init __sched_rt_period_init(void *arg)
+{
+	int cpu = smp_processor_id();
+	sched_rt_period_start_cpu(cpu);
+}
+
+static void __init sched_rt_period_init(void)
+{
+	on_each_cpu(__sched_rt_period_init, NULL, 0, 1);
+	hotcpu_notifier(sched_rt_period_hotplug, 0);
+}
+
+static void __sched_rt_period_init_tg(void *arg)
+{
+	struct task_group *tg = arg;
+	int cpu = smp_processor_id();
+
+	sched_rt_period_start(tg->rt_rq[cpu]);
+}
+
+static void sched_rt_period_init_tg(struct task_group *tg)
+{
+	on_each_cpu(__sched_rt_period_init_tg, tg, 0, 1);
+}
+
+static void __sched_rt_period_destroy_tg(void *arg)
+{
+	struct task_group *tg = arg;
+	int cpu = smp_processor_id();
+
+	sched_rt_period_stop(tg->rt_rq[cpu]);
+}
+
+static void sched_rt_period_destroy_tg(struct task_group *tg)
+{
+	on_each_cpu(__sched_rt_period_destroy_tg, tg, 0, 1);
+}
+#else
+static void __init sched_rt_period_init(void)
+{
+	sched_rt_period_start_cpu(0);
+}
+
+static void sched_rt_period_init_tg(struct task_group *tg)
+{
+	sched_rt_period_start(tg->rt_rq[0]);
+}
+
+static void sched_rt_period_destroy_tg(struct task_group *tg)
+{
+	sched_rt_period_stop(tg->rt_rq[0]);
+}
+#endif
+
 #ifdef CONFIG_SMP
 /*
  * This is how migration works:
@@ -7068,6 +7202,7 @@ void __init sched_init_smp(void)
 	if (set_cpus_allowed(current, non_isolated_cpus) < 0)
 		BUG();
 	sched_init_granularity();
+	sched_rt_period_init();
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	if (nr_cpu_ids == 1)
@@ -7088,6 +7223,7 @@ void __init sched_init_smp(void)
 void __init sched_init_smp(void)
 {
 	sched_init_granularity();
+	sched_rt_period_init();
 }
 #endif /* CONFIG_SMP */
 
@@ -7131,6 +7267,11 @@ static void init_rt_rq(struct rt_rq *rt_
 	rt_rq->rt_time = 0;
 	rt_rq->rt_throttled = 0;
 
+	hrtimer_init(&rt_rq->rt_period_timer,
+			CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	rt_rq->rt_period_timer.function = sched_rt_period_timer;
+	rt_rq->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	rt_rq->rq = rq;
 #endif
@@ -7201,6 +7342,8 @@ void __init sched_init(void)
 				&per_cpu(init_sched_entity, i), i, 1);
 
 		init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */
+		init_task_group.rt_period =
+			ns_to_ktime(sysctl_sched_rt_period * NSEC_PER_USEC);
 		INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
 		init_tg_rt_entry(rq, &init_task_group,
 				&per_cpu(init_rt_rq, i),
@@ -7208,8 +7351,6 @@ void __init sched_init(void)
 
 		list_add(&init_task_group.list, &task_groups);
 #endif
-		rq->rt_period_expire = 0;
-		rq->rt_throttled = 0;
 
 		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
 			rq->cpu_load[j] = 0;
@@ -7598,6 +7739,7 @@ struct task_group *sched_create_group(vo
 
 	tg->shares = NICE_0_LOAD;
 	tg->rt_ratio = 0; /* XXX */
+	tg->rt_period = ns_to_ktime(sysctl_sched_rt_period * NSEC_PER_USEC);
 
 	for_each_possible_cpu(i) {
 		rq = cpu_rq(i);
@@ -7637,6 +7779,8 @@ struct task_group *sched_create_group(vo
 	list_add_rcu(&tg->list, &task_groups);
 	unlock_task_group_list();
 
+	sched_rt_period_init_tg(tg);
+
 	return tg;
 
 err:
@@ -7658,6 +7802,8 @@ void sched_destroy_group(struct task_gro
 	struct rt_rq *rt_rq = NULL;
 	int i;
 
+	sched_rt_period_destroy_tg(tg);
+
 	lock_task_group_list();
 	for_each_possible_cpu(i) {
 		cfs_rq = tg->cfs_rq[i];
@@ -7815,6 +7961,19 @@ unsigned long sched_group_rt_ratio(struc
 	return tg->rt_ratio;
 }
 
+int sched_group_set_rt_period(struct task_group *tg, unsigned long rt_period)
+{
+	tg->rt_period = ns_to_ktime((u64)rt_period * NSEC_PER_USEC);
+	return 0;
+}
+
+unsigned long sched_group_rt_period(struct task_group *tg)
+{
+	u64 ns = ktime_to_ns(tg->rt_period);
+	do_div(ns, NSEC_PER_USEC);
+	return ns;
+}
+
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_FAIR_CGROUP_SCHED
@@ -7903,6 +8062,17 @@ static u64 cpu_rt_ratio_read_uint(struct
 	return (u64) tg->rt_ratio;
 }
 
+static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
+		u64 rt_period_val)
+{
+	return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_val);
+}
+
+static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
+{
+	return (u64) sched_group_rt_period(cgroup_tg(cgrp));
+}
+
 static struct cftype cpu_files[] = {
 	{
 		.name = "shares",
@@ -7914,6 +8084,11 @@ static struct cftype cpu_files[] = {
 		.read_uint = cpu_rt_ratio_read_uint,
 		.write_uint = cpu_rt_ratio_write_uint,
 	},
+	{
+		.name = "rt_period_us",
+		.read_uint = cpu_rt_period_read_uint,
+		.write_uint = cpu_rt_period_write_uint,
+	},
 };
 
 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -65,6 +65,17 @@ static inline unsigned int sched_rt_rati
 	return rt_rq->tg->rt_ratio;
 }
 
+static inline ktime_t sched_rt_period(struct rt_rq *rt_rq)
+{
+	BUG_ON(!rt_rq->tg);
+	return rt_rq->tg->rt_period;
+}
+
+static inline u64 sched_rt_period_ns(struct rt_rq *rt_rq)
+{
+	return ktime_to_ns(sched_rt_period(rt_rq));
+}
+
 #define for_each_leaf_rt_rq(rt_rq, rq) \
 	list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
 
@@ -117,6 +128,16 @@ static inline unsigned int sched_rt_rati
 	return sysctl_sched_rt_ratio;
 }
 
+static inline ktime_t sched_rt_period(struct rt_rq *rt_rq)
+{
+	return ns_to_ktime((u64)sysctl_sched_rt_period * NSEC_PER_USEC);
+}
+
+static inline u64 sched_rt_period_ns(struct rt_rq *rt_rq)
+{
+	return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
+}
+
 #define for_each_leaf_rt_rq(rt_rq, rq) \
 	for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
 
@@ -174,15 +195,11 @@ static int sched_rt_ratio_exceeded(struc
 	if (rt_rq->rt_throttled)
 		return 1;
 
-	period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
+	period = sched_rt_period_ns(rt_rq);
 	ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
 
 	if (rt_rq->rt_time > ratio) {
-		struct rq *rq = rq_of_rt_rq(rt_rq);
-
-		rq->rt_throttled = 1;
 		rt_rq->rt_throttled = 1;
-
 		sched_rt_ratio_dequeue(rt_rq);
 		return 1;
 	}
@@ -190,27 +207,16 @@ static int sched_rt_ratio_exceeded(struc
 	return 0;
 }
 
-static void update_sched_rt_period(struct rq *rq)
+static void update_sched_rt_period(struct rt_rq *rt_rq)
 {
-	struct rt_rq *rt_rq;
-	u64 period;
-
-	while (rq->clock > rq->rt_period_expire) {
-		period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
-		rq->rt_period_expire += period;
-
-		for_each_leaf_rt_rq(rt_rq, rq) {
-			unsigned long rt_ratio = sched_rt_ratio(rt_rq);
-			u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
-
-			rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
-			if (rt_rq->rt_throttled) {
-				rt_rq->rt_throttled = 0;
-				sched_rt_ratio_enqueue(rt_rq);
-			}
-		}
-
-		rq->rt_throttled = 0;
+	u64 period = sched_rt_period_ns(rt_rq);
+	unsigned long rt_ratio = sched_rt_ratio(rt_rq);
+	u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
+
+	rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
+	if (rt_rq->rt_throttled) {
+		rt_rq->rt_throttled = 0;
+		sched_rt_ratio_enqueue(rt_rq);
 	}
 }
 
@@ -238,11 +244,6 @@ static void update_curr_rt(struct rq *rq
 	cpuacct_charge(curr, delta_exec);
 
 	rt_rq->rt_time += delta_exec;
-	/*
-	 * might make it a tad more accurate:
-	 *
-	 * update_sched_rt_period(rq);
-	 */
 	if (sched_rt_ratio_exceeded(rt_rq))
 		resched_task(curr);
 }
Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -311,7 +311,7 @@ static struct ctl_table kern_table[] = {
 	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "sched_rt_period_ms",
+		.procname	= "sched_rt_period_us",
 		.data		= &sysctl_sched_rt_period,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
Index: linux-2.6/kernel/time/tick-sched.c
===================================================================
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -153,7 +153,6 @@ void tick_nohz_update_jiffies(void)
 void tick_nohz_stop_sched_tick(void)
 {
 	unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
-	unsigned long rt_jiffies;
 	struct tick_sched *ts;
 	ktime_t last_update, expires, now, delta;
 	struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
@@ -217,10 +216,6 @@ void tick_nohz_stop_sched_tick(void)
 	next_jiffies = get_next_timer_interrupt(last_jiffies);
 	delta_jiffies = next_jiffies - last_jiffies;
 
-	rt_jiffies = rt_needs_cpu(cpu);
-	if (rt_jiffies && rt_jiffies < delta_jiffies)
-		delta_jiffies = rt_jiffies;
-
 	if (rcu_needs_cpu(cpu))
 		delta_jiffies = 1;
 	/*

--


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 7/7] sched: rt-group: deal with PI
  2008-01-04 13:54 [PATCH 0/7] more rt group sched updates Peter Zijlstra
                   ` (5 preceding siblings ...)
  2008-01-04 13:55 ` [PATCH 6/7] sched: rt-group: per group period Peter Zijlstra
@ 2008-01-04 13:55 ` Peter Zijlstra
  2008-01-05 13:32 ` [PATCH 0/7] more rt group sched updates Ingo Molnar
  7 siblings, 0 replies; 11+ messages in thread
From: Peter Zijlstra @ 2008-01-04 13:55 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner

[-- Attachment #1: sched-rt-group-pi.patch --]
[-- Type: text/plain, Size: 4082 bytes --]

Steven mentioned the fun case where a lock-holding task will be throttled.

Simple fix: allow groups that have boosted tasks to run anyway.
This is of course not quite correct. Needs more tricks.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/sched.c    |    3 +++
 kernel/sched_rt.c |   50 ++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 45 insertions(+), 8 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -376,6 +376,8 @@ struct rt_rq {
 	struct hrtimer rt_period_timer;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	unsigned long rt_nr_boosted;
+
 	struct rq *rq;
 	struct list_head leaf_rt_rq_list;
 	struct task_group *tg;
@@ -7273,6 +7275,7 @@ static void init_rt_rq(struct rt_rq *rt_
 	rt_rq->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	rt_rq->rt_nr_boosted = 0;
 	rt_rq->rq = rq;
 #endif
 }
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -121,6 +121,11 @@ static void sched_rt_ratio_dequeue(struc
 		dequeue_rt_entity(rt_se);
 }
 
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+	return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
+}
+
 #else
 
 static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
@@ -170,6 +175,10 @@ static inline void sched_rt_ratio_dequeu
 {
 }
 
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+	return rt_rq->rt_throttled;
+}
 #endif
 
 static inline int rt_se_prio(struct sched_rt_entity *rt_se)
@@ -184,27 +193,42 @@ static inline int rt_se_prio(struct sche
 	return rt_task_of(rt_se)->prio;
 }
 
+static int rt_se_boosted(struct sched_rt_entity *rt_se)
+{
+	struct task_struct *p;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	struct rt_rq *rt_rq = group_rt_rq(rt_se);
+
+	if (rt_rq)
+		return !!rt_rq->rt_nr_boosted;
+#endif
+
+	p = rt_task_of(rt_se);
+	return p->prio != p->normal_prio;
+}
+
 static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq)
 {
 	unsigned int rt_ratio = sched_rt_ratio(rt_rq);
 	u64 period, ratio;
 
 	if (rt_ratio == SCHED_RT_FRAC)
-		return 0;
+		goto out;
 
 	if (rt_rq->rt_throttled)
-		return 1;
+		goto out;
 
 	period = sched_rt_period_ns(rt_rq);
 	ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
 
 	if (rt_rq->rt_time > ratio) {
 		rt_rq->rt_throttled = 1;
-		sched_rt_ratio_dequeue(rt_rq);
-		return 1;
+		if (rt_rq_throttled(rt_rq))
+			sched_rt_ratio_dequeue(rt_rq);
 	}
 
-	return 0;
+out:
+	return rt_rq_throttled(rt_rq);
 }
 
 static void update_sched_rt_period(struct rt_rq *rt_rq)
@@ -265,6 +289,10 @@ void inc_rt_tasks(struct sched_rt_entity
 
 	update_rt_migration(rq_of_rt_rq(rt_rq));
 #endif
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	if (rt_se_boosted(rt_se))
+		rt_rq->rt_nr_boosted++;
+#endif
 }
 
 static inline
@@ -295,6 +323,12 @@ void dec_rt_tasks(struct sched_rt_entity
 
 	update_rt_migration(rq_of_rt_rq(rt_rq));
 #endif /* CONFIG_SMP */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	if (rt_se_boosted(rt_se))
+		rt_rq->rt_nr_boosted--;
+
+	WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
+#endif
 }
 
 static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
@@ -303,7 +337,7 @@ static void enqueue_rt_entity(struct sch
 	struct rt_prio_array *array = &rt_rq->active;
 	struct rt_rq *group_rq = group_rt_rq(rt_se);
 
-	if (group_rq && group_rq->rt_throttled)
+	if (group_rq && rt_rq_throttled(group_rq))
 		return;
 
 	list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
@@ -476,7 +510,7 @@ static struct sched_rt_entity *pick_next
 	struct list_head *queue;
 	int idx;
 
-	if (sched_rt_ratio_exceeded(rt_rq))
+	if (rt_rq_throttled(rt_rq))
 		goto out;
 
 	idx = sched_find_first_bit(array->bitmap);
@@ -500,7 +534,7 @@ static struct task_struct *pick_next_tas
 	if (unlikely(!rt_rq->rt_nr_running))
 		return NULL;
 
-	if (sched_rt_ratio_exceeded(rt_rq))
+	if (rt_rq_throttled(rt_rq))
 		return NULL;
 
 	do {

--


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 0/7] more rt group sched updates
  2008-01-04 13:54 [PATCH 0/7] more rt group sched updates Peter Zijlstra
                   ` (6 preceding siblings ...)
  2008-01-04 13:55 ` [PATCH 7/7] sched: rt-group: deal with PI Peter Zijlstra
@ 2008-01-05 13:32 ` Ingo Molnar
  7 siblings, 0 replies; 11+ messages in thread
From: Ingo Molnar @ 2008-01-05 13:32 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: LKML, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Thomas Gleixner


* Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> includes the two patches from yesterday,
> 
> series against sched-devel

thanks, applied.

	Ingo

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 6/7] sched: rt-group: per group period
  2008-01-04 13:55 ` [PATCH 6/7] sched: rt-group: per group period Peter Zijlstra
@ 2008-01-05 14:51   ` Peter Zijlstra
  2008-01-05 15:05     ` Ingo Molnar
  0 siblings, 1 reply; 11+ messages in thread
From: Peter Zijlstra @ 2008-01-05 14:51 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Thomas Gleixner

Could you please fold this into the 6/7 patch.

It reverts a wandering chunk (the 32768 thing), but more importantly
it fixes !FAIR_GROUP_SCHED compilation.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/sched.c |   10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -647,7 +647,7 @@ const_debug unsigned int sysctl_sched_rt
  * ratio of time -rt tasks may consume.
  * default: 95%
  */
-const_debug unsigned int sysctl_sched_rt_ratio = 32768; //62259;
+const_debug unsigned int sysctl_sched_rt_ratio = 62259;
 
 /*
  * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
@@ -5379,6 +5379,7 @@ static void __init sched_rt_period_init(
 	hotcpu_notifier(sched_rt_period_hotplug, 0);
 }
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 static void __sched_rt_period_init_tg(void *arg)
 {
 	struct task_group *tg = arg;
@@ -5404,12 +5405,14 @@ static void sched_rt_period_destroy_tg(s
 {
 	on_each_cpu(__sched_rt_period_destroy_tg, tg, 0, 1);
 }
-#else
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+#else /* CONFIG_SMP */
 static void __init sched_rt_period_init(void)
 {
 	sched_rt_period_start_cpu(0);
 }
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 static void sched_rt_period_init_tg(struct task_group *tg)
 {
 	sched_rt_period_start(tg->rt_rq[0]);
@@ -5419,7 +5422,8 @@ static void sched_rt_period_destroy_tg(s
 {
 	sched_rt_period_stop(tg->rt_rq[0]);
 }
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* CONFIG_SMP */
 
 #ifdef CONFIG_SMP
 /*



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 6/7] sched: rt-group: per group period
  2008-01-05 14:51   ` Peter Zijlstra
@ 2008-01-05 15:05     ` Ingo Molnar
  0 siblings, 0 replies; 11+ messages in thread
From: Ingo Molnar @ 2008-01-05 15:05 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: LKML, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Thomas Gleixner


* Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> Could you please fold this into the 6/7 patch.
> 
> It reverts a wandering chunk (the 32768 thing), but more importantly
> it fixes !FAIR_GROUP_SCHED compilation.

done. Btw., there's a new warning:

kernel/sched_rt.c:197: warning: 'rt_se_boosted' defined but not used

	Ingo

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2008-01-05 15:06 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-01-04 13:54 [PATCH 0/7] more rt group sched updates Peter Zijlstra
2008-01-04 13:54 ` [PATCH 1/7] sched: rt throttling vs no_hz Peter Zijlstra
2008-01-04 13:54 ` [PATCH 2/7] sched: load_balance_monitor rename Peter Zijlstra
2008-01-04 13:55 ` [PATCH 3/7] hrtimer: clean up cpu->base locking tricks Peter Zijlstra
2008-01-04 13:55 ` [PATCH 4/7] hrtimer: fixup the HRTIMER_CB_IRQSAFE_NO_SOFTIRQ fallback Peter Zijlstra
2008-01-04 13:55 ` [PATCH 5/7] sched: rt-group: reduce rescheduling Peter Zijlstra
2008-01-04 13:55 ` [PATCH 6/7] sched: rt-group: per group period Peter Zijlstra
2008-01-05 14:51   ` Peter Zijlstra
2008-01-05 15:05     ` Ingo Molnar
2008-01-04 13:55 ` [PATCH 7/7] sched: rt-group: deal with PI Peter Zijlstra
2008-01-05 13:32 ` [PATCH 0/7] more rt group sched updates Ingo Molnar

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox