[PATCH 0/7] more rt group sched updates

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 0/7] more rt group sched updates
@ 2008-01-04 13:54 Peter Zijlstra
  2008-01-04 13:54 ` [PATCH 1/7] sched: rt throttling vs no_hz Peter Zijlstra
                   ` (7 more replies)
  0 siblings, 8 replies; 11+ messages in thread
From: Peter Zijlstra @ 2008-01-04 13:54 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner

includes the two patches from yesterday,

series against sched-devel



^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 1/7] sched: rt throttling vs no_hz
  2008-01-04 13:54 [PATCH 0/7] more rt group sched updates Peter Zijlstra
@ 2008-01-04 13:54 ` Peter Zijlstra
  2008-01-04 13:54 ` [PATCH 2/7] sched: load_balance_monitor rename Peter Zijlstra
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 11+ messages in thread
From: Peter Zijlstra @ 2008-01-04 13:54 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner

[-- Attachment #1: sched-rt-group-update.patch --]
[-- Type: text/plain, Size: 4621 bytes --]

We need to teach no_hz about the rt throttling because its tick driven.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/sched.h    |    2 ++
 kernel/sched.c           |   23 ++++++++++++++++++++++-
 kernel/sched_rt.c        |   30 ++++++++++++++++--------------
 kernel/time/tick-sched.c |    5 +++++
 4 files changed, 45 insertions(+), 15 deletions(-)

Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -230,6 +230,8 @@ static inline int select_nohz_load_balan
 }
 #endif
 
+extern unsigned long rt_needs_cpu(int cpu);
+
 /*
  * Only dump TASK_* tasks. (0 for all tasks)
  */
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -442,6 +442,7 @@ struct rq {
 	struct cfs_rq cfs;
 	struct rt_rq rt;
 	u64 rt_period_expire;
+	int rt_throttled;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* list of leaf cfs_rq on this cpu: */
@@ -594,6 +595,23 @@ static void update_rq_clock(struct rq *r
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 
+unsigned long rt_needs_cpu(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	u64 delta;
+
+	if (!rq->rt_throttled)
+		return 0;
+
+	if (rq->clock > rq->rt_period_expire)
+		return 1;
+
+	delta = rq->rt_period_expire - rq->clock;
+	do_div(delta, NSEC_PER_SEC / HZ);
+
+	return (unsigned long)delta;
+}
+
 /*
  * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
  */
@@ -7099,9 +7117,11 @@ static void init_rt_rq(struct rt_rq *rt_
 	/* delimiter for bitsearch: */
 	__set_bit(MAX_RT_PRIO, array->bitmap);
 
+#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
+	rt_rq->highest_prio = MAX_RT_PRIO;
+#endif
 #ifdef CONFIG_SMP
 	rt_rq->rt_nr_migratory = 0;
-	rt_rq->highest_prio = MAX_RT_PRIO;
 	rt_rq->overloaded = 0;
 #endif
 
@@ -7186,6 +7206,7 @@ void __init sched_init(void)
 		list_add(&init_task_group.list, &task_groups);
 #endif
 		rq->rt_period_expire = 0;
+		rq->rt_throttled = 0;
 
 		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
 			rq->cpu_load[j] = 0;
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -175,7 +175,11 @@ static int sched_rt_ratio_exceeded(struc
 	ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
 
 	if (rt_rq->rt_time > ratio) {
+		struct rq *rq = rq_of_rt_rq(rt_rq);
+
+		rq->rt_throttled = 1;
 		rt_rq->rt_throttled = 1;
+
 		sched_rt_ratio_dequeue(rt_rq);
 		return 1;
 	}
@@ -183,18 +187,6 @@ static int sched_rt_ratio_exceeded(struc
 	return 0;
 }
 
-static void __update_sched_rt_period(struct rt_rq *rt_rq, u64 period)
-{
-	unsigned long rt_ratio = sched_rt_ratio(rt_rq);
-	u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
-
-	rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
-	if (rt_rq->rt_throttled) {
-		rt_rq->rt_throttled = 0;
-		sched_rt_ratio_enqueue(rt_rq);
-	}
-}
-
 static void update_sched_rt_period(struct rq *rq)
 {
 	struct rt_rq *rt_rq;
@@ -204,8 +196,18 @@ static void update_sched_rt_period(struc
 		period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
 		rq->rt_period_expire += period;
 
-		for_each_leaf_rt_rq(rt_rq, rq)
-			__update_sched_rt_period(rt_rq, period);
+		for_each_leaf_rt_rq(rt_rq, rq) {
+			unsigned long rt_ratio = sched_rt_ratio(rt_rq);
+			u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
+
+			rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
+			if (rt_rq->rt_throttled) {
+				rt_rq->rt_throttled = 0;
+				sched_rt_ratio_enqueue(rt_rq);
+			}
+		}
+
+		rq->rt_throttled = 0;
 	}
 }
 
Index: linux-2.6/kernel/time/tick-sched.c
===================================================================
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -153,6 +153,7 @@ void tick_nohz_update_jiffies(void)
 void tick_nohz_stop_sched_tick(void)
 {
 	unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
+	unsigned long rt_jiffies;
 	struct tick_sched *ts;
 	ktime_t last_update, expires, now, delta;
 	struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
@@ -216,6 +217,10 @@ void tick_nohz_stop_sched_tick(void)
 	next_jiffies = get_next_timer_interrupt(last_jiffies);
 	delta_jiffies = next_jiffies - last_jiffies;
 
+	rt_jiffies = rt_needs_cpu(cpu);
+	if (rt_jiffies && rt_jiffies < delta_jiffies)
+		delta_jiffies = rt_jiffies;
+
 	if (rcu_needs_cpu(cpu))
 		delta_jiffies = 1;
 	/*

--


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 2/7] sched: load_balance_monitor rename
  2008-01-04 13:54 [PATCH 0/7] more rt group sched updates Peter Zijlstra
  2008-01-04 13:54 ` [PATCH 1/7] sched: rt throttling vs no_hz Peter Zijlstra
@ 2008-01-04 13:54 ` Peter Zijlstra
  2008-01-04 13:55 ` [PATCH 3/7] hrtimer: clean up cpu->base locking tricks Peter Zijlstra
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 11+ messages in thread
From: Peter Zijlstra @ 2008-01-04 13:54 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner

[-- Attachment #1: sched-group-fixes.patch --]
[-- Type: text/plain, Size: 830 bytes --]

don't start the load_balance_monitor when there is only a single cpu.
rename the kthread because its currently longer than TASK_COMM_LEN

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/sched.c |    5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -7070,8 +7070,11 @@ void __init sched_init_smp(void)
 	sched_init_granularity();
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	if (nr_cpu_ids == 1)
+		return;
+
 	lb_monitor_task = kthread_create(load_balance_monitor, NULL,
-					 "load_balance_monitor");
+					 "group_balance");
 	if (!IS_ERR(lb_monitor_task)) {
 		lb_monitor_task->flags |= PF_NOFREEZE;
 		wake_up_process(lb_monitor_task);

--


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 3/7] hrtimer: clean up cpu->base locking tricks
  2008-01-04 13:54 [PATCH 0/7] more rt group sched updates Peter Zijlstra
  2008-01-04 13:54 ` [PATCH 1/7] sched: rt throttling vs no_hz Peter Zijlstra
  2008-01-04 13:54 ` [PATCH 2/7] sched: load_balance_monitor rename Peter Zijlstra
@ 2008-01-04 13:55 ` Peter Zijlstra
  2008-01-04 13:55 ` [PATCH 4/7] hrtimer: fixup the HRTIMER_CB_IRQSAFE_NO_SOFTIRQ fallback Peter Zijlstra
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 11+ messages in thread
From: Peter Zijlstra @ 2008-01-04 13:55 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner

[-- Attachment #1: hrtimer-unlocked-callback.patch --]
[-- Type: text/plain, Size: 3319 bytes --]

In order to more easily allow for the scheduler to use timers, clean up
the locking a bit.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/hrtimer.c         |  109 +++++++++++++++++++++++++++++++++++++++++++----
 kernel/time/tick-sched.c |    8 ---
 2 files changed, 102 insertions(+), 15 deletions(-)

Index: linux-2.6/kernel/hrtimer.c
===================================================================
--- linux-2.6.orig/kernel/hrtimer.c
+++ linux-2.6/kernel/hrtimer.c
@@ -1063,7 +1063,9 @@ void hrtimer_interrupt(struct clock_even
 		basenow = ktime_add(now, base->offset);
 
 		while ((node = base->first)) {
+			enum hrtimer_restart (*fn)(struct hrtimer *);
 			struct hrtimer *timer;
+			int restart;
 
 			timer = rb_entry(node, struct hrtimer, node);
 
@@ -1091,13 +1093,29 @@ void hrtimer_interrupt(struct clock_even
 					 HRTIMER_STATE_CALLBACK, 0);
 			timer_stats_account_hrtimer(timer);
 
+			fn = timer->function;
+			if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
+				/*
+				 * Used for scheduler timers, avoid lock
+				 * inversion with rq->lock and tasklist_lock.
+				 *
+				 * These timers are required to deal with
+				 * enqueue expiry themselves and are not
+				 * allowed to migrate.
+				 */
+				spin_unlock(&cpu_base->lock);
+				restart = fn(timer);
+				spin_lock(&cpu_base->lock);
+			} else
+				restart = fn(timer);
+
 			/*
 			 * Note: We clear the CALLBACK bit after
 			 * enqueue_hrtimer to avoid reprogramming of
 			 * the event hardware. This happens at the end
 			 * of this function anyway.
 			 */
-			if (timer->function(timer) != HRTIMER_NORESTART) {
+			if (restart != HRTIMER_NORESTART) {
 				BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
 				enqueue_hrtimer(timer, base, 0);
 			}
@@ -1268,7 +1361,7 @@ void hrtimer_init_sleeper(struct hrtimer
 	sl->timer.function = hrtimer_wakeup;
 	sl->task = task;
 #ifdef CONFIG_HIGH_RES_TIMERS
-	sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART;
+	sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
 #endif
 }
 
@@ -1279,6 +1372,8 @@ static int __sched do_nanosleep(struct h
 	do {
 		set_current_state(TASK_INTERRUPTIBLE);
 		hrtimer_start(&t->timer, t->timer.expires, mode);
+		if (!hrtimer_active(&t->timer))
+			t->task = NULL;
 
 		if (likely(t->task))
 			schedule();
Index: linux-2.6/kernel/time/tick-sched.c
===================================================================
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -514,7 +514,6 @@ static enum hrtimer_restart tick_sched_t
 {
 	struct tick_sched *ts =
 		container_of(timer, struct tick_sched, sched_timer);
-	struct hrtimer_cpu_base *base = timer->base->cpu_base;
 	struct pt_regs *regs = get_irq_regs();
 	ktime_t now = ktime_get();
 	int cpu = smp_processor_id();
@@ -552,15 +551,8 @@ static enum hrtimer_restart tick_sched_t
 			touch_softlockup_watchdog();
 			ts->idle_jiffies++;
 		}
-		/*
-		 * update_process_times() might take tasklist_lock, hence
-		 * drop the base lock. sched-tick hrtimers are per-CPU and
-		 * never accessible by userspace APIs, so this is safe to do.
-		 */
-		spin_unlock(&base->lock);
 		update_process_times(user_mode(regs));
 		profile_tick(CPU_PROFILING);
-		spin_lock(&base->lock);
 	}
 
 	/* Do not restart, when we are in the idle loop */

--


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 4/7] hrtimer: fixup the HRTIMER_CB_IRQSAFE_NO_SOFTIRQ fallback
  2008-01-04 13:54 [PATCH 0/7] more rt group sched updates Peter Zijlstra
                   ` (2 preceding siblings ...)
  2008-01-04 13:55 ` [PATCH 3/7] hrtimer: clean up cpu->base locking tricks Peter Zijlstra
@ 2008-01-04 13:55 ` Peter Zijlstra
  2008-01-04 13:55 ` [PATCH 5/7] sched: rt-group: reduce rescheduling Peter Zijlstra
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 11+ messages in thread
From: Peter Zijlstra @ 2008-01-04 13:55 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner

[-- Attachment #1: hrtimer-fallback.patch --]
[-- Type: text/plain, Size: 9833 bytes --]

Currently all highres=off timers are run from softirq context, but
HRTIMER_CB_IRQSAFE_NO_SOFTIRQ timers expect to run from irq context.

Fix this up by splitting it similar to the highres=on case.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/hrtimer.h |    1 
 kernel/hrtimer.c        |  223 ++++++++++++++++++++++++------------------------
 kernel/timer.c          |    3 
 3 files changed, 117 insertions(+), 110 deletions(-)

Index: linux-2.6/kernel/hrtimer.c
===================================================================
--- linux-2.6.orig/kernel/hrtimer.c
+++ linux-2.6/kernel/hrtimer.c
@@ -1030,6 +1030,85 @@ int hrtimer_get_res(const clockid_t whic
 }
 EXPORT_SYMBOL_GPL(hrtimer_get_res);
 
+static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
+{
+	spin_lock_irq(&cpu_base->lock);
+
+	while (!list_empty(&cpu_base->cb_pending)) {
+		enum hrtimer_restart (*fn)(struct hrtimer *);
+		struct hrtimer *timer;
+		int restart;
+
+		timer = list_entry(cpu_base->cb_pending.next,
+				   struct hrtimer, cb_entry);
+
+		timer_stats_account_hrtimer(timer);
+
+		fn = timer->function;
+		__remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
+		spin_unlock_irq(&cpu_base->lock);
+
+		restart = fn(timer);
+
+		spin_lock_irq(&cpu_base->lock);
+
+		timer->state &= ~HRTIMER_STATE_CALLBACK;
+		if (restart == HRTIMER_RESTART) {
+			BUG_ON(hrtimer_active(timer));
+			/*
+			 * Enqueue the timer, allow reprogramming of the event
+			 * device
+			 */
+			enqueue_hrtimer(timer, timer->base, 1);
+		} else if (hrtimer_active(timer)) {
+			/*
+			 * If the timer was rearmed on another CPU, reprogram
+			 * the event device.
+			 */
+			if (timer->base->first == &timer->node)
+				hrtimer_reprogram(timer, timer->base);
+		}
+	}
+	spin_unlock_irq(&cpu_base->lock);
+}
+
+static void __run_hrtimer(struct hrtimer *timer)
+{
+	struct hrtimer_clock_base *base = timer->base;
+	struct hrtimer_cpu_base *cpu_base = base->cpu_base;
+	enum hrtimer_restart (*fn)(struct hrtimer *);
+	int restart;
+
+	__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+	timer_stats_account_hrtimer(timer);
+
+	fn = timer->function;
+	if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
+		/*
+		 * Used for scheduler timers, avoid lock inversion with
+		 * rq->lock and tasklist_lock.
+		 *
+		 * These timers are required to deal with enqueue expiry
+		 * themselves and are not allowed to migrate.
+		 */
+		spin_unlock(&cpu_base->lock);
+		restart = fn(timer);
+		spin_lock(&cpu_base->lock);
+	} else
+		restart = fn(timer);
+
+	/*
+	 * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid
+	 * reprogramming of the event hardware. This happens at the end of this
+	 * function anyway.
+	 */
+	if (restart != HRTIMER_NORESTART) {
+		BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
+		enqueue_hrtimer(timer, base, 0);
+	}
+	timer->state &= ~HRTIMER_STATE_CALLBACK;
+}
+
 #ifdef CONFIG_HIGH_RES_TIMERS
 
 /*
@@ -1063,9 +1142,7 @@ void hrtimer_interrupt(struct clock_even
 		basenow = ktime_add(now, base->offset);
 
 		while ((node = base->first)) {
-			enum hrtimer_restart (*fn)(struct hrtimer *);
 			struct hrtimer *timer;
-			int restart;
 
 			timer = rb_entry(node, struct hrtimer, node);
 
@@ -1089,37 +1166,7 @@ void hrtimer_interrupt(struct clock_even
 				continue;
 			}
 
-			__remove_hrtimer(timer, base,
-					 HRTIMER_STATE_CALLBACK, 0);
-			timer_stats_account_hrtimer(timer);
-
-			fn = timer->function;
-			if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
-				/*
-				 * Used for scheduler timers, avoid lock
-				 * inversion with rq->lock and tasklist_lock.
-				 *
-				 * These timers are required to deal with
-				 * enqueue expiry themselves and are not
-				 * allowed to migrate.
-				 */
-				spin_unlock(&cpu_base->lock);
-				restart = fn(timer);
-				spin_lock(&cpu_base->lock);
-			} else
-				restart = fn(timer);
-
-			/*
-			 * Note: We clear the CALLBACK bit after
-			 * enqueue_hrtimer to avoid reprogramming of
-			 * the event hardware. This happens at the end
-			 * of this function anyway.
-			 */
-			if (restart != HRTIMER_NORESTART) {
-				BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
-				enqueue_hrtimer(timer, base, 0);
-			}
-			timer->state &= ~HRTIMER_STATE_CALLBACK;
+			__run_hrtimer(timer);
 		}
 		spin_unlock(&cpu_base->lock);
 		base++;
@@ -1140,52 +1187,41 @@ void hrtimer_interrupt(struct clock_even
 
 static void run_hrtimer_softirq(struct softirq_action *h)
 {
-	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
-
-	spin_lock_irq(&cpu_base->lock);
-
-	while (!list_empty(&cpu_base->cb_pending)) {
-		enum hrtimer_restart (*fn)(struct hrtimer *);
-		struct hrtimer *timer;
-		int restart;
-
-		timer = list_entry(cpu_base->cb_pending.next,
-				   struct hrtimer, cb_entry);
+	run_hrtimer_pending(&__get_cpu_var(hrtimer_bases));
+}
 
-		timer_stats_account_hrtimer(timer);
+#endif	/* CONFIG_HIGH_RES_TIMERS */
 
-		fn = timer->function;
-		__remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
-		spin_unlock_irq(&cpu_base->lock);
+/*
+ * Called from timer softirq every jiffy, expire hrtimers:
+ *
+ * For HRT its the fall back code to run the softirq in the timer
+ * softirq context in case the hrtimer initialization failed or has
+ * not been done yet.
+ */
+void hrtimer_run_pending(void)
+{
+	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
 
-		restart = fn(timer);
+	if (hrtimer_hres_active())
+		return;
 
-		spin_lock_irq(&cpu_base->lock);
+	/*
+	 * This _is_ ugly: We have to check in the softirq context,
+	 * whether we can switch to highres and / or nohz mode. The
+	 * clocksource switch happens in the timer interrupt with
+	 * xtime_lock held. Notification from there only sets the
+	 * check bit in the tick_oneshot code, otherwise we might
+	 * deadlock vs. xtime_lock.
+	 */
+	if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
+		hrtimer_switch_to_hres();
 
-		timer->state &= ~HRTIMER_STATE_CALLBACK;
-		if (restart == HRTIMER_RESTART) {
-			BUG_ON(hrtimer_active(timer));
-			/*
-			 * Enqueue the timer, allow reprogramming of the event
-			 * device
-			 */
-			enqueue_hrtimer(timer, timer->base, 1);
-		} else if (hrtimer_active(timer)) {
-			/*
-			 * If the timer was rearmed on another CPU, reprogram
-			 * the event device.
-			 */
-			if (timer->base->first == &timer->node)
-				hrtimer_reprogram(timer, timer->base);
-		}
-	}
-	spin_unlock_irq(&cpu_base->lock);
+	run_hrtimer_pending(cpu_base);
 }
 
-#endif	/* CONFIG_HIGH_RES_TIMERS */
-
 /*
- * Expire the per base hrtimer-queue:
+ * Called from hardirq context every jiffy
  */
 static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base,
 				     int index)
@@ -1203,42 +1239,23 @@ static inline void run_hrtimer_queue(str
 
 	while ((node = base->first)) {
 		struct hrtimer *timer;
-		enum hrtimer_restart (*fn)(struct hrtimer *);
-		int restart;
 
 		timer = rb_entry(node, struct hrtimer, node);
 		if (base->softirq_time.tv64 <= timer->expires.tv64)
 			break;
 
-#ifdef CONFIG_HIGH_RES_TIMERS
-		WARN_ON_ONCE(timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ);
-#endif
-		timer_stats_account_hrtimer(timer);
-
-		fn = timer->function;
-		__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
-		spin_unlock_irq(&cpu_base->lock);
-
-		restart = fn(timer);
-
-		spin_lock_irq(&cpu_base->lock);
-
-		timer->state &= ~HRTIMER_STATE_CALLBACK;
-		if (restart != HRTIMER_NORESTART) {
-			BUG_ON(hrtimer_active(timer));
-			enqueue_hrtimer(timer, base, 0);
+		if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
+			__remove_hrtimer(timer, base, HRTIMER_STATE_PENDING, 0);
+			list_add_tail(&timer->cb_entry,
+					&base->cpu_base->cb_pending);
+			continue;
 		}
+
+		__run_hrtimer(timer);
 	}
 	spin_unlock_irq(&cpu_base->lock);
 }
 
-/*
- * Called from timer softirq every jiffy, expire hrtimers:
- *
- * For HRT its the fall back code to run the softirq in the timer
- * softirq context in case the hrtimer initialization failed or has
- * not been done yet.
- */
 void hrtimer_run_queues(void)
 {
 	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
@@ -1247,18 +1264,6 @@ void hrtimer_run_queues(void)
 	if (hrtimer_hres_active())
 		return;
 
-	/*
-	 * This _is_ ugly: We have to check in the softirq context,
-	 * whether we can switch to highres and / or nohz mode. The
-	 * clocksource switch happens in the timer interrupt with
-	 * xtime_lock held. Notification from there only sets the
-	 * check bit in the tick_oneshot code, otherwise we might
-	 * deadlock vs. xtime_lock.
-	 */
-	if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
-		if (hrtimer_switch_to_hres())
-			return;
-
 	hrtimer_get_softirq_time(cpu_base);
 
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
Index: linux-2.6/kernel/timer.c
===================================================================
--- linux-2.6.orig/kernel/timer.c
+++ linux-2.6/kernel/timer.c
@@ -896,7 +896,7 @@ static void run_timer_softirq(struct sof
 {
 	tvec_base_t *base = __get_cpu_var(tvec_bases);
 
-	hrtimer_run_queues();
+	hrtimer_run_pending();
 
 	if (time_after_eq(jiffies, base->timer_jiffies))
 		__run_timers(base);
@@ -907,6 +907,7 @@ static void run_timer_softirq(struct sof
  */
 void run_local_timers(void)
 {
+	hrtimer_run_queues();
 	raise_softirq(TIMER_SOFTIRQ);
 	softlockup_tick();
 }
Index: linux-2.6/include/linux/hrtimer.h
===================================================================
--- linux-2.6.orig/include/linux/hrtimer.h
+++ linux-2.6/include/linux/hrtimer.h
@@ -319,6 +319,7 @@ extern void hrtimer_init_sleeper(struct 
 
 /* Soft interrupt function to run the hrtimer queues: */
 extern void hrtimer_run_queues(void);
+extern void hrtimer_run_pending(void);
 
 /* Bootup initialization: */
 extern void __init hrtimers_init(void);

--


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 5/7] sched: rt-group: reduce rescheduling
  2008-01-04 13:54 [PATCH 0/7] more rt group sched updates Peter Zijlstra
                   ` (3 preceding siblings ...)
  2008-01-04 13:55 ` [PATCH 4/7] hrtimer: fixup the HRTIMER_CB_IRQSAFE_NO_SOFTIRQ fallback Peter Zijlstra
@ 2008-01-04 13:55 ` Peter Zijlstra
  2008-01-04 13:55 ` [PATCH 6/7] sched: rt-group: per group period Peter Zijlstra
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 11+ messages in thread
From: Peter Zijlstra @ 2008-01-04 13:55 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner

[-- Attachment #1: sched-rt-group-fix-enqueue.patch --]
[-- Type: text/plain, Size: 748 bytes --]

Only reschedule if the new group has a higher prio task.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/sched_rt.c |    5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -94,8 +94,11 @@ static void sched_rt_ratio_enqueue(struc
 	struct sched_rt_entity *rt_se = rt_rq->rt_se;
 
 	if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) {
+		struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
+
 		enqueue_rt_entity(rt_se);
-		resched_task(rq_of_rt_rq(rt_rq)->curr);
+		if (rt_rq->highest_prio < curr->prio)
+			resched_task(curr);
 	}
 }
 

--


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 6/7] sched: rt-group: per group period
  2008-01-04 13:54 [PATCH 0/7] more rt group sched updates Peter Zijlstra
                   ` (4 preceding siblings ...)
  2008-01-04 13:55 ` [PATCH 5/7] sched: rt-group: reduce rescheduling Peter Zijlstra
@ 2008-01-04 13:55 ` Peter Zijlstra
  2008-01-05 14:51   ` Peter Zijlstra
  2008-01-04 13:55 ` [PATCH 7/7] sched: rt-group: deal with PI Peter Zijlstra
  2008-01-05 13:32 ` [PATCH 0/7] more rt group sched updates Ingo Molnar
  7 siblings, 1 reply; 11+ messages in thread
From: Peter Zijlstra @ 2008-01-04 13:55 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner

[-- Attachment #1: sched-rt-rq-hrtimer.patch --]
[-- Type: text/plain, Size: 14318 bytes --]

Steven asked for per group periods in order to get closer to RMA or EDF
scheduling.

Use the fancy new hrtimers to provide a per group period

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/sched.h    |    2 
 kernel/sched.c           |  225 +++++++++++++++++++++++++++++++++++++++++------
 kernel/sched_rt.c        |   61 ++++++------
 kernel/sysctl.c          |    2 
 kernel/time/tick-sched.c |    5 -
 5 files changed, 232 insertions(+), 63 deletions(-)

Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -230,8 +230,6 @@ static inline int select_nohz_load_balan
 }
 #endif
 
-extern unsigned long rt_needs_cpu(int cpu);
-
 /*
  * Only dump TASK_* tasks. (0 for all tasks)
  */
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -177,6 +177,7 @@ struct task_group {
 	struct rt_rq **rt_rq;
 
 	unsigned int rt_ratio;
+	ktime_t rt_period;
 
 	/*
 	 * shares assigned to a task group governs how much of cpu bandwidth
@@ -372,6 +373,7 @@ struct rt_rq {
 #endif
 	int rt_throttled;
 	u64 rt_time;
+	struct hrtimer rt_period_timer;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	struct rq *rq;
@@ -441,8 +443,6 @@ struct rq {
 
 	struct cfs_rq cfs;
 	struct rt_rq rt;
-	u64 rt_period_expire;
-	int rt_throttled;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* list of leaf cfs_rq on this cpu: */
@@ -595,23 +595,6 @@ static void update_rq_clock(struct rq *r
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 
-unsigned long rt_needs_cpu(int cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-	u64 delta;
-
-	if (!rq->rt_throttled)
-		return 0;
-
-	if (rq->clock > rq->rt_period_expire)
-		return 1;
-
-	delta = rq->rt_period_expire - rq->clock;
-	do_div(delta, NSEC_PER_SEC / HZ);
-
-	return (unsigned long)delta;
-}
-
 /*
  * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
  */
@@ -652,10 +635,10 @@ const_debug unsigned int sysctl_sched_fe
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
 
 /*
- * period over which we measure -rt task cpu usage in ms.
+ * period over which we measure -rt task cpu usage in us.
  * default: 1s
  */
-const_debug unsigned int sysctl_sched_rt_period = 1000;
+const_debug unsigned int sysctl_sched_rt_period = 1000000;
 
 #define SCHED_RT_FRAC_SHIFT	16
 #define SCHED_RT_FRAC		(1UL << SCHED_RT_FRAC_SHIFT)
@@ -664,7 +647,7 @@ const_debug unsigned int sysctl_sched_rt
  * ratio of time -rt tasks may consume.
  * default: 95%
  */
-const_debug unsigned int sysctl_sched_rt_ratio = 62259;
+const_debug unsigned int sysctl_sched_rt_ratio = 32768; //62259;
 
 /*
  * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
@@ -1245,6 +1228,12 @@ static unsigned long cpu_avg_load_per_ta
 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 #endif /* CONFIG_SMP */
 
+static inline ktime_t ns_to_ktime(u64 ns)
+{
+	static const ktime_t ktime_zero = { .tv64 = 0 };
+	return ktime_add_ns(ktime_zero, ns);
+}
+
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -3741,7 +3730,6 @@ void scheduler_tick(void)
 	rq->tick_timestamp = rq->clock;
 	update_cpu_load(rq);
 	curr->sched_class->task_tick(rq, curr, 0);
-	update_sched_rt_period(rq);
 	spin_unlock(&rq->lock);
 
 #ifdef CONFIG_SMP
@@ -5287,6 +5275,152 @@ static inline void sched_init_granularit
 	sysctl_sched_batch_wakeup_granularity *= factor;
 }
 
+static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
+{
+	struct rt_rq *rt_rq =
+		container_of(timer, struct rt_rq, rt_period_timer);
+	struct rq *rq = rq_of_rt_rq(rt_rq);
+	ktime_t now = ktime_get();
+
+	WARN_ON(smp_processor_id() != cpu_of(rq));
+	WARN_ON(!in_irq());
+
+	spin_lock(&rq->lock);
+	update_sched_rt_period(rt_rq);
+	spin_unlock(&rq->lock);
+
+	hrtimer_forward(timer, now, sched_rt_period(rt_rq));
+	return HRTIMER_RESTART;
+}
+
+static void sched_rt_period_start(struct rt_rq *rt_rq)
+{
+	ktime_t period = sched_rt_period(rt_rq);
+
+	WARN_ON(smp_processor_id() != cpu_of(rq_of_rt_rq(rt_rq)));
+
+	for (;;) {
+		ktime_t now = ktime_get();
+		hrtimer_forward(&rt_rq->rt_period_timer, now, period);
+		hrtimer_start(&rt_rq->rt_period_timer,
+				rt_rq->rt_period_timer.expires,
+				HRTIMER_MODE_ABS);
+		if (hrtimer_active(&rt_rq->rt_period_timer))
+			break;
+	}
+}
+
+static void sched_rt_period_stop(struct rt_rq *rt_rq)
+{
+	hrtimer_cancel(&rt_rq->rt_period_timer);
+}
+
+static void sched_rt_period_start_cpu(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	struct rt_rq *rt_rq;
+
+	for_each_leaf_rt_rq(rt_rq, rq)
+		sched_rt_period_start(rt_rq);
+}
+
+#ifdef CONFIG_SMP
+static void sched_rt_period_stop_cpu(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	struct rt_rq *rt_rq;
+
+	for_each_leaf_rt_rq(rt_rq, rq)
+		sched_rt_period_stop(rt_rq);
+}
+
+static int sched_rt_period_hotplug(struct notifier_block *nfb,
+		unsigned long action, void *hcpu)
+{
+	int cpu = (unsigned long)hcpu;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+	case CPU_DOWN_FAILED:
+	case CPU_DOWN_FAILED_FROZEN:
+		sched_rt_period_start_cpu(cpu);
+		return NOTIFY_OK;
+
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+		sched_rt_period_stop_cpu(cpu);
+		return NOTIFY_OK;
+
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		return NOTIFY_OK;
+
+	default:
+		return NOTIFY_DONE;
+	}
+
+	return NOTIFY_OK;
+}
+
+static void __init __sched_rt_period_init(void *arg)
+{
+	int cpu = smp_processor_id();
+	sched_rt_period_start_cpu(cpu);
+}
+
+static void __init sched_rt_period_init(void)
+{
+	on_each_cpu(__sched_rt_period_init, NULL, 0, 1);
+	hotcpu_notifier(sched_rt_period_hotplug, 0);
+}
+
+static void __sched_rt_period_init_tg(void *arg)
+{
+	struct task_group *tg = arg;
+	int cpu = smp_processor_id();
+
+	sched_rt_period_start(tg->rt_rq[cpu]);
+}
+
+static void sched_rt_period_init_tg(struct task_group *tg)
+{
+	on_each_cpu(__sched_rt_period_init_tg, tg, 0, 1);
+}
+
+static void __sched_rt_period_destroy_tg(void *arg)
+{
+	struct task_group *tg = arg;
+	int cpu = smp_processor_id();
+
+	sched_rt_period_stop(tg->rt_rq[cpu]);
+}
+
+static void sched_rt_period_destroy_tg(struct task_group *tg)
+{
+	on_each_cpu(__sched_rt_period_destroy_tg, tg, 0, 1);
+}
+#else
+static void __init sched_rt_period_init(void)
+{
+	sched_rt_period_start_cpu(0);
+}
+
+static void sched_rt_period_init_tg(struct task_group *tg)
+{
+	sched_rt_period_start(tg->rt_rq[0]);
+}
+
+static void sched_rt_period_destroy_tg(struct task_group *tg)
+{
+	sched_rt_period_stop(tg->rt_rq[0]);
+}
+#endif
+
 #ifdef CONFIG_SMP
 /*
  * This is how migration works:
@@ -7068,6 +7202,7 @@ void __init sched_init_smp(void)
 	if (set_cpus_allowed(current, non_isolated_cpus) < 0)
 		BUG();
 	sched_init_granularity();
+	sched_rt_period_init();
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	if (nr_cpu_ids == 1)
@@ -7088,6 +7223,7 @@ void __init sched_init_smp(void)
 void __init sched_init_smp(void)
 {
 	sched_init_granularity();
+	sched_rt_period_init();
 }
 #endif /* CONFIG_SMP */
 
@@ -7131,6 +7267,11 @@ static void init_rt_rq(struct rt_rq *rt_
 	rt_rq->rt_time = 0;
 	rt_rq->rt_throttled = 0;
 
+	hrtimer_init(&rt_rq->rt_period_timer,
+			CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	rt_rq->rt_period_timer.function = sched_rt_period_timer;
+	rt_rq->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	rt_rq->rq = rq;
 #endif
@@ -7201,6 +7342,8 @@ void __init sched_init(void)
 				&per_cpu(init_sched_entity, i), i, 1);
 
 		init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */
+		init_task_group.rt_period =
+			ns_to_ktime(sysctl_sched_rt_period * NSEC_PER_USEC);
 		INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
 		init_tg_rt_entry(rq, &init_task_group,
 				&per_cpu(init_rt_rq, i),
@@ -7208,8 +7351,6 @@ void __init sched_init(void)
 
 		list_add(&init_task_group.list, &task_groups);
 #endif
-		rq->rt_period_expire = 0;
-		rq->rt_throttled = 0;
 
 		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
 			rq->cpu_load[j] = 0;
@@ -7598,6 +7739,7 @@ struct task_group *sched_create_group(vo
 
 	tg->shares = NICE_0_LOAD;
 	tg->rt_ratio = 0; /* XXX */
+	tg->rt_period = ns_to_ktime(sysctl_sched_rt_period * NSEC_PER_USEC);
 
 	for_each_possible_cpu(i) {
 		rq = cpu_rq(i);
@@ -7637,6 +7779,8 @@ struct task_group *sched_create_group(vo
 	list_add_rcu(&tg->list, &task_groups);
 	unlock_task_group_list();
 
+	sched_rt_period_init_tg(tg);
+
 	return tg;
 
 err:
@@ -7658,6 +7802,8 @@ void sched_destroy_group(struct task_gro
 	struct rt_rq *rt_rq = NULL;
 	int i;
 
+	sched_rt_period_destroy_tg(tg);
+
 	lock_task_group_list();
 	for_each_possible_cpu(i) {
 		cfs_rq = tg->cfs_rq[i];
@@ -7815,6 +7961,19 @@ unsigned long sched_group_rt_ratio(struc
 	return tg->rt_ratio;
 }
 
+int sched_group_set_rt_period(struct task_group *tg, unsigned long rt_period)
+{
+	tg->rt_period = ns_to_ktime((u64)rt_period * NSEC_PER_USEC);
+	return 0;
+}
+
+unsigned long sched_group_rt_period(struct task_group *tg)
+{
+	u64 ns = ktime_to_ns(tg->rt_period);
+	do_div(ns, NSEC_PER_USEC);
+	return ns;
+}
+
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_FAIR_CGROUP_SCHED
@@ -7903,6 +8062,17 @@ static u64 cpu_rt_ratio_read_uint(struct
 	return (u64) tg->rt_ratio;
 }
 
+static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
+		u64 rt_period_val)
+{
+	return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_val);
+}
+
+static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
+{
+	return (u64) sched_group_rt_period(cgroup_tg(cgrp));
+}
+
 static struct cftype cpu_files[] = {
 	{
 		.name = "shares",
@@ -7914,6 +8084,11 @@ static struct cftype cpu_files[] = {
 		.read_uint = cpu_rt_ratio_read_uint,
 		.write_uint = cpu_rt_ratio_write_uint,
 	},
+	{
+		.name = "rt_period_us",
+		.read_uint = cpu_rt_period_read_uint,
+		.write_uint = cpu_rt_period_write_uint,
+	},
 };
 
 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -65,6 +65,17 @@ static inline unsigned int sched_rt_rati
 	return rt_rq->tg->rt_ratio;
 }
 
+static inline ktime_t sched_rt_period(struct rt_rq *rt_rq)
+{
+	BUG_ON(!rt_rq->tg);
+	return rt_rq->tg->rt_period;
+}
+
+static inline u64 sched_rt_period_ns(struct rt_rq *rt_rq)
+{
+	return ktime_to_ns(sched_rt_period(rt_rq));
+}
+
 #define for_each_leaf_rt_rq(rt_rq, rq) \
 	list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
 
@@ -117,6 +128,16 @@ static inline unsigned int sched_rt_rati
 	return sysctl_sched_rt_ratio;
 }
 
+static inline ktime_t sched_rt_period(struct rt_rq *rt_rq)
+{
+	return ns_to_ktime((u64)sysctl_sched_rt_period * NSEC_PER_USEC);
+}
+
+static inline u64 sched_rt_period_ns(struct rt_rq *rt_rq)
+{
+	return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
+}
+
 #define for_each_leaf_rt_rq(rt_rq, rq) \
 	for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
 
@@ -174,15 +195,11 @@ static int sched_rt_ratio_exceeded(struc
 	if (rt_rq->rt_throttled)
 		return 1;
 
-	period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
+	period = sched_rt_period_ns(rt_rq);
 	ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
 
 	if (rt_rq->rt_time > ratio) {
-		struct rq *rq = rq_of_rt_rq(rt_rq);
-
-		rq->rt_throttled = 1;
 		rt_rq->rt_throttled = 1;
-
 		sched_rt_ratio_dequeue(rt_rq);
 		return 1;
 	}
@@ -190,27 +207,16 @@ static int sched_rt_ratio_exceeded(struc
 	return 0;
 }
 
-static void update_sched_rt_period(struct rq *rq)
+static void update_sched_rt_period(struct rt_rq *rt_rq)
 {
-	struct rt_rq *rt_rq;
-	u64 period;
-
-	while (rq->clock > rq->rt_period_expire) {
-		period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
-		rq->rt_period_expire += period;
-
-		for_each_leaf_rt_rq(rt_rq, rq) {
-			unsigned long rt_ratio = sched_rt_ratio(rt_rq);
-			u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
-
-			rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
-			if (rt_rq->rt_throttled) {
-				rt_rq->rt_throttled = 0;
-				sched_rt_ratio_enqueue(rt_rq);
-			}
-		}
-
-		rq->rt_throttled = 0;
+	u64 period = sched_rt_period_ns(rt_rq);
+	unsigned long rt_ratio = sched_rt_ratio(rt_rq);
+	u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
+
+	rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
+	if (rt_rq->rt_throttled) {
+		rt_rq->rt_throttled = 0;
+		sched_rt_ratio_enqueue(rt_rq);
 	}
 }
 
@@ -238,11 +244,6 @@ static void update_curr_rt(struct rq *rq
 	cpuacct_charge(curr, delta_exec);
 
 	rt_rq->rt_time += delta_exec;
-	/*
-	 * might make it a tad more accurate:
-	 *
-	 * update_sched_rt_period(rq);
-	 */
 	if (sched_rt_ratio_exceeded(rt_rq))
 		resched_task(curr);
 }
Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -311,7 +311,7 @@ static struct ctl_table kern_table[] = {
 	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "sched_rt_period_ms",
+		.procname	= "sched_rt_period_us",
 		.data		= &sysctl_sched_rt_period,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
Index: linux-2.6/kernel/time/tick-sched.c
===================================================================
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -153,7 +153,6 @@ void tick_nohz_update_jiffies(void)
 void tick_nohz_stop_sched_tick(void)
 {
 	unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
-	unsigned long rt_jiffies;
 	struct tick_sched *ts;
 	ktime_t last_update, expires, now, delta;
 	struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
@@ -217,10 +216,6 @@ void tick_nohz_stop_sched_tick(void)
 	next_jiffies = get_next_timer_interrupt(last_jiffies);
 	delta_jiffies = next_jiffies - last_jiffies;
 
-	rt_jiffies = rt_needs_cpu(cpu);
-	if (rt_jiffies && rt_jiffies < delta_jiffies)
-		delta_jiffies = rt_jiffies;
-
 	if (rcu_needs_cpu(cpu))
 		delta_jiffies = 1;
 	/*

--


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 7/7] sched: rt-group: deal with PI
  2008-01-04 13:54 [PATCH 0/7] more rt group sched updates Peter Zijlstra
                   ` (5 preceding siblings ...)
  2008-01-04 13:55 ` [PATCH 6/7] sched: rt-group: per group period Peter Zijlstra
@ 2008-01-04 13:55 ` Peter Zijlstra
  2008-01-05 13:32 ` [PATCH 0/7] more rt group sched updates Ingo Molnar
  7 siblings, 0 replies; 11+ messages in thread
From: Peter Zijlstra @ 2008-01-04 13:55 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner

[-- Attachment #1: sched-rt-group-pi.patch --]
[-- Type: text/plain, Size: 4082 bytes --]

Steven mentioned the fun case where a lock holding task will be throttled.

Simple fix: allow groups that have boosted tasks to run anyway.
This is ofcourse not quite correct. Needs more tricks.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/sched.c    |    3 +++
 kernel/sched_rt.c |   50 ++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 45 insertions(+), 8 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -376,6 +376,8 @@ struct rt_rq {
 	struct hrtimer rt_period_timer;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	unsigned long rt_nr_boosted;
+
 	struct rq *rq;
 	struct list_head leaf_rt_rq_list;
 	struct task_group *tg;
@@ -7273,6 +7275,7 @@ static void init_rt_rq(struct rt_rq *rt_
 	rt_rq->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	rt_rq->rt_nr_boosted = 0;
 	rt_rq->rq = rq;
 #endif
 }
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -121,6 +121,11 @@ static void sched_rt_ratio_dequeue(struc
 		dequeue_rt_entity(rt_se);
 }
 
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+	return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
+}
+
 #else
 
 static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
@@ -170,6 +175,10 @@ static inline void sched_rt_ratio_dequeu
 {
 }
 
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+	return rt_rq->rt_throttled;
+}
 #endif
 
 static inline int rt_se_prio(struct sched_rt_entity *rt_se)
@@ -184,27 +193,42 @@ static inline int rt_se_prio(struct sche
 	return rt_task_of(rt_se)->prio;
 }
 
+static int rt_se_boosted(struct sched_rt_entity *rt_se)
+{
+	struct task_struct *p;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	struct rt_rq *rt_rq = group_rt_rq(rt_se);
+
+	if (rt_rq)
+		return !!rt_rq->rt_nr_boosted;
+#endif
+
+	p = rt_task_of(rt_se);
+	return p->prio != p->normal_prio;
+}
+
 static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq)
 {
 	unsigned int rt_ratio = sched_rt_ratio(rt_rq);
 	u64 period, ratio;
 
 	if (rt_ratio == SCHED_RT_FRAC)
-		return 0;
+		goto out;
 
 	if (rt_rq->rt_throttled)
-		return 1;
+		goto out;
 
 	period = sched_rt_period_ns(rt_rq);
 	ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
 
 	if (rt_rq->rt_time > ratio) {
 		rt_rq->rt_throttled = 1;
-		sched_rt_ratio_dequeue(rt_rq);
-		return 1;
+		if (rt_rq_throttled(rt_rq))
+			sched_rt_ratio_dequeue(rt_rq);
 	}
 
-	return 0;
+out:
+	return rt_rq_throttled(rt_rq);
 }
 
 static void update_sched_rt_period(struct rt_rq *rt_rq)
@@ -265,6 +289,10 @@ void inc_rt_tasks(struct sched_rt_entity
 
 	update_rt_migration(rq_of_rt_rq(rt_rq));
 #endif
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	if (rt_se_boosted(rt_se))
+		rt_rq->rt_nr_boosted++;
+#endif
 }
 
 static inline
@@ -295,6 +323,12 @@ void dec_rt_tasks(struct sched_rt_entity
 
 	update_rt_migration(rq_of_rt_rq(rt_rq));
 #endif /* CONFIG_SMP */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	if (rt_se_boosted(rt_se))
+		rt_rq->rt_nr_boosted--;
+
+	WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
+#endif
 }
 
 static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
@@ -303,7 +337,7 @@ static void enqueue_rt_entity(struct sch
 	struct rt_prio_array *array = &rt_rq->active;
 	struct rt_rq *group_rq = group_rt_rq(rt_se);
 
-	if (group_rq && group_rq->rt_throttled)
+	if (group_rq && rt_rq_throttled(group_rq))
 		return;
 
 	list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
@@ -476,7 +510,7 @@ static struct sched_rt_entity *pick_next
 	struct list_head *queue;
 	int idx;
 
-	if (sched_rt_ratio_exceeded(rt_rq))
+	if (rt_rq_throttled(rt_rq))
 		goto out;
 
 	idx = sched_find_first_bit(array->bitmap);
@@ -500,7 +534,7 @@ static struct task_struct *pick_next_tas
 	if (unlikely(!rt_rq->rt_nr_running))
 		return NULL;
 
-	if (sched_rt_ratio_exceeded(rt_rq))
+	if (rt_rq_throttled(rt_rq))
 		return NULL;
 
 	do {

--


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 0/7] more rt group sched updates
  2008-01-04 13:54 [PATCH 0/7] more rt group sched updates Peter Zijlstra
                   ` (6 preceding siblings ...)
  2008-01-04 13:55 ` [PATCH 7/7] sched: rt-group: deal with PI Peter Zijlstra
@ 2008-01-05 13:32 ` Ingo Molnar
  7 siblings, 0 replies; 11+ messages in thread
From: Ingo Molnar @ 2008-01-05 13:32 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: LKML, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Thomas Gleixner


* Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> includes the two patches from yesterday,
> 
> series against sched-devel

thanks, applied.

	Ingo

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 6/7] sched: rt-group: per group period
  2008-01-04 13:55 ` [PATCH 6/7] sched: rt-group: per group period Peter Zijlstra
@ 2008-01-05 14:51   ` Peter Zijlstra
  2008-01-05 15:05     ` Ingo Molnar
  0 siblings, 1 reply; 11+ messages in thread
From: Peter Zijlstra @ 2008-01-05 14:51 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Thomas Gleixner

Could you please fold this into the 6/7 patch.

It reverts a wandering chunk (the 32768 thing), but more importantly
it fixes !FAIR_GROUP_SCHED compilation.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/sched.c |   10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -647,7 +647,7 @@ const_debug unsigned int sysctl_sched_rt
  * ratio of time -rt tasks may consume.
  * default: 95%
  */
-const_debug unsigned int sysctl_sched_rt_ratio = 32768; //62259;
+const_debug unsigned int sysctl_sched_rt_ratio = 62259;
 
 /*
  * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
@@ -5379,6 +5379,7 @@ static void __init sched_rt_period_init(
 	hotcpu_notifier(sched_rt_period_hotplug, 0);
 }
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 static void __sched_rt_period_init_tg(void *arg)
 {
 	struct task_group *tg = arg;
@@ -5404,12 +5405,14 @@ static void sched_rt_period_destroy_tg(s
 {
 	on_each_cpu(__sched_rt_period_destroy_tg, tg, 0, 1);
 }
-#else
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+#else /* CONFIG_SMP */
 static void __init sched_rt_period_init(void)
 {
 	sched_rt_period_start_cpu(0);
 }
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 static void sched_rt_period_init_tg(struct task_group *tg)
 {
 	sched_rt_period_start(tg->rt_rq[0]);
@@ -5419,7 +5422,8 @@ static void sched_rt_period_destroy_tg(s
 {
 	sched_rt_period_stop(tg->rt_rq[0]);
 }
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* CONFIG_SMP */
 
 #ifdef CONFIG_SMP
 /*



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 6/7] sched: rt-group: per group period
  2008-01-05 14:51   ` Peter Zijlstra
@ 2008-01-05 15:05     ` Ingo Molnar
  0 siblings, 0 replies; 11+ messages in thread
From: Ingo Molnar @ 2008-01-05 15:05 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: LKML, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Thomas Gleixner


* Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> Could you please fold this into the 6/7 patch.
> 
> It reverts a wandering chunk (the 32768 thing), but more importantly
> it fixes !FAIR_GROUP_SCHED compilation.

done. Btw., there's a new warning:

kernel/sched_rt.c:197: warning: 'rt_se_boosted' defined but not used

	Ingo

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2008-01-05 15:06 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-01-04 13:54 [PATCH 0/7] more rt group sched updates Peter Zijlstra
2008-01-04 13:54 ` [PATCH 1/7] sched: rt throttling vs no_hz Peter Zijlstra
2008-01-04 13:54 ` [PATCH 2/7] sched: load_balance_monitor rename Peter Zijlstra
2008-01-04 13:55 ` [PATCH 3/7] hrtimer: clean up cpu->base locking tricks Peter Zijlstra
2008-01-04 13:55 ` [PATCH 4/7] hrtimer: fixup the HRTIMER_CB_IRQSAFE_NO_SOFTIRQ fallback Peter Zijlstra
2008-01-04 13:55 ` [PATCH 5/7] sched: rt-group: reduce rescheduling Peter Zijlstra
2008-01-04 13:55 ` [PATCH 6/7] sched: rt-group: per group period Peter Zijlstra
2008-01-05 14:51   ` Peter Zijlstra
2008-01-05 15:05     ` Ingo Molnar
2008-01-04 13:55 ` [PATCH 7/7] sched: rt-group: deal with PI Peter Zijlstra
2008-01-05 13:32 ` [PATCH 0/7] more rt group sched updates Ingo Molnar

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.