* [PATCH 0/7] RT: RT-Overload/Sched enhancements (v2)
@ 2007-10-13 0:15 Gregory Haskins
2007-10-13 0:15 ` [PATCH 1/7] RT: Add a per-cpu rt_overload indication Gregory Haskins
` (6 more replies)
0 siblings, 7 replies; 18+ messages in thread
From: Gregory Haskins @ 2007-10-13 0:15 UTC (permalink / raw)
To: Steven Rostedt, Peter Zijlstra; +Cc: RT, Ingo Molnar, LKML, Gregory Haskins
This series applies to 2.6.23-rt1 + Steven Rostedt's last published "push-rt"
patch.
Changes since v1:
- Rebased to the final 23-rt1 from 23-rt1-pre1
- Rebased to Steve's last published patch
- Removed controversial "cpupri" algorithm (may revisit later, drop for now)
- Fixed a missing priority update in the set_user_nice() code
- Added experimental "phase 3" patch for limiting pulls to when necessary
Comments/suggestions/feedback are all welcome.
Regards,
-Greg
^ permalink raw reply [flat|nested] 18+ messages in thread
* [PATCH 1/7] RT: Add a per-cpu rt_overload indication
2007-10-13 0:15 [PATCH 0/7] RT: RT-Overload/Sched enhancements (v2) Gregory Haskins
@ 2007-10-13 0:15 ` Gregory Haskins
2007-10-15 17:42 ` Steven Rostedt
2007-10-13 0:15 ` [PATCH 2/7] RT: Wrap the RQ notion of priority to make it conditional Gregory Haskins
` (5 subsequent siblings)
6 siblings, 1 reply; 18+ messages in thread
From: Gregory Haskins @ 2007-10-13 0:15 UTC (permalink / raw)
To: Steven Rostedt, Peter Zijlstra; +Cc: RT, Ingo Molnar, LKML, Gregory Haskins
The system currently evaluates all online CPUs whenever one or more enters
an rt_overload condition. This suffers from scalability limitations as
the # of online CPUs increases. So we introduce a cpumask to track
exactly which CPUs need RT balancing.
Signed-off-by: Gregory Haskins <ghaskins@novell.com>
CC: Peter W. Morreale <pmorreale@novell.com>
---
kernel/sched.c | 12 +++++++++---
1 files changed, 9 insertions(+), 3 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 86ff36d..0a1ad0e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -632,6 +632,7 @@ static inline struct rq *this_rq_lock(void)
#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
static __cacheline_aligned_in_smp atomic_t rt_overload;
+static cpumask_t rto_cpus;
#endif
static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
@@ -640,8 +641,11 @@ static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
if (rt_task(p)) {
rq->rt_nr_running++;
# ifdef CONFIG_SMP
- if (rq->rt_nr_running == 2)
+ if (rq->rt_nr_running == 2) {
+ cpu_set(rq->cpu, rto_cpus);
+ smp_wmb();
atomic_inc(&rt_overload);
+ }
# endif
}
#endif
@@ -654,8 +658,10 @@ static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq)
WARN_ON(!rq->rt_nr_running);
rq->rt_nr_running--;
# ifdef CONFIG_SMP
- if (rq->rt_nr_running == 1)
+ if (rq->rt_nr_running == 1) {
atomic_dec(&rt_overload);
+ cpu_clear(rq->cpu, rto_cpus);
+ }
# endif
}
#endif
@@ -1624,7 +1630,7 @@ static void balance_rt_tasks(struct rq *this_rq, int this_cpu)
*/
next = pick_next_task(this_rq, this_rq->curr);
- for_each_online_cpu(cpu) {
+ for_each_cpu_mask(cpu, rto_cpus) {
if (cpu == this_cpu)
continue;
src_rq = cpu_rq(cpu);
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [PATCH 2/7] RT: Wrap the RQ notion of priority to make it conditional
2007-10-13 0:15 [PATCH 0/7] RT: RT-Overload/Sched enhancements (v2) Gregory Haskins
2007-10-13 0:15 ` [PATCH 1/7] RT: Add a per-cpu rt_overload indication Gregory Haskins
@ 2007-10-13 0:15 ` Gregory Haskins
2007-10-15 17:45 ` Steven Rostedt
2007-10-13 0:15 ` [PATCH 3/7] RT: Initialize the priority value Gregory Haskins
` (4 subsequent siblings)
6 siblings, 1 reply; 18+ messages in thread
From: Gregory Haskins @ 2007-10-13 0:15 UTC (permalink / raw)
To: Steven Rostedt, Peter Zijlstra; +Cc: RT, Ingo Molnar, LKML, Gregory Haskins
A little cleanup to avoid #ifdef proliferation later in the series
Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---
kernel/sched.c | 23 ++++++++++++++++++++---
1 files changed, 20 insertions(+), 3 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 0a1ad0e..c9afc8a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -255,6 +255,10 @@ struct cfs_rq {
#endif
};
+#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
+#define ENABLE_RQ_PRIORITY /* Steve wants this conditional on PREEMPT_RT */
+#endif /* CONFIG_PREEMPT_RT */
+
/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
struct rt_prio_array active;
@@ -304,6 +308,9 @@ struct rq {
#ifdef CONFIG_PREEMPT_RT
unsigned long rt_nr_running;
unsigned long rt_nr_uninterruptible;
+#endif
+
+#ifdef ENABLE_RQ_PRIORITY
int curr_prio;
#endif
@@ -365,6 +372,16 @@ struct rq {
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
static DEFINE_MUTEX(sched_hotcpu_mutex);
+#ifdef ENABLE_RQ_PRIORITY
+static inline void set_rq_prio(struct rq *rq, int prio)
+{
+ rq->curr_prio = prio;
+}
+
+#else
+#define set_rq_prio(rq, prio) do { } while(0)
+#endif
+
static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
{
rq->curr->sched_class->check_preempt_curr(rq, p);
@@ -2331,9 +2348,9 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
*/
prev_state = prev->state;
_finish_arch_switch(prev);
-#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
- rq->curr_prio = current->prio;
-#endif
+
+ set_rq_prio(rq, current->prio);
+
finish_lock_switch(rq, prev);
#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
/*
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [PATCH 3/7] RT: Initialize the priority value
2007-10-13 0:15 [PATCH 0/7] RT: RT-Overload/Sched enhancements (v2) Gregory Haskins
2007-10-13 0:15 ` [PATCH 1/7] RT: Add a per-cpu rt_overload indication Gregory Haskins
2007-10-13 0:15 ` [PATCH 2/7] RT: Wrap the RQ notion of priority to make it conditional Gregory Haskins
@ 2007-10-13 0:15 ` Gregory Haskins
2007-10-15 17:46 ` Steven Rostedt
2007-10-13 0:15 ` [PATCH 4/7] RT: Add support for updating push-rt priority under PI boost Gregory Haskins
` (3 subsequent siblings)
6 siblings, 1 reply; 18+ messages in thread
From: Gregory Haskins @ 2007-10-13 0:15 UTC (permalink / raw)
To: Steven Rostedt, Peter Zijlstra; +Cc: RT, Ingo Molnar, LKML, Gregory Haskins
Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---
kernel/sched.c | 2 ++
1 files changed, 2 insertions(+), 0 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index c9afc8a..50c88e8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7395,6 +7395,8 @@ void __init sched_init(void)
highest_cpu = i;
/* delimiter for bitsearch: */
__set_bit(MAX_RT_PRIO, array->bitmap);
+
+ set_rq_prio(rq, MAX_PRIO);
}
set_load_weight(&init_task);
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [PATCH 4/7] RT: Add support for updating push-rt priority under PI boost
2007-10-13 0:15 [PATCH 0/7] RT: RT-Overload/Sched enhancements (v2) Gregory Haskins
` (2 preceding siblings ...)
2007-10-13 0:15 ` [PATCH 3/7] RT: Initialize the priority value Gregory Haskins
@ 2007-10-13 0:15 ` Gregory Haskins
2007-10-15 17:47 ` Steven Rostedt
2007-10-13 0:15 ` [PATCH 5/7] RT: Add support for low-priority wake-up to push_rt feature Gregory Haskins
` (2 subsequent siblings)
6 siblings, 1 reply; 18+ messages in thread
From: Gregory Haskins @ 2007-10-13 0:15 UTC (permalink / raw)
To: Steven Rostedt, Peter Zijlstra; +Cc: RT, Ingo Molnar, LKML, Gregory Haskins
Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---
kernel/sched.c | 4 ++++
1 files changed, 4 insertions(+), 0 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 50c88e8..62f9f0b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4663,6 +4663,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
* this runqueue and our priority is higher than the current's
*/
if (task_running(rq, p)) {
+ set_rq_prio(rq, p->prio);
if (p->prio > oldprio)
resched_task(rq->curr);
} else {
@@ -4722,6 +4723,9 @@ void set_user_nice(struct task_struct *p, long nice)
*/
if (delta < 0 || (delta > 0 && task_running(rq, p)))
resched_task(rq->curr);
+
+ if (delta && task_running(rq, p))
+ set_rq_prio(rq, p->prio);
}
out_unlock:
task_rq_unlock(rq, &flags);
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [PATCH 5/7] RT: Add support for low-priority wake-up to push_rt feature
2007-10-13 0:15 [PATCH 0/7] RT: RT-Overload/Sched enhancements (v2) Gregory Haskins
` (3 preceding siblings ...)
2007-10-13 0:15 ` [PATCH 4/7] RT: Add support for updating push-rt priority under PI boost Gregory Haskins
@ 2007-10-13 0:15 ` Gregory Haskins
2007-10-13 9:46 ` Peter Zijlstra
2007-10-15 18:05 ` Steven Rostedt
2007-10-13 0:16 ` [PATCH 6/7] RT: Select tasks based on relative affinity Gregory Haskins
2007-10-13 0:16 ` [PATCH 7/7] RT: (RFC) Only try to pull tasks in if we are downgrading our priority Gregory Haskins
6 siblings, 2 replies; 18+ messages in thread
From: Gregory Haskins @ 2007-10-13 0:15 UTC (permalink / raw)
To: Steven Rostedt, Peter Zijlstra; +Cc: RT, Ingo Molnar, LKML, Gregory Haskins
There are three events that require consideration for redistributing RT
tasks:
1) When one or more higher-priority tasks preempts a lower-one from a
RQ
2) When a lower-priority task is woken up on a RQ
3) When a RQ downgrades its current priority
Steve Rostedt's push_rt patch addresses (1). It hooks in right after
a new task has been switched-in. If this was the result of an RT
preemption, or if more than one task was awoken at the same time, we
can try to push some of those other tasks away.
This patch addresses (2). When we wake up a task, we check to see
if it would preempt the current task on the queue. If it will not, we
attempt to find a better suited CPU (e.g. one running something lower
priority than the task being woken) and try to activate the task there.
Finally, we have (3). In theory, we only need to balance_rt_tasks() if
the following conditions are met:
1) One or more CPUs are in overload, AND
2) We are about to switch to a task that lowers our priority.
(3) will be addressed in a later patch.
Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---
kernel/sched.c | 68 ++++++++++++++++++++++++++++++++++----------------------
1 files changed, 41 insertions(+), 27 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 62f9f0b..3c71156 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1628,6 +1628,12 @@ out:
return ret;
}
+/* Push all tasks that we can to other CPUs */
+static void push_rt_tasks(struct rq *this_rq)
+{
+ while (push_rt_task(this_rq));
+}
+
/*
* Pull RT tasks from other CPUs in the RT-overload
* case. Interrupts are disabled, local rq is locked.
@@ -1988,7 +1994,33 @@ out_set_cpu:
this_cpu = smp_processor_id();
cpu = task_cpu(p);
}
-
+
+ /*
+ * If a newly woken up RT task cannot preempt the
+ * current (RT) task (on a target runqueue) then try
+ * to find another CPU it can preempt:
+ */
+ if (rt_task(p) && !TASK_PREEMPTS_CURR(p, rq)) {
+ cpumask_t cpu_mask;
+ cpus_and(cpu_mask, cpu_online_map, p->cpus_allowed);
+
+ new_cpu = find_lowest_cpu(&cpu_mask, p, rq);
+ if ((new_cpu != -1) && (new_cpu != cpu)) {
+ set_task_cpu(p, new_cpu);
+ spin_unlock(&rq->lock);
+
+ /* The new lock was already acquired in find_lowest */
+ rq = cpu_rq(new_cpu);
+ old_state = p->state;
+ if (!(old_state & state))
+ goto out;
+ if (p->se.on_rq)
+ goto out_running;
+
+ this_cpu = smp_processor_id();
+ cpu = task_cpu(p);
+ }
+ }
out_activate:
#endif /* CONFIG_SMP */
update_rq_clock(rq);
@@ -2002,30 +2034,13 @@ out_activate:
* to find another CPU it can preempt:
*/
if (rt_task(p) && !TASK_PREEMPTS_CURR(p, rq)) {
- struct rq *this_rq = cpu_rq(this_cpu);
/*
- * Special-case: the task on this CPU can be
- * preempted. In that case there's no need to
- * trigger reschedules on other CPUs, we can
- * mark the current task for reschedule.
- *
- * (Note that it's safe to access this_rq without
- * extra locking in this particular case, because
- * we are on the current CPU.)
+ * FIXME: Do we still need to do this here anymore, or
+ * does the preemption-check above suffice. The path
+ * that makes my head hurt is when we have the
+ * task_running->out_activate path
*/
- if (TASK_PREEMPTS_CURR(p, this_rq))
- set_tsk_need_resched(this_rq->curr);
- else
- /*
- * Neither the intended target runqueue
- * nor the current CPU can take this task.
- * Trigger a reschedule on all other CPUs
- * nevertheless, maybe one of them can take
- * this task:
- */
- smp_send_reschedule_allbutself_cpumask(p->cpus_allowed);
-
- schedstat_inc(this_rq, rto_wakeup);
+ push_rt_tasks(rq);
} else {
/*
* Sync wakeups (i.e. those types of wakeups where the waker
@@ -2360,13 +2375,12 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
* the lock was owned by prev, we need to release it
* first via finish_lock_switch and then reaquire it.
*/
- if (unlikely(rt_task(current))) {
+ if (unlikely(rq->rt_nr_running > 1)) {
spin_lock(&rq->lock);
- /* push_rt_task will return true if it moved an RT */
- while (push_rt_task(rq))
- ;
+ push_rt_tasks(rq);
spin_unlock(&rq->lock);
}
+
#endif
fire_sched_in_preempt_notifiers(current);
trace_stop_sched_switched(current);
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [PATCH 6/7] RT: Select tasks based on relative affinity
2007-10-13 0:15 [PATCH 0/7] RT: RT-Overload/Sched enhancements (v2) Gregory Haskins
` (4 preceding siblings ...)
2007-10-13 0:15 ` [PATCH 5/7] RT: Add support for low-priority wake-up to push_rt feature Gregory Haskins
@ 2007-10-13 0:16 ` Gregory Haskins
2007-10-15 20:08 ` Gregory Haskins
2007-10-13 0:16 ` [PATCH 7/7] RT: (RFC) Only try to pull tasks in if we are downgrading our priority Gregory Haskins
6 siblings, 1 reply; 18+ messages in thread
From: Gregory Haskins @ 2007-10-13 0:16 UTC (permalink / raw)
To: Steven Rostedt, Peter Zijlstra; +Cc: RT, Ingo Molnar, LKML, Gregory Haskins
In theory, tasks will be most efficient if they are allowed to re-wake to
the CPU that they last ran on due to cache affinity. Short of that, it is
cheaper to wake up the current CPU. If neither of those two are options,
then the lowest CPU will do.
Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---
kernel/sched.c | 72 +++++++++++++++++++++++++++++++++++++++++---------------
1 files changed, 53 insertions(+), 19 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 3c71156..b79b968 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1511,6 +1511,15 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
/* Only try this algorithm three times */
#define RT_PUSH_MAX_TRIES 3
+static int non_rt_cpu(cpumask_t *mask, int cpu)
+{
+ if (!rt_prio(cpu_rq(cpu)->curr_prio) &&
+ cpu_isset(cpu, *mask))
+ return 1;
+
+ return 0;
+}
+
/* Will lock the rq it finds */
static int find_lowest_cpu(cpumask_t *cpu_mask, struct task_struct *task,
struct rq *this_rq)
@@ -1519,32 +1528,57 @@ static int find_lowest_cpu(cpumask_t *cpu_mask, struct task_struct *task,
int dst_cpu = -1;
int cpu;
int tries;
+ int this_cpu = smp_processor_id();
for (tries = 0; tries < RT_PUSH_MAX_TRIES; tries++) {
+
/*
- * Scan each rq for the lowest prio.
+ * We select a CPU in the following priority:
+ *
+ * task_cpu, this_cpu, first_cpu
+ *
+ * for efficiency.
+ *
+ * - task_cpu preserves cache affinity
+ * - this_cpu is (presumably) cheaper to preempt
+ * (note that sometimes task_cpu and this_cpu
+ * are the same).
+ * - Finally, we will take whatever is available
+ * if the first two don't pan out by scanning.
*/
- for_each_cpu_mask(cpu, *cpu_mask) {
- struct rq *rq = &per_cpu(runqueues, cpu);
-
- if (cpu == smp_processor_id())
- continue;
-
- /* We look for lowest RT prio or non-rt CPU */
- if (rq->curr_prio >= MAX_RT_PRIO) {
- lowest_rq = rq;
- dst_cpu = cpu;
- break;
- }
+ if (non_rt_cpu(cpu_mask, task_cpu(task))) {
+ lowest_rq = task_rq(task);
+ dst_cpu = lowest_rq->cpu;
+ } else if (non_rt_cpu(cpu_mask, this_cpu)) {
+ dst_cpu = this_cpu;
+ lowest_rq = cpu_rq(this_cpu);
+ } else {
- /* no locking for now */
- if (rq->curr_prio > task->prio &&
- (!lowest_rq || rq->curr_prio < lowest_rq->curr_prio)) {
- dst_cpu = cpu;
- lowest_rq = rq;
+ /*
+ * Scan each rq for the lowest prio.
+ */
+ for_each_cpu_mask(cpu, *cpu_mask) {
+ struct rq *rq = &per_cpu(runqueues, cpu);
+
+ if (cpu == this_cpu)
+ continue;
+
+ /* We look for lowest RT prio or non-rt CPU */
+ if (rq->curr_prio >= MAX_RT_PRIO) {
+ lowest_rq = rq;
+ dst_cpu = cpu;
+ break;
+ }
+
+ /* no locking for now */
+ if (rq->curr_prio > task->prio &&
+ (!lowest_rq || rq->curr_prio < lowest_rq->curr_prio)) {
+ dst_cpu = cpu;
+ lowest_rq = rq;
+ }
}
}
-
+
if (!lowest_rq) {
dst_cpu = -1;
break;
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [PATCH 7/7] RT: (RFC) Only try to pull tasks in if we are downgrading our priority
2007-10-13 0:15 [PATCH 0/7] RT: RT-Overload/Sched enhancements (v2) Gregory Haskins
` (5 preceding siblings ...)
2007-10-13 0:16 ` [PATCH 6/7] RT: Select tasks based on relative affinity Gregory Haskins
@ 2007-10-13 0:16 ` Gregory Haskins
6 siblings, 0 replies; 18+ messages in thread
From: Gregory Haskins @ 2007-10-13 0:16 UTC (permalink / raw)
To: Steven Rostedt, Peter Zijlstra; +Cc: RT, Ingo Molnar, LKML, Gregory Haskins
The current code runs the balance_rt_tasks() on every _schedule()
once the system enters an overload state. Now that we have better
distribution on the push side, we can reduce the conditions that
require us to pull tasks. They are as follows:
At the time of a _schedule(), if our priority is staying the same or
going logically higher between _prev_ and _next_ we can skip trying
to balance tasks because any tasks that were runnable by our queue
have already been pushed to us. However, if our priority is
transitioning lower AND some CPUs out there are in overload, we
should check to see if we can pull some over.
Note: DO NOT RUN TEST WITH THIS PATCH APPLIED !
This patch is a concept only (for now) and is here for
demonstration/comment purposes only. The current series is still
racy with respect to acquiring the lowest RQ in the system. The
concept in this patch is predicated on raceless distribution to our
current priority level. So the current balancer is a nice big
safety net to cleanup any issues related to that. This patch
effectively takes that net away, so we need to be sure we have all
our ducks lined up before turning this one on. But once we do we can
eliminate the (fairly expensive) checks (e.g. rq double-locks, etc)
in a subset (hopefully significant #) of the calls to schedule(),
which sounds like a good optimization to me ;) We shall see if that
pans out.
Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---
kernel/sched.c | 16 ++++++++++++++--
1 files changed, 14 insertions(+), 2 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index b79b968..9e1f3ec 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4118,8 +4118,20 @@ asmlinkage void __sched __schedule(void)
sub_preempt_count(PREEMPT_ACTIVE);
#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
- if (unlikely(atomic_read(&rt_overload)))
- balance_rt_tasks(rq, cpu);
+ /*
+ * If we are switching away from an RT task while the system is in
+ * overload, we need to see if there is a way we can help a different
+ * cpu. If the next task to run is not RT, or if it *is* RT but
+ * is a logically lower priority, try to balance the system. Otherwise
+ * we can skip it, because anyone that could have pushed their tasks
+ * to us already did.
+ */
+ if (unlikely(rt_task(prev) && atomic_read(&rt_overload))) {
+ struct task_struct *preview = pick_rt_task(rq, cpu);
+
+ if (!preview || (preview->prio > prev->prio))
+ balance_rt_tasks(rq, cpu);
+ }
#endif
if (unlikely(!rq->nr_running))
^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [PATCH 5/7] RT: Add support for low-priority wake-up to push_rt feature
2007-10-13 0:15 ` [PATCH 5/7] RT: Add support for low-priority wake-up to push_rt feature Gregory Haskins
@ 2007-10-13 9:46 ` Peter Zijlstra
2007-10-15 18:57 ` Steven Rostedt
2007-10-15 18:05 ` Steven Rostedt
1 sibling, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2007-10-13 9:46 UTC (permalink / raw)
To: Gregory Haskins; +Cc: Steven Rostedt, RT, Ingo Molnar, LKML
[-- Attachment #1: Type: text/plain, Size: 314 bytes --]
On Fri, 2007-10-12 at 20:15 -0400, Gregory Haskins wrote:
> +/* Push all tasks that we can to other CPUs */
> +static void push_rt_tasks(struct rq *this_rq)
> +{
> + while (push_rt_task(this_rq));
> +}
I'd like to see an additional termination condition to this loop (might
just be paranoia though).
[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/7] RT: Add a per-cpu rt_overload indication
2007-10-13 0:15 ` [PATCH 1/7] RT: Add a per-cpu rt_overload indication Gregory Haskins
@ 2007-10-15 17:42 ` Steven Rostedt
0 siblings, 0 replies; 18+ messages in thread
From: Steven Rostedt @ 2007-10-15 17:42 UTC (permalink / raw)
To: Gregory Haskins; +Cc: Peter Zijlstra, RT, Ingo Molnar, LKML, Thomas Gleixner
--
On Fri, 12 Oct 2007, Gregory Haskins wrote:
> The system currently evaluates all online CPUs whenever one or more enters
> an rt_overload condition. This suffers from scalability limitations as
> the # of online CPUs increases. So we introduce a cpumask to track
> exactly which CPUs need RT balancing.
>
> Signed-off-by: Gregory Haskins <ghaskins@novell.com>
> CC: Peter W. Morreale <pmorreale@novell.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
This patch seems reasonable to me. I'll pull it into the queue unless
there's any NACKs.
-- Steve
> ---
>
> kernel/sched.c | 12 +++++++++---
> 1 files changed, 9 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/sched.c b/kernel/sched.c
> index 86ff36d..0a1ad0e 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -632,6 +632,7 @@ static inline struct rq *this_rq_lock(void)
>
> #if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
> static __cacheline_aligned_in_smp atomic_t rt_overload;
> +static cpumask_t rto_cpus;
> #endif
>
> static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
> @@ -640,8 +641,11 @@ static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
> if (rt_task(p)) {
> rq->rt_nr_running++;
> # ifdef CONFIG_SMP
> - if (rq->rt_nr_running == 2)
> + if (rq->rt_nr_running == 2) {
> + cpu_set(rq->cpu, rto_cpus);
> + smp_wmb();
> atomic_inc(&rt_overload);
> + }
> # endif
> }
> #endif
> @@ -654,8 +658,10 @@ static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq)
> WARN_ON(!rq->rt_nr_running);
> rq->rt_nr_running--;
> # ifdef CONFIG_SMP
> - if (rq->rt_nr_running == 1)
> + if (rq->rt_nr_running == 1) {
> atomic_dec(&rt_overload);
> + cpu_clear(rq->cpu, rto_cpus);
> + }
> # endif
> }
> #endif
> @@ -1624,7 +1630,7 @@ static void balance_rt_tasks(struct rq *this_rq, int this_cpu)
> */
> next = pick_next_task(this_rq, this_rq->curr);
>
> - for_each_online_cpu(cpu) {
> + for_each_cpu_mask(cpu, rto_cpus) {
> if (cpu == this_cpu)
> continue;
> src_rq = cpu_rq(cpu);
>
>
>
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 2/7] RT: Wrap the RQ notion of priority to make it conditional
2007-10-13 0:15 ` [PATCH 2/7] RT: Wrap the RQ notion of priority to make it conditional Gregory Haskins
@ 2007-10-15 17:45 ` Steven Rostedt
2007-10-15 19:57 ` Gregory Haskins
0 siblings, 1 reply; 18+ messages in thread
From: Steven Rostedt @ 2007-10-15 17:45 UTC (permalink / raw)
To: Gregory Haskins; +Cc: Peter Zijlstra, RT, Ingo Molnar, LKML, Thomas Gleixner
--
On Fri, 12 Oct 2007, Gregory Haskins wrote:
> A little cleanup to avoid #ifdef proliferation later in the series
>
> Signed-off-by: Gregory Haskins <ghaskins@novell.com>
NACK
> ---
>
> kernel/sched.c | 23 ++++++++++++++++++++---
> 1 files changed, 20 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/sched.c b/kernel/sched.c
> index 0a1ad0e..c9afc8a 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -255,6 +255,10 @@ struct cfs_rq {
> #endif
> };
>
> +#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
> +#define ENABLE_RQ_PRIORITY /* Steve wants this conditional on PREEMPT_RT */
Yes, I do want it conditional. But only because we want to test it first,
so the condition is really temporary. I agree with Peter that this should
be for non PREEMPT_RT as well, and I'll work on a patch for upstream.
But these defines setting defines is just ugly.
-- Steve
> +#endif /* CONFIG_PREEMPT_RT */
> +
> /* Real-Time classes' related field in a runqueue: */
> struct rt_rq {
> struct rt_prio_array active;
> @@ -304,6 +308,9 @@ struct rq {
> #ifdef CONFIG_PREEMPT_RT
> unsigned long rt_nr_running;
> unsigned long rt_nr_uninterruptible;
> +#endif
> +
> +#ifdef ENABLE_RQ_PRIORITY
> int curr_prio;
> #endif
>
> @@ -365,6 +372,16 @@ struct rq {
> static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
> static DEFINE_MUTEX(sched_hotcpu_mutex);
>
> +#ifdef ENABLE_RQ_PRIORITY
> +static inline void set_rq_prio(struct rq *rq, int prio)
> +{
> + rq->curr_prio = prio;
> +}
> +
> +#else
> +#define set_rq_prio(rq, prio) do { } while(0)
> +#endif
> +
> static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
> {
> rq->curr->sched_class->check_preempt_curr(rq, p);
> @@ -2331,9 +2348,9 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
> */
> prev_state = prev->state;
> _finish_arch_switch(prev);
> -#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
> - rq->curr_prio = current->prio;
> -#endif
> +
> + set_rq_prio(rq, current->prio);
> +
> finish_lock_switch(rq, prev);
> #if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
> /*
>
>
>
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 3/7] RT: Initialize the priority value
2007-10-13 0:15 ` [PATCH 3/7] RT: Initialize the priority value Gregory Haskins
@ 2007-10-15 17:46 ` Steven Rostedt
0 siblings, 0 replies; 18+ messages in thread
From: Steven Rostedt @ 2007-10-15 17:46 UTC (permalink / raw)
To: Gregory Haskins; +Cc: Peter Zijlstra, RT, Ingo Molnar, LKML, Thomas Gleixner
On Fri, 12 Oct 2007, Gregory Haskins wrote:
hmm, You forgot a description.
-- Steve
> Signed-off-by: Gregory Haskins <ghaskins@novell.com>
> ---
>
> kernel/sched.c | 2 ++
> 1 files changed, 2 insertions(+), 0 deletions(-)
>
> diff --git a/kernel/sched.c b/kernel/sched.c
> index c9afc8a..50c88e8 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -7395,6 +7395,8 @@ void __init sched_init(void)
> highest_cpu = i;
> /* delimiter for bitsearch: */
> __set_bit(MAX_RT_PRIO, array->bitmap);
> +
> + set_rq_prio(rq, MAX_PRIO);
> }
>
> set_load_weight(&init_task);
>
>
>
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 4/7] RT: Add support for updating push-rt priority under PI boost
2007-10-13 0:15 ` [PATCH 4/7] RT: Add support for updating push-rt priority under PI boost Gregory Haskins
@ 2007-10-15 17:47 ` Steven Rostedt
0 siblings, 0 replies; 18+ messages in thread
From: Steven Rostedt @ 2007-10-15 17:47 UTC (permalink / raw)
To: Gregory Haskins; +Cc: Peter Zijlstra, RT, Ingo Molnar, LKML, Thomas Gleixner
On Fri, 12 Oct 2007, Gregory Haskins wrote:
Again no description.
-- Steve
> Signed-off-by: Gregory Haskins <ghaskins@novell.com>
> ---
>
> kernel/sched.c | 4 ++++
> 1 files changed, 4 insertions(+), 0 deletions(-)
>
> diff --git a/kernel/sched.c b/kernel/sched.c
> index 50c88e8..62f9f0b 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -4663,6 +4663,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
> * this runqueue and our priority is higher than the current's
> */
> if (task_running(rq, p)) {
> + set_rq_prio(rq, p->prio);
> if (p->prio > oldprio)
> resched_task(rq->curr);
> } else {
> @@ -4722,6 +4723,9 @@ void set_user_nice(struct task_struct *p, long nice)
> */
> if (delta < 0 || (delta > 0 && task_running(rq, p)))
> resched_task(rq->curr);
> +
> + if (delta && task_running(rq, p))
> + set_rq_prio(rq, p->prio);
> }
> out_unlock:
> task_rq_unlock(rq, &flags);
>
>
>
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 5/7] RT: Add support for low-priority wake-up to push_rt feature
2007-10-13 0:15 ` [PATCH 5/7] RT: Add support for low-priority wake-up to push_rt feature Gregory Haskins
2007-10-13 9:46 ` Peter Zijlstra
@ 2007-10-15 18:05 ` Steven Rostedt
2007-10-15 19:54 ` Gregory Haskins
1 sibling, 1 reply; 18+ messages in thread
From: Steven Rostedt @ 2007-10-15 18:05 UTC (permalink / raw)
To: Gregory Haskins; +Cc: Peter Zijlstra, RT, Ingo Molnar, LKML
--
On Fri, 12 Oct 2007, Gregory Haskins wrote:
> There are three events that require consideration for redistributing RT
> tasks:
>
> 1) When one or more higher-priority tasks preempts a lower-one from a
> RQ
> 2) When a lower-priority task is woken up on a RQ
> 3) When a RQ downgrades its current priority
>
> Steve Rostedt's push_rt patch addresses (1). It hooks in right after
> a new task has been switched-in. If this was the result of an RT
> preemption, or if more than one task was awoken at the same time, we
> can try to push some of those other tasks away.
>
> This patch addresses (2). When we wake up a task, we check to see
> if it would preempt the current task on the queue. If it will not, we
> attempt to find a better suited CPU (e.g. one running something lower
> priority than the task being woken) and try to activate the task there.
>
> Finally, we have (3). In theory, we only need to balance_rt_tasks() if
> the following conditions are met:
> 1) One or more CPUs are in overload, AND
> 2) We are about to switch to a task that lowers our priority.
>
> (3) will be addressed in a later patch.
>
> Signed-off-by: Gregory Haskins <ghaskins@novell.com>
> ---
>
> kernel/sched.c | 68 ++++++++++++++++++++++++++++++++++----------------------
> 1 files changed, 41 insertions(+), 27 deletions(-)
>
> diff --git a/kernel/sched.c b/kernel/sched.c
> index 62f9f0b..3c71156 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -1628,6 +1628,12 @@ out:
> return ret;
> }
>
> +/* Push all tasks that we can to other CPUs */
> +static void push_rt_tasks(struct rq *this_rq)
> +{
> + while (push_rt_task(this_rq));
Loop conditions like this must be written as:
while (push_rt_task(this_rq))
;
So we don't accidentally put something inside the loop if we forget to add
the semicolon, like:
while (push_rt_task(this_rq))
do_something_not_expected_to_loop();
Of course you end your function after that and thus we would get an
compile error if the semicolon were to be missing. But we might add
code afterwards.
> +}
> +
> /*
> * Pull RT tasks from other CPUs in the RT-overload
> * case. Interrupts are disabled, local rq is locked.
> @@ -1988,7 +1994,33 @@ out_set_cpu:
> this_cpu = smp_processor_id();
> cpu = task_cpu(p);
> }
> -
> +
> + /*
> + * If a newly woken up RT task cannot preempt the
> + * current (RT) task (on a target runqueue) then try
> + * to find another CPU it can preempt:
> + */
> + if (rt_task(p) && !TASK_PREEMPTS_CURR(p, rq)) {
> + cpumask_t cpu_mask;
> + cpus_and(cpu_mask, cpu_online_map, p->cpus_allowed);
Hmm, maybe I should put that mask into the find_lowest_cpu function.
Of course I changed this a little in my last patch.
> +
> + new_cpu = find_lowest_cpu(&cpu_mask, p, rq);
> + if ((new_cpu != -1) && (new_cpu != cpu)) {
> + set_task_cpu(p, new_cpu);
> + spin_unlock(&rq->lock);
> +
> + /* The new lock was already acquired in find_lowest */
> + rq = cpu_rq(new_cpu);
> + old_state = p->state;
> + if (!(old_state & state))
> + goto out;
> + if (p->se.on_rq)
> + goto out_running;
> +
> + this_cpu = smp_processor_id();
Could we have preempted to get a new this_cpu?
> + cpu = task_cpu(p);
> + }
> + }
> out_activate:
> #endif /* CONFIG_SMP */
> update_rq_clock(rq);
> @@ -2002,30 +2034,13 @@ out_activate:
> * to find another CPU it can preempt:
> */
> if (rt_task(p) && !TASK_PREEMPTS_CURR(p, rq)) {
> - struct rq *this_rq = cpu_rq(this_cpu);
> /*
> - * Special-case: the task on this CPU can be
> - * preempted. In that case there's no need to
> - * trigger reschedules on other CPUs, we can
> - * mark the current task for reschedule.
> - *
> - * (Note that it's safe to access this_rq without
> - * extra locking in this particular case, because
> - * we are on the current CPU.)
> + * FIXME: Do we still need to do this here anymore, or
> + * does the preemption-check above suffice. The path
> + * that makes my head hurt is when we have the
> + * task_running->out_activate path
> */
> - if (TASK_PREEMPTS_CURR(p, this_rq))
> - set_tsk_need_resched(this_rq->curr);
> - else
> - /*
> - * Neither the intended target runqueue
> - * nor the current CPU can take this task.
> - * Trigger a reschedule on all other CPUs
> - * nevertheless, maybe one of them can take
> - * this task:
> - */
> - smp_send_reschedule_allbutself_cpumask(p->cpus_allowed);
> -
> - schedstat_inc(this_rq, rto_wakeup);
> + push_rt_tasks(rq);
I think the question is, doesn't this make the above not needed? The
push_rt_tasks should do what the previous condition did.
Maybe I'm missing something.
> } else {
> /*
> * Sync wakeups (i.e. those types of wakeups where the waker
> @@ -2360,13 +2375,12 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
> * the lock was owned by prev, we need to release it
> * first via finish_lock_switch and then reaquire it.
> */
> - if (unlikely(rt_task(current))) {
> + if (unlikely(rq->rt_nr_running > 1)) {
Heh, I guess that would work.
-- Steve
> spin_lock(&rq->lock);
> - /* push_rt_task will return true if it moved an RT */
> - while (push_rt_task(rq))
> - ;
> + push_rt_tasks(rq);
> spin_unlock(&rq->lock);
> }
> +
> #endif
> fire_sched_in_preempt_notifiers(current);
> trace_stop_sched_switched(current);
>
>
>
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 5/7] RT: Add support for low-priority wake-up to push_rt feature
2007-10-13 9:46 ` Peter Zijlstra
@ 2007-10-15 18:57 ` Steven Rostedt
0 siblings, 0 replies; 18+ messages in thread
From: Steven Rostedt @ 2007-10-15 18:57 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Gregory Haskins, RT, Ingo Molnar, LKML
--
On Sat, 13 Oct 2007, Peter Zijlstra wrote:
> On Fri, 2007-10-12 at 20:15 -0400, Gregory Haskins wrote:
>
> > +/* Push all tasks that we can to other CPUs */
> > +static void push_rt_tasks(struct rq *this_rq)
> > +{
> > + while (push_rt_task(this_rq));
> > +}
>
> I'd like to see an additional termination condition to this loop (might
> just be paranoia though).
Well if this fails to terminate, then we have a major bug.
Perhaps we can add something like this:
{
int count = NR_CPUS * 2;
while (push_rt_task(this_rq) && count--)
;
BUG_ON(!count);
}
Since we should really do it at most NR_CPUS times, and I added the CPU * 2 just
to be safe so that we don't hit an unrealistic case of moving tasks onto
other CPUs and having them finish before we finish this loop.
But really, I don't think we need to worry too much about that while loop
going too long. It terminates when there's no more RT tasks to migrate
off to other CPUs that have lower priority tasks.
Hmm, the thing we need to be careful with here is that we need to change
the prio of the CPU, otherwise we can push a task to a CPU even though a
higher-priority one was already queued.
/me plans to fix that!
-- Steve
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 5/7] RT: Add support for low-priority wake-up to push_rt feature
2007-10-15 18:05 ` Steven Rostedt
@ 2007-10-15 19:54 ` Gregory Haskins
0 siblings, 0 replies; 18+ messages in thread
From: Gregory Haskins @ 2007-10-15 19:54 UTC (permalink / raw)
To: Steven Rostedt; +Cc: Peter Zijlstra, RT, Ingo Molnar, LKML
On Mon, 2007-10-15 at 14:05 -0400, Steven Rostedt wrote:
> --
> On Fri, 12 Oct 2007, Gregory Haskins wrote:
>
> > There are three events that require consideration for redistributing RT
> > tasks:
> >
> > 1) When one or more higher-priority tasks preempts a lower-one from a
> > RQ
> > 2) When a lower-priority task is woken up on a RQ
> > 3) When a RQ downgrades its current priority
> >
> > Steve Rostedt's push_rt patch addresses (1). It hooks in right after
> > a new task has been switched-in. If this was the result of an RT
> > preemption, or if more than one task was awoken at the same time, we
> > can try to push some of those other tasks away.
> >
> > This patch addresses (2). When we wake up a task, we check to see
> > if it would preempt the current task on the queue. If it will not, we
> > attempt to find a better suited CPU (e.g. one running something lower
> > priority than the task being woken) and try to activate the task there.
> >
> > Finally, we have (3). In theory, we only need to balance_rt_tasks() if
> > the following conditions are met:
> > 1) One or more CPUs are in overload, AND
> > 2) We are about to switch to a task that lowers our priority.
> >
> > (3) will be addressed in a later patch.
> >
> > Signed-off-by: Gregory Haskins <ghaskins@novell.com>
> > ---
> >
> > kernel/sched.c | 68 ++++++++++++++++++++++++++++++++++----------------------
> > 1 files changed, 41 insertions(+), 27 deletions(-)
> >
> > diff --git a/kernel/sched.c b/kernel/sched.c
> > index 62f9f0b..3c71156 100644
> > --- a/kernel/sched.c
> > +++ b/kernel/sched.c
> > @@ -1628,6 +1628,12 @@ out:
> > return ret;
> > }
> >
> > +/* Push all tasks that we can to other CPUs */
> > +static void push_rt_tasks(struct rq *this_rq)
> > +{
> > + while (push_rt_task(this_rq));
>
> Loop conditions like this must be written as:
>
> while (push_rt_task(this_rq))
> ;
>
> So we don't accidentally put something inside the loop if we forget to add
> the semicolon, like:
>
> while (push_rt_task(this_rq)
>
> do_something_not_expected_to_loop();
>
> Of course you end your function after that and thus we would get an
> compile error if the semicolon were to be missing. But we might add
> code afterwards.
I'm not sure I really get what the difference is, but I don't feel
strongly one way or the other. I can make that change.
>
> > +}
> > +
> > /*
> > * Pull RT tasks from other CPUs in the RT-overload
> > * case. Interrupts are disabled, local rq is locked.
> > @@ -1988,7 +1994,33 @@ out_set_cpu:
> > this_cpu = smp_processor_id();
> > cpu = task_cpu(p);
> > }
> > -
> > +
> > + /*
> > + * If a newly woken up RT task cannot preempt the
> > + * current (RT) task (on a target runqueue) then try
> > + * to find another CPU it can preempt:
> > + */
> > + if (rt_task(p) && !TASK_PREEMPTS_CURR(p, rq)) {
> > + cpumask_t cpu_mask;
> > + cpus_and(cpu_mask, cpu_online_map, p->cpus_allowed);
>
> Hmm, maybe I should put that mask into the find_lowest_cpu function.
> Of course I changed this a little in my last patch.
If you have an update, send it along and I will rebase.
>
> > +
> > + new_cpu = find_lowest_cpu(&cpu_mask, p, rq);
> > + if ((new_cpu != -1) && (new_cpu != cpu)) {
> > + set_task_cpu(p, new_cpu);
> > + spin_unlock(&rq->lock);
> > +
> > + /* The new lock was already acquired in find_lowest */
> > + rq = cpu_rq(new_cpu);
> > + old_state = p->state;
> > + if (!(old_state & state))
> > + goto out;
> > + if (p->se.on_rq)
> > + goto out_running;
> > +
> > + this_cpu = smp_processor_id();
>
> Could we have preempted to get a new this_cpu?
That was a leftover from before we had the double_lock inside the search
function. I will clean this up.
>
> > + cpu = task_cpu(p);
> > + }
> > + }
> > out_activate:
> > #endif /* CONFIG_SMP */
> > update_rq_clock(rq);
> > @@ -2002,30 +2034,13 @@ out_activate:
> > * to find another CPU it can preempt:
> > */
> > if (rt_task(p) && !TASK_PREEMPTS_CURR(p, rq)) {
> > - struct rq *this_rq = cpu_rq(this_cpu);
> > /*
> > - * Special-case: the task on this CPU can be
> > - * preempted. In that case there's no need to
> > - * trigger reschedules on other CPUs, we can
> > - * mark the current task for reschedule.
> > - *
> > - * (Note that it's safe to access this_rq without
> > - * extra locking in this particular case, because
> > - * we are on the current CPU.)
> > + * FIXME: Do we still need to do this here anymore, or
> > + * does the preemption-check above suffice. The path
> > + * that makes my head hurt is when we have the
> > + * task_running->out_activate path
> > */
> > - if (TASK_PREEMPTS_CURR(p, this_rq))
> > - set_tsk_need_resched(this_rq->curr);
> > - else
> > - /*
> > - * Neither the intended target runqueue
> > - * nor the current CPU can take this task.
> > - * Trigger a reschedule on all other CPUs
> > - * nevertheless, maybe one of them can take
> > - * this task:
> > - */
> > - smp_send_reschedule_allbutself_cpumask(p->cpus_allowed);
> > -
> > - schedstat_inc(this_rq, rto_wakeup);
> > + push_rt_tasks(rq);
>
> I think the question is, doesn't this make the above not needed? The
> push_rt_tasks should do what the previous condition did.
>
> Maybe I'm missing something.
Well, only that it has a few efficiency-related advantages (1) to doing
this check before the activate() call. You are correct that either
place would yield correct behavior.
(1) -> We can save the overhead of an unnecessary activate/deactivate
cycle, and avoid placing the system (even if only briefly) into overload
which will potentially affect other CPUs if they happen to call
_schedule() during the window.
>
> > } else {
> > /*
> > * Sync wakeups (i.e. those types of wakeups where the waker
> > @@ -2360,13 +2375,12 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
> > * the lock was owned by prev, we need to release it
> > * first via finish_lock_switch and then reaquire it.
> > */
> > - if (unlikely(rt_task(current))) {
> > + if (unlikely(rq->rt_nr_running > 1)) {
>
> Heh, I guess that would work.
>
> -- Steve
>
> > spin_lock(&rq->lock);
> > - /* push_rt_task will return true if it moved an RT */
> > - while (push_rt_task(rq))
> > - ;
> > + push_rt_tasks(rq);
> > spin_unlock(&rq->lock);
> > }
> > +
> > #endif
> > fire_sched_in_preempt_notifiers(current);
> > trace_stop_sched_switched(current);
> >
> >
> >
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 2/7] RT: Wrap the RQ notion of priority to make it conditional
2007-10-15 17:45 ` Steven Rostedt
@ 2007-10-15 19:57 ` Gregory Haskins
0 siblings, 0 replies; 18+ messages in thread
From: Gregory Haskins @ 2007-10-15 19:57 UTC (permalink / raw)
To: Steven Rostedt; +Cc: Peter Zijlstra, RT, Ingo Molnar, LKML, Thomas Gleixner
On Mon, 2007-10-15 at 13:45 -0400, Steven Rostedt wrote:
>
> --
>
> On Fri, 12 Oct 2007, Gregory Haskins wrote:
>
> > A little cleanup to avoid #ifdef proliferation later in the series
> >
> > Signed-off-by: Gregory Haskins <ghaskins@novell.com>
>
> NACK
>
> > ---
> >
> > kernel/sched.c | 23 ++++++++++++++++++++---
> > 1 files changed, 20 insertions(+), 3 deletions(-)
> >
> > diff --git a/kernel/sched.c b/kernel/sched.c
> > index 0a1ad0e..c9afc8a 100644
> > --- a/kernel/sched.c
> > +++ b/kernel/sched.c
> > @@ -255,6 +255,10 @@ struct cfs_rq {
> > #endif
> > };
> >
> > +#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
> > +#define ENABLE_RQ_PRIORITY /* Steve wants this conditional on PREEMPT_RT */
>
> Yes, I do want it conditional. But only because we want to test it first,
> so the condition is really temporary. I agree with Peter that this should
> be for non PREEMPT_RT as well, and I'll work on a patch for upstream.
>
> But these defines setting defines is just ugly.
Ok. Keep in mind the spirit of the patch was to wrap the setting so we
didn't need more #ifdefs later, not so much to make that define. I can
clean up the ifdef of ifdef part no problem. I was just trying to keep
it one place to make the inevitable future change to share with non
PREEMPT_RT easier ;)
>
> -- Steve
>
>
> > +#endif /* CONFIG_PREEMPT_RT */
> > +
> > /* Real-Time classes' related field in a runqueue: */
> > struct rt_rq {
> > struct rt_prio_array active;
> > @@ -304,6 +308,9 @@ struct rq {
> > #ifdef CONFIG_PREEMPT_RT
> > unsigned long rt_nr_running;
> > unsigned long rt_nr_uninterruptible;
> > +#endif
> > +
> > +#ifdef ENABLE_RQ_PRIORITY
> > int curr_prio;
> > #endif
> >
> > @@ -365,6 +372,16 @@ struct rq {
> > static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
> > static DEFINE_MUTEX(sched_hotcpu_mutex);
> >
> > +#ifdef ENABLE_RQ_PRIORITY
> > +static inline void set_rq_prio(struct rq *rq, int prio)
> > +{
> > + rq->curr_prio = prio;
> > +}
> > +
> > +#else
> > +#define set_rq_prio(rq, prio) do { } while(0)
> > +#endif
> > +
> > static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
> > {
> > rq->curr->sched_class->check_preempt_curr(rq, p);
> > @@ -2331,9 +2348,9 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
> > */
> > prev_state = prev->state;
> > _finish_arch_switch(prev);
> > -#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
> > - rq->curr_prio = current->prio;
> > -#endif
> > +
> > + set_rq_prio(rq, current->prio);
> > +
> > finish_lock_switch(rq, prev);
> > #if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
> > /*
> >
> >
> >
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 6/7] RT: Select tasks based on relative affinity
2007-10-13 0:16 ` [PATCH 6/7] RT: Select tasks based on relative affinity Gregory Haskins
@ 2007-10-15 20:08 ` Gregory Haskins
0 siblings, 0 replies; 18+ messages in thread
From: Gregory Haskins @ 2007-10-15 20:08 UTC (permalink / raw)
To: Steven Rostedt; +Cc: Peter Zijlstra, RT, Ingo Molnar, LKML
On Fri, 2007-10-12 at 20:16 -0400, Gregory Haskins wrote:
> In theory, tasks will be most efficient if they are allowed to re-wake to
> the CPU that they last ran on due to cache affinity. Short of that, it is
> cheaper to wake up the current CPU. If neither of those two are options,
> than the lowest CPU will do.
>
> Signed-off-by: Gregory Haskins <ghaskins@novell.com>
> ---
>
> kernel/sched.c | 72 +++++++++++++++++++++++++++++++++++++++++---------------
> 1 files changed, 53 insertions(+), 19 deletions(-)
>
> diff --git a/kernel/sched.c b/kernel/sched.c
> index 3c71156..b79b968 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -1511,6 +1511,15 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
> /* Only try this algorithm three times */
> #define RT_PUSH_MAX_TRIES 3
>
> +static int non_rt_cpu(cpumask_t *mask, int cpu)
> +{
> + if (!rt_prio(cpu_rq(cpu)->curr_prio) &&
> + cpu_isset(cpu, *mask))
> + return 1;
> +
> + return 0;
> +}
> +
> /* Will lock the rq it finds */
> static int find_lowest_cpu(cpumask_t *cpu_mask, struct task_struct *task,
> struct rq *this_rq)
> @@ -1519,32 +1528,57 @@ static int find_lowest_cpu(cpumask_t *cpu_mask, struct task_struct *task,
> int dst_cpu = -1;
> int cpu;
> int tries;
> + int this_cpu = smp_processor_id();
>
> for (tries = 0; tries < RT_PUSH_MAX_TRIES; tries++) {
> +
> /*
> - * Scan each rq for the lowest prio.
> + * We select a CPU in the following priority:
> + *
> + * task_cpu, this_cpu, first_cpu
> + *
> + * for efficiency.
> + *
> + * - task_cpu preserves cache affinity
After thinking about this over the weekend, the task_cpu optimization
doesn't make sense. It made sense in my original design before I
integrated with Steve's series, but not here. I think the second
optimization (this_cpu) still makes sense though, so I will update this
patch for the next drop.
> + * - this_cpu is (presumably) cheaper to preempt
> + * (note that sometimes task_cpu and this_cpu
> + * are the same).
> + * - Finally, we will take whatever is available
> + * if the first two don't pan out by scanning.
> */
> - for_each_cpu_mask(cpu, *cpu_mask) {
> - struct rq *rq = &per_cpu(runqueues, cpu);
> -
> - if (cpu == smp_processor_id())
> - continue;
> -
> - /* We look for lowest RT prio or non-rt CPU */
> - if (rq->curr_prio >= MAX_RT_PRIO) {
> - lowest_rq = rq;
> - dst_cpu = cpu;
> - break;
> - }
> + if (non_rt_cpu(cpu_mask, task_cpu(task))) {
> + lowest_rq = task_rq(task);
> + dst_cpu = lowest_rq->cpu;
> + } else if (non_rt_cpu(cpu_mask, this_cpu)) {
> + dst_cpu = this_cpu;
> + lowest_rq = cpu_rq(this_cpu);
> + } else {
>
> - /* no locking for now */
> - if (rq->curr_prio > task->prio &&
> - (!lowest_rq || rq->curr_prio < lowest_rq->curr_prio)) {
> - dst_cpu = cpu;
> - lowest_rq = rq;
> + /*
> + * Scan each rq for the lowest prio.
> + */
> + for_each_cpu_mask(cpu, *cpu_mask) {
> + struct rq *rq = &per_cpu(runqueues, cpu);
> +
> + if (cpu == this_cpu)
> + continue;
> +
> + /* We look for lowest RT prio or non-rt CPU */
> + if (rq->curr_prio >= MAX_RT_PRIO) {
> + lowest_rq = rq;
> + dst_cpu = cpu;
> + break;
> + }
> +
> + /* no locking for now */
> + if (rq->curr_prio > task->prio &&
> + (!lowest_rq || rq->curr_prio < lowest_rq->curr_prio)) {
> + dst_cpu = cpu;
> + lowest_rq = rq;
> + }
> }
> }
> -
> +
> if (!lowest_rq) {
> dst_cpu = -1;
> break;
>
^ permalink raw reply [flat|nested] 18+ messages in thread
end of thread, other threads:[~2007-10-15 20:11 UTC | newest]
Thread overview: 18+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-10-13 0:15 [PATCH 0/7] RT: RT-Overload/Sched enhancements (v2) Gregory Haskins
2007-10-13 0:15 ` [PATCH 1/7] RT: Add a per-cpu rt_overload indication Gregory Haskins
2007-10-15 17:42 ` Steven Rostedt
2007-10-13 0:15 ` [PATCH 2/7] RT: Wrap the RQ notion of priority to make it conditional Gregory Haskins
2007-10-15 17:45 ` Steven Rostedt
2007-10-15 19:57 ` Gregory Haskins
2007-10-13 0:15 ` [PATCH 3/7] RT: Initialize the priority value Gregory Haskins
2007-10-15 17:46 ` Steven Rostedt
2007-10-13 0:15 ` [PATCH 4/7] RT: Add support for updating push-rt priority under PI boost Gregory Haskins
2007-10-15 17:47 ` Steven Rostedt
2007-10-13 0:15 ` [PATCH 5/7] RT: Add support for low-priority wake-up to push_rt feature Gregory Haskins
2007-10-13 9:46 ` Peter Zijlstra
2007-10-15 18:57 ` Steven Rostedt
2007-10-15 18:05 ` Steven Rostedt
2007-10-15 19:54 ` Gregory Haskins
2007-10-13 0:16 ` [PATCH 6/7] RT: Select tasks based on relative affinity Gregory Haskins
2007-10-15 20:08 ` Gregory Haskins
2007-10-13 0:16 ` [PATCH 7/7] RT: (RFC) Only try to pull tasks in if we are downgrading our priority Gregory Haskins
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox