[RFC PATCH 0/3] Series short description

All of lore.kernel.org
 help / color / mirror / Atom feed

* [RFC PATCH 0/3] Series short description
@ 2008-11-11 14:26 Gregory Haskins
  2008-11-11 14:26 ` [RFC PATCH 1/3] sched: track the next-highest priority on each runqueue Gregory Haskins
                   ` (5 more replies)
  0 siblings, 6 replies; 25+ messages in thread
From: Gregory Haskins @ 2008-11-11 14:26 UTC (permalink / raw)
  To: mingo, rostedt, peterz; +Cc: linux-kernel, =linux-rt-users

Hi Ingo, Steven, Peter,
  This series applies roughly to mainline as an enhancement to the sched_rt
  logic.  Peter and I were discussing some of the unecessary overhead in
  pull_rt_tasks() via IRC, and this is my RFC attempt to address the problem.

  I have built/booted this on a 4-way C2D Xeon box and it passes preempt-test.

  Comments, please.

  -Greg

---

Gregory Haskins (3):
      sched: use highest_prio.next to optimize pull operations
      sched: use highest_prio.curr for pull threshold
      sched: track the next-highest priority on each runqueue


 kernel/sched.c    |    8 ++-
 kernel/sched_rt.c |  158 +++++++++++++++++++++++++++++++----------------------
 2 files changed, 99 insertions(+), 67 deletions(-)

-- 
Signature

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [RFC PATCH 1/3] sched: track the next-highest priority on each runqueue
  2008-11-11 14:26 [RFC PATCH 0/3] Series short description Gregory Haskins
@ 2008-11-11 14:26 ` Gregory Haskins
  2008-11-11 14:26 ` [RFC PATCH 2/3] sched: use highest_prio.curr for pull threshold Gregory Haskins
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 25+ messages in thread
From: Gregory Haskins @ 2008-11-11 14:26 UTC (permalink / raw)
  To: mingo, rostedt, peterz; +Cc: linux-kernel, =linux-rt-users

We will use this later in the series to reduce the amount of rq-lock
contention during a pull operation

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 kernel/sched.c    |    8 +++-
 kernel/sched_rt.c |  115 +++++++++++++++++++++++++++++++++++------------------
 2 files changed, 81 insertions(+), 42 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 6625c3c..a29193b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -448,7 +448,10 @@ struct rt_rq {
 	struct rt_prio_array active;
 	unsigned long rt_nr_running;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-	int highest_prio; /* highest queued rt task prio */
+	struct {
+		int curr; /* highest queued rt task prio */
+		int next; /* next highest */
+	} highest_prio;
 #endif
 #ifdef CONFIG_SMP
 	unsigned long rt_nr_migratory;
@@ -8040,7 +8043,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 	__set_bit(MAX_RT_PRIO, array->bitmap);
 
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-	rt_rq->highest_prio = MAX_RT_PRIO;
+	rt_rq->highest_prio.curr = MAX_RT_PRIO;
+	rt_rq->highest_prio.next = MAX_RT_PRIO;
 #endif
 #ifdef CONFIG_SMP
 	rt_rq->rt_nr_migratory = 0;
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index b446dc8..d3b54ba 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -108,7 +108,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 	if (rt_rq->rt_nr_running) {
 		if (rt_se && !on_rt_rq(rt_se))
 			enqueue_rt_entity(rt_se);
-		if (rt_rq->highest_prio < curr->prio)
+		if (rt_rq->highest_prio.curr < curr->prio)
 			resched_task(curr);
 	}
 }
@@ -473,7 +473,7 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
 	struct rt_rq *rt_rq = group_rt_rq(rt_se);
 
 	if (rt_rq)
-		return rt_rq->highest_prio;
+		return rt_rq->highest_prio.curr;
 #endif
 
 	return rt_task_of(rt_se)->prio;
@@ -547,33 +547,66 @@ static void update_curr_rt(struct rq *rq)
 	}
 }
 
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+
+static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
+
+static inline int next_prio(struct rq *rq)
+{
+	struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
+
+	if (next && rt_prio(next->prio))
+		return next->prio;
+	else
+		return MAX_RT_PRIO;
+}
+#endif
+
 static inline
 void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
-	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
-	rt_rq->rt_nr_running++;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-	if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
+	int prio = rt_se_prio(rt_se);
+#endif
 #ifdef CONFIG_SMP
-		struct rq *rq = rq_of_rt_rq(rt_rq);
+	struct rq *rq = rq_of_rt_rq(rt_rq);
 #endif
 
-		rt_rq->highest_prio = rt_se_prio(rt_se);
+	WARN_ON(!rt_prio(prio));
+	rt_rq->rt_nr_running++;
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+	if (prio < rt_rq->highest_prio.curr) {
+
+		/*
+		 * If the new task is higher in priority than anything on the
+		 * run-queue, we have a new high that must be published to
+		 * the world.  We also know that the previous high becomes
+		 * our next-highest.
+		 */
+		rt_rq->highest_prio.next = rt_rq->highest_prio.curr;
+		rt_rq->highest_prio.curr = prio;
 #ifdef CONFIG_SMP
 		if (rq->online)
-			cpupri_set(&rq->rd->cpupri, rq->cpu,
-				   rt_se_prio(rt_se));
+			cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
 #endif
-	}
+	} else if (prio == rt_rq->highest_prio.curr)
+		/*
+		 * If the next task is equal in priority to the highest on
+		 * the run-queue, then we implicitly know that the next highest
+		 * task cannot be any lower than current
+		 */
+		rt_rq->highest_prio.next = prio;
+	else if (prio < rt_rq->highest_prio.next)
+		/*
+		 * Otherwise, we need to recompute next-highest
+		 */
+		rt_rq->highest_prio.next = next_prio(rq);
 #endif
 #ifdef CONFIG_SMP
-	if (rt_se->nr_cpus_allowed > 1) {
-		struct rq *rq = rq_of_rt_rq(rt_rq);
-
+	if (rt_se->nr_cpus_allowed > 1)
 		rq->rt.rt_nr_migratory++;
-	}
 
-	update_rt_migration(rq_of_rt_rq(rt_rq));
+	update_rt_migration(rq);
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
 	if (rt_se_boosted(rt_se))
@@ -589,8 +622,9 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 static inline
 void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
-#ifdef CONFIG_SMP
-	int highest_prio = rt_rq->highest_prio;
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+	struct rq *rq = rq_of_rt_rq(rt_rq);
+	int highest_prio = rt_rq->highest_prio.curr;
 #endif
 
 	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
@@ -598,33 +632,34 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 	rt_rq->rt_nr_running--;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 	if (rt_rq->rt_nr_running) {
-		struct rt_prio_array *array;
+		int prio = rt_se_prio(rt_se);
+
+		WARN_ON(prio < rt_rq->highest_prio.curr);
 
-		WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio);
-		if (rt_se_prio(rt_se) == rt_rq->highest_prio) {
-			/* recalculate */
-			array = &rt_rq->active;
-			rt_rq->highest_prio =
+		/*
+		 * This may have been our highest or next-highest priority
+		 * task and therefore we may have some recomputation to do
+		 */
+		if (prio == rt_rq->highest_prio.curr) {
+			struct rt_prio_array *array = &rt_rq->active;
+
+			rt_rq->highest_prio.curr =
 				sched_find_first_bit(array->bitmap);
-		} /* otherwise leave rq->highest prio alone */
+		}
+
+		if (prio == rt_rq->highest_prio.next)
+			rt_rq->highest_prio.next = next_prio(rq);
 	} else
-		rt_rq->highest_prio = MAX_RT_PRIO;
+		rt_rq->highest_prio.curr = MAX_RT_PRIO;
 #endif
 #ifdef CONFIG_SMP
-	if (rt_se->nr_cpus_allowed > 1) {
-		struct rq *rq = rq_of_rt_rq(rt_rq);
+	if (rt_se->nr_cpus_allowed > 1)
 		rq->rt.rt_nr_migratory--;
-	}
 
-	if (rt_rq->highest_prio != highest_prio) {
-		struct rq *rq = rq_of_rt_rq(rt_rq);
-
-		if (rq->online)
-			cpupri_set(&rq->rd->cpupri, rq->cpu,
-				   rt_rq->highest_prio);
-	}
+	if (rq->online && rt_rq->highest_prio.curr != highest_prio)
+		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
 
-	update_rt_migration(rq_of_rt_rq(rt_rq));
+	update_rt_migration(rq);
 #endif /* CONFIG_SMP */
 #ifdef CONFIG_RT_GROUP_SCHED
 	if (rt_se_boosted(rt_se))
@@ -1069,7 +1104,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
 		}
 
 		/* If this rq is still suitable use it. */
-		if (lowest_rq->rt.highest_prio > task->prio)
+		if (lowest_rq->rt.highest_prio.curr > task->prio)
 			break;
 
 		/* try again */
@@ -1257,7 +1292,7 @@ static int pull_rt_task(struct rq *this_rq)
 static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
 {
 	/* Try to pull RT tasks here if we lower this rq's prio */
-	if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio)
+	if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio)
 		pull_rt_task(rq);
 }
 
@@ -1343,7 +1378,7 @@ static void rq_online_rt(struct rq *rq)
 
 	__enable_runtime(rq);
 
-	cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio);
+	cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
 }
 
 /* Assumes rq->lock is held */
@@ -1426,7 +1461,7 @@ static void prio_changed_rt(struct rq *rq, struct task_struct *p,
 		 * can release the rq lock and p could migrate.
 		 * Only reschedule if p is still on the same runqueue.
 		 */
-		if (p->prio > rq->rt.highest_prio && rq->curr == p)
+		if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
 			resched_task(p);
 #else
 		/* For UP simply resched on drop of prio */


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [RFC PATCH 2/3] sched: use highest_prio.curr for pull threshold
  2008-11-11 14:26 [RFC PATCH 0/3] Series short description Gregory Haskins
  2008-11-11 14:26 ` [RFC PATCH 1/3] sched: track the next-highest priority on each runqueue Gregory Haskins
@ 2008-11-11 14:26 ` Gregory Haskins
  2008-11-11 14:26 ` [RFC PATCH 3/3] sched: use highest_prio.next to optimize pull operations Gregory Haskins
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 25+ messages in thread
From: Gregory Haskins @ 2008-11-11 14:26 UTC (permalink / raw)
  To: mingo, rostedt, peterz; +Cc: linux-kernel, =linux-rt-users

highest_prio.curr is actually a more accurate way to keep track of
the pull_rt_task() threshold since it is always up to date, even
if the "next" task migrates during double_lock.  Therefore, stop
looking at the "next" task object and simply use the highest_prio.curr.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 kernel/sched_rt.c |   31 ++++++-------------------------
 1 files changed, 6 insertions(+), 25 deletions(-)

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index d3b54ba..b2305c9 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1206,14 +1206,12 @@ static void push_rt_tasks(struct rq *rq)
 static int pull_rt_task(struct rq *this_rq)
 {
 	int this_cpu = this_rq->cpu, ret = 0, cpu;
-	struct task_struct *p, *next;
+	struct task_struct *p;
 	struct rq *src_rq;
 
 	if (likely(!rt_overloaded(this_rq)))
 		return 0;
 
-	next = pick_next_task_rt(this_rq);
-
 	for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) {
 		if (this_cpu == cpu)
 			continue;
@@ -1222,17 +1220,9 @@ static int pull_rt_task(struct rq *this_rq)
 		/*
 		 * We can potentially drop this_rq's lock in
 		 * double_lock_balance, and another CPU could
-		 * steal our next task - hence we must cause
-		 * the caller to recalculate the next task
-		 * in that case:
+		 * alter this_rq
 		 */
-		if (double_lock_balance(this_rq, src_rq)) {
-			struct task_struct *old_next = next;
-
-			next = pick_next_task_rt(this_rq);
-			if (next != old_next)
-				ret = 1;
-		}
+		double_lock_balance(this_rq, src_rq);
 
 		/*
 		 * Are there still pullable RT tasks?
@@ -1246,7 +1236,7 @@ static int pull_rt_task(struct rq *this_rq)
 		 * Do we have an RT task that preempts
 		 * the to-be-scheduled task?
 		 */
-		if (p && (!next || (p->prio < next->prio))) {
+		if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
 			WARN_ON(p == src_rq->curr);
 			WARN_ON(!p->se.on_rq);
 
@@ -1256,12 +1246,9 @@ static int pull_rt_task(struct rq *this_rq)
 			 * This is just that p is wakeing up and hasn't
 			 * had a chance to schedule. We only pull
 			 * p if it is lower in priority than the
-			 * current task on the run queue or
-			 * this_rq next task is lower in prio than
-			 * the current task on that rq.
+			 * current task on the run queue
 			 */
-			if (p->prio < src_rq->curr->prio ||
-			    (next && next->prio < src_rq->curr->prio))
+			if (p->prio < src_rq->curr->prio)
 				goto skip;
 
 			ret = 1;
@@ -1274,13 +1261,7 @@ static int pull_rt_task(struct rq *this_rq)
 			 * case there's an even higher prio task
 			 * in another runqueue. (low likelyhood
 			 * but possible)
-			 *
-			 * Update next so that we won't pick a task
-			 * on another cpu with a priority lower (or equal)
-			 * than the one we just picked.
 			 */
-			next = p;
-
 		}
  skip:
 		double_unlock_balance(this_rq, src_rq);


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [RFC PATCH 3/3] sched: use highest_prio.next to optimize pull operations
  2008-11-11 14:26 [RFC PATCH 0/3] Series short description Gregory Haskins
  2008-11-11 14:26 ` [RFC PATCH 1/3] sched: track the next-highest priority on each runqueue Gregory Haskins
  2008-11-11 14:26 ` [RFC PATCH 2/3] sched: use highest_prio.curr for pull threshold Gregory Haskins
@ 2008-11-11 14:26 ` Gregory Haskins
  2008-11-11 17:56 ` [RFC PATCH 0/3] Series short description Ingo Molnar
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 25+ messages in thread
From: Gregory Haskins @ 2008-11-11 14:26 UTC (permalink / raw)
  To: mingo, rostedt, peterz; +Cc: linux-kernel, =linux-rt-users

We currently take the rq->lock for every cpu in an overload state during
pull_rt_tasks().  However, we now have enough information via the
highest_prio.[curr|next] fields to determine if there is any tasks of
interest to warrant the overhead of the rq->lock, before we actually take
it.  So we use this information to reduce lock contention during the
pull for the case where the source-rq doesnt have tasks that preempt
the current task.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 kernel/sched_rt.c |   12 ++++++++++++
 1 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index b2305c9..d722aef 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1217,6 +1217,18 @@ static int pull_rt_task(struct rq *this_rq)
 			continue;
 
 		src_rq = cpu_rq(cpu);
+
+		/*
+		 * Don't bother taking the src_rq->lock if the next highest
+		 * task is known to be lower-priority than our current task.
+		 * This may look racy, but if this value is about to go
+		 * logically higher, the src_rq will push this task away.
+		 * And if its going logically lower, we do not care
+		 */
+		if (src_rq->rt.highest_prio.next >=
+		    this_rq->rt.highest_prio.curr)
+			continue;
+
 		/*
 		 * We can potentially drop this_rq's lock in
 		 * double_lock_balance, and another CPU could


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [RFC PATCH 0/3] Series short description
  2008-11-11 14:26 [RFC PATCH 0/3] Series short description Gregory Haskins
                   ` (2 preceding siblings ...)
  2008-11-11 14:26 ` [RFC PATCH 3/3] sched: use highest_prio.next to optimize pull operations Gregory Haskins
@ 2008-11-11 17:56 ` Ingo Molnar
  2008-11-11 18:51   ` Gregory Haskins
  2008-11-11 19:31 ` Chris Friesen
  2008-12-03 20:01 ` Gregory Haskins
  5 siblings, 1 reply; 25+ messages in thread
From: Ingo Molnar @ 2008-11-11 17:56 UTC (permalink / raw)
  To: Gregory Haskins; +Cc: rostedt, peterz, linux-kernel, =linux-rt-users


* Gregory Haskins <ghaskins@novell.com> wrote:

> Hi Ingo, Steven, Peter,
>   This series applies roughly to mainline as an enhancement to the sched_rt
>   logic.  Peter and I were discussing some of the unecessary overhead in
>   pull_rt_tasks() via IRC, and this is my RFC attempt to address the problem.
> 
>   I have built/booted this on a 4-way C2D Xeon box and it passes preempt-test.
> 
>   Comments, please.

this direction looks very nifty to me. Peter, do you Ack them?

	Ingo

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC PATCH 0/3] Series short description
  2008-11-11 17:56 ` [RFC PATCH 0/3] Series short description Ingo Molnar
@ 2008-11-11 18:51   ` Gregory Haskins
  0 siblings, 0 replies; 25+ messages in thread
From: Gregory Haskins @ 2008-11-11 18:51 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: rostedt, peterz, linux-kernel, linux-rt-users

[-- Attachment #1: Type: text/plain, Size: 728 bytes --]

Ingo Molnar wrote:
> * Gregory Haskins <ghaskins@novell.com> wrote:
>
>   
>> Hi Ingo, Steven, Peter,
>>   This series applies roughly to mainline as an enhancement to the sched_rt
>>   logic.  Peter and I were discussing some of the unecessary overhead in
>>   pull_rt_tasks() via IRC, and this is my RFC attempt to address the problem.
>>
>>   I have built/booted this on a 4-way C2D Xeon box and it passes preempt-test.
>>
>>   Comments, please.
>>     
>
> this direction looks very nifty to me. Peter, do you Ack them?
>
> 	Ingo
>   
[I just noticed that I fat-fingered the -rt list address, so here is a
link to the thread for those subscribed to -rt]

http://lkml.org/lkml/2008/11/11/193

-Greg


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 257 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC PATCH 0/3] Series short description
  2008-11-11 14:26 [RFC PATCH 0/3] Series short description Gregory Haskins
                   ` (3 preceding siblings ...)
  2008-11-11 17:56 ` [RFC PATCH 0/3] Series short description Ingo Molnar
@ 2008-11-11 19:31 ` Chris Friesen
  2008-11-11 19:50   ` Gregory Haskins
  2008-12-03 20:01 ` Gregory Haskins
  5 siblings, 1 reply; 25+ messages in thread
From: Chris Friesen @ 2008-11-11 19:31 UTC (permalink / raw)
  To: Gregory Haskins; +Cc: mingo, rostedt, peterz, linux-kernel, linux-rt-users

Gregory Haskins wrote:
> Hi Ingo, Steven, Peter,
>   This series applies roughly to mainline as an enhancement to the sched_rt
>   logic.  Peter and I were discussing some of the unecessary overhead in
>   pull_rt_tasks() via IRC, and this is my RFC attempt to address the problem.
> 
>   I have built/booted this on a 4-way C2D Xeon box and it passes preempt-test.

At first blush it looks reasonable, but do you have any performance numbers?

Chris

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC PATCH 0/3] Series short description
  2008-11-11 19:31 ` Chris Friesen
@ 2008-11-11 19:50   ` Gregory Haskins
  0 siblings, 0 replies; 25+ messages in thread
From: Gregory Haskins @ 2008-11-11 19:50 UTC (permalink / raw)
  To: Chris Friesen; +Cc: mingo, rostedt, peterz, linux-kernel, linux-rt-users

[-- Attachment #1: Type: text/plain, Size: 947 bytes --]

Chris Friesen wrote:
> Gregory Haskins wrote:
>> Hi Ingo, Steven, Peter,
>>   This series applies roughly to mainline as an enhancement to the
>> sched_rt
>>   logic.  Peter and I were discussing some of the unecessary overhead in
>>   pull_rt_tasks() via IRC, and this is my RFC attempt to address the
>> problem.
>>
>>   I have built/booted this on a 4-way C2D Xeon box and it passes
>> preempt-test.
>
> At first blush it looks reasonable, but do you have any performance
> numbers?

Hi Chris,
  That is a perfectly reasonable request, but unfortunately I do not
have any at this time.  Its currently all based on the observation that
pull_rt_tasks() can cause excessive rq->lock contention and the theory
that this should reduce the cases where that happens.  It still needs to
be proven and quantified, outside of the basic litmus test I gave it
before posting.

I will try to get to this ASAP.

Regards,
-Greg



[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 257 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC PATCH 0/3] Series short description
  2008-11-11 14:26 [RFC PATCH 0/3] Series short description Gregory Haskins
                   ` (4 preceding siblings ...)
  2008-11-11 19:31 ` Chris Friesen
@ 2008-12-03 20:01 ` Gregory Haskins
  2008-12-03 20:30   ` Randy Dunlap
  2008-12-03 20:40   ` [RFC PATCH 0/3] Series short description Ingo Molnar
  5 siblings, 2 replies; 25+ messages in thread
From: Gregory Haskins @ 2008-12-03 20:01 UTC (permalink / raw)
  To: Gregory Haskins; +Cc: mingo, rostedt, peterz, linux-kernel, =linux-rt-users

Gregory Haskins wrote:
> Hi Ingo, Steven, Peter,
>   This series applies roughly to mainline as an enhancement to the sched_rt
>   logic.  Peter and I were discussing some of the unecessary overhead in
>   pull_rt_tasks() via IRC, and this is my RFC attempt to address the problem.
>   

Hi Ingo,
  I was putting together a refresh of this patch and I rebased on
tip/sched/devel.  I noticed that sched/devel is on 27-rc8, which is
obviously a little stale.  Is this the proper tree to send patches
against, or would you prefer a different HEAD?

-Greg

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC PATCH 0/3] Series short description
  2008-12-03 20:01 ` Gregory Haskins
@ 2008-12-03 20:30   ` Randy Dunlap
  2008-12-03 20:39     ` [RFC PATCH 0/3] sched: track next-highest priority (was "Series short discription") Gregory Haskins
  2008-12-03 20:40   ` [RFC PATCH 0/3] Series short description Ingo Molnar
  1 sibling, 1 reply; 25+ messages in thread
From: Randy Dunlap @ 2008-12-03 20:30 UTC (permalink / raw)
  To: Gregory Haskins
  Cc: Gregory Haskins, mingo, rostedt, peterz, linux-kernel,
	=linux-rt-users

Gregory Haskins wrote:
> Gregory Haskins wrote:
>> Hi Ingo, Steven, Peter,
>>   This series applies roughly to mainline as an enhancement to the sched_rt
>>   logic.  Peter and I were discussing some of the unecessary overhead in
>>   pull_rt_tasks() via IRC, and this is my RFC attempt to address the problem.
>>   
> 
> Hi Ingo,
>   I was putting together a refresh of this patch and I rebased on
> tip/sched/devel.  I noticed that sched/devel is on 27-rc8, which is
> obviously a little stale.  Is this the proper tree to send patches
> against, or would you prefer a different HEAD?

BTW, it sure would be nice if the Subject: gave some idea of
the, uh, subject of the patch series.

Thanks,
~Randy


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC PATCH 0/3] sched: track next-highest priority (was "Series short discription")
  2008-12-03 20:30   ` Randy Dunlap
@ 2008-12-03 20:39     ` Gregory Haskins
  0 siblings, 0 replies; 25+ messages in thread
From: Gregory Haskins @ 2008-12-03 20:39 UTC (permalink / raw)
  To: Randy Dunlap
  Cc: Gregory Haskins, mingo, rostedt, peterz, linux-kernel,
	linux-rt-users

Randy Dunlap wrote:
> BTW, it sure would be nice if the Subject: gave some idea of
> the, uh, subject of the patch series.
>
>   

Hi Randy,
  Indeed.  I fat-fingered that one on the original patch submission and
now it persists in the thread for all eternity for all who reply to the
original ;)

I have fixed it above.  Sorry for the confusion.

-Greg


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC PATCH 0/3] Series short description
  2008-12-03 20:01 ` Gregory Haskins
  2008-12-03 20:30   ` Randy Dunlap
@ 2008-12-03 20:40   ` Ingo Molnar
  2008-12-03 22:09     ` [PATCH v2 0/4] sched: track next-highest priority Gregory Haskins
  1 sibling, 1 reply; 25+ messages in thread
From: Ingo Molnar @ 2008-12-03 20:40 UTC (permalink / raw)
  To: Gregory Haskins
  Cc: Gregory Haskins, rostedt, peterz, linux-kernel, =linux-rt-users


* Gregory Haskins <gregory.haskins.ml@gmail.com> wrote:

> Gregory Haskins wrote:
> > Hi Ingo, Steven, Peter,
> >   This series applies roughly to mainline as an enhancement to the sched_rt
> >   logic.  Peter and I were discussing some of the unecessary overhead in
> >   pull_rt_tasks() via IRC, and this is my RFC attempt to address the problem.
> >   
> 
> Hi Ingo,
>   I was putting together a refresh of this patch and I rebased on
> tip/sched/devel.  I noticed that sched/devel is on 27-rc8, which is
> obviously a little stale.  Is this the proper tree to send patches
> against, or would you prefer a different HEAD?

tip/master would be the base to use, or tip/sched/core. The sched/devel 
branch is indeed stale.

	Ingo

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH v2 0/4] sched: track next-highest priority
  2008-12-03 20:40   ` [RFC PATCH 0/3] Series short description Ingo Molnar
@ 2008-12-03 22:09     ` Gregory Haskins
  2008-12-03 22:09       ` [PATCH v2 1/4] sched: cleanup inc/dec_rt_tasks Gregory Haskins
                         ` (4 more replies)
  0 siblings, 5 replies; 25+ messages in thread
From: Gregory Haskins @ 2008-12-03 22:09 UTC (permalink / raw)
  To: mingo; +Cc: peterz, rostedt, ghaskins, linux-kernel, linux-rt-users

Hi Ingo,
  This version of the patches is logically equivelent to v1 with the following
  exceptions.

1) Rebased to tip/master df9fc0207
2) split v1-1/3 into v2-1/4 (cleanup) and v2-2/4 (changes)


I talked to Peter today about reviewing/acking these.  I told him to wait for
v2, so here it is.  Hopefully he will have a chance to look at these soon.


Thanks guys,
-Greg

---

Gregory Haskins (4):
      sched: use highest_prio.next to optimize pull operations
      sched: use highest_prio.curr for pull threshold
      sched: track the next-highest priority on each runqueue
      sched: cleanup inc/dec_rt_tasks


 kernel/sched.c    |    8 ++-
 kernel/sched_rt.c |  156 +++++++++++++++++++++++++++++++----------------------
 2 files changed, 98 insertions(+), 66 deletions(-)

-- 
Signature

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH v2 1/4] sched: cleanup inc/dec_rt_tasks
  2008-12-03 22:09     ` [PATCH v2 0/4] sched: track next-highest priority Gregory Haskins
@ 2008-12-03 22:09       ` Gregory Haskins
  2008-12-03 22:09       ` [PATCH v2 2/4] sched: track the next-highest priority on each runqueue Gregory Haskins
                         ` (3 subsequent siblings)
  4 siblings, 0 replies; 25+ messages in thread
From: Gregory Haskins @ 2008-12-03 22:09 UTC (permalink / raw)
  To: mingo; +Cc: peterz, rostedt, ghaskins, linux-kernel, linux-rt-users

Move some common definitions up to the function prologe to simplify the
body logic.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 kernel/sched_rt.c |   40 +++++++++++++++++-----------------------
 1 files changed, 17 insertions(+), 23 deletions(-)

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 1bbd990..fb1d4d7 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -550,30 +550,30 @@ static void update_curr_rt(struct rq *rq)
 static inline
 void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
-	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
-	rt_rq->rt_nr_running++;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-	if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
+	int prio = rt_se_prio(rt_se);
+#endif
 #ifdef CONFIG_SMP
-		struct rq *rq = rq_of_rt_rq(rt_rq);
+	struct rq *rq = rq_of_rt_rq(rt_rq);
 #endif
 
-		rt_rq->highest_prio = rt_se_prio(rt_se);
+	WARN_ON(!rt_prio(prio));
+	rt_rq->rt_nr_running++;
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+	if (prio < rt_rq->highest_prio) {
+
+		rt_rq->highest_prio = prio;
 #ifdef CONFIG_SMP
 		if (rq->online)
-			cpupri_set(&rq->rd->cpupri, rq->cpu,
-				   rt_se_prio(rt_se));
+			cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
 #endif
 	}
 #endif
 #ifdef CONFIG_SMP
-	if (rt_se->nr_cpus_allowed > 1) {
-		struct rq *rq = rq_of_rt_rq(rt_rq);
-
+	if (rt_se->nr_cpus_allowed > 1)
 		rq->rt.rt_nr_migratory++;
-	}
 
-	update_rt_migration(rq_of_rt_rq(rt_rq));
+	update_rt_migration(rq);
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
 	if (rt_se_boosted(rt_se))
@@ -590,6 +590,7 @@ static inline
 void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
 #ifdef CONFIG_SMP
+	struct rq *rq = rq_of_rt_rq(rt_rq);
 	int highest_prio = rt_rq->highest_prio;
 #endif
 
@@ -611,20 +612,13 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 		rt_rq->highest_prio = MAX_RT_PRIO;
 #endif
 #ifdef CONFIG_SMP
-	if (rt_se->nr_cpus_allowed > 1) {
-		struct rq *rq = rq_of_rt_rq(rt_rq);
+	if (rt_se->nr_cpus_allowed > 1)
 		rq->rt.rt_nr_migratory--;
-	}
 
-	if (rt_rq->highest_prio != highest_prio) {
-		struct rq *rq = rq_of_rt_rq(rt_rq);
-
-		if (rq->online)
-			cpupri_set(&rq->rd->cpupri, rq->cpu,
-				   rt_rq->highest_prio);
-	}
+	if (rq->online && rt_rq->highest_prio != highest_prio)
+		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio);
 
-	update_rt_migration(rq_of_rt_rq(rt_rq));
+	update_rt_migration(rq);
 #endif /* CONFIG_SMP */
 #ifdef CONFIG_RT_GROUP_SCHED
 	if (rt_se_boosted(rt_se))

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH v2 2/4] sched: track the next-highest priority on each runqueue
  2008-12-03 22:09     ` [PATCH v2 0/4] sched: track next-highest priority Gregory Haskins
  2008-12-03 22:09       ` [PATCH v2 1/4] sched: cleanup inc/dec_rt_tasks Gregory Haskins
@ 2008-12-03 22:09       ` Gregory Haskins
  2008-12-04  4:22         ` Gregory Haskins
  2008-12-03 22:09       ` [PATCH v2 3/4] sched: use highest_prio.curr for pull threshold Gregory Haskins
                         ` (2 subsequent siblings)
  4 siblings, 1 reply; 25+ messages in thread
From: Gregory Haskins @ 2008-12-03 22:09 UTC (permalink / raw)
  To: mingo; +Cc: peterz, rostedt, ghaskins, linux-kernel, linux-rt-users

We will use this later in the series to reduce the amount of rq-lock
contention during a pull operation

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 kernel/sched.c    |    8 ++++-
 kernel/sched_rt.c |   81 ++++++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 67 insertions(+), 22 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 6237b9b..24b11eb 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -463,7 +463,10 @@ struct rt_rq {
 	struct rt_prio_array active;
 	unsigned long rt_nr_running;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-	int highest_prio; /* highest queued rt task prio */
+	struct {
+		int curr; /* highest queued rt task prio */
+		int next; /* next highest */
+	} highest_prio;
 #endif
 #ifdef CONFIG_SMP
 	unsigned long rt_nr_migratory;
@@ -8073,7 +8076,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 	__set_bit(MAX_RT_PRIO, array->bitmap);
 
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-	rt_rq->highest_prio = MAX_RT_PRIO;
+	rt_rq->highest_prio.curr = MAX_RT_PRIO;
+	rt_rq->highest_prio.next = MAX_RT_PRIO;
 #endif
 #ifdef CONFIG_SMP
 	rt_rq->rt_nr_migratory = 0;
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index fb1d4d7..a4022b6 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -108,7 +108,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 	if (rt_rq->rt_nr_running) {
 		if (rt_se && !on_rt_rq(rt_se))
 			enqueue_rt_entity(rt_se);
-		if (rt_rq->highest_prio < curr->prio)
+		if (rt_rq->highest_prio.curr < curr->prio)
 			resched_task(curr);
 	}
 }
@@ -473,7 +473,7 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
 	struct rt_rq *rt_rq = group_rt_rq(rt_se);
 
 	if (rt_rq)
-		return rt_rq->highest_prio;
+		return rt_rq->highest_prio.curr;
 #endif
 
 	return rt_task_of(rt_se)->prio;
@@ -547,6 +547,21 @@ static void update_curr_rt(struct rq *rq)
 	}
 }
 
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+
+static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
+
+static inline int next_prio(struct rq *rq)
+{
+	struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
+
+	if (next && rt_prio(next->prio))
+		return next->prio;
+	else
+		return MAX_RT_PRIO;
+}
+#endif
+
 static inline
 void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
@@ -560,14 +575,32 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 	WARN_ON(!rt_prio(prio));
 	rt_rq->rt_nr_running++;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-	if (prio < rt_rq->highest_prio) {
+	if (prio < rt_rq->highest_prio.curr) {
 
-		rt_rq->highest_prio = prio;
+		/*
+		 * If the new task is higher in priority than anything on the
+		 * run-queue, we have a new high that must be published to
+		 * the world.  We also know that the previous high becomes
+		 * our next-highest.
+		 */
+		rt_rq->highest_prio.next = rt_rq->highest_prio.curr;
+		rt_rq->highest_prio.curr = prio;
 #ifdef CONFIG_SMP
 		if (rq->online)
 			cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
 #endif
-	}
+	} else if (prio == rt_rq->highest_prio.curr)
+		/*
+		 * If the next task is equal in priority to the highest on
+		 * the run-queue, then we implicitly know that the next highest
+		 * task cannot be any lower than current
+		 */
+		rt_rq->highest_prio.next = prio;
+	else if (prio < rt_rq->highest_prio.next)
+		/*
+		 * Otherwise, we need to recompute next-highest
+		 */
+		rt_rq->highest_prio.next = next_prio(rq);
 #endif
 #ifdef CONFIG_SMP
 	if (rt_se->nr_cpus_allowed > 1)
@@ -591,7 +624,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
 #ifdef CONFIG_SMP
 	struct rq *rq = rq_of_rt_rq(rt_rq);
-	int highest_prio = rt_rq->highest_prio;
+	int highest_prio = rt_rq->highest_prio.curr;
 #endif
 
 	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
@@ -599,24 +632,32 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 	rt_rq->rt_nr_running--;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 	if (rt_rq->rt_nr_running) {
-		struct rt_prio_array *array;
+		int prio = rt_se_prio(rt_se);
+
+		WARN_ON(prio < rt_rq->highest_prio.curr);
 
-		WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio);
-		if (rt_se_prio(rt_se) == rt_rq->highest_prio) {
-			/* recalculate */
-			array = &rt_rq->active;
-			rt_rq->highest_prio =
+		/*
+		 * This may have been our highest or next-highest priority
+		 * task and therefore we may have some recomputation to do
+		 */
+		if (prio == rt_rq->highest_prio.curr) {
+			struct rt_prio_array *array = &rt_rq->active;
+
+			rt_rq->highest_prio.curr =
 				sched_find_first_bit(array->bitmap);
-		} /* otherwise leave rq->highest prio alone */
+		}
+
+		if (prio == rt_rq->highest_prio.next)
+			rt_rq->highest_prio.next = next_prio(rq);
 	} else
-		rt_rq->highest_prio = MAX_RT_PRIO;
+		rt_rq->highest_prio.curr = MAX_RT_PRIO;
 #endif
 #ifdef CONFIG_SMP
 	if (rt_se->nr_cpus_allowed > 1)
 		rq->rt.rt_nr_migratory--;
 
-	if (rq->online && rt_rq->highest_prio != highest_prio)
-		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio);
+	if (rq->online && rt_rq->highest_prio.curr != highest_prio)
+		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
 
 	update_rt_migration(rq);
 #endif /* CONFIG_SMP */
@@ -1066,7 +1107,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
 		}
 
 		/* If this rq is still suitable use it. */
-		if (lowest_rq->rt.highest_prio > task->prio)
+		if (lowest_rq->rt.highest_prio.curr > task->prio)
 			break;
 
 		/* try again */
@@ -1254,7 +1295,7 @@ static int pull_rt_task(struct rq *this_rq)
 static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
 {
 	/* Try to pull RT tasks here if we lower this rq's prio */
-	if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio)
+	if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio)
 		pull_rt_task(rq);
 }
 
@@ -1340,7 +1381,7 @@ static void rq_online_rt(struct rq *rq)
 
 	__enable_runtime(rq);
 
-	cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio);
+	cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
 }
 
 /* Assumes rq->lock is held */
@@ -1431,7 +1472,7 @@ static void prio_changed_rt(struct rq *rq, struct task_struct *p,
 		 * can release the rq lock and p could migrate.
 		 * Only reschedule if p is still on the same runqueue.
 		 */
-		if (p->prio > rq->rt.highest_prio && rq->curr == p)
+		if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
 			resched_task(p);
 #else
 		/* For UP simply resched on drop of prio */

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH v2 3/4] sched: use highest_prio.curr for pull threshold
  2008-12-03 22:09     ` [PATCH v2 0/4] sched: track next-highest priority Gregory Haskins
  2008-12-03 22:09       ` [PATCH v2 1/4] sched: cleanup inc/dec_rt_tasks Gregory Haskins
  2008-12-03 22:09       ` [PATCH v2 2/4] sched: track the next-highest priority on each runqueue Gregory Haskins
@ 2008-12-03 22:09       ` Gregory Haskins
  2008-12-03 22:09       ` [PATCH v2 4/4] sched: use highest_prio.next to optimize pull operations Gregory Haskins
  2008-12-04 15:43       ` [PATCH v3 0/4] sched: track next-highest priority Gregory Haskins
  4 siblings, 0 replies; 25+ messages in thread
From: Gregory Haskins @ 2008-12-03 22:09 UTC (permalink / raw)
  To: mingo; +Cc: peterz, rostedt, ghaskins, linux-kernel, linux-rt-users

highest_prio.curr is actually a more accurate way to keep track of
the pull_rt_task() threshold since it is always up to date, even
if the "next" task migrates during double_lock.  Therefore, stop
looking at the "next" task object and simply use the highest_prio.curr.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 kernel/sched_rt.c |   31 ++++++-------------------------
 1 files changed, 6 insertions(+), 25 deletions(-)

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index a4022b6..7431d19 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1209,14 +1209,12 @@ static void push_rt_tasks(struct rq *rq)
 static int pull_rt_task(struct rq *this_rq)
 {
 	int this_cpu = this_rq->cpu, ret = 0, cpu;
-	struct task_struct *p, *next;
+	struct task_struct *p;
 	struct rq *src_rq;
 
 	if (likely(!rt_overloaded(this_rq)))
 		return 0;
 
-	next = pick_next_task_rt(this_rq);
-
 	for_each_cpu(cpu, this_rq->rd->rto_mask) {
 		if (this_cpu == cpu)
 			continue;
@@ -1225,17 +1223,9 @@ static int pull_rt_task(struct rq *this_rq)
 		/*
 		 * We can potentially drop this_rq's lock in
 		 * double_lock_balance, and another CPU could
-		 * steal our next task - hence we must cause
-		 * the caller to recalculate the next task
-		 * in that case:
+		 * alter this_rq
 		 */
-		if (double_lock_balance(this_rq, src_rq)) {
-			struct task_struct *old_next = next;
-
-			next = pick_next_task_rt(this_rq);
-			if (next != old_next)
-				ret = 1;
-		}
+		double_lock_balance(this_rq, src_rq);
 
 		/*
 		 * Are there still pullable RT tasks?
@@ -1249,7 +1239,7 @@ static int pull_rt_task(struct rq *this_rq)
 		 * Do we have an RT task that preempts
 		 * the to-be-scheduled task?
 		 */
-		if (p && (!next || (p->prio < next->prio))) {
+		if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
 			WARN_ON(p == src_rq->curr);
 			WARN_ON(!p->se.on_rq);
 
@@ -1259,12 +1249,9 @@ static int pull_rt_task(struct rq *this_rq)
 			 * This is just that p is wakeing up and hasn't
 			 * had a chance to schedule. We only pull
 			 * p if it is lower in priority than the
-			 * current task on the run queue or
-			 * this_rq next task is lower in prio than
-			 * the current task on that rq.
+			 * current task on the run queue
 			 */
-			if (p->prio < src_rq->curr->prio ||
-			    (next && next->prio < src_rq->curr->prio))
+			if (p->prio < src_rq->curr->prio)
 				goto skip;
 
 			ret = 1;
@@ -1277,13 +1264,7 @@ static int pull_rt_task(struct rq *this_rq)
 			 * case there's an even higher prio task
 			 * in another runqueue. (low likelyhood
 			 * but possible)
-			 *
-			 * Update next so that we won't pick a task
-			 * on another cpu with a priority lower (or equal)
-			 * than the one we just picked.
 			 */
-			next = p;
-
 		}
  skip:
 		double_unlock_balance(this_rq, src_rq);

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH v2 4/4] sched: use highest_prio.next to optimize pull operations
  2008-12-03 22:09     ` [PATCH v2 0/4] sched: track next-highest priority Gregory Haskins
                         ` (2 preceding siblings ...)
  2008-12-03 22:09       ` [PATCH v2 3/4] sched: use highest_prio.curr for pull threshold Gregory Haskins
@ 2008-12-03 22:09       ` Gregory Haskins
  2008-12-04 15:43       ` [PATCH v3 0/4] sched: track next-highest priority Gregory Haskins
  4 siblings, 0 replies; 25+ messages in thread
From: Gregory Haskins @ 2008-12-03 22:09 UTC (permalink / raw)
  To: mingo; +Cc: peterz, rostedt, ghaskins, linux-kernel, linux-rt-users

We currently take the rq->lock for every cpu in an overload state during
pull_rt_tasks().  However, we now have enough information via the
highest_prio.[curr|next] fields to determine if there is any tasks of
interest to warrant the overhead of the rq->lock, before we actually take
it.  So we use this information to reduce lock contention during the
pull for the case where the source-rq doesnt have tasks that preempt
the current task.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 kernel/sched_rt.c |   12 ++++++++++++
 1 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 7431d19..6072f24 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1220,6 +1220,18 @@ static int pull_rt_task(struct rq *this_rq)
 			continue;
 
 		src_rq = cpu_rq(cpu);
+
+		/*
+		 * Don't bother taking the src_rq->lock if the next highest
+		 * task is known to be lower-priority than our current task.
+		 * This may look racy, but if this value is about to go
+		 * logically higher, the src_rq will push this task away.
+		 * And if its going logically lower, we do not care
+		 */
+		if (src_rq->rt.highest_prio.next >=
+		    this_rq->rt.highest_prio.curr)
+			continue;
+
 		/*
 		 * We can potentially drop this_rq's lock in
 		 * double_lock_balance, and another CPU could

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH v2 2/4] sched: track the next-highest priority on each runqueue
  2008-12-03 22:09       ` [PATCH v2 2/4] sched: track the next-highest priority on each runqueue Gregory Haskins
@ 2008-12-04  4:22         ` Gregory Haskins
  0 siblings, 0 replies; 25+ messages in thread
From: Gregory Haskins @ 2008-12-04  4:22 UTC (permalink / raw)
  To: mingo; +Cc: peterz, rostedt, linux-kernel, linux-rt-users

[-- Attachment #1: Type: text/plain, Size: 7657 bytes --]

Gregory Haskins wrote:
> We will use this later in the series to reduce the amount of rq-lock
> contention during a pull operation
>
> Signed-off-by: Gregory Haskins <ghaskins@novell.com>
> ---
>
>  kernel/sched.c    |    8 ++++-
>  kernel/sched_rt.c |   81 ++++++++++++++++++++++++++++++++++++++++-------------
>  2 files changed, 67 insertions(+), 22 deletions(-)
>
> diff --git a/kernel/sched.c b/kernel/sched.c
> index 6237b9b..24b11eb 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -463,7 +463,10 @@ struct rt_rq {
>  	struct rt_prio_array active;
>  	unsigned long rt_nr_running;
>  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
> -	int highest_prio; /* highest queued rt task prio */
> +	struct {
> +		int curr; /* highest queued rt task prio */
> +		int next; /* next highest */
> +	} highest_prio;
>  #endif
>  #ifdef CONFIG_SMP
>  	unsigned long rt_nr_migratory;
> @@ -8073,7 +8076,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
>  	__set_bit(MAX_RT_PRIO, array->bitmap);
>  
>  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
> -	rt_rq->highest_prio = MAX_RT_PRIO;
> +	rt_rq->highest_prio.curr = MAX_RT_PRIO;
> +	rt_rq->highest_prio.next = MAX_RT_PRIO;
>  #endif
>  #ifdef CONFIG_SMP
>  	rt_rq->rt_nr_migratory = 0;
> diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
> index fb1d4d7..a4022b6 100644
> --- a/kernel/sched_rt.c
> +++ b/kernel/sched_rt.c
> @@ -108,7 +108,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
>  	if (rt_rq->rt_nr_running) {
>  		if (rt_se && !on_rt_rq(rt_se))
>  			enqueue_rt_entity(rt_se);
> -		if (rt_rq->highest_prio < curr->prio)
> +		if (rt_rq->highest_prio.curr < curr->prio)
>  			resched_task(curr);
>  	}
>  }
> @@ -473,7 +473,7 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
>  	struct rt_rq *rt_rq = group_rt_rq(rt_se);
>  
>  	if (rt_rq)
> -		return rt_rq->highest_prio;
> +		return rt_rq->highest_prio.curr;
>  #endif
>  
>  	return rt_task_of(rt_se)->prio;
> @@ -547,6 +547,21 @@ static void update_curr_rt(struct rq *rq)
>  	}
>  }
>  
> +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
> +
> +static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
> +
> +static inline int next_prio(struct rq *rq)
> +{
> +	struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
> +
> +	if (next && rt_prio(next->prio))
> +		return next->prio;
> +	else
> +		return MAX_RT_PRIO;
> +}
> +#endif
> +
>  static inline
>  void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
>  {
> @@ -560,14 +575,32 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
>  	WARN_ON(!rt_prio(prio));
>  	rt_rq->rt_nr_running++;
>  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
> -	if (prio < rt_rq->highest_prio) {
> +	if (prio < rt_rq->highest_prio.curr) {
>  
> -		rt_rq->highest_prio = prio;
> +		/*
> +		 * If the new task is higher in priority than anything on the
> +		 * run-queue, we have a new high that must be published to
> +		 * the world.  We also know that the previous high becomes
> +		 * our next-highest.
> +		 */
> +		rt_rq->highest_prio.next = rt_rq->highest_prio.curr;
> +		rt_rq->highest_prio.curr = prio;
>  #ifdef CONFIG_SMP
>  		if (rq->online)
>  			cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
>  #endif
> -	}
> +	} else if (prio == rt_rq->highest_prio.curr)
> +		/*
> +		 * If the next task is equal in priority to the highest on
> +		 * the run-queue, then we implicitly know that the next highest
> +		 * task cannot be any lower than current
> +		 */
> +		rt_rq->highest_prio.next = prio;
> +	else if (prio < rt_rq->highest_prio.next)
> +		/*
> +		 * Otherwise, we need to recompute next-highest
> +		 */
> +		rt_rq->highest_prio.next = next_prio(rq);
>  #endif
>  #ifdef CONFIG_SMP
>  	if (rt_se->nr_cpus_allowed > 1)
> @@ -591,7 +624,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
>  {
>  #ifdef CONFIG_SMP
>  	struct rq *rq = rq_of_rt_rq(rt_rq);
> -	int highest_prio = rt_rq->highest_prio;
> +	int highest_prio = rt_rq->highest_prio.curr;
>  #endif
>  
>  	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
> @@ -599,24 +632,32 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
>  	rt_rq->rt_nr_running--;
>  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
>  	if (rt_rq->rt_nr_running) {
> -		struct rt_prio_array *array;
> +		int prio = rt_se_prio(rt_se);
> +
> +		WARN_ON(prio < rt_rq->highest_prio.curr);
>  
> -		WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio);
> -		if (rt_se_prio(rt_se) == rt_rq->highest_prio) {
> -			/* recalculate */
> -			array = &rt_rq->active;
> -			rt_rq->highest_prio =
> +		/*
> +		 * This may have been our highest or next-highest priority
> +		 * task and therefore we may have some recomputation to do
> +		 */
> +		if (prio == rt_rq->highest_prio.curr) {
> +			struct rt_prio_array *array = &rt_rq->active;
> +
> +			rt_rq->highest_prio.curr =
>  				sched_find_first_bit(array->bitmap);
> -		} /* otherwise leave rq->highest prio alone */
> +		}
> +
> +		if (prio == rt_rq->highest_prio.next)
>   

Crap.  Trying to fall asleep tonight, I realized this is a bug I think. 
Looks like I will need a v3

It should be "prio <= rt_rq->highest_prio.next" or we can miss updating
.next properly.

> +			rt_rq->highest_prio.next = next_prio(rq);
>  	} else
> -		rt_rq->highest_prio = MAX_RT_PRIO;
> +		rt_rq->highest_prio.curr = MAX_RT_PRIO;
>  #endif
>  #ifdef CONFIG_SMP
>  	if (rt_se->nr_cpus_allowed > 1)
>  		rq->rt.rt_nr_migratory--;
>  
> -	if (rq->online && rt_rq->highest_prio != highest_prio)
> -		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio);
> +	if (rq->online && rt_rq->highest_prio.curr != highest_prio)
> +		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
>  
>  	update_rt_migration(rq);
>  #endif /* CONFIG_SMP */
> @@ -1066,7 +1107,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
>  		}
>  
>  		/* If this rq is still suitable use it. */
> -		if (lowest_rq->rt.highest_prio > task->prio)
> +		if (lowest_rq->rt.highest_prio.curr > task->prio)
>  			break;
>  
>  		/* try again */
> @@ -1254,7 +1295,7 @@ static int pull_rt_task(struct rq *this_rq)
>  static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
>  {
>  	/* Try to pull RT tasks here if we lower this rq's prio */
> -	if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio)
> +	if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio)
>  		pull_rt_task(rq);
>  }
>  
> @@ -1340,7 +1381,7 @@ static void rq_online_rt(struct rq *rq)
>  
>  	__enable_runtime(rq);
>  
> -	cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio);
> +	cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
>  }
>  
>  /* Assumes rq->lock is held */
> @@ -1431,7 +1472,7 @@ static void prio_changed_rt(struct rq *rq, struct task_struct *p,
>  		 * can release the rq lock and p could migrate.
>  		 * Only reschedule if p is still on the same runqueue.
>  		 */
> -		if (p->prio > rq->rt.highest_prio && rq->curr == p)
> +		if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
>  			resched_task(p);
>  #else
>  		/* For UP simply resched on drop of prio */
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rt-users" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>   



[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 257 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH v3 0/4] sched: track next-highest priority
  2008-12-03 22:09     ` [PATCH v2 0/4] sched: track next-highest priority Gregory Haskins
                         ` (3 preceding siblings ...)
  2008-12-03 22:09       ` [PATCH v2 4/4] sched: use highest_prio.next to optimize pull operations Gregory Haskins
@ 2008-12-04 15:43       ` Gregory Haskins
  2008-12-04 15:43           ` Gregory Haskins
                           ` (3 more replies)
  4 siblings, 4 replies; 25+ messages in thread
From: Gregory Haskins @ 2008-12-04 15:43 UTC (permalink / raw)
  To: mingo; +Cc: peterz, rostedt, ghaskins, linux-kernel, linux-rt-users

Hi Ingo,
  Here is the updated series, per my note about the bug in v2 last night.

Changes since v2

Fix bug in 2/4 related to recomputing .next in dec_rt_task()

-Greg

---

Gregory Haskins (4):
      sched: use highest_prio.next to optimize pull operations
      sched: use highest_prio.curr for pull threshold
      sched: track the next-highest priority on each runqueue
      sched: cleanup inc/dec_rt_tasks


 kernel/sched.c    |    8 ++-
 kernel/sched_rt.c |  156 +++++++++++++++++++++++++++++++----------------------
 2 files changed, 98 insertions(+), 66 deletions(-)


^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH v3 1/4] sched: cleanup inc/dec_rt_tasks
  2008-12-04 15:43       ` [PATCH v3 0/4] sched: track next-highest priority Gregory Haskins
@ 2008-12-04 15:43           ` Gregory Haskins
  2008-12-04 15:43         ` [PATCH v3 2/4] sched: track the next-highest priority on each runqueue Gregory Haskins
                             ` (2 subsequent siblings)
  3 siblings, 0 replies; 25+ messages in thread
From: Gregory Haskins @ 2008-12-04 15:43 UTC (permalink / raw)
  To: mingo; +Cc: peterz, rostedt, ghaskins, linux-kernel, linux-rt-users

Move some common definitions up to the function prologe to simplify the
body logic.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 kernel/sched_rt.c |   40 +++++++++++++++++-----------------------
 1 files changed, 17 insertions(+), 23 deletions(-)

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 1bbd990..fb1d4d7 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -550,30 +550,30 @@ static void update_curr_rt(struct rq *rq)
 static inline
 void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
-	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
-	rt_rq->rt_nr_running++;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-	if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
+	int prio = rt_se_prio(rt_se);
+#endif
 #ifdef CONFIG_SMP
-		struct rq *rq = rq_of_rt_rq(rt_rq);
+	struct rq *rq = rq_of_rt_rq(rt_rq);
 #endif
 
-		rt_rq->highest_prio = rt_se_prio(rt_se);
+	WARN_ON(!rt_prio(prio));
+	rt_rq->rt_nr_running++;
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+	if (prio < rt_rq->highest_prio) {
+
+		rt_rq->highest_prio = prio;
 #ifdef CONFIG_SMP
 		if (rq->online)
-			cpupri_set(&rq->rd->cpupri, rq->cpu,
-				   rt_se_prio(rt_se));
+			cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
 #endif
 	}
 #endif
 #ifdef CONFIG_SMP
-	if (rt_se->nr_cpus_allowed > 1) {
-		struct rq *rq = rq_of_rt_rq(rt_rq);
-
+	if (rt_se->nr_cpus_allowed > 1)
 		rq->rt.rt_nr_migratory++;
-	}
 
-	update_rt_migration(rq_of_rt_rq(rt_rq));
+	update_rt_migration(rq);
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
 	if (rt_se_boosted(rt_se))
@@ -590,6 +590,7 @@ static inline
 void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
 #ifdef CONFIG_SMP
+	struct rq *rq = rq_of_rt_rq(rt_rq);
 	int highest_prio = rt_rq->highest_prio;
 #endif
 
@@ -611,20 +612,13 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 		rt_rq->highest_prio = MAX_RT_PRIO;
 #endif
 #ifdef CONFIG_SMP
-	if (rt_se->nr_cpus_allowed > 1) {
-		struct rq *rq = rq_of_rt_rq(rt_rq);
+	if (rt_se->nr_cpus_allowed > 1)
 		rq->rt.rt_nr_migratory--;
-	}
 
-	if (rt_rq->highest_prio != highest_prio) {
-		struct rq *rq = rq_of_rt_rq(rt_rq);

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH v3 1/4] sched: cleanup inc/dec_rt_tasks
@ 2008-12-04 15:43           ` Gregory Haskins
  0 siblings, 0 replies; 25+ messages in thread
From: Gregory Haskins @ 2008-12-04 15:43 UTC (permalink / raw)
  To: mingo; +Cc: peterz, rostedt, ghaskins, linux-kernel, linux-rt-users

Move some common definitions up to the function prologe to simplify the
body logic.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 kernel/sched_rt.c |   40 +++++++++++++++++-----------------------
 1 files changed, 17 insertions(+), 23 deletions(-)

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 1bbd990..fb1d4d7 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -550,30 +550,30 @@ static void update_curr_rt(struct rq *rq)
 static inline
 void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
-	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
-	rt_rq->rt_nr_running++;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-	if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
+	int prio = rt_se_prio(rt_se);
+#endif
 #ifdef CONFIG_SMP
-		struct rq *rq = rq_of_rt_rq(rt_rq);
+	struct rq *rq = rq_of_rt_rq(rt_rq);
 #endif
 
-		rt_rq->highest_prio = rt_se_prio(rt_se);
+	WARN_ON(!rt_prio(prio));
+	rt_rq->rt_nr_running++;
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+	if (prio < rt_rq->highest_prio) {
+
+		rt_rq->highest_prio = prio;
 #ifdef CONFIG_SMP
 		if (rq->online)
-			cpupri_set(&rq->rd->cpupri, rq->cpu,
-				   rt_se_prio(rt_se));
+			cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
 #endif
 	}
 #endif
 #ifdef CONFIG_SMP
-	if (rt_se->nr_cpus_allowed > 1) {
-		struct rq *rq = rq_of_rt_rq(rt_rq);
-
+	if (rt_se->nr_cpus_allowed > 1)
 		rq->rt.rt_nr_migratory++;
-	}
 
-	update_rt_migration(rq_of_rt_rq(rt_rq));
+	update_rt_migration(rq);
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
 	if (rt_se_boosted(rt_se))
@@ -590,6 +590,7 @@ static inline
 void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
 #ifdef CONFIG_SMP
+	struct rq *rq = rq_of_rt_rq(rt_rq);
 	int highest_prio = rt_rq->highest_prio;
 #endif
 
@@ -611,20 +612,13 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 		rt_rq->highest_prio = MAX_RT_PRIO;
 #endif
 #ifdef CONFIG_SMP
-	if (rt_se->nr_cpus_allowed > 1) {
-		struct rq *rq = rq_of_rt_rq(rt_rq);
+	if (rt_se->nr_cpus_allowed > 1)
 		rq->rt.rt_nr_migratory--;
-	}
 
-	if (rt_rq->highest_prio != highest_prio) {
-		struct rq *rq = rq_of_rt_rq(rt_rq);
-
-		if (rq->online)
-			cpupri_set(&rq->rd->cpupri, rq->cpu,
-				   rt_rq->highest_prio);
-	}
+	if (rq->online && rt_rq->highest_prio != highest_prio)
+		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio);
 
-	update_rt_migration(rq_of_rt_rq(rt_rq));
+	update_rt_migration(rq);
 #endif /* CONFIG_SMP */
 #ifdef CONFIG_RT_GROUP_SCHED
 	if (rt_se_boosted(rt_se))


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH v3 2/4] sched: track the next-highest priority on each runqueue
  2008-12-04 15:43       ` [PATCH v3 0/4] sched: track next-highest priority Gregory Haskins
  2008-12-04 15:43           ` Gregory Haskins
@ 2008-12-04 15:43         ` Gregory Haskins
  2008-12-04 15:43           ` Gregory Haskins
  2008-12-04 15:43         ` [PATCH v3 4/4] sched: use highest_prio.next to optimize pull operations Gregory Haskins
  3 siblings, 0 replies; 25+ messages in thread
From: Gregory Haskins @ 2008-12-04 15:43 UTC (permalink / raw)
  To: mingo; +Cc: peterz, rostedt, ghaskins, linux-kernel, linux-rt-users

We will use this later in the series to reduce the amount of rq-lock
contention during a pull operation

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 kernel/sched.c    |    8 ++++-
 kernel/sched_rt.c |   81 ++++++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 67 insertions(+), 22 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 6237b9b..24b11eb 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -463,7 +463,10 @@ struct rt_rq {
 	struct rt_prio_array active;
 	unsigned long rt_nr_running;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-	int highest_prio; /* highest queued rt task prio */
+	struct {
+		int curr; /* highest queued rt task prio */
+		int next; /* next highest */
+	} highest_prio;
 #endif
 #ifdef CONFIG_SMP
 	unsigned long rt_nr_migratory;
@@ -8073,7 +8076,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 	__set_bit(MAX_RT_PRIO, array->bitmap);
 
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-	rt_rq->highest_prio = MAX_RT_PRIO;
+	rt_rq->highest_prio.curr = MAX_RT_PRIO;
+	rt_rq->highest_prio.next = MAX_RT_PRIO;
 #endif
 #ifdef CONFIG_SMP
 	rt_rq->rt_nr_migratory = 0;
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index fb1d4d7..7dad033 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -108,7 +108,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 	if (rt_rq->rt_nr_running) {
 		if (rt_se && !on_rt_rq(rt_se))
 			enqueue_rt_entity(rt_se);
-		if (rt_rq->highest_prio < curr->prio)
+		if (rt_rq->highest_prio.curr < curr->prio)
 			resched_task(curr);
 	}
 }
@@ -473,7 +473,7 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
 	struct rt_rq *rt_rq = group_rt_rq(rt_se);
 
 	if (rt_rq)
-		return rt_rq->highest_prio;
+		return rt_rq->highest_prio.curr;
 #endif
 
 	return rt_task_of(rt_se)->prio;
@@ -547,6 +547,21 @@ static void update_curr_rt(struct rq *rq)
 	}
 }
 
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+
+static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
+
+static inline int next_prio(struct rq *rq)
+{
+	struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
+
+	if (next && rt_prio(next->prio))
+		return next->prio;
+	else
+		return MAX_RT_PRIO;
+}
+#endif
+
 static inline
 void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
@@ -560,14 +575,32 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 	WARN_ON(!rt_prio(prio));
 	rt_rq->rt_nr_running++;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-	if (prio < rt_rq->highest_prio) {
+	if (prio < rt_rq->highest_prio.curr) {
 
-		rt_rq->highest_prio = prio;
+		/*
+		 * If the new task is higher in priority than anything on the
+		 * run-queue, we have a new high that must be published to
+		 * the world.  We also know that the previous high becomes
+		 * our next-highest.
+		 */
+		rt_rq->highest_prio.next = rt_rq->highest_prio.curr;
+		rt_rq->highest_prio.curr = prio;
 #ifdef CONFIG_SMP
 		if (rq->online)
 			cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
 #endif
-	}
+	} else if (prio == rt_rq->highest_prio.curr)
+		/*
+		 * If the next task is equal in priority to the highest on
+		 * the run-queue, then we implicitly know that the next highest
+		 * task cannot be any lower than current
+		 */
+		rt_rq->highest_prio.next = prio;
+	else if (prio < rt_rq->highest_prio.next)
+		/*
+		 * Otherwise, we need to recompute next-highest
+		 */
+		rt_rq->highest_prio.next = next_prio(rq);
 #endif
 #ifdef CONFIG_SMP
 	if (rt_se->nr_cpus_allowed > 1)
@@ -591,7 +624,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
 #ifdef CONFIG_SMP
 	struct rq *rq = rq_of_rt_rq(rt_rq);
-	int highest_prio = rt_rq->highest_prio;
+	int highest_prio = rt_rq->highest_prio.curr;
 #endif
 
 	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
@@ -599,24 +632,32 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 	rt_rq->rt_nr_running--;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 	if (rt_rq->rt_nr_running) {
-		struct rt_prio_array *array;
+		int prio = rt_se_prio(rt_se);
+
+		WARN_ON(prio < rt_rq->highest_prio.curr);
 
-		WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio);
-		if (rt_se_prio(rt_se) == rt_rq->highest_prio) {
-			/* recalculate */
-			array = &rt_rq->active;
-			rt_rq->highest_prio =
+		/*
+		 * This may have been our highest or next-highest priority
+		 * task and therefore we may have some recomputation to do
+		 */
+		if (prio == rt_rq->highest_prio.curr) {
+			struct rt_prio_array *array = &rt_rq->active;
+
+			rt_rq->highest_prio.curr =
 				sched_find_first_bit(array->bitmap);
-		} /* otherwise leave rq->highest prio alone */
+		}
+
+		if (prio <= rt_rq->highest_prio.next)
+			rt_rq->highest_prio.next = next_prio(rq);
 	} else
-		rt_rq->highest_prio = MAX_RT_PRIO;
+		rt_rq->highest_prio.curr = MAX_RT_PRIO;
 #endif
 #ifdef CONFIG_SMP
 	if (rt_se->nr_cpus_allowed > 1)
 		rq->rt.rt_nr_migratory--;
 
-	if (rq->online && rt_rq->highest_prio != highest_prio)
-		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio);
+	if (rq->online && rt_rq->highest_prio.curr != highest_prio)
+		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
 
 	update_rt_migration(rq);
 #endif /* CONFIG_SMP */
@@ -1066,7 +1107,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
 		}
 
 		/* If this rq is still suitable use it. */
-		if (lowest_rq->rt.highest_prio > task->prio)
+		if (lowest_rq->rt.highest_prio.curr > task->prio)
 			break;
 
 		/* try again */
@@ -1254,7 +1295,7 @@ static int pull_rt_task(struct rq *this_rq)
 static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
 {
 	/* Try to pull RT tasks here if we lower this rq's prio */
-	if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio)
+	if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio)
 		pull_rt_task(rq);
 }
 
@@ -1340,7 +1381,7 @@ static void rq_online_rt(struct rq *rq)
 
 	__enable_runtime(rq);
 
-	cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio);
+	cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
 }
 
 /* Assumes rq->lock is held */
@@ -1431,7 +1472,7 @@ static void prio_changed_rt(struct rq *rq, struct task_struct *p,
 		 * can release the rq lock and p could migrate.
 		 * Only reschedule if p is still on the same runqueue.
 		 */
-		if (p->prio > rq->rt.highest_prio && rq->curr == p)
+		if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
 			resched_task(p);
 #else
 		/* For UP simply resched on drop of prio */


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH v3 3/4] sched: use highest_prio.curr for pull threshold
  2008-12-04 15:43       ` [PATCH v3 0/4] sched: track next-highest priority Gregory Haskins
@ 2008-12-04 15:43           ` Gregory Haskins
  2008-12-04 15:43         ` [PATCH v3 2/4] sched: track the next-highest priority on each runqueue Gregory Haskins
                             ` (2 subsequent siblings)
  3 siblings, 0 replies; 25+ messages in thread
From: Gregory Haskins @ 2008-12-04 15:43 UTC (permalink / raw)
  To: mingo; +Cc: peterz, rostedt, ghaskins, linux-kernel, linux-rt-users

highest_prio.curr is actually a more accurate way to keep track of
the pull_rt_task() threshold since it is always up to date, even
if the "next" task migrates during double_lock.  Therefore, stop
looking at the "next" task object and simply use the highest_prio.curr.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 kernel/sched_rt.c |   31 ++++++-------------------------
 1 files changed, 6 insertions(+), 25 deletions(-)

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 7dad033..ffdd00e 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1209,14 +1209,12 @@ static void push_rt_tasks(struct rq *rq)
 static int pull_rt_task(struct rq *this_rq)
 {
 	int this_cpu = this_rq->cpu, ret = 0, cpu;
-	struct task_struct *p, *next;
+	struct task_struct *p;
 	struct rq *src_rq;
 
 	if (likely(!rt_overloaded(this_rq)))
 		return 0;
 
-	next = pick_next_task_rt(this_rq);
-
 	for_each_cpu(cpu, this_rq->rd->rto_mask) {
 		if (this_cpu == cpu)
 			continue;
@@ -1225,17 +1223,9 @@ static int pull_rt_task(struct rq *this_rq)
 		/*
 		 * We can potentially drop this_rq's lock in
 		 * double_lock_balance, and another CPU could
-		 * steal our next task - hence we must cause
-		 * the caller to recalculate the next task
-		 * in that case:
+		 * alter this_rq
 		 */
-		if (double_lock_balance(this_rq, src_rq)) {
-			struct task_struct *old_next = next;
-
-			next = pick_next_task_rt(this_rq);
-			if (next != old_next)
-				ret = 1;
-		}
+		double_lock_balance(this_rq, src_rq);
 
 		/*
 		 * Are there still pullable RT tasks?
@@ -1249,7 +1239,7 @@ static int pull_rt_task(struct rq *this_rq)
 		 * Do we have an RT task that preempts
 		 * the to-be-scheduled task?
 		 */
-		if (p && (!next || (p->prio < next->prio))) {
+		if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
 			WARN_ON(p == src_rq->curr);
 			WARN_ON(!p->se.on_rq);
 
@@ -1259,12 +1249,9 @@ static int pull_rt_task(struct rq *this_rq)
 			 * This is just that p is wakeing up and hasn't
 			 * had a chance to schedule. We only pull
 			 * p if it is lower in priority than the
-			 * current task on the run queue or
-			 * this_rq next task is lower in prio than
-			 * the current task on that rq.
+			 * current task on the run queue
 			 */
-			if (p->prio < src_rq->curr->prio ||
-			    (next && next->prio < src_rq->curr->prio))
+			if (p->prio < src_rq->curr->prio)
 				goto skip;
 
 			ret = 1;
@@ -1277,13 +1264,7 @@ static int pull_rt_task(struct rq *this_rq)
 			 * case there's an even higher prio task
 			 * in another runqueue. (low likelyhood
 			 * but possible)
-			 *
-			 * Update next so that we won't pick a task
-			 * on another cpu with a priority lower (or equal)
-			 * than the one we just picked.
 			 */
-			next = p;

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH v3 3/4] sched: use highest_prio.curr for pull threshold
@ 2008-12-04 15:43           ` Gregory Haskins
  0 siblings, 0 replies; 25+ messages in thread
From: Gregory Haskins @ 2008-12-04 15:43 UTC (permalink / raw)
  To: mingo; +Cc: peterz, rostedt, ghaskins, linux-kernel, linux-rt-users

highest_prio.curr is actually a more accurate way to keep track of
the pull_rt_task() threshold since it is always up to date, even
if the "next" task migrates during double_lock.  Therefore, stop
looking at the "next" task object and simply use the highest_prio.curr.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 kernel/sched_rt.c |   31 ++++++-------------------------
 1 files changed, 6 insertions(+), 25 deletions(-)

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 7dad033..ffdd00e 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1209,14 +1209,12 @@ static void push_rt_tasks(struct rq *rq)
 static int pull_rt_task(struct rq *this_rq)
 {
 	int this_cpu = this_rq->cpu, ret = 0, cpu;
-	struct task_struct *p, *next;
+	struct task_struct *p;
 	struct rq *src_rq;
 
 	if (likely(!rt_overloaded(this_rq)))
 		return 0;
 
-	next = pick_next_task_rt(this_rq);
-
 	for_each_cpu(cpu, this_rq->rd->rto_mask) {
 		if (this_cpu == cpu)
 			continue;
@@ -1225,17 +1223,9 @@ static int pull_rt_task(struct rq *this_rq)
 		/*
 		 * We can potentially drop this_rq's lock in
 		 * double_lock_balance, and another CPU could
-		 * steal our next task - hence we must cause
-		 * the caller to recalculate the next task
-		 * in that case:
+		 * alter this_rq
 		 */
-		if (double_lock_balance(this_rq, src_rq)) {
-			struct task_struct *old_next = next;
-
-			next = pick_next_task_rt(this_rq);
-			if (next != old_next)
-				ret = 1;
-		}
+		double_lock_balance(this_rq, src_rq);
 
 		/*
 		 * Are there still pullable RT tasks?
@@ -1249,7 +1239,7 @@ static int pull_rt_task(struct rq *this_rq)
 		 * Do we have an RT task that preempts
 		 * the to-be-scheduled task?
 		 */
-		if (p && (!next || (p->prio < next->prio))) {
+		if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
 			WARN_ON(p == src_rq->curr);
 			WARN_ON(!p->se.on_rq);
 
@@ -1259,12 +1249,9 @@ static int pull_rt_task(struct rq *this_rq)
 			 * This is just that p is wakeing up and hasn't
 			 * had a chance to schedule. We only pull
 			 * p if it is lower in priority than the
-			 * current task on the run queue or
-			 * this_rq next task is lower in prio than
-			 * the current task on that rq.
+			 * current task on the run queue
 			 */
-			if (p->prio < src_rq->curr->prio ||
-			    (next && next->prio < src_rq->curr->prio))
+			if (p->prio < src_rq->curr->prio)
 				goto skip;
 
 			ret = 1;
@@ -1277,13 +1264,7 @@ static int pull_rt_task(struct rq *this_rq)
 			 * case there's an even higher prio task
 			 * in another runqueue. (low likelyhood
 			 * but possible)
-			 *
-			 * Update next so that we won't pick a task
-			 * on another cpu with a priority lower (or equal)
-			 * than the one we just picked.
 			 */
-			next = p;
-
 		}
  skip:
 		double_unlock_balance(this_rq, src_rq);


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH v3 4/4] sched: use highest_prio.next to optimize pull operations
  2008-12-04 15:43       ` [PATCH v3 0/4] sched: track next-highest priority Gregory Haskins
                           ` (2 preceding siblings ...)
  2008-12-04 15:43           ` Gregory Haskins
@ 2008-12-04 15:43         ` Gregory Haskins
  3 siblings, 0 replies; 25+ messages in thread
From: Gregory Haskins @ 2008-12-04 15:43 UTC (permalink / raw)
  To: mingo; +Cc: peterz, rostedt, ghaskins, linux-kernel, linux-rt-users

We currently take the rq->lock for every cpu in an overload state during
pull_rt_tasks().  However, we now have enough information via the
highest_prio.[curr|next] fields to determine if there is any tasks of
interest to warrant the overhead of the rq->lock, before we actually take
it.  So we use this information to reduce lock contention during the
pull for the case where the source-rq doesnt have tasks that preempt
the current task.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 kernel/sched_rt.c |   12 ++++++++++++
 1 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index ffdd00e..afcc67a 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1220,6 +1220,18 @@ static int pull_rt_task(struct rq *this_rq)
 			continue;
 
 		src_rq = cpu_rq(cpu);
+
+		/*
+		 * Don't bother taking the src_rq->lock if the next highest
+		 * task is known to be lower-priority than our current task.
+		 * This may look racy, but if this value is about to go
+		 * logically higher, the src_rq will push this task away.
+		 * And if its going logically lower, we do not care
+		 */
+		if (src_rq->rt.highest_prio.next >=
+		    this_rq->rt.highest_prio.curr)
+			continue;
+
 		/*
 		 * We can potentially drop this_rq's lock in
 		 * double_lock_balance, and another CPU could


^ permalink raw reply related	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2008-12-04 15:41 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-11-11 14:26 [RFC PATCH 0/3] Series short description Gregory Haskins
2008-11-11 14:26 ` [RFC PATCH 1/3] sched: track the next-highest priority on each runqueue Gregory Haskins
2008-11-11 14:26 ` [RFC PATCH 2/3] sched: use highest_prio.curr for pull threshold Gregory Haskins
2008-11-11 14:26 ` [RFC PATCH 3/3] sched: use highest_prio.next to optimize pull operations Gregory Haskins
2008-11-11 17:56 ` [RFC PATCH 0/3] Series short description Ingo Molnar
2008-11-11 18:51   ` Gregory Haskins
2008-11-11 19:31 ` Chris Friesen
2008-11-11 19:50   ` Gregory Haskins
2008-12-03 20:01 ` Gregory Haskins
2008-12-03 20:30   ` Randy Dunlap
2008-12-03 20:39     ` [RFC PATCH 0/3] sched: track next-highest priority (was "Series short discription") Gregory Haskins
2008-12-03 20:40   ` [RFC PATCH 0/3] Series short description Ingo Molnar
2008-12-03 22:09     ` [PATCH v2 0/4] sched: track next-highest priority Gregory Haskins
2008-12-03 22:09       ` [PATCH v2 1/4] sched: cleanup inc/dec_rt_tasks Gregory Haskins
2008-12-03 22:09       ` [PATCH v2 2/4] sched: track the next-highest priority on each runqueue Gregory Haskins
2008-12-04  4:22         ` Gregory Haskins
2008-12-03 22:09       ` [PATCH v2 3/4] sched: use highest_prio.curr for pull threshold Gregory Haskins
2008-12-03 22:09       ` [PATCH v2 4/4] sched: use highest_prio.next to optimize pull operations Gregory Haskins
2008-12-04 15:43       ` [PATCH v3 0/4] sched: track next-highest priority Gregory Haskins
2008-12-04 15:43         ` [PATCH v3 1/4] sched: cleanup inc/dec_rt_tasks Gregory Haskins
2008-12-04 15:43           ` Gregory Haskins
2008-12-04 15:43         ` [PATCH v3 2/4] sched: track the next-highest priority on each runqueue Gregory Haskins
2008-12-04 15:43         ` [PATCH v3 3/4] sched: use highest_prio.curr for pull threshold Gregory Haskins
2008-12-04 15:43           ` Gregory Haskins
2008-12-04 15:43         ` [PATCH v3 4/4] sched: use highest_prio.next to optimize pull operations Gregory Haskins

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.