* [PATCH] sched: Reduce overhead in balance_tasks()
From: Peter Williams @ 2007-08-15 4:42 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Nick Piggin, Siddha, Suresh B, Linux Kernel Mailing List
[-- Attachment #1: Type: text/plain, Size: 833 bytes --]
At the moment, balance_tasks() provides the low-level functionality for
both move_tasks() and move_one_task() (indirectly) via the
load_balance() function (in the sched_class interface), which also
provides dual functionality. This dual functionality complicates the
interfaces and internal mechanisms, and increases the run time overhead
of operations that are called with two run queue locks held.
This patch addresses this issue and reduces the overhead of these
operations.
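To illustrate the change (a sketch distilled from the diff below, not
part of the patch itself): with the current dual-purpose interface,
move_one_task() has to overload the load_balance() hook's arguments,
and balance_tasks() pays for that with an extra per-task test in its
inner loop:

	/* before: move at most one task via the dual-purpose hook */
	class->load_balance(this_rq, this_cpu, busiest,
			1, ULONG_MAX, sd, idle, NULL, &this_best_prio);

	/* after: a dedicated hook; balance_tasks() no longer needs the
	 * per-task "pulled < max_nr_move" check while both run queue
	 * locks are held */
	class->move_one_task(this_rq, this_cpu, busiest, sd, idle);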
This patch is not urgent and can be held back until the next merge
window without compromising the safety of the kernel.
Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Peter
--
Peter Williams pwil3058@bigpond.net.au
"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce
[-- Attachment #2: reduce-balance_tasks-overhead.patch --]
[-- Type: text/x-patch, Size: 11995 bytes --]
diff -r 90691a597f06 include/linux/sched.h
--- a/include/linux/sched.h Mon Aug 13 15:06:35 2007 +0000
+++ b/include/linux/sched.h Tue Aug 14 11:11:47 2007 +1000
@@ -865,10 +865,13 @@ struct sched_class {
void (*put_prev_task) (struct rq *rq, struct task_struct *p);
unsigned long (*load_balance) (struct rq *this_rq, int this_cpu,
- struct rq *busiest,
- unsigned long max_nr_move, unsigned long max_load_move,
+ struct rq *busiest, unsigned long max_load_move,
struct sched_domain *sd, enum cpu_idle_type idle,
int *all_pinned, int *this_best_prio);
+
+ int (*move_one_task) (struct rq *this_rq, int this_cpu,
+ struct rq *busiest, struct sched_domain *sd,
+ enum cpu_idle_type idle);
void (*set_curr_task) (struct rq *rq);
void (*task_tick) (struct rq *rq, struct task_struct *p);
diff -r 90691a597f06 kernel/sched.c
--- a/kernel/sched.c Mon Aug 13 15:06:35 2007 +0000
+++ b/kernel/sched.c Tue Aug 14 16:26:24 2007 +1000
@@ -753,11 +753,35 @@ struct rq_iterator {
struct task_struct *(*next)(void *);
};
-static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
- unsigned long max_nr_move, unsigned long max_load_move,
- struct sched_domain *sd, enum cpu_idle_type idle,
- int *all_pinned, unsigned long *load_moved,
- int *this_best_prio, struct rq_iterator *iterator);
+#ifdef CONFIG_SMP
+static unsigned long
+balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ unsigned long max_load_move, struct sched_domain *sd,
+ enum cpu_idle_type idle, int *all_pinned,
+ int *this_best_prio, struct rq_iterator *iterator);
+
+static int
+iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ struct sched_domain *sd, enum cpu_idle_type idle,
+ struct rq_iterator *iterator);
+#else
+static inline unsigned long
+balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ unsigned long max_load_move, struct sched_domain *sd,
+ enum cpu_idle_type idle, int *all_pinned,
+ int *this_best_prio, struct rq_iterator *iterator)
+{
+ return 0;
+}
+
+static inline int
+iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ struct sched_domain *sd, enum cpu_idle_type idle,
+ struct rq_iterator *iterator)
+{
+ return 0;
+}
+#endif
#include "sched_stats.h"
#include "sched_rt.c"
@@ -2166,17 +2190,17 @@ int can_migrate_task(struct task_struct
return 1;
}
-static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
- unsigned long max_nr_move, unsigned long max_load_move,
- struct sched_domain *sd, enum cpu_idle_type idle,
- int *all_pinned, unsigned long *load_moved,
- int *this_best_prio, struct rq_iterator *iterator)
+static unsigned long
+balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ unsigned long max_load_move, struct sched_domain *sd,
+ enum cpu_idle_type idle, int *all_pinned,
+ int *this_best_prio, struct rq_iterator *iterator)
{
int pulled = 0, pinned = 0, skip_for_load;
struct task_struct *p;
long rem_load_move = max_load_move;
- if (max_nr_move == 0 || max_load_move == 0)
+ if (max_load_move == 0)
goto out;
pinned = 1;
@@ -2209,7 +2233,7 @@ next:
* We only want to steal up to the prescribed number of tasks
* and the prescribed amount of weighted load.
*/
- if (pulled < max_nr_move && rem_load_move > 0) {
+ if (rem_load_move > 0) {
if (p->prio < *this_best_prio)
*this_best_prio = p->prio;
p = iterator->next(iterator->arg);
@@ -2217,7 +2241,7 @@ next:
}
out:
/*
- * Right now, this is the only place pull_task() is called,
+ * Right now, this is one of only two places pull_task() is called,
* so we can safely collect pull_task() stats here rather than
* inside pull_task().
*/
@@ -2225,8 +2249,8 @@ out:
if (all_pinned)
*all_pinned = pinned;
- *load_moved = max_load_move - rem_load_move;
- return pulled;
+
+ return max_load_move - rem_load_move;
}
/*
@@ -2248,7 +2272,7 @@ static int move_tasks(struct rq *this_rq
do {
total_load_moved +=
class->load_balance(this_rq, this_cpu, busiest,
- ULONG_MAX, max_load_move - total_load_moved,
+ max_load_move - total_load_moved,
sd, idle, all_pinned, &this_best_prio);
class = class->next;
} while (class && max_load_move > total_load_moved);
@@ -2256,6 +2280,32 @@ static int move_tasks(struct rq *this_rq
return total_load_moved > 0;
}
+static int
+iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ struct sched_domain *sd, enum cpu_idle_type idle,
+ struct rq_iterator *iterator)
+{
+ struct task_struct *p = iterator->start(iterator->arg);
+ int pinned = 0;
+
+ while (p) {
+ if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
+ pull_task(busiest, p, this_rq, this_cpu);
+ /*
+ * Right now, this is only the second place pull_task()
+ * is called, so we can safely collect pull_task()
+ * stats here rather than inside pull_task().
+ */
+ schedstat_inc(sd, lb_gained[idle]);
+
+ return 1;
+ }
+ p = iterator->next(iterator->arg);
+ }
+
+ return 0;
+}
+
/*
* move_one_task tries to move exactly one task from busiest to this_rq, as
* part of active balancing operations within "domain".
@@ -2267,12 +2317,9 @@ static int move_one_task(struct rq *this
struct sched_domain *sd, enum cpu_idle_type idle)
{
struct sched_class *class;
- int this_best_prio = MAX_PRIO;
for (class = sched_class_highest; class; class = class->next)
- if (class->load_balance(this_rq, this_cpu, busiest,
- 1, ULONG_MAX, sd, idle, NULL,
- &this_best_prio))
+ if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
return 1;
return 0;
@@ -3184,18 +3231,6 @@ static inline void trigger_load_balance(
*/
static inline void idle_balance(int cpu, struct rq *rq)
{
-}
-
-/* Avoid "used but not defined" warning on UP */
-static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
- unsigned long max_nr_move, unsigned long max_load_move,
- struct sched_domain *sd, enum cpu_idle_type idle,
- int *all_pinned, unsigned long *load_moved,
- int *this_best_prio, struct rq_iterator *iterator)
-{
- *load_moved = 0;
-
- return 0;
}
#endif
diff -r 90691a597f06 kernel/sched_fair.c
--- a/kernel/sched_fair.c Mon Aug 13 15:06:35 2007 +0000
+++ b/kernel/sched_fair.c Tue Aug 14 20:04:17 2007 +1000
@@ -944,24 +944,23 @@ static int cfs_rq_best_prio(struct cfs_r
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
- unsigned long max_nr_move, unsigned long max_load_move,
+ unsigned long max_load_move,
struct sched_domain *sd, enum cpu_idle_type idle,
int *all_pinned, int *this_best_prio)
{
- struct cfs_rq *busy_cfs_rq;
- unsigned long load_moved, total_nr_moved = 0, nr_moved;
long rem_load_move = max_load_move;
struct rq_iterator cfs_rq_iterator;
cfs_rq_iterator.start = load_balance_start_fair;
cfs_rq_iterator.next = load_balance_next_fair;
- for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
+ for_each_leaf_cfs_rq(busiest, cfs_rq_iterator.arg) {
#ifdef CONFIG_FAIR_GROUP_SCHED
- struct cfs_rq *this_cfs_rq;
+ struct cfs_rq *busy_cfs_rq, *this_cfs_rq;
long imbalance;
unsigned long maxload;
+ busy_cfs_rq = cfs_rq_iterator.arg;
this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
@@ -977,23 +976,33 @@ load_balance_fair(struct rq *this_rq, in
#else
# define maxload rem_load_move
#endif
- /* pass busy_cfs_rq argument into
- * load_balance_[start|next]_fair iterators
- */
- cfs_rq_iterator.arg = busy_cfs_rq;
- nr_moved = balance_tasks(this_rq, this_cpu, busiest,
- max_nr_move, maxload, sd, idle, all_pinned,
- &load_moved, this_best_prio, &cfs_rq_iterator);
-
- total_nr_moved += nr_moved;
- max_nr_move -= nr_moved;
- rem_load_move -= load_moved;
-
- if (max_nr_move <= 0 || rem_load_move <= 0)
+ rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
+ maxload, sd, idle, all_pinned,
+ this_best_prio,
+ &cfs_rq_iterator);
+
+ if (rem_load_move <= 0)
break;
}
return max_load_move - rem_load_move;
+}
+
+static int
+move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ struct sched_domain *sd, enum cpu_idle_type idle)
+{
+ struct rq_iterator cfs_rq_iterator;
+
+ cfs_rq_iterator.start = load_balance_start_fair;
+ cfs_rq_iterator.next = load_balance_next_fair;
+
+ for_each_leaf_cfs_rq(busiest, cfs_rq_iterator.arg)
+ if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle,
+ &cfs_rq_iterator))
+ return 1;
+
+ return 0;
}
/*
@@ -1082,6 +1091,7 @@ struct sched_class fair_sched_class __re
.put_prev_task = put_prev_task_fair,
.load_balance = load_balance_fair,
+ .move_one_task = move_one_task_fair,
.set_curr_task = set_curr_task_fair,
.task_tick = task_tick_fair,
diff -r 90691a597f06 kernel/sched_idletask.c
--- a/kernel/sched_idletask.c Mon Aug 13 15:06:35 2007 +0000
+++ b/kernel/sched_idletask.c Tue Aug 14 11:11:47 2007 +1000
@@ -39,9 +39,16 @@ static void put_prev_task_idle(struct rq
static unsigned long
load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
- unsigned long max_nr_move, unsigned long max_load_move,
- struct sched_domain *sd, enum cpu_idle_type idle,
- int *all_pinned, int *this_best_prio)
+ unsigned long max_load_move,
+ struct sched_domain *sd, enum cpu_idle_type idle,
+ int *all_pinned, int *this_best_prio)
+{
+ return 0;
+}
+
+static int
+move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ struct sched_domain *sd, enum cpu_idle_type idle)
{
return 0;
}
@@ -65,6 +72,7 @@ static struct sched_class idle_sched_cla
.put_prev_task = put_prev_task_idle,
.load_balance = load_balance_idle,
+ .move_one_task = move_one_task_idle,
.task_tick = task_tick_idle,
/* no .task_new for idle tasks */
diff -r 90691a597f06 kernel/sched_rt.c
--- a/kernel/sched_rt.c Mon Aug 13 15:06:35 2007 +0000
+++ b/kernel/sched_rt.c Tue Aug 14 11:11:47 2007 +1000
@@ -172,13 +172,11 @@ static struct task_struct *load_balance_
static unsigned long
load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
- unsigned long max_nr_move, unsigned long max_load_move,
- struct sched_domain *sd, enum cpu_idle_type idle,
- int *all_pinned, int *this_best_prio)
-{
- int nr_moved;
+ unsigned long max_load_move,
+ struct sched_domain *sd, enum cpu_idle_type idle,
+ int *all_pinned, int *this_best_prio)
+{
struct rq_iterator rt_rq_iterator;
- unsigned long load_moved;
rt_rq_iterator.start = load_balance_start_rt;
rt_rq_iterator.next = load_balance_next_rt;
@@ -187,11 +185,22 @@ load_balance_rt(struct rq *this_rq, int
*/
rt_rq_iterator.arg = busiest;
- nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move,
- max_load_move, sd, idle, all_pinned, &load_moved,
- this_best_prio, &rt_rq_iterator);
-
- return load_moved;
+ return balance_tasks(this_rq, this_cpu, busiest, max_load_move, sd,
+ idle, all_pinned, this_best_prio, &rt_rq_iterator);
+}
+
+static int
+move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ struct sched_domain *sd, enum cpu_idle_type idle)
+{
+ struct rq_iterator rt_rq_iterator;
+
+ rt_rq_iterator.start = load_balance_start_rt;
+ rt_rq_iterator.next = load_balance_next_rt;
+ rt_rq_iterator.arg = busiest;
+
+ return iter_move_one_task(this_rq, this_cpu, busiest, sd, idle,
+ &rt_rq_iterator);
}
static void task_tick_rt(struct rq *rq, struct task_struct *p)
@@ -224,6 +233,7 @@ static struct sched_class rt_sched_class
.put_prev_task = put_prev_task_rt,
.load_balance = load_balance_rt,
+ .move_one_task = move_one_task_rt,
.task_tick = task_tick_rt,
};
* Re: [PATCH] sched: Reduce overhead in balance_tasks()
From: Ingo Molnar @ 2007-08-24 6:04 UTC (permalink / raw)
To: Peter Williams; +Cc: Nick Piggin, Siddha, Suresh B, Linux Kernel Mailing List
* Peter Williams <pwil3058@bigpond.net.au> wrote:
> At the moment, balance_tasks() provides the low-level functionality
> for both move_tasks() and move_one_task() (indirectly) via the
> load_balance() function (in the sched_class interface), which also
> provides dual functionality. This dual functionality complicates the
> interfaces and internal mechanisms, and increases the run time
> overhead of operations that are called with two run queue locks held.
>
> This patch addresses this issue and reduces the overhead of these
> operations.
hm, i like it, and added it to my queue (probably .24 material though),
but note that it increases .text and .data overhead:
   text    data     bss     dec     hex filename
  41028   37794    2168   80990   13c5e sched.o.before
  41349   37826    2168   81343   13dbf sched.o.after
is that expected?
Ingo
* Re: [PATCH] sched: Reduce overhead in balance_tasks()
From: Peter Williams @ 2007-08-25 1:50 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Nick Piggin, Siddha, Suresh B, Linux Kernel Mailing List
Ingo Molnar wrote:
> * Peter Williams <pwil3058@bigpond.net.au> wrote:
>
>> At the moment, balance_tasks() provides the low-level functionality
>> for both move_tasks() and move_one_task() (indirectly) via the
>> load_balance() function (in the sched_class interface), which also
>> provides dual functionality. This dual functionality complicates the
>> interfaces and internal mechanisms, and increases the run time
>> overhead of operations that are called with two run queue locks held.
>>
>> This patch addresses this issue and reduces the overhead of these
>> operations.
>
> hm, i like it, and added it to my queue (probably .24 material though),
> but note that it increases .text and .data overhead:
>
>    text    data     bss     dec     hex filename
>   41028   37794    2168   80990   13c5e sched.o.before
>   41349   37826    2168   81343   13dbf sched.o.after
>
> is that expected?
Yes, sort of. It's a trade-off of space for time, and I expected an
increase (although I didn't think that it would be quite that much).
But it's still less than 1%, and since the time saved is time during
which two run queue locks are held, I figure it's a trade worth making.
Also, this separation lays the groundwork for a clean up of the active
load balancing code, which should gain some space back, including
making it possible to exclude active load balancing on systems that
don't need it (i.e. those that don't have multiple multi-core/
hyperthreading packages).
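(For the record, working your numbers: .text grows by 321 bytes, about
0.8%, and the overall dec figure by 353 bytes on 80990, about 0.44% --
comfortably under 1%.)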
I've got a patch available that reduces the .text and .data for non-SMP
systems by excluding the load balancing stuff (that has crept into
those systems), so that should help on embedded systems where memory is
tight.
Peter
--
Peter Williams pwil3058@bigpond.net.au
"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce