public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Peter Williams <pwil3058@bigpond.net.au>
To: Andy Whitcroft <apw@shadowen.org>
Cc: Con Kolivas <kernel@kolivas.org>,
	Martin Bligh <mbligh@google.com>, Andrew Morton <akpm@osdl.org>,
	linux-kernel@vger.kernel.org, Ingo Molnar <mingo@elte.hu>
Subject: Re: -mm seems significanty slower than mainline on kernbench
Date: Sat, 14 Jan 2006 09:59:28 +1100	[thread overview]
Message-ID: <43C830D0.5060707@bigpond.net.au> (raw)
In-Reply-To: <43C7E96D.7000003@shadowen.org>

[-- Attachment #1: Type: text/plain, Size: 963 bytes --]

Andy Whitcroft wrote:
> Andy Whitcroft wrote:
> 
>>Peter Williams wrote:
>>
>>
>>
>>>Attached is a new patch to fix the excessive idle problem.  This patch
>>>takes a new approach to the problem as it was becoming obvious that
>>>trying to alter the load balancing code to cope with biased load was
>>>harder than it seemed.
>>
>>
>>Ok.  Tried testing different-approach-to-smp-nice-problem against the
>>transition release 2.6.14-rc2-mm1 but it doesn't apply.  Am testing
>>against 2.6.15-mm3 right now.  Will let you know.
> 
> 
> Doesn't appear to help if I am analysing the graphs right.  Martin?
> 
> Also tried to back out the original patch against the 2.6.15-mm3 tree
> but that also won't apply (2 rejects).  Peter?

Attached is a patch to back the original patch out of 2.6.15-mm3.

Peter
-- 
Peter Williams                                   pwil3058@bigpond.net.au

"Learning, n. The kind of ignorance distinguishing the studious."
  -- Ambrose Bierce

[-- Attachment #2: remove-smp-nice-imp-patch --]
[-- Type: text/plain, Size: 11133 bytes --]

Index: MM-2.6.X/kernel/sched.c
===================================================================
--- MM-2.6.X.orig/kernel/sched.c	2006-01-13 18:09:49.000000000 +1100
+++ MM-2.6.X/kernel/sched.c	2006-01-13 18:21:38.000000000 +1100
@@ -63,16 +63,6 @@
 #define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
 #define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)
 
-#ifdef CONFIG_SMP
-/*
- * Priority bias for load balancing ranges from 1 (nice==19) to 139 (RT
- * priority of 100).
- */
-#define NICE_TO_BIAS_PRIO(nice)	(20 - (nice))
-#define PRIO_TO_BIAS_PRIO(prio)	NICE_TO_BIAS_PRIO(PRIO_TO_NICE(prio))
-#define RTPRIO_TO_BIAS_PRIO(rp)	(40 + (rp))
-#endif
-
 /*
  * 'User priority' is the nice value converted to something we
  * can work with better when scaling various scheduler parameters,
@@ -695,53 +685,46 @@ static int effective_prio(task_t *p)
 }
 
 #ifdef CONFIG_SMP
-static inline void set_bias_prio(task_t *p)
-{
-	if (rt_task(p)) {
-		if (p == task_rq(p)->migration_thread)
-			/*
-			 * The migration thread does the actual balancing. Do
-			 * not bias by its priority as the ultra high priority
-			 * will skew balancing adversely.
-			 */
-			p->bias_prio = 0;
-		else
-			p->bias_prio = RTPRIO_TO_BIAS_PRIO(p->rt_priority);
-	} else
-		p->bias_prio = PRIO_TO_BIAS_PRIO(p->static_prio);
-}
-
-static inline void inc_prio_bias(runqueue_t *rq, const task_t *p)
+static inline void inc_prio_bias(runqueue_t *rq, int prio)
 {
-	rq->prio_bias += p->bias_prio;
+	rq->prio_bias += MAX_PRIO - prio;
 }
 
-static inline void dec_prio_bias(runqueue_t *rq, const task_t *p)
+static inline void dec_prio_bias(runqueue_t *rq, int prio)
 {
-	rq->prio_bias -= p->bias_prio;
+	rq->prio_bias -= MAX_PRIO - prio;
 }
 
 static inline void inc_nr_running(task_t *p, runqueue_t *rq)
 {
 	rq->nr_running++;
-	inc_prio_bias(rq, p);
+	if (rt_task(p)) {
+		if (p != rq->migration_thread)
+			/*
+			 * The migration thread does the actual balancing. Do
+			 * not bias by its priority as the ultra high priority
+			 * will skew balancing adversely.
+			 */
+			inc_prio_bias(rq, p->prio);
+	} else
+		inc_prio_bias(rq, p->static_prio);
 }
 
 static inline void dec_nr_running(task_t *p, runqueue_t *rq)
 {
 	rq->nr_running--;
-	dec_prio_bias(rq, p);
+	if (rt_task(p)) {
+		if (p != rq->migration_thread)
+			dec_prio_bias(rq, p->prio);
+	} else
+		dec_prio_bias(rq, p->static_prio);
 }
 #else
-static inline void set_bias_prio(task_t *p)
-{
-}
-
-static inline void inc_prio_bias(runqueue_t *rq, const task_t *p)
+static inline void inc_prio_bias(runqueue_t *rq, int prio)
 {
 }
 
-static inline void dec_prio_bias(runqueue_t *rq, const task_t *p)
+static inline void dec_prio_bias(runqueue_t *rq, int prio)
 {
 }
 
@@ -1039,29 +1022,61 @@ void kick_process(task_t *p)
  * We want to under-estimate the load of migration sources, to
  * balance conservatively.
  */
-static unsigned long source_load(int cpu, int type)
+static inline unsigned long __source_load(int cpu, int type, enum idle_type idle)
 {
 	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long load_now = rq->prio_bias * SCHED_LOAD_SCALE;
+	unsigned long running = rq->nr_running;
+	unsigned long source_load, cpu_load = rq->cpu_load[type-1],
+		load_now = running * SCHED_LOAD_SCALE;
 
 	if (type == 0)
-		return load_now;
+		source_load = load_now;
+	else
+		source_load = min(cpu_load, load_now);
+
+	if (running > 1 || (idle == NOT_IDLE && running))
+		/*
+		 * If we are busy rebalancing the load is biased by
+		 * priority to create 'nice' support across cpus. When
+		 * idle rebalancing we should only bias the source_load if
+		 * there is more than one task running on that queue to
+		 * prevent idle rebalance from trying to pull tasks from a
+		 * queue with only one running task.
+		 */
+		source_load = source_load * rq->prio_bias / running;
+
+	return source_load;
+}
 
-	return min(rq->cpu_load[type-1], load_now);
+static inline unsigned long source_load(int cpu, int type)
+{
+	return __source_load(cpu, type, NOT_IDLE);
 }
 
 /*
  * Return a high guess at the load of a migration-target cpu
  */
-static inline unsigned long target_load(int cpu, int type)
+static inline unsigned long __target_load(int cpu, int type, enum idle_type idle)
 {
 	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long load_now = rq->prio_bias * SCHED_LOAD_SCALE;
+	unsigned long running = rq->nr_running;
+	unsigned long target_load, cpu_load = rq->cpu_load[type-1],
+		load_now = running * SCHED_LOAD_SCALE;
 
 	if (type == 0)
-		return load_now;
+		target_load = load_now;
+	else
+		target_load = max(cpu_load, load_now);
+
+	if (running > 1 || (idle == NOT_IDLE && running))
+		target_load = target_load * rq->prio_bias / running;
+
+	return target_load;
+}
 
-	return max(rq->cpu_load[type-1], load_now);
+static inline unsigned long target_load(int cpu, int type)
+{
+	return __target_load(cpu, type, NOT_IDLE);
 }
 
 /*
@@ -1322,7 +1337,7 @@ static int try_to_wake_up(task_t *p, uns
 			 * of the current CPU:
 			 */
 			if (sync)
-				tl -= p->bias_prio * SCHED_LOAD_SCALE;
+				tl -= SCHED_LOAD_SCALE;
 
 			if ((tl <= load &&
 				tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) ||
@@ -1934,16 +1949,15 @@ int can_migrate_task(task_t *p, runqueue
  * Called with both runqueues locked.
  */
 static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
-		      unsigned long max_nr_move, long max_bias_move,
-		      struct sched_domain *sd, enum idle_type idle,
-		      int *all_pinned)
+		      unsigned long max_nr_move, struct sched_domain *sd,
+		      enum idle_type idle, int *all_pinned)
 {
 	prio_array_t *array, *dst_array;
 	struct list_head *head, *curr;
 	int idx, pulled = 0, pinned = 0;
 	task_t *tmp;
 
-	if (max_nr_move == 0 || max_bias_move == 0)
+	if (max_nr_move == 0)
 		goto out;
 
 	pinned = 1;
@@ -1986,8 +2000,7 @@ skip_queue:
 
 	curr = curr->prev;
 
-	if (tmp->bias_prio > max_bias_move ||
-	    !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
+	if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
 		if (curr != head)
 			goto skip_queue;
 		idx++;
@@ -2001,13 +2014,9 @@ skip_queue:
 
 	pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
 	pulled++;
-	max_bias_move -= tmp->bias_prio;
 
-	/*
-	 * We only want to steal up to the prescribed number of tasks
-	 * and the prescribed amount of biased load.
-	 */
-	if (pulled < max_nr_move && max_bias_move > 0) {
+	/* We only want to steal up to the prescribed number of tasks. */
+	if (pulled < max_nr_move) {
 		if (curr != head)
 			goto skip_queue;
 		idx++;
@@ -2028,7 +2037,7 @@ out:
 
 /*
  * find_busiest_group finds and returns the busiest CPU group within the
- * domain. It calculates and returns the amount of biased load which should be
+ * domain. It calculates and returns the number of tasks which should be
  * moved to restore balance via the imbalance parameter.
  */
 static struct sched_group *
@@ -2064,9 +2073,9 @@ find_busiest_group(struct sched_domain *
 
 			/* Bias balancing toward cpus of our domain */
 			if (local_group)
-				load = target_load(i, load_idx);
+				load = __target_load(i, load_idx, idle);
 			else
-				load = source_load(i, load_idx);
+				load = __source_load(i, load_idx, idle);
 
 			avg_load += load;
 		}
@@ -2121,7 +2130,7 @@ find_busiest_group(struct sched_domain *
 		unsigned long tmp;
 
 		if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
-			*imbalance = NICE_TO_BIAS_PRIO(0);
+			*imbalance = 1;
 			return busiest;
 		}
 
@@ -2154,7 +2163,7 @@ find_busiest_group(struct sched_domain *
 		if (pwr_move <= pwr_now)
 			goto out_balanced;
 
-		*imbalance = NICE_TO_BIAS_PRIO(0);
+		*imbalance = 1;
 		return busiest;
 	}
 
@@ -2171,14 +2180,15 @@ out_balanced:
 /*
  * find_busiest_queue - find the busiest runqueue among the cpus in group.
  */
-static runqueue_t *find_busiest_queue(struct sched_group *group)
+static runqueue_t *find_busiest_queue(struct sched_group *group,
+	enum idle_type idle)
 {
 	unsigned long load, max_load = 0;
 	runqueue_t *busiest = NULL;
 	int i;
 
 	for_each_cpu_mask(i, group->cpumask) {
-		load = source_load(i, 0);
+		load = __source_load(i, 0, idle);
 
 		if (load > max_load) {
 			max_load = load;
@@ -2195,7 +2205,6 @@ static runqueue_t *find_busiest_queue(st
  */
 #define MAX_PINNED_INTERVAL	512
 
-#define minus_1_or_zero(n) ((n) > 0 ? (n) - 1 : 0)
 /*
  * Check this_cpu to ensure it is balanced within domain. Attempt to move
  * tasks if there is an imbalance.
@@ -2223,7 +2232,7 @@ static int load_balance(int this_cpu, ru
 		goto out_balanced;
 	}
 
-	busiest = find_busiest_queue(group);
+	busiest = find_busiest_queue(group, idle);
 	if (!busiest) {
 		schedstat_inc(sd, lb_nobusyq[idle]);
 		goto out_balanced;
@@ -2243,7 +2252,6 @@ static int load_balance(int this_cpu, ru
 		 */
 		double_rq_lock(this_rq, busiest);
 		nr_moved = move_tasks(this_rq, this_cpu, busiest,
-					minus_1_or_zero(busiest->nr_running),
 					imbalance, sd, idle, &all_pinned);
 		double_rq_unlock(this_rq, busiest);
 
@@ -2347,7 +2355,7 @@ static int load_balance_newidle(int this
 		goto out_balanced;
 	}
 
-	busiest = find_busiest_queue(group);
+	busiest = find_busiest_queue(group, NEWLY_IDLE);
 	if (!busiest) {
 		schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
 		goto out_balanced;
@@ -2362,7 +2370,6 @@ static int load_balance_newidle(int this
 		/* Attempt to move tasks */
 		double_lock_balance(this_rq, busiest);
 		nr_moved = move_tasks(this_rq, this_cpu, busiest,
-					minus_1_or_zero(busiest->nr_running),
 					imbalance, sd, NEWLY_IDLE, NULL);
 		spin_unlock(&busiest->lock);
 	}
@@ -2443,8 +2450,7 @@ static void active_load_balance(runqueue
 
 	schedstat_inc(sd, alb_cnt);
 
-	if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
-			RTPRIO_TO_BIAS_PRIO(100), sd, SCHED_IDLE, NULL))
+	if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL))
 		schedstat_inc(sd, alb_pushed);
 	else
 		schedstat_inc(sd, alb_failed);
@@ -2472,7 +2478,7 @@ static void rebalance_tick(int this_cpu,
 	struct sched_domain *sd;
 	int i;
 
-	this_load = this_rq->prio_bias * SCHED_LOAD_SCALE;
+	this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
 	/* Update our load */
 	for (i = 0; i < 3; i++) {
 		unsigned long new_load = this_load;
@@ -3612,19 +3618,18 @@ void set_user_nice(task_t *p, long nice)
 	array = p->array;
 	if (array) {
 		dequeue_task(p, array);
-		dec_prio_bias(rq, p);
+		dec_prio_bias(rq, p->static_prio);
 	}
 
 	old_prio = p->prio;
 	new_prio = NICE_TO_PRIO(nice);
 	delta = new_prio - old_prio;
 	p->static_prio = NICE_TO_PRIO(nice);
-	set_bias_prio(p);
 	p->prio += delta;
 
 	if (array) {
 		enqueue_task(p, array);
-		inc_prio_bias(rq, p);
+		inc_prio_bias(rq, p->static_prio);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -3767,7 +3772,6 @@ static void __setscheduler(struct task_s
 		if (policy == SCHED_BATCH)
 			p->sleep_avg = 0;
 	}
-	set_bias_prio(p);
 }
 
 /**
@@ -6309,7 +6313,6 @@ void __init sched_init(void)
 		}
 	}
 
-	set_bias_prio(&init_task);
 	/*
 	 * The boot idle thread does lazy MMU switching as well:
 	 */

  parent reply	other threads:[~2006-01-13 23:00 UTC|newest]

Thread overview: 59+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2006-01-11  1:14 -mm seems significanty slower than mainline on kernbench Martin Bligh
2006-01-11  1:31 ` Andrew Morton
2006-01-11  1:41   ` Martin Bligh
2006-01-11  1:48     ` Andrew Morton
2006-01-11  1:49     ` Con Kolivas
2006-01-11  2:38       ` Peter Williams
2006-01-11  3:07         ` Con Kolivas
2006-01-11  3:12           ` Martin Bligh
2006-01-11  3:40           ` Peter Williams
2006-01-11  3:49             ` Con Kolivas
2006-01-11  4:33               ` Peter Williams
2006-01-11  5:14             ` Peter Williams
2006-01-11  6:21               ` Martin J. Bligh
2006-01-11 12:24                 ` Peter Williams
2006-01-11 14:29                   ` Con Kolivas
2006-01-11 22:05                     ` Peter Williams
2006-01-12  0:54                       ` Peter Williams
2006-01-12  1:18                         ` Con Kolivas
2006-01-12  1:29                           ` Peter Williams
2006-01-12  1:36                             ` Con Kolivas
2006-01-12  2:23                               ` Peter Williams
2006-01-12  2:26                                 ` Martin Bligh
2006-01-12  6:39                                   ` Con Kolivas
2006-01-23 19:28                                     ` Martin Bligh
2006-01-24  1:25                                       ` Peter Williams
2006-01-24  3:50                                         ` Peter Williams
2006-01-24  4:41                                           ` Martin J. Bligh
2006-01-24  6:22                                             ` Peter Williams
2006-01-24  6:42                                               ` Martin J. Bligh
2006-01-28 23:20                                                 ` Peter Williams
2006-01-29  0:52                                                   ` Martin J. Bligh
2006-01-12  2:27                                 ` Con Kolivas
2006-01-12  2:04                           ` Martin Bligh
2006-01-12  6:35                             ` Martin J. Bligh
2006-01-12  6:41                               ` Con Kolivas
2006-01-12  6:54                                 ` Peter Williams
2006-01-12 18:39                         ` Martin Bligh
2006-01-12 20:03                           ` Peter Williams
2006-01-12 22:20                             ` Peter Williams
2006-01-13  7:06                               ` Peter Williams
2006-01-13 12:00                                 ` Peter Williams
2006-01-13 16:15                                 ` Martin J. Bligh
2006-01-13 16:26                                 ` Andy Whitcroft
2006-01-13 17:54                                   ` Andy Whitcroft
2006-01-13 20:41                                     ` Martin Bligh
2006-01-14  0:23                                       ` Peter Williams
2006-01-14  5:03                                         ` Nick Piggin
2006-01-14  5:40                                           ` Con Kolivas
2006-01-14  6:05                                             ` Nick Piggin
2006-01-14  5:53                                           ` Peter Williams
2006-01-14  6:13                                             ` Nick Piggin
2006-01-13 22:59                                     ` Peter Williams [this message]
2006-01-14 18:48                                 ` Martin J. Bligh
2006-01-15  0:05                                   ` Peter Williams
2006-01-15  2:04                                     ` Con Kolivas
2006-01-15  2:09                                     ` [PATCH] sched - remove unnecessary smpnice ifdefs Con Kolivas
2006-01-15  3:50                                     ` -mm seems significanty slower than mainline on kernbench Ingo Molnar
2006-01-12  1:25                       ` Peter Williams
2006-01-11  1:52     ` Andrew Morton

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=43C830D0.5060707@bigpond.net.au \
    --to=pwil3058@bigpond.net.au \
    --cc=akpm@osdl.org \
    --cc=apw@shadowen.org \
    --cc=kernel@kolivas.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mbligh@google.com \
    --cc=mingo@elte.hu \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox