[PATCH] sched: fix smpnice abmormal nice anomalies

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH] sched: fix smpnice abmormal nice anomalies
@ 2006-02-16  3:09 Peter Williams
  0 siblings, 0 replies; only message in thread
From: Peter Williams @ 2006-02-16  3:09 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linux Kernel Mailing List, Linus Torvalds, Ingo Molnar,
	Con Kolivas, npiggin, Steven Rostedt, Siddha, Suresh B

[-- Attachment #1: Type: text/plain, Size: 1621 bytes --]

Suresh B. Siddha has reported:

"On a lightly loaded system, this can result in HT scheduler 
optimizations being disabled in presence of low priority tasks... in 
this case, they(low priority ones) can end up running on the same 
package, even in the presence of other idle packages."

Analysis has shown that this is a manifestation of a more general 
problem which occurs when the average of the nice values assigned to 
runnable tasks is skewed in either direction.  The cause is that 
find_busiest_group() assumes that the average weighted load per task is 
SCHED_LOAD_SCALE which will not be true when the distribution of nice 
values is skewed in one direction or the other.  This in turn will cause 
the load balancing code to under balance when the skew is towards low 
priority tasks (i.e. positive nice) and over balance when it is skewed 
in the opposite direction.

The attached patch fixes this problem.  It replaces SCHED_LOAD_SCALE 
with the average load per task (calculated during the search for the 
busiest group) in those places where SCHED_LOAD_SCALE was being used to 
represent the load generated by a single task.

Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>

--

Andrew,
	This patch in conjunction with the "Fix smpnice high priority task 
hopping problem" patch posted earlier should address all of the 
outstanding smpnice issues.  Could you please add them to -mm so that 
they can get wider testing?

Thanks,
Peter
-- 
Peter Williams                                   pwil3058@bigpond.net.au

"Learning, n. The kind of ignorance distinguishing the studious."
  -- Ambrose Bierce

[-- Attachment #2: fix-smpnice-abnormal-nice-anomalies --]
[-- Type: text/plain, Size: 4608 bytes --]

Index: MM-2.6.X/kernel/sched.c
===================================================================
--- MM-2.6.X.orig/kernel/sched.c	2006-02-16 11:02:45.000000000 +1100
+++ MM-2.6.X/kernel/sched.c	2006-02-16 12:39:30.000000000 +1100
@@ -2025,9 +2025,11 @@ find_busiest_group(struct sched_domain *
 	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
 	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
 	unsigned long max_pull;
+	unsigned long avg_load_per_task, busiest_nr_running;
 	int load_idx;
 
 	max_load = this_load = total_load = total_pwr = 0;
+	busiest_nr_running = 0;
 	if (idle == NOT_IDLE)
 		load_idx = sd->busy_idx;
 	else if (idle == NEWLY_IDLE)
@@ -2039,11 +2041,12 @@ find_busiest_group(struct sched_domain *
 		unsigned long load;
 		int local_group;
 		int i;
+		unsigned long sum_nr_running;
 
 		local_group = cpu_isset(this_cpu, group->cpumask);
 
 		/* Tally up the load of all CPUs in the group */
-		avg_load = 0;
+		sum_nr_running = avg_load = 0;
 
 		for_each_cpu_mask(i, group->cpumask) {
 			if (*sd_idle && !idle_cpu(i))
@@ -2056,6 +2059,7 @@ find_busiest_group(struct sched_domain *
 				load = source_load(i, load_idx);
 
 			avg_load += load;
+			sum_nr_running += cpu_rq(i)->nr_running;
 		}
 
 		total_load += avg_load;
@@ -2070,11 +2074,15 @@ find_busiest_group(struct sched_domain *
 		} else if (avg_load > max_load) {
 			max_load = avg_load;
 			busiest = group;
+			busiest_nr_running = sum_nr_running;
 		}
 		group = group->next;
 	} while (group != sd->groups);
 
-	if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE)
+	/* Don't assume that busiest_nr_running > 0 */
+	avg_load_per_task = busiest_nr_running ? max_load / busiest_nr_running : max_load;
+
+	if (!busiest || this_load >= max_load || max_load <= avg_load_per_task)
 		goto out_balanced;
 
 	avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
@@ -2096,19 +2104,25 @@ find_busiest_group(struct sched_domain *
 	 */
 
 	/* Don't want to pull so many tasks that a group would go idle */
-	max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE);
+	max_pull = min(max_load - avg_load, max_load - avg_load_per_task);
 
 	/* How much load to actually move to equalise the imbalance */
 	*imbalance = min(max_pull * busiest->cpu_power,
 				(avg_load - this_load) * this->cpu_power)
 			/ SCHED_LOAD_SCALE;
 
-	if (*imbalance < SCHED_LOAD_SCALE) {
+	/*
+	 * if *imbalance is less than the average load per runnable task
+	 * there is no gaurantee that any tasks will be moved so we'll have
+	 * a think about bumping its value to force at least one task to be
+	 * moved
+	 */
+	if (*imbalance < avg_load_per_task) {
 		unsigned long pwr_now = 0, pwr_move = 0;
 		unsigned long tmp;
 
-		if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
-			*imbalance = NICE_TO_BIAS_PRIO(0);
+		if (max_load - this_load >= avg_load_per_task*2) {
+			*imbalance = biased_load(avg_load_per_task);
 			return busiest;
 		}
 
@@ -2118,31 +2132,32 @@ find_busiest_group(struct sched_domain *
 		 * moving them.
 		 */
 
-		pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
-		pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
+		pwr_now += busiest->cpu_power*min(avg_load_per_task, max_load);
+		pwr_now += this->cpu_power*min(avg_load_per_task, this_load);
 		pwr_now /= SCHED_LOAD_SCALE;
 
 		/* Amount of load we'd subtract */
-		tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
+		tmp = avg_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power;
 		if (max_load > tmp)
-			pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
+			pwr_move += busiest->cpu_power*min(avg_load_per_task,
 							max_load - tmp);
 
 		/* Amount of load we'd add */
 		if (max_load*busiest->cpu_power <
-				SCHED_LOAD_SCALE*SCHED_LOAD_SCALE)
+				avg_load_per_task*SCHED_LOAD_SCALE)
 			tmp = max_load*busiest->cpu_power/this->cpu_power;
 		else
-			tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
-		pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
+			tmp = avg_load_per_task*SCHED_LOAD_SCALE/this->cpu_power;
+		pwr_move += this->cpu_power*min(avg_load_per_task, this_load + tmp);
 		pwr_move /= SCHED_LOAD_SCALE;
 
-		/* Move if we gain throughput */
-		if (pwr_move <= pwr_now)
+		/* Move if we gain throughput
+		 * or if there's a reasonable chance that *imbalance is big enough to cause a move
+		 */
+		if (pwr_move > pwr_now)
+			*imbalance = avg_load_per_task;
+		 else if (*imbalance <= avg_load_per_task / 2)
 			goto out_balanced;
-
-		*imbalance = NICE_TO_BIAS_PRIO(0);
-		return busiest;
 	}
 
 	/*

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2006-02-16  3:09 UTC | newest]

Thread overview: (only message) (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-02-16  3:09 [PATCH] sched: fix smpnice abmormal nice anomalies Peter Williams

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.