From: Peter Williams <pwil3058@bigpond.net.au>
To: Andy Whitcroft <apw@shadowen.org>
Cc: Con Kolivas <kernel@kolivas.org>,
Martin Bligh <mbligh@google.com>, Andrew Morton <akpm@osdl.org>,
linux-kernel@vger.kernel.org, Ingo Molnar <mingo@elte.hu>
Subject: Re: -mm seems significantly slower than mainline on kernbench
Date: Sat, 14 Jan 2006 09:59:28 +1100
Message-ID: <43C830D0.5060707@bigpond.net.au>
In-Reply-To: <43C7E96D.7000003@shadowen.org>
[-- Attachment #1: Type: text/plain, Size: 963 bytes --]
Andy Whitcroft wrote:
> Andy Whitcroft wrote:
>
>>Peter Williams wrote:
>>
>>
>>
>>>Attached is a new patch to fix the excessive idle problem. This patch
>>>takes a new approach to the problem as it was becoming obvious that
>>>trying to alter the load balancing code to cope with biased load was
>>>harder than it seemed.
>>
>>
>>Ok. Tried testing different-approach-to-smp-nice-problem against the
>>transition release 2.6.14-rc2-mm1 but it doesn't apply. Am testing
>>against 2.6.15-mm3 right now. Will let you know.
>
>
> Doesn't appear to help if I am analysing the graphs right. Martin?
>
> Also tried to back out the original patch against the 2.6.15-mm3 tree
> but that also won't apply (2 rejects). Peter?
Attached is a patch to back the original patch out of 2.6.15-mm3.
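(For anyone skimming the attachment: the change being backed out gave each
task a "bias_prio" weight derived from its nice or RT priority and summed
those weights as the runqueue load. The weighting, copied from the macros
removed in the diff below:

    #define NICE_TO_BIAS_PRIO(nice) (20 - (nice))  /* nice 19 -> 1, nice -20 -> 40 */
    #define RTPRIO_TO_BIAS_PRIO(rp) (40 + (rp))

so a nice 0 task counts as 20. The revert restores the older smpnice scheme
where the queue-wide prio_bias merely scales a plain task-count load.)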
Peter
--
Peter Williams pwil3058@bigpond.net.au
"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce
[-- Attachment #2: remove-smp-nice-imp-patch --]
[-- Type: text/plain, Size: 11133 bytes --]
Index: MM-2.6.X/kernel/sched.c
===================================================================
--- MM-2.6.X.orig/kernel/sched.c 2006-01-13 18:09:49.000000000 +1100
+++ MM-2.6.X/kernel/sched.c 2006-01-13 18:21:38.000000000 +1100
@@ -63,16 +63,6 @@
#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
-#ifdef CONFIG_SMP
-/*
- * Priority bias for load balancing ranges from 1 (nice==19) to 139 (RT
- * priority of 100).
- */
-#define NICE_TO_BIAS_PRIO(nice) (20 - (nice))
-#define PRIO_TO_BIAS_PRIO(prio) NICE_TO_BIAS_PRIO(PRIO_TO_NICE(prio))
-#define RTPRIO_TO_BIAS_PRIO(rp) (40 + (rp))
-#endif
-
/*
* 'User priority' is the nice value converted to something we
* can work with better when scaling various scheduler parameters,
@@ -695,53 +685,46 @@ static int effective_prio(task_t *p)
}
#ifdef CONFIG_SMP
-static inline void set_bias_prio(task_t *p)
-{
- if (rt_task(p)) {
- if (p == task_rq(p)->migration_thread)
- /*
- * The migration thread does the actual balancing. Do
- * not bias by its priority as the ultra high priority
- * will skew balancing adversely.
- */
- p->bias_prio = 0;
- else
- p->bias_prio = RTPRIO_TO_BIAS_PRIO(p->rt_priority);
- } else
- p->bias_prio = PRIO_TO_BIAS_PRIO(p->static_prio);
-}
-
-static inline void inc_prio_bias(runqueue_t *rq, const task_t *p)
+static inline void inc_prio_bias(runqueue_t *rq, int prio)
{
- rq->prio_bias += p->bias_prio;
+ rq->prio_bias += MAX_PRIO - prio;
}
-static inline void dec_prio_bias(runqueue_t *rq, const task_t *p)
+static inline void dec_prio_bias(runqueue_t *rq, int prio)
{
- rq->prio_bias -= p->bias_prio;
+ rq->prio_bias -= MAX_PRIO - prio;
}
static inline void inc_nr_running(task_t *p, runqueue_t *rq)
{
rq->nr_running++;
- inc_prio_bias(rq, p);
+ if (rt_task(p)) {
+ if (p != rq->migration_thread)
+ /*
+ * The migration thread does the actual balancing. Do
+ * not bias by its priority as the ultra high priority
+ * will skew balancing adversely.
+ */
+ inc_prio_bias(rq, p->prio);
+ } else
+ inc_prio_bias(rq, p->static_prio);
}
static inline void dec_nr_running(task_t *p, runqueue_t *rq)
{
rq->nr_running--;
- dec_prio_bias(rq, p);
+ if (rt_task(p)) {
+ if (p != rq->migration_thread)
+ dec_prio_bias(rq, p->prio);
+ } else
+ dec_prio_bias(rq, p->static_prio);
}
#else
-static inline void set_bias_prio(task_t *p)
-{
-}
-
-static inline void inc_prio_bias(runqueue_t *rq, const task_t *p)
+static inline void inc_prio_bias(runqueue_t *rq, int prio)
{
}
-static inline void dec_prio_bias(runqueue_t *rq, const task_t *p)
+static inline void dec_prio_bias(runqueue_t *rq, int prio)
{
}
@@ -1039,29 +1022,61 @@ void kick_process(task_t *p)
* We want to under-estimate the load of migration sources, to
* balance conservatively.
*/
-static unsigned long source_load(int cpu, int type)
+static inline unsigned long __source_load(int cpu, int type, enum idle_type idle)
{
runqueue_t *rq = cpu_rq(cpu);
- unsigned long load_now = rq->prio_bias * SCHED_LOAD_SCALE;
+ unsigned long running = rq->nr_running;
+ unsigned long source_load, cpu_load = rq->cpu_load[type-1],
+ load_now = running * SCHED_LOAD_SCALE;
if (type == 0)
- return load_now;
+ source_load = load_now;
+ else
+ source_load = min(cpu_load, load_now);
+
+ if (running > 1 || (idle == NOT_IDLE && running))
+ /*
+ * If we are busy rebalancing the load is biased by
+ * priority to create 'nice' support across cpus. When
+ * idle rebalancing we should only bias the source_load if
+ * there is more than one task running on that queue to
+ * prevent idle rebalance from trying to pull tasks from a
+ * queue with only one running task.
+ */
+ source_load = source_load * rq->prio_bias / running;
+
+ return source_load;
+}
- return min(rq->cpu_load[type-1], load_now);
+static inline unsigned long source_load(int cpu, int type)
+{
+ return __source_load(cpu, type, NOT_IDLE);
}
/*
* Return a high guess at the load of a migration-target cpu
*/
-static inline unsigned long target_load(int cpu, int type)
+static inline unsigned long __target_load(int cpu, int type, enum idle_type idle)
{
runqueue_t *rq = cpu_rq(cpu);
- unsigned long load_now = rq->prio_bias * SCHED_LOAD_SCALE;
+ unsigned long running = rq->nr_running;
+ unsigned long target_load, cpu_load = rq->cpu_load[type-1],
+ load_now = running * SCHED_LOAD_SCALE;
if (type == 0)
- return load_now;
+ target_load = load_now;
+ else
+ target_load = max(cpu_load, load_now);
+
+ if (running > 1 || (idle == NOT_IDLE && running))
+ target_load = target_load * rq->prio_bias / running;
+
+ return target_load;
+}
- return max(rq->cpu_load[type-1], load_now);
+static inline unsigned long target_load(int cpu, int type)
+{
+ return __target_load(cpu, type, NOT_IDLE);
}
/*
@@ -1322,7 +1337,7 @@ static int try_to_wake_up(task_t *p, uns
* of the current CPU:
*/
if (sync)
- tl -= p->bias_prio * SCHED_LOAD_SCALE;
+ tl -= SCHED_LOAD_SCALE;
if ((tl <= load &&
tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) ||
@@ -1934,16 +1949,15 @@ int can_migrate_task(task_t *p, runqueue
* Called with both runqueues locked.
*/
static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
- unsigned long max_nr_move, long max_bias_move,
- struct sched_domain *sd, enum idle_type idle,
- int *all_pinned)
+ unsigned long max_nr_move, struct sched_domain *sd,
+ enum idle_type idle, int *all_pinned)
{
prio_array_t *array, *dst_array;
struct list_head *head, *curr;
int idx, pulled = 0, pinned = 0;
task_t *tmp;
- if (max_nr_move == 0 || max_bias_move == 0)
+ if (max_nr_move == 0)
goto out;
pinned = 1;
@@ -1986,8 +2000,7 @@ skip_queue:
curr = curr->prev;
- if (tmp->bias_prio > max_bias_move ||
- !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
+ if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
if (curr != head)
goto skip_queue;
idx++;
@@ -2001,13 +2014,9 @@ skip_queue:
pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
pulled++;
- max_bias_move -= tmp->bias_prio;
- /*
- * We only want to steal up to the prescribed number of tasks
- * and the prescribed amount of biased load.
- */
- if (pulled < max_nr_move && max_bias_move > 0) {
+ /* We only want to steal up to the prescribed number of tasks. */
+ if (pulled < max_nr_move) {
if (curr != head)
goto skip_queue;
idx++;
@@ -2028,7 +2037,7 @@ out:
/*
* find_busiest_group finds and returns the busiest CPU group within the
- * domain. It calculates and returns the amount of biased load which should be
+ * domain. It calculates and returns the number of tasks which should be
* moved to restore balance via the imbalance parameter.
*/
static struct sched_group *
@@ -2064,9 +2073,9 @@ find_busiest_group(struct sched_domain *
/* Bias balancing toward cpus of our domain */
if (local_group)
- load = target_load(i, load_idx);
+ load = __target_load(i, load_idx, idle);
else
- load = source_load(i, load_idx);
+ load = __source_load(i, load_idx, idle);
avg_load += load;
}
@@ -2121,7 +2130,7 @@ find_busiest_group(struct sched_domain *
unsigned long tmp;
if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
- *imbalance = NICE_TO_BIAS_PRIO(0);
+ *imbalance = 1;
return busiest;
}
@@ -2154,7 +2163,7 @@ find_busiest_group(struct sched_domain *
if (pwr_move <= pwr_now)
goto out_balanced;
- *imbalance = NICE_TO_BIAS_PRIO(0);
+ *imbalance = 1;
return busiest;
}
@@ -2171,14 +2180,15 @@ out_balanced:
/*
* find_busiest_queue - find the busiest runqueue among the cpus in group.
*/
-static runqueue_t *find_busiest_queue(struct sched_group *group)
+static runqueue_t *find_busiest_queue(struct sched_group *group,
+ enum idle_type idle)
{
unsigned long load, max_load = 0;
runqueue_t *busiest = NULL;
int i;
for_each_cpu_mask(i, group->cpumask) {
- load = source_load(i, 0);
+ load = __source_load(i, 0, idle);
if (load > max_load) {
max_load = load;
@@ -2195,7 +2205,6 @@ static runqueue_t *find_busiest_queue(st
*/
#define MAX_PINNED_INTERVAL 512
-#define minus_1_or_zero(n) ((n) > 0 ? (n) - 1 : 0)
/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
@@ -2223,7 +2232,7 @@ static int load_balance(int this_cpu, ru
goto out_balanced;
}
- busiest = find_busiest_queue(group);
+ busiest = find_busiest_queue(group, idle);
if (!busiest) {
schedstat_inc(sd, lb_nobusyq[idle]);
goto out_balanced;
@@ -2243,7 +2252,6 @@ static int load_balance(int this_cpu, ru
*/
double_rq_lock(this_rq, busiest);
nr_moved = move_tasks(this_rq, this_cpu, busiest,
- minus_1_or_zero(busiest->nr_running),
imbalance, sd, idle, &all_pinned);
double_rq_unlock(this_rq, busiest);
@@ -2347,7 +2355,7 @@ static int load_balance_newidle(int this
goto out_balanced;
}
- busiest = find_busiest_queue(group);
+ busiest = find_busiest_queue(group, NEWLY_IDLE);
if (!busiest) {
schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
goto out_balanced;
@@ -2362,7 +2370,6 @@ static int load_balance_newidle(int this
/* Attempt to move tasks */
double_lock_balance(this_rq, busiest);
nr_moved = move_tasks(this_rq, this_cpu, busiest,
- minus_1_or_zero(busiest->nr_running),
imbalance, sd, NEWLY_IDLE, NULL);
spin_unlock(&busiest->lock);
}
@@ -2443,8 +2450,7 @@ static void active_load_balance(runqueue
schedstat_inc(sd, alb_cnt);
- if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
- RTPRIO_TO_BIAS_PRIO(100), sd, SCHED_IDLE, NULL))
+ if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL))
schedstat_inc(sd, alb_pushed);
else
schedstat_inc(sd, alb_failed);
@@ -2472,7 +2478,7 @@ static void rebalance_tick(int this_cpu,
struct sched_domain *sd;
int i;
- this_load = this_rq->prio_bias * SCHED_LOAD_SCALE;
+ this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
/* Update our load */
for (i = 0; i < 3; i++) {
unsigned long new_load = this_load;
@@ -3612,19 +3618,18 @@ void set_user_nice(task_t *p, long nice)
array = p->array;
if (array) {
dequeue_task(p, array);
- dec_prio_bias(rq, p);
+ dec_prio_bias(rq, p->static_prio);
}
old_prio = p->prio;
new_prio = NICE_TO_PRIO(nice);
delta = new_prio - old_prio;
p->static_prio = NICE_TO_PRIO(nice);
- set_bias_prio(p);
p->prio += delta;
if (array) {
enqueue_task(p, array);
- inc_prio_bias(rq, p);
+ inc_prio_bias(rq, p->static_prio);
/*
* If the task increased its priority or is running and
* lowered its priority, then reschedule its CPU:
@@ -3767,7 +3772,6 @@ static void __setscheduler(struct task_s
if (policy == SCHED_BATCH)
p->sleep_avg = 0;
}
- set_bias_prio(p);
}
/**
@@ -6309,7 +6313,6 @@ void __init sched_init(void)
}
}
- set_bias_prio(&init_task);
/*
* The boot idle thread does lazy MMU switching as well:
*/
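For anyone cross-checking the restored hunks: __source_load() once again
starts from a plain task-count load and applies the queue's priority bias
only when the queue is busy enough for a pull to make sense. A condensed,
self-contained sketch of that calculation (simplified from the hunk above,
not the kernel code itself; the parameters stand in for the runqueue
fields, and 128 is SCHED_LOAD_SCALE in trees of this vintage):

    /* Illustrative sketch of the restored source-load biasing. */
    static unsigned long sketch_source_load(unsigned long nr_running,
                                            unsigned long cpu_load,
                                            unsigned long prio_bias,
                                            int type, int busy_rebalance)
    {
        unsigned long load_now = nr_running * 128;  /* SCHED_LOAD_SCALE */
        unsigned long load;

        /* type 0 means the instantaneous load; otherwise take the
         * conservative minimum against the decayed cpu_load figure. */
        load = (type == 0) ? load_now
                           : (cpu_load < load_now ? cpu_load : load_now);

        /* Bias by priority only with more than one runner, or with any
         * runner during a busy (NOT_IDLE) rebalance, so that idle
         * rebalancing never tries to pull a lone running task. */
        if (nr_running > 1 || (busy_rebalance && nr_running))
            load = load * prio_bias / nr_running;

        return load;
    }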