From: Eric Dumazet <dada1@cosmosbay.com>
To: Ingo Molnar <mingo@elte.hu>, Andrew Morton <akpm@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org
Subject: [PATCH, take 2] Speedup divides by cpu_power in scheduler
Date: Thu, 22 Feb 2007 09:19:35 +0100 [thread overview]
Message-ID: <45DD5217.3000608@cosmosbay.com> (raw)
In-Reply-To: <45DD46E5.4060804@cosmosbay.com>
[-- Attachment #1: Type: text/plain, Size: 1484 bytes --]
I noticed expensive divides done in try_to_wakeup() and find_busiest_group()
on a bi dual core Opteron machine (total of 4 cores), moderatly loaded (15.000
context switch per second)
oprofile numbers :
CPU: AMD64 processors, speed 2600.05 MHz (estimated)
Counted CPU_CLK_UNHALTED events (Cycles outside of halt state) with a unit
mask of 0x00 (No unit mask) count 50000
samples % symbol name
...
613914 1.0498 try_to_wake_up
834 0.0013 :ffffffff80227ae1: div %rcx
77513 0.1191 :ffffffff80227ae4: mov %rax,%r11
608893 1.0413 find_busiest_group
1841 0.0031 :ffffffff802260bf: div %rdi
140109 0.2394 :ffffffff802260c2: test %sil,%sil
Some of these divides can use the reciprocal divides we introduced some time
ago (currently used in slab AFAIK)
We can assume a load will fit in a 32bits number, because with a
SCHED_LOAD_SCALE=128 value, its still a theorical limit of 33554432
When/if we reach this limit one day, probably cpus will have a fast hardware
divide and we can zap the reciprocal divide trick.
Ingo suggested to rename cpu_power to __cpu_power to make clear it should not
be modified without changing its reciprocal value too.
I did not convert the divide in cpu_avg_load_per_task(), because tracking
nr_running changes may be not worth it ? We could use a static table of 32
reciprocal values but it would add a conditional branch and table lookup.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
[-- Attachment #2: sched_use_reciprocal_divide.patch --]
[-- Type: text/plain, Size: 6320 bytes --]
--- linux-2.6.21-rc1/include/linux/sched.h 2007-02-21 21:08:32.000000000 +0100
+++ linux-2.6.21-rc1-ed/include/linux/sched.h 2007-02-22 10:12:00.000000000 +0100
@@ -668,8 +668,14 @@ struct sched_group {
/*
* CPU power of this group, SCHED_LOAD_SCALE being max power for a
* single CPU. This is read only (except for setup, hotplug CPU).
+ * Note : Never change cpu_power without recompute its reciprocal
*/
- unsigned long cpu_power;
+ unsigned int __cpu_power;
+ /*
+ * reciprocal value of cpu_power to avoid expensive divides
+ * (see include/linux/reciprocal_div.h)
+ */
+ u32 reciprocal_cpu_power;
};
struct sched_domain {
--- linux-2.6.21-rc1/kernel/sched.c 2007-02-21 21:10:54.000000000 +0100
+++ linux-2.6.21-rc1-ed/kernel/sched.c 2007-02-22 10:12:00.000000000 +0100
@@ -52,6 +52,7 @@
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
+#include <linux/reciprocal_div.h>
#include <asm/tlb.h>
#include <asm/unistd.h>
@@ -182,6 +183,25 @@ static unsigned int static_prio_timeslic
}
/*
+ * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
+ * Since cpu_power is a 'constant', we can use a reciprocal divide.
+ */
+static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
+{
+ return reciprocal_divide(load, sg->reciprocal_cpu_power);
+}
+/*
+ * Each time a sched group cpu_power is changed,
+ * we must compute its reciprocal value
+ */
+static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
+{
+ sg->__cpu_power += val;
+ sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
+}
+
+
+/*
* task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
* to time slice values: [800ms ... 100ms ... 5ms]
*
@@ -1241,7 +1261,8 @@ find_idlest_group(struct sched_domain *s
}
/* Adjust by relative CPU power of the group */
- avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+ avg_load = sg_div_cpu_power(group,
+ avg_load * SCHED_LOAD_SCALE);
if (local_group) {
this_load = avg_load;
@@ -2352,12 +2373,13 @@ find_busiest_group(struct sched_domain *
}
total_load += avg_load;
- total_pwr += group->cpu_power;
+ total_pwr += group->__cpu_power;
/* Adjust by relative CPU power of the group */
- avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+ avg_load = sg_div_cpu_power(group,
+ avg_load * SCHED_LOAD_SCALE);
- group_capacity = group->cpu_power / SCHED_LOAD_SCALE;
+ group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
if (local_group) {
this_load = avg_load;
@@ -2468,8 +2490,8 @@ group_next:
max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
/* How much load to actually move to equalise the imbalance */
- *imbalance = min(max_pull * busiest->cpu_power,
- (avg_load - this_load) * this->cpu_power)
+ *imbalance = min(max_pull * busiest->__cpu_power,
+ (avg_load - this_load) * this->__cpu_power)
/ SCHED_LOAD_SCALE;
/*
@@ -2503,27 +2525,28 @@ small_imbalance:
* moving them.
*/
- pwr_now += busiest->cpu_power *
+ pwr_now += busiest->__cpu_power *
min(busiest_load_per_task, max_load);
- pwr_now += this->cpu_power *
+ pwr_now += this->__cpu_power *
min(this_load_per_task, this_load);
pwr_now /= SCHED_LOAD_SCALE;
/* Amount of load we'd subtract */
- tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
- busiest->cpu_power;
+ tmp = sg_div_cpu_power(busiest,
+ busiest_load_per_task * SCHED_LOAD_SCALE);
if (max_load > tmp)
- pwr_move += busiest->cpu_power *
+ pwr_move += busiest->__cpu_power *
min(busiest_load_per_task, max_load - tmp);
/* Amount of load we'd add */
- if (max_load * busiest->cpu_power <
+ if (max_load * busiest->__cpu_power <
busiest_load_per_task * SCHED_LOAD_SCALE)
- tmp = max_load * busiest->cpu_power / this->cpu_power;
+ tmp = sg_div_cpu_power(this,
+ max_load * busiest->__cpu_power);
else
- tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
- this->cpu_power;
- pwr_move += this->cpu_power *
+ tmp = sg_div_cpu_power(this,
+ busiest_load_per_task * SCHED_LOAD_SCALE);
+ pwr_move += this->__cpu_power *
min(this_load_per_task, this_load + tmp);
pwr_move /= SCHED_LOAD_SCALE;
@@ -5486,7 +5509,7 @@ static void sched_domain_debug(struct sc
break;
}
- if (!group->cpu_power) {
+ if (!group->__cpu_power) {
printk("\n");
printk(KERN_ERR "ERROR: domain->cpu_power not "
"set\n");
@@ -5663,7 +5686,7 @@ init_sched_build_groups(cpumask_t span,
continue;
sg->cpumask = CPU_MASK_NONE;
- sg->cpu_power = 0;
+ sg->__cpu_power = 0;
for_each_cpu_mask(j, span) {
if (group_fn(j, cpu_map, NULL) != group)
@@ -6352,7 +6375,7 @@ next_sg:
continue;
}
- sg->cpu_power += sd->groups->cpu_power;
+ sg_inc_cpu_power(sg, sd->groups->__cpu_power);
}
sg = sg->next;
if (sg != group_head)
@@ -6427,6 +6450,8 @@ static void init_sched_groups_power(int
child = sd->child;
+ sd->groups->__cpu_power = 0;
+
/*
* For perf policy, if the groups in child domain share resources
* (for example cores sharing some portions of the cache hierarchy
@@ -6437,18 +6462,16 @@ static void init_sched_groups_power(int
if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
(child->flags &
(SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
- sd->groups->cpu_power = SCHED_LOAD_SCALE;
+ sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
return;
}
- sd->groups->cpu_power = 0;
-
/*
* add cpu_power of each child group to this groups cpu_power
*/
group = child->groups;
do {
- sd->groups->cpu_power += group->cpu_power;
+ sg_inc_cpu_power(sd->groups, group->__cpu_power);
group = group->next;
} while (group != child->groups);
}
@@ -6608,7 +6631,7 @@ static int build_sched_domains(const cpu
sd = &per_cpu(node_domains, j);
sd->groups = sg;
}
- sg->cpu_power = 0;
+ sg->__cpu_power = 0;
sg->cpumask = nodemask;
sg->next = sg;
cpus_or(covered, covered, nodemask);
@@ -6636,7 +6659,7 @@ static int build_sched_domains(const cpu
"Can not alloc domain group for node %d\n", j);
goto error;
}
- sg->cpu_power = 0;
+ sg->__cpu_power = 0;
sg->cpumask = tmp;
sg->next = prev->next;
cpus_or(covered, covered, tmp);
next prev parent reply other threads:[~2007-02-22 8:19 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2007-02-21 21:10 [Patch 1/2] cciss: fix for 2TB support Mike Miller (OS Dev)
2007-02-22 3:14 ` Andrew Morton
2007-02-22 7:31 ` [PATCH] Speedup divides by cpu_power in scheduler Eric Dumazet
2007-02-22 7:56 ` Ingo Molnar
2007-02-22 8:19 ` Eric Dumazet [this message]
2007-02-22 8:19 ` [PATCH, take 2] " Ingo Molnar
2007-02-22 16:51 ` [Patch 1/2] cciss: fix for 2TB support Mike Miller (OS Dev)
2007-02-22 21:24 ` Andrew Morton
2007-02-22 21:41 ` James Bottomley
2007-02-22 22:02 ` Mike Miller (OS Dev)
2007-02-22 22:06 ` James Bottomley
2007-02-23 20:52 ` Mike Miller (OS Dev)
2007-02-24 6:35 ` Andrew Morton
2007-02-22 20:18 ` Mike Miller (OS Dev)
2007-02-22 21:22 ` Miller, Mike (OS Dev)
2007-02-22 21:22 ` Miller, Mike (OS Dev)
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=45DD5217.3000608@cosmosbay.com \
--to=dada1@cosmosbay.com \
--cc=akpm@linux-foundation.org \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@elte.hu \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.