From: Lin Ming <ming.m.lin@intel.com>
To: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>,
lkml <linux-kernel@vger.kernel.org>,
"Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Subject: Re: [RFC PATCH] sched: Pass affine target cpu into wake_affine
Date: Thu, 07 Jan 2010 16:45:03 +0800 [thread overview]
Message-ID: <1262853903.18931.17.camel@minggr.sh.intel.com> (raw)
In-Reply-To: <1262673817.9534.37.camel@marge.simson.net>
On Tue, 2010-01-05 at 14:43 +0800, Mike Galbraith wrote:
> On Tue, 2010-01-05 at 04:44 +0100, Mike Galbraith wrote:
> > On Tue, 2010-01-05 at 10:48 +0800, Lin Ming wrote:
> > > On Mon, 2010-01-04 at 17:03 +0800, Lin Ming wrote:
> > > > commit a03ecf08d7bbdd979d81163ea13d194fe21ad339
> > > > Author: Lin Ming <ming.m.lin@intel.com>
> > > > Date: Mon Jan 4 14:14:50 2010 +0800
> > > >
> > > > sched: Pass affine target cpu into wake_affine
> > > >
> > > > Since commit a1f84a3 (sched: Check for an idle shared cache in select_task_rq_fair()),
> > > > the affine target may be adjusted to any idle cpu in cache sharing domains
> > > > instead of the current cpu.
> > > > But wake_affine still uses the current cpu to calculate load, which is wrong.
> > > >
> > > > This patch passes affine cpu into wake_affine.
> > > >
> > > > Signed-off-by: Lin Ming <ming.m.lin@intel.com>
> > >
> > > Mike,
> > >
> > > > Any comments on this patch?
> >
> > The patch definitely looks like the right thing to do, but when I tried
> > this, it didn't work out well. Since I can't seem to recall precise
> > > details, I'll let my box either remind me or give its ack.
>
> Unfortunately, box reminded me. mysql+oltp peak throughput with
> nr_clients == nr_cpus
Did you test with your vmark regression fix patch also applied?
I tested on the 2 machines below with both patches applied, and the
oltp (sysbench+mysql) data looks good.
Tigerton x86_64 machine: 16cpus(4P/4Cores), 40G mem
IA64 machine: 32cpus(4P/4Cores/HT), 16G mem
Compared with upstream 2.6.33-rc2, IA64 improves ~15% and Tigerton
improves ~3%.
The 2 patches are merged as below:
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 57e6357..5b81156 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -99,7 +99,7 @@ int arch_update_cpu_topology(void);
| 1*SD_WAKE_AFFINE \
| 1*SD_SHARE_CPUPOWER \
| 0*SD_POWERSAVINGS_BALANCE \
- | 0*SD_SHARE_PKG_RESOURCES \
+ | 1*SD_SHARE_PKG_RESOURCES \
| 0*SD_SERIALIZE \
| 0*SD_PREFER_SIBLING \
, \
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 42ac3c9..cbf4bd2 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1237,11 +1237,11 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
#endif
-static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
+static int wake_affine(struct sched_domain *sd, struct task_struct *p, int affine_cpu, int sync)
{
struct task_struct *curr = current;
- unsigned long this_load, load;
- int idx, this_cpu, prev_cpu;
+ unsigned long affine_load, load;
+ int idx, prev_cpu;
unsigned long tl_per_task;
unsigned int imbalance;
struct task_group *tg;
@@ -1249,10 +1249,9 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
int balanced;
idx = sd->wake_idx;
- this_cpu = smp_processor_id();
prev_cpu = task_cpu(p);
load = source_load(prev_cpu, idx);
- this_load = target_load(this_cpu, idx);
+ affine_load = target_load(affine_cpu, idx);
if (sync) {
if (sched_feat(SYNC_LESS) &&
@@ -1275,7 +1274,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
tg = task_group(current);
weight = current->se.load.weight;
- this_load += effective_load(tg, this_cpu, -weight, -weight);
+ affine_load += effective_load(tg, affine_cpu, -weight, -weight);
load += effective_load(tg, prev_cpu, 0, -weight);
}
@@ -1285,16 +1284,16 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
imbalance = 100 + (sd->imbalance_pct - 100) / 2;
/*
- * In low-load situations, where prev_cpu is idle and this_cpu is idle
- * due to the sync cause above having dropped this_load to 0, we'll
+ * In low-load situations, where prev_cpu is idle and affine_cpu is idle
+ * due to the sync cause above having dropped affine_load to 0, we'll
* always have an imbalance, but there's really nothing you can do
* about that, so that's good too.
*
* Otherwise check if either cpus are near enough in load to allow this
- * task to be woken on this_cpu.
+ * task to be woken on affine_cpu.
*/
- balanced = !this_load ||
- 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <=
+ balanced = !affine_load ||
+ 100*(affine_load + effective_load(tg, affine_cpu, weight, weight)) <=
imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
/*
@@ -1306,11 +1305,11 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
return 1;
schedstat_inc(p, se.nr_wakeups_affine_attempts);
- tl_per_task = cpu_avg_load_per_task(this_cpu);
+ tl_per_task = cpu_avg_load_per_task(affine_cpu);
if (balanced ||
- (this_load <= load &&
- this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
+ (affine_load <= load &&
+ affine_load + target_load(prev_cpu, idx) <= tl_per_task)) {
/*
* This domain has SD_WAKE_AFFINE and
* p is cache cold in this domain, and
@@ -1508,7 +1507,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
* If there's an idle sibling in this domain, make that
* the wake_affine target instead of the current cpu.
*/
- if (tmp->flags & SD_PREFER_SIBLING)
+ if (tmp->flags & SD_SHARE_PKG_RESOURCES)
target = select_idle_sibling(p, tmp, target);
if (target >= 0) {
@@ -1544,7 +1543,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
update_shares(tmp);
}
- if (affine_sd && wake_affine(affine_sd, p, sync))
+ if (affine_sd && wake_affine(affine_sd, p, cpu, sync))
return cpu;
while (sd) {
next prev parent reply other threads:[~2010-01-07 9:00 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2010-01-04 9:03 [RFC PATCH] sched: Pass affine target cpu into wake_affine Lin Ming
2010-01-04 9:25 ` Peter Zijlstra
2010-01-04 9:12 ` Lin Ming
2010-01-04 9:32 ` Peter Zijlstra
2010-01-04 10:59 ` Mike Galbraith
2010-01-04 11:07 ` Lin Ming
2010-01-05 2:48 ` Lin Ming
2010-01-05 3:44 ` Mike Galbraith
2010-01-05 6:43 ` Mike Galbraith
2010-01-05 11:49 ` Mike Galbraith
2010-01-07 8:45 ` Lin Ming [this message]
2010-01-07 9:15 ` Peter Zijlstra
2010-01-07 9:33 ` Mike Galbraith
2010-01-07 13:14 ` Mike Galbraith
2010-01-08 2:38 ` Lin Ming
2010-01-08 3:34 ` Mike Galbraith
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1262853903.18931.17.camel@minggr.sh.intel.com \
--to=ming.m.lin@intel.com \
--cc=efault@gmx.de \
--cc=linux-kernel@vger.kernel.org \
--cc=peterz@infradead.org \
--cc=yanmin_zhang@linux.intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.