From: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
To: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>,
"Zhang, Yanmin" <yanmin_zhang@linux.intel.com>,
Dhaval Giani <dhaval@linux.vnet.ibm.com>,
LKML <linux-kernel@vger.kernel.org>, Ingo Molnar <mingo@elte.hu>,
Aneesh Kumar KV <aneesh.kumar@linux.vnet.ibm.com>
Subject: Re: volanoMark regression with kernel 2.6.26-rc1
Date: Thu, 15 May 2008 22:40:42 +0530 [thread overview]
Message-ID: <20080515171042.GG14823@linux.vnet.ibm.com> (raw)
In-Reply-To: <1210840898.21616.27.camel@twins>
On Thu, May 15, 2008 at 10:41:38AM +0200, Peter Zijlstra wrote:
> Yes - and that should not be too big an issue as long as we can deal
> with it.
>
> Any number we'll put to it will be based on a snapshot of the state so
> we're wrong no matter what we do. The trick is trying to keep sane.
more than staleness, interspersing of writes/reads is my concern.
Lets say that CPU0 is updating tg->cfs_rq[0]->aggregate.load,shares,rq_weight
at CPU domain (comprising of cpu 0-7). That will result in writes in this order:
->rq_weight = ?
->task_weight = ?
->shares = ?
->load = ?
At the same time, CPU1 could be doing a load_balance_fair() in SMT
domain, reading the above same words in this order:
->rq_weight
->load
->task_weight
->shares
What if the writes (on cpu0) and reads (on cpu1) are interspersed? Won't
it give incoherent results for the reader?
> My current stack on top of sched-devel:
>
> http://programming.kicks-ass.net/kernel-patches/sched-smp-group-fixes/
Doesnt improve things here (8-way Intel Xeon with SCHED_SMT set):
2.6.25 : 21762.4
2.6.26-rc1 + sched_devel : 17937.5 (-17.6%)
2.6.26-rc1 + sched_devel + your patches : 17047 (-21.6%)
2.6.26-rc1 + sched_devel + patch_below : 18368.9 (-15.6%)
2.6.26-rc1 + sched_devel + patch_below + ^NORMALIZED_SLEEPER : 19589.6 (-9.9%)
I will check if patch_below + your patches help close down the gap (tomorrow):
---
kernel/sched.c | 98 ++++++++++++++++++++++++++--------------------------
kernel/sched_fair.c | 26 +++++--------
2 files changed, 61 insertions(+), 63 deletions(-)
Index: current/kernel/sched.c
===================================================================
--- current.orig/kernel/sched.c
+++ current/kernel/sched.c
@@ -1568,12 +1568,12 @@ static int task_hot(struct task_struct *
*/
static inline struct aggregate_struct *
-aggregate(struct task_group *tg, struct sched_domain *sd)
+aggregate(struct task_group *tg, int this_cpu)
{
- return &tg->cfs_rq[sd->first_cpu]->aggregate;
+ return &tg->cfs_rq[this_cpu]->aggregate;
}
-typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
+typedef void (*aggregate_func)(struct task_group *, struct sched_domain *, int);
/*
* Iterate the full tree, calling @down when first entering a node and @up when
@@ -1581,14 +1581,14 @@ typedef void (*aggregate_func)(struct ta
*/
static
void aggregate_walk_tree(aggregate_func down, aggregate_func up,
- struct sched_domain *sd)
+ struct sched_domain *sd, int this_cpu)
{
struct task_group *parent, *child;
rcu_read_lock();
parent = &root_task_group;
down:
- (*down)(parent, sd);
+ (*down)(parent, sd, this_cpu);
list_for_each_entry_rcu(child, &parent->children, siblings) {
parent = child;
goto down;
@@ -1596,7 +1596,7 @@ down:
up:
continue;
}
- (*up)(parent, sd);
+ (*up)(parent, sd, this_cpu);
child = parent;
parent = parent->parent;
@@ -1609,7 +1609,8 @@ up:
* Calculate the aggregate runqueue weight.
*/
static
-void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
+void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd,
+ int this_cpu)
{
unsigned long rq_weight = 0;
unsigned long task_weight = 0;
@@ -1620,15 +1621,16 @@ void aggregate_group_weight(struct task_
task_weight += tg->cfs_rq[i]->task_weight;
}
- aggregate(tg, sd)->rq_weight = rq_weight;
- aggregate(tg, sd)->task_weight = task_weight;
+ aggregate(tg, this_cpu)->rq_weight = rq_weight;
+ aggregate(tg, this_cpu)->task_weight = task_weight;
}
/*
* Compute the weight of this group on the given cpus.
*/
static
-void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
+void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd,
+ int this_cpu)
{
unsigned long shares = 0;
int i;
@@ -1636,10 +1638,11 @@ void aggregate_group_shares(struct task_
for_each_cpu_mask(i, sd->span)
shares += tg->cfs_rq[i]->shares;
- if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares)
+ if ((!shares && aggregate(tg, this_cpu)->rq_weight) ||
+ shares > tg->shares)
shares = tg->shares;
- aggregate(tg, sd)->shares = shares;
+ aggregate(tg, this_cpu)->shares = shares;
}
/*
@@ -1647,7 +1650,8 @@ void aggregate_group_shares(struct task_
* weight and this group's parent's load, i.e. top-down.
*/
static
-void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
+void aggregate_group_load(struct task_group *tg, struct sched_domain *sd,
+ int this_cpu)
{
unsigned long load;
@@ -1659,17 +1663,17 @@ void aggregate_group_load(struct task_gr
load += cpu_rq(i)->load.weight;
} else {
- load = aggregate(tg->parent, sd)->load;
+ load = aggregate(tg->parent, this_cpu)->load;
/*
* shares is our weight in the parent's rq so
* shares/parent->rq_weight gives our fraction of the load
*/
- load *= aggregate(tg, sd)->shares;
- load /= aggregate(tg->parent, sd)->rq_weight + 1;
+ load *= aggregate(tg, this_cpu)->shares;
+ load /= aggregate(tg->parent, this_cpu)->rq_weight + 1;
}
- aggregate(tg, sd)->load = load;
+ aggregate(tg, this_cpu)->load = load;
}
static void __set_se_shares(struct sched_entity *se, unsigned long shares);
@@ -1679,7 +1683,7 @@ static void __set_se_shares(struct sched
*/
static void
__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
- int tcpu)
+ int tcpu, int this_cpu)
{
int boost = 0;
unsigned long shares;
@@ -1706,8 +1710,8 @@ __update_group_shares_cpu(struct task_gr
* \Sum rq_weight
*
*/
- shares = aggregate(tg, sd)->shares * rq_weight;
- shares /= aggregate(tg, sd)->rq_weight + 1;
+ shares = aggregate(tg, this_cpu)->shares * rq_weight;
+ shares /= aggregate(tg, this_cpu)->rq_weight + 1;
/*
* record the actual number of shares, not the boosted amount.
@@ -1734,8 +1738,8 @@ __move_group_shares(struct task_group *t
shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
- __update_group_shares_cpu(tg, sd, scpu);
- __update_group_shares_cpu(tg, sd, dcpu);
+ __update_group_shares_cpu(tg, sd, scpu, dcpu);
+ __update_group_shares_cpu(tg, sd, dcpu, dcpu);
/*
* ensure we never loose shares due to rounding errors in the
@@ -1761,9 +1765,10 @@ move_group_shares(struct task_group *tg,
}
static
-void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
+void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd,
+ int this_cpu)
{
- unsigned long shares = aggregate(tg, sd)->shares;
+ unsigned long shares = aggregate(tg, this_cpu)->shares;
int i;
for_each_cpu_mask(i, sd->span) {
@@ -1771,20 +1776,20 @@ void aggregate_group_set_shares(struct t
unsigned long flags;
spin_lock_irqsave(&rq->lock, flags);
- __update_group_shares_cpu(tg, sd, i);
+ __update_group_shares_cpu(tg, sd, i, this_cpu);
spin_unlock_irqrestore(&rq->lock, flags);
}
- aggregate_group_shares(tg, sd);
+ aggregate_group_shares(tg, sd, this_cpu);
/*
* ensure we never loose shares due to rounding errors in the
* above redistribution.
*/
- shares -= aggregate(tg, sd)->shares;
+ shares -= aggregate(tg, this_cpu)->shares;
if (shares) {
tg->cfs_rq[sd->first_cpu]->shares += shares;
- aggregate(tg, sd)->shares += shares;
+ aggregate(tg, this_cpu)->shares += shares;
}
}
@@ -1793,20 +1798,22 @@ void aggregate_group_set_shares(struct t
* while walking down the tree.
*/
static
-void aggregate_get_down(struct task_group *tg, struct sched_domain *sd)
+void aggregate_get_down(struct task_group *tg, struct sched_domain *sd,
+ int this_cpu)
{
- aggregate_group_weight(tg, sd);
- aggregate_group_shares(tg, sd);
- aggregate_group_load(tg, sd);
+ aggregate_group_weight(tg, sd, this_cpu);
+ aggregate_group_shares(tg, sd, this_cpu);
+ aggregate_group_load(tg, sd, this_cpu);
}
/*
* Rebalance the cpu shares while walking back up the tree.
*/
static
-void aggregate_get_up(struct task_group *tg, struct sched_domain *sd)
+void aggregate_get_up(struct task_group *tg, struct sched_domain *sd,
+ int this_cpu)
{
- aggregate_group_set_shares(tg, sd);
+ aggregate_group_set_shares(tg, sd, this_cpu);
}
static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
@@ -1819,18 +1826,15 @@ static void __init init_aggregate(void)
spin_lock_init(&per_cpu(aggregate_lock, i));
}
-static int get_aggregate(struct sched_domain *sd)
+static void get_aggregate(struct sched_domain *sd, int this_cpu)
{
- if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu)))
- return 0;
-
- aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd);
- return 1;
+ spin_lock(&per_cpu(aggregate_lock, this_cpu));
+ aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd, this_cpu);
}
-static void put_aggregate(struct sched_domain *sd)
+static void put_aggregate(struct sched_domain *sd, int this_cpu)
{
- spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu));
+ spin_unlock(&per_cpu(aggregate_lock, this_cpu));
}
static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
@@ -1844,12 +1848,12 @@ static inline void init_aggregate(void)
{
}
-static inline int get_aggregate(struct sched_domain *sd)
+static void get_aggregate(struct sched_domain *sd, int this_cpu)
{
return 0;
}
-static inline void put_aggregate(struct sched_domain *sd)
+static void put_aggregate(struct sched_domain *sd, int this_cpu)
{
}
#endif
@@ -3635,11 +3639,10 @@ static int load_balance(int this_cpu, st
unsigned long imbalance;
struct rq *busiest;
unsigned long flags;
- int unlock_aggregate;
cpus_setall(*cpus);
- unlock_aggregate = get_aggregate(sd);
+ get_aggregate(sd, this_cpu);
/*
* When power savings policy is enabled for the parent domain, idle
@@ -3777,8 +3780,7 @@ out_one_pinned:
else
ld_moved = 0;
out:
- if (unlock_aggregate)
- put_aggregate(sd);
+ put_aggregate(sd, this_cpu);
return ld_moved;
}
Index: current/kernel/sched_fair.c
===================================================================
--- current.orig/kernel/sched_fair.c
+++ current/kernel/sched_fair.c
@@ -1388,28 +1388,24 @@ load_balance_fair(struct rq *this_rq, in
rcu_read_lock();
list_for_each_entry(tg, &task_groups, list) {
- long imbalance;
- unsigned long this_weight, busiest_weight;
+ unsigned long busiest_weight;
long rem_load, max_load, moved_load;
+ busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
+
/*
* empty group
*/
- if (!aggregate(tg, sd)->task_weight)
+ if (!aggregate(tg, this_cpu)->task_weight || !busiest_weight)
continue;
- rem_load = rem_load_move * aggregate(tg, sd)->rq_weight;
- rem_load /= aggregate(tg, sd)->load + 1;
-
- this_weight = tg->cfs_rq[this_cpu]->task_weight;
- busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
-
- imbalance = (busiest_weight - this_weight) / 2;
+ rem_load = rem_load_move * aggregate(tg, this_cpu)->rq_weight;
+ rem_load /= aggregate(tg, this_cpu)->load + 1;
- if (imbalance < 0)
- imbalance = busiest_weight;
+ if (!rem_load)
+ continue;
- max_load = max(rem_load, imbalance);
+ max_load = rem_load;
moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
max_load, sd, idle, all_pinned, this_best_prio,
tg->cfs_rq[busiest_cpu]);
@@ -1419,8 +1415,8 @@ load_balance_fair(struct rq *this_rq, in
move_group_shares(tg, sd, busiest_cpu, this_cpu);
- moved_load *= aggregate(tg, sd)->load;
- moved_load /= aggregate(tg, sd)->rq_weight + 1;
+ moved_load *= aggregate(tg, this_cpu)->load;
+ moved_load /= aggregate(tg, this_cpu)->rq_weight + 1;
rem_load_move -= moved_load;
if (rem_load_move < 0)
--
Regards,
vatsa
next prev parent reply other threads:[~2008-05-15 17:01 UTC|newest]
Thread overview: 41+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-05-06 2:06 volanoMark regression with kernel 2.6.26-rc1 Zhang, Yanmin
2008-05-06 5:41 ` Zhang, Yanmin
2008-05-06 11:52 ` Dhaval Giani
2008-05-07 17:33 ` Dhaval Giani
2008-05-08 5:18 ` Zhang, Yanmin
2008-05-08 5:32 ` Dhaval Giani
2008-05-08 5:40 ` Dhaval Giani
2008-05-08 5:53 ` Zhang, Yanmin
2008-05-08 6:04 ` Dhaval Giani
2008-05-08 6:11 ` Srivatsa Vaddagiri
2008-05-09 15:52 ` Srivatsa Vaddagiri
2008-05-09 15:54 ` Srivatsa Vaddagiri
2008-05-12 1:39 ` Zhang, Yanmin
2008-05-12 2:04 ` Dhaval Giani
2008-05-12 2:37 ` Srivatsa Vaddagiri
2008-05-12 3:33 ` Zhang, Yanmin
2008-05-12 4:52 ` Srivatsa Vaddagiri
2008-05-12 5:02 ` Zhang, Yanmin
2008-05-12 5:43 ` Zhang, Yanmin
2008-05-12 9:04 ` Mike Galbraith
2008-05-12 9:20 ` Peter Zijlstra
2008-05-14 9:22 ` Zhang, Yanmin
2008-05-14 13:44 ` Srivatsa Vaddagiri
2008-05-14 14:50 ` Mike Galbraith
2008-05-14 15:12 ` Peter Zijlstra
2008-05-15 8:20 ` Srivatsa Vaddagiri
2008-05-15 8:41 ` Peter Zijlstra
2008-05-15 17:10 ` Srivatsa Vaddagiri [this message]
2008-05-07 7:04 ` Andrew Morton
2008-05-07 9:17 ` Ingo Molnar
2008-05-07 9:33 ` Zhang, Yanmin
2008-05-07 17:34 ` Peter Zijlstra
2008-05-07 18:58 ` Peter Zijlstra
2008-05-08 6:07 ` Zhang, Yanmin
2008-05-08 5:20 ` Zhang, Yanmin
2008-05-08 5:34 ` Dhaval Giani
2008-05-08 6:43 ` Peter Zijlstra
2008-05-07 17:42 ` Dhaval Giani
2008-05-08 5:21 ` Zhang, Yanmin
2008-05-08 5:39 ` Dhaval Giani
2008-05-08 6:03 ` Zhang, Yanmin
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20080515171042.GG14823@linux.vnet.ibm.com \
--to=vatsa@linux.vnet.ibm.com \
--cc=a.p.zijlstra@chello.nl \
--cc=aneesh.kumar@linux.vnet.ibm.com \
--cc=dhaval@linux.vnet.ibm.com \
--cc=efault@gmx.de \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@elte.hu \
--cc=yanmin_zhang@linux.intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.