From: Preeti U Murthy <preeti@linux.vnet.ibm.com>
To: peterz@infradead.org, mikey@neuling.org,
svaidy@linux.vnet.ibm.com, mingo@kernel.org
Cc: vincent.guittot@linaro.org, bitbucket@online.de,
linux-kernel@vger.kernel.org, anton@samba.org,
linuxppc-dev@lists.ozlabs.org, Morten.Rasmussen@arm.com,
pjt@google.com
Subject: Re: [PATCH V2 2/2] sched: Remove un-necessary iteration over sched domains to update nr_busy_cpus
Date: Wed, 30 Oct 2013 08:50:34 +0530 [thread overview]
Message-ID: <52707B02.7030100@linux.vnet.ibm.com> (raw)
In-Reply-To: <20131030031252.23426.4417.stgit@preeti.in.ibm.com>
The changelog has missed mentioning the introduction of sd_asym per_cpu sched domain.
Apologies for this. The patch with the changelog including mention of sd_asym is
pasted below.
Regards
Preeti U Murthy
---------------
sched: Remove un-necessary iteration over sched domains to update nr_busy_cpus
From: Preeti U Murthy <preeti@linux.vnet.ibm.com>
nr_busy_cpus parameter is used by nohz_kick_needed() to find out the number
of busy cpus in a sched domain which has SD_SHARE_PKG_RESOURCES flag set.
Therefore instead of updating nr_busy_cpus at every level of sched domain,
since it is irrelevant, we can update this parameter only at the parent
domain of the sd which has this flag set. Introduce a per-cpu parameter
sd_busy which represents this parent domain.
In nohz_kick_needed() we directly query the nr_busy_cpus parameter
associated with the groups of sd_busy.
By associating sd_busy with the highest domain which has
SD_SHARE_PKG_RESOURCES flag set, we cover all lower level domains which could
have this flag set and trigger nohz_idle_balancing if any of the levels have
more than one busy cpu.
sd_busy is irrelevant for asymmetric load balancing. However sd_asym has been
introduced to represent the highest sched domain which has SD_ASYM_PACKING flag set
so that it can be queried directly when required.
While we are at it, we might as well change the nohz_idle parameter to be
updated at the sd_busy domain level alone and not the base domain level of a CPU.
This will unify the concept of busy cpus at just one level of sched domain
where it is currently used.
Signed-off-by: Preeti U Murthy<preeti@linux.vnet.ibm.com>
---
kernel/sched/core.c | 6 ++++++
kernel/sched/fair.c | 38 ++++++++++++++++++++------------------
kernel/sched/sched.h | 2 ++
3 files changed, 28 insertions(+), 18 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c06b8d3..e6a6244 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5271,6 +5271,8 @@ DEFINE_PER_CPU(struct sched_domain *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id);
DEFINE_PER_CPU(struct sched_domain *, sd_numa);
+DEFINE_PER_CPU(struct sched_domain *, sd_busy);
+DEFINE_PER_CPU(struct sched_domain *, sd_asym);
static void update_top_cache_domain(int cpu)
{
@@ -5282,6 +5284,7 @@ static void update_top_cache_domain(int cpu)
if (sd) {
id = cpumask_first(sched_domain_span(sd));
size = cpumask_weight(sched_domain_span(sd));
+ rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent);
}
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
@@ -5290,6 +5293,9 @@ static void update_top_cache_domain(int cpu)
sd = lowest_flag_domain(cpu, SD_NUMA);
rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
+
+ sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
+ rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
}
/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e9c9549..8602b2c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6515,16 +6515,16 @@ static inline void nohz_balance_exit_idle(int cpu)
static inline void set_cpu_sd_state_busy(void)
{
struct sched_domain *sd;
+ int cpu = smp_processor_id();
rcu_read_lock();
- sd = rcu_dereference_check_sched_domain(this_rq()->sd);
+ sd = rcu_dereference(per_cpu(sd_busy, cpu));
if (!sd || !sd->nohz_idle)
goto unlock;
sd->nohz_idle = 0;
- for (; sd; sd = sd->parent)
- atomic_inc(&sd->groups->sgp->nr_busy_cpus);
+ atomic_inc(&sd->groups->sgp->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
@@ -6532,16 +6532,16 @@ unlock:
void set_cpu_sd_state_idle(void)
{
struct sched_domain *sd;
+ int cpu = smp_processor_id();
rcu_read_lock();
- sd = rcu_dereference_check_sched_domain(this_rq()->sd);
+ sd = rcu_dereference(per_cpu(sd_busy, cpu));
if (!sd || sd->nohz_idle)
goto unlock;
sd->nohz_idle = 1;
- for (; sd; sd = sd->parent)
- atomic_dec(&sd->groups->sgp->nr_busy_cpus);
+ atomic_dec(&sd->groups->sgp->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
@@ -6748,6 +6748,8 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
{
unsigned long now = jiffies;
struct sched_domain *sd;
+ struct sched_group_power *sgp;
+ int nr_busy;
if (unlikely(idle_cpu(cpu)))
return 0;
@@ -6773,22 +6775,22 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
goto need_kick;
rcu_read_lock();
- for_each_domain(cpu, sd) {
- struct sched_group *sg = sd->groups;
- struct sched_group_power *sgp = sg->sgp;
- int nr_busy = atomic_read(&sgp->nr_busy_cpus);
+ sd = rcu_dereference(per_cpu(sd_busy, cpu));
- if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
- goto need_kick_unlock;
+ if (sd) {
+ sgp = sd->groups->sgp;
+ nr_busy = atomic_read(&sgp->nr_busy_cpus);
- if (sd->flags & SD_ASYM_PACKING
- && (cpumask_first_and(nohz.idle_cpus_mask,
- sched_domain_span(sd)) < cpu))
+ if (nr_busy > 1)
goto need_kick_unlock;
-
- if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
- break;
}
+
+ sd = rcu_dereference(per_cpu(sd_asym, cpu));
+
+ if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
+ sched_domain_span(sd)) < cpu))
+ goto need_kick_unlock;
+
rcu_read_unlock();
return 0;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ffc7087..c8cb145 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -623,6 +623,8 @@ DECLARE_PER_CPU(struct sched_domain *, sd_llc);
DECLARE_PER_CPU(int, sd_llc_size);
DECLARE_PER_CPU(int, sd_llc_id);
DECLARE_PER_CPU(struct sched_domain *, sd_numa);
+DECLARE_PER_CPU(struct sched_domain *, sd_busy);
+DECLARE_PER_CPU(struct sched_domain *, sd_asym);
struct sched_group_power {
atomic_t ref;
WARNING: multiple messages have this Message-ID (diff)
From: Preeti U Murthy <preeti@linux.vnet.ibm.com>
To: peterz@infradead.org, mikey@neuling.org,
svaidy@linux.vnet.ibm.com, mingo@kernel.org
Cc: vincent.guittot@linaro.org, bitbucket@online.de,
benh@kernel.crashing.org, linux-kernel@vger.kernel.org,
anton@samba.org, linuxppc-dev@lists.ozlabs.org,
Morten.Rasmussen@arm.com, pjt@google.com
Subject: Re: [PATCH V2 2/2] sched: Remove un-necessary iteration over sched domains to update nr_busy_cpus
Date: Wed, 30 Oct 2013 08:50:34 +0530 [thread overview]
Message-ID: <52707B02.7030100@linux.vnet.ibm.com> (raw)
In-Reply-To: <20131030031252.23426.4417.stgit@preeti.in.ibm.com>
The changelog has missed mentioning the introduction of sd_asym per_cpu sched domain.
Apologies for this. The patch with the changelog including mention of sd_asym is
pasted below.
Regards
Preeti U Murthy
---------------
sched: Remove un-necessary iteration over sched domains to update nr_busy_cpus
From: Preeti U Murthy <preeti@linux.vnet.ibm.com>
nr_busy_cpus parameter is used by nohz_kick_needed() to find out the number
of busy cpus in a sched domain which has SD_SHARE_PKG_RESOURCES flag set.
Therefore instead of updating nr_busy_cpus at every level of sched domain,
since it is irrelevant, we can update this parameter only at the parent
domain of the sd which has this flag set. Introduce a per-cpu parameter
sd_busy which represents this parent domain.
In nohz_kick_needed() we directly query the nr_busy_cpus parameter
associated with the groups of sd_busy.
By associating sd_busy with the highest domain which has
SD_SHARE_PKG_RESOURCES flag set, we cover all lower level domains which could
have this flag set and trigger nohz_idle_balancing if any of the levels have
more than one busy cpu.
sd_busy is irrelevant for asymmetric load balancing. However sd_asym has been
introduced to represent the highest sched domain which has SD_ASYM_PACKING flag set
so that it can be queried directly when required.
While we are at it, we might as well change the nohz_idle parameter to be
updated at the sd_busy domain level alone and not the base domain level of a CPU.
This will unify the concept of busy cpus at just one level of sched domain
where it is currently used.
Signed-off-by: Preeti U Murthy<preeti@linux.vnet.ibm.com>
---
kernel/sched/core.c | 6 ++++++
kernel/sched/fair.c | 38 ++++++++++++++++++++------------------
kernel/sched/sched.h | 2 ++
3 files changed, 28 insertions(+), 18 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c06b8d3..e6a6244 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5271,6 +5271,8 @@ DEFINE_PER_CPU(struct sched_domain *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id);
DEFINE_PER_CPU(struct sched_domain *, sd_numa);
+DEFINE_PER_CPU(struct sched_domain *, sd_busy);
+DEFINE_PER_CPU(struct sched_domain *, sd_asym);
static void update_top_cache_domain(int cpu)
{
@@ -5282,6 +5284,7 @@ static void update_top_cache_domain(int cpu)
if (sd) {
id = cpumask_first(sched_domain_span(sd));
size = cpumask_weight(sched_domain_span(sd));
+ rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent);
}
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
@@ -5290,6 +5293,9 @@ static void update_top_cache_domain(int cpu)
sd = lowest_flag_domain(cpu, SD_NUMA);
rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
+
+ sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
+ rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
}
/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e9c9549..8602b2c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6515,16 +6515,16 @@ static inline void nohz_balance_exit_idle(int cpu)
static inline void set_cpu_sd_state_busy(void)
{
struct sched_domain *sd;
+ int cpu = smp_processor_id();
rcu_read_lock();
- sd = rcu_dereference_check_sched_domain(this_rq()->sd);
+ sd = rcu_dereference(per_cpu(sd_busy, cpu));
if (!sd || !sd->nohz_idle)
goto unlock;
sd->nohz_idle = 0;
- for (; sd; sd = sd->parent)
- atomic_inc(&sd->groups->sgp->nr_busy_cpus);
+ atomic_inc(&sd->groups->sgp->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
@@ -6532,16 +6532,16 @@ unlock:
void set_cpu_sd_state_idle(void)
{
struct sched_domain *sd;
+ int cpu = smp_processor_id();
rcu_read_lock();
- sd = rcu_dereference_check_sched_domain(this_rq()->sd);
+ sd = rcu_dereference(per_cpu(sd_busy, cpu));
if (!sd || sd->nohz_idle)
goto unlock;
sd->nohz_idle = 1;
- for (; sd; sd = sd->parent)
- atomic_dec(&sd->groups->sgp->nr_busy_cpus);
+ atomic_dec(&sd->groups->sgp->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
@@ -6748,6 +6748,8 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
{
unsigned long now = jiffies;
struct sched_domain *sd;
+ struct sched_group_power *sgp;
+ int nr_busy;
if (unlikely(idle_cpu(cpu)))
return 0;
@@ -6773,22 +6775,22 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
goto need_kick;
rcu_read_lock();
- for_each_domain(cpu, sd) {
- struct sched_group *sg = sd->groups;
- struct sched_group_power *sgp = sg->sgp;
- int nr_busy = atomic_read(&sgp->nr_busy_cpus);
+ sd = rcu_dereference(per_cpu(sd_busy, cpu));
- if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
- goto need_kick_unlock;
+ if (sd) {
+ sgp = sd->groups->sgp;
+ nr_busy = atomic_read(&sgp->nr_busy_cpus);
- if (sd->flags & SD_ASYM_PACKING
- && (cpumask_first_and(nohz.idle_cpus_mask,
- sched_domain_span(sd)) < cpu))
+ if (nr_busy > 1)
goto need_kick_unlock;
-
- if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
- break;
}
+
+ sd = rcu_dereference(per_cpu(sd_asym, cpu));
+
+ if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
+ sched_domain_span(sd)) < cpu))
+ goto need_kick_unlock;
+
rcu_read_unlock();
return 0;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ffc7087..c8cb145 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -623,6 +623,8 @@ DECLARE_PER_CPU(struct sched_domain *, sd_llc);
DECLARE_PER_CPU(int, sd_llc_size);
DECLARE_PER_CPU(int, sd_llc_id);
DECLARE_PER_CPU(struct sched_domain *, sd_numa);
+DECLARE_PER_CPU(struct sched_domain *, sd_busy);
+DECLARE_PER_CPU(struct sched_domain *, sd_asym);
struct sched_group_power {
atomic_t ref;
next prev parent reply other threads:[~2013-10-30 3:23 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-10-30 3:12 [PATCH V2 0/2] sched: Cleanups,fixes in nohz_kick_needed() Preeti U Murthy
2013-10-30 3:12 ` Preeti U Murthy
2013-10-30 3:12 ` [PATCH V2 1/2] sched: Fix asymmetric scheduling for POWER7 Preeti U Murthy
2013-10-30 3:12 ` Preeti U Murthy
2013-11-06 13:20 ` [tip:sched/core] " tip-bot for Vaidyanathan Srinivasan
2013-10-30 3:12 ` [PATCH V2 2/2] sched: Remove un-necessary iteration over sched domains to update nr_busy_cpus Preeti U Murthy
2013-10-30 3:12 ` Preeti U Murthy
2013-10-30 3:20 ` Preeti U Murthy [this message]
2013-10-30 3:20 ` Preeti U Murthy
2013-10-30 9:23 ` Kamalesh Babulal
2013-10-30 9:23 ` Kamalesh Babulal
2013-10-30 10:03 ` Preeti U Murthy
2013-10-30 10:03 ` Preeti U Murthy
2013-11-06 13:20 ` [tip:sched/core] sched: Remove unnecessary " tip-bot for Preeti U Murthy
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=52707B02.7030100@linux.vnet.ibm.com \
--to=preeti@linux.vnet.ibm.com \
--cc=Morten.Rasmussen@arm.com \
--cc=anton@samba.org \
--cc=bitbucket@online.de \
--cc=linux-kernel@vger.kernel.org \
--cc=linuxppc-dev@lists.ozlabs.org \
--cc=mikey@neuling.org \
--cc=mingo@kernel.org \
--cc=peterz@infradead.org \
--cc=pjt@google.com \
--cc=svaidy@linux.vnet.ibm.com \
--cc=vincent.guittot@linaro.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.