From: Gautham R Shenoy <ego@in.ibm.com>
To: "Ingo Molnar" <mingo@elte.hu>,
Peter Zijlstra <a.p.zijlstra@chello.nl>,
"Vaidyanathan Srinivasan" <svaidy@linux.vnet.ibm.com>
Cc: linux-kernel@vger.kernel.org,
Suresh Siddha <suresh.b.siddha@intel.com>,
"Balbir Singh" <balbir@in.ibm.com>,
Nick Piggin <nickpiggin@yahoo.com.au>,
"Dhaval Giani" <dhaval@linux.vnet.ibm.com>,
Bharata B Rao <bharata@linux.vnet.ibm.com>,
Gautham R Shenoy <ego@in.ibm.com>
Subject: [RFC PATCH 05/11] sched: Define structure to store the sched_domain statistics for fbg()
Date: Wed, 25 Mar 2009 14:43:56 +0530 [thread overview]
Message-ID: <20090325091356.13992.25970.stgit@sofia.in.ibm.com> (raw)
In-Reply-To: <20090325091239.13992.96090.stgit@sofia.in.ibm.com>
Currently we use a lot of local variables in find_busiest_group() to capture
the various statistics related to the sched_domain. Group them together into a
single data structure.
This will help us to offload the job of updating the sched_domain statistics
to a helper function.
Credit: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
---
kernel/sched.c | 207 +++++++++++++++++++++++++++++++++-----------------------
1 files changed, 121 insertions(+), 86 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index d2e9b8a..c1b92da 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3086,6 +3086,37 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
return 0;
}
/********** Helpers for find_busiest_group ************************/
+/**
+ * struct sd_lb_stats - Structure to store the statistics of a sched_domain
+ * during load balancing.
+ */
+struct sd_lb_stats {
+ struct sched_group *busiest; /* Busiest group in this sd */
+ struct sched_group *this; /* Local group in this sd */
+ unsigned long total_load; /* Total load of all groups in sd */
+ unsigned long total_pwr; /* Total power of all groups in sd */
+ unsigned long avg_load; /* Average load across all groups in sd */
+
+ /* Statistics of the local group (this) */
+ unsigned long this_load;
+ unsigned long this_load_per_task;
+ unsigned long this_nr_running;
+
+ /* Statistics of the busiest group */
+ unsigned long max_load;
+ unsigned long busiest_load_per_task;
+ unsigned long busiest_nr_running;
+
+ int group_imb; /* Is there imbalance in this sd */
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+ int power_savings_balance; /* Is powersave balance needed for this sd */
+ struct sched_group *group_min; /* Least loaded group in sd */
+ struct sched_group *group_leader; /* Group which relieves group_min */
+ unsigned long min_load_per_task; /* load_per_task in group_min */
+ unsigned long leader_nr_running; /* Nr running of group_leader */
+ unsigned long min_nr_running; /* Nr running of group_min */
+#endif
+};
/**
* sg_lb_stats - stats of a sched_group required for load_balancing
@@ -3242,23 +3273,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
unsigned long *imbalance, enum cpu_idle_type idle,
int *sd_idle, const struct cpumask *cpus, int *balance)
{
- struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
- unsigned long max_load, avg_load, total_load, this_load, total_pwr;
+ struct sd_lb_stats sds;
+ struct sched_group *group = sd->groups;
unsigned long max_pull;
- unsigned long busiest_load_per_task, busiest_nr_running;
- unsigned long this_load_per_task, this_nr_running;
- int load_idx, group_imb = 0;
+ int load_idx;
+
+ memset(&sds, 0, sizeof(sds));
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
- int power_savings_balance = 1;
- unsigned long leader_nr_running = 0, min_load_per_task = 0;
- unsigned long min_nr_running = ULONG_MAX;
- struct sched_group *group_min = NULL, *group_leader = NULL;
+ sds.power_savings_balance = 1;
+ sds.min_nr_running = ULONG_MAX;
#endif
-
- max_load = this_load = total_load = total_pwr = 0;
- busiest_load_per_task = busiest_nr_running = 0;
- this_load_per_task = this_nr_running = 0;
-
load_idx = get_sd_load_idx(sd, idle);
do {
@@ -3274,22 +3298,22 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
if (balance && !(*balance))
goto ret;
- total_load += sgs.group_load;
- total_pwr += group->__cpu_power;
+ sds.total_load += sgs.group_load;
+ sds.total_pwr += group->__cpu_power;
if (local_group) {
- this_load = sgs.avg_load;
- this = group;
- this_nr_running = sgs.sum_nr_running;
- this_load_per_task = sgs.sum_weighted_load;
- } else if (sgs.avg_load > max_load &&
+ sds.this_load = sgs.avg_load;
+ sds.this = group;
+ sds.this_nr_running = sgs.sum_nr_running;
+ sds.this_load_per_task = sgs.sum_weighted_load;
+ } else if (sgs.avg_load > sds.max_load &&
(sgs.sum_nr_running > sgs.group_capacity ||
sgs.group_imb)) {
- max_load = sgs.avg_load;
- busiest = group;
- busiest_nr_running = sgs.sum_nr_running;
- busiest_load_per_task = sgs.sum_weighted_load;
- group_imb = sgs.group_imb;
+ sds.max_load = sgs.avg_load;
+ sds.busiest = group;
+ sds.busiest_nr_running = sgs.sum_nr_running;
+ sds.busiest_load_per_task = sgs.sum_weighted_load;
+ sds.group_imb = sgs.group_imb;
}
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -3305,15 +3329,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
* If the local group is idle or completely loaded
* no need to do power savings balance at this domain
*/
- if (local_group && (this_nr_running >= sgs.group_capacity ||
- !this_nr_running))
- power_savings_balance = 0;
+ if (local_group &&
+ (sds.this_nr_running >= sgs.group_capacity ||
+ !sds.this_nr_running))
+ sds.power_savings_balance = 0;
/*
* If a group is already running at full capacity or idle,
* don't include that group in power savings calculations
*/
- if (!power_savings_balance ||
+ if (!sds.power_savings_balance ||
sgs.sum_nr_running >= sgs.group_capacity ||
!sgs.sum_nr_running)
goto group_next;
@@ -3323,12 +3348,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
* This is the group from where we need to pick up the load
* for saving power
*/
- if ((sgs.sum_nr_running < min_nr_running) ||
- (sgs.sum_nr_running == min_nr_running &&
- group_first_cpu(group) > group_first_cpu(group_min))) {
- group_min = group;
- min_nr_running = sgs.sum_nr_running;
- min_load_per_task = sgs.sum_weighted_load /
+ if ((sgs.sum_nr_running < sds.min_nr_running) ||
+ (sgs.sum_nr_running == sds.min_nr_running &&
+ group_first_cpu(group) >
+ group_first_cpu(sds.group_min))) {
+ sds.group_min = group;
+ sds.min_nr_running = sgs.sum_nr_running;
+ sds.min_load_per_task = sgs.sum_weighted_load /
sgs.sum_nr_running;
}
@@ -3340,29 +3366,32 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
if (sgs.sum_nr_running > sgs.group_capacity - 1)
goto group_next;
- if (sgs.sum_nr_running > leader_nr_running ||
- (sgs.sum_nr_running == leader_nr_running &&
- group_first_cpu(group) < group_first_cpu(group_leader))) {
- group_leader = group;
- leader_nr_running = sgs.sum_nr_running;
+ if (sgs.sum_nr_running > sds.leader_nr_running ||
+ (sgs.sum_nr_running == sds.leader_nr_running &&
+ group_first_cpu(group) <
+ group_first_cpu(sds.group_leader))) {
+ sds.group_leader = group;
+ sds.leader_nr_running = sgs.sum_nr_running;
}
group_next:
#endif
group = group->next;
} while (group != sd->groups);
- if (!busiest || this_load >= max_load || busiest_nr_running == 0)
+ if (!sds.busiest || sds.this_load >= sds.max_load
+ || sds.busiest_nr_running == 0)
goto out_balanced;
- avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
+ sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
- if (this_load >= avg_load ||
- 100*max_load <= sd->imbalance_pct*this_load)
+ if (sds.this_load >= sds.avg_load ||
+ 100*sds.max_load <= sd->imbalance_pct * sds.this_load)
goto out_balanced;
- busiest_load_per_task /= busiest_nr_running;
- if (group_imb)
- busiest_load_per_task = min(busiest_load_per_task, avg_load);
+ sds.busiest_load_per_task /= sds.busiest_nr_running;
+ if (sds.group_imb)
+ sds.busiest_load_per_task =
+ min(sds.busiest_load_per_task, sds.avg_load);
/*
* We're trying to get all the cpus to the average_load, so we don't
@@ -3375,7 +3404,7 @@ group_next:
* by pulling tasks to us. Be careful of negative numbers as they'll
* appear as very large values with unsigned longs.
*/
- if (max_load <= busiest_load_per_task)
+ if (sds.max_load <= sds.busiest_load_per_task)
goto out_balanced;
/*
@@ -3383,17 +3412,18 @@ group_next:
* max load less than avg load(as we skip the groups at or below
* its cpu_power, while calculating max_load..)
*/
- if (max_load < avg_load) {
+ if (sds.max_load < sds.avg_load) {
*imbalance = 0;
goto small_imbalance;
}
/* Don't want to pull so many tasks that a group would go idle */
- max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
+ max_pull = min(sds.max_load - sds.avg_load,
+ sds.max_load - sds.busiest_load_per_task);
/* How much load to actually move to equalise the imbalance */
- *imbalance = min(max_pull * busiest->__cpu_power,
- (avg_load - this_load) * this->__cpu_power)
+ *imbalance = min(max_pull * sds.busiest->__cpu_power,
+ (sds.avg_load - sds.this_load) * sds.this->__cpu_power)
/ SCHED_LOAD_SCALE;
/*
@@ -3402,24 +3432,27 @@ group_next:
* a think about bumping its value to force at least one task to be
* moved
*/
- if (*imbalance < busiest_load_per_task) {
+ if (*imbalance < sds.busiest_load_per_task) {
unsigned long tmp, pwr_now, pwr_move;
unsigned int imbn;
small_imbalance:
pwr_move = pwr_now = 0;
imbn = 2;
- if (this_nr_running) {
- this_load_per_task /= this_nr_running;
- if (busiest_load_per_task > this_load_per_task)
+ if (sds.this_nr_running) {
+ sds.this_load_per_task /= sds.this_nr_running;
+ if (sds.busiest_load_per_task >
+ sds.this_load_per_task)
imbn = 1;
} else
- this_load_per_task = cpu_avg_load_per_task(this_cpu);
-
- if (max_load - this_load + busiest_load_per_task >=
- busiest_load_per_task * imbn) {
- *imbalance = busiest_load_per_task;
- return busiest;
+ sds.this_load_per_task =
+ cpu_avg_load_per_task(this_cpu);
+
+ if (sds.max_load - sds.this_load +
+ sds.busiest_load_per_task >=
+ sds.busiest_load_per_task * imbn) {
+ *imbalance = sds.busiest_load_per_task;
+ return sds.busiest;
}
/*
@@ -3428,52 +3461,54 @@ small_imbalance:
* moving them.
*/
- pwr_now += busiest->__cpu_power *
- min(busiest_load_per_task, max_load);
- pwr_now += this->__cpu_power *
- min(this_load_per_task, this_load);
+ pwr_now += sds.busiest->__cpu_power *
+ min(sds.busiest_load_per_task, sds.max_load);
+ pwr_now += sds.this->__cpu_power *
+ min(sds.this_load_per_task, sds.this_load);
pwr_now /= SCHED_LOAD_SCALE;
/* Amount of load we'd subtract */
- tmp = sg_div_cpu_power(busiest,
- busiest_load_per_task * SCHED_LOAD_SCALE);
- if (max_load > tmp)
- pwr_move += busiest->__cpu_power *
- min(busiest_load_per_task, max_load - tmp);
+ tmp = sg_div_cpu_power(sds.busiest,
+ sds.busiest_load_per_task * SCHED_LOAD_SCALE);
+ if (sds.max_load > tmp)
+ pwr_move += sds.busiest->__cpu_power *
+ min(sds.busiest_load_per_task,
+ sds.max_load - tmp);
/* Amount of load we'd add */
- if (max_load * busiest->__cpu_power <
- busiest_load_per_task * SCHED_LOAD_SCALE)
- tmp = sg_div_cpu_power(this,
- max_load * busiest->__cpu_power);
+ if (sds.max_load * sds.busiest->__cpu_power <
+ sds.busiest_load_per_task * SCHED_LOAD_SCALE)
+ tmp = sg_div_cpu_power(sds.this,
+ sds.max_load * sds.busiest->__cpu_power);
else
- tmp = sg_div_cpu_power(this,
- busiest_load_per_task * SCHED_LOAD_SCALE);
- pwr_move += this->__cpu_power *
- min(this_load_per_task, this_load + tmp);
+ tmp = sg_div_cpu_power(sds.this,
+ sds.busiest_load_per_task * SCHED_LOAD_SCALE);
+ pwr_move += sds.this->__cpu_power *
+ min(sds.this_load_per_task,
+ sds.this_load + tmp);
pwr_move /= SCHED_LOAD_SCALE;
/* Move if we gain throughput */
if (pwr_move > pwr_now)
- *imbalance = busiest_load_per_task;
+ *imbalance = sds.busiest_load_per_task;
}
- return busiest;
+ return sds.busiest;
out_balanced:
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
goto ret;
- if (this != group_leader || group_leader == group_min)
+ if (sds.this != sds.group_leader || sds.group_leader == sds.group_min)
goto ret;
- *imbalance = min_load_per_task;
+ *imbalance = sds.min_load_per_task;
if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
- group_first_cpu(group_leader);
+ group_first_cpu(sds.group_leader);
}
- return group_min;
+ return sds.group_min;
#endif
ret:
next prev parent reply other threads:[~2009-03-25 9:16 UTC|newest]
Thread overview: 33+ messages / expand[flat|nested] mbox.gz Atom feed top
2009-03-25 9:13 [RFC PATCH 00/11] sched: find_busiest_group() cleanup Gautham R Shenoy
2009-03-25 9:13 ` [RFC PATCH 01/11] sched: Simple helper functions for find_busiest_group() Gautham R Shenoy
2009-03-25 9:46 ` [tip:sched/balancing] " Gautham R Shenoy
2009-03-25 9:13 ` [RFC PATCH 02/11] sched: Fix indentations in find_busiest_group using gotos Gautham R Shenoy
2009-03-25 9:46 ` [tip:sched/balancing] sched: Fix indentations in find_busiest_group() " Gautham R Shenoy
2009-03-25 9:13 ` [RFC PATCH 03/11] sched: Define structure to store the sched_group statistics for fbg() Gautham R Shenoy
2009-03-25 9:46 ` [tip:sched/balancing] " Gautham R Shenoy
2009-03-25 9:13 ` [RFC PATCH 04/11] sched: Create a helper function to calculate sched_group stats " Gautham R Shenoy
2009-03-25 9:46 ` [tip:sched/balancing] " Gautham R Shenoy
2009-03-25 9:13 ` Gautham R Shenoy [this message]
2009-03-25 9:46 ` [tip:sched/balancing] sched: Define structure to store the sched_domain statistics " Gautham R Shenoy
2009-03-25 9:14 ` [RFC PATCH 06/11] sched: Create a helper function to calculate sched_domain stats " Gautham R Shenoy
2009-03-25 9:46 ` [tip:sched/balancing] " Gautham R Shenoy
2009-03-25 9:14 ` [RFC PATCH 07/11] sched: Create helper to calculate small_imbalance in find_busiest_group Gautham R Shenoy
2009-03-25 9:46 ` [tip:sched/balancing] sched: Create helper to calculate small_imbalance in fbg() Gautham R Shenoy
2009-03-25 9:14 ` [RFC PATCH 08/11] sched: Create a helper function to calculate imbalance Gautham R Shenoy
2009-03-25 9:46 ` [tip:sched/balancing] " Gautham R Shenoy
2009-03-25 9:14 ` [RFC PATCH 09/11] sched: Optimize the !power_savings_balance during find_busiest_group Gautham R Shenoy
2009-03-25 9:47 ` [tip:sched/balancing] sched: Optimize the !power_savings_balance during fbg() Gautham R Shenoy
2009-03-25 9:14 ` [RFC PATCH 10/11] sched: Refactor the power savings balance code Gautham R Shenoy
2009-03-25 9:47 ` [tip:sched/balancing] " Gautham R Shenoy
2009-03-25 9:14 ` [RFC PATCH 11/11] sched: Add comments to find_busiest_group() function Gautham R Shenoy
2009-03-25 9:47 ` [tip:sched/balancing] " Gautham R Shenoy
2009-03-25 11:43 ` [RFC PATCH 11/11] " Gautham R Shenoy
2009-03-25 12:29 ` Ingo Molnar
2009-03-25 13:07 ` Gautham R Shenoy
2009-03-25 13:10 ` Ingo Molnar
2009-03-25 12:30 ` [tip:sched/balancing] " Gautham R Shenoy
2009-03-25 16:04 ` Ray Lee
2009-03-25 16:17 ` Ingo Molnar
2009-03-25 19:17 ` Gautham R Shenoy
2009-03-25 9:30 ` [RFC PATCH 00/11] sched: find_busiest_group() cleanup Ingo Molnar
2009-03-25 9:42 ` Ingo Molnar
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20090325091356.13992.25970.stgit@sofia.in.ibm.com \
--to=ego@in.ibm.com \
--cc=a.p.zijlstra@chello.nl \
--cc=balbir@in.ibm.com \
--cc=bharata@linux.vnet.ibm.com \
--cc=dhaval@linux.vnet.ibm.com \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@elte.hu \
--cc=nickpiggin@yahoo.com.au \
--cc=suresh.b.siddha@intel.com \
--cc=svaidy@linux.vnet.ibm.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.