All of lore.kernel.org
 help / color / mirror / Atom feed
From: Peter Zijlstra <peterz@infradead.org>
To: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>,
	LKML <linux-kernel@vger.kernel.org>,
	"Ma, Ling" <ling.ma@intel.com>,
	"Zhang, Yanmin" <yanmin_zhang@linux.intel.com>,
	"ego@in.ibm.com" <ego@in.ibm.com>,
	"svaidy@linux.vnet.ibm.com" <svaidy@linux.vnet.ibm.com>
Subject: Re: change in sched cpu_power causing regressions with SCHED_MC
Date: Mon, 22 Feb 2010 19:50:18 +0100	[thread overview]
Message-ID: <1266864618.6122.472.camel@laptop> (raw)
In-Reply-To: <1266628424.4729.23.camel@sbs-t61.sc.intel.com>

On Fri, 2010-02-19 at 17:13 -0800, Suresh Siddha wrote:

> Ok Peter. There is another place that is scaling load_per_task with
> cpu_power but later comparing with the difference of max and min of the
> actual cpu load. :(
> 
>         avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
>                 group->cpu_power;
> 
>         if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
>                 sgs->group_imb = 1;
> 
> Fixing this seems to have fixed the problem you mentioned. Can you
> please checkout the appended patch? If everything seems ok, then I will
> send the patch (against -tip tree) on monday morning with the detailed
> changelog.

Yes, this one does seem to generate the intended behaviour and does look
good (after cleaning up some of the now redundant comments).

Thanks!

> Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
> ---
> 
> diff --git a/kernel/sched.c b/kernel/sched.c
> index 3a8fb30..213b445 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -3423,6 +3423,7 @@ struct sd_lb_stats {
>  	unsigned long max_load;
>  	unsigned long busiest_load_per_task;
>  	unsigned long busiest_nr_running;
> +	unsigned long busiest_group_capacity;
>  
>  	int group_imb; /* Is there imbalance in this sd */
>  #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
> @@ -3742,8 +3743,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
>  	unsigned long load, max_cpu_load, min_cpu_load;
>  	int i;
>  	unsigned int balance_cpu = -1, first_idle_cpu = 0;
> -	unsigned long sum_avg_load_per_task;
> -	unsigned long avg_load_per_task;
> +	unsigned long avg_load_per_task = 0;
>  
>  	if (local_group) {
>  		balance_cpu = group_first_cpu(group);
> @@ -3752,7 +3752,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
>  	}
>  
>  	/* Tally up the load of all CPUs in the group */
> -	sum_avg_load_per_task = avg_load_per_task = 0;
>  	max_cpu_load = 0;
>  	min_cpu_load = ~0UL;
>  
> @@ -3782,7 +3781,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
>  		sgs->sum_nr_running += rq->nr_running;
>  		sgs->sum_weighted_load += weighted_cpuload(i);
>  
> -		sum_avg_load_per_task += cpu_avg_load_per_task(i);
>  	}
>  
>  	/*
> @@ -3801,6 +3799,9 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
>  	sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
>  
> 
> +	if (sgs->sum_nr_running)
> +		avg_load_per_task =
> +				sgs->sum_weighted_load / sgs->sum_nr_running;
>  	/*
>  	 * Consider the group unbalanced when the imbalance is larger
>  	 * than the average weight of two tasks.
> @@ -3810,9 +3811,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
>  	 *      normalized nr_running number somewhere that negates
>  	 *      the hierarchy?
>  	 */
> -	avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
> -		group->cpu_power;
> -
>  	if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
>  		sgs->group_imb = 1;
>  
> @@ -3880,6 +3878,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
>  			sds->max_load = sgs.avg_load;
>  			sds->busiest = group;
>  			sds->busiest_nr_running = sgs.sum_nr_running;
> +			sds->busiest_group_capacity = sgs.group_capacity;
>  			sds->busiest_load_per_task = sgs.sum_weighted_load;
>  			sds->group_imb = sgs.group_imb;
>  		}
> @@ -3902,6 +3901,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
>  {
>  	unsigned long tmp, pwr_now = 0, pwr_move = 0;
>  	unsigned int imbn = 2;
> +	unsigned long scaled_busy_load_per_task;
>  
>  	if (sds->this_nr_running) {
>  		sds->this_load_per_task /= sds->this_nr_running;
> @@ -3912,8 +3912,12 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
>  		sds->this_load_per_task =
>  			cpu_avg_load_per_task(this_cpu);
>  
> -	if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
> -			sds->busiest_load_per_task * imbn) {
> +	scaled_busy_load_per_task = sds->busiest_load_per_task
> +						 * SCHED_LOAD_SCALE;
> +	scaled_busy_load_per_task /= sds->busiest->cpu_power;
> +
> +	if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
> +			(scaled_busy_load_per_task * imbn)) {
>  		*imbalance = sds->busiest_load_per_task;
>  		return;
>  	}
> @@ -3964,7 +3968,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
>  static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
>  		unsigned long *imbalance)
>  {
> -	unsigned long max_pull;
> +	unsigned long max_pull, load_above_capacity = ~0UL;
>  	/*
>  	 * In the presence of smp nice balancing, certain scenarios can have
>  	 * max load less than avg load(as we skip the groups at or below
> @@ -3975,9 +3979,30 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
>  		return fix_small_imbalance(sds, this_cpu, imbalance);
>  	}
>  
> -	/* Don't want to pull so many tasks that a group would go idle */
> -	max_pull = min(sds->max_load - sds->avg_load,
> -			sds->max_load - sds->busiest_load_per_task);
> +	if (!sds->group_imb) {
> +		/*
> + 	 	 * Don't want to pull so many tasks that a group would go idle.
> +	 	 */
> +		load_above_capacity = (sds->busiest_nr_running - 
> +						sds->busiest_group_capacity);
> +
> +		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE);
> +	
> +		load_above_capacity /= sds->busiest->cpu_power;
> +	}
> +
> +	/*
> +	 * We're trying to get all the cpus to the average_load, so we don't
> +	 * want to push ourselves above the average load, nor do we wish to
> +	 * reduce the max loaded cpu below the average load, as either of these
> +	 * actions would just result in more rebalancing later, and ping-pong
> +	 * tasks around. Thus we look for the minimum possible imbalance.
> +	 * Negative imbalances (*we* are more loaded than anyone else) will
> +	 * be counted as no imbalance for these purposes -- we can't fix that
> +	 * by pulling tasks to us. Be careful of negative numbers as they'll
> +	 * appear as very large values with unsigned longs.
> +	 */
> +	max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
>  
>  	/* How much load to actually move to equalise the imbalance */
>  	*imbalance = min(max_pull * sds->busiest->cpu_power,
> @@ -4069,19 +4094,6 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
>  		sds.busiest_load_per_task =
>  			min(sds.busiest_load_per_task, sds.avg_load);
>  
> -	/*
> -	 * We're trying to get all the cpus to the average_load, so we don't
> -	 * want to push ourselves above the average load, nor do we wish to
> -	 * reduce the max loaded cpu below the average load, as either of these
> -	 * actions would just result in more rebalancing later, and ping-pong
> -	 * tasks around. Thus we look for the minimum possible imbalance.
> -	 * Negative imbalances (*we* are more loaded than anyone else) will
> -	 * be counted as no imbalance for these purposes -- we can't fix that
> -	 * by pulling tasks to us. Be careful of negative numbers as they'll
> -	 * appear as very large values with unsigned longs.
> -	 */
> -	if (sds.max_load <= sds.busiest_load_per_task)
> -		goto out_balanced;
>  
>  	/* Looks like there is an imbalance. Compute it */
>  	calculate_imbalance(&sds, this_cpu, imbalance);
> 
>  
> 
> 



  reply	other threads:[~2010-02-22 18:50 UTC|newest]

Thread overview: 40+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-02-13  1:14 [patch] sched: fix SMT scheduler regression in find_busiest_queue() Suresh Siddha
2010-02-13  1:31 ` change in sched cpu_power causing regressions with SCHED_MC Suresh Siddha
2010-02-13 10:36   ` Peter Zijlstra
2010-02-13 10:42     ` Peter Zijlstra
2010-02-13 18:37       ` Vaidyanathan Srinivasan
2010-02-13 18:49         ` Suresh Siddha
2010-02-13 18:39     ` Vaidyanathan Srinivasan
2010-02-19  2:16     ` Suresh Siddha
2010-02-19 12:32       ` Arun R Bharadwaj
2010-02-19 13:03       ` Vaidyanathan Srinivasan
2010-02-19 19:15         ` Suresh Siddha
2010-02-19 14:05       ` Peter Zijlstra
2010-02-19 18:36         ` Suresh Siddha
2010-02-19 19:47           ` Peter Zijlstra
2010-02-19 19:50             ` Suresh Siddha
2010-02-19 20:02               ` Peter Zijlstra
2010-02-20  1:13                 ` Suresh Siddha
2010-02-22 18:50                   ` Peter Zijlstra [this message]
2010-02-24  0:13                     ` Suresh Siddha
2010-02-24 17:43                       ` Peter Zijlstra
2010-02-24 19:31                         ` Suresh Siddha
2010-02-26 10:24                       ` [tip:sched/core] sched: Fix SCHED_MC regression caused by change in sched cpu_power tip-bot for Suresh Siddha
2010-02-26 14:55                       ` tip-bot for Suresh Siddha
2010-02-19 19:52           ` change in sched cpu_power causing regressions with SCHED_MC Peter Zijlstra
2010-02-13 18:33   ` Vaidyanathan Srinivasan
2010-02-13 18:27 ` [patch] sched: fix SMT scheduler regression in find_busiest_queue() Vaidyanathan Srinivasan
2010-02-13 18:39   ` Suresh Siddha
2010-02-13 18:56     ` Vaidyanathan Srinivasan
2010-02-13 20:25   ` Vaidyanathan Srinivasan
2010-02-13 20:36     ` Vaidyanathan Srinivasan
2010-02-14 10:11       ` Peter Zijlstra
2010-02-15 12:35         ` Vaidyanathan Srinivasan
2010-02-15 13:00           ` Peter Zijlstra
2010-02-16 15:59             ` Vaidyanathan Srinivasan
2010-02-16 17:28               ` Peter Zijlstra
2010-02-16 18:25                 ` Vaidyanathan Srinivasan
2010-02-16 18:46                   ` Vaidyanathan Srinivasan
2010-02-16 18:48                   ` Peter Zijlstra
2010-02-15 22:29 ` Peter Zijlstra
2010-02-16 14:16 ` [tip:sched/urgent] sched: Fix " tip-bot for Suresh Siddha

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1266864618.6122.472.camel@laptop \
    --to=peterz@infradead.org \
    --cc=ego@in.ibm.com \
    --cc=ling.ma@intel.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@redhat.com \
    --cc=suresh.b.siddha@intel.com \
    --cc=svaidy@linux.vnet.ibm.com \
    --cc=yanmin_zhang@linux.intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.