Re: [RFC][PATCH 8/8] sched: remove reciprocal for cpu_power

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Andreas Herrmann <andreas.herrmann3@amd.com>
To: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Ingo Molnar <mingo@elte.hu>,
	linux-kernel@vger.kernel.org, Gautham R Shenoy <ego@in.ibm.com>,
	Balbir Singh <balbir@in.ibm.com>
Subject: Re: [RFC][PATCH 8/8] sched: remove reciprocal for cpu_power
Date: Thu, 3 Sep 2009 14:12:27 +0200	[thread overview]
Message-ID: <20090903121227.GL7216@alberich.amd.com> (raw)
In-Reply-To: <20090901083826.425896304@chello.nl>

On Tue, Sep 01, 2009 at 10:34:39AM +0200, Peter Zijlstra wrote:
> It's a source of failure; also, now that cpu_power is dynamic, it's a
> waste of time.
> 
> before:
> <idle>-0   [000]   132.877936: find_busiest_group: avg_load: 0 group_load: 8241 power: 1 
> 
> after:
> bash-1689  [001]   137.862151: find_busiest_group: avg_load: 10636288 group_load: 10387 power: 1
> 
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> ---
>  include/linux/sched.h |   10 +----
>  kernel/sched.c        |  100 +++++++++++++++++---------------------------------
>  2 files changed, 36 insertions(+), 74 deletions(-)
> 
> Index: linux-2.6/kernel/sched.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched.c
> +++ linux-2.6/kernel/sched.c
> @@ -120,30 +120,8 @@
>   */
>  #define RUNTIME_INF	((u64)~0ULL)
>  
> -#ifdef CONFIG_SMP
> -
>  static void double_rq_lock(struct rq *rq1, struct rq *rq2);
>  
> -/*
> - * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
> - * Since cpu_power is a 'constant', we can use a reciprocal divide.
> - */
> -static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
> -{
> -	return reciprocal_divide(load, sg->reciprocal_cpu_power);
> -}
> -
> -/*
> - * Each time a sched group cpu_power is changed,
> - * we must compute its reciprocal value
> - */
> -static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
> -{
> -	sg->__cpu_power += val;
> -	sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
> -}
> -#endif
> -
>  static inline int rt_policy(int policy)
>  {
>  	if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
> @@ -2335,8 +2313,7 @@ find_idlest_group(struct sched_domain *s
>  		}
>  
>  		/* Adjust by relative CPU power of the group */
> -		avg_load = sg_div_cpu_power(group,
> -				avg_load * SCHED_LOAD_SCALE);
> +		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
>  
>  		if (local_group) {
>  			this_load = avg_load;
> @@ -3768,7 +3745,6 @@ static void update_cpu_power(struct sche
>  	unsigned long weight = cpumask_weight(sched_domain_span(sd));
>  	unsigned long power = SCHED_LOAD_SCALE;
>  	struct sched_group *sdg = sd->groups;
> -	unsigned long old = sdg->__cpu_power;
>  
>  	/* here we could scale based on cpufreq */
>  
> @@ -3783,33 +3759,26 @@ static void update_cpu_power(struct sche
>  	if (!power)
>  		power = 1;
>  
> -	if (power != old) {
> -		sdg->__cpu_power = power;
> -		sdg->reciprocal_cpu_power = reciprocal_value(power);
> -	}
> +	sdg->cpu_power = power;
>  }
>  
>  static void update_group_power(struct sched_domain *sd, int cpu)
>  {
>  	struct sched_domain *child = sd->child;
>  	struct sched_group *group, *sdg = sd->groups;
> -	unsigned long power = sdg->__cpu_power;
>  
>  	if (!child) {
>  		update_cpu_power(sd, cpu);
>  		return;
>  	}
>  
> -	sdg->__cpu_power = 0;
> +	sdg->cpu_power = 0;
>  
>  	group = child->groups;
>  	do {
> -		sdg->__cpu_power += group->__cpu_power;
> +		sdg->cpu_power += group->cpu_power;
>  		group = group->next;
>  	} while (group != child->groups);
> -
> -	if (power != sdg->__cpu_power)
> -		sdg->reciprocal_cpu_power = reciprocal_value(sdg->__cpu_power);
>  }
>  
>  /**
> @@ -3889,8 +3858,7 @@ static inline void update_sg_lb_stats(st
>  	}
>  
>  	/* Adjust by relative CPU power of the group */
> -	sgs->avg_load = sg_div_cpu_power(group,
> -			sgs->group_load * SCHED_LOAD_SCALE);
> +	sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
>  
>  
>  	/*
> @@ -3902,14 +3870,14 @@ static inline void update_sg_lb_stats(st
>  	 *      normalized nr_running number somewhere that negates
>  	 *      the hierarchy?
>  	 */
> -	avg_load_per_task = sg_div_cpu_power(group,
> -			sum_avg_load_per_task * SCHED_LOAD_SCALE);
> +	avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
> +		group->cpu_power;
>  
>  	if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
>  		sgs->group_imb = 1;
>  
>  	sgs->group_capacity =
> -		DIV_ROUND_CLOSEST(group->__cpu_power, SCHED_LOAD_SCALE);
> +		DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
>  }
>  
>  /**
> @@ -3951,7 +3919,7 @@ static inline void update_sd_lb_stats(st
>  			return;
>  
>  		sds->total_load += sgs.group_load;
> -		sds->total_pwr += group->__cpu_power;
> +		sds->total_pwr += group->cpu_power;
>  
>  		/*
>  		 * In case the child domain prefers tasks go to siblings
> @@ -4016,28 +3984,28 @@ static inline void fix_small_imbalance(s
>  	 * moving them.
>  	 */
>  
> -	pwr_now += sds->busiest->__cpu_power *
> +	pwr_now += sds->busiest->cpu_power *
>  			min(sds->busiest_load_per_task, sds->max_load);
> -	pwr_now += sds->this->__cpu_power *
> +	pwr_now += sds->this->cpu_power *
>  			min(sds->this_load_per_task, sds->this_load);
>  	pwr_now /= SCHED_LOAD_SCALE;
>  
>  	/* Amount of load we'd subtract */
> -	tmp = sg_div_cpu_power(sds->busiest,
> -			sds->busiest_load_per_task * SCHED_LOAD_SCALE);
> +	tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
> +		sds->busiest->cpu_power;
>  	if (sds->max_load > tmp)
> -		pwr_move += sds->busiest->__cpu_power *
> +		pwr_move += sds->busiest->cpu_power *
>  			min(sds->busiest_load_per_task, sds->max_load - tmp);
>  
>  	/* Amount of load we'd add */
> -	if (sds->max_load * sds->busiest->__cpu_power <
> +	if (sds->max_load * sds->busiest->cpu_power <
>  		sds->busiest_load_per_task * SCHED_LOAD_SCALE)
> -		tmp = sg_div_cpu_power(sds->this,
> -			sds->max_load * sds->busiest->__cpu_power);
> +		tmp = (sds->max_load * sds->busiest->cpu_power) /
> +			sds->this->cpu_power;
>  	else
> -		tmp = sg_div_cpu_power(sds->this,
> -			sds->busiest_load_per_task * SCHED_LOAD_SCALE);
> -	pwr_move += sds->this->__cpu_power *
> +		tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
> +			sds->this->cpu_power;
> +	pwr_move += sds->this->cpu_power *
>  			min(sds->this_load_per_task, sds->this_load + tmp);
>  	pwr_move /= SCHED_LOAD_SCALE;
>  
> @@ -4072,8 +4040,8 @@ static inline void calculate_imbalance(s
>  			sds->max_load - sds->busiest_load_per_task);
>  
>  	/* How much load to actually move to equalise the imbalance */
> -	*imbalance = min(max_pull * sds->busiest->__cpu_power,
> -		(sds->avg_load - sds->this_load) * sds->this->__cpu_power)
> +	*imbalance = min(max_pull * sds->busiest->cpu_power,
> +		(sds->avg_load - sds->this_load) * sds->this->cpu_power)
>  			/ SCHED_LOAD_SCALE;
>  
>  	/*
> @@ -4208,7 +4176,7 @@ static unsigned long power_of(int cpu)
>  	if (!group)
>  		return SCHED_LOAD_SCALE;
>  
> -	return group->__cpu_power;
> +	return group->cpu_power;
>  }
>  
>  /*
> @@ -7934,7 +7902,7 @@ static int sched_domain_debug_one(struct
>  			break;
>  		}
>  
> -		if (!group->__cpu_power) {
> +		if (!group->cpu_power) {
>  			printk(KERN_CONT "\n");
>  			printk(KERN_ERR "ERROR: domain->cpu_power not "
>  					"set\n");
> @@ -7958,9 +7926,9 @@ static int sched_domain_debug_one(struct
>  		cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
>  
>  		printk(KERN_CONT " %s", str);
> -		if (group->__cpu_power != SCHED_LOAD_SCALE) {
> -			printk(KERN_CONT " (__cpu_power = %d)",
> -				group->__cpu_power);
> +		if (group->cpu_power != SCHED_LOAD_SCALE) {
> +			printk(KERN_CONT " (cpu_power = %d)",
> +				group->cpu_power);
>  		}
>  
>  		group = group->next;
> @@ -8245,7 +8213,7 @@ init_sched_build_groups(const struct cpu
>  			continue;
>  
>  		cpumask_clear(sched_group_cpus(sg));
> -		sg->__cpu_power = 0;
> +		sg->cpu_power = 0;
>  
>  		for_each_cpu(j, span) {
>  			if (group_fn(j, cpu_map, NULL, tmpmask) != group)
> @@ -8503,7 +8471,7 @@ static void init_numa_sched_groups_power
>  				continue;
>  			}
>  
> -			sg_inc_cpu_power(sg, sd->groups->__cpu_power);
> +			sg->cpu_power += sd->groups->cpu_power;
>  		}
>  		sg = sg->next;
>  	} while (sg != group_head);
> @@ -8540,7 +8508,7 @@ static int build_numa_sched_groups(struc
>  		sd->groups = sg;
>  	}
>  
> -	sg->__cpu_power = 0;
> +	sg->cpu_power = 0;
>  	cpumask_copy(sched_group_cpus(sg), d->nodemask);
>  	sg->next = sg;
>  	cpumask_or(d->covered, d->covered, d->nodemask);
> @@ -8563,7 +8531,7 @@ static int build_numa_sched_groups(struc
>  			       "Can not alloc domain group for node %d\n", j);
>  			return -ENOMEM;
>  		}
> -		sg->__cpu_power = 0;
> +		sg->cpu_power = 0;
>  		cpumask_copy(sched_group_cpus(sg), d->tmpmask);
>  		sg->next = prev->next;
>  		cpumask_or(d->covered, d->covered, d->tmpmask);
> @@ -8641,7 +8609,7 @@ static void init_sched_groups_power(int 
>  
>  	child = sd->child;
>  
> -	sd->groups->__cpu_power = 0;
> +	sd->groups->cpu_power = 0;
>  
>  	if (!child) {
>  		power = SCHED_LOAD_SCALE;
> @@ -8657,7 +8625,7 @@ static void init_sched_groups_power(int 
>  			power /= weight;
>  			power >>= SCHED_LOAD_SHIFT;
>  		}
> -		sg_inc_cpu_power(sd->groups, power);
> +		sd->groups->cpu_power += power;
>  		return;
>  	}
>  
> @@ -8666,7 +8634,7 @@ static void init_sched_groups_power(int 
>  	 */
>  	group = child->groups;
>  	do {
> -		sg_inc_cpu_power(sd->groups, group->__cpu_power);
> +		sd->groups->cpu_power += group->cpu_power;
>  		group = group->next;
>  	} while (group != child->groups);
>  }
> Index: linux-2.6/include/linux/sched.h
> ===================================================================
> --- linux-2.6.orig/include/linux/sched.h
> +++ linux-2.6/include/linux/sched.h
> @@ -870,15 +870,9 @@ struct sched_group {
>  
>  	/*
>  	 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
> -	 * single CPU. This is read only (except for setup, hotplug CPU).
> -	 * Note : Never change cpu_power without recompute its reciprocal
> +	 * single CPU.
>  	 */
> -	unsigned int __cpu_power;
> -	/*
> -	 * reciprocal value of cpu_power to avoid expensive divides
> -	 * (see include/linux/reciprocal_div.h)
> -	 */
> -	u32 reciprocal_cpu_power;
> +	unsigned int cpu_power;
>  
>  	/*
>  	 * The CPUs this group covers.
> 
> -- 
> 

Nice.
You might also want to remove the now-unused #include of the respective header file:

--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -64,7 +64,6 @@
 #include <linux/tsacct_kern.h>
 #include <linux/kprobes.h>
 #include <linux/delayacct.h>
-#include <linux/reciprocal_div.h>
 #include <linux/unistd.h>
 #include <linux/pagemap.h>
 #include <linux/hrtimer.h>


Regards,
Andreas

-- 
Operating | Advanced Micro Devices GmbH
  System  | Karl-Hammerschmidt-Str. 34, 85609 Dornach b. München, Germany
 Research | Geschäftsführer: Andrew Bowd, Thomas M. McCoy, Giuliano Meroni
  Center  | Sitz: Dornach, Gemeinde Aschheim, Landkreis München
  (OSRC)  | Registergericht München, HRB Nr. 43632

next prev parent reply	other threads:[~2009-09-03 12:12 UTC|newest]

Thread overview: 30+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-09-01  8:34 [RFC][PATCH 0/8] load-balancing and cpu_power -v2 Peter Zijlstra
2009-09-01  8:34 ` [RFC][PATCH 1/8] sched: restore __cpu_power to a straight sum of power Peter Zijlstra
2009-09-04  8:54   ` [tip:sched/balancing] sched: Restore " tip-bot for Peter Zijlstra
2009-09-01  8:34 ` [RFC][PATCH 2/8] sched: SD_PREFER_SIBLING Peter Zijlstra
2009-09-04  8:55   ` [tip:sched/balancing] sched: Add SD_PREFER_SIBLING tip-bot for Peter Zijlstra
2009-09-01  8:34 ` [RFC][PATCH 3/8] sched: update the cpu_power sum during load-balance Peter Zijlstra
2009-09-02 11:17   ` Gautham R Shenoy
2009-09-02 11:25     ` Peter Zijlstra
2009-09-04  8:55   ` [tip:sched/balancing] sched: Update " tip-bot for Peter Zijlstra
2009-09-01  8:34 ` [RFC][PATCH 4/8] sched: add smt_gain Peter Zijlstra
2009-09-02 11:22   ` Gautham R Shenoy
2009-09-02 11:26     ` Peter Zijlstra
2009-09-04  8:55   ` [tip:sched/balancing] sched: Add smt_gain tip-bot for Peter Zijlstra
2009-09-01  8:34 ` [RFC][PATCH 5/8] sched: dynamic cpu_power Peter Zijlstra
2009-09-02 11:24   ` Gautham R Shenoy
2009-09-04  8:55   ` [tip:sched/balancing] sched: Implement " tip-bot for Peter Zijlstra
2009-09-01  8:34 ` [RFC][PATCH 6/8] sched: scale down cpu_power due to RT tasks Peter Zijlstra
2009-09-04  8:56   ` [tip:sched/balancing] sched: Scale " tip-bot for Peter Zijlstra
2009-09-01  8:34 ` [RFC][PATCH 7/8] sched: try to deal with low capacity Peter Zijlstra
2009-09-02 11:29   ` Gautham R Shenoy
2009-09-04  8:56   ` [tip:sched/balancing] sched: Try " tip-bot for Peter Zijlstra
2009-09-01  8:34 ` [RFC][PATCH 8/8] sched: remove reciprocal for cpu_power Peter Zijlstra
2009-09-03 12:12   ` Andreas Herrmann [this message]
2009-09-04  8:56   ` [tip:sched/balancing] sched: Remove " tip-bot for Peter Zijlstra
2009-09-02 10:57 ` [RFC][PATCH 0/8] load-balancing and cpu_power -v2 Gautham R Shenoy
2009-09-03 12:10 ` Andreas Herrmann
2009-09-03 13:38   ` Peter Zijlstra
2009-09-04  7:19   ` Ingo Molnar
2009-09-04  9:27     ` [crash] " Ingo Molnar
2009-09-04 10:25       ` [tip:sched/balancing] sched: Fix dynamic power-balancing crash tip-bot for Ingo Molnar

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20090903121227.GL7216@alberich.amd.com \
    --to=andreas.herrmann3@amd.com \
    --cc=a.p.zijlstra@chello.nl \
    --cc=balbir@in.ibm.com \
    --cc=ego@in.ibm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.