Re: [RFC][PATCH 2/2] sched: fair-group: per root-domain load balancing

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Gregory Haskins <gregory.haskins@gmail.com>
To: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>,
	Dhaval Giani <dhaval@linux.vnet.ibm.com>,
	Paul Jackson <pj@sgi.com>, Arjan van de Ven <arjan@infradead.org>,
	Ingo Molnar <mingo@elte.hu>, Thomas Gleixner <tglx@linutronix.de>,
	Gregory Haskins <ghaskins@novell.com>,
	LKML <linux-kernel@vger.kernel.org>
Subject: Re: [RFC][PATCH 2/2] sched: fair-group: per root-domain load balancing
Date: Fri, 15 Feb 2008 11:46:25 -0500	[thread overview]
Message-ID: <47B5C1E1.5090706@gmail.com> (raw)
In-Reply-To: <20080214160234.681387000@chello.nl>

[-- Attachment #1: Type: text/plain, Size: 7598 bytes --]

Peter Zijlstra wrote:
> Currently the lb_monitor will walk all the domains/cpus from a single
> cpu's timer interrupt. This will cause massive cache-trashing and cache-line
> bouncing on larger machines.
> 
> Split the lb_monitor into root_domain (disjoint sched-domains).
> 
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> CC: Gregory Haskins <gregory.haskins.ml@gmail.com>
> ---
>  kernel/sched.c      |  106 ++++++++++++++++++++++++++++------------------------
>  kernel/sched_fair.c |    2 
>  2 files changed, 59 insertions(+), 49 deletions(-)
> 
> Index: linux-2.6/kernel/sched.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched.c
> +++ linux-2.6/kernel/sched.c
> @@ -357,8 +357,6 @@ struct lb_monitor {
>  	spinlock_t lock;
>  };
>  
> -static struct lb_monitor lb_monitor;
> -
>  /*
>   * How frequently should we rebalance_shares() across cpus?
>   *
> @@ -417,6 +415,9 @@ static void lb_monitor_wake(struct lb_mo
>  	if (hrtimer_active(&lb_monitor->timer))
>  		return;
>  
> +	/*
> +	 * XXX: rd->load_balance && weight(rd->span) > 1
> +	 */
>  	if (nr_cpu_ids == 1)
>  		return;
>  
> @@ -444,6 +445,11 @@ static void lb_monitor_init(struct lb_mo
>  
>  	spin_lock_init(&lb_monitor->lock);
>  }
> +
> +static int lb_monitor_destroy(struct lb_monitor *lb_monitor)
> +{
> +	return hrtimer_cancel(&lb_monitor->timer);
> +}
>  #endif
>  
>  static void set_se_shares(struct sched_entity *se, unsigned long shares);
> @@ -607,6 +613,8 @@ struct root_domain {
>  	 */
>  	cpumask_t rto_mask;
>  	atomic_t rto_count;
> +
> +	struct lb_monitor lb_monitor;
>  };
>  
>  /*
> @@ -6328,6 +6336,7 @@ static void rq_attach_root(struct rq *rq
>  {
>  	unsigned long flags;
>  	const struct sched_class *class;
> +	int active = 0;
>  
>  	spin_lock_irqsave(&rq->lock, flags);
>  
> @@ -6342,8 +6351,14 @@ static void rq_attach_root(struct rq *rq
>  		cpu_clear(rq->cpu, old_rd->span);
>  		cpu_clear(rq->cpu, old_rd->online);
>  
> -		if (atomic_dec_and_test(&old_rd->refcount))
> +		if (atomic_dec_and_test(&old_rd->refcount)) {
> +			/*
> +			 * sync with active timers.
> +			 */
> +			active = lb_monitor_destroy(&old_rd->lb_monitor);
> +
>  			kfree(old_rd);

Note that this works out to be a bug in my code on -rt since you cannot 
kfree() while the raw rq->lock is held.  This isn't your problem, per 
se, but just a heads up that I might need to patch this area ASAP.

> +		}
>  	}
>  
>  	atomic_inc(&rd->refcount);
> @@ -6358,6 +6373,9 @@ static void rq_attach_root(struct rq *rq
>  			class->join_domain(rq);
>  	}
>  
> +	if (active)
> +		lb_monitor_wake(&rd->lb_monitor);
> +
>  	spin_unlock_irqrestore(&rq->lock, flags);
>  }
>  
> @@ -6367,6 +6385,8 @@ static void init_rootdomain(struct root_
>  
>  	cpus_clear(rd->span);
>  	cpus_clear(rd->online);
> +
> +	lb_monitor_init(&rd->lb_monitor);
>  }
>  
>  static void init_defrootdomain(void)
> @@ -7398,10 +7418,6 @@ void __init sched_init(void)
>  
>  #ifdef CONFIG_SMP
>  	init_defrootdomain();
> -
> -#ifdef CONFIG_FAIR_GROUP_SCHED
> -	lb_monitor_init(&lb_monitor);
> -#endif
>  #endif
>  	init_rt_bandwidth(&def_rt_bandwidth,
>  			global_rt_period(), global_rt_runtime());
> @@ -7631,11 +7647,11 @@ void set_curr_task(int cpu, struct task_
>   * distribute shares of all task groups among their schedulable entities,
>   * to reflect load distribution across cpus.
>   */
> -static int rebalance_shares(struct sched_domain *sd, int this_cpu)
> +static int rebalance_shares(struct root_domain *rd, int this_cpu)
>  {
>  	struct cfs_rq *cfs_rq;
>  	struct rq *rq = cpu_rq(this_cpu);
> -	cpumask_t sdspan = sd->span;
> +	cpumask_t sdspan = rd->span;
>  	int state = shares_idle;
>  
>  	/* Walk thr' all the task groups that we have */
> @@ -7685,50 +7701,12 @@ static int rebalance_shares(struct sched
>  	return state;
>  }
>  
> -static int load_balance_shares(struct lb_monitor *lb_monitor)
> +static void set_lb_monitor_timeout(struct lb_monitor *lb_monitor, int state)
>  {
> -	int i, cpu, state = shares_idle;
>  	u64 max_timeout = (u64)sysctl_sched_max_bal_int_shares * NSEC_PER_MSEC;
>  	u64 min_timeout = (u64)sysctl_sched_min_bal_int_shares * NSEC_PER_MSEC;
>  	u64 timeout;
>  
> -	/* Prevent cpus going down or coming up */
> -	/* get_online_cpus(); */
> -	/* lockout changes to doms_cur[] array */
> -	/* lock_doms_cur(); */
> -	/*
> -	 * Enter a rcu read-side critical section to safely walk rq->sd
> -	 * chain on various cpus and to walk task group list
> -	 * (rq->leaf_cfs_rq_list) in rebalance_shares().
> -	 */
> -	rcu_read_lock();
> -
> -	for (i = 0; i < ndoms_cur; i++) {
> -		cpumask_t cpumap = doms_cur[i];
> -		struct sched_domain *sd = NULL, *sd_prev = NULL;
> -
> -		cpu = first_cpu(cpumap);
> -
> -		/* Find the highest domain at which to balance shares */
> -		for_each_domain(cpu, sd) {
> -			if (!(sd->flags & SD_LOAD_BALANCE))
> -				continue;
> -			sd_prev = sd;
> -		}
> -
> -		sd = sd_prev;
> -		/* sd == NULL? No load balance reqd in this domain */
> -		if (!sd)
> -			continue;
> -
> -		state = max(state, rebalance_shares(sd, cpu));
> -	}
> -
> -	rcu_read_unlock();
> -
> -	/* unlock_doms_cur(); */
> -	/* put_online_cpus(); */
> -
>  	timeout = ktime_to_ns(lb_monitor->timeout);
>  	switch (state) {
>  	case shares_balanced:
> @@ -7741,6 +7719,38 @@ static int load_balance_shares(struct lb
>  		break;
>  	}
>  	lb_monitor->timeout = ns_to_ktime(timeout);
> +}
> +
> +static int load_balance_shares(struct lb_monitor *lb_monitor)
> +{
> +	int cpu = smp_processor_id();
> +	struct rq *rq = cpu_rq(cpu);
> +	struct root_domain *rd;
> +	struct sched_domain *sd, *sd_prev = NULL;
> +	int state = shares_idle;
> +
> +	spin_lock(&rq->lock);
> +	/*
> +	 * root_domain will stay valid until timer exits - synchronized by
> +	 * hrtimer_cancel().
> +	 */
> +	rd = rq->rd;
> +	spin_unlock(&rq->lock);

I know we talked about this on IRC, I'm am still skeptical about this 
part of the code.  Normally we only guarantee the stability of the 
rq->rd pointer while the rq->lock is held or a rd->refcount is added. 
It would be "safer" to bracket this code with an up/down sequence on the 
rd->refcount, but perhaps you can convince me that it is not needed? 
(i.e. I am still not understanding how the timer guarantees the stability).

[up-ref]

> +
> +	/*
> +	 * complicated way to find rd->load_balance
> +	 */
> +	rcu_read_lock();
> +	for_each_domain(cpu, sd) {
> +		if (!(sd->flags & SD_LOAD_BALANCE))
> +			continue;
> +		sd_prev = sd;
> +	}
> +	if (sd_prev)
> +		state = max(state, rebalance_shares(rd, cpu));
> +	rcu_read_unlock();
> +

[down-ref]

We would of course need to re-work the drop-ref code so it could be 
freed independent of the rq_attach_root() function, but that should be 
trivial.

> +	set_lb_monitor_timeout(lb_monitor, state);
>  
>  	return state;
>  }
> Index: linux-2.6/kernel/sched_fair.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched_fair.c
> +++ linux-2.6/kernel/sched_fair.c
> @@ -553,7 +553,7 @@ account_entity_enqueue(struct cfs_rq *cf
>  	se->on_rq = 1;
>  	list_add(&se->group_node, &cfs_rq->tasks);
>  #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
> -	lb_monitor_wake(&lb_monitor);
> +	lb_monitor_wake(&rq_of(cfs_rq)->rd->lb_monitor);
>  #endif
>  }
>  
> 
> --
> 
> 



[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 250 bytes --]

next prev parent reply	other threads:[~2008-02-15 16:47 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-02-14 15:57 [RFC][PATCH 0/2] reworking load_balance_monitor Peter Zijlstra
2008-02-14 15:57 ` [RFC][PATCH 1/2] sched: fair-group: rework load_balance_monitor Peter Zijlstra
2008-02-14 15:57 ` [RFC][PATCH 2/2] sched: fair-group: per root-domain load balancing Peter Zijlstra
2008-02-15 16:46   ` Gregory Haskins [this message]
2008-02-15 19:46     ` Peter Zijlstra
2008-02-19 12:42       ` Gregory Haskins
2008-02-14 16:09 ` [RFC][PATCH 0/2] reworking load_balance_monitor Gregory Haskins
2008-02-14 18:15 ` Paul Jackson
2008-02-14 19:16   ` Gregory Haskins
2008-02-18  8:24 ` Dhaval Giani

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=47B5C1E1.5090706@gmail.com \
    --to=gregory.haskins@gmail.com \
    --cc=a.p.zijlstra@chello.nl \
    --cc=arjan@infradead.org \
    --cc=dhaval@linux.vnet.ibm.com \
    --cc=ghaskins@novell.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=pj@sgi.com \
    --cc=tglx@linutronix.de \
    --cc=vatsa@linux.vnet.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.