public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
To: Paul Turner <pjt@google.com>
Cc: linux-kernel@vger.kernel.org,
	Peter Zijlstra <a.p.zijlstra@chello.nl>,
	Bharata B Rao <bharata@linux.vnet.ibm.com>,
	Dhaval Giani <dhaval.giani@gmail.com>,
	Balbir Singh <bsingharora@gmail.com>,
	Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>,
	Srivatsa Vaddagiri <vatsa@in.ibm.com>,
	Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>,
	Ingo Molnar <mingo@elte.hu>, Pavel Emelyanov <xemul@openvz.org>,
	Jason Baron <jbaron@redhat.com>
Subject: Re: [patch 11/18] sched: prevent interactions with throttled entities
Date: Fri, 22 Jul 2011 17:11:41 +0530	[thread overview]
Message-ID: <20110722114141.GA29349@linux.vnet.ibm.com> (raw)
In-Reply-To: <20110721184757.777916795@google.com>

* Paul Turner <pjt@google.com> [2011-07-21 09:43:36]:

> From the perspective of load-balance and shares distribution, throttled
> entities should be invisible.
> 
> However, both of these operations work on 'active' lists and are not
> inherently aware of what group hierarchies may be present.  In some cases this
> may be side-stepped (e.g. we could sideload via tg_load_down in load balance) 
> while in others (e.g. update_shares()) it is more difficult to compute without
> incurring some O(n^2) costs.
> 
> Instead, track hierarchical throttled state at time of transition.  This
> allows us to easily identify whether an entity belongs to a throttled hierarchy
> and avoid incorrect interactions with it.
> 
> Also, when an entity leaves a throttled hierarchy we need to advance its
> time averaging for shares averaging so that the elapsed throttled time is not
> considered as part of the cfs_rq's operation.
> 
> We also use this information to prevent buddy interactions in the wakeup and
> yield_to() paths.
> 
> Signed-off-by: Paul Turner <pjt@google.com>
> Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
> 
> ---
>  kernel/sched.c      |    2 -
>  kernel/sched_fair.c |   99 ++++++++++++++++++++++++++++++++++++++++++++++++----
>  2 files changed, 94 insertions(+), 7 deletions(-)
> 
> Index: tip/kernel/sched_fair.c
> ===================================================================
> --- tip.orig/kernel/sched_fair.c
> +++ tip/kernel/sched_fair.c
> @@ -725,6 +725,8 @@ account_entity_dequeue(struct cfs_rq *cf
>  }
> 
>  #ifdef CONFIG_FAIR_GROUP_SCHED
> +/* we need this in update_cfs_load and load-balance functions below */
> +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
>  # ifdef CONFIG_SMP
>  static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
>  					    int global_update)
> @@ -747,7 +749,7 @@ static void update_cfs_load(struct cfs_r
>  	u64 now, delta;
>  	unsigned long load = cfs_rq->load.weight;
> 
> -	if (cfs_rq->tg == &root_task_group)
> +	if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
>  		return;
> 
>  	now = rq_of(cfs_rq)->clock_task;
> @@ -856,7 +858,7 @@ static void update_cfs_shares(struct cfs
> 
>  	tg = cfs_rq->tg;
>  	se = tg->se[cpu_of(rq_of(cfs_rq))];
> -	if (!se)
> +	if (!se || throttled_hierarchy(cfs_rq))
>  		return;
>  #ifndef CONFIG_SMP
>  	if (likely(se->load.weight == tg->shares))
> @@ -1425,6 +1427,65 @@ static inline int cfs_rq_throttled(struc
>  	return cfs_rq->throttled;
>  }
> 
> +/* check whether cfs_rq, or any parent, is throttled */
> +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
> +{
> +	return cfs_rq->throttle_count;
> +}
> +
> +/*
> + * Ensure that neither of the group entities corresponding to src_cpu or
> + * dest_cpu are members of a throttled hierarchy when performing group
> + * load-balance operations.
> + */
> +static inline int throttled_lb_pair(struct task_group *tg,
> +				    int src_cpu, int dest_cpu)
> +{
> +	struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
> +
> +	src_cfs_rq = tg->cfs_rq[src_cpu];
> +	dest_cfs_rq = tg->cfs_rq[dest_cpu];
> +
> +	return throttled_hierarchy(src_cfs_rq) ||
> +	       throttled_hierarchy(dest_cfs_rq);
> +}
> +
> +/* updated child weight may affect parent so we have to do this bottom up */
> +static int tg_unthrottle_up(struct task_group *tg, void *data)
> +{
> +	struct rq *rq = data;
> +	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
> +
> +	cfs_rq->throttle_count--;
> +#ifdef CONFIG_SMP
> +	if (!cfs_rq->throttle_count) {
> +		u64 delta = rq->clock_task - cfs_rq->load_stamp;
> +
> +		/* leaving throttled state, advance shares averaging windows */
> +		cfs_rq->load_stamp += delta;
> +		cfs_rq->load_last += delta;
> +
> +		/* update entity weight now that we are on_rq again */
> +		update_cfs_shares(cfs_rq);
> +	}
> +#endif
> +
> +	return 0;
> +}
> +
> +static int tg_throttle_down(struct task_group *tg, void *data)
> +{
> +	struct rq *rq = data;
> +	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
> +
> +	/* group is entering throttled state, record last load */
> +	if (!cfs_rq->throttle_count)
> +		update_cfs_load(cfs_rq, 0);
> +	cfs_rq->throttle_count++;
> +
> +	return 0;
> +}
> +
>  static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq)
>  {
>  	struct rq *rq = rq_of(cfs_rq);
> @@ -1435,7 +1496,9 @@ static __used void throttle_cfs_rq(struc
>  	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
> 
>  	/* account load preceding throttle */
> -	update_cfs_load(cfs_rq, 0);
> +	rcu_read_lock();
> +	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
> +	rcu_read_unlock();
> 
>  	task_delta = cfs_rq->h_nr_running;
>  	for_each_sched_entity(se) {
> @@ -1476,6 +1539,10 @@ static void unthrottle_cfs_rq(struct cfs
>  	list_del_rcu(&cfs_rq->throttled_list);
>  	raw_spin_unlock(&cfs_b->lock);
> 
> +	update_rq_clock(rq);
> +	/* update hierarchical throttle state */
> +	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
> +
>  	if (!cfs_rq->load.weight)
>  		return;
> 
> @@ -1620,6 +1687,17 @@ static inline int cfs_rq_throttled(struc
>  {
>  	return 0;
>  }
> +
> +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
> +{
> +	return 0;
> +}
> +
> +static inline int throttled_lb_pair(struct task_group *tg,
> +				    int src_cpu, int dest_cpu)
> +{
> +	return 0;
> +}
>  #endif
> 
>  /**************************************************
> @@ -2519,6 +2597,9 @@ move_one_task(struct rq *this_rq, int th
> 
>  	for_each_leaf_cfs_rq(busiest, cfs_rq) {
>  		list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
> +			if (throttled_lb_pair(task_group(p),
> +					      busiest->cpu, this_cpu))
> +				break;
> 
>  			if (!can_migrate_task(p, busiest, this_cpu,
>  						sd, idle, &pinned))
> @@ -2630,8 +2711,13 @@ static void update_shares(int cpu)
>  	struct rq *rq = cpu_rq(cpu);
> 
>  	rcu_read_lock();
> -	for_each_leaf_cfs_rq(rq, cfs_rq)
> +	for_each_leaf_cfs_rq(rq, cfs_rq) {
> +		/* throttled entities do not contribute to load */
> +		if (throttled_hierarchy(cfs_rq))
> +			continue;
> +
>  		update_shares_cpu(cfs_rq->tg, cpu);
> +	}
>  	rcu_read_unlock();
>  }
> 
> @@ -2655,9 +2741,10 @@ load_balance_fair(struct rq *this_rq, in
>  		u64 rem_load, moved_load;
> 
>  		/*
> -		 * empty group
> +		 * empty group or part of a throttled hierarchy
>  		 */
> -		if (!busiest_cfs_rq->task_weight)
> +		if (!busiest_cfs_rq->task_weight ||
> +		    throttled_lb_pair(tg, busiest_cpu, this_cpu))

tip commit 9763b67fb9f30 removes both tg and busiest_cpu from
load_balance_fair.

>  			continue;
> 
>  		rem_load = (u64)rem_load_move * busiest_weight;
> Index: tip/kernel/sched.c
> ===================================================================
> --- tip.orig/kernel/sched.c
> +++ tip/kernel/sched.c
> @@ -399,7 +399,7 @@ struct cfs_rq {
>  	u64 runtime_expires;
>  	s64 runtime_remaining;
> 
> -	int throttled;
> +	int throttled, throttle_count;
>  	struct list_head throttled_list;
>  #endif
>  #endif
> 
> 

  parent reply	other threads:[~2011-07-22 11:42 UTC|newest]

Thread overview: 60+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-07-21 16:43 [patch 00/18] CFS Bandwidth Control v7.2 Paul Turner
2011-07-21 16:43 ` [patch 01/18] sched: (fixlet) dont update shares twice on on_rq parent Paul Turner
2011-07-22 11:06   ` Kamalesh Babulal
2011-07-21 16:43 ` [patch 02/18] sched: hierarchical task accounting for SCHED_OTHER Paul Turner
2011-08-14 16:15   ` [tip:sched/core] sched: Implement " tip-bot for Paul Turner
2011-07-21 16:43 ` [patch 03/18] sched: introduce primitives to account for CFS bandwidth tracking Paul Turner
2011-07-22 11:14   ` Kamalesh Babulal
2011-08-14 16:17   ` [tip:sched/core] sched: Introduce " tip-bot for Paul Turner
2011-07-21 16:43 ` [patch 04/18] sched: validate CFS quota hierarchies Paul Turner
2011-08-14 16:19   ` [tip:sched/core] sched: Validate " tip-bot for Paul Turner
2011-07-21 16:43 ` [patch 05/18] sched: accumulate per-cfs_rq cpu usage and charge against bandwidth Paul Turner
2011-08-14 16:21   ` [tip:sched/core] sched: Accumulate " tip-bot for Paul Turner
2011-07-21 16:43 ` [patch 06/18] sched: add a timer to handle CFS bandwidth refresh Paul Turner
2011-08-14 16:23   ` [tip:sched/core] sched: Add " tip-bot for Paul Turner
2011-07-21 16:43 ` [patch 07/18] sched: expire invalid runtime Paul Turner
2011-08-14 16:24   ` [tip:sched/core] sched: Expire " tip-bot for Paul Turner
2011-07-21 16:43 ` [patch 08/18] sched: add support for throttling group entities Paul Turner
2011-08-08 15:46   ` Lin Ming
2011-08-08 16:00     ` Peter Zijlstra
2011-08-08 16:16       ` Paul Turner
2011-08-14 16:26   ` [tip:sched/core] sched: Add " tip-bot for Paul Turner
2011-07-21 16:43 ` [patch 09/18] sched: add support for unthrottling " Paul Turner
2011-08-14 16:27   ` [tip:sched/core] sched: Add " tip-bot for Paul Turner
2011-07-21 16:43 ` [patch 10/18] sched: allow for positional tg_tree walks Paul Turner
2011-08-14 16:29   ` [tip:sched/core] sched: Allow " tip-bot for Paul Turner
2011-07-21 16:43 ` [patch 11/18] sched: prevent interactions with throttled entities Paul Turner
2011-07-22 11:26   ` Kamalesh Babulal
2011-07-22 11:37     ` Peter Zijlstra
2011-07-22 11:41   ` Kamalesh Babulal [this message]
2011-07-22 11:43     ` Peter Zijlstra
2011-07-22 18:16       ` Kamalesh Babulal
2011-08-14 16:30   ` [tip:sched/core] sched: Prevent " tip-bot for Paul Turner
2011-07-21 16:43 ` [patch 12/18] sched: prevent buddy " Paul Turner
2011-08-14 16:32   ` [tip:sched/core] sched: Prevent " tip-bot for Paul Turner
2011-07-21 16:43 ` [patch 13/18] sched: migrate throttled tasks on HOTPLUG Paul Turner
2011-08-14 16:34   ` [tip:sched/core] sched: Migrate " tip-bot for Paul Turner
2011-07-21 16:43 ` [patch 14/18] sched: throttle entities exceeding their allowed bandwidth Paul Turner
2011-08-14 16:35   ` [tip:sched/core] sched: Throttle " tip-bot for Paul Turner
2011-07-21 16:43 ` [patch 15/18] sched: add exports tracking cfs bandwidth control statistics Paul Turner
2011-08-14 16:37   ` [tip:sched/core] sched: Add " tip-bot for Nikhil Rao
2011-07-21 16:43 ` [patch 16/18] sched: return unused runtime on group dequeue Paul Turner
2011-08-14 16:39   ` [tip:sched/core] sched: Return " tip-bot for Paul Turner
2011-07-21 16:43 ` [RFT][patch 17/18] sched: use jump labels to reduce overhead when bandwidth control is inactive Paul Turner
2011-07-21 16:43 ` [patch 18/18] sched: add documentation for bandwidth control Paul Turner
2011-08-14 16:41   ` [tip:sched/core] sched: Add " tip-bot for Bharata B Rao
2011-07-21 23:01 ` [patch 00/18] CFS Bandwidth Control v7.2 Paul Turner
2011-07-25 14:58 ` Peter Zijlstra
2011-07-25 15:00   ` Peter Zijlstra
2011-07-25 16:21     ` Paul E. McKenney
2011-07-25 16:28       ` Peter Zijlstra
2011-07-25 16:46         ` Paul E. McKenney
2011-07-25 17:08           ` Peter Zijlstra
2011-07-25 17:11             ` Dhaval Giani
2011-07-25 17:35               ` Peter Zijlstra
2011-07-28  2:59     ` Paul Turner
2011-09-13 12:10 ` Vladimir Davydov
2011-09-13 14:00   ` Peter Zijlstra
2011-09-16  8:06   ` Paul Turner
2011-09-19  8:22     ` Vladimir Davydov
2011-09-19  8:33       ` Peter Zijlstra

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20110722114141.GA29349@linux.vnet.ibm.com \
    --to=kamalesh@linux.vnet.ibm.com \
    --cc=a.p.zijlstra@chello.nl \
    --cc=bharata@linux.vnet.ibm.com \
    --cc=bsingharora@gmail.com \
    --cc=dhaval.giani@gmail.com \
    --cc=jbaron@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=pjt@google.com \
    --cc=seto.hidetoshi@jp.fujitsu.com \
    --cc=svaidy@linux.vnet.ibm.com \
    --cc=vatsa@in.ibm.com \
    --cc=xemul@openvz.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox