Re: [CFS Bandwidth Control v4 3/7] sched: throttle cfs_rq entities which exceed their local quota

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Peter Zijlstra <a.p.zijlstra@chello.nl>
To: Paul Turner <pjt@google.com>
Cc: linux-kernel@vger.kernel.org,
	Bharata B Rao <bharata@linux.vnet.ibm.com>,
	Dhaval Giani <dhaval@linux.vnet.ibm.com>,
	Balbir Singh <balbir@linux.vnet.ibm.com>,
	Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>,
	Gautham R Shenoy <ego@in.ibm.com>,
	Srivatsa Vaddagiri <vatsa@in.ibm.com>,
	Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>,
	Ingo Molnar <mingo@elte.hu>, Pavel Emelyanov <xemul@openvz.org>,
	Herbert Poetzl <herbert@13thfloor.at>,
	Avi Kivity <avi@redhat.com>, Chris Friesen <cfriesen@nortel.com>,
	Nikhil Rao <ncrao@google.com>
Subject: Re: [CFS Bandwidth Control v4 3/7] sched: throttle cfs_rq entities which exceed their local quota
Date: Wed, 23 Feb 2011 14:32:13 +0100	[thread overview]
Message-ID: <1298467933.2217.765.camel@twins> (raw)
In-Reply-To: <20110216031841.068673650@google.com>

On Tue, 2011-02-15 at 19:18 -0800, Paul Turner wrote:

> +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
> +{
> +	return cfs_rq->throttled;
> +}
> +
> +/* it's possible to be 'on_rq' in a dequeued (e.g. throttled) hierarchy */
> +static inline int entity_on_rq(struct sched_entity *se)
> +{
> +	for_each_sched_entity(se)
> +		if (!se->on_rq)
> +			return 0;

Please add block braces over multi line stmts even if not strictly
needed.

> +
> +	return 1;
> +}


> @@ -761,7 +788,11 @@ static void update_cfs_load(struct cfs_r
>  	u64 now, delta;
>  	unsigned long load = cfs_rq->load.weight;
>  
> -	if (cfs_rq->tg == &root_task_group)
> +	/*
> +	 * Don't maintain averages for the root task group, or while we are
> +	 * throttled.
> +	 */
> +	if (cfs_rq->tg == &root_task_group || cfs_rq_throttled(cfs_rq))
>  		return;
>  
>  	now = rq_of(cfs_rq)->clock_task;

Placing the return there avoids updating the timestamps, so once we get
unthrottled we'll observe a very long period and skew the load avg?

Ideally we'd never call this on throttled groups to begin with and
handle them like full dequeue/enqueue like things.

> @@ -1015,6 +1046,14 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
>  	 * Update run-time statistics of the 'current'.
>  	 */
>  	update_curr(cfs_rq);
> +
> +
> +#ifdef CONFIG_CFS_BANDWIDTH
> +	if (!entity_is_task(se) && (cfs_rq_throttled(group_cfs_rq(se)) ||
> +	     !group_cfs_rq(se)->nr_running))
> +		return;
> +#endif
> +
>  	update_cfs_load(cfs_rq, 0);
>  	account_entity_enqueue(cfs_rq, se);
>  	update_cfs_shares(cfs_rq);
> @@ -1087,6 +1126,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
>  	 */
>  	update_curr(cfs_rq);
>  
> +#ifdef CONFIG_CFS_BANDWIDTH
> +	if (!entity_is_task(se) && cfs_rq_throttled(group_cfs_rq(se)))
> +		return;
> +#endif
> +
>  	update_stats_dequeue(cfs_rq, se);
>  	if (flags & DEQUEUE_SLEEP) {
>  #ifdef CONFIG_SCHEDSTATS

These make me very nervous, on enqueue you bail after adding
min_vruntime to ->vruntime and calling update_curr(), but on dequeue you
bail before subtracting min_vruntime from ->vruntime.

> @@ -1363,6 +1407,9 @@ enqueue_task_fair(struct rq *rq, struct 
>  			break;
>  		cfs_rq = cfs_rq_of(se);
>  		enqueue_entity(cfs_rq, se, flags);
> +		/* don't continue to enqueue if our parent is throttled */
> +		if (cfs_rq_throttled(cfs_rq))
> +			break;
>  		flags = ENQUEUE_WAKEUP;
>  	}
>  
> @@ -1390,8 +1437,11 @@ static void dequeue_task_fair(struct rq 
>  		cfs_rq = cfs_rq_of(se);
>  		dequeue_entity(cfs_rq, se, flags);
>  
> -		/* Don't dequeue parent if it has other entities besides us */
> -		if (cfs_rq->load.weight)
> +		/*
> +		 * Don't dequeue parent if it has other entities besides us,
> +		 * or if it is throttled
> +		 */
> +		if (cfs_rq->load.weight || cfs_rq_throttled(cfs_rq))
>  			break;
>  		flags |= DEQUEUE_SLEEP;
>  	}

How could we even be running if our parent was throttled?

> @@ -1430,6 +1480,42 @@ static u64 tg_request_cfs_quota(struct t
>  	return delta;
>  }
>  
> +static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
> +{
> +	struct sched_entity *se;
> +
> +	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
> +
> +	/* account load preceeding throttle */

My spell checker thinks that should be written as: preceding.

> +	update_cfs_load(cfs_rq, 0);
> +
> +	/* prevent previous buddy nominations from re-picking this se */
> +	clear_buddies(cfs_rq_of(se), se);
> +
> +	/*
> +	 * It's possible for the current task to block and re-wake before task
> +	 * switch, leading to a throttle within enqueue_task->update_curr()
> +	 * versus an an entity that has not technically been enqueued yet.

I'm not quite seeing how this would happen.. care to expand on this?

> +	 * In this case, since we haven't actually done the enqueue yet, cut
> +	 * out and allow enqueue_entity() to short-circuit
> +	 */
> +	if (!se->on_rq)
> +		goto out_throttled;
> +
> +	for_each_sched_entity(se) {
> +		struct cfs_rq *cfs_rq = cfs_rq_of(se);
> +
> +		dequeue_entity(cfs_rq, se, 1);
> +		if (cfs_rq->load.weight || cfs_rq_throttled(cfs_rq))
> +			break;
> +	}
> +
> +out_throttled:
> +	cfs_rq->throttled = 1;
> +	update_cfs_rq_load_contribution(cfs_rq, 1);
> +}
> +
>  static void account_cfs_rq_quota(struct cfs_rq *cfs_rq,
>  		unsigned long delta_exec)
>  {
> @@ -1438,10 +1524,16 @@ static void account_cfs_rq_quota(struct 
>  
>  	cfs_rq->quota_used += delta_exec;
>  
> -	if (cfs_rq->quota_used < cfs_rq->quota_assigned)
> +	if (cfs_rq_throttled(cfs_rq) ||
> +		cfs_rq->quota_used < cfs_rq->quota_assigned)
>  		return;

So we are throttled but running anyway, I suppose this comes from the PI
ceiling muck?

>  	cfs_rq->quota_assigned += tg_request_cfs_quota(cfs_rq->tg);
> +
> +	if (cfs_rq->quota_used >= cfs_rq->quota_assigned) {
> +		throttle_cfs_rq(cfs_rq);
> +		resched_task(cfs_rq->rq->curr);
> +	}
>  }
>  
>  static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
> @@ -1941,6 +2033,12 @@ static void check_preempt_wakeup(struct 
>  	if (unlikely(se == pse))
>  		return;
>  
> +#ifdef CONFIG_CFS_BANDWIDTH
> +	/* avoid pre-emption check/buddy nomination for throttled tasks */

Somehow my spell checker doesn't like that hyphen.

> +	if (!entity_on_rq(pse))
> +		return;
> +#endif

Ideally that #ifdef'ery would go away too. 

>  	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK))
>  		set_next_buddy(pse);
>  
> @@ -2060,7 +2158,8 @@ static bool yield_to_task_fair(struct rq
>  {
>  	struct sched_entity *se = &p->se;
>  
> -	if (!se->on_rq)
> +	/* ensure entire hierarchy is on rq (e.g. running & not throttled) */
> +	if (!entity_on_rq(se))
>  		return false;

like here..

>  	/* Tell the scheduler that we'd really like pse to run next. */
> @@ -2280,7 +2379,8 @@ static void update_shares(int cpu)
>  
>  	rcu_read_lock();
>  	for_each_leaf_cfs_rq(rq, cfs_rq)
> -		update_shares_cpu(cfs_rq->tg, cpu);
> +		if (!cfs_rq_throttled(cfs_rq))
> +			update_shares_cpu(cfs_rq->tg, cpu);

This wants extra braces

>  	rcu_read_unlock();
>  }
>  
> @@ -2304,9 +2404,10 @@ load_balance_fair(struct rq *this_rq, in
>  		u64 rem_load, moved_load;
>  
>  		/*
> -		 * empty group
> +		 * empty group or throttled cfs_rq
>  		 */
> -		if (!busiest_cfs_rq->task_weight)
> +		if (!busiest_cfs_rq->task_weight ||
> +				cfs_rq_throttled(busiest_cfs_rq))
>  			continue;
>  
>  		rem_load = (u64)rem_load_move * busiest_weight;
> 
>

next prev parent reply	other threads:[~2011-02-23 13:32 UTC|newest]

Thread overview: 71+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-02-16  3:18 [CFS Bandwidth Control v4 0/7] Introduction Paul Turner
2011-02-16  3:18 ` [CFS Bandwidth Control v4 1/7] sched: introduce primitives to account for CFS bandwidth tracking Paul Turner
2011-02-16 16:52   ` Balbir Singh
2011-02-17  2:54     ` Bharata B Rao
2011-02-23 13:32   ` Peter Zijlstra
2011-02-25  3:11     ` Paul Turner
2011-02-25 20:53     ` Paul Turner
2011-02-16  3:18 ` [CFS Bandwidth Control v4 2/7] sched: accumulate per-cfs_rq cpu usage Paul Turner
2011-02-16 17:45   ` Balbir Singh
2011-02-23 13:32   ` Peter Zijlstra
2011-02-25  3:33     ` Paul Turner
2011-02-25 12:31       ` Peter Zijlstra
2011-02-16  3:18 ` [CFS Bandwidth Control v4 3/7] sched: throttle cfs_rq entities which exceed their local quota Paul Turner
2011-02-18  6:52   ` Balbir Singh
2011-02-23 13:32   ` Peter Zijlstra [this message]
2011-02-24  5:21     ` Bharata B Rao
2011-02-24 11:05       ` Peter Zijlstra
2011-02-24 15:45         ` Bharata B Rao
2011-02-24 15:52           ` Peter Zijlstra
2011-02-24 16:39             ` Bharata B Rao
2011-02-24 17:20               ` Peter Zijlstra
2011-02-25  3:59                 ` Paul Turner
2011-02-25  3:41         ` Paul Turner
2011-02-25  3:10     ` Paul Turner
2011-02-25 13:58       ` Bharata B Rao
2011-02-25 20:51         ` Paul Turner
2011-02-28  3:50           ` Bharata B Rao
2011-02-28  6:38             ` Paul Turner
2011-02-28 13:48       ` Peter Zijlstra
2011-03-01  8:31         ` Paul Turner
2011-03-02  7:23   ` Bharata B Rao
2011-03-02  8:05     ` Paul Turner
2011-02-16  3:18 ` [CFS Bandwidth Control v4 4/7] sched: unthrottle cfs_rq(s) who ran out of quota at period refresh Paul Turner
2011-02-18  7:19   ` Balbir Singh
2011-02-18  8:10     ` Bharata B Rao
2011-02-23 12:23   ` Peter Zijlstra
2011-02-23 13:32   ` Peter Zijlstra
2011-02-24  7:04     ` Bharata B Rao
2011-02-24 11:14       ` Peter Zijlstra
2011-02-26  0:02     ` Paul Turner
2011-02-16  3:18 ` [CFS Bandwidth Control v4 5/7] sched: add exports tracking cfs bandwidth control statistics Paul Turner
2011-02-22  3:14   ` Balbir Singh
2011-02-22  4:13     ` Bharata B Rao
2011-02-22  4:40       ` Balbir Singh
2011-02-23  8:03         ` Paul Turner
2011-02-23 10:13           ` Balbir Singh
2011-02-23 13:32   ` Peter Zijlstra
2011-02-25  3:26     ` Paul Turner
2011-02-25  8:54       ` Peter Zijlstra
2011-02-16  3:18 ` [CFS Bandwidth Control v4 6/7] sched: hierarchical task accounting for SCHED_OTHER Paul Turner
2011-02-22  3:17   ` Balbir Singh
2011-02-23  8:05     ` Paul Turner
2011-02-23  2:02   ` Hidetoshi Seto
2011-02-23  2:20     ` Paul Turner
2011-02-23  2:43     ` Balbir Singh
2011-02-23 13:32   ` Peter Zijlstra
2011-02-25  3:25     ` Paul Turner
2011-02-25 12:17       ` Peter Zijlstra
2011-02-16  3:18 ` [CFS Bandwidth Control v4 7/7] sched: add documentation for bandwidth control Paul Turner
2011-02-21  2:47 ` [CFS Bandwidth Control v4 0/7] Introduction Xiao Guangrong
2011-02-22 10:28   ` Bharata B Rao
2011-02-23  7:42   ` Paul Turner
2011-02-23  7:51     ` Balbir Singh
2011-02-23  7:56       ` Paul Turner
2011-02-23  8:31         ` Bharata B Rao
     [not found] ` <20110224161111.7d83a884@jacob-laptop>
2011-02-25 10:03   ` Paul Turner
2011-02-25 13:06     ` jacob pan
2011-03-08  3:57       ` Balbir Singh
2011-03-08 18:18         ` Jacob Pan
2011-03-09 10:12       ` Paul Turner
2011-03-09 21:57         ` jacob pan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1298467933.2217.765.camel@twins \
    --to=a.p.zijlstra@chello.nl \
    --cc=avi@redhat.com \
    --cc=balbir@linux.vnet.ibm.com \
    --cc=bharata@linux.vnet.ibm.com \
    --cc=cfriesen@nortel.com \
    --cc=dhaval@linux.vnet.ibm.com \
    --cc=ego@in.ibm.com \
    --cc=herbert@13thfloor.at \
    --cc=kamalesh@linux.vnet.ibm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=ncrao@google.com \
    --cc=pjt@google.com \
    --cc=svaidy@linux.vnet.ibm.com \
    --cc=vatsa@in.ibm.com \
    --cc=xemul@openvz.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.