* Re: [PATCH v5 1/4] sched/fair: add util_est on top of PELT
[not found] ` <20180222170153.673-2-patrick.bellasi@arm.com>
@ 2018-03-01 17:42 ` Patrick Bellasi
2018-03-06 18:56 ` Peter Zijlstra
` (2 subsequent siblings)
3 siblings, 0 replies; 21+ messages in thread
From: Patrick Bellasi @ 2018-03-01 17:42 UTC (permalink / raw)
To: linux-kernel, linux-pm
Cc: Ingo Molnar, Peter Zijlstra, Rafael J . Wysocki, Viresh Kumar,
Vincent Guittot, Paul Turner, Dietmar Eggemann, Morten Rasmussen,
Juri Lelli, Todd Kjos, Joel Fernandes, Steve Muckle
This is missing the below #ifdef guards; adding them here as a note for
the next respin on list.
On 22-Feb 17:01, Patrick Bellasi wrote:
[...]
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index e1febd252a84..c8526687f107 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5205,6 +5205,23 @@ static inline void hrtick_update(struct rq *rq)
> }
> #endif
>
#ifdef CONFIG_SMP
> +static inline unsigned long task_util(struct task_struct *p);
> +static inline unsigned long _task_util_est(struct task_struct *p);
> +
> +static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
> + struct task_struct *p)
> +{
> + unsigned int enqueued;
> +
> + if (!sched_feat(UTIL_EST))
> + return;
> +
> + /* Update root cfs_rq's estimated utilization */
> + enqueued = READ_ONCE(cfs_rq->avg.util_est.enqueued);
> + enqueued += _task_util_est(p);
> + WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
> +}
> +
#else
static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
				    struct task_struct *p)
{
}
#endif /* CONFIG_SMP */
> /*
> * The enqueue_task method is called before nr_running is
> * increased. Here we update the fair scheduling stats and
> @@ -5257,9 +5274,86 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> if (!se)
> add_nr_running(rq, 1);
>
> + util_est_enqueue(&rq->cfs, p);
> hrtick_update(rq);
> }
>
> +/*
> + * Check if a (signed) value is within a specified (unsigned) margin,
> + * based on the observation that:
> + * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
> + *
> + * NOTE: this only works when value + margin < INT_MAX.
> + */
> +static inline bool within_margin(int value, int margin)
> +{
> + return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
> +}
> +
> +static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
> + struct task_struct *p,
> + bool task_sleep)
> +{
#ifdef CONFIG_SMP
> + long last_ewma_diff;
> + struct util_est ue;
> +
> + if (!sched_feat(UTIL_EST))
> + return;
> +
> + /*
> + * Update root cfs_rq's estimated utilization
> + *
> + * If *p is the last task then the root cfs_rq's estimated utilization
> + * of a CPU is 0 by definition.
> + */
> + ue.enqueued = 0;
> + if (cfs_rq->nr_running) {
> + ue.enqueued = READ_ONCE(cfs_rq->avg.util_est.enqueued);
> + ue.enqueued -= min_t(unsigned int, ue.enqueued,
> + _task_util_est(p));
> + }
> + WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
> +
> + /*
> + * Skip update of task's estimated utilization when the task has not
> + * yet completed an activation, e.g. being migrated.
> + */
> + if (!task_sleep)
> + return;
> +
> + /*
> + * Skip update of task's estimated utilization when its EWMA is
> + * already ~1% close to its last activation value.
> + */
> + ue = READ_ONCE(p->se.avg.util_est);
> + ue.enqueued = task_util(p);
> + last_ewma_diff = ue.enqueued - ue.ewma;
> + if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
> + return;
> +
> + /*
> + * Update Task's estimated utilization
> + *
> + * When *p completes an activation we can consolidate another sample
> + * of the task size. This is done by storing the current PELT value
> + * as ue.enqueued and by using this value to update the Exponential
> + * Weighted Moving Average (EWMA):
> + *
> + * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1)
> + * = w * task_util(p) + ewma(t-1) - w * ewma(t-1)
> + * = w * (task_util(p) - ewma(t-1)) + ewma(t-1)
> + * = w * ( last_ewma_diff ) + ewma(t-1)
> + * = w * (last_ewma_diff + ewma(t-1) / w)
> + *
> + * Where 'w' is the weight of new samples, which is configured to be
> + * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
> + */
> + ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
> + ue.ewma += last_ewma_diff;
> + ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
> + WRITE_ONCE(p->se.avg.util_est, ue);
#endif /* CONFIG_SMP */
> +}
> +
> static void set_next_buddy(struct sched_entity *se);
>
> /*
> @@ -5316,6 +5410,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> if (!se)
> sub_nr_running(rq, 1);
>
> + util_est_dequeue(&rq->cfs, p, task_sleep);
> hrtick_update(rq);
> }
>
--
#include <best/regards.h>
Patrick Bellasi
^ permalink raw reply [flat|nested] 21+ messages in thread

* Re: [PATCH v5 1/4] sched/fair: add util_est on top of PELT
[not found] ` <20180222170153.673-2-patrick.bellasi@arm.com>
2018-03-01 17:42 ` [PATCH v5 1/4] sched/fair: add util_est on top of PELT Patrick Bellasi
@ 2018-03-06 18:56 ` Peter Zijlstra
2018-03-07 12:32 ` Patrick Bellasi
2018-03-06 18:58 ` Peter Zijlstra
2018-03-06 19:02 ` Peter Zijlstra
3 siblings, 1 reply; 21+ messages in thread
From: Peter Zijlstra @ 2018-03-06 18:56 UTC (permalink / raw)
To: Patrick Bellasi
Cc: linux-kernel, linux-pm, Ingo Molnar, Rafael J . Wysocki,
Viresh Kumar, Vincent Guittot, Paul Turner, Dietmar Eggemann,
Morten Rasmussen, Juri Lelli, Todd Kjos, Joel Fernandes,
Steve Muckle
On Thu, Feb 22, 2018 at 05:01:50PM +0000, Patrick Bellasi wrote:
> +/**
> + * Estimation Utilization for FAIR tasks.
> + *
> + * Support data structure to track an Exponential Weighted Moving Average
> + * (EWMA) of a FAIR task's utilization. New samples are added to the moving
> + * average each time a task completes an activation. Sample's weight is
> + * chosen so that the EWMA will be relatively insensitive to transient changes
> + * to the task's workload.
> + *
> + * @enqueued: instantaneous estimated utilization of a task/cpu
> + * task: the task's util_avg at last task dequeue time
> + * cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU
> + *
> + * Thus, the util_est.enqueued of a task represents the contribution on the
> + * estimated utilization of the CPU where that task is currently enqueued.
> + *
> + * @ewma: the Exponential Weighted Moving Average (EWMA) utilization of a task
> + * Only for tasks we track a moving average of the past instantaneous
> + * estimated utilization. This allows to absorb sporadic drops in
> + * utilization of an otherwise almost periodic task.
> + *
> + */
The above comment appears to have whitespace issues; the paragraph
starting with "Thus" looks indented by one character, for example.
^ permalink raw reply [flat|nested] 21+ messages in thread

* Re: [PATCH v5 1/4] sched/fair: add util_est on top of PELT
2018-03-06 18:56 ` Peter Zijlstra
@ 2018-03-07 12:32 ` Patrick Bellasi
0 siblings, 0 replies; 21+ messages in thread
From: Patrick Bellasi @ 2018-03-07 12:32 UTC (permalink / raw)
To: Peter Zijlstra
Cc: linux-kernel, linux-pm, Ingo Molnar, Rafael J . Wysocki,
Viresh Kumar, Vincent Guittot, Paul Turner, Dietmar Eggemann,
Morten Rasmussen, Juri Lelli, Todd Kjos, Joel Fernandes,
Steve Muckle
On 06-Mar 19:56, Peter Zijlstra wrote:
> On Thu, Feb 22, 2018 at 05:01:50PM +0000, Patrick Bellasi wrote:
> > +/**
> > + * Estimation Utilization for FAIR tasks.
> > + *
> > + * Support data structure to track an Exponential Weighted Moving Average
> > + * (EWMA) of a FAIR task's utilization. New samples are added to the moving
> > + * average each time a task completes an activation. Sample's weight is
> > + * chosen so that the EWMA will be relatively insensitive to transient changes
> > + * to the task's workload.
> > + *
> > + * @enqueued: instantaneous estimated utilization of a task/cpu
> > + * task: the task's util_avg at last task dequeue time
> > + * cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU
> > + *
> > + * Thus, the util_est.enqueued of a task represents the contribution on the
> > + * estimated utilization of the CPU where that task is currently enqueued.
> > + *
> > + * @ewma: the Exponential Weighted Moving Average (EWMA) utilization of a task
> > + * Only for tasks we track a moving average of the past instantaneous
> > + * estimated utilization. This allows to absorb sporadic drops in
> > + * utilization of an otherwise almost periodic task.
> > + *
> > + */
>
> The above comment appears to have whitespace issues; the paragraph
> starting with "Thus" looks indented by one character, for example.
That was actually intentional... I wanted to keep it aligned after the
"@" to better mark paragraphs describing the struct members.
However, I've just noticed the overall format is not Sphinx-valid.
Thus, I'll update it, also to ensure that the documentation is properly
generated.
--
#include <best/regards.h>
Patrick Bellasi
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH v5 1/4] sched/fair: add util_est on top of PELT
[not found] ` <20180222170153.673-2-patrick.bellasi@arm.com>
2018-03-01 17:42 ` [PATCH v5 1/4] sched/fair: add util_est on top of PELT Patrick Bellasi
2018-03-06 18:56 ` Peter Zijlstra
@ 2018-03-06 18:58 ` Peter Zijlstra
2018-03-07 9:39 ` Peter Zijlstra
2018-03-07 11:31 ` Patrick Bellasi
2018-03-06 19:02 ` Peter Zijlstra
3 siblings, 2 replies; 21+ messages in thread
From: Peter Zijlstra @ 2018-03-06 18:58 UTC (permalink / raw)
To: Patrick Bellasi
Cc: linux-kernel, linux-pm, Ingo Molnar, Rafael J . Wysocki,
Viresh Kumar, Vincent Guittot, Paul Turner, Dietmar Eggemann,
Morten Rasmussen, Juri Lelli, Todd Kjos, Joel Fernandes,
Steve Muckle
On Thu, Feb 22, 2018 at 05:01:50PM +0000, Patrick Bellasi wrote:
> +static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
> + struct task_struct *p)
> +{
> + unsigned int enqueued;
> +
> + if (!sched_feat(UTIL_EST))
> + return;
> +
> + /* Update root cfs_rq's estimated utilization */
> + enqueued = READ_ONCE(cfs_rq->avg.util_est.enqueued);
> + enqueued += _task_util_est(p);
> + WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
> +}
> +static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
> + struct task_struct *p,
> + bool task_sleep)
> +{
> + long last_ewma_diff;
> + struct util_est ue;
> +
> + if (!sched_feat(UTIL_EST))
> + return;
> +
> + /*
> + * Update root cfs_rq's estimated utilization
> + *
> + * If *p is the last task then the root cfs_rq's estimated utilization
> + * of a CPU is 0 by definition.
> + */
> + ue.enqueued = 0;
> + if (cfs_rq->nr_running) {
> + ue.enqueued = READ_ONCE(cfs_rq->avg.util_est.enqueued);
> + ue.enqueued -= min_t(unsigned int, ue.enqueued,
> + _task_util_est(p));
> + }
> + WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
It appears to me this isn't a stable situation and completely relies on
the !nr_running case to recalibrate. If we ensure that doesn't happen
for a significant while the sum can run-away, right?
Should we put a max in enqueue to avoid this?
^ permalink raw reply [flat|nested] 21+ messages in thread

* Re: [PATCH v5 1/4] sched/fair: add util_est on top of PELT
2018-03-06 18:58 ` Peter Zijlstra
@ 2018-03-07 9:39 ` Peter Zijlstra
2018-03-07 15:37 ` Patrick Bellasi
2018-03-07 11:31 ` Patrick Bellasi
1 sibling, 1 reply; 21+ messages in thread
From: Peter Zijlstra @ 2018-03-07 9:39 UTC (permalink / raw)
To: Patrick Bellasi
Cc: linux-kernel, linux-pm, Ingo Molnar, Rafael J . Wysocki,
Viresh Kumar, Vincent Guittot, Paul Turner, Dietmar Eggemann,
Morten Rasmussen, Juri Lelli, Todd Kjos, Joel Fernandes,
Steve Muckle
On Tue, Mar 06, 2018 at 07:58:51PM +0100, Peter Zijlstra wrote:
> On Thu, Feb 22, 2018 at 05:01:50PM +0000, Patrick Bellasi wrote:
> > +static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
> > + struct task_struct *p)
> > +{
> > + unsigned int enqueued;
> > +
> > + if (!sched_feat(UTIL_EST))
> > + return;
> > +
> > + /* Update root cfs_rq's estimated utilization */
> > + enqueued = READ_ONCE(cfs_rq->avg.util_est.enqueued);
> > + enqueued += _task_util_est(p);
> > + WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
> > +}
> It appears to me this isn't a stable situation and completely relies on
> the !nr_running case to recalibrate. If we ensure that doesn't happen
> for a significant while the sum can run-away, right?
>
> Should we put a max in enqueue to avoid this?
Thinking about this a bit more; would it make sense to adjust the
running sum/avg on migration? Something along the lines of:
util_avg = se->load_avg / (cfs_rq->load_avg + se->load_avg);
(which disregards cgroups), because that should more or less be the time
it ends up running, given the WFQ rule.
That way the disparity between tasks migrating into the CPU at u=1 and
them going to sleep at u<1 is much smaller and the above sum doesn't run
away nearly as wild (it still needs some upper bound though).
^ permalink raw reply [flat|nested] 21+ messages in thread

* Re: [PATCH v5 1/4] sched/fair: add util_est on top of PELT
2018-03-07 9:39 ` Peter Zijlstra
@ 2018-03-07 15:37 ` Patrick Bellasi
0 siblings, 0 replies; 21+ messages in thread
From: Patrick Bellasi @ 2018-03-07 15:37 UTC (permalink / raw)
To: Peter Zijlstra
Cc: linux-kernel, linux-pm, Ingo Molnar, Rafael J . Wysocki,
Viresh Kumar, Vincent Guittot, Paul Turner, Dietmar Eggemann,
Morten Rasmussen, Juri Lelli, Todd Kjos, Joel Fernandes,
Steve Muckle
On 07-Mar 10:39, Peter Zijlstra wrote:
> On Tue, Mar 06, 2018 at 07:58:51PM +0100, Peter Zijlstra wrote:
> > On Thu, Feb 22, 2018 at 05:01:50PM +0000, Patrick Bellasi wrote:
> > > +static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
> > > + struct task_struct *p)
> > > +{
> > > + unsigned int enqueued;
> > > +
> > > + if (!sched_feat(UTIL_EST))
> > > + return;
> > > +
> > > + /* Update root cfs_rq's estimated utilization */
> > > + enqueued = READ_ONCE(cfs_rq->avg.util_est.enqueued);
> > > + enqueued += _task_util_est(p);
> > > + WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
> > > +}
>
> > It appears to me this isn't a stable situation and completely relies on
> > the !nr_running case to recalibrate. If we ensure that doesn't happen
> > for a significant while the sum can run-away, right?
> >
> > Should we put a max in enqueue to avoid this?
>
> Thinking about this a bit more; would it make sense to adjust the
> running sum/avg on migration? Something along the lines of:
>
> util_avg = se->load_avg / (cfs_rq->load_avg + se->load_avg);
>
> (which disregards cgroups), because that should more or less be the time
> it ends up running, given the WFQ rule.
I would say it makes sense from a purely mechanical standpoint, but I'm
not entirely convinced it can be useful from a practical standpoint.
First of all, that should be applied only when we migrate to a more
saturated CPU. Otherwise, when migrating to an empty CPU we would set
util_avg = 100%.
Secondly, when we migrate to a saturated CPU, this adjustment will
contribute to under-estimate the task utilization.
Let's say the task was running on a completely empty CPU, and thus it
was able to ramp up without being preempted. That value represents a
good estimate of the task's (most recent) CPU demand.
Now, if on a following activation we wake up the task on an IDLE CPU
with a lot of blocked load, then we will scale down its util_avg
and assume the task will be smaller.
But:
a) if the blocked load does not turn into some task waking up again,
underestimating the task only introduces further ramp-up latencies
b) if the load is due to really active tasks, the task will be
preempted and its utilization will be smaller... but we are already in
a domain where utilization does not tell us anything useful for a
task... and thus, why bother making it converge sooner?
> That way the disparity between tasks migrating into the CPU at u=1 and
> them going to sleep at u<1 is much smaller and the above sum doesn't run
> away nearly as wild (it still needs some upper bound though).
--
#include <best/regards.h>
Patrick Bellasi
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH v5 1/4] sched/fair: add util_est on top of PELT
2018-03-06 18:58 ` Peter Zijlstra
2018-03-07 9:39 ` Peter Zijlstra
@ 2018-03-07 11:31 ` Patrick Bellasi
2018-03-07 12:24 ` Peter Zijlstra
1 sibling, 1 reply; 21+ messages in thread
From: Patrick Bellasi @ 2018-03-07 11:31 UTC (permalink / raw)
To: Peter Zijlstra
Cc: linux-kernel, linux-pm, Ingo Molnar, Rafael J . Wysocki,
Viresh Kumar, Vincent Guittot, Paul Turner, Dietmar Eggemann,
Morten Rasmussen, Juri Lelli, Todd Kjos, Joel Fernandes,
Steve Muckle
On 06-Mar 19:58, Peter Zijlstra wrote:
> On Thu, Feb 22, 2018 at 05:01:50PM +0000, Patrick Bellasi wrote:
> > +static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
> > + struct task_struct *p)
> > +{
> > + unsigned int enqueued;
> > +
> > + if (!sched_feat(UTIL_EST))
> > + return;
> > +
> > + /* Update root cfs_rq's estimated utilization */
> > + enqueued = READ_ONCE(cfs_rq->avg.util_est.enqueued);
> > + enqueued += _task_util_est(p);
> > + WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
> > +}
>
> > +static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
> > + struct task_struct *p,
> > + bool task_sleep)
> > +{
> > + long last_ewma_diff;
> > + struct util_est ue;
> > +
> > + if (!sched_feat(UTIL_EST))
> > + return;
> > +
> > + /*
> > + * Update root cfs_rq's estimated utilization
> > + *
> > + * If *p is the last task then the root cfs_rq's estimated utilization
> > + * of a CPU is 0 by definition.
> > + */
> > + ue.enqueued = 0;
> > + if (cfs_rq->nr_running) {
> > + ue.enqueued = READ_ONCE(cfs_rq->avg.util_est.enqueued);
> > + ue.enqueued -= min_t(unsigned int, ue.enqueued,
> > + _task_util_est(p));
> > + }
> > + WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
>
> It appears to me this isn't a stable situation and completely relies on
> the !nr_running case to recalibrate. If we ensure that doesn't happen
> for a significant while the sum can run-away, right?
By run-away, do you mean going over 1024 or overflowing the unsigned
int storage?
In the first case, I think we don't care about exceeding 1024 since:
- we cap to capacity_orig_of in cpu_util_est
- by directly reading the cfs_rq->avg.util_est.enqueued we can
actually detect conditions in which a CPU is over-saturated.
In the second case, with an unsigned int we can enqueue up to a few
million 100% tasks on a single CPU without overflowing.
> Should we put a max in enqueue to avoid this?
IMO the capping from the cpu_util_est getter should be enough...
Maybe I'm missing your point here?
--
#include <best/regards.h>
Patrick Bellasi
^ permalink raw reply [flat|nested] 21+ messages in thread

* Re: [PATCH v5 1/4] sched/fair: add util_est on top of PELT
2018-03-07 11:31 ` Patrick Bellasi
@ 2018-03-07 12:24 ` Peter Zijlstra
2018-03-07 15:24 ` Patrick Bellasi
0 siblings, 1 reply; 21+ messages in thread
From: Peter Zijlstra @ 2018-03-07 12:24 UTC (permalink / raw)
To: Patrick Bellasi
Cc: linux-kernel, linux-pm, Ingo Molnar, Rafael J . Wysocki,
Viresh Kumar, Vincent Guittot, Paul Turner, Dietmar Eggemann,
Morten Rasmussen, Juri Lelli, Todd Kjos, Joel Fernandes,
Steve Muckle
On Wed, Mar 07, 2018 at 11:31:49AM +0000, Patrick Bellasi wrote:
> > It appears to me this isn't a stable situation and completely relies on
> > the !nr_running case to recalibrate. If we ensure that doesn't happen
> > for a significant while the sum can run-away, right?
>
> By away you mean go over 1024 or overflow the unsigned int storage?
The latter; I think you can make it arbitrarily large. Have a busy task
on CPU0; this ensures !nr_running never happens.
Start a busy task on CPU1, wait for it to hit u=1, then migrate it to
CPU0, then wait for it to hit u=.5 then kill it, this effectively adds
.5 to the enqueued value, repeat indefinitely.
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH v5 1/4] sched/fair: add util_est on top of PELT
2018-03-07 12:24 ` Peter Zijlstra
@ 2018-03-07 15:24 ` Patrick Bellasi
2018-03-07 17:35 ` Peter Zijlstra
0 siblings, 1 reply; 21+ messages in thread
From: Patrick Bellasi @ 2018-03-07 15:24 UTC (permalink / raw)
To: Peter Zijlstra
Cc: linux-kernel, linux-pm, Ingo Molnar, Rafael J . Wysocki,
Viresh Kumar, Vincent Guittot, Paul Turner, Dietmar Eggemann,
Morten Rasmussen, Juri Lelli, Todd Kjos, Joel Fernandes,
Steve Muckle
On 07-Mar 13:24, Peter Zijlstra wrote:
> On Wed, Mar 07, 2018 at 11:31:49AM +0000, Patrick Bellasi wrote:
> > > It appears to me this isn't a stable situation and completely relies on
> > > the !nr_running case to recalibrate. If we ensure that doesn't happen
> > > for a significant while the sum can run-away, right?
> >
> > By away you mean go over 1024 or overflow the unsigned int storage?
>
> The latter; I think you can make it arbitrarily large. Have a busy task
> on CPU0; this ensures !nr_running never happens.
>
> Start a busy task on CPU1, wait for it to hit u=1, then migrate it to
> CPU0,
At this point util_est(CPU0) = 2048, which is:
+1024 for the busy running task, assuming it has been enqueued with
that utilization since the beginning
+1024 for the task newly migrated from CPU1, which is enqueued with
the value it reached at dequeue time on CPU1
> then wait for it to hit u=.5 then kill it,
... but when we kill it, the task is dequeued, and thus we remove
1024.
Maybe that's the tricky bit: we remove the value we enqueued, _not_
the current util_avg. Notice we use _task_util_est(p)... with the
leading "_".
> this effectively adds
> .5 to the enqueued value, repeat indefinitely.
Thus this should not happen.
Basically, the rq's util_est is the sum of the RUNNABLE tasks'
util_est at their enqueue time... which was updated at their last
dequeue time, hence the use of the name "dequeued" for both tasks and
rqs.
Does it make sense now?
--
#include <best/regards.h>
Patrick Bellasi
^ permalink raw reply [flat|nested] 21+ messages in thread

* Re: [PATCH v5 1/4] sched/fair: add util_est on top of PELT
2018-03-07 15:24 ` Patrick Bellasi
@ 2018-03-07 17:35 ` Peter Zijlstra
0 siblings, 0 replies; 21+ messages in thread
From: Peter Zijlstra @ 2018-03-07 17:35 UTC (permalink / raw)
To: Patrick Bellasi
Cc: linux-kernel, linux-pm, Ingo Molnar, Rafael J . Wysocki,
Viresh Kumar, Vincent Guittot, Paul Turner, Dietmar Eggemann,
Morten Rasmussen, Juri Lelli, Todd Kjos, Joel Fernandes,
Steve Muckle
On Wed, Mar 07, 2018 at 03:24:58PM +0000, Patrick Bellasi wrote:
> Maybe that's the tricky bit: we remove the value we enqueued, _not_
> the current util_avg. Notice we use _task_util_est(p)... with the
> leading "_".
ARGH, ok let me try that again ;-)
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH v5 1/4] sched/fair: add util_est on top of PELT
[not found] ` <20180222170153.673-2-patrick.bellasi@arm.com>
` (2 preceding siblings ...)
2018-03-06 18:58 ` Peter Zijlstra
@ 2018-03-06 19:02 ` Peter Zijlstra
2018-03-07 11:47 ` Patrick Bellasi
3 siblings, 1 reply; 21+ messages in thread
From: Peter Zijlstra @ 2018-03-06 19:02 UTC (permalink / raw)
To: Patrick Bellasi
Cc: linux-kernel, linux-pm, Ingo Molnar, Rafael J . Wysocki,
Viresh Kumar, Vincent Guittot, Paul Turner, Dietmar Eggemann,
Morten Rasmussen, Juri Lelli, Todd Kjos, Joel Fernandes,
Steve Muckle
On Thu, Feb 22, 2018 at 05:01:50PM +0000, Patrick Bellasi wrote:
> +struct util_est {
> + unsigned int enqueued;
> + unsigned int ewma;
> +#define UTIL_EST_WEIGHT_SHIFT 2
> +};
> + ue = READ_ONCE(p->se.avg.util_est);
> + WRITE_ONCE(p->se.avg.util_est, ue);
That is actually quite dodgy... and relies on the fact that we have the
8 byte case in __write_once_size() and __read_once_size()
unconditionally. It then further relies on the compiler DTRT for 32bit
platforms, which is generating 2 32bit loads/stores.
The advantage is of course that it will use single u64 loads/stores
where available.
^ permalink raw reply [flat|nested] 21+ messages in thread

* Re: [PATCH v5 1/4] sched/fair: add util_est on top of PELT
2018-03-06 19:02 ` Peter Zijlstra
@ 2018-03-07 11:47 ` Patrick Bellasi
2018-03-07 12:26 ` Peter Zijlstra
0 siblings, 1 reply; 21+ messages in thread
From: Patrick Bellasi @ 2018-03-07 11:47 UTC (permalink / raw)
To: Peter Zijlstra
Cc: linux-kernel, linux-pm, Ingo Molnar, Rafael J . Wysocki,
Viresh Kumar, Vincent Guittot, Paul Turner, Dietmar Eggemann,
Morten Rasmussen, Juri Lelli, Todd Kjos, Joel Fernandes,
Steve Muckle
On 06-Mar 20:02, Peter Zijlstra wrote:
> On Thu, Feb 22, 2018 at 05:01:50PM +0000, Patrick Bellasi wrote:
> > +struct util_est {
> > + unsigned int enqueued;
> > + unsigned int ewma;
> > +#define UTIL_EST_WEIGHT_SHIFT 2
> > +};
>
> > + ue = READ_ONCE(p->se.avg.util_est);
>
> > + WRITE_ONCE(p->se.avg.util_est, ue);
>
> That is actually quite dodgy... and relies on the fact that we have the
> 8 byte case in __write_once_size() and __read_once_size()
> unconditionally. It then further relies on the compiler DTRT for 32bit
> platforms, which is generating 2 32bit loads/stores.
>
> The advantage is of course that it will use single u64 loads/stores
> where available.
Yes, that's mainly an "optimization" for 64bit targets... but perhaps
the benefits are negligible.
Would you prefer to keep the generated code more "under control" by
using two {READ,WRITE}_ONCEs?
IMO here we can also go with just the WRITE_ONCEs. I don't see a case
for the compiler to mangle the loads/stores, while the WRITE_ONCEs are
still required to sync with non rq-lock serialized code.
But... maybe I'm missing something... ?
--
#include <best/regards.h>
Patrick Bellasi
^ permalink raw reply [flat|nested] 21+ messages in thread

* Re: [PATCH v5 1/4] sched/fair: add util_est on top of PELT
2018-03-07 11:47 ` Patrick Bellasi
@ 2018-03-07 12:26 ` Peter Zijlstra
2018-03-07 15:16 ` Patrick Bellasi
0 siblings, 1 reply; 21+ messages in thread
From: Peter Zijlstra @ 2018-03-07 12:26 UTC (permalink / raw)
To: Patrick Bellasi
Cc: linux-kernel, linux-pm, Ingo Molnar, Rafael J . Wysocki,
Viresh Kumar, Vincent Guittot, Paul Turner, Dietmar Eggemann,
Morten Rasmussen, Juri Lelli, Todd Kjos, Joel Fernandes,
Steve Muckle
On Wed, Mar 07, 2018 at 11:47:11AM +0000, Patrick Bellasi wrote:
> On 06-Mar 20:02, Peter Zijlstra wrote:
> > On Thu, Feb 22, 2018 at 05:01:50PM +0000, Patrick Bellasi wrote:
> > > +struct util_est {
> > > + unsigned int enqueued;
> > > + unsigned int ewma;
> > > +#define UTIL_EST_WEIGHT_SHIFT 2
> > > +};
> >
> > > + ue = READ_ONCE(p->se.avg.util_est);
> >
> > > + WRITE_ONCE(p->se.avg.util_est, ue);
> >
> > That is actually quite dodgy... and relies on the fact that we have the
> > 8 byte case in __write_once_size() and __read_once_size()
> > unconditionally. It then further relies on the compiler DTRT for 32bit
> > platforms, which is generating 2 32bit loads/stores.
> >
> > The advantage is of course that it will use single u64 loads/stores
> > where available.
>
> Yes, that's mainly an "optimization" for 64bit targets... but perhaps
> the benefits are negligible.
>
> > Would you prefer to keep the generated code more "under control" by
> > using two {READ,WRITE}_ONCEs?
>
> > IMO here we can also go with just the WRITE_ONCEs. I don't see a case
> > for the compiler to mangle the loads/stores, while the WRITE_ONCEs are
> > still required to sync with non rq-lock serialized code.
> But... maybe I'm missing something... ?
I'm not sure we rely on READ/WRITE_ONCE() of 64bit variables on 32bit
targets to be sane anywhere else (we could be, I just dont know).
I suspect it all works as expected... but its a tad tricky.
^ permalink raw reply [flat|nested] 21+ messages in thread

* Re: [PATCH v5 1/4] sched/fair: add util_est on top of PELT
2018-03-07 12:26 ` Peter Zijlstra
@ 2018-03-07 15:16 ` Patrick Bellasi
0 siblings, 0 replies; 21+ messages in thread
From: Patrick Bellasi @ 2018-03-07 15:16 UTC (permalink / raw)
To: Peter Zijlstra
Cc: linux-kernel, linux-pm, Ingo Molnar, Rafael J . Wysocki,
Viresh Kumar, Vincent Guittot, Paul Turner, Dietmar Eggemann,
Morten Rasmussen, Juri Lelli, Todd Kjos, Joel Fernandes,
Steve Muckle
On 07-Mar 13:26, Peter Zijlstra wrote:
> On Wed, Mar 07, 2018 at 11:47:11AM +0000, Patrick Bellasi wrote:
> > On 06-Mar 20:02, Peter Zijlstra wrote:
> > > On Thu, Feb 22, 2018 at 05:01:50PM +0000, Patrick Bellasi wrote:
> > > > +struct util_est {
> > > > + unsigned int enqueued;
> > > > + unsigned int ewma;
> > > > +#define UTIL_EST_WEIGHT_SHIFT 2
> > > > +};
> > >
> > > > + ue = READ_ONCE(p->se.avg.util_est);
> > >
> > > > + WRITE_ONCE(p->se.avg.util_est, ue);
> > >
> > > That is actually quite dodgy... and relies on the fact that we have the
> > > 8 byte case in __write_once_size() and __read_once_size()
> > > unconditionally. It then further relies on the compiler DTRT for 32bit
> > > platforms, which is generating 2 32bit loads/stores.
> > >
> > > The advantage is of course that it will use single u64 loads/stores
> > > where available.
> >
> > Yes, that's mainly an "optimization" for 64bit targets... but perhaps
> > the benefits are negligible.
> >
> > Would you prefer to keep the generated code more "under control" by
> > using two {READ,WRITE}_ONCEs?
Any specific preference on this previous point?
> > IMO here we can also go with just the WRITE_ONCEs. I don't see a case
> > for the compiler to mangle the loads/stores, while the WRITE_ONCEs are
> > still required to sync with non rq-lock serialized code.
> > But... maybe I'm missing something... ?
>
> I'm not sure we rely on READ/WRITE_ONCE() of 64bit variables on 32bit
> targets to be sane anywhere else (we could be, I just dont know).
My understanding is that, since here we are in an rq-lock protected
section, and these vars can be written only in this section, the load
is a dependency for the store and the compiler cannot screw up...
> I suspect it all works as expected... but its a tad tricky.
Then let's keep them for the time being... meanwhile I'll try to get
some more "internal" feedback before the next posting.
--
#include <best/regards.h>
Patrick Bellasi
^ permalink raw reply [flat|nested] 21+ messages in thread