* [RFC PATCH 4/9] sched/fair: Rename variable names for sched averages
From: Yuyang Du @ 2016-05-15 18:59 UTC
To: peterz, mingo, linux-kernel
Cc: bsegall, pjt, morten.rasmussen, vincent.guittot, dietmar.eggemann,
juri.lelli, Yuyang Du
The names used for the sched averages (including load_avg and util_avg)
have been added to and changed over the past couple of years, and some of
them are confusing, especially to people reading the code for the first
time. This patch attempts to make the names more self-explanatory, and
updates some comments along the way.
The renames are listed as follows:
- update_load_avg() to update_sched_avg()
- enqueue_entity_load_avg() to enqueue_entity_sched_avg()
- dequeue_entity_load_avg() to dequeue_entity_sched_avg()
- detach_entity_load_avg() to detach_entity_sched_avg()
- attach_entity_load_avg() to attach_entity_sched_avg()
- remove_entity_load_avg() to remove_entity_sched_avg()
- LOAD_AVG_PERIOD to SCHED_AVG_HALFLIFE
- LOAD_AVG_MAX_N to SCHED_AVG_MAX_N
- LOAD_AVG_MAX to SCHED_AVG_MAX
- runnable_avg_yN_sum[] to __accumulated_sum_N[]
- runnable_avg_yN_inv[] to __decay_inv_multiply_N[]
- __compute_runnable_contrib() to __accumulate_sum()
- decay_load() to __decay_sum()
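For reference, all of the renamed constants above follow from the same
half-life choice; as a rough sketch (not part of this patch):

	y^32 = 0.5       =>  y = 0.5^(1/32) ~= 0.97857
	SCHED_AVG_MAX   ~=  1024 * (1 + y + y^2 + ...) = 1024/(1 - y) ~= 47.8k
	                    (47742 in the code, slightly lower because the table
	                    entries are floored)
	SCHED_AVG_MAX_N  =  number of full periods needed for the sum to
	                    saturate at SCHED_AVG_MAX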
Signed-off-by: Yuyang Du <yuyang.du@intel.com>
---
include/linux/sched.h | 2 +-
kernel/sched/fair.c | 219 +++++++++++++++++++++++++------------------------
kernel/sched/sched.h | 2 +-
3 files changed, 114 insertions(+), 109 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1b43b45..9710e2b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1221,7 +1221,7 @@ struct load_weight {
/*
* The load_avg/util_avg accumulates an infinite geometric series
- * (see __update_load_avg() in kernel/sched/fair.c).
+ * (see __update_sched_avg() in kernel/sched/fair.c).
*
* [load_avg definition]
*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 66fba3f..fddaa61 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -660,13 +660,15 @@ static int select_idle_sibling(struct task_struct *p, int cpu);
static unsigned long task_h_load(struct task_struct *p);
/*
- * We choose a half-life close to 1 scheduling period.
- * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
- * dependent on this value.
+ * Note: everything in sched average calculation, including
+ * __decay_inv_multiply_N, __accumulated_sum_N, __accumulated_sum_N32,
+ * SCHED_AVG_MAX, and SCHED_AVG_MAX_N, is dependent on and only on
+ * (1) exponential decay, (2) a period of 1024*1024ns (~1ms), and (3)
+ * a half-life of 32 periods.
*/
-#define LOAD_AVG_PERIOD 32
-#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
-#define LOAD_AVG_MAX_N 347 /* number of full periods to produce LOAD_AVG_MAX */
+#define SCHED_AVG_HALFLIFE 32 /* number of periods as a half-life */
+#define SCHED_AVG_MAX 47742 /* maximum possible sched avg */
+#define SCHED_AVG_MAX_N 345 /* number of full periods to produce SCHED_AVG_MAX */
/* Give new sched_entity start runnable values to heavy its load in infant time */
void init_entity_runnable_average(struct sched_entity *se)
@@ -681,7 +683,7 @@ void init_entity_runnable_average(struct sched_entity *se)
*/
sa->period_contrib = 1023;
sa->load_avg = scale_load_down(se->load.weight);
- sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
+ sa->load_sum = sa->load_avg * SCHED_AVG_MAX;
/*
* At this point, util_avg won't be used in select_task_rq_fair anyway
*/
@@ -731,7 +733,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
} else {
sa->util_avg = cap;
}
- sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+ sa->util_sum = sa->util_avg * SCHED_AVG_MAX;
}
}
@@ -1834,7 +1836,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
*period = now - p->last_task_numa_placement;
} else {
delta = p->se.avg.load_sum / p->se.load.weight;
- *period = LOAD_AVG_MAX;
+ *period = SCHED_AVG_MAX;
}
p->last_sum_exec_runtime = runtime;
@@ -2583,7 +2585,7 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
#ifdef CONFIG_SMP
/* Precomputed fixed inverse multiplies for multiplication by y^n */
-static const u32 runnable_avg_yN_inv[] = {
+static const u32 __decay_inv_multiply_N[] = {
0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
@@ -2596,7 +2598,7 @@ static const u32 runnable_avg_yN_inv[] = {
* Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
* over-estimates when re-combining.
*/
-static const u32 runnable_avg_yN_sum[] = {
+static const u32 __accumulated_sum_N[] = {
0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
@@ -2613,93 +2615,95 @@ static const u32 __accumulated_sum_N32[] = {
};
/*
- * Approximate:
- * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
+ * val * y^n, where y^m ~= 0.5
+ *
+ * n is the number of periods past; a period is ~1ms
+ * m is half-life in exponential decay; here it is SCHED_AVG_HALFLIFE=32.
*/
-static __always_inline u64 decay_load(u64 val, u64 n)
+static __always_inline u64 __decay_sum(u64 val, u64 n)
{
unsigned int local_n;
if (!n)
return val;
- else if (unlikely(n > LOAD_AVG_PERIOD * 63))
+ else if (unlikely(n > SCHED_AVG_HALFLIFE * 63))
return 0;
/* after bounds checking we can collapse to 32-bit */
local_n = n;
/*
- * As y^PERIOD = 1/2, we can combine
- * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
- * With a look-up table which covers y^n (n<PERIOD)
+ * As y^HALFLIFE = 1/2, we can combine
+ * y^n = 1/2^(n/HALFLIFE) * y^(n%HALFLIFE)
+ * With a look-up table which covers y^n (n<HALFLIFE)
*
- * To achieve constant time decay_load.
+ * To achieve constant time __decay_sum.
*/
- if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
- val >>= local_n / LOAD_AVG_PERIOD;
- local_n %= LOAD_AVG_PERIOD;
+ if (unlikely(local_n >= SCHED_AVG_HALFLIFE)) {
+ val >>= local_n / SCHED_AVG_HALFLIFE;
+ local_n %= SCHED_AVG_HALFLIFE;
}
- val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
+ val = mul_u64_u32_shr(val, __decay_inv_multiply_N[local_n], 32);
return val;
}
/*
- * For updates fully spanning n periods, the contribution to runnable
- * average will be: \Sum 1024*y^n
+ * For updates fully spanning n periods, the accumulated contribution
+ * will be: \Sum 1024*y^n.
*
- * We can compute this reasonably efficiently by combining:
- * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
+ * We can compute this efficiently by combining:
+ * y^32 = 1/2 with precomputed \Sum 1024*y^n (where n < 32)
*/
-static u32 __compute_runnable_contrib(u64 n)
+static u32 __accumulate_sum(u64 n)
{
u32 contrib = 0;
- if (likely(n <= LOAD_AVG_PERIOD))
- return runnable_avg_yN_sum[n];
- else if (unlikely(n >= LOAD_AVG_MAX_N))
- return LOAD_AVG_MAX;
+ if (likely(n <= SCHED_AVG_HALFLIFE))
+ return __accumulated_sum_N[n];
+ else if (unlikely(n >= SCHED_AVG_MAX_N))
+ return SCHED_AVG_MAX;
- /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
- contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
- n %= LOAD_AVG_PERIOD;
- contrib = decay_load(contrib, n);
- return contrib + runnable_avg_yN_sum[n];
+ /* Since n < SCHED_AVG_MAX_N, n/SCHED_AVG_HALFLIFE < 11 */
+ contrib = __accumulated_sum_N32[n/SCHED_AVG_HALFLIFE];
+ n %= SCHED_AVG_HALFLIFE;
+ contrib = __decay_sum(contrib, n);
+ return contrib + __accumulated_sum_N[n];
}
#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
/*
- * We can represent the historical contribution to runnable average as the
- * coefficients of a geometric series. To do this we sub-divide our runnable
- * history into segments of approximately 1ms (1024us); label the segment that
- * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
+ * We can represent the historical contribution to sched average as the
+ * coefficients of a geometric series. To do this we divide the history
+ * into segments of approximately 1ms (1024*1024ns); label the segment that
+ * occurred N*1024us ago p_N, with p_0 corresponding to the current period, e.g.
*
* [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
* p0 p1 p2
* (now) (~1ms ago) (~2ms ago)
*
- * Let u_i denote the fraction of p_i that the entity was runnable.
+ * Let u_i denote the fraction of p_i whose state (runnable/running) we count.
*
* We then designate the fractions u_i as our co-efficients, yielding the
- * following representation of historical load:
+ * following representation of a sched metric:
* u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
*
- * We choose y based on the with of a reasonably scheduling period, fixing:
- * y^32 = 0.5
+ * We choose y based on a half-life of 32 periods (which is ~32ms):
+ * y^32 = 0.5 => y = (0.5)^(1/32)
*
- * This means that the contribution to load ~32ms ago (u_32) will be weighted
- * approximately half as much as the contribution to load within the last ms
- * (u_0).
+ * where 32 is the number of periods after which a past period's contribution
+ * is halved. This means that a period's contribution is weighted only half
+ * as much as that of a period ~32ms more recent.
*
* When a period "rolls over" and we have new u_0`, multiplying the previous
* sum again by y is sufficient to update:
- * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
- * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
+ * avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
+ * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
*/
static __always_inline int
-__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
- unsigned long weight, int running, struct cfs_rq *cfs_rq)
+__update_sched_avg(u64 now, int cpu, struct sched_avg *sa,
+ unsigned long weight, int running, struct cfs_rq *cfs_rq)
{
u64 delta, scaled_delta, periods;
u32 contrib;
@@ -2759,15 +2763,15 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
periods = delta / 1024;
delta %= 1024;
- sa->load_sum = decay_load(sa->load_sum, periods + 1);
+ sa->load_sum = __decay_sum(sa->load_sum, periods + 1);
if (cfs_rq) {
cfs_rq->runnable_load_sum =
- decay_load(cfs_rq->runnable_load_sum, periods + 1);
+ __decay_sum(cfs_rq->runnable_load_sum, periods + 1);
}
- sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
+ sa->util_sum = __decay_sum((u64)(sa->util_sum), periods + 1);
/* Efficiently calculate \sum (1..n_period) 1024*y^i */
- contrib = __compute_runnable_contrib(periods);
+ contrib = __accumulate_sum(periods);
contrib = cap_scale(contrib, scale_freq);
if (weight) {
sa->load_sum += weight * contrib;
@@ -2791,12 +2795,12 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
sa->period_contrib += delta;
if (decayed) {
- sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+ sa->load_avg = div_u64(sa->load_sum, SCHED_AVG_MAX);
if (cfs_rq) {
cfs_rq->runnable_load_avg =
- div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
+ div_u64(cfs_rq->runnable_load_sum, SCHED_AVG_MAX);
}
- sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
+ sa->util_avg = sa->util_sum / SCHED_AVG_MAX;
}
return decayed;
@@ -2864,8 +2868,8 @@ void set_task_rq_fair(struct sched_entity *se,
p_last_update_time = prev->avg.last_update_time;
n_last_update_time = next->avg.last_update_time;
#endif
- __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
- &se->avg, 0, 0, NULL);
+ __update_sched_avg(p_last_update_time, cpu_of(rq_of(prev)),
+ &se->avg, 0, 0, NULL);
se->avg.last_update_time = n_last_update_time;
}
}
@@ -2906,7 +2910,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
static inline int
-update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
+update_cfs_rq_sched_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
{
struct sched_avg *sa = &cfs_rq->avg;
int decayed, removed_load = 0, removed_util = 0;
@@ -2914,18 +2918,18 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
if (atomic_long_read(&cfs_rq->removed_load_avg)) {
s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
sa->load_avg = max_t(long, sa->load_avg - r, 0);
- sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
+ sa->load_sum = max_t(s64, sa->load_sum - r * SCHED_AVG_MAX, 0);
removed_load = 1;
}
if (atomic_long_read(&cfs_rq->removed_util_avg)) {
long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
sa->util_avg = max_t(long, sa->util_avg - r, 0);
- sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
+ sa->util_sum = max_t(s32, sa->util_sum - r * SCHED_AVG_MAX, 0);
removed_util = 1;
}
- decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
+ decayed = __update_sched_avg(now, cpu_of(rq_of(cfs_rq)), sa,
scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
#ifndef CONFIG_64BIT
@@ -2940,7 +2944,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
}
/* Update task and its cfs_rq load average */
-static inline void update_load_avg(struct sched_entity *se, int update_tg)
+static inline void update_sched_avg(struct sched_entity *se, int update_tg)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 now = cfs_rq_clock_task(cfs_rq);
@@ -2951,15 +2955,15 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
* Track task load average for carrying it to new CPU after migrated, and
* track group sched_entity load average for task_h_load calc in migration
*/
- __update_load_avg(now, cpu, &se->avg,
- se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
+ __update_sched_avg(now, cpu, &se->avg,
+ se->on_rq * scale_load_down(se->load.weight),
+ cfs_rq->curr == se, NULL);
- if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg)
+ if (update_cfs_rq_sched_avg(now, cfs_rq, true) && update_tg)
update_tg_load_avg(cfs_rq, 0);
}
-static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static void attach_entity_sched_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if (!sched_feat(ATTACH_AGE_LOAD))
goto skip_aging;
@@ -2969,8 +2973,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
* have aged the average right before clearing @last_update_time.
*/
if (se->avg.last_update_time) {
- __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
- &se->avg, 0, 0, NULL);
+ __update_sched_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
+ &se->avg, 0, 0, NULL);
/*
* XXX: we could have just aged the entire load away if we've been
@@ -2988,11 +2992,11 @@ skip_aging:
cfs_rq_util_change(cfs_rq);
}
-static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static void detach_entity_sched_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
- &se->avg, se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
+ __update_sched_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
+ &se->avg, se->on_rq * scale_load_down(se->load.weight),
+ cfs_rq->curr == se, NULL);
cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
@@ -3004,7 +3008,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
/* Add the load generated by se into cfs_rq's load average */
static inline void
-enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+enqueue_entity_sched_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct sched_avg *sa = &se->avg;
u64 now = cfs_rq_clock_task(cfs_rq);
@@ -3012,18 +3016,18 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
migrated = !sa->last_update_time;
if (!migrated) {
- __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
- se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
+ __update_sched_avg(now, cpu_of(rq_of(cfs_rq)), sa,
+ se->on_rq * scale_load_down(se->load.weight),
+ cfs_rq->curr == se, NULL);
}
- decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated);
+ decayed = update_cfs_rq_sched_avg(now, cfs_rq, !migrated);
cfs_rq->runnable_load_avg += sa->load_avg;
cfs_rq->runnable_load_sum += sa->load_sum;
if (migrated)
- attach_entity_load_avg(cfs_rq, se);
+ attach_entity_sched_avg(cfs_rq, se);
if (decayed || migrated)
update_tg_load_avg(cfs_rq, 0);
@@ -3031,9 +3035,9 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
/* Remove the runnable load generated by se from cfs_rq's runnable load average */
static inline void
-dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+dequeue_entity_sched_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- update_load_avg(se, 1);
+ update_sched_avg(se, 1);
cfs_rq->runnable_load_avg =
max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
@@ -3066,7 +3070,7 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
* Task first catches up with cfs_rq, and then subtract
* itself from the cfs_rq (task must be off the queue now).
*/
-static void remove_entity_load_avg(struct sched_entity *se)
+static void remove_entity_sched_avg(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 last_update_time;
@@ -3080,7 +3084,8 @@ static void remove_entity_load_avg(struct sched_entity *se)
last_update_time = cfs_rq_last_update_time(cfs_rq);
- __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+ __update_sched_avg(last_update_time, cpu_of(rq_of(cfs_rq)),
+ &se->avg, 0, 0, NULL);
atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
}
@@ -3099,7 +3104,7 @@ static int idle_balance(struct rq *this_rq);
#else /* CONFIG_SMP */
-static inline void update_load_avg(struct sched_entity *se, int not_used)
+static inline void update_sched_avg(struct sched_entity *se, int not_used)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
struct rq *rq = rq_of(cfs_rq);
@@ -3108,15 +3113,15 @@ static inline void update_load_avg(struct sched_entity *se, int not_used)
}
static inline void
-enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+enqueue_entity_sched_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void
-dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
-static inline void remove_entity_load_avg(struct sched_entity *se) {}
+dequeue_entity_sched_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+static inline void remove_entity_sched_avg(struct sched_entity *se) {}
static inline void
-attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+attach_entity_sched_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void
-detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+detach_entity_sched_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline int idle_balance(struct rq *rq)
{
@@ -3309,7 +3314,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (renorm && !curr)
se->vruntime += cfs_rq->min_vruntime;
- enqueue_entity_load_avg(cfs_rq, se);
+ enqueue_entity_sched_avg(cfs_rq, se);
account_entity_enqueue(cfs_rq, se);
update_cfs_shares(cfs_rq);
@@ -3388,7 +3393,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
- dequeue_entity_load_avg(cfs_rq, se);
+ dequeue_entity_sched_avg(cfs_rq, se);
if (schedstat_enabled())
update_stats_dequeue(cfs_rq, se, flags);
@@ -3468,7 +3473,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
if (schedstat_enabled())
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
- update_load_avg(se, 1);
+ update_sched_avg(se, 1);
}
update_stats_curr_start(cfs_rq, se);
@@ -3572,7 +3577,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */
- update_load_avg(prev, 0);
+ update_sched_avg(prev, 0);
}
cfs_rq->curr = NULL;
}
@@ -3588,7 +3593,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
/*
* Ensure that runnable average is periodically updated.
*/
- update_load_avg(curr, 1);
+ update_sched_avg(curr, 1);
update_cfs_shares(cfs_rq);
#ifdef CONFIG_SCHED_HRTICK
@@ -4461,7 +4466,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
- update_load_avg(se, 1);
+ update_sched_avg(se, 1);
update_cfs_shares(cfs_rq);
}
@@ -4521,7 +4526,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
- update_load_avg(se, 1);
+ update_sched_avg(se, 1);
update_cfs_shares(cfs_rq);
}
@@ -5418,7 +5423,7 @@ static void migrate_task_rq_fair(struct task_struct *p)
* will result in the wakee task is less decayed, but giving the wakee more
* load sounds not bad.
*/
- remove_entity_load_avg(&p->se);
+ remove_entity_sched_avg(&p->se);
/* Tell new CPU we are migrated */
p->se.avg.last_update_time = 0;
@@ -5429,7 +5434,7 @@ static void migrate_task_rq_fair(struct task_struct *p)
static void task_dead_fair(struct task_struct *p)
{
- remove_entity_load_avg(&p->se);
+ remove_entity_sched_avg(&p->se);
}
#endif /* CONFIG_SMP */
@@ -6310,7 +6315,7 @@ static void update_blocked_averages(int cpu)
if (throttled_hierarchy(cfs_rq))
continue;
- if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
+ if (update_cfs_rq_sched_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
update_tg_load_avg(cfs_rq, 0);
}
raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -6371,7 +6376,7 @@ static inline void update_blocked_averages(int cpu)
raw_spin_lock_irqsave(&rq->lock, flags);
update_rq_clock(rq);
- update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
+ update_cfs_rq_sched_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -8388,7 +8393,7 @@ static void detach_task_cfs_rq(struct task_struct *p)
}
/* Catch up with the cfs_rq and remove our load when we leave */
- detach_entity_load_avg(cfs_rq, se);
+ detach_entity_sched_avg(cfs_rq, se);
}
static void attach_task_cfs_rq(struct task_struct *p)
@@ -8405,7 +8410,7 @@ static void attach_task_cfs_rq(struct task_struct *p)
#endif
/* Synchronize task with its cfs_rq */
- attach_entity_load_avg(cfs_rq, se);
+ attach_entity_sched_avg(cfs_rq, se);
if (!vruntime_normalized(p))
se->vruntime += cfs_rq->min_vruntime;
@@ -8544,7 +8549,7 @@ void unregister_fair_sched_group(struct task_group *tg)
for_each_possible_cpu(cpu) {
if (tg->se[cpu])
- remove_entity_load_avg(tg->se[cpu]);
+ remove_entity_sched_avg(tg->se[cpu]);
/*
* Only empty task groups can be destroyed; so we can speculatively
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e51145e..3432985 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1764,7 +1764,7 @@ DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
* @max: Utilization ceiling.
*
* This function is called by the scheduler on every invocation of
- * update_load_avg() on the CPU whose utilization is being updated.
+ * update_sched_avg() on the CPU whose utilization is being updated.
*
* It can only be called from RCU-sched read-side critical sections.
*/
--
1.7.9.5
* [RFC PATCH 7/9] sched/fair: Optimize __update_sched_avg()
From: Yuyang Du @ 2016-05-15 18:59 UTC
To: peterz, mingo, linux-kernel
Cc: bsegall, pjt, morten.rasmussen, vincent.guittot, dietmar.eggemann,
juri.lelli, Yuyang Du
__update_sched_avg() has these steps:
1. add the remainder of the last incomplete period
2. decay old sum
3. accumulate new sum in full periods since last_update_time
4. add the current incomplete period
5. update averages
Previously, steps 1, 3, and 4 were each computed separately, which made
the code ugly and added overhead.
Illustration:
          c1          c3            c4
          ^           ^             ^
          |           |             |
        |<->|<----------------->|<--->|
... |---x---|------| ... |------|-----x (now)
c1, c3, and c4 are the accumulated (and meanwhile decayed) contributions
of steps 1, 3, and 4 respectively, placed on the time line above.
With them, the accumulated contribution to load_sum, for example, is:
contrib = c1 * weight * freq_scaled;
contrib += c3 * weight * freq_scaled;
contrib += c4 * weight * freq_scaled;
Obviously, we can optimize the above by:
contrib = c1 + c3 + c4;
contrib *= weight * freq_scaled;
One catch is that c1 must now be decayed on its own, instead of being
decayed together with the old sum in step 2. Performance-wise the two
approaches should be on par, comparing one __decay_sum() call against a
round of contrib accumulation; overall we still save tens of instructions,
even though the gain may not show up in benchmarks. Code-wise, this
approach is clearly much simpler.
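To make the factoring concrete, here is a rough sketch in terms of the
helpers this series introduces (illustrative only, not the literal kernel
code; d1 and d4 are hypothetical names for the two partial-period deltas,
and __accumulate_sum() is the single-argument form as it stands before
this patch):

	/*
	 * d1      = 1024 - period_contrib   (rest of the last incomplete period)
	 * periods = period boundaries crossed since last_update_time
	 * d4      = contribution of the current, still incomplete period
	 */
	c1 = __decay_sum(d1, periods);       /* step 1: decayed over the gap */
	c3 = __accumulate_sum(periods - 1);  /* step 3: \Sum 1024*y^i over the full periods */
	c4 = d4;                             /* step 4: newest segment, not decayed */

	contrib = cap_scale(c1 + c3 + c4, scale_freq);
	sa->load_sum += weight * contrib;    /* one scale and multiply instead of three */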
Code size (allyesconfig):
Before: kernel/sched/built-in.o 1404840
After : kernel/sched/built-in.o 1404016 (-824B)
Signed-off-by: Yuyang Du <yuyang.du@intel.com>
---
kernel/sched/fair.c | 180 +++++++++++++++++++++++++--------------------------
1 file changed, 89 insertions(+), 91 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1bbac7e..e1cde19 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -668,7 +668,7 @@ static unsigned long task_h_load(struct task_struct *p);
*/
#define SCHED_AVG_HALFLIFE 32 /* number of periods as a half-life */
#define SCHED_AVG_MAX 47742 /* maximum possible sched avg */
-#define SCHED_AVG_MAX_N 345 /* number of full periods to produce SCHED_AVG_MAX */
+#define SCHED_AVG_MAX_N 347 /* number of full periods to produce SCHED_AVG_MAX */
/* Give new sched_entity start runnable values to heavy its load in infant time */
void init_entity_runnable_average(struct sched_entity *se)
@@ -2652,24 +2652,83 @@ static __always_inline u64 __decay_sum(u64 val, u32 n)
* We can compute this efficiently by combining:
* y^32 = 1/2 with precomputed \Sum 1024*y^n (where n < 32)
*/
-static __always_inline u32 __accumulate_sum(u32 n)
+static __always_inline u32
+__accumulate_sum(u32 periods, u32 period_contrib, u32 remainder)
{
- u32 contrib = 0;
+ u32 contrib;
- if (likely(n <= SCHED_AVG_HALFLIFE))
- return __accumulated_sum_N[n];
- else if (unlikely(n >= SCHED_AVG_MAX_N))
+ if (!periods)
+ return remainder - period_contrib;
+
+ if (unlikely(periods >= SCHED_AVG_MAX_N))
return SCHED_AVG_MAX;
- /* Since n < SCHED_AVG_MAX_N, n/SCHED_AVG_HALFLIFE < 11 */
- contrib = __accumulated_sum_N32[n/SCHED_AVG_HALFLIFE];
- n %= SCHED_AVG_HALFLIFE;
- contrib = __decay_sum(contrib, n);
- return contrib + __accumulated_sum_N[n];
+ remainder += __decay_sum((u64)(1024 - period_contrib), periods);
+
+ periods -= 1;
+ if (likely(periods <= SCHED_AVG_HALFLIFE))
+ contrib = __accumulated_sum_N[periods];
+ else {
+ contrib = __accumulated_sum_N32[periods/SCHED_AVG_HALFLIFE];
+ periods %= SCHED_AVG_HALFLIFE;
+ contrib = __decay_sum(contrib, periods);
+ contrib += __accumulated_sum_N[periods];
+ }
+
+ return contrib + remainder;
}
#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+static __always_inline u32 accumulate_sum(u64 delta, struct sched_avg *sa,
+ struct cfs_rq *cfs_rq, int cpu, unsigned long weight, int running)
+{
+ u32 contrib, periods;
+ unsigned long scale_freq, scale_cpu;
+
+ scale_freq = arch_scale_freq_capacity(NULL, cpu);
+ scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
+ delta += sa->period_contrib;
+ periods = delta >> 10; /* A period is 1024us (~1ms) */
+
+ /*
+ * Accumulating *_sum has two steps.
+ *
+ * Step 1: decay old *_sum if we crossed period boundaries.
+ */
+ if (periods) {
+ sa->load_sum = __decay_sum(sa->load_sum, periods);
+ if (cfs_rq) {
+ cfs_rq->runnable_load_sum =
+ __decay_sum(cfs_rq->runnable_load_sum, periods);
+ }
+ sa->util_sum = __decay_sum((u64)(sa->util_sum), periods);
+ }
+
+ /*
+ * Step 2: accumulate new *_sum since last_update_time. This at most has
+ * three parts (at least one part): (1) remainder of incomplete last
+ * period, (2) full periods since (1), and (3) incomplete current period.
+ *
+ * Fortunately, we can (and should) do all these three at once.
+ */
+ delta %= 1024;
+ contrib = __accumulate_sum(periods, sa->period_contrib, delta);
+ sa->period_contrib = delta;
+
+ contrib = cap_scale(contrib, scale_freq);
+ if (weight) {
+ sa->load_sum += weight * contrib;
+ if (cfs_rq)
+ cfs_rq->runnable_load_sum += weight * contrib;
+ }
+ if (running)
+ sa->util_sum += contrib * scale_cpu;
+
+ return periods;
+}
+
/*
* We can represent the historical contribution to sched average as the
* coefficients of a geometric series. To do this we divide the history
@@ -2702,10 +2761,7 @@ static __always_inline int
__update_sched_avg(u64 now, int cpu, struct sched_avg *sa,
unsigned long weight, int running, struct cfs_rq *cfs_rq)
{
- u64 delta, scaled_delta;
- u32 contrib, periods;
- unsigned int delta_w, scaled_delta_w, decayed = 0;
- unsigned long scale_freq, scale_cpu;
+ u64 delta;
delta = now - sa->last_update_time;
/*
@@ -2726,85 +2782,27 @@ __update_sched_avg(u64 now, int cpu, struct sched_avg *sa,
return 0;
sa->last_update_time = now;
- scale_freq = arch_scale_freq_capacity(NULL, cpu);
- scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
-
- /* delta_w is the amount already accumulated against our next period */
- delta_w = sa->period_contrib;
- if (delta + delta_w >= 1024) {
- decayed = 1;
-
- /* how much left for next period will start over, we don't know yet */
- sa->period_contrib = 0;
-
- /*
- * Now that we know we're crossing a period boundary, figure
- * out how much from delta we need to complete the current
- * period and accrue it.
- */
- delta_w = 1024 - delta_w;
- scaled_delta_w = cap_scale(delta_w, scale_freq);
- if (weight) {
- sa->load_sum += weight * scaled_delta_w;
- if (cfs_rq) {
- cfs_rq->runnable_load_sum +=
- weight * scaled_delta_w;
- }
- }
- if (running)
- sa->util_sum += scaled_delta_w * scale_cpu;
-
- delta -= delta_w;
-
- /*
- * Figure out how many additional periods this update spans.
- * A period is 1024*1024ns or ~1ms, so a 32bit integer can hold
- * approximately a maximum of 49 (=2^32/1000/3600/24) days.
- */
- periods = delta / 1024;
- delta %= 1024;
-
- sa->load_sum = __decay_sum(sa->load_sum, periods + 1);
- if (cfs_rq) {
- cfs_rq->runnable_load_sum =
- __decay_sum(cfs_rq->runnable_load_sum, periods + 1);
- }
- sa->util_sum = __decay_sum((u64)(sa->util_sum), periods + 1);
-
- /* Efficiently calculate \sum (1..n_period) 1024*y^i */
- contrib = __accumulate_sum(periods);
- contrib = cap_scale(contrib, scale_freq);
- if (weight) {
- sa->load_sum += weight * contrib;
- if (cfs_rq)
- cfs_rq->runnable_load_sum += weight * contrib;
- }
- if (running)
- sa->util_sum += contrib * scale_cpu;
- }
-
- /* Remainder of delta accrued against u_0` */
- scaled_delta = cap_scale(delta, scale_freq);
- if (weight) {
- sa->load_sum += weight * scaled_delta;
- if (cfs_rq)
- cfs_rq->runnable_load_sum += weight * scaled_delta;
- }
- if (running)
- sa->util_sum += scaled_delta * scale_cpu;
-
- sa->period_contrib += delta;
+ /*
+ * Now we know we crossed measurement unit boundaries. The *_avg
+ * accrues by two steps:
+ *
+ * Step 1: accumulate *_sum since last_update_time. If we haven't
+ * crossed period boundaries, finish.
+ */
+ if (!accumulate_sum(delta, sa, cfs_rq, cpu, weight, running))
+ return 0;
- if (decayed) {
- sa->load_avg = div_u64(sa->load_sum, SCHED_AVG_MAX);
- if (cfs_rq) {
- cfs_rq->runnable_load_avg =
- div_u64(cfs_rq->runnable_load_sum, SCHED_AVG_MAX);
- }
- sa->util_avg = sa->util_sum / SCHED_AVG_MAX;
+ /*
+ * Step 2: update *_avg.
+ */
+ sa->load_avg = div_u64(sa->load_sum, SCHED_AVG_MAX);
+ if (cfs_rq) {
+ cfs_rq->runnable_load_avg =
+ div_u64(cfs_rq->runnable_load_sum, SCHED_AVG_MAX);
}
+ sa->util_avg = sa->util_sum / SCHED_AVG_MAX;
- return decayed;
+ return 1;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
--
1.7.9.5
* [RFC PATCH 8/9] sched/fair: Remove scale_load_down() for load_avg
From: Yuyang Du @ 2016-05-15 18:59 UTC
To: peterz, mingo, linux-kernel
Cc: bsegall, pjt, morten.rasmussen, vincent.guittot, dietmar.eggemann,
juri.lelli, Yuyang Du
Currently, load_avg = scale_load_down(load) * runnable%. The extra scaling
down of load does not make much sense, because load_avg is primarily THE
load and on top of that, we take runnable time into account.
We therefore remove scale_load_down() for load_avg. We do, however, need
to consider the overflow risk carefully when the load uses the higher
fixed-point range (2*SCHED_FIXEDPOINT_SHIFT). The only case in which this
change can introduce an overflow is a 64-bit kernel with that increased
range. There, the 64-bit load_sum can hold 4251057 (=2^64/47742/88761/1024)
entities with the highest load (=88761*1024) always runnable on one single
cfs_rq, which should be fine. Even if such an overflow did occur, the load
average would not be meaningful under that kind of load anyway, and once
the machine survives it, the averages will correct themselves very
quickly, in no more than ~2 seconds (=32ms*64).
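For the record, a quick check of the numbers quoted above (the same
arithmetic, just spelled out; not part of the patch):

	2^64 / (47742 * 88761)        ~= 4,353,082,796  (SCHED_FIXEDPOINT_SHIFT load range)
	2^64 / (47742 * 88761 * 1024) ~= 4,251,057      (2*SCHED_FIXEDPOINT_SHIFT load range)
	64 * 32ms                      = 2048ms ~= 2s   (64 half-lives for the sums to re-converge)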
Signed-off-by: Yuyang Du <yuyang.du@intel.com>
---
include/linux/sched.h | 19 ++++++++++++++-----
kernel/sched/fair.c | 11 +++++------
2 files changed, 19 insertions(+), 11 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9710e2b..aca6b6f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1225,7 +1225,7 @@ struct load_weight {
*
* [load_avg definition]
*
- * load_avg = runnable% * scale_load_down(load)
+ * load_avg = runnable% * load
*
* where runnable% is the time ratio that a sched_entity is runnable.
* For cfs_rq, it is the aggregated load_avg of all runnable and
@@ -1233,7 +1233,7 @@ struct load_weight {
*
* load_avg may also take frequency scaling into account:
*
- * load_avg = runnable% * scale_load_down(load) * freq%
+ * load_avg = runnable% * load * freq%
*
* where freq% is the CPU frequency normalized to the highest frequency.
*
@@ -1259,9 +1259,18 @@ struct load_weight {
*
* [Overflow issue]
*
- * The 64-bit load_sum can have 4353082796 (=2^64/47742/88761) entities
- * with the highest load (=88761), always runnable on a single cfs_rq,
- * and should not overflow as the number already hits PID_MAX_LIMIT.
+ * On 64bit kernel:
+ *
+ * When load has small fixed point range (SCHED_FIXEDPOINT_SHIFT), the
+ * 64bit load_sum can have 4353082796 (=2^64/47742/88761) tasks with
+ * the highest load (=88761) always runnable on a cfs_rq, we should
+ * not overflow as the number already hits PID_MAX_LIMIT.
+ *
+ * When load has large fixed point range (2*SCHED_FIXEDPOINT_SHIFT),
+ * the 64bit load_sum can have 4251057 (=2^64/47742/88761/1024) tasks
+ * with the highest load (=88761*1024) always runnable on ONE cfs_rq,
+ * we should be fine. Even if the overflow occurs at the end of day,
+ * at the time the load_avg won't be useful anyway in that situation.
*
* For all other cases (including 32-bit kernels), struct load_weight's
* weight will overflow first before we do, because:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e1cde19..88913b8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -682,7 +682,7 @@ void init_entity_runnable_average(struct sched_entity *se)
* will definitely be update (after enqueue).
*/
sa->period_contrib = 1023;
- sa->load_avg = scale_load_down(se->load.weight);
+ sa->load_avg = se->load.weight;
sa->load_sum = sa->load_avg * SCHED_AVG_MAX;
/*
* At this point, util_avg won't be used in select_task_rq_fair anyway
@@ -2929,7 +2929,7 @@ update_cfs_rq_sched_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
}
decayed = __update_sched_avg(now, cpu_of(rq_of(cfs_rq)), sa,
- scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
+ cfs_rq->load.weight, cfs_rq->curr != NULL, cfs_rq);
#ifndef CONFIG_64BIT
smp_wmb();
@@ -2954,8 +2954,7 @@ static inline void update_sched_avg(struct sched_entity *se, int update_tg)
* Track task load average for carrying it to new CPU after migrated, and
* track group sched_entity load average for task_h_load calc in migration
*/
- __update_sched_avg(now, cpu, &se->avg,
- se->on_rq * scale_load_down(se->load.weight),
+ __update_sched_avg(now, cpu, &se->avg, se->on_rq * se->load.weight,
cfs_rq->curr == se, NULL);
if (update_cfs_rq_sched_avg(now, cfs_rq, true) && update_tg)
@@ -2994,7 +2993,7 @@ skip_aging:
static void detach_entity_sched_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
__update_sched_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
- &se->avg, se->on_rq * scale_load_down(se->load.weight),
+ &se->avg, se->on_rq * se->load.weight,
cfs_rq->curr == se, NULL);
cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
@@ -3016,7 +3015,7 @@ enqueue_entity_sched_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
migrated = !sa->last_update_time;
if (!migrated) {
__update_sched_avg(now, cpu_of(rq_of(cfs_rq)), sa,
- se->on_rq * scale_load_down(se->load.weight),
+ se->on_rq * se->load.weight,
cfs_rq->curr == se, NULL);
}
--
1.7.9.5