linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Peter Zijlstra <peterz@infradead.org>
To: mingo@kernel.org, vincent.guittot@linaro.org
Cc: linux-kernel@vger.kernel.org, peterz@infradead.org,
	juri.lelli@redhat.com, dietmar.eggemann@arm.com,
	rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de,
	bristot@redhat.com, corbet@lwn.net, qyousef@layalina.io,
	chris.hyser@oracle.com, patrick.bellasi@matbug.net,
	pjt@google.com, pavel@ucw.cz, qperret@google.com,
	tim.c.chen@linux.intel.com, joshdon@google.com, timj@gnu.org,
	kprateek.nayak@amd.com, yu.c.chen@intel.com,
	youssefesmat@chromium.org, joel@joelfernandes.org, efault@gmx.de
Subject: [PATCH 04/17] sched/fair: Add avg_vruntime
Date: Tue, 28 Mar 2023 11:26:26 +0200	[thread overview]
Message-ID: <20230328110353.853385546@infradead.org> (raw)
In-Reply-To: 20230328092622.062917921@infradead.org

In order to move to an eligibility based scheduling policy it is
needed to have a better approximation of the ideal scheduler.

Specifically, for a virtual time weighted fair queueing based
scheduler the ideal scheduler will be the weighted average of the
individual virtual runtimes (math in the comment).

As such, compute the weighted average to approximate the ideal
scheduler -- note that the approximation is in the individual task
behaviour, which isn't strictly conformant.

Specifically consider adding a task with a vruntime left of center, in
this case the average will move backwards in time -- something the
ideal scheduler would of course never do.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/debug.c |   32 ++++++--------
 kernel/sched/fair.c  |  111 +++++++++++++++++++++++++++++++++++++++++++++++++--
 kernel/sched/sched.h |    5 ++
 3 files changed, 128 insertions(+), 20 deletions(-)

--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -580,10 +580,9 @@ static void print_rq(struct seq_file *m,
 
 void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 {
-	s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
-		spread, rq0_min_vruntime, spread0;
+	s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, spread;
+	struct sched_entity *last, *first;
 	struct rq *rq = cpu_rq(cpu);
-	struct sched_entity *last;
 	unsigned long flags;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -597,26 +596,25 @@ void print_cfs_rq(struct seq_file *m, in
 			SPLIT_NS(cfs_rq->exec_clock));
 
 	raw_spin_rq_lock_irqsave(rq, flags);
-	if (rb_first_cached(&cfs_rq->tasks_timeline))
-		MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
+	first = __pick_first_entity(cfs_rq);
+	if (first)
+		left_vruntime = first->vruntime;
 	last = __pick_last_entity(cfs_rq);
 	if (last)
-		max_vruntime = last->vruntime;
+		right_vruntime = last->vruntime;
 	min_vruntime = cfs_rq->min_vruntime;
-	rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
 	raw_spin_rq_unlock_irqrestore(rq, flags);
-	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "MIN_vruntime",
-			SPLIT_NS(MIN_vruntime));
+
+	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "left_vruntime",
+			SPLIT_NS(left_vruntime));
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "min_vruntime",
 			SPLIT_NS(min_vruntime));
-	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "max_vruntime",
-			SPLIT_NS(max_vruntime));
-	spread = max_vruntime - MIN_vruntime;
-	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "spread",
-			SPLIT_NS(spread));
-	spread0 = min_vruntime - rq0_min_vruntime;
-	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "spread0",
-			SPLIT_NS(spread0));
+	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "avg_vruntime",
+			SPLIT_NS(avg_vruntime(cfs_rq)));
+	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "right_vruntime",
+			SPLIT_NS(right_vruntime));
+	spread = right_vruntime - left_vruntime;
+	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread));
 	SEQ_printf(m, "  .%-30s: %d\n", "nr_spread_over",
 			cfs_rq->nr_spread_over);
 	SEQ_printf(m, "  .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -601,9 +601,108 @@ static inline bool entity_before(const s
 	return (s64)(a->vruntime - b->vruntime) < 0;
 }
 
+static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	return (s64)(se->vruntime - cfs_rq->min_vruntime);
+}
+
 #define __node_2_se(node) \
 	rb_entry((node), struct sched_entity, run_node)
 
+/*
+ * Compute virtual time from the per-task service numbers:
+ *
+ * Fair schedulers conserve lag: \Sum lag_i = 0
+ *
+ * lag_i = S - s_i = w_i * (V - v_i)
+ *
+ * \Sum lag_i = 0 -> \Sum w_i * (V - v_i) = V * \Sum w_i - \Sum w_i * v_i = 0
+ *
+ * From which we solve V:
+ *
+ *     \Sum v_i * w_i
+ * V = --------------
+ *        \Sum w_i
+ *
+ * However, since v_i is u64, and the multiplcation could easily overflow
+ * transform it into a relative form that uses smaller quantities:
+ *
+ * Substitute: v_i == (v_i - v) + v
+ *
+ *     \Sum ((v_i - v) + v) * w_i   \Sum (v_i - v) * w_i
+ * V = -------------------------- = -------------------- + v
+ *              \Sum w_i                   \Sum w_i
+ *
+ * min_vruntime = v
+ * avg_vruntime = \Sum (v_i - v) * w_i
+ * cfs_rq->load = \Sum w_i
+ *
+ * Since min_vruntime is a monotonic increasing variable that closely tracks
+ * the per-task service, these deltas: (v_i - v), will be in the order of the
+ * maximal (virtual) lag induced in the system due to quantisation.
+ */
+static void
+avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	unsigned long weight = scale_load_down(se->load.weight);
+	s64 key = entity_key(cfs_rq, se);
+
+	cfs_rq->avg_vruntime += key * weight;
+	cfs_rq->avg_load += weight;
+}
+
+static void
+avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	unsigned long weight = scale_load_down(se->load.weight);
+	s64 key = entity_key(cfs_rq, se);
+
+	cfs_rq->avg_vruntime -= key * weight;
+	cfs_rq->avg_load -= weight;
+}
+
+static inline
+void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
+{
+	/*
+	 * v' = v + d ==> avg_vruntime' = avg_runtime - d*avg_load
+	 */
+	cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
+}
+
+u64 avg_vruntime(struct cfs_rq *cfs_rq)
+{
+	struct sched_entity *curr = cfs_rq->curr;
+	s64 avg = cfs_rq->avg_vruntime;
+	long load = cfs_rq->avg_load;
+
+	if (curr && curr->on_rq) {
+		unsigned long weight = scale_load_down(curr->load.weight);
+
+		avg += entity_key(cfs_rq, curr) * weight;
+		load += weight;
+	}
+
+	if (load)
+		avg = div_s64(avg, load);
+
+	return cfs_rq->min_vruntime + avg;
+}
+
+static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
+{
+	u64 min_vruntime = cfs_rq->min_vruntime;
+	/*
+	 * open coded max_vruntime() to allow updating avg_vruntime
+	 */
+	s64 delta = (s64)(vruntime - min_vruntime);
+	if (delta > 0) {
+		avg_vruntime_update(cfs_rq, delta);
+		min_vruntime = vruntime;
+	}
+	return min_vruntime;
+}
+
 static void update_min_vruntime(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *curr = cfs_rq->curr;
@@ -629,7 +728,7 @@ static void update_min_vruntime(struct c
 
 	/* ensure we never gain time by being placed backwards. */
 	u64_u32_store(cfs_rq->min_vruntime,
-		      max_vruntime(cfs_rq->min_vruntime, vruntime));
+		      __update_min_vruntime(cfs_rq, vruntime));
 }
 
 static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
@@ -642,12 +741,14 @@ static inline bool __entity_less(struct
  */
 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
+	avg_vruntime_add(cfs_rq, se);
 	rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
 }
 
 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
+	avg_vruntime_sub(cfs_rq, se);
 }
 
 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
@@ -3330,6 +3431,8 @@ static void reweight_entity(struct cfs_r
 		/* commit outstanding execution time */
 		if (cfs_rq->curr == se)
 			update_curr(cfs_rq);
+		else
+			avg_vruntime_sub(cfs_rq, se);
 		update_load_sub(&cfs_rq->load, se->load.weight);
 	}
 	dequeue_load_avg(cfs_rq, se);
@@ -3345,9 +3448,11 @@ static void reweight_entity(struct cfs_r
 #endif
 
 	enqueue_load_avg(cfs_rq, se);
-	if (se->on_rq)
+	if (se->on_rq) {
 		update_load_add(&cfs_rq->load, se->load.weight);
-
+		if (cfs_rq->curr != se)
+			avg_vruntime_add(cfs_rq, se);
+	}
 }
 
 void reweight_task(struct task_struct *p, int prio)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -558,6 +558,9 @@ struct cfs_rq {
 	unsigned int		idle_nr_running;   /* SCHED_IDLE */
 	unsigned int		idle_h_nr_running; /* SCHED_IDLE */
 
+	s64			avg_vruntime;
+	u64			avg_load;
+
 	u64			exec_clock;
 	u64			min_vruntime;
 #ifdef CONFIG_SCHED_CORE
@@ -3312,4 +3315,6 @@ static inline void switch_mm_cid(struct
 static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { }
 #endif
 
+extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
+
 #endif /* _KERNEL_SCHED_SCHED_H */



  parent reply	other threads:[~2023-03-28 11:08 UTC|newest]

Thread overview: 55+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-03-28  9:26 [PATCH 00/17] sched: EEVDF using latency-nice Peter Zijlstra
2023-03-28  9:26 ` [PATCH 01/17] sched: Introduce latency-nice as a per-task attribute Peter Zijlstra
2023-03-28  9:26 ` [PATCH 02/17] sched/fair: Add latency_offset Peter Zijlstra
2023-03-28  9:26 ` [PATCH 03/17] sched/fair: Add sched group latency support Peter Zijlstra
2023-03-28  9:26 ` Peter Zijlstra [this message]
2023-03-28 23:57   ` [PATCH 04/17] sched/fair: Add avg_vruntime Josh Don
2023-03-29  7:50     ` Peter Zijlstra
2023-04-05 19:13       ` Peter Zijlstra
2023-03-28  9:26 ` [PATCH 05/17] sched/fair: Remove START_DEBIT Peter Zijlstra
2023-03-28  9:26 ` [PATCH 06/17] sched/fair: Add lag based placement Peter Zijlstra
2023-04-03  9:18   ` Chen Yu
2023-04-05  9:47     ` Peter Zijlstra
2023-04-06  3:03       ` Chen Yu
2023-04-13 15:42       ` Chen Yu
2023-04-13 15:55         ` Chen Yu
2023-03-28  9:26 ` [PATCH 07/17] rbtree: Add rb_add_augmented_cached() helper Peter Zijlstra
2023-03-28  9:26 ` [PATCH 08/17] sched/fair: Implement an EEVDF like policy Peter Zijlstra
2023-03-29  1:26   ` Josh Don
2023-03-29  8:02     ` Peter Zijlstra
2023-03-29  8:06     ` Peter Zijlstra
2023-03-29  8:22       ` Peter Zijlstra
2023-03-29 18:48         ` Josh Don
2023-03-29  8:12     ` Peter Zijlstra
2023-03-29 18:54       ` Josh Don
2023-03-29  8:18     ` Peter Zijlstra
2023-03-29 14:35   ` Vincent Guittot
2023-03-30  8:01     ` Peter Zijlstra
2023-03-30 17:05       ` Vincent Guittot
2023-04-04 12:00         ` Peter Zijlstra
2023-03-28  9:26 ` [PATCH 09/17] sched: Commit to lag based placement Peter Zijlstra
2023-03-28  9:26 ` [PATCH 10/17] sched/smp: Use lag to simplify cross-runqueue placement Peter Zijlstra
2023-03-28  9:26 ` [PATCH 11/17] sched: Commit to EEVDF Peter Zijlstra
2023-03-28  9:26 ` [PATCH 12/17] sched/debug: Rename min_granularity to base_slice Peter Zijlstra
2023-03-28  9:26 ` [PATCH 13/17] sched: Merge latency_offset into slice Peter Zijlstra
2023-03-28  9:26 ` [PATCH 14/17] sched/eevdf: Better handle mixed slice length Peter Zijlstra
2023-03-31 15:26   ` Vincent Guittot
2023-04-04  9:29     ` Peter Zijlstra
2023-04-04 13:50       ` Joel Fernandes
2023-04-05  5:41         ` Mike Galbraith
2023-04-05  8:35         ` Peter Zijlstra
2023-04-05 20:05           ` Joel Fernandes
2023-04-14 11:18             ` Phil Auld
2023-04-16  5:10               ` Joel Fernandes
     [not found]   ` <20230401232355.336-1-hdanton@sina.com>
2023-04-02  2:40     ` Mike Galbraith
2023-03-28  9:26 ` [PATCH 15/17] [RFC] sched/eevdf: Sleeper bonus Peter Zijlstra
2023-03-29  9:10   ` Mike Galbraith
2023-03-28  9:26 ` [PATCH 16/17] [RFC] sched/eevdf: Minimal vavg option Peter Zijlstra
2023-03-28  9:26 ` [PATCH 17/17] [DEBUG] sched/eevdf: Debug / validation crud Peter Zijlstra
2023-04-03  7:42 ` [PATCH 00/17] sched: EEVDF using latency-nice Shrikanth Hegde
2023-04-10  3:13 ` David Vernet
2023-04-11  2:09   ` David Vernet
     [not found] ` <20230410082307.1327-1-hdanton@sina.com>
2023-04-11 10:15   ` Mike Galbraith
     [not found]   ` <20230411133333.1790-1-hdanton@sina.com>
2023-04-11 14:56     ` Mike Galbraith
     [not found]     ` <20230412025042.1413-1-hdanton@sina.com>
2023-04-12  4:05       ` Mike Galbraith
2023-04-25 12:32 ` Phil Auld

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230328110353.853385546@infradead.org \
    --to=peterz@infradead.org \
    --cc=bristot@redhat.com \
    --cc=bsegall@google.com \
    --cc=chris.hyser@oracle.com \
    --cc=corbet@lwn.net \
    --cc=dietmar.eggemann@arm.com \
    --cc=efault@gmx.de \
    --cc=joel@joelfernandes.org \
    --cc=joshdon@google.com \
    --cc=juri.lelli@redhat.com \
    --cc=kprateek.nayak@amd.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mgorman@suse.de \
    --cc=mingo@kernel.org \
    --cc=patrick.bellasi@matbug.net \
    --cc=pavel@ucw.cz \
    --cc=pjt@google.com \
    --cc=qperret@google.com \
    --cc=qyousef@layalina.io \
    --cc=rostedt@goodmis.org \
    --cc=tim.c.chen@linux.intel.com \
    --cc=timj@gnu.org \
    --cc=vincent.guittot@linaro.org \
    --cc=youssefesmat@chromium.org \
    --cc=yu.c.chen@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).