Subject: [PATCH] RFC: sched: Rework task_sched_runtime to avoid calling update_rq_clock
From: John Stultz @ 2024-06-13  1:58 UTC
  To: LKML
  Cc: John Stultz, Thomas Gleixner, Frederic Weisbecker, Ingo Molnar,
	Peter Zijlstra, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Valentin Schneider, Qais Yousef,
	Joel Fernandes, kernel-team

I recently got a bug report that
clock_gettime(CLOCK_THREAD_CPUTIME_ID,...) had regressed between
5.10 and 6.1. It's not a huge regression in absolute time
(~30-40ns), but it is a >10% change.
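
For context, a minimal microbenchmark along these lines
(illustrative, not the exact reproducer from the bug report) is
enough to see the per-call delta:

  #include <stdio.h>
  #include <time.h>

  #define ITERS 1000000

  int main(void)
  {
          struct timespec ts, start, end;
          long long ns;
          int i;

          /* Time a tight loop of thread-cputime reads */
          clock_gettime(CLOCK_MONOTONIC, &start);
          for (i = 0; i < ITERS; i++)
                  clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
          clock_gettime(CLOCK_MONOTONIC, &end);

          ns = (end.tv_sec - start.tv_sec) * 1000000000LL +
               (end.tv_nsec - start.tv_nsec);
          printf("%lld ns per call\n", ns / ITERS);
          return 0;
  }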

I narrowed the cause down to the addition of
psi_account_irqtime() in update_rq_clock_task(), in commit
52b1364ba0b1 ("sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ
pressure")

So that explains the behavior change, but it also seems odd that
we're doing psi irq accounting from a syscall that is just
trying to read the thread's cputime.
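
Roughly, the call path (as I understand it) is:

  clock_gettime(CLOCK_THREAD_CPUTIME_ID)
    sys_clock_gettime()
      posix_cpu_clock_get()
        cpu_clock_sample()
          task_sched_runtime()
            update_rq_clock()
              update_rq_clock_task()
                psi_account_irqtime()  <- new with 52b1364ba0b1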

Thinking about it more, it seems the re-use of update_rq_clock()
to handle accounting for any in-progress time for the current
task has the potential for side effects and unnecessary work.

So instead, rework the logic so that we calculate the current CPU
runtime in a read-only fashion, as done in task_sched_runtime()
below.

This has the side benefit of improving
clock_gettime(CLOCK_THREAD_CPUTIME_ID,...) performance by ~12%
over the 5.10 behavior, and by ~21% over the 6.1 behavior.

NOTE: I'm not 100% sure this is correct yet. There may be some
edge cases I've overlooked, so I'd greatly appreciate any
review or feedback.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ben Segall <bsegall@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Qais Yousef <qyousef@layalina.io>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: kernel-team@android.com
Signed-off-by: John Stultz <jstultz@google.com>
---
 kernel/sched/core.c | 82 ++++++++++++++++++++++++++-------------------
 1 file changed, 47 insertions(+), 35 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bcf2c4cc0522..b29cde5ded84 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -692,16 +692,11 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
  * RQ-clock updating methods:
  */
 
-static void update_rq_clock_task(struct rq *rq, s64 delta)
-{
-/*
- * In theory, the compile should just see 0 here, and optimize out the call
- * to sched_rt_avg_update. But I don't trust it...
- */
-	s64 __maybe_unused steal = 0, irq_delta = 0;
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
-	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+static inline s64 get_irq_delta(struct rq *rq, s64 delta)
+{
+	s64 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
 
 	/*
 	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
@@ -720,7 +715,45 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 	 */
 	if (irq_delta > delta)
 		irq_delta = delta;
+	return irq_delta;
+}
+#else
+static inline s64 get_irq_delta(struct rq *rq, s64 delta)
+{
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+static inline s64 get_steal_time(struct rq *rq, s64 delta)
+{
+	s64 steal;
 
+	if (!static_key_false(&paravirt_steal_rq_enabled))
+		return 0;
+	steal = paravirt_steal_clock(cpu_of(rq));
+	steal -= rq->prev_steal_time_rq;
+	if (unlikely(steal > delta))
+		steal = delta;
+	return steal;
+}
+#else
+static inline s64 get_steal_time(struct rq *rq, s64 delta)
+{
+	return 0;
+}
+#endif
+
+static void update_rq_clock_task(struct rq *rq, s64 delta)
+{
+/*
+ * In theory, the compile should just see 0 here, and optimize out the call
+ * to sched_rt_avg_update. But I don't trust it...
+ */
+	s64 __maybe_unused steal = 0, irq_delta = 0;
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	irq_delta = get_irq_delta(rq, delta);
 	rq->prev_irq_time += irq_delta;
 	delta -= irq_delta;
 	psi_account_irqtime(rq->curr, irq_delta);
@@ -728,12 +761,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 #endif
 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
 	if (static_key_false((&paravirt_steal_rq_enabled))) {
-		steal = paravirt_steal_clock(cpu_of(rq));
-		steal -= rq->prev_steal_time_rq;
-
-		if (unlikely(steal > delta))
-			steal = delta;
-
+		steal = get_steal_time(rq, delta);
 		rq->prev_steal_time_rq += steal;
 		delta -= steal;
 	}
@@ -5547,23 +5575,6 @@ DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
 EXPORT_PER_CPU_SYMBOL(kstat);
 EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
 
-/*
- * The function fair_sched_class.update_curr accesses the struct curr
- * and its field curr->exec_start; when called from task_sched_runtime(),
- * we observe a high rate of cache misses in practice.
- * Prefetching this data results in improved performance.
- */
-static inline void prefetch_curr_exec_start(struct task_struct *p)
-{
-#ifdef CONFIG_FAIR_GROUP_SCHED
-	struct sched_entity *curr = (&p->se)->cfs_rq->curr;
-#else
-	struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
-#endif
-	prefetch(curr);
-	prefetch(&curr->exec_start);
-}
-
 /*
  * Return accounted runtime for the task.
  * In case the task is currently running, return the runtime plus current's
@@ -5573,6 +5584,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 {
 	struct rq_flags rf;
 	struct rq *rq;
+	s64 delta_exec = 0;
 	u64 ns;
 
 #if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
@@ -5598,11 +5610,11 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 	 * thread, breaking clock_gettime().
 	 */
 	if (task_current(rq, p) && task_on_rq_queued(p)) {
-		prefetch_curr_exec_start(p);
-		update_rq_clock(rq);
-		p->sched_class->update_curr(rq);
+		delta_exec = sched_clock_cpu(cpu_of(rq)) - p->se.exec_start;
+		delta_exec -= get_irq_delta(rq, delta_exec);
+		delta_exec -= get_steal_time(rq, delta_exec);
 	}
-	ns = p->se.sum_exec_runtime;
+	ns = p->se.sum_exec_runtime + delta_exec;
 	task_rq_unlock(rq, p, &rf);
 
 	return ns;
-- 
2.45.2.505.gda0bf45e8d-goog


