All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] sched/cputime: Fix ksoftirqd cputime accounting regression
@ 2017-04-25 14:10 Frederic Weisbecker
  2017-04-26 14:30 ` Jesper Dangaard Brouer
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Frederic Weisbecker @ 2017-04-25 14:10 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: LKML, Frederic Weisbecker, Jesper Dangaard Brouer, Peter Zijlstra,
	Linus Torvalds, Wanpeng Li, Thomas Gleixner, Rik van Riel,
	Stanislaw Gruszka

irq_time_read() returns the irqtime minus the ksoftirqd time. This
is necessary because irq_time_read() is used to substract the IRQ time
from the sum_exec_runtime of a task. If we were to include the softirq
time of ksoftirqd, this task would substract its own CPU time everytime
it updates ksoftirqd->sum_exec_runtime which would therefore never
progress.

But this behaviour got broken by commit a499a5a14db:
	("sched/cputime: Increment kcpustat directly on irqtime account")
which now includes ksoftirqd softirq time in the time returned by
irq_time_read().

This has resulted in wrong ksotfirqd cputime reported to userspace
through /proc/stat and thus "top" not showing ksoftirqd when it should
after intense networking load.

ksoftirqd->stime happens to be correct but it gets scaled down by
sum_exec_runtime through task_cputime_adjusted().

To fix this, just account the strict IRQ time in a separate counter and
use it to report the IRQ time.

Reported-and-tested-by: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/sched/cputime.c | 27 ++++++++++++++++-----------
 kernel/sched/sched.h   |  9 +++++++--
 2 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index f3778e2b..aea3135 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -34,6 +34,18 @@ void disable_sched_clock_irqtime(void)
 	sched_clock_irqtime = 0;
 }
 
+static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
+				  enum cpu_usage_stat idx)
+{
+	u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+	u64_stats_update_begin(&irqtime->sync);
+	cpustat[idx] += delta;
+	irqtime->total += delta;
+	irqtime->tick_delta += delta;
+	u64_stats_update_end(&irqtime->sync);
+}
+
 /*
  * Called before incrementing preempt_count on {soft,}irq_enter
  * and before decrementing preempt_count on {soft,}irq_exit.
@@ -41,7 +53,6 @@ void disable_sched_clock_irqtime(void)
 void irqtime_account_irq(struct task_struct *curr)
 {
 	struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
-	u64 *cpustat = kcpustat_this_cpu->cpustat;
 	s64 delta;
 	int cpu;
 
@@ -52,22 +63,16 @@ void irqtime_account_irq(struct task_struct *curr)
 	delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
 	irqtime->irq_start_time += delta;
 
-	u64_stats_update_begin(&irqtime->sync);
 	/*
 	 * We do not account for softirq time from ksoftirqd here.
 	 * We want to continue accounting softirq time to ksoftirqd thread
 	 * in that case, so as not to confuse scheduler with a special task
 	 * that do not consume any time, but still wants to run.
 	 */
-	if (hardirq_count()) {
-		cpustat[CPUTIME_IRQ] += delta;
-		irqtime->tick_delta += delta;
-	} else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) {
-		cpustat[CPUTIME_SOFTIRQ] += delta;
-		irqtime->tick_delta += delta;
-	}
-
-	u64_stats_update_end(&irqtime->sync);
+	if (hardirq_count())
+		irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
+	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
+		irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
 }
 EXPORT_SYMBOL_GPL(irqtime_account_irq);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5cbf922..767aab3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1869,6 +1869,7 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { }
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 struct irqtime {
+	u64			total;
 	u64			tick_delta;
 	u64			irq_start_time;
 	struct u64_stats_sync	sync;
@@ -1876,16 +1877,20 @@ struct irqtime {
 
 DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
 
+/*
+ * Returns the irqtime minus the softirq time computed by ksoftirqd.
+ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime
+ * and never move forward.
+ */
 static inline u64 irq_time_read(int cpu)
 {
 	struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
-	u64 *cpustat = kcpustat_cpu(cpu).cpustat;
 	unsigned int seq;
 	u64 total;
 
 	do {
 		seq = __u64_stats_fetch_begin(&irqtime->sync);
-		total = cpustat[CPUTIME_SOFTIRQ] + cpustat[CPUTIME_IRQ];
+		total = irqtime->total;
 	} while (__u64_stats_fetch_retry(&irqtime->sync, seq));
 
 	return total;
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2017-04-27 14:22 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2017-04-25 14:10 [PATCH] sched/cputime: Fix ksoftirqd cputime accounting regression Frederic Weisbecker
2017-04-26 14:30 ` Jesper Dangaard Brouer
2017-04-26 15:44 ` Rik van Riel
2017-04-27 14:16 ` [tip:sched/urgent] " tip-bot for Frederic Weisbecker

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.