* [PATCH] sched/cputime: Fix ksoftirqd cputime accounting regression
@ 2017-04-25 14:10 Frederic Weisbecker
2017-04-26 14:30 ` Jesper Dangaard Brouer
` (2 more replies)
0 siblings, 3 replies; 4+ messages in thread
From: Frederic Weisbecker @ 2017-04-25 14:10 UTC (permalink / raw)
To: Ingo Molnar
Cc: LKML, Frederic Weisbecker, Jesper Dangaard Brouer, Peter Zijlstra,
Linus Torvalds, Wanpeng Li, Thomas Gleixner, Rik van Riel,
Stanislaw Gruszka
irq_time_read() returns the irqtime minus the ksoftirqd time. This
is necessary because irq_time_read() is used to substract the IRQ time
from the sum_exec_runtime of a task. If we were to include the softirq
time of ksoftirqd, this task would substract its own CPU time everytime
it updates ksoftirqd->sum_exec_runtime which would therefore never
progress.
But this behaviour got broken by commit a499a5a14db:
("sched/cputime: Increment kcpustat directly on irqtime account")
which now includes ksoftirqd softirq time in the time returned by
irq_time_read().
This has resulted in wrong ksotfirqd cputime reported to userspace
through /proc/stat and thus "top" not showing ksoftirqd when it should
after intense networking load.
ksoftirqd->stime happens to be correct but it gets scaled down by
sum_exec_runtime through task_cputime_adjusted().
To fix this, just account the strict IRQ time in a separate counter and
use it to report the IRQ time.
Reported-and-tested-by: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
kernel/sched/cputime.c | 27 ++++++++++++++++-----------
kernel/sched/sched.h | 9 +++++++--
2 files changed, 23 insertions(+), 13 deletions(-)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index f3778e2b..aea3135 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -34,6 +34,18 @@ void disable_sched_clock_irqtime(void)
sched_clock_irqtime = 0;
}
+static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
+ enum cpu_usage_stat idx)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+ u64_stats_update_begin(&irqtime->sync);
+ cpustat[idx] += delta;
+ irqtime->total += delta;
+ irqtime->tick_delta += delta;
+ u64_stats_update_end(&irqtime->sync);
+}
+
/*
* Called before incrementing preempt_count on {soft,}irq_enter
* and before decrementing preempt_count on {soft,}irq_exit.
@@ -41,7 +53,6 @@ void disable_sched_clock_irqtime(void)
void irqtime_account_irq(struct task_struct *curr)
{
struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
- u64 *cpustat = kcpustat_this_cpu->cpustat;
s64 delta;
int cpu;
@@ -52,22 +63,16 @@ void irqtime_account_irq(struct task_struct *curr)
delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
irqtime->irq_start_time += delta;
- u64_stats_update_begin(&irqtime->sync);
/*
* We do not account for softirq time from ksoftirqd here.
* We want to continue accounting softirq time to ksoftirqd thread
* in that case, so as not to confuse scheduler with a special task
* that do not consume any time, but still wants to run.
*/
- if (hardirq_count()) {
- cpustat[CPUTIME_IRQ] += delta;
- irqtime->tick_delta += delta;
- } else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) {
- cpustat[CPUTIME_SOFTIRQ] += delta;
- irqtime->tick_delta += delta;
- }
-
- u64_stats_update_end(&irqtime->sync);
+ if (hardirq_count())
+ irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
+ else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
+ irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5cbf922..767aab3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1869,6 +1869,7 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { }
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
struct irqtime {
+ u64 total;
u64 tick_delta;
u64 irq_start_time;
struct u64_stats_sync sync;
@@ -1876,16 +1877,20 @@ struct irqtime {
DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
+/*
+ * Returns the irqtime minus the softirq time computed by ksoftirqd.
+ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime
+ * and never move forward.
+ */
static inline u64 irq_time_read(int cpu)
{
struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
- u64 *cpustat = kcpustat_cpu(cpu).cpustat;
unsigned int seq;
u64 total;
do {
seq = __u64_stats_fetch_begin(&irqtime->sync);
- total = cpustat[CPUTIME_SOFTIRQ] + cpustat[CPUTIME_IRQ];
+ total = irqtime->total;
} while (__u64_stats_fetch_retry(&irqtime->sync, seq));
return total;
--
2.7.4
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [PATCH] sched/cputime: Fix ksoftirqd cputime accounting regression
2017-04-25 14:10 [PATCH] sched/cputime: Fix ksoftirqd cputime accounting regression Frederic Weisbecker
@ 2017-04-26 14:30 ` Jesper Dangaard Brouer
2017-04-26 15:44 ` Rik van Riel
2017-04-27 14:16 ` [tip:sched/urgent] " tip-bot for Frederic Weisbecker
2 siblings, 0 replies; 4+ messages in thread
From: Jesper Dangaard Brouer @ 2017-04-26 14:30 UTC (permalink / raw)
To: Frederic Weisbecker
Cc: Ingo Molnar, LKML, Peter Zijlstra, Linus Torvalds, Wanpeng Li,
Thomas Gleixner, Rik van Riel, Stanislaw Gruszka, brouer
On Tue, 25 Apr 2017 16:10:48 +0200
Frederic Weisbecker <fweisbec@gmail.com> wrote:
> irq_time_read() returns the irqtime minus the ksoftirqd time. This
> is necessary because irq_time_read() is used to substract the IRQ time
> from the sum_exec_runtime of a task. If we were to include the softirq
> time of ksoftirqd, this task would substract its own CPU time everytime
> it updates ksoftirqd->sum_exec_runtime which would therefore never
> progress.
>
> But this behaviour got broken by commit a499a5a14db:
> ("sched/cputime: Increment kcpustat directly on irqtime account")
> which now includes ksoftirqd softirq time in the time returned by
> irq_time_read().
>
> This has resulted in wrong ksotfirqd cputime reported to userspace
^^^^^^^^^
Typo in commit message, guess maintainer can fix/amend that.
> through /proc/stat and thus "top" not showing ksoftirqd when it should
> after intense networking load.
>
> ksoftirqd->stime happens to be correct but it gets scaled down by
> sum_exec_runtime through task_cputime_adjusted().
>
> To fix this, just account the strict IRQ time in a separate counter and
> use it to report the IRQ time.
>
> Reported-and-tested-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Confirming I have tested patch before Frederic posted it, and have been
running with this applied on my net-next testlab kernels since Monday.
And it does fix the bisected regression I reported here:
http://lkml.kernel.org/r/20170328101403.34a82fbf@redhat.com
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Rik van Riel <riel@redhat.com>
> Cc: Stanislaw Gruszka <sgruszka@redhat.com>
> Cc: Wanpeng Li <wanpeng.li@hotmail.com>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Ingo Molnar <mingo@kernel.org>
> Cc: Linus Torvalds <torvalds@linux-foundation.org>
Please add:
Fixed: a499a5a14dbd ("sched/cputime: Increment kcpustat directly on irqtime account")
--Jesper
> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
> ---
> kernel/sched/cputime.c | 27 ++++++++++++++++-----------
> kernel/sched/sched.h | 9 +++++++--
> 2 files changed, 23 insertions(+), 13 deletions(-)
>
> diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
> index f3778e2b..aea3135 100644
> --- a/kernel/sched/cputime.c
> +++ b/kernel/sched/cputime.c
> @@ -34,6 +34,18 @@ void disable_sched_clock_irqtime(void)
> sched_clock_irqtime = 0;
> }
>
> +static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
> + enum cpu_usage_stat idx)
> +{
> + u64 *cpustat = kcpustat_this_cpu->cpustat;
> +
> + u64_stats_update_begin(&irqtime->sync);
> + cpustat[idx] += delta;
> + irqtime->total += delta;
> + irqtime->tick_delta += delta;
> + u64_stats_update_end(&irqtime->sync);
> +}
> +
> /*
> * Called before incrementing preempt_count on {soft,}irq_enter
> * and before decrementing preempt_count on {soft,}irq_exit.
> @@ -41,7 +53,6 @@ void disable_sched_clock_irqtime(void)
> void irqtime_account_irq(struct task_struct *curr)
> {
> struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
> - u64 *cpustat = kcpustat_this_cpu->cpustat;
> s64 delta;
> int cpu;
>
> @@ -52,22 +63,16 @@ void irqtime_account_irq(struct task_struct *curr)
> delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
> irqtime->irq_start_time += delta;
>
> - u64_stats_update_begin(&irqtime->sync);
> /*
> * We do not account for softirq time from ksoftirqd here.
> * We want to continue accounting softirq time to ksoftirqd thread
> * in that case, so as not to confuse scheduler with a special task
> * that do not consume any time, but still wants to run.
> */
> - if (hardirq_count()) {
> - cpustat[CPUTIME_IRQ] += delta;
> - irqtime->tick_delta += delta;
> - } else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) {
> - cpustat[CPUTIME_SOFTIRQ] += delta;
> - irqtime->tick_delta += delta;
> - }
> -
> - u64_stats_update_end(&irqtime->sync);
> + if (hardirq_count())
> + irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
> + else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
> + irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
> }
> EXPORT_SYMBOL_GPL(irqtime_account_irq);
>
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 5cbf922..767aab3 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1869,6 +1869,7 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { }
>
> #ifdef CONFIG_IRQ_TIME_ACCOUNTING
> struct irqtime {
> + u64 total;
> u64 tick_delta;
> u64 irq_start_time;
> struct u64_stats_sync sync;
> @@ -1876,16 +1877,20 @@ struct irqtime {
>
> DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
>
> +/*
> + * Returns the irqtime minus the softirq time computed by ksoftirqd.
> + * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime
> + * and never move forward.
> + */
> static inline u64 irq_time_read(int cpu)
> {
> struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
> - u64 *cpustat = kcpustat_cpu(cpu).cpustat;
> unsigned int seq;
> u64 total;
>
> do {
> seq = __u64_stats_fetch_begin(&irqtime->sync);
> - total = cpustat[CPUTIME_SOFTIRQ] + cpustat[CPUTIME_IRQ];
> + total = irqtime->total;
> } while (__u64_stats_fetch_retry(&irqtime->sync, seq));
>
> return total;
--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Principal Kernel Engineer at Red Hat
LinkedIn: http://www.linkedin.com/in/brouer
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH] sched/cputime: Fix ksoftirqd cputime accounting regression
2017-04-25 14:10 [PATCH] sched/cputime: Fix ksoftirqd cputime accounting regression Frederic Weisbecker
2017-04-26 14:30 ` Jesper Dangaard Brouer
@ 2017-04-26 15:44 ` Rik van Riel
2017-04-27 14:16 ` [tip:sched/urgent] " tip-bot for Frederic Weisbecker
2 siblings, 0 replies; 4+ messages in thread
From: Rik van Riel @ 2017-04-26 15:44 UTC (permalink / raw)
To: Frederic Weisbecker, Ingo Molnar
Cc: LKML, Jesper Dangaard Brouer, Peter Zijlstra, Linus Torvalds,
Wanpeng Li, Thomas Gleixner, Stanislaw Gruszka
On Tue, 2017-04-25 at 16:10 +0200, Frederic Weisbecker wrote:
> irq_time_read() returns the irqtime minus the ksoftirqd time. This
> is necessary because irq_time_read() is used to substract the IRQ
> time
> from the sum_exec_runtime of a task. If we were to include the
> softirq
> time of ksoftirqd, this task would substract its own CPU time
> everytime
> it updates ksoftirqd->sum_exec_runtime which would therefore never
> progress.
>
> But this behaviour got broken by commit a499a5a14db:
> ("sched/cputime: Increment kcpustat directly on irqtime
> account")
> which now includes ksoftirqd softirq time in the time returned by
> irq_time_read().
Ahhh, after a look at account_system_time, it becomes
clear why that was happening!
Reviewed-by: Rik van Riel <riel@redhat.com>
^ permalink raw reply [flat|nested] 4+ messages in thread
* [tip:sched/urgent] sched/cputime: Fix ksoftirqd cputime accounting regression
2017-04-25 14:10 [PATCH] sched/cputime: Fix ksoftirqd cputime accounting regression Frederic Weisbecker
2017-04-26 14:30 ` Jesper Dangaard Brouer
2017-04-26 15:44 ` Rik van Riel
@ 2017-04-27 14:16 ` tip-bot for Frederic Weisbecker
2 siblings, 0 replies; 4+ messages in thread
From: tip-bot for Frederic Weisbecker @ 2017-04-27 14:16 UTC (permalink / raw)
To: linux-tip-commits
Cc: sgruszka, wanpeng.li, torvalds, hpa, linux-kernel, riel, fweisbec,
peterz, tglx, brouer, mingo
Commit-ID: 25e2d8c1b9e327ed260edd13169cc22bc7a78bc6
Gitweb: http://git.kernel.org/tip/25e2d8c1b9e327ed260edd13169cc22bc7a78bc6
Author: Frederic Weisbecker <fweisbec@gmail.com>
AuthorDate: Tue, 25 Apr 2017 16:10:48 +0200
Committer: Ingo Molnar <mingo@kernel.org>
CommitDate: Thu, 27 Apr 2017 09:08:26 +0200
sched/cputime: Fix ksoftirqd cputime accounting regression
irq_time_read() returns the irqtime minus the ksoftirqd time. This
is necessary because irq_time_read() is used to substract the IRQ time
from the sum_exec_runtime of a task. If we were to include the softirq
time of ksoftirqd, this task would substract its own CPU time everytime
it updates ksoftirqd->sum_exec_runtime which would therefore never
progress.
But this behaviour got broken by:
a499a5a14db ("sched/cputime: Increment kcpustat directly on irqtime account")
... which now includes ksoftirqd softirq time in the time returned by
irq_time_read().
This has resulted in wrong ksoftirqd cputime reported to userspace
through /proc/stat and thus "top" not showing ksoftirqd when it should
after intense networking load.
ksoftirqd->stime happens to be correct but it gets scaled down by
sum_exec_runtime through task_cputime_adjusted().
To fix this, just account the strict IRQ time in a separate counter and
use it to report the IRQ time.
Reported-and-tested-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Link: http://lkml.kernel.org/r/1493129448-5356-1-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
kernel/sched/cputime.c | 27 ++++++++++++++++-----------
kernel/sched/sched.h | 9 +++++++--
2 files changed, 23 insertions(+), 13 deletions(-)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index f3778e2b..aea3135 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -34,6 +34,18 @@ void disable_sched_clock_irqtime(void)
sched_clock_irqtime = 0;
}
+static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
+ enum cpu_usage_stat idx)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+ u64_stats_update_begin(&irqtime->sync);
+ cpustat[idx] += delta;
+ irqtime->total += delta;
+ irqtime->tick_delta += delta;
+ u64_stats_update_end(&irqtime->sync);
+}
+
/*
* Called before incrementing preempt_count on {soft,}irq_enter
* and before decrementing preempt_count on {soft,}irq_exit.
@@ -41,7 +53,6 @@ void disable_sched_clock_irqtime(void)
void irqtime_account_irq(struct task_struct *curr)
{
struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
- u64 *cpustat = kcpustat_this_cpu->cpustat;
s64 delta;
int cpu;
@@ -52,22 +63,16 @@ void irqtime_account_irq(struct task_struct *curr)
delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
irqtime->irq_start_time += delta;
- u64_stats_update_begin(&irqtime->sync);
/*
* We do not account for softirq time from ksoftirqd here.
* We want to continue accounting softirq time to ksoftirqd thread
* in that case, so as not to confuse scheduler with a special task
* that do not consume any time, but still wants to run.
*/
- if (hardirq_count()) {
- cpustat[CPUTIME_IRQ] += delta;
- irqtime->tick_delta += delta;
- } else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) {
- cpustat[CPUTIME_SOFTIRQ] += delta;
- irqtime->tick_delta += delta;
- }
-
- u64_stats_update_end(&irqtime->sync);
+ if (hardirq_count())
+ irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
+ else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
+ irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5cbf922..767aab3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1869,6 +1869,7 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { }
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
struct irqtime {
+ u64 total;
u64 tick_delta;
u64 irq_start_time;
struct u64_stats_sync sync;
@@ -1876,16 +1877,20 @@ struct irqtime {
DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
+/*
+ * Returns the irqtime minus the softirq time computed by ksoftirqd.
+ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime
+ * and never move forward.
+ */
static inline u64 irq_time_read(int cpu)
{
struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
- u64 *cpustat = kcpustat_cpu(cpu).cpustat;
unsigned int seq;
u64 total;
do {
seq = __u64_stats_fetch_begin(&irqtime->sync);
- total = cpustat[CPUTIME_SOFTIRQ] + cpustat[CPUTIME_IRQ];
+ total = irqtime->total;
} while (__u64_stats_fetch_retry(&irqtime->sync, seq));
return total;
^ permalink raw reply related [flat|nested] 4+ messages in thread
end of thread, other threads:[~2017-04-27 14:22 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2017-04-25 14:10 [PATCH] sched/cputime: Fix ksoftirqd cputime accounting regression Frederic Weisbecker
2017-04-26 14:30 ` Jesper Dangaard Brouer
2017-04-26 15:44 ` Rik van Riel
2017-04-27 14:16 ` [tip:sched/urgent] " tip-bot for Frederic Weisbecker
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.