* [PATCH v8 1/4] sched: Define sched_clock_irqtime as static key
2025-01-03 2:24 [PATCH v8 resend 0/4] sched: Fix missing irq time when CONFIG_IRQ_TIME_ACCOUNTING is enabled Yafang Shao
@ 2025-01-03 2:24 ` Yafang Shao
2025-01-03 13:32 ` Vincent Guittot
2025-01-15 9:17 ` [tip: sched/core] " tip-bot2 for Yafang Shao
2025-01-03 2:24 ` [PATCH v8 2/4] sched: Don't account irq time if sched_clock_irqtime is disabled Yafang Shao
` (2 subsequent siblings)
3 siblings, 2 replies; 16+ messages in thread
From: Yafang Shao @ 2025-01-03 2:24 UTC (permalink / raw)
To: peterz, mingo
Cc: mkoutny, hannes, juri.lelli, vincent.guittot, dietmar.eggemann,
rostedt, bsegall, mgorman, vschneid, surenb, linux-kernel,
cgroups, lkp, Yafang Shao
Since CPU time accounting is a performance-critical path, let's define
sched_clock_irqtime as a static key to minimize potential overhead.
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Reviewed-by: Michal Koutný <mkoutny@suse.com>
---
kernel/sched/cputime.c | 16 +++++++---------
kernel/sched/sched.h | 13 +++++++++++++
2 files changed, 20 insertions(+), 9 deletions(-)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 0bed0fa1acd9..5d9143dd0879 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -9,6 +9,8 @@
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+DEFINE_STATIC_KEY_FALSE(sched_clock_irqtime);
+
/*
* There are no locks covering percpu hardirq/softirq time.
* They are only modified in vtime_account, on corresponding CPU
@@ -22,16 +24,14 @@
*/
DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
-static int sched_clock_irqtime;
-
void enable_sched_clock_irqtime(void)
{
- sched_clock_irqtime = 1;
+ static_branch_enable(&sched_clock_irqtime);
}
void disable_sched_clock_irqtime(void)
{
- sched_clock_irqtime = 0;
+ static_branch_disable(&sched_clock_irqtime);
}
static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
@@ -57,7 +57,7 @@ void irqtime_account_irq(struct task_struct *curr, unsigned int offset)
s64 delta;
int cpu;
- if (!sched_clock_irqtime)
+ if (!irqtime_enabled())
return;
cpu = smp_processor_id();
@@ -90,8 +90,6 @@ static u64 irqtime_tick_accounted(u64 maxtime)
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
-#define sched_clock_irqtime (0)
-
static u64 irqtime_tick_accounted(u64 dummy)
{
return 0;
@@ -478,7 +476,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
if (vtime_accounting_enabled_this_cpu())
return;
- if (sched_clock_irqtime) {
+ if (irqtime_enabled()) {
irqtime_account_process_tick(p, user_tick, 1);
return;
}
@@ -507,7 +505,7 @@ void account_idle_ticks(unsigned long ticks)
{
u64 cputime, steal;
- if (sched_clock_irqtime) {
+ if (irqtime_enabled()) {
irqtime_account_idle_ticks(ticks);
return;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index aef716c41edb..7e8c73110884 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3233,6 +3233,12 @@ struct irqtime {
};
DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
+DECLARE_STATIC_KEY_FALSE(sched_clock_irqtime);
+
+static inline int irqtime_enabled(void)
+{
+ return static_branch_likely(&sched_clock_irqtime);
+}
/*
* Returns the irqtime minus the softirq time computed by ksoftirqd.
@@ -3253,6 +3259,13 @@ static inline u64 irq_time_read(int cpu)
return total;
}
+#else
+
+static inline int irqtime_enabled(void)
+{
+ return 0;
+}
+
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
#ifdef CONFIG_CPU_FREQ
--
2.43.5
^ permalink raw reply related [flat|nested] 16+ messages in thread* Re: [PATCH v8 1/4] sched: Define sched_clock_irqtime as static key
2025-01-03 2:24 ` [PATCH v8 1/4] sched: Define sched_clock_irqtime as static key Yafang Shao
@ 2025-01-03 13:32 ` Vincent Guittot
2025-01-15 9:17 ` [tip: sched/core] " tip-bot2 for Yafang Shao
1 sibling, 0 replies; 16+ messages in thread
From: Vincent Guittot @ 2025-01-03 13:32 UTC (permalink / raw)
To: Yafang Shao
Cc: peterz, mingo, mkoutny, hannes, juri.lelli, dietmar.eggemann,
rostedt, bsegall, mgorman, vschneid, surenb, linux-kernel,
cgroups, lkp
On Fri, 3 Jan 2025 at 03:24, Yafang Shao <laoar.shao@gmail.com> wrote:
>
> Since CPU time accounting is a performance-critical path, let's define
> sched_clock_irqtime as a static key to minimize potential overhead.
>
> Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> Reviewed-by: Michal Koutný <mkoutny@suse.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
> ---
> kernel/sched/cputime.c | 16 +++++++---------
> kernel/sched/sched.h | 13 +++++++++++++
> 2 files changed, 20 insertions(+), 9 deletions(-)
>
> diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
> index 0bed0fa1acd9..5d9143dd0879 100644
> --- a/kernel/sched/cputime.c
> +++ b/kernel/sched/cputime.c
> @@ -9,6 +9,8 @@
>
> #ifdef CONFIG_IRQ_TIME_ACCOUNTING
>
> +DEFINE_STATIC_KEY_FALSE(sched_clock_irqtime);
> +
> /*
> * There are no locks covering percpu hardirq/softirq time.
> * They are only modified in vtime_account, on corresponding CPU
> @@ -22,16 +24,14 @@
> */
> DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
>
> -static int sched_clock_irqtime;
> -
> void enable_sched_clock_irqtime(void)
> {
> - sched_clock_irqtime = 1;
> + static_branch_enable(&sched_clock_irqtime);
> }
>
> void disable_sched_clock_irqtime(void)
> {
> - sched_clock_irqtime = 0;
> + static_branch_disable(&sched_clock_irqtime);
> }
>
> static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
> @@ -57,7 +57,7 @@ void irqtime_account_irq(struct task_struct *curr, unsigned int offset)
> s64 delta;
> int cpu;
>
> - if (!sched_clock_irqtime)
> + if (!irqtime_enabled())
> return;
>
> cpu = smp_processor_id();
> @@ -90,8 +90,6 @@ static u64 irqtime_tick_accounted(u64 maxtime)
>
> #else /* CONFIG_IRQ_TIME_ACCOUNTING */
>
> -#define sched_clock_irqtime (0)
> -
> static u64 irqtime_tick_accounted(u64 dummy)
> {
> return 0;
> @@ -478,7 +476,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
> if (vtime_accounting_enabled_this_cpu())
> return;
>
> - if (sched_clock_irqtime) {
> + if (irqtime_enabled()) {
> irqtime_account_process_tick(p, user_tick, 1);
> return;
> }
> @@ -507,7 +505,7 @@ void account_idle_ticks(unsigned long ticks)
> {
> u64 cputime, steal;
>
> - if (sched_clock_irqtime) {
> + if (irqtime_enabled()) {
> irqtime_account_idle_ticks(ticks);
> return;
> }
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index aef716c41edb..7e8c73110884 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -3233,6 +3233,12 @@ struct irqtime {
> };
>
> DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
> +DECLARE_STATIC_KEY_FALSE(sched_clock_irqtime);
> +
> +static inline int irqtime_enabled(void)
> +{
> + return static_branch_likely(&sched_clock_irqtime);
> +}
>
> /*
> * Returns the irqtime minus the softirq time computed by ksoftirqd.
> @@ -3253,6 +3259,13 @@ static inline u64 irq_time_read(int cpu)
> return total;
> }
>
> +#else
> +
> +static inline int irqtime_enabled(void)
> +{
> + return 0;
> +}
> +
> #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
>
> #ifdef CONFIG_CPU_FREQ
> --
> 2.43.5
>
^ permalink raw reply [flat|nested] 16+ messages in thread* [tip: sched/core] sched: Define sched_clock_irqtime as static key
2025-01-03 2:24 ` [PATCH v8 1/4] sched: Define sched_clock_irqtime as static key Yafang Shao
2025-01-03 13:32 ` Vincent Guittot
@ 2025-01-15 9:17 ` tip-bot2 for Yafang Shao
1 sibling, 0 replies; 16+ messages in thread
From: tip-bot2 for Yafang Shao @ 2025-01-15 9:17 UTC (permalink / raw)
To: linux-tip-commits
Cc: Yafang Shao, Peter Zijlstra (Intel), mkoutny, Vincent Guittot,
x86, linux-kernel
The following commit has been merged into the sched/core branch of tip:
Commit-ID: 8722903cbb8f0d51057fbf9ef1c680756b74119e
Gitweb: https://git.kernel.org/tip/8722903cbb8f0d51057fbf9ef1c680756b74119e
Author: Yafang Shao <laoar.shao@gmail.com>
AuthorDate: Fri, 03 Jan 2025 10:24:06 +08:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Mon, 13 Jan 2025 14:10:25 +01:00
sched: Define sched_clock_irqtime as static key
Since CPU time accounting is a performance-critical path, let's define
sched_clock_irqtime as a static key to minimize potential overhead.
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Michal Koutný <mkoutny@suse.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lore.kernel.org/r/20250103022409.2544-2-laoar.shao@gmail.com
---
kernel/sched/cputime.c | 16 +++++++---------
kernel/sched/sched.h | 13 +++++++++++++
2 files changed, 20 insertions(+), 9 deletions(-)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 0bed0fa..5d9143d 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -9,6 +9,8 @@
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+DEFINE_STATIC_KEY_FALSE(sched_clock_irqtime);
+
/*
* There are no locks covering percpu hardirq/softirq time.
* They are only modified in vtime_account, on corresponding CPU
@@ -22,16 +24,14 @@
*/
DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
-static int sched_clock_irqtime;
-
void enable_sched_clock_irqtime(void)
{
- sched_clock_irqtime = 1;
+ static_branch_enable(&sched_clock_irqtime);
}
void disable_sched_clock_irqtime(void)
{
- sched_clock_irqtime = 0;
+ static_branch_disable(&sched_clock_irqtime);
}
static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
@@ -57,7 +57,7 @@ void irqtime_account_irq(struct task_struct *curr, unsigned int offset)
s64 delta;
int cpu;
- if (!sched_clock_irqtime)
+ if (!irqtime_enabled())
return;
cpu = smp_processor_id();
@@ -90,8 +90,6 @@ static u64 irqtime_tick_accounted(u64 maxtime)
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
-#define sched_clock_irqtime (0)
-
static u64 irqtime_tick_accounted(u64 dummy)
{
return 0;
@@ -478,7 +476,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
if (vtime_accounting_enabled_this_cpu())
return;
- if (sched_clock_irqtime) {
+ if (irqtime_enabled()) {
irqtime_account_process_tick(p, user_tick, 1);
return;
}
@@ -507,7 +505,7 @@ void account_idle_ticks(unsigned long ticks)
{
u64 cputime, steal;
- if (sched_clock_irqtime) {
+ if (irqtime_enabled()) {
irqtime_account_idle_ticks(ticks);
return;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 300db6f..3da237c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3233,6 +3233,12 @@ struct irqtime {
};
DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
+DECLARE_STATIC_KEY_FALSE(sched_clock_irqtime);
+
+static inline int irqtime_enabled(void)
+{
+ return static_branch_likely(&sched_clock_irqtime);
+}
/*
* Returns the irqtime minus the softirq time computed by ksoftirqd.
@@ -3253,6 +3259,13 @@ static inline u64 irq_time_read(int cpu)
return total;
}
+#else
+
+static inline int irqtime_enabled(void)
+{
+ return 0;
+}
+
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
#ifdef CONFIG_CPU_FREQ
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH v8 2/4] sched: Don't account irq time if sched_clock_irqtime is disabled
2025-01-03 2:24 [PATCH v8 resend 0/4] sched: Fix missing irq time when CONFIG_IRQ_TIME_ACCOUNTING is enabled Yafang Shao
2025-01-03 2:24 ` [PATCH v8 1/4] sched: Define sched_clock_irqtime as static key Yafang Shao
@ 2025-01-03 2:24 ` Yafang Shao
2025-01-03 13:32 ` Vincent Guittot
2025-01-15 9:17 ` [tip: sched/core] " tip-bot2 for Yafang Shao
2025-01-03 2:24 ` [PATCH v8 3/4] sched, psi: " Yafang Shao
2025-01-03 2:24 ` [PATCH v8 4/4] sched: Fix cgroup irq time for CONFIG_IRQ_TIME_ACCOUNTING Yafang Shao
3 siblings, 2 replies; 16+ messages in thread
From: Yafang Shao @ 2025-01-03 2:24 UTC (permalink / raw)
To: peterz, mingo
Cc: mkoutny, hannes, juri.lelli, vincent.guittot, dietmar.eggemann,
rostedt, bsegall, mgorman, vschneid, surenb, linux-kernel,
cgroups, lkp, Yafang Shao
sched_clock_irqtime may be disabled due to the clock source, in which case
IRQ time should not be accounted. Let's add a conditional check to avoid
unnecessary logic.
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Reviewed-by: Michal Koutný <mkoutny@suse.com>
---
kernel/sched/core.c | 44 +++++++++++++++++++++++---------------------
1 file changed, 23 insertions(+), 21 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 84902936a620..22dfcd3e92ed 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -740,29 +740,31 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
s64 __maybe_unused steal = 0, irq_delta = 0;
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
- irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+ if (irqtime_enabled()) {
+ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
- /*
- * Since irq_time is only updated on {soft,}irq_exit, we might run into
- * this case when a previous update_rq_clock() happened inside a
- * {soft,}IRQ region.
- *
- * When this happens, we stop ->clock_task and only update the
- * prev_irq_time stamp to account for the part that fit, so that a next
- * update will consume the rest. This ensures ->clock_task is
- * monotonic.
- *
- * It does however cause some slight miss-attribution of {soft,}IRQ
- * time, a more accurate solution would be to update the irq_time using
- * the current rq->clock timestamp, except that would require using
- * atomic ops.
- */
- if (irq_delta > delta)
- irq_delta = delta;
+ /*
+ * Since irq_time is only updated on {soft,}irq_exit, we might run into
+ * this case when a previous update_rq_clock() happened inside a
+ * {soft,}IRQ region.
+ *
+ * When this happens, we stop ->clock_task and only update the
+ * prev_irq_time stamp to account for the part that fit, so that a next
+ * update will consume the rest. This ensures ->clock_task is
+ * monotonic.
+ *
+ * It does however cause some slight miss-attribution of {soft,}IRQ
+ * time, a more accurate solution would be to update the irq_time using
+ * the current rq->clock timestamp, except that would require using
+ * atomic ops.
+ */
+ if (irq_delta > delta)
+ irq_delta = delta;
- rq->prev_irq_time += irq_delta;
- delta -= irq_delta;
- delayacct_irq(rq->curr, irq_delta);
+ rq->prev_irq_time += irq_delta;
+ delta -= irq_delta;
+ delayacct_irq(rq->curr, irq_delta);
+ }
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
if (static_key_false((¶virt_steal_rq_enabled))) {
--
2.43.5
^ permalink raw reply related [flat|nested] 16+ messages in thread* Re: [PATCH v8 2/4] sched: Don't account irq time if sched_clock_irqtime is disabled
2025-01-03 2:24 ` [PATCH v8 2/4] sched: Don't account irq time if sched_clock_irqtime is disabled Yafang Shao
@ 2025-01-03 13:32 ` Vincent Guittot
2025-01-15 9:17 ` [tip: sched/core] " tip-bot2 for Yafang Shao
1 sibling, 0 replies; 16+ messages in thread
From: Vincent Guittot @ 2025-01-03 13:32 UTC (permalink / raw)
To: Yafang Shao
Cc: peterz, mingo, mkoutny, hannes, juri.lelli, dietmar.eggemann,
rostedt, bsegall, mgorman, vschneid, surenb, linux-kernel,
cgroups, lkp
On Fri, 3 Jan 2025 at 03:24, Yafang Shao <laoar.shao@gmail.com> wrote:
>
> sched_clock_irqtime may be disabled due to the clock source, in which case
> IRQ time should not be accounted. Let's add a conditional check to avoid
> unnecessary logic.
>
> Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> Reviewed-by: Michal Koutný <mkoutny@suse.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
> ---
> kernel/sched/core.c | 44 +++++++++++++++++++++++---------------------
> 1 file changed, 23 insertions(+), 21 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 84902936a620..22dfcd3e92ed 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -740,29 +740,31 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
> s64 __maybe_unused steal = 0, irq_delta = 0;
>
> #ifdef CONFIG_IRQ_TIME_ACCOUNTING
> - irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
> + if (irqtime_enabled()) {
> + irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
>
> - /*
> - * Since irq_time is only updated on {soft,}irq_exit, we might run into
> - * this case when a previous update_rq_clock() happened inside a
> - * {soft,}IRQ region.
> - *
> - * When this happens, we stop ->clock_task and only update the
> - * prev_irq_time stamp to account for the part that fit, so that a next
> - * update will consume the rest. This ensures ->clock_task is
> - * monotonic.
> - *
> - * It does however cause some slight miss-attribution of {soft,}IRQ
> - * time, a more accurate solution would be to update the irq_time using
> - * the current rq->clock timestamp, except that would require using
> - * atomic ops.
> - */
> - if (irq_delta > delta)
> - irq_delta = delta;
> + /*
> + * Since irq_time is only updated on {soft,}irq_exit, we might run into
> + * this case when a previous update_rq_clock() happened inside a
> + * {soft,}IRQ region.
> + *
> + * When this happens, we stop ->clock_task and only update the
> + * prev_irq_time stamp to account for the part that fit, so that a next
> + * update will consume the rest. This ensures ->clock_task is
> + * monotonic.
> + *
> + * It does however cause some slight miss-attribution of {soft,}IRQ
> + * time, a more accurate solution would be to update the irq_time using
> + * the current rq->clock timestamp, except that would require using
> + * atomic ops.
> + */
> + if (irq_delta > delta)
> + irq_delta = delta;
>
> - rq->prev_irq_time += irq_delta;
> - delta -= irq_delta;
> - delayacct_irq(rq->curr, irq_delta);
> + rq->prev_irq_time += irq_delta;
> + delta -= irq_delta;
> + delayacct_irq(rq->curr, irq_delta);
> + }
> #endif
> #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
> if (static_key_false((¶virt_steal_rq_enabled))) {
> --
> 2.43.5
>
^ permalink raw reply [flat|nested] 16+ messages in thread* [tip: sched/core] sched: Don't account irq time if sched_clock_irqtime is disabled
2025-01-03 2:24 ` [PATCH v8 2/4] sched: Don't account irq time if sched_clock_irqtime is disabled Yafang Shao
2025-01-03 13:32 ` Vincent Guittot
@ 2025-01-15 9:17 ` tip-bot2 for Yafang Shao
1 sibling, 0 replies; 16+ messages in thread
From: tip-bot2 for Yafang Shao @ 2025-01-15 9:17 UTC (permalink / raw)
To: linux-tip-commits
Cc: Yafang Shao, Peter Zijlstra (Intel), mkoutny, Vincent Guittot,
x86, linux-kernel
The following commit has been merged into the sched/core branch of tip:
Commit-ID: 763a744e24a8cfbcc13f699dcdae13a627b8588e
Gitweb: https://git.kernel.org/tip/763a744e24a8cfbcc13f699dcdae13a627b8588e
Author: Yafang Shao <laoar.shao@gmail.com>
AuthorDate: Fri, 03 Jan 2025 10:24:07 +08:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Mon, 13 Jan 2025 14:10:25 +01:00
sched: Don't account irq time if sched_clock_irqtime is disabled
sched_clock_irqtime may be disabled due to the clock source, in which case
IRQ time should not be accounted. Let's add a conditional check to avoid
unnecessary logic.
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Michal Koutný <mkoutny@suse.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lore.kernel.org/r/20250103022409.2544-3-laoar.shao@gmail.com
---
kernel/sched/core.c | 44 +++++++++++++++++++++++---------------------
1 file changed, 23 insertions(+), 21 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8490293..22dfcd3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -740,29 +740,31 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
s64 __maybe_unused steal = 0, irq_delta = 0;
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
- irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+ if (irqtime_enabled()) {
+ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
- /*
- * Since irq_time is only updated on {soft,}irq_exit, we might run into
- * this case when a previous update_rq_clock() happened inside a
- * {soft,}IRQ region.
- *
- * When this happens, we stop ->clock_task and only update the
- * prev_irq_time stamp to account for the part that fit, so that a next
- * update will consume the rest. This ensures ->clock_task is
- * monotonic.
- *
- * It does however cause some slight miss-attribution of {soft,}IRQ
- * time, a more accurate solution would be to update the irq_time using
- * the current rq->clock timestamp, except that would require using
- * atomic ops.
- */
- if (irq_delta > delta)
- irq_delta = delta;
+ /*
+ * Since irq_time is only updated on {soft,}irq_exit, we might run into
+ * this case when a previous update_rq_clock() happened inside a
+ * {soft,}IRQ region.
+ *
+ * When this happens, we stop ->clock_task and only update the
+ * prev_irq_time stamp to account for the part that fit, so that a next
+ * update will consume the rest. This ensures ->clock_task is
+ * monotonic.
+ *
+ * It does however cause some slight miss-attribution of {soft,}IRQ
+ * time, a more accurate solution would be to update the irq_time using
+ * the current rq->clock timestamp, except that would require using
+ * atomic ops.
+ */
+ if (irq_delta > delta)
+ irq_delta = delta;
- rq->prev_irq_time += irq_delta;
- delta -= irq_delta;
- delayacct_irq(rq->curr, irq_delta);
+ rq->prev_irq_time += irq_delta;
+ delta -= irq_delta;
+ delayacct_irq(rq->curr, irq_delta);
+ }
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
if (static_key_false((¶virt_steal_rq_enabled))) {
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH v8 3/4] sched, psi: Don't account irq time if sched_clock_irqtime is disabled
2025-01-03 2:24 [PATCH v8 resend 0/4] sched: Fix missing irq time when CONFIG_IRQ_TIME_ACCOUNTING is enabled Yafang Shao
2025-01-03 2:24 ` [PATCH v8 1/4] sched: Define sched_clock_irqtime as static key Yafang Shao
2025-01-03 2:24 ` [PATCH v8 2/4] sched: Don't account irq time if sched_clock_irqtime is disabled Yafang Shao
@ 2025-01-03 2:24 ` Yafang Shao
2025-01-15 9:17 ` [tip: sched/core] " tip-bot2 for Yafang Shao
2025-01-03 2:24 ` [PATCH v8 4/4] sched: Fix cgroup irq time for CONFIG_IRQ_TIME_ACCOUNTING Yafang Shao
3 siblings, 1 reply; 16+ messages in thread
From: Yafang Shao @ 2025-01-03 2:24 UTC (permalink / raw)
To: peterz, mingo
Cc: mkoutny, hannes, juri.lelli, vincent.guittot, dietmar.eggemann,
rostedt, bsegall, mgorman, vschneid, surenb, linux-kernel,
cgroups, lkp, Yafang Shao
sched_clock_irqtime may be disabled due to the clock source. When disabled,
irq_time_read() won't change over time, so there is nothing to account. We
can save iterating the whole hierarchy on every tick and context switch.
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
---
kernel/sched/psi.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 84dad1511d1e..bb56805e3d47 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -998,7 +998,7 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st
s64 delta;
u64 irq;
- if (static_branch_likely(&psi_disabled))
+ if (static_branch_likely(&psi_disabled) || !irqtime_enabled())
return;
if (!curr->pid)
@@ -1240,6 +1240,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
if (static_branch_likely(&psi_disabled))
return -EOPNOTSUPP;
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ if (!irqtime_enabled() && res == PSI_IRQ)
+ return -EOPNOTSUPP;
+#endif
+
/* Update averages before reporting them */
mutex_lock(&group->avgs_lock);
now = sched_clock();
--
2.43.5
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [tip: sched/core] sched, psi: Don't account irq time if sched_clock_irqtime is disabled
2025-01-03 2:24 ` [PATCH v8 3/4] sched, psi: " Yafang Shao
@ 2025-01-15 9:17 ` tip-bot2 for Yafang Shao
0 siblings, 0 replies; 16+ messages in thread
From: tip-bot2 for Yafang Shao @ 2025-01-15 9:17 UTC (permalink / raw)
To: linux-tip-commits
Cc: Yafang Shao, Peter Zijlstra (Intel), mkoutny, Johannes Weiner,
x86, linux-kernel
The following commit has been merged into the sched/core branch of tip:
Commit-ID: a6fd16148fdd7e9da143037106956a3447c247a8
Gitweb: https://git.kernel.org/tip/a6fd16148fdd7e9da143037106956a3447c247a8
Author: Yafang Shao <laoar.shao@gmail.com>
AuthorDate: Fri, 03 Jan 2025 10:24:08 +08:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Mon, 13 Jan 2025 14:10:26 +01:00
sched, psi: Don't account irq time if sched_clock_irqtime is disabled
sched_clock_irqtime may be disabled due to the clock source. When disabled,
irq_time_read() won't change over time, so there is nothing to account. We
can save iterating the whole hierarchy on every tick and context switch.
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Michal Koutný <mkoutny@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Link: https://lore.kernel.org/r/20250103022409.2544-4-laoar.shao@gmail.com
---
kernel/sched/psi.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 84dad15..bb56805 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -998,7 +998,7 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st
s64 delta;
u64 irq;
- if (static_branch_likely(&psi_disabled))
+ if (static_branch_likely(&psi_disabled) || !irqtime_enabled())
return;
if (!curr->pid)
@@ -1240,6 +1240,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
if (static_branch_likely(&psi_disabled))
return -EOPNOTSUPP;
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ if (!irqtime_enabled() && res == PSI_IRQ)
+ return -EOPNOTSUPP;
+#endif
+
/* Update averages before reporting them */
mutex_lock(&group->avgs_lock);
now = sched_clock();
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH v8 4/4] sched: Fix cgroup irq time for CONFIG_IRQ_TIME_ACCOUNTING
2025-01-03 2:24 [PATCH v8 resend 0/4] sched: Fix missing irq time when CONFIG_IRQ_TIME_ACCOUNTING is enabled Yafang Shao
` (2 preceding siblings ...)
2025-01-03 2:24 ` [PATCH v8 3/4] sched, psi: " Yafang Shao
@ 2025-01-03 2:24 ` Yafang Shao
2025-01-09 10:46 ` Michal Koutný
2025-01-10 3:04 ` kernel test robot
3 siblings, 2 replies; 16+ messages in thread
From: Yafang Shao @ 2025-01-03 2:24 UTC (permalink / raw)
To: peterz, mingo
Cc: mkoutny, hannes, juri.lelli, vincent.guittot, dietmar.eggemann,
rostedt, bsegall, mgorman, vschneid, surenb, linux-kernel,
cgroups, lkp, Yafang Shao
After enabling CONFIG_IRQ_TIME_ACCOUNTING to monitor IRQ pressure in our
container environment, we observed several noticeable behavioral changes.
One of our IRQ-heavy services, such as Redis, reported a significant
reduction in CPU usage after upgrading to the new kernel with
CONFIG_IRQ_TIME_ACCOUNTING enabled. However, despite adding more threads
to handle an increased workload, the CPU usage could not be raised. In
other words, even though the container’s CPU usage appeared low, it was
unable to process more workloads to utilize additional CPU resources, which
caused issues.
This behavior can be demonstrated using netperf:
function start_server() {
for j in `seq 1 3`; do
netserver -p $[12345+j] > /dev/null &
done
}
server_ip=$1
function start_client() {
# That applies to cgroup2 as well.
mkdir -p /sys/fs/cgroup/cpuacct/test
echo $$ > /sys/fs/cgroup/cpuacct/test/cgroup.procs
for j in `seq 1 3`; do
port=$[12345+j]
taskset -c 0 netperf -H ${server_ip} -l ${run_time:-30000} \
-t TCP_STREAM -p $port -- -D -m 1k -M 1K -s 8k -S 8k \
> /dev/null &
done
}
start_server
start_client
We can verify the CPU usage of the test cgroup using cpuacct.stat. The
output shows:
system: 53
user: 2
The CPU usage of the cgroup is relatively low at around 55%, but this usage
doesn't increase, even with more netperf tasks. The reason is that CPU0 is
at 100% utilization, as confirmed by mpstat:
02:56:22 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle
02:56:23 PM 0 0.99 0.00 55.45 0.00 0.99 42.57 0.00 0.00 0.00 0.00
02:56:23 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle
02:56:24 PM 0 2.00 0.00 55.00 0.00 0.00 43.00 0.00 0.00 0.00 0.00
It is clear that the %soft is excluded in the cgroup of the interrupted
task. This behavior is unexpected. We should include IRQ time in the
cgroup to reflect the pressure the group is under.
After a thorough analysis, I discovered that this change in behavior is due
to commit 305e6835e055 ("sched: Do not account irq time to current task"),
which altered whether IRQ time should be charged to the interrupted task.
While I agree that a task should not be penalized by random interrupts, the
task itself cannot progress while interrupted. Therefore, the interrupted
time should be reported to the user.
The system metric in cpuacct.stat is crucial in indicating whether a
container is under heavy system pressure, including IRQ/softirq activity.
Hence, IRQ/softirq time should be included in the cpuacct system usage,
which also applies to cgroup2’s rstat.
The reason it doesn't just add the cgroup_account_*() to
irqtime_account_irq() is that it might result in performance hit to hold
the rq_lock in the critical path. Taking inspiration from
commit ddae0ca2a8fe ("sched: Move psi_account_irqtime() out of
update_rq_clock_task() hotpath"), I've now adapted the approach to handle
it in a non-critical path, reducing the performance impact.
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Koutný <mkoutny@suse.com>
---
kernel/sched/core.c | 33 +++++++++++++++++++++++++++++++--
kernel/sched/psi.c | 13 +++----------
kernel/sched/sched.h | 2 +-
kernel/sched/stats.h | 7 ++++---
4 files changed, 39 insertions(+), 16 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 22dfcd3e92ed..7faacf320af9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5622,6 +5622,35 @@ __setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
static inline u64 cpu_resched_latency(struct rq *rq) { return 0; }
#endif /* CONFIG_SCHED_DEBUG */
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static void account_irqtime(struct rq *rq, struct task_struct *curr,
+ struct task_struct *prev)
+{
+ int cpu = smp_processor_id();
+ s64 delta;
+ u64 irq;
+
+ if (!irqtime_enabled())
+ return;
+
+ irq = irq_time_read(cpu);
+ delta = (s64)(irq - rq->irq_time);
+ if (delta < 0)
+ return;
+
+ rq->irq_time = irq;
+ psi_account_irqtime(rq, curr, prev, delta);
+ cgroup_account_cputime(curr, delta);
+ /* We account both softirq and irq into CPUTIME_IRQ */
+ cgroup_account_cputime_field(curr, CPUTIME_IRQ, delta);
+}
+#else
+static inline void account_irqtime(struct rq *rq, struct task_struct *curr,
+ struct task_struct *prev)
+{
+}
+#endif
+
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
@@ -5644,7 +5673,7 @@ void sched_tick(void)
rq_lock(rq, &rf);
donor = rq->donor;
- psi_account_irqtime(rq, donor, NULL);
+ account_irqtime(rq, donor, NULL);
update_rq_clock(rq);
hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
@@ -6751,7 +6780,7 @@ static void __sched notrace __schedule(int sched_mode)
++*switch_count;
migrate_disable_switch(rq, prev);
- psi_account_irqtime(rq, prev, next);
+ account_irqtime(rq, prev, next);
psi_sched_switch(prev, next, block);
trace_sched_switch(preempt, prev, next, prev_state);
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index bb56805e3d47..6fa3233fcba9 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -990,15 +990,14 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
}
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev)
+void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev,
+ s64 delta)
{
int cpu = task_cpu(curr);
struct psi_group *group;
struct psi_group_cpu *groupc;
- s64 delta;
- u64 irq;
- if (static_branch_likely(&psi_disabled) || !irqtime_enabled())
+ if (static_branch_likely(&psi_disabled))
return;
if (!curr->pid)
@@ -1009,12 +1008,6 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st
if (prev && task_psi_group(prev) == group)
return;
- irq = irq_time_read(cpu);
- delta = (s64)(irq - rq->psi_irq_time);
- if (delta < 0)
- return;
- rq->psi_irq_time = irq;
-
do {
u64 now;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7e8c73110884..570cc19a0060 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1227,7 +1227,7 @@ struct rq {
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
- u64 psi_irq_time;
+ u64 irq_time;
#endif
#ifdef CONFIG_PARAVIRT
u64 prev_steal_time;
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 8ee0add5a48a..b9ed9fea5ab7 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -111,10 +111,11 @@ void psi_task_change(struct task_struct *task, int clear, int set);
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
bool sleep);
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev);
+void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
+ struct task_struct *prev, s64 delta);
#else
static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
- struct task_struct *prev) {}
+ struct task_struct *prev, s64 delta) {}
#endif /*CONFIG_IRQ_TIME_ACCOUNTING */
/*
* PSI tracks state that persists across sleeps, such as iowaits and
@@ -224,7 +225,7 @@ static inline void psi_sched_switch(struct task_struct *prev,
struct task_struct *next,
bool sleep) {}
static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
- struct task_struct *prev) {}
+ struct task_struct *prev, s64 delta) {}
#endif /* CONFIG_PSI */
#ifdef CONFIG_SCHED_INFO
--
2.43.5
^ permalink raw reply related [flat|nested] 16+ messages in thread* Re: [PATCH v8 4/4] sched: Fix cgroup irq time for CONFIG_IRQ_TIME_ACCOUNTING
2025-01-03 2:24 ` [PATCH v8 4/4] sched: Fix cgroup irq time for CONFIG_IRQ_TIME_ACCOUNTING Yafang Shao
@ 2025-01-09 10:46 ` Michal Koutný
2025-01-09 13:46 ` Yafang Shao
2025-01-10 3:04 ` kernel test robot
1 sibling, 1 reply; 16+ messages in thread
From: Michal Koutný @ 2025-01-09 10:46 UTC (permalink / raw)
To: Yafang Shao
Cc: peterz, mingo, hannes, juri.lelli, vincent.guittot,
dietmar.eggemann, rostedt, bsegall, mgorman, vschneid, surenb,
linux-kernel, cgroups, lkp
[-- Attachment #1: Type: text/plain, Size: 2706 bytes --]
Hello Yafang.
I consider the runtimization you did in the first three patches
sensible, however, this fourth patch is a hard sell.
On Fri, Jan 03, 2025 at 10:24:09AM +0800, Yafang Shao <laoar.shao@gmail.com> wrote:
> However, despite adding more threads to handle an increased workload,
> the CPU usage could not be raised.
(Is that behavior same in both CONFIG_IRQ_TIME_ACCOUNTING and
!CONFIG_IRQ_TIME_ACCOUNTING cases?)
> In other words, even though the container’s CPU usage appeared low, it
> was unable to process more workloads to utilize additional CPU
> resources, which caused issues.
Hm, I think this would be worth documenting in the context of
CONFIG_IRQ_TIME_ACCOUNTING and irq.pressure.
> The CPU usage of the cgroup is relatively low at around 55%, but this usage
> doesn't increase, even with more netperf tasks. The reason is that CPU0 is
> at 100% utilization, as confirmed by mpstat:
>
> 02:56:22 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle
> 02:56:23 PM 0 0.99 0.00 55.45 0.00 0.99 42.57 0.00 0.00 0.00 0.00
>
> 02:56:23 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle
> 02:56:24 PM 0 2.00 0.00 55.00 0.00 0.00 43.00 0.00 0.00 0.00 0.00
>
> It is clear that the %soft is excluded in the cgroup of the interrupted
> task. This behavior is unexpected. We should include IRQ time in the
> cgroup to reflect the pressure the group is under.
What is irq.pressure shown in this case?
> The system metric in cpuacct.stat is crucial in indicating whether a
> container is under heavy system pressure, including IRQ/softirq activity.
> Hence, IRQ/softirq time should be included in the cpuacct system usage,
> which also applies to cgroup2’s rstat.
But this only works for you where cgroup's workload induces IRQ on
itself but generally it'd be confusing (showing some sys time that
originates out of the cgroup). irq.pressure covers this universally (or
it should).
On Fri, Jan 03, 2025 at 10:24:05AM +0800, Yafang Shao <laoar.shao@gmail.com> wrote:
> The load balancer is malfunctioning due to the exclusion of IRQ time from
> CPU utilization calculations. What's worse, there is no effective way to
> add the irq time back into the CPU utilization based on current
> available metrics. Therefore, we have to change the kernel code.
That's IMO what irq.pressure (PSI) should be useful for. Adjusting
cgroup's CPU usage with irq.pressue (perhaps not as simple as
multiplication, Johannes may step in here) should tell you info for load
balancer.
Regards,
Michal
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH v8 4/4] sched: Fix cgroup irq time for CONFIG_IRQ_TIME_ACCOUNTING
2025-01-09 10:46 ` Michal Koutný
@ 2025-01-09 13:46 ` Yafang Shao
2025-01-14 15:48 ` Michal Koutný
0 siblings, 1 reply; 16+ messages in thread
From: Yafang Shao @ 2025-01-09 13:46 UTC (permalink / raw)
To: Michal Koutný, Peter Zijlstra
Cc: mingo, hannes, juri.lelli, vincent.guittot, dietmar.eggemann,
rostedt, bsegall, mgorman, vschneid, surenb, linux-kernel,
cgroups, lkp
On Thu, Jan 9, 2025 at 6:47 PM Michal Koutný <mkoutny@suse.com> wrote:
>
> Hello Yafang.
>
> I consider the runtimization you did in the first three patches
> sensible,
The first three patches can be considered as a separate series and are
not directly related to this issue.
Hello Peter,
Since the first three patches have already received many reviews,
could you please apply them first?
> however, this fourth patch is a hard sell.
Seems that way. I'll do my best to advocate for it.
>
> On Fri, Jan 03, 2025 at 10:24:09AM +0800, Yafang Shao <laoar.shao@gmail.com> wrote:
> > However, despite adding more threads to handle an increased workload,
> > the CPU usage could not be raised.
>
> (Is that behavior same in both CONFIG_IRQ_TIME_ACCOUNTING and
> !CONFIG_IRQ_TIME_ACCOUNTING cases?)
No, in the case of !CONFIG_IRQ_TIME_ACCOUNTING, CPU usage will
increase as we add more workloads. In other words, this is a
user-visible behavior change, and we should aim to avoid it.
>
> > In other words, even though the container’s CPU usage appeared low, it
> > was unable to process more workloads to utilize additional CPU
> > resources, which caused issues.
>
> Hm, I think this would be worth documenting in the context of
> CONFIG_IRQ_TIME_ACCOUNTING and irq.pressure.
Document it as follows?
"Enabling CONFIG_IRQ_TIME_ACCOUNTING will exclude IRQ usage from the
CPU usage of your tasks. In other words, your task's CPU usage will
only reflect user time and system time."
If we document it clearly this way, I believe no one will try to enable it ;-)
>
> > The CPU usage of the cgroup is relatively low at around 55%, but this usage
> > doesn't increase, even with more netperf tasks. The reason is that CPU0 is
> > at 100% utilization, as confirmed by mpstat:
> >
> > 02:56:22 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle
> > 02:56:23 PM 0 0.99 0.00 55.45 0.00 0.99 42.57 0.00 0.00 0.00 0.00
> >
> > 02:56:23 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle
> > 02:56:24 PM 0 2.00 0.00 55.00 0.00 0.00 43.00 0.00 0.00 0.00 0.00
> >
> > It is clear that the %soft is excluded in the cgroup of the interrupted
> > task. This behavior is unexpected. We should include IRQ time in the
> > cgroup to reflect the pressure the group is under.
>
> What is irq.pressure shown in this case?
$ cat irq.pressure ; sleep 1; cat irq.pressure
full avg10=42.96 avg60=39.31 avg300=17.44 total=66518335
full avg10=42.96 avg60=39.31 avg300=17.44 total=66952730
It seems that after a certain point, avg10 is almost identical to
%irq. However, can we simply add avg10 to the CPU utilization? I don't
believe we can.
>
> > The system metric in cpuacct.stat is crucial in indicating whether a
> > container is under heavy system pressure, including IRQ/softirq activity.
> > Hence, IRQ/softirq time should be included in the cpuacct system usage,
> > which also applies to cgroup2’s rstat.
>
> But this only works for you where cgroup's workload induces IRQ on
> itself but generally it'd be confusing (showing some sys time that
> originates out of the cgroup). irq.pressure covers this universally (or
> it should).
It worked well before the introduction of CONFIG_IRQ_TIME_ACCOUNTING.
Why not just maintain the previous behavior, especially since it's not
difficult to do so?
>
> On Fri, Jan 03, 2025 at 10:24:05AM +0800, Yafang Shao <laoar.shao@gmail.com> wrote:
> > The load balancer is malfunctioning due to the exclusion of IRQ time from
> > CPU utilization calculations. What's worse, there is no effective way to
> > add the irq time back into the CPU utilization based on current
> > available metrics. Therefore, we have to change the kernel code.
>
> That's IMO what irq.pressure (PSI) should be useful for. Adjusting
> cgroup's CPU usage with irq.pressue (perhaps not as simple as
> multiplication, Johannes may step in here) should tell you info for load
> balancer.
We’re unsure how to use this metric to guide us, and I don't think
there will be clear guidance on how irq.pressure relates to CPU
utilization. :(
--
Regards
Yafang
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH v8 4/4] sched: Fix cgroup irq time for CONFIG_IRQ_TIME_ACCOUNTING
2025-01-09 13:46 ` Yafang Shao
@ 2025-01-14 15:48 ` Michal Koutný
2025-01-15 2:58 ` Yafang Shao
0 siblings, 1 reply; 16+ messages in thread
From: Michal Koutný @ 2025-01-14 15:48 UTC (permalink / raw)
To: Yafang Shao
Cc: Peter Zijlstra, mingo, hannes, juri.lelli, vincent.guittot,
dietmar.eggemann, rostedt, bsegall, mgorman, vschneid, surenb,
linux-kernel, cgroups, lkp
On Thu, Jan 09, 2025 at 09:46:23PM +0800, Yafang Shao <laoar.shao@gmail.com> wrote:
> No, in the case of !CONFIG_IRQ_TIME_ACCOUNTING, CPU usage will
> increase as we add more workloads. In other words, this is a
> user-visible behavior change, and we should aim to avoid it.
I wouldn't be excited about that -- differently configured kernel is
supposed to behave differently.
> Document it as follows?
That makes sense to me, with explanation of "where" the time
(dis)appears.
>
> "Enabling CONFIG_IRQ_TIME_ACCOUNTING will exclude IRQ usage from the
> CPU usage of your tasks. In other words, your task's CPU usage will
> only reflect user time and system time."
reflect proper user ...
and IRQ usage is only attributed on the global level visible e.g. in
/proc/stat or irq.pressure (possibly on cgroup level).
> If we document it clearly this way, I believe no one will try to enable it ;-)
I understand that users who want to have the insight between real system
time and IRQ time would enable it.
> It worked well before the introduction of CONFIG_IRQ_TIME_ACCOUNTING.
> Why not just maintain the previous behavior, especially since it's not
> difficult to do so?
Then why do you need CONFIG_IRQ_TIME_ACCOUNTING enabled? Bundling it
together with (not so) random tasks used to work for you.
> We’re unsure how to use this metric to guide us, and I don't think
> there will be clear guidance on how irq.pressure relates to CPU
> utilization. :(
(If irq.pressure is not useful in this case, then when is it useful? I
obviously need to brush up on this.)
Michal
^ permalink raw reply [flat|nested] 16+ messages in thread* Re: [PATCH v8 4/4] sched: Fix cgroup irq time for CONFIG_IRQ_TIME_ACCOUNTING
2025-01-14 15:48 ` Michal Koutný
@ 2025-01-15 2:58 ` Yafang Shao
0 siblings, 0 replies; 16+ messages in thread
From: Yafang Shao @ 2025-01-15 2:58 UTC (permalink / raw)
To: Michal Koutný
Cc: Peter Zijlstra, mingo, hannes, juri.lelli, vincent.guittot,
dietmar.eggemann, rostedt, bsegall, mgorman, vschneid, surenb,
linux-kernel, cgroups, lkp
On Tue, Jan 14, 2025 at 11:48 PM Michal Koutný <mkoutny@suse.com> wrote:
>
> On Thu, Jan 09, 2025 at 09:46:23PM +0800, Yafang Shao <laoar.shao@gmail.com> wrote:
> > No, in the case of !CONFIG_IRQ_TIME_ACCOUNTING, CPU usage will
> > increase as we add more workloads. In other words, this is a
> > user-visible behavior change, and we should aim to avoid it.
>
> I wouldn't be excited about that -- differently configured kernel is
> supposed to behave differently.
>
> > Document it as follows?
>
> That makes sense to me, with explanation of "where" the time
> (dis)appears.
>
> >
> > "Enabling CONFIG_IRQ_TIME_ACCOUNTING will exclude IRQ usage from the
> > CPU usage of your tasks. In other words, your task's CPU usage will
> > only reflect user time and system time."
> reflect proper user ...
> and IRQ usage is only attributed on the global level visible e.g. in
> /proc/stat or irq.pressure (possibly on cgroup level).
I completely agree with you that it's essential to clearly document
this behavior change. Thanks for your suggestion.
>
> > If we document it clearly this way, I believe no one will try to enable it ;-)
>
> I understand that users who want to have the insight between real system
> time and IRQ time would enable it.
>
>
> > It worked well before the introduction of CONFIG_IRQ_TIME_ACCOUNTING.
> > Why not just maintain the previous behavior, especially since it's not
> > difficult to do so?
>
> Then why do you need CONFIG_IRQ_TIME_ACCOUNTING enabled? Bundling it
> together with (not so) random tasks used to work for you.
Our motivation for enabling irq.pressure was to monitor and address
issues caused by IRQ activity. On our production servers, we have
already enabled {cpu,memory,io}.pressure, which have proven to be very
helpful. We believed that irq.pressure could provide similar benefits.
However, we encountered an unexpected behavior change introduced by
enabling irq.pressure, which has been unacceptable to our users. As a
result, we have reverted this configuration in our production
environment. If the issues we observed cannot be resolved, we will not
enable irq.pressure again.
>
> > We’re unsure how to use this metric to guide us, and I don't think
> > there will be clear guidance on how irq.pressure relates to CPU
> > utilization. :(
>
> (If irq.pressure is not useful in this case, then when is it useful? I
> obviously need to brush up on this.)
It’s not that “irq.pressure is not useful in this case,” but rather
that irq.pressure introduces a behavior change that we find
unacceptable.
Our motivation for enabling irq.pressure stems from the lack of a
dedicated IRQ utilization metric in cpuacct.stat or cpu.stat
(cgroup2). For example:
$ cat cpuacct.stat
user 21763829
system 1052269 <<<< irq is included in system
$ cat cpu.stat
usage_usec 3878342
user_usec 1589312
system_usec 2289029 <<<< irq is included here
...
To address this, I propose introducing a separate metric, such as irq
or irq_usec (cgroup2), to explicitly reflect the IRQ utilization of a
cgroup. This approach would eliminate the need to enable irq.pressure
while providing the desired insights into IRQ usage.
WDYT ?
--
Regards
Yafang
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH v8 4/4] sched: Fix cgroup irq time for CONFIG_IRQ_TIME_ACCOUNTING
2025-01-03 2:24 ` [PATCH v8 4/4] sched: Fix cgroup irq time for CONFIG_IRQ_TIME_ACCOUNTING Yafang Shao
2025-01-09 10:46 ` Michal Koutný
@ 2025-01-10 3:04 ` kernel test robot
1 sibling, 0 replies; 16+ messages in thread
From: kernel test robot @ 2025-01-10 3:04 UTC (permalink / raw)
To: Yafang Shao
Cc: oe-lkp, lkp, Johannes Weiner, Michal Koutný, linux-kernel,
aubrey.li, yu.c.chen, peterz, mingo, juri.lelli, vincent.guittot,
dietmar.eggemann, rostedt, bsegall, mgorman, vschneid, surenb,
cgroups, Yafang Shao
Hello,
kernel test robot noticed a 2.3% regression of will-it-scale.per_process_ops on:
commit: efcf26d77a358557d9d847fdca09ccd0690e6831 ("[PATCH v8 4/4] sched: Fix cgroup irq time for CONFIG_IRQ_TIME_ACCOUNTING")
url: https://github.com/intel-lab-lkp/linux/commits/Yafang-Shao/sched-Define-sched_clock_irqtime-as-static-key/20250103-102726
base: https://git.kernel.org/cgit/linux/kernel/git/tip/tip.git 7c8cd569ff66755f17b0c0c03a9d8df1b6f3e9ed
patch link: https://lore.kernel.org/all/20250103022409.2544-5-laoar.shao@gmail.com/
patch subject: [PATCH v8 4/4] sched: Fix cgroup irq time for CONFIG_IRQ_TIME_ACCOUNTING
testcase: will-it-scale
config: x86_64-rhel-9.4
compiler: gcc-12
test machine: 104 threads 2 sockets (Skylake) with 192G memory
parameters:
nr_task: 100%
mode: process
test: context_switch1
cpufreq_governor: performance
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <oliver.sang@intel.com>
| Closes: https://lore.kernel.org/oe-lkp/202501101033.b66070d2-lkp@intel.com
Details are as below:
-------------------------------------------------------------------------------------------------->
The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20250110/202501101033.b66070d2-lkp@intel.com
=========================================================================================
compiler/cpufreq_governor/kconfig/mode/nr_task/rootfs/tbox_group/test/testcase:
gcc-12/performance/x86_64-rhel-9.4/process/100%/debian-12-x86_64-20240206.cgz/lkp-skl-fpga01/context_switch1/will-it-scale
commit:
278e29c10e ("sched, psi: Don't account irq time if sched_clock_irqtime is disabled")
efcf26d77a ("sched: Fix cgroup irq time for CONFIG_IRQ_TIME_ACCOUNTING")
278e29c10e4553fa efcf26d77a358557d9d847fdca0
---------------- ---------------------------
%stddev %change %stddev
\ | \
0.01 ± 5% +20.0% 0.01 ± 8% perf-sched.sch_delay.avg.ms.__cond_resched.mutex_lock.pipe_read.vfs_read.ksys_read
20608395 -2.4% 20105294 vmstat.system.cs
20905845 -2.3% 20428184 will-it-scale.104.processes
201017 -2.3% 196424 will-it-scale.per_process_ops
20905845 -2.3% 20428184 will-it-scale.workload
1.42 -0.0 1.39 perf-stat.i.branch-miss-rate%
3.348e+08 -3.3% 3.237e+08 perf-stat.i.branch-misses
20793493 -2.5% 20282595 perf-stat.i.context-switches
199.94 -2.5% 195.03 perf-stat.i.metric.K/sec
1.40 -0.0 1.37 perf-stat.overall.branch-miss-rate%
1728376 +1.9% 1761726 perf-stat.overall.path-length
3.336e+08 -3.3% 3.226e+08 perf-stat.ps.branch-misses
20722198 -2.5% 20214311 perf-stat.ps.context-switches
2.55 -0.4 2.12 ± 2% perf-profile.calltrace.cycles-pp.entry_SYSCALL_64_safe_stack.read
8.10 -0.4 7.70 perf-profile.calltrace.cycles-pp.__wake_up_sync_key.pipe_write.vfs_write.ksys_write.do_syscall_64
7.35 -0.3 7.00 perf-profile.calltrace.cycles-pp.autoremove_wake_function.__wake_up_common.__wake_up_sync_key.pipe_write.vfs_write
7.60 -0.3 7.25 perf-profile.calltrace.cycles-pp.__wake_up_common.__wake_up_sync_key.pipe_write.vfs_write.ksys_write
6.96 -0.3 6.63 perf-profile.calltrace.cycles-pp.try_to_wake_up.autoremove_wake_function.__wake_up_common.__wake_up_sync_key.pipe_write
2.68 -0.3 2.35 perf-profile.calltrace.cycles-pp.wakeup_preempt.try_to_wake_up.autoremove_wake_function.__wake_up_common.__wake_up_sync_key
2.29 -0.3 2.00 perf-profile.calltrace.cycles-pp.check_preempt_wakeup_fair.wakeup_preempt.try_to_wake_up.autoremove_wake_function.__wake_up_common
7.85 -0.3 7.56 perf-profile.calltrace.cycles-pp.syscall_return_via_sysret.read
14.42 -0.2 14.19 perf-profile.calltrace.cycles-pp.ksys_write.do_syscall_64.entry_SYSCALL_64_after_hwframe.write
1.99 -0.2 1.78 perf-profile.calltrace.cycles-pp.entry_SYSCALL_64_safe_stack.write
13.40 -0.2 13.20 perf-profile.calltrace.cycles-pp.vfs_write.ksys_write.do_syscall_64.entry_SYSCALL_64_after_hwframe.write
1.34 -0.2 1.17 perf-profile.calltrace.cycles-pp.update_curr.check_preempt_wakeup_fair.wakeup_preempt.try_to_wake_up.autoremove_wake_function
1.42 -0.2 1.27 perf-profile.calltrace.cycles-pp.update_curr.dequeue_entity.dequeue_entities.dequeue_task_fair.try_to_block_task
2.49 -0.1 2.37 perf-profile.calltrace.cycles-pp.entry_SYSCALL_64.read
2.30 -0.1 2.20 perf-profile.calltrace.cycles-pp.entry_SYSCALL_64.write
0.80 -0.1 0.72 perf-profile.calltrace.cycles-pp.mutex_lock.pipe_read.vfs_read.ksys_read.do_syscall_64
1.94 -0.0 1.90 perf-profile.calltrace.cycles-pp.switch_mm_irqs_off.__schedule.schedule.pipe_read.vfs_read
0.89 -0.0 0.85 perf-profile.calltrace.cycles-pp.prepare_to_wait_event.pipe_read.vfs_read.ksys_read.do_syscall_64
0.99 -0.0 0.96 perf-profile.calltrace.cycles-pp.copy_page_to_iter.pipe_read.vfs_read.ksys_read.do_syscall_64
0.79 -0.0 0.76 perf-profile.calltrace.cycles-pp._copy_to_iter.copy_page_to_iter.pipe_read.vfs_read.ksys_read
0.71 -0.0 0.69 perf-profile.calltrace.cycles-pp.fdget_pos.ksys_write.do_syscall_64.entry_SYSCALL_64_after_hwframe.write
0.72 -0.0 0.70 perf-profile.calltrace.cycles-pp.__switch_to_asm.read
0.70 ± 2% +0.0 0.73 ± 2% perf-profile.calltrace.cycles-pp.inode_needs_update_time.file_update_time.pipe_write.vfs_write.ksys_write
0.80 +0.0 0.84 perf-profile.calltrace.cycles-pp.file_update_time.pipe_write.vfs_write.ksys_write.do_syscall_64
0.62 +0.1 0.69 perf-profile.calltrace.cycles-pp.update_load_avg.dequeue_entity.dequeue_entities.dequeue_task_fair.try_to_block_task
1.48 +0.1 1.57 perf-profile.calltrace.cycles-pp.set_next_entity.pick_next_task_fair.__pick_next_task.__schedule.schedule
1.29 +0.1 1.40 perf-profile.calltrace.cycles-pp.touch_atime.pipe_read.vfs_read.ksys_read.do_syscall_64
1.05 +0.1 1.17 perf-profile.calltrace.cycles-pp.atime_needs_update.touch_atime.pipe_read.vfs_read.ksys_read
2.19 +0.1 2.32 perf-profile.calltrace.cycles-pp.put_prev_entity.pick_next_task_fair.__pick_next_task.__schedule.schedule
3.22 +0.1 3.34 perf-profile.calltrace.cycles-pp.pick_next_task_fair.__pick_next_task.__schedule.schedule.syscall_exit_to_user_mode
3.37 +0.1 3.51 perf-profile.calltrace.cycles-pp.__pick_next_task.__schedule.schedule.syscall_exit_to_user_mode.do_syscall_64
0.34 ± 70% +0.2 0.55 ± 3% perf-profile.calltrace.cycles-pp.update_load_avg.requeue_delayed_entity.enqueue_task.try_to_wake_up.autoremove_wake_function
3.82 +0.2 4.03 perf-profile.calltrace.cycles-pp.pick_next_task_fair.__pick_next_task.__schedule.schedule.pipe_read
4.01 +0.2 4.23 perf-profile.calltrace.cycles-pp.__pick_next_task.__schedule.schedule.pipe_read.vfs_read
48.03 +0.2 48.27 perf-profile.calltrace.cycles-pp.write
25.66 +0.4 26.03 perf-profile.calltrace.cycles-pp.do_syscall_64.entry_SYSCALL_64_after_hwframe.write
30.22 +0.4 30.60 perf-profile.calltrace.cycles-pp.entry_SYSCALL_64_after_hwframe.read
29.31 +0.4 29.69 perf-profile.calltrace.cycles-pp.entry_SYSCALL_64_after_hwframe.write
26.54 +0.4 26.92 perf-profile.calltrace.cycles-pp.do_syscall_64.entry_SYSCALL_64_after_hwframe.read
0.17 ±141% +0.4 0.61 perf-profile.calltrace.cycles-pp.current_time.atime_needs_update.touch_atime.pipe_read.vfs_read
19.76 +0.5 20.22 perf-profile.calltrace.cycles-pp.pipe_read.vfs_read.ksys_read.do_syscall_64.entry_SYSCALL_64_after_hwframe
22.07 +0.5 22.57 perf-profile.calltrace.cycles-pp.ksys_read.do_syscall_64.entry_SYSCALL_64_after_hwframe.read
20.96 +0.5 21.46 perf-profile.calltrace.cycles-pp.vfs_read.ksys_read.do_syscall_64.entry_SYSCALL_64_after_hwframe.read
0.00 +0.5 0.52 ± 2% perf-profile.calltrace.cycles-pp.current_time.inode_needs_update_time.file_update_time.pipe_write.vfs_write
7.25 +0.5 7.77 perf-profile.calltrace.cycles-pp.schedule.syscall_exit_to_user_mode.do_syscall_64.entry_SYSCALL_64_after_hwframe.write
7.04 +0.5 7.58 perf-profile.calltrace.cycles-pp.__schedule.schedule.syscall_exit_to_user_mode.do_syscall_64.entry_SYSCALL_64_after_hwframe
12.97 +0.6 13.56 perf-profile.calltrace.cycles-pp.schedule.pipe_read.vfs_read.ksys_read.do_syscall_64
10.18 +0.6 10.78 perf-profile.calltrace.cycles-pp.syscall_exit_to_user_mode.do_syscall_64.entry_SYSCALL_64_after_hwframe.write
12.68 +0.6 13.29 perf-profile.calltrace.cycles-pp.__schedule.schedule.pipe_read.vfs_read.ksys_read
6.83 -0.5 6.29 perf-profile.children.cycles-pp.entry_SYSCALL_64
8.12 -0.4 7.71 perf-profile.children.cycles-pp.__wake_up_sync_key
7.36 -0.4 7.00 perf-profile.children.cycles-pp.autoremove_wake_function
7.08 -0.4 6.73 perf-profile.children.cycles-pp.try_to_wake_up
7.62 -0.4 7.27 perf-profile.children.cycles-pp.__wake_up_common
15.19 -0.3 14.86 perf-profile.children.cycles-pp.syscall_return_via_sysret
2.72 -0.3 2.39 perf-profile.children.cycles-pp.wakeup_preempt
4.36 -0.3 4.04 perf-profile.children.cycles-pp.update_curr
2.51 -0.3 2.19 perf-profile.children.cycles-pp.entry_SYSCALL_64_safe_stack
2.33 -0.3 2.04 perf-profile.children.cycles-pp.check_preempt_wakeup_fair
14.46 -0.2 14.22 perf-profile.children.cycles-pp.ksys_write
13.44 -0.2 13.24 perf-profile.children.cycles-pp.vfs_write
51.15 -0.2 50.96 perf-profile.children.cycles-pp.read
0.44 -0.1 0.35 ± 24% perf-profile.children.cycles-pp.reader__read_event
1.22 -0.1 1.14 perf-profile.children.cycles-pp.mutex_lock
1.09 -0.1 1.02 perf-profile.children.cycles-pp._raw_spin_lock_irqsave
1.16 -0.1 1.09 perf-profile.children.cycles-pp.pick_eevdf
0.49 ± 3% -0.1 0.42 ± 2% perf-profile.children.cycles-pp.update_curr_dl_se
3.47 -0.1 3.41 perf-profile.children.cycles-pp.switch_mm_irqs_off
1.00 -0.1 0.94 perf-profile.children.cycles-pp.__switch_to
0.74 -0.1 0.68 perf-profile.children.cycles-pp.__enqueue_entity
0.50 -0.0 0.46 perf-profile.children.cycles-pp.anon_pipe_buf_release
0.90 -0.0 0.86 perf-profile.children.cycles-pp.prepare_to_wait_event
0.13 ± 2% -0.0 0.09 ± 5% perf-profile.children.cycles-pp.__x64_sys_read
1.00 -0.0 0.97 perf-profile.children.cycles-pp.copy_page_to_iter
0.38 -0.0 0.34 perf-profile.children.cycles-pp.testcase
0.65 -0.0 0.61 perf-profile.children.cycles-pp.mutex_unlock
0.49 -0.0 0.46 ± 3% perf-profile.children.cycles-pp.rep_movs_alternative
0.85 -0.0 0.82 perf-profile.children.cycles-pp._copy_to_iter
1.38 -0.0 1.35 perf-profile.children.cycles-pp.__switch_to_asm
0.06 -0.0 0.03 ± 70% perf-profile.children.cycles-pp.psi_account_irqtime
0.63 -0.0 0.60 perf-profile.children.cycles-pp.___perf_sw_event
0.15 ± 2% -0.0 0.12 ± 3% perf-profile.children.cycles-pp.__resched_curr
0.56 -0.0 0.54 perf-profile.children.cycles-pp.__cond_resched
0.44 -0.0 0.42 perf-profile.children.cycles-pp.finish_task_switch
0.37 -0.0 0.35 perf-profile.children.cycles-pp.__wrgsbase_inactive
0.36 -0.0 0.34 ± 2% perf-profile.children.cycles-pp.child
0.09 ± 4% -0.0 0.07 ± 6% perf-profile.children.cycles-pp.mm_cid_get
0.22 ± 2% -0.0 0.20 ± 3% perf-profile.children.cycles-pp.dl_scaled_delta_exec
0.08 ± 6% +0.0 0.09 perf-profile.children.cycles-pp.__x64_sys_write
0.09 ± 4% +0.0 0.11 ± 4% perf-profile.children.cycles-pp.read@plt
0.66 +0.0 0.68 perf-profile.children.cycles-pp.update_rq_clock
0.22 ± 2% +0.0 0.25 perf-profile.children.cycles-pp._raw_spin_unlock_irqrestore
0.16 ± 3% +0.0 0.18 ± 3% perf-profile.children.cycles-pp.dequeue_task
0.21 +0.0 0.24 ± 2% perf-profile.children.cycles-pp.__calc_delta
0.11 ± 6% +0.0 0.13 ± 3% perf-profile.children.cycles-pp.blkcg_maybe_throttle_current
0.32 +0.0 0.34 ± 2% perf-profile.children.cycles-pp.x64_sys_call
0.71 ± 2% +0.0 0.74 perf-profile.children.cycles-pp.vruntime_eligible
0.70 ± 2% +0.0 0.74 ± 2% perf-profile.children.cycles-pp.inode_needs_update_time
0.82 +0.0 0.86 perf-profile.children.cycles-pp.file_update_time
0.08 ± 4% +0.1 0.14 ± 3% perf-profile.children.cycles-pp.sched_update_worker
0.00 +0.1 0.07 ± 7% perf-profile.children.cycles-pp.arch_scale_cpu_capacity
0.07 +0.1 0.14 ± 2% perf-profile.children.cycles-pp.__set_next_task_fair
0.82 +0.1 0.89 perf-profile.children.cycles-pp.__update_load_avg_cfs_rq
0.00 +0.1 0.08 ± 13% perf-profile.children.cycles-pp.cpuacct_account_field
1.54 +0.1 1.64 perf-profile.children.cycles-pp.set_next_entity
2.35 +0.1 2.46 perf-profile.children.cycles-pp.put_prev_entity
1.30 +0.1 1.41 perf-profile.children.cycles-pp.touch_atime
1.08 +0.1 1.19 perf-profile.children.cycles-pp.atime_needs_update
0.58 +0.1 0.70 ± 2% perf-profile.children.cycles-pp.ktime_get_coarse_real_ts64_mg
0.27 ± 8% +0.1 0.40 ± 5% perf-profile.children.cycles-pp.__cgroup_account_cputime
0.07 ± 11% +0.1 0.21 ± 5% perf-profile.children.cycles-pp.cgroup_rstat_updated
0.26 ± 6% +0.2 0.41 ± 8% perf-profile.children.cycles-pp.cpuacct_charge
1.01 +0.2 1.16 perf-profile.children.cycles-pp.current_time
3.00 +0.2 3.17 perf-profile.children.cycles-pp.update_load_avg
0.00 +0.2 0.19 ± 4% perf-profile.children.cycles-pp.__cgroup_account_cputime_field
48.50 +0.2 48.74 perf-profile.children.cycles-pp.write
7.40 +0.3 7.75 perf-profile.children.cycles-pp.__pick_next_task
7.20 +0.4 7.55 perf-profile.children.cycles-pp.pick_next_task_fair
19.93 +0.5 20.39 perf-profile.children.cycles-pp.pipe_read
22.09 +0.5 22.59 perf-profile.children.cycles-pp.ksys_read
20.98 +0.5 21.49 perf-profile.children.cycles-pp.vfs_read
13.69 +0.6 14.25 perf-profile.children.cycles-pp.syscall_exit_to_user_mode
52.30 +0.8 53.06 perf-profile.children.cycles-pp.do_syscall_64
60.19 +0.8 60.96 perf-profile.children.cycles-pp.entry_SYSCALL_64_after_hwframe
0.00 +0.8 0.78 perf-profile.children.cycles-pp.account_irqtime
20.22 +1.1 21.35 perf-profile.children.cycles-pp.schedule
19.84 +1.1 20.98 perf-profile.children.cycles-pp.__schedule
6.15 -0.5 5.64 perf-profile.self.cycles-pp.entry_SYSCALL_64
14.70 -0.3 14.36 perf-profile.self.cycles-pp.syscall_return_via_sysret
1.98 -0.2 1.78 perf-profile.self.cycles-pp.update_curr
0.54 -0.1 0.43 perf-profile.self.cycles-pp.check_preempt_wakeup_fair
1.68 -0.1 1.59 perf-profile.self.cycles-pp.pipe_read
1.45 -0.1 1.37 perf-profile.self.cycles-pp.do_syscall_64
1.06 -0.1 0.99 perf-profile.self.cycles-pp._raw_spin_lock_irqsave
0.72 -0.1 0.64 perf-profile.self.cycles-pp.pick_eevdf
0.61 ± 2% -0.1 0.55 perf-profile.self.cycles-pp.try_to_wake_up
3.45 -0.1 3.38 perf-profile.self.cycles-pp.switch_mm_irqs_off
0.28 ± 3% -0.0 0.23 ± 2% perf-profile.self.cycles-pp.update_curr_dl_se
0.09 -0.0 0.04 ± 44% perf-profile.self.cycles-pp.__x64_sys_read
0.92 -0.0 0.87 perf-profile.self.cycles-pp.__switch_to
0.49 -0.0 0.44 perf-profile.self.cycles-pp.anon_pipe_buf_release
0.50 -0.0 0.46 perf-profile.self.cycles-pp.prepare_to_wait_event
0.53 -0.0 0.50 perf-profile.self.cycles-pp.update_rq_clock_task
0.69 -0.0 0.66 perf-profile.self.cycles-pp.__enqueue_entity
0.62 -0.0 0.58 perf-profile.self.cycles-pp.mutex_unlock
0.28 -0.0 0.25 ± 2% perf-profile.self.cycles-pp.testcase
1.37 -0.0 1.34 perf-profile.self.cycles-pp.__switch_to_asm
0.76 -0.0 0.73 perf-profile.self.cycles-pp.mutex_lock
0.39 -0.0 0.37 ± 3% perf-profile.self.cycles-pp.rep_movs_alternative
0.37 -0.0 0.35 perf-profile.self.cycles-pp.__cond_resched
0.14 ± 3% -0.0 0.12 ± 5% perf-profile.self.cycles-pp.__resched_curr
0.32 ± 2% -0.0 0.29 perf-profile.self.cycles-pp.finish_task_switch
0.50 -0.0 0.48 perf-profile.self.cycles-pp.__get_user_8
0.09 ± 4% -0.0 0.07 ± 5% perf-profile.self.cycles-pp.mm_cid_get
0.37 -0.0 0.35 perf-profile.self.cycles-pp.__wrgsbase_inactive
0.21 -0.0 0.20 perf-profile.self.cycles-pp.wakeup_preempt
0.30 -0.0 0.28 perf-profile.self.cycles-pp.dequeue_task_fair
0.08 +0.0 0.09 perf-profile.self.cycles-pp.kill_fasync
0.14 ± 3% +0.0 0.16 ± 3% perf-profile.self.cycles-pp.update_rq_clock
0.08 ± 4% +0.0 0.10 ± 5% perf-profile.self.cycles-pp.read@plt
0.09 +0.0 0.10 ± 4% perf-profile.self.cycles-pp.raw_spin_rq_unlock
0.19 +0.0 0.22 ± 2% perf-profile.self.cycles-pp._raw_spin_unlock_irqrestore
0.09 ± 7% +0.0 0.12 ± 4% perf-profile.self.cycles-pp.blkcg_maybe_throttle_current
0.14 ± 3% +0.0 0.17 ± 2% perf-profile.self.cycles-pp.dequeue_task
0.30 +0.0 0.32 ± 2% perf-profile.self.cycles-pp.x64_sys_call
0.32 +0.0 0.34 perf-profile.self.cycles-pp.copy_page_from_iter
0.12 ± 7% +0.0 0.15 ± 3% perf-profile.self.cycles-pp.enqueue_task
0.22 +0.0 0.25 perf-profile.self.cycles-pp.rseq_ip_fixup
0.20 +0.0 0.23 ± 2% perf-profile.self.cycles-pp.__calc_delta
0.24 ± 2% +0.0 0.27 ± 7% perf-profile.self.cycles-pp.perf_tp_event
0.66 +0.0 0.69 perf-profile.self.cycles-pp.vruntime_eligible
0.42 ± 2% +0.0 0.46 perf-profile.self.cycles-pp.current_time
0.42 ± 3% +0.0 0.46 ± 2% perf-profile.self.cycles-pp.dequeue_entity
0.00 +0.1 0.05 ± 8% perf-profile.self.cycles-pp.cpuacct_account_field
0.63 ± 3% +0.1 0.69 ± 2% perf-profile.self.cycles-pp.pick_task_fair
0.05 ± 7% +0.1 0.12 ± 4% perf-profile.self.cycles-pp.__set_next_task_fair
0.00 +0.1 0.06 ± 7% perf-profile.self.cycles-pp.__x64_sys_write
0.00 +0.1 0.06 ± 7% perf-profile.self.cycles-pp.arch_scale_cpu_capacity
0.08 ± 6% +0.1 0.14 ± 2% perf-profile.self.cycles-pp.sched_update_worker
0.65 +0.1 0.71 perf-profile.self.cycles-pp.vfs_read
0.76 +0.1 0.83 perf-profile.self.cycles-pp.__update_load_avg_cfs_rq
0.29 +0.1 0.36 perf-profile.self.cycles-pp.set_next_entity
1.25 ± 2% +0.1 1.33 ± 2% perf-profile.self.cycles-pp.update_load_avg
0.20 ± 8% +0.1 0.28 ± 6% perf-profile.self.cycles-pp.__cgroup_account_cputime
0.36 ± 2% +0.1 0.44 perf-profile.self.cycles-pp.put_prev_entity
0.77 +0.1 0.87 perf-profile.self.cycles-pp.pick_next_task_fair
0.00 +0.1 0.11 ± 18% perf-profile.self.cycles-pp.__cgroup_account_cputime_field
0.56 ± 2% +0.1 0.68 ± 2% perf-profile.self.cycles-pp.ktime_get_coarse_real_ts64_mg
1.24 ± 3% +0.1 1.36 ± 3% perf-profile.self.cycles-pp.write
0.07 ± 11% +0.1 0.20 ± 4% perf-profile.self.cycles-pp.cgroup_rstat_updated
1.56 +0.1 1.70 perf-profile.self.cycles-pp.pipe_write
0.25 ± 6% +0.1 0.39 ± 8% perf-profile.self.cycles-pp.cpuacct_charge
0.00 +0.4 0.38 ± 3% perf-profile.self.cycles-pp.account_irqtime
Disclaimer:
Results have been estimated based on internal Intel analysis and are provided
for informational purposes only. Any difference in system hardware or software
design or configuration may affect actual performance.
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 16+ messages in thread
* [PATCH v8 4/4] sched: Fix cgroup irq time for CONFIG_IRQ_TIME_ACCOUNTING
2024-12-22 2:47 [PATCH v8 0/4] sched: Fix missing irq time when CONFIG_IRQ_TIME_ACCOUNTING is enabled Yafang Shao
@ 2024-12-22 2:47 ` Yafang Shao
0 siblings, 0 replies; 16+ messages in thread
From: Yafang Shao @ 2024-12-22 2:47 UTC (permalink / raw)
To: mingo, peterz, mkoutny, hannes
Cc: juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
mgorman, vschneid, surenb, linux-kernel, cgroups, lkp,
Yafang Shao
After enabling CONFIG_IRQ_TIME_ACCOUNTING to monitor IRQ pressure in our
container environment, we observed several noticeable behavioral changes.
One of our IRQ-heavy services, such as Redis, reported a significant
reduction in CPU usage after upgrading to the new kernel with
CONFIG_IRQ_TIME_ACCOUNTING enabled. However, despite adding more threads
to handle an increased workload, the CPU usage could not be raised. In
other words, even though the container’s CPU usage appeared low, it was
unable to process more workloads to utilize additional CPU resources, which
caused issues.
This behavior can be demonstrated using netperf:
function start_server() {
for j in `seq 1 3`; do
netserver -p $[12345+j] > /dev/null &
done
}
server_ip=$1
function start_client() {
# That applies to cgroup2 as well.
mkdir -p /sys/fs/cgroup/cpuacct/test
echo $$ > /sys/fs/cgroup/cpuacct/test/cgroup.procs
for j in `seq 1 3`; do
port=$[12345+j]
taskset -c 0 netperf -H ${server_ip} -l ${run_time:-30000} \
-t TCP_STREAM -p $port -- -D -m 1k -M 1K -s 8k -S 8k \
> /dev/null &
done
}
start_server
start_client
We can verify the CPU usage of the test cgroup using cpuacct.stat. The
output shows:
system: 53
user: 2
The CPU usage of the cgroup is relatively low at around 55%, but this usage
doesn't increase, even with more netperf tasks. The reason is that CPU0 is
at 100% utilization, as confirmed by mpstat:
02:56:22 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle
02:56:23 PM 0 0.99 0.00 55.45 0.00 0.99 42.57 0.00 0.00 0.00 0.00
02:56:23 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle
02:56:24 PM 0 2.00 0.00 55.00 0.00 0.00 43.00 0.00 0.00 0.00 0.00
It is clear that the %soft is excluded in the cgroup of the interrupted
task. This behavior is unexpected. We should include IRQ time in the
cgroup to reflect the pressure the group is under.
After a thorough analysis, I discovered that this change in behavior is due
to commit 305e6835e055 ("sched: Do not account irq time to current task"),
which altered whether IRQ time should be charged to the interrupted task.
While I agree that a task should not be penalized by random interrupts, the
task itself cannot progress while interrupted. Therefore, the interrupted
time should be reported to the user.
The system metric in cpuacct.stat is crucial in indicating whether a
container is under heavy system pressure, including IRQ/softirq activity.
Hence, IRQ/softirq time should be included in the cpuacct system usage,
which also applies to cgroup2’s rstat.
The reason it doesn't just add the cgroup_account_*() to
irqtime_account_irq() is that it might result in performance hit to hold
the rq_lock in the critical path. Taking inspiration from
commit ddae0ca2a8fe ("sched: Move psi_account_irqtime() out of
update_rq_clock_task() hotpath"), I've now adapted the approach to handle
it in a non-critical path, reducing the performance impact.
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Koutný <mkoutny@suse.com>
---
kernel/sched/core.c | 33 +++++++++++++++++++++++++++++++--
kernel/sched/psi.c | 13 +++----------
kernel/sched/sched.h | 2 +-
kernel/sched/stats.h | 7 ++++---
4 files changed, 39 insertions(+), 16 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 22dfcd3e92ed..7faacf320af9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5622,6 +5622,35 @@ __setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
static inline u64 cpu_resched_latency(struct rq *rq) { return 0; }
#endif /* CONFIG_SCHED_DEBUG */
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static void account_irqtime(struct rq *rq, struct task_struct *curr,
+ struct task_struct *prev)
+{
+ int cpu = smp_processor_id();
+ s64 delta;
+ u64 irq;
+
+ if (!irqtime_enabled())
+ return;
+
+ irq = irq_time_read(cpu);
+ delta = (s64)(irq - rq->irq_time);
+ if (delta < 0)
+ return;
+
+ rq->irq_time = irq;
+ psi_account_irqtime(rq, curr, prev, delta);
+ cgroup_account_cputime(curr, delta);
+ /* We account both softirq and irq into CPUTIME_IRQ */
+ cgroup_account_cputime_field(curr, CPUTIME_IRQ, delta);
+}
+#else
+static inline void account_irqtime(struct rq *rq, struct task_struct *curr,
+ struct task_struct *prev)
+{
+}
+#endif
+
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
@@ -5644,7 +5673,7 @@ void sched_tick(void)
rq_lock(rq, &rf);
donor = rq->donor;
- psi_account_irqtime(rq, donor, NULL);
+ account_irqtime(rq, donor, NULL);
update_rq_clock(rq);
hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
@@ -6751,7 +6780,7 @@ static void __sched notrace __schedule(int sched_mode)
++*switch_count;
migrate_disable_switch(rq, prev);
- psi_account_irqtime(rq, prev, next);
+ account_irqtime(rq, prev, next);
psi_sched_switch(prev, next, block);
trace_sched_switch(preempt, prev, next, prev_state);
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index bb56805e3d47..6fa3233fcba9 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -990,15 +990,14 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
}
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev)
+void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev,
+ s64 delta)
{
int cpu = task_cpu(curr);
struct psi_group *group;
struct psi_group_cpu *groupc;
- s64 delta;
- u64 irq;
- if (static_branch_likely(&psi_disabled) || !irqtime_enabled())
+ if (static_branch_likely(&psi_disabled))
return;
if (!curr->pid)
@@ -1009,12 +1008,6 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st
if (prev && task_psi_group(prev) == group)
return;
- irq = irq_time_read(cpu);
- delta = (s64)(irq - rq->psi_irq_time);
- if (delta < 0)
- return;
- rq->psi_irq_time = irq;
-
do {
u64 now;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7e8c73110884..570cc19a0060 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1227,7 +1227,7 @@ struct rq {
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
- u64 psi_irq_time;
+ u64 irq_time;
#endif
#ifdef CONFIG_PARAVIRT
u64 prev_steal_time;
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 8ee0add5a48a..b9ed9fea5ab7 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -111,10 +111,11 @@ void psi_task_change(struct task_struct *task, int clear, int set);
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
bool sleep);
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev);
+void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
+ struct task_struct *prev, s64 delta);
#else
static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
- struct task_struct *prev) {}
+ struct task_struct *prev, s64 delta) {}
#endif /*CONFIG_IRQ_TIME_ACCOUNTING */
/*
* PSI tracks state that persists across sleeps, such as iowaits and
@@ -224,7 +225,7 @@ static inline void psi_sched_switch(struct task_struct *prev,
struct task_struct *next,
bool sleep) {}
static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
- struct task_struct *prev) {}
+ struct task_struct *prev, s64 delta) {}
#endif /* CONFIG_PSI */
#ifdef CONFIG_SCHED_INFO
--
2.43.5
^ permalink raw reply related [flat|nested] 16+ messages in thread