* [PATCH] sched: Move psi_account_irqtime() out of update_rq_clock_task() hotpath
@ 2024-06-18 21:58 John Stultz
2024-06-19 11:15 ` Qais Yousef
` (2 more replies)
0 siblings, 3 replies; 4+ messages in thread
From: John Stultz @ 2024-06-18 21:58 UTC (permalink / raw)
To: LKML
Cc: John Stultz, Ingo Molnar, Peter Zijlstra, Juri Lelli,
Vincent Guittot, Dietmar Eggemann, Steven Rostedt, Ben Segall,
Mel Gorman, Daniel Bristot de Oliveira, Valentin Schneider,
Johannes Weiner, Suren Baghdasaryan, Chengming Zhou,
Thomas Gleixner, Frederic Weisbecker, Qais Yousef, Joel Fernandes,
kernel-team, Jimmy Shiu
It was reported that in moving to 6.1, a larger then 10%
regression was seen in the performance of
clock_gettime(CLOCK_THREAD_CPUTIME_ID,...).
Using a simple reproducer, I found:
5.10:
100000000 calls in 24345994193 ns => 243.460 ns per call
100000000 calls in 24288172050 ns => 242.882 ns per call
100000000 calls in 24289135225 ns => 242.891 ns per call
6.1:
100000000 calls in 28248646742 ns => 282.486 ns per call
100000000 calls in 28227055067 ns => 282.271 ns per call
100000000 calls in 28177471287 ns => 281.775 ns per call
The cause of this was finally narrowed down to the addition of
psi_account_irqtime() in update_rq_clock_task(), in commit
52b1364ba0b1 ("sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ
pressure").
In my initial attempt to resolve this, I leaned towards moving
all accounting work out of the clock_gettime() call path, but it
wasn't very pretty, so it will have to wait for a later deeper
rework. Instead, Peter shared this approach:
Rework psi_account_irqtime() to use its own psi_irq_time base
for accounting, and move it out of the hotpath, calling it
instead from sched_tick() and __schedule().
In testing this, we found the importance of ensuring
psi_account_irqtime() is run under the rq_lock, which Johannes
Weiner helpfully explained, so also add some lockdep annotations
to make that requirement clear.
With this change the performance is back in-line with 5.10:
6.1+fix:
100000000 calls in 24297324597 ns => 242.973 ns per call
100000000 calls in 24318869234 ns => 243.189 ns per call
100000000 calls in 24291564588 ns => 242.916 ns per call
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ben Segall <bsegall@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Chengming Zhou <zhouchengming@bytedance.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Qais Yousef <qyousef@layalina.io>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: kernel-team@android.com
Originally-by: Peter Zijlstra <peterz@infradead.org>
Reported-by: Jimmy Shiu <jimmyshiu@google.com>
Signed-off-by: John Stultz <jstultz@google.com>
---
kernel/sched/core.c | 7 +++++--
kernel/sched/psi.c | 21 ++++++++++++++++-----
kernel/sched/sched.h | 1 +
kernel/sched/stats.h | 11 ++++++++---
4 files changed, 30 insertions(+), 10 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bcf2c4cc0522..59ce0841eb1f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -723,7 +723,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->prev_irq_time += irq_delta;
delta -= irq_delta;
- psi_account_irqtime(rq->curr, irq_delta);
delayacct_irq(rq->curr, irq_delta);
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
@@ -5665,7 +5664,7 @@ void sched_tick(void)
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
- struct task_struct *curr = rq->curr;
+ struct task_struct *curr;
struct rq_flags rf;
unsigned long hw_pressure;
u64 resched_latency;
@@ -5677,6 +5676,9 @@ void sched_tick(void)
rq_lock(rq, &rf);
+ curr = rq->curr;
+ psi_account_irqtime(rq, curr, NULL);
+
update_rq_clock(rq);
hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
@@ -6737,6 +6739,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
++*switch_count;
migrate_disable_switch(rq, prev);
+ psi_account_irqtime(rq, prev, next);
psi_sched_switch(prev, next, !task_on_rq_queued(prev));
trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 7b4aa5809c0f..507d7b8d79af 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -773,6 +773,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
enum psi_states s;
u32 state_mask;
+ lockdep_assert_rq_held(cpu_rq(cpu));
groupc = per_cpu_ptr(group->pcpu, cpu);
/*
@@ -991,22 +992,32 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
}
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-void psi_account_irqtime(struct task_struct *task, u32 delta)
+void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev)
{
- int cpu = task_cpu(task);
+ int cpu = task_cpu(curr);
struct psi_group *group;
struct psi_group_cpu *groupc;
- u64 now;
+ u64 now, irq;
+ s64 delta;
if (static_branch_likely(&psi_disabled))
return;
- if (!task->pid)
+ if (!curr->pid)
+ return;
+
+ lockdep_assert_rq_held(rq);
+ group = task_psi_group(curr);
+ if (prev && task_psi_group(prev) == group)
return;
now = cpu_clock(cpu);
+ irq = irq_time_read(cpu);
+ delta = (s64)(irq - rq->psi_irq_time);
+ if (delta < 0)
+ return;
+ rq->psi_irq_time = irq;
- group = task_psi_group(task);
do {
if (!group->enabled)
continue;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a831af102070..ef20c61004eb 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1126,6 +1126,7 @@ struct rq {
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
+ u64 psi_irq_time;
#endif
#ifdef CONFIG_PARAVIRT
u64 prev_steal_time;
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 38f3698f5e5b..b02dfc322951 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -110,8 +110,12 @@ __schedstats_from_se(struct sched_entity *se)
void psi_task_change(struct task_struct *task, int clear, int set);
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
bool sleep);
-void psi_account_irqtime(struct task_struct *task, u32 delta);
-
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev);
+#else
+static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
+ struct task_struct *prev) {}
+#endif /*CONFIG_IRQ_TIME_ACCOUNTING */
/*
* PSI tracks state that persists across sleeps, such as iowaits and
* memory stalls. As a result, it has to distinguish between sleeps,
@@ -192,7 +196,8 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
static inline void psi_sched_switch(struct task_struct *prev,
struct task_struct *next,
bool sleep) {}
-static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {}
+static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
+ struct task_struct *prev) {}
#endif /* CONFIG_PSI */
#ifdef CONFIG_SCHED_INFO
--
2.45.2.627.g7a2c4fd464-goog
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [PATCH] sched: Move psi_account_irqtime() out of update_rq_clock_task() hotpath
2024-06-18 21:58 [PATCH] sched: Move psi_account_irqtime() out of update_rq_clock_task() hotpath John Stultz
@ 2024-06-19 11:15 ` Qais Yousef
2024-07-01 7:06 ` [tip: sched/urgent] " tip-bot2 for John Stultz
2024-07-01 11:07 ` tip-bot2 for John Stultz
2 siblings, 0 replies; 4+ messages in thread
From: Qais Yousef @ 2024-06-19 11:15 UTC (permalink / raw)
To: John Stultz
Cc: LKML, Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
Daniel Bristot de Oliveira, Valentin Schneider, Johannes Weiner,
Suren Baghdasaryan, Chengming Zhou, Thomas Gleixner,
Frederic Weisbecker, Joel Fernandes, kernel-team, Jimmy Shiu
On 06/18/24 14:58, John Stultz wrote:
> It was reported that in moving to 6.1, a larger then 10%
> regression was seen in the performance of
> clock_gettime(CLOCK_THREAD_CPUTIME_ID,...).
>
> Using a simple reproducer, I found:
> 5.10:
> 100000000 calls in 24345994193 ns => 243.460 ns per call
> 100000000 calls in 24288172050 ns => 242.882 ns per call
> 100000000 calls in 24289135225 ns => 242.891 ns per call
>
> 6.1:
> 100000000 calls in 28248646742 ns => 282.486 ns per call
> 100000000 calls in 28227055067 ns => 282.271 ns per call
> 100000000 calls in 28177471287 ns => 281.775 ns per call
>
> The cause of this was finally narrowed down to the addition of
> psi_account_irqtime() in update_rq_clock_task(), in commit
> 52b1364ba0b1 ("sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ
> pressure").
>
> In my initial attempt to resolve this, I leaned towards moving
> all accounting work out of the clock_gettime() call path, but it
> wasn't very pretty, so it will have to wait for a later deeper
> rework. Instead, Peter shared this approach:
>
> Rework psi_account_irqtime() to use its own psi_irq_time base
> for accounting, and move it out of the hotpath, calling it
> instead from sched_tick() and __schedule().
I think we can be more accurate towards group change if we check in
sched_move_task() if task is running. But as-is doesn't change current behavior
AFAICT.
Reviewed-by: Qais Yousef <qyousef@layalina.io>
>
> In testing this, we found the importance of ensuring
> psi_account_irqtime() is run under the rq_lock, which Johannes
> Weiner helpfully explained, so also add some lockdep annotations
> to make that requirement clear.
>
> With this change the performance is back in-line with 5.10:
> 6.1+fix:
> 100000000 calls in 24297324597 ns => 242.973 ns per call
> 100000000 calls in 24318869234 ns => 243.189 ns per call
> 100000000 calls in 24291564588 ns => 242.916 ns per call
>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Juri Lelli <juri.lelli@redhat.com>
> Cc: Vincent Guittot <vincent.guittot@linaro.org>
> Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
> Cc: Steven Rostedt <rostedt@goodmis.org>
> Cc: Ben Segall <bsegall@google.com>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
> Cc: Valentin Schneider <vschneid@redhat.com>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Suren Baghdasaryan <surenb@google.com>
> Cc: Chengming Zhou <zhouchengming@bytedance.com>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Frederic Weisbecker <frederic@kernel.org>
> Cc: Qais Yousef <qyousef@layalina.io>
> Cc: Joel Fernandes <joel@joelfernandes.org>
> Cc: kernel-team@android.com
> Originally-by: Peter Zijlstra <peterz@infradead.org>
> Reported-by: Jimmy Shiu <jimmyshiu@google.com>
> Signed-off-by: John Stultz <jstultz@google.com>
> ---
> kernel/sched/core.c | 7 +++++--
> kernel/sched/psi.c | 21 ++++++++++++++++-----
> kernel/sched/sched.h | 1 +
> kernel/sched/stats.h | 11 ++++++++---
> 4 files changed, 30 insertions(+), 10 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index bcf2c4cc0522..59ce0841eb1f 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -723,7 +723,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
>
> rq->prev_irq_time += irq_delta;
> delta -= irq_delta;
> - psi_account_irqtime(rq->curr, irq_delta);
> delayacct_irq(rq->curr, irq_delta);
> #endif
> #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
> @@ -5665,7 +5664,7 @@ void sched_tick(void)
> {
> int cpu = smp_processor_id();
> struct rq *rq = cpu_rq(cpu);
> - struct task_struct *curr = rq->curr;
> + struct task_struct *curr;
> struct rq_flags rf;
> unsigned long hw_pressure;
> u64 resched_latency;
> @@ -5677,6 +5676,9 @@ void sched_tick(void)
>
> rq_lock(rq, &rf);
>
> + curr = rq->curr;
> + psi_account_irqtime(rq, curr, NULL);
> +
> update_rq_clock(rq);
> hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
> update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
> @@ -6737,6 +6739,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
> ++*switch_count;
>
> migrate_disable_switch(rq, prev);
> + psi_account_irqtime(rq, prev, next);
> psi_sched_switch(prev, next, !task_on_rq_queued(prev));
>
> trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index 7b4aa5809c0f..507d7b8d79af 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -773,6 +773,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
> enum psi_states s;
> u32 state_mask;
>
> + lockdep_assert_rq_held(cpu_rq(cpu));
> groupc = per_cpu_ptr(group->pcpu, cpu);
>
> /*
> @@ -991,22 +992,32 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
> }
>
> #ifdef CONFIG_IRQ_TIME_ACCOUNTING
> -void psi_account_irqtime(struct task_struct *task, u32 delta)
> +void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev)
> {
> - int cpu = task_cpu(task);
> + int cpu = task_cpu(curr);
> struct psi_group *group;
> struct psi_group_cpu *groupc;
> - u64 now;
> + u64 now, irq;
> + s64 delta;
>
> if (static_branch_likely(&psi_disabled))
> return;
>
> - if (!task->pid)
> + if (!curr->pid)
> + return;
> +
> + lockdep_assert_rq_held(rq);
> + group = task_psi_group(curr);
> + if (prev && task_psi_group(prev) == group)
> return;
>
> now = cpu_clock(cpu);
> + irq = irq_time_read(cpu);
> + delta = (s64)(irq - rq->psi_irq_time);
> + if (delta < 0)
> + return;
> + rq->psi_irq_time = irq;
>
> - group = task_psi_group(task);
> do {
> if (!group->enabled)
> continue;
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index a831af102070..ef20c61004eb 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1126,6 +1126,7 @@ struct rq {
>
> #ifdef CONFIG_IRQ_TIME_ACCOUNTING
> u64 prev_irq_time;
> + u64 psi_irq_time;
> #endif
> #ifdef CONFIG_PARAVIRT
> u64 prev_steal_time;
> diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
> index 38f3698f5e5b..b02dfc322951 100644
> --- a/kernel/sched/stats.h
> +++ b/kernel/sched/stats.h
> @@ -110,8 +110,12 @@ __schedstats_from_se(struct sched_entity *se)
> void psi_task_change(struct task_struct *task, int clear, int set);
> void psi_task_switch(struct task_struct *prev, struct task_struct *next,
> bool sleep);
> -void psi_account_irqtime(struct task_struct *task, u32 delta);
> -
> +#ifdef CONFIG_IRQ_TIME_ACCOUNTING
> +void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev);
> +#else
> +static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
> + struct task_struct *prev) {}
> +#endif /*CONFIG_IRQ_TIME_ACCOUNTING */
> /*
> * PSI tracks state that persists across sleeps, such as iowaits and
> * memory stalls. As a result, it has to distinguish between sleeps,
> @@ -192,7 +196,8 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
> static inline void psi_sched_switch(struct task_struct *prev,
> struct task_struct *next,
> bool sleep) {}
> -static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {}
> +static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
> + struct task_struct *prev) {}
> #endif /* CONFIG_PSI */
>
> #ifdef CONFIG_SCHED_INFO
> --
> 2.45.2.627.g7a2c4fd464-goog
>
^ permalink raw reply [flat|nested] 4+ messages in thread
* [tip: sched/urgent] sched: Move psi_account_irqtime() out of update_rq_clock_task() hotpath
2024-06-18 21:58 [PATCH] sched: Move psi_account_irqtime() out of update_rq_clock_task() hotpath John Stultz
2024-06-19 11:15 ` Qais Yousef
@ 2024-07-01 7:06 ` tip-bot2 for John Stultz
2024-07-01 11:07 ` tip-bot2 for John Stultz
2 siblings, 0 replies; 4+ messages in thread
From: tip-bot2 for John Stultz @ 2024-07-01 7:06 UTC (permalink / raw)
To: linux-tip-commits
Cc: Jimmy Shiu, Peter Zijlstra, John Stultz, Chengming Zhou,
Qais Yousef, x86, linux-kernel
The following commit has been merged into the sched/urgent branch of tip:
Commit-ID: 45655e7bd66c78920b0a579d146aa67788545e3c
Gitweb: https://git.kernel.org/tip/45655e7bd66c78920b0a579d146aa67788545e3c
Author: John Stultz <jstultz@google.com>
AuthorDate: Tue, 18 Jun 2024 14:58:55 -07:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 25 Jun 2024 10:43:42 +02:00
sched: Move psi_account_irqtime() out of update_rq_clock_task() hotpath
It was reported that in moving to 6.1, a larger then 10%
regression was seen in the performance of
clock_gettime(CLOCK_THREAD_CPUTIME_ID,...).
Using a simple reproducer, I found:
5.10:
100000000 calls in 24345994193 ns => 243.460 ns per call
100000000 calls in 24288172050 ns => 242.882 ns per call
100000000 calls in 24289135225 ns => 242.891 ns per call
6.1:
100000000 calls in 28248646742 ns => 282.486 ns per call
100000000 calls in 28227055067 ns => 282.271 ns per call
100000000 calls in 28177471287 ns => 281.775 ns per call
The cause of this was finally narrowed down to the addition of
psi_account_irqtime() in update_rq_clock_task(), in commit
52b1364ba0b1 ("sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ
pressure").
In my initial attempt to resolve this, I leaned towards moving
all accounting work out of the clock_gettime() call path, but it
wasn't very pretty, so it will have to wait for a later deeper
rework. Instead, Peter shared this approach:
Rework psi_account_irqtime() to use its own psi_irq_time base
for accounting, and move it out of the hotpath, calling it
instead from sched_tick() and __schedule().
In testing this, we found the importance of ensuring
psi_account_irqtime() is run under the rq_lock, which Johannes
Weiner helpfully explained, so also add some lockdep annotations
to make that requirement clear.
With this change the performance is back in-line with 5.10:
6.1+fix:
100000000 calls in 24297324597 ns => 242.973 ns per call
100000000 calls in 24318869234 ns => 243.189 ns per call
100000000 calls in 24291564588 ns => 242.916 ns per call
Reported-by: Jimmy Shiu <jimmyshiu@google.com>
Originally-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: John Stultz <jstultz@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev>
Reviewed-by: Qais Yousef <qyousef@layalina.io>
Link: https://lore.kernel.org/r/20240618215909.4099720-1-jstultz@google.com
---
kernel/sched/core.c | 7 +++++--
kernel/sched/psi.c | 21 ++++++++++++++++-----
kernel/sched/sched.h | 1 +
kernel/sched/stats.h | 11 ++++++++---
4 files changed, 30 insertions(+), 10 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bcf2c4c..59ce084 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -723,7 +723,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->prev_irq_time += irq_delta;
delta -= irq_delta;
- psi_account_irqtime(rq->curr, irq_delta);
delayacct_irq(rq->curr, irq_delta);
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
@@ -5665,7 +5664,7 @@ void sched_tick(void)
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
- struct task_struct *curr = rq->curr;
+ struct task_struct *curr;
struct rq_flags rf;
unsigned long hw_pressure;
u64 resched_latency;
@@ -5677,6 +5676,9 @@ void sched_tick(void)
rq_lock(rq, &rf);
+ curr = rq->curr;
+ psi_account_irqtime(rq, curr, NULL);
+
update_rq_clock(rq);
hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
@@ -6737,6 +6739,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
++*switch_count;
migrate_disable_switch(rq, prev);
+ psi_account_irqtime(rq, prev, next);
psi_sched_switch(prev, next, !task_on_rq_queued(prev));
trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 7b4aa58..507d7b8 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -773,6 +773,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
enum psi_states s;
u32 state_mask;
+ lockdep_assert_rq_held(cpu_rq(cpu));
groupc = per_cpu_ptr(group->pcpu, cpu);
/*
@@ -991,22 +992,32 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
}
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-void psi_account_irqtime(struct task_struct *task, u32 delta)
+void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev)
{
- int cpu = task_cpu(task);
+ int cpu = task_cpu(curr);
struct psi_group *group;
struct psi_group_cpu *groupc;
- u64 now;
+ u64 now, irq;
+ s64 delta;
if (static_branch_likely(&psi_disabled))
return;
- if (!task->pid)
+ if (!curr->pid)
+ return;
+
+ lockdep_assert_rq_held(rq);
+ group = task_psi_group(curr);
+ if (prev && task_psi_group(prev) == group)
return;
now = cpu_clock(cpu);
+ irq = irq_time_read(cpu);
+ delta = (s64)(irq - rq->psi_irq_time);
+ if (delta < 0)
+ return;
+ rq->psi_irq_time = irq;
- group = task_psi_group(task);
do {
if (!group->enabled)
continue;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a831af1..ef20c61 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1126,6 +1126,7 @@ struct rq {
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
+ u64 psi_irq_time;
#endif
#ifdef CONFIG_PARAVIRT
u64 prev_steal_time;
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 38f3698..b02dfc3 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -110,8 +110,12 @@ __schedstats_from_se(struct sched_entity *se)
void psi_task_change(struct task_struct *task, int clear, int set);
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
bool sleep);
-void psi_account_irqtime(struct task_struct *task, u32 delta);
-
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev);
+#else
+static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
+ struct task_struct *prev) {}
+#endif /*CONFIG_IRQ_TIME_ACCOUNTING */
/*
* PSI tracks state that persists across sleeps, such as iowaits and
* memory stalls. As a result, it has to distinguish between sleeps,
@@ -192,7 +196,8 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
static inline void psi_sched_switch(struct task_struct *prev,
struct task_struct *next,
bool sleep) {}
-static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {}
+static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
+ struct task_struct *prev) {}
#endif /* CONFIG_PSI */
#ifdef CONFIG_SCHED_INFO
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [tip: sched/urgent] sched: Move psi_account_irqtime() out of update_rq_clock_task() hotpath
2024-06-18 21:58 [PATCH] sched: Move psi_account_irqtime() out of update_rq_clock_task() hotpath John Stultz
2024-06-19 11:15 ` Qais Yousef
2024-07-01 7:06 ` [tip: sched/urgent] " tip-bot2 for John Stultz
@ 2024-07-01 11:07 ` tip-bot2 for John Stultz
2 siblings, 0 replies; 4+ messages in thread
From: tip-bot2 for John Stultz @ 2024-07-01 11:07 UTC (permalink / raw)
To: linux-tip-commits
Cc: Jimmy Shiu, Peter Zijlstra, John Stultz, Chengming Zhou,
Qais Yousef, x86, linux-kernel
The following commit has been merged into the sched/urgent branch of tip:
Commit-ID: ddae0ca2a8fe12d0e24ab10ba759c3fbd755ada8
Gitweb: https://git.kernel.org/tip/ddae0ca2a8fe12d0e24ab10ba759c3fbd755ada8
Author: John Stultz <jstultz@google.com>
AuthorDate: Tue, 18 Jun 2024 14:58:55 -07:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Mon, 01 Jul 2024 13:01:44 +02:00
sched: Move psi_account_irqtime() out of update_rq_clock_task() hotpath
It was reported that in moving to 6.1, a larger then 10%
regression was seen in the performance of
clock_gettime(CLOCK_THREAD_CPUTIME_ID,...).
Using a simple reproducer, I found:
5.10:
100000000 calls in 24345994193 ns => 243.460 ns per call
100000000 calls in 24288172050 ns => 242.882 ns per call
100000000 calls in 24289135225 ns => 242.891 ns per call
6.1:
100000000 calls in 28248646742 ns => 282.486 ns per call
100000000 calls in 28227055067 ns => 282.271 ns per call
100000000 calls in 28177471287 ns => 281.775 ns per call
The cause of this was finally narrowed down to the addition of
psi_account_irqtime() in update_rq_clock_task(), in commit
52b1364ba0b1 ("sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ
pressure").
In my initial attempt to resolve this, I leaned towards moving
all accounting work out of the clock_gettime() call path, but it
wasn't very pretty, so it will have to wait for a later deeper
rework. Instead, Peter shared this approach:
Rework psi_account_irqtime() to use its own psi_irq_time base
for accounting, and move it out of the hotpath, calling it
instead from sched_tick() and __schedule().
In testing this, we found the importance of ensuring
psi_account_irqtime() is run under the rq_lock, which Johannes
Weiner helpfully explained, so also add some lockdep annotations
to make that requirement clear.
With this change the performance is back in-line with 5.10:
6.1+fix:
100000000 calls in 24297324597 ns => 242.973 ns per call
100000000 calls in 24318869234 ns => 243.189 ns per call
100000000 calls in 24291564588 ns => 242.916 ns per call
Reported-by: Jimmy Shiu <jimmyshiu@google.com>
Originally-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: John Stultz <jstultz@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev>
Reviewed-by: Qais Yousef <qyousef@layalina.io>
Link: https://lore.kernel.org/r/20240618215909.4099720-1-jstultz@google.com
---
kernel/sched/core.c | 7 +++++--
kernel/sched/psi.c | 21 ++++++++++++++++-----
kernel/sched/sched.h | 1 +
kernel/sched/stats.h | 11 ++++++++---
4 files changed, 30 insertions(+), 10 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bcf2c4c..59ce084 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -723,7 +723,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->prev_irq_time += irq_delta;
delta -= irq_delta;
- psi_account_irqtime(rq->curr, irq_delta);
delayacct_irq(rq->curr, irq_delta);
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
@@ -5665,7 +5664,7 @@ void sched_tick(void)
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
- struct task_struct *curr = rq->curr;
+ struct task_struct *curr;
struct rq_flags rf;
unsigned long hw_pressure;
u64 resched_latency;
@@ -5677,6 +5676,9 @@ void sched_tick(void)
rq_lock(rq, &rf);
+ curr = rq->curr;
+ psi_account_irqtime(rq, curr, NULL);
+
update_rq_clock(rq);
hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
@@ -6737,6 +6739,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
++*switch_count;
migrate_disable_switch(rq, prev);
+ psi_account_irqtime(rq, prev, next);
psi_sched_switch(prev, next, !task_on_rq_queued(prev));
trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 7b4aa58..507d7b8 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -773,6 +773,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
enum psi_states s;
u32 state_mask;
+ lockdep_assert_rq_held(cpu_rq(cpu));
groupc = per_cpu_ptr(group->pcpu, cpu);
/*
@@ -991,22 +992,32 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
}
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-void psi_account_irqtime(struct task_struct *task, u32 delta)
+void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev)
{
- int cpu = task_cpu(task);
+ int cpu = task_cpu(curr);
struct psi_group *group;
struct psi_group_cpu *groupc;
- u64 now;
+ u64 now, irq;
+ s64 delta;
if (static_branch_likely(&psi_disabled))
return;
- if (!task->pid)
+ if (!curr->pid)
+ return;
+
+ lockdep_assert_rq_held(rq);
+ group = task_psi_group(curr);
+ if (prev && task_psi_group(prev) == group)
return;
now = cpu_clock(cpu);
+ irq = irq_time_read(cpu);
+ delta = (s64)(irq - rq->psi_irq_time);
+ if (delta < 0)
+ return;
+ rq->psi_irq_time = irq;
- group = task_psi_group(task);
do {
if (!group->enabled)
continue;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a831af1..ef20c61 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1126,6 +1126,7 @@ struct rq {
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
+ u64 psi_irq_time;
#endif
#ifdef CONFIG_PARAVIRT
u64 prev_steal_time;
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 38f3698..b02dfc3 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -110,8 +110,12 @@ __schedstats_from_se(struct sched_entity *se)
void psi_task_change(struct task_struct *task, int clear, int set);
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
bool sleep);
-void psi_account_irqtime(struct task_struct *task, u32 delta);
-
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev);
+#else
+static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
+ struct task_struct *prev) {}
+#endif /*CONFIG_IRQ_TIME_ACCOUNTING */
/*
* PSI tracks state that persists across sleeps, such as iowaits and
* memory stalls. As a result, it has to distinguish between sleeps,
@@ -192,7 +196,8 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
static inline void psi_sched_switch(struct task_struct *prev,
struct task_struct *next,
bool sleep) {}
-static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {}
+static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
+ struct task_struct *prev) {}
#endif /* CONFIG_PSI */
#ifdef CONFIG_SCHED_INFO
^ permalink raw reply related [flat|nested] 4+ messages in thread
end of thread, other threads:[~2024-07-01 11:07 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-06-18 21:58 [PATCH] sched: Move psi_account_irqtime() out of update_rq_clock_task() hotpath John Stultz
2024-06-19 11:15 ` Qais Yousef
2024-07-01 7:06 ` [tip: sched/urgent] " tip-bot2 for John Stultz
2024-07-01 11:07 ` tip-bot2 for John Stultz
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox