* [PATCH 1/3] cgroup/rstat: Fix forceidle time in cpu.stat
2025-01-23 17:47 [PATCH 0/3] Fix and cleanup and extend cpu.stat Abel Wu
@ 2025-01-23 17:47 ` Abel Wu
2025-01-24 1:47 ` chenridong
2025-01-24 9:22 ` Michal Koutný
2025-01-23 17:47 ` [PATCH 2/3] cgroup/rstat: Cleanup cpu.stat once for all Abel Wu
2025-01-23 17:47 ` [PATCH 3/3] cgroup/rstat: Add run_delay accounting for cgroups Abel Wu
2 siblings, 2 replies; 14+ messages in thread
From: Abel Wu @ 2025-01-23 17:47 UTC (permalink / raw)
To: Tejun Heo, Johannes Weiner, Michal Koutný, Jonathan Corbet,
Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
Valentin Schneider, Thomas Gleixner, Yury Norov, Abel Wu,
Andrew Morton, Bitao Hu, Chen Ridong
Cc: open list:CONTROL GROUP (CGROUP), open list:DOCUMENTATION,
open list
The commit b824766504e4 ("cgroup/rstat: add force idle show helper")
retrieves forceidle_time outside cgroup_rstat_lock for non-root cgroups
which can be potentially inconsistent with other stats.
Rather than reverting that commit, fix it in a way that retains the
effort of cleaning up the ifdef-messes.
Fixes: b824766504e4 ("cgroup/rstat: add force idle show helper")
Signed-off-by: Abel Wu <wuyun.abel@bytedance.com>
---
kernel/cgroup/rstat.c | 29 +++++++++++++----------------
1 file changed, 13 insertions(+), 16 deletions(-)
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 5877974ece92..c2784c317cdd 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -613,36 +613,33 @@ static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat
void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- u64 usage, utime, stime, ntime;
+ struct cgroup_base_stat bstat;
if (cgroup_parent(cgrp)) {
cgroup_rstat_flush_hold(cgrp);
- usage = cgrp->bstat.cputime.sum_exec_runtime;
+ bstat = cgrp->bstat;
cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
- &utime, &stime);
- ntime = cgrp->bstat.ntime;
+ &bstat.cputime.utime, &bstat.cputime.stime);
cgroup_rstat_flush_release(cgrp);
} else {
- /* cgrp->bstat of root is not actually used, reuse it */
- root_cgroup_cputime(&cgrp->bstat);
- usage = cgrp->bstat.cputime.sum_exec_runtime;
- utime = cgrp->bstat.cputime.utime;
- stime = cgrp->bstat.cputime.stime;
- ntime = cgrp->bstat.ntime;
+ root_cgroup_cputime(&bstat);
}
- do_div(usage, NSEC_PER_USEC);
- do_div(utime, NSEC_PER_USEC);
- do_div(stime, NSEC_PER_USEC);
- do_div(ntime, NSEC_PER_USEC);
+ do_div(bstat.cputime.sum_exec_runtime, NSEC_PER_USEC);
+ do_div(bstat.cputime.utime, NSEC_PER_USEC);
+ do_div(bstat.cputime.stime, NSEC_PER_USEC);
+ do_div(bstat.ntime, NSEC_PER_USEC);
seq_printf(seq, "usage_usec %llu\n"
"user_usec %llu\n"
"system_usec %llu\n"
"nice_usec %llu\n",
- usage, utime, stime, ntime);
+ bstat.cputime.sum_exec_runtime,
+ bstat.cputime.utime,
+ bstat.cputime.stime,
+ bstat.ntime);
- cgroup_force_idle_show(seq, &cgrp->bstat);
+ cgroup_force_idle_show(seq, &bstat);
}
/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
--
2.37.3
^ permalink raw reply related [flat|nested] 14+ messages in thread* Re: [PATCH 1/3] cgroup/rstat: Fix forceidle time in cpu.stat
2025-01-23 17:47 ` [PATCH 1/3] cgroup/rstat: Fix forceidle time in cpu.stat Abel Wu
@ 2025-01-24 1:47 ` chenridong
2025-01-24 7:49 ` Abel Wu
2025-01-24 9:22 ` Michal Koutný
1 sibling, 1 reply; 14+ messages in thread
From: chenridong @ 2025-01-24 1:47 UTC (permalink / raw)
To: Abel Wu, Tejun Heo, Johannes Weiner, Michal Koutný,
Jonathan Corbet, Ingo Molnar, Peter Zijlstra, Juri Lelli,
Vincent Guittot, Dietmar Eggemann, Steven Rostedt, Ben Segall,
Mel Gorman, Valentin Schneider, Thomas Gleixner, Yury Norov,
Andrew Morton, Bitao Hu
Cc: open list:CONTROL GROUP (CGROUP), open list:DOCUMENTATION,
open list
On 2025/1/24 1:47, Abel Wu wrote:
> The commit b824766504e4 ("cgroup/rstat: add force idle show helper")
> retrieves forceidle_time outside cgroup_rstat_lock for non-root cgroups
> which can be potentially inconsistent with other stats.
>
> Rather than reverting that commit, fix it in a way that retains the
> effort of cleaning up the ifdef-messes.
>
> Fixes: b824766504e4 ("cgroup/rstat: add force idle show helper")
> Signed-off-by: Abel Wu <wuyun.abel@bytedance.com>
> ---
> kernel/cgroup/rstat.c | 29 +++++++++++++----------------
> 1 file changed, 13 insertions(+), 16 deletions(-)
>
> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
> index 5877974ece92..c2784c317cdd 100644
> --- a/kernel/cgroup/rstat.c
> +++ b/kernel/cgroup/rstat.c
> @@ -613,36 +613,33 @@ static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat
> void cgroup_base_stat_cputime_show(struct seq_file *seq)
> {
> struct cgroup *cgrp = seq_css(seq)->cgroup;
> - u64 usage, utime, stime, ntime;
> + struct cgroup_base_stat bstat;
>
> if (cgroup_parent(cgrp)) {
> cgroup_rstat_flush_hold(cgrp);
> - usage = cgrp->bstat.cputime.sum_exec_runtime;
> + bstat = cgrp->bstat;
Thank you for finding that.
In my version 2, I used to assign cgrp->bstat to bstat.
This is Tj's comment:
https://lore.kernel.org/linux-kernel/ZoQ2ti7nnz9EJSc3@slm.duckdns.org/
Best regards,
Ridong
> cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
> - &utime, &stime);
> - ntime = cgrp->bstat.ntime;
> + &bstat.cputime.utime, &bstat.cputime.stime);
> cgroup_rstat_flush_release(cgrp);
> } else {
> - /* cgrp->bstat of root is not actually used, reuse it */
> - root_cgroup_cputime(&cgrp->bstat);
> - usage = cgrp->bstat.cputime.sum_exec_runtime;
> - utime = cgrp->bstat.cputime.utime;
> - stime = cgrp->bstat.cputime.stime;
> - ntime = cgrp->bstat.ntime;
> + root_cgroup_cputime(&bstat);
> }
>
> - do_div(usage, NSEC_PER_USEC);
> - do_div(utime, NSEC_PER_USEC);
> - do_div(stime, NSEC_PER_USEC);
> - do_div(ntime, NSEC_PER_USEC);
> + do_div(bstat.cputime.sum_exec_runtime, NSEC_PER_USEC);
> + do_div(bstat.cputime.utime, NSEC_PER_USEC);
> + do_div(bstat.cputime.stime, NSEC_PER_USEC);
> + do_div(bstat.ntime, NSEC_PER_USEC);
>
> seq_printf(seq, "usage_usec %llu\n"
> "user_usec %llu\n"
> "system_usec %llu\n"
> "nice_usec %llu\n",
> - usage, utime, stime, ntime);
> + bstat.cputime.sum_exec_runtime,
> + bstat.cputime.utime,
> + bstat.cputime.stime,
> + bstat.ntime);
>
> - cgroup_force_idle_show(seq, &cgrp->bstat);
> + cgroup_force_idle_show(seq, &bstat);
> }
>
> /* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
^ permalink raw reply [flat|nested] 14+ messages in thread* Re: Re: [PATCH 1/3] cgroup/rstat: Fix forceidle time in cpu.stat
2025-01-24 1:47 ` chenridong
@ 2025-01-24 7:49 ` Abel Wu
2025-01-24 23:49 ` Tejun Heo
0 siblings, 1 reply; 14+ messages in thread
From: Abel Wu @ 2025-01-24 7:49 UTC (permalink / raw)
To: chenridong, Tejun Heo, Johannes Weiner, Michal Koutný,
Jonathan Corbet, Ingo Molnar, Peter Zijlstra, Juri Lelli,
Vincent Guittot, Dietmar Eggemann, Steven Rostedt, Ben Segall,
Mel Gorman, Valentin Schneider, Thomas Gleixner, Yury Norov,
Andrew Morton, Bitao Hu
Cc: open list:CONTROL GROUP (CGROUP), open list:DOCUMENTATION,
open list
On 1/24/25 9:47 AM, chenridong Wrote:
>
>
> On 2025/1/24 1:47, Abel Wu wrote:
>> The commit b824766504e4 ("cgroup/rstat: add force idle show helper")
>> retrieves forceidle_time outside cgroup_rstat_lock for non-root cgroups
>> which can be potentially inconsistent with other stats.
>>
>> Rather than reverting that commit, fix it in a way that retains the
>> effort of cleaning up the ifdef-messes.
>>
>> Fixes: b824766504e4 ("cgroup/rstat: add force idle show helper")
>> Signed-off-by: Abel Wu <wuyun.abel@bytedance.com>
>> ---
>> kernel/cgroup/rstat.c | 29 +++++++++++++----------------
>> 1 file changed, 13 insertions(+), 16 deletions(-)
>>
>> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
>> index 5877974ece92..c2784c317cdd 100644
>> --- a/kernel/cgroup/rstat.c
>> +++ b/kernel/cgroup/rstat.c
>> @@ -613,36 +613,33 @@ static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat
>> void cgroup_base_stat_cputime_show(struct seq_file *seq)
>> {
>> struct cgroup *cgrp = seq_css(seq)->cgroup;
>> - u64 usage, utime, stime, ntime;
>> + struct cgroup_base_stat bstat;
>>
>> if (cgroup_parent(cgrp)) {
>> cgroup_rstat_flush_hold(cgrp);
>> - usage = cgrp->bstat.cputime.sum_exec_runtime;
>> + bstat = cgrp->bstat;
>
> Thank you for finding that.
> In my version 2, I used to assign cgrp->bstat to bstat.
> This is Tj's comment:
> https://lore.kernel.org/linux-kernel/ZoQ2ti7nnz9EJSc3@slm.duckdns.org/
Thanks for pointing out. Using memcpy() is fine for me, but I see
cgroup_base_stat_flush() is using the same pattern, and if we don't
want copy like this, it would be better unify them in a separate
patch.
But IMHO in either way, I don't think reading forceidle time outside
cgroup_rstat_lock is the right thing to do.
Best Regards,
Abel
^ permalink raw reply [flat|nested] 14+ messages in thread* Re: Re: [PATCH 1/3] cgroup/rstat: Fix forceidle time in cpu.stat
2025-01-24 7:49 ` Abel Wu
@ 2025-01-24 23:49 ` Tejun Heo
2025-01-25 4:54 ` Abel Wu
0 siblings, 1 reply; 14+ messages in thread
From: Tejun Heo @ 2025-01-24 23:49 UTC (permalink / raw)
To: Abel Wu
Cc: chenridong, Johannes Weiner, Michal Koutný, Jonathan Corbet,
Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
Valentin Schneider, Thomas Gleixner, Yury Norov, Andrew Morton,
Bitao Hu, open list:CONTROL GROUP (CGROUP),
open list:DOCUMENTATION, open list
On Fri, Jan 24, 2025 at 03:49:16PM +0800, Abel Wu wrote:
...
> > > --- a/kernel/cgroup/rstat.c
> > > +++ b/kernel/cgroup/rstat.c
> > > @@ -613,36 +613,33 @@ static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat
> > > void cgroup_base_stat_cputime_show(struct seq_file *seq)
> > > {
> > > struct cgroup *cgrp = seq_css(seq)->cgroup;
> > > - u64 usage, utime, stime, ntime;
> > > + struct cgroup_base_stat bstat;
> > > if (cgroup_parent(cgrp)) {
> > > cgroup_rstat_flush_hold(cgrp);
> > > - usage = cgrp->bstat.cputime.sum_exec_runtime;
> > > + bstat = cgrp->bstat;
> >
> > Thank you for finding that.
> > In my version 2, I used to assign cgrp->bstat to bstat.
> > This is Tj's comment:
> > https://lore.kernel.org/linux-kernel/ZoQ2ti7nnz9EJSc3@slm.duckdns.org/
I wasn't saying that memcpy() should be used instead of assignment. I was
saying that if a non-trivial struct can be pointed to instead of being
copied, it should be pointed to. If all the fields need to be snapshotted,
assigning is fine.
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 14+ messages in thread* Re: Re: Re: [PATCH 1/3] cgroup/rstat: Fix forceidle time in cpu.stat
2025-01-24 23:49 ` Tejun Heo
@ 2025-01-25 4:54 ` Abel Wu
0 siblings, 0 replies; 14+ messages in thread
From: Abel Wu @ 2025-01-25 4:54 UTC (permalink / raw)
To: Tejun Heo
Cc: chenridong, Johannes Weiner, Michal Koutný, Jonathan Corbet,
Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
Valentin Schneider, Thomas Gleixner, Yury Norov, Andrew Morton,
Bitao Hu, open list:CONTROL GROUP (CGROUP),
open list:DOCUMENTATION, open list
On 1/25/25 7:49 AM, Tejun Heo Wrote:
> On Fri, Jan 24, 2025 at 03:49:16PM +0800, Abel Wu wrote:
> ...
>>>> --- a/kernel/cgroup/rstat.c
>>>> +++ b/kernel/cgroup/rstat.c
>>>> @@ -613,36 +613,33 @@ static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat
>>>> void cgroup_base_stat_cputime_show(struct seq_file *seq)
>>>> {
>>>> struct cgroup *cgrp = seq_css(seq)->cgroup;
>>>> - u64 usage, utime, stime, ntime;
>>>> + struct cgroup_base_stat bstat;
>>>> if (cgroup_parent(cgrp)) {
>>>> cgroup_rstat_flush_hold(cgrp);
>>>> - usage = cgrp->bstat.cputime.sum_exec_runtime;
>>>> + bstat = cgrp->bstat;
>>>
>>> Thank you for finding that.
>>> In my version 2, I used to assign cgrp->bstat to bstat.
>>> This is Tj's comment:
>>> https://lore.kernel.org/linux-kernel/ZoQ2ti7nnz9EJSc3@slm.duckdns.org/
>
> I wasn't saying that memcpy() should be used instead of assignment. I was
> saying that if a non-trivial struct can be pointed to instead of being
> copied, it should be pointed to. If all the fields need to be snapshotted,
> assigning is fine.
Thanks for your clarification, Tejun. I will post v2 to address Michal's
comments.
Best Regards,
Abel
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 1/3] cgroup/rstat: Fix forceidle time in cpu.stat
2025-01-23 17:47 ` [PATCH 1/3] cgroup/rstat: Fix forceidle time in cpu.stat Abel Wu
2025-01-24 1:47 ` chenridong
@ 2025-01-24 9:22 ` Michal Koutný
2025-01-24 9:58 ` Abel Wu
1 sibling, 1 reply; 14+ messages in thread
From: Michal Koutný @ 2025-01-24 9:22 UTC (permalink / raw)
To: Abel Wu
Cc: Tejun Heo, Johannes Weiner, Jonathan Corbet, Ingo Molnar,
Peter Zijlstra, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
Steven Rostedt, Ben Segall, Mel Gorman, Valentin Schneider,
Thomas Gleixner, Yury Norov, Andrew Morton, Bitao Hu, Chen Ridong,
open list:CONTROL GROUP (CGROUP), open list:DOCUMENTATION,
open list
[-- Attachment #1: Type: text/plain, Size: 570 bytes --]
Hello.
On Fri, Jan 24, 2025 at 01:47:01AM +0800, Abel Wu <wuyun.abel@bytedance.com> wrote:
> The commit b824766504e4 ("cgroup/rstat: add force idle show helper")
> retrieves forceidle_time outside cgroup_rstat_lock for non-root cgroups
> which can be potentially inconsistent with other stats.
>
> Rather than reverting that commit, fix it in a way that retains the
> effort of cleaning up the ifdef-messes.
Sorry, I'm blind, where's the change moving wrt cgroup_rstat_lock?
(I only see unuse of root cgroup's bstat and a few renames).
Thanks,
Michal
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]
^ permalink raw reply [flat|nested] 14+ messages in thread* Re: [PATCH 1/3] cgroup/rstat: Fix forceidle time in cpu.stat
2025-01-24 9:22 ` Michal Koutný
@ 2025-01-24 9:58 ` Abel Wu
2025-01-24 10:39 ` Michal Koutný
0 siblings, 1 reply; 14+ messages in thread
From: Abel Wu @ 2025-01-24 9:58 UTC (permalink / raw)
To: Michal Koutný
Cc: Tejun Heo, Johannes Weiner, Jonathan Corbet, Ingo Molnar,
Peter Zijlstra, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
Steven Rostedt, Ben Segall, Mel Gorman, Valentin Schneider,
Thomas Gleixner, Yury Norov, Andrew Morton, Bitao Hu, Chen Ridong,
open list:CONTROL GROUP (CGROUP), open list:DOCUMENTATION,
open list
On 1/24/25 5:22 PM, Michal Koutný Wrote:
> Hello.
>
> On Fri, Jan 24, 2025 at 01:47:01AM +0800, Abel Wu <wuyun.abel@bytedance.com> wrote:
>> The commit b824766504e4 ("cgroup/rstat: add force idle show helper")
>> retrieves forceidle_time outside cgroup_rstat_lock for non-root cgroups
>> which can be potentially inconsistent with other stats.
>>
>> Rather than reverting that commit, fix it in a way that retains the
>> effort of cleaning up the ifdef-messes.
>
> Sorry, I'm blind, where's the change moving wrt cgroup_rstat_lock?
> (I only see unuse of root cgroup's bstat and a few renames).
Hi Michal,
The following hunk deleted the snapshot of cgrp->bstat.forceidle_sum:
if (cgroup_parent(cgrp)) {
cgroup_rstat_flush_hold(cgrp);
usage = cgrp->bstat.cputime.sum_exec_runtime;
cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
&utime, &stime);
-#ifdef CONFIG_SCHED_CORE
- forceidle_time = cgrp->bstat.forceidle_sum;
-#endif
cgroup_rstat_flush_release(cgrp);
} else {
and then read forceidle_sum from @cgrp directly outside of the lock,
but its value can be changed in this window, so...
-#ifdef CONFIG_SCHED_CORE
- seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
-#endif
+ cgroup_force_idle_show(seq, &cgrp->bstat);
}
result in the inconsistence between forceidle and other cputimes.
Best Regards,
Abel
^ permalink raw reply [flat|nested] 14+ messages in thread* Re: [PATCH 1/3] cgroup/rstat: Fix forceidle time in cpu.stat
2025-01-24 9:58 ` Abel Wu
@ 2025-01-24 10:39 ` Michal Koutný
2025-01-24 10:58 ` Abel Wu
0 siblings, 1 reply; 14+ messages in thread
From: Michal Koutný @ 2025-01-24 10:39 UTC (permalink / raw)
To: Abel Wu
Cc: Tejun Heo, Johannes Weiner, Jonathan Corbet, Ingo Molnar,
Peter Zijlstra, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
Steven Rostedt, Ben Segall, Mel Gorman, Valentin Schneider,
Thomas Gleixner, Yury Norov, Andrew Morton, Bitao Hu, Chen Ridong,
open list:CONTROL GROUP (CGROUP), open list:DOCUMENTATION,
open list
[-- Attachment #1: Type: text/plain, Size: 204 bytes --]
On Fri, Jan 24, 2025 at 05:58:26PM +0800, Abel Wu <wuyun.abel@bytedance.com> wrote:
> The following hunk deleted the snapshot of cgrp->bstat.forceidle_sum:
But there's no such hunk in your PATCH 1/3 :-o
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 1/3] cgroup/rstat: Fix forceidle time in cpu.stat
2025-01-24 10:39 ` Michal Koutný
@ 2025-01-24 10:58 ` Abel Wu
0 siblings, 0 replies; 14+ messages in thread
From: Abel Wu @ 2025-01-24 10:58 UTC (permalink / raw)
To: Michal Koutný
Cc: Tejun Heo, Johannes Weiner, Jonathan Corbet, Ingo Molnar,
Peter Zijlstra, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
Steven Rostedt, Ben Segall, Mel Gorman, Valentin Schneider,
Thomas Gleixner, Yury Norov, Andrew Morton, Bitao Hu, Chen Ridong,
open list:CONTROL GROUP (CGROUP), open list:DOCUMENTATION,
open list
On 1/24/25 6:39 PM, Michal Koutný Wrote:
> On Fri, Jan 24, 2025 at 05:58:26PM +0800, Abel Wu <wuyun.abel@bytedance.com> wrote:
>> The following hunk deleted the snapshot of cgrp->bstat.forceidle_sum:
>
> But there's no such hunk in your PATCH 1/3 :-o
if (cgroup_parent(cgrp)) {
cgroup_rstat_flush_hold(cgrp);
- usage = cgrp->bstat.cputime.sum_exec_runtime;
+ bstat = cgrp->bstat;
cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
- &utime, &stime);
- ntime = cgrp->bstat.ntime;
+ &bstat.cputime.utime, &bstat.cputime.stime);
cgroup_rstat_flush_release(cgrp);
} else {
Because @bstat takes a full snapshot of @cgrp->bstat which contains
forceidle time. I think the lack of readability is the reason why
TJ doesn't like this :(
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH 2/3] cgroup/rstat: Cleanup cpu.stat once for all
2025-01-23 17:47 [PATCH 0/3] Fix and cleanup and extend cpu.stat Abel Wu
2025-01-23 17:47 ` [PATCH 1/3] cgroup/rstat: Fix forceidle time in cpu.stat Abel Wu
@ 2025-01-23 17:47 ` Abel Wu
2025-01-24 9:22 ` Michal Koutný
2025-01-23 17:47 ` [PATCH 3/3] cgroup/rstat: Add run_delay accounting for cgroups Abel Wu
2 siblings, 1 reply; 14+ messages in thread
From: Abel Wu @ 2025-01-23 17:47 UTC (permalink / raw)
To: Tejun Heo, Johannes Weiner, Michal Koutný, Jonathan Corbet,
Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
Valentin Schneider, Andrew Morton, Abel Wu, Bitao Hu,
Thomas Gleixner, Yury Norov, Chen Ridong
Cc: open list:CONTROL GROUP (CGROUP), open list:DOCUMENTATION,
open list
There were efforts like b824766504e4 ("cgroup/rstat: add force idle show helper")
to escape from #ifdef hells, and there could be new stats coming out in
the future, let's clean it up once for all.
Signed-off-by: Abel Wu <wuyun.abel@bytedance.com>
---
kernel/cgroup/rstat.c | 46 +++++++++++++++++++++++--------------------
1 file changed, 25 insertions(+), 21 deletions(-)
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index c2784c317cdd..6ad647f3e241 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -599,21 +599,38 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
}
}
+static struct bstat_entry {
+ const char *name;
+ const int offset;
+} bstats[] = {
+#define BSTAT_ENTRY(name, field) \
+ { name, offsetof(struct cgroup_base_stat, field) }
+ BSTAT_ENTRY("usage_usec", cputime.sum_exec_runtime),
+ BSTAT_ENTRY("user_usec", cputime.utime),
+ BSTAT_ENTRY("system_usec", cputime.stime),
+ BSTAT_ENTRY("nice_usec", ntime),
+#ifdef CONFIG_SCHED_CORE
+ BSTAT_ENTRY("core_sched.force_idle_usec", forceidle_sum),
+#endif
+ { NULL } /* must be at end */
+#undef BSTAT_ENTRY
+};
-static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat *bstat)
+static void __append_bstat(struct seq_file *seq, struct cgroup_base_stat *bstat,
+ struct bstat_entry *entry)
{
-#ifdef CONFIG_SCHED_CORE
- u64 forceidle_time = bstat->forceidle_sum;
+ u64 *val;
- do_div(forceidle_time, NSEC_PER_USEC);
- seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
-#endif
+ val = (void *)bstat + entry->offset;
+ do_div(*val, NSEC_PER_USEC);
+ seq_printf(seq, "%s %llu\n", entry->name, *val);
}
void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
struct cgroup_base_stat bstat;
+ struct bstat_entry *e;
if (cgroup_parent(cgrp)) {
cgroup_rstat_flush_hold(cgrp);
@@ -625,21 +642,8 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
root_cgroup_cputime(&bstat);
}
- do_div(bstat.cputime.sum_exec_runtime, NSEC_PER_USEC);
- do_div(bstat.cputime.utime, NSEC_PER_USEC);
- do_div(bstat.cputime.stime, NSEC_PER_USEC);
- do_div(bstat.ntime, NSEC_PER_USEC);
-
- seq_printf(seq, "usage_usec %llu\n"
- "user_usec %llu\n"
- "system_usec %llu\n"
- "nice_usec %llu\n",
- bstat.cputime.sum_exec_runtime,
- bstat.cputime.utime,
- bstat.cputime.stime,
- bstat.ntime);
-
- cgroup_force_idle_show(seq, &bstat);
+ for (e = bstats; e->name; e++)
+ __append_bstat(seq, &bstat, e);
}
/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
--
2.37.3
^ permalink raw reply related [flat|nested] 14+ messages in thread* Re: [PATCH 2/3] cgroup/rstat: Cleanup cpu.stat once for all
2025-01-23 17:47 ` [PATCH 2/3] cgroup/rstat: Cleanup cpu.stat once for all Abel Wu
@ 2025-01-24 9:22 ` Michal Koutný
2025-01-24 10:00 ` Abel Wu
0 siblings, 1 reply; 14+ messages in thread
From: Michal Koutný @ 2025-01-24 9:22 UTC (permalink / raw)
To: Abel Wu
Cc: Tejun Heo, Johannes Weiner, Jonathan Corbet, Ingo Molnar,
Peter Zijlstra, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
Steven Rostedt, Ben Segall, Mel Gorman, Valentin Schneider,
Andrew Morton, Bitao Hu, Thomas Gleixner, Yury Norov, Chen Ridong,
open list:CONTROL GROUP (CGROUP), open list:DOCUMENTATION,
open list
[-- Attachment #1: Type: text/plain, Size: 459 bytes --]
On Fri, Jan 24, 2025 at 01:47:02AM +0800, Abel Wu <wuyun.abel@bytedance.com> wrote:
> -static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat *bstat)
> +static void __append_bstat(struct seq_file *seq, struct cgroup_base_stat *bstat,
> + struct bstat_entry *entry)
Not sure if starting with double underscore is needed when the helper is
`static`. Also something like s/append/show/ -> cgroup_bstat_entry_show.
Thanks,
Michal
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 2/3] cgroup/rstat: Cleanup cpu.stat once for all
2025-01-24 9:22 ` Michal Koutný
@ 2025-01-24 10:00 ` Abel Wu
0 siblings, 0 replies; 14+ messages in thread
From: Abel Wu @ 2025-01-24 10:00 UTC (permalink / raw)
To: Michal Koutný
Cc: Tejun Heo, Johannes Weiner, Jonathan Corbet, Ingo Molnar,
Peter Zijlstra, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
Steven Rostedt, Ben Segall, Mel Gorman, Valentin Schneider,
Andrew Morton, Bitao Hu, Thomas Gleixner, Yury Norov, Chen Ridong,
open list:CONTROL GROUP (CGROUP), open list:DOCUMENTATION,
open list
On 1/24/25 5:22 PM, Michal Koutný Wrote:
> On Fri, Jan 24, 2025 at 01:47:02AM +0800, Abel Wu <wuyun.abel@bytedance.com> wrote:
>> -static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat *bstat)
>> +static void __append_bstat(struct seq_file *seq, struct cgroup_base_stat *bstat,
>> + struct bstat_entry *entry)
>
> Not sure if starting with double underscore is needed when the helper is
> `static`. Also something like s/append/show/ -> cgroup_bstat_entry_show.
Yeah, "cgroup_bstat_entry_show" looks better! Will fix.
Thanks,
Abel
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH 3/3] cgroup/rstat: Add run_delay accounting for cgroups
2025-01-23 17:47 [PATCH 0/3] Fix and cleanup and extend cpu.stat Abel Wu
2025-01-23 17:47 ` [PATCH 1/3] cgroup/rstat: Fix forceidle time in cpu.stat Abel Wu
2025-01-23 17:47 ` [PATCH 2/3] cgroup/rstat: Cleanup cpu.stat once for all Abel Wu
@ 2025-01-23 17:47 ` Abel Wu
2 siblings, 0 replies; 14+ messages in thread
From: Abel Wu @ 2025-01-23 17:47 UTC (permalink / raw)
To: Tejun Heo, Johannes Weiner, Michal Koutný, Jonathan Corbet,
Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
Valentin Schneider, Yury Norov, Thomas Gleixner, Bitao Hu,
Andrew Morton, Abel Wu, Chen Ridong
Cc: open list:CONTROL GROUP (CGROUP), open list:DOCUMENTATION,
open list
The per-task and per-cpu accounting have already been tracked by
t->sched_info.run_delay and rq->rq_sched_info.run_delay respectively.
Extends this to also include cgroups.
The PSI indicator, "some" of cpu.pressure, loses the insight into how
severely that cgroup is stalled. Say 100 tasks or just 1 task that gets
stalled at a certain point will show no difference in "some" pressure.
IOW "some" is a flat value that not weighted by the severity (e.g. # of
tasks).
Only cgroup v2 is supported. Similar to the task accounting, the cgroup
accounting requires that CONFIG_SCHED_INFO is enabled.
Signed-off-by: Abel Wu <wuyun.abel@bytedance.com>
---
Documentation/admin-guide/cgroup-v2.rst | 1 +
include/linux/cgroup-defs.h | 3 +++
include/linux/kernel_stat.h | 14 ++++++++++++++
kernel/cgroup/rstat.c | 17 +++++++++++++++++
kernel/sched/cputime.c | 12 ++++++++++++
kernel/sched/stats.h | 2 ++
6 files changed, 49 insertions(+)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 315ede811c9d..440c3800c49c 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1100,6 +1100,7 @@ All time durations are in microseconds.
- usage_usec
- user_usec
- system_usec
+ - run_delay_usec (requires CONFIG_SCHED_INFO)
and the following five when the controller is enabled:
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 1b20d2d8ef7c..287366e60414 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -328,6 +328,9 @@ struct cgroup_base_stat {
u64 forceidle_sum;
#endif
u64 ntime;
+#ifdef CONFIG_SCHED_INFO
+ u64 run_delay;
+#endif
};
/*
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index b97ce2df376f..256b1a55de62 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -31,6 +31,15 @@ enum cpu_usage_stat {
CPUTIME_FORCEIDLE,
#endif
NR_STATS,
+
+#ifdef CONFIG_SCHED_INFO
+ /*
+ * Instead of cputime, run_delay is tracked through
+ * sched_info by task and rq, so there is no need to
+ * enlarge the cpustat[] array.
+ */
+ CPUTIME_RUN_DELAY,
+#endif
};
struct kernel_cpustat {
@@ -141,4 +150,9 @@ extern void account_idle_ticks(unsigned long ticks);
extern void __account_forceidle_time(struct task_struct *tsk, u64 delta);
#endif
+#ifdef CONFIG_SCHED_INFO
+extern void account_run_delay_time(struct task_struct *tsk, u64 delta);
+extern u64 get_cpu_run_delay(int cpu);
+#endif
+
#endif /* _LINUX_KERNEL_STAT_H */
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 6ad647f3e241..6a365a7fb7b8 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -445,6 +445,9 @@ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
#endif
dst_bstat->ntime += src_bstat->ntime;
+#ifdef CONFIG_SCHED_INFO
+ dst_bstat->run_delay += src_bstat->run_delay;
+#endif
}
static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
@@ -457,6 +460,9 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
#endif
dst_bstat->ntime -= src_bstat->ntime;
+#ifdef CONFIG_SCHED_INFO
+ dst_bstat->run_delay -= src_bstat->run_delay;
+#endif
}
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
@@ -551,6 +557,11 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
case CPUTIME_FORCEIDLE:
rstatc->bstat.forceidle_sum += delta_exec;
break;
+#endif
+#ifdef CONFIG_SCHED_INFO
+ case CPUTIME_RUN_DELAY:
+ rstatc->bstat.run_delay += delta_exec;
+ break;
#endif
default:
break;
@@ -596,6 +607,9 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
#endif
bstat->ntime += cpustat[CPUTIME_NICE];
+#ifdef CONFIG_SCHED_INFO
+ bstat->run_delay += get_cpu_run_delay(i);
+#endif
}
}
@@ -611,6 +625,9 @@ static struct bstat_entry {
BSTAT_ENTRY("nice_usec", ntime),
#ifdef CONFIG_SCHED_CORE
BSTAT_ENTRY("core_sched.force_idle_usec", forceidle_sum),
+#endif
+#ifdef CONFIG_SCHED_INFO
+ BSTAT_ENTRY("run_delay_usec", run_delay),
#endif
{ NULL } /* must be at end */
#undef BSTAT_ENTRY
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 5d9143dd0879..e6be57cdb54e 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -243,6 +243,18 @@ void __account_forceidle_time(struct task_struct *p, u64 delta)
}
#endif
+#ifdef CONFIG_SCHED_INFO
+void account_run_delay_time(struct task_struct *p, u64 delta)
+{
+ cgroup_account_cputime_field(p, CPUTIME_RUN_DELAY, delta);
+}
+
+u64 get_cpu_run_delay(int cpu)
+{
+ return cpu_rq(cpu)->rq_sched_info.run_delay;
+}
+#endif
+
/*
* When a guest is interrupted for a longer amount of time, missed clock
* ticks are not redelivered later. Due to that, this function may on
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 6ade91bce63e..b21a2c4b9c54 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -249,6 +249,7 @@ static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t)
t->sched_info.last_queued = 0;
t->sched_info.run_delay += delta;
+ account_run_delay_time(t, delta);
rq_sched_info_dequeue(rq, delta);
}
@@ -271,6 +272,7 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
t->sched_info.last_arrival = now;
t->sched_info.pcount++;
+ account_run_delay_time(t, delta);
rq_sched_info_arrive(rq, delta);
}
--
2.37.3
^ permalink raw reply related [flat|nested] 14+ messages in thread