* [PATCH] perf trace: Support --summary-mode=cgroup
@ 2025-05-01 22:53 Namhyung Kim
2025-05-13 21:15 ` Namhyung Kim
2025-05-14 20:07 ` Howard Chu
0 siblings, 2 replies; 4+ messages in thread
From: Namhyung Kim @ 2025-05-01 22:53 UTC (permalink / raw)
To: Arnaldo Carvalho de Melo, Ian Rogers, Kan Liang
Cc: Jiri Olsa, Adrian Hunter, Peter Zijlstra, Ingo Molnar, LKML,
linux-perf-users, Song Liu, bpf, Howard Chu
Add a new summary mode to collect stats for each cgroup.
$ sudo ./perf trace -as --bpf-summary --summary-mode=cgroup -- sleep 1
Summary of events:
cgroup /user.slice/user-657345.slice/user@657345.service/session.slice/org.gnome.Shell@x11.service, 535 events
syscall calls errors total min avg max stddev
(msec) (msec) (msec) (msec) (%)
--------------- -------- ------ -------- --------- --------- --------- ------
ppoll 15 0 373.600 0.004 24.907 197.491 55.26%
poll 15 0 1.325 0.001 0.088 0.369 38.76%
close 66 0 0.567 0.007 0.009 0.026 3.55%
write 150 0 0.471 0.001 0.003 0.010 3.29%
recvmsg 94 83 0.290 0.000 0.003 0.037 16.39%
ioctl 26 0 0.237 0.001 0.009 0.096 50.13%
timerfd_create 66 0 0.236 0.003 0.004 0.024 8.92%
timerfd_settime 70 0 0.160 0.001 0.002 0.012 7.66%
writev 10 0 0.118 0.001 0.012 0.019 18.17%
read 9 0 0.021 0.001 0.002 0.004 14.07%
getpid 14 0 0.019 0.000 0.001 0.004 20.28%
cgroup /system.slice/polkit.service, 94 events
syscall calls errors total min avg max stddev
(msec) (msec) (msec) (msec) (%)
--------------- -------- ------ -------- --------- --------- --------- ------
ppoll 22 0 19.811 0.000 0.900 9.273 63.88%
write 30 0 0.040 0.001 0.001 0.003 12.09%
recvmsg 12 0 0.018 0.001 0.002 0.006 28.15%
read 18 0 0.013 0.000 0.001 0.003 21.99%
poll 12 0 0.006 0.000 0.001 0.001 4.48%
cgroup /user.slice/user-657345.slice/user@657345.service/app.slice/app-org.gnome.Terminal.slice/gnome-terminal-server.service, 21 events
syscall calls errors total min avg max stddev
(msec) (msec) (msec) (msec) (%)
--------------- -------- ------ -------- --------- --------- --------- ------
ppoll 4 0 17.476 0.003 4.369 13.298 69.65%
recvmsg 15 12 0.068 0.002 0.005 0.014 26.53%
writev 1 0 0.033 0.033 0.033 0.033 0.00%
poll 1 0 0.005 0.005 0.005 0.005 0.00%
...
It works only for --bpf-summary for now.
Cc: Howard Chu <howardchu95@gmail.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
tools/perf/Documentation/perf-trace.txt | 3 +-
tools/perf/builtin-trace.c | 10 +-
tools/perf/util/bpf-trace-summary.c | 123 +++++++++++++++++-
.../perf/util/bpf_skel/syscall_summary.bpf.c | 43 +++++-
tools/perf/util/bpf_skel/syscall_summary.h | 2 +
tools/perf/util/trace.h | 1 +
6 files changed, 170 insertions(+), 12 deletions(-)
diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
index a8a0d8c33438fef7..c1fb6056a0d36dda 100644
--- a/tools/perf/Documentation/perf-trace.txt
+++ b/tools/perf/Documentation/perf-trace.txt
@@ -152,7 +152,8 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
--summary-mode=mode::
To be used with -s or -S, to select how to show summary. By default it'll
- show the syscall summary by thread. Possible values are: thread, total.
+ show the syscall summary by thread. Possible values are: thread, total,
+ cgroup.
--tool_stats::
Show tool stats such as number of times fd->pathname was discovered thru
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index b2c5a9b765ab5d33..83c62c30d914306c 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -5301,6 +5301,8 @@ static int trace__parse_summary_mode(const struct option *opt, const char *str,
trace->summary_mode = SUMMARY__BY_THREAD;
} else if (!strcmp(str, "total")) {
trace->summary_mode = SUMMARY__BY_TOTAL;
+ } else if (!strcmp(str, "cgroup")) {
+ trace->summary_mode = SUMMARY__BY_CGROUP;
} else {
pr_err("Unknown summary mode: %s\n", str);
return -1;
@@ -5460,7 +5462,7 @@ int cmd_trace(int argc, const char **argv)
OPT_BOOLEAN(0, "errno-summary", &trace.errno_summary,
"Show errno stats per syscall, use with -s or -S"),
OPT_CALLBACK(0, "summary-mode", &trace, "mode",
- "How to show summary: select thread (default) or total",
+ "How to show summary: select thread (default), total or cgroup",
trace__parse_summary_mode),
OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
"Trace pagefaults", parse_pagefaults, "maj"),
@@ -5774,6 +5776,12 @@ int cmd_trace(int argc, const char **argv)
symbol_conf.keep_exited_threads = true;
if (trace.summary_mode == SUMMARY__NONE)
trace.summary_mode = SUMMARY__BY_THREAD;
+
+ if (!trace.summary_bpf && trace.summary_mode == SUMMARY__BY_CGROUP) {
+ pr_err("Error: --summary-mode=cgroup only works with --bpf-summary\n");
+ err = -EINVAL;
+ goto out;
+ }
}
if (output_name != NULL) {
diff --git a/tools/perf/util/bpf-trace-summary.c b/tools/perf/util/bpf-trace-summary.c
index 114d8d9ed9b2d3f3..69fb165da206b01f 100644
--- a/tools/perf/util/bpf-trace-summary.c
+++ b/tools/perf/util/bpf-trace-summary.c
@@ -6,10 +6,12 @@
#include "dwarf-regs.h" /* for EM_HOST */
#include "syscalltbl.h"
+#include "util/cgroup.h"
#include "util/hashmap.h"
#include "util/trace.h"
#include "util/util.h"
#include <bpf/bpf.h>
+#include <linux/rbtree.h>
#include <linux/time64.h>
#include <tools/libc_compat.h> /* reallocarray */
@@ -18,6 +20,7 @@
static struct syscall_summary_bpf *skel;
+static struct rb_root cgroups = RB_ROOT;
int trace_prepare_bpf_summary(enum trace_summary_mode mode)
{
@@ -29,9 +32,14 @@ int trace_prepare_bpf_summary(enum trace_summary_mode mode)
if (mode == SUMMARY__BY_THREAD)
skel->rodata->aggr_mode = SYSCALL_AGGR_THREAD;
+ else if (mode == SUMMARY__BY_CGROUP)
+ skel->rodata->aggr_mode = SYSCALL_AGGR_CGROUP;
else
skel->rodata->aggr_mode = SYSCALL_AGGR_CPU;
+ if (cgroup_is_v2("perf_event") > 0)
+ skel->rodata->use_cgroup_v2 = 1;
+
if (syscall_summary_bpf__load(skel) < 0) {
fprintf(stderr, "failed to load syscall summary bpf skeleton\n");
return -1;
@@ -42,6 +50,9 @@ int trace_prepare_bpf_summary(enum trace_summary_mode mode)
return -1;
}
+ if (mode == SUMMARY__BY_CGROUP)
+ read_all_cgroups(&cgroups);
+
return 0;
}
@@ -88,9 +99,13 @@ static double rel_stddev(struct syscall_stats *stat)
* per-cpu analysis so it's keyed by the syscall number to combine stats
* from different CPUs. And syscall_data always has a syscall_node so
* it can effectively work as flat hierarchy.
+ *
+ * For per-cgroup stats, it uses two-level data structure like thread
+ * syscall_data is keyed by CGROUP and has an array of node which
+ * represents each syscall for the cgroup.
*/
struct syscall_data {
- int key; /* tid if AGGR_THREAD, syscall-nr if AGGR_CPU */
+ u64 key; /* tid if AGGR_THREAD, syscall-nr if AGGR_CPU, cgroup if AGGR_CGROUP */
int nr_events;
int nr_nodes;
u64 total_time;
@@ -191,7 +206,7 @@ static int print_thread_stat(struct syscall_data *data, FILE *fp)
qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp);
- printed += fprintf(fp, " thread (%d), ", data->key);
+ printed += fprintf(fp, " thread (%d), ", (int)data->key);
printed += fprintf(fp, "%d events\n\n", data->nr_events);
printed += fprintf(fp, " syscall calls errors total min avg max stddev\n");
@@ -283,6 +298,75 @@ static int print_total_stats(struct syscall_data **data, int nr_data, FILE *fp)
return printed;
}
+static int update_cgroup_stats(struct hashmap *hash, struct syscall_key *map_key,
+ struct syscall_stats *map_data)
+{
+ struct syscall_data *data;
+ struct syscall_node *nodes;
+
+ if (!hashmap__find(hash, map_key->cgroup, &data)) {
+ data = zalloc(sizeof(*data));
+ if (data == NULL)
+ return -ENOMEM;
+
+ data->key = map_key->cgroup;
+ if (hashmap__add(hash, data->key, data) < 0) {
+ free(data);
+ return -ENOMEM;
+ }
+ }
+
+ /* update thread total stats */
+ data->nr_events += map_data->count;
+ data->total_time += map_data->total_time;
+
+ nodes = reallocarray(data->nodes, data->nr_nodes + 1, sizeof(*nodes));
+ if (nodes == NULL)
+ return -ENOMEM;
+
+ data->nodes = nodes;
+ nodes = &data->nodes[data->nr_nodes++];
+ nodes->syscall_nr = map_key->nr;
+
+ /* each thread has an entry for each syscall, just use the stat */
+ memcpy(&nodes->stats, map_data, sizeof(*map_data));
+ return 0;
+}
+
+static int print_cgroup_stat(struct syscall_data *data, FILE *fp)
+{
+ int printed = 0;
+ struct cgroup *cgrp = __cgroup__find(&cgroups, data->key);
+
+ qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp);
+
+ if (cgrp)
+ printed += fprintf(fp, " cgroup %s,", cgrp->name);
+ else
+ printed += fprintf(fp, " cgroup id:%lu,", (unsigned long)data->key);
+
+ printed += fprintf(fp, " %d events\n\n", data->nr_events);
+
+ printed += fprintf(fp, " syscall calls errors total min avg max stddev\n");
+ printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
+ printed += fprintf(fp, " --------------- -------- ------ -------- --------- --------- --------- ------\n");
+
+ printed += print_common_stats(data, fp);
+ printed += fprintf(fp, "\n\n");
+
+ return printed;
+}
+
+static int print_cgroup_stats(struct syscall_data **data, int nr_data, FILE *fp)
+{
+ int printed = 0;
+
+ for (int i = 0; i < nr_data; i++)
+ printed += print_cgroup_stat(data[i], fp);
+
+ return printed;
+}
+
int trace_print_bpf_summary(FILE *fp)
{
struct bpf_map *map = skel->maps.syscall_stats_map;
@@ -305,10 +389,19 @@ int trace_print_bpf_summary(FILE *fp)
struct syscall_stats stat;
if (!bpf_map__lookup_elem(map, &key, sizeof(key), &stat, sizeof(stat), 0)) {
- if (skel->rodata->aggr_mode == SYSCALL_AGGR_THREAD)
+ switch (skel->rodata->aggr_mode) {
+ case SYSCALL_AGGR_THREAD:
update_thread_stats(&schash, &key, &stat);
- else
+ break;
+ case SYSCALL_AGGR_CPU:
update_total_stats(&schash, &key, &stat);
+ break;
+ case SYSCALL_AGGR_CGROUP:
+ update_cgroup_stats(&schash, &key, &stat);
+ break;
+ default:
+ break;
+ }
}
prev_key = &key;
@@ -325,10 +418,19 @@ int trace_print_bpf_summary(FILE *fp)
qsort(data, nr_data, sizeof(*data), datacmp);
- if (skel->rodata->aggr_mode == SYSCALL_AGGR_THREAD)
+ switch (skel->rodata->aggr_mode) {
+ case SYSCALL_AGGR_THREAD:
printed += print_thread_stats(data, nr_data, fp);
- else
+ break;
+ case SYSCALL_AGGR_CPU:
printed += print_total_stats(data, nr_data, fp);
+ break;
+ case SYSCALL_AGGR_CGROUP:
+ printed += print_cgroup_stats(data, nr_data, fp);
+ break;
+ default:
+ break;
+ }
for (i = 0; i < nr_data && data; i++) {
free(data[i]->nodes);
@@ -343,5 +445,14 @@ int trace_print_bpf_summary(FILE *fp)
void trace_cleanup_bpf_summary(void)
{
+ if (!RB_EMPTY_ROOT(&cgroups)) {
+ struct cgroup *cgrp, *tmp;
+
+ rbtree_postorder_for_each_entry_safe(cgrp, tmp, &cgroups, node)
+ cgroup__put(cgrp);
+
+ cgroups = RB_ROOT;
+ }
+
syscall_summary_bpf__destroy(skel);
}
diff --git a/tools/perf/util/bpf_skel/syscall_summary.bpf.c b/tools/perf/util/bpf_skel/syscall_summary.bpf.c
index b25f53b3c1351392..1bcd066a5199a476 100644
--- a/tools/perf/util/bpf_skel/syscall_summary.bpf.c
+++ b/tools/perf/util/bpf_skel/syscall_summary.bpf.c
@@ -8,6 +8,7 @@
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
/* This is to calculate a delta between sys-enter and sys-exit for each thread */
struct syscall_trace {
@@ -35,10 +36,41 @@ struct syscall_stats_map {
int enabled; /* controlled from userspace */
const volatile enum syscall_aggr_mode aggr_mode;
+const volatile int use_cgroup_v2;
-static void update_stats(int cpu_or_tid, int nr, s64 duration, long ret)
+int perf_subsys_id = -1;
+
+static inline __u64 get_current_cgroup_id(void)
+{
+ struct task_struct *task;
+ struct cgroup *cgrp;
+
+ if (use_cgroup_v2)
+ return bpf_get_current_cgroup_id();
+
+ task = bpf_get_current_task_btf();
+
+ if (perf_subsys_id == -1) {
+#if __has_builtin(__builtin_preserve_enum_value)
+ perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
+ perf_event_cgrp_id);
+#else
+ perf_subsys_id = perf_event_cgrp_id;
+#endif
+ }
+
+ cgrp = BPF_CORE_READ(task, cgroups, subsys[perf_subsys_id], cgroup);
+ return BPF_CORE_READ(cgrp, kn, id);
+}
+
+static void update_stats(int cpu_or_tid, u64 cgroup_id, int nr, s64 duration,
+ long ret)
{
- struct syscall_key key = { .cpu_or_tid = cpu_or_tid, .nr = nr, };
+ struct syscall_key key = {
+ .cpu_or_tid = cpu_or_tid,
+ .cgroup = cgroup_id,
+ .nr = nr,
+ };
struct syscall_stats *stats;
stats = bpf_map_lookup_elem(&syscall_stats_map, &key);
@@ -90,7 +122,8 @@ SEC("tp_btf/sys_exit")
int sys_exit(u64 *ctx)
{
int tid;
- int key;
+ int key = 0;
+ u64 cgroup = 0;
long ret = ctx[1]; /* return value of the syscall */
struct syscall_trace *st;
s64 delta;
@@ -105,11 +138,13 @@ int sys_exit(u64 *ctx)
if (aggr_mode == SYSCALL_AGGR_THREAD)
key = tid;
+ else if (aggr_mode == SYSCALL_AGGR_CGROUP)
+ cgroup = get_current_cgroup_id();
else
key = bpf_get_smp_processor_id();
delta = bpf_ktime_get_ns() - st->timestamp;
- update_stats(key, st->nr, delta, ret);
+ update_stats(key, cgroup, st->nr, delta, ret);
bpf_map_delete_elem(&syscall_trace_map, &tid);
return 0;
diff --git a/tools/perf/util/bpf_skel/syscall_summary.h b/tools/perf/util/bpf_skel/syscall_summary.h
index 17f9ecba657088aa..72ccccb45925cd10 100644
--- a/tools/perf/util/bpf_skel/syscall_summary.h
+++ b/tools/perf/util/bpf_skel/syscall_summary.h
@@ -6,9 +6,11 @@
enum syscall_aggr_mode {
SYSCALL_AGGR_THREAD,
SYSCALL_AGGR_CPU,
+ SYSCALL_AGGR_CGROUP,
};
struct syscall_key {
+ u64 cgroup;
int cpu_or_tid;
int nr;
};
diff --git a/tools/perf/util/trace.h b/tools/perf/util/trace.h
index ef8361ed12c4edc1..fa8d480527a22cef 100644
--- a/tools/perf/util/trace.h
+++ b/tools/perf/util/trace.h
@@ -8,6 +8,7 @@ enum trace_summary_mode {
SUMMARY__NONE = 0,
SUMMARY__BY_TOTAL,
SUMMARY__BY_THREAD,
+ SUMMARY__BY_CGROUP,
};
#ifdef HAVE_BPF_SKEL
--
2.49.0.906.g1f30a19c02-goog
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [PATCH] perf trace: Support --summary-mode=cgroup
2025-05-01 22:53 [PATCH] perf trace: Support --summary-mode=cgroup Namhyung Kim
@ 2025-05-13 21:15 ` Namhyung Kim
2025-05-13 21:23 ` Arnaldo Carvalho de Melo
2025-05-14 20:07 ` Howard Chu
1 sibling, 1 reply; 4+ messages in thread
From: Namhyung Kim @ 2025-05-13 21:15 UTC (permalink / raw)
To: Arnaldo Carvalho de Melo, Ian Rogers, Kan Liang
Cc: Jiri Olsa, Adrian Hunter, Peter Zijlstra, Ingo Molnar, LKML,
linux-perf-users, Song Liu, bpf, Howard Chu
Ping!
On Thu, May 01, 2025 at 03:53:37PM -0700, Namhyung Kim wrote:
> Add a new summary mode to collect stats for each cgroup.
>
> $ sudo ./perf trace -as --bpf-summary --summary-mode=cgroup -- sleep 1
>
> Summary of events:
>
> cgroup /user.slice/user-657345.slice/user@657345.service/session.slice/org.gnome.Shell@x11.service, 535 events
>
> syscall calls errors total min avg max stddev
> (msec) (msec) (msec) (msec) (%)
> --------------- -------- ------ -------- --------- --------- --------- ------
> ppoll 15 0 373.600 0.004 24.907 197.491 55.26%
> poll 15 0 1.325 0.001 0.088 0.369 38.76%
> close 66 0 0.567 0.007 0.009 0.026 3.55%
> write 150 0 0.471 0.001 0.003 0.010 3.29%
> recvmsg 94 83 0.290 0.000 0.003 0.037 16.39%
> ioctl 26 0 0.237 0.001 0.009 0.096 50.13%
> timerfd_create 66 0 0.236 0.003 0.004 0.024 8.92%
> timerfd_settime 70 0 0.160 0.001 0.002 0.012 7.66%
> writev 10 0 0.118 0.001 0.012 0.019 18.17%
> read 9 0 0.021 0.001 0.002 0.004 14.07%
> getpid 14 0 0.019 0.000 0.001 0.004 20.28%
>
> cgroup /system.slice/polkit.service, 94 events
>
> syscall calls errors total min avg max stddev
> (msec) (msec) (msec) (msec) (%)
> --------------- -------- ------ -------- --------- --------- --------- ------
> ppoll 22 0 19.811 0.000 0.900 9.273 63.88%
> write 30 0 0.040 0.001 0.001 0.003 12.09%
> recvmsg 12 0 0.018 0.001 0.002 0.006 28.15%
> read 18 0 0.013 0.000 0.001 0.003 21.99%
> poll 12 0 0.006 0.000 0.001 0.001 4.48%
>
> cgroup /user.slice/user-657345.slice/user@657345.service/app.slice/app-org.gnome.Terminal.slice/gnome-terminal-server.service, 21 events
>
> syscall calls errors total min avg max stddev
> (msec) (msec) (msec) (msec) (%)
> --------------- -------- ------ -------- --------- --------- --------- ------
> ppoll 4 0 17.476 0.003 4.369 13.298 69.65%
> recvmsg 15 12 0.068 0.002 0.005 0.014 26.53%
> writev 1 0 0.033 0.033 0.033 0.033 0.00%
> poll 1 0 0.005 0.005 0.005 0.005 0.00%
>
> ...
>
> It works only for --bpf-summary for now.
>
> Cc: Howard Chu <howardchu95@gmail.com>
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> ---
> tools/perf/Documentation/perf-trace.txt | 3 +-
> tools/perf/builtin-trace.c | 10 +-
> tools/perf/util/bpf-trace-summary.c | 123 +++++++++++++++++-
> .../perf/util/bpf_skel/syscall_summary.bpf.c | 43 +++++-
> tools/perf/util/bpf_skel/syscall_summary.h | 2 +
> tools/perf/util/trace.h | 1 +
> 6 files changed, 170 insertions(+), 12 deletions(-)
>
> diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
> index a8a0d8c33438fef7..c1fb6056a0d36dda 100644
> --- a/tools/perf/Documentation/perf-trace.txt
> +++ b/tools/perf/Documentation/perf-trace.txt
> @@ -152,7 +152,8 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
>
> --summary-mode=mode::
> To be used with -s or -S, to select how to show summary. By default it'll
> - show the syscall summary by thread. Possible values are: thread, total.
> + show the syscall summary by thread. Possible values are: thread, total,
> + cgroup.
>
> --tool_stats::
> Show tool stats such as number of times fd->pathname was discovered thru
> diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
> index b2c5a9b765ab5d33..83c62c30d914306c 100644
> --- a/tools/perf/builtin-trace.c
> +++ b/tools/perf/builtin-trace.c
> @@ -5301,6 +5301,8 @@ static int trace__parse_summary_mode(const struct option *opt, const char *str,
> trace->summary_mode = SUMMARY__BY_THREAD;
> } else if (!strcmp(str, "total")) {
> trace->summary_mode = SUMMARY__BY_TOTAL;
> + } else if (!strcmp(str, "cgroup")) {
> + trace->summary_mode = SUMMARY__BY_CGROUP;
> } else {
> pr_err("Unknown summary mode: %s\n", str);
> return -1;
> @@ -5460,7 +5462,7 @@ int cmd_trace(int argc, const char **argv)
> OPT_BOOLEAN(0, "errno-summary", &trace.errno_summary,
> "Show errno stats per syscall, use with -s or -S"),
> OPT_CALLBACK(0, "summary-mode", &trace, "mode",
> - "How to show summary: select thread (default) or total",
> + "How to show summary: select thread (default), total or cgroup",
> trace__parse_summary_mode),
> OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
> "Trace pagefaults", parse_pagefaults, "maj"),
> @@ -5774,6 +5776,12 @@ int cmd_trace(int argc, const char **argv)
> symbol_conf.keep_exited_threads = true;
> if (trace.summary_mode == SUMMARY__NONE)
> trace.summary_mode = SUMMARY__BY_THREAD;
> +
> + if (!trace.summary_bpf && trace.summary_mode == SUMMARY__BY_CGROUP) {
> + pr_err("Error: --summary-mode=cgroup only works with --bpf-summary\n");
> + err = -EINVAL;
> + goto out;
> + }
> }
>
> if (output_name != NULL) {
> diff --git a/tools/perf/util/bpf-trace-summary.c b/tools/perf/util/bpf-trace-summary.c
> index 114d8d9ed9b2d3f3..69fb165da206b01f 100644
> --- a/tools/perf/util/bpf-trace-summary.c
> +++ b/tools/perf/util/bpf-trace-summary.c
> @@ -6,10 +6,12 @@
>
> #include "dwarf-regs.h" /* for EM_HOST */
> #include "syscalltbl.h"
> +#include "util/cgroup.h"
> #include "util/hashmap.h"
> #include "util/trace.h"
> #include "util/util.h"
> #include <bpf/bpf.h>
> +#include <linux/rbtree.h>
> #include <linux/time64.h>
> #include <tools/libc_compat.h> /* reallocarray */
>
> @@ -18,6 +20,7 @@
>
>
> static struct syscall_summary_bpf *skel;
> +static struct rb_root cgroups = RB_ROOT;
>
> int trace_prepare_bpf_summary(enum trace_summary_mode mode)
> {
> @@ -29,9 +32,14 @@ int trace_prepare_bpf_summary(enum trace_summary_mode mode)
>
> if (mode == SUMMARY__BY_THREAD)
> skel->rodata->aggr_mode = SYSCALL_AGGR_THREAD;
> + else if (mode == SUMMARY__BY_CGROUP)
> + skel->rodata->aggr_mode = SYSCALL_AGGR_CGROUP;
> else
> skel->rodata->aggr_mode = SYSCALL_AGGR_CPU;
>
> + if (cgroup_is_v2("perf_event") > 0)
> + skel->rodata->use_cgroup_v2 = 1;
> +
> if (syscall_summary_bpf__load(skel) < 0) {
> fprintf(stderr, "failed to load syscall summary bpf skeleton\n");
> return -1;
> @@ -42,6 +50,9 @@ int trace_prepare_bpf_summary(enum trace_summary_mode mode)
> return -1;
> }
>
> + if (mode == SUMMARY__BY_CGROUP)
> + read_all_cgroups(&cgroups);
> +
> return 0;
> }
>
> @@ -88,9 +99,13 @@ static double rel_stddev(struct syscall_stats *stat)
> * per-cpu analysis so it's keyed by the syscall number to combine stats
> * from different CPUs. And syscall_data always has a syscall_node so
> * it can effectively work as flat hierarchy.
> + *
> + * For per-cgroup stats, it uses two-level data structure like thread
> + * syscall_data is keyed by CGROUP and has an array of node which
> + * represents each syscall for the cgroup.
> */
> struct syscall_data {
> - int key; /* tid if AGGR_THREAD, syscall-nr if AGGR_CPU */
> + u64 key; /* tid if AGGR_THREAD, syscall-nr if AGGR_CPU, cgroup if AGGR_CGROUP */
> int nr_events;
> int nr_nodes;
> u64 total_time;
> @@ -191,7 +206,7 @@ static int print_thread_stat(struct syscall_data *data, FILE *fp)
>
> qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp);
>
> - printed += fprintf(fp, " thread (%d), ", data->key);
> + printed += fprintf(fp, " thread (%d), ", (int)data->key);
> printed += fprintf(fp, "%d events\n\n", data->nr_events);
>
> printed += fprintf(fp, " syscall calls errors total min avg max stddev\n");
> @@ -283,6 +298,75 @@ static int print_total_stats(struct syscall_data **data, int nr_data, FILE *fp)
> return printed;
> }
>
> +static int update_cgroup_stats(struct hashmap *hash, struct syscall_key *map_key,
> + struct syscall_stats *map_data)
> +{
> + struct syscall_data *data;
> + struct syscall_node *nodes;
> +
> + if (!hashmap__find(hash, map_key->cgroup, &data)) {
> + data = zalloc(sizeof(*data));
> + if (data == NULL)
> + return -ENOMEM;
> +
> + data->key = map_key->cgroup;
> + if (hashmap__add(hash, data->key, data) < 0) {
> + free(data);
> + return -ENOMEM;
> + }
> + }
> +
> + /* update thread total stats */
> + data->nr_events += map_data->count;
> + data->total_time += map_data->total_time;
> +
> + nodes = reallocarray(data->nodes, data->nr_nodes + 1, sizeof(*nodes));
> + if (nodes == NULL)
> + return -ENOMEM;
> +
> + data->nodes = nodes;
> + nodes = &data->nodes[data->nr_nodes++];
> + nodes->syscall_nr = map_key->nr;
> +
> + /* each thread has an entry for each syscall, just use the stat */
> + memcpy(&nodes->stats, map_data, sizeof(*map_data));
> + return 0;
> +}
> +
> +static int print_cgroup_stat(struct syscall_data *data, FILE *fp)
> +{
> + int printed = 0;
> + struct cgroup *cgrp = __cgroup__find(&cgroups, data->key);
> +
> + qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp);
> +
> + if (cgrp)
> + printed += fprintf(fp, " cgroup %s,", cgrp->name);
> + else
> + printed += fprintf(fp, " cgroup id:%lu,", (unsigned long)data->key);
> +
> + printed += fprintf(fp, " %d events\n\n", data->nr_events);
> +
> + printed += fprintf(fp, " syscall calls errors total min avg max stddev\n");
> + printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
> + printed += fprintf(fp, " --------------- -------- ------ -------- --------- --------- --------- ------\n");
> +
> + printed += print_common_stats(data, fp);
> + printed += fprintf(fp, "\n\n");
> +
> + return printed;
> +}
> +
> +static int print_cgroup_stats(struct syscall_data **data, int nr_data, FILE *fp)
> +{
> + int printed = 0;
> +
> + for (int i = 0; i < nr_data; i++)
> + printed += print_cgroup_stat(data[i], fp);
> +
> + return printed;
> +}
> +
> int trace_print_bpf_summary(FILE *fp)
> {
> struct bpf_map *map = skel->maps.syscall_stats_map;
> @@ -305,10 +389,19 @@ int trace_print_bpf_summary(FILE *fp)
> struct syscall_stats stat;
>
> if (!bpf_map__lookup_elem(map, &key, sizeof(key), &stat, sizeof(stat), 0)) {
> - if (skel->rodata->aggr_mode == SYSCALL_AGGR_THREAD)
> + switch (skel->rodata->aggr_mode) {
> + case SYSCALL_AGGR_THREAD:
> update_thread_stats(&schash, &key, &stat);
> - else
> + break;
> + case SYSCALL_AGGR_CPU:
> update_total_stats(&schash, &key, &stat);
> + break;
> + case SYSCALL_AGGR_CGROUP:
> + update_cgroup_stats(&schash, &key, &stat);
> + break;
> + default:
> + break;
> + }
> }
>
> prev_key = &key;
> @@ -325,10 +418,19 @@ int trace_print_bpf_summary(FILE *fp)
>
> qsort(data, nr_data, sizeof(*data), datacmp);
>
> - if (skel->rodata->aggr_mode == SYSCALL_AGGR_THREAD)
> + switch (skel->rodata->aggr_mode) {
> + case SYSCALL_AGGR_THREAD:
> printed += print_thread_stats(data, nr_data, fp);
> - else
> + break;
> + case SYSCALL_AGGR_CPU:
> printed += print_total_stats(data, nr_data, fp);
> + break;
> + case SYSCALL_AGGR_CGROUP:
> + printed += print_cgroup_stats(data, nr_data, fp);
> + break;
> + default:
> + break;
> + }
>
> for (i = 0; i < nr_data && data; i++) {
> free(data[i]->nodes);
> @@ -343,5 +445,14 @@ int trace_print_bpf_summary(FILE *fp)
>
> void trace_cleanup_bpf_summary(void)
> {
> + if (!RB_EMPTY_ROOT(&cgroups)) {
> + struct cgroup *cgrp, *tmp;
> +
> + rbtree_postorder_for_each_entry_safe(cgrp, tmp, &cgroups, node)
> + cgroup__put(cgrp);
> +
> + cgroups = RB_ROOT;
> + }
> +
> syscall_summary_bpf__destroy(skel);
> }
> diff --git a/tools/perf/util/bpf_skel/syscall_summary.bpf.c b/tools/perf/util/bpf_skel/syscall_summary.bpf.c
> index b25f53b3c1351392..1bcd066a5199a476 100644
> --- a/tools/perf/util/bpf_skel/syscall_summary.bpf.c
> +++ b/tools/perf/util/bpf_skel/syscall_summary.bpf.c
> @@ -8,6 +8,7 @@
>
> #include <bpf/bpf_helpers.h>
> #include <bpf/bpf_tracing.h>
> +#include <bpf/bpf_core_read.h>
>
> /* This is to calculate a delta between sys-enter and sys-exit for each thread */
> struct syscall_trace {
> @@ -35,10 +36,41 @@ struct syscall_stats_map {
> int enabled; /* controlled from userspace */
>
> const volatile enum syscall_aggr_mode aggr_mode;
> +const volatile int use_cgroup_v2;
>
> -static void update_stats(int cpu_or_tid, int nr, s64 duration, long ret)
> +int perf_subsys_id = -1;
> +
> +static inline __u64 get_current_cgroup_id(void)
> +{
> + struct task_struct *task;
> + struct cgroup *cgrp;
> +
> + if (use_cgroup_v2)
> + return bpf_get_current_cgroup_id();
> +
> + task = bpf_get_current_task_btf();
> +
> + if (perf_subsys_id == -1) {
> +#if __has_builtin(__builtin_preserve_enum_value)
> + perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
> + perf_event_cgrp_id);
> +#else
> + perf_subsys_id = perf_event_cgrp_id;
> +#endif
> + }
> +
> + cgrp = BPF_CORE_READ(task, cgroups, subsys[perf_subsys_id], cgroup);
> + return BPF_CORE_READ(cgrp, kn, id);
> +}
> +
> +static void update_stats(int cpu_or_tid, u64 cgroup_id, int nr, s64 duration,
> + long ret)
> {
> - struct syscall_key key = { .cpu_or_tid = cpu_or_tid, .nr = nr, };
> + struct syscall_key key = {
> + .cpu_or_tid = cpu_or_tid,
> + .cgroup = cgroup_id,
> + .nr = nr,
> + };
> struct syscall_stats *stats;
>
> stats = bpf_map_lookup_elem(&syscall_stats_map, &key);
> @@ -90,7 +122,8 @@ SEC("tp_btf/sys_exit")
> int sys_exit(u64 *ctx)
> {
> int tid;
> - int key;
> + int key = 0;
> + u64 cgroup = 0;
> long ret = ctx[1]; /* return value of the syscall */
> struct syscall_trace *st;
> s64 delta;
> @@ -105,11 +138,13 @@ int sys_exit(u64 *ctx)
>
> if (aggr_mode == SYSCALL_AGGR_THREAD)
> key = tid;
> + else if (aggr_mode == SYSCALL_AGGR_CGROUP)
> + cgroup = get_current_cgroup_id();
> else
> key = bpf_get_smp_processor_id();
>
> delta = bpf_ktime_get_ns() - st->timestamp;
> - update_stats(key, st->nr, delta, ret);
> + update_stats(key, cgroup, st->nr, delta, ret);
>
> bpf_map_delete_elem(&syscall_trace_map, &tid);
> return 0;
> diff --git a/tools/perf/util/bpf_skel/syscall_summary.h b/tools/perf/util/bpf_skel/syscall_summary.h
> index 17f9ecba657088aa..72ccccb45925cd10 100644
> --- a/tools/perf/util/bpf_skel/syscall_summary.h
> +++ b/tools/perf/util/bpf_skel/syscall_summary.h
> @@ -6,9 +6,11 @@
> enum syscall_aggr_mode {
> SYSCALL_AGGR_THREAD,
> SYSCALL_AGGR_CPU,
> + SYSCALL_AGGR_CGROUP,
> };
>
> struct syscall_key {
> + u64 cgroup;
> int cpu_or_tid;
> int nr;
> };
> diff --git a/tools/perf/util/trace.h b/tools/perf/util/trace.h
> index ef8361ed12c4edc1..fa8d480527a22cef 100644
> --- a/tools/perf/util/trace.h
> +++ b/tools/perf/util/trace.h
> @@ -8,6 +8,7 @@ enum trace_summary_mode {
> SUMMARY__NONE = 0,
> SUMMARY__BY_TOTAL,
> SUMMARY__BY_THREAD,
> + SUMMARY__BY_CGROUP,
> };
>
> #ifdef HAVE_BPF_SKEL
> --
> 2.49.0.906.g1f30a19c02-goog
>
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH] perf trace: Support --summary-mode=cgroup
2025-05-13 21:15 ` Namhyung Kim
@ 2025-05-13 21:23 ` Arnaldo Carvalho de Melo
0 siblings, 0 replies; 4+ messages in thread
From: Arnaldo Carvalho de Melo @ 2025-05-13 21:23 UTC (permalink / raw)
To: Namhyung Kim
Cc: Ian Rogers, Kan Liang, Jiri Olsa, Adrian Hunter, Peter Zijlstra,
Ingo Molnar, LKML, linux-perf-users, Song Liu, bpf, Howard Chu
On Tue, May 13, 2025 at 02:15:27PM -0700, Namhyung Kim wrote:
> Ping!
Applied!
- Arnaldo
> On Thu, May 01, 2025 at 03:53:37PM -0700, Namhyung Kim wrote:
> > Add a new summary mode to collect stats for each cgroup.
> >
> > $ sudo ./perf trace -as --bpf-summary --summary-mode=cgroup -- sleep 1
> >
> > Summary of events:
> >
> > cgroup /user.slice/user-657345.slice/user@657345.service/session.slice/org.gnome.Shell@x11.service, 535 events
> >
> > syscall calls errors total min avg max stddev
> > (msec) (msec) (msec) (msec) (%)
> > --------------- -------- ------ -------- --------- --------- --------- ------
> > ppoll 15 0 373.600 0.004 24.907 197.491 55.26%
> > poll 15 0 1.325 0.001 0.088 0.369 38.76%
> > close 66 0 0.567 0.007 0.009 0.026 3.55%
> > write 150 0 0.471 0.001 0.003 0.010 3.29%
> > recvmsg 94 83 0.290 0.000 0.003 0.037 16.39%
> > ioctl 26 0 0.237 0.001 0.009 0.096 50.13%
> > timerfd_create 66 0 0.236 0.003 0.004 0.024 8.92%
> > timerfd_settime 70 0 0.160 0.001 0.002 0.012 7.66%
> > writev 10 0 0.118 0.001 0.012 0.019 18.17%
> > read 9 0 0.021 0.001 0.002 0.004 14.07%
> > getpid 14 0 0.019 0.000 0.001 0.004 20.28%
> >
> > cgroup /system.slice/polkit.service, 94 events
> >
> > syscall calls errors total min avg max stddev
> > (msec) (msec) (msec) (msec) (%)
> > --------------- -------- ------ -------- --------- --------- --------- ------
> > ppoll 22 0 19.811 0.000 0.900 9.273 63.88%
> > write 30 0 0.040 0.001 0.001 0.003 12.09%
> > recvmsg 12 0 0.018 0.001 0.002 0.006 28.15%
> > read 18 0 0.013 0.000 0.001 0.003 21.99%
> > poll 12 0 0.006 0.000 0.001 0.001 4.48%
> >
> > cgroup /user.slice/user-657345.slice/user@657345.service/app.slice/app-org.gnome.Terminal.slice/gnome-terminal-server.service, 21 events
> >
> > syscall calls errors total min avg max stddev
> > (msec) (msec) (msec) (msec) (%)
> > --------------- -------- ------ -------- --------- --------- --------- ------
> > ppoll 4 0 17.476 0.003 4.369 13.298 69.65%
> > recvmsg 15 12 0.068 0.002 0.005 0.014 26.53%
> > writev 1 0 0.033 0.033 0.033 0.033 0.00%
> > poll 1 0 0.005 0.005 0.005 0.005 0.00%
> >
> > ...
> >
> > It works only for --bpf-summary for now.
> >
> > Cc: Howard Chu <howardchu95@gmail.com>
> > Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> > ---
> > tools/perf/Documentation/perf-trace.txt | 3 +-
> > tools/perf/builtin-trace.c | 10 +-
> > tools/perf/util/bpf-trace-summary.c | 123 +++++++++++++++++-
> > .../perf/util/bpf_skel/syscall_summary.bpf.c | 43 +++++-
> > tools/perf/util/bpf_skel/syscall_summary.h | 2 +
> > tools/perf/util/trace.h | 1 +
> > 6 files changed, 170 insertions(+), 12 deletions(-)
> >
> > diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
> > index a8a0d8c33438fef7..c1fb6056a0d36dda 100644
> > --- a/tools/perf/Documentation/perf-trace.txt
> > +++ b/tools/perf/Documentation/perf-trace.txt
> > @@ -152,7 +152,8 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
> >
> > --summary-mode=mode::
> > To be used with -s or -S, to select how to show summary. By default it'll
> > - show the syscall summary by thread. Possible values are: thread, total.
> > + show the syscall summary by thread. Possible values are: thread, total,
> > + cgroup.
> >
> > --tool_stats::
> > Show tool stats such as number of times fd->pathname was discovered thru
> > diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
> > index b2c5a9b765ab5d33..83c62c30d914306c 100644
> > --- a/tools/perf/builtin-trace.c
> > +++ b/tools/perf/builtin-trace.c
> > @@ -5301,6 +5301,8 @@ static int trace__parse_summary_mode(const struct option *opt, const char *str,
> > trace->summary_mode = SUMMARY__BY_THREAD;
> > } else if (!strcmp(str, "total")) {
> > trace->summary_mode = SUMMARY__BY_TOTAL;
> > + } else if (!strcmp(str, "cgroup")) {
> > + trace->summary_mode = SUMMARY__BY_CGROUP;
> > } else {
> > pr_err("Unknown summary mode: %s\n", str);
> > return -1;
> > @@ -5460,7 +5462,7 @@ int cmd_trace(int argc, const char **argv)
> > OPT_BOOLEAN(0, "errno-summary", &trace.errno_summary,
> > "Show errno stats per syscall, use with -s or -S"),
> > OPT_CALLBACK(0, "summary-mode", &trace, "mode",
> > - "How to show summary: select thread (default) or total",
> > + "How to show summary: select thread (default), total or cgroup",
> > trace__parse_summary_mode),
> > OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
> > "Trace pagefaults", parse_pagefaults, "maj"),
> > @@ -5774,6 +5776,12 @@ int cmd_trace(int argc, const char **argv)
> > symbol_conf.keep_exited_threads = true;
> > if (trace.summary_mode == SUMMARY__NONE)
> > trace.summary_mode = SUMMARY__BY_THREAD;
> > +
> > + if (!trace.summary_bpf && trace.summary_mode == SUMMARY__BY_CGROUP) {
> > + pr_err("Error: --summary-mode=cgroup only works with --bpf-summary\n");
> > + err = -EINVAL;
> > + goto out;
> > + }
> > }
> >
> > if (output_name != NULL) {
> > diff --git a/tools/perf/util/bpf-trace-summary.c b/tools/perf/util/bpf-trace-summary.c
> > index 114d8d9ed9b2d3f3..69fb165da206b01f 100644
> > --- a/tools/perf/util/bpf-trace-summary.c
> > +++ b/tools/perf/util/bpf-trace-summary.c
> > @@ -6,10 +6,12 @@
> >
> > #include "dwarf-regs.h" /* for EM_HOST */
> > #include "syscalltbl.h"
> > +#include "util/cgroup.h"
> > #include "util/hashmap.h"
> > #include "util/trace.h"
> > #include "util/util.h"
> > #include <bpf/bpf.h>
> > +#include <linux/rbtree.h>
> > #include <linux/time64.h>
> > #include <tools/libc_compat.h> /* reallocarray */
> >
> > @@ -18,6 +20,7 @@
> >
> >
> > static struct syscall_summary_bpf *skel;
> > +static struct rb_root cgroups = RB_ROOT;
> >
> > int trace_prepare_bpf_summary(enum trace_summary_mode mode)
> > {
> > @@ -29,9 +32,14 @@ int trace_prepare_bpf_summary(enum trace_summary_mode mode)
> >
> > if (mode == SUMMARY__BY_THREAD)
> > skel->rodata->aggr_mode = SYSCALL_AGGR_THREAD;
> > + else if (mode == SUMMARY__BY_CGROUP)
> > + skel->rodata->aggr_mode = SYSCALL_AGGR_CGROUP;
> > else
> > skel->rodata->aggr_mode = SYSCALL_AGGR_CPU;
> >
> > + if (cgroup_is_v2("perf_event") > 0)
> > + skel->rodata->use_cgroup_v2 = 1;
> > +
> > if (syscall_summary_bpf__load(skel) < 0) {
> > fprintf(stderr, "failed to load syscall summary bpf skeleton\n");
> > return -1;
> > @@ -42,6 +50,9 @@ int trace_prepare_bpf_summary(enum trace_summary_mode mode)
> > return -1;
> > }
> >
> > + if (mode == SUMMARY__BY_CGROUP)
> > + read_all_cgroups(&cgroups);
> > +
> > return 0;
> > }
> >
> > @@ -88,9 +99,13 @@ static double rel_stddev(struct syscall_stats *stat)
> > * per-cpu analysis so it's keyed by the syscall number to combine stats
> > * from different CPUs. And syscall_data always has a syscall_node so
> > * it can effectively work as flat hierarchy.
> > + *
> > + * For per-cgroup stats, it uses two-level data structure like thread
> > + * syscall_data is keyed by CGROUP and has an array of node which
> > + * represents each syscall for the cgroup.
> > */
> > struct syscall_data {
> > - int key; /* tid if AGGR_THREAD, syscall-nr if AGGR_CPU */
> > + u64 key; /* tid if AGGR_THREAD, syscall-nr if AGGR_CPU, cgroup if AGGR_CGROUP */
> > int nr_events;
> > int nr_nodes;
> > u64 total_time;
> > @@ -191,7 +206,7 @@ static int print_thread_stat(struct syscall_data *data, FILE *fp)
> >
> > qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp);
> >
> > - printed += fprintf(fp, " thread (%d), ", data->key);
> > + printed += fprintf(fp, " thread (%d), ", (int)data->key);
> > printed += fprintf(fp, "%d events\n\n", data->nr_events);
> >
> > printed += fprintf(fp, " syscall calls errors total min avg max stddev\n");
> > @@ -283,6 +298,75 @@ static int print_total_stats(struct syscall_data **data, int nr_data, FILE *fp)
> > return printed;
> > }
> >
> > +static int update_cgroup_stats(struct hashmap *hash, struct syscall_key *map_key,
> > + struct syscall_stats *map_data)
> > +{
> > + struct syscall_data *data;
> > + struct syscall_node *nodes;
> > +
> > + if (!hashmap__find(hash, map_key->cgroup, &data)) {
> > + data = zalloc(sizeof(*data));
> > + if (data == NULL)
> > + return -ENOMEM;
> > +
> > + data->key = map_key->cgroup;
> > + if (hashmap__add(hash, data->key, data) < 0) {
> > + free(data);
> > + return -ENOMEM;
> > + }
> > + }
> > +
> > + /* update thread total stats */
> > + data->nr_events += map_data->count;
> > + data->total_time += map_data->total_time;
> > +
> > + nodes = reallocarray(data->nodes, data->nr_nodes + 1, sizeof(*nodes));
> > + if (nodes == NULL)
> > + return -ENOMEM;
> > +
> > + data->nodes = nodes;
> > + nodes = &data->nodes[data->nr_nodes++];
> > + nodes->syscall_nr = map_key->nr;
> > +
> > + /* each thread has an entry for each syscall, just use the stat */
> > + memcpy(&nodes->stats, map_data, sizeof(*map_data));
> > + return 0;
> > +}
> > +
> > +static int print_cgroup_stat(struct syscall_data *data, FILE *fp)
> > +{
> > + int printed = 0;
> > + struct cgroup *cgrp = __cgroup__find(&cgroups, data->key);
> > +
> > + qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp);
> > +
> > + if (cgrp)
> > + printed += fprintf(fp, " cgroup %s,", cgrp->name);
> > + else
> > + printed += fprintf(fp, " cgroup id:%lu,", (unsigned long)data->key);
> > +
> > + printed += fprintf(fp, " %d events\n\n", data->nr_events);
> > +
> > + printed += fprintf(fp, " syscall calls errors total min avg max stddev\n");
> > + printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
> > + printed += fprintf(fp, " --------------- -------- ------ -------- --------- --------- --------- ------\n");
> > +
> > + printed += print_common_stats(data, fp);
> > + printed += fprintf(fp, "\n\n");
> > +
> > + return printed;
> > +}
> > +
> > +static int print_cgroup_stats(struct syscall_data **data, int nr_data, FILE *fp)
> > +{
> > + int printed = 0;
> > +
> > + for (int i = 0; i < nr_data; i++)
> > + printed += print_cgroup_stat(data[i], fp);
> > +
> > + return printed;
> > +}
> > +
> > int trace_print_bpf_summary(FILE *fp)
> > {
> > struct bpf_map *map = skel->maps.syscall_stats_map;
> > @@ -305,10 +389,19 @@ int trace_print_bpf_summary(FILE *fp)
> > struct syscall_stats stat;
> >
> > if (!bpf_map__lookup_elem(map, &key, sizeof(key), &stat, sizeof(stat), 0)) {
> > - if (skel->rodata->aggr_mode == SYSCALL_AGGR_THREAD)
> > + switch (skel->rodata->aggr_mode) {
> > + case SYSCALL_AGGR_THREAD:
> > update_thread_stats(&schash, &key, &stat);
> > - else
> > + break;
> > + case SYSCALL_AGGR_CPU:
> > update_total_stats(&schash, &key, &stat);
> > + break;
> > + case SYSCALL_AGGR_CGROUP:
> > + update_cgroup_stats(&schash, &key, &stat);
> > + break;
> > + default:
> > + break;
> > + }
> > }
> >
> > prev_key = &key;
> > @@ -325,10 +418,19 @@ int trace_print_bpf_summary(FILE *fp)
> >
> > qsort(data, nr_data, sizeof(*data), datacmp);
> >
> > - if (skel->rodata->aggr_mode == SYSCALL_AGGR_THREAD)
> > + switch (skel->rodata->aggr_mode) {
> > + case SYSCALL_AGGR_THREAD:
> > printed += print_thread_stats(data, nr_data, fp);
> > - else
> > + break;
> > + case SYSCALL_AGGR_CPU:
> > printed += print_total_stats(data, nr_data, fp);
> > + break;
> > + case SYSCALL_AGGR_CGROUP:
> > + printed += print_cgroup_stats(data, nr_data, fp);
> > + break;
> > + default:
> > + break;
> > + }
> >
> > for (i = 0; i < nr_data && data; i++) {
> > free(data[i]->nodes);
> > @@ -343,5 +445,14 @@ int trace_print_bpf_summary(FILE *fp)
> >
> > void trace_cleanup_bpf_summary(void)
> > {
> > + if (!RB_EMPTY_ROOT(&cgroups)) {
> > + struct cgroup *cgrp, *tmp;
> > +
> > + rbtree_postorder_for_each_entry_safe(cgrp, tmp, &cgroups, node)
> > + cgroup__put(cgrp);
> > +
> > + cgroups = RB_ROOT;
> > + }
> > +
> > syscall_summary_bpf__destroy(skel);
> > }
> > diff --git a/tools/perf/util/bpf_skel/syscall_summary.bpf.c b/tools/perf/util/bpf_skel/syscall_summary.bpf.c
> > index b25f53b3c1351392..1bcd066a5199a476 100644
> > --- a/tools/perf/util/bpf_skel/syscall_summary.bpf.c
> > +++ b/tools/perf/util/bpf_skel/syscall_summary.bpf.c
> > @@ -8,6 +8,7 @@
> >
> > #include <bpf/bpf_helpers.h>
> > #include <bpf/bpf_tracing.h>
> > +#include <bpf/bpf_core_read.h>
> >
> > /* This is to calculate a delta between sys-enter and sys-exit for each thread */
> > struct syscall_trace {
> > @@ -35,10 +36,41 @@ struct syscall_stats_map {
> > int enabled; /* controlled from userspace */
> >
> > const volatile enum syscall_aggr_mode aggr_mode;
> > +const volatile int use_cgroup_v2;
> >
> > -static void update_stats(int cpu_or_tid, int nr, s64 duration, long ret)
> > +int perf_subsys_id = -1;
> > +
> > +static inline __u64 get_current_cgroup_id(void)
> > +{
> > + struct task_struct *task;
> > + struct cgroup *cgrp;
> > +
> > + if (use_cgroup_v2)
> > + return bpf_get_current_cgroup_id();
> > +
> > + task = bpf_get_current_task_btf();
> > +
> > + if (perf_subsys_id == -1) {
> > +#if __has_builtin(__builtin_preserve_enum_value)
> > + perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
> > + perf_event_cgrp_id);
> > +#else
> > + perf_subsys_id = perf_event_cgrp_id;
> > +#endif
> > + }
> > +
> > + cgrp = BPF_CORE_READ(task, cgroups, subsys[perf_subsys_id], cgroup);
> > + return BPF_CORE_READ(cgrp, kn, id);
> > +}
> > +
> > +static void update_stats(int cpu_or_tid, u64 cgroup_id, int nr, s64 duration,
> > + long ret)
> > {
> > - struct syscall_key key = { .cpu_or_tid = cpu_or_tid, .nr = nr, };
> > + struct syscall_key key = {
> > + .cpu_or_tid = cpu_or_tid,
> > + .cgroup = cgroup_id,
> > + .nr = nr,
> > + };
> > struct syscall_stats *stats;
> >
> > stats = bpf_map_lookup_elem(&syscall_stats_map, &key);
> > @@ -90,7 +122,8 @@ SEC("tp_btf/sys_exit")
> > int sys_exit(u64 *ctx)
> > {
> > int tid;
> > - int key;
> > + int key = 0;
> > + u64 cgroup = 0;
> > long ret = ctx[1]; /* return value of the syscall */
> > struct syscall_trace *st;
> > s64 delta;
> > @@ -105,11 +138,13 @@ int sys_exit(u64 *ctx)
> >
> > if (aggr_mode == SYSCALL_AGGR_THREAD)
> > key = tid;
> > + else if (aggr_mode == SYSCALL_AGGR_CGROUP)
> > + cgroup = get_current_cgroup_id();
> > else
> > key = bpf_get_smp_processor_id();
> >
> > delta = bpf_ktime_get_ns() - st->timestamp;
> > - update_stats(key, st->nr, delta, ret);
> > + update_stats(key, cgroup, st->nr, delta, ret);
> >
> > bpf_map_delete_elem(&syscall_trace_map, &tid);
> > return 0;
> > diff --git a/tools/perf/util/bpf_skel/syscall_summary.h b/tools/perf/util/bpf_skel/syscall_summary.h
> > index 17f9ecba657088aa..72ccccb45925cd10 100644
> > --- a/tools/perf/util/bpf_skel/syscall_summary.h
> > +++ b/tools/perf/util/bpf_skel/syscall_summary.h
> > @@ -6,9 +6,11 @@
> > enum syscall_aggr_mode {
> > SYSCALL_AGGR_THREAD,
> > SYSCALL_AGGR_CPU,
> > + SYSCALL_AGGR_CGROUP,
> > };
> >
> > struct syscall_key {
> > + u64 cgroup;
> > int cpu_or_tid;
> > int nr;
> > };
> > diff --git a/tools/perf/util/trace.h b/tools/perf/util/trace.h
> > index ef8361ed12c4edc1..fa8d480527a22cef 100644
> > --- a/tools/perf/util/trace.h
> > +++ b/tools/perf/util/trace.h
> > @@ -8,6 +8,7 @@ enum trace_summary_mode {
> > SUMMARY__NONE = 0,
> > SUMMARY__BY_TOTAL,
> > SUMMARY__BY_THREAD,
> > + SUMMARY__BY_CGROUP,
> > };
> >
> > #ifdef HAVE_BPF_SKEL
> > --
> > 2.49.0.906.g1f30a19c02-goog
> >
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH] perf trace: Support --summary-mode=cgroup
2025-05-01 22:53 [PATCH] perf trace: Support --summary-mode=cgroup Namhyung Kim
2025-05-13 21:15 ` Namhyung Kim
@ 2025-05-14 20:07 ` Howard Chu
1 sibling, 0 replies; 4+ messages in thread
From: Howard Chu @ 2025-05-14 20:07 UTC (permalink / raw)
To: Namhyung Kim
Cc: Arnaldo Carvalho de Melo, Ian Rogers, Kan Liang, Jiri Olsa,
Adrian Hunter, Peter Zijlstra, Ingo Molnar, LKML,
linux-perf-users, Song Liu, bpf
Hello Namhyung,
Just a single comment although this has been applied.
On Thu, May 1, 2025 at 3:53 PM Namhyung Kim <namhyung@kernel.org> wrote:
>
> Add a new summary mode to collect stats for each cgroup.
>
> $ sudo ./perf trace -as --bpf-summary --summary-mode=cgroup -- sleep 1
>
> Summary of events:
>
> cgroup /user.slice/user-657345.slice/user@657345.service/session.slice/org.gnome.Shell@x11.service, 535 events
>
> syscall calls errors total min avg max stddev
> (msec) (msec) (msec) (msec) (%)
> --------------- -------- ------ -------- --------- --------- --------- ------
> ppoll 15 0 373.600 0.004 24.907 197.491 55.26%
> poll 15 0 1.325 0.001 0.088 0.369 38.76%
> close 66 0 0.567 0.007 0.009 0.026 3.55%
> write 150 0 0.471 0.001 0.003 0.010 3.29%
> recvmsg 94 83 0.290 0.000 0.003 0.037 16.39%
> ioctl 26 0 0.237 0.001 0.009 0.096 50.13%
> timerfd_create 66 0 0.236 0.003 0.004 0.024 8.92%
> timerfd_settime 70 0 0.160 0.001 0.002 0.012 7.66%
> writev 10 0 0.118 0.001 0.012 0.019 18.17%
> read 9 0 0.021 0.001 0.002 0.004 14.07%
> getpid 14 0 0.019 0.000 0.001 0.004 20.28%
>
<SNIP>
> +static int update_cgroup_stats(struct hashmap *hash, struct syscall_key *map_key,
> + struct syscall_stats *map_data)
> +{
> + struct syscall_data *data;
> + struct syscall_node *nodes;
> +
> + if (!hashmap__find(hash, map_key->cgroup, &data)) {
> + data = zalloc(sizeof(*data));
> + if (data == NULL)
> + return -ENOMEM;
> +
> + data->key = map_key->cgroup;
> + if (hashmap__add(hash, data->key, data) < 0) {
> + free(data);
> + return -ENOMEM;
> + }
> + }
> +
> + /* update thread total stats */
> + data->nr_events += map_data->count;
> + data->total_time += map_data->total_time;
> +
> + nodes = reallocarray(data->nodes, data->nr_nodes + 1, sizeof(*nodes));
> + if (nodes == NULL)
> + return -ENOMEM;
> +
> + data->nodes = nodes;
> + nodes = &data->nodes[data->nr_nodes++];
> + nodes->syscall_nr = map_key->nr;
> +
> + /* each thread has an entry for each syscall, just use the stat */
This comment shouldn't be here.
Otherwise,
Reviewed-by: Howard Chu <howardchu95@gmail.com>
Thanks,
Howard
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2025-05-14 20:08 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-05-01 22:53 [PATCH] perf trace: Support --summary-mode=cgroup Namhyung Kim
2025-05-13 21:15 ` Namhyung Kim
2025-05-13 21:23 ` Arnaldo Carvalho de Melo
2025-05-14 20:07 ` Howard Chu
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).