From: Howard Chu <howardchu95@gmail.com>
To: namhyung@kernel.org
Cc: irogers@google.com, acme@kernel.org, adrian.hunter@intel.com,
jolsa@kernel.org, kan.liang@linux.intel.com,
linux-perf-users@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [PATCH v3 2/5] perf record off-cpu: Dumping samples in BPF
Date: Fri, 26 Jul 2024 18:28:23 +0800 [thread overview]
Message-ID: <20240726102826.787004-3-howardchu95@gmail.com> (raw)
In-Reply-To: <20240726102826.787004-1-howardchu95@gmail.com>
Add a perf_event_array map for dumping off-cpu samples directly, while
keeping the existing approach of dumping the aggregated data at the end.
Array accesses are guarded by many bounds checks in order to pass the BPF verifier.
If the off-cpu time (represented as delta) is greater than or equal to the
output threshold, the sample is output directly.
Signed-off-by: Howard Chu <howardchu95@gmail.com>
Suggested-by: Ian Rogers <irogers@google.com>
---
tools/perf/util/bpf_skel/off_cpu.bpf.c | 143 +++++++++++++++++++++++++
1 file changed, 143 insertions(+)
diff --git a/tools/perf/util/bpf_skel/off_cpu.bpf.c b/tools/perf/util/bpf_skel/off_cpu.bpf.c
index d877a0a9731f..4b0412a7aa5c 100644
--- a/tools/perf/util/bpf_skel/off_cpu.bpf.c
+++ b/tools/perf/util/bpf_skel/off_cpu.bpf.c
@@ -18,6 +18,9 @@
#define MAX_STACKS 32
#define MAX_ENTRIES 102400
+#define MAX_CPUS 4096 /* max_entries of the offcpu_output perf event array */
+#define MAX_OFFCPU_LEN 128 /* number of u64 words in struct offcpu_data */
+
struct tstamp_data {
__u32 stack_id;
__u32 state;
@@ -32,6 +35,7 @@ struct offcpu_key {
__u64 cgroup_id;
};
+/* for dumping at the end */
struct {
__uint(type, BPF_MAP_TYPE_STACK_TRACE);
__uint(key_size, sizeof(__u32));
@@ -39,6 +43,45 @@ struct {
__uint(max_entries, MAX_ENTRIES);
} stacks SEC(".maps");
+struct offcpu_data { /* buffer of MAX_OFFCPU_LEN u64 words holding one sample built by off_cpu_dump() */
+ u64 array[MAX_OFFCPU_LEN];
+};
+
+struct stack_data { /* up to MAX_STACKS u64 stack entries; copy_stack() treats a zero entry as the end */
+ u64 array[MAX_STACKS];
+};
+
+struct { /* perf event array that off_cpu_dump() writes samples to via bpf_perf_event_output() */
+ __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(__u32));
+ __uint(max_entries, MAX_CPUS);
+} offcpu_output SEC(".maps");
+
+/* temporary offcpu sample: per-CPU, single-entry buffer that off_cpu_stat() passes to off_cpu_dump() */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(struct offcpu_data));
+ __uint(max_entries, 1);
+} offcpu_payload SEC(".maps");
+
+/* temporary stack data: per-CPU, single-entry buffer that bpf_get_stack() fills in off_cpu_stat() */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(struct stack_data));
+ __uint(max_entries, 1);
+} stack_tmp SEC(".maps");
+
+/* cached stack per task storage: off_cpu_stat() saves prev's stack here and reads next's entry back when dumping */
+struct {
+ __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, struct stack_data);
+} stack_cache SEC(".maps");
+
struct {
__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
__uint(map_flags, BPF_F_NO_PREALLOC);
@@ -96,6 +139,8 @@ const volatile bool uses_cgroup_v1 = false;
int perf_subsys_id = -1;
+__u64 sample_id, sample_type, offcpu_thresh; /* id written into samples, PERF_SAMPLE_* mask, and minimum delta for a direct dump; not assigned in the visible code -- presumably set from user space, TODO confirm */
+
/*
* Old kernel used to call it task_struct->state and now it's '__state'.
* Use BPF CO-RE "ignored suffix rule" to deal with it like below:
@@ -182,12 +227,87 @@ static inline int can_record(struct task_struct *t, int state)
return 1;
}
+static inline bool check_bounds(int index)
+{ /* returns true iff index is a valid slot of offcpu_data.array, i.e. 0 <= index < MAX_OFFCPU_LEN */
+ if (index >= 0 && index < MAX_OFFCPU_LEN)
+ return true;
+
+ return false;
+}
+
+static inline int copy_stack(struct stack_data *from,
+ struct offcpu_data *to, int n)
+{ /* copies entries of from->array into to->array starting at slot n + 2; returns how many entries were stored */
+ int max_stacks = MAX_STACKS, len = 0;
+
+ if (!from) /* no source stack: nothing is copied and 0 is returned */
+ return len;
+
+ for (int i = 0; i < max_stacks && from->array[i]; ++i) { /* stops at the first zero entry or after MAX_STACKS entries */
+ if (check_bounds(n + 2 + i)) { /* entries whose destination slot is out of range are skipped and not counted */
+ to->array[n + 2 + i] = from->array[i];
+ ++len;
+ }
+ }
+ return len;
+}
+
+static int off_cpu_dump(void *ctx, struct offcpu_data *data, struct offcpu_key *key,
+ struct stack_data *stack_p, __u64 delta, __u64 timestamp)
+{ /* fills data->array with the fields selected by sample_type and emits it through offcpu_output; always returns 0 */
+ int size, n = 0, ip_pos = -1, len = 0; /* n: next free slot in data->array; ip_pos: slot reserved for the IP, -1 if none */
+
+ if (sample_type & PERF_SAMPLE_IDENTIFIER && check_bounds(n)) /* each field is written only if its bit is set and slot n is in range */
+ data->array[n++] = sample_id;
+ if (sample_type & PERF_SAMPLE_IP && check_bounds(n)) {
+ ip_pos = n;
+ data->array[n++] = 0; /* will be updated */
+ }
+ if (sample_type & PERF_SAMPLE_TID && check_bounds(n))
+ data->array[n++] = (u64)key->pid << 32 | key->tgid; /* key->pid in the upper 32 bits, key->tgid in the lower 32 bits */
+ if (sample_type & PERF_SAMPLE_TIME && check_bounds(n))
+ data->array[n++] = timestamp;
+ if (sample_type & PERF_SAMPLE_ID && check_bounds(n))
+ data->array[n++] = sample_id;
+ if (sample_type & PERF_SAMPLE_CPU && check_bounds(n))
+ data->array[n++] = 0; /* the CPU field is always written as 0 */
+ if (sample_type & PERF_SAMPLE_PERIOD && check_bounds(n))
+ data->array[n++] = delta; /* the period field carries delta */
+ if (sample_type & PERF_SAMPLE_CALLCHAIN && check_bounds(n + 2)) { /* needs slots n, n + 1 and n + 2 */
+ /* data->array[n] is callchain->nr (updated later) */
+ data->array[n + 1] = PERF_CONTEXT_USER;
+ data->array[n + 2] = 0; /* default for the first entry slot; copy_stack() overwrites it when it stores an entry there */
+
+ len = copy_stack(stack_p, data, n);
+
+ /* update length of callchain */
+ data->array[n] = len + 1; /* len copied entries plus the PERF_CONTEXT_USER marker */
+
+ /* update sample ip with the first callchain entry */
+ if (ip_pos >= 0)
+ data->array[ip_pos] = data->array[n + 2];
+
+ /* calculate sample callchain data->array length */
+ n += len + 2; /* skip nr, the context marker and the len copied entries */
+ }
+ if (sample_type & PERF_SAMPLE_CGROUP && check_bounds(n))
+ data->array[n++] = key->cgroup_id;
+
+ size = n * sizeof(u64); /* number of bytes filled in so far */
+ if (size >= 0 && size <= MAX_OFFCPU_LEN * sizeof(u64)) /* emit only if the size fits within data->array */
+ bpf_perf_event_output(ctx, &offcpu_output, BPF_F_CURRENT_CPU, data, size); /* NOTE(review): the helper's return value is ignored */
+
+ return 0;
+}
+
static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
struct task_struct *next, int state)
{
__u64 ts;
__u32 stack_id;
struct tstamp_data *pelem;
+ struct stack_data *stack_tmp_p, *stack_p;
+ int zero = 0, len = 0;
ts = bpf_ktime_get_ns();
@@ -197,6 +317,22 @@ static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
stack_id = bpf_get_stackid(ctx, &stacks,
BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK);
+ /* temporary stack data */
+ stack_tmp_p = bpf_map_lookup_elem(&stack_tmp, &zero);
+ if (stack_tmp_p)
+ len = bpf_get_stack(ctx, stack_tmp_p->array, MAX_STACKS * sizeof(u64),
+ BPF_F_USER_STACK) / sizeof(u64);
+
+ /* save stacks if collectable */
+ if (len > 0) {
+ stack_p = bpf_task_storage_get(&stack_cache, prev, NULL,
+ BPF_LOCAL_STORAGE_GET_F_CREATE);
+ if (stack_p) {
+ for (int i = 0; i < len && i < MAX_STACKS; ++i)
+ stack_p->array[i] = stack_tmp_p->array[i];
+ }
+ }
+
pelem = bpf_task_storage_get(&tstamp, prev, NULL,
BPF_LOCAL_STORAGE_GET_F_CREATE);
if (!pelem)
@@ -226,6 +362,13 @@ static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
else
bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);
+ if (delta >= offcpu_thresh) {
+ struct offcpu_data *data = bpf_map_lookup_elem(&offcpu_payload, &zero);
+ stack_p = bpf_task_storage_get(&stack_cache, next, NULL, 0);
+ if (data && stack_p)
+ off_cpu_dump(ctx, data, &key, stack_p, delta, pelem->timestamp);
+ }
+
/* prevent to reuse the timestamp later */
pelem->timestamp = 0;
}
--
2.45.2
next prev parent reply other threads:[~2024-07-26 10:28 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-07-26 10:28 [PATCH v3 0/5] Dump off-cpu samples directly Howard Chu
2024-07-26 10:28 ` [PATCH v3 1/5] perf record off-cpu: Add direct off-cpu event Howard Chu
2024-07-26 23:48 ` Ian Rogers
2024-07-29 13:36 ` Howard Chu
2024-07-26 10:28 ` Howard Chu [this message]
2024-07-26 10:28 ` [PATCH v3 3/5] perf record off-cpu: processing of embedded sample Howard Chu
2024-07-26 10:28 ` [PATCH v3 4/5] perf record off-cpu: save embedded sample type Howard Chu
2024-07-27 0:49 ` Ian Rogers
2024-07-26 10:28 ` [PATCH v3 5/5] perf record off-cpu: Add direct off-cpu test Howard Chu
2024-07-27 0:54 ` Ian Rogers
2024-07-29 13:29 ` Howard Chu
2024-07-27 1:06 ` [PATCH v3 0/5] Dump off-cpu samples directly Ian Rogers
2024-07-29 1:21 ` Namhyung Kim
2024-07-29 15:24 ` Howard Chu
2024-07-31 17:46 ` Namhyung Kim
2024-07-31 18:23 ` Arnaldo Carvalho de Melo
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240726102826.787004-3-howardchu95@gmail.com \
--to=howardchu95@gmail.com \
--cc=acme@kernel.org \
--cc=adrian.hunter@intel.com \
--cc=irogers@google.com \
--cc=jolsa@kernel.org \
--cc=kan.liang@linux.intel.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-perf-users@vger.kernel.org \
--cc=namhyung@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).