Re: [PATCH v6 4/8] perf record off-cpu: Dump direct off-cpu samples in BPF

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Namhyung Kim <namhyung@kernel.org>
To: Howard Chu <howardchu95@gmail.com>
Cc: peterz@infradead.org, mingo@redhat.com, acme@kernel.org,
	mark.rutland@arm.com, alexander.shishkin@linux.intel.com,
	jolsa@kernel.org, irogers@google.com, adrian.hunter@intel.com,
	kan.liang@linux.intel.com, linux-perf-users@vger.kernel.org,
	linux-kernel@vger.kernel.org
Subject: Re: [PATCH v6 4/8] perf record off-cpu: Dump direct off-cpu samples in BPF
Date: Sun, 29 Sep 2024 23:23:52 -0700	[thread overview]
Message-ID: <ZvpD-PiP0wKdBL0C@google.com> (raw)
In-Reply-To: <20240927202736.767941-5-howardchu95@gmail.com>

On Fri, Sep 27, 2024 at 01:27:32PM -0700, Howard Chu wrote:
> Add perf_event_array map for dumping direct off-cpu samples, but keep
> the at-the-end approach.
> 
> Tons of checking before access, to pass the BPF verifier.
> 
> If off-cpu time (represented as delta) exceeds the off-cpu threshold, do
> output.
> 
> Output PERF_SAMPLE_TID, PERF_SAMPLE_PERIOD, PERF_SAMPLE_CALLCHAIN, and
> PERF_SAMPLE_CGROUP in bpf_perf_event_output().
> 
> Ideally, we should only output
> PERF_SAMPLE_PERIOD (off-cpu time) and PERF_SAMPLE_CALLCHAIN (sched_in
> process's callchain). One only needs to set PERF_SAMPLE_TID and
> PERF_SAMPLE_CGROUP, and perf_event will do everything for us.
> 
> But in reality, that's not the case. Setting PERF_SAMPLE_TID will mostly
> give us TID of 0. We might get the correct TID for offcpu-time event
> from time to time, but it is really rare.
>          swapper       0 [000] offcpu-time:  /
>         :1321819 1321819 [002] offcpu-time:  /user.slice/user-1000.slice/session-2.scope
>          swapper       0 [001] offcpu-time:  /
>          swapper       0 [003] offcpu-time:  /
> 
> And setting PERF_SAMPLE_CGROUP doesn't work properly either.
>     tmux: server    3701 [003] offcpu-time:  /
>     blueman-tray    1064 [001] offcpu-time:  /
>             bash 1350867 [001] offcpu-time:  /
>             bash 1350844 [000] offcpu-time:  /
> 
> We need to retrieve PERF_SAMPLE_TID, PERF_SAMPLE_PERIOD,
> PERF_SAMPLE_CALLCHAIN, and PERF_SAMPLE_CGROUP using BPF and output these
> four fields.
> 
> Suggested-by: Namhyung Kim <namhyung@kernel.org>
> Signed-off-by: Howard Chu <howardchu95@gmail.com>
> ---
>  tools/perf/util/bpf_skel/off_cpu.bpf.c | 112 +++++++++++++++++++++++++
>  tools/perf/util/off_cpu.h              |   8 +-
>  2 files changed, 119 insertions(+), 1 deletion(-)
> 
> diff --git a/tools/perf/util/bpf_skel/off_cpu.bpf.c b/tools/perf/util/bpf_skel/off_cpu.bpf.c
> index e2a887228fd9..c42d0e2d91d8 100644
> --- a/tools/perf/util/bpf_skel/off_cpu.bpf.c
> +++ b/tools/perf/util/bpf_skel/off_cpu.bpf.c
> @@ -19,6 +19,7 @@
>  #define MAX_ENTRIES  102400
>  
>  #define MAX_CPUS  4096
> +#define MAX_OFFCPU_LEN 128

I guess it's too big as you only collect stack and a few more data.

>  
>  struct tstamp_data {
>  	__u32 stack_id;
> @@ -34,6 +35,7 @@ struct offcpu_key {
>  	__u64 cgroup_id;
>  };
>  
> +/* for dumping at the end */
>  struct {
>  	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
>  	__uint(key_size, sizeof(__u32));
> @@ -41,6 +43,14 @@ struct {
>  	__uint(max_entries, MAX_ENTRIES);
>  } stacks SEC(".maps");
>  
> +struct offcpu_data {
> +	u64 array[MAX_OFFCPU_LEN];
> +};
> +
> +struct stack_data {
> +	u64 array[MAX_STACKS];
> +};
> +
>  struct {
>  	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
>  	__uint(key_size, sizeof(__u32));
> @@ -48,6 +58,22 @@ struct {
>  	__uint(max_entries, MAX_CPUS);
>  } offcpu_output SEC(".maps");
>  
> +/* temporary offcpu sample */
> +struct {
> +	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
> +	__uint(key_size, sizeof(__u32));
> +	__uint(value_size, sizeof(struct offcpu_data));
> +	__uint(max_entries, 1);
> +} offcpu_payload SEC(".maps");
> +
> +/* cached stack per task storage */
> +struct {
> +	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
> +	__uint(map_flags, BPF_F_NO_PREALLOC);
> +	__type(key, int);
> +	__type(value, struct stack_data);
> +} stack_cache SEC(".maps");
> +
>  struct {
>  	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
>  	__uint(map_flags, BPF_F_NO_PREALLOC);
> @@ -194,12 +220,75 @@ static inline int can_record(struct task_struct *t, int state)
>  	return 1;
>  }
>  
> +static inline bool check_bounds(int index)
> +{
> +	if (index < 0 || index >= MAX_OFFCPU_LEN)
> +		return false;
> +
> +	return true;
> +}

Likewise, we may get rid of the bound check entirely as it's always
guaranteed.

> +
> +static inline int copy_stack(struct stack_data *from,
> +			     struct offcpu_data *to, int n)
> +{
> +	int max_stacks = MAX_STACKS, len = 0;
> +
> +	if (!from)
> +		return len;
> +
> +	for (int i = 0; i < max_stacks && from->array[i]; ++i) {
> +		if (check_bounds(n + 2 + i)) {
> +			to->array[n + 2 + i] = from->array[i];
> +			++len;
> +		}
> +	}
> +	return len;
> +}
> +
> +static int off_cpu_dump(void *ctx, struct offcpu_data *data, struct offcpu_key *key,
> +			struct stack_data *stack_p, __u64 delta, __u64 timestamp)
> +{
> +	int size, n = 0, ip_pos = -1, len = 0;
> +
> +	if (sample_type & PERF_SAMPLE_TID && check_bounds(n))

You didn't set the sample_type yet.  Is it intentional?


> +		data->array[n++] = (u64)key->tgid << 32 | key->pid;
> +	if (sample_type & PERF_SAMPLE_PERIOD && check_bounds(n))
> +		data->array[n++] = delta;
> +	if (sample_type & PERF_SAMPLE_CALLCHAIN && check_bounds(n + 2)) {
> +		/* data->array[n] is callchain->nr (updated later) */
> +		data->array[n + 1] = PERF_CONTEXT_USER;
> +		data->array[n + 2] = 0;
> +
> +		len = copy_stack(stack_p, data, n);
> +
> +		/* update length of callchain */
> +		data->array[n] = len + 1;
> +
> +		/* update sample ip with the first callchain entry */
> +		if (ip_pos >= 0)

You don't set the ip_pos, just remove this part and let userspace handle
it later.


> +			data->array[ip_pos] = data->array[n + 2];
> +
> +		/* calculate sample callchain data->array length */
> +		n += len + 2;
> +	}
> +	if (sample_type & PERF_SAMPLE_CGROUP && check_bounds(n))
> +		data->array[n++] = key->cgroup_id;
> +
> +	size = n * sizeof(u64);
> +	if (size >= 0 && size <= MAX_OFFCPU_LEN * sizeof(u64))
> +		bpf_perf_event_output(ctx, &offcpu_output, BPF_F_CURRENT_CPU, data, size);
> +
> +	return 0;
> +}
> +
>  static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
>  			struct task_struct *next, int state)
>  {
>  	__u64 ts;
>  	__u32 stack_id;
>  	struct tstamp_data *pelem;
> +	struct stack_data *stack_p;
> +	int zero = 0;
>  
>  	ts = bpf_ktime_get_ns();
>  
> @@ -209,6 +298,21 @@ static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
>  	stack_id = bpf_get_stackid(ctx, &stacks,
>  				   BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK);
>  
> +	/*
> +	 * if stacks are successfully collected, cache them to task_storage, they are then
> +	 * dumped if the off-cpu time hits the threshold.
> +	 */
> +	if (stack_id > 0) {
> +		stack_p = bpf_task_storage_get(&stack_cache, prev, NULL,
> +					       BPF_LOCAL_STORAGE_GET_F_CREATE);

Can you just add a new field in the tstamp map instead?


> +		if (stack_p) {
> +			/* to pass the clang result unused warning */
> +			int __attribute__((unused)) len;
> +			len = bpf_get_stack(ctx, stack_p->array, MAX_STACKS * sizeof(u64),
> +					    BPF_F_USER_STACK) / sizeof(u64);

I think you can move to the next task if it failed to get the stack
trace.


> +		}
> +	}
> +
>  	pelem = bpf_task_storage_get(&tstamp, prev, NULL,
>  				     BPF_LOCAL_STORAGE_GET_F_CREATE);
>  	if (!pelem)
> @@ -238,6 +342,14 @@ static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
>  		else
>  			bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);
>  
> +		if (delta >= offcpu_thresh) {
> +			struct offcpu_data *data = bpf_map_lookup_elem(&offcpu_payload, &zero);
> +
> +			stack_p = bpf_task_storage_get(&stack_cache, next, NULL, 0);
> +			if (data && stack_p)
> +				off_cpu_dump(ctx, data, &key, stack_p, delta, pelem->timestamp);
> +		}

I think you should either dump the data directly or save it in the
off_cpu map.  Otherwise the time is accounted twice.

Thanks,
Namhyung

> +
>  		/* prevent to reuse the timestamp later */
>  		pelem->timestamp = 0;
>  	}
> diff --git a/tools/perf/util/off_cpu.h b/tools/perf/util/off_cpu.h
> index 357231cb1c38..eaf7be92472d 100644
> --- a/tools/perf/util/off_cpu.h
> +++ b/tools/perf/util/off_cpu.h
> @@ -15,9 +15,15 @@ struct record_opts;
>  #define OFFCPU_SAMPLE_TYPES  (PERF_SAMPLE_IDENTIFIER | PERF_SAMPLE_IP | \
>  			      PERF_SAMPLE_TID | PERF_SAMPLE_TIME | \
>  			      PERF_SAMPLE_ID | PERF_SAMPLE_CPU | \
> -			      PERF_SAMPLE_PERIOD | PERF_SAMPLE_CALLCHAIN | \
> +			      PERF_SAMPLE_PERIOD | PERF_SAMPLE_RAW | \
>  			      PERF_SAMPLE_CGROUP)
>  
> +/*
> + * for embedded data to overwrite the original sample, duplicated sample types
> + * must be set in the original OFFCPU_SAMPLE_TYPES, except for callchain.
> + */
> +#define OFFCPU_EMBEDDED_SAMPLE_TYPES  (PERF_SAMPLE_TID | PERF_SAMPLE_PERIOD | \
> +				       PERF_SAMPLE_CALLCHAIN | PERF_SAMPLE_CGROUP)
>  
>  #ifdef HAVE_BPF_SKEL
>  int off_cpu_prepare(struct evlist *evlist, struct target *target,
> -- 
> 2.43.0
>

next prev parent reply	other threads:[~2024-09-30  6:23 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-09-27 20:27 [PATCH v6 0/8] perf record --off-cpu: Dump off-cpu samples directly Howard Chu
2024-09-27 20:27 ` [PATCH v6 1/8] perf evsel: Set off-cpu BPF output to system-wide Howard Chu
2024-09-30  5:37   ` Namhyung Kim
2024-09-27 20:27 ` [PATCH v6 2/8] perf record --off-cpu: Add --off-cpu-thresh Howard Chu
2024-09-30  5:40   ` Namhyung Kim
2024-09-27 20:27 ` [PATCH v6 3/8] perf record --off-cpu: Parse offcpu-time event Howard Chu
2024-09-30  5:51   ` Namhyung Kim
2024-09-27 20:27 ` [PATCH v6 4/8] perf record off-cpu: Dump direct off-cpu samples in BPF Howard Chu
2024-09-30  6:23   ` Namhyung Kim [this message]
2024-09-27 20:27 ` [PATCH v6 5/8] perf record --off-cpu: Dump total off-cpu time at the end Howard Chu
2024-09-30  6:39   ` Namhyung Kim
2024-09-27 20:27 ` [PATCH v6 6/8] perf evsel: Delete unnecessary = 0 Howard Chu
2024-09-27 20:27 ` [PATCH v6 7/8] perf record --off-cpu: Parse BPF output embedded data Howard Chu
2024-09-30  6:51   ` Namhyung Kim
2024-09-27 20:27 ` [PATCH v6 8/8] perf test: Add direct off-cpu dumping test Howard Chu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ZvpD-PiP0wKdBL0C@google.com \
    --to=namhyung@kernel.org \
    --cc=acme@kernel.org \
    --cc=adrian.hunter@intel.com \
    --cc=alexander.shishkin@linux.intel.com \
    --cc=howardchu95@gmail.com \
    --cc=irogers@google.com \
    --cc=jolsa@kernel.org \
    --cc=kan.liang@linux.intel.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-perf-users@vger.kernel.org \
    --cc=mark.rutland@arm.com \
    --cc=mingo@redhat.com \
    --cc=peterz@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.