Re: [PATCH v4 4/9] perf record off-cpu: Dump direct off-cpu samples in BPF

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Namhyung Kim <namhyung@kernel.org>
To: Howard Chu <howardchu95@gmail.com>
Cc: irogers@google.com, acme@kernel.org, adrian.hunter@intel.com,
	jolsa@kernel.org, kan.liang@linux.intel.com,
	linux-perf-users@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: Re: [PATCH v4 4/9] perf record off-cpu: Dump direct off-cpu samples in BPF
Date: Wed, 7 Aug 2024 16:49:59 -0700	[thread overview]
Message-ID: <ZrQIJ9xRg22qALoQ@google.com> (raw)
In-Reply-To: <20240807153843.3231451-5-howardchu95@gmail.com>

On Wed, Aug 07, 2024 at 11:38:38PM +0800, Howard Chu wrote:
> Output PERF_SAMPLE_TID, PERF_SAMPLE_PERIOD, PERF_SAMPLE_CALLCHAIN, and
> PERF_SAMPLE_CGROUP in BPF. Ideally, we should only output
> PERF_SAMPLE_PERIOD (off-cpu time) and PERF_SAMPLE_CALLCHAIN (sched_in
> process's callchain). One only needs to set PERF_SAMPLE_TID and
> PERF_SAMPLE_CGROUP, and perf_event will do everything for us.
> 
> But in reality, that's not the case. Setting PERF_SAMPLE_TID will mostly
> give us TID of 0. We might get the correct TID for offcpu-time event
> from time to time, but it is really rare.
>          swapper       0 [000] offcpu-time:  /
>         :1321819 1321819 [002] offcpu-time:  /user.slice/user-1000.slice/session-2.scope
>          swapper       0 [001] offcpu-time:  /
>          swapper       0 [003] offcpu-time:  /
> 
> And setting PERF_SAMPLE_CGROUP doesn't work properly either.
>     tmux: server    3701 [003] offcpu-time:  /
>     blueman-tray    1064 [001] offcpu-time:  /
>             bash 1350867 [001] offcpu-time:  /
>             bash 1350844 [000] offcpu-time:  /

Isn't it just because your system was idle?

> 
> We need to retrieve PERF_SAMPLE_TID, PERF_SAMPLE_PERIOD,
> PERF_SAMPLE_CALLCHAIN, and PERF_SAMPLE_CGROUP using BPF and output these
> four fields.
> 
> Add perf_event_array map for dumping direct off-cpu samples, but keep
> the at-the-end approach.
> 
> Tons of checking before access, to pass the BPF verifier.
> 
> If off-cpu time (represented as delta) exceeds the off-cpu threshold, do
> output.
> 
> Suggested-by: Namhyung Kim <namhyung@kernel.org>
> Signed-off-by: Howard Chu <howardchu95@gmail.com>
> ---
>  tools/perf/util/bpf_skel/off_cpu.bpf.c | 133 +++++++++++++++++++++++++
>  1 file changed, 133 insertions(+)
> 
> diff --git a/tools/perf/util/bpf_skel/off_cpu.bpf.c b/tools/perf/util/bpf_skel/off_cpu.bpf.c
> index cca1b6990a57..95cc130bb67e 100644
> --- a/tools/perf/util/bpf_skel/off_cpu.bpf.c
> +++ b/tools/perf/util/bpf_skel/off_cpu.bpf.c
> @@ -18,6 +18,9 @@
>  #define MAX_STACKS   32
>  #define MAX_ENTRIES  102400
>  
> +#define MAX_CPUS  4096
> +#define MAX_OFFCPU_LEN 128
> +
>  struct tstamp_data {
>  	__u32 stack_id;
>  	__u32 state;
> @@ -32,6 +35,7 @@ struct offcpu_key {
>  	__u64 cgroup_id;
>  };
>  
> +/* for dumping at the end */
>  struct {
>  	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
>  	__uint(key_size, sizeof(__u32));
> @@ -39,6 +43,45 @@ struct {
>  	__uint(max_entries, MAX_ENTRIES);
>  } stacks SEC(".maps");
>  
> +struct offcpu_data {
> +	u64 array[MAX_OFFCPU_LEN];
> +};
> +
> +struct stack_data {
> +	u64 array[MAX_STACKS];
> +};
> +
> +struct {
> +	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
> +	__uint(key_size, sizeof(__u32));
> +	__uint(value_size, sizeof(__u32));
> +	__uint(max_entries, MAX_CPUS);
> +} offcpu_output SEC(".maps");
> +
> +/* temporary offcpu sample */
> +struct {
> +	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
> +	__uint(key_size, sizeof(__u32));
> +	__uint(value_size, sizeof(struct offcpu_data));
> +	__uint(max_entries, 1);
> +} offcpu_payload SEC(".maps");
> +
> +/* temporary stack data */
> +struct {
> +	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
> +	__uint(key_size, sizeof(__u32));
> +	__uint(value_size, sizeof(struct stack_data));
> +	__uint(max_entries, 1);
> +} stack_tmp SEC(".maps");
> +
> +/* cached stack per task storage */
> +struct {
> +	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
> +	__uint(map_flags, BPF_F_NO_PREALLOC);
> +	__type(key, int);
> +	__type(value, struct stack_data);
> +} stack_cache SEC(".maps");
> +
>  struct {
>  	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
>  	__uint(map_flags, BPF_F_NO_PREALLOC);
> @@ -184,12 +227,75 @@ static inline int can_record(struct task_struct *t, int state)
>  	return 1;
>  }
>  
> +static inline bool check_bounds(int index)
> +{
> +	if (index >= 0 && index < MAX_OFFCPU_LEN)
> +		return true;
> +
> +	return false;
> +}
> +
> +static inline int copy_stack(struct stack_data *from,
> +			     struct offcpu_data *to, int n)
> +{
> +	int max_stacks = MAX_STACKS, len = 0;
> +
> +	if (!from)
> +		return len;
> +
> +	for (int i = 0; i < max_stacks && from->array[i]; ++i) {
> +		if (check_bounds(n + 2 + i)) {
> +			to->array[n + 2 + i] = from->array[i];
> +			++len;
> +		}
> +	}
> +	return len;
> +}
> +
> +static int off_cpu_dump(void *ctx, struct offcpu_data *data, struct offcpu_key *key,
> +			struct stack_data *stack_p, __u64 delta, __u64 timestamp)
> +{
> +	int size, n = 0, ip_pos = -1, len = 0;
> +
> +	if (sample_type & PERF_SAMPLE_TID && check_bounds(n))
> +		data->array[n++] = (u64)key->tgid << 32 | key->pid;
> +	if (sample_type & PERF_SAMPLE_PERIOD && check_bounds(n))
> +		data->array[n++] = delta;
> +	if (sample_type & PERF_SAMPLE_CALLCHAIN && check_bounds(n + 2)) {
> +		/* data->array[n] is callchain->nr (updated later) */
> +		data->array[n + 1] = PERF_CONTEXT_USER;
> +		data->array[n + 2] = 0;
> +
> +		len = copy_stack(stack_p, data, n);
> +
> +		/* update length of callchain */
> +		data->array[n] = len + 1;
> +
> +		/* update sample ip with the first callchain entry */
> +		if (ip_pos >= 0)
> +			data->array[ip_pos] = data->array[n + 2];

Where is ip_pos set?  I guess we don't need this as we don't update the
SAMPLE_IP here.

> +
> +		/* calculate sample callchain data->array length */
> +		n += len + 2;
> +	}
> +	if (sample_type & PERF_SAMPLE_CGROUP && check_bounds(n))
> +		data->array[n++] = key->cgroup_id;
> +
> +	size = n * sizeof(u64);
> +	if (size >= 0 && size <= MAX_OFFCPU_LEN * sizeof(u64))
> +		bpf_perf_event_output(ctx, &offcpu_output, BPF_F_CURRENT_CPU, data, size);
> +
> +	return 0;
> +}
> +
>  static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
>  			struct task_struct *next, int state)
>  {
>  	__u64 ts;
>  	__u32 stack_id;
>  	struct tstamp_data *pelem;
> +	struct stack_data *stack_tmp_p, *stack_p;
> +	int zero = 0, len = 0;
>  
>  	ts = bpf_ktime_get_ns();
>  
> @@ -199,6 +305,25 @@ static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
>  	stack_id = bpf_get_stackid(ctx, &stacks,
>  				   BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK);
>  
> +	/* cache stacks */
> +	stack_tmp_p = bpf_map_lookup_elem(&stack_tmp, &zero);
> +	if (stack_tmp_p)
> +		len = bpf_get_stack(ctx, stack_tmp_p->array, MAX_STACKS * sizeof(u64),
> +				    BPF_F_USER_STACK) / sizeof(u64);

Looks redundant, but not sure if we share the stackid..

> +
> +	/*
> +	 * if stacks are successfully collected, cache them to task_storage, they are then
> +	 * dumped if the off-cpu time hits the threshold.
> +	 */
> +	if (len > 0) {
> +		stack_p = bpf_task_storage_get(&stack_cache, prev, NULL,
> +					       BPF_LOCAL_STORAGE_GET_F_CREATE);
> +		if (stack_p) {
> +			for (int i = 0; i < len && i < MAX_STACKS; ++i)
> +				stack_p->array[i] = stack_tmp_p->array[i];
> +		}
> +	}

Why not use the task storage to get the stacktrace directly?

Thanks,
Namhyung

> +
>  	pelem = bpf_task_storage_get(&tstamp, prev, NULL,
>  				     BPF_LOCAL_STORAGE_GET_F_CREATE);
>  	if (!pelem)
> @@ -228,6 +353,14 @@ static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
>  		else
>  			bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);
>  
> +		if (delta >= offcpu_thresh) {
> +			struct offcpu_data *data = bpf_map_lookup_elem(&offcpu_payload, &zero);
> +
> +			stack_p = bpf_task_storage_get(&stack_cache, next, NULL, 0);
> +			if (data && stack_p)
> +				off_cpu_dump(ctx, data, &key, stack_p, delta, pelem->timestamp);
> +		}
> +
>  		/* prevent to reuse the timestamp later */
>  		pelem->timestamp = 0;
>  	}
> -- 
> 2.45.2
>

next prev parent reply	other threads:[~2024-08-07 23:50 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-08-07 15:38 [PATCH v4 0/9] perf record --off-cpu: Dump off-cpu samples directly Howard Chu
2024-08-07 15:38 ` [PATCH v4 1/9] perf evsel: Set BPF output to system-wide Howard Chu
2024-08-07 23:21   ` Namhyung Kim
2024-08-08  3:58     ` Howard Chu
2024-09-25  2:53       ` Ian Rogers
2024-09-25  7:07         ` Howard Chu
2024-08-07 15:38 ` [PATCH v4 2/9] perf record --off-cpu: Add --off-cpu-thresh Howard Chu
2024-08-07 23:22   ` Namhyung Kim
2024-08-08  4:01     ` Howard Chu
2024-08-07 15:38 ` [PATCH v4 3/9] perf record --off-cpu: Parse offcpu-time event Howard Chu
2024-08-07 23:25   ` Namhyung Kim
2024-08-08  8:52     ` Howard Chu
2024-08-07 15:38 ` [PATCH v4 4/9] perf record off-cpu: Dump direct off-cpu samples in BPF Howard Chu
2024-08-07 23:49   ` Namhyung Kim [this message]
2024-08-08 11:57     ` Howard Chu
2024-08-07 15:38 ` [PATCH v4 5/9] perf record --off-cpu: Dump total off-cpu time at the end Howard Chu
2024-08-07 15:38 ` [PATCH v4 6/9] perf evsel: Delete unnecessary = 0 Howard Chu
2024-08-07 15:38 ` [PATCH v4 7/9] perf record --off-cpu: Parse BPF output embedded data Howard Chu
2024-08-07 15:38 ` [PATCH v4 8/9] perf header: Add field 'embed' Howard Chu
2024-08-07 23:52   ` Namhyung Kim
2024-08-08 13:57     ` Howard Chu
2024-08-07 15:38 ` [PATCH v4 9/9] perf test: Add direct off-cpu dumping test Howard Chu
2024-09-25  3:04 ` [PATCH v4 0/9] perf record --off-cpu: Dump off-cpu samples directly Ian Rogers
2024-09-25  6:59   ` Howard Chu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ZrQIJ9xRg22qALoQ@google.com \
    --to=namhyung@kernel.org \
    --cc=acme@kernel.org \
    --cc=adrian.hunter@intel.com \
    --cc=howardchu95@gmail.com \
    --cc=irogers@google.com \
    --cc=jolsa@kernel.org \
    --cc=kan.liang@linux.intel.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-perf-users@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.