All of lore.kernel.org
 help / color / mirror / Atom feed
From: Arnaldo Carvalho de Melo <acme@kernel.org>
To: Howard Chu <howardchu95@gmail.com>
Cc: namhyung@kernel.org, mark.rutland@arm.com,
	alexander.shishkin@linux.intel.com, jolsa@kernel.org,
	irogers@google.com, adrian.hunter@intel.com,
	kan.liang@linux.intel.com, linux-perf-users@vger.kernel.org,
	linux-kernel@vger.kernel.org, Ingo Molnar <mingo@redhat.com>,
	James Clark <james.clark@linaro.org>,
	Peter Zijlstra <peterz@infradead.org>,
	Arnaldo Carvalho de Melo <acme@redhat.com>
Subject: Re: [PATCH v14 04/10] perf record --off-cpu: Dump off-cpu samples in BPF
Date: Tue, 11 Feb 2025 18:28:15 +0100	[thread overview]
Message-ID: <Z6uIr8cRZBllFKt4@x1> (raw)
In-Reply-To: <20241215181220.754822-5-howardchu95@gmail.com>

On Sun, Dec 15, 2024 at 10:12:14AM -0800, Howard Chu wrote:
> Collect tid, period, callchain, and cgroup id and dump them when off-cpu
> time threshold is reached.
> 
> We don't collect the off-cpu time twice (the delta), it's either in
> direct samples, or accumulated samples that are dumped at the end of
> perf.data.
> 
> Suggested-by: Namhyung Kim <namhyung@kernel.org>
> Signed-off-by: Howard Chu <howardchu95@gmail.com>
> Cc: Adrian Hunter <adrian.hunter@intel.com>
> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
> Cc: Ian Rogers <irogers@google.com>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: James Clark <james.clark@linaro.org>
> Cc: Jiri Olsa <jolsa@kernel.org>
> Cc: Kan Liang <kan.liang@linux.intel.com>
> Cc: Mark Rutland <mark.rutland@arm.com>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Link: https://lore.kernel.org/r/20241108204137.2444151-6-howardchu95@gmail.com
> Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
> ---
>  tools/perf/util/bpf_skel/off_cpu.bpf.c | 88 ++++++++++++++++++++++++--
>  1 file changed, 83 insertions(+), 5 deletions(-)
> 
> diff --git a/tools/perf/util/bpf_skel/off_cpu.bpf.c b/tools/perf/util/bpf_skel/off_cpu.bpf.c
> index 1cdd4d63ea92..77fdc9e81db3 100644
> --- a/tools/perf/util/bpf_skel/off_cpu.bpf.c
> +++ b/tools/perf/util/bpf_skel/off_cpu.bpf.c
> @@ -19,11 +19,17 @@
>  #define MAX_ENTRIES  102400
>  
>  #define MAX_CPUS  4096
> +#define MAX_OFFCPU_LEN 37
> +
> +struct stack {
> +	u64 array[MAX_STACKS];
> +};

So I needed to rename the above as it fails 'make -C tools/perf
build-test', the direct make command line to hit that:

⬢ [acme@toolbox perf-tools-next]$ rm -rf /tmp/build/$(basename $PWD)/ ; mkdir -p /tmp/build/$(basename $PWD)/  
⬢ [acme@toolbox perf-tools-next]$ alias m='rm -rf ~/libexec/perf-core/ ; make -k GEN_VMLINUX_H=1 CORESIGHT=1 O=/tmp/build/$(basename $PWD)/ -C tools/perf install-bin && perf test python && cat /tmp/build/perf-tools-next/feature/test-all.output'
⬢ [acme@toolbox perf-tools-next]$ m 
  GENSKEL /tmp/build/perf-tools-next/util/bpf_skel/sample_filter.skel.h
util/bpf_skel/off_cpu.bpf.c:24:8: error: redefinition of 'stack'
   24 | struct stack {
      |        ^
/tmp/build/perf-tools-next/util/bpf_skel/.tmp/../vmlinux.h:128096:8: note: previous definition is here
 128096 | struct stack {
        |        ^
util/bpf_skel/off_cpu.bpf.c:218:42: error: no member named 'array' in 'struct stack'
  218 |         for (int i = 0; i < MAX_STACKS && from->array[i]; ++i, ++len)
      |                                           ~~~~  ^
util/bpf_skel/off_cpu.bpf.c:219:32: error: no member named 'array' in 'struct stack'
  219 |                 to->array[n + 2 + i] = from->array[i];
      |                                        ~~~~  ^
3 errors generated.
make[2]: *** [Makefile.perf:1276: /tmp/build/perf-tools-next/util/bpf_skel/.tmp/off_cpu.bpf.o] Error 1

So for now I'm adding the patch below to get it going.

IIRC there was some discussion about ditching GEN_VMLINUX_H=1, we can
remove this hack later if/when we progress on that.

- Arnaldo

diff --git a/tools/perf/util/bpf_skel/off_cpu.bpf.c b/tools/perf/util/bpf_skel/off_cpu.bpf.c
index 77fdc9e81db395d1..848a123e5610f17b 100644
--- a/tools/perf/util/bpf_skel/off_cpu.bpf.c
+++ b/tools/perf/util/bpf_skel/off_cpu.bpf.c
@@ -21,7 +21,8 @@
 #define MAX_CPUS  4096
 #define MAX_OFFCPU_LEN 37
 
-struct stack {
+// We have a 'struct stack' in vmlinux.h when building with GEN_VMLINUX_H=1
+struct __stack {
 	u64 array[MAX_STACKS];
 };
 
@@ -29,7 +30,7 @@ struct tstamp_data {
 	__u32 stack_id;
 	__u32 state;
 	__u64 timestamp;
-	struct stack stack;
+	struct __stack stack;
 };
 
 struct offcpu_key {
@@ -211,7 +212,7 @@ static inline int can_record(struct task_struct *t, int state)
 	return 1;
 }
 
-static inline int copy_stack(struct stack *from, struct offcpu_data *to, int n)
+static inline int copy_stack(struct __stack *from, struct offcpu_data *to, int n)
 {
 	int len = 0;
 
@@ -231,7 +232,7 @@ static inline int copy_stack(struct stack *from, struct offcpu_data *to, int n)
  * information of the task, and dump it as a raw sample to perf ring buffer
  */
 static int off_cpu_dump(void *ctx, struct offcpu_data *data, struct offcpu_key *key,
-			struct stack *stack, __u64 delta)
+			struct __stack *stack, __u64 delta)
 {
 	int n = 0, len = 0;
 
  
>  struct tstamp_data {
>  	__u32 stack_id;
>  	__u32 state;
>  	__u64 timestamp;
> +	struct stack stack;
>  };
>  
>  struct offcpu_key {
> @@ -41,6 +47,10 @@ struct {
>  	__uint(max_entries, MAX_ENTRIES);
>  } stacks SEC(".maps");
>  
> +struct offcpu_data {
> +	u64 array[MAX_OFFCPU_LEN];
> +};
> +
>  struct {
>  	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
>  	__uint(key_size, sizeof(__u32));
> @@ -48,6 +58,13 @@ struct {
>  	__uint(max_entries, MAX_CPUS);
>  } offcpu_output SEC(".maps");
>  
> +struct {
> +	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
> +	__uint(key_size, sizeof(__u32));
> +	__uint(value_size, sizeof(struct offcpu_data));
> +	__uint(max_entries, 1);
> +} offcpu_payload SEC(".maps");
> +
>  struct {
>  	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
>  	__uint(map_flags, BPF_F_NO_PREALLOC);
> @@ -106,6 +123,8 @@ const volatile bool uses_cgroup_v1 = false;
>  
>  int perf_subsys_id = -1;
>  
> +__u64 offcpu_thresh_ns = 500000000ull;
> +
>  /*
>   * Old kernel used to call it task_struct->state and now it's '__state'.
>   * Use BPF CO-RE "ignored suffix rule" to deal with it like below:
> @@ -192,6 +211,47 @@ static inline int can_record(struct task_struct *t, int state)
>  	return 1;
>  }
>  
> +static inline int copy_stack(struct stack *from, struct offcpu_data *to, int n)
> +{
> +	int len = 0;
> +
> +	for (int i = 0; i < MAX_STACKS && from->array[i]; ++i, ++len)
> +		to->array[n + 2 + i] = from->array[i];
> +
> +	return len;
> +}
> +
> +/**
> + * off_cpu_dump - dump off-cpu samples to ring buffer
> + * @data: payload for dumping off-cpu samples
> + * @key: off-cpu data
> + * @stack: stack trace of the task before being scheduled out
> + *
> + * If the threshold of off-cpu time is reached, acquire tid, period, callchain, and cgroup id
> + * information of the task, and dump it as a raw sample to perf ring buffer
> + */
> +static int off_cpu_dump(void *ctx, struct offcpu_data *data, struct offcpu_key *key,
> +			struct stack *stack, __u64 delta)
> +{
> +	int n = 0, len = 0;
> +
> +	data->array[n++] = (u64)key->tgid << 32 | key->pid;
> +	data->array[n++] = delta;
> +
> +	/* data->array[n] is callchain->nr (updated later) */
> +	data->array[n + 1] = PERF_CONTEXT_USER;
> +	data->array[n + 2] = 0;
> +	len = copy_stack(stack, data, n);
> +
> +	/* update length of callchain */
> +	data->array[n] = len + 1;
> +	n += len + 2;
> +
> +	data->array[n++] = key->cgroup_id;
> +
> +	return bpf_perf_event_output(ctx, &offcpu_output, BPF_F_CURRENT_CPU, data, n * sizeof(u64));
> +}
> +
>  static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
>  			struct task_struct *next, int state)
>  {
> @@ -216,6 +276,16 @@ static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
>  	pelem->state = state;
>  	pelem->stack_id = stack_id;
>  
> +	/*
> +	 * If stacks are successfully collected by bpf_get_stackid(), collect them once more
> +	 * in task_storage for direct off-cpu sample dumping
> +	 */
> +	if (stack_id > 0 && bpf_get_stack(ctx, &pelem->stack, MAX_STACKS * sizeof(u64), BPF_F_USER_STACK)) {
> +		/*
> +		 * This empty if block is used to avoid 'result unused warning' from bpf_get_stack().
> +		 * If the collection fails, continue with the logic for the next task.
> +		 */
> +	}
>  next:
>  	pelem = bpf_task_storage_get(&tstamp, next, NULL, 0);
>  
> @@ -230,11 +300,19 @@ static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
>  		__u64 delta = ts - pelem->timestamp;
>  		__u64 *total;
>  
> -		total = bpf_map_lookup_elem(&off_cpu, &key);
> -		if (total)
> -			*total += delta;
> -		else
> -			bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);
> +		if (delta >= offcpu_thresh_ns) {
> +			int zero = 0;
> +			struct offcpu_data *data = bpf_map_lookup_elem(&offcpu_payload, &zero);
> +
> +			if (data)
> +				off_cpu_dump(ctx, data, &key, &pelem->stack, delta);
> +		} else {
> +			total = bpf_map_lookup_elem(&off_cpu, &key);
> +			if (total)
> +				*total += delta;
> +			else
> +				bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);
> +		}
>  
>  		/* prevent to reuse the timestamp later */
>  		pelem->timestamp = 0;
> -- 
> 2.43.0

  reply	other threads:[~2025-02-11 17:28 UTC|newest]

Thread overview: 23+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-12-15 18:12 [PATCH v14 00/10] perf record --off-cpu: Dump off-cpu samples directly Howard Chu
2024-12-15 18:12 ` [PATCH v14 01/10] perf evsel: Expose evsel__is_offcpu_event() for future use Howard Chu
2024-12-15 18:12 ` [PATCH v14 02/10] perf record --off-cpu: Parse off-cpu event Howard Chu
2024-12-15 18:12 ` [PATCH v14 03/10] perf record --off-cpu: Preparation of off-cpu BPF program Howard Chu
2024-12-15 18:12 ` [PATCH v14 04/10] perf record --off-cpu: Dump off-cpu samples in BPF Howard Chu
2025-02-11 17:28   ` Arnaldo Carvalho de Melo [this message]
2025-02-11 17:31     ` Howard Chu
2024-12-15 18:12 ` [PATCH v14 05/10] perf evsel: Assemble offcpu samples Howard Chu
2025-02-06 19:27   ` Arnaldo Carvalho de Melo
2025-02-06 22:08     ` Howard Chu
2024-12-15 18:12 ` [PATCH v14 06/10] perf record --off-cpu: Disable perf_event's callchain collection Howard Chu
2024-12-15 18:12 ` [PATCH v14 07/10] perf script: Display off-cpu samples correctly Howard Chu
2024-12-15 18:12 ` [PATCH v14 08/10] perf record --off-cpu: Dump the remaining samples in BPF's stack trace map Howard Chu
2024-12-15 18:12 ` [PATCH v14 09/10] perf record --off-cpu: Add --off-cpu-thresh option Howard Chu
2024-12-15 18:12 ` [PATCH v14 10/10] perf test: Add direct off-cpu test Howard Chu
2024-12-16 20:11 ` [PATCH v14 00/10] perf record --off-cpu: Dump off-cpu samples directly Namhyung Kim
2024-12-16 20:49   ` Howard Chu
     [not found]     ` <CA+JHD92xXH=6K0-imdObf+dw3=B0uiGdFH16DO3ArdHv3r1Zzg@mail.gmail.com>
2025-02-04 17:56       ` Ian Rogers
2025-02-06 17:22         ` Arnaldo Carvalho de Melo
2025-02-11 18:02   ` Arnaldo Carvalho de Melo
2025-02-11 18:26     ` Arnaldo Carvalho de Melo
2025-02-11 18:30     ` Howard Chu
2025-02-11 18:49       ` Arnaldo Carvalho de Melo

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=Z6uIr8cRZBllFKt4@x1 \
    --to=acme@kernel.org \
    --cc=acme@redhat.com \
    --cc=adrian.hunter@intel.com \
    --cc=alexander.shishkin@linux.intel.com \
    --cc=howardchu95@gmail.com \
    --cc=irogers@google.com \
    --cc=james.clark@linaro.org \
    --cc=jolsa@kernel.org \
    --cc=kan.liang@linux.intel.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-perf-users@vger.kernel.org \
    --cc=mark.rutland@arm.com \
    --cc=mingo@redhat.com \
    --cc=namhyung@kernel.org \
    --cc=peterz@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.