* [PATCH v1 1/2] perf trace: Collect data only for certain pids
From: Howard Chu @ 2024-08-27 9:20 UTC
To: acme
Cc: namhyung, irogers, jolsa, adrian.hunter, kan.liang,
linux-perf-users, linux-kernel, Howard Chu
Currently, we collect augmented data for __every__ process, even when the -p
argument is used.
For instance, if we do perf trace -e write -p 11451, we don't just
collect the write buffer for process 11451, we collect write buffers for all
the processes running on the system. That data eventually goes to
waste, as we are not interested in it.
In this patch, I add a new BPF map, pids_allowed, so that augmented data
is collected only for certain processes. This is different from
pids_filtered, which is a map that filters out the pids we don't want to
trace, for example perf trace itself.
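In other words, the per-pid decision on the BPF side boils down to the
following (a condensed sketch of the should_filter() helper added in the
diff below; the map and variable names match the patch):

        static inline bool should_filter(void)
        {
                pid_t pid = getpid();

                /* always drop pids we explicitly filter out, e.g. perf trace itself */
                if (bpf_map_lookup_elem(&pids_filtered, &pid))
                        return true;

                /* with -p (or a workload), only pids present in pids_allowed are augmented */
                if (task_specific && !bpf_map_lookup_elem(&pids_allowed, &pid))
                        return true;

                return false;
        }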
Signed-off-by: Howard Chu <howardchu95@gmail.com>
---
tools/perf/builtin-trace.c | 45 +++++++++++++++++++
.../bpf_skel/augmented_raw_syscalls.bpf.c | 23 ++++++++--
2 files changed, 64 insertions(+), 4 deletions(-)
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 115f8dffb272..d38e0b919e8e 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -3911,6 +3911,44 @@ static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace)
return err;
}
+
+static int trace__set_allowed_pids(struct trace *trace)
+{
+ int err, pids_allowed_fd = bpf_map__fd(trace->skel->maps.pids_allowed);
+ bool exists = true;
+ struct str_node *pos;
+ struct strlist *pids_slist = strlist__new(trace->opts.target.pid, NULL);
+
+ trace->skel->bss->task_specific = false;
+
+ if (pids_slist) {
+ strlist__for_each_entry(pos, pids_slist) {
+ char *end_ptr;
+ int pid = strtol(pos->s, &end_ptr, 10);
+
+ if (pid == INT_MIN || pid == INT_MAX ||
+ (*end_ptr != '\0' && *end_ptr != ','))
+ continue;
+
+ err = bpf_map_update_elem(pids_allowed_fd, &pid, &exists, BPF_ANY);
+ if (err)
+ return err;
+
+ trace->skel->bss->task_specific = true;
+ }
+ }
+
+ if (workload_pid != -1) {
+ err = bpf_map_update_elem(pids_allowed_fd, &workload_pid, &exists, BPF_ANY);
+ if (err)
+ return err;
+
+ trace->skel->bss->task_specific = true;
+ }
+
+ strlist__delete(pids_slist);
+ return 0;
+}
#endif // HAVE_BPF_SKEL
static int trace__set_ev_qualifier_filter(struct trace *trace)
@@ -4305,6 +4343,13 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
#ifdef HAVE_BPF_SKEL
if (trace->skel && trace->skel->progs.sys_enter)
trace__init_syscalls_bpf_prog_array_maps(trace);
+
+ if (trace->skel) {
+ /* set up workload and user-specified pids for BPF */
+ err = trace__set_allowed_pids(trace);
+ if (err)
+ goto out_error_mem;
+ }
#endif
if (trace->ev_qualifier_ids.nr > 0) {
diff --git a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
index f29a8dfca044..1ab0a56c8f35 100644
--- a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
+++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
@@ -24,6 +24,8 @@
#define MAX_CPUS 4096
+volatile bool task_specific;
+
/* bpf-output associated map */
struct __augmented_syscalls__ {
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
@@ -81,6 +83,13 @@ struct pids_filtered {
__uint(max_entries, 64);
} pids_filtered SEC(".maps");
+struct pids_allowed {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, pid_t);
+ __type(value, bool);
+ __uint(max_entries, 512);
+} pids_allowed SEC(".maps");
+
/*
* Desired design of maximum size and alignment (see RFC2553)
*/
@@ -393,9 +402,15 @@ static pid_t getpid(void)
return bpf_get_current_pid_tgid();
}
-static bool pid_filter__has(struct pids_filtered *pids, pid_t pid)
+static inline bool should_filter(void)
{
- return bpf_map_lookup_elem(pids, &pid) != NULL;
+ pid_t pid = getpid();
+
+ if (bpf_map_lookup_elem(&pids_filtered, &pid) ||
+ (task_specific && !bpf_map_lookup_elem(&pids_allowed, &pid)))
+ return true;
+
+ return false;
}
static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
@@ -497,7 +512,7 @@ int sys_enter(struct syscall_enter_args *args)
* initial, non-augmented raw_syscalls:sys_enter payload.
*/
- if (pid_filter__has(&pids_filtered, getpid()))
+ if (should_filter())
return 0;
augmented_args = augmented_args_payload();
@@ -523,7 +538,7 @@ int sys_exit(struct syscall_exit_args *args)
{
struct syscall_exit_args exit_args;
- if (pid_filter__has(&pids_filtered, getpid()))
+ if (should_filter())
return 0;
bpf_probe_read_kernel(&exit_args, sizeof(exit_args), args);
--
2.46.0
* [PATCH v1 2/2] perf trace: Use pid to index perf_event in BPF
From: Howard Chu @ 2024-08-27 9:20 UTC
To: acme
Cc: namhyung, irogers, jolsa, adrian.hunter, kan.liang,
linux-perf-users, linux-kernel, Howard Chu
Currently, perf trace -p <PID> is broken for some syscalls. This patch
fixes it.
Before:
perf $ perf trace -e open -p 79768
? ( ): ... [continued]: open()) = -1 ENOENT (No such file or directory)
? ( ): ... [continued]: open()) = -1 ENOENT (No such file or directory)
? ( ): ... [continued]: open()) = -1 ENOENT (No such file or directory)
After:
perf $ ./perf trace -e open -p 79768
0.000 ( 0.019 ms): open(filename: "DINGZHEN", flags: WRONLY) = -1 ENOENT (No such file or directory)
1000.187 ( 0.031 ms): open(filename: "DINGZHEN", flags: WRONLY) = -1 ENOENT (No such file or directory)
2000.377 ( 0.019 ms): open(filename: "DINGZHEN", flags: WRONLY) = -1 ENOENT (No such file or directory)
This is because when using -p <PID> in perf trace, we mmap the pids
instead of the cpus. But in BPF, we tend to use a per-cpu mapped perf_event
to output the augmented data (for instance, with BPF_F_CURRENT_CPU). That
means the index into the perf_event map is a cpu.
When we are using -p <PID>, we have cpu = -1, pid = <PID>:
perf_event_map
[-1] = target_perf_event_of_this_pid
This -1 index will never work in BPF. So my original solution was to map
every cpu to this single pid, which is:
perf_event_map
[0] = target_perf_event_of_this_pid
[1] = target_perf_event_of_this_pid
[2] = target_perf_event_of_this_pid
[3] = target_perf_event_of_this_pid
But that would cause <number-of-pids> * <number-of-cpus> calls to
sys_perf_event_open.
So Namhyung's solution is to introduce a new map, which I call
pid2perf_event:
pid2perf_event_map
[pid] = perf_event_index
and then:
perf_event_map
[perf_event_index] = target_perf_event_of_this_pid
We use the pid to get the correct index into the perf_event map, and
retrieve the correct perf_event using this index.
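For illustration, a condensed sketch of the resulting output path on the
BPF side (it mirrors the augmented__beauty_output() change in the diff
below; the helper name output_for_current_pid is made up for this example):

        static inline int output_for_current_pid(void *ctx, void *data, int len)
        {
                pid_t pid = getpid();
                /* pid -> index into __augmented_syscalls__, populated by userspace */
                u32 *idx = bpf_map_lookup_elem(&pid2perf_event, &pid);

                if (idx == NULL)
                        return -1;

                /* index -> the bpf-output perf_event opened for this pid */
                return bpf_perf_event_output(ctx, &__augmented_syscalls__, *idx, data, len);
        }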
Suggested-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Howard Chu <howardchu95@gmail.com>
---
tools/perf/builtin-trace.c | 55 +++++++++++++++----
.../bpf_skel/augmented_raw_syscalls.bpf.c | 33 +++++++++--
tools/perf/util/evlist.c | 2 +-
3 files changed, 72 insertions(+), 18 deletions(-)
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index d38e0b919e8e..f9ff65c3d4d2 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -3920,6 +3920,7 @@ static int trace__set_allowed_pids(struct trace *trace)
struct strlist *pids_slist = strlist__new(trace->opts.target.pid, NULL);
trace->skel->bss->task_specific = false;
+ trace->skel->bss->is_workload = false;
if (pids_slist) {
strlist__for_each_entry(pos, pids_slist) {
@@ -3944,6 +3945,7 @@ static int trace__set_allowed_pids(struct trace *trace)
return err;
trace->skel->bss->task_specific = true;
+ trace->skel->bss->is_workload = true;
}
strlist__delete(pids_slist);
@@ -4321,18 +4323,49 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
goto out_error_open;
#ifdef HAVE_BPF_SKEL
if (trace->syscalls.events.bpf_output) {
- struct perf_cpu cpu;
+ if (trace->opts.target.pid) {
+ /*
+ * perf_event map is supposed to be a cpu to perf_event mapping, which is
+ * different from which when we specified -p, with cpu = -1, pid = <PID>.
+ * In this case, we treat perf_event map as an array and ignore the cpu
+ * mapping side of it, and use pid to retrieve the correct index to its
+ * corresponding perf_event.
+ */
+ int j = 0;
+ struct perf_thread_map *threads;
+ struct evsel *evsel_aug_sys = evlist__find_evsel_by_str(trace->evlist, "__augmented_syscalls__");
- /*
- * Set up the __augmented_syscalls__ BPF map to hold for each
- * CPU the bpf-output event's file descriptor.
- */
- perf_cpu_map__for_each_cpu(cpu, i, trace->syscalls.events.bpf_output->core.cpus) {
- bpf_map__update_elem(trace->skel->maps.__augmented_syscalls__,
- &cpu.cpu, sizeof(int),
- xyarray__entry(trace->syscalls.events.bpf_output->core.fd,
- cpu.cpu, 0),
- sizeof(__u32), BPF_ANY);
+ if (evsel_aug_sys == NULL)
+ goto out_error;
+
+ threads = evsel_aug_sys->core.threads;
+
+ for (int thread = 0; thread < perf_thread_map__nr(threads); thread++, j++) {
+ pid_t pid = perf_thread_map__pid(threads, thread);
+
+ bpf_map__update_elem(trace->skel->maps.pid2perf_event, &pid, sizeof(pid_t),
+ &j, sizeof(int), BPF_ANY);
+
+ bpf_map__update_elem(trace->skel->maps.__augmented_syscalls__,
+ &j, sizeof(int),
+ xyarray__entry(trace->syscalls.events.bpf_output->core.fd,
+ 0, j),
+ sizeof(__u32), BPF_ANY);
+ }
+ } else {
+ struct perf_cpu cpu;
+
+ /*
+ * Set up the __augmented_syscalls__ BPF map to hold for each
+ * CPU the bpf-output event's file descriptor.
+ */
+ perf_cpu_map__for_each_cpu(cpu, i, trace->syscalls.events.bpf_output->core.cpus) {
+ bpf_map__update_elem(trace->skel->maps.__augmented_syscalls__,
+ &cpu.cpu, sizeof(int),
+ xyarray__entry(trace->syscalls.events.bpf_output->core.fd,
+ cpu.cpu, 0),
+ sizeof(__u32), BPF_ANY);
+ }
}
}
#endif
diff --git a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
index 1ab0a56c8f35..ef8aa0bd2275 100644
--- a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
+++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
@@ -25,6 +25,7 @@
#define MAX_CPUS 4096
volatile bool task_specific;
+volatile bool is_workload;
/* bpf-output associated map */
struct __augmented_syscalls__ {
@@ -90,6 +91,13 @@ struct pids_allowed {
__uint(max_entries, 512);
} pids_allowed SEC(".maps");
+struct pid2perf_event {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, pid_t);
+ __type(value, int);
+ __uint(max_entries, MAX_CPUS);
+} pid2perf_event SEC(".maps");
+
/*
* Desired design of maximum size and alignment (see RFC2553)
*/
@@ -154,6 +162,11 @@ struct beauty_payload_enter_map {
__uint(max_entries, 1);
} beauty_payload_enter_map SEC(".maps");
+static pid_t getpid(void)
+{
+ return bpf_get_current_pid_tgid();
+}
+
static inline struct augmented_args_payload *augmented_args_payload(void)
{
int key = 0;
@@ -168,7 +181,20 @@ static inline int augmented__output(void *ctx, struct augmented_args_payload *ar
static inline int augmented__beauty_output(void *ctx, void *data, int len)
{
- return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, data, len);
+ /*
+ * when it's cpu = -1 pid = PID, we look up the perf_event for this PID. Workload is
+ * per-cpu mapped so we don't do so.
+ */
+ if (task_specific && !is_workload) {
+ pid_t pid = getpid();
+ u32 *perf_event = bpf_map_lookup_elem(&pid2perf_event, &pid);
+ if (perf_event)
+ return bpf_perf_event_output(ctx, &__augmented_syscalls__, *perf_event, data, len);
+ } else {
+ return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, data, len);
+ }
+
+ return -1;
}
static inline
@@ -397,11 +423,6 @@ int sys_enter_nanosleep(struct syscall_enter_args *args)
return 1; /* Failure: don't filter */
}
-static pid_t getpid(void)
-{
- return bpf_get_current_pid_tgid();
-}
-
static inline bool should_filter()
{
pid_t pid = getpid();
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index f14b7e6ff1dc..ef58a7764318 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -1067,7 +1067,7 @@ int evlist__create_maps(struct evlist *evlist, struct target *target)
if (!threads)
return -1;
- if (target__uses_dummy_map(target) && !evlist__has_bpf_output(evlist))
+ if (target__uses_dummy_map(target))
cpus = perf_cpu_map__new_any_cpu();
else
cpus = perf_cpu_map__new(target->cpu_list);
--
2.46.0