[RFC/PATCH] perf trace: Add --syscall-period option

linux-perf-users.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [RFC/PATCH] perf trace: Add --syscall-period option
@ 2024-12-11 22:21 Namhyung Kim
  2024-12-11 22:52 ` Arnaldo Carvalho de Melo
  2024-12-18  5:40 ` Howard Chu
  0 siblings, 2 replies; 8+ messages in thread
From: Namhyung Kim @ 2024-12-11 22:21 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo, Ian Rogers, Kan Liang
  Cc: Jiri Olsa, Adrian Hunter, Peter Zijlstra, Ingo Molnar, LKML,
	linux-perf-users, Howard Chu

This option is to implement the event sampling for system calls.
The period is given in msec as it shows the time in msec.

  # perf trace -C 0 --syscall-period 100 sleep 1
           ? (         ): fleetspeakd/1828559  ... [continued]: futex())                                            = -1 ETIMEDOUT (Connection timed out)
       0.050 (100.247 ms): gnome-shell/572531 recvmsg(fd: 10<socket:[3355761]>, msg: 0x7ffef8b39d20)                = 40
     100.357 (100.149 ms): pipewire-pulse/572245 read(fd: 5<anon_inode:[eventfd]>, buf: 0x7ffc0b9dc8f0, count: 8)      = 8
     200.553 (100.268 ms): NetworkManager/3424 epoll_wait(epfd: 19<anon_inode:[eventpoll]>, events: 0x5607b85bb880, maxevents: 6) = 0
     300.876 (         ): mon/4932 poll(ufds: 0x7fa392784df0, nfds: 1, timeout_msecs: 100)            ...
     400.901 ( 0.025 ms): TaskCon~ller #/620145 futex(uaddr: 0x7f3fc596fa00, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
     300.876 (100.123 ms): mon/4932  ... [continued]: poll())                                             = 0 (Timeout)
     500.901 ( 0.012 ms): evdefer/2/2335122 futex(uaddr: 0x5640baac5198, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
     602.701 ( 0.017 ms): Compositor/1992200 futex(uaddr: 0x7f1a51dfdd40, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
     705.589 ( 0.017 ms): JS Watchdog/947933 futex(uaddr: 0x7f4cac1d4240, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
     812.667 ( 0.027 ms): fix/1985151 futex(uaddr: 0xc0008f7148, op: WAKE|PRIVATE_FLAG, val: 1)             = 1
     912.807 ( 0.017 ms): Xorg/572315 setitimer(value: 0x7ffc375d6ba0)                                      = 0

The timestamp is kept in a per-cpu array and the allowed task is saved
in a hash map.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/Documentation/perf-trace.txt       |  6 ++
 tools/perf/builtin-trace.c                    |  5 ++
 .../bpf_skel/augmented_raw_syscalls.bpf.c     | 67 ++++++++++++++++++-
 3 files changed, 76 insertions(+), 2 deletions(-)

diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
index 6e0cc50bbc13fc7f..9f338a8c5357a67e 100644
--- a/tools/perf/Documentation/perf-trace.txt
+++ b/tools/perf/Documentation/perf-trace.txt
@@ -241,6 +241,12 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
 	printing using the existing 'perf trace' syscall arg beautifiers to map integer
 	arguments to strings (pid to comm, syscall id to syscall name, etc).
 
+--syscall-period::
+	Trace a system call in the given period (in msec).  This implements
+	sampling for syscalls in order to reduce the monitoring overhead.
+	For example, setting the sysall period to 100 (msec) means it will
+	sample a syscall and next one after 100 msec.
+
 
 PAGEFAULTS
 ----------
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 3c46de1a8d79bfe6..789eb0ffd5f90b61 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -185,6 +185,7 @@ struct trace {
 	} stats;
 	unsigned int		max_stack;
 	unsigned int		min_stack;
+	unsigned long		sample_period_ms;
 	int			raw_augmented_syscalls_args_size;
 	bool			raw_augmented_syscalls;
 	bool			fd_path_disabled;
@@ -5219,6 +5220,7 @@ int cmd_trace(int argc, const char **argv)
 		     "start"),
 	OPT_BOOLEAN(0, "force-btf", &trace.force_btf, "Prefer btf_dump general pretty printer"
 		       "to customized ones"),
+	OPT_ULONG(0, "syscall-period", &trace.sample_period_ms, "syscall sampling period in ms"),
 	OPTS_EVSWITCH(&trace.evswitch),
 	OPT_END()
 	};
@@ -5326,6 +5328,9 @@ int cmd_trace(int argc, const char **argv)
 				bpf_program__set_autoattach(prog, /*autoattach=*/false);
 		}
 
+		if (trace.sample_period_ms)
+			trace.skel->rodata->sample_period = trace.sample_period_ms * NSEC_PER_MSEC;
+
 		err = augmented_raw_syscalls_bpf__load(trace.skel);
 
 		if (err < 0) {
diff --git a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
index 4a62ed593e84edf8..12272620dcd73700 100644
--- a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
+++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
@@ -113,6 +113,22 @@ struct pids_filtered {
 	__uint(max_entries, 64);
 } pids_filtered SEC(".maps");
 
+struct sample_timestamp {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__type(key, int);
+	__type(value, __u64);
+	__uint(max_entries, 1);
+} sample_timestamp SEC(".maps");
+
+struct sample_filtered {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, pid_t);
+	__type(value, bool);
+	__uint(max_entries, MAX_CPUS);
+} sample_filtered SEC(".maps");
+
+const volatile __u64 sample_period;
+
 struct augmented_args_payload {
 	struct syscall_enter_args args;
 	struct augmented_arg arg, arg2; // We have to reserve space for two arguments (rename, etc)
@@ -428,6 +444,44 @@ static bool pid_filter__has(struct pids_filtered *pids, pid_t pid)
 	return bpf_map_lookup_elem(pids, &pid) != NULL;
 }
 
+static bool sample_filter__allow_enter(__u64 timestamp, pid_t pid)
+{
+	int idx = 0;
+	__u64 *prev_ts;
+	bool ok = true;
+
+	/* default behavior */
+	if (sample_period == 0)
+		return true;
+
+	prev_ts = bpf_map_lookup_elem(&sample_timestamp, &idx);
+
+	if (prev_ts) {
+		if ((*prev_ts + sample_period) > timestamp)
+			return false;
+		*prev_ts = timestamp;
+	} else {
+		bpf_map_update_elem(&sample_timestamp, &idx, &timestamp, BPF_ANY);
+	}
+
+	bpf_map_update_elem(&sample_filtered, &pid, &ok, BPF_ANY);
+
+	return true;
+}
+
+static bool sample_filter__allow_exit(pid_t pid)
+{
+	/* default behavior */
+	if (sample_period == 0)
+		return true;
+
+	if (!bpf_map_lookup_elem(&sample_filtered, &pid))
+		return false;
+
+	bpf_map_delete_elem(&sample_filtered, &pid);
+	return true;
+}
+
 static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
 {
 	bool augmented, do_output = false;
@@ -526,7 +580,9 @@ static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
 SEC("tp/raw_syscalls/sys_enter")
 int sys_enter(struct syscall_enter_args *args)
 {
+	pid_t pid = getpid();
 	struct augmented_args_payload *augmented_args;
+
 	/*
 	 * We start len, the amount of data that will be in the perf ring
 	 * buffer, if this is not filtered out by one of pid_filter__has(),
@@ -537,7 +593,10 @@ int sys_enter(struct syscall_enter_args *args)
 	 * initial, non-augmented raw_syscalls:sys_enter payload.
 	 */
 
-	if (pid_filter__has(&pids_filtered, getpid()))
+	if (pid_filter__has(&pids_filtered, pid))
+		return 0;
+
+	if (!sample_filter__allow_enter(bpf_ktime_get_ns(), pid))
 		return 0;
 
 	augmented_args = augmented_args_payload();
@@ -561,9 +620,13 @@ int sys_enter(struct syscall_enter_args *args)
 SEC("tp/raw_syscalls/sys_exit")
 int sys_exit(struct syscall_exit_args *args)
 {
+	pid_t pid = getpid();
 	struct syscall_exit_args exit_args;
 
-	if (pid_filter__has(&pids_filtered, getpid()))
+	if (pid_filter__has(&pids_filtered, pid))
+		return 0;
+
+	if (!sample_filter__allow_exit(pid))
 		return 0;
 
 	bpf_probe_read_kernel(&exit_args, sizeof(exit_args), args);
-- 
2.47.0.338.g60cca15819-goog


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [RFC/PATCH] perf trace: Add --syscall-period option
  2024-12-11 22:21 [RFC/PATCH] perf trace: Add --syscall-period option Namhyung Kim
@ 2024-12-11 22:52 ` Arnaldo Carvalho de Melo
  2024-12-12  1:34   ` Ian Rogers
  2024-12-18  5:40 ` Howard Chu
  1 sibling, 1 reply; 8+ messages in thread
From: Arnaldo Carvalho de Melo @ 2024-12-11 22:52 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Ian Rogers, Kan Liang, Jiri Olsa, Adrian Hunter, Peter Zijlstra,
	Ingo Molnar, LKML, linux-perf-users, Howard Chu

On Wed, Dec 11, 2024 at 02:21:10PM -0800, Namhyung Kim wrote:
> This option is to implement the event sampling for system calls.
> The period is given in msec as it shows the time in msec.
> 
>   # perf trace -C 0 --syscall-period 100 sleep 1
>            ? (         ): fleetspeakd/1828559  ... [continued]: futex())                                            = -1 ETIMEDOUT (Connection timed out)
>        0.050 (100.247 ms): gnome-shell/572531 recvmsg(fd: 10<socket:[3355761]>, msg: 0x7ffef8b39d20)                = 40
>      100.357 (100.149 ms): pipewire-pulse/572245 read(fd: 5<anon_inode:[eventfd]>, buf: 0x7ffc0b9dc8f0, count: 8)      = 8
>      200.553 (100.268 ms): NetworkManager/3424 epoll_wait(epfd: 19<anon_inode:[eventpoll]>, events: 0x5607b85bb880, maxevents: 6) = 0
>      300.876 (         ): mon/4932 poll(ufds: 0x7fa392784df0, nfds: 1, timeout_msecs: 100)            ...
>      400.901 ( 0.025 ms): TaskCon~ller #/620145 futex(uaddr: 0x7f3fc596fa00, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
>      300.876 (100.123 ms): mon/4932  ... [continued]: poll())                                             = 0 (Timeout)
>      500.901 ( 0.012 ms): evdefer/2/2335122 futex(uaddr: 0x5640baac5198, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
>      602.701 ( 0.017 ms): Compositor/1992200 futex(uaddr: 0x7f1a51dfdd40, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
>      705.589 ( 0.017 ms): JS Watchdog/947933 futex(uaddr: 0x7f4cac1d4240, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
>      812.667 ( 0.027 ms): fix/1985151 futex(uaddr: 0xc0008f7148, op: WAKE|PRIVATE_FLAG, val: 1)             = 1
>      912.807 ( 0.017 ms): Xorg/572315 setitimer(value: 0x7ffc375d6ba0)                                      = 0
> 
> The timestamp is kept in a per-cpu array and the allowed task is saved
> in a hash map.

Interesting concept, and one that is done just on the BPF part, so I
think we should at least warn a user that is running this on a build
without BPF skels.

Will try it tomorrow,

- Arnaldo
 
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> ---
>  tools/perf/Documentation/perf-trace.txt       |  6 ++
>  tools/perf/builtin-trace.c                    |  5 ++
>  .../bpf_skel/augmented_raw_syscalls.bpf.c     | 67 ++++++++++++++++++-
>  3 files changed, 76 insertions(+), 2 deletions(-)
> 
> diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
> index 6e0cc50bbc13fc7f..9f338a8c5357a67e 100644
> --- a/tools/perf/Documentation/perf-trace.txt
> +++ b/tools/perf/Documentation/perf-trace.txt
> @@ -241,6 +241,12 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
>  	printing using the existing 'perf trace' syscall arg beautifiers to map integer
>  	arguments to strings (pid to comm, syscall id to syscall name, etc).
>  
> +--syscall-period::
> +	Trace a system call in the given period (in msec).  This implements
> +	sampling for syscalls in order to reduce the monitoring overhead.
> +	For example, setting the sysall period to 100 (msec) means it will
> +	sample a syscall and next one after 100 msec.
> +
>  
>  PAGEFAULTS
>  ----------
> diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
> index 3c46de1a8d79bfe6..789eb0ffd5f90b61 100644
> --- a/tools/perf/builtin-trace.c
> +++ b/tools/perf/builtin-trace.c
> @@ -185,6 +185,7 @@ struct trace {
>  	} stats;
>  	unsigned int		max_stack;
>  	unsigned int		min_stack;
> +	unsigned long		sample_period_ms;
>  	int			raw_augmented_syscalls_args_size;
>  	bool			raw_augmented_syscalls;
>  	bool			fd_path_disabled;
> @@ -5219,6 +5220,7 @@ int cmd_trace(int argc, const char **argv)
>  		     "start"),
>  	OPT_BOOLEAN(0, "force-btf", &trace.force_btf, "Prefer btf_dump general pretty printer"
>  		       "to customized ones"),
> +	OPT_ULONG(0, "syscall-period", &trace.sample_period_ms, "syscall sampling period in ms"),
>  	OPTS_EVSWITCH(&trace.evswitch),
>  	OPT_END()
>  	};
> @@ -5326,6 +5328,9 @@ int cmd_trace(int argc, const char **argv)
>  				bpf_program__set_autoattach(prog, /*autoattach=*/false);
>  		}
>  
> +		if (trace.sample_period_ms)
> +			trace.skel->rodata->sample_period = trace.sample_period_ms * NSEC_PER_MSEC;
> +
>  		err = augmented_raw_syscalls_bpf__load(trace.skel);
>  
>  		if (err < 0) {
> diff --git a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
> index 4a62ed593e84edf8..12272620dcd73700 100644
> --- a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
> +++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
> @@ -113,6 +113,22 @@ struct pids_filtered {
>  	__uint(max_entries, 64);
>  } pids_filtered SEC(".maps");
>  
> +struct sample_timestamp {
> +	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
> +	__type(key, int);
> +	__type(value, __u64);
> +	__uint(max_entries, 1);
> +} sample_timestamp SEC(".maps");
> +
> +struct sample_filtered {
> +	__uint(type, BPF_MAP_TYPE_HASH);
> +	__type(key, pid_t);
> +	__type(value, bool);
> +	__uint(max_entries, MAX_CPUS);
> +} sample_filtered SEC(".maps");
> +
> +const volatile __u64 sample_period;
> +
>  struct augmented_args_payload {
>  	struct syscall_enter_args args;
>  	struct augmented_arg arg, arg2; // We have to reserve space for two arguments (rename, etc)
> @@ -428,6 +444,44 @@ static bool pid_filter__has(struct pids_filtered *pids, pid_t pid)
>  	return bpf_map_lookup_elem(pids, &pid) != NULL;
>  }
>  
> +static bool sample_filter__allow_enter(__u64 timestamp, pid_t pid)
> +{
> +	int idx = 0;
> +	__u64 *prev_ts;
> +	bool ok = true;
> +
> +	/* default behavior */
> +	if (sample_period == 0)
> +		return true;
> +
> +	prev_ts = bpf_map_lookup_elem(&sample_timestamp, &idx);
> +
> +	if (prev_ts) {
> +		if ((*prev_ts + sample_period) > timestamp)
> +			return false;
> +		*prev_ts = timestamp;
> +	} else {
> +		bpf_map_update_elem(&sample_timestamp, &idx, &timestamp, BPF_ANY);
> +	}
> +
> +	bpf_map_update_elem(&sample_filtered, &pid, &ok, BPF_ANY);
> +
> +	return true;
> +}
> +
> +static bool sample_filter__allow_exit(pid_t pid)
> +{
> +	/* default behavior */
> +	if (sample_period == 0)
> +		return true;
> +
> +	if (!bpf_map_lookup_elem(&sample_filtered, &pid))
> +		return false;
> +
> +	bpf_map_delete_elem(&sample_filtered, &pid);
> +	return true;
> +}
> +
>  static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
>  {
>  	bool augmented, do_output = false;
> @@ -526,7 +580,9 @@ static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
>  SEC("tp/raw_syscalls/sys_enter")
>  int sys_enter(struct syscall_enter_args *args)
>  {
> +	pid_t pid = getpid();
>  	struct augmented_args_payload *augmented_args;
> +
>  	/*
>  	 * We start len, the amount of data that will be in the perf ring
>  	 * buffer, if this is not filtered out by one of pid_filter__has(),
> @@ -537,7 +593,10 @@ int sys_enter(struct syscall_enter_args *args)
>  	 * initial, non-augmented raw_syscalls:sys_enter payload.
>  	 */
>  
> -	if (pid_filter__has(&pids_filtered, getpid()))
> +	if (pid_filter__has(&pids_filtered, pid))
> +		return 0;
> +
> +	if (!sample_filter__allow_enter(bpf_ktime_get_ns(), pid))
>  		return 0;
>  
>  	augmented_args = augmented_args_payload();
> @@ -561,9 +620,13 @@ int sys_enter(struct syscall_enter_args *args)
>  SEC("tp/raw_syscalls/sys_exit")
>  int sys_exit(struct syscall_exit_args *args)
>  {
> +	pid_t pid = getpid();
>  	struct syscall_exit_args exit_args;
>  
> -	if (pid_filter__has(&pids_filtered, getpid()))
> +	if (pid_filter__has(&pids_filtered, pid))
> +		return 0;
> +
> +	if (!sample_filter__allow_exit(pid))
>  		return 0;
>  
>  	bpf_probe_read_kernel(&exit_args, sizeof(exit_args), args);
> -- 
> 2.47.0.338.g60cca15819-goog

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC/PATCH] perf trace: Add --syscall-period option
  2024-12-11 22:52 ` Arnaldo Carvalho de Melo
@ 2024-12-12  1:34   ` Ian Rogers
  2024-12-12  1:47     ` Howard Chu
  0 siblings, 1 reply; 8+ messages in thread
From: Ian Rogers @ 2024-12-12  1:34 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: Namhyung Kim, Kan Liang, Jiri Olsa, Adrian Hunter, Peter Zijlstra,
	Ingo Molnar, LKML, linux-perf-users, Howard Chu

On Wed, Dec 11, 2024 at 2:52 PM Arnaldo Carvalho de Melo
<acme@kernel.org> wrote:
>
> On Wed, Dec 11, 2024 at 02:21:10PM -0800, Namhyung Kim wrote:
> > This option is to implement the event sampling for system calls.
> > The period is given in msec as it shows the time in msec.
> >
> >   # perf trace -C 0 --syscall-period 100 sleep 1
> >            ? (         ): fleetspeakd/1828559  ... [continued]: futex())                                            = -1 ETIMEDOUT (Connection timed out)
> >        0.050 (100.247 ms): gnome-shell/572531 recvmsg(fd: 10<socket:[3355761]>, msg: 0x7ffef8b39d20)                = 40
> >      100.357 (100.149 ms): pipewire-pulse/572245 read(fd: 5<anon_inode:[eventfd]>, buf: 0x7ffc0b9dc8f0, count: 8)      = 8
> >      200.553 (100.268 ms): NetworkManager/3424 epoll_wait(epfd: 19<anon_inode:[eventpoll]>, events: 0x5607b85bb880, maxevents: 6) = 0
> >      300.876 (         ): mon/4932 poll(ufds: 0x7fa392784df0, nfds: 1, timeout_msecs: 100)            ...
> >      400.901 ( 0.025 ms): TaskCon~ller #/620145 futex(uaddr: 0x7f3fc596fa00, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
> >      300.876 (100.123 ms): mon/4932  ... [continued]: poll())                                             = 0 (Timeout)
> >      500.901 ( 0.012 ms): evdefer/2/2335122 futex(uaddr: 0x5640baac5198, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
> >      602.701 ( 0.017 ms): Compositor/1992200 futex(uaddr: 0x7f1a51dfdd40, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
> >      705.589 ( 0.017 ms): JS Watchdog/947933 futex(uaddr: 0x7f4cac1d4240, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
> >      812.667 ( 0.027 ms): fix/1985151 futex(uaddr: 0xc0008f7148, op: WAKE|PRIVATE_FLAG, val: 1)             = 1
> >      912.807 ( 0.017 ms): Xorg/572315 setitimer(value: 0x7ffc375d6ba0)                                      = 0
> >
> > The timestamp is kept in a per-cpu array and the allowed task is saved
> > in a hash map.
>
> Interesting concept, and one that is done just on the BPF part, so I
> think we should at least warn a user that is running this on a build
> without BPF skels.
>
> Will try it tomorrow,
>
> - Arnaldo
>
> > Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> > ---
> >  tools/perf/Documentation/perf-trace.txt       |  6 ++
> >  tools/perf/builtin-trace.c                    |  5 ++
> >  .../bpf_skel/augmented_raw_syscalls.bpf.c     | 67 ++++++++++++++++++-
> >  3 files changed, 76 insertions(+), 2 deletions(-)
> >
> > diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
> > index 6e0cc50bbc13fc7f..9f338a8c5357a67e 100644
> > --- a/tools/perf/Documentation/perf-trace.txt
> > +++ b/tools/perf/Documentation/perf-trace.txt
> > @@ -241,6 +241,12 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
> >       printing using the existing 'perf trace' syscall arg beautifiers to map integer
> >       arguments to strings (pid to comm, syscall id to syscall name, etc).
> >
> > +--syscall-period::
> > +     Trace a system call in the given period (in msec).  This implements
> > +     sampling for syscalls in order to reduce the monitoring overhead.
> > +     For example, setting the sysall period to 100 (msec) means it will
> > +     sample a syscall and next one after 100 msec.

Is this similar to the -D/--delay option to perf-record? Perhaps share the name?

> > +
> >
> >  PAGEFAULTS
> >  ----------
> > diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
> > index 3c46de1a8d79bfe6..789eb0ffd5f90b61 100644
> > --- a/tools/perf/builtin-trace.c
> > +++ b/tools/perf/builtin-trace.c
> > @@ -185,6 +185,7 @@ struct trace {
> >       } stats;
> >       unsigned int            max_stack;
> >       unsigned int            min_stack;
> > +     unsigned long           sample_period_ms;
> >       int                     raw_augmented_syscalls_args_size;
> >       bool                    raw_augmented_syscalls;
> >       bool                    fd_path_disabled;
> > @@ -5219,6 +5220,7 @@ int cmd_trace(int argc, const char **argv)
> >                    "start"),
> >       OPT_BOOLEAN(0, "force-btf", &trace.force_btf, "Prefer btf_dump general pretty printer"
> >                      "to customized ones"),
> > +     OPT_ULONG(0, "syscall-period", &trace.sample_period_ms, "syscall sampling period in ms"),
> >       OPTS_EVSWITCH(&trace.evswitch),
> >       OPT_END()
> >       };
> > @@ -5326,6 +5328,9 @@ int cmd_trace(int argc, const char **argv)
> >                               bpf_program__set_autoattach(prog, /*autoattach=*/false);
> >               }
> >
> > +             if (trace.sample_period_ms)
> > +                     trace.skel->rodata->sample_period = trace.sample_period_ms * NSEC_PER_MSEC;
> > +
> >               err = augmented_raw_syscalls_bpf__load(trace.skel);
> >
> >               if (err < 0) {
> > diff --git a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
> > index 4a62ed593e84edf8..12272620dcd73700 100644
> > --- a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
> > +++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
> > @@ -113,6 +113,22 @@ struct pids_filtered {
> >       __uint(max_entries, 64);
> >  } pids_filtered SEC(".maps");
> >
> > +struct sample_timestamp {
> > +     __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
> > +     __type(key, int);
> > +     __type(value, __u64);
> > +     __uint(max_entries, 1);
> > +} sample_timestamp SEC(".maps");
> > +
> > +struct sample_filtered {
> > +     __uint(type, BPF_MAP_TYPE_HASH);
> > +     __type(key, pid_t);
> > +     __type(value, bool);
> > +     __uint(max_entries, MAX_CPUS);
> > +} sample_filtered SEC(".maps");
> > +
> > +const volatile __u64 sample_period;

nit: sample_period -> sample_period_ns

Thanks,
Ian

> > +
> >  struct augmented_args_payload {
> >       struct syscall_enter_args args;
> >       struct augmented_arg arg, arg2; // We have to reserve space for two arguments (rename, etc)
> > @@ -428,6 +444,44 @@ static bool pid_filter__has(struct pids_filtered *pids, pid_t pid)
> >       return bpf_map_lookup_elem(pids, &pid) != NULL;
> >  }
> >
> > +static bool sample_filter__allow_enter(__u64 timestamp, pid_t pid)
> > +{
> > +     int idx = 0;
> > +     __u64 *prev_ts;
> > +     bool ok = true;
> > +
> > +     /* default behavior */
> > +     if (sample_period == 0)
> > +             return true;
> > +
> > +     prev_ts = bpf_map_lookup_elem(&sample_timestamp, &idx);
> > +
> > +     if (prev_ts) {
> > +             if ((*prev_ts + sample_period) > timestamp)
> > +                     return false;
> > +             *prev_ts = timestamp;
> > +     } else {
> > +             bpf_map_update_elem(&sample_timestamp, &idx, &timestamp, BPF_ANY);
> > +     }
> > +
> > +     bpf_map_update_elem(&sample_filtered, &pid, &ok, BPF_ANY);
> > +
> > +     return true;
> > +}
> > +
> > +static bool sample_filter__allow_exit(pid_t pid)
> > +{
> > +     /* default behavior */
> > +     if (sample_period == 0)
> > +             return true;
> > +
> > +     if (!bpf_map_lookup_elem(&sample_filtered, &pid))
> > +             return false;
> > +
> > +     bpf_map_delete_elem(&sample_filtered, &pid);
> > +     return true;
> > +}
> > +
> >  static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
> >  {
> >       bool augmented, do_output = false;
> > @@ -526,7 +580,9 @@ static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
> >  SEC("tp/raw_syscalls/sys_enter")
> >  int sys_enter(struct syscall_enter_args *args)
> >  {
> > +     pid_t pid = getpid();
> >       struct augmented_args_payload *augmented_args;
> > +
> >       /*
> >        * We start len, the amount of data that will be in the perf ring
> >        * buffer, if this is not filtered out by one of pid_filter__has(),
> > @@ -537,7 +593,10 @@ int sys_enter(struct syscall_enter_args *args)
> >        * initial, non-augmented raw_syscalls:sys_enter payload.
> >        */
> >
> > -     if (pid_filter__has(&pids_filtered, getpid()))
> > +     if (pid_filter__has(&pids_filtered, pid))
> > +             return 0;
> > +
> > +     if (!sample_filter__allow_enter(bpf_ktime_get_ns(), pid))
> >               return 0;
> >
> >       augmented_args = augmented_args_payload();
> > @@ -561,9 +620,13 @@ int sys_enter(struct syscall_enter_args *args)
> >  SEC("tp/raw_syscalls/sys_exit")
> >  int sys_exit(struct syscall_exit_args *args)
> >  {
> > +     pid_t pid = getpid();
> >       struct syscall_exit_args exit_args;
> >
> > -     if (pid_filter__has(&pids_filtered, getpid()))
> > +     if (pid_filter__has(&pids_filtered, pid))
> > +             return 0;
> > +
> > +     if (!sample_filter__allow_exit(pid))
> >               return 0;
> >
> >       bpf_probe_read_kernel(&exit_args, sizeof(exit_args), args);
> > --
> > 2.47.0.338.g60cca15819-goog

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC/PATCH] perf trace: Add --syscall-period option
  2024-12-12  1:34   ` Ian Rogers
@ 2024-12-12  1:47     ` Howard Chu
  2024-12-12  4:28       ` Ian Rogers
  0 siblings, 1 reply; 8+ messages in thread
From: Howard Chu @ 2024-12-12  1:47 UTC (permalink / raw)
  To: Ian Rogers
  Cc: Arnaldo Carvalho de Melo, Namhyung Kim, Kan Liang, Jiri Olsa,
	Adrian Hunter, Peter Zijlstra, Ingo Molnar, LKML,
	linux-perf-users

Hello Ian,

On Wed, Dec 11, 2024 at 5:34 PM Ian Rogers <irogers@google.com> wrote:
>
> On Wed, Dec 11, 2024 at 2:52 PM Arnaldo Carvalho de Melo
> <acme@kernel.org> wrote:
> >
> > On Wed, Dec 11, 2024 at 02:21:10PM -0800, Namhyung Kim wrote:
> > > This option is to implement the event sampling for system calls.
> > > The period is given in msec as it shows the time in msec.
> > >
> > >   # perf trace -C 0 --syscall-period 100 sleep 1
> > >            ? (         ): fleetspeakd/1828559  ... [continued]: futex())                                            = -1 ETIMEDOUT (Connection timed out)
> > >        0.050 (100.247 ms): gnome-shell/572531 recvmsg(fd: 10<socket:[3355761]>, msg: 0x7ffef8b39d20)                = 40
> > >      100.357 (100.149 ms): pipewire-pulse/572245 read(fd: 5<anon_inode:[eventfd]>, buf: 0x7ffc0b9dc8f0, count: 8)      = 8
> > >      200.553 (100.268 ms): NetworkManager/3424 epoll_wait(epfd: 19<anon_inode:[eventpoll]>, events: 0x5607b85bb880, maxevents: 6) = 0
> > >      300.876 (         ): mon/4932 poll(ufds: 0x7fa392784df0, nfds: 1, timeout_msecs: 100)            ...
> > >      400.901 ( 0.025 ms): TaskCon~ller #/620145 futex(uaddr: 0x7f3fc596fa00, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
> > >      300.876 (100.123 ms): mon/4932  ... [continued]: poll())                                             = 0 (Timeout)
> > >      500.901 ( 0.012 ms): evdefer/2/2335122 futex(uaddr: 0x5640baac5198, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
> > >      602.701 ( 0.017 ms): Compositor/1992200 futex(uaddr: 0x7f1a51dfdd40, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
> > >      705.589 ( 0.017 ms): JS Watchdog/947933 futex(uaddr: 0x7f4cac1d4240, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
> > >      812.667 ( 0.027 ms): fix/1985151 futex(uaddr: 0xc0008f7148, op: WAKE|PRIVATE_FLAG, val: 1)             = 1
> > >      912.807 ( 0.017 ms): Xorg/572315 setitimer(value: 0x7ffc375d6ba0)                                      = 0
> > >
> > > The timestamp is kept in a per-cpu array and the allowed task is saved
> > > in a hash map.
> >
> > Interesting concept, and one that is done just on the BPF part, so I
> > think we should at least warn a user that is running this on a build
> > without BPF skels.
> >
> > Will try it tomorrow,
> >
> > - Arnaldo
> >
> > > Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> > > ---
> > >  tools/perf/Documentation/perf-trace.txt       |  6 ++
> > >  tools/perf/builtin-trace.c                    |  5 ++
> > >  .../bpf_skel/augmented_raw_syscalls.bpf.c     | 67 ++++++++++++++++++-
> > >  3 files changed, 76 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
> > > index 6e0cc50bbc13fc7f..9f338a8c5357a67e 100644
> > > --- a/tools/perf/Documentation/perf-trace.txt
> > > +++ b/tools/perf/Documentation/perf-trace.txt
> > > @@ -241,6 +241,12 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
> > >       printing using the existing 'perf trace' syscall arg beautifiers to map integer
> > >       arguments to strings (pid to comm, syscall id to syscall name, etc).
> > >
> > > +--syscall-period::
> > > +     Trace a system call in the given period (in msec).  This implements
> > > +     sampling for syscalls in order to reduce the monitoring overhead.
> > > +     For example, setting the sysall period to 100 (msec) means it will
> > > +     sample a syscall and next one after 100 msec.
>
> Is this similar to the -D/--delay option to perf-record? Perhaps share the name?

I think this is more like a throttling for the perf trace output,
instead of a one-time delay or a range of enabled time like -D, as the
prev_ts here is continuously updated. This restricts the maximum
sampling frequency to 1 / syscall-period, meaning every syscall is
sampled just after the syscall-period has passed.

Thanks,
Howard

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC/PATCH] perf trace: Add --syscall-period option
  2024-12-12  1:47     ` Howard Chu
@ 2024-12-12  4:28       ` Ian Rogers
  2024-12-13  2:08         ` Namhyung Kim
  0 siblings, 1 reply; 8+ messages in thread
From: Ian Rogers @ 2024-12-12  4:28 UTC (permalink / raw)
  To: Howard Chu
  Cc: Arnaldo Carvalho de Melo, Namhyung Kim, Kan Liang, Jiri Olsa,
	Adrian Hunter, Peter Zijlstra, Ingo Molnar, LKML,
	linux-perf-users

On Wed, Dec 11, 2024 at 5:47 PM Howard Chu <howardchu95@gmail.com> wrote:
>
> Hello Ian,
>
> On Wed, Dec 11, 2024 at 5:34 PM Ian Rogers <irogers@google.com> wrote:
> >
> > On Wed, Dec 11, 2024 at 2:52 PM Arnaldo Carvalho de Melo
> > <acme@kernel.org> wrote:
> > >
> > > On Wed, Dec 11, 2024 at 02:21:10PM -0800, Namhyung Kim wrote:
> > > > This option is to implement the event sampling for system calls.
> > > > The period is given in msec as it shows the time in msec.
> > > >
> > > >   # perf trace -C 0 --syscall-period 100 sleep 1
> > > >            ? (         ): fleetspeakd/1828559  ... [continued]: futex())                                            = -1 ETIMEDOUT (Connection timed out)
> > > >        0.050 (100.247 ms): gnome-shell/572531 recvmsg(fd: 10<socket:[3355761]>, msg: 0x7ffef8b39d20)                = 40
> > > >      100.357 (100.149 ms): pipewire-pulse/572245 read(fd: 5<anon_inode:[eventfd]>, buf: 0x7ffc0b9dc8f0, count: 8)      = 8
> > > >      200.553 (100.268 ms): NetworkManager/3424 epoll_wait(epfd: 19<anon_inode:[eventpoll]>, events: 0x5607b85bb880, maxevents: 6) = 0
> > > >      300.876 (         ): mon/4932 poll(ufds: 0x7fa392784df0, nfds: 1, timeout_msecs: 100)            ...
> > > >      400.901 ( 0.025 ms): TaskCon~ller #/620145 futex(uaddr: 0x7f3fc596fa00, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
> > > >      300.876 (100.123 ms): mon/4932  ... [continued]: poll())                                             = 0 (Timeout)
> > > >      500.901 ( 0.012 ms): evdefer/2/2335122 futex(uaddr: 0x5640baac5198, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
> > > >      602.701 ( 0.017 ms): Compositor/1992200 futex(uaddr: 0x7f1a51dfdd40, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
> > > >      705.589 ( 0.017 ms): JS Watchdog/947933 futex(uaddr: 0x7f4cac1d4240, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
> > > >      812.667 ( 0.027 ms): fix/1985151 futex(uaddr: 0xc0008f7148, op: WAKE|PRIVATE_FLAG, val: 1)             = 1
> > > >      912.807 ( 0.017 ms): Xorg/572315 setitimer(value: 0x7ffc375d6ba0)                                      = 0
> > > >
> > > > The timestamp is kept in a per-cpu array and the allowed task is saved
> > > > in a hash map.
> > >
> > > Interesting concept, and one that is done just on the BPF part, so I
> > > think we should at least warn a user that is running this on a build
> > > without BPF skels.
> > >
> > > Will try it tomorrow,
> > >
> > > - Arnaldo
> > >
> > > > Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> > > > ---
> > > >  tools/perf/Documentation/perf-trace.txt       |  6 ++
> > > >  tools/perf/builtin-trace.c                    |  5 ++
> > > >  .../bpf_skel/augmented_raw_syscalls.bpf.c     | 67 ++++++++++++++++++-
> > > >  3 files changed, 76 insertions(+), 2 deletions(-)
> > > >
> > > > diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
> > > > index 6e0cc50bbc13fc7f..9f338a8c5357a67e 100644
> > > > --- a/tools/perf/Documentation/perf-trace.txt
> > > > +++ b/tools/perf/Documentation/perf-trace.txt
> > > > @@ -241,6 +241,12 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
> > > >       printing using the existing 'perf trace' syscall arg beautifiers to map integer
> > > >       arguments to strings (pid to comm, syscall id to syscall name, etc).
> > > >
> > > > +--syscall-period::
> > > > +     Trace a system call in the given period (in msec).  This implements
> > > > +     sampling for syscalls in order to reduce the monitoring overhead.
> > > > +     For example, setting the sysall period to 100 (msec) means it will
> > > > +     sample a syscall and next one after 100 msec.
> >
> > Is this similar to the -D/--delay option to perf-record? Perhaps share the name?
>
> I think this is more like a throttling for the perf trace output,
> instead of a one-time delay or a range of enabled time like -D, as the
> prev_ts here is continuously updated. This restricts the maximum
> sampling frequency to 1 / syscall-period, meaning every syscall is
> sampled just after the syscall-period has passed.

Thanks Howard, so it is some kind of "throttle" or "max-per" value.
I'd suggest rephrasing the text.

Thanks,
Ian

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC/PATCH] perf trace: Add --syscall-period option
  2024-12-12  4:28       ` Ian Rogers
@ 2024-12-13  2:08         ` Namhyung Kim
  0 siblings, 0 replies; 8+ messages in thread
From: Namhyung Kim @ 2024-12-13  2:08 UTC (permalink / raw)
  To: Ian Rogers
  Cc: Howard Chu, Arnaldo Carvalho de Melo, Kan Liang, Jiri Olsa,
	Adrian Hunter, Peter Zijlstra, Ingo Molnar, LKML,
	linux-perf-users

On Wed, Dec 11, 2024 at 08:28:33PM -0800, Ian Rogers wrote:
> On Wed, Dec 11, 2024 at 5:47 PM Howard Chu <howardchu95@gmail.com> wrote:
> >
> > Hello Ian,
> >
> > On Wed, Dec 11, 2024 at 5:34 PM Ian Rogers <irogers@google.com> wrote:
> > >
> > > On Wed, Dec 11, 2024 at 2:52 PM Arnaldo Carvalho de Melo
> > > <acme@kernel.org> wrote:
> > > >
> > > > On Wed, Dec 11, 2024 at 02:21:10PM -0800, Namhyung Kim wrote:
> > > > > This option is to implement the event sampling for system calls.
> > > > > The period is given in msec as it shows the time in msec.
> > > > >
> > > > >   # perf trace -C 0 --syscall-period 100 sleep 1
> > > > >            ? (         ): fleetspeakd/1828559  ... [continued]: futex())                                            = -1 ETIMEDOUT (Connection timed out)
> > > > >        0.050 (100.247 ms): gnome-shell/572531 recvmsg(fd: 10<socket:[3355761]>, msg: 0x7ffef8b39d20)                = 40
> > > > >      100.357 (100.149 ms): pipewire-pulse/572245 read(fd: 5<anon_inode:[eventfd]>, buf: 0x7ffc0b9dc8f0, count: 8)      = 8
> > > > >      200.553 (100.268 ms): NetworkManager/3424 epoll_wait(epfd: 19<anon_inode:[eventpoll]>, events: 0x5607b85bb880, maxevents: 6) = 0
> > > > >      300.876 (         ): mon/4932 poll(ufds: 0x7fa392784df0, nfds: 1, timeout_msecs: 100)            ...
> > > > >      400.901 ( 0.025 ms): TaskCon~ller #/620145 futex(uaddr: 0x7f3fc596fa00, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
> > > > >      300.876 (100.123 ms): mon/4932  ... [continued]: poll())                                             = 0 (Timeout)
> > > > >      500.901 ( 0.012 ms): evdefer/2/2335122 futex(uaddr: 0x5640baac5198, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
> > > > >      602.701 ( 0.017 ms): Compositor/1992200 futex(uaddr: 0x7f1a51dfdd40, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
> > > > >      705.589 ( 0.017 ms): JS Watchdog/947933 futex(uaddr: 0x7f4cac1d4240, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
> > > > >      812.667 ( 0.027 ms): fix/1985151 futex(uaddr: 0xc0008f7148, op: WAKE|PRIVATE_FLAG, val: 1)             = 1
> > > > >      912.807 ( 0.017 ms): Xorg/572315 setitimer(value: 0x7ffc375d6ba0)                                      = 0
> > > > >
> > > > > The timestamp is kept in a per-cpu array and the allowed task is saved
> > > > > in a hash map.
> > > >
> > > > Interesting concept, and one that is done just on the BPF part, so I
> > > > think we should at least warn a user that is running this on a build
> > > > without BPF skels.
> > > >
> > > > Will try it tomorrow,
> > > >
> > > > - Arnaldo
> > > >
> > > > > Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> > > > > ---
> > > > >  tools/perf/Documentation/perf-trace.txt       |  6 ++
> > > > >  tools/perf/builtin-trace.c                    |  5 ++
> > > > >  .../bpf_skel/augmented_raw_syscalls.bpf.c     | 67 ++++++++++++++++++-
> > > > >  3 files changed, 76 insertions(+), 2 deletions(-)
> > > > >
> > > > > diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
> > > > > index 6e0cc50bbc13fc7f..9f338a8c5357a67e 100644
> > > > > --- a/tools/perf/Documentation/perf-trace.txt
> > > > > +++ b/tools/perf/Documentation/perf-trace.txt
> > > > > @@ -241,6 +241,12 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
> > > > >       printing using the existing 'perf trace' syscall arg beautifiers to map integer
> > > > >       arguments to strings (pid to comm, syscall id to syscall name, etc).
> > > > >
> > > > > +--syscall-period::
> > > > > +     Trace a system call in the given period (in msec).  This implements
> > > > > +     sampling for syscalls in order to reduce the monitoring overhead.
> > > > > +     For example, setting the sysall period to 100 (msec) means it will
> > > > > +     sample a syscall and next one after 100 msec.
> > >
> > > Is this similar to the -D/--delay option to perf-record? Perhaps share the name?
> >
> > I think this is more like a throttling for the perf trace output,
> > instead of a one-time delay or a range of enabled time like -D, as the
> > prev_ts here is continuously updated. This restricts the maximum
> > sampling frequency to 1 / syscall-period, meaning every syscall is
> > sampled just after the syscall-period has passed.
> 
> Thanks Howard, so it is some kind of "throttle" or "max-per" value.
> I'd suggest rephrasing the text.

How about this?

--syscall-sample=<period>::
	Enable sampling of system calls with a given period in msec.
	The sampling frequency would be 1 / period, in other words,
	it will trace a system call only after the given period of
	time is passed.  The sampling period is tracked per CPU.

Thanks,
Namhyung


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC/PATCH] perf trace: Add --syscall-period option
  2024-12-11 22:21 [RFC/PATCH] perf trace: Add --syscall-period option Namhyung Kim
  2024-12-11 22:52 ` Arnaldo Carvalho de Melo
@ 2024-12-18  5:40 ` Howard Chu
  2024-12-19  5:37   ` Namhyung Kim
  1 sibling, 1 reply; 8+ messages in thread
From: Howard Chu @ 2024-12-18  5:40 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Arnaldo Carvalho de Melo, Ian Rogers, Kan Liang, Jiri Olsa,
	Adrian Hunter, Peter Zijlstra, Ingo Molnar, LKML,
	linux-perf-users

Hello Namhyung,

Thanks for the patch.

On Wed, Dec 11, 2024 at 2:21 PM Namhyung Kim <namhyung@kernel.org> wrote:
>
> This option is to implement the event sampling for system calls.
> The period is given in msec as it shows the time in msec.
>
>   # perf trace -C 0 --syscall-period 100 sleep 1
>            ? (         ): fleetspeakd/1828559  ... [continued]: futex())                                            = -1 ETIMEDOUT (Connection timed out)
>        0.050 (100.247 ms): gnome-shell/572531 recvmsg(fd: 10<socket:[3355761]>, msg: 0x7ffef8b39d20)                = 40
>      100.357 (100.149 ms): pipewire-pulse/572245 read(fd: 5<anon_inode:[eventfd]>, buf: 0x7ffc0b9dc8f0, count: 8)      = 8
>      200.553 (100.268 ms): NetworkManager/3424 epoll_wait(epfd: 19<anon_inode:[eventpoll]>, events: 0x5607b85bb880, maxevents: 6) = 0
>      300.876 (         ): mon/4932 poll(ufds: 0x7fa392784df0, nfds: 1, timeout_msecs: 100)            ...
>      400.901 ( 0.025 ms): TaskCon~ller #/620145 futex(uaddr: 0x7f3fc596fa00, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
>      300.876 (100.123 ms): mon/4932  ... [continued]: poll())                                             = 0 (Timeout)
>      500.901 ( 0.012 ms): evdefer/2/2335122 futex(uaddr: 0x5640baac5198, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
>      602.701 ( 0.017 ms): Compositor/1992200 futex(uaddr: 0x7f1a51dfdd40, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
>      705.589 ( 0.017 ms): JS Watchdog/947933 futex(uaddr: 0x7f4cac1d4240, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
>      812.667 ( 0.027 ms): fix/1985151 futex(uaddr: 0xc0008f7148, op: WAKE|PRIVATE_FLAG, val: 1)             = 1
>      912.807 ( 0.017 ms): Xorg/572315 setitimer(value: 0x7ffc375d6ba0)                                      = 0
>
> The timestamp is kept in a per-cpu array and the allowed task is saved
> in a hash map.
>
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> ---
>  tools/perf/Documentation/perf-trace.txt       |  6 ++
>  tools/perf/builtin-trace.c                    |  5 ++
>  .../bpf_skel/augmented_raw_syscalls.bpf.c     | 67 ++++++++++++++++++-
>  3 files changed, 76 insertions(+), 2 deletions(-)
>
> diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
> index 6e0cc50bbc13fc7f..9f338a8c5357a67e 100644
> --- a/tools/perf/Documentation/perf-trace.txt
> +++ b/tools/perf/Documentation/perf-trace.txt
> @@ -241,6 +241,12 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
>         printing using the existing 'perf trace' syscall arg beautifiers to map integer
>         arguments to strings (pid to comm, syscall id to syscall name, etc).
>
> +--syscall-period::
> +       Trace a system call in the given period (in msec).  This implements
> +       sampling for syscalls in order to reduce the monitoring overhead.
> +       For example, setting the sysall period to 100 (msec) means it will
> +       sample a syscall and next one after 100 msec.
> +
>
>  PAGEFAULTS
>  ----------
> diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
> index 3c46de1a8d79bfe6..789eb0ffd5f90b61 100644
> --- a/tools/perf/builtin-trace.c
> +++ b/tools/perf/builtin-trace.c
> @@ -185,6 +185,7 @@ struct trace {
>         } stats;
>         unsigned int            max_stack;
>         unsigned int            min_stack;
> +       unsigned long           sample_period_ms;
>         int                     raw_augmented_syscalls_args_size;
>         bool                    raw_augmented_syscalls;
>         bool                    fd_path_disabled;
> @@ -5219,6 +5220,7 @@ int cmd_trace(int argc, const char **argv)
>                      "start"),
>         OPT_BOOLEAN(0, "force-btf", &trace.force_btf, "Prefer btf_dump general pretty printer"
>                        "to customized ones"),
> +       OPT_ULONG(0, "syscall-period", &trace.sample_period_ms, "syscall sampling period in ms"),
>         OPTS_EVSWITCH(&trace.evswitch),
>         OPT_END()
>         };
> @@ -5326,6 +5328,9 @@ int cmd_trace(int argc, const char **argv)
>                                 bpf_program__set_autoattach(prog, /*autoattach=*/false);
>                 }
>
> +               if (trace.sample_period_ms)
> +                       trace.skel->rodata->sample_period = trace.sample_period_ms * NSEC_PER_MSEC;
> +
>                 err = augmented_raw_syscalls_bpf__load(trace.skel);
>
>                 if (err < 0) {
> diff --git a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
> index 4a62ed593e84edf8..12272620dcd73700 100644
> --- a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
> +++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
> @@ -113,6 +113,22 @@ struct pids_filtered {
>         __uint(max_entries, 64);
>  } pids_filtered SEC(".maps");
>
> +struct sample_timestamp {
> +       __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
> +       __type(key, int);
> +       __type(value, __u64);
> +       __uint(max_entries, 1);
> +} sample_timestamp SEC(".maps");
> +
> +struct sample_filtered {
> +       __uint(type, BPF_MAP_TYPE_HASH);
> +       __type(key, pid_t);
> +       __type(value, bool);
> +       __uint(max_entries, MAX_CPUS);
> +} sample_filtered SEC(".maps");

Do you think MAX_CPUS provides enough entries? If there are MAX_CPUS
syscalls that have entered but haven't exited, new syscalls cannot
write to the map, which results in the new syscall's sys_exit not
being collected. This can lead to trace logs like the following:

210.947 (         ): tmux: server/4399 poll(ufds: 0x579c819eac40,
nfds: 51, timeout_msecs: 9)

which doesn't have a duration, because perf trace doesn't know when it
exits. For example, a poll system call can take a long time to exit,
if there are MAX_CPUS number of polls, they will occupy all the space
in the sample_filtered map.

> +       if (prev_ts) {
> +               if ((*prev_ts + sample_period) > timestamp)
> +                       return false;
> +               *prev_ts = timestamp;
> +       } else {
> +               bpf_map_update_elem(&sample_timestamp, &idx, &timestamp, BPF_ANY);
> +       }
> +
> +       bpf_map_update_elem(&sample_filtered, &pid, &ok, BPF_ANY);

and during polling, when a new syscall enters, this
bpf_map_update_elem() will return an error and fail to insert, and in
sys_exit(), this:

> +       if (!bpf_map_lookup_elem(&sample_filtered, &pid))
> +               return false;

returns false, and creates a sample with no duration.

In order to eliminate samples with no duration, there should be no
more than MAX_CPUS simultaneous syscalls, but I haven't yet created a
scenario where there are MAX_CPUS polls at the same time. (such
creation needs some tricks, because --syscall-period=1 allows maximum
<number-of-cpus> number of syscalls every millisecond)

That said, I agree that MAX_CPUS (4096) is generally sufficient for
the sample_filtered map in most situations.

Aside from this edge case, this patch looks good to me, I tested it with:

perf trace --syscall-period=500

   544.863 ( 0.008 ms): sshd/869528 rt_sigprocmask(how: UNBLOCK, nset:
0x7fffef5f71f0, oset: 0x7fffef5f7270, sigsetsize: 8) = 0
   545.034 ( 0.007 ms): tmux: server/4399 ioctl(fd: 54</dev/ptmx>,
cmd: FIONREAD, arg: 0x7ffe020ec758)          = 0
   593.054 (100.214 ms): gnome-shell/2419 poll(ufds: 0x6236d31eb850,
nfds: 15, timeout_msecs: 100)              = 0 (Timeout)
  1073.251 ( 0.059 ms): gmain/38955 inotify_add_watch(fd:
4<anon_inode:inotify>, pathname:
"/var/lib/fwupd/metadata/lvfs-testing", mask: 16789454) = -1 ENOENT
(No such file or directory)
  1073.641 ( 0.014 ms): tmux: server/4399 ioctl(fd: 54</dev/ptmx>,
cmd: FIONREAD, arg: 0x7ffe020ec758)          = 0
  1074.588 ( 0.013 ms): sshd/869528 rt_sigprocmask(how: UNBLOCK, nset:
0x7fffef5f71f0, oset: 0x7fffef5f7270, sigsetsize: 8) = 0
  1094.651 (100.370 ms): gnome-shell/2419 poll(ufds: 0x6236d31eb850,
nfds: 15, timeout_msecs: 100)              = 0 (Timeout)
  1596.817 (100.363 ms): gnome-shell/2419 poll(ufds: 0x6236d31eb850,
nfds: 15, timeout_msecs: 100)

The timestamp makes sense for the 500 ms period.

Thanks,
Howard

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC/PATCH] perf trace: Add --syscall-period option
  2024-12-18  5:40 ` Howard Chu
@ 2024-12-19  5:37   ` Namhyung Kim
  0 siblings, 0 replies; 8+ messages in thread
From: Namhyung Kim @ 2024-12-19  5:37 UTC (permalink / raw)
  To: Howard Chu
  Cc: Arnaldo Carvalho de Melo, Ian Rogers, Kan Liang, Jiri Olsa,
	Adrian Hunter, Peter Zijlstra, Ingo Molnar, LKML,
	linux-perf-users

On Tue, Dec 17, 2024 at 09:40:21PM -0800, Howard Chu wrote:
> Hello Namhyung,
> 
> Thanks for the patch.
> 
> On Wed, Dec 11, 2024 at 2:21 PM Namhyung Kim <namhyung@kernel.org> wrote:
> >
> > This option is to implement the event sampling for system calls.
> > The period is given in msec as it shows the time in msec.
> >
> >   # perf trace -C 0 --syscall-period 100 sleep 1
> >            ? (         ): fleetspeakd/1828559  ... [continued]: futex())                                            = -1 ETIMEDOUT (Connection timed out)
> >        0.050 (100.247 ms): gnome-shell/572531 recvmsg(fd: 10<socket:[3355761]>, msg: 0x7ffef8b39d20)                = 40
> >      100.357 (100.149 ms): pipewire-pulse/572245 read(fd: 5<anon_inode:[eventfd]>, buf: 0x7ffc0b9dc8f0, count: 8)      = 8
> >      200.553 (100.268 ms): NetworkManager/3424 epoll_wait(epfd: 19<anon_inode:[eventpoll]>, events: 0x5607b85bb880, maxevents: 6) = 0
> >      300.876 (         ): mon/4932 poll(ufds: 0x7fa392784df0, nfds: 1, timeout_msecs: 100)            ...
> >      400.901 ( 0.025 ms): TaskCon~ller #/620145 futex(uaddr: 0x7f3fc596fa00, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
> >      300.876 (100.123 ms): mon/4932  ... [continued]: poll())                                             = 0 (Timeout)
> >      500.901 ( 0.012 ms): evdefer/2/2335122 futex(uaddr: 0x5640baac5198, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
> >      602.701 ( 0.017 ms): Compositor/1992200 futex(uaddr: 0x7f1a51dfdd40, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
> >      705.589 ( 0.017 ms): JS Watchdog/947933 futex(uaddr: 0x7f4cac1d4240, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
> >      812.667 ( 0.027 ms): fix/1985151 futex(uaddr: 0xc0008f7148, op: WAKE|PRIVATE_FLAG, val: 1)             = 1
> >      912.807 ( 0.017 ms): Xorg/572315 setitimer(value: 0x7ffc375d6ba0)                                      = 0
> >
> > The timestamp is kept in a per-cpu array and the allowed task is saved
> > in a hash map.
> >
> > Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> > ---
> >  tools/perf/Documentation/perf-trace.txt       |  6 ++
> >  tools/perf/builtin-trace.c                    |  5 ++
> >  .../bpf_skel/augmented_raw_syscalls.bpf.c     | 67 ++++++++++++++++++-
> >  3 files changed, 76 insertions(+), 2 deletions(-)
> >
> > diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
> > index 6e0cc50bbc13fc7f..9f338a8c5357a67e 100644
> > --- a/tools/perf/Documentation/perf-trace.txt
> > +++ b/tools/perf/Documentation/perf-trace.txt
> > @@ -241,6 +241,12 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
> >         printing using the existing 'perf trace' syscall arg beautifiers to map integer
> >         arguments to strings (pid to comm, syscall id to syscall name, etc).
> >
> > +--syscall-period::
> > +       Trace a system call in the given period (in msec).  This implements
> > +       sampling for syscalls in order to reduce the monitoring overhead.
> > +       For example, setting the sysall period to 100 (msec) means it will
> > +       sample a syscall and next one after 100 msec.
> > +
> >
> >  PAGEFAULTS
> >  ----------
> > diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
> > index 3c46de1a8d79bfe6..789eb0ffd5f90b61 100644
> > --- a/tools/perf/builtin-trace.c
> > +++ b/tools/perf/builtin-trace.c
> > @@ -185,6 +185,7 @@ struct trace {
> >         } stats;
> >         unsigned int            max_stack;
> >         unsigned int            min_stack;
> > +       unsigned long           sample_period_ms;
> >         int                     raw_augmented_syscalls_args_size;
> >         bool                    raw_augmented_syscalls;
> >         bool                    fd_path_disabled;
> > @@ -5219,6 +5220,7 @@ int cmd_trace(int argc, const char **argv)
> >                      "start"),
> >         OPT_BOOLEAN(0, "force-btf", &trace.force_btf, "Prefer btf_dump general pretty printer"
> >                        "to customized ones"),
> > +       OPT_ULONG(0, "syscall-period", &trace.sample_period_ms, "syscall sampling period in ms"),
> >         OPTS_EVSWITCH(&trace.evswitch),
> >         OPT_END()
> >         };
> > @@ -5326,6 +5328,9 @@ int cmd_trace(int argc, const char **argv)
> >                                 bpf_program__set_autoattach(prog, /*autoattach=*/false);
> >                 }
> >
> > +               if (trace.sample_period_ms)
> > +                       trace.skel->rodata->sample_period = trace.sample_period_ms * NSEC_PER_MSEC;
> > +
> >                 err = augmented_raw_syscalls_bpf__load(trace.skel);
> >
> >                 if (err < 0) {
> > diff --git a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
> > index 4a62ed593e84edf8..12272620dcd73700 100644
> > --- a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
> > +++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
> > @@ -113,6 +113,22 @@ struct pids_filtered {
> >         __uint(max_entries, 64);
> >  } pids_filtered SEC(".maps");
> >
> > +struct sample_timestamp {
> > +       __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
> > +       __type(key, int);
> > +       __type(value, __u64);
> > +       __uint(max_entries, 1);
> > +} sample_timestamp SEC(".maps");
> > +
> > +struct sample_filtered {
> > +       __uint(type, BPF_MAP_TYPE_HASH);
> > +       __type(key, pid_t);
> > +       __type(value, bool);
> > +       __uint(max_entries, MAX_CPUS);
> > +} sample_filtered SEC(".maps");
> 
> Do you think MAX_CPUS provides enough entries? If there are MAX_CPUS
> syscalls that have entered but haven't exited, new syscalls cannot
> write to the map, which results in the new syscall's sys_exit not
> being collected. This can lead to trace logs like the following:
> 
> 210.947 (         ): tmux: server/4399 poll(ufds: 0x579c819eac40,
> nfds: 51, timeout_msecs: 9)
> 
> which doesn't have a duration, because perf trace doesn't know when it
> exits. For example, a poll system call can take a long time to exit,
> if there are MAX_CPUS number of polls, they will occupy all the space
> in the sample_filtered map.
> 
> > +       if (prev_ts) {
> > +               if ((*prev_ts + sample_period) > timestamp)
> > +                       return false;
> > +               *prev_ts = timestamp;
> > +       } else {
> > +               bpf_map_update_elem(&sample_timestamp, &idx, &timestamp, BPF_ANY);
> > +       }
> > +
> > +       bpf_map_update_elem(&sample_filtered, &pid, &ok, BPF_ANY);
> 
> and during polling, when a new syscall enters, this
> bpf_map_update_elem() will return an error and fail to insert, and in
> sys_exit(), this:
> 
> > +       if (!bpf_map_lookup_elem(&sample_filtered, &pid))
> > +               return false;
> 
> returns false, and creates a sample with no duration.
> 
> In order to eliminate samples with no duration, there should be no
> more than MAX_CPUS simultaneous syscalls, but I haven't yet created a
> scenario where there are MAX_CPUS polls at the same time. (such
> creation needs some tricks, because --syscall-period=1 allows maximum
> <number-of-cpus> number of syscalls every millisecond)
> 
> That said, I agree that MAX_CPUS (4096) is generally sufficient for
> the sample_filtered map in most situations.

Right, usually the number of CPUs is far less than that, but I remember
there's a system with more than 2K CPUs.  I can add a command line
option to control the number of entries or change the default using the
actual number of CPUs in the system.

> 
> Aside from this edge case, this patch looks good to me, I tested it with:
> 
> perf trace --syscall-period=500
> 
>    544.863 ( 0.008 ms): sshd/869528 rt_sigprocmask(how: UNBLOCK, nset:
> 0x7fffef5f71f0, oset: 0x7fffef5f7270, sigsetsize: 8) = 0
>    545.034 ( 0.007 ms): tmux: server/4399 ioctl(fd: 54</dev/ptmx>,
> cmd: FIONREAD, arg: 0x7ffe020ec758)          = 0
>    593.054 (100.214 ms): gnome-shell/2419 poll(ufds: 0x6236d31eb850,
> nfds: 15, timeout_msecs: 100)              = 0 (Timeout)
>   1073.251 ( 0.059 ms): gmain/38955 inotify_add_watch(fd:
> 4<anon_inode:inotify>, pathname:
> "/var/lib/fwupd/metadata/lvfs-testing", mask: 16789454) = -1 ENOENT
> (No such file or directory)
>   1073.641 ( 0.014 ms): tmux: server/4399 ioctl(fd: 54</dev/ptmx>,
> cmd: FIONREAD, arg: 0x7ffe020ec758)          = 0
>   1074.588 ( 0.013 ms): sshd/869528 rt_sigprocmask(how: UNBLOCK, nset:
> 0x7fffef5f71f0, oset: 0x7fffef5f7270, sigsetsize: 8) = 0
>   1094.651 (100.370 ms): gnome-shell/2419 poll(ufds: 0x6236d31eb850,
> nfds: 15, timeout_msecs: 100)              = 0 (Timeout)
>   1596.817 (100.363 ms): gnome-shell/2419 poll(ufds: 0x6236d31eb850,
> nfds: 15, timeout_msecs: 100)
> 
> The timestamp makes sense for the 500 ms period.

Thanks for the review and test!
Namhyung


^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2024-12-19  5:37 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-12-11 22:21 [RFC/PATCH] perf trace: Add --syscall-period option Namhyung Kim
2024-12-11 22:52 ` Arnaldo Carvalho de Melo
2024-12-12  1:34   ` Ian Rogers
2024-12-12  1:47     ` Howard Chu
2024-12-12  4:28       ` Ian Rogers
2024-12-13  2:08         ` Namhyung Kim
2024-12-18  5:40 ` Howard Chu
2024-12-19  5:37   ` Namhyung Kim

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).