[PATCH v1 0/2] perf tool_pmu: Support enable/disable for tool PMU events

Linux Perf Users
 help / color / mirror / Atom feed

* [PATCH v1 0/2] perf tool_pmu: Support enable/disable for tool PMU events
@ 2026-05-18 18:39 Ian Rogers
  2026-05-18 18:39 ` [PATCH v1 1/2] perf tool_pmu: Make tool PMU events respect enable/disable Ian Rogers
                   ` (2 more replies)
  0 siblings, 3 replies; 20+ messages in thread
From: Ian Rogers @ 2026-05-18 18:39 UTC (permalink / raw)
  To: Francesco Nigro, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Namhyung Kim, Jiri Olsa, Ian Rogers,
	Adrian Hunter, James Clark, Thomas Richter, linux-perf-users,
	linux-kernel

A regression in perf stat was reported where tool PMU events (like
duration_time used in CPUs_utilized metric) incorrectly included the
delay period when using the delay option (-D).

This series fixes the regression by making tool PMU events
(duration_time, user_time, system_time) behave more like regular
counters by implementing proper enable and disable support. They now
correctly accumulate values only when enabled.

The first patch implements the core enable/disable support for tool PMU
events, and the second patch adds a shell test to verify that
duration_time correctly excludes the delay period.

Ian Rogers (2):
  perf tool_pmu: Make tool PMU events respect enable/disable
  perf tests: Add test for stat delay option with duration_time

 tools/perf/tests/shell/stat.sh |  33 +++++
 tools/perf/util/evsel.c        |  85 +++++++------
 tools/perf/util/evsel.h        |  10 +-
 tools/perf/util/tool_pmu.c     | 212 ++++++++++++++++++++++++---------
 tools/perf/util/tool_pmu.h     |   2 +
 5 files changed, 249 insertions(+), 93 deletions(-)

-- 
2.54.0.631.ge1b05301d1-goog

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH v1 1/2] perf tool_pmu: Make tool PMU events respect enable/disable
  2026-05-18 18:39 [PATCH v1 0/2] perf tool_pmu: Support enable/disable for tool PMU events Ian Rogers
@ 2026-05-18 18:39 ` Ian Rogers
  2026-05-18 19:26   ` sashiko-bot
  2026-05-18 18:39 ` [PATCH v1 2/2] perf tests: Add test for stat delay option with duration_time Ian Rogers
  2026-05-18 20:14 ` [PATCH v2 0/2] perf tool_pmu: Support enable/disable for tool PMU events Ian Rogers
  2 siblings, 1 reply; 20+ messages in thread
From: Ian Rogers @ 2026-05-18 18:39 UTC (permalink / raw)
  To: Francesco Nigro, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Namhyung Kim, Jiri Olsa, Ian Rogers,
	Adrian Hunter, James Clark, Thomas Richter, linux-perf-users,
	linux-kernel

Tool PMU events (duration_time, user_time, system_time) currently
count from when the event is opened to when it is read. This causes
issues with features like the delay option (-D) or control fd, where
events are opened but should not start counting immediately.

Make these events behave more like regular counters by implementing
proper enable and disable support. Add accumulated_time to struct
evsel to track time while enabled, and implement enable/disable CPU
callbacks to start/stop counting.

Fixes: b71f46a6a708 ("perf stat: Remove hard coded shadow metrics")
Reported-by: Francesco Nigro <nigro.fra@gmail.com>
Closes: https://lore.kernel.org/linux-perf-users/20260517093650.2540920-1-nigro.fra@gmail.com/
Signed-off-by: Ian Rogers <irogers@google.com>
Assisted-by: Antigravity:gemini-3-flash
---
 tools/perf/util/evsel.c    |  85 ++++++++-------
 tools/perf/util/evsel.h    |  10 +-
 tools/perf/util/tool_pmu.c | 212 +++++++++++++++++++++++++++----------
 tools/perf/util/tool_pmu.h |   2 +
 4 files changed, 216 insertions(+), 93 deletions(-)

diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 2ee87fd84d3e..3646156500f6 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -11,68 +11,71 @@
  */
 #define __SANE_USERSPACE_TYPES__
 
-#include <byteswap.h>
+#include "evsel.h"
+
 #include <errno.h>
 #include <inttypes.h>
+#include <stdlib.h>
+
+#include <dirent.h>
 #include <linux/bitops.h>
-#include <api/fs/fs.h>
-#include <api/fs/tracing_path.h>
-#include <linux/hw_breakpoint.h>
-#include <linux/perf_event.h>
 #include <linux/compiler.h>
+#include <linux/ctype.h>
 #include <linux/err.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/perf_event.h>
 #include <linux/zalloc.h>
 #include <sys/ioctl.h>
 #include <sys/resource.h>
 #include <sys/syscall.h>
 #include <sys/types.h>
-#include <dirent.h>
-#include <stdlib.h>
+
+#include <api/fs/fs.h>
+#include <api/fs/tracing_path.h>
+#include <byteswap.h>
+#include <internal/lib.h>
+#include <internal/threadmap.h>
+#include <internal/xyarray.h>
+#include <perf/cpumap.h>
 #include <perf/evsel.h>
+
+#include "../perf-sys.h"
 #include "asm/bug.h"
+#include "bpf-filter.h"
 #include "bpf_counter.h"
 #include "callchain.h"
 #include "cgroup.h"
 #include "counts.h"
+#include "debug.h"
+#include "drm_pmu.h"
 #include "dwarf-regs.h"
+#include "env.h"
 #include "event.h"
-#include "evsel.h"
-#include "time-utils.h"
-#include "util/env.h"
-#include "util/evsel_config.h"
-#include "util/evsel_fprintf.h"
 #include "evlist.h"
-#include <perf/cpumap.h>
-#include "thread_map.h"
-#include "target.h"
+#include "evsel_config.h"
+#include "evsel_fprintf.h"
+#include "hashmap.h"
+#include "hist.h"
+#include "hwmon_pmu.h"
+#include "intel-tpebs.h"
+#include "memswap.h"
+#include "off_cpu.h"
+#include "parse-branch-options.h"
 #include "perf_regs.h"
+#include "pmu.h"
+#include "pmus.h"
 #include "record.h"
-#include "debug.h"
-#include "trace-event.h"
+#include "rlimit.h"
 #include "session.h"
 #include "stat.h"
 #include "string2.h"
-#include "memswap.h"
-#include "util.h"
-#include "util/hashmap.h"
-#include "off_cpu.h"
-#include "pmu.h"
-#include "pmus.h"
-#include "drm_pmu.h"
-#include "hwmon_pmu.h"
+#include "target.h"
+#include "thread_map.h"
+#include "time-utils.h"
 #include "tool_pmu.h"
 #include "tp_pmu.h"
-#include "rlimit.h"
-#include "../perf-sys.h"
-#include "util/parse-branch-options.h"
-#include "util/bpf-filter.h"
-#include "util/hist.h"
-#include <internal/xyarray.h>
-#include <internal/lib.h>
-#include <internal/threadmap.h>
-#include "util/intel-tpebs.h"
-
-#include <linux/ctype.h>
+#include "trace-event.h"
+#include "util.h"
 
 #ifdef HAVE_LIBTRACEEVENT
 #include <event-parse.h>
@@ -1795,6 +1798,8 @@ int evsel__append_addr_filter(struct evsel *evsel, const char *filter)
 /* Caller has to clear disabled after going through all CPUs. */
 int evsel__enable_cpu(struct evsel *evsel, int cpu_map_idx)
 {
+	if (evsel__is_tool(evsel))
+		return evsel__tool_pmu_enable_cpu(evsel, cpu_map_idx);
 	return perf_evsel__enable_cpu(&evsel->core, cpu_map_idx);
 }
 
@@ -1810,6 +1815,8 @@ int evsel__enable(struct evsel *evsel)
 /* Caller has to set disabled after going through all CPUs. */
 int evsel__disable_cpu(struct evsel *evsel, int cpu_map_idx)
 {
+	if (evsel__is_tool(evsel))
+		return evsel__tool_pmu_disable_cpu(evsel, cpu_map_idx);
 	return perf_evsel__disable_cpu(&evsel->core, cpu_map_idx);
 }
 
@@ -1885,8 +1892,10 @@ void evsel__exit(struct evsel *evsel)
 		evsel__priv_destructor(evsel->priv);
 	perf_evsel__object.fini(evsel);
 	if (evsel__tool_event(evsel) == TOOL_PMU__EVENT_SYSTEM_TIME ||
-	    evsel__tool_event(evsel) == TOOL_PMU__EVENT_USER_TIME)
-		xyarray__delete(evsel->start_times);
+	    evsel__tool_event(evsel) == TOOL_PMU__EVENT_USER_TIME) {
+		xyarray__delete(evsel->process_time.start_times);
+		xyarray__delete(evsel->process_time.accumulated_times);
+	}
 }
 
 void evsel__delete(struct evsel *evsel)
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 339b5c08a33d..0b6a6df0e4ef 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -190,12 +190,18 @@ struct evsel {
 			double max;
 		} retirement_latency;
 		/* duration_time is a single global time. */
-		__u64 start_time;
+		struct {
+			__u64 start_time;
+			__u64 accumulated_time;
+		} duration_time;
 		/*
 		 * user_time and system_time read an initial value potentially
 		 * per-CPU or per-pid.
 		 */
-		struct xyarray *start_times;
+		struct {
+			struct xyarray *start_times;
+			struct xyarray *accumulated_times;
+		} process_time;
 	};
 	/* Is the tool's fd for /proc/pid/stat or /proc/stat. */
 	bool pid_stat;
diff --git a/tools/perf/util/tool_pmu.c b/tools/perf/util/tool_pmu.c
index 6a9df3dc0e07..cfefd97cb95b 100644
--- a/tools/perf/util/tool_pmu.c
+++ b/tools/perf/util/tool_pmu.c
@@ -205,20 +205,57 @@ int evsel__tool_pmu_prepare_open(struct evsel *evsel,
 				 struct perf_cpu_map *cpus,
 				 int nthreads)
 {
-	if ((evsel__tool_event(evsel) == TOOL_PMU__EVENT_SYSTEM_TIME ||
-	     evsel__tool_event(evsel) == TOOL_PMU__EVENT_USER_TIME) &&
-	    !evsel->start_times) {
-		evsel->start_times = xyarray__new(perf_cpu_map__nr(cpus),
-						  nthreads,
-						  sizeof(__u64));
-		if (!evsel->start_times)
-			return -ENOMEM;
+	enum tool_pmu_event ev = evsel__tool_event(evsel);
+
+	if (ev == TOOL_PMU__EVENT_SYSTEM_TIME || ev == TOOL_PMU__EVENT_USER_TIME) {
+		if (!evsel->process_time.start_times) {
+			evsel->process_time.start_times =
+				xyarray__new(perf_cpu_map__nr(cpus), nthreads, sizeof(__u64));
+			if (!evsel->process_time.start_times)
+				return -ENOMEM;
+		}
+		if (!evsel->process_time.accumulated_times) {
+			evsel->process_time.accumulated_times =
+				xyarray__new(perf_cpu_map__nr(cpus), nthreads, sizeof(__u64));
+			if (!evsel->process_time.accumulated_times)
+				return -ENOMEM;
+		}
 	}
 	return 0;
 }
 
 #define FD(e, x, y) (*(int *)xyarray__entry(e->core.fd, x, y))
 
+static int tool_pmu__read_stat(struct evsel *evsel, int cpu_map_idx, int thread, __u64 *val)
+{
+	enum tool_pmu_event ev = evsel__tool_event(evsel);
+	bool system = ev == TOOL_PMU__EVENT_SYSTEM_TIME;
+	int fd = FD(evsel, cpu_map_idx, thread);
+	int err = 0;
+
+	if (fd < 0) {
+		*val = 0;
+		return 0;
+	}
+
+	lseek(fd, SEEK_SET, 0);
+	if (evsel->pid_stat) {
+		if (cpu_map_idx == 0)
+			err = read_pid_stat_field(fd, system ? 15 : 14, val);
+		else
+			*val = 0;
+	} else {
+		if (thread == 0) {
+			struct perf_cpu cpu = perf_cpu_map__cpu(evsel->core.cpus, cpu_map_idx);
+
+			err = read_stat_field(fd, cpu, system ? 3 : 1, val);
+		} else {
+			*val = 0;
+		}
+	}
+	return err;
+}
+
 int evsel__tool_pmu_open(struct evsel *evsel,
 			 struct perf_thread_map *threads,
 			 int start_cpu_map_idx, int end_cpu_map_idx)
@@ -232,7 +269,14 @@ int evsel__tool_pmu_open(struct evsel *evsel,
 	if (ev == TOOL_PMU__EVENT_DURATION_TIME) {
 		if (evsel->core.attr.sample_period) /* no sampling */
 			return -EINVAL;
-		evsel->start_time = rdclock();
+		evsel->duration_time.accumulated_time = 0;
+		if (evsel->core.attr.disabled) {
+			evsel->disabled = true;
+			evsel->duration_time.start_time = 0;
+		} else {
+			evsel->disabled = false;
+			evsel->duration_time.start_time = rdclock();
+		}
 		return 0;
 	}
 
@@ -246,8 +290,8 @@ int evsel__tool_pmu_open(struct evsel *evsel,
 				pid = perf_thread_map__pid(threads, thread);
 
 			if (ev == TOOL_PMU__EVENT_USER_TIME || ev == TOOL_PMU__EVENT_SYSTEM_TIME) {
-				bool system = ev == TOOL_PMU__EVENT_SYSTEM_TIME;
 				__u64 *start_time = NULL;
+				__u64 *accumulated_time = NULL;
 				int fd;
 
 				if (evsel->core.attr.sample_period) {
@@ -269,21 +313,22 @@ int evsel__tool_pmu_open(struct evsel *evsel,
 					err = -errno;
 					goto out_close;
 				}
-				start_time = xyarray__entry(evsel->start_times, idx, thread);
-				if (pid > -1) {
-					err = read_pid_stat_field(fd, system ? 15 : 14,
-								  start_time);
+				start_time = xyarray__entry(evsel->process_time.start_times, idx,
+							    thread);
+				accumulated_time = xyarray__entry(
+					evsel->process_time.accumulated_times, idx, thread);
+				*accumulated_time = 0;
+
+				if (evsel->core.attr.disabled) {
+					evsel->disabled = true;
+					*start_time = 0;
 				} else {
-					struct perf_cpu cpu;
-
-					cpu = perf_cpu_map__cpu(evsel->core.cpus, idx);
-					err = read_stat_field(fd, cpu, system ? 3 : 1,
-							      start_time);
+					evsel->disabled = false;
+					err = tool_pmu__read_stat(evsel, idx, thread, start_time);
+					if (err)
+						goto out_close;
 				}
-				if (err)
-					goto out_close;
 			}
-
 		}
 	}
 	return 0;
@@ -467,10 +512,79 @@ static void perf_counts__update(struct perf_counts_values *count,
 		count->lost = 0;
 	}
 }
+int evsel__tool_pmu_enable_cpu(struct evsel *evsel, int cpu_map_idx)
+{
+	enum tool_pmu_event ev = evsel__tool_event(evsel);
+	int thread, nthreads;
+
+	if (!evsel->disabled)
+		return 0;
+
+	if (ev == TOOL_PMU__EVENT_DURATION_TIME) {
+		if (cpu_map_idx == 0) {
+			evsel->duration_time.start_time = rdclock();
+			evsel->disabled = false;
+		}
+		return 0;
+	}
+
+	if (ev == TOOL_PMU__EVENT_USER_TIME || ev == TOOL_PMU__EVENT_SYSTEM_TIME) {
+		nthreads = xyarray__max_y(evsel->process_time.start_times);
+		for (thread = 0; thread < nthreads; thread++) {
+			__u64 *start_time = xyarray__entry(evsel->process_time.start_times,
+							   cpu_map_idx, thread);
+			__u64 val;
+			int err;
+
+			err = tool_pmu__read_stat(evsel, cpu_map_idx, thread, &val);
+			if (!err)
+				*start_time = val;
+		}
+	}
+	return 0;
+}
+
+int evsel__tool_pmu_disable_cpu(struct evsel *evsel, int cpu_map_idx)
+{
+	enum tool_pmu_event ev = evsel__tool_event(evsel);
+	int thread, nthreads;
+
+	if (evsel->disabled)
+		return 0;
+
+	if (ev == TOOL_PMU__EVENT_DURATION_TIME) {
+		if (cpu_map_idx == 0) {
+			__u64 delta = rdclock() - evsel->duration_time.start_time;
+
+			evsel->duration_time.accumulated_time += delta;
+			evsel->disabled = true;
+		}
+		return 0;
+	}
+
+	if (ev == TOOL_PMU__EVENT_USER_TIME || ev == TOOL_PMU__EVENT_SYSTEM_TIME) {
+		nthreads = xyarray__max_y(evsel->process_time.start_times);
+		for (thread = 0; thread < nthreads; thread++) {
+			__u64 *start_time = xyarray__entry(evsel->process_time.start_times,
+							   cpu_map_idx, thread);
+			__u64 *accumulated_time = xyarray__entry(
+				evsel->process_time.accumulated_times, cpu_map_idx, thread);
+			__u64 val;
+			int err;
+
+			err = tool_pmu__read_stat(evsel, cpu_map_idx, thread, &val);
+			if (!err) {
+				if (val >= *start_time)
+					*accumulated_time += (val - *start_time);
+			}
+		}
+	}
+	return 0;
+}
 
 int evsel__tool_pmu_read(struct evsel *evsel, int cpu_map_idx, int thread)
 {
-	__u64 *start_time, cur_time, delta_start;
+	__u64 delta_start = 0;
 	int err = 0;
 	struct perf_counts_values *count, *old_count = NULL;
 	bool adjust = false;
@@ -507,39 +621,32 @@ int evsel__tool_pmu_read(struct evsel *evsel, int cpu_map_idx, int thread)
 		return 0;
 	}
 	case TOOL_PMU__EVENT_DURATION_TIME:
-		/*
-		 * Pretend duration_time is only on the first CPU and thread, or
-		 * else aggregation will scale duration_time by the number of
-		 * CPUs/threads.
-		 */
-		start_time = &evsel->start_time;
-		if (cpu_map_idx == 0 && thread == 0)
-			cur_time = rdclock();
-		else
-			cur_time = *start_time;
+		if (cpu_map_idx == 0 && thread == 0) {
+			delta_start = evsel->duration_time.accumulated_time;
+			if (!evsel->disabled)
+				delta_start += (rdclock() - evsel->duration_time.start_time);
+		} else {
+			delta_start = 0;
+		}
 		break;
 	case TOOL_PMU__EVENT_USER_TIME:
 	case TOOL_PMU__EVENT_SYSTEM_TIME: {
-		bool system = evsel__tool_event(evsel) == TOOL_PMU__EVENT_SYSTEM_TIME;
-		int fd = FD(evsel, cpu_map_idx, thread);
-
-		start_time = xyarray__entry(evsel->start_times, cpu_map_idx, thread);
-		lseek(fd, SEEK_SET, 0);
-		if (evsel->pid_stat) {
-			/* The event exists solely on 1 CPU. */
-			if (cpu_map_idx == 0)
-				err = read_pid_stat_field(fd, system ? 15 : 14, &cur_time);
-			else
-				cur_time = 0;
-		} else {
-			/* The event is for all threads. */
-			if (thread == 0) {
-				struct perf_cpu cpu = perf_cpu_map__cpu(evsel->core.cpus,
-									cpu_map_idx);
+		__u64 accumulated = *(__u64 *)xyarray__entry(evsel->process_time.accumulated_times,
+							     cpu_map_idx, thread);
 
-				err = read_stat_field(fd, cpu, system ? 3 : 1, &cur_time);
-			} else {
-				cur_time = 0;
+		if (evsel->disabled) {
+			delta_start = accumulated;
+		} else {
+			__u64 *start_time = xyarray__entry(evsel->process_time.start_times,
+							   cpu_map_idx, thread);
+			__u64 cur_time;
+
+			err = tool_pmu__read_stat(evsel, cpu_map_idx, thread, &cur_time);
+			if (!err) {
+				if (cur_time >= *start_time)
+					delta_start = accumulated + (cur_time - *start_time);
+				else
+					delta_start = accumulated;
 			}
 		}
 		adjust = true;
@@ -553,7 +660,6 @@ int evsel__tool_pmu_read(struct evsel *evsel, int cpu_map_idx, int thread)
 	if (err)
 		return err;
 
-	delta_start = cur_time - *start_time;
 	if (adjust) {
 		__u64 ticks_per_sec = sysconf(_SC_CLK_TCK);
 
diff --git a/tools/perf/util/tool_pmu.h b/tools/perf/util/tool_pmu.h
index f1714001bc1d..f06ef5bce1c1 100644
--- a/tools/perf/util/tool_pmu.h
+++ b/tools/perf/util/tool_pmu.h
@@ -56,6 +56,8 @@ int evsel__tool_pmu_prepare_open(struct evsel *evsel,
 int evsel__tool_pmu_open(struct evsel *evsel,
 			 struct perf_thread_map *threads,
 			 int start_cpu_map_idx, int end_cpu_map_idx);
+int evsel__tool_pmu_enable_cpu(struct evsel *evsel, int cpu_map_idx);
+int evsel__tool_pmu_disable_cpu(struct evsel *evsel, int cpu_map_idx);
 int evsel__tool_pmu_read(struct evsel *evsel, int cpu_map_idx, int thread);
 
 struct perf_pmu *tool_pmu__new(void);
-- 
2.54.0.631.ge1b05301d1-goog


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH v1 1/2] perf tool_pmu: Make tool PMU events respect enable/disable
  2026-05-18 18:39 ` [PATCH v1 1/2] perf tool_pmu: Make tool PMU events respect enable/disable Ian Rogers
@ 2026-05-18 19:26   ` sashiko-bot
  0 siblings, 0 replies; 20+ messages in thread
From: sashiko-bot @ 2026-05-18 19:26 UTC (permalink / raw)
  To: Ian Rogers; +Cc: linux-perf-users

Thank you for your contribution! Sashiko AI review found 4 potential issue(s) to consider:
- [High] Tool PMU events configured as members of an event group are never enabled and will incorrectly report zero.
- [Medium] The high-level `evsel__enable()` and `evsel__disable()` API functions bypass the tool PMU logic, causing `-ENOTTY` failures.
- [Medium] Silent error masking in `evsel__tool_pmu_enable_cpu` can lead to massive time calculation errors.
- [Low] Incorrect parameter ordering in `lseek` call within `tool_pmu__read_stat()`.
--

commit 859669b871dfb5dac6379e7cd2030d7d210d68ac
Author: Ian Rogers <irogers@google.com>

perf tool_pmu: Make tool PMU events respect enable/disable

Tool PMU events (duration_time, user_time, system_time) currently
count from when the event is opened to when it is read. This causes
issues with features like the delay option (-D) or control fd, where
events are opened but should not start counting immediately.

Make these events behave more like regular counters by implementing
proper enable and disable support.

Link: https://lore.kernel.org/linux-perf-users/20260517093650.2540920-1-nigro.fra@gmail.com/

> diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
> index 2ee87fd84d3ea..3646156500f6e 100644
> --- a/tools/perf/util/evsel.c
> +++ b/tools/perf/util/evsel.c
[ ... ]
> @@ -1795,6 +1798,8 @@ int evsel__append_addr_filter(struct evsel *evsel, const char *filter)
>  /* Caller has to clear disabled after going through all CPUs. */
>  int evsel__enable_cpu(struct evsel *evsel, int cpu_map_idx)
>  {
> +	if (evsel__is_tool(evsel))
> +		return evsel__tool_pmu_enable_cpu(evsel, cpu_map_idx);
>  	return perf_evsel__enable_cpu(&evsel->core, cpu_map_idx);
>  }
>  
> @@ -1810,6 +1815,8 @@ int evsel__enable(struct evsel *evsel)

Does evsel__enable() also need to be updated to handle tool PMUs?

The per-CPU functions are updated here, but the batch functions
evsel__enable() and evsel__disable() directly call perf_evsel__enable().

Since perf_evsel__enable() executes an ioctl(fd, PERF_EVENT_IOC_ENABLE)
and tool events use plain text files like /proc/stat, won't this result in
an -ENOTTY failure and leave the batch-enabled events disabled?

>  /* Caller has to set disabled after going through all CPUs. */
>  int evsel__disable_cpu(struct evsel *evsel, int cpu_map_idx)
>  {
> +	if (evsel__is_tool(evsel))
> +		return evsel__tool_pmu_disable_cpu(evsel, cpu_map_idx);
>  	return perf_evsel__disable_cpu(&evsel->core, cpu_map_idx);
>  }
[ ... ]
> diff --git a/tools/perf/util/tool_pmu.c b/tools/perf/util/tool_pmu.c
> index 6a9df3dc0e076..cfefd97cb95bc 100644
> --- a/tools/perf/util/tool_pmu.c
> +++ b/tools/perf/util/tool_pmu.c
[ ... ]
> +static int tool_pmu__read_stat(struct evsel *evsel, int cpu_map_idx, int thread, __u64 *val)
> +{
> +	enum tool_pmu_event ev = evsel__tool_event(evsel);
> +	bool system = ev == TOOL_PMU__EVENT_SYSTEM_TIME;
> +	int fd = FD(evsel, cpu_map_idx, thread);
> +	int err = 0;
> +
> +	if (fd < 0) {
> +		*val = 0;
> +		return 0;
> +	}
> +
> +	lseek(fd, SEEK_SET, 0);

This isn't a bug, but are the offset and whence arguments to lseek swapped
here? 

The POSIX prototype expects lseek(fd, offset, whence). This happens to
work because SEEK_SET is 0, but it misuses the API.

> +	if (evsel->pid_stat) {
> +		if (cpu_map_idx == 0)
> +			err = read_pid_stat_field(fd, system ? 15 : 14, val);
[ ... ]
> @@ -467,10 +512,79 @@ static void perf_counts__update(struct perf_counts_values *count,
>  		count->lost = 0;
>  	}
>  }
> +int evsel__tool_pmu_enable_cpu(struct evsel *evsel, int cpu_map_idx)
> +{

Will tool PMU events configured as members of an event group be successfully
enabled?

In the core event iteration logic like __evlist__enable(), the code
explicitly skips processing any event that is not a group leader. For kernel
PMU events, the kernel automatically enables the entire group when the leader
is enabled.

Since the kernel has no concept of synthetic userspace tool PMU events, if a
tool PMU event is grouped as a non-leader (e.g. 
perf stat -e '{cycles,duration_time}'), doesn't userspace skip it, leaving
evsel__tool_pmu_enable_cpu() uncalled and the event erroneously reporting 0?

> +	enum tool_pmu_event ev = evsel__tool_event(evsel);
> +	int thread, nthreads;
> +
> +	if (!evsel->disabled)
> +		return 0;
> +
> +	if (ev == TOOL_PMU__EVENT_DURATION_TIME) {
> +		if (cpu_map_idx == 0) {
> +			evsel->duration_time.start_time = rdclock();
> +			evsel->disabled = false;
> +		}
> +		return 0;
> +	}
> +
> +	if (ev == TOOL_PMU__EVENT_USER_TIME || ev == TOOL_PMU__EVENT_SYSTEM_TIME) {
> +		nthreads = xyarray__max_y(evsel->process_time.start_times);
> +		for (thread = 0; thread < nthreads; thread++) {
> +			__u64 *start_time = xyarray__entry(evsel->process_time.start_times,
> +							   cpu_map_idx, thread);
> +			__u64 val;
> +			int err;
> +
> +			err = tool_pmu__read_stat(evsel, cpu_map_idx, thread, &val);
> +			if (!err)
> +				*start_time = val;

What happens if tool_pmu__read_stat() fails here?

If reading /proc/<pid>/stat fails because the monitored process just exited,
the error is ignored and *start_time remains 0.

If a subsequent read in evsel__tool_pmu_read() succeeds, could this cause the
delta calculation to subtract the stale 0 start time from the current system
time, incorrectly accumulating massive values and breaking the counter?

> +		}
> +	}
> +	return 0;
> +}

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260518183920.2894502-1-irogers@google.com?part=1

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH v1 2/2] perf tests: Add test for stat delay option with duration_time
  2026-05-18 18:39 [PATCH v1 0/2] perf tool_pmu: Support enable/disable for tool PMU events Ian Rogers
  2026-05-18 18:39 ` [PATCH v1 1/2] perf tool_pmu: Make tool PMU events respect enable/disable Ian Rogers
@ 2026-05-18 18:39 ` Ian Rogers
  2026-05-18 19:33   ` sashiko-bot
  2026-05-18 20:14 ` [PATCH v2 0/2] perf tool_pmu: Support enable/disable for tool PMU events Ian Rogers
  2 siblings, 1 reply; 20+ messages in thread
From: Ian Rogers @ 2026-05-18 18:39 UTC (permalink / raw)
  To: Francesco Nigro, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Namhyung Kim, Jiri Olsa, Ian Rogers,
	Adrian Hunter, James Clark, Thomas Richter, linux-perf-users,
	linux-kernel

Add a new test case `test_stat_delay` to `stat.sh` to verify that
`duration_time` correctly excludes the delay period when using the
delay option (-D).

The test runs `perf stat -D 1000 -e duration_time sleep 2` and
verifies that `duration_time` is ~1s (excluding the 1s delay), not
~2s.

Signed-off-by: Ian Rogers <irogers@google.com>
Assisted-by: Antigravity:gemini-3-flash
---
 tools/perf/tests/shell/stat.sh | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/tools/perf/tests/shell/stat.sh b/tools/perf/tests/shell/stat.sh
index 4edb04039036..e9a9b2c91cae 100755
--- a/tools/perf/tests/shell/stat.sh
+++ b/tools/perf/tests/shell/stat.sh
@@ -483,6 +483,38 @@ test_stat_pid() {
   wait $pid 2>/dev/null || true
 }
 
+test_stat_delay() {
+  echo "stat -D test"
+  if ! perf stat -D 1000 -e duration_time sleep 2 > "${stat_output}" 2>&1
+  then
+    echo "stat -D test [Failed - command failed]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+
+  duration=$(grep "duration_time" "${stat_output}" | awk '{print $1}' | tr -d ',')
+  if [ -z "$duration" ]
+  then
+    echo "stat -D test [Failed - failed to find duration_time in output]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+
+  # duration is in ns. We expect it to be ~1s (1,000,000,000 ns) because
+  # sleep 2 runs for 2s, and we delay by 1s, so measured window is 1s.
+  # If it includes delay, it will be > 2s (2,000,000,000 ns).
+  if [ "$duration" -gt 1500000000 ]
+  then
+    echo "stat -D test [Failed - duration_time ($duration ns) includes delay]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+  echo "stat -D test [Success]"
+}
+
 test_default_stat
 test_null_stat
 test_offline_cpu_stat
@@ -498,6 +530,7 @@ test_stat_no_aggr
 test_stat_detailed
 test_stat_repeat
 test_stat_pid
+test_stat_delay
 
 cleanup
 exit $err
-- 
2.54.0.631.ge1b05301d1-goog


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH v1 2/2] perf tests: Add test for stat delay option with duration_time
  2026-05-18 18:39 ` [PATCH v1 2/2] perf tests: Add test for stat delay option with duration_time Ian Rogers
@ 2026-05-18 19:33   ` sashiko-bot
  0 siblings, 0 replies; 20+ messages in thread
From: sashiko-bot @ 2026-05-18 19:33 UTC (permalink / raw)
  To: Ian Rogers; +Cc: linux-perf-users

Thank you for your contribution! Sashiko AI review found 3 potential issue(s) to consider:
- [Medium] The test lacks a lower bound check, failing to verify that duration_time is actually ~1s as claimed.
- [Medium] Parsing formatted counter output assumes commas for thousands separators, causing silent test validation bypass in non-English locales.
- [Medium] The test uses a tight 1.5s threshold for duration_time, making it flaky on heavily loaded systems.
--

commit c6900a5f9502c500776ded0f9cab3bb2ae678d17
Author: Ian Rogers <irogers@google.com>

perf tests: Add test for stat delay option with duration_time

This commit adds a new test case to stat.sh to verify that the duration_time
event correctly excludes the delay period when using the delay option.

> diff --git a/tools/perf/tests/shell/stat.sh b/tools/perf/tests/shell/stat.sh
> index 4edb04039036e..e9a9b2c91cae3 100755
> --- a/tools/perf/tests/shell/stat.sh
> +++ b/tools/perf/tests/shell/stat.sh
[ ... ]
> @@ -483,6 +483,38 @@ test_stat_pid() {
>    wait $pid 2>/dev/null || true
>  }
>  
> +test_stat_delay() {
> +  echo "stat -D test"
> +  if ! perf stat -D 1000 -e duration_time sleep 2 > "${stat_output}" 2>&1
> +  then
> +    echo "stat -D test [Failed - command failed]"
> +    cat "${stat_output}"
> +    err=1
> +    return
> +  fi
> +
> +  duration=$(grep "duration_time" "${stat_output}" | awk '{print $1}' | tr -d ',')

Does this correctly parse the output in non-English locales?

If the locale uses periods or spaces for thousands separators instead of
commas, the string will retain those characters. When bash evaluates the
integer condition later, a string like 1.000.000.000 causes a syntax error.
Because this occurs inside an if condition, the script will silently bypass
the failure check and incorrectly report success. Should the perf stat
command be executed with the C locale to guarantee a consistent format?

> +  if [ -z "$duration" ]
> +  then
> +    echo "stat -D test [Failed - failed to find duration_time in output]"
> +    cat "${stat_output}"
> +    err=1
> +    return
> +  fi
> +
> +  # duration is in ns. We expect it to be ~1s (1,000,000,000 ns) because
> +  # sleep 2 runs for 2s, and we delay by 1s, so measured window is 1s.
> +  # If it includes delay, it will be > 2s (2,000,000,000 ns).
> +  if [ "$duration" -gt 1500000000 ]

Could a heavily loaded test machine or CI environment cause the sleep command
to take significantly longer than 2 seconds? If it takes 2.6 seconds, the
duration would exceed the 1500000000 ns threshold and cause the test to
falsely fail.

Additionally, is there a need to check a lower bound? If the delay option
breaks the counter and duration is zero or abnormally small, this upper bound
check will evaluate to false, which bypasses the failure block and leads
to a false success.

> +  then
> +    echo "stat -D test [Failed - duration_time ($duration ns) includes delay]"
> +    cat "${stat_output}"
> +    err=1
> +    return
> +  fi
> +  echo "stat -D test [Success]"
> +}

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260518183920.2894502-1-irogers@google.com?part=2

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH v2 0/2] perf tool_pmu: Support enable/disable for tool PMU events
  2026-05-18 18:39 [PATCH v1 0/2] perf tool_pmu: Support enable/disable for tool PMU events Ian Rogers
  2026-05-18 18:39 ` [PATCH v1 1/2] perf tool_pmu: Make tool PMU events respect enable/disable Ian Rogers
  2026-05-18 18:39 ` [PATCH v1 2/2] perf tests: Add test for stat delay option with duration_time Ian Rogers
@ 2026-05-18 20:14 ` Ian Rogers
  2026-05-18 20:14   ` [PATCH v2 1/2] perf tool_pmu: Make tool PMU events respect enable/disable Ian Rogers
                     ` (2 more replies)
  2 siblings, 3 replies; 20+ messages in thread
From: Ian Rogers @ 2026-05-18 20:14 UTC (permalink / raw)
  To: irogers, acme, namhyung
  Cc: adrian.hunter, james.clark, jolsa, linux-kernel, linux-perf-users,
	mingo, nigro.fra, peterz, tmricht

A regression in perf stat was reported where tool PMU events (like
duration_time used in CPUs_utilized metric) incorrectly included the
delay period when using the delay option (-D).

This series fixes the regression by making tool PMU events
(duration_time, user_time, system_time) behave more like regular
counters by implementing proper enable and disable support. They now
correctly accumulate values only when enabled.

The first patch implements the core enable/disable support for tool PMU
events, and the second patch adds a shell test to verify that
duration_time correctly excludes the delay period.

Changes in v2:
- Implement evsel__tool_pmu_enable() and evsel__tool_pmu_disable() to
  avoid ioctl failures in batch evsel__enable() and evsel__disable()
  functions.
- Correctly iterate and enable/disable tool PMU events configured as
  non-leader members of event groups.
- Correct the lseek() arguments order in the read_stat helper:
  lseek(fd, 0, SEEK_SET) instead of lseek(fd, SEEK_SET, 0).
- Introduce INVALID_START_TIME (~0ULL) to prevent erroneous large delta
  accumulation in evsel__tool_pmu_read() if /proc/<pid>/stat fails to
  read in enable_cpu (e.g., process exited).
- Improve test parsing to use LC_ALL=C and cut to be robust against
  different locales, and use awk to dynamically compare duration_time to
  time elapsed with a 200ms tolerance (avoiding loaded CI false failures).
  Also added a lower-bound check.

Ian Rogers (2):
  perf tool_pmu: Make tool PMU events respect enable/disable
  perf tests: Add test for stat delay option with duration_time

 tools/perf/tests/shell/stat.sh |  48 +++++++
 tools/perf/util/evsel.c        | 190 +++++++++++++++++++------
 tools/perf/util/evsel.h        |  10 +-
 tools/perf/util/tool_pmu.c     | 244 ++++++++++++++++++++++++++-------
 tools/perf/util/tool_pmu.h     |   4 +
 5 files changed, 399 insertions(+), 97 deletions(-)

-- 
2.54.0.631.ge1b05301d1-goog


^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH v2 1/2] perf tool_pmu: Make tool PMU events respect enable/disable
  2026-05-18 20:14 ` [PATCH v2 0/2] perf tool_pmu: Support enable/disable for tool PMU events Ian Rogers
@ 2026-05-18 20:14   ` Ian Rogers
  2026-05-18 20:49     ` sashiko-bot
  2026-05-18 20:14   ` [PATCH v2 2/2] perf tests: Add test for stat delay option with duration_time Ian Rogers
  2026-05-18 22:37   ` [PATCH v3 0/2] perf tool_pmu: Support enable/disable for tool PMU events Ian Rogers
  2 siblings, 1 reply; 20+ messages in thread
From: Ian Rogers @ 2026-05-18 20:14 UTC (permalink / raw)
  To: irogers, acme, namhyung
  Cc: adrian.hunter, james.clark, jolsa, linux-kernel, linux-perf-users,
	mingo, nigro.fra, peterz, tmricht

Tool PMU events (duration_time, user_time, system_time) currently
count from when the event is opened to when it is read. This causes
issues with features like the delay option (-D) or control fd, where
events are opened but should not start counting immediately.

Make these events behave more like regular counters by implementing
proper enable and disable support. Add accumulated_time to struct
evsel to track time while enabled, and implement enable/disable CPU
callbacks to start/stop counting.

Fixes: b71f46a6a708 ("perf stat: Remove hard coded shadow metrics")
Reported-by: Francesco Nigro <nigro.fra@gmail.com>
Closes: https://lore.kernel.org/linux-perf-users/20260517093650.2540920-1-nigro.fra@gmail.com/
Signed-off-by: Ian Rogers <irogers@google.com>
Assisted-by: Antigravity:gemini-3-flash
---
 tools/perf/util/evsel.c    | 190 ++++++++++++++++++++++-------
 tools/perf/util/evsel.h    |  10 +-
 tools/perf/util/tool_pmu.c | 244 +++++++++++++++++++++++++++++--------
 tools/perf/util/tool_pmu.h |   4 +
 4 files changed, 351 insertions(+), 97 deletions(-)

diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 2ee87fd84d3e..63bfb03b6b13 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -11,68 +11,71 @@
  */
 #define __SANE_USERSPACE_TYPES__
 
-#include <byteswap.h>
+#include "evsel.h"
+
 #include <errno.h>
 #include <inttypes.h>
+#include <stdlib.h>
+
+#include <dirent.h>
 #include <linux/bitops.h>
-#include <api/fs/fs.h>
-#include <api/fs/tracing_path.h>
-#include <linux/hw_breakpoint.h>
-#include <linux/perf_event.h>
 #include <linux/compiler.h>
+#include <linux/ctype.h>
 #include <linux/err.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/perf_event.h>
 #include <linux/zalloc.h>
 #include <sys/ioctl.h>
 #include <sys/resource.h>
 #include <sys/syscall.h>
 #include <sys/types.h>
-#include <dirent.h>
-#include <stdlib.h>
+
+#include <api/fs/fs.h>
+#include <api/fs/tracing_path.h>
+#include <byteswap.h>
+#include <internal/lib.h>
+#include <internal/threadmap.h>
+#include <internal/xyarray.h>
+#include <perf/cpumap.h>
 #include <perf/evsel.h>
+
+#include "../perf-sys.h"
 #include "asm/bug.h"
+#include "bpf-filter.h"
 #include "bpf_counter.h"
 #include "callchain.h"
 #include "cgroup.h"
 #include "counts.h"
+#include "debug.h"
+#include "drm_pmu.h"
 #include "dwarf-regs.h"
+#include "env.h"
 #include "event.h"
-#include "evsel.h"
-#include "time-utils.h"
-#include "util/env.h"
-#include "util/evsel_config.h"
-#include "util/evsel_fprintf.h"
 #include "evlist.h"
-#include <perf/cpumap.h>
-#include "thread_map.h"
-#include "target.h"
+#include "evsel_config.h"
+#include "evsel_fprintf.h"
+#include "hashmap.h"
+#include "hist.h"
+#include "hwmon_pmu.h"
+#include "intel-tpebs.h"
+#include "memswap.h"
+#include "off_cpu.h"
+#include "parse-branch-options.h"
 #include "perf_regs.h"
+#include "pmu.h"
+#include "pmus.h"
 #include "record.h"
-#include "debug.h"
-#include "trace-event.h"
+#include "rlimit.h"
 #include "session.h"
 #include "stat.h"
 #include "string2.h"
-#include "memswap.h"
-#include "util.h"
-#include "util/hashmap.h"
-#include "off_cpu.h"
-#include "pmu.h"
-#include "pmus.h"
-#include "drm_pmu.h"
-#include "hwmon_pmu.h"
+#include "target.h"
+#include "thread_map.h"
+#include "time-utils.h"
 #include "tool_pmu.h"
 #include "tp_pmu.h"
-#include "rlimit.h"
-#include "../perf-sys.h"
-#include "util/parse-branch-options.h"
-#include "util/bpf-filter.h"
-#include "util/hist.h"
-#include <internal/xyarray.h>
-#include <internal/lib.h>
-#include <internal/threadmap.h>
-#include "util/intel-tpebs.h"
-
-#include <linux/ctype.h>
+#include "trace-event.h"
+#include "util.h"
 
 #ifdef HAVE_LIBTRACEEVENT
 #include <event-parse.h>
@@ -1795,12 +1798,63 @@ int evsel__append_addr_filter(struct evsel *evsel, const char *filter)
 /* Caller has to clear disabled after going through all CPUs. */
 int evsel__enable_cpu(struct evsel *evsel, int cpu_map_idx)
 {
-	return perf_evsel__enable_cpu(&evsel->core, cpu_map_idx);
+	int err;
+
+	if (evsel__is_tool(evsel))
+		err = evsel__tool_pmu_enable_cpu(evsel, cpu_map_idx);
+	else
+		err = perf_evsel__enable_cpu(&evsel->core, cpu_map_idx);
+
+	if (!err && evsel__is_group_leader(evsel)) {
+		struct evsel *member;
+
+		for_each_group_member(member, evsel) {
+			if (evsel__is_tool(member)) {
+				/*
+				 * In a mixed PMU group, tool PMU events are not
+				 * grouped in the kernel (opened with group_fd = -1)
+				 * and are skipped by the kernel when enabling the
+				 * group leader. We must manually enable them in
+				 * userspace.
+				 */
+				int mem_err = evsel__tool_pmu_enable_cpu(member, cpu_map_idx);
+
+				if (mem_err)
+					return mem_err;
+			}
+		}
+	}
+	return err;
 }
 
 int evsel__enable(struct evsel *evsel)
 {
-	int err = perf_evsel__enable(&evsel->core);
+	int err;
+
+	if (evsel__is_tool(evsel))
+		err = evsel__tool_pmu_enable(evsel);
+	else
+		err = perf_evsel__enable(&evsel->core);
+
+	if (!err && evsel__is_group_leader(evsel)) {
+		struct evsel *member;
+
+		for_each_group_member(member, evsel) {
+			if (evsel__is_tool(member)) {
+				/*
+				 * In a mixed PMU group, tool PMU events are not
+				 * grouped in the kernel (opened with group_fd = -1)
+				 * and are skipped by the kernel when enabling the
+				 * group leader. We must manually enable them in
+				 * userspace.
+				 */
+				int mem_err = evsel__tool_pmu_enable(member);
+
+				if (mem_err)
+					return mem_err;
+			}
+		}
+	}
 
 	if (!err)
 		evsel->disabled = false;
@@ -1810,12 +1864,62 @@ int evsel__enable(struct evsel *evsel)
 /* Caller has to set disabled after going through all CPUs. */
 int evsel__disable_cpu(struct evsel *evsel, int cpu_map_idx)
 {
-	return perf_evsel__disable_cpu(&evsel->core, cpu_map_idx);
+	int err;
+
+	if (evsel__is_tool(evsel))
+		err = evsel__tool_pmu_disable_cpu(evsel, cpu_map_idx);
+	else
+		err = perf_evsel__disable_cpu(&evsel->core, cpu_map_idx);
+
+	if (!err && evsel__is_group_leader(evsel)) {
+		struct evsel *member;
+
+		for_each_group_member(member, evsel) {
+			if (evsel__is_tool(member)) {
+				/*
+				 * In a mixed PMU group, tool PMU events are not
+				 * grouped in the kernel and are skipped by the
+				 * kernel when disabling the group leader. We must
+				 * manually disable them in userspace.
+				 */
+				int mem_err = evsel__tool_pmu_disable_cpu(member, cpu_map_idx);
+
+				if (mem_err)
+					return mem_err;
+			}
+		}
+	}
+	return err;
 }
 
 int evsel__disable(struct evsel *evsel)
 {
-	int err = perf_evsel__disable(&evsel->core);
+	int err;
+
+	if (evsel__is_tool(evsel))
+		err = evsel__tool_pmu_disable(evsel);
+	else
+		err = perf_evsel__disable(&evsel->core);
+
+	if (!err && evsel__is_group_leader(evsel)) {
+		struct evsel *member;
+
+		for_each_group_member(member, evsel) {
+			if (evsel__is_tool(member)) {
+				/*
+				 * In a mixed PMU group, tool PMU events are not
+				 * grouped in the kernel and are skipped by the
+				 * kernel when disabling the group leader. We must
+				 * manually disable them in userspace.
+				 */
+				int mem_err = evsel__tool_pmu_disable(member);
+
+				if (mem_err)
+					return mem_err;
+			}
+		}
+	}
+
 	/*
 	 * We mark it disabled here so that tools that disable a event can
 	 * ignore events after they disable it. I.e. the ring buffer may have
@@ -1885,8 +1989,10 @@ void evsel__exit(struct evsel *evsel)
 		evsel__priv_destructor(evsel->priv);
 	perf_evsel__object.fini(evsel);
 	if (evsel__tool_event(evsel) == TOOL_PMU__EVENT_SYSTEM_TIME ||
-	    evsel__tool_event(evsel) == TOOL_PMU__EVENT_USER_TIME)
-		xyarray__delete(evsel->start_times);
+	    evsel__tool_event(evsel) == TOOL_PMU__EVENT_USER_TIME) {
+		xyarray__delete(evsel->process_time.start_times);
+		xyarray__delete(evsel->process_time.accumulated_times);
+	}
 }
 
 void evsel__delete(struct evsel *evsel)
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 339b5c08a33d..0b6a6df0e4ef 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -190,12 +190,18 @@ struct evsel {
 			double max;
 		} retirement_latency;
 		/* duration_time is a single global time. */
-		__u64 start_time;
+		struct {
+			__u64 start_time;
+			__u64 accumulated_time;
+		} duration_time;
 		/*
 		 * user_time and system_time read an initial value potentially
 		 * per-CPU or per-pid.
 		 */
-		struct xyarray *start_times;
+		struct {
+			struct xyarray *start_times;
+			struct xyarray *accumulated_times;
+		} process_time;
 	};
 	/* Is the tool's fd for /proc/pid/stat or /proc/stat. */
 	bool pid_stat;
diff --git a/tools/perf/util/tool_pmu.c b/tools/perf/util/tool_pmu.c
index 6a9df3dc0e07..bb398fe82f6d 100644
--- a/tools/perf/util/tool_pmu.c
+++ b/tools/perf/util/tool_pmu.c
@@ -17,6 +17,8 @@
 #include <fcntl.h>
 #include <strings.h>
 
+#define INVALID_START_TIME ~0ULL
+
 static const char *const tool_pmu__event_names[TOOL_PMU__EVENT_MAX] = {
 	NULL,
 	"duration_time",
@@ -205,20 +207,57 @@ int evsel__tool_pmu_prepare_open(struct evsel *evsel,
 				 struct perf_cpu_map *cpus,
 				 int nthreads)
 {
-	if ((evsel__tool_event(evsel) == TOOL_PMU__EVENT_SYSTEM_TIME ||
-	     evsel__tool_event(evsel) == TOOL_PMU__EVENT_USER_TIME) &&
-	    !evsel->start_times) {
-		evsel->start_times = xyarray__new(perf_cpu_map__nr(cpus),
-						  nthreads,
-						  sizeof(__u64));
-		if (!evsel->start_times)
-			return -ENOMEM;
+	enum tool_pmu_event ev = evsel__tool_event(evsel);
+
+	if (ev == TOOL_PMU__EVENT_SYSTEM_TIME || ev == TOOL_PMU__EVENT_USER_TIME) {
+		if (!evsel->process_time.start_times) {
+			evsel->process_time.start_times =
+				xyarray__new(perf_cpu_map__nr(cpus), nthreads, sizeof(__u64));
+			if (!evsel->process_time.start_times)
+				return -ENOMEM;
+		}
+		if (!evsel->process_time.accumulated_times) {
+			evsel->process_time.accumulated_times =
+				xyarray__new(perf_cpu_map__nr(cpus), nthreads, sizeof(__u64));
+			if (!evsel->process_time.accumulated_times)
+				return -ENOMEM;
+		}
 	}
 	return 0;
 }
 
 #define FD(e, x, y) (*(int *)xyarray__entry(e->core.fd, x, y))
 
+static int tool_pmu__read_stat(struct evsel *evsel, int cpu_map_idx, int thread, __u64 *val)
+{
+	enum tool_pmu_event ev = evsel__tool_event(evsel);
+	bool system = ev == TOOL_PMU__EVENT_SYSTEM_TIME;
+	int fd = FD(evsel, cpu_map_idx, thread);
+	int err = 0;
+
+	if (fd < 0) {
+		*val = 0;
+		return 0;
+	}
+
+	lseek(fd, 0, SEEK_SET);
+	if (evsel->pid_stat) {
+		if (cpu_map_idx == 0)
+			err = read_pid_stat_field(fd, system ? 15 : 14, val);
+		else
+			*val = 0;
+	} else {
+		if (thread == 0) {
+			struct perf_cpu cpu = perf_cpu_map__cpu(evsel->core.cpus, cpu_map_idx);
+
+			err = read_stat_field(fd, cpu, system ? 3 : 1, val);
+		} else {
+			*val = 0;
+		}
+	}
+	return err;
+}
+
 int evsel__tool_pmu_open(struct evsel *evsel,
 			 struct perf_thread_map *threads,
 			 int start_cpu_map_idx, int end_cpu_map_idx)
@@ -232,7 +271,14 @@ int evsel__tool_pmu_open(struct evsel *evsel,
 	if (ev == TOOL_PMU__EVENT_DURATION_TIME) {
 		if (evsel->core.attr.sample_period) /* no sampling */
 			return -EINVAL;
-		evsel->start_time = rdclock();
+		evsel->duration_time.accumulated_time = 0;
+		if (evsel->core.attr.disabled) {
+			evsel->disabled = true;
+			evsel->duration_time.start_time = INVALID_START_TIME;
+		} else {
+			evsel->disabled = false;
+			evsel->duration_time.start_time = rdclock();
+		}
 		return 0;
 	}
 
@@ -246,8 +292,8 @@ int evsel__tool_pmu_open(struct evsel *evsel,
 				pid = perf_thread_map__pid(threads, thread);
 
 			if (ev == TOOL_PMU__EVENT_USER_TIME || ev == TOOL_PMU__EVENT_SYSTEM_TIME) {
-				bool system = ev == TOOL_PMU__EVENT_SYSTEM_TIME;
 				__u64 *start_time = NULL;
+				__u64 *accumulated_time = NULL;
 				int fd;
 
 				if (evsel->core.attr.sample_period) {
@@ -269,21 +315,22 @@ int evsel__tool_pmu_open(struct evsel *evsel,
 					err = -errno;
 					goto out_close;
 				}
-				start_time = xyarray__entry(evsel->start_times, idx, thread);
-				if (pid > -1) {
-					err = read_pid_stat_field(fd, system ? 15 : 14,
-								  start_time);
+				start_time = xyarray__entry(evsel->process_time.start_times, idx,
+							    thread);
+				accumulated_time = xyarray__entry(
+					evsel->process_time.accumulated_times, idx, thread);
+				*accumulated_time = 0;
+
+				if (evsel->core.attr.disabled) {
+					evsel->disabled = true;
+					*start_time = INVALID_START_TIME;
 				} else {
-					struct perf_cpu cpu;
-
-					cpu = perf_cpu_map__cpu(evsel->core.cpus, idx);
-					err = read_stat_field(fd, cpu, system ? 3 : 1,
-							      start_time);
+					evsel->disabled = false;
+					err = tool_pmu__read_stat(evsel, idx, thread, start_time);
+					if (err)
+						goto out_close;
 				}
-				if (err)
-					goto out_close;
 			}
-
 		}
 	}
 	return 0;
@@ -467,10 +514,108 @@ static void perf_counts__update(struct perf_counts_values *count,
 		count->lost = 0;
 	}
 }
+int evsel__tool_pmu_enable_cpu(struct evsel *evsel, int cpu_map_idx)
+{
+	enum tool_pmu_event ev = evsel__tool_event(evsel);
+	int thread, nthreads;
+
+	if (!evsel->disabled)
+		return 0;
+
+	if (ev == TOOL_PMU__EVENT_DURATION_TIME) {
+		if (cpu_map_idx == 0) {
+			evsel->duration_time.start_time = rdclock();
+			evsel->disabled = false;
+		}
+		return 0;
+	}
+
+	if (ev == TOOL_PMU__EVENT_USER_TIME || ev == TOOL_PMU__EVENT_SYSTEM_TIME) {
+		nthreads = xyarray__max_y(evsel->process_time.start_times);
+		for (thread = 0; thread < nthreads; thread++) {
+			__u64 *start_time = xyarray__entry(evsel->process_time.start_times,
+							   cpu_map_idx, thread);
+			__u64 val;
+			int err;
+
+			err = tool_pmu__read_stat(evsel, cpu_map_idx, thread, &val);
+			if (!err)
+				*start_time = val;
+			else
+				*start_time = INVALID_START_TIME;
+		}
+	}
+	return 0;
+}
+
+int evsel__tool_pmu_enable(struct evsel *evsel)
+{
+	unsigned int idx;
+	int err = 0;
+
+	for (idx = 0; idx < perf_cpu_map__nr(evsel->core.cpus); idx++) {
+		err = evsel__tool_pmu_enable_cpu(evsel, idx);
+		if (err)
+			break;
+	}
+	return err;
+}
+
+int evsel__tool_pmu_disable_cpu(struct evsel *evsel, int cpu_map_idx)
+{
+	enum tool_pmu_event ev = evsel__tool_event(evsel);
+	int thread, nthreads;
+
+	if (evsel->disabled)
+		return 0;
+
+	if (ev == TOOL_PMU__EVENT_DURATION_TIME) {
+		if (cpu_map_idx == 0) {
+			__u64 delta = rdclock() - evsel->duration_time.start_time;
+
+			evsel->duration_time.accumulated_time += delta;
+			evsel->disabled = true;
+		}
+		return 0;
+	}
+
+	if (ev == TOOL_PMU__EVENT_USER_TIME || ev == TOOL_PMU__EVENT_SYSTEM_TIME) {
+		nthreads = xyarray__max_y(evsel->process_time.start_times);
+		for (thread = 0; thread < nthreads; thread++) {
+			__u64 *start_time = xyarray__entry(evsel->process_time.start_times,
+							   cpu_map_idx, thread);
+			__u64 *accumulated_time = xyarray__entry(
+				evsel->process_time.accumulated_times, cpu_map_idx, thread);
+			__u64 val;
+			int err;
+
+			err = tool_pmu__read_stat(evsel, cpu_map_idx, thread, &val);
+			if (!err) {
+				if (*start_time != INVALID_START_TIME && val >= *start_time)
+					*accumulated_time += (val - *start_time);
+				*start_time = INVALID_START_TIME;
+			}
+		}
+	}
+	return 0;
+}
+
+int evsel__tool_pmu_disable(struct evsel *evsel)
+{
+	unsigned int idx;
+	int err = 0;
+
+	for (idx = 0; idx < perf_cpu_map__nr(evsel->core.cpus); idx++) {
+		err = evsel__tool_pmu_disable_cpu(evsel, idx);
+		if (err)
+			break;
+	}
+	return err;
+}
 
 int evsel__tool_pmu_read(struct evsel *evsel, int cpu_map_idx, int thread)
 {
-	__u64 *start_time, cur_time, delta_start;
+	__u64 delta_start = 0;
 	int err = 0;
 	struct perf_counts_values *count, *old_count = NULL;
 	bool adjust = false;
@@ -507,39 +652,33 @@ int evsel__tool_pmu_read(struct evsel *evsel, int cpu_map_idx, int thread)
 		return 0;
 	}
 	case TOOL_PMU__EVENT_DURATION_TIME:
-		/*
-		 * Pretend duration_time is only on the first CPU and thread, or
-		 * else aggregation will scale duration_time by the number of
-		 * CPUs/threads.
-		 */
-		start_time = &evsel->start_time;
-		if (cpu_map_idx == 0 && thread == 0)
-			cur_time = rdclock();
-		else
-			cur_time = *start_time;
+		if (cpu_map_idx == 0 && thread == 0) {
+			delta_start = evsel->duration_time.accumulated_time;
+			if (!evsel->disabled &&
+			    evsel->duration_time.start_time != INVALID_START_TIME)
+				delta_start += (rdclock() - evsel->duration_time.start_time);
+		} else {
+			delta_start = 0;
+		}
 		break;
 	case TOOL_PMU__EVENT_USER_TIME:
 	case TOOL_PMU__EVENT_SYSTEM_TIME: {
-		bool system = evsel__tool_event(evsel) == TOOL_PMU__EVENT_SYSTEM_TIME;
-		int fd = FD(evsel, cpu_map_idx, thread);
-
-		start_time = xyarray__entry(evsel->start_times, cpu_map_idx, thread);
-		lseek(fd, SEEK_SET, 0);
-		if (evsel->pid_stat) {
-			/* The event exists solely on 1 CPU. */
-			if (cpu_map_idx == 0)
-				err = read_pid_stat_field(fd, system ? 15 : 14, &cur_time);
-			else
-				cur_time = 0;
-		} else {
-			/* The event is for all threads. */
-			if (thread == 0) {
-				struct perf_cpu cpu = perf_cpu_map__cpu(evsel->core.cpus,
-									cpu_map_idx);
+		__u64 accumulated = *(__u64 *)xyarray__entry(evsel->process_time.accumulated_times,
+							     cpu_map_idx, thread);
 
-				err = read_stat_field(fd, cpu, system ? 3 : 1, &cur_time);
-			} else {
-				cur_time = 0;
+		if (evsel->disabled) {
+			delta_start = accumulated;
+		} else {
+			__u64 *start_time = xyarray__entry(evsel->process_time.start_times,
+							   cpu_map_idx, thread);
+			__u64 cur_time;
+
+			err = tool_pmu__read_stat(evsel, cpu_map_idx, thread, &cur_time);
+			if (!err) {
+				if (*start_time != INVALID_START_TIME && cur_time >= *start_time)
+					delta_start = accumulated + (cur_time - *start_time);
+				else
+					delta_start = accumulated;
 			}
 		}
 		adjust = true;
@@ -553,7 +692,6 @@ int evsel__tool_pmu_read(struct evsel *evsel, int cpu_map_idx, int thread)
 	if (err)
 		return err;
 
-	delta_start = cur_time - *start_time;
 	if (adjust) {
 		__u64 ticks_per_sec = sysconf(_SC_CLK_TCK);
 
diff --git a/tools/perf/util/tool_pmu.h b/tools/perf/util/tool_pmu.h
index f1714001bc1d..f66d24cf3502 100644
--- a/tools/perf/util/tool_pmu.h
+++ b/tools/perf/util/tool_pmu.h
@@ -56,6 +56,10 @@ int evsel__tool_pmu_prepare_open(struct evsel *evsel,
 int evsel__tool_pmu_open(struct evsel *evsel,
 			 struct perf_thread_map *threads,
 			 int start_cpu_map_idx, int end_cpu_map_idx);
+int evsel__tool_pmu_enable_cpu(struct evsel *evsel, int cpu_map_idx);
+int evsel__tool_pmu_enable(struct evsel *evsel);
+int evsel__tool_pmu_disable_cpu(struct evsel *evsel, int cpu_map_idx);
+int evsel__tool_pmu_disable(struct evsel *evsel);
 int evsel__tool_pmu_read(struct evsel *evsel, int cpu_map_idx, int thread);
 
 struct perf_pmu *tool_pmu__new(void);
-- 
2.54.0.631.ge1b05301d1-goog


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 1/2] perf tool_pmu: Make tool PMU events respect enable/disable
  2026-05-18 20:14   ` [PATCH v2 1/2] perf tool_pmu: Make tool PMU events respect enable/disable Ian Rogers
@ 2026-05-18 20:49     ` sashiko-bot
  0 siblings, 0 replies; 20+ messages in thread
From: sashiko-bot @ 2026-05-18 20:49 UTC (permalink / raw)
  To: Ian Rogers; +Cc: linux-perf-users

Thank you for your contribution! Sashiko AI review found 2 potential issue(s) to consider:
- [High] Tool PMU events (user_time and system_time) fail to update their `disabled` state when acting as event group members, leading to missing data (reporting 0).
- [Medium] File descriptor leak in `evsel__tool_pmu_open()` error path.
--

commit 1e5a2efb24e0315c712078601af14ae8cc8be1ff
Author: Ian Rogers <irogers@google.com>

perf tool_pmu: Make tool PMU events respect enable/disable

This commit implements enable and disable support for tool PMU events
(duration_time, user_time, system_time) to make them behave more like regular
counters. It adds accumulated_time tracking to prevent the events from
counting time while paused.

Closes: https://lore.kernel.org/linux-perf-users/20260517093650.2540920-1-nigro.fra@gmail.com/

> diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
> index 2ee87fd84d3ea..63bfb03b6b13a 100644
> --- a/tools/perf/util/evsel.c
> +++ b/tools/perf/util/evsel.c
[ ... ]
> @@ -1795,12 +1798,63 @@ int evsel__append_addr_filter(struct evsel *evsel, const char *filter)
[ ... ]
>  int evsel__enable(struct evsel *evsel)
>  {
> -	int err = perf_evsel__enable(&evsel->core);
> +	int err;
> +
> +	if (evsel__is_tool(evsel))
> +		err = evsel__tool_pmu_enable(evsel);
> +	else
> +		err = perf_evsel__enable(&evsel->core);
> +
> +	if (!err && evsel__is_group_leader(evsel)) {
> +		struct evsel *member;
> +
> +		for_each_group_member(member, evsel) {
> +			if (evsel__is_tool(member)) {
[ ... ]
> +				int mem_err = evsel__tool_pmu_enable(member);
> +
> +				if (mem_err)
> +					return mem_err;
> +			}
> +		}
> +	}
>  
>  	if (!err)
>  		evsel->disabled = false;

When evsel__enable() iterates over group members to enable them, does the
disabled flag get updated correctly for user_time and system_time events?

Since evsel->disabled = false at the end of this function only applies to
the group leader, it seems the group members are reliant on
evsel__tool_pmu_enable_cpu() to update their state.

> diff --git a/tools/perf/util/tool_pmu.c b/tools/perf/util/tool_pmu.c
> index 6a9df3dc0e076..bb398fe82f6d9 100644
> --- a/tools/perf/util/tool_pmu.c
> +++ b/tools/perf/util/tool_pmu.c
[ ... ]
> @@ -246,8 +292,8 @@ int evsel__tool_pmu_open(struct evsel *evsel,
[ ... ]
> +				if (evsel->core.attr.disabled) {
> +					evsel->disabled = true;
> +					*start_time = INVALID_START_TIME;
>  				} else {
> -					struct perf_cpu cpu;
> -
> -					cpu = perf_cpu_map__cpu(evsel->core.cpus, idx);
> -					err = read_stat_field(fd, cpu, system ? 3 : 1,
> -							      start_time);
> +					evsel->disabled = false;
> +					err = tool_pmu__read_stat(evsel, idx, thread, start_time);
> +					if (err)
> +						goto out_close;
>  				}

If tool_pmu__read_stat() fails here and jumps to out_close, is the file
descriptor we just successfully opened and saved in FD(evsel, idx, thread)
leaked?

Looking at the out_close cleanup logic:

out_close:
        ...
        do {
                while (--thread >= 0) {
                        if (FD(evsel, idx, thread) >= 0)
                                close(FD(evsel, idx, thread));
                        FD(evsel, idx, thread) = -1;
                }
                thread = nthreads;
        } while (--idx >= 0);

Since the loop pre-decrements the thread index, it seems to skip closing
the file descriptor for the currently failing thread.

[ ... ]
> @@ -467,10 +514,108 @@ static void perf_counts__update(struct perf_counts_values *count,
[ ... ]
> +int evsel__tool_pmu_enable_cpu(struct evsel *evsel, int cpu_map_idx)
> +{
> +	enum tool_pmu_event ev = evsel__tool_event(evsel);
> +	int thread, nthreads;
> +
> +	if (!evsel->disabled)
> +		return 0;
> +
> +	if (ev == TOOL_PMU__EVENT_DURATION_TIME) {
> +		if (cpu_map_idx == 0) {
> +			evsel->duration_time.start_time = rdclock();
> +			evsel->disabled = false;
> +		}
> +		return 0;
> +	}
> +
> +	if (ev == TOOL_PMU__EVENT_USER_TIME || ev == TOOL_PMU__EVENT_SYSTEM_TIME) {
> +		nthreads = xyarray__max_y(evsel->process_time.start_times);
> +		for (thread = 0; thread < nthreads; thread++) {
> +			__u64 *start_time = xyarray__entry(evsel->process_time.start_times,
> +							   cpu_map_idx, thread);
> +			__u64 val;
> +			int err;
> +
> +			err = tool_pmu__read_stat(evsel, cpu_map_idx, thread, &val);
> +			if (!err)
> +				*start_time = val;
> +			else
> +				*start_time = INVALID_START_TIME;
> +		}
> +	}
> +	return 0;
> +}

Unlike the duration_time branch, it looks like evsel->disabled is not set
to false when processing user_time or system_time events here.

Will this cause evsel__tool_pmu_read() to see evsel->disabled as still true
and unconditionally return the frozen accumulated time for these members?
A similar pattern appears to be present in the disable path as well.

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260518201419.2943132-1-irogers@google.com?part=1

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH v2 2/2] perf tests: Add test for stat delay option with duration_time
  2026-05-18 20:14 ` [PATCH v2 0/2] perf tool_pmu: Support enable/disable for tool PMU events Ian Rogers
  2026-05-18 20:14   ` [PATCH v2 1/2] perf tool_pmu: Make tool PMU events respect enable/disable Ian Rogers
@ 2026-05-18 20:14   ` Ian Rogers
  2026-05-18 20:55     ` sashiko-bot
  2026-05-18 22:37   ` [PATCH v3 0/2] perf tool_pmu: Support enable/disable for tool PMU events Ian Rogers
  2 siblings, 1 reply; 20+ messages in thread
From: Ian Rogers @ 2026-05-18 20:14 UTC (permalink / raw)
  To: irogers, acme, namhyung
  Cc: adrian.hunter, james.clark, jolsa, linux-kernel, linux-perf-users,
	mingo, nigro.fra, peterz, tmricht

Add a new test case `test_stat_delay` to `stat.sh` to verify that
`duration_time` correctly excludes the delay period when using the
delay option (-D).

The test runs `perf stat -D 1000 -e duration_time sleep 2` and
verifies that `duration_time` is ~1s (excluding the 1s delay), not
~2s.

Signed-off-by: Ian Rogers <irogers@google.com>
Assisted-by: Antigravity:gemini-3-flash
---
 tools/perf/tests/shell/stat.sh | 48 ++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/tools/perf/tests/shell/stat.sh b/tools/perf/tests/shell/stat.sh
index 4edb04039036..a9eadd57e164 100755
--- a/tools/perf/tests/shell/stat.sh
+++ b/tools/perf/tests/shell/stat.sh
@@ -483,6 +483,53 @@ test_stat_pid() {
   wait $pid 2>/dev/null || true
 }
 
+test_stat_delay() {
+  echo "stat -D test"
+  if ! env LC_ALL=C perf stat -D 1000 -e duration_time sleep 2 > "${stat_output}" 2>&1
+  then
+    echo "stat -D test [Failed - command failed]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+
+  duration=$(grep "duration_time" "${stat_output}" | awk '{print $1}' | tr -d ',')
+  elapsed=$(grep "seconds time elapsed" "${stat_output}" | awk '{print $1}')
+
+  if [ -z "$duration" ] || [ -z "$elapsed" ]
+  then
+    echo "stat -D test [Failed - failed to find duration_time or time elapsed in output]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+
+  # Compare duration (ns) and elapsed (s) using awk to handle float and allow tolerance.
+  if ! awk -v d="$duration" -v e="$elapsed" '
+    BEGIN {
+      diff = d - (e * 1e9);
+      if (diff < 0) diff = -diff;
+      # Allow 200ms tolerance (200,000,000 ns) for loaded CI machines.
+      if (diff > 200000000) {
+        printf "Fail: duration (%d ns) and elapsed (%f s) mismatch (diff %d ns)\n", d, e, diff;
+        exit 1;
+      }
+      # Lower bound check: must be at least 0.5s.
+      if (d < 500000000) {
+        printf "Fail: duration (%d ns) is abnormally small\n", d;
+        exit 1;
+      }
+      exit 0;
+    }'
+  then
+    echo "stat -D test [Failed - validation failed]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+  echo "stat -D test [Success]"
+}
+
 test_default_stat
 test_null_stat
 test_offline_cpu_stat
@@ -498,6 +545,7 @@ test_stat_no_aggr
 test_stat_detailed
 test_stat_repeat
 test_stat_pid
+test_stat_delay
 
 cleanup
 exit $err
-- 
2.54.0.631.ge1b05301d1-goog


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 2/2] perf tests: Add test for stat delay option with duration_time
  2026-05-18 20:14   ` [PATCH v2 2/2] perf tests: Add test for stat delay option with duration_time Ian Rogers
@ 2026-05-18 20:55     ` sashiko-bot
  0 siblings, 0 replies; 20+ messages in thread
From: sashiko-bot @ 2026-05-18 20:55 UTC (permalink / raw)
  To: Ian Rogers; +Cc: linux-perf-users

Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
- [Medium] The `test_stat_delay` test lacks an upper bound check, failing to verify that the delay is excluded from `duration_time`.
--

commit b57a62c916c7df08ef099bd333919c50a5312b13
Author: Ian Rogers <irogers@google.com>

perf tests: Add test for stat delay option with duration_time

This commit adds a test case test_stat_delay to stat.sh to verify that
duration_time correctly excludes the delay period when using the delay
option (-D). It runs a perf stat command with a 1000ms delay and a 2 second
sleep, and checks the resulting duration_time.

> diff --git a/tools/perf/tests/shell/stat.sh b/tools/perf/tests/shell/stat.sh
> index 4edb04039036e..a9eadd57e1644 100755
> --- a/tools/perf/tests/shell/stat.sh
> +++ b/tools/perf/tests/shell/stat.sh
[ ... ]
> @@ -483,6 +483,53 @@ test_stat_pid() {
>    wait $pid 2>/dev/null || true
>  }
>  
> +test_stat_delay() {
> +  echo "stat -D test"
> +  if ! env LC_ALL=C perf stat -D 1000 -e duration_time sleep 2 > "${stat_output}" 2>&1
> +  then
> +    echo "stat -D test [Failed - command failed]"
> +    cat "${stat_output}"
> +    err=1
> +    return
> +  fi
> +
> +  duration=$(grep "duration_time" "${stat_output}" | awk '{print $1}' | tr -d ',')
> +  elapsed=$(grep "seconds time elapsed" "${stat_output}" | awk '{print $1}')
> +
> +  if [ -z "$duration" ] || [ -z "$elapsed" ]
> +  then
> +    echo "stat -D test [Failed - failed to find duration_time or time elapsed in output]"
> +    cat "${stat_output}"
> +    err=1
> +    return
> +  fi
> +
> +  # Compare duration (ns) and elapsed (s) using awk to handle float and allow tolerance.
> +  if ! awk -v d="$duration" -v e="$elapsed" '
> +    BEGIN {
> +      diff = d - (e * 1e9);
> +      if (diff < 0) diff = -diff;
> +      # Allow 200ms tolerance (200,000,000 ns) for loaded CI machines.
> +      if (diff > 200000000) {
> +        printf "Fail: duration (%d ns) and elapsed (%f s) mismatch (diff %d ns)\n", d, e, diff;
> +        exit 1;
> +      }
> +      # Lower bound check: must be at least 0.5s.
> +      if (d < 500000000) {
> +        printf "Fail: duration (%d ns) is abnormally small\n", d;
> +        exit 1;
> +      }

Is there a missing upper bound check here to verify the delay was actually
excluded from duration_time?

The commit message states the test verifies duration_time is ~1s and not ~2s.
However, if a regression caused the -D option to be ignored, both duration
and elapsed would measure the full 2 seconds of the sleep command.

In that scenario, the difference between them would be near 0 (passing the
mismatch check), and duration would be ~2s which is > 0.5s (passing the lower
bound check). The test would silently pass, failing to detect the issue.

Could an upper bound check be added to ensure duration_time is strictly less
than 2 seconds?

> +      exit 0;
> +    }'

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260518201419.2943132-1-irogers@google.com?part=2

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH v3 0/2] perf tool_pmu: Support enable/disable for tool PMU events
  2026-05-18 20:14 ` [PATCH v2 0/2] perf tool_pmu: Support enable/disable for tool PMU events Ian Rogers
  2026-05-18 20:14   ` [PATCH v2 1/2] perf tool_pmu: Make tool PMU events respect enable/disable Ian Rogers
  2026-05-18 20:14   ` [PATCH v2 2/2] perf tests: Add test for stat delay option with duration_time Ian Rogers
@ 2026-05-18 22:37   ` Ian Rogers
  2026-05-18 22:37     ` [PATCH v3 1/2] perf tool_pmu: Make tool PMU events respect enable/disable Ian Rogers
                       ` (2 more replies)
  2 siblings, 3 replies; 20+ messages in thread
From: Ian Rogers @ 2026-05-18 22:37 UTC (permalink / raw)
  To: irogers, acme, namhyung
  Cc: adrian.hunter, james.clark, jolsa, linux-kernel, linux-perf-users,
	mingo, nigro.fra, peterz, tmricht

A regression in perf stat was reported where tool PMU events (like
duration_time used in CPUs_utilized metric) incorrectly included the
delay period when using the delay option (-D).

This series fixes the regression by making tool PMU events
(duration_time, user_time, system_time) behave more like regular
counters by implementing proper enable and disable support. They now
correctly accumulate values only when enabled.

The first patch implements the core enable/disable support for tool PMU
events, and the second patch adds a shell test to verify that
duration_time correctly excludes the delay period.

Changes in v3:
- Refine group handling: only manually enable/disable group members when
  the leader or member is a non-perf-event open PMU, as the kernel allows
  grouping of software and hardware PMUs.
- Fix a file descriptor leak in evsel__tool_pmu_open() on error paths by
  explicitly closing the successfully opened fd before exiting.
- Synchronize the 'disabled' flag for all group members in enable/disable
  paths (both per-CPU and batch loops) to prevent stale disabled flags.
- Add explicit early exits to evsel__tool_pmu_enable() and disable() based
  on evsel->disabled to protect internal metric state.
- Add an upper bound check in test_stat_delay to verify that the delay was
  actually excluded.

Changes in v2:
- Implement evsel__tool_pmu_enable() and evsel__tool_pmu_disable() to
  avoid ioctl failures in batch evsel__enable() and evsel__disable()
  functions.
- Correctly iterate and enable/disable tool PMU events configured as
  non-leader members of event groups.
- Correct the lseek() arguments order in the read_stat helper:
  lseek(fd, 0, SEEK_SET) instead of lseek(fd, SEEK_SET, 0).
- Introduce INVALID_START_TIME (~0ULL) to prevent erroneous large delta
  accumulation in evsel__tool_pmu_read() if /proc/<pid>/stat fails to
  read in enable_cpu (e.g., process exited).
- Improve test parsing to use LC_ALL=C and cut to be robust against
  different locales, and use awk to dynamically compare duration_time to
  time elapsed with a 200ms tolerance (avoiding loaded CI false failures).
  Also added a lower-bound check.
- Fix style warnings from checkpatch.pl (line lengths, braces, and blank
  lines).

Ian Rogers (2):
  perf tool_pmu: Make tool PMU events respect enable/disable
  perf tests: Add test for stat delay option with duration_time

 tools/perf/tests/shell/stat.sh |  53 +++++++
 tools/perf/util/evlist.c       |  10 +-
 tools/perf/util/evsel.c        | 196 +++++++++++++++++++------
 tools/perf/util/evsel.h        |  15 +-
 tools/perf/util/tool_pmu.c     | 253 ++++++++++++++++++++++++++-------
 tools/perf/util/tool_pmu.h     |   4 +
 6 files changed, 432 insertions(+), 99 deletions(-)

-- 
2.54.0.631.ge1b05301d1-goog


^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH v3 1/2] perf tool_pmu: Make tool PMU events respect enable/disable
  2026-05-18 22:37   ` [PATCH v3 0/2] perf tool_pmu: Support enable/disable for tool PMU events Ian Rogers
@ 2026-05-18 22:37     ` Ian Rogers
  2026-05-18 23:02       ` sashiko-bot
  2026-05-18 22:37     ` [PATCH v3 2/2] perf tests: Add test for stat delay option with duration_time Ian Rogers
  2026-05-19  1:41     ` [PATCH v4 0/2] perf tool_pmu: Support enable/disable for tool PMU events Ian Rogers
  2 siblings, 1 reply; 20+ messages in thread
From: Ian Rogers @ 2026-05-18 22:37 UTC (permalink / raw)
  To: irogers, acme, namhyung
  Cc: adrian.hunter, james.clark, jolsa, linux-kernel, linux-perf-users,
	mingo, nigro.fra, peterz, tmricht

Tool PMU events (duration_time, user_time, system_time) currently
count from when the event is opened to when it is read. This causes
issues with features like the delay option (-D) or control fd, where
events are opened but should not start counting immediately.

Make these events behave more like regular counters by implementing
proper enable and disable support. Add accumulated_time to struct
evsel to track time while enabled, and implement enable/disable CPU
callbacks to start/stop counting.

Also generalize userspace PMU mixed group handling. Userspace synthetic
PMUs (type > PERF_PMU_TYPE_PE_END) do not have kernel implementations and
cannot be grouped in the kernel (opened with group_fd = -1), and are
skipped by kernel enable/disable calls. Iterate over group members in
userspace and manually enable/disable any members if the leader or the
member is a non-perf-event open PMU, and synchronize their disabled flags.

Fixes: b71f46a6a708 ("perf stat: Remove hard coded shadow metrics")
Reported-by: Francesco Nigro <nigro.fra@gmail.com>
Closes: https://lore.kernel.org/linux-perf-users/20260517093650.2540920-1-nigro.fra@gmail.com/
Signed-off-by: Ian Rogers <irogers@google.com>
Assisted-by: Antigravity:gemini-3-flash
---
 tools/perf/util/evlist.c   |  10 +-
 tools/perf/util/evsel.c    | 196 ++++++++++++++++++++++------
 tools/perf/util/evsel.h    |  15 ++-
 tools/perf/util/tool_pmu.c | 253 +++++++++++++++++++++++++++++--------
 tools/perf/util/tool_pmu.h |   4 +
 5 files changed, 379 insertions(+), 99 deletions(-)

diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index ee971d15b3c6..1a238b245b3a 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -529,7 +529,7 @@ static int evlist__is_enabled(struct evlist *evlist)
 
 static void __evlist__disable(struct evlist *evlist, char *evsel_name, bool excl_dummy)
 {
-	struct evsel *pos;
+	struct evsel *pos, *member;
 	struct evlist_cpu_iterator evlist_cpu_itr;
 	bool has_imm = false;
 
@@ -561,6 +561,9 @@ static void __evlist__disable(struct evlist *evlist, char *evsel_name, bool excl
 		if (excl_dummy && evsel__is_dummy_event(pos))
 			continue;
 		pos->disabled = true;
+
+		for_each_group_member(member, pos)
+			member->disabled = true;
 	}
 
 	/*
@@ -590,7 +593,7 @@ void evlist__disable_evsel(struct evlist *evlist, char *evsel_name)
 
 static void __evlist__enable(struct evlist *evlist, char *evsel_name, bool excl_dummy)
 {
-	struct evsel *pos;
+	struct evsel *pos, *member;
 	struct evlist_cpu_iterator evlist_cpu_itr;
 
 	evlist__for_each_cpu(evlist_cpu_itr, evlist) {
@@ -611,6 +614,9 @@ static void __evlist__enable(struct evlist *evlist, char *evsel_name, bool excl_
 		if (excl_dummy && evsel__is_dummy_event(pos))
 			continue;
 		pos->disabled = false;
+
+		for_each_group_member(member, pos)
+			member->disabled = false;
 	}
 
 	/*
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 2ee87fd84d3e..687571ac6dfc 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -11,68 +11,71 @@
  */
 #define __SANE_USERSPACE_TYPES__
 
-#include <byteswap.h>
+#include "evsel.h"
+
 #include <errno.h>
 #include <inttypes.h>
+#include <stdlib.h>
+
+#include <dirent.h>
 #include <linux/bitops.h>
-#include <api/fs/fs.h>
-#include <api/fs/tracing_path.h>
-#include <linux/hw_breakpoint.h>
-#include <linux/perf_event.h>
 #include <linux/compiler.h>
+#include <linux/ctype.h>
 #include <linux/err.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/perf_event.h>
 #include <linux/zalloc.h>
 #include <sys/ioctl.h>
 #include <sys/resource.h>
 #include <sys/syscall.h>
 #include <sys/types.h>
-#include <dirent.h>
-#include <stdlib.h>
+
+#include <api/fs/fs.h>
+#include <api/fs/tracing_path.h>
+#include <byteswap.h>
+#include <internal/lib.h>
+#include <internal/threadmap.h>
+#include <internal/xyarray.h>
+#include <perf/cpumap.h>
 #include <perf/evsel.h>
+
+#include "../perf-sys.h"
 #include "asm/bug.h"
+#include "bpf-filter.h"
 #include "bpf_counter.h"
 #include "callchain.h"
 #include "cgroup.h"
 #include "counts.h"
+#include "debug.h"
+#include "drm_pmu.h"
 #include "dwarf-regs.h"
+#include "env.h"
 #include "event.h"
-#include "evsel.h"
-#include "time-utils.h"
-#include "util/env.h"
-#include "util/evsel_config.h"
-#include "util/evsel_fprintf.h"
 #include "evlist.h"
-#include <perf/cpumap.h>
-#include "thread_map.h"
-#include "target.h"
+#include "evsel_config.h"
+#include "evsel_fprintf.h"
+#include "hashmap.h"
+#include "hist.h"
+#include "hwmon_pmu.h"
+#include "intel-tpebs.h"
+#include "memswap.h"
+#include "off_cpu.h"
+#include "parse-branch-options.h"
 #include "perf_regs.h"
+#include "pmu.h"
+#include "pmus.h"
 #include "record.h"
-#include "debug.h"
-#include "trace-event.h"
+#include "rlimit.h"
 #include "session.h"
 #include "stat.h"
 #include "string2.h"
-#include "memswap.h"
-#include "util.h"
-#include "util/hashmap.h"
-#include "off_cpu.h"
-#include "pmu.h"
-#include "pmus.h"
-#include "drm_pmu.h"
-#include "hwmon_pmu.h"
+#include "target.h"
+#include "thread_map.h"
+#include "time-utils.h"
 #include "tool_pmu.h"
 #include "tp_pmu.h"
-#include "rlimit.h"
-#include "../perf-sys.h"
-#include "util/parse-branch-options.h"
-#include "util/bpf-filter.h"
-#include "util/hist.h"
-#include <internal/xyarray.h>
-#include <internal/lib.h>
-#include <internal/threadmap.h>
-#include "util/intel-tpebs.h"
-
-#include <linux/ctype.h>
+#include "trace-event.h"
+#include "util.h"
 
 #ifdef HAVE_LIBTRACEEVENT
 #include <event-parse.h>
@@ -1795,12 +1798,66 @@ int evsel__append_addr_filter(struct evsel *evsel, const char *filter)
 /* Caller has to clear disabled after going through all CPUs. */
 int evsel__enable_cpu(struct evsel *evsel, int cpu_map_idx)
 {
-	return perf_evsel__enable_cpu(&evsel->core, cpu_map_idx);
+	int err;
+
+	if (evsel__is_tool(evsel))
+		err = evsel__tool_pmu_enable_cpu(evsel, cpu_map_idx);
+	else
+		err = perf_evsel__enable_cpu(&evsel->core, cpu_map_idx);
+
+	if (!err && evsel__is_group_leader(evsel)) {
+		struct evsel *member;
+
+		for_each_group_member(member, evsel) {
+			if (evsel__is_non_perf_event_open_pmu(evsel) ||
+			    evsel__is_non_perf_event_open_pmu(member)) {
+				/*
+				 * In a mixed PMU group, userspace PMUs are not
+				 * grouped in the kernel (opened with group_fd = -1)
+				 * and are skipped by the kernel when enabling the
+				 * group leader. We must manually enable them in
+				 * userspace.
+				 */
+				int mem_err = evsel__enable_cpu(member, cpu_map_idx);
+
+				if (mem_err)
+					return mem_err;
+			}
+		}
+	}
+	return err;
 }
 
 int evsel__enable(struct evsel *evsel)
 {
-	int err = perf_evsel__enable(&evsel->core);
+	int err;
+
+	if (evsel__is_tool(evsel))
+		err = evsel__tool_pmu_enable(evsel);
+	else
+		err = perf_evsel__enable(&evsel->core);
+
+	if (!err && evsel__is_group_leader(evsel)) {
+		struct evsel *member;
+
+		for_each_group_member(member, evsel) {
+			if (evsel__is_non_perf_event_open_pmu(evsel) ||
+			    evsel__is_non_perf_event_open_pmu(member)) {
+				/*
+				 * In a mixed PMU group, userspace PMUs are not
+				 * grouped in the kernel (opened with group_fd = -1)
+				 * and are skipped by the kernel when enabling the
+				 * group leader. We must manually enable them in
+				 * userspace.
+				 */
+				int mem_err = evsel__enable(member);
+
+				if (mem_err)
+					return mem_err;
+			}
+			member->disabled = false;
+		}
+	}
 
 	if (!err)
 		evsel->disabled = false;
@@ -1810,12 +1867,65 @@ int evsel__enable(struct evsel *evsel)
 /* Caller has to set disabled after going through all CPUs. */
 int evsel__disable_cpu(struct evsel *evsel, int cpu_map_idx)
 {
-	return perf_evsel__disable_cpu(&evsel->core, cpu_map_idx);
+	int err;
+
+	if (evsel__is_tool(evsel))
+		err = evsel__tool_pmu_disable_cpu(evsel, cpu_map_idx);
+	else
+		err = perf_evsel__disable_cpu(&evsel->core, cpu_map_idx);
+
+	if (!err && evsel__is_group_leader(evsel)) {
+		struct evsel *member;
+
+		for_each_group_member(member, evsel) {
+			if (evsel__is_non_perf_event_open_pmu(evsel) ||
+			    evsel__is_non_perf_event_open_pmu(member)) {
+				/*
+				 * In a mixed PMU group, userspace PMUs are not
+				 * grouped in the kernel and are skipped by the
+				 * kernel when disabling the group leader. We must
+				 * manually disable them in userspace.
+				 */
+				int mem_err = evsel__disable_cpu(member, cpu_map_idx);
+
+				if (mem_err)
+					return mem_err;
+			}
+		}
+	}
+	return err;
 }
 
 int evsel__disable(struct evsel *evsel)
 {
-	int err = perf_evsel__disable(&evsel->core);
+	int err;
+
+	if (evsel__is_tool(evsel))
+		err = evsel__tool_pmu_disable(evsel);
+	else
+		err = perf_evsel__disable(&evsel->core);
+
+	if (!err && evsel__is_group_leader(evsel)) {
+		struct evsel *member;
+
+		for_each_group_member(member, evsel) {
+			if (evsel__is_non_perf_event_open_pmu(evsel) ||
+			    evsel__is_non_perf_event_open_pmu(member)) {
+				/*
+				 * In a mixed PMU group, userspace PMUs are not
+				 * grouped in the kernel and are skipped by the
+				 * kernel when disabling the group leader. We must
+				 * manually disable them in userspace.
+				 */
+				int mem_err = evsel__disable(member);
+
+				if (mem_err)
+					return mem_err;
+			}
+			member->disabled = true;
+		}
+	}
+
 	/*
 	 * We mark it disabled here so that tools that disable a event can
 	 * ignore events after they disable it. I.e. the ring buffer may have
@@ -1885,8 +1995,10 @@ void evsel__exit(struct evsel *evsel)
 		evsel__priv_destructor(evsel->priv);
 	perf_evsel__object.fini(evsel);
 	if (evsel__tool_event(evsel) == TOOL_PMU__EVENT_SYSTEM_TIME ||
-	    evsel__tool_event(evsel) == TOOL_PMU__EVENT_USER_TIME)
-		xyarray__delete(evsel->start_times);
+	    evsel__tool_event(evsel) == TOOL_PMU__EVENT_USER_TIME) {
+		xyarray__delete(evsel->process_time.start_times);
+		xyarray__delete(evsel->process_time.accumulated_times);
+	}
 }
 
 void evsel__delete(struct evsel *evsel)
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 339b5c08a33d..7e3746480269 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -190,12 +190,18 @@ struct evsel {
 			double max;
 		} retirement_latency;
 		/* duration_time is a single global time. */
-		__u64 start_time;
+		struct {
+			__u64 start_time;
+			__u64 accumulated_time;
+		} duration_time;
 		/*
 		 * user_time and system_time read an initial value potentially
 		 * per-CPU or per-pid.
 		 */
-		struct xyarray *start_times;
+		struct {
+			struct xyarray *start_times;
+			struct xyarray *accumulated_times;
+		} process_time;
 	};
 	/* Is the tool's fd for /proc/pid/stat or /proc/stat. */
 	bool pid_stat;
@@ -350,6 +356,11 @@ void arch_evsel__apply_ratio_to_prev(struct evsel *evsel, struct perf_event_attr
 int evsel__set_filter(struct evsel *evsel, const char *filter);
 int evsel__append_tp_filter(struct evsel *evsel, const char *filter);
 int evsel__append_addr_filter(struct evsel *evsel, const char *filter);
+static inline bool evsel__is_non_perf_event_open_pmu(const struct evsel *evsel)
+{
+	return evsel->pmu && evsel->pmu->type > PERF_PMU_TYPE_PE_END;
+}
+
 int evsel__enable_cpu(struct evsel *evsel, int cpu_map_idx);
 int evsel__enable(struct evsel *evsel);
 int evsel__disable(struct evsel *evsel);
diff --git a/tools/perf/util/tool_pmu.c b/tools/perf/util/tool_pmu.c
index 6a9df3dc0e07..247e7a623a75 100644
--- a/tools/perf/util/tool_pmu.c
+++ b/tools/perf/util/tool_pmu.c
@@ -17,6 +17,8 @@
 #include <fcntl.h>
 #include <strings.h>
 
+#define INVALID_START_TIME ~0ULL
+
 static const char *const tool_pmu__event_names[TOOL_PMU__EVENT_MAX] = {
 	NULL,
 	"duration_time",
@@ -205,20 +207,57 @@ int evsel__tool_pmu_prepare_open(struct evsel *evsel,
 				 struct perf_cpu_map *cpus,
 				 int nthreads)
 {
-	if ((evsel__tool_event(evsel) == TOOL_PMU__EVENT_SYSTEM_TIME ||
-	     evsel__tool_event(evsel) == TOOL_PMU__EVENT_USER_TIME) &&
-	    !evsel->start_times) {
-		evsel->start_times = xyarray__new(perf_cpu_map__nr(cpus),
-						  nthreads,
-						  sizeof(__u64));
-		if (!evsel->start_times)
-			return -ENOMEM;
+	enum tool_pmu_event ev = evsel__tool_event(evsel);
+
+	if (ev == TOOL_PMU__EVENT_SYSTEM_TIME || ev == TOOL_PMU__EVENT_USER_TIME) {
+		if (!evsel->process_time.start_times) {
+			evsel->process_time.start_times =
+				xyarray__new(perf_cpu_map__nr(cpus), nthreads, sizeof(__u64));
+			if (!evsel->process_time.start_times)
+				return -ENOMEM;
+		}
+		if (!evsel->process_time.accumulated_times) {
+			evsel->process_time.accumulated_times =
+				xyarray__new(perf_cpu_map__nr(cpus), nthreads, sizeof(__u64));
+			if (!evsel->process_time.accumulated_times)
+				return -ENOMEM;
+		}
 	}
 	return 0;
 }
 
 #define FD(e, x, y) (*(int *)xyarray__entry(e->core.fd, x, y))
 
+static int tool_pmu__read_stat(struct evsel *evsel, int cpu_map_idx, int thread, __u64 *val)
+{
+	enum tool_pmu_event ev = evsel__tool_event(evsel);
+	bool system = ev == TOOL_PMU__EVENT_SYSTEM_TIME;
+	int fd = FD(evsel, cpu_map_idx, thread);
+	int err = 0;
+
+	if (fd < 0) {
+		*val = 0;
+		return 0;
+	}
+
+	lseek(fd, 0, SEEK_SET);
+	if (evsel->pid_stat) {
+		if (cpu_map_idx == 0)
+			err = read_pid_stat_field(fd, system ? 15 : 14, val);
+		else
+			*val = 0;
+	} else {
+		if (thread == 0) {
+			struct perf_cpu cpu = perf_cpu_map__cpu(evsel->core.cpus, cpu_map_idx);
+
+			err = read_stat_field(fd, cpu, system ? 3 : 1, val);
+		} else {
+			*val = 0;
+		}
+	}
+	return err;
+}
+
 int evsel__tool_pmu_open(struct evsel *evsel,
 			 struct perf_thread_map *threads,
 			 int start_cpu_map_idx, int end_cpu_map_idx)
@@ -232,7 +271,14 @@ int evsel__tool_pmu_open(struct evsel *evsel,
 	if (ev == TOOL_PMU__EVENT_DURATION_TIME) {
 		if (evsel->core.attr.sample_period) /* no sampling */
 			return -EINVAL;
-		evsel->start_time = rdclock();
+		evsel->duration_time.accumulated_time = 0;
+		if (evsel->core.attr.disabled) {
+			evsel->disabled = true;
+			evsel->duration_time.start_time = INVALID_START_TIME;
+		} else {
+			evsel->disabled = false;
+			evsel->duration_time.start_time = rdclock();
+		}
 		return 0;
 	}
 
@@ -246,8 +292,8 @@ int evsel__tool_pmu_open(struct evsel *evsel,
 				pid = perf_thread_map__pid(threads, thread);
 
 			if (ev == TOOL_PMU__EVENT_USER_TIME || ev == TOOL_PMU__EVENT_SYSTEM_TIME) {
-				bool system = ev == TOOL_PMU__EVENT_SYSTEM_TIME;
 				__u64 *start_time = NULL;
+				__u64 *accumulated_time = NULL;
 				int fd;
 
 				if (evsel->core.attr.sample_period) {
@@ -269,21 +315,25 @@ int evsel__tool_pmu_open(struct evsel *evsel,
 					err = -errno;
 					goto out_close;
 				}
-				start_time = xyarray__entry(evsel->start_times, idx, thread);
-				if (pid > -1) {
-					err = read_pid_stat_field(fd, system ? 15 : 14,
-								  start_time);
+				start_time = xyarray__entry(evsel->process_time.start_times, idx,
+							    thread);
+				accumulated_time = xyarray__entry(
+					evsel->process_time.accumulated_times, idx, thread);
+				*accumulated_time = 0;
+
+				if (evsel->core.attr.disabled) {
+					evsel->disabled = true;
+					*start_time = INVALID_START_TIME;
 				} else {
-					struct perf_cpu cpu;
-
-					cpu = perf_cpu_map__cpu(evsel->core.cpus, idx);
-					err = read_stat_field(fd, cpu, system ? 3 : 1,
-							      start_time);
+					evsel->disabled = false;
+					err = tool_pmu__read_stat(evsel, idx, thread, start_time);
+					if (err) {
+						close(fd);
+						FD(evsel, idx, thread) = -1;
+						goto out_close;
+					}
 				}
-				if (err)
-					goto out_close;
 			}
-
 		}
 	}
 	return 0;
@@ -467,10 +517,114 @@ static void perf_counts__update(struct perf_counts_values *count,
 		count->lost = 0;
 	}
 }
+int evsel__tool_pmu_enable_cpu(struct evsel *evsel, int cpu_map_idx)
+{
+	enum tool_pmu_event ev = evsel__tool_event(evsel);
+	int thread, nthreads;
+
+	if (!evsel->disabled)
+		return 0;
+
+	if (ev == TOOL_PMU__EVENT_DURATION_TIME) {
+		if (cpu_map_idx == 0) {
+			evsel->duration_time.start_time = rdclock();
+			evsel->disabled = false;
+		}
+		return 0;
+	}
+
+	if (ev == TOOL_PMU__EVENT_USER_TIME || ev == TOOL_PMU__EVENT_SYSTEM_TIME) {
+		nthreads = xyarray__max_y(evsel->process_time.start_times);
+		for (thread = 0; thread < nthreads; thread++) {
+			__u64 *start_time = xyarray__entry(evsel->process_time.start_times,
+							   cpu_map_idx, thread);
+			__u64 val;
+			int err;
+
+			err = tool_pmu__read_stat(evsel, cpu_map_idx, thread, &val);
+			if (!err)
+				*start_time = val;
+			else
+				*start_time = INVALID_START_TIME;
+		}
+	}
+	return 0;
+}
+
+int evsel__tool_pmu_enable(struct evsel *evsel)
+{
+	unsigned int idx;
+	int err = 0;
+
+	if (!evsel->disabled)
+		return 0;
+
+	for (idx = 0; idx < perf_cpu_map__nr(evsel->core.cpus); idx++) {
+		err = evsel__tool_pmu_enable_cpu(evsel, idx);
+		if (err)
+			break;
+	}
+	return err;
+}
+
+int evsel__tool_pmu_disable_cpu(struct evsel *evsel, int cpu_map_idx)
+{
+	enum tool_pmu_event ev = evsel__tool_event(evsel);
+	int thread, nthreads;
+
+	if (evsel->disabled)
+		return 0;
+
+	if (ev == TOOL_PMU__EVENT_DURATION_TIME) {
+		if (cpu_map_idx == 0) {
+			__u64 delta = rdclock() - evsel->duration_time.start_time;
+
+			evsel->duration_time.accumulated_time += delta;
+			evsel->disabled = true;
+		}
+		return 0;
+	}
+
+	if (ev == TOOL_PMU__EVENT_USER_TIME || ev == TOOL_PMU__EVENT_SYSTEM_TIME) {
+		nthreads = xyarray__max_y(evsel->process_time.start_times);
+		for (thread = 0; thread < nthreads; thread++) {
+			__u64 *start_time = xyarray__entry(evsel->process_time.start_times,
+							   cpu_map_idx, thread);
+			__u64 *accumulated_time = xyarray__entry(
+				evsel->process_time.accumulated_times, cpu_map_idx, thread);
+			__u64 val;
+			int err;
+
+			err = tool_pmu__read_stat(evsel, cpu_map_idx, thread, &val);
+			if (!err) {
+				if (*start_time != INVALID_START_TIME && val >= *start_time)
+					*accumulated_time += (val - *start_time);
+				*start_time = INVALID_START_TIME;
+			}
+		}
+	}
+	return 0;
+}
+
+int evsel__tool_pmu_disable(struct evsel *evsel)
+{
+	unsigned int idx;
+	int err = 0;
+
+	if (evsel->disabled)
+		return 0;
+
+	for (idx = 0; idx < perf_cpu_map__nr(evsel->core.cpus); idx++) {
+		err = evsel__tool_pmu_disable_cpu(evsel, idx);
+		if (err)
+			break;
+	}
+	return err;
+}
 
 int evsel__tool_pmu_read(struct evsel *evsel, int cpu_map_idx, int thread)
 {
-	__u64 *start_time, cur_time, delta_start;
+	__u64 delta_start = 0;
 	int err = 0;
 	struct perf_counts_values *count, *old_count = NULL;
 	bool adjust = false;
@@ -507,39 +661,33 @@ int evsel__tool_pmu_read(struct evsel *evsel, int cpu_map_idx, int thread)
 		return 0;
 	}
 	case TOOL_PMU__EVENT_DURATION_TIME:
-		/*
-		 * Pretend duration_time is only on the first CPU and thread, or
-		 * else aggregation will scale duration_time by the number of
-		 * CPUs/threads.
-		 */
-		start_time = &evsel->start_time;
-		if (cpu_map_idx == 0 && thread == 0)
-			cur_time = rdclock();
-		else
-			cur_time = *start_time;
+		if (cpu_map_idx == 0 && thread == 0) {
+			delta_start = evsel->duration_time.accumulated_time;
+			if (!evsel->disabled &&
+			    evsel->duration_time.start_time != INVALID_START_TIME)
+				delta_start += (rdclock() - evsel->duration_time.start_time);
+		} else {
+			delta_start = 0;
+		}
 		break;
 	case TOOL_PMU__EVENT_USER_TIME:
 	case TOOL_PMU__EVENT_SYSTEM_TIME: {
-		bool system = evsel__tool_event(evsel) == TOOL_PMU__EVENT_SYSTEM_TIME;
-		int fd = FD(evsel, cpu_map_idx, thread);
-
-		start_time = xyarray__entry(evsel->start_times, cpu_map_idx, thread);
-		lseek(fd, SEEK_SET, 0);
-		if (evsel->pid_stat) {
-			/* The event exists solely on 1 CPU. */
-			if (cpu_map_idx == 0)
-				err = read_pid_stat_field(fd, system ? 15 : 14, &cur_time);
-			else
-				cur_time = 0;
-		} else {
-			/* The event is for all threads. */
-			if (thread == 0) {
-				struct perf_cpu cpu = perf_cpu_map__cpu(evsel->core.cpus,
-									cpu_map_idx);
+		__u64 accumulated = *(__u64 *)xyarray__entry(evsel->process_time.accumulated_times,
+							     cpu_map_idx, thread);
 
-				err = read_stat_field(fd, cpu, system ? 3 : 1, &cur_time);
-			} else {
-				cur_time = 0;
+		if (evsel->disabled) {
+			delta_start = accumulated;
+		} else {
+			__u64 *start_time = xyarray__entry(evsel->process_time.start_times,
+							   cpu_map_idx, thread);
+			__u64 cur_time;
+
+			err = tool_pmu__read_stat(evsel, cpu_map_idx, thread, &cur_time);
+			if (!err) {
+				if (*start_time != INVALID_START_TIME && cur_time >= *start_time)
+					delta_start = accumulated + (cur_time - *start_time);
+				else
+					delta_start = accumulated;
 			}
 		}
 		adjust = true;
@@ -553,7 +701,6 @@ int evsel__tool_pmu_read(struct evsel *evsel, int cpu_map_idx, int thread)
 	if (err)
 		return err;
 
-	delta_start = cur_time - *start_time;
 	if (adjust) {
 		__u64 ticks_per_sec = sysconf(_SC_CLK_TCK);
 
diff --git a/tools/perf/util/tool_pmu.h b/tools/perf/util/tool_pmu.h
index f1714001bc1d..f66d24cf3502 100644
--- a/tools/perf/util/tool_pmu.h
+++ b/tools/perf/util/tool_pmu.h
@@ -56,6 +56,10 @@ int evsel__tool_pmu_prepare_open(struct evsel *evsel,
 int evsel__tool_pmu_open(struct evsel *evsel,
 			 struct perf_thread_map *threads,
 			 int start_cpu_map_idx, int end_cpu_map_idx);
+int evsel__tool_pmu_enable_cpu(struct evsel *evsel, int cpu_map_idx);
+int evsel__tool_pmu_enable(struct evsel *evsel);
+int evsel__tool_pmu_disable_cpu(struct evsel *evsel, int cpu_map_idx);
+int evsel__tool_pmu_disable(struct evsel *evsel);
 int evsel__tool_pmu_read(struct evsel *evsel, int cpu_map_idx, int thread);
 
 struct perf_pmu *tool_pmu__new(void);
-- 
2.54.0.631.ge1b05301d1-goog


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 1/2] perf tool_pmu: Make tool PMU events respect enable/disable
  2026-05-18 22:37     ` [PATCH v3 1/2] perf tool_pmu: Make tool PMU events respect enable/disable Ian Rogers
@ 2026-05-18 23:02       ` sashiko-bot
  0 siblings, 0 replies; 20+ messages in thread
From: sashiko-bot @ 2026-05-18 23:02 UTC (permalink / raw)
  To: Ian Rogers; +Cc: linux-perf-users

Thank you for your contribution! Sashiko AI review found 3 potential issue(s) to consider:
- [Medium] Inconsistent disabled state on early return in evsel__enable() and evsel__disable().
- [Medium] evsel__tool_pmu_disable_cpu() leaves start_time intact if reading stats fails, causing disabled CPUs to keep counting.
- [Medium] Disabling a duration_time group leader prematurely sets its disabled flag, causing __evlist__disable() to skip member events on CPUs > 0.
--

commit c37afe120a59773ddf6b54a63fa825fceec20137
Author: Ian Rogers <irogers@google.com>

perf tool_pmu: Make tool PMU events respect enable/disable

Tool PMU events (duration_time, user_time, system_time) currently
count from when the event is opened to when it is read. This causes
issues with features like the delay option (-D) or control fd, where
events are opened but should not start counting immediately. Make these
events behave more like regular counters by implementing proper enable
and disable support.

Link: https://lore.kernel.org/linux-perf-users/20260517093650.2540920-1-nigro.fra@gmail.com/

> diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
> --- a/tools/perf/util/evsel.c
> +++ b/tools/perf/util/evsel.c
[ ... ]
> @@ -1810,12 +1867,65 @@ int evsel__enable(struct evsel *evsel)
>  int evsel__enable(struct evsel *evsel)
>  {
> -	int err = perf_evsel__enable(&evsel->core);
> +	int err;
> +
> +	if (evsel__is_tool(evsel))
> +		err = evsel__tool_pmu_enable(evsel);
> +	else
> +		err = perf_evsel__enable(&evsel->core);
> +
> +	if (!err && evsel__is_group_leader(evsel)) {
> +		struct evsel *member;
> +
> +		for_each_group_member(member, evsel) {
> +			if (evsel__is_non_perf_event_open_pmu(evsel) ||
> +			    evsel__is_non_perf_event_open_pmu(member)) {
[ ... ]
> +				int mem_err = evsel__enable(member);
> +
> +				if (mem_err)
> +					return mem_err;

Does this early return on mem_err skip setting evsel->disabled = false at
the end of the function? It looks like this could leave the group leader with
its software disabled flag set to true, even though the kernel already
successfully enabled it via perf_evsel__enable(&evsel->core).

> +			}
> +			member->disabled = false;
> +		}
> +	}
>  
>  	if (!err)
>  		evsel->disabled = false;
>  	return err;
>  }
[ ... ]
>  int evsel__disable(struct evsel *evsel)
>  {
> -	int err = perf_evsel__disable(&evsel->core);
> +	int err;
> +
> +	if (evsel__is_tool(evsel))
> +		err = evsel__tool_pmu_disable(evsel);
> +	else
> +		err = perf_evsel__disable(&evsel->core);
> +
> +	if (!err && evsel__is_group_leader(evsel)) {
> +		struct evsel *member;
> +
> +		for_each_group_member(member, evsel) {
> +			if (evsel__is_non_perf_event_open_pmu(evsel) ||
> +			    evsel__is_non_perf_event_open_pmu(member)) {
[ ... ]
> +				int mem_err = evsel__disable(member);
> +
> +				if (mem_err)
> +					return mem_err;

A similar early return happens here. If evsel__disable(member) fails, it
skips marking the group leader evsel->disabled = true at the bottom of the
function. Could this leave the state inconsistent for the leader?

> +			}
> +			member->disabled = true;
> +		}
> +	}

> diff --git a/tools/perf/util/tool_pmu.c b/tools/perf/util/tool_pmu.c
> --- a/tools/perf/util/tool_pmu.c
> +++ b/tools/perf/util/tool_pmu.c
[ ... ]
> +int evsel__tool_pmu_disable_cpu(struct evsel *evsel, int cpu_map_idx)
> +{
> +	enum tool_pmu_event ev = evsel__tool_event(evsel);
> +	int thread, nthreads;
> +
> +	if (evsel->disabled)
> +		return 0;
> +
> +	if (ev == TOOL_PMU__EVENT_DURATION_TIME) {
> +		if (cpu_map_idx == 0) {
> +			__u64 delta = rdclock() - evsel->duration_time.start_time;
> +
> +			evsel->duration_time.accumulated_time += delta;
> +			evsel->disabled = true;

Since evsel->disabled = true is set globally here when processing
cpu_map_idx 0, will __evlist__disable() see pos->disabled == true for this
group leader when iterating over the remaining CPUs?

If so, would it skip calling evsel__disable_cpu() for CPUs > 0, failing to
disable the associated member events on those CPUs?

> +		}
> +		return 0;
> +	}
> +
> +	if (ev == TOOL_PMU__EVENT_USER_TIME || ev == TOOL_PMU__EVENT_SYSTEM_TIME) {
> +		nthreads = xyarray__max_y(evsel->process_time.start_times);
> +		for (thread = 0; thread < nthreads; thread++) {
> +			__u64 *start_time = xyarray__entry(evsel->process_time.start_times,
> +							   cpu_map_idx, thread);
> +			__u64 *accumulated_time = xyarray__entry(
> +				evsel->process_time.accumulated_times, cpu_map_idx, thread);
> +			__u64 val;
> +			int err;
> +
> +			err = tool_pmu__read_stat(evsel, cpu_map_idx, thread, &val);
> +			if (!err) {
> +				if (*start_time != INVALID_START_TIME && val >= *start_time)
> +					*accumulated_time += (val - *start_time);
> +				*start_time = INVALID_START_TIME;
> +			}

If tool_pmu__read_stat() fails here (for instance, due to transient /proc
errors or a terminating thread), does leaving *start_time intact cause
subsequent calls to evsel__tool_pmu_read() to continue calculating deltas
using the lingering *start_time and erroneously add them to accumulated_time?

Should *start_time = INVALID_START_TIME; be done unconditionally outside
the if (!err) block?

> +		}
> +	}
> +	return 0;
> +}

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260518223733.3034897-1-irogers@google.com?part=1

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH v3 2/2] perf tests: Add test for stat delay option with duration_time
  2026-05-18 22:37   ` [PATCH v3 0/2] perf tool_pmu: Support enable/disable for tool PMU events Ian Rogers
  2026-05-18 22:37     ` [PATCH v3 1/2] perf tool_pmu: Make tool PMU events respect enable/disable Ian Rogers
@ 2026-05-18 22:37     ` Ian Rogers
  2026-05-19  1:41     ` [PATCH v4 0/2] perf tool_pmu: Support enable/disable for tool PMU events Ian Rogers
  2 siblings, 0 replies; 20+ messages in thread
From: Ian Rogers @ 2026-05-18 22:37 UTC (permalink / raw)
  To: irogers, acme, namhyung
  Cc: adrian.hunter, james.clark, jolsa, linux-kernel, linux-perf-users,
	mingo, nigro.fra, peterz, tmricht

Add a new test case `test_stat_delay` to `stat.sh` to verify that
`duration_time` correctly excludes the delay period when using the
delay option (-D).

The test runs `perf stat -D 1000 -e duration_time sleep 2` and
verifies that `duration_time` is ~1s (excluding the 1s delay), not
~2s.

Signed-off-by: Ian Rogers <irogers@google.com>
Assisted-by: Antigravity:gemini-3-flash
---
 tools/perf/tests/shell/stat.sh | 53 ++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/tools/perf/tests/shell/stat.sh b/tools/perf/tests/shell/stat.sh
index 4edb04039036..1e17bee026bd 100755
--- a/tools/perf/tests/shell/stat.sh
+++ b/tools/perf/tests/shell/stat.sh
@@ -483,6 +483,58 @@ test_stat_pid() {
   wait $pid 2>/dev/null || true
 }
 
+test_stat_delay() {
+  echo "stat -D test"
+  if ! env LC_ALL=C perf stat -D 1000 -e duration_time sleep 2 > "${stat_output}" 2>&1
+  then
+    echo "stat -D test [Failed - command failed]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+
+  duration=$(grep "duration_time" "${stat_output}" | awk '{print $1}' | tr -d ',')
+  elapsed=$(grep "seconds time elapsed" "${stat_output}" | awk '{print $1}')
+
+  if [ -z "$duration" ] || [ -z "$elapsed" ]
+  then
+    echo "stat -D test [Failed - failed to find duration_time or time elapsed in output]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+
+  # Compare duration (ns) and elapsed (s) using awk to handle float and allow tolerance.
+  if ! awk -v d="$duration" -v e="$elapsed" '
+    BEGIN {
+      diff = d - (e * 1e9);
+      if (diff < 0) diff = -diff;
+      # Allow 200ms tolerance (200,000,000 ns) for loaded CI machines.
+      if (diff > 200000000) {
+        printf "Fail: duration (%d ns) and elapsed (%f s) mismatch (diff %d ns)\n", d, e, diff;
+        exit 1;
+      }
+      # Lower bound check: must be at least 0.5s.
+      if (d < 500000000) {
+        printf "Fail: duration (%d ns) is abnormally small\n", d;
+        exit 1;
+      }
+      # Upper bound check: must be strictly less than 1.7s (proving delay was excluded).
+      if (d > 1700000000) {
+        printf "Fail: duration (%d ns) is too large (delay might not be excluded)\n", d;
+        exit 1;
+      }
+      exit 0;
+    }'
+  then
+    echo "stat -D test [Failed - validation failed]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+  echo "stat -D test [Success]"
+}
+
 test_default_stat
 test_null_stat
 test_offline_cpu_stat
@@ -498,6 +550,7 @@ test_stat_no_aggr
 test_stat_detailed
 test_stat_repeat
 test_stat_pid
+test_stat_delay
 
 cleanup
 exit $err
-- 
2.54.0.631.ge1b05301d1-goog


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH v4 0/2] perf tool_pmu: Support enable/disable for tool PMU events
  2026-05-18 22:37   ` [PATCH v3 0/2] perf tool_pmu: Support enable/disable for tool PMU events Ian Rogers
  2026-05-18 22:37     ` [PATCH v3 1/2] perf tool_pmu: Make tool PMU events respect enable/disable Ian Rogers
  2026-05-18 22:37     ` [PATCH v3 2/2] perf tests: Add test for stat delay option with duration_time Ian Rogers
@ 2026-05-19  1:41     ` Ian Rogers
  2026-05-19  1:41       ` [PATCH v4 1/2] perf tool_pmu: Make tool PMU events respect enable/disable Ian Rogers
  2026-05-19  1:41       ` [PATCH v4 2/2] perf tests: Add test for stat delay option with duration_time Ian Rogers
  2 siblings, 2 replies; 20+ messages in thread
From: Ian Rogers @ 2026-05-19  1:41 UTC (permalink / raw)
  To: irogers, acme, namhyung
  Cc: adrian.hunter, james.clark, jolsa, linux-kernel, linux-perf-users,
	mingo, nigro.fra, peterz, tmricht

A regression in perf stat was reported where tool PMU events (like
duration_time used in CPUs_utilized metric) incorrectly included the
delay period when using the delay option (-D).

This series fixes the regression by making tool PMU events
(duration_time, user_time, system_time) behave more like regular
counters by implementing proper enable and disable support. They now
correctly accumulate values only when enabled.

The first patch implements the core enable/disable support for tool PMU
events, and the second patch adds a shell test to verify that
duration_time correctly excludes the delay period.

Changes in v4:
- Update evsel->disabled immediately after the leader's own enable/disable
  succeeds in evsel__enable() / evsel__disable(), preventing state
  inconsistency on early return if a member fails.
- Remove evsel->disabled state changes from within the CPU loops in
  evsel__tool_pmu_enable_cpu() and evsel__tool_pmu_disable_cpu() to
  prevent __evlist__disable() from skipping disable_cpu calls for other
  CPUs.
- Make *start_time = INVALID_START_TIME unconditional in
  evsel__tool_pmu_disable_cpu() to ensure safe inactive state invalidation.
- Address a checkpatch warning regarding unnecessary braces for a single
  statement if block.

Changes in v3:
- Refine group handling: only manually enable/disable group members when
  the leader or member is a non-perf-event open PMU, as the kernel allows
  grouping of software and hardware PMUs.
- Fix a file descriptor leak in evsel__tool_pmu_open() on error paths by
  explicitly closing the successfully opened fd before exiting.
- Synchronize the 'disabled' flag for all group members in enable/disable
  paths (both per-CPU and batch loops) to prevent stale disabled flags.
- Add explicit early exits to evsel__tool_pmu_enable() and disable() based
  on evsel->disabled to protect internal metric state.
- Add an upper bound check in test_stat_delay to verify that the delay was
  actually excluded.

Changes in v2:
- Implement evsel__tool_pmu_enable() and evsel__tool_pmu_disable() to
  avoid ioctl failures in batch evsel__enable() and evsel__disable()
  functions.
- Correctly iterate and enable/disable tool PMU events configured as
  non-leader members of event groups.
- Correct the lseek() arguments order in the read_stat helper:
  lseek(fd, 0, SEEK_SET) instead of lseek(fd, SEEK_SET, 0).
- Introduce INVALID_START_TIME (~0ULL) to prevent erroneous large delta
  accumulation in evsel__tool_pmu_read() if /proc/<pid>/stat fails to
  read in enable_cpu (e.g., process exited).
- Improve test parsing to use LC_ALL=C and cut to be robust against
  different locales, and use awk to dynamically compare duration_time to
  time elapsed with a 200ms tolerance (avoiding loaded CI false failures).
  Also added a lower-bound check.
- Fix style warnings from checkpatch.pl (line lengths, braces, and blank
  lines).

Ian Rogers (2):
  perf tool_pmu: Make tool PMU events respect enable/disable
  perf tests: Add test for stat delay option with duration_time

 tools/perf/tests/shell/stat.sh |  53 +++++++
 tools/perf/util/evlist.c       |  10 +-
 tools/perf/util/evsel.c        | 197 ++++++++++++++++++++------
 tools/perf/util/evsel.h        |  15 +-
 tools/perf/util/tool_pmu.c     | 250 ++++++++++++++++++++++++++-------
 tools/perf/util/tool_pmu.h     |   4 +
 6 files changed, 430 insertions(+), 99 deletions(-)

-- 
2.54.0.631.ge1b05301d1-goog


^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH v4 1/2] perf tool_pmu: Make tool PMU events respect enable/disable
  2026-05-19  1:41     ` [PATCH v4 0/2] perf tool_pmu: Support enable/disable for tool PMU events Ian Rogers
@ 2026-05-19  1:41       ` Ian Rogers
  2026-05-19  6:43         ` Namhyung Kim
  2026-05-19  1:41       ` [PATCH v4 2/2] perf tests: Add test for stat delay option with duration_time Ian Rogers
  1 sibling, 1 reply; 20+ messages in thread
From: Ian Rogers @ 2026-05-19  1:41 UTC (permalink / raw)
  To: irogers, acme, namhyung
  Cc: adrian.hunter, james.clark, jolsa, linux-kernel, linux-perf-users,
	mingo, nigro.fra, peterz, tmricht

Tool PMU events (duration_time, user_time, system_time) currently
count from when the event is opened to when it is read. This causes
issues with features like the delay option (-D) or control fd, where
events are opened but should not start counting immediately.

Make these events behave more like regular counters by implementing
proper enable and disable support. Add accumulated_time to struct
evsel to track time while enabled, and implement enable/disable CPU
callbacks to start/stop counting.

Also generalize userspace PMU mixed group handling. Userspace synthetic
PMUs (type > PERF_PMU_TYPE_PE_END) do not have kernel implementations and
cannot be grouped in the kernel (opened with group_fd = -1), and are
skipped by kernel enable/disable calls. Iterate over group members in
userspace and manually enable/disable any members if the leader or the
member is a non-perf-event open PMU, and synchronize their disabled flags.

Fixes: b71f46a6a708 ("perf stat: Remove hard coded shadow metrics")
Reported-by: Francesco Nigro <nigro.fra@gmail.com>
Closes: https://lore.kernel.org/linux-perf-users/20260517093650.2540920-1-nigro.fra@gmail.com/
Signed-off-by: Ian Rogers <irogers@google.com>
Assisted-by: Antigravity:gemini-3-flash
---
 tools/perf/util/evlist.c   |  10 +-
 tools/perf/util/evsel.c    | 197 ++++++++++++++++++++++-------
 tools/perf/util/evsel.h    |  15 ++-
 tools/perf/util/tool_pmu.c | 250 +++++++++++++++++++++++++++++--------
 tools/perf/util/tool_pmu.h |   4 +
 5 files changed, 377 insertions(+), 99 deletions(-)

diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index ee971d15b3c6..1a238b245b3a 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -529,7 +529,7 @@ static int evlist__is_enabled(struct evlist *evlist)
 
 static void __evlist__disable(struct evlist *evlist, char *evsel_name, bool excl_dummy)
 {
-	struct evsel *pos;
+	struct evsel *pos, *member;
 	struct evlist_cpu_iterator evlist_cpu_itr;
 	bool has_imm = false;
 
@@ -561,6 +561,9 @@ static void __evlist__disable(struct evlist *evlist, char *evsel_name, bool excl
 		if (excl_dummy && evsel__is_dummy_event(pos))
 			continue;
 		pos->disabled = true;
+
+		for_each_group_member(member, pos)
+			member->disabled = true;
 	}
 
 	/*
@@ -590,7 +593,7 @@ void evlist__disable_evsel(struct evlist *evlist, char *evsel_name)
 
 static void __evlist__enable(struct evlist *evlist, char *evsel_name, bool excl_dummy)
 {
-	struct evsel *pos;
+	struct evsel *pos, *member;
 	struct evlist_cpu_iterator evlist_cpu_itr;
 
 	evlist__for_each_cpu(evlist_cpu_itr, evlist) {
@@ -611,6 +614,9 @@ static void __evlist__enable(struct evlist *evlist, char *evsel_name, bool excl_
 		if (excl_dummy && evsel__is_dummy_event(pos))
 			continue;
 		pos->disabled = false;
+
+		for_each_group_member(member, pos)
+			member->disabled = false;
 	}
 
 	/*
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 2ee87fd84d3e..8a80d2e15f5c 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -11,68 +11,71 @@
  */
 #define __SANE_USERSPACE_TYPES__
 
-#include <byteswap.h>
+#include "evsel.h"
+
 #include <errno.h>
 #include <inttypes.h>
+#include <stdlib.h>
+
+#include <dirent.h>
 #include <linux/bitops.h>
-#include <api/fs/fs.h>
-#include <api/fs/tracing_path.h>
-#include <linux/hw_breakpoint.h>
-#include <linux/perf_event.h>
 #include <linux/compiler.h>
+#include <linux/ctype.h>
 #include <linux/err.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/perf_event.h>
 #include <linux/zalloc.h>
 #include <sys/ioctl.h>
 #include <sys/resource.h>
 #include <sys/syscall.h>
 #include <sys/types.h>
-#include <dirent.h>
-#include <stdlib.h>
+
+#include <api/fs/fs.h>
+#include <api/fs/tracing_path.h>
+#include <byteswap.h>
+#include <internal/lib.h>
+#include <internal/threadmap.h>
+#include <internal/xyarray.h>
+#include <perf/cpumap.h>
 #include <perf/evsel.h>
+
+#include "../perf-sys.h"
 #include "asm/bug.h"
+#include "bpf-filter.h"
 #include "bpf_counter.h"
 #include "callchain.h"
 #include "cgroup.h"
 #include "counts.h"
+#include "debug.h"
+#include "drm_pmu.h"
 #include "dwarf-regs.h"
+#include "env.h"
 #include "event.h"
-#include "evsel.h"
-#include "time-utils.h"
-#include "util/env.h"
-#include "util/evsel_config.h"
-#include "util/evsel_fprintf.h"
 #include "evlist.h"
-#include <perf/cpumap.h>
-#include "thread_map.h"
-#include "target.h"
+#include "evsel_config.h"
+#include "evsel_fprintf.h"
+#include "hashmap.h"
+#include "hist.h"
+#include "hwmon_pmu.h"
+#include "intel-tpebs.h"
+#include "memswap.h"
+#include "off_cpu.h"
+#include "parse-branch-options.h"
 #include "perf_regs.h"
+#include "pmu.h"
+#include "pmus.h"
 #include "record.h"
-#include "debug.h"
-#include "trace-event.h"
+#include "rlimit.h"
 #include "session.h"
 #include "stat.h"
 #include "string2.h"
-#include "memswap.h"
-#include "util.h"
-#include "util/hashmap.h"
-#include "off_cpu.h"
-#include "pmu.h"
-#include "pmus.h"
-#include "drm_pmu.h"
-#include "hwmon_pmu.h"
+#include "target.h"
+#include "thread_map.h"
+#include "time-utils.h"
 #include "tool_pmu.h"
 #include "tp_pmu.h"
-#include "rlimit.h"
-#include "../perf-sys.h"
-#include "util/parse-branch-options.h"
-#include "util/bpf-filter.h"
-#include "util/hist.h"
-#include <internal/xyarray.h>
-#include <internal/lib.h>
-#include <internal/threadmap.h>
-#include "util/intel-tpebs.h"
-
-#include <linux/ctype.h>
+#include "trace-event.h"
+#include "util.h"
 
 #ifdef HAVE_LIBTRACEEVENT
 #include <event-parse.h>
@@ -1795,27 +1798,114 @@ int evsel__append_addr_filter(struct evsel *evsel, const char *filter)
 /* Caller has to clear disabled after going through all CPUs. */
 int evsel__enable_cpu(struct evsel *evsel, int cpu_map_idx)
 {
-	return perf_evsel__enable_cpu(&evsel->core, cpu_map_idx);
+	int err;
+
+	if (evsel__is_tool(evsel))
+		err = evsel__tool_pmu_enable_cpu(evsel, cpu_map_idx);
+	else
+		err = perf_evsel__enable_cpu(&evsel->core, cpu_map_idx);
+
+	if (!err && evsel__is_group_leader(evsel)) {
+		struct evsel *member;
+
+		for_each_group_member(member, evsel) {
+			if (evsel__is_non_perf_event_open_pmu(evsel) ||
+			    evsel__is_non_perf_event_open_pmu(member)) {
+				/*
+				 * In a mixed PMU group, userspace PMUs are not
+				 * grouped in the kernel (opened with group_fd = -1)
+				 * and are skipped by the kernel when enabling the
+				 * group leader. We must manually enable them in
+				 * userspace.
+				 */
+				int mem_err = evsel__enable_cpu(member, cpu_map_idx);
+
+				if (mem_err)
+					return mem_err;
+			}
+		}
+	}
+	return err;
 }
 
 int evsel__enable(struct evsel *evsel)
 {
-	int err = perf_evsel__enable(&evsel->core);
+	int err;
+
+	if (evsel__is_tool(evsel))
+		err = evsel__tool_pmu_enable(evsel);
+	else
+		err = perf_evsel__enable(&evsel->core);
 
 	if (!err)
 		evsel->disabled = false;
+
+	if (!err && evsel__is_group_leader(evsel)) {
+		struct evsel *member;
+
+		for_each_group_member(member, evsel) {
+			if (evsel__is_non_perf_event_open_pmu(evsel) ||
+			    evsel__is_non_perf_event_open_pmu(member)) {
+				/*
+				 * In a mixed PMU group, userspace PMUs are not
+				 * grouped in the kernel (opened with group_fd = -1)
+				 * and are skipped by the kernel when enabling the
+				 * group leader. We must manually enable them in
+				 * userspace.
+				 */
+				int mem_err = evsel__enable(member);
+
+				if (mem_err)
+					return mem_err;
+			}
+			member->disabled = false;
+		}
+	}
+
 	return err;
 }
 
 /* Caller has to set disabled after going through all CPUs. */
 int evsel__disable_cpu(struct evsel *evsel, int cpu_map_idx)
 {
-	return perf_evsel__disable_cpu(&evsel->core, cpu_map_idx);
+	int err;
+
+	if (evsel__is_tool(evsel))
+		err = evsel__tool_pmu_disable_cpu(evsel, cpu_map_idx);
+	else
+		err = perf_evsel__disable_cpu(&evsel->core, cpu_map_idx);
+
+	if (!err && evsel__is_group_leader(evsel)) {
+		struct evsel *member;
+
+		for_each_group_member(member, evsel) {
+			if (evsel__is_non_perf_event_open_pmu(evsel) ||
+			    evsel__is_non_perf_event_open_pmu(member)) {
+				/*
+				 * In a mixed PMU group, userspace PMUs are not
+				 * grouped in the kernel and are skipped by the
+				 * kernel when disabling the group leader. We must
+				 * manually disable them in userspace.
+				 */
+				int mem_err = evsel__disable_cpu(member, cpu_map_idx);
+
+				if (mem_err)
+					return mem_err;
+			}
+		}
+	}
+	return err;
 }
 
 int evsel__disable(struct evsel *evsel)
 {
-	int err = perf_evsel__disable(&evsel->core);
+	int err;
+
+	if (evsel__is_tool(evsel))
+		err = evsel__tool_pmu_disable(evsel);
+	else
+		err = perf_evsel__disable(&evsel->core);
+
 	/*
 	 * We mark it disabled here so that tools that disable a event can
 	 * ignore events after they disable it. I.e. the ring buffer may have
@@ -1825,6 +1915,27 @@ int evsel__disable(struct evsel *evsel)
 	if (!err)
 		evsel->disabled = true;
 
+	if (!err && evsel__is_group_leader(evsel)) {
+		struct evsel *member;
+
+		for_each_group_member(member, evsel) {
+			if (evsel__is_non_perf_event_open_pmu(evsel) ||
+			    evsel__is_non_perf_event_open_pmu(member)) {
+				/*
+				 * In a mixed PMU group, userspace PMUs are not
+				 * grouped in the kernel and are skipped by the
+				 * kernel when disabling the group leader. We must
+				 * manually disable them in userspace.
+				 */
+				int mem_err = evsel__disable(member);
+
+				if (mem_err)
+					return mem_err;
+			}
+			member->disabled = true;
+		}
+	}
+
 	return err;
 }
 
@@ -1885,8 +1996,10 @@ void evsel__exit(struct evsel *evsel)
 		evsel__priv_destructor(evsel->priv);
 	perf_evsel__object.fini(evsel);
 	if (evsel__tool_event(evsel) == TOOL_PMU__EVENT_SYSTEM_TIME ||
-	    evsel__tool_event(evsel) == TOOL_PMU__EVENT_USER_TIME)
-		xyarray__delete(evsel->start_times);
+	    evsel__tool_event(evsel) == TOOL_PMU__EVENT_USER_TIME) {
+		xyarray__delete(evsel->process_time.start_times);
+		xyarray__delete(evsel->process_time.accumulated_times);
+	}
 }
 
 void evsel__delete(struct evsel *evsel)
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 339b5c08a33d..7e3746480269 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -190,12 +190,18 @@ struct evsel {
 			double max;
 		} retirement_latency;
 		/* duration_time is a single global time. */
-		__u64 start_time;
+		struct {
+			__u64 start_time;
+			__u64 accumulated_time;
+		} duration_time;
 		/*
 		 * user_time and system_time read an initial value potentially
 		 * per-CPU or per-pid.
 		 */
-		struct xyarray *start_times;
+		struct {
+			struct xyarray *start_times;
+			struct xyarray *accumulated_times;
+		} process_time;
 	};
 	/* Is the tool's fd for /proc/pid/stat or /proc/stat. */
 	bool pid_stat;
@@ -350,6 +356,11 @@ void arch_evsel__apply_ratio_to_prev(struct evsel *evsel, struct perf_event_attr
 int evsel__set_filter(struct evsel *evsel, const char *filter);
 int evsel__append_tp_filter(struct evsel *evsel, const char *filter);
 int evsel__append_addr_filter(struct evsel *evsel, const char *filter);
+static inline bool evsel__is_non_perf_event_open_pmu(const struct evsel *evsel)
+{
+	return evsel->pmu && evsel->pmu->type > PERF_PMU_TYPE_PE_END;
+}
+
 int evsel__enable_cpu(struct evsel *evsel, int cpu_map_idx);
 int evsel__enable(struct evsel *evsel);
 int evsel__disable(struct evsel *evsel);
diff --git a/tools/perf/util/tool_pmu.c b/tools/perf/util/tool_pmu.c
index 6a9df3dc0e07..5c30854b4644 100644
--- a/tools/perf/util/tool_pmu.c
+++ b/tools/perf/util/tool_pmu.c
@@ -17,6 +17,8 @@
 #include <fcntl.h>
 #include <strings.h>
 
+#define INVALID_START_TIME ~0ULL
+
 static const char *const tool_pmu__event_names[TOOL_PMU__EVENT_MAX] = {
 	NULL,
 	"duration_time",
@@ -205,20 +207,57 @@ int evsel__tool_pmu_prepare_open(struct evsel *evsel,
 				 struct perf_cpu_map *cpus,
 				 int nthreads)
 {
-	if ((evsel__tool_event(evsel) == TOOL_PMU__EVENT_SYSTEM_TIME ||
-	     evsel__tool_event(evsel) == TOOL_PMU__EVENT_USER_TIME) &&
-	    !evsel->start_times) {
-		evsel->start_times = xyarray__new(perf_cpu_map__nr(cpus),
-						  nthreads,
-						  sizeof(__u64));
-		if (!evsel->start_times)
-			return -ENOMEM;
+	enum tool_pmu_event ev = evsel__tool_event(evsel);
+
+	if (ev == TOOL_PMU__EVENT_SYSTEM_TIME || ev == TOOL_PMU__EVENT_USER_TIME) {
+		if (!evsel->process_time.start_times) {
+			evsel->process_time.start_times =
+				xyarray__new(perf_cpu_map__nr(cpus), nthreads, sizeof(__u64));
+			if (!evsel->process_time.start_times)
+				return -ENOMEM;
+		}
+		if (!evsel->process_time.accumulated_times) {
+			evsel->process_time.accumulated_times =
+				xyarray__new(perf_cpu_map__nr(cpus), nthreads, sizeof(__u64));
+			if (!evsel->process_time.accumulated_times)
+				return -ENOMEM;
+		}
 	}
 	return 0;
 }
 
 #define FD(e, x, y) (*(int *)xyarray__entry(e->core.fd, x, y))
 
+static int tool_pmu__read_stat(struct evsel *evsel, int cpu_map_idx, int thread, __u64 *val)
+{
+	enum tool_pmu_event ev = evsel__tool_event(evsel);
+	bool system = ev == TOOL_PMU__EVENT_SYSTEM_TIME;
+	int fd = FD(evsel, cpu_map_idx, thread);
+	int err = 0;
+
+	if (fd < 0) {
+		*val = 0;
+		return 0;
+	}
+
+	lseek(fd, 0, SEEK_SET);
+	if (evsel->pid_stat) {
+		if (cpu_map_idx == 0)
+			err = read_pid_stat_field(fd, system ? 15 : 14, val);
+		else
+			*val = 0;
+	} else {
+		if (thread == 0) {
+			struct perf_cpu cpu = perf_cpu_map__cpu(evsel->core.cpus, cpu_map_idx);
+
+			err = read_stat_field(fd, cpu, system ? 3 : 1, val);
+		} else {
+			*val = 0;
+		}
+	}
+	return err;
+}
+
 int evsel__tool_pmu_open(struct evsel *evsel,
 			 struct perf_thread_map *threads,
 			 int start_cpu_map_idx, int end_cpu_map_idx)
@@ -232,7 +271,14 @@ int evsel__tool_pmu_open(struct evsel *evsel,
 	if (ev == TOOL_PMU__EVENT_DURATION_TIME) {
 		if (evsel->core.attr.sample_period) /* no sampling */
 			return -EINVAL;
-		evsel->start_time = rdclock();
+		evsel->duration_time.accumulated_time = 0;
+		if (evsel->core.attr.disabled) {
+			evsel->disabled = true;
+			evsel->duration_time.start_time = INVALID_START_TIME;
+		} else {
+			evsel->disabled = false;
+			evsel->duration_time.start_time = rdclock();
+		}
 		return 0;
 	}
 
@@ -246,8 +292,8 @@ int evsel__tool_pmu_open(struct evsel *evsel,
 				pid = perf_thread_map__pid(threads, thread);
 
 			if (ev == TOOL_PMU__EVENT_USER_TIME || ev == TOOL_PMU__EVENT_SYSTEM_TIME) {
-				bool system = ev == TOOL_PMU__EVENT_SYSTEM_TIME;
 				__u64 *start_time = NULL;
+				__u64 *accumulated_time = NULL;
 				int fd;
 
 				if (evsel->core.attr.sample_period) {
@@ -269,21 +315,25 @@ int evsel__tool_pmu_open(struct evsel *evsel,
 					err = -errno;
 					goto out_close;
 				}
-				start_time = xyarray__entry(evsel->start_times, idx, thread);
-				if (pid > -1) {
-					err = read_pid_stat_field(fd, system ? 15 : 14,
-								  start_time);
+				start_time = xyarray__entry(evsel->process_time.start_times, idx,
+							    thread);
+				accumulated_time = xyarray__entry(
+					evsel->process_time.accumulated_times, idx, thread);
+				*accumulated_time = 0;
+
+				if (evsel->core.attr.disabled) {
+					evsel->disabled = true;
+					*start_time = INVALID_START_TIME;
 				} else {
-					struct perf_cpu cpu;
-
-					cpu = perf_cpu_map__cpu(evsel->core.cpus, idx);
-					err = read_stat_field(fd, cpu, system ? 3 : 1,
-							      start_time);
+					evsel->disabled = false;
+					err = tool_pmu__read_stat(evsel, idx, thread, start_time);
+					if (err) {
+						close(fd);
+						FD(evsel, idx, thread) = -1;
+						goto out_close;
+					}
 				}
-				if (err)
-					goto out_close;
 			}
-
 		}
 	}
 	return 0;
@@ -467,10 +517,111 @@ static void perf_counts__update(struct perf_counts_values *count,
 		count->lost = 0;
 	}
 }
+int evsel__tool_pmu_enable_cpu(struct evsel *evsel, int cpu_map_idx)
+{
+	enum tool_pmu_event ev = evsel__tool_event(evsel);
+	int thread, nthreads;
+
+	if (!evsel->disabled)
+		return 0;
+
+	if (ev == TOOL_PMU__EVENT_DURATION_TIME) {
+		if (cpu_map_idx == 0)
+			evsel->duration_time.start_time = rdclock();
+		return 0;
+	}
+
+	if (ev == TOOL_PMU__EVENT_USER_TIME || ev == TOOL_PMU__EVENT_SYSTEM_TIME) {
+		nthreads = xyarray__max_y(evsel->process_time.start_times);
+		for (thread = 0; thread < nthreads; thread++) {
+			__u64 *start_time = xyarray__entry(evsel->process_time.start_times,
+							   cpu_map_idx, thread);
+			__u64 val;
+			int err;
+
+			err = tool_pmu__read_stat(evsel, cpu_map_idx, thread, &val);
+			if (!err)
+				*start_time = val;
+			else
+				*start_time = INVALID_START_TIME;
+		}
+	}
+	return 0;
+}
+
+int evsel__tool_pmu_enable(struct evsel *evsel)
+{
+	unsigned int idx;
+	int err = 0;
+
+	if (!evsel->disabled)
+		return 0;
+
+	for (idx = 0; idx < perf_cpu_map__nr(evsel->core.cpus); idx++) {
+		err = evsel__tool_pmu_enable_cpu(evsel, idx);
+		if (err)
+			break;
+	}
+	return err;
+}
+
+int evsel__tool_pmu_disable_cpu(struct evsel *evsel, int cpu_map_idx)
+{
+	enum tool_pmu_event ev = evsel__tool_event(evsel);
+	int thread, nthreads;
+
+	if (evsel->disabled)
+		return 0;
+
+	if (ev == TOOL_PMU__EVENT_DURATION_TIME) {
+		if (cpu_map_idx == 0) {
+			__u64 delta = rdclock() - evsel->duration_time.start_time;
+
+			evsel->duration_time.accumulated_time += delta;
+		}
+		return 0;
+	}
+
+	if (ev == TOOL_PMU__EVENT_USER_TIME || ev == TOOL_PMU__EVENT_SYSTEM_TIME) {
+		nthreads = xyarray__max_y(evsel->process_time.start_times);
+		for (thread = 0; thread < nthreads; thread++) {
+			__u64 *start_time = xyarray__entry(evsel->process_time.start_times,
+							   cpu_map_idx, thread);
+			__u64 *accumulated_time = xyarray__entry(
+				evsel->process_time.accumulated_times, cpu_map_idx, thread);
+			__u64 val;
+			int err;
+
+			err = tool_pmu__read_stat(evsel, cpu_map_idx, thread, &val);
+			if (!err) {
+				if (*start_time != INVALID_START_TIME && val >= *start_time)
+					*accumulated_time += (val - *start_time);
+			}
+			*start_time = INVALID_START_TIME;
+		}
+	}
+	return 0;
+}
+
+int evsel__tool_pmu_disable(struct evsel *evsel)
+{
+	unsigned int idx;
+	int err = 0;
+
+	if (evsel->disabled)
+		return 0;
+
+	for (idx = 0; idx < perf_cpu_map__nr(evsel->core.cpus); idx++) {
+		err = evsel__tool_pmu_disable_cpu(evsel, idx);
+		if (err)
+			break;
+	}
+	return err;
+}
 
 int evsel__tool_pmu_read(struct evsel *evsel, int cpu_map_idx, int thread)
 {
-	__u64 *start_time, cur_time, delta_start;
+	__u64 delta_start = 0;
 	int err = 0;
 	struct perf_counts_values *count, *old_count = NULL;
 	bool adjust = false;
@@ -507,39 +658,33 @@ int evsel__tool_pmu_read(struct evsel *evsel, int cpu_map_idx, int thread)
 		return 0;
 	}
 	case TOOL_PMU__EVENT_DURATION_TIME:
-		/*
-		 * Pretend duration_time is only on the first CPU and thread, or
-		 * else aggregation will scale duration_time by the number of
-		 * CPUs/threads.
-		 */
-		start_time = &evsel->start_time;
-		if (cpu_map_idx == 0 && thread == 0)
-			cur_time = rdclock();
-		else
-			cur_time = *start_time;
+		if (cpu_map_idx == 0 && thread == 0) {
+			delta_start = evsel->duration_time.accumulated_time;
+			if (!evsel->disabled &&
+			    evsel->duration_time.start_time != INVALID_START_TIME)
+				delta_start += (rdclock() - evsel->duration_time.start_time);
+		} else {
+			delta_start = 0;
+		}
 		break;
 	case TOOL_PMU__EVENT_USER_TIME:
 	case TOOL_PMU__EVENT_SYSTEM_TIME: {
-		bool system = evsel__tool_event(evsel) == TOOL_PMU__EVENT_SYSTEM_TIME;
-		int fd = FD(evsel, cpu_map_idx, thread);
-
-		start_time = xyarray__entry(evsel->start_times, cpu_map_idx, thread);
-		lseek(fd, SEEK_SET, 0);
-		if (evsel->pid_stat) {
-			/* The event exists solely on 1 CPU. */
-			if (cpu_map_idx == 0)
-				err = read_pid_stat_field(fd, system ? 15 : 14, &cur_time);
-			else
-				cur_time = 0;
-		} else {
-			/* The event is for all threads. */
-			if (thread == 0) {
-				struct perf_cpu cpu = perf_cpu_map__cpu(evsel->core.cpus,
-									cpu_map_idx);
+		__u64 accumulated = *(__u64 *)xyarray__entry(evsel->process_time.accumulated_times,
+							     cpu_map_idx, thread);
 
-				err = read_stat_field(fd, cpu, system ? 3 : 1, &cur_time);
-			} else {
-				cur_time = 0;
+		if (evsel->disabled) {
+			delta_start = accumulated;
+		} else {
+			__u64 *start_time = xyarray__entry(evsel->process_time.start_times,
+							   cpu_map_idx, thread);
+			__u64 cur_time;
+
+			err = tool_pmu__read_stat(evsel, cpu_map_idx, thread, &cur_time);
+			if (!err) {
+				if (*start_time != INVALID_START_TIME && cur_time >= *start_time)
+					delta_start = accumulated + (cur_time - *start_time);
+				else
+					delta_start = accumulated;
 			}
 		}
 		adjust = true;
@@ -553,7 +698,6 @@ int evsel__tool_pmu_read(struct evsel *evsel, int cpu_map_idx, int thread)
 	if (err)
 		return err;
 
-	delta_start = cur_time - *start_time;
 	if (adjust) {
 		__u64 ticks_per_sec = sysconf(_SC_CLK_TCK);
 
diff --git a/tools/perf/util/tool_pmu.h b/tools/perf/util/tool_pmu.h
index f1714001bc1d..f66d24cf3502 100644
--- a/tools/perf/util/tool_pmu.h
+++ b/tools/perf/util/tool_pmu.h
@@ -56,6 +56,10 @@ int evsel__tool_pmu_prepare_open(struct evsel *evsel,
 int evsel__tool_pmu_open(struct evsel *evsel,
 			 struct perf_thread_map *threads,
 			 int start_cpu_map_idx, int end_cpu_map_idx);
+int evsel__tool_pmu_enable_cpu(struct evsel *evsel, int cpu_map_idx);
+int evsel__tool_pmu_enable(struct evsel *evsel);
+int evsel__tool_pmu_disable_cpu(struct evsel *evsel, int cpu_map_idx);
+int evsel__tool_pmu_disable(struct evsel *evsel);
 int evsel__tool_pmu_read(struct evsel *evsel, int cpu_map_idx, int thread);
 
 struct perf_pmu *tool_pmu__new(void);
-- 
2.54.0.631.ge1b05301d1-goog


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH v4 1/2] perf tool_pmu: Make tool PMU events respect enable/disable
  2026-05-19  1:41       ` [PATCH v4 1/2] perf tool_pmu: Make tool PMU events respect enable/disable Ian Rogers
@ 2026-05-19  6:43         ` Namhyung Kim
  2026-05-19  8:13           ` Ian Rogers
  0 siblings, 1 reply; 20+ messages in thread
From: Namhyung Kim @ 2026-05-19  6:43 UTC (permalink / raw)
  To: Ian Rogers
  Cc: acme, adrian.hunter, james.clark, jolsa, linux-kernel,
	linux-perf-users, mingo, nigro.fra, peterz, tmricht

On Mon, May 18, 2026 at 06:41:05PM -0700, Ian Rogers wrote:
> Tool PMU events (duration_time, user_time, system_time) currently
> count from when the event is opened to when it is read. This causes
> issues with features like the delay option (-D) or control fd, where
> events are opened but should not start counting immediately.
> 
> Make these events behave more like regular counters by implementing
> proper enable and disable support. Add accumulated_time to struct
> evsel to track time while enabled, and implement enable/disable CPU
> callbacks to start/stop counting.
> 
> Also generalize userspace PMU mixed group handling. Userspace synthetic
> PMUs (type > PERF_PMU_TYPE_PE_END) do not have kernel implementations and
> cannot be grouped in the kernel (opened with group_fd = -1), and are
> skipped by kernel enable/disable calls. Iterate over group members in
> userspace and manually enable/disable any members if the leader or the
> member is a non-perf-event open PMU, and synchronize their disabled flags.

Can we divide the commit into smaller pieces?  I think we can have

 * preparation for accumulated_time
 * implement enable/disable for tool PMU
 * wire them to evsel__{enable,disable}[_cpu]
 * support group members properly

What do you think?

Thanks,
Namhyung

> 
> Fixes: b71f46a6a708 ("perf stat: Remove hard coded shadow metrics")
> Reported-by: Francesco Nigro <nigro.fra@gmail.com>
> Closes: https://lore.kernel.org/linux-perf-users/20260517093650.2540920-1-nigro.fra@gmail.com/
> Signed-off-by: Ian Rogers <irogers@google.com>
> Assisted-by: Antigravity:gemini-3-flash
> ---
>  tools/perf/util/evlist.c   |  10 +-
>  tools/perf/util/evsel.c    | 197 ++++++++++++++++++++++-------
>  tools/perf/util/evsel.h    |  15 ++-
>  tools/perf/util/tool_pmu.c | 250 +++++++++++++++++++++++++++++--------
>  tools/perf/util/tool_pmu.h |   4 +
>  5 files changed, 377 insertions(+), 99 deletions(-)

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v4 1/2] perf tool_pmu: Make tool PMU events respect enable/disable
  2026-05-19  6:43         ` Namhyung Kim
@ 2026-05-19  8:13           ` Ian Rogers
  2026-05-20 23:06             ` Namhyung Kim
  0 siblings, 1 reply; 20+ messages in thread
From: Ian Rogers @ 2026-05-19  8:13 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: acme, adrian.hunter, james.clark, jolsa, linux-kernel,
	linux-perf-users, mingo, nigro.fra, peterz, tmricht

On Mon, May 18, 2026 at 11:43 PM Namhyung Kim <namhyung@kernel.org> wrote:
>
> On Mon, May 18, 2026 at 06:41:05PM -0700, Ian Rogers wrote:
> > Tool PMU events (duration_time, user_time, system_time) currently
> > count from when the event is opened to when it is read. This causes
> > issues with features like the delay option (-D) or control fd, where
> > events are opened but should not start counting immediately.
> >
> > Make these events behave more like regular counters by implementing
> > proper enable and disable support. Add accumulated_time to struct
> > evsel to track time while enabled, and implement enable/disable CPU
> > callbacks to start/stop counting.
> >
> > Also generalize userspace PMU mixed group handling. Userspace synthetic
> > PMUs (type > PERF_PMU_TYPE_PE_END) do not have kernel implementations and
> > cannot be grouped in the kernel (opened with group_fd = -1), and are
> > skipped by kernel enable/disable calls. Iterate over group members in
> > userspace and manually enable/disable any members if the leader or the
> > member is a non-perf-event open PMU, and synchronize their disabled flags.
>
> Can we divide the commit into smaller pieces?  I think we can have
>
>  * preparation for accumulated_time
>  * implement enable/disable for tool PMU
>  * wire them to evsel__{enable,disable}[_cpu]
>  * support group members properly
>
> What do you think?

That sounds needlessly painful, involving many irrelevant intermediate
states with dead functions and variables. Most of the patch is
confined to tool_pmu, we could make the evsel patch smaller by
removing comments. The changes in evsel are pretty minimal and the
patch is largely confined to tool_pmu.

Thanks,
Ian

> Thanks,
> Namhyung
>
> >
> > Fixes: b71f46a6a708 ("perf stat: Remove hard coded shadow metrics")
> > Reported-by: Francesco Nigro <nigro.fra@gmail.com>
> > Closes: https://lore.kernel.org/linux-perf-users/20260517093650.2540920-1-nigro.fra@gmail.com/
> > Signed-off-by: Ian Rogers <irogers@google.com>
> > Assisted-by: Antigravity:gemini-3-flash
> > ---
> >  tools/perf/util/evlist.c   |  10 +-
> >  tools/perf/util/evsel.c    | 197 ++++++++++++++++++++++-------
> >  tools/perf/util/evsel.h    |  15 ++-
> >  tools/perf/util/tool_pmu.c | 250 +++++++++++++++++++++++++++++--------
> >  tools/perf/util/tool_pmu.h |   4 +
> >  5 files changed, 377 insertions(+), 99 deletions(-)

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v4 1/2] perf tool_pmu: Make tool PMU events respect enable/disable
  2026-05-19  8:13           ` Ian Rogers
@ 2026-05-20 23:06             ` Namhyung Kim
  0 siblings, 0 replies; 20+ messages in thread
From: Namhyung Kim @ 2026-05-20 23:06 UTC (permalink / raw)
  To: Ian Rogers
  Cc: acme, adrian.hunter, james.clark, jolsa, linux-kernel,
	linux-perf-users, mingo, nigro.fra, peterz, tmricht

On Tue, May 19, 2026 at 01:13:35AM -0700, Ian Rogers wrote:
> On Mon, May 18, 2026 at 11:43 PM Namhyung Kim <namhyung@kernel.org> wrote:
> >
> > On Mon, May 18, 2026 at 06:41:05PM -0700, Ian Rogers wrote:
> > > Tool PMU events (duration_time, user_time, system_time) currently
> > > count from when the event is opened to when it is read. This causes
> > > issues with features like the delay option (-D) or control fd, where
> > > events are opened but should not start counting immediately.
> > >
> > > Make these events behave more like regular counters by implementing
> > > proper enable and disable support. Add accumulated_time to struct
> > > evsel to track time while enabled, and implement enable/disable CPU
> > > callbacks to start/stop counting.
> > >
> > > Also generalize userspace PMU mixed group handling. Userspace synthetic
> > > PMUs (type > PERF_PMU_TYPE_PE_END) do not have kernel implementations and
> > > cannot be grouped in the kernel (opened with group_fd = -1), and are
> > > skipped by kernel enable/disable calls. Iterate over group members in
> > > userspace and manually enable/disable any members if the leader or the
> > > member is a non-perf-event open PMU, and synchronize their disabled flags.
> >
> > Can we divide the commit into smaller pieces?  I think we can have
> >
> >  * preparation for accumulated_time
> >  * implement enable/disable for tool PMU
> >  * wire them to evsel__{enable,disable}[_cpu]
> >  * support group members properly
> >
> > What do you think?
> 
> That sounds needlessly painful, involving many irrelevant intermediate
> states with dead functions and variables. Most of the patch is
> confined to tool_pmu, we could make the evsel patch smaller by
> removing comments. The changes in evsel are pretty minimal and the
> patch is largely confined to tool_pmu.

I didn't think it'd add many temporary code especially for group
handling.  But I'm fine with the single patch if breaking it up would
cause a lot of pains to you.

Thanks,
Namhyung


^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH v4 2/2] perf tests: Add test for stat delay option with duration_time
  2026-05-19  1:41     ` [PATCH v4 0/2] perf tool_pmu: Support enable/disable for tool PMU events Ian Rogers
  2026-05-19  1:41       ` [PATCH v4 1/2] perf tool_pmu: Make tool PMU events respect enable/disable Ian Rogers
@ 2026-05-19  1:41       ` Ian Rogers
  1 sibling, 0 replies; 20+ messages in thread
From: Ian Rogers @ 2026-05-19  1:41 UTC (permalink / raw)
  To: irogers, acme, namhyung
  Cc: adrian.hunter, james.clark, jolsa, linux-kernel, linux-perf-users,
	mingo, nigro.fra, peterz, tmricht

Add a new test case `test_stat_delay` to `stat.sh` to verify that
`duration_time` correctly excludes the delay period when using the
delay option (-D).

The test runs `perf stat -D 1000 -e duration_time sleep 2` and
verifies that `duration_time` is ~1s (excluding the 1s delay), not
~2s.

Signed-off-by: Ian Rogers <irogers@google.com>
Assisted-by: Antigravity:gemini-3-flash
---
 tools/perf/tests/shell/stat.sh | 53 ++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/tools/perf/tests/shell/stat.sh b/tools/perf/tests/shell/stat.sh
index 4edb04039036..1e17bee026bd 100755
--- a/tools/perf/tests/shell/stat.sh
+++ b/tools/perf/tests/shell/stat.sh
@@ -483,6 +483,58 @@ test_stat_pid() {
   wait $pid 2>/dev/null || true
 }
 
+test_stat_delay() {
+  echo "stat -D test"
+  if ! env LC_ALL=C perf stat -D 1000 -e duration_time sleep 2 > "${stat_output}" 2>&1
+  then
+    echo "stat -D test [Failed - command failed]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+
+  duration=$(grep "duration_time" "${stat_output}" | awk '{print $1}' | tr -d ',')
+  elapsed=$(grep "seconds time elapsed" "${stat_output}" | awk '{print $1}')
+
+  if [ -z "$duration" ] || [ -z "$elapsed" ]
+  then
+    echo "stat -D test [Failed - failed to find duration_time or time elapsed in output]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+
+  # Compare duration (ns) and elapsed (s) using awk to handle float and allow tolerance.
+  if ! awk -v d="$duration" -v e="$elapsed" '
+    BEGIN {
+      diff = d - (e * 1e9);
+      if (diff < 0) diff = -diff;
+      # Allow 200ms tolerance (200,000,000 ns) for loaded CI machines.
+      if (diff > 200000000) {
+        printf "Fail: duration (%d ns) and elapsed (%f s) mismatch (diff %d ns)\n", d, e, diff;
+        exit 1;
+      }
+      # Lower bound check: must be at least 0.5s.
+      if (d < 500000000) {
+        printf "Fail: duration (%d ns) is abnormally small\n", d;
+        exit 1;
+      }
+      # Upper bound check: must be strictly less than 1.7s (proving delay was excluded).
+      if (d > 1700000000) {
+        printf "Fail: duration (%d ns) is too large (delay might not be excluded)\n", d;
+        exit 1;
+      }
+      exit 0;
+    }'
+  then
+    echo "stat -D test [Failed - validation failed]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+  echo "stat -D test [Success]"
+}
+
 test_default_stat
 test_null_stat
 test_offline_cpu_stat
@@ -498,6 +550,7 @@ test_stat_no_aggr
 test_stat_detailed
 test_stat_repeat
 test_stat_pid
+test_stat_delay
 
 cleanup
 exit $err
-- 
2.54.0.631.ge1b05301d1-goog


^ permalink raw reply related	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2026-05-20 23:06 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-05-18 18:39 [PATCH v1 0/2] perf tool_pmu: Support enable/disable for tool PMU events Ian Rogers
2026-05-18 18:39 ` [PATCH v1 1/2] perf tool_pmu: Make tool PMU events respect enable/disable Ian Rogers
2026-05-18 19:26   ` sashiko-bot
2026-05-18 18:39 ` [PATCH v1 2/2] perf tests: Add test for stat delay option with duration_time Ian Rogers
2026-05-18 19:33   ` sashiko-bot
2026-05-18 20:14 ` [PATCH v2 0/2] perf tool_pmu: Support enable/disable for tool PMU events Ian Rogers
2026-05-18 20:14   ` [PATCH v2 1/2] perf tool_pmu: Make tool PMU events respect enable/disable Ian Rogers
2026-05-18 20:49     ` sashiko-bot
2026-05-18 20:14   ` [PATCH v2 2/2] perf tests: Add test for stat delay option with duration_time Ian Rogers
2026-05-18 20:55     ` sashiko-bot
2026-05-18 22:37   ` [PATCH v3 0/2] perf tool_pmu: Support enable/disable for tool PMU events Ian Rogers
2026-05-18 22:37     ` [PATCH v3 1/2] perf tool_pmu: Make tool PMU events respect enable/disable Ian Rogers
2026-05-18 23:02       ` sashiko-bot
2026-05-18 22:37     ` [PATCH v3 2/2] perf tests: Add test for stat delay option with duration_time Ian Rogers
2026-05-19  1:41     ` [PATCH v4 0/2] perf tool_pmu: Support enable/disable for tool PMU events Ian Rogers
2026-05-19  1:41       ` [PATCH v4 1/2] perf tool_pmu: Make tool PMU events respect enable/disable Ian Rogers
2026-05-19  6:43         ` Namhyung Kim
2026-05-19  8:13           ` Ian Rogers
2026-05-20 23:06             ` Namhyung Kim
2026-05-19  1:41       ` [PATCH v4 2/2] perf tests: Add test for stat delay option with duration_time Ian Rogers

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox