* [PATCH v2 1/3] perf tool_pmu: Use old_count when computing count values for time events
From: Ian Rogers @ 2025-11-04 23:41 UTC
To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
Namhyung Kim, Alexander Shishkin, Jiri Olsa, Ian Rogers,
Adrian Hunter, Dr. David Alan Gilbert, Yang Li, James Clark,
Thomas Falcon, Thomas Richter, linux-perf-users, linux-kernel
When running in interval mode, every third count of a time event isn't
displayed properly:
```
$ perf stat -e duration_time -a -I 1000
1.001082862 1,002,290,425 duration_time
2.004264262 1,003,183,516 duration_time
3.007381401 <not counted> duration_time
4.011160141 1,003,705,631 duration_time
5.014515385 1,003,290,110 duration_time
6.018539680 <not counted> duration_time
7.022065321 1,003,591,720 duration_time
```
The regression was introduced by an earlier fix, found through bisection:
commit 68cb1567439f ("perf tool_pmu: Fix aggregation on
duration_time"). The issue is that the event's enabled and running
times match old_count's, producing a delta of 0, which is treated as
indicative of an error.
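For illustration only (this is not the perf source; the names are made up),
a minimal sketch of why unchanged enabled/running times read back as an
error in interval mode:
```
/*
 * Illustration only, not the perf code: with interval deltas, if the new
 * enabled/running times equal the previous interval's, both deltas are 0
 * and the sample is flagged as erroneous (shown as <not counted>).
 */
struct sample_counts {
	unsigned long long val, ena, run;
};

static int counts_look_erroneous(const struct sample_counts *cur,
				 const struct sample_counts *prev)
{
	unsigned long long ena_delta = cur->ena - prev->ena;
	unsigned long long run_delta = cur->run - prev->run;

	/* A zero enabled or running delta cannot be told apart from a failed read. */
	return ena_delta == 0 || run_delta == 0;
}
```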
Fixes: 68cb1567439f ("perf tool_pmu: Fix aggregation on duration_time")
Signed-off-by: Ian Rogers <irogers@google.com>
---
tools/perf/builtin-stat.c | 16 +++++++++--
tools/perf/util/tool_pmu.c | 59 +++++++++++++++++++++-----------------
2 files changed, 46 insertions(+), 29 deletions(-)
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 886727ae8529..d89fa9468f89 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -282,17 +282,27 @@ static int read_single_counter(struct evsel *counter, int cpu_map_idx, int threa
if (err && cpu_map_idx == 0 &&
(evsel__tool_event(counter) == TOOL_PMU__EVENT_USER_TIME ||
evsel__tool_event(counter) == TOOL_PMU__EVENT_SYSTEM_TIME)) {
- u64 val, *start_time;
struct perf_counts_values *count =
perf_counts(counter->counts, cpu_map_idx, thread);
+ struct perf_counts_values *old_count = NULL;
+ u64 val;
+
+ if (counter->prev_raw_counts)
+ old_count = perf_counts(counter->prev_raw_counts, cpu_map_idx, thread);
- start_time = xyarray__entry(counter->start_times, cpu_map_idx, thread);
if (evsel__tool_event(counter) == TOOL_PMU__EVENT_USER_TIME)
val = ru_stats.ru_utime_usec_stat.mean;
else
val = ru_stats.ru_stime_usec_stat.mean;
- count->ena = count->run = *start_time + val;
+
count->val = val;
+ if (old_count) {
+ count->run = old_count->run + 1;
+ count->ena = old_count->ena + 1;
+ } else {
+ count->run++;
+ count->ena++;
+ }
return 0;
}
return err;
diff --git a/tools/perf/util/tool_pmu.c b/tools/perf/util/tool_pmu.c
index f075098488ba..b895e88ff740 100644
--- a/tools/perf/util/tool_pmu.c
+++ b/tools/perf/util/tool_pmu.c
@@ -431,16 +431,39 @@ bool tool_pmu__read_event(enum tool_pmu_event ev, struct evsel *evsel, u64 *resu
}
}
+static void perf_counts__update(struct perf_counts_values *count,
+ const struct perf_counts_values *old_count,
+ bool raw, u64 val)
+{
+ /*
+ * The values of enabled and running must make a ratio of 100%. The
+ * exact values don't matter as long as they are non-zero to avoid
+ * issues with evsel__count_has_error.
+ */
+ if (old_count) {
+ count->val = raw ? val : old_count->val + val;
+ count->run = old_count->run + 1;
+ count->ena = old_count->ena + 1;
+ count->lost = old_count->lost;
+ } else {
+ count->val = val;
+ count->run++;
+ count->ena++;
+ count->lost = 0;
+ }
+}
+
int evsel__tool_pmu_read(struct evsel *evsel, int cpu_map_idx, int thread)
{
__u64 *start_time, cur_time, delta_start;
- u64 val;
- int fd, err = 0;
+ int err = 0;
struct perf_counts_values *count, *old_count = NULL;
bool adjust = false;
enum tool_pmu_event ev = evsel__tool_event(evsel);
count = perf_counts(evsel->counts, cpu_map_idx, thread);
+ if (evsel->prev_raw_counts)
+ old_count = perf_counts(evsel->prev_raw_counts, cpu_map_idx, thread);
switch (ev) {
case TOOL_PMU__EVENT_HAS_PMEM:
@@ -451,26 +474,18 @@ int evsel__tool_pmu_read(struct evsel *evsel, int cpu_map_idx, int thread)
case TOOL_PMU__EVENT_NUM_PACKAGES:
case TOOL_PMU__EVENT_SLOTS:
case TOOL_PMU__EVENT_SMT_ON:
- case TOOL_PMU__EVENT_SYSTEM_TSC_FREQ:
- if (evsel->prev_raw_counts)
- old_count = perf_counts(evsel->prev_raw_counts, cpu_map_idx, thread);
- val = 0;
+ case TOOL_PMU__EVENT_SYSTEM_TSC_FREQ: {
+ u64 val = 0;
+
if (cpu_map_idx == 0 && thread == 0) {
if (!tool_pmu__read_event(ev, evsel, &val)) {
count->lost++;
val = 0;
}
}
- if (old_count) {
- count->val = old_count->val + val;
- count->run = old_count->run + 1;
- count->ena = old_count->ena + 1;
- } else {
- count->val = val;
- count->run++;
- count->ena++;
- }
+ perf_counts__update(count, old_count, /*raw=*/false, val);
return 0;
+ }
case TOOL_PMU__EVENT_DURATION_TIME:
/*
* Pretend duration_time is only on the first CPU and thread, or
@@ -486,9 +501,9 @@ int evsel__tool_pmu_read(struct evsel *evsel, int cpu_map_idx, int thread)
case TOOL_PMU__EVENT_USER_TIME:
case TOOL_PMU__EVENT_SYSTEM_TIME: {
bool system = evsel__tool_event(evsel) == TOOL_PMU__EVENT_SYSTEM_TIME;
+ int fd = FD(evsel, cpu_map_idx, thread);
start_time = xyarray__entry(evsel->start_times, cpu_map_idx, thread);
- fd = FD(evsel, cpu_map_idx, thread);
lseek(fd, SEEK_SET, 0);
if (evsel->pid_stat) {
/* The event exists solely on 1 CPU. */
@@ -522,17 +537,9 @@ int evsel__tool_pmu_read(struct evsel *evsel, int cpu_map_idx, int thread)
if (adjust) {
__u64 ticks_per_sec = sysconf(_SC_CLK_TCK);
- delta_start *= 1000000000 / ticks_per_sec;
+ delta_start *= 1e9 / ticks_per_sec;
}
- count->val = delta_start;
- count->lost = 0;
- /*
- * The values of enabled and running must make a ratio of 100%. The
- * exact values don't matter as long as they are non-zero to avoid
- * issues with evsel__count_has_error.
- */
- count->ena++;
- count->run++;
+ perf_counts__update(count, old_count, /*raw=*/true, delta_start);
return 0;
}
--
2.51.2.1006.ga50a493c49-goog
* [PATCH v2 2/3] perf stat-shadow: Read tool events directly
From: Ian Rogers @ 2025-11-04 23:41 UTC
To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
Namhyung Kim, Alexander Shishkin, Jiri Olsa, Ian Rogers,
Adrian Hunter, Dr. David Alan Gilbert, Yang Li, James Clark,
Thomas Falcon, Thomas Richter, linux-perf-users, linux-kernel
When reading time values for metrics, don't use the globals updated in
builtin-stat; just read the events as regular events. The only
exception is time events, whose nanosecond counts need converting to
seconds, as metrics assume times are in seconds.
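As a rough illustration (the helper name is hypothetical, not part of the
perf API), the only adjustment still applied to tool time events, which are
counted in nanoseconds while metric expressions expect seconds:
```
/*
 * Hypothetical helper, for illustration only: tool time events count in
 * nanoseconds, while metric expressions expect seconds.
 */
static double tool_time_ns_to_seconds(unsigned long long count_ns)
{
	return (double)count_ns * 1e-9;
}
```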
Signed-off-by: Ian Rogers <irogers@google.com>
---
v2. Make sure that tool time events read from the aggregation index for CPU0.
---
tools/perf/util/stat-shadow.c | 147 +++++++++++++++-------------------
1 file changed, 66 insertions(+), 81 deletions(-)
diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c
index abaf6b579bfc..939ac3269a44 100644
--- a/tools/perf/util/stat-shadow.c
+++ b/tools/perf/util/stat-shadow.c
@@ -371,7 +371,32 @@ static void print_nsecs(struct perf_stat_config *config,
}
}
-static int prepare_metric(const struct metric_expr *mexp,
+static bool tool_pmu__is_time_event(const struct perf_stat_config *config,
+ const struct evsel *evsel, int *tool_aggr_idx)
+{
+ enum tool_pmu_event event = evsel__tool_event(evsel);
+ int aggr_idx;
+
+ if (event != TOOL_PMU__EVENT_DURATION_TIME &&
+ event != TOOL_PMU__EVENT_USER_TIME &&
+ event != TOOL_PMU__EVENT_SYSTEM_TIME)
+ return false;
+
+ if (config) {
+ cpu_aggr_map__for_each_idx(aggr_idx, config->aggr_map) {
+ if (config->aggr_map->map[aggr_idx].cpu.cpu == 0) {
+ *tool_aggr_idx = aggr_idx;
+ return true;
+ }
+ }
+ pr_debug("Unexpected CPU0 missing in aggregation for tool event.\n");
+ }
+ *tool_aggr_idx = 0; /* Assume the first aggregation index works. */
+ return true;
+}
+
+static int prepare_metric(const struct perf_stat_config *config,
+ const struct metric_expr *mexp,
const struct evsel *evsel,
struct expr_parse_ctx *pctx,
int aggr_idx)
@@ -381,91 +406,51 @@ static int prepare_metric(const struct metric_expr *mexp,
int i;
for (i = 0; metric_events[i]; i++) {
+ int source_count = 0, tool_aggr_idx;
+ bool is_tool_time =
+ tool_pmu__is_time_event(config, metric_events[i], &tool_aggr_idx);
+ struct perf_stat_evsel *ps = metric_events[i]->stats;
+ struct perf_stat_aggr *aggr;
char *n;
double val;
- int source_count = 0;
- if (evsel__is_tool(metric_events[i])) {
- struct stats *stats;
- double scale;
-
- switch (evsel__tool_event(metric_events[i])) {
- case TOOL_PMU__EVENT_DURATION_TIME:
- stats = &walltime_nsecs_stats;
- scale = 1e-9;
- break;
- case TOOL_PMU__EVENT_USER_TIME:
- stats = &ru_stats.ru_utime_usec_stat;
- scale = 1e-6;
- break;
- case TOOL_PMU__EVENT_SYSTEM_TIME:
- stats = &ru_stats.ru_stime_usec_stat;
- scale = 1e-6;
+ /*
+ * If there are multiple uncore PMUs and we're not reading the
+ * leader's stats, determine the stats for the appropriate
+ * uncore PMU.
+ */
+ if (evsel && evsel->metric_leader &&
+ evsel->pmu != evsel->metric_leader->pmu &&
+ mexp->metric_events[i]->pmu == evsel->metric_leader->pmu) {
+ struct evsel *pos;
+
+ evlist__for_each_entry(evsel->evlist, pos) {
+ if (pos->pmu != evsel->pmu)
+ continue;
+ if (pos->metric_leader != mexp->metric_events[i])
+ continue;
+ ps = pos->stats;
+ source_count = 1;
break;
- case TOOL_PMU__EVENT_NONE:
- pr_err("Invalid tool event 'none'");
- abort();
- case TOOL_PMU__EVENT_MAX:
- pr_err("Invalid tool event 'max'");
- abort();
- case TOOL_PMU__EVENT_HAS_PMEM:
- case TOOL_PMU__EVENT_NUM_CORES:
- case TOOL_PMU__EVENT_NUM_CPUS:
- case TOOL_PMU__EVENT_NUM_CPUS_ONLINE:
- case TOOL_PMU__EVENT_NUM_DIES:
- case TOOL_PMU__EVENT_NUM_PACKAGES:
- case TOOL_PMU__EVENT_SLOTS:
- case TOOL_PMU__EVENT_SMT_ON:
- case TOOL_PMU__EVENT_SYSTEM_TSC_FREQ:
- default:
- pr_err("Unexpected tool event '%s'", evsel__name(metric_events[i]));
- abort();
}
- val = avg_stats(stats) * scale;
- source_count = 1;
- } else {
- struct perf_stat_evsel *ps = metric_events[i]->stats;
- struct perf_stat_aggr *aggr;
-
+ }
+ /* Time events are always on CPU0, the first aggregation index. */
+ aggr = &ps->aggr[is_tool_time ? tool_aggr_idx : aggr_idx];
+ if (!aggr || !metric_events[i]->supported) {
/*
- * If there are multiple uncore PMUs and we're not
- * reading the leader's stats, determine the stats for
- * the appropriate uncore PMU.
+ * Not supported events will have a count of 0, which
+ * can be confusing in a metric. Explicitly set the
+ * value to NAN. Not counted events (enable time of 0)
+ * are read as 0.
*/
- if (evsel && evsel->metric_leader &&
- evsel->pmu != evsel->metric_leader->pmu &&
- mexp->metric_events[i]->pmu == evsel->metric_leader->pmu) {
- struct evsel *pos;
-
- evlist__for_each_entry(evsel->evlist, pos) {
- if (pos->pmu != evsel->pmu)
- continue;
- if (pos->metric_leader != mexp->metric_events[i])
- continue;
- ps = pos->stats;
- source_count = 1;
- break;
- }
- }
- aggr = &ps->aggr[aggr_idx];
- if (!aggr)
- break;
-
- if (!metric_events[i]->supported) {
- /*
- * Not supported events will have a count of 0,
- * which can be confusing in a
- * metric. Explicitly set the value to NAN. Not
- * counted events (enable time of 0) are read as
- * 0.
- */
- val = NAN;
- source_count = 0;
- } else {
- val = aggr->counts.val;
- if (!source_count)
- source_count = evsel__source_count(metric_events[i]);
- }
+ val = NAN;
+ source_count = 0;
+ } else {
+ val = aggr->counts.val;
+ if (is_tool_time)
+ val *= 1e-9; /* Convert time event nanoseconds to seconds. */
+ if (!source_count)
+ source_count = evsel__source_count(metric_events[i]);
}
n = strdup(evsel__metric_id(metric_events[i]));
if (!n)
@@ -511,7 +496,7 @@ static void generic_metric(struct perf_stat_config *config,
pctx->sctx.user_requested_cpu_list = strdup(config->user_requested_cpu_list);
pctx->sctx.runtime = runtime;
pctx->sctx.system_wide = config->system_wide;
- i = prepare_metric(mexp, evsel, pctx, aggr_idx);
+ i = prepare_metric(config, mexp, evsel, pctx, aggr_idx);
if (i < 0) {
expr__ctx_free(pctx);
return;
@@ -572,7 +557,7 @@ double test_generic_metric(struct metric_expr *mexp, int aggr_idx)
if (!pctx)
return NAN;
- if (prepare_metric(mexp, /*evsel=*/NULL, pctx, aggr_idx) < 0)
+ if (prepare_metric(/*config=*/NULL, mexp, /*evsel=*/NULL, pctx, aggr_idx) < 0)
goto out;
if (expr__parse(&ratio, pctx, mexp->metric_expr))
--
2.51.2.1006.ga50a493c49-goog
* [PATCH v2 3/3] perf stat: Reduce scope of ru_stats
From: Ian Rogers @ 2025-11-04 23:41 UTC
To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
Namhyung Kim, Alexander Shishkin, Jiri Olsa, Ian Rogers,
Adrian Hunter, Dr. David Alan Gilbert, Yang Li, James Clark,
Thomas Falcon, Thomas Richter, linux-perf-users, linux-kernel
The ru_stats are used to capture user and system time stats when a
process exits. These are then applied to the user and system time tool
events if their reads fail because the process has terminated. Reduce
their scope now that the metric code no longer reads these values.
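For reference, a standalone sketch (illustrative helper name, standard
struct rusage/timeval types) of the unit conversion the now-file-local
update_rusage_stats() performs:
```
#include <sys/time.h>
#include <sys/resource.h>

/* Standalone illustration of the microsecond/second-to-nanosecond
 * conversion done when recording getrusage() times into ru_stats. */
static unsigned long long timeval_to_ns(const struct timeval *tv)
{
	const unsigned long long us_to_ns = 1000ULL;
	const unsigned long long s_to_ns = 1000000000ULL;

	return (unsigned long long)tv->tv_usec * us_to_ns +
	       (unsigned long long)tv->tv_sec * s_to_ns;
}
```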
Signed-off-by: Ian Rogers <irogers@google.com>
---
tools/perf/builtin-stat.c | 14 +++++++++++++-
tools/perf/util/config.c | 1 -
tools/perf/util/stat-shadow.c | 2 --
tools/perf/util/stat.h | 16 ----------------
4 files changed, 13 insertions(+), 20 deletions(-)
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index d89fa9468f89..3cd663b3b357 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -132,6 +132,7 @@ static bool interval_count;
static const char *output_name;
static int output_fd;
static char *metrics;
+static struct rusage_stats ru_stats;
struct perf_stat {
bool record;
@@ -729,6 +730,17 @@ static int create_perf_stat_counter(struct evsel *evsel,
evsel->core.threads);
}
+static void update_rusage_stats(const struct rusage *rusage)
+{
+ const u64 us_to_ns = 1000;
+ const u64 s_to_ns = 1000000000;
+
+ update_stats(&ru_stats.ru_utime_usec_stat,
+ (rusage->ru_utime.tv_usec * us_to_ns + rusage->ru_utime.tv_sec * s_to_ns));
+ update_stats(&ru_stats.ru_stime_usec_stat,
+ (rusage->ru_stime.tv_usec * us_to_ns + rusage->ru_stime.tv_sec * s_to_ns));
+}
+
static int __run_perf_stat(int argc, const char **argv, int run_idx)
{
int interval = stat_config.interval;
@@ -978,7 +990,7 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
evlist__reset_aggr_stats(evsel_list);
} else {
update_stats(&walltime_nsecs_stats, t1 - t0);
- update_rusage_stats(&ru_stats, &stat_config.ru_data);
+ update_rusage_stats(&stat_config.ru_data);
}
/*
diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c
index 6f914620c6ff..cc0746f494f4 100644
--- a/tools/perf/util/config.c
+++ b/tools/perf/util/config.c
@@ -45,7 +45,6 @@ struct perf_stat_config stat_config = {
.run_count = 1,
.metric_only_len = METRIC_ONLY_LEN,
.walltime_nsecs_stats = &walltime_nsecs_stats,
- .ru_stats = &ru_stats,
.big_num = true,
.ctl_fd = -1,
.ctl_fd_ack = -1,
diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c
index 939ac3269a44..c59578886b4f 100644
--- a/tools/perf/util/stat-shadow.c
+++ b/tools/perf/util/stat-shadow.c
@@ -18,7 +18,6 @@
#include "tool_pmu.h"
struct stats walltime_nsecs_stats;
-struct rusage_stats ru_stats;
enum {
CTX_BIT_USER = 1 << 0,
@@ -74,7 +73,6 @@ static int evsel_context(const struct evsel *evsel)
void perf_stat__reset_shadow_stats(void)
{
memset(&walltime_nsecs_stats, 0, sizeof(walltime_nsecs_stats));
- memset(&ru_stats, 0, sizeof(ru_stats));
}
static enum stat_type evsel__stat_type(struct evsel *evsel)
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index 34f30a295f89..5b8b4675883c 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -102,7 +102,6 @@ struct perf_stat_config {
const char *csv_sep;
struct stats *walltime_nsecs_stats;
struct rusage ru_data;
- struct rusage_stats *ru_stats;
struct cpu_aggr_map *aggr_map;
aggr_get_id_t aggr_get_id;
struct cpu_aggr_map *cpus_aggr_map;
@@ -132,25 +131,10 @@ static inline void init_stats(struct stats *stats)
stats->max = 0;
}
-static inline void init_rusage_stats(struct rusage_stats *ru_stats) {
- init_stats(&ru_stats->ru_utime_usec_stat);
- init_stats(&ru_stats->ru_stime_usec_stat);
-}
-
-static inline void update_rusage_stats(struct rusage_stats *ru_stats, struct rusage* rusage) {
- const u64 us_to_ns = 1000;
- const u64 s_to_ns = 1000000000;
- update_stats(&ru_stats->ru_utime_usec_stat,
- (rusage->ru_utime.tv_usec * us_to_ns + rusage->ru_utime.tv_sec * s_to_ns));
- update_stats(&ru_stats->ru_stime_usec_stat,
- (rusage->ru_stime.tv_usec * us_to_ns + rusage->ru_stime.tv_sec * s_to_ns));
-}
-
struct evsel;
struct evlist;
extern struct stats walltime_nsecs_stats;
-extern struct rusage_stats ru_stats;
enum metric_threshold_classify {
METRIC_THRESHOLD_UNKNOWN,
--
2.51.2.1006.ga50a493c49-goog