* [PATCH v3 1/2] perf stat: Add aggr_nr metric parser support
2026-05-21 20:15 [PATCH v3 0/2] perf stat: Fix uncore metric scaling across aggregation modes Chun-Tse Shao
@ 2026-05-21 20:15 ` Chun-Tse Shao
2026-05-21 20:15 ` [PATCH v3 2/2] perf stat: Use aggr_nr scaling for Intel uncore miss latency metrics Chun-Tse Shao
` (2 subsequent siblings)
3 siblings, 0 replies; 7+ messages in thread
From: Chun-Tse Shao @ 2026-05-21 20:15 UTC (permalink / raw)
To: peterz, mingo, acme, namhyung
Cc: mark.rutland, alexander.shishkin, jolsa, irogers, adrian.hunter,
james.clark, sandipan.das, leo.yan, thomas.falcon, yang.lee,
linux-perf-users, linux-kernel, Chun-Tse Shao
Introduce the `aggr_nr` function to the metric expression parser.
`aggr_nr` allows metric formulas to dynamically utilize the number of
aggregated targets (`aggr->nr`) instead of relying on the static
`source_count` (which represents the static socket or node count).
This adds the `AGGR_NR` token to the lexer and parser, updates the
expression parsing context helpers to store `aggr_nr`, and feeds
`aggr->nr` from the aggregation structure in `prepare_metric`.
Signed-off-by: Chun-Tse Shao <ctshao@google.com>
Assisted-by: Gemini:gemini-3.1-pro-preview
---
tools/perf/util/expr.c | 26 ++++++++++++++++++++++----
tools/perf/util/expr.h | 6 +++++-
tools/perf/util/expr.l | 1 +
tools/perf/util/expr.y | 24 +++++++++++++++++-------
tools/perf/util/stat-shadow.c | 6 +++++-
5 files changed, 50 insertions(+), 13 deletions(-)
diff --git a/tools/perf/util/expr.c b/tools/perf/util/expr.c
index 644769e92708..232998fef72b 100644
--- a/tools/perf/util/expr.c
+++ b/tools/perf/util/expr.c
@@ -27,6 +27,7 @@ struct expr_id_data {
struct {
double val;
int source_count;
+ int aggr_nr;
} val;
struct {
double val;
@@ -151,8 +152,8 @@ int expr__add_id_val(struct expr_parse_ctx *ctx, const char *id, double val)
}
/* Caller must make sure id is allocated */
-int expr__add_id_val_source_count(struct expr_parse_ctx *ctx, const char *id,
- double val, int source_count)
+int expr__add_id_val_source_count_aggr_nr(struct expr_parse_ctx *ctx, const char *id,
+ double val, int source_count, int aggr_nr)
{
struct expr_id_data *data_ptr = NULL, *old_data = NULL;
char *old_key = NULL;
@@ -163,6 +164,7 @@ int expr__add_id_val_source_count(struct expr_parse_ctx *ctx, const char *id,
return -ENOMEM;
data_ptr->val.val = val;
data_ptr->val.source_count = source_count;
+ data_ptr->val.aggr_nr = aggr_nr;
data_ptr->kind = EXPR_ID_DATA__VALUE;
ret = hashmap__set(ctx->ids, id, data_ptr, &old_key, &old_data);
@@ -171,12 +173,20 @@ int expr__add_id_val_source_count(struct expr_parse_ctx *ctx, const char *id,
} else if (old_data) {
data_ptr->val.val += old_data->val.val;
data_ptr->val.source_count += old_data->val.source_count;
+ data_ptr->val.aggr_nr += old_data->val.aggr_nr;
}
free(old_key);
free(old_data);
return ret;
}
+/* Caller must make sure id is allocated */
+int expr__add_id_val_source_count(struct expr_parse_ctx *ctx, const char *id,
+ double val, int source_count)
+{
+ return expr__add_id_val_source_count_aggr_nr(ctx, id, val, source_count, 1);
+}
+
int expr__add_ref(struct expr_parse_ctx *ctx, struct metric_ref *ref)
{
struct expr_id_data *data_ptr = NULL, *old_data = NULL;
@@ -390,8 +400,16 @@ double expr_id_data__value(const struct expr_id_data *data)
double expr_id_data__source_count(const struct expr_id_data *data)
{
- assert(data->kind == EXPR_ID_DATA__VALUE);
- return data->val.source_count;
+ if (data->kind == EXPR_ID_DATA__VALUE)
+ return data->val.source_count;
+ return 1.0;
+}
+
+double expr_id_data__aggr_nr(const struct expr_id_data *data)
+{
+ if (data->kind == EXPR_ID_DATA__VALUE)
+ return data->val.aggr_nr;
+ return 1.0;
}
double expr__get_literal(const char *literal, const struct expr_scanner_ctx *ctx)
diff --git a/tools/perf/util/expr.h b/tools/perf/util/expr.h
index c0cec29ddc29..ed12e4007d2d 100644
--- a/tools/perf/util/expr.h
+++ b/tools/perf/util/expr.h
@@ -36,7 +36,9 @@ void expr__del_id(struct expr_parse_ctx *ctx, const char *id);
int expr__add_id(struct expr_parse_ctx *ctx, const char *id);
int expr__add_id_val(struct expr_parse_ctx *ctx, const char *id, double val);
int expr__add_id_val_source_count(struct expr_parse_ctx *ctx, const char *id,
- double val, int source_count);
+ double val, int source_count);
+int expr__add_id_val_source_count_aggr_nr(struct expr_parse_ctx *ctx, const char *id,
+ double val, int source_count, int aggr_nr);
int expr__add_ref(struct expr_parse_ctx *ctx, struct metric_ref *ref);
int expr__get_id(struct expr_parse_ctx *ctx, const char *id,
struct expr_id_data **data);
@@ -53,6 +55,8 @@ int expr__find_ids(const char *expr, const char *one,
double expr_id_data__value(const struct expr_id_data *data);
double expr_id_data__source_count(const struct expr_id_data *data);
+double expr_id_data__aggr_nr(const struct expr_id_data *data);
+
double expr__get_literal(const char *literal, const struct expr_scanner_ctx *ctx);
double expr__has_event(const struct expr_parse_ctx *ctx, bool compute_ids, const char *id);
double expr__strcmp_cpuid_str(const struct expr_parse_ctx *ctx, bool compute_ids, const char *id);
diff --git a/tools/perf/util/expr.l b/tools/perf/util/expr.l
index a2fc43159ee9..f16ccff278d0 100644
--- a/tools/perf/util/expr.l
+++ b/tools/perf/util/expr.l
@@ -121,6 +121,7 @@ min { return MIN; }
if { return IF; }
else { return ELSE; }
source_count { return SOURCE_COUNT; }
+aggr_nr { return AGGR_NR; }
has_event { return HAS_EVENT; }
strcmp_cpuid_str { return STRCMP_CPUID_STR; }
NaN { return nan_value(yyscanner); }
diff --git a/tools/perf/util/expr.y b/tools/perf/util/expr.y
index e364790babb5..e20f649354cf 100644
--- a/tools/perf/util/expr.y
+++ b/tools/perf/util/expr.y
@@ -41,7 +41,7 @@ int expr_lex(YYSTYPE * yylval_param , void *yyscanner);
} ids;
}
-%token ID NUMBER MIN MAX IF ELSE LITERAL D_RATIO SOURCE_COUNT HAS_EVENT STRCMP_CPUID_STR EXPR_ERROR
+%token ID NUMBER MIN MAX IF ELSE LITERAL D_RATIO SOURCE_COUNT AGGR_NR HAS_EVENT STRCMP_CPUID_STR EXPR_ERROR
%left MIN MAX IF
%left '|'
%left '^'
@@ -87,8 +87,14 @@ static struct ids union_expr(struct ids ids1, struct ids ids2)
return result;
}
+enum expr_id_kind {
+ EXPR_ID_KIND__VALUE,
+ EXPR_ID_KIND__SOURCE_COUNT,
+ EXPR_ID_KIND__AGGR_NR,
+};
+
static struct ids handle_id(struct expr_parse_ctx *ctx, char *id,
- bool compute_ids, bool source_count)
+ bool compute_ids, enum expr_id_kind kind)
{
struct ids result;
@@ -101,9 +107,12 @@ static struct ids handle_id(struct expr_parse_ctx *ctx, char *id,
result.val = NAN;
if (expr__resolve_id(ctx, id, &data) == 0) {
- result.val = source_count
- ? expr_id_data__source_count(data)
- : expr_id_data__value(data);
+ if (kind == EXPR_ID_KIND__SOURCE_COUNT)
+ result.val = expr_id_data__source_count(data);
+ else if (kind == EXPR_ID_KIND__AGGR_NR)
+ result.val = expr_id_data__aggr_nr(data);
+ else
+ result.val = expr_id_data__value(data);
}
result.ids = NULL;
free(id);
@@ -201,8 +210,9 @@ expr: NUMBER
$$.val = $1;
$$.ids = NULL;
}
-| ID { $$ = handle_id(ctx, $1, compute_ids, /*source_count=*/false); }
-| SOURCE_COUNT '(' ID ')' { $$ = handle_id(ctx, $3, compute_ids, /*source_count=*/true); }
+| ID { $$ = handle_id(ctx, $1, compute_ids, EXPR_ID_KIND__VALUE); }
+| SOURCE_COUNT '(' ID ')' { $$ = handle_id(ctx, $3, compute_ids, EXPR_ID_KIND__SOURCE_COUNT); }
+| AGGR_NR '(' ID ')' { $$ = handle_id(ctx, $3, compute_ids, EXPR_ID_KIND__AGGR_NR); }
| HAS_EVENT '(' ID ')'
{
$$.val = expr__has_event(ctx, compute_ids, $3);
diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c
index bc2d44df7baf..c17373bb0e1e 100644
--- a/tools/perf/util/stat-shadow.c
+++ b/tools/perf/util/stat-shadow.c
@@ -53,6 +53,7 @@ static int prepare_metric(struct perf_stat_config *config,
for (i = 0; metric_events[i]; i++) {
int source_count = 0, tool_aggr_idx;
+ int aggr_nr = 1;
bool is_tool_time =
tool_pmu__is_time_event(config, metric_events[i], &tool_aggr_idx);
struct perf_stat_evsel *ps = metric_events[i]->stats;
@@ -89,6 +90,7 @@ static int prepare_metric(struct perf_stat_config *config,
*/
val = NAN;
source_count = 0;
+ aggr_nr = 0;
} else {
struct perf_stat_aggr *aggr =
&ps->aggr[is_tool_time ? tool_aggr_idx : aggr_idx];
@@ -96,6 +98,7 @@ static int prepare_metric(struct perf_stat_config *config,
if (aggr->counts.run == 0) {
val = NAN;
source_count = 0;
+ aggr_nr = 0;
} else {
val = aggr->counts.val;
if (is_tool_time) {
@@ -104,13 +107,14 @@ static int prepare_metric(struct perf_stat_config *config,
}
if (!source_count)
source_count = evsel__source_count(metric_events[i]);
+ aggr_nr = aggr->nr ?: 1;
}
}
n = strdup(evsel__metric_id(metric_events[i]));
if (!n)
return -ENOMEM;
- expr__add_id_val_source_count(pctx, n, val, source_count);
+ expr__add_id_val_source_count_aggr_nr(pctx, n, val, source_count, aggr_nr);
}
for (int j = 0; metric_refs && metric_refs[j].metric_name; j++) {
--
2.54.0.746.g67dd491aae-goog
^ permalink raw reply related [flat|nested] 7+ messages in thread* [PATCH v3 2/2] perf stat: Use aggr_nr scaling for Intel uncore miss latency metrics
2026-05-21 20:15 [PATCH v3 0/2] perf stat: Fix uncore metric scaling across aggregation modes Chun-Tse Shao
2026-05-21 20:15 ` [PATCH v3 1/2] perf stat: Add aggr_nr metric parser support Chun-Tse Shao
@ 2026-05-21 20:15 ` Chun-Tse Shao
2026-05-21 21:08 ` sashiko-bot
2026-05-27 19:10 ` [PATCH v3 0/2] perf stat: Fix uncore metric scaling across aggregation modes Chen, Zide
2026-05-28 19:17 ` Namhyung Kim
3 siblings, 1 reply; 7+ messages in thread
From: Chun-Tse Shao @ 2026-05-21 20:15 UTC (permalink / raw)
To: peterz, mingo, acme, namhyung
Cc: mark.rutland, alexander.shishkin, jolsa, irogers, adrian.hunter,
james.clark, sandipan.das, leo.yan, thomas.falcon, yang.lee,
linux-perf-users, linux-kernel, Chun-Tse Shao
Update `metric.py` to support the new `aggr_nr` keyword in the python
metric generator. Replace the usage of `source_count` with `aggr_nr` in
`IntelMissLat` inside `intel_metrics.py` so that uncore latency metrics
(like `lpm_miss_lat`) scale correctly on multi-socket and SNC systems when
aggregated globally.
Additionally, update the validation bypass logic in `CheckEveryEvent()`
inside `metric.py` to whitelist 'cha' and 'uncore' events. This
prevents validation failures when compiling metrics referencing these
PMU-specific uncore events.
Signed-off-by: Chun-Tse Shao <ctshao@google.com>
Assisted-by: Gemini:gemini-3.1-pro-preview
---
tools/perf/pmu-events/intel_metrics.py | 6 +++---
tools/perf/pmu-events/metric.py | 9 +++++++--
2 files changed, 10 insertions(+), 5 deletions(-)
diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py
index 52035433b505..d99c7dd43797 100755
--- a/tools/perf/pmu-events/intel_metrics.py
+++ b/tools/perf/pmu-events/intel_metrics.py
@@ -7,7 +7,7 @@ import os
import re
from typing import Optional
from common_metrics import Cycles
-from metric import (d_ratio, has_event, max, source_count, CheckPmu, Event,
+from metric import (d_ratio, has_event, max, aggr_nr, CheckPmu, Event,
JsonEncodeMetric, JsonEncodeMetricGroupDescriptions,
Literal, LoadEvents, Metric, MetricConstraint, MetricGroup,
MetricRef, Select)
@@ -674,10 +674,10 @@ def IntelMissLat() -> Optional[MetricGroup]:
else:
assert data_rd_loc_occ.name == "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_LOCAL", data_rd_loc_occ
- ticks_per_cha = ticks / source_count(data_rd_loc_ins)
+ ticks_per_cha = ticks / aggr_nr(data_rd_loc_ins)
loc_lat = interval_sec * 1e9 * data_rd_loc_occ / \
(ticks_per_cha * data_rd_loc_ins)
- ticks_per_cha = ticks / source_count(data_rd_rem_ins)
+ ticks_per_cha = ticks / aggr_nr(data_rd_rem_ins)
rem_lat = interval_sec * 1e9 * data_rd_rem_occ / \
(ticks_per_cha * data_rd_rem_ins)
return MetricGroup("lpm_miss_lat", [
diff --git a/tools/perf/pmu-events/metric.py b/tools/perf/pmu-events/metric.py
index ac582db785fc..a91ccb5977f0 100644
--- a/tools/perf/pmu-events/metric.py
+++ b/tools/perf/pmu-events/metric.py
@@ -93,7 +93,7 @@ def CheckEveryEvent(*names: str) -> None:
name = name[:name.find(':')]
elif '/' in name:
name = name[:name.find('/')]
- if any([name.startswith(x) for x in ['amd', 'arm', 'cpu', 'msr', 'power']]):
+ if any([name.startswith(x) for x in ['amd', 'arm', 'cpu', 'msr', 'power', 'cha', 'uncore']]):
continue
if name not in all_events_all_models:
raise Exception(f"Is {name} a named json event?")
@@ -576,6 +576,11 @@ def source_count(event: Event) -> Function:
return Function('source_count', event)
+def aggr_nr(event: Event) -> Function:
+ # pylint: disable=invalid-name
+ return Function('aggr_nr', event)
+
+
def has_event(event: Event) -> Function:
# pylint: disable=redefined-builtin
# pylint: disable=invalid-name
@@ -762,7 +767,7 @@ def ParsePerfJson(orig: str) -> Expression:
# Convert accidentally converted scientific notation constants back
py = re.sub(r'([0-9]+)Event\(r"(e[0-9]*)"\)', r'\1\2', py)
# Convert all the known keywords back from events to just the keyword
- keywords = ['if', 'else', 'min', 'max', 'd_ratio', 'source_count', 'has_event', 'strcmp_cpuid_str']
+ keywords = ['if', 'else', 'min', 'max', 'd_ratio', 'source_count', 'aggr_nr', 'has_event', 'strcmp_cpuid_str']
for kw in keywords:
py = re.sub(rf'Event\(r"{kw}"\)', kw, py)
try:
--
2.54.0.746.g67dd491aae-goog
^ permalink raw reply related [flat|nested] 7+ messages in thread* Re: [PATCH v3 2/2] perf stat: Use aggr_nr scaling for Intel uncore miss latency metrics
2026-05-21 20:15 ` [PATCH v3 2/2] perf stat: Use aggr_nr scaling for Intel uncore miss latency metrics Chun-Tse Shao
@ 2026-05-21 21:08 ` sashiko-bot
2026-05-21 22:01 ` Chun-Tse Shao
0 siblings, 1 reply; 7+ messages in thread
From: sashiko-bot @ 2026-05-21 21:08 UTC (permalink / raw)
To: Chun-Tse Shao; +Cc: linux-perf-users
Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
- [Medium] Replacing `source_count` with `aggr_nr` for uncore events yields mathematically incorrect metrics.
--
commit 25ed3f24e52414351780a2cc517f2d6534c79d03
Author: Chun-Tse Shao <ctshao@google.com>
perf stat: Use aggr_nr scaling for Intel uncore miss latency metrics
Update metric.py to support the new aggr_nr keyword in the python metric
generator. Replace the usage of source_count with aggr_nr in IntelMissLat
inside intel_metrics.py so that uncore latency metrics scale correctly on
multi-socket and SNC systems when aggregated globally.
> diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py
> index 52035433b5054..d99c7dd43797c 100755
> --- a/tools/perf/pmu-events/intel_metrics.py
> +++ b/tools/perf/pmu-events/intel_metrics.py
[ ... ]
> @@ -674,10 +674,10 @@ def IntelMissLat() -> Optional[MetricGroup]:
> else:
> assert data_rd_loc_occ.name == "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_LOCAL", data_rd_loc_occ
>
> - ticks_per_cha = ticks / source_count(data_rd_loc_ins)
> + ticks_per_cha = ticks / aggr_nr(data_rd_loc_ins)
[Severity: Medium]
Does replacing source_count with aggr_nr here yield a mathematically incorrect
metric?
aggr_nr evaluates to the number of CPUs in the aggregation unit for a specific
evsel, while source_count correctly tracks the number of PMU event sources
(e.g., total CHAs).
For system-wide uncore events in global aggregation mode (AGGR_GLOBAL), each
PMU's evsel typically monitors a single CPU mask, so its aggr->nr evaluates
to 1.
Could dividing ticks by 1 instead of the total number of CHAs result in
metrics being off by a factor of the system's CHA count?
> loc_lat = interval_sec * 1e9 * data_rd_loc_occ / \
> (ticks_per_cha * data_rd_loc_ins)
> - ticks_per_cha = ticks / source_count(data_rd_rem_ins)
> + ticks_per_cha = ticks / aggr_nr(data_rd_rem_ins)
> rem_lat = interval_sec * 1e9 * data_rd_rem_occ / \
> (ticks_per_cha * data_rd_rem_ins)
> return MetricGroup("lpm_miss_lat", [
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260521201505.124690-1-ctshao@google.com?part=2
^ permalink raw reply [flat|nested] 7+ messages in thread* Re: [PATCH v3 2/2] perf stat: Use aggr_nr scaling for Intel uncore miss latency metrics
2026-05-21 21:08 ` sashiko-bot
@ 2026-05-21 22:01 ` Chun-Tse Shao
0 siblings, 0 replies; 7+ messages in thread
From: Chun-Tse Shao @ 2026-05-21 22:01 UTC (permalink / raw)
To: sashiko-reviews; +Cc: linux-perf-users
On Thu, May 21, 2026 at 2:08 PM <sashiko-bot@kernel.org> wrote:
>
> Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
> - [Medium] Replacing `source_count` with `aggr_nr` for uncore events yields mathematically incorrect metrics.
> --
>
> commit 25ed3f24e52414351780a2cc517f2d6534c79d03
> Author: Chun-Tse Shao <ctshao@google.com>
>
> perf stat: Use aggr_nr scaling for Intel uncore miss latency metrics
>
> Update metric.py to support the new aggr_nr keyword in the python metric
> generator. Replace the usage of source_count with aggr_nr in IntelMissLat
> inside intel_metrics.py so that uncore latency metrics scale correctly on
> multi-socket and SNC systems when aggregated globally.
>
> > diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py
> > index 52035433b5054..d99c7dd43797c 100755
> > --- a/tools/perf/pmu-events/intel_metrics.py
> > +++ b/tools/perf/pmu-events/intel_metrics.py
> [ ... ]
> > @@ -674,10 +674,10 @@ def IntelMissLat() -> Optional[MetricGroup]:
> > else:
> > assert data_rd_loc_occ.name == "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_LOCAL", data_rd_loc_occ
> >
> > - ticks_per_cha = ticks / source_count(data_rd_loc_ins)
> > + ticks_per_cha = ticks / aggr_nr(data_rd_loc_ins)
>
> [Severity: Medium]
> Does replacing source_count with aggr_nr here yield a mathematically incorrect
> metric?
>
> aggr_nr evaluates to the number of CPUs in the aggregation unit for a specific
> evsel, while source_count correctly tracks the number of PMU event sources
> (e.g., total CHAs).
>
> For system-wide uncore events in global aggregation mode (AGGR_GLOBAL), each
> PMU's evsel typically monitors a single CPU mask, so its aggr->nr evaluates
> to 1.
>
> Could dividing ticks by 1 instead of the total number of CHAs result in
> metrics being off by a factor of the system's CHA count?
>
A false alarm, source_count is always a static number of evsel, which
should be replaced by aggr_nr when calculating per-socket metrics.
> > loc_lat = interval_sec * 1e9 * data_rd_loc_occ / \
> > (ticks_per_cha * data_rd_loc_ins)
> > - ticks_per_cha = ticks / source_count(data_rd_rem_ins)
> > + ticks_per_cha = ticks / aggr_nr(data_rd_rem_ins)
> > rem_lat = interval_sec * 1e9 * data_rd_rem_occ / \
> > (ticks_per_cha * data_rd_rem_ins)
> > return MetricGroup("lpm_miss_lat", [
>
> --
> Sashiko AI review · https://sashiko.dev/#/patchset/20260521201505.124690-1-ctshao@google.com?part=2
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v3 0/2] perf stat: Fix uncore metric scaling across aggregation modes
2026-05-21 20:15 [PATCH v3 0/2] perf stat: Fix uncore metric scaling across aggregation modes Chun-Tse Shao
2026-05-21 20:15 ` [PATCH v3 1/2] perf stat: Add aggr_nr metric parser support Chun-Tse Shao
2026-05-21 20:15 ` [PATCH v3 2/2] perf stat: Use aggr_nr scaling for Intel uncore miss latency metrics Chun-Tse Shao
@ 2026-05-27 19:10 ` Chen, Zide
2026-05-28 19:17 ` Namhyung Kim
3 siblings, 0 replies; 7+ messages in thread
From: Chen, Zide @ 2026-05-27 19:10 UTC (permalink / raw)
To: Chun-Tse Shao, peterz, mingo, acme, namhyung
Cc: mark.rutland, alexander.shishkin, jolsa, irogers, adrian.hunter,
james.clark, sandipan.das, leo.yan, thomas.falcon, yang.lee,
linux-perf-users, linux-kernel
On 5/21/2026 3:15 PM, Chun-Tse Shao wrote:
> This series fixes a scaling issue for metrics (like lpm_miss_lat) across
> different runtime aggregation modes.
>
> Uncore metrics currently use `source_count` to scale events. However,
> `source_count` returns the total uncore unit count regardless of the
> selected aggregation mode. When evaluating metrics in different
> aggregation mode other than `--per-socket`, this incorrectly divides
> aggregated uncore events against the total uncore count rather than the
> uncores belonging to the aggregation, leading to wrong metric results.
>
> To fix this, we:
> 1. Introduce the aggr_nr() keyword to the metric parser, which
> dynamically resolves to the active units in the current aggregation
> group (`gr->nr`).
>
> 2. Update the python metrics to use `aggr_nr` instead of `source_count`,
> ensuring correct scaling across all runtime aggregation boundaries.
>
> Before the fix (incorrect low latency in global mode):
> $ perf stat -M lpm_miss_lat --metric-only -a -j -- sleep 1
> {"ns lpm_miss_lat_rem" : "122.8", "ns lpm_miss_lat_loc" : "114.5"}
> $ perf stat -M lpm_miss_lat --per-socket --metric-only -a -j -- sleep 1
> {"socket" : "S0", "ns lpm_miss_lat_rem" : "232.1", "ns lpm_miss_lat_loc" : "278.2"}
> {"socket" : "S1", "ns lpm_miss_lat_rem" : "233.9", "ns lpm_miss_lat_loc" : "257.5"}
>
> After the fix (correct scaled latency in all aggregation modes):
> $ perf stat -M lpm_miss_lat --metric-only -a -j -- sleep 1
> {"ns lpm_miss_lat_rem" : "231.7", "ns lpm_miss_lat_loc" : "245.0"}
> $ perf stat -M lpm_miss_lat --per-socket --metric-only -a -j -- sleep 1
> {"socket" : "S0", "ns lpm_miss_lat_rem" : "238.3", "ns lpm_miss_lat_loc" : "249.4"}
> {"socket" : "S1", "ns lpm_miss_lat_rem" : "259.1", "ns lpm_miss_lat_loc" : "253.1"}
>
> v3:
> Fixed based on Sashiko review:
> - Removed the unnecessary, copied `redefined-builtin` pylint-disable
> comment from `aggr_nr` definition inside `metric.py`.
>
> v2: lore.kernel.org/20260521035941.3860145-1-ctshao@google.com
> Fixed based on Sashiko review:
> - Fixed `aggr_nr` setting when an uncore event fails to run
> (counts.run == 0) to explicitly set it to 0 instead of defaulting to
> 1.
> - Accumulated `aggr_nr` when multiple unmerged PMU events are
> associated with the same metric ID to prevent incorrect scaling
> across active sockets.
> - Removed unused `List` import from `typing` in `intel_metrics.py`.
>
> v1: lore.kernel.org/20260520180032.3045144-1-ctshao@google.com
>
> Chun-Tse Shao (2):
> perf stat: Add aggr_nr metric parser support
> perf stat: Use aggr_nr scaling for Intel uncore miss latency metrics
>
> tools/perf/pmu-events/intel_metrics.py | 6 +++---
> tools/perf/pmu-events/metric.py | 9 +++++++--
> tools/perf/util/expr.c | 26 ++++++++++++++++++++++----
> tools/perf/util/expr.h | 6 +++++-
> tools/perf/util/expr.l | 1 +
> tools/perf/util/expr.y | 24 +++++++++++++++++-------
> tools/perf/util/stat-shadow.c | 6 +++++-
> 7 files changed, 60 insertions(+), 18 deletions(-)
>
> --
> 2.54.0.746.g67dd491aae-goog
Tested the series on Intel CPUs.
Tested-by: Zide Chen <zide.chen@intel.com>
^ permalink raw reply [flat|nested] 7+ messages in thread* Re: [PATCH v3 0/2] perf stat: Fix uncore metric scaling across aggregation modes
2026-05-21 20:15 [PATCH v3 0/2] perf stat: Fix uncore metric scaling across aggregation modes Chun-Tse Shao
` (2 preceding siblings ...)
2026-05-27 19:10 ` [PATCH v3 0/2] perf stat: Fix uncore metric scaling across aggregation modes Chen, Zide
@ 2026-05-28 19:17 ` Namhyung Kim
3 siblings, 0 replies; 7+ messages in thread
From: Namhyung Kim @ 2026-05-28 19:17 UTC (permalink / raw)
To: Chun-Tse Shao
Cc: peterz, mingo, acme, mark.rutland, alexander.shishkin, jolsa,
irogers, adrian.hunter, james.clark, sandipan.das, leo.yan,
thomas.falcon, yang.lee, linux-perf-users, linux-kernel
On Thu, May 21, 2026 at 01:15:03PM -0700, Chun-Tse Shao wrote:
> This series fixes a scaling issue for metrics (like lpm_miss_lat) across
> different runtime aggregation modes.
>
> Uncore metrics currently use `source_count` to scale events. However,
> `source_count` returns the total uncore unit count regardless of the
> selected aggregation mode. When evaluating metrics in different
> aggregation mode other than `--per-socket`, this incorrectly divides
> aggregated uncore events against the total uncore count rather than the
> uncores belonging to the aggregation, leading to wrong metric results.
>
> To fix this, we:
> 1. Introduce the aggr_nr() keyword to the metric parser, which
> dynamically resolves to the active units in the current aggregation
> group (`gr->nr`).
>
> 2. Update the python metrics to use `aggr_nr` instead of `source_count`,
> ensuring correct scaling across all runtime aggregation boundaries.
>
> Before the fix (incorrect low latency in global mode):
> $ perf stat -M lpm_miss_lat --metric-only -a -j -- sleep 1
> {"ns lpm_miss_lat_rem" : "122.8", "ns lpm_miss_lat_loc" : "114.5"}
> $ perf stat -M lpm_miss_lat --per-socket --metric-only -a -j -- sleep 1
> {"socket" : "S0", "ns lpm_miss_lat_rem" : "232.1", "ns lpm_miss_lat_loc" : "278.2"}
> {"socket" : "S1", "ns lpm_miss_lat_rem" : "233.9", "ns lpm_miss_lat_loc" : "257.5"}
>
> After the fix (correct scaled latency in all aggregation modes):
> $ perf stat -M lpm_miss_lat --metric-only -a -j -- sleep 1
> {"ns lpm_miss_lat_rem" : "231.7", "ns lpm_miss_lat_loc" : "245.0"}
> $ perf stat -M lpm_miss_lat --per-socket --metric-only -a -j -- sleep 1
> {"socket" : "S0", "ns lpm_miss_lat_rem" : "238.3", "ns lpm_miss_lat_loc" : "249.4"}
> {"socket" : "S1", "ns lpm_miss_lat_rem" : "259.1", "ns lpm_miss_lat_loc" : "253.1"}
>
> v3:
> Fixed based on Sashiko review:
> - Removed the unnecessary, copied `redefined-builtin` pylint-disable
> comment from `aggr_nr` definition inside `metric.py`.
>
> v2: lore.kernel.org/20260521035941.3860145-1-ctshao@google.com
> Fixed based on Sashiko review:
> - Fixed `aggr_nr` setting when an uncore event fails to run
> (counts.run == 0) to explicitly set it to 0 instead of defaulting to
> 1.
> - Accumulated `aggr_nr` when multiple unmerged PMU events are
> associated with the same metric ID to prevent incorrect scaling
> across active sockets.
> - Removed unused `List` import from `typing` in `intel_metrics.py`.
>
> v1: lore.kernel.org/20260520180032.3045144-1-ctshao@google.com
>
> Chun-Tse Shao (2):
> perf stat: Add aggr_nr metric parser support
> perf stat: Use aggr_nr scaling for Intel uncore miss latency metrics
Acked-by: Namhyung Kim <namhyung@kernel.org>
Thanks,
Namhyung
>
> tools/perf/pmu-events/intel_metrics.py | 6 +++---
> tools/perf/pmu-events/metric.py | 9 +++++++--
> tools/perf/util/expr.c | 26 ++++++++++++++++++++++----
> tools/perf/util/expr.h | 6 +++++-
> tools/perf/util/expr.l | 1 +
> tools/perf/util/expr.y | 24 +++++++++++++++++-------
> tools/perf/util/stat-shadow.c | 6 +++++-
> 7 files changed, 60 insertions(+), 18 deletions(-)
>
> --
> 2.54.0.746.g67dd491aae-goog
>
^ permalink raw reply [flat|nested] 7+ messages in thread