* [Patch v7 1/4] perf headers: Sync with the kernel headers
2026-03-24 0:57 [Patch v7 0/4] Perf tools: Support eGPRs/SSP/SIMD registers sampling Dapeng Mi
@ 2026-03-24 0:57 ` Dapeng Mi
2026-03-24 0:57 ` [Patch v7 2/4] perf regs: Support x86 eGPRs/SSP sampling Dapeng Mi
` (2 subsequent siblings)
3 siblings, 0 replies; 10+ messages in thread
From: Dapeng Mi @ 2026-03-24 0:57 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
Namhyung Kim, Thomas Gleixner, Dave Hansen, Ian Rogers,
Adrian Hunter, Jiri Olsa, Alexander Shishkin, Andi Kleen,
Eranian Stephane
Cc: Mark Rutland, broonie, Ravi Bangoria, linux-kernel,
linux-perf-users, Zide Chen, Falcon Thomas, Dapeng Mi, Xudong Hao,
Kan Liang, Dapeng Mi
From: Kan Liang <kan.liang@linux.intel.com>
Update include/uapi/linux/perf_event.h and
arch/x86/include/uapi/asm/perf_regs.h to support extended regs.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Co-developed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
---
V7: Add more comments for newly added register indexes.
tools/arch/x86/include/uapi/asm/perf_regs.h | 51 +++++++++++++++++++++
tools/include/uapi/linux/perf_event.h | 50 ++++++++++++++++++--
2 files changed, 97 insertions(+), 4 deletions(-)
diff --git a/tools/arch/x86/include/uapi/asm/perf_regs.h b/tools/arch/x86/include/uapi/asm/perf_regs.h
index 7c9d2bb3833b..98a5b6c8e24c 100644
--- a/tools/arch/x86/include/uapi/asm/perf_regs.h
+++ b/tools/arch/x86/include/uapi/asm/perf_regs.h
@@ -27,9 +27,35 @@ enum perf_event_x86_regs {
PERF_REG_X86_R13,
PERF_REG_X86_R14,
PERF_REG_X86_R15,
+ /*
+ * The eGPRs/SSP and XMM have overlaps. Only one can be used
+ * at a time. The ABI PERF_SAMPLE_REGS_ABI_SIMD is used to
+ * distinguish which one is used. If PERF_SAMPLE_REGS_ABI_SIMD
+ * is set, then eGPRs/SSP is used, otherwise, XMM is used.
+ *
+ * Extended GPRs (eGPRs)
+ */
+ PERF_REG_X86_R16,
+ PERF_REG_X86_R17,
+ PERF_REG_X86_R18,
+ PERF_REG_X86_R19,
+ PERF_REG_X86_R20,
+ PERF_REG_X86_R21,
+ PERF_REG_X86_R22,
+ PERF_REG_X86_R23,
+ PERF_REG_X86_R24,
+ PERF_REG_X86_R25,
+ PERF_REG_X86_R26,
+ PERF_REG_X86_R27,
+ PERF_REG_X86_R28,
+ PERF_REG_X86_R29,
+ PERF_REG_X86_R30,
+ PERF_REG_X86_R31,
+ PERF_REG_X86_SSP,
/* These are the limits for the GPRs. */
PERF_REG_X86_32_MAX = PERF_REG_X86_GS + 1,
PERF_REG_X86_64_MAX = PERF_REG_X86_R15 + 1,
+ PERF_REG_MISC_MAX = PERF_REG_X86_SSP + 1,
/* These all need two bits set because they are 128bit */
PERF_REG_X86_XMM0 = 32,
@@ -54,5 +80,30 @@ enum perf_event_x86_regs {
};
#define PERF_REG_EXTENDED_MASK (~((1ULL << PERF_REG_X86_XMM0) - 1))
+#define PERF_X86_EGPRS_MASK GENMASK_ULL(PERF_REG_X86_R31, PERF_REG_X86_R16)
+
+enum {
+ PERF_X86_SIMD_XMM_REGS = 16,
+ PERF_X86_SIMD_YMM_REGS = 16,
+ PERF_X86_SIMD_ZMM_REGS = 32,
+ PERF_X86_SIMD_VEC_REGS_MAX = PERF_X86_SIMD_ZMM_REGS,
+
+ PERF_X86_SIMD_OPMASK_REGS = 8,
+ PERF_X86_SIMD_PRED_REGS_MAX = PERF_X86_SIMD_OPMASK_REGS,
+};
+
+#define PERF_X86_SIMD_PRED_MASK GENMASK(PERF_X86_SIMD_PRED_REGS_MAX - 1, 0)
+#define PERF_X86_SIMD_VEC_MASK GENMASK_ULL(PERF_X86_SIMD_VEC_REGS_MAX - 1, 0)
+
+#define PERF_X86_H16ZMM_BASE 16
+
+enum {
+ /* 1 qword = 8 bytes */
+ PERF_X86_OPMASK_QWORDS = 1,
+ PERF_X86_XMM_QWORDS = 2,
+ PERF_X86_YMM_QWORDS = 4,
+ PERF_X86_ZMM_QWORDS = 8,
+ PERF_X86_SIMD_QWORDS_MAX = PERF_X86_ZMM_QWORDS,
+};
#endif /* _ASM_X86_PERF_REGS_H */
diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index 76e9d0664d0c..00bc0a262735 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -314,8 +314,9 @@ enum {
*/
enum perf_sample_regs_abi {
PERF_SAMPLE_REGS_ABI_NONE = 0,
- PERF_SAMPLE_REGS_ABI_32 = 1,
- PERF_SAMPLE_REGS_ABI_64 = 2,
+ PERF_SAMPLE_REGS_ABI_32 = (1 << 0),
+ PERF_SAMPLE_REGS_ABI_64 = (1 << 1),
+ PERF_SAMPLE_REGS_ABI_SIMD = (1 << 2),
};
/*
@@ -383,6 +384,7 @@ enum perf_event_read_format {
#define PERF_ATTR_SIZE_VER7 128 /* Add: sig_data */
#define PERF_ATTR_SIZE_VER8 136 /* Add: config3 */
#define PERF_ATTR_SIZE_VER9 144 /* add: config4 */
+#define PERF_ATTR_SIZE_VER10 176 /* Add: sample_simd_{pred,vec}_reg_* */
/*
* 'struct perf_event_attr' contains various attributes that define
@@ -547,6 +549,30 @@ struct perf_event_attr {
__u64 config3; /* extension of config2 */
__u64 config4; /* extension of config3 */
+
+ /*
+ * Defines the sampling SIMD/PRED registers bitmap and qwords
+ * (8 bytes) length.
+ *
+ * sample_simd_regs_enabled != 0 indicates there are SIMD/PRED registers
+ * to be sampled, the SIMD/PRED registers bitmap and qwords length are
+ * represented in sample_simd_{vec|pred}_reg_{intr|user} and
+ * sample_simd_{vec|pred}_reg_qwords fields.
+ *
+ * sample_simd_regs_enabled == 0 indicates no SIMD/PRED registers are
+ * sampled.
+ */
+ union {
+ __u16 sample_simd_regs_enabled;
+ __u16 sample_simd_pred_reg_qwords;
+ };
+ __u16 sample_simd_vec_reg_qwords;
+ __u32 __reserved_4;
+
+ __u32 sample_simd_pred_reg_intr;
+ __u32 sample_simd_pred_reg_user;
+ __u64 sample_simd_vec_reg_intr;
+ __u64 sample_simd_vec_reg_user;
};
/*
@@ -1020,7 +1046,15 @@ enum perf_event_type {
* } && PERF_SAMPLE_BRANCH_STACK
*
* { u64 abi; # enum perf_sample_regs_abi
- * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER
+ * u64 regs[weight(mask)];
+ * struct {
+ * u16 nr_vectors; # 0 ... weight(sample_simd_vec_reg_user)
+ * u16 vector_qwords; # 0 ... sample_simd_vec_reg_qwords
+ * u16 nr_pred; # 0 ... weight(sample_simd_pred_reg_user)
+ * u16 pred_qwords; # 0 ... sample_simd_pred_reg_qwords
+ * u64 data[nr_vectors * vector_qwords + nr_pred * pred_qwords];
+ * } && (abi & PERF_SAMPLE_REGS_ABI_SIMD)
+ * } && PERF_SAMPLE_REGS_USER
*
* { u64 size;
* char data[size];
@@ -1047,7 +1081,15 @@ enum perf_event_type {
* { u64 data_src; } && PERF_SAMPLE_DATA_SRC
* { u64 transaction; } && PERF_SAMPLE_TRANSACTION
* { u64 abi; # enum perf_sample_regs_abi
- * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
+ * u64 regs[weight(mask)];
+ * struct {
+ * u16 nr_vectors; # 0 ... weight(sample_simd_vec_reg_intr)
+ * u16 vector_qwords; # 0 ... sample_simd_vec_reg_qwords
+ * u16 nr_pred; # 0 ... weight(sample_simd_pred_reg_intr)
+ * u16 pred_qwords; # 0 ... sample_simd_pred_reg_qwords
+ * u64 data[nr_vectors * vector_qwords + nr_pred * pred_qwords];
+ * } && (abi & PERF_SAMPLE_REGS_ABI_SIMD)
+ * } && PERF_SAMPLE_REGS_INTR
* { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR
* { u64 cgroup;} && PERF_SAMPLE_CGROUP
* { u64 data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE
--
2.34.1
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [Patch v7 2/4] perf regs: Support x86 eGPRs/SSP sampling
2026-03-24 0:57 [Patch v7 0/4] Perf tools: Support eGPRs/SSP/SIMD registers sampling Dapeng Mi
2026-03-24 0:57 ` [Patch v7 1/4] perf headers: Sync with the kernel headers Dapeng Mi
@ 2026-03-24 0:57 ` Dapeng Mi
2026-03-24 2:49 ` Ian Rogers
2026-03-26 1:41 ` Mi, Dapeng
2026-03-24 0:57 ` [Patch v7 3/4] perf regs: Support x86 SIMD registers sampling Dapeng Mi
2026-03-24 0:57 ` [Patch v7 4/4] perf regs: Enable dumping of SIMD registers Dapeng Mi
3 siblings, 2 replies; 10+ messages in thread
From: Dapeng Mi @ 2026-03-24 0:57 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
Namhyung Kim, Thomas Gleixner, Dave Hansen, Ian Rogers,
Adrian Hunter, Jiri Olsa, Alexander Shishkin, Andi Kleen,
Eranian Stephane
Cc: Mark Rutland, broonie, Ravi Bangoria, linux-kernel,
linux-perf-users, Zide Chen, Falcon Thomas, Dapeng Mi, Xudong Hao,
Dapeng Mi
This patch adds support for sampling x86 extended GP registers (R16-R31)
and the shadow stack pointer (SSP) register.
The original XMM registers space in sample_regs_user/sample_regs_intr is
reclaimed to represent the eGPRs and SSP when SIMD registers sampling is
supported with the new SIMD sampling fields in the perf_event_attr
structure. This necessitates a way to distinguish which register layout
is used for the sample_regs_user/sample_regs_intr bitmap.
To address this, a new "abi" argument is added to the helpers
perf_intr_reg_mask(), perf_user_reg_mask(), and perf_reg_name(). When
"abi & PERF_SAMPLE_REGS_ABI_SIMD" is true, it indicates the eGPRs and SSP
layout is represented; otherwise, the legacy XMM registers are
represented.
Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
---
V7: Limit dwarf minimal regs to legacy GPRs (excluding APX eGPRs).
tools/perf/builtin-script.c | 2 +-
tools/perf/util/evsel.c | 7 +-
tools/perf/util/parse-regs-options.c | 17 ++-
.../perf/util/perf-regs-arch/perf_regs_x86.c | 124 +++++++++++++++---
tools/perf/util/perf_regs.c | 12 +-
tools/perf/util/perf_regs.h | 10 +-
.../scripting-engines/trace-event-python.c | 2 +-
tools/perf/util/session.c | 9 +-
8 files changed, 142 insertions(+), 41 deletions(-)
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index b80c406d1fc1..714528732e02 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -730,7 +730,7 @@ static int perf_sample__fprintf_regs(struct regs_dump *regs, uint64_t mask,
for_each_set_bit(r, (unsigned long *) &mask, sizeof(mask) * 8) {
u64 val = regs->regs[i++];
printed += fprintf(fp, "%5s:0x%"PRIx64" ",
- perf_reg_name(r, e_machine, e_flags),
+ perf_reg_name(r, e_machine, e_flags, regs->abi),
val);
}
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 5a294595a677..f565ef2eb476 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1054,19 +1054,22 @@ static void __evsel__config_callchain(struct evsel *evsel, const struct record_o
}
if (param->record_mode == CALLCHAIN_DWARF) {
+ int abi = -1; /* -1 indicates only basic GPRs are needed. */
+
if (!function) {
uint16_t e_machine = evsel__e_machine(evsel, /*e_flags=*/NULL);
evsel__set_sample_bit(evsel, REGS_USER);
evsel__set_sample_bit(evsel, STACK_USER);
if (opts->sample_user_regs &&
- DWARF_MINIMAL_REGS(e_machine) != perf_user_reg_mask(EM_HOST)) {
+ DWARF_MINIMAL_REGS(e_machine) != perf_user_reg_mask(EM_HOST, &abi)) {
attr->sample_regs_user |= DWARF_MINIMAL_REGS(e_machine);
pr_warning("WARNING: The use of --call-graph=dwarf may require all the user registers, "
"specifying a subset with --user-regs may render DWARF unwinding unreliable, "
"so the minimal registers set (IP, SP) is explicitly forced.\n");
} else {
- attr->sample_regs_user |= perf_user_reg_mask(EM_HOST);
+ abi = -1;
+ attr->sample_regs_user |= perf_user_reg_mask(EM_HOST, &abi);
}
attr->sample_stack_user = param->dump_size;
attr->exclude_callchain_user = 1;
diff --git a/tools/perf/util/parse-regs-options.c b/tools/perf/util/parse-regs-options.c
index c93c2f0c8105..6cf865bfc2f7 100644
--- a/tools/perf/util/parse-regs-options.c
+++ b/tools/perf/util/parse-regs-options.c
@@ -10,7 +10,8 @@
#include "util/perf_regs.h"
#include "util/parse-regs-options.h"
-static void list_perf_regs(FILE *fp, uint64_t mask)
+static void
+list_perf_regs(FILE *fp, uint64_t mask, int abi)
{
const char *last_name = NULL;
@@ -21,7 +22,7 @@ static void list_perf_regs(FILE *fp, uint64_t mask)
if (((1ULL << reg) & mask) == 0)
continue;
- name = perf_reg_name(reg, EM_HOST, EF_HOST);
+ name = perf_reg_name(reg, EM_HOST, EF_HOST, abi);
if (name && (!last_name || strcmp(last_name, name)))
fprintf(fp, "%s%s", reg > 0 ? " " : "", name);
last_name = name;
@@ -29,7 +30,8 @@ static void list_perf_regs(FILE *fp, uint64_t mask)
fputc('\n', fp);
}
-static uint64_t name_to_perf_reg_mask(const char *to_match, uint64_t mask)
+static uint64_t
+name_to_perf_reg_mask(const char *to_match, uint64_t mask, int abi)
{
uint64_t reg_mask = 0;
@@ -39,7 +41,7 @@ static uint64_t name_to_perf_reg_mask(const char *to_match, uint64_t mask)
if (((1ULL << reg) & mask) == 0)
continue;
- name = perf_reg_name(reg, EM_HOST, EF_HOST);
+ name = perf_reg_name(reg, EM_HOST, EF_HOST, abi);
if (!name)
continue;
@@ -56,6 +58,7 @@ __parse_regs(const struct option *opt, const char *str, int unset, bool intr)
char *s, *os = NULL, *p;
int ret = -1;
uint64_t mask;
+ int abi = 0;
if (unset)
return 0;
@@ -66,7 +69,7 @@ __parse_regs(const struct option *opt, const char *str, int unset, bool intr)
if (*mode)
return -1;
- mask = intr ? perf_intr_reg_mask(EM_HOST) : perf_user_reg_mask(EM_HOST);
+ mask = intr ? perf_intr_reg_mask(EM_HOST, &abi) : perf_user_reg_mask(EM_HOST, &abi);
/* str may be NULL in case no arg is passed to -I */
if (!str) {
@@ -87,11 +90,11 @@ __parse_regs(const struct option *opt, const char *str, int unset, bool intr)
*p = '\0';
if (!strcmp(s, "?")) {
- list_perf_regs(stderr, mask);
+ list_perf_regs(stderr, mask, abi);
goto error;
}
- reg_mask = name_to_perf_reg_mask(s, mask);
+ reg_mask = name_to_perf_reg_mask(s, mask, abi);
if (reg_mask == 0) {
ui__warning("Unknown register \"%s\", check man page or run \"perf record %s?\"\n",
s, intr ? "-I" : "--user-regs=");
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_x86.c b/tools/perf/util/perf-regs-arch/perf_regs_x86.c
index b6d20522b4e8..ae26d991cdc9 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_x86.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_x86.c
@@ -235,26 +235,26 @@ int __perf_sdt_arg_parse_op_x86(char *old_op, char **new_op)
return SDT_ARG_VALID;
}
-uint64_t __perf_reg_mask_x86(bool intr)
+static uint64_t __arch__reg_mask(u64 sample_type, u64 mask, bool has_simd_regs)
{
struct perf_event_attr attr = {
- .type = PERF_TYPE_HARDWARE,
- .config = PERF_COUNT_HW_CPU_CYCLES,
- .sample_type = PERF_SAMPLE_REGS_INTR,
- .sample_regs_intr = PERF_REG_EXTENDED_MASK,
- .precise_ip = 1,
- .disabled = 1,
- .exclude_kernel = 1,
+ .type = PERF_TYPE_HARDWARE,
+ .config = PERF_COUNT_HW_CPU_CYCLES,
+ .sample_type = sample_type,
+ .precise_ip = 1,
+ .disabled = 1,
+ .exclude_kernel = 1,
+ .sample_simd_regs_enabled = has_simd_regs,
};
int fd;
-
- if (!intr)
- return PERF_REGS_MASK;
-
/*
* In an unnamed union, init it here to build on older gcc versions
*/
attr.sample_period = 1;
+ if (sample_type == PERF_SAMPLE_REGS_INTR)
+ attr.sample_regs_intr = mask;
+ else
+ attr.sample_regs_user = mask;
if (perf_pmus__num_core_pmus() > 1) {
struct perf_pmu *pmu = NULL;
@@ -276,13 +276,38 @@ uint64_t __perf_reg_mask_x86(bool intr)
/*group_fd=*/-1, /*flags=*/0);
if (fd != -1) {
close(fd);
- return (PERF_REG_EXTENDED_MASK | PERF_REGS_MASK);
+ return mask;
+ }
+
+ return 0;
+}
+
+uint64_t __perf_reg_mask_x86(bool intr, int *abi)
+{
+ u64 sample_type = intr ? PERF_SAMPLE_REGS_INTR : PERF_SAMPLE_REGS_USER;
+ uint64_t mask = PERF_REGS_MASK;
+
+ /* -1 indicates only basic GPRs are needed. */
+ if (*abi < 0)
+ return PERF_REGS_MASK;
+
+ *abi = 0;
+ mask |= __arch__reg_mask(sample_type,
+ GENMASK_ULL(PERF_REG_X86_R31, PERF_REG_X86_R16),
+ true);
+ mask |= __arch__reg_mask(sample_type, BIT_ULL(PERF_REG_X86_SSP), true);
+
+ if (mask != PERF_REGS_MASK) {
+ *abi |= PERF_SAMPLE_REGS_ABI_SIMD;
+ } else {
+ mask |= __arch__reg_mask(sample_type, PERF_REG_EXTENDED_MASK,
+ false);
}
- return PERF_REGS_MASK;
+ return mask;
}
-const char *__perf_reg_name_x86(int id)
+static const char *__arch_reg_gpr_name(int id)
{
switch (id) {
case PERF_REG_X86_AX:
@@ -333,7 +358,60 @@ const char *__perf_reg_name_x86(int id)
return "R14";
case PERF_REG_X86_R15:
return "R15";
+ default:
+ return NULL;
+ }
+
+ return NULL;
+}
+static const char *__arch_reg_egpr_name(int id)
+{
+ switch (id) {
+ case PERF_REG_X86_R16:
+ return "R16";
+ case PERF_REG_X86_R17:
+ return "R17";
+ case PERF_REG_X86_R18:
+ return "R18";
+ case PERF_REG_X86_R19:
+ return "R19";
+ case PERF_REG_X86_R20:
+ return "R20";
+ case PERF_REG_X86_R21:
+ return "R21";
+ case PERF_REG_X86_R22:
+ return "R22";
+ case PERF_REG_X86_R23:
+ return "R23";
+ case PERF_REG_X86_R24:
+ return "R24";
+ case PERF_REG_X86_R25:
+ return "R25";
+ case PERF_REG_X86_R26:
+ return "R26";
+ case PERF_REG_X86_R27:
+ return "R27";
+ case PERF_REG_X86_R28:
+ return "R28";
+ case PERF_REG_X86_R29:
+ return "R29";
+ case PERF_REG_X86_R30:
+ return "R30";
+ case PERF_REG_X86_R31:
+ return "R31";
+ case PERF_REG_X86_SSP:
+ return "SSP";
+ default:
+ return NULL;
+ }
+
+ return NULL;
+}
+
+static const char *__arch_reg_xmm_name(int id)
+{
+ switch (id) {
#define XMM(x) \
case PERF_REG_X86_XMM ## x: \
case PERF_REG_X86_XMM ## x + 1: \
@@ -362,6 +440,22 @@ const char *__perf_reg_name_x86(int id)
return NULL;
}
+const char *__perf_reg_name_x86(int id, int abi)
+{
+ const char *name;
+
+ name = __arch_reg_gpr_name(id);
+ if (name)
+ return name;
+
+ if (abi & PERF_SAMPLE_REGS_ABI_SIMD)
+ name = __arch_reg_egpr_name(id);
+ else
+ name = __arch_reg_xmm_name(id);
+
+ return name;
+}
+
uint64_t __perf_reg_ip_x86(void)
{
return PERF_REG_X86_IP;
diff --git a/tools/perf/util/perf_regs.c b/tools/perf/util/perf_regs.c
index 5b8f34beb24e..afc567718bee 100644
--- a/tools/perf/util/perf_regs.c
+++ b/tools/perf/util/perf_regs.c
@@ -32,7 +32,7 @@ int perf_sdt_arg_parse_op(uint16_t e_machine, char *old_op, char **new_op)
return ret;
}
-uint64_t perf_intr_reg_mask(uint16_t e_machine)
+uint64_t perf_intr_reg_mask(uint16_t e_machine, int *abi)
{
uint64_t mask = 0;
@@ -64,7 +64,7 @@ uint64_t perf_intr_reg_mask(uint16_t e_machine)
break;
case EM_386:
case EM_X86_64:
- mask = __perf_reg_mask_x86(/*intr=*/true);
+ mask = __perf_reg_mask_x86(/*intr=*/true, abi);
break;
default:
pr_debug("Unknown ELF machine %d, interrupt sampling register mask will be empty.\n",
@@ -75,7 +75,7 @@ uint64_t perf_intr_reg_mask(uint16_t e_machine)
return mask;
}
-uint64_t perf_user_reg_mask(uint16_t e_machine)
+uint64_t perf_user_reg_mask(uint16_t e_machine, int *abi)
{
uint64_t mask = 0;
@@ -107,7 +107,7 @@ uint64_t perf_user_reg_mask(uint16_t e_machine)
break;
case EM_386:
case EM_X86_64:
- mask = __perf_reg_mask_x86(/*intr=*/false);
+ mask = __perf_reg_mask_x86(/*intr=*/false, abi);
break;
default:
pr_debug("Unknown ELF machine %d, user sampling register mask will be empty.\n",
@@ -118,7 +118,7 @@ uint64_t perf_user_reg_mask(uint16_t e_machine)
return mask;
}
-const char *perf_reg_name(int id, uint16_t e_machine, uint32_t e_flags)
+const char *perf_reg_name(int id, uint16_t e_machine, uint32_t e_flags, int abi)
{
const char *reg_name = NULL;
@@ -150,7 +150,7 @@ const char *perf_reg_name(int id, uint16_t e_machine, uint32_t e_flags)
break;
case EM_386:
case EM_X86_64:
- reg_name = __perf_reg_name_x86(id);
+ reg_name = __perf_reg_name_x86(id, abi);
break;
default:
break;
diff --git a/tools/perf/util/perf_regs.h b/tools/perf/util/perf_regs.h
index 7c04700bf837..c9501ca8045d 100644
--- a/tools/perf/util/perf_regs.h
+++ b/tools/perf/util/perf_regs.h
@@ -13,10 +13,10 @@ enum {
};
int perf_sdt_arg_parse_op(uint16_t e_machine, char *old_op, char **new_op);
-uint64_t perf_intr_reg_mask(uint16_t e_machine);
-uint64_t perf_user_reg_mask(uint16_t e_machine);
+uint64_t perf_intr_reg_mask(uint16_t e_machine, int *abi);
+uint64_t perf_user_reg_mask(uint16_t e_machine, int *abi);
-const char *perf_reg_name(int id, uint16_t e_machine, uint32_t e_flags);
+const char *perf_reg_name(int id, uint16_t e_machine, uint32_t e_flags, int abi);
int perf_reg_value(u64 *valp, struct regs_dump *regs, int id);
uint64_t perf_arch_reg_ip(uint16_t e_machine);
uint64_t perf_arch_reg_sp(uint16_t e_machine);
@@ -64,8 +64,8 @@ uint64_t __perf_reg_ip_s390(void);
uint64_t __perf_reg_sp_s390(void);
int __perf_sdt_arg_parse_op_x86(char *old_op, char **new_op);
-uint64_t __perf_reg_mask_x86(bool intr);
-const char *__perf_reg_name_x86(int id);
+uint64_t __perf_reg_mask_x86(bool intr, int *abi);
+const char *__perf_reg_name_x86(int id, int abi);
uint64_t __perf_reg_ip_x86(void);
uint64_t __perf_reg_sp_x86(void);
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
index 2b0df7bd9a46..4cc5b96898e6 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -733,7 +733,7 @@ static void regs_map(struct regs_dump *regs, uint64_t mask, uint16_t e_machine,
printed += scnprintf(bf + printed, size - printed,
"%5s:0x%" PRIx64 " ",
- perf_reg_name(r, e_machine, e_flags), val);
+ perf_reg_name(r, e_machine, e_flags, regs->abi), val);
}
}
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 4b465abfa36c..7cf7bf86205d 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -959,15 +959,16 @@ static void branch_stack__printf(struct perf_sample *sample,
}
}
-static void regs_dump__printf(u64 mask, u64 *regs, uint16_t e_machine, uint32_t e_flags)
+static void regs_dump__printf(u64 mask, struct regs_dump *regs,
+ uint16_t e_machine, uint32_t e_flags)
{
unsigned rid, i = 0;
for_each_set_bit(rid, (unsigned long *) &mask, sizeof(mask) * 8) {
- u64 val = regs[i++];
+ u64 val = regs->regs[i++];
printf(".... %-5s 0x%016" PRIx64 "\n",
- perf_reg_name(rid, e_machine, e_flags), val);
+ perf_reg_name(rid, e_machine, e_flags, regs->abi), val);
}
}
@@ -995,7 +996,7 @@ static void regs__printf(const char *type, struct regs_dump *regs,
mask,
regs_dump_abi(regs));
- regs_dump__printf(mask, regs->regs, e_machine, e_flags);
+ regs_dump__printf(mask, regs, e_machine, e_flags);
}
static void regs_user__printf(struct perf_sample *sample, uint16_t e_machine, uint32_t e_flags)
--
2.34.1
^ permalink raw reply related [flat|nested] 10+ messages in thread
* Re: [Patch v7 2/4] perf regs: Support x86 eGPRs/SSP sampling
2026-03-24 0:57 ` [Patch v7 2/4] perf regs: Support x86 eGPRs/SSP sampling Dapeng Mi
@ 2026-03-24 2:49 ` Ian Rogers
2026-03-25 2:08 ` Mi, Dapeng
2026-03-26 1:41 ` Mi, Dapeng
1 sibling, 1 reply; 10+ messages in thread
From: Ian Rogers @ 2026-03-24 2:49 UTC (permalink / raw)
To: Dapeng Mi
Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
Namhyung Kim, Thomas Gleixner, Dave Hansen, Adrian Hunter,
Jiri Olsa, Alexander Shishkin, Andi Kleen, Eranian Stephane,
Mark Rutland, broonie, Ravi Bangoria, linux-kernel,
linux-perf-users, Zide Chen, Falcon Thomas, Dapeng Mi, Xudong Hao
On Mon, Mar 23, 2026 at 6:01 PM Dapeng Mi <dapeng1.mi@linux.intel.com> wrote:
>
> This patch adds support for sampling x86 extended GP registers (R16-R31)
> and the shadow stack pointer (SSP) register.
>
> The original XMM registers space in sample_regs_user/sample_regs_intr is
> reclaimed to represent the eGPRs and SSP when SIMD registers sampling is
> supported with the new SIMD sampling fields in the perf_event_attr
> structure. This necessitates a way to distinguish which register layout
> is used for the sample_regs_user/sample_regs_intr bitmap.
>
> To address this, a new "abi" argument is added to the helpers
> perf_intr_reg_mask(), perf_user_reg_mask(), and perf_reg_name(). When
> "abi & PERF_SAMPLE_REGS_ABI_SIMD" is true, it indicates the eGPRs and SSP
> layout is represented; otherwise, the legacy XMM registers are
> represented.
>
> Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
> ---
>
> V7: Limit dwarf minimal regs to legacy GPRs (excluding APX eGPRs).
So, R16 to R31 won't be set up? This sounds worrying because a
function, like a leaf function, might use these registers as a frame
pointer with -fomit-frame-pointer when the code uses
functions like alloca.
So not having vector register support potentially breaks things like
LLVM's spill2reg, which aims to avoid using the stack and prefers
vector registers:
https://discourse.llvm.org/t/rfc-spill2reg-selectively-replace-spills-to-stack-with-spills-to-vector-registers/59630
I think the dwarf minimal registers need extending to cover SIMD
registers, specifically those outside the mask, but this isn't
supported currently. For now, however, it would be useful to at least
warn when we receive a request (via the dwarf information in a binary)
for a register that we're choosing not to include in the sample set,
otherwise they could look like spurious unwind errors.
>
> tools/perf/builtin-script.c | 2 +-
> tools/perf/util/evsel.c | 7 +-
> tools/perf/util/parse-regs-options.c | 17 ++-
> .../perf/util/perf-regs-arch/perf_regs_x86.c | 124 +++++++++++++++---
> tools/perf/util/perf_regs.c | 12 +-
> tools/perf/util/perf_regs.h | 10 +-
> .../scripting-engines/trace-event-python.c | 2 +-
> tools/perf/util/session.c | 9 +-
> 8 files changed, 142 insertions(+), 41 deletions(-)
>
> diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
> index b80c406d1fc1..714528732e02 100644
> --- a/tools/perf/builtin-script.c
> +++ b/tools/perf/builtin-script.c
> @@ -730,7 +730,7 @@ static int perf_sample__fprintf_regs(struct regs_dump *regs, uint64_t mask,
> for_each_set_bit(r, (unsigned long *) &mask, sizeof(mask) * 8) {
> u64 val = regs->regs[i++];
> printed += fprintf(fp, "%5s:0x%"PRIx64" ",
> - perf_reg_name(r, e_machine, e_flags),
> + perf_reg_name(r, e_machine, e_flags, regs->abi),
> val);
> }
>
> diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
> index 5a294595a677..f565ef2eb476 100644
> --- a/tools/perf/util/evsel.c
> +++ b/tools/perf/util/evsel.c
> @@ -1054,19 +1054,22 @@ static void __evsel__config_callchain(struct evsel *evsel, const struct record_o
> }
>
> if (param->record_mode == CALLCHAIN_DWARF) {
> + int abi = -1; /* -1 indicates only basic GPRs are needed. */
Should the abi come from checking
evsel->core.attr.sample_simd_regs_enabled and other related values?
> +
> if (!function) {
> uint16_t e_machine = evsel__e_machine(evsel, /*e_flags=*/NULL);
>
> evsel__set_sample_bit(evsel, REGS_USER);
> evsel__set_sample_bit(evsel, STACK_USER);
> if (opts->sample_user_regs &&
> - DWARF_MINIMAL_REGS(e_machine) != perf_user_reg_mask(EM_HOST)) {
> + DWARF_MINIMAL_REGS(e_machine) != perf_user_reg_mask(EM_HOST, &abi)) {
> attr->sample_regs_user |= DWARF_MINIMAL_REGS(e_machine);
> pr_warning("WARNING: The use of --call-graph=dwarf may require all the user registers, "
> "specifying a subset with --user-regs may render DWARF unwinding unreliable, "
> "so the minimal registers set (IP, SP) is explicitly forced.\n");
> } else {
> - attr->sample_regs_user |= perf_user_reg_mask(EM_HOST);
> + abi = -1;
This assignment is redundant.
> + attr->sample_regs_user |= perf_user_reg_mask(EM_HOST, &abi);
> }
> attr->sample_stack_user = param->dump_size;
> attr->exclude_callchain_user = 1;
> diff --git a/tools/perf/util/parse-regs-options.c b/tools/perf/util/parse-regs-options.c
> index c93c2f0c8105..6cf865bfc2f7 100644
> --- a/tools/perf/util/parse-regs-options.c
> +++ b/tools/perf/util/parse-regs-options.c
> @@ -10,7 +10,8 @@
> #include "util/perf_regs.h"
> #include "util/parse-regs-options.h"
>
> -static void list_perf_regs(FILE *fp, uint64_t mask)
> +static void
> +list_perf_regs(FILE *fp, uint64_t mask, int abi)
> {
> const char *last_name = NULL;
>
> @@ -21,7 +22,7 @@ static void list_perf_regs(FILE *fp, uint64_t mask)
> if (((1ULL << reg) & mask) == 0)
> continue;
>
> - name = perf_reg_name(reg, EM_HOST, EF_HOST);
> + name = perf_reg_name(reg, EM_HOST, EF_HOST, abi);
> if (name && (!last_name || strcmp(last_name, name)))
> fprintf(fp, "%s%s", reg > 0 ? " " : "", name);
> last_name = name;
> @@ -29,7 +30,8 @@ static void list_perf_regs(FILE *fp, uint64_t mask)
> fputc('\n', fp);
> }
>
> -static uint64_t name_to_perf_reg_mask(const char *to_match, uint64_t mask)
> +static uint64_t
> +name_to_perf_reg_mask(const char *to_match, uint64_t mask, int abi)
> {
> uint64_t reg_mask = 0;
>
> @@ -39,7 +41,7 @@ static uint64_t name_to_perf_reg_mask(const char *to_match, uint64_t mask)
> if (((1ULL << reg) & mask) == 0)
> continue;
>
> - name = perf_reg_name(reg, EM_HOST, EF_HOST);
> + name = perf_reg_name(reg, EM_HOST, EF_HOST, abi);
> if (!name)
> continue;
>
> @@ -56,6 +58,7 @@ __parse_regs(const struct option *opt, const char *str, int unset, bool intr)
> char *s, *os = NULL, *p;
> int ret = -1;
> uint64_t mask;
> + int abi = 0;
0 here is PERF_SAMPLE_REGS_ABI_NONE, perhaps we can use the perf_env
that has kernel_is_64_bit to set the ABI to at least
PERF_SAMPLE_REGS_ABI_32 or PERF_SAMPLE_REGS_ABI_64. How will the SIMD
registers outside of the register mask be encoded?
>
> if (unset)
> return 0;
> @@ -66,7 +69,7 @@ __parse_regs(const struct option *opt, const char *str, int unset, bool intr)
> if (*mode)
> return -1;
>
> - mask = intr ? perf_intr_reg_mask(EM_HOST) : perf_user_reg_mask(EM_HOST);
> + mask = intr ? perf_intr_reg_mask(EM_HOST, &abi) : perf_user_reg_mask(EM_HOST, &abi);
>
> /* str may be NULL in case no arg is passed to -I */
> if (!str) {
> @@ -87,11 +90,11 @@ __parse_regs(const struct option *opt, const char *str, int unset, bool intr)
> *p = '\0';
>
> if (!strcmp(s, "?")) {
> - list_perf_regs(stderr, mask);
> + list_perf_regs(stderr, mask, abi);
> goto error;
> }
>
> - reg_mask = name_to_perf_reg_mask(s, mask);
> + reg_mask = name_to_perf_reg_mask(s, mask, abi);
> if (reg_mask == 0) {
> ui__warning("Unknown register \"%s\", check man page or run \"perf record %s?\"\n",
> s, intr ? "-I" : "--user-regs=");
> diff --git a/tools/perf/util/perf-regs-arch/perf_regs_x86.c b/tools/perf/util/perf-regs-arch/perf_regs_x86.c
> index b6d20522b4e8..ae26d991cdc9 100644
> --- a/tools/perf/util/perf-regs-arch/perf_regs_x86.c
> +++ b/tools/perf/util/perf-regs-arch/perf_regs_x86.c
> @@ -235,26 +235,26 @@ int __perf_sdt_arg_parse_op_x86(char *old_op, char **new_op)
> return SDT_ARG_VALID;
> }
>
> -uint64_t __perf_reg_mask_x86(bool intr)
> +static uint64_t __arch__reg_mask(u64 sample_type, u64 mask, bool has_simd_regs)
> {
> struct perf_event_attr attr = {
> - .type = PERF_TYPE_HARDWARE,
> - .config = PERF_COUNT_HW_CPU_CYCLES,
> - .sample_type = PERF_SAMPLE_REGS_INTR,
> - .sample_regs_intr = PERF_REG_EXTENDED_MASK,
> - .precise_ip = 1,
> - .disabled = 1,
> - .exclude_kernel = 1,
> + .type = PERF_TYPE_HARDWARE,
> + .config = PERF_COUNT_HW_CPU_CYCLES,
> + .sample_type = sample_type,
> + .precise_ip = 1,
> + .disabled = 1,
> + .exclude_kernel = 1,
> + .sample_simd_regs_enabled = has_simd_regs,
> };
> int fd;
> -
> - if (!intr)
> - return PERF_REGS_MASK;
> -
> /*
> * In an unnamed union, init it here to build on older gcc versions
> */
> attr.sample_period = 1;
> + if (sample_type == PERF_SAMPLE_REGS_INTR)
> + attr.sample_regs_intr = mask;
> + else
> + attr.sample_regs_user = mask;
>
> if (perf_pmus__num_core_pmus() > 1) {
> struct perf_pmu *pmu = NULL;
> @@ -276,13 +276,38 @@ uint64_t __perf_reg_mask_x86(bool intr)
> /*group_fd=*/-1, /*flags=*/0);
> if (fd != -1) {
> close(fd);
> - return (PERF_REG_EXTENDED_MASK | PERF_REGS_MASK);
> + return mask;
> + }
> +
> + return 0;
> +}
> +
> +uint64_t __perf_reg_mask_x86(bool intr, int *abi)
> +{
> + u64 sample_type = intr ? PERF_SAMPLE_REGS_INTR : PERF_SAMPLE_REGS_USER;
> + uint64_t mask = PERF_REGS_MASK;
> +
> + /* -1 indicates only basic GPRs are needed. */
> + if (*abi < 0)
> + return PERF_REGS_MASK;
> +
> + *abi = 0;
> + mask |= __arch__reg_mask(sample_type,
> + GENMASK_ULL(PERF_REG_X86_R31, PERF_REG_X86_R16),
> + true);
> + mask |= __arch__reg_mask(sample_type, BIT_ULL(PERF_REG_X86_SSP), true);
> +
> + if (mask != PERF_REGS_MASK) {
> + *abi |= PERF_SAMPLE_REGS_ABI_SIMD;
> + } else {
> + mask |= __arch__reg_mask(sample_type, PERF_REG_EXTENDED_MASK,
> + false);
> }
>
> - return PERF_REGS_MASK;
> + return mask;
> }
>
> -const char *__perf_reg_name_x86(int id)
> +static const char *__arch_reg_gpr_name(int id)
> {
> switch (id) {
> case PERF_REG_X86_AX:
> @@ -333,7 +358,60 @@ const char *__perf_reg_name_x86(int id)
> return "R14";
> case PERF_REG_X86_R15:
> return "R15";
> + default:
> + return NULL;
> + }
> +
> + return NULL;
> +}
>
> +static const char *__arch_reg_egpr_name(int id)
> +{
> + switch (id) {
> + case PERF_REG_X86_R16:
> + return "R16";
> + case PERF_REG_X86_R17:
> + return "R17";
> + case PERF_REG_X86_R18:
> + return "R18";
> + case PERF_REG_X86_R19:
> + return "R19";
> + case PERF_REG_X86_R20:
> + return "R20";
> + case PERF_REG_X86_R21:
> + return "R21";
> + case PERF_REG_X86_R22:
> + return "R22";
> + case PERF_REG_X86_R23:
> + return "R23";
> + case PERF_REG_X86_R24:
> + return "R24";
> + case PERF_REG_X86_R25:
> + return "R25";
> + case PERF_REG_X86_R26:
> + return "R26";
> + case PERF_REG_X86_R27:
> + return "R27";
> + case PERF_REG_X86_R28:
> + return "R28";
> + case PERF_REG_X86_R29:
> + return "R29";
> + case PERF_REG_X86_R30:
> + return "R30";
> + case PERF_REG_X86_R31:
> + return "R31";
> + case PERF_REG_X86_SSP:
> + return "SSP";
> + default:
> + return NULL;
> + }
> +
> + return NULL;
> +}
> +
> +static const char *__arch_reg_xmm_name(int id)
> +{
> + switch (id) {
> #define XMM(x) \
> case PERF_REG_X86_XMM ## x: \
> case PERF_REG_X86_XMM ## x + 1: \
> @@ -362,6 +440,22 @@ const char *__perf_reg_name_x86(int id)
> return NULL;
> }
>
> +const char *__perf_reg_name_x86(int id, int abi)
> +{
> + const char *name;
> +
> + name = __arch_reg_gpr_name(id);
> + if (name)
> + return name;
> +
> + if (abi & PERF_SAMPLE_REGS_ABI_SIMD)
> + name = __arch_reg_egpr_name(id);
> + else
> + name = __arch_reg_xmm_name(id);
> +
> + return name;
> +}
> +
> uint64_t __perf_reg_ip_x86(void)
> {
> return PERF_REG_X86_IP;
> diff --git a/tools/perf/util/perf_regs.c b/tools/perf/util/perf_regs.c
> index 5b8f34beb24e..afc567718bee 100644
> --- a/tools/perf/util/perf_regs.c
> +++ b/tools/perf/util/perf_regs.c
> @@ -32,7 +32,7 @@ int perf_sdt_arg_parse_op(uint16_t e_machine, char *old_op, char **new_op)
> return ret;
> }
>
> -uint64_t perf_intr_reg_mask(uint16_t e_machine)
> +uint64_t perf_intr_reg_mask(uint16_t e_machine, int *abi)
> {
> uint64_t mask = 0;
>
> @@ -64,7 +64,7 @@ uint64_t perf_intr_reg_mask(uint16_t e_machine)
> break;
> case EM_386:
> case EM_X86_64:
> - mask = __perf_reg_mask_x86(/*intr=*/true);
> + mask = __perf_reg_mask_x86(/*intr=*/true, abi);
> break;
> default:
> pr_debug("Unknown ELF machine %d, interrupt sampling register mask will be empty.\n",
> @@ -75,7 +75,7 @@ uint64_t perf_intr_reg_mask(uint16_t e_machine)
> return mask;
> }
>
> -uint64_t perf_user_reg_mask(uint16_t e_machine)
> +uint64_t perf_user_reg_mask(uint16_t e_machine, int *abi)
> {
> uint64_t mask = 0;
>
> @@ -107,7 +107,7 @@ uint64_t perf_user_reg_mask(uint16_t e_machine)
> break;
> case EM_386:
> case EM_X86_64:
> - mask = __perf_reg_mask_x86(/*intr=*/false);
> + mask = __perf_reg_mask_x86(/*intr=*/false, abi);
> break;
> default:
> pr_debug("Unknown ELF machine %d, user sampling register mask will be empty.\n",
> @@ -118,7 +118,7 @@ uint64_t perf_user_reg_mask(uint16_t e_machine)
> return mask;
> }
>
> -const char *perf_reg_name(int id, uint16_t e_machine, uint32_t e_flags)
> +const char *perf_reg_name(int id, uint16_t e_machine, uint32_t e_flags, int abi)
> {
> const char *reg_name = NULL;
>
> @@ -150,7 +150,7 @@ const char *perf_reg_name(int id, uint16_t e_machine, uint32_t e_flags)
> break;
> case EM_386:
> case EM_X86_64:
> - reg_name = __perf_reg_name_x86(id);
> + reg_name = __perf_reg_name_x86(id, abi);
> break;
> default:
> break;
> diff --git a/tools/perf/util/perf_regs.h b/tools/perf/util/perf_regs.h
> index 7c04700bf837..c9501ca8045d 100644
> --- a/tools/perf/util/perf_regs.h
> +++ b/tools/perf/util/perf_regs.h
> @@ -13,10 +13,10 @@ enum {
> };
>
> int perf_sdt_arg_parse_op(uint16_t e_machine, char *old_op, char **new_op);
> -uint64_t perf_intr_reg_mask(uint16_t e_machine);
> -uint64_t perf_user_reg_mask(uint16_t e_machine);
> +uint64_t perf_intr_reg_mask(uint16_t e_machine, int *abi);
> +uint64_t perf_user_reg_mask(uint16_t e_machine, int *abi);
Can we add an "/*inout*/" comment for the abi argument?
>
> -const char *perf_reg_name(int id, uint16_t e_machine, uint32_t e_flags);
> +const char *perf_reg_name(int id, uint16_t e_machine, uint32_t e_flags, int abi);
> int perf_reg_value(u64 *valp, struct regs_dump *regs, int id);
> uint64_t perf_arch_reg_ip(uint16_t e_machine);
> uint64_t perf_arch_reg_sp(uint16_t e_machine);
> @@ -64,8 +64,8 @@ uint64_t __perf_reg_ip_s390(void);
> uint64_t __perf_reg_sp_s390(void);
>
> int __perf_sdt_arg_parse_op_x86(char *old_op, char **new_op);
> -uint64_t __perf_reg_mask_x86(bool intr);
> -const char *__perf_reg_name_x86(int id);
> +uint64_t __perf_reg_mask_x86(bool intr, int *abi);
> +const char *__perf_reg_name_x86(int id, int abi);
> uint64_t __perf_reg_ip_x86(void);
> uint64_t __perf_reg_sp_x86(void);
In dwarf_regs.h is also:
int __get_dwarf_regnum_x86_64(const char *name);
This needs extending for r16 to r31, xmm16-xmm31, etc.
__get_dwarf_regnum_for_perf_regnum_x86_64(int perf_regnum);
I think this needs an ABI argument otherwise, how to differentiate r16
from XMM0?
Thanks,
Ian
> diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
> index 2b0df7bd9a46..4cc5b96898e6 100644
> --- a/tools/perf/util/scripting-engines/trace-event-python.c
> +++ b/tools/perf/util/scripting-engines/trace-event-python.c
> @@ -733,7 +733,7 @@ static void regs_map(struct regs_dump *regs, uint64_t mask, uint16_t e_machine,
>
> printed += scnprintf(bf + printed, size - printed,
> "%5s:0x%" PRIx64 " ",
> - perf_reg_name(r, e_machine, e_flags), val);
> + perf_reg_name(r, e_machine, e_flags, regs->abi), val);
> }
> }
>
> diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
> index 4b465abfa36c..7cf7bf86205d 100644
> --- a/tools/perf/util/session.c
> +++ b/tools/perf/util/session.c
> @@ -959,15 +959,16 @@ static void branch_stack__printf(struct perf_sample *sample,
> }
> }
>
> -static void regs_dump__printf(u64 mask, u64 *regs, uint16_t e_machine, uint32_t e_flags)
> +static void regs_dump__printf(u64 mask, struct regs_dump *regs,
> + uint16_t e_machine, uint32_t e_flags)
> {
> unsigned rid, i = 0;
>
> for_each_set_bit(rid, (unsigned long *) &mask, sizeof(mask) * 8) {
> - u64 val = regs[i++];
> + u64 val = regs->regs[i++];
>
> printf(".... %-5s 0x%016" PRIx64 "\n",
> - perf_reg_name(rid, e_machine, e_flags), val);
> + perf_reg_name(rid, e_machine, e_flags, regs->abi), val);
> }
> }
>
> @@ -995,7 +996,7 @@ static void regs__printf(const char *type, struct regs_dump *regs,
> mask,
> regs_dump_abi(regs));
>
> - regs_dump__printf(mask, regs->regs, e_machine, e_flags);
> + regs_dump__printf(mask, regs, e_machine, e_flags);
> }
>
> static void regs_user__printf(struct perf_sample *sample, uint16_t e_machine, uint32_t e_flags)
> --
> 2.34.1
>
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [Patch v7 2/4] perf regs: Support x86 eGPRs/SSP sampling
2026-03-24 2:49 ` Ian Rogers
@ 2026-03-25 2:08 ` Mi, Dapeng
0 siblings, 0 replies; 10+ messages in thread
From: Mi, Dapeng @ 2026-03-25 2:08 UTC (permalink / raw)
To: Ian Rogers
Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
Namhyung Kim, Thomas Gleixner, Dave Hansen, Adrian Hunter,
Jiri Olsa, Alexander Shishkin, Andi Kleen, Eranian Stephane,
Mark Rutland, broonie, Ravi Bangoria, linux-kernel,
linux-perf-users, Zide Chen, Falcon Thomas, Dapeng Mi, Xudong Hao
On 3/24/2026 10:49 AM, Ian Rogers wrote:
> On Mon, Mar 23, 2026 at 6:01 PM Dapeng Mi <dapeng1.mi@linux.intel.com> wrote:
>> This patch adds support for sampling x86 extended GP registers (R16-R31)
>> and the shadow stack pointer (SSP) register.
>>
>> The original XMM registers space in sample_regs_user/sample_regs_intr is
>> reclaimed to represent the eGPRs and SSP when SIMD registers sampling is
>> supported with the new SIMD sampling fields in the perf_event_attr
>> structure. This necessitates a way to distinguish which register layout
>> is used for the sample_regs_user/sample_regs_intr bitmap.
>>
>> To address this, a new "abi" argument is added to the helpers
>> perf_intr_reg_mask(), perf_user_reg_mask(), and perf_reg_name(). When
>> "abi & PERF_SAMPLE_REGS_ABI_SIMD" is true, it indicates the eGPRs and SSP
>> layout is represented; otherwise, the legacy XMM registers are
>> represented.
>>
>> Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
>> ---
>>
>> V7: Limit dwarf minimal regs to legacy GPRs (excluding APX eGPRs).
> So, R16 to R31 won't be set up? This sounds worrying because a
> function, like a leaf function, might use these registers as a frame
> pointer with -fomit-frame-pointer when the code includes uses
> functions like alloca.
No, perf_user_reg_mask()/perf_intr_reg_mask() would still return the R16~R31
mask if they are supported, unless the input argument "abi" is set to -1.
The reason for returning only the basic GPRs in the "abi = -1" case is to
avoid these eGPRs being added into sample_user_regs in the code below.
```
if (opts->sample_user_regs &&
DWARF_MINIMAL_REGS(e_machine) !=
perf_user_reg_mask(EM_HOST, &abi)) {
attr->sample_regs_user |= DWARF_MINIMAL_REGS(e_machine);
pr_warning("WARNING: The use of --call-graph=dwarf may
require all the user registers, "
"specifying a subset with --user-regs may render
DWARF unwinding unreliable, "
"so the minimal registers set (IP, SP) is explicitly
forced.\n");
} else {
abi = -1;
attr->sample_regs_user |= perf_user_reg_mask(EM_HOST, &abi);
}
```
Users may use the "cpu-clock" or "task-clock" events to sample the dwarf
call-stack without explicitly setting sample_regs_user. In this case,
if the mask returned from perf_user_reg_mask() contains the R16-R31 bits,
event creation would fail, since "cpu-clock" and "task-clock" don't
support sampling the R16-R31 registers yet (this would cause the
addr2line test to fail).
>
> So not having vector register support potentially breaks things like
> LLVM's spill2reg, which aims to avoid using the stack and prefers
> vector registers:
> https://discourse.llvm.org/t/rfc-spill2reg-selectively-replace-spills-to-stack-with-spills-to-vector-registers/59630
> I think the dwarf minimal registers need extending to cover SIMD
> registers, specifically those outside the mask, but this isn't
> supported currently. For now, however, it would be useful to at least
> warn when we receive a request (via the dwarf information in a binary)
> for a register that we're choosing not to include in the sample set,
> otherwise they could look like a spurious unwind errors.
I took some time to investigate the status of libdw support for APX/SIMD
regs, but I didn't find any support for APX/SIMD regs in the
latest libdw (https://sourceware.org/git/elfutils.git). Not sure if I missed
something. If I'm wrong, please correct me.
So I suppose we have to defer support for APX/SIMD regs
until libdw supports them?
Sure. I would add messages to warn that APX/SIMD regs are not supported if
libdw tries to request them.
>
>> tools/perf/builtin-script.c | 2 +-
>> tools/perf/util/evsel.c | 7 +-
>> tools/perf/util/parse-regs-options.c | 17 ++-
>> .../perf/util/perf-regs-arch/perf_regs_x86.c | 124 +++++++++++++++---
>> tools/perf/util/perf_regs.c | 12 +-
>> tools/perf/util/perf_regs.h | 10 +-
>> .../scripting-engines/trace-event-python.c | 2 +-
>> tools/perf/util/session.c | 9 +-
>> 8 files changed, 142 insertions(+), 41 deletions(-)
>>
>> diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
>> index b80c406d1fc1..714528732e02 100644
>> --- a/tools/perf/builtin-script.c
>> +++ b/tools/perf/builtin-script.c
>> @@ -730,7 +730,7 @@ static int perf_sample__fprintf_regs(struct regs_dump *regs, uint64_t mask,
>> for_each_set_bit(r, (unsigned long *) &mask, sizeof(mask) * 8) {
>> u64 val = regs->regs[i++];
>> printed += fprintf(fp, "%5s:0x%"PRIx64" ",
>> - perf_reg_name(r, e_machine, e_flags),
>> + perf_reg_name(r, e_machine, e_flags, regs->abi),
>> val);
>> }
>>
>> diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
>> index 5a294595a677..f565ef2eb476 100644
>> --- a/tools/perf/util/evsel.c
>> +++ b/tools/perf/util/evsel.c
>> @@ -1054,19 +1054,22 @@ static void __evsel__config_callchain(struct evsel *evsel, const struct record_o
>> }
>>
>> if (param->record_mode == CALLCHAIN_DWARF) {
>> + int abi = -1; /* -1 indicates only basic GPRs are needed. */
> Should the abi come from checking
> evsel->core.attr.sample_simd_regs_enabled and other related values?
Actually no. It has nothing to do with
evsel->core.attr.sample_simd_regs_enabled or anything else. We only want
to get the basic supported GPRs here, as previously explained.
>
>> +
>> if (!function) {
>> uint16_t e_machine = evsel__e_machine(evsel, /*e_flags=*/NULL);
>>
>> evsel__set_sample_bit(evsel, REGS_USER);
>> evsel__set_sample_bit(evsel, STACK_USER);
>> if (opts->sample_user_regs &&
>> - DWARF_MINIMAL_REGS(e_machine) != perf_user_reg_mask(EM_HOST)) {
>> + DWARF_MINIMAL_REGS(e_machine) != perf_user_reg_mask(EM_HOST, &abi)) {
>> attr->sample_regs_user |= DWARF_MINIMAL_REGS(e_machine);
>> pr_warning("WARNING: The use of --call-graph=dwarf may require all the user registers, "
>> "specifying a subset with --user-regs may render DWARF unwinding unreliable, "
>> "so the minimal registers set (IP, SP) is explicitly forced.\n");
>> } else {
>> - attr->sample_regs_user |= perf_user_reg_mask(EM_HOST);
>> + abi = -1;
> This assignment is redundant.
It's intended. Since "abi" is an input and output argument, it could
be overwritten by the above perf_user_reg_mask() call, so it is set to "-1" again here.
>
>> + attr->sample_regs_user |= perf_user_reg_mask(EM_HOST, &abi);
>> }
>> attr->sample_stack_user = param->dump_size;
>> attr->exclude_callchain_user = 1;
>> diff --git a/tools/perf/util/parse-regs-options.c b/tools/perf/util/parse-regs-options.c
>> index c93c2f0c8105..6cf865bfc2f7 100644
>> --- a/tools/perf/util/parse-regs-options.c
>> +++ b/tools/perf/util/parse-regs-options.c
>> @@ -10,7 +10,8 @@
>> #include "util/perf_regs.h"
>> #include "util/parse-regs-options.h"
>>
>> -static void list_perf_regs(FILE *fp, uint64_t mask)
>> +static void
>> +list_perf_regs(FILE *fp, uint64_t mask, int abi)
>> {
>> const char *last_name = NULL;
>>
>> @@ -21,7 +22,7 @@ static void list_perf_regs(FILE *fp, uint64_t mask)
>> if (((1ULL << reg) & mask) == 0)
>> continue;
>>
>> - name = perf_reg_name(reg, EM_HOST, EF_HOST);
>> + name = perf_reg_name(reg, EM_HOST, EF_HOST, abi);
>> if (name && (!last_name || strcmp(last_name, name)))
>> fprintf(fp, "%s%s", reg > 0 ? " " : "", name);
>> last_name = name;
>> @@ -29,7 +30,8 @@ static void list_perf_regs(FILE *fp, uint64_t mask)
>> fputc('\n', fp);
>> }
>>
>> -static uint64_t name_to_perf_reg_mask(const char *to_match, uint64_t mask)
>> +static uint64_t
>> +name_to_perf_reg_mask(const char *to_match, uint64_t mask, int abi)
>> {
>> uint64_t reg_mask = 0;
>>
>> @@ -39,7 +41,7 @@ static uint64_t name_to_perf_reg_mask(const char *to_match, uint64_t mask)
>> if (((1ULL << reg) & mask) == 0)
>> continue;
>>
>> - name = perf_reg_name(reg, EM_HOST, EF_HOST);
>> + name = perf_reg_name(reg, EM_HOST, EF_HOST, abi);
>> if (!name)
>> continue;
>>
>> @@ -56,6 +58,7 @@ __parse_regs(const struct option *opt, const char *str, int unset, bool intr)
>> char *s, *os = NULL, *p;
>> int ret = -1;
>> uint64_t mask;
>> + int abi = 0;
> 0 here is PERF_SAMPLE_REGS_ABI_NONE, perhaps we can use the perf_env
> that has kernel_is_64_bit to set the ABI to at least
> PERF_SAMPLE_REGS_ABI_32 or PERF_SAMPLE_REGS_ABI_64. How will the SIMD
> registers outside of the register mask be encoded?
IMO, it's unnecessary to make "abi" as an input argument so complicated.
Currently only "-1" has a specific meaning for the input argument; all other
values would return all supported registers.
What do you mean by "the SIMD registers outside of the register mask"?
The YMM/ZMM/OPMASK registers?
After introducing the new SIMD representation fields, all SIMD registers
would be represented by these SIMD fields.
>
>> if (unset)
>> return 0;
>> @@ -66,7 +69,7 @@ __parse_regs(const struct option *opt, const char *str, int unset, bool intr)
>> if (*mode)
>> return -1;
>>
>> - mask = intr ? perf_intr_reg_mask(EM_HOST) : perf_user_reg_mask(EM_HOST);
>> + mask = intr ? perf_intr_reg_mask(EM_HOST, &abi) : perf_user_reg_mask(EM_HOST, &abi);
>>
>> /* str may be NULL in case no arg is passed to -I */
>> if (!str) {
>> @@ -87,11 +90,11 @@ __parse_regs(const struct option *opt, const char *str, int unset, bool intr)
>> *p = '\0';
>>
>> if (!strcmp(s, "?")) {
>> - list_perf_regs(stderr, mask);
>> + list_perf_regs(stderr, mask, abi);
>> goto error;
>> }
>>
>> - reg_mask = name_to_perf_reg_mask(s, mask);
>> + reg_mask = name_to_perf_reg_mask(s, mask, abi);
>> if (reg_mask == 0) {
>> ui__warning("Unknown register \"%s\", check man page or run \"perf record %s?\"\n",
>> s, intr ? "-I" : "--user-regs=");
>> diff --git a/tools/perf/util/perf-regs-arch/perf_regs_x86.c b/tools/perf/util/perf-regs-arch/perf_regs_x86.c
>> index b6d20522b4e8..ae26d991cdc9 100644
>> --- a/tools/perf/util/perf-regs-arch/perf_regs_x86.c
>> +++ b/tools/perf/util/perf-regs-arch/perf_regs_x86.c
>> @@ -235,26 +235,26 @@ int __perf_sdt_arg_parse_op_x86(char *old_op, char **new_op)
>> return SDT_ARG_VALID;
>> }
>>
>> -uint64_t __perf_reg_mask_x86(bool intr)
>> +static uint64_t __arch__reg_mask(u64 sample_type, u64 mask, bool has_simd_regs)
>> {
>> struct perf_event_attr attr = {
>> - .type = PERF_TYPE_HARDWARE,
>> - .config = PERF_COUNT_HW_CPU_CYCLES,
>> - .sample_type = PERF_SAMPLE_REGS_INTR,
>> - .sample_regs_intr = PERF_REG_EXTENDED_MASK,
>> - .precise_ip = 1,
>> - .disabled = 1,
>> - .exclude_kernel = 1,
>> + .type = PERF_TYPE_HARDWARE,
>> + .config = PERF_COUNT_HW_CPU_CYCLES,
>> + .sample_type = sample_type,
>> + .precise_ip = 1,
>> + .disabled = 1,
>> + .exclude_kernel = 1,
>> + .sample_simd_regs_enabled = has_simd_regs,
>> };
>> int fd;
>> -
>> - if (!intr)
>> - return PERF_REGS_MASK;
>> -
>> /*
>> * In an unnamed union, init it here to build on older gcc versions
>> */
>> attr.sample_period = 1;
>> + if (sample_type == PERF_SAMPLE_REGS_INTR)
>> + attr.sample_regs_intr = mask;
>> + else
>> + attr.sample_regs_user = mask;
>>
>> if (perf_pmus__num_core_pmus() > 1) {
>> struct perf_pmu *pmu = NULL;
>> @@ -276,13 +276,38 @@ uint64_t __perf_reg_mask_x86(bool intr)
>> /*group_fd=*/-1, /*flags=*/0);
>> if (fd != -1) {
>> close(fd);
>> - return (PERF_REG_EXTENDED_MASK | PERF_REGS_MASK);
>> + return mask;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +uint64_t __perf_reg_mask_x86(bool intr, int *abi)
>> +{
>> + u64 sample_type = intr ? PERF_SAMPLE_REGS_INTR : PERF_SAMPLE_REGS_USER;
>> + uint64_t mask = PERF_REGS_MASK;
>> +
>> + /* -1 indicates only basic GPRs are needed. */
>> + if (*abi < 0)
>> + return PERF_REGS_MASK;
>> +
>> + *abi = 0;
>> + mask |= __arch__reg_mask(sample_type,
>> + GENMASK_ULL(PERF_REG_X86_R31, PERF_REG_X86_R16),
>> + true);
>> + mask |= __arch__reg_mask(sample_type, BIT_ULL(PERF_REG_X86_SSP), true);
>> +
>> + if (mask != PERF_REGS_MASK) {
>> + *abi |= PERF_SAMPLE_REGS_ABI_SIMD;
>> + } else {
>> + mask |= __arch__reg_mask(sample_type, PERF_REG_EXTENDED_MASK,
>> + false);
>> }
>>
>> - return PERF_REGS_MASK;
>> + return mask;
>> }
>>
>> -const char *__perf_reg_name_x86(int id)
>> +static const char *__arch_reg_gpr_name(int id)
>> {
>> switch (id) {
>> case PERF_REG_X86_AX:
>> @@ -333,7 +358,60 @@ const char *__perf_reg_name_x86(int id)
>> return "R14";
>> case PERF_REG_X86_R15:
>> return "R15";
>> + default:
>> + return NULL;
>> + }
>> +
>> + return NULL;
>> +}
>>
>> +static const char *__arch_reg_egpr_name(int id)
>> +{
>> + switch (id) {
>> + case PERF_REG_X86_R16:
>> + return "R16";
>> + case PERF_REG_X86_R17:
>> + return "R17";
>> + case PERF_REG_X86_R18:
>> + return "R18";
>> + case PERF_REG_X86_R19:
>> + return "R19";
>> + case PERF_REG_X86_R20:
>> + return "R20";
>> + case PERF_REG_X86_R21:
>> + return "R21";
>> + case PERF_REG_X86_R22:
>> + return "R22";
>> + case PERF_REG_X86_R23:
>> + return "R23";
>> + case PERF_REG_X86_R24:
>> + return "R24";
>> + case PERF_REG_X86_R25:
>> + return "R25";
>> + case PERF_REG_X86_R26:
>> + return "R26";
>> + case PERF_REG_X86_R27:
>> + return "R27";
>> + case PERF_REG_X86_R28:
>> + return "R28";
>> + case PERF_REG_X86_R29:
>> + return "R29";
>> + case PERF_REG_X86_R30:
>> + return "R30";
>> + case PERF_REG_X86_R31:
>> + return "R31";
>> + case PERF_REG_X86_SSP:
>> + return "SSP";
>> + default:
>> + return NULL;
>> + }
>> +
>> + return NULL;
>> +}
>> +
>> +static const char *__arch_reg_xmm_name(int id)
>> +{
>> + switch (id) {
>> #define XMM(x) \
>> case PERF_REG_X86_XMM ## x: \
>> case PERF_REG_X86_XMM ## x + 1: \
>> @@ -362,6 +440,22 @@ const char *__perf_reg_name_x86(int id)
>> return NULL;
>> }
>>
>> +const char *__perf_reg_name_x86(int id, int abi)
>> +{
>> + const char *name;
>> +
>> + name = __arch_reg_gpr_name(id);
>> + if (name)
>> + return name;
>> +
>> + if (abi & PERF_SAMPLE_REGS_ABI_SIMD)
>> + name = __arch_reg_egpr_name(id);
>> + else
>> + name = __arch_reg_xmm_name(id);
>> +
>> + return name;
>> +}
>> +
>> uint64_t __perf_reg_ip_x86(void)
>> {
>> return PERF_REG_X86_IP;
>> diff --git a/tools/perf/util/perf_regs.c b/tools/perf/util/perf_regs.c
>> index 5b8f34beb24e..afc567718bee 100644
>> --- a/tools/perf/util/perf_regs.c
>> +++ b/tools/perf/util/perf_regs.c
>> @@ -32,7 +32,7 @@ int perf_sdt_arg_parse_op(uint16_t e_machine, char *old_op, char **new_op)
>> return ret;
>> }
>>
>> -uint64_t perf_intr_reg_mask(uint16_t e_machine)
>> +uint64_t perf_intr_reg_mask(uint16_t e_machine, int *abi)
>> {
>> uint64_t mask = 0;
>>
>> @@ -64,7 +64,7 @@ uint64_t perf_intr_reg_mask(uint16_t e_machine)
>> break;
>> case EM_386:
>> case EM_X86_64:
>> - mask = __perf_reg_mask_x86(/*intr=*/true);
>> + mask = __perf_reg_mask_x86(/*intr=*/true, abi);
>> break;
>> default:
>> pr_debug("Unknown ELF machine %d, interrupt sampling register mask will be empty.\n",
>> @@ -75,7 +75,7 @@ uint64_t perf_intr_reg_mask(uint16_t e_machine)
>> return mask;
>> }
>>
>> -uint64_t perf_user_reg_mask(uint16_t e_machine)
>> +uint64_t perf_user_reg_mask(uint16_t e_machine, int *abi)
>> {
>> uint64_t mask = 0;
>>
>> @@ -107,7 +107,7 @@ uint64_t perf_user_reg_mask(uint16_t e_machine)
>> break;
>> case EM_386:
>> case EM_X86_64:
>> - mask = __perf_reg_mask_x86(/*intr=*/false);
>> + mask = __perf_reg_mask_x86(/*intr=*/false, abi);
>> break;
>> default:
>> pr_debug("Unknown ELF machine %d, user sampling register mask will be empty.\n",
>> @@ -118,7 +118,7 @@ uint64_t perf_user_reg_mask(uint16_t e_machine)
>> return mask;
>> }
>>
>> -const char *perf_reg_name(int id, uint16_t e_machine, uint32_t e_flags)
>> +const char *perf_reg_name(int id, uint16_t e_machine, uint32_t e_flags, int abi)
>> {
>> const char *reg_name = NULL;
>>
>> @@ -150,7 +150,7 @@ const char *perf_reg_name(int id, uint16_t e_machine, uint32_t e_flags)
>> break;
>> case EM_386:
>> case EM_X86_64:
>> - reg_name = __perf_reg_name_x86(id);
>> + reg_name = __perf_reg_name_x86(id, abi);
>> break;
>> default:
>> break;
>> diff --git a/tools/perf/util/perf_regs.h b/tools/perf/util/perf_regs.h
>> index 7c04700bf837..c9501ca8045d 100644
>> --- a/tools/perf/util/perf_regs.h
>> +++ b/tools/perf/util/perf_regs.h
>> @@ -13,10 +13,10 @@ enum {
>> };
>>
>> int perf_sdt_arg_parse_op(uint16_t e_machine, char *old_op, char **new_op);
>> -uint64_t perf_intr_reg_mask(uint16_t e_machine);
>> -uint64_t perf_user_reg_mask(uint16_t e_machine);
>> +uint64_t perf_intr_reg_mask(uint16_t e_machine, int *abi);
>> +uint64_t perf_user_reg_mask(uint16_t e_machine, int *abi);
> Can we add an "/*inout*/" comment for the abi argument?
Sure. good idea.
>
>> -const char *perf_reg_name(int id, uint16_t e_machine, uint32_t e_flags);
>> +const char *perf_reg_name(int id, uint16_t e_machine, uint32_t e_flags, int abi);
>> int perf_reg_value(u64 *valp, struct regs_dump *regs, int id);
>> uint64_t perf_arch_reg_ip(uint16_t e_machine);
>> uint64_t perf_arch_reg_sp(uint16_t e_machine);
>> @@ -64,8 +64,8 @@ uint64_t __perf_reg_ip_s390(void);
>> uint64_t __perf_reg_sp_s390(void);
>>
>> int __perf_sdt_arg_parse_op_x86(char *old_op, char **new_op);
>> -uint64_t __perf_reg_mask_x86(bool intr);
>> -const char *__perf_reg_name_x86(int id);
>> +uint64_t __perf_reg_mask_x86(bool intr, int *abi);
>> +const char *__perf_reg_name_x86(int id, int abi);
>> uint64_t __perf_reg_ip_x86(void);
>> uint64_t __perf_reg_sp_x86(void);
> In dwarf_regs.h is also:
> int __get_dwarf_regnum_x86_64(const char *name);
> This needs extending for r16 to r31, xmm16-xmm31, etc.
As mentioned above, it seems libdw currently still doesn't support APX/SIMD
regs. It'd be better to postpone the support until libdw supports them. I would
add a message in this function to warn that APX/SIMD regs are not
supported if libdw requests them.
>
> __get_dwarf_regnum_for_perf_regnum_x86_64(int perf_regnum);
> I think this needs an ABI argument otherwise, how to differentiate r16
> from XMM0?
Yes, would enhance this function. Thanks.
>
> Thanks,
> Ian
>
>> diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
>> index 2b0df7bd9a46..4cc5b96898e6 100644
>> --- a/tools/perf/util/scripting-engines/trace-event-python.c
>> +++ b/tools/perf/util/scripting-engines/trace-event-python.c
>> @@ -733,7 +733,7 @@ static void regs_map(struct regs_dump *regs, uint64_t mask, uint16_t e_machine,
>>
>> printed += scnprintf(bf + printed, size - printed,
>> "%5s:0x%" PRIx64 " ",
>> - perf_reg_name(r, e_machine, e_flags), val);
>> + perf_reg_name(r, e_machine, e_flags, regs->abi), val);
>> }
>> }
>>
>> diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
>> index 4b465abfa36c..7cf7bf86205d 100644
>> --- a/tools/perf/util/session.c
>> +++ b/tools/perf/util/session.c
>> @@ -959,15 +959,16 @@ static void branch_stack__printf(struct perf_sample *sample,
>> }
>> }
>>
>> -static void regs_dump__printf(u64 mask, u64 *regs, uint16_t e_machine, uint32_t e_flags)
>> +static void regs_dump__printf(u64 mask, struct regs_dump *regs,
>> + uint16_t e_machine, uint32_t e_flags)
>> {
>> unsigned rid, i = 0;
>>
>> for_each_set_bit(rid, (unsigned long *) &mask, sizeof(mask) * 8) {
>> - u64 val = regs[i++];
>> + u64 val = regs->regs[i++];
>>
>> printf(".... %-5s 0x%016" PRIx64 "\n",
>> - perf_reg_name(rid, e_machine, e_flags), val);
>> + perf_reg_name(rid, e_machine, e_flags, regs->abi), val);
>> }
>> }
>>
>> @@ -995,7 +996,7 @@ static void regs__printf(const char *type, struct regs_dump *regs,
>> mask,
>> regs_dump_abi(regs));
>>
>> - regs_dump__printf(mask, regs->regs, e_machine, e_flags);
>> + regs_dump__printf(mask, regs, e_machine, e_flags);
>> }
>>
>> static void regs_user__printf(struct perf_sample *sample, uint16_t e_machine, uint32_t e_flags)
>> --
>> 2.34.1
>>
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [Patch v7 2/4] perf regs: Support x86 eGPRs/SSP sampling
2026-03-24 0:57 ` [Patch v7 2/4] perf regs: Support x86 eGPRs/SSP sampling Dapeng Mi
2026-03-24 2:49 ` Ian Rogers
@ 2026-03-26 1:41 ` Mi, Dapeng
1 sibling, 0 replies; 10+ messages in thread
From: Mi, Dapeng @ 2026-03-26 1:41 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
Namhyung Kim, Thomas Gleixner, Dave Hansen, Ian Rogers,
Adrian Hunter, Jiri Olsa, Alexander Shishkin, Andi Kleen,
Eranian Stephane
Cc: Mark Rutland, broonie, Ravi Bangoria, linux-kernel,
linux-perf-users, Zide Chen, Falcon Thomas, Dapeng Mi, Xudong Hao
Copying Sashiko's reasonable comments here.
On 3/24/2026 8:57 AM, Dapeng Mi wrote:
> This patch adds support for sampling x86 extended GP registers (R16-R31)
> and the shadow stack pointer (SSP) register.
>
> The original XMM registers space in sample_regs_user/sample_regs_intr is
> reclaimed to represent the eGPRs and SSP when SIMD registers sampling is
> supported with the new SIMD sampling fields in the perf_event_attr
> structure. This necessitates a way to distinguish which register layout
> is used for the sample_regs_user/sample_regs_intr bitmap.
>
> To address this, a new "abi" argument is added to the helpers
> perf_intr_reg_mask(), perf_user_reg_mask(), and perf_reg_name(). When
> "abi & PERF_SAMPLE_REGS_ABI_SIMD" is true, it indicates the eGPRs and SSP
> layout is represented; otherwise, the legacy XMM registers are
> represented.
>
> Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
> ---
>
> V7: Limit dwarf minimal regs to legacy GPRs (excluding APX eGPRs).
>
> tools/perf/builtin-script.c | 2 +-
> tools/perf/util/evsel.c | 7 +-
> tools/perf/util/parse-regs-options.c | 17 ++-
> .../perf/util/perf-regs-arch/perf_regs_x86.c | 124 +++++++++++++++---
> tools/perf/util/perf_regs.c | 12 +-
> tools/perf/util/perf_regs.h | 10 +-
> .../scripting-engines/trace-event-python.c | 2 +-
> tools/perf/util/session.c | 9 +-
> 8 files changed, 142 insertions(+), 41 deletions(-)
>
> diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
> index b80c406d1fc1..714528732e02 100644
> --- a/tools/perf/builtin-script.c
> +++ b/tools/perf/builtin-script.c
> @@ -730,7 +730,7 @@ static int perf_sample__fprintf_regs(struct regs_dump *regs, uint64_t mask,
> for_each_set_bit(r, (unsigned long *) &mask, sizeof(mask) * 8) {
> u64 val = regs->regs[i++];
> printed += fprintf(fp, "%5s:0x%"PRIx64" ",
> - perf_reg_name(r, e_machine, e_flags),
> + perf_reg_name(r, e_machine, e_flags, regs->abi),
> val);
> }
>
> diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
> index 5a294595a677..f565ef2eb476 100644
> --- a/tools/perf/util/evsel.c
> +++ b/tools/perf/util/evsel.c
> @@ -1054,19 +1054,22 @@ static void __evsel__config_callchain(struct evsel *evsel, const struct record_o
> }
>
> if (param->record_mode == CALLCHAIN_DWARF) {
> + int abi = -1; /* -1 indicates only basic GPRs are needed. */
> +
> if (!function) {
> uint16_t e_machine = evsel__e_machine(evsel, /*e_flags=*/NULL);
>
> evsel__set_sample_bit(evsel, REGS_USER);
> evsel__set_sample_bit(evsel, STACK_USER);
> if (opts->sample_user_regs &&
> - DWARF_MINIMAL_REGS(e_machine) != perf_user_reg_mask(EM_HOST)) {
> + DWARF_MINIMAL_REGS(e_machine) != perf_user_reg_mask(EM_HOST, &abi)) {
> attr->sample_regs_user |= DWARF_MINIMAL_REGS(e_machine);
> pr_warning("WARNING: The use of --call-graph=dwarf may require all the user registers, "
> "specifying a subset with --user-regs may render DWARF unwinding unreliable, "
> "so the minimal registers set (IP, SP) is explicitly forced.\n");
> } else {
> - attr->sample_regs_user |= perf_user_reg_mask(EM_HOST);
> + abi = -1;
> + attr->sample_regs_user |= perf_user_reg_mask(EM_HOST, &abi);
> }
> attr->sample_stack_user = param->dump_size;
> attr->exclude_callchain_user = 1;
> diff --git a/tools/perf/util/parse-regs-options.c b/tools/perf/util/parse-regs-options.c
> index c93c2f0c8105..6cf865bfc2f7 100644
> --- a/tools/perf/util/parse-regs-options.c
> +++ b/tools/perf/util/parse-regs-options.c
> @@ -10,7 +10,8 @@
> #include "util/perf_regs.h"
> #include "util/parse-regs-options.h"
>
> -static void list_perf_regs(FILE *fp, uint64_t mask)
> +static void
> +list_perf_regs(FILE *fp, uint64_t mask, int abi)
> {
> const char *last_name = NULL;
>
> @@ -21,7 +22,7 @@ static void list_perf_regs(FILE *fp, uint64_t mask)
> if (((1ULL << reg) & mask) == 0)
> continue;
>
> - name = perf_reg_name(reg, EM_HOST, EF_HOST);
> + name = perf_reg_name(reg, EM_HOST, EF_HOST, abi);
> if (name && (!last_name || strcmp(last_name, name)))
> fprintf(fp, "%s%s", reg > 0 ? " " : "", name);
> last_name = name;
> @@ -29,7 +30,8 @@ static void list_perf_regs(FILE *fp, uint64_t mask)
> fputc('\n', fp);
> }
>
> -static uint64_t name_to_perf_reg_mask(const char *to_match, uint64_t mask)
> +static uint64_t
> +name_to_perf_reg_mask(const char *to_match, uint64_t mask, int abi)
> {
> uint64_t reg_mask = 0;
>
> @@ -39,7 +41,7 @@ static uint64_t name_to_perf_reg_mask(const char *to_match, uint64_t mask)
> if (((1ULL << reg) & mask) == 0)
> continue;
>
> - name = perf_reg_name(reg, EM_HOST, EF_HOST);
> + name = perf_reg_name(reg, EM_HOST, EF_HOST, abi);
> if (!name)
> continue;
>
> @@ -56,6 +58,7 @@ __parse_regs(const struct option *opt, const char *str, int unset, bool intr)
> char *s, *os = NULL, *p;
> int ret = -1;
> uint64_t mask;
> + int abi = 0;
>
> if (unset)
> return 0;
> @@ -66,7 +69,7 @@ __parse_regs(const struct option *opt, const char *str, int unset, bool intr)
> if (*mode)
> return -1;
>
> - mask = intr ? perf_intr_reg_mask(EM_HOST) : perf_user_reg_mask(EM_HOST);
> + mask = intr ? perf_intr_reg_mask(EM_HOST, &abi) : perf_user_reg_mask(EM_HOST, &abi);
>
> /* str may be NULL in case no arg is passed to -I */
> if (!str) {
> @@ -87,11 +90,11 @@ __parse_regs(const struct option *opt, const char *str, int unset, bool intr)
> *p = '\0';
>
> if (!strcmp(s, "?")) {
> - list_perf_regs(stderr, mask);
> + list_perf_regs(stderr, mask, abi);
> goto error;
> }
>
> - reg_mask = name_to_perf_reg_mask(s, mask);
> + reg_mask = name_to_perf_reg_mask(s, mask, abi);
> if (reg_mask == 0) {
> ui__warning("Unknown register \"%s\", check man page or run \"perf record %s?\"\n",
> s, intr ? "-I" : "--user-regs=");
Sashiko comments
"
When a user requests eGPRs and the register mask is updated here, is there a
regression where the required sample_simd_regs_enabled flag is missing from
the actual perf_event_attr structure?
If sample_simd_regs_enabled is not explicitly configured to be non-zero in
the event attributes, the kernel will assume the legacy ABI. This could
cause the kernel to misinterpret the bits if they overlap with legacy bits,
or reject the event entirely with an error, breaking eGPR sampling.
I see this is resolved later in the series in commit 7905e90d7ad1 (perf
regs: Support x86 SIMD registers sampling), which correctly sets the flag
based on the parsed options.
"
It's correct. The corresponding changes should be moved into this patch
from the next patch.
> diff --git a/tools/perf/util/perf-regs-arch/perf_regs_x86.c b/tools/perf/util/perf-regs-arch/perf_regs_x86.c
> index b6d20522b4e8..ae26d991cdc9 100644
> --- a/tools/perf/util/perf-regs-arch/perf_regs_x86.c
> +++ b/tools/perf/util/perf-regs-arch/perf_regs_x86.c
> @@ -235,26 +235,26 @@ int __perf_sdt_arg_parse_op_x86(char *old_op, char **new_op)
> return SDT_ARG_VALID;
> }
>
> -uint64_t __perf_reg_mask_x86(bool intr)
> +static uint64_t __arch__reg_mask(u64 sample_type, u64 mask, bool has_simd_regs)
> {
> struct perf_event_attr attr = {
> - .type = PERF_TYPE_HARDWARE,
> - .config = PERF_COUNT_HW_CPU_CYCLES,
> - .sample_type = PERF_SAMPLE_REGS_INTR,
> - .sample_regs_intr = PERF_REG_EXTENDED_MASK,
> - .precise_ip = 1,
> - .disabled = 1,
> - .exclude_kernel = 1,
> + .type = PERF_TYPE_HARDWARE,
> + .config = PERF_COUNT_HW_CPU_CYCLES,
> + .sample_type = sample_type,
> + .precise_ip = 1,
> + .disabled = 1,
> + .exclude_kernel = 1,
> + .sample_simd_regs_enabled = has_simd_regs,
> };
> int fd;
> -
> - if (!intr)
> - return PERF_REGS_MASK;
> -
> /*
> * In an unnamed union, init it here to build on older gcc versions
> */
> attr.sample_period = 1;
> + if (sample_type == PERF_SAMPLE_REGS_INTR)
> + attr.sample_regs_intr = mask;
> + else
> + attr.sample_regs_user = mask;
>
> if (perf_pmus__num_core_pmus() > 1) {
> struct perf_pmu *pmu = NULL;
> @@ -276,13 +276,38 @@ uint64_t __perf_reg_mask_x86(bool intr)
> /*group_fd=*/-1, /*flags=*/0);
> if (fd != -1) {
> close(fd);
> - return (PERF_REG_EXTENDED_MASK | PERF_REGS_MASK);
> + return mask;
> + }
> +
> + return 0;
> +}
> +
> +uint64_t __perf_reg_mask_x86(bool intr, int *abi)
> +{
> + u64 sample_type = intr ? PERF_SAMPLE_REGS_INTR : PERF_SAMPLE_REGS_USER;
> + uint64_t mask = PERF_REGS_MASK;
> +
> + /* -1 indicates only basic GPRs are needed. */
> + if (*abi < 0)
> + return PERF_REGS_MASK;
> +
> + *abi = 0;
> + mask |= __arch__reg_mask(sample_type,
> + GENMASK_ULL(PERF_REG_X86_R31, PERF_REG_X86_R16),
> + true);
> + mask |= __arch__reg_mask(sample_type, BIT_ULL(PERF_REG_X86_SSP), true);
> +
> + if (mask != PERF_REGS_MASK) {
> + *abi |= PERF_SAMPLE_REGS_ABI_SIMD;
> + } else {
> + mask |= __arch__reg_mask(sample_type, PERF_REG_EXTENDED_MASK,
> + false);
> }
>
> - return PERF_REGS_MASK;
> + return mask;
> }
>
> -const char *__perf_reg_name_x86(int id)
> +static const char *__arch_reg_gpr_name(int id)
> {
> switch (id) {
> case PERF_REG_X86_AX:
> @@ -333,7 +358,60 @@ const char *__perf_reg_name_x86(int id)
> return "R14";
> case PERF_REG_X86_R15:
> return "R15";
> + default:
> + return NULL;
> + }
> +
> + return NULL;
> +}
>
> +static const char *__arch_reg_egpr_name(int id)
> +{
> + switch (id) {
> + case PERF_REG_X86_R16:
> + return "R16";
> + case PERF_REG_X86_R17:
> + return "R17";
> + case PERF_REG_X86_R18:
> + return "R18";
> + case PERF_REG_X86_R19:
> + return "R19";
> + case PERF_REG_X86_R20:
> + return "R20";
> + case PERF_REG_X86_R21:
> + return "R21";
> + case PERF_REG_X86_R22:
> + return "R22";
> + case PERF_REG_X86_R23:
> + return "R23";
> + case PERF_REG_X86_R24:
> + return "R24";
> + case PERF_REG_X86_R25:
> + return "R25";
> + case PERF_REG_X86_R26:
> + return "R26";
> + case PERF_REG_X86_R27:
> + return "R27";
> + case PERF_REG_X86_R28:
> + return "R28";
> + case PERF_REG_X86_R29:
> + return "R29";
> + case PERF_REG_X86_R30:
> + return "R30";
> + case PERF_REG_X86_R31:
> + return "R31";
> + case PERF_REG_X86_SSP:
> + return "SSP";
> + default:
> + return NULL;
> + }
> +
> + return NULL;
> +}
> +
> +static const char *__arch_reg_xmm_name(int id)
> +{
> + switch (id) {
> #define XMM(x) \
> case PERF_REG_X86_XMM ## x: \
> case PERF_REG_X86_XMM ## x + 1: \
> @@ -362,6 +440,22 @@ const char *__perf_reg_name_x86(int id)
> return NULL;
> }
>
> +const char *__perf_reg_name_x86(int id, int abi)
> +{
> + const char *name;
> +
> + name = __arch_reg_gpr_name(id);
> + if (name)
> + return name;
> +
> + if (abi & PERF_SAMPLE_REGS_ABI_SIMD)
> + name = __arch_reg_egpr_name(id);
> + else
> + name = __arch_reg_xmm_name(id);
> +
> + return name;
> +}
> +
> uint64_t __perf_reg_ip_x86(void)
> {
> return PERF_REG_X86_IP;
> diff --git a/tools/perf/util/perf_regs.c b/tools/perf/util/perf_regs.c
> index 5b8f34beb24e..afc567718bee 100644
> --- a/tools/perf/util/perf_regs.c
> +++ b/tools/perf/util/perf_regs.c
> @@ -32,7 +32,7 @@ int perf_sdt_arg_parse_op(uint16_t e_machine, char *old_op, char **new_op)
> return ret;
> }
>
> -uint64_t perf_intr_reg_mask(uint16_t e_machine)
> +uint64_t perf_intr_reg_mask(uint16_t e_machine, int *abi)
> {
> uint64_t mask = 0;
>
> @@ -64,7 +64,7 @@ uint64_t perf_intr_reg_mask(uint16_t e_machine)
> break;
> case EM_386:
> case EM_X86_64:
> - mask = __perf_reg_mask_x86(/*intr=*/true);
> + mask = __perf_reg_mask_x86(/*intr=*/true, abi);
> break;
> default:
> pr_debug("Unknown ELF machine %d, interrupt sampling register mask will be empty.\n",
> @@ -75,7 +75,7 @@ uint64_t perf_intr_reg_mask(uint16_t e_machine)
> return mask;
> }
>
> -uint64_t perf_user_reg_mask(uint16_t e_machine)
> +uint64_t perf_user_reg_mask(uint16_t e_machine, int *abi)
> {
> uint64_t mask = 0;
>
> @@ -107,7 +107,7 @@ uint64_t perf_user_reg_mask(uint16_t e_machine)
> break;
> case EM_386:
> case EM_X86_64:
> - mask = __perf_reg_mask_x86(/*intr=*/false);
> + mask = __perf_reg_mask_x86(/*intr=*/false, abi);
> break;
> default:
> pr_debug("Unknown ELF machine %d, user sampling register mask will be empty.\n",
> @@ -118,7 +118,7 @@ uint64_t perf_user_reg_mask(uint16_t e_machine)
> return mask;
> }
>
> -const char *perf_reg_name(int id, uint16_t e_machine, uint32_t e_flags)
> +const char *perf_reg_name(int id, uint16_t e_machine, uint32_t e_flags, int abi)
> {
> const char *reg_name = NULL;
>
> @@ -150,7 +150,7 @@ const char *perf_reg_name(int id, uint16_t e_machine, uint32_t e_flags)
> break;
> case EM_386:
> case EM_X86_64:
> - reg_name = __perf_reg_name_x86(id);
> + reg_name = __perf_reg_name_x86(id, abi);
> break;
> default:
> break;
> diff --git a/tools/perf/util/perf_regs.h b/tools/perf/util/perf_regs.h
> index 7c04700bf837..c9501ca8045d 100644
> --- a/tools/perf/util/perf_regs.h
> +++ b/tools/perf/util/perf_regs.h
> @@ -13,10 +13,10 @@ enum {
> };
>
> int perf_sdt_arg_parse_op(uint16_t e_machine, char *old_op, char **new_op);
> -uint64_t perf_intr_reg_mask(uint16_t e_machine);
> -uint64_t perf_user_reg_mask(uint16_t e_machine);
> +uint64_t perf_intr_reg_mask(uint16_t e_machine, int *abi);
> +uint64_t perf_user_reg_mask(uint16_t e_machine, int *abi);
>
> -const char *perf_reg_name(int id, uint16_t e_machine, uint32_t e_flags);
> +const char *perf_reg_name(int id, uint16_t e_machine, uint32_t e_flags, int abi);
> int perf_reg_value(u64 *valp, struct regs_dump *regs, int id);
> uint64_t perf_arch_reg_ip(uint16_t e_machine);
> uint64_t perf_arch_reg_sp(uint16_t e_machine);
> @@ -64,8 +64,8 @@ uint64_t __perf_reg_ip_s390(void);
> uint64_t __perf_reg_sp_s390(void);
>
> int __perf_sdt_arg_parse_op_x86(char *old_op, char **new_op);
> -uint64_t __perf_reg_mask_x86(bool intr);
> -const char *__perf_reg_name_x86(int id);
> +uint64_t __perf_reg_mask_x86(bool intr, int *abi);
> +const char *__perf_reg_name_x86(int id, int abi);
> uint64_t __perf_reg_ip_x86(void);
> uint64_t __perf_reg_sp_x86(void);
>
> diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
> index 2b0df7bd9a46..4cc5b96898e6 100644
> --- a/tools/perf/util/scripting-engines/trace-event-python.c
> +++ b/tools/perf/util/scripting-engines/trace-event-python.c
> @@ -733,7 +733,7 @@ static void regs_map(struct regs_dump *regs, uint64_t mask, uint16_t e_machine,
>
> printed += scnprintf(bf + printed, size - printed,
> "%5s:0x%" PRIx64 " ",
> - perf_reg_name(r, e_machine, e_flags), val);
> + perf_reg_name(r, e_machine, e_flags, regs->abi), val);
> }
> }
>
> diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
> index 4b465abfa36c..7cf7bf86205d 100644
> --- a/tools/perf/util/session.c
> +++ b/tools/perf/util/session.c
> @@ -959,15 +959,16 @@ static void branch_stack__printf(struct perf_sample *sample,
> }
> }
>
> -static void regs_dump__printf(u64 mask, u64 *regs, uint16_t e_machine, uint32_t e_flags)
> +static void regs_dump__printf(u64 mask, struct regs_dump *regs,
> + uint16_t e_machine, uint32_t e_flags)
> {
> unsigned rid, i = 0;
>
> for_each_set_bit(rid, (unsigned long *) &mask, sizeof(mask) * 8) {
> - u64 val = regs[i++];
> + u64 val = regs->regs[i++];
>
> printf(".... %-5s 0x%016" PRIx64 "\n",
> - perf_reg_name(rid, e_machine, e_flags), val);
> + perf_reg_name(rid, e_machine, e_flags, regs->abi), val);
> }
> }
Sashiko comments
"
Will this result in perf script and perf report printing an unknown ABI
string when SIMD registers are present?
The new PERF_SAMPLE_REGS_ABI_SIMD macro has a value of 4. When combined
with the 64-bit flag (value of 2), the kernel emits an abi value of 6.
The regs_dump_abi function checks if the abi is greater than
PERF_SAMPLE_REGS_ABI_64 (which is 2). Since 6 is greater than 2, it will
unconditionally return the string unknown.
I see this is fixed later in the series in commit 1f29b13d22a6 (perf regs:
Enable dumping of SIMD registers), which updates the ABI array to properly
handle the new bitmask.
"
It's correct, too. I will move the corresponding changes into this patch
from the next patch.
Thanks.
>
> @@ -995,7 +996,7 @@ static void regs__printf(const char *type, struct regs_dump *regs,
> mask,
> regs_dump_abi(regs));
>
> - regs_dump__printf(mask, regs->regs, e_machine, e_flags);
> + regs_dump__printf(mask, regs, e_machine, e_flags);
> }
>
> static void regs_user__printf(struct perf_sample *sample, uint16_t e_machine, uint32_t e_flags)
^ permalink raw reply [flat|nested] 10+ messages in thread
* [Patch v7 3/4] perf regs: Support x86 SIMD registers sampling
2026-03-24 0:57 [Patch v7 0/4] Perf tools: Support eGPRs/SSP/SIMD registers sampling Dapeng Mi
2026-03-24 0:57 ` [Patch v7 1/4] perf headers: Sync with the kernel headers Dapeng Mi
2026-03-24 0:57 ` [Patch v7 2/4] perf regs: Support x86 eGPRs/SSP sampling Dapeng Mi
@ 2026-03-24 0:57 ` Dapeng Mi
2026-03-26 2:50 ` Mi, Dapeng
2026-03-24 0:57 ` [Patch v7 4/4] perf regs: Enable dumping of SIMD registers Dapeng Mi
3 siblings, 1 reply; 10+ messages in thread
From: Dapeng Mi @ 2026-03-24 0:57 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
Namhyung Kim, Thomas Gleixner, Dave Hansen, Ian Rogers,
Adrian Hunter, Jiri Olsa, Alexander Shishkin, Andi Kleen,
Eranian Stephane
Cc: Mark Rutland, broonie, Ravi Bangoria, linux-kernel,
linux-perf-users, Zide Chen, Falcon Thomas, Dapeng Mi, Xudong Hao,
Dapeng Mi
This patch adds support for the newly introduced SIMD register sampling
format by adding the following 5 functions:
uint64_t perf_intr_simd_reg_class_mask(uint16_t e_machine, bool pred);
uint64_t perf_user_simd_reg_class_mask(uint16_t e_machine, bool pred);
uint64_t perf_intr_simd_reg_class_bitmap_qwords(uint16_t e_machine, int reg_c,
uint16_t *qwords, bool pred);
uint64_t perf_user_simd_reg_class_bitmap_qwords(uint16_t e_machine, int reg_c,
uint16_t *qwords, bool pred);
const char *perf_simd_reg_class_name(uint16_t e_machine, int id, bool pred);
The perf_{intr|user}_simd_reg_class_mask() functions retrieve the bitmap
of kernel supported SIMD/PRED register classes on current platform for
intr-regs and user-regs sampling, such as OPMASK/XMM/YMM/ZMM on
x86 platforms.
The perf_{intr|user}_simd_reg_class_bitmap_qwords() functions retrieve
the bitmap and qwords length of a certain class of SIMD/PRED register
on current platform for intr-regs and user-regs sampling. For example,
for the XMM registers on x86 platforms, the returned bitmap is 0xffff
(XMM0 ~ XMM15) and the qwords length is 2 (128 bits for each XMM
register).
The perf_simd_reg_class_name() function gets the register class name for
a certain register class index.
Additionally, the function __parse_regs() is enhanced to support parsing
these newly introduced SIMD/PRED registers. Currently, each class of
register can only be sampled collectively; sampling a specific SIMD
register is not supported. For example, all XMM registers are sampled
together rather than sampling only XMM0.
When multiple overlapping register types, such as XMM and YMM, are
sampled simultaneously, only the superset (YMM registers) is sampled.
With this patch, all supported sampling registers on x86 platforms are
displayed as follows.
$perf record --intr-regs=?
available registers: AX BX CX DX SI DI BP SP IP FLAGS CS SS R8 R9 R10
R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28
R29 R30 R31 SSP XMM0-15 YMM0-15 ZMM0-31 OPMASK0-7
$perf record --user-regs=?
available registers: AX BX CX DX SI DI BP SP IP FLAGS CS SS R8 R9 R10
R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28
R29 R30 R31 SSP XMM0-15 YMM0-15 ZMM0-31 OPMASK0-7
Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Reviewed-by: Ian Rogers <irogers@google.com>
---
tools/perf/util/evsel.c | 27 ++
tools/perf/util/parse-regs-options.c | 164 +++++++++-
.../perf/util/perf-regs-arch/perf_regs_x86.c | 292 ++++++++++++++++++
tools/perf/util/perf_event_attr_fprintf.c | 6 +
tools/perf/util/perf_regs.c | 72 +++++
tools/perf/util/perf_regs.h | 11 +
tools/perf/util/record.h | 6 +
7 files changed, 567 insertions(+), 11 deletions(-)
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index f565ef2eb476..5f00489e714a 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1589,12 +1589,39 @@ void evsel__config(struct evsel *evsel, const struct record_opts *opts,
if (opts->sample_intr_regs && !evsel->no_aux_samples &&
!evsel__is_dummy_event(evsel)) {
attr->sample_regs_intr = opts->sample_intr_regs;
+ attr->sample_simd_regs_enabled = !!opts->sample_pred_reg_qwords;
+ evsel__set_sample_bit(evsel, REGS_INTR);
+ }
+
+ if ((opts->sample_intr_vec_regs || opts->sample_intr_pred_regs) &&
+ !evsel->no_aux_samples && !evsel__is_dummy_event(evsel)) {
+ /* A non-zero pred qwords value implies that the SIMD register set is used */
+ if (opts->sample_pred_reg_qwords)
+ attr->sample_simd_pred_reg_qwords = opts->sample_pred_reg_qwords;
+ else
+ attr->sample_simd_pred_reg_qwords = 1;
+ attr->sample_simd_vec_reg_intr = opts->sample_intr_vec_regs;
+ attr->sample_simd_vec_reg_qwords = opts->sample_vec_reg_qwords;
+ attr->sample_simd_pred_reg_intr = opts->sample_intr_pred_regs;
evsel__set_sample_bit(evsel, REGS_INTR);
}
if (opts->sample_user_regs && !evsel->no_aux_samples &&
!evsel__is_dummy_event(evsel)) {
attr->sample_regs_user |= opts->sample_user_regs;
+ attr->sample_simd_regs_enabled = !!opts->sample_pred_reg_qwords;
+ evsel__set_sample_bit(evsel, REGS_USER);
+ }
+
+ if ((opts->sample_user_vec_regs || opts->sample_user_pred_regs) &&
+ !evsel->no_aux_samples && !evsel__is_dummy_event(evsel)) {
+ if (opts->sample_pred_reg_qwords)
+ attr->sample_simd_pred_reg_qwords = opts->sample_pred_reg_qwords;
+ else
+ attr->sample_simd_pred_reg_qwords = 1;
+ attr->sample_simd_vec_reg_user = opts->sample_user_vec_regs;
+ attr->sample_simd_vec_reg_qwords = opts->sample_vec_reg_qwords;
+ attr->sample_simd_pred_reg_user = opts->sample_user_pred_regs;
evsel__set_sample_bit(evsel, REGS_USER);
}
diff --git a/tools/perf/util/parse-regs-options.c b/tools/perf/util/parse-regs-options.c
index 6cf865bfc2f7..3dfa7ec276c2 100644
--- a/tools/perf/util/parse-regs-options.c
+++ b/tools/perf/util/parse-regs-options.c
@@ -9,13 +9,13 @@
#include <subcmd/parse-options.h>
#include "util/perf_regs.h"
#include "util/parse-regs-options.h"
+#include "record.h"
static void
-list_perf_regs(FILE *fp, uint64_t mask, int abi)
+__list_gp_regs(FILE *fp, uint64_t mask, int abi)
{
const char *last_name = NULL;
- fprintf(fp, "available registers: ");
for (int reg = 0; reg < 64; reg++) {
const char *name;
@@ -27,14 +27,68 @@ list_perf_regs(FILE *fp, uint64_t mask, int abi)
fprintf(fp, "%s%s", reg > 0 ? " " : "", name);
last_name = name;
}
+}
+
+static void
+__list_simd_regs(FILE *fp, uint64_t mask, bool intr, bool pred)
+{
+ uint64_t bitmap = 0;
+ uint16_t qwords = 0;
+ const char *name;
+ int i = 0;
+
+ for (int reg_c = 0; reg_c < 64; reg_c++) {
+ if (((1ULL << reg_c) & mask) == 0)
+ continue;
+
+ name = perf_simd_reg_class_name(EM_HOST, reg_c, pred);
+ bitmap = intr ?
+ perf_intr_simd_reg_class_bitmap_qwords(EM_HOST, reg_c, &qwords, pred) :
+ perf_user_simd_reg_class_bitmap_qwords(EM_HOST, reg_c, &qwords, pred);
+ if (name && bitmap)
+ fprintf(fp, "%s%s0-%d", i++ > 0 ? " " : "",
+ name, fls64(bitmap) - 1);
+ }
+}
+
+static void
+list_perf_regs(FILE *fp, uint64_t mask, uint64_t simd_mask,
+ uint64_t pred_mask, int abi, bool intr)
+{
+ bool printed = false;
+
+ fprintf(fp, "available registers: ");
+
+ if (mask) {
+ __list_gp_regs(fp, mask, abi);
+ printed = true;
+ }
+
+ if (simd_mask) {
+ if (printed)
+ fprintf(fp, " ");
+ __list_simd_regs(fp, simd_mask, intr, /*pred=*/false);
+ printed = true;
+ }
+
+ if (pred_mask) {
+ if (printed)
+ fprintf(fp, " ");
+ __list_simd_regs(fp, pred_mask, intr, /*pred=*/true);
+ printed = true;
+ }
+
fputc('\n', fp);
}
static uint64_t
-name_to_perf_reg_mask(const char *to_match, uint64_t mask, int abi)
+name_to_gp_reg_mask(const char *to_match, uint64_t mask, int abi)
{
uint64_t reg_mask = 0;
+ if (!mask)
+ return reg_mask;
+
for (int reg = 0; reg < 64; reg++) {
const char *name;
@@ -51,13 +105,79 @@ name_to_perf_reg_mask(const char *to_match, uint64_t mask, int abi)
return reg_mask;
}
+static bool
+name_to_simd_reg_mask(struct record_opts *opts, const char *to_match,
+ uint64_t mask, bool intr, bool pred)
+{
+ bool matched = false;
+ uint64_t bitmap;
+ uint16_t qwords;
+ int reg_c;
+
+ if (!mask)
+ return false;
+
+ for (reg_c = 0; reg_c < 64; reg_c++) {
+ const char *name;
+
+ if (((1ULL << reg_c) & mask) == 0)
+ continue;
+
+ name = perf_simd_reg_class_name(EM_HOST, reg_c, pred);
+ if (!name)
+ continue;
+
+ if (!strcasecmp(to_match, name)) {
+ matched = true;
+ break;
+ }
+ }
+
+ if (!matched)
+ return false;
+
+ if (intr) {
+ bitmap = perf_intr_simd_reg_class_bitmap_qwords(EM_HOST,
+ reg_c, &qwords, pred);
+ } else {
+ bitmap = perf_user_simd_reg_class_bitmap_qwords(EM_HOST,
+ reg_c, &qwords, pred);
+ }
+
+ /* Just need the highest qwords */
+ if (pred) {
+ if (qwords >= opts->sample_pred_reg_qwords) {
+ opts->sample_pred_reg_qwords = qwords;
+ if (intr)
+ opts->sample_intr_pred_regs = bitmap;
+ else
+ opts->sample_user_pred_regs = bitmap;
+ }
+ } else {
+ if (qwords >= opts->sample_vec_reg_qwords) {
+ opts->sample_vec_reg_qwords = qwords;
+ if (intr)
+ opts->sample_intr_vec_regs = bitmap;
+ else
+ opts->sample_user_vec_regs = bitmap;
+ }
+ }
+
+ return true;
+}
+
static int
__parse_regs(const struct option *opt, const char *str, int unset, bool intr)
{
uint64_t *mode = (uint64_t *)opt->value;
+ struct record_opts *opts;
char *s, *os = NULL, *p;
- int ret = -1;
+ uint64_t simd_mask;
+ uint64_t pred_mask;
uint64_t mask;
+ const char *warn;
+ bool matched;
+ int ret = -1;
int abi = 0;
if (unset)
@@ -69,11 +189,16 @@ __parse_regs(const struct option *opt, const char *str, int unset, bool intr)
if (*mode)
return -1;
- mask = intr ? perf_intr_reg_mask(EM_HOST, &abi) : perf_user_reg_mask(EM_HOST, &abi);
+ mask = intr ? perf_intr_reg_mask(EM_HOST, &abi) :
+ perf_user_reg_mask(EM_HOST, &abi);
+ opts = intr ? container_of(opt->value, struct record_opts, sample_intr_regs) :
+ container_of(opt->value, struct record_opts, sample_user_regs);
/* str may be NULL in case no arg is passed to -I */
if (!str) {
*mode = mask;
+ if (abi & PERF_SAMPLE_REGS_ABI_SIMD)
+ opts->sample_pred_reg_qwords = 1;
return 0;
}
@@ -82,6 +207,15 @@ __parse_regs(const struct option *opt, const char *str, int unset, bool intr)
if (!s)
return -1;
+ if (intr) {
+ simd_mask = perf_intr_simd_reg_class_mask(EM_HOST, /*pred=*/false);
+ pred_mask = perf_intr_simd_reg_class_mask(EM_HOST, /*pred=*/true);
+ } else {
+ simd_mask = perf_user_simd_reg_class_mask(EM_HOST, /*pred=*/false);
+ pred_mask = perf_user_simd_reg_class_mask(EM_HOST, /*pred=*/true);
+ }
+
+ warn = "Unknown register \"%s\", check man page or run \"perf record %s?\"\n";
for (;;) {
uint64_t reg_mask;
@@ -90,15 +224,23 @@ __parse_regs(const struct option *opt, const char *str, int unset, bool intr)
*p = '\0';
if (!strcmp(s, "?")) {
- list_perf_regs(stderr, mask, abi);
+ list_perf_regs(stderr, mask, simd_mask, pred_mask, abi, intr);
goto error;
}
- reg_mask = name_to_perf_reg_mask(s, mask, abi);
- if (reg_mask == 0) {
- ui__warning("Unknown register \"%s\", check man page or run \"perf record %s?\"\n",
- s, intr ? "-I" : "--user-regs=");
- goto error;
+ reg_mask = name_to_gp_reg_mask(s, mask, abi);
+ if (reg_mask) {
+ if (abi & PERF_SAMPLE_REGS_ABI_SIMD)
+ opts->sample_pred_reg_qwords = 1;
+ } else {
+ matched = name_to_simd_reg_mask(opts, s, simd_mask,
+ intr, /*pred=*/false) ||
+ name_to_simd_reg_mask(opts, s, pred_mask,
+ intr, /*pred=*/true);
+ if (!matched) {
+ ui__warning(warn, s, intr ? "-I" : "--user-regs=");
+ goto error;
+ }
}
*mode |= reg_mask;
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_x86.c b/tools/perf/util/perf-regs-arch/perf_regs_x86.c
index ae26d991cdc9..2bc93b600662 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_x86.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_x86.c
@@ -465,3 +465,295 @@ uint64_t __perf_reg_sp_x86(void)
{
return PERF_REG_X86_SP;
}
+
+enum {
+ PERF_REG_CLASS_X86_OPMASK = 0,
+ PERF_REG_CLASS_X86_XMM,
+ PERF_REG_CLASS_X86_YMM,
+ PERF_REG_CLASS_X86_ZMM,
+ PERF_REG_X86_MAX_SIMD_CLASSES,
+};
+
+#define PERF_REG_CLASS_X86_PRED_MASK (BIT(PERF_REG_CLASS_X86_OPMASK))
+#define PERF_REG_CLASS_X86_SIMD_MASK (BIT(PERF_REG_CLASS_X86_XMM) | \
+ BIT(PERF_REG_CLASS_X86_YMM) | \
+ BIT(PERF_REG_CLASS_X86_ZMM))
+
+/*
+ * This function is used to determine which kinds of SIMD registers
+ * (OPMASK/XMM/YMM/ZMM) the kernel perf subsystem supports sampling.
+ *
+ * @sample_type: PERF_SAMPLE_REGS_INTR or PERF_SAMPLE_REGS_USER
+ * @qwords: the length of SIMD register, like 1/2/4/8 qwords for
+ * OPMASK/XMM/YMM/ZMM registers.
+ * @mask: the bitmask of SIMD registers, like 0xffff for XMM0 ~ XMM15
+ * @pred: whether it's a predicate register, like the OPMASK registers.
+ *
+ * Return value: true indicates support, otherwise no support.
+ */
+static bool
+__support_simd_reg_class(uint64_t sample_type, uint16_t qwords,
+ uint64_t mask, bool pred)
+{
+ struct perf_event_attr attr = {
+ .type = PERF_TYPE_HARDWARE,
+ .config = PERF_COUNT_HW_CPU_CYCLES,
+ .sample_type = sample_type,
+ .disabled = 1,
+ .exclude_kernel = 1,
+ .sample_simd_regs_enabled = 1,
+ };
+ int fd;
+
+ attr.sample_period = 1;
+
+ if (!pred) {
+ attr.sample_simd_vec_reg_qwords = qwords;
+ if (sample_type == PERF_SAMPLE_REGS_INTR)
+ attr.sample_simd_vec_reg_intr = mask;
+ else
+ attr.sample_simd_vec_reg_user = mask;
+ } else {
+ attr.sample_simd_pred_reg_qwords = PERF_X86_OPMASK_QWORDS;
+ if (sample_type == PERF_SAMPLE_REGS_INTR)
+ attr.sample_simd_pred_reg_intr = PERF_X86_SIMD_PRED_MASK;
+ else
+ attr.sample_simd_pred_reg_user = PERF_X86_SIMD_PRED_MASK;
+ }
+
+ if (perf_pmus__num_core_pmus() > 1) {
+ __u64 type = perf_pmus__find_core_pmu()->type;
+
+ attr.config |= type << PERF_PMU_TYPE_SHIFT;
+ }
+
+ event_attr_init(&attr);
+
+ fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
+ if (fd != -1) {
+ close(fd);
+ return true;
+ }
+
+ return false;
+}
+
+#define PERF_X86_SIMD_ZMMH_REGS (PERF_X86_SIMD_ZMM_REGS / 2)
+
+static bool __arch_has_simd_reg_class(uint64_t sample_type, int reg_class,
+ uint64_t *mask, uint16_t *qwords)
+{
+ bool supported = false;
+ uint64_t bits;
+
+ *mask = 0;
+ *qwords = 0;
+
+ switch (reg_class) {
+ case PERF_REG_CLASS_X86_OPMASK:
+ bits = BIT_ULL(PERF_X86_SIMD_OPMASK_REGS) - 1;
+ supported = __support_simd_reg_class(sample_type,
+ PERF_X86_OPMASK_QWORDS,
+ bits, true);
+ if (supported) {
+ *mask = bits;
+ *qwords = PERF_X86_OPMASK_QWORDS;
+ }
+ break;
+ case PERF_REG_CLASS_X86_XMM:
+ bits = BIT_ULL(PERF_X86_SIMD_XMM_REGS) - 1;
+ supported = __support_simd_reg_class(sample_type,
+ PERF_X86_XMM_QWORDS,
+ bits, false);
+ if (supported) {
+ *mask = bits;
+ *qwords = PERF_X86_XMM_QWORDS;
+ }
+ break;
+ case PERF_REG_CLASS_X86_YMM:
+ bits = BIT_ULL(PERF_X86_SIMD_YMM_REGS) - 1;
+ supported = __support_simd_reg_class(sample_type,
+ PERF_X86_YMM_QWORDS,
+ bits, false);
+ if (supported) {
+ *mask = bits;
+ *qwords = PERF_X86_YMM_QWORDS;
+ }
+ break;
+ case PERF_REG_CLASS_X86_ZMM:
+ bits = BIT_ULL(PERF_X86_SIMD_ZMM_REGS) - 1;
+ supported = __support_simd_reg_class(sample_type,
+ PERF_X86_ZMM_QWORDS,
+ bits, false);
+ if (supported) {
+ *mask = bits;
+ *qwords = PERF_X86_ZMM_QWORDS;
+ break;
+ }
+
+ bits = BIT_ULL(PERF_X86_SIMD_ZMMH_REGS) - 1;
+ supported = __support_simd_reg_class(sample_type,
+ PERF_X86_ZMM_QWORDS,
+ bits, false);
+ if (supported) {
+ *mask = bits;
+ *qwords = PERF_X86_ZMM_QWORDS;
+ }
+ break;
+ default:
+ break;
+ }
+
+ return supported;
+}
+
+static bool __support_simd_sampling(void)
+{
+ uint64_t mask = BIT_ULL(PERF_X86_SIMD_XMM_REGS) - 1;
+ uint16_t qwords = PERF_X86_XMM_QWORDS;
+ static bool simd_sampling_supported;
+ static bool cached;
+
+ if (cached)
+ return simd_sampling_supported;
+
+ simd_sampling_supported =
+ __arch_has_simd_reg_class(PERF_SAMPLE_REGS_INTR,
+ PERF_REG_CLASS_X86_XMM,
+ &mask, &qwords);
+ simd_sampling_supported |=
+ __arch_has_simd_reg_class(PERF_SAMPLE_REGS_USER,
+ PERF_REG_CLASS_X86_XMM,
+ &mask, &qwords);
+ cached = true;
+
+ return simd_sampling_supported;
+}
+
+/*
+ * @x86_intr_simd_cached: indicates that the data of the 3
+ * x86_intr_simd_* items below has been retrieved from kernel and cached.
+ * @x86_intr_simd_reg_class_mask: indicates which kinds of PRED/SIMD
+ * registers are supported for intr-regs option. Assuming the kernel perf
+ * subsystem supports XMM/YMM sampling, the mask is
+ * BIT(PERF_REG_CLASS_X86_XMM) | BIT(PERF_REG_CLASS_X86_YMM).
+ * @x86_intr_simd_mask: indicates register bitmask for each kind of
+ * supported PRED/SIMD register, like
+ * x86_intr_simd_mask[PERF_REG_CLASS_X86_XMM] = 0xffff.
+ * @x86_intr_simd_qwords: indicates the register length (in qwords)
+ * for each kind of supported PRED/SIMD register, like
+ * x86_intr_simd_qwords[PERF_REG_CLASS_X86_XMM] = 2.
+ */
+static bool x86_intr_simd_cached;
+static uint64_t x86_intr_simd_reg_class_mask;
+static uint64_t x86_intr_simd_mask[PERF_REG_X86_MAX_SIMD_CLASSES];
+static uint16_t x86_intr_simd_qwords[PERF_REG_X86_MAX_SIMD_CLASSES];
+
+/*
+ * Similar to the x86_intr_simd_* items above; the difference is that these
+ * items are used for user-regs option.
+ */
+static bool x86_user_simd_cached;
+static uint64_t x86_user_simd_reg_class_mask;
+static uint64_t x86_user_simd_mask[PERF_REG_X86_MAX_SIMD_CLASSES];
+static uint16_t x86_user_simd_qwords[PERF_REG_X86_MAX_SIMD_CLASSES];
+
+static uint64_t __arch__simd_reg_class_mask(bool intr)
+{
+ uint64_t mask = 0;
+ bool supported;
+ int reg_c;
+
+ if (!__support_simd_sampling())
+ return 0;
+
+ if (intr && x86_intr_simd_cached)
+ return x86_intr_simd_reg_class_mask;
+
+ if (!intr && x86_user_simd_cached)
+ return x86_user_simd_reg_class_mask;
+
+ for (reg_c = 0; reg_c < PERF_REG_X86_MAX_SIMD_CLASSES; reg_c++) {
+ supported = false;
+
+ if (intr) {
+ supported = __arch_has_simd_reg_class(
+ PERF_SAMPLE_REGS_INTR,
+ reg_c,
+ &x86_intr_simd_mask[reg_c],
+ &x86_intr_simd_qwords[reg_c]);
+ } else {
+ supported = __arch_has_simd_reg_class(
+ PERF_SAMPLE_REGS_USER,
+ reg_c,
+ &x86_user_simd_mask[reg_c],
+ &x86_user_simd_qwords[reg_c]);
+ }
+ if (supported)
+ mask |= BIT_ULL(reg_c);
+ }
+
+ if (intr) {
+ x86_intr_simd_reg_class_mask = mask;
+ x86_intr_simd_cached = true;
+ } else {
+ x86_user_simd_reg_class_mask = mask;
+ x86_user_simd_cached = true;
+ }
+
+ return mask;
+}
+
+static uint64_t
+__arch__simd_reg_class_bitmap_qwords(bool intr, int reg_c, uint16_t *qwords)
+{
+ uint64_t mask = 0;
+
+ *qwords = 0;
+ if (reg_c >= PERF_REG_X86_MAX_SIMD_CLASSES)
+ return mask;
+
+ if (intr) {
+ mask = x86_intr_simd_mask[reg_c];
+ *qwords = x86_intr_simd_qwords[reg_c];
+ } else {
+ mask = x86_user_simd_mask[reg_c];
+ *qwords = x86_user_simd_qwords[reg_c];
+ }
+
+ return mask;
+}
+
+uint64_t __perf_simd_reg_class_mask_x86(bool intr, bool pred)
+{
+ uint64_t mask = __arch__simd_reg_class_mask(intr);
+
+ return pred ? mask & PERF_REG_CLASS_X86_PRED_MASK :
+ mask & PERF_REG_CLASS_X86_SIMD_MASK;
+}
+
+uint64_t __perf_simd_reg_class_bitmap_qwords_x86(int reg_c, uint16_t *qwords,
+ bool intr, bool pred)
+{
+ /* Self-caching per direction: fills the intr or the user table as needed. */
+ __perf_simd_reg_class_mask_x86(intr, pred);
+ return __arch__simd_reg_class_bitmap_qwords(intr, reg_c, qwords);
+}
+
+const char *__perf_simd_reg_class_name_x86(int id, bool pred __maybe_unused)
+{
+ switch (id) {
+ case PERF_REG_CLASS_X86_OPMASK:
+ return "OPMASK";
+ case PERF_REG_CLASS_X86_XMM:
+ return "XMM";
+ case PERF_REG_CLASS_X86_YMM:
+ return "YMM";
+ case PERF_REG_CLASS_X86_ZMM:
+ return "ZMM";
+ default:
+ return NULL;
+ }
+
+ return NULL;
+}
diff --git a/tools/perf/util/perf_event_attr_fprintf.c b/tools/perf/util/perf_event_attr_fprintf.c
index 741c3d657a8b..c6b8e53e06fd 100644
--- a/tools/perf/util/perf_event_attr_fprintf.c
+++ b/tools/perf/util/perf_event_attr_fprintf.c
@@ -362,6 +362,12 @@ int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr,
PRINT_ATTRf(aux_start_paused, p_unsigned);
PRINT_ATTRf(aux_pause, p_unsigned);
PRINT_ATTRf(aux_resume, p_unsigned);
+ PRINT_ATTRf(sample_simd_pred_reg_qwords, p_unsigned);
+ PRINT_ATTRf(sample_simd_pred_reg_intr, p_hex);
+ PRINT_ATTRf(sample_simd_pred_reg_user, p_hex);
+ PRINT_ATTRf(sample_simd_vec_reg_qwords, p_unsigned);
+ PRINT_ATTRf(sample_simd_vec_reg_intr, p_hex);
+ PRINT_ATTRf(sample_simd_vec_reg_user, p_hex);
return ret;
}
diff --git a/tools/perf/util/perf_regs.c b/tools/perf/util/perf_regs.c
index afc567718bee..dc99e797e715 100644
--- a/tools/perf/util/perf_regs.c
+++ b/tools/perf/util/perf_regs.c
@@ -246,3 +246,75 @@ uint64_t perf_arch_reg_sp(uint16_t e_machine)
return 0;
}
}
+
+uint64_t perf_intr_simd_reg_class_mask(uint16_t e_machine, bool pred)
+{
+ switch (e_machine) {
+ case EM_386:
+ case EM_X86_64:
+ return __perf_simd_reg_class_mask_x86(/*intr=*/true, pred);
+ default:
+ return 0;
+ }
+}
+
+uint64_t perf_user_simd_reg_class_mask(uint16_t e_machine, bool pred)
+{
+ switch (e_machine) {
+ case EM_386:
+ case EM_X86_64:
+ return __perf_simd_reg_class_mask_x86(/*intr=*/false, pred);
+ default:
+ return 0;
+ }
+}
+
+uint64_t perf_intr_simd_reg_class_bitmap_qwords(uint16_t e_machine, int reg_c,
+ uint16_t *qwords, bool pred)
+{
+ switch (e_machine) {
+ case EM_386:
+ case EM_X86_64:
+ return __perf_simd_reg_class_bitmap_qwords_x86(reg_c, qwords,
+ /*intr=*/true,
+ pred);
+ default:
+ *qwords = 0;
+ return 0;
+ }
+}
+
+uint64_t perf_user_simd_reg_class_bitmap_qwords(uint16_t e_machine, int reg_c,
+ uint16_t *qwords, bool pred)
+{
+ switch (e_machine) {
+ case EM_386:
+ case EM_X86_64:
+ return __perf_simd_reg_class_bitmap_qwords_x86(reg_c, qwords,
+ /*intr=*/false,
+ pred);
+ default:
+ *qwords = 0;
+ return 0;
+ }
+}
+
+const char *perf_simd_reg_class_name(uint16_t e_machine, int id, bool pred)
+{
+ const char *name = NULL;
+
+ switch (e_machine) {
+ case EM_386:
+ case EM_X86_64:
+ name = __perf_simd_reg_class_name_x86(id, pred);
+ break;
+ default:
+ break;
+ }
+ if (name)
+ return name;
+
+ pr_debug("Failed to find %s register %d for ELF machine type %u\n",
+ pred ? "PRED" : "SIMD", id, e_machine);
+ return "unknown";
+}
diff --git a/tools/perf/util/perf_regs.h b/tools/perf/util/perf_regs.h
index c9501ca8045d..80d1d7316188 100644
--- a/tools/perf/util/perf_regs.h
+++ b/tools/perf/util/perf_regs.h
@@ -20,6 +20,13 @@ const char *perf_reg_name(int id, uint16_t e_machine, uint32_t e_flags, int abi)
int perf_reg_value(u64 *valp, struct regs_dump *regs, int id);
uint64_t perf_arch_reg_ip(uint16_t e_machine);
uint64_t perf_arch_reg_sp(uint16_t e_machine);
+uint64_t perf_intr_simd_reg_class_mask(uint16_t e_machine, bool pred);
+uint64_t perf_user_simd_reg_class_mask(uint16_t e_machine, bool pred);
+uint64_t perf_intr_simd_reg_class_bitmap_qwords(uint16_t e_machine, int reg_c,
+ uint16_t *qwords, bool pred);
+uint64_t perf_user_simd_reg_class_bitmap_qwords(uint16_t e_machine, int reg_c,
+ uint16_t *qwords, bool pred);
+const char *perf_simd_reg_class_name(uint16_t e_machine, int id, bool pred);
int __perf_sdt_arg_parse_op_arm64(char *old_op, char **new_op);
uint64_t __perf_reg_mask_arm64(bool intr);
@@ -68,6 +75,10 @@ uint64_t __perf_reg_mask_x86(bool intr, int *abi);
const char *__perf_reg_name_x86(int id, int abi);
uint64_t __perf_reg_ip_x86(void);
uint64_t __perf_reg_sp_x86(void);
+uint64_t __perf_simd_reg_class_mask_x86(bool intr, bool pred);
+uint64_t __perf_simd_reg_class_bitmap_qwords_x86(int reg_c, uint16_t *qwords,
+ bool intr, bool pred);
+const char *__perf_simd_reg_class_name_x86(int id, bool pred);
static inline uint64_t DWARF_MINIMAL_REGS(uint16_t e_machine)
{
diff --git a/tools/perf/util/record.h b/tools/perf/util/record.h
index 93627c9a7338..37ed44b5f15b 100644
--- a/tools/perf/util/record.h
+++ b/tools/perf/util/record.h
@@ -62,6 +62,12 @@ struct record_opts {
u64 branch_stack;
u64 sample_intr_regs;
u64 sample_user_regs;
+ u64 sample_intr_vec_regs;
+ u64 sample_user_vec_regs;
+ u32 sample_intr_pred_regs;
+ u32 sample_user_pred_regs;
+ u16 sample_vec_reg_qwords;
+ u16 sample_pred_reg_qwords;
u64 default_interval;
u64 user_interval;
size_t auxtrace_snapshot_size;
--
2.34.1
^ permalink raw reply related [flat|nested] 10+ messages in thread* Re: [Patch v7 3/4] perf regs: Support x86 SIMD registers sampling
2026-03-24 0:57 ` [Patch v7 3/4] perf regs: Support x86 SIMD registers sampling Dapeng Mi
@ 2026-03-26 2:50 ` Mi, Dapeng
0 siblings, 0 replies; 10+ messages in thread
From: Mi, Dapeng @ 2026-03-26 2:50 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
Namhyung Kim, Thomas Gleixner, Dave Hansen, Ian Rogers,
Adrian Hunter, Jiri Olsa, Alexander Shishkin, Andi Kleen,
Eranian Stephane
Cc: Mark Rutland, broonie, Ravi Bangoria, linux-kernel,
linux-perf-users, Zide Chen, Falcon Thomas, Dapeng Mi, Xudong Hao
On 3/24/2026 8:57 AM, Dapeng Mi wrote:
> This patch adds support for the newly introduced SIMD register sampling
> format by adding the following 5 functions:
>
> uint64_t perf_intr_simd_reg_class_mask(uint16_t e_machine, bool pred);
> uint64_t perf_user_simd_reg_class_mask(uint16_t e_machine, bool pred);
> uint64_t perf_intr_simd_reg_class_bitmap_qwords(uint16_t e_machine, int reg_c,
> uint16_t *qwords, bool pred);
> uint64_t perf_user_simd_reg_class_bitmap_qwords(uint16_t e_machine, int reg_c,
> uint16_t *qwords, bool pred);
> const char *perf_simd_reg_class_name(uint16_t e_machine, int id, bool pred);
>
> The perf_{intr|user}_simd_reg_class_mask() functions retrieve the bitmap
> of kernel supported SIMD/PRED register classes on current platform for
> intr-regs and user-regs sampling, such as OPMASK/XMM/YMM/ZMM on
> x86 platforms.
>
> The perf_{intr|user}_simd_reg_class_bitmap_qwords() functions retrieve
> the bitmap and qwords length of a certain class of SIMD/PRED register
> on current platform for intr-regs and user-regs sampling. For example,
> for the XMM registers on x86 platforms, the returned bitmap is 0xffff
> (XMM0 ~ XMM15) and the qwords length is 2 (128 bits for each XMM
> register).
>
> The perf_simd_reg_class_name() function gets the register class name for
> a certain register class index.
>
> Additionally, the function __parse_regs() is enhanced to support parsing
> these newly introduced SIMD/PRED registers. Currently, each class of
> register can only be sampled collectively; sampling a specific SIMD
> register is not supported. For example, all XMM registers are sampled
> together rather than sampling only XMM0.
>
> When multiple overlapping register types, such as XMM and YMM, are
> sampled simultaneously, only the superset (YMM registers) is sampled.
>
> With this patch, all supported sampling registers on x86 platforms are
> displayed as follows.
>
> $perf record --intr-regs=?
> available registers: AX BX CX DX SI DI BP SP IP FLAGS CS SS R8 R9 R10
> R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28
> R29 R30 R31 SSP XMM0-15 YMM0-15 ZMM0-31 OPMASK0-7
>
> $perf record --user-regs=?
> available registers: AX BX CX DX SI DI BP SP IP FLAGS CS SS R8 R9 R10
> R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28
> R29 R30 R31 SSP XMM0-15 YMM0-15 ZMM0-31 OPMASK0-7
>
> Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
> Reviewed-by: Ian Rogers <irogers@google.com>
> ---
> tools/perf/util/evsel.c | 27 ++
> tools/perf/util/parse-regs-options.c | 164 +++++++++-
> .../perf/util/perf-regs-arch/perf_regs_x86.c | 292 ++++++++++++++++++
> tools/perf/util/perf_event_attr_fprintf.c | 6 +
> tools/perf/util/perf_regs.c | 72 +++++
> tools/perf/util/perf_regs.h | 11 +
> tools/perf/util/record.h | 6 +
> 7 files changed, 567 insertions(+), 11 deletions(-)
>
> diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
> index f565ef2eb476..5f00489e714a 100644
> --- a/tools/perf/util/evsel.c
> +++ b/tools/perf/util/evsel.c
> @@ -1589,12 +1589,39 @@ void evsel__config(struct evsel *evsel, const struct record_opts *opts,
> if (opts->sample_intr_regs && !evsel->no_aux_samples &&
> !evsel__is_dummy_event(evsel)) {
> attr->sample_regs_intr = opts->sample_intr_regs;
> + attr->sample_simd_regs_enabled = !!opts->sample_pred_reg_qwords;
> + evsel__set_sample_bit(evsel, REGS_INTR);
> + }
> +
> + if ((opts->sample_intr_vec_regs || opts->sample_intr_pred_regs) &&
> + !evsel->no_aux_samples && !evsel__is_dummy_event(evsel)) {
> + /* The pred qwords is to implies the set of SIMD registers is used */
> + if (opts->sample_pred_reg_qwords)
> + attr->sample_simd_pred_reg_qwords = opts->sample_pred_reg_qwords;
> + else
> + attr->sample_simd_pred_reg_qwords = 1;
> + attr->sample_simd_vec_reg_intr = opts->sample_intr_vec_regs;
> + attr->sample_simd_vec_reg_qwords = opts->sample_vec_reg_qwords;
> + attr->sample_simd_pred_reg_intr = opts->sample_intr_pred_regs;
> evsel__set_sample_bit(evsel, REGS_INTR);
> }
>
> if (opts->sample_user_regs && !evsel->no_aux_samples &&
> !evsel__is_dummy_event(evsel)) {
> attr->sample_regs_user |= opts->sample_user_regs;
> + attr->sample_simd_regs_enabled = !!opts->sample_pred_reg_qwords;
> + evsel__set_sample_bit(evsel, REGS_USER);
> + }
> +
> + if ((opts->sample_user_vec_regs || opts->sample_user_pred_regs) &&
> + !evsel->no_aux_samples && !evsel__is_dummy_event(evsel)) {
> + if (opts->sample_pred_reg_qwords)
> + attr->sample_simd_pred_reg_qwords = opts->sample_pred_reg_qwords;
> + else
> + attr->sample_simd_pred_reg_qwords = 1;
> + attr->sample_simd_vec_reg_user = opts->sample_user_vec_regs;
> + attr->sample_simd_vec_reg_qwords = opts->sample_vec_reg_qwords;
> + attr->sample_simd_pred_reg_user = opts->sample_user_pred_regs;
> evsel__set_sample_bit(evsel, REGS_USER);
> }
>
> diff --git a/tools/perf/util/parse-regs-options.c b/tools/perf/util/parse-regs-options.c
> index 6cf865bfc2f7..3dfa7ec276c2 100644
> --- a/tools/perf/util/parse-regs-options.c
> +++ b/tools/perf/util/parse-regs-options.c
> @@ -9,13 +9,13 @@
> #include <subcmd/parse-options.h>
> #include "util/perf_regs.h"
> #include "util/parse-regs-options.h"
> +#include "record.h"
>
> static void
> -list_perf_regs(FILE *fp, uint64_t mask, int abi)
> +__list_gp_regs(FILE *fp, uint64_t mask, int abi)
> {
> const char *last_name = NULL;
>
> - fprintf(fp, "available registers: ");
> for (int reg = 0; reg < 64; reg++) {
> const char *name;
>
> @@ -27,14 +27,68 @@ list_perf_regs(FILE *fp, uint64_t mask, int abi)
> fprintf(fp, "%s%s", reg > 0 ? " " : "", name);
> last_name = name;
> }
> +}
> +
> +static void
> +__list_simd_regs(FILE *fp, uint64_t mask, bool intr, bool pred)
> +{
> + uint64_t bitmap = 0;
> + uint16_t qwords = 0;
> + const char *name;
> + int i = 0;
> +
> + for (int reg_c = 0; reg_c < 64; reg_c++) {
> + if (((1ULL << reg_c) & mask) == 0)
> + continue;
> +
> + name = perf_simd_reg_class_name(EM_HOST, reg_c, pred);
> + bitmap = intr ?
> + perf_intr_simd_reg_class_bitmap_qwords(EM_HOST, reg_c, &qwords, pred) :
> + perf_user_simd_reg_class_bitmap_qwords(EM_HOST, reg_c, &qwords, pred);
> + if (name && bitmap)
> + fprintf(fp, "%s%s0-%d", i++ > 0 ? " " : "",
> + name, fls64(bitmap) - 1);
> + }
> +}
Sashiko comments
"
This formats the help output with index ranges (e.g., XMM0-15).
However, name_to_simd_reg_mask() expects the base class name (e.g., XMM).
If a user copies the register name directly from the help output, will the
string comparison fail and reject it with an "Unknown register" error?
"
This makes sense to some extent. I will enhance the perf-record man page to
explicitly state that only the SIMD register class name is needed and that
the index range should not be entered.
> +
> +static void
> +list_perf_regs(FILE *fp, uint64_t mask, uint64_t simd_mask,
> + uint64_t pred_mask, int abi, bool intr)
> +{
> + bool printed = false;
> +
> + fprintf(fp, "available registers: ");
> +
> + if (mask) {
> + __list_gp_regs(fp, mask, abi);
> + printed = true;
> + }
> +
> + if (simd_mask) {
> + if (printed)
> + fprintf(fp, " ");
> + __list_simd_regs(fp, simd_mask, intr, /*pred=*/false);
> + printed = true;
> + }
> +
> + if (pred_mask) {
> + if (printed)
> + fprintf(fp, " ");
> + __list_simd_regs(fp, pred_mask, intr, /*pred=*/true);
> + printed = true;
> + }
> +
> fputc('\n', fp);
> }
>
> static uint64_t
> -name_to_perf_reg_mask(const char *to_match, uint64_t mask, int abi)
> +name_to_gp_reg_mask(const char *to_match, uint64_t mask, int abi)
> {
> uint64_t reg_mask = 0;
>
> + if (!mask)
> + return reg_mask;
> +
> for (int reg = 0; reg < 64; reg++) {
> const char *name;
>
> @@ -51,13 +105,79 @@ name_to_perf_reg_mask(const char *to_match, uint64_t mask, int abi)
> return reg_mask;
> }
>
> +static bool
> +name_to_simd_reg_mask(struct record_opts *opts, const char *to_match,
> + uint64_t mask, bool intr, bool pred)
> +{
> + bool matched = false;
> + uint64_t bitmap;
> + uint16_t qwords;
> + int reg_c;
> +
> + if (!mask)
> + return false;
> +
> + for (reg_c = 0; reg_c < 64; reg_c++) {
> + const char *name;
> +
> + if (((1ULL << reg_c) & mask) == 0)
> + continue;
> +
> + name = perf_simd_reg_class_name(EM_HOST, reg_c, pred);
> + if (!name)
> + continue;
> +
> + if (!strcasecmp(to_match, name)) {
> + matched = true;
> + break;
> + }
> + }
> +
> + if (!matched)
> + return false;
> +
> + if (intr) {
> + bitmap = perf_intr_simd_reg_class_bitmap_qwords(EM_HOST,
> + reg_c, &qwords, pred);
> + } else {
> + bitmap = perf_user_simd_reg_class_bitmap_qwords(EM_HOST,
> + reg_c, &qwords, pred);
> + }
> +
> + /* Just need the highest qwords */
> + if (pred) {
> + if (qwords >= opts->sample_pred_reg_qwords) {
> + opts->sample_pred_reg_qwords = qwords;
> + if (intr)
> + opts->sample_intr_pred_regs = bitmap;
> + else
> + opts->sample_user_pred_regs = bitmap;
> + }
> + } else {
> + if (qwords >= opts->sample_vec_reg_qwords) {
> + opts->sample_vec_reg_qwords = qwords;
> + if (intr)
> + opts->sample_intr_vec_regs = bitmap;
> + else
> + opts->sample_user_vec_regs = bitmap;
> + }
> + }
> +
> + return true;
> +}
Sashiko comments
"
Since opts->sample_vec_reg_qwords is a single variable shared globally for
both interrupt and user configurations, does this cross-option interference
silently drop SIMD register sampling requests?
If a larger register is parsed for one option (e.g., "--user-regs=YMM" with
4 qwords), it inflates this shared threshold. If a smaller register is
subsequently parsed for the other option (e.g., "--intr-regs=XMM" with
2 qwords), the "2 >= 4" check fails.
Will this cause the function to skip setting opts->sample_intr_vec_regs,
entirely dropping the user's interrupt vector register request?
"
This is correct. Besides picking the highest "qwords", we need to pick the
largest bitmap for both intr and user options.
> +
> static int
> __parse_regs(const struct option *opt, const char *str, int unset, bool intr)
> {
> uint64_t *mode = (uint64_t *)opt->value;
> + struct record_opts *opts;
> char *s, *os = NULL, *p;
> - int ret = -1;
> + uint64_t simd_mask;
> + uint64_t pred_mask;
> uint64_t mask;
> + const char *warn;
> + bool matched;
> + int ret = -1;
> int abi = 0;
>
> if (unset)
> @@ -69,11 +189,16 @@ __parse_regs(const struct option *opt, const char *str, int unset, bool intr)
> if (*mode)
> return -1;
>
> - mask = intr ? perf_intr_reg_mask(EM_HOST, &abi) : perf_user_reg_mask(EM_HOST, &abi);
> + mask = intr ? perf_intr_reg_mask(EM_HOST, &abi) :
> + perf_user_reg_mask(EM_HOST, &abi);
> + opts = intr ? container_of(opt->value, struct record_opts, sample_intr_regs) :
> + container_of(opt->value, struct record_opts, sample_user_regs);
>
> /* str may be NULL in case no arg is passed to -I */
> if (!str) {
> *mode = mask;
> + if (abi & PERF_SAMPLE_REGS_ABI_SIMD)
> + opts->sample_pred_reg_qwords = 1;
> return 0;
> }
>
> @@ -82,6 +207,15 @@ __parse_regs(const struct option *opt, const char *str, int unset, bool intr)
> if (!s)
> return -1;
>
> + if (intr) {
> + simd_mask = perf_intr_simd_reg_class_mask(EM_HOST, /*pred=*/false);
> + pred_mask = perf_intr_simd_reg_class_mask(EM_HOST, /*pred=*/true);
> + } else {
> + simd_mask = perf_user_simd_reg_class_mask(EM_HOST, /*pred=*/false);
> + pred_mask = perf_user_simd_reg_class_mask(EM_HOST, /*pred=*/true);
> + }
> +
> + warn = "Unknown register \"%s\", check man page or run \"perf record %s?\"\n";
> for (;;) {
> uint64_t reg_mask;
>
> @@ -90,15 +224,23 @@ __parse_regs(const struct option *opt, const char *str, int unset, bool intr)
> *p = '\0';
>
> if (!strcmp(s, "?")) {
> - list_perf_regs(stderr, mask, abi);
> + list_perf_regs(stderr, mask, simd_mask, pred_mask, abi, intr);
> goto error;
> }
>
> - reg_mask = name_to_perf_reg_mask(s, mask, abi);
> - if (reg_mask == 0) {
> - ui__warning("Unknown register \"%s\", check man page or run \"perf record %s?\"\n",
> - s, intr ? "-I" : "--user-regs=");
> - goto error;
> + reg_mask = name_to_gp_reg_mask(s, mask, abi);
> + if (reg_mask) {
> + if (abi & PERF_SAMPLE_REGS_ABI_SIMD)
> + opts->sample_pred_reg_qwords = 1;
Sashiko comments
"
If a future predicate register requires a length greater than 1 qword (which
is correctly established by name_to_simd_reg_mask() if parsed first), will
this subsequent unconditional assignment silently truncate the globally
tracked predicate register size back to 1?
"
Although the largest qwords length is currently 1 on x86 platforms, that may
not hold on other architectures, and the truncation would then happen if
eGPRs are specified after the PRED registers. I will fix this issue in the
next version.
> + } else {
> + matched = name_to_simd_reg_mask(opts, s, simd_mask,
> + intr, /*pred=*/false) ||
> + name_to_simd_reg_mask(opts, s, pred_mask,
> + intr, /*pred=*/true);
> + if (!matched) {
> + ui__warning(warn, s, intr ? "-I" : "--user-regs=");
> + goto error;
> + }
> }
> *mode |= reg_mask;
>
> diff --git a/tools/perf/util/perf-regs-arch/perf_regs_x86.c b/tools/perf/util/perf-regs-arch/perf_regs_x86.c
> index ae26d991cdc9..2bc93b600662 100644
> --- a/tools/perf/util/perf-regs-arch/perf_regs_x86.c
> +++ b/tools/perf/util/perf-regs-arch/perf_regs_x86.c
> @@ -465,3 +465,295 @@ uint64_t __perf_reg_sp_x86(void)
> {
> return PERF_REG_X86_SP;
> }
> +
> +enum {
> + PERF_REG_CLASS_X86_OPMASK = 0,
> + PERF_REG_CLASS_X86_XMM,
> + PERF_REG_CLASS_X86_YMM,
> + PERF_REG_CLASS_X86_ZMM,
> + PERF_REG_X86_MAX_SIMD_CLASSES,
> +};
> +
> +#define PERF_REG_CLASS_X86_PRED_MASK (BIT(PERF_REG_CLASS_X86_OPMASK))
> +#define PERF_REG_CLASS_X86_SIMD_MASK (BIT(PERF_REG_CLASS_X86_XMM) | \
> + BIT(PERF_REG_CLASS_X86_YMM) | \
> + BIT(PERF_REG_CLASS_X86_ZMM))
> +
> +/*
> + * This function is used to determine whether the kernel perf subsystem
> + * supports sampling a given kind of SIMD register (OPMASK/XMM/YMM/ZMM).
> + *
> + * @sample_type: PERF_SAMPLE_REGS_INTR or PERF_SAMPLE_REGS_USER
> + * @qwords: the length of the SIMD register, like 1/2/4/8 qwords for
> + * OPMASK/XMM/YMM/ZMM registers.
> + * @mask: the bitmask of the SIMD registers, like 0xffff for XMM0 ~ XMM15
> + * @pred: whether it's a predicate register, like the OPMASK registers.
> + *
> + * Return value: true indicates support, otherwise no support.
> + */
> +static bool
> +__support_simd_reg_class(uint64_t sample_type, uint16_t qwords,
> + uint64_t mask, bool pred)
> +{
> + struct perf_event_attr attr = {
> + .type = PERF_TYPE_HARDWARE,
> + .config = PERF_COUNT_HW_CPU_CYCLES,
> + .sample_type = sample_type,
> + .disabled = 1,
> + .exclude_kernel = 1,
> + .sample_simd_regs_enabled = 1,
> + };
> + int fd;
> +
> + attr.sample_period = 1;
> +
> + if (!pred) {
> + attr.sample_simd_vec_reg_qwords = qwords;
> + if (sample_type == PERF_SAMPLE_REGS_INTR)
> + attr.sample_simd_vec_reg_intr = mask;
> + else
> + attr.sample_simd_vec_reg_user = mask;
> + } else {
> + attr.sample_simd_pred_reg_qwords = PERF_X86_OPMASK_QWORDS;
> + if (sample_type == PERF_SAMPLE_REGS_INTR)
> + attr.sample_simd_pred_reg_intr = PERF_X86_SIMD_PRED_MASK;
> + else
> + attr.sample_simd_pred_reg_user = PERF_X86_SIMD_PRED_MASK;
> + }
> +
> + if (perf_pmus__num_core_pmus() > 1) {
> + __u64 type = perf_pmus__find_core_pmu()->type;
> +
> + attr.config |= type << PERF_PMU_TYPE_SHIFT;
> + }
> +
> + event_attr_init(&attr);
> +
> + fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
> + if (fd != -1) {
> + close(fd);
> + return true;
> + }
> +
> + return false;
> +}
> +
> +#define PERF_X86_SIMD_ZMMH_REGS (PERF_X86_SIMD_ZMM_REGS / 2)
> +
> +static bool __arch_has_simd_reg_class(uint64_t sample_type, int reg_class,
> + uint64_t *mask, uint16_t *qwords)
> +{
> + bool supported = false;
> + uint64_t bits;
> +
> + *mask = 0;
> + *qwords = 0;
> +
> + switch (reg_class) {
> + case PERF_REG_CLASS_X86_OPMASK:
> + bits = BIT_ULL(PERF_X86_SIMD_OPMASK_REGS) - 1;
> + supported = __support_simd_reg_class(sample_type,
> + PERF_X86_OPMASK_QWORDS,
> + bits, true);
> + if (supported) {
> + *mask = bits;
> + *qwords = PERF_X86_OPMASK_QWORDS;
> + }
> + break;
> + case PERF_REG_CLASS_X86_XMM:
> + bits = BIT_ULL(PERF_X86_SIMD_XMM_REGS) - 1;
> + supported = __support_simd_reg_class(sample_type,
> + PERF_X86_XMM_QWORDS,
> + bits, false);
> + if (supported) {
> + *mask = bits;
> + *qwords = PERF_X86_XMM_QWORDS;
> + }
> + break;
> + case PERF_REG_CLASS_X86_YMM:
> + bits = BIT_ULL(PERF_X86_SIMD_YMM_REGS) - 1;
> + supported = __support_simd_reg_class(sample_type,
> + PERF_X86_YMM_QWORDS,
> + bits, false);
> + if (supported) {
> + *mask = bits;
> + *qwords = PERF_X86_YMM_QWORDS;
> + }
> + break;
> + case PERF_REG_CLASS_X86_ZMM:
> + bits = BIT_ULL(PERF_X86_SIMD_ZMM_REGS) - 1;
> + supported = __support_simd_reg_class(sample_type,
> + PERF_X86_ZMM_QWORDS,
> + bits, false);
> + if (supported) {
> + *mask = bits;
> + *qwords = PERF_X86_ZMM_QWORDS;
> + break;
> + }
> +
> + bits = BIT_ULL(PERF_X86_SIMD_ZMMH_REGS) - 1;
> + supported = __support_simd_reg_class(sample_type,
> + PERF_X86_ZMM_QWORDS,
> + bits, false);
> + if (supported) {
> + *mask = bits;
> + *qwords = PERF_X86_ZMM_QWORDS;
> + }
> + break;
> + default:
> + break;
> + }
> +
> + return supported;
> +}
> +
> +static bool __support_simd_sampling(void)
> +{
> + uint64_t mask = BIT_ULL(PERF_X86_SIMD_XMM_REGS) - 1;
> + uint16_t qwords = PERF_X86_XMM_QWORDS;
> + static bool simd_sampling_supported;
> + static bool cached;
> +
> + if (cached)
> + return simd_sampling_supported;
> +
> + simd_sampling_supported =
> + __arch_has_simd_reg_class(PERF_SAMPLE_REGS_INTR,
> + PERF_REG_CLASS_X86_XMM,
> + &mask, &qwords);
> + simd_sampling_supported |=
> + __arch_has_simd_reg_class(PERF_SAMPLE_REGS_USER,
> + PERF_REG_CLASS_X86_XMM,
> + &mask, &qwords);
> + cached = true;
> +
> + return simd_sampling_supported;
> +}
> +
> +/*
> + * @x86_intr_simd_cached: indicates the data of below 3
> + * x86_intr_simd_* items has been retrieved from kernel and cached.
> + * @x86_intr_simd_reg_class_mask: indicates which kinds of PRED/SIMD
> + * registers are supported for intr-regs option. Assume kernel perf
> + * subsystem supports XMM/YMM sampling, then the mask is
> + * PERF_REG_CLASS_X86_XMM|PERF_REG_CLASS_X86_YMM.
> + * @x86_intr_simd_mask: indicates register bitmask for each kind of
> + * supported PRED/SIMD register, like
> + * x86_intr_simd_mask[PERF_REG_CLASS_X86_XMM] = 0xffff.
> + * @x86_intr_simd_qwords: indicates the register length (in qwords)
> + * for each kind of supported PRED/SIMD register, like
> + * x86_intr_simd_qwords[PERF_REG_CLASS_X86_XMM] = 2.
> + */
> +static bool x86_intr_simd_cached;
> +static uint64_t x86_intr_simd_reg_class_mask;
> +static uint64_t x86_intr_simd_mask[PERF_REG_X86_MAX_SIMD_CLASSES];
> +static uint16_t x86_intr_simd_qwords[PERF_REG_X86_MAX_SIMD_CLASSES];
> +
> +/*
> + * Similar with above x86_intr_simd_* items, the difference is these
> + * items are used for user-regs option.
> + */
> +static bool x86_user_simd_cached;
> +static uint64_t x86_user_simd_reg_class_mask;
> +static uint64_t x86_user_simd_mask[PERF_REG_X86_MAX_SIMD_CLASSES];
> +static uint16_t x86_user_simd_qwords[PERF_REG_X86_MAX_SIMD_CLASSES];
> +
> +static uint64_t __arch__simd_reg_class_mask(bool intr)
> +{
> + uint64_t mask = 0;
> + bool supported;
> + int reg_c;
> +
> + if (!__support_simd_sampling())
> + return 0;
> +
> + if (intr && x86_intr_simd_cached)
> + return x86_intr_simd_reg_class_mask;
> +
> + if (!intr && x86_user_simd_cached)
> + return x86_user_simd_reg_class_mask;
> +
> + for (reg_c = 0; reg_c < PERF_REG_X86_MAX_SIMD_CLASSES; reg_c++) {
> + supported = false;
> +
> + if (intr) {
> + supported = __arch_has_simd_reg_class(
> + PERF_SAMPLE_REGS_INTR,
> + reg_c,
> + &x86_intr_simd_mask[reg_c],
> + &x86_intr_simd_qwords[reg_c]);
> + } else {
> + supported = __arch_has_simd_reg_class(
> + PERF_SAMPLE_REGS_USER,
> + reg_c,
> + &x86_user_simd_mask[reg_c],
> + &x86_user_simd_qwords[reg_c]);
> + }
> + if (supported)
> + mask |= BIT_ULL(reg_c);
> + }
> +
> + if (intr) {
> + x86_intr_simd_reg_class_mask = mask;
> + x86_intr_simd_cached = true;
> + } else {
> + x86_user_simd_reg_class_mask = mask;
> + x86_user_simd_cached = true;
> + }
> +
> + return mask;
> +}
> +
> +static uint64_t
> +__arch__simd_reg_class_bitmap_qwords(bool intr, int reg_c, uint16_t *qwords)
> +{
> + uint64_t mask = 0;
> +
> + *qwords = 0;
> + if (reg_c >= PERF_REG_X86_MAX_SIMD_CLASSES)
> + return mask;
> +
> + if (intr) {
> + mask = x86_intr_simd_mask[reg_c];
> + *qwords = x86_intr_simd_qwords[reg_c];
> + } else {
> + mask = x86_user_simd_mask[reg_c];
> + *qwords = x86_user_simd_qwords[reg_c];
> + }
> +
> + return mask;
> +}
> +
> +uint64_t __perf_simd_reg_class_mask_x86(bool intr, bool pred)
> +{
> + uint64_t mask = __arch__simd_reg_class_mask(intr);
> +
> + return pred ? mask & PERF_REG_CLASS_X86_PRED_MASK :
> + mask & PERF_REG_CLASS_X86_SIMD_MASK;
> +}
> +
> +uint64_t __perf_simd_reg_class_bitmap_qwords_x86(int reg_c, uint16_t *qwords,
> + bool intr, bool pred)
> +{
> + if (!x86_intr_simd_cached)
> + __perf_simd_reg_class_mask_x86(intr, pred);
> + return __arch__simd_reg_class_bitmap_qwords(intr, reg_c, qwords);
> +}
Sashiko comments
"
This check is hardcoded to use the interrupt cache flag
(x86_intr_simd_cached) even when the caller is querying user-mode registers
(intr == false).
If the public API is called for user-mode when the interrupt cache is
initialized but the user-mode cache is not, does the function incorrectly
skip initialization and return 0 because the underlying x86_user_simd_mask
array remains empty?
"
This is correct. We need to check both x86_intr_simd_cached and
x86_user_simd_cached. If either one is not true, then
__perf_simd_reg_class_mask_x86() needs to be called.
Thanks.
> +
> +const char *__perf_simd_reg_class_name_x86(int id, bool pred __maybe_unused)
> +{
> + switch (id) {
> + case PERF_REG_CLASS_X86_OPMASK:
> + return "OPMASK";
> + case PERF_REG_CLASS_X86_XMM:
> + return "XMM";
> + case PERF_REG_CLASS_X86_YMM:
> + return "YMM";
> + case PERF_REG_CLASS_X86_ZMM:
> + return "ZMM";
> + default:
> + return NULL;
> + }
> +
> + return NULL;
> +}
> diff --git a/tools/perf/util/perf_event_attr_fprintf.c b/tools/perf/util/perf_event_attr_fprintf.c
> index 741c3d657a8b..c6b8e53e06fd 100644
> --- a/tools/perf/util/perf_event_attr_fprintf.c
> +++ b/tools/perf/util/perf_event_attr_fprintf.c
> @@ -362,6 +362,12 @@ int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr,
> PRINT_ATTRf(aux_start_paused, p_unsigned);
> PRINT_ATTRf(aux_pause, p_unsigned);
> PRINT_ATTRf(aux_resume, p_unsigned);
> + PRINT_ATTRf(sample_simd_pred_reg_qwords, p_unsigned);
> + PRINT_ATTRf(sample_simd_pred_reg_intr, p_hex);
> + PRINT_ATTRf(sample_simd_pred_reg_user, p_hex);
> + PRINT_ATTRf(sample_simd_vec_reg_qwords, p_unsigned);
> + PRINT_ATTRf(sample_simd_vec_reg_intr, p_hex);
> + PRINT_ATTRf(sample_simd_vec_reg_user, p_hex);
>
> return ret;
> }
> diff --git a/tools/perf/util/perf_regs.c b/tools/perf/util/perf_regs.c
> index afc567718bee..dc99e797e715 100644
> --- a/tools/perf/util/perf_regs.c
> +++ b/tools/perf/util/perf_regs.c
> @@ -246,3 +246,75 @@ uint64_t perf_arch_reg_sp(uint16_t e_machine)
> return 0;
> }
> }
> +
> +uint64_t perf_intr_simd_reg_class_mask(uint16_t e_machine, bool pred)
> +{
> + switch (e_machine) {
> + case EM_386:
> + case EM_X86_64:
> + return __perf_simd_reg_class_mask_x86(/*intr=*/true, pred);
> + default:
> + return 0;
> + }
> +}
> +
> +uint64_t perf_user_simd_reg_class_mask(uint16_t e_machine, bool pred)
> +{
> + switch (e_machine) {
> + case EM_386:
> + case EM_X86_64:
> + return __perf_simd_reg_class_mask_x86(/*intr=*/false, pred);
> + default:
> + return 0;
> + }
> +}
> +
> +uint64_t perf_intr_simd_reg_class_bitmap_qwords(uint16_t e_machine, int reg_c,
> + uint16_t *qwords, bool pred)
> +{
> + switch (e_machine) {
> + case EM_386:
> + case EM_X86_64:
> + return __perf_simd_reg_class_bitmap_qwords_x86(reg_c, qwords,
> + /*intr=*/true,
> + pred);
> + default:
> + *qwords = 0;
> + return 0;
> + }
> +}
> +
> +uint64_t perf_user_simd_reg_class_bitmap_qwords(uint16_t e_machine, int reg_c,
> + uint16_t *qwords, bool pred)
> +{
> + switch (e_machine) {
> + case EM_386:
> + case EM_X86_64:
> + return __perf_simd_reg_class_bitmap_qwords_x86(reg_c, qwords,
> + /*intr=*/false,
> + pred);
> + default:
> + *qwords = 0;
> + return 0;
> + }
> +}
> +
> +const char *perf_simd_reg_class_name(uint16_t e_machine, int id, bool pred)
> +{
> + const char *name = NULL;
> +
> + switch (e_machine) {
> + case EM_386:
> + case EM_X86_64:
> + name = __perf_simd_reg_class_name_x86(id, pred);
> + break;
> + default:
> + break;
> + }
> + if (name)
> + return name;
> +
> + pr_debug("Failed to find %s register %d for ELF machine type %u\n",
> + pred ? "PRED" : "SIMD", id, e_machine);
> + return "unknown";
> +}
> diff --git a/tools/perf/util/perf_regs.h b/tools/perf/util/perf_regs.h
> index c9501ca8045d..80d1d7316188 100644
> --- a/tools/perf/util/perf_regs.h
> +++ b/tools/perf/util/perf_regs.h
> @@ -20,6 +20,13 @@ const char *perf_reg_name(int id, uint16_t e_machine, uint32_t e_flags, int abi)
> int perf_reg_value(u64 *valp, struct regs_dump *regs, int id);
> uint64_t perf_arch_reg_ip(uint16_t e_machine);
> uint64_t perf_arch_reg_sp(uint16_t e_machine);
> +uint64_t perf_intr_simd_reg_class_mask(uint16_t e_machine, bool pred);
> +uint64_t perf_user_simd_reg_class_mask(uint16_t e_machine, bool pred);
> +uint64_t perf_intr_simd_reg_class_bitmap_qwords(uint16_t e_machine, int reg_c,
> + uint16_t *qwords, bool pred);
> +uint64_t perf_user_simd_reg_class_bitmap_qwords(uint16_t e_machine, int reg_c,
> + uint16_t *qwords, bool pred);
> +const char *perf_simd_reg_class_name(uint16_t e_machine, int id, bool pred);
>
> int __perf_sdt_arg_parse_op_arm64(char *old_op, char **new_op);
> uint64_t __perf_reg_mask_arm64(bool intr);
> @@ -68,6 +75,10 @@ uint64_t __perf_reg_mask_x86(bool intr, int *abi);
> const char *__perf_reg_name_x86(int id, int abi);
> uint64_t __perf_reg_ip_x86(void);
> uint64_t __perf_reg_sp_x86(void);
> +uint64_t __perf_simd_reg_class_mask_x86(bool intr, bool pred);
> +uint64_t __perf_simd_reg_class_bitmap_qwords_x86(int reg_c, uint16_t *qwords,
> + bool intr, bool pred);
> +const char *__perf_simd_reg_class_name_x86(int id, bool pred);
>
> static inline uint64_t DWARF_MINIMAL_REGS(uint16_t e_machine)
> {
> diff --git a/tools/perf/util/record.h b/tools/perf/util/record.h
> index 93627c9a7338..37ed44b5f15b 100644
> --- a/tools/perf/util/record.h
> +++ b/tools/perf/util/record.h
> @@ -62,6 +62,12 @@ struct record_opts {
> u64 branch_stack;
> u64 sample_intr_regs;
> u64 sample_user_regs;
> + u64 sample_intr_vec_regs;
> + u64 sample_user_vec_regs;
> + u32 sample_intr_pred_regs;
> + u32 sample_user_pred_regs;
> + u16 sample_vec_reg_qwords;
> + u16 sample_pred_reg_qwords;
> u64 default_interval;
> u64 user_interval;
> size_t auxtrace_snapshot_size;
^ permalink raw reply [flat|nested] 10+ messages in thread
* [Patch v7 4/4] perf regs: Enable dumping of SIMD registers
2026-03-24 0:57 [Patch v7 0/4] Perf tools: Support eGPRs/SSP/SIMD registers sampling Dapeng Mi
` (2 preceding siblings ...)
2026-03-24 0:57 ` [Patch v7 3/4] perf regs: Support x86 SIMD registers sampling Dapeng Mi
@ 2026-03-24 0:57 ` Dapeng Mi
2026-03-26 5:48 ` Mi, Dapeng
3 siblings, 1 reply; 10+ messages in thread
From: Dapeng Mi @ 2026-03-24 0:57 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
Namhyung Kim, Thomas Gleixner, Dave Hansen, Ian Rogers,
Adrian Hunter, Jiri Olsa, Alexander Shishkin, Andi Kleen,
Eranian Stephane
Cc: Mark Rutland, broonie, Ravi Bangoria, linux-kernel,
linux-perf-users, Zide Chen, Falcon Thomas, Dapeng Mi, Xudong Hao,
Kan Liang, Dapeng Mi
From: Kan Liang <kan.liang@linux.intel.com>
This patch adds support for dumping SIMD registers using the new
PERF_SAMPLE_REGS_ABI_SIMD ABI.
Currently, the XMM, YMM, ZMM, OPMASK, eGPRs, and SSP registers on x86
platforms are supported with the PERF_SAMPLE_REGS_ABI_SIMD ABI.
An example of the output is displayed below.
Example:
$perf record -e cycles:p -IXMM,YMM,OPMASK,SSP ./test
$perf report -D
... ...
237538985992962 0x454d0 [0x480]: PERF_RECORD_SAMPLE(IP, 0x1):
179370/179370: 0xffffffff969627fc period: 124999 addr: 0
... intr regs: mask 0x20000000000 ABI 64-bit
.... SSP 0x0000000000000000
... SIMD ABI nr_vectors 32 vector_qwords 4 nr_pred 8 pred_qwords 1
.... YMM [0] 0x0000000000004000
.... YMM [0] 0x000055e828695270
.... YMM [0] 0x0000000000000000
.... YMM [0] 0x0000000000000000
.... YMM [1] 0x000055e8286990e0
.... YMM [1] 0x000055e828698dd0
.... YMM [1] 0x0000000000000000
.... YMM [1] 0x0000000000000000
... ...
.... YMM [31] 0x0000000000000000
.... YMM [31] 0x0000000000000000
.... YMM [31] 0x0000000000000000
.... YMM [31] 0x0000000000000000
.... OPMASK[0] 0x0000000000100221
.... OPMASK[1] 0x0000000000000020
.... OPMASK[2] 0x000000007fffffff
.... OPMASK[3] 0x0000000000000000
.... OPMASK[4] 0x0000000000000000
.... OPMASK[5] 0x0000000000000000
.... OPMASK[6] 0x0000000000000000
.... OPMASK[7] 0x0000000000000000
... ...
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Co-developed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
---
V7: 1) add assert() check for SIMD fields in sample data.
2) optimize regs_abi[] definition.
tools/perf/util/evsel.c | 36 +++++++++++++++++++++
tools/perf/util/sample.h | 10 ++++++
tools/perf/util/session.c | 66 ++++++++++++++++++++++++++++++++++++++-
3 files changed, 111 insertions(+), 1 deletion(-)
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 5f00489e714a..24cc7ba71ae1 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -3520,6 +3520,24 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event,
regs->mask = mask;
regs->regs = (u64 *)array;
array = (void *)array + sz;
+
+ if (regs->abi & PERF_SAMPLE_REGS_ABI_SIMD) {
+ assert(regs->nr_vectors <=
+ hweight64(evsel->core.attr.sample_simd_vec_reg_user));
+ assert(regs->vector_qwords <=
+ evsel->core.attr.sample_simd_vec_reg_qwords);
+ assert(regs->nr_pred <=
+ hweight64(evsel->core.attr.sample_simd_pred_reg_user));
+ assert(regs->pred_qwords <=
+ evsel->core.attr.sample_simd_pred_reg_qwords);
+ regs->config = *(u64 *)array;
+ array = (void *)array + sizeof(u64);
+ regs->simd_data = (u64 *)array;
+ sz = (regs->nr_vectors * regs->vector_qwords +
+ regs->nr_pred * regs->pred_qwords) * sizeof(u64);
+ OVERFLOW_CHECK(array, sz, max_size);
+ array = (void *)array + sz;
+ }
}
}
@@ -3577,6 +3595,24 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event,
regs->mask = mask;
regs->regs = (u64 *)array;
array = (void *)array + sz;
+
+ if (regs->abi & PERF_SAMPLE_REGS_ABI_SIMD) {
+ assert(regs->nr_vectors <=
+ hweight64(evsel->core.attr.sample_simd_vec_reg_intr));
+ assert(regs->vector_qwords <=
+ evsel->core.attr.sample_simd_vec_reg_qwords);
+ assert(regs->nr_pred <=
+ hweight64(evsel->core.attr.sample_simd_pred_reg_intr));
+ assert(regs->pred_qwords <=
+ evsel->core.attr.sample_simd_pred_reg_qwords);
+ regs->config = *(u64 *)array;
+ array = (void *)array + sizeof(u64);
+ regs->simd_data = (u64 *)array;
+ sz = (regs->nr_vectors * regs->vector_qwords +
+ regs->nr_pred * regs->pred_qwords) * sizeof(u64);
+ OVERFLOW_CHECK(array, sz, max_size);
+ array = (void *)array + sz;
+ }
}
}
diff --git a/tools/perf/util/sample.h b/tools/perf/util/sample.h
index 3cce8dd202aa..21f3416d3755 100644
--- a/tools/perf/util/sample.h
+++ b/tools/perf/util/sample.h
@@ -15,6 +15,16 @@ struct regs_dump {
u64 abi;
u64 mask;
u64 *regs;
+ union {
+ u64 config;
+ struct {
+ u16 nr_vectors;
+ u16 vector_qwords;
+ u16 nr_pred;
+ u16 pred_qwords;
+ };
+ };
+ u64 *simd_data;
/* Cached values/mask filled by first register access. */
u64 cache_regs[PERF_SAMPLE_REGS_CACHE_SIZE];
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 7cf7bf86205d..453d44d32162 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -972,15 +972,77 @@ static void regs_dump__printf(u64 mask, struct regs_dump *regs,
}
}
+static void simd_regs_dump__printf(uint16_t e_machine, struct regs_dump *regs, bool intr)
+{
+ const char *name = "unknown";
+ int i, idx = 0;
+ uint16_t qwords;
+ int reg_c;
+
+ if (!(regs->abi & PERF_SAMPLE_REGS_ABI_SIMD))
+ return;
+
+ printf("... SIMD ABI nr_vectors %d vector_qwords %d nr_pred %d pred_qwords %d\n",
+ regs->nr_vectors, regs->vector_qwords,
+ regs->nr_pred, regs->pred_qwords);
+
+ for (reg_c = 0; reg_c < 64; reg_c++) {
+ if (intr) {
+ perf_intr_simd_reg_class_bitmap_qwords(e_machine, reg_c,
+ &qwords, /*pred=*/false);
+ } else {
+ perf_user_simd_reg_class_bitmap_qwords(e_machine, reg_c,
+ &qwords, /*pred=*/false);
+ }
+ if (regs->vector_qwords == qwords) {
+ name = perf_simd_reg_class_name(e_machine, reg_c, /*pred=*/false);
+ break;
+ }
+ }
+
+ for (i = 0; i < regs->nr_vectors; i++) {
+ printf(".... %-5s[%d] 0x%016" PRIx64 "\n", name, i, regs->simd_data[idx++]);
+ printf(".... %-5s[%d] 0x%016" PRIx64 "\n", name, i, regs->simd_data[idx++]);
+ if (regs->vector_qwords > 2) {
+ printf(".... %-5s[%d] 0x%016" PRIx64 "\n", name, i, regs->simd_data[idx++]);
+ printf(".... %-5s[%d] 0x%016" PRIx64 "\n", name, i, regs->simd_data[idx++]);
+ }
+ if (regs->vector_qwords > 4) {
+ printf(".... %-5s[%d] 0x%016" PRIx64 "\n", name, i, regs->simd_data[idx++]);
+ printf(".... %-5s[%d] 0x%016" PRIx64 "\n", name, i, regs->simd_data[idx++]);
+ printf(".... %-5s[%d] 0x%016" PRIx64 "\n", name, i, regs->simd_data[idx++]);
+ printf(".... %-5s[%d] 0x%016" PRIx64 "\n", name, i, regs->simd_data[idx++]);
+ }
+ }
+
+ name = "unknown";
+ for (reg_c = 0; reg_c < 64; reg_c++) {
+ if (intr) {
+ perf_intr_simd_reg_class_bitmap_qwords(e_machine, reg_c,
+ &qwords, /*pred=*/true);
+ } else {
+ perf_user_simd_reg_class_bitmap_qwords(e_machine, reg_c,
+ &qwords, /*pred=*/true);
+ }
+ if (regs->pred_qwords == qwords) {
+ name = perf_simd_reg_class_name(e_machine, reg_c, /*pred=*/true);
+ break;
+ }
+ }
+ for (i = 0; i < regs->nr_pred; i++)
+ printf(".... %-5s[%d] 0x%016" PRIx64 "\n", name, i, regs->simd_data[idx++]);
+}
+
static const char *regs_abi[] = {
[PERF_SAMPLE_REGS_ABI_NONE] = "none",
[PERF_SAMPLE_REGS_ABI_32] = "32-bit",
[PERF_SAMPLE_REGS_ABI_64] = "64-bit",
+ [PERF_SAMPLE_REGS_ABI_SIMD | PERF_SAMPLE_REGS_ABI_64] = "64-bit SIMD",
};
static inline const char *regs_dump_abi(struct regs_dump *d)
{
- if (d->abi > PERF_SAMPLE_REGS_ABI_64)
+ if (d->abi >= ARRAY_SIZE(regs_abi) || !regs_abi[d->abi])
return "unknown";
return regs_abi[d->abi];
@@ -1010,6 +1072,7 @@ static void regs_user__printf(struct perf_sample *sample, uint16_t e_machine, ui
if (user_regs->regs)
regs__printf("user", user_regs, e_machine, e_flags);
+ simd_regs_dump__printf(e_machine, user_regs, /*intr=*/false);
}
static void regs_intr__printf(struct perf_sample *sample, uint16_t e_machine, uint32_t e_flags)
@@ -1023,6 +1086,7 @@ static void regs_intr__printf(struct perf_sample *sample, uint16_t e_machine, ui
if (intr_regs->regs)
regs__printf("intr", intr_regs, e_machine, e_flags);
+ simd_regs_dump__printf(e_machine, intr_regs, /*intr=*/true);
}
static void stack_user__printf(struct stack_dump *dump)
--
2.34.1
^ permalink raw reply related [flat|nested] 10+ messages in thread* Re: [Patch v7 4/4] perf regs: Enable dumping of SIMD registers
2026-03-24 0:57 ` [Patch v7 4/4] perf regs: Enable dumping of SIMD registers Dapeng Mi
@ 2026-03-26 5:48 ` Mi, Dapeng
0 siblings, 0 replies; 10+ messages in thread
From: Mi, Dapeng @ 2026-03-26 5:48 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
Namhyung Kim, Thomas Gleixner, Dave Hansen, Ian Rogers,
Adrian Hunter, Jiri Olsa, Alexander Shishkin, Andi Kleen,
Eranian Stephane
Cc: Mark Rutland, broonie, Ravi Bangoria, linux-kernel,
linux-perf-users, Zide Chen, Falcon Thomas, Dapeng Mi, Xudong Hao,
Kan Liang
On 3/24/2026 8:57 AM, Dapeng Mi wrote:
> From: Kan Liang <kan.liang@linux.intel.com>
>
> This patch adds support for dumping SIMD registers using the new
> PERF_SAMPLE_REGS_ABI_SIMD ABI.
>
> Currently, the XMM, YMM, ZMM, OPMASK, eGPRs, and SSP registers on x86
> platforms are supported with the PERF_SAMPLE_REGS_ABI_SIMD ABI.
>
> An example of the output is displayed below.
>
> Example:
>
> $perf record -e cycles:p -IXMM,YMM,OPMASK,SSP ./test
> $perf report -D
> ... ...
> 237538985992962 0x454d0 [0x480]: PERF_RECORD_SAMPLE(IP, 0x1):
> 179370/179370: 0xffffffff969627fc period: 124999 addr: 0
> ... intr regs: mask 0x20000000000 ABI 64-bit
> .... SSP 0x0000000000000000
> ... SIMD ABI nr_vectors 32 vector_qwords 4 nr_pred 8 pred_qwords 1
> .... YMM [0] 0x0000000000004000
> .... YMM [0] 0x000055e828695270
> .... YMM [0] 0x0000000000000000
> .... YMM [0] 0x0000000000000000
> .... YMM [1] 0x000055e8286990e0
> .... YMM [1] 0x000055e828698dd0
> .... YMM [1] 0x0000000000000000
> .... YMM [1] 0x0000000000000000
> ... ...
> .... YMM [31] 0x0000000000000000
> .... YMM [31] 0x0000000000000000
> .... YMM [31] 0x0000000000000000
> .... YMM [31] 0x0000000000000000
> .... OPMASK[0] 0x0000000000100221
> .... OPMASK[1] 0x0000000000000020
> .... OPMASK[2] 0x000000007fffffff
> .... OPMASK[3] 0x0000000000000000
> .... OPMASK[4] 0x0000000000000000
> .... OPMASK[5] 0x0000000000000000
> .... OPMASK[6] 0x0000000000000000
> .... OPMASK[7] 0x0000000000000000
> ... ...
>
> Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
> Co-developed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
> Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
> ---
>
> V7: 1) add assert() check for SIMD fields in sample data.
> 2) optimize regs_abi[] defination.
>
> tools/perf/util/evsel.c | 36 +++++++++++++++++++++
> tools/perf/util/sample.h | 10 ++++++
> tools/perf/util/session.c | 66 ++++++++++++++++++++++++++++++++++++++-
> 3 files changed, 111 insertions(+), 1 deletion(-)
>
> diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
> index 5f00489e714a..24cc7ba71ae1 100644
> --- a/tools/perf/util/evsel.c
> +++ b/tools/perf/util/evsel.c
> @@ -3520,6 +3520,24 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event,
> regs->mask = mask;
> regs->regs = (u64 *)array;
> array = (void *)array + sz;
> +
> + if (regs->abi & PERF_SAMPLE_REGS_ABI_SIMD) {
> + assert(regs->nr_vectors <=
> + hweight64(evsel->core.attr.sample_simd_vec_reg_user));
> + assert(regs->vector_qwords <=
> + evsel->core.attr.sample_simd_vec_reg_qwords);
> + assert(regs->nr_pred <=
> + hweight64(evsel->core.attr.sample_simd_pred_reg_user));
> + assert(regs->pred_qwords <=
> + evsel->core.attr.sample_simd_pred_reg_qwords);
Sashiko complains
"
Is it safe to use assert() to validate untrusted perf.data file contents here?
If the sample data exceeds the SIMD register bounds, wouldn't this crash the
tool instead of gracefully returning an error?
Additionally, if compiled with NDEBUG, would the bounds checks be removed
completely?
Could these asserts also be evaluating uninitialized or zeroed memory?
The assertions are executed before the union is populated from the sample via
regs->config = *(u64 *)array. This would allow the checks to trivially pass,
potentially allowing an attacker to bypass the intended capability bounds by
providing arbitrarily large values in the configuration data.
"
That's correct. The checks need to move after "regs->config = *(u64
*)array;", and I will change the assert() checks to regular if checks. If
a check fails, gracefully return an error.
> + regs->config = *(u64 *)array;
Sashiko comments
"
Is a bounds check missing before reading the SIMD configuration word?
There doesn't appear to be an OVERFLOW_CHECK_u64(array) before dereferencing
the array pointer. If a malformed perf.data event payload ends exactly before
this configuration word, could this result in an out-of-bounds read?
"
That's correct. The later "OVERFLOW_CHECK(array, sz, max_size)" needs to be
moved before the access.
> + array = (void *)array + sizeof(u64);
> + regs->simd_data = (u64 *)array;
> + sz = (regs->nr_vectors * regs->vector_qwords +
> + regs->nr_pred * regs->pred_qwords) * sizeof(u64);
Sashiko comments
"
Can this calculation overflow and bypass the OVERFLOW_CHECK below?
Because nr_vectors and vector_qwords are u16 fields, C integer promotion
evaluates the multiplication as a signed 32-bit int. If large values are
supplied (e.g., nr_vectors = 65535, vector_qwords = 65535), could the 32-bit
integer wrap around to a small positive value?
This might allow the size check to pass while subsequent reads process a large
amount of out-of-bounds memory.
"
Although "nr_vectors" and "vector_qwords" can't be that large in practice,
it is still a logic risk. Will fix it.
> + OVERFLOW_CHECK(array, sz, max_size);
> + array = (void *)array + sz;
> + }
> }
> }
>
> @@ -3577,6 +3595,24 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event,
> regs->mask = mask;
> regs->regs = (u64 *)array;
> array = (void *)array + sz;
> +
> + if (regs->abi & PERF_SAMPLE_REGS_ABI_SIMD) {
> + assert(regs->nr_vectors <=
> + hweight64(evsel->core.attr.sample_simd_vec_reg_intr));
> + assert(regs->vector_qwords <=
> + evsel->core.attr.sample_simd_vec_reg_qwords);
> + assert(regs->nr_pred <=
> + hweight64(evsel->core.attr.sample_simd_pred_reg_intr));
> + assert(regs->pred_qwords <=
> + evsel->core.attr.sample_simd_pred_reg_qwords);
> + regs->config = *(u64 *)array;
> + array = (void *)array + sizeof(u64);
> + regs->simd_data = (u64 *)array;
> + sz = (regs->nr_vectors * regs->vector_qwords +
> + regs->nr_pred * regs->pred_qwords) * sizeof(u64);
> + OVERFLOW_CHECK(array, sz, max_size);
> + array = (void *)array + sz;
> + }
> }
> }
>
> diff --git a/tools/perf/util/sample.h b/tools/perf/util/sample.h
> index 3cce8dd202aa..21f3416d3755 100644
> --- a/tools/perf/util/sample.h
> +++ b/tools/perf/util/sample.h
> @@ -15,6 +15,16 @@ struct regs_dump {
> u64 abi;
> u64 mask;
> u64 *regs;
> + union {
> + u64 config;
> + struct {
> + u16 nr_vectors;
> + u16 vector_qwords;
> + u16 nr_pred;
> + u16 pred_qwords;
> + };
> + };
Sashiko comments
"
Does overlaying the 64-bit integer with four 16-bit fields break cross-endian
parsing?
If perf processes a file recorded on a machine with different endianness, the
sample payload is typically byte-swapped as an array of 64-bit integers.
Swapping the 64-bit config word would reverse the structural byte order of the
four 16-bit fields (e.g., nr_vectors shifts to the position of pred_qwords),
which could corrupt the dimension variables.
"
Wow, Sashiko really found such a deeply hidden issue (Copilot didn't find
it). Since perf-tools currently only does 64-bit word swaps for
cross-endian data, the simplest way to fix this issue looks to be changing
the 4 u16 fields to 4 u64 fields.
@Peter, @Arnaldo, @Namhyung, @Ian, do you have any idea on this?
> + u64 *simd_data;
>
> /* Cached values/mask filled by first register access. */
> u64 cache_regs[PERF_SAMPLE_REGS_CACHE_SIZE];
> diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
> index 7cf7bf86205d..453d44d32162 100644
> --- a/tools/perf/util/session.c
> +++ b/tools/perf/util/session.c
> @@ -972,15 +972,77 @@ static void regs_dump__printf(u64 mask, struct regs_dump *regs,
> }
> }
>
> +static void simd_regs_dump__printf(uint16_t e_machine, struct regs_dump *regs, bool intr)
> +{
> + const char *name = "unknown";
> + int i, idx = 0;
> + uint16_t qwords;
> + int reg_c;
> +
> + if (!(regs->abi & PERF_SAMPLE_REGS_ABI_SIMD))
> + return;
> +
> + printf("... SIMD ABI nr_vectors %d vector_qwords %d nr_pred %d pred_qwords %d\n",
> + regs->nr_vectors, regs->vector_qwords,
> + regs->nr_pred, regs->pred_qwords);
> +
> + for (reg_c = 0; reg_c < 64; reg_c++) {
> + if (intr) {
> + perf_intr_simd_reg_class_bitmap_qwords(e_machine, reg_c,
> + &qwords, /*pred=*/false);
> + } else {
> + perf_user_simd_reg_class_bitmap_qwords(e_machine, reg_c,
> + &qwords, /*pred=*/false);
> + }
> + if (regs->vector_qwords == qwords) {
> + name = perf_simd_reg_class_name(e_machine, reg_c, /*pred=*/false);
> + break;
> + }
> + }
> +
> + for (i = 0; i < regs->nr_vectors; i++) {
> + printf(".... %-5s[%d] 0x%016" PRIx64 "\n", name, i, regs->simd_data[idx++]);
> + printf(".... %-5s[%d] 0x%016" PRIx64 "\n", name, i, regs->simd_data[idx++]);
> + if (regs->vector_qwords > 2) {
> + printf(".... %-5s[%d] 0x%016" PRIx64 "\n", name, i, regs->simd_data[idx++]);
> + printf(".... %-5s[%d] 0x%016" PRIx64 "\n", name, i, regs->simd_data[idx++]);
> + }
> + if (regs->vector_qwords > 4) {
> + printf(".... %-5s[%d] 0x%016" PRIx64 "\n", name, i, regs->simd_data[idx++]);
> + printf(".... %-5s[%d] 0x%016" PRIx64 "\n", name, i, regs->simd_data[idx++]);
> + printf(".... %-5s[%d] 0x%016" PRIx64 "\n", name, i, regs->simd_data[idx++]);
> + printf(".... %-5s[%d] 0x%016" PRIx64 "\n", name, i, regs->simd_data[idx++]);
> + }
> + }
Sashiko comments
"
Does this print logic safely handle all values of vector_qwords?
The loop unconditionally prints 2 elements per vector, and extends to 4 or 8
elements if vector_qwords > 2 or > 4. If an unusual value like vector_qwords
is 1 or 3, could the loop consume more elements than were allocated, leading
to reading past the simd_data array bounds?
The same pattern seems to apply to pred_qwords below if it is set to 0.
"
Makes sense. Will check the exact qwords length instead of just a
range.
Thanks.
> +
> + name = "unknown";
> + for (reg_c = 0; reg_c < 64; reg_c++) {
> + if (intr) {
> + perf_intr_simd_reg_class_bitmap_qwords(e_machine, reg_c,
> + &qwords, /*pred=*/true);
> + } else {
> + perf_user_simd_reg_class_bitmap_qwords(e_machine, reg_c,
> + &qwords, /*pred=*/true);
> + }
> + if (regs->pred_qwords == qwords) {
> + name = perf_simd_reg_class_name(e_machine, reg_c, /*pred=*/true);
> + break;
> + }
> + }
> + for (i = 0; i < regs->nr_pred; i++)
> + printf(".... %-5s[%d] 0x%016" PRIx64 "\n", name, i, regs->simd_data[idx++]);
> +}
> +
> static const char *regs_abi[] = {
> [PERF_SAMPLE_REGS_ABI_NONE] = "none",
> [PERF_SAMPLE_REGS_ABI_32] = "32-bit",
> [PERF_SAMPLE_REGS_ABI_64] = "64-bit",
> + [PERF_SAMPLE_REGS_ABI_SIMD | PERF_SAMPLE_REGS_ABI_64] = "64-bit SIMD",
> };
>
> static inline const char *regs_dump_abi(struct regs_dump *d)
> {
> - if (d->abi > PERF_SAMPLE_REGS_ABI_64)
> + if (d->abi >= ARRAY_SIZE(regs_abi) || !regs_abi[d->abi])
> return "unknown";
>
> return regs_abi[d->abi];
> @@ -1010,6 +1072,7 @@ static void regs_user__printf(struct perf_sample *sample, uint16_t e_machine, ui
>
> if (user_regs->regs)
> regs__printf("user", user_regs, e_machine, e_flags);
> + simd_regs_dump__printf(e_machine, user_regs, /*intr=*/false);
> }
>
> static void regs_intr__printf(struct perf_sample *sample, uint16_t e_machine, uint32_t e_flags)
> @@ -1023,6 +1086,7 @@ static void regs_intr__printf(struct perf_sample *sample, uint16_t e_machine, ui
>
> if (intr_regs->regs)
> regs__printf("intr", intr_regs, e_machine, e_flags);
> + simd_regs_dump__printf(e_machine, intr_regs, /*intr=*/true);
> }
>
> static void stack_user__printf(struct stack_dump *dump)
^ permalink raw reply [flat|nested] 10+ messages in thread